Fix several bugs to get Python scanner/parser basically working, processes ../tests...
[pilex.git] / dfa.py
1 import bisect
2 import element
3 import flex_dfa
4 import numpy
5 import numpy_heap
6 import work
7 import sys
8
9 # defines the alphabet size, set this to 0x11000 for unicode
10 n_characters = 0x100
11
12 class DFA:
13   # transition classes:
14   # instructions to transform thread list from a multistate to next multistate
15   # (TRANSITION_POP, n)                 j += n
16   # (TRANSITION_DUP, n)                 threads0[j - n:j] = threads0[j:j + n]
17   #                                     j -= n
18   # (TRANSITION_MARK, n, mark_value)    threads0[j:j + n] = [
19   #                                       (i, mark, thread)
20   #                                       for thread in threads0[j:j + n]
21   #                                     ]
22   # (TRANSITION_MOVE, n)                threads1.extend(threads0[j:j + n])
23   #                                     j += n
24   # (TRANSITION_DEL, n)                 del threads1[-n:]
25
26   TRANSITION_POP = 0
27   TRANSITION_DUP = 1
28   TRANSITION_MARK = 2
29   TRANSITION_MOVE = 3
30   #TRANSITION_DEL = 4
31  
32   def __init__(
33     self,
34     states = [([n_characters], [0], [0])],
35     actions = [(0, [])],
36     start_action = [] # can have multiple DFAs in same container
37   ):
38     # states: list of state_desc
39     # state_desc: (list of breaks, list of action to do, accept_threads)
40     # actions: list of action_desc
41     # action_desc: (state to go to next, compiled transition to do first)
42     # accept_threads: list of accepting thread numbers (in thread list)
43     self.states = states
44     self.actions = actions
45     self.start_action = start_action
46
47   def to_flex_dfa(self):
48     # we use a modified version of the transition routine, we do not know
49     # how many threads are active, so we just create null threads as they
50     # are referred to (resulting threads have current marks but no history),
51     # each thread is a list in forward order, not a stack in reverse order
52     def transit(transition):
53       nonlocal threads0, threads1, prefix_slop # note: also uses i
54       j = prefix_slop
55       for trans in transition:
56         if len(threads0) < j + trans[1]:
57           threads0.extend([[] for k in range(j + trans[1] - len(threads0))])
58         if trans[0] == DFA.TRANSITION_POP:
59           j += trans[1]
60         elif trans[0] == DFA.TRANSITION_DUP:
61           while j < trans[1]:
62             threads0[:0] = [None] * prefix_slop
63             threads1[:0] = [None] * prefix_slop
64             j += prefix_slop
65             prefix_slop *= 2
66           threads0[j - trans[1]:j] = [
67             list(k)
68             for k in threads0[j:j + trans[1]]
69           ]
70           j -= trans[1]
71         elif trans[0] == DFA.TRANSITION_MARK:
72           for k in range(j, j + trans[1]):
73             threads0[j].append(trans[2])
74         elif trans[0] == DFA.TRANSITION_MOVE:
75           threads1.extend(threads0[j:j + trans[1]])
76           j += trans[1]
77         #elif trans[0] == DFA.TRANSITION_DEL:
78         #  del threads1[-trans[1]:]
79         else:
80           assert False
81       assert j == len(threads0)
82       threads0, threads1 = threads1, threads0
83       del threads1[prefix_slop:]
84
85     threads0 = [None]
86     threads1 = [None]
87     prefix_slop = 1
88
89     # action numbers in the DFA become state numbers in the FlexDFA,
90     # with the start-action for each start-condition being copied into
91     # the correctly numbered slot (may cause duplicates), and then all
92     # actions reachable from these being copied to the subsequent slots
93     # (if start-action is reached again, uses the lower numbered copy)
94     flex_state_to_action = [0] + self.start_action
95     action_to_flex_state = {-1: 0} # see comment about state -1 below
96     for i in range(len(flex_state_to_action)):
97       action = flex_state_to_action[i]
98       if action not in action_to_flex_state:
99         action_to_flex_state[action] = i
100
101     # last start-condition is really end-of-buffer (EOB), it has only
102     # a dummy rule that accepts the null string and executes EOB action
103     eob_state = len(self.start_action)
104
105     # state 0 is the jam state, make it exist but have empty acclist
106     acclist = numpy.zeros((0x100,), numpy.uint16)
107     n_acclist = 0
108     accept = numpy.zeros((0x100,), numpy.uint16)
109     n_states = 1
110
111     # transitions[i, j] is transition on character j in state i
112     # in our way of thinking, 0 is don't care and -1 is failure
113     # however, in the flex way these are both 0 (don't care),
114     # the distinction being that failure has no associated action
115     # thus all entries of states are filled, with 0 as a catch-all
116     transitions = numpy.zeros((0x100, 0x101), numpy.uint16)
117
118     # generate acclist, accept and transition tables
119     while n_states < len(flex_state_to_action):
120       action = flex_state_to_action[n_states]
121       state, transition = self.actions[action]
122       #print('state', n_states, 'transition', transition)
123
124       del threads0[prefix_slop:]
125       transit(transition)
126       #print(threads0[prefix_slop:])
127       if n_states >= accept.shape[0]:
128         # extend accept
129         new_accept = numpy.zeros(
130           (accept.shape[0] * 2,),
131           numpy.uint16
132         )
133         new_accept[:accept.shape[0]] = accept
134         accept = new_accept
135       accept[n_states] = n_acclist
136       accept_set = set()
137       for k in [j for i in threads0[prefix_slop:] for j in i]:
138         if k != -1: # ignore user-defined groups
139           acc = k >> 1
140           if k & 1:
141             if (
142               (acc | flex_dfa.FlexDFA.YY_TRAILING_HEAD_MASK) not in accept_set
143             ):
144               # look back to start of trailing context, then accept
145               acc |= flex_dfa.FlexDFA.YY_TRAILING_MASK
146             # otherwise zero length trailing context, accept immediately
147           else:
148             # mark start of (hopefully safe) trailing context
149             acc |= flex_dfa.FlexDFA.YY_TRAILING_HEAD_MASK
150           if acc not in accept_set:
151             if n_acclist >= acclist.shape[0]:
152               # extend acclist
153               new_acclist = numpy.zeros(
154                 (acclist.shape[0] * 2,),
155                 numpy.uint16
156               )
157               new_acclist[:acclist.shape[0]] = acclist
158               acclist = new_acclist
159             acclist[n_acclist] = acc
160             n_acclist += 1
161             accept_set.add(acc)
162
163       # calculate transition row from self.state character-to-action table
164       if n_states >= transitions.shape[0]:
165         # extend transitions
166         new_transitions = numpy.zeros(
167           (transitions.shape[0] * 2, 0x101),
168           numpy.uint16
169         )
170         new_transitions[:transitions.shape[0], :] = transitions
171         transitions = new_transitions
172       breaks, actions, _ = self.states[state]
173       character0 = 0
174       for i in range(len(breaks)):
175         character1 = breaks[i]
176         next_action = actions[i]
177         if next_action in action_to_flex_state:
178           next_flex_state = action_to_flex_state[next_action]
179         else:
180           next_flex_state = len(flex_state_to_action)
181           action_to_flex_state[next_action] = next_flex_state
182           flex_state_to_action.append(next_action)
183         transitions[n_states, character0:character1] = next_flex_state
184         character0 = character1
185       assert character0 == 0x100
186
187       n_states += 1
188
189     # finalize the acclist and accept tables
190     acclist = acclist[:n_acclist]
191     if n_states >= accept.shape[0]:
192       new_accept = numpy.zeros(
193         (accept.shape[0] * 2,),
194         numpy.uint16
195       )
196       new_accept[:accept.shape[0]] = accept
197       accept = new_accept
198     accept[n_states] = n_acclist
199     accept = accept[:n_states + 1]
200
201     # finalize the transitions table
202     transitions = transitions[:n_states, :]
203     transitions[:, 0x100] = transitions[:, 0]
204     transitions[:, 0] = eob_state
205
206     # calculate default states by constructing minimum spanning tree
207     # heap contains n states todo followed by n_states - n states done
208     # each heap entry is [distance, hop count, state done, state todo]
209     heap = numpy.zeros((n_states, 4), numpy.uint16)
210     heap[:-1, 0] = numpy.sum(transitions[1:, :] != transitions[:1, :], 1)
211     heap[:-1, 3] = numpy.arange(1, n_states, dtype = numpy.uint16)
212     numpy_heap.heapify(heap, n_states - 1)
213     for n in range(n_states - 2, 0, -1):
214       if n % 100 == 0:
215         print('mst', n)
216
217       key = tuple(heap[n, :])
218       heap[n, :] = heap[0, :]
219       numpy_heap.bubble_down(heap, 0, key, n)
220       hop_count = heap[n, 1] + 1 # proposed hop_count is current hop_count + 1
221       state_done = heap[n, 3] # proposed state_done is current state_todo
222       dist = numpy.sum(
223         transitions[heap[:n, 3], :] !=
224         transitions[state_done:state_done + 1, :],
225         1
226       )
227       # although numpy cannot do lexicographic comparisons, check the
228       # first field via numpy to quickly generate a list of candidates
229       for i in numpy.nonzero(dist <= heap[:n, 0])[0]:
230         key = (dist[i], hop_count, state_done, heap[i, 3])
231         if key < tuple(heap[i, :]):
232           numpy_heap.bubble_up(heap, i, key)
233
234     # state 0 is the jam state, the EOB state will be added later on
235     states = numpy.zeros((n_states, 2), numpy.uint16) # base, def
236     entries = numpy.full((0x200, 2), -1, numpy.uint16) # nxt, chk
237     entries[:0x101, :] = 0 # jam state just returns to jam state
238     entries[0, 0] = eob_state # except for the EOB transition
239     entries_free = numpy.full(0x200, True, numpy.bool)
240     entries_free[:0x101] = False # account for the jam (don't care) state
241     n_entries = 0x101
242
243     # pack states in reverse order (larger distances first)
244     dupes = []
245     for i in range(n_states - 1):
246       if (n_states - i) % 100 == 0:
247         print('pack', n_states - i)
248  
249       if heap[i, 0] == 0:
250         # when copying another state, need to have the same base, though
251         # the base will not matter since there will be no entries, it is
252         # is because of the awkward way the compressed lookup is written
253         dupes.append(i)
254       else:
255         state_done = heap[i, 2]
256         state_todo = heap[i, 3]
257         indices = numpy.nonzero(
258           transitions[state_todo, :] != transitions[state_done, :]
259         )[0]
260
261         # make sure entries array is at least large enough to find a spot
262         while entries.shape[0] < n_entries + 0x101:
263           # extend entries, copying only n_entries entries
264           new_entries = numpy.full(
265             (entries.shape[0] * 2, 2),
266             -1,
267             numpy.uint16
268           )
269           new_entries[:n_entries, :] = entries[:n_entries, :]
270           entries = new_entries
271
272           # extend entries_free, copying only n_entries entries
273           new_entries_free = numpy.full(
274             (entries_free.shape[0] * 2,),
275             True,
276             numpy.bool
277           )
278           new_entries_free[:n_entries] = entries_free[:n_entries]
279           entries_free = new_entries_free
280
281         # find a suitable spot and store differences from default state
282         # generate a list of candidates via numpy for the first slot only
283         for start_index in numpy.nonzero(
284           entries_free[indices[0]:n_entries]
285         )[0]:
286           indices_offset = indices + start_index
287           if numpy.all(entries_free[indices_offset]):
288             break
289         else:
290           start_index = n_entries
291           indices_offset = indices + start_index
292         entries[indices_offset, 0] = transitions[state_todo, indices]
293         entries[indices_offset, 1] = state_todo
294         entries_free[indices_offset] = False
295         if n_entries < start_index + 0x101:
296           n_entries = start_index + 0x101
297
298         states[state_todo, 0] = start_index
299         states[state_todo, 1] = state_done
300     while len(dupes):
301       if len(dupes) % 100 == 0:
302         print('dupe', len(dupes))
303
304       i = dupes.pop()
305       state_done = heap[i, 2]
306       state_todo = heap[i, 3]
307       states[state_todo, 0] = states[state_done, 0]
308       states[state_todo, 1] = state_done
309
310     # finalize entries table
311     entries = entries[:n_entries, :]
312     print('n_entries', n_entries)
313
314     return flex_dfa.FlexDFA(accept, acclist, states, entries)
315
316   def match_text(self, text, i, start_index = 0):
317     def transit(transition):
318       nonlocal threads0, threads1, prefix_slop # note: also uses i
319       j = prefix_slop
320       for trans in transition:
321         if trans[0] == DFA.TRANSITION_POP:
322           j += trans[1]
323         elif trans[0] == DFA.TRANSITION_DUP:
324           while j < trans[1]:
325             threads0[:0] = [None] * prefix_slop
326             threads1[:0] = [None] * prefix_slop
327             j += prefix_slop
328             prefix_slop *= 2
329           threads0[j - trans[1]:j] = threads0[j:j + trans[1]]
330           j -= trans[1]
331         elif trans[0] == DFA.TRANSITION_MARK:
332           threads0[j:j + trans[1]] = [
333             (i, trans[2], thread)
334             for thread in threads0[j:j + trans[1]]
335           ]
336         elif trans[0] == DFA.TRANSITION_MOVE:
337           threads1.extend(threads0[j:j + trans[1]])
338           j += trans[1]
339         #elif trans[0] == DFA.TRANSITION_DEL:
340         #  del threads1[-trans[1]:]
341         else:
342           assert False
343       assert j == len(threads0)
344       threads0, threads1 = threads1, threads0
345       del threads1[prefix_slop:]
346
347     threads0 = [None, None]
348     threads1 = [None]
349     prefix_slop = 1
350
351     action = self.start_action[start_index]
352     while action != -1:
353       state, transition = self.actions[action]
354       #print('i', i, 'action', action, 'state', state, 'transition', transition)
355       transit(transition)
356       if state == 0:
357         # there is only one match, which is complete
358         assert len(threads0) == prefix_slop + 1
359         assert self.states[state][2] == [0]
360         return threads0[prefix_slop]
361       if i >= len(text):
362         # return best match we have, but not incomplete match
363         accept = self.states[state][2]
364         return threads0[prefix_slop + accept[0]] if len(accept) else None
365       action = self.states[state][1][
366         bisect.bisect_right(self.states[state][0], ord(text[i]))
367       ]
368       i += 1
369     return None
370
371   def match_yychunk(self, root, pos, off, yychunk_iter, start_index = 0):
372     if pos < 0:
373       pos, off = element.to_start_relative(root, pos, off)
374
375     def transit(transition):
376       nonlocal threads0, threads1, prefix_slop # note: also uses pos, off
377       j = prefix_slop
378       for trans in transition:
379         if trans[0] == DFA.TRANSITION_POP:
380           j += trans[1]
381         elif trans[0] == DFA.TRANSITION_DUP:
382           while j < trans[1]:
383             threads0[:0] = [None] * prefix_slop
384             threads1[:0] = [None] * prefix_slop
385             j += prefix_slop
386             prefix_slop *= 2
387           threads0[j - trans[1]:j] = threads0[j:j + trans[1]]
388           j -= trans[1]
389         elif trans[0] == DFA.TRANSITION_MARK:
390           threads0[j:j + trans[1]] = [
391             (pos, off, trans[2], thread)
392             for thread in threads0[j:j + trans[1]]
393           ]
394         elif trans[0] == DFA.TRANSITION_MOVE:
395           threads1.extend(threads0[j:j + trans[1]])
396           j += trans[1]
397         #elif trans[0] == DFA.TRANSITION_DEL:
398         #  del threads1[-trans[1]:]
399         else:
400           assert False
401       assert j == len(threads0)
402       threads0, threads1 = threads1, threads0
403       del threads1[prefix_slop:]
404
405     threads0 = [None, None]
406     threads1 = [None]
407     prefix_slop = 1
408
409     action = self.start_action[start_index]
410     text = element.get_text(root, pos)
411     while action != -1:
412       state, transition = self.actions[action]
413       transit(transition)
414       if state == 0:
415         # there is only one match, which is complete
416         assert len(threads0) == prefix_slop + 1
417         assert self.states[state][2] == [0]
418         return threads0[prefix_slop]
419       while off >= len(text):
420         if pos < len(root):
421           pos += 1
422           off = 0
423         else:
424           try:
425             next(yychunk_iter)
426           except StopIteration:
427             # return best match we have, but not incomplete match
428             accept = self.states[state][2]
429             return threads0[prefix_slop + accept[0]] if len(accept) else None
430           text = element.get_text(root, pos)
431       #print(
432       #  'state {0:d} pos {1:d} off {2:d} text "{3:s}"'.format(
433       #    state,
434       #    pos,
435       #    off,
436       #    text.replace('\n', '$')
437       #  )
438       #)
439       action = self.states[state][1][
440         bisect.bisect_right(self.states[state][0], ord(text[off]))
441       ]
442       off += 1
443     return None
444
445   #def yylex(self, root, pos, off, factory, yychunk_iter):
446   #  if pos < 0:
447   #    pos, off = element.to_start_relative(root, pos, off)
448
449   #  while True:
450   #    # note: pointers must be kept start relative during the below call,
451   #    # because it extends the following text by calling the yychunk_iter
452   #    thread = self.match_yychunk(root, pos, off, yychunk_iter)
453   #    if thread is None:
454   #      break
455   #    stack = []
456   #    while True:
457   #      pos, off, mark_value, thread = thread
458   #      group_index = mark_value >> 1
459   #      if (mark_value & 1) != 0:
460   #        end_pos, end_off = element.to_end_relative(root, pos, off)
461   #        stack.append((end_pos, end_off, group_index))
462   #      else:
463   #        end_pos, end_off, temp = stack.pop()
464   #        assert temp == group_index
465   #        if len(stack) == 0:
466   #          break
467   #        tag, kwargs = self.groups[group_index]
468   #        if tag != '':
469   #          work.apply_markup(
470   #            root,
471   #            pos,
472   #            off,
473   #            end_pos,
474   #            end_off,
475   #            factory,
476   #            tag,
477   #            **kwargs
478   #          )
479   #    # note: pointers must be kept end relative during the below call,
480   #    # because it modifies the preceding text by calling apply_markup()
481   #    yield end_pos, end_off, group_index
482   #    pos, off = element.to_start_relative(root, end_pos, end_off)
483
484   def __repr__(self):
485     return 'dfa.DFA({0:s}, {1:s}, {2:s})'.format(
486       repr(self.states),
487       repr(self.actions),
488       repr(self.start_action)
489     )