9 # defines the alphabet size, set this to 0x11000 for unicode
14 # instructions to transform thread list from a multistate to next multistate
15 # (TRANSITION_POP, n) j += n
16 # (TRANSITION_DUP, n) threads0[j - n:j] = threads0[j:j + n]
18 # (TRANSITION_MARK, n, mark_value) threads0[j:j + n] = [
20 # for thread in threads0[j:j + n]
22 # (TRANSITION_MOVE, n) threads1.extend(threads0[j:j + n])
24 # (TRANSITION_DEL, n) del threads1[-n:]
34 states = [([n_characters], [0], [0])],
36 start_action = [] # can have multiple DFAs in same container
38 # states: list of state_desc
39 # state_desc: (list of breaks, list of action to do, accept_threads)
40 # actions: list of action_desc
41 # action_desc: (state to go to next, compiled transition to do first)
42 # accept_threads: list of accepting thread numbers (in thread list)
44 self.actions = actions
45 self.start_action = start_action
47 def to_flex_dfa(self):
48 # we use a modified version of the transition routine, we do not know
49 # how many threads are active, so we just create null threads as they
50 # are referred to (resulting threads have current marks but no history),
51 # each thread is a list in forward order, not a stack in reverse order
52 def transit(transition):
53 nonlocal threads0, threads1, prefix_slop # note: also uses i
55 for trans in transition:
56 if len(threads0) < j + trans[1]:
57 threads0.extend([[] for k in range(j + trans[1] - len(threads0))])
58 if trans[0] == DFA.TRANSITION_POP:
60 elif trans[0] == DFA.TRANSITION_DUP:
62 threads0[:0] = [None] * prefix_slop
63 threads1[:0] = [None] * prefix_slop
66 threads0[j - trans[1]:j] = [
68 for k in threads0[j:j + trans[1]]
71 elif trans[0] == DFA.TRANSITION_MARK:
72 for k in range(j, j + trans[1]):
73 threads0[j].append(trans[2])
74 elif trans[0] == DFA.TRANSITION_MOVE:
75 threads1.extend(threads0[j:j + trans[1]])
77 #elif trans[0] == DFA.TRANSITION_DEL:
78 # del threads1[-trans[1]:]
81 assert j == len(threads0)
82 threads0, threads1 = threads1, threads0
83 del threads1[prefix_slop:]
89 # action numbers in the DFA become state numbers in the FlexDFA,
90 # with the start-action for each start-condition being copied into
91 # the correctly numbered slot (may cause duplicates), and then all
92 # actions reachable from these being copied to the subsequent slots
93 # (if start-action is reached again, uses the lower numbered copy)
94 flex_state_to_action = [0] + self.start_action
95 action_to_flex_state = {-1: 0} # see comment about state -1 below
96 for i in range(len(flex_state_to_action)):
97 action = flex_state_to_action[i]
98 if action not in action_to_flex_state:
99 action_to_flex_state[action] = i
101 # last start-condition is really end-of-buffer (EOB), it has only
102 # a dummy rule that accepts the null string and executes EOB action
103 eob_state = len(self.start_action)
105 # state 0 is the jam state, make it exist but have empty acclist
106 acclist = numpy.zeros((0x100,), numpy.uint16)
108 accept = numpy.zeros((0x100,), numpy.uint16)
111 # transitions[i, j] is transition on character j in state i
112 # in our way of thinking, 0 is don't care and -1 is failure
113 # however, in the flex way these are both 0 (don't care),
114 # the distinction being that failure has no associated action
115 # thus all entries of states are filled, with 0 as a catch-all
116 transitions = numpy.zeros((0x100, 0x101), numpy.uint16)
118 # generate acclist, accept and transition tables
119 while n_states < len(flex_state_to_action):
120 action = flex_state_to_action[n_states]
121 state, transition = self.actions[action]
122 #print('state', n_states, 'transition', transition)
124 del threads0[prefix_slop:]
126 #print(threads0[prefix_slop:])
127 if n_states >= accept.shape[0]:
129 new_accept = numpy.zeros(
130 (accept.shape[0] * 2,),
133 new_accept[:accept.shape[0]] = accept
135 accept[n_states] = n_acclist
137 for k in [j for i in threads0[prefix_slop:] for j in i]:
138 if k != -1: # ignore user-defined groups
142 (acc | flex_dfa.FlexDFA.YY_TRAILING_HEAD_MASK) not in accept_set
144 # look back to start of trailing context, then accept
145 acc |= flex_dfa.FlexDFA.YY_TRAILING_MASK
146 # otherwise zero length trailing context, accept immediately
148 # mark start of (hopefully safe) trailing context
149 acc |= flex_dfa.FlexDFA.YY_TRAILING_HEAD_MASK
150 if acc not in accept_set:
151 if n_acclist >= acclist.shape[0]:
153 new_acclist = numpy.zeros(
154 (acclist.shape[0] * 2,),
157 new_acclist[:acclist.shape[0]] = acclist
158 acclist = new_acclist
159 acclist[n_acclist] = acc
163 # calculate transition row from self.state character-to-action table
164 if n_states >= transitions.shape[0]:
166 new_transitions = numpy.zeros(
167 (transitions.shape[0] * 2, 0x101),
170 new_transitions[:transitions.shape[0], :] = transitions
171 transitions = new_transitions
172 breaks, actions, _ = self.states[state]
174 for i in range(len(breaks)):
175 character1 = breaks[i]
176 next_action = actions[i]
177 if next_action in action_to_flex_state:
178 next_flex_state = action_to_flex_state[next_action]
180 next_flex_state = len(flex_state_to_action)
181 action_to_flex_state[next_action] = next_flex_state
182 flex_state_to_action.append(next_action)
183 transitions[n_states, character0:character1] = next_flex_state
184 character0 = character1
185 assert character0 == 0x100
189 # finalize the acclist and accept tables
190 acclist = acclist[:n_acclist]
191 if n_states >= accept.shape[0]:
192 new_accept = numpy.zeros(
193 (accept.shape[0] * 2,),
196 new_accept[:accept.shape[0]] = accept
198 accept[n_states] = n_acclist
199 accept = accept[:n_states + 1]
201 # finalize the transitions table
202 transitions = transitions[:n_states, :]
203 transitions[:, 0x100] = transitions[:, 0]
204 transitions[:, 0] = eob_state
206 # calculate default states by constructing minimum spanning tree
207 # heap contains n states todo followed by n_states - n states done
208 # each heap entry is [distance, hop count, state done, state todo]
209 heap = numpy.zeros((n_states, 4), numpy.uint16)
210 heap[:-1, 0] = numpy.sum(transitions[1:, :] != transitions[:1, :], 1)
211 heap[:-1, 3] = numpy.arange(1, n_states, dtype = numpy.uint16)
212 numpy_heap.heapify(heap, n_states - 1)
213 for n in range(n_states - 2, 0, -1):
217 key = tuple(heap[n, :])
218 heap[n, :] = heap[0, :]
219 numpy_heap.bubble_down(heap, 0, key, n)
220 hop_count = heap[n, 1] + 1 # proposed hop_count is current hop_count + 1
221 state_done = heap[n, 3] # proposed state_done is current state_todo
223 transitions[heap[:n, 3], :] !=
224 transitions[state_done:state_done + 1, :],
227 # although numpy cannot do lexicographic comparisons, check the
228 # first field via numpy to quickly generate a list of candidates
229 for i in numpy.nonzero(dist <= heap[:n, 0])[0]:
230 key = (dist[i], hop_count, state_done, heap[i, 3])
231 if key < tuple(heap[i, :]):
232 numpy_heap.bubble_up(heap, i, key)
234 # state 0 is the jam state, the EOB state will be added later on
235 states = numpy.zeros((n_states, 2), numpy.uint16) # base, def
236 entries = numpy.full((0x200, 2), -1, numpy.uint16) # nxt, chk
237 entries[:0x101, :] = 0 # jam state just returns to jam state
238 entries[0, 0] = eob_state # except for the EOB transition
239 entries_free = numpy.full(0x200, True, numpy.bool)
240 entries_free[:0x101] = False # account for the jam (don't care) state
243 # pack states in reverse order (larger distances first)
245 for i in range(n_states - 1):
246 if (n_states - i) % 100 == 0:
247 print('pack', n_states - i)
250 # when copying another state, need to have the same base, though
251 # the base will not matter since there will be no entries, it is
252 # is because of the awkward way the compressed lookup is written
255 state_done = heap[i, 2]
256 state_todo = heap[i, 3]
257 indices = numpy.nonzero(
258 transitions[state_todo, :] != transitions[state_done, :]
261 # make sure entries array is at least large enough to find a spot
262 while entries.shape[0] < n_entries + 0x101:
263 # extend entries, copying only n_entries entries
264 new_entries = numpy.full(
265 (entries.shape[0] * 2, 2),
269 new_entries[:n_entries, :] = entries[:n_entries, :]
270 entries = new_entries
272 # extend entries_free, copying only n_entries entries
273 new_entries_free = numpy.full(
274 (entries_free.shape[0] * 2,),
278 new_entries_free[:n_entries] = entries_free[:n_entries]
279 entries_free = new_entries_free
281 # find a suitable spot and store differences from default state
282 # generate a list of candidates via numpy for the first slot only
283 for start_index in numpy.nonzero(
284 entries_free[indices[0]:n_entries]
286 indices_offset = indices + start_index
287 if numpy.all(entries_free[indices_offset]):
290 start_index = n_entries
291 indices_offset = indices + start_index
292 entries[indices_offset, 0] = transitions[state_todo, indices]
293 entries[indices_offset, 1] = state_todo
294 entries_free[indices_offset] = False
295 if n_entries < start_index + 0x101:
296 n_entries = start_index + 0x101
298 states[state_todo, 0] = start_index
299 states[state_todo, 1] = state_done
301 if len(dupes) % 100 == 0:
302 print('dupe', len(dupes))
305 state_done = heap[i, 2]
306 state_todo = heap[i, 3]
307 states[state_todo, 0] = states[state_done, 0]
308 states[state_todo, 1] = state_done
310 # finalize entries table
311 entries = entries[:n_entries, :]
312 print('n_entries', n_entries)
314 return flex_dfa.FlexDFA(accept, acclist, states, entries)
316 def match_text(self, text, i, start_index = 0):
317 def transit(transition):
318 nonlocal threads0, threads1, prefix_slop # note: also uses i
320 for trans in transition:
321 if trans[0] == DFA.TRANSITION_POP:
323 elif trans[0] == DFA.TRANSITION_DUP:
325 threads0[:0] = [None] * prefix_slop
326 threads1[:0] = [None] * prefix_slop
329 threads0[j - trans[1]:j] = threads0[j:j + trans[1]]
331 elif trans[0] == DFA.TRANSITION_MARK:
332 threads0[j:j + trans[1]] = [
333 (i, trans[2], thread)
334 for thread in threads0[j:j + trans[1]]
336 elif trans[0] == DFA.TRANSITION_MOVE:
337 threads1.extend(threads0[j:j + trans[1]])
339 #elif trans[0] == DFA.TRANSITION_DEL:
340 # del threads1[-trans[1]:]
343 assert j == len(threads0)
344 threads0, threads1 = threads1, threads0
345 del threads1[prefix_slop:]
347 threads0 = [None, None]
351 action = self.start_action[start_index]
353 state, transition = self.actions[action]
354 #print('i', i, 'action', action, 'state', state, 'transition', transition)
357 # there is only one match, which is complete
358 assert len(threads0) == prefix_slop + 1
359 assert self.states[state][2] == [0]
360 return threads0[prefix_slop]
362 # return best match we have, but not incomplete match
363 accept = self.states[state][2]
364 return threads0[prefix_slop + accept[0]] if len(accept) else None
365 action = self.states[state][1][
366 bisect.bisect_right(self.states[state][0], ord(text[i]))
371 def match_yychunk(self, root, pos, off, yychunk_iter, start_index = 0):
373 pos, off = element.to_start_relative(root, pos, off)
375 def transit(transition):
376 nonlocal threads0, threads1, prefix_slop # note: also uses pos, off
378 for trans in transition:
379 if trans[0] == DFA.TRANSITION_POP:
381 elif trans[0] == DFA.TRANSITION_DUP:
383 threads0[:0] = [None] * prefix_slop
384 threads1[:0] = [None] * prefix_slop
387 threads0[j - trans[1]:j] = threads0[j:j + trans[1]]
389 elif trans[0] == DFA.TRANSITION_MARK:
390 threads0[j:j + trans[1]] = [
391 (pos, off, trans[2], thread)
392 for thread in threads0[j:j + trans[1]]
394 elif trans[0] == DFA.TRANSITION_MOVE:
395 threads1.extend(threads0[j:j + trans[1]])
397 #elif trans[0] == DFA.TRANSITION_DEL:
398 # del threads1[-trans[1]:]
401 assert j == len(threads0)
402 threads0, threads1 = threads1, threads0
403 del threads1[prefix_slop:]
405 threads0 = [None, None]
409 action = self.start_action[start_index]
410 text = element.get_text(root, pos)
412 state, transition = self.actions[action]
415 # there is only one match, which is complete
416 assert len(threads0) == prefix_slop + 1
417 assert self.states[state][2] == [0]
418 return threads0[prefix_slop]
419 while off >= len(text):
426 except StopIteration:
427 # return best match we have, but not incomplete match
428 accept = self.states[state][2]
429 return threads0[prefix_slop + accept[0]] if len(accept) else None
430 text = element.get_text(root, pos)
432 # 'state {0:d} pos {1:d} off {2:d} text "{3:s}"'.format(
436 # text.replace('\n', '$')
439 action = self.states[state][1][
440 bisect.bisect_right(self.states[state][0], ord(text[off]))
445 #def yylex(self, root, pos, off, factory, yychunk_iter):
447 # pos, off = element.to_start_relative(root, pos, off)
450 # # note: pointers must be kept start relative during the below call,
451 # # because it extends the following text by calling the yychunk_iter
452 # thread = self.match_yychunk(root, pos, off, yychunk_iter)
457 # pos, off, mark_value, thread = thread
458 # group_index = mark_value >> 1
459 # if (mark_value & 1) != 0:
460 # end_pos, end_off = element.to_end_relative(root, pos, off)
461 # stack.append((end_pos, end_off, group_index))
463 # end_pos, end_off, temp = stack.pop()
464 # assert temp == group_index
465 # if len(stack) == 0:
467 # tag, kwargs = self.groups[group_index]
479 # # note: pointers must be kept end relative during the below call,
480 # # because it modifies the preceding text by calling apply_markup()
481 # yield end_pos, end_off, group_index
482 # pos, off = element.to_start_relative(root, end_pos, end_off)
485 return 'dfa.DFA({0:s}, {1:s}, {2:s})'.format(
488 repr(self.start_action)