dfa.py

   1 import bisect
   2 import element
   3 import flex_dfa
   4 import numpy
   5 import numpy_heap
   6 import work
   7 import sys
   8
   9 # defines the alphabet size, set this to 0x11000 for unicode
  10 n_characters = 0x100
  11
  12 class DFA:
  13   # transition classes:
  14   # instructions to transform thread list from a multistate to next multistate
  15   # (TRANSITION_POP, n)                 j += n
  16   # (TRANSITION_DUP, n)                 threads0[j - n:j] = threads0[j:j + n]
  17   #                                     j -= n
  18   # (TRANSITION_MARK, n, mark_value)    threads0[j:j + n] = [
  19   #                                       (i, mark, thread)
  20   #                                       for thread in threads0[j:j + n]
  21   #                                     ]
  22   # (TRANSITION_MOVE, n)                threads1.extend(threads0[j:j + n])
  23   #                                     j += n
  24   # (TRANSITION_DEL, n)                 del threads1[-n:]
  25
  26   TRANSITION_POP = 0
  27   TRANSITION_DUP = 1
  28   TRANSITION_MARK = 2
  29   TRANSITION_MOVE = 3
  30   #TRANSITION_DEL = 4
  31
  32   def __init__(
  33     self,
  34     states = [([n_characters], [0], [0])],
  35     actions = [(0, [])],
  36     start_action = [] # can have multiple DFAs in same container
  37   ):
  38     # states: list of state_desc
  39     # state_desc: (list of breaks, list of action to do, accept_threads)
  40     # actions: list of action_desc
  41     # action_desc: (state to go to next, compiled transition to do first)
  42     # accept_threads: list of accepting thread numbers (in thread list)
  43     self.states = states
  44     self.actions = actions
  45     self.start_action = start_action
  46
  47   def to_flex_dfa(self):
  48     # we use a modified version of the transition routine, we do not know
  49     # how many threads are active, so we just create null threads as they
  50     # are referred to (resulting threads have current marks but no history),
  51     # each thread is a list in forward order, not a stack in reverse order
  52     def transit(transition):
  53       nonlocal threads0, threads1, prefix_slop # note: also uses i
  54       j = prefix_slop
  55       for trans in transition:
  56         if len(threads0) < j + trans[1]:
  57           threads0.extend([[] for k in range(j + trans[1] - len(threads0))])
  58         if trans[0] == DFA.TRANSITION_POP:
  59           j += trans[1]
  60         elif trans[0] == DFA.TRANSITION_DUP:
  61           while j < trans[1]:
  62             threads0[:0] = [None] * prefix_slop
  63             threads1[:0] = [None] * prefix_slop
  64             j += prefix_slop
  65             prefix_slop *= 2
  66           threads0[j - trans[1]:j] = [
  67             list(k)
  68             for k in threads0[j:j + trans[1]]
  69           ]
  70           j -= trans[1]
  71         elif trans[0] == DFA.TRANSITION_MARK:
  72           for k in range(j, j + trans[1]):
  73             threads0[j].append(trans[2])
  74         elif trans[0] == DFA.TRANSITION_MOVE:
  75           threads1.extend(threads0[j:j + trans[1]])
  76           j += trans[1]
  77         #elif trans[0] == DFA.TRANSITION_DEL:
  78         #  del threads1[-trans[1]:]
  79         else:
  80           assert False
  81       assert j == len(threads0)
  82       threads0, threads1 = threads1, threads0
  83       del threads1[prefix_slop:]
  84
  85     threads0 = [None]
  86     threads1 = [None]
  87     prefix_slop = 1
  88
  89     # action numbers in the DFA become state numbers in the FlexDFA,
  90     # with the start-action for each start-condition being copied into
  91     # the correctly numbered slot (may cause duplicates), and then all
  92     # actions reachable from these being copied to the subsequent slots
  93     # (if start-action is reached again, uses the lower numbered copy)
  94     flex_state_to_action = [0] + self.start_action
  95     action_to_flex_state = {-1: 0} # see comment about state -1 below
  96     for i in range(len(flex_state_to_action)):
  97       action = flex_state_to_action[i]
  98       if action not in action_to_flex_state:
  99         action_to_flex_state[action] = i
 100
 101     # last start-condition is really end-of-buffer (EOB), it has only
 102     # a dummy rule that accepts the null string and executes EOB action
 103     eob_state = len(self.start_action)
 104
 105     # state 0 is the jam state, make it exist but have empty acclist
 106     acclist = numpy.zeros((0x100,), numpy.uint16)
 107     n_acclist = 0
 108     accept = numpy.zeros((0x100,), numpy.uint16)
 109     n_states = 1
 110
 111     # transitions[i, j] is transition on character j in state i
 112     # in our way of thinking, 0 is don't care and -1 is failure
 113     # however, in the flex way these are both 0 (don't care),
 114     # the distinction being that failure has no associated action
 115     # thus all entries of states are filled, with 0 as a catch-all
 116     transitions = numpy.zeros((0x100, 0x101), numpy.uint16)
 117
 118     # generate acclist, accept and transition tables
 119     while n_states < len(flex_state_to_action):
 120       action = flex_state_to_action[n_states]
 121       state, transition = self.actions[action]
 122       #print('state', n_states, 'transition', transition)
 123
 124       del threads0[prefix_slop:]
 125       transit(transition)
 126       #print(threads0[prefix_slop:])
 127       if n_states >= accept.shape[0]:
 128         # extend accept
 129         new_accept = numpy.zeros(
 130           (accept.shape[0] * 2,),
 131           numpy.uint16
 132         )
 133         new_accept[:accept.shape[0]] = accept
 134         accept = new_accept
 135       accept[n_states] = n_acclist
 136       accept_set = set()
 137       for k in [j for i in threads0[prefix_slop:] for j in i]:
 138         if k != -1: # ignore user-defined groups
 139           acc = k >> 1
 140           if k & 1:
 141             if (
 142               (acc | flex_dfa.FlexDFA.YY_TRAILING_HEAD_MASK) not in accept_set
 143             ):
 144               # look back to start of trailing context, then accept
 145               acc |= flex_dfa.FlexDFA.YY_TRAILING_MASK
 146             # otherwise zero length trailing context, accept immediately
 147           else:
 148             # mark start of (hopefully safe) trailing context
 149             acc |= flex_dfa.FlexDFA.YY_TRAILING_HEAD_MASK
 150           if acc not in accept_set:
 151             if n_acclist >= acclist.shape[0]:
 152               # extend acclist
 153               new_acclist = numpy.zeros(
 154                 (acclist.shape[0] * 2,),
 155                 numpy.uint16
 156               )
 157               new_acclist[:acclist.shape[0]] = acclist
 158               acclist = new_acclist
 159             acclist[n_acclist] = acc
 160             n_acclist += 1
 161             accept_set.add(acc)
 162
 163       # calculate transition row from self.state character-to-action table
 164       if n_states >= transitions.shape[0]:
 165         # extend transitions
 166         new_transitions = numpy.zeros(
 167           (transitions.shape[0] * 2, 0x101),
 168           numpy.uint16
 169         )
 170         new_transitions[:transitions.shape[0], :] = transitions
 171         transitions = new_transitions
 172       breaks, actions, _ = self.states[state]
 173       character0 = 0
 174       for i in range(len(breaks)):
 175         character1 = breaks[i]
 176         next_action = actions[i]
 177         if next_action in action_to_flex_state:
 178           next_flex_state = action_to_flex_state[next_action]
 179         else:
 180           next_flex_state = len(flex_state_to_action)
 181           action_to_flex_state[next_action] = next_flex_state
 182           flex_state_to_action.append(next_action)
 183         transitions[n_states, character0:character1] = next_flex_state
 184         character0 = character1
 185       assert character0 == 0x100
 186
 187       n_states += 1
 188
 189     # finalize the acclist and accept tables
 190     acclist = acclist[:n_acclist]
 191     if n_states >= accept.shape[0]:
 192       new_accept = numpy.zeros(
 193         (accept.shape[0] * 2,),
 194         numpy.uint16
 195       )
 196       new_accept[:accept.shape[0]] = accept
 197       accept = new_accept
 198     accept[n_states] = n_acclist
 199     accept = accept[:n_states + 1]
 200
 201     # finalize the transitions table
 202     transitions = transitions[:n_states, :]
 203     transitions[:, 0x100] = transitions[:, 0]
 204     transitions[:, 0] = eob_state
 205
 206     # calculate default states by constructing minimum spanning tree
 207     # heap contains n states todo followed by n_states - n states done
 208     # each heap entry is [distance, hop count, state done, state todo]
 209     heap = numpy.zeros((n_states, 4), numpy.uint16)
 210     heap[:-1, 0] = numpy.sum(transitions[1:, :] != transitions[:1, :], 1)
 211     heap[:-1, 3] = numpy.arange(1, n_states, dtype = numpy.uint16)
 212     numpy_heap.heapify(heap, n_states - 1)
 213     for n in range(n_states - 2, 0, -1):
 214       if n % 100 == 0:
 215         print('mst', n)
 216
 217       key = tuple(heap[n, :])
 218       heap[n, :] = heap[0, :]
 219       numpy_heap.bubble_down(heap, 0, key, n)
 220       hop_count = heap[n, 1] + 1 # proposed hop_count is current hop_count + 1
 221       state_done = heap[n, 3] # proposed state_done is current state_todo
 222       dist = numpy.sum(
 223         transitions[heap[:n, 3], :] !=
 224         transitions[state_done:state_done + 1, :],
 225         1
 226       )
 227       # although numpy cannot do lexicographic comparisons, check the
 228       # first field via numpy to quickly generate a list of candidates
 229       for i in numpy.nonzero(dist <= heap[:n, 0])[0]:
 230         key = (dist[i], hop_count, state_done, heap[i, 3])
 231         if key < tuple(heap[i, :]):
 232           numpy_heap.bubble_up(heap, i, key)
 233
 234     # state 0 is the jam state, the EOB state will be added later on
 235     states = numpy.zeros((n_states, 2), numpy.uint16) # base, def
 236     entries = numpy.full((0x200, 2), -1, numpy.uint16) # nxt, chk
 237     entries[:0x101, :] = 0 # jam state just returns to jam state
 238     entries[0, 0] = eob_state # except for the EOB transition
 239     entries_free = numpy.full(0x200, True, numpy.bool)
 240     entries_free[:0x101] = False # account for the jam (don't care) state
 241     n_entries = 0x101
 242
 243     # pack states in reverse order (larger distances first)
 244     dupes = []
 245     for i in range(n_states - 1):
 246       if (n_states - i) % 100 == 0:
 247         print('pack', n_states - i)
 248
 249       if heap[i, 0] == 0:
 250         # when copying another state, need to have the same base, though
 251         # the base will not matter since there will be no entries, it is
 252         # is because of the awkward way the compressed lookup is written
 253         dupes.append(i)
 254       else:
 255         state_done = heap[i, 2]
 256         state_todo = heap[i, 3]
 257         indices = numpy.nonzero(
 258           transitions[state_todo, :] != transitions[state_done, :]
 259         )[0]
 260
 261         # make sure entries array is at least large enough to find a spot
 262         while entries.shape[0] < n_entries + 0x101:
 263           # extend entries, copying only n_entries entries
 264           new_entries = numpy.full(
 265             (entries.shape[0] * 2, 2),
 266             -1,
 267             numpy.uint16
 268           )
 269           new_entries[:n_entries, :] = entries[:n_entries, :]
 270           entries = new_entries
 271
 272           # extend entries_free, copying only n_entries entries
 273           new_entries_free = numpy.full(
 274             (entries_free.shape[0] * 2,),
 275             True,
 276             numpy.bool
 277           )
 278           new_entries_free[:n_entries] = entries_free[:n_entries]
 279           entries_free = new_entries_free
 280
 281         # find a suitable spot and store differences from default state
 282         # generate a list of candidates via numpy for the first slot only
 283         for start_index in numpy.nonzero(
 284           entries_free[indices[0]:n_entries]
 285         )[0]:
 286           indices_offset = indices + start_index
 287           if numpy.all(entries_free[indices_offset]):
 288             break
 289         else:
 290           start_index = n_entries
 291           indices_offset = indices + start_index
 292         entries[indices_offset, 0] = transitions[state_todo, indices]
 293         entries[indices_offset, 1] = state_todo
 294         entries_free[indices_offset] = False
 295         if n_entries < start_index + 0x101:
 296           n_entries = start_index + 0x101
 297
 298         states[state_todo, 0] = start_index
 299         states[state_todo, 1] = state_done
 300     while len(dupes):
 301       if len(dupes) % 100 == 0:
 302         print('dupe', len(dupes))
 303
 304       i = dupes.pop()
 305       state_done = heap[i, 2]
 306       state_todo = heap[i, 3]
 307       states[state_todo, 0] = states[state_done, 0]
 308       states[state_todo, 1] = state_done
 309
 310     # finalize entries table
 311     entries = entries[:n_entries, :]
 312     print('n_entries', n_entries)
 313
 314     return flex_dfa.FlexDFA(accept, acclist, states, entries)
 315
 316   def match_text(self, text, i, start_index = 0):
 317     def transit(transition):
 318       nonlocal threads0, threads1, prefix_slop # note: also uses i
 319       j = prefix_slop
 320       for trans in transition:
 321         if trans[0] == DFA.TRANSITION_POP:
 322           j += trans[1]
 323         elif trans[0] == DFA.TRANSITION_DUP:
 324           while j < trans[1]:
 325             threads0[:0] = [None] * prefix_slop
 326             threads1[:0] = [None] * prefix_slop
 327             j += prefix_slop
 328             prefix_slop *= 2
 329           threads0[j - trans[1]:j] = threads0[j:j + trans[1]]
 330           j -= trans[1]
 331         elif trans[0] == DFA.TRANSITION_MARK:
 332           threads0[j:j + trans[1]] = [
 333             (i, trans[2], thread)
 334             for thread in threads0[j:j + trans[1]]
 335           ]
 336         elif trans[0] == DFA.TRANSITION_MOVE:
 337           threads1.extend(threads0[j:j + trans[1]])
 338           j += trans[1]
 339         #elif trans[0] == DFA.TRANSITION_DEL:
 340         #  del threads1[-trans[1]:]
 341         else:
 342           assert False
 343       assert j == len(threads0)
 344       threads0, threads1 = threads1, threads0
 345       del threads1[prefix_slop:]
 346
 347     threads0 = [None, None]
 348     threads1 = [None]
 349     prefix_slop = 1
 350
 351     action = self.start_action[start_index]
 352     while action != -1:
 353       state, transition = self.actions[action]
 354       #print('i', i, 'action', action, 'state', state, 'transition', transition)
 355       transit(transition)
 356       if state == 0:
 357         # there is only one match, which is complete
 358         assert len(threads0) == prefix_slop + 1
 359         assert self.states[state][2] == [0]
 360         return threads0[prefix_slop]
 361       if i >= len(text):
 362         # return best match we have, but not incomplete match
 363         accept = self.states[state][2]
 364         return threads0[prefix_slop + accept[0]] if len(accept) else None
 365       action = self.states[state][1][
 366         bisect.bisect_right(self.states[state][0], ord(text[i]))
 367       ]
 368       i += 1
 369     return None
 370
 371   def match_yychunk(self, root, pos, off, yychunk_iter, start_index = 0):
 372     if pos < 0:
 373       pos, off = element.to_start_relative(root, pos, off)
 374
 375     def transit(transition):
 376       nonlocal threads0, threads1, prefix_slop # note: also uses pos, off
 377       j = prefix_slop
 378       for trans in transition:
 379         if trans[0] == DFA.TRANSITION_POP:
 380           j += trans[1]
 381         elif trans[0] == DFA.TRANSITION_DUP:
 382           while j < trans[1]:
 383             threads0[:0] = [None] * prefix_slop
 384             threads1[:0] = [None] * prefix_slop
 385             j += prefix_slop
 386             prefix_slop *= 2
 387           threads0[j - trans[1]:j] = threads0[j:j + trans[1]]
 388           j -= trans[1]
 389         elif trans[0] == DFA.TRANSITION_MARK:
 390           threads0[j:j + trans[1]] = [
 391             (pos, off, trans[2], thread)
 392             for thread in threads0[j:j + trans[1]]
 393           ]
 394         elif trans[0] == DFA.TRANSITION_MOVE:
 395           threads1.extend(threads0[j:j + trans[1]])
 396           j += trans[1]
 397         #elif trans[0] == DFA.TRANSITION_DEL:
 398         #  del threads1[-trans[1]:]
 399         else:
 400           assert False
 401       assert j == len(threads0)
 402       threads0, threads1 = threads1, threads0
 403       del threads1[prefix_slop:]
 404
 405     threads0 = [None, None]
 406     threads1 = [None]
 407     prefix_slop = 1
 408
 409     action = self.start_action[start_index]
 410     text = element.get_text(root, pos)
 411     while action != -1:
 412       state, transition = self.actions[action]
 413       transit(transition)
 414       if state == 0:
 415         # there is only one match, which is complete
 416         assert len(threads0) == prefix_slop + 1
 417         assert self.states[state][2] == [0]
 418         return threads0[prefix_slop]
 419       while off >= len(text):
 420         if pos < len(root):
 421           pos += 1
 422           off = 0
 423         else:
 424           try:
 425             next(yychunk_iter)
 426           except StopIteration:
 427             # return best match we have, but not incomplete match
 428             accept = self.states[state][2]
 429             return threads0[prefix_slop + accept[0]] if len(accept) else None
 430           text = element.get_text(root, pos)
 431       #print(
 432       #  'state {0:d} pos {1:d} off {2:d} text "{3:s}"'.format(
 433       #    state,
 434       #    pos,
 435       #    off,
 436       #    text.replace('\n', '$')
 437       #  )
 438       #)
 439       action = self.states[state][1][
 440         bisect.bisect_right(self.states[state][0], ord(text[off]))
 441       ]
 442       off += 1
 443     return None
 444
 445   #def yylex(self, root, pos, off, factory, yychunk_iter):
 446   #  if pos < 0:
 447   #    pos, off = element.to_start_relative(root, pos, off)
 448
 449   #  while True:
 450   #    # note: pointers must be kept start relative during the below call,
 451   #    # because it extends the following text by calling the yychunk_iter
 452   #    thread = self.match_yychunk(root, pos, off, yychunk_iter)
 453   #    if thread is None:
 454   #      break
 455   #    stack = []
 456   #    while True:
 457   #      pos, off, mark_value, thread = thread
 458   #      group_index = mark_value >> 1
 459   #      if (mark_value & 1) != 0:
 460   #        end_pos, end_off = element.to_end_relative(root, pos, off)
 461   #        stack.append((end_pos, end_off, group_index))
 462   #      else:
 463   #        end_pos, end_off, temp = stack.pop()
 464   #        assert temp == group_index
 465   #        if len(stack) == 0:
 466   #          break
 467   #        tag, kwargs = self.groups[group_index]
 468   #        if tag != '':
 469   #          work.apply_markup(
 470   #            root,
 471   #            pos,
 472   #            off,
 473   #            end_pos,
 474   #            end_off,
 475   #            factory,
 476   #            tag,
 477   #            **kwargs
 478   #          )
 479   #    # note: pointers must be kept end relative during the below call,
 480   #    # because it modifies the preceding text by calling apply_markup()
 481   #    yield end_pos, end_off, group_index
 482   #    pos, off = element.to_start_relative(root, end_pos, end_off)
 483
 484   def __repr__(self):
 485     return 'dfa.DFA({0:s}, {1:s}, {2:s})'.format(
 486       repr(self.states),
 487       repr(self.actions),
 488       repr(self.start_action)
 489     )