Fix issue with transitions/threads and issue with duplicate states (but the generated...
authorNick Downing <downing.nick@gmail.com>
Thu, 28 Jun 2018 23:48:49 +0000 (09:48 +1000)
committerNick Downing <downing.nick@gmail.com>
Thu, 28 Jun 2018 23:48:49 +0000 (09:48 +1000)
plex.py
regex.py

diff --git a/plex.py b/plex.py
index 6d552ef..2d8eec1 100755 (executable)
--- a/plex.py
+++ b/plex.py
@@ -24,6 +24,47 @@ class FlexDFA:
   YY_TRAILING_HEAD_MASK = 0x4000
 
   def __init__(self, dfa):
+    # we use a modified version of the transition routine, we do not know
+    # how many threads are active, so we just create null threads as they
+    # are referred to (resulting threads have current marks but no history),
+    # each thread is a list in forward order, not a stack in reverse order
+    def transit(transition):
+      nonlocal threads0, threads1, prefix_slop # note: also uses i
+      j = prefix_slop
+      for trans in transition:
+        if len(threads0) < j + trans[1]:
+          threads0.extend([[] for k in range(j + trans[1] - len(threads0))])
+        if trans[0] == regex.DFA.TRANSITION_POP:
+          j += trans[1]
+        elif trans[0] == regex.DFA.TRANSITION_DUP:
+          while j < trans[1]:
+            threads0[:0] = [None] * prefix_slop
+            threads1[:0] = [None] * prefix_slop
+            j += prefix_slop
+            prefix_slop *= 2
+          threads0[j - trans[1]:j] = [
+            list(k)
+            for k in threads0[j:j + trans[1]]
+          ]
+          j -= trans[1]
+        elif trans[0] == regex.DFA.TRANSITION_MARK:
+          for k in range(j, j + trans[1]):
+            threads0[j].append(trans[2])
+        elif trans[0] == regex.DFA.TRANSITION_MOVE:
+          threads1.extend(threads0[j:j + trans[1]])
+          j += trans[1]
+        #elif trans[0] == regex.DFA.TRANSITION_DEL:
+        #  del threads1[-trans[1]:]
+        else:
+          assert False
+      assert j == len(threads0)
+      threads0, threads1 = threads1, threads0
+      del threads1[prefix_slop:]
+
+    threads0 = [None]
+    threads1 = [None]
+    prefix_slop = 1
+
     # this is basically just a renumbering
 
     # state numbers in the DFA become base/def numbers in the FlexDFA,
@@ -65,25 +106,24 @@ class FlexDFA:
       state, transition = dfa.actions[action]
       #print('state', len(self.states), 'transition', transition)
 
-      # we collect marks without regard to which thread they refer to,
-      # they should already be in priority order without any duplicates
-      # (because the deduplication removes subsequent identical threads)
+      del threads0[prefix_slop:]
+      transit(transition)
+      #print(threads0[prefix_slop:])
       flex_accept = []
-      for j in [i[2] for i in transition if i[0] == regex.DFA.TRANSITION_MARK]:
-        #print(j)
-        if j & 1:
+      for k in [j for i in threads0[prefix_slop:] for j in i]:
+        if k & 1:
           if (
             len(flex_accept) > 0 and
-            flex_accept[-1] == (j >> 1) | FlexDFA.YY_TRAILING_HEAD_MASK
+            flex_accept[-1] == (k >> 1) | FlexDFA.YY_TRAILING_HEAD_MASK
           ):
             # zero length trailing context, accept immediately
-            flex_accept.append(j >> 1)
+            flex_accept.append(k >> 1)
           else:
             # look back to start of trailing context, then accept
-            flex_accept.append((j >> 1) | FlexDFA.YY_HEAD_MASK)
+            flex_accept.append((k >> 1) | FlexDFA.YY_HEAD_MASK)
         else:
           # mark start of (hopefully safe) trailing context
-          flex_accept.append((j >> 1) | FlexDFA.YY_TRAILING_HEAD_MASK)
+          flex_accept.append((k >> 1) | FlexDFA.YY_TRAILING_HEAD_MASK)
       #print(flex_accept)
 
       if state in state_to_flex_base_def:
@@ -128,10 +168,18 @@ class FlexDFA:
         diff = numpy.sum(mask, 1)
         flex_def = numpy.argmin(diff, 0)
         if diff[flex_def] == 0: # exactly matching state
-          # this is supposed to have been compressed by DFA already,
-          # but can happen since we ignore the accept field of the DFA,
-          # just copy the base and def fields from the matching state
-          _, flex_base, flex_def = self.states[flex_def]
+          # can't use the normal similarity mechanism here, because it
+          # will choose a base of 0 (happens to correspond to the jam
+          # state) and this will make flex's inner loop abort early
+          # ... highlights various issues with flex's tables format,
+          # for instance that duplicate states may be unavoidable when
+          # start conditions are used, and that checking the base is
+          # not an ideal way to check if a state has the same transitions
+          # as the jam state, in fact a state can only share the same base
+          # as another state by COINCIDENCE due to the yy_chk[] issue!
+          # ... fix this by merging indistinguishable states (except for
+          # duplicate start conditions, which may have to use this hack)
+          flex_base = self.states[flex_def][1]
         else:
           mask = mask[flex_def, :]
 
@@ -161,7 +209,8 @@ class FlexDFA:
 
       self.states.append((flex_accept, flex_base, flex_def))
     #print(full_entries[:len(self.states), :])
-
+    #print(flex_state_to_action)
 if len(sys.argv) < 2:
   sys.stdout.write(
     'usage: {0:s} rules.l\n'.format(
@@ -252,6 +301,8 @@ expr.add_to_nfa(nfa)
 dfa = nfa.to_dfa()
 #print(dfa.start_action)
 #print(dfa.actions[2])
+#print(dfa.match_text('1.0 + 5', 0))
 flex_dfa = FlexDFA(dfa) #nfa.to_dfa())
 with open('skel/lex.yy.c', 'r') as fin:
   with open('lex.yy.c', 'w+') as fout:
index d9da32a..080b4c3 100644 (file)
--- a/regex.py
+++ b/regex.py
@@ -1554,6 +1554,7 @@ class DFA:
     action = self.start_action[start_index]
     while action != -1:
       state, transition = self.actions[action]
+      #print('i', i, 'action', action, 'state', state, 'transition', transition)
       transit(transition)
       if state == 0:
         # there is only one match, which is complete