Move FlexDFA generation into DFA.to_flex_dfa() consistent with other generators,...
authorNick Downing <downing.nick@gmail.com>
Wed, 8 Aug 2018 05:20:23 +0000 (15:20 +1000)
committerNick Downing <downing.nick@gmail.com>
Wed, 8 Aug 2018 06:08:30 +0000 (16:08 +1000)
ast.sh
bootstrap_plex.py
dfa.py
flex_dfa.py
generate_ast.py [moved from generate.py with 100% similarity]
generate_flex.py [new file with mode: 0644]
regex.sh

diff --git a/ast.sh b/ast.sh
index dd16e2d..2ba2388 100755 (executable)
--- a/ast.sh
+++ b/ast.sh
@@ -1,5 +1,5 @@
 #!/bin/sh
-if ./generate.py ast <ast.py >ast.py.new && ! diff -q ast.py ast.py.new
+if ./generate_ast.py ast <ast.py >ast.py.new && ! diff -q ast.py ast.py.new
 then
   mv ast.py.new ast.py
 else
index 6c80c50..6dd6ec1 100755 (executable)
@@ -2,7 +2,7 @@
 
 import ast
 import element
-import flex_dfa
+import generate_flex
 import getopt
 import os
 import sys
@@ -39,4 +39,4 @@ with open(in_file) as fin:
 plex.post_process()
 element.serialize(plex, 'b.xml', 'utf-8')
 plex = element.deserialize('b.xml', ast.factory, 'utf-8')
-flex_dfa.generate(plex, skel_file, out_file)
+generate_flex.generate_flex(plex, skel_file, out_file)
diff --git a/dfa.py b/dfa.py
index 46f4cbe..2729bb8 100644 (file)
--- a/dfa.py
+++ b/dfa.py
@@ -1,5 +1,8 @@
 import bisect
 import element
+import flex_dfa
+import numpy
+import numpy_heap
 import work
 import sys
 
@@ -46,6 +49,272 @@ class DFA:
     self.actions = actions
     self.start_action = start_action
 
+  def to_flex_dfa(self):
+    # we use a modified version of the transition routine, we do not know
+    # how many threads are active, so we just create null threads as they
+    # are referred to (resulting threads have current marks but no history),
+    # each thread is a list in forward order, not a stack in reverse order
+    def transit(transition):
+      nonlocal threads0, threads1, prefix_slop # note: also uses i
+      j = prefix_slop
+      for trans in transition:
+        if len(threads0) < j + trans[1]:
+          threads0.extend([[] for k in range(j + trans[1] - len(threads0))])
+        if trans[0] == DFA.TRANSITION_POP:
+          j += trans[1]
+        elif trans[0] == DFA.TRANSITION_DUP:
+          while j < trans[1]:
+            threads0[:0] = [None] * prefix_slop
+            threads1[:0] = [None] * prefix_slop
+            j += prefix_slop
+            prefix_slop *= 2
+          threads0[j - trans[1]:j] = [
+            list(k)
+            for k in threads0[j:j + trans[1]]
+          ]
+          j -= trans[1]
+        elif trans[0] == DFA.TRANSITION_MARK:
+          for k in range(j, j + trans[1]):
+            threads0[j].append(trans[2])
+        elif trans[0] == DFA.TRANSITION_MOVE:
+          threads1.extend(threads0[j:j + trans[1]])
+          j += trans[1]
+        #elif trans[0] == DFA.TRANSITION_DEL:
+        #  del threads1[-trans[1]:]
+        else:
+          assert False
+      assert j == len(threads0)
+      threads0, threads1 = threads1, threads0
+      del threads1[prefix_slop:]
+
+    threads0 = [None]
+    threads1 = [None]
+    prefix_slop = 1
+
+    # action numbers in the DFA become state numbers in the FlexDFA,
+    # with the start-action for each start-condition being copied into
+    # the correctly numbered slot (may cause duplicates), and then all
+    # actions reachable from these being copied to the subsequent slots
+    # (if start-action is reached again, uses the lower numbered copy)
+    flex_state_to_action = [0] + self.start_action
+    action_to_flex_state = {-1: 0} # see comment about state -1 below
+    for i in range(len(flex_state_to_action)):
+      action = flex_state_to_action[i]
+      if action not in action_to_flex_state:
+        action_to_flex_state[action] = i
+
+    # last start-condition is really end-of-buffer (EOB), it has only
+    # a dummy rule that accepts the null string and executes EOB action
+    eob_state = len(self.start_action)
+
+    # state 0 is the jam state, make it exist but have empty acclist
+    acclist = numpy.zeros((0x100,), numpy.uint16)
+    n_acclist = 0
+    accept = numpy.zeros((0x100,), numpy.uint16)
+    n_states = 1
+
+    # transitions[i, j] is transition on character j in state i
+    # in our way of thinking, 0 is don't care and -1 is failure
+    # however, in the flex way these are both 0 (don't care),
+    # the distinction being that failure has no associated action
+    # thus all entries of states are filled, with 0 as a catch-all
+    transitions = numpy.zeros((0x100, 0x101), numpy.uint16)
+
+    # generate acclist, accept and transition tables
+    while n_states < len(flex_state_to_action):
+      action = flex_state_to_action[n_states]
+      state, transition = self.actions[action]
+      #print('state', n_states, 'transition', transition)
+
+      del threads0[prefix_slop:]
+      transit(transition)
+      #print(threads0[prefix_slop:])
+      if n_states >= accept.shape[0]:
+        # extend accept
+        new_accept = numpy.zeros(
+          (accept.shape[0] * 2,),
+          numpy.uint16
+        )
+        new_accept[:accept.shape[0]] = accept
+        accept = new_accept
+      accept[n_states] = n_acclist
+      accept_set = set()
+      for k in [j for i in threads0[prefix_slop:] for j in i]:
+        acc = k >> 1
+        if k & 1:
+          if (acc | flex_dfa.FlexDFA.YY_TRAILING_HEAD_MASK) not in accept_set:
+            # look back to start of trailing context, then accept
+            acc |= flex_dfa.FlexDFA.YY_TRAILING_MASK
+          # otherwise zero length trailing context, accept immediately
+        else:
+          # mark start of (hopefully safe) trailing context
+          acc |= flex_dfa.FlexDFA.YY_TRAILING_HEAD_MASK
+        if acc not in accept_set:
+          if n_acclist >= acclist.shape[0]:
+            # extend acclist
+            new_acclist = numpy.zeros(
+              (acclist.shape[0] * 2,),
+              numpy.uint16
+            )
+            new_acclist[:acclist.shape[0]] = acclist
+            acclist = new_acclist
+          acclist[n_acclist] = acc
+          n_acclist += 1
+          accept_set.add(acc)
+
+      # calculate transition row from self.state character-to-action table
+      if n_states >= transitions.shape[0]:
+        # extend transitions
+        new_transitions = numpy.zeros(
+          (transitions.shape[0] * 2, 0x101),
+          numpy.uint16
+        )
+        new_transitions[:transitions.shape[0], :] = transitions
+        transitions = new_transitions
+      breaks, actions, _ = self.states[state]
+      character0 = 0
+      for i in range(len(breaks)):
+        character1 = breaks[i]
+        next_action = actions[i]
+        if next_action in action_to_flex_state:
+          next_flex_state = action_to_flex_state[next_action]
+        else:
+          next_flex_state = len(flex_state_to_action)
+          action_to_flex_state[next_action] = next_flex_state
+          flex_state_to_action.append(next_action)
+        transitions[n_states, character0:character1] = next_flex_state
+        character0 = character1
+      assert character0 == 0x100
+
+      n_states += 1
+
+    # finalize the acclist and accept tables
+    acclist = acclist[:n_acclist]
+    if n_states >= accept.shape[0]:
+      new_accept = numpy.zeros(
+        (accept.shape[0] * 2,),
+        numpy.uint16
+      )
+      new_accept[:accept.shape[0]] = accept
+      accept = new_accept
+    accept[n_states] = n_acclist
+    accept = accept[:n_states + 1]
+
+    # finalize the transitions table
+    transitions = transitions[:n_states, :]
+    transitions[:, 0x100] = transitions[:, 0]
+    transitions[:, 0] = eob_state
+
+    # calculate default states by constructing minimum spanning tree
+    # heap contains n states todo followed by n_states - n states done
+    # each heap entry is [distance, hop count, state done, state todo]
+    heap = numpy.zeros((n_states, 4), numpy.uint16)
+    heap[:-1, 0] = numpy.sum(transitions[1:, :] != transitions[:1, :], 1)
+    heap[:-1, 3] = numpy.arange(1, n_states, dtype = numpy.uint16)
+    numpy_heap.heapify(heap, n_states - 1)
+    for n in range(n_states - 2, 0, -1):
+      if n % 100 == 0:
+        print('mst', n)
+
+      key = tuple(heap[n, :])
+      heap[n, :] = heap[0, :]
+      numpy_heap.bubble_down(heap, 0, key, n)
+      hop_count = heap[n, 1] + 1 # proposed hop_count is current hop_count + 1
+      state_done = heap[n, 3] # proposed state_done is current state_todo
+      dist = numpy.sum(
+        transitions[heap[:n, 3], :] !=
+        transitions[state_done:state_done + 1, :],
+        1
+      )
+      # although numpy cannot do lexicographic comparisons, check the
+      # first field via numpy to quickly generate a list of candidates
+      for i in numpy.nonzero(dist <= heap[:n, 0])[0]:
+        key = (dist[i], hop_count, state_done, heap[i, 3])
+        if key < tuple(heap[i, :]):
+          numpy_heap.bubble_up(heap, i, key)
+
+    # state 0 is the jam state, the EOB state will be added later on
+    states = numpy.zeros((n_states, 2), numpy.uint16) # base, def
+    entries = numpy.full((0x200, 2), -1, numpy.uint16) # nxt, chk
+    entries[:0x101, :] = 0 # jam state just returns to jam state
+    entries[0, 0] = eob_state # except for the EOB transition
+    entries_free = numpy.full(0x200, True, numpy.bool)
+    entries_free[:0x101] = False # account for the jam (don't care) state
+    n_entries = 0x101
+
+    # pack states in reverse order (larger distances first)
+    dupes = []
+    for i in range(n_states - 1):
+      if (n_states - i) % 100 == 0:
+        print('pack', n_states - i)
+      if heap[i, 0] == 0:
+        # when copying another state, need to have the same base, though
+        # the base will not matter since there will be no entries, it is
+        # is because of the awkward way the compressed lookup is written
+        dupes.append(i)
+      else:
+        state_done = heap[i, 2]
+        state_todo = heap[i, 3]
+        indices = numpy.nonzero(
+          transitions[state_todo, :] != transitions[state_done, :]
+        )[0]
+
+        # make sure entries array is at least large enough to find a spot
+        while entries.shape[0] < n_entries + 0x101:
+          # extend entries, copying only n_entries entries
+          new_entries = numpy.full(
+            (entries.shape[0] * 2, 2),
+            -1,
+            numpy.uint16
+          )
+          new_entries[:n_entries, :] = entries[:n_entries, :]
+          entries = new_entries
+
+          # extend entries_free, copying only n_entries entries
+          new_entries_free = numpy.full(
+            (entries_free.shape[0] * 2,),
+            True,
+            numpy.bool
+          )
+          new_entries_free[:n_entries] = entries_free[:n_entries]
+          entries_free = new_entries_free
+
+        # find a suitable spot and store differences from default state
+        # generate a list of candidates via numpy for the first slot only
+        for start_index in numpy.nonzero(
+          entries_free[indices[0]:n_entries]
+        )[0]:
+          indices_offset = indices + start_index
+          if numpy.all(entries_free[indices_offset]):
+            break
+        else:
+          start_index = n_entries
+          indices_offset = indices + start_index
+        entries[indices_offset, 0] = transitions[state_todo, indices]
+        entries[indices_offset, 1] = state_todo
+        entries_free[indices_offset] = False
+        if n_entries < start_index + 0x101:
+          n_entries = start_index + 0x101
+
+        states[state_todo, 0] = start_index
+        states[state_todo, 1] = state_done
+    while len(dupes):
+      if len(dupes) % 100 == 0:
+        print('dupe', len(dupes))
+
+      i = dupes.pop()
+      state_done = heap[i, 2]
+      state_todo = heap[i, 3]
+      states[state_todo, 0] = states[state_done, 0]
+      states[state_todo, 1] = state_done
+
+    # finalize entries table
+    entries = entries[:n_entries, :]
+    print('n_entries', n_entries)
+
+    return flex_dfa.FlexDFA(accept, acclist, states, entries)
+
   def match_text(self, text, i, start_index = 0):
     def transit(transition):
       nonlocal threads0, threads1, prefix_slop # note: also uses i
index 1bd97c8..f6edd3d 100644 (file)
-import dfa
-import element
-import numpy
-import numpy_heap
-import regex
-
 class FlexDFA:
   YY_TRAILING_MASK = 0x2000
   YY_TRAILING_HEAD_MASK = 0x4000
 
-  def __init__(self, _dfa):
-    # we use a modified version of the transition routine, we do not know
-    # how many threads are active, so we just create null threads as they
-    # are referred to (resulting threads have current marks but no history),
-    # each thread is a list in forward order, not a stack in reverse order
-    def transit(transition):
-      nonlocal threads0, threads1, prefix_slop # note: also uses i
-      j = prefix_slop
-      for trans in transition:
-        if len(threads0) < j + trans[1]:
-          threads0.extend([[] for k in range(j + trans[1] - len(threads0))])
-        if trans[0] == dfa.DFA.TRANSITION_POP:
-          j += trans[1]
-        elif trans[0] == dfa.DFA.TRANSITION_DUP:
-          while j < trans[1]:
-            threads0[:0] = [None] * prefix_slop
-            threads1[:0] = [None] * prefix_slop
-            j += prefix_slop
-            prefix_slop *= 2
-          threads0[j - trans[1]:j] = [
-            list(k)
-            for k in threads0[j:j + trans[1]]
-          ]
-          j -= trans[1]
-        elif trans[0] == dfa.DFA.TRANSITION_MARK:
-          for k in range(j, j + trans[1]):
-            threads0[j].append(trans[2])
-        elif trans[0] == dfa.DFA.TRANSITION_MOVE:
-          threads1.extend(threads0[j:j + trans[1]])
-          j += trans[1]
-        #elif trans[0] == dfa.DFA.TRANSITION_DEL:
-        #  del threads1[-trans[1]:]
-        else:
-          assert False
-      assert j == len(threads0)
-      threads0, threads1 = threads1, threads0
-      del threads1[prefix_slop:]
-
-    threads0 = [None]
-    threads1 = [None]
-    prefix_slop = 1
-
-    # action numbers in the DFA become state numbers in the FlexDFA,
-    # with the start-action for each start-condition being copied into
-    # the correctly numbered slot (may cause duplicates), and then all
-    # actions reachable from these being copied to the subsequent slots
-    # (if start-action is reached again, uses the lower numbered copy)
-    flex_state_to_action = [0] + _dfa.start_action
-    action_to_flex_state = {-1: 0} # see comment about state -1 below
-    for i in range(len(flex_state_to_action)):
-      action = flex_state_to_action[i]
-      if action not in action_to_flex_state:
-        action_to_flex_state[action] = i
-
-    # last start-condition is really end-of-buffer (EOB), it has only
-    # a dummy rule that accepts the null string and executes EOB action
-    eob_state = len(_dfa.start_action)
-
-    # state 0 is the jam state, make it exist but have empty acclist
-    self.acclist = numpy.zeros((0x100,), numpy.uint16)
-    n_acclist = 0
-    self.accept = numpy.zeros((0x100,), numpy.uint16)
-    n_states = 1
-
-    # transitions[i, j] is transition on character j in state i
-    # in our way of thinking, 0 is don't care and -1 is failure
-    # however, in the flex way these are both 0 (don't care),
-    # the distinction being that failure has no associated action
-    # thus all entries of states are filled, with 0 as a catch-all
-    transitions = numpy.zeros((0x100, 0x101), numpy.uint16)
-
-    # generate acclist, accept and transition tables
-    while n_states < len(flex_state_to_action):
-      action = flex_state_to_action[n_states]
-      state, transition = _dfa.actions[action]
-      #print('state', n_states, 'transition', transition)
-
-      del threads0[prefix_slop:]
-      transit(transition)
-      #print(threads0[prefix_slop:])
-      if n_states >= self.accept.shape[0]:
-        # extend accept
-        new_accept = numpy.zeros(
-          (self.accept.shape[0] * 2,),
-          numpy.uint16
-        )
-        new_accept[:self.accept.shape[0]] = self.accept
-        self.accept = new_accept
-      self.accept[n_states] = n_acclist
-      accept_set = set()
-      for k in [j for i in threads0[prefix_slop:] for j in i]:
-        acc = k >> 1
-        if k & 1:
-          if (acc | FlexDFA.YY_TRAILING_HEAD_MASK) not in accept_set:
-            # look back to start of trailing context, then accept
-            acc |= FlexDFA.YY_TRAILING_MASK
-          # otherwise zero length trailing context, accept immediately
-        else:
-          # mark start of (hopefully safe) trailing context
-          acc |= FlexDFA.YY_TRAILING_HEAD_MASK
-        if acc not in accept_set:
-          if n_acclist >= self.acclist.shape[0]:
-            # extend acclist
-            new_acclist = numpy.zeros(
-              (self.acclist.shape[0] * 2,),
-              numpy.uint16
-            )
-            new_acclist[:self.acclist.shape[0]] = self.acclist
-            self.acclist = new_acclist
-          self.acclist[n_acclist] = acc
-          n_acclist += 1
-          accept_set.add(acc)
-
-      # calculate transition row from _dfa.state character-to-action table
-      if n_states >= transitions.shape[0]:
-        # extend transitions
-        new_transitions = numpy.zeros(
-          (transitions.shape[0] * 2, 0x101),
-          numpy.uint16
-        )
-        new_transitions[:transitions.shape[0], :] = transitions
-        transitions = new_transitions
-      breaks, actions, _ = _dfa.states[state]
-      character0 = 0
-      for i in range(len(breaks)):
-        character1 = breaks[i]
-        next_action = actions[i]
-        if next_action in action_to_flex_state:
-          next_flex_state = action_to_flex_state[next_action]
-        else:
-          next_flex_state = len(flex_state_to_action)
-          action_to_flex_state[next_action] = next_flex_state
-          flex_state_to_action.append(next_action)
-        transitions[n_states, character0:character1] = next_flex_state
-        character0 = character1
-      assert character0 == 0x100
-
-      n_states += 1
-
-    # finalize the acclist and accept tables
-    self.acclist = self.acclist[:n_acclist]
-    if n_states >= self.accept.shape[0]:
-      new_accept = numpy.zeros(
-        (self.accept.shape[0] * 2,),
-        numpy.uint16
-      )
-      new_accept[:self.accept.shape[0]] = self.accept
-      self.accept = new_accept
-    self.accept[n_states] = n_acclist
-    self.accept = self.accept[:n_states + 1]
-
-    # finalize the transitions table
-    transitions = transitions[:n_states, :]
-    transitions[:, 0x100] = transitions[:, 0]
-    transitions[:, 0] = eob_state
-
-    # calculate default states by constructing minimum spanning tree
-    # heap contains n states todo followed by n_states - n states done
-    # each heap entry is [distance, hop count, state done, state todo]
-    heap = numpy.zeros((n_states, 4), numpy.uint16)
-    heap[:-1, 0] = numpy.sum(transitions[1:, :] != transitions[:1, :], 1)
-    heap[:-1, 3] = numpy.arange(1, n_states, dtype = numpy.uint16)
-    numpy_heap.heapify(heap, n_states - 1)
-    for n in range(n_states - 2, 0, -1):
-      if n % 100 == 0:
-        print('mst', n)
-
-      key = tuple(heap[n, :])
-      heap[n, :] = heap[0, :]
-      numpy_heap.bubble_down(heap, 0, key, n)
-      hop_count = heap[n, 1] + 1 # proposed hop_count is current hop_count + 1
-      state_done = heap[n, 3] # proposed state_done is current state_todo
-      dist = numpy.sum(
-        transitions[heap[:n, 3], :] !=
-        transitions[state_done:state_done + 1, :],
-        1
-      )
-      # although numpy cannot do lexicographic comparisons, check the
-      # first field via numpy to quickly generate a list of candidates
-      for i in numpy.nonzero(dist <= heap[:n, 0])[0]:
-        key = (dist[i], hop_count, state_done, heap[i, 3])
-        if key < tuple(heap[i, :]):
-          numpy_heap.bubble_up(heap, i, key)
-
-    # state 0 is the jam state, the EOB state will be added later on
-    self.states = numpy.zeros((n_states, 2), numpy.uint16) # base, def
-    self.entries = numpy.full((0x200, 2), -1, numpy.uint16) # nxt, chk
-    self.entries[:0x101, :] = 0 # jam state just returns to jam state
-    self.entries[0, 0] = eob_state # except for the EOB transition
-    entries_free = numpy.full(0x200, True, numpy.bool)
-    entries_free[:0x101] = False # account for the jam (don't care) state
-    n_entries = 0x101
-
-    # pack states in reverse order (larger distances first)
-    dupes = []
-    for i in range(n_states - 1):
-      if (n_states - i) % 100 == 0:
-        print('pack', n_states - i)
-      if heap[i, 0] == 0:
-        # when copying another state, need to have the same base, though
-        # the base will not matter since there will be no entries, it is
-        # is because of the awkward way the compressed lookup is written
-        dupes.append(i)
-      else:
-        state_done = heap[i, 2]
-        state_todo = heap[i, 3]
-        indices = numpy.nonzero(
-          transitions[state_todo, :] != transitions[state_done, :]
-        )[0]
-
-        # make sure entries array is at least large enough to find a spot
-        while self.entries.shape[0] < n_entries + 0x101:
-          # extend entries, copying only n_entries entries
-          new_entries = numpy.full(
-            (self.entries.shape[0] * 2, 2),
-            -1,
-            numpy.uint16
-          )
-          new_entries[:n_entries, :] = self.entries[:n_entries, :]
-          self.entries = new_entries
-
-          # extend entries_free, copying only n_entries entries
-          new_entries_free = numpy.full(
-            (entries_free.shape[0] * 2,),
-            True,
-            numpy.bool
-          )
-          new_entries_free[:n_entries] = entries_free[:n_entries]
-          entries_free = new_entries_free
-
-        # find a suitable spot and store differences from default state
-        # generate a list of candidates via numpy for the first slot only
-        for start_index in numpy.nonzero(
-          entries_free[indices[0]:n_entries]
-        )[0]:
-          indices_offset = indices + start_index
-          if numpy.all(entries_free[indices_offset]):
-            break
-        else:
-          start_index = n_entries
-          indices_offset = indices + start_index
-        self.entries[indices_offset, 0] = transitions[state_todo, indices]
-        self.entries[indices_offset, 1] = state_todo
-        entries_free[indices_offset] = False
-        if n_entries < start_index + 0x101:
-          n_entries = start_index + 0x101
-
-        self.states[state_todo, 0] = start_index
-        self.states[state_todo, 1] = state_done
-    while len(dupes):
-      if len(dupes) % 100 == 0:
-        print('dupe', len(dupes))
-
-      i = dupes.pop()
-      state_done = heap[i, 2]
-      state_todo = heap[i, 3]
-      self.states[state_todo, 0] = self.states[state_done, 0]
-      self.states[state_todo, 1] = state_done
-
-    # finalize entries table
-    self.entries = self.entries[:n_entries, :]
-    print('n_entries', n_entries)
-
-def generate(plex, skel_file, out_file):
-  _nfa = plex.to_nfa()
-  #print(len(_nfa.states), _nfa.start_state)
-  eob_expr = regex.RegexGroup(children = [regex.RegexEmpty()])
-  eob_expr.post_process(len(plex.actions_text))
-  eob_expr.add_to_nfa(_nfa)
-  _dfa = _nfa.to_dfa()
-  #print(len(_dfa.states), len(_dfa.actions), _dfa.start_action)
-  flex_dfa = FlexDFA(_dfa)
+  def __init__( self, accept, acclist, entries, states):
+    self.accept = accept
+    self.acclist = acclist
+    self.states = states
+    self.entries = entries
 
-  if out_file is None:
-    out_file = (
-      plex[0].out_file
-    if len(plex[0].outfile) else
-      'lex.{0:s}.c'.format(plex[0].prefix)
-    )
-  with open(skel_file, 'r') as fin:
-    with open(out_file, 'w+') as fout:
-      line = fin.readline()
-      while len(line):
-        if line == '/* GENERATE PREFIX */\n':
-          fout.write(
-            '''/* GENERATE PREFIX BEGIN */
-{0:s}/* GENERATE END */
-'''.format(
-              ''
-            if plex[0].prefix == 'yy' else
-              ''.join(
-                [
-                  '#define yy{0:s} {1:s}{2:s}\n'.format(
-                    i,
-                    plex[0].prefix,
-                    i
-                  )
-                  for i in [
-                    '_create_buffer',
-                    '_delete_buffer',
-                    '_scan_buffer',
-                    '_scan_string',
-                    '_scan_bytes',
-                    '_init_buffer',
-                    '_flush_buffer',
-                    '_load_buffer_state',
-                    '_switch_to_buffer',
-                    'push_buffer_state',
-                    'pop_buffer_state',
-                    'ensure_buffer_stack',
-                    '_flex_debug',
-                    'in',
-                    'leng',
-                    'lex',
-                    'lineno',
-                    'out',
-                    'restart',
-                    'text',
-                    'wrap',
-                    'alloc',
-                    'realloc',
-                    'free',
-                    '_create_buffer',
-                    '_delete_buffer',
-                    '_scan_buffer',
-                    '_scan_string',
-                    '_scan_bytes',
-                    '_init_buffer',
-                    '_flush_buffer',
-                    '_load_buffer_state',
-                    '_switch_to_buffer',
-                    'push_buffer_state',
-                    'pop_buffer_state',
-                    'ensure_buffer_stack',
-                    'lex',
-                    'restart',
-                    'lex_init',
-                    'lex_init_extra',
-                    'lex_destroy',
-                    'get_debug',
-                    'set_debug',
-                    'get_extra',
-                    'set_extra',
-                    'get_in',
-                    'set_in',
-                    'get_out',
-                    'set_out',
-                    'get_leng',
-                    'get_text',
-                    'get_lineno',
-                    'set_lineno',
-                    'wrap',
-                    'alloc',
-                    'realloc',
-                    'free',
-                    'text',
-                    'leng',
-                    'in',
-                    'out',
-                    '_flex_debug',
-                    'lineno'
-                  ]
-                ]
-              )
-            )
-          )
-        elif line == '/* GENERATE YYWRAP */\n':
-          fout.write(
-            '''/* GENERATE YYWRAP BEGIN */
-{0:s}/* GENERATE END */
-'''.format(
-              ''
-            if plex[0].yywrap else
-              '''#define {0:s}wrap() (/*CONSTCOND*/1)
-#define YY_SKIP_YYWRAP
-'''.format(
-                plex[0].prefix
-              )
-            )
-          )
-        elif line == '/* GENERATE TABLES */\n':
-          fout.write(
-            '''/* GENERATE TABLES BEGIN */
-#define YY_END_OF_BUFFER {0:d}
-static const flex_uint16_t yy_acclist[] = {{{1:s}
-}};
-static const flex_uint16_t yy_accept[] = {{{2:s}
-}};
-static const flex_uint16_t yy_base[] = {{{3:s}
-}};
-static const flex_uint16_t yy_def[] = {{{4:s}
-}};
-static const flex_uint16_t yy_nxt[] = {{{5:s}
-}};
-static const flex_uint16_t yy_chk[] = {{{6:s}
-}};
-/* GENERATE END */
-'''.format(
-              len(plex.actions_text),
-              ','.join(
-                [
-                  '\n\t{0:s}'.format(
-                    ', '.join(
-                      [
-                        '{0:5d}'.format(j)
-                        for j in flex_dfa.acclist[i:i + 10]
-                      ]
-                    )
-                  )
-                  for i in range(0, flex_dfa.acclist.shape[0], 10)
-                ]
-              ),
-              ','.join(
-                [
-                  '\n\t{0:s}'.format(
-                    ', '.join(
-                      [
-                        '{0:5d}'.format(j)
-                        for j in flex_dfa.accept[i:i + 10]
-                      ]
-                    )
-                  )
-                  for i in range(0, flex_dfa.accept.shape[0], 10)
-                ]
-              ),
-              ','.join(
-                [
-                  '\n\t{0:s}'.format(
-                    ', '.join(
-                      [
-                        '{0:5d}'.format(j)
-                        for j in flex_dfa.states[i:i + 10, 0]
-                      ]
-                    )
-                  )
-                  for i in range(0, flex_dfa.states.shape[0], 10)
-                ]
-              ),
-              ','.join(
-                [
-                  '\n\t{0:s}'.format(
-                    ', '.join(
-                      [
-                        '{0:5d}'.format(j)
-                        for j in flex_dfa.states[i:i + 10, 1]
-                      ]
-                    )
-                  )
-                  for i in range(0, flex_dfa.states.shape[0], 10)
-                ]
-              ),
-              ','.join(
-                [
-                  '\n\t{0:s}'.format(
-                    ', '.join(
-                      [
-                        '{0:5d}'.format(j)
-                        for j in flex_dfa.entries[i:i + 10, 0]
-                      ]
-                    )
-                  )
-                  for i in range(0, flex_dfa.entries.shape[0], 10)
-                ]
-              ),
-              ','.join(
-                [
-                  '\n\t{0:s}'.format(
-                    ', '.join(
-                      [
-                        '{0:5d}'.format(j)
-                        for j in flex_dfa.entries[i:i + 10, 1]
-                      ]
-                    )
-                  )
-                  for i in range(0, flex_dfa.entries.shape[0], 10)
-                ]
-              )
-            )
-          )
-        elif line == '/* GENERATE SECTION1 */\n':
-          fout.write(
-            '''/* GENERATE SECTION1 BEGIN */
-{0:s}/* GENERATE END */
-'''.format(
-              ''.join([i.get_text() for i in plex[0].code_blocks_text])
-            )
-          )
-        elif line == '/* GENERATE STARTCONDDECL */\n':
-          fout.write(
-            '''/* GENERATE STARTCONDDECL BEGIN */
-{0:s}/* GENERATE END*/
-'''.format(
-              ''.join(
-                [
-                  '#define {0:s} {1:d}\n'.format(
-                    plex.start_conditions[i].name,
-                    i
-                  )
-                  for i in range(len(plex.start_conditions))
-                ]
-              )
-            )
-          )
-        elif line == '/* GENERATE SECTION2INITIAL */\n':
-          fout.write(
-            '''/* GENERATE SECTION2INITIAL BEGIN */
-{0:s}/* GENERATE END */
-'''.format(
-              ''.join([i.get_text() for i in plex[1].code_blocks_text])
-            )
-          )
-        elif line == '/* GENERATE SECTION2 */\n':
-          eof_action_to_start_conditions = [
-            [
-              j
-              for j in range(len(plex.start_conditions))
-              if plex.start_conditions[j].eof_action == i
-            ]
-            for i in range(len(plex.eof_actions_text))
-          ]
-          #print('eof_action_to_start_conditions', eof_action_to_start_conditions)
-          fout.write(
-            '''/* GENERATE SECTION2 BEGIN */
-{0:s}{1:s}/* GENERATE END */
-'''.format(
-              ''.join(
-                [
-                  '''case {0:d}:
-YY_RULE_SETUP
-{1:s}  YY_BREAK
-'''.format(
-                    i,
-                    plex.actions_text[i].get_text()
-                  )
-                  for i in range(len(plex.actions_text))
-                ]
-              ),
-              ''.join(
-                [
-                  '{0:s}{1:s}'.format(
-                    ''.join(
-                      [
-                        '\t\t\tcase YY_STATE_EOF({0:s}):\n'.format(
-                          plex.start_conditions[j].name
-                        )
-                        for j in eof_action_to_start_conditions[i]
-                      ]
-                    ),
-                    plex.eof_actions_text[i].get_text()
-                  )
-                  for i in range(len(plex.eof_actions_text))
-                  if len(eof_action_to_start_conditions[i]) > 0
-                ]
-              )
-            )
-          )
-        elif line == '/* GENERATE SECTION3 */\n':
-          fout.write(
-            '''/* GENERATE SECTION3 BEGIN */
-{0:s}/* GENERATE END */
-'''.format(
-              '' if len(plex) < 3 else plex[2].get_text()
-            )
-          )
-        else:
-          if plex[0].prefix != 'yy':
-            line = line.replace('yywrap', '{0:s}wrap'.format(plex[0].prefix))
-          fout.write(line)
-        line = fin.readline()
+  # add some test routines here for scanning text, etc, similar to DFA()
+  # add repr() routine, to use with wrap_repr() for basic serialization
similarity index 100%
rename from generate.py
rename to generate_ast.py
diff --git a/generate_flex.py b/generate_flex.py
new file mode 100644 (file)
index 0000000..2ebbf46
--- /dev/null
@@ -0,0 +1,308 @@
+import regex
+
+def generate_flex(plex, skel_file, out_file):
+  _nfa = plex.to_nfa()
+
+  # end of buffer expression (do it here because only necessary for flex)
+  _regex = regex.RegexGroup(children = [regex.RegexEmpty()])
+  _regex.post_process(len(plex.actions_text))
+  _regex.add_to_nfa(_nfa)
+
+  _flex_dfa = _nfa.to_dfa().to_flex_dfa()
+
+  if out_file is None:
+    out_file = (
+      plex[0].out_file
+    if len(plex[0].outfile) else
+      'lex.{0:s}.c'.format(plex[0].prefix)
+    )
+  with open(skel_file, 'r') as fin:
+    with open(out_file, 'w+') as fout:
+      line = fin.readline()
+      while len(line):
+        if line == '/* GENERATE PREFIX */\n':
+          fout.write(
+            '''/* GENERATE PREFIX BEGIN */
+{0:s}/* GENERATE END */
+'''.format(
+              ''
+            if plex[0].prefix == 'yy' else
+              ''.join(
+                [
+                  '#define yy{0:s} {1:s}{2:s}\n'.format(
+                    i,
+                    plex[0].prefix,
+                    i
+                  )
+                  for i in [
+                    '_create_buffer',
+                    '_delete_buffer',
+                    '_scan_buffer',
+                    '_scan_string',
+                    '_scan_bytes',
+                    '_init_buffer',
+                    '_flush_buffer',
+                    '_load_buffer_state',
+                    '_switch_to_buffer',
+                    'push_buffer_state',
+                    'pop_buffer_state',
+                    'ensure_buffer_stack',
+                    '_flex_debug',
+                    'in',
+                    'leng',
+                    'lex',
+                    'lineno',
+                    'out',
+                    'restart',
+                    'text',
+                    'wrap',
+                    'alloc',
+                    'realloc',
+                    'free',
+                    '_create_buffer',
+                    '_delete_buffer',
+                    '_scan_buffer',
+                    '_scan_string',
+                    '_scan_bytes',
+                    '_init_buffer',
+                    '_flush_buffer',
+                    '_load_buffer_state',
+                    '_switch_to_buffer',
+                    'push_buffer_state',
+                    'pop_buffer_state',
+                    'ensure_buffer_stack',
+                    'lex',
+                    'restart',
+                    'lex_init',
+                    'lex_init_extra',
+                    'lex_destroy',
+                    'get_debug',
+                    'set_debug',
+                    'get_extra',
+                    'set_extra',
+                    'get_in',
+                    'set_in',
+                    'get_out',
+                    'set_out',
+                    'get_leng',
+                    'get_text',
+                    'get_lineno',
+                    'set_lineno',
+                    'wrap',
+                    'alloc',
+                    'realloc',
+                    'free',
+                    'text',
+                    'leng',
+                    'in',
+                    'out',
+                    '_flex_debug',
+                    'lineno'
+                  ]
+                ]
+              )
+            )
+          )
+        elif line == '/* GENERATE YYWRAP */\n':
+          fout.write(
+            '''/* GENERATE YYWRAP BEGIN */
+{0:s}/* GENERATE END */
+'''.format(
+              ''
+            if plex[0].yywrap else
+              '''#define {0:s}wrap() (/*CONSTCOND*/1)
+#define YY_SKIP_YYWRAP
+'''.format(
+                plex[0].prefix
+              )
+            )
+          )
+        elif line == '/* GENERATE TABLES */\n':
+          fout.write(
+            '''/* GENERATE TABLES BEGIN */
+#define YY_END_OF_BUFFER {0:d}
+static const flex_uint16_t yy_acclist[] = {{{1:s}
+}};
+static const flex_uint16_t yy_accept[] = {{{2:s}
+}};
+static const flex_uint16_t yy_base[] = {{{3:s}
+}};
+static const flex_uint16_t yy_def[] = {{{4:s}
+}};
+static const flex_uint16_t yy_nxt[] = {{{5:s}
+}};
+static const flex_uint16_t yy_chk[] = {{{6:s}
+}};
+/* GENERATE END */
+'''.format(
+              len(plex.actions_text),
+              ','.join(
+                [
+                  '\n\t{0:s}'.format(
+                    ', '.join(
+                      [
+                        '{0:5d}'.format(j)
+                        for j in _flex_dfa.acclist[i:i + 10]
+                      ]
+                    )
+                  )
+                  for i in range(0, _flex_dfa.acclist.shape[0], 10)
+                ]
+              ),
+              ','.join(
+                [
+                  '\n\t{0:s}'.format(
+                    ', '.join(
+                      [
+                        '{0:5d}'.format(j)
+                        for j in _flex_dfa.accept[i:i + 10]
+                      ]
+                    )
+                  )
+                  for i in range(0, _flex_dfa.accept.shape[0], 10)
+                ]
+              ),
+              ','.join(
+                [
+                  '\n\t{0:s}'.format(
+                    ', '.join(
+                      [
+                        '{0:5d}'.format(j)
+                        for j in _flex_dfa.states[i:i + 10, 0]
+                      ]
+                    )
+                  )
+                  for i in range(0, _flex_dfa.states.shape[0], 10)
+                ]
+              ),
+              ','.join(
+                [
+                  '\n\t{0:s}'.format(
+                    ', '.join(
+                      [
+                        '{0:5d}'.format(j)
+                        for j in _flex_dfa.states[i:i + 10, 1]
+                      ]
+                    )
+                  )
+                  for i in range(0, _flex_dfa.states.shape[0], 10)
+                ]
+              ),
+              ','.join(
+                [
+                  '\n\t{0:s}'.format(
+                    ', '.join(
+                      [
+                        '{0:5d}'.format(j)
+                        for j in _flex_dfa.entries[i:i + 10, 0]
+                      ]
+                    )
+                  )
+                  for i in range(0, _flex_dfa.entries.shape[0], 10)
+                ]
+              ),
+              ','.join(
+                [
+                  '\n\t{0:s}'.format(
+                    ', '.join(
+                      [
+                        '{0:5d}'.format(j)
+                        for j in _flex_dfa.entries[i:i + 10, 1]
+                      ]
+                    )
+                  )
+                  for i in range(0, _flex_dfa.entries.shape[0], 10)
+                ]
+              )
+            )
+          )
+        elif line == '/* GENERATE SECTION1 */\n':
+          fout.write(
+            '''/* GENERATE SECTION1 BEGIN */
+{0:s}/* GENERATE END */
+'''.format(
+              ''.join([i.get_text() for i in plex[0].code_blocks_text])
+            )
+          )
+        elif line == '/* GENERATE STARTCONDDECL */\n':
+          fout.write(
+            '''/* GENERATE STARTCONDDECL BEGIN */
+{0:s}/* GENERATE END*/
+'''.format(
+              ''.join(
+                [
+                  '#define {0:s} {1:d}\n'.format(
+                    plex.start_conditions[i].name,
+                    i
+                  )
+                  for i in range(len(plex.start_conditions))
+                ]
+              )
+            )
+          )
+        elif line == '/* GENERATE SECTION2INITIAL */\n':
+          fout.write(
+            '''/* GENERATE SECTION2INITIAL BEGIN */
+{0:s}/* GENERATE END */
+'''.format(
+              ''.join([i.get_text() for i in plex[1].code_blocks_text])
+            )
+          )
+        elif line == '/* GENERATE SECTION2 */\n':
+          eof_action_to_start_conditions = [
+            [
+              j
+              for j in range(len(plex.start_conditions))
+              if plex.start_conditions[j].eof_action == i
+            ]
+            for i in range(len(plex.eof_actions_text))
+          ]
+          #print('eof_action_to_start_conditions', eof_action_to_start_conditions)
+          fout.write(
+            '''/* GENERATE SECTION2 BEGIN */
+{0:s}{1:s}/* GENERATE END */
+'''.format(
+              ''.join(
+                [
+                  '''case {0:d}:
+YY_RULE_SETUP
+{1:s}  YY_BREAK
+'''.format(
+                    i,
+                    plex.actions_text[i].get_text()
+                  )
+                  for i in range(len(plex.actions_text))
+                ]
+              ),
+              ''.join(
+                [
+                  '{0:s}{1:s}'.format(
+                    ''.join(
+                      [
+                        '\t\t\tcase YY_STATE_EOF({0:s}):\n'.format(
+                          plex.start_conditions[j].name
+                        )
+                        for j in eof_action_to_start_conditions[i]
+                      ]
+                    ),
+                    plex.eof_actions_text[i].get_text()
+                  )
+                  for i in range(len(plex.eof_actions_text))
+                  if len(eof_action_to_start_conditions[i]) > 0
+                ]
+              )
+            )
+          )
+        elif line == '/* GENERATE SECTION3 */\n':
+          fout.write(
+            '''/* GENERATE SECTION3 BEGIN */
+{0:s}/* GENERATE END */
+'''.format(
+              '' if len(plex) < 3 else plex[2].get_text()
+            )
+          )
+        else:
+          if plex[0].prefix != 'yy':
+            line = line.replace('yywrap', '{0:s}wrap'.format(plex[0].prefix))
+          fout.write(line)
+        line = fin.readline()
index 8cc339a..8fd5b31 100755 (executable)
--- a/regex.sh
+++ b/regex.sh
@@ -1,5 +1,5 @@
 #!/bin/sh
-if ./generate.py regex <regex.py >regex.py.new && ! diff -q regex.py regex.py.new
+if ./generate_ast.py regex <regex.py >regex.py.new && ! diff -q regex.py regex.py.new
 then
   mv regex.py.new regex.py
 else