Add --python switch, skel/skel_py.py template, and tests/cal_py.l test scanner
author Nick Downing <downing.nick@gmail.com>
Sun, 9 Sep 2018 04:22:13 +0000 (14:22 +1000)
committer Nick Downing <downing.nick@gmail.com>
Mon, 10 Sep 2018 04:32:40 +0000 (14:32 +1000)
.gitignore
bootstrap_plex.py
dfa.py
generate_flex.py
generate_py.py [new file with mode: 0644]
nfa.py
skel/skel_py.py [new file with mode: 0644]
tests/Makefile
tests/cal_py.l [new file with mode: 0644]

diff --git a/.gitignore b/.gitignore
index 355d1e1..90be895 100644 (file)
--- a/.gitignore
+++ b/.gitignore
@@ -8,6 +8,7 @@ lex-yacc-examples/example7
 skel/skel_flex.c.orig
 tests/*.c
 tests/*.o
+tests/*.py
 tests/*.xml
 tests/cal
 tests/flex0
diff --git a/bootstrap_plex.py b/bootstrap_plex.py
index 38040e7..5163739 100755 (executable)
--- a/bootstrap_plex.py
+++ b/bootstrap_plex.py
@@ -3,29 +3,37 @@
 import ast
 import element
 import generate_flex
+import generate_py
 import getopt
 import os
 import sys
 
 home_dir = os.path.dirname(sys.argv[0])
 try:
-  opts, args = getopt.getopt(sys.argv[1:], 'o:S:', ['outfile=', 'skel='])
+  opts, args = getopt.getopt(
+    sys.argv[1:],
+    'o:pS:',
+    ['outfile=', 'python', 'skel=']
+  )
 except getopt.GetoptError as err:
   sys.stderr.write('{0:s}\n'.format(str(err)))
   sys.exit(1)
 
 out_file = None
-skel_file = os.path.join(home_dir, 'skel/skel_flex.c')
+python = False
+skel_file = None
 for opt, arg in opts:
   if opt == '-o' or opt == '--outfile':
     out_file = arg
+  elif opt == '-p' or opt == '--python':
+    python = True
   elif opt == '-S' or opt == '--skel':
     skel_file = arg
   else:
     assert False
 if len(args) < 1:
   sys.stdout.write(
-    'usage: {0:s} rules.l\n'.format(
+    'usage: {0:s} [options] rules.l\n'.format(
       sys.argv[0]
     )
   )
@@ -37,6 +45,11 @@ with open(in_file) as fin:
 #element.serialize(plex, 'a.xml', 'utf-8')
 #plex = element.deserialize('a.xml', ast.factory, 'utf-8')
 plex.post_process()
-element.serialize(plex, 'b.xml', 'utf-8')
-plex = element.deserialize('b.xml', ast.factory, 'utf-8')
-generate_flex.generate_flex(plex, skel_file, out_file)
+#element.serialize(plex, 'b.xml', 'utf-8')
+#plex = element.deserialize('b.xml', ast.factory, 'utf-8')
+(generate_py.generate_py if python else generate_flex.generate_flex)(
+  plex,
+  home_dir,
+  skel_file,
+  out_file
+)
diff --git a/dfa.py b/dfa.py
index 2729bb8..4e28ce8 100644 (file)
--- a/dfa.py
+++ b/dfa.py
@@ -32,7 +32,7 @@ class DFA:
   def __init__(
     self,
     groups = [],
-    states = [([n_characters], [0], 0)],
+    states = [([n_characters], [0], [0])],
     actions = [(0, [])],
     start_action = [] # can have multiple DFAs in same container
   ):
@@ -40,10 +40,10 @@ class DFA:
     # group_desc: (tag, kwargs)
     #   tag, kwargs will be passed to apply_markup() hence factory()
     # states: list of state_desc
-    # state_desc: (list of breaks, list of action to do, accept_thread)
+    # state_desc: (list of breaks, list of action to do, accept_threads)
     # actions: list of action_desc
     # action_desc: (state to go to next, compiled transition to do first)
-    # accept_thread: which thread of thread list to use, -1 don't accept
+    # accept_threads: list of accepting thread numbers (in thread list)
     self.groups = groups
     self.states = states
     self.actions = actions
@@ -358,11 +358,12 @@ class DFA:
       if state == 0:
         # there is only one match, which is complete
         assert len(threads0) == prefix_slop + 1
+        assert self.states[state][2] == [0]
         return threads0[prefix_slop]
       if i >= len(text):
         # return best match we have, but not incomplete match
-        i = self.states[state][2]
-        return (None if i == -1 else threads0[prefix_slop + i])
+        accept = self.states[state][2]
+        return threads0[prefix_slop + accept[0]] if len(accept) else None
       action = self.states[state][1][
         bisect.bisect_right(self.states[state][0], ord(text[i]))
       ]
@@ -415,6 +416,7 @@ class DFA:
       if state == 0:
         # there is only one match, which is complete
         assert len(threads0) == prefix_slop + 1
+        assert self.states[state][2] == [0]
         return threads0[prefix_slop]
       while off >= len(text):
         if pos < len(root):
@@ -425,8 +427,8 @@ class DFA:
             next(yychunk_iter)
           except StopIteration:
             # return best match we have, but not incomplete match
-            i = self.states[state][2]
-            return (None if i == -1 else threads0[prefix_slop + i])
+            accept = self.states[state][2]
+            return threads0[prefix_slop + accept[0]] if len(accept) else None
           text = element.get_text(root, pos)
       #print(
       #  'state {0:d} pos {1:d} off {2:d} text "{3:s}"'.format(
diff --git a/generate_flex.py b/generate_flex.py
index 932c42b..f2498e9 100644 (file)
--- a/generate_flex.py
+++ b/generate_flex.py
@@ -1,6 +1,7 @@
+import os
 import regex
 
-def generate_flex(plex, skel_file, out_file):
+def generate_flex(plex, home_dir, skel_file, out_file):
   _nfa = plex.to_nfa()
 
   # end of buffer expression (do it here because only necessary for flex)
@@ -10,6 +11,8 @@ def generate_flex(plex, skel_file, out_file):
 
   _flex_dfa = _nfa.to_dfa().to_flex_dfa()
 
+  if skel_file is None:
+    skel_file = os.path.join(home_dir, 'skel/skel_flex.c')
   if out_file is None:
     out_file = (
       plex[0].outfile
diff --git a/generate_py.py b/generate_py.py
new file mode 100644 (file)
index 0000000..38eb5d7
--- /dev/null
@@ -0,0 +1,163 @@
+import os
+import wrap_repr
+
+def plex_text_to_python(plex_text, indent):
+  """Re-indent a code fragment from the scanner spec to sit at `indent`.
+
+  Takes plex_text.get_text(), drops one enclosing {...} pair plus leading and
+  trailing blank lines, then replaces the first line's tab/space prefix by
+  `indent` on every line. Returns '' when nothing but whitespace is left.
+  """
+  text = plex_text.get_text()
+  text_strip = text.strip()
+  if text_strip[:1] == '{' and text_strip[-1:] == '}':
+    text = text_strip[1:-1]
+  # after rstrip() the last line is never blank, so only trim at the front
+  lines = text.rstrip().split('\n')
+  while len(lines) and len(lines[0].lstrip()) == 0:
+    lines = lines[1:]
+  if len(lines) == 0:
+    return ''
+  # lines[0] is not blank here, so its tab/space prefix is well defined
+  prefix = lines[0][:len(lines[0]) - len(lines[0].lstrip('\t '))]
+  for j in range(len(lines)):
+    if len(lines[j].strip()) == 0 and lines[j][:len(prefix)] != prefix:
+      lines[j] = '' # whitespace-only line shorter than prefix: keep as blank
+    if len(lines[j]) == 0:
+      lines[j] = '\n'
+    else:
+      assert lines[j][:len(prefix)] == prefix, repr(lines[j])
+      lines[j] = '{0:s}{1:s}\n'.format(indent, lines[j][len(prefix):])
+  return ''.join(lines)
+
+def generate_py(plex, home_dir, skel_file, out_file):
+  _dfa = plex.to_nfa().to_dfa()
+  # copy the skeleton to out_file, expanding each '# GENERATE ...' marker line
+  if skel_file is None:
+    skel_file = os.path.join(home_dir, 'skel/skel_py.py') # default skeleton
+  if out_file is None:
+    out_file = ( # plex[0].outfile when non-empty, else lex_<prefix>.py
+      plex[0].outfile
+    if len(plex[0].outfile) else
+      'lex_{0:s}.py'.format(plex[0].prefix)
+    )
+  with open(skel_file, 'r') as fin:
+    with open(out_file, 'w+') as fout:
+      line = fin.readline()
+      while len(line):
+        if line == '# GENERATE SECTION1\n': # markers must match the whole line
+          fout.write(
+            '''# GENERATE SECTION1 BEGIN
+{0:s}# GENERATE END
+'''.format(
+              ''.join(
+                [
+                  plex_text_to_python(i, '')
+                  for i in plex[0].code_blocks_text
+                ]
+              )
+            )
+          )
+        elif line == '# GENERATE STARTCONDDECL\n':
+          fout.write(
+            '''# GENERATE STARTCONDDECL BEGIN
+{0:s}# GENERATE END
+'''.format(
+              ''.join(
+                [
+                  '{0:s} = {1:d}\n'.format( # NAME = index, one per condition
+                    plex.start_conditions[i].name,
+                    i
+                  )
+                  for i in range(len(plex.start_conditions))
+                ]
+              )
+            )
+          )
+        elif line == '# GENERATE SECTION2\n':
+          fout.write(
+            '''# GENERATE SECTION2 BEGIN
+{0:s}{1:s}{2:s}{3:s}{4:s}yy_actions = [{5:s}
+]
+{6:s}yy_eof_actions = [{7:s}
+]
+# GENERATE END
+'''.format(
+              wrap_repr.wrap_repr( # {0}..{3}: the DFA tables as Python literals
+                'yy_dfa_groups = {0:s}'.format(repr(_dfa.groups)),
+                79 # presumably the maximum line width; confirm in wrap_repr
+              ),
+              wrap_repr.wrap_repr(
+                'yy_dfa_states = {0:s}'.format(repr(_dfa.states)),
+                79
+              ),
+              wrap_repr.wrap_repr(
+                'yy_dfa_actions = {0:s}'.format(repr(_dfa.actions)),
+                79
+              ),
+              wrap_repr.wrap_repr(
+                'yy_dfa_start_action = {0:s}'.format(repr(_dfa.start_action)),
+                79
+              ),
+              ''.join( # {4}: one yy_action<i>() per entry of plex.actions_text
+                [
+                  '''def yy_action{0:d}():
+{1:s}  raise YYContinue()
+'''.format(
+                    i,
+                    plex_text_to_python(plex.actions_text[i], '  ')
+                  )
+                  for i in range(len(plex.actions_text))
+                ]
+              ),
+              ','.join( # {5}: the entries of the yy_actions list
+                [
+                  '\n  yy_action{0:d}'.format(i)
+                  for i in range(len(plex.actions_text))
+                ]
+              ),
+              ''.join( # {6}: one yy_eof_action<i>(), returning 0 by default
+                [
+                  '''def yy_eof_action{0:d}():
+{1:s}  return 0
+'''.format(
+                    i,
+                    plex_text_to_python(plex.eof_actions_text[i], '  ')
+                  )
+                  for i in range(len(plex.eof_actions_text))
+                ]
+              ),
+              ','.join( # {7}: one entry per start condition, by eof_action
+                [
+                  '\n  yy_eof_action{0:d}'.format(i.eof_action)
+                  for i in plex.start_conditions
+                ]
+              )
+            )
+          )
+        elif line == '  # GENERATE SECTION2INITIAL\n': # indented marker
+          fout.write(
+            '''  # GENERATE SECTION2INITIAL BEGIN
+{0:s}  # GENERATE END
+'''.format(
+              ''.join(
+                [
+                  plex_text_to_python(i, '  ')
+                  for i in plex[1].code_blocks_text
+                ]
+              )
+            )
+          )
+        elif line == '# GENERATE SECTION3\n':
+          fout.write(
+            '''# GENERATE SECTION3 BEGIN
+{0:s}# GENERATE END
+'''.format(
+              '' if len(plex) < 3 else plex_text_to_python(plex[2], '')
+            )
+          )
+        else: # any other skeleton line is copied through unchanged
+          #if plex[0].prefix != 'yy':
+          #  line = line.replace('yywrap', '{0:s}wrap'.format(plex[0].prefix))
+          fout.write(line)
+        line = fin.readline()
diff --git a/nfa.py b/nfa.py
index 0d108bd..d55ebae 100644 (file)
--- a/nfa.py
+++ b/nfa.py
@@ -202,19 +202,22 @@ class NFA:
 
   def multistate_accept(root_multistate):
     i = 0
+    result = []
     def accept(multistate):
-      nonlocal i
+      nonlocal i # also uses result
       if multistate[0] == NFA.MULTISTATE_ACCEPT:
-        return True
-      if multistate[0] == NFA.MULTISTATE_AND:
-        _, _, _, child = multistate
-        i += child[1]
-        return False
-      if multistate[0] == NFA.MULTISTATE_OR:
+        result.append(i)
+        i += 1
+      elif multistate[0] == NFA.MULTISTATE_AND:
+        i += multistate[1]
+      elif multistate[0] == NFA.MULTISTATE_OR:
         _, _, child0, child1 = multistate
-        return accept(child0) or accept(child1)
-      assert False
-    return i if accept(root_multistate) else -1
+        accept(child0)
+        accept(child1)
+      else:
+        assert False
+    accept(root_multistate)
+    return result
 
   def match_text(self, text, i, start_index = 0):
     def transit(transition):
@@ -267,8 +270,8 @@ class NFA:
         return threads0[prefix_slop]
       if i >= len(text):
         # return best match we have, but not incomplete match
-        i = NFA.multistate_accept(next_multistate)
-        return (None if i == -1 else threads0[prefix_slop + i])
+        accept = NFA.multistate_accept(next_multistate)
+        return threads0[prefix_slop + accept[0]] if len(accept) else None
       next_multistate, transition, _, _ = (
         self.multistate_next(next_multistate, ord(text[i]))
       )
@@ -337,8 +340,8 @@ class NFA:
             next(yychunk_iter)
           except StopIteration:
             # return best match we have, but not incomplete match
-            i = NFA.multistate_accept(next_multistate)
-            return (None if i == -1 else threads0[prefix_slop + i])
+            accept = NFA.multistate_accept(next_multistate)
+            return threads0[prefix_slop + accept[0]] if len(accept) else None
           text = element.get_text(root, pos)
       next_multistate, transition, _, _ = (
         self.multistate_next(next_multistate, ord(text[off]))
diff --git a/skel/skel_py.py b/skel/skel_py.py
new file mode 100644 (file)
index 0000000..3c19297
--- /dev/null
@@ -0,0 +1,176 @@
+import bisect
+import sys
+
+# GENERATE SECTION1
+
+# GENERATE STARTCONDDECL
+
+class YYContinue(Exception):
+  """Raised at the end of a rule action; yylex() catches it and scans on."""
+
+class YYTerminate(Exception):
+  """Raised by yyterminate(); yylex() catches it and returns 0."""
+
+class YYBufferList: # base of the buffer classes: just a link to what follows
+  def __init__(self, next = None):
+    self.next = next # following YYBufferBlock, or None at the end of the chain
+
+class YYBufferBlock(YYBufferList): # one chunk of input text held for scanning
+  def __init__(self, next = None, pos = 0, text = ''):
+    YYBufferList.__init__(self, next)
+    self.pos = pos # offset into text; yylex() advances it as it fills yytext
+    self.text = text # yylex() stores one readline() result per block
+
+class YYBufferState(YYBufferList): # entry of yy_buffer_stack; .next heads its blocks
+  def __init__(self, next = None, file_in = None):
+    YYBufferList.__init__(self, next)
+    self.file_in = file_in # file yylex() switches to when it reaches this entry
+
+yyin = sys.stdin # file yylex() reads new input lines from
+yyout = sys.stdout # not referenced by the skeleton code visible here
+yy_buffer_stack = [YYBufferState(None, None)] # top entry's .next chain is read first
+
+yystart = INITIAL # current start condition; INITIAL is expected from STARTCONDDECL
+yy_threads0 = [None] # thread lists: the first yy_prefix_slop slots are spare
+yy_threads1 = [None]
+yy_prefix_slop = 1 # doubled by yylex() when a DUP transition needs room in front
+
+yytext = None # text of the most recent match, set by yylex()
+
+def yyterminate(): # call from an action to make yylex() return 0
+  raise YYTerminate() # caught in yylex() around both rule and EOF actions
+
+# GENERATE SECTION2
+
+def yylex():
+  global yyin, yy_threads0, yy_threads1, yy_prefix_slop, yytext
+  # repeatedly match a token and run its action until an action returns
+  # GENERATE SECTION2INITIAL
+
+  while True: # one pass per match attempt
+    assert len(yy_threads0) == yy_prefix_slop # only the spare slots remain
+    assert len(yy_threads1) == yy_prefix_slop
+    yy_threads0.append(None) # the single initial thread
+
+    i = 0 # number of characters looked at in this attempt
+    buffer_ptr = len(yy_buffer_stack) - 1 # lookahead starts at the top buffer
+    block_prev = yy_buffer_stack[buffer_ptr]
+    block = block_prev.next
+    if block is not None:
+      block_pos = block.pos # local cursor: block.pos itself is not moved yet
+    file_in = yyin
+
+    action = yy_dfa_start_action[yystart]
+    while action != -1: # presumably -1 marks "no transition"; confirm in dfa.py
+      state, transition = yy_dfa_actions[action]
+      #print('i', i, 'action', action, 'state', state, 'transition', transition)
+
+      j = yy_prefix_slop # index into yy_threads0 of the next thread to process
+      for trans in transition:
+        if trans[0] == 0: #DFA.TRANSITION_POP:
+          j += trans[1]
+        elif trans[0] == 1: #DFA.TRANSITION_DUP:
+          while j < trans[1]: # not enough room in front: double the slop
+            yy_threads0[:0] = [None] * yy_prefix_slop
+            yy_threads1[:0] = [None] * yy_prefix_slop
+            j += yy_prefix_slop
+            yy_prefix_slop *= 2
+          yy_threads0[j - trans[1]:j] = yy_threads0[j:j + trans[1]]
+          j -= trans[1]
+        elif trans[0] == 2: #DFA.TRANSITION_MARK:
+          yy_threads0[j:j + trans[1]] = [
+            (i, trans[2], thread) # wrap: (position, mark, previous thread)
+            for thread in yy_threads0[j:j + trans[1]]
+          ]
+        elif trans[0] == 3: #DFA.TRANSITION_MOVE:
+          yy_threads1.extend(yy_threads0[j:j + trans[1]])
+          j += trans[1]
+        #elif trans[0] == DFA.TRANSITION_DEL:
+        #  del yy_threads1[-trans[1]:]
+        else:
+          assert False
+      assert j == len(yy_threads0) # every thread was popped or moved
+      yy_threads0, yy_threads1 = yy_threads1, yy_threads0
+      del yy_threads1[yy_prefix_slop:] # empty the old list, keeping the slop
+
+      if state == 0:
+        # there is only one match, which is complete
+        assert len(yy_threads0) == yy_prefix_slop + 1
+        assert yy_dfa_states[state][2] == [0]
+        break
+
+      while block is None or block_pos >= len(block.text): # need more input
+        if block is None:
+          text = file_in.readline()
+          if len(text):
+            block = YYBufferBlock(None, 0, text)
+            block_pos = 0
+            block_prev.next = block # chain it so it can be consumed later
+          else:
+            buffer_ptr -= 1 # this file is exhausted: look in the buffer below
+            if buffer_ptr < 0:
+              break # EOF
+            block_prev = yy_buffer_stack[buffer_ptr]
+            block = block_prev.next
+            if block is not None:
+              block_pos = block.pos
+            file_in = yy_buffer_stack[buffer_ptr].file_in
+        else:
+          block_prev = block # current block fully scanned: step to the next
+          block = block_prev.next
+          if block is not None:
+            block_pos = block.pos
+      else: # loop ended without break: a character is available at block_pos
+        #print('block_pos', block_pos, 'block.text', block.text)
+        action = yy_dfa_states[state][1][
+          bisect.bisect_right(
+            yy_dfa_states[state][0],
+            ord(block.text[block_pos])
+          )
+        ]
+        i += 1
+        block_pos += 1
+        continue
+      # EOF
+      if i == 0: # nothing was scanned: run the EOF action of this start condition
+        del yy_threads0[yy_prefix_slop:]
+        try:
+          return yy_eof_actions[yystart]()
+        except YYTerminate:
+          return 0
+      break
+
+    accept = yy_dfa_states[state][2] # accepting thread numbers of the last state
+    if len(accept) == 0:
+      del yy_threads0[yy_prefix_slop:]
+      raise Exception('scanner jammed')
+    _, _, thread = yy_threads0[yy_prefix_slop + accept[0]] # drop outermost mark
+    del yy_threads0[yy_prefix_slop:]
+    #print('thread', thread)
+    i, mark, thread = thread # i: length of the match; mark selects the action
+    assert thread is None
+
+    yytext = ''
+    while len(yytext) < i: # consume i characters from the buffer chain
+      block = yy_buffer_stack[-1].next
+      while block is None or block.pos >= len(block.text):
+        if block is None: # top buffer is used up: pop it, resume the one below
+          yy_buffer_stack.pop()
+          block = yy_buffer_stack[-1].next
+          yyin = yy_buffer_stack[-1].file_in
+        else:
+          block = block.next # unlink the fully consumed block
+          yy_buffer_stack[-1].next = block
+      j = min(i - len(yytext), len(block.text) - block.pos)
+      yytext += block.text[block.pos:block.pos + j]
+      block.pos += j
+    #print('yytext', yytext)
+
+    try:
+      return yy_actions[mark >> 1]()
+    except YYContinue: # action produced no token: scan again
+      pass
+    except YYTerminate:
+      return 0
+
+# GENERATE SECTION3
diff --git a/tests/Makefile b/tests/Makefile
index 906dc82..601e8e0 100644 (file)
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -1,4 +1,9 @@
-all: cal flex0 flex1
+all: lex_yy.py cal flex0 flex1
+
+# Python scanner test
+lex_yy.py: cal_py.l
+       ../../bootstrap_flex.git/src/flex -o /dev/null $< 2>$<.xml
+       ../bootstrap_plex.py --python $<.xml
 
 # cal program
 cal: y.tab.o
@@ -38,4 +43,4 @@ flex1.c: flex1.l
 
 # other
 clean:
-       rm -f *.c *.o *.xml cal flex0 flex1
+       rm -f *.c *.o *.py *.xml cal flex0 flex1
diff --git a/tests/cal_py.l b/tests/cal_py.l
new file mode 100644 (file)
index 0000000..2922915
--- /dev/null
@@ -0,0 +1,35 @@
+%{
+# this is section 1
+NUM = 0x100 
+yylval = None
+%}
+
+DIGIT [0-9]+\.?|[0-9]*\.[0-9]+
+
+%option noecs nometa-ecs noyywrap reject yymore
+
+%%
+
+       # this is section 2 initial
+
+[ ]
+{DIGIT}        {
+  global yylval
+  yylval = float(yytext)
+  return NUM
+}
+\n|.   {
+  return ord(yytext[0])
+}
+
+%%
+
+# this is section 3: print every token until yylex() returns 0
+if __name__ == '__main__':
+  token = yylex()
+  while token != 0:
+    if token == NUM:
+      print('NUM', yylval) # yylval was set by the DIGIT rule action
+    else:
+      print('{0:02x}'.format(token)) # other tokens are character codes, in hex
+    token = yylex()