Add start conditions (AT&T style only), improve EOF rules, add default action
author Nick Downing <downing.nick@gmail.com>
Sat, 30 Jun 2018 00:22:16 +0000 (10:22 +1000)
committer Nick Downing <downing.nick@gmail.com>
Sat, 30 Jun 2018 00:22:16 +0000 (10:22 +1000)
.gitignore
ast.py
doc/Flex - a scanner generator - Start conditions.pdf [new file with mode: 0644]
plex.py
skel/lex.yy.c.patch
tests/Makefile
tests/cal.l.xml [deleted file]
tests/flex0.l [new file with mode: 0644]
tests/flex1.l [new file with mode: 0644]

diff --git a/.gitignore b/.gitignore
index cc5970e..cf141e2 100644 (file)
@@ -1,6 +1,13 @@
 __pycache__
 lex.yy.c
 tests/cal
+tests/cal.l.xml
+tests/flex0
+tests/flex0.c
+tests/flex0.l.xml
+tests/flex1
+tests/flex1.c
+tests/flex1.l.xml
 tests/lex.yy.c.orig
 tests/lex.yy.c
 tests/y.tab.c
diff --git a/ast.py b/ast.py
index 3bebd67..67a2931 100644 (file)
--- a/ast.py
+++ b/ast.py
@@ -1,6 +1,34 @@
 import element
 import regex
 
+class Name(element.Element):
+  # GENERATE ELEMENT() BEGIN
+  def __init__(
+    self,
+    tag = 'Name',
+    attrib = {},
+    text = '',
+    children = []
+  ):
+    element.Element.__init__(
+      self,
+      tag,
+      attrib,
+      text,
+      children
+    )
+  def copy(self, factory = None):
+    result = element.Element.copy(
+      self,
+      Name if factory is None else factory
+    )
+    return result
+  def __repr__(self):
+    params = []
+    self.repr_serialize(params)
+    return 'ast.Name({0:s})'.format(', '.join(params))
+  # GENERATE END
+
 class Section1(element.Element):
   # GENERATE ELEMENT() BEGIN
   def __init__(
@@ -325,6 +353,53 @@ class Options(Option):
     for i in self:
       i.process(options)
 
+class StartCondDecl(element.Element):
+  # GENERATE ELEMENT(bool exclusive) BEGIN
+  def __init__(
+    self,
+    tag = 'StartCondDecl',
+    attrib = {},
+    text = '',
+    children = [],
+    exclusive = False
+  ):
+    element.Element.__init__(
+      self,
+      tag,
+      attrib,
+      text,
+      children
+    )
+    self.exclusive = (
+      element.deserialize_bool(exclusive)
+    if isinstance(exclusive, str) else
+      exclusive
+    )
+  def serialize(self, ref_list, indent = 0):
+    element.Element.serialize(self, ref_list, indent)
+    self.set('exclusive', element.serialize_bool(self.exclusive))
+  def deserialize(self, ref_list):
+    element.Element.deserialize(self, ref_list)
+    self.exclusive = element.deserialize_bool(self.get('exclusive', 'false'))
+  def copy(self, factory = None):
+    result = element.Element.copy(
+      self,
+      StartCondDecl if factory is None else factory
+    )
+    result.exclusive = self.exclusive
+    return result
+  def repr_serialize(self, params):
+    element.Element.repr_serialize(self, params)
+    if self.exclusive != False:
+      params.append(
+        'exclusive = {0:s}'.format(repr(self.exclusive))
+      )
+  def __repr__(self):
+    params = []
+    self.repr_serialize(params)
+    return 'ast.StartCondDecl({0:s})'.format(', '.join(params))
+  # GENERATE END
+
 class Section2(element.Element):
   # GENERATE ELEMENT() BEGIN
   def __init__(
@@ -381,6 +456,34 @@ class StartCondNone(element.Element):
     return 'ast.StartCondNone({0:s})'.format(', '.join(params))
   # GENERATE END
 
+class StartCond(element.Element):
+  # GENERATE ELEMENT() BEGIN
+  def __init__(
+    self,
+    tag = 'StartCond',
+    attrib = {},
+    text = '',
+    children = []
+  ):
+    element.Element.__init__(
+      self,
+      tag,
+      attrib,
+      text,
+      children
+    )
+  def copy(self, factory = None):
+    result = element.Element.copy(
+      self,
+      StartCond if factory is None else factory
+    )
+    return result
+  def __repr__(self):
+    params = []
+    self.repr_serialize(params)
+    return 'ast.StartCond({0:s})'.format(', '.join(params))
+  # GENERATE END
+
 class BOLRule(element.Element):
   # GENERATE ELEMENT() BEGIN
   def __init__(
@@ -465,6 +568,34 @@ class Rule(element.Element):
     return 'ast.Rule({0:s})'.format(', '.join(params))
   # GENERATE END
 
+class Action(element.Element):
+  # GENERATE ELEMENT() BEGIN
+  def __init__(
+    self,
+    tag = 'Action',
+    attrib = {},
+    text = '',
+    children = []
+  ):
+    element.Element.__init__(
+      self,
+      tag,
+      attrib,
+      text,
+      children
+    )
+  def copy(self, factory = None):
+    result = element.Element.copy(
+      self,
+      Action if factory is None else factory
+    )
+    return result
+  def __repr__(self):
+    params = []
+    self.repr_serialize(params)
+    return 'ast.Action({0:s})'.format(', '.join(params))
+  # GENERATE END
+
 class Section3(element.Element):
   # GENERATE ELEMENT() BEGIN
   def __init__(
@@ -495,6 +626,7 @@ class Section3(element.Element):
 
 # GENERATE FACTORY(regex.factory) BEGIN
 tag_to_class = {
+  'Name': Name,
   'Section1': Section1,
   'CodeBlock': CodeBlock,
   'Option': Option,
@@ -505,11 +637,14 @@ tag_to_class = {
   'Options_Reject': Options.Reject,
   'Options_YYMore': Options.YYMore,
   'Options_YYWrap': Options.YYWrap,
+  'StartCondDecl': StartCondDecl,
   'Section2': Section2,
   'StartCondNone': StartCondNone,
+  'StartCond': StartCond,
   'BOLRule': BOLRule,
   'EOFRule': EOFRule,
   'Rule': Rule,
+  'Action': Action,
   'Section3': Section3
 }
 def factory(tag, attrib = {}, *args, **kwargs):
diff --git a/doc/Flex - a scanner generator - Start conditions.pdf b/doc/Flex - a scanner generator - Start conditions.pdf
new file mode 100644 (file)
index 0000000..04baca6
Binary files /dev/null and b/doc/Flex - a scanner generator - Start conditions.pdf differ
diff --git a/plex.py b/plex.py
index a2712a2..516af75 100755 (executable)
--- a/plex.py
+++ b/plex.py
@@ -2,8 +2,10 @@
 
 import ast
 import element
+import getopt
 #import lex
 import numpy
+import os
 import re
 import regex
 import sys
@@ -210,19 +212,37 @@ class FlexDFA:
       self.states.append((flex_accept, flex_base, flex_def))
     #print(full_entries[:len(self.states), :])
     #print(flex_state_to_action)
-if len(sys.argv) < 2:
+
+home_dir = os.path.dirname(sys.argv[0])
+try:
+  opts, args = getopt.getopt(sys.argv[1:], 'o:S:', ['outfile=', 'skel='])
+except getopt.GetoptError as err:
+  sys.stderr.write(str(err) + '\n')
+  sys.exit(1)
+
+out_file = 'lex.yy.c'
+skel_file = os.path.join(home_dir, 'skel/lex.yy.c')
+for opt, arg in opts:
+  if opt == '-o' or opt == '--outfile':
+    out_file = arg
+  elif opt == '-S' or opt == '--skel':
+    skel_file = arg
+  else:
+    assert False
+if len(args) < 1:
   sys.stdout.write(
     'usage: {0:s} rules.l\n'.format(
       sys.argv[0]
     )
   )
   sys.exit(1)
+in_file = args[0]
+print(in_file, out_file, skel_file)
 
 #root = element.Element('root')
 #mark = []
 #macro_dict = {}
-#with open(sys.argv[1]) as fin:
+#with open(in_file) as fin:
 #  assert not yacc.yyparse(
 #    root,
 #    mark,
@@ -235,65 +255,106 @@ if len(sys.argv) < 2:
 #    for i in node:
 #      post_process(i)
 #post_process(root)
-with open(sys.argv[1] + '.xml') as fin:
+with open(in_file + '.xml') as fin:
   root = element.deserialize(fin, ast.factory)
 #xml.etree.ElementTree.dump(root)
 
-options = Options()
-assert isinstance(root[0], ast.Section1)
-for i in root[0]:
-  if isinstance(i, ast.Options):
-    i.process(options)
-#print(options.yywrap)
-
 class StartCondition:
   def __init__(self, name, eof_action):
     self.name = name
     self.eof_action = eof_action
+name_to_start_condition = {'INITIAL': 0}
+inclusive_start_conditions = set([0])
 start_conditions = [StartCondition('INITIAL', 0)]
-start_condition_exprs = [regex.RegexNone(), regex.RegexNone()]
-actions = []
-eof_actions = ['\t\t\t\tyyterminate();\n']
 
-assert isinstance(root[1], ast.Section2)
-for i in root[1]:
+section1 = root[0]
+assert isinstance(section1, ast.Section1)
+section2 = root[1]
+assert isinstance(section2, ast.Section2)
+if len(root) < 3:
+  section3 = ast.Section3()
+else:
+  section3 = root[2]
+  assert isinstance(section3, ast.Section3)
+
+options = Options()
+for i in section1:
+  if isinstance(i, ast.Options):
+    i.process(options)
+  elif isinstance(i, ast.StartCondDecl):
+    for j in i:
+      assert isinstance(j, ast.Name)
+      name = element.get_text(j, 0)
+      assert name not in name_to_start_condition
+      name_to_start_condition[name] = len(start_conditions)
+      if not i.exclusive:
+        inclusive_start_conditions.add(len(start_conditions))
+      start_conditions.append(StartCondition(name, 0))
+
+actions = []
+eof_actions = [ast.Action(text = '\t\t\t\tyyterminate();\n')]
+start_condition_exprs = [
+  regex.RegexNone()
+  for i in range(len(start_conditions) * 2) # normal followed by BOL expr
+]
+for i in section2:
   if isinstance(i, ast.Rule):
-    assert isinstance(i[0], ast.StartCondNone)
+    if isinstance(i[0], ast.StartCondNone):
+      default = True
+      rule_start_conditions = inclusive_start_conditions
+    else:
+      default = False
+      rule_start_conditions = set()
+      for j in i[0]:
+        assert isinstance(j, ast.Name)
+        rule_start_conditions.add(
+          name_to_start_condition[element.get_text(j, 0)]
+        )
     rule_expr = i[1]
+    rule_trailing_context = i[2]
+    assert isinstance(rule_trailing_context, regex.Regex)
+    rule_action = i[3]
+    assert isinstance(rule_action, ast.Action)
     if isinstance(rule_expr, ast.EOFRule):
-      assert isinstance(i[2], regex.RegexEmpty)
-      assert start_conditions[0].eof_action is None
-      start_conditions[0].eof_action = len(eof_actions)
-      eof_actions.append(i[3])
+      assert isinstance(rule_trailing_context, regex.RegexNone)
+      for j in rule_start_conditions:
+        if default and start_conditions[j].eof_action != 0:
+          continue # rule applies to start conditions with no EOF rule yet
+        assert start_conditions[j].eof_action == 0
+        start_conditions[j].eof_action = len(eof_actions)
+      eof_actions.append(rule_action)
     else:
       if isinstance(rule_expr, ast.BOLRule):
         bol_rule = True
         rule_expr = rule_expr[0]
       else:
         bol_rule = False
+      assert isinstance(rule_expr, regex.Regex)
       rule_expr = regex.RegexSequence(
         children = [
           rule_expr,
           regex.RegexGroup(
             children = [
-              i[2] # trailing context
+              rule_trailing_context
             ]
           )
         ]
       )
       rule_expr.post_process(len(actions))
-      for j in range(int(bol_rule), 2):
-        start_condition_exprs[j] = regex.RegexOr(
-          children = [
-            start_condition_exprs[j],
-            rule_expr
-          ]
-        )
-      actions.append(i[3])
+      for j in rule_start_conditions:
+        for k in range(j * 2 + int(bol_rule), j * 2 + 2):
+          start_condition_exprs[k] = regex.RegexOr(
+            children = [
+              start_condition_exprs[k],
+              rule_expr
+            ]
+          )
+      actions.append(rule_action)
 
 nfa = regex.NFA()
 for i in range(len(start_condition_exprs)):
   # make expr match as much as possible
+  # add default rule to match one char
   start_condition_exprs[i] = regex.RegexAnd(
     children = [
       regex.RegexRepeat(
@@ -304,14 +365,32 @@ for i in range(len(start_condition_exprs)):
           )
         ]
       ),
-      start_condition_exprs[i]
+      regex.RegexOr(
+        children = [
+          start_condition_exprs[i],
+          regex.RegexSequence(
+            children = [
+              regex.RegexCharacter(
+                char_set = [0, 0x100]
+              ),
+              regex.RegexGroup(
+                group_index = len(actions),
+                children = [
+                  regex.RegexEmpty()
+                ]
+              )
+            ]
+          )
+        ]
+      )
     ]
   )
-  print('i', i, 'expr', repr(start_condition_exprs[i]))
+  #print('i', i, 'expr', repr(start_condition_exprs[i]))
   start_condition_exprs[i].add_to_nfa(nfa)
+actions.append(ast.Action(text = 'ECHO;\n'))
 eob_expr = regex.RegexGroup(children = [regex.RegexEmpty()])
 eob_expr.post_process(len(actions))
-print('eob expr', repr(eob_expr))
+#print('eob expr', repr(eob_expr))
 eob_expr.add_to_nfa(nfa)
 
 dfa = nfa.to_dfa()
@@ -320,24 +399,37 @@ dfa = nfa.to_dfa()
 #print(dfa.match_text('1.0 + 5', 0))
  
 flex_dfa = FlexDFA(dfa) #nfa.to_dfa())
-with open('skel/lex.yy.c', 'r') as fin:
-  with open('lex.yy.c', 'w+') as fout:
+with open(skel_file, 'r') as fin:
+  with open(out_file, 'w+') as fout:
     line = fin.readline()
     while len(line):
       if line == '/* GENERATE SECTION1 */\n':
         fout.write(
           '''/* GENERATE SECTION1 BEGIN */
-{0:s}/*GENERATE SECTION1 END*/
+{0:s}/* GENERATE SECTION1 END*/
 '''.format(
             ''.join(
               [
                 element.get_text(i, 0)
-                for i in root[0]
+                for i in section1
                 if isinstance(i, ast.CodeBlock)
               ]
             )
           )
         )
+      elif line == '/* GENERATE STARTCONDDECL */\n':
+        fout.write(
+          '''/* GENERATE STARTCONDDECL BEGIN */
+{0:s}/* GENERATE STARTCONDDECL END*/
+'''.format(
+            ''.join(
+              [
+                '#define {0:s} {1:d}\n'.format(start_conditions[i].name, i)
+                for i in range(len(start_conditions))
+              ]
+            )
+          )
+        )
       elif line == '/* GENERATE TABLES */\n':
         yy_acclist = []
         yy_accept = [0]
@@ -445,12 +537,12 @@ static const flex_int16_t yy_chk[] = {{{6:s}
       elif line == '/* GENERATE SECTION2INITIAL */\n':
         fout.write(
           '''/* GENERATE SECTION2INITIAL BEGIN */
-/* GENERATE SECTION2INITIAL END */
+{0:s}/* GENERATE SECTION2INITIAL END */
 '''.format(
             ''.join(
               [
                 element.get_text(i, 0)
-                for i in root[1]
+                for i in section2
                 if isinstance(i, ast.CodeBlock)
               ]
             )
@@ -461,10 +553,11 @@ static const flex_int16_t yy_chk[] = {{{6:s}
           [
             j
             for j in range(len(start_conditions))
-            if start_conditions[i].eof_action == i
+            if start_conditions[j].eof_action == i
           ]
           for i in range(len(eof_actions))
         ]
+        #print('eof_action_to_start_conditions', eof_action_to_start_conditions)
         fout.write(
           '''/* GENERATE SECTION2 BEGIN */
 {0:s}{1:s}/* GENERATE SECTION2 END */
@@ -492,7 +585,7 @@ YY_RULE_SETUP
                       for j in eof_action_to_start_conditions[i]
                     ]
                   ),
-                  eof_actions[i]
+                  element.get_text(eof_actions[i], 0)
                 )
                 for i in range(len(eof_actions))
                 if len(eof_action_to_start_conditions[i]) > 0
@@ -501,12 +594,11 @@ YY_RULE_SETUP
           )
         )
       elif line == '/* GENERATE SECTION3 */\n':
-        assert len(root) < 2 or isinstance(root[2], ast.Section3)
         fout.write(
           '''/* GENERATE SECTION3 BEGIN */
 {0:s}/*GENERATE SECTION3 END */
 '''.format(
-            element.get_text(root[2], 0) if len(root) >= 3 else ''
+            element.get_text(section3, 0)
           )
         )
       else:
diff --git a/skel/lex.yy.c.patch b/skel/lex.yy.c.patch
index f2b6ae7..ee49c9b 100644 (file)
@@ -1,5 +1,5 @@
 --- lex.yy.c.orig      2018-06-29 12:12:25.644004319 +1000
-+++ lex.yy.c   2018-06-29 22:32:56.627837990 +1000
++++ lex.yy.c   2018-06-30 09:47:14.740034867 +1000
 @@ -1,6 +1,3 @@
 -
 -#line 2 "lex.yy.c"
  
  extern int yy_flex_debug;
  int yy_flex_debug = 0;
-@@ -553,8 +379,8 @@
+@@ -553,10 +379,10 @@
  #define YY_MORE_ADJ (yy_more_len)
  #define YY_RESTORE_YY_MORE_OFFSET
  char *yytext;
 -#line 1 "skel.l"
 -#line 557 "lex.yy.c"
-+
-+/* GENERATE SECTION1 */
  
- #define INITIAL 0
+-#define INITIAL 0
++/* GENERATE SECTION1 */
++
++/* GENERATE STARTCONDDECL */
  
+ #ifndef YY_NO_UNISTD_H
+ /* Special case for "unistd.h", since it is non-ANSI. We include it way
 @@ -780,9 +606,7 @@
                }
  
diff --git a/tests/Makefile b/tests/Makefile
index 9fc8dd2..33c6d57 100644 (file)
@@ -1,10 +1,29 @@
+all: cal flex0 flex1
+
 cal: y.tab.c lex.yy.c
-       gcc -o $@ $<
+       # remove -ll when we've implemented noyywrap
+       gcc -o $@ $< -ll
 
 lex.yy.c: cal.l
-       ../../bootstrap_flex.git/src/flex $< 2>$<.xml
-       cp $@ $@.orig
-       patch $@ <$@.patch
+       ../../bootstrap_flex.git/src/flex -o /dev/null $< 2>$<.xml
+       ../plex.py $<
+       # add the patch for state machine diagnostic
+       #cp $@ $@.orig
+       #patch $@ <$@.patch
 
 y.tab.c: cal.y
        ../../bootstrap_bison.git/src/bison -y $< 2>$<.xml
+
+flex0: flex0.c
+       gcc -o $@ $< -ll
+
+flex0.c: flex0.l
+       ../../bootstrap_flex.git/src/flex -o /dev/null $< 2>$<.xml
+       ../plex.py -o $@ $<
+
+flex1: flex1.c
+       gcc -o $@ $< -ll
+
+flex1.c: flex1.l
+       ../../bootstrap_flex.git/src/flex -o /dev/null $< 2>$<.xml
+       ../plex.py -o $@ $<
diff --git a/tests/cal.l.xml b/tests/cal.l.xml
deleted file mode 100644 (file)
index 07144b8..0000000
+++ /dev/null
@@ -1,20 +0,0 @@
-<RefList><PLexSpecification ref="0"><Section1>%{
-<CodeBlock>/* this is section 1 */
-</CodeBlock>%}
-
-DIGIT [0-9]+\.?|[0-9]*\.[0-9]+
-
-<Options>%option <Options_ECS>noecs</Options_ECS> <Options_MetaECS>nometa-ecs</Options_MetaECS> <Options_YYWrap>noyywrap</Options_YYWrap> <Options_Reject value="true">reject</Options_Reject> <Options_YYMore value="true">yymore</Options_YYMore></Options>
-
-</Section1>%%<Section2>
-
-       <CodeBlock>/* this is section 2 initial */
-</CodeBlock>
-<Rule><StartCondNone />[<RegexCharacterOr><RegexCharacter char_set="" /><RegexCharacter char_set="32 33"> </RegexCharacter></RegexCharacterOr>]<RegexEmpty /><Action>
-</Action></Rule>{DIGIT}<Rule><StartCondNone />(<RegexOr><RegexSequence><RegexRepeat count0="1">[<RegexCharacterOr><RegexCharacter char_set="" /><RegexCharacter char_set="48 58">0-9</RegexCharacter></RegexCharacterOr>]+</RegexRepeat><RegexRepeat count0="0" count1="1"><RegexCharacter char_set="46 47">\.</RegexCharacter>?</RegexRepeat></RegexSequence>|<RegexSequence><RegexSequence><RegexRepeat count0="0">[<RegexCharacterOr><RegexCharacter char_set="" /><RegexCharacter char_set="48 58">0-9</RegexCharacter></RegexCharacterOr>]*</RegexRepeat><RegexCharacter char_set="46 47">\.</RegexCharacter></RegexSequence><RegexRepeat count0="1">[<RegexCharacterOr><RegexCharacter char_set="" /><RegexCharacter char_set="48 58">0-9</RegexCharacter></RegexCharacterOr>]+</RegexRepeat></RegexSequence></RegexOr>) <RegexEmpty /><Action>{ yylval = atof(yytext); return NUM; }
-</Action></Rule><Rule><StartCondNone /><RegexOr><RegexCharacter char_set="10 11">\n</RegexCharacter>|<RegexCharacter char_set="0 10 11 256">.</RegexCharacter></RegexOr>       <RegexEmpty /><Action>{ return yytext[0]; }
-</Action></Rule>
-</Section2>%%<Section3>
-
-/* this is section 3 */
-</Section3></PLexSpecification></RefList>
\ No newline at end of file
diff --git a/tests/flex0.l b/tests/flex0.l
new file mode 100644 (file)
index 0000000..f64842a
--- /dev/null
@@ -0,0 +1,27 @@
+%{
+#include <math.h>
+%}
+%s expect
+
+%%
+expect-floats        BEGIN(expect);
+
+<expect>[0-9]+"."[0-9]+      {
+            printf( "found a float, = %f\n",
+                    atof( yytext ) );
+            }
+<expect>\n           {
+            /* that's the end of the line, so
+             * we need another "expect-number"
+             * before we'll recognize any more
+             * numbers
+             */
+            BEGIN(INITIAL);
+            }
+
+[0-9]+      {
+            printf( "found an integer, = %d\n",
+                    atoi( yytext ) );
+            }
+
+"."         printf( "found a dot\n" );
diff --git a/tests/flex1.l b/tests/flex1.l
new file mode 100644 (file)
index 0000000..e7bc862
--- /dev/null
@@ -0,0 +1,11 @@
+%x comment
+%%
+        int line_num = 1;
+
+"/*"         BEGIN(comment);
+<<EOF>>      printf("line_num %d\n", line_num); yyterminate(); /* Nick added */
+
+<comment>[^*\n]*        /* eat anything that's not a '*' */
+<comment>"*"+[^*/\n]*   /* eat up '*'s not followed by '/'s */
+<comment>\n             ++line_num;
+<comment>"*"+"/"        BEGIN(INITIAL);