Add continued actions and %option nodefault, make acclist generation slightly more...
author Nick Downing <downing.nick@gmail.com>
Tue, 24 Jul 2018 01:20:42 +0000 (11:20 +1000)
committer Nick Downing <downing.nick@gmail.com>
Tue, 24 Jul 2018 02:43:15 +0000 (12:43 +1000)
ast.py
bootstrap_plex.py
flex_dfa.py
regex.py

diff --git a/ast.py b/ast.py
index 9dfcea6..65b2849 100644 (file)
--- a/ast.py
+++ b/ast.py
@@ -36,7 +36,8 @@ class Item(element.Element):
     name_to_start_condition,
     all_start_conditions,
     inclusive_start_conditions,
-    parent_start_conditions
+    parent_start_conditions,
+    continued_action
   ):
     raise NotImplementedException
 
@@ -239,9 +240,11 @@ class PLex(element.Element):
         name_to_start_condition,
         all_start_conditions,
         inclusive_start_conditions,
-        parent_start_conditions
+        parent_start_conditions,
+        continued_action
       ):
         section.code_blocks_text.append(self[0])
+        return continued_action
 
     # GENERATE ELEMENT(list(ref) code_blocks_text) BEGIN
     def __init__(
@@ -297,18 +300,21 @@ class PLex(element.Element):
       plex,
       name_to_start_condition,
       all_start_conditions,
-      inclusive_start_conditions,
-      parent_start_conditions
+      inclusive_start_conditions
     ):
+      parent_start_conditions = set()
+      continued_action = False
       for i in self:
-        i.post_process(
+        continued_action = i.post_process(
           plex,
           self,
           name_to_start_condition,
           all_start_conditions,
           inclusive_start_conditions,
-          parent_start_conditions
+          parent_start_conditions,
+          continued_action
         )
+      assert not continued_action
 
   class Section1(Section1Or2):
     class Options(Item):
@@ -743,10 +749,12 @@ class PLex(element.Element):
         name_to_start_condition,
         all_start_conditions,
         inclusive_start_conditions,
-        parent_start_conditions
+        parent_start_conditions,
+        continued_action
       ):
         for i in self:
           i.post_process(section)
+        return continued_action
 
     class StartConditions(Item):
       # GENERATE ELEMENT(bool exclusive) BEGIN
@@ -801,7 +809,8 @@ class PLex(element.Element):
         name_to_start_condition,
         all_start_conditions,
         inclusive_start_conditions,
-        parent_start_conditions
+        parent_start_conditions,
+        continued_action
       ):
         for i in self:
           name = i.get_text()
@@ -819,6 +828,7 @@ class PLex(element.Element):
               eof_action = 0,
             )
           )
+        return continued_action
 
     # GENERATE ELEMENT(bool caseless, bool default, bool ecs, bool meta_ecs, bool reject, bool stack, bool std_init, bool yymore, bool yy_top_state, bool yywrap) BEGIN
     def __init__(
@@ -989,8 +999,7 @@ class PLex(element.Element):
       plex,
       name_to_start_condition,
       all_start_conditions,
-      inclusive_start_conditions,
-      parent_start_conditions
+      inclusive_start_conditions
     ):
       self.caseless = False
       self.default = True
@@ -1007,8 +1016,7 @@ class PLex(element.Element):
         plex,
         name_to_start_condition,
         all_start_conditions,
-        inclusive_start_conditions,
-        parent_start_conditions
+        inclusive_start_conditions
       )
 
   class Section2(Section1Or2):
@@ -1046,8 +1054,10 @@ class PLex(element.Element):
         name_to_start_condition,
         all_start_conditions,
         inclusive_start_conditions,
-        parent_start_conditions
+        parent_start_conditions,
+        continued_action
       ):
+        assert not continued_action
         if self[0].wildcard:
           start_conditions = all_start_conditions
         else:
@@ -1057,24 +1067,27 @@ class PLex(element.Element):
               name_to_start_condition[i.get_text()]
             )
         for i in self[1:]:
-          i.post_process(
+          continued_action = i.post_process(
             plex,
             section,
             name_to_start_condition,
             all_start_conditions,
             inclusive_start_conditions,
-            start_conditions # parent_start_conditions
+            start_conditions, # parent_start_conditions
+            continued_action
           )
-
+        assert not continued_action
+        return False
     class Rule(Item):
       class Action(element.Element):
-        # GENERATE ELEMENT() BEGIN
+        # GENERATE ELEMENT(bool continued) BEGIN
         def __init__(
           self,
           tag = 'PLex_Section2_Rule_Action',
           attrib = {},
           text = '',
-          children = []
+          children = [],
+          continued = False
         ):
           element.Element.__init__(
             self,
@@ -1083,12 +1096,30 @@ class PLex(element.Element):
             text,
             children
           )
+          self.continued = (
+            element.deserialize_bool(continued)
+          if isinstance(continued, str) else
+            continued
+          )
+        def serialize(self, ref_list):
+          element.Element.serialize(self, ref_list)
+          self.set('continued', element.serialize_bool(self.continued))
+        def deserialize(self, ref_list):
+          element.Element.deserialize(self, ref_list)
+          self.continued = element.deserialize_bool(self.get('continued', 'false'))
         def copy(self, factory = None):
           result = element.Element.copy(
             self,
             Action if factory is None else factory
           )
+          result.continued = self.continued
           return result
+        def repr_serialize(self, params):
+          element.Element.repr_serialize(self, params)
+          if self.continued != False:
+            params.append(
+              'continued = {0:s}'.format(repr(self.continued))
+            )
         def __repr__(self):
           params = []
           self.repr_serialize(params)
@@ -1216,7 +1247,8 @@ class PLex(element.Element):
         name_to_start_condition,
         all_start_conditions,
         inclusive_start_conditions,
-        parent_start_conditions
+        parent_start_conditions,
+        continued_action
       ):
         if self[0].wildcard:
           start_conditions = all_start_conditions
@@ -1227,6 +1259,7 @@ class PLex(element.Element):
               name_to_start_condition[i.get_text()]
             )
         if isinstance(self[1], PLex.Section2.Rule.EOFRule):
+          assert not continued_action
           if len(start_conditions) == 0:
             for i in all_start_conditions:
               if plex.start_conditions[i].eof_action == 0:
@@ -1239,7 +1272,8 @@ class PLex(element.Element):
               plex.start_conditions[i].eof_action = (
                 len(plex.eof_actions_text)
               )
-          plex.eof_actions_text.append(self[2][0] if len(self) > 2 else PLex.Text()) # fix this later
+          assert not self[2].continued
+          plex.eof_actions_text.append(self[2][0])
         elif isinstance(self[1], PLex.Section2.Rule.FLexRule):
           for i in (
             start_conditions
@@ -1256,9 +1290,32 @@ class PLex(element.Element):
             caseless = plex[0].caseless
           ) # trailing context regex
           self[1].action = len(plex.actions_text)
-          plex.actions_text.append(self[2][0] if len(self) > 2 else PLex.Text()) # fix this later
+          if self[2].continued:
+            continued_action = True
+          else:
+            plex.actions_text.append(self[2][0])
+            continued_action = False
+            #def to_text(node):
+            #  return ''.join(
+            #    [
+            #      j
+            #      for i in range(len(node))
+            #      for j in [element.get_text(node, i), to_text(node[i])]
+            #    ] +
+            #    [element.get_text(node, len(node))]
+            #  )
+            #text = '{0:s}/{1:s}'.format(to_text(self[1][0]), to_text(self[1][1]))
+            #element.set_text(
+            #  plex.actions_text[-1],
+            #  0,
+            #  'fprintf(stderr, "%d >>>%s<<< {0:s}\\n", yy_start, yytext);\n{1:s}'.format(
+            #    text.replace('\\', '\\\\').replace('"', '\\"').replace('%', '%%'),
+            #    element.get_text(plex.actions_text[-1], 0)
+            #  )
+            #)
         else:
           assert False
+        return continued_action
  
     class StartConditions(element.Element):
       # GENERATE ELEMENT(bool wildcard) BEGIN
@@ -1481,26 +1538,30 @@ class PLex(element.Element):
     name_to_start_condition = {'INITIAL': 0}
     all_start_conditions = set([0])
     inclusive_start_conditions = set([0])
-    start_conditions = set()
 
     # perform the semantic analysis pass
     self[0].post_process(
       self,
       name_to_start_condition,
       all_start_conditions,
-      inclusive_start_conditions,
-      start_conditions # parent_start_conditions
+      inclusive_start_conditions
     )
     self[1].post_process(
       self,
       name_to_start_condition,
       all_start_conditions,
-      inclusive_start_conditions,
-      start_conditions # parent_start_conditions
+      inclusive_start_conditions
     )
     self.default_action = len(self.actions_text)
-    self.actions_text.append(PLex.Text(text = 'ECHO;\n'))
-
+    self.actions_text.append(
+      PLex.Text(
+        text = (
+          'ECHO;\n'
+        if self[0].default else
+          'YY_FATAL_ERROR( "flex scanner jammed" );\n'
+        )
+      )
+    )
   def to_nfa(self):
     _nfa = nfa.NFA()
     for i in self.start_conditions:
diff --git a/bootstrap_plex.py b/bootstrap_plex.py
index a871402..461d8e4 100755 (executable)
--- a/bootstrap_plex.py
+++ b/bootstrap_plex.py
@@ -37,6 +37,6 @@ with open(in_file) as fin:
 #element.serialize(plex, 'a.xml', 'utf-8')
 #plex = element.deserialize('a.xml', ast.factory, 'utf-8')
 plex.post_process()
-#element.serialize(plex, 'b.xml', 'utf-8')
-#plex = element.deserialize('b.xml', ast.factory, 'utf-8')
+element.serialize(plex, 'b.xml', 'utf-8')
+plex = element.deserialize('b.xml', ast.factory, 'utf-8')
 flex_dfa.generate(plex, skel_file, out_file)
diff --git a/flex_dfa.py b/flex_dfa.py
index 3af9641..ccdb73d 100644 (file)
--- a/flex_dfa.py
+++ b/flex_dfa.py
@@ -97,29 +97,29 @@ class FlexDFA:
         new_accept[:self.accept.shape[0]] = self.accept
         self.accept = new_accept
       self.accept[n_states] = n_acclist
+      accept_set = set()
       for k in [j for i in threads0[prefix_slop:] for j in i]:
         acc = k >> 1
         if k & 1:
-          if (
-            n_acclist == self.accept[n_states] or
-            self.acclist[n_acclist - 1] != acc | FlexDFA.YY_TRAILING_HEAD_MASK
-          ):
+          if (acc | FlexDFA.YY_TRAILING_HEAD_MASK) not in accept_set:
             # look back to start of trailing context, then accept
             acc |= FlexDFA.YY_TRAILING_MASK
           # otherwise zero length trailing context, accept immediately
         else:
           # mark start of (hopefully safe) trailing context
           acc |= FlexDFA.YY_TRAILING_HEAD_MASK
-        if n_acclist >= self.acclist.shape[0]:
-          # extend acclist
-          new_acclist = numpy.zeros(
-            (self.acclist.shape[0] * 2,),
-            numpy.uint16
-          )
-          new_acclist[:self.acclist.shape[0]] = self.acclist
-          self.acclist = new_acclist
-        self.acclist[n_acclist] = acc
-        n_acclist += 1
+        if acc not in accept_set:
+          if n_acclist >= self.acclist.shape[0]:
+            # extend acclist
+            new_acclist = numpy.zeros(
+              (self.acclist.shape[0] * 2,),
+              numpy.uint16
+            )
+            new_acclist[:self.acclist.shape[0]] = self.acclist
+            self.acclist = new_acclist
+          self.acclist[n_acclist] = acc
+          n_acclist += 1
+          accept_set.add(acc)
 
       # calculate transition row from _dfa.state character-to-action table
       if n_states >= transitions.shape[0]:
diff --git a/regex.py b/regex.py
index ac6b158..ca07332 100644 (file)
--- a/regex.py
+++ b/regex.py
@@ -768,100 +768,215 @@ def factory(tag, attrib = {}, *args, **kwargs):
   return tag_to_class.get(tag, element.Element)(tag, attrib, *args, **kwargs)
 # GENERATE END
 
-# some of this should be moved into grammar.py:
-#if __name__ == '__main__':
-#  import sys
-#  import xml.etree.ElementTree
-#
-#  regex = RegexAnd(children = [RegexRepeat(children = [RegexCharacterNot(
-#children = [RegexCharacter()], character_set = [0, 256])]), RegexGroup(children = [
-#RegexOr(children = [RegexOr(children = [RegexOr(children = [RegexGroup(children
-#= [RegexRepeat(children = [RegexCharacter(character_set = [9, 14, 32, 33])],
-#one_or_more = True)], group_index = 1, group_name = 'Whitespace'), RegexGroup(
-#children = [RegexRepeat(children = [RegexCharacter(character_set = [48, 58])],
-#one_or_more = True)], group_index = 2, group_name = 'Number')]), RegexGroup(
-#children = [RegexSequence(children = [RegexSequence(children = [RegexSequence(
-#children = [RegexEmpty(), RegexCharacter(character_set = [102, 103])]),
-#RegexCharacter(character_set = [111, 112])]), RegexCharacter(character_set = [114, 115])]
-#)], group_index = 3, group_name = 'For')]), RegexGroup(children = [
-#RegexSequence(children = [RegexCharacter(character_set = [65, 91, 95, 96, 97, 123]),
-#RegexRepeat(children = [RegexCharacter(character_set = [48, 58, 65, 91, 95, 96, 97,
-#123])])])], group_index = 4, group_name = 'Identifier')])], group_index = 0)])
-#  #sys.stdout.write(
-#  #  wrap_repr(
-#  #    '  regex = {0:s}'.format(repr(regex).replace('regex.', '')),
-#  #    79
-#  #  )
-#  #)
-#
-#  _nfa = regex.to_nfa()
-#  #sys.stdout.write(
-#  #  wrap_repr(
-#  #    '  _nfa = {0:s}'.format(repr(_nfa).replace('regex.', '')),
-#  #    79
-#  #  )
-#  #)
-#
-#  text = '    id   99id id99 for forex  '
-#  i = 0
-#  while i < len(text):
-#    print('text "{0:s}"'.format(text[i:i + 72].replace('\n', '$')))
-#    thread = _nfa.match_text(text, i)
-#    if thread is None:
-#      print('no match')
-#      break
-#    i = thread[0] # end position of overall match
-#    group_start = [-1 for j in range(len(_nfa.groups))]
-#    group_end = [-1 for j in range(len(_nfa.groups))]
-#    while thread is not None:
-#      pos, mark, thread = thread
-#      group = mark >> 1
-#      if (mark & 1) == 0:
-#        group_start[group] = pos
-#        print(
-#          'group {0:d} name "{1:s}" text "{2:s}"'.format(
-#             group,
-#             _nfa.groups[group][0],
-#             text[group_start[group]:group_end[group]].replace('\n', '$')
-#          )
-#        )
-#      else:
-#        group_end[group] = pos
-#
-#  dfa = _nfa.to_dfa()
-#  #sys.stdout.write(
-#  #  wrap_repr(
-#  #    '  dfa = {0:s}'.format(repr(dfa).replace('regex.', '')),
-#  #    79
-#  #  )
-#  #)
-#
-#  text = '    id   99id id99 for forex  '
-#  i = 0
-#  while i < len(text):
-#    print('text "{0:s}"'.format(text[i:i + 72].replace('\n', '$')))
-#    thread = dfa.match_text(text, i)
-#    if thread is None:
-#      print('no match')
-#      break
-#    i = thread[0] # end position of overall match
-#    group_start = [-1 for j in range(len(dfa.groups))]
-#    group_end = [-1 for j in range(len(dfa.groups))]
-#    while thread is not None:
-#      pos, mark, thread = thread
-#      group = mark >> 1
-#      if (mark & 1) == 0:
-#        group_start[group] = pos
-#        print(
-#          'group {0:d} name "{1:s}" text "{2:s}"'.format(
-#             group,
-#             dfa.groups[group][0],
-#             text[group_start[group]:group_end[group]].replace('\n', '$')
-#          )
-#        )
-#      else:
-#        group_end[group] = pos
-#
+if __name__ == '__main__':
+  import sys
+  import xml.etree.ElementTree
+  import wrap_repr
+
+  _regex = RegexAnd(
+    children = [
+      RegexRepeat(
+        children = [
+          RegexCharacterNot(
+           children = [
+             RegexCharacterLiteral()
+           ],
+           character_set = [0, 256]
+         )
+       ]
+      ),
+      RegexGroup(
+        children = [
+          RegexOr(
+            children = [
+              RegexOr(
+                children = [
+                  RegexOr(
+                    children = [
+                      RegexOr(
+                        children = [
+                          RegexNone(),
+                          RegexGroup(
+                            children = [
+                              RegexRepeat(
+                                children = [
+                                  RegexCharacterLiteral(
+                                    character_set = [9, 14, 32, 33]
+                                  )
+                                ],
+                                count0 = 1
+                              )
+                            ],
+                            index = 1,
+                            name = 'Whitespace'
+                          )
+                        ]
+                      ),
+                      RegexGroup(
+                        children = [
+                          RegexRepeat(
+                            children = [
+                              RegexCharacterLiteral(
+                                character_set = [48, 58]
+                              )
+                            ],
+                            count0 = 1
+                          )
+                        ],
+                        index = 2,
+                        name = 'Number'
+                      )
+                    ]
+                  ),
+                  RegexGroup(
+                    children = [
+                      RegexSequence(
+                        children = [
+                          RegexSequence(
+                            children = [
+                              RegexSequence(
+                                children = [
+                                  RegexSequence(
+                                    children = [
+                                      RegexEmpty(),
+                                      RegexCharacterLiteral(
+                                        character_set = [102, 103]
+                                      )
+                                    ]
+                                  ),
+                                  RegexCharacterLiteral(
+                                    character_set = [111, 112]
+                                  )
+                                ]
+                              ),
+                              RegexCharacterLiteral(
+                                character_set = [114, 115]
+                              )
+                            ]
+                          ),
+                          RegexRepeat(
+                            children = [
+                              RegexCharacterLiteral(
+                                character_set = [101, 102]
+                              )
+                            ],
+                            count0 = 0,
+                            count1 = 1
+                          )
+                        ]
+                      )
+                    ],
+                    index = 3,
+                    name = 'For'
+                  )
+                ]
+              ),
+              RegexGroup(
+                children = [
+                  RegexSequence(
+                    children = [
+                      RegexCharacterLiteral(
+                        character_set = [65, 91, 95, 96, 97, 123]
+                      ),
+                      RegexRepeat(
+                        children = [
+                          RegexCharacterLiteral(
+                            character_set = [48, 58, 65, 91, 95, 96, 97, 123]
+                          )
+                        ]
+                      )
+                    ]
+                  )
+                ],
+                index = 4,
+                name = 'Identifier'
+              )
+            ]
+          )
+        ],
+        index = 0
+      )
+    ]
+  )
+  sys.stdout.write(
+    wrap_repr.wrap_repr(
+      '  _regex = {0:s}'.format(repr(_regex).replace('regex.', '')),
+      79
+    )
+  )
+
+  groups = []
+  _regex.add_to_groups(groups)
+  _nfa = nfa.NFA(groups)
+  _regex.add_to_nfa(_nfa)
+  sys.stdout.write(
+    wrap_repr.wrap_repr(
+      '  _nfa = {0:s}'.format(repr(_nfa).replace('nfa.', '')),
+      79
+    )
+  )
+
+  text = '     id   99id id99 for fore foree forex  '
+  i = 0
+  while i < len(text):
+    print('text "{0:s}"'.format(text[i:i + 72].replace('\n', '$')))
+    thread = _nfa.match_text(text, i)
+    if thread is None:
+      print('no match')
+      break
+    i = thread[0] # end position of overall match
+    group_start = [-1 for j in range(len(_nfa.groups))]
+    group_end = [-1 for j in range(len(_nfa.groups))]
+    while thread is not None:
+      pos, mark, thread = thread
+      group = mark >> 1
+      if (mark & 1) == 0:
+        group_start[group] = pos
+        print(
+          'group {0:d} name "{1:s}" text "{2:s}"'.format(
+             group,
+             _nfa.groups[group][0],
+             text[group_start[group]:group_end[group]].replace('\n', '$')
+          )
+        )
+      else:
+        group_end[group] = pos
+
+  _dfa = _nfa.to_dfa()
+  sys.stdout.write(
+    wrap_repr.wrap_repr(
+      '  _dfa = {0:s}'.format(repr(_dfa).replace('dfa.', '')),
+      79
+    )
+  )
+
+  text = '     id   99id id99 for fore foree forex  '
+  i = 0
+  while i < len(text):
+    print('text "{0:s}"'.format(text[i:i + 72].replace('\n', '$')))
+    thread = _dfa.match_text(text, i)
+    if thread is None:
+      print('no match')
+      break
+    i = thread[0] # end position of overall match
+    group_start = [-1 for j in range(len(_dfa.groups))]
+    group_end = [-1 for j in range(len(_dfa.groups))]
+    while thread is not None:
+      pos, mark, thread = thread
+      group = mark >> 1
+      if (mark & 1) == 0:
+        group_start[group] = pos
+        print(
+          'group {0:d} name "{1:s}" text "{2:s}"'.format(
+             group,
+             _dfa.groups[group][0],
+             text[group_start[group]:group_end[group]].replace('\n', '$')
+          )
+        )
+      else:
+        group_end[group] = pos
+
+# move this into grammar.py:
 #  grammar = Grammar(children = [Grammar.Production(children = [RegexSequence(
 #children = [RegexSequence(children = [RegexEmpty(), RegexCharacterRule(character_set
 #= [288, 295], rule_name = 'whitespace_opt')]), RegexCharacterRule(character_set = [
@@ -982,7 +1097,7 @@ def factory(tag, attrib = {}, *args, **kwargs):
 #'whitespace_opt')]), RegexCharacter(character_set = [32, 33])])], nonterminal = 36)
 #], n_terminals = 258)
 #  #sys.stdout.write(
-#  #  wrap_repr(
+#  #  wrap_repr.wrap_repr(
 #  #    '  grammar = {0:s}'.format(repr(grammar).replace('regex.', '')),
 #  #    79
 #  #  )
@@ -990,7 +1105,7 @@ def factory(tag, attrib = {}, *args, **kwargs):
 #
 #  lr1 = grammar.to_lr1()
 #  #sys.stdout.write(
-#  #  wrap_repr(
+#  #  wrap_repr.wrap_repr(
 #  #    '  lr1 = {0:s}'.format(repr(lr1).replace('regex.', '')),
 #  #    79
 #  #  )
@@ -1003,7 +1118,7 @@ def factory(tag, attrib = {}, *args, **kwargs):
 #
 #  clr1 = lr1.to_clr1()
 #  #sys.stdout.write(
-#  #  wrap_repr(
+#  #  wrap_repr.wrap_repr(
 #  #    '  clr1 = {0:s}'.format(repr(clr1).replace('regex.', '')),
 #  #    79
 #  #  )
@@ -1016,7 +1131,7 @@ def factory(tag, attrib = {}, *args, **kwargs):
 #
 #  lalr1 = lr1.to_lalr1()
 #  #sys.stdout.write(
-#  #  wrap_repr(
+#  #  wrap_repr.wrap_repr(
 #  #    '  lalr1 = {0:s}'.format(repr(lalr1).replace('regex.', '')),
 #  #    79
 #  #  )