Add continued actions and %option nodefault, make acclist generation slightly more...

author Nick Downing <downing.nick@gmail.com>

Tue, 24 Jul 2018 01:20:42 +0000 (11:20 +1000)

committer Nick Downing <downing.nick@gmail.com>

Tue, 24 Jul 2018 02:43:15 +0000 (12:43 +1000)
author Nick Downing <downing.nick@gmail.com>
Tue, 24 Jul 2018 01:20:42 +0000 (11:20 +1000)
committer Nick Downing <downing.nick@gmail.com>
Tue, 24 Jul 2018 02:43:15 +0000 (12:43 +1000)
diff --git a/ast.py b/ast.py

index 9dfcea6..65b2849 100644 (file)
--- a/ast.py
+++ b/ast.py
@@ -36,7 +36,8 @@ class Item(element.Element):
      name_to_start_condition,
      all_start_conditions,
      inclusive_start_conditions,
-    parent_start_conditions
+    parent_start_conditions,
+    continued_action
    ):
      raise NotImplementedException
  
@@ -239,9 +240,11 @@ class PLex(element.Element):
          name_to_start_condition,
          all_start_conditions,
          inclusive_start_conditions,
-        parent_start_conditions
+        parent_start_conditions,
+        continued_action
        ):
          section.code_blocks_text.append(self[0])
+        return continued_action
  
      # GENERATE ELEMENT(list(ref) code_blocks_text) BEGIN
      def __init__(
@@ -297,18 +300,21 @@ class PLex(element.Element):
        plex,
        name_to_start_condition,
        all_start_conditions,
-      inclusive_start_conditions,
-      parent_start_conditions
+      inclusive_start_conditions
      ):
+      parent_start_conditions = set()
+      continued_action = False
        for i in self:
-        i.post_process(
+        continued_action = i.post_process(
            plex,
            self,
            name_to_start_condition,
            all_start_conditions,
            inclusive_start_conditions,
-          parent_start_conditions
+          parent_start_conditions,
+          continued_action
          )
+      assert not continued_action
  
    class Section1(Section1Or2):
      class Options(Item):
@@ -743,10 +749,12 @@ class PLex(element.Element):
          name_to_start_condition,
          all_start_conditions,
          inclusive_start_conditions,
-        parent_start_conditions
+        parent_start_conditions,
+        continued_action
        ):
          for i in self:
            i.post_process(section)
+        return continued_action
  
      class StartConditions(Item):
        # GENERATE ELEMENT(bool exclusive) BEGIN
@@ -801,7 +809,8 @@ class PLex(element.Element):
          name_to_start_condition,
          all_start_conditions,
          inclusive_start_conditions,
-        parent_start_conditions
+        parent_start_conditions,
+        continued_action
        ):
          for i in self:
            name = i.get_text()
@@ -819,6 +828,7 @@ class PLex(element.Element):
                eof_action = 0,
              )
            )
+        return continued_action
  
      # GENERATE ELEMENT(bool caseless, bool default, bool ecs, bool meta_ecs, bool reject, bool stack, bool std_init, bool yymore, bool yy_top_state, bool yywrap) BEGIN
      def __init__(
@@ -989,8 +999,7 @@ class PLex(element.Element):
        plex,
        name_to_start_condition,
        all_start_conditions,
-      inclusive_start_conditions,
-      parent_start_conditions
+      inclusive_start_conditions
      ):
        self.caseless = False
        self.default = True
@@ -1007,8 +1016,7 @@ class PLex(element.Element):
          plex,
          name_to_start_condition,
          all_start_conditions,
-        inclusive_start_conditions,
-        parent_start_conditions
+        inclusive_start_conditions
        )
  
    class Section2(Section1Or2):
@@ -1046,8 +1054,10 @@ class PLex(element.Element):
          name_to_start_condition,
          all_start_conditions,
          inclusive_start_conditions,
-        parent_start_conditions
+        parent_start_conditions,
+        continued_action
        ):
+        assert not continued_action
          if self[0].wildcard:
            start_conditions = all_start_conditions
          else:
@@ -1057,24 +1067,27 @@ class PLex(element.Element):
                name_to_start_condition[i.get_text()]
              )
          for i in self[1:]:
-          i.post_process(
+          continued_action = i.post_process(
              plex,
              section,
              name_to_start_condition,
              all_start_conditions,
              inclusive_start_conditions,
-            start_conditions # parent_start_conditions
+            start_conditions, # parent_start_conditions
+            continued_action
            )
-
+        assert not continued_action
+        return False
      class Rule(Item):
        class Action(element.Element):
-        # GENERATE ELEMENT() BEGIN
+        # GENERATE ELEMENT(bool continued) BEGIN
          def __init__(
            self,
            tag = 'PLex_Section2_Rule_Action',
            attrib = {},
            text = '',
-          children = []
+          children = [],
+          continued = False
          ):
            element.Element.__init__(
              self,
@@ -1083,12 +1096,30 @@ class PLex(element.Element):
              text,
              children
            )
+          self.continued = (
+            element.deserialize_bool(continued)
+          if isinstance(continued, str) else
+            continued
+          )
+        def serialize(self, ref_list):
+          element.Element.serialize(self, ref_list)
+          self.set('continued', element.serialize_bool(self.continued))
+        def deserialize(self, ref_list):
+          element.Element.deserialize(self, ref_list)
+          self.continued = element.deserialize_bool(self.get('continued', 'false'))
          def copy(self, factory = None):
            result = element.Element.copy(
              self,
              Action if factory is None else factory
            )
+          result.continued = self.continued
            return result
+        def repr_serialize(self, params):
+          element.Element.repr_serialize(self, params)
+          if self.continued != False:
+            params.append(
+              'continued = {0:s}'.format(repr(self.continued))
+            )
          def __repr__(self):
            params = []
            self.repr_serialize(params)
@@ -1216,7 +1247,8 @@ class PLex(element.Element):
          name_to_start_condition,
          all_start_conditions,
          inclusive_start_conditions,
-        parent_start_conditions
+        parent_start_conditions,
+        continued_action
        ):
          if self[0].wildcard:
            start_conditions = all_start_conditions
@@ -1227,6 +1259,7 @@ class PLex(element.Element):
                name_to_start_condition[i.get_text()]
              )
          if isinstance(self[1], PLex.Section2.Rule.EOFRule):
+          assert not continued_action
            if len(start_conditions) == 0:
              for i in all_start_conditions:
                if plex.start_conditions[i].eof_action == 0:
@@ -1239,7 +1272,8 @@ class PLex(element.Element):
                plex.start_conditions[i].eof_action = (
                  len(plex.eof_actions_text)
                )
-          plex.eof_actions_text.append(self[2][0] if len(self) > 2 else PLex.Text()) # fix this later
+          assert not self[2].continued
+          plex.eof_actions_text.append(self[2][0])
          elif isinstance(self[1], PLex.Section2.Rule.FLexRule):
            for i in (
              start_conditions
@@ -1256,9 +1290,32 @@ class PLex(element.Element):
              caseless = plex[0].caseless
            ) # trailing context regex
            self[1].action = len(plex.actions_text)
-          plex.actions_text.append(self[2][0] if len(self) > 2 else PLex.Text()) # fix this later
+          if self[2].continued:
+            continued_action = True
+          else:
+            plex.actions_text.append(self[2][0])
+            continued_action = False
+            #def to_text(node):
+            #  return ''.join(
+            #    [
+            #      j
+            #      for i in range(len(node))
+            #      for j in [element.get_text(node, i), to_text(node[i])]
+            #    ] +
+            #    [element.get_text(node, len(node))]
+            #  )
+            #text = '{0:s}/{1:s}'.format(to_text(self[1][0]), to_text(self[1][1]))
+            #element.set_text(
+            #  plex.actions_text[-1],
+            #  0,
+            #  'fprintf(stderr, "%d >>>%s<<< {0:s}\\n", yy_start, yytext);\n{1:s}'.format(
+            #    text.replace('\\', '\\\\').replace('"', '\\"').replace('%', '%%'),
+            #    element.get_text(plex.actions_text[-1], 0)
+            #  )
+            #)
          else:
            assert False
+        return continued_action
   
      class StartConditions(element.Element):
        # GENERATE ELEMENT(bool wildcard) BEGIN
@@ -1481,26 +1538,30 @@ class PLex(element.Element):
      name_to_start_condition = {'INITIAL': 0}
      all_start_conditions = set([0])
      inclusive_start_conditions = set([0])
-    start_conditions = set()
  
      # perform the semantic analysis pass
      self[0].post_process(
        self,
        name_to_start_condition,
        all_start_conditions,
-      inclusive_start_conditions,
-      start_conditions # parent_start_conditions
+      inclusive_start_conditions
      )
      self[1].post_process(
        self,
        name_to_start_condition,
        all_start_conditions,
-      inclusive_start_conditions,
-      start_conditions # parent_start_conditions
+      inclusive_start_conditions
      )
      self.default_action = len(self.actions_text)
-    self.actions_text.append(PLex.Text(text = 'ECHO;\n'))
-
+    self.actions_text.append(
+      PLex.Text(
+        text = (
+          'ECHO;\n'
+        if self[0].default else
+          'YY_FATAL_ERROR( "flex scanner jammed" );\n'
+        )
+      )
+    )
    def to_nfa(self):
      _nfa = nfa.NFA()
      for i in self.start_conditions:
diff --git a/bootstrap_plex.py b/bootstrap_plex.py

index a871402..461d8e4 100755 (executable)
--- a/bootstrap_plex.py
+++ b/bootstrap_plex.py
@@ -37,6 +37,6 @@ with open(in_file) as fin:
  #element.serialize(plex, 'a.xml', 'utf-8')
  #plex = element.deserialize('a.xml', ast.factory, 'utf-8')
  plex.post_process()
-#element.serialize(plex, 'b.xml', 'utf-8')
-#plex = element.deserialize('b.xml', ast.factory, 'utf-8')
+element.serialize(plex, 'b.xml', 'utf-8')
+plex = element.deserialize('b.xml', ast.factory, 'utf-8')
  flex_dfa.generate(plex, skel_file, out_file)
diff --git a/flex_dfa.py b/flex_dfa.py

index 3af9641..ccdb73d 100644 (file)
--- a/flex_dfa.py
+++ b/flex_dfa.py
@@ -97,29 +97,29 @@ class FlexDFA:
          new_accept[:self.accept.shape[0]] = self.accept
          self.accept = new_accept
        self.accept[n_states] = n_acclist
+      accept_set = set()
        for k in [j for i in threads0[prefix_slop:] for j in i]:
          acc = k >> 1
          if k & 1:
-          if (
-            n_acclist == self.accept[n_states] or
-            self.acclist[n_acclist - 1] != acc | FlexDFA.YY_TRAILING_HEAD_MASK
-          ):
+          if (acc | FlexDFA.YY_TRAILING_HEAD_MASK) not in accept_set:
              # look back to start of trailing context, then accept
              acc |= FlexDFA.YY_TRAILING_MASK
            # otherwise zero length trailing context, accept immediately
          else:
            # mark start of (hopefully safe) trailing context
            acc |= FlexDFA.YY_TRAILING_HEAD_MASK
-        if n_acclist >= self.acclist.shape[0]:
-          # extend acclist
-          new_acclist = numpy.zeros(
-            (self.acclist.shape[0] * 2,),
-            numpy.uint16
-          )
-          new_acclist[:self.acclist.shape[0]] = self.acclist
-          self.acclist = new_acclist
-        self.acclist[n_acclist] = acc
-        n_acclist += 1
+        if acc not in accept_set:
+          if n_acclist >= self.acclist.shape[0]:
+            # extend acclist
+            new_acclist = numpy.zeros(
+              (self.acclist.shape[0] * 2,),
+              numpy.uint16
+            )
+            new_acclist[:self.acclist.shape[0]] = self.acclist
+            self.acclist = new_acclist
+          self.acclist[n_acclist] = acc
+          n_acclist += 1
+          accept_set.add(acc)
  
        # calculate transition row from _dfa.state character-to-action table
        if n_states >= transitions.shape[0]:
diff --git a/regex.py b/regex.py

index ac6b158..ca07332 100644 (file)
--- a/regex.py
+++ b/regex.py
@@ -768,100 +768,215 @@ def factory(tag, attrib = {}, *args, **kwargs):
    return tag_to_class.get(tag, element.Element)(tag, attrib, *args, **kwargs)
  # GENERATE END
  
-# some of this should be moved into grammar.py:
-#if __name__ == '__main__':
-#  import sys
-#  import xml.etree.ElementTree
-#
-#  regex = RegexAnd(children = [RegexRepeat(children = [RegexCharacterNot(
-#children = [RegexCharacter()], character_set = [0, 256])]), RegexGroup(children = [
-#RegexOr(children = [RegexOr(children = [RegexOr(children = [RegexGroup(children
-#= [RegexRepeat(children = [RegexCharacter(character_set = [9, 14, 32, 33])],
-#one_or_more = True)], group_index = 1, group_name = 'Whitespace'), RegexGroup(
-#children = [RegexRepeat(children = [RegexCharacter(character_set = [48, 58])],
-#one_or_more = True)], group_index = 2, group_name = 'Number')]), RegexGroup(
-#children = [RegexSequence(children = [RegexSequence(children = [RegexSequence(
-#children = [RegexEmpty(), RegexCharacter(character_set = [102, 103])]),
-#RegexCharacter(character_set = [111, 112])]), RegexCharacter(character_set = [114, 115])]
-#)], group_index = 3, group_name = 'For')]), RegexGroup(children = [
-#RegexSequence(children = [RegexCharacter(character_set = [65, 91, 95, 96, 97, 123]),
-#RegexRepeat(children = [RegexCharacter(character_set = [48, 58, 65, 91, 95, 96, 97,
-#123])])])], group_index = 4, group_name = 'Identifier')])], group_index = 0)])
-#  #sys.stdout.write(
-#  #  wrap_repr(
-#  #    '  regex = {0:s}'.format(repr(regex).replace('regex.', '')),
-#  #    79
-#  #  )
-#  #)
-#
-#  _nfa = regex.to_nfa()
-#  #sys.stdout.write(
-#  #  wrap_repr(
-#  #    '  _nfa = {0:s}'.format(repr(_nfa).replace('regex.', '')),
-#  #    79
-#  #  )
-#  #)
-#
-#  text = '    id   99id id99 for forex  '
-#  i = 0
-#  while i < len(text):
-#    print('text "{0:s}"'.format(text[i:i + 72].replace('\n', '$')))
-#    thread = _nfa.match_text(text, i)
-#    if thread is None:
-#      print('no match')
-#      break
-#    i = thread[0] # end position of overall match
-#    group_start = [-1 for j in range(len(_nfa.groups))]
-#    group_end = [-1 for j in range(len(_nfa.groups))]
-#    while thread is not None:
-#      pos, mark, thread = thread
-#      group = mark >> 1
-#      if (mark & 1) == 0:
-#        group_start[group] = pos
-#        print(
-#          'group {0:d} name "{1:s}" text "{2:s}"'.format(
-#             group,
-#             _nfa.groups[group][0],
-#             text[group_start[group]:group_end[group]].replace('\n', '$')
-#          )
-#        )
-#      else:
-#        group_end[group] = pos
-#
-#  dfa = _nfa.to_dfa()
-#  #sys.stdout.write(
-#  #  wrap_repr(
-#  #    '  dfa = {0:s}'.format(repr(dfa).replace('regex.', '')),
-#  #    79
-#  #  )
-#  #)
-#
-#  text = '    id   99id id99 for forex  '
-#  i = 0
-#  while i < len(text):
-#    print('text "{0:s}"'.format(text[i:i + 72].replace('\n', '$')))
-#    thread = dfa.match_text(text, i)
-#    if thread is None:
-#      print('no match')
-#      break
-#    i = thread[0] # end position of overall match
-#    group_start = [-1 for j in range(len(dfa.groups))]
-#    group_end = [-1 for j in range(len(dfa.groups))]
-#    while thread is not None:
-#      pos, mark, thread = thread
-#      group = mark >> 1
-#      if (mark & 1) == 0:
-#        group_start[group] = pos
-#        print(
-#          'group {0:d} name "{1:s}" text "{2:s}"'.format(
-#             group,
-#             dfa.groups[group][0],
-#             text[group_start[group]:group_end[group]].replace('\n', '$')
-#          )
-#        )
-#      else:
-#        group_end[group] = pos
-#
+if __name__ == '__main__':
+  import sys
+  import xml.etree.ElementTree
+  import wrap_repr
+
+  _regex = RegexAnd(
+    children = [
+      RegexRepeat(
+        children = [
+          RegexCharacterNot(
+           children = [
+             RegexCharacterLiteral()
+           ],
+           character_set = [0, 256]
+         )
+       ]
+      ),
+      RegexGroup(
+        children = [
+          RegexOr(
+            children = [
+              RegexOr(
+                children = [
+                  RegexOr(
+                    children = [
+                      RegexOr(
+                        children = [
+                          RegexNone(),
+                          RegexGroup(
+                            children = [
+                              RegexRepeat(
+                                children = [
+                                  RegexCharacterLiteral(
+                                    character_set = [9, 14, 32, 33]
+                                  )
+                                ],
+                                count0 = 1
+                              )
+                            ],
+                            index = 1,
+                            name = 'Whitespace'
+                          )
+                        ]
+                      ),
+                      RegexGroup(
+                        children = [
+                          RegexRepeat(
+                            children = [
+                              RegexCharacterLiteral(
+                                character_set = [48, 58]
+                              )
+                            ],
+                            count0 = 1
+                          )
+                        ],
+                        index = 2,
+                        name = 'Number'
+                      )
+                    ]
+                  ),
+                  RegexGroup(
+                    children = [
+                      RegexSequence(
+                        children = [
+                          RegexSequence(
+                            children = [
+                              RegexSequence(
+                                children = [
+                                  RegexSequence(
+                                    children = [
+                                      RegexEmpty(),
+                                      RegexCharacterLiteral(
+                                        character_set = [102, 103]
+                                      )
+                                    ]
+                                  ),
+                                  RegexCharacterLiteral(
+                                    character_set = [111, 112]
+                                  )
+                                ]
+                              ),
+                              RegexCharacterLiteral(
+                                character_set = [114, 115]
+                              )
+                            ]
+                          ),
+                          RegexRepeat(
+                            children = [
+                              RegexCharacterLiteral(
+                                character_set = [101, 102]
+                              )
+                            ],
+                            count0 = 0,
+                            count1 = 1
+                          )
+                        ]
+                      )
+                    ],
+                    index = 3,
+                    name = 'For'
+                  )
+                ]
+              ),
+              RegexGroup(
+                children = [
+                  RegexSequence(
+                    children = [
+                      RegexCharacterLiteral(
+                        character_set = [65, 91, 95, 96, 97, 123]
+                      ),
+                      RegexRepeat(
+                        children = [
+                          RegexCharacterLiteral(
+                            character_set = [48, 58, 65, 91, 95, 96, 97, 123]
+                          )
+                        ]
+                      )
+                    ]
+                  )
+                ],
+                index = 4,
+                name = 'Identifier'
+              )
+            ]
+          )
+        ],
+        index = 0
+      )
+    ]
+  )
+  sys.stdout.write(
+    wrap_repr.wrap_repr(
+      '  _regex = {0:s}'.format(repr(_regex).replace('regex.', '')),
+      79
+    )
+  )
+
+  groups = []
+  _regex.add_to_groups(groups)
+  _nfa = nfa.NFA(groups)
+  _regex.add_to_nfa(_nfa)
+  sys.stdout.write(
+    wrap_repr.wrap_repr(
+      '  _nfa = {0:s}'.format(repr(_nfa).replace('nfa.', '')),
+      79
+    )
+  )
+
+  text = '     id   99id id99 for fore foree forex  '
+  i = 0
+  while i < len(text):
+    print('text "{0:s}"'.format(text[i:i + 72].replace('\n', '$')))
+    thread = _nfa.match_text(text, i)
+    if thread is None:
+      print('no match')
+      break
+    i = thread[0] # end position of overall match
+    group_start = [-1 for j in range(len(_nfa.groups))]
+    group_end = [-1 for j in range(len(_nfa.groups))]
+    while thread is not None:
+      pos, mark, thread = thread
+      group = mark >> 1
+      if (mark & 1) == 0:
+        group_start[group] = pos
+        print(
+          'group {0:d} name "{1:s}" text "{2:s}"'.format(
+             group,
+             _nfa.groups[group][0],
+             text[group_start[group]:group_end[group]].replace('\n', '$')
+          )
+        )
+      else:
+        group_end[group] = pos
+
+  _dfa = _nfa.to_dfa()
+  sys.stdout.write(
+    wrap_repr.wrap_repr(
+      '  _dfa = {0:s}'.format(repr(_dfa).replace('dfa.', '')),
+      79
+    )
+  )
+
+  text = '     id   99id id99 for fore foree forex  '
+  i = 0
+  while i < len(text):
+    print('text "{0:s}"'.format(text[i:i + 72].replace('\n', '$')))
+    thread = _dfa.match_text(text, i)
+    if thread is None:
+      print('no match')
+      break
+    i = thread[0] # end position of overall match
+    group_start = [-1 for j in range(len(_dfa.groups))]
+    group_end = [-1 for j in range(len(_dfa.groups))]
+    while thread is not None:
+      pos, mark, thread = thread
+      group = mark >> 1
+      if (mark & 1) == 0:
+        group_start[group] = pos
+        print(
+          'group {0:d} name "{1:s}" text "{2:s}"'.format(
+             group,
+             _dfa.groups[group][0],
+             text[group_start[group]:group_end[group]].replace('\n', '$')
+          )
+        )
+      else:
+        group_end[group] = pos
+
+# move this into grammar.py:
  #  grammar = Grammar(children = [Grammar.Production(children = [RegexSequence(
  #children = [RegexSequence(children = [RegexEmpty(), RegexCharacterRule(character_set
  #= [288, 295], rule_name = 'whitespace_opt')]), RegexCharacterRule(character_set = [
@@ -982,7 +1097,7 @@ def factory(tag, attrib = {}, *args, **kwargs):
  #'whitespace_opt')]), RegexCharacter(character_set = [32, 33])])], nonterminal = 36)
  #], n_terminals = 258)
  #  #sys.stdout.write(
-#  #  wrap_repr(
+#  #  wrap_repr.wrap_repr(
  #  #    '  grammar = {0:s}'.format(repr(grammar).replace('regex.', '')),
  #  #    79
  #  #  )
@@ -990,7 +1105,7 @@ def factory(tag, attrib = {}, *args, **kwargs):
  #
  #  lr1 = grammar.to_lr1()
  #  #sys.stdout.write(
-#  #  wrap_repr(
+#  #  wrap_repr.wrap_repr(
  #  #    '  lr1 = {0:s}'.format(repr(lr1).replace('regex.', '')),
  #  #    79
  #  #  )
@@ -1003,7 +1118,7 @@ def factory(tag, attrib = {}, *args, **kwargs):
  #
  #  clr1 = lr1.to_clr1()
  #  #sys.stdout.write(
-#  #  wrap_repr(
+#  #  wrap_repr.wrap_repr(
  #  #    '  clr1 = {0:s}'.format(repr(clr1).replace('regex.', '')),
  #  #    79
  #  #  )
@@ -1016,7 +1131,7 @@ def factory(tag, attrib = {}, *args, **kwargs):
  #
  #  lalr1 = lr1.to_lalr1()
  #  #sys.stdout.write(
-#  #  wrap_repr(
+#  #  wrap_repr.wrap_repr(
  #  #    '  lalr1 = {0:s}'.format(repr(lalr1).replace('regex.', '')),
  #  #    79
  #  #  )
author	Nick Downing <downing.nick@gmail.com>
	Tue, 24 Jul 2018 01:20:42 +0000 (11:20 +1000)
committer	Nick Downing <downing.nick@gmail.com>
	Tue, 24 Jul 2018 02:43:15 +0000 (12:43 +1000)
ast.py		patch | blob | history
bootstrap_plex.py		patch | blob | history
flex_dfa.py		patch | blob | history
regex.py		patch | blob | history