Change the semantic analysis pass to only index the rules applicable to each start...
authorNick Downing <downing.nick@gmail.com>
Sat, 21 Jul 2018 13:07:11 +0000 (23:07 +1000)
committerNick Downing <downing.nick@gmail.com>
Sat, 21 Jul 2018 13:07:26 +0000 (23:07 +1000)
ast.py

diff --git a/ast.py b/ast.py
index 110c9b5..9f7d918 100644 (file)
--- a/ast.py
+++ b/ast.py
@@ -42,7 +42,7 @@ class Item(element.Element):
 class PLex(element.Element):
   # internal classes
   class StartCondition(element.Element):
-    # GENERATE ELEMENT(str name, bool exclusive, int eof_action) BEGIN
+    # GENERATE ELEMENT(str name, bool exclusive, list(ref) rules, list(ref) bol_rules, int eof_action) BEGIN
     def __init__(
       self,
       tag = 'PLex_StartCondition',
@@ -51,6 +51,8 @@ class PLex(element.Element):
       children = [],
       name = '',
       exclusive = False,
+      rules = [],
+      bol_rules = [],
       eof_action = -1
     ):
       element.Element.__init__(
@@ -66,6 +68,8 @@ class PLex(element.Element):
       if isinstance(exclusive, str) else
         exclusive
       )
+      self.rules = rules
+      self.bol_rules = bol_rules
       self.eof_action = (
         element.deserialize_int(eof_action)
       if isinstance(eof_action, str) else
@@ -75,11 +79,27 @@ class PLex(element.Element):
       element.Element.serialize(self, ref_list)
       self.set('name', element.serialize_str(self.name))
       self.set('exclusive', element.serialize_bool(self.exclusive))
+      self.set(
+        'rules',
+        ' '.join([element.serialize_ref(i, ref_list) for i in self.rules])
+      )
+      self.set(
+        'bol_rules',
+        ' '.join([element.serialize_ref(i, ref_list) for i in self.bol_rules])
+      )
       self.set('eof_action', element.serialize_int(self.eof_action))
     def deserialize(self, ref_list):
       element.Element.deserialize(self, ref_list)
       self.name = element.deserialize_str(self.get('name', ''))
       self.exclusive = element.deserialize_bool(self.get('exclusive', 'false'))
+      self.rules = [
+        element.deserialize_ref(i, ref_list)
+        for i in self.get('rules', '').split()
+      ]
+      self.bol_rules = [
+        element.deserialize_ref(i, ref_list)
+        for i in self.get('bol_rules', '').split()
+      ]
       self.eof_action = element.deserialize_int(self.get('eof_action', '-1'))
     def copy(self, factory = None):
       result = element.Element.copy(
@@ -88,6 +108,8 @@ class PLex(element.Element):
       )
       result.name = self.name
       result.exclusive = self.exclusive
+      result.rules = self.rules
+      result.bol_rules = self.bol_rules
       result.eof_action = self.eof_action
       return result
     def repr_serialize(self, params):
@@ -100,6 +122,18 @@ class PLex(element.Element):
         params.append(
           'exclusive = {0:s}'.format(repr(self.exclusive))
         )
+      if len(self.rules):
+        params.append(
+          'rules = [{0:s}]'.format(
+            ', '.join([repr(i) for i in self.rules])
+          )
+        )
+      if len(self.bol_rules):
+        params.append(
+          'bol_rules = [{0:s}]'.format(
+            ', '.join([repr(i) for i in self.bol_rules])
+          )
+        )
       if self.eof_action != -1:
         params.append(
           'eof_action = {0:s}'.format(repr(self.eof_action))
@@ -611,10 +645,11 @@ class PLex(element.Element):
             inclusive_start_conditions.add(len(plex.start_conditions))
           plex.start_conditions.append(
             PLex.StartCondition(
-              children = [regex.RegexNone(), regex.RegexNone()], # [normal, BOL]
               name = name,
               exclusive = self.exclusive,
-              eof_action = 0
+              rules = [],
+              bol_rules = [],
+              eof_action = 0,
             )
           )
 
@@ -870,13 +905,14 @@ class PLex(element.Element):
           return 'ast.PLex.Section2.Rule.Action({0:s})'.format(', '.join(params))
         # GENERATE END
 
-      # GENERATE ELEMENT() BEGIN
+      # GENERATE ELEMENT(int action) BEGIN
       def __init__(
         self,
         tag = 'PLex_Section2_Rule',
         attrib = {},
         text = '',
-        children = []
+        children = [],
+        action = -1
       ):
         Item.__init__(
           self,
@@ -885,12 +921,30 @@ class PLex(element.Element):
           text,
           children
         )
+        self.action = (
+          element.deserialize_int(action)
+        if isinstance(action, str) else
+          action
+        )
+      def serialize(self, ref_list):
+        Item.serialize(self, ref_list)
+        self.set('action', element.serialize_int(self.action))
+      def deserialize(self, ref_list):
+        Item.deserialize(self, ref_list)
+        self.action = element.deserialize_int(self.get('action', '-1'))
       def copy(self, factory = None):
         result = Item.copy(
           self,
           Rule if factory is None else factory
         )
+        result.action = self.action
         return result
+      def repr_serialize(self, params):
+        Item.repr_serialize(self, params)
+        if self.action != -1:
+          params.append(
+            'action = {0:s}'.format(repr(self.action))
+          )
       def __repr__(self):
         params = []
         self.repr_serialize(params)
@@ -904,60 +958,39 @@ class PLex(element.Element):
         all_start_conditions,
         inclusive_start_conditions
       ):
-        start_conditions = self[0]
-        assert isinstance(start_conditions, PLex.Section2.StartConditions)
         default = False
-        if start_conditions.wildcard:
+        if self[0].wildcard:
           start_condition_set = all_start_conditions
-        elif len(start_conditions) == 0:
+        elif len(self[0]) == 0:
           default = True
           start_condition_set = inclusive_start_conditions
         else:
           start_condition_set = set()
-          for i in start_conditions:
+          for i in self[0]:
             start_condition_set.add(
               name_to_start_condition[i.get_text()]
             )
-        expr = self[1]
-        trailing_context = self[2]
-        assert isinstance(trailing_context, regex.Regex)
-        action = self[3]
-        assert isinstance(action, PLex.Section2.Rule.Action)
-        if isinstance(expr, PLex.Section2.Rule.EOFRule):
-          assert isinstance(trailing_context, regex.RegexNone)
+        if isinstance(self[1], PLex.Section2.Rule.EOFRule):
+          assert isinstance(self[2], regex.RegexNone)
           for i in start_condition_set:
             if default and plex.start_conditions[i].eof_action != 0:
               continue # rule applies to start conditions with no EOF rule yet
             assert plex.start_conditions[i].eof_action == 0
             plex.start_conditions[i].eof_action = len(plex.eof_actions_text)
-          plex.eof_actions_text.append(action[0])
+          plex.eof_actions_text.append(self[3][0])
         else:
-          if isinstance(expr, PLex.Section2.Rule.BOLRule):
-            bol_rule = True
-            expr = expr[0]
+          if isinstance(self[1], PLex.Section2.Rule.BOLRule):
+            for i in start_condition_set:
+              plex.start_conditions[i].bol_rules.append(self)
+            self[1][0].post_process()
           else:
-            bol_rule = False
-          assert isinstance(expr, regex.Regex)
-          expr = regex.RegexSequence(
-            children = [
-              expr,
-              regex.RegexGroup(
-                children = [
-                  trailing_context
-                ]
-              )
-            ]
-          )
-          expr.post_process(len(plex.actions_text))
-          for j in start_condition_set:
-            for k in range(int(bol_rule), 2):
-              plex.start_conditions[j][k] = regex.RegexOr(
-                children = [
-                  plex.start_conditions[j][k],
-                  expr
-                ]
-              )
-          plex.actions_text.append(action[0])
+            for i in start_condition_set:
+              plex.start_conditions[i].rules.append(self)
+              plex.start_conditions[i].bol_rules.append(self)
+            self[1].post_process()
+          self[2].post_process()
+          self.action = len(plex.actions_text)
+          plex.actions_text.append(self[3][0])
 
     # GENERATE ELEMENT() BEGIN
     def __init__(
@@ -1016,7 +1049,7 @@ class PLex(element.Element):
       return 'ast.PLex.Section3({0:s})'.format(', '.join(params))
     # GENERATE END
 
-  # GENERATE ELEMENT(list(ref) start_conditions, list(ref) actions_text, list(ref) eof_actions_text) BEGIN
+  # GENERATE ELEMENT(list(ref) start_conditions, list(ref) actions_text, list(ref) eof_actions_text, int default_action) BEGIN
   def __init__(
     self,
     tag = 'PLex',
@@ -1025,7 +1058,8 @@ class PLex(element.Element):
     children = [],
     start_conditions = [],
     actions_text = [],
-    eof_actions_text = []
+    eof_actions_text = [],
+    default_action = -1
   ):
     element.Element.__init__(
       self,
@@ -1037,6 +1071,11 @@ class PLex(element.Element):
     self.start_conditions = start_conditions
     self.actions_text = actions_text
     self.eof_actions_text = eof_actions_text
+    self.default_action = (
+      element.deserialize_int(default_action)
+    if isinstance(default_action, str) else
+      default_action
+    )
   def serialize(self, ref_list):
     element.Element.serialize(self, ref_list)
     self.set(
@@ -1051,6 +1090,7 @@ class PLex(element.Element):
       'eof_actions_text',
       ' '.join([element.serialize_ref(i, ref_list) for i in self.eof_actions_text])
     )
+    self.set('default_action', element.serialize_int(self.default_action))
   def deserialize(self, ref_list):
     element.Element.deserialize(self, ref_list)
     self.start_conditions = [
@@ -1065,6 +1105,7 @@ class PLex(element.Element):
       element.deserialize_ref(i, ref_list)
       for i in self.get('eof_actions_text', '').split()
     ]
+    self.default_action = element.deserialize_int(self.get('default_action', '-1'))
   def copy(self, factory = None):
     result = element.Element.copy(
       self,
@@ -1073,6 +1114,7 @@ class PLex(element.Element):
     result.start_conditions = self.start_conditions
     result.actions_text = self.actions_text
     result.eof_actions_text = self.eof_actions_text
+    result.default_action = self.default_action
     return result
   def repr_serialize(self, params):
     element.Element.repr_serialize(self, params)
@@ -1094,6 +1136,10 @@ class PLex(element.Element):
           ', '.join([repr(i) for i in self.eof_actions_text])
         )
       )
+    if self.default_action != -1:
+      params.append(
+        'default_action = {0:s}'.format(repr(self.default_action))
+      )
   def __repr__(self):
     params = []
     self.repr_serialize(params)
@@ -1104,10 +1150,11 @@ class PLex(element.Element):
     # variables that will be serialized
     self.start_conditions = [
       PLex.StartCondition(
-        children = [regex.RegexNone(), regex.RegexNone()], # [normal, BOL]
         name = 'INITIAL',
         exclusive = False,
-        eof_action = 0
+        rules = [],
+        bol_rules = [],
+        eof_action = 0,
       )
     ]
     self.actions_text = []
@@ -1133,11 +1180,36 @@ class PLex(element.Element):
       all_start_conditions,
       inclusive_start_conditions
     )
+    self.default_action = len(self.actions_text)
+    self.actions_text.append(PLex.Text(text = 'ECHO;\n'))
 
-    # add default rule to each expr, then make it greedy
-    for i in range(len(self.start_conditions)):
+  def to_nfa(self):
+    _nfa = nfa.NFA()
+    for i in self.start_conditions:
       for j in range(2):
-        self.start_conditions[i][j] = regex.RegexAnd(
+        expr = regex.RegexNone()
+        for k in [i.rules, i.bol_rules][j]:
+          expr = regex.RegexOr(
+            children = [
+              expr,
+              regex.RegexSequence(
+                children = [
+                  (
+                    k[1][0]
+                  if isinstance(k[1], PLex.Section2.Rule.BOLRule) else
+                    k[1]
+                  ),
+                  regex.RegexGroup(
+                    children = [
+                      k[2]
+                    ],
+                    group_index = k.action
+                  )
+                ]
+              )
+            ]
+          )
+        expr = regex.RegexAnd(
           children = [
             regex.RegexRepeat(
               count0 = 0,
@@ -1149,14 +1221,14 @@ class PLex(element.Element):
             ),
             regex.RegexOr(
               children = [
-                self.start_conditions[i][j],
+                expr,
                 regex.RegexSequence(
                   children = [
                     regex.RegexCharacter(
                       character_set = [0, 0x100]
                     ),
                     regex.RegexGroup(
-                      group_index = len(self.actions_text),
+                      group_index = self.default_action,
                       children = [
                         regex.RegexEmpty()
                       ]
@@ -1167,13 +1239,7 @@ class PLex(element.Element):
             )
           ]
         )
-    self.actions_text.append(PLex.Text(text = 'ECHO;\n'))
-
-  def to_nfa(self):
-    _nfa = nfa.NFA()
-    for i in self.start_conditions:
-      for j in i:
-        j.add_to_nfa(_nfa)
+        expr.add_to_nfa(_nfa)
     return _nfa
 
 # GENERATE FACTORY(regex.factory) BEGIN