Store nonterminal symbols in a separate array so that the position in this array...
author Nick Downing <downing.nick@gmail.com>
Wed, 4 Jul 2018 13:10:10 +0000 (23:10 +1000)
committer Nick Downing <downing.nick@gmail.com>
Thu, 5 Jul 2018 01:28:09 +0000 (11:28 +1000)
ast.py

diff --git a/ast.py b/ast.py
index 8ca2710..804098d 100644 (file)
--- a/ast.py
+++ b/ast.py
@@ -102,70 +102,6 @@ class PYACC(element.Element):
       return 'ast.PYACC.Symbol({0:s})'.format(', '.join(params))
     # GENERATE END
 
-  class TerminalSymbol(Symbol):
-    # GENERATE ELEMENT() BEGIN
-    def __init__(
-      self,
-      tag = 'PYACC_TerminalSymbol',
-      attrib = {},
-      text = '',
-      children = [],
-      name = '',
-      character_set = []
-    ):
-      PYACC.Symbol.__init__(
-        self,
-        tag,
-        attrib,
-        text,
-        children,
-        name,
-        character_set
-      )
-    def copy(self, factory = None):
-      result = PYACC.Symbol.copy(
-        self,
-        TerminalSymbol if factory is None else factory
-      )
-      return result
-    def __repr__(self):
-      params = []
-      self.repr_serialize(params)
-      return 'ast.PYACC.TerminalSymbol({0:s})'.format(', '.join(params))
-    # GENERATE END
-
-  class NonterminalSymbol(Symbol):
-    # GENERATE ELEMENT() BEGIN
-    def __init__(
-      self,
-      tag = 'PYACC_NonterminalSymbol',
-      attrib = {},
-      text = '',
-      children = [],
-      name = '',
-      character_set = []
-    ):
-      PYACC.Symbol.__init__(
-        self,
-        tag,
-        attrib,
-        text,
-        children,
-        name,
-        character_set
-      )
-    def copy(self, factory = None):
-      result = PYACC.Symbol.copy(
-        self,
-        NonterminalSymbol if factory is None else factory
-      )
-      return result
-    def __repr__(self):
-      params = []
-      self.repr_serialize(params)
-      return 'ast.PYACC.NonterminalSymbol({0:s})'.format(', '.join(params))
-    # GENERATE END
-
   # syntax classes
   class BracedCode(element.Element):
     # GENERATE ELEMENT() BEGIN
@@ -917,17 +853,14 @@ class PYACC(element.Element):
           assert isinstance(i, PYACC.ID)
           token_name = element.get_text(i, 0)
           assert token_name not in name_to_symbol
-          name_to_symbol[token_name] = len(pyacc.symbols)
-          pyacc.symbols.append(
-            PYACC.TerminalSymbol(
+          name_to_symbol[token_name] = len(pyacc.terminal_symbols)
+          character = 0x100 + len(pyacc.terminal_symbols)
+          pyacc.terminal_symbols.append(
+            PYACC.Symbol(
               name = token_name,
-              character_set = [
-                pyacc.grammar.n_terminals,
-                pyacc.grammar.n_terminals + 1
-              ]
+              character_set = [character, character + 1]
             )
           )
-          pyacc.grammar.n_terminals += 1
  
     class Type(Item):
       class Symbols(element.Element):
@@ -1816,12 +1749,13 @@ class PYACC(element.Element):
           for i in range(len(self)):
             if isinstance(self[i], PYACC.Section2.Rules.RHS.Symbol):
               if isinstance(self[i][0], PYACC.Char):
-                text = self[i][0].get_text()
+                character = ord(self[i][0].get_text())
+                pyacc.characters_used.add(character)
                 expr = regex.RegexSequence(
                   children = [
                     expr,
                     regex.RegexCharacter(
-                      character_set = [ord(text), ord(text) + 1]
+                      character_set = [character, character + 1]
                     )
                   ]
                 )
@@ -1845,11 +1779,12 @@ class PYACC(element.Element):
           else:
             pyacc.actions_text.append('')
 
-          character_set = pyacc.symbols[lhs_symbol].character_set
-          if len(character_set) and character_set[-1] == len(pyacc.grammar):
-            character_set[-1] = len(pyacc.grammar) + 1
+          character_set = pyacc.nonterminal_symbols[lhs_symbol].character_set
+          character = len(pyacc.grammar)
+          if len(character_set) and character_set[-1] == character:
+            character_set[-1] = character + 1
           else:
-            character_set.extend([len(pyacc.grammar), len(pyacc.grammar) + 1])
+            character_set.extend([character, character + 1])
           pyacc.grammar.append(regex.Grammar.Production(children = [expr]))
 
       # GENERATE ELEMENT() BEGIN
@@ -1887,16 +1822,14 @@ class PYACC(element.Element):
         assert isinstance(self[0], PYACC.ID)
         lhs_name = element.get_text(self[0], 0)
         if lhs_name in name_to_symbol:
-          lhs_symbol = name_to_symbol[lhs_name]
-          assert isinstance(
-            pyacc.symbols[lhs.symbol],
-            PYACC.NonterminalSymbol
-          )
+          i = name_to_symbol[lhs_name]
+          assert i < 0
+          lhs_symbol = ~i
         else:
-          lhs_symbol = len(pyacc.symbols)
-          name_to_symbol[lhs_name] = lhs_symbol
-          pyacc.symbols.append(
-            PYACC.NonterminalSymbol(name = lhs_name, character_set = [])
+          lhs_symbol = len(pyacc.nonterminal_symbols)
+          name_to_symbol[lhs_name] = ~lhs_symbol
+          pyacc.nonterminal_symbols.append(
+            PYACC.Symbol(name = lhs_name, character_set = [])
           )
         for i in self[1:]:
           i.post_process(
@@ -1960,7 +1893,7 @@ class PYACC(element.Element):
       return 'ast.PYACC.Section3({0:s})'.format(', '.join(params))
     # GENERATE END
 
-  # GENERATE ELEMENT(list(str) prologue_text, list(ref) symbols, ref grammar, list(str) actions_text) BEGIN
+  # GENERATE ELEMENT(list(str) prologue_text, set(int) characters_used, list(ref) terminal_symbols, list(ref) nonterminal_symbols, ref grammar, list(str) actions_text) BEGIN
   def __init__(
     self,
     tag = 'PYACC',
@@ -1968,7 +1901,9 @@ class PYACC(element.Element):
     text = '',
     children = [],
     prologue_text = [],
-    symbols = [],
+    characters_used = set(),
+    terminal_symbols = [],
+    nonterminal_symbols = [],
     grammar = None,
     actions_text = []
   ):
@@ -1984,7 +1919,13 @@ class PYACC(element.Element):
     if isinstance(prologue_text, str) else
       prologue_text
     )
-    self.symbols = symbols
+    self.characters_used = (
+      set([element.deserialize_int(i) for i in characters_used.split()])
+    if isinstance(characters_used, str) else
+      characters_used
+    )
+    self.terminal_symbols = terminal_symbols
+    self.nonterminal_symbols = nonterminal_symbols
     self.grammar = grammar
     self.actions_text = (
       [element.deserialize_str(i) for i in actions_text.split()]
@@ -1998,8 +1939,16 @@ class PYACC(element.Element):
       ' '.join([element.serialize_str(i) for i in self.prologue_text])
     )
     self.set(
-      'symbols',
-      ' '.join([element.serialize_ref(i, ref_list) for i in self.symbols])
+      'characters_used',
+      ' '.join([element.serialize_int(i) for i in sorted(self.characters_used)])
+    )
+    self.set(
+      'terminal_symbols',
+      ' '.join([element.serialize_ref(i, ref_list) for i in self.terminal_symbols])
+    )
+    self.set(
+      'nonterminal_symbols',
+      ' '.join([element.serialize_ref(i, ref_list) for i in self.nonterminal_symbols])
     )
     self.set('grammar', element.serialize_ref(self.grammar, ref_list))
     self.set(
@@ -2012,9 +1961,19 @@ class PYACC(element.Element):
       element.deserialize_str(i)
       for i in self.get('prologue_text', '').split()
     ]
-    self.symbols = [
+    self.characters_used = set(
+      [
+        element.deserialize_int(i)
+        for i in self.get('characters_used', '').split()
+      ]
+    )
+    self.terminal_symbols = [
       element.deserialize_ref(i, ref_list)
-      for i in self.get('symbols', '').split()
+      for i in self.get('terminal_symbols', '').split()
+    ]
+    self.nonterminal_symbols = [
+      element.deserialize_ref(i, ref_list)
+      for i in self.get('nonterminal_symbols', '').split()
     ]
     self.grammar = element.deserialize_ref(self.get('grammar', '-1'), ref_list)
     self.actions_text = [
@@ -2027,7 +1986,9 @@ class PYACC(element.Element):
       PYACC if factory is None else factory
     )
     result.prologue_text = self.prologue_text
-    result.symbols = self.symbols
+    result.characters_used = self.characters_used
+    result.terminal_symbols = self.terminal_symbols
+    result.nonterminal_symbols = self.nonterminal_symbols
     result.grammar = self.grammar
     result.actions_text = self.actions_text
     return result
@@ -2039,10 +2000,22 @@ class PYACC(element.Element):
           ', '.join([repr(i) for i in self.prologue_text])
         )
       )
-    if len(self.symbols):
+    if len(self.characters_used):
+      params.append(
+        'characters_used = set([{0:s}])'.format(
+          ', '.join([repr(i) for i in sorted(self.characters_used)])
+        )
+      )
+    if len(self.terminal_symbols):
+      params.append(
+        'terminal_symbols = [{0:s}]'.format(
+          ', '.join([repr(i) for i in self.terminal_symbols])
+        )
+      )
+    if len(self.nonterminal_symbols):
       params.append(
-        'symbols = [{0:s}]'.format(
-          ', '.join([repr(i) for i in self.symbols])
+        'nonterminal_symbols = [{0:s}]'.format(
+          ', '.join([repr(i) for i in self.nonterminal_symbols])
         )
       )
     if self.grammar != None:
@@ -2063,23 +2036,33 @@ class PYACC(element.Element):
   def post_process(self):
     # variables that will be serialized
     self.prologue_text = []
-    self.symbols = [
-      PYACC.NonterminalSymbol(name = 'error', character_set = [])
+    self.characters_used = set()
+    self.terminal_symbols = [
+      PYACC.Symbol(name = '$end', character_set = [0x100, 0x101]),
+      PYACC.Symbol(name = 'error', character_set = [0x101, 0x102]),
+      PYACC.Symbol(name = '$undefined', character_set = [0x102, 0x103])
     ]
+    self.nonterminal_symbols = []
     self.grammar = regex.Grammar(
       children = [
         regex.Grammar.Production(
           children = [
-            regex.RegexCharacterRule()
+            regex.RegexSequence(
+              children = [
+                regex.RegexCharacterRule(),
+                regex.RegexCharacterRule(rule_name = '$end')
+              ]
+            )
           ]
         )
-      ],
-      n_terminals = 0x102
+      ]
     )
     self.actions_text = []
 
     # variables that won't be serialized
-    name_to_symbol = {'error': 0}
+    # note: name_to_symbol values >= 0 index terminal_symbols; values < 0 are
+    # ~index into nonterminal_symbols ('$end', '$undefined' are not entered)
+    name_to_symbol = {'error': 1}
 
     # perform the semantic analysis pass
     for i in self:
@@ -2089,19 +2072,22 @@ class PYACC(element.Element):
       )
 
     # if start symbol not specified, use first nonterminal defined in file
-    if len(self.grammar[0][0].rule_name) == 0:
-      for i in self.symbols:
-        if isinstance(i, PYACC.NonterminalSymbol):
-          self.grammar[0][0].rule_name = i.name
-
-    # make nonterminal character_set offset by n_terminals which is now known
-    for i in self.symbols:
-      if isinstance(i, PYACC.NonterminalSymbol):
-        i.character_set = [j + self.grammar.n_terminals for j in i.character_set]
+    if len(self.grammar[0][0][0].rule_name) == 0:
+      self.grammar[0][0][0].rule_name = self.nonterminal_symbols[0].name
 
-    # look up all rule names and substitute appropriate character_set for each
+    # look up rule names and substitute appropriate character_set for each
+    self.grammar.n_terminals = 0x100 + len(self.terminal_symbols)
     self.grammar.post_process(
-      dict([(i.name, i.character_set) for i in self.symbols])
+      dict(
+        [
+          (i.name, i.character_set)
+          for i in self.terminal_symbols
+        ] +
+        [
+          (i.name, [self.grammar.n_terminals + j for j in i.character_set])
+          for i in self.nonterminal_symbols
+        ]
+      )
     )
 
 # GENERATE FACTORY(regex.factory) BEGIN
@@ -2109,8 +2095,6 @@ tag_to_class = {
   'Item': Item,
   'PYACC': PYACC,
   'PYACC_Symbol': PYACC.Symbol,
-  'PYACC_TerminalSymbol': PYACC.TerminalSymbol,
-  'PYACC_NonterminalSymbol': PYACC.NonterminalSymbol,
   'PYACC_BracedCode': PYACC.BracedCode,
   'PYACC_BracedPredicate': PYACC.BracedPredicate,
   'PYACC_Char': PYACC.Char,