Partially revert commit 8494a18, since the LR1DFA itself does not need the compressed...
authorNick Downing <downing.nick@gmail.com>
Wed, 18 Jul 2018 12:19:09 +0000 (22:19 +1000)
committerNick Downing <downing.nick@gmail.com>
Wed, 18 Jul 2018 12:19:09 +0000 (22:19 +1000)
ast.py
bison_lr1dfa.py

diff --git a/ast.py b/ast.py
index f404084..9902f23 100644 (file)
--- a/ast.py
+++ b/ast.py
@@ -41,7 +41,7 @@ class Item(element.Element):
 class PYACC(element.Element):
   # internal classes
   class Symbol(element.Element):
-    # GENERATE ELEMENT(str name, int token, list(int) character_set, int precedence, int associativity) BEGIN
+    # GENERATE ELEMENT(str name, list(int) character_set, int precedence, int associativity) BEGIN
     def __init__(
       self,
       tag = 'PYACC_Symbol',
@@ -49,7 +49,6 @@ class PYACC(element.Element):
       text = '',
       children = [],
       name = '',
-      token = -1,
       character_set = [],
       precedence = -1,
       associativity = -1
@@ -62,11 +61,6 @@ class PYACC(element.Element):
         children
       )
       self.name = name
-      self.token = (
-        element.deserialize_int(token)
-      if isinstance(token, str) else
-        token
-      )
       self.character_set = (
         [element.deserialize_int(i) for i in character_set.split()]
       if isinstance(character_set, str) else
@@ -85,7 +79,6 @@ class PYACC(element.Element):
     def serialize(self, ref_list, indent = 0):
       element.Element.serialize(self, ref_list, indent)
       self.set('name', element.serialize_str(self.name))
-      self.set('token', element.serialize_int(self.token))
       self.set(
         'character_set',
         ' '.join([element.serialize_int(i) for i in self.character_set])
@@ -95,7 +88,6 @@ class PYACC(element.Element):
     def deserialize(self, ref_list):
       element.Element.deserialize(self, ref_list)
       self.name = element.deserialize_str(self.get('name', ''))
-      self.token = element.deserialize_int(self.get('token', '-1'))
       self.character_set = [
         element.deserialize_int(i)
         for i in self.get('character_set', '').split()
@@ -108,7 +100,6 @@ class PYACC(element.Element):
         Symbol if factory is None else factory
       )
       result.name = self.name
-      result.token = self.token
       result.character_set = self.character_set
       result.precedence = self.precedence
       result.associativity = self.associativity
@@ -119,10 +110,6 @@ class PYACC(element.Element):
         params.append(
           'name = {0:s}'.format(repr(self.name))
         )
-      if self.token != -1:
-        params.append(
-          'token = {0:s}'.format(repr(self.token))
-        )
       if len(self.character_set):
         params.append(
           'character_set = [{0:s}]'.format(
@@ -586,10 +573,7 @@ class PYACC(element.Element):
             self.symbol = len(pyacc.terminal_symbols)
             character_to_symbol[character] = self.symbol
             pyacc.terminal_symbols.append(
-              PYACC.Symbol(
-                token = character,
-                character_set = [self.symbol, self.symbol + 1]
-              )
+              PYACC.Symbol(character_set = [character, character + 1])
             )
         elif isinstance(self[0], PYACC.ID):
           name = element.get_text(self[0], 0)
@@ -600,10 +584,7 @@ class PYACC(element.Element):
             self.symbol = len(pyacc.terminal_symbols)
             name_to_symbol[name] = self.symbol
             pyacc.terminal_symbols.append(
-              PYACC.Symbol(
-                name = name,
-                character_set = [self.symbol, self.symbol + 1]
-              )
+              PYACC.Symbol(name = name)
             )
         else:
           assert False
@@ -675,10 +656,7 @@ class PYACC(element.Element):
             self.symbol = len(pyacc.terminal_symbols)
             character_to_symbol[character] = self.symbol
             pyacc.terminal_symbols.append(
-              PYACC.Symbol(
-                token = character,
-                character_set = [self.symbol, self.symbol + 1]
-              )
+              PYACC.Symbol(character_set = [character, character + 1])
             )
         elif isinstance(self[0], PYACC.ID):
           name = element.get_text(self[0], 0)
@@ -688,10 +666,7 @@ class PYACC(element.Element):
             self.symbol = ~len(pyacc.nonterminal_symbols)
             name_to_symbol[name] = self.symbol
             pyacc.nonterminal_symbols.append(
-              PYACC.Symbol(
-                name = name,
-                character_set = []
-              )
+              PYACC.Symbol(name = name, character_set = [])
             )
         else:
           assert False
@@ -796,26 +771,28 @@ class PYACC(element.Element):
         character_set.extend([character, character + 1])
       pyacc.productions.append(self)
 
-    def to_lr1_production(self, pyacc):
+    def add_to_lr1(self, pyacc, _lr1):
       last_action = None
       symbols = []
       lookaheads = []
       for i in self:
         last_action = i.add_to_symbols(pyacc, last_action, symbols)
-      return (
-        # precedence
+      _lr1.productions.append(
         (
-          (pyacc.terminal_symbols[self.precedence_terminal].precedence << 2) |
-          (pyacc.terminal_symbols[self.precedence_terminal].associativity & 3)
-        if self.precedence_terminal != -1 else
-          -1
-        ),
-        # symbols (list of terminal_set, nonterminal_set)
-        symbols,
-        # lookaheads (list of initial_set, can_be_empty)
-        [([], False) for i in range(len(symbols))] + [([], True)],
-        # ref_data
-        last_action if last_action is not None else PYACC.BracedCode()
+          # precedence
+          (
+            (pyacc.terminal_symbols[self.precedence_terminal].precedence << 2) |
+            (pyacc.terminal_symbols[self.precedence_terminal].associativity & 3)
+          if self.precedence_terminal != -1 else
+            -1
+          ),
+          # symbols (list of terminal_set, nonterminal_set)
+          symbols,
+          # lookaheads (list of initial_set, can_be_empty)
+          [([], False) for i in range(len(symbols))] + [([], True)],
+          # ref_data
+          last_action if last_action is not None else PYACC.BracedCode()
+        )
       )
 
   class Section(element.Element):
@@ -1415,10 +1392,7 @@ class PYACC(element.Element):
               self.symbol = ~len(pyacc.nonterminal_symbols)
               name_to_symbol[name] = self.symbol
               pyacc.nonterminal_symbols.append(
-                PYACC.Symbol(
-                  name = name,
-                  character_set = []
-                )
+                PYACC.Symbol(name = name, character_set = [])
               )
           else:
             assert False
@@ -1434,10 +1408,7 @@ class PYACC(element.Element):
               self.symbol = len(pyacc.terminal_symbols)
               character_to_symbol[character] = self.symbol
               pyacc.terminal_symbols.append(
-                PYACC.Symbol(
-                  token = character,
-                  character_set = [self.symbol, self.symbol + 1]
-                )
+                PYACC.Symbol(character_set = [character, character + 1])
               )
           elif isinstance(self[0], PYACC.ID):
             name = element.get_text(self[0], 0)
@@ -1448,16 +1419,15 @@ class PYACC(element.Element):
               self.symbol = len(pyacc.terminal_symbols)
               name_to_symbol[name] = self.symbol
               pyacc.terminal_symbols.append(
-                PYACC.Symbol(
-                  name = name,
-                  character_set = [self.symbol, self.symbol + 1]
-                )
+                PYACC.Symbol(name = name)
               )
           else:
             assert False
           if self.token != -1:
-            assert pyacc.terminal_symbols[self.symbol].token == -1
-            pyacc.terminal_symbols[self.symbol].token = self.token
+            assert len(pyacc.terminal_symbols[self.symbol].character_set) == 0
+            pyacc.terminal_symbols[self.symbol].character_set = (
+              [self.token, self.token + 1]
+            )
           if precedence != -1:
             assert pyacc.terminal_symbols[self.symbol].precedence == -1
             pyacc.terminal_symbols[self.symbol].precedence = precedence
@@ -2556,9 +2526,9 @@ class PYACC(element.Element):
     self.prologue_text = []
     self.precedences = 0
     self.terminal_symbols = [
-      PYACC.Symbol(name = '$eof', token = 0, character_set = [0, 1]),
-      PYACC.Symbol(name = 'error', character_set = [1, 2]),
-      PYACC.Symbol(name = '$undefined', character_set = [2, 3])
+      PYACC.Symbol(name = '$eof', character_set = [0, 1]),
+      PYACC.Symbol(name = 'error'),
+      PYACC.Symbol(name = '$undefined')
     ]
     self.nonterminal_symbols = []
     self.start_nonterminal = -1
@@ -2581,8 +2551,8 @@ class PYACC(element.Element):
     # fill in token numbers that are not characters or overridden by user
     token = 0x100
     for i in self.terminal_symbols:
-      if i.token == -1:
-        i.token = token
+      if len(i.character_set) == 0:
+        i.character_set = [token, token + 1]
         token += 1
 
     # fill in start nonterminal if not overridden by user
@@ -2591,44 +2561,59 @@ class PYACC(element.Element):
       self.start_nonterminal = 0
 
   def to_lr1(self):
-    lr1_productions = [
-      (
-        # precedence
-        -1,
-        # symbols (list of terminal_set, nonterminal_set)
-        [([], self.nonterminal_symbols[self.start_nonterminal].character_set)],
-        # lookaheads (list of initial_set, can_be_empty)
-        [([], False), ([], True)],
-        # ref_data
-        PYACC.BracedCode()
-      )
-    ]
+    _lr1 = lr1.LR1(
+      [
+        (
+          # precedence
+          -1,
+          # symbols (list of terminal_set, nonterminal_set)
+          [
+            (
+              [],
+              self.nonterminal_symbols[self.start_nonterminal].character_set
+            )
+          ],
+          # lookaheads (list of initial_set, can_be_empty)
+          [([], False), ([], True)],
+          # ref_data
+          PYACC.BracedCode()
+        )
+      ],
+      max([0] + [i.character_set[1] for i in self.terminal_symbols]),
+      0
+    )
     for i in self.productions:
-      lr1_productions.append(i.to_lr1_production(self))
+      i.add_to_lr1(self, _lr1)
 
     # propagate lookaheads
     modified = True
     while modified:
       modified = False
-      for _, symbols, lookaheads, _ in lr1_productions:
+      for _, symbols, lookaheads, _ in _lr1.productions:
         for i in range(len(symbols) - 1, -1, -1):
           initial_set, nonterminal_set = symbols[i]
           can_be_empty = False
           for j in range(0, len(nonterminal_set), 2):
             for k in range(nonterminal_set[j], nonterminal_set[j + 1]):
-              child_initial_set, child_can_be_empty = lr1_productions[k][2][0]
-              initial_set = bisect_set.bisect_set_or(initial_set, child_initial_set)
+              child_initial_set, child_can_be_empty = _lr1.productions[k][2][0]
+              initial_set = bisect_set.bisect_set_or(
+                initial_set,
+                child_initial_set
+              )
               can_be_empty = can_be_empty or child_can_be_empty
           # at this point can_be_empty refers to current symbol only
           if can_be_empty:
             next_initial_set, can_be_empty = lookaheads[i + 1]
-            initial_set = bisect_set.bisect_set_or(initial_set, next_initial_set)
+            initial_set = bisect_set.bisect_set_or(
+              initial_set,
+              next_initial_set
+            )
           # at this point can_be_empty refers to all remaining symbols
           if (initial_set, can_be_empty) != lookaheads[i]:
             lookaheads[i] = (initial_set, can_be_empty)
             modified = True
 
-    return lr1.LR1(lr1_productions, len(self.terminal_symbols), 0)
+    return _lr1
 
 # GENERATE FACTORY(element.Element) BEGIN
 tag_to_class = {
index 2f537e1..253b81f 100644 (file)
@@ -60,7 +60,13 @@ class BisonLR1DFA:
         nonterminal0 = nonterminal1
       assert nonterminal0 == len(lr1dfa.productions)
 
-    # permute and combine rows based on the nonterminal translate vector
+    # permute and combine columns/rows on the basis of the translate vectors
+    new_action_table = numpy.zeros(
+      (len(lr1dfa.states), n_terminals),
+      numpy.int16
+    )
+    new_action_table[:, translate_terminals] = action_table
+    action_table = new_action_table 
     new_goto_table = numpy.zeros(
       (n_nonterminals, len(lr1dfa.states)),
       numpy.int16
@@ -257,36 +263,31 @@ def generate(pyacc, skel_file, out_file):
   # this undoes yacc/bison's rather wasteful mapping of 0x00..0xff to literal
   # characters, and also accommodates any token value overrides given by the
   # user, yielding a consecutive set of terminal numbers that are really used
-  # (matching pyacc.terminal_symbols[*].character_set, and hence the lr1dfa)
-  reverse_translate_terminals = numpy.array(
-    [i.token for i in pyacc.terminal_symbols],
-    numpy.int16
-  )
   translate_terminals = numpy.full(
-    (numpy.max(reverse_translate_terminals) + 1,),
+    (lr1dfa.n_terminals,),
     2, # '$undefined'
     numpy.int16
   )
-  translate_terminals[reverse_translate_terminals] = numpy.arange(
-    len(pyacc.terminal_symbols),
-    dtype = numpy.int16
-  )
+  for i in range(len(pyacc.terminal_symbols)):
+    for j in range(0, len(pyacc.terminal_symbols[i].character_set), 2):
+      translate_terminals[
+        pyacc.terminal_symbols[i].character_set[j]:
+        pyacc.terminal_symbols[i].character_set[j + 1]
+      ] = i
 
   # generate translate table for nonterminal symbols
   # this is effectively a map from productions back to nonterminal symbols
   # we do not generate an entry for the first production (start production)
-  nonterminal = 0
   translate_nonterminals = numpy.zeros(
     (len(lr1dfa.productions) - 1,),
     numpy.int16
   )
-  for i in pyacc.nonterminal_symbols:
-    for j in range(0, len(i.character_set), 2):
+  for i in range(len(pyacc.nonterminal_symbols)):
+    for j in range(0, len(pyacc.nonterminal_symbols[i].character_set), 2):
       translate_nonterminals[
-        i.character_set[j] - 1:
-        i.character_set[j + 1] - 1
-      ] = nonterminal
-    nonterminal += 1
+        pyacc.nonterminal_symbols[i].character_set[j] - 1:
+        pyacc.nonterminal_symbols[i].character_set[j + 1] - 1
+      ] = i
 
   # translate and compress the tables
   bison_lr1dfa = BisonLR1DFA(
@@ -316,7 +317,7 @@ def generate(pyacc, skel_file, out_file):
 '''.format(
               ','.join(
                 [
-                  '\n    {0:s} = {1:d}'.format(i.name, i.token)
+                  '\n    {0:s} = {1:d}'.format(i.name, i.character_set[0])
                   for i in pyacc.terminal_symbols[3:]
                   if len(i.name)
                 ]
@@ -330,7 +331,7 @@ def generate(pyacc, skel_file, out_file):
 '''.format(
               ''.join(
                 [
-                  '#define {0:s} {1:d}\n'.format(i.name, i.token)
+                  '#define {0:s} {1:d}\n'.format(i.name, i.character_set[0])
                   for i in pyacc.terminal_symbols[3:]
                   if len(i.name)
                 ]
@@ -361,9 +362,9 @@ def generate(pyacc, skel_file, out_file):
                 '"{0:s}"'.format(i.name)
               if len(i.name) else
                 '"\'{0:s}\'"'.format(
-                  chr(i.token)
-                if i.token >= 0x20 else
-                  '\\\\x{0:02x}'.format(i.token)
+                  chr(i.character_set[0])
+                if i.character_set[0] >= 0x20 else
+                  '\\\\x{0:02x}'.format(i.character_set[0])
                 )
               ) 
               for i in pyacc.terminal_symbols