Move BisonLR1DFA generation into LR1DFA.to_bison_lr1dfa() consistent with other gener...
authorNick Downing <downing.nick@gmail.com>
Wed, 8 Aug 2018 06:08:05 +0000 (16:08 +1000)
committerNick Downing <downing.nick@gmail.com>
Wed, 8 Aug 2018 06:08:19 +0000 (16:08 +1000)
ast.sh
bison_lr1dfa.py
bootstrap_pyacc.py
generate_ast.py [moved from generate.py with 100% similarity]
generate_bison.py [new file with mode: 0644]
lr1dfa.py

diff --git a/ast.sh b/ast.sh
index dd16e2d..2ba2388 100755 (executable)
--- a/ast.sh
+++ b/ast.sh
@@ -1,5 +1,5 @@
 #!/bin/sh
-if ./generate.py ast <ast.py >ast.py.new && ! diff -q ast.py ast.py.new
+if ./generate_ast.py ast <ast.py >ast.py.new && ! diff -q ast.py ast.py.new
 then
   mv ast.py.new ast.py
 else
index b655557..836c0b0 100644 (file)
-import ast
-import element
-import numpy
-import sys
-
 class BisonLR1DFA:
   def __init__(
     self,
-    lr1dfa,
-    n_terminals,
-    translate_terminals,
-    n_nonterminals,
-    translate_nonterminals
+    translate,
+    rules,
+    accessing_symbols,
+    default_action,
+    default_goto,
+    entry_base,
+    entries,
+    action_pointer,
+    goto_pointer,
+    n_terminals
   ):
-    #numpy.set_printoptions(threshold = numpy.nan)
-    #print(translate_terminals)
-    #print(translate_nonterminals)
-    # translate is yytranslate
-    self.translate = translate_terminals
-
-    # rules[:, 0] is yyr1
-    # rules[:, 1] is yyr2
-    # note: production 0 (start production) can't be reduced, null it out
-    self.rules = numpy.concatenate(
-      [
-        numpy.zeros((1, 2), numpy.int16),
-        numpy.stack(
-          [
-            translate_nonterminals + n_terminals,
-            numpy.array([i for i, _ in lr1dfa.productions[1:]], numpy.int16)
-          ],
-          1
-        )
-      ],
-      0
-    )
-
-    # unpack tables into numpy arrays so we can manipulate them efficiently
-    # note: the goto table is transposed with respect to the action table,
-    # so the row in the table corresponds to the yypact[]/yypgoto[] index,
-    # and the column in the table is what gets added to yypact[]/yypgoto[]
-    action_table = numpy.zeros(
-      (len(lr1dfa.states), lr1dfa.n_terminals),
-      numpy.int16
-    )
-    goto_table = numpy.zeros(
-      (len(lr1dfa.productions), len(lr1dfa.states)),
-      numpy.int16
-    )
-    for i in range(len(lr1dfa.states)):
-      terminal_breaks, actions, nonterminal_breaks, gotos = lr1dfa.states[i]
-
-      terminal0 = 0
-      for j in range(len(terminal_breaks)):
-        terminal1 = terminal_breaks[j]
-        action_table[i, terminal0:terminal1] = actions[j]
-        terminal0 = terminal1
-      assert terminal0 == lr1dfa.n_terminals
-
-      nonterminal0 = 0
-      for j in range(len(nonterminal_breaks)):
-        nonterminal1 = nonterminal_breaks[j]
-        goto_table[nonterminal0:nonterminal1, i] = gotos[j]
-        nonterminal0 = nonterminal1
-      assert nonterminal0 == len(lr1dfa.productions)
-    assert numpy.all(action_table != 0)
-    assert numpy.all(goto_table != 0)
-    #print(action_table)
-    #print(goto_table)
-    # permute and combine columns/rows on the basis of the translate vectors
-    new_action_table = numpy.zeros(
-      (len(lr1dfa.states), n_terminals),
-      numpy.int16
-    )
-    new_action_table[:, translate_terminals] = action_table
-    action_table = new_action_table
-    new_goto_table = numpy.zeros(
-      (n_nonterminals, len(lr1dfa.states)),
-      numpy.int16
-    )
-    new_goto_table[translate_nonterminals, :] = goto_table[1:] # ignore start
-    goto_table = new_goto_table
-
-    # manipulate the table entries as follows:
-    # - ensure there is no shift or goto 0 (cannot return to starting state)
-    # - replace reduce 0 (start production) with shift to a nonexistent state
-    # - replace action or goto -1 (unrecognized terminal/nonterminal) with 0
-    # - change the low-bit indication of shift/reduce to positive/negative
-    # we do it here after removing redundant columns, as it's more efficient
-    assert numpy.all(action_table != 0)
-    action_table[action_table == 1] = len(lr1dfa.states) << 1
-    action_table[action_table == -1] = 0
-    mask = (action_table & 1).astype(numpy.bool)
-    action_table >>= 1
-    action_table[mask] = -action_table[mask]
-    assert numpy.all(goto_table != 0)
-    goto_table[goto_table == -1] = 0
-
-    # find out which column the transition to each state occurs in, this
-    # must be unique and is called the "accessing symbol" for the state
-    accessing_table = numpy.any(
-      numpy.concatenate(
-        [action_table, goto_table.transpose()],
-        1
-      )[:, :, numpy.newaxis] ==
-      numpy.arange(
-        1,
-        len(lr1dfa.states),
-        dtype = numpy.int16
-      )[numpy.newaxis, numpy.newaxis, :],
-      0
-    )
-    self.accessing_symbols = numpy.full(
-      (len(lr1dfa.states),),
-      2, # '$undefined'
-      numpy.int16
-    )
-    for i in range(1, len(lr1dfa.states)):
-      accessing_symbol = numpy.nonzero(accessing_table[:, i - 1])[0]
-      if accessing_symbol.shape[0] == 0:
-        sys.stderr.write('warning: unreachable state {0:d}\n'.format(i))
-      elif accessing_symbol.shape[0] == 1:
-        self.accessing_symbols[i] = accessing_symbol[0]
-      else:
-        assert False
-
-    # default_action is yydefact, default_goto is yydefgoto
-    # find default reduce (most common negative value per action_table row)
-    # and find default goto (most common non-zero value per goto_table row)
-    # note: 0 is used as a last resort if there is no proper default value
-    self.default_action = numpy.argmax(
-      numpy.concatenate(
-        [
-          numpy.zeros((len(lr1dfa.states), 1), numpy.int16),
-          numpy.sum(
-            (
-              action_table[:, :, numpy.newaxis] ==
-              numpy.arange(
-                -1,
-                -len(lr1dfa.productions),
-                -1,
-                dtype = numpy.int16
-              )[numpy.newaxis, numpy.newaxis, :]
-            ).astype(numpy.int16),
-            1
-          )
-        ],
-        1
-      ),
-      1
-    )
-    #print(action_table)
-    #print(self.default_action)
-    self.default_goto = numpy.argmax(
-      numpy.concatenate(
-        [
-          numpy.zeros((n_nonterminals, 1), numpy.int16),
-          numpy.sum(
-            (
-              goto_table[:, :, numpy.newaxis] ==
-              numpy.arange(
-                1,
-                len(lr1dfa.states),
-                dtype = numpy.int16
-              )[numpy.newaxis, numpy.newaxis, :]
-            ).astype(numpy.int16),
-            1
-          )
-        ],
-        1
-      ),
-      1
-    )
-    #print(goto_table)
-    #print(self.default_goto)
-
-    # fill in the zero entries in the tables with default reduce or goto
-    for i in range(len(lr1dfa.states)):
-      action_table[i, action_table[i, :] == 0] = -self.default_action[i]
-    for i in range(n_nonterminals):
-      goto_table[i, goto_table[i, :] == 0] = self.default_goto[i]
-    #print(action_table)
-    #print(goto_table)
-
-    # entry_base indicates the most negative starting index we can expect
-    # we maintain n_entries such that entries_used[n_entries:, :] == False
-    # entries[i, 0] is yytable[i - entry_base]
-    # entries[i, 1] is yycheck[i - entry_base]
-    # entries_used[i, 0] == True if any vector has been stored starting at i
-    # entries_used[0, 0] does not go True as it is -infinity and can be reused
-    # entries_used[i, 1] == True if entry has been stored in entries[i, :]
-    self.entry_base = max(n_terminals, len(lr1dfa.states))
-    n_entries = self.entry_base
-    entries = numpy.full((self.entry_base, 2), 0x8000, numpy.int16)
-    entries_used = numpy.zeros((self.entry_base, 2), numpy.bool)
-    entries_used[:, 1] = True # padding so we don't use any negative entries
-
-    # action_pointer is yypact, goto_pointer is yypgoto
-    def pack_matrix(data, mask):
-      nonlocal n_entries, entries, entries_used # also uses self.entry_base
-
-      start_indices = numpy.zeros((data.shape[0],), numpy.int16)
-      for i in range(data.shape[0]):
-        indices = numpy.nonzero(mask[i, :])[0]
-        if indices.shape[0] == 0:
-          start_index = 0
-        else:
-          size = indices[-1] + 1
-
-          # ensure arrays are at least large enough to find a spot
-          while entries.shape[0] < n_entries + size:
-            # extend entries, copying only n_entries entries
-            new_entries = numpy.full(
-              (entries.shape[0] * 2, 2),
-              0x8000,
-              numpy.int16
-            )
-            new_entries[:n_entries, :] = entries[:n_entries, :]
-            entries = new_entries
-
-            # extend entries_used, copying only n_entries entries
-            new_entries_used = numpy.zeros(
-              (entries_used.shape[0] * 2, 2),
-              numpy.bool
-            )
-            new_entries_used[:n_entries, :] = entries_used[:n_entries, :]
-            entries_used = new_entries_used
-
-          # find a suitable spot and store differences from default
-          start_index = self.entry_base - indices[0]
-          while start_index < n_entries:
-            if (
-              not entries_used[start_index, 0] and
-              not numpy.any(entries_used[indices + start_index, 1])
-            ):
-              break
-            start_index += 1
-          entries[indices + start_index, 0] = data[i, indices]
-          entries[indices + start_index, 1] = indices
-          entries_used[start_index, 0] = True
-          entries_used[indices + start_index, 1] = True
-          if n_entries < start_index + size:
-            n_entries = start_index + size
-
-        start_indices[i] = start_index
-      return start_indices
-    self.action_pointer = pack_matrix(
-      action_table,
-      action_table != (-self.default_action)[:, numpy.newaxis]
-    ) - self.entry_base
-    self.goto_pointer = pack_matrix(
-      goto_table,
-      goto_table != self.default_goto[:, numpy.newaxis]
-    ) - self.entry_base
-    self.entries = numpy.zeros(
-      (n_entries - self.entry_base, 2),
-      numpy.int16
-    )
-    self.entries[:, :] = entries[self.entry_base:n_entries, :]
-
-    # n_states == self.action_pointer.shape[0]
-    # n_productions == self.rules.shape[0]
-    # n_nonterminals == self.goto_pointer.shape[0]
+    self.translate = translate
+    self.rules = rules
+    self.accessing_symbols = accessing_symbols
+    self.default_action = default_action
+    self.default_goto = default_goto
+    self.entry_base = entry_base
+    self.entries = entries
+    self.action_pointer = action_pointer
+    self.goto_pointer = goto_pointer
     self.n_terminals = n_terminals
 
-escapes = {
-  0x07: '\\a',
-  0x08: '\\b',
-  0x09: '\\t',
-  0x0a: '\\n',
-  0x0b: '\\v',
-  0x0c: '\\f',
-  0x0d: '\\r',
-  0x22: '\\"',
-  0x5c: '\\\\'
-}
-def generate(pyacc, skel_file, out_file, defines_file = None):
-  lr1dfa = pyacc.to_lr1().to_lalr1()
-
-  # generate translate table for terminal symbols
-  # this undoes yacc/bison's rather wasteful mapping of 0x00..0xff to literal
-  # characters, and also accommodates any token value overrides given by the
-  # user, yielding a consecutive set of terminal numbers that are really used
-  n_terminals = 0
-  translate_terminals = numpy.full(
-    (lr1dfa.n_terminals,),
-    2, # '$undefined'
-    numpy.int16
-  )
-  for i in pyacc.symbols:
-    if i._type == ast.PYACC.Symbol.TYPE_TERMINAL:
-      for j in range(0, len(i.character_set), 2):
-        translate_terminals[
-          i.character_set[j]:i.character_set[j + 1]
-        ] = n_terminals
-      n_terminals += 1
-
-  # generate translate table for nonterminal symbols
-  # this is effectively a map from productions back to nonterminal symbols
-  # we do not generate an entry for the first production (start production)
-  # we generate extra fake entries after end of the nonterminals for fake
-  # productions due to midrule actions (which leave gaps in the numbering)
-  n_nonterminals = 0
-  translate_nonterminals = numpy.full(
-    (len(lr1dfa.productions) - 1,),
-    -1,
-    numpy.int16
-  )
-  for i in pyacc.symbols:
-    if i._type == ast.PYACC.Symbol.TYPE_NONTERMINAL: 
-      for j in range(0, len(i.character_set), 2):
-        translate_nonterminals[
-          i.character_set[j] - 1:i.character_set[j + 1] - 1
-        ] = n_nonterminals
-      n_nonterminals += 1
-  midrule_actions = [translate_nonterminals == -1]
-  n_midrule_actions = numpy.sum(midrule_actions)
-  translate_nonterminals[midrule_actions] = numpy.arange(
-    n_nonterminals,
-    n_nonterminals + n_midrule_actions,
-    dtype = numpy.int16
-  )
-
-  # translate and compress the tables
-  bison_lr1dfa = BisonLR1DFA(
-    lr1dfa,
-    n_terminals,
-    translate_terminals,
-    n_nonterminals + n_midrule_actions,
-    translate_nonterminals
-  )
-
-  def generate(
-    skel_file,
-    out_file,
-    file_prefix,
-    type_prefix,
-    name_prefix,
-    is_header
-  ):
-    with open(skel_file, 'r') as fin:
-      with open(out_file, 'w+') as fout:
-        line = fin.readline()
-        while len(line):
-          if line == '/* GENERATE YYPURE */\n':
-            fout.write(
-              '''/* GENERATE YYPURE BEGIN */
-#define YYPURE {0:d}
-/* GENERATE YYPURE END */
-'''.format(
-                pyacc[0].api_pure
-              ).replace('YY', type_prefix if is_header else 'YY').replace('yy', name_prefix if is_header else 'yy') # hack
-            )
-          elif line == '/* GENERATE TYPEPREFIX */\n':
-            fout.write(
-              '''/* GENERATE TYPEPREFIX BEGIN */
-{0:s}/* GENERATE TYPEPREFIX END */
-'''.format(
-                  '''/* Substitute the type names.  */
-{0:s}'''.format(
-                    ''.join(
-                      [
-                        '#define YY{0:s} {1:s}{2:s}\n'.format(
-                          i,
-                          pyacc[0].type_prefix,
-                          i
-                        )
-                        for i in (
-                          ['STYPE'] +
-                          (['LTYPE'] if pyacc[0].locations else [])
-                        )
-                      ]
-                    )
-                  )
-                if pyacc[0].type_prefix != 'YY' else
-                  ''
-              )
-            )
-          elif line == '/* GENERATE NAMEPREFIX */\n':
-            fout.write(
-              '''/* GENERATE NAMEPREFIX BEGIN */
-{0:s}/* GENERATE NAMEPREFIX END */
-'''.format(
-                  '''/* Substitute the variable and function names.  */
-{0:s}'''.format(
-                    ''.join(
-                      [
-                        '#define yy{0:s} {1:s}{2:s}\n'.format(
-                          i,
-                          pyacc[0].name_prefix,
-                          i
-                        )
-                        for i in (
-                          # note: nerrs is actually pure but do what bison does
-                          ['parse', 'lex', 'error', 'debug', 'nerrs'] +
-                          (
-                            []
-                          if pyacc[0].api_pure else
-                            ['lval', 'char'] +
-                            (['lloc'] if pyacc[0].locations else [])
-                          )
-                        )
-                      ]
-                    )
-                  )
-                if pyacc[0].name_prefix != 'yy' else
-                  ''
-              )
-            )
-          elif line == '/* GENERATE SECTION1TOP */\n':
-            fout.write(
-              '''/* GENERATE SECTION1TOP BEGIN */
-{0:s}/* GENERATE SECTION1TOP END */
-'''.format(
-                ''.join(
-                  [
-                    '{0:s}\n'.format(i.get_text())
-                    for i in pyacc.top_text
-                  ]
-                )
-              )
-            )
-          elif line == '/* GENERATE SECTION1BEFOREUNION */\n':
-            fout.write(
-              '''/* GENERATE SECTION1BEFOREUNION BEGIN */
-{0:s}/* GENERATE SECTION1BEFOREUNION END */
-'''.format(
-                ''.join(
-                  [
-                    '{0:s}\n'.format(i.get_text())
-                    for i in pyacc.before_union_text
-                  ]
-                )
-              )
-            )
-          elif line == '/* GENERATE YYERROR_VERBOSE */\n':
-            fout.write(
-              '''/* GENERATE YYERROR_VERBOSE BEGIN */
-#ifdef YYERROR_VERBOSE
-# undef YYERROR_VERBOSE
-# define YYERROR_VERBOSE 1
-#else
-# define YYERROR_VERBOSE {0:d}
-#endif
-/* GENERATE YYERROR_VERBOSE END */
-'''.format(
-                int(pyacc[0].error_verbose)
-              )
-            )
-          elif line == '/* GENERATE SECTION1REQUIRES */\n':
-            fout.write(
-              '''/* GENERATE SECTION1REQUIRES BEGIN */
-{0:s}/* GENERATE SECTION1REQUIRES END */
-'''.format(
-                ''.join(
-                  [
-                    '{0:s}\n'.format(i.get_text())
-                    for i in pyacc.requires_text
-                  ]
-                )
-              )
-            )
-          elif line == '/* GENERATE YYDEBUG */\n':
-            fout.write(
-              '''/* GENERATE YYDEBUG BEGIN */
-#ifndef YYDEBUG
-# define YYDEBUG {0:d}
-#endif
-#if YYDEBUG
-extern int yydebug;
-#endif
-/* GENERATE YYDEBUG END */
-'''.format(
-                int(pyacc[0].debug)
-              ).replace('YY', type_prefix).replace('yy', name_prefix) # hack
-            )
-          elif line == '/* GENERATE TOKENSEQUAL */\n':
-            fout.write(
-              '''/* GENERATE TOKENSEQUAL BEGIN */{0:s}
-/* GENERATE TOKENSEQUAL END */
-'''.format(
-                ','.join(
-                  [
-                    '\n    {0:s} = {1:d}'.format(i.name, i.character_set[0])
-                    for i in pyacc.symbols[3:]
-                    if (
-                      i._type == ast.PYACC.Symbol.TYPE_TERMINAL and
-                      len(i.name)
-                    )
-                  ]
-                )
-              )
-            )
-          elif line == '/* GENERATE TOKENS */\n':
-            fout.write(
-              '''/* GENERATE TOKENS BEGIN */
-{0:s}/* GENERATE TOKENS END */
-'''.format(
-                ''.join(
-                  [
-                    '#define {0:s} {1:d}\n'.format(i.name, i.character_set[0])
-                    for i in pyacc.symbols[3:]
-                    if (
-                      i._type == ast.PYACC.Symbol.TYPE_TERMINAL and
-                      len(i.name)
-                    )
-                  ]
-                )
-              )
-            )
-          elif line == '/* GENERATE YYSTYPE */\n':
-            fout.write(
-              '''/* GENERATE YYSTYPE BEGIN */
-#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED
-{0:s}# define YYSTYPE_IS_TRIVIAL 1
-# define YYSTYPE_IS_DECLARED 1
-#endif
-/* GENERATE YYSTYPE END */
-'''.format(
-                  '''union YYSTYPE
-{{
-{0:s}}};
-typedef union YYSTYPE YYSTYPE;
-'''.format(
-                    ''.join(
-                      [
-                        '{0:s}\n'.format(i.get_text())
-                        for i in pyacc.union_text
-                      ]
-                    )
-                  )
-                if len(pyacc.union_text) else
-                  '''typedef int YYSTYPE;
-'''
-              ).replace('YY', type_prefix).replace('yy', name_prefix) # hack
-            )
-          elif line == '/* GENERATE YYLTYPE */\n':
-            fout.write(
-              '''/* GENERATE YYLTYPE BEGIN */
-{0:s}/* GENERATE YYLTYPE END */
-'''.format(
-                  '''#if ! defined YYLTYPE && ! defined YYLTYPE_IS_DECLARED
-typedef struct YYLTYPE YYLTYPE;
-struct YYLTYPE
-{
-  int first_line;
-  int first_column;
-  int last_line;
-  int last_column;
-};
-# define YYLTYPE_IS_DECLARED 1
-# define YYLTYPE_IS_TRIVIAL 1
-#endif
-'''
-                if pyacc[0].locations else
-                  ''
-              ).replace('YY', type_prefix).replace('yy', name_prefix) # hack
-            )
-          elif line == '/* GENERATE SECTION1AFTERUNION */\n':
-            fout.write(
-              '''/* GENERATE SECTION1AFTERUNION BEGIN */
-{0:s}/* GENERATE SECTION1AFTERUNION END */
-'''.format(
-                ''.join(
-                  [
-                    '{0:s}\n'.format(i.get_text())
-                    for i in pyacc.after_union_text
-                  ]
-                )
-              )
-            )
-          elif line == '/* GENERATE TABLES */\n':
-            # yyrline (source line where rule is defined) not implemented yet
-            yyrline = numpy.zeros(
-              (bison_lr1dfa.rules.shape[0],),
-              numpy.int16
-            )
-
-            # yytname (textual terminal/nonterminal name) wraps 70 columns
-            x = 70
-            yytname_lines = []
-            for i in (
-              [
-                (
-                  '"{0:s}"'.format(i.name)
-                if len(i.name) else
-                  '"\'{0:s}\'"'.format(
-                    escapes[i.character_set[0]]
-                  if i.character_set[0] in escapes else
-                    chr(i.character_set[0])
-                  if i.character_set[0] >= 0x20 else
-                    '\\\\x{0:02x}'.format(i.character_set[0])
-                  )
-                )
-                for i in pyacc.symbols
-                if i._type == ast.PYACC.Symbol.TYPE_TERMINAL
-              ] +
-              [
-                '"{0:s}"'.format(i.name)
-                for i in pyacc.symbols
-                if i._type == ast.PYACC.Symbol.TYPE_NONTERMINAL
-              ] +
-              ['"$@{0:d}"'.format(i) for i in range(n_midrule_actions)] +
-              ['YY_NULLPTR']
-            ):
-              if x + len(i) >= 70:
-                yytname_lines.append([])
-                x = 0
-              yytname_lines[-1].append(i)
-              x += len(i) + 2
-
-            # yytoknum is the approximate reverse of bison_lr1dfa.translate,
-            # do in reverse order so later duplicate entries get overwritten
-            yytoknum = numpy.zeros((bison_lr1dfa.n_terminals,), numpy.int16)
-            yytoknum[bison_lr1dfa.translate[::-1]] = numpy.arange(
-              bison_lr1dfa.translate.shape[0] - 1,
-              -1,
-              -1,
-              numpy.int16
-            )
-
-            fout.write(
-              '''/* GENERATE TABLES BEGIN */
-/* YYFINAL -- State number of the termination state.  */
-#define YYFINAL {0:d}
-/* YYLAST -- Last index in YYTABLE.  */
-#define YYLAST {1:d}
-
-/* YYNTOKENS -- Number of terminals.  */
-#define YYNTOKENS {2:d}
-/* YYNNTS -- Number of nonterminals.  */
-#define YYNNTS {3:d}
-/* YYNRULES -- Number of rules.  */
-#define YYNRULES {4:d}
-/* YYNSTATES -- Number of states.  */
-#define YYNSTATES {5:d}
-
-/* YYTRANSLATE[YYX] -- Symbol number corresponding to YYX as returned
-   by yylex, with out-of-bounds checking.  */
-#define YYUNDEFTOK 2
-#define YYMAXUTOK {6:d}
-
-#define YYTRANSLATE(YYX)                                                \\
-  ((unsigned int) (YYX) <= YYMAXUTOK ? yytranslate[YYX] : YYUNDEFTOK)
-
-/* YYTRANSLATE[TOKEN-NUM] -- Symbol number corresponding to TOKEN-NUM
-   as returned by yylex, without out-of-bounds checking.  */
-static const yytype_int16 yytranslate[] =
-{{{7:s}
-}};
-
-#if YYDEBUG
-  /* YYRLINE[YYN] -- Source line where rule number YYN was defined.  */
-static const yytype_int16 yyrline[] =
-{{{8:s}
-}};
-#endif
-
-#if YYDEBUG || YYERROR_VERBOSE || {9:d}
-/* YYTNAME[SYMBOL-NUM] -- String name of the symbol SYMBOL-NUM.
-   First, the terminals, then, starting at YYNTOKENS, nonterminals.  */
-static const char *const yytname[] =
-{{{10:s}
-}};
-#endif
-
-# ifdef YYPRINT
-/* YYTOKNUM[NUM] -- (External) token number corresponding to the
-   (internal) symbol number NUM (which must be that of a token).  */
-static const yytype_int16 yytoknum[] =
-{{{11:s}
-}};
-# endif
-
-#define YYPACT_NINF {12:d}
-
-#define yypact_value_is_default(Yystate) \\
-  (!!((Yystate) == ({13:d})))
-
-#define YYTABLE_NINF -1
-
-#define yytable_value_is_error(Yytable_value) \\
-  0
-
-  /* YYPACT[STATE-NUM] -- Index in YYTABLE of the portion describing
-     STATE-NUM.  */
-static const yytype_int16 yypact[] =
-{{{14:s}
-}};
-
-  /* YYDEFACT[STATE-NUM] -- Default reduction number in state STATE-NUM.
-     Performed when YYTABLE does not specify something else to do.  Zero
-     means the default is an error.  */
-static const yytype_int16 yydefact[] =
-{{{15:s}
-}};
-
-  /* YYPGOTO[NTERM-NUM].  */
-static const yytype_int16 yypgoto[] =
-{{{16:s}
-}};
-
-  /* YYDEFGOTO[NTERM-NUM].  */
-static const yytype_int16 yydefgoto[] =
-{{{17:s}
-}};
-
-  /* YYTABLE[YYPACT[STATE-NUM]] -- What to do in state STATE-NUM.  If
-     positive, shift that token.  If negative, reduce the rule whose
-     number is the opposite.  If YYTABLE_NINF, syntax error.  */
-static const yytype_int16 yytable[] =
-{{{18:s}
-}};
-
-static const yytype_int16 yycheck[] =
-{{{19:s}
-}};
-
-  /* YYSTOS[STATE-NUM] -- The (internal number of the) accessing
-     symbol of state STATE-NUM.  */
-static const yytype_int16 yystos[] =
-{{{20:s}
-}};
-
-  /* YYR1[YYN] -- Symbol number of symbol that rule YYN derives.  */
-static const yytype_int16 yyr1[] =
-{{{21:s}
-}};
-
-  /* YYR2[YYN] -- Number of symbols on the right hand side of rule YYN.  */
-static const yytype_int16 yyr2[] =
-{{{22:s}
-}};
-/* GENERATE TABLES END */
-'''.format(
-                # YYFINAL
-                bison_lr1dfa.action_pointer.shape[0],
-                # YYLAST
-                bison_lr1dfa.entries.shape[0] - 1,
-                # YYNTOKENS
-                bison_lr1dfa.n_terminals,
-                # YYNNTS
-                bison_lr1dfa.goto_pointer.shape[0],
-                # YYNRULES
-                bison_lr1dfa.rules.shape[0],
-                # YYNSTATES
-                bison_lr1dfa.action_pointer.shape[0],
-                # YYMAXUTOK
-                bison_lr1dfa.translate.shape[0] - 1,
-                # yytranslate
-                ','.join(
-                  [
-                    '\n  {0:s}'.format(
-                      ','.join(
-                        [
-                          '{0:6d}'.format(j)
-                          for j in bison_lr1dfa.translate[i:i + 10]
-                        ]
-                      )
-                    )
-                    for i in range(0, bison_lr1dfa.translate.shape[0], 10)
-                  ]
-                ),
-                # yyrline
-                ','.join(
-                  [
-                    '\n{0:s}'.format(
-                      ','.join(
-                        [
-                          '{0:6d}'.format(j)
-                          for j in yyrline[i:i + 10]
-                        ]
-                      )
-                    )
-                    for i in range(0, yyrline.shape[0], 10)
-                  ]
-                ),
-                # YYERROR_VERBOSE (strangely the defined value is repeated)
-                int(pyacc[0].error_verbose),
-                # yytname
-                ','.join(
-                  ['\n  {0:s}'.format(', '.join(i)) for i in yytname_lines]
-                ),
-                # yytoknum
-                ','.join(
-                  [
-                    '\n  {0:s}'.format(
-                      ','.join(
-                        [
-                          '{0:6d}'.format(j)
-                          for j in yytoknum[i:i + 10]
-                        ]
-                      )
-                    )
-                    for i in range(0, yytoknum.shape[0], 10)
-                  ]
-                ),
-                # YYPACT_NINF
-                -bison_lr1dfa.entry_base,
-                # yypact_value_is_default
-                -bison_lr1dfa.entry_base,
-                # yypact
-                ','.join(
-                  [
-                    '\n  {0:s}'.format(
-                      ','.join(
-                        [
-                          '{0:6d}'.format(j)
-                          for j in bison_lr1dfa.action_pointer[i:i + 10]
-                        ]
-                      )
-                    )
-                    for i in range(0, bison_lr1dfa.action_pointer.shape[0], 10)
-                  ]
-                ),
-                # yydefact
-                ','.join(
-                  [
-                    '\n  {0:s}'.format(
-                      ','.join(
-                        [
-                          '{0:6d}'.format(j)
-                          for j in bison_lr1dfa.default_action[i:i + 10]
-                        ]
-                      )
-                    )
-                    for i in range(0, bison_lr1dfa.default_action.shape[0], 10)
-                  ]
-                ),
-                # yypgoto
-                ','.join(
-                  [
-                    '\n  {0:s}'.format(
-                      ','.join(
-                        [
-                          '{0:6d}'.format(j)
-                          for j in bison_lr1dfa.goto_pointer[i:i + 10]
-                        ]
-                      )
-                    )
-                    for i in range(0, bison_lr1dfa.goto_pointer.shape[0], 10)
-                  ]
-                ),
-                # yydefgoto
-                ','.join(
-                  [
-                    '\n  {0:s}'.format(
-                      ','.join(
-                        [
-                          '{0:6d}'.format(j)
-                          for j in bison_lr1dfa.default_goto[i:i + 10]
-                        ]
-                      )
-                    )
-                    for i in range(0, bison_lr1dfa.default_goto.shape[0], 10)
-                  ]
-                ),
-                # yytable
-                ','.join(
-                  [
-                    '\n  {0:s}'.format(
-                      ','.join(
-                        [
-                          '{0:6d}'.format(j)
-                          for j in bison_lr1dfa.entries[i:i + 10, 0]
-                        ]
-                      )
-                    )
-                    for i in range(0, bison_lr1dfa.entries.shape[0], 10)
-                  ]
-                ),
-                # yycheck
-                ','.join(
-                  [
-                    '\n  {0:s}'.format(
-                      ','.join(
-                        [
-                          '{0:6d}'.format(j)
-                          for j in bison_lr1dfa.entries[i:i + 10, 1]
-                        ]
-                      )
-                    )
-                    for i in range(0, bison_lr1dfa.entries.shape[0], 10)
-                  ]
-                ),
-                # yystos
-                ','.join(
-                  [
-                    '\n  {0:s}'.format(
-                      ','.join(
-                        [
-                          '{0:6d}'.format(j)
-                          for j in bison_lr1dfa.accessing_symbols[i:i + 10]
-                        ]
-                      )
-                    )
-                    for i in range(0, bison_lr1dfa.accessing_symbols.shape[0], 10)
-                  ]
-                ),
-                # yyr1
-                ','.join(
-                  [
-                    '\n  {0:s}'.format(
-                      ','.join(
-                        [
-                          '{0:6d}'.format(j)
-                          for j in bison_lr1dfa.rules[i:i + 10, 0]
-                        ]
-                      )
-                    )
-                    for i in range(0, bison_lr1dfa.rules.shape[0], 10)
-                  ]
-                ),
-                # yyr2
-                ','.join(
-                  [
-                    '\n  {0:s}'.format(
-                      ','.join(
-                        [
-                          '{0:6d}'.format(j)
-                          for j in bison_lr1dfa.rules[i:i + 10, 1]
-                        ]
-                      )
-                    )
-                    for i in range(0, bison_lr1dfa.rules.shape[0], 10)
-                  ]
-                )
-              ).replace('YYDEBUG', type_prefix + 'DEBUG') # hack
-            )
-          elif line == '/* GENERATE INITIALACTION */\n':
-            fout.write(
-              '''/* GENERATE INITIALACTION BEGIN */
-{0:s}/* GENERATE INITIALACTION END */
-'''.format(
-                ''.join(
-                  [
-                    '{0:s}\n'.format(i.get_text())
-                    for i in pyacc.initial_action_text
-                  ]
-                ).replace('(yyval)', '(yylval').replace('(yyloc)', '(yylloc)') # hack
-              )
-            )
-          elif line == '/* GENERATE SECTION2 */\n':
-            fout.write(
-              '''/* GENERATE SECTION2 BEGIN */
-{0:s}/* GENERATE SECTION2 END */
-'''.format(
-                '\n'.join(
-                  [
-                    '''  case {0:d}:
-    {1:s}
-    break;
-'''.format(
-                      i,
-                      lr1dfa.productions[i][1].get_text()
-                    )
-                    for i in range(len(lr1dfa.productions))
-                    if lr1dfa.productions[i][1] is not None
-                  ]
-                )
-              )
-            )
-          elif line == '/* GENERATE SECTION3 */\n':
-            fout.write(
-              '''/* GENERATE SECTION3 BEGIN */
-{0:s}/*GENERATE SECTION3 END */
-'''.format(
-                '' if len(pyacc) < 3 else pyacc[2].get_text()
-              )
-            )
-          else:
-            if file_prefix != 'YY_YY_Y_':
-              line = line.replace('YY_YY_Y_', file_prefix)
-            if is_header:
-              if type_prefix != 'YY':
-                line = line.replace('YY_YY_Y_', 'YY').replace('YY', type_prefix)
-              if name_prefix != 'yy':
-                line = line.replace('yy', name_prefix)
-            elif type_prefix != 'YY':
-              line = line.replace('YYDEBUG', type_prefix + 'DEBUG')
-              line = line.replace('YYSTYPE_IS_', type_prefix + 'STYPE_IS_')
-              line = line.replace('YYLTYPE_IS_', type_prefix + 'LTYPE_IS_')
-            fout.write(line)
-          line = fin.readline()
-  generate(
-    skel_file,
-    out_file,
-    pyacc[0].type_prefix,
-    pyacc[0].type_prefix,
-    pyacc[0].name_prefix,
-    False
-  )
-  if defines_file is not None:
-    generate(
-      '{0:s}.h'.format(
-        skel_file[:-2] if skel_file[-2:] == '.c' else skel_file
-      ),
-      defines_file,
-      pyacc[0].type_prefix,
-      pyacc[0].type_prefix,
-      pyacc[0].name_prefix,
-      True
-    )
+  # add some test routines here for parsing text, etc, similar to LR1DFA()
+  # add repr() routine, to use with wrap_repr() for basic serialization
index f0a7223..4cefee0 100755 (executable)
@@ -2,7 +2,7 @@
 
 import ast
 import element
-import bison_lr1dfa
+import generate_bison
 import getopt
 import os
 import sys
@@ -46,6 +46,6 @@ with open(in_file) as fin:
 #element.serialize(pyacc, 'a.xml', 'utf-8')
 #pyacc = element.deserialize('a.xml', ast.factory, 'utf-8')
 pyacc.post_process()
-element.serialize(pyacc, 'b.xml', 'utf-8')
-pyacc = element.deserialize('b.xml', ast.factory, 'utf-8')
-bison_lr1dfa.generate(pyacc, skel_file, out_file, defines_file)
+#element.serialize(pyacc, 'b.xml', 'utf-8')
+#pyacc = element.deserialize('b.xml', ast.factory, 'utf-8')
+generate_bison.generate_bison(pyacc, skel_file, out_file, defines_file)
similarity index 100%
rename from generate.py
rename to generate_ast.py
diff --git a/generate_bison.py b/generate_bison.py
new file mode 100644 (file)
index 0000000..f159c74
--- /dev/null
@@ -0,0 +1,741 @@
+import ast
+import numpy
+
+escapes = {
+  0x07: '\\a',
+  0x08: '\\b',
+  0x09: '\\t',
+  0x0a: '\\n',
+  0x0b: '\\v',
+  0x0c: '\\f',
+  0x0d: '\\r',
+  0x22: '\\"',
+  0x5c: '\\\\'
+}
+
+def generate_bison(pyacc, skel_file, out_file, defines_file = None):
+  _lr1dfa = pyacc.to_lr1().to_lalr1()
+
+  # generate translate table for terminal symbols
+  # this undoes yacc/bison's rather wasteful mapping of 0x00..0xff to literal
+  # characters, and also accommodates any token value overrides given by the
+  # user, yielding a consecutive set of terminal numbers that are really used
+  n_terminals = 0
+  translate_terminals = numpy.full(
+    (_lr1dfa.n_terminals,),
+    2, # '$undefined'
+    numpy.int16
+  )
+  for i in pyacc.symbols:
+    if i._type == ast.PYACC.Symbol.TYPE_TERMINAL:
+      for j in range(0, len(i.character_set), 2):
+        translate_terminals[
+          i.character_set[j]:i.character_set[j + 1]
+        ] = n_terminals
+      n_terminals += 1
+
+  # generate translate table for nonterminal symbols
+  # this is effectively a map from productions back to nonterminal symbols
+  # we do not generate an entry for the first production (start production)
+  # we generate extra fake entries after end of the nonterminals for fake
+  # productions due to midrule actions (which leave gaps in the numbering)
+  n_nonterminals = 0
+  translate_nonterminals = numpy.full(
+    (len(_lr1dfa.productions) - 1,),
+    -1,
+    numpy.int16
+  )
+  for i in pyacc.symbols:
+    if i._type == ast.PYACC.Symbol.TYPE_NONTERMINAL: 
+      for j in range(0, len(i.character_set), 2):
+        translate_nonterminals[
+          i.character_set[j] - 1:i.character_set[j + 1] - 1
+        ] = n_nonterminals
+      n_nonterminals += 1
+  midrule_actions = [translate_nonterminals == -1]
+  n_midrule_actions = numpy.sum(midrule_actions)
+  translate_nonterminals[midrule_actions] = numpy.arange(
+    n_nonterminals,
+    n_nonterminals + n_midrule_actions,
+    dtype = numpy.int16
+  )
+
+  # translate and compress the tables
+  _bison_lr1dfa = _lr1dfa.to_bison_lr1dfa(
+    n_terminals,
+    translate_terminals,
+    n_nonterminals + n_midrule_actions,
+    translate_nonterminals
+  )
+
+  def generate(
+    skel_file,
+    out_file,
+    file_prefix,
+    type_prefix,
+    name_prefix,
+    is_header
+  ):
+    with open(skel_file, 'r') as fin:
+      with open(out_file, 'w+') as fout:
+        line = fin.readline()
+        while len(line):
+          if line == '/* GENERATE YYPURE */\n':
+            fout.write(
+              '''/* GENERATE YYPURE BEGIN */
+#define YYPURE {0:d}
+/* GENERATE YYPURE END */
+'''.format(
+                pyacc[0].api_pure
+              ).replace('YY', type_prefix if is_header else 'YY').replace('yy', name_prefix if is_header else 'yy') # hack
+            )
+          elif line == '/* GENERATE TYPEPREFIX */\n':
+            fout.write(
+              '''/* GENERATE TYPEPREFIX BEGIN */
+{0:s}/* GENERATE TYPEPREFIX END */
+'''.format(
+                  '''/* Substitute the type names.  */
+{0:s}'''.format(
+                    ''.join(
+                      [
+                        '#define YY{0:s} {1:s}{2:s}\n'.format(
+                          i,
+                          pyacc[0].type_prefix,
+                          i
+                        )
+                        for i in (
+                          ['STYPE'] +
+                          (['LTYPE'] if pyacc[0].locations else [])
+                        )
+                      ]
+                    )
+                  )
+                if pyacc[0].type_prefix != 'YY' else
+                  ''
+              )
+            )
+          elif line == '/* GENERATE NAMEPREFIX */\n':
+            fout.write(
+              '''/* GENERATE NAMEPREFIX BEGIN */
+{0:s}/* GENERATE NAMEPREFIX END */
+'''.format(
+                  '''/* Substitute the variable and function names.  */
+{0:s}'''.format(
+                    ''.join(
+                      [
+                        '#define yy{0:s} {1:s}{2:s}\n'.format(
+                          i,
+                          pyacc[0].name_prefix,
+                          i
+                        )
+                        for i in (
+                          # note: nerrs is actually pure but do what bison does
+                          ['parse', 'lex', 'error', 'debug', 'nerrs'] +
+                          (
+                            []
+                          if pyacc[0].api_pure else
+                            ['lval', 'char'] +
+                            (['lloc'] if pyacc[0].locations else [])
+                          )
+                        )
+                      ]
+                    )
+                  )
+                if pyacc[0].name_prefix != 'yy' else
+                  ''
+              )
+            )
+          elif line == '/* GENERATE SECTION1TOP */\n':
+            fout.write(
+              '''/* GENERATE SECTION1TOP BEGIN */
+{0:s}/* GENERATE SECTION1TOP END */
+'''.format(
+                ''.join(
+                  [
+                    '{0:s}\n'.format(i.get_text())
+                    for i in pyacc.top_text
+                  ]
+                )
+              )
+            )
+          elif line == '/* GENERATE SECTION1BEFOREUNION */\n':
+            fout.write(
+              '''/* GENERATE SECTION1BEFOREUNION BEGIN */
+{0:s}/* GENERATE SECTION1BEFOREUNION END */
+'''.format(
+                ''.join(
+                  [
+                    '{0:s}\n'.format(i.get_text())
+                    for i in pyacc.before_union_text
+                  ]
+                )
+              )
+            )
+          elif line == '/* GENERATE YYERROR_VERBOSE */\n':
+            fout.write(
+              '''/* GENERATE YYERROR_VERBOSE BEGIN */
+#ifdef YYERROR_VERBOSE
+# undef YYERROR_VERBOSE
+# define YYERROR_VERBOSE 1
+#else
+# define YYERROR_VERBOSE {0:d}
+#endif
+/* GENERATE YYERROR_VERBOSE END */
+'''.format(
+                int(pyacc[0].error_verbose)
+              )
+            )
+          elif line == '/* GENERATE SECTION1REQUIRES */\n':
+            fout.write(
+              '''/* GENERATE SECTION1REQUIRES BEGIN */
+{0:s}/* GENERATE SECTION1REQUIRES END */
+'''.format(
+                ''.join(
+                  [
+                    '{0:s}\n'.format(i.get_text())
+                    for i in pyacc.requires_text
+                  ]
+                )
+              )
+            )
+          elif line == '/* GENERATE YYDEBUG */\n':
+            fout.write(
+              '''/* GENERATE YYDEBUG BEGIN */
+#ifndef YYDEBUG
+# define YYDEBUG {0:d}
+#endif
+#if YYDEBUG
+extern int yydebug;
+#endif
+/* GENERATE YYDEBUG END */
+'''.format(
+                int(pyacc[0].debug)
+              ).replace('YY', type_prefix).replace('yy', name_prefix) # hack
+            )
+          elif line == '/* GENERATE TOKENSEQUAL */\n':
+            fout.write(
+              '''/* GENERATE TOKENSEQUAL BEGIN */{0:s}
+/* GENERATE TOKENSEQUAL END */
+'''.format(
+                ','.join(
+                  [
+                    '\n    {0:s} = {1:d}'.format(i.name, i.character_set[0])
+                    for i in pyacc.symbols[3:]
+                    if (
+                      i._type == ast.PYACC.Symbol.TYPE_TERMINAL and
+                      len(i.name)
+                    )
+                  ]
+                )
+              )
+            )
+          elif line == '/* GENERATE TOKENS */\n':
+            fout.write(
+              '''/* GENERATE TOKENS BEGIN */
+{0:s}/* GENERATE TOKENS END */
+'''.format(
+                ''.join(
+                  [
+                    '#define {0:s} {1:d}\n'.format(i.name, i.character_set[0])
+                    for i in pyacc.symbols[3:]
+                    if (
+                      i._type == ast.PYACC.Symbol.TYPE_TERMINAL and
+                      len(i.name)
+                    )
+                  ]
+                )
+              )
+            )
+          elif line == '/* GENERATE YYSTYPE */\n':
+            fout.write(
+              '''/* GENERATE YYSTYPE BEGIN */
+#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED
+{0:s}# define YYSTYPE_IS_TRIVIAL 1
+# define YYSTYPE_IS_DECLARED 1
+#endif
+/* GENERATE YYSTYPE END */
+'''.format(
+                  '''union YYSTYPE
+{{
+{0:s}}};
+typedef union YYSTYPE YYSTYPE;
+'''.format(
+                    ''.join(
+                      [
+                        '{0:s}\n'.format(i.get_text())
+                        for i in pyacc.union_text
+                      ]
+                    )
+                  )
+                if len(pyacc.union_text) else
+                  '''typedef int YYSTYPE;
+'''
+              ).replace('YY', type_prefix).replace('yy', name_prefix) # hack
+            )
+          elif line == '/* GENERATE YYLTYPE */\n':
+            fout.write(
+              '''/* GENERATE YYLTYPE BEGIN */
+{0:s}/* GENERATE YYLTYPE END */
+'''.format(
+                  '''#if ! defined YYLTYPE && ! defined YYLTYPE_IS_DECLARED
+typedef struct YYLTYPE YYLTYPE;
+struct YYLTYPE
+{
+  int first_line;
+  int first_column;
+  int last_line;
+  int last_column;
+};
+# define YYLTYPE_IS_DECLARED 1
+# define YYLTYPE_IS_TRIVIAL 1
+#endif
+'''
+                if pyacc[0].locations else
+                  ''
+              ).replace('YY', type_prefix).replace('yy', name_prefix) # hack
+            )
+          elif line == '/* GENERATE SECTION1AFTERUNION */\n':
+            fout.write(
+              '''/* GENERATE SECTION1AFTERUNION BEGIN */
+{0:s}/* GENERATE SECTION1AFTERUNION END */
+'''.format(
+                ''.join(
+                  [
+                    '{0:s}\n'.format(i.get_text())
+                    for i in pyacc.after_union_text
+                  ]
+                )
+              )
+            )
+          elif line == '/* GENERATE TABLES */\n':
+            # yyrline (source line where rule is defined) not implemented yet
+            yyrline = numpy.zeros(
+              (_bison_lr1dfa.rules.shape[0],),
+              numpy.int16
+            )
+
+            # yytname (textual terminal/nonterminal name) wraps 70 columns
+            x = 70
+            yytname_lines = []
+            for i in (
+              [
+                (
+                  '"{0:s}"'.format(i.name)
+                if len(i.name) else
+                  '"\'{0:s}\'"'.format(
+                    escapes[i.character_set[0]]
+                  if i.character_set[0] in escapes else
+                    chr(i.character_set[0])
+                  if i.character_set[0] >= 0x20 else
+                    '\\\\x{0:02x}'.format(i.character_set[0])
+                  )
+                )
+                for i in pyacc.symbols
+                if i._type == ast.PYACC.Symbol.TYPE_TERMINAL
+              ] +
+              [
+                '"{0:s}"'.format(i.name)
+                for i in pyacc.symbols
+                if i._type == ast.PYACC.Symbol.TYPE_NONTERMINAL
+              ] +
+              ['"$@{0:d}"'.format(i) for i in range(n_midrule_actions)] +
+              ['YY_NULLPTR']
+            ):
+              if x + len(i) >= 70:
+                yytname_lines.append([])
+                x = 0
+              yytname_lines[-1].append(i)
+              x += len(i) + 2
+
+            # yytoknum is the approximate reverse of _bison_lr1dfa.translate,
+            # do in reverse order so later duplicate entries get overwritten
+            yytoknum = numpy.zeros((_bison_lr1dfa.n_terminals,), numpy.int16)
+            yytoknum[_bison_lr1dfa.translate[::-1]] = numpy.arange(
+              _bison_lr1dfa.translate.shape[0] - 1,
+              -1,
+              -1,
+              numpy.int16
+            )
+
+            fout.write(
+              '''/* GENERATE TABLES BEGIN */
+/* YYFINAL -- State number of the termination state.  */
+#define YYFINAL {0:d}
+/* YYLAST -- Last index in YYTABLE.  */
+#define YYLAST {1:d}
+
+/* YYNTOKENS -- Number of terminals.  */
+#define YYNTOKENS {2:d}
+/* YYNNTS -- Number of nonterminals.  */
+#define YYNNTS {3:d}
+/* YYNRULES -- Number of rules.  */
+#define YYNRULES {4:d}
+/* YYNSTATES -- Number of states.  */
+#define YYNSTATES {5:d}
+
+/* YYTRANSLATE[YYX] -- Symbol number corresponding to YYX as returned
+   by yylex, with out-of-bounds checking.  */
+#define YYUNDEFTOK 2
+#define YYMAXUTOK {6:d}
+
+#define YYTRANSLATE(YYX)                                                \\
+  ((unsigned int) (YYX) <= YYMAXUTOK ? yytranslate[YYX] : YYUNDEFTOK)
+
+/* YYTRANSLATE[TOKEN-NUM] -- Symbol number corresponding to TOKEN-NUM
+   as returned by yylex, without out-of-bounds checking.  */
+static const yytype_int16 yytranslate[] =
+{{{7:s}
+}};
+
+#if YYDEBUG
+  /* YYRLINE[YYN] -- Source line where rule number YYN was defined.  */
+static const yytype_int16 yyrline[] =
+{{{8:s}
+}};
+#endif
+
+#if YYDEBUG || YYERROR_VERBOSE || {9:d}
+/* YYTNAME[SYMBOL-NUM] -- String name of the symbol SYMBOL-NUM.
+   First, the terminals, then, starting at YYNTOKENS, nonterminals.  */
+static const char *const yytname[] =
+{{{10:s}
+}};
+#endif
+
+# ifdef YYPRINT
+/* YYTOKNUM[NUM] -- (External) token number corresponding to the
+   (internal) symbol number NUM (which must be that of a token).  */
+static const yytype_int16 yytoknum[] =
+{{{11:s}
+}};
+# endif
+
+#define YYPACT_NINF {12:d}
+
+#define yypact_value_is_default(Yystate) \\
+  (!!((Yystate) == ({13:d})))
+
+#define YYTABLE_NINF -1
+
+#define yytable_value_is_error(Yytable_value) \\
+  0
+
+  /* YYPACT[STATE-NUM] -- Index in YYTABLE of the portion describing
+     STATE-NUM.  */
+static const yytype_int16 yypact[] =
+{{{14:s}
+}};
+
+  /* YYDEFACT[STATE-NUM] -- Default reduction number in state STATE-NUM.
+     Performed when YYTABLE does not specify something else to do.  Zero
+     means the default is an error.  */
+static const yytype_int16 yydefact[] =
+{{{15:s}
+}};
+
+  /* YYPGOTO[NTERM-NUM].  */
+static const yytype_int16 yypgoto[] =
+{{{16:s}
+}};
+
+  /* YYDEFGOTO[NTERM-NUM].  */
+static const yytype_int16 yydefgoto[] =
+{{{17:s}
+}};
+
+  /* YYTABLE[YYPACT[STATE-NUM]] -- What to do in state STATE-NUM.  If
+     positive, shift that token.  If negative, reduce the rule whose
+     number is the opposite.  If YYTABLE_NINF, syntax error.  */
+static const yytype_int16 yytable[] =
+{{{18:s}
+}};
+
+static const yytype_int16 yycheck[] =
+{{{19:s}
+}};
+
+  /* YYSTOS[STATE-NUM] -- The (internal number of the) accessing
+     symbol of state STATE-NUM.  */
+static const yytype_int16 yystos[] =
+{{{20:s}
+}};
+
+  /* YYR1[YYN] -- Symbol number of symbol that rule YYN derives.  */
+static const yytype_int16 yyr1[] =
+{{{21:s}
+}};
+
+  /* YYR2[YYN] -- Number of symbols on the right hand side of rule YYN.  */
+static const yytype_int16 yyr2[] =
+{{{22:s}
+}};
+/* GENERATE TABLES END */
+'''.format(
+                # YYFINAL
+                _bison_lr1dfa.action_pointer.shape[0],
+                # YYLAST
+                _bison_lr1dfa.entries.shape[0] - 1,
+                # YYNTOKENS
+                _bison_lr1dfa.n_terminals,
+                # YYNNTS
+                _bison_lr1dfa.goto_pointer.shape[0],
+                # YYNRULES
+                _bison_lr1dfa.rules.shape[0],
+                # YYNSTATES
+                _bison_lr1dfa.action_pointer.shape[0],
+                # YYMAXUTOK
+                _bison_lr1dfa.translate.shape[0] - 1,
+                # yytranslate
+                ','.join(
+                  [
+                    '\n  {0:s}'.format(
+                      ','.join(
+                        [
+                          '{0:6d}'.format(j)
+                          for j in _bison_lr1dfa.translate[i:i + 10]
+                        ]
+                      )
+                    )
+                    for i in range(0, _bison_lr1dfa.translate.shape[0], 10)
+                  ]
+                ),
+                # yyrline
+                ','.join(
+                  [
+                    '\n{0:s}'.format(
+                      ','.join(
+                        [
+                          '{0:6d}'.format(j)
+                          for j in yyrline[i:i + 10]
+                        ]
+                      )
+                    )
+                    for i in range(0, yyrline.shape[0], 10)
+                  ]
+                ),
+                # YYERROR_VERBOSE (strangely the defined value is repeated)
+                int(pyacc[0].error_verbose),
+                # yytname
+                ','.join(
+                  ['\n  {0:s}'.format(', '.join(i)) for i in yytname_lines]
+                ),
+                # yytoknum
+                ','.join(
+                  [
+                    '\n  {0:s}'.format(
+                      ','.join(
+                        [
+                          '{0:6d}'.format(j)
+                          for j in yytoknum[i:i + 10]
+                        ]
+                      )
+                    )
+                    for i in range(0, yytoknum.shape[0], 10)
+                  ]
+                ),
+                # YYPACT_NINF
+                -_bison_lr1dfa.entry_base,
+                # yypact_value_is_default
+                -_bison_lr1dfa.entry_base,
+                # yypact
+                ','.join(
+                  [
+                    '\n  {0:s}'.format(
+                      ','.join(
+                        [
+                          '{0:6d}'.format(j)
+                          for j in _bison_lr1dfa.action_pointer[i:i + 10]
+                        ]
+                      )
+                    )
+                    for i in range(0, _bison_lr1dfa.action_pointer.shape[0], 10)
+                  ]
+                ),
+                # yydefact
+                ','.join(
+                  [
+                    '\n  {0:s}'.format(
+                      ','.join(
+                        [
+                          '{0:6d}'.format(j)
+                          for j in _bison_lr1dfa.default_action[i:i + 10]
+                        ]
+                      )
+                    )
+                    for i in range(0, _bison_lr1dfa.default_action.shape[0], 10)
+                  ]
+                ),
+                # yypgoto
+                ','.join(
+                  [
+                    '\n  {0:s}'.format(
+                      ','.join(
+                        [
+                          '{0:6d}'.format(j)
+                          for j in _bison_lr1dfa.goto_pointer[i:i + 10]
+                        ]
+                      )
+                    )
+                    for i in range(0, _bison_lr1dfa.goto_pointer.shape[0], 10)
+                  ]
+                ),
+                # yydefgoto
+                ','.join(
+                  [
+                    '\n  {0:s}'.format(
+                      ','.join(
+                        [
+                          '{0:6d}'.format(j)
+                          for j in _bison_lr1dfa.default_goto[i:i + 10]
+                        ]
+                      )
+                    )
+                    for i in range(0, _bison_lr1dfa.default_goto.shape[0], 10)
+                  ]
+                ),
+                # yytable
+                ','.join(
+                  [
+                    '\n  {0:s}'.format(
+                      ','.join(
+                        [
+                          '{0:6d}'.format(j)
+                          for j in _bison_lr1dfa.entries[i:i + 10, 0]
+                        ]
+                      )
+                    )
+                    for i in range(0, _bison_lr1dfa.entries.shape[0], 10)
+                  ]
+                ),
+                # yycheck
+                ','.join(
+                  [
+                    '\n  {0:s}'.format(
+                      ','.join(
+                        [
+                          '{0:6d}'.format(j)
+                          for j in _bison_lr1dfa.entries[i:i + 10, 1]
+                        ]
+                      )
+                    )
+                    for i in range(0, _bison_lr1dfa.entries.shape[0], 10)
+                  ]
+                ),
+                # yystos
+                ','.join(
+                  [
+                    '\n  {0:s}'.format(
+                      ','.join(
+                        [
+                          '{0:6d}'.format(j)
+                          for j in _bison_lr1dfa.accessing_symbols[i:i + 10]
+                        ]
+                      )
+                    )
+                    for i in range(0, _bison_lr1dfa.accessing_symbols.shape[0], 10)
+                  ]
+                ),
+                # yyr1
+                ','.join(
+                  [
+                    '\n  {0:s}'.format(
+                      ','.join(
+                        [
+                          '{0:6d}'.format(j)
+                          for j in _bison_lr1dfa.rules[i:i + 10, 0]
+                        ]
+                      )
+                    )
+                    for i in range(0, _bison_lr1dfa.rules.shape[0], 10)
+                  ]
+                ),
+                # yyr2
+                ','.join(
+                  [
+                    '\n  {0:s}'.format(
+                      ','.join(
+                        [
+                          '{0:6d}'.format(j)
+                          for j in _bison_lr1dfa.rules[i:i + 10, 1]
+                        ]
+                      )
+                    )
+                    for i in range(0, _bison_lr1dfa.rules.shape[0], 10)
+                  ]
+                )
+              ).replace('YYDEBUG', type_prefix + 'DEBUG') # hack
+            )
+          elif line == '/* GENERATE INITIALACTION */\n':
+            fout.write(
+              '''/* GENERATE INITIALACTION BEGIN */
+{0:s}/* GENERATE INITIALACTION END */
+'''.format(
+                ''.join(
+                  [
+                    '{0:s}\n'.format(i.get_text())
+                    for i in pyacc.initial_action_text
+                  ]
+                ).replace('(yyval)', '(yylval').replace('(yyloc)', '(yylloc)') # hack
+              )
+            )
+          elif line == '/* GENERATE SECTION2 */\n':
+            fout.write(
+              '''/* GENERATE SECTION2 BEGIN */
+{0:s}/* GENERATE SECTION2 END */
+'''.format(
+                '\n'.join(
+                  [
+                    '''  case {0:d}:
+    {1:s}
+    break;
+'''.format(
+                      i,
+                      _lr1dfa.productions[i][1].get_text()
+                    )
+                    for i in range(len(_lr1dfa.productions))
+                    if _lr1dfa.productions[i][1] is not None
+                  ]
+                )
+              )
+            )
+          elif line == '/* GENERATE SECTION3 */\n':
+            fout.write(
+              '''/* GENERATE SECTION3 BEGIN */
+{0:s}/*GENERATE SECTION3 END */
+'''.format(
+                '' if len(pyacc) < 3 else pyacc[2].get_text()
+              )
+            )
+          else:
+            if file_prefix != 'YY_YY_Y_':
+              line = line.replace('YY_YY_Y_', file_prefix)
+            if is_header:
+              if type_prefix != 'YY':
+                line = line.replace('YY_YY_Y_', 'YY').replace('YY', type_prefix)
+              if name_prefix != 'yy':
+                line = line.replace('yy', name_prefix)
+            elif type_prefix != 'YY':
+              line = line.replace('YYDEBUG', type_prefix + 'DEBUG')
+              line = line.replace('YYSTYPE_IS_', type_prefix + 'STYPE_IS_')
+              line = line.replace('YYLTYPE_IS_', type_prefix + 'LTYPE_IS_')
+            fout.write(line)
+          line = fin.readline()
+  generate(
+    skel_file,
+    out_file,
+    pyacc[0].type_prefix,
+    pyacc[0].type_prefix,
+    pyacc[0].name_prefix,
+    False
+  )
+  if defines_file is not None:
+    generate(
+      '{0:s}.h'.format(
+        skel_file[:-2] if skel_file[-2:] == '.c' else skel_file
+      ),
+      defines_file,
+      pyacc[0].type_prefix,
+      pyacc[0].type_prefix,
+      pyacc[0].name_prefix,
+      True
+    )
index c0d49d6..c33301f 100644 (file)
--- a/lr1dfa.py
+++ b/lr1dfa.py
@@ -1,5 +1,7 @@
 import bisect
+import bison_lr1dfa
 import element
+import numpy
 import work
 import sys
 
@@ -34,6 +36,278 @@ class LR1DFA:
     self.n_terminals = n_terminals
     self.eof_terminal = eof_terminal
 
+  def to_bison_lr1dfa(
+    self,
+    n_terminals,
+    translate_terminals,
+    n_nonterminals,
+    translate_nonterminals
+  ):
+    #numpy.set_printoptions(threshold = numpy.nan)
+    #print(translate_terminals)
+    #print(translate_nonterminals)
+    # translate is yytranslate
+    translate = translate_terminals
+
+    # rules[:, 0] is yyr1
+    # rules[:, 1] is yyr2
+    # note: production 0 (start production) can't be reduced, null it out
+    rules = numpy.concatenate(
+      [
+        numpy.zeros((1, 2), numpy.int16),
+        numpy.stack(
+          [
+            translate_nonterminals + n_terminals,
+            numpy.array([i for i, _ in self.productions[1:]], numpy.int16)
+          ],
+          1
+        )
+      ],
+      0
+    )
+
+    # unpack tables into numpy arrays so we can manipulate them efficiently
+    # note: the goto table is transposed with respect to the action table,
+    # so the row in the table corresponds to the yypact[]/yypgoto[] index,
+    # and the column in the table is what gets added to yypact[]/yypgoto[]
+    action_table = numpy.zeros(
+      (len(self.states), self.n_terminals),
+      numpy.int16
+    )
+    goto_table = numpy.zeros(
+      (len(self.productions), len(self.states)),
+      numpy.int16
+    )
+    for i in range(len(self.states)):
+      terminal_breaks, actions, nonterminal_breaks, gotos = self.states[i]
+
+      terminal0 = 0
+      for j in range(len(terminal_breaks)):
+        terminal1 = terminal_breaks[j]
+        action_table[i, terminal0:terminal1] = actions[j]
+        terminal0 = terminal1
+      assert terminal0 == self.n_terminals
+
+      nonterminal0 = 0
+      for j in range(len(nonterminal_breaks)):
+        nonterminal1 = nonterminal_breaks[j]
+        goto_table[nonterminal0:nonterminal1, i] = gotos[j]
+        nonterminal0 = nonterminal1
+      assert nonterminal0 == len(self.productions)
+    assert numpy.all(action_table != 0)
+    assert numpy.all(goto_table != 0)
+    #print(action_table)
+    #print(goto_table)
+    # permute and combine columns/rows on the basis of the translate vectors
+    new_action_table = numpy.zeros(
+      (len(self.states), n_terminals),
+      numpy.int16
+    )
+    new_action_table[:, translate_terminals] = action_table
+    action_table = new_action_table
+    new_goto_table = numpy.zeros(
+      (n_nonterminals, len(self.states)),
+      numpy.int16
+    )
+    new_goto_table[translate_nonterminals, :] = goto_table[1:] # ignore start
+    goto_table = new_goto_table
+
+    # manipulate the table entries as follows:
+    # - ensure there is no shift or goto 0 (cannot return to starting state)
+    # - replace reduce 0 (start production) with shift to a nonexistent state
+    # - replace action or goto -1 (unrecognized terminal/nonterminal) with 0
+    # - change the low-bit indication of shift/reduce to positive/negative
+    # we do it here after removing redundant columns, as it's more efficient
+    assert numpy.all(action_table != 0)
+    action_table[action_table == 1] = len(self.states) << 1
+    action_table[action_table == -1] = 0
+    mask = (action_table & 1).astype(numpy.bool)
+    action_table >>= 1
+    action_table[mask] = -action_table[mask]
+    assert numpy.all(goto_table != 0)
+    goto_table[goto_table == -1] = 0
+
+    # find out which column the transition to each state occurs in, this
+    # must be unique and is called the "accessing symbol" for the state
+    accessing_table = numpy.any(
+      numpy.concatenate(
+        [action_table, goto_table.transpose()],
+        1
+      )[:, :, numpy.newaxis] ==
+      numpy.arange(
+        1,
+        len(self.states),
+        dtype = numpy.int16
+      )[numpy.newaxis, numpy.newaxis, :],
+      0
+    )
+    accessing_symbols = numpy.full(
+      (len(self.states),),
+      2, # '$undefined'
+      numpy.int16
+    )
+    for i in range(1, len(self.states)):
+      accessing_symbol = numpy.nonzero(accessing_table[:, i - 1])[0]
+      if accessing_symbol.shape[0] == 0:
+        sys.stderr.write('warning: unreachable state {0:d}\n'.format(i))
+      elif accessing_symbol.shape[0] == 1:
+        accessing_symbols[i] = accessing_symbol[0]
+      else:
+        assert False
+
+    # default_action is yydefact, default_goto is yydefgoto
+    # find default reduce (most common negative value per action_table row)
+    # and find default goto (most common non-zero value per goto_table row)
+    # note: 0 is used as a last resort if there is no proper default value
+    default_action = numpy.argmax(
+      numpy.concatenate(
+        [
+          numpy.zeros((len(self.states), 1), numpy.int16),
+          numpy.sum(
+            (
+              action_table[:, :, numpy.newaxis] ==
+              numpy.arange(
+                -1,
+                -len(self.productions),
+                -1,
+                dtype = numpy.int16
+              )[numpy.newaxis, numpy.newaxis, :]
+            ).astype(numpy.int16),
+            1
+          )
+        ],
+        1
+      ),
+      1
+    )
+    #print(action_table)
+    #print(default_action)
+    default_goto = numpy.argmax(
+      numpy.concatenate(
+        [
+          numpy.zeros((n_nonterminals, 1), numpy.int16),
+          numpy.sum(
+            (
+              goto_table[:, :, numpy.newaxis] ==
+              numpy.arange(
+                1,
+                len(self.states),
+                dtype = numpy.int16
+              )[numpy.newaxis, numpy.newaxis, :]
+            ).astype(numpy.int16),
+            1
+          )
+        ],
+        1
+      ),
+      1
+    )
+    #print(goto_table)
+    #print(default_goto)
+
+    # fill in the zero entries in the tables with default reduce or goto
+    for i in range(len(self.states)):
+      action_table[i, action_table[i, :] == 0] = -default_action[i]
+    for i in range(n_nonterminals):
+      goto_table[i, goto_table[i, :] == 0] = default_goto[i]
+    #print(action_table)
+    #print(goto_table)
+
+    # entry_base indicates the most negative starting index we can expect
+    # we maintain n_entries such that entries_used[n_entries:, :] == False
+    # entries[i, 0] is yytable[i - entry_base]
+    # entries[i, 1] is yycheck[i - entry_base]
+    # entries_used[i, 0] == True if any vector has been stored starting at i
+    # entries_used[0, 0] does not go True as it is -infinity and can be reused
+    # entries_used[i, 1] == True if entry has been stored in entries[i, :]
+    entry_base = max(n_terminals, len(self.states))
+    n_entries = entry_base
+    entries = numpy.full((entry_base, 2), 0x8000, numpy.int16)
+    entries_used = numpy.zeros((entry_base, 2), numpy.bool)
+    entries_used[:, 1] = True # padding so we don't use any negative entries
+
+    # action_pointer is yypact, goto_pointer is yypgoto
+    def pack_matrix(data, mask):
+      nonlocal n_entries, entries, entries_used # also uses entry_base
+
+      start_indices = numpy.zeros((data.shape[0],), numpy.int16)
+      for i in range(data.shape[0]):
+        indices = numpy.nonzero(mask[i, :])[0]
+        if indices.shape[0] == 0:
+          start_index = 0
+        else:
+          size = indices[-1] + 1
+
+          # ensure arrays are at least large enough to find a spot
+          while entries.shape[0] < n_entries + size:
+            # extend entries, copying only n_entries entries
+            new_entries = numpy.full(
+              (entries.shape[0] * 2, 2),
+              0x8000,
+              numpy.int16
+            )
+            new_entries[:n_entries, :] = entries[:n_entries, :]
+            entries = new_entries
+
+            # extend entries_used, copying only n_entries entries
+            new_entries_used = numpy.zeros(
+              (entries_used.shape[0] * 2, 2),
+              numpy.bool
+            )
+            new_entries_used[:n_entries, :] = entries_used[:n_entries, :]
+            entries_used = new_entries_used
+
+          # find a suitable spot and store differences from default
+          start_index = entry_base - indices[0]
+          while start_index < n_entries:
+            if (
+              not entries_used[start_index, 0] and
+              not numpy.any(entries_used[indices + start_index, 1])
+            ):
+              break
+            start_index += 1
+          entries[indices + start_index, 0] = data[i, indices]
+          entries[indices + start_index, 1] = indices
+          entries_used[start_index, 0] = True
+          entries_used[indices + start_index, 1] = True
+          if n_entries < start_index + size:
+            n_entries = start_index + size
+
+        start_indices[i] = start_index
+      return start_indices
+    action_pointer = pack_matrix(
+      action_table,
+      action_table != (-default_action)[:, numpy.newaxis]
+    ) - entry_base
+    goto_pointer = pack_matrix(
+      goto_table,
+      goto_table != default_goto[:, numpy.newaxis]
+    ) - entry_base
+    new_entries = numpy.zeros(
+      (n_entries - entry_base, 2),
+      numpy.int16
+    )
+    new_entries[:, :] = entries[entry_base:n_entries, :]
+    entries = new_entries
+
+    # n_states == action_pointer.shape[0]
+    # n_productions == rules.shape[0]
+    # n_nonterminals == goto_pointer.shape[0]
+    return bison_lr1dfa.BisonLR1DFA(
+      translate,
+      rules,
+      accessing_symbols,
+      default_action,
+      default_goto,
+      entry_base,
+      entries,
+      action_pointer,
+      goto_pointer,
+      n_terminals
+    )
+
   #def parse_text(self, text, i):
   #  state = 0
   #  value_stack = []