Add element-based Python scanner/parser in root of repository which builds AST direct...
author: Nick Downing <nick@ndcode.org>
Mon, 21 Jan 2019 01:24:45 +0000 (12:24 +1100)
committer: Nick Downing <nick@ndcode.org>
Mon, 21 Jan 2019 01:24:45 +0000 (12:24 +1100)
.gitignore
Makefile [new file with mode: 0644]
bootstrap/parse-gram.y
markup.py [new file with mode: 0755]
n.sh [new file with mode: 0755]
parse-gram.y [new file with mode: 0644]
reserialize.py [new file with mode: 0755]
scan-code.l [new file with mode: 0644]
scan-gram.l [new file with mode: 0644]
state.py [new file with mode: 0644]

index 7e7d720..3b98e57 100644 (file)
@@ -1,24 +1,29 @@
 __pycache__
-bootstrap/*.xml
-bootstrap/lex_yy.py
-bootstrap/lex_yy_code.py
-bootstrap/out
-bootstrap/y_tab.py
-lex-yacc-examples/*.c
-lex-yacc-examples/*.h
-lex-yacc-examples/*.o
-lex-yacc-examples/*.xml
-lex-yacc-examples/example4
-lex-yacc-examples/example7
-skel/skel_bison.c.orig
-skel/skel_bison.h.orig
-tests/*.c
-tests/*.o
-tests/*.xml
-tests/lex_yy.py
-tests/y_tab.py
-tests/cal
-tests/cal2
-tests_ast/*.xml
-tests_ast/lex_yy.py
-tests_ast/y_tab.py
+/*.xml
+/bootstrap/*.xml
+/bootstrap/lex_yy.py
+/bootstrap/lex_yy_code.py
+/bootstrap/out
+/bootstrap/y_tab.py
+/lex-yacc-examples/*.c
+/lex-yacc-examples/*.h
+/lex-yacc-examples/*.o
+/lex-yacc-examples/*.xml
+/lex-yacc-examples/example4
+/lex-yacc-examples/example7
+/lex_yy.py
+/lex_yy_code.py
+/out
+/skel/skel_bison.c.orig
+/skel/skel_bison.h.orig
+/tests/*.c
+/tests/*.o
+/tests/*.xml
+/tests/lex_yy.py
+/tests/y_tab.py
+/tests/cal
+/tests/cal2
+/tests_ast/*.xml
+/tests_ast/lex_yy.py
+/tests_ast/y_tab.py
+/y_tab.py
diff --git a/Makefile b/Makefile
new file mode 100644 (file)
index 0000000..52a6513
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,16 @@
+all: lex_yy.py lex_yy_code.py y_tab.py
+
+lex_yy.py: scan-gram.l bootstrap/skel_lex_yy.py
+       ../bootstrap_flex.git/src/flex -o /dev/null $< 2>$<.xml
+       ../pilex.git/pilex.py --python --skel bootstrap/skel_lex_yy.py $<.xml
+
+lex_yy_code.py: scan-code.l
+       ../bootstrap_flex.git/src/flex -o /dev/null $< 2>$<.xml
+       ../pilex.git/pilex.py --element --python -o $@ $<.xml
+
+y_tab.py: parse-gram.y bootstrap/skel_y_tab.py
+       ../bootstrap_bison.git/src/bison -o /dev/null $< 2>$<.xml
+       ./piyacc.py --python --skel bootstrap/skel_y_tab.py $<.xml
+
+clean:
+       rm -f lex_yy.py lex_yy_code.py y_tab.py *.xml
index 514eb4f..8d3cbc8 100644 (file)
@@ -1089,12 +1089,6 @@ epilogue.opt
   | "%%" EPILOGUE
     {
       #muscle_code_grow('epilogue', translate_code($2, @2, True), @2)
-      lex_yy_code.yyin = None
-      lex_yy_code.yy_buffer_stack = [lex_yy_code.YYBufferState()]
-      lex_yy_code.yytext = '' # fool unput()
-      lex_yy_code.unput($2)
-      lex_yy_code.sc_context = lex_yy_code.INITIAL # CODE_PROPS_PLAIN
-      lex_yy_code.yylex()
       #code_scanner_last_string_free()
       insert_after(1, '</AST_Section3>')
       insert_after(0, '<AST_Section3>')
diff --git a/markup.py b/markup.py
new file mode 100755 (executable)
index 0000000..1817bee
--- /dev/null
+++ b/markup.py
@@ -0,0 +1,10 @@
+#!/usr/bin/env python3
+
+import lex_yy
+import y_tab
+import sys
+
+lex_yy.gram_piece_append('<root>\n  <AST ref=\"0\">')
+y_tab.yyparse()
+lex_yy.gram_piece_append('</AST>\n</root>\n')
+sys.stdout.write(''.join(lex_yy.gram_piece))
diff --git a/n.sh b/n.sh
new file mode 100755 (executable)
index 0000000..61baf6b
--- /dev/null
+++ b/n.sh
@@ -0,0 +1,17 @@
+#!/bin/sh
+if ! test -d out
+then
+  mkdir out
+  bootstrap/markup.py <tests/cal.y |./reserialize.py >out/cal.y.xml.ok
+  bootstrap/markup.py <tests_ast/cal_py.y |./reserialize.py >out/cal_py.y.xml.ok
+  bootstrap/markup.py <../bootstrap_flex.git/src/parse.y |./reserialize.py >out/parse.y.xml.ok
+  bootstrap/markup.py <../bootstrap_bison.git/src/parse-gram.y |./reserialize.py >out/parse-gram.y.xml.ok
+fi
+./markup.py <tests/cal.y |./reserialize.py >out/cal.y.xml
+diff -q out/cal.y.xml.ok out/cal.y.xml
+./markup.py <tests_ast/cal_py.y |./reserialize.py >out/cal_py.y.xml
+diff -q out/cal_py.y.xml.ok out/cal_py.y.xml
+./markup.py <../bootstrap_flex.git/src/parse.y |./reserialize.py >out/parse.y.xml
+diff -q out/parse.y.xml.ok out/parse.y.xml
+./markup.py <../bootstrap_bison.git/src/parse-gram.y |./reserialize.py >out/parse-gram.y.xml
+diff -q out/parse-gram.y.xml.ok out/parse-gram.y.xml
diff --git a/parse-gram.y b/parse-gram.y
new file mode 100644 (file)
index 0000000..7796dae
--- /dev/null
@@ -0,0 +1,1232 @@
+/* Bison Grammar Parser                             -*- C -*-
+
+   Copyright (C) 2002-2015, 2018 Free Software Foundation, Inc.
+
+   This file is part of Bison, the GNU Compiler Compiler.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+%code requires
+{
+  import ast
+  import element
+  import lex_yy_code
+  import state
+  import xml.etree.ElementTree
+
+  YYLTYPE = state.location
+}
+
+%code top
+{
+}
+
+%code
+{
+  #current_prec = 0
+  #current_lhs_location = 0
+  #current_lhs_named_ref = []
+  #current_lhs_symbol = []
+  #current_class = unknown_sym
+  #current_type = None
+  gram_piece2 = 0
+  gram_piece3 = 0
+  #nested_rhs = 0
+}
+
+/* Nick %define api.prefix {gram_} */
+%define api.pure full
+%define locations
+%define parse.error verbose
+%define parse.lac full
+%define parse.trace
+/* Nick %defines */
+%expect 0
+/* Nick %verbose */
+
+%initial-action
+{
+  #boundary_set(&@$.start, current_file, 1, 1)
+  #boundary_set(&@$.end, current_file, 1, 1)
+}
+
+/* Define the tokens together with their human representation.  */
+%token GRAM_EOF 0 "end of file"
+%token STRING     "string"
+
+%token PERCENT_TOKEN       "%token"
+%token PERCENT_NTERM       "%nterm"
+
+%token PERCENT_TYPE        "%type"
+%token PERCENT_DESTRUCTOR  "%destructor"
+%token PERCENT_PRINTER     "%printer"
+
+%token PERCENT_LEFT        "%left"
+%token PERCENT_RIGHT       "%right"
+%token PERCENT_NONASSOC    "%nonassoc"
+%token PERCENT_PRECEDENCE  "%precedence"
+
+%token PERCENT_PREC          "%prec"
+%token PERCENT_DPREC         "%dprec"
+%token PERCENT_MERGE         "%merge"
+
+/*----------------------.
+| Global Declarations.  |
+`----------------------*/
+
+%token
+  PERCENT_CODE            "%code"
+  PERCENT_DEFAULT_PREC    "%default-prec"
+  PERCENT_DEFINE          "%define"
+  PERCENT_DEFINES         "%defines"
+  PERCENT_ERROR_VERBOSE   "%error-verbose"
+  PERCENT_EXPECT          "%expect"
+  PERCENT_EXPECT_RR       "%expect-rr"
+  PERCENT_FLAG            "%<flag>"
+  PERCENT_FILE_PREFIX     "%file-prefix"
+  PERCENT_GLR_PARSER      "%glr-parser"
+  PERCENT_INITIAL_ACTION  "%initial-action"
+  PERCENT_LANGUAGE        "%language"
+  PERCENT_NAME_PREFIX     "%name-prefix"
+  PERCENT_NO_DEFAULT_PREC "%no-default-prec"
+  PERCENT_NO_LINES        "%no-lines"
+  PERCENT_NONDETERMINISTIC_PARSER
+                          "%nondeterministic-parser"
+  PERCENT_OUTPUT          "%output"
+  PERCENT_REQUIRE         "%require"
+  PERCENT_SKELETON        "%skeleton"
+  PERCENT_START           "%start"
+  PERCENT_TOKEN_TABLE     "%token-table"
+  PERCENT_VERBOSE         "%verbose"
+  PERCENT_YACC            "%yacc"
+;
+
+%token BRACED_CODE     "{...}"
+%token BRACED_PREDICATE "%?{...}"
+%token BRACKETED_ID    "[identifier]"
+%token CHAR            "char"
+%token EPILOGUE        "epilogue"
+%token EQUAL           "="
+%token ID              "identifier"
+%token ID_COLON        "identifier:"
+%token PERCENT_PERCENT "%%"
+%token PIPE            "|"
+%token PROLOGUE        "%{...%}"
+%token SEMICOLON       ";"
+%token TAG             "<tag>"
+%token TAG_ANY         "<*>"
+%token TAG_NONE        "<>"
+
+/*%union {
+  character = 0
+}*/
+/*%type <character> CHAR*/
+%printer {
+  fputs_unlocked(char_name($$), yyo)
+} CHAR
+
+/*%union {
+  code = ''
+};*/
+/*%type <code> "{...}" "%?{...}" "%{...%}" EPILOGUE STRING*/
+%printer {
+  fputs_unlocked(quotearg_style(c_quoting_style, $$), yyo)
+} STRING
+/*%printer {
+  rpl_fprintf(yyo, '{\n%s\n}', $$)
+} <code>*/
+
+/*%union {
+  uniqstr = ''
+}*/
+/*%type <uniqstr> BRACKETED_ID ID ID_COLON PERCENT_FLAG TAG tag variable*/
+/*%printer {
+  fputs_unlocked($$, yyo)
+} <uniqstr>*/
+%printer {
+  rpl_fprintf(yyo, '[%s]', $$)
+} BRACKETED_ID
+%printer {
+  rpl_fprintf(yyo, '%s:', $$)
+} ID_COLON
+%printer {
+  rpl_fprintf(yyo, '%%%s', $$)
+} PERCENT_FLAG
+%printer {
+  rpl_fprintf(yyo, '<%s>', $$)
+} TAG tag
+
+/*%union {
+  integer = 0
+};*/
+/*%token <integer> INT "integer"*/
+%token INT "integer"
+/*%printer {
+  rpl_fprintf(yyo, '%d', $$)
+} <integer>*/
+
+/*%union {
+  symbol = []
+}*/
+/*%type <symbol> id id_colon string_as_id symbol symbol.prec*/
+/*%printer {
+  rpl_fprintf(yyo, '%s', $$->tag)
+} <symbol>*/
+%printer {
+  rpl_fprintf(yyo, '%s:', $$->tag)
+} id_colon
+
+/*%union {
+  assoc = 0
+};*/
+/*%type <assoc> precedence_declarator*/
+
+/*%union {
+  list = []
+}*/
+/*%type <list>  symbols.1 symbols.prec generic_symlist generic_symlist_item*/
+
+/*%union {
+  named_ref = []
+}*/
+/*%type <named_ref> named_ref.opt*/
+
+/*---------.
+| %param.  |
+`---------*/
+%code requires
+{
+  param_none = 0
+  param_lex = 1 << 0
+  param_parse = 1 << 1
+  param_both = param_lex | param_parse
+};
+%code
+{
+  current_param = param_none
+};
+/*%union {
+  param = 0
+}*/
+/*%token <param> PERCENT_PARAM "%param";*/
+%token PERCENT_PARAM "%param";
+/*%printer
+{
+  if $$ == param_lex:
+    fputs_unlocked('%' 'lex-param', yyo)
+    break
+  elif $$ == param_parse:
+    fputs_unlocked('%' 'parse-param', yyo)
+    break
+  elif $$ == param_both:
+    fputs_unlocked('%' 'param', yyo)
+    break
+  elif $$ == param_none:
+    assert(False)
+    break
+} <param>;*/
+
+                     /*==========\
+                     | Grammar.  |
+                     \==========*/
+%%
+
+input
+  : prologue_declarations "%%" grammar epilogue.opt
+    {
+      insert_after(2, '</AST_Section2>')
+      insert_before(2, '<AST_Section2>')
+      insert_after(0, '</AST_Section1>')
+      insert_before(0, '<AST_Section1>')
+    }
+  ;
+
+        /*------------------------------------.
+        | Declarations: before the first %%.  |
+        `------------------------------------*/
+
+prologue_declarations
+  : %empty
+    {
+      global yychar
+      if yychar == YYEMPTY:
+        yychar = lex_yy.gram_lex()
+      temp = lex_yy.gram_piece[gram_piece2 + 1]
+      lex_yy.gram_piece[gram_piece2 + 1] = lex_yy.gram_piece[gram_piece2]
+      lex_yy.gram_piece[gram_piece2] = lex_yy.gram_piece[gram_piece2 - 1]
+      lex_yy.gram_piece[gram_piece2 - 1] = temp
+    }
+  | prologue_declarations prologue_declaration
+  ;
+
+prologue_declaration
+  : grammar_declaration
+  | "%{...%}"
+    {
+      #muscle_code_grow('post_prologue' if union_seen else 'pre_prologue', translate_code($1, @1, True), @1)
+      lex_yy_code.yyin = None
+      lex_yy_code.yy_buffer_stack = [lex_yy_code.YYBufferState()]
+      lex_yy_code.yytext = '' # fool unput()
+      lex_yy_code.unput($1)
+      lex_yy_code.sc_context = lex_yy_code.INITIAL # CODE_PROPS_PLAIN
+      lex_yy_code.yylex(ast.AST.Text)
+      ref_list = []
+      element.serialize_ref(lex_yy_code.yy_element_space, ref_list)
+      del lex_yy_code.yy_element_space.attrib['ref']
+      lex_yy.gram_piece[gram_piece2] = \
+        '<AST_Section1_Prologue>%{{{0:s}%}}</AST_Section1_Prologue>'.format(
+          xml.etree.ElementTree.tostring(
+            lex_yy_code.yy_element_space,
+            encoding = 'unicode'
+          )
+        )
+    }
+  | "%<flag>"
+    {
+      #muscle_percent_define_ensure($1, @1, True)
+      if $1 == 'api.pure':
+        insert_after(0, '</AST_Section1_PureParser>')
+        insert_before(0, '<AST_Section1_PureParser>')
+      elif $1 == 'locations':
+        insert_after(0, '</AST_Section1_Locations>')
+        insert_before(0, '<AST_Section1_Locations>')
+      elif $1 == 'parse.trace':
+        insert_after(0, '</AST_Section1_Debug>')
+        insert_before(0, '<AST_Section1_Debug>')
+      else:
+        assert False
+    }
+  | "%define" variable value
+    {
+      #muscle_percent_define_insert($2, @2, $3.kind, $3.chars, MUSCLE_PERCENT_DEFINE_GRAMMAR_FILE)
+      insert_after(2, '</AST_Section1_Define>')
+      insert_before(0, '<AST_Section1_Define>')
+    }
+  | "%defines"
+    {
+      #defines_flag = True
+      insert_after(0, '</AST_Section1_Defines>')
+      insert_before(0, '<AST_Section1_Defines>')
+    }
+  | "%defines" STRING
+    {
+      #defines_flag = True
+      #spec_defines_file = xstrdup($2)
+      insert_after(1, '</AST_Section1_Defines>')
+      insert_before(0, '<AST_Section1_Defines>')
+    }
+  | "%error-verbose"
+    {
+      #muscle_percent_define_insert('parse.error', @1, muscle_keyword, 'verbose', MUSCLE_PERCENT_DEFINE_GRAMMAR_FILE)
+      insert_after(0, '</AST_Section1_ErrorVerbose>')
+      insert_before(0, '<AST_Section1_ErrorVerbose>')
+    }
+  | "%expect" INT
+    {
+      #expected_sr_conflicts = $2
+      insert_after(1, '</AST_Section1_Expect>')
+      insert_before(0, '<AST_Section1_Expect value="{0:d}">'.format($2))
+    }
+  | "%expect-rr" INT
+    {
+      #expected_rr_conflicts = $2
+      insert_after(1, '</AST_Section1_ExpectRR>')
+      insert_before(0, '<AST_Section1_ExpectRR value="{0:d}">'.format($2))
+    }
+  | "%file-prefix" STRING
+    {
+      #spec_file_prefix = $2
+      insert_after(1, '</AST_Section1_FilePrefix>')
+      insert_before(0, '<AST_Section1_FilePrefix>')
+    }
+  | "%glr-parser"
+    {
+      #nondeterministic_parser = True
+      #glr_parser = True
+      insert_after(0, '</AST_Section1_GLRParser>')
+      insert_before(0, '<AST_Section1_GLRParser>')
+    }
+  | "%initial-action" "{...}"
+    {
+      #muscle_code_grow('initial_action', translate_code($2, @2, False), @2)
+      lex_yy_code.yyin = None
+      lex_yy_code.yy_buffer_stack = [lex_yy_code.YYBufferState()]
+      lex_yy_code.yytext = '' # fool unput()
+      lex_yy_code.unput($2)
+      lex_yy_code.sc_context = lex_yy_code.SC_SYMBOL_ACTION # CODE_PROPS_SYMBOL_ACTION
+      lex_yy_code.yylex(ast.AST.Text)
+      ref_list = []
+      element.serialize_ref(lex_yy_code.yy_element_space, ref_list)
+      del lex_yy_code.yy_element_space.attrib['ref']
+      lex_yy.gram_piece[gram_piece2 + 2] = xml.etree.ElementTree.tostring(
+        lex_yy_code.yy_element_space,
+        encoding = 'unicode'
+      )
+      insert_after(1, '</AST_Section1_InitialAction>')
+      insert_before(0, '<AST_Section1_InitialAction>')
+    }
+  | "%language" STRING
+    {
+      #language_argmatch($2, grammar_prio, @1)
+      insert_after(1, '</AST_Section1_Language>')
+      insert_before(0, '<AST_Section1_Language>')
+    }
+  | "%name-prefix" STRING
+    {
+      #spec_name_prefix = $2
+      insert_after(1, '</AST_Section1_NamePrefix>')
+      insert_before(0, '<AST_Section1_NamePrefix>')
+    }
+  | "%no-lines"
+    {
+      #no_lines_flag = True
+      insert_after(0, '</AST_Section1_Lines>')
+      insert_before(0, '<AST_Section1_Lines value="false">')
+    }
+  | "%nondeterministic-parser"
+    {
+      #nondeterministic_parser = True
+      insert_after(0, '</AST_Section1_NonDeterministicParser>')
+      insert_before(0, '<AST_Section1_NonDeterministicParser>')
+    }
+  | "%output" STRING
+    {
+      #spec_outfile = $2
+      insert_after(1, '</AST_Section1_Output>')
+      insert_before(0, '<AST_Section1_Output>')
+    }
+  | "%param"
+    {
+      #current_param = $1
+    }
+    params
+    {
+      #current_param = param_none
+      insert_after(2, '</AST_Section1_Param>')
+      insert_before(0, '<AST_Section1_Param>')
+    }
+  | "%require" STRING
+    {
+      #version_check(&@2, $2)
+      insert_after(1, '</AST_Section1_Require>')
+      insert_before(0, '<AST_Section1_Require>')
+    }
+  | "%skeleton" STRING
+    {
+      #skeleton_user = $2
+      #if strchr(skeleton_user, ord('/')):
+      #  dir_length = len(current_file)
+      #  skeleton_build = None
+      #  while dir_length and current_file[dir_length - 1] != ord('/'):
+      #    dir_length -= 1
+      #  while dir_length and current_file[dir_length - 1] == ord('/'):
+      #    dir_length -= 1
+      #  skeleton_build = xmalloc(dir_length + 1 + len(skeleton_user) + 1)
+      #  if dir_length > 0:
+      #    memcpy(skeleton_build, current_file, dir_length)
+      #    skeleton_build[dir_length++] = ord('/')
+      #  strcpy(skeleton_build + dir_length, skeleton_user)
+      #  skeleton_user = uniqstr_new(skeleton_build)
+      #  free(skeleton_build)
+      #skeleton_arg(skeleton_user, grammar_prio, @1)
+      insert_after(1, '</AST_Section1_Skeleton>')
+      insert_before(0, '<AST_Section1_Skeleton>')
+    }
+  | "%token-table"
+    {
+      #token_table_flag = True
+      insert_after(0, '</AST_Section1_TokenTable>')
+      insert_before(0, '<AST_Section1_TokenTable>')
+    }
+  | "%verbose"
+    {
+      #report_flag |= report_states
+      insert_after(0, '</AST_Section1_Verbose>')
+      insert_before(0, '<AST_Section1_Verbose>')
+    }
+  | "%yacc"
+    {
+      #yacc_flag = True
+      insert_after(0, '</AST_Section1_YACC>')
+      insert_before(0, '<AST_Section1_YACC>')
+    }
+  | /*FIXME: Err?  What is this horror doing here? */ ";"
+  ;
+
+params
+  : params "{...}"
+    {
+      #add_param(current_param, $2, @2)
+    }
+  | "{...}"
+    {
+      #add_param(current_param, $1, @1)
+    }
+  ;
+
+/*----------------------.
+| grammar_declaration.  |
+`----------------------*/
+
+grammar_declaration
+  : precedence_declaration
+  | symbol_declaration
+  | "%start" symbol
+    {
+      #grammar_start_symbol_set($2, @2)
+      insert_after(1, '</AST_Section1Or2_Start>')
+      insert_after(1, '</AST_SymbolRef>')
+      insert_before(1, '<AST_SymbolRef>')
+      insert_before(0, '<AST_Section1Or2_Start>')
+    }
+  | code_props_type "{...}" generic_symlist
+    {
+      #code = None
+      #code_props_symbol_action_init(&code, $2, @2)
+      #code_props_translate_code(&code)
+      lex_yy_code.yyin = None
+      lex_yy_code.yy_buffer_stack = [lex_yy_code.YYBufferState()]
+      lex_yy_code.yytext = '' # fool unput()
+      lex_yy_code.unput($2)
+      lex_yy_code.sc_context = lex_yy_code.SC_SYMBOL_ACTION # CODE_PROPS_SYMBOL_ACTION
+      lex_yy_code.yylex(ast.AST.Text)
+      ref_list = []
+      element.serialize_ref(lex_yy_code.yy_element_space, ref_list)
+      del lex_yy_code.yy_element_space.attrib['ref']
+      lex_yy.gram_piece[gram_piece2 + 2] = xml.etree.ElementTree.tostring(
+        lex_yy_code.yy_element_space,
+        encoding = 'unicode'
+      )
+      #list = None
+      #list = $3
+      #while list:
+      #  symbol_list_code_props_set(list, $1, &code)
+      #  list = list->next
+      #symbol_list_free($3)
+      insert_after(2, '</AST_Section1Or2_CodeProps>')
+      insert_before(0, '<AST_Section1Or2_CodeProps _type="{0:d}">'.format($1))
+    }
+  | "%default-prec"
+    {
+      #default_prec = True
+      insert_after(0, '</AST_Section1Or2_DefaultPrec>')
+      insert_before(0, '<AST_Section1Or2_DefaultPrec value="true">')
+    }
+  | "%no-default-prec"
+    {
+      #default_prec = False
+      insert_after(0, '</AST_Section1Or2_DefaultPrec>')
+      insert_before(0, '<AST_Section1Or2_DefaultPrec>')
+    }
+  | "%code" "{...}"
+    {
+      #muscle_code_grow('percent_code()', translate_code_braceless($2, @2), @2)
+      lex_yy_code.yyin = None
+      lex_yy_code.yy_buffer_stack = [lex_yy_code.YYBufferState()]
+      lex_yy_code.yytext = '' # fool unput()
+      lex_yy_code.unput($2[1:-1])
+      lex_yy_code.sc_context = lex_yy_code.INITIAL # CODE_PROPS_PLAIN
+      lex_yy_code.yylex(ast.AST.Text)
+      ref_list = []
+      element.serialize_ref(lex_yy_code.yy_element_space, ref_list)
+      del lex_yy_code.yy_element_space.attrib['ref']
+      lex_yy.gram_piece[gram_piece2 + 2] = \
+        '<AST_BracedCode>{{{0:s}}}</AST_BracedCode>'.format(
+          xml.etree.ElementTree.tostring(
+            lex_yy_code.yy_element_space,
+            encoding = 'unicode'
+          )
+        )
+      insert_after(1, '</AST_Section1Or2_Code>')
+      insert_before(0, '<AST_Section1Or2_Code><AST_ID />')
+    }
+  | "%code" ID "{...}"
+    {
+      #muscle_percent_code_grow($2, @2, translate_code_braceless($3, @3), @3)
+      lex_yy_code.yyin = None
+      lex_yy_code.yy_buffer_stack = [lex_yy_code.YYBufferState()]
+      lex_yy_code.yytext = '' # fool unput()
+      lex_yy_code.unput($3[1:-1])
+      lex_yy_code.sc_context = lex_yy_code.INITIAL # CODE_PROPS_PLAIN
+      lex_yy_code.yylex(ast.AST.Text)
+      ref_list = []
+      element.serialize_ref(lex_yy_code.yy_element_space, ref_list)
+      del lex_yy_code.yy_element_space.attrib['ref']
+      lex_yy.gram_piece[gram_piece2 + 4] = \
+        '<AST_BracedCode>{{{0:s}}}</AST_BracedCode>'.format(
+          xml.etree.ElementTree.tostring(
+            lex_yy_code.yy_element_space,
+            encoding = 'unicode'
+          )
+        )
+      insert_after(2, '</AST_Section1Or2_Code>')
+      insert_before(0, '<AST_Section1Or2_Code>')
+    }
+  ;
+
+/*%type <code_type> code_props_type;*/
+/*%union {
+  code_type = 0
+};*/
+/*%printer {
+  rpl_fprintf(yyo, '%s', code_props_type_string($$))
+} <code_type>;*/
+
+code_props_type
+  : "%destructor"
+    {
+      $$ = state.destructor
+    }
+  | "%printer"
+    {
+      $$ = state.printer
+    }
+  ;
+
+/*---------.
+| %union.  |
+`---------*/
+
+%token PERCENT_UNION "%union";
+
+union_name
+  : %empty
+    {
+      insert_before(0, '<AST_ID />')
+    }
+  | ID
+    {
+      #muscle_percent_define_insert('api.value.union.name', @1, muscle_keyword, $1, MUSCLE_PERCENT_DEFINE_GRAMMAR_FILE)
+    }
+  ;
+
+grammar_declaration
+  : "%union" union_name "{...}"
+    {
+      #union_seen = True
+      #muscle_code_grow('union_members', translate_code_braceless($3, @3), @3)
+      lex_yy_code.yyin = None
+      lex_yy_code.yy_buffer_stack = [lex_yy_code.YYBufferState()]
+      lex_yy_code.yytext = '' # fool unput()
+      lex_yy_code.unput($3[1:-1])
+      lex_yy_code.sc_context = lex_yy_code.INITIAL # CODE_PROPS_PLAIN
+      lex_yy_code.yylex(ast.AST.Text)
+      ref_list = []
+      element.serialize_ref(lex_yy_code.yy_element_space, ref_list)
+      del lex_yy_code.yy_element_space.attrib['ref']
+      lex_yy.gram_piece[gram_piece2 + 4] = \
+        '<AST_BracedCode>{{{0:s}}}</AST_BracedCode>'.format(
+          xml.etree.ElementTree.tostring(
+            lex_yy_code.yy_element_space,
+            encoding = 'unicode'
+          )
+        )
+      insert_after(2, '</AST_Section1Or2_Union>')
+      insert_before(0, '<AST_Section1Or2_Union>')
+    }
+  ;
+
+symbol_declaration
+  : "%nterm"
+    {
+      #current_class = nterm_sym
+    }
+    symbol_defs.1
+    {
+      #current_class = unknown_sym
+      #current_type = None
+      insert_after(2, '</AST_Section1Or2_NTerm>')
+      insert_before(0, '<AST_Section1Or2_NTerm>')
+    }
+  | "%token"
+    {
+      #current_class = token_sym
+    }
+    symbol_defs.1
+    {
+      #current_class = unknown_sym
+      #current_type = None
+      insert_after(2, '</AST_Section1Or2_Token>')
+      insert_before(0, '<AST_Section1Or2_Token>')
+    }
+  | "%type" TAG symbols.1
+    {
+      #list = None
+      #tag_seen = True
+      #list = $3
+      #while list:
+      #  symbol_type_set(list->content.sym, $2, @2)
+      #  list = list->next
+      #symbol_list_free($3)
+      insert_after(2, '</AST_Section1Or2_Type>')
+      insert_before(0, '<AST_Section1Or2_Type>')
+    }
+  ;
+
+precedence_declaration
+  : precedence_declarator tag.opt symbols.prec
+    {
+      #list = None
+      #current_prec += 1
+      #list = $3
+      #while list:
+      #  symbol_type_set(list->content.sym, current_type, @2)
+      #  symbol_precedence_set(list->content.sym, current_prec, $1, @1)
+      #  list = list->next
+      #symbol_list_free($3)
+      #current_type = None
+      insert_after(2, '</AST_Section1Or2_Precedence>')
+      insert_before(0, '<AST_Section1Or2_Precedence _type="{0:d}">'.format(($1 & 3) - 1))
+    }
+  ;
+
+precedence_declarator
+  : "%left"
+    {
+      $$ = state.left_assoc
+    }
+  | "%right"
+    {
+      $$ = state.right_assoc
+    }
+  | "%nonassoc"
+    {
+      $$ = state.non_assoc
+    }
+  | "%precedence"
+    {
+      $$ = state.precedence_assoc
+    }
+  ;
+
+tag.opt
+  : %empty
+    {
+      #current_type = None
+    }
+  | TAG
+    {
+      #current_type = $1
+      #tag_seen = True
+    }
+  ;
+
+/* Just like symbols.1 but accept INT for the sake of POSIX.  */
+symbols.prec
+  : symbol.prec
+    {
+      #$$ = symbol_list_sym_new($1, @1)
+    }
+  | symbols.prec symbol.prec
+    {
+      #$$ = symbol_list_append($1, symbol_list_sym_new($2, @2))
+    }
+  ;
+
+symbol.prec
+  : symbol
+    {
+      #$$ = $1
+      #symbol_class_set($1, token_sym, @1, False)
+      insert_after(0, '</AST_SymbolRef>')
+      insert_before(0, '<AST_SymbolRef>')
+    }
+  | symbol INT
+    {
+      #$$ = $1
+      #symbol_user_token_number_set($1, $2, @2)
+      #symbol_class_set($1, token_sym, @1, False)
+      insert_after(1, '</AST_SymbolRef>')
+      insert_before(0, '<AST_SymbolRef user_token="{0:d}">'.format($2))
+    }
+  ;
+
+/* One or more symbols to be %typed. */
+symbols.1
+  : symbol
+    {
+      #$$ = symbol_list_sym_new($1, @1)
+      insert_after(0, '</AST_SymbolRef>')
+      insert_before(0, '<AST_SymbolRef>')
+    }
+  | symbols.1 symbol
+    {
+      #$$ = symbol_list_append($1, symbol_list_sym_new($2, @2))
+      insert_after(1, '</AST_SymbolRef>')
+      insert_before(1, '<AST_SymbolRef>')
+    }
+  ;
+
+generic_symlist
+  : generic_symlist_item
+    {
+      #$$ = $1
+    }
+  | generic_symlist generic_symlist_item
+    {
+      #$$ = symbol_list_append($1, $2)
+    }
+  ;
+
+generic_symlist_item
+  : symbol
+    {
+      #$$ = symbol_list_sym_new($1, @1)
+      insert_after(0, '</AST_SymbolRef>')
+      insert_before(0, '<AST_SymbolRef>')
+    }
+  | tag
+    {
+      #$$ = symbol_list_type_new($1, @1)
+    }
+  ;
+
+tag
+  : TAG
+  | "<*>"
+    {
+      #$$ = uniqstr_new('*')
+    }
+  | "<>"
+    {
+      #$$ = uniqstr_new('')
+    }
+  ;
+
+/* One token definition.  */
+symbol_def
+  : TAG
+    {
+      #current_type = $1
+      #tag_seen = True
+    }
+  | id
+    {
+      #symbol_class_set($1, current_class, @1, True)
+      #symbol_type_set($1, current_type, @1)
+      insert_after(0, '</AST_SymbolRef>')
+      insert_before(0, '<AST_SymbolRef>')
+    }
+  | id INT
+    {
+      #symbol_class_set($1, current_class, @1, True)
+      #symbol_type_set($1, current_type, @1)
+      #symbol_user_token_number_set($1, $2, @2)
+      insert_after(1, '</AST_SymbolRef>')
+      insert_before(0, '<AST_SymbolRef user_token="{0:d}">'.format($2))
+    }
+  | id string_as_id
+    {
+      #symbol_class_set($1, current_class, @1, True)
+      #symbol_type_set($1, current_type, @1)
+      #symbol_make_alias($1, $2, @$)
+      insert_after(1, '</AST_SymbolRef>')
+      insert_before(0, '<AST_SymbolRef>')
+    }
+  | id INT string_as_id
+    {
+      #symbol_class_set($1, current_class, @1, True)
+      #symbol_type_set($1, current_type, @1)
+      #symbol_user_token_number_set($1, $2, @2)
+      #symbol_make_alias($1, $3, @$)
+      insert_after(2, '</AST_SymbolRef>')
+      insert_before(0, '<AST_SymbolRef user_token="{0:d}">'.format($2))
+    }
+  ;
+
+/* One or more symbol definitions. */
+symbol_defs.1
+  : symbol_def
+  | symbol_defs.1 symbol_def
+  ;
+
+        /*------------------------------------------.
+        | The grammar section: between the two %%.  |
+        `------------------------------------------*/
+
+grammar
+  : rules_or_grammar_declaration
+  | grammar rules_or_grammar_declaration
+  ;
+
+/* As a Bison extension, one can use the grammar declarations in the
+   body of the grammar.  */
+rules_or_grammar_declaration
+  : rules
+  | grammar_declaration ";"
+  | error ";"
+    {
+      #yyerrok
+    }
+  ;
+
+rules
+  : id_colon named_ref.opt
+    {
+      #current_lhs($1, @1, $2)
+    }
+    rhses.1
+    {
+      #current_lhs(0, @1, 0)
+      insert_after(3, '</AST_Section2_Rules>')
+      insert_after(0, '</AST_SymbolRef>')
+      insert_before(0, '<AST_SymbolRef>')
+      insert_before(0, '<AST_Section2_Rules>')
+    }
+  ;
+
+rhses.1
+  : rhs
+    {
+      #grammar_current_rule_end(@1)
+      insert_after(0, '</AST_Production>')
+      insert_before(0, '<AST_Production>')
+    }
+  | rhses.1 "|" rhs
+    {
+      #grammar_current_rule_end(@3)
+      insert_after(2, '</AST_Production>')
+      insert_before(2, '<AST_Production>')
+    }
+  | rhses.1 ";"
+  ;
+
+%token PERCENT_EMPTY "%empty";
+/* Nick added %space */
+%token PERCENT_SPACE "%space";
+
+rhs
+  : %empty
+    {
+      global yychar
+      #if nested_rhs:
+      #  nested_rhs -= 1
+      #else:
+      #  grammar_current_rule_begin(current_lhs_symbol, current_lhs_location, current_lhs_named_ref)
+      if yychar == YYEMPTY:
+        yychar = lex_yy.gram_lex()
+      temp = lex_yy.gram_piece[gram_piece2 + 1]
+      lex_yy.gram_piece[gram_piece2 + 1] = lex_yy.gram_piece[gram_piece2]
+      lex_yy.gram_piece[gram_piece2] = lex_yy.gram_piece[gram_piece2 - 1]
+      lex_yy.gram_piece[gram_piece2 - 1] = temp
+    }
+  | rhs symbol named_ref.opt
+    {
+      #grammar_current_rule_symbol_append($2, @2, $3)
+      insert_after(2, '</AST_Production_SymbolRef>')
+      insert_after(1, '</AST_SymbolRef>')
+      insert_before(1, '<AST_Production_SymbolRef><AST_SymbolRef>')
+    }
+  | rhs "{...}" named_ref.opt
+    {
+      #grammar_current_rule_action_append($2, @2, $3, False)
+      lex_yy_code.yyin = None
+      lex_yy_code.yy_buffer_stack = [lex_yy_code.YYBufferState()]
+      lex_yy_code.yytext = '' # fool unput()
+      lex_yy_code.unput($2)
+      lex_yy_code.sc_context = lex_yy_code.SC_RULE_ACTION # CODE_PROPS_RULE_ACTION
+      lex_yy_code.yylex(ast.AST.Text)
+      ref_list = []
+      element.serialize_ref(lex_yy_code.yy_element_space, ref_list)
+      del lex_yy_code.yy_element_space.attrib['ref']
+      lex_yy.gram_piece[gram_piece2 + 2] = xml.etree.ElementTree.tostring(
+        lex_yy_code.yy_element_space,
+        encoding = 'unicode'
+      )
+      insert_after(2, '</AST_Production_Action>')
+      insert_before(1, '<AST_Production_Action>')
+    }
+  | rhs "%?{...}"
+    {
+      #grammar_current_rule_action_append($2, @2, None, True)
+    }
+  | rhs "%empty"
+    {
+      #grammar_current_rule_empty_set(@2)
+      insert_after(1, '</AST_Production_Empty>')
+      insert_before(1, '<AST_Production_Empty>')
+    }
+  | rhs "%prec" symbol
+    {
+      #grammar_current_rule_prec_set($3, @3)
+      insert_after(2, '</AST_Production_Prec>')
+      insert_after(2, '</AST_SymbolRef>')
+      insert_before(2, '<AST_SymbolRef>')
+      insert_before(1, '<AST_Production_Prec>')
+    }
+  | rhs "%dprec" INT
+    {
+      #grammar_current_rule_dprec_set($3, @3)
+      insert_after(2, '</AST_Production_DPrec>')
+      insert_before(1, '<AST_Production_DPrec value="{0:d}">'.format($3))
+    }
+  | rhs "%merge" TAG
+    {
+      #grammar_current_rule_merge_set($3, @3)
+      insert_after(2, '</AST_Production_Merge>')
+      insert_before(1, '<AST_Production_Merge>')
+    }
+  /* Nick extra rules for element groups */
+  | rhs '('
+    /*{
+      #nested_rhs += 1
+    }*/
+    rhs ')'
+    {
+      lex_yy_code.yyin = None
+      lex_yy_code.yy_buffer_stack = [lex_yy_code.YYBufferState()]
+      lex_yy_code.yytext = '' # fool unput()
+      lex_yy_code.unput($2[4:-1])
+      lex_yy_code.sc_context = lex_yy_code.SC_RULE_ACTION # CODE_PROPS_RULE_ACTION
+      lex_yy_code.yylex(ast.AST.Text)
+      ref_list = []
+      element.serialize_ref(lex_yy_code.yy_element_space, ref_list)
+      del lex_yy_code.yy_element_space.attrib['ref']
+      lex_yy.gram_piece[gram_piece2 + 2] = \
+        '(?E{{{0:s}}}'.format(
+          xml.etree.ElementTree.tostring(
+            lex_yy_code.yy_element_space,
+            encoding = 'unicode'
+          )
+        )
+      insert_after(3, '</AST_Production_GroupElement>')
+      insert_before(1, '<AST_Production_GroupElement>')
+    }
+  /* Nick added %space */
+  | rhs "%space"
+    {
+      insert_after(1, '</AST_Production_Space>')
+      insert_before(1, '<AST_Production_Space>')
+    }
+  ;
+
+named_ref.opt
+  : %empty
+    {
+      #$$ = 0
+    }
+  | BRACKETED_ID
+    {
+      #$$ = named_ref_new($1, @1)
+    }
+  ;
+
+/*---------------------.
+| variable and value.  |
+`---------------------*/
+
+/* The STRING form of variable is deprecated and is not M4-friendly.
+   For example, M4 fails for '%define "[" "value"'.  */
+variable
+  : ID
+  | STRING
+    {
+      #$$ = uniqstr_new($1)
+    }
+  ;
+
+/* Some content or empty by default. */
+%code requires {
+};
+/*%union
+{
+  value = 0
+};*/
+/*%type <value> value;*/
+/*%printer
+{
+  if $$.kind == muscle_code:
+    rpl_fprintf(yyo, '{%s}', $$.chars)
+    break
+  elif $$.kind == muscle_keyword:
+    rpl_fprintf(yyo, '%s', $$.chars)
+    break
+  elif $$.kind == muscle_string:
+    rpl_fprintf(yyo, '"%s"', $$.chars)
+    break
+} <value>;*/
+
+value
+  : %empty
+    {
+      #$$.kind = muscle_keyword
+      #$$.chars = ''
+    }
+  | ID
+    {
+      #$$.kind = muscle_keyword
+      #$$.chars = $1
+    }
+  | STRING
+    {
+      #$$.kind = muscle_string
+      #$$.chars = $1
+    }
+  | "{...}"
+    {
+      #$$.kind = muscle_code
+      #$$.chars = strip_braces($1)
+    }
+  ;
+
+/*--------------.
+| Identifiers.  |
+`--------------*/
+
+/* Identifiers are returned as uniqstr values by the scanner.
+   Depending on their use, we may need to make them genuine symbols.  */
+
+id
+  : ID
+    {
+      #$$ = symbol_from_uniqstr($1, @1)
+    }
+  | CHAR
+    {
+      #$$ = symbol_get(char_name($1), @1)
+      #symbol_class_set($$, token_sym, @1, False)
+      #symbol_user_token_number_set($$, $1, @1)
+    }
+  ;
+
+id_colon
+  : ID_COLON
+    {
+      #$$ = symbol_from_uniqstr($1, @1)
+    }
+  ;
+
+symbol
+  : id
+  | string_as_id
+  ;
+
+/* A string used as an ID: quote it.  */
+string_as_id
+  : STRING
+    {
+      #$$ = symbol_get(quotearg_style(c_quoting_style, $1), @1)
+      #symbol_class_set($$, token_sym, @1, False)
+    }
+  ;
+
+epilogue.opt
+  : %empty
+  | "%%" EPILOGUE
+    {
+      #muscle_code_grow('epilogue', translate_code($2, @2, True), @2)
+      #code_scanner_last_string_free()
+      insert_after(1, '</AST_Section3>')
+      insert_after(0, '<AST_Section3>')
+    }
+  ;
+
+%%
+
+#def lloc_default(rhs, n):
+#  i = None
+#  loc = None
+#  loc.start = rhs[n].end
+#  loc.end = rhs[n].end
+#  i = 1
+#  while i <= n:
+#    if not equal_boundaries(rhs[i].start, rhs[i].end):
+#      loc.start = rhs[i].start
+#      break
+#    i += 1
+#  return loc
+#
+#def strip_braces(code):
+#  code[len(code) - 1] = 0
+#  return code + 1
+#
+#def translate_code(code, loc, plain):
+#  plain_code = None
+#  if plain:
+#    code_props_plain_init(&plain_code, code, loc)
+#  else:
+#    code_props_symbol_action_init(&plain_code, code, loc)
+#  code_props_translate_code(&plain_code)
+#  lex_yy.gram_scanner_last_string_free()
+#  return plain_code.code
+#
+#def translate_code_braceless(code, loc):
+#  return translate_code(strip_braces(code), loc, True)
+#
+#def add_param(type, decl, loc):
+#  alphanum = 'abcdefghijklmnopqrstuvwxyz' 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' '_' '0123456789'
+#  name_start = None
+#  p = None
+#  p = decl
+#  while p[1]:
+#    if (p == decl or not memchr(alphanum, p[-1], sizeof alphanum - 1)) and memchr(alphanum, p[0], sizeof alphanum - 10 - 1):
+#      name_start = p
+#    p += 1
+#  p -= 1
+#  while c_isspace(int(*p)):
+#    p -= 1
+#  p[1] = ord('\0')
+#  decl += 1
+#  while c_isspace(int(*decl)):
+#    decl += 1
+#  if not name_start:
+#    complain(&loc, complaint, gettext('missing identifier in parameter declaration'))
+#  else:
+#    name = xmemdup0(name_start, strspn(name_start, alphanum))
+#    if type & param_lex:
+#      muscle_pair_list_grow('lex_param', decl, name)
+#    if type & param_parse:
+#      muscle_pair_list_grow('parse_param', decl, name)
+#    free(name)
+#  lex_yy.gram_scanner_last_string_free()
+#
+#def version_check(loc, version):
+#  if strverscmp(version, '3.0.5') > 0:
+#    complain(loc, complaint, 'require bison %s, but have %s', version, '3.0.5')
+#    exit(63)
+#
+#def gram_error(loc, msg):
+#  complain(loc, complaint, '%s', msg)
+#
+#def token_name(type):
+#  return yytname[YYTRANSLATE(type)]
+#
+#def char_name(c):
+#  if c == ord('\''):
+#    return '\'\\\'\''
+#  else:
+#    buf = [None, None, None, None]
+#    buf[0] = ord('\'')
+#    buf[1] = c
+#    buf[2] = ord('\'')
+#    buf[3] = ord('\0')
+#    return quotearg_style(escape_quoting_style, buf)
+#
+#def current_lhs(sym, loc, ref):
+#  current_lhs_symbol = sym
+#  current_lhs_location = loc
+#  free(current_lhs_named_ref)
+#  current_lhs_named_ref = ref
+
+# Insert the markup string just before rule item n in lex_yy.gram_piece.
+# The index scaling (gram_piece2 + n * 2) pairs with insert_after()'s
+# (gram_piece2 + n * 2 + 1) — presumably even slots hold item text and odd
+# slots hold trailing markup; confirm against lex_yy.gram_piece_insert().
+# Both lex_yy.gram_piece0 and gram_piece3 are bumped to account for the
+# extra list entry.  NOTE(review): parameter 'str' shadows the builtin.
+def insert_before(n, str):
+  global gram_piece3
+  lex_yy.gram_piece_insert(gram_piece2 + n * 2, str)
+  lex_yy.gram_piece0 += 1
+  gram_piece3 += 1
+
+# Insert the markup string just after rule item n in lex_yy.gram_piece
+# (same indexing scheme as insert_before(), offset by one slot), and bump
+# both piece counters for the extra list entry.
+# NOTE(review): parameter 'str' shadows the builtin.
+def insert_after(n, str):
+  global gram_piece3
+  lex_yy.gram_piece_insert(gram_piece2 + n * 2 + 1, str)
+  lex_yy.gram_piece0 += 1
+  gram_piece3 += 1
diff --git a/reserialize.py b/reserialize.py
new file mode 100755 (executable)
index 0000000..b8346d3
--- /dev/null
@@ -0,0 +1,7 @@
+#!/usr/bin/env python3
+
+# Round-trip an XML-serialized AST: deserialize a document from stdin using
+# the project's node factory, then serialize it back to stdout.  Useful for
+# checking that deserialize/serialize are inverses.
+# NOTE(review): 'ast' here must be the repository-local ast module (it
+# shadows the stdlib 'ast', which has no 'factory' attribute) — confirm.
+import ast
+import element
+import sys
+
+element.serialize(element.deserialize(sys.stdin, ast.factory), sys.stdout)
diff --git a/scan-code.l b/scan-code.l
new file mode 100644 (file)
index 0000000..244c649
--- /dev/null
@@ -0,0 +1,128 @@
+/* Bison Action Scanner                             -*- C -*-
+
+   Copyright (C) 2006-2015, 2018 Free Software Foundation, Inc.
+
+   This file is part of Bison, the GNU Compiler Compiler.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+%option debug nodefault noinput nounput noyywrap never-interactive
+
+%{
+  import ast
+
+  sc_context = -1
+%}
+
+%x SC_COMMENT SC_LINE_COMMENT
+%x SC_STRING SC_CHARACTER
+%x SC_RULE_ACTION SC_SYMBOL_ACTION
+
+/* POSIX says that a tag must be both an id and a C union member, but
+   historically almost any character is allowed in a tag.  We disallow
+   NUL and newline, as this simplifies our implementation.  We allow
+   "->" as a means to dereference a pointer.  */
+tag      (?:[^\0\n>]|->)+
+
+/* Zero or more instances of backslash-newline.  Following GCC, allow
+   white space between the backslash and the newline.  */
+splice   (?:\\[ \f\t\v]*\n)*
+
+/* C style identifier. Must start with letter. Will be used for
+   named symbol references. Shall be kept synchronized with
+   scan-gram.l "letter" and "id". */
+letter    [.abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_]
+id        {letter}(?:{letter}|[-0-9])*
+
+%%
+
+%{
+  assert sc_context == SC_SYMBOL_ACTION or sc_context == SC_RULE_ACTION or sc_context == INITIAL
+  BEGIN(sc_context)
+%}
+
+  /*------------------------------------------------------------.
+  | Scanning a C comment.  The initial '/ *' is already eaten.  |
+  `------------------------------------------------------------*/
+
+<SC_COMMENT>
+{
+  "*"{splice}"/"               BEGIN(sc_context)
+}
+
+  /*--------------------------------------------------------------.
+  | Scanning a line comment.  The initial '//' is already eaten.  |
+  `--------------------------------------------------------------*/
+
+<SC_LINE_COMMENT>
+{
+  "\n"                         BEGIN(sc_context)
+  {splice}
+}
+
+  /*--------------------------------------------.
+  | Scanning user-code characters and strings.  |
+  `--------------------------------------------*/
+
+<SC_CHARACTER,SC_STRING>
+{
+  {splice}|\\{splice}.
+}
+
+<SC_CHARACTER>
+{
+  "'"                          BEGIN(sc_context)
+}
+
+<SC_STRING>
+{
+  "\""                         BEGIN(sc_context)
+}
+
+<SC_RULE_ACTION,SC_SYMBOL_ACTION>
+{
+  "'"                          BEGIN(SC_CHARACTER)
+  "\""                         BEGIN(SC_STRING)
+  "/"{splice}"*"               BEGIN(SC_COMMENT)
+  "/"{splice}"/"               BEGIN(SC_LINE_COMMENT)
+
+  [$@] {
+    state.complain(yylloc, state.Wother, 'stray \'{0:s}\''.format(yytext))
+  }
+}
+
+<SC_RULE_ACTION>
+{
+  (?E{
+    ast.AST.Text.StackReference,
+    tag_name = '' if yy_groups[2] is None else yy_groups[2][1:-1],
+    index = int(yy_groups[3])
+  }"$"("<"{tag}">")?(-?[0-9]+|{id}|"["{id}"]"))
+  (?E{
+    ast.AST.Text.StackLocation,
+    index = int(yy_groups[2])
+  }"@"(-?[0-9]+|{id}|"["{id}"]"))
+}
+
+<SC_RULE_ACTION,SC_SYMBOL_ACTION>
+{
+  (?E{
+    ast.AST.Text.ValueReference,
+    tag_name = '' if yy_groups[2] is None else yy_groups[2][1:-1]
+  }"$"("<"{tag}">")?"$")
+  (?E{
+    ast.AST.Text.ValueLocation
+  }"@$")
+}
+
+<*>
+{
+  .|\n
+}
diff --git a/scan-gram.l b/scan-gram.l
new file mode 100644 (file)
index 0000000..6b2b1b8
--- /dev/null
@@ -0,0 +1,1060 @@
+/* Bison Grammar Scanner                             -*- C -*-
+
+   Copyright (C) 2002-2015, 2018 Free Software Foundation, Inc.
+
+   This file is part of Bison, the GNU Compiler Compiler.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+%option debug nodefault noinput noyywrap never-interactive
+
+%{
+  import state
+  import y_tab
+
+  scanner_cursor = state.boundary()
+  gram_last_string = ''
+  bracketed_id_str = None
+  bracketed_id_loc = 0
+  bracketed_id_start = 0
+  bracketed_id_context_state = -1
+
+  obstack_for_string = []
+  #def gram_scanner_last_string_free():
+  #  del obstack_for_string[:]
+
+  gram_piece = []
+  gram_piece0 = 0
+  gram_piece1 = 0
+
+  percent_percent_count = 0;
+
+  # these should be yylex()-local, but moved to here, see further down:
+  nesting = 0
+  context_state = -1
+  id_loc = state.location()
+  code_start = scanner_cursor.copy()
+  token_start = scanner_cursor.copy()
+  #first = True
+  if True: #first:
+    scanner_cursor = y_tab.yylloc.start.copy()
+    #first = False
+%}
+
+%x SC_YACC_COMMENT
+%x SC_ESCAPED_STRING SC_ESCAPED_CHARACTER
+%x SC_AFTER_IDENTIFIER
+
+%x SC_TAG
+
+%x SC_PROLOGUE SC_BRACED_CODE SC_EPILOGUE SC_PREDICATE
+%x SC_COMMENT SC_LINE_COMMENT
+%x SC_STRING SC_CHARACTER
+%x SC_BRACKETED_ID SC_RETURN_BRACKETED_ID
+%x SC_ELEMENT_GROUP
+
+letter    [.abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_]
+notletter [^.abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_]{-}[%\{]
+id        {letter}({letter}|[-0-9])*
+int       [0-9]+
+
+/* Zero or more instances of backslash-newline.  Following GCC, allow
+   white space between the backslash and the newline.  */
+splice   (\\[ \f\t\v]*\n)*
+
+/* An equal sign, with optional leading whitespaces. This is used in some
+   deprecated constructs. */
+eqopt    ([[:space:]]*=)?
+
+%%
+
+%{
+  # these should be here, but we can't access yylex()-local variables
+  # from an action since the action functions are not nested to yylex():
+  #nesting = 0
+  #context_state = 0
+  #id_loc = state.location()
+  #code_start = scanner_cursor.copy()
+  #token_start = scanner_cursor.copy()
+  #first = True
+  #if first:
+  #  scanner_cursor = y_tab.yylloc.start.copy()
+  #  first = False
+%}
+
+<INITIAL,SC_AFTER_IDENTIFIER,SC_BRACKETED_ID,SC_RETURN_BRACKETED_ID>
+{
+  /* Comments and white space.  */
+  ","                          state.complain(state.loc, state.Wother, 'stray \',\' treated as white space')
+  [ \f\n\t\v]                  |
+  "//".*                       #continue
+  "/*" {
+    global token_start, context_state
+    token_start = y_tab.yylloc.start
+    context_state = YY_START()
+    BEGIN(SC_YACC_COMMENT)
+  }
+
+  /* #line directives are not documented, and may be withdrawn or
+     modified in future versions of Bison.  */
+  ^"#line "{int}(" \"".*"\"")?"\n" #handle_syncline(yytext + sizeof '#line ' - 1, y_tab.yylloc)
+}
+
+
+  /*----------------------------.
+  | Scanning Bison directives.  |
+  `----------------------------*/
+
+  /* For directives that are also command line options, the regex must be
+        "%..."
+     after "[-_]"s are removed, and the directive must match the --long
+     option name, with a single string argument.  Otherwise, add exceptions
+     to ../build-aux/cross-options.pl.  */
+
+<INITIAL>
+{
+  "%binary"                    return y_tab.PERCENT_NONASSOC
+  "%code"                      return y_tab.PERCENT_CODE
+  "%debug" {
+    y_tab.yylval = 'parse.trace'
+    return y_tab.PERCENT_FLAG
+  }
+  "%default-prec"              return y_tab.PERCENT_DEFAULT_PREC
+  "%define"                    return y_tab.PERCENT_DEFINE
+  "%defines"                   return y_tab.PERCENT_DEFINES
+  "%destructor"                        return y_tab.PERCENT_DESTRUCTOR
+  "%dprec"                     return y_tab.PERCENT_DPREC
+  "%empty"                     return y_tab.PERCENT_EMPTY
+  "%error-verbose"             return y_tab.PERCENT_ERROR_VERBOSE
+  "%expect"                    return y_tab.PERCENT_EXPECT
+  "%expect-rr"                 return y_tab.PERCENT_EXPECT_RR
+  "%file-prefix"               return y_tab.PERCENT_FILE_PREFIX
+  "%fixed-output-files"                return y_tab.PERCENT_YACC
+  "%initial-action"            return y_tab.PERCENT_INITIAL_ACTION
+  "%glr-parser"                        return y_tab.PERCENT_GLR_PARSER
+  "%language"                  return y_tab.PERCENT_LANGUAGE
+  "%left"                      return y_tab.PERCENT_LEFT
+  "%lex-param" {
+    y_tab.yylval = y_tab.param_lex
+    return y_tab.PERCENT_PARAM
+  }
+  "%locations" {
+    y_tab.yylval = 'locations'
+    return y_tab.PERCENT_FLAG
+  }
+  "%merge"                     return y_tab.PERCENT_MERGE
+  "%name-prefix"               return y_tab.PERCENT_NAME_PREFIX
+  "%no-default-prec"           return y_tab.PERCENT_NO_DEFAULT_PREC
+  "%no-lines"                  return y_tab.PERCENT_NO_LINES
+  "%nonassoc"                  return y_tab.PERCENT_NONASSOC
+  "%nondeterministic-parser"   return y_tab.PERCENT_NONDETERMINISTIC_PARSER
+  "%nterm"                     return y_tab.PERCENT_NTERM
+  "%output"                    return y_tab.PERCENT_OUTPUT
+  "%param" {
+    y_tab.yylval = y_tab.param_both
+    return y_tab.PERCENT_PARAM
+  }
+  "%parse-param" {
+    y_tab.yylval = y_tab.param_parse
+    return y_tab.PERCENT_PARAM
+  }
+  "%prec"                      return y_tab.PERCENT_PREC
+  "%precedence"                        return y_tab.PERCENT_PRECEDENCE
+  "%printer"                   return y_tab.PERCENT_PRINTER
+  "%pure-parser" {
+    y_tab.yylval = 'api.pure'
+    return y_tab.PERCENT_FLAG
+  }
+  "%require"                   return y_tab.PERCENT_REQUIRE
+  "%right"                     return y_tab.PERCENT_RIGHT
+  "%skeleton"                  return y_tab.PERCENT_SKELETON
+  /* Nick added %space */
+  "%space"                     return y_tab.PERCENT_SPACE
+  "%start"                     return y_tab.PERCENT_START
+  "%term"                      return y_tab.PERCENT_TOKEN
+  "%token"                     return y_tab.PERCENT_TOKEN
+  "%token-table"               return y_tab.PERCENT_TOKEN_TABLE
+  "%type"                      return y_tab.PERCENT_TYPE
+  "%union"                     return y_tab.PERCENT_UNION
+  "%verbose"                   return y_tab.PERCENT_VERBOSE
+  "%yacc"                      return y_tab.PERCENT_YACC
+
+  /* deprecated */
+  "%default"[-_]"prec" {
+    #deprecated_directive(loc, yytext, '%default-prec')
+    scanner_cursor.column -= len('%default-prec')
+    unput('%default-prec')
+  }
+  "%error"[-_]"verbose" {
+    #deprecated_directive(loc, yytext, '%define parse.error verbose')
+    scanner_cursor.column -= len('%define parse.error verbose')
+    unput('%define parse.error verbose')
+  }
+  "%expect"[-_]"rr" {
+    #deprecated_directive(loc, yytext, '%expect-rr')
+    scanner_cursor.column -= len('%expect-rr')
+    unput('%expect-rr')
+  }
+  "%file-prefix"{eqopt} {
+    #deprecated_directive(loc, yytext, '%file-prefix')
+    scanner_cursor.column -= len('%file-prefix')
+    unput('%file-prefix')
+  }
+  "%fixed"[-_]"output"[-_]"files" {
+    #deprecated_directive(loc, yytext, '%fixed-output-files')
+    scanner_cursor.column -= len('%fixed-output-files')
+    unput('%fixed-output-files')
+  }
+  "%name"[-_]"prefix"{eqopt} {
+    #deprecated_directive(loc, yytext, '%name-prefix')
+    scanner_cursor.column -= len('%name-prefix')
+    unput('%name-prefix')
+  }
+  "%no"[-_]"default"[-_]"prec" {
+    #deprecated_directive(loc, yytext, '%no-default-prec')
+    scanner_cursor.column -= len('%no-default-prec')
+    unput('%no-default-prec')
+  }
+  "%no"[-_]"lines" {
+    #deprecated_directive(loc, yytext, '%no-lines')
+    scanner_cursor.column -= len('%no-lines')
+    unput('%no-lines')
+  }
+  "%output"{eqopt} {
+    #deprecated_directive(loc, yytext, '%output')
+    scanner_cursor.column -= len('%output')
+    unput('%output')
+  }
+  "%pure"[-_]"parser" {
+    #deprecated_directive(loc, yytext, '%pure-parser')
+    scanner_cursor.column -= len('%pure-parser')
+    unput('%pure-parser')
+  }
+  "%token"[-_]"table" {
+    #deprecated_directive(loc, yytext, '%token-table')
+    scanner_cursor.column -= len('%token-table')
+    unput('%token-table')
+  }
+
+  "%"{id}                      state.complain(y_tab.yylloc, state.complaint, 'invalid directive: {0:s}'.format(state.quote(yytext)))
+
+  "="                          return y_tab.EQUAL
+  "|"                          return y_tab.PIPE
+  ";"                          return y_tab.SEMICOLON
+
+  {id} {
+    global id_loc, bracketed_id_str
+    y_tab.yylval = yytext
+    id_loc = y_tab.yylloc
+    bracketed_id_str = None
+    BEGIN(SC_AFTER_IDENTIFIER)
+    gram_piece_pack()
+    gram_piece_append('<AST_ID>')
+    gram_piece_flush(len(yytext))
+    gram_piece_append('</AST_ID>')
+    gram_piece_pack()
+  }
+
+  {int} {
+    y_tab.yylval = scan_integer(yytext, 10, y_tab.yylloc)
+    return y_tab.INT
+  }
+  0[xX][0-9abcdefABCDEF]+ {
+    y_tab.yylval = scan_integer(yytext, 16, y_tab.yylloc)
+    return y_tab.INT
+  }
+
+  /* Identifiers may not start with a digit.  Yet, don't silently
+     accept "1FOO" as "1 FOO".  */
+  {int}{id}                    state.complain(y_tab.yylloc, state.complaint, 'invalid identifier: {0:s}'.format(state.quote(yytext)))
+
+  /* Characters.  */
+  "'" {
+    global token_start
+    token_start = y_tab.yylloc.start
+    BEGIN(SC_ESCAPED_CHARACTER)
+    gram_piece_pack()
+    gram_piece_append('<AST_Char>')
+    gram_piece_flush(len(yytext))
+    gram_piece_append('<AST_Text>')
+  }
+
+  /* Strings. */
+  "\"" {
+    global token_start
+    token_start = y_tab.yylloc.start
+    BEGIN(SC_ESCAPED_STRING)
+    gram_piece_pack()
+    gram_piece_append('<AST_String>')
+    gram_piece_flush(len(yytext))
+    gram_piece_append('<AST_Text>')
+  }
+
+  /* Prologue. */
+  "%{" {
+    global code_start
+    code_start = y_tab.yylloc.start
+    BEGIN(SC_PROLOGUE)
+    gram_piece_pack()
+    gram_piece_append('<AST_Section1_Prologue>')
+    gram_piece_flush(len(yytext))
+    gram_piece_append('<AST_Text>')
+  }
+
+  /* Code in between braces.  */
+  "{" {
+    global nesting, code_start
+    obstack_for_string.append(yytext)
+    nesting = 0
+    code_start = y_tab.yylloc.start
+    BEGIN(SC_BRACED_CODE)
+    gram_piece_pack()
+    gram_piece_append('<AST_BracedCode>')
+    gram_piece_flush(len(yytext))
+    gram_piece_append('<AST_Text>')
+  }
+
+  /* Semantic predicate. */
+  "%?"[ \f\n\t\v]*"{" {
+    global nesting, code_start
+    nesting = 0
+    code_start = y_tab.yylloc.start
+    gram_piece_pack()
+    gram_piece_append('<AST_BracedPredicate>')
+    gram_piece_flush(len(yytext))
+    gram_piece_append('<AST_Text>')
+    BEGIN(SC_PREDICATE)
+  }
+
+  /* Nick extra rules for element groups */
+  "(?E{" {
+    global nesting, code_start
+    obstack_for_string.append(yytext)
+    nesting = 0
+    code_start = y_tab.yylloc.start
+    BEGIN(SC_ELEMENT_GROUP)
+    gram_piece_pack()
+    gram_piece_flush(len(yytext))
+    gram_piece_append('<AST_Text>')
+  }
+  ")"                          return ord(')')
+
+  /* A type. */
+  "<*>" {
+    gram_piece_pack()
+    gram_piece_append('<AST_TagRef>&lt;<AST_Text>*</AST_Text>&gt;</AST_TagRef>')
+    return ~y_tab.TAG_ANY
+  }
+  "<>" {
+    gram_piece_pack()
+    gram_piece_append('<AST_TagRef>&lt;<AST_Text />&gt;</AST_TagRef>')
+    return ~y_tab.TAG_NONE
+  }
+  "<" {
+    global nesting, token_start
+    nesting = 0
+    token_start = y_tab.yylloc.start
+    BEGIN(SC_TAG)
+    gram_piece_pack()
+    gram_piece_append('<AST_TagRef>')
+    gram_piece_flush(len(yytext))
+    gram_piece_append('<AST_Text>')
+  }
+
+  "%%" {
+    global percent_percent_count
+    percent_percent_count += 1
+    if percent_percent_count == 2:
+      BEGIN(SC_EPILOGUE)
+      gram_piece_pack()
+      gram_piece_escape(yytext)
+      gram_piece_pack()
+      gram_piece_pack()
+      return ~y_tab.PERCENT_PERCENT
+    return y_tab.PERCENT_PERCENT
+  }
+
+  "[" {
+    global bracketed_id_str, bracketed_id_start, bracketed_id_context_state
+    bracketed_id_str = None
+    bracketed_id_start = y_tab.yylloc.start
+    bracketed_id_context_state = YY_START()
+    BEGIN(SC_BRACKETED_ID)
+  }
+
+  [^\[%A-Za-z0-9_<>{}\"\'*;|=/, \f\n\t\v]+|. state.complain(y_tab.yylloc, state.complaint, '{0:s}: {1:s}'.format('invalid character' if len(yytext) == 1 else 'invalid characters', state.quote(yytext)))
+
+  <<EOF>> {
+    y_tab.yylloc.start = scanner_cursor.copy()
+    y_tab.yylloc.end = scanner_cursor.copy()
+    yyterminate()
+  }
+}
+
+
+  /*--------------------------------------------------------------.
+  | Supporting \0 complexifies our implementation for no expected |
+  | added value.                                                  |
+  `--------------------------------------------------------------*/
+
+<SC_ESCAPED_CHARACTER,SC_ESCAPED_STRING,SC_TAG>
+{
+  \0                           state.complain(y_tab.yylloc, state.complaint, 'invalid null character')
+}
+
+
+  /*-----------------------------------------------------------------.
+  | Scanning after an identifier, checking whether a colon is next.  |
+  `-----------------------------------------------------------------*/
+
+<SC_AFTER_IDENTIFIER>
+{
+  "[" {
+    global bracketed_id_start, bracketed_id_context_state
+    if bracketed_id_str is not None:
+      scanner_cursor.column -= len(yytext)
+      yyless(0)
+      BEGIN(SC_RETURN_BRACKETED_ID)
+      y_tab.yylloc = id_loc
+      return y_tab.ID
+    else:
+      bracketed_id_start = y_tab.yylloc.start
+      bracketed_id_context_state = YY_START()
+      BEGIN(SC_BRACKETED_ID)
+  }
+  ":" {
+    BEGIN(SC_RETURN_BRACKETED_ID if bracketed_id_str else INITIAL)
+    y_tab.yylloc = id_loc
+    gram_piece_escape(yytext)
+    return ~y_tab.ID_COLON
+  }
+  . {
+    scanner_cursor.column -= len(yytext)
+    yyless(0)
+    BEGIN(SC_RETURN_BRACKETED_ID if bracketed_id_str else INITIAL)
+    y_tab.yylloc = id_loc
+    return ~y_tab.ID
+  }
+  <<EOF>> {
+    BEGIN(SC_RETURN_BRACKETED_ID if bracketed_id_str else INITIAL)
+    y_tab.yylloc = id_loc
+    return ~y_tab.ID
+  }
+}
+
+  /*--------------------------------.
+  | Scanning bracketed identifiers. |
+  `--------------------------------*/
+
+<SC_BRACKETED_ID>
+{
+  {id} {
+    global bracketed_id_str, bracketed_id_loc
+    if bracketed_id_str is not None:
+      state.complain(y_tab.yylloc, state.complaint, 'unexpected identifier in bracketed name: {0:s}'.format(state.quote(yytext)))
+    else:
+      bracketed_id_str = yytext
+      bracketed_id_loc = y_tab.yylloc
+  }
+  "]" {
+    global bracketed_id_str
+    BEGIN(bracketed_id_context_state)
+    if bracketed_id_str is not None:
+      if INITIAL == bracketed_id_context_state:
+        y_tab.yylval = bracketed_id_str
+        bracketed_id_str = None
+        y_tab.yylloc = bracketed_id_loc
+        return y_tab.BRACKETED_ID
+    else:
+      state.complain(y_tab.yylloc, state.complaint, 'an identifier expected')
+  }
+
+  [^\].A-Za-z0-9_/ \f\n\t\v]+|.        state.complain(y_tab.yylloc, state.complaint, '{0:s}: {1:s}'.format('invalid character in bracketed name' if len(yytext) == 1 else 'invalid characters in bracketed name', state.quote(yytext)))
+
+  <<EOF>> {
+    BEGIN(bracketed_id_context_state)
+    unexpected_eof(bracketed_id_start, ']')
+  }
+}
+
+<SC_RETURN_BRACKETED_ID>
+{
+  . {
+    global bracketed_id_str
+    scanner_cursor.column -= len(yytext)
+    yyless(0)
+    y_tab.yylval = bracketed_id_str
+    bracketed_id_str = None
+    y_tab.yylloc = bracketed_id_loc
+    BEGIN(INITIAL)
+    return y_tab.BRACKETED_ID
+  }
+}
+
+
+  /*---------------------------------------------------------------.
+  | Scanning a Yacc comment.  The initial '/ *' is already eaten.  |
+  `---------------------------------------------------------------*/
+
+<SC_YACC_COMMENT>
+{
+  "*/"                         BEGIN(context_state)
+  .|\n                         #continue
+  <<EOF>> {
+    unexpected_eof(token_start, '*/')
+    BEGIN(context_state)
+  }
+}
+
+
+  /*------------------------------------------------------------.
+  | Scanning a C comment.  The initial '/ *' is already eaten.  |
+  `------------------------------------------------------------*/
+
+<SC_COMMENT>
+{
+  "*"{splice}"/" {
+    obstack_for_string.append(yytext)
+    BEGIN(context_state)
+  }
+  <<EOF>> {
+    unexpected_eof(token_start, '*/')
+    BEGIN(context_state)
+  }
+}
+
+
+  /*--------------------------------------------------------------.
+  | Scanning a line comment.  The initial '//' is already eaten.  |
+  `--------------------------------------------------------------*/
+
+<SC_LINE_COMMENT>
+{
+  "\n" {
+    obstack_for_string.append(yytext)
+    BEGIN(context_state)
+  }
+  {splice}                     obstack_for_string.append(yytext)
+  <<EOF>>                      BEGIN(context_state)
+}
+
+
+  /*------------------------------------------------.
+  | Scanning a Bison string, including its escapes. |
+  | The initial quote is already eaten.             |
+  `------------------------------------------------*/
+
+<SC_ESCAPED_STRING>
+{
+  "\"" {
+    global gram_last_string
+    gram_last_string = ''.join(obstack_for_string)
+    del obstack_for_string[:] # not strictly correct
+    y_tab.yylloc.start = token_start
+    y_tab.yylval = gram_last_string
+    BEGIN(INITIAL)
+    gram_piece_append('</AST_Text>')
+    gram_piece_escape(yytext)
+    gram_piece_append('</AST_String>')
+    gram_piece_pack()
+    return ~y_tab.STRING
+  }
+  <<EOF>>                      unexpected_eof(token_start, '"')
+  "\n"                         unexpected_newline(token_start, '"')
+}
+
+  /*----------------------------------------------------------.
+  | Scanning a Bison character literal, decoding its escapes. |
+  | The initial quote is already eaten.                       |
+  `----------------------------------------------------------*/
+
+<SC_ESCAPED_CHARACTER>
+{
+  "'" {
+    global gram_last_string
+    gram_last_string = ''.join(obstack_for_string)
+    del obstack_for_string[:] # not strictly correct
+    y_tab.yylloc.start = token_start
+    if len(gram_last_string) == 0:
+      state.complain(y_tab.yylloc, state.Wother, 'empty character literal')
+      y_tab.yylval = ord('\'')
+    else:
+      if len(gram_last_string) > 1:
+        state.complain(y_tab.yylloc, state.Wother, 'extra characters in character literal')
+      y_tab.yylval = ord(gram_last_string[0])
+    #del obstack_for_string[:]
+    BEGIN(INITIAL)
+    gram_piece_append('</AST_Text>')
+    gram_piece_escape(yytext)
+    gram_piece_append('</AST_Char>')
+    gram_piece_pack()
+    return ~y_tab.CHAR
+  }
+  "\n"                         unexpected_newline(token_start, '\'')
+  <<EOF>>                      unexpected_eof(token_start, '\'')
+}
+
+
+
+  /*--------------------------------------------------------------.
+  | Scanning a tag.  The initial angle bracket is already eaten.  |
+  `--------------------------------------------------------------*/
+
+<SC_TAG>
+{
+  ">" {
+    global nesting, gram_last_string
+    nesting -= 1
+    if nesting < 0:
+      gram_last_string = ''.join(obstack_for_string)
+      del obstack_for_string[:] # not strictly correct
+      y_tab.yylloc.start = token_start
+      y_tab.yylval = gram_last_string
+      #del obstack_for_string[:]
+      BEGIN(INITIAL)
+      gram_piece_append('</AST_Text>')
+      gram_piece_escape(yytext)
+      gram_piece_append('</AST_TagRef>')
+      gram_piece_pack()
+      return ~y_tab.TAG
+    obstack_for_string.append(yytext)
+  }
+
+  ([^<>]|->)+                  obstack_for_string.append(yytext)
+  "<"+ {
+    global nesting
+    obstack_for_string.append(yytext)
+    nesting += len(yytext)
+  }
+
+  <<EOF>>                      unexpected_eof(token_start, '>')
+}
+
+  /*----------------------------.
+  | Decode escaped characters.  |
+  `----------------------------*/
+
+<SC_ESCAPED_STRING,SC_ESCAPED_CHARACTER>
+{
+  \\[0-7]{1,3} {
+    # octal escape: was strtoul(yytext + 1, None, 8), an untranslated C
+    # call (pointer arithmetic on a str) -- use int() with an explicit base
+    c = int(yytext[1:], 8)
+    if not c or 0x7f * 2 + 1 < c:
+      state.complain(y_tab.yylloc, state.complaint, 'invalid number after \\-escape: {0:s}'.format(yytext[1:]))
+    else:
+      obstack_for_string.append(chr(c))
+      # was rpl_sprintf() into gram_piece_temp (untranslated C, undefined
+      # in this port) -- format the piece directly
+      gram_piece_append('<AST_Text_Escape character="{0:d}">'.format(c))
+      gram_piece_flush(len(yytext))
+      gram_piece_append('</AST_Text_Escape>')
+  }
+
+  \\x[0-9abcdefABCDEF]+ {
+    # hexadecimal escape: was strtoul(yytext + 2, None, 16)
+    c = int(yytext[2:], 16)
+    if not c or 0x7f * 2 + 1 < c:
+      state.complain(y_tab.yylloc, state.complaint, 'invalid number after \\-escape: {0:s}'.format(yytext[1:]))
+    else:
+      obstack_for_string.append(chr(c))
+      gram_piece_append('<AST_Text_Escape character="{0:d}">'.format(c))
+      gram_piece_flush(len(yytext))
+      gram_piece_append('</AST_Text_Escape>')
+  }
+
+  \\a {
+    obstack_for_string.append('\a')
+    gram_piece_append('<AST_Text_Escape character="7">')
+    gram_piece_flush(len(yytext))
+    gram_piece_append('</AST_Text_Escape>')
+  }
+  \\b {
+    obstack_for_string.append('\b')
+    gram_piece_append('<AST_Text_Escape character="8">')
+    gram_piece_flush(len(yytext))
+    gram_piece_append('</AST_Text_Escape>')
+  }
+  \\f {
+    obstack_for_string.append('\f')
+    gram_piece_append('<AST_Text_Escape character="12">')
+    gram_piece_flush(len(yytext))
+    gram_piece_append('</AST_Text_Escape>')
+  }
+  \\n {
+    obstack_for_string.append('\n')
+    gram_piece_append('<AST_Text_Escape character="10">')
+    gram_piece_flush(len(yytext))
+    gram_piece_append('</AST_Text_Escape>')
+  }
+  \\r {
+    obstack_for_string.append('\r')
+    gram_piece_append('<AST_Text_Escape character="13">')
+    gram_piece_flush(len(yytext))
+    gram_piece_append('</AST_Text_Escape>')
+  }
+  \\t {
+    obstack_for_string.append('\t')
+    gram_piece_append('<AST_Text_Escape character="9">')
+    gram_piece_flush(len(yytext))
+    gram_piece_append('</AST_Text_Escape>')
+  }
+  \\v {
+    obstack_for_string.append('\v')
+    gram_piece_append('<AST_Text_Escape character="11">')
+    gram_piece_flush(len(yytext))
+    gram_piece_append('</AST_Text_Escape>')
+  }
+
+  /* \\[\"\'?\\] would be shorter, but it confuses xgettext.  */
+  \\("\""|"'"|"?"|"\\") {
+    obstack_for_string.append(yytext[1])
+    # yytext[1] is a one-character str in Python, not a C char -- take
+    # ord() before formatting it as a number (was rpl_sprintf with %d)
+    gram_piece_append('<AST_Text_Escape character="{0:d}">'.format(ord(yytext[1])))
+    gram_piece_flush(len(yytext))
+    gram_piece_append('</AST_Text_Escape>')
+  }
+  \\(u|U[0-9abcdefABCDEF]{4})[0-9abcdefABCDEF]{4} {
+    # universal character name: convert_ucn_to_byte() is commented out in
+    # the user-code section, so inline its body here (decode the hex
+    # digits, reject any value that does not fit in a byte)
+    c = int(yytext[2:], 16)
+    if 0x7f * 2 + 1 < c:
+      c = -1
+    if c <= 0:
+      state.complain(y_tab.yylloc, state.complaint, 'invalid number after \\-escape: {0:s}'.format(yytext[1:]))
+    else:
+      obstack_for_string.append(chr(c))
+      gram_piece_append('<AST_Text_Escape character="{0:d}">'.format(c))
+      gram_piece_flush(len(yytext))
+      gram_piece_append('</AST_Text_Escape>')
+  }
+  \\(.|\n) {
+    p = yytext[1:]
+    if True: #c_isspace(int(*p)) and c_isprint(int(*p)):
+      p = state.quote(p)
+    #else:
+    #  p = quotearg_style_mem(escape_quoting_style, p, 1)
+    state.complain(y_tab.yylloc, state.complaint, 'invalid character after \\-escape: {0:s}'.format(p))
+  }
+}
+
+  /*--------------------------------------------.
+  | Scanning user-code characters and strings.  |
+  `--------------------------------------------*/
+
+<SC_CHARACTER,SC_STRING>
+{
+  /* A backslash-newline splice, or an escaped character, is copied
+     verbatim into the string obstack.  */
+  {splice}|\\{splice}[^\n\[\]] obstack_for_string.append(yytext)
+}
+
+<SC_CHARACTER>
+{
+  /* The closing quote ends the character literal and returns to the
+     start condition that opened it.  */
+  "'" {
+    obstack_for_string.append(yytext)
+    BEGIN(context_state)
+  }
+  \n                           unexpected_newline(token_start, '\'')
+  <<EOF>>                      unexpected_eof(token_start, '\'')
+}
+
+<SC_STRING>
+{
+  /* The closing double quote ends the string literal.  */
+  "\"" {
+    obstack_for_string.append(yytext)
+    BEGIN(context_state)
+  }
+  \n                           unexpected_newline(token_start, '"')
+  <<EOF>>                      unexpected_eof(token_start, '"')
+}
+
+
+  /*---------------------------------------------------.
+  | Strings, comments etc. can be found in user code.  |
+  `---------------------------------------------------*/
+
+ /* Nick added: SC_ELEMENT_GROUP */
+<SC_BRACED_CODE,SC_PROLOGUE,SC_EPILOGUE,SC_PREDICATE,SC_ELEMENT_GROUP>
+{
+  /* User code may contain character/string literals and comments;
+     remember the current start condition in context_state so the
+     nested construct can return to it when it closes.  */
+  "'" {
+    global context_state, token_start
+    obstack_for_string.append(yytext)
+    context_state = YY_START()
+    token_start = y_tab.yylloc.start
+    BEGIN(SC_CHARACTER)
+  }
+  "\"" {
+    global context_state, token_start
+    obstack_for_string.append(yytext)
+    context_state = YY_START()
+    token_start = y_tab.yylloc.start
+    BEGIN(SC_STRING)
+  }
+  "/"{splice}"*" {
+    global context_state, token_start
+    obstack_for_string.append(yytext)
+    context_state = YY_START()
+    token_start = y_tab.yylloc.start
+    BEGIN(SC_COMMENT)
+  }
+  "/"{splice}"/" {
+    global context_state, token_start
+    obstack_for_string.append(yytext)
+    context_state = YY_START()
+    # NOTE(review): token_start is not set here, unlike the rules above --
+    # presumably because a line comment cannot be left unterminated at
+    # EOF/EOL in the same way; confirm against the C original.
+    BEGIN(SC_LINE_COMMENT)
+  }
+}
+
+
+
+  /*-----------------------------------------------------------.
+  | Scanning some code in braces (actions, predicates). The    |
+  | initial "{" is already eaten.                              |
+  `-----------------------------------------------------------*/
+
+ /* Nick added: SC_ELEMENT_GROUP */
+<SC_BRACED_CODE,SC_PREDICATE,SC_ELEMENT_GROUP>
+{
+  /* Track nesting depth of '{' and of the '<%' / '%>' digraph braces
+     so the closing rules below know when the outermost block ends.  */
+  "{"|"<"{splice}"%" {
+    global nesting
+    obstack_for_string.append(yytext)
+    nesting += 1
+  }
+  "%"{splice}">" {
+    global nesting
+    obstack_for_string.append(yytext)
+    nesting -= 1
+  }
+
+  /* Tokenize '<<%' correctly (as '<<' '%') rather than incorrectly
+     (as '<' '<%').  */
+  "<"{splice}"<"               obstack_for_string.append(yytext)
+
+  <<EOF>>                      unexpected_eof(code_start, '}')
+}
+
+<SC_BRACED_CODE>
+{
+  "}" {
+    global nesting, gram_last_string
+    obstack_for_string.append('}')
+    nesting -= 1
+    # nesting < 0 means this brace closes the outermost block: hand the
+    # collected text to the parser as a BRACED_CODE token.  The token is
+    # returned complemented (~) to tell gram_lex the XML pieces were
+    # already packed here.
+    if nesting < 0:
+      gram_last_string = ''.join(obstack_for_string)
+      del obstack_for_string[:] # not strictly correct
+      y_tab.yylloc.start = code_start
+      y_tab.yylval = gram_last_string
+      BEGIN(INITIAL)
+      gram_piece_append('</AST_Text>')
+      gram_piece_escape(yytext)
+      gram_piece_append('</AST_BracedCode>')
+      gram_piece_pack()
+      return ~y_tab.BRACED_CODE
+  }
+}
+
+<SC_PREDICATE>
+{
+  "}" {
+    global nesting, gram_last_string
+    nesting -= 1
+    # nesting < 0: this brace closes the predicate itself.  Unlike
+    # SC_BRACED_CODE, the outermost '}' is NOT part of the predicate
+    # text, so it is only appended in the still-nested case below.
+    if nesting < 0:
+      gram_last_string = ''.join(obstack_for_string)
+      del obstack_for_string[:] # not strictly correct
+      y_tab.yylloc.start = code_start
+      y_tab.yylval = gram_last_string
+      BEGIN(INITIAL)
+      gram_piece_append('</AST_Text>')
+      gram_piece_escape(yytext)
+      gram_piece_append('</AST_BracedPredicate>')
+      gram_piece_pack()
+      # complemented token: pieces already packed (see gram_lex)
+      return ~y_tab.BRACED_PREDICATE
+    else:
+      obstack_for_string.append('}')
+  }
+}
+
+ /* Nick extra rules for element groups */
+<SC_ELEMENT_GROUP>
+{
+  "}" {
+    global nesting, gram_last_string
+    obstack_for_string.append('}')
+    nesting -= 1
+    if nesting < 0:
+      gram_last_string = ''.join(obstack_for_string)
+      del obstack_for_string[:] # not strictly correct
+      y_tab.yylloc.start = code_start
+      y_tab.yylval = gram_last_string
+      #del obstack_for_string[:]
+      BEGIN(INITIAL)
+      gram_piece_append('</AST_Text>')
+      gram_piece_escape(yytext)
+      gram_piece_pack()
+      # an element group is delivered to the parser as a literal '('
+      # character token, complemented to mark the pieces as packed
+      return ~ord('(')
+  }
+}
+
+  /*--------------------------------------------------------------.
+  | Scanning some prologue: from "%{" (already scanned) to "%}".  |
+  `--------------------------------------------------------------*/
+
+<SC_PROLOGUE>
+{
+  /* "%}" terminates a prologue block opened by "%{".  */
+  "%}" {
+    global gram_last_string
+    gram_last_string = ''.join(obstack_for_string)
+    del obstack_for_string[:] # not strictly correct
+    y_tab.yylloc.start = code_start
+    y_tab.yylval = gram_last_string
+    BEGIN(INITIAL)
+    gram_piece_append('</AST_Text>')
+    gram_piece_escape(yytext)
+    gram_piece_append('</AST_Section1_Prologue>')
+    gram_piece_pack()
+    # complemented token: pieces already packed (see gram_lex)
+    return ~y_tab.PROLOGUE
+  }
+
+  <<EOF>>                      unexpected_eof(code_start, '%}')
+}
+
+
+  /*---------------------------------------------------------------.
+  | Scanning the epilogue (everything after the second "%%", which |
+  | has already been eaten).                                       |
+  `---------------------------------------------------------------*/
+
+<SC_EPILOGUE>
+{
+  /* The epilogue runs to end of file; EOF delivers all of it as a
+     single EPILOGUE token.  */
+  <<EOF>> {
+    global gram_last_string
+    gram_last_string = ''.join(obstack_for_string)
+    del obstack_for_string[:] # not strictly correct
+    y_tab.yylloc.start = code_start
+    y_tab.yylval = gram_last_string
+    BEGIN(INITIAL)
+    gram_piece_pack()
+    return ~y_tab.EPILOGUE
+  }
+}
+
+
+  /*-----------------------------------------------------.
+  | By default, grow the string obstack with the input.  |
+  `-----------------------------------------------------*/
+
+ /* Nick added: SC_ELEMENT_GROUP */
+<SC_COMMENT,SC_LINE_COMMENT,SC_BRACED_CODE,SC_PREDICATE,SC_PROLOGUE,SC_EPILOGUE,SC_STRING,SC_CHARACTER,SC_ESCAPED_STRING,SC_ESCAPED_CHARACTER,SC_ELEMENT_GROUP>. |
+ /* Nick added: SC_ELEMENT_GROUP */
+<SC_COMMENT,SC_LINE_COMMENT,SC_BRACED_CODE,SC_PREDICATE,SC_PROLOGUE,SC_EPILOGUE,SC_ELEMENT_GROUP>\n obstack_for_string.append(yytext)
+
+
+%%
+
+#def no_cr_read(fp, buf, size):
+#  bytes_read = fread_unlocked(buf, 1, size, fp)
+#  if bytes_read:
+#    w = memchr(buf, ord('\r'), bytes_read)
+#    if w:
+#      r = ++w
+#      lim = buf + bytes_read
+#      pass
+#      while True:
+#        w[-1] = ord('\n')
+#        if r == lim:
+#          ch = getc_unlocked(fp)
+#          if ch != ord('\n') and ungetc(ch, fp) != ch:
+#            break
+#        else:
+#          if *r == ord('\n'):
+#            r += 1
+#        while True:
+#          if r == lim:
+#            return w - buf
+#          if not ((*w++ = *r++) != ord('\r')):
+#            break
+#        pass
+#      return w - buf
+#  return bytes_read
+
+def scan_integer(number, base, loc):
+  num = int(number, base)
+  if 0x7fffffff < num:
+    state.complain(y_tab.yylloc, state.complaint, 'integer out of range: {0:s}'.format(state.quote(number)))
+    num = 0x7fffffff
+  return num
+
+#def convert_ucn_to_byte(ucn):
+#  code = strtoul(ucn + 2, None, 16)
+#  if 0x7f * 2 + 1 < code:
+#    return -1
+#  return code
+#
+#def handle_syncline(args, loc):
+#  file = None
+#  lineno = strtoul(args, &file, 10)
+#  if 0x7fffffff <= lineno:
+#    state.complain(y_tab.yylloc, state.Wother, 'line number overflow')
+#    lineno = 0x7fffffff
+#  file = strchr(file, ord('"'))
+#  if file:
+#    *strchr(file + 1, ord('"')) = ord('\0')
+#    current_file = uniqstr_new(file + 1)
+#  boundary_set(&scanner_cursor, current_file, lineno, 1)
+
+def unexpected_end(start, msg, token_end):
+  # Report a missing `token_end` delimiter.  The delimiter is pushed
+  # back (unput) and the scanner cursor rewound so scanning can resume
+  # after the error; msg is a format string taking the quoted delimiter.
+  loc = state.location(start.copy(), scanner_cursor.copy())
+  scanner_cursor.column -= len(token_end)
+  unput(token_end)
+  token_end = state.quote(token_end)
+  # display "'" rather than the noisier '\''
+  if token_end == '\'\\\'\'':
+    token_end = '"\'"'
+  # NOTE(review): `loc` is computed but the complaint is issued at
+  # y_tab.yylloc -- confirm which location the C original used here.
+  state.complain(y_tab.yylloc, state.complaint, msg.format(token_end))
+
+def unexpected_eof(start, token_end):
+  # End of file reached while `token_end` was still expected.
+  unexpected_end(start, 'missing {0:s} at end of file', token_end)
+
+def unexpected_newline(start, token_end):
+  # End of line reached while `token_end` was still expected.
+  unexpected_end(start, 'missing {0:s} at end of line', token_end)
+
+#def gram_scanner_initialize():
+#  global obstack_for_string
+#  obstack_for_string = []
+#
+#def gram_scanner_free():
+#  del obstack_for_string[:]
+#  yy_delete_buffer(YY_CURRENT_BUFFER)
+
+def gram_piece_append(str):
+  # Append one XML fragment to the global piece list.
+  gram_piece.append(str)
+
+def gram_piece_insert(n, str):
+  # Insert an XML fragment at index n of the global piece list.
+  gram_piece[n:n] = [str]
+
+# minimal XML escaping for character data
+xml_escape = {'<': '&lt;', '>': '&gt;', '&': '&amp;'}
+def gram_piece_escape(str):
+  # Append str with XML special characters escaped.
+  gram_piece.append(''.join([xml_escape.get(i, i) for i in str]))
+
+def gram_piece_flush(n):
+  # Emit the first n characters of yytext (escaped) as a piece and drop
+  # them from yytext, so later pieces cover only the unconsumed tail.
+  global yytext
+  gram_piece_escape(yytext[:n])
+  yytext = yytext[n:]
+
+def gram_piece_pack():
+  # Collapse every piece appended since the previous pack into a single
+  # string, and advance the pack boundary past it.
+  global gram_piece0
+  gram_piece[gram_piece0:] = [''.join(gram_piece[gram_piece0:])]
+  gram_piece0 += 1
+
+def gram_lex():
+  # Wrapper over yylex().  Rules that packed their own XML pieces return
+  # the bitwise complement of the token (a negative value); undo the
+  # complement and pass the token through.  Otherwise pack the pending
+  # pieces plus the token's raw (escaped) text here.
+  result = yylex()
+  if result < 0:
+    return ~result
+  gram_piece_pack()
+  gram_piece_escape(yytext)
+  gram_piece_pack()
+  return result
diff --git a/state.py b/state.py
new file mode 100644 (file)
index 0000000..a87299a
--- /dev/null
+++ b/state.py
@@ -0,0 +1,77 @@
+import sys
+
+# miscellaneous state accessed by scan-gram.l and parse-gram.y
+class boundary:
+  """A point in a grammar source file: file name, line and column."""
+  def __init__(self, file = '<stdin>', line = 0, column = 0):
+    self.file = file
+    self.line = line
+    self.column = column
+  def copy(self):
+    # independent copy -- boundaries are mutated as the scanner advances
+    return boundary(self.file, self.line, self.column)
+
+class location:
+  """A source region, as a pair of boundary objects (start, end)."""
+  def __init__(self, start = None, end = None):
+    # default boundaries are created per-instance (never shared)
+    self.start = boundary() if start is None else start
+    self.end = boundary() if end is None else end
+  def copy(self):
+    # deep copy: the contained boundaries are copied too
+    return location(self.start.copy(), self.end.copy())
+
+warning_midrule_values = 0
+warning_yacc = 1
+warning_conflicts_sr = 2
+warning_conflicts_rr = 3
+warning_empty_rule = 3
+warning_deprecated = 4
+warning_precedence = 5
+warning_other = 6
+warnings_size = 7
+Wnone = 0
+Wmidrule_values = 1 << warning_midrule_values
+Wyacc = 1 << warning_yacc
+Wconflicts_sr = 1 << warning_conflicts_sr
+Wconflicts_rr = 1 << warning_conflicts_rr
+Wdeprecated = 1 << warning_deprecated
+Wempty_rule = 1 << warning_empty_rule
+Wprecedence = 1 << warning_precedence
+Wother = 1 << warning_other
+Werror = 1 << 10
+complaint = 1 << 11
+fatal = 1 << 12
+silent = 1 << 13
+no_caret = 1 << 14
+Weverything = ~complaint & ~fatal & ~silent
+Wall = Weverything & ~Wyacc
+
+def complain(loc, flags, message):
+  # Simplified stand-in for bison's complain().  The retained C code
+  # below classified severity and exited only on fatal errors; this port
+  # currently prints the message and exits unconditionally, treating
+  # every complaint as fatal.  NOTE(review): confirm warnings should
+  # really abort translation here.
+  #severity s = warning_severity (flags);
+  #if ((flags & complaint) && complaint_status < status_complaint)
+  #  complaint_status = status_complaint;
+  #
+  #if (severity_warning <= s)
+  #  {
+  #    const char* prefix =
+  #      s == severity_fatal ? _("fatal error")
+  #      : s == severity_error ? _("error")
+  #      : _("warning");
+  #    if (severity_error <= s && ! complaint_status)
+  #      complaint_status = status_warning_as_error;
+  #    error_message (loc, flags, prefix, message, args);
+  #  }
+  #
+  #if (flags & fatal)
+  #  exit (EXIT_FAILURE);
+  print(message)
+  sys.exit(1)
+
+# Associativity values for precedence declarations (bison's assoc enum).
+undef_assoc = 0
+right_assoc = 1
+left_assoc = 2
+non_assoc = 3
+precedence_assoc = 4
+
+# Kinds of symbol-attached code fragments -- presumably mirroring
+# bison's code_props kinds (%destructor / %printer); confirm usage.
+destructor = 0
+printer = 1
+
+def quote(str):
+  """Return str wrapped in double quotes, escaping backslashes and quotes."""
+  return '"{0:s}"'.format(str.replace('\\', '\\\\').replace('"', '\\"'))