From f9fdd965ce9ef49c28b50ca6dcd0d3d8d72faa66 Mon Sep 17 00:00:00 2001
From: Nick Downing <nick@ndcode.org>
Date: Mon, 14 Jan 2019 14:54:58 +1100
Subject: [PATCH] First cut at y_to_python.py, need to normalize whitespace and
 indent in *.y.new

---
 ansi_c.l       |   4 +-
 ast.py         |  53 ++++++++++++----
 l_to_python.py |   7 +--
 o.sh           |   4 ++
 tests/parse.y  |  87 +++++++++++++++----------
 xml_to_y.py    |   9 +++
 y_to_python.py | 168 +++++++++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 282 insertions(+), 50 deletions(-)
 create mode 100755 o.sh
 create mode 100755 xml_to_y.py
 create mode 100755 y_to_python.py

diff --git a/ansi_c.l b/ansi_c.l
index 49900bf..01e37e2 100644
--- a/ansi_c.l
+++ b/ansi_c.l
@@ -85,7 +85,9 @@ import y_tab
   return y_tab.TYPEDEF_NAME
 }
 
-(?E{ast.AST.Identifier}{L}{A}*) {
+ /* hack for yacc/bison specification */
+(?E{ast.AST.Identifier}{L}{A}*) |
+(?E{ast.AST.Identifier}"$$"|"$"{D}+) {
   return y_tab.IDENTIFIER
 }
 
diff --git a/ast.py b/ast.py
index bcd537e..22bb2e1 100644
--- a/ast.py
+++ b/ast.py
@@ -403,6 +403,12 @@ class AST(element.Element):
     # GENERATE END
     def __str__(self):
       return 'void'
+    def translate_size(self, context):
+      return 0
+    def translate_type(self, context):
+      return 'NoneType'
+    def translate_zero(self, context):
+      return 'None'
 
   class TypeInt(Type):
     # GENERATE ELEMENT(bool signed, int bits) BEGIN
@@ -626,17 +632,27 @@ class AST(element.Element):
     def translate_size(self, context):
       return 4
     def translate_type(self, context):
-      assert (
+      return (
+        'str'
+      if (
         isinstance(self.target_type, AST.TypeInt) and
         self.target_type.bits == 8
+      ) else
+        'list'
       )
-      return 'str'
     def translate_zero(self, context):
-      assert (
-        isinstance(self.target_type, AST.TypeInt) and
-        self.target_type.bits == 8
+      return (
+        (
+          '\'\''
+        if (
+          isinstance(self.target_type, AST.TypeInt) and
+          self.target_type.bits == 8
+        ) else
+          '[]'
+        )
+      if context.top_level else
+        'None'
       )
-      return '\'\'' if context.top_level else 'None'
 
   class TypeArray(Type):
     # GENERATE ELEMENT(ref element_type, int element_count) BEGIN
@@ -1120,9 +1136,14 @@ class AST(element.Element):
       return self[0].get_type_and_name(
         AST.TypeArray(
           element_type = base_type,
-          element_count = int(
-            element.get_text(self[2], 0),
-            8 if element.get_text(self[2], 0)[:2] in octal_prefix else 0
+          element_count = (
+            -1
+          if isinstance(self[2], AST.ExpressionEmpty) else
+            # kludge: only handles a literal integer size, not a computed one
+            int(
+              element.get_text(self[2], 0),
+              8 if element.get_text(self[2], 0)[:2] in octal_prefix else 0
+            )
           )
         )
       )
@@ -4961,6 +4982,7 @@ class AST(element.Element):
     return 'ast.AST({0:s})'.format(', '.join(params))
   # GENERATE END
 
+# key = count of each: void char short int long float double signed unsigned bool complex imaginary
 type_specifiers_to_type = {
   (1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0): AST.TypeVoid(),
   (0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0): AST.TypeInt(signed = True, bits = 8),
@@ -4969,18 +4991,27 @@ type_specifiers_to_type = {
   (0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0): AST.TypeInt(signed = True, bits = 16),
   (0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0): AST.TypeInt(signed = True, bits = 16),
   (0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0): AST.TypeInt(signed = False, bits = 16),
+  (0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0): AST.TypeInt(signed = True, bits = 16),
+  (0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0): AST.TypeInt(signed = True, bits = 16),
+  (0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0): AST.TypeInt(signed = False, bits = 16),
   (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0): AST.TypeInt(signed = True, bits = 32),
+  (0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0): AST.TypeInt(signed = True, bits = 32),
+  (0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0): AST.TypeInt(signed = False, bits = 32),
   (0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0): AST.TypeInt(signed = True, bits = 32),
   (0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0): AST.TypeInt(signed = True, bits = 32),
   (0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0): AST.TypeInt(signed = False, bits = 32),
   (0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0): AST.TypeInt(signed = True, bits = 32),
   (0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0): AST.TypeInt(signed = True, bits = 32),
-  (0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0): AST.TypeInt(signed = True, bits = 32),
   (0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0): AST.TypeInt(signed = False, bits = 32),
-  (0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0): AST.TypeInt(signed = False, bits = 32),
+  (0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0): AST.TypeInt(signed = True, bits = 32),
+  (0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0): AST.TypeInt(signed = True, bits = 32),
+  (0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0): AST.TypeInt(signed = False, bits = 32),
   (0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0): AST.TypeInt(signed = True, bits = 64),
   (0, 0, 0, 0, 2, 0, 0, 1, 0, 0, 0, 0): AST.TypeInt(signed = True, bits = 64),
   (0, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0): AST.TypeInt(signed = False, bits = 64),
+  (0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0): AST.TypeInt(signed = True, bits = 64),
+  (0, 0, 0, 1, 2, 0, 0, 1, 0, 0, 0, 0): AST.TypeInt(signed = True, bits = 64),
+  (0, 0, 0, 1, 2, 0, 0, 0, 1, 0, 0, 0): AST.TypeInt(signed = False, bits = 64),
   (0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0): AST.TypeFloat(complex = 0, bits = 32),
   (0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0): AST.TypeFloat(complex = 2, bits = 32),
   (0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1): AST.TypeFloat(complex = 1, bits = 32),
diff --git a/l_to_python.py b/l_to_python.py
index 1399a68..6c9473c 100755
--- a/l_to_python.py
+++ b/l_to_python.py
@@ -21,7 +21,7 @@ def my_rstrip(text, indent):
 
 def c_to_python(context, text):
   lex_yy.yyin = None
-  lex_yy.yy_buffer_stack = [lex_yy.YYBufferState(None, None)]
+  lex_yy.yy_buffer_stack = [lex_yy.YYBufferState()]
   lex_yy.yytext_len = 0
   lex_yy.unput(text)
   root = y_tab.yyparse(ast.AST.TranslationUnit)
@@ -184,10 +184,7 @@ with open('a.i') as fin:
         text = text.lstrip('\t ')
       else:
         prefix += ' '
-        text = '{{\n{0:s}{1:s}}}\n'.format(
-          text,
-          indent
-        )
+        text = '{{\n{0:s}{1:s}}}\n'.format(text, indent)
       element.set_text(parent, 2, prefix)
     element.set_text(node, 0, text)
 
diff --git a/o.sh b/o.sh
new file mode 100755
index 0000000..5a85fe7
--- /dev/null
+++ b/o.sh
@@ -0,0 +1,4 @@
+#!/bin/sh
+../bootstrap_bison.git/src/bison -o /dev/null tests/parse.y 2>tests/parse.y.xml # stderr is saved as the XML input
+./y_to_python.py <tests/parse.y.xml >tests/parse.y.new.xml # XML in, XML with translated fragments out
+./xml_to_y.py <tests/parse.y.new.xml >tests/parse.y.new # write the XML tree back out as text
diff --git a/tests/parse.y b/tests/parse.y
index 814842a..8811acc 100644
--- a/tests/parse.y
+++ b/tests/parse.y
@@ -12,6 +12,9 @@
 
 %left CCL_OP_DIFF CCL_OP_UNION
 
+/* Nick extra rules for action groups */
+%token TOK_ACTION_GROUP TOK_ELEMENT_GROUP
+
 /*
  *POSIX and AT&T lex place the
  * precedence of the repeat operator, {}, below that of concatenation.
@@ -173,14 +176,14 @@ initlex		:
 
 sect1		:  sect1 startconddecl namelist1
  {
- insert_after(2, "</PLex_Section1_StartConditions>");
- sprintf(piece_temp, "<PLex_Section1_StartConditions exclusive=\"%s\">", xcluflg ? "true" : "false");
+ insert_after(2, "</AST_Section1_StartConditions>");
+ sprintf(piece_temp, "<AST_Section1_StartConditions exclusive=\"%s\">", xcluflg ? "true" : "false");
  insert_before(1, piece_temp);
  }
 		|  sect1 options
  {
- insert_after(1, "</PLex_Section1_Options>");
- insert_before(1, "<PLex_Section1_Options>");
+ insert_after(1, "</AST_Section1_Options>");
+ insert_before(1, "<AST_Section1_Options>");
  }
 		|
 		|  error
@@ -223,47 +226,47 @@ option		:  TOK_OUTFILE '=' NAME
 			{
 			outfilename = xstrdup(nmstr);
 			did_outfilename = 1;
- insert_after(2, "</PLex_Section1_Options_OutFile>");
- insert_before(0,"<PLex_Section1_Options_OutFile>");
+ insert_after(2, "</AST_Section1_Options_OutFile>");
+ insert_before(0,"<AST_Section1_Options_OutFile>");
 			}
 		|  TOK_EXTRA_TYPE '=' NAME
 			{ extra_type = xstrdup(nmstr); /*}*/
- insert_after(2, "</PLex_Section1_Options_ExtraType>");
- insert_before(0, "<PLex_Section1_Options_ExtraType>");
+ insert_after(2, "</AST_Section1_Options_ExtraType>");
+ insert_before(0, "<AST_Section1_Options_ExtraType>");
  }
 		|  TOK_PREFIX '=' NAME
 			{ prefix = xstrdup(nmstr);
                           if (strchr(prefix, '[') || strchr(prefix, ']'))
                               flexerror(_("Prefix must not contain [ or ]")); /*}*/
- insert_after(2, "</PLex_Section1_Options_Prefix>");
- insert_before(0, "<PLex_Section1_Options_Prefix>");
+ insert_after(2, "</AST_Section1_Options_Prefix>");
+ insert_before(0, "<AST_Section1_Options_Prefix>");
  }
 		|  TOK_YYCLASS '=' NAME
 			{ yyclass = xstrdup(nmstr); /*}*/
- insert_after(2, "</PLex_Section1_Options_YYClass>");
- insert_before(0, "<PLex_Section1_Options_YYClass>");
+ insert_after(2, "</AST_Section1_Options_YYClass>");
+ insert_before(0, "<AST_Section1_Options_YYClass>");
  }
 		|  TOK_HEADER_FILE '=' NAME
 			{ headerfilename = xstrdup(nmstr); /*}*/
- insert_after(2, "</PLex_Section1_Options_HeaderFile>");
- insert_before(0, "<PLex_Section1_Options_HeaderFile>");
+ insert_after(2, "</AST_Section1_Options_HeaderFile>");
+ insert_before(0, "<AST_Section1_Options_HeaderFile>");
  }
 	    |  TOK_TABLES_FILE '=' NAME
             { tablesext = true; tablesfilename = xstrdup(nmstr); /*}*/
- insert_after(2, "</PLex_Section1_Options_TablesFile>");
- insert_before(0, "<PLex_Section1_Options_TablesFile>");
+ insert_after(2, "</AST_Section1_Options_TablesFile>");
+ insert_before(0, "<AST_Section1_Options_TablesFile>");
  }
 		;
 
 sect2		:  sect2 scon initforrule flexrule '\n'
 			{ scon_stk_ptr = $2; /*}*/
- insert_after(4, "</PLex_Section2_Rule>");
- insert_before(1, "<PLex_Section2_Rule>");
+ insert_after(4, "</AST_Section2_Rule>");
+ insert_before(1, "<AST_Section2_Rule>");
  }
 		|  sect2 scon '{' sect2 '}'
 			{ scon_stk_ptr = $2; /*}*/
- insert_after(4, "</PLex_Section2_CompoundRule>");
- insert_before(1, "<PLex_Section2_CompoundRule>");
+ insert_after(4, "</AST_Section2_CompoundRule>");
+ insert_before(1, "<AST_Section2_CompoundRule>");
  }
 		|
 		;
@@ -315,8 +318,8 @@ flexrule	:  '^' rule
 					pinpoint_message(
 			"'^' operator results in sub-optimal performance" );
 				}
- insert_after(1, "</PLex_Section2_Rule_FLexRule>");
- insert_before(0, "<PLex_Section2_Rule_FLexRule bol=\"true\">");
+ insert_after(1, "</AST_Section2_Rule_FLexRule>");
+ insert_before(0, "<AST_Section2_Rule_FLexRule bol=\"true\">");
 			}
 
 		|  rule
@@ -341,8 +344,8 @@ flexrule	:  '^' rule
 							mkbranch( scset[i],
 								pat );
 				}
- insert_after(0, "</PLex_Section2_Rule_FLexRule>");
- insert_before(0, "<PLex_Section2_Rule_FLexRule bol=\"false\">");
+ insert_after(0, "</AST_Section2_Rule_FLexRule>");
+ insert_before(0, "<AST_Section2_Rule_FLexRule bol=\"false\">");
 			}
 
 		|  EOF_OP
@@ -366,8 +369,8 @@ flexrule	:  '^' rule
 				else
 					build_eof_action();
 				}
- insert_after(0, "</PLex_Section2_Rule_EOFRule>");
- insert_before(0, "<PLex_Section2_Rule_EOFRule>");
+ insert_after(0, "</AST_Section2_Rule_EOFRule>");
+ insert_before(0, "<AST_Section2_Rule_EOFRule>");
 			}
 
 		|  error
@@ -380,8 +383,8 @@ scon_stk_ptr	:
 
 scon		:  '<' scon_stk_ptr namelist2 '>'
 			{ $$ = $2; /*}*/
- insert_after(3, "</PLex_Section2_StartConditions>");
- insert_before(0, "<PLex_Section2_StartConditions>");
+ insert_after(3, "</AST_Section2_StartConditions>");
+ insert_before(0, "<AST_Section2_StartConditions>");
  }
 
 		|  '<' '*' '>'
@@ -399,8 +402,8 @@ scon		:  '<' scon_stk_ptr namelist2 '>'
 				if ( j > scon_stk_ptr )
 					scon_stk[++scon_stk_ptr] = i;
 				}
- insert_after(2, "</PLex_Section2_StartConditions>");
- insert_before(0, "<PLex_Section2_StartConditions wildcard=\"true\">");
+ insert_after(2, "</AST_Section2_StartConditions>");
+ insert_before(0, "<AST_Section2_StartConditions wildcard=\"true\">");
 			}
 
 		|
@@ -413,7 +416,7 @@ scon		:  '<' scon_stk_ptr namelist2 '>'
  piece[piece2 + 1] = piece[piece2]; /* empty */
  piece[piece2] = piece[piece2 - 1]; /* empty */
  piece[piece2 - 1] = temp;
- insert_before(0, "<PLex_Section2_StartConditions />");
+ insert_before(0, "<AST_Section2_StartConditions />");
  }
 		;
 
@@ -871,11 +874,29 @@ singleton	:  singleton '*'
 
 		|  '(' re ')'
 			{ $$ = $2; /*}*/
-#if 0 /* for now do things in the traditional lex way without subexpressions */
  insert_after(2, "</RegexGroup>");
  insert_before(0, "<RegexGroup>");
-#endif
  }
+		/* Nick extra rules for unnumbered groups */
+		| '(' ':' re ')'
+			{ $$ = $3; }
+		/* Nick extra rules for named groups */
+		| '(' NAME re ')'
+			{ $$ = $3; /*}*/
+ insert_after(3, "</RegexGroupName>");
+ insert_before(0, "<RegexGroupName>");
+}
+		/* Nick extra rules for action groups */
+		| '(' TOK_ACTION_GROUP re ')'
+			{ $$ = $3; /*}*/
+ insert_after(3, "</RegexGroupAction>");
+ insert_before(0, "<RegexGroupAction>");
+}
+		| '(' TOK_ELEMENT_GROUP re ')'
+			{ $$ = $3; /*}*/
+ insert_after(3, "</RegexGroupElement>");
+ insert_before(0, "<RegexGroupElement>");
+}
 
 		|  CHAR
 			{
diff --git a/xml_to_y.py b/xml_to_y.py
new file mode 100755
index 0000000..6cb6974
--- /dev/null
+++ b/xml_to_y.py
@@ -0,0 +1,9 @@
+#!/usr/bin/env python3
+
+import element
+import sys
+import xml.etree.ElementTree
+
+# Parse the XML document on stdin and write element.to_text() of its root.
+tree = xml.etree.ElementTree.parse(sys.stdin)
+sys.stdout.write(element.to_text(tree.getroot()))
diff --git a/y_to_python.py b/y_to_python.py
new file mode 100755
index 0000000..6333847
--- /dev/null
+++ b/y_to_python.py
@@ -0,0 +1,168 @@
+#!/usr/bin/env python3
+
+import ast
+import element
+import lex_yy
+import os
+import sys
+import xml.etree.ElementTree
+import y_tab
+
+def my_rstrip(text, indent):
+  # Skip back over any trailing run of "{...}" groups, strip the tabs and
+  # spaces just before that run, and insert `indent` there instead.
+  split = len(text)
+  while text[:split].endswith('}'):
+    # nearest '{' left of this '}'; none (or '}' at offset 0) is an error
+    split = text.rfind('{', 0, split - 1)
+    assert split >= 0
+  head, tail = text[:split], text[split:]
+  return head.rstrip('\t ') + indent + tail
+
+def c_to_python(context, text): # parse C source `text`, return the translation as one string
+  lex_yy.yyin = None # presumably: no input stream, all input comes from unput() below
+  lex_yy.yy_buffer_stack = [lex_yy.YYBufferState()] # fresh buffer stack on every call
+  lex_yy.yytext_len = 0
+  lex_yy.unput(text) # hand the whole C text to the lexer
+  root = y_tab.yyparse(ast.AST.TranslationUnit) # parse with TranslationUnit as the root type
+  context.lines = [] # reset before translating; the result is read back from here
+  root.translate_translation_unit(context)
+  return ''.join(context.lines)
+
+# Read the XML tree from stdin, building its nodes with element.Element.
+tree_builder = xml.etree.ElementTree.TreeBuilder(element.Element)
+xml_parser = xml.etree.ElementTree.XMLParser(
+  target = tree_builder,
+  encoding = 'unicode'
+)
+root = xml.etree.ElementTree.parse(sys.stdin, xml_parser).getroot()
+
+context = ast.Context() # one translation context, passed to every c_to_python() call
+#context.translate_identifier['BEGIN'] = 'self.BEGIN'
+#context.translate_identifier['yylval'] = 'ref_data.yylval'
+#context.translate_identifier['yytext'] = 'self.yytext'
+#context.translate_identifier['yy_pop_state'] = 'self.yy_pop_state'
+#context.translate_identifier['yy_push_state'] = 'self.yy_push_state'
+
+actions = [] # (node, indent, initial) per extracted fragment, in the order written to a.c
+with open('a.c', 'w') as fout: # every extracted C fragment is written to a.c
+  def extract(i, indent): # walk element i; write each C fragment to a.c and queue it in actions
+    if i.tag == 'AST_Section1_Prologue':
+      node = i[0]
+      assert node.tag == 'AST_Text'
+      indent += '  ' # prologue fragments are queued with two extra spaces of indent
+      initial = True # translated as-is by the consumer loop further down
+    elif i.tag == 'AST_Production_Action':
+      node = i[0]
+      assert node.tag == 'AST_Text'
+      initial = False # consumer wraps the text in "void a(void)" before translating
+    elif i.tag == 'AST_Section3':
+      node = i # the section element itself is the fragment
+      initial = True
+    else: # any other element: recurse into the children, extract nothing here
+      child_indent = indent
+      if i.tag == 'AST':
+        for j in range(1, len(i) + 1): # presumably slot j is the text after child j - confirm in element.py
+          element.set_text(i, j, element.get_text(i, j).rstrip() + '\n') # trailing whitespace -> one '\n'
+      #elif (
+      #  i.tag == 'AST_Section2_Rule' or
+      #  i.tag == 'AST_Section2_Rule_FLexRule'
+      #):
+      #  element.set_text(i, 0, element.get_text(i, 0).lstrip('\t '))
+      #elif i.tag == 'AST_Section2_CompoundRule':
+      #  child_indent += '  '
+      #  element.set_text(
+      #    i,
+      #    0,
+      #    indent + element.get_text(i, 0).lstrip('\t ')
+      #  )
+      #  for j in range(1, len(i)):
+      #    element.set_text(
+      #      i,
+      #      j,
+      #      #element.get_text(i, j).rstrip('\t ') + child_indent
+      #      my_rstrip(element.get_text(i, j), child_indent)
+      #    )
+      #  element.set_text(
+      #    i,
+      #    len(i),
+      #    indent + element.get_text(i, len(i)).lstrip('\t ')
+      #  )
+      for j in i:
+        extract(j, child_indent)
+      return
+    #assert len(node) == 0
+    #text = element.get_text(node, 0)
+    text = element.to_text(node)
+
+    lines = [i.rstrip() for i in text.split('\n')] # strip trailing whitespace from every line
+    while len(lines) and len(lines[-1]) == 0: # drop blank lines at the end
+      del lines[-1]
+    while len(lines) and len(lines[0]) == 0: # drop blank lines at the start
+      del lines[0]
+    for line in lines:
+      if ( # an #include of a *.h file, in either <> or "" form
+        (line[:10] == '#include <' and line[-3:] == '.h>') or
+        (line[:10] == '#include "' and line[-3:] == '.h"')
+      ):
+        fout.write( # bracket the include with markers so the consumer loop can skip its expansion
+          '''@@@ IMPORT({0:s})
+{1:s}
+#undef NULL
+#undef bool
+#undef false
+#undef true
+@@@ IMPORT END\n'''.format(
+            line[10:-3].replace('/', '.'), # header path without ".h", '/' turned into '.'
+            line
+          )
+        )
+      else:
+        fout.write(line + '\n')
+    fout.write('@@@\n') # fragment terminator; the consumer loop reads a.i up to this line
+
+    actions.append((node, indent, initial))
+  extract(root, '')
+
+# Preprocess a.c with gcc, then translate each '@@@'-terminated fragment of a.i.
+if os.system('gcc -I tests/flex_h -E a.c >a.i') != 0:
+  sys.exit('gcc -E failed on a.c; a.i is incomplete, nothing translated')
+with open('a.i') as fin:
+  for node, indent, initial in actions: # queued in the order written to a.c
+    lines = []
+    line = fin.readline()
+    while line != '@@@\n': # fragment terminator written by extract()
+      assert len(line) # readline() returned '': EOF before the terminator
+      if (
+        line[:1] == '#' or # '#' lines, e.g. preprocessor linemarkers
+        (line == '\n' and len(lines) and lines[-1] == '\n') # 2nd blank in a row
+      ):
+        pass
+      elif line[:11] == '@@@ IMPORT(' and line[-2:] == ')\n':
+        # make the importing look like a function call in the C code:
+        #lines.append('import("{0:s}");\n'.format(line[11:-2]))
+        line = fin.readline()
+        while line != '@@@ IMPORT END\n': # drop what the header expanded to
+          assert len(line)
+          line = fin.readline()
+      else:
+        lines.append(line)
+      line = fin.readline()
+    text = ''.join(lines)
+
+    context.indent = indent # needed by both branches below
+    if initial:
+      text = c_to_python(context, text)
+    else:
+      text = c_to_python(
+        context,
+        'void a(void) {0:s}'.format(text) # already has braces and \n
+      )
+      assert text[:len(indent) + 10] == '\n{0:s}def a():\n'.format(indent)
+      text = '{{\n{0:s}{1:s}}}\n'.format(text[len(indent) + 10:], indent) # drop the "def a():" header, re-wrap in braces
+    element.set_text(node, 0, text)
+
+xml.etree.ElementTree.ElementTree(root).write(
+  sys.stdout,
+  encoding = 'unicode' # write() defaults to 'us-ascii' (bytes); 'unicode' yields str for sys.stdout
+)
-- 
2.34.1