Implement action groups (extension syntax), use this to make a private implementation...
author Nick Downing <downing.nick@gmail.com>
Tue, 25 Sep 2018 23:20:48 +0000 (09:20 +1000)
committer Nick Downing <downing.nick@gmail.com>
Tue, 25 Sep 2018 23:20:48 +0000 (09:20 +1000)
ast.py
dfa.py
generate_py.py
nfa.py
regex.py
skel/skel_py.py
tests/Makefile
tests/cal_py.l

diff --git a/ast.py b/ast.py
index 5e02610..6043e17 100644 (file)
--- a/ast.py
+++ b/ast.py
@@ -3991,7 +3991,7 @@ class AST(element.Element):
           return False # continued_action
  
       class FLexRule(element.Element):
-        # GENERATE ELEMENT(bool bol, int action) BEGIN
+        # GENERATE ELEMENT(bool bol, int group_index, list(ref) groups0, list(ref) groups1, int action) BEGIN
         def __init__(
           self,
           tag = 'AST_Section2_Rule_FLexRule',
@@ -3999,6 +3999,9 @@ class AST(element.Element):
           text = '',
           children = [],
           bol = False,
+          group_index = -1,
+          groups0 = [],
+          groups1 = [],
           action = -1
         ):
           element.Element.__init__(
@@ -4013,6 +4016,13 @@ class AST(element.Element):
           if isinstance(bol, str) else
             bol
           )
+          self.group_index = (
+            element.deserialize_int(group_index)
+          if isinstance(group_index, str) else
+            group_index
+          )
+          self.groups0 = groups0
+          self.groups1 = groups1
           self.action = (
             element.deserialize_int(action)
           if isinstance(action, str) else
@@ -4021,10 +4031,28 @@ class AST(element.Element):
         def serialize(self, ref_list):
           element.Element.serialize(self, ref_list)
           self.set('bol', element.serialize_bool(self.bol))
+          self.set('group_index', element.serialize_int(self.group_index))
+          self.set(
+            'groups0',
+            ' '.join([element.serialize_ref(i, ref_list) for i in self.groups0])
+          )
+          self.set(
+            'groups1',
+            ' '.join([element.serialize_ref(i, ref_list) for i in self.groups1])
+          )
           self.set('action', element.serialize_int(self.action))
         def deserialize(self, ref_list):
           element.Element.deserialize(self, ref_list)
           self.bol = element.deserialize_bool(self.get('bol', 'false'))
+          self.group_index = element.deserialize_int(self.get('group_index', '-1'))
+          self.groups0 = [
+            element.deserialize_ref(i, ref_list)
+            for i in self.get('groups0', '').split()
+          ]
+          self.groups1 = [
+            element.deserialize_ref(i, ref_list)
+            for i in self.get('groups1', '').split()
+          ]
           self.action = element.deserialize_int(self.get('action', '-1'))
         def copy(self, factory = None):
           result = element.Element.copy(
@@ -4032,6 +4060,9 @@ class AST(element.Element):
             FLexRule if factory is None else factory
           )
           result.bol = self.bol
+          result.group_index = self.group_index
+          result.groups0 = self.groups0
+          result.groups1 = self.groups1
           result.action = self.action
           return result
         def repr_serialize(self, params):
@@ -4040,6 +4071,22 @@ class AST(element.Element):
             params.append(
               'bol = {0:s}'.format(repr(self.bol))
             )
+          if self.group_index != -1:
+            params.append(
+              'group_index = {0:s}'.format(repr(self.group_index))
+            )
+          if len(self.groups0):
+            params.append(
+              'groups0 = [{0:s}]'.format(
+                ', '.join([repr(i) for i in self.groups0])
+              )
+            )
+          if len(self.groups1):
+            params.append(
+              'groups1 = [{0:s}]'.format(
+                ', '.join([repr(i) for i in self.groups1])
+              )
+            )
           if self.action != -1:
             params.append(
               'action = {0:s}'.format(repr(self.action))
@@ -4069,10 +4116,14 @@ class AST(element.Element):
             if not self.bol:
               _ast.start_conditions[i].rules.append(self)
             _ast.start_conditions[i].bol_rules.append(self)
+          self.groups0 = []
           self[0].post_process(
+            self.groups0,
             caseless = _ast[0].caseless
           ) # regex
+          self.groups1 = []
           self[1].post_process(
+            self.groups1,
             caseless = _ast[0].caseless
           ) # trailing context regex
           self.action = len(_ast.actions_text)
@@ -4099,6 +4150,9 @@ class AST(element.Element):
             #    element.get_text(_ast.actions_text[-1], 0)
             #  )
             #)
+          _ast.flex_rules.append(self)
+          self.group_index = _ast.n_groups
+          _ast.n_groups += len(self.groups0) + 1 + len(self.groups1)
           return continued_action
  
       # GENERATE ELEMENT() BEGIN
@@ -4261,7 +4315,7 @@ class AST(element.Element):
       return 'ast.AST.Section3({0:s})'.format(', '.join(params))
     # GENERATE END
 
-  # GENERATE ELEMENT(list(ref) start_conditions, list(ref) actions_text, list(ref) eof_actions_text, int default_action) BEGIN
+  # GENERATE ELEMENT(list(ref) start_conditions, list(ref) actions_text, list(ref) eof_actions_text, int default_action, list(ref) flex_rules, int n_groups) BEGIN
   def __init__(
     self,
     tag = 'AST',
@@ -4271,7 +4325,9 @@ class AST(element.Element):
     start_conditions = [],
     actions_text = [],
     eof_actions_text = [],
-    default_action = -1
+    default_action = -1,
+    flex_rules = [],
+    n_groups = -1
   ):
     element.Element.__init__(
       self,
@@ -4288,6 +4344,12 @@ class AST(element.Element):
     if isinstance(default_action, str) else
       default_action
     )
+    self.flex_rules = flex_rules
+    self.n_groups = (
+      element.deserialize_int(n_groups)
+    if isinstance(n_groups, str) else
+      n_groups
+    )
   def serialize(self, ref_list):
     element.Element.serialize(self, ref_list)
     self.set(
@@ -4303,6 +4365,11 @@ class AST(element.Element):
       ' '.join([element.serialize_ref(i, ref_list) for i in self.eof_actions_text])
     )
     self.set('default_action', element.serialize_int(self.default_action))
+    self.set(
+      'flex_rules',
+      ' '.join([element.serialize_ref(i, ref_list) for i in self.flex_rules])
+    )
+    self.set('n_groups', element.serialize_int(self.n_groups))
   def deserialize(self, ref_list):
     element.Element.deserialize(self, ref_list)
     self.start_conditions = [
@@ -4318,6 +4385,11 @@ class AST(element.Element):
       for i in self.get('eof_actions_text', '').split()
     ]
     self.default_action = element.deserialize_int(self.get('default_action', '-1'))
+    self.flex_rules = [
+      element.deserialize_ref(i, ref_list)
+      for i in self.get('flex_rules', '').split()
+    ]
+    self.n_groups = element.deserialize_int(self.get('n_groups', '-1'))
   def copy(self, factory = None):
     result = element.Element.copy(
       self,
@@ -4327,6 +4399,8 @@ class AST(element.Element):
     result.actions_text = self.actions_text
     result.eof_actions_text = self.eof_actions_text
     result.default_action = self.default_action
+    result.flex_rules = self.flex_rules
+    result.n_groups = self.n_groups
     return result
   def repr_serialize(self, params):
     element.Element.repr_serialize(self, params)
@@ -4352,6 +4426,16 @@ class AST(element.Element):
       params.append(
         'default_action = {0:s}'.format(repr(self.default_action))
       )
+    if len(self.flex_rules):
+      params.append(
+        'flex_rules = [{0:s}]'.format(
+          ', '.join([repr(i) for i in self.flex_rules])
+        )
+      )
+    if self.n_groups != -1:
+      params.append(
+        'n_groups = {0:s}'.format(repr(self.n_groups))
+      )
   def __repr__(self):
     params = []
     self.repr_serialize(params)
@@ -4373,6 +4457,8 @@ class AST(element.Element):
     self.eof_actions_text = [
       AST.Text(text = '\t\t\t\tyyterminate();\n')
     ]
+    self.flex_rules = []
+    self.n_groups = 0
 
     # variables that won't be serialized
     name_to_start_condition = {'INITIAL': 0}
@@ -4402,12 +4488,18 @@ class AST(element.Element):
         )
       )
     )
-  def to_nfa(self):
+  def to_nfa(self, group_ref_data):
     _nfa = nfa.NFA()
     for i in self.start_conditions:
-      for j in range(2):
-        _regex = regex.RegexNone()
-        for k in [i.rules, i.bol_rules][j]:
+      for j in [i.rules, i.bol_rules]:
+        _regex = regex.RegexNone(
+          n_groups = 0
+        )
+        regex_group_ref_data = []
+        for k in j:
+          ng = _regex.n_groups
+          ng0 = k[0].n_groups
+          ng1 = k[1].n_groups
           _regex = regex.RegexOr(
             children = [
               _regex,
@@ -4418,21 +4510,32 @@ class AST(element.Element):
                     children = [
                       k[1]
                     ],
-                    index = k.action
+                    n_groups = 1 + ng1
                   )
-                ]
+                ],
+                n_groups = ng0 + 1 + ng1
               )
+            ],
+            n_groups = ng + ng0 + 1 + ng1
+          )
+          regex_group_ref_data.extend(
+            group_ref_data[
+              k.group_index:
+              k.group_index + len(k.groups0) + 1 + len(k.groups1)
             ]
           )
+        ng = _regex.n_groups
         _regex = regex.RegexAnd(
           children = [
             regex.RegexRepeat(
-              count0 = 0,
               children = [
                 regex.RegexCharacter(
-                  character_set = [0, 0x100]
+                  character_set = [0, 0x100],
+                  n_groups = 0
                 )
-              ]
+              ],
+              n_groups = 0,
+              count0 = 0
             ),
             regex.RegexOr(
               children = [
@@ -4440,21 +4543,28 @@ class AST(element.Element):
                 regex.RegexSequence(
                   children = [
                     regex.RegexCharacter(
-                      character_set = [0, 0x100]
+                      character_set = [0, 0x100],
+                      n_groups = 0
                     ),
                     regex.RegexGroup(
                       children = [
-                        regex.RegexEmpty()
+                        regex.RegexEmpty(
+                          n_groups = 0
+                        )
                       ],
-                      index = self.default_action
+                      n_groups = 1
                     )
-                  ]
+                  ],
+                  n_groups = 1
                 )
-              ]
+              ],
+              n_groups = ng + 1
             )
-          ]
+          ],
+          n_groups = ng + 1
         )
-        _regex.add_to_nfa(_nfa)
+        regex_group_ref_data.append(group_ref_data[-1])
+        _regex.add_to_nfa(_nfa, regex_group_ref_data)
     return _nfa
 
 # GENERATE FACTORY(regex.factory) BEGIN
diff --git a/dfa.py b/dfa.py
index 4e28ce8..2ba88f8 100644 (file)
--- a/dfa.py
+++ b/dfa.py
@@ -31,20 +31,15 @@ class DFA:
  
   def __init__(
     self,
-    groups = [],
     states = [([n_characters], [0], [0])],
     actions = [(0, [])],
     start_action = [] # can have multiple DFAs in same container
   ):
-    # groups: list of group_desc
-    # group_desc: (tag, kwargs)
-    #   tag, kwargs will be passed to apply_markup() hence factory()
     # states: list of state_desc
     # state_desc: (list of breaks, list of action to do, accept_threads)
     # actions: list of action_desc
     # action_desc: (state to go to next, compiled transition to do first)
     # accept_threads: list of accepting thread numbers (in thread list)
-    self.groups = groups
     self.states = states
     self.actions = actions
     self.start_action = start_action
@@ -444,48 +439,47 @@ class DFA:
       off += 1
     return None
 
-  def yylex(self, root, pos, off, factory, yychunk_iter):
-    if pos < 0:
-      pos, off = element.to_start_relative(root, pos, off)
-
-    while True:
-      # note: pointers must be kept start relative during the below call,
-      # because it extends the following text by calling the yychunk_iter
-      thread = self.match_yychunk(root, pos, off, yychunk_iter)
-      if thread is None:
-        break
-      stack = []
-      while True:
-        pos, off, mark_value, thread = thread
-        group_index = mark_value >> 1
-        if (mark_value & 1) != 0:
-          end_pos, end_off = element.to_end_relative(root, pos, off)
-          stack.append((end_pos, end_off, group_index))
-        else:
-          end_pos, end_off, temp = stack.pop()
-          assert temp == group_index
-          if len(stack) == 0:
-            break
-          tag, kwargs = self.groups[group_index]
-          if tag != '':
-            work.apply_markup(
-              root,
-              pos,
-              off,
-              end_pos,
-              end_off,
-              factory,
-              tag,
-              **kwargs
-            )
-      # note: pointers must be kept end relative during the below call,
-      # because it modifies the preceding text by calling apply_markup()
-      yield end_pos, end_off, group_index
-      pos, off = element.to_start_relative(root, end_pos, end_off)
+  #def yylex(self, root, pos, off, factory, yychunk_iter):
+  #  if pos < 0:
+  #    pos, off = element.to_start_relative(root, pos, off)
+
+  #  while True:
+  #    # note: pointers must be kept start relative during the below call,
+  #    # because it extends the following text by calling the yychunk_iter
+  #    thread = self.match_yychunk(root, pos, off, yychunk_iter)
+  #    if thread is None:
+  #      break
+  #    stack = []
+  #    while True:
+  #      pos, off, mark_value, thread = thread
+  #      group_index = mark_value >> 1
+  #      if (mark_value & 1) != 0:
+  #        end_pos, end_off = element.to_end_relative(root, pos, off)
+  #        stack.append((end_pos, end_off, group_index))
+  #      else:
+  #        end_pos, end_off, temp = stack.pop()
+  #        assert temp == group_index
+  #        if len(stack) == 0:
+  #          break
+  #        tag, kwargs = self.groups[group_index]
+  #        if tag != '':
+  #          work.apply_markup(
+  #            root,
+  #            pos,
+  #            off,
+  #            end_pos,
+  #            end_off,
+  #            factory,
+  #            tag,
+  #            **kwargs
+  #          )
+  #    # note: pointers must be kept end relative during the below call,
+  #    # because it modifies the preceding text by calling apply_markup()
+  #    yield end_pos, end_off, group_index
+  #    pos, off = element.to_start_relative(root, end_pos, end_off)
 
   def __repr__(self):
-    return 'dfa.DFA({0:s}, {1:s}, {2:s}, {3:s})'.format(
-      repr(self.groups),
+    return 'dfa.DFA({0:s}, {1:s}, {2:s})'.format(
       repr(self.states),
       repr(self.actions),
       repr(self.start_action)
diff --git a/generate_py.py b/generate_py.py
index 21c9c5c..bc36709 100644 (file)
--- a/generate_py.py
+++ b/generate_py.py
@@ -1,8 +1,8 @@
 import os
+import regex
 import wrap_repr
 
-def ast_text_to_python(ast_text, indent):
-  text = ast_text.get_text()
+def text_to_python(text, indent):
   text_strip = text.strip()
   if text_strip[:1] == '{' and text_strip[-1:] == '}':
     text = text_strip[1:-1]
@@ -30,8 +30,74 @@ def ast_text_to_python(ast_text, indent):
       lines[j] = '{0:s}{1:s}\n'.format(indent, lines[j][len(prefix):])
   return ''.join(lines)
 
+# note: these routines are literally the same, but conceptually different,
+# because ast.Text and regex.Text are different and unrelated base classes
+def ast_text_to_python(ast_text, indent):
+  return text_to_python(ast_text.get_text(), indent)
+def regex_text_to_python(regex_text, indent):
+  return text_to_python(regex_text.get_text(), indent)
+
 def generate_py(_ast, _element, home_dir, skel_file, out_file):
-  _dfa = _ast.to_nfa().to_dfa()
+  # generate group action function names (ref_data) and body text
+  group_ref_data = []
+  group_actions_text = []
+  group_rules_text = []
+  for i in _ast.flex_rules:
+    # add actions for capturing groups in ordinary regex
+    group_ref_data.extend(
+      [
+        (
+          'yy_group{0:d}'.format(len(group_actions_text) + j),
+          'yy_group_end'
+        )
+        for j in range(len(i.groups0))
+      ]
+    )
+    group_actions_text.extend([j[0] for j in i.groups0])
+
+    # add group for the rule, recognizing this matches the rule
+    group_ref_data.append(
+      (
+        'yy_rule_start',
+        'yy_rule{0:d}'.format(len(group_rules_text))
+      )
+    )
+    group_rules_text.append(
+      regex.Text(
+        text = '''global yy_action
+yy_action = yy_action{0:d}
+'''.format(i.action)
+      )
+    )
+
+    # add actions for capturing groups in trailing context regex
+    group_ref_data.extend(
+      [
+        (
+          'yy_group{0:d}'.format(len(group_actions_text) + j),
+          'yy_group_end'
+        )
+        for j in range(len(i.groups1))
+      ]
+    )
+    group_actions_text.extend([j[0] for j in i.groups1])
+
+  # add group for default rule, recognizing this matches the rule
+  group_ref_data.append(
+    (
+      'yy_rule_start',
+      'yy_rule{0:d}'.format(len(group_rules_text))
+    )
+  )
+  group_rules_text.append(
+    regex.Text(
+      text = '''global yy_action
+yy_action = yy_action{0:d}
+'''.format(_ast.default_action)
+    )
+  )
+
+  _dfa = _ast.to_nfa(group_ref_data).to_dfa()
 
   if skel_file is None:
     skel_file = os.path.join(
@@ -80,28 +146,10 @@ def generate_py(_ast, _element, home_dir, skel_file, out_file):
         elif line == '# GENERATE SECTION2\n':
           fout.write(
             '''# GENERATE SECTION2 BEGIN
-{0:s}{1:s}{2:s}{3:s}{4:s}yy_actions = [{5:s}
-]
-{6:s}yy_eof_actions = [{7:s}
+{0:s}{1:s}{2:s}{3:s}{4:s}{5:s}{6:s}yy_eof_actions = [{7:s}
 ]
 # GENERATE END
 '''.format(
-              wrap_repr.wrap_repr(
-                'yy_dfa_groups = {0:s}'.format(repr(_dfa.groups)),
-                79
-              ),
-              wrap_repr.wrap_repr(
-                'yy_dfa_states = {0:s}'.format(repr(_dfa.states)),
-                79
-              ),
-              wrap_repr.wrap_repr(
-                'yy_dfa_actions = {0:s}'.format(repr(_dfa.actions)),
-                79
-              ),
-              wrap_repr.wrap_repr(
-                'yy_dfa_start_action = {0:s}'.format(repr(_dfa.start_action)),
-                79
-              ),
               ''.join(
                 [
                   '''def yy_action{0:d}():
@@ -113,12 +161,40 @@ def generate_py(_ast, _element, home_dir, skel_file, out_file):
                   for i in range(len(_ast.actions_text))
                 ]
               ),
-              ','.join(
+              ''.join(
                 [
-                  '\n  yy_action{0:d}'.format(i)
-                  for i in range(len(_ast.actions_text))
+                  '''def yy_rule{0:d}(match, pos):
+{1:s}'''.format(
+                    i,
+                    regex_text_to_python(group_rules_text[i], '  ')
+                  )
+                  for i in range(len(group_rules_text))
                 ]
               ),
+              ''.join(
+                [
+                  '''def yy_group{0:d}(match, pos):
+{1:s}'''.format(
+                    i,
+                    regex_text_to_python(group_actions_text[i], '  ')
+                  )
+                  for i in range(len(group_actions_text))
+                ]
+              ),
+              wrap_repr.wrap_repr(
+                'yy_dfa_states = {0:s}'.format(repr(_dfa.states)),
+                79
+              ),
+              wrap_repr.wrap_repr(
+                'yy_dfa_actions = {0:s}'.format(
+                  repr(_dfa.actions).replace('\'', '')
+                ),
+                79
+              ),
+              wrap_repr.wrap_repr(
+                'yy_dfa_start_action = {0:s}'.format(repr(_dfa.start_action)),
+                79
+              ),
               ''.join(
                 [
                   '''def yy_eof_action{0:d}():
diff --git a/nfa.py b/nfa.py
index d55ebae..b101906 100644 (file)
--- a/nfa.py
+++ b/nfa.py
@@ -36,14 +36,9 @@ class NFA:
 
   def __init__(
     self,
-    groups = [],
     states = [(STATE_CHARACTER, [0, n_characters], 0)],
     start_state = [] # can have multiple NFAs in same container
   ):
-    # groups: list of group_desc
-    # group_desc: (tag, kwargs)
-    #   tag, kwargs will be passed to apply_markup() hence factory()
-    self.groups = groups
     self.states = states
     self.start_state = start_state
 
@@ -350,7 +345,7 @@ class NFA:
     return None
 
   def to_dfa(self):
-    _dfa = dfa.DFA(list(self.groups))
+    _dfa = dfa.DFA()
 
     accept_key = (NFA.accept_multistate, ())
     action_to_meaning = [accept_key]
@@ -442,8 +437,7 @@ class NFA:
     return _dfa
 
   def __repr__(self):
-    return 'nfa.NFA({0:s}, {1:s}, {2:s})'.format(
-      repr(self.groups),
+    return 'nfa.NFA({0:s}, {1:s})'.format(
       repr(self.states),
       repr(self.start_state)
     )
diff --git a/regex.py b/regex.py
index 0fc24a5..1989a7b 100644 (file)
--- a/regex.py
+++ b/regex.py
@@ -6,13 +6,14 @@ import nfa
 n_characters = 0x100
 
 class Regex(element.Element):
-  # GENERATE ELEMENT() BEGIN
+  # GENERATE ELEMENT(int n_groups) BEGIN
   def __init__(
     self,
     tag = 'Regex',
     attrib = {},
     text = '',
-    children = []
+    children = [],
+    n_groups = -1
   ):
     element.Element.__init__(
       self,
@@ -21,28 +22,44 @@ class Regex(element.Element):
       text,
       children
     )
+    self.n_groups = (
+      element.deserialize_int(n_groups)
+    if isinstance(n_groups, str) else
+      n_groups
+    )
+  def serialize(self, ref_list):
+    element.Element.serialize(self, ref_list)
+    self.set('n_groups', element.serialize_int(self.n_groups))
+  def deserialize(self, ref_list):
+    element.Element.deserialize(self, ref_list)
+    self.n_groups = element.deserialize_int(self.get('n_groups', '-1'))
   def copy(self, factory = None):
     result = element.Element.copy(
       self,
       Regex if factory is None else factory
     )
+    result.n_groups = self.n_groups
     return result
+  def repr_serialize(self, params):
+    element.Element.repr_serialize(self, params)
+    if self.n_groups != -1:
+      params.append(
+        'n_groups = {0:s}'.format(repr(self.n_groups))
+      )
   def __repr__(self):
     params = []
     self.repr_serialize(params)
     return 'regex.Regex({0:s})'.format(', '.join(params))
   # GENERATE END
-  def post_process(self, group_index = 0, caseless = False):
+  def post_process(self, groups, caseless = False):
+    self.n_groups = 0
     for i in self:
-      group_index = i.post_process(group_index, caseless)
-    return group_index
-  def add_to_groups(self, groups):
-    for i in self:
-      i.add_to_groups(groups)
-  def to_nfa_state(self, _nfa, next_state):
+      i.post_process(groups, caseless)
+      self.n_groups += i.n_groups
+  def to_nfa_state(self, _nfa, group_ref_data, group_index, next_state):
     raise NotImplementedError
-  def add_to_nfa(self, _nfa):
-    _nfa.start_state.append(self.to_nfa_state(_nfa, 0))
+  def add_to_nfa(self, _nfa, group_ref_data):
+    _nfa.start_state.append(self.to_nfa_state(_nfa, group_ref_data, 0, 0))
 
 class RegexNone(Regex):
   # GENERATE ELEMENT() BEGIN
@@ -51,14 +68,16 @@ class RegexNone(Regex):
     tag = 'RegexNone',
     attrib = {},
     text = '',
-    children = []
+    children = [],
+    n_groups = -1
   ):
     Regex.__init__(
       self,
       tag,
       attrib,
       text,
-      children
+      children,
+      n_groups
     )
   def copy(self, factory = None):
     result = Regex.copy(
@@ -71,7 +90,7 @@ class RegexNone(Regex):
     self.repr_serialize(params)
     return 'regex.RegexNone({0:s})'.format(', '.join(params))
   # GENERATE END
-  def to_nfa_state(self, _nfa, next_state):
+  def to_nfa_state(self, _nfa, group_ref_data, group_index, next_state):
     return -1
 
 class RegexEmpty(Regex):
@@ -81,14 +100,16 @@ class RegexEmpty(Regex):
     tag = 'RegexEmpty',
     attrib = {},
     text = '',
-    children = []
+    children = [],
+    n_groups = -1
   ):
     Regex.__init__(
       self,
       tag,
       attrib,
       text,
-      children
+      children,
+      n_groups
     )
   def copy(self, factory = None):
     result = Regex.copy(
@@ -101,7 +122,7 @@ class RegexEmpty(Regex):
     self.repr_serialize(params)
     return 'regex.RegexEmpty({0:s})'.format(', '.join(params))
   # GENERATE END
-  def to_nfa_state(self, _nfa, next_state):
+  def to_nfa_state(self, _nfa, group_ref_data, group_index, next_state):
     return next_state
 
 class RegexCharacter(Regex):
@@ -112,6 +133,7 @@ class RegexCharacter(Regex):
     attrib = {},
     text = '',
     children = [],
+    n_groups = -1,
     character_set = []
   ):
     Regex.__init__(
@@ -119,7 +141,8 @@ class RegexCharacter(Regex):
       tag,
       attrib,
       text,
-      children
+      children,
+      n_groups
     )
     self.character_set = (
       [element.deserialize_int(i) for i in character_set.split()]
@@ -158,8 +181,8 @@ class RegexCharacter(Regex):
     self.repr_serialize(params)
     return 'regex.RegexCharacter({0:s})'.format(', '.join(params))
   # GENERATE END
-  def post_process(self, group_index = 0, caseless = False):
-    group_index = Regex.post_process(self, group_index, caseless)
+  def post_process(self, groups, caseless = False):
+    Regex.post_process(self, groups, caseless)
     if caseless:
       temp = bisect_set.bisect_set_and(
         self.character_set,
@@ -170,10 +193,11 @@ class RegexCharacter(Regex):
         [i ^ 0x20 for i in temp if i >= 0x60] +
         [i ^ 0x20 for i in temp if i < 0x60]
       )
-    return group_index
-  def to_nfa_state(self, _nfa, next_state):
+  def to_nfa_state(self, _nfa, group_ref_data, group_index, next_state):
     new_state = len(_nfa.states)
-    _nfa.states.append((nfa.NFA.STATE_CHARACTER, self.character_set, next_state))
+    _nfa.states.append(
+      (nfa.NFA.STATE_CHARACTER, self.character_set, next_state)
+    )
     return new_state
 
 class RegexCharacterLiteral(RegexCharacter):
@@ -184,6 +208,7 @@ class RegexCharacterLiteral(RegexCharacter):
     attrib = {},
     text = '',
     children = [],
+    n_groups = -1,
     character_set = []
   ):
     RegexCharacter.__init__(
@@ -192,6 +217,7 @@ class RegexCharacterLiteral(RegexCharacter):
       attrib,
       text,
       children,
+      n_groups,
       character_set
     )
   def copy(self, factory = None):
@@ -205,8 +231,8 @@ class RegexCharacterLiteral(RegexCharacter):
     self.repr_serialize(params)
     return 'regex.RegexCharacterLiteral({0:s})'.format(', '.join(params))
   # GENERATE END
-  def post_process(self, group_index = 1, caseless = False):
-    group_index = RegexCharacter.post_process(self, group_index, False)
+  def post_process(self, groups, caseless = False):
+    RegexCharacter.post_process(self, groups, False)
     if caseless:
       temp = bisect_set.bisect_set_and(
         self.character_set,
@@ -217,7 +243,6 @@ class RegexCharacterLiteral(RegexCharacter):
         [i ^ 0x20 for i in temp if i >= 0x60] +
         [i ^ 0x20 for i in temp if i < 0x60]
       )
-    return group_index
 
 class RegexCharacterRange(RegexCharacter):
   # GENERATE ELEMENT() BEGIN
@@ -227,6 +252,7 @@ class RegexCharacterRange(RegexCharacter):
     attrib = {},
     text = '',
     children = [],
+    n_groups = -1,
     character_set = []
   ):
     RegexCharacter.__init__(
@@ -235,6 +261,7 @@ class RegexCharacterRange(RegexCharacter):
       attrib,
       text,
       children,
+      n_groups,
       character_set
     )
   def copy(self, factory = None):
@@ -248,8 +275,8 @@ class RegexCharacterRange(RegexCharacter):
     self.repr_serialize(params)
     return 'regex.RegexCharacterRange({0:s})'.format(', '.join(params))
   # GENERATE END
-  def post_process(self, group_index = 0, caseless = False):
-    group_index = RegexCharacter.post_process(self, group_index, False)
+  def post_process(self, groups, caseless = False):
+    RegexCharacter.post_process(self, groups, False)
     self.character_set = [self[0].character_set[0], self[1].character_set[-1]]
     if caseless:
       temp = bisect_set.bisect_set_and(
@@ -261,7 +288,6 @@ class RegexCharacterRange(RegexCharacter):
         [i ^ 0x20 for i in temp if i >= 0x60] +
         [i ^ 0x20 for i in temp if i < 0x60]
       )
-    return group_index
 
 class RegexCharacterOr(RegexCharacter):
   # GENERATE ELEMENT() BEGIN
@@ -271,6 +297,7 @@ class RegexCharacterOr(RegexCharacter):
     attrib = {},
     text = '',
     children = [],
+    n_groups = -1,
     character_set = []
   ):
     RegexCharacter.__init__(
@@ -279,6 +306,7 @@ class RegexCharacterOr(RegexCharacter):
       attrib,
       text,
       children,
+      n_groups,
       character_set
     )
   def copy(self, factory = None):
@@ -292,10 +320,12 @@ class RegexCharacterOr(RegexCharacter):
     self.repr_serialize(params)
     return 'regex.RegexCharacterOr({0:s})'.format(', '.join(params))
   # GENERATE END
-  def post_process(self, group_index = 0, caseless = False):
-    group_index = RegexCharacter.post_process(self, group_index, caseless)
-    self.character_set = bisect_set.bisect_set_or(self[0].character_set, self[1].character_set)
-    return group_index
+  def post_process(self, groups, caseless = False):
+    RegexCharacter.post_process(self, groups, caseless)
+    self.character_set = bisect_set.bisect_set_or(
+      self[0].character_set,
+      self[1].character_set
+    )
 
 class RegexCharacterAnd(RegexCharacter):
   # GENERATE ELEMENT() BEGIN
@@ -305,6 +335,7 @@ class RegexCharacterAnd(RegexCharacter):
     attrib = {},
     text = '',
     children = [],
+    n_groups = -1,
     character_set = []
   ):
     RegexCharacter.__init__(
@@ -313,6 +344,7 @@ class RegexCharacterAnd(RegexCharacter):
       attrib,
       text,
       children,
+      n_groups,
       character_set
     )
   def copy(self, factory = None):
@@ -326,10 +358,12 @@ class RegexCharacterAnd(RegexCharacter):
     self.repr_serialize(params)
     return 'regex.RegexCharacterAnd({0:s})'.format(', '.join(params))
   # GENERATE END
-  def post_process(self, group_index = 0, caseless = False):
-    group_index = RegexCharacter.post_process(self, group_index, caseless)
-    self.character_set = bisect_set.bisect_set_and(self[0].character_set, self[1].character_set)
-    return group_index
+  def post_process(self, groups, caseless = False):
+    RegexCharacter.post_process(self, groups, caseless)
+    self.character_set = bisect_set.bisect_set_and(
+      self[0].character_set,
+      self[1].character_set
+    )
 
 class RegexCharacterNot(RegexCharacter):
   # GENERATE ELEMENT() BEGIN
@@ -339,6 +373,7 @@ class RegexCharacterNot(RegexCharacter):
     attrib = {},
     text = '',
     children = [],
+    n_groups = -1,
     character_set = []
   ):
     RegexCharacter.__init__(
@@ -347,6 +382,7 @@ class RegexCharacterNot(RegexCharacter):
       attrib,
       text,
       children,
+      n_groups,
       character_set
     )
   def copy(self, factory = None):
@@ -360,10 +396,9 @@ class RegexCharacterNot(RegexCharacter):
     self.repr_serialize(params)
     return 'regex.RegexCharacterNot({0:s})'.format(', '.join(params))
   # GENERATE END
-  def post_process(self, group_index = 0, caseless = False):
-    group_index = RegexCharacter.post_process(self, group_index, caseless)
+  def post_process(self, groups, caseless = False):
+    RegexCharacter.post_process(self, groups, caseless)
     self.character_set = bisect_set.bisect_set_not(self[0].character_set)
-    return group_index
 
 class RegexOr(Regex):
   # GENERATE ELEMENT() BEGIN
@@ -372,14 +407,16 @@ class RegexOr(Regex):
     tag = 'RegexOr',
     attrib = {},
     text = '',
-    children = []
+    children = [],
+    n_groups = -1
   ):
     Regex.__init__(
       self,
       tag,
       attrib,
       text,
-      children
+      children,
+      n_groups
     )
   def copy(self, factory = None):
     result = Regex.copy(
@@ -392,9 +429,19 @@ class RegexOr(Regex):
     self.repr_serialize(params)
     return 'regex.RegexOr({0:s})'.format(', '.join(params))
   # GENERATE END
-  def to_nfa_state(self, _nfa, next_state):
-    child0_state = self[0].to_nfa_state(_nfa, next_state)
-    child1_state = self[1].to_nfa_state(_nfa, next_state)
+  def to_nfa_state(self, _nfa, group_ref_data, group_index, next_state):
+    child0_state = self[0].to_nfa_state(
+      _nfa,
+      group_ref_data,
+      group_index,
+      next_state
+    )
+    child1_state = self[1].to_nfa_state(
+      _nfa,
+      group_ref_data,
+      group_index + self[0].n_groups,
+      next_state
+    )
     if child0_state == -1:
       return child1_state
     if child1_state == -1:
@@ -410,14 +457,16 @@ class RegexAnd(Regex):
     tag = 'RegexAnd',
     attrib = {},
     text = '',
-    children = []
+    children = [],
+    n_groups = -1
   ):
     Regex.__init__(
       self,
       tag,
       attrib,
       text,
-      children
+      children,
+      n_groups
     )
   def copy(self, factory = None):
     result = Regex.copy(
@@ -430,15 +479,25 @@ class RegexAnd(Regex):
     self.repr_serialize(params)
     return 'regex.RegexAnd({0:s})'.format(', '.join(params))
   # GENERATE END
-  def to_nfa_state(self, _nfa, next_state):
+  def to_nfa_state(self, _nfa, group_ref_data, group_index, next_state):
     join0_state = len(_nfa.states)
-    _nfa.states.append(nfa.NFA.join0_state) # takes no arguments so use static one
+    _nfa.states.append(nfa.NFA.join0_state) # no arguments so use static one
     join1_state = len(_nfa.states)
     _nfa.states.append((nfa.NFA.STATE_JOIN1, next_state))
-    child0_state = self[0].to_nfa_state(_nfa, join0_state)
+    child0_state = self[0].to_nfa_state(
+      _nfa,
+      group_ref_data,
+      group_index,
+      join0_state
+    )
     if child0_state == -1:
       return -1
-    child1_state = self[1].to_nfa_state(_nfa, join1_state)
+    child1_state = self[1].to_nfa_state(
+      _nfa,
+      group_ref_data,
+      group_index + self[0].n_groups,
+      join1_state
+    )
     if child1_state == -1:
       return -1
     new_state = len(_nfa.states)
@@ -452,14 +511,16 @@ class RegexSequence(Regex):
     tag = 'RegexSequence',
     attrib = {},
     text = '',
-    children = []
+    children = [],
+    n_groups = -1
   ):
     Regex.__init__(
       self,
       tag,
       attrib,
       text,
-      children
+      children,
+      n_groups
     )
   def copy(self, factory = None):
     result = Regex.copy(
@@ -472,11 +533,21 @@ class RegexSequence(Regex):
     self.repr_serialize(params)
     return 'regex.RegexSequence({0:s})'.format(', '.join(params))
   # GENERATE END
-  def to_nfa_state(self, _nfa, next_state):
-    child1_state = self[1].to_nfa_state(_nfa, next_state)
-    if child1_state == -1:
+  def to_nfa_state(self, _nfa, group_ref_data, group_index, next_state):
+    next_state = self[1].to_nfa_state(
+      _nfa,
+      group_ref_data,
+      group_index + self[0].n_groups,
+      next_state
+    )
+    if next_state == -1:
       return -1
-    return self[0].to_nfa_state(_nfa, child1_state)
+    return self[0].to_nfa_state(
+      _nfa,
+      group_ref_data,
+      group_index,
+      next_state
+    )
 
 class RegexRepeat(Regex):
   # GENERATE ELEMENT(int count0, int count1, bool non_greedy) BEGIN
@@ -486,6 +557,7 @@ class RegexRepeat(Regex):
     attrib = {},
     text = '',
     children = [],
+    n_groups = -1,
     count0 = -1,
     count1 = -1,
     non_greedy = False
@@ -495,7 +567,8 @@ class RegexRepeat(Regex):
       tag,
       attrib,
       text,
-      children
+      children,
+      n_groups
     )
     self.count0 = (
       element.deserialize_int(count0)
@@ -550,7 +623,7 @@ class RegexRepeat(Regex):
     self.repr_serialize(params)
     return 'regex.RegexRepeat({0:s})'.format(', '.join(params))
   # GENERATE END
-  def post_process(self, group_index = 0, caseless = False):
+  def post_process(self, groups, caseless = False):
     # total hack which will be done in a Python action in future
     if len(self) >= 2:
       assert self[1].tag == 'Number'
@@ -562,190 +635,256 @@ class RegexRepeat(Regex):
         self.count1 = self.count0
       del self[1:]
     # end total hack
-    return Regex.post_process(self, group_index, caseless)
-  def to_nfa_state(self, _nfa, next_state):
+    Regex.post_process(self, groups, caseless)
+  def to_nfa_state(self, _nfa, group_ref_data, group_index, next_state):
     count0 = self.count0
     count1 = self.count1
     if count1 == -1:
       new_state = len(_nfa.states)
       _nfa.states.append(None)
-      child_state = self[0].to_nfa_state(_nfa, new_state)
-      if child_state == -1:
-        new_state = next_state # note: unreachable state remains invalid (None)
-      else:
+      child_state = self[0].to_nfa_state(
+        _nfa,
+        group_ref_data,
+        group_index,
+        new_state
+      )
+      if child_state != -1:
         _nfa.states[new_state] = (
           (nfa.NFA.STATE_OR, next_state, child_state)
         if self.non_greedy else
           (nfa.NFA.STATE_OR, child_state, next_state)
         )
+        next_state = new_state
     else:
-      new_state = next_state
+      done_state = next_state
       for i in range(count1 - count0):
-        child_state = self[0].to_nfa_state(_nfa, new_state)
+        child_state = self[0].to_nfa_state(
+          _nfa,
+          group_ref_data,
+          group_index,
+          next_state
+        )
         if child_state == -1:
           break
         new_state = len(_nfa.states)
         _nfa.states.append(
-          (nfa.NFA.STATE_OR, next_state, child_state)
+          (nfa.NFA.STATE_OR, done_state, child_state)
         if self.non_greedy else
-          (nfa.NFA.STATE_OR, child_state, next_state)
+          (nfa.NFA.STATE_OR, child_state, done_state)
         )
+        next_state = new_state
     for i in range(count0):
-      new_state = self[0].to_nfa_state(_nfa, new_state)
-      if new_state == -1:
+      next_state = self[0].to_nfa_state(
+        _nfa,
+        group_ref_data,
+        group_index,
+        next_state
+      )
+      if next_state == -1:
         break
-    return new_state
+    return next_state
 
 class RegexGroup(Regex):
-  class Attribute(element.Element):
-    # GENERATE ELEMENT(str name, str value) BEGIN
+  # GENERATE ELEMENT() BEGIN
+  def __init__(
+    self,
+    tag = 'RegexGroup',
+    attrib = {},
+    text = '',
+    children = [],
+    n_groups = -1
+  ):
+    Regex.__init__(
+      self,
+      tag,
+      attrib,
+      text,
+      children,
+      n_groups
+    )
+  def copy(self, factory = None):
+    result = Regex.copy(
+      self,
+      RegexGroup if factory is None else factory
+    )
+    return result
+  def __repr__(self):
+    params = []
+    self.repr_serialize(params)
+    return 'regex.RegexGroup({0:s})'.format(', '.join(params))
+  # GENERATE END
+  def post_process(self, groups, caseless = False):
+    # we use -1 here because named or action groups use self[0] for text
+    groups.append(self)
+    self[-1].post_process(groups, caseless)
+    self.n_groups = self[-1].n_groups + 1
+  def to_nfa_state(self, _nfa, group_ref_data, group_index, next_state):
+    new_state = len(_nfa.states)
+    _nfa.states.append(
+      (nfa.NFA.STATE_MARK, group_ref_data[group_index][1], next_state)
+    )
+    next_state = new_state
+    next_state = self[-1].to_nfa_state(
+      _nfa,
+      group_ref_data,
+      group_index + 1,
+      next_state
+    )
+    if next_state == -1:
+      return -1
+    new_state = len(_nfa.states)
+    _nfa.states.append(
+      (nfa.NFA.STATE_MARK, group_ref_data[group_index][0], next_state)
+    )
+    return new_state
+
+# internal base class
+class Text(element.Element):
+  # GENERATE ELEMENT() BEGIN
+  def __init__(
+    self,
+    tag = 'Text',
+    attrib = {},
+    text = '',
+    children = []
+  ):
+    element.Element.__init__(
+      self,
+      tag,
+      attrib,
+      text,
+      children
+    )
+  def copy(self, factory = None):
+    result = element.Element.copy(
+      self,
+      Text if factory is None else factory
+    )
+    return result
+  def __repr__(self):
+    params = []
+    self.repr_serialize(params)
+    return 'regex.Text({0:s})'.format(', '.join(params))
+  # GENERATE END
+  def get_text(self):
+    return element.get_text(self, 0)
+
+class RegexGroupName(RegexGroup):
+  class Text(Text):
+    # GENERATE ELEMENT() BEGIN
     def __init__(
       self,
-      tag = 'RegexGroup_Attribute',
+      tag = 'RegexGroupName_Text',
       attrib = {},
       text = '',
-      children = [],
-      name = '',
-      value = ''
+      children = []
     ):
-      element.Element.__init__(
+      Text.__init__(
         self,
         tag,
         attrib,
         text,
         children
       )
-      self.name = name
-      self.value = value
-    def serialize(self, ref_list):
-      element.Element.serialize(self, ref_list)
-      self.set('name', element.serialize_str(self.name))
-      self.set('value', element.serialize_str(self.value))
-    def deserialize(self, ref_list):
-      element.Element.deserialize(self, ref_list)
-      self.name = element.deserialize_str(self.get('name', ''))
-      self.value = element.deserialize_str(self.get('value', ''))
     def copy(self, factory = None):
-      result = element.Element.copy(
+      result = Text.copy(
         self,
-        Attribute if factory is None else factory
+        Text if factory is None else factory
       )
-      result.name = self.name
-      result.value = self.value
       return result
-    def repr_serialize(self, params):
-      element.Element.repr_serialize(self, params)
-      if self.name != '':
-        params.append(
-          'name = {0:s}'.format(repr(self.name))
-        )
-      if self.value != '':
-        params.append(
-          'value = {0:s}'.format(repr(self.value))
-        )
     def __repr__(self):
       params = []
       self.repr_serialize(params)
-      return 'regex.RegexGroup.Attribute({0:s})'.format(', '.join(params))
+      return 'regex.RegexGroupName.Text({0:s})'.format(', '.join(params))
     # GENERATE END
 
-  # GENERATE ELEMENT(int index, str name, list(ref) attributes) BEGIN
+  # GENERATE ELEMENT() BEGIN
   def __init__(
     self,
-    tag = 'RegexGroup',
+    tag = 'RegexGroupName',
     attrib = {},
     text = '',
     children = [],
-    index = -1,
-    name = '',
-    attributes = []
+    n_groups = -1
   ):
-    Regex.__init__(
+    RegexGroup.__init__(
       self,
       tag,
       attrib,
       text,
-      children
-    )
-    self.index = (
-      element.deserialize_int(index)
-    if isinstance(index, str) else
-      index
-    )
-    self.name = name
-    self.attributes = attributes
-  def serialize(self, ref_list):
-    Regex.serialize(self, ref_list)
-    self.set('index', element.serialize_int(self.index))
-    self.set('name', element.serialize_str(self.name))
-    self.set(
-      'attributes',
-      ' '.join([element.serialize_ref(i, ref_list) for i in self.attributes])
+      children,
+      n_groups
     )
-  def deserialize(self, ref_list):
-    Regex.deserialize(self, ref_list)
-    self.index = element.deserialize_int(self.get('index', '-1'))
-    self.name = element.deserialize_str(self.get('name', ''))
-    self.attributes = [
-      element.deserialize_ref(i, ref_list)
-      for i in self.get('attributes', '').split()
-    ]
   def copy(self, factory = None):
-    result = Regex.copy(
+    result = RegexGroup.copy(
       self,
-      RegexGroup if factory is None else factory
+      RegexGroupName if factory is None else factory
     )
-    result.index = self.index
-    result.name = self.name
-    result.attributes = self.attributes
     return result
-  def repr_serialize(self, params):
-    Regex.repr_serialize(self, params)
-    if self.index != -1:
-      params.append(
-        'index = {0:s}'.format(repr(self.index))
-      )
-    if self.name != '':
-      params.append(
-        'name = {0:s}'.format(repr(self.name))
+  def __repr__(self):
+    params = []
+    self.repr_serialize(params)
+    return 'regex.RegexGroupName({0:s})'.format(', '.join(params))
+  # GENERATE END
+
+class RegexGroupAction(RegexGroup):
+  class Text(Text):
+    # GENERATE ELEMENT() BEGIN
+    def __init__(
+      self,
+      tag = 'RegexGroupAction_Text',
+      attrib = {},
+      text = '',
+      children = []
+    ):
+      Text.__init__(
+        self,
+        tag,
+        attrib,
+        text,
+        children
       )
-    if len(self.attributes):
-      params.append(
-        'attributes = [{0:s}]'.format(
-          ', '.join([repr(i) for i in self.attributes])
-        )
+    def copy(self, factory = None):
+      result = Text.copy(
+        self,
+        RegexGroupAction.Text if factory is None else factory # not base Text
       )
+      return result
+    def __repr__(self):
+      params = []
+      self.repr_serialize(params)
+      return 'regex.RegexGroupAction.Text({0:s})'.format(', '.join(params))
+    # GENERATE END
+
+  # GENERATE ELEMENT() BEGIN
+  def __init__(
+    self,
+    tag = 'RegexGroupAction',
+    attrib = {},
+    text = '',
+    children = [],
+    n_groups = -1
+  ):
+    RegexGroup.__init__(
+      self,
+      tag,
+      attrib,
+      text,
+      children,
+      n_groups
+    )
+  def copy(self, factory = None):
+    result = RegexGroup.copy(
+      self,
+      RegexGroupAction if factory is None else factory
+    )
+    return result
   def __repr__(self):
     params = []
     self.repr_serialize(params)
-    return 'regex.RegexGroup({0:s})'.format(', '.join(params))
+    return 'regex.RegexGroupAction({0:s})'.format(', '.join(params))
   # GENERATE END
-  def post_process(self, group_index = 0, caseless = False):
-    # total hack which will be done in a Python action in future
-    if len(self) >= 2:
-      assert self[0].tag == 'GroupName'
-      self.name = self[0].text[1:-1]
-      del self[:1]
-    # end total hack
-    self.index = group_index
-    group_index += 1
-    return Regex.post_process(self, group_index, caseless)
-  def add_to_groups(self, groups):
-    assert len(groups) == self.index
-    groups.append(
-      (self.name, {i.name: i.value for i in self.attributes})
-    )
-    return Regex.add_to_groups(self, groups)
-  def to_nfa_state(self, _nfa, next_state):
-    mark_state = len(_nfa.states)
-    _nfa.states.append((nfa.NFA.STATE_MARK, self.index * 2 + 1, next_state))
-    child_state = self[0].to_nfa_state(_nfa, mark_state)
-    if child_state == -1:
-      return -1
-    new_state = len(_nfa.states)
-    _nfa.states.append((nfa.NFA.STATE_MARK, self.index * 2, child_state))
-    return new_state
-
 # GENERATE FACTORY(element.Element) BEGIN
 tag_to_class = {
   'Regex': Regex,
@@ -762,7 +901,11 @@ tag_to_class = {
   'RegexSequence': RegexSequence,
   'RegexRepeat': RegexRepeat,
   'RegexGroup': RegexGroup,
-  'RegexGroup_Attribute': RegexGroup.Attribute
+  'Text': Text,
+  'RegexGroupName': RegexGroupName,
+  'RegexGroupName_Text': RegexGroupName.Text,
+  'RegexGroupAction': RegexGroupAction,
+  'RegexGroupAction_Text': RegexGroupAction.Text
 }
 def factory(tag, attrib = {}, *args, **kwargs):
   return tag_to_class.get(tag, element.Element)(tag, attrib, *args, **kwargs)
@@ -905,9 +1048,7 @@ if __name__ == '__main__':
     )
   )
 
-  groups = []
-  _regex.add_to_groups(groups)
-  _nfa = nfa.NFA(groups)
+  _nfa = nfa.NFA()
   _regex.add_to_nfa(_nfa)
   sys.stdout.write(
     wrap_repr.wrap_repr(
index 9430b22..c1cfd46 100644 (file)
@@ -39,7 +39,10 @@ yy_threads1 = [None]
 yy_prefix_slop = 1
 
 yytext = ''
-len_yytext = 0
+yytext_len = 0
+
+yy_action = None
+yy_group_stack = None
 
 def REJECT():
   raise YYReject()
@@ -48,12 +51,14 @@ def yyterminate():
   raise YYTerminate()
 
 def yyless(i):
-  assert len_yytext >= i
+  global yytext, yytext_len
+  assert yytext_len >= i
   yytext = yytext[:i]
-  len_yytext = i
+  yytext_len = i
 
 def unput(text):
-  while len_yytext:
+  global yyin, yytext_len
+  while yytext_len:
     block = yy_buffer_stack[-1].next
     while block is None or block.pos >= len(block.text):
       if block is None:
@@ -63,20 +68,36 @@ def unput(text):
       else:
         block = block.next
         yy_buffer_stack[-1].next = block
-    i = min(len_yytext, len(block.text) - block.pos)
+    i = min(yytext_len, len(block.text) - block.pos)
     block.pos += i
-    len_yytext -= i
+    yytext_len -= i
   yy_buffer_stack[-1].next = YYBufferBlock(yy_buffer_stack[-1].next, 0, text)
 
+def yy_rule_start(match, pos):
+  global yytext, yytext_len
+  yytext = match[:pos]
+  yytext_len = pos
+
+def yy_group_end(match, pos):
+  yy_group_stack.append(pos)
+
 # GENERATE SECTION2
 
 def yylex():
-  global yyin, yy_threads0, yy_threads1, yy_prefix_slop, yytext, len_yytext
+  global \
+    yyin, \
+    yy_threads0, \
+    yy_threads1, \
+    yy_prefix_slop, \
+    yytext, \
+    yytext_len, \
+    yy_action, \
+    yy_group_stack
 
   # GENERATE SECTION2INITIAL
 
   while True:
-    while len_yytext:
+    while yytext_len:
       block = yy_buffer_stack[-1].next
       while block is None or block.pos >= len(block.text):
         if block is None:
@@ -86,10 +107,12 @@ def yylex():
         else:
           block = block.next
           yy_buffer_stack[-1].next = block
-      i = min(len_yytext, len(block.text) - block.pos)
+      i = min(yytext_len, len(block.text) - block.pos)
       block.pos += i
-      len_yytext -= i
-    yytext = ''
+      yytext_len -= i
+
+    match = ''
+    match_len = 0
 
     del yy_threads0[yy_prefix_slop:]
     yy_threads0.append(None)
@@ -121,7 +144,7 @@ def yylex():
           i -= trans[1]
         elif trans[0] == 2: #DFA.TRANSITION_MARK:
           yy_threads0[i:i + trans[1]] = [
-            (len_yytext, trans[2], thread)
+            (match_len, trans[2], thread)
             for thread in yy_threads0[i:i + trans[1]]
           ]
         elif trans[0] == 3: #DFA.TRANSITION_MOVE:
@@ -158,9 +181,9 @@ def yylex():
               block_pos = block.pos
             file_in = yy_buffer_stack[buffer_ptr].file_in
         else:
-          i = len_yytext - len(yytext)
+          i = match_len - len(match)
           if i:
-            yytext += block.text[block_pos - i:]
+            match += block.text[block_pos - i:]
           block_prev = block
           block = block_prev.next
           if block is not None:
@@ -174,7 +197,7 @@ def yylex():
           )
         ]
         block_pos += 1
-        len_yytext += 1
+        match_len += 1
         continue
       # EOF
       if i == 0:
@@ -184,23 +207,30 @@ def yylex():
           return 0
       break
 
-    i = len_yytext - len(yytext)
+    i = match_len - len(match)
     if i:
       assert block is not None
-      yytext += block.text[block_pos - i:]
+      match += block.text[block_pos - i:]
 
     for i in yy_dfa_states[state][2]:
-      _, _, thread = yy_threads0[yy_prefix_slop + i]
+      yytext = None
+      yytext_len = None
+      yy_action = None
+      yy_group_stack = []
+
+      thread = yy_threads0[yy_prefix_slop + i]
       #print('thread', thread)
-      len_yytext, mark, thread = thread
-      assert thread is None
+      while thread is not None:
+        pos, ref_data, thread = thread
+        ref_data(match, pos)
 
-      assert len(yytext) >= len_yytext
-      yytext = yytext[:len_yytext]
       #print('yytext', yytext)
+      #print('yytext_len', yytext_len)
+      #print('yy_action', yy_action)
+      #print('yy_group_stack', yy_group_stack)
 
       try:
-        return yy_actions[mark >> 1]()
+        return yy_action()
       except YYReject:
         pass
       except YYContinue:
index 968fd23..38fd6d0 100644 (file)
@@ -1,9 +1,14 @@
-all: lex_yy.py cal flex0 flex1
+all: cal_py.py cal_py_pi.py cal flex0 flex1
 
 # Python scanner test
-lex_yy.py: cal_py.l
+cal_py.py: cal_py.l
        ../../bootstrap_flex.git/src/flex -o /dev/null $< 2>$<.xml
-       ../pilex.py --python $<.xml
+       ../pilex.py --python -o $@ $<.xml
+
+# Python scanner test
+cal_py_pi.py: cal_py_pi.l
+       ../../bootstrap_flex.git/src/flex -o /dev/null $< 2>$<.xml
+       ../pilex.py --python -o $@ $<.xml
 
 # cal program
 cal: y.tab.o
index 0479367..624b71a 100644 (file)
@@ -1,18 +1,33 @@
 %{
 NUM = 0x100 
 yylval = None
+groups = None
+def gc(n):
+  global groups
+  groups = [None for i in range(n)]
+  yy_group_stack.pop()
+def g(n, match, pos):
+  groups[n] = match[pos:yy_group_stack.pop()]
 %}
 
-DIGIT [0-9]+\.?|[0-9]*\.[0-9]+
+DIGIT (?{g(1, match, pos)}[0-9]+)\.?|(?{g(2, match, pos)}[0-9]*)\.(?{g(3, match, pos)}[0-9]+)
 
 %option noecs nometa-ecs noyywrap reject yymore
 
 %%
 
 [ ]
-{DIGIT}        {
+(?{g(0, match, pos)}{DIGIT})(?{gc(4)}"")       {
   global yylval
-  yylval = float(yytext)
+  #print('groups', groups)
+  #yylval = float(yytext)
+  if groups[1] is not None:
+    mantissa = groups[1]
+    exponent = 0
+  else:
+    mantissa = groups[2] + groups[3]
+    exponent = -len(groups[3])
+  yylval = int(mantissa) * 10 ** exponent
   return NUM
 }
 \n|.   {