Implement action groups (extension syntax); use this to make a private implementation...

author Nick Downing <downing.nick@gmail.com>

Tue, 25 Sep 2018 23:20:48 +0000 (09:20 +1000)

committer Nick Downing <downing.nick@gmail.com>

Tue, 25 Sep 2018 23:20:48 +0000 (09:20 +1000)
author Nick Downing <downing.nick@gmail.com>
Tue, 25 Sep 2018 23:20:48 +0000 (09:20 +1000)
committer Nick Downing <downing.nick@gmail.com>
Tue, 25 Sep 2018 23:20:48 +0000 (09:20 +1000)
diff --git a/ast.py b/ast.py

index 5e02610..6043e17 100644 (file)
--- a/ast.py
+++ b/ast.py
@@ -3991,7 +3991,7 @@ class AST(element.Element):
            return False # continued_action
   
        class FLexRule(element.Element):
-        # GENERATE ELEMENT(bool bol, int action) BEGIN
+        # GENERATE ELEMENT(bool bol, int group_index, list(ref) groups0, list(ref) groups1, int action) BEGIN
          def __init__(
            self,
            tag = 'AST_Section2_Rule_FLexRule',
@@ -3999,6 +3999,9 @@ class AST(element.Element):
            text = '',
            children = [],
            bol = False,
+          group_index = -1,
+          groups0 = [],
+          groups1 = [],
            action = -1
          ):
            element.Element.__init__(
@@ -4013,6 +4016,13 @@ class AST(element.Element):
            if isinstance(bol, str) else
              bol
            )
+          self.group_index = (
+            element.deserialize_int(group_index)
+          if isinstance(group_index, str) else
+            group_index
+          )
+          self.groups0 = groups0
+          self.groups1 = groups1
            self.action = (
              element.deserialize_int(action)
            if isinstance(action, str) else
@@ -4021,10 +4031,28 @@ class AST(element.Element):
          def serialize(self, ref_list):
            element.Element.serialize(self, ref_list)
            self.set('bol', element.serialize_bool(self.bol))
+          self.set('group_index', element.serialize_int(self.group_index))
+          self.set(
+            'groups0',
+            ' '.join([element.serialize_ref(i, ref_list) for i in self.groups0])
+          )
+          self.set(
+            'groups1',
+            ' '.join([element.serialize_ref(i, ref_list) for i in self.groups1])
+          )
            self.set('action', element.serialize_int(self.action))
          def deserialize(self, ref_list):
            element.Element.deserialize(self, ref_list)
            self.bol = element.deserialize_bool(self.get('bol', 'false'))
+          self.group_index = element.deserialize_int(self.get('group_index', '-1'))
+          self.groups0 = [
+            element.deserialize_ref(i, ref_list)
+            for i in self.get('groups0', '').split()
+          ]
+          self.groups1 = [
+            element.deserialize_ref(i, ref_list)
+            for i in self.get('groups1', '').split()
+          ]
            self.action = element.deserialize_int(self.get('action', '-1'))
          def copy(self, factory = None):
            result = element.Element.copy(
@@ -4032,6 +4060,9 @@ class AST(element.Element):
              FLexRule if factory is None else factory
            )
            result.bol = self.bol
+          result.group_index = self.group_index
+          result.groups0 = self.groups0
+          result.groups1 = self.groups1
            result.action = self.action
            return result
          def repr_serialize(self, params):
@@ -4040,6 +4071,22 @@ class AST(element.Element):
              params.append(
                'bol = {0:s}'.format(repr(self.bol))
              )
+          if self.group_index != -1:
+            params.append(
+              'group_index = {0:s}'.format(repr(self.group_index))
+            )
+          if len(self.groups0):
+            params.append(
+              'groups0 = [{0:s}]'.format(
+                ', '.join([repr(i) for i in self.groups0])
+              )
+            )
+          if len(self.groups1):
+            params.append(
+              'groups1 = [{0:s}]'.format(
+                ', '.join([repr(i) for i in self.groups1])
+              )
+            )
            if self.action != -1:
              params.append(
                'action = {0:s}'.format(repr(self.action))
@@ -4069,10 +4116,14 @@ class AST(element.Element):
              if not self.bol:
                _ast.start_conditions[i].rules.append(self)
              _ast.start_conditions[i].bol_rules.append(self)
+          self.groups0 = []
            self[0].post_process(
+            self.groups0,
              caseless = _ast[0].caseless
            ) # regex
+          self.groups1 = []
            self[1].post_process(
+            self.groups1,
              caseless = _ast[0].caseless
            ) # trailing context regex
            self.action = len(_ast.actions_text)
@@ -4099,6 +4150,9 @@ class AST(element.Element):
              #    element.get_text(_ast.actions_text[-1], 0)
              #  )
              #)
+          _ast.flex_rules.append(self)
+          self.group_index = _ast.n_groups
+          _ast.n_groups += len(self.groups0) + 1 + len(self.groups1)
            return continued_action
   
        # GENERATE ELEMENT() BEGIN
@@ -4261,7 +4315,7 @@ class AST(element.Element):
        return 'ast.AST.Section3({0:s})'.format(', '.join(params))
      # GENERATE END
  
-  # GENERATE ELEMENT(list(ref) start_conditions, list(ref) actions_text, list(ref) eof_actions_text, int default_action) BEGIN
+  # GENERATE ELEMENT(list(ref) start_conditions, list(ref) actions_text, list(ref) eof_actions_text, int default_action, list(ref) flex_rules, int n_groups) BEGIN
    def __init__(
      self,
      tag = 'AST',
@@ -4271,7 +4325,9 @@ class AST(element.Element):
      start_conditions = [],
      actions_text = [],
      eof_actions_text = [],
-    default_action = -1
+    default_action = -1,
+    flex_rules = [],
+    n_groups = -1
    ):
      element.Element.__init__(
        self,
@@ -4288,6 +4344,12 @@ class AST(element.Element):
      if isinstance(default_action, str) else
        default_action
      )
+    self.flex_rules = flex_rules
+    self.n_groups = (
+      element.deserialize_int(n_groups)
+    if isinstance(n_groups, str) else
+      n_groups
+    )
    def serialize(self, ref_list):
      element.Element.serialize(self, ref_list)
      self.set(
@@ -4303,6 +4365,11 @@ class AST(element.Element):
        ' '.join([element.serialize_ref(i, ref_list) for i in self.eof_actions_text])
      )
      self.set('default_action', element.serialize_int(self.default_action))
+    self.set(
+      'flex_rules',
+      ' '.join([element.serialize_ref(i, ref_list) for i in self.flex_rules])
+    )
+    self.set('n_groups', element.serialize_int(self.n_groups))
    def deserialize(self, ref_list):
      element.Element.deserialize(self, ref_list)
      self.start_conditions = [
@@ -4318,6 +4385,11 @@ class AST(element.Element):
        for i in self.get('eof_actions_text', '').split()
      ]
      self.default_action = element.deserialize_int(self.get('default_action', '-1'))
+    self.flex_rules = [
+      element.deserialize_ref(i, ref_list)
+      for i in self.get('flex_rules', '').split()
+    ]
+    self.n_groups = element.deserialize_int(self.get('n_groups', '-1'))
    def copy(self, factory = None):
      result = element.Element.copy(
        self,
@@ -4327,6 +4399,8 @@ class AST(element.Element):
      result.actions_text = self.actions_text
      result.eof_actions_text = self.eof_actions_text
      result.default_action = self.default_action
+    result.flex_rules = self.flex_rules
+    result.n_groups = self.n_groups
      return result
    def repr_serialize(self, params):
      element.Element.repr_serialize(self, params)
@@ -4352,6 +4426,16 @@ class AST(element.Element):
        params.append(
          'default_action = {0:s}'.format(repr(self.default_action))
        )
+    if len(self.flex_rules):
+      params.append(
+        'flex_rules = [{0:s}]'.format(
+          ', '.join([repr(i) for i in self.flex_rules])
+        )
+      )
+    if self.n_groups != -1:
+      params.append(
+        'n_groups = {0:s}'.format(repr(self.n_groups))
+      )
    def __repr__(self):
      params = []
      self.repr_serialize(params)
@@ -4373,6 +4457,8 @@ class AST(element.Element):
      self.eof_actions_text = [
        AST.Text(text = '\t\t\t\tyyterminate();\n')
      ]
+    self.flex_rules = []
+    self.n_groups = 0
  
      # variables that won't be serialized
      name_to_start_condition = {'INITIAL': 0}
@@ -4402,12 +4488,18 @@ class AST(element.Element):
          )
        )
      )
-  def to_nfa(self):
+  def to_nfa(self, group_ref_data):
      _nfa = nfa.NFA()
      for i in self.start_conditions:
-      for j in range(2):
-        _regex = regex.RegexNone()
-        for k in [i.rules, i.bol_rules][j]:
+      for j in [i.rules, i.bol_rules]:
+        _regex = regex.RegexNone(
+          n_groups = 0
+        )
+        regex_group_ref_data = []
+        for k in j:
+          ng = _regex.n_groups
+          ng0 = k[0].n_groups
+          ng1 = k[1].n_groups
            _regex = regex.RegexOr(
              children = [
                _regex,
@@ -4418,21 +4510,32 @@ class AST(element.Element):
                      children = [
                        k[1]
                      ],
-                    index = k.action
+                    n_groups = 1 + ng1
                    )
-                ]
+                ],
+                n_groups = ng0 + 1 + ng1
                )
+            ],
+            n_groups = ng + ng0 + 1 + ng1
+          )
+          regex_group_ref_data.extend(
+            group_ref_data[
+              k.group_index:
+              k.group_index + len(k.groups0) + 1 + len(k.groups1)
              ]
            )
+        ng = _regex.n_groups
          _regex = regex.RegexAnd(
            children = [
              regex.RegexRepeat(
-              count0 = 0,
                children = [
                  regex.RegexCharacter(
-                  character_set = [0, 0x100]
+                  character_set = [0, 0x100],
+                  n_groups = 0
                  )
-              ]
+              ],
+              n_groups = 0,
+              count0 = 0
              ),
              regex.RegexOr(
                children = [
@@ -4440,21 +4543,28 @@ class AST(element.Element):
                  regex.RegexSequence(
                    children = [
                      regex.RegexCharacter(
-                      character_set = [0, 0x100]
+                      character_set = [0, 0x100],
+                      n_groups = 0
                      ),
                      regex.RegexGroup(
                        children = [
-                        regex.RegexEmpty()
+                        regex.RegexEmpty(
+                          n_groups = 0
+                        )
                        ],
-                      index = self.default_action
+                      n_groups = 1
                      )
-                  ]
+                  ],
+                  n_groups = 1
                  )
-              ]
+              ],
+              n_groups = ng + 1
              )
-          ]
+          ],
+          n_groups = ng + 1
          )
-        _regex.add_to_nfa(_nfa)
+        regex_group_ref_data.append(group_ref_data[-1])
+        _regex.add_to_nfa(_nfa, regex_group_ref_data)
      return _nfa
  
  # GENERATE FACTORY(regex.factory) BEGIN
diff --git a/dfa.py b/dfa.py

index 4e28ce8..2ba88f8 100644 (file)
--- a/dfa.py
+++ b/dfa.py
@@ -31,20 +31,15 @@ class DFA:
   
    def __init__(
      self,
-    groups = [],
      states = [([n_characters], [0], [0])],
      actions = [(0, [])],
      start_action = [] # can have multiple DFAs in same container
    ):
-    # groups: list of group_desc
-    # group_desc: (tag, kwargs)
-    #   tag, kwargs will be passed to apply_markup() hence factory()
      # states: list of state_desc
      # state_desc: (list of breaks, list of action to do, accept_threads)
      # actions: list of action_desc
      # action_desc: (state to go to next, compiled transition to do first)
      # accept_threads: list of accepting thread numbers (in thread list)
-    self.groups = groups
      self.states = states
      self.actions = actions
      self.start_action = start_action
@@ -444,48 +439,47 @@ class DFA:
        off += 1
      return None
  
-  def yylex(self, root, pos, off, factory, yychunk_iter):
-    if pos < 0:
-      pos, off = element.to_start_relative(root, pos, off)
-
-    while True:
-      # note: pointers must be kept start relative during the below call,
-      # because it extends the following text by calling the yychunk_iter
-      thread = self.match_yychunk(root, pos, off, yychunk_iter)
-      if thread is None:
-        break
-      stack = []
-      while True:
-        pos, off, mark_value, thread = thread
-        group_index = mark_value >> 1
-        if (mark_value & 1) != 0:
-          end_pos, end_off = element.to_end_relative(root, pos, off)
-          stack.append((end_pos, end_off, group_index))
-        else:
-          end_pos, end_off, temp = stack.pop()
-          assert temp == group_index
-          if len(stack) == 0:
-            break
-          tag, kwargs = self.groups[group_index]
-          if tag != '':
-            work.apply_markup(
-              root,
-              pos,
-              off,
-              end_pos,
-              end_off,
-              factory,
-              tag,
-              **kwargs
-            )
-      # note: pointers must be kept end relative during the below call,
-      # because it modifies the preceding text by calling apply_markup()
-      yield end_pos, end_off, group_index
-      pos, off = element.to_start_relative(root, end_pos, end_off)
+  #def yylex(self, root, pos, off, factory, yychunk_iter):
+  #  if pos < 0:
+  #    pos, off = element.to_start_relative(root, pos, off)
+
+  #  while True:
+  #    # note: pointers must be kept start relative during the below call,
+  #    # because it extends the following text by calling the yychunk_iter
+  #    thread = self.match_yychunk(root, pos, off, yychunk_iter)
+  #    if thread is None:
+  #      break
+  #    stack = []
+  #    while True:
+  #      pos, off, mark_value, thread = thread
+  #      group_index = mark_value >> 1
+  #      if (mark_value & 1) != 0:
+  #        end_pos, end_off = element.to_end_relative(root, pos, off)
+  #        stack.append((end_pos, end_off, group_index))
+  #      else:
+  #        end_pos, end_off, temp = stack.pop()
+  #        assert temp == group_index
+  #        if len(stack) == 0:
+  #          break
+  #        tag, kwargs = self.groups[group_index]
+  #        if tag != '':
+  #          work.apply_markup(
+  #            root,
+  #            pos,
+  #            off,
+  #            end_pos,
+  #            end_off,
+  #            factory,
+  #            tag,
+  #            **kwargs
+  #          )
+  #    # note: pointers must be kept end relative during the below call,
+  #    # because it modifies the preceding text by calling apply_markup()
+  #    yield end_pos, end_off, group_index
+  #    pos, off = element.to_start_relative(root, end_pos, end_off)
  
    def __repr__(self):
-    return 'dfa.DFA({0:s}, {1:s}, {2:s}, {3:s})'.format(
-      repr(self.groups),
+    return 'dfa.DFA({0:s}, {1:s}, {2:s})'.format(
        repr(self.states),
        repr(self.actions),
        repr(self.start_action)
diff --git a/generate_py.py b/generate_py.py

index 21c9c5c..bc36709 100644 (file)
--- a/generate_py.py
+++ b/generate_py.py
@@ -1,8 +1,8 @@
  import os
+import regex
  import wrap_repr
  
-def ast_text_to_python(ast_text, indent):
-  text = ast_text.get_text()
+def text_to_python(text, indent):
    text_strip = text.strip()
    if text_strip[:1] == '{' and text_strip[-1:] == '}':
      text = text_strip[1:-1]
@@ -30,8 +30,74 @@ def ast_text_to_python(ast_text, indent):
        lines[j] = '{0:s}{1:s}\n'.format(indent, lines[j][len(prefix):])
    return ''.join(lines)
  
+# note: these routines are literally the same, but conceptually different,
+# because ast.Text and regex.Text are different and unrelated base classes
+def ast_text_to_python(ast_text, indent):
+  return text_to_python(ast_text.get_text(), indent)
+def regex_text_to_python(regex_text, indent):
+  return text_to_python(regex_text.get_text(), indent)
+
  def generate_py(_ast, _element, home_dir, skel_file, out_file):
-  _dfa = _ast.to_nfa().to_dfa()
+  # generate group action function names (ref_data) and body text
+  group_ref_data = []
+  group_actions_text = []
+  group_rules_text = []
+  for i in _ast.flex_rules:
+    # add actions for capturing groups in ordinary regex
+    group_ref_data.extend(
+      [
+        (
+          'yy_group{0:d}'.format(len(group_actions_text) + j),
+          'yy_group_end'
+        )
+        for j in range(len(i.groups0))
+      ]
+    )
+    group_actions_text.extend([j[0] for j in i.groups0])
+
+    # add group for the rule, recognizing this matches the rule
+    group_ref_data.append(
+      (
+        'yy_rule_start',
+        'yy_rule{0:d}'.format(len(group_rules_text))
+      )
+    )
+    group_rules_text.append(
+      regex.Text(
+        text = '''global yy_action
+yy_action = yy_action{0:d}
+'''.format(i.action)
+      )
+    )
+
+    # add actions for capturing groups in trailing context regex
+    group_ref_data.extend(
+      [
+        (
+          'yy_group{0:d}'.format(len(group_actions_text) + j),
+          'yy_group_end'
+        )
+        for j in range(len(i.groups1))
+      ]
+    )
+    group_actions_text.extend([j[0] for j in i.groups1])
+
+  # add group for default rule, recognizing this matches the rule
+  group_ref_data.append(
+    (
+      'yy_rule_start',
+      'yy_rule{0:d}'.format(len(group_rules_text))
+    )
+  )
+  group_rules_text.append(
+    regex.Text(
+      text = '''global yy_action
+yy_action = yy_action{0:d}
+'''.format(_ast.default_action)
+    )
+  )
+
+  _dfa = _ast.to_nfa(group_ref_data).to_dfa()
  
    if skel_file is None:
      skel_file = os.path.join(
@@ -80,28 +146,10 @@ def generate_py(_ast, _element, home_dir, skel_file, out_file):
          elif line == '# GENERATE SECTION2\n':
            fout.write(
              '''# GENERATE SECTION2 BEGIN
-{0:s}{1:s}{2:s}{3:s}{4:s}yy_actions = [{5:s}
-]
-{6:s}yy_eof_actions = [{7:s}
+{0:s}{1:s}{2:s}{3:s}{4:s}{5:s}{6:s}yy_eof_actions = [{7:s}
  ]
  # GENERATE END
  '''.format(
-              wrap_repr.wrap_repr(
-                'yy_dfa_groups = {0:s}'.format(repr(_dfa.groups)),
-                79
-              ),
-              wrap_repr.wrap_repr(
-                'yy_dfa_states = {0:s}'.format(repr(_dfa.states)),
-                79
-              ),
-              wrap_repr.wrap_repr(
-                'yy_dfa_actions = {0:s}'.format(repr(_dfa.actions)),
-                79
-              ),
-              wrap_repr.wrap_repr(
-                'yy_dfa_start_action = {0:s}'.format(repr(_dfa.start_action)),
-                79
-              ),
                ''.join(
                  [
                    '''def yy_action{0:d}():
@@ -113,12 +161,40 @@ def generate_py(_ast, _element, home_dir, skel_file, out_file):
                    for i in range(len(_ast.actions_text))
                  ]
                ),
-              ','.join(
+              ''.join(
                  [
-                  '\n  yy_action{0:d}'.format(i)
-                  for i in range(len(_ast.actions_text))
+                  '''def yy_rule{0:d}(match, pos):
+{1:s}'''.format(
+                    i,
+                    regex_text_to_python(group_rules_text[i], '  ')
+                  )
+                  for i in range(len(group_rules_text))
                  ]
                ),
+              ''.join(
+                [
+                  '''def yy_group{0:d}(match, pos):
+{1:s}'''.format(
+                    i,
+                    regex_text_to_python(group_actions_text[i], '  ')
+                  )
+                  for i in range(len(group_actions_text))
+                ]
+              ),
+              wrap_repr.wrap_repr(
+                'yy_dfa_states = {0:s}'.format(repr(_dfa.states)),
+                79
+              ),
+              wrap_repr.wrap_repr(
+                'yy_dfa_actions = {0:s}'.format(
+                  repr(_dfa.actions).replace('\'', '')
+                ),
+                79
+              ),
+              wrap_repr.wrap_repr(
+                'yy_dfa_start_action = {0:s}'.format(repr(_dfa.start_action)),
+                79
+              ),
                ''.join(
                  [
                    '''def yy_eof_action{0:d}():
diff --git a/nfa.py b/nfa.py

index d55ebae..b101906 100644 (file)
--- a/nfa.py
+++ b/nfa.py
@@ -36,14 +36,9 @@ class NFA:
  
    def __init__(
      self,
-    groups = [],
      states = [(STATE_CHARACTER, [0, n_characters], 0)],
      start_state = [] # can have multiple NFAs in same container
    ):
-    # groups: list of group_desc
-    # group_desc: (tag, kwargs)
-    #   tag, kwargs will be passed to apply_markup() hence factory()
-    self.groups = groups
      self.states = states
      self.start_state = start_state
  
@@ -350,7 +345,7 @@ class NFA:
      return None
  
    def to_dfa(self):
-    _dfa = dfa.DFA(list(self.groups))
+    _dfa = dfa.DFA()
  
      accept_key = (NFA.accept_multistate, ())
      action_to_meaning = [accept_key]
@@ -442,8 +437,7 @@ class NFA:
      return _dfa
  
    def __repr__(self):
-    return 'nfa.NFA({0:s}, {1:s}, {2:s})'.format(
-      repr(self.groups),
+    return 'nfa.NFA({0:s}, {1:s})'.format(
        repr(self.states),
        repr(self.start_state)
      )
diff --git a/regex.py b/regex.py

index 0fc24a5..1989a7b 100644 (file)
--- a/regex.py
+++ b/regex.py
@@ -6,13 +6,14 @@ import nfa
  n_characters = 0x100
  
  class Regex(element.Element):
-  # GENERATE ELEMENT() BEGIN
+  # GENERATE ELEMENT(int n_groups) BEGIN
    def __init__(
      self,
      tag = 'Regex',
      attrib = {},
      text = '',
-    children = []
+    children = [],
+    n_groups = -1
    ):
      element.Element.__init__(
        self,
@@ -21,28 +22,44 @@ class Regex(element.Element):
        text,
        children
      )
+    self.n_groups = (
+      element.deserialize_int(n_groups)
+    if isinstance(n_groups, str) else
+      n_groups
+    )
+  def serialize(self, ref_list):
+    element.Element.serialize(self, ref_list)
+    self.set('n_groups', element.serialize_int(self.n_groups))
+  def deserialize(self, ref_list):
+    element.Element.deserialize(self, ref_list)
+    self.n_groups = element.deserialize_int(self.get('n_groups', '-1'))
    def copy(self, factory = None):
      result = element.Element.copy(
        self,
        Regex if factory is None else factory
      )
+    result.n_groups = self.n_groups
      return result
+  def repr_serialize(self, params):
+    element.Element.repr_serialize(self, params)
+    if self.n_groups != -1:
+      params.append(
+        'n_groups = {0:s}'.format(repr(self.n_groups))
+      )
    def __repr__(self):
      params = []
      self.repr_serialize(params)
      return 'regex.Regex({0:s})'.format(', '.join(params))
    # GENERATE END
-  def post_process(self, group_index = 0, caseless = False):
+  def post_process(self, groups, caseless = False):
+    self.n_groups = 0
      for i in self:
-      group_index = i.post_process(group_index, caseless)
-    return group_index
-  def add_to_groups(self, groups):
-    for i in self:
-      i.add_to_groups(groups)
-  def to_nfa_state(self, _nfa, next_state):
+      i.post_process(groups, caseless)
+      self.n_groups += i.n_groups
+  def to_nfa_state(self, _nfa, group_ref_data, group_index, next_state):
      raise NotImplementedError
-  def add_to_nfa(self, _nfa):
-    _nfa.start_state.append(self.to_nfa_state(_nfa, 0))
+  def add_to_nfa(self, _nfa, group_ref_data):
+    _nfa.start_state.append(self.to_nfa_state(_nfa, group_ref_data, 0, 0))
  
  class RegexNone(Regex):
    # GENERATE ELEMENT() BEGIN
@@ -51,14 +68,16 @@ class RegexNone(Regex):
      tag = 'RegexNone',
      attrib = {},
      text = '',
-    children = []
+    children = [],
+    n_groups = -1
    ):
      Regex.__init__(
        self,
        tag,
        attrib,
        text,
-      children
+      children,
+      n_groups
      )
    def copy(self, factory = None):
      result = Regex.copy(
@@ -71,7 +90,7 @@ class RegexNone(Regex):
      self.repr_serialize(params)
      return 'regex.RegexNone({0:s})'.format(', '.join(params))
    # GENERATE END
-  def to_nfa_state(self, _nfa, next_state):
+  def to_nfa_state(self, _nfa, group_ref_data, group_index, next_state):
      return -1
  
  class RegexEmpty(Regex):
@@ -81,14 +100,16 @@ class RegexEmpty(Regex):
      tag = 'RegexEmpty',
      attrib = {},
      text = '',
-    children = []
+    children = [],
+    n_groups = -1
    ):
      Regex.__init__(
        self,
        tag,
        attrib,
        text,
-      children
+      children,
+      n_groups
      )
    def copy(self, factory = None):
      result = Regex.copy(
@@ -101,7 +122,7 @@ class RegexEmpty(Regex):
      self.repr_serialize(params)
      return 'regex.RegexEmpty({0:s})'.format(', '.join(params))
    # GENERATE END
-  def to_nfa_state(self, _nfa, next_state):
+  def to_nfa_state(self, _nfa, group_ref_data, group_index, next_state):
      return next_state
  
  class RegexCharacter(Regex):
@@ -112,6 +133,7 @@ class RegexCharacter(Regex):
      attrib = {},
      text = '',
      children = [],
+    n_groups = -1,
      character_set = []
    ):
      Regex.__init__(
@@ -119,7 +141,8 @@ class RegexCharacter(Regex):
        tag,
        attrib,
        text,
-      children
+      children,
+      n_groups
      )
      self.character_set = (
        [element.deserialize_int(i) for i in character_set.split()]
@@ -158,8 +181,8 @@ class RegexCharacter(Regex):
      self.repr_serialize(params)
      return 'regex.RegexCharacter({0:s})'.format(', '.join(params))
    # GENERATE END
-  def post_process(self, group_index = 0, caseless = False):
-    group_index = Regex.post_process(self, group_index, caseless)
+  def post_process(self, groups, caseless = False):
+    Regex.post_process(self, groups, caseless)
      if caseless:
        temp = bisect_set.bisect_set_and(
          self.character_set,
@@ -170,10 +193,11 @@ class RegexCharacter(Regex):
          [i ^ 0x20 for i in temp if i >= 0x60] +
          [i ^ 0x20 for i in temp if i < 0x60]
        )
-    return group_index
-  def to_nfa_state(self, _nfa, next_state):
+  def to_nfa_state(self, _nfa, group_ref_data, group_index, next_state):
      new_state = len(_nfa.states)
-    _nfa.states.append((nfa.NFA.STATE_CHARACTER, self.character_set, next_state))
+    _nfa.states.append(
+      (nfa.NFA.STATE_CHARACTER, self.character_set, next_state)
+    )
      return new_state
  
  class RegexCharacterLiteral(RegexCharacter):
@@ -184,6 +208,7 @@ class RegexCharacterLiteral(RegexCharacter):
      attrib = {},
      text = '',
      children = [],
+    n_groups = -1,
      character_set = []
    ):
      RegexCharacter.__init__(
@@ -192,6 +217,7 @@ class RegexCharacterLiteral(RegexCharacter):
        attrib,
        text,
        children,
+      n_groups,
        character_set
      )
    def copy(self, factory = None):
@@ -205,8 +231,8 @@ class RegexCharacterLiteral(RegexCharacter):
      self.repr_serialize(params)
      return 'regex.RegexCharacterLiteral({0:s})'.format(', '.join(params))
    # GENERATE END
-  def post_process(self, group_index = 1, caseless = False):
-    group_index = RegexCharacter.post_process(self, group_index, False)
+  def post_process(self, groups, caseless = False):
+    RegexCharacter.post_process(self, groups, False)
      if caseless:
        temp = bisect_set.bisect_set_and(
          self.character_set,
@@ -217,7 +243,6 @@ class RegexCharacterLiteral(RegexCharacter):
          [i ^ 0x20 for i in temp if i >= 0x60] +
          [i ^ 0x20 for i in temp if i < 0x60]
        )
-    return group_index
  
  class RegexCharacterRange(RegexCharacter):
    # GENERATE ELEMENT() BEGIN
@@ -227,6 +252,7 @@ class RegexCharacterRange(RegexCharacter):
      attrib = {},
      text = '',
      children = [],
+    n_groups = -1,
      character_set = []
    ):
      RegexCharacter.__init__(
@@ -235,6 +261,7 @@ class RegexCharacterRange(RegexCharacter):
        attrib,
        text,
        children,
+      n_groups,
        character_set
      )
    def copy(self, factory = None):
@@ -248,8 +275,8 @@ class RegexCharacterRange(RegexCharacter):
      self.repr_serialize(params)
      return 'regex.RegexCharacterRange({0:s})'.format(', '.join(params))
    # GENERATE END
-  def post_process(self, group_index = 0, caseless = False):
-    group_index = RegexCharacter.post_process(self, group_index, False)
+  def post_process(self, groups, caseless = False):
+    RegexCharacter.post_process(self, groups, False)
      self.character_set = [self[0].character_set[0], self[1].character_set[-1]]
      if caseless:
        temp = bisect_set.bisect_set_and(
@@ -261,7 +288,6 @@ class RegexCharacterRange(RegexCharacter):
          [i ^ 0x20 for i in temp if i >= 0x60] +
          [i ^ 0x20 for i in temp if i < 0x60]
        )
-    return group_index
  
  class RegexCharacterOr(RegexCharacter):
    # GENERATE ELEMENT() BEGIN
@@ -271,6 +297,7 @@ class RegexCharacterOr(RegexCharacter):
      attrib = {},
      text = '',
      children = [],
+    n_groups = -1,
      character_set = []
    ):
      RegexCharacter.__init__(
@@ -279,6 +306,7 @@ class RegexCharacterOr(RegexCharacter):
        attrib,
        text,
        children,
+      n_groups,
        character_set
      )
    def copy(self, factory = None):
@@ -292,10 +320,12 @@ class RegexCharacterOr(RegexCharacter):
      self.repr_serialize(params)
      return 'regex.RegexCharacterOr({0:s})'.format(', '.join(params))
    # GENERATE END
-  def post_process(self, group_index = 0, caseless = False):
-    group_index = RegexCharacter.post_process(self, group_index, caseless)
-    self.character_set = bisect_set.bisect_set_or(self[0].character_set, self[1].character_set)
-    return group_index
+  def post_process(self, groups, caseless = False):
+    RegexCharacter.post_process(self, groups, caseless)
+    self.character_set = bisect_set.bisect_set_or(
+      self[0].character_set,
+      self[1].character_set
+    )
  
  class RegexCharacterAnd(RegexCharacter):
    # GENERATE ELEMENT() BEGIN
@@ -305,6 +335,7 @@ class RegexCharacterAnd(RegexCharacter):
      attrib = {},
      text = '',
      children = [],
+    n_groups = -1,
      character_set = []
    ):
      RegexCharacter.__init__(
@@ -313,6 +344,7 @@ class RegexCharacterAnd(RegexCharacter):
        attrib,
        text,
        children,
+      n_groups,
        character_set
      )
    def copy(self, factory = None):
@@ -326,10 +358,12 @@ class RegexCharacterAnd(RegexCharacter):
      self.repr_serialize(params)
      return 'regex.RegexCharacterAnd({0:s})'.format(', '.join(params))
    # GENERATE END
-  def post_process(self, group_index = 0, caseless = False):
-    group_index = RegexCharacter.post_process(self, group_index, caseless)
-    self.character_set = bisect_set.bisect_set_and(self[0].character_set, self[1].character_set)
-    return group_index
+  def post_process(self, groups, caseless = False):
+    RegexCharacter.post_process(self, groups, caseless)
+    self.character_set = bisect_set.bisect_set_and(
+      self[0].character_set,
+      self[1].character_set
+    )
  
  class RegexCharacterNot(RegexCharacter):
    # GENERATE ELEMENT() BEGIN
@@ -339,6 +373,7 @@ class RegexCharacterNot(RegexCharacter):
      attrib = {},
      text = '',
      children = [],
+    n_groups = -1,
      character_set = []
    ):
      RegexCharacter.__init__(
@@ -347,6 +382,7 @@ class RegexCharacterNot(RegexCharacter):
        attrib,
        text,
        children,
+      n_groups,
        character_set
      )
    def copy(self, factory = None):
@@ -360,10 +396,9 @@ class RegexCharacterNot(RegexCharacter):
      self.repr_serialize(params)
      return 'regex.RegexCharacterNot({0:s})'.format(', '.join(params))
    # GENERATE END
-  def post_process(self, group_index = 0, caseless = False):
-    group_index = RegexCharacter.post_process(self, group_index, caseless)
+  def post_process(self, groups, caseless = False): # no return value; this node's set is bisect_set_not of its only child's set
+    RegexCharacter.post_process(self, groups, caseless) # base pass first; presumably fills the child's character_set -- TODO confirm
      self.character_set = bisect_set.bisect_set_not(self[0].character_set)
-    return group_index
  
  class RegexOr(Regex):
    # GENERATE ELEMENT() BEGIN
@@ -372,14 +407,16 @@ class RegexOr(Regex):
      tag = 'RegexOr',
      attrib = {},
      text = '',
-    children = []
+    children = [],
+    n_groups = -1
    ):
      Regex.__init__(
        self,
        tag,
        attrib,
        text,
-      children
+      children,
+      n_groups
      )
    def copy(self, factory = None):
      result = Regex.copy(
@@ -392,9 +429,19 @@ class RegexOr(Regex):
      self.repr_serialize(params)
      return 'regex.RegexOr({0:s})'.format(', '.join(params))
    # GENERATE END
-  def to_nfa_state(self, _nfa, next_state):
-    child0_state = self[0].to_nfa_state(_nfa, next_state)
-    child1_state = self[1].to_nfa_state(_nfa, next_state)
+  def to_nfa_state(self, _nfa, group_ref_data, group_index, next_state):
+    child0_state = self[0].to_nfa_state(
+      _nfa,
+      group_ref_data,
+      group_index,
+      next_state
+    )
+    child1_state = self[1].to_nfa_state(
+      _nfa,
+      group_ref_data,
+      group_index + self[0].n_groups,
+      next_state
+    )
      if child0_state == -1:
        return child1_state
      if child1_state == -1:
@@ -410,14 +457,16 @@ class RegexAnd(Regex):
      tag = 'RegexAnd',
      attrib = {},
      text = '',
-    children = []
+    children = [],
+    n_groups = -1
    ):
      Regex.__init__(
        self,
        tag,
        attrib,
        text,
-      children
+      children,
+      n_groups
      )
    def copy(self, factory = None):
      result = Regex.copy(
@@ -430,15 +479,25 @@ class RegexAnd(Regex):
      self.repr_serialize(params)
      return 'regex.RegexAnd({0:s})'.format(', '.join(params))
    # GENERATE END
-  def to_nfa_state(self, _nfa, next_state):
+  def to_nfa_state(self, _nfa, group_ref_data, group_index, next_state):
      join0_state = len(_nfa.states)
-    _nfa.states.append(nfa.NFA.join0_state) # takes no arguments so use static one
+    _nfa.states.append(nfa.NFA.join0_state) # no arguments so use static one
      join1_state = len(_nfa.states)
      _nfa.states.append((nfa.NFA.STATE_JOIN1, next_state))
-    child0_state = self[0].to_nfa_state(_nfa, join0_state)
+    child0_state = self[0].to_nfa_state(
+      _nfa,
+      group_ref_data,
+      group_index,
+      join0_state
+    )
      if child0_state == -1:
        return -1
-    child1_state = self[1].to_nfa_state(_nfa, join1_state)
+    child1_state = self[1].to_nfa_state(
+      _nfa,
+      group_ref_data,
+      group_index + self[0].n_groups,
+      join1_state
+    )
      if child1_state == -1:
        return -1
      new_state = len(_nfa.states)
@@ -452,14 +511,16 @@ class RegexSequence(Regex):
      tag = 'RegexSequence',
      attrib = {},
      text = '',
-    children = []
+    children = [],
+    n_groups = -1
    ):
      Regex.__init__(
        self,
        tag,
        attrib,
        text,
-      children
+      children,
+      n_groups
      )
    def copy(self, factory = None):
      result = Regex.copy(
@@ -472,11 +533,21 @@ class RegexSequence(Regex):
      self.repr_serialize(params)
      return 'regex.RegexSequence({0:s})'.format(', '.join(params))
    # GENERATE END
-  def to_nfa_state(self, _nfa, next_state):
-    child1_state = self[1].to_nfa_state(_nfa, next_state)
-    if child1_state == -1:
+  def to_nfa_state(self, _nfa, group_ref_data, group_index, next_state): # returns the entry state, or -1 as soon as a child yields -1
+    next_state = self[1].to_nfa_state( # second child is built first: its entry state is the first child's continuation
+      _nfa,
+      group_ref_data,
+      group_index + self[0].n_groups, # second child's groups are numbered after the first child's
+      next_state
+    )
+    if next_state == -1:
        return -1
-    return self[0].to_nfa_state(_nfa, child1_state)
+    return self[0].to_nfa_state(
+      _nfa,
+      group_ref_data,
+      group_index,
+      next_state
+    )
  
  class RegexRepeat(Regex):
    # GENERATE ELEMENT(int count0, int count1, bool non_greedy) BEGIN
@@ -486,6 +557,7 @@ class RegexRepeat(Regex):
      attrib = {},
      text = '',
      children = [],
+    n_groups = -1,
      count0 = -1,
      count1 = -1,
      non_greedy = False
@@ -495,7 +567,8 @@ class RegexRepeat(Regex):
        tag,
        attrib,
        text,
-      children
+      children,
+      n_groups
      )
      self.count0 = (
        element.deserialize_int(count0)
@@ -550,7 +623,7 @@ class RegexRepeat(Regex):
      self.repr_serialize(params)
      return 'regex.RegexRepeat({0:s})'.format(', '.join(params))
    # GENERATE END
-  def post_process(self, group_index = 0, caseless = False):
+  def post_process(self, groups, caseless = False):
      # total hack which will be done in a Python action in future
      if len(self) >= 2:
        assert self[1].tag == 'Number'
@@ -562,190 +635,256 @@ class RegexRepeat(Regex):
          self.count1 = self.count0
        del self[1:]
      # end total hack
-    return Regex.post_process(self, group_index, caseless)
-  def to_nfa_state(self, _nfa, next_state):
+    Regex.post_process(self, groups, caseless)
+  def to_nfa_state(self, _nfa, group_ref_data, group_index, next_state): # returns the entry state, or -1 if one of the count0 mandatory copies yields -1
      count0 = self.count0
      count1 = self.count1
      if count1 == -1:
        new_state = len(_nfa.states)
        _nfa.states.append(None)
-      child_state = self[0].to_nfa_state(_nfa, new_state)
-      if child_state == -1:
-        new_state = next_state # note: unreachable state remains invalid (None)
-      else:
+      child_state = self[0].to_nfa_state( # child continues into the placeholder state, which forms the loop
+        _nfa,
+        group_ref_data,
+        group_index,
+        new_state
+      )
+      if child_state != -1: # on -1 the placeholder stays None and next_state is left unchanged
          _nfa.states[new_state] = (
            (nfa.NFA.STATE_OR, next_state, child_state)
          if self.non_greedy else
            (nfa.NFA.STATE_OR, child_state, next_state)
          )
+        next_state = new_state # the OR state becomes the entry of the loop
      else:
-      new_state = next_state
+      done_state = next_state # exit target shared by every optional copy
        for i in range(count1 - count0):
-        child_state = self[0].to_nfa_state(_nfa, new_state)
+        child_state = self[0].to_nfa_state( # each optional copy continues into the previously built one
+          _nfa,
+          group_ref_data,
+          group_index,
+          next_state
+        )
          if child_state == -1:
            break
          new_state = len(_nfa.states)
          _nfa.states.append(
-          (nfa.NFA.STATE_OR, next_state, child_state)
+          (nfa.NFA.STATE_OR, done_state, child_state)
          if self.non_greedy else
-          (nfa.NFA.STATE_OR, child_state, next_state)
+          (nfa.NFA.STATE_OR, child_state, done_state)
          )
+        next_state = new_state # chain grows backwards from the exit
      for i in range(count0):
-      new_state = self[0].to_nfa_state(_nfa, new_state)
-      if new_state == -1:
+      next_state = self[0].to_nfa_state( # mandatory copies are prepended one at a time; every copy reuses the same group_index
+        _nfa,
+        group_ref_data,
+        group_index,
+        next_state
+      )
+      if next_state == -1:
          break
-    return new_state
+    return next_state
  
  class RegexGroup(Regex):
-  class Attribute(element.Element):
-    # GENERATE ELEMENT(str name, str value) BEGIN
+  # GENERATE ELEMENT() BEGIN
+  def __init__(
+    self,
+    tag = 'RegexGroup',
+    attrib = {},
+    text = '',
+    children = [],
+    n_groups = -1
+  ):
+    Regex.__init__(
+      self,
+      tag,
+      attrib,
+      text,
+      children,
+      n_groups
+    )
+  def copy(self, factory = None):
+    result = Regex.copy(
+      self,
+      RegexGroup if factory is None else factory
+    )
+    return result
+  def __repr__(self):
+    params = []
+    self.repr_serialize(params)
+    return 'regex.RegexGroup({0:s})'.format(', '.join(params))
+  # GENERATE END
+  def post_process(self, groups, caseless = False): # registers this group in groups, then processes its last child
+    # we use -1 here because named or action groups use self[0] for text
+    groups.append(self) # appended before descending, so this group precedes any group found in its child
+    self[-1].post_process(groups, caseless)
+    self.n_groups = self[-1].n_groups + 1 # the child's count plus this group itself
+  def to_nfa_state(self, _nfa, group_ref_data, group_index, next_state): # returns the entry state, or -1 if the child yields -1
+    new_state = len(_nfa.states)
+    _nfa.states.append(
+      (nfa.NFA.STATE_MARK, group_ref_data[group_index][1], next_state) # mark passed after the child, on the way to next_state
+    )
+    next_state = new_state
+    next_state = self[-1].to_nfa_state(
+      _nfa,
+      group_ref_data,
+      group_index + 1, # groups inside the child are numbered after this one
+      next_state
+    )
+    if next_state == -1:
+      return -1 # note: the mark state appended above stays in _nfa.states
+    new_state = len(_nfa.states)
+    _nfa.states.append(
+      (nfa.NFA.STATE_MARK, group_ref_data[group_index][0], next_state) # mark passed before entering the child
+    )
+    return new_state
+
+# internal base class
+class Text(element.Element): # base of the nested RegexGroupName.Text and RegexGroupAction.Text classes
+  # GENERATE ELEMENT() BEGIN
+  def __init__(
+    self,
+    tag = 'Text',
+    attrib = {},
+    text = '',
+    children = []
+  ):
+    element.Element.__init__(
+      self,
+      tag,
+      attrib,
+      text,
+      children
+    )
+  def copy(self, factory = None):
+    result = element.Element.copy(
+      self,
+      Text if factory is None else factory
+    )
+    return result
+  def __repr__(self):
+    params = []
+    self.repr_serialize(params)
+    return 'regex.Text({0:s})'.format(', '.join(params))
+  # GENERATE END
+  def get_text(self): # the only hand-written member of this class
+    return element.get_text(self, 0) # delegates to the element module helper; the meaning of 0 is defined there
+
+class RegexGroupName(RegexGroup):
+  class Text(Text):
+    # GENERATE ELEMENT() BEGIN
      def __init__(
        self,
-      tag = 'RegexGroup_Attribute',
+      tag = 'RegexGroupName_Text',
        attrib = {},
        text = '',
-      children = [],
-      name = '',
-      value = ''
+      children = []
      ):
-      element.Element.__init__(
+      Text.__init__(
          self,
          tag,
          attrib,
          text,
          children
        )
-      self.name = name
-      self.value = value
-    def serialize(self, ref_list):
-      element.Element.serialize(self, ref_list)
-      self.set('name', element.serialize_str(self.name))
-      self.set('value', element.serialize_str(self.value))
-    def deserialize(self, ref_list):
-      element.Element.deserialize(self, ref_list)
-      self.name = element.deserialize_str(self.get('name', ''))
-      self.value = element.deserialize_str(self.get('value', ''))
      def copy(self, factory = None):
-      result = element.Element.copy(
+      result = Text.copy(
          self,
-        Attribute if factory is None else factory
+        Text if factory is None else factory # NOTE(review): class-body names are not visible inside methods, so Text here is the module-level Text, not RegexGroupName.Text -- confirm intended
        )
-      result.name = self.name
-      result.value = self.value
        return result
-    def repr_serialize(self, params):
-      element.Element.repr_serialize(self, params)
-      if self.name != '':
-        params.append(
-          'name = {0:s}'.format(repr(self.name))
-        )
-      if self.value != '':
-        params.append(
-          'value = {0:s}'.format(repr(self.value))
-        )
      def __repr__(self):
        params = []
        self.repr_serialize(params)
-      return 'regex.RegexGroup.Attribute({0:s})'.format(', '.join(params))
+      return 'regex.RegexGroupName.Text({0:s})'.format(', '.join(params))
      # GENERATE END
  
-  # GENERATE ELEMENT(int index, str name, list(ref) attributes) BEGIN
+  # GENERATE ELEMENT() BEGIN
    def __init__(
      self,
-    tag = 'RegexGroup',
+    tag = 'RegexGroupName',
      attrib = {},
      text = '',
      children = [],
-    index = -1,
-    name = '',
-    attributes = []
+    n_groups = -1
    ):
-    Regex.__init__(
+    RegexGroup.__init__(
        self,
        tag,
        attrib,
        text,
-      children
-    )
-    self.index = (
-      element.deserialize_int(index)
-    if isinstance(index, str) else
-      index
-    )
-    self.name = name
-    self.attributes = attributes
-  def serialize(self, ref_list):
-    Regex.serialize(self, ref_list)
-    self.set('index', element.serialize_int(self.index))
-    self.set('name', element.serialize_str(self.name))
-    self.set(
-      'attributes',
-      ' '.join([element.serialize_ref(i, ref_list) for i in self.attributes])
+      children,
+      n_groups
      )
-  def deserialize(self, ref_list):
-    Regex.deserialize(self, ref_list)
-    self.index = element.deserialize_int(self.get('index', '-1'))
-    self.name = element.deserialize_str(self.get('name', ''))
-    self.attributes = [
-      element.deserialize_ref(i, ref_list)
-      for i in self.get('attributes', '').split()
-    ]
    def copy(self, factory = None):
-    result = Regex.copy(
+    result = RegexGroup.copy(
        self,
-      RegexGroup if factory is None else factory
+      RegexGroupName if factory is None else factory
      )
-    result.index = self.index
-    result.name = self.name
-    result.attributes = self.attributes
      return result
-  def repr_serialize(self, params):
-    Regex.repr_serialize(self, params)
-    if self.index != -1:
-      params.append(
-        'index = {0:s}'.format(repr(self.index))
-      )
-    if self.name != '':
-      params.append(
-        'name = {0:s}'.format(repr(self.name))
+  def __repr__(self):
+    params = []
+    self.repr_serialize(params)
+    return 'regex.RegexGroupName({0:s})'.format(', '.join(params))
+  # GENERATE END
+
+class RegexGroupAction(RegexGroup):
+  class Text(Text):
+    # GENERATE ELEMENT() BEGIN
+    def __init__(
+      self,
+      tag = 'RegexGroupAction_Text',
+      attrib = {},
+      text = '',
+      children = []
+    ):
+      Text.__init__(
+        self,
+        tag,
+        attrib,
+        text,
+        children
        )
-    if len(self.attributes):
-      params.append(
-        'attributes = [{0:s}]'.format(
-          ', '.join([repr(i) for i in self.attributes])
-        )
+    def copy(self, factory = None):
+      result = Text.copy(
+        self,
+        Text if factory is None else factory # NOTE(review): class-body names are not visible inside methods, so Text here is the module-level Text, not RegexGroupAction.Text -- confirm intended
        )
+      return result
+    def __repr__(self):
+      params = []
+      self.repr_serialize(params)
+      return 'regex.RegexGroupAction.Text({0:s})'.format(', '.join(params))
+    # GENERATE END
+
+  # GENERATE ELEMENT() BEGIN
+  def __init__(
+    self,
+    tag = 'RegexGroupAction',
+    attrib = {},
+    text = '',
+    children = [],
+    n_groups = -1
+  ):
+    RegexGroup.__init__(
+      self,
+      tag,
+      attrib,
+      text,
+      children,
+      n_groups
+    )
+  def copy(self, factory = None):
+    result = RegexGroup.copy(
+      self,
+      RegexGroupAction if factory is None else factory
+    )
+    return result
    def __repr__(self):
      params = []
      self.repr_serialize(params)
-    return 'regex.RegexGroup({0:s})'.format(', '.join(params))
+    return 'regex.RegexGroupAction({0:s})'.format(', '.join(params))
    # GENERATE END
-  def post_process(self, group_index = 0, caseless = False):
-    # total hack which will be done in a Python action in future
-    if len(self) >= 2:
-      assert self[0].tag == 'GroupName'
-      self.name = self[0].text[1:-1]
-      del self[:1]
-    # end total hack
-    self.index = group_index
-    group_index += 1
-    return Regex.post_process(self, group_index, caseless)
-  def add_to_groups(self, groups):
-    assert len(groups) == self.index
-    groups.append(
-      (self.name, {i.name: i.value for i in self.attributes})
-    )
-    return Regex.add_to_groups(self, groups)
-  def to_nfa_state(self, _nfa, next_state):
-    mark_state = len(_nfa.states)
-    _nfa.states.append((nfa.NFA.STATE_MARK, self.index * 2 + 1, next_state))
-    child_state = self[0].to_nfa_state(_nfa, mark_state)
-    if child_state == -1:
-      return -1
-    new_state = len(_nfa.states)
-    _nfa.states.append((nfa.NFA.STATE_MARK, self.index * 2, child_state))
-    return new_state
-
+ 
  # GENERATE FACTORY(element.Element) BEGIN
  tag_to_class = {
    'Regex': Regex,
@@ -762,7 +901,11 @@ tag_to_class = {
    'RegexSequence': RegexSequence,
    'RegexRepeat': RegexRepeat,
    'RegexGroup': RegexGroup,
-  'RegexGroup_Attribute': RegexGroup.Attribute
+  'Text': Text,
+  'RegexGroupName': RegexGroupName,
+  'RegexGroupName_Text': RegexGroupName.Text,
+  'RegexGroupAction': RegexGroupAction,
+  'RegexGroupAction_Text': RegexGroupAction.Text
  }
  def factory(tag, attrib = {}, *args, **kwargs):
    return tag_to_class.get(tag, element.Element)(tag, attrib, *args, **kwargs)
@@ -905,9 +1048,7 @@ if __name__ == '__main__':
      )
    )
  
-  groups = []
-  _regex.add_to_groups(groups)
-  _nfa = nfa.NFA(groups)
+  _nfa = nfa.NFA()
    _regex.add_to_nfa(_nfa)
    sys.stdout.write(
      wrap_repr.wrap_repr(
diff --git a/skel/skel_py.py b/skel/skel_py.py

index 9430b22..c1cfd46 100644 (file)
--- a/skel/skel_py.py
+++ b/skel/skel_py.py
@@ -39,7 +39,10 @@ yy_threads1 = [None]
  yy_prefix_slop = 1
  
  yytext = ''
-len_yytext = 0
+yytext_len = 0
+
+yy_action = None
+yy_group_stack = None
  
  def REJECT():
    raise YYReject()
@@ -48,12 +51,14 @@ def yyterminate():
    raise YYTerminate()
  
  def yyless(i):
-  assert len_yytext >= i
+  global yytext, yytext_len # needed because both names are assigned below
+  assert yytext_len >= i # can only shorten the current token
    yytext = yytext[:i]
-  len_yytext = i
+  yytext_len = i # kept equal to the length of the truncated yytext
  
  def unput(text):
-  while len_yytext:
+  global yyin, yytext_len
+  while yytext_len:
      block = yy_buffer_stack[-1].next
      while block is None or block.pos >= len(block.text):
        if block is None:
@@ -63,20 +68,36 @@ def unput(text):
        else:
          block = block.next
          yy_buffer_stack[-1].next = block
-    i = min(len_yytext, len(block.text) - block.pos)
+    i = min(yytext_len, len(block.text) - block.pos)
      block.pos += i
-    len_yytext -= i
+    yytext_len -= i
    yy_buffer_stack[-1].next = YYBufferBlock(yy_buffer_stack[-1].next, 0, text)
  
+def yy_rule_start(match, pos): # called with (match, pos) like yy_group_end; presumably one of the ref_data callbacks run by yylex -- TODO confirm
+  global yytext, yytext_len
+  yytext = match[:pos] # token text is the first pos characters of match
+  yytext_len = pos
+
+def yy_group_end(match, pos): # records pos on the group stack; match is not used here
+  yy_group_stack.append(pos) # list is mutated in place, so no global statement is needed
+
  # GENERATE SECTION2
  
  def yylex():
-  global yyin, yy_threads0, yy_threads1, yy_prefix_slop, yytext, len_yytext
+  global \
+    yyin, \
+    yy_threads0, \
+    yy_threads1, \
+    yy_prefix_slop, \
+    yytext, \
+    yytext_len, \
+    yy_action, \
+    yy_group_stack
  
    # GENERATE SECTION2INITIAL
  
    while True:
-    while len_yytext:
+    while yytext_len:
        block = yy_buffer_stack[-1].next
        while block is None or block.pos >= len(block.text):
          if block is None:
@@ -86,10 +107,12 @@ def yylex():
          else:
            block = block.next
            yy_buffer_stack[-1].next = block
-      i = min(len_yytext, len(block.text) - block.pos)
+      i = min(yytext_len, len(block.text) - block.pos)
        block.pos += i
-      len_yytext -= i
-    yytext = ''
+      yytext_len -= i
+
+    match = ''
+    match_len = 0
  
      del yy_threads0[yy_prefix_slop:]
      yy_threads0.append(None)
@@ -121,7 +144,7 @@ def yylex():
            i -= trans[1]
          elif trans[0] == 2: #DFA.TRANSITION_MARK:
            yy_threads0[i:i + trans[1]] = [
-            (len_yytext, trans[2], thread)
+            (match_len, trans[2], thread)
              for thread in yy_threads0[i:i + trans[1]]
            ]
          elif trans[0] == 3: #DFA.TRANSITION_MOVE:
@@ -158,9 +181,9 @@ def yylex():
                block_pos = block.pos
              file_in = yy_buffer_stack[buffer_ptr].file_in
          else:
-          i = len_yytext - len(yytext)
+          i = match_len - len(match)
            if i:
-            yytext += block.text[block_pos - i:]
+            match += block.text[block_pos - i:]
            block_prev = block
            block = block_prev.next
            if block is not None:
@@ -174,7 +197,7 @@ def yylex():
            )
          ]
          block_pos += 1
-        len_yytext += 1
+        match_len += 1
          continue
        # EOF
        if i == 0:
@@ -184,23 +207,30 @@ def yylex():
            return 0
        break
  
-    i = len_yytext - len(yytext)
+    i = match_len - len(match)
      if i:
        assert block is not None
-      yytext += block.text[block_pos - i:]
+      match += block.text[block_pos - i:]
  
      for i in yy_dfa_states[state][2]:
-      _, _, thread = yy_threads0[yy_prefix_slop + i]
+      yytext = None
+      yytext_len = None
+      yy_action = None
+      yy_group_stack = []
+
+      thread = yy_threads0[yy_prefix_slop + i]
        #print('thread', thread)
-      len_yytext, mark, thread = thread
-      assert thread is None
+      while thread is not None:
+        pos, ref_data, thread = thread
+        ref_data(match, pos)
  
-      assert len(yytext) >= len_yytext
-      yytext = yytext[:len_yytext]
        #print('yytext', yytext)
+      #print('yytext_len', yytext_len)
+      #print('yy_action', yy_action)
+      #print('yy_group_stack', yy_group_stack)
  
        try:
-        return yy_actions[mark >> 1]()
+        return yy_action()
        except YYReject:
          pass
        except YYContinue:
diff --git a/tests/Makefile b/tests/Makefile

index 968fd23..38fd6d0 100644 (file)
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -1,9 +1,14 @@
-all: lex_yy.py cal flex0 flex1
+all: cal_py.py cal_py_pi.py cal flex0 flex1
  
  # Python scanner test
-lex_yy.py: cal_py.l
+cal_py.py: cal_py.l
         ../../bootstrap_flex.git/src/flex -o /dev/null $< 2>$<.xml
-       ../pilex.py --python $<.xml
+       ../pilex.py --python -o $@ $<.xml
+
+# Python scanner test
+cal_py_pi.py: cal_py_pi.l
+       ../../bootstrap_flex.git/src/flex -o /dev/null $< 2>$<.xml
+       ../pilex.py --python -o $@ $<.xml
  
  # cal program
  cal: y.tab.o
diff --git a/tests/cal_py.l b/tests/cal_py.l

index 0479367..624b71a 100644 (file)
--- a/tests/cal_py.l
+++ b/tests/cal_py.l
@@ -1,18 +1,33 @@
  %{
  NUM = 0x100 
  yylval = None
+groups = None
+def gc(n): # resets the capture slots; invoked from the (?{gc(4)}"") action group in the rule below
+  global groups
+  groups = [None for i in range(n)] # n empty slots, filled in by g()
+  yy_group_stack.pop() # discards one stacked position; presumably the one pushed for this action group -- TODO confirm
+def g(n, match, pos): # stores capture n; invoked from the (?{g(n, match, pos)}...) action groups below
+  groups[n] = match[pos:yy_group_stack.pop()] # text from pos up to the popped position
  %}
  
-DIGIT [0-9]+\.?|[0-9]*\.[0-9]+
+DIGIT (?{g(1, match, pos)}[0-9]+)\.?|(?{g(2, match, pos)}[0-9]*)\.(?{g(3, match, pos)}[0-9]+)
  
  %option noecs nometa-ecs noyywrap reject yymore
  
  %%
  
  [ ]
-{DIGIT}        {
+(?{g(0, match, pos)}{DIGIT})(?{gc(4)}"")       {
    global yylval
-  yylval = float(yytext)
+  #print('groups', groups)
+  #yylval = float(yytext)
+  if groups[1] is not None:
+    mantissa = groups[1]
+    exponent = 0
+  else:
+    mantissa = groups[2] + groups[3]
+    exponent = -len(groups[3])
+  yylval = int(mantissa) * 10 ** exponent
    return NUM
  }
  \n|.   {
author	Nick Downing <downing.nick@gmail.com>
	Tue, 25 Sep 2018 23:20:48 +0000 (09:20 +1000)
committer	Nick Downing <downing.nick@gmail.com>
	Tue, 25 Sep 2018 23:20:48 +0000 (09:20 +1000)
ast.py		patch \| blob \| history
dfa.py		patch \| blob \| history
generate_py.py		patch \| blob \| history
nfa.py		patch \| blob \| history
regex.py		patch \| blob \| history
skel/skel_py.py		patch \| blob \| history
tests/Makefile		patch \| blob \| history
tests/cal_py.l		patch \| blob \| history