Get latest regex.py and other modules (since regex.py has been split up) from pyacc2.git...
authorNick Downing <downing.nick@gmail.com>
Sat, 21 Jul 2018 12:04:59 +0000 (22:04 +1000)
committerNick Downing <downing.nick@gmail.com>
Sat, 21 Jul 2018 12:19:40 +0000 (22:19 +1000)
ast.py
bisect_set.py [new file with mode: 0644]
bootstrap_plex.py
dfa.py [new file with mode: 0644]
element.py
flex_dfa.py
generate.py
nfa.py [new file with mode: 0644]
regex.py

diff --git a/ast.py b/ast.py
index 265c713..b8a166f 100644 (file)
--- a/ast.py
+++ b/ast.py
@@ -1,4 +1,5 @@
 import element
+import nfa
 import regex
 
 class Item(element.Element):
@@ -70,8 +71,8 @@ class PLex(element.Element):
       if isinstance(eof_action, str) else
         eof_action
       )
-    def serialize(self, ref_list, indent = 0):
-      element.Element.serialize(self, ref_list, indent)
+    def serialize(self, ref_list):
+      element.Element.serialize(self, ref_list)
       self.set('name', element.serialize_str(self.name))
       self.set('exclusive', element.serialize_bool(self.exclusive))
       self.set('eof_action', element.serialize_int(self.eof_action))
@@ -264,8 +265,8 @@ class PLex(element.Element):
         children
       )
       self.code_blocks_text = code_blocks_text
-    def serialize(self, ref_list, indent = 0):
-      PLex.Section.serialize(self, ref_list, indent)
+    def serialize(self, ref_list):
+      PLex.Section.serialize(self, ref_list)
       self.set(
         'code_blocks_text',
         ' '.join([element.serialize_ref(i, ref_list) for i in self.code_blocks_text])
@@ -351,8 +352,8 @@ class PLex(element.Element):
           if isinstance(value, str) else
             value
           )
-        def serialize(self, ref_list, indent = 0):
-          PLex.Section1.Options.Option.serialize(self, ref_list, indent)
+        def serialize(self, ref_list):
+          PLex.Section1.Options.Option.serialize(self, ref_list)
           self.set('value', element.serialize_bool(self.value))
         def deserialize(self, ref_list):
           PLex.Section1.Options.Option.deserialize(self, ref_list)
@@ -595,8 +596,8 @@ class PLex(element.Element):
         if isinstance(exclusive, str) else
           exclusive
         )
-      def serialize(self, ref_list, indent = 0):
-        Item.serialize(self, ref_list, indent)
+      def serialize(self, ref_list):
+        Item.serialize(self, ref_list)
         self.set('exclusive', element.serialize_bool(self.exclusive))
       def deserialize(self, ref_list):
         Item.deserialize(self, ref_list)
@@ -691,8 +692,8 @@ class PLex(element.Element):
       if isinstance(yywrap, str) else
         yywrap
       )
-    def serialize(self, ref_list, indent = 0):
-      PLex.Section1Or2.serialize(self, ref_list, indent)
+    def serialize(self, ref_list):
+      PLex.Section1Or2.serialize(self, ref_list)
       self.set('ecs', element.serialize_bool(self.ecs))
       self.set('meta_ecs', element.serialize_bool(self.meta_ecs))
       self.set('reject', element.serialize_bool(self.reject))
@@ -786,8 +787,8 @@ class PLex(element.Element):
         if isinstance(wildcard, str) else
           wildcard
         )
-      def serialize(self, ref_list, indent = 0):
-        element.Element.serialize(self, ref_list, indent)
+      def serialize(self, ref_list):
+        element.Element.serialize(self, ref_list)
         self.set('wildcard', element.serialize_bool(self.wildcard))
       def deserialize(self, ref_list):
         element.Element.deserialize(self, ref_list)
@@ -1064,8 +1065,8 @@ class PLex(element.Element):
     self.start_conditions = start_conditions
     self.actions_text = actions_text
     self.eof_actions_text = eof_actions_text
-  def serialize(self, ref_list, indent = 0):
-    element.Element.serialize(self, ref_list, indent)
+  def serialize(self, ref_list):
+    element.Element.serialize(self, ref_list)
     self.set(
       'start_conditions',
       ' '.join([element.serialize_ref(i, ref_list) for i in self.start_conditions])
@@ -1163,7 +1164,7 @@ class PLex(element.Element):
               count0 = 0,
               children = [
                 regex.RegexCharacter(
-                  char_set = [0, 0x100]
+                  character_set = [0, 0x100]
                 )
               ]
             ),
@@ -1173,7 +1174,7 @@ class PLex(element.Element):
                 regex.RegexSequence(
                   children = [
                     regex.RegexCharacter(
-                      char_set = [0, 0x100]
+                      character_set = [0, 0x100]
                     ),
                     regex.RegexGroup(
                       group_index = len(self.actions_text),
@@ -1189,11 +1190,11 @@ class PLex(element.Element):
         )
     self.actions_text.append(PLex.Text(text = 'ECHO;\n'))
   def to_nfa(self):
-    nfa = regex.NFA()
+    _nfa = nfa.NFA()
     for i in self.start_conditions:
       for j in i:
-        j.add_to_nfa(nfa)
-    return nfa
+        j.add_to_nfa(_nfa)
+    return _nfa
 
 # GENERATE FACTORY(regex.factory) BEGIN
 tag_to_class = {
diff --git a/bisect_set.py b/bisect_set.py
new file mode 100644 (file)
index 0000000..6ce1be9
--- /dev/null
@@ -0,0 +1,71 @@
+# defines the alphabet size, set this to 0x11000 for unicode
+n_characters = 0x100
+
+def bisect_set_or(character_set0, character_set1):
+  # calculate union of the child sets
+  # we do this by calculating a series of breakpoints, at each breakpoint
+  # evaluating the "or" (max) of the even/odd truth values of each child,
+  # then making the output truth value even/odd by outputting if necessary
+  #
+  # a character set is a sorted list of breakpoints: a breakpoint at an even
+  # index starts a run of included characters and one at an odd index ends
+  # it, so the parity of an index into the list is the set's truth value
+  result = []
+  i = 0
+  j = 0
+  while True:
+    # k = smallest unconsumed breakpoint of either input; exit if none left
+    if i < len(character_set0):
+      k = character_set0[i]
+      if j < len(character_set1):
+        k = min(k, character_set1[j])
+    elif j < len(character_set1):
+      k = character_set1[j]
+    else:
+      break
+    # consume k from whichever input(s) it came from; after this, (i & 1)
+    # and (j & 1) are the truth values of each input just after point k
+    if i < len(character_set0) and character_set0[i] == k:
+      i += 1
+    if j < len(character_set1) and character_set1[j] == k:
+      j += 1
+    # emit k only when the union's truth value actually changes at k
+    if (len(result) & 1) != max(i & 1, j & 1):
+      result.append(k)
+  # both inputs must be well-formed (even length, every run closed)
+  assert (i & 1) == 0 and (j & 1) == 0
+  return result
+
+def bisect_set_and(character_set0, character_set1):
+  # calculate intersection of the child sets
+  # we do this by calculating a series of breakpoints, at each breakpoint
+  # evaluating the "and" (min) of the even/odd truth values of each child,
+  # then making the output truth value even/odd by outputting if necessary
+  #
+  # same breakpoint-merge as bisect_set_or above, differing only in using
+  # min (logical and) instead of max (logical or) of the two truth values
+  result = []
+  i = 0
+  j = 0
+  while True:
+    # k = smallest unconsumed breakpoint of either input; exit if none left
+    if i < len(character_set0):
+      k = character_set0[i]
+      if j < len(character_set1):
+        k = min(k, character_set1[j])
+    elif j < len(character_set1):
+      k = character_set1[j]
+    else:
+      break
+    # consume k from whichever input(s) it came from; after this, (i & 1)
+    # and (j & 1) are the truth values of each input just after point k
+    if i < len(character_set0) and character_set0[i] == k:
+      i += 1
+    if j < len(character_set1) and character_set1[j] == k:
+      j += 1
+    # emit k only when the intersection's truth value actually changes at k
+    if (len(result) & 1) != min(i & 1, j & 1):
+      result.append(k)
+  # both inputs must be well-formed (even length, every run closed)
+  assert (i & 1) == 0 and (j & 1) == 0
+  return result
+
+def bisect_set_not(character_set):
+  # calculate complement of the child set
+  # if child set begins with [0], remove it, otherwise add [0] prefix
+  # if child set ends with [n_characters], remove it, otherwise add [n_characters] suffix
+  # the suffix part is not totally necessary, but makes sure length is even
+  # (the evenness is so that single character sets can always be [c, c + 1])
+  result = list(character_set)  # copy, so the caller's list is not mutated
+  if result[:1] == [0]:
+    del result[:1]
+  else:
+    result[:0] = [0]
+  if result[-1:] == [n_characters]:
+    del result[-1:]
+  else:
+    result.append(n_characters)
+  return result
index 87c8fe6..a871402 100755 (executable)
@@ -6,13 +6,12 @@ import flex_dfa
 import getopt
 import os
 import sys
-import xml.etree.ElementTree
 
 home_dir = os.path.dirname(sys.argv[0])
 try:
   opts, args = getopt.getopt(sys.argv[1:], 'o:S:', ['outfile=', 'skel='])
 except getopt.GetoptError as err:
-  sys.stderr.write(str(err))
+  sys.stderr.write('{0:s}\n'.format(str(err)))
   sys.exit(1)
 
 out_file = 'lex.yy.c'
@@ -35,5 +34,9 @@ in_file = args[0]
 
 with open(in_file) as fin:
   plex = element.deserialize(fin, ast.factory)
+#element.serialize(plex, 'a.xml', 'utf-8')
+#plex = element.deserialize('a.xml', ast.factory, 'utf-8')
 plex.post_process()
+#element.serialize(plex, 'b.xml', 'utf-8')
+#plex = element.deserialize('b.xml', ast.factory, 'utf-8')
 flex_dfa.generate(plex, skel_file, out_file)
diff --git a/dfa.py b/dfa.py
new file mode 100644 (file)
index 0000000..46f4cbe
--- /dev/null
+++ b/dfa.py
@@ -0,0 +1,221 @@
+import bisect
+import element
+import work
+import sys
+
+# defines the alphabet size, set this to 0x11000 for unicode
+n_characters = 0x100
+
+class DFA:
+  # transition classes:
+  # instructions to transform thread list from a multistate to next multistate
+  # (TRANSITION_POP, n)                        j += n
+  # (TRANSITION_DUP, n)                        threads0[j - n:j] = threads0[j:j + n]
+  #                                    j -= n
+  # (TRANSITION_MARK, n, mark_value)   threads0[j:j + n] = [
+  #                                      (i, mark, thread)
+  #                                      for thread in threads0[j:j + n]
+  #                                    ]
+  # (TRANSITION_MOVE, n)               threads1.extend(threads0[j:j + n])
+  #                                    j += n
+  # (TRANSITION_DEL, n)                        del threads1[-n:]
+
+  TRANSITION_POP = 0
+  TRANSITION_DUP = 1
+  TRANSITION_MARK = 2
+  TRANSITION_MOVE = 3
+  #TRANSITION_DEL = 4
+  # NOTE(review): mutable default arguments below are shared across all
+  # instances constructed without explicit values — confirm callers always
+  # pass their own lists before relying on the defaults
+  def __init__(
+    self,
+    groups = [],
+    states = [([n_characters], [0], 0)],
+    actions = [(0, [])],
+    start_action = [] # can have multiple DFAs in same container
+  ):
+    # groups: list of group_desc
+    # group_desc: (tag, kwargs)
+    #   tag, kwargs will be passed to apply_markup() hence factory()
+    # states: list of state_desc
+    # state_desc: (list of breaks, list of action to do, accept_thread)
+    # actions: list of action_desc
+    # action_desc: (state to go to next, compiled transition to do first)
+    # accept_thread: which thread of thread list to use, -1 don't accept
+    self.groups = groups
+    self.states = states
+    self.actions = actions
+    self.start_action = start_action
+
+  # run the DFA over text[i:], starting from start_action[start_index];
+  # returns the accepted thread — a linked list of (position, mark_value,
+  # prev_thread) tuples, most recently applied mark first — or None if
+  # there is no match
+  def match_text(self, text, i, start_index = 0):
+    def transit(transition):
+      nonlocal threads0, threads1, prefix_slop # note: also uses i
+      # j walks threads0; the first prefix_slop entries of each list are
+      # unused slack so TRANSITION_DUP can copy threads leftwards in place
+      j = prefix_slop
+      for trans in transition:
+        if trans[0] == DFA.TRANSITION_POP:
+          j += trans[1]
+        elif trans[0] == DFA.TRANSITION_DUP:
+          # grow the slack area (doubling) until the duplicate fits
+          while j < trans[1]:
+            threads0[:0] = [None] * prefix_slop
+            threads1[:0] = [None] * prefix_slop
+            j += prefix_slop
+            prefix_slop *= 2
+          threads0[j - trans[1]:j] = threads0[j:j + trans[1]]
+          j -= trans[1]
+        elif trans[0] == DFA.TRANSITION_MARK:
+          # prepend (position, mark_value) to each thread's history
+          threads0[j:j + trans[1]] = [
+            (i, trans[2], thread)
+            for thread in threads0[j:j + trans[1]]
+          ]
+        elif trans[0] == DFA.TRANSITION_MOVE:
+          threads1.extend(threads0[j:j + trans[1]])
+          j += trans[1]
+        #elif trans[0] == DFA.TRANSITION_DEL:
+        #  del threads1[-trans[1]:]
+        else:
+          assert False
+      assert j == len(threads0)
+      threads0, threads1 = threads1, threads0
+      del threads1[prefix_slop:]
+
+    # live threads occupy threads0[prefix_slop:]; one initial (empty) thread
+    threads0 = [None, None]
+    threads1 = [None]
+    prefix_slop = 1
+
+    action = self.start_action[start_index]
+    while action != -1:
+      state, transition = self.actions[action]
+      #print('i', i, 'action', action, 'state', state, 'transition', transition)
+      transit(transition)
+      if state == 0:
+        # there is only one match, which is complete
+        assert len(threads0) == prefix_slop + 1
+        return threads0[prefix_slop]
+      if i >= len(text):
+        # return best match we have, but not incomplete match
+        i = self.states[state][2]
+        return (None if i == -1 else threads0[prefix_slop + i])
+      action = self.states[state][1][
+        bisect.bisect_right(self.states[state][0], ord(text[i]))
+      ]
+      i += 1
+    return None
+
+  # same matching loop as match_text, but over an element tree: positions
+  # are (pos, off) pairs, text is fetched per chunk via element.get_text(),
+  # and yychunk_iter is pumped to append more text on demand; marks are
+  # recorded as (pos, off, mark_value, prev_thread) tuples
+  def match_yychunk(self, root, pos, off, yychunk_iter, start_index = 0):
+    if pos < 0:
+      pos, off = element.to_start_relative(root, pos, off)
+
+    def transit(transition):
+      nonlocal threads0, threads1, prefix_slop # note: also uses pos, off
+      j = prefix_slop
+      for trans in transition:
+        if trans[0] == DFA.TRANSITION_POP:
+          j += trans[1]
+        elif trans[0] == DFA.TRANSITION_DUP:
+          while j < trans[1]:
+            threads0[:0] = [None] * prefix_slop
+            threads1[:0] = [None] * prefix_slop
+            j += prefix_slop
+            prefix_slop *= 2
+          threads0[j - trans[1]:j] = threads0[j:j + trans[1]]
+          j -= trans[1]
+        elif trans[0] == DFA.TRANSITION_MARK:
+          threads0[j:j + trans[1]] = [
+            (pos, off, trans[2], thread)
+            for thread in threads0[j:j + trans[1]]
+          ]
+        elif trans[0] == DFA.TRANSITION_MOVE:
+          threads1.extend(threads0[j:j + trans[1]])
+          j += trans[1]
+        #elif trans[0] == DFA.TRANSITION_DEL:
+        #  del threads1[-trans[1]:]
+        else:
+          assert False
+      assert j == len(threads0)
+      threads0, threads1 = threads1, threads0
+      del threads1[prefix_slop:]
+
+    threads0 = [None, None]
+    threads1 = [None]
+    prefix_slop = 1
+
+    action = self.start_action[start_index]
+    text = element.get_text(root, pos)
+    while action != -1:
+      state, transition = self.actions[action]
+      transit(transition)
+      if state == 0:
+        # there is only one match, which is complete
+        assert len(threads0) == prefix_slop + 1
+        return threads0[prefix_slop]
+      while off >= len(text):
+        if pos < len(root):
+          pos += 1
+          off = 0
+          # NOTE(review): text is not re-fetched after advancing pos here,
+          # so the loop may exit with the previous chunk's text — it looks
+          # like element.get_text(root, pos) should be called in this
+          # branch too; confirm against element.get_text()'s semantics
+        else:
+          try:
+            next(yychunk_iter)
+          except StopIteration:
+            # return best match we have, but not incomplete match
+            i = self.states[state][2]
+            return (None if i == -1 else threads0[prefix_slop + i])
+          text = element.get_text(root, pos)
+      #print(
+      #  'state {0:d} pos {1:d} off {2:d} text "{3:s}"'.format(
+      #    state,
+      #    pos,
+      #    off,
+      #    text.replace('\n', '$')
+      #  )
+      #)
+      action = self.states[state][1][
+        bisect.bisect_right(self.states[state][0], ord(text[off]))
+      ]
+      off += 1
+    return None
+
+  # generator: repeatedly match at (pos, off), replay each match's mark
+  # history (even mark_value = group start, odd = group end) to recover
+  # group extents, apply markup for every inner group with a non-empty tag,
+  # and yield (end_pos, end_off, group_index) for the outermost group of
+  # each match; stops at the first position with no match
+  def yylex(self, root, pos, off, factory, yychunk_iter):
+    if pos < 0:
+      pos, off = element.to_start_relative(root, pos, off)
+
+    while True:
+      # note: pointers must be kept start relative during the below call,
+      # because it extends the following text by calling the yychunk_iter
+      thread = self.match_yychunk(root, pos, off, yychunk_iter)
+      if thread is None:
+        break
+      stack = []
+      while True:
+        # thread history is walked newest-first: group ends are seen before
+        # their matching starts, so ends push and starts pop
+        pos, off, mark_value, thread = thread
+        group_index = mark_value >> 1
+        if (mark_value & 1) != 0:
+          end_pos, end_off = element.to_end_relative(root, pos, off)
+          stack.append((end_pos, end_off, group_index))
+        else:
+          end_pos, end_off, temp = stack.pop()
+          assert temp == group_index
+          if len(stack) == 0:
+            # outermost group: do not mark it up, just yield its extent
+            break
+          tag, kwargs = self.groups[group_index]
+          if tag != '':
+            work.apply_markup(
+              root,
+              pos,
+              off,
+              end_pos,
+              end_off,
+              factory,
+              tag,
+              **kwargs
+            )
+      # note: pointers must be kept end relative during the below call,
+      # because it modifies the preceding text by calling apply_markup()
+      yield end_pos, end_off, group_index
+      pos, off = element.to_start_relative(root, end_pos, end_off)
+
+  def __repr__(self):
+    return 'dfa.DFA({0:s}, {1:s}, {2:s}, {3:s})'.format(
+      repr(self.groups),
+      repr(self.states),
+      repr(self.actions),
+      repr(self.start_action)
+    )
index e9bda98..1f8e84f 100644 (file)
@@ -1,18 +1,20 @@
-import sys
 import xml.etree.ElementTree
 
 class Element(xml.etree.ElementTree._Element_Py):
   def __init__(self, tag = 'Element', attrib = {}, text = '', children = []):
     xml.etree.ElementTree._Element_Py.__init__(self, tag, attrib)
     self.ref = -1
+    self.seen = False
     set_text(self, 0, text)
     self[:] = children
-  def serialize(self, ref_list, indent = 0):
-    if len(self):
-      for i in range(len(self)):
-        set_text(self, i, '\n' + ' ' * (indent + 2))
-        self[i].serialize(ref_list, indent + 2)
-      set_text(self, len(self), '\n' + ' ' * indent)
+  def serialize(self, ref_list):
+    for i in self:
+      # parented, enforce that child can only be parented at most once
+      # (although there can be unlimited numbers of numeric refs to it)
+      assert not i.seen
+      i.seen = True
+      if i.ref == -1:
+        i.serialize(ref_list)
   def deserialize(self, ref_list):
     for i in self:
       i.deserialize(ref_list)
@@ -53,13 +55,15 @@ def serialize_ref(value, ref_list):
     ref = -1
   else:
     ref = value.ref
-    if ref < 0:
-      value.serialize(ref_list, 2)
+    if ref == -1:
       ref = len(ref_list)
-      set_text(ref_list, ref, '\n  ')
       ref_list.append(value)
       value.ref = ref
-      value.set('ref', serialize_int(ref))
+      value.set('ref', str(ref))
+      # this doesn't set the seen flag, so it will be parented by the
+      # root, unless it is already parented or gets parented later on
+      if not value.seen:
+        value.serialize(ref_list)
   return str(ref)
 
 def deserialize_ref(text, ref_list):
@@ -72,34 +76,55 @@ def serialize_str(value):
 def deserialize_str(text):
   return text
 
-def serialize(root, fout, encoding = 'utf-8'):
-  ref_list = Element('RefList')
-  root.serialize(ref_list, 2)
-  ref = len(ref_list)
-  set_text(ref_list, ref, '\n  ')
-  ref_list.append(root)
-  root.ref = ref
-  root.set('ref', serialize_int(ref))
-  set_text(ref_list, ref + 1, '\n')
-  ref_list.tail = '\n'
-  xml.etree.ElementTree.ElementTree(ref_list).write(fout, encoding)
+def serialize(value, fout, encoding = 'unicode'):
+  ref_list = []
+  serialize_ref(value, ref_list)
+  parents = [i for i in ref_list if not i.seen]
+  root = Element('root', children = parents)
+  for i in range(len(root)):
+    set_text(root, i, '\n  ')
+  set_text(root, len(root), '\n')
+  root.tail = '\n'
+  xml.etree.ElementTree.ElementTree(root).write(fout, encoding)
+  for i in root:
+    i.tail = None
   for i in ref_list:
     i.ref = -1
     del i.attrib['ref']
+  i = 0
+  while i < len(parents):
+    for j in parents[i]:
+      j.seen = False
+      parents.append(j)
+    i += 1
 
-def deserialize(fin, factory = Element, encoding = 'utf-8'):
-  ref_list = xml.etree.ElementTree.parse(
+def deserialize(fin, factory = Element, encoding = 'unicode'):
+  root = xml.etree.ElementTree.parse(
     fin,
     xml.etree.ElementTree.XMLParser(
       target = xml.etree.ElementTree.TreeBuilder(factory),
       encoding = encoding
     )
   ).getroot()
-  for i in ref_list:
+  assert root.tag == 'root'
+  for i in root:
+    i.tail = None
+  i = 0
+  parents = root[:]
+  ref_list = []
+  while i < len(parents):
+    j = parents[i]
+    if 'ref' in j.attrib:
+      ref = int(j.attrib['ref'])
+      del j.attrib['ref']
+      if len(ref_list) < ref + 1:
+        ref_list.extend([None] * (ref + 1 - len(ref_list)))
+      ref_list[ref] = j
+    parents.extend(j[:])
+    i += 1
+  for i in root:
     i.deserialize(ref_list)
-    i.ref = -1
-    del i.attrib['ref']
-  return ref_list[-1]
+  return ref_list[0]
 
 # compatibility scheme to access arbitrary xml.etree.ElementTree.Element-like
 # objects (not just Element defined above) using a more consistent interface:
index 51ae564..4e62b7a 100644 (file)
@@ -1,3 +1,4 @@
+import dfa
 import element
 import numpy
 import regex
@@ -6,7 +7,7 @@ class FlexDFA:
   YY_TRAILING_MASK = 0x2000
   YY_TRAILING_HEAD_MASK = 0x4000
 
-  def __init__(self, dfa):
+  def __init__(self, _dfa):
     # we use a modified version of the transition routine, we do not know
     # how many threads are active, so we just create null threads as they
     # are referred to (resulting threads have current marks but no history),
@@ -17,9 +18,9 @@ class FlexDFA:
       for trans in transition:
         if len(threads0) < j + trans[1]:
           threads0.extend([[] for k in range(j + trans[1] - len(threads0))])
-        if trans[0] == regex.DFA.TRANSITION_POP:
+        if trans[0] == dfa.DFA.TRANSITION_POP:
           j += trans[1]
-        elif trans[0] == regex.DFA.TRANSITION_DUP:
+        elif trans[0] == dfa.DFA.TRANSITION_DUP:
           while j < trans[1]:
             threads0[:0] = [None] * prefix_slop
             threads1[:0] = [None] * prefix_slop
@@ -30,13 +31,13 @@ class FlexDFA:
             for k in threads0[j:j + trans[1]]
           ]
           j -= trans[1]
-        elif trans[0] == regex.DFA.TRANSITION_MARK:
+        elif trans[0] == dfa.DFA.TRANSITION_MARK:
           for k in range(j, j + trans[1]):
             threads0[j].append(trans[2])
-        elif trans[0] == regex.DFA.TRANSITION_MOVE:
+        elif trans[0] == dfa.DFA.TRANSITION_MOVE:
           threads1.extend(threads0[j:j + trans[1]])
           j += trans[1]
-        #elif trans[0] == regex.DFA.TRANSITION_DEL:
+        #elif trans[0] == dfa.DFA.TRANSITION_DEL:
         #  del threads1[-trans[1]:]
         else:
           assert False
@@ -59,7 +60,7 @@ class FlexDFA:
     # the correctly numbered slot (may cause duplicates), and then all
     # actions reachable from these being copied to the subsequent slots
     # (if start-action is reached again, uses the lower numbered copy)
-    flex_state_to_action = [0] + dfa.start_action
+    flex_state_to_action = [0] + _dfa.start_action
     action_to_flex_state = {-1: 0} # see comment about state -1 below
     for i in range(len(flex_state_to_action)):
       action = flex_state_to_action[i]
@@ -68,7 +69,7 @@ class FlexDFA:
 
     # last start-condition is really end-of-buffer (EOB), it has only
     # a dummy rule that accepts the null string and executes EOB action
-    eob_state = len(dfa.start_action)
+    eob_state = len(_dfa.start_action)
 
     # state 0 is the jam state, the EOB state will be added later on
     self.states = [([], 0, 0)] # accept, base, def
@@ -86,7 +87,7 @@ class FlexDFA:
 
     while len(self.states) < len(flex_state_to_action):
       action = flex_state_to_action[len(self.states)]
-      state, transition = dfa.actions[action]
+      state, transition = _dfa.actions[action]
       #print('state', len(self.states), 'transition', transition)
 
       del threads0[prefix_slop:]
@@ -121,11 +122,11 @@ class FlexDFA:
           new_full_entries[:full_entries.shape[0], :] = full_entries
           full_entries = new_full_entries
 
-        # calculate full entry from dfa.state char-to-action table
-        breaks, actions, _ = dfa.states[state]
-        char0 = 0
+        # calculate full entry from _dfa.state character-to-action table
+        breaks, actions, _ = _dfa.states[state]
+        character0 = 0
         for i in range(len(breaks)):
-          char1 = breaks[i]
+          character1 = breaks[i]
           next_action = actions[i]
           if next_action in action_to_flex_state:
             next_flex_state = action_to_flex_state[next_action]
@@ -133,9 +134,9 @@ class FlexDFA:
             next_flex_state = len(flex_state_to_action)
             action_to_flex_state[next_action] = next_flex_state
             flex_state_to_action.append(next_action)
-          full_entries[len(self.states), char0:char1] = next_flex_state
-          char0 = char1
-        assert char0 == 0x100
+          full_entries[len(self.states), character0:character1] = next_flex_state
+          character0 = character1
+        assert character0 == 0x100
 
         # remap NUL transition to 0x100 and replace with EOB transition
         full_entries[len(self.states), 0x100] = \
index 169c348..f9a633b 100755 (executable)
@@ -219,8 +219,8 @@ while len(line):
           )
       if len(fields):
         sys.stdout.write(
-          '''{0:s}def serialize(self, ref_list, indent = 0):
-{1:s}  {2:s}.serialize(self, ref_list, indent)
+          '''{0:s}def serialize(self, ref_list):
+{1:s}  {2:s}.serialize(self, ref_list)
 '''.format(indent, indent, full_base_class)
         )
         for type, name in fields:
diff --git a/nfa.py b/nfa.py
new file mode 100644 (file)
index 0000000..0d108bd
--- /dev/null
+++ b/nfa.py
@@ -0,0 +1,446 @@
+import bisect
+import dfa
+import element
+
+# defines the alphabet size, set this to 0x11000 for unicode
+n_characters = 0x100
+
+class NFA:
+  # state_desc classes:
+  # (STATE_CHARACTER, character_set, next_state)
+  # (STATE_OR, next_state0, next_state1)
+  # (STATE_AND, next_state0, next_state1)
+  # (STATE_JOIN0,)
+  # (STATE_JOIN1, next_state)
+  # (STATE_MARK, mark_value, next_state)
+
+  STATE_CHARACTER = 0
+  STATE_OR = 1
+  STATE_AND = 2
+  STATE_JOIN0 = 3
+  STATE_JOIN1 = 4
+  STATE_MARK = 5
+  join0_state = (STATE_JOIN0,)
+
+  # multistate classes:
+  # (MULTISTATE_ACCEPT, 1)             accept, occupies one thread in list
+  # (MULTISTATE_AND, n, state, child)  and, occupies n threads in list
+  # (MULTISTATE_OR, n, child0, child1)
+  # n = sum of n from child states, that is, multistate[1] is the number of
+  # (MULTISTATE_ACCEPT, 1) leaf states ultimately reachable from this subtree
+
+  MULTISTATE_ACCEPT = 0
+  MULTISTATE_AND = 1
+  MULTISTATE_OR = 2
+  accept_multistate = (MULTISTATE_ACCEPT, 1)
+
+  def __init__(
+    self,
+    groups = [],
+    states = [(STATE_CHARACTER, [0, n_characters], 0)],
+    start_state = [] # can have multiple NFAs in same container
+  ):
+    # groups: list of group_desc
+    # group_desc: (tag, kwargs)
+    #   tag, kwargs will be passed to apply_markup() hence factory()
+    self.groups = groups
+    self.states = states
+    self.start_state = start_state
+
+  def multistate_next(self, root_multistate, character):
+    # the deduplication works as effectively a second pass which goes
+    # over the multistate tree in pre-order, looking for OR-disjunctions
+    # of any depth and configuration, e.g. (a OR b) or (c OR d), and
+    # collecting the subexpressions e.g. a, b, c OR b, c, d, c OR d, and
+    # failing any subexpression that's previously occurred in pre-order
+    # (children of AND-conjunctions get a fresh empty subexpression set)
+
+    # unfortunately it can't be done as a second pass literally, because
+    # we have to generate the transition list, thus we need to know if a
+    # subexpression is going to be failed, so we can delete its threads
+
+    # therefore the deduplication is tacked onto the end of the building
+    # process, examining each built node just before the builder returns,
+    # if already in the subexpression set we will rewrite the transition
+    # list produced by the recursive calls, otherwise we will add it in
+
+    # the problem is determining the subexpression set to pass into any
+    # recursive calls, the caller's set will be sent unchanged by cases
+    # that return the result unchanged or add an OR-disjunction, an empty
+    # set will be sent by cases that add an AND-conjunction to the result
+
+    # if we have added something to the node built by a recursive call,
+    # we'll fall into the deduplication logic, otherwise just return it
+
+    def advance1(multistate, join_count, done_multistates):
+      # modifies nonlocal: transition
+      assert multistate[0] == NFA.MULTISTATE_AND
+      _, _, state, child = multistate
+      if state == 0:
+        # note that if we reach the accepting state, we must be a top-level
+        # expression, and could not be part of an AND-conjunction (because
+        # AND-conjunction terms always go to a join0 or join1 state first),
+        # we remove ourselves to indicate to scanner that match is complete
+        assert join_count == 0
+        assert child == NFA.accept_multistate
+        return advance(child, 0, done_multistates)
+      state_desc = self.states[state]
+      if state_desc[0] == NFA.STATE_CHARACTER:
+        if join_count != 0:
+          transition.append((dfa.DFA.TRANSITION_POP, child[1]))
+          return None
+        len_transition = len(transition)
+        child = advance(child, 0, set())
+        if child is None:
+          return None
+        result = (NFA.MULTISTATE_AND, child[1], state, child)
+      elif state_desc[0] == NFA.STATE_OR:
+        _, next_state0, next_state1 = state_desc
+        len_transition = len(transition)
+        transition.append((dfa.DFA.TRANSITION_DUP, child[1]))
+        child0 = advance1(
+          (NFA.MULTISTATE_AND, child[1], next_state0, child),
+          join_count,
+          done_multistates
+        )
+        child1 = advance1(
+          (NFA.MULTISTATE_AND, child[1], next_state1, child),
+          join_count,
+          done_multistates
+        )
+        if child0 is None:
+          return child1
+        if child1 is None:
+          return child0
+        result = (NFA.MULTISTATE_OR, child0[1] + child1[1], child0, child1)
+      elif state_desc[0] == NFA.STATE_AND:
+        _, next_state0, next_state1 = state_desc
+        return advance1(
+          (
+            NFA.MULTISTATE_AND,
+            child[1],
+            next_state0,
+            (NFA.MULTISTATE_AND, child[1], next_state1, child)
+          ),
+          join_count,
+          done_multistates
+        )
+      elif state_desc[0] == NFA.STATE_JOIN0:
+        return advance(child, join_count + 1, done_multistates)
+      elif state_desc[0] == NFA.STATE_JOIN1:
+        _, next_state = state_desc
+        if join_count == 0:
+          transition.append((dfa.DFA.TRANSITION_POP, child[1]))
+          return None
+        return advance1(
+          (NFA.MULTISTATE_AND, child[1], next_state, child),
+          join_count - 1,
+          done_multistates
+        )
+      elif state_desc[0] == NFA.STATE_MARK:
+        _, mark_value, next_state = state_desc
+        transition.append((dfa.DFA.TRANSITION_MARK, child[1], mark_value))
+        return advance1(
+          (NFA.MULTISTATE_AND, child[1], next_state, child),
+          join_count,
+          done_multistates
+        )
+      else:
+        assert False
+      if result in done_multistates:
+        transition[len_transition:] = [(dfa.DFA.TRANSITION_POP, multistate[1])]
+        return None
+      done_multistates.add(result)
+      return result
+
+    def advance(multistate, join_count, done_multistates):
+      nonlocal character0, character1 # modifies nonlocal: transition
+      if multistate[0] == NFA.MULTISTATE_ACCEPT:
+        assert join_count == 0
+        len_transition = len(transition)
+        transition.append((dfa.DFA.TRANSITION_MOVE, 1))
+        result = NFA.accept_multistate # takes no arguments so use static one
+      elif multistate[0] == NFA.MULTISTATE_AND:
+        if character >= 0:
+          _, _, state, child = multistate
+          state_desc = self.states[state]
+          assert state_desc[0] == NFA.STATE_CHARACTER
+          _, character_set, next_state = state_desc
+          k = bisect.bisect_right(character_set, character)
+          if k > 0 and character0 < character_set[k - 1]:
+            character0 = character_set[k - 1]
+          if k < len(character_set) and character1 > character_set[k]:
+            character1 = character_set[k]
+          if (k & 1) == 0:
+            transition.append((dfa.DFA.TRANSITION_POP, child[1]))
+            return None
+          multistate = (NFA.MULTISTATE_AND, child[1], next_state, child)
+        return advance1(multistate, join_count, done_multistates)
+      elif multistate[0] == NFA.MULTISTATE_OR:
+        _, _, child0, child1 = multistate
+        len_transition = len(transition)
+        child0 = advance(child0, join_count, done_multistates)
+        child1 = advance(child1, join_count, done_multistates)
+        if child0 is None:
+          return child1
+        if child1 is None:
+          return child0
+        result = (NFA.MULTISTATE_OR, child0[1] + child1[1], child0, child1)
+      else:
+        assert False
+      if result in done_multistates:
+        transition[len_transition:] = [(dfa.DFA.TRANSITION_POP, multistate[1])]
+        return None
+      done_multistates.add(result)
+      return result
+
+    transition = []
+    character0 = 0
+    character1 = n_characters
+    root_multistate = advance(root_multistate, 0, set())
+    return root_multistate, transition, character0, character1
+
+  def multistate_accept(root_multistate):
+    i = 0
+    def accept(multistate):
+      nonlocal i
+      if multistate[0] == NFA.MULTISTATE_ACCEPT:
+        return True
+      if multistate[0] == NFA.MULTISTATE_AND:
+        _, _, _, child = multistate
+        i += child[1]
+        return False
+      if multistate[0] == NFA.MULTISTATE_OR:
+        _, _, child0, child1 = multistate
+        return accept(child0) or accept(child1)
+      assert False
+    return i if accept(root_multistate) else -1
+
+  def match_text(self, text, i, start_index = 0):
+    def transit(transition):
+      nonlocal threads0, threads1, prefix_slop # note: also uses i
+      j = prefix_slop
+      for trans in transition:
+        if trans[0] == dfa.DFA.TRANSITION_POP:
+          j += trans[1]
+        elif trans[0] == dfa.DFA.TRANSITION_DUP:
+          while j < trans[1]:
+            threads0[:0] = [None] * prefix_slop
+            threads1[:0] = [None] * prefix_slop
+            j += prefix_slop
+            prefix_slop *= 2
+          threads0[j - trans[1]:j] = threads0[j:j + trans[1]]
+          j -= trans[1]
+        elif trans[0] == dfa.DFA.TRANSITION_MARK:
+          threads0[j:j + trans[1]] = [
+            (i, trans[2], thread)
+            for thread in threads0[j:j + trans[1]]
+          ]
+        elif trans[0] == dfa.DFA.TRANSITION_MOVE:
+          threads1.extend(threads0[j:j + trans[1]])
+          j += trans[1]
+        #elif trans[0] == dfa.DFA.TRANSITION_DEL:
+        #  del threads1[-trans[1]:]
+        else:
+          assert False
+      assert j == len(threads0)
+      threads0, threads1 = threads1, threads0
+      del threads1[prefix_slop:]
+
+    threads0 = [None, None]
+    threads1 = [None]
+    prefix_slop = 1
+
+    start_state = self.start_state[start_index]
+    if start_state == -1:
+      return None
+    next_multistate, transition, _, _ = self.multistate_next(
+      (NFA.MULTISTATE_AND, 1, start_state, NFA.accept_multistate),
+      -1
+    )
+    while next_multistate is not None:
+      transit(transition)
+      assert len(threads0) == prefix_slop + next_multistate[1]
+      if next_multistate == NFA.accept_multistate:
+        # there is only one match, which is complete
+        assert len(threads0) == prefix_slop + 1
+        return threads0[prefix_slop]
+      if i >= len(text):
+        # return best match we have, but not incomplete match
+        i = NFA.multistate_accept(next_multistate)
+        return (None if i == -1 else threads0[prefix_slop + i])
+      next_multistate, transition, _, _ = (
+        self.multistate_next(next_multistate, ord(text[i]))
+      )
+      i += 1
+    return None
+
+  def match_yychunk(self, root, pos, off, yychunk_iter, start_index = 0):
+    if pos < 0:
+      pos, off = element.to_start_relative(root, pos, off)
+
+    def transit(transition):
+      nonlocal threads0, threads1, prefix_slop # note: also uses pos, off
+      j = prefix_slop
+      for trans in transition:
+        if trans[0] == dfa.DFA.TRANSITION_POP:
+          j += trans[1]
+        elif trans[0] == dfa.DFA.TRANSITION_DUP:
+          while j < trans[1]:
+            threads0[:0] = [None] * prefix_slop
+            threads1[:0] = [None] * prefix_slop
+            j += prefix_slop
+            prefix_slop *= 2
+          threads0[j - trans[1]:j] = threads0[j:j + trans[1]]
+          j -= trans[1]
+        elif trans[0] == dfa.DFA.TRANSITION_MARK:
+          threads0[j:j + trans[1]] = [
+            (pos, off, trans[2], thread)
+            for thread in threads0[j:j + trans[1]]
+          ]
+        elif trans[0] == dfa.DFA.TRANSITION_MOVE:
+          threads1.extend(threads0[j:j + trans[1]])
+          j += trans[1]
+        #elif trans[0] == dfa.DFA.TRANSITION_DEL:
+        #  del threads1[-trans[1]:]
+        else:
+          assert False
+      assert j == len(threads0)
+      threads0, threads1 = threads1, threads0
+      del threads1[prefix_slop:]
+
+    threads0 = [None, None]
+    threads1 = [None]
+    prefix_slop = 1
+
+    start_state = self.start_state[start_index]
+    if start_state == -1:
+      return None
+    next_multistate, transition, _, _ = self.multistate_next(
+      (NFA.MULTISTATE_AND, 1, start_state, NFA.accept_multistate),
+      -1
+    )
+    text = element.get_text(root, pos)
+    while next_multistate is not None:
+      transit(transition)
+      assert len(threads0) == prefix_slop + next_multistate[1]
+      if next_multistate == NFA.accept_multistate:
+        # there is only one match, which is complete
+        assert len(threads0) == prefix_slop + 1
+        return threads0[prefix_slop]
+      while off >= len(text):
+        if pos < len(root):
+          pos += 1
+          off = 0
+        else:
+          try:
+            next(yychunk_iter)
+          except StopIteration:
+            # return best match we have, but not incomplete match
+            i = NFA.multistate_accept(next_multistate)
+            return (None if i == -1 else threads0[prefix_slop + i])
+          text = element.get_text(root, pos)
+      next_multistate, transition, _, _ = (
+        self.multistate_next(next_multistate, ord(text[off]))
+      )
+      off += 1
+    return None
+
+  def to_dfa(self):
+    _dfa = dfa.DFA(list(self.groups))
+
+    accept_key = (NFA.accept_multistate, ())
+    action_to_meaning = [accept_key]
+    meaning_to_action = {accept_key: 0}
+    state_to_meaning = [NFA.accept_multistate]
+    meaning_to_state = {NFA.accept_multistate: 0}
+
+    for start_state in self.start_state:
+      if start_state == -1:
+        start_action = -1
+      else:
+        next_multistate, transition, _, _ = self.multistate_next(
+          (NFA.MULTISTATE_AND, 1, start_state, NFA.accept_multistate),
+          -1
+        )
+        if next_multistate is None:
+          start_action = -1
+        else:
+          start_key = (next_multistate, tuple(transition))
+          start_action = len(action_to_meaning)
+          meaning_to_action[start_key] = start_action
+          action_to_meaning.append(start_key)
+      _dfa.start_action.append(start_action)
+
+    while len(_dfa.actions) < len(action_to_meaning):
+      next_multistate, transition = action_to_meaning[len(_dfa.actions)]
+      if next_multistate in meaning_to_state:
+        next_state = meaning_to_state[next_multistate]
+      else:
+        next_state = len(state_to_meaning)
+        state_to_meaning.append(next_multistate)
+        meaning_to_state[next_multistate] = next_state
+      _dfa.actions.append((next_state, list(transition)))
+
+      while len(_dfa.states) < len(state_to_meaning):
+        character = 0
+        multistate = state_to_meaning[len(_dfa.states)]
+        state_desc = ([], [], NFA.multistate_accept(multistate))
+        while character < n_characters:
+          next_multistate, transition, character0, character1 = self.multistate_next(
+            multistate,
+            character
+          )
+          assert character0 == character and character1 > character
+          if next_multistate is None:
+            action = -1
+          else:
+            # optimize transition (optional)
+            i = 0
+            j = 0
+            while i < len(transition):
+              if transition[i][0] == dfa.DFA.TRANSITION_POP:
+                n = transition[i][1]
+                i += 1
+                while (
+                  i < len(transition) and
+                  transition[i][0] == dfa.DFA.TRANSITION_POP
+                ):
+                  n += transition[i][1]
+                  i += 1
+                transition[j] = (dfa.DFA.TRANSITION_POP, n)
+              elif transition[i][0] == dfa.DFA.TRANSITION_MOVE:
+                n = transition[i][1]
+                i += 1
+                while (
+                  i < len(transition) and
+                  transition[i][0] == dfa.DFA.TRANSITION_MOVE
+                ):
+                  n += transition[i][1]
+                  i += 1
+                transition[j] = (dfa.DFA.TRANSITION_MOVE, n)
+              else:
+                transition[j] = transition[i]
+                i += 1
+              j += 1
+            del transition[j:]
+            # end optimize transition
+            key = (next_multistate, tuple(transition))
+            if key in meaning_to_action:
+              action = meaning_to_action[key]
+            else:
+              action = len(action_to_meaning)
+              action_to_meaning.append(key)
+              meaning_to_action[key] = action
+          state_desc[0].append(character1)
+          state_desc[1].append(action)
+          character = character1
+        _dfa.states.append(state_desc)
+    return _dfa
+
+  def __repr__(self):
+    return 'nfa.NFA({0:s}, {1:s}, {2:s})'.format(
+      repr(self.groups),
+      repr(self.states),
+      repr(self.start_state)
+    )
index 080b4c3..0193035 100644 (file)
--- a/regex.py
+++ b/regex.py
@@ -1,79 +1,9 @@
-import bisect
+import bisect_set
 import element
-import work
-import sys
+import nfa
 
 # defines the alphabet size, set this to 0x11000 for unicode
-chars = 0x100
-
-def char_set_or(char_set0, char_set1):
-  # calculate union of the child sets
-  # we do this by calculating a series of breakpoints, at each breakpoint
-  # evaluating the "or" (max) of the even/odd truth values of each child,
-  # then making the output truth value even/odd by outputting if necessary
-  result = []
-  i = 0
-  j = 0
-  while True:
-    if i < len(char_set0):
-      k = char_set0[i]
-      if j < len(char_set1):
-        k = min(k, char_set1[j])
-    elif j < len(char_set1):
-      k = char_set1[j]
-    else:
-      break
-    if i < len(char_set0) and char_set0[i] == k:
-      i += 1
-    if j < len(char_set1) and char_set1[j] == k:
-      j += 1
-    if (len(result) & 1) != max(i & 1, j & 1):
-      result.append(k)
-  assert (i & 1) == 0 and (j & 1) == 0
-  return result
-
-def char_set_and(char_set0, char_set1):
-  # calculate intersection of the child sets
-  # we do this by calculating a series of breakpoints, at each breakpoint
-  # evaluating the "and" (min) of the even/odd truth values of each child,
-  # then making the output truth value even/odd by outputting if necessary
-  result = []
-  i = 0
-  j = 0
-  while True:
-    if i < len(char_set0):
-      k = char_set0[i]
-      if j < len(char_set1):
-        k = min(k, char_set1[j])
-    elif j < len(char_set1):
-      k = char_set1[j]
-    else:
-      break
-    if i < len(char_set0) and char_set0[i] == k:
-      i += 1
-    if j < len(char_set1) and char_set1[j] == k:
-      j += 1
-    if (len(result) & 1) != min(i & 1, j & 1):
-      result.append(k)
-  assert (i & 1) == 0 and (j & 1) == 0
-  return result
-
-def char_set_not(char_set):
-  # calculate complement of the child set
-  # if child set begins with [0], remove it, otherwise add [0] prefix
-  # if child set ends with [chars], remove it, otherwise add [chars] suffix
-  # the suffix part is not totally necessary, but makes sure length is even
-  # (the evenness is so that single character sets can always be [c, c + 1])
-  result = list(char_set)
-  if result[:1] == [0]:
-    del result[:1]
-  else:
-    result[:0] = [0]
-  if result[-1:] == [chars]:
-    del result[-1:]
-  else:
-    result.append(chars)
-  return result
+n_characters = 0x100
 
 class Regex(element.Element):
   # GENERATE ELEMENT() BEGIN
@@ -102,24 +32,24 @@ class Regex(element.Element):
     self.repr_serialize(params)
     return 'regex.Regex({0:s})'.format(', '.join(params))
   # GENERATE END
-  def post_process(self, group_index = 0, rule_char_sets = None):
+  def post_process(self, group_index = 0): #, rule_name_to_character_set = None):
     for i in self:
-      group_index = i.post_process(group_index, rule_char_sets)
+      group_index = i.post_process(group_index) #, rule_name_to_character_set)
     return group_index
   def to_groups(self, groups):
     for i in self:
       i.to_groups(groups)
-  def to_nfa_state(self, nfa, next_state):
+  def to_nfa_state(self, _nfa, next_state):
     raise NotImplementedException
-  def add_to_nfa(self, nfa):
-    nfa.start_state.append(self.to_nfa_state(nfa, 0))
-  def to_lr1_symbols(self, terminal_thres, symbols, lookaheads, group_bounds):
-    group_count = 0
-    for i in self:
-      group_count += (
-        i.to_lr1_symbols(terminal_thres, symbols, lookaheads, group_bounds)
-      )
-    return group_count # count of groups or ungrouped characters
+  def add_to_nfa(self, _nfa):
+    _nfa.start_state.append(self.to_nfa_state(_nfa, 0))
+  #def to_lr1_symbols(self, n_terminals, symbols, lookaheads, group_bounds):
+  #  group_count = 0
+  #  for i in self:
+  #    group_count += (
+  #      i.to_lr1_symbols(n_terminals, symbols, lookaheads, group_bounds)
+  #    )
+  #  return group_count # count of groups or ungrouped characters
 
 class RegexNone(Regex):
   # GENERATE ELEMENT() BEGIN
@@ -148,7 +78,7 @@ class RegexNone(Regex):
     self.repr_serialize(params)
     return 'regex.RegexNone({0:s})'.format(', '.join(params))
   # GENERATE END
-  def to_nfa_state(self, nfa, next_state):
+  def to_nfa_state(self, _nfa, next_state):
     return -1
 
 class RegexEmpty(Regex):
@@ -178,18 +108,18 @@ class RegexEmpty(Regex):
     self.repr_serialize(params)
     return 'regex.RegexEmpty({0:s})'.format(', '.join(params))
   # GENERATE END
-  def to_nfa_state(self, nfa, next_state):
+  def to_nfa_state(self, _nfa, next_state):
     return next_state
 
 class RegexCharacter(Regex):
-  # GENERATE ELEMENT(list(int) char_set) BEGIN
+  # GENERATE ELEMENT(list(int) character_set) BEGIN
   def __init__(
     self,
     tag = 'RegexCharacter',
     attrib = {},
     text = '',
     children = [],
-    char_set = []
+    character_set = []
   ):
     Regex.__init__(
       self,
@@ -198,36 +128,36 @@ class RegexCharacter(Regex):
       text,
       children
     )
-    self.char_set = (
-      [element.deserialize_int(i) for i in char_set.split()]
-    if isinstance(char_set, str) else
-      char_set
+    self.character_set = (
+      [element.deserialize_int(i) for i in character_set.split()]
+    if isinstance(character_set, str) else
+      character_set
     )
-  def serialize(self, ref_list, indent = 0):
-    Regex.serialize(self, ref_list, indent)
+  def serialize(self, ref_list):
+    Regex.serialize(self, ref_list)
     self.set(
-      'char_set',
-      ' '.join([element.serialize_int(i) for i in self.char_set])
+      'character_set',
+      ' '.join([element.serialize_int(i) for i in self.character_set])
     )
   def deserialize(self, ref_list):
     Regex.deserialize(self, ref_list)
-    self.char_set = [
+    self.character_set = [
       element.deserialize_int(i)
-      for i in self.get('char_set', '').split()
+      for i in self.get('character_set', '').split()
     ]
   def copy(self, factory = None):
     result = Regex.copy(
       self,
       RegexCharacter if factory is None else factory
     )
-    result.char_set = self.char_set
+    result.character_set = self.character_set
     return result
   def repr_serialize(self, params):
     Regex.repr_serialize(self, params)
-    if len(self.char_set):
+    if len(self.character_set):
       params.append(
-        'char_set = [{0:s}]'.format(
-          ', '.join([repr(i) for i in self.char_set])
+        'character_set = [{0:s}]'.format(
+          ', '.join([repr(i) for i in self.character_set])
         )
       )
   def __repr__(self):
@@ -235,31 +165,31 @@ class RegexCharacter(Regex):
     self.repr_serialize(params)
     return 'regex.RegexCharacter({0:s})'.format(', '.join(params))
   # GENERATE END
-  def to_nfa_state(self, nfa, next_state):
-    new_state = len(nfa.states)
-    nfa.states.append((NFA.STATE_CHARACTER, self.char_set, next_state))
+  def to_nfa_state(self, _nfa, next_state):
+    new_state = len(_nfa.states)
+    _nfa.states.append((nfa.NFA.STATE_CHARACTER, self.character_set, next_state))
     return new_state
-  def to_lr1_symbols(self, terminal_thres, symbols, lookaheads, group_bounds):
-    terminal_set = []
-    nonterminal_set = []
-    i = 0
-    while i < len(self.char_set):
-      [j, k] = self.char_set[i:i + 2]
-      if k > terminal_thres:
-        if j < terminal_thres:
-          terminal_set.extend([j, terminal_thres])
-          nonterminal_set.extend([0, k - terminal_thres])
-          i += 2
-        while i < len(self.char_set):
-          [j, k] = self.char_set[i:i + 2]
-          nonterminal_set.extend([j - terminal_thres, k - terminal_thres])
-          i += 2
-        break
-      terminal_set.extend([j, k])
-      i += 2
-    symbols.append((terminal_set, nonterminal_set))
-    lookaheads.append(([], False)) # initial_set, can_be_empty
-    return 1 # count of groups or ungrouped characters
+  #def to_lr1_symbols(self, n_terminals, symbols, lookaheads, group_bounds):
+  #  terminal_set = []
+  #  nonterminal_set = []
+  #  i = 0
+  #  while i < len(self.character_set):
+  #    [j, k] = self.character_set[i:i + 2]
+  #    if k > n_terminals:
+  #      if j < n_terminals:
+  #        terminal_set.extend([j, n_terminals])
+  #        nonterminal_set.extend([0, k - n_terminals])
+  #        i += 2
+  #      while i < len(self.character_set):
+  #        [j, k] = self.character_set[i:i + 2]
+  #        nonterminal_set.extend([j - n_terminals, k - n_terminals])
+  #        i += 2
+  #      break
+  #    terminal_set.extend([j, k])
+  #    i += 2
+  #  symbols.append((terminal_set, nonterminal_set))
+  #  lookaheads.append(([], False)) # initial_set, can_be_empty
+  #  return 1 # count of groups or ungrouped characters
 
 class RegexCharacterRange(RegexCharacter):
   # GENERATE ELEMENT() BEGIN
@@ -269,7 +199,7 @@ class RegexCharacterRange(RegexCharacter):
     attrib = {},
     text = '',
     children = [],
-    char_set = []
+    character_set = []
   ):
     RegexCharacter.__init__(
       self,
@@ -277,7 +207,7 @@ class RegexCharacterRange(RegexCharacter):
       attrib,
       text,
       children,
-      char_set
+      character_set
     )
   def copy(self, factory = None):
     result = RegexCharacter.copy(
@@ -290,9 +220,9 @@ class RegexCharacterRange(RegexCharacter):
     self.repr_serialize(params)
     return 'regex.RegexCharacterRange({0:s})'.format(', '.join(params))
   # GENERATE END
-  def post_process(self, group_index = 0, rule_char_sets = None):
-    group_index = RegexCharacter.post_process(self, group_index, rule_char_sets)
-    self.char_set = [self[0].char_set[0], self[1].char_set[-1]]
+  def post_process(self, group_index = 0): #, rule_name_to_character_set = None):
+    group_index = RegexCharacter.post_process(self, group_index) #, rule_name_to_character_set)
+    self.character_set = [self[0].character_set[0], self[1].character_set[-1]]
     return group_index
 
 class RegexCharacterOr(RegexCharacter):
@@ -303,7 +233,7 @@ class RegexCharacterOr(RegexCharacter):
     attrib = {},
     text = '',
     children = [],
-    char_set = []
+    character_set = []
   ):
     RegexCharacter.__init__(
       self,
@@ -311,7 +241,7 @@ class RegexCharacterOr(RegexCharacter):
       attrib,
       text,
       children,
-      char_set
+      character_set
     )
   def copy(self, factory = None):
     result = RegexCharacter.copy(
@@ -324,9 +254,9 @@ class RegexCharacterOr(RegexCharacter):
     self.repr_serialize(params)
     return 'regex.RegexCharacterOr({0:s})'.format(', '.join(params))
   # GENERATE END
-  def post_process(self, group_index = 0, rule_char_sets = None):
-    group_index = RegexCharacter.post_process(self, group_index, rule_char_sets)
-    self.char_set = char_set_or(self[0].char_set, self[1].char_set)
+  def post_process(self, group_index = 0): #, rule_name_to_character_set = None):
+    group_index = RegexCharacter.post_process(self, group_index) #, rule_name_to_character_set)
+    self.character_set = bisect_set.bisect_set_or(self[0].character_set, self[1].character_set)
     return group_index
 
 class RegexCharacterAnd(RegexCharacter):
@@ -337,7 +267,7 @@ class RegexCharacterAnd(RegexCharacter):
     attrib = {},
     text = '',
     children = [],
-    char_set = []
+    character_set = []
   ):
     RegexCharacter.__init__(
       self,
@@ -345,7 +275,7 @@ class RegexCharacterAnd(RegexCharacter):
       attrib,
       text,
       children,
-      char_set
+      character_set
     )
   def copy(self, factory = None):
     result = RegexCharacter.copy(
@@ -358,9 +288,9 @@ class RegexCharacterAnd(RegexCharacter):
     self.repr_serialize(params)
     return 'regex.RegexCharacterAnd({0:s})'.format(', '.join(params))
   # GENERATE END
-  def post_process(self, group_index = 0, rule_char_sets = None):
-    group_index = RegexCharacter.post_process(self, group_index, rule_char_sets)
-    self.char_set = char_set_and(self[0].char_set, self[1].char_set)
+  def post_process(self, group_index = 0): #, rule_name_to_character_set = None):
+    group_index = RegexCharacter.post_process(self, group_index) #, rule_name_to_character_set)
+    self.character_set = bisect_set.bisect_set_and(self[0].character_set, self[1].character_set)
     return group_index
 
 class RegexCharacterNot(RegexCharacter):
@@ -371,7 +301,7 @@ class RegexCharacterNot(RegexCharacter):
     attrib = {},
     text = '',
     children = [],
-    char_set = []
+    character_set = []
   ):
     RegexCharacter.__init__(
       self,
@@ -379,7 +309,7 @@ class RegexCharacterNot(RegexCharacter):
       attrib,
       text,
       children,
-      char_set
+      character_set
     )
   def copy(self, factory = None):
     result = RegexCharacter.copy(
@@ -392,59 +322,59 @@ class RegexCharacterNot(RegexCharacter):
     self.repr_serialize(params)
     return 'regex.RegexCharacterNot({0:s})'.format(', '.join(params))
   # GENERATE END
-  def post_process(self, group_index = 0, rule_char_sets = None):
-    group_index = RegexCharacter.post_process(self, group_index, rule_char_sets)
-    self.char_set = char_set_not(self[0].char_set)
+  def post_process(self, group_index = 0): #, rule_name_to_character_set = None):
+    group_index = RegexCharacter.post_process(self, group_index) #, rule_name_to_character_set)
+    self.character_set = bisect_set.bisect_set_not(self[0].character_set)
     return group_index
 
-class RegexCharacterRule(RegexCharacter):
-  # GENERATE ELEMENT(str rule_name) BEGIN
-  def __init__(
-    self,
-    tag = 'RegexCharacterRule',
-    attrib = {},
-    text = '',
-    children = [],
-    char_set = [],
-    rule_name = ''
-  ):
-    RegexCharacter.__init__(
-      self,
-      tag,
-      attrib,
-      text,
-      children,
-      char_set
-    )
-    self.rule_name = rule_name
-  def serialize(self, ref_list, indent = 0):
-    RegexCharacter.serialize(self, ref_list, indent)
-    self.set('rule_name', element.serialize_str(self.rule_name))
-  def deserialize(self, ref_list):
-    RegexCharacter.deserialize(self, ref_list)
-    self.rule_name = element.deserialize_str(self.get('rule_name', ''))
-  def copy(self, factory = None):
-    result = RegexCharacter.copy(
-      self,
-      RegexCharacterRule if factory is None else factory
-    )
-    result.rule_name = self.rule_name
-    return result
-  def repr_serialize(self, params):
-    RegexCharacter.repr_serialize(self, params)
-    if self.rule_name != '':
-      params.append(
-        'rule_name = {0:s}'.format(repr(self.rule_name))
-      )
-  def __repr__(self):
-    params = []
-    self.repr_serialize(params)
-    return 'regex.RegexCharacterRule({0:s})'.format(', '.join(params))
-  # GENERATE END
-  def post_process(self, group_index = 0, rule_char_sets = None):
-    if rule_char_sets is not None:
-      self.char_set = rule_char_sets[self.rule_name]
-    return RegexCharacter.post_process(self, group_index, rule_char_sets)
+#class RegexCharacterRule(RegexCharacter):
+#  # GENERATE ELEMENT(str rule_name) BEGIN
+#  def __init__(
+#    self,
+#    tag = 'RegexCharacterRule',
+#    attrib = {},
+#    text = '',
+#    children = [],
+#    character_set = [],
+#    rule_name = ''
+#  ):
+#    RegexCharacter.__init__(
+#      self,
+#      tag,
+#      attrib,
+#      text,
+#      children,
+#      character_set
+#    )
+#    self.rule_name = rule_name
+#  def serialize(self, ref_list, indent = 0):
+#    RegexCharacter.serialize(self, ref_list, indent)
+#    self.set('rule_name', element.serialize_str(self.rule_name))
+#  def deserialize(self, ref_list):
+#    RegexCharacter.deserialize(self, ref_list)
+#    self.rule_name = element.deserialize_str(self.get('rule_name', ''))
+#  def copy(self, factory = None):
+#    result = RegexCharacter.copy(
+#      self,
+#      RegexCharacterRule if factory is None else factory
+#    )
+#    result.rule_name = self.rule_name
+#    return result
+#  def repr_serialize(self, params):
+#    RegexCharacter.repr_serialize(self, params)
+#    if self.rule_name != '':
+#      params.append(
+#        'rule_name = {0:s}'.format(repr(self.rule_name))
+#      )
+#  def __repr__(self):
+#    params = []
+#    self.repr_serialize(params)
+#    return 'regex.RegexCharacterRule({0:s})'.format(', '.join(params))
+#  # GENERATE END
+#  def post_process(self, group_index = 0, rule_name_to_character_set = None):
+#    if rule_name_to_character_set is not None:
+#      self.character_set = rule_name_to_character_set[self.rule_name]
+#    return RegexCharacter.post_process(self, group_index, rule_name_to_character_set)
 
 class RegexOr(Regex):
   # GENERATE ELEMENT() BEGIN
@@ -473,15 +403,15 @@ class RegexOr(Regex):
     self.repr_serialize(params)
     return 'regex.RegexOr({0:s})'.format(', '.join(params))
   # GENERATE END
-  def to_nfa_state(self, nfa, next_state):
-    child0_state = self[0].to_nfa_state(nfa, next_state)
-    child1_state = self[1].to_nfa_state(nfa, next_state)
+  def to_nfa_state(self, _nfa, next_state):
+    child0_state = self[0].to_nfa_state(_nfa, next_state)
+    child1_state = self[1].to_nfa_state(_nfa, next_state)
     if child0_state == -1:
       return child1_state
     if child1_state == -1:
       return child0_state
-    new_state = len(nfa.states)
-    nfa.states.append((NFA.STATE_OR, child0_state, child1_state))
+    new_state = len(_nfa.states)
+    _nfa.states.append((nfa.NFA.STATE_OR, child0_state, child1_state))
     return new_state
 
 class RegexAnd(Regex):
@@ -511,19 +441,19 @@ class RegexAnd(Regex):
     self.repr_serialize(params)
     return 'regex.RegexAnd({0:s})'.format(', '.join(params))
   # GENERATE END
-  def to_nfa_state(self, nfa, next_state):
-    join0_state = len(nfa.states)
-    nfa.states.append(NFA.join0_state) # takes no arguments so use static one
-    join1_state = len(nfa.states)
-    nfa.states.append((NFA.STATE_JOIN1, next_state))
-    child0_state = self[0].to_nfa_state(nfa, join0_state)
+  def to_nfa_state(self, _nfa, next_state):
+    join0_state = len(_nfa.states)
+    _nfa.states.append(nfa.NFA.join0_state) # takes no arguments so use static one
+    join1_state = len(_nfa.states)
+    _nfa.states.append((nfa.NFA.STATE_JOIN1, next_state))
+    child0_state = self[0].to_nfa_state(_nfa, join0_state)
     if child0_state == -1:
       return -1
-    child1_state = self[1].to_nfa_state(nfa, join1_state)
+    child1_state = self[1].to_nfa_state(_nfa, join1_state)
     if child1_state == -1:
       return -1
-    new_state = len(nfa.states)
-    nfa.states.append((NFA.STATE_AND, child0_state, child1_state))
+    new_state = len(_nfa.states)
+    _nfa.states.append((nfa.NFA.STATE_AND, child0_state, child1_state))
     return new_state
 
 class RegexSequence(Regex):
@@ -553,11 +483,11 @@ class RegexSequence(Regex):
     self.repr_serialize(params)
     return 'regex.RegexSequence({0:s})'.format(', '.join(params))
   # GENERATE END
-  def to_nfa_state(self, nfa, next_state):
-    child1_state = self[1].to_nfa_state(nfa, next_state)
+  def to_nfa_state(self, _nfa, next_state):
+    child1_state = self[1].to_nfa_state(_nfa, next_state)
     if child1_state == -1:
       return -1
-    return self[0].to_nfa_state(nfa, child1_state)
+    return self[0].to_nfa_state(_nfa, child1_state)
 
 class RegexRepeat(Regex):
   # GENERATE ELEMENT(int count0, int count1, bool non_greedy) BEGIN
@@ -593,8 +523,8 @@ class RegexRepeat(Regex):
     if isinstance(non_greedy, str) else
       non_greedy
     )
-  def serialize(self, ref_list, indent = 0):
-    Regex.serialize(self, ref_list, indent)
+  def serialize(self, ref_list):
+    Regex.serialize(self, ref_list)
     self.set('count0', element.serialize_int(self.count0))
     self.set('count1', element.serialize_int(self.count1))
     self.set('non_greedy', element.serialize_bool(self.non_greedy))
@@ -631,7 +561,7 @@ class RegexRepeat(Regex):
     self.repr_serialize(params)
     return 'regex.RegexRepeat({0:s})'.format(', '.join(params))
   # GENERATE END
-  def post_process(self, group_index = 0, rule_char_sets = None):
+  def post_process(self, group_index = 0): #, rule_name_to_character_set = None):
     # total hack which will be done in a Python action in future
     if len(self) >= 2:
       assert self[1].tag == 'Number'
@@ -643,36 +573,36 @@ class RegexRepeat(Regex):
         self.count1 = self.count0
       del self[1:]
     # end total hack
-    return Regex.post_process(self, group_index, rule_char_sets)
-  def to_nfa_state(self, nfa, next_state):
+    return Regex.post_process(self, group_index) #, rule_name_to_character_set)
+  def to_nfa_state(self, _nfa, next_state):
     count0 = self.count0
     count1 = self.count1
     if count1 == -1:
-      new_state = len(nfa.states)
-      nfa.states.append(None)
-      child_state = self[0].to_nfa_state(nfa, new_state)
+      new_state = len(_nfa.states)
+      _nfa.states.append(None)
+      child_state = self[0].to_nfa_state(_nfa, new_state)
       if child_state == -1:
         new_state = next_state # note: unreachable state remains invalid (None)
       else:
-        nfa.states[new_state] = (
-          (NFA.STATE_OR, next_state, child_state)
+        _nfa.states[new_state] = (
+          (nfa.NFA.STATE_OR, next_state, child_state)
         if self.non_greedy else
-          (NFA.STATE_OR, child_state, next_state)
+          (nfa.NFA.STATE_OR, child_state, next_state)
         )
     else:
       new_state = next_state
       for i in range(count1 - count0):
-        child_state = self[0].to_nfa_state(nfa, new_state)
+        child_state = self[0].to_nfa_state(_nfa, new_state)
         if child_state == -1:
           break
-        new_state = len(nfa.states)
-        nfa.states.append(
-          (NFA.STATE_OR, next_state, child_state)
+        new_state = len(_nfa.states)
+        _nfa.states.append(
+          (nfa.NFA.STATE_OR, next_state, child_state)
         if self.non_greedy else
-          (NFA.STATE_OR, child_state, next_state)
+          (nfa.NFA.STATE_OR, child_state, next_state)
         )
     for i in range(count0):
-      new_state = self[0].to_nfa_state(nfa, new_state)
+      new_state = self[0].to_nfa_state(_nfa, new_state)
       if new_state == -1:
         break
     return new_state
@@ -698,8 +628,8 @@ class RegexGroup(Regex):
       )
       self.name = name
       self.value = value
-    def serialize(self, ref_list, indent = 0):
-      element.Element.serialize(self, ref_list, indent)
+    def serialize(self, ref_list):
+      element.Element.serialize(self, ref_list)
       self.set('name', element.serialize_str(self.name))
       self.set('value', element.serialize_str(self.value))
     def deserialize(self, ref_list):
@@ -755,8 +685,8 @@ class RegexGroup(Regex):
     )
     self.group_name = group_name
     self.group_attributes = group_attributes
-  def serialize(self, ref_list, indent = 0):
-    Regex.serialize(self, ref_list, indent)
+  def serialize(self, ref_list):
+    Regex.serialize(self, ref_list)
     self.set('group_index', element.serialize_int(self.group_index))
     self.set('group_name', element.serialize_str(self.group_name))
     self.set(
@@ -801,7 +731,7 @@ class RegexGroup(Regex):
     self.repr_serialize(params)
     return 'regex.RegexGroup({0:s})'.format(', '.join(params))
   # GENERATE END
-  def post_process(self, group_index = 0, rule_char_sets = None):
+  def post_process(self, group_index = 0): #, rule_name_to_character_set = None):
     # total hack which will be done in a Python action in future
     if len(self) >= 2:
       assert self[0].tag == 'GroupName'
@@ -810,207 +740,40 @@ class RegexGroup(Regex):
     # end total hack
     self.group_index = group_index
     group_index += 1
-    return Regex.post_process(self, group_index, rule_char_sets)
+    return Regex.post_process(self, group_index) #, rule_name_to_character_set)
   def to_groups(self, groups):
     assert len(groups) == self.group_index
     groups.append(
       (self.group_name, {i.name: i.value for i in self.group_attributes})
     )
     return Regex.to_groups(self, groups)
-  def to_nfa_state(self, nfa, next_state):
-    mark_state = len(nfa.states)
-    nfa.states.append((NFA.STATE_MARK, self.group_index * 2 + 1, next_state))
-    child_state = self[0].to_nfa_state(nfa, mark_state)
+  def to_nfa_state(self, _nfa, next_state):
+    mark_state = len(_nfa.states)
+    _nfa.states.append((nfa.NFA.STATE_MARK, self.group_index * 2 + 1, next_state))
+    child_state = self[0].to_nfa_state(_nfa, mark_state)
     if child_state == -1:
       return -1
-    new_state = len(nfa.states)
-    nfa.states.append((NFA.STATE_MARK, self.group_index * 2, child_state))
+    new_state = len(_nfa.states)
+    _nfa.states.append((nfa.NFA.STATE_MARK, self.group_index * 2, child_state))
     return new_state
-  def to_lr1_symbols(self, terminal_thres, symbols, lookaheads, group_bounds):
-    group_start = len(symbols)
-    assert self.group_index == len(group_bounds)
-    group_bounds.append(None)
-    group_count = Regex.to_lr1_symbols(
-      self,
-      terminal_thres,
-      symbols,
-      lookaheads,
-      group_bounds
-    )
-    group_bounds[self.group_index] = (
-      group_start,
-      group_count,
-      self.group_name,
-      {i.name: i.value for i in self.group_attributes}
-    )
-    return 1 # count of groups or ungrouped characters
-
-class Grammar(element.Element):
-  class Production(element.Element):
-    # GENERATE ELEMENT(int nonterminal, int priority, bool right_to_left) BEGIN
-    def __init__(
-      self,
-      tag = 'Grammar_Production',
-      attrib = {},
-      text = '',
-      children = [],
-      nonterminal = -1,
-      priority = -1,
-      right_to_left = False
-    ):
-      element.Element.__init__(
-        self,
-        tag,
-        attrib,
-        text,
-        children
-      )
-      self.nonterminal = (
-        element.deserialize_int(nonterminal)
-      if isinstance(nonterminal, str) else
-        nonterminal
-      )
-      self.priority = (
-        element.deserialize_int(priority)
-      if isinstance(priority, str) else
-        priority
-      )
-      self.right_to_left = (
-        element.deserialize_bool(right_to_left)
-      if isinstance(right_to_left, str) else
-        right_to_left
-      )
-    def serialize(self, ref_list, indent = 0):
-      element.Element.serialize(self, ref_list, indent)
-      self.set('nonterminal', element.serialize_int(self.nonterminal))
-      self.set('priority', element.serialize_int(self.priority))
-      self.set('right_to_left', element.serialize_bool(self.right_to_left))
-    def deserialize(self, ref_list):
-      element.Element.deserialize(self, ref_list)
-      self.nonterminal = element.deserialize_int(self.get('nonterminal', '-1'))
-      self.priority = element.deserialize_int(self.get('priority', '-1'))
-      self.right_to_left = element.deserialize_bool(self.get('right_to_left', 'false'))
-    def copy(self, factory = None):
-      result = element.Element.copy(
-        self,
-        Production if factory is None else factory
-      )
-      result.nonterminal = self.nonterminal
-      result.priority = self.priority
-      result.right_to_left = self.right_to_left
-      return result
-    def repr_serialize(self, params):
-      element.Element.repr_serialize(self, params)
-      if self.nonterminal != -1:
-        params.append(
-          'nonterminal = {0:s}'.format(repr(self.nonterminal))
-        )
-      if self.priority != -1:
-        params.append(
-          'priority = {0:s}'.format(repr(self.priority))
-        )
-      if self.right_to_left != False:
-        params.append(
-          'right_to_left = {0:s}'.format(repr(self.right_to_left))
-        )
-    def __repr__(self):
-      params = []
-      self.repr_serialize(params)
-      return 'regex.Grammar.Production({0:s})'.format(', '.join(params))
-    # GENERATE END
-
-  # GENERATE ELEMENT(int terminal_thres) BEGIN
-  def __init__(
-    self,
-    tag = 'Grammar',
-    attrib = {},
-    text = '',
-    children = [],
-    terminal_thres = -1
-  ):
-    element.Element.__init__(
-      self,
-      tag,
-      attrib,
-      text,
-      children
-    )
-    self.terminal_thres = (
-      element.deserialize_int(terminal_thres)
-    if isinstance(terminal_thres, str) else
-      terminal_thres
-    )
-  def serialize(self, ref_list, indent = 0):
-    element.Element.serialize(self, ref_list, indent)
-    self.set('terminal_thres', element.serialize_int(self.terminal_thres))
-  def deserialize(self, ref_list):
-    element.Element.deserialize(self, ref_list)
-    self.terminal_thres = element.deserialize_int(self.get('terminal_thres', '-1'))
-  def copy(self, factory = None):
-    result = element.Element.copy(
-      self,
-      Grammar if factory is None else factory
-    )
-    result.terminal_thres = self.terminal_thres
-    return result
-  def repr_serialize(self, params):
-    element.Element.repr_serialize(self, params)
-    if self.terminal_thres != -1:
-      params.append(
-        'terminal_thres = {0:s}'.format(repr(self.terminal_thres))
-      )
-  def __repr__(self):
-    params = []
-    self.repr_serialize(params)
-    return 'regex.Grammar({0:s})'.format(', '.join(params))
-  # GENERATE END
-  def post_process(self, rule_char_sets):
-    for i in range(len(self)):
-      self[i].nonterminal = i
-      self[i][0].post_process(0, rule_char_sets)
-  def to_lr1(self):
-    lr1 = LR1([], self.terminal_thres)
-    for i in self:
-      symbols = []
-      lookaheads = []
-      group_bounds = []
-      i[0].to_lr1_symbols(
-        self.terminal_thres,
-        symbols,
-        lookaheads,
-        group_bounds
-      )
-      lookaheads.append(([], True)) # initial_set, can_be_empty (sentinel)
-      lr1.productions.append(
-        (
-          i.priority * 2 + int(i.right_to_left),
-          symbols,
-          lookaheads,
-          group_bounds
-        )
-      )
-    # propagate lookaheads
-    modified = True
-    while modified:
-      modified = False
-      for _, symbols, lookaheads, _ in lr1.productions:
-        for i in range(len(symbols) - 1, -1, -1):
-          initial_set, nonterminal_set = symbols[i]
-          can_be_empty = False
-          for j in range(0, len(nonterminal_set), 2):
-            for k in range(nonterminal_set[j], nonterminal_set[j + 1]):
-              child_initial_set, child_can_be_empty = lr1.productions[k][2][0]
-              initial_set = char_set_or(initial_set, child_initial_set)
-              can_be_empty = can_be_empty or child_can_be_empty
-          # at this point can_be_empty refers to current symbol only
-          if can_be_empty:
-            next_initial_set, can_be_empty = lookaheads[i + 1]
-            initial_set = char_set_or(initial_set, next_initial_set)
-          # at this point can_be_empty refers to all remaining symbols
-          if (initial_set, can_be_empty) != lookaheads[i]:
-            lookaheads[i] = (initial_set, can_be_empty)
-            modified = True
-    return lr1
+  #def to_lr1_symbols(self, n_terminals, symbols, lookaheads, group_bounds):
+  #  group_start = len(symbols)
+  #  assert self.group_index == len(group_bounds)
+  #  group_bounds.append(None)
+  #  group_count = Regex.to_lr1_symbols(
+  #    self,
+  #    n_terminals,
+  #    symbols,
+  #    lookaheads,
+  #    group_bounds
+  #  )
+  #  group_bounds[self.group_index] = (
+  #    group_start,
+  #    group_count,
+  #    self.group_name,
+  #    {i.name: i.value for i in self.group_attributes}
+  #  )
+  #  return 1 # count of groups or ungrouped characters
 
 # GENERATE FACTORY(element.Element) BEGIN
 tag_to_class = {
@@ -1022,1435 +785,34 @@ tag_to_class = {
   'RegexCharacterOr': RegexCharacterOr,
   'RegexCharacterAnd': RegexCharacterAnd,
   'RegexCharacterNot': RegexCharacterNot,
-  'RegexCharacterRule': RegexCharacterRule,
   'RegexOr': RegexOr,
   'RegexAnd': RegexAnd,
   'RegexSequence': RegexSequence,
   'RegexRepeat': RegexRepeat,
   'RegexGroup': RegexGroup,
-  'RegexGroup_Attribute': RegexGroup.Attribute,
-  'Grammar': Grammar,
-  'Grammar_Production': Grammar.Production
+  'RegexGroup_Attribute': RegexGroup.Attribute
 }
 def factory(tag, attrib = {}, *args, **kwargs):
   return tag_to_class.get(tag, element.Element)(tag, attrib, *args, **kwargs)
 # GENERATE END
 
-class NFA:
-  # state_desc classes:
-  # (STATE_CHARACTER, char_set, next_state)
-  # (STATE_OR, next_state0, next_state1)
-  # (STATE_AND, next_state0, next_state1)
-  # (STATE_JOIN0,)
-  # (STATE_JOIN1, next_state)
-  # (STATE_MARK, mark_value, next_state)
-
-  STATE_CHARACTER = 0
-  STATE_OR = 1
-  STATE_AND = 2
-  STATE_JOIN0 = 3
-  STATE_JOIN1 = 4
-  STATE_MARK = 5
-  join0_state = (STATE_JOIN0,)
-
-  # multistate classes:
-  # (MULTISTATE_ACCEPT, 1)             accept, occupies one thread in list
-  # (MULTISTATE_AND, n, state, child)  and, occupies n threads in list
-  # (MULTISTATE_OR, n, child0, child1)
-  # n = sum of n from child states, that is, multistate[1] is the number of
-  # (MULTISTATE_ACCEPT, 1) leaf states ultimately reachable from this subtree
-
-  MULTISTATE_ACCEPT = 0
-  MULTISTATE_AND = 1
-  MULTISTATE_OR = 2
-  accept_multistate = (MULTISTATE_ACCEPT, 1)
-
-  def __init__(
-    self,
-    groups = [],
-    states = [(STATE_CHARACTER, [0, chars], 0)],
-    start_state = [] # can have multiple NFAs in same container
-  ):
-    # groups: list of group_desc
-    # group_desc: (tag, kwargs)
-    #   tag, kwargs will be passed to apply_markup() hence factory()
-    self.groups = groups
-    self.states = states
-    self.start_state = start_state
-
-  def multistate_next(self, root_multistate, char):
-    # the deduplication works as effectively a second pass which goes
-    # over the multistate tree in pre-order, looking for OR-disjunctions
-    # of any depth and configuration, e.g. (a OR b) or (c OR d), and
-    # collecting the subexpressions e.g. a, b, c OR b, c, d, c OR d, and
-    # failing any subexpression that's previously occurred in pre-order
-    # (children of AND-conjunctions get a fresh empty subexpression set)
-
-    # unfortunately it can't be done as a second pass literally, because
-    # we have to generate the transition list, thus we need to know if a
-    # subexpression is going to be failed, so we can delete it's threads
-
-    # therefore the deduplication is tacked onto the end of the building
-    # process, examining each built node just before the builder returns,
-    # if already in the subexpression set we will rewrite the transition
-    # list produced by the recursive calls, otherwise we will add it in
-
-    # the problem is determining the subexpression set to pass into any
-    # recursive calls, the caller's set will be sent unchanged by cases
-    # that return the result unchanged or add an OR-disjuction, an empty
-    # set will be sent by cases that add an AND-conjuction to the result 
-
-    # if we have added something to the node built by a recursive call,
-    # we'll fall into the deduplication logic, otherwise just return it
-
-    def advance1(multistate, join_count, done_multistates):
-      # modifies nonlocal: transition
-      assert multistate[0] == NFA.MULTISTATE_AND
-      _, _, state, child = multistate
-      if state == 0:
-        # note that if we reach the accepting state, we must be a top-level
-        # expression, and could not be part of an AND-conjunction (because
-        # AND-conjunction terms always go to a join0 or join1 state first),
-        # we remove ourselves to indicate to scanner that match is complete
-        assert join_count == 0
-        assert child == NFA.accept_multistate
-        return advance(child, 0, done_multistates)
-      state_desc = self.states[state]
-      if state_desc[0] == NFA.STATE_CHARACTER:
-        if join_count != 0:
-          transition.append((DFA.TRANSITION_POP, child[1]))
-          return None
-        len_transition = len(transition)
-        child = advance(child, 0, set())
-        if child is None:
-          return None
-        result = (NFA.MULTISTATE_AND, child[1], state, child)
-      elif state_desc[0] == NFA.STATE_OR:
-        _, next_state0, next_state1 = state_desc
-        len_transition = len(transition)
-        transition.append((DFA.TRANSITION_DUP, child[1]))
-        child0 = advance1(
-          (NFA.MULTISTATE_AND, child[1], next_state0, child),
-          join_count,
-          done_multistates
-        )
-        child1 = advance1(
-          (NFA.MULTISTATE_AND, child[1], next_state1, child),
-          join_count,
-          done_multistates
-        )
-        if child0 is None:
-          return child1
-        if child1 is None:
-          return child0
-        result = (NFA.MULTISTATE_OR, child0[1] + child1[1], child0, child1)
-      elif state_desc[0] == NFA.STATE_AND:
-        _, next_state0, next_state1 = state_desc
-        return advance1(
-          (
-            NFA.MULTISTATE_AND,
-            child[1],
-            next_state0,
-            (NFA.MULTISTATE_AND, child[1], next_state1, child)
-          ),
-          join_count,
-          done_multistates
-        )
-      elif state_desc[0] == NFA.STATE_JOIN0:
-        return advance(child, join_count + 1, done_multistates)
-      elif state_desc[0] == NFA.STATE_JOIN1:
-        _, next_state = state_desc
-        if join_count == 0:
-          transition.append((DFA.TRANSITION_POP, child[1]))
-          return None
-        return advance1(
-          (NFA.MULTISTATE_AND, child[1], next_state, child),
-          join_count - 1,
-          done_multistates
-        )
-      elif state_desc[0] == NFA.STATE_MARK:
-        _, mark_value, next_state = state_desc
-        transition.append((DFA.TRANSITION_MARK, child[1], mark_value))
-        return advance1(
-          (NFA.MULTISTATE_AND, child[1], next_state, child),
-          join_count,
-          done_multistates
-        )
-      else:
-        assert False
-      if result in done_multistates:
-        transition[len_transition:] = [(DFA.TRANSITION_POP, multistate[1])]
-        return None
-      done_multistates.add(result)
-      return result
-
-    def advance(multistate, join_count, done_multistates):
-      nonlocal char0, char1 # modifies nonlocal: transition
-      if multistate[0] == NFA.MULTISTATE_ACCEPT:
-        assert join_count == 0
-        len_transition = len(transition)
-        transition.append((DFA.TRANSITION_MOVE, 1))
-        result = NFA.accept_multistate # takes no arguments so use static one
-      elif multistate[0] == NFA.MULTISTATE_AND:
-        if char >= 0:
-          _, _, state, child = multistate
-          state_desc = self.states[state]
-          assert state_desc[0] == NFA.STATE_CHARACTER
-          _, char_set, next_state = state_desc
-          k = bisect.bisect_right(char_set, char)
-          if k > 0 and char0 < char_set[k - 1]:
-            char0 = char_set[k - 1]
-          if k < len(char_set) and char1 > char_set[k]:
-            char1 = char_set[k]
-          if (k & 1) == 0:
-            transition.append((DFA.TRANSITION_POP, child[1]))
-            return None
-          multistate = (NFA.MULTISTATE_AND, child[1], next_state, child)
-        return advance1(multistate, join_count, done_multistates)
-      elif multistate[0] == NFA.MULTISTATE_OR:
-        _, _, child0, child1 = multistate
-        len_transition = len(transition)
-        child0 = advance(child0, join_count, done_multistates)
-        child1 = advance(child1, join_count, done_multistates)
-        if child0 is None:
-          return child1
-        if child1 is None:
-          return child0
-        result = (NFA.MULTISTATE_OR, child0[1] + child1[1], child0, child1)
-      else:
-        assert False
-      if result in done_multistates:
-        transition[len_transition:] = [(DFA.TRANSITION_POP, multistate[1])]
-        return None
-      done_multistates.add(result)
-      return result
-
-    transition = []
-    char0 = 0
-    char1 = chars
-    root_multistate = advance(root_multistate, 0, set())
-    return root_multistate, transition, char0, char1
-
-  def multistate_accept(root_multistate):
-    i = 0
-    def accept(multistate):
-      nonlocal i
-      if multistate[0] == NFA.MULTISTATE_ACCEPT:
-        return True
-      if multistate[0] == NFA.MULTISTATE_AND:
-        _, _, _, child = multistate
-        i += child[1]
-        return False
-      if multistate[0] == NFA.MULTISTATE_OR:
-        _, _, child0, child1 = multistate
-        return accept(child0) or accept(child1)
-      assert False
-    return i if accept(root_multistate) else -1
-
-  def match_text(self, text, i, start_index = 0):
-    def transit(transition):
-      nonlocal threads0, threads1, prefix_slop # note: also uses i
-      j = prefix_slop
-      for trans in transition:
-        if trans[0] == DFA.TRANSITION_POP:
-          j += trans[1]
-        elif trans[0] == DFA.TRANSITION_DUP:
-          while j < trans[1]:
-            threads0[:0] = [None] * prefix_slop
-            threads1[:0] = [None] * prefix_slop
-            j += prefix_slop
-            prefix_slop *= 2
-          threads0[j - trans[1]:j] = threads0[j:j + trans[1]]
-          j -= trans[1]
-        elif trans[0] == DFA.TRANSITION_MARK:
-          threads0[j:j + trans[1]] = [
-            (i, trans[2], thread)
-            for thread in threads0[j:j + trans[1]]
-          ]
-        elif trans[0] == DFA.TRANSITION_MOVE:
-          threads1.extend(threads0[j:j + trans[1]])
-          j += trans[1]
-        #elif trans[0] == DFA.TRANSITION_DEL:
-        #  del threads1[-trans[1]:]
-        else:
-          assert False
-      assert j == len(threads0)
-      threads0, threads1 = threads1, threads0
-      del threads1[prefix_slop:]
-
-    threads0 = [None, None]
-    threads1 = [None]
-    prefix_slop = 1
-
-    start_state = self.start_state[start_index]
-    if start_state == -1:
-      return None
-    next_multistate, transition, _, _ = self.multistate_next(
-      (NFA.MULTISTATE_AND, 1, start_state, NFA.accept_multistate),
-      -1
-    )
-    while next_multistate is not None:
-      transit(transition)
-      assert len(threads0) == prefix_slop + next_multistate[1]
-      if next_multistate == NFA.accept_multistate:
-        # there is only one match, which is complete
-        assert len(threads0) == prefix_slop + 1
-        return threads0[prefix_slop]
-      if i >= len(text):
-        # return best match we have, but not incomplete match
-        i = NFA.multistate_accept(next_multistate)
-        return (None if i == -1 else threads0[prefix_slop + i])
-      next_multistate, transition, _, _ = (
-        self.multistate_next(next_multistate, ord(text[i]))
-      )
-      i += 1
-    return None
-
-  def match_yychunk(self, root, pos, off, yychunk_iter, start_index = 0):
-    if pos < 0:
-      pos, off = element.to_start_relative(root, pos, off)
-
-    def transit(transition):
-      nonlocal threads0, threads1, prefix_slop # note: also uses pos, off
-      j = prefix_slop
-      for trans in transition:
-        if trans[0] == DFA.TRANSITION_POP:
-          j += trans[1]
-        elif trans[0] == DFA.TRANSITION_DUP:
-          while j < trans[1]:
-            threads0[:0] = [None] * prefix_slop
-            threads1[:0] = [None] * prefix_slop
-            j += prefix_slop
-            prefix_slop *= 2
-          threads0[j - trans[1]:j] = threads0[j:j + trans[1]]
-          j -= trans[1]
-        elif trans[0] == DFA.TRANSITION_MARK:
-          threads0[j:j + trans[1]] = [
-            (pos, off, trans[2], thread)
-            for thread in threads0[j:j + trans[1]]
-          ]
-        elif trans[0] == DFA.TRANSITION_MOVE:
-          threads1.extend(threads0[j:j + trans[1]])
-          j += trans[1]
-        #elif trans[0] == DFA.TRANSITION_DEL:
-        #  del threads1[-trans[1]:]
-        else:
-          assert False
-      assert j == len(threads0)
-      threads0, threads1 = threads1, threads0
-      del threads1[prefix_slop:]
-
-    threads0 = [None, None]
-    threads1 = [None]
-    prefix_slop = 1
-
-    start_state = self.start_state[start_index]
-    if start_state == -1:
-      return None
-    next_multistate, transition, _, _ = self.multistate_next(
-      (NFA.MULTISTATE_AND, 1, start_state, NFA.accept_multistate),
-      -1
-    )
-    text = element.get_text(root, pos)
-    while next_multistate is not None:
-      transit(transition)
-      assert len(threads0) == prefix_slop + next_multistate[1]
-      if next_multistate == NFA.accept_multistate:
-        # there is only one match, which is complete
-        assert len(threads0) == prefix_slop + 1
-        return threads0[prefix_slop]
-      while off >= len(text):
-        if pos < len(root):
-          pos += 1
-          off = 0
-        else:
-          try:
-            next(yychunk_iter)
-          except StopIteration:
-            # return best match we have, but not incomplete match
-            i = NFA.multistate_accept(next_multistate)
-            return (None if i == -1 else threads0[prefix_slop + i])
-          text = element.get_text(root, pos)
-      next_multistate, transition, _, _ = (
-        self.multistate_next(next_multistate, ord(text[off]))
-      )
-      off += 1
-    return None
-
-  def to_dfa(self):
-    dfa = DFA(list(self.groups))
-
-    accept_key = (NFA.accept_multistate, ())
-    action_to_meaning = [accept_key]
-    meaning_to_action = {accept_key: 0}
-    state_to_meaning = [NFA.accept_multistate]
-    meaning_to_state = {NFA.accept_multistate: 0}
-
-    for start_state in self.start_state:
-      if start_state == -1:
-        start_action = -1
-      else:
-        next_multistate, transition, _, _ = self.multistate_next(
-          (NFA.MULTISTATE_AND, 1, start_state, NFA.accept_multistate),
-          -1
-        )
-        if next_multistate is None:
-          start_action = -1
-        else:
-          start_key = (next_multistate, tuple(transition))
-          start_action = len(action_to_meaning)
-          meaning_to_action[start_key] = start_action
-          action_to_meaning.append(start_key)
-      dfa.start_action.append(start_action)
-
-    while len(dfa.actions) < len(action_to_meaning):
-      next_multistate, transition = action_to_meaning[len(dfa.actions)]
-      if next_multistate in meaning_to_state:
-        next_state = meaning_to_state[next_multistate]
-      else:
-        next_state = len(state_to_meaning)
-        state_to_meaning.append(next_multistate)
-        meaning_to_state[next_multistate] = next_state
-      dfa.actions.append((next_state, list(transition)))
-
-      while len(dfa.states) < len(state_to_meaning):
-        char = 0
-        multistate = state_to_meaning[len(dfa.states)]
-        state_desc = ([], [], NFA.multistate_accept(multistate))
-        while char < chars:
-          next_multistate, transition, char0, char1 = self.multistate_next(
-            multistate,
-            char
-          )
-          assert char0 == char and char1 > char
-          if next_multistate is None:
-            action = -1
-          else:
-            # optimize transition (optional)
-            i = 0
-            j = 0
-            while i < len(transition):
-              if transition[i][0] == DFA.TRANSITION_POP:
-                n = transition[i][1]
-                i += 1
-                while (
-                  i < len(transition) and
-                  transition[i][0] == DFA.TRANSITION_POP
-                ):
-                  n += transition[i][1]
-                  i += 1
-                transition[j] = (DFA.TRANSITION_POP, n)
-              elif transition[i][0] == DFA.TRANSITION_MOVE:
-                n = transition[i][1]
-                i += 1
-                while (
-                  i < len(transition) and
-                  transition[i][0] == DFA.TRANSITION_MOVE
-                ):
-                  n += transition[i][1]
-                  i += 1
-                transition[j] = (DFA.TRANSITION_MOVE, n)
-              else:
-                transition[j] = transition[i]
-                i += 1
-              j += 1
-            del transition[j:]
-            # end optimize transition
-            key = (next_multistate, tuple(transition))
-            if key in meaning_to_action:
-              action = meaning_to_action[key]
-            else:
-              action = len(action_to_meaning)
-              action_to_meaning.append(key)
-              meaning_to_action[key] = action
-          state_desc[0].append(char1)
-          state_desc[1].append(action)
-          char = char1
-        dfa.states.append(state_desc)
-    return dfa
-
-  def __repr__(self):
-    return 'regex.NFA({0:s}, {1:s}, {2:s})'.format(
-      repr(self.groups),
-      repr(self.states),
-      repr(self.start_state)
-    )
-
-class DFA:
-  # transition classes:
-  # instructions to transform thread list from a multistate to next multistate
-  # (TRANSITION_POP, n)                        j += n
-  # (TRANSITION_DUP, n)                        threads0[j - n:j] = threads0[j:j + n]
-  #                                    j -= n
-  # (TRANSITION_MARK, n, mark_value)   threads0[j:j + n] = [
-  #                                      (i, mark, thread)
-  #                                      for thread in threads0[j:j + n]
-  #                                    ]
-  # (TRANSITION_MOVE, n)               threads1.extend(threads0[j:j + n])
-  #                                    j += n
-  # (TRANSITION_DEL, n)                        del threads1[-n:]
-
-  TRANSITION_POP = 0
-  TRANSITION_DUP = 1
-  TRANSITION_MARK = 2
-  TRANSITION_MOVE = 3
-  #TRANSITION_DEL = 4
-  def __init__(
-    self,
-    groups = [],
-    states = [([chars], [0], 0)],
-    actions = [(0, [])],
-    start_action = [] # can have multiple DFAs in same container
-  ):
-    # groups: list of group_desc
-    # group_desc: (tag, kwargs)
-    #   tag, kwargs will be passed to apply_markup() hence factory()
-    # states: list of state_desc
-    # state_desc: (list of breaks, list of action to do, accept_thread)
-    # actions: list of action_desc
-    # action_desc: (state to go to next, compiled transition to do first)
-    # accept_thread: which thread of thread list to use, -1 don't accept
-    self.groups = groups
-    self.states = states
-    self.actions = actions
-    self.start_action = start_action
-
-  def match_text(self, text, i, start_index = 0):
-    def transit(transition):
-      nonlocal threads0, threads1, prefix_slop # note: also uses i
-      j = prefix_slop
-      for trans in transition:
-        if trans[0] == DFA.TRANSITION_POP:
-          j += trans[1]
-        elif trans[0] == DFA.TRANSITION_DUP:
-          while j < trans[1]:
-            threads0[:0] = [None] * prefix_slop
-            threads1[:0] = [None] * prefix_slop
-            j += prefix_slop
-            prefix_slop *= 2
-          threads0[j - trans[1]:j] = threads0[j:j + trans[1]]
-          j -= trans[1]
-        elif trans[0] == DFA.TRANSITION_MARK:
-          threads0[j:j + trans[1]] = [
-            (i, trans[2], thread)
-            for thread in threads0[j:j + trans[1]]
-          ]
-        elif trans[0] == DFA.TRANSITION_MOVE:
-          threads1.extend(threads0[j:j + trans[1]])
-          j += trans[1]
-        #elif trans[0] == DFA.TRANSITION_DEL:
-        #  del threads1[-trans[1]:]
-        else:
-          assert False
-      assert j == len(threads0)
-      threads0, threads1 = threads1, threads0
-      del threads1[prefix_slop:]
-
-    threads0 = [None, None]
-    threads1 = [None]
-    prefix_slop = 1
-
-    action = self.start_action[start_index]
-    while action != -1:
-      state, transition = self.actions[action]
-      #print('i', i, 'action', action, 'state', state, 'transition', transition)
-      transit(transition)
-      if state == 0:
-        # there is only one match, which is complete
-        assert len(threads0) == prefix_slop + 1
-        return threads0[prefix_slop]
-      if i >= len(text):
-        # return best match we have, but not incomplete match
-        i = self.states[state][2]
-        return (None if i == -1 else threads0[prefix_slop + i])
-      action = self.states[state][1][
-        bisect.bisect_right(self.states[state][0], ord(text[i]))
-      ]
-      i += 1
-    return None
-
-  def match_yychunk(self, root, pos, off, yychunk_iter, start_index = 0):
-    if pos < 0:
-      pos, off = element.to_start_relative(root, pos, off)
-
-    def transit(transition):
-      nonlocal threads0, threads1, prefix_slop # note: also uses pos, off
-      j = prefix_slop
-      for trans in transition:
-        if trans[0] == DFA.TRANSITION_POP:
-          j += trans[1]
-        elif trans[0] == DFA.TRANSITION_DUP:
-          while j < trans[1]:
-            threads0[:0] = [None] * prefix_slop
-            threads1[:0] = [None] * prefix_slop
-            j += prefix_slop
-            prefix_slop *= 2
-          threads0[j - trans[1]:j] = threads0[j:j + trans[1]]
-          j -= trans[1]
-        elif trans[0] == DFA.TRANSITION_MARK:
-          threads0[j:j + trans[1]] = [
-            (pos, off, trans[2], thread)
-            for thread in threads0[j:j + trans[1]]
-          ]
-        elif trans[0] == DFA.TRANSITION_MOVE:
-          threads1.extend(threads0[j:j + trans[1]])
-          j += trans[1]
-        #elif trans[0] == DFA.TRANSITION_DEL:
-        #  del threads1[-trans[1]:]
-        else:
-          assert False
-      assert j == len(threads0)
-      threads0, threads1 = threads1, threads0
-      del threads1[prefix_slop:]
-
-    threads0 = [None, None]
-    threads1 = [None]
-    prefix_slop = 1
-
-    action = self.start_action[start_index]
-    text = element.get_text(root, pos)
-    while action != -1:
-      state, transition = self.actions[action]
-      transit(transition)
-      if state == 0:
-        # there is only one match, which is complete
-        assert len(threads0) == prefix_slop + 1
-        return threads0[prefix_slop]
-      while off >= len(text):
-        if pos < len(root):
-          pos += 1
-          off = 0
-        else:
-          try:
-            next(yychunk_iter)
-          except StopIteration:
-            # return best match we have, but not incomplete match
-            i = self.states[state][2]
-            return (None if i == -1 else threads0[prefix_slop + i])
-          text = element.get_text(root, pos)
-      #print(
-      #  'state {0:d} pos {1:d} off {2:d} text "{3:s}"'.format(
-      #    state,
-      #    pos,
-      #    off,
-      #    text.replace('\n', '$')
-      #  )
-      #)
-      action = self.states[state][1][
-        bisect.bisect_right(self.states[state][0], ord(text[off]))
-      ]
-      off += 1
-    return None
-
-  def yylex(self, root, pos, off, factory, yychunk_iter):
-    if pos < 0:
-      pos, off = element.to_start_relative(root, pos, off)
-
-    while True:
-      # note: pointers must be kept start relative during the below call,
-      # because it extends the following text by calling the yychunk_iter
-      thread = self.match_yychunk(root, pos, off, yychunk_iter)
-      if thread is None:
-        break
-      stack = []
-      while True:
-        pos, off, mark_value, thread = thread
-        group_index = mark_value >> 1
-        if (mark_value & 1) != 0:
-          end_pos, end_off = element.to_end_relative(root, pos, off)
-          stack.append((end_pos, end_off, group_index))
-        else:
-          end_pos, end_off, temp = stack.pop()
-          assert temp == group_index
-          if len(stack) == 0:
-            break
-          tag, kwargs = self.groups[group_index]
-          if tag != '':
-            work.apply_markup(
-              root,
-              pos,
-              off,
-              end_pos,
-              end_off,
-              factory,
-              tag,
-              **kwargs
-            )
-      # note: pointers must be kept end relative during the below call,
-      # because it modifies the preceding text by calling apply_markup()
-      yield end_pos, end_off, group_index
-      pos, off = element.to_start_relative(root, end_pos, end_off)
-
-  def __repr__(self):
-    return 'regex.DFA({0:s}, {1:s}, {2:s}, {3:s})'.format(
-      repr(self.groups),
-      repr(self.states),
-      repr(self.actions),
-      repr(self.start_action)
-    )
-
-class LR1:
-  def __init__(self, productions = [], terminal_thres = chars):
-    # productions: list of production
-    # production: (
-    #   priority,
-    #   symbols,
-    #   lookaheads,
-    #   group_bounds
-    # )
-    # priority: bit 0 = right to left, bits 1: = numeric priority
-    # symbols: list of symbol_desc
-    # symbol_desc: (terminal_set, nonterminal_set)
-    # terminal_set: similar to char_set, even length list of pairs of breaks
-    # nonterminal_set: as above but has terminal_thres subtracted from breaks
-    # lookaheads: list of lookahead_desc, len(lookaheads) = len(symbols) + 1
-    # lookahead_desc: (initial_set, can_be_empty)
-    # initial_set: what terminals can occur at this position in symbols array,
-    # computed such that lookaheads[i][0] is symbols[i][0], plus initial set of
-    # each nonterminal from symbols[i][1], plus lookaheads[i + 1][0] if any
-    # nonterminal of symbols[i][1] can be empty (may pull in i + 2, i + 3 etc)
-    # can_be_empty: whether all symbols from this position to end can be empty
-    # note: last lookahead_desc is a sentinel consisting of ([], True), so that
-    # initial_set is empty (will be augmented by context) and can_be_empty is
-    # True (because all symbols from the end to the end can obviously be empty)
-    # group_bounds: list of group_bound
-    # group_bound: (start_index, end_index, tag, kwargs)
-    #   where start_index, end_index are indices into list of char_set,
-    #   and tag, kwargs will be passed to apply_markup() hence factory(),
-    #   noting that markup has to be applied in reverse order of the list
-    # terminal_thres: offset to apply to productions[] index to get symbol
-    #   (char set code), also symbol for productions[0] = start production
-    self.productions = productions
-    self.terminal_thres = terminal_thres
-
-  def lookahead_item_set_closure(self, items, item_to_index):
-    in_queue = [True for i in range(len(items))]
-    queue = list(range(len(items)))
-
-    qhead = 0
-    while qhead < len(queue):
-      i = queue[qhead]
-      in_queue[i] = False
-      j, k, lookahead_set = items[i]
-      _, symbols, lookaheads, _ = self.productions[j]
-      if k < len(symbols):
-        _, nonterminal_set = symbols[k]
-        if len(nonterminal_set):
-          next_lookahead_set, next_can_be_empty = lookaheads[k + 1]
-          if next_can_be_empty:
-            next_lookahead_set = char_set_or(
-              next_lookahead_set,
-              lookahead_set
-            )
-          for l in range(0, len(nonterminal_set), 2):
-            for m in range(nonterminal_set[l], nonterminal_set[l + 1]):
-              key = (m, 0)
-              if key in item_to_index:
-                n = item_to_index[key]
-                child_lookahead_set = char_set_or(
-                  items[n][2],
-                  next_lookahead_set
-                )
-                if child_lookahead_set != items[n][2]:
-                  items[n] = (m, 0, child_lookahead_set)
-                  if not in_queue[n]:
-                    # optimization: do not re-queue unless it is transparent
-                    # to changes in the lookahead set wrt. further propagation
-                    _, child_symbols, child_lookaheads, _ = self.productions[m]
-                    if len(child_symbols) and child_lookaheads[1][1]:
-                      in_queue[n] = True
-                      queue.append(n)
-              else:
-                n = len(items)
-                items.append((m, 0, next_lookahead_set))
-                item_to_index[key] = n
-                in_queue.append(True)
-                queue.append(n)
-      qhead += 1 
-
-  def lookahead_item_set_shift(self, items, terminal):
-    next_items = []
-    next_item_to_index = {}
-    reductions = set()
-    terminal0 = 0
-    terminal1 = self.terminal_thres
-    for i, j, lookahead_set in items:
-      _, symbols, _, _ = self.productions[i]
-      if j < len(symbols):
-        terminal_set, _ = symbols[j]
-        k = bisect.bisect_right(terminal_set, terminal)
-        if k > 0 and terminal0 < terminal_set[k - 1]:
-          terminal0 = terminal_set[k - 1]
-        if k < len(terminal_set) and terminal1 > terminal_set[k]:
-          terminal1 = terminal_set[k]
-        if (k & 1) == 1:
-          next_item_to_index[(i, j + 1)] = len(next_items)
-          next_items.append((i, j + 1, lookahead_set))
-      else:
-        k = bisect.bisect_right(lookahead_set, terminal)
-        if k > 0 and terminal0 < lookahead_set[k - 1]:
-          terminal0 = lookahead_set[k - 1]
-        if k < len(lookahead_set) and terminal1 > lookahead_set[k]:
-          terminal1 = lookahead_set[k]
-        if (k & 1) == 1:
-          reductions.add(i)
-    return next_items, next_item_to_index, reductions, terminal0, terminal1
-
-  def lookahead_item_set_goto(self, items, nonterminal):
-    next_items = []
-    next_item_to_index = {}
-    reductions = set()
-    nonterminal0 = 0
-    nonterminal1 = len(self.productions)
-    for i, j, lookahead_set in items:
-      _, symbols, _, _ = self.productions[i]
-      if j < len(symbols):
-        _, nonterminal_set = symbols[j]
-        k = bisect.bisect_right(nonterminal_set, nonterminal)
-        if k > 0 and nonterminal0 < nonterminal_set[k - 1]:
-          nonterminal0 = nonterminal_set[k - 1]
-        if k < len(nonterminal_set) and nonterminal1 > nonterminal_set[k]:
-          nonterminal1 = nonterminal_set[k]
-        if (k & 1) == 1:
-          next_item_to_index[(i, j + 1)] = len(next_items)
-          next_items.append((i, j + 1, lookahead_set))
-    return next_items, next_item_to_index, nonterminal0, nonterminal1
-
-  def parse_text(self, text, i):
-    items = [(0, 0, [chars, chars + 1])] # EOF
-    item_to_index = {(0, 0): 0}
-    value_stack = []
-    state_stack = []
-    lookahead_char = ord(text[i]) if i < len(text) else chars # EOF
-    while True:
-      self.lookahead_item_set_closure(items, item_to_index)
-      value_stack.append(i)
-      state_stack.append(items)
-      items, item_to_index, reductions, _, _ = (
-        self.lookahead_item_set_shift(items, lookahead_char)
-      )
-      if len(items) != 0:
-        if len(reductions) != 0:
-          sys.stderr.write(
-            'shift/reduce conflict: {0:s} vs {1:s}\n'.format(
-              ','.join([str(i) for i, _, _ in next_items]),
-              ','.join([str(i) for i in reductions])
-            )
-          )
-        i += 1
-        lookahead_char = ord(text[i]) if i < len(text) else chars # EOF
-      elif len(reductions) != 0:
-        if len(reductions) != 1:
-          sys.stderr.write(
-            'reduce/reduce conflict: {0:s}\n'.format(
-              ','.join([str(i) for i in reductions])
-            )
-          )
-        reduce = min(reductions)
-        _, symbols, _, group_bounds = self.productions[reduce]
-        base = len(value_stack) - len(symbols) - 1
-        for j in range(len(group_bounds) - 1, -1, -1):
-          k, l, tag, _ = group_bounds[j]
-          k += base
-          if l != 1:
-            value_stack[k + 1:k + l + 1] = [value_stack[k + l]]
-          sys.stderr.write(
-            'text \'{0:s}\' tag \'{1:s}\'\n'.format(
-              text[value_stack[k]:value_stack[k + 1]],
-              tag
-            )
-          )
-        del value_stack[base + 1:]
-        del state_stack[base + 1:]
-        if reduce == 0:
-          assert base == 0
-          return
-        items, item_to_index, _, _ = (
-          self.lookahead_item_set_goto(state_stack[-1], reduce)
-        )
-        assert len(items) != 0
-      else:
-        raise Exception(
-          'syntax error at {0:d}: {1:s}'.format(i, text[i:])
-        )
-
-  def parse_yychunk(self, root, pos, off, factory, yychunk_iter):
-    if pos < 0:
-      pos, off = element.to_start_relative(root, pos, off)
-
-    items = [(0, 0, [chars, chars + 1])] # EOF
-    item_to_index = {(0, 0): 0}
-    value_stack = []
-    state_stack = []
-    text = element.get_text(root, pos)
-    while off >= len(text):
-      if pos < len(root):
-        pos += 1
-        off = 0
-      else:
-        try:
-          next(yychunk_iter)
-        except StopIteration:
-          lookahead_char = chars # EOF
-          break
-        text = element.get_text(root, pos)
-    else: 
-      lookahead_char = ord(text[off])
-    while True:
-      self.lookahead_item_set_closure(items, item_to_index)
-      value_stack.append((pos, off))
-      state_stack.append(items)
-      items, item_to_index, reductions, _, _ = (
-        self.lookahead_item_set_shift(items, lookahead_char)
-      )
-      if len(items) != 0:
-        if len(reductions) != 0:
-          sys.stderr.write(
-            'shift/reduce conflict: {0:s} vs {1:s}\n'.format(
-              ','.join([str(i) for i, _ in next_lookahead_item_set.keys()]),
-              ','.join([str(i) for i in reductions])
-            )
-          )
-        off += 1
-        while off >= len(text):
-          if pos < len(root):
-            pos += 1
-            off = 0
-          else:
-            try:
-              next(yychunk_iter)
-            except StopIteration:
-              lookahead_char = chars # EOF
-              break
-            text = element.get_text(root, pos)
-        else: 
-          lookahead_char = ord(text[off])
-      elif len(reductions) != 0:
-        if len(reductions) != 1:
-          sys.stderr.write(
-            'reduce/reduce conflict: {0:s}\n'.format(
-              ','.join([str(i) for i in reductions])
-            )
-          )
-        reduce = min(reductions)
-        _, symbols, _, group_bounds = self.productions[reduce]
-        base = len(value_stack) - len(symbols) - 1
-        end_relative = len(value_stack)
-        for j in range(len(group_bounds) - 1, -1, -1):
-          k, l, tag, kwargs = group_bounds[j]
-          k += base
-          assert k < end_relative
-          if l != 1:
-            value_stack[k + 1:k + l + 1] = [value_stack[k + l]]
-            end_relative = max(k + 1, end_relative + 1 - l)
-          while end_relative > k + 1:
-            end_relative -= 1
-            pos1, off1 = value_stack[end_relative]
-            value_stack[end_relative] = (
-              element.to_end_relative(root, pos1, off1)
-            )
-          pos0, off0 = value_stack[k]
-          pos1, off1 = value_stack[k + 1]
-          work.apply_markup(
-            root,
-            pos0,
-            off0,
-            pos1,
-            off1,
-            factory,
-            tag,
-            **kwargs
-          )
-        if end_relative < len(value_stack):
-          pos, off = value_stack[-1]
-          pos, off = element.to_start_relative(root, pos, off)
-          text = element.get_text(root, pos)
-        del value_stack[base + 1:]
-        del state_stack[base + 1:]
-        if reduce == 0:
-          assert base == 0
-          return
-        items, item_to_index, _, _ = (
-          self.lookahead_item_set_goto(state_stack[-1], reduce)
-        )
-        assert len(items) != 0
-      else:
-        raise Exception(
-          'syntax error at {0:d},{1:d}: {2:s}'.format(pos, off, text[off:])
-        )
-
-  def to_clr1(self):
-    lr1dfa = LR1DFA(
-      [],
-      [
-        (len(symbols), group_bounds)
-        for _, symbols, _, group_bounds in self.productions
-      ],
-      self.terminal_thres
-    )
-
-    items = [(0, 0, [chars, chars + 1])] # EOF
-    item_to_index = {(0, 0): 0}
-    self.lookahead_item_set_closure(items, item_to_index)
-
-    items = sorted(items)
-    key = tuple((i, j, tuple(k)) for i, j, k in items)
-    state_to_items = [items]
-    items_to_state = {key: 0}
-
-    while len(lr1dfa.states) < len(state_to_items):
-      items = state_to_items[len(lr1dfa.states)]
-      state_desc = ([], [], [], [])
-
-      def add_state(next_items, next_item_to_index):
-        self.lookahead_item_set_closure(next_items, next_item_to_index)
-        new_items = sorted(next_items)
-        key = tuple((i, j, tuple(k)) for i, j, k in new_items)
-        if key in items_to_state:
-          state = items_to_state[key]
-        else:
-          state = len(state_to_items)
-          state_to_items.append(new_items)
-          items_to_state[key] = state
-        return state
-
-      terminal = 0
-      while terminal < self.terminal_thres:
-        next_items, next_item_to_index, reductions, terminal0, terminal1 = (
-          self.lookahead_item_set_shift(items, terminal)
-        )
-        assert terminal0 == terminal and terminal1 > terminal
-        if len(next_items) != 0:
-          if len(reductions) != 0:
-            sys.stderr.write(
-              'state {0:d} shift/reduce conflict: {1:s} vs {2:s}\n'.format(
-                len(lr1dfa.states),
-                ','.join([str(i) for i, _, _ in next_items]),
-                ','.join([str(i) for i in reductions])
-              )
-            )
-          action = add_state(next_items, next_item_to_index) * 2
-        elif len(reductions) != 0:
-          if len(reductions) != 1:
-            sys.stderr.write(
-              'state {0:d} reduce/reduce conflict: {1:s}\n'.format(
-                len(lr1dfa.states),
-                ','.join([str(i) for i in reductions])
-              )
-            )
-          action = min(reductions) * 2 + 1 # odd means reduce
-        else:
-          action = -1
-        state_desc[0].append(terminal1)
-        state_desc[1].append(action)
-        terminal = terminal1
-
-      nonterminal = 0
-      while nonterminal < len(self.productions):
-        next_items, next_item_to_index, nonterminal0, nonterminal1 = (
-          self.lookahead_item_set_goto(items, nonterminal)
-        )
-        assert nonterminal0 == nonterminal and nonterminal1 > nonterminal
-        if len(next_items) != 0:
-          goto = add_state(next_items, next_item_to_index)
-        else:
-          goto = -1
-        state_desc[2].append(nonterminal1)
-        state_desc[3].append(goto)
-        nonterminal = nonterminal1
-
-      lr1dfa.states.append(state_desc)
-    return lr1dfa
-
-  def to_lalr1(self):
-    lr1dfa = LR1DFA(
-      [],
-      [
-        (len(symbols), group_bounds)
-        for _, symbols, _, group_bounds in self.productions
-      ],
-      self.terminal_thres
-    )
-
-    items = [(0, 0, [chars, chars + 1])] # EOF
-    item_to_index = {(0, 0): 0}
-    self.lookahead_item_set_closure(items, item_to_index)
-
-    items = sorted(items)
-    key = tuple((i, j) for i, j, _ in items) # ignore lookahead
-    state_to_items = [items]
-    items_to_state = {key: 0}
-
-    in_queue = [True]
-    queue = [0]
-
-    qhead = 0
-    while qhead < len(queue):
-      i = queue[qhead]
-      in_queue[i] = False
-      items = state_to_items[i]
-      state_desc = ([], [], [], [])
-
-      def add_state(next_items, next_item_to_index):
-        self.lookahead_item_set_closure(next_items, next_item_to_index)
-        new_items = sorted(next_items)
-        key = tuple((i, j) for i, j, _ in new_items) # ignore lookahead
-        if key in items_to_state:
-          state = items_to_state[key]
-          state_items = state_to_items[state]
-          for i in range(len(new_items)):
-            j, k, lookahead_set = new_items[i]
-            lookahead_set = char_set_or(lookahead_set, state_items[i][2])
-            if lookahead_set != state_items[i][2]:
-              state_items[i] = (j, k, lookahead_set)
-              if not in_queue[state]:
-                in_queue[state] = True
-                queue.append(state)
-        else:
-          state = len(state_to_items)
-          state_to_items.append(new_items)
-          items_to_state[key] = state
-          in_queue.append(True)
-          queue.append(state)
-        return state
-
-      terminal = 0
-      while terminal < self.terminal_thres:
-        next_items, next_item_to_index, reductions, terminal0, terminal1 = (
-          self.lookahead_item_set_shift(items, terminal)
-        )
-        assert terminal0 == terminal and terminal1 > terminal
-        if len(next_items) != 0:
-          if len(reductions) != 0:
-            sys.stderr.write(
-              'state {0:d} shift/reduce conflict: {1:s} vs {2:s}\n'.format(
-                i,
-                ','.join([str(j) for j, _, _ in next_items]),
-                ','.join([str(j) for j in reductions])
-              )
-            )
-          action = add_state(next_items, next_item_to_index) * 2
-        elif len(reductions) != 0:
-          if len(reductions) != 1:
-            sys.stderr.write(
-              'state {0:d} reduce/reduce conflict: {1:s}\n'.format(
-                i,
-                ','.join([str(j) for j in reductions])
-              )
-            )
-          action = min(reductions) * 2 + 1 # odd means reduce
-        else:
-          action = -1
-        state_desc[0].append(terminal1)
-        state_desc[1].append(action)
-        terminal = terminal1
-
-      nonterminal = 0
-      while nonterminal < len(self.productions):
-        next_items, next_item_to_index, nonterminal0, nonterminal1 = (
-          self.lookahead_item_set_goto(items, nonterminal)
-        )
-        assert nonterminal0 == nonterminal and nonterminal1 > nonterminal
-        if len(next_items) != 0:
-          goto = add_state(next_items, next_item_to_index)
-        else:
-          goto = -1
-        state_desc[2].append(nonterminal1)
-        state_desc[3].append(goto)
-        nonterminal = nonterminal1
-
-      if i < len(lr1dfa.states):
-        lr1dfa.states[i] = state_desc
-      else:
-        lr1dfa.states.append(state_desc)
-      qhead += 1
-    return lr1dfa
-
-  def __repr__(self):
-    return 'regex.LR1({0:s}, {1:d})'.format(
-      repr(self.productions),
-      self.terminal_thres
-    )
-
-class LR1DFA:
-  def __init__(self, states = [], productions = [], terminal_thres = chars):
-    # states: list of state_desc
-    # state_desc: (terminal breaks, actions, nonterminal breaks, gotos)
-    # action: shift = new state * 2, reduce = production * 2 + 1, error = -1
-    # goto: reduce = production, error = -1 (but error can't really happen)
-    # productions: list of production
-    # production: (len(symbols), group_bounds)
-    # len(symbols): how many states to pop stack to reduce this production
-    # group_bounds: list of group_bound
-    # group_bound: (start_index, end_index, tag, kwargs)
-    #   where start_index, end_index are indices into list of char_set,
-    #   and tag, kwargs will be passed to apply_markup() hence factory(),
-    #   noting that markup has to be applied in reverse order of the list
-    # terminal_thres: offset to apply to productions[] index to get symbol
-    #   (char set code), also symbol for productions[0] = start production
-    self.states = states
-    self.productions = productions
-    self.terminal_thres = terminal_thres
-
-  def parse_text(self, text, i):
-    state = 0
-    value_stack = []
-    state_stack = []
-    lookahead_char = ord(text[i]) if i < len(text) else chars # EOF
-    while True:
-      value_stack.append(i)
-      state_stack.append(state)
-      action = self.states[state][1][
-        bisect.bisect_right(self.states[state][0], lookahead_char)
-      ]
-      if action == -1:
-        raise Exception(
-          'syntax error at {0:d}: {1:s}'.format(i, text[i:])
-        )
-      if (action & 1) == 0:
-        state = action >> 1
-        i += 1
-        lookahead_char = ord(text[i]) if i < len(text) else chars # EOF
-      else:
-        reduce = action >> 1
-        len_symbols, group_bounds = self.productions[reduce]
-        base = len(value_stack) - len_symbols - 1
-        for j in range(len(group_bounds) - 1, -1, -1):
-          k, l, tag, _ = group_bounds[j]
-          k += base
-          if l != 1:
-            value_stack[k + 1:k + l + 1] = [value_stack[k + l]]
-          sys.stdout.write(
-            'text \'{0:s}\' tag \'{1:s}\'\n'.format(
-              text[value_stack[k]:value_stack[k + 1]],
-              tag
-            )
-          )
-        del value_stack[base + 1:]
-        del state_stack[base + 1:]
-        if reduce == 0:
-          assert base == 0
-          return
-        state = self.states[state_stack[-1]][3][
-          bisect.bisect_right(self.states[state_stack[-1]][2], reduce)
-        ]
-        assert state != -1
-
-  def parse_yychunk(self, root, pos, off, factory, yychunk_iter):
-    if pos < 0:
-      pos, off = element.to_start_relative(root, pos, off)
-
-    state = 0
-    value_stack = []
-    state_stack = []
-    text = element.get_text(root, pos)
-    while off >= len(text):
-      if pos < len(root):
-        pos += 1
-        off = 0
-      else:
-        try:
-          next(yychunk_iter)
-        except StopIteration:
-          lookahead_char = chars # EOF
-          break
-        text = element.get_text(root, pos)
-    else: 
-      lookahead_char = ord(text[off])
-    while True:
-      value_stack.append((pos, off))
-      state_stack.append(state)
-      action = self.states[state][1][
-        bisect.bisect_right(self.states[state][0], lookahead_char)
-      ]
-      #print('lookahead_char', lookahead_char, 'action', action)
-      if action == -1:
-        raise Exception(
-          'syntax error at {0:d},{1:d}: {2:s}'.format(pos, off, text[off:])
-        )
-      if (action & 1) == 0:
-        state = action >> 1
-        off += 1
-        while off >= len(text):
-          if pos < len(root):
-            pos += 1
-            off = 0
-          else:
-            try:
-              next(yychunk_iter)
-            except StopIteration:
-              lookahead_char = chars # EOF
-              break
-            text = element.get_text(root, pos)
-        else: 
-          lookahead_char = ord(text[off])
-      else:
-        reduce = action >> 1
-        len_symbols, group_bounds = self.productions[reduce]
-        base = len(value_stack) - len_symbols - 1
-        end_relative = len(value_stack)
-        for j in range(len(group_bounds) - 1, -1, -1):
-          k, l, tag, kwargs = group_bounds[j]
-          k += base
-          assert k < end_relative
-          if l != 1:
-            value_stack[k + 1:k + l + 1] = [value_stack[k + l]]
-            end_relative = max(k + 1, end_relative + 1 - l)
-          while end_relative > k + 1:
-            end_relative -= 1
-            pos1, off1 = value_stack[end_relative]
-            value_stack[end_relative] = (
-              element.to_end_relative(root, pos1, off1)
-            )
-          pos0, off0 = value_stack[k]
-          pos1, off1 = value_stack[k + 1]
-          work.apply_markup(
-            root,
-            pos0,
-            off0,
-            pos1,
-            off1,
-            factory,
-            tag,
-            **kwargs
-          )
-        if end_relative < len(value_stack):
-          pos, off = value_stack[-1]
-          pos, off = element.to_start_relative(root, pos, off)
-          text = element.get_text(root, pos)
-        del value_stack[base + 1:]
-        del state_stack[base + 1:]
-        if reduce == 0:
-          assert base == 0
-          return
-        state = self.states[state_stack[-1]][3][
-          bisect.bisect_right(self.states[state_stack[-1]][2], reduce)
-        ]
-        assert state != -1
-
-  def yyparse(self, root, pos, off, factory, yylex_iter):
-    if pos < 0:
-      pos, off = element.to_start_relative(root, pos, off)
-
-    state = 0
-    value_stack = []
-    state_stack = []
-    try:
-      end_pos, end_off, lookahead_char = next(yylex_iter)
-    except StopIteration:
-      lookahead_char = chars # EOF
-      end_pos, end_off = element.to_end_relative(root, pos, off)
-    while True:
-      value_stack.append((pos, off))
-      state_stack.append(state)
-      action = self.states[state][1][
-        bisect.bisect_right(self.states[state][0], lookahead_char)
-      ]
-      #print('lookahead_char', lookahead_char, 'action', action)
-      if action == -1:
-        raise Exception(
-          'syntax error at {0:d},{1:d}: {2:d}'.format(pos, off, lookahead_char)
-        )
-      if (action & 1) == 0:
-        state = action >> 1
-        pos, off = element.to_start_relative(root, end_pos, end_off)
-        try:
-          end_pos, end_off, lookahead_char = next(yylex_iter)
-        except StopIteration:
-          lookahead_char = chars # EOF
-          #end_pos, end_off = element.to_end_relative(root, pos, off)
-      else:
-        reduce = action >> 1
-        len_symbols, group_bounds = self.productions[reduce]
-        base = len(value_stack) - len_symbols - 1
-        end_relative = len(value_stack)
-        for j in range(len(group_bounds) - 1, -1, -1):
-          k, l, tag, kwargs = group_bounds[j]
-          k += base
-          assert k < end_relative
-          if l != 1:
-            value_stack[k + 1:k + l + 1] = [value_stack[k + l]]
-            end_relative = max(k + 1, end_relative + 1 - l)
-          while end_relative > k + 1:
-            end_relative -= 1
-            pos1, off1 = value_stack[end_relative]
-            value_stack[end_relative] = (
-              element.to_end_relative(root, pos1, off1)
-            )
-          pos0, off0 = value_stack[k]
-          pos1, off1 = value_stack[k + 1]
-          work.apply_markup(
-            root,
-            pos0,
-            off0,
-            pos1,
-            off1,
-            factory,
-            tag,
-            **kwargs
-          )
-        if end_relative < len(value_stack):
-          pos, off = value_stack[-1]
-          pos, off = element.to_start_relative(root, pos, off)
-        del value_stack[base + 1:]
-        del state_stack[base + 1:]
-        if reduce == 0:
-          assert base == 0
-          return
-        state = self.states[state_stack[-1]][3][
-          bisect.bisect_right(self.states[state_stack[-1]][2], reduce)
-        ]
-        assert state != -1
-
-  def __repr__(self):
-    return 'regex.LR1DFA({0:s}, {1:s}, {2:d})'.format(
-      repr(self.states),
-      repr(self.productions),
-      self.terminal_thres
-    )
-
-def wrap_repr(text, width):
-  lines = []
-  i = 0
-  while i < len(text):
-    j = i + width
-    if j < len(text):
-      j = max(
-        [
-          text.rfind('(', i, j) + 1,
-          text.rfind('[', i, j) + 1,
-          text.rfind('{', i, j) + 1,
-          text.rfind('.', i, j) + 1,
-          text.rfind(')', i, j + 1),
-          text.rfind(']', i, j + 1),
-          text.rfind('}', i, j + 1),
-          text.rfind(' ', i, j + 1),
-        ]
-      )
-      assert j > 0
-    lines.append(text[i:j] + '\n')
-    i = j
-    while text[i:i + 1] == ' ':
-      i += 1
-  return ''.join(lines) 
-
 if __name__ == '__main__':
+  import sys
   import xml.etree.ElementTree
 
   regex = RegexAnd(children = [RegexRepeat(children = [RegexCharacterNot(
-children = [RegexCharacter()], char_set = [0, 256])]), RegexGroup(children = [
+children = [RegexCharacter()], character_set = [0, 256])]), RegexGroup(children = [
 RegexOr(children = [RegexOr(children = [RegexOr(children = [RegexGroup(children
-= [RegexRepeat(children = [RegexCharacter(char_set = [9, 14, 32, 33])],
+= [RegexRepeat(children = [RegexCharacter(character_set = [9, 14, 32, 33])],
 one_or_more = True)], group_index = 1, group_name = 'Whitespace'), RegexGroup(
-children = [RegexRepeat(children = [RegexCharacter(char_set = [48, 58])],
+children = [RegexRepeat(children = [RegexCharacter(character_set = [48, 58])],
 one_or_more = True)], group_index = 2, group_name = 'Number')]), RegexGroup(
 children = [RegexSequence(children = [RegexSequence(children = [RegexSequence(
-children = [RegexEmpty(), RegexCharacter(char_set = [102, 103])]),
-RegexCharacter(char_set = [111, 112])]), RegexCharacter(char_set = [114, 115])]
+children = [RegexEmpty(), RegexCharacter(character_set = [102, 103])]),
+RegexCharacter(character_set = [111, 112])]), RegexCharacter(character_set = [114, 115])]
 )], group_index = 3, group_name = 'For')]), RegexGroup(children = [
-RegexSequence(children = [RegexCharacter(char_set = [65, 91, 95, 96, 97, 123]),
-RegexRepeat(children = [RegexCharacter(char_set = [48, 58, 65, 91, 95, 96, 97,
+RegexSequence(children = [RegexCharacter(character_set = [65, 91, 95, 96, 97, 123]),
+RegexRepeat(children = [RegexCharacter(character_set = [48, 58, 65, 91, 95, 96, 97,
 123])])])], group_index = 4, group_name = 'Identifier')])], group_index = 0)])
   #sys.stdout.write(
   #  wrap_repr(
@@ -2459,10 +821,10 @@ RegexRepeat(children = [RegexCharacter(char_set = [48, 58, 65, 91, 95, 96, 97,
   #  )
   #)
 
-  nfa = regex.to_nfa()
+  _nfa = regex.to_nfa()
   #sys.stdout.write(
   #  wrap_repr(
-  #    '  nfa = {0:s}'.format(repr(nfa).replace('regex.', '')),
+  #    '  _nfa = {0:s}'.format(repr(_nfa).replace('regex.', '')),
   #    79
   #  )
   #)
@@ -2471,13 +833,13 @@ RegexRepeat(children = [RegexCharacter(char_set = [48, 58, 65, 91, 95, 96, 97,
   i = 0
   while i < len(text):
     print('text "{0:s}"'.format(text[i:i + 72].replace('\n', '$')))
-    thread = nfa.match_text(text, i)
+    thread = _nfa.match_text(text, i)
     if thread is None:
       print('no match')
       break
     i = thread[0] # end position of overall match
-    group_start = [-1 for j in range(len(nfa.groups))]
-    group_end = [-1 for j in range(len(nfa.groups))]
+    group_start = [-1 for j in range(len(_nfa.groups))]
+    group_end = [-1 for j in range(len(_nfa.groups))]
     while thread is not None:
       pos, mark, thread = thread
       group = mark >> 1
@@ -2486,14 +848,14 @@ RegexRepeat(children = [RegexCharacter(char_set = [48, 58, 65, 91, 95, 96, 97,
         print(
           'group {0:d} name "{1:s}" text "{2:s}"'.format(
              group,
-             nfa.groups[group][0],
+             _nfa.groups[group][0],
              text[group_start[group]:group_end[group]].replace('\n', '$')
           )
         )
       else:
         group_end[group] = pos
 
-  dfa = nfa.to_dfa()
+  dfa = _nfa.to_dfa()
   #sys.stdout.write(
   #  wrap_repr(
   #    '  dfa = {0:s}'.format(repr(dfa).replace('regex.', '')),
@@ -2528,124 +890,124 @@ RegexRepeat(children = [RegexCharacter(char_set = [48, 58, 65, 91, 95, 96, 97,
         group_end[group] = pos
 
   grammar = Grammar(children = [Grammar.Production(children = [RegexSequence(
-children = [RegexSequence(children = [RegexEmpty(), RegexCharacterRule(char_set
-= [288, 295], rule_name = 'whitespace_opt')]), RegexCharacterRule(char_set = [
+children = [RegexSequence(children = [RegexEmpty(), RegexCharacterRule(character_set
+= [288, 295], rule_name = 'whitespace_opt')]), RegexCharacterRule(character_set = [
 259, 262], rule_name = 'expr0')])], nonterminal = 0), Grammar.Production(
-children = [RegexSequence(children = [RegexEmpty(), RegexCharacterRule(char_set
+children = [RegexSequence(children = [RegexEmpty(), RegexCharacterRule(character_set
 = [262, 265], rule_name = 'expr1')])], nonterminal = 1), Grammar.Production(
 children = [RegexSequence(children = [RegexEmpty(), RegexGroup(children = [
 RegexSequence(children = [RegexSequence(children = [RegexSequence(children = [
-RegexSequence(children = [RegexEmpty(), RegexCharacterRule(char_set = [259, 262
-], rule_name = 'expr0')]), RegexCharacter(char_set = [43, 44])]),
-RegexCharacterRule(char_set = [288, 295], rule_name = 'whitespace_opt')]),
-RegexCharacterRule(char_set = [262, 265], rule_name = 'expr1')])], group_index
+RegexSequence(children = [RegexEmpty(), RegexCharacterRule(character_set = [259, 262
+], rule_name = 'expr0')]), RegexCharacter(character_set = [43, 44])]),
+RegexCharacterRule(character_set = [288, 295], rule_name = 'whitespace_opt')]),
+RegexCharacterRule(character_set = [262, 265], rule_name = 'expr1')])], group_index
 = 0, group_name = 'Add')])], nonterminal = 2), Grammar.Production(children = [
 RegexSequence(children = [RegexEmpty(), RegexGroup(children = [RegexSequence(
 children = [RegexSequence(children = [RegexSequence(children = [RegexSequence(
-children = [RegexEmpty(), RegexCharacterRule(char_set = [259, 262], rule_name =
-'expr0')]), RegexCharacter(char_set = [45, 46])]), RegexCharacterRule(char_set
-= [288, 295], rule_name = 'whitespace_opt')]), RegexCharacterRule(char_set = [
+children = [RegexEmpty(), RegexCharacterRule(character_set = [259, 262], rule_name =
+'expr0')]), RegexCharacter(character_set = [45, 46])]), RegexCharacterRule(character_set
+= [288, 295], rule_name = 'whitespace_opt')]), RegexCharacterRule(character_set = [
 262, 265], rule_name = 'expr1')])], group_index = 0, group_name = 'Subtract')])
 ], nonterminal = 3), Grammar.Production(children = [RegexSequence(children = [
-RegexEmpty(), RegexCharacterRule(char_set = [265, 268], rule_name = 'expr2')])
+RegexEmpty(), RegexCharacterRule(character_set = [265, 268], rule_name = 'expr2')])
 ], nonterminal = 4), Grammar.Production(children = [RegexSequence(children = [
 RegexEmpty(), RegexGroup(children = [RegexSequence(children = [RegexSequence(
 children = [RegexSequence(children = [RegexSequence(children = [RegexEmpty(),
-RegexCharacterRule(char_set = [262, 265], rule_name = 'expr1')]),
-RegexCharacter(char_set = [42, 43])]), RegexCharacterRule(char_set = [288, 295
-], rule_name = 'whitespace_opt')]), RegexCharacterRule(char_set = [265, 268],
+RegexCharacterRule(character_set = [262, 265], rule_name = 'expr1')]),
+RegexCharacter(character_set = [42, 43])]), RegexCharacterRule(character_set = [288, 295
+], rule_name = 'whitespace_opt')]), RegexCharacterRule(character_set = [265, 268],
 rule_name = 'expr2')])], group_index = 0, group_name = 'Multiply')])],
 nonterminal = 5), Grammar.Production(children = [RegexSequence(children = [
 RegexEmpty(), RegexGroup(children = [RegexSequence(children = [RegexSequence(
 children = [RegexSequence(children = [RegexSequence(children = [RegexEmpty(),
-RegexCharacterRule(char_set = [262, 265], rule_name = 'expr1')]),
-RegexCharacter(char_set = [47, 48])]), RegexCharacterRule(char_set = [288, 295
-], rule_name = 'whitespace_opt')]), RegexCharacterRule(char_set = [265, 268],
+RegexCharacterRule(character_set = [262, 265], rule_name = 'expr1')]),
+RegexCharacter(character_set = [47, 48])]), RegexCharacterRule(character_set = [288, 295
+], rule_name = 'whitespace_opt')]), RegexCharacterRule(character_set = [265, 268],
 rule_name = 'expr2')])], group_index = 0, group_name = 'Divide')])],
 nonterminal = 6), Grammar.Production(children = [RegexSequence(children = [
 RegexSequence(children = [RegexEmpty(), RegexGroup(children = [RegexSequence(
-children = [RegexEmpty(), RegexCharacterRule(char_set = [268, 288], rule_name =
+children = [RegexEmpty(), RegexCharacterRule(character_set = [268, 288], rule_name =
 'number')])], group_index = 0, group_name = 'Number')]), RegexCharacterRule(
-char_set = [288, 295], rule_name = 'whitespace_opt')])], nonterminal = 7),
+character_set = [288, 295], rule_name = 'whitespace_opt')])], nonterminal = 7),
 Grammar.Production(children = [RegexSequence(children = [RegexEmpty(),
 RegexGroup(children = [RegexSequence(children = [RegexSequence(children = [
-RegexSequence(children = [RegexEmpty(), RegexCharacter(char_set = [45, 46])]),
-RegexCharacterRule(char_set = [288, 295], rule_name = 'whitespace_opt')]),
-RegexCharacterRule(char_set = [265, 268], rule_name = 'expr2')])], group_index
+RegexSequence(children = [RegexEmpty(), RegexCharacter(character_set = [45, 46])]),
+RegexCharacterRule(character_set = [288, 295], rule_name = 'whitespace_opt')]),
+RegexCharacterRule(character_set = [265, 268], rule_name = 'expr2')])], group_index
 = 0, group_name = 'Negate')])], nonterminal = 8), Grammar.Production(children =
 [RegexSequence(children = [RegexSequence(children = [RegexSequence(children = [
 RegexSequence(children = [RegexSequence(children = [RegexEmpty(),
-RegexCharacter(char_set = [40, 41])]), RegexCharacterRule(char_set = [288, 295
-], rule_name = 'whitespace_opt')]), RegexCharacterRule(char_set = [259, 262],
-rule_name = 'expr0')]), RegexCharacter(char_set = [41, 42])]),
-RegexCharacterRule(char_set = [288, 295], rule_name = 'whitespace_opt')])],
+RegexCharacter(character_set = [40, 41])]), RegexCharacterRule(character_set = [288, 295
+], rule_name = 'whitespace_opt')]), RegexCharacterRule(character_set = [259, 262],
+rule_name = 'expr0')]), RegexCharacter(character_set = [41, 42])]),
+RegexCharacterRule(character_set = [288, 295], rule_name = 'whitespace_opt')])],
 nonterminal = 9), Grammar.Production(children = [RegexSequence(children = [
-RegexEmpty(), RegexCharacter(char_set = [48, 49])])], nonterminal = 10),
+RegexEmpty(), RegexCharacter(character_set = [48, 49])])], nonterminal = 10),
 Grammar.Production(children = [RegexSequence(children = [RegexEmpty(),
-RegexCharacter(char_set = [49, 50])])], nonterminal = 11), Grammar.Production(
-children = [RegexSequence(children = [RegexEmpty(), RegexCharacter(char_set = [
+RegexCharacter(character_set = [49, 50])])], nonterminal = 11), Grammar.Production(
+children = [RegexSequence(children = [RegexEmpty(), RegexCharacter(character_set = [
 50, 51])])], nonterminal = 12), Grammar.Production(children = [RegexSequence(
-children = [RegexEmpty(), RegexCharacter(char_set = [51, 52])])], nonterminal =
+children = [RegexEmpty(), RegexCharacter(character_set = [51, 52])])], nonterminal =
 13), Grammar.Production(children = [RegexSequence(children = [RegexEmpty(),
-RegexCharacter(char_set = [52, 53])])], nonterminal = 14), Grammar.Production(
-children = [RegexSequence(children = [RegexEmpty(), RegexCharacter(char_set = [
+RegexCharacter(character_set = [52, 53])])], nonterminal = 14), Grammar.Production(
+children = [RegexSequence(children = [RegexEmpty(), RegexCharacter(character_set = [
 53, 54])])], nonterminal = 15), Grammar.Production(children = [RegexSequence(
-children = [RegexEmpty(), RegexCharacter(char_set = [54, 55])])], nonterminal =
+children = [RegexEmpty(), RegexCharacter(character_set = [54, 55])])], nonterminal =
 16), Grammar.Production(children = [RegexSequence(children = [RegexEmpty(),
-RegexCharacter(char_set = [55, 56])])], nonterminal = 17), Grammar.Production(
-children = [RegexSequence(children = [RegexEmpty(), RegexCharacter(char_set = [
+RegexCharacter(character_set = [55, 56])])], nonterminal = 17), Grammar.Production(
+children = [RegexSequence(children = [RegexEmpty(), RegexCharacter(character_set = [
 56, 57])])], nonterminal = 18), Grammar.Production(children = [RegexSequence(
-children = [RegexEmpty(), RegexCharacter(char_set = [57, 58])])], nonterminal =
+children = [RegexEmpty(), RegexCharacter(character_set = [57, 58])])], nonterminal =
 19), Grammar.Production(children = [RegexSequence(children = [RegexSequence(
-children = [RegexEmpty(), RegexCharacterRule(char_set = [268, 288], rule_name =
-'number')]), RegexCharacter(char_set = [48, 49])])], nonterminal = 20),
+children = [RegexEmpty(), RegexCharacterRule(character_set = [268, 288], rule_name =
+'number')]), RegexCharacter(character_set = [48, 49])])], nonterminal = 20),
 Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
-= [RegexEmpty(), RegexCharacterRule(char_set = [268, 288], rule_name = 'number'
-)]), RegexCharacter(char_set = [49, 50])])], nonterminal = 21),
+= [RegexEmpty(), RegexCharacterRule(character_set = [268, 288], rule_name = 'number'
+)]), RegexCharacter(character_set = [49, 50])])], nonterminal = 21),
 Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
-= [RegexEmpty(), RegexCharacterRule(char_set = [268, 288], rule_name = 'number'
-)]), RegexCharacter(char_set = [50, 51])])], nonterminal = 22),
+= [RegexEmpty(), RegexCharacterRule(character_set = [268, 288], rule_name = 'number'
+)]), RegexCharacter(character_set = [50, 51])])], nonterminal = 22),
 Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
-= [RegexEmpty(), RegexCharacterRule(char_set = [268, 288], rule_name = 'number'
-)]), RegexCharacter(char_set = [51, 52])])], nonterminal = 23),
+= [RegexEmpty(), RegexCharacterRule(character_set = [268, 288], rule_name = 'number'
+)]), RegexCharacter(character_set = [51, 52])])], nonterminal = 23),
 Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
-= [RegexEmpty(), RegexCharacterRule(char_set = [268, 288], rule_name = 'number'
-)]), RegexCharacter(char_set = [52, 53])])], nonterminal = 24),
+= [RegexEmpty(), RegexCharacterRule(character_set = [268, 288], rule_name = 'number'
+)]), RegexCharacter(character_set = [52, 53])])], nonterminal = 24),
 Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
-= [RegexEmpty(), RegexCharacterRule(char_set = [268, 288], rule_name = 'number'
-)]), RegexCharacter(char_set = [53, 54])])], nonterminal = 25),
+= [RegexEmpty(), RegexCharacterRule(character_set = [268, 288], rule_name = 'number'
+)]), RegexCharacter(character_set = [53, 54])])], nonterminal = 25),
 Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
-= [RegexEmpty(), RegexCharacterRule(char_set = [268, 288], rule_name = 'number'
-)]), RegexCharacter(char_set = [54, 55])])], nonterminal = 26),
+= [RegexEmpty(), RegexCharacterRule(character_set = [268, 288], rule_name = 'number'
+)]), RegexCharacter(character_set = [54, 55])])], nonterminal = 26),
 Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
-= [RegexEmpty(), RegexCharacterRule(char_set = [268, 288], rule_name = 'number'
-)]), RegexCharacter(char_set = [55, 56])])], nonterminal = 27),
+= [RegexEmpty(), RegexCharacterRule(character_set = [268, 288], rule_name = 'number'
+)]), RegexCharacter(character_set = [55, 56])])], nonterminal = 27),
 Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
-= [RegexEmpty(), RegexCharacterRule(char_set = [268, 288], rule_name = 'number'
-)]), RegexCharacter(char_set = [56, 57])])], nonterminal = 28),
+= [RegexEmpty(), RegexCharacterRule(character_set = [268, 288], rule_name = 'number'
+)]), RegexCharacter(character_set = [56, 57])])], nonterminal = 28),
 Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
-= [RegexEmpty(), RegexCharacterRule(char_set = [268, 288], rule_name = 'number'
-)]), RegexCharacter(char_set = [57, 58])])], nonterminal = 29),
+= [RegexEmpty(), RegexCharacterRule(character_set = [268, 288], rule_name = 'number'
+)]), RegexCharacter(character_set = [57, 58])])], nonterminal = 29),
 Grammar.Production(children = [RegexEmpty()], nonterminal = 30),
 Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
-= [RegexEmpty(), RegexCharacterRule(char_set = [288, 295], rule_name =
-'whitespace_opt')]), RegexCharacter(char_set = [9, 10])])], nonterminal = 31),
+= [RegexEmpty(), RegexCharacterRule(character_set = [288, 295], rule_name =
+'whitespace_opt')]), RegexCharacter(character_set = [9, 10])])], nonterminal = 31),
 Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
-= [RegexEmpty(), RegexCharacterRule(char_set = [288, 295], rule_name =
-'whitespace_opt')]), RegexCharacter(char_set = [10, 11])])], nonterminal = 32),
+= [RegexEmpty(), RegexCharacterRule(character_set = [288, 295], rule_name =
+'whitespace_opt')]), RegexCharacter(character_set = [10, 11])])], nonterminal = 32),
 Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
-= [RegexEmpty(), RegexCharacterRule(char_set = [288, 295], rule_name =
-'whitespace_opt')]), RegexCharacter(char_set = [11, 12])])], nonterminal = 33),
+= [RegexEmpty(), RegexCharacterRule(character_set = [288, 295], rule_name =
+'whitespace_opt')]), RegexCharacter(character_set = [11, 12])])], nonterminal = 33),
 Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
-= [RegexEmpty(), RegexCharacterRule(char_set = [288, 295], rule_name =
-'whitespace_opt')]), RegexCharacter(char_set = [12, 13])])], nonterminal = 34),
+= [RegexEmpty(), RegexCharacterRule(character_set = [288, 295], rule_name =
+'whitespace_opt')]), RegexCharacter(character_set = [12, 13])])], nonterminal = 34),
 Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
-= [RegexEmpty(), RegexCharacterRule(char_set = [288, 295], rule_name =
-'whitespace_opt')]), RegexCharacter(char_set = [13, 14])])], nonterminal = 35),
+= [RegexEmpty(), RegexCharacterRule(character_set = [288, 295], rule_name =
+'whitespace_opt')]), RegexCharacter(character_set = [13, 14])])], nonterminal = 35),
 Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
-= [RegexEmpty(), RegexCharacterRule(char_set = [288, 295], rule_name =
-'whitespace_opt')]), RegexCharacter(char_set = [32, 33])])], nonterminal = 36)
-], terminal_thres = 258)
+= [RegexEmpty(), RegexCharacterRule(character_set = [288, 295], rule_name =
+'whitespace_opt')]), RegexCharacter(character_set = [32, 33])])], nonterminal = 36)
+], n_terminals = 258)
   #sys.stdout.write(
   #  wrap_repr(
   #    '  grammar = {0:s}'.format(repr(grammar).replace('regex.', '')),