Split out regex.py module into regexes, NFAs, DFAs, grammars, LR1s and LR1DFAs
author: Nick Downing <downing.nick@gmail.com>
Sun, 8 Jul 2018 05:39:58 +0000 (15:39 +1000)
committer: Nick Downing <downing.nick@gmail.com>
Sun, 8 Jul 2018 05:39:58 +0000 (15:39 +1000)
ast.py
bisect_set.py [new file with mode: 0644]
bison_lr1dfa.py
dfa.py [new file with mode: 0644]
grammar.py [new file with mode: 0644]
grammar.sh [new file with mode: 0755]
lr1.py [new file with mode: 0644]
lr1dfa.py [new file with mode: 0644]
nfa.py [new file with mode: 0644]
regex.py

diff --git a/ast.py b/ast.py
index bed3dd1..26e3a1f 100644 (file)
--- a/ast.py
+++ b/ast.py
@@ -1,5 +1,5 @@
 import element
-import regex
+import grammar
 
 class Item(element.Element):
   # GENERATE ELEMENT() BEGIN
@@ -1871,7 +1871,7 @@ class PYACC(element.Element):
           lhs_symbol,
           name_to_symbol
         ):
-          production = regex.Grammar.Production(
+          production = grammar.Grammar.Production(
             nonterminal = len(pyacc.grammar)
           )
           for i in range(len(self)):
@@ -1881,13 +1881,13 @@ class PYACC(element.Element):
                 assert character != 0 # would conflict with YYEOF
                 pyacc.characters_used.add(character)
                 production.append(
-                  regex.Grammar.Production.Symbol(
+                  grammar.Grammar.Production.Symbol(
                     terminal_set = [character, character + 1]
                   )
                 )
               elif isinstance(self[i][0], PYACC.ID):
                 production.append(
-                  regex.Grammar.Production.NamedSymbol(
+                  grammar.Grammar.Production.NamedSymbol(
                     # (non)terminal_set will be filled in later once assigned
                     name = element.get_text(self[i][0], 0)
                   )
@@ -2220,11 +2220,11 @@ class PYACC(element.Element):
       PYACC.Symbol(name = '$undefined', character_set = [0x101, 0x102])
     ]
     self.nonterminal_symbols = []
-    self.grammar = regex.Grammar(
+    self.grammar = grammar.Grammar(
       children = [
-        regex.Grammar.Production(
+        grammar.Grammar.Production(
           children = [
-            regex.Grammar.Production.NamedSymbol()
+            grammar.Grammar.Production.NamedSymbol()
           ],
           nonterminal = 0
         )
@@ -2264,7 +2264,7 @@ class PYACC(element.Element):
       )
     )
 
-# GENERATE FACTORY(regex.factory) BEGIN
+# GENERATE FACTORY(grammar.factory) BEGIN
 tag_to_class = {
   'Item': Item,
   'PYACC': PYACC,
@@ -2334,5 +2334,5 @@ tag_to_class = {
   'PYACC_ValueReference': PYACC.ValueReference
 }
 def factory(tag, attrib = {}, *args, **kwargs):
-  return tag_to_class.get(tag, regex.factory)(tag, attrib, *args, **kwargs)
+  return tag_to_class.get(tag, grammar.factory)(tag, attrib, *args, **kwargs)
 # GENERATE END
diff --git a/bisect_set.py b/bisect_set.py
new file mode 100644 (file)
index 0000000..6ce1be9
--- /dev/null
+++ b/bisect_set.py
@@ -0,0 +1,71 @@
+# defines the alphabet size, set this to 0x11000 for unicode
+n_characters = 0x100
+
+def bisect_set_or(character_set0, character_set1):
+  # calculate union of the child sets
+  # we do this by calculating a series of breakpoints, at each breakpoint
+  # evaluating the "or" (max) of the even/odd truth values of each child,
+  # then making the output truth value even/odd by outputting if necessary
+  result = []
+  i = 0
+  j = 0
+  while True:
+    if i < len(character_set0):
+      k = character_set0[i]
+      if j < len(character_set1):
+        k = min(k, character_set1[j])
+    elif j < len(character_set1):
+      k = character_set1[j]
+    else:
+      break
+    if i < len(character_set0) and character_set0[i] == k:
+      i += 1
+    if j < len(character_set1) and character_set1[j] == k:
+      j += 1
+    if (len(result) & 1) != max(i & 1, j & 1):
+      result.append(k)
+  assert (i & 1) == 0 and (j & 1) == 0
+  return result
+
+def bisect_set_and(character_set0, character_set1):
+  # calculate intersection of the child sets
+  # we do this by calculating a series of breakpoints, at each breakpoint
+  # evaluating the "and" (min) of the even/odd truth values of each child,
+  # then making the output truth value even/odd by outputting if necessary
+  result = []
+  i = 0
+  j = 0
+  while True:
+    if i < len(character_set0):
+      k = character_set0[i]
+      if j < len(character_set1):
+        k = min(k, character_set1[j])
+    elif j < len(character_set1):
+      k = character_set1[j]
+    else:
+      break
+    if i < len(character_set0) and character_set0[i] == k:
+      i += 1
+    if j < len(character_set1) and character_set1[j] == k:
+      j += 1
+    if (len(result) & 1) != min(i & 1, j & 1):
+      result.append(k)
+  assert (i & 1) == 0 and (j & 1) == 0
+  return result
+
+def bisect_set_not(character_set):
+  # calculate complement of the child set
+  # if child set begins with [0], remove it, otherwise add [0] prefix
+  # if child set ends with [n_characters], remove it, otherwise add [n_characters] suffix
+  # the suffix part is not totally necessary, but makes sure length is even
+  # (the evenness is so that single character sets can always be [c, c + 1])
+  result = list(character_set)
+  if result[:1] == [0]:
+    del result[:1]
+  else:
+    result[:0] = [0]
+  if result[-1:] == [n_characters]:
+    del result[-1:]
+  else:
+    result.append(n_characters)
+  return result
diff --git a/bison_lr1dfa.py b/bison_lr1dfa.py
index 43ecf10..c8c93d4 100644 (file)
--- a/bison_lr1dfa.py
+++ b/bison_lr1dfa.py
@@ -1,6 +1,5 @@
 import element
 import numpy
-import regex
 import sys
 
 class BisonLR1DFA:
diff --git a/dfa.py b/dfa.py
new file mode 100644 (file)
index 0000000..46f4cbe
--- /dev/null
+++ b/dfa.py
@@ -0,0 +1,221 @@
+import bisect
+import element
+import work
+import sys
+
+# defines the alphabet size, set this to 0x11000 for unicode
+n_characters = 0x100
+
+class DFA:
+  # transition classes:
+  # instructions to transform thread list from a multistate to next multistate
+  # (TRANSITION_POP, n)                        j += n
+  # (TRANSITION_DUP, n)                        threads0[j - n:j] = threads0[j:j + n]
+  #                                    j -= n
+  # (TRANSITION_MARK, n, mark_value)   threads0[j:j + n] = [
+  #                                      (i, mark, thread)
+  #                                      for thread in threads0[j:j + n]
+  #                                    ]
+  # (TRANSITION_MOVE, n)               threads1.extend(threads0[j:j + n])
+  #                                    j += n
+  # (TRANSITION_DEL, n)                        del threads1[-n:]
+
+  TRANSITION_POP = 0
+  TRANSITION_DUP = 1
+  TRANSITION_MARK = 2
+  TRANSITION_MOVE = 3
+  #TRANSITION_DEL = 4
+  def __init__(
+    self,
+    groups = [],
+    states = [([n_characters], [0], 0)],
+    actions = [(0, [])],
+    start_action = [] # can have multiple DFAs in same container
+  ):
+    # groups: list of group_desc
+    # group_desc: (tag, kwargs)
+    #   tag, kwargs will be passed to apply_markup() hence factory()
+    # states: list of state_desc
+    # state_desc: (list of breaks, list of action to do, accept_thread)
+    # actions: list of action_desc
+    # action_desc: (state to go to next, compiled transition to do first)
+    # accept_thread: which thread of thread list to use, -1 don't accept
+    self.groups = groups
+    self.states = states
+    self.actions = actions
+    self.start_action = start_action
+
+  def match_text(self, text, i, start_index = 0):
+    def transit(transition):
+      nonlocal threads0, threads1, prefix_slop # note: also uses i
+      j = prefix_slop
+      for trans in transition:
+        if trans[0] == DFA.TRANSITION_POP:
+          j += trans[1]
+        elif trans[0] == DFA.TRANSITION_DUP:
+          while j < trans[1]:
+            threads0[:0] = [None] * prefix_slop
+            threads1[:0] = [None] * prefix_slop
+            j += prefix_slop
+            prefix_slop *= 2
+          threads0[j - trans[1]:j] = threads0[j:j + trans[1]]
+          j -= trans[1]
+        elif trans[0] == DFA.TRANSITION_MARK:
+          threads0[j:j + trans[1]] = [
+            (i, trans[2], thread)
+            for thread in threads0[j:j + trans[1]]
+          ]
+        elif trans[0] == DFA.TRANSITION_MOVE:
+          threads1.extend(threads0[j:j + trans[1]])
+          j += trans[1]
+        #elif trans[0] == DFA.TRANSITION_DEL:
+        #  del threads1[-trans[1]:]
+        else:
+          assert False
+      assert j == len(threads0)
+      threads0, threads1 = threads1, threads0
+      del threads1[prefix_slop:]
+
+    threads0 = [None, None]
+    threads1 = [None]
+    prefix_slop = 1
+
+    action = self.start_action[start_index]
+    while action != -1:
+      state, transition = self.actions[action]
+      #print('i', i, 'action', action, 'state', state, 'transition', transition)
+      transit(transition)
+      if state == 0:
+        # there is only one match, which is complete
+        assert len(threads0) == prefix_slop + 1
+        return threads0[prefix_slop]
+      if i >= len(text):
+        # return best match we have, but not incomplete match
+        i = self.states[state][2]
+        return (None if i == -1 else threads0[prefix_slop + i])
+      action = self.states[state][1][
+        bisect.bisect_right(self.states[state][0], ord(text[i]))
+      ]
+      i += 1
+    return None
+
+  def match_yychunk(self, root, pos, off, yychunk_iter, start_index = 0):
+    if pos < 0:
+      pos, off = element.to_start_relative(root, pos, off)
+
+    def transit(transition):
+      nonlocal threads0, threads1, prefix_slop # note: also uses pos, off
+      j = prefix_slop
+      for trans in transition:
+        if trans[0] == DFA.TRANSITION_POP:
+          j += trans[1]
+        elif trans[0] == DFA.TRANSITION_DUP:
+          while j < trans[1]:
+            threads0[:0] = [None] * prefix_slop
+            threads1[:0] = [None] * prefix_slop
+            j += prefix_slop
+            prefix_slop *= 2
+          threads0[j - trans[1]:j] = threads0[j:j + trans[1]]
+          j -= trans[1]
+        elif trans[0] == DFA.TRANSITION_MARK:
+          threads0[j:j + trans[1]] = [
+            (pos, off, trans[2], thread)
+            for thread in threads0[j:j + trans[1]]
+          ]
+        elif trans[0] == DFA.TRANSITION_MOVE:
+          threads1.extend(threads0[j:j + trans[1]])
+          j += trans[1]
+        #elif trans[0] == DFA.TRANSITION_DEL:
+        #  del threads1[-trans[1]:]
+        else:
+          assert False
+      assert j == len(threads0)
+      threads0, threads1 = threads1, threads0
+      del threads1[prefix_slop:]
+
+    threads0 = [None, None]
+    threads1 = [None]
+    prefix_slop = 1
+
+    action = self.start_action[start_index]
+    text = element.get_text(root, pos)
+    while action != -1:
+      state, transition = self.actions[action]
+      transit(transition)
+      if state == 0:
+        # there is only one match, which is complete
+        assert len(threads0) == prefix_slop + 1
+        return threads0[prefix_slop]
+      while off >= len(text):
+        if pos < len(root):
+          pos += 1
+          off = 0
+        else:
+          try:
+            next(yychunk_iter)
+          except StopIteration:
+            # return best match we have, but not incomplete match
+            i = self.states[state][2]
+            return (None if i == -1 else threads0[prefix_slop + i])
+          text = element.get_text(root, pos)
+      #print(
+      #  'state {0:d} pos {1:d} off {2:d} text "{3:s}"'.format(
+      #    state,
+      #    pos,
+      #    off,
+      #    text.replace('\n', '$')
+      #  )
+      #)
+      action = self.states[state][1][
+        bisect.bisect_right(self.states[state][0], ord(text[off]))
+      ]
+      off += 1
+    return None
+
+  def yylex(self, root, pos, off, factory, yychunk_iter):
+    if pos < 0:
+      pos, off = element.to_start_relative(root, pos, off)
+
+    while True:
+      # note: pointers must be kept start relative during the below call,
+      # because it extends the following text by calling the yychunk_iter
+      thread = self.match_yychunk(root, pos, off, yychunk_iter)
+      if thread is None:
+        break
+      stack = []
+      while True:
+        pos, off, mark_value, thread = thread
+        group_index = mark_value >> 1
+        if (mark_value & 1) != 0:
+          end_pos, end_off = element.to_end_relative(root, pos, off)
+          stack.append((end_pos, end_off, group_index))
+        else:
+          end_pos, end_off, temp = stack.pop()
+          assert temp == group_index
+          if len(stack) == 0:
+            break
+          tag, kwargs = self.groups[group_index]
+          if tag != '':
+            work.apply_markup(
+              root,
+              pos,
+              off,
+              end_pos,
+              end_off,
+              factory,
+              tag,
+              **kwargs
+            )
+      # note: pointers must be kept end relative during the below call,
+      # because it modifies the preceding text by calling apply_markup()
+      yield end_pos, end_off, group_index
+      pos, off = element.to_start_relative(root, end_pos, end_off)
+
+  def __repr__(self):
+    return 'dfa.DFA({0:s}, {1:s}, {2:s}, {3:s})'.format(
+      repr(self.groups),
+      repr(self.states),
+      repr(self.actions),
+      repr(self.start_action)
+    )
diff --git a/grammar.py b/grammar.py
new file mode 100644 (file)
index 0000000..4054805
--- /dev/null
+++ b/grammar.py
@@ -0,0 +1,328 @@
+import bisect
+import bisect_set
+import element
+import lr1
+import work
+import sys
+
+# defines the alphabet size, set this to 0x11000 for unicode
+n_characters = 0x100
+
+class Grammar(element.Element):
+  class Production(element.Element):
+    class Symbol(element.Element):
+      # GENERATE ELEMENT(list(int) terminal_set, list(int) nonterminal_set) BEGIN
+      def __init__(
+        self,
+        tag = 'Grammar_Production_Symbol',
+        attrib = {},
+        text = '',
+        children = [],
+        terminal_set = [],
+        nonterminal_set = []
+      ):
+        element.Element.__init__(
+          self,
+          tag,
+          attrib,
+          text,
+          children
+        )
+        self.terminal_set = (
+          [element.deserialize_int(i) for i in terminal_set.split()]
+        if isinstance(terminal_set, str) else
+          terminal_set
+        )
+        self.nonterminal_set = (
+          [element.deserialize_int(i) for i in nonterminal_set.split()]
+        if isinstance(nonterminal_set, str) else
+          nonterminal_set
+        )
+      def serialize(self, ref_list, indent = 0):
+        element.Element.serialize(self, ref_list, indent)
+        self.set(
+          'terminal_set',
+          ' '.join([element.serialize_int(i) for i in self.terminal_set])
+        )
+        self.set(
+          'nonterminal_set',
+          ' '.join([element.serialize_int(i) for i in self.nonterminal_set])
+        )
+      def deserialize(self, ref_list):
+        element.Element.deserialize(self, ref_list)
+        self.terminal_set = [
+          element.deserialize_int(i)
+          for i in self.get('terminal_set', '').split()
+        ]
+        self.nonterminal_set = [
+          element.deserialize_int(i)
+          for i in self.get('nonterminal_set', '').split()
+        ]
+      def copy(self, factory = None):
+        result = element.Element.copy(
+          self,
+          Symbol if factory is None else factory
+        )
+        result.terminal_set = self.terminal_set
+        result.nonterminal_set = self.nonterminal_set
+        return result
+      def repr_serialize(self, params):
+        element.Element.repr_serialize(self, params)
+        if len(self.terminal_set):
+          params.append(
+            'terminal_set = [{0:s}]'.format(
+              ', '.join([repr(i) for i in self.terminal_set])
+            )
+          )
+        if len(self.nonterminal_set):
+          params.append(
+            'nonterminal_set = [{0:s}]'.format(
+              ', '.join([repr(i) for i in self.nonterminal_set])
+            )
+          )
+      def __repr__(self):
+        params = []
+        self.repr_serialize(params)
+        return 'grammar.Grammar.Production.Symbol({0:s})'.format(', '.join(params))
+      # GENERATE END
+      def post_process(self, name_to_character_sets):
+        pass
+
+    class NamedSymbol(Symbol):
+      # GENERATE ELEMENT(str name) BEGIN
+      def __init__(
+        self,
+        tag = 'Grammar_Production_NamedSymbol',
+        attrib = {},
+        text = '',
+        children = [],
+        terminal_set = [],
+        nonterminal_set = [],
+        name = ''
+      ):
+        Grammar.Production.Symbol.__init__(
+          self,
+          tag,
+          attrib,
+          text,
+          children,
+          terminal_set,
+          nonterminal_set
+        )
+        self.name = name
+      def serialize(self, ref_list, indent = 0):
+        Grammar.Production.Symbol.serialize(self, ref_list, indent)
+        self.set('name', element.serialize_str(self.name))
+      def deserialize(self, ref_list):
+        Grammar.Production.Symbol.deserialize(self, ref_list)
+        self.name = element.deserialize_str(self.get('name', ''))
+      def copy(self, factory = None):
+        result = Grammar.Production.Symbol.copy(
+          self,
+          NamedSymbol if factory is None else factory
+        )
+        result.name = self.name
+        return result
+      def repr_serialize(self, params):
+        Grammar.Production.Symbol.repr_serialize(self, params)
+        if self.name != '':
+          params.append(
+            'name = {0:s}'.format(repr(self.name))
+          )
+      def __repr__(self):
+        params = []
+        self.repr_serialize(params)
+        return 'grammar.Grammar.Production.NamedSymbol({0:s})'.format(', '.join(params))
+      # GENERATE END
+      def post_process(self, name_to_character_sets):
+        self.terminal_set, self.nonterminal_set = (
+          name_to_character_sets[self.name]
+        )
+    # GENERATE ELEMENT(int nonterminal, int precedence, int associativity) BEGIN
+    def __init__(
+      self,
+      tag = 'Grammar_Production',
+      attrib = {},
+      text = '',
+      children = [],
+      nonterminal = -1,
+      precedence = -1,
+      associativity = -1
+    ):
+      element.Element.__init__(
+        self,
+        tag,
+        attrib,
+        text,
+        children
+      )
+      self.nonterminal = (
+        element.deserialize_int(nonterminal)
+      if isinstance(nonterminal, str) else
+        nonterminal
+      )
+      self.precedence = (
+        element.deserialize_int(precedence)
+      if isinstance(precedence, str) else
+        precedence
+      )
+      self.associativity = (
+        element.deserialize_int(associativity)
+      if isinstance(associativity, str) else
+        associativity
+      )
+    def serialize(self, ref_list, indent = 0):
+      element.Element.serialize(self, ref_list, indent)
+      self.set('nonterminal', element.serialize_int(self.nonterminal))
+      self.set('precedence', element.serialize_int(self.precedence))
+      self.set('associativity', element.serialize_int(self.associativity))
+    def deserialize(self, ref_list):
+      element.Element.deserialize(self, ref_list)
+      self.nonterminal = element.deserialize_int(self.get('nonterminal', '-1'))
+      self.precedence = element.deserialize_int(self.get('precedence', '-1'))
+      self.associativity = element.deserialize_int(self.get('associativity', '-1'))
+    def copy(self, factory = None):
+      result = element.Element.copy(
+        self,
+        Production if factory is None else factory
+      )
+      result.nonterminal = self.nonterminal
+      result.precedence = self.precedence
+      result.associativity = self.associativity
+      return result
+    def repr_serialize(self, params):
+      element.Element.repr_serialize(self, params)
+      if self.nonterminal != -1:
+        params.append(
+          'nonterminal = {0:s}'.format(repr(self.nonterminal))
+        )
+      if self.precedence != -1:
+        params.append(
+          'precedence = {0:s}'.format(repr(self.precedence))
+        )
+      if self.associativity != -1:
+        params.append(
+          'associativity = {0:s}'.format(repr(self.associativity))
+        )
+    def __repr__(self):
+      params = []
+      self.repr_serialize(params)
+      return 'grammar.Grammar.Production({0:s})'.format(', '.join(params))
+    # GENERATE END
+    def post_process(self, nonterminal, name_to_character_sets):
+      self.nonterminal = nonterminal
+      for i in self:
+        i.post_process(name_to_character_sets)
+    def add_to_lr1(self, lr1):
+      lr1.productions.append(
+        (
+          # precedence
+          self.precedence * 2 + self.associativity,
+          # symbols
+          [(i.terminal_set, i.nonterminal_set) for i in self],
+          # lookaheads (list of initial_set, can_be_empty)
+          [([], False) for i in range(len(self))] + [([], True)],
+          # group_bounds
+          []
+        )
+      )
+
+  # GENERATE ELEMENT(int n_terminals, int eof_terminal) BEGIN
+  def __init__(
+    self,
+    tag = 'Grammar',
+    attrib = {},
+    text = '',
+    children = [],
+    n_terminals = -1,
+    eof_terminal = -1
+  ):
+    element.Element.__init__(
+      self,
+      tag,
+      attrib,
+      text,
+      children
+    )
+    self.n_terminals = (
+      element.deserialize_int(n_terminals)
+    if isinstance(n_terminals, str) else
+      n_terminals
+    )
+    self.eof_terminal = (
+      element.deserialize_int(eof_terminal)
+    if isinstance(eof_terminal, str) else
+      eof_terminal
+    )
+  def serialize(self, ref_list, indent = 0):
+    element.Element.serialize(self, ref_list, indent)
+    self.set('n_terminals', element.serialize_int(self.n_terminals))
+    self.set('eof_terminal', element.serialize_int(self.eof_terminal))
+  def deserialize(self, ref_list):
+    element.Element.deserialize(self, ref_list)
+    self.n_terminals = element.deserialize_int(self.get('n_terminals', '-1'))
+    self.eof_terminal = element.deserialize_int(self.get('eof_terminal', '-1'))
+  def copy(self, factory = None):
+    result = element.Element.copy(
+      self,
+      Grammar if factory is None else factory
+    )
+    result.n_terminals = self.n_terminals
+    result.eof_terminal = self.eof_terminal
+    return result
+  def repr_serialize(self, params):
+    element.Element.repr_serialize(self, params)
+    if self.n_terminals != -1:
+      params.append(
+        'n_terminals = {0:s}'.format(repr(self.n_terminals))
+      )
+    if self.eof_terminal != -1:
+      params.append(
+        'eof_terminal = {0:s}'.format(repr(self.eof_terminal))
+      )
+  def __repr__(self):
+    params = []
+    self.repr_serialize(params)
+    return 'grammar.Grammar({0:s})'.format(', '.join(params))
+  # GENERATE END
+  def post_process(self, name_to_character_sets):
+    for i in range(len(self)):
+      self[i].post_process(i, name_to_character_sets)
+  def to_lr1(self):
+    _lr1 = lr1.LR1([], self.n_terminals, self.eof_terminal)
+    for i in self:
+      i.add_to_lr1(_lr1)
+    # propagate lookaheads
+    modified = True
+    while modified:
+      modified = False
+      for _, symbols, lookaheads, _ in _lr1.productions:
+        for i in range(len(symbols) - 1, -1, -1):
+          initial_set, nonterminal_set = symbols[i]
+          can_be_empty = False
+          for j in range(0, len(nonterminal_set), 2):
+            for k in range(nonterminal_set[j], nonterminal_set[j + 1]):
+              child_initial_set, child_can_be_empty = _lr1.productions[k][2][0]
+              initial_set = bisect_set.bisect_set_or(initial_set, child_initial_set)
+              can_be_empty = can_be_empty or child_can_be_empty
+          # at this point can_be_empty refers to current symbol only
+          if can_be_empty:
+            next_initial_set, can_be_empty = lookaheads[i + 1]
+            initial_set = bisect_set.bisect_set_or(initial_set, next_initial_set)
+          # at this point can_be_empty refers to all remaining symbols
+          if (initial_set, can_be_empty) != lookaheads[i]:
+            lookaheads[i] = (initial_set, can_be_empty)
+            modified = True
+    return _lr1
+
+# GENERATE FACTORY(element.Element) BEGIN
+tag_to_class = {
+  'Grammar': Grammar,
+  'Grammar_Production': Grammar.Production,
+  'Grammar_Production_Symbol': Grammar.Production.Symbol,
+  'Grammar_Production_NamedSymbol': Grammar.Production.NamedSymbol
+}
+def factory(tag, attrib = {}, *args, **kwargs):
+  return tag_to_class.get(tag, element.Element)(tag, attrib, *args, **kwargs)
+# GENERATE END
diff --git a/grammar.sh b/grammar.sh
new file mode 100755 (executable)
index 0000000..67d4650
--- /dev/null
+++ b/grammar.sh
@@ -0,0 +1,7 @@
+#!/bin/sh
+if ./generate.py grammar <grammar.py >grammar.py.new && ! diff -q grammar.py grammar.py.new
+then
+  mv grammar.py.new grammar.py
+else
+  rm -f grammar.py.new
+fi
diff --git a/lr1.py b/lr1.py
new file mode 100644 (file)
index 0000000..4d4b410
--- /dev/null
+++ b/lr1.py
@@ -0,0 +1,504 @@
+import bisect
+import bisect_set
+import element
+import lr1dfa
+import work
+import sys
+
# defines the alphabet size, set this to 0x11000 for unicode
n_characters = 0x100

class LR1:
  """An LR(1) grammar: productions over terminal/nonterminal break sets
  with per-position lookahead information.

  The grammar can be interpreted directly (parse_text, parse_yychunk,
  which build lookahead item sets on the fly) or compiled into an
  lr1dfa.LR1DFA table via to_clr1() (canonical LR(1)) or to_lalr1()
  (LALR(1), which merges states that differ only in lookahead sets).
  """

  def __init__(
    self,
    productions = None,
    n_terminals = n_characters + 1,
    eof_terminal = n_characters
  ):
    # productions: list of production
    # production: (
    #   priority,
    #   symbols,
    #   lookaheads,
    #   group_bounds
    # )
    # priority: bit 0 = right to left, bits 1: = numeric priority
    # symbols: list of symbol_desc
    # symbol_desc: (terminal_set, nonterminal_set)
    # terminal_set: similar to character_set, even length list of pairs of breaks
    # nonterminal_set: as above but has n_terminals subtracted from breaks
    # lookaheads: list of lookahead_desc, len(lookaheads) = len(symbols) + 1
    # lookahead_desc: (initial_set, can_be_empty)
    # initial_set: what terminals can occur at this position in symbols array,
    # computed such that lookaheads[i][0] is symbols[i][0], plus initial set of
    # each nonterminal from symbols[i][1], plus lookaheads[i + 1][0] if any
    # nonterminal of symbols[i][1] can be empty (may pull in i + 2, i + 3 etc)
    # can_be_empty: whether all symbols from this position to end can be empty
    # note: last lookahead_desc is a sentinel consisting of ([], True), so that
    # initial_set is empty (will be augmented by context) and can_be_empty is
    # True (because all symbols from the end to the end can obviously be empty)
    # group_bounds: list of group_bound
    # group_bound: (start_index, end_index, tag, kwargs)
    #   where start_index, end_index are indices into list of character_set,
    #   and tag, kwargs will be passed to apply_markup() hence factory(),
    #   noting that markup has to be applied in reverse order of the list
    # n_terminals: offset to apply to productions[] index to get symbol
    #   (character set code), also symbol for productions[0] = start production
    # eof_terminal: usually == n_terminals - 1 (must be valid terminal value)
    # a default of None (instead of a shared mutable []) gives every
    # instance constructed with LR1() its own fresh productions list
    self.productions = [] if productions is None else productions
    self.n_terminals = n_terminals
    self.eof_terminal = eof_terminal

  def lookahead_item_set_closure(self, items, item_to_index):
    """Close `items` in place under nonterminal expansion.

    items: list of (production, dot position, lookahead_set); both
    `items` and `item_to_index` (key (production, dot) -> index into
    items) are mutated.  Uses a worklist so that lookahead sets are
    propagated until a fixed point is reached.
    """
    in_queue = [True for i in range(len(items))]
    queue = list(range(len(items)))

    qhead = 0
    while qhead < len(queue):
      i = queue[qhead]
      in_queue[i] = False
      j, k, lookahead_set = items[i]
      _, symbols, lookaheads, _ = self.productions[j]
      if k < len(symbols):
        _, nonterminal_set = symbols[k]
        if len(nonterminal_set):
          # lookahead for the expanded nonterminals is what can follow
          # position k; if the tail can be empty, the item's own
          # lookahead shows through as well
          next_lookahead_set, next_can_be_empty = lookaheads[k + 1]
          if next_can_be_empty:
            next_lookahead_set = bisect_set.bisect_set_or(
              next_lookahead_set,
              lookahead_set
            )
          for l in range(0, len(nonterminal_set), 2):
            for m in range(nonterminal_set[l], nonterminal_set[l + 1]):
              key = (m, 0)
              if key in item_to_index:
                n = item_to_index[key]
                child_lookahead_set = bisect_set.bisect_set_or(
                  items[n][2],
                  next_lookahead_set
                )
                if child_lookahead_set != items[n][2]:
                  items[n] = (m, 0, child_lookahead_set)
                  if not in_queue[n]:
                    # optimization: do not re-queue unless it is transparent
                    # to changes in the lookahead set wrt. further propagation
                    _, child_symbols, child_lookaheads, _ = self.productions[m]
                    if len(child_symbols) and child_lookaheads[1][1]:
                      in_queue[n] = True
                      queue.append(n)
              else:
                n = len(items)
                items.append((m, 0, next_lookahead_set))
                item_to_index[key] = n
                in_queue.append(True)
                queue.append(n)
      qhead += 1

  def lookahead_item_set_shift(self, items, terminal):
    """Shift `terminal` over every item in `items`.

    Returns (next_items, next_item_to_index, reductions, terminal0,
    terminal1) where reductions is the set of productions whose items
    are complete with `terminal` in their lookahead set, and
    [terminal0, terminal1) is the maximal interval around `terminal`
    over which this result is unchanged (used by to_clr1/to_lalr1 to
    step through the terminal alphabet in runs).
    """
    next_items = []
    next_item_to_index = {}
    reductions = set()
    terminal0 = 0
    terminal1 = self.n_terminals
    for i, j, lookahead_set in items:
      _, symbols, _, _ = self.productions[i]
      if j < len(symbols):
        terminal_set, _ = symbols[j]
        k = bisect.bisect_right(terminal_set, terminal)
        if k > 0 and terminal0 < terminal_set[k - 1]:
          terminal0 = terminal_set[k - 1]
        if k < len(terminal_set) and terminal1 > terminal_set[k]:
          terminal1 = terminal_set[k]
        if (k & 1) == 1:
          # odd break index == terminal inside the set: item advances
          next_item_to_index[(i, j + 1)] = len(next_items)
          next_items.append((i, j + 1, lookahead_set))
      else:
        k = bisect.bisect_right(lookahead_set, terminal)
        if k > 0 and terminal0 < lookahead_set[k - 1]:
          terminal0 = lookahead_set[k - 1]
        if k < len(lookahead_set) and terminal1 > lookahead_set[k]:
          terminal1 = lookahead_set[k]
        if (k & 1) == 1:
          reductions.add(i)
    return next_items, next_item_to_index, reductions, terminal0, terminal1

  def lookahead_item_set_goto(self, items, nonterminal):
    """Like lookahead_item_set_shift but over a nonterminal (taken
    after a reduction); complete items cannot react to a nonterminal so
    no reduction set is computed.  Returns (next_items,
    next_item_to_index, nonterminal0, nonterminal1) with the same
    interval semantics as lookahead_item_set_shift.
    """
    next_items = []
    next_item_to_index = {}
    reductions = set()
    nonterminal0 = 0
    nonterminal1 = len(self.productions)
    for i, j, lookahead_set in items:
      _, symbols, _, _ = self.productions[i]
      if j < len(symbols):
        _, nonterminal_set = symbols[j]
        k = bisect.bisect_right(nonterminal_set, nonterminal)
        if k > 0 and nonterminal0 < nonterminal_set[k - 1]:
          nonterminal0 = nonterminal_set[k - 1]
        if k < len(nonterminal_set) and nonterminal1 > nonterminal_set[k]:
          nonterminal1 = nonterminal_set[k]
        if (k & 1) == 1:
          next_item_to_index[(i, j + 1)] = len(next_items)
          next_items.append((i, j + 1, lookahead_set))
    return next_items, next_item_to_index, nonterminal0, nonterminal1

  def parse_text(self, text, i):
    """Parse `text` starting at index `i` by direct interpretation of
    the grammar, tracing recognized groups to stderr.  Returns None on
    acceptance of the start production, raises Exception on a syntax
    error.  Conflicts are reported to stderr and resolved in favour of
    shifting (shift/reduce) or the lowest production (reduce/reduce).
    """
    items = [(0, 0, [self.eof_terminal, self.eof_terminal + 1])]
    item_to_index = {(0, 0): 0}
    value_stack = []
    state_stack = []
    lookahead_character = ord(text[i]) if i < len(text) else self.eof_terminal
    while True:
      self.lookahead_item_set_closure(items, item_to_index)
      value_stack.append(i)
      state_stack.append(items)
      items, item_to_index, reductions, _, _ = (
        self.lookahead_item_set_shift(items, lookahead_character)
      )
      if len(items) != 0:
        if len(reductions) != 0:
          # bug fix: this referenced the undefined name next_items
          # (NameError on any shift/reduce conflict); after the call
          # above, `items` holds the shifted item set
          sys.stderr.write(
            'shift/reduce conflict: {0:s} vs {1:s}\n'.format(
              ','.join([str(j) for j, _, _ in items]),
              ','.join([str(j) for j in reductions])
            )
          )
        i += 1
        lookahead_character = ord(text[i]) if i < len(text) else self.eof_terminal
      elif len(reductions) != 0:
        if len(reductions) != 1:
          sys.stderr.write(
            'reduce/reduce conflict: {0:s}\n'.format(
              ','.join([str(j) for j in reductions])
            )
          )
        reduce = min(reductions)
        _, symbols, _, group_bounds = self.productions[reduce]
        base = len(value_stack) - len(symbols) - 1
        # apply group bounds in reverse so inner groups collapse first
        for j in range(len(group_bounds) - 1, -1, -1):
          k, l, tag, _ = group_bounds[j]
          k += base
          if l != 1:
            value_stack[k + 1:k + l + 1] = [value_stack[k + l]]
          sys.stderr.write(
            'text \'{0:s}\' tag \'{1:s}\'\n'.format(
              text[value_stack[k]:value_stack[k + 1]],
              tag
            )
          )
        del value_stack[base + 1:]
        del state_stack[base + 1:]
        if reduce == 0:
          # start production reduced: input accepted
          assert base == 0
          return
        items, item_to_index, _, _ = (
          self.lookahead_item_set_goto(state_stack[-1], reduce)
        )
        assert len(items) != 0
      else:
        raise Exception(
          'syntax error at {0:d}: {1:s}'.format(i, text[i:])
        )

  def parse_yychunk(self, root, pos, off, factory, yychunk_iter):
    """Parse element tree text under `root` starting at (pos, off),
    pulling more chunks from `yychunk_iter` as needed and applying
    markup for recognized groups via work.apply_markup()/factory.
    Returns None on acceptance, raises Exception on a syntax error.
    """
    if pos < 0:
      pos, off = element.to_start_relative(root, pos, off)

    items = [(0, 0, [self.eof_terminal, self.eof_terminal + 1])]
    item_to_index = {(0, 0): 0}
    value_stack = []
    state_stack = []
    text = element.get_text(root, pos)
    while off >= len(text):
      if pos < len(root):
        pos += 1
        off = 0
      else:
        try:
          next(yychunk_iter)
        except StopIteration:
          lookahead_character = self.eof_terminal
          break
        text = element.get_text(root, pos)
    else:
      lookahead_character = ord(text[off])
    while True:
      self.lookahead_item_set_closure(items, item_to_index)
      value_stack.append((pos, off))
      state_stack.append(items)
      items, item_to_index, reductions, _, _ = (
        self.lookahead_item_set_shift(items, lookahead_character)
      )
      if len(items) != 0:
        if len(reductions) != 0:
          # bug fix: this referenced the undefined name
          # next_lookahead_item_set (NameError on any shift/reduce
          # conflict); `items` now holds the shifted 3-tuples
          sys.stderr.write(
            'shift/reduce conflict: {0:s} vs {1:s}\n'.format(
              ','.join([str(j) for j, _, _ in items]),
              ','.join([str(j) for j in reductions])
            )
          )
        off += 1
        while off >= len(text):
          if pos < len(root):
            pos += 1
            off = 0
          else:
            try:
              next(yychunk_iter)
            except StopIteration:
              lookahead_character = self.eof_terminal
              break
            text = element.get_text(root, pos)
        else:
          lookahead_character = ord(text[off])
      elif len(reductions) != 0:
        if len(reductions) != 1:
          sys.stderr.write(
            'reduce/reduce conflict: {0:s}\n'.format(
              ','.join([str(j) for j in reductions])
            )
          )
        reduce = min(reductions)
        _, symbols, _, group_bounds = self.productions[reduce]
        base = len(value_stack) - len(symbols) - 1
        end_relative = len(value_stack)
        # apply group bounds in reverse, converting positions above the
        # insertion point to end-relative form so they survive markup
        for j in range(len(group_bounds) - 1, -1, -1):
          k, l, tag, kwargs = group_bounds[j]
          k += base
          assert k < end_relative
          if l != 1:
            value_stack[k + 1:k + l + 1] = [value_stack[k + l]]
            end_relative = max(k + 1, end_relative + 1 - l)
          while end_relative > k + 1:
            end_relative -= 1
            pos1, off1 = value_stack[end_relative]
            value_stack[end_relative] = (
              element.to_end_relative(root, pos1, off1)
            )
          pos0, off0 = value_stack[k]
          pos1, off1 = value_stack[k + 1]
          work.apply_markup(
            root,
            pos0,
            off0,
            pos1,
            off1,
            factory,
            tag,
            **kwargs
          )
        if end_relative < len(value_stack):
          # markup was applied: re-derive current position and text
          pos, off = value_stack[-1]
          pos, off = element.to_start_relative(root, pos, off)
          text = element.get_text(root, pos)
        del value_stack[base + 1:]
        del state_stack[base + 1:]
        if reduce == 0:
          assert base == 0
          return
        items, item_to_index, _, _ = (
          self.lookahead_item_set_goto(state_stack[-1], reduce)
        )
        assert len(items) != 0
      else:
        raise Exception(
          'syntax error at {0:d},{1:d}: {2:s}'.format(pos, off, text[off:])
        )

  def to_clr1(self):
    """Compile to a canonical LR(1) automaton (lr1dfa.LR1DFA): each
    distinct (item, lookahead set) combination becomes its own state.
    Conflicts are reported to stderr; shift wins over reduce and the
    lowest-numbered production wins among reduces.
    """
    _lr1dfa = lr1dfa.LR1DFA(
      [],
      [
        (len(symbols), group_bounds)
        for _, symbols, _, group_bounds in self.productions
      ],
      self.n_terminals,
      self.eof_terminal
    )

    items = [(0, 0, [self.eof_terminal, self.eof_terminal + 1])]
    item_to_index = {(0, 0): 0}
    self.lookahead_item_set_closure(items, item_to_index)

    items = sorted(items)
    # lookahead sets are part of the key: states with identical items
    # but different lookaheads stay separate (canonical LR(1))
    key = tuple((i, j, tuple(k)) for i, j, k in items)
    state_to_items = [items]
    items_to_state = {key: 0}

    while len(_lr1dfa.states) < len(state_to_items):
      items = state_to_items[len(_lr1dfa.states)]
      state_desc = ([], [], [], [])

      def add_state(next_items, next_item_to_index):
        # close, canonicalize and intern the successor item set,
        # returning its state number (allocating one if new)
        self.lookahead_item_set_closure(next_items, next_item_to_index)
        new_items = sorted(next_items)
        key = tuple((i, j, tuple(k)) for i, j, k in new_items)
        if key in items_to_state:
          state = items_to_state[key]
        else:
          state = len(state_to_items)
          state_to_items.append(new_items)
          items_to_state[key] = state
        return state

      terminal = 0
      while terminal < self.n_terminals:
        next_items, next_item_to_index, reductions, terminal0, terminal1 = (
          self.lookahead_item_set_shift(items, terminal)
        )
        assert terminal0 == terminal and terminal1 > terminal
        if len(next_items) != 0:
          if len(reductions) != 0:
            sys.stderr.write(
              'state {0:d} shift/reduce conflict: {1:s} vs {2:s}\n'.format(
                len(_lr1dfa.states),
                ','.join([str(i) for i, _, _ in next_items]),
                ','.join([str(i) for i in reductions])
              )
            )
          action = add_state(next_items, next_item_to_index) * 2
        elif len(reductions) != 0:
          if len(reductions) != 1:
            sys.stderr.write(
              'state {0:d} reduce/reduce conflict: {1:s}\n'.format(
                len(_lr1dfa.states),
                ','.join([str(i) for i in reductions])
              )
            )
          action = min(reductions) * 2 + 1 # odd means reduce
        else:
          action = -1
        state_desc[0].append(terminal1)
        state_desc[1].append(action)
        terminal = terminal1

      nonterminal = 0
      while nonterminal < len(self.productions):
        next_items, next_item_to_index, nonterminal0, nonterminal1 = (
          self.lookahead_item_set_goto(items, nonterminal)
        )
        assert nonterminal0 == nonterminal and nonterminal1 > nonterminal
        if len(next_items) != 0:
          goto = add_state(next_items, next_item_to_index)
        else:
          goto = -1
        state_desc[2].append(nonterminal1)
        state_desc[3].append(goto)
        nonterminal = nonterminal1

      _lr1dfa.states.append(state_desc)
    return _lr1dfa

  def to_lalr1(self):
    """Compile to an LALR(1) automaton (lr1dfa.LR1DFA): states are
    keyed on items only, lookahead sets of merged states are OR-ed
    together, and affected states are re-processed via a worklist until
    the lookaheads stabilize.  Conflict resolution as in to_clr1().
    """
    _lr1dfa = lr1dfa.LR1DFA(
      [],
      [
        (len(symbols), group_bounds)
        for _, symbols, _, group_bounds in self.productions
      ],
      self.n_terminals,
      self.eof_terminal
    )

    items = [(0, 0, [self.eof_terminal, self.eof_terminal + 1])]
    item_to_index = {(0, 0): 0}
    self.lookahead_item_set_closure(items, item_to_index)

    items = sorted(items)
    key = tuple((i, j) for i, j, _ in items) # ignore lookahead
    state_to_items = [items]
    items_to_state = {key: 0}

    in_queue = [True]
    queue = [0]

    qhead = 0
    while qhead < len(queue):
      i = queue[qhead]
      in_queue[i] = False
      items = state_to_items[i]
      state_desc = ([], [], [], [])

      def add_state(next_items, next_item_to_index):
        # intern successor keyed on items only; when merging into an
        # existing state, widen its lookahead sets and re-queue it if
        # anything changed
        self.lookahead_item_set_closure(next_items, next_item_to_index)
        new_items = sorted(next_items)
        key = tuple((i, j) for i, j, _ in new_items) # ignore lookahead
        if key in items_to_state:
          state = items_to_state[key]
          state_items = state_to_items[state]
          for n in range(len(new_items)):
            j, k, lookahead_set = new_items[n]
            lookahead_set = bisect_set.bisect_set_or(lookahead_set, state_items[n][2])
            if lookahead_set != state_items[n][2]:
              state_items[n] = (j, k, lookahead_set)
              if not in_queue[state]:
                in_queue[state] = True
                queue.append(state)
        else:
          state = len(state_to_items)
          state_to_items.append(new_items)
          items_to_state[key] = state
          in_queue.append(True)
          queue.append(state)
        return state

      terminal = 0
      while terminal < self.n_terminals:
        next_items, next_item_to_index, reductions, terminal0, terminal1 = (
          self.lookahead_item_set_shift(items, terminal)
        )
        assert terminal0 == terminal and terminal1 > terminal
        if len(next_items) != 0:
          if len(reductions) != 0:
            sys.stderr.write(
              'state {0:d} shift/reduce conflict: {1:s} vs {2:s}\n'.format(
                i,
                ','.join([str(j) for j, _, _ in next_items]),
                ','.join([str(j) for j in reductions])
              )
            )
          action = add_state(next_items, next_item_to_index) * 2
        elif len(reductions) != 0:
          if len(reductions) != 1:
            sys.stderr.write(
              'state {0:d} reduce/reduce conflict: {1:s}\n'.format(
                i,
                ','.join([str(j) for j in reductions])
              )
            )
          action = min(reductions) * 2 + 1 # odd means reduce
        else:
          action = -1
        state_desc[0].append(terminal1)
        state_desc[1].append(action)
        terminal = terminal1

      nonterminal = 0
      while nonterminal < len(self.productions):
        next_items, next_item_to_index, nonterminal0, nonterminal1 = (
          self.lookahead_item_set_goto(items, nonterminal)
        )
        assert nonterminal0 == nonterminal and nonterminal1 > nonterminal
        if len(next_items) != 0:
          goto = add_state(next_items, next_item_to_index)
        else:
          goto = -1
        state_desc[2].append(nonterminal1)
        state_desc[3].append(goto)
        nonterminal = nonterminal1

      # a re-queued state overwrites its previous table entry
      if i < len(_lr1dfa.states):
        _lr1dfa.states[i] = state_desc
      else:
        _lr1dfa.states.append(state_desc)
      qhead += 1
    return _lr1dfa

  def __repr__(self):
    return 'lr1.LR1({0:s}, {1:d}, {2:d})'.format(
      repr(self.productions),
      self.n_terminals,
      self.eof_terminal
    )
+
+
diff --git a/lr1dfa.py b/lr1dfa.py
new file mode 100644 (file)
index 0000000..71866be
--- /dev/null
+++ b/lr1dfa.py
@@ -0,0 +1,278 @@
+import bisect
+import element
+import work
+import sys
+
# defines the alphabet size, set this to 0x11000 for unicode
n_characters = 0x100

class LR1DFA:
  """A table-driven LR(1) parser.

  Actions and gotos are stored per state as sorted break lists plus
  value lists, looked up with bisect; tables are normally produced by
  lr1.LR1.to_clr1() or lr1.LR1.to_lalr1().
  """

  def __init__(
    self,
    states = None,
    productions = None,
    n_terminals = n_characters + 1,
    eof_terminal = n_characters
  ):
    # states: list of state_desc
    # state_desc: (terminal breaks, actions, nonterminal breaks, gotos)
    # action: shift = new state * 2, reduce = production * 2 + 1, error = -1
    # goto: reduce = production, error = -1 (but error can't really happen)
    # productions: list of production
    # production: (len(symbols), group_bounds)
    # len(symbols): how many states to pop stack to reduce this production
    # group_bounds: list of group_bound
    # group_bound: (start_index, end_index, tag, kwargs)
    #   where start_index, end_index are indices into list of character_set,
    #   and tag, kwargs will be passed to apply_markup() hence factory(),
    #   noting that markup has to be applied in reverse order of the list
    # n_terminals: offset to apply to productions[] index to get symbol
    #   (character set code), also symbol for productions[0] = start production
    # eof_terminal: usually == n_terminals - 1 (must be valid terminal value)
    # defaults of None (instead of shared mutable []) give each
    # LR1DFA() instance its own fresh lists
    self.states = [] if states is None else states
    self.productions = [] if productions is None else productions
    self.n_terminals = n_terminals
    self.eof_terminal = eof_terminal

  def parse_text(self, text, i):
    """Parse `text` from index `i` with the action/goto tables, tracing
    recognized groups to stderr.  Returns None on acceptance of the
    start production, raises Exception on a syntax error.
    """
    state = 0
    value_stack = []
    state_stack = []
    lookahead_character = ord(text[i]) if i < len(text) else self.eof_terminal
    while True:
      value_stack.append(i)
      state_stack.append(state)
      # break list lookup: actions[bisect(breaks, c)] covers character c
      action = self.states[state][1][
        bisect.bisect_right(self.states[state][0], lookahead_character)
      ]
      if action == -1:
        raise Exception(
          'syntax error at {0:d}: {1:s}'.format(i, text[i:])
        )
      if (action & 1) == 0:
        state = action >> 1
        i += 1
        lookahead_character = ord(text[i]) if i < len(text) else self.eof_terminal
      else:
        reduce = action >> 1
        len_symbols, group_bounds = self.productions[reduce]
        base = len(value_stack) - len_symbols - 1
        # apply group bounds in reverse so inner groups collapse first
        for j in range(len(group_bounds) - 1, -1, -1):
          k, l, tag, _ = group_bounds[j]
          k += base
          if l != 1:
            value_stack[k + 1:k + l + 1] = [value_stack[k + l]]
          # consistency fix: this diagnostic went to stdout while the
          # identical trace in lr1.LR1.parse_text uses stderr
          sys.stderr.write(
            'text \'{0:s}\' tag \'{1:s}\'\n'.format(
              text[value_stack[k]:value_stack[k + 1]],
              tag
            )
          )
        del value_stack[base + 1:]
        del state_stack[base + 1:]
        if reduce == 0:
          # start production reduced: input accepted
          assert base == 0
          return
        state = self.states[state_stack[-1]][3][
          bisect.bisect_right(self.states[state_stack[-1]][2], reduce)
        ]
        assert state != -1

  def parse_yychunk(self, root, pos, off, factory, yychunk_iter):
    """Parse element tree text under `root` from (pos, off), pulling
    further chunks from `yychunk_iter` as needed and applying markup
    for recognized groups via work.apply_markup()/factory.  Returns
    None on acceptance, raises Exception on a syntax error.
    """
    if pos < 0:
      pos, off = element.to_start_relative(root, pos, off)

    state = 0
    value_stack = []
    state_stack = []
    text = element.get_text(root, pos)
    while off >= len(text):
      if pos < len(root):
        pos += 1
        off = 0
      else:
        try:
          next(yychunk_iter)
        except StopIteration:
          lookahead_character = self.eof_terminal
          break
        text = element.get_text(root, pos)
    else:
      lookahead_character = ord(text[off])
    while True:
      value_stack.append((pos, off))
      state_stack.append(state)
      action = self.states[state][1][
        bisect.bisect_right(self.states[state][0], lookahead_character)
      ]
      #print('lookahead_character', lookahead_character, 'action', action)
      if action == -1:
        raise Exception(
          'syntax error at {0:d},{1:d}: {2:s}'.format(pos, off, text[off:])
        )
      if (action & 1) == 0:
        state = action >> 1
        off += 1
        while off >= len(text):
          if pos < len(root):
            pos += 1
            off = 0
          else:
            try:
              next(yychunk_iter)
            except StopIteration:
              lookahead_character = self.eof_terminal
              break
            text = element.get_text(root, pos)
        else:
          lookahead_character = ord(text[off])
      else:
        reduce = action >> 1
        len_symbols, group_bounds = self.productions[reduce]
        base = len(value_stack) - len_symbols - 1
        end_relative = len(value_stack)
        # apply group bounds in reverse, converting positions above the
        # insertion point to end-relative form so they survive markup
        for j in range(len(group_bounds) - 1, -1, -1):
          k, l, tag, kwargs = group_bounds[j]
          k += base
          assert k < end_relative
          if l != 1:
            value_stack[k + 1:k + l + 1] = [value_stack[k + l]]
            end_relative = max(k + 1, end_relative + 1 - l)
          while end_relative > k + 1:
            end_relative -= 1
            pos1, off1 = value_stack[end_relative]
            value_stack[end_relative] = (
              element.to_end_relative(root, pos1, off1)
            )
          pos0, off0 = value_stack[k]
          pos1, off1 = value_stack[k + 1]
          work.apply_markup(
            root,
            pos0,
            off0,
            pos1,
            off1,
            factory,
            tag,
            **kwargs
          )
        if end_relative < len(value_stack):
          # markup was applied: re-derive current position and text
          pos, off = value_stack[-1]
          pos, off = element.to_start_relative(root, pos, off)
          text = element.get_text(root, pos)
        del value_stack[base + 1:]
        del state_stack[base + 1:]
        if reduce == 0:
          assert base == 0
          return
        state = self.states[state_stack[-1]][3][
          bisect.bisect_right(self.states[state_stack[-1]][2], reduce)
        ]
        assert state != -1

  def yyparse(self, root, pos, off, factory, yylex_iter):
    """Like parse_yychunk but driven by a lexer: `yylex_iter` yields
    (end_pos, end_off, token) triples instead of raw characters.
    Returns None on acceptance, raises Exception on a syntax error.
    """
    if pos < 0:
      pos, off = element.to_start_relative(root, pos, off)

    state = 0
    value_stack = []
    state_stack = []
    try:
      end_pos, end_off, lookahead_character = next(yylex_iter)
    except StopIteration:
      lookahead_character = self.eof_terminal
      end_pos, end_off = element.to_end_relative(root, pos, off)
    while True:
      value_stack.append((pos, off))
      state_stack.append(state)
      action = self.states[state][1][
        bisect.bisect_right(self.states[state][0], lookahead_character)
      ]
      #print('lookahead_character', lookahead_character, 'action', action)
      if action == -1:
        raise Exception(
          'syntax error at {0:d},{1:d}: {2:d}'.format(pos, off, lookahead_character)
        )
      if (action & 1) == 0:
        state = action >> 1
        pos, off = element.to_start_relative(root, end_pos, end_off)
        try:
          end_pos, end_off, lookahead_character = next(yylex_iter)
        except StopIteration:
          lookahead_character = self.eof_terminal
          #end_pos, end_off = element.to_end_relative(root, pos, off)
      else:
        reduce = action >> 1
        len_symbols, group_bounds = self.productions[reduce]
        base = len(value_stack) - len_symbols - 1
        end_relative = len(value_stack)
        for j in range(len(group_bounds) - 1, -1, -1):
          k, l, tag, kwargs = group_bounds[j]
          k += base
          assert k < end_relative
          if l != 1:
            value_stack[k + 1:k + l + 1] = [value_stack[k + l]]
            end_relative = max(k + 1, end_relative + 1 - l)
          while end_relative > k + 1:
            end_relative -= 1
            pos1, off1 = value_stack[end_relative]
            value_stack[end_relative] = (
              element.to_end_relative(root, pos1, off1)
            )
          pos0, off0 = value_stack[k]
          pos1, off1 = value_stack[k + 1]
          work.apply_markup(
            root,
            pos0,
            off0,
            pos1,
            off1,
            factory,
            tag,
            **kwargs
          )
        if end_relative < len(value_stack):
          pos, off = value_stack[-1]
          pos, off = element.to_start_relative(root, pos, off)
        del value_stack[base + 1:]
        del state_stack[base + 1:]
        if reduce == 0:
          assert base == 0
          return
        state = self.states[state_stack[-1]][3][
          bisect.bisect_right(self.states[state_stack[-1]][2], reduce)
        ]
        assert state != -1

  def __repr__(self):
    return 'lr1dfa.LR1DFA({0:s}, {1:s}, {2:d}, {3:d})'.format(
      repr(self.states),
      repr(self.productions),
      self.n_terminals,
      self.eof_terminal
    )
+
def wrap_repr(text, width):
  """Wrap a long repr() string at roughly `width` columns.

  Each line is cut just after the rightmost opening bracket or dot
  within the window, or just before the rightmost closing bracket or
  space (which then starts the next line); leading spaces at each
  continuation are skipped.  Every emitted line ends with a newline.
  """
  out = []
  pos = 0
  size = len(text)
  while pos < size:
    cut = pos + width
    if cut < size:
      # prefer to break after '(', '[', '{', '.' (hence the + 1) or
      # before ')', ']', '}', ' ' found within the window
      cut = max(
        text.rfind('(', pos, cut) + 1,
        text.rfind('[', pos, cut) + 1,
        text.rfind('{', pos, cut) + 1,
        text.rfind('.', pos, cut) + 1,
        text.rfind(')', pos, cut + 1),
        text.rfind(']', pos, cut + 1),
        text.rfind('}', pos, cut + 1),
        text.rfind(' ', pos, cut + 1)
      )
      assert cut > 0
    out.append(text[pos:cut] + '\n')
    pos = cut
    while text[pos:pos + 1] == ' ':
      pos += 1
  return ''.join(out)
diff --git a/nfa.py b/nfa.py
new file mode 100644 (file)
index 0000000..e6076b2
--- /dev/null
+++ b/nfa.py
@@ -0,0 +1,446 @@
+import bisect
+import dfa
+import element
+
+# defines the alphabet size, set this to 0x11000 for unicode
+n_characters = 0x100
+
+class NFA:
+  # state_desc classes:
+  # (STATE_CHARACTER, character_set, next_state)
+  # (STATE_OR, next_state0, next_state1)
+  # (STATE_AND, next_state0, next_state1)
+  # (STATE_JOIN0,)
+  # (STATE_JOIN1, next_state)
+  # (STATE_MARK, mark_value, next_state)
+
+  STATE_CHARACTER = 0
+  STATE_OR = 1
+  STATE_AND = 2
+  STATE_JOIN0 = 3
+  STATE_JOIN1 = 4
+  STATE_MARK = 5
+  join0_state = (STATE_JOIN0,)
+
+  # multistate classes:
+  # (MULTISTATE_ACCEPT, 1)             accept, occupies one thread in list
+  # (MULTISTATE_AND, n, state, child)  and, occupies n threads in list
+  # (MULTISTATE_OR, n, child0, child1)
+  # n = sum of n from child states, that is, multistate[1] is the number of
+  # (MULTISTATE_ACCEPT, 1) leaf states ultimately reachable from this subtree
+
+  MULTISTATE_ACCEPT = 0
+  MULTISTATE_AND = 1
+  MULTISTATE_OR = 2
+  accept_multistate = (MULTISTATE_ACCEPT, 1)
+
+  def __init__(
+    self,
+    groups = [],
+    states = [(STATE_CHARACTER, [0, n_characters], 0)],
+    start_state = [] # can have multiple NFAs in same container
+  ):
+    # groups: list of group_desc
+    # group_desc: (tag, kwargs)
+    #   tag, kwargs will be passed to apply_markup() hence factory()
+    self.groups = groups
+    self.states = states
+    self.start_state = start_state
+
+  def multistate_next(self, root_multistate, character):
+    # the deduplication works as effectively a second pass which goes
+    # over the multistate tree in pre-order, looking for OR-disjunctions
+    # of any depth and configuration, e.g. (a OR b) or (c OR d), and
+    # collecting the subexpressions e.g. a, b, c OR b, c, d, c OR d, and
+    # failing any subexpression that's previously occurred in pre-order
+    # (children of AND-conjunctions get a fresh empty subexpression set)
+
+    # unfortunately it can't be done as a second pass literally, because
+    # we have to generate the transition list, thus we need to know if a
+    # subexpression is going to be failed, so we can delete its threads
+
+    # therefore the deduplication is tacked onto the end of the building
+    # process, examining each built node just before the builder returns,
+    # if already in the subexpression set we will rewrite the transition
+    # list produced by the recursive calls, otherwise we will add it in
+
+    # the problem is determining the subexpression set to pass into any
+    # recursive calls, the caller's set will be sent unchanged by cases
+    # that return the result unchanged or add an OR-disjunction, an empty
+    # set will be sent by cases that add an AND-conjunction to the result
+
+    # if we have added something to the node built by a recursive call,
+    # we'll fall into the deduplication logic, otherwise just return it
+
+    def advance1(multistate, join_count, done_multistates):
+      # modifies nonlocal: transition
+      assert multistate[0] == NFA.MULTISTATE_AND
+      _, _, state, child = multistate
+      if state == 0:
+        # note that if we reach the accepting state, we must be a top-level
+        # expression, and could not be part of an AND-conjunction (because
+        # AND-conjunction terms always go to a join0 or join1 state first),
+        # we remove ourselves to indicate to scanner that match is complete
+        assert join_count == 0
+        assert child == NFA.accept_multistate
+        return advance(child, 0, done_multistates)
+      state_desc = self.states[state]
+      if state_desc[0] == NFA.STATE_CHARACTER:
+        if join_count != 0:
+          transition.append((dfa.DFA.TRANSITION_POP, child[1]))
+          return None
+        len_transition = len(transition)
+        child = advance(child, 0, set())
+        if child is None:
+          return None
+        result = (NFA.MULTISTATE_AND, child[1], state, child)
+      elif state_desc[0] == NFA.STATE_OR:
+        _, next_state0, next_state1 = state_desc
+        len_transition = len(transition)
+        transition.append((dfa.DFA.TRANSITION_DUP, child[1]))
+        child0 = advance1(
+          (NFA.MULTISTATE_AND, child[1], next_state0, child),
+          join_count,
+          done_multistates
+        )
+        child1 = advance1(
+          (NFA.MULTISTATE_AND, child[1], next_state1, child),
+          join_count,
+          done_multistates
+        )
+        if child0 is None:
+          return child1
+        if child1 is None:
+          return child0
+        result = (NFA.MULTISTATE_OR, child0[1] + child1[1], child0, child1)
+      elif state_desc[0] == NFA.STATE_AND:
+        _, next_state0, next_state1 = state_desc
+        return advance1(
+          (
+            NFA.MULTISTATE_AND,
+            child[1],
+            next_state0,
+            (NFA.MULTISTATE_AND, child[1], next_state1, child)
+          ),
+          join_count,
+          done_multistates
+        )
+      elif state_desc[0] == NFA.STATE_JOIN0:
+        return advance(child, join_count + 1, done_multistates)
+      elif state_desc[0] == NFA.STATE_JOIN1:
+        _, next_state = state_desc
+        if join_count == 0:
+          transition.append((dfa.DFA.TRANSITION_POP, child[1]))
+          return None
+        return advance1(
+          (NFA.MULTISTATE_AND, child[1], next_state, child),
+          join_count - 1,
+          done_multistates
+        )
+      elif state_desc[0] == NFA.STATE_MARK:
+        _, mark_value, next_state = state_desc
+        transition.append((dfa.DFA.TRANSITION_MARK, child[1], mark_value))
+        return advance1(
+          (NFA.MULTISTATE_AND, child[1], next_state, child),
+          join_count,
+          done_multistates
+        )
+      else:
+        assert False
+      if result in done_multistates:
+        transition[len_transition:] = [(dfa.DFA.TRANSITION_POP, multistate[1])]
+        return None
+      done_multistates.add(result)
+      return result
+
+    def advance(multistate, join_count, done_multistates):
+      nonlocal character0, character1 # modifies nonlocal: transition
+      if multistate[0] == NFA.MULTISTATE_ACCEPT:
+        assert join_count == 0
+        len_transition = len(transition)
+        transition.append((dfa.DFA.TRANSITION_MOVE, 1))
+        result = NFA.accept_multistate # takes no arguments so use static one
+      elif multistate[0] == NFA.MULTISTATE_AND:
+        if character >= 0:
+          _, _, state, child = multistate
+          state_desc = self.states[state]
+          assert state_desc[0] == NFA.STATE_CHARACTER
+          _, character_set, next_state = state_desc
+          k = bisect.bisect_right(character_set, character)
+          if k > 0 and character0 < character_set[k - 1]:
+            character0 = character_set[k - 1]
+          if k < len(character_set) and character1 > character_set[k]:
+            character1 = character_set[k]
+          if (k & 1) == 0:
+            transition.append((dfa.DFA.TRANSITION_POP, child[1]))
+            return None
+          multistate = (NFA.MULTISTATE_AND, child[1], next_state, child)
+        return advance1(multistate, join_count, done_multistates)
+      elif multistate[0] == NFA.MULTISTATE_OR:
+        _, _, child0, child1 = multistate
+        len_transition = len(transition)
+        child0 = advance(child0, join_count, done_multistates)
+        child1 = advance(child1, join_count, done_multistates)
+        if child0 is None:
+          return child1
+        if child1 is None:
+          return child0
+        result = (NFA.MULTISTATE_OR, child0[1] + child1[1], child0, child1)
+      else:
+        assert False
+      if result in done_multistates:
+        transition[len_transition:] = [(dfa.DFA.TRANSITION_POP, multistate[1])]
+        return None
+      done_multistates.add(result)
+      return result
+
+    transition = []
+    character0 = 0
+    character1 = n_characters
+    root_multistate = advance(root_multistate, 0, set())
+    return root_multistate, transition, character0, character1
+
+  def multistate_accept(root_multistate):
+    i = 0
+    def accept(multistate):
+      nonlocal i
+      if multistate[0] == NFA.MULTISTATE_ACCEPT:
+        return True
+      if multistate[0] == NFA.MULTISTATE_AND:
+        _, _, _, child = multistate
+        i += child[1]
+        return False
+      if multistate[0] == NFA.MULTISTATE_OR:
+        _, _, child0, child1 = multistate
+        return accept(child0) or accept(child1)
+      assert False
+    return i if accept(root_multistate) else -1
+
+  def match_text(self, text, i, start_index = 0):
+    def transit(transition):
+      nonlocal threads0, threads1, prefix_slop # note: also uses i
+      j = prefix_slop
+      for trans in transition:
+        if trans[0] == dfa.DFA.TRANSITION_POP:
+          j += trans[1]
+        elif trans[0] == dfa.DFA.TRANSITION_DUP:
+          while j < trans[1]:
+            threads0[:0] = [None] * prefix_slop
+            threads1[:0] = [None] * prefix_slop
+            j += prefix_slop
+            prefix_slop *= 2
+          threads0[j - trans[1]:j] = threads0[j:j + trans[1]]
+          j -= trans[1]
+        elif trans[0] == dfa.DFA.TRANSITION_MARK:
+          threads0[j:j + trans[1]] = [
+            (i, trans[2], thread)
+            for thread in threads0[j:j + trans[1]]
+          ]
+        elif trans[0] == dfa.DFA.TRANSITION_MOVE:
+          threads1.extend(threads0[j:j + trans[1]])
+          j += trans[1]
+        #elif trans[0] == dfa.DFA.TRANSITION_DEL:
+        #  del threads1[-trans[1]:]
+        else:
+          assert False
+      assert j == len(threads0)
+      threads0, threads1 = threads1, threads0
+      del threads1[prefix_slop:]
+
+    threads0 = [None, None]
+    threads1 = [None]
+    prefix_slop = 1
+
+    start_state = self.start_state[start_index]
+    if start_state == -1:
+      return None
+    next_multistate, transition, _, _ = self.multistate_next(
+      (NFA.MULTISTATE_AND, 1, start_state, NFA.accept_multistate),
+      -1
+    )
+    while next_multistate is not None:
+      transit(transition)
+      assert len(threads0) == prefix_slop + next_multistate[1]
+      if next_multistate == NFA.accept_multistate:
+        # there is only one match, which is complete
+        assert len(threads0) == prefix_slop + 1
+        return threads0[prefix_slop]
+      if i >= len(text):
+        # return best match we have, but not incomplete match
+        i = NFA.multistate_accept(next_multistate)
+        return (None if i == -1 else threads0[prefix_slop + i])
+      next_multistate, transition, _, _ = (
+        self.multistate_next(next_multistate, ord(text[i]))
+      )
+      i += 1
+    return None
+
+  def match_yychunk(self, root, pos, off, yychunk_iter, start_index = 0):
+    if pos < 0:
+      pos, off = element.to_start_relative(root, pos, off)
+
+    def transit(transition):
+      nonlocal threads0, threads1, prefix_slop # note: also uses pos, off
+      j = prefix_slop
+      for trans in transition:
+        if trans[0] == dfa.DFA.TRANSITION_POP:
+          j += trans[1]
+        elif trans[0] == dfa.DFA.TRANSITION_DUP:
+          while j < trans[1]:
+            threads0[:0] = [None] * prefix_slop
+            threads1[:0] = [None] * prefix_slop
+            j += prefix_slop
+            prefix_slop *= 2
+          threads0[j - trans[1]:j] = threads0[j:j + trans[1]]
+          j -= trans[1]
+        elif trans[0] == dfa.DFA.TRANSITION_MARK:
+          threads0[j:j + trans[1]] = [
+            (pos, off, trans[2], thread)
+            for thread in threads0[j:j + trans[1]]
+          ]
+        elif trans[0] == dfa.DFA.TRANSITION_MOVE:
+          threads1.extend(threads0[j:j + trans[1]])
+          j += trans[1]
+        #elif trans[0] == dfa.DFA.TRANSITION_DEL:
+        #  del threads1[-trans[1]:]
+        else:
+          assert False
+      assert j == len(threads0)
+      threads0, threads1 = threads1, threads0
+      del threads1[prefix_slop:]
+
+    threads0 = [None, None]
+    threads1 = [None]
+    prefix_slop = 1
+
+    start_state = self.start_state[start_index]
+    if start_state == -1:
+      return None
+    next_multistate, transition, _, _ = self.multistate_next(
+      (NFA.MULTISTATE_AND, 1, start_state, NFA.accept_multistate),
+      -1
+    )
+    text = element.get_text(root, pos)
+    while next_multistate is not None:
+      transit(transition)
+      assert len(threads0) == prefix_slop + next_multistate[1]
+      if next_multistate == NFA.accept_multistate:
+        # there is only one match, which is complete
+        assert len(threads0) == prefix_slop + 1
+        return threads0[prefix_slop]
+      while off >= len(text):
+        if pos < len(root):
+          pos += 1
+          off = 0
+        else:
+          try:
+            next(yychunk_iter)
+          except StopIteration:
+            # return best match we have, but not incomplete match
+            i = NFA.multistate_accept(next_multistate)
+            return (None if i == -1 else threads0[prefix_slop + i])
+          text = element.get_text(root, pos)
+      next_multistate, transition, _, _ = (
+        self.multistate_next(next_multistate, ord(text[off]))
+      )
+      off += 1
+    return None
+
+  def to_dfa(self):
+    dfa = dfa.DFA(list(self.groups))
+
+    accept_key = (NFA.accept_multistate, ())
+    action_to_meaning = [accept_key]
+    meaning_to_action = {accept_key: 0}
+    state_to_meaning = [NFA.accept_multistate]
+    meaning_to_state = {NFA.accept_multistate: 0}
+
+    for start_state in self.start_state:
+      if start_state == -1:
+        start_action = -1
+      else:
+        next_multistate, transition, _, _ = self.multistate_next(
+          (NFA.MULTISTATE_AND, 1, start_state, NFA.accept_multistate),
+          -1
+        )
+        if next_multistate is None:
+          start_action = -1
+        else:
+          start_key = (next_multistate, tuple(transition))
+          start_action = len(action_to_meaning)
+          meaning_to_action[start_key] = start_action
+          action_to_meaning.append(start_key)
+      dfa.start_action.append(start_action)
+
+    while len(dfa.actions) < len(action_to_meaning):
+      next_multistate, transition = action_to_meaning[len(dfa.actions)]
+      if next_multistate in meaning_to_state:
+        next_state = meaning_to_state[next_multistate]
+      else:
+        next_state = len(state_to_meaning)
+        state_to_meaning.append(next_multistate)
+        meaning_to_state[next_multistate] = next_state
+      dfa.actions.append((next_state, list(transition)))
+
+      while len(dfa.states) < len(state_to_meaning):
+        character = 0
+        multistate = state_to_meaning[len(dfa.states)]
+        state_desc = ([], [], NFA.multistate_accept(multistate))
+        while character < n_characters:
+          next_multistate, transition, character0, character1 = self.multistate_next(
+            multistate,
+            character
+          )
+          assert character0 == character and character1 > character
+          if next_multistate is None:
+            action = -1
+          else:
+            # optimize transition (optional)
+            i = 0
+            j = 0
+            while i < len(transition):
+              if transition[i][0] == dfa.DFA.TRANSITION_POP:
+                n = transition[i][1]
+                i += 1
+                while (
+                  i < len(transition) and
+                  transition[i][0] == dfa.DFA.TRANSITION_POP
+                ):
+                  n += transition[i][1]
+                  i += 1
+                transition[j] = (dfa.DFA.TRANSITION_POP, n)
+              elif transition[i][0] == dfa.DFA.TRANSITION_MOVE:
+                n = transition[i][1]
+                i += 1
+                while (
+                  i < len(transition) and
+                  transition[i][0] == dfa.DFA.TRANSITION_MOVE
+                ):
+                  n += transition[i][1]
+                  i += 1
+                transition[j] = (dfa.DFA.TRANSITION_MOVE, n)
+              else:
+                transition[j] = transition[i]
+                i += 1
+              j += 1
+            del transition[j:]
+            # end optimize transition
+            key = (next_multistate, tuple(transition))
+            if key in meaning_to_action:
+              action = meaning_to_action[key]
+            else:
+              action = len(action_to_meaning)
+              action_to_meaning.append(key)
+              meaning_to_action[key] = action
+          state_desc[0].append(character1)
+          state_desc[1].append(action)
+          character = character1
+        dfa.states.append(state_desc)
+    return dfa
+
+  def __repr__(self):
+    return 'nfa.NFA({0:s}, {1:s}, {2:s})'.format(
+      repr(self.groups),
+      repr(self.states),
+      repr(self.start_state)
+    )
index 96c65d5..44d573c 100644 (file)
--- a/regex.py
+++ b/regex.py
@@ -1,80 +1,10 @@
-import bisect
+import bisect_set
 import element
-import work
-import sys
+import nfa
 
 # defines the alphabet size, set this to 0x11000 for unicode
 n_characters = 0x100
 
-def character_set_or(character_set0, character_set1):
-  # calculate union of the child sets
-  # we do this by calculating a series of breakpoints, at each breakpoint
-  # evaluating the "or" (max) of the even/odd truth values of each child,
-  # then making the output truth value even/odd by outputting if necessary
-  result = []
-  i = 0
-  j = 0
-  while True:
-    if i < len(character_set0):
-      k = character_set0[i]
-      if j < len(character_set1):
-        k = min(k, character_set1[j])
-    elif j < len(character_set1):
-      k = character_set1[j]
-    else:
-      break
-    if i < len(character_set0) and character_set0[i] == k:
-      i += 1
-    if j < len(character_set1) and character_set1[j] == k:
-      j += 1
-    if (len(result) & 1) != max(i & 1, j & 1):
-      result.append(k)
-  assert (i & 1) == 0 and (j & 1) == 0
-  return result
-
-def character_set_and(character_set0, character_set1):
-  # calculate intersection of the child sets
-  # we do this by calculating a series of breakpoints, at each breakpoint
-  # evaluating the "and" (min) of the even/odd truth values of each child,
-  # then making the output truth value even/odd by outputting if necessary
-  result = []
-  i = 0
-  j = 0
-  while True:
-    if i < len(character_set0):
-      k = character_set0[i]
-      if j < len(character_set1):
-        k = min(k, character_set1[j])
-    elif j < len(character_set1):
-      k = character_set1[j]
-    else:
-      break
-    if i < len(character_set0) and character_set0[i] == k:
-      i += 1
-    if j < len(character_set1) and character_set1[j] == k:
-      j += 1
-    if (len(result) & 1) != min(i & 1, j & 1):
-      result.append(k)
-  assert (i & 1) == 0 and (j & 1) == 0
-  return result
-
-def character_set_not(character_set):
-  # calculate complement of the child set
-  # if child set begins with [0], remove it, otherwise add [0] prefix
-  # if child set ends with [n_characters], remove it, otherwise add [n_characters] suffix
-  # the suffix part is not totally necessary, but makes sure length is even
-  # (the evenness is so that single character sets can always be [c, c + 1])
-  result = list(character_set)
-  if result[:1] == [0]:
-    del result[:1]
-  else:
-    result[:0] = [0]
-  if result[-1:] == [n_characters]:
-    del result[-1:]
-  else:
-    result.append(n_characters)
-  return result
-
 class Regex(element.Element):
   # GENERATE ELEMENT() BEGIN
   def __init__(
@@ -237,7 +167,7 @@ class RegexCharacter(Regex):
   # GENERATE END
   def to_nfa_state(self, nfa, next_state):
     new_state = len(nfa.states)
-    nfa.states.append((NFA.STATE_CHARACTER, self.character_set, next_state))
+    nfa.states.append((nfa.NFA.STATE_CHARACTER, self.character_set, next_state))
     return new_state
   #def to_lr1_symbols(self, n_terminals, symbols, lookaheads, group_bounds):
   #  terminal_set = []
@@ -326,7 +256,7 @@ class RegexCharacterOr(RegexCharacter):
   # GENERATE END
   def post_process(self, group_index = 0): #, rule_name_to_character_set = None):
     group_index = RegexCharacter.post_process(self, group_index) #, rule_name_to_character_set)
-    self.character_set = character_set_or(self[0].character_set, self[1].character_set)
+    self.character_set = bisect_set.bisect_set_or(self[0].character_set, self[1].character_set)
     return group_index
 
 class RegexCharacterAnd(RegexCharacter):
@@ -360,7 +290,7 @@ class RegexCharacterAnd(RegexCharacter):
   # GENERATE END
   def post_process(self, group_index = 0): #, rule_name_to_character_set = None):
     group_index = RegexCharacter.post_process(self, group_index) #, rule_name_to_character_set)
-    self.character_set = character_set_and(self[0].character_set, self[1].character_set)
+    self.character_set = bisect_set.bisect_set_and(self[0].character_set, self[1].character_set)
     return group_index
 
 class RegexCharacterNot(RegexCharacter):
@@ -394,7 +324,7 @@ class RegexCharacterNot(RegexCharacter):
   # GENERATE END
   def post_process(self, group_index = 0): #, rule_name_to_character_set = None):
     group_index = RegexCharacter.post_process(self, group_index) #, rule_name_to_character_set)
-    self.character_set = character_set_not(self[0].character_set)
+    self.character_set = bisect_set.bisect_set_not(self[0].character_set)
     return group_index
 
 #class RegexCharacterRule(RegexCharacter):
@@ -481,7 +411,7 @@ class RegexOr(Regex):
     if child1_state == -1:
       return child0_state
     new_state = len(nfa.states)
-    nfa.states.append((NFA.STATE_OR, child0_state, child1_state))
+    nfa.states.append((nfa.NFA.STATE_OR, child0_state, child1_state))
     return new_state
 
 class RegexAnd(Regex):
@@ -513,9 +443,9 @@ class RegexAnd(Regex):
   # GENERATE END
   def to_nfa_state(self, nfa, next_state):
     join0_state = len(nfa.states)
-    nfa.states.append(NFA.join0_state) # takes no arguments so use static one
+    nfa.states.append(nfa.NFA.join0_state) # takes no arguments so use static one
     join1_state = len(nfa.states)
-    nfa.states.append((NFA.STATE_JOIN1, next_state))
+    nfa.states.append((nfa.NFA.STATE_JOIN1, next_state))
     child0_state = self[0].to_nfa_state(nfa, join0_state)
     if child0_state == -1:
       return -1
@@ -523,7 +453,7 @@ class RegexAnd(Regex):
     if child1_state == -1:
       return -1
     new_state = len(nfa.states)
-    nfa.states.append((NFA.STATE_AND, child0_state, child1_state))
+    nfa.states.append((nfa.NFA.STATE_AND, child0_state, child1_state))
     return new_state
 
 class RegexSequence(Regex):
@@ -655,9 +585,9 @@ class RegexRepeat(Regex):
         new_state = next_state # note: unreachable state remains invalid (None)
       else:
         nfa.states[new_state] = (
-          (NFA.STATE_OR, next_state, child_state)
+          (nfa.NFA.STATE_OR, next_state, child_state)
         if self.non_greedy else
-          (NFA.STATE_OR, child_state, next_state)
+          (nfa.NFA.STATE_OR, child_state, next_state)
         )
     else:
       new_state = next_state
@@ -667,9 +597,9 @@ class RegexRepeat(Regex):
           break
         new_state = len(nfa.states)
         nfa.states.append(
-          (NFA.STATE_OR, next_state, child_state)
+          (nfa.NFA.STATE_OR, next_state, child_state)
         if self.non_greedy else
-          (NFA.STATE_OR, child_state, next_state)
+          (nfa.NFA.STATE_OR, child_state, next_state)
         )
     for i in range(count0):
       new_state = self[0].to_nfa_state(nfa, new_state)
@@ -819,12 +749,12 @@ class RegexGroup(Regex):
     return Regex.to_groups(self, groups)
   def to_nfa_state(self, nfa, next_state):
     mark_state = len(nfa.states)
-    nfa.states.append((NFA.STATE_MARK, self.group_index * 2 + 1, next_state))
+    nfa.states.append((nfa.NFA.STATE_MARK, self.group_index * 2 + 1, next_state))
     child_state = self[0].to_nfa_state(nfa, mark_state)
     if child_state == -1:
       return -1
     new_state = len(nfa.states)
-    nfa.states.append((NFA.STATE_MARK, self.group_index * 2, child_state))
+    nfa.states.append((nfa.NFA.STATE_MARK, self.group_index * 2, child_state))
     return new_state
   #def to_lr1_symbols(self, n_terminals, symbols, lookaheads, group_bounds):
   #  group_start = len(symbols)
@@ -845,314 +775,6 @@ class RegexGroup(Regex):
   #  )
   #  return 1 # count of groups or ungrouped characters
 
-class Grammar(element.Element):
-  class Production(element.Element):
-    class Symbol(element.Element):
-      # GENERATE ELEMENT(list(int) terminal_set, list(int) nonterminal_set) BEGIN
-      def __init__(
-        self,
-        tag = 'Grammar_Production_Symbol',
-        attrib = {},
-        text = '',
-        children = [],
-        terminal_set = [],
-        nonterminal_set = []
-      ):
-        element.Element.__init__(
-          self,
-          tag,
-          attrib,
-          text,
-          children
-        )
-        self.terminal_set = (
-          [element.deserialize_int(i) for i in terminal_set.split()]
-        if isinstance(terminal_set, str) else
-          terminal_set
-        )
-        self.nonterminal_set = (
-          [element.deserialize_int(i) for i in nonterminal_set.split()]
-        if isinstance(nonterminal_set, str) else
-          nonterminal_set
-        )
-      def serialize(self, ref_list, indent = 0):
-        element.Element.serialize(self, ref_list, indent)
-        self.set(
-          'terminal_set',
-          ' '.join([element.serialize_int(i) for i in self.terminal_set])
-        )
-        self.set(
-          'nonterminal_set',
-          ' '.join([element.serialize_int(i) for i in self.nonterminal_set])
-        )
-      def deserialize(self, ref_list):
-        element.Element.deserialize(self, ref_list)
-        self.terminal_set = [
-          element.deserialize_int(i)
-          for i in self.get('terminal_set', '').split()
-        ]
-        self.nonterminal_set = [
-          element.deserialize_int(i)
-          for i in self.get('nonterminal_set', '').split()
-        ]
-      def copy(self, factory = None):
-        result = element.Element.copy(
-          self,
-          Symbol if factory is None else factory
-        )
-        result.terminal_set = self.terminal_set
-        result.nonterminal_set = self.nonterminal_set
-        return result
-      def repr_serialize(self, params):
-        element.Element.repr_serialize(self, params)
-        if len(self.terminal_set):
-          params.append(
-            'terminal_set = [{0:s}]'.format(
-              ', '.join([repr(i) for i in self.terminal_set])
-            )
-          )
-        if len(self.nonterminal_set):
-          params.append(
-            'nonterminal_set = [{0:s}]'.format(
-              ', '.join([repr(i) for i in self.nonterminal_set])
-            )
-          )
-      def __repr__(self):
-        params = []
-        self.repr_serialize(params)
-        return 'regex.Grammar.Production.Symbol({0:s})'.format(', '.join(params))
-      # GENERATE END
-      def post_process(self, name_to_character_sets):
-        pass
-
-    class NamedSymbol(Symbol):
-      # GENERATE ELEMENT(str name) BEGIN
-      def __init__(
-        self,
-        tag = 'Grammar_Production_NamedSymbol',
-        attrib = {},
-        text = '',
-        children = [],
-        terminal_set = [],
-        nonterminal_set = [],
-        name = ''
-      ):
-        Grammar.Production.Symbol.__init__(
-          self,
-          tag,
-          attrib,
-          text,
-          children,
-          terminal_set,
-          nonterminal_set
-        )
-        self.name = name
-      def serialize(self, ref_list, indent = 0):
-        Grammar.Production.Symbol.serialize(self, ref_list, indent)
-        self.set('name', element.serialize_str(self.name))
-      def deserialize(self, ref_list):
-        Grammar.Production.Symbol.deserialize(self, ref_list)
-        self.name = element.deserialize_str(self.get('name', ''))
-      def copy(self, factory = None):
-        result = Grammar.Production.Symbol.copy(
-          self,
-          NamedSymbol if factory is None else factory
-        )
-        result.name = self.name
-        return result
-      def repr_serialize(self, params):
-        Grammar.Production.Symbol.repr_serialize(self, params)
-        if self.name != '':
-          params.append(
-            'name = {0:s}'.format(repr(self.name))
-          )
-      def __repr__(self):
-        params = []
-        self.repr_serialize(params)
-        return 'regex.Grammar.Production.NamedSymbol({0:s})'.format(', '.join(params))
-      # GENERATE END
-      def post_process(self, name_to_character_sets):
-        self.terminal_set, self.nonterminal_set = (
-          name_to_character_sets[self.name]
-        )
-    # GENERATE ELEMENT(int nonterminal, int precedence, int associativity) BEGIN
-    def __init__(
-      self,
-      tag = 'Grammar_Production',
-      attrib = {},
-      text = '',
-      children = [],
-      nonterminal = -1,
-      precedence = -1,
-      associativity = -1
-    ):
-      element.Element.__init__(
-        self,
-        tag,
-        attrib,
-        text,
-        children
-      )
-      self.nonterminal = (
-        element.deserialize_int(nonterminal)
-      if isinstance(nonterminal, str) else
-        nonterminal
-      )
-      self.precedence = (
-        element.deserialize_int(precedence)
-      if isinstance(precedence, str) else
-        precedence
-      )
-      self.associativity = (
-        element.deserialize_int(associativity)
-      if isinstance(associativity, str) else
-        associativity
-      )
-    def serialize(self, ref_list, indent = 0):
-      element.Element.serialize(self, ref_list, indent)
-      self.set('nonterminal', element.serialize_int(self.nonterminal))
-      self.set('precedence', element.serialize_int(self.precedence))
-      self.set('associativity', element.serialize_int(self.associativity))
-    def deserialize(self, ref_list):
-      element.Element.deserialize(self, ref_list)
-      self.nonterminal = element.deserialize_int(self.get('nonterminal', '-1'))
-      self.precedence = element.deserialize_int(self.get('precedence', '-1'))
-      self.associativity = element.deserialize_int(self.get('associativity', '-1'))
-    def copy(self, factory = None):
-      result = element.Element.copy(
-        self,
-        Production if factory is None else factory
-      )
-      result.nonterminal = self.nonterminal
-      result.precedence = self.precedence
-      result.associativity = self.associativity
-      return result
-    def repr_serialize(self, params):
-      element.Element.repr_serialize(self, params)
-      if self.nonterminal != -1:
-        params.append(
-          'nonterminal = {0:s}'.format(repr(self.nonterminal))
-        )
-      if self.precedence != -1:
-        params.append(
-          'precedence = {0:s}'.format(repr(self.precedence))
-        )
-      if self.associativity != -1:
-        params.append(
-          'associativity = {0:s}'.format(repr(self.associativity))
-        )
-    def __repr__(self):
-      params = []
-      self.repr_serialize(params)
-      return 'regex.Grammar.Production({0:s})'.format(', '.join(params))
-    # GENERATE END
-    def post_process(self, nonterminal, name_to_character_sets):
-      self.nonterminal = nonterminal
-      for i in self:
-        i.post_process(name_to_character_sets)
-    def add_to_lr1(self, lr1):
-      lr1.productions.append(
-        (
-          # precedence
-          self.precedence * 2 + self.associativity,
-          # symbols
-          [(i.terminal_set, i.nonterminal_set) for i in self],
-          # lookaheads (list of initial_set, can_be_empty)
-          [([], False) for i in range(len(self))] + [([], True)],
-          # group_bounds
-          []
-        )
-      )
-
-  # GENERATE ELEMENT(int n_terminals, int eof_terminal) BEGIN
-  def __init__(
-    self,
-    tag = 'Grammar',
-    attrib = {},
-    text = '',
-    children = [],
-    n_terminals = -1,
-    eof_terminal = -1
-  ):
-    element.Element.__init__(
-      self,
-      tag,
-      attrib,
-      text,
-      children
-    )
-    self.n_terminals = (
-      element.deserialize_int(n_terminals)
-    if isinstance(n_terminals, str) else
-      n_terminals
-    )
-    self.eof_terminal = (
-      element.deserialize_int(eof_terminal)
-    if isinstance(eof_terminal, str) else
-      eof_terminal
-    )
-  def serialize(self, ref_list, indent = 0):
-    element.Element.serialize(self, ref_list, indent)
-    self.set('n_terminals', element.serialize_int(self.n_terminals))
-    self.set('eof_terminal', element.serialize_int(self.eof_terminal))
-  def deserialize(self, ref_list):
-    element.Element.deserialize(self, ref_list)
-    self.n_terminals = element.deserialize_int(self.get('n_terminals', '-1'))
-    self.eof_terminal = element.deserialize_int(self.get('eof_terminal', '-1'))
-  def copy(self, factory = None):
-    result = element.Element.copy(
-      self,
-      Grammar if factory is None else factory
-    )
-    result.n_terminals = self.n_terminals
-    result.eof_terminal = self.eof_terminal
-    return result
-  def repr_serialize(self, params):
-    element.Element.repr_serialize(self, params)
-    if self.n_terminals != -1:
-      params.append(
-        'n_terminals = {0:s}'.format(repr(self.n_terminals))
-      )
-    if self.eof_terminal != -1:
-      params.append(
-        'eof_terminal = {0:s}'.format(repr(self.eof_terminal))
-      )
-  def __repr__(self):
-    params = []
-    self.repr_serialize(params)
-    return 'regex.Grammar({0:s})'.format(', '.join(params))
-  # GENERATE END
-  def post_process(self, name_to_character_sets):
-    for i in range(len(self)):
-      self[i].post_process(i, name_to_character_sets)
-  def to_lr1(self):
-    lr1 = LR1([], self.n_terminals, self.eof_terminal)
-    for i in self:
-      i.add_to_lr1(lr1)
-    # propagate lookaheads
-    modified = True
-    while modified:
-      modified = False
-      for _, symbols, lookaheads, _ in lr1.productions:
-        for i in range(len(symbols) - 1, -1, -1):
-          initial_set, nonterminal_set = symbols[i]
-          can_be_empty = False
-          for j in range(0, len(nonterminal_set), 2):
-            for k in range(nonterminal_set[j], nonterminal_set[j + 1]):
-              child_initial_set, child_can_be_empty = lr1.productions[k][2][0]
-              initial_set = character_set_or(initial_set, child_initial_set)
-              can_be_empty = can_be_empty or child_can_be_empty
-          # at this point can_be_empty refers to current symbol only
-          if can_be_empty:
-            next_initial_set, can_be_empty = lookaheads[i + 1]
-            initial_set = character_set_or(initial_set, next_initial_set)
-          # at this point can_be_empty refers to all remaining symbols
-          if (initial_set, can_be_empty) != lookaheads[i]:
-            lookaheads[i] = (initial_set, can_be_empty)
-            modified = True
-    return lr1
-
 # GENERATE FACTORY(element.Element) BEGIN
 tag_to_class = {
   'Regex': Regex,
@@ -1168,1435 +790,14 @@ tag_to_class = {
   'RegexSequence': RegexSequence,
   'RegexRepeat': RegexRepeat,
   'RegexGroup': RegexGroup,
-  'RegexGroup_Attribute': RegexGroup.Attribute,
-  'Grammar': Grammar,
-  'Grammar_Production': Grammar.Production,
-  'Grammar_Production_Symbol': Grammar.Production.Symbol,
-  'Grammar_Production_NamedSymbol': Grammar.Production.NamedSymbol
+  'RegexGroup_Attribute': RegexGroup.Attribute
 }
 def factory(tag, attrib = {}, *args, **kwargs):
   return tag_to_class.get(tag, element.Element)(tag, attrib, *args, **kwargs)
 # GENERATE END
 
-class NFA:
-  # state_desc classes:
-  # (STATE_CHARACTER, character_set, next_state)
-  # (STATE_OR, next_state0, next_state1)
-  # (STATE_AND, next_state0, next_state1)
-  # (STATE_JOIN0,)
-  # (STATE_JOIN1, next_state)
-  # (STATE_MARK, mark_value, next_state)
-
-  STATE_CHARACTER = 0
-  STATE_OR = 1
-  STATE_AND = 2
-  STATE_JOIN0 = 3
-  STATE_JOIN1 = 4
-  STATE_MARK = 5
-  join0_state = (STATE_JOIN0,)
-
-  # multistate classes:
-  # (MULTISTATE_ACCEPT, 1)             accept, occupies one thread in list
-  # (MULTISTATE_AND, n, state, child)  and, occupies n threads in list
-  # (MULTISTATE_OR, n, child0, child1)
-  # n = sum of n from child states, that is, multistate[1] is the number of
-  # (MULTISTATE_ACCEPT, 1) leaf states ultimately reachable from this subtree
-
-  MULTISTATE_ACCEPT = 0
-  MULTISTATE_AND = 1
-  MULTISTATE_OR = 2
-  accept_multistate = (MULTISTATE_ACCEPT, 1)
-
-  def __init__(
-    self,
-    groups = [],
-    states = [(STATE_CHARACTER, [0, n_characters], 0)],
-    start_state = [] # can have multiple NFAs in same container
-  ):
-    # groups: list of group_desc
-    # group_desc: (tag, kwargs)
-    #   tag, kwargs will be passed to apply_markup() hence factory()
-    self.groups = groups
-    self.states = states
-    self.start_state = start_state
-
-  def multistate_next(self, root_multistate, character):
-    # the deduplication works as effectively a second pass which goes
-    # over the multistate tree in pre-order, looking for OR-disjunctions
-    # of any depth and configuration, e.g. (a OR b) or (c OR d), and
-    # collecting the subexpressions e.g. a, b, c OR b, c, d, c OR d, and
-    # failing any subexpression that's previously occurred in pre-order
-    # (children of AND-conjunctions get a fresh empty subexpression set)
-
-    # unfortunately it can't be done as a second pass literally, because
-    # we have to generate the transition list, thus we need to know if a
-    # subexpression is going to be failed, so we can delete it's threads
-
-    # therefore the deduplication is tacked onto the end of the building
-    # process, examining each built node just before the builder returns,
-    # if already in the subexpression set we will rewrite the transition
-    # list produced by the recursive calls, otherwise we will add it in
-
-    # the problem is determining the subexpression set to pass into any
-    # recursive calls, the caller's set will be sent unchanged by cases
-    # that return the result unchanged or add an OR-disjuction, an empty
-    # set will be sent by cases that add an AND-conjuction to the result 
-
-    # if we have added something to the node built by a recursive call,
-    # we'll fall into the deduplication logic, otherwise just return it
-
-    def advance1(multistate, join_count, done_multistates):
-      # modifies nonlocal: transition
-      assert multistate[0] == NFA.MULTISTATE_AND
-      _, _, state, child = multistate
-      if state == 0:
-        # note that if we reach the accepting state, we must be a top-level
-        # expression, and could not be part of an AND-conjunction (because
-        # AND-conjunction terms always go to a join0 or join1 state first),
-        # we remove ourselves to indicate to scanner that match is complete
-        assert join_count == 0
-        assert child == NFA.accept_multistate
-        return advance(child, 0, done_multistates)
-      state_desc = self.states[state]
-      if state_desc[0] == NFA.STATE_CHARACTER:
-        if join_count != 0:
-          transition.append((DFA.TRANSITION_POP, child[1]))
-          return None
-        len_transition = len(transition)
-        child = advance(child, 0, set())
-        if child is None:
-          return None
-        result = (NFA.MULTISTATE_AND, child[1], state, child)
-      elif state_desc[0] == NFA.STATE_OR:
-        _, next_state0, next_state1 = state_desc
-        len_transition = len(transition)
-        transition.append((DFA.TRANSITION_DUP, child[1]))
-        child0 = advance1(
-          (NFA.MULTISTATE_AND, child[1], next_state0, child),
-          join_count,
-          done_multistates
-        )
-        child1 = advance1(
-          (NFA.MULTISTATE_AND, child[1], next_state1, child),
-          join_count,
-          done_multistates
-        )
-        if child0 is None:
-          return child1
-        if child1 is None:
-          return child0
-        result = (NFA.MULTISTATE_OR, child0[1] + child1[1], child0, child1)
-      elif state_desc[0] == NFA.STATE_AND:
-        _, next_state0, next_state1 = state_desc
-        return advance1(
-          (
-            NFA.MULTISTATE_AND,
-            child[1],
-            next_state0,
-            (NFA.MULTISTATE_AND, child[1], next_state1, child)
-          ),
-          join_count,
-          done_multistates
-        )
-      elif state_desc[0] == NFA.STATE_JOIN0:
-        return advance(child, join_count + 1, done_multistates)
-      elif state_desc[0] == NFA.STATE_JOIN1:
-        _, next_state = state_desc
-        if join_count == 0:
-          transition.append((DFA.TRANSITION_POP, child[1]))
-          return None
-        return advance1(
-          (NFA.MULTISTATE_AND, child[1], next_state, child),
-          join_count - 1,
-          done_multistates
-        )
-      elif state_desc[0] == NFA.STATE_MARK:
-        _, mark_value, next_state = state_desc
-        transition.append((DFA.TRANSITION_MARK, child[1], mark_value))
-        return advance1(
-          (NFA.MULTISTATE_AND, child[1], next_state, child),
-          join_count,
-          done_multistates
-        )
-      else:
-        assert False
-      if result in done_multistates:
-        transition[len_transition:] = [(DFA.TRANSITION_POP, multistate[1])]
-        return None
-      done_multistates.add(result)
-      return result
-
-    def advance(multistate, join_count, done_multistates):
-      nonlocal character0, character1 # modifies nonlocal: transition
-      if multistate[0] == NFA.MULTISTATE_ACCEPT:
-        assert join_count == 0
-        len_transition = len(transition)
-        transition.append((DFA.TRANSITION_MOVE, 1))
-        result = NFA.accept_multistate # takes no arguments so use static one
-      elif multistate[0] == NFA.MULTISTATE_AND:
-        if character >= 0:
-          _, _, state, child = multistate
-          state_desc = self.states[state]
-          assert state_desc[0] == NFA.STATE_CHARACTER
-          _, character_set, next_state = state_desc
-          k = bisect.bisect_right(character_set, character)
-          if k > 0 and character0 < character_set[k - 1]:
-            character0 = character_set[k - 1]
-          if k < len(character_set) and character1 > character_set[k]:
-            character1 = character_set[k]
-          if (k & 1) == 0:
-            transition.append((DFA.TRANSITION_POP, child[1]))
-            return None
-          multistate = (NFA.MULTISTATE_AND, child[1], next_state, child)
-        return advance1(multistate, join_count, done_multistates)
-      elif multistate[0] == NFA.MULTISTATE_OR:
-        _, _, child0, child1 = multistate
-        len_transition = len(transition)
-        child0 = advance(child0, join_count, done_multistates)
-        child1 = advance(child1, join_count, done_multistates)
-        if child0 is None:
-          return child1
-        if child1 is None:
-          return child0
-        result = (NFA.MULTISTATE_OR, child0[1] + child1[1], child0, child1)
-      else:
-        assert False
-      if result in done_multistates:
-        transition[len_transition:] = [(DFA.TRANSITION_POP, multistate[1])]
-        return None
-      done_multistates.add(result)
-      return result
-
-    transition = []
-    character0 = 0
-    character1 = n_characters
-    root_multistate = advance(root_multistate, 0, set())
-    return root_multistate, transition, character0, character1
-
-  def multistate_accept(root_multistate):
-    i = 0
-    def accept(multistate):
-      nonlocal i
-      if multistate[0] == NFA.MULTISTATE_ACCEPT:
-        return True
-      if multistate[0] == NFA.MULTISTATE_AND:
-        _, _, _, child = multistate
-        i += child[1]
-        return False
-      if multistate[0] == NFA.MULTISTATE_OR:
-        _, _, child0, child1 = multistate
-        return accept(child0) or accept(child1)
-      assert False
-    return i if accept(root_multistate) else -1
-
-  def match_text(self, text, i, start_index = 0):
-    def transit(transition):
-      nonlocal threads0, threads1, prefix_slop # note: also uses i
-      j = prefix_slop
-      for trans in transition:
-        if trans[0] == DFA.TRANSITION_POP:
-          j += trans[1]
-        elif trans[0] == DFA.TRANSITION_DUP:
-          while j < trans[1]:
-            threads0[:0] = [None] * prefix_slop
-            threads1[:0] = [None] * prefix_slop
-            j += prefix_slop
-            prefix_slop *= 2
-          threads0[j - trans[1]:j] = threads0[j:j + trans[1]]
-          j -= trans[1]
-        elif trans[0] == DFA.TRANSITION_MARK:
-          threads0[j:j + trans[1]] = [
-            (i, trans[2], thread)
-            for thread in threads0[j:j + trans[1]]
-          ]
-        elif trans[0] == DFA.TRANSITION_MOVE:
-          threads1.extend(threads0[j:j + trans[1]])
-          j += trans[1]
-        #elif trans[0] == DFA.TRANSITION_DEL:
-        #  del threads1[-trans[1]:]
-        else:
-          assert False
-      assert j == len(threads0)
-      threads0, threads1 = threads1, threads0
-      del threads1[prefix_slop:]
-
-    threads0 = [None, None]
-    threads1 = [None]
-    prefix_slop = 1
-
-    start_state = self.start_state[start_index]
-    if start_state == -1:
-      return None
-    next_multistate, transition, _, _ = self.multistate_next(
-      (NFA.MULTISTATE_AND, 1, start_state, NFA.accept_multistate),
-      -1
-    )
-    while next_multistate is not None:
-      transit(transition)
-      assert len(threads0) == prefix_slop + next_multistate[1]
-      if next_multistate == NFA.accept_multistate:
-        # there is only one match, which is complete
-        assert len(threads0) == prefix_slop + 1
-        return threads0[prefix_slop]
-      if i >= len(text):
-        # return best match we have, but not incomplete match
-        i = NFA.multistate_accept(next_multistate)
-        return (None if i == -1 else threads0[prefix_slop + i])
-      next_multistate, transition, _, _ = (
-        self.multistate_next(next_multistate, ord(text[i]))
-      )
-      i += 1
-    return None
-
-  def match_yychunk(self, root, pos, off, yychunk_iter, start_index = 0):
-    if pos < 0:
-      pos, off = element.to_start_relative(root, pos, off)
-
-    def transit(transition):
-      nonlocal threads0, threads1, prefix_slop # note: also uses pos, off
-      j = prefix_slop
-      for trans in transition:
-        if trans[0] == DFA.TRANSITION_POP:
-          j += trans[1]
-        elif trans[0] == DFA.TRANSITION_DUP:
-          while j < trans[1]:
-            threads0[:0] = [None] * prefix_slop
-            threads1[:0] = [None] * prefix_slop
-            j += prefix_slop
-            prefix_slop *= 2
-          threads0[j - trans[1]:j] = threads0[j:j + trans[1]]
-          j -= trans[1]
-        elif trans[0] == DFA.TRANSITION_MARK:
-          threads0[j:j + trans[1]] = [
-            (pos, off, trans[2], thread)
-            for thread in threads0[j:j + trans[1]]
-          ]
-        elif trans[0] == DFA.TRANSITION_MOVE:
-          threads1.extend(threads0[j:j + trans[1]])
-          j += trans[1]
-        #elif trans[0] == DFA.TRANSITION_DEL:
-        #  del threads1[-trans[1]:]
-        else:
-          assert False
-      assert j == len(threads0)
-      threads0, threads1 = threads1, threads0
-      del threads1[prefix_slop:]
-
-    threads0 = [None, None]
-    threads1 = [None]
-    prefix_slop = 1
-
-    start_state = self.start_state[start_index]
-    if start_state == -1:
-      return None
-    next_multistate, transition, _, _ = self.multistate_next(
-      (NFA.MULTISTATE_AND, 1, start_state, NFA.accept_multistate),
-      -1
-    )
-    text = element.get_text(root, pos)
-    while next_multistate is not None:
-      transit(transition)
-      assert len(threads0) == prefix_slop + next_multistate[1]
-      if next_multistate == NFA.accept_multistate:
-        # there is only one match, which is complete
-        assert len(threads0) == prefix_slop + 1
-        return threads0[prefix_slop]
-      while off >= len(text):
-        if pos < len(root):
-          pos += 1
-          off = 0
-        else:
-          try:
-            next(yychunk_iter)
-          except StopIteration:
-            # return best match we have, but not incomplete match
-            i = NFA.multistate_accept(next_multistate)
-            return (None if i == -1 else threads0[prefix_slop + i])
-          text = element.get_text(root, pos)
-      next_multistate, transition, _, _ = (
-        self.multistate_next(next_multistate, ord(text[off]))
-      )
-      off += 1
-    return None
-
-  def to_dfa(self):
-    dfa = DFA(list(self.groups))
-
-    accept_key = (NFA.accept_multistate, ())
-    action_to_meaning = [accept_key]
-    meaning_to_action = {accept_key: 0}
-    state_to_meaning = [NFA.accept_multistate]
-    meaning_to_state = {NFA.accept_multistate: 0}
-
-    for start_state in self.start_state:
-      if start_state == -1:
-        start_action = -1
-      else:
-        next_multistate, transition, _, _ = self.multistate_next(
-          (NFA.MULTISTATE_AND, 1, start_state, NFA.accept_multistate),
-          -1
-        )
-        if next_multistate is None:
-          start_action = -1
-        else:
-          start_key = (next_multistate, tuple(transition))
-          start_action = len(action_to_meaning)
-          meaning_to_action[start_key] = start_action
-          action_to_meaning.append(start_key)
-      dfa.start_action.append(start_action)
-
-    while len(dfa.actions) < len(action_to_meaning):
-      next_multistate, transition = action_to_meaning[len(dfa.actions)]
-      if next_multistate in meaning_to_state:
-        next_state = meaning_to_state[next_multistate]
-      else:
-        next_state = len(state_to_meaning)
-        state_to_meaning.append(next_multistate)
-        meaning_to_state[next_multistate] = next_state
-      dfa.actions.append((next_state, list(transition)))
-
-      while len(dfa.states) < len(state_to_meaning):
-        character = 0
-        multistate = state_to_meaning[len(dfa.states)]
-        state_desc = ([], [], NFA.multistate_accept(multistate))
-        while character < n_characters:
-          next_multistate, transition, character0, character1 = self.multistate_next(
-            multistate,
-            character
-          )
-          assert character0 == character and character1 > character
-          if next_multistate is None:
-            action = -1
-          else:
-            # optimize transition (optional)
-            i = 0
-            j = 0
-            while i < len(transition):
-              if transition[i][0] == DFA.TRANSITION_POP:
-                n = transition[i][1]
-                i += 1
-                while (
-                  i < len(transition) and
-                  transition[i][0] == DFA.TRANSITION_POP
-                ):
-                  n += transition[i][1]
-                  i += 1
-                transition[j] = (DFA.TRANSITION_POP, n)
-              elif transition[i][0] == DFA.TRANSITION_MOVE:
-                n = transition[i][1]
-                i += 1
-                while (
-                  i < len(transition) and
-                  transition[i][0] == DFA.TRANSITION_MOVE
-                ):
-                  n += transition[i][1]
-                  i += 1
-                transition[j] = (DFA.TRANSITION_MOVE, n)
-              else:
-                transition[j] = transition[i]
-                i += 1
-              j += 1
-            del transition[j:]
-            # end optimize transition
-            key = (next_multistate, tuple(transition))
-            if key in meaning_to_action:
-              action = meaning_to_action[key]
-            else:
-              action = len(action_to_meaning)
-              action_to_meaning.append(key)
-              meaning_to_action[key] = action
-          state_desc[0].append(character1)
-          state_desc[1].append(action)
-          character = character1
-        dfa.states.append(state_desc)
-    return dfa
-
-  def __repr__(self):
-    return 'regex.NFA({0:s}, {1:s}, {2:s})'.format(
-      repr(self.groups),
-      repr(self.states),
-      repr(self.start_state)
-    )
-
-class DFA:
-  # transition classes:
-  # instructions to transform thread list from a multistate to next multistate
-  # (TRANSITION_POP, n)                        j += n
-  # (TRANSITION_DUP, n)                        threads0[j - n:j] = threads0[j:j + n]
-  #                                    j -= n
-  # (TRANSITION_MARK, n, mark_value)   threads0[j:j + n] = [
-  #                                      (i, mark, thread)
-  #                                      for thread in threads0[j:j + n]
-  #                                    ]
-  # (TRANSITION_MOVE, n)               threads1.extend(threads0[j:j + n])
-  #                                    j += n
-  # (TRANSITION_DEL, n)                        del threads1[-n:]
-
-  TRANSITION_POP = 0
-  TRANSITION_DUP = 1
-  TRANSITION_MARK = 2
-  TRANSITION_MOVE = 3
-  #TRANSITION_DEL = 4
-  def __init__(
-    self,
-    groups = [],
-    states = [([n_characters], [0], 0)],
-    actions = [(0, [])],
-    start_action = [] # can have multiple DFAs in same container
-  ):
-    # groups: list of group_desc
-    # group_desc: (tag, kwargs)
-    #   tag, kwargs will be passed to apply_markup() hence factory()
-    # states: list of state_desc
-    # state_desc: (list of breaks, list of action to do, accept_thread)
-    # actions: list of action_desc
-    # action_desc: (state to go to next, compiled transition to do first)
-    # accept_thread: which thread of thread list to use, -1 don't accept
-    self.groups = groups
-    self.states = states
-    self.actions = actions
-    self.start_action = start_action
-
-  def match_text(self, text, i, start_index = 0):
-    def transit(transition):
-      nonlocal threads0, threads1, prefix_slop # note: also uses i
-      j = prefix_slop
-      for trans in transition:
-        if trans[0] == DFA.TRANSITION_POP:
-          j += trans[1]
-        elif trans[0] == DFA.TRANSITION_DUP:
-          while j < trans[1]:
-            threads0[:0] = [None] * prefix_slop
-            threads1[:0] = [None] * prefix_slop
-            j += prefix_slop
-            prefix_slop *= 2
-          threads0[j - trans[1]:j] = threads0[j:j + trans[1]]
-          j -= trans[1]
-        elif trans[0] == DFA.TRANSITION_MARK:
-          threads0[j:j + trans[1]] = [
-            (i, trans[2], thread)
-            for thread in threads0[j:j + trans[1]]
-          ]
-        elif trans[0] == DFA.TRANSITION_MOVE:
-          threads1.extend(threads0[j:j + trans[1]])
-          j += trans[1]
-        #elif trans[0] == DFA.TRANSITION_DEL:
-        #  del threads1[-trans[1]:]
-        else:
-          assert False
-      assert j == len(threads0)
-      threads0, threads1 = threads1, threads0
-      del threads1[prefix_slop:]
-
-    threads0 = [None, None]
-    threads1 = [None]
-    prefix_slop = 1
-
-    action = self.start_action[start_index]
-    while action != -1:
-      state, transition = self.actions[action]
-      #print('i', i, 'action', action, 'state', state, 'transition', transition)
-      transit(transition)
-      if state == 0:
-        # there is only one match, which is complete
-        assert len(threads0) == prefix_slop + 1
-        return threads0[prefix_slop]
-      if i >= len(text):
-        # return best match we have, but not incomplete match
-        i = self.states[state][2]
-        return (None if i == -1 else threads0[prefix_slop + i])
-      action = self.states[state][1][
-        bisect.bisect_right(self.states[state][0], ord(text[i]))
-      ]
-      i += 1
-    return None
-
-  def match_yychunk(self, root, pos, off, yychunk_iter, start_index = 0):
-    if pos < 0:
-      pos, off = element.to_start_relative(root, pos, off)
-
-    def transit(transition):
-      nonlocal threads0, threads1, prefix_slop # note: also uses pos, off
-      j = prefix_slop
-      for trans in transition:
-        if trans[0] == DFA.TRANSITION_POP:
-          j += trans[1]
-        elif trans[0] == DFA.TRANSITION_DUP:
-          while j < trans[1]:
-            threads0[:0] = [None] * prefix_slop
-            threads1[:0] = [None] * prefix_slop
-            j += prefix_slop
-            prefix_slop *= 2
-          threads0[j - trans[1]:j] = threads0[j:j + trans[1]]
-          j -= trans[1]
-        elif trans[0] == DFA.TRANSITION_MARK:
-          threads0[j:j + trans[1]] = [
-            (pos, off, trans[2], thread)
-            for thread in threads0[j:j + trans[1]]
-          ]
-        elif trans[0] == DFA.TRANSITION_MOVE:
-          threads1.extend(threads0[j:j + trans[1]])
-          j += trans[1]
-        #elif trans[0] == DFA.TRANSITION_DEL:
-        #  del threads1[-trans[1]:]
-        else:
-          assert False
-      assert j == len(threads0)
-      threads0, threads1 = threads1, threads0
-      del threads1[prefix_slop:]
-
-    threads0 = [None, None]
-    threads1 = [None]
-    prefix_slop = 1
-
-    action = self.start_action[start_index]
-    text = element.get_text(root, pos)
-    while action != -1:
-      state, transition = self.actions[action]
-      transit(transition)
-      if state == 0:
-        # there is only one match, which is complete
-        assert len(threads0) == prefix_slop + 1
-        return threads0[prefix_slop]
-      while off >= len(text):
-        if pos < len(root):
-          pos += 1
-          off = 0
-        else:
-          try:
-            next(yychunk_iter)
-          except StopIteration:
-            # return best match we have, but not incomplete match
-            i = self.states[state][2]
-            return (None if i == -1 else threads0[prefix_slop + i])
-          text = element.get_text(root, pos)
-      #print(
-      #  'state {0:d} pos {1:d} off {2:d} text "{3:s}"'.format(
-      #    state,
-      #    pos,
-      #    off,
-      #    text.replace('\n', '$')
-      #  )
-      #)
-      action = self.states[state][1][
-        bisect.bisect_right(self.states[state][0], ord(text[off]))
-      ]
-      off += 1
-    return None
-
-  def yylex(self, root, pos, off, factory, yychunk_iter):
-    if pos < 0:
-      pos, off = element.to_start_relative(root, pos, off)
-
-    while True:
-      # note: pointers must be kept start relative during the below call,
-      # because it extends the following text by calling the yychunk_iter
-      thread = self.match_yychunk(root, pos, off, yychunk_iter)
-      if thread is None:
-        break
-      stack = []
-      while True:
-        pos, off, mark_value, thread = thread
-        group_index = mark_value >> 1
-        if (mark_value & 1) != 0:
-          end_pos, end_off = element.to_end_relative(root, pos, off)
-          stack.append((end_pos, end_off, group_index))
-        else:
-          end_pos, end_off, temp = stack.pop()
-          assert temp == group_index
-          if len(stack) == 0:
-            break
-          tag, kwargs = self.groups[group_index]
-          if tag != '':
-            work.apply_markup(
-              root,
-              pos,
-              off,
-              end_pos,
-              end_off,
-              factory,
-              tag,
-              **kwargs
-            )
-      # note: pointers must be kept end relative during the below call,
-      # because it modifies the preceding text by calling apply_markup()
-      yield end_pos, end_off, group_index
-      pos, off = element.to_start_relative(root, end_pos, end_off)
-
-  def __repr__(self):
-    return 'regex.DFA({0:s}, {1:s}, {2:s}, {3:s})'.format(
-      repr(self.groups),
-      repr(self.states),
-      repr(self.actions),
-      repr(self.start_action)
-    )
-
-class LR1:
-  def __init__(
-    self,
-    productions = [],
-    n_terminals = n_characters + 1,
-    eof_terminal = n_characters
-  ):
-    # productions: list of production
-    # production: (
-    #   priority,
-    #   symbols,
-    #   lookaheads,
-    #   group_bounds
-    # )
-    # priority: bit 0 = right to left, bits 1: = numeric priority
-    # symbols: list of symbol_desc
-    # symbol_desc: (terminal_set, nonterminal_set)
-    # terminal_set: similar to character_set, even length list of pairs of breaks
-    # nonterminal_set: as above but has n_terminals subtracted from breaks
-    # lookaheads: list of lookahead_desc, len(lookaheads) = len(symbols) + 1
-    # lookahead_desc: (initial_set, can_be_empty)
-    # initial_set: what terminals can occur at this position in symbols array,
-    # computed such that lookaheads[i][0] is symbols[i][0], plus initial set of
-    # each nonterminal from symbols[i][1], plus lookaheads[i + 1][0] if any
-    # nonterminal of symbols[i][1] can be empty (may pull in i + 2, i + 3 etc)
-    # can_be_empty: whether all symbols from this position to end can be empty
-    # note: last lookahead_desc is a sentinel consisting of ([], True), so that
-    # initial_set is empty (will be augmented by context) and can_be_empty is
-    # True (because all symbols from the end to the end can obviously be empty)
-    # group_bounds: list of group_bound
-    # group_bound: (start_index, end_index, tag, kwargs)
-    #   where start_index, end_index are indices into list of character_set,
-    #   and tag, kwargs will be passed to apply_markup() hence factory(),
-    #   noting that markup has to be applied in reverse order of the list
-    # n_terminals: offset to apply to productions[] index to get symbol
-    #   (character set code), also symbol for productions[0] = start production
-    # eof_terminal: usually == n_terminals - 1 (must be valid terminal value)
-    self.productions = productions
-    self.n_terminals = n_terminals
-    self.eof_terminal = eof_terminal
-
-  def lookahead_item_set_closure(self, items, item_to_index):
-    in_queue = [True for i in range(len(items))]
-    queue = list(range(len(items)))
-
-    qhead = 0
-    while qhead < len(queue):
-      i = queue[qhead]
-      in_queue[i] = False
-      j, k, lookahead_set = items[i]
-      _, symbols, lookaheads, _ = self.productions[j]
-      if k < len(symbols):
-        _, nonterminal_set = symbols[k]
-        if len(nonterminal_set):
-          next_lookahead_set, next_can_be_empty = lookaheads[k + 1]
-          if next_can_be_empty:
-            next_lookahead_set = character_set_or(
-              next_lookahead_set,
-              lookahead_set
-            )
-          for l in range(0, len(nonterminal_set), 2):
-            for m in range(nonterminal_set[l], nonterminal_set[l + 1]):
-              key = (m, 0)
-              if key in item_to_index:
-                n = item_to_index[key]
-                child_lookahead_set = character_set_or(
-                  items[n][2],
-                  next_lookahead_set
-                )
-                if child_lookahead_set != items[n][2]:
-                  items[n] = (m, 0, child_lookahead_set)
-                  if not in_queue[n]:
-                    # optimization: do not re-queue unless it is transparent
-                    # to changes in the lookahead set wrt. further propagation
-                    _, child_symbols, child_lookaheads, _ = self.productions[m]
-                    if len(child_symbols) and child_lookaheads[1][1]:
-                      in_queue[n] = True
-                      queue.append(n)
-              else:
-                n = len(items)
-                items.append((m, 0, next_lookahead_set))
-                item_to_index[key] = n
-                in_queue.append(True)
-                queue.append(n)
-      qhead += 1 
-
-  def lookahead_item_set_shift(self, items, terminal):
-    next_items = []
-    next_item_to_index = {}
-    reductions = set()
-    terminal0 = 0
-    terminal1 = self.n_terminals
-    for i, j, lookahead_set in items:
-      _, symbols, _, _ = self.productions[i]
-      if j < len(symbols):
-        terminal_set, _ = symbols[j]
-        k = bisect.bisect_right(terminal_set, terminal)
-        if k > 0 and terminal0 < terminal_set[k - 1]:
-          terminal0 = terminal_set[k - 1]
-        if k < len(terminal_set) and terminal1 > terminal_set[k]:
-          terminal1 = terminal_set[k]
-        if (k & 1) == 1:
-          next_item_to_index[(i, j + 1)] = len(next_items)
-          next_items.append((i, j + 1, lookahead_set))
-      else:
-        k = bisect.bisect_right(lookahead_set, terminal)
-        if k > 0 and terminal0 < lookahead_set[k - 1]:
-          terminal0 = lookahead_set[k - 1]
-        if k < len(lookahead_set) and terminal1 > lookahead_set[k]:
-          terminal1 = lookahead_set[k]
-        if (k & 1) == 1:
-          reductions.add(i)
-    return next_items, next_item_to_index, reductions, terminal0, terminal1
-
-  def lookahead_item_set_goto(self, items, nonterminal):
-    next_items = []
-    next_item_to_index = {}
-    reductions = set()
-    nonterminal0 = 0
-    nonterminal1 = len(self.productions)
-    for i, j, lookahead_set in items:
-      _, symbols, _, _ = self.productions[i]
-      if j < len(symbols):
-        _, nonterminal_set = symbols[j]
-        k = bisect.bisect_right(nonterminal_set, nonterminal)
-        if k > 0 and nonterminal0 < nonterminal_set[k - 1]:
-          nonterminal0 = nonterminal_set[k - 1]
-        if k < len(nonterminal_set) and nonterminal1 > nonterminal_set[k]:
-          nonterminal1 = nonterminal_set[k]
-        if (k & 1) == 1:
-          next_item_to_index[(i, j + 1)] = len(next_items)
-          next_items.append((i, j + 1, lookahead_set))
-    return next_items, next_item_to_index, nonterminal0, nonterminal1
-
-  def parse_text(self, text, i):
-    items = [(0, 0, [self.eof_terminal, self.eof_terminal + 1])]
-    item_to_index = {(0, 0): 0}
-    value_stack = []
-    state_stack = []
-    lookahead_character = ord(text[i]) if i < len(text) else self.eof_terminal
-    while True:
-      self.lookahead_item_set_closure(items, item_to_index)
-      value_stack.append(i)
-      state_stack.append(items)
-      items, item_to_index, reductions, _, _ = (
-        self.lookahead_item_set_shift(items, lookahead_character)
-      )
-      if len(items) != 0:
-        if len(reductions) != 0:
-          sys.stderr.write(
-            'shift/reduce conflict: {0:s} vs {1:s}\n'.format(
-              ','.join([str(i) for i, _, _ in next_items]),
-              ','.join([str(i) for i in reductions])
-            )
-          )
-        i += 1
-        lookahead_character = ord(text[i]) if i < len(text) else self.eof_terminal
-      elif len(reductions) != 0:
-        if len(reductions) != 1:
-          sys.stderr.write(
-            'reduce/reduce conflict: {0:s}\n'.format(
-              ','.join([str(i) for i in reductions])
-            )
-          )
-        reduce = min(reductions)
-        _, symbols, _, group_bounds = self.productions[reduce]
-        base = len(value_stack) - len(symbols) - 1
-        for j in range(len(group_bounds) - 1, -1, -1):
-          k, l, tag, _ = group_bounds[j]
-          k += base
-          if l != 1:
-            value_stack[k + 1:k + l + 1] = [value_stack[k + l]]
-          sys.stderr.write(
-            'text \'{0:s}\' tag \'{1:s}\'\n'.format(
-              text[value_stack[k]:value_stack[k + 1]],
-              tag
-            )
-          )
-        del value_stack[base + 1:]
-        del state_stack[base + 1:]
-        if reduce == 0:
-          assert base == 0
-          return
-        items, item_to_index, _, _ = (
-          self.lookahead_item_set_goto(state_stack[-1], reduce)
-        )
-        assert len(items) != 0
-      else:
-        raise Exception(
-          'syntax error at {0:d}: {1:s}'.format(i, text[i:])
-        )
-
-  def parse_yychunk(self, root, pos, off, factory, yychunk_iter):
-    if pos < 0:
-      pos, off = element.to_start_relative(root, pos, off)
-
-    items = [(0, 0, [self.eof_terminal, self.eof_terminal + 1])]
-    item_to_index = {(0, 0): 0}
-    value_stack = []
-    state_stack = []
-    text = element.get_text(root, pos)
-    while off >= len(text):
-      if pos < len(root):
-        pos += 1
-        off = 0
-      else:
-        try:
-          next(yychunk_iter)
-        except StopIteration:
-          lookahead_character = self.eof_terminal
-          break
-        text = element.get_text(root, pos)
-    else: 
-      lookahead_character = ord(text[off])
-    while True:
-      self.lookahead_item_set_closure(items, item_to_index)
-      value_stack.append((pos, off))
-      state_stack.append(items)
-      items, item_to_index, reductions, _, _ = (
-        self.lookahead_item_set_shift(items, lookahead_character)
-      )
-      if len(items) != 0:
-        if len(reductions) != 0:
-          sys.stderr.write(
-            'shift/reduce conflict: {0:s} vs {1:s}\n'.format(
-              ','.join([str(i) for i, _ in next_lookahead_item_set.keys()]),
-              ','.join([str(i) for i in reductions])
-            )
-          )
-        off += 1
-        while off >= len(text):
-          if pos < len(root):
-            pos += 1
-            off = 0
-          else:
-            try:
-              next(yychunk_iter)
-            except StopIteration:
-              lookahead_character = self.eof_terminal
-              break
-            text = element.get_text(root, pos)
-        else: 
-          lookahead_character = ord(text[off])
-      elif len(reductions) != 0:
-        if len(reductions) != 1:
-          sys.stderr.write(
-            'reduce/reduce conflict: {0:s}\n'.format(
-              ','.join([str(i) for i in reductions])
-            )
-          )
-        reduce = min(reductions)
-        _, symbols, _, group_bounds = self.productions[reduce]
-        base = len(value_stack) - len(symbols) - 1
-        end_relative = len(value_stack)
-        for j in range(len(group_bounds) - 1, -1, -1):
-          k, l, tag, kwargs = group_bounds[j]
-          k += base
-          assert k < end_relative
-          if l != 1:
-            value_stack[k + 1:k + l + 1] = [value_stack[k + l]]
-            end_relative = max(k + 1, end_relative + 1 - l)
-          while end_relative > k + 1:
-            end_relative -= 1
-            pos1, off1 = value_stack[end_relative]
-            value_stack[end_relative] = (
-              element.to_end_relative(root, pos1, off1)
-            )
-          pos0, off0 = value_stack[k]
-          pos1, off1 = value_stack[k + 1]
-          work.apply_markup(
-            root,
-            pos0,
-            off0,
-            pos1,
-            off1,
-            factory,
-            tag,
-            **kwargs
-          )
-        if end_relative < len(value_stack):
-          pos, off = value_stack[-1]
-          pos, off = element.to_start_relative(root, pos, off)
-          text = element.get_text(root, pos)
-        del value_stack[base + 1:]
-        del state_stack[base + 1:]
-        if reduce == 0:
-          assert base == 0
-          return
-        items, item_to_index, _, _ = (
-          self.lookahead_item_set_goto(state_stack[-1], reduce)
-        )
-        assert len(items) != 0
-      else:
-        raise Exception(
-          'syntax error at {0:d},{1:d}: {2:s}'.format(pos, off, text[off:])
-        )
-
  def to_clr1(self):
    # Compile this grammar into a canonical LR(1) ("CLR(1)") automaton,
    # returned as an LR1DFA.  Each DFA state corresponds to a closed set
    # of LR(1) items, an item being (production, dot position, lookahead
    # character set); states are merged only when the whole item set,
    # lookaheads included, is identical, so the machine is larger but
    # more precise than the LALR(1) machine built by to_lalr1().
    lr1dfa = LR1DFA(
      [],
      [
        (len(symbols), group_bounds)
        for _, symbols, _, group_bounds in self.productions
      ],
      self.n_terminals,
      self.eof_terminal
    )

    # initial state: start production with the dot at the left and
    # end-of-file as its lookahead
    items = [(0, 0, [self.eof_terminal, self.eof_terminal + 1])]
    item_to_index = {(0, 0): 0}
    self.lookahead_item_set_closure(items, item_to_index)

    items = sorted(items)
    key = tuple((i, j, tuple(k)) for i, j, k in items)
    state_to_items = [items]
    items_to_state = {key: 0}

    # state_to_items grows as successor states are discovered; keep going
    # until every discovered state has had its state_desc generated
    while len(lr1dfa.states) < len(state_to_items):
      items = state_to_items[len(lr1dfa.states)]
      # state_desc: (terminal breaks, actions, nonterminal breaks, gotos)
      state_desc = ([], [], [], [])

      # return the state number for a successor item set, creating a new
      # state if this (closed, sorted) item set has not been seen before
      def add_state(next_items, next_item_to_index):
        self.lookahead_item_set_closure(next_items, next_item_to_index)
        new_items = sorted(next_items)
        key = tuple((i, j, tuple(k)) for i, j, k in new_items)
        if key in items_to_state:
          state = items_to_state[key]
        else:
          state = len(state_to_items)
          state_to_items.append(new_items)
          items_to_state[key] = state
        return state

      # scan the terminal alphabet in runs: lookahead_item_set_shift
      # reports [terminal0, terminal1) over which its result is constant
      terminal = 0
      while terminal < self.n_terminals:
        next_items, next_item_to_index, reductions, terminal0, terminal1 = (
          self.lookahead_item_set_shift(items, terminal)
        )
        assert terminal0 == terminal and terminal1 > terminal
        if len(next_items) != 0:
          if len(reductions) != 0:
            # conflict is resolved in favour of the shift, but reported
            sys.stderr.write(
              'state {0:d} shift/reduce conflict: {1:s} vs {2:s}\n'.format(
                len(lr1dfa.states),
                ','.join([str(i) for i, _, _ in next_items]),
                ','.join([str(i) for i in reductions])
              )
            )
          action = add_state(next_items, next_item_to_index) * 2
        elif len(reductions) != 0:
          if len(reductions) != 1:
            # conflict is resolved via min() below, but reported
            sys.stderr.write(
              'state {0:d} reduce/reduce conflict: {1:s}\n'.format(
                len(lr1dfa.states),
                ','.join([str(i) for i in reductions])
              )
            )
          action = min(reductions) * 2 + 1 # odd means reduce
        else:
          action = -1
        state_desc[0].append(terminal1)
        state_desc[1].append(action)
        terminal = terminal1

      # same run-based scan over the nonterminals to fill in the gotos
      nonterminal = 0
      while nonterminal < len(self.productions):
        next_items, next_item_to_index, nonterminal0, nonterminal1 = (
          self.lookahead_item_set_goto(items, nonterminal)
        )
        assert nonterminal0 == nonterminal and nonterminal1 > nonterminal
        if len(next_items) != 0:
          goto = add_state(next_items, next_item_to_index)
        else:
          goto = -1
        state_desc[2].append(nonterminal1)
        state_desc[3].append(goto)
        nonterminal = nonterminal1

      lr1dfa.states.append(state_desc)
    return lr1dfa
-
  def to_lalr1(self):
    # Compile this grammar into an LALR(1) automaton, returned as an
    # LR1DFA.  Unlike to_clr1(), states are identified by their items
    # ignoring lookaheads, so states differing only in lookahead are
    # merged; the merged lookahead sets are then propagated to a fixed
    # point with a work-list.  The machine is much smaller than the
    # canonical one, at the cost of possible extra (reported) conflicts.
    lr1dfa = LR1DFA(
      [],
      [
        (len(symbols), group_bounds)
        for _, symbols, _, group_bounds in self.productions
      ],
      self.n_terminals,
      self.eof_terminal
    )

    # initial state: start production with the dot at the left and
    # end-of-file as its lookahead
    items = [(0, 0, [self.eof_terminal, self.eof_terminal + 1])]
    item_to_index = {(0, 0): 0}
    self.lookahead_item_set_closure(items, item_to_index)

    items = sorted(items)
    key = tuple((i, j) for i, j, _ in items) # ignore lookahead
    state_to_items = [items]
    items_to_state = {key: 0}

    # work-list of states whose lookahead sets have changed and whose
    # transitions must therefore be (re)computed
    in_queue = [True]
    queue = [0]

    qhead = 0
    while qhead < len(queue):
      i = queue[qhead]
      in_queue[i] = False
      items = state_to_items[i]
      # state_desc: (terminal breaks, actions, nonterminal breaks, gotos)
      state_desc = ([], [], [], [])

      # return the state number for a successor item set; if the same item
      # set (modulo lookaheads) already exists, merge the new lookaheads
      # into it and re-queue that state whenever they actually grew,
      # otherwise create a fresh state and queue it
      def add_state(next_items, next_item_to_index):
        self.lookahead_item_set_closure(next_items, next_item_to_index)
        new_items = sorted(next_items)
        key = tuple((i, j) for i, j, _ in new_items) # ignore lookahead
        if key in items_to_state:
          state = items_to_state[key]
          state_items = state_to_items[state]
          for i in range(len(new_items)):
            j, k, lookahead_set = new_items[i]
            lookahead_set = character_set_or(lookahead_set, state_items[i][2])
            if lookahead_set != state_items[i][2]:
              state_items[i] = (j, k, lookahead_set)
              if not in_queue[state]:
                in_queue[state] = True
                queue.append(state)
        else:
          state = len(state_to_items)
          state_to_items.append(new_items)
          items_to_state[key] = state
          in_queue.append(True)
          queue.append(state)
        return state

      # scan the terminal alphabet in runs: lookahead_item_set_shift
      # reports [terminal0, terminal1) over which its result is constant
      terminal = 0
      while terminal < self.n_terminals:
        next_items, next_item_to_index, reductions, terminal0, terminal1 = (
          self.lookahead_item_set_shift(items, terminal)
        )
        assert terminal0 == terminal and terminal1 > terminal
        if len(next_items) != 0:
          if len(reductions) != 0:
            # conflict is resolved in favour of the shift, but reported
            sys.stderr.write(
              'state {0:d} shift/reduce conflict: {1:s} vs {2:s}\n'.format(
                i,
                ','.join([str(j) for j, _, _ in next_items]),
                ','.join([str(j) for j in reductions])
              )
            )
          action = add_state(next_items, next_item_to_index) * 2
        elif len(reductions) != 0:
          if len(reductions) != 1:
            # conflict is resolved via min() below, but reported
            sys.stderr.write(
              'state {0:d} reduce/reduce conflict: {1:s}\n'.format(
                i,
                ','.join([str(j) for j in reductions])
              )
            )
          action = min(reductions) * 2 + 1 # odd means reduce
        else:
          action = -1
        state_desc[0].append(terminal1)
        state_desc[1].append(action)
        terminal = terminal1

      # same run-based scan over the nonterminals to fill in the gotos
      nonterminal = 0
      while nonterminal < len(self.productions):
        next_items, next_item_to_index, nonterminal0, nonterminal1 = (
          self.lookahead_item_set_goto(items, nonterminal)
        )
        assert nonterminal0 == nonterminal and nonterminal1 > nonterminal
        if len(next_items) != 0:
          goto = add_state(next_items, next_item_to_index)
        else:
          goto = -1
        state_desc[2].append(nonterminal1)
        state_desc[3].append(goto)
        nonterminal = nonterminal1

      # a state may be processed again after its lookaheads grow; in that
      # case its previously computed state_desc is overwritten in place
      if i < len(lr1dfa.states):
        lr1dfa.states[i] = state_desc
      else:
        lr1dfa.states.append(state_desc)
      qhead += 1
    return lr1dfa
-
-  def __repr__(self):
-    return 'regex.LR1({0:s}, {1:d}, {2:d})'.format(
-      repr(self.productions),
-      self.n_terminals,
-      self.eof_terminal
-    )
-
class LR1DFA:
  # A table-driven LR(1) parsing automaton, as produced by LR1.to_clr1()
  # or LR1.to_lalr1(): states hold breakpoint-compressed action and goto
  # tables which the parse drivers below interpret with binary search.
  def __init__(
    self,
    states = None,
    productions = None,
    n_terminals = n_characters + 1,
    eof_terminal = n_characters
  ):
    # states: list of state_desc
    # state_desc: (terminal breaks, actions, nonterminal breaks, gotos)
    # action: shift = new state * 2, reduce = production * 2 + 1, error = -1
    # goto: reduce = production, error = -1 (but error can't really happen)
    # productions: list of production
    # production: (len(symbols), group_bounds)
    # len(symbols): how many states to pop stack to reduce this production
    # group_bounds: list of group_bound
    # group_bound: (start_index, end_index, tag, kwargs)
    #   where start_index, end_index are indices into list of character_set,
    #   and tag, kwargs will be passed to apply_markup() hence factory(),
    #   noting that markup has to be applied in reverse order of the list
    # n_terminals: offset to apply to productions[] index to get symbol
    #   (character set code), also symbol for productions[0] = start production
    # eof_terminal: usually == n_terminals - 1 (must be valid terminal value)
    # bug fix: states and productions used shared mutable list defaults
    # ([]), so default-constructed instances aliased one another's tables
    # (they are mutated via .append by LR1.to_clr1()/to_lalr1()); a None
    # sentinel gives each instance a fresh list instead
    self.states = [] if states is None else states
    self.productions = [] if productions is None else productions
    self.n_terminals = n_terminals
    self.eof_terminal = eof_terminal

  def parse_text(self, text, i):
    # Parse the string text starting at offset i by interpreting the
    # action/goto tables.  Group boundaries are reported to stdout as
    # reductions occur.  Raises Exception on syntax error.
    state = 0
    value_stack = []        # offsets into text delimiting each parsed symbol
    state_stack = []        # DFA states, for the goto after a reduction
    lookahead_character = ord(text[i]) if i < len(text) else self.eof_terminal
    while True:
      value_stack.append(i)
      state_stack.append(state)
      # look up the action for the lookahead via the terminal breaks
      action = self.states[state][1][
        bisect.bisect_right(self.states[state][0], lookahead_character)
      ]
      if action == -1:
        raise Exception(
          'syntax error at {0:d}: {1:s}'.format(i, text[i:])
        )
      if (action & 1) == 0:
        # even action: shift to state action >> 1 and advance the input
        state = action >> 1
        i += 1
        lookahead_character = ord(text[i]) if i < len(text) else self.eof_terminal
      else:
        # odd action: reduce production action >> 1
        reduce = action >> 1
        len_symbols, group_bounds = self.productions[reduce]
        base = len(value_stack) - len_symbols - 1
        for j in range(len(group_bounds) - 1, -1, -1):
          k, l, tag, _ = group_bounds[j]
          k += base
          if l != 1:
            # collapse the l values of the group to a single (start, end) span
            value_stack[k + 1:k + l + 1] = [value_stack[k + l]]
          # NOTE(review): LR1.parse_text writes these to stderr; this one
          # uses stdout -- kept as-is, confirm which is intended
          sys.stdout.write(
            'text \'{0:s}\' tag \'{1:s}\'\n'.format(
              text[value_stack[k]:value_stack[k + 1]],
              tag
            )
          )
        del value_stack[base + 1:]
        del state_stack[base + 1:]
        if reduce == 0:
          # the start production was reduced: parse is complete
          assert base == 0
          return
        # goto: re-enter the uncovered state on the reduced nonterminal
        state = self.states[state_stack[-1]][3][
          bisect.bisect_right(self.states[state_stack[-1]][2], reduce)
        ]
        assert state != -1

  def parse_yychunk(self, root, pos, off, factory, yychunk_iter):
    # Parse element-tree text starting at (pos, off) in root, pulling
    # further chunks from yychunk_iter on demand; reduced groups are
    # marked up in the tree via work.apply_markup().
    # Raises Exception on syntax error.
    if pos < 0:
      pos, off = element.to_start_relative(root, pos, off)

    state = 0
    value_stack = []        # (pos, off) pairs delimiting each parsed symbol
    state_stack = []        # DFA states, for the goto after a reduction
    text = element.get_text(root, pos)
    # refill until a lookahead character is available; the while/else runs
    # ord() only when the loop exits without break (i.e. not at end of input)
    while off >= len(text):
      if pos < len(root):
        pos += 1
        off = 0
      else:
        try:
          next(yychunk_iter)
        except StopIteration:
          lookahead_character = self.eof_terminal
          break
        text = element.get_text(root, pos)
    else:
      lookahead_character = ord(text[off])
    while True:
      value_stack.append((pos, off))
      state_stack.append(state)
      # look up the action for the lookahead via the terminal breaks
      action = self.states[state][1][
        bisect.bisect_right(self.states[state][0], lookahead_character)
      ]
      #print('lookahead_character', lookahead_character, 'action', action)
      if action == -1:
        raise Exception(
          'syntax error at {0:d},{1:d}: {2:s}'.format(pos, off, text[off:])
        )
      if (action & 1) == 0:
        # even action: shift to state action >> 1 and advance the input
        state = action >> 1
        off += 1
        while off >= len(text):
          if pos < len(root):
            pos += 1
            off = 0
          else:
            try:
              next(yychunk_iter)
            except StopIteration:
              lookahead_character = self.eof_terminal
              break
            text = element.get_text(root, pos)
        else:
          lookahead_character = ord(text[off])
      else:
        # odd action: reduce production action >> 1
        reduce = action >> 1
        len_symbols, group_bounds = self.productions[reduce]
        base = len(value_stack) - len_symbols - 1
        end_relative = len(value_stack)
        for j in range(len(group_bounds) - 1, -1, -1):
          k, l, tag, kwargs = group_bounds[j]
          k += base
          assert k < end_relative
          if l != 1:
            # collapse the l values of the group to a single (start, end) span
            value_stack[k + 1:k + l + 1] = [value_stack[k + l]]
            end_relative = max(k + 1, end_relative + 1 - l)
          # positions beyond the markup point are made end-relative so they
          # survive the tree surgery performed by apply_markup()
          while end_relative > k + 1:
            end_relative -= 1
            pos1, off1 = value_stack[end_relative]
            value_stack[end_relative] = (
              element.to_end_relative(root, pos1, off1)
            )
          pos0, off0 = value_stack[k]
          pos1, off1 = value_stack[k + 1]
          work.apply_markup(
            root,
            pos0,
            off0,
            pos1,
            off1,
            factory,
            tag,
            **kwargs
          )
        if end_relative < len(value_stack):
          # the tree was modified: re-derive the current position and text
          pos, off = value_stack[-1]
          pos, off = element.to_start_relative(root, pos, off)
          text = element.get_text(root, pos)
        del value_stack[base + 1:]
        del state_stack[base + 1:]
        if reduce == 0:
          # the start production was reduced: parse is complete
          assert base == 0
          return
        # goto: re-enter the uncovered state on the reduced nonterminal
        state = self.states[state_stack[-1]][3][
          bisect.bisect_right(self.states[state_stack[-1]][2], reduce)
        ]
        assert state != -1

  def yyparse(self, root, pos, off, factory, yylex_iter):
    # Like parse_yychunk(), but driven by a lexer: yylex_iter yields
    # (end_pos, end_off, terminal) tokens instead of raw characters.
    # Raises Exception on syntax error.
    if pos < 0:
      pos, off = element.to_start_relative(root, pos, off)

    state = 0
    value_stack = []        # (pos, off) pairs delimiting each parsed symbol
    state_stack = []        # DFA states, for the goto after a reduction
    try:
      end_pos, end_off, lookahead_character = next(yylex_iter)
    except StopIteration:
      lookahead_character = self.eof_terminal
      end_pos, end_off = element.to_end_relative(root, pos, off)
    while True:
      value_stack.append((pos, off))
      state_stack.append(state)
      # look up the action for the lookahead via the terminal breaks
      action = self.states[state][1][
        bisect.bisect_right(self.states[state][0], lookahead_character)
      ]
      #print('lookahead_character', lookahead_character, 'action', action)
      if action == -1:
        raise Exception(
          'syntax error at {0:d},{1:d}: {2:d}'.format(pos, off, lookahead_character)
        )
      if (action & 1) == 0:
        # even action: shift to state action >> 1 and take the next token
        state = action >> 1
        pos, off = element.to_start_relative(root, end_pos, end_off)
        try:
          end_pos, end_off, lookahead_character = next(yylex_iter)
        except StopIteration:
          lookahead_character = self.eof_terminal
          #end_pos, end_off = element.to_end_relative(root, pos, off)
      else:
        # odd action: reduce production action >> 1
        reduce = action >> 1
        len_symbols, group_bounds = self.productions[reduce]
        base = len(value_stack) - len_symbols - 1
        end_relative = len(value_stack)
        for j in range(len(group_bounds) - 1, -1, -1):
          k, l, tag, kwargs = group_bounds[j]
          k += base
          assert k < end_relative
          if l != 1:
            # collapse the l values of the group to a single (start, end) span
            value_stack[k + 1:k + l + 1] = [value_stack[k + l]]
            end_relative = max(k + 1, end_relative + 1 - l)
          # positions beyond the markup point are made end-relative so they
          # survive the tree surgery performed by apply_markup()
          while end_relative > k + 1:
            end_relative -= 1
            pos1, off1 = value_stack[end_relative]
            value_stack[end_relative] = (
              element.to_end_relative(root, pos1, off1)
            )
          pos0, off0 = value_stack[k]
          pos1, off1 = value_stack[k + 1]
          work.apply_markup(
            root,
            pos0,
            off0,
            pos1,
            off1,
            factory,
            tag,
            **kwargs
          )
        if end_relative < len(value_stack):
          # the tree was modified: re-derive the current position
          pos, off = value_stack[-1]
          pos, off = element.to_start_relative(root, pos, off)
        del value_stack[base + 1:]
        del state_stack[base + 1:]
        if reduce == 0:
          # the start production was reduced: parse is complete
          assert base == 0
          return
        # goto: re-enter the uncovered state on the reduced nonterminal
        state = self.states[state_stack[-1]][3][
          bisect.bisect_right(self.states[state_stack[-1]][2], reduce)
        ]
        assert state != -1

  def __repr__(self):
    # constructor-style description of this automaton
    return 'regex.LR1DFA({0:s}, {1:s}, {2:d}, {3:d})'.format(
      repr(self.states),
      repr(self.productions),
      self.n_terminals,
      self.eof_terminal
    )
-
def wrap_repr(text, width):
  # Wrap a repr()-style string at roughly width columns, one chunk per
  # line.  Breaks are placed just after an opening bracket or a dot, or
  # just before a closing bracket or a space (the furthest-right such
  # point within the window wins); leading spaces at a break are eaten.
  out = []
  pos = 0
  length = len(text)
  while pos < length:
    cut = pos + width
    if cut < length:
      # candidates that break after the character found...
      candidates = [text.rfind(c, pos, cut) + 1 for c in '([{.']
      # ...and candidates that break before it
      candidates += [text.rfind(c, pos, cut + 1) for c in ')]} ']
      cut = max(candidates)
      assert cut > 0
    out.append(text[pos:cut] + '\n')
    pos = cut
    while text[pos:pos + 1] == ' ':
      pos += 1
  return ''.join(out)
-
 if __name__ == '__main__':
+  import sys
   import xml.etree.ElementTree
 
   regex = RegexAnd(children = [RegexRepeat(children = [RegexCharacterNot(