First cut at generating state machines compatible with a generic flex skeleton
authorNick Downing <downing.nick@gmail.com>
Wed, 27 Jun 2018 14:12:22 +0000 (00:12 +1000)
committerNick Downing <downing.nick@gmail.com>
Wed, 27 Jun 2018 14:12:22 +0000 (00:12 +1000)
16 files changed:
.gitignore [new file with mode: 0644]
ast.py [new file with mode: 0644]
ast.sh [new file with mode: 0755]
doc/Lex - A Lexical Analyzer Generator.pdf [new file with mode: 0644]
element.py [new file with mode: 0644]
generate.py [new file with mode: 0755]
plex.py [new file with mode: 0755]
regex.py [new file with mode: 0644]
regex.sh [new file with mode: 0755]
skel/Makefile [new file with mode: 0644]
skel/lex.yy.c.patch [new file with mode: 0644]
skel/skel.l [new file with mode: 0644]
tests/Makefile [new file with mode: 0644]
tests/cal.l [new file with mode: 0644]
tests/cal.l.xml [new file with mode: 0644]
work.py [new file with mode: 0644]

diff --git a/.gitignore b/.gitignore
new file mode 100644 (file)
index 0000000..9066073
--- /dev/null
@@ -0,0 +1,3 @@
+__pycache__
+lex.yy.c
+skel/lex.yy.c.orig
diff --git a/ast.py b/ast.py
new file mode 100644 (file)
index 0000000..40aba57
--- /dev/null
+++ b/ast.py
@@ -0,0 +1,451 @@
+import element
+import regex
+
+class Section1(element.Element):
+  # GENERATE ELEMENT() BEGIN
+  def __init__(
+    self,
+    tag = 'Section1',
+    attrib = {},
+    text = '',
+    children = []
+  ):
+    element.Element.__init__(
+      self,
+      tag,
+      attrib,
+      text,
+      children
+    )
+  def copy(self, factory = None):
+    result = element.Element.copy(
+      self,
+      Section1 if factory is None else factory
+    )
+    return result
+  def __repr__(self):
+    params = []
+    self.repr_serialize(params)
+    return 'ast.Section1({0:s})'.format(', '.join(params))
+  # GENERATE END
+
+class CodeBlock(element.Element):
+  # GENERATE ELEMENT() BEGIN
+  def __init__(
+    self,
+    tag = 'CodeBlock',
+    attrib = {},
+    text = '',
+    children = []
+  ):
+    element.Element.__init__(
+      self,
+      tag,
+      attrib,
+      text,
+      children
+    )
+  def copy(self, factory = None):
+    result = element.Element.copy(
+      self,
+      CodeBlock if factory is None else factory
+    )
+    return result
+  def __repr__(self):
+    params = []
+    self.repr_serialize(params)
+    return 'ast.CodeBlock({0:s})'.format(', '.join(params))
+  # GENERATE END
+
+class Option(element.Element):
+  # GENERATE ELEMENT() BEGIN
+  def __init__(
+    self,
+    tag = 'Option',
+    attrib = {},
+    text = '',
+    children = []
+  ):
+    element.Element.__init__(
+      self,
+      tag,
+      attrib,
+      text,
+      children
+    )
+  def copy(self, factory = None):
+    result = element.Element.copy(
+      self,
+      Option if factory is None else factory
+    )
+    return result
+  def __repr__(self):
+    params = []
+    self.repr_serialize(params)
+    return 'ast.Option({0:s})'.format(', '.join(params))
+  # GENERATE END
+  def process(self, options):
+    raise NotImplementedException
+
+class BoolOption(element.Element):
+  # GENERATE ELEMENT(bool value) BEGIN
+  def __init__(
+    self,
+    tag = 'BoolOption',
+    attrib = {},
+    text = '',
+    children = [],
+    value = False
+  ):
+    element.Element.__init__(
+      self,
+      tag,
+      attrib,
+      text,
+      children
+    )
+    self.value = (
+      element.deserialize_bool(value)
+    if isinstance(value, str) else
+      value
+    )
+  def serialize(self, ref_list, indent = 0):
+    element.Element.serialize(self, ref_list, indent)
+    self.set('value', element.serialize_bool(self.value))
+  def deserialize(self, ref_list):
+    element.Element.deserialize(self, ref_list)
+    self.value = element.deserialize_bool(self.get('value', 'false'))
+  def copy(self, factory = None):
+    result = element.Element.copy(
+      self,
+      BoolOption if factory is None else factory
+    )
+    result.value = self.value
+    return result
+  def repr_serialize(self, params):
+    element.Element.repr_serialize(self, params)
+    if self.value != False:
+      params.append(
+        'value = {0:s}'.format(repr(self.value))
+      )
+  def __repr__(self):
+    params = []
+    self.repr_serialize(params)
+    return 'ast.BoolOption({0:s})'.format(', '.join(params))
+  # GENERATE END
+
+class Options(Option):
+  class ECS(BoolOption):
+    # GENERATE ELEMENT() BEGIN
+    def __init__(
+      self,
+      tag = 'Options_ECS',
+      attrib = {},
+      text = '',
+      children = [],
+      value = False
+    ):
+      BoolOption.__init__(
+        self,
+        tag,
+        attrib,
+        text,
+        children,
+        value
+      )
+    def copy(self, factory = None):
+      result = BoolOption.copy(
+        self,
+        ECS if factory is None else factory
+      )
+      return result
+    def __repr__(self):
+      params = []
+      self.repr_serialize(params)
+      return 'ast.Options.ECS({0:s})'.format(', '.join(params))
+    # GENERATE END
+    def process(self, options):
+      options.ecs = self.value
+
+  class MetaECS(BoolOption):
+    # GENERATE ELEMENT() BEGIN
+    def __init__(
+      self,
+      tag = 'Options_MetaECS',
+      attrib = {},
+      text = '',
+      children = [],
+      value = False
+    ):
+      BoolOption.__init__(
+        self,
+        tag,
+        attrib,
+        text,
+        children,
+        value
+      )
+    def copy(self, factory = None):
+      result = BoolOption.copy(
+        self,
+        MetaECS if factory is None else factory
+      )
+      return result
+    def __repr__(self):
+      params = []
+      self.repr_serialize(params)
+      return 'ast.Options.MetaECS({0:s})'.format(', '.join(params))
+    # GENERATE END
+    def process(self, options):
+      options.meta_ecs = self.value
+
+  class YYWrap(BoolOption):
+    # GENERATE ELEMENT() BEGIN
+    def __init__(
+      self,
+      tag = 'Options_YYWrap',
+      attrib = {},
+      text = '',
+      children = [],
+      value = False
+    ):
+      BoolOption.__init__(
+        self,
+        tag,
+        attrib,
+        text,
+        children,
+        value
+      )
+    def copy(self, factory = None):
+      result = BoolOption.copy(
+        self,
+        YYWrap if factory is None else factory
+      )
+      return result
+    def __repr__(self):
+      params = []
+      self.repr_serialize(params)
+      return 'ast.Options.YYWrap({0:s})'.format(', '.join(params))
+    # GENERATE END
+    def process(self, options):
+      options.yywrap = self.value
+
+  # GENERATE ELEMENT() BEGIN
+  def __init__(
+    self,
+    tag = 'Options',
+    attrib = {},
+    text = '',
+    children = []
+  ):
+    Option.__init__(
+      self,
+      tag,
+      attrib,
+      text,
+      children
+    )
+  def copy(self, factory = None):
+    result = Option.copy(
+      self,
+      Options if factory is None else factory
+    )
+    return result
+  def __repr__(self):
+    params = []
+    self.repr_serialize(params)
+    return 'ast.Options({0:s})'.format(', '.join(params))
+  # GENERATE END
+  def process(self, options):
+    for i in self:
+      i.process(options)
+
+class Section2(element.Element):
+  # GENERATE ELEMENT() BEGIN
+  def __init__(
+    self,
+    tag = 'Section2',
+    attrib = {},
+    text = '',
+    children = []
+  ):
+    element.Element.__init__(
+      self,
+      tag,
+      attrib,
+      text,
+      children
+    )
+  def copy(self, factory = None):
+    result = element.Element.copy(
+      self,
+      Section2 if factory is None else factory
+    )
+    return result
+  def __repr__(self):
+    params = []
+    self.repr_serialize(params)
+    return 'ast.Section2({0:s})'.format(', '.join(params))
+  # GENERATE END
+
+class StartCondNone(element.Element):
+  # GENERATE ELEMENT() BEGIN
+  def __init__(
+    self,
+    tag = 'StartCondNone',
+    attrib = {},
+    text = '',
+    children = []
+  ):
+    element.Element.__init__(
+      self,
+      tag,
+      attrib,
+      text,
+      children
+    )
+  def copy(self, factory = None):
+    result = element.Element.copy(
+      self,
+      StartCondNone if factory is None else factory
+    )
+    return result
+  def __repr__(self):
+    params = []
+    self.repr_serialize(params)
+    return 'ast.StartCondNone({0:s})'.format(', '.join(params))
+  # GENERATE END
+
+class BOLRule(element.Element):
+  # GENERATE ELEMENT() BEGIN
+  def __init__(
+    self,
+    tag = 'BOLRule',
+    attrib = {},
+    text = '',
+    children = []
+  ):
+    element.Element.__init__(
+      self,
+      tag,
+      attrib,
+      text,
+      children
+    )
+  def copy(self, factory = None):
+    result = element.Element.copy(
+      self,
+      BOLRule if factory is None else factory
+    )
+    return result
+  def __repr__(self):
+    params = []
+    self.repr_serialize(params)
+    return 'ast.BOLRule({0:s})'.format(', '.join(params))
+  # GENERATE END
+
+class EOFRule(element.Element):
+  # GENERATE ELEMENT() BEGIN
+  def __init__(
+    self,
+    tag = 'EOFRule',
+    attrib = {},
+    text = '',
+    children = []
+  ):
+    element.Element.__init__(
+      self,
+      tag,
+      attrib,
+      text,
+      children
+    )
+  def copy(self, factory = None):
+    result = element.Element.copy(
+      self,
+      EOFRule if factory is None else factory
+    )
+    return result
+  def __repr__(self):
+    params = []
+    self.repr_serialize(params)
+    return 'ast.EOFRule({0:s})'.format(', '.join(params))
+  # GENERATE END
+
+class Rule(element.Element):
+  # GENERATE ELEMENT() BEGIN
+  def __init__(
+    self,
+    tag = 'Rule',
+    attrib = {},
+    text = '',
+    children = []
+  ):
+    element.Element.__init__(
+      self,
+      tag,
+      attrib,
+      text,
+      children
+    )
+  def copy(self, factory = None):
+    result = element.Element.copy(
+      self,
+      Rule if factory is None else factory
+    )
+    return result
+  def __repr__(self):
+    params = []
+    self.repr_serialize(params)
+    return 'ast.Rule({0:s})'.format(', '.join(params))
+  # GENERATE END
+
+class Section3(element.Element):
+  # GENERATE ELEMENT() BEGIN
+  def __init__(
+    self,
+    tag = 'Section3',
+    attrib = {},
+    text = '',
+    children = []
+  ):
+    element.Element.__init__(
+      self,
+      tag,
+      attrib,
+      text,
+      children
+    )
+  def copy(self, factory = None):
+    result = element.Element.copy(
+      self,
+      Section3 if factory is None else factory
+    )
+    return result
+  def __repr__(self):
+    params = []
+    self.repr_serialize(params)
+    return 'ast.Section3({0:s})'.format(', '.join(params))
+  # GENERATE END
+
+# GENERATE FACTORY(regex.factory) BEGIN
+tag_to_class = {
+  'Section1': Section1,
+  'CodeBlock': CodeBlock,
+  'Option': Option,
+  'BoolOption': BoolOption,
+  'Options': Options,
+  'Options_ECS': Options.ECS,
+  'Options_MetaECS': Options.MetaECS,
+  'Options_YYWrap': Options.YYWrap,
+  'Section2': Section2,
+  'StartCondNone': StartCondNone,
+  'BOLRule': BOLRule,
+  'EOFRule': EOFRule,
+  'Rule': Rule,
+  'Section3': Section3
+}
+def factory(tag, attrib = {}, *args, **kwargs):
+  return tag_to_class.get(tag, regex.factory)(tag, attrib, *args, **kwargs)
+# GENERATE END
diff --git a/ast.sh b/ast.sh
new file mode 100755 (executable)
index 0000000..dd16e2d
--- /dev/null
+++ b/ast.sh
@@ -0,0 +1,7 @@
+#!/bin/sh
+if ./generate.py ast <ast.py >ast.py.new && ! diff -q ast.py ast.py.new
+then
+  mv ast.py.new ast.py
+else
+  rm -f ast.py.new
+fi
diff --git a/doc/Lex - A Lexical Analyzer Generator.pdf b/doc/Lex - A Lexical Analyzer Generator.pdf
new file mode 100644 (file)
index 0000000..7b78929
Binary files /dev/null and b/doc/Lex - A Lexical Analyzer Generator.pdf differ
diff --git a/element.py b/element.py
new file mode 100644 (file)
index 0000000..e9bda98
--- /dev/null
@@ -0,0 +1,132 @@
+import sys
+import xml.etree.ElementTree
+
+class Element(xml.etree.ElementTree._Element_Py):
+  def __init__(self, tag = 'Element', attrib = {}, text = '', children = []):
+    xml.etree.ElementTree._Element_Py.__init__(self, tag, attrib)
+    self.ref = -1
+    set_text(self, 0, text)
+    self[:] = children
+  def serialize(self, ref_list, indent = 0):
+    if len(self):
+      for i in range(len(self)):
+        set_text(self, i, '\n' + ' ' * (indent + 2))
+        self[i].serialize(ref_list, indent + 2)
+      set_text(self, len(self), '\n' + ' ' * indent)
+  def deserialize(self, ref_list):
+    for i in self:
+      i.deserialize(ref_list)
+  def copy(self, factory = None):
+    result = (Element if factory is None else factory)(self.tag, self.attrib)
+    result.text = self.text
+    result.tail = self.tail
+    result[:] = [i.copy() for i in self]
+    return result
+  def repr_serialize(self, params):
+    if len(self):
+      params.append(
+        'children = [{0:s}]'.format(
+          ', '.join([repr(i) for i in self])
+        )
+      )
+  def __repr__(self):
+    params = []
+    self.repr_serialize(params)
+    return 'element.Element({0:s})'.format(', '.join(params))
+
+bool_to_str = ['false', 'true']
+def serialize_bool(value):
+  return bool_to_str[int(value)]
+
+str_to_bool = {'false': False, 'true': True}
+def deserialize_bool(text):
+  return str_to_bool[text]
+
+def serialize_int(value):
+  return str(value)
+
+def deserialize_int(text):
+  return int(text)
+
+def serialize_ref(value, ref_list):
+  if value is None:
+    ref = -1
+  else:
+    ref = value.ref
+    if ref < 0:
+      value.serialize(ref_list, 2)
+      ref = len(ref_list)
+      set_text(ref_list, ref, '\n  ')
+      ref_list.append(value)
+      value.ref = ref
+      value.set('ref', serialize_int(ref))
+  return str(ref)
+
+def deserialize_ref(text, ref_list):
+  ref = int(text)
+  return None if ref < 0 else ref_list[ref]
+
+def serialize_str(value):
+  return value
+
+def deserialize_str(text):
+  return text
+
+def serialize(root, fout, encoding = 'utf-8'):
+  ref_list = Element('RefList')
+  root.serialize(ref_list, 2)
+  ref = len(ref_list)
+  set_text(ref_list, ref, '\n  ')
+  ref_list.append(root)
+  root.ref = ref
+  root.set('ref', serialize_int(ref))
+  set_text(ref_list, ref + 1, '\n')
+  ref_list.tail = '\n'
+  xml.etree.ElementTree.ElementTree(ref_list).write(fout, encoding)
+  for i in ref_list:
+    i.ref = -1
+    del i.attrib['ref']
+
+def deserialize(fin, factory = Element, encoding = 'utf-8'):
+  ref_list = xml.etree.ElementTree.parse(
+    fin,
+    xml.etree.ElementTree.XMLParser(
+      target = xml.etree.ElementTree.TreeBuilder(factory),
+      encoding = encoding
+    )
+  ).getroot()
+  for i in ref_list:
+    i.deserialize(ref_list)
+    i.ref = -1
+    del i.attrib['ref']
+  return ref_list[-1]
+
+# compatibility scheme to access arbitrary xml.etree.ElementTree.Element-like
+# objects (not just Element defined above) using a more consistent interface:
+def get_text(root, i):
+  if i < 0:
+    i += len(root) + 1
+  text = root.text if i == 0 else root[i - 1].tail
+  return '' if text is None else text
+
+def set_text(root, i, text):
+  if i < 0:
+    i += len(root) + 1
+  if len(text) == 0:
+    text = None
+  if i == 0:
+    root.text = text
+  else:
+    root[i - 1].tail = text
+
+def to_end_relative(root, pos, off):
+  assert pos >= 0 and off >= 0
+  off -= len(get_text(root, pos))
+  pos -= len(root) + 1
+  return pos, off
+
+def to_start_relative(root, pos, off):
+  assert pos < 0 and off <= 0
+  pos += len(root) + 1
+  off += len(get_text(root, pos))
+  return pos, off
diff --git a/generate.py b/generate.py
new file mode 100755 (executable)
index 0000000..ae8aae5
--- /dev/null
@@ -0,0 +1,404 @@
+#!/usr/bin/env python3
+
+import re
+import sys
+
+if len(sys.argv) >= 2:
+  package_name = '{0:s}.'.format(sys.argv[1])
+else:
+  package_name = ''
+
+default_value = {
+  'bool': 'False',
+  'int': '-1',
+  'ref': 'None',
+  'str': '\'\'',
+  'list(bool)': '[]',
+  'list(int)': '[]',
+  'list(ref)': '[]',
+  'list(str)': '[]'
+}
+default_value_str = {
+  'bool': 'false',
+  'int': '-1',
+  'ref': '-1',
+  'str': ''
+}
+
+re_class = re.compile(
+  '([\t ]*)class ([A-Za-z_][A-Za-z0-9_]*)\(([A-Za-z_][A-Za-z0-9_.]*)'
+)
+re_element = re.compile(
+  '([\t ]*)# GENERATE ELEMENT\((([^()]|\([^()]*\))*)\)( BEGIN)?'
+)
+re_factory = re.compile(
+  '([\t ]*)# GENERATE FACTORY\(([^()]*)\)( BEGIN)?'
+)
+stack = []
+classes = []
+base_classes = {'element.Element': []} # params
+
+line = sys.stdin.readline()
+while len(line):
+  match = re_class.match(line)
+  if match is not None:
+    sys.stdout.write(line)
+    indent = match.group(1)
+    class_name = match.group(2)
+    base_class = match.group(3)
+    for i in range(len(stack)):
+      if stack[i][0][:len(indent)] == indent:
+        del stack[i:]
+        break
+    full_name = (
+      '{0:s}.{1:s}'.format(stack[-1][1], class_name)
+    if len(stack) else
+      class_name
+    )
+    stack.append((indent, class_name, full_name, base_class))
+    if base_class in base_classes:
+      classes.append(full_name)
+      base_classes[full_name] = list(base_classes[base_class])
+  else:
+    match = re_element.match(line)
+    if match is not None:
+      indent = match.group(1)
+      params = match.group(2)
+      begin = match.group(4)
+
+      for i in range(len(stack)):
+        if stack[i][0][:len(indent)] == indent:
+          del stack[i:]
+          break
+      _, class_name, full_name, base_class = stack[-1]
+      fields = params.split(',')
+      if fields[-1] == '':
+        del fields[-1:]
+      fields = [i.split() for i in fields]
+      fields = [(type, name) for [type, name] in fields]
+      base_classes[full_name].extend(fields)
+
+      sys.stdout.write(
+        '''{0:s}# GENERATE ELEMENT({1:s}) BEGIN
+{2:s}def __init__(
+{3:s}  self,
+{4:s}  tag = '{5:s}',
+{6:s}  attrib = {{}},
+{7:s}  text = '',
+{8:s}  children = []{9:s}
+{10:s}):
+{11:s}  {12:s}.__init__(
+{13:s}    self,
+{14:s}    tag,
+{15:s}    attrib,
+{16:s}    text,
+{17:s}    children{18:s}
+{19:s}  )
+'''.format(
+          indent,
+          params,
+          indent,
+          indent,
+          indent,
+          full_name.replace('.', '_'),
+          indent,
+          indent,
+          indent,
+          ''.join(
+            [
+              ',\n{0:s}  {1:s} = {2:s}'.format(
+                indent,
+                name,
+                default_value[type]
+              )
+              for type, name in base_classes[full_name]
+            ]
+          ),
+          indent,
+          indent,
+          base_class,  
+          indent,
+          indent,
+          indent,
+          indent,
+          indent,
+          ''.join(
+            [
+              ',\n{0:s}    {1:s}'.format(
+                indent,
+                name
+              )
+              for type, name in base_classes[base_class]
+            ]
+          ),
+          indent
+        )
+      )
+      for type, name in fields:
+        if type == 'ref' or type == 'list(ref)' or type == 'str':
+          sys.stdout.write(
+            '''{0:s}  self.{1:s} = {2:s}
+'''.format(indent, name, name)
+          )
+        elif type[:5] == 'list(' and type[-1:] == ')':
+          subtype = type[5:-1]
+          sys.stdout.write(
+            '''{0:s}  self.{1:s} = (
+{2:s}    [element.deserialize_{3:s}(i) for i in {4:s}.split()]
+{5:s}  if isinstance({6:s}, str) else
+{7:s}    {8:s}
+{9:s}  )
+'''.format(
+              indent,
+              name,
+              indent,
+              subtype,
+              name,
+              indent,
+              name,
+              indent,
+              name,
+              indent
+            )
+          )
+        else:
+          sys.stdout.write(
+            '''{0:s}  self.{1:s} = (
+{2:s}    element.deserialize_{3:s}({4:s})
+{5:s}  if isinstance({6:s}, str) else
+{7:s}    {8:s}
+{9:s}  )
+'''.format(
+              indent,
+              name,
+              indent,
+              type,
+              name,
+              indent,
+              name,
+              indent,
+              name,
+              indent
+            )
+          )
+      if len(fields):
+        sys.stdout.write(
+          '''{0:s}def serialize(self, ref_list, indent = 0):
+{1:s}  {2:s}.serialize(self, ref_list, indent)
+'''.format(indent, indent, base_class)
+        )
+        for type, name in fields:
+          if type[:5] == 'list(' and type[-1:] == ')':
+            subtype = type[5:-1]
+            sys.stdout.write(
+              '''{0:s}  self.set(
+{1:s}    '{2:s}',
+{3:s}    ' '.join([element.serialize_{4:s}(i{5:s}) for i in self.{6:s}])
+{7:s}  )
+'''.format(
+                indent,
+                indent,
+                name,
+                indent,
+                subtype,
+                ', ref_list' if subtype == 'ref' else '',
+                name,
+                indent
+              )
+            )
+          else:
+            sys.stdout.write(
+              '''{0:s}  self.set('{1:s}', element.serialize_{2:s}(self.{3:s}{4:s}))
+'''.format(
+                indent,
+                name,
+                type,
+                name,
+                ', ref_list' if type == 'ref' else ''
+              )
+            )
+        sys.stdout.write(
+          '''{0:s}def deserialize(self, ref_list):
+{1:s}  {2:s}.deserialize(self, ref_list)
+'''.format(indent, indent, base_class)
+        )
+        for type, name in fields:
+          if type[:5] == 'list(' and type[-1:] == ')':
+            subtype = type[5:-1]
+            sys.stdout.write(
+              '''{0:s}  self.{1:s} = [
+{2:s}    element.deserialize_{3:s}(i{4:s})
+{5:s}    for i in self.get('{6:s}', '').split()
+{7:s}  ]
+'''.format(
+                indent,
+                name,
+                indent,
+                subtype,
+                ', ref_list' if subtype == 'ref' else '',
+                indent,
+                name,
+                indent
+              )
+            )
+          else: 
+            sys.stdout.write(
+              '''{0:s}  self.{1:s} = element.deserialize_{2:s}(self.get('{3:s}', '{4:s}'){5:s})
+'''.format(
+                indent,
+                name,
+                type,
+                name,
+                default_value_str[type],
+                ', ref_list' if type == 'ref' else ''
+              )
+            )
+      sys.stdout.write(
+        '''{0:s}def copy(self, factory = None):
+{1:s}  result = {2:s}.copy(
+{3:s}    self,
+{4:s}    {5:s} if factory is None else factory
+{6:s}  ){7:s}
+{8:s}  return result
+'''.format(
+          indent,
+          indent,
+          base_class,
+          indent,
+          indent,
+          class_name,
+          indent,
+          ''.join(
+            [
+              '\n{0:s}  result.{1:s} = self.{2:s}'.format(
+                indent,
+                name,
+                name
+              )
+              for _, name in fields
+            ]
+          ),
+          indent
+        )
+      )
+      if len(fields):
+        sys.stdout.write(
+          '''{0:s}def repr_serialize(self, params):
+{1:s}  {2:s}.repr_serialize(self, params)
+'''.format(
+            indent,
+            indent,
+            base_class
+          )
+        )
+        for type, name in fields:
+          if type[:5] == 'list(' and type[-1:] == ')':
+            subtype = type[5:-1]
+            sys.stdout.write(
+              '''{0:s}  if len(self.{1:s}):
+{2:s}    params.append(
+{3:s}      '{4:s} = [{{0:s}}]'.format(
+{5:s}        ', '.join([repr(i) for i in self.{6:s}])
+{7:s}      )
+{8:s}    )
+'''.format(
+                indent,
+                name,
+                indent,
+                indent,
+                name,
+                indent,
+                name,
+                indent,
+                indent
+              )
+            )
+          else:
+            sys.stdout.write(
+              '''{0:s}  if self.{1:s} != {2:s}:
+{3:s}    params.append(
+{4:s}      '{5:s} = {{0:s}}'.format(repr(self.{6:s}))
+{7:s}    )
+'''.format(
+                indent,
+                name,
+                default_value[type],
+                indent,
+                indent,
+                name,
+                name,
+                indent
+              )
+            )
+      sys.stdout.write(
+        '''{0:s}def __repr__(self):
+{1:s}  params = []
+{2:s}  self.repr_serialize(params)
+{3:s}  return '{4:s}{5:s}({{0:s}})'.format(', '.join(params))
+{6:s}# GENERATE END
+'''.format(
+          indent,
+          indent,
+          indent,
+          indent,
+          package_name,
+          full_name,
+          indent
+        )
+      )
+      if begin is not None:
+        line = sys.stdin.readline()
+        while len(line):
+          if line.strip() == '# GENERATE END':
+            break
+          line = sys.stdin.readline()
+        else:
+          assert False
+    else:
+      match = re_factory.match(line)
+      if match is not None:
+        indent = match.group(1)
+        param = match.group(2)
+        begin = match.group(3)
+
+        sys.stdout.write(
+          '''{0:s}# GENERATE FACTORY({1:s}) BEGIN
+{2:s}tag_to_class = {{{3:s}
+{4:s}}}
+{5:s}def factory(tag, attrib = {{}}, *args, **kwargs):
+{6:s}  return tag_to_class.get(tag, {7:s})(tag, attrib, *args, **kwargs)
+{8:s}# GENERATE END
+'''.format(
+            indent,
+            param,
+            indent,
+            ','.join(
+              [
+                '\n{0:s}  \'{1:s}\': {2:s}'.format(
+                  indent,
+                  i.replace('.', '_'),
+                  i
+                )
+                for i in classes
+              ]
+            ),
+            indent,
+            indent,
+            indent,
+            param,
+            indent
+          )
+        )
+
+        if begin is not None:
+          line = sys.stdin.readline()
+          while len(line):
+            if line.strip() == '# GENERATE END':
+              break
+            line = sys.stdin.readline()
+          else:
+            assert False
+      else:
+        sys.stdout.write(line)
+  line = sys.stdin.readline()
diff --git a/plex.py b/plex.py
new file mode 100755 (executable)
index 0000000..e4bbf12
--- /dev/null
+++ b/plex.py
@@ -0,0 +1,416 @@
+#!/usr/bin/env python3
+
+import ast
+import element
+#import lex
+import numpy
+import re
+import regex
+import sys
+import work
+import xml.etree.ElementTree
+#import yacc
+
+class Options:
+  def __init__(self):
+    self.ecs = False
+    self.meta_ecs = False
+    self.yywrap = True
+
+class FlexDFA:
+  YY_TRAILING_MASK = 0x2000
+  YY_TRAILING_HEAD_MASK = 0x4000
+
+  def __init__(self, dfa):
+    # state 0 is the jam state, the EOB state will be added at the end
+    self.states = [([], 0, 0)] # accept, base, def
+    self.entries = [(0, 0)] * 0x101 # nxt, chk
+
+    # this is basically just a renumbering
+
+    # state numbers in the DFA become base/def numbers in the FlexDFA,
+    # obviously with gaps in the numbering depending on how things fit
+    state_to_flex_base_def = {}
+
+    # action numbers in the DFA become state numbers in the FlexDFA,
+    # with the start-action for each start-condition being copied into
+    # the correctly numbered slot (may cause duplicates), and then all
+    # actions reachable from these being copied to the subsequent slots
+    # (if start-action is reached again, uses the lower numbered copy)
+    flex_state_to_action = [0] + dfa.start_action
+    action_to_flex_state = {-1: 0} # see comment about state -1 below
+    for i in range(len(flex_state_to_action)):
+      action = flex_state_to_action[i]
+      if action not in action_to_flex_state:
+        action_to_flex_state[action] = i
+
+    # last start-condition is really end-of-buffer (EOB), it has only
+    # a dummy rule that accepts the null string and executes EOB action
+    eob_state = len(dfa.start_action)
+
+    # full_entries[i, j] is transition on character j in state i
+    # in our way of thinking, 0 is don't care and -1 is failure
+    # however, in the flex way these are both 0 (don't care),
+    # the distinction being that failure has no associated action
+    # thus all entries of states are filled, with 0 as a catch-all
+    full_entries = numpy.zeros((0x100, 0x101), numpy.int16)
+    full_entries[0, 0] = eob_state # all states go to EOB on NUL
+    used = numpy.zeros(0x200, numpy.bool)
+    used[:0x101] = True # account for the jam (don't care) state
+
+    while len(self.states) < len(flex_state_to_action):
+      action = flex_state_to_action[len(self.states)]
+      state, transition = dfa.actions[action]
+
+      # we collect marks without regard to which thread they refer to,
+      # they should already be in priority order without any duplicates
+      # (because the deduplication removes subsequent identical threads)
+      flex_accept = []
+      for j in [i[2] for i in transition if i[0] == regex.DFA.TRANSITION_MARK]:
+        if j & 1:
+          if (
+            len(flex_accept) > 0 and
+            flex_accept[-1] == (j >> 1) | FlexDFA.YY_TRAILING_HEAD_MASK
+          ):
+            # zero length trailing context, accept immediately
+            flex_accept.append(j >> 1)
+          else:
+            # look back to start of trailing context, then accept
+            flex_accept.append((j >> 1) | FlexDFA.YY_HEAD_MASK)
+        else:
+          # mark start of (hopefully safe) trailing context
+          flex_accept.append((j >> 1) | FlexDFA.YY_TRAILING_HEAD_MASK)
+
+      if state in state_to_flex_base_def:
+        flex_base, flex_def = state_to_flex_base[state]
+      else:
+        # extend full_entries array if required
+        if len(self.states) >= full_entries.shape[0]:
+          new_full_entries = numpy.zeros(
+            (full_entries.shape[0] * 2, 0x101),
+            numpy.int16
+          )
+          new_full_entries[:full_entries.shape[0], :] = full_entries
+          full_entries = new_full_entries
+
+        # calculate full entry from dfa.state char-to-action table
+        breaks, actions, _ = dfa.states[state]
+        char0 = 0
+        for i in range(len(breaks)):
+          char1 = breaks[i]
+          next_action = actions[i]
+          if next_action in action_to_flex_state:
+            next_flex_state = action_to_flex_state[next_action]
+          else:
+            next_flex_state = len(flex_state_to_action)
+            action_to_flex_state[next_action] = next_flex_state
+            flex_state_to_action.append(next_action)
+          full_entries[len(self.states), char0:char1] = next_flex_state
+          char0 = char1
+        assert char0 == 0x100
+
+        # remap NUL transition to 0x100 and replace with EOB transition
+        full_entries[len(self.states), 0x100] = \
+          full_entries[len(self.states), 0]
+        full_entries[len(self.states), 0] = eob_state
+        #print(len(self.states), full_entries[len(self.states), :])
+
+        # find most similar/earliest existing state to use as default
+        mask = (
+          full_entries[len(self.states):len(self.states) + 1, :] !=
+          full_entries[:len(self.states), :]
+        )
+        diff = numpy.sum(mask, 1)
+        flex_def = numpy.argmin(diff, 0)
+        if diff[flex_def] == 0: # exactly matching state
+          # this is supposed to have been compressed by DFA already,
+          # but can happen since we ignore the accept field of the DFA,
+          # just copy the base and def fields from the matching state
+          _, flex_base, flex_def = self.states[flex_def]
+        else:
+          mask = mask[flex_def, :]
+
+          # make sure used array is at least large enough to find a spot
+          while used.shape[0] < len(self.entries) + 0x101:
+            new_used = numpy.zeros((used.shape[0] * 2,), numpy.bool)
+            new_used[:used.shape[0]] = used
+            used = new_used
+
+          # find a suitable spot and store differences from default state
+          flex_base = 0
+          while flex_base < len(self.entries):
+            if not numpy.any(used[flex_base:flex_base + 0x101] & mask):
+              break
+            flex_base += 1
+          used[flex_base:flex_base + 0x101] |= mask
+          if len(self.entries) < flex_base + 0x101:
+            self.entries.extend(
+              [(0xffff, 0xffff)] * (flex_base + 0x101 - len(self.entries))
+            )
+          for i in numpy.nonzero(mask)[0]:
+            assert self.entries[flex_base + i] == (0xffff, 0xffff)
+            self.entries[flex_base + i] = (
+              full_entries[len(self.states), i],
+              len(self.states)
+            )
+
+      self.states.append((flex_accept, flex_base, flex_def))
+
+if len(sys.argv) < 2:
+  sys.stdout.write(
+    'usage: {0:s} rules.l\n'.format(
+      sys.argv[0]
+    )
+  )
+  sys.exit(1)
+
+#root = element.Element('root')
+#mark = []
+#macro_dict = {}
+#with open(sys.argv[1]) as fin:
+#  assert not yacc.yyparse(
+#    root,
+#    mark,
+#    lex.yylex(root, mark, macro_dict, work.yychunk_line(root, fin))
+#  )
+#def post_process(node):
+#  if isinstance(node, regex.Regex):
+#    node.post_process()
+#  else:
+#    for i in node:
+#      post_process(i)
+#post_process(root)
+with open(sys.argv[1] + '.xml') as fin:
+  root = element.deserialize(fin, ast.factory)
+#xml.etree.ElementTree.dump(root)
+
+options = Options()
+assert isinstance(root[0], ast.Section1)
+for i in root[0]:
+  if isinstance(i, ast.Options):
+    i.process(options)
+#print(options.yywrap)
+
+nfa = regex.NFA()
+
+expr = regex.RegexNone()
+actions = []
+assert isinstance(root[1], ast.Section2)
+for i in root[1]:
+  if isinstance(i, ast.Rule):
+    assert isinstance(i[0], ast.StartCondNone)
+    if isinstance(i[1], ast.BOLRule):
+      assert False
+    elif isinstance(i[1], ast.EOFRule):
+      assert False
+    else:
+      expr = regex.RegexOr(
+        children = [
+          expr,
+          regex.RegexSequence(
+            children = [
+              i[1],
+              regex.RegexGroup(
+                children = [
+                  i[2]
+                ]
+              )
+            ]
+          )
+        ]
+      )
+      actions.append(i[3])
+expr = regex.RegexAnd(
+  children = [
+    regex.RegexRepeat(
+      count0 = 0,
+      children = [
+        regex.RegexCharacterNot(
+          children = [
+            regex.RegexCharacter()
+          ]
+        )
+      ]
+    ),
+    expr
+  ]
+)
+expr.post_process()
+expr.add_to_nfa(nfa)
+
+# add EOB rule
+expr = regex.RegexGroup(children = [regex.RegexEmpty()])
+expr.post_process(group_index = len(actions))
+expr.add_to_nfa(nfa)
+
+flex_dfa = FlexDFA(nfa.to_dfa())
+with open('skel/lex.yy.c', 'r') as fin:
+  with open('lex.yy.c', 'w+') as fout:
+    line = fin.readline()
+    while len(line):
+      if line == '/* GENERATE SECTION1 */\n':
+        fout.write(
+          '''/* GENERATE SECTION1 BEGIN */
+{0:s}/*GENERATE SECTION1 END*/
+'''.format(
+            ''.join(
+              [
+                element.get_text(i, 0)
+                for i in root[0]
+                if isinstance(i, ast.CodeBlock)
+              ]
+            )
+          )
+        )
+      elif line == '/* GENERATE TABLES */\n':
+        yy_acclist = []
+        yy_accept = [0]
+        for flex_accept, _, _ in flex_dfa.states:
+          yy_acclist.extend(flex_accept)
+          yy_accept.append(len(yy_acclist))
+        fout.write(
+          '''/* GENERATE TABLES BEGIN */
+#define YY_END_OF_BUFFER {0:d}
+static const flex_int16_t yy_acclist[] = {{{1:s}
+}};
+static const flex_int16_t yy_accept[] = {{{2:s}
+}};
+static const flex_int16_t yy_base[] = {{{3:s}
+}};
+static const flex_int16_t yy_def[] = {{{4:s}
+}};
+static const flex_int16_t yy_nxt[] = {{{5:s}
+}};
+static const flex_int16_t yy_chk[] = {{{6:s}
+}};
+/* GENERATE TABLES END */
+'''.format(
+            len(actions),
+            ','.join(
+              [
+                '\n\t{0:s}'.format(
+                  ', '.join(
+                    [
+                      '{0:5d}'.format(j)
+                      for j in yy_acclist[i:i + 10]
+                    ]
+                  )
+                )
+                for i in range(0, len(yy_acclist), 10)
+              ]
+            ),
+            ','.join(
+              [
+                '\n\t{0:s}'.format(
+                  ', '.join(
+                    [
+                      '{0:5d}'.format(j)
+                      for j in yy_accept[i:i + 10]
+                    ]
+                  )
+                )
+                for i in range(0, len(yy_accept), 10)
+              ]
+            ),
+            ','.join(
+              [
+                '\n\t{0:s}'.format(
+                  ', '.join(
+                    [
+                      '{0:5d}'.format(j)
+                      for _, j, _ in flex_dfa.states[i:i + 10]
+                    ]
+                  )
+                )
+                for i in range(0, len(flex_dfa.states), 10)
+              ]
+            ),
+            ','.join(
+              [
+                '\n\t{0:s}'.format(
+                  ', '.join(
+                    [
+                      '{0:5d}'.format(j)
+                      for _, _, j in flex_dfa.states[i:i + 10]
+                    ]
+                  )
+                )
+                for i in range(0, len(flex_dfa.states), 10)
+              ]
+            ),
+            ','.join(
+              [
+                '\n\t{0:s}'.format(
+                  ', '.join(
+                    [
+                      '{0:5d}'.format(j)
+                      for j, _ in flex_dfa.entries[i:i + 10]
+                    ]
+                  )
+                )
+                for i in range(0, len(flex_dfa.entries), 10)
+              ]
+            ),
+            ','.join(
+              [
+                '\n\t{0:s}'.format(
+                  ', '.join(
+                    [
+                      '{0:5d}'.format(j)
+                      for _, j in flex_dfa.entries[i:i + 10]
+                    ]
+                  )
+                )
+                for i in range(0, len(flex_dfa.entries), 10)
+              ]
+            )
+          )
+        )
+      elif line == '/* GENERATE SECTION2INITIAL */\n':
+        fout.write(
+          '''/* GENERATE SECTION2INITIAL BEGIN */
+/* GENERATE SECTION2INITIAL END */
+'''.format(
+            ''.join(
+              [
+                element.get_text(i, 0)
+                for i in root[1]
+                if isinstance(i, ast.CodeBlock)
+              ]
+            )
+          )
+        )
+      elif line == '/* GENERATE SECTION2 */\n':
+        fout.write(
+          '''/* GENERATE SECTION2 BEGIN */
+{0:s}  case YY_STATE_EOF(INITIAL):
+               yyterminate();
+/* GENERATE SECTION2 END */
+'''.format(
+            ''.join(
+              [
+                '''    case {0:d}:
+               YY_RULE_SETUP
+               {1:s}           YY_BREAK
+
+'''.format(
+                  i,
+                  element.get_text(actions[i], 0)
+                )
+                for i in range(len(actions))
+              ]
+            )
+          )
+        )
+      elif line == '/* GENERATE SECTION3 */\n':
+        assert len(root) < 2 or isinstance(root[2], ast.Section3)
+        fout.write(
+          '''/* GENERATE SECTION3 BEGIN */
+{0:s}/*GENERATE SECTION3 END */
+'''.format(
+            element.get_text(root[2], 0) if len(root) >= 3 else ''
+          )
+        )
+      else:
+        fout.write(line)
+      line = fin.readline()
diff --git a/regex.py b/regex.py
new file mode 100644 (file)
index 0000000..d9da32a
--- /dev/null
+++ b/regex.py
@@ -0,0 +1,2692 @@
+import bisect
+import element
+import work
+import sys
+
+# defines the alphabet size, set this to 0x11000 for unicode
+chars = 0x100
+
+def char_set_or(char_set0, char_set1):
+  # calculate union of the child sets
+  # we do this by calculating a series of breakpoints, at each breakpoint
+  # evaluating the "or" (max) of the even/odd truth values of each child,
+  # then making the output truth value even/odd by outputting if necessary
+  result = []
+  i = 0
+  j = 0
+  while True:
+    if i < len(char_set0):
+      k = char_set0[i]
+      if j < len(char_set1):
+        k = min(k, char_set1[j])
+    elif j < len(char_set1):
+      k = char_set1[j]
+    else:
+      break
+    if i < len(char_set0) and char_set0[i] == k:
+      i += 1
+    if j < len(char_set1) and char_set1[j] == k:
+      j += 1
+    if (len(result) & 1) != max(i & 1, j & 1):
+      result.append(k)
+  assert (i & 1) == 0 and (j & 1) == 0
+  return result
+
+def char_set_and(char_set0, char_set1):
+  # calculate intersection of the child sets
+  # we do this by calculating a series of breakpoints, at each breakpoint
+  # evaluating the "and" (min) of the even/odd truth values of each child,
+  # then making the output truth value even/odd by outputting if necessary
+  result = []
+  i = 0
+  j = 0
+  while True:
+    if i < len(char_set0):
+      k = char_set0[i]
+      if j < len(char_set1):
+        k = min(k, char_set1[j])
+    elif j < len(char_set1):
+      k = char_set1[j]
+    else:
+      break
+    if i < len(char_set0) and char_set0[i] == k:
+      i += 1
+    if j < len(char_set1) and char_set1[j] == k:
+      j += 1
+    if (len(result) & 1) != min(i & 1, j & 1):
+      result.append(k)
+  assert (i & 1) == 0 and (j & 1) == 0
+  return result
+
+def char_set_not(char_set):
+  # calculate complement of the child set
+  # if child set begins with [0], remove it, otherwise add [0] prefix
+  # if child set ends with [chars], remove it, otherwise add [chars] suffix
+  # the suffix part is not totally necessary, but makes sure length is even
+  # (the evenness is so that single character sets can always be [c, c + 1])
+  result = list(char_set)
+  if result[:1] == [0]:
+    del result[:1]
+  else:
+    result[:0] = [0]
+  if result[-1:] == [chars]:
+    del result[-1:]
+  else:
+    result.append(chars)
+  return result
+
+class Regex(element.Element):
+  # GENERATE ELEMENT() BEGIN
+  def __init__(
+    self,
+    tag = 'Regex',
+    attrib = {},
+    text = '',
+    children = []
+  ):
+    element.Element.__init__(
+      self,
+      tag,
+      attrib,
+      text,
+      children
+    )
+  def copy(self, factory = None):
+    result = element.Element.copy(
+      self,
+      Regex if factory is None else factory
+    )
+    return result
+  def __repr__(self):
+    params = []
+    self.repr_serialize(params)
+    return 'regex.Regex({0:s})'.format(', '.join(params))
+  # GENERATE END
+  def post_process(self, group_index = 0, rule_char_sets = None):
+    for i in self:
+      group_index = i.post_process(group_index, rule_char_sets)
+    return group_index
+  def to_groups(self, groups):
+    for i in self:
+      i.to_groups(groups)
+  def to_nfa_state(self, nfa, next_state):
+    raise NotImplementedException
+  def add_to_nfa(self, nfa):
+    nfa.start_state.append(self.to_nfa_state(nfa, 0))
+  def to_lr1_symbols(self, terminal_thres, symbols, lookaheads, group_bounds):
+    group_count = 0
+    for i in self:
+      group_count += (
+        i.to_lr1_symbols(terminal_thres, symbols, lookaheads, group_bounds)
+      )
+    return group_count # count of groups or ungrouped characters
+
+class RegexNone(Regex):
+  # GENERATE ELEMENT() BEGIN
+  def __init__(
+    self,
+    tag = 'RegexNone',
+    attrib = {},
+    text = '',
+    children = []
+  ):
+    Regex.__init__(
+      self,
+      tag,
+      attrib,
+      text,
+      children
+    )
+  def copy(self, factory = None):
+    result = Regex.copy(
+      self,
+      RegexNone if factory is None else factory
+    )
+    return result
+  def __repr__(self):
+    params = []
+    self.repr_serialize(params)
+    return 'regex.RegexNone({0:s})'.format(', '.join(params))
+  # GENERATE END
+  def to_nfa_state(self, nfa, next_state):
+    return -1
+
+class RegexEmpty(Regex):
+  # GENERATE ELEMENT() BEGIN
+  def __init__(
+    self,
+    tag = 'RegexEmpty',
+    attrib = {},
+    text = '',
+    children = []
+  ):
+    Regex.__init__(
+      self,
+      tag,
+      attrib,
+      text,
+      children
+    )
+  def copy(self, factory = None):
+    result = Regex.copy(
+      self,
+      RegexEmpty if factory is None else factory
+    )
+    return result
+  def __repr__(self):
+    params = []
+    self.repr_serialize(params)
+    return 'regex.RegexEmpty({0:s})'.format(', '.join(params))
+  # GENERATE END
+  def to_nfa_state(self, nfa, next_state):
+    return next_state
+
+class RegexCharacter(Regex):
+  # GENERATE ELEMENT(list(int) char_set) BEGIN
+  def __init__(
+    self,
+    tag = 'RegexCharacter',
+    attrib = {},
+    text = '',
+    children = [],
+    char_set = []
+  ):
+    Regex.__init__(
+      self,
+      tag,
+      attrib,
+      text,
+      children
+    )
+    self.char_set = (
+      [element.deserialize_int(i) for i in char_set.split()]
+    if isinstance(char_set, str) else
+      char_set
+    )
+  def serialize(self, ref_list, indent = 0):
+    Regex.serialize(self, ref_list, indent)
+    self.set(
+      'char_set',
+      ' '.join([element.serialize_int(i) for i in self.char_set])
+    )
+  def deserialize(self, ref_list):
+    Regex.deserialize(self, ref_list)
+    self.char_set = [
+      element.deserialize_int(i)
+      for i in self.get('char_set', '').split()
+    ]
+  def copy(self, factory = None):
+    result = Regex.copy(
+      self,
+      RegexCharacter if factory is None else factory
+    )
+    result.char_set = self.char_set
+    return result
+  def repr_serialize(self, params):
+    Regex.repr_serialize(self, params)
+    if len(self.char_set):
+      params.append(
+        'char_set = [{0:s}]'.format(
+          ', '.join([repr(i) for i in self.char_set])
+        )
+      )
+  def __repr__(self):
+    params = []
+    self.repr_serialize(params)
+    return 'regex.RegexCharacter({0:s})'.format(', '.join(params))
+  # GENERATE END
+  def to_nfa_state(self, nfa, next_state):
+    new_state = len(nfa.states)
+    nfa.states.append((NFA.STATE_CHARACTER, self.char_set, next_state))
+    return new_state
+  def to_lr1_symbols(self, terminal_thres, symbols, lookaheads, group_bounds):
+    terminal_set = []
+    nonterminal_set = []
+    i = 0
+    while i < len(self.char_set):
+      [j, k] = self.char_set[i:i + 2]
+      if k > terminal_thres:
+        if j < terminal_thres:
+          terminal_set.extend([j, terminal_thres])
+          nonterminal_set.extend([0, k - terminal_thres])
+          i += 2
+        while i < len(self.char_set):
+          [j, k] = self.char_set[i:i + 2]
+          nonterminal_set.extend([j - terminal_thres, k - terminal_thres])
+          i += 2
+        break
+      terminal_set.extend([j, k])
+      i += 2
+    symbols.append((terminal_set, nonterminal_set))
+    lookaheads.append(([], False)) # initial_set, can_be_empty
+    return 1 # count of groups or ungrouped characters
+
+class RegexCharacterRange(RegexCharacter):
+  # GENERATE ELEMENT() BEGIN
+  def __init__(
+    self,
+    tag = 'RegexCharacterRange',
+    attrib = {},
+    text = '',
+    children = [],
+    char_set = []
+  ):
+    RegexCharacter.__init__(
+      self,
+      tag,
+      attrib,
+      text,
+      children,
+      char_set
+    )
+  def copy(self, factory = None):
+    result = RegexCharacter.copy(
+      self,
+      RegexCharacterRange if factory is None else factory
+    )
+    return result
+  def __repr__(self):
+    params = []
+    self.repr_serialize(params)
+    return 'regex.RegexCharacterRange({0:s})'.format(', '.join(params))
+  # GENERATE END
+  def post_process(self, group_index = 0, rule_char_sets = None):
+    group_index = RegexCharacter.post_process(self, group_index, rule_char_sets)
+    self.char_set = [self[0].char_set[0], self[1].char_set[-1]]
+    return group_index
+
+class RegexCharacterOr(RegexCharacter):
+  # GENERATE ELEMENT() BEGIN
+  def __init__(
+    self,
+    tag = 'RegexCharacterOr',
+    attrib = {},
+    text = '',
+    children = [],
+    char_set = []
+  ):
+    RegexCharacter.__init__(
+      self,
+      tag,
+      attrib,
+      text,
+      children,
+      char_set
+    )
+  def copy(self, factory = None):
+    result = RegexCharacter.copy(
+      self,
+      RegexCharacterOr if factory is None else factory
+    )
+    return result
+  def __repr__(self):
+    params = []
+    self.repr_serialize(params)
+    return 'regex.RegexCharacterOr({0:s})'.format(', '.join(params))
+  # GENERATE END
+  def post_process(self, group_index = 0, rule_char_sets = None):
+    group_index = RegexCharacter.post_process(self, group_index, rule_char_sets)
+    self.char_set = char_set_or(self[0].char_set, self[1].char_set)
+    return group_index
+
+class RegexCharacterAnd(RegexCharacter):
+  # GENERATE ELEMENT() BEGIN
+  def __init__(
+    self,
+    tag = 'RegexCharacterAnd',
+    attrib = {},
+    text = '',
+    children = [],
+    char_set = []
+  ):
+    RegexCharacter.__init__(
+      self,
+      tag,
+      attrib,
+      text,
+      children,
+      char_set
+    )
+  def copy(self, factory = None):
+    result = RegexCharacter.copy(
+      self,
+      RegexCharacterAnd if factory is None else factory
+    )
+    return result
+  def __repr__(self):
+    params = []
+    self.repr_serialize(params)
+    return 'regex.RegexCharacterAnd({0:s})'.format(', '.join(params))
+  # GENERATE END
+  def post_process(self, group_index = 0, rule_char_sets = None):
+    group_index = RegexCharacter.post_process(self, group_index, rule_char_sets)
+    self.char_set = char_set_and(self[0].char_set, self[1].char_set)
+    return group_index
+
+class RegexCharacterNot(RegexCharacter):
+  # GENERATE ELEMENT() BEGIN
+  def __init__(
+    self,
+    tag = 'RegexCharacterNot',
+    attrib = {},
+    text = '',
+    children = [],
+    char_set = []
+  ):
+    RegexCharacter.__init__(
+      self,
+      tag,
+      attrib,
+      text,
+      children,
+      char_set
+    )
+  def copy(self, factory = None):
+    result = RegexCharacter.copy(
+      self,
+      RegexCharacterNot if factory is None else factory
+    )
+    return result
+  def __repr__(self):
+    params = []
+    self.repr_serialize(params)
+    return 'regex.RegexCharacterNot({0:s})'.format(', '.join(params))
+  # GENERATE END
+  def post_process(self, group_index = 0, rule_char_sets = None):
+    group_index = RegexCharacter.post_process(self, group_index, rule_char_sets)
+    self.char_set = char_set_not(self[0].char_set)
+    return group_index
+
+class RegexCharacterRule(RegexCharacter):
+  # GENERATE ELEMENT(str rule_name) BEGIN
+  def __init__(
+    self,
+    tag = 'RegexCharacterRule',
+    attrib = {},
+    text = '',
+    children = [],
+    char_set = [],
+    rule_name = ''
+  ):
+    RegexCharacter.__init__(
+      self,
+      tag,
+      attrib,
+      text,
+      children,
+      char_set
+    )
+    self.rule_name = rule_name
+  def serialize(self, ref_list, indent = 0):
+    RegexCharacter.serialize(self, ref_list, indent)
+    self.set('rule_name', element.serialize_str(self.rule_name))
+  def deserialize(self, ref_list):
+    RegexCharacter.deserialize(self, ref_list)
+    self.rule_name = element.deserialize_str(self.get('rule_name', ''))
+  def copy(self, factory = None):
+    result = RegexCharacter.copy(
+      self,
+      RegexCharacterRule if factory is None else factory
+    )
+    result.rule_name = self.rule_name
+    return result
+  def repr_serialize(self, params):
+    RegexCharacter.repr_serialize(self, params)
+    if self.rule_name != '':
+      params.append(
+        'rule_name = {0:s}'.format(repr(self.rule_name))
+      )
+  def __repr__(self):
+    params = []
+    self.repr_serialize(params)
+    return 'regex.RegexCharacterRule({0:s})'.format(', '.join(params))
+  # GENERATE END
+  def post_process(self, group_index = 0, rule_char_sets = None):
+    if rule_char_sets is not None:
+      self.char_set = rule_char_sets[self.rule_name]
+    return RegexCharacter.post_process(self, group_index, rule_char_sets)
+
+class RegexOr(Regex):
+  # GENERATE ELEMENT() BEGIN
+  def __init__(
+    self,
+    tag = 'RegexOr',
+    attrib = {},
+    text = '',
+    children = []
+  ):
+    Regex.__init__(
+      self,
+      tag,
+      attrib,
+      text,
+      children
+    )
+  def copy(self, factory = None):
+    result = Regex.copy(
+      self,
+      RegexOr if factory is None else factory
+    )
+    return result
+  def __repr__(self):
+    params = []
+    self.repr_serialize(params)
+    return 'regex.RegexOr({0:s})'.format(', '.join(params))
+  # GENERATE END
+  def to_nfa_state(self, nfa, next_state):
+    child0_state = self[0].to_nfa_state(nfa, next_state)
+    child1_state = self[1].to_nfa_state(nfa, next_state)
+    if child0_state == -1:
+      return child1_state
+    if child1_state == -1:
+      return child0_state
+    new_state = len(nfa.states)
+    nfa.states.append((NFA.STATE_OR, child0_state, child1_state))
+    return new_state
+
+class RegexAnd(Regex):
+  # GENERATE ELEMENT() BEGIN
+  def __init__(
+    self,
+    tag = 'RegexAnd',
+    attrib = {},
+    text = '',
+    children = []
+  ):
+    Regex.__init__(
+      self,
+      tag,
+      attrib,
+      text,
+      children
+    )
+  def copy(self, factory = None):
+    result = Regex.copy(
+      self,
+      RegexAnd if factory is None else factory
+    )
+    return result
+  def __repr__(self):
+    params = []
+    self.repr_serialize(params)
+    return 'regex.RegexAnd({0:s})'.format(', '.join(params))
+  # GENERATE END
+  def to_nfa_state(self, nfa, next_state):
+    join0_state = len(nfa.states)
+    nfa.states.append(NFA.join0_state) # takes no arguments so use static one
+    join1_state = len(nfa.states)
+    nfa.states.append((NFA.STATE_JOIN1, next_state))
+    child0_state = self[0].to_nfa_state(nfa, join0_state)
+    if child0_state == -1:
+      return -1
+    child1_state = self[1].to_nfa_state(nfa, join1_state)
+    if child1_state == -1:
+      return -1
+    new_state = len(nfa.states)
+    nfa.states.append((NFA.STATE_AND, child0_state, child1_state))
+    return new_state
+
+class RegexSequence(Regex):
+  # GENERATE ELEMENT() BEGIN
+  def __init__(
+    self,
+    tag = 'RegexSequence',
+    attrib = {},
+    text = '',
+    children = []
+  ):
+    Regex.__init__(
+      self,
+      tag,
+      attrib,
+      text,
+      children
+    )
+  def copy(self, factory = None):
+    result = Regex.copy(
+      self,
+      RegexSequence if factory is None else factory
+    )
+    return result
+  def __repr__(self):
+    params = []
+    self.repr_serialize(params)
+    return 'regex.RegexSequence({0:s})'.format(', '.join(params))
+  # GENERATE END
+  def to_nfa_state(self, nfa, next_state):
+    child1_state = self[1].to_nfa_state(nfa, next_state)
+    if child1_state == -1:
+      return -1
+    return self[0].to_nfa_state(nfa, child1_state)
+
+class RegexRepeat(Regex):
+  # GENERATE ELEMENT(int count0, int count1, bool non_greedy) BEGIN
+  def __init__(
+    self,
+    tag = 'RegexRepeat',
+    attrib = {},
+    text = '',
+    children = [],
+    count0 = -1,
+    count1 = -1,
+    non_greedy = False
+  ):
+    Regex.__init__(
+      self,
+      tag,
+      attrib,
+      text,
+      children
+    )
+    self.count0 = (
+      element.deserialize_int(count0)
+    if isinstance(count0, str) else
+      count0
+    )
+    self.count1 = (
+      element.deserialize_int(count1)
+    if isinstance(count1, str) else
+      count1
+    )
+    self.non_greedy = (
+      element.deserialize_bool(non_greedy)
+    if isinstance(non_greedy, str) else
+      non_greedy
+    )
+  def serialize(self, ref_list, indent = 0):
+    Regex.serialize(self, ref_list, indent)
+    self.set('count0', element.serialize_int(self.count0))
+    self.set('count1', element.serialize_int(self.count1))
+    self.set('non_greedy', element.serialize_bool(self.non_greedy))
+  def deserialize(self, ref_list):
+    Regex.deserialize(self, ref_list)
+    self.count0 = element.deserialize_int(self.get('count0', '-1'))
+    self.count1 = element.deserialize_int(self.get('count1', '-1'))
+    self.non_greedy = element.deserialize_bool(self.get('non_greedy', 'false'))
+  def copy(self, factory = None):
+    result = Regex.copy(
+      self,
+      RegexRepeat if factory is None else factory
+    )
+    result.count0 = self.count0
+    result.count1 = self.count1
+    result.non_greedy = self.non_greedy
+    return result
+  def repr_serialize(self, params):
+    Regex.repr_serialize(self, params)
+    if self.count0 != -1:
+      params.append(
+        'count0 = {0:s}'.format(repr(self.count0))
+      )
+    if self.count1 != -1:
+      params.append(
+        'count1 = {0:s}'.format(repr(self.count1))
+      )
+    if self.non_greedy != False:
+      params.append(
+        'non_greedy = {0:s}'.format(repr(self.non_greedy))
+      )
+  def __repr__(self):
+    params = []
+    self.repr_serialize(params)
+    return 'regex.RegexRepeat({0:s})'.format(', '.join(params))
+  # GENERATE END
+  def post_process(self, group_index = 0, rule_char_sets = None):
+    # total hack which will be done in a Python action in future
+    if len(self) >= 2:
+      assert self[1].tag == 'Number'
+      self.count0 = int(self[1].text)
+      if len(self) >= 3:
+        assert self[2].tag == 'Number'
+        self.count1 = int(self[2].text)
+      else:
+        self.count1 = self.count0
+      del self[1:]
+    # end total hack
+    return Regex.post_process(self, group_index, rule_char_sets)
+  def to_nfa_state(self, nfa, next_state):
+    count0 = self.count0
+    count1 = self.count1
+    if count1 == -1:
+      new_state = len(nfa.states)
+      nfa.states.append(None)
+      child_state = self[0].to_nfa_state(nfa, new_state)
+      if child_state == -1:
+        new_state = next_state # note: unreachable state remains invalid (None)
+      else:
+        nfa.states[new_state] = (
+          (NFA.STATE_OR, next_state, child_state)
+        if self.non_greedy else
+          (NFA.STATE_OR, child_state, next_state)
+        )
+    else:
+      new_state = next_state
+      for i in range(count1 - count0):
+        child_state = self[0].to_nfa_state(nfa, new_state)
+        if child_state == -1:
+          break
+        new_state = len(nfa.states)
+        nfa.states.append(
+          (NFA.STATE_OR, next_state, child_state)
+        if self.non_greedy else
+          (NFA.STATE_OR, child_state, next_state)
+        )
+    for i in range(count0):
+      new_state = self[0].to_nfa_state(nfa, new_state)
+      if new_state == -1:
+        break
+    return new_state
+
+class RegexGroup(Regex):
+  class Attribute(element.Element):
+    # GENERATE ELEMENT(str name, str value) BEGIN
+    def __init__(
+      self,
+      tag = 'RegexGroup_Attribute',
+      attrib = {},
+      text = '',
+      children = [],
+      name = '',
+      value = ''
+    ):
+      element.Element.__init__(
+        self,
+        tag,
+        attrib,
+        text,
+        children
+      )
+      self.name = name
+      self.value = value
+    def serialize(self, ref_list, indent = 0):
+      element.Element.serialize(self, ref_list, indent)
+      self.set('name', element.serialize_str(self.name))
+      self.set('value', element.serialize_str(self.value))
+    def deserialize(self, ref_list):
+      element.Element.deserialize(self, ref_list)
+      self.name = element.deserialize_str(self.get('name', ''))
+      self.value = element.deserialize_str(self.get('value', ''))
+    def copy(self, factory = None):
+      result = element.Element.copy(
+        self,
+        Attribute if factory is None else factory
+      )
+      result.name = self.name
+      result.value = self.value
+      return result
+    def repr_serialize(self, params):
+      element.Element.repr_serialize(self, params)
+      if self.name != '':
+        params.append(
+          'name = {0:s}'.format(repr(self.name))
+        )
+      if self.value != '':
+        params.append(
+          'value = {0:s}'.format(repr(self.value))
+        )
+    def __repr__(self):
+      params = []
+      self.repr_serialize(params)
+      return 'regex.RegexGroup.Attribute({0:s})'.format(', '.join(params))
+    # GENERATE END
+
+  # GENERATE ELEMENT(int group_index, str group_name, list(ref) group_attributes) BEGIN
+  def __init__(
+    self,
+    tag = 'RegexGroup',
+    attrib = {},
+    text = '',
+    children = [],
+    group_index = -1,
+    group_name = '',
+    group_attributes = []
+  ):
+    Regex.__init__(
+      self,
+      tag,
+      attrib,
+      text,
+      children
+    )
+    self.group_index = (
+      element.deserialize_int(group_index)
+    if isinstance(group_index, str) else
+      group_index
+    )
+    self.group_name = group_name
+    self.group_attributes = group_attributes
+  def serialize(self, ref_list, indent = 0):
+    Regex.serialize(self, ref_list, indent)
+    self.set('group_index', element.serialize_int(self.group_index))
+    self.set('group_name', element.serialize_str(self.group_name))
+    self.set(
+      'group_attributes',
+      ' '.join([element.serialize_ref(i, ref_list) for i in self.group_attributes])
+    )
+  def deserialize(self, ref_list):
+    Regex.deserialize(self, ref_list)
+    self.group_index = element.deserialize_int(self.get('group_index', '-1'))
+    self.group_name = element.deserialize_str(self.get('group_name', ''))
+    self.group_attributes = [
+      element.deserialize_ref(i, ref_list)
+      for i in self.get('group_attributes', '').split()
+    ]
+  def copy(self, factory = None):
+    result = Regex.copy(
+      self,
+      RegexGroup if factory is None else factory
+    )
+    result.group_index = self.group_index
+    result.group_name = self.group_name
+    result.group_attributes = self.group_attributes
+    return result
+  def repr_serialize(self, params):
+    Regex.repr_serialize(self, params)
+    if self.group_index != -1:
+      params.append(
+        'group_index = {0:s}'.format(repr(self.group_index))
+      )
+    if self.group_name != '':
+      params.append(
+        'group_name = {0:s}'.format(repr(self.group_name))
+      )
+    if len(self.group_attributes):
+      params.append(
+        'group_attributes = [{0:s}]'.format(
+          ', '.join([repr(i) for i in self.group_attributes])
+        )
+      )
+  def __repr__(self):
+    params = []
+    self.repr_serialize(params)
+    return 'regex.RegexGroup({0:s})'.format(', '.join(params))
+  # GENERATE END
+  def post_process(self, group_index = 0, rule_char_sets = None):
+    # total hack which will be done in a Python action in future
+    if len(self) >= 2:
+      assert self[0].tag == 'GroupName'
+      self.group_name = self[0].text[1:-1]
+      del self[:1]
+    # end total hack
+    self.group_index = group_index
+    group_index += 1
+    return Regex.post_process(self, group_index, rule_char_sets)
+  def to_groups(self, groups):
+    assert len(groups) == self.group_index
+    groups.append(
+      (self.group_name, {i.name: i.value for i in self.group_attributes})
+    )
+    return Regex.to_groups(self, groups)
+  def to_nfa_state(self, nfa, next_state):
+    mark_state = len(nfa.states)
+    nfa.states.append((NFA.STATE_MARK, self.group_index * 2 + 1, next_state))
+    child_state = self[0].to_nfa_state(nfa, mark_state)
+    if child_state == -1:
+      return -1
+    new_state = len(nfa.states)
+    nfa.states.append((NFA.STATE_MARK, self.group_index * 2, child_state))
+    return new_state
+  def to_lr1_symbols(self, terminal_thres, symbols, lookaheads, group_bounds):
+    group_start = len(symbols)
+    assert self.group_index == len(group_bounds)
+    group_bounds.append(None)
+    group_count = Regex.to_lr1_symbols(
+      self,
+      terminal_thres,
+      symbols,
+      lookaheads,
+      group_bounds
+    )
+    group_bounds[self.group_index] = (
+      group_start,
+      group_count,
+      self.group_name,
+      {i.name: i.value for i in self.group_attributes}
+    )
+    return 1 # count of groups or ungrouped characters
+
+class Grammar(element.Element):
+  class Production(element.Element):
+    # GENERATE ELEMENT(int nonterminal, int priority, bool right_to_left) BEGIN
+    def __init__(
+      self,
+      tag = 'Grammar_Production',
+      attrib = {},
+      text = '',
+      children = [],
+      nonterminal = -1,
+      priority = -1,
+      right_to_left = False
+    ):
+      element.Element.__init__(
+        self,
+        tag,
+        attrib,
+        text,
+        children
+      )
+      self.nonterminal = (
+        element.deserialize_int(nonterminal)
+      if isinstance(nonterminal, str) else
+        nonterminal
+      )
+      self.priority = (
+        element.deserialize_int(priority)
+      if isinstance(priority, str) else
+        priority
+      )
+      self.right_to_left = (
+        element.deserialize_bool(right_to_left)
+      if isinstance(right_to_left, str) else
+        right_to_left
+      )
+    def serialize(self, ref_list, indent = 0):
+      element.Element.serialize(self, ref_list, indent)
+      self.set('nonterminal', element.serialize_int(self.nonterminal))
+      self.set('priority', element.serialize_int(self.priority))
+      self.set('right_to_left', element.serialize_bool(self.right_to_left))
+    def deserialize(self, ref_list):
+      element.Element.deserialize(self, ref_list)
+      self.nonterminal = element.deserialize_int(self.get('nonterminal', '-1'))
+      self.priority = element.deserialize_int(self.get('priority', '-1'))
+      self.right_to_left = element.deserialize_bool(self.get('right_to_left', 'false'))
+    def copy(self, factory = None):
+      result = element.Element.copy(
+        self,
+        Production if factory is None else factory
+      )
+      result.nonterminal = self.nonterminal
+      result.priority = self.priority
+      result.right_to_left = self.right_to_left
+      return result
+    def repr_serialize(self, params):
+      element.Element.repr_serialize(self, params)
+      if self.nonterminal != -1:
+        params.append(
+          'nonterminal = {0:s}'.format(repr(self.nonterminal))
+        )
+      if self.priority != -1:
+        params.append(
+          'priority = {0:s}'.format(repr(self.priority))
+        )
+      if self.right_to_left != False:
+        params.append(
+          'right_to_left = {0:s}'.format(repr(self.right_to_left))
+        )
+    def __repr__(self):
+      params = []
+      self.repr_serialize(params)
+      return 'regex.Grammar.Production({0:s})'.format(', '.join(params))
+    # GENERATE END
+
+  # GENERATE ELEMENT(int terminal_thres) BEGIN
+  def __init__(
+    self,
+    tag = 'Grammar',
+    attrib = {},
+    text = '',
+    children = [],
+    terminal_thres = -1
+  ):
+    element.Element.__init__(
+      self,
+      tag,
+      attrib,
+      text,
+      children
+    )
+    self.terminal_thres = (
+      element.deserialize_int(terminal_thres)
+    if isinstance(terminal_thres, str) else
+      terminal_thres
+    )
+  def serialize(self, ref_list, indent = 0):
+    element.Element.serialize(self, ref_list, indent)
+    self.set('terminal_thres', element.serialize_int(self.terminal_thres))
+  def deserialize(self, ref_list):
+    element.Element.deserialize(self, ref_list)
+    self.terminal_thres = element.deserialize_int(self.get('terminal_thres', '-1'))
+  def copy(self, factory = None):
+    result = element.Element.copy(
+      self,
+      Grammar if factory is None else factory
+    )
+    result.terminal_thres = self.terminal_thres
+    return result
+  def repr_serialize(self, params):
+    element.Element.repr_serialize(self, params)
+    if self.terminal_thres != -1:
+      params.append(
+        'terminal_thres = {0:s}'.format(repr(self.terminal_thres))
+      )
+  def __repr__(self):
+    params = []
+    self.repr_serialize(params)
+    return 'regex.Grammar({0:s})'.format(', '.join(params))
+  # GENERATE END
+  def post_process(self, rule_char_sets):
+    for i in range(len(self)):
+      self[i].nonterminal = i
+      self[i][0].post_process(0, rule_char_sets)
+  def to_lr1(self):
+    lr1 = LR1([], self.terminal_thres)
+    for i in self:
+      symbols = []
+      lookaheads = []
+      group_bounds = []
+      i[0].to_lr1_symbols(
+        self.terminal_thres,
+        symbols,
+        lookaheads,
+        group_bounds
+      )
+      lookaheads.append(([], True)) # initial_set, can_be_empty (sentinel)
+      lr1.productions.append(
+        (
+          i.priority * 2 + int(i.right_to_left),
+          symbols,
+          lookaheads,
+          group_bounds
+        )
+      )
+    # propagate lookaheads
+    modified = True
+    while modified:
+      modified = False
+      for _, symbols, lookaheads, _ in lr1.productions:
+        for i in range(len(symbols) - 1, -1, -1):
+          initial_set, nonterminal_set = symbols[i]
+          can_be_empty = False
+          for j in range(0, len(nonterminal_set), 2):
+            for k in range(nonterminal_set[j], nonterminal_set[j + 1]):
+              child_initial_set, child_can_be_empty = lr1.productions[k][2][0]
+              initial_set = char_set_or(initial_set, child_initial_set)
+              can_be_empty = can_be_empty or child_can_be_empty
+          # at this point can_be_empty refers to current symbol only
+          if can_be_empty:
+            next_initial_set, can_be_empty = lookaheads[i + 1]
+            initial_set = char_set_or(initial_set, next_initial_set)
+          # at this point can_be_empty refers to all remaining symbols
+          if (initial_set, can_be_empty) != lookaheads[i]:
+            lookaheads[i] = (initial_set, can_be_empty)
+            modified = True
+    return lr1
+
+# GENERATE FACTORY(element.Element) BEGIN
+tag_to_class = {
+  'Regex': Regex,
+  'RegexNone': RegexNone,
+  'RegexEmpty': RegexEmpty,
+  'RegexCharacter': RegexCharacter,
+  'RegexCharacterRange': RegexCharacterRange,
+  'RegexCharacterOr': RegexCharacterOr,
+  'RegexCharacterAnd': RegexCharacterAnd,
+  'RegexCharacterNot': RegexCharacterNot,
+  'RegexCharacterRule': RegexCharacterRule,
+  'RegexOr': RegexOr,
+  'RegexAnd': RegexAnd,
+  'RegexSequence': RegexSequence,
+  'RegexRepeat': RegexRepeat,
+  'RegexGroup': RegexGroup,
+  'RegexGroup_Attribute': RegexGroup.Attribute,
+  'Grammar': Grammar,
+  'Grammar_Production': Grammar.Production
+}
+def factory(tag, attrib = {}, *args, **kwargs):
+  return tag_to_class.get(tag, element.Element)(tag, attrib, *args, **kwargs)
+# GENERATE END
+
+class NFA:
+  # state_desc classes:
+  # (STATE_CHARACTER, char_set, next_state)
+  # (STATE_OR, next_state0, next_state1)
+  # (STATE_AND, next_state0, next_state1)
+  # (STATE_JOIN0,)
+  # (STATE_JOIN1, next_state)
+  # (STATE_MARK, mark_value, next_state)
+
+  STATE_CHARACTER = 0
+  STATE_OR = 1
+  STATE_AND = 2
+  STATE_JOIN0 = 3
+  STATE_JOIN1 = 4
+  STATE_MARK = 5
+  join0_state = (STATE_JOIN0,)
+
+  # multistate classes:
+  # (MULTISTATE_ACCEPT, 1)             accept, occupies one thread in list
+  # (MULTISTATE_AND, n, state, child)  and, occupies n threads in list
+  # (MULTISTATE_OR, n, child0, child1)
+  # n = sum of n from child states, that is, multistate[1] is the number of
+  # (MULTISTATE_ACCEPT, 1) leaf states ultimately reachable from this subtree
+
+  MULTISTATE_ACCEPT = 0
+  MULTISTATE_AND = 1
+  MULTISTATE_OR = 2
+  accept_multistate = (MULTISTATE_ACCEPT, 1)
+
+  def __init__(
+    self,
+    groups = [],
+    states = [(STATE_CHARACTER, [0, chars], 0)],
+    start_state = [] # can have multiple NFAs in same container
+  ):
+    # groups: list of group_desc
+    # group_desc: (tag, kwargs)
+    #   tag, kwargs will be passed to apply_markup() hence factory()
+    self.groups = groups
+    self.states = states
+    self.start_state = start_state
+
+  def multistate_next(self, root_multistate, char):
+    # the deduplication works as effectively a second pass which goes
+    # over the multistate tree in pre-order, looking for OR-disjunctions
+    # of any depth and configuration, e.g. (a OR b) or (c OR d), and
+    # collecting the subexpressions e.g. a, b, c OR b, c, d, c OR d, and
+    # failing any subexpression that's previously occurred in pre-order
+    # (children of AND-conjunctions get a fresh empty subexpression set)
+
+    # unfortunately it can't be done as a second pass literally, because
+    # we have to generate the transition list, thus we need to know if a
+    # subexpression is going to be failed, so we can delete it's threads
+
+    # therefore the deduplication is tacked onto the end of the building
+    # process, examining each built node just before the builder returns,
+    # if already in the subexpression set we will rewrite the transition
+    # list produced by the recursive calls, otherwise we will add it in
+
+    # the problem is determining the subexpression set to pass into any
+    # recursive calls, the caller's set will be sent unchanged by cases
+    # that return the result unchanged or add an OR-disjuction, an empty
+    # set will be sent by cases that add an AND-conjuction to the result 
+
+    # if we have added something to the node built by a recursive call,
+    # we'll fall into the deduplication logic, otherwise just return it
+
+    def advance1(multistate, join_count, done_multistates):
+      # modifies nonlocal: transition
+      assert multistate[0] == NFA.MULTISTATE_AND
+      _, _, state, child = multistate
+      if state == 0:
+        # note that if we reach the accepting state, we must be a top-level
+        # expression, and could not be part of an AND-conjunction (because
+        # AND-conjunction terms always go to a join0 or join1 state first),
+        # we remove ourselves to indicate to scanner that match is complete
+        assert join_count == 0
+        assert child == NFA.accept_multistate
+        return advance(child, 0, done_multistates)
+      state_desc = self.states[state]
+      if state_desc[0] == NFA.STATE_CHARACTER:
+        if join_count != 0:
+          transition.append((DFA.TRANSITION_POP, child[1]))
+          return None
+        len_transition = len(transition)
+        child = advance(child, 0, set())
+        if child is None:
+          return None
+        result = (NFA.MULTISTATE_AND, child[1], state, child)
+      elif state_desc[0] == NFA.STATE_OR:
+        _, next_state0, next_state1 = state_desc
+        len_transition = len(transition)
+        transition.append((DFA.TRANSITION_DUP, child[1]))
+        child0 = advance1(
+          (NFA.MULTISTATE_AND, child[1], next_state0, child),
+          join_count,
+          done_multistates
+        )
+        child1 = advance1(
+          (NFA.MULTISTATE_AND, child[1], next_state1, child),
+          join_count,
+          done_multistates
+        )
+        if child0 is None:
+          return child1
+        if child1 is None:
+          return child0
+        result = (NFA.MULTISTATE_OR, child0[1] + child1[1], child0, child1)
+      elif state_desc[0] == NFA.STATE_AND:
+        _, next_state0, next_state1 = state_desc
+        return advance1(
+          (
+            NFA.MULTISTATE_AND,
+            child[1],
+            next_state0,
+            (NFA.MULTISTATE_AND, child[1], next_state1, child)
+          ),
+          join_count,
+          done_multistates
+        )
+      elif state_desc[0] == NFA.STATE_JOIN0:
+        return advance(child, join_count + 1, done_multistates)
+      elif state_desc[0] == NFA.STATE_JOIN1:
+        _, next_state = state_desc
+        if join_count == 0:
+          transition.append((DFA.TRANSITION_POP, child[1]))
+          return None
+        return advance1(
+          (NFA.MULTISTATE_AND, child[1], next_state, child),
+          join_count - 1,
+          done_multistates
+        )
+      elif state_desc[0] == NFA.STATE_MARK:
+        _, mark_value, next_state = state_desc
+        transition.append((DFA.TRANSITION_MARK, child[1], mark_value))
+        return advance1(
+          (NFA.MULTISTATE_AND, child[1], next_state, child),
+          join_count,
+          done_multistates
+        )
+      else:
+        assert False
+      if result in done_multistates:
+        transition[len_transition:] = [(DFA.TRANSITION_POP, multistate[1])]
+        return None
+      done_multistates.add(result)
+      return result
+
+    def advance(multistate, join_count, done_multistates):
+      nonlocal char0, char1 # modifies nonlocal: transition
+      if multistate[0] == NFA.MULTISTATE_ACCEPT:
+        assert join_count == 0
+        len_transition = len(transition)
+        transition.append((DFA.TRANSITION_MOVE, 1))
+        result = NFA.accept_multistate # takes no arguments so use static one
+      elif multistate[0] == NFA.MULTISTATE_AND:
+        if char >= 0:
+          _, _, state, child = multistate
+          state_desc = self.states[state]
+          assert state_desc[0] == NFA.STATE_CHARACTER
+          _, char_set, next_state = state_desc
+          k = bisect.bisect_right(char_set, char)
+          if k > 0 and char0 < char_set[k - 1]:
+            char0 = char_set[k - 1]
+          if k < len(char_set) and char1 > char_set[k]:
+            char1 = char_set[k]
+          if (k & 1) == 0:
+            transition.append((DFA.TRANSITION_POP, child[1]))
+            return None
+          multistate = (NFA.MULTISTATE_AND, child[1], next_state, child)
+        return advance1(multistate, join_count, done_multistates)
+      elif multistate[0] == NFA.MULTISTATE_OR:
+        _, _, child0, child1 = multistate
+        len_transition = len(transition)
+        child0 = advance(child0, join_count, done_multistates)
+        child1 = advance(child1, join_count, done_multistates)
+        if child0 is None:
+          return child1
+        if child1 is None:
+          return child0
+        result = (NFA.MULTISTATE_OR, child0[1] + child1[1], child0, child1)
+      else:
+        assert False
+      if result in done_multistates:
+        transition[len_transition:] = [(DFA.TRANSITION_POP, multistate[1])]
+        return None
+      done_multistates.add(result)
+      return result
+
+    transition = []
+    char0 = 0
+    char1 = chars
+    root_multistate = advance(root_multistate, 0, set())
+    return root_multistate, transition, char0, char1
+
+  def multistate_accept(root_multistate):
+    i = 0
+    def accept(multistate):
+      nonlocal i
+      if multistate[0] == NFA.MULTISTATE_ACCEPT:
+        return True
+      if multistate[0] == NFA.MULTISTATE_AND:
+        _, _, _, child = multistate
+        i += child[1]
+        return False
+      if multistate[0] == NFA.MULTISTATE_OR:
+        _, _, child0, child1 = multistate
+        return accept(child0) or accept(child1)
+      assert False
+    return i if accept(root_multistate) else -1
+
+  def match_text(self, text, i, start_index = 0):
+    def transit(transition):
+      nonlocal threads0, threads1, prefix_slop # note: also uses i
+      j = prefix_slop
+      for trans in transition:
+        if trans[0] == DFA.TRANSITION_POP:
+          j += trans[1]
+        elif trans[0] == DFA.TRANSITION_DUP:
+          while j < trans[1]:
+            threads0[:0] = [None] * prefix_slop
+            threads1[:0] = [None] * prefix_slop
+            j += prefix_slop
+            prefix_slop *= 2
+          threads0[j - trans[1]:j] = threads0[j:j + trans[1]]
+          j -= trans[1]
+        elif trans[0] == DFA.TRANSITION_MARK:
+          threads0[j:j + trans[1]] = [
+            (i, trans[2], thread)
+            for thread in threads0[j:j + trans[1]]
+          ]
+        elif trans[0] == DFA.TRANSITION_MOVE:
+          threads1.extend(threads0[j:j + trans[1]])
+          j += trans[1]
+        #elif trans[0] == DFA.TRANSITION_DEL:
+        #  del threads1[-trans[1]:]
+        else:
+          assert False
+      assert j == len(threads0)
+      threads0, threads1 = threads1, threads0
+      del threads1[prefix_slop:]
+
+    threads0 = [None, None]
+    threads1 = [None]
+    prefix_slop = 1
+
+    start_state = self.start_state[start_index]
+    if start_state == -1:
+      return None
+    next_multistate, transition, _, _ = self.multistate_next(
+      (NFA.MULTISTATE_AND, 1, start_state, NFA.accept_multistate),
+      -1
+    )
+    while next_multistate is not None:
+      transit(transition)
+      assert len(threads0) == prefix_slop + next_multistate[1]
+      if next_multistate == NFA.accept_multistate:
+        # there is only one match, which is complete
+        assert len(threads0) == prefix_slop + 1
+        return threads0[prefix_slop]
+      if i >= len(text):
+        # return best match we have, but not incomplete match
+        i = NFA.multistate_accept(next_multistate)
+        return (None if i == -1 else threads0[prefix_slop + i])
+      next_multistate, transition, _, _ = (
+        self.multistate_next(next_multistate, ord(text[i]))
+      )
+      i += 1
+    return None
+
+  def match_yychunk(self, root, pos, off, yychunk_iter, start_index = 0):
+    if pos < 0:
+      pos, off = element.to_start_relative(root, pos, off)
+
+    def transit(transition):
+      nonlocal threads0, threads1, prefix_slop # note: also uses pos, off
+      j = prefix_slop
+      for trans in transition:
+        if trans[0] == DFA.TRANSITION_POP:
+          j += trans[1]
+        elif trans[0] == DFA.TRANSITION_DUP:
+          while j < trans[1]:
+            threads0[:0] = [None] * prefix_slop
+            threads1[:0] = [None] * prefix_slop
+            j += prefix_slop
+            prefix_slop *= 2
+          threads0[j - trans[1]:j] = threads0[j:j + trans[1]]
+          j -= trans[1]
+        elif trans[0] == DFA.TRANSITION_MARK:
+          threads0[j:j + trans[1]] = [
+            (pos, off, trans[2], thread)
+            for thread in threads0[j:j + trans[1]]
+          ]
+        elif trans[0] == DFA.TRANSITION_MOVE:
+          threads1.extend(threads0[j:j + trans[1]])
+          j += trans[1]
+        #elif trans[0] == DFA.TRANSITION_DEL:
+        #  del threads1[-trans[1]:]
+        else:
+          assert False
+      assert j == len(threads0)
+      threads0, threads1 = threads1, threads0
+      del threads1[prefix_slop:]
+
+    threads0 = [None, None]
+    threads1 = [None]
+    prefix_slop = 1
+
+    start_state = self.start_state[start_index]
+    if start_state == -1:
+      return None
+    next_multistate, transition, _, _ = self.multistate_next(
+      (NFA.MULTISTATE_AND, 1, start_state, NFA.accept_multistate),
+      -1
+    )
+    text = element.get_text(root, pos)
+    while next_multistate is not None:
+      transit(transition)
+      assert len(threads0) == prefix_slop + next_multistate[1]
+      if next_multistate == NFA.accept_multistate:
+        # there is only one match, which is complete
+        assert len(threads0) == prefix_slop + 1
+        return threads0[prefix_slop]
+      while off >= len(text):
+        if pos < len(root):
+          pos += 1
+          off = 0
+        else:
+          try:
+            next(yychunk_iter)
+          except StopIteration:
+            # return best match we have, but not incomplete match
+            i = NFA.multistate_accept(next_multistate)
+            return (None if i == -1 else threads0[prefix_slop + i])
+          text = element.get_text(root, pos)
+      next_multistate, transition, _, _ = (
+        self.multistate_next(next_multistate, ord(text[off]))
+      )
+      off += 1
+    return None
+
+  def to_dfa(self):
+    dfa = DFA(list(self.groups))
+
+    accept_key = (NFA.accept_multistate, ())
+    action_to_meaning = [accept_key]
+    meaning_to_action = {accept_key: 0}
+    state_to_meaning = [NFA.accept_multistate]
+    meaning_to_state = {NFA.accept_multistate: 0}
+
+    for start_state in self.start_state:
+      if start_state == -1:
+        start_action = -1
+      else:
+        next_multistate, transition, _, _ = self.multistate_next(
+          (NFA.MULTISTATE_AND, 1, start_state, NFA.accept_multistate),
+          -1
+        )
+        if next_multistate is None:
+          start_action = -1
+        else:
+          start_key = (next_multistate, tuple(transition))
+          start_action = len(action_to_meaning)
+          meaning_to_action[start_key] = start_action
+          action_to_meaning.append(start_key)
+      dfa.start_action.append(start_action)
+
+    while len(dfa.actions) < len(action_to_meaning):
+      next_multistate, transition = action_to_meaning[len(dfa.actions)]
+      if next_multistate in meaning_to_state:
+        next_state = meaning_to_state[next_multistate]
+      else:
+        next_state = len(state_to_meaning)
+        state_to_meaning.append(next_multistate)
+        meaning_to_state[next_multistate] = next_state
+      dfa.actions.append((next_state, list(transition)))
+
+      while len(dfa.states) < len(state_to_meaning):
+        char = 0
+        multistate = state_to_meaning[len(dfa.states)]
+        state_desc = ([], [], NFA.multistate_accept(multistate))
+        while char < chars:
+          next_multistate, transition, char0, char1 = self.multistate_next(
+            multistate,
+            char
+          )
+          assert char0 == char and char1 > char
+          if next_multistate is None:
+            action = -1
+          else:
+            # optimize transition (optional)
+            i = 0
+            j = 0
+            while i < len(transition):
+              if transition[i][0] == DFA.TRANSITION_POP:
+                n = transition[i][1]
+                i += 1
+                while (
+                  i < len(transition) and
+                  transition[i][0] == DFA.TRANSITION_POP
+                ):
+                  n += transition[i][1]
+                  i += 1
+                transition[j] = (DFA.TRANSITION_POP, n)
+              elif transition[i][0] == DFA.TRANSITION_MOVE:
+                n = transition[i][1]
+                i += 1
+                while (
+                  i < len(transition) and
+                  transition[i][0] == DFA.TRANSITION_MOVE
+                ):
+                  n += transition[i][1]
+                  i += 1
+                transition[j] = (DFA.TRANSITION_MOVE, n)
+              else:
+                transition[j] = transition[i]
+                i += 1
+              j += 1
+            del transition[j:]
+            # end optimize transition
+            key = (next_multistate, tuple(transition))
+            if key in meaning_to_action:
+              action = meaning_to_action[key]
+            else:
+              action = len(action_to_meaning)
+              action_to_meaning.append(key)
+              meaning_to_action[key] = action
+          state_desc[0].append(char1)
+          state_desc[1].append(action)
+          char = char1
+        dfa.states.append(state_desc)
+    return dfa
+
+  def __repr__(self):
+    return 'regex.NFA({0:s}, {1:s}, {2:s})'.format(
+      repr(self.groups),
+      repr(self.states),
+      repr(self.start_state)
+    )
+
+class DFA:
+  # transition classes:
+  # instructions to transform thread list from a multistate to next multistate
+  # (TRANSITION_POP, n)                        j += n
+  # (TRANSITION_DUP, n)                        threads0[j - n:j] = threads0[j:j + n]
+  #                                    j -= n
+  # (TRANSITION_MARK, n, mark_value)   threads0[j:j + n] = [
+  #                                      (i, mark, thread)
+  #                                      for thread in threads0[j:j + n]
+  #                                    ]
+  # (TRANSITION_MOVE, n)               threads1.extend(threads0[j:j + n])
+  #                                    j += n
+  # (TRANSITION_DEL, n)                        del threads1[-n:]
+
+  TRANSITION_POP = 0
+  TRANSITION_DUP = 1
+  TRANSITION_MARK = 2
+  TRANSITION_MOVE = 3
+  #TRANSITION_DEL = 4
+  def __init__(
+    self,
+    groups = [],
+    states = [([chars], [0], 0)],
+    actions = [(0, [])],
+    start_action = [] # can have multiple DFAs in same container
+  ):
+    # groups: list of group_desc
+    # group_desc: (tag, kwargs)
+    #   tag, kwargs will be passed to apply_markup() hence factory()
+    # states: list of state_desc
+    # state_desc: (list of breaks, list of action to do, accept_thread)
+    # actions: list of action_desc
+    # action_desc: (state to go to next, compiled transition to do first)
+    # accept_thread: which thread of thread list to use, -1 don't accept
+    self.groups = groups
+    self.states = states
+    self.actions = actions
+    self.start_action = start_action
+
+  def match_text(self, text, i, start_index = 0):
+    def transit(transition):
+      nonlocal threads0, threads1, prefix_slop # note: also uses i
+      j = prefix_slop
+      for trans in transition:
+        if trans[0] == DFA.TRANSITION_POP:
+          j += trans[1]
+        elif trans[0] == DFA.TRANSITION_DUP:
+          while j < trans[1]:
+            threads0[:0] = [None] * prefix_slop
+            threads1[:0] = [None] * prefix_slop
+            j += prefix_slop
+            prefix_slop *= 2
+          threads0[j - trans[1]:j] = threads0[j:j + trans[1]]
+          j -= trans[1]
+        elif trans[0] == DFA.TRANSITION_MARK:
+          threads0[j:j + trans[1]] = [
+            (i, trans[2], thread)
+            for thread in threads0[j:j + trans[1]]
+          ]
+        elif trans[0] == DFA.TRANSITION_MOVE:
+          threads1.extend(threads0[j:j + trans[1]])
+          j += trans[1]
+        #elif trans[0] == DFA.TRANSITION_DEL:
+        #  del threads1[-trans[1]:]
+        else:
+          assert False
+      assert j == len(threads0)
+      threads0, threads1 = threads1, threads0
+      del threads1[prefix_slop:]
+
+    threads0 = [None, None]
+    threads1 = [None]
+    prefix_slop = 1
+
+    action = self.start_action[start_index]
+    while action != -1:
+      state, transition = self.actions[action]
+      transit(transition)
+      if state == 0:
+        # there is only one match, which is complete
+        assert len(threads0) == prefix_slop + 1
+        return threads0[prefix_slop]
+      if i >= len(text):
+        # return best match we have, but not incomplete match
+        i = self.states[state][2]
+        return (None if i == -1 else threads0[prefix_slop + i])
+      action = self.states[state][1][
+        bisect.bisect_right(self.states[state][0], ord(text[i]))
+      ]
+      i += 1
+    return None
+
+  def match_yychunk(self, root, pos, off, yychunk_iter, start_index = 0):
+    if pos < 0:
+      pos, off = element.to_start_relative(root, pos, off)
+
+    def transit(transition):
+      nonlocal threads0, threads1, prefix_slop # note: also uses pos, off
+      j = prefix_slop
+      for trans in transition:
+        if trans[0] == DFA.TRANSITION_POP:
+          j += trans[1]
+        elif trans[0] == DFA.TRANSITION_DUP:
+          while j < trans[1]:
+            threads0[:0] = [None] * prefix_slop
+            threads1[:0] = [None] * prefix_slop
+            j += prefix_slop
+            prefix_slop *= 2
+          threads0[j - trans[1]:j] = threads0[j:j + trans[1]]
+          j -= trans[1]
+        elif trans[0] == DFA.TRANSITION_MARK:
+          threads0[j:j + trans[1]] = [
+            (pos, off, trans[2], thread)
+            for thread in threads0[j:j + trans[1]]
+          ]
+        elif trans[0] == DFA.TRANSITION_MOVE:
+          threads1.extend(threads0[j:j + trans[1]])
+          j += trans[1]
+        #elif trans[0] == DFA.TRANSITION_DEL:
+        #  del threads1[-trans[1]:]
+        else:
+          assert False
+      assert j == len(threads0)
+      threads0, threads1 = threads1, threads0
+      del threads1[prefix_slop:]
+
+    threads0 = [None, None]
+    threads1 = [None]
+    prefix_slop = 1
+
+    action = self.start_action[start_index]
+    text = element.get_text(root, pos)
+    while action != -1:
+      state, transition = self.actions[action]
+      transit(transition)
+      if state == 0:
+        # there is only one match, which is complete
+        assert len(threads0) == prefix_slop + 1
+        return threads0[prefix_slop]
+      while off >= len(text):
+        if pos < len(root):
+          pos += 1
+          off = 0
+        else:
+          try:
+            next(yychunk_iter)
+          except StopIteration:
+            # return best match we have, but not incomplete match
+            i = self.states[state][2]
+            return (None if i == -1 else threads0[prefix_slop + i])
+          text = element.get_text(root, pos)
+      #print(
+      #  'state {0:d} pos {1:d} off {2:d} text "{3:s}"'.format(
+      #    state,
+      #    pos,
+      #    off,
+      #    text.replace('\n', '$')
+      #  )
+      #)
+      action = self.states[state][1][
+        bisect.bisect_right(self.states[state][0], ord(text[off]))
+      ]
+      off += 1
+    return None
+
+  def yylex(self, root, pos, off, factory, yychunk_iter):
+    if pos < 0:
+      pos, off = element.to_start_relative(root, pos, off)
+
+    while True:
+      # note: pointers must be kept start relative during the below call,
+      # because it extends the following text by calling the yychunk_iter
+      thread = self.match_yychunk(root, pos, off, yychunk_iter)
+      if thread is None:
+        break
+      stack = []
+      while True:
+        pos, off, mark_value, thread = thread
+        group_index = mark_value >> 1
+        if (mark_value & 1) != 0:
+          end_pos, end_off = element.to_end_relative(root, pos, off)
+          stack.append((end_pos, end_off, group_index))
+        else:
+          end_pos, end_off, temp = stack.pop()
+          assert temp == group_index
+          if len(stack) == 0:
+            break
+          tag, kwargs = self.groups[group_index]
+          if tag != '':
+            work.apply_markup(
+              root,
+              pos,
+              off,
+              end_pos,
+              end_off,
+              factory,
+              tag,
+              **kwargs
+            )
+      # note: pointers must be kept end relative during the below call,
+      # because it modifies the preceding text by calling apply_markup()
+      yield end_pos, end_off, group_index
+      pos, off = element.to_start_relative(root, end_pos, end_off)
+
+  def __repr__(self):
+    return 'regex.DFA({0:s}, {1:s}, {2:s}, {3:s})'.format(
+      repr(self.groups),
+      repr(self.states),
+      repr(self.actions),
+      repr(self.start_action)
+    )
+
+class LR1:
+  def __init__(self, productions = [], terminal_thres = chars):
+    # productions: list of production
+    # production: (
+    #   priority,
+    #   symbols,
+    #   lookaheads,
+    #   group_bounds
+    # )
+    # priority: bit 0 = right to left, bits 1: = numeric priority
+    # symbols: list of symbol_desc
+    # symbol_desc: (terminal_set, nonterminal_set)
+    # terminal_set: similar to char_set, even length list of pairs of breaks
+    # nonterminal_set: as above but has terminal_thres subtracted from breaks
+    # lookaheads: list of lookahead_desc, len(lookaheads) = len(symbols) + 1
+    # lookahead_desc: (initial_set, can_be_empty)
+    # initial_set: what terminals can occur at this position in symbols array,
+    # computed such that lookaheads[i][0] is symbols[i][0], plus initial set of
+    # each nonterminal from symbols[i][1], plus lookaheads[i + 1][0] if any
+    # nonterminal of symbols[i][1] can be empty (may pull in i + 2, i + 3 etc)
+    # can_be_empty: whether all symbols from this position to end can be empty
+    # note: last lookahead_desc is a sentinel consisting of ([], True), so that
+    # initial_set is empty (will be augmented by context) and can_be_empty is
+    # True (because all symbols from the end to the end can obviously be empty)
+    # group_bounds: list of group_bound
+    # group_bound: (start_index, end_index, tag, kwargs)
+    #   where start_index, end_index are indices into list of char_set,
+    #   and tag, kwargs will be passed to apply_markup() hence factory(),
+    #   noting that markup has to be applied in reverse order of the list
+    # terminal_thres: offset to apply to productions[] index to get symbol
+    #   (char set code), also symbol for productions[0] = start production
+    self.productions = productions
+    self.terminal_thres = terminal_thres
+
+  def lookahead_item_set_closure(self, items, item_to_index):
+    in_queue = [True for i in range(len(items))]
+    queue = list(range(len(items)))
+
+    qhead = 0
+    while qhead < len(queue):
+      i = queue[qhead]
+      in_queue[i] = False
+      j, k, lookahead_set = items[i]
+      _, symbols, lookaheads, _ = self.productions[j]
+      if k < len(symbols):
+        _, nonterminal_set = symbols[k]
+        if len(nonterminal_set):
+          next_lookahead_set, next_can_be_empty = lookaheads[k + 1]
+          if next_can_be_empty:
+            next_lookahead_set = char_set_or(
+              next_lookahead_set,
+              lookahead_set
+            )
+          for l in range(0, len(nonterminal_set), 2):
+            for m in range(nonterminal_set[l], nonterminal_set[l + 1]):
+              key = (m, 0)
+              if key in item_to_index:
+                n = item_to_index[key]
+                child_lookahead_set = char_set_or(
+                  items[n][2],
+                  next_lookahead_set
+                )
+                if child_lookahead_set != items[n][2]:
+                  items[n] = (m, 0, child_lookahead_set)
+                  if not in_queue[n]:
+                    # optimization: do not re-queue unless it is transparent
+                    # to changes in the lookahead set wrt. further propagation
+                    _, child_symbols, child_lookaheads, _ = self.productions[m]
+                    if len(child_symbols) and child_lookaheads[1][1]:
+                      in_queue[n] = True
+                      queue.append(n)
+              else:
+                n = len(items)
+                items.append((m, 0, next_lookahead_set))
+                item_to_index[key] = n
+                in_queue.append(True)
+                queue.append(n)
+      qhead += 1 
+
+  def lookahead_item_set_shift(self, items, terminal):
+    next_items = []
+    next_item_to_index = {}
+    reductions = set()
+    terminal0 = 0
+    terminal1 = self.terminal_thres
+    for i, j, lookahead_set in items:
+      _, symbols, _, _ = self.productions[i]
+      if j < len(symbols):
+        terminal_set, _ = symbols[j]
+        k = bisect.bisect_right(terminal_set, terminal)
+        if k > 0 and terminal0 < terminal_set[k - 1]:
+          terminal0 = terminal_set[k - 1]
+        if k < len(terminal_set) and terminal1 > terminal_set[k]:
+          terminal1 = terminal_set[k]
+        if (k & 1) == 1:
+          next_item_to_index[(i, j + 1)] = len(next_items)
+          next_items.append((i, j + 1, lookahead_set))
+      else:
+        k = bisect.bisect_right(lookahead_set, terminal)
+        if k > 0 and terminal0 < lookahead_set[k - 1]:
+          terminal0 = lookahead_set[k - 1]
+        if k < len(lookahead_set) and terminal1 > lookahead_set[k]:
+          terminal1 = lookahead_set[k]
+        if (k & 1) == 1:
+          reductions.add(i)
+    return next_items, next_item_to_index, reductions, terminal0, terminal1
+
+  def lookahead_item_set_goto(self, items, nonterminal):
+    next_items = []
+    next_item_to_index = {}
+    reductions = set()
+    nonterminal0 = 0
+    nonterminal1 = len(self.productions)
+    for i, j, lookahead_set in items:
+      _, symbols, _, _ = self.productions[i]
+      if j < len(symbols):
+        _, nonterminal_set = symbols[j]
+        k = bisect.bisect_right(nonterminal_set, nonterminal)
+        if k > 0 and nonterminal0 < nonterminal_set[k - 1]:
+          nonterminal0 = nonterminal_set[k - 1]
+        if k < len(nonterminal_set) and nonterminal1 > nonterminal_set[k]:
+          nonterminal1 = nonterminal_set[k]
+        if (k & 1) == 1:
+          next_item_to_index[(i, j + 1)] = len(next_items)
+          next_items.append((i, j + 1, lookahead_set))
+    return next_items, next_item_to_index, nonterminal0, nonterminal1
+
+  def parse_text(self, text, i):
+    items = [(0, 0, [chars, chars + 1])] # EOF
+    item_to_index = {(0, 0): 0}
+    value_stack = []
+    state_stack = []
+    lookahead_char = ord(text[i]) if i < len(text) else chars # EOF
+    while True:
+      self.lookahead_item_set_closure(items, item_to_index)
+      value_stack.append(i)
+      state_stack.append(items)
+      items, item_to_index, reductions, _, _ = (
+        self.lookahead_item_set_shift(items, lookahead_char)
+      )
+      if len(items) != 0:
+        if len(reductions) != 0:
+          sys.stderr.write(
+            'shift/reduce conflict: {0:s} vs {1:s}\n'.format(
+              ','.join([str(i) for i, _, _ in next_items]),
+              ','.join([str(i) for i in reductions])
+            )
+          )
+        i += 1
+        lookahead_char = ord(text[i]) if i < len(text) else chars # EOF
+      elif len(reductions) != 0:
+        if len(reductions) != 1:
+          sys.stderr.write(
+            'reduce/reduce conflict: {0:s}\n'.format(
+              ','.join([str(i) for i in reductions])
+            )
+          )
+        reduce = min(reductions)
+        _, symbols, _, group_bounds = self.productions[reduce]
+        base = len(value_stack) - len(symbols) - 1
+        for j in range(len(group_bounds) - 1, -1, -1):
+          k, l, tag, _ = group_bounds[j]
+          k += base
+          if l != 1:
+            value_stack[k + 1:k + l + 1] = [value_stack[k + l]]
+          sys.stderr.write(
+            'text \'{0:s}\' tag \'{1:s}\'\n'.format(
+              text[value_stack[k]:value_stack[k + 1]],
+              tag
+            )
+          )
+        del value_stack[base + 1:]
+        del state_stack[base + 1:]
+        if reduce == 0:
+          assert base == 0
+          return
+        items, item_to_index, _, _ = (
+          self.lookahead_item_set_goto(state_stack[-1], reduce)
+        )
+        assert len(items) != 0
+      else:
+        raise Exception(
+          'syntax error at {0:d}: {1:s}'.format(i, text[i:])
+        )
+
+  def parse_yychunk(self, root, pos, off, factory, yychunk_iter):
+    if pos < 0:
+      pos, off = element.to_start_relative(root, pos, off)
+
+    items = [(0, 0, [chars, chars + 1])] # EOF
+    item_to_index = {(0, 0): 0}
+    value_stack = []
+    state_stack = []
+    text = element.get_text(root, pos)
+    while off >= len(text):
+      if pos < len(root):
+        pos += 1
+        off = 0
+      else:
+        try:
+          next(yychunk_iter)
+        except StopIteration:
+          lookahead_char = chars # EOF
+          break
+        text = element.get_text(root, pos)
+    else: 
+      lookahead_char = ord(text[off])
+    while True:
+      self.lookahead_item_set_closure(items, item_to_index)
+      value_stack.append((pos, off))
+      state_stack.append(items)
+      items, item_to_index, reductions, _, _ = (
+        self.lookahead_item_set_shift(items, lookahead_char)
+      )
+      if len(items) != 0:
+        if len(reductions) != 0:
+          sys.stderr.write(
+            'shift/reduce conflict: {0:s} vs {1:s}\n'.format(
+              ','.join([str(i) for i, _ in next_lookahead_item_set.keys()]),
+              ','.join([str(i) for i in reductions])
+            )
+          )
+        off += 1
+        while off >= len(text):
+          if pos < len(root):
+            pos += 1
+            off = 0
+          else:
+            try:
+              next(yychunk_iter)
+            except StopIteration:
+              lookahead_char = chars # EOF
+              break
+            text = element.get_text(root, pos)
+        else: 
+          lookahead_char = ord(text[off])
+      elif len(reductions) != 0:
+        if len(reductions) != 1:
+          sys.stderr.write(
+            'reduce/reduce conflict: {0:s}\n'.format(
+              ','.join([str(i) for i in reductions])
+            )
+          )
+        reduce = min(reductions)
+        _, symbols, _, group_bounds = self.productions[reduce]
+        base = len(value_stack) - len(symbols) - 1
+        end_relative = len(value_stack)
+        for j in range(len(group_bounds) - 1, -1, -1):
+          k, l, tag, kwargs = group_bounds[j]
+          k += base
+          assert k < end_relative
+          if l != 1:
+            value_stack[k + 1:k + l + 1] = [value_stack[k + l]]
+            end_relative = max(k + 1, end_relative + 1 - l)
+          while end_relative > k + 1:
+            end_relative -= 1
+            pos1, off1 = value_stack[end_relative]
+            value_stack[end_relative] = (
+              element.to_end_relative(root, pos1, off1)
+            )
+          pos0, off0 = value_stack[k]
+          pos1, off1 = value_stack[k + 1]
+          work.apply_markup(
+            root,
+            pos0,
+            off0,
+            pos1,
+            off1,
+            factory,
+            tag,
+            **kwargs
+          )
+        if end_relative < len(value_stack):
+          pos, off = value_stack[-1]
+          pos, off = element.to_start_relative(root, pos, off)
+          text = element.get_text(root, pos)
+        del value_stack[base + 1:]
+        del state_stack[base + 1:]
+        if reduce == 0:
+          assert base == 0
+          return
+        items, item_to_index, _, _ = (
+          self.lookahead_item_set_goto(state_stack[-1], reduce)
+        )
+        assert len(items) != 0
+      else:
+        raise Exception(
+          'syntax error at {0:d},{1:d}: {2:s}'.format(pos, off, text[off:])
+        )
+
+  def to_clr1(self):
+    lr1dfa = LR1DFA(
+      [],
+      [
+        (len(symbols), group_bounds)
+        for _, symbols, _, group_bounds in self.productions
+      ],
+      self.terminal_thres
+    )
+
+    items = [(0, 0, [chars, chars + 1])] # EOF
+    item_to_index = {(0, 0): 0}
+    self.lookahead_item_set_closure(items, item_to_index)
+
+    items = sorted(items)
+    key = tuple((i, j, tuple(k)) for i, j, k in items)
+    state_to_items = [items]
+    items_to_state = {key: 0}
+
+    while len(lr1dfa.states) < len(state_to_items):
+      items = state_to_items[len(lr1dfa.states)]
+      state_desc = ([], [], [], [])
+
+      def add_state(next_items, next_item_to_index):
+        self.lookahead_item_set_closure(next_items, next_item_to_index)
+        new_items = sorted(next_items)
+        key = tuple((i, j, tuple(k)) for i, j, k in new_items)
+        if key in items_to_state:
+          state = items_to_state[key]
+        else:
+          state = len(state_to_items)
+          state_to_items.append(new_items)
+          items_to_state[key] = state
+        return state
+
+      terminal = 0
+      while terminal < self.terminal_thres:
+        next_items, next_item_to_index, reductions, terminal0, terminal1 = (
+          self.lookahead_item_set_shift(items, terminal)
+        )
+        assert terminal0 == terminal and terminal1 > terminal
+        if len(next_items) != 0:
+          if len(reductions) != 0:
+            sys.stderr.write(
+              'state {0:d} shift/reduce conflict: {1:s} vs {2:s}\n'.format(
+                len(lr1dfa.states),
+                ','.join([str(i) for i, _, _ in next_items]),
+                ','.join([str(i) for i in reductions])
+              )
+            )
+          action = add_state(next_items, next_item_to_index) * 2
+        elif len(reductions) != 0:
+          if len(reductions) != 1:
+            sys.stderr.write(
+              'state {0:d} reduce/reduce conflict: {1:s}\n'.format(
+                len(lr1dfa.states),
+                ','.join([str(i) for i in reductions])
+              )
+            )
+          action = min(reductions) * 2 + 1 # odd means reduce
+        else:
+          action = -1
+        state_desc[0].append(terminal1)
+        state_desc[1].append(action)
+        terminal = terminal1
+
+      nonterminal = 0
+      while nonterminal < len(self.productions):
+        next_items, next_item_to_index, nonterminal0, nonterminal1 = (
+          self.lookahead_item_set_goto(items, nonterminal)
+        )
+        assert nonterminal0 == nonterminal and nonterminal1 > nonterminal
+        if len(next_items) != 0:
+          goto = add_state(next_items, next_item_to_index)
+        else:
+          goto = -1
+        state_desc[2].append(nonterminal1)
+        state_desc[3].append(goto)
+        nonterminal = nonterminal1
+
+      lr1dfa.states.append(state_desc)
+    return lr1dfa
+
+  def to_lalr1(self):
+    lr1dfa = LR1DFA(
+      [],
+      [
+        (len(symbols), group_bounds)
+        for _, symbols, _, group_bounds in self.productions
+      ],
+      self.terminal_thres
+    )
+
+    items = [(0, 0, [chars, chars + 1])] # EOF
+    item_to_index = {(0, 0): 0}
+    self.lookahead_item_set_closure(items, item_to_index)
+
+    items = sorted(items)
+    key = tuple((i, j) for i, j, _ in items) # ignore lookahead
+    state_to_items = [items]
+    items_to_state = {key: 0}
+
+    in_queue = [True]
+    queue = [0]
+
+    qhead = 0
+    while qhead < len(queue):
+      i = queue[qhead]
+      in_queue[i] = False
+      items = state_to_items[i]
+      state_desc = ([], [], [], [])
+
+      def add_state(next_items, next_item_to_index):
+        self.lookahead_item_set_closure(next_items, next_item_to_index)
+        new_items = sorted(next_items)
+        key = tuple((i, j) for i, j, _ in new_items) # ignore lookahead
+        if key in items_to_state:
+          state = items_to_state[key]
+          state_items = state_to_items[state]
+          for i in range(len(new_items)):
+            j, k, lookahead_set = new_items[i]
+            lookahead_set = char_set_or(lookahead_set, state_items[i][2])
+            if lookahead_set != state_items[i][2]:
+              state_items[i] = (j, k, lookahead_set)
+              if not in_queue[state]:
+                in_queue[state] = True
+                queue.append(state)
+        else:
+          state = len(state_to_items)
+          state_to_items.append(new_items)
+          items_to_state[key] = state
+          in_queue.append(True)
+          queue.append(state)
+        return state
+
+      terminal = 0
+      while terminal < self.terminal_thres:
+        next_items, next_item_to_index, reductions, terminal0, terminal1 = (
+          self.lookahead_item_set_shift(items, terminal)
+        )
+        assert terminal0 == terminal and terminal1 > terminal
+        if len(next_items) != 0:
+          if len(reductions) != 0:
+            sys.stderr.write(
+              'state {0:d} shift/reduce conflict: {1:s} vs {2:s}\n'.format(
+                i,
+                ','.join([str(j) for j, _, _ in next_items]),
+                ','.join([str(j) for j in reductions])
+              )
+            )
+          action = add_state(next_items, next_item_to_index) * 2
+        elif len(reductions) != 0:
+          if len(reductions) != 1:
+            sys.stderr.write(
+              'state {0:d} reduce/reduce conflict: {1:s}\n'.format(
+                i,
+                ','.join([str(j) for j in reductions])
+              )
+            )
+          action = min(reductions) * 2 + 1 # odd means reduce
+        else:
+          action = -1
+        state_desc[0].append(terminal1)
+        state_desc[1].append(action)
+        terminal = terminal1
+
+      nonterminal = 0
+      while nonterminal < len(self.productions):
+        next_items, next_item_to_index, nonterminal0, nonterminal1 = (
+          self.lookahead_item_set_goto(items, nonterminal)
+        )
+        assert nonterminal0 == nonterminal and nonterminal1 > nonterminal
+        if len(next_items) != 0:
+          goto = add_state(next_items, next_item_to_index)
+        else:
+          goto = -1
+        state_desc[2].append(nonterminal1)
+        state_desc[3].append(goto)
+        nonterminal = nonterminal1
+
+      if i < len(lr1dfa.states):
+        lr1dfa.states[i] = state_desc
+      else:
+        lr1dfa.states.append(state_desc)
+      qhead += 1
+    return lr1dfa
+
+  def __repr__(self):
+    return 'regex.LR1({0:s}, {1:d})'.format(
+      repr(self.productions),
+      self.terminal_thres
+    )
+
+class LR1DFA:
+  def __init__(self, states = [], productions = [], terminal_thres = chars):
+    # states: list of state_desc
+    # state_desc: (terminal breaks, actions, nonterminal breaks, gotos)
+    # action: shift = new state * 2, reduce = production * 2 + 1, error = -1
+    # goto: reduce = production, error = -1 (but error can't really happen)
+    # productions: list of production
+    # production: (len(symbols), group_bounds)
+    # len(symbols): how many states to pop stack to reduce this production
+    # group_bounds: list of group_bound
+    # group_bound: (start_index, end_index, tag, kwargs)
+    #   where start_index, end_index are indices into list of char_set,
+    #   and tag, kwargs will be passed to apply_markup() hence factory(),
+    #   noting that markup has to be applied in reverse order of the list
+    # terminal_thres: offset to apply to productions[] index to get symbol
+    #   (char set code), also symbol for productions[0] = start production
+    self.states = states
+    self.productions = productions
+    self.terminal_thres = terminal_thres
+
+  def parse_text(self, text, i):
+    state = 0
+    value_stack = []
+    state_stack = []
+    lookahead_char = ord(text[i]) if i < len(text) else chars # EOF
+    while True:
+      value_stack.append(i)
+      state_stack.append(state)
+      action = self.states[state][1][
+        bisect.bisect_right(self.states[state][0], lookahead_char)
+      ]
+      if action == -1:
+        raise Exception(
+          'syntax error at {0:d}: {1:s}'.format(i, text[i:])
+        )
+      if (action & 1) == 0:
+        state = action >> 1
+        i += 1
+        lookahead_char = ord(text[i]) if i < len(text) else chars # EOF
+      else:
+        reduce = action >> 1
+        len_symbols, group_bounds = self.productions[reduce]
+        base = len(value_stack) - len_symbols - 1
+        for j in range(len(group_bounds) - 1, -1, -1):
+          k, l, tag, _ = group_bounds[j]
+          k += base
+          if l != 1:
+            value_stack[k + 1:k + l + 1] = [value_stack[k + l]]
+          sys.stdout.write(
+            'text \'{0:s}\' tag \'{1:s}\'\n'.format(
+              text[value_stack[k]:value_stack[k + 1]],
+              tag
+            )
+          )
+        del value_stack[base + 1:]
+        del state_stack[base + 1:]
+        if reduce == 0:
+          assert base == 0
+          return
+        state = self.states[state_stack[-1]][3][
+          bisect.bisect_right(self.states[state_stack[-1]][2], reduce)
+        ]
+        assert state != -1
+
+  def parse_yychunk(self, root, pos, off, factory, yychunk_iter):
+    if pos < 0:
+      pos, off = element.to_start_relative(root, pos, off)
+
+    state = 0
+    value_stack = []
+    state_stack = []
+    text = element.get_text(root, pos)
+    while off >= len(text):
+      if pos < len(root):
+        pos += 1
+        off = 0
+      else:
+        try:
+          next(yychunk_iter)
+        except StopIteration:
+          lookahead_char = chars # EOF
+          break
+        text = element.get_text(root, pos)
+    else: 
+      lookahead_char = ord(text[off])
+    while True:
+      value_stack.append((pos, off))
+      state_stack.append(state)
+      action = self.states[state][1][
+        bisect.bisect_right(self.states[state][0], lookahead_char)
+      ]
+      #print('lookahead_char', lookahead_char, 'action', action)
+      if action == -1:
+        raise Exception(
+          'syntax error at {0:d},{1:d}: {2:s}'.format(pos, off, text[off:])
+        )
+      if (action & 1) == 0:
+        state = action >> 1
+        off += 1
+        while off >= len(text):
+          if pos < len(root):
+            pos += 1
+            off = 0
+          else:
+            try:
+              next(yychunk_iter)
+            except StopIteration:
+              lookahead_char = chars # EOF
+              break
+            text = element.get_text(root, pos)
+        else: 
+          lookahead_char = ord(text[off])
+      else:
+        reduce = action >> 1
+        len_symbols, group_bounds = self.productions[reduce]
+        base = len(value_stack) - len_symbols - 1
+        end_relative = len(value_stack)
+        for j in range(len(group_bounds) - 1, -1, -1):
+          k, l, tag, kwargs = group_bounds[j]
+          k += base
+          assert k < end_relative
+          if l != 1:
+            value_stack[k + 1:k + l + 1] = [value_stack[k + l]]
+            end_relative = max(k + 1, end_relative + 1 - l)
+          while end_relative > k + 1:
+            end_relative -= 1
+            pos1, off1 = value_stack[end_relative]
+            value_stack[end_relative] = (
+              element.to_end_relative(root, pos1, off1)
+            )
+          pos0, off0 = value_stack[k]
+          pos1, off1 = value_stack[k + 1]
+          work.apply_markup(
+            root,
+            pos0,
+            off0,
+            pos1,
+            off1,
+            factory,
+            tag,
+            **kwargs
+          )
+        if end_relative < len(value_stack):
+          pos, off = value_stack[-1]
+          pos, off = element.to_start_relative(root, pos, off)
+          text = element.get_text(root, pos)
+        del value_stack[base + 1:]
+        del state_stack[base + 1:]
+        if reduce == 0:
+          assert base == 0
+          return
+        state = self.states[state_stack[-1]][3][
+          bisect.bisect_right(self.states[state_stack[-1]][2], reduce)
+        ]
+        assert state != -1
+
+  def yyparse(self, root, pos, off, factory, yylex_iter):
+    if pos < 0:
+      pos, off = element.to_start_relative(root, pos, off)
+
+    state = 0
+    value_stack = []
+    state_stack = []
+    try:
+      end_pos, end_off, lookahead_char = next(yylex_iter)
+    except StopIteration:
+      lookahead_char = chars # EOF
+      end_pos, end_off = element.to_end_relative(root, pos, off)
+    while True:
+      value_stack.append((pos, off))
+      state_stack.append(state)
+      action = self.states[state][1][
+        bisect.bisect_right(self.states[state][0], lookahead_char)
+      ]
+      #print('lookahead_char', lookahead_char, 'action', action)
+      if action == -1:
+        raise Exception(
+          'syntax error at {0:d},{1:d}: {2:d}'.format(pos, off, lookahead_char)
+        )
+      if (action & 1) == 0:
+        state = action >> 1
+        pos, off = element.to_start_relative(root, end_pos, end_off)
+        try:
+          end_pos, end_off, lookahead_char = next(yylex_iter)
+        except StopIteration:
+          lookahead_char = chars # EOF
+          #end_pos, end_off = element.to_end_relative(root, pos, off)
+      else:
+        reduce = action >> 1
+        len_symbols, group_bounds = self.productions[reduce]
+        base = len(value_stack) - len_symbols - 1
+        end_relative = len(value_stack)
+        for j in range(len(group_bounds) - 1, -1, -1):
+          k, l, tag, kwargs = group_bounds[j]
+          k += base
+          assert k < end_relative
+          if l != 1:
+            value_stack[k + 1:k + l + 1] = [value_stack[k + l]]
+            end_relative = max(k + 1, end_relative + 1 - l)
+          while end_relative > k + 1:
+            end_relative -= 1
+            pos1, off1 = value_stack[end_relative]
+            value_stack[end_relative] = (
+              element.to_end_relative(root, pos1, off1)
+            )
+          pos0, off0 = value_stack[k]
+          pos1, off1 = value_stack[k + 1]
+          work.apply_markup(
+            root,
+            pos0,
+            off0,
+            pos1,
+            off1,
+            factory,
+            tag,
+            **kwargs
+          )
+        if end_relative < len(value_stack):
+          pos, off = value_stack[-1]
+          pos, off = element.to_start_relative(root, pos, off)
+        del value_stack[base + 1:]
+        del state_stack[base + 1:]
+        if reduce == 0:
+          assert base == 0
+          return
+        state = self.states[state_stack[-1]][3][
+          bisect.bisect_right(self.states[state_stack[-1]][2], reduce)
+        ]
+        assert state != -1
+
+  def __repr__(self):
+    return 'regex.LR1DFA({0:s}, {1:s}, {2:d})'.format(
+      repr(self.states),
+      repr(self.productions),
+      self.terminal_thres
+    )
+
+def wrap_repr(text, width):
+  lines = []
+  i = 0
+  while i < len(text):
+    j = i + width
+    if j < len(text):
+      j = max(
+        [
+          text.rfind('(', i, j) + 1,
+          text.rfind('[', i, j) + 1,
+          text.rfind('{', i, j) + 1,
+          text.rfind('.', i, j) + 1,
+          text.rfind(')', i, j + 1),
+          text.rfind(']', i, j + 1),
+          text.rfind('}', i, j + 1),
+          text.rfind(' ', i, j + 1),
+        ]
+      )
+      assert j > 0
+    lines.append(text[i:j] + '\n')
+    i = j
+    while text[i:i + 1] == ' ':
+      i += 1
+  return ''.join(lines) 
+
+if __name__ == '__main__':
+  import xml.etree.ElementTree
+
+  regex = RegexAnd(children = [RegexRepeat(children = [RegexCharacterNot(
+children = [RegexCharacter()], char_set = [0, 256])]), RegexGroup(children = [
+RegexOr(children = [RegexOr(children = [RegexOr(children = [RegexGroup(children
+= [RegexRepeat(children = [RegexCharacter(char_set = [9, 14, 32, 33])],
+one_or_more = True)], group_index = 1, group_name = 'Whitespace'), RegexGroup(
+children = [RegexRepeat(children = [RegexCharacter(char_set = [48, 58])],
+one_or_more = True)], group_index = 2, group_name = 'Number')]), RegexGroup(
+children = [RegexSequence(children = [RegexSequence(children = [RegexSequence(
+children = [RegexEmpty(), RegexCharacter(char_set = [102, 103])]),
+RegexCharacter(char_set = [111, 112])]), RegexCharacter(char_set = [114, 115])]
+)], group_index = 3, group_name = 'For')]), RegexGroup(children = [
+RegexSequence(children = [RegexCharacter(char_set = [65, 91, 95, 96, 97, 123]),
+RegexRepeat(children = [RegexCharacter(char_set = [48, 58, 65, 91, 95, 96, 97,
+123])])])], group_index = 4, group_name = 'Identifier')])], group_index = 0)])
+  #sys.stdout.write(
+  #  wrap_repr(
+  #    '  regex = {0:s}'.format(repr(regex).replace('regex.', '')),
+  #    79
+  #  )
+  #)
+
+  nfa = regex.to_nfa()
+  #sys.stdout.write(
+  #  wrap_repr(
+  #    '  nfa = {0:s}'.format(repr(nfa).replace('regex.', '')),
+  #    79
+  #  )
+  #)
+
+  text = '     id   99id id99 for forex  '
+  i = 0
+  while i < len(text):
+    print('text "{0:s}"'.format(text[i:i + 72].replace('\n', '$')))
+    thread = nfa.match_text(text, i)
+    if thread is None:
+      print('no match')
+      break
+    i = thread[0] # end position of overall match
+    group_start = [-1 for j in range(len(nfa.groups))]
+    group_end = [-1 for j in range(len(nfa.groups))]
+    while thread is not None:
+      pos, mark, thread = thread
+      group = mark >> 1
+      if (mark & 1) == 0:
+        group_start[group] = pos
+        print(
+          'group {0:d} name "{1:s}" text "{2:s}"'.format(
+             group,
+             nfa.groups[group][0],
+             text[group_start[group]:group_end[group]].replace('\n', '$')
+          )
+        )
+      else:
+        group_end[group] = pos
+
+  dfa = nfa.to_dfa()
+  #sys.stdout.write(
+  #  wrap_repr(
+  #    '  dfa = {0:s}'.format(repr(dfa).replace('regex.', '')),
+  #    79
+  #  )
+  #)
+
+  text = '     id   99id id99 for forex  '
+  i = 0
+  while i < len(text):
+    print('text "{0:s}"'.format(text[i:i + 72].replace('\n', '$')))
+    thread = dfa.match_text(text, i)
+    if thread is None:
+      print('no match')
+      break
+    i = thread[0] # end position of overall match
+    group_start = [-1 for j in range(len(dfa.groups))]
+    group_end = [-1 for j in range(len(dfa.groups))]
+    while thread is not None:
+      pos, mark, thread = thread
+      group = mark >> 1
+      if (mark & 1) == 0:
+        group_start[group] = pos
+        print(
+          'group {0:d} name "{1:s}" text "{2:s}"'.format(
+             group,
+             dfa.groups[group][0],
+             text[group_start[group]:group_end[group]].replace('\n', '$')
+          )
+        )
+      else:
+        group_end[group] = pos
+
+  grammar = Grammar(children = [Grammar.Production(children = [RegexSequence(
+children = [RegexSequence(children = [RegexEmpty(), RegexCharacterRule(char_set
+= [288, 295], rule_name = 'whitespace_opt')]), RegexCharacterRule(char_set = [
+259, 262], rule_name = 'expr0')])], nonterminal = 0), Grammar.Production(
+children = [RegexSequence(children = [RegexEmpty(), RegexCharacterRule(char_set
+= [262, 265], rule_name = 'expr1')])], nonterminal = 1), Grammar.Production(
+children = [RegexSequence(children = [RegexEmpty(), RegexGroup(children = [
+RegexSequence(children = [RegexSequence(children = [RegexSequence(children = [
+RegexSequence(children = [RegexEmpty(), RegexCharacterRule(char_set = [259, 262
+], rule_name = 'expr0')]), RegexCharacter(char_set = [43, 44])]),
+RegexCharacterRule(char_set = [288, 295], rule_name = 'whitespace_opt')]),
+RegexCharacterRule(char_set = [262, 265], rule_name = 'expr1')])], group_index
+= 0, group_name = 'Add')])], nonterminal = 2), Grammar.Production(children = [
+RegexSequence(children = [RegexEmpty(), RegexGroup(children = [RegexSequence(
+children = [RegexSequence(children = [RegexSequence(children = [RegexSequence(
+children = [RegexEmpty(), RegexCharacterRule(char_set = [259, 262], rule_name =
+'expr0')]), RegexCharacter(char_set = [45, 46])]), RegexCharacterRule(char_set
+= [288, 295], rule_name = 'whitespace_opt')]), RegexCharacterRule(char_set = [
+262, 265], rule_name = 'expr1')])], group_index = 0, group_name = 'Subtract')])
+], nonterminal = 3), Grammar.Production(children = [RegexSequence(children = [
+RegexEmpty(), RegexCharacterRule(char_set = [265, 268], rule_name = 'expr2')])
+], nonterminal = 4), Grammar.Production(children = [RegexSequence(children = [
+RegexEmpty(), RegexGroup(children = [RegexSequence(children = [RegexSequence(
+children = [RegexSequence(children = [RegexSequence(children = [RegexEmpty(),
+RegexCharacterRule(char_set = [262, 265], rule_name = 'expr1')]),
+RegexCharacter(char_set = [42, 43])]), RegexCharacterRule(char_set = [288, 295
+], rule_name = 'whitespace_opt')]), RegexCharacterRule(char_set = [265, 268],
+rule_name = 'expr2')])], group_index = 0, group_name = 'Multiply')])],
+nonterminal = 5), Grammar.Production(children = [RegexSequence(children = [
+RegexEmpty(), RegexGroup(children = [RegexSequence(children = [RegexSequence(
+children = [RegexSequence(children = [RegexSequence(children = [RegexEmpty(),
+RegexCharacterRule(char_set = [262, 265], rule_name = 'expr1')]),
+RegexCharacter(char_set = [47, 48])]), RegexCharacterRule(char_set = [288, 295
+], rule_name = 'whitespace_opt')]), RegexCharacterRule(char_set = [265, 268],
+rule_name = 'expr2')])], group_index = 0, group_name = 'Divide')])],
+nonterminal = 6), Grammar.Production(children = [RegexSequence(children = [
+RegexSequence(children = [RegexEmpty(), RegexGroup(children = [RegexSequence(
+children = [RegexEmpty(), RegexCharacterRule(char_set = [268, 288], rule_name =
+'number')])], group_index = 0, group_name = 'Number')]), RegexCharacterRule(
+char_set = [288, 295], rule_name = 'whitespace_opt')])], nonterminal = 7),
+Grammar.Production(children = [RegexSequence(children = [RegexEmpty(),
+RegexGroup(children = [RegexSequence(children = [RegexSequence(children = [
+RegexSequence(children = [RegexEmpty(), RegexCharacter(char_set = [45, 46])]),
+RegexCharacterRule(char_set = [288, 295], rule_name = 'whitespace_opt')]),
+RegexCharacterRule(char_set = [265, 268], rule_name = 'expr2')])], group_index
+= 0, group_name = 'Negate')])], nonterminal = 8), Grammar.Production(children =
+[RegexSequence(children = [RegexSequence(children = [RegexSequence(children = [
+RegexSequence(children = [RegexSequence(children = [RegexEmpty(),
+RegexCharacter(char_set = [40, 41])]), RegexCharacterRule(char_set = [288, 295
+], rule_name = 'whitespace_opt')]), RegexCharacterRule(char_set = [259, 262],
+rule_name = 'expr0')]), RegexCharacter(char_set = [41, 42])]),
+RegexCharacterRule(char_set = [288, 295], rule_name = 'whitespace_opt')])],
+nonterminal = 9), Grammar.Production(children = [RegexSequence(children = [
+RegexEmpty(), RegexCharacter(char_set = [48, 49])])], nonterminal = 10),
+Grammar.Production(children = [RegexSequence(children = [RegexEmpty(),
+RegexCharacter(char_set = [49, 50])])], nonterminal = 11), Grammar.Production(
+children = [RegexSequence(children = [RegexEmpty(), RegexCharacter(char_set = [
+50, 51])])], nonterminal = 12), Grammar.Production(children = [RegexSequence(
+children = [RegexEmpty(), RegexCharacter(char_set = [51, 52])])], nonterminal =
+13), Grammar.Production(children = [RegexSequence(children = [RegexEmpty(),
+RegexCharacter(char_set = [52, 53])])], nonterminal = 14), Grammar.Production(
+children = [RegexSequence(children = [RegexEmpty(), RegexCharacter(char_set = [
+53, 54])])], nonterminal = 15), Grammar.Production(children = [RegexSequence(
+children = [RegexEmpty(), RegexCharacter(char_set = [54, 55])])], nonterminal =
+16), Grammar.Production(children = [RegexSequence(children = [RegexEmpty(),
+RegexCharacter(char_set = [55, 56])])], nonterminal = 17), Grammar.Production(
+children = [RegexSequence(children = [RegexEmpty(), RegexCharacter(char_set = [
+56, 57])])], nonterminal = 18), Grammar.Production(children = [RegexSequence(
+children = [RegexEmpty(), RegexCharacter(char_set = [57, 58])])], nonterminal =
+19), Grammar.Production(children = [RegexSequence(children = [RegexSequence(
+children = [RegexEmpty(), RegexCharacterRule(char_set = [268, 288], rule_name =
+'number')]), RegexCharacter(char_set = [48, 49])])], nonterminal = 20),
+Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
+= [RegexEmpty(), RegexCharacterRule(char_set = [268, 288], rule_name = 'number'
+)]), RegexCharacter(char_set = [49, 50])])], nonterminal = 21),
+Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
+= [RegexEmpty(), RegexCharacterRule(char_set = [268, 288], rule_name = 'number'
+)]), RegexCharacter(char_set = [50, 51])])], nonterminal = 22),
+Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
+= [RegexEmpty(), RegexCharacterRule(char_set = [268, 288], rule_name = 'number'
+)]), RegexCharacter(char_set = [51, 52])])], nonterminal = 23),
+Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
+= [RegexEmpty(), RegexCharacterRule(char_set = [268, 288], rule_name = 'number'
+)]), RegexCharacter(char_set = [52, 53])])], nonterminal = 24),
+Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
+= [RegexEmpty(), RegexCharacterRule(char_set = [268, 288], rule_name = 'number'
+)]), RegexCharacter(char_set = [53, 54])])], nonterminal = 25),
+Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
+= [RegexEmpty(), RegexCharacterRule(char_set = [268, 288], rule_name = 'number'
+)]), RegexCharacter(char_set = [54, 55])])], nonterminal = 26),
+Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
+= [RegexEmpty(), RegexCharacterRule(char_set = [268, 288], rule_name = 'number'
+)]), RegexCharacter(char_set = [55, 56])])], nonterminal = 27),
+Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
+= [RegexEmpty(), RegexCharacterRule(char_set = [268, 288], rule_name = 'number'
+)]), RegexCharacter(char_set = [56, 57])])], nonterminal = 28),
+Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
+= [RegexEmpty(), RegexCharacterRule(char_set = [268, 288], rule_name = 'number'
+)]), RegexCharacter(char_set = [57, 58])])], nonterminal = 29),
+Grammar.Production(children = [RegexEmpty()], nonterminal = 30),
+Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
+= [RegexEmpty(), RegexCharacterRule(char_set = [288, 295], rule_name =
+'whitespace_opt')]), RegexCharacter(char_set = [9, 10])])], nonterminal = 31),
+Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
+= [RegexEmpty(), RegexCharacterRule(char_set = [288, 295], rule_name =
+'whitespace_opt')]), RegexCharacter(char_set = [10, 11])])], nonterminal = 32),
+Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
+= [RegexEmpty(), RegexCharacterRule(char_set = [288, 295], rule_name =
+'whitespace_opt')]), RegexCharacter(char_set = [11, 12])])], nonterminal = 33),
+Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
+= [RegexEmpty(), RegexCharacterRule(char_set = [288, 295], rule_name =
+'whitespace_opt')]), RegexCharacter(char_set = [12, 13])])], nonterminal = 34),
+Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
+= [RegexEmpty(), RegexCharacterRule(char_set = [288, 295], rule_name =
+'whitespace_opt')]), RegexCharacter(char_set = [13, 14])])], nonterminal = 35),
+Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
+= [RegexEmpty(), RegexCharacterRule(char_set = [288, 295], rule_name =
+'whitespace_opt')]), RegexCharacter(char_set = [32, 33])])], nonterminal = 36)
+], terminal_thres = 258)
+  #sys.stdout.write(
+  #  wrap_repr(
+  #    '  grammar = {0:s}'.format(repr(grammar).replace('regex.', '')),
+  #    79
+  #  )
+  #)
+
+  lr1 = grammar.to_lr1()
+  #sys.stdout.write(
+  #  wrap_repr(
+  #    '  lr1 = {0:s}'.format(repr(lr1).replace('regex.', '')),
+  #    79
+  #  )
+  #)
+
+  lr1.parse_text('(13 + 5 * 6) * 2', 0)
+  root = element.Element('root', text = '(13 + 5 * 6) * 2')
+  lr1.parse_yychunk(root, 0, 0, element.Element, iter([]))
+  xml.etree.ElementTree.dump(root)
+
+  clr1 = lr1.to_clr1()
+  #sys.stdout.write(
+  #  wrap_repr(
+  #    '  clr1 = {0:s}'.format(repr(clr1).replace('regex.', '')),
+  #    79
+  #  )
+  #)
+
+  clr1.parse_text('(13 + 5 * 6) * 2', 0)
+  root = element.Element('root', text = '(13 + 5 * 6) * 2')
+  clr1.parse_yychunk(root, 0, 0, element.Element, iter([]))
+  xml.etree.ElementTree.dump(root)
+
+  lalr1 = lr1.to_lalr1()
+  #sys.stdout.write(
+  #  wrap_repr(
+  #    '  lalr1 = {0:s}'.format(repr(lalr1).replace('regex.', '')),
+  #    79
+  #  )
+  #)
+
+  lalr1.parse_text('(13 + 5 * 6) * 2', 0)
+  root = element.Element('root', text = '(13 + 5 * 6) * 2')
+  lalr1.parse_yychunk(root, 0, 0, element.Element, iter([]))
+  xml.etree.ElementTree.dump(root)
diff --git a/regex.sh b/regex.sh
new file mode 100755 (executable)
index 0000000..8cc339a
--- /dev/null
+++ b/regex.sh
@@ -0,0 +1,7 @@
+#!/bin/sh
+if ./generate.py regex <regex.py >regex.py.new && ! diff -q regex.py regex.py.new
+then
+  mv regex.py.new regex.py
+else
+  rm -f regex.py.new
+fi
diff --git a/skel/Makefile b/skel/Makefile
new file mode 100644 (file)
index 0000000..8103b05
--- /dev/null
@@ -0,0 +1,4 @@
+lex.yy.c: skel.l
+       ../../bootstrap_flex.git/src/flex $<
+       cp $@ $@.orig
+       patch $@ <$@.patch
diff --git a/skel/lex.yy.c.patch b/skel/lex.yy.c.patch
new file mode 100644 (file)
index 0000000..bcee795
--- /dev/null
@@ -0,0 +1,249 @@
+--- lex.yy.c.orig      2018-06-25 10:36:41.898822220 +1000
++++ lex.yy.c   2018-06-28 00:00:54.431048812 +1000
+@@ -1,6 +1,3 @@
+-
+-#line 2 "lex.yy.c"
+-
+ #define  YY_INT_ALIGNED short int
+ /* A lexical scanner generated by flex */
+@@ -351,179 +348,8 @@
+       (yy_hold_char) = *yy_cp; \
+       *yy_cp = '\0'; \
+       (yy_c_buf_p) = yy_cp;
+-#define YY_NUM_RULES 2
+-#define YY_END_OF_BUFFER 3
+-/* This struct is not used in this scanner,
+-   but its presence is necessary. */
+-struct yy_trans_info
+-      {
+-      flex_int32_t yy_verify;
+-      flex_int32_t yy_nxt;
+-      };
+-static const flex_int16_t yy_acclist[15] =
+-    {   0,
+-     8193,16385, 8193,16385,    3,    2, 8193,    2,16385, 8193,
+-        2, 8193,16385, 8193
+-    } ;
+-
+-static const flex_int16_t yy_accept[11] =
+-    {   0,
+-        1,    3,    5,    6,    7,   10,   12,   14,   15,   15
+-    } ;
+-
+-static const flex_int16_t yy_base[11] =
+-    {   0,
+-        0,    2,  363,  364,    4,  264,    6,  263,  364,  104
+-    } ;
+-
+-static const flex_int16_t yy_def[11] =
+-    {   0,
+-       10,   10,    9,    9,    9,    9,    9,    9,    0,    9
+-    } ;
+-
+-static const flex_int16_t yy_nxt[621] =
+-    {   0,
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-        9,    9,    9,    9,    9,    9,    5,    6,    5,    6,
+-
+-        7,    8,    7,    8,    4,    4,    4,    4,    4,    4,
+-        4,    4,    4,    4,    4,    4,    4,    4,    4,    4,
+-        4,    4,    4,    4,    4,    4,    4,    4,    4,    4,
+-        4,    4,    4,    4,    4,    4,    4,    4,    4,    4,
+-        4,    4,    4,    4,    4,    4,    4,    4,    4,    4,
+-        4,    4,    4,    4,    4,    4,    4,    4,    4,    4,
+-        4,    4,    4,    4,    4,    4,    4,    4,    4,    4,
+-        4,    4,    4,    4,    4,    4,    4,    4,    4,    4,
+-        4,    4,    4,    4,    4,    4,    4,    4,    4,    4,
+-        4,    4,    4,    4,    4,    4,    4,    4,    4,    4,
+-
+-        4,    4,    4,    4,    4,    4,    4,    4,    4,    4,
+-        4,    4,    4,    4,    4,    4,    4,    4,    4,    4,
+-        4,    4,    4,    4,    4,    4,    4,    4,    4,    4,
+-        4,    4,    4,    4,    4,    4,    4,    4,    4,    4,
+-        4,    4,    4,    4,    4,    4,    4,    4,    4,    4,
+-        4,    4,    4,    4,    4,    4,    4,    4,    4,    4,
+-        4,    4,    4,    4,    4,    4,    4,    4,    4,    4,
+-        4,    4,    4,    4,    4,    4,    4,    4,    4,    4,
+-        4,    4,    4,    4,    4,    4,    4,    4,    4,    4,
+-        4,    4,    4,    4,    4,    4,    4,    4,    4,    4,
+-
+-        4,    4,    4,    4,    4,    4,    4,    4,    4,    4,
+-        4,    4,    4,    4,    4,    4,    4,    4,    4,    4,
+-        4,    4,    4,    4,    4,    4,    4,    4,    4,    4,
+-        4,    4,    4,    4,    4,    4,    4,    4,    4,    4,
+-        4,    4,    4,    4,    4,    4,    4,    4,    4,    4,
+-        4,    4,    4,    4,    4,    4,    4,    4,    4,    4,
+-        8,    8,    9,    3,    9,    9,    9,    9,    9,    9,
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9
+-    } ;
+-
+-static const flex_int16_t yy_chk[621] =
+-    {   0,
+-        0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
+-        0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
+-        0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
+-        0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
+-        0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
+-        0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
+-        0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
+-        0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
+-        0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
+-        0,    0,    0,    0,    0,    0,    1,    1,    2,    2,
+-
+-        5,    5,    7,    7,   10,   10,   10,   10,   10,   10,
+-       10,   10,   10,   10,   10,   10,   10,   10,   10,   10,
+-       10,   10,   10,   10,   10,   10,   10,   10,   10,   10,
+-       10,   10,   10,   10,   10,   10,   10,   10,   10,   10,
+-       10,   10,   10,   10,   10,   10,   10,   10,   10,   10,
+-       10,   10,   10,   10,   10,   10,   10,   10,   10,   10,
+-       10,   10,   10,   10,   10,   10,   10,   10,   10,   10,
+-       10,   10,   10,   10,   10,   10,   10,   10,   10,   10,
+-       10,   10,   10,   10,   10,   10,   10,   10,   10,   10,
+-       10,   10,   10,   10,   10,   10,   10,   10,   10,   10,
+-
+-       10,   10,   10,   10,   10,   10,   10,   10,   10,   10,
+-       10,   10,   10,   10,   10,   10,   10,   10,   10,   10,
+-       10,   10,   10,   10,   10,   10,   10,   10,   10,   10,
+-       10,   10,   10,   10,   10,   10,   10,   10,   10,   10,
+-       10,   10,   10,   10,   10,   10,   10,   10,   10,   10,
+-       10,   10,   10,   10,   10,   10,   10,   10,   10,   10,
+-       10,   10,   10,   10,   10,   10,   10,   10,   10,   10,
+-       10,   10,   10,   10,   10,   10,   10,   10,   10,   10,
+-       10,   10,   10,   10,   10,   10,   10,   10,   10,   10,
+-       10,   10,   10,   10,   10,   10,   10,   10,   10,   10,
+-
+-       10,   10,   10,   10,   10,   10,   10,   10,   10,   10,
+-       10,   10,   10,   10,   10,   10,   10,   10,   10,   10,
+-       10,   10,   10,   10,   10,   10,   10,   10,   10,   10,
+-       10,   10,   10,   10,   10,   10,   10,   10,   10,   10,
+-       10,   10,   10,   10,   10,   10,   10,   10,   10,   10,
+-       10,   10,   10,   10,   10,   10,   10,   10,   10,   10,
+-        8,    6,    3,    9,    9,    9,    9,    9,    9,    9,
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9,
+-        9,    9,    9,    9,    9,    9,    9,    9,    9,    9
+-    } ;
++
++/* GENERATE TABLES */
+ extern int yy_flex_debug;
+ int yy_flex_debug = 0;
+@@ -553,8 +379,8 @@
+ #define YY_MORE_ADJ (yy_more_len)
+ #define YY_RESTORE_YY_MORE_OFFSET
+ char *yytext;
+-#line 1 "skel.l"
+-#line 557 "lex.yy.c"
++
++/* GENERATE SECTION1 */
+ #define INITIAL 0
+@@ -777,9 +603,7 @@
+               }
+       {
+-#line 2 "skel.l"
+-
+-#line 782 "lex.yy.c"
++/* GENERATE SECTION2INITIAL */
+       while ( /*CONSTCOND*/1 )                /* loops until end-of-file is reached */
+               {
+@@ -816,7 +640,7 @@
+                       *(yy_state_ptr)++ = yy_current_state;
+                       ++yy_cp;
+                       }
+-              while ( yy_base[yy_current_state] != 364 );
++              while ( yy_base[yy_current_state] != 0 );
+ yy_find_action:
+               yy_current_state = *--(yy_state_ptr);
+@@ -866,19 +690,7 @@
+               switch ( yy_act )
+       { /* beginning of action switch */
+-case 1:
+-YY_RULE_SETUP
+-#line 3 "skel.l"
+-
+-      YY_BREAK
+-case 2:
+-YY_RULE_SETUP
+-#line 4 "skel.l"
+-ECHO;
+-      YY_BREAK
+-#line 879 "lex.yy.c"
+-                      case YY_STATE_EOF(INITIAL):
+-                              yyterminate();
++/* GENERATE SECTION2 */
+       case YY_END_OF_BUFFER:
+               {
+@@ -1850,4 +1662,4 @@
+ #define YYTABLES_NAME "yytables"
+-#line 4 "skel.l"
++/* GENERATE SECTION3 */
diff --git a/skel/skel.l b/skel/skel.l
new file mode 100644 (file)
index 0000000..8f6b20b
--- /dev/null
@@ -0,0 +1,3 @@
+%option noecs nometa-ecs reject yymore
+%%
+a*/b*
diff --git a/tests/Makefile b/tests/Makefile
new file mode 100644 (file)
index 0000000..86f4d56
--- /dev/null
@@ -0,0 +1,3 @@
+cal.l.xml: cal.l
+       ../../bootstrap_flex.git/src/flex $< 2>$@
+       rm -f lex.yy.c
diff --git a/tests/cal.l b/tests/cal.l
new file mode 100644 (file)
index 0000000..86e30e3
--- /dev/null
@@ -0,0 +1,19 @@
+%{
+/* this is section 1 */
+%}
+
+DIGIT [0-9]+\.?|[0-9]*\.[0-9]+
+
+%option noecs nometa-ecs noyywrap
+
+%%
+
+       /* this is section 2 initial */
+
+[ ]
+{DIGIT}        { yylval = atof(yytext); return NUM; }
+\n|.   { return yytext[0]; }
+
+%%
+
+/* this is section 3 */
diff --git a/tests/cal.l.xml b/tests/cal.l.xml
new file mode 100644 (file)
index 0000000..2440a6b
--- /dev/null
@@ -0,0 +1,20 @@
+<RefList><PLexSpecification ref="0"><Section1>%{
+<CodeBlock>/* this is section 1 */
+</CodeBlock>%}
+
+DIGIT [0-9]+\.?|[0-9]*\.[0-9]+
+
+<Options>%option <Options_ECS>noecs</Options_ECS> <Options_MetaECS>nometa-ecs</Options_MetaECS> <Options_YYWrap>noyywrap</Options_YYWrap></Options>
+
+</Section1>%%<Section2>
+
+       <CodeBlock>/* this is section 2 initial */
+</CodeBlock>
+<Rule><StartCondNone />[<RegexCharacterOr><RegexCharacter char_set="" /><RegexCharacter char_set="32 33"> </RegexCharacter></RegexCharacterOr>]<RegexEmpty /><Action>
+</Action></Rule>{DIGIT}<Rule><StartCondNone />(<RegexOr><RegexSequence><RegexRepeat count0="1">[<RegexCharacterOr><RegexCharacter char_set="" /><RegexCharacter char_set="48 58">0-9</RegexCharacter></RegexCharacterOr>]+</RegexRepeat><RegexRepeat count0="0" count1="1"><RegexCharacter char_set="46 47">\.</RegexCharacter>?</RegexRepeat></RegexSequence>|<RegexSequence><RegexSequence><RegexRepeat count0="0">[<RegexCharacterOr><RegexCharacter char_set="" /><RegexCharacter char_set="48 58">0-9</RegexCharacter></RegexCharacterOr>]*</RegexRepeat><RegexCharacter char_set="46 47">\.</RegexCharacter></RegexSequence><RegexRepeat count0="1">[<RegexCharacterOr><RegexCharacter char_set="" /><RegexCharacter char_set="48 58">0-9</RegexCharacter></RegexCharacterOr>]+</RegexRepeat></RegexSequence></RegexOr>) <RegexEmpty /><Action>{ yylval = atof(yytext); return NUM; }
+</Action></Rule><Rule><StartCondNone /><RegexOr><RegexCharacter char_set="10 11">\n</RegexCharacter>|<RegexCharacter char_set="0 10 11 256">.</RegexCharacter></RegexOr>       <RegexEmpty /><Action>{ return yytext[0]; }
+</Action></Rule>
+</Section2>%%<Section3>
+
+/* this is section 3 */
+</Section3></PLexSpecification></RefList>
\ No newline at end of file
diff --git a/work.py b/work.py
new file mode 100644 (file)
index 0000000..cf832c4
--- /dev/null
+++ b/work.py
@@ -0,0 +1,173 @@
+import sys
+import element
+
+def yychunk_line(root, fin):
+  line = fin.readline()
+  while len(line):
+    element.set_text(root, -1, element.get_text(root, -1) + line)
+    yield
+    line = fin.readline()
+
+def yychunk_block(root, fin, count):
+  block = fin.read(count)
+  while len(block):
+    element.set_text(root, -1, element.get_text(root, -1) + line)
+    yield
+    block = fin.read(count)
+
+def replace_with_element(root, pos0, off0, pos1, off1, child):
+  if pos0 < 0:
+    pos0, off0 = element.to_start_relative(root, pos0, off0)
+  if pos1 < 0:
+    pos1, off1 = element.to_start_relative(root, pos1, off1)
+  count = pos1 - pos0
+  assert count >= 0
+
+  temp = element.get_text(root, pos1)
+  root[pos0:pos1] = [child]
+  element.set_text(root, pos0 + 1, temp[off1:])
+  if count != 0:
+    temp = element.get_text(root, pos0)
+  element.set_text(root, pos0, temp[:off0])
+
+def replace_with_text(root, mark, i, j, text):
+  (pos0, off0) = mark[i]
+  (pos1, off1) = mark[j]
+  count = pos1 - pos0
+  assert count >= 0
+
+  temp = element.get_tail(root, pos1)
+  del root[pos0 + 1:pos1 + 1]
+  element.set_tail(
+    root,
+    pos0,
+    element.get_tail(root, pos0)[:off0] + text + temp[off1:]
+  )
+
+  if j != i + 1:
+    mark[i + 1:j + 1] = [mark[j]]
+  k = i + 1
+  delta_pos = -count
+  delta_off = off0 + len(text) - off1
+  while k < len(mark):
+    (pos2, off2) = mark[k]
+    if pos2 > pos1:
+      break
+    mark[k] = (pos2 + delta_pos, off2 + delta_off)
+    k += 1
+  if delta_pos != 0:
+    while k < len(mark):
+      (pos2, off2) = mark[k]
+      mark[k] = (pos2 + delta_pos, off2)
+      k += 1
+
+def replace_with_content(root, mark, i, j, child):
+  text = element.get_tail(child, -1)
+  if len(child) == 0:
+    replace_with_text(root, mark, i, j, text)
+  else:
+    (pos0, off0) = mark[i]
+    (pos1, off1) = mark[j]
+    count = pos1 - pos0
+    assert count >= 0
+  
+    temp = element.get_tail(root, pos1)
+    root[pos0 + 1:pos1 + 1] = child[:]
+    tail = element.get_tail(root, pos0 + len(child))
+    element.set_tail(root, pos0 + len(child), tail + temp[off1:])
+    if count != 0:
+      temp = element.get_tail(root, pos0)
+    element.set_tail(root, pos0, temp[:off0] + text)
+  
+    if j != i + 1:
+      mark[i + 1:j + 1] = [mark[j]]
+    k = i + 1
+    delta_pos = len(child) - count
+    delta_off = len(tail) - off1
+    while k < len(mark):
+      (pos2, off2) = mark[k]
+      if pos2 > pos1:
+        break
+      mark[k] = (pos2 + delta_pos, off2 + delta_off)
+      k += 1
+    if delta_pos != 0:
+      while k < len(mark):
+        (pos2, off2) = mark[k]
+        mark[k] = (pos2 + delta_pos, off2)
+        k += 1
+
+def extract_element(root, mark, i, j, factory, *args, **kwargs):
+  (pos0, off0) = mark[i]
+  (pos1, off1) = mark[j]
+  count = pos1 - pos0
+  assert count >= 0
+
+  result = factory(*args, **kwargs)
+  result[:] = [i.copy() for i in root[pos0 + 1:pos1 + 1]]
+  tail = element.get_tail(root, pos1)
+  if count == 0:
+    element.set_tail(result, -1, tail[off0:off1])
+  else:
+    element.set_tail(result, count - 1, tail[:off1])
+    tail = element.get_tail(root, pos0)
+    element.set_tail(result, -1, tail[off0:])
+  return result
+
+def extract_text(root, mark, i, j):
+  (pos0, off0) = mark[i]
+  (pos1, off1) = mark[j]
+  #result = [element.get_tail(root, i) for i in range(pos0, pos1 + 1)]
+  #result[-1] = result[-1][:off1]
+  #result[0] = result[0][off0:]
+  #return ''.join(result)
+  assert pos1 == pos0
+  return element.get_tail(root, pos0)[off0:off1]
+
+def apply_markup(root, pos0, off0, pos1, off1, factory, *args, **kwargs):
+  if pos0 < 0:
+    pos0, off0 = element.to_start_relative(root, pos0, off0)
+  if pos1 < 0:
+    pos1, off1 = element.to_start_relative(root, pos1, off1)
+  count = pos1 - pos0
+  assert count >= 0
+
+  child = factory(*args, **kwargs)
+  child[:] = root[pos0:pos1]
+
+  tail = element.get_text(root, pos1)
+  # at present, if count > 0, child[-1] is shared with root[pos1 - 1],
+  # so we cannot change child[-1].tail until after the replacement
+  if count == 0:
+    replace_with_element(root, pos0, off0, pos1, off1, child)
+    element.set_text(child, 0, tail[off0:off1])
+  else:
+    temp = element.get_text(root, pos0)
+    replace_with_element(root, pos0, off0, pos1, off1, child)
+    element.set_text(child, count, tail[:off1])
+    element.set_text(child, 0, temp[off0:])
+  return child
+
+def dump(root, mark):
+  i = 0
+  pos = -1
+  result = ''
+  while True:
+    assert i >= len(mark) or mark[i][0] >= pos
+    tail = element.get_tail(root, pos) 
+    off = 0
+    while off < len(tail) or (i < len(mark) and mark[i][0] == pos):
+      end = len(tail) if i >= len(mark) or mark[i][0] != pos else mark[i][1]
+      if end > off:
+        result += tail[off:end].replace('\n', '(lf)').replace('\t', '(tab)')
+        off = end
+      if i < len(mark) and mark[i][0] == pos:
+        result += '({0:d})'.format(i)
+        i += 1
+    pos += 1
+    if pos >= len(root):
+      break
+    result += '({0:s}{1:d})'.format(root[pos].tag, pos)
+  assert i >= len(mark)
+  result += '(eof)'
+  sys.stdout.write('{0:s}\n'.format(result[-159:])) #79:]))
+  sys.stdout.flush()