Fix EOF action and other things to get parser working, still lookahead problem
author Nick Downing <downing.nick@gmail.com>
Fri, 6 Jul 2018 06:38:59 +0000 (16:38 +1000)
committer Nick Downing <downing.nick@gmail.com>
Fri, 6 Jul 2018 06:38:59 +0000 (16:38 +1000)
ast.py
bison_lr1dfa.py
regex.py

diff --git a/ast.py b/ast.py
index 1b9a054..d8931bc 100644 (file)
--- a/ast.py
+++ b/ast.py
@@ -130,6 +130,21 @@ class PYACC(element.Element):
       self.repr_serialize(params)
       return 'ast.PYACC.BracedCode({0:s})'.format(', '.join(params))
     # GENERATE END
+    def get_text(self, len_production = 0):
+      return ''.join(
+        [
+          (
+            '(yyvsp[{0:d}])'.format(i.index - len_production)
+          if isinstance(i, PYACC.StackReference) else
+            '(yyval)'
+          if isinstance(i, PYACC.ValueReference) else
+            chr(i.character)
+          if isinstance(i, PYACC.Escape) else
+            element.get_text(i, 0)
+          )
+          for i in self
+        ]
+      )
 
   class BracedPredicate(element.Element):
     # GENERATE ELEMENT() BEGIN
@@ -391,6 +406,100 @@ class PYACC(element.Element):
     ):
       raise NotImplementedException
 
+  class StackLocation(element.Element):
+    # GENERATE ELEMENT(int index) BEGIN
+    def __init__(
+      self,
+      tag = 'PYACC_StackLocation',
+      attrib = {},
+      text = '',
+      children = [],
+      index = -1
+    ):
+      element.Element.__init__(
+        self,
+        tag,
+        attrib,
+        text,
+        children
+      )
+      self.index = (
+        element.deserialize_int(index)
+      if isinstance(index, str) else
+        index
+      )
+    def serialize(self, ref_list, indent = 0):
+      element.Element.serialize(self, ref_list, indent)
+      self.set('index', element.serialize_int(self.index))
+    def deserialize(self, ref_list):
+      element.Element.deserialize(self, ref_list)
+      self.index = element.deserialize_int(self.get('index', '-1'))
+    def copy(self, factory = None):
+      result = element.Element.copy(
+        self,
+        StackLocation if factory is None else factory
+      )
+      result.index = self.index
+      return result
+    def repr_serialize(self, params):
+      element.Element.repr_serialize(self, params)
+      if self.index != -1:
+        params.append(
+          'index = {0:s}'.format(repr(self.index))
+        )
+    def __repr__(self):
+      params = []
+      self.repr_serialize(params)
+      return 'ast.PYACC.StackLocation({0:s})'.format(', '.join(params))
+    # GENERATE END
+
+  class StackReference(element.Element):
+    # GENERATE ELEMENT(int index) BEGIN
+    def __init__(
+      self,
+      tag = 'PYACC_StackReference',
+      attrib = {},
+      text = '',
+      children = [],
+      index = -1
+    ):
+      element.Element.__init__(
+        self,
+        tag,
+        attrib,
+        text,
+        children
+      )
+      self.index = (
+        element.deserialize_int(index)
+      if isinstance(index, str) else
+        index
+      )
+    def serialize(self, ref_list, indent = 0):
+      element.Element.serialize(self, ref_list, indent)
+      self.set('index', element.serialize_int(self.index))
+    def deserialize(self, ref_list):
+      element.Element.deserialize(self, ref_list)
+      self.index = element.deserialize_int(self.get('index', '-1'))
+    def copy(self, factory = None):
+      result = element.Element.copy(
+        self,
+        StackReference if factory is None else factory
+      )
+      result.index = self.index
+      return result
+    def repr_serialize(self, params):
+      element.Element.repr_serialize(self, params)
+      if self.index != -1:
+        params.append(
+          'index = {0:s}'.format(repr(self.index))
+        )
+    def __repr__(self):
+      params = []
+      self.repr_serialize(params)
+      return 'ast.PYACC.StackReference({0:s})'.format(', '.join(params))
+    # GENERATE END
+
   class String(element.Element):
     # GENERATE ELEMENT() BEGIN
     def __init__(
@@ -418,6 +527,17 @@ class PYACC(element.Element):
       self.repr_serialize(params)
       return 'ast.PYACC.String({0:s})'.format(', '.join(params))
     # GENERATE END
+    def get_text(self):
+      return ''.join(
+        [
+          (
+            chr(i.character)
+          if isinstance(i, PYACC.Escape) else
+            element.get_text(i, 0)
+          )
+          for i in self
+        ]
+      )
 
   class Tag(element.Element):
     # GENERATE ELEMENT() BEGIN
@@ -1756,6 +1876,7 @@ class PYACC(element.Element):
             if isinstance(self[i], PYACC.Section2.Rules.RHS.Symbol):
               if isinstance(self[i][0], PYACC.Char):
                 character = ord(self[i][0].get_text())
+                assert character != 0 # would conflict with YYEOF
                 pyacc.characters_used.add(character)
                 expr = regex.RegexSequence(
                   children = [
@@ -1905,6 +2026,63 @@ class PYACC(element.Element):
     ):
       pass
 
+  class ValueLocation(element.Element):
+    # GENERATE ELEMENT() BEGIN
+    def __init__(
+      self,
+      tag = 'PYACC_ValueLocation',
+      attrib = {},
+      text = '',
+      children = []
+    ):
+      element.Element.__init__(
+        self,
+        tag,
+        attrib,
+        text,
+        children
+      )
+    def copy(self, factory = None):
+      result = element.Element.copy(
+        self,
+        ValueLocation if factory is None else factory
+      )
+      return result
+    def __repr__(self):
+      params = []
+      self.repr_serialize(params)
+      return 'ast.PYACC.ValueLocation({0:s})'.format(', '.join(params))
+    # GENERATE END
+
+  class ValueReference(element.Element):
+    # GENERATE ELEMENT() BEGIN
+    def __init__(
+      self,
+      tag = 'PYACC_ValueReference',
+      attrib = {},
+      text = '',
+      children = []
+    ):
+      element.Element.__init__(
+        self,
+        tag,
+        attrib,
+        text,
+        children
+      )
+    def copy(self, factory = None):
+      result = element.Element.copy(
+        self,
+        ValueReference if factory is None else factory
+      )
+      return result
+    def __repr__(self):
+      params = []
+      self.repr_serialize(params)
+      return 'ast.PYACC.ValueReference({0:s})'.format(', '.join(params))
+    # GENERATE END
+
+
   # GENERATE ELEMENT(list(ref) prologue_text, set(int) characters_used, list(ref) terminal_symbols, list(ref) nonterminal_symbols, ref grammar, list(ref) actions_braced_code) BEGIN
   def __init__(
     self,
@@ -1926,11 +2104,7 @@ class PYACC(element.Element):
       text,
       children
     )
-    self.prologue_text = (
-      [element.deserialize_str(i) for i in prologue_text.split()]
-    if isinstance(prologue_text, str) else
-      prologue_text
-    )
+    self.prologue_text = prologue_text
     self.characters_used = (
       set([element.deserialize_int(i) for i in characters_used.split()])
     if isinstance(characters_used, str) else
@@ -1939,16 +2113,12 @@ class PYACC(element.Element):
     self.terminal_symbols = terminal_symbols
     self.nonterminal_symbols = nonterminal_symbols
     self.grammar = grammar
-    self.actions_braced_code = (
-      [element.deserialize_str(i) for i in actions_braced_code.split()]
-    if isinstance(actions_braced_code, str) else
-      actions_braced_code
-    )
+    self.actions_braced_code = actions_braced_code
   def serialize(self, ref_list, indent = 0):
     element.Element.serialize(self, ref_list, indent)
     self.set(
       'prologue_text',
-      ' '.join([element.serialize_str(i) for i in self.prologue_text])
+      ' '.join([element.serialize_ref(i, ref_list) for i in self.prologue_text])
     )
     self.set(
       'characters_used',
@@ -1965,12 +2135,12 @@ class PYACC(element.Element):
     self.set('grammar', element.serialize_ref(self.grammar, ref_list))
     self.set(
       'actions_braced_code',
-      ' '.join([element.serialize_str(i) for i in self.actions_braced_code])
+      ' '.join([element.serialize_ref(i, ref_list) for i in self.actions_braced_code])
     )
   def deserialize(self, ref_list):
     element.Element.deserialize(self, ref_list)
     self.prologue_text = [
-      element.deserialize_str(i)
+      element.deserialize_ref(i, ref_list)
       for i in self.get('prologue_text', '').split()
     ]
     self.characters_used = set(
@@ -1989,7 +2159,7 @@ class PYACC(element.Element):
     ]
     self.grammar = element.deserialize_ref(self.get('grammar', '-1'), ref_list)
     self.actions_braced_code = [
-      element.deserialize_str(i)
+      element.deserialize_ref(i, ref_list)
       for i in self.get('actions_braced_code', '').split()
     ]
   def copy(self, factory = None):
@@ -2050,8 +2220,8 @@ class PYACC(element.Element):
     self.prologue_text = []
     self.characters_used = set()
     self.terminal_symbols = [
-      PYACC.Symbol(name = '$end', character_set = [0x100, 0x101]),
-      PYACC.Symbol(name = 'error', character_set = [0x101, 0x102])
+      PYACC.Symbol(name = 'error', character_set = [0x100, 0x101]),
+      PYACC.Symbol(name = '$undefined', character_set = [0x101, 0x102])
     ]
     self.nonterminal_symbols = []
     self.grammar = regex.Grammar(
@@ -2061,14 +2231,16 @@ class PYACC(element.Element):
             regex.RegexCharacterRule()
           ]
         )
-      ]
+      ],
+      n_terminals = 0,
+      eof_character = 0
     )
     self.actions_braced_code = []
 
     # variables that won't be serialized
     # note: in name_to_symbol, >= 0 is terminal, < 0 is ~nonterminal
-    # (we do not bother storing the '$end' entry, it can't be looked up)
-    name_to_symbol = {'error': 1}
+    # (don't bother storing the '$undefined', it can't be looked up)
+    name_to_symbol = {'error': 0}
 
     # perform the semantic analysis pass
     for i in self:
@@ -2110,6 +2282,8 @@ tag_to_class = {
   'PYACC_Int': PYACC.Int,
   'PYACC_IntNone': PYACC.IntNone,
   'PYACC_Section': PYACC.Section,
+  'PYACC_StackLocation': PYACC.StackLocation,
+  'PYACC_StackReference': PYACC.StackReference,
   'PYACC_String': PYACC.String,
   'PYACC_Tag': PYACC.Tag,
   'PYACC_TagNone': PYACC.TagNone,
@@ -2159,7 +2333,9 @@ tag_to_class = {
   'PYACC_Section2_Rules_RHS_Merge': PYACC.Section2.Rules.RHS.Merge,
   'PYACC_Section2_Rules_RHS_Prec': PYACC.Section2.Rules.RHS.Prec,
   'PYACC_Section2_Rules_RHS_Symbol': PYACC.Section2.Rules.RHS.Symbol,
-  'PYACC_Section3': PYACC.Section3
+  'PYACC_Section3': PYACC.Section3,
+  'PYACC_ValueLocation': PYACC.ValueLocation,
+  'PYACC_ValueReference': PYACC.ValueReference
 }
 def factory(tag, attrib = {}, *args, **kwargs):
   return tag_to_class.get(tag, regex.factory)(tag, attrib, *args, **kwargs)
diff --git a/bison_lr1dfa.py b/bison_lr1dfa.py
index 58b0b78..e1f3fe2 100644 (file)
--- a/bison_lr1dfa.py
+++ b/bison_lr1dfa.py
@@ -80,10 +80,10 @@ class BisonLR1DFA:
     # - change the low-bit indication of shift/reduce to positive/negative
     # we do it here after removing redundant columns, as it's more efficient
     assert numpy.all(action_table != 0)
+    action_table[action_table == 1] = len(lr1dfa.states) << 1
+    action_table[action_table == -1] = 0
     mask = (action_table & 1).astype(numpy.bool)
     action_table >>= 1
-    action_table[action_table == 0] = len(lr1dfa.states)
-    action_table[action_table == -1] = 0
     action_table[mask] = -action_table[mask]
     assert numpy.all(goto_table != 0)
     goto_table[goto_table == -1] = 0
@@ -255,20 +255,18 @@ def generate(pyacc, skel_file, out_file):
   # the nonterminals (for each nonterminal, one character per production)
   lr1dfa = pyacc.grammar.to_lr1().to_lalr1()
 
-  # squash this down to the set of character literals that are referenced,
-  # the set of terminals, then only one character per nonterminal (hence
-  # nonterminals referenced by pyacc.nonterminal_symbols[] index, rather
+  # squash this down to the set of terminals, then the set of character
+  # literals that are referenced, then only one character per nonterminal
+  # (nonterminals referenced by pyacc.nonterminal_symbols[] index, rather
   # than the internal way as only the set of lr1dfa.productions[] indices)
 
   # generate translate table for character literals and terminal symbols
-  n_terminals = 1 # room for "$undefined"
+  n_terminals = 1 # room for '$eof'
   translate_terminals = numpy.zeros(
     (lr1dfa.n_terminals,),
     numpy.int16
   )
-  for i in sorted(pyacc.characters_used):
-    translate_terminals[i] = n_terminals
-    n_terminals += 1
+  translate_terminals[1:0x100] = 2 # '$undefined'
   for i in pyacc.terminal_symbols:
     for j in range(0, len(i.character_set), 2):
       translate_terminals[
@@ -276,6 +274,9 @@ def generate(pyacc, skel_file, out_file):
         i.character_set[j + 1]
       ] = n_terminals
     n_terminals += 1
+  for i in sorted(pyacc.characters_used):
+    translate_terminals[i] = n_terminals
+    n_terminals += 1
 
   # generate translate table for nonterminal symbols
   # this is effectively a map from productions back to nonterminal symbols
@@ -362,23 +363,23 @@ def generate(pyacc, skel_file, out_file):
           )
 
           # yytname (textual terminal/nonterminal name) wraps 70 columns
-          x = 72
+          x = 70
           yytname_lines = []
           for i in (
-            ['"$undefined"'] +
+            ['"$end"'] +
+            ['"{0:s}"'.format(i.name) for i in pyacc.terminal_symbols] +
             [
               '"\'{0:s}\'"'.format(
                 chr(i)
               if i >= 0x20 else
-                '\\x{0:02x}'.format(i)
+                '\\\\x{0:02x}'.format(i)
               )
               for i in sorted(pyacc.characters_used)
             ] +
-            ['"{0:s}"'.format(i.name) for i in pyacc.terminal_symbols] +
             ['"{0:s}"'.format(i.name) for i in pyacc.nonterminal_symbols] +
             ['YY_NULLPTR']
           ):
-            if x >= 72:
+            if x + len(i) >= 70:
               yytname_lines.append([])
               x = 0
             yytname_lines[-1].append(i)
@@ -412,7 +413,7 @@ def generate(pyacc, skel_file, out_file):
 
 /* YYTRANSLATE[YYX] -- Symbol number corresponding to YYX as returned
    by yylex, with out-of-bounds checking.  */
-#define YYUNDEFTOK 0
+#define YYUNDEFTOK 2
 #define YYMAXUTOK {6:d}
 
 #define YYTRANSLATE(YYX)                                                \\
@@ -420,13 +421,13 @@ def generate(pyacc, skel_file, out_file):
 
 /* YYTRANSLATE[TOKEN-NUM] -- Symbol number corresponding to TOKEN-NUM
    as returned by yylex, without out-of-bounds checking.  */
-static const yytype_uint16 yytranslate[] =
+static const yytype_int16 yytranslate[] =
 {{{7:s}
 }};
 
 #if YYDEBUG
   /* YYRLINE[YYN] -- Source line where rule number YYN was defined.  */
-static const yytype_uint16 yyrline[] =
+static const yytype_int16 yyrline[] =
 {{{8:s}
 }};
 #endif
@@ -442,7 +443,7 @@ static const char *const yytname[] =
 # ifdef YYPRINT
 /* YYTOKNUM[NUM] -- (External) token number corresponding to the
    (internal) symbol number NUM (which must be that of a token).  */
-static const yytype_uint16 yytoknum[] =
+static const yytype_int16 yytoknum[] =
 {{{10:s}
 }};
 # endif
@@ -466,7 +467,7 @@ static const yytype_int16 yypact[] =
   /* YYDEFACT[STATE-NUM] -- Default reduction number in state STATE-NUM.
      Performed when YYTABLE does not specify something else to do.  Zero
      means the default is an error.  */
-static const yytype_uint16 yydefact[] =
+static const yytype_int16 yydefact[] =
 {{{14:s}
 }};
 
@@ -483,27 +484,27 @@ static const yytype_int16 yydefgoto[] =
   /* YYTABLE[YYPACT[STATE-NUM]] -- What to do in state STATE-NUM.  If
      positive, shift that token.  If negative, reduce the rule whose
      number is the opposite.  If YYTABLE_NINF, syntax error.  */
-static const yytype_uint16 yytable[] =
+static const yytype_int16 yytable[] =
 {{{17:s}
 }};
 
-static const yytype_uint16 yycheck[] =
+static const yytype_int16 yycheck[] =
 {{{18:s}
 }};
 
   /* YYSTOS[STATE-NUM] -- The (internal number of the) accessing
      symbol of state STATE-NUM.  */
-static const yytype_uint16 yystos[] =
+static const yytype_int16 yystos[] =
 {{{19:s}
 }};
 
   /* YYR1[YYN] -- Symbol number of symbol that rule YYN derives.  */
-static const yytype_uint16 yyr1[] =
+static const yytype_int16 yyr1[] =
 {{{20:s}
 }};
 
   /* YYR2[YYN] -- Number of symbols on the right hand side of rule YYN.  */
-static const yytype_uint16 yyr2[] =
+static const yytype_int16 yyr2[] =
 {{{21:s}
 }};
 /* GENERATE TABLES END */
@@ -719,15 +720,12 @@ static const yytype_uint16 yyr2[] =
     {1:s}
     break;
 '''.format(
-                    i,
-                    ''.join(
-                      [
-                        element.get_text(j, 0)
-                        for j in pyacc.actions_braced_code[i]
-                      ]
+                    i + 1,
+                    pyacc.actions_braced_code[i].get_text(
+                      bison_lr1dfa.rule_data[i + 1, 1] # length of production
                     )
                   )
-                  for i in range(1, len(pyacc.actions_braced_code))
+                  for i in range(len(pyacc.actions_braced_code))
                   if len(pyacc.actions_braced_code[i])
                 ]
               )
diff --git a/regex.py b/regex.py
index ef89305..7e970fe 100644 (file)
--- a/regex.py
+++ b/regex.py
@@ -941,14 +941,15 @@ class Grammar(element.Element):
         )
       )
 
-  # GENERATE ELEMENT(int n_terminals) BEGIN
+  # GENERATE ELEMENT(int n_terminals, int eof_character) BEGIN
   def __init__(
     self,
     tag = 'Grammar',
     attrib = {},
     text = '',
     children = [],
-    n_terminals = -1
+    n_terminals = -1,
+    eof_character = -1
   ):
     element.Element.__init__(
       self,
@@ -962,18 +963,26 @@ class Grammar(element.Element):
     if isinstance(n_terminals, str) else
       n_terminals
     )
+    self.eof_character = (
+      element.deserialize_int(eof_character)
+    if isinstance(eof_character, str) else
+      eof_character
+    )
   def serialize(self, ref_list, indent = 0):
     element.Element.serialize(self, ref_list, indent)
     self.set('n_terminals', element.serialize_int(self.n_terminals))
+    self.set('eof_character', element.serialize_int(self.eof_character))
   def deserialize(self, ref_list):
     element.Element.deserialize(self, ref_list)
     self.n_terminals = element.deserialize_int(self.get('n_terminals', '-1'))
+    self.eof_character = element.deserialize_int(self.get('eof_character', '-1'))
   def copy(self, factory = None):
     result = element.Element.copy(
       self,
       Grammar if factory is None else factory
     )
     result.n_terminals = self.n_terminals
+    result.eof_character = self.eof_character
     return result
   def repr_serialize(self, params):
     element.Element.repr_serialize(self, params)
@@ -981,6 +990,10 @@ class Grammar(element.Element):
       params.append(
         'n_terminals = {0:s}'.format(repr(self.n_terminals))
       )
+    if self.eof_character != -1:
+      params.append(
+        'eof_character = {0:s}'.format(repr(self.eof_character))
+      )
   def __repr__(self):
     params = []
     self.repr_serialize(params)
@@ -990,7 +1003,7 @@ class Grammar(element.Element):
     for i in range(len(self)):
       self[i].post_process(i, rule_name_to_character_set)
   def to_lr1(self):
-    lr1 = LR1([], self.n_terminals)
+    lr1 = LR1([], self.n_terminals, self.eof_character)
     for i in self:
       i.add_to_lr1(lr1)
     # propagate lookaheads
@@ -1695,7 +1708,12 @@ class DFA:
     )
 
 class LR1:
-  def __init__(self, productions = [], n_terminals = n_characters):
+  def __init__(
+    self,
+    productions = [],
+    n_terminals = n_characters,
+    eof_character = n_characters
+  ):
     # productions: list of production
     # production: (
     #   priority,
@@ -1725,8 +1743,10 @@ class LR1:
     #   noting that markup has to be applied in reverse order of the list
     # n_terminals: offset to apply to productions[] index to get symbol
     #   (character set code), also symbol for productions[0] = start production
+    # eof_character: usually == n_terminals (need not be valid terminal value)
     self.productions = productions
     self.n_terminals = n_terminals
+    self.eof_character = eof_character
 
   def lookahead_item_set_closure(self, items, item_to_index):
     in_queue = [True for i in range(len(items))]
@@ -1822,11 +1842,11 @@ class LR1:
     return next_items, next_item_to_index, nonterminal0, nonterminal1
 
   def parse_text(self, text, i):
-    items = [(0, 0, [n_characters, n_characters + 1])] # EOF
+    items = [(0, 0, [self.eof_character, self.eof_character + 1])]
     item_to_index = {(0, 0): 0}
     value_stack = []
     state_stack = []
-    lookahead_character = ord(text[i]) if i < len(text) else n_characters # EOF
+    lookahead_character = ord(text[i]) if i < len(text) else self.eof_character
     while True:
       self.lookahead_item_set_closure(items, item_to_index)
       value_stack.append(i)
@@ -1843,7 +1863,7 @@ class LR1:
             )
           )
         i += 1
-        lookahead_character = ord(text[i]) if i < len(text) else n_characters # EOF
+        lookahead_character = ord(text[i]) if i < len(text) else self.eof_character
       elif len(reductions) != 0:
         if len(reductions) != 1:
           sys.stderr.write(
@@ -1883,7 +1903,7 @@ class LR1:
     if pos < 0:
       pos, off = element.to_start_relative(root, pos, off)
 
-    items = [(0, 0, [n_characters, n_characters + 1])] # EOF
+    items = [(0, 0, [self.eof_character, self.eof_character + 1])]
     item_to_index = {(0, 0): 0}
     value_stack = []
     state_stack = []
@@ -1896,7 +1916,7 @@ class LR1:
         try:
           next(yychunk_iter)
         except StopIteration:
-          lookahead_character = n_characters # EOF
+          lookahead_character = self.eof_character
           break
         text = element.get_text(root, pos)
     else: 
@@ -1925,7 +1945,7 @@ class LR1:
             try:
               next(yychunk_iter)
             except StopIteration:
-              lookahead_character = n_characters # EOF
+              lookahead_character = self.eof_character
               break
             text = element.get_text(root, pos)
         else: 
@@ -1991,10 +2011,11 @@ class LR1:
         (len(symbols), group_bounds)
         for _, symbols, _, group_bounds in self.productions
       ],
-      self.n_terminals
+      self.n_terminals,
+      self.eof_character
     )
 
-    items = [(0, 0, [n_characters, n_characters + 1])] # EOF
+    items = [(0, 0, [self.eof_character, self.eof_character + 1])]
     item_to_index = {(0, 0): 0}
     self.lookahead_item_set_closure(items, item_to_index)
 
@@ -2074,10 +2095,11 @@ class LR1:
         (len(symbols), group_bounds)
         for _, symbols, _, group_bounds in self.productions
       ],
-      self.n_terminals
+      self.n_terminals,
+      self.eof_character
     )
 
-    items = [(0, 0, [n_characters, n_characters + 1])] # EOF
+    items = [(0, 0, [self.eof_character, self.eof_character + 1])]
     item_to_index = {(0, 0): 0}
     self.lookahead_item_set_closure(items, item_to_index)
 
@@ -2172,13 +2194,20 @@ class LR1:
     return lr1dfa
 
   def __repr__(self):
-    return 'regex.LR1({0:s}, {1:d})'.format(
+    return 'regex.LR1({0:s}, {1:d}, {2:d})'.format(
       repr(self.productions),
-      self.n_terminals
+      self.n_terminals,
+      self.eof_character
     )
 
 class LR1DFA:
-  def __init__(self, states = [], productions = [], n_terminals = n_characters):
+  def __init__(
+    self,
+    states = [],
+    productions = [],
+    n_terminals = n_characters,
+    eof_character = n_characters
+  ):
     # states: list of state_desc
     # state_desc: (terminal breaks, actions, nonterminal breaks, gotos)
     # action: shift = new state * 2, reduce = production * 2 + 1, error = -1
@@ -2193,15 +2222,17 @@ class LR1DFA:
     #   noting that markup has to be applied in reverse order of the list
     # n_terminals: offset to apply to productions[] index to get symbol
     #   (character set code), also symbol for productions[0] = start production
+    # eof_character: usually == n_terminals (need not be valid terminal value)
     self.states = states
     self.productions = productions
     self.n_terminals = n_terminals
+    self.eof_character = eof_character
 
   def parse_text(self, text, i):
     state = 0
     value_stack = []
     state_stack = []
-    lookahead_character = ord(text[i]) if i < len(text) else n_characters # EOF
+    lookahead_character = ord(text[i]) if i < len(text) else self.eof_character
     while True:
       value_stack.append(i)
       state_stack.append(state)
@@ -2215,7 +2246,7 @@ class LR1DFA:
       if (action & 1) == 0:
         state = action >> 1
         i += 1
-        lookahead_character = ord(text[i]) if i < len(text) else n_characters # EOF
+        lookahead_character = ord(text[i]) if i < len(text) else self.eof_character
       else:
         reduce = action >> 1
         len_symbols, group_bounds = self.productions[reduce]
@@ -2257,7 +2288,7 @@ class LR1DFA:
         try:
           next(yychunk_iter)
         except StopIteration:
-          lookahead_character = n_characters # EOF
+          lookahead_character = self.eof_character
           break
         text = element.get_text(root, pos)
     else: 
@@ -2284,7 +2315,7 @@ class LR1DFA:
             try:
               next(yychunk_iter)
             except StopIteration:
-              lookahead_character = n_characters # EOF
+              lookahead_character = self.eof_character
               break
             text = element.get_text(root, pos)
         else: 
@@ -2343,7 +2374,7 @@ class LR1DFA:
     try:
       end_pos, end_off, lookahead_character = next(yylex_iter)
     except StopIteration:
-      lookahead_character = n_characters # EOF
+      lookahead_character = self.eof_character
       end_pos, end_off = element.to_end_relative(root, pos, off)
     while True:
       value_stack.append((pos, off))
@@ -2362,7 +2393,7 @@ class LR1DFA:
         try:
           end_pos, end_off, lookahead_character = next(yylex_iter)
         except StopIteration:
-          lookahead_character = n_characters # EOF
+          lookahead_character = self.eof_character
           #end_pos, end_off = element.to_end_relative(root, pos, off)
       else:
         reduce = action >> 1
@@ -2408,10 +2439,11 @@ class LR1DFA:
         assert state != -1
 
   def __repr__(self):
-    return 'regex.LR1DFA({0:s}, {1:s}, {2:d})'.format(
+    return 'regex.LR1DFA({0:s}, {1:s}, {2:d}, {3:d})'.format(
       repr(self.states),
       repr(self.productions),
-      self.n_terminals
+      self.n_terminals,
+      self.eof_character
     )
 
 def wrap_repr(text, width):