From: Nick Downing Date: Sun, 8 Jul 2018 05:19:46 +0000 (+1000) Subject: Decouple Regex and LR1 by defining a new Grammar.Production.Symbol class, simplifies... X-Git-Url: https://git.ndcode.org/public/gitweb.cgi?p=piyacc.git;a=commitdiff_plain;h=e5cc042e7867df43c51d5b5bfcab88a311e1dddd Decouple Regex and LR1 by defining a new Grammar.Production.Symbol class, simplifies the handling of separate terminal and nonterminals sets, but means that symbols in a production can no longer be grouped (Nick's advanced feature) for now --- diff --git a/ast.py b/ast.py index d8931bc..bed3dd1 100644 --- a/ast.py +++ b/ast.py @@ -1871,30 +1871,26 @@ class PYACC(element.Element): lhs_symbol, name_to_symbol ): - expr = regex.RegexEmpty() + production = regex.Grammar.Production( + nonterminal = len(pyacc.grammar) + ) for i in range(len(self)): if isinstance(self[i], PYACC.Section2.Rules.RHS.Symbol): if isinstance(self[i][0], PYACC.Char): character = ord(self[i][0].get_text()) assert character != 0 # would conflict with YYEOF pyacc.characters_used.add(character) - expr = regex.RegexSequence( - children = [ - expr, - regex.RegexCharacter( - character_set = [character, character + 1] - ) - ] + production.append( + regex.Grammar.Production.Symbol( + terminal_set = [character, character + 1] + ) ) elif isinstance(self[i][0], PYACC.ID): - expr = regex.RegexSequence( - children = [ - expr, - regex.RegexCharacterRule( - rule_name = element.get_text(self[i][0], 0) - # character_set will be filled in later once assigned - ) - ] + production.append( + regex.Grammar.Production.NamedSymbol( + # (non)terminal_set will be filled in later once assigned + name = element.get_text(self[i][0], 0) + ) ) else: assert False @@ -1912,7 +1908,7 @@ class PYACC(element.Element): character_set[-1] = character + 1 else: character_set.extend([character, character + 1]) - pyacc.grammar.append(regex.Grammar.Production(children = [expr])) + pyacc.grammar.append(production) # GENERATE ELEMENT() BEGIN def __init__( @@ -2228,12 +2224,12 @@ class PYACC(element.Element): children = [ regex.Grammar.Production( children = [ - regex.RegexCharacterRule() - ] + regex.Grammar.Production.NamedSymbol() + ], + nonterminal = 0 ) ], - n_terminals = 0, - eof_character = 0 + eof_terminal = 0 ) self.actions_braced_code = [] @@ -2250,19 +2246,19 @@ class PYACC(element.Element): ) # if start symbol not specified, use first nonterminal defined in file - if len(self.grammar[0][0].rule_name) == 0: - self.grammar[0][0].rule_name = self.nonterminal_symbols[0].name + if len(self.grammar[0][0].name) == 0: + self.grammar[0][0].name = self.nonterminal_symbols[0].name # look up rule names and substitute appropriate character_set for each self.grammar.n_terminals = 0x100 + len(self.terminal_symbols) self.grammar.post_process( dict( [ - (i.name, i.character_set) + (i.name, (i.character_set, [])) for i in self.terminal_symbols ] + [ - (i.name, [self.grammar.n_terminals + j for j in i.character_set]) + (i.name, ([], i.character_set)) for i in self.nonterminal_symbols ] ) diff --git a/regex.py b/regex.py index 7e970fe..96c65d5 100644 --- a/regex.py +++ b/regex.py @@ -102,9 +102,9 @@ class Regex(element.Element): self.repr_serialize(params) return 'regex.Regex({0:s})'.format(', '.join(params)) # GENERATE END - def post_process(self, group_index = 0, rule_name_to_character_set = None): + def post_process(self, group_index = 0): #, rule_name_to_character_set = None): for i in self: - group_index = i.post_process(group_index, rule_name_to_character_set) + group_index = i.post_process(group_index) #, rule_name_to_character_set) return group_index def to_groups(self, groups): for i in self: @@ -113,13 +113,13 @@ class Regex(element.Element): raise NotImplementedException def add_to_nfa(self, nfa): nfa.start_state.append(self.to_nfa_state(nfa, 0)) - def to_lr1_symbols(self, n_terminals, symbols, lookaheads, group_bounds): - group_count = 0 - for i in self: - group_count += ( - i.to_lr1_symbols(n_terminals, symbols, lookaheads, group_bounds) - ) - return group_count # count of groups or ungrouped characters + #def to_lr1_symbols(self, n_terminals, symbols, lookaheads, group_bounds): + # group_count = 0 + # for i in self: + # group_count += ( + # i.to_lr1_symbols(n_terminals, symbols, lookaheads, group_bounds) + # ) + # return group_count # count of groups or ungrouped characters class RegexNone(Regex): # GENERATE ELEMENT() BEGIN @@ -239,27 +239,27 @@ class RegexCharacter(Regex): new_state = len(nfa.states) nfa.states.append((NFA.STATE_CHARACTER, self.character_set, next_state)) return new_state - def to_lr1_symbols(self, n_terminals, symbols, lookaheads, group_bounds): - terminal_set = [] - nonterminal_set = [] - i = 0 - while i < len(self.character_set): - [j, k] = self.character_set[i:i + 2] - if k > n_terminals: - if j < n_terminals: - terminal_set.extend([j, n_terminals]) - nonterminal_set.extend([0, k - n_terminals]) - i += 2 - while i < len(self.character_set): - [j, k] = self.character_set[i:i + 2] - nonterminal_set.extend([j - n_terminals, k - n_terminals]) - i += 2 - break - terminal_set.extend([j, k]) - i += 2 - symbols.append((terminal_set, nonterminal_set)) - lookaheads.append(([], False)) # initial_set, can_be_empty - return 1 # count of groups or ungrouped characters + #def to_lr1_symbols(self, n_terminals, symbols, lookaheads, group_bounds): + # terminal_set = [] + # nonterminal_set = [] + # i = 0 + # while i < len(self.character_set): + # [j, k] = self.character_set[i:i + 2] + # if k > n_terminals: + # if j < n_terminals: + # terminal_set.extend([j, n_terminals]) + # nonterminal_set.extend([0, k - n_terminals]) + # i += 2 + # while i < len(self.character_set): + # [j, k] = self.character_set[i:i + 2] + # nonterminal_set.extend([j - n_terminals, k - n_terminals]) + # i += 2 + # break + # terminal_set.extend([j, k]) + # i += 2 + # symbols.append((terminal_set, nonterminal_set)) + # lookaheads.append(([], False)) # initial_set, can_be_empty + # return 1 # count of groups or ungrouped characters class RegexCharacterRange(RegexCharacter): # GENERATE ELEMENT() BEGIN @@ -290,8 +290,8 @@ class RegexCharacterRange(RegexCharacter): self.repr_serialize(params) return 'regex.RegexCharacterRange({0:s})'.format(', '.join(params)) # GENERATE END - def post_process(self, group_index = 0, rule_name_to_character_set = None): - group_index = RegexCharacter.post_process(self, group_index, rule_name_to_character_set) + def post_process(self, group_index = 0): #, rule_name_to_character_set = None): + group_index = RegexCharacter.post_process(self, group_index) #, rule_name_to_character_set) self.character_set = [self[0].character_set[0], self[1].character_set[-1]] return group_index @@ -324,8 +324,8 @@ class RegexCharacterOr(RegexCharacter): self.repr_serialize(params) return 'regex.RegexCharacterOr({0:s})'.format(', '.join(params)) # GENERATE END - def post_process(self, group_index = 0, rule_name_to_character_set = None): - group_index = RegexCharacter.post_process(self, group_index, rule_name_to_character_set) + def post_process(self, group_index = 0): #, rule_name_to_character_set = None): + group_index = RegexCharacter.post_process(self, group_index) #, rule_name_to_character_set) self.character_set = character_set_or(self[0].character_set, self[1].character_set) return group_index @@ -358,8 +358,8 @@ class RegexCharacterAnd(RegexCharacter): self.repr_serialize(params) return 'regex.RegexCharacterAnd({0:s})'.format(', '.join(params)) # GENERATE END - def post_process(self, group_index = 0, rule_name_to_character_set = None): - group_index = RegexCharacter.post_process(self, group_index, rule_name_to_character_set) + def post_process(self, group_index = 0): #, rule_name_to_character_set = None): + group_index = RegexCharacter.post_process(self, group_index) #, rule_name_to_character_set) self.character_set = character_set_and(self[0].character_set, self[1].character_set) return group_index @@ -392,59 +392,59 @@ class RegexCharacterNot(RegexCharacter): self.repr_serialize(params) return 'regex.RegexCharacterNot({0:s})'.format(', '.join(params)) # GENERATE END - def post_process(self, group_index = 0, rule_name_to_character_set = None): - group_index = RegexCharacter.post_process(self, group_index, rule_name_to_character_set) + def post_process(self, group_index = 0): #, rule_name_to_character_set = None): + group_index = RegexCharacter.post_process(self, group_index) #, rule_name_to_character_set) self.character_set = character_set_not(self[0].character_set) return group_index -class RegexCharacterRule(RegexCharacter): - # GENERATE ELEMENT(str rule_name) BEGIN - def __init__( - self, - tag = 'RegexCharacterRule', - attrib = {}, - text = '', - children = [], - character_set = [], - rule_name = '' - ): - RegexCharacter.__init__( - self, - tag, - attrib, - text, - children, - character_set - ) - self.rule_name = rule_name - def serialize(self, ref_list, indent = 0): - RegexCharacter.serialize(self, ref_list, indent) - self.set('rule_name', element.serialize_str(self.rule_name)) - def deserialize(self, ref_list): - RegexCharacter.deserialize(self, ref_list) - self.rule_name = element.deserialize_str(self.get('rule_name', '')) - def copy(self, factory = None): - result = RegexCharacter.copy( - self, - RegexCharacterRule if factory is None else factory - ) - result.rule_name = self.rule_name - return result - def repr_serialize(self, params): - RegexCharacter.repr_serialize(self, params) - if self.rule_name != '': - params.append( - 'rule_name = {0:s}'.format(repr(self.rule_name)) - ) - def __repr__(self): - params = [] - self.repr_serialize(params) - return 'regex.RegexCharacterRule({0:s})'.format(', '.join(params)) - # GENERATE END - def post_process(self, group_index = 0, rule_name_to_character_set = None): - if rule_name_to_character_set is not None: - self.character_set = rule_name_to_character_set[self.rule_name] - return RegexCharacter.post_process(self, group_index, rule_name_to_character_set) +#class RegexCharacterRule(RegexCharacter): +# # GENERATE ELEMENT(str rule_name) BEGIN +# def __init__( +# self, +# tag = 'RegexCharacterRule', +# attrib = {}, +# text = '', +# children = [], +# character_set = [], +# rule_name = '' +# ): +# RegexCharacter.__init__( +# self, +# tag, +# attrib, +# text, +# children, +# character_set +# ) +# self.rule_name = rule_name +# def serialize(self, ref_list, indent = 0): +# RegexCharacter.serialize(self, ref_list, indent) +# self.set('rule_name', element.serialize_str(self.rule_name)) +# def deserialize(self, ref_list): +# RegexCharacter.deserialize(self, ref_list) +# self.rule_name = element.deserialize_str(self.get('rule_name', '')) +# def copy(self, factory = None): +# result = RegexCharacter.copy( +# self, +# RegexCharacterRule if factory is None else factory +# ) +# result.rule_name = self.rule_name +# return result +# def repr_serialize(self, params): +# RegexCharacter.repr_serialize(self, params) +# if self.rule_name != '': +# params.append( +# 'rule_name = {0:s}'.format(repr(self.rule_name)) +# ) +# def __repr__(self): +# params = [] +# self.repr_serialize(params) +# return 'regex.RegexCharacterRule({0:s})'.format(', '.join(params)) +# # GENERATE END +# def post_process(self, group_index = 0, rule_name_to_character_set = None): +# if rule_name_to_character_set is not None: +# self.character_set = rule_name_to_character_set[self.rule_name] +# return RegexCharacter.post_process(self, group_index, rule_name_to_character_set) class RegexOr(Regex): # GENERATE ELEMENT() BEGIN @@ -631,7 +631,7 @@ class RegexRepeat(Regex): self.repr_serialize(params) return 'regex.RegexRepeat({0:s})'.format(', '.join(params)) # GENERATE END - def post_process(self, group_index = 0, rule_name_to_character_set = None): + def post_process(self, group_index = 0): #, rule_name_to_character_set = None): # total hack which will be done in a Python action in future if len(self) >= 2: assert self[1].tag == 'Number' @@ -643,7 +643,7 @@ class RegexRepeat(Regex): self.count1 = self.count0 del self[1:] # end total hack - return Regex.post_process(self, group_index, rule_name_to_character_set) + return Regex.post_process(self, group_index) #, rule_name_to_character_set) def to_nfa_state(self, nfa, next_state): count0 = self.count0 count1 = self.count1 @@ -801,7 +801,7 @@ class RegexGroup(Regex): self.repr_serialize(params) return 'regex.RegexGroup({0:s})'.format(', '.join(params)) # GENERATE END - def post_process(self, group_index = 0, rule_name_to_character_set = None): + def post_process(self, group_index = 0): #, rule_name_to_character_set = None): # total hack which will be done in a Python action in future if len(self) >= 2: assert self[0].tag == 'GroupName' @@ -810,7 +810,7 @@ class RegexGroup(Regex): # end total hack self.group_index = group_index group_index += 1 - return Regex.post_process(self, group_index, rule_name_to_character_set) + return Regex.post_process(self, group_index) #, rule_name_to_character_set) def to_groups(self, groups): assert len(groups) == self.group_index groups.append( @@ -826,28 +826,157 @@ class RegexGroup(Regex): new_state = len(nfa.states) nfa.states.append((NFA.STATE_MARK, self.group_index * 2, child_state)) return new_state - def to_lr1_symbols(self, n_terminals, symbols, lookaheads, group_bounds): - group_start = len(symbols) - assert self.group_index == len(group_bounds) - group_bounds.append(None) - group_count = Regex.to_lr1_symbols( - self, - n_terminals, - symbols, - lookaheads, - group_bounds - ) - group_bounds[self.group_index] = ( - group_start, - group_count, - self.group_name, - {i.name: i.value for i in self.group_attributes} - ) - return 1 # count of groups or ungrouped characters + #def to_lr1_symbols(self, n_terminals, symbols, lookaheads, group_bounds): + # group_start = len(symbols) + # assert self.group_index == len(group_bounds) + # group_bounds.append(None) + # group_count = Regex.to_lr1_symbols( + # self, + # n_terminals, + # symbols, + # lookaheads, + # group_bounds + # ) + # group_bounds[self.group_index] = ( + # group_start, + # group_count, + # self.group_name, + # {i.name: i.value for i in self.group_attributes} + # ) + # return 1 # count of groups or ungrouped characters class Grammar(element.Element): class Production(element.Element): - # GENERATE ELEMENT(int nonterminal, int priority, bool right_to_left) BEGIN + class Symbol(element.Element): + # GENERATE ELEMENT(list(int) terminal_set, list(int) nonterminal_set) BEGIN + def __init__( + self, + tag = 'Grammar_Production_Symbol', + attrib = {}, + text = '', + children = [], + terminal_set = [], + nonterminal_set = [] + ): + element.Element.__init__( + self, + tag, + attrib, + text, + children + ) + self.terminal_set = ( + [element.deserialize_int(i) for i in terminal_set.split()] + if isinstance(terminal_set, str) else + terminal_set + ) + self.nonterminal_set = ( + [element.deserialize_int(i) for i in nonterminal_set.split()] + if isinstance(nonterminal_set, str) else + nonterminal_set + ) + def serialize(self, ref_list, indent = 0): + element.Element.serialize(self, ref_list, indent) + self.set( + 'terminal_set', + ' '.join([element.serialize_int(i) for i in self.terminal_set]) + ) + self.set( + 'nonterminal_set', + ' '.join([element.serialize_int(i) for i in self.nonterminal_set]) + ) + def deserialize(self, ref_list): + element.Element.deserialize(self, ref_list) + self.terminal_set = [ + element.deserialize_int(i) + for i in self.get('terminal_set', '').split() + ] + self.nonterminal_set = [ + element.deserialize_int(i) + for i in self.get('nonterminal_set', '').split() + ] + def copy(self, factory = None): + result = element.Element.copy( + self, + Symbol if factory is None else factory + ) + result.terminal_set = self.terminal_set + result.nonterminal_set = self.nonterminal_set + return result + def repr_serialize(self, params): + element.Element.repr_serialize(self, params) + if len(self.terminal_set): + params.append( + 'terminal_set = [{0:s}]'.format( + ', '.join([repr(i) for i in self.terminal_set]) + ) + ) + if len(self.nonterminal_set): + params.append( + 'nonterminal_set = [{0:s}]'.format( + ', '.join([repr(i) for i in self.nonterminal_set]) + ) + ) + def __repr__(self): + params = [] + self.repr_serialize(params) + return 'regex.Grammar.Production.Symbol({0:s})'.format(', '.join(params)) + # GENERATE END + def post_process(self, name_to_character_sets): + pass + + class NamedSymbol(Symbol): + # GENERATE ELEMENT(str name) BEGIN + def __init__( + self, + tag = 'Grammar_Production_NamedSymbol', + attrib = {}, + text = '', + children = [], + terminal_set = [], + nonterminal_set = [], + name = '' + ): + Grammar.Production.Symbol.__init__( + self, + tag, + attrib, + text, + children, + terminal_set, + nonterminal_set + ) + self.name = name + def serialize(self, ref_list, indent = 0): + Grammar.Production.Symbol.serialize(self, ref_list, indent) + self.set('name', element.serialize_str(self.name)) + def deserialize(self, ref_list): + Grammar.Production.Symbol.deserialize(self, ref_list) + self.name = element.deserialize_str(self.get('name', '')) + def copy(self, factory = None): + result = Grammar.Production.Symbol.copy( + self, + NamedSymbol if factory is None else factory + ) + result.name = self.name + return result + def repr_serialize(self, params): + Grammar.Production.Symbol.repr_serialize(self, params) + if self.name != '': + params.append( + 'name = {0:s}'.format(repr(self.name)) + ) + def __repr__(self): + params = [] + self.repr_serialize(params) + return 'regex.Grammar.Production.NamedSymbol({0:s})'.format(', '.join(params)) + # GENERATE END + def post_process(self, name_to_character_sets): + self.terminal_set, self.nonterminal_set = ( + name_to_character_sets[self.name] + ) + + # GENERATE ELEMENT(int nonterminal, int precedence, int associativity) BEGIN def __init__( self, tag = 'Grammar_Production', @@ -855,8 +984,8 @@ class Grammar(element.Element): text = '', children = [], nonterminal = -1, - priority = -1, - right_to_left = False + precedence = -1, + associativity = -1 ): element.Element.__init__( self, @@ -870,34 +999,34 @@ class Grammar(element.Element): if isinstance(nonterminal, str) else nonterminal ) - self.priority = ( - element.deserialize_int(priority) - if isinstance(priority, str) else - priority + self.precedence = ( + element.deserialize_int(precedence) + if isinstance(precedence, str) else + precedence ) - self.right_to_left = ( - element.deserialize_bool(right_to_left) - if isinstance(right_to_left, str) else - right_to_left + self.associativity = ( + element.deserialize_int(associativity) + if isinstance(associativity, str) else + associativity ) def serialize(self, ref_list, indent = 0): element.Element.serialize(self, ref_list, indent) self.set('nonterminal', element.serialize_int(self.nonterminal)) - self.set('priority', element.serialize_int(self.priority)) - self.set('right_to_left', element.serialize_bool(self.right_to_left)) + self.set('precedence', element.serialize_int(self.precedence)) + self.set('associativity', element.serialize_int(self.associativity)) def deserialize(self, ref_list): element.Element.deserialize(self, ref_list) self.nonterminal = element.deserialize_int(self.get('nonterminal', '-1')) - self.priority = element.deserialize_int(self.get('priority', '-1')) - self.right_to_left = element.deserialize_bool(self.get('right_to_left', 'false')) + self.precedence = element.deserialize_int(self.get('precedence', '-1')) + self.associativity = element.deserialize_int(self.get('associativity', '-1')) def copy(self, factory = None): result = element.Element.copy( self, Production if factory is None else factory ) result.nonterminal = self.nonterminal - result.priority = self.priority - result.right_to_left = self.right_to_left + result.precedence = self.precedence + result.associativity = self.associativity return result def repr_serialize(self, params): element.Element.repr_serialize(self, params) @@ -905,43 +1034,38 @@ class Grammar(element.Element): params.append( 'nonterminal = {0:s}'.format(repr(self.nonterminal)) ) - if self.priority != -1: + if self.precedence != -1: params.append( - 'priority = {0:s}'.format(repr(self.priority)) + 'precedence = {0:s}'.format(repr(self.precedence)) ) - if self.right_to_left != False: + if self.associativity != -1: params.append( - 'right_to_left = {0:s}'.format(repr(self.right_to_left)) + 'associativity = {0:s}'.format(repr(self.associativity)) ) def __repr__(self): params = [] self.repr_serialize(params) return 'regex.Grammar.Production({0:s})'.format(', '.join(params)) # GENERATE END - def post_process(self, nonterminal, rule_name_to_character_set): + def post_process(self, nonterminal, name_to_character_sets): self.nonterminal = nonterminal - self[0].post_process(0, rule_name_to_character_set) + for i in self: + i.post_process(name_to_character_sets) def add_to_lr1(self, lr1): - symbols = [] - lookaheads = [] - group_bounds = [] - self[0].to_lr1_symbols( - lr1.n_terminals, - symbols, - lookaheads, - group_bounds - ) - lookaheads.append(([], True)) # initial_set, can_be_empty (sentinel) lr1.productions.append( ( - self.priority * 2 + int(self.right_to_left), - symbols, - lookaheads, - group_bounds + # precedence + self.precedence * 2 + self.associativity, + # symbols + [(i.terminal_set, i.nonterminal_set) for i in self], + # lookaheads (list of initial_set, can_be_empty) + [([], False) for i in range(len(self))] + [([], True)], + # group_bounds + [] ) ) - # GENERATE ELEMENT(int n_terminals, int eof_character) BEGIN + # GENERATE ELEMENT(int n_terminals, int eof_terminal) BEGIN def __init__( self, tag = 'Grammar', @@ -949,7 +1073,7 @@ class Grammar(element.Element): text = '', children = [], n_terminals = -1, - eof_character = -1 + eof_terminal = -1 ): element.Element.__init__( self, @@ -963,26 +1087,26 @@ class Grammar(element.Element): if isinstance(n_terminals, str) else n_terminals ) - self.eof_character = ( - element.deserialize_int(eof_character) - if isinstance(eof_character, str) else - eof_character + self.eof_terminal = ( + element.deserialize_int(eof_terminal) + if isinstance(eof_terminal, str) else + eof_terminal ) def serialize(self, ref_list, indent = 0): element.Element.serialize(self, ref_list, indent) self.set('n_terminals', element.serialize_int(self.n_terminals)) - self.set('eof_character', element.serialize_int(self.eof_character)) + self.set('eof_terminal', element.serialize_int(self.eof_terminal)) def deserialize(self, ref_list): element.Element.deserialize(self, ref_list) self.n_terminals = element.deserialize_int(self.get('n_terminals', '-1')) - self.eof_character = element.deserialize_int(self.get('eof_character', '-1')) + self.eof_terminal = element.deserialize_int(self.get('eof_terminal', '-1')) def copy(self, factory = None): result = element.Element.copy( self, Grammar if factory is None else factory ) result.n_terminals = self.n_terminals - result.eof_character = self.eof_character + result.eof_terminal = self.eof_terminal return result def repr_serialize(self, params): element.Element.repr_serialize(self, params) @@ -990,20 +1114,20 @@ class Grammar(element.Element): params.append( 'n_terminals = {0:s}'.format(repr(self.n_terminals)) ) - if self.eof_character != -1: + if self.eof_terminal != -1: params.append( - 'eof_character = {0:s}'.format(repr(self.eof_character)) + 'eof_terminal = {0:s}'.format(repr(self.eof_terminal)) ) def __repr__(self): params = [] self.repr_serialize(params) return 'regex.Grammar({0:s})'.format(', '.join(params)) # GENERATE END - def post_process(self, rule_name_to_character_set): + def post_process(self, name_to_character_sets): for i in range(len(self)): - self[i].post_process(i, rule_name_to_character_set) + self[i].post_process(i, name_to_character_sets) def to_lr1(self): - lr1 = LR1([], self.n_terminals, self.eof_character) + lr1 = LR1([], self.n_terminals, self.eof_terminal) for i in self: i.add_to_lr1(lr1) # propagate lookaheads @@ -1039,7 +1163,6 @@ tag_to_class = { 'RegexCharacterOr': RegexCharacterOr, 'RegexCharacterAnd': RegexCharacterAnd, 'RegexCharacterNot': RegexCharacterNot, - 'RegexCharacterRule': RegexCharacterRule, 'RegexOr': RegexOr, 'RegexAnd': RegexAnd, 'RegexSequence': RegexSequence, @@ -1047,7 +1170,9 @@ tag_to_class = { 'RegexGroup': RegexGroup, 'RegexGroup_Attribute': RegexGroup.Attribute, 'Grammar': Grammar, - 'Grammar_Production': Grammar.Production + 'Grammar_Production': Grammar.Production, + 'Grammar_Production_Symbol': Grammar.Production.Symbol, + 'Grammar_Production_NamedSymbol': Grammar.Production.NamedSymbol } def factory(tag, attrib = {}, *args, **kwargs): return tag_to_class.get(tag, element.Element)(tag, attrib, *args, **kwargs) @@ -1711,8 +1836,8 @@ class LR1: def __init__( self, productions = [], - n_terminals = n_characters, - eof_character = n_characters + n_terminals = n_characters + 1, + eof_terminal = n_characters ): # productions: list of production # production: ( @@ -1743,10 +1868,10 @@ class LR1: # noting that markup has to be applied in reverse order of the list # n_terminals: offset to apply to productions[] index to get symbol # (character set code), also symbol for productions[0] = start production - # eof_character: usually == n_terminals (need not be valid terminal value) + # eof_terminal: usually == n_terminals - 1 (must be valid terminal value) self.productions = productions self.n_terminals = n_terminals - self.eof_character = eof_character + self.eof_terminal = eof_terminal def lookahead_item_set_closure(self, items, item_to_index): in_queue = [True for i in range(len(items))] @@ -1842,11 +1967,11 @@ class LR1: return next_items, next_item_to_index, nonterminal0, nonterminal1 def parse_text(self, text, i): - items = [(0, 0, [self.eof_character, self.eof_character + 1])] + items = [(0, 0, [self.eof_terminal, self.eof_terminal + 1])] item_to_index = {(0, 0): 0} value_stack = [] state_stack = [] - lookahead_character = ord(text[i]) if i < len(text) else self.eof_character + lookahead_character = ord(text[i]) if i < len(text) else self.eof_terminal while True: self.lookahead_item_set_closure(items, item_to_index) value_stack.append(i) @@ -1863,7 +1988,7 @@ class LR1: ) ) i += 1 - lookahead_character = ord(text[i]) if i < len(text) else self.eof_character + lookahead_character = ord(text[i]) if i < len(text) else self.eof_terminal elif len(reductions) != 0: if len(reductions) != 1: sys.stderr.write( @@ -1903,7 +2028,7 @@ class LR1: if pos < 0: pos, off = element.to_start_relative(root, pos, off) - items = [(0, 0, [self.eof_character, self.eof_character + 1])] + items = [(0, 0, [self.eof_terminal, self.eof_terminal + 1])] item_to_index = {(0, 0): 0} value_stack = [] state_stack = [] @@ -1916,7 +2041,7 @@ class LR1: try: next(yychunk_iter) except StopIteration: - lookahead_character = self.eof_character + lookahead_character = self.eof_terminal break text = element.get_text(root, pos) else: @@ -1945,7 +2070,7 @@ class LR1: try: next(yychunk_iter) except StopIteration: - lookahead_character = self.eof_character + lookahead_character = self.eof_terminal break text = element.get_text(root, pos) else: @@ -2012,10 +2137,10 @@ class LR1: for _, symbols, _, group_bounds in self.productions ], self.n_terminals, - self.eof_character + self.eof_terminal ) - items = [(0, 0, [self.eof_character, self.eof_character + 1])] + items = [(0, 0, [self.eof_terminal, self.eof_terminal + 1])] item_to_index = {(0, 0): 0} self.lookahead_item_set_closure(items, item_to_index) @@ -2096,10 +2221,10 @@ class LR1: for _, symbols, _, group_bounds in self.productions ], self.n_terminals, - self.eof_character + self.eof_terminal ) - items = [(0, 0, [self.eof_character, self.eof_character + 1])] + items = [(0, 0, [self.eof_terminal, self.eof_terminal + 1])] item_to_index = {(0, 0): 0} self.lookahead_item_set_closure(items, item_to_index) @@ -2197,7 +2322,7 @@ class LR1: return 'regex.LR1({0:s}, {1:d}, {2:d})'.format( repr(self.productions), self.n_terminals, - self.eof_character + self.eof_terminal ) class LR1DFA: @@ -2205,8 +2330,8 @@ class LR1DFA: self, states = [], productions = [], - n_terminals = n_characters, - eof_character = n_characters + n_terminals = n_characters + 1, + eof_terminal = n_characters ): # states: list of state_desc # state_desc: (terminal breaks, actions, nonterminal breaks, gotos) @@ -2222,17 +2347,17 @@ class LR1DFA: # noting that markup has to be applied in reverse order of the list # n_terminals: offset to apply to productions[] index to get symbol # (character set code), also symbol for productions[0] = start production - # eof_character: usually == n_terminals (need not be valid terminal value) + # eof_terminal: usually == n_terminals - 1 (must be valid terminal value) self.states = states self.productions = productions self.n_terminals = n_terminals - self.eof_character = eof_character + self.eof_terminal = eof_terminal def parse_text(self, text, i): state = 0 value_stack = [] state_stack = [] - lookahead_character = ord(text[i]) if i < len(text) else self.eof_character + lookahead_character = ord(text[i]) if i < len(text) else self.eof_terminal while True: value_stack.append(i) state_stack.append(state) @@ -2246,7 +2371,7 @@ class LR1DFA: if (action & 1) == 0: state = action >> 1 i += 1 - lookahead_character = ord(text[i]) if i < len(text) else self.eof_character + lookahead_character = ord(text[i]) if i < len(text) else self.eof_terminal else: reduce = action >> 1 len_symbols, group_bounds = self.productions[reduce] @@ -2288,7 +2413,7 @@ class LR1DFA: try: next(yychunk_iter) except StopIteration: - lookahead_character = self.eof_character + lookahead_character = self.eof_terminal break text = element.get_text(root, pos) else: @@ -2315,7 +2440,7 @@ class LR1DFA: try: next(yychunk_iter) except StopIteration: - lookahead_character = self.eof_character + lookahead_character = self.eof_terminal break text = element.get_text(root, pos) else: @@ -2374,7 +2499,7 @@ class LR1DFA: try: end_pos, end_off, lookahead_character = next(yylex_iter) except StopIteration: - lookahead_character = self.eof_character + lookahead_character = self.eof_terminal end_pos, end_off = element.to_end_relative(root, pos, off) while True: value_stack.append((pos, off)) @@ -2393,7 +2518,7 @@ class LR1DFA: try: end_pos, end_off, lookahead_character = next(yylex_iter) except StopIteration: - lookahead_character = self.eof_character + lookahead_character = self.eof_terminal #end_pos, end_off = element.to_end_relative(root, pos, off) else: reduce = action >> 1 @@ -2443,7 +2568,7 @@ class LR1DFA: repr(self.states), repr(self.productions), self.n_terminals, - self.eof_character + self.eof_terminal ) def wrap_repr(text, width):