-import bisect
+import bisect_set
import element
-import work
-import sys
+import nfa
# defines the alphabet size, set this to 0x11000 for unicode
-chars = 0x100
-
-def char_set_or(char_set0, char_set1):
- # calculate union of the child sets
- # we do this by calculating a series of breakpoints, at each breakpoint
- # evaluating the "or" (max) of the even/odd truth values of each child,
- # then making the output truth value even/odd by outputting if necessary
- result = []
- i = 0
- j = 0
- while True:
- if i < len(char_set0):
- k = char_set0[i]
- if j < len(char_set1):
- k = min(k, char_set1[j])
- elif j < len(char_set1):
- k = char_set1[j]
- else:
- break
- if i < len(char_set0) and char_set0[i] == k:
- i += 1
- if j < len(char_set1) and char_set1[j] == k:
- j += 1
- if (len(result) & 1) != max(i & 1, j & 1):
- result.append(k)
- assert (i & 1) == 0 and (j & 1) == 0
- return result
-
-def char_set_and(char_set0, char_set1):
- # calculate intersection of the child sets
- # we do this by calculating a series of breakpoints, at each breakpoint
- # evaluating the "and" (min) of the even/odd truth values of each child,
- # then making the output truth value even/odd by outputting if necessary
- result = []
- i = 0
- j = 0
- while True:
- if i < len(char_set0):
- k = char_set0[i]
- if j < len(char_set1):
- k = min(k, char_set1[j])
- elif j < len(char_set1):
- k = char_set1[j]
- else:
- break
- if i < len(char_set0) and char_set0[i] == k:
- i += 1
- if j < len(char_set1) and char_set1[j] == k:
- j += 1
- if (len(result) & 1) != min(i & 1, j & 1):
- result.append(k)
- assert (i & 1) == 0 and (j & 1) == 0
- return result
-
-def char_set_not(char_set):
- # calculate complement of the child set
- # if child set begins with [0], remove it, otherwise add [0] prefix
- # if child set ends with [chars], remove it, otherwise add [chars] suffix
- # the suffix part is not totally necessary, but makes sure length is even
- # (the evenness is so that single character sets can always be [c, c + 1])
- result = list(char_set)
- if result[:1] == [0]:
- del result[:1]
- else:
- result[:0] = [0]
- if result[-1:] == [chars]:
- del result[-1:]
- else:
- result.append(chars)
- return result
+n_characters = 0x100
class Regex(element.Element):
# GENERATE ELEMENT() BEGIN
self.repr_serialize(params)
return 'regex.Regex({0:s})'.format(', '.join(params))
# GENERATE END
- def post_process(self, group_index = 0, rule_char_sets = None):
+ def post_process(self, group_index = 0): #, rule_name_to_character_set = None):
for i in self:
- group_index = i.post_process(group_index, rule_char_sets)
+ group_index = i.post_process(group_index) #, rule_name_to_character_set)
return group_index
def to_groups(self, groups):
for i in self:
i.to_groups(groups)
- def to_nfa_state(self, nfa, next_state):
+ def to_nfa_state(self, _nfa, next_state):
raise NotImplementedException
- def add_to_nfa(self, nfa):
- nfa.start_state.append(self.to_nfa_state(nfa, 0))
- def to_lr1_symbols(self, terminal_thres, symbols, lookaheads, group_bounds):
- group_count = 0
- for i in self:
- group_count += (
- i.to_lr1_symbols(terminal_thres, symbols, lookaheads, group_bounds)
- )
- return group_count # count of groups or ungrouped characters
+ def add_to_nfa(self, _nfa):
+ _nfa.start_state.append(self.to_nfa_state(_nfa, 0))
+ #def to_lr1_symbols(self, n_terminals, symbols, lookaheads, group_bounds):
+ # group_count = 0
+ # for i in self:
+ # group_count += (
+ # i.to_lr1_symbols(n_terminals, symbols, lookaheads, group_bounds)
+ # )
+ # return group_count # count of groups or ungrouped characters
class RegexNone(Regex):
# GENERATE ELEMENT() BEGIN
self.repr_serialize(params)
return 'regex.RegexNone({0:s})'.format(', '.join(params))
# GENERATE END
- def to_nfa_state(self, nfa, next_state):
+ def to_nfa_state(self, _nfa, next_state):
return -1
class RegexEmpty(Regex):
self.repr_serialize(params)
return 'regex.RegexEmpty({0:s})'.format(', '.join(params))
# GENERATE END
- def to_nfa_state(self, nfa, next_state):
+ def to_nfa_state(self, _nfa, next_state):
return next_state
class RegexCharacter(Regex):
- # GENERATE ELEMENT(list(int) char_set) BEGIN
+ # GENERATE ELEMENT(list(int) character_set) BEGIN
def __init__(
self,
tag = 'RegexCharacter',
attrib = {},
text = '',
children = [],
- char_set = []
+ character_set = []
):
Regex.__init__(
self,
text,
children
)
- self.char_set = (
- [element.deserialize_int(i) for i in char_set.split()]
- if isinstance(char_set, str) else
- char_set
+ self.character_set = (
+ [element.deserialize_int(i) for i in character_set.split()]
+ if isinstance(character_set, str) else
+ character_set
)
- def serialize(self, ref_list, indent = 0):
- Regex.serialize(self, ref_list, indent)
+ def serialize(self, ref_list):
+ Regex.serialize(self, ref_list)
self.set(
- 'char_set',
- ' '.join([element.serialize_int(i) for i in self.char_set])
+ 'character_set',
+ ' '.join([element.serialize_int(i) for i in self.character_set])
)
def deserialize(self, ref_list):
Regex.deserialize(self, ref_list)
- self.char_set = [
+ self.character_set = [
element.deserialize_int(i)
- for i in self.get('char_set', '').split()
+ for i in self.get('character_set', '').split()
]
def copy(self, factory = None):
result = Regex.copy(
self,
RegexCharacter if factory is None else factory
)
- result.char_set = self.char_set
+ result.character_set = self.character_set
return result
def repr_serialize(self, params):
Regex.repr_serialize(self, params)
- if len(self.char_set):
+ if len(self.character_set):
params.append(
- 'char_set = [{0:s}]'.format(
- ', '.join([repr(i) for i in self.char_set])
+ 'character_set = [{0:s}]'.format(
+ ', '.join([repr(i) for i in self.character_set])
)
)
def __repr__(self):
self.repr_serialize(params)
return 'regex.RegexCharacter({0:s})'.format(', '.join(params))
# GENERATE END
- def to_nfa_state(self, nfa, next_state):
- new_state = len(nfa.states)
- nfa.states.append((NFA.STATE_CHARACTER, self.char_set, next_state))
+ def to_nfa_state(self, _nfa, next_state):
+ new_state = len(_nfa.states)
+ _nfa.states.append((nfa.NFA.STATE_CHARACTER, self.character_set, next_state))
return new_state
- def to_lr1_symbols(self, terminal_thres, symbols, lookaheads, group_bounds):
- terminal_set = []
- nonterminal_set = []
- i = 0
- while i < len(self.char_set):
- [j, k] = self.char_set[i:i + 2]
- if k > terminal_thres:
- if j < terminal_thres:
- terminal_set.extend([j, terminal_thres])
- nonterminal_set.extend([0, k - terminal_thres])
- i += 2
- while i < len(self.char_set):
- [j, k] = self.char_set[i:i + 2]
- nonterminal_set.extend([j - terminal_thres, k - terminal_thres])
- i += 2
- break
- terminal_set.extend([j, k])
- i += 2
- symbols.append((terminal_set, nonterminal_set))
- lookaheads.append(([], False)) # initial_set, can_be_empty
- return 1 # count of groups or ungrouped characters
+ #def to_lr1_symbols(self, n_terminals, symbols, lookaheads, group_bounds):
+ # terminal_set = []
+ # nonterminal_set = []
+ # i = 0
+ # while i < len(self.character_set):
+ # [j, k] = self.character_set[i:i + 2]
+ # if k > n_terminals:
+ # if j < n_terminals:
+ # terminal_set.extend([j, n_terminals])
+ # nonterminal_set.extend([0, k - n_terminals])
+ # i += 2
+ # while i < len(self.character_set):
+ # [j, k] = self.character_set[i:i + 2]
+ # nonterminal_set.extend([j - n_terminals, k - n_terminals])
+ # i += 2
+ # break
+ # terminal_set.extend([j, k])
+ # i += 2
+ # symbols.append((terminal_set, nonterminal_set))
+ # lookaheads.append(([], False)) # initial_set, can_be_empty
+ # return 1 # count of groups or ungrouped characters
class RegexCharacterRange(RegexCharacter):
# GENERATE ELEMENT() BEGIN
attrib = {},
text = '',
children = [],
- char_set = []
+ character_set = []
):
RegexCharacter.__init__(
self,
attrib,
text,
children,
- char_set
+ character_set
)
def copy(self, factory = None):
result = RegexCharacter.copy(
self.repr_serialize(params)
return 'regex.RegexCharacterRange({0:s})'.format(', '.join(params))
# GENERATE END
- def post_process(self, group_index = 0, rule_char_sets = None):
- group_index = RegexCharacter.post_process(self, group_index, rule_char_sets)
- self.char_set = [self[0].char_set[0], self[1].char_set[-1]]
+ def post_process(self, group_index = 0): #, rule_name_to_character_set = None):
+ group_index = RegexCharacter.post_process(self, group_index) #, rule_name_to_character_set)
+ self.character_set = [self[0].character_set[0], self[1].character_set[-1]]
return group_index
class RegexCharacterOr(RegexCharacter):
attrib = {},
text = '',
children = [],
- char_set = []
+ character_set = []
):
RegexCharacter.__init__(
self,
attrib,
text,
children,
- char_set
+ character_set
)
def copy(self, factory = None):
result = RegexCharacter.copy(
self.repr_serialize(params)
return 'regex.RegexCharacterOr({0:s})'.format(', '.join(params))
# GENERATE END
- def post_process(self, group_index = 0, rule_char_sets = None):
- group_index = RegexCharacter.post_process(self, group_index, rule_char_sets)
- self.char_set = char_set_or(self[0].char_set, self[1].char_set)
+ def post_process(self, group_index = 0): #, rule_name_to_character_set = None):
+ group_index = RegexCharacter.post_process(self, group_index) #, rule_name_to_character_set)
+ self.character_set = bisect_set.bisect_set_or(self[0].character_set, self[1].character_set)
return group_index
class RegexCharacterAnd(RegexCharacter):
attrib = {},
text = '',
children = [],
- char_set = []
+ character_set = []
):
RegexCharacter.__init__(
self,
attrib,
text,
children,
- char_set
+ character_set
)
def copy(self, factory = None):
result = RegexCharacter.copy(
self.repr_serialize(params)
return 'regex.RegexCharacterAnd({0:s})'.format(', '.join(params))
# GENERATE END
- def post_process(self, group_index = 0, rule_char_sets = None):
- group_index = RegexCharacter.post_process(self, group_index, rule_char_sets)
- self.char_set = char_set_and(self[0].char_set, self[1].char_set)
+ def post_process(self, group_index = 0): #, rule_name_to_character_set = None):
+ group_index = RegexCharacter.post_process(self, group_index) #, rule_name_to_character_set)
+ self.character_set = bisect_set.bisect_set_and(self[0].character_set, self[1].character_set)
return group_index
class RegexCharacterNot(RegexCharacter):
attrib = {},
text = '',
children = [],
- char_set = []
+ character_set = []
):
RegexCharacter.__init__(
self,
attrib,
text,
children,
- char_set
+ character_set
)
def copy(self, factory = None):
result = RegexCharacter.copy(
self.repr_serialize(params)
return 'regex.RegexCharacterNot({0:s})'.format(', '.join(params))
# GENERATE END
- def post_process(self, group_index = 0, rule_char_sets = None):
- group_index = RegexCharacter.post_process(self, group_index, rule_char_sets)
- self.char_set = char_set_not(self[0].char_set)
+ def post_process(self, group_index = 0): #, rule_name_to_character_set = None):
+ group_index = RegexCharacter.post_process(self, group_index) #, rule_name_to_character_set)
+ self.character_set = bisect_set.bisect_set_not(self[0].character_set)
return group_index
-class RegexCharacterRule(RegexCharacter):
- # GENERATE ELEMENT(str rule_name) BEGIN
- def __init__(
- self,
- tag = 'RegexCharacterRule',
- attrib = {},
- text = '',
- children = [],
- char_set = [],
- rule_name = ''
- ):
- RegexCharacter.__init__(
- self,
- tag,
- attrib,
- text,
- children,
- char_set
- )
- self.rule_name = rule_name
- def serialize(self, ref_list, indent = 0):
- RegexCharacter.serialize(self, ref_list, indent)
- self.set('rule_name', element.serialize_str(self.rule_name))
- def deserialize(self, ref_list):
- RegexCharacter.deserialize(self, ref_list)
- self.rule_name = element.deserialize_str(self.get('rule_name', ''))
- def copy(self, factory = None):
- result = RegexCharacter.copy(
- self,
- RegexCharacterRule if factory is None else factory
- )
- result.rule_name = self.rule_name
- return result
- def repr_serialize(self, params):
- RegexCharacter.repr_serialize(self, params)
- if self.rule_name != '':
- params.append(
- 'rule_name = {0:s}'.format(repr(self.rule_name))
- )
- def __repr__(self):
- params = []
- self.repr_serialize(params)
- return 'regex.RegexCharacterRule({0:s})'.format(', '.join(params))
- # GENERATE END
- def post_process(self, group_index = 0, rule_char_sets = None):
- if rule_char_sets is not None:
- self.char_set = rule_char_sets[self.rule_name]
- return RegexCharacter.post_process(self, group_index, rule_char_sets)
+#class RegexCharacterRule(RegexCharacter):
+# # GENERATE ELEMENT(str rule_name) BEGIN
+# def __init__(
+# self,
+# tag = 'RegexCharacterRule',
+# attrib = {},
+# text = '',
+# children = [],
+# character_set = [],
+# rule_name = ''
+# ):
+# RegexCharacter.__init__(
+# self,
+# tag,
+# attrib,
+# text,
+# children,
+# character_set
+# )
+# self.rule_name = rule_name
+# def serialize(self, ref_list, indent = 0):
+# RegexCharacter.serialize(self, ref_list, indent)
+# self.set('rule_name', element.serialize_str(self.rule_name))
+# def deserialize(self, ref_list):
+# RegexCharacter.deserialize(self, ref_list)
+# self.rule_name = element.deserialize_str(self.get('rule_name', ''))
+# def copy(self, factory = None):
+# result = RegexCharacter.copy(
+# self,
+# RegexCharacterRule if factory is None else factory
+# )
+# result.rule_name = self.rule_name
+# return result
+# def repr_serialize(self, params):
+# RegexCharacter.repr_serialize(self, params)
+# if self.rule_name != '':
+# params.append(
+# 'rule_name = {0:s}'.format(repr(self.rule_name))
+# )
+# def __repr__(self):
+# params = []
+# self.repr_serialize(params)
+# return 'regex.RegexCharacterRule({0:s})'.format(', '.join(params))
+# # GENERATE END
+# def post_process(self, group_index = 0, rule_name_to_character_set = None):
+# if rule_name_to_character_set is not None:
+# self.character_set = rule_name_to_character_set[self.rule_name]
+# return RegexCharacter.post_process(self, group_index, rule_name_to_character_set)
class RegexOr(Regex):
# GENERATE ELEMENT() BEGIN
self.repr_serialize(params)
return 'regex.RegexOr({0:s})'.format(', '.join(params))
# GENERATE END
- def to_nfa_state(self, nfa, next_state):
- child0_state = self[0].to_nfa_state(nfa, next_state)
- child1_state = self[1].to_nfa_state(nfa, next_state)
+ def to_nfa_state(self, _nfa, next_state):
+ child0_state = self[0].to_nfa_state(_nfa, next_state)
+ child1_state = self[1].to_nfa_state(_nfa, next_state)
if child0_state == -1:
return child1_state
if child1_state == -1:
return child0_state
- new_state = len(nfa.states)
- nfa.states.append((NFA.STATE_OR, child0_state, child1_state))
+ new_state = len(_nfa.states)
+ _nfa.states.append((nfa.NFA.STATE_OR, child0_state, child1_state))
return new_state
class RegexAnd(Regex):
self.repr_serialize(params)
return 'regex.RegexAnd({0:s})'.format(', '.join(params))
# GENERATE END
- def to_nfa_state(self, nfa, next_state):
- join0_state = len(nfa.states)
- nfa.states.append(NFA.join0_state) # takes no arguments so use static one
- join1_state = len(nfa.states)
- nfa.states.append((NFA.STATE_JOIN1, next_state))
- child0_state = self[0].to_nfa_state(nfa, join0_state)
+ def to_nfa_state(self, _nfa, next_state):
+ join0_state = len(_nfa.states)
+ _nfa.states.append(nfa.NFA.join0_state) # takes no arguments so use static one
+ join1_state = len(_nfa.states)
+ _nfa.states.append((nfa.NFA.STATE_JOIN1, next_state))
+ child0_state = self[0].to_nfa_state(_nfa, join0_state)
if child0_state == -1:
return -1
- child1_state = self[1].to_nfa_state(nfa, join1_state)
+ child1_state = self[1].to_nfa_state(_nfa, join1_state)
if child1_state == -1:
return -1
- new_state = len(nfa.states)
- nfa.states.append((NFA.STATE_AND, child0_state, child1_state))
+ new_state = len(_nfa.states)
+ _nfa.states.append((nfa.NFA.STATE_AND, child0_state, child1_state))
return new_state
class RegexSequence(Regex):
self.repr_serialize(params)
return 'regex.RegexSequence({0:s})'.format(', '.join(params))
# GENERATE END
- def to_nfa_state(self, nfa, next_state):
- child1_state = self[1].to_nfa_state(nfa, next_state)
+ def to_nfa_state(self, _nfa, next_state):
+ child1_state = self[1].to_nfa_state(_nfa, next_state)
if child1_state == -1:
return -1
- return self[0].to_nfa_state(nfa, child1_state)
+ return self[0].to_nfa_state(_nfa, child1_state)
class RegexRepeat(Regex):
# GENERATE ELEMENT(int count0, int count1, bool non_greedy) BEGIN
if isinstance(non_greedy, str) else
non_greedy
)
- def serialize(self, ref_list, indent = 0):
- Regex.serialize(self, ref_list, indent)
+ def serialize(self, ref_list):
+ Regex.serialize(self, ref_list)
self.set('count0', element.serialize_int(self.count0))
self.set('count1', element.serialize_int(self.count1))
self.set('non_greedy', element.serialize_bool(self.non_greedy))
self.repr_serialize(params)
return 'regex.RegexRepeat({0:s})'.format(', '.join(params))
# GENERATE END
- def post_process(self, group_index = 0, rule_char_sets = None):
+ def post_process(self, group_index = 0): #, rule_name_to_character_set = None):
# total hack which will be done in a Python action in future
if len(self) >= 2:
assert self[1].tag == 'Number'
self.count1 = self.count0
del self[1:]
# end total hack
- return Regex.post_process(self, group_index, rule_char_sets)
- def to_nfa_state(self, nfa, next_state):
+ return Regex.post_process(self, group_index) #, rule_name_to_character_set)
+ def to_nfa_state(self, _nfa, next_state):
count0 = self.count0
count1 = self.count1
if count1 == -1:
- new_state = len(nfa.states)
- nfa.states.append(None)
- child_state = self[0].to_nfa_state(nfa, new_state)
+ new_state = len(_nfa.states)
+ _nfa.states.append(None)
+ child_state = self[0].to_nfa_state(_nfa, new_state)
if child_state == -1:
new_state = next_state # note: unreachable state remains invalid (None)
else:
- nfa.states[new_state] = (
- (NFA.STATE_OR, next_state, child_state)
+ _nfa.states[new_state] = (
+ (nfa.NFA.STATE_OR, next_state, child_state)
if self.non_greedy else
- (NFA.STATE_OR, child_state, next_state)
+ (nfa.NFA.STATE_OR, child_state, next_state)
)
else:
new_state = next_state
for i in range(count1 - count0):
- child_state = self[0].to_nfa_state(nfa, new_state)
+ child_state = self[0].to_nfa_state(_nfa, new_state)
if child_state == -1:
break
- new_state = len(nfa.states)
- nfa.states.append(
- (NFA.STATE_OR, next_state, child_state)
+ new_state = len(_nfa.states)
+ _nfa.states.append(
+ (nfa.NFA.STATE_OR, next_state, child_state)
if self.non_greedy else
- (NFA.STATE_OR, child_state, next_state)
+ (nfa.NFA.STATE_OR, child_state, next_state)
)
for i in range(count0):
- new_state = self[0].to_nfa_state(nfa, new_state)
+ new_state = self[0].to_nfa_state(_nfa, new_state)
if new_state == -1:
break
return new_state
)
self.name = name
self.value = value
- def serialize(self, ref_list, indent = 0):
- element.Element.serialize(self, ref_list, indent)
+ def serialize(self, ref_list):
+ element.Element.serialize(self, ref_list)
self.set('name', element.serialize_str(self.name))
self.set('value', element.serialize_str(self.value))
def deserialize(self, ref_list):
)
self.group_name = group_name
self.group_attributes = group_attributes
- def serialize(self, ref_list, indent = 0):
- Regex.serialize(self, ref_list, indent)
+ def serialize(self, ref_list):
+ Regex.serialize(self, ref_list)
self.set('group_index', element.serialize_int(self.group_index))
self.set('group_name', element.serialize_str(self.group_name))
self.set(
self.repr_serialize(params)
return 'regex.RegexGroup({0:s})'.format(', '.join(params))
# GENERATE END
- def post_process(self, group_index = 0, rule_char_sets = None):
+ def post_process(self, group_index = 0): #, rule_name_to_character_set = None):
# total hack which will be done in a Python action in future
if len(self) >= 2:
assert self[0].tag == 'GroupName'
# end total hack
self.group_index = group_index
group_index += 1
- return Regex.post_process(self, group_index, rule_char_sets)
+ return Regex.post_process(self, group_index) #, rule_name_to_character_set)
def to_groups(self, groups):
assert len(groups) == self.group_index
groups.append(
(self.group_name, {i.name: i.value for i in self.group_attributes})
)
return Regex.to_groups(self, groups)
- def to_nfa_state(self, nfa, next_state):
- mark_state = len(nfa.states)
- nfa.states.append((NFA.STATE_MARK, self.group_index * 2 + 1, next_state))
- child_state = self[0].to_nfa_state(nfa, mark_state)
+ def to_nfa_state(self, _nfa, next_state):
+ mark_state = len(_nfa.states)
+ _nfa.states.append((nfa.NFA.STATE_MARK, self.group_index * 2 + 1, next_state))
+ child_state = self[0].to_nfa_state(_nfa, mark_state)
if child_state == -1:
return -1
- new_state = len(nfa.states)
- nfa.states.append((NFA.STATE_MARK, self.group_index * 2, child_state))
+ new_state = len(_nfa.states)
+ _nfa.states.append((nfa.NFA.STATE_MARK, self.group_index * 2, child_state))
return new_state
- def to_lr1_symbols(self, terminal_thres, symbols, lookaheads, group_bounds):
- group_start = len(symbols)
- assert self.group_index == len(group_bounds)
- group_bounds.append(None)
- group_count = Regex.to_lr1_symbols(
- self,
- terminal_thres,
- symbols,
- lookaheads,
- group_bounds
- )
- group_bounds[self.group_index] = (
- group_start,
- group_count,
- self.group_name,
- {i.name: i.value for i in self.group_attributes}
- )
- return 1 # count of groups or ungrouped characters
-
-class Grammar(element.Element):
- class Production(element.Element):
- # GENERATE ELEMENT(int nonterminal, int priority, bool right_to_left) BEGIN
- def __init__(
- self,
- tag = 'Grammar_Production',
- attrib = {},
- text = '',
- children = [],
- nonterminal = -1,
- priority = -1,
- right_to_left = False
- ):
- element.Element.__init__(
- self,
- tag,
- attrib,
- text,
- children
- )
- self.nonterminal = (
- element.deserialize_int(nonterminal)
- if isinstance(nonterminal, str) else
- nonterminal
- )
- self.priority = (
- element.deserialize_int(priority)
- if isinstance(priority, str) else
- priority
- )
- self.right_to_left = (
- element.deserialize_bool(right_to_left)
- if isinstance(right_to_left, str) else
- right_to_left
- )
- def serialize(self, ref_list, indent = 0):
- element.Element.serialize(self, ref_list, indent)
- self.set('nonterminal', element.serialize_int(self.nonterminal))
- self.set('priority', element.serialize_int(self.priority))
- self.set('right_to_left', element.serialize_bool(self.right_to_left))
- def deserialize(self, ref_list):
- element.Element.deserialize(self, ref_list)
- self.nonterminal = element.deserialize_int(self.get('nonterminal', '-1'))
- self.priority = element.deserialize_int(self.get('priority', '-1'))
- self.right_to_left = element.deserialize_bool(self.get('right_to_left', 'false'))
- def copy(self, factory = None):
- result = element.Element.copy(
- self,
- Production if factory is None else factory
- )
- result.nonterminal = self.nonterminal
- result.priority = self.priority
- result.right_to_left = self.right_to_left
- return result
- def repr_serialize(self, params):
- element.Element.repr_serialize(self, params)
- if self.nonterminal != -1:
- params.append(
- 'nonterminal = {0:s}'.format(repr(self.nonterminal))
- )
- if self.priority != -1:
- params.append(
- 'priority = {0:s}'.format(repr(self.priority))
- )
- if self.right_to_left != False:
- params.append(
- 'right_to_left = {0:s}'.format(repr(self.right_to_left))
- )
- def __repr__(self):
- params = []
- self.repr_serialize(params)
- return 'regex.Grammar.Production({0:s})'.format(', '.join(params))
- # GENERATE END
-
- # GENERATE ELEMENT(int terminal_thres) BEGIN
- def __init__(
- self,
- tag = 'Grammar',
- attrib = {},
- text = '',
- children = [],
- terminal_thres = -1
- ):
- element.Element.__init__(
- self,
- tag,
- attrib,
- text,
- children
- )
- self.terminal_thres = (
- element.deserialize_int(terminal_thres)
- if isinstance(terminal_thres, str) else
- terminal_thres
- )
- def serialize(self, ref_list, indent = 0):
- element.Element.serialize(self, ref_list, indent)
- self.set('terminal_thres', element.serialize_int(self.terminal_thres))
- def deserialize(self, ref_list):
- element.Element.deserialize(self, ref_list)
- self.terminal_thres = element.deserialize_int(self.get('terminal_thres', '-1'))
- def copy(self, factory = None):
- result = element.Element.copy(
- self,
- Grammar if factory is None else factory
- )
- result.terminal_thres = self.terminal_thres
- return result
- def repr_serialize(self, params):
- element.Element.repr_serialize(self, params)
- if self.terminal_thres != -1:
- params.append(
- 'terminal_thres = {0:s}'.format(repr(self.terminal_thres))
- )
- def __repr__(self):
- params = []
- self.repr_serialize(params)
- return 'regex.Grammar({0:s})'.format(', '.join(params))
- # GENERATE END
- def post_process(self, rule_char_sets):
- for i in range(len(self)):
- self[i].nonterminal = i
- self[i][0].post_process(0, rule_char_sets)
- def to_lr1(self):
- lr1 = LR1([], self.terminal_thres)
- for i in self:
- symbols = []
- lookaheads = []
- group_bounds = []
- i[0].to_lr1_symbols(
- self.terminal_thres,
- symbols,
- lookaheads,
- group_bounds
- )
- lookaheads.append(([], True)) # initial_set, can_be_empty (sentinel)
- lr1.productions.append(
- (
- i.priority * 2 + int(i.right_to_left),
- symbols,
- lookaheads,
- group_bounds
- )
- )
- # propagate lookaheads
- modified = True
- while modified:
- modified = False
- for _, symbols, lookaheads, _ in lr1.productions:
- for i in range(len(symbols) - 1, -1, -1):
- initial_set, nonterminal_set = symbols[i]
- can_be_empty = False
- for j in range(0, len(nonterminal_set), 2):
- for k in range(nonterminal_set[j], nonterminal_set[j + 1]):
- child_initial_set, child_can_be_empty = lr1.productions[k][2][0]
- initial_set = char_set_or(initial_set, child_initial_set)
- can_be_empty = can_be_empty or child_can_be_empty
- # at this point can_be_empty refers to current symbol only
- if can_be_empty:
- next_initial_set, can_be_empty = lookaheads[i + 1]
- initial_set = char_set_or(initial_set, next_initial_set)
- # at this point can_be_empty refers to all remaining symbols
- if (initial_set, can_be_empty) != lookaheads[i]:
- lookaheads[i] = (initial_set, can_be_empty)
- modified = True
- return lr1
+ #def to_lr1_symbols(self, n_terminals, symbols, lookaheads, group_bounds):
+ # group_start = len(symbols)
+ # assert self.group_index == len(group_bounds)
+ # group_bounds.append(None)
+ # group_count = Regex.to_lr1_symbols(
+ # self,
+ # n_terminals,
+ # symbols,
+ # lookaheads,
+ # group_bounds
+ # )
+ # group_bounds[self.group_index] = (
+ # group_start,
+ # group_count,
+ # self.group_name,
+ # {i.name: i.value for i in self.group_attributes}
+ # )
+ # return 1 # count of groups or ungrouped characters
# GENERATE FACTORY(element.Element) BEGIN
tag_to_class = {
'RegexCharacterOr': RegexCharacterOr,
'RegexCharacterAnd': RegexCharacterAnd,
'RegexCharacterNot': RegexCharacterNot,
- 'RegexCharacterRule': RegexCharacterRule,
'RegexOr': RegexOr,
'RegexAnd': RegexAnd,
'RegexSequence': RegexSequence,
'RegexRepeat': RegexRepeat,
'RegexGroup': RegexGroup,
- 'RegexGroup_Attribute': RegexGroup.Attribute,
- 'Grammar': Grammar,
- 'Grammar_Production': Grammar.Production
+ 'RegexGroup_Attribute': RegexGroup.Attribute
}
def factory(tag, attrib = {}, *args, **kwargs):
return tag_to_class.get(tag, element.Element)(tag, attrib, *args, **kwargs)
# GENERATE END
-class NFA:
- # state_desc classes:
- # (STATE_CHARACTER, char_set, next_state)
- # (STATE_OR, next_state0, next_state1)
- # (STATE_AND, next_state0, next_state1)
- # (STATE_JOIN0,)
- # (STATE_JOIN1, next_state)
- # (STATE_MARK, mark_value, next_state)
-
- STATE_CHARACTER = 0
- STATE_OR = 1
- STATE_AND = 2
- STATE_JOIN0 = 3
- STATE_JOIN1 = 4
- STATE_MARK = 5
- join0_state = (STATE_JOIN0,)
-
- # multistate classes:
- # (MULTISTATE_ACCEPT, 1) accept, occupies one thread in list
- # (MULTISTATE_AND, n, state, child) and, occupies n threads in list
- # (MULTISTATE_OR, n, child0, child1)
- # n = sum of n from child states, that is, multistate[1] is the number of
- # (MULTISTATE_ACCEPT, 1) leaf states ultimately reachable from this subtree
-
- MULTISTATE_ACCEPT = 0
- MULTISTATE_AND = 1
- MULTISTATE_OR = 2
- accept_multistate = (MULTISTATE_ACCEPT, 1)
-
- def __init__(
- self,
- groups = [],
- states = [(STATE_CHARACTER, [0, chars], 0)],
- start_state = [] # can have multiple NFAs in same container
- ):
- # groups: list of group_desc
- # group_desc: (tag, kwargs)
- # tag, kwargs will be passed to apply_markup() hence factory()
- self.groups = groups
- self.states = states
- self.start_state = start_state
-
- def multistate_next(self, root_multistate, char):
- # the deduplication works as effectively a second pass which goes
- # over the multistate tree in pre-order, looking for OR-disjunctions
- # of any depth and configuration, e.g. (a OR b) or (c OR d), and
- # collecting the subexpressions e.g. a, b, c OR b, c, d, c OR d, and
- # failing any subexpression that's previously occurred in pre-order
- # (children of AND-conjunctions get a fresh empty subexpression set)
-
- # unfortunately it can't be done as a second pass literally, because
- # we have to generate the transition list, thus we need to know if a
- # subexpression is going to be failed, so we can delete it's threads
-
- # therefore the deduplication is tacked onto the end of the building
- # process, examining each built node just before the builder returns,
- # if already in the subexpression set we will rewrite the transition
- # list produced by the recursive calls, otherwise we will add it in
-
- # the problem is determining the subexpression set to pass into any
- # recursive calls, the caller's set will be sent unchanged by cases
- # that return the result unchanged or add an OR-disjuction, an empty
- # set will be sent by cases that add an AND-conjuction to the result
-
- # if we have added something to the node built by a recursive call,
- # we'll fall into the deduplication logic, otherwise just return it
-
- def advance1(multistate, join_count, done_multistates):
- # modifies nonlocal: transition
- assert multistate[0] == NFA.MULTISTATE_AND
- _, _, state, child = multistate
- if state == 0:
- # note that if we reach the accepting state, we must be a top-level
- # expression, and could not be part of an AND-conjunction (because
- # AND-conjunction terms always go to a join0 or join1 state first),
- # we remove ourselves to indicate to scanner that match is complete
- assert join_count == 0
- assert child == NFA.accept_multistate
- return advance(child, 0, done_multistates)
- state_desc = self.states[state]
- if state_desc[0] == NFA.STATE_CHARACTER:
- if join_count != 0:
- transition.append((DFA.TRANSITION_POP, child[1]))
- return None
- len_transition = len(transition)
- child = advance(child, 0, set())
- if child is None:
- return None
- result = (NFA.MULTISTATE_AND, child[1], state, child)
- elif state_desc[0] == NFA.STATE_OR:
- _, next_state0, next_state1 = state_desc
- len_transition = len(transition)
- transition.append((DFA.TRANSITION_DUP, child[1]))
- child0 = advance1(
- (NFA.MULTISTATE_AND, child[1], next_state0, child),
- join_count,
- done_multistates
- )
- child1 = advance1(
- (NFA.MULTISTATE_AND, child[1], next_state1, child),
- join_count,
- done_multistates
- )
- if child0 is None:
- return child1
- if child1 is None:
- return child0
- result = (NFA.MULTISTATE_OR, child0[1] + child1[1], child0, child1)
- elif state_desc[0] == NFA.STATE_AND:
- _, next_state0, next_state1 = state_desc
- return advance1(
- (
- NFA.MULTISTATE_AND,
- child[1],
- next_state0,
- (NFA.MULTISTATE_AND, child[1], next_state1, child)
- ),
- join_count,
- done_multistates
- )
- elif state_desc[0] == NFA.STATE_JOIN0:
- return advance(child, join_count + 1, done_multistates)
- elif state_desc[0] == NFA.STATE_JOIN1:
- _, next_state = state_desc
- if join_count == 0:
- transition.append((DFA.TRANSITION_POP, child[1]))
- return None
- return advance1(
- (NFA.MULTISTATE_AND, child[1], next_state, child),
- join_count - 1,
- done_multistates
- )
- elif state_desc[0] == NFA.STATE_MARK:
- _, mark_value, next_state = state_desc
- transition.append((DFA.TRANSITION_MARK, child[1], mark_value))
- return advance1(
- (NFA.MULTISTATE_AND, child[1], next_state, child),
- join_count,
- done_multistates
- )
- else:
- assert False
- if result in done_multistates:
- transition[len_transition:] = [(DFA.TRANSITION_POP, multistate[1])]
- return None
- done_multistates.add(result)
- return result
-
- def advance(multistate, join_count, done_multistates):
- nonlocal char0, char1 # modifies nonlocal: transition
- if multistate[0] == NFA.MULTISTATE_ACCEPT:
- assert join_count == 0
- len_transition = len(transition)
- transition.append((DFA.TRANSITION_MOVE, 1))
- result = NFA.accept_multistate # takes no arguments so use static one
- elif multistate[0] == NFA.MULTISTATE_AND:
- if char >= 0:
- _, _, state, child = multistate
- state_desc = self.states[state]
- assert state_desc[0] == NFA.STATE_CHARACTER
- _, char_set, next_state = state_desc
- k = bisect.bisect_right(char_set, char)
- if k > 0 and char0 < char_set[k - 1]:
- char0 = char_set[k - 1]
- if k < len(char_set) and char1 > char_set[k]:
- char1 = char_set[k]
- if (k & 1) == 0:
- transition.append((DFA.TRANSITION_POP, child[1]))
- return None
- multistate = (NFA.MULTISTATE_AND, child[1], next_state, child)
- return advance1(multistate, join_count, done_multistates)
- elif multistate[0] == NFA.MULTISTATE_OR:
- _, _, child0, child1 = multistate
- len_transition = len(transition)
- child0 = advance(child0, join_count, done_multistates)
- child1 = advance(child1, join_count, done_multistates)
- if child0 is None:
- return child1
- if child1 is None:
- return child0
- result = (NFA.MULTISTATE_OR, child0[1] + child1[1], child0, child1)
- else:
- assert False
- if result in done_multistates:
- transition[len_transition:] = [(DFA.TRANSITION_POP, multistate[1])]
- return None
- done_multistates.add(result)
- return result
-
- transition = []
- char0 = 0
- char1 = chars
- root_multistate = advance(root_multistate, 0, set())
- return root_multistate, transition, char0, char1
-
- def multistate_accept(root_multistate):
- i = 0
- def accept(multistate):
- nonlocal i
- if multistate[0] == NFA.MULTISTATE_ACCEPT:
- return True
- if multistate[0] == NFA.MULTISTATE_AND:
- _, _, _, child = multistate
- i += child[1]
- return False
- if multistate[0] == NFA.MULTISTATE_OR:
- _, _, child0, child1 = multistate
- return accept(child0) or accept(child1)
- assert False
- return i if accept(root_multistate) else -1
-
- def match_text(self, text, i, start_index = 0):
- def transit(transition):
- nonlocal threads0, threads1, prefix_slop # note: also uses i
- j = prefix_slop
- for trans in transition:
- if trans[0] == DFA.TRANSITION_POP:
- j += trans[1]
- elif trans[0] == DFA.TRANSITION_DUP:
- while j < trans[1]:
- threads0[:0] = [None] * prefix_slop
- threads1[:0] = [None] * prefix_slop
- j += prefix_slop
- prefix_slop *= 2
- threads0[j - trans[1]:j] = threads0[j:j + trans[1]]
- j -= trans[1]
- elif trans[0] == DFA.TRANSITION_MARK:
- threads0[j:j + trans[1]] = [
- (i, trans[2], thread)
- for thread in threads0[j:j + trans[1]]
- ]
- elif trans[0] == DFA.TRANSITION_MOVE:
- threads1.extend(threads0[j:j + trans[1]])
- j += trans[1]
- #elif trans[0] == DFA.TRANSITION_DEL:
- # del threads1[-trans[1]:]
- else:
- assert False
- assert j == len(threads0)
- threads0, threads1 = threads1, threads0
- del threads1[prefix_slop:]
-
- threads0 = [None, None]
- threads1 = [None]
- prefix_slop = 1
-
- start_state = self.start_state[start_index]
- if start_state == -1:
- return None
- next_multistate, transition, _, _ = self.multistate_next(
- (NFA.MULTISTATE_AND, 1, start_state, NFA.accept_multistate),
- -1
- )
- while next_multistate is not None:
- transit(transition)
- assert len(threads0) == prefix_slop + next_multistate[1]
- if next_multistate == NFA.accept_multistate:
- # there is only one match, which is complete
- assert len(threads0) == prefix_slop + 1
- return threads0[prefix_slop]
- if i >= len(text):
- # return best match we have, but not incomplete match
- i = NFA.multistate_accept(next_multistate)
- return (None if i == -1 else threads0[prefix_slop + i])
- next_multistate, transition, _, _ = (
- self.multistate_next(next_multistate, ord(text[i]))
- )
- i += 1
- return None
-
- def match_yychunk(self, root, pos, off, yychunk_iter, start_index = 0):
- if pos < 0:
- pos, off = element.to_start_relative(root, pos, off)
-
- def transit(transition):
- nonlocal threads0, threads1, prefix_slop # note: also uses pos, off
- j = prefix_slop
- for trans in transition:
- if trans[0] == DFA.TRANSITION_POP:
- j += trans[1]
- elif trans[0] == DFA.TRANSITION_DUP:
- while j < trans[1]:
- threads0[:0] = [None] * prefix_slop
- threads1[:0] = [None] * prefix_slop
- j += prefix_slop
- prefix_slop *= 2
- threads0[j - trans[1]:j] = threads0[j:j + trans[1]]
- j -= trans[1]
- elif trans[0] == DFA.TRANSITION_MARK:
- threads0[j:j + trans[1]] = [
- (pos, off, trans[2], thread)
- for thread in threads0[j:j + trans[1]]
- ]
- elif trans[0] == DFA.TRANSITION_MOVE:
- threads1.extend(threads0[j:j + trans[1]])
- j += trans[1]
- #elif trans[0] == DFA.TRANSITION_DEL:
- # del threads1[-trans[1]:]
- else:
- assert False
- assert j == len(threads0)
- threads0, threads1 = threads1, threads0
- del threads1[prefix_slop:]
-
- threads0 = [None, None]
- threads1 = [None]
- prefix_slop = 1
-
- start_state = self.start_state[start_index]
- if start_state == -1:
- return None
- next_multistate, transition, _, _ = self.multistate_next(
- (NFA.MULTISTATE_AND, 1, start_state, NFA.accept_multistate),
- -1
- )
- text = element.get_text(root, pos)
- while next_multistate is not None:
- transit(transition)
- assert len(threads0) == prefix_slop + next_multistate[1]
- if next_multistate == NFA.accept_multistate:
- # there is only one match, which is complete
- assert len(threads0) == prefix_slop + 1
- return threads0[prefix_slop]
- while off >= len(text):
- if pos < len(root):
- pos += 1
- off = 0
- else:
- try:
- next(yychunk_iter)
- except StopIteration:
- # return best match we have, but not incomplete match
- i = NFA.multistate_accept(next_multistate)
- return (None if i == -1 else threads0[prefix_slop + i])
- text = element.get_text(root, pos)
- next_multistate, transition, _, _ = (
- self.multistate_next(next_multistate, ord(text[off]))
- )
- off += 1
- return None
-
- def to_dfa(self):
- dfa = DFA(list(self.groups))
-
- accept_key = (NFA.accept_multistate, ())
- action_to_meaning = [accept_key]
- meaning_to_action = {accept_key: 0}
- state_to_meaning = [NFA.accept_multistate]
- meaning_to_state = {NFA.accept_multistate: 0}
-
- for start_state in self.start_state:
- if start_state == -1:
- start_action = -1
- else:
- next_multistate, transition, _, _ = self.multistate_next(
- (NFA.MULTISTATE_AND, 1, start_state, NFA.accept_multistate),
- -1
- )
- if next_multistate is None:
- start_action = -1
- else:
- start_key = (next_multistate, tuple(transition))
- start_action = len(action_to_meaning)
- meaning_to_action[start_key] = start_action
- action_to_meaning.append(start_key)
- dfa.start_action.append(start_action)
-
- while len(dfa.actions) < len(action_to_meaning):
- next_multistate, transition = action_to_meaning[len(dfa.actions)]
- if next_multistate in meaning_to_state:
- next_state = meaning_to_state[next_multistate]
- else:
- next_state = len(state_to_meaning)
- state_to_meaning.append(next_multistate)
- meaning_to_state[next_multistate] = next_state
- dfa.actions.append((next_state, list(transition)))
-
- while len(dfa.states) < len(state_to_meaning):
- char = 0
- multistate = state_to_meaning[len(dfa.states)]
- state_desc = ([], [], NFA.multistate_accept(multistate))
- while char < chars:
- next_multistate, transition, char0, char1 = self.multistate_next(
- multistate,
- char
- )
- assert char0 == char and char1 > char
- if next_multistate is None:
- action = -1
- else:
- # optimize transition (optional)
- i = 0
- j = 0
- while i < len(transition):
- if transition[i][0] == DFA.TRANSITION_POP:
- n = transition[i][1]
- i += 1
- while (
- i < len(transition) and
- transition[i][0] == DFA.TRANSITION_POP
- ):
- n += transition[i][1]
- i += 1
- transition[j] = (DFA.TRANSITION_POP, n)
- elif transition[i][0] == DFA.TRANSITION_MOVE:
- n = transition[i][1]
- i += 1
- while (
- i < len(transition) and
- transition[i][0] == DFA.TRANSITION_MOVE
- ):
- n += transition[i][1]
- i += 1
- transition[j] = (DFA.TRANSITION_MOVE, n)
- else:
- transition[j] = transition[i]
- i += 1
- j += 1
- del transition[j:]
- # end optimize transition
- key = (next_multistate, tuple(transition))
- if key in meaning_to_action:
- action = meaning_to_action[key]
- else:
- action = len(action_to_meaning)
- action_to_meaning.append(key)
- meaning_to_action[key] = action
- state_desc[0].append(char1)
- state_desc[1].append(action)
- char = char1
- dfa.states.append(state_desc)
- return dfa
-
- def __repr__(self):
- return 'regex.NFA({0:s}, {1:s}, {2:s})'.format(
- repr(self.groups),
- repr(self.states),
- repr(self.start_state)
- )
-
-class DFA:
- # transition classes:
- # instructions to transform thread list from a multistate to next multistate
- # (TRANSITION_POP, n) j += n
- # (TRANSITION_DUP, n) threads0[j - n:j] = threads0[j:j + n]
- # j -= n
- # (TRANSITION_MARK, n, mark_value) threads0[j:j + n] = [
- # (i, mark, thread)
- # for thread in threads0[j:j + n]
- # ]
- # (TRANSITION_MOVE, n) threads1.extend(threads0[j:j + n])
- # j += n
- # (TRANSITION_DEL, n) del threads1[-n:]
-
- TRANSITION_POP = 0
- TRANSITION_DUP = 1
- TRANSITION_MARK = 2
- TRANSITION_MOVE = 3
- #TRANSITION_DEL = 4
-
- def __init__(
- self,
- groups = [],
- states = [([chars], [0], 0)],
- actions = [(0, [])],
- start_action = [] # can have multiple DFAs in same container
- ):
- # groups: list of group_desc
- # group_desc: (tag, kwargs)
- # tag, kwargs will be passed to apply_markup() hence factory()
- # states: list of state_desc
- # state_desc: (list of breaks, list of action to do, accept_thread)
- # actions: list of action_desc
- # action_desc: (state to go to next, compiled transition to do first)
- # accept_thread: which thread of thread list to use, -1 don't accept
- self.groups = groups
- self.states = states
- self.actions = actions
- self.start_action = start_action
-
- def match_text(self, text, i, start_index = 0):
- def transit(transition):
- nonlocal threads0, threads1, prefix_slop # note: also uses i
- j = prefix_slop
- for trans in transition:
- if trans[0] == DFA.TRANSITION_POP:
- j += trans[1]
- elif trans[0] == DFA.TRANSITION_DUP:
- while j < trans[1]:
- threads0[:0] = [None] * prefix_slop
- threads1[:0] = [None] * prefix_slop
- j += prefix_slop
- prefix_slop *= 2
- threads0[j - trans[1]:j] = threads0[j:j + trans[1]]
- j -= trans[1]
- elif trans[0] == DFA.TRANSITION_MARK:
- threads0[j:j + trans[1]] = [
- (i, trans[2], thread)
- for thread in threads0[j:j + trans[1]]
- ]
- elif trans[0] == DFA.TRANSITION_MOVE:
- threads1.extend(threads0[j:j + trans[1]])
- j += trans[1]
- #elif trans[0] == DFA.TRANSITION_DEL:
- # del threads1[-trans[1]:]
- else:
- assert False
- assert j == len(threads0)
- threads0, threads1 = threads1, threads0
- del threads1[prefix_slop:]
-
- threads0 = [None, None]
- threads1 = [None]
- prefix_slop = 1
-
- action = self.start_action[start_index]
- while action != -1:
- state, transition = self.actions[action]
- #print('i', i, 'action', action, 'state', state, 'transition', transition)
- transit(transition)
- if state == 0:
- # there is only one match, which is complete
- assert len(threads0) == prefix_slop + 1
- return threads0[prefix_slop]
- if i >= len(text):
- # return best match we have, but not incomplete match
- i = self.states[state][2]
- return (None if i == -1 else threads0[prefix_slop + i])
- action = self.states[state][1][
- bisect.bisect_right(self.states[state][0], ord(text[i]))
- ]
- i += 1
- return None
-
- def match_yychunk(self, root, pos, off, yychunk_iter, start_index = 0):
- if pos < 0:
- pos, off = element.to_start_relative(root, pos, off)
-
- def transit(transition):
- nonlocal threads0, threads1, prefix_slop # note: also uses pos, off
- j = prefix_slop
- for trans in transition:
- if trans[0] == DFA.TRANSITION_POP:
- j += trans[1]
- elif trans[0] == DFA.TRANSITION_DUP:
- while j < trans[1]:
- threads0[:0] = [None] * prefix_slop
- threads1[:0] = [None] * prefix_slop
- j += prefix_slop
- prefix_slop *= 2
- threads0[j - trans[1]:j] = threads0[j:j + trans[1]]
- j -= trans[1]
- elif trans[0] == DFA.TRANSITION_MARK:
- threads0[j:j + trans[1]] = [
- (pos, off, trans[2], thread)
- for thread in threads0[j:j + trans[1]]
- ]
- elif trans[0] == DFA.TRANSITION_MOVE:
- threads1.extend(threads0[j:j + trans[1]])
- j += trans[1]
- #elif trans[0] == DFA.TRANSITION_DEL:
- # del threads1[-trans[1]:]
- else:
- assert False
- assert j == len(threads0)
- threads0, threads1 = threads1, threads0
- del threads1[prefix_slop:]
-
- threads0 = [None, None]
- threads1 = [None]
- prefix_slop = 1
-
- action = self.start_action[start_index]
- text = element.get_text(root, pos)
- while action != -1:
- state, transition = self.actions[action]
- transit(transition)
- if state == 0:
- # there is only one match, which is complete
- assert len(threads0) == prefix_slop + 1
- return threads0[prefix_slop]
- while off >= len(text):
- if pos < len(root):
- pos += 1
- off = 0
- else:
- try:
- next(yychunk_iter)
- except StopIteration:
- # return best match we have, but not incomplete match
- i = self.states[state][2]
- return (None if i == -1 else threads0[prefix_slop + i])
- text = element.get_text(root, pos)
- #print(
- # 'state {0:d} pos {1:d} off {2:d} text "{3:s}"'.format(
- # state,
- # pos,
- # off,
- # text.replace('\n', '$')
- # )
- #)
- action = self.states[state][1][
- bisect.bisect_right(self.states[state][0], ord(text[off]))
- ]
- off += 1
- return None
-
- def yylex(self, root, pos, off, factory, yychunk_iter):
- if pos < 0:
- pos, off = element.to_start_relative(root, pos, off)
-
- while True:
- # note: pointers must be kept start relative during the below call,
- # because it extends the following text by calling the yychunk_iter
- thread = self.match_yychunk(root, pos, off, yychunk_iter)
- if thread is None:
- break
- stack = []
- while True:
- pos, off, mark_value, thread = thread
- group_index = mark_value >> 1
- if (mark_value & 1) != 0:
- end_pos, end_off = element.to_end_relative(root, pos, off)
- stack.append((end_pos, end_off, group_index))
- else:
- end_pos, end_off, temp = stack.pop()
- assert temp == group_index
- if len(stack) == 0:
- break
- tag, kwargs = self.groups[group_index]
- if tag != '':
- work.apply_markup(
- root,
- pos,
- off,
- end_pos,
- end_off,
- factory,
- tag,
- **kwargs
- )
- # note: pointers must be kept end relative during the below call,
- # because it modifies the preceding text by calling apply_markup()
- yield end_pos, end_off, group_index
- pos, off = element.to_start_relative(root, end_pos, end_off)
-
- def __repr__(self):
- return 'regex.DFA({0:s}, {1:s}, {2:s}, {3:s})'.format(
- repr(self.groups),
- repr(self.states),
- repr(self.actions),
- repr(self.start_action)
- )
-
-class LR1:
- def __init__(self, productions = [], terminal_thres = chars):
- # productions: list of production
- # production: (
- # priority,
- # symbols,
- # lookaheads,
- # group_bounds
- # )
- # priority: bit 0 = right to left, bits 1: = numeric priority
- # symbols: list of symbol_desc
- # symbol_desc: (terminal_set, nonterminal_set)
- # terminal_set: similar to char_set, even length list of pairs of breaks
- # nonterminal_set: as above but has terminal_thres subtracted from breaks
- # lookaheads: list of lookahead_desc, len(lookaheads) = len(symbols) + 1
- # lookahead_desc: (initial_set, can_be_empty)
- # initial_set: what terminals can occur at this position in symbols array,
- # computed such that lookaheads[i][0] is symbols[i][0], plus initial set of
- # each nonterminal from symbols[i][1], plus lookaheads[i + 1][0] if any
- # nonterminal of symbols[i][1] can be empty (may pull in i + 2, i + 3 etc)
- # can_be_empty: whether all symbols from this position to end can be empty
- # note: last lookahead_desc is a sentinel consisting of ([], True), so that
- # initial_set is empty (will be augmented by context) and can_be_empty is
- # True (because all symbols from the end to the end can obviously be empty)
- # group_bounds: list of group_bound
- # group_bound: (start_index, end_index, tag, kwargs)
- # where start_index, end_index are indices into list of char_set,
- # and tag, kwargs will be passed to apply_markup() hence factory(),
- # noting that markup has to be applied in reverse order of the list
- # terminal_thres: offset to apply to productions[] index to get symbol
- # (char set code), also symbol for productions[0] = start production
- self.productions = productions
- self.terminal_thres = terminal_thres
-
- def lookahead_item_set_closure(self, items, item_to_index):
- in_queue = [True for i in range(len(items))]
- queue = list(range(len(items)))
-
- qhead = 0
- while qhead < len(queue):
- i = queue[qhead]
- in_queue[i] = False
- j, k, lookahead_set = items[i]
- _, symbols, lookaheads, _ = self.productions[j]
- if k < len(symbols):
- _, nonterminal_set = symbols[k]
- if len(nonterminal_set):
- next_lookahead_set, next_can_be_empty = lookaheads[k + 1]
- if next_can_be_empty:
- next_lookahead_set = char_set_or(
- next_lookahead_set,
- lookahead_set
- )
- for l in range(0, len(nonterminal_set), 2):
- for m in range(nonterminal_set[l], nonterminal_set[l + 1]):
- key = (m, 0)
- if key in item_to_index:
- n = item_to_index[key]
- child_lookahead_set = char_set_or(
- items[n][2],
- next_lookahead_set
- )
- if child_lookahead_set != items[n][2]:
- items[n] = (m, 0, child_lookahead_set)
- if not in_queue[n]:
- # optimization: do not re-queue unless it is transparent
- # to changes in the lookahead set wrt. further propagation
- _, child_symbols, child_lookaheads, _ = self.productions[m]
- if len(child_symbols) and child_lookaheads[1][1]:
- in_queue[n] = True
- queue.append(n)
- else:
- n = len(items)
- items.append((m, 0, next_lookahead_set))
- item_to_index[key] = n
- in_queue.append(True)
- queue.append(n)
- qhead += 1
-
- def lookahead_item_set_shift(self, items, terminal):
- next_items = []
- next_item_to_index = {}
- reductions = set()
- terminal0 = 0
- terminal1 = self.terminal_thres
- for i, j, lookahead_set in items:
- _, symbols, _, _ = self.productions[i]
- if j < len(symbols):
- terminal_set, _ = symbols[j]
- k = bisect.bisect_right(terminal_set, terminal)
- if k > 0 and terminal0 < terminal_set[k - 1]:
- terminal0 = terminal_set[k - 1]
- if k < len(terminal_set) and terminal1 > terminal_set[k]:
- terminal1 = terminal_set[k]
- if (k & 1) == 1:
- next_item_to_index[(i, j + 1)] = len(next_items)
- next_items.append((i, j + 1, lookahead_set))
- else:
- k = bisect.bisect_right(lookahead_set, terminal)
- if k > 0 and terminal0 < lookahead_set[k - 1]:
- terminal0 = lookahead_set[k - 1]
- if k < len(lookahead_set) and terminal1 > lookahead_set[k]:
- terminal1 = lookahead_set[k]
- if (k & 1) == 1:
- reductions.add(i)
- return next_items, next_item_to_index, reductions, terminal0, terminal1
-
- def lookahead_item_set_goto(self, items, nonterminal):
- next_items = []
- next_item_to_index = {}
- reductions = set()
- nonterminal0 = 0
- nonterminal1 = len(self.productions)
- for i, j, lookahead_set in items:
- _, symbols, _, _ = self.productions[i]
- if j < len(symbols):
- _, nonterminal_set = symbols[j]
- k = bisect.bisect_right(nonterminal_set, nonterminal)
- if k > 0 and nonterminal0 < nonterminal_set[k - 1]:
- nonterminal0 = nonterminal_set[k - 1]
- if k < len(nonterminal_set) and nonterminal1 > nonterminal_set[k]:
- nonterminal1 = nonterminal_set[k]
- if (k & 1) == 1:
- next_item_to_index[(i, j + 1)] = len(next_items)
- next_items.append((i, j + 1, lookahead_set))
- return next_items, next_item_to_index, nonterminal0, nonterminal1
-
- def parse_text(self, text, i):
- items = [(0, 0, [chars, chars + 1])] # EOF
- item_to_index = {(0, 0): 0}
- value_stack = []
- state_stack = []
- lookahead_char = ord(text[i]) if i < len(text) else chars # EOF
- while True:
- self.lookahead_item_set_closure(items, item_to_index)
- value_stack.append(i)
- state_stack.append(items)
- items, item_to_index, reductions, _, _ = (
- self.lookahead_item_set_shift(items, lookahead_char)
- )
- if len(items) != 0:
- if len(reductions) != 0:
- sys.stderr.write(
- 'shift/reduce conflict: {0:s} vs {1:s}\n'.format(
- ','.join([str(i) for i, _, _ in next_items]),
- ','.join([str(i) for i in reductions])
- )
- )
- i += 1
- lookahead_char = ord(text[i]) if i < len(text) else chars # EOF
- elif len(reductions) != 0:
- if len(reductions) != 1:
- sys.stderr.write(
- 'reduce/reduce conflict: {0:s}\n'.format(
- ','.join([str(i) for i in reductions])
- )
- )
- reduce = min(reductions)
- _, symbols, _, group_bounds = self.productions[reduce]
- base = len(value_stack) - len(symbols) - 1
- for j in range(len(group_bounds) - 1, -1, -1):
- k, l, tag, _ = group_bounds[j]
- k += base
- if l != 1:
- value_stack[k + 1:k + l + 1] = [value_stack[k + l]]
- sys.stderr.write(
- 'text \'{0:s}\' tag \'{1:s}\'\n'.format(
- text[value_stack[k]:value_stack[k + 1]],
- tag
- )
- )
- del value_stack[base + 1:]
- del state_stack[base + 1:]
- if reduce == 0:
- assert base == 0
- return
- items, item_to_index, _, _ = (
- self.lookahead_item_set_goto(state_stack[-1], reduce)
- )
- assert len(items) != 0
- else:
- raise Exception(
- 'syntax error at {0:d}: {1:s}'.format(i, text[i:])
- )
-
- def parse_yychunk(self, root, pos, off, factory, yychunk_iter):
- if pos < 0:
- pos, off = element.to_start_relative(root, pos, off)
-
- items = [(0, 0, [chars, chars + 1])] # EOF
- item_to_index = {(0, 0): 0}
- value_stack = []
- state_stack = []
- text = element.get_text(root, pos)
- while off >= len(text):
- if pos < len(root):
- pos += 1
- off = 0
- else:
- try:
- next(yychunk_iter)
- except StopIteration:
- lookahead_char = chars # EOF
- break
- text = element.get_text(root, pos)
- else:
- lookahead_char = ord(text[off])
- while True:
- self.lookahead_item_set_closure(items, item_to_index)
- value_stack.append((pos, off))
- state_stack.append(items)
- items, item_to_index, reductions, _, _ = (
- self.lookahead_item_set_shift(items, lookahead_char)
- )
- if len(items) != 0:
- if len(reductions) != 0:
- sys.stderr.write(
- 'shift/reduce conflict: {0:s} vs {1:s}\n'.format(
- ','.join([str(i) for i, _ in next_lookahead_item_set.keys()]),
- ','.join([str(i) for i in reductions])
- )
- )
- off += 1
- while off >= len(text):
- if pos < len(root):
- pos += 1
- off = 0
- else:
- try:
- next(yychunk_iter)
- except StopIteration:
- lookahead_char = chars # EOF
- break
- text = element.get_text(root, pos)
- else:
- lookahead_char = ord(text[off])
- elif len(reductions) != 0:
- if len(reductions) != 1:
- sys.stderr.write(
- 'reduce/reduce conflict: {0:s}\n'.format(
- ','.join([str(i) for i in reductions])
- )
- )
- reduce = min(reductions)
- _, symbols, _, group_bounds = self.productions[reduce]
- base = len(value_stack) - len(symbols) - 1
- end_relative = len(value_stack)
- for j in range(len(group_bounds) - 1, -1, -1):
- k, l, tag, kwargs = group_bounds[j]
- k += base
- assert k < end_relative
- if l != 1:
- value_stack[k + 1:k + l + 1] = [value_stack[k + l]]
- end_relative = max(k + 1, end_relative + 1 - l)
- while end_relative > k + 1:
- end_relative -= 1
- pos1, off1 = value_stack[end_relative]
- value_stack[end_relative] = (
- element.to_end_relative(root, pos1, off1)
- )
- pos0, off0 = value_stack[k]
- pos1, off1 = value_stack[k + 1]
- work.apply_markup(
- root,
- pos0,
- off0,
- pos1,
- off1,
- factory,
- tag,
- **kwargs
- )
- if end_relative < len(value_stack):
- pos, off = value_stack[-1]
- pos, off = element.to_start_relative(root, pos, off)
- text = element.get_text(root, pos)
- del value_stack[base + 1:]
- del state_stack[base + 1:]
- if reduce == 0:
- assert base == 0
- return
- items, item_to_index, _, _ = (
- self.lookahead_item_set_goto(state_stack[-1], reduce)
- )
- assert len(items) != 0
- else:
- raise Exception(
- 'syntax error at {0:d},{1:d}: {2:s}'.format(pos, off, text[off:])
- )
-
- def to_clr1(self):
- lr1dfa = LR1DFA(
- [],
- [
- (len(symbols), group_bounds)
- for _, symbols, _, group_bounds in self.productions
- ],
- self.terminal_thres
- )
-
- items = [(0, 0, [chars, chars + 1])] # EOF
- item_to_index = {(0, 0): 0}
- self.lookahead_item_set_closure(items, item_to_index)
-
- items = sorted(items)
- key = tuple((i, j, tuple(k)) for i, j, k in items)
- state_to_items = [items]
- items_to_state = {key: 0}
-
- while len(lr1dfa.states) < len(state_to_items):
- items = state_to_items[len(lr1dfa.states)]
- state_desc = ([], [], [], [])
-
- def add_state(next_items, next_item_to_index):
- self.lookahead_item_set_closure(next_items, next_item_to_index)
- new_items = sorted(next_items)
- key = tuple((i, j, tuple(k)) for i, j, k in new_items)
- if key in items_to_state:
- state = items_to_state[key]
- else:
- state = len(state_to_items)
- state_to_items.append(new_items)
- items_to_state[key] = state
- return state
-
- terminal = 0
- while terminal < self.terminal_thres:
- next_items, next_item_to_index, reductions, terminal0, terminal1 = (
- self.lookahead_item_set_shift(items, terminal)
- )
- assert terminal0 == terminal and terminal1 > terminal
- if len(next_items) != 0:
- if len(reductions) != 0:
- sys.stderr.write(
- 'state {0:d} shift/reduce conflict: {1:s} vs {2:s}\n'.format(
- len(lr1dfa.states),
- ','.join([str(i) for i, _, _ in next_items]),
- ','.join([str(i) for i in reductions])
- )
- )
- action = add_state(next_items, next_item_to_index) * 2
- elif len(reductions) != 0:
- if len(reductions) != 1:
- sys.stderr.write(
- 'state {0:d} reduce/reduce conflict: {1:s}\n'.format(
- len(lr1dfa.states),
- ','.join([str(i) for i in reductions])
- )
- )
- action = min(reductions) * 2 + 1 # odd means reduce
- else:
- action = -1
- state_desc[0].append(terminal1)
- state_desc[1].append(action)
- terminal = terminal1
-
- nonterminal = 0
- while nonterminal < len(self.productions):
- next_items, next_item_to_index, nonterminal0, nonterminal1 = (
- self.lookahead_item_set_goto(items, nonterminal)
- )
- assert nonterminal0 == nonterminal and nonterminal1 > nonterminal
- if len(next_items) != 0:
- goto = add_state(next_items, next_item_to_index)
- else:
- goto = -1
- state_desc[2].append(nonterminal1)
- state_desc[3].append(goto)
- nonterminal = nonterminal1
-
- lr1dfa.states.append(state_desc)
- return lr1dfa
-
- def to_lalr1(self):
- lr1dfa = LR1DFA(
- [],
- [
- (len(symbols), group_bounds)
- for _, symbols, _, group_bounds in self.productions
- ],
- self.terminal_thres
- )
-
- items = [(0, 0, [chars, chars + 1])] # EOF
- item_to_index = {(0, 0): 0}
- self.lookahead_item_set_closure(items, item_to_index)
-
- items = sorted(items)
- key = tuple((i, j) for i, j, _ in items) # ignore lookahead
- state_to_items = [items]
- items_to_state = {key: 0}
-
- in_queue = [True]
- queue = [0]
-
- qhead = 0
- while qhead < len(queue):
- i = queue[qhead]
- in_queue[i] = False
- items = state_to_items[i]
- state_desc = ([], [], [], [])
-
- def add_state(next_items, next_item_to_index):
- self.lookahead_item_set_closure(next_items, next_item_to_index)
- new_items = sorted(next_items)
- key = tuple((i, j) for i, j, _ in new_items) # ignore lookahead
- if key in items_to_state:
- state = items_to_state[key]
- state_items = state_to_items[state]
- for i in range(len(new_items)):
- j, k, lookahead_set = new_items[i]
- lookahead_set = char_set_or(lookahead_set, state_items[i][2])
- if lookahead_set != state_items[i][2]:
- state_items[i] = (j, k, lookahead_set)
- if not in_queue[state]:
- in_queue[state] = True
- queue.append(state)
- else:
- state = len(state_to_items)
- state_to_items.append(new_items)
- items_to_state[key] = state
- in_queue.append(True)
- queue.append(state)
- return state
-
- terminal = 0
- while terminal < self.terminal_thres:
- next_items, next_item_to_index, reductions, terminal0, terminal1 = (
- self.lookahead_item_set_shift(items, terminal)
- )
- assert terminal0 == terminal and terminal1 > terminal
- if len(next_items) != 0:
- if len(reductions) != 0:
- sys.stderr.write(
- 'state {0:d} shift/reduce conflict: {1:s} vs {2:s}\n'.format(
- i,
- ','.join([str(j) for j, _, _ in next_items]),
- ','.join([str(j) for j in reductions])
- )
- )
- action = add_state(next_items, next_item_to_index) * 2
- elif len(reductions) != 0:
- if len(reductions) != 1:
- sys.stderr.write(
- 'state {0:d} reduce/reduce conflict: {1:s}\n'.format(
- i,
- ','.join([str(j) for j in reductions])
- )
- )
- action = min(reductions) * 2 + 1 # odd means reduce
- else:
- action = -1
- state_desc[0].append(terminal1)
- state_desc[1].append(action)
- terminal = terminal1
-
- nonterminal = 0
- while nonterminal < len(self.productions):
- next_items, next_item_to_index, nonterminal0, nonterminal1 = (
- self.lookahead_item_set_goto(items, nonterminal)
- )
- assert nonterminal0 == nonterminal and nonterminal1 > nonterminal
- if len(next_items) != 0:
- goto = add_state(next_items, next_item_to_index)
- else:
- goto = -1
- state_desc[2].append(nonterminal1)
- state_desc[3].append(goto)
- nonterminal = nonterminal1
-
- if i < len(lr1dfa.states):
- lr1dfa.states[i] = state_desc
- else:
- lr1dfa.states.append(state_desc)
- qhead += 1
- return lr1dfa
-
- def __repr__(self):
- return 'regex.LR1({0:s}, {1:d})'.format(
- repr(self.productions),
- self.terminal_thres
- )
-
-class LR1DFA:
- def __init__(self, states = [], productions = [], terminal_thres = chars):
- # states: list of state_desc
- # state_desc: (terminal breaks, actions, nonterminal breaks, gotos)
- # action: shift = new state * 2, reduce = production * 2 + 1, error = -1
- # goto: reduce = production, error = -1 (but error can't really happen)
- # productions: list of production
- # production: (len(symbols), group_bounds)
- # len(symbols): how many states to pop stack to reduce this production
- # group_bounds: list of group_bound
- # group_bound: (start_index, end_index, tag, kwargs)
- # where start_index, end_index are indices into list of char_set,
- # and tag, kwargs will be passed to apply_markup() hence factory(),
- # noting that markup has to be applied in reverse order of the list
- # terminal_thres: offset to apply to productions[] index to get symbol
- # (char set code), also symbol for productions[0] = start production
- self.states = states
- self.productions = productions
- self.terminal_thres = terminal_thres
-
- def parse_text(self, text, i):
- state = 0
- value_stack = []
- state_stack = []
- lookahead_char = ord(text[i]) if i < len(text) else chars # EOF
- while True:
- value_stack.append(i)
- state_stack.append(state)
- action = self.states[state][1][
- bisect.bisect_right(self.states[state][0], lookahead_char)
- ]
- if action == -1:
- raise Exception(
- 'syntax error at {0:d}: {1:s}'.format(i, text[i:])
- )
- if (action & 1) == 0:
- state = action >> 1
- i += 1
- lookahead_char = ord(text[i]) if i < len(text) else chars # EOF
- else:
- reduce = action >> 1
- len_symbols, group_bounds = self.productions[reduce]
- base = len(value_stack) - len_symbols - 1
- for j in range(len(group_bounds) - 1, -1, -1):
- k, l, tag, _ = group_bounds[j]
- k += base
- if l != 1:
- value_stack[k + 1:k + l + 1] = [value_stack[k + l]]
- sys.stdout.write(
- 'text \'{0:s}\' tag \'{1:s}\'\n'.format(
- text[value_stack[k]:value_stack[k + 1]],
- tag
- )
- )
- del value_stack[base + 1:]
- del state_stack[base + 1:]
- if reduce == 0:
- assert base == 0
- return
- state = self.states[state_stack[-1]][3][
- bisect.bisect_right(self.states[state_stack[-1]][2], reduce)
- ]
- assert state != -1
-
- def parse_yychunk(self, root, pos, off, factory, yychunk_iter):
- if pos < 0:
- pos, off = element.to_start_relative(root, pos, off)
-
- state = 0
- value_stack = []
- state_stack = []
- text = element.get_text(root, pos)
- while off >= len(text):
- if pos < len(root):
- pos += 1
- off = 0
- else:
- try:
- next(yychunk_iter)
- except StopIteration:
- lookahead_char = chars # EOF
- break
- text = element.get_text(root, pos)
- else:
- lookahead_char = ord(text[off])
- while True:
- value_stack.append((pos, off))
- state_stack.append(state)
- action = self.states[state][1][
- bisect.bisect_right(self.states[state][0], lookahead_char)
- ]
- #print('lookahead_char', lookahead_char, 'action', action)
- if action == -1:
- raise Exception(
- 'syntax error at {0:d},{1:d}: {2:s}'.format(pos, off, text[off:])
- )
- if (action & 1) == 0:
- state = action >> 1
- off += 1
- while off >= len(text):
- if pos < len(root):
- pos += 1
- off = 0
- else:
- try:
- next(yychunk_iter)
- except StopIteration:
- lookahead_char = chars # EOF
- break
- text = element.get_text(root, pos)
- else:
- lookahead_char = ord(text[off])
- else:
- reduce = action >> 1
- len_symbols, group_bounds = self.productions[reduce]
- base = len(value_stack) - len_symbols - 1
- end_relative = len(value_stack)
- for j in range(len(group_bounds) - 1, -1, -1):
- k, l, tag, kwargs = group_bounds[j]
- k += base
- assert k < end_relative
- if l != 1:
- value_stack[k + 1:k + l + 1] = [value_stack[k + l]]
- end_relative = max(k + 1, end_relative + 1 - l)
- while end_relative > k + 1:
- end_relative -= 1
- pos1, off1 = value_stack[end_relative]
- value_stack[end_relative] = (
- element.to_end_relative(root, pos1, off1)
- )
- pos0, off0 = value_stack[k]
- pos1, off1 = value_stack[k + 1]
- work.apply_markup(
- root,
- pos0,
- off0,
- pos1,
- off1,
- factory,
- tag,
- **kwargs
- )
- if end_relative < len(value_stack):
- pos, off = value_stack[-1]
- pos, off = element.to_start_relative(root, pos, off)
- text = element.get_text(root, pos)
- del value_stack[base + 1:]
- del state_stack[base + 1:]
- if reduce == 0:
- assert base == 0
- return
- state = self.states[state_stack[-1]][3][
- bisect.bisect_right(self.states[state_stack[-1]][2], reduce)
- ]
- assert state != -1
-
- def yyparse(self, root, pos, off, factory, yylex_iter):
- if pos < 0:
- pos, off = element.to_start_relative(root, pos, off)
-
- state = 0
- value_stack = []
- state_stack = []
- try:
- end_pos, end_off, lookahead_char = next(yylex_iter)
- except StopIteration:
- lookahead_char = chars # EOF
- end_pos, end_off = element.to_end_relative(root, pos, off)
- while True:
- value_stack.append((pos, off))
- state_stack.append(state)
- action = self.states[state][1][
- bisect.bisect_right(self.states[state][0], lookahead_char)
- ]
- #print('lookahead_char', lookahead_char, 'action', action)
- if action == -1:
- raise Exception(
- 'syntax error at {0:d},{1:d}: {2:d}'.format(pos, off, lookahead_char)
- )
- if (action & 1) == 0:
- state = action >> 1
- pos, off = element.to_start_relative(root, end_pos, end_off)
- try:
- end_pos, end_off, lookahead_char = next(yylex_iter)
- except StopIteration:
- lookahead_char = chars # EOF
- #end_pos, end_off = element.to_end_relative(root, pos, off)
- else:
- reduce = action >> 1
- len_symbols, group_bounds = self.productions[reduce]
- base = len(value_stack) - len_symbols - 1
- end_relative = len(value_stack)
- for j in range(len(group_bounds) - 1, -1, -1):
- k, l, tag, kwargs = group_bounds[j]
- k += base
- assert k < end_relative
- if l != 1:
- value_stack[k + 1:k + l + 1] = [value_stack[k + l]]
- end_relative = max(k + 1, end_relative + 1 - l)
- while end_relative > k + 1:
- end_relative -= 1
- pos1, off1 = value_stack[end_relative]
- value_stack[end_relative] = (
- element.to_end_relative(root, pos1, off1)
- )
- pos0, off0 = value_stack[k]
- pos1, off1 = value_stack[k + 1]
- work.apply_markup(
- root,
- pos0,
- off0,
- pos1,
- off1,
- factory,
- tag,
- **kwargs
- )
- if end_relative < len(value_stack):
- pos, off = value_stack[-1]
- pos, off = element.to_start_relative(root, pos, off)
- del value_stack[base + 1:]
- del state_stack[base + 1:]
- if reduce == 0:
- assert base == 0
- return
- state = self.states[state_stack[-1]][3][
- bisect.bisect_right(self.states[state_stack[-1]][2], reduce)
- ]
- assert state != -1
-
- def __repr__(self):
- return 'regex.LR1DFA({0:s}, {1:s}, {2:d})'.format(
- repr(self.states),
- repr(self.productions),
- self.terminal_thres
- )
-
-def wrap_repr(text, width):
- lines = []
- i = 0
- while i < len(text):
- j = i + width
- if j < len(text):
- j = max(
- [
- text.rfind('(', i, j) + 1,
- text.rfind('[', i, j) + 1,
- text.rfind('{', i, j) + 1,
- text.rfind('.', i, j) + 1,
- text.rfind(')', i, j + 1),
- text.rfind(']', i, j + 1),
- text.rfind('}', i, j + 1),
- text.rfind(' ', i, j + 1),
- ]
- )
- assert j > 0
- lines.append(text[i:j] + '\n')
- i = j
- while text[i:i + 1] == ' ':
- i += 1
- return ''.join(lines)
-
if __name__ == '__main__':
+ import sys
import xml.etree.ElementTree
regex = RegexAnd(children = [RegexRepeat(children = [RegexCharacterNot(
-children = [RegexCharacter()], char_set = [0, 256])]), RegexGroup(children = [
+children = [RegexCharacter()], character_set = [0, 256])]), RegexGroup(children = [
RegexOr(children = [RegexOr(children = [RegexOr(children = [RegexGroup(children
-= [RegexRepeat(children = [RegexCharacter(char_set = [9, 14, 32, 33])],
+= [RegexRepeat(children = [RegexCharacter(character_set = [9, 14, 32, 33])],
one_or_more = True)], group_index = 1, group_name = 'Whitespace'), RegexGroup(
-children = [RegexRepeat(children = [RegexCharacter(char_set = [48, 58])],
+children = [RegexRepeat(children = [RegexCharacter(character_set = [48, 58])],
one_or_more = True)], group_index = 2, group_name = 'Number')]), RegexGroup(
children = [RegexSequence(children = [RegexSequence(children = [RegexSequence(
-children = [RegexEmpty(), RegexCharacter(char_set = [102, 103])]),
-RegexCharacter(char_set = [111, 112])]), RegexCharacter(char_set = [114, 115])]
+children = [RegexEmpty(), RegexCharacter(character_set = [102, 103])]),
+RegexCharacter(character_set = [111, 112])]), RegexCharacter(character_set = [114, 115])]
)], group_index = 3, group_name = 'For')]), RegexGroup(children = [
-RegexSequence(children = [RegexCharacter(char_set = [65, 91, 95, 96, 97, 123]),
-RegexRepeat(children = [RegexCharacter(char_set = [48, 58, 65, 91, 95, 96, 97,
+RegexSequence(children = [RegexCharacter(character_set = [65, 91, 95, 96, 97, 123]),
+RegexRepeat(children = [RegexCharacter(character_set = [48, 58, 65, 91, 95, 96, 97,
123])])])], group_index = 4, group_name = 'Identifier')])], group_index = 0)])
#sys.stdout.write(
# wrap_repr(
# )
#)
- nfa = regex.to_nfa()
+ _nfa = regex.to_nfa()
#sys.stdout.write(
# wrap_repr(
- # ' nfa = {0:s}'.format(repr(nfa).replace('regex.', '')),
+ # ' _nfa = {0:s}'.format(repr(_nfa).replace('regex.', '')),
# 79
# )
#)
i = 0
while i < len(text):
print('text "{0:s}"'.format(text[i:i + 72].replace('\n', '$')))
- thread = nfa.match_text(text, i)
+ thread = _nfa.match_text(text, i)
if thread is None:
print('no match')
break
i = thread[0] # end position of overall match
- group_start = [-1 for j in range(len(nfa.groups))]
- group_end = [-1 for j in range(len(nfa.groups))]
+ group_start = [-1 for j in range(len(_nfa.groups))]
+ group_end = [-1 for j in range(len(_nfa.groups))]
while thread is not None:
pos, mark, thread = thread
group = mark >> 1
print(
'group {0:d} name "{1:s}" text "{2:s}"'.format(
group,
- nfa.groups[group][0],
+ _nfa.groups[group][0],
text[group_start[group]:group_end[group]].replace('\n', '$')
)
)
else:
group_end[group] = pos
- dfa = nfa.to_dfa()
+ dfa = _nfa.to_dfa()
#sys.stdout.write(
# wrap_repr(
# ' dfa = {0:s}'.format(repr(dfa).replace('regex.', '')),
group_end[group] = pos
grammar = Grammar(children = [Grammar.Production(children = [RegexSequence(
-children = [RegexSequence(children = [RegexEmpty(), RegexCharacterRule(char_set
-= [288, 295], rule_name = 'whitespace_opt')]), RegexCharacterRule(char_set = [
+children = [RegexSequence(children = [RegexEmpty(), RegexCharacterRule(character_set
+= [288, 295], rule_name = 'whitespace_opt')]), RegexCharacterRule(character_set = [
259, 262], rule_name = 'expr0')])], nonterminal = 0), Grammar.Production(
-children = [RegexSequence(children = [RegexEmpty(), RegexCharacterRule(char_set
+children = [RegexSequence(children = [RegexEmpty(), RegexCharacterRule(character_set
= [262, 265], rule_name = 'expr1')])], nonterminal = 1), Grammar.Production(
children = [RegexSequence(children = [RegexEmpty(), RegexGroup(children = [
RegexSequence(children = [RegexSequence(children = [RegexSequence(children = [
-RegexSequence(children = [RegexEmpty(), RegexCharacterRule(char_set = [259, 262
-], rule_name = 'expr0')]), RegexCharacter(char_set = [43, 44])]),
-RegexCharacterRule(char_set = [288, 295], rule_name = 'whitespace_opt')]),
-RegexCharacterRule(char_set = [262, 265], rule_name = 'expr1')])], group_index
+RegexSequence(children = [RegexEmpty(), RegexCharacterRule(character_set = [259, 262
+], rule_name = 'expr0')]), RegexCharacter(character_set = [43, 44])]),
+RegexCharacterRule(character_set = [288, 295], rule_name = 'whitespace_opt')]),
+RegexCharacterRule(character_set = [262, 265], rule_name = 'expr1')])], group_index
= 0, group_name = 'Add')])], nonterminal = 2), Grammar.Production(children = [
RegexSequence(children = [RegexEmpty(), RegexGroup(children = [RegexSequence(
children = [RegexSequence(children = [RegexSequence(children = [RegexSequence(
-children = [RegexEmpty(), RegexCharacterRule(char_set = [259, 262], rule_name =
-'expr0')]), RegexCharacter(char_set = [45, 46])]), RegexCharacterRule(char_set
-= [288, 295], rule_name = 'whitespace_opt')]), RegexCharacterRule(char_set = [
+children = [RegexEmpty(), RegexCharacterRule(character_set = [259, 262], rule_name =
+'expr0')]), RegexCharacter(character_set = [45, 46])]), RegexCharacterRule(character_set
+= [288, 295], rule_name = 'whitespace_opt')]), RegexCharacterRule(character_set = [
262, 265], rule_name = 'expr1')])], group_index = 0, group_name = 'Subtract')])
], nonterminal = 3), Grammar.Production(children = [RegexSequence(children = [
-RegexEmpty(), RegexCharacterRule(char_set = [265, 268], rule_name = 'expr2')])
+RegexEmpty(), RegexCharacterRule(character_set = [265, 268], rule_name = 'expr2')])
], nonterminal = 4), Grammar.Production(children = [RegexSequence(children = [
RegexEmpty(), RegexGroup(children = [RegexSequence(children = [RegexSequence(
children = [RegexSequence(children = [RegexSequence(children = [RegexEmpty(),
-RegexCharacterRule(char_set = [262, 265], rule_name = 'expr1')]),
-RegexCharacter(char_set = [42, 43])]), RegexCharacterRule(char_set = [288, 295
-], rule_name = 'whitespace_opt')]), RegexCharacterRule(char_set = [265, 268],
+RegexCharacterRule(character_set = [262, 265], rule_name = 'expr1')]),
+RegexCharacter(character_set = [42, 43])]), RegexCharacterRule(character_set = [288, 295
+], rule_name = 'whitespace_opt')]), RegexCharacterRule(character_set = [265, 268],
rule_name = 'expr2')])], group_index = 0, group_name = 'Multiply')])],
nonterminal = 5), Grammar.Production(children = [RegexSequence(children = [
RegexEmpty(), RegexGroup(children = [RegexSequence(children = [RegexSequence(
children = [RegexSequence(children = [RegexSequence(children = [RegexEmpty(),
-RegexCharacterRule(char_set = [262, 265], rule_name = 'expr1')]),
-RegexCharacter(char_set = [47, 48])]), RegexCharacterRule(char_set = [288, 295
-], rule_name = 'whitespace_opt')]), RegexCharacterRule(char_set = [265, 268],
+RegexCharacterRule(character_set = [262, 265], rule_name = 'expr1')]),
+RegexCharacter(character_set = [47, 48])]), RegexCharacterRule(character_set = [288, 295
+], rule_name = 'whitespace_opt')]), RegexCharacterRule(character_set = [265, 268],
rule_name = 'expr2')])], group_index = 0, group_name = 'Divide')])],
nonterminal = 6), Grammar.Production(children = [RegexSequence(children = [
RegexSequence(children = [RegexEmpty(), RegexGroup(children = [RegexSequence(
-children = [RegexEmpty(), RegexCharacterRule(char_set = [268, 288], rule_name =
+children = [RegexEmpty(), RegexCharacterRule(character_set = [268, 288], rule_name =
'number')])], group_index = 0, group_name = 'Number')]), RegexCharacterRule(
-char_set = [288, 295], rule_name = 'whitespace_opt')])], nonterminal = 7),
+character_set = [288, 295], rule_name = 'whitespace_opt')])], nonterminal = 7),
Grammar.Production(children = [RegexSequence(children = [RegexEmpty(),
RegexGroup(children = [RegexSequence(children = [RegexSequence(children = [
-RegexSequence(children = [RegexEmpty(), RegexCharacter(char_set = [45, 46])]),
-RegexCharacterRule(char_set = [288, 295], rule_name = 'whitespace_opt')]),
-RegexCharacterRule(char_set = [265, 268], rule_name = 'expr2')])], group_index
+RegexSequence(children = [RegexEmpty(), RegexCharacter(character_set = [45, 46])]),
+RegexCharacterRule(character_set = [288, 295], rule_name = 'whitespace_opt')]),
+RegexCharacterRule(character_set = [265, 268], rule_name = 'expr2')])], group_index
= 0, group_name = 'Negate')])], nonterminal = 8), Grammar.Production(children =
[RegexSequence(children = [RegexSequence(children = [RegexSequence(children = [
RegexSequence(children = [RegexSequence(children = [RegexEmpty(),
-RegexCharacter(char_set = [40, 41])]), RegexCharacterRule(char_set = [288, 295
-], rule_name = 'whitespace_opt')]), RegexCharacterRule(char_set = [259, 262],
-rule_name = 'expr0')]), RegexCharacter(char_set = [41, 42])]),
-RegexCharacterRule(char_set = [288, 295], rule_name = 'whitespace_opt')])],
+RegexCharacter(character_set = [40, 41])]), RegexCharacterRule(character_set = [288, 295
+], rule_name = 'whitespace_opt')]), RegexCharacterRule(character_set = [259, 262],
+rule_name = 'expr0')]), RegexCharacter(character_set = [41, 42])]),
+RegexCharacterRule(character_set = [288, 295], rule_name = 'whitespace_opt')])],
nonterminal = 9), Grammar.Production(children = [RegexSequence(children = [
-RegexEmpty(), RegexCharacter(char_set = [48, 49])])], nonterminal = 10),
+RegexEmpty(), RegexCharacter(character_set = [48, 49])])], nonterminal = 10),
Grammar.Production(children = [RegexSequence(children = [RegexEmpty(),
-RegexCharacter(char_set = [49, 50])])], nonterminal = 11), Grammar.Production(
-children = [RegexSequence(children = [RegexEmpty(), RegexCharacter(char_set = [
+RegexCharacter(character_set = [49, 50])])], nonterminal = 11), Grammar.Production(
+children = [RegexSequence(children = [RegexEmpty(), RegexCharacter(character_set = [
50, 51])])], nonterminal = 12), Grammar.Production(children = [RegexSequence(
-children = [RegexEmpty(), RegexCharacter(char_set = [51, 52])])], nonterminal =
+children = [RegexEmpty(), RegexCharacter(character_set = [51, 52])])], nonterminal =
13), Grammar.Production(children = [RegexSequence(children = [RegexEmpty(),
-RegexCharacter(char_set = [52, 53])])], nonterminal = 14), Grammar.Production(
-children = [RegexSequence(children = [RegexEmpty(), RegexCharacter(char_set = [
+RegexCharacter(character_set = [52, 53])])], nonterminal = 14), Grammar.Production(
+children = [RegexSequence(children = [RegexEmpty(), RegexCharacter(character_set = [
53, 54])])], nonterminal = 15), Grammar.Production(children = [RegexSequence(
-children = [RegexEmpty(), RegexCharacter(char_set = [54, 55])])], nonterminal =
+children = [RegexEmpty(), RegexCharacter(character_set = [54, 55])])], nonterminal =
16), Grammar.Production(children = [RegexSequence(children = [RegexEmpty(),
-RegexCharacter(char_set = [55, 56])])], nonterminal = 17), Grammar.Production(
-children = [RegexSequence(children = [RegexEmpty(), RegexCharacter(char_set = [
+RegexCharacter(character_set = [55, 56])])], nonterminal = 17), Grammar.Production(
+children = [RegexSequence(children = [RegexEmpty(), RegexCharacter(character_set = [
56, 57])])], nonterminal = 18), Grammar.Production(children = [RegexSequence(
-children = [RegexEmpty(), RegexCharacter(char_set = [57, 58])])], nonterminal =
+children = [RegexEmpty(), RegexCharacter(character_set = [57, 58])])], nonterminal =
19), Grammar.Production(children = [RegexSequence(children = [RegexSequence(
-children = [RegexEmpty(), RegexCharacterRule(char_set = [268, 288], rule_name =
-'number')]), RegexCharacter(char_set = [48, 49])])], nonterminal = 20),
+children = [RegexEmpty(), RegexCharacterRule(character_set = [268, 288], rule_name =
+'number')]), RegexCharacter(character_set = [48, 49])])], nonterminal = 20),
Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
-= [RegexEmpty(), RegexCharacterRule(char_set = [268, 288], rule_name = 'number'
-)]), RegexCharacter(char_set = [49, 50])])], nonterminal = 21),
+= [RegexEmpty(), RegexCharacterRule(character_set = [268, 288], rule_name = 'number'
+)]), RegexCharacter(character_set = [49, 50])])], nonterminal = 21),
Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
-= [RegexEmpty(), RegexCharacterRule(char_set = [268, 288], rule_name = 'number'
-)]), RegexCharacter(char_set = [50, 51])])], nonterminal = 22),
+= [RegexEmpty(), RegexCharacterRule(character_set = [268, 288], rule_name = 'number'
+)]), RegexCharacter(character_set = [50, 51])])], nonterminal = 22),
Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
-= [RegexEmpty(), RegexCharacterRule(char_set = [268, 288], rule_name = 'number'
-)]), RegexCharacter(char_set = [51, 52])])], nonterminal = 23),
+= [RegexEmpty(), RegexCharacterRule(character_set = [268, 288], rule_name = 'number'
+)]), RegexCharacter(character_set = [51, 52])])], nonterminal = 23),
Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
-= [RegexEmpty(), RegexCharacterRule(char_set = [268, 288], rule_name = 'number'
-)]), RegexCharacter(char_set = [52, 53])])], nonterminal = 24),
+= [RegexEmpty(), RegexCharacterRule(character_set = [268, 288], rule_name = 'number'
+)]), RegexCharacter(character_set = [52, 53])])], nonterminal = 24),
Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
-= [RegexEmpty(), RegexCharacterRule(char_set = [268, 288], rule_name = 'number'
-)]), RegexCharacter(char_set = [53, 54])])], nonterminal = 25),
+= [RegexEmpty(), RegexCharacterRule(character_set = [268, 288], rule_name = 'number'
+)]), RegexCharacter(character_set = [53, 54])])], nonterminal = 25),
Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
-= [RegexEmpty(), RegexCharacterRule(char_set = [268, 288], rule_name = 'number'
-)]), RegexCharacter(char_set = [54, 55])])], nonterminal = 26),
+= [RegexEmpty(), RegexCharacterRule(character_set = [268, 288], rule_name = 'number'
+)]), RegexCharacter(character_set = [54, 55])])], nonterminal = 26),
Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
-= [RegexEmpty(), RegexCharacterRule(char_set = [268, 288], rule_name = 'number'
-)]), RegexCharacter(char_set = [55, 56])])], nonterminal = 27),
+= [RegexEmpty(), RegexCharacterRule(character_set = [268, 288], rule_name = 'number'
+)]), RegexCharacter(character_set = [55, 56])])], nonterminal = 27),
Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
-= [RegexEmpty(), RegexCharacterRule(char_set = [268, 288], rule_name = 'number'
-)]), RegexCharacter(char_set = [56, 57])])], nonterminal = 28),
+= [RegexEmpty(), RegexCharacterRule(character_set = [268, 288], rule_name = 'number'
+)]), RegexCharacter(character_set = [56, 57])])], nonterminal = 28),
Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
-= [RegexEmpty(), RegexCharacterRule(char_set = [268, 288], rule_name = 'number'
-)]), RegexCharacter(char_set = [57, 58])])], nonterminal = 29),
+= [RegexEmpty(), RegexCharacterRule(character_set = [268, 288], rule_name = 'number'
+)]), RegexCharacter(character_set = [57, 58])])], nonterminal = 29),
Grammar.Production(children = [RegexEmpty()], nonterminal = 30),
Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
-= [RegexEmpty(), RegexCharacterRule(char_set = [288, 295], rule_name =
-'whitespace_opt')]), RegexCharacter(char_set = [9, 10])])], nonterminal = 31),
+= [RegexEmpty(), RegexCharacterRule(character_set = [288, 295], rule_name =
+'whitespace_opt')]), RegexCharacter(character_set = [9, 10])])], nonterminal = 31),
Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
-= [RegexEmpty(), RegexCharacterRule(char_set = [288, 295], rule_name =
-'whitespace_opt')]), RegexCharacter(char_set = [10, 11])])], nonterminal = 32),
+= [RegexEmpty(), RegexCharacterRule(character_set = [288, 295], rule_name =
+'whitespace_opt')]), RegexCharacter(character_set = [10, 11])])], nonterminal = 32),
Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
-= [RegexEmpty(), RegexCharacterRule(char_set = [288, 295], rule_name =
-'whitespace_opt')]), RegexCharacter(char_set = [11, 12])])], nonterminal = 33),
+= [RegexEmpty(), RegexCharacterRule(character_set = [288, 295], rule_name =
+'whitespace_opt')]), RegexCharacter(character_set = [11, 12])])], nonterminal = 33),
Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
-= [RegexEmpty(), RegexCharacterRule(char_set = [288, 295], rule_name =
-'whitespace_opt')]), RegexCharacter(char_set = [12, 13])])], nonterminal = 34),
+= [RegexEmpty(), RegexCharacterRule(character_set = [288, 295], rule_name =
+'whitespace_opt')]), RegexCharacter(character_set = [12, 13])])], nonterminal = 34),
Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
-= [RegexEmpty(), RegexCharacterRule(char_set = [288, 295], rule_name =
-'whitespace_opt')]), RegexCharacter(char_set = [13, 14])])], nonterminal = 35),
+= [RegexEmpty(), RegexCharacterRule(character_set = [288, 295], rule_name =
+'whitespace_opt')]), RegexCharacter(character_set = [13, 14])])], nonterminal = 35),
Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
-= [RegexEmpty(), RegexCharacterRule(char_set = [288, 295], rule_name =
-'whitespace_opt')]), RegexCharacter(char_set = [32, 33])])], nonterminal = 36)
-], terminal_thres = 258)
+= [RegexEmpty(), RegexCharacterRule(character_set = [288, 295], rule_name =
+'whitespace_opt')]), RegexCharacter(character_set = [32, 33])])], nonterminal = 36)
+], n_terminals = 258)
#sys.stdout.write(
# wrap_repr(
# ' grammar = {0:s}'.format(repr(grammar).replace('regex.', '')),