--- /dev/null
+import bisect
+import element
+import work
+import sys
+
# defines the alphabet size, set this to 0x11000 for unicode
# (a "char set" in this module is a sorted list of breakpoints over
# [0, chars): even-indexed entries open a member run, odd-indexed close it)
chars = 0x100
+
def char_set_or(char_set0, char_set1):
    '''Return the union of two character sets.

    Each set is a sorted breakpoint list: membership toggles at every entry,
    so even indices open a run and odd indices close it. Walk the merged
    breakpoints of both inputs; at each one the union's truth value is the
    "or" (max) of the two parities, and a breakpoint is emitted whenever the
    output's parity has to toggle to agree with it.
    '''
    result = []
    i = j = 0
    n0 = len(char_set0)
    n1 = len(char_set1)
    while i < n0 or j < n1:
        # next breakpoint is the smallest pending boundary of either input
        if j >= n1 or (i < n0 and char_set0[i] <= char_set1[j]):
            k = char_set0[i]
        else:
            k = char_set1[j]
        # consume the breakpoint from whichever input(s) it came from
        if i < n0 and char_set0[i] == k:
            i += 1
        if j < n1 and char_set1[j] == k:
            j += 1
        # (len(result) & 1) is the output's current parity; toggle if needed
        if (len(result) & 1) != max(i & 1, j & 1):
            result.append(k)
    assert (i & 1) == 0 and (j & 1) == 0
    return result
+
def char_set_and(char_set0, char_set1):
    '''Return the intersection of two character sets.

    Same breakpoint-merge walk as char_set_or(), except the truth value at
    each breakpoint is the "and" (min) of the two parities: a character is
    kept only while both inputs are inside a member run.
    '''
    result = []
    i = j = 0
    n0 = len(char_set0)
    n1 = len(char_set1)
    while i < n0 or j < n1:
        # next breakpoint is the smallest pending boundary of either input
        if j >= n1 or (i < n0 and char_set0[i] <= char_set1[j]):
            k = char_set0[i]
        else:
            k = char_set1[j]
        # consume the breakpoint from whichever input(s) it came from
        if i < n0 and char_set0[i] == k:
            i += 1
        if j < n1 and char_set1[j] == k:
            j += 1
        # (len(result) & 1) is the output's current parity; toggle if needed
        if (len(result) & 1) != min(i & 1, j & 1):
            result.append(k)
    assert (i & 1) == 0 and (j & 1) == 0
    return result
+
def char_set_not(char_set, n_chars = None):
    '''Return the complement of a character set within [0, n_chars).

    n_chars is the exclusive upper bound of the alphabet; it defaults to the
    module-level "chars" constant, so existing one-argument callers are
    unaffected (the parameter generalizes the previously hard-coded bound).

    Implementation: if the child set begins with [0], remove it, otherwise
    add a [0] prefix; if it ends with [n_chars], remove it, otherwise append
    [n_chars]. The suffix part is not totally necessary, but makes sure the
    length stays even (the evenness is so that single character sets can
    always be [c, c + 1]).
    '''
    if n_chars is None:
        n_chars = chars
    result = list(char_set)
    if result[:1] == [0]:
        del result[:1]
    else:
        result[:0] = [0]
    if result[-1:] == [n_chars]:
        del result[-1:]
    else:
        result.append(n_chars)
    return result
+
class Regex(element.Element):
    '''Base class for regex AST nodes (serializable via element.Element).'''
    # GENERATE ELEMENT() BEGIN
    def __init__(
        self,
        tag = 'Regex',
        attrib = {},
        text = '',
        children = []
    ):
        element.Element.__init__(
            self,
            tag,
            attrib,
            text,
            children
        )
    def copy(self, factory = None):
        result = element.Element.copy(
            self,
            Regex if factory is None else factory
        )
        return result
    def __repr__(self):
        params = []
        self.repr_serialize(params)
        return 'regex.Regex({0:s})'.format(', '.join(params))
    # GENERATE END
    def post_process(self, group_index = 0, rule_char_sets = None):
        # recursively assign capture group numbers (and let subclasses resolve
        # named rule char sets); returns the next free group index
        for i in self:
            group_index = i.post_process(group_index, rule_char_sets)
        return group_index
    def to_groups(self, groups):
        # collect (group_name, attributes) descriptors, indexed by group_index
        for i in self:
            i.to_groups(groups)
    def to_nfa_state(self, nfa, next_state):
        # subclasses must translate themselves into NFA states; bug fix: was
        # "raise NotImplementedException", which is not a Python builtin and
        # would itself have raised NameError
        raise NotImplementedError
    def add_to_nfa(self, nfa):
        # compile this regex and register its start state with the NFA
        # (state 0 is the accepting state by convention)
        nfa.start_state.append(self.to_nfa_state(nfa, 0))
    def to_lr1_symbols(self, terminal_thres, symbols, lookaheads, group_bounds):
        group_count = 0
        for i in self:
            group_count += (
                i.to_lr1_symbols(terminal_thres, symbols, lookaheads, group_bounds)
            )
        return group_count # count of groups or ungrouped characters
+
class RegexNone(Regex):
    '''Regex matching nothing at all (the empty language, not the empty string).'''
    # GENERATE ELEMENT() BEGIN
    def __init__(
        self,
        tag = 'RegexNone',
        attrib = {},
        text = '',
        children = []
    ):
        Regex.__init__(
            self,
            tag,
            attrib,
            text,
            children
        )
    def copy(self, factory = None):
        result = Regex.copy(
            self,
            RegexNone if factory is None else factory
        )
        return result
    def __repr__(self):
        params = []
        self.repr_serialize(params)
        return 'regex.RegexNone({0:s})'.format(', '.join(params))
    # GENERATE END
    def to_nfa_state(self, nfa, next_state):
        # -1 is the "no state" marker: this alternative can never match
        return -1
+
class RegexEmpty(Regex):
    '''Regex matching the empty string (succeeds without consuming input).'''
    # GENERATE ELEMENT() BEGIN
    def __init__(
        self,
        tag = 'RegexEmpty',
        attrib = {},
        text = '',
        children = []
    ):
        Regex.__init__(
            self,
            tag,
            attrib,
            text,
            children
        )
    def copy(self, factory = None):
        result = Regex.copy(
            self,
            RegexEmpty if factory is None else factory
        )
        return result
    def __repr__(self):
        params = []
        self.repr_serialize(params)
        return 'regex.RegexEmpty({0:s})'.format(', '.join(params))
    # GENERATE END
    def to_nfa_state(self, nfa, next_state):
        # consumes nothing, so just fall straight through to next_state
        return next_state
+
class RegexCharacter(Regex):
    '''Regex matching one character from char_set.

    char_set is a sorted breakpoint list of even length: each pair
    [char_set[2 * i], char_set[2 * i + 1]) is a half-open member range.
    '''
    # GENERATE ELEMENT(list(int) char_set) BEGIN
    def __init__(
        self,
        tag = 'RegexCharacter',
        attrib = {},
        text = '',
        children = [],
        char_set = []
    ):
        Regex.__init__(
            self,
            tag,
            attrib,
            text,
            children
        )
        self.char_set = (
            [element.deserialize_int(i) for i in char_set.split()]
            if isinstance(char_set, str) else
            char_set
        )
    def serialize(self, ref_list, indent = 0):
        Regex.serialize(self, ref_list, indent)
        self.set(
            'char_set',
            ' '.join([element.serialize_int(i) for i in self.char_set])
        )
    def deserialize(self, ref_list):
        Regex.deserialize(self, ref_list)
        self.char_set = [
            element.deserialize_int(i)
            for i in self.get('char_set', '').split()
        ]
    def copy(self, factory = None):
        result = Regex.copy(
            self,
            RegexCharacter if factory is None else factory
        )
        result.char_set = self.char_set
        return result
    def repr_serialize(self, params):
        Regex.repr_serialize(self, params)
        if len(self.char_set):
            params.append(
                'char_set = [{0:s}]'.format(
                    ', '.join([repr(i) for i in self.char_set])
                )
            )
    def __repr__(self):
        params = []
        self.repr_serialize(params)
        return 'regex.RegexCharacter({0:s})'.format(', '.join(params))
    # GENERATE END
    def to_nfa_state(self, nfa, next_state):
        # one NFA state that consumes a single character from char_set
        new_state = len(nfa.states)
        nfa.states.append((NFA.STATE_CHARACTER, self.char_set, next_state))
        return new_state
    def to_lr1_symbols(self, terminal_thres, symbols, lookaheads, group_bounds):
        # split char_set at terminal_thres: codes below the threshold are
        # terminals, codes at or above it are nonterminals rebased to zero
        # (char_set is sorted, so once a range crosses the threshold every
        # later range is purely nonterminal)
        terminal_set = []
        nonterminal_set = []
        i = 0
        while i < len(self.char_set):
            [j, k] = self.char_set[i:i + 2]
            if k > terminal_thres:
                if j < terminal_thres:
                    # this range straddles the threshold: split it in two
                    terminal_set.extend([j, terminal_thres])
                    nonterminal_set.extend([0, k - terminal_thres])
                    i += 2
                while i < len(self.char_set):
                    [j, k] = self.char_set[i:i + 2]
                    nonterminal_set.extend([j - terminal_thres, k - terminal_thres])
                    i += 2
                break
            terminal_set.extend([j, k])
            i += 2
        symbols.append((terminal_set, nonterminal_set))
        lookaheads.append(([], False)) # initial_set, can_be_empty
        return 1 # count of groups or ungrouped characters
+
class RegexCharacterRange(RegexCharacter):
    '''Character class built from a range such as [a-z]; the two children are
    single-character RegexCharacter nodes giving the endpoints.'''
    # GENERATE ELEMENT() BEGIN
    def __init__(
        self,
        tag = 'RegexCharacterRange',
        attrib = {},
        text = '',
        children = [],
        char_set = []
    ):
        RegexCharacter.__init__(
            self,
            tag,
            attrib,
            text,
            children,
            char_set
        )
    def copy(self, factory = None):
        result = RegexCharacter.copy(
            self,
            RegexCharacterRange if factory is None else factory
        )
        return result
    def __repr__(self):
        params = []
        self.repr_serialize(params)
        return 'regex.RegexCharacterRange({0:s})'.format(', '.join(params))
    # GENERATE END
    def post_process(self, group_index = 0, rule_char_sets = None):
        # derive own char_set from first child's low bound and second child's
        # high bound (children's char_sets are assumed single ranges)
        group_index = RegexCharacter.post_process(self, group_index, rule_char_sets)
        self.char_set = [self[0].char_set[0], self[1].char_set[-1]]
        return group_index
+
class RegexCharacterOr(RegexCharacter):
    '''Character class that is the union of its two children's char_sets.'''
    # GENERATE ELEMENT() BEGIN
    def __init__(
        self,
        tag = 'RegexCharacterOr',
        attrib = {},
        text = '',
        children = [],
        char_set = []
    ):
        RegexCharacter.__init__(
            self,
            tag,
            attrib,
            text,
            children,
            char_set
        )
    def copy(self, factory = None):
        result = RegexCharacter.copy(
            self,
            RegexCharacterOr if factory is None else factory
        )
        return result
    def __repr__(self):
        params = []
        self.repr_serialize(params)
        return 'regex.RegexCharacterOr({0:s})'.format(', '.join(params))
    # GENERATE END
    def post_process(self, group_index = 0, rule_char_sets = None):
        # compute own char_set once the children's sets are final
        group_index = RegexCharacter.post_process(self, group_index, rule_char_sets)
        self.char_set = char_set_or(self[0].char_set, self[1].char_set)
        return group_index
+
class RegexCharacterAnd(RegexCharacter):
    '''Character class that is the intersection of its two children's char_sets.'''
    # GENERATE ELEMENT() BEGIN
    def __init__(
        self,
        tag = 'RegexCharacterAnd',
        attrib = {},
        text = '',
        children = [],
        char_set = []
    ):
        RegexCharacter.__init__(
            self,
            tag,
            attrib,
            text,
            children,
            char_set
        )
    def copy(self, factory = None):
        result = RegexCharacter.copy(
            self,
            RegexCharacterAnd if factory is None else factory
        )
        return result
    def __repr__(self):
        params = []
        self.repr_serialize(params)
        return 'regex.RegexCharacterAnd({0:s})'.format(', '.join(params))
    # GENERATE END
    def post_process(self, group_index = 0, rule_char_sets = None):
        # compute own char_set once the children's sets are final
        group_index = RegexCharacter.post_process(self, group_index, rule_char_sets)
        self.char_set = char_set_and(self[0].char_set, self[1].char_set)
        return group_index
+
class RegexCharacterNot(RegexCharacter):
    '''Character class that is the complement of its single child's char_set.'''
    # GENERATE ELEMENT() BEGIN
    def __init__(
        self,
        tag = 'RegexCharacterNot',
        attrib = {},
        text = '',
        children = [],
        char_set = []
    ):
        RegexCharacter.__init__(
            self,
            tag,
            attrib,
            text,
            children,
            char_set
        )
    def copy(self, factory = None):
        result = RegexCharacter.copy(
            self,
            RegexCharacterNot if factory is None else factory
        )
        return result
    def __repr__(self):
        params = []
        self.repr_serialize(params)
        return 'regex.RegexCharacterNot({0:s})'.format(', '.join(params))
    # GENERATE END
    def post_process(self, group_index = 0, rule_char_sets = None):
        # compute own char_set once the child's set is final
        group_index = RegexCharacter.post_process(self, group_index, rule_char_sets)
        self.char_set = char_set_not(self[0].char_set)
        return group_index
+
class RegexCharacterRule(RegexCharacter):
    '''Character class defined by reference to a named rule; the concrete
    char_set is looked up from rule_char_sets during post_process().'''
    # GENERATE ELEMENT(str rule_name) BEGIN
    def __init__(
        self,
        tag = 'RegexCharacterRule',
        attrib = {},
        text = '',
        children = [],
        char_set = [],
        rule_name = ''
    ):
        RegexCharacter.__init__(
            self,
            tag,
            attrib,
            text,
            children,
            char_set
        )
        self.rule_name = rule_name
    def serialize(self, ref_list, indent = 0):
        RegexCharacter.serialize(self, ref_list, indent)
        self.set('rule_name', element.serialize_str(self.rule_name))
    def deserialize(self, ref_list):
        RegexCharacter.deserialize(self, ref_list)
        self.rule_name = element.deserialize_str(self.get('rule_name', ''))
    def copy(self, factory = None):
        result = RegexCharacter.copy(
            self,
            RegexCharacterRule if factory is None else factory
        )
        result.rule_name = self.rule_name
        return result
    def repr_serialize(self, params):
        RegexCharacter.repr_serialize(self, params)
        if self.rule_name != '':
            params.append(
                'rule_name = {0:s}'.format(repr(self.rule_name))
            )
    def __repr__(self):
        params = []
        self.repr_serialize(params)
        return 'regex.RegexCharacterRule({0:s})'.format(', '.join(params))
    # GENERATE END
    def post_process(self, group_index = 0, rule_char_sets = None):
        # resolve the named rule to a concrete char_set when a mapping is
        # provided; with rule_char_sets None the reference is left unresolved
        if rule_char_sets is not None:
            self.char_set = rule_char_sets[self.rule_name]
        return RegexCharacter.post_process(self, group_index, rule_char_sets)
+
class RegexOr(Regex):
    '''Alternation of two sub-regexes (a|b).'''
    # GENERATE ELEMENT() BEGIN
    def __init__(
        self,
        tag = 'RegexOr',
        attrib = {},
        text = '',
        children = []
    ):
        Regex.__init__(
            self,
            tag,
            attrib,
            text,
            children
        )
    def copy(self, factory = None):
        result = Regex.copy(
            self,
            RegexOr if factory is None else factory
        )
        return result
    def __repr__(self):
        params = []
        self.repr_serialize(params)
        return 'regex.RegexOr({0:s})'.format(', '.join(params))
    # GENERATE END
    def to_nfa_state(self, nfa, next_state):
        # build both alternatives; a child returning -1 can never match and
        # simply drops out of the alternation
        child0_state = self[0].to_nfa_state(nfa, next_state)
        child1_state = self[1].to_nfa_state(nfa, next_state)
        if child0_state == -1:
            return child1_state
        if child1_state == -1:
            return child0_state
        new_state = len(nfa.states)
        nfa.states.append((NFA.STATE_OR, child0_state, child1_state))
        return new_state
+
class RegexAnd(Regex):
    '''Conjunction of two sub-regexes: both children must match the same text.'''
    # GENERATE ELEMENT() BEGIN
    def __init__(
        self,
        tag = 'RegexAnd',
        attrib = {},
        text = '',
        children = []
    ):
        Regex.__init__(
            self,
            tag,
            attrib,
            text,
            children
        )
    def copy(self, factory = None):
        result = Regex.copy(
            self,
            RegexAnd if factory is None else factory
        )
        return result
    def __repr__(self):
        params = []
        self.repr_serialize(params)
        return 'regex.RegexAnd({0:s})'.format(', '.join(params))
    # GENERATE END
    def to_nfa_state(self, nfa, next_state):
        # each child terminates in its own join state (join0/join1) so the
        # scanner can detect when both halves of the conjunction have matched
        join0_state = len(nfa.states)
        nfa.states.append(NFA.join0_state) # takes no arguments so use static one
        join1_state = len(nfa.states)
        nfa.states.append((NFA.STATE_JOIN1, next_state))
        child0_state = self[0].to_nfa_state(nfa, join0_state)
        if child0_state == -1:
            return -1
        child1_state = self[1].to_nfa_state(nfa, join1_state)
        if child1_state == -1:
            return -1
        new_state = len(nfa.states)
        nfa.states.append((NFA.STATE_AND, child0_state, child1_state))
        return new_state
+
class RegexSequence(Regex):
    '''Concatenation of two sub-regexes (ab).'''
    # GENERATE ELEMENT() BEGIN
    def __init__(
        self,
        tag = 'RegexSequence',
        attrib = {},
        text = '',
        children = []
    ):
        Regex.__init__(
            self,
            tag,
            attrib,
            text,
            children
        )
    def copy(self, factory = None):
        result = Regex.copy(
            self,
            RegexSequence if factory is None else factory
        )
        return result
    def __repr__(self):
        params = []
        self.repr_serialize(params)
        return 'regex.RegexSequence({0:s})'.format(', '.join(params))
    # GENERATE END
    def to_nfa_state(self, nfa, next_state):
        # build right-to-left: the second child's start state becomes the
        # first child's next_state
        child1_state = self[1].to_nfa_state(nfa, next_state)
        if child1_state == -1:
            return -1
        return self[0].to_nfa_state(nfa, child1_state)
+
class RegexRepeat(Regex):
    '''Repetition of a sub-regex: between count0 and count1 copies, where
    count1 == -1 means unbounded; non_greedy prefers the shorter match.'''
    # GENERATE ELEMENT(int count0, int count1, bool non_greedy) BEGIN
    def __init__(
        self,
        tag = 'RegexRepeat',
        attrib = {},
        text = '',
        children = [],
        count0 = -1,
        count1 = -1,
        non_greedy = False
    ):
        Regex.__init__(
            self,
            tag,
            attrib,
            text,
            children
        )
        self.count0 = (
            element.deserialize_int(count0)
            if isinstance(count0, str) else
            count0
        )
        self.count1 = (
            element.deserialize_int(count1)
            if isinstance(count1, str) else
            count1
        )
        self.non_greedy = (
            element.deserialize_bool(non_greedy)
            if isinstance(non_greedy, str) else
            non_greedy
        )
    def serialize(self, ref_list, indent = 0):
        Regex.serialize(self, ref_list, indent)
        self.set('count0', element.serialize_int(self.count0))
        self.set('count1', element.serialize_int(self.count1))
        self.set('non_greedy', element.serialize_bool(self.non_greedy))
    def deserialize(self, ref_list):
        Regex.deserialize(self, ref_list)
        self.count0 = element.deserialize_int(self.get('count0', '-1'))
        self.count1 = element.deserialize_int(self.get('count1', '-1'))
        self.non_greedy = element.deserialize_bool(self.get('non_greedy', 'false'))
    def copy(self, factory = None):
        result = Regex.copy(
            self,
            RegexRepeat if factory is None else factory
        )
        result.count0 = self.count0
        result.count1 = self.count1
        result.non_greedy = self.non_greedy
        return result
    def repr_serialize(self, params):
        Regex.repr_serialize(self, params)
        if self.count0 != -1:
            params.append(
                'count0 = {0:s}'.format(repr(self.count0))
            )
        if self.count1 != -1:
            params.append(
                'count1 = {0:s}'.format(repr(self.count1))
            )
        if self.non_greedy != False:
            params.append(
                'non_greedy = {0:s}'.format(repr(self.non_greedy))
            )
    def __repr__(self):
        params = []
        self.repr_serialize(params)
        return 'regex.RegexRepeat({0:s})'.format(', '.join(params))
    # GENERATE END
    def post_process(self, group_index = 0, rule_char_sets = None):
        # total hack which will be done in a Python action in future:
        # absorb parsed {m,n} Number children into count0/count1
        if len(self) >= 2:
            assert self[1].tag == 'Number'
            self.count0 = int(self[1].text)
            if len(self) >= 3:
                assert self[2].tag == 'Number'
                self.count1 = int(self[2].text)
            else:
                self.count1 = self.count0
            del self[1:]
        # end total hack
        return Regex.post_process(self, group_index, rule_char_sets)
    def to_nfa_state(self, nfa, next_state):
        count0 = self.count0
        count1 = self.count1
        if count1 == -1:
            # unbounded tail: a looping OR state (placeholder appended first
            # because the child must be built knowing the loop state's index)
            new_state = len(nfa.states)
            nfa.states.append(None)
            child_state = self[0].to_nfa_state(nfa, new_state)
            if child_state == -1:
                new_state = next_state # note: unreachable state remains invalid (None)
            else:
                nfa.states[new_state] = (
                    (NFA.STATE_OR, next_state, child_state)
                    if self.non_greedy else
                    (NFA.STATE_OR, child_state, next_state)
                )
        else:
            # bounded tail: count1 - count0 optional copies, each an OR
            # between skipping ahead and taking one more copy
            new_state = next_state
            for i in range(count1 - count0):
                child_state = self[0].to_nfa_state(nfa, new_state)
                if child_state == -1:
                    break
                new_state = len(nfa.states)
                nfa.states.append(
                    (NFA.STATE_OR, next_state, child_state)
                    if self.non_greedy else
                    (NFA.STATE_OR, child_state, next_state)
                )
        # count0 mandatory copies in front of whatever tail was built
        for i in range(count0):
            new_state = self[0].to_nfa_state(nfa, new_state)
            if new_state == -1:
                break
        return new_state
+
class RegexGroup(Regex):
    '''Capture group: assigns itself a group_index during post_process() and
    brackets its child with STATE_MARK states in the NFA so the scanner can
    record the matched span.'''
    class Attribute(element.Element):
        # a named attribute attached to a group (passed through to markup)
        # GENERATE ELEMENT(str name, str value) BEGIN
        def __init__(
            self,
            tag = 'RegexGroup_Attribute',
            attrib = {},
            text = '',
            children = [],
            name = '',
            value = ''
        ):
            element.Element.__init__(
                self,
                tag,
                attrib,
                text,
                children
            )
            self.name = name
            self.value = value
        def serialize(self, ref_list, indent = 0):
            element.Element.serialize(self, ref_list, indent)
            self.set('name', element.serialize_str(self.name))
            self.set('value', element.serialize_str(self.value))
        def deserialize(self, ref_list):
            element.Element.deserialize(self, ref_list)
            self.name = element.deserialize_str(self.get('name', ''))
            self.value = element.deserialize_str(self.get('value', ''))
        def copy(self, factory = None):
            # bug fix: bare 'Attribute' raised NameError at call time, since
            # class-body names are not in scope inside methods; qualify it
            result = element.Element.copy(
                self,
                RegexGroup.Attribute if factory is None else factory
            )
            result.name = self.name
            result.value = self.value
            return result
        def repr_serialize(self, params):
            element.Element.repr_serialize(self, params)
            if self.name != '':
                params.append(
                    'name = {0:s}'.format(repr(self.name))
                )
            if self.value != '':
                params.append(
                    'value = {0:s}'.format(repr(self.value))
                )
        def __repr__(self):
            params = []
            self.repr_serialize(params)
            return 'regex.RegexGroup.Attribute({0:s})'.format(', '.join(params))
        # GENERATE END

    # GENERATE ELEMENT(int group_index, str group_name, list(ref) group_attributes) BEGIN
    def __init__(
        self,
        tag = 'RegexGroup',
        attrib = {},
        text = '',
        children = [],
        group_index = -1,
        group_name = '',
        group_attributes = []
    ):
        Regex.__init__(
            self,
            tag,
            attrib,
            text,
            children
        )
        self.group_index = (
            element.deserialize_int(group_index)
            if isinstance(group_index, str) else
            group_index
        )
        self.group_name = group_name
        self.group_attributes = group_attributes
    def serialize(self, ref_list, indent = 0):
        Regex.serialize(self, ref_list, indent)
        self.set('group_index', element.serialize_int(self.group_index))
        self.set('group_name', element.serialize_str(self.group_name))
        self.set(
            'group_attributes',
            ' '.join([element.serialize_ref(i, ref_list) for i in self.group_attributes])
        )
    def deserialize(self, ref_list):
        Regex.deserialize(self, ref_list)
        self.group_index = element.deserialize_int(self.get('group_index', '-1'))
        self.group_name = element.deserialize_str(self.get('group_name', ''))
        self.group_attributes = [
            element.deserialize_ref(i, ref_list)
            for i in self.get('group_attributes', '').split()
        ]
    def copy(self, factory = None):
        result = Regex.copy(
            self,
            RegexGroup if factory is None else factory
        )
        result.group_index = self.group_index
        result.group_name = self.group_name
        result.group_attributes = self.group_attributes
        return result
    def repr_serialize(self, params):
        Regex.repr_serialize(self, params)
        if self.group_index != -1:
            params.append(
                'group_index = {0:s}'.format(repr(self.group_index))
            )
        if self.group_name != '':
            params.append(
                'group_name = {0:s}'.format(repr(self.group_name))
            )
        if len(self.group_attributes):
            params.append(
                'group_attributes = [{0:s}]'.format(
                    ', '.join([repr(i) for i in self.group_attributes])
                )
            )
    def __repr__(self):
        params = []
        self.repr_serialize(params)
        return 'regex.RegexGroup({0:s})'.format(', '.join(params))
    # GENERATE END
    def post_process(self, group_index = 0, rule_char_sets = None):
        # total hack which will be done in a Python action in future:
        # absorb a parsed GroupName child (stripping its delimiters)
        if len(self) >= 2:
            assert self[0].tag == 'GroupName'
            self.group_name = self[0].text[1:-1]
            del self[:1]
        # end total hack
        # claim the next group number, then let children claim later ones
        self.group_index = group_index
        group_index += 1
        return Regex.post_process(self, group_index, rule_char_sets)
    def to_groups(self, groups):
        # append own descriptor at position group_index, then recurse
        assert len(groups) == self.group_index
        groups.append(
            (self.group_name, {i.name: i.value for i in self.group_attributes})
        )
        return Regex.to_groups(self, groups)
    def to_nfa_state(self, nfa, next_state):
        # bracket the child with MARK states: even mark value = group start,
        # odd = group end (built in reverse, so the end mark is created first)
        mark_state = len(nfa.states)
        nfa.states.append((NFA.STATE_MARK, self.group_index * 2 + 1, next_state))
        child_state = self[0].to_nfa_state(nfa, mark_state)
        if child_state == -1:
            return -1
        new_state = len(nfa.states)
        nfa.states.append((NFA.STATE_MARK, self.group_index * 2, child_state))
        return new_state
    def to_lr1_symbols(self, terminal_thres, symbols, lookaheads, group_bounds):
        # reserve our group_bounds slot first so nested groups index correctly
        group_start = len(symbols)
        assert self.group_index == len(group_bounds)
        group_bounds.append(None)
        group_count = Regex.to_lr1_symbols(
            self,
            terminal_thres,
            symbols,
            lookaheads,
            group_bounds
        )
        group_bounds[self.group_index] = (
            group_start,
            group_count,
            self.group_name,
            {i.name: i.value for i in self.group_attributes}
        )
        return 1 # count of groups or ungrouped characters
+
class Grammar(element.Element):
    '''Root element holding one Production child per nonterminal; converts
    the whole grammar into an LR1 description with propagated lookaheads.'''
    class Production(element.Element):
        # one production: nonterminal index, priority and associativity
        # GENERATE ELEMENT(int nonterminal, int priority, bool right_to_left) BEGIN
        def __init__(
            self,
            tag = 'Grammar_Production',
            attrib = {},
            text = '',
            children = [],
            nonterminal = -1,
            priority = -1,
            right_to_left = False
        ):
            element.Element.__init__(
                self,
                tag,
                attrib,
                text,
                children
            )
            self.nonterminal = (
                element.deserialize_int(nonterminal)
                if isinstance(nonterminal, str) else
                nonterminal
            )
            self.priority = (
                element.deserialize_int(priority)
                if isinstance(priority, str) else
                priority
            )
            self.right_to_left = (
                element.deserialize_bool(right_to_left)
                if isinstance(right_to_left, str) else
                right_to_left
            )
        def serialize(self, ref_list, indent = 0):
            element.Element.serialize(self, ref_list, indent)
            self.set('nonterminal', element.serialize_int(self.nonterminal))
            self.set('priority', element.serialize_int(self.priority))
            self.set('right_to_left', element.serialize_bool(self.right_to_left))
        def deserialize(self, ref_list):
            element.Element.deserialize(self, ref_list)
            self.nonterminal = element.deserialize_int(self.get('nonterminal', '-1'))
            self.priority = element.deserialize_int(self.get('priority', '-1'))
            self.right_to_left = element.deserialize_bool(self.get('right_to_left', 'false'))
        def copy(self, factory = None):
            # bug fix: bare 'Production' raised NameError at call time, since
            # class-body names are not in scope inside methods; qualify it
            result = element.Element.copy(
                self,
                Grammar.Production if factory is None else factory
            )
            result.nonterminal = self.nonterminal
            result.priority = self.priority
            result.right_to_left = self.right_to_left
            return result
        def repr_serialize(self, params):
            element.Element.repr_serialize(self, params)
            if self.nonterminal != -1:
                params.append(
                    'nonterminal = {0:s}'.format(repr(self.nonterminal))
                )
            if self.priority != -1:
                params.append(
                    'priority = {0:s}'.format(repr(self.priority))
                )
            if self.right_to_left != False:
                params.append(
                    'right_to_left = {0:s}'.format(repr(self.right_to_left))
                )
        def __repr__(self):
            params = []
            self.repr_serialize(params)
            return 'regex.Grammar.Production({0:s})'.format(', '.join(params))
        # GENERATE END

    # GENERATE ELEMENT(int terminal_thres) BEGIN
    def __init__(
        self,
        tag = 'Grammar',
        attrib = {},
        text = '',
        children = [],
        terminal_thres = -1
    ):
        element.Element.__init__(
            self,
            tag,
            attrib,
            text,
            children
        )
        self.terminal_thres = (
            element.deserialize_int(terminal_thres)
            if isinstance(terminal_thres, str) else
            terminal_thres
        )
    def serialize(self, ref_list, indent = 0):
        element.Element.serialize(self, ref_list, indent)
        self.set('terminal_thres', element.serialize_int(self.terminal_thres))
    def deserialize(self, ref_list):
        element.Element.deserialize(self, ref_list)
        self.terminal_thres = element.deserialize_int(self.get('terminal_thres', '-1'))
    def copy(self, factory = None):
        result = element.Element.copy(
            self,
            Grammar if factory is None else factory
        )
        result.terminal_thres = self.terminal_thres
        return result
    def repr_serialize(self, params):
        element.Element.repr_serialize(self, params)
        if self.terminal_thres != -1:
            params.append(
                'terminal_thres = {0:s}'.format(repr(self.terminal_thres))
            )
    def __repr__(self):
        params = []
        self.repr_serialize(params)
        return 'regex.Grammar({0:s})'.format(', '.join(params))
    # GENERATE END
    def post_process(self, rule_char_sets):
        # number the productions, then post-process each production's regex
        for i in range(len(self)):
            self[i].nonterminal = i
            self[i][0].post_process(0, rule_char_sets)
    def to_lr1(self):
        # NOTE(review): LR1 is not defined in this chunk -- presumably defined
        # later in the file or supplied by an import; verify before relying on it
        lr1 = LR1([], self.terminal_thres)
        for i in self:
            symbols = []
            lookaheads = []
            group_bounds = []
            i[0].to_lr1_symbols(
                self.terminal_thres,
                symbols,
                lookaheads,
                group_bounds
            )
            lookaheads.append(([], True)) # initial_set, can_be_empty (sentinel)
            lr1.productions.append(
                (
                    i.priority * 2 + int(i.right_to_left),
                    symbols,
                    lookaheads,
                    group_bounds
                )
            )
        # propagate lookaheads to a fixed point (right-to-left over each
        # production so a symbol sees its successors' already-updated sets)
        modified = True
        while modified:
            modified = False
            for _, symbols, lookaheads, _ in lr1.productions:
                for i in range(len(symbols) - 1, -1, -1):
                    initial_set, nonterminal_set = symbols[i]
                    can_be_empty = False
                    for j in range(0, len(nonterminal_set), 2):
                        for k in range(nonterminal_set[j], nonterminal_set[j + 1]):
                            child_initial_set, child_can_be_empty = lr1.productions[k][2][0]
                            initial_set = char_set_or(initial_set, child_initial_set)
                            can_be_empty = can_be_empty or child_can_be_empty
                    # at this point can_be_empty refers to current symbol only
                    if can_be_empty:
                        next_initial_set, can_be_empty = lookaheads[i + 1]
                        initial_set = char_set_or(initial_set, next_initial_set)
                    # at this point can_be_empty refers to all remaining symbols
                    if (initial_set, can_be_empty) != lookaheads[i]:
                        lookaheads[i] = (initial_set, can_be_empty)
                        modified = True
        return lr1
+
# GENERATE FACTORY(element.Element) BEGIN
tag_to_class = {
    'Regex': Regex,
    'RegexNone': RegexNone,
    'RegexEmpty': RegexEmpty,
    'RegexCharacter': RegexCharacter,
    'RegexCharacterRange': RegexCharacterRange,
    'RegexCharacterOr': RegexCharacterOr,
    'RegexCharacterAnd': RegexCharacterAnd,
    'RegexCharacterNot': RegexCharacterNot,
    'RegexCharacterRule': RegexCharacterRule,
    'RegexOr': RegexOr,
    'RegexAnd': RegexAnd,
    'RegexSequence': RegexSequence,
    'RegexRepeat': RegexRepeat,
    'RegexGroup': RegexGroup,
    'RegexGroup_Attribute': RegexGroup.Attribute,
    'Grammar': Grammar,
    'Grammar_Production': Grammar.Production
}
def factory(tag, attrib = {}, *args, **kwargs):
    # instantiate the class registered for tag, falling back to the generic
    # element.Element for unknown tags (used when deserializing trees)
    return tag_to_class.get(tag, element.Element)(tag, attrib, *args, **kwargs)
# GENERATE END
+
+class NFA:
+ # state_desc classes:
+ # (STATE_CHARACTER, char_set, next_state)
+ # (STATE_OR, next_state0, next_state1)
+ # (STATE_AND, next_state0, next_state1)
+ # (STATE_JOIN0,)
+ # (STATE_JOIN1, next_state)
+ # (STATE_MARK, mark_value, next_state)
+
+ STATE_CHARACTER = 0
+ STATE_OR = 1
+ STATE_AND = 2
+ STATE_JOIN0 = 3
+ STATE_JOIN1 = 4
+ STATE_MARK = 5
+ join0_state = (STATE_JOIN0,)
+
+ # multistate classes:
+ # (MULTISTATE_ACCEPT, 1) accept, occupies one thread in list
+ # (MULTISTATE_AND, n, state, child) and, occupies n threads in list
+ # (MULTISTATE_OR, n, child0, child1)
+ # n = sum of n from child states, that is, multistate[1] is the number of
+ # (MULTISTATE_ACCEPT, 1) leaf states ultimately reachable from this subtree
+
+ MULTISTATE_ACCEPT = 0
+ MULTISTATE_AND = 1
+ MULTISTATE_OR = 2
+ accept_multistate = (MULTISTATE_ACCEPT, 1)
+
+ def __init__(
+ self,
+ groups = [],
+ states = [(STATE_CHARACTER, [0, chars], 0)],
+ start_state = [] # can have multiple NFAs in same container
+ ):
+ # groups: list of group_desc
+ # group_desc: (tag, kwargs)
+ # tag, kwargs will be passed to apply_markup() hence factory()
+ self.groups = groups
+ self.states = states
+ self.start_state = start_state
+
+ def multistate_next(self, root_multistate, char):
+ # the deduplication works as effectively a second pass which goes
+ # over the multistate tree in pre-order, looking for OR-disjunctions
+ # of any depth and configuration, e.g. (a OR b) or (c OR d), and
+ # collecting the subexpressions e.g. a, b, c OR b, c, d, c OR d, and
+ # failing any subexpression that's previously occurred in pre-order
+ # (children of AND-conjunctions get a fresh empty subexpression set)
+
+ # unfortunately it can't be done as a second pass literally, because
+ # we have to generate the transition list, thus we need to know if a
+ # subexpression is going to be failed, so we can delete it's threads
+
+ # therefore the deduplication is tacked onto the end of the building
+ # process, examining each built node just before the builder returns,
+ # if already in the subexpression set we will rewrite the transition
+ # list produced by the recursive calls, otherwise we will add it in
+
+ # the problem is determining the subexpression set to pass into any
+ # recursive calls, the caller's set will be sent unchanged by cases
+ # that return the result unchanged or add an OR-disjuction, an empty
+ # set will be sent by cases that add an AND-conjuction to the result
+
+ # if we have added something to the node built by a recursive call,
+ # we'll fall into the deduplication logic, otherwise just return it
+
    def advance1(multistate, join_count, done_multistates):
      # modifies nonlocal: transition
      # Take one epsilon-style step from a MULTISTATE_AND node: follow the
      # current NFA state (OR / AND / JOIN0 / JOIN1 / MARK) until the node
      # comes to rest on a character state or the accepting state, emitting
      # thread-list instructions into the enclosing `transition` list as it
      # goes.  Returns the advanced multistate subtree, or None when this
      # alternative cannot proceed (its threads are then popped).
      assert multistate[0] == NFA.MULTISTATE_AND
      _, _, state, child = multistate
      if state == 0:
        # note that if we reach the accepting state, we must be a top-level
        # expression, and could not be part of an AND-conjunction (because
        # AND-conjunction terms always go to a join0 or join1 state first),
        # we remove ourselves to indicate to scanner that match is complete
        assert join_count == 0
        assert child == NFA.accept_multistate
        return advance(child, 0, done_multistates)
      state_desc = self.states[state]
      if state_desc[0] == NFA.STATE_CHARACTER:
        if join_count != 0:
          # a character state cannot satisfy an outstanding join, so this
          # conjunction term fails: discard its threads
          transition.append((DFA.TRANSITION_POP, child[1]))
          return None
        len_transition = len(transition)
        # fresh done_multistates set: the subtree below a character state
        # is deduplicated independently of the current closure
        child = advance(child, 0, set())
        if child is None:
          return None
        result = (NFA.MULTISTATE_AND, child[1], state, child)
      elif state_desc[0] == NFA.STATE_OR:
        _, next_state0, next_state1 = state_desc
        len_transition = len(transition)
        # duplicate this node's threads so each alternative owns a copy
        transition.append((DFA.TRANSITION_DUP, child[1]))
        child0 = advance1(
          (NFA.MULTISTATE_AND, child[1], next_state0, child),
          join_count,
          done_multistates
        )
        child1 = advance1(
          (NFA.MULTISTATE_AND, child[1], next_state1, child),
          join_count,
          done_multistates
        )
        if child0 is None:
          return child1
        if child1 is None:
          return child0
        result = (NFA.MULTISTATE_OR, child0[1] + child1[1], child0, child1)
      elif state_desc[0] == NFA.STATE_AND:
        _, next_state0, next_state1 = state_desc
        # conjunction: stack the second term beneath the first; the first
        # term must pass through a join0/join1 pair before the second runs
        return advance1(
          (
            NFA.MULTISTATE_AND,
            child[1],
            next_state0,
            (NFA.MULTISTATE_AND, child[1], next_state1, child)
          ),
          join_count,
          done_multistates
        )
      elif state_desc[0] == NFA.STATE_JOIN0:
        return advance(child, join_count + 1, done_multistates)
      elif state_desc[0] == NFA.STATE_JOIN1:
        _, next_state = state_desc
        if join_count == 0:
          # join1 with no pending join0: this alternative fails
          transition.append((DFA.TRANSITION_POP, child[1]))
          return None
        return advance1(
          (NFA.MULTISTATE_AND, child[1], next_state, child),
          join_count - 1,
          done_multistates
        )
      elif state_desc[0] == NFA.STATE_MARK:
        _, mark_value, next_state = state_desc
        # record a group-boundary mark on all threads of this node
        transition.append((DFA.TRANSITION_MARK, child[1], mark_value))
        return advance1(
          (NFA.MULTISTATE_AND, child[1], next_state, child),
          join_count,
          done_multistates
        )
      else:
        assert False
      # deduplicate: if an identical subtree was already produced during
      # this closure, roll back our emitted instructions and pop instead
      if result in done_multistates:
        transition[len_transition:] = [(DFA.TRANSITION_POP, multistate[1])]
        return None
      done_multistates.add(result)
      return result
+
    def advance(multistate, join_count, done_multistates):
      nonlocal char0, char1 # modifies nonlocal: transition
      # Advance a whole multistate tree over the current input character
      # `char` (closure variable of the enclosing method; -1 appears to
      # mean "initial closure, no input consumed" -- see the callers that
      # pass char -1).  Narrows the nonlocal interval [char0, char1) around
      # `char` so the caller can reuse this result for the whole character
      # range that takes the same decisions.  Returns the new multistate,
      # or None if no thread survives.
      if multistate[0] == NFA.MULTISTATE_ACCEPT:
        assert join_count == 0
        len_transition = len(transition)
        transition.append((DFA.TRANSITION_MOVE, 1))
        result = NFA.accept_multistate # takes no arguments so use static one
      elif multistate[0] == NFA.MULTISTATE_AND:
        if char >= 0:
          _, _, state, child = multistate
          state_desc = self.states[state]
          assert state_desc[0] == NFA.STATE_CHARACTER
          _, char_set, next_state = state_desc
          # locate char in the breakpoint list; an odd insertion point
          # means char is inside the set (see char_set_* encoding)
          k = bisect.bisect_right(char_set, char)
          if k > 0 and char0 < char_set[k - 1]:
            char0 = char_set[k - 1]
          if k < len(char_set) and char1 > char_set[k]:
            char1 = char_set[k]
          if (k & 1) == 0:
            # char not in this state's character set: threads die
            transition.append((DFA.TRANSITION_POP, child[1]))
            return None
          multistate = (NFA.MULTISTATE_AND, child[1], next_state, child)
        return advance1(multistate, join_count, done_multistates)
      elif multistate[0] == NFA.MULTISTATE_OR:
        _, _, child0, child1 = multistate
        len_transition = len(transition)
        child0 = advance(child0, join_count, done_multistates)
        child1 = advance(child1, join_count, done_multistates)
        if child0 is None:
          return child1
        if child1 is None:
          return child0
        result = (NFA.MULTISTATE_OR, child0[1] + child1[1], child0, child1)
      else:
        assert False
      # deduplicate identical result subtrees within this closure
      if result in done_multistates:
        transition[len_transition:] = [(DFA.TRANSITION_POP, multistate[1])]
        return None
      done_multistates.add(result)
      return result
+
+ transition = []
+ char0 = 0
+ char1 = chars
+ root_multistate = advance(root_multistate, 0, set())
+ return root_multistate, transition, char0, char1
+
+ def multistate_accept(root_multistate):
+ i = 0
+ def accept(multistate):
+ nonlocal i
+ if multistate[0] == NFA.MULTISTATE_ACCEPT:
+ return True
+ if multistate[0] == NFA.MULTISTATE_AND:
+ _, _, _, child = multistate
+ i += child[1]
+ return False
+ if multistate[0] == NFA.MULTISTATE_OR:
+ _, _, child0, child1 = multistate
+ return accept(child0) or accept(child1)
+ assert False
+ return i if accept(root_multistate) else -1
+
  def match_text(self, text, i, start_index = 0):
    # Run the NFA directly over `text` starting at index `i`, interpreting
    # multistates on the fly (no DFA construction).  Returns the winning
    # thread -- a linked chain of (position, mark_value, next) records,
    # newest first -- or None when there is no (complete or partial) match.
    def transit(transition):
      nonlocal threads0, threads1, prefix_slop # note: also uses i
      # Apply a compiled transition list to the thread stack.  threads0 is
      # the working list with prefix_slop spare slots at the front (so DUP
      # can copy downwards); threads1 collects the threads that survive
      # into the next multistate, then the two lists are swapped.
      j = prefix_slop
      for trans in transition:
        if trans[0] == DFA.TRANSITION_POP:
          j += trans[1]
        elif trans[0] == DFA.TRANSITION_DUP:
          # grow the spare area (doubling) until trans[1] slots fit
          while j < trans[1]:
            threads0[:0] = [None] * prefix_slop
            threads1[:0] = [None] * prefix_slop
            j += prefix_slop
            prefix_slop *= 2
          threads0[j - trans[1]:j] = threads0[j:j + trans[1]]
          j -= trans[1]
        elif trans[0] == DFA.TRANSITION_MARK:
          # prepend an (input position, mark) record to each thread
          threads0[j:j + trans[1]] = [
            (i, trans[2], thread)
            for thread in threads0[j:j + trans[1]]
          ]
        elif trans[0] == DFA.TRANSITION_MOVE:
          threads1.extend(threads0[j:j + trans[1]])
          j += trans[1]
        #elif trans[0] == DFA.TRANSITION_DEL:
        # del threads1[-trans[1]:]
        else:
          assert False
      assert j == len(threads0)
      threads0, threads1 = threads1, threads0
      del threads1[prefix_slop:]

    threads0 = [None, None]
    threads1 = [None]
    prefix_slop = 1

    start_state = self.start_state[start_index]
    if start_state == -1:
      return None
    # initial closure of the start state (char -1 means no input consumed)
    next_multistate, transition, _, _ = self.multistate_next(
      (NFA.MULTISTATE_AND, 1, start_state, NFA.accept_multistate),
      -1
    )
    while next_multistate is not None:
      transit(transition)
      assert len(threads0) == prefix_slop + next_multistate[1]
      if next_multistate == NFA.accept_multistate:
        # there is only one match, which is complete
        assert len(threads0) == prefix_slop + 1
        return threads0[prefix_slop]
      if i >= len(text):
        # return best match we have, but not incomplete match
        i = NFA.multistate_accept(next_multistate)
        return (None if i == -1 else threads0[prefix_slop + i])
      next_multistate, transition, _, _ = (
        self.multistate_next(next_multistate, ord(text[i]))
      )
      i += 1
    return None
+
  def match_yychunk(self, root, pos, off, yychunk_iter, start_index = 0):
    # As match_text(), but reads input from an element tree `root` at
    # (pos, off), pulling further chunks from `yychunk_iter` on demand.
    # Thread mark records here are (pos, off, mark_value, next).
    if pos < 0:
      pos, off = element.to_start_relative(root, pos, off)

    def transit(transition):
      nonlocal threads0, threads1, prefix_slop # note: also uses pos, off
      # Apply a compiled transition list to the thread stack; see the
      # transit() helper in match_text() -- identical except marks record
      # the two-part (pos, off) input position.
      j = prefix_slop
      for trans in transition:
        if trans[0] == DFA.TRANSITION_POP:
          j += trans[1]
        elif trans[0] == DFA.TRANSITION_DUP:
          # grow the spare area (doubling) until trans[1] slots fit
          while j < trans[1]:
            threads0[:0] = [None] * prefix_slop
            threads1[:0] = [None] * prefix_slop
            j += prefix_slop
            prefix_slop *= 2
          threads0[j - trans[1]:j] = threads0[j:j + trans[1]]
          j -= trans[1]
        elif trans[0] == DFA.TRANSITION_MARK:
          threads0[j:j + trans[1]] = [
            (pos, off, trans[2], thread)
            for thread in threads0[j:j + trans[1]]
          ]
        elif trans[0] == DFA.TRANSITION_MOVE:
          threads1.extend(threads0[j:j + trans[1]])
          j += trans[1]
        #elif trans[0] == DFA.TRANSITION_DEL:
        # del threads1[-trans[1]:]
        else:
          assert False
      assert j == len(threads0)
      threads0, threads1 = threads1, threads0
      del threads1[prefix_slop:]

    threads0 = [None, None]
    threads1 = [None]
    prefix_slop = 1

    start_state = self.start_state[start_index]
    if start_state == -1:
      return None
    # initial closure of the start state (char -1 means no input consumed)
    next_multistate, transition, _, _ = self.multistate_next(
      (NFA.MULTISTATE_AND, 1, start_state, NFA.accept_multistate),
      -1
    )
    text = element.get_text(root, pos)
    while next_multistate is not None:
      transit(transition)
      assert len(threads0) == prefix_slop + next_multistate[1]
      if next_multistate == NFA.accept_multistate:
        # there is only one match, which is complete
        assert len(threads0) == prefix_slop + 1
        return threads0[prefix_slop]
      while off >= len(text):
        # current chunk exhausted: step to the next element, or ask the
        # iterator to extend the tree with more input
        if pos < len(root):
          pos += 1
          off = 0
        else:
          try:
            next(yychunk_iter)
          except StopIteration:
            # return best match we have, but not incomplete match
            i = NFA.multistate_accept(next_multistate)
            return (None if i == -1 else threads0[prefix_slop + i])
        text = element.get_text(root, pos)
      next_multistate, transition, _, _ = (
        self.multistate_next(next_multistate, ord(text[off]))
      )
      off += 1
    return None
+
  def to_dfa(self):
    # Subset-construct a DFA from this NFA.  Each DFA state corresponds to
    # one canonical multistate; each DFA action is (next state, compiled
    # transition list).  Uses the [char0, char1) interval reported by
    # multistate_next() to cover the alphabet in runs of characters that
    # all take the same action, rather than one character at a time.
    dfa = DFA(list(self.groups))

    # action 0 / state 0 are reserved for the accepting multistate
    accept_key = (NFA.accept_multistate, ())
    action_to_meaning = [accept_key]
    meaning_to_action = {accept_key: 0}
    state_to_meaning = [NFA.accept_multistate]
    meaning_to_state = {NFA.accept_multistate: 0}

    for start_state in self.start_state:
      if start_state == -1:
        start_action = -1
      else:
        # initial closure of this start state (char -1 = no input)
        next_multistate, transition, _, _ = self.multistate_next(
          (NFA.MULTISTATE_AND, 1, start_state, NFA.accept_multistate),
          -1
        )
        if next_multistate is None:
          start_action = -1
        else:
          start_key = (next_multistate, tuple(transition))
          start_action = len(action_to_meaning)
          meaning_to_action[start_key] = start_action
          action_to_meaning.append(start_key)
      dfa.start_action.append(start_action)

    # fixpoint: materialize every discovered action, allocating a DFA
    # state for each distinct multistate; the nested loop below may
    # discover further actions, which re-enters this loop
    while len(dfa.actions) < len(action_to_meaning):
      next_multistate, transition = action_to_meaning[len(dfa.actions)]
      if next_multistate in meaning_to_state:
        next_state = meaning_to_state[next_multistate]
      else:
        next_state = len(state_to_meaning)
        state_to_meaning.append(next_multistate)
        meaning_to_state[next_multistate] = next_state
      dfa.actions.append((next_state, list(transition)))

      while len(dfa.states) < len(state_to_meaning):
        char = 0
        multistate = state_to_meaning[len(dfa.states)]
        state_desc = ([], [], NFA.multistate_accept(multistate))
        while char < chars:
          next_multistate, transition, char0, char1 = self.multistate_next(
            multistate,
            char
          )
          assert char0 == char and char1 > char
          if next_multistate is None:
            action = -1
          else:
            # optimize transition (optional): coalesce adjacent runs of
            # POP instructions and adjacent runs of MOVE instructions,
            # compacting the list in place
            i = 0
            j = 0
            while i < len(transition):
              if transition[i][0] == DFA.TRANSITION_POP:
                n = transition[i][1]
                i += 1
                while (
                  i < len(transition) and
                  transition[i][0] == DFA.TRANSITION_POP
                ):
                  n += transition[i][1]
                  i += 1
                transition[j] = (DFA.TRANSITION_POP, n)
              elif transition[i][0] == DFA.TRANSITION_MOVE:
                n = transition[i][1]
                i += 1
                while (
                  i < len(transition) and
                  transition[i][0] == DFA.TRANSITION_MOVE
                ):
                  n += transition[i][1]
                  i += 1
                transition[j] = (DFA.TRANSITION_MOVE, n)
              else:
                transition[j] = transition[i]
                i += 1
              j += 1
            del transition[j:]
            # end optimize transition
            key = (next_multistate, tuple(transition))
            if key in meaning_to_action:
              action = meaning_to_action[key]
            else:
              action = len(action_to_meaning)
              action_to_meaning.append(key)
              meaning_to_action[key] = action
          # one table entry covers the whole run [char, char1)
          state_desc[0].append(char1)
          state_desc[1].append(action)
          char = char1
        dfa.states.append(state_desc)
    return dfa
+
+ def __repr__(self):
+ return 'regex.NFA({0:s}, {1:s}, {2:s})'.format(
+ repr(self.groups),
+ repr(self.states),
+ repr(self.start_state)
+ )
+
class DFA:
  # transition classes:
  # instructions to transform thread list from a multistate to next multistate
  # (TRANSITION_POP, n) j += n
  # (TRANSITION_DUP, n) threads0[j - n:j] = threads0[j:j + n]
  #                     j -= n
  # (TRANSITION_MARK, n, mark_value) threads0[j:j + n] = [
  #                     (i, mark, thread)
  #                     for thread in threads0[j:j + n]
  #                   ]
  # (TRANSITION_MOVE, n) threads1.extend(threads0[j:j + n])
  #                     j += n
  # (TRANSITION_DEL, n) del threads1[-n:]

  TRANSITION_POP = 0
  TRANSITION_DUP = 1
  TRANSITION_MARK = 2
  TRANSITION_MOVE = 3
  #TRANSITION_DEL = 4

  def __init__(
    self,
    groups = None,
    states = None,
    actions = None,
    start_action = None # can have multiple DFAs in same container
  ):
    # groups: list of group_desc
    # group_desc: (tag, kwargs)
    # tag, kwargs will be passed to apply_markup() hence factory()
    # states: list of state_desc
    # state_desc: (list of breaks, list of action to do, accept_thread)
    # actions: list of action_desc
    # action_desc: (state to go to next, compiled transition to do first)
    # accept_thread: which thread of thread list to use, -1 don't accept
    # note: defaults are built per instance here; the previous mutable
    # list defaults in the signature were shared between every DFA built
    # with a default and were then mutated in place by NFA.to_dfa() via
    # append(), contaminating unrelated instances
    self.groups = [] if groups is None else groups
    self.states = [([chars], [0], 0)] if states is None else states
    self.actions = [(0, [])] if actions is None else actions
    self.start_action = [] if start_action is None else start_action

  def match_text(self, text, i, start_index = 0):
    # Run the DFA over `text` starting at index `i`.  Returns the winning
    # thread -- a linked chain of (position, mark_value, next) records,
    # newest first -- or None when there is no match.
    def transit(transition):
      nonlocal threads0, threads1, prefix_slop # note: also uses i
      # Apply a compiled transition list to the thread stack.  threads0 is
      # the working list with prefix_slop spare slots at the front (so DUP
      # can copy downwards); threads1 collects surviving threads, then the
      # two lists are swapped.
      j = prefix_slop
      for trans in transition:
        if trans[0] == DFA.TRANSITION_POP:
          j += trans[1]
        elif trans[0] == DFA.TRANSITION_DUP:
          # grow the spare area (doubling) until trans[1] slots fit
          while j < trans[1]:
            threads0[:0] = [None] * prefix_slop
            threads1[:0] = [None] * prefix_slop
            j += prefix_slop
            prefix_slop *= 2
          threads0[j - trans[1]:j] = threads0[j:j + trans[1]]
          j -= trans[1]
        elif trans[0] == DFA.TRANSITION_MARK:
          # prepend an (input position, mark) record to each thread
          threads0[j:j + trans[1]] = [
            (i, trans[2], thread)
            for thread in threads0[j:j + trans[1]]
          ]
        elif trans[0] == DFA.TRANSITION_MOVE:
          threads1.extend(threads0[j:j + trans[1]])
          j += trans[1]
        #elif trans[0] == DFA.TRANSITION_DEL:
        # del threads1[-trans[1]:]
        else:
          assert False
      assert j == len(threads0)
      threads0, threads1 = threads1, threads0
      del threads1[prefix_slop:]

    threads0 = [None, None]
    threads1 = [None]
    prefix_slop = 1

    action = self.start_action[start_index]
    while action != -1:
      state, transition = self.actions[action]
      transit(transition)
      if state == 0:
        # there is only one match, which is complete
        assert len(threads0) == prefix_slop + 1
        return threads0[prefix_slop]
      if i >= len(text):
        # return best match we have, but not incomplete match
        i = self.states[state][2]
        return (None if i == -1 else threads0[prefix_slop + i])
      action = self.states[state][1][
        bisect.bisect_right(self.states[state][0], ord(text[i]))
      ]
      i += 1
    return None

  def match_yychunk(self, root, pos, off, yychunk_iter, start_index = 0):
    # As match_text(), but reads input from an element tree `root` at
    # (pos, off), pulling further chunks from `yychunk_iter` on demand.
    # Thread mark records here are (pos, off, mark_value, next).
    if pos < 0:
      pos, off = element.to_start_relative(root, pos, off)

    def transit(transition):
      nonlocal threads0, threads1, prefix_slop # note: also uses pos, off
      # identical to match_text()'s transit() except marks record the
      # two-part (pos, off) input position
      j = prefix_slop
      for trans in transition:
        if trans[0] == DFA.TRANSITION_POP:
          j += trans[1]
        elif trans[0] == DFA.TRANSITION_DUP:
          # grow the spare area (doubling) until trans[1] slots fit
          while j < trans[1]:
            threads0[:0] = [None] * prefix_slop
            threads1[:0] = [None] * prefix_slop
            j += prefix_slop
            prefix_slop *= 2
          threads0[j - trans[1]:j] = threads0[j:j + trans[1]]
          j -= trans[1]
        elif trans[0] == DFA.TRANSITION_MARK:
          threads0[j:j + trans[1]] = [
            (pos, off, trans[2], thread)
            for thread in threads0[j:j + trans[1]]
          ]
        elif trans[0] == DFA.TRANSITION_MOVE:
          threads1.extend(threads0[j:j + trans[1]])
          j += trans[1]
        #elif trans[0] == DFA.TRANSITION_DEL:
        # del threads1[-trans[1]:]
        else:
          assert False
      assert j == len(threads0)
      threads0, threads1 = threads1, threads0
      del threads1[prefix_slop:]

    threads0 = [None, None]
    threads1 = [None]
    prefix_slop = 1

    action = self.start_action[start_index]
    text = element.get_text(root, pos)
    while action != -1:
      state, transition = self.actions[action]
      transit(transition)
      if state == 0:
        # there is only one match, which is complete
        assert len(threads0) == prefix_slop + 1
        return threads0[prefix_slop]
      while off >= len(text):
        # current chunk exhausted: step to the next element, or ask the
        # iterator to extend the tree with more input
        if pos < len(root):
          pos += 1
          off = 0
        else:
          try:
            next(yychunk_iter)
          except StopIteration:
            # return best match we have, but not incomplete match
            i = self.states[state][2]
            return (None if i == -1 else threads0[prefix_slop + i])
        text = element.get_text(root, pos)
      #print(
      # 'state {0:d} pos {1:d} off {2:d} text "{3:s}"'.format(
      #   state,
      #   pos,
      #   off,
      #   text.replace('\n', '$')
      # )
      #)
      action = self.states[state][1][
        bisect.bisect_right(self.states[state][0], ord(text[off]))
      ]
      off += 1
    return None

  def yylex(self, root, pos, off, factory, yychunk_iter):
    # Repeatedly match at (pos, off), applying group markup to the element
    # tree for each match and yielding (end_pos, end_off, group_index) per
    # match, until no further match is possible.
    if pos < 0:
      pos, off = element.to_start_relative(root, pos, off)

    while True:
      # note: pointers must be kept start relative during the below call,
      # because it extends the following text by calling the yychunk_iter
      thread = self.match_yychunk(root, pos, off, yychunk_iter)
      if thread is None:
        break
      stack = []
      while True:
        # walk the thread chain (newest mark first)
        pos, off, mark_value, thread = thread
        group_index = mark_value >> 1
        if (mark_value & 1) != 0:
          # odd mark: group end -- remember it until its start appears
          end_pos, end_off = element.to_end_relative(root, pos, off)
          stack.append((end_pos, end_off, group_index))
        else:
          # even mark: group start -- pair it with the pushed end
          end_pos, end_off, temp = stack.pop()
          assert temp == group_index
          if len(stack) == 0:
            # stack empty: this start closes the outermost recorded
            # group, so stop without applying markup to it
            break
          tag, kwargs = self.groups[group_index]
          if tag != '':
            work.apply_markup(
              root,
              pos,
              off,
              end_pos,
              end_off,
              factory,
              tag,
              **kwargs
            )
      # note: pointers must be kept end relative during the below call,
      # because it modifies the preceding text by calling apply_markup()
      yield end_pos, end_off, group_index
      pos, off = element.to_start_relative(root, end_pos, end_off)

  def __repr__(self):
    return 'regex.DFA({0:s}, {1:s}, {2:s}, {3:s})'.format(
      repr(self.groups),
      repr(self.states),
      repr(self.actions),
      repr(self.start_action)
    )
+
class LR1:
  def __init__(self, productions = None, terminal_thres = None):
    # productions: list of production
    # production: (
    #   priority,
    #   symbols,
    #   lookaheads,
    #   group_bounds
    # )
    # priority: bit 0 = right to left, bits 1: = numeric priority
    # symbols: list of symbol_desc
    # symbol_desc: (terminal_set, nonterminal_set)
    # terminal_set: similar to char_set, even length list of pairs of breaks
    # nonterminal_set: as above but has terminal_thres subtracted from breaks
    # lookaheads: list of lookahead_desc, len(lookaheads) = len(symbols) + 1
    # lookahead_desc: (initial_set, can_be_empty)
    # initial_set: what terminals can occur at this position in symbols array,
    # computed such that lookaheads[i][0] is symbols[i][0], plus initial set of
    # each nonterminal from symbols[i][1], plus lookaheads[i + 1][0] if any
    # nonterminal of symbols[i][1] can be empty (may pull in i + 2, i + 3 etc)
    # can_be_empty: whether all symbols from this position to end can be empty
    # note: last lookahead_desc is a sentinel consisting of ([], True), so that
    # initial_set is empty (will be augmented by context) and can_be_empty is
    # True (because all symbols from the end to the end can obviously be empty)
    # group_bounds: list of group_bound
    # group_bound: (start_index, end_index, tag, kwargs)
    # where start_index, end_index are indices into list of char_set,
    # and tag, kwargs will be passed to apply_markup() hence factory(),
    # noting that markup has to be applied in reverse order of the list
    # terminal_thres: offset to apply to productions[] index to get symbol
    # (char set code), also symbol for productions[0] = start production
    # note: None sentinels replace the previous mutable-list default (which
    # was shared between instances) and defer evaluation of the module
    # constant `chars` from class-definition time to call time
    self.productions = [] if productions is None else productions
    self.terminal_thres = chars if terminal_thres is None else terminal_thres

  def lookahead_item_set_closure(self, items, item_to_index):
    # Expand `items` in place to its LR(1) closure: for every item with a
    # nonterminal after the dot, add (or merge lookaheads into) an item at
    # position 0 of each production of that nonterminal.  Worklist-based
    # fixpoint; item_to_index maps (production, dot position) -> index.
    in_queue = [True for i in range(len(items))]
    queue = list(range(len(items)))

    qhead = 0
    while qhead < len(queue):
      i = queue[qhead]
      in_queue[i] = False
      j, k, lookahead_set = items[i]
      _, symbols, lookaheads, _ = self.productions[j]
      if k < len(symbols):
        _, nonterminal_set = symbols[k]
        if len(nonterminal_set):
          # lookahead for the children is what can follow this position;
          # if the rest of the production can be empty, our own lookahead
          # shows through as well
          next_lookahead_set, next_can_be_empty = lookaheads[k + 1]
          if next_can_be_empty:
            next_lookahead_set = char_set_or(
              next_lookahead_set,
              lookahead_set
            )
          for l in range(0, len(nonterminal_set), 2):
            for m in range(nonterminal_set[l], nonterminal_set[l + 1]):
              key = (m, 0)
              if key in item_to_index:
                n = item_to_index[key]
                child_lookahead_set = char_set_or(
                  items[n][2],
                  next_lookahead_set
                )
                if child_lookahead_set != items[n][2]:
                  items[n] = (m, 0, child_lookahead_set)
                  if not in_queue[n]:
                    # optimization: do not re-queue unless it is transparent
                    # to changes in the lookahead set wrt. further propagation
                    _, child_symbols, child_lookaheads, _ = (
                      self.productions[m]
                    )
                    if len(child_symbols) and child_lookaheads[1][1]:
                      in_queue[n] = True
                      queue.append(n)
              else:
                n = len(items)
                items.append((m, 0, next_lookahead_set))
                item_to_index[key] = n
                in_queue.append(True)
                queue.append(n)
      qhead += 1

  def lookahead_item_set_shift(self, items, terminal):
    # Shift every item over `terminal`.  Returns (next_items,
    # next_item_to_index, reductions, terminal0, terminal1) where
    # reductions is the set of productions whose lookahead admits
    # `terminal`, and [terminal0, terminal1) is the run of terminals for
    # which every decision taken here is identical.
    next_items = []
    next_item_to_index = {}
    reductions = set()
    terminal0 = 0
    terminal1 = self.terminal_thres
    for i, j, lookahead_set in items:
      _, symbols, _, _ = self.productions[i]
      if j < len(symbols):
        terminal_set, _ = symbols[j]
        # odd insertion point means terminal is inside the set
        k = bisect.bisect_right(terminal_set, terminal)
        if k > 0 and terminal0 < terminal_set[k - 1]:
          terminal0 = terminal_set[k - 1]
        if k < len(terminal_set) and terminal1 > terminal_set[k]:
          terminal1 = terminal_set[k]
        if (k & 1) == 1:
          next_item_to_index[(i, j + 1)] = len(next_items)
          next_items.append((i, j + 1, lookahead_set))
      else:
        # dot at the end: reduce if terminal is in the lookahead set
        k = bisect.bisect_right(lookahead_set, terminal)
        if k > 0 and terminal0 < lookahead_set[k - 1]:
          terminal0 = lookahead_set[k - 1]
        if k < len(lookahead_set) and terminal1 > lookahead_set[k]:
          terminal1 = lookahead_set[k]
        if (k & 1) == 1:
          reductions.add(i)
    return next_items, next_item_to_index, reductions, terminal0, terminal1

  def lookahead_item_set_goto(self, items, nonterminal):
    # Goto: advance every item whose dot precedes `nonterminal`.  Returns
    # (next_items, next_item_to_index, nonterminal0, nonterminal1) with
    # [nonterminal0, nonterminal1) the run of nonterminals sharing this
    # outcome.  (A dead `reductions` local has been removed; gotos never
    # produce reductions.)
    next_items = []
    next_item_to_index = {}
    nonterminal0 = 0
    nonterminal1 = len(self.productions)
    for i, j, lookahead_set in items:
      _, symbols, _, _ = self.productions[i]
      if j < len(symbols):
        _, nonterminal_set = symbols[j]
        k = bisect.bisect_right(nonterminal_set, nonterminal)
        if k > 0 and nonterminal0 < nonterminal_set[k - 1]:
          nonterminal0 = nonterminal_set[k - 1]
        if k < len(nonterminal_set) and nonterminal1 > nonterminal_set[k]:
          nonterminal1 = nonterminal_set[k]
        if (k & 1) == 1:
          next_item_to_index[(i, j + 1)] = len(next_items)
          next_items.append((i, j + 1, lookahead_set))
    return next_items, next_item_to_index, nonterminal0, nonterminal1

  def parse_text(self, text, i):
    # Parse `text` from index `i` by direct LR(1) item-set simulation (no
    # table construction), writing a trace of reduced spans to stderr.
    # Returns None after reducing production 0; raises on syntax error.
    items = [(0, 0, [chars, chars + 1])] # EOF
    item_to_index = {(0, 0): 0}
    value_stack = []
    state_stack = []
    lookahead_char = ord(text[i]) if i < len(text) else chars # EOF
    while True:
      self.lookahead_item_set_closure(items, item_to_index)
      value_stack.append(i)
      state_stack.append(items)
      items, item_to_index, reductions, _, _ = (
        self.lookahead_item_set_shift(items, lookahead_char)
      )
      if len(items) != 0:
        if len(reductions) != 0:
          # bug fix: this message previously referenced an undefined name
          # `next_items`, raising NameError whenever a conflict occurred;
          # `items` now holds the shifted item set
          sys.stderr.write(
            'shift/reduce conflict: {0:s} vs {1:s}\n'.format(
              ','.join([str(p) for p, _, _ in items]),
              ','.join([str(p) for p in reductions])
            )
          )
        i += 1
        lookahead_char = ord(text[i]) if i < len(text) else chars # EOF
      elif len(reductions) != 0:
        if len(reductions) != 1:
          sys.stderr.write(
            'reduce/reduce conflict: {0:s}\n'.format(
              ','.join([str(p) for p in reductions])
            )
          )
        reduce = min(reductions)
        _, symbols, _, group_bounds = self.productions[reduce]
        base = len(value_stack) - len(symbols) - 1
        for j in range(len(group_bounds) - 1, -1, -1):
          k, l, tag, _ = group_bounds[j]
          k += base
          if l != 1:
            # collapse l stack entries so k, k + 1 delimit the group
            value_stack[k + 1:k + l + 1] = [value_stack[k + l]]
          sys.stderr.write(
            'text \'{0:s}\' tag \'{1:s}\'\n'.format(
              text[value_stack[k]:value_stack[k + 1]],
              tag
            )
          )
        del value_stack[base + 1:]
        del state_stack[base + 1:]
        if reduce == 0:
          # production 0 is the start production: parse complete
          assert base == 0
          return
        items, item_to_index, _, _ = (
          self.lookahead_item_set_goto(state_stack[-1], reduce)
        )
        assert len(items) != 0
      else:
        raise Exception(
          'syntax error at {0:d}: {1:s}'.format(i, text[i:])
        )

  def parse_yychunk(self, root, pos, off, factory, yychunk_iter):
    # As parse_text(), but reads input from element tree `root` at
    # (pos, off), pulling chunks from yychunk_iter on demand, and applies
    # group markup to the tree via work.apply_markup() on each reduction.
    if pos < 0:
      pos, off = element.to_start_relative(root, pos, off)

    items = [(0, 0, [chars, chars + 1])] # EOF
    item_to_index = {(0, 0): 0}
    value_stack = []
    state_stack = []
    text = element.get_text(root, pos)
    while off >= len(text):
      if pos < len(root):
        pos += 1
        off = 0
      else:
        try:
          next(yychunk_iter)
        except StopIteration:
          lookahead_char = chars # EOF
          break
      text = element.get_text(root, pos)
    else:
      lookahead_char = ord(text[off])
    while True:
      self.lookahead_item_set_closure(items, item_to_index)
      value_stack.append((pos, off))
      state_stack.append(items)
      items, item_to_index, reductions, _, _ = (
        self.lookahead_item_set_shift(items, lookahead_char)
      )
      if len(items) != 0:
        if len(reductions) != 0:
          # bug fix: this message previously referenced an undefined name
          # `next_lookahead_item_set`, raising NameError whenever a
          # conflict occurred; `items` now holds the shifted item set
          sys.stderr.write(
            'shift/reduce conflict: {0:s} vs {1:s}\n'.format(
              ','.join([str(p) for p, _, _ in items]),
              ','.join([str(p) for p in reductions])
            )
          )
        off += 1
        while off >= len(text):
          if pos < len(root):
            pos += 1
            off = 0
          else:
            try:
              next(yychunk_iter)
            except StopIteration:
              lookahead_char = chars # EOF
              break
          text = element.get_text(root, pos)
        else:
          lookahead_char = ord(text[off])
      elif len(reductions) != 0:
        if len(reductions) != 1:
          sys.stderr.write(
            'reduce/reduce conflict: {0:s}\n'.format(
              ','.join([str(p) for p in reductions])
            )
          )
        reduce = min(reductions)
        _, symbols, _, group_bounds = self.productions[reduce]
        base = len(value_stack) - len(symbols) - 1
        # entries above end_relative have been converted to end-relative
        # pointers (markup below shifts the preceding text)
        end_relative = len(value_stack)
        for j in range(len(group_bounds) - 1, -1, -1):
          k, l, tag, kwargs = group_bounds[j]
          k += base
          assert k < end_relative
          if l != 1:
            value_stack[k + 1:k + l + 1] = [value_stack[k + l]]
            end_relative = max(k + 1, end_relative + 1 - l)
          while end_relative > k + 1:
            end_relative -= 1
            pos1, off1 = value_stack[end_relative]
            value_stack[end_relative] = (
              element.to_end_relative(root, pos1, off1)
            )
          pos0, off0 = value_stack[k]
          pos1, off1 = value_stack[k + 1]
          work.apply_markup(
            root,
            pos0,
            off0,
            pos1,
            off1,
            factory,
            tag,
            **kwargs
          )
        if end_relative < len(value_stack):
          # restore the scan position from the (end-relative) stack top
          pos, off = value_stack[-1]
          pos, off = element.to_start_relative(root, pos, off)
          text = element.get_text(root, pos)
        del value_stack[base + 1:]
        del state_stack[base + 1:]
        if reduce == 0:
          # production 0 is the start production: parse complete
          assert base == 0
          return
        items, item_to_index, _, _ = (
          self.lookahead_item_set_goto(state_stack[-1], reduce)
        )
        assert len(items) != 0
      else:
        raise Exception(
          'syntax error at {0:d},{1:d}: {2:s}'.format(pos, off, text[off:])
        )

  def to_clr1(self):
    # Build a canonical LR(1) automaton: states are distinguished by the
    # full (production, position, lookahead set) item sets.
    lr1dfa = LR1DFA(
      [],
      [
        (len(symbols), group_bounds)
        for _, symbols, _, group_bounds in self.productions
      ],
      self.terminal_thres
    )

    items = [(0, 0, [chars, chars + 1])] # EOF
    item_to_index = {(0, 0): 0}
    self.lookahead_item_set_closure(items, item_to_index)

    items = sorted(items)
    key = tuple((i, j, tuple(k)) for i, j, k in items)
    state_to_items = [items]
    items_to_state = {key: 0}

    while len(lr1dfa.states) < len(state_to_items):
      items = state_to_items[len(lr1dfa.states)]
      state_desc = ([], [], [], [])

      def add_state(next_items, next_item_to_index):
        # intern the closure of next_items, returning its state number
        self.lookahead_item_set_closure(next_items, next_item_to_index)
        new_items = sorted(next_items)
        key = tuple((i, j, tuple(k)) for i, j, k in new_items)
        if key in items_to_state:
          state = items_to_state[key]
        else:
          state = len(state_to_items)
          state_to_items.append(new_items)
          items_to_state[key] = state
        return state

      terminal = 0
      while terminal < self.terminal_thres:
        next_items, next_item_to_index, reductions, terminal0, terminal1 = (
          self.lookahead_item_set_shift(items, terminal)
        )
        assert terminal0 == terminal and terminal1 > terminal
        if len(next_items) != 0:
          if len(reductions) != 0:
            sys.stderr.write(
              'state {0:d} shift/reduce conflict: {1:s} vs {2:s}\n'.format(
                len(lr1dfa.states),
                ','.join([str(i) for i, _, _ in next_items]),
                ','.join([str(i) for i in reductions])
              )
            )
          action = add_state(next_items, next_item_to_index) * 2
        elif len(reductions) != 0:
          if len(reductions) != 1:
            sys.stderr.write(
              'state {0:d} reduce/reduce conflict: {1:s}\n'.format(
                len(lr1dfa.states),
                ','.join([str(i) for i in reductions])
              )
            )
          action = min(reductions) * 2 + 1 # odd means reduce
        else:
          action = -1
        state_desc[0].append(terminal1)
        state_desc[1].append(action)
        terminal = terminal1

      nonterminal = 0
      while nonterminal < len(self.productions):
        next_items, next_item_to_index, nonterminal0, nonterminal1 = (
          self.lookahead_item_set_goto(items, nonterminal)
        )
        assert nonterminal0 == nonterminal and nonterminal1 > nonterminal
        if len(next_items) != 0:
          goto = add_state(next_items, next_item_to_index)
        else:
          goto = -1
        state_desc[2].append(nonterminal1)
        state_desc[3].append(goto)
        nonterminal = nonterminal1

      lr1dfa.states.append(state_desc)
    return lr1dfa

  def to_lalr1(self):
    # Build a LALR(1) automaton: states are distinguished only by their
    # (production, position) cores; lookahead sets of merged states are
    # unioned, and affected states are re-processed via a worklist.
    lr1dfa = LR1DFA(
      [],
      [
        (len(symbols), group_bounds)
        for _, symbols, _, group_bounds in self.productions
      ],
      self.terminal_thres
    )

    items = [(0, 0, [chars, chars + 1])] # EOF
    item_to_index = {(0, 0): 0}
    self.lookahead_item_set_closure(items, item_to_index)

    items = sorted(items)
    key = tuple((i, j) for i, j, _ in items) # ignore lookahead
    state_to_items = [items]
    items_to_state = {key: 0}

    in_queue = [True]
    queue = [0]

    qhead = 0
    while qhead < len(queue):
      i = queue[qhead]
      in_queue[i] = False
      items = state_to_items[i]
      state_desc = ([], [], [], [])

      def add_state(next_items, next_item_to_index):
        # intern the closure of next_items by core; merge lookaheads into
        # an existing state and re-queue it if anything changed
        self.lookahead_item_set_closure(next_items, next_item_to_index)
        new_items = sorted(next_items)
        key = tuple((i, j) for i, j, _ in new_items) # ignore lookahead
        if key in items_to_state:
          state = items_to_state[key]
          state_items = state_to_items[state]
          for i in range(len(new_items)):
            j, k, lookahead_set = new_items[i]
            lookahead_set = char_set_or(lookahead_set, state_items[i][2])
            if lookahead_set != state_items[i][2]:
              state_items[i] = (j, k, lookahead_set)
              if not in_queue[state]:
                in_queue[state] = True
                queue.append(state)
        else:
          state = len(state_to_items)
          state_to_items.append(new_items)
          items_to_state[key] = state
          in_queue.append(True)
          queue.append(state)
        return state

      terminal = 0
      while terminal < self.terminal_thres:
        next_items, next_item_to_index, reductions, terminal0, terminal1 = (
          self.lookahead_item_set_shift(items, terminal)
        )
        assert terminal0 == terminal and terminal1 > terminal
        if len(next_items) != 0:
          if len(reductions) != 0:
            sys.stderr.write(
              'state {0:d} shift/reduce conflict: {1:s} vs {2:s}\n'.format(
                i,
                ','.join([str(j) for j, _, _ in next_items]),
                ','.join([str(j) for j in reductions])
              )
            )
          action = add_state(next_items, next_item_to_index) * 2
        elif len(reductions) != 0:
          if len(reductions) != 1:
            sys.stderr.write(
              'state {0:d} reduce/reduce conflict: {1:s}\n'.format(
                i,
                ','.join([str(j) for j in reductions])
              )
            )
          action = min(reductions) * 2 + 1 # odd means reduce
        else:
          action = -1
        state_desc[0].append(terminal1)
        state_desc[1].append(action)
        terminal = terminal1

      nonterminal = 0
      while nonterminal < len(self.productions):
        next_items, next_item_to_index, nonterminal0, nonterminal1 = (
          self.lookahead_item_set_goto(items, nonterminal)
        )
        assert nonterminal0 == nonterminal and nonterminal1 > nonterminal
        if len(next_items) != 0:
          goto = add_state(next_items, next_item_to_index)
        else:
          goto = -1
        state_desc[2].append(nonterminal1)
        state_desc[3].append(goto)
        nonterminal = nonterminal1

      # state may be re-processed after a lookahead merge: overwrite its
      # previous description in place rather than appending a duplicate
      if i < len(lr1dfa.states):
        lr1dfa.states[i] = state_desc
      else:
        lr1dfa.states.append(state_desc)
      qhead += 1
    return lr1dfa

  def __repr__(self):
    return 'regex.LR1({0:s}, {1:d})'.format(
      repr(self.productions),
      self.terminal_thres
    )
+
+class LR1DFA:
+ def __init__(self, states = [], productions = [], terminal_thres = chars):
+ # states: list of state_desc
+ # state_desc: (terminal breaks, actions, nonterminal breaks, gotos)
+ # action: shift = new state * 2, reduce = production * 2 + 1, error = -1
+ # goto: reduce = production, error = -1 (but error can't really happen)
+ # productions: list of production
+ # production: (len(symbols), group_bounds)
+ # len(symbols): how many states to pop stack to reduce this production
+ # group_bounds: list of group_bound
+ # group_bound: (start_index, end_index, tag, kwargs)
+ # where start_index, end_index are indices into list of char_set,
+ # and tag, kwargs will be passed to apply_markup() hence factory(),
+ # noting that markup has to be applied in reverse order of the list
+ # terminal_thres: offset to apply to productions[] index to get symbol
+ # (char set code), also symbol for productions[0] = start production
+ self.states = states
+ self.productions = productions
+ self.terminal_thres = terminal_thres
+
  def parse_text(self, text, i):
    # Parse `text` from index `i` using the precompiled LR(1) tables,
    # writing a trace of reduced group spans to stdout.  Returns None on
    # success (after reducing production 0); raises on syntax error.
    state = 0
    value_stack = []  # input index delimiting each shifted symbol
    state_stack = []  # parser state per stack entry
    lookahead_char = ord(text[i]) if i < len(text) else chars # EOF
    while True:
      value_stack.append(i)
      state_stack.append(state)
      # look up the lookahead in this state's terminal break table
      action = self.states[state][1][
        bisect.bisect_right(self.states[state][0], lookahead_char)
      ]
      if action == -1:
        raise Exception(
          'syntax error at {0:d}: {1:s}'.format(i, text[i:])
        )
      if (action & 1) == 0:
        # even action: shift, entering state action >> 1
        state = action >> 1
        i += 1
        lookahead_char = ord(text[i]) if i < len(text) else chars # EOF
      else:
        # odd action: reduce production action >> 1
        reduce = action >> 1
        len_symbols, group_bounds = self.productions[reduce]
        base = len(value_stack) - len_symbols - 1
        for j in range(len(group_bounds) - 1, -1, -1):
          k, l, tag, _ = group_bounds[j]
          k += base
          if l != 1:
            # collapse l stack entries so k, k + 1 delimit the group
            value_stack[k + 1:k + l + 1] = [value_stack[k + l]]
          sys.stdout.write(
            'text \'{0:s}\' tag \'{1:s}\'\n'.format(
              text[value_stack[k]:value_stack[k + 1]],
              tag
            )
          )
        del value_stack[base + 1:]
        del state_stack[base + 1:]
        if reduce == 0:
          # production 0 is the start production: parse complete
          assert base == 0
          return
        # consult the goto table of the uncovered state
        state = self.states[state_stack[-1]][3][
          bisect.bisect_right(self.states[state_stack[-1]][2], reduce)
        ]
        assert state != -1
+
    def parse_yychunk(self, root, pos, off, factory, yychunk_iter):
        """Like parse_text(), but reads characters from the element tree at
        root starting at (pos, off), pulling further text chunks from
        yychunk_iter on demand, and applies each captured group as markup
        via work.apply_markup() (using factory) instead of printing it.
        """
        if pos < 0:
            # negative pos means an end-relative position: normalize it
            pos, off = element.to_start_relative(root, pos, off)

        state = 0
        # value_stack[n] is the (pos, off) at which the symbol below
        # state_stack[n] started; during a reduction, entries at index
        # >= end_relative have been converted to end-relative form
        value_stack = []
        state_stack = []
        text = element.get_text(root, pos)
        # advance to the first position holding text, fetching more chunks
        # from yychunk_iter as required; falls off the end at EOF
        while off >= len(text):
            if pos < len(root):
                pos += 1
                off = 0
            else:
                try:
                    next(yychunk_iter)
                except StopIteration:
                    lookahead_char = chars # EOF
                    break
            text = element.get_text(root, pos)
        else:
            lookahead_char = ord(text[off])
        while True:
            value_stack.append((pos, off))
            state_stack.append(state)
            # action encoding: -1 = error, even = shift, odd = reduce
            # (see parse_text())
            action = self.states[state][1][
                bisect.bisect_right(self.states[state][0], lookahead_char)
            ]
            #print('lookahead_char', lookahead_char, 'action', action)
            if action == -1:
                raise Exception(
                    'syntax error at {0:d},{1:d}: {2:s}'.format(pos, off, text[off:])
                )
            if (action & 1) == 0:
                # shift: consume one character, fetching chunks as needed
                state = action >> 1
                off += 1
                while off >= len(text):
                    if pos < len(root):
                        pos += 1
                        off = 0
                    else:
                        try:
                            next(yychunk_iter)
                        except StopIteration:
                            lookahead_char = chars # EOF
                            break
                    text = element.get_text(root, pos)
                else:
                    lookahead_char = ord(text[off])
            else:
                # reduce: pop len_symbols entries and mark up captured groups
                reduce = action >> 1
                len_symbols, group_bounds = self.productions[reduce]
                base = len(value_stack) - len_symbols - 1
                end_relative = len(value_stack)
                for j in range(len(group_bounds) - 1, -1, -1):
                    k, l, tag, kwargs = group_bounds[j]
                    k += base
                    assert k < end_relative
                    if l != 1:
                        # collapse the group's l stack entries so that
                        # value_stack[k], value_stack[k + 1] bracket its text
                        value_stack[k + 1:k + l + 1] = [value_stack[k + l]]
                        # the slice removed l - 1 entries: keep end_relative
                        # pointing at the same logical entry (min k + 1)
                        end_relative = max(k + 1, end_relative + 1 - l)
                    # convert stored positions down to index k + 1 into
                    # end-relative form, presumably so they stay valid while
                    # apply_markup() mutates the tree -- confirm
                    while end_relative > k + 1:
                        end_relative -= 1
                        pos1, off1 = value_stack[end_relative]
                        value_stack[end_relative] = (
                            element.to_end_relative(root, pos1, off1)
                        )
                    pos0, off0 = value_stack[k]
                    pos1, off1 = value_stack[k + 1]
                    work.apply_markup(
                        root,
                        pos0,
                        off0,
                        pos1,
                        off1,
                        factory,
                        tag,
                        **kwargs
                    )
                if end_relative < len(value_stack):
                    # the current position was converted above: restore it to
                    # start-relative form and re-fetch the current chunk
                    pos, off = value_stack[-1]
                    pos, off = element.to_start_relative(root, pos, off)
                    text = element.get_text(root, pos)
                del value_stack[base + 1:]
                del state_stack[base + 1:]
                if reduce == 0:
                    # start production reduced: parse complete
                    assert base == 0
                    return
                # goto on the reduced production number (see parse_text())
                state = self.states[state_stack[-1]][3][
                    bisect.bisect_right(self.states[state_stack[-1]][2], reduce)
                ]
                assert state != -1
+
    def yyparse(self, root, pos, off, factory, yylex_iter):
        """Like parse_yychunk(), but driven by a lexer: yylex_iter yields
        (end_pos, end_off, token) triples where token is the terminal
        number used as the lookahead and (end_pos, end_off) is the position
        just past the token (apparently end-relative, since it is passed
        through element.to_start_relative() on shift -- confirm).
        """
        if pos < 0:
            # negative pos means an end-relative position: normalize it
            pos, off = element.to_start_relative(root, pos, off)

        state = 0
        # value_stack[n] is the (pos, off) at which the symbol below
        # state_stack[n] started; during a reduction, entries at index
        # >= end_relative have been converted to end-relative form
        value_stack = []
        state_stack = []
        try:
            end_pos, end_off, lookahead_char = next(yylex_iter)
        except StopIteration:
            lookahead_char = chars # EOF
            end_pos, end_off = element.to_end_relative(root, pos, off)
        while True:
            value_stack.append((pos, off))
            state_stack.append(state)
            # action encoding: -1 = error, even = shift, odd = reduce
            # (see parse_text()); lookahead is a token number here
            action = self.states[state][1][
                bisect.bisect_right(self.states[state][0], lookahead_char)
            ]
            #print('lookahead_char', lookahead_char, 'action', action)
            if action == -1:
                raise Exception(
                    'syntax error at {0:d},{1:d}: {2:d}'.format(pos, off, lookahead_char)
                )
            if (action & 1) == 0:
                # shift: step past the token and fetch the next one
                state = action >> 1
                pos, off = element.to_start_relative(root, end_pos, end_off)
                try:
                    end_pos, end_off, lookahead_char = next(yylex_iter)
                except StopIteration:
                    lookahead_char = chars # EOF
                    #end_pos, end_off = element.to_end_relative(root, pos, off)
            else:
                # reduce: pop len_symbols entries and mark up captured groups
                reduce = action >> 1
                len_symbols, group_bounds = self.productions[reduce]
                base = len(value_stack) - len_symbols - 1
                end_relative = len(value_stack)
                for j in range(len(group_bounds) - 1, -1, -1):
                    k, l, tag, kwargs = group_bounds[j]
                    k += base
                    assert k < end_relative
                    if l != 1:
                        # collapse the group's l stack entries so that
                        # value_stack[k], value_stack[k + 1] bracket its text
                        value_stack[k + 1:k + l + 1] = [value_stack[k + l]]
                        # the slice removed l - 1 entries: keep end_relative
                        # pointing at the same logical entry (min k + 1)
                        end_relative = max(k + 1, end_relative + 1 - l)
                    # convert stored positions down to index k + 1 into
                    # end-relative form, presumably so they stay valid while
                    # apply_markup() mutates the tree -- confirm
                    while end_relative > k + 1:
                        end_relative -= 1
                        pos1, off1 = value_stack[end_relative]
                        value_stack[end_relative] = (
                            element.to_end_relative(root, pos1, off1)
                        )
                    pos0, off0 = value_stack[k]
                    pos1, off1 = value_stack[k + 1]
                    work.apply_markup(
                        root,
                        pos0,
                        off0,
                        pos1,
                        off1,
                        factory,
                        tag,
                        **kwargs
                    )
                if end_relative < len(value_stack):
                    # the current position was converted above: restore it
                    pos, off = value_stack[-1]
                    pos, off = element.to_start_relative(root, pos, off)
                del value_stack[base + 1:]
                del state_stack[base + 1:]
                if reduce == 0:
                    # start production reduced: parse complete
                    assert base == 0
                    return
                # goto on the reduced production number (see parse_text())
                state = self.states[state_stack[-1]][3][
                    bisect.bisect_right(self.states[state_stack[-1]][2], reduce)
                ]
                assert state != -1
+
+ def __repr__(self):
+ return 'regex.LR1DFA({0:s}, {1:s}, {2:d})'.format(
+ repr(self.states),
+ repr(self.productions),
+ self.terminal_thres
+ )
+
def wrap_repr(text, width):
    """Wrap a single-line repr() string to at most width characters per
    line, preferring to break just after an opening bracket or '.', or
    just before a closing bracket or space; spaces at a break are eaten.

    Every emitted line (including the last) ends with a newline; empty
    input yields ''.
    """
    lines = []
    i = 0
    while i < len(text):
        j = i + width
        if j < len(text):
            # best break position in the current window: after one of
            # '([{.' (hence the + 1) or before one of ')]} ' (searched one
            # position further so a closer exactly at the limit is kept)
            k = max(
                [text.rfind(c, i, j) + 1 for c in '([{.'] +
                [text.rfind(c, i, j + 1) for c in ')]} ']
            )
            # only accept a break that actually advances past i; otherwise
            # hard-break at the width limit.  (The original took k
            # unconditionally: with no break char in the window it failed
            # an assert, and with one at exactly position i it produced
            # j == i and looped forever emitting empty lines.)
            if k > i:
                j = k
        lines.append(text[i:j] + '\n')
        i = j
        # eat spaces at the break so continuation lines don't start with
        # the separator
        while text[i:i + 1] == ' ':
            i += 1
    return ''.join(lines)
+
if __name__ == '__main__':
    import xml.etree.ElementTree

    # self-test regex: a repeated leading part, then one alternative
    # captured as group 'Whitespace' ([\t-\r ]+), 'Number' ([0-9]+),
    # 'For' (the literal "for") or 'Identifier' ([A-Za-z_][0-9A-Za-z_]*);
    # char_set lists are sorted [lo, hi) codepoint pairs
    regex = RegexAnd(children = [RegexRepeat(children = [RegexCharacterNot(
children = [RegexCharacter()], char_set = [0, 256])]), RegexGroup(children = [
RegexOr(children = [RegexOr(children = [RegexOr(children = [RegexGroup(children
= [RegexRepeat(children = [RegexCharacter(char_set = [9, 14, 32, 33])],
one_or_more = True)], group_index = 1, group_name = 'Whitespace'), RegexGroup(
children = [RegexRepeat(children = [RegexCharacter(char_set = [48, 58])],
one_or_more = True)], group_index = 2, group_name = 'Number')]), RegexGroup(
children = [RegexSequence(children = [RegexSequence(children = [RegexSequence(
children = [RegexEmpty(), RegexCharacter(char_set = [102, 103])]),
RegexCharacter(char_set = [111, 112])]), RegexCharacter(char_set = [114, 115])]
)], group_index = 3, group_name = 'For')]), RegexGroup(children = [
RegexSequence(children = [RegexCharacter(char_set = [65, 91, 95, 96, 97, 123]),
RegexRepeat(children = [RegexCharacter(char_set = [48, 58, 65, 91, 95, 96, 97,
123])])])], group_index = 4, group_name = 'Identifier')])], group_index = 0)])
    #sys.stdout.write(
    #  wrap_repr(
    #    ' regex = {0:s}'.format(repr(regex).replace('regex.', '')),
    #    79
    #  )
    #)

    nfa = regex.to_nfa()
    #sys.stdout.write(
    #  wrap_repr(
    #    ' nfa = {0:s}'.format(repr(nfa).replace('regex.', '')),
    #    79
    #  )
    #)

    # tokenize the sample text with the NFA, printing every group of each
    # match; the returned thread is a linked list of (position, mark, next)
    # where mark >> 1 is the group number and an even mark is a group start
    # (the end mark of a group is reached before its start mark, so the
    # start branch can print using the already-recorded end position)
    text = ' id 99id id99 for forex '
    i = 0
    while i < len(text):
        print('text "{0:s}"'.format(text[i:i + 72].replace('\n', '$')))
        thread = nfa.match_text(text, i)
        if thread is None:
            print('no match')
            break
        i = thread[0] # end position of overall match
        group_start = [-1 for j in range(len(nfa.groups))]
        group_end = [-1 for j in range(len(nfa.groups))]
        while thread is not None:
            pos, mark, thread = thread
            group = mark >> 1
            if (mark & 1) == 0:
                group_start[group] = pos
                print(
                    'group {0:d} name "{1:s}" text "{2:s}"'.format(
                        group,
                        nfa.groups[group][0],
                        text[group_start[group]:group_end[group]].replace('\n', '$')
                    )
                )
            else:
                group_end[group] = pos

    dfa = nfa.to_dfa()
    #sys.stdout.write(
    #  wrap_repr(
    #    ' dfa = {0:s}'.format(repr(dfa).replace('regex.', '')),
    #    79
    #  )
    #)

    # repeat the same tokenization via the DFA; output should match the
    # NFA run above
    text = ' id 99id id99 for forex '
    i = 0
    while i < len(text):
        print('text "{0:s}"'.format(text[i:i + 72].replace('\n', '$')))
        thread = dfa.match_text(text, i)
        if thread is None:
            print('no match')
            break
        i = thread[0] # end position of overall match
        group_start = [-1 for j in range(len(dfa.groups))]
        group_end = [-1 for j in range(len(dfa.groups))]
        while thread is not None:
            pos, mark, thread = thread
            group = mark >> 1
            if (mark & 1) == 0:
                group_start[group] = pos
                print(
                    'group {0:d} name "{1:s}" text "{2:s}"'.format(
                        group,
                        dfa.groups[group][0],
                        text[group_start[group]:group_end[group]].replace('\n', '$')
                    )
                )
            else:
                group_end[group] = pos
+
    # self-test grammar: arithmetic expressions over decimal numbers with
    # binary + - * / (groups Add/Subtract/Multiply/Divide), unary minus
    # (Negate), parentheses and optional whitespace; in RegexCharacterRule
    # the char_set codes are at or above terminal_thres (258) and name
    # nonterminals, e.g. [259, 262] = expr0, while plain RegexCharacter
    # char_sets are literal codepoints (e.g. [43, 44] = '+')
    grammar = Grammar(children = [Grammar.Production(children = [RegexSequence(
children = [RegexSequence(children = [RegexEmpty(), RegexCharacterRule(char_set
= [288, 295], rule_name = 'whitespace_opt')]), RegexCharacterRule(char_set = [
259, 262], rule_name = 'expr0')])], nonterminal = 0), Grammar.Production(
children = [RegexSequence(children = [RegexEmpty(), RegexCharacterRule(char_set
= [262, 265], rule_name = 'expr1')])], nonterminal = 1), Grammar.Production(
children = [RegexSequence(children = [RegexEmpty(), RegexGroup(children = [
RegexSequence(children = [RegexSequence(children = [RegexSequence(children = [
RegexSequence(children = [RegexEmpty(), RegexCharacterRule(char_set = [259, 262
], rule_name = 'expr0')]), RegexCharacter(char_set = [43, 44])]),
RegexCharacterRule(char_set = [288, 295], rule_name = 'whitespace_opt')]),
RegexCharacterRule(char_set = [262, 265], rule_name = 'expr1')])], group_index
= 0, group_name = 'Add')])], nonterminal = 2), Grammar.Production(children = [
RegexSequence(children = [RegexEmpty(), RegexGroup(children = [RegexSequence(
children = [RegexSequence(children = [RegexSequence(children = [RegexSequence(
children = [RegexEmpty(), RegexCharacterRule(char_set = [259, 262], rule_name =
'expr0')]), RegexCharacter(char_set = [45, 46])]), RegexCharacterRule(char_set
= [288, 295], rule_name = 'whitespace_opt')]), RegexCharacterRule(char_set = [
262, 265], rule_name = 'expr1')])], group_index = 0, group_name = 'Subtract')])
], nonterminal = 3), Grammar.Production(children = [RegexSequence(children = [
RegexEmpty(), RegexCharacterRule(char_set = [265, 268], rule_name = 'expr2')])
], nonterminal = 4), Grammar.Production(children = [RegexSequence(children = [
RegexEmpty(), RegexGroup(children = [RegexSequence(children = [RegexSequence(
children = [RegexSequence(children = [RegexSequence(children = [RegexEmpty(),
RegexCharacterRule(char_set = [262, 265], rule_name = 'expr1')]),
RegexCharacter(char_set = [42, 43])]), RegexCharacterRule(char_set = [288, 295
], rule_name = 'whitespace_opt')]), RegexCharacterRule(char_set = [265, 268],
rule_name = 'expr2')])], group_index = 0, group_name = 'Multiply')])],
nonterminal = 5), Grammar.Production(children = [RegexSequence(children = [
RegexEmpty(), RegexGroup(children = [RegexSequence(children = [RegexSequence(
children = [RegexSequence(children = [RegexSequence(children = [RegexEmpty(),
RegexCharacterRule(char_set = [262, 265], rule_name = 'expr1')]),
RegexCharacter(char_set = [47, 48])]), RegexCharacterRule(char_set = [288, 295
], rule_name = 'whitespace_opt')]), RegexCharacterRule(char_set = [265, 268],
rule_name = 'expr2')])], group_index = 0, group_name = 'Divide')])],
nonterminal = 6), Grammar.Production(children = [RegexSequence(children = [
RegexSequence(children = [RegexEmpty(), RegexGroup(children = [RegexSequence(
children = [RegexEmpty(), RegexCharacterRule(char_set = [268, 288], rule_name =
'number')])], group_index = 0, group_name = 'Number')]), RegexCharacterRule(
char_set = [288, 295], rule_name = 'whitespace_opt')])], nonterminal = 7),
Grammar.Production(children = [RegexSequence(children = [RegexEmpty(),
RegexGroup(children = [RegexSequence(children = [RegexSequence(children = [
RegexSequence(children = [RegexEmpty(), RegexCharacter(char_set = [45, 46])]),
RegexCharacterRule(char_set = [288, 295], rule_name = 'whitespace_opt')]),
RegexCharacterRule(char_set = [265, 268], rule_name = 'expr2')])], group_index
= 0, group_name = 'Negate')])], nonterminal = 8), Grammar.Production(children =
[RegexSequence(children = [RegexSequence(children = [RegexSequence(children = [
RegexSequence(children = [RegexSequence(children = [RegexEmpty(),
RegexCharacter(char_set = [40, 41])]), RegexCharacterRule(char_set = [288, 295
], rule_name = 'whitespace_opt')]), RegexCharacterRule(char_set = [259, 262],
rule_name = 'expr0')]), RegexCharacter(char_set = [41, 42])]),
RegexCharacterRule(char_set = [288, 295], rule_name = 'whitespace_opt')])],
nonterminal = 9), Grammar.Production(children = [RegexSequence(children = [
RegexEmpty(), RegexCharacter(char_set = [48, 49])])], nonterminal = 10),
Grammar.Production(children = [RegexSequence(children = [RegexEmpty(),
RegexCharacter(char_set = [49, 50])])], nonterminal = 11), Grammar.Production(
children = [RegexSequence(children = [RegexEmpty(), RegexCharacter(char_set = [
50, 51])])], nonterminal = 12), Grammar.Production(children = [RegexSequence(
children = [RegexEmpty(), RegexCharacter(char_set = [51, 52])])], nonterminal =
13), Grammar.Production(children = [RegexSequence(children = [RegexEmpty(),
RegexCharacter(char_set = [52, 53])])], nonterminal = 14), Grammar.Production(
children = [RegexSequence(children = [RegexEmpty(), RegexCharacter(char_set = [
53, 54])])], nonterminal = 15), Grammar.Production(children = [RegexSequence(
children = [RegexEmpty(), RegexCharacter(char_set = [54, 55])])], nonterminal =
16), Grammar.Production(children = [RegexSequence(children = [RegexEmpty(),
RegexCharacter(char_set = [55, 56])])], nonterminal = 17), Grammar.Production(
children = [RegexSequence(children = [RegexEmpty(), RegexCharacter(char_set = [
56, 57])])], nonterminal = 18), Grammar.Production(children = [RegexSequence(
children = [RegexEmpty(), RegexCharacter(char_set = [57, 58])])], nonterminal =
19), Grammar.Production(children = [RegexSequence(children = [RegexSequence(
children = [RegexEmpty(), RegexCharacterRule(char_set = [268, 288], rule_name =
'number')]), RegexCharacter(char_set = [48, 49])])], nonterminal = 20),
Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
= [RegexEmpty(), RegexCharacterRule(char_set = [268, 288], rule_name = 'number'
)]), RegexCharacter(char_set = [49, 50])])], nonterminal = 21),
Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
= [RegexEmpty(), RegexCharacterRule(char_set = [268, 288], rule_name = 'number'
)]), RegexCharacter(char_set = [50, 51])])], nonterminal = 22),
Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
= [RegexEmpty(), RegexCharacterRule(char_set = [268, 288], rule_name = 'number'
)]), RegexCharacter(char_set = [51, 52])])], nonterminal = 23),
Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
= [RegexEmpty(), RegexCharacterRule(char_set = [268, 288], rule_name = 'number'
)]), RegexCharacter(char_set = [52, 53])])], nonterminal = 24),
Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
= [RegexEmpty(), RegexCharacterRule(char_set = [268, 288], rule_name = 'number'
)]), RegexCharacter(char_set = [53, 54])])], nonterminal = 25),
Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
= [RegexEmpty(), RegexCharacterRule(char_set = [268, 288], rule_name = 'number'
)]), RegexCharacter(char_set = [54, 55])])], nonterminal = 26),
Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
= [RegexEmpty(), RegexCharacterRule(char_set = [268, 288], rule_name = 'number'
)]), RegexCharacter(char_set = [55, 56])])], nonterminal = 27),
Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
= [RegexEmpty(), RegexCharacterRule(char_set = [268, 288], rule_name = 'number'
)]), RegexCharacter(char_set = [56, 57])])], nonterminal = 28),
Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
= [RegexEmpty(), RegexCharacterRule(char_set = [268, 288], rule_name = 'number'
)]), RegexCharacter(char_set = [57, 58])])], nonterminal = 29),
Grammar.Production(children = [RegexEmpty()], nonterminal = 30),
Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
= [RegexEmpty(), RegexCharacterRule(char_set = [288, 295], rule_name =
'whitespace_opt')]), RegexCharacter(char_set = [9, 10])])], nonterminal = 31),
Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
= [RegexEmpty(), RegexCharacterRule(char_set = [288, 295], rule_name =
'whitespace_opt')]), RegexCharacter(char_set = [10, 11])])], nonterminal = 32),
Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
= [RegexEmpty(), RegexCharacterRule(char_set = [288, 295], rule_name =
'whitespace_opt')]), RegexCharacter(char_set = [11, 12])])], nonterminal = 33),
Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
= [RegexEmpty(), RegexCharacterRule(char_set = [288, 295], rule_name =
'whitespace_opt')]), RegexCharacter(char_set = [12, 13])])], nonterminal = 34),
Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
= [RegexEmpty(), RegexCharacterRule(char_set = [288, 295], rule_name =
'whitespace_opt')]), RegexCharacter(char_set = [13, 14])])], nonterminal = 35),
Grammar.Production(children = [RegexSequence(children = [RegexSequence(children
= [RegexEmpty(), RegexCharacterRule(char_set = [288, 295], rule_name =
'whitespace_opt')]), RegexCharacter(char_set = [32, 33])])], nonterminal = 36)
], terminal_thres = 258)
    #sys.stdout.write(
    #  wrap_repr(
    #    ' grammar = {0:s}'.format(repr(grammar).replace('regex.', '')),
    #    79
    #  )
    #)

    # build the LR(1) machine and parse a sample expression both in plain
    # text mode (prints reductions) and in element-tree mode (marks up a
    # tree and dumps it as XML)
    lr1 = grammar.to_lr1()
    #sys.stdout.write(
    #  wrap_repr(
    #    ' lr1 = {0:s}'.format(repr(lr1).replace('regex.', '')),
    #    79
    #  )
    #)

    lr1.parse_text('(13 + 5 * 6) * 2', 0)
    root = element.Element('root', text = '(13 + 5 * 6) * 2')
    lr1.parse_yychunk(root, 0, 0, element.Element, iter([]))
    xml.etree.ElementTree.dump(root)

    # same sample through the canonical LR(1) form; output should match
    clr1 = lr1.to_clr1()
    #sys.stdout.write(
    #  wrap_repr(
    #    ' clr1 = {0:s}'.format(repr(clr1).replace('regex.', '')),
    #    79
    #  )
    #)

    clr1.parse_text('(13 + 5 * 6) * 2', 0)
    root = element.Element('root', text = '(13 + 5 * 6) * 2')
    clr1.parse_yychunk(root, 0, 0, element.Element, iter([]))
    xml.etree.ElementTree.dump(root)

    # same sample through the LALR(1) form; output should match
    lalr1 = lr1.to_lalr1()
    #sys.stdout.write(
    #  wrap_repr(
    #    ' lalr1 = {0:s}'.format(repr(lalr1).replace('regex.', '')),
    #    79
    #  )
    #)

    lalr1.parse_text('(13 + 5 * 6) * 2', 0)
    root = element.Element('root', text = '(13 + 5 * 6) * 2')
    lalr1.parse_yychunk(root, 0, 0, element.Element, iter([]))
    xml.etree.ElementTree.dump(root)
--- /dev/null
+--- lex.yy.c.orig 2018-06-25 10:36:41.898822220 +1000
++++ lex.yy.c 2018-06-28 00:00:54.431048812 +1000
+@@ -1,6 +1,3 @@
+-
+-#line 2 "lex.yy.c"
+-
+ #define YY_INT_ALIGNED short int
+
+ /* A lexical scanner generated by flex */
+@@ -351,179 +348,8 @@
+ (yy_hold_char) = *yy_cp; \
+ *yy_cp = '\0'; \
+ (yy_c_buf_p) = yy_cp;
+-#define YY_NUM_RULES 2
+-#define YY_END_OF_BUFFER 3
+-/* This struct is not used in this scanner,
+- but its presence is necessary. */
+-struct yy_trans_info
+- {
+- flex_int32_t yy_verify;
+- flex_int32_t yy_nxt;
+- };
+-static const flex_int16_t yy_acclist[15] =
+- { 0,
+- 8193,16385, 8193,16385, 3, 2, 8193, 2,16385, 8193,
+- 2, 8193,16385, 8193
+- } ;
+-
+-static const flex_int16_t yy_accept[11] =
+- { 0,
+- 1, 3, 5, 6, 7, 10, 12, 14, 15, 15
+- } ;
+-
+-static const flex_int16_t yy_base[11] =
+- { 0,
+- 0, 2, 363, 364, 4, 264, 6, 263, 364, 104
+- } ;
+-
+-static const flex_int16_t yy_def[11] =
+- { 0,
+- 10, 10, 9, 9, 9, 9, 9, 9, 0, 9
+- } ;
+-
+-static const flex_int16_t yy_nxt[621] =
+- { 0,
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 5, 6, 5, 6,
+-
+- 7, 8, 7, 8, 4, 4, 4, 4, 4, 4,
+- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+-
+- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+-
+- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+- 8, 8, 9, 3, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+-
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+-
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+-
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9
+- } ;
+-
+-static const flex_int16_t yy_chk[621] =
+- { 0,
+- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+- 0, 0, 0, 0, 0, 0, 1, 1, 2, 2,
+-
+- 5, 5, 7, 7, 10, 10, 10, 10, 10, 10,
+- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+-
+- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+-
+- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+- 8, 6, 3, 9, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+-
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+-
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+-
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9
+- } ;
++
++/* GENERATE TABLES */
+
+ extern int yy_flex_debug;
+ int yy_flex_debug = 0;
+@@ -553,8 +379,8 @@
+ #define YY_MORE_ADJ (yy_more_len)
+ #define YY_RESTORE_YY_MORE_OFFSET
+ char *yytext;
+-#line 1 "skel.l"
+-#line 557 "lex.yy.c"
++
++/* GENERATE SECTION1 */
+
+ #define INITIAL 0
+
+@@ -777,9 +603,7 @@
+ }
+
+ {
+-#line 2 "skel.l"
+-
+-#line 782 "lex.yy.c"
++/* GENERATE SECTION2INITIAL */
+
+ while ( /*CONSTCOND*/1 ) /* loops until end-of-file is reached */
+ {
+@@ -816,7 +640,7 @@
+ *(yy_state_ptr)++ = yy_current_state;
+ ++yy_cp;
+ }
+- while ( yy_base[yy_current_state] != 364 );
++ while ( yy_base[yy_current_state] != 0 );
+
+ yy_find_action:
+ yy_current_state = *--(yy_state_ptr);
+@@ -866,19 +690,7 @@
+
+ switch ( yy_act )
+ { /* beginning of action switch */
+-case 1:
+-YY_RULE_SETUP
+-#line 3 "skel.l"
+-
+- YY_BREAK
+-case 2:
+-YY_RULE_SETUP
+-#line 4 "skel.l"
+-ECHO;
+- YY_BREAK
+-#line 879 "lex.yy.c"
+- case YY_STATE_EOF(INITIAL):
+- yyterminate();
++/* GENERATE SECTION2 */
+
+ case YY_END_OF_BUFFER:
+ {
+@@ -1850,4 +1662,4 @@
+
+ #define YYTABLES_NAME "yytables"
+
+-#line 4 "skel.l"
++/* GENERATE SECTION3 */