--- /dev/null
+import bisect
+import bisect_set
+import element
+import lr1dfa
+import work
+import sys
+
# defines the alphabet size, set this to 0x11000 for unicode
n_characters = 0x100

class LR1:
  def __init__(
    self,
    productions = None,
    n_terminals = n_characters + 1,
    eof_terminal = n_characters
  ):
    # Container for an LR(1) grammar; conversion methods build the automata.
    #
    # productions: list of production (a fresh empty list per instance when
    # omitted — a shared [] default would leak grammar rules between
    # instances, since callers append to self.productions)
    # production: (
    #   priority,
    #   symbols,
    #   lookaheads,
    #   group_bounds
    # )
    # priority: bit 0 = right to left, bits 1: = numeric priority
    # symbols: list of symbol_desc
    # symbol_desc: (terminal_set, nonterminal_set)
    # terminal_set: similar to character_set, even length list of pairs of breaks
    # nonterminal_set: as above but has n_terminals subtracted from breaks
    # lookaheads: list of lookahead_desc, len(lookaheads) = len(symbols) + 1
    # lookahead_desc: (initial_set, can_be_empty)
    # initial_set: what terminals can occur at this position in symbols array,
    # computed such that lookaheads[i][0] is symbols[i][0], plus initial set of
    # each nonterminal from symbols[i][1], plus lookaheads[i + 1][0] if any
    # nonterminal of symbols[i][1] can be empty (may pull in i + 2, i + 3 etc)
    # can_be_empty: whether all symbols from this position to end can be empty
    # note: last lookahead_desc is a sentinel consisting of ([], True), so that
    # initial_set is empty (will be augmented by context) and can_be_empty is
    # True (because all symbols from the end to the end can obviously be empty)
    # group_bounds: list of group_bound
    # group_bound: (start_index, end_index, tag, kwargs)
    #   where start_index, end_index are indices into list of character_set,
    #   and tag, kwargs will be passed to apply_markup() hence factory(),
    #   noting that markup has to be applied in reverse order of the list
    # n_terminals: offset to apply to productions[] index to get symbol
    # (character set code), also symbol for productions[0] = start production
    # eof_terminal: usually == n_terminals - 1 (must be valid terminal value)
    self.productions = [] if productions is None else productions
    self.n_terminals = n_terminals
    self.eof_terminal = eof_terminal
+
def lookahead_item_set_closure(self, items, item_to_index):
  # Expand `items` in place to its LR(1) closure: for each item whose dot
  # sits before a nonterminal, add the initial items of that nonterminal's
  # productions and propagate lookahead sets until a fixed point is reached.
  # items: list of (production_index, dot_position, lookahead_set), mutated
  # item_to_index: dict (production_index, dot_position) -> index into items,
  #   kept in sync with items; used to detect already-present items
  in_queue = [True for i in range(len(items))]
  queue = list(range(len(items)))

  qhead = 0
  while qhead < len(queue):
    i = queue[qhead]
    in_queue[i] = False
    j, k, lookahead_set = items[i]
    _, symbols, lookaheads, _ = self.productions[j]
    if k < len(symbols):
      _, nonterminal_set = symbols[k]
      if len(nonterminal_set):
        # lookahead for the expanded items is the initial set of what follows
        # the dot; if everything after the dot can derive empty, the item's
        # own lookahead set applies as well
        next_lookahead_set, next_can_be_empty = lookaheads[k + 1]
        if next_can_be_empty:
          next_lookahead_set = bisect_set.bisect_set_or(
            next_lookahead_set,
            lookahead_set
          )
        # nonterminal_set is an even-length break list; each pair is a
        # half-open range of nonterminal (production) indices
        for l in range(0, len(nonterminal_set), 2):
          for m in range(nonterminal_set[l], nonterminal_set[l + 1]):
            key = (m, 0)
            if key in item_to_index:
              # item already present: merge in the new lookaheads
              n = item_to_index[key]
              child_lookahead_set = bisect_set.bisect_set_or(
                items[n][2],
                next_lookahead_set
              )
              if child_lookahead_set != items[n][2]:
                items[n] = (m, 0, child_lookahead_set)
                if not in_queue[n]:
                  # optimization: do not re-queue unless it is transparent
                  # to changes in the lookahead set wrt. further propagation
                  _, child_symbols, child_lookaheads, _ = self.productions[m]
                  if len(child_symbols) and child_lookaheads[1][1]:
                    in_queue[n] = True
                    queue.append(n)
            else:
              # new item: append and schedule it for expansion
              n = len(items)
              items.append((m, 0, next_lookahead_set))
              item_to_index[key] = n
              in_queue.append(True)
              queue.append(n)
    qhead += 1
+
def lookahead_item_set_shift(self, items, terminal):
  # Simulate reading `terminal` in the item set `items`.
  # Returns (shifted_items, shifted_index, reducible, lo, hi) where
  # shifted_items/shifted_index are the items whose dot moved over the
  # terminal, reducible is the set of productions whose lookahead set
  # contains the terminal, and [lo, hi) is the widest terminal interval
  # around `terminal` that yields exactly these decisions.
  shifted_items = []
  shifted_index = {}
  reducible = set()
  lo = 0
  hi = self.n_terminals
  for prod, dot, lookahead_set in items:
    _, symbols, _, _ = self.productions[prod]
    if dot >= len(symbols):
      # dot at the end of the production: candidate reduction,
      # taken when the terminal falls inside the lookahead set
      pos = bisect.bisect_right(lookahead_set, terminal)
      if pos > 0:
        lo = max(lo, lookahead_set[pos - 1])
      if pos < len(lookahead_set):
        hi = min(hi, lookahead_set[pos])
      if pos & 1:
        reducible.add(prod)
    else:
      # dot before a symbol: candidate shift, taken when the terminal
      # falls inside the symbol's terminal break-list
      terminal_set, _ = symbols[dot]
      pos = bisect.bisect_right(terminal_set, terminal)
      if pos > 0:
        lo = max(lo, terminal_set[pos - 1])
      if pos < len(terminal_set):
        hi = min(hi, terminal_set[pos])
      if pos & 1:
        shifted_index[(prod, dot + 1)] = len(shifted_items)
        shifted_items.append((prod, dot + 1, lookahead_set))
  return shifted_items, shifted_index, reducible, lo, hi
+
def lookahead_item_set_goto(self, items, nonterminal):
  # Simulate the goto transition over `nonterminal` in the item set `items`.
  # Returns (next_items, next_item_to_index, nonterminal0, nonterminal1)
  # where next_items holds the items whose dot moved over the nonterminal
  # and [nonterminal0, nonterminal1) is the widest nonterminal interval
  # around `nonterminal` that yields exactly this transition.
  # (fix: removed the unused local `reductions` — goto never reduces and
  # the set was neither populated nor returned)
  next_items = []
  next_item_to_index = {}
  nonterminal0 = 0
  nonterminal1 = len(self.productions)
  for i, j, lookahead_set in items:
    _, symbols, _, _ = self.productions[i]
    if j < len(symbols):
      _, nonterminal_set = symbols[j]
      k = bisect.bisect_right(nonterminal_set, nonterminal)
      # tighten the equivalence interval to the nearest break on each side
      if k > 0 and nonterminal0 < nonterminal_set[k - 1]:
        nonterminal0 = nonterminal_set[k - 1]
      if k < len(nonterminal_set) and nonterminal1 > nonterminal_set[k]:
        nonterminal1 = nonterminal_set[k]
      if (k & 1) == 1:
        # odd break index means the nonterminal is inside the set: goto
        next_item_to_index[(i, j + 1)] = len(next_items)
        next_items.append((i, j + 1, lookahead_set))
  return next_items, next_item_to_index, nonterminal0, nonterminal1
+
def parse_text(self, text, i):
  # Parse text[i:] with the grammar, writing group diagnostics to stderr.
  # Returns None once the start production is reduced; raises Exception on
  # a syntax error. Conflicts are reported to stderr and resolved by
  # preferring shift, then the lowest-numbered reduction.
  items = [(0, 0, [self.eof_terminal, self.eof_terminal + 1])]
  item_to_index = {(0, 0): 0}
  value_stack = []
  state_stack = []
  lookahead_character = ord(text[i]) if i < len(text) else self.eof_terminal
  while True:
    self.lookahead_item_set_closure(items, item_to_index)
    value_stack.append(i)
    state_stack.append(items)
    items, item_to_index, reductions, _, _ = (
      self.lookahead_item_set_shift(items, lookahead_character)
    )
    if len(items) != 0:
      if len(reductions) != 0:
        # bug fix: this originally iterated the undefined name `next_items`
        # (NameError); after the shift above, `items` holds the shifted set
        sys.stderr.write(
          'shift/reduce conflict: {0:s} vs {1:s}\n'.format(
            ','.join([str(i) for i, _, _ in items]),
            ','.join([str(i) for i in reductions])
          )
        )
      i += 1
      lookahead_character = ord(text[i]) if i < len(text) else self.eof_terminal
    elif len(reductions) != 0:
      if len(reductions) != 1:
        sys.stderr.write(
          'reduce/reduce conflict: {0:s}\n'.format(
            ','.join([str(i) for i in reductions])
          )
        )
      reduce = min(reductions)
      _, symbols, _, group_bounds = self.productions[reduce]
      base = len(value_stack) - len(symbols) - 1
      # markup has to be applied in reverse order of the group_bounds list
      for j in range(len(group_bounds) - 1, -1, -1):
        k, l, tag, _ = group_bounds[j]
        k += base
        if l != 1:
          # collapse the l stack slots of this group into a single slot
          value_stack[k + 1:k + l + 1] = [value_stack[k + l]]
        sys.stderr.write(
          'text \'{0:s}\' tag \'{1:s}\'\n'.format(
            text[value_stack[k]:value_stack[k + 1]],
            tag
          )
        )
      del value_stack[base + 1:]
      del state_stack[base + 1:]
      if reduce == 0:
        # start production reduced: parse is complete
        assert base == 0
        return
      items, item_to_index, _, _ = (
        self.lookahead_item_set_goto(state_stack[-1], reduce)
      )
      assert len(items) != 0
    else:
      raise Exception(
        'syntax error at {0:d}: {1:s}'.format(i, text[i:])
      )
+
def parse_yychunk(self, root, pos, off, factory, yychunk_iter):
  # Parse text stored in the element tree `root` starting at (pos, off),
  # pulling further chunks from yychunk_iter on demand, and apply markup
  # via work.apply_markup()/factory as productions are reduced.
  # Returns None once the start production is reduced; raises Exception on
  # a syntax error.
  if pos < 0:
    pos, off = element.to_start_relative(root, pos, off)

  items = [(0, 0, [self.eof_terminal, self.eof_terminal + 1])]
  item_to_index = {(0, 0): 0}
  value_stack = []
  state_stack = []
  text = element.get_text(root, pos)
  # advance (pos, off) past empty chunks to the first lookahead character
  while off >= len(text):
    if pos < len(root):
      pos += 1
      off = 0
    else:
      try:
        next(yychunk_iter)
      except StopIteration:
        lookahead_character = self.eof_terminal
        break
      text = element.get_text(root, pos)
  else:
    lookahead_character = ord(text[off])
  while True:
    self.lookahead_item_set_closure(items, item_to_index)
    value_stack.append((pos, off))
    state_stack.append(items)
    items, item_to_index, reductions, _, _ = (
      self.lookahead_item_set_shift(items, lookahead_character)
    )
    if len(items) != 0:
      if len(reductions) != 0:
        # bug fix: this originally iterated the undefined name
        # `next_lookahead_item_set` (NameError); after the shift above,
        # `items` holds the shifted item set
        sys.stderr.write(
          'shift/reduce conflict: {0:s} vs {1:s}\n'.format(
            ','.join([str(i) for i, _, _ in items]),
            ','.join([str(i) for i in reductions])
          )
        )
      off += 1
      # advance to the next lookahead character, fetching chunks as needed
      while off >= len(text):
        if pos < len(root):
          pos += 1
          off = 0
        else:
          try:
            next(yychunk_iter)
          except StopIteration:
            lookahead_character = self.eof_terminal
            break
          text = element.get_text(root, pos)
      else:
        lookahead_character = ord(text[off])
    elif len(reductions) != 0:
      if len(reductions) != 1:
        sys.stderr.write(
          'reduce/reduce conflict: {0:s}\n'.format(
            ','.join([str(i) for i in reductions])
          )
        )
      reduce = min(reductions)
      _, symbols, _, group_bounds = self.productions[reduce]
      base = len(value_stack) - len(symbols) - 1
      end_relative = len(value_stack)
      # markup has to be applied in reverse order of the group_bounds list
      for j in range(len(group_bounds) - 1, -1, -1):
        k, l, tag, kwargs = group_bounds[j]
        k += base
        assert k < end_relative
        if l != 1:
          # collapse the l stack slots of this group into a single slot
          value_stack[k + 1:k + l + 1] = [value_stack[k + l]]
          end_relative = max(k + 1, end_relative + 1 - l)
        # positions past the markup point must be end-relative, because
        # apply_markup() changes the tree to the left of them
        while end_relative > k + 1:
          end_relative -= 1
          pos1, off1 = value_stack[end_relative]
          value_stack[end_relative] = (
            element.to_end_relative(root, pos1, off1)
          )
        pos0, off0 = value_stack[k]
        pos1, off1 = value_stack[k + 1]
        work.apply_markup(
          root,
          pos0,
          off0,
          pos1,
          off1,
          factory,
          tag,
          **kwargs
        )
      if end_relative < len(value_stack):
        # re-resolve the current position after the tree was rewritten
        pos, off = value_stack[-1]
        pos, off = element.to_start_relative(root, pos, off)
        text = element.get_text(root, pos)
      del value_stack[base + 1:]
      del state_stack[base + 1:]
      if reduce == 0:
        # start production reduced: parse is complete
        assert base == 0
        return
      items, item_to_index, _, _ = (
        self.lookahead_item_set_goto(state_stack[-1], reduce)
      )
      assert len(items) != 0
    else:
      raise Exception(
        'syntax error at {0:d},{1:d}: {2:s}'.format(pos, off, text[off:])
      )
+
def to_clr1(self):
  # Build the canonical LR(1) automaton: every distinct (item, lookahead)
  # set becomes its own state. Returns an lr1dfa.LR1DFA whose states map
  # terminal intervals to shift/reduce actions and nonterminal intervals
  # to gotos. Conflicts are reported to stderr (shift preferred, then the
  # lowest-numbered reduction).
  _lr1dfa = lr1dfa.LR1DFA(
    [],
    [
      (len(symbols), group_bounds)
      for _, symbols, _, group_bounds in self.productions
    ],
    self.n_terminals,
    self.eof_terminal
  )

  # initial state: start production with EOF as lookahead
  items = [(0, 0, [self.eof_terminal, self.eof_terminal + 1])]
  item_to_index = {(0, 0): 0}
  self.lookahead_item_set_closure(items, item_to_index)

  # canonical key includes the lookahead sets (unlike LALR(1))
  items = sorted(items)
  key = tuple((i, j, tuple(k)) for i, j, k in items)
  state_to_items = [items]
  items_to_state = {key: 0}

  # state_to_items doubles as the work list: states are processed in
  # discovery order, add_state() appends newly discovered ones
  while len(_lr1dfa.states) < len(state_to_items):
    items = state_to_items[len(_lr1dfa.states)]
    state_desc = ([], [], [], [])

    def add_state(next_items, next_item_to_index):
      # intern the closure of next_items, returning its state number
      self.lookahead_item_set_closure(next_items, next_item_to_index)
      new_items = sorted(next_items)
      key = tuple((i, j, tuple(k)) for i, j, k in new_items)
      if key in items_to_state:
        state = items_to_state[key]
      else:
        state = len(state_to_items)
        state_to_items.append(new_items)
        items_to_state[key] = state
      return state

    # sweep the terminal alphabet; each shift returns the widest interval
    # with the same decisions, so we jump ahead by whole intervals
    terminal = 0
    while terminal < self.n_terminals:
      next_items, next_item_to_index, reductions, terminal0, terminal1 = (
        self.lookahead_item_set_shift(items, terminal)
      )
      assert terminal0 == terminal and terminal1 > terminal
      if len(next_items) != 0:
        if len(reductions) != 0:
          sys.stderr.write(
            'state {0:d} shift/reduce conflict: {1:s} vs {2:s}\n'.format(
              len(_lr1dfa.states),
              ','.join([str(i) for i, _, _ in next_items]),
              ','.join([str(i) for i in reductions])
            )
          )
        action = add_state(next_items, next_item_to_index) * 2
      elif len(reductions) != 0:
        if len(reductions) != 1:
          sys.stderr.write(
            'state {0:d} reduce/reduce conflict: {1:s}\n'.format(
              len(_lr1dfa.states),
              ','.join([str(i) for i in reductions])
            )
          )
        action = min(reductions) * 2 + 1 # odd means reduce
      else:
        action = -1
      state_desc[0].append(terminal1)
      state_desc[1].append(action)
      terminal = terminal1

    # sweep the nonterminals the same way, building the goto table
    nonterminal = 0
    while nonterminal < len(self.productions):
      next_items, next_item_to_index, nonterminal0, nonterminal1 = (
        self.lookahead_item_set_goto(items, nonterminal)
      )
      assert nonterminal0 == nonterminal and nonterminal1 > nonterminal
      if len(next_items) != 0:
        goto = add_state(next_items, next_item_to_index)
      else:
        goto = -1
      state_desc[2].append(nonterminal1)
      state_desc[3].append(goto)
      nonterminal = nonterminal1

    _lr1dfa.states.append(state_desc)
  return _lr1dfa
+
def to_lalr1(self):
  # Build the LALR(1) automaton: states are keyed on item cores only
  # (production, dot) with lookahead sets merged into existing states, so
  # there are far fewer states than canonical LR(1). A work-list re-processes
  # any state whose merged lookaheads changed. Returns an lr1dfa.LR1DFA.
  _lr1dfa = lr1dfa.LR1DFA(
    [],
    [
      (len(symbols), group_bounds)
      for _, symbols, _, group_bounds in self.productions
    ],
    self.n_terminals,
    self.eof_terminal
  )

  # initial state: start production with EOF as lookahead
  items = [(0, 0, [self.eof_terminal, self.eof_terminal + 1])]
  item_to_index = {(0, 0): 0}
  self.lookahead_item_set_closure(items, item_to_index)

  items = sorted(items)
  key = tuple((i, j) for i, j, _ in items) # ignore lookahead
  state_to_items = [items]
  items_to_state = {key: 0}

  in_queue = [True]
  queue = [0]

  qhead = 0
  while qhead < len(queue):
    i = queue[qhead]
    in_queue[i] = False
    items = state_to_items[i]
    state_desc = ([], [], [], [])

    def add_state(next_items, next_item_to_index):
      # intern the closure of next_items by core; when the core already
      # exists, merge lookaheads into it and re-queue it if they changed
      self.lookahead_item_set_closure(next_items, next_item_to_index)
      new_items = sorted(next_items)
      key = tuple((i, j) for i, j, _ in new_items) # ignore lookahead
      if key in items_to_state:
        state = items_to_state[key]
        state_items = state_to_items[state]
        # same core implies the sorted item lists line up index by index
        for i in range(len(new_items)):
          j, k, lookahead_set = new_items[i]
          lookahead_set = bisect_set.bisect_set_or(lookahead_set, state_items[i][2])
          if lookahead_set != state_items[i][2]:
            state_items[i] = (j, k, lookahead_set)
            if not in_queue[state]:
              in_queue[state] = True
              queue.append(state)
      else:
        state = len(state_to_items)
        state_to_items.append(new_items)
        items_to_state[key] = state
        in_queue.append(True)
        queue.append(state)
      return state

    # sweep the terminal alphabet by whole same-decision intervals
    terminal = 0
    while terminal < self.n_terminals:
      next_items, next_item_to_index, reductions, terminal0, terminal1 = (
        self.lookahead_item_set_shift(items, terminal)
      )
      assert terminal0 == terminal and terminal1 > terminal
      if len(next_items) != 0:
        if len(reductions) != 0:
          sys.stderr.write(
            'state {0:d} shift/reduce conflict: {1:s} vs {2:s}\n'.format(
              i,
              ','.join([str(j) for j, _, _ in next_items]),
              ','.join([str(j) for j in reductions])
            )
          )
        action = add_state(next_items, next_item_to_index) * 2
      elif len(reductions) != 0:
        if len(reductions) != 1:
          sys.stderr.write(
            'state {0:d} reduce/reduce conflict: {1:s}\n'.format(
              i,
              ','.join([str(j) for j in reductions])
            )
          )
        action = min(reductions) * 2 + 1 # odd means reduce
      else:
        action = -1
      state_desc[0].append(terminal1)
      state_desc[1].append(action)
      terminal = terminal1

    # sweep the nonterminals the same way, building the goto table
    nonterminal = 0
    while nonterminal < len(self.productions):
      next_items, next_item_to_index, nonterminal0, nonterminal1 = (
        self.lookahead_item_set_goto(items, nonterminal)
      )
      assert nonterminal0 == nonterminal and nonterminal1 > nonterminal
      if len(next_items) != 0:
        goto = add_state(next_items, next_item_to_index)
      else:
        goto = -1
      state_desc[2].append(nonterminal1)
      state_desc[3].append(goto)
      nonterminal = nonterminal1

    # a re-queued state overwrites its earlier (stale) description
    if i < len(_lr1dfa.states):
      _lr1dfa.states[i] = state_desc
    else:
      _lr1dfa.states.append(state_desc)
    qhead += 1
  return _lr1dfa
+
def __repr__(self):
  # evaluable representation mirroring the constructor arguments
  return 'lr1.LR1(%s, %d, %d)' % (
    repr(self.productions),
    self.n_terminals,
    self.eof_terminal
  )
+
+
--- /dev/null
+import bisect
+import dfa
+import element
+
# defines the alphabet size, set this to 0x11000 for unicode
n_characters = 0x100

class NFA:
  # state_desc classes:
  # (STATE_CHARACTER, character_set, next_state)
  # (STATE_OR, next_state0, next_state1)
  # (STATE_AND, next_state0, next_state1)
  # (STATE_JOIN0,)
  # (STATE_JOIN1, next_state)
  # (STATE_MARK, mark_value, next_state)

  STATE_CHARACTER = 0
  STATE_OR = 1
  STATE_AND = 2
  STATE_JOIN0 = 3
  STATE_JOIN1 = 4
  STATE_MARK = 5
  join0_state = (STATE_JOIN0,)

  # multistate classes:
  # (MULTISTATE_ACCEPT, 1) accept, occupies one thread in list
  # (MULTISTATE_AND, n, state, child) and, occupies n threads in list
  # (MULTISTATE_OR, n, child0, child1)
  # n = sum of n from child states, that is, multistate[1] is the number of
  # (MULTISTATE_ACCEPT, 1) leaf states ultimately reachable from this subtree

  MULTISTATE_ACCEPT = 0
  MULTISTATE_AND = 1
  MULTISTATE_OR = 2
  accept_multistate = (MULTISTATE_ACCEPT, 1)

  def __init__(
    self,
    groups = None,
    states = None,
    start_state = None # can have multiple NFAs in same container
  ):
    # groups: list of group_desc
    # group_desc: (tag, kwargs)
    #   tag, kwargs will be passed to apply_markup() hence factory()
    # Defaults are created per instance: the previous shared-list defaults
    # ([] / [...]) would have been mutated across every NFA, since states
    # and start_state are appended to as machines are built.
    self.groups = [] if groups is None else groups
    self.states = (
      [(NFA.STATE_CHARACTER, [0, n_characters], 0)]
      if states is None else
      states
    )
    self.start_state = [] if start_state is None else start_state
+
def multistate_next(self, root_multistate, character):
  # Advance the multistate tree (the set of simultaneously live NFA
  # threads) over `character`, or over no character when character < 0
  # (used to compute the start configuration). Returns
  # (next_multistate, transition, character0, character1) where transition
  # is the list of dfa.DFA.TRANSITION_* thread-list edits that a scanner
  # must replay, and [character0, character1) is the widest character
  # interval producing exactly this result. Returns None as
  # next_multistate when every thread fails.

  # the deduplication works as effectively a second pass which goes
  # over the multistate tree in pre-order, looking for OR-disjunctions
  # of any depth and configuration, e.g. (a OR b) or (c OR d), and
  # collecting the subexpressions e.g. a, b, c OR b, c, d, c OR d, and
  # failing any subexpression that's previously occurred in pre-order
  # (children of AND-conjunctions get a fresh empty subexpression set)

  # unfortunately it can't be done as a second pass literally, because
  # we have to generate the transition list, thus we need to know if a
  # subexpression is going to be failed, so we can delete it's threads

  # therefore the deduplication is tacked onto the end of the building
  # process, examining each built node just before the builder returns,
  # if already in the subexpression set we will rewrite the transition
  # list produced by the recursive calls, otherwise we will add it in

  # the problem is determining the subexpression set to pass into any
  # recursive calls, the caller's set will be sent unchanged by cases
  # that return the result unchanged or add an OR-disjuction, an empty
  # set will be sent by cases that add an AND-conjuction to the result

  # if we have added something to the node built by a recursive call,
  # we'll fall into the deduplication logic, otherwise just return it

  def advance1(multistate, join_count, done_multistates):
    # advance an AND node (a thread positioned at an NFA state) through
    # epsilon-like states until it rests on a STATE_CHARACTER or accepts
    # modifies nonlocal: transition
    assert multistate[0] == NFA.MULTISTATE_AND
    _, _, state, child = multistate
    if state == 0:
      # note that if we reach the accepting state, we must be a top-level
      # expression, and could not be part of an AND-conjunction (because
      # AND-conjunction terms always go to a join0 or join1 state first),
      # we remove ourselves to indicate to scanner that match is complete
      assert join_count == 0
      assert child == NFA.accept_multistate
      return advance(child, 0, done_multistates)
    state_desc = self.states[state]
    if state_desc[0] == NFA.STATE_CHARACTER:
      if join_count != 0:
        # a pending join can never be satisfied past a character state
        transition.append((dfa.DFA.TRANSITION_POP, child[1]))
        return None
      len_transition = len(transition)
      # fresh done-set: we are adding an AND-conjunction to the result
      child = advance(child, 0, set())
      if child is None:
        return None
      result = (NFA.MULTISTATE_AND, child[1], state, child)
    elif state_desc[0] == NFA.STATE_OR:
      _, next_state0, next_state1 = state_desc
      len_transition = len(transition)
      # duplicate the threads so each alternative gets its own copy
      transition.append((dfa.DFA.TRANSITION_DUP, child[1]))
      child0 = advance1(
        (NFA.MULTISTATE_AND, child[1], next_state0, child),
        join_count,
        done_multistates
      )
      child1 = advance1(
        (NFA.MULTISTATE_AND, child[1], next_state1, child),
        join_count,
        done_multistates
      )
      if child0 is None:
        return child1
      if child1 is None:
        return child0
      result = (NFA.MULTISTATE_OR, child0[1] + child1[1], child0, child1)
    elif state_desc[0] == NFA.STATE_AND:
      # sequence both branches: second branch becomes the continuation
      _, next_state0, next_state1 = state_desc
      return advance1(
        (
          NFA.MULTISTATE_AND,
          child[1],
          next_state0,
          (NFA.MULTISTATE_AND, child[1], next_state1, child)
        ),
        join_count,
        done_multistates
      )
    elif state_desc[0] == NFA.STATE_JOIN0:
      # open a join: the matching STATE_JOIN1 will consume it
      return advance(child, join_count + 1, done_multistates)
    elif state_desc[0] == NFA.STATE_JOIN1:
      _, next_state = state_desc
      if join_count == 0:
        # join1 without a pending join0: this thread fails
        transition.append((dfa.DFA.TRANSITION_POP, child[1]))
        return None
      return advance1(
        (NFA.MULTISTATE_AND, child[1], next_state, child),
        join_count - 1,
        done_multistates
      )
    elif state_desc[0] == NFA.STATE_MARK:
      # record a group marker on the threads, then continue
      _, mark_value, next_state = state_desc
      transition.append((dfa.DFA.TRANSITION_MARK, child[1], mark_value))
      return advance1(
        (NFA.MULTISTATE_AND, child[1], next_state, child),
        join_count,
        done_multistates
      )
    else:
      assert False
    # deduplication: a repeated subexpression is failed and its thread
    # edits are rewritten into a single POP
    if result in done_multistates:
      transition[len_transition:] = [(dfa.DFA.TRANSITION_POP, multistate[1])]
      return None
    done_multistates.add(result)
    return result

  def advance(multistate, join_count, done_multistates):
    # advance an arbitrary multistate node over `character`
    nonlocal character0, character1 # modifies nonlocal: transition
    if multistate[0] == NFA.MULTISTATE_ACCEPT:
      assert join_count == 0
      len_transition = len(transition)
      transition.append((dfa.DFA.TRANSITION_MOVE, 1))
      result = NFA.accept_multistate # takes no arguments so use static one
    elif multistate[0] == NFA.MULTISTATE_AND:
      if character >= 0:
        # consume the character at this thread's STATE_CHARACTER
        _, _, state, child = multistate
        state_desc = self.states[state]
        assert state_desc[0] == NFA.STATE_CHARACTER
        _, character_set, next_state = state_desc
        k = bisect.bisect_right(character_set, character)
        # narrow the equivalence interval to the nearest breaks
        if k > 0 and character0 < character_set[k - 1]:
          character0 = character_set[k - 1]
        if k < len(character_set) and character1 > character_set[k]:
          character1 = character_set[k]
        if (k & 1) == 0:
          # character not in set: thread fails
          transition.append((dfa.DFA.TRANSITION_POP, child[1]))
          return None
        multistate = (NFA.MULTISTATE_AND, child[1], next_state, child)
      return advance1(multistate, join_count, done_multistates)
    elif multistate[0] == NFA.MULTISTATE_OR:
      _, _, child0, child1 = multistate
      len_transition = len(transition)
      child0 = advance(child0, join_count, done_multistates)
      child1 = advance(child1, join_count, done_multistates)
      if child0 is None:
        return child1
      if child1 is None:
        return child0
      result = (NFA.MULTISTATE_OR, child0[1] + child1[1], child0, child1)
    else:
      assert False
    # deduplication, as in advance1()
    if result in done_multistates:
      transition[len_transition:] = [(dfa.DFA.TRANSITION_POP, multistate[1])]
      return None
    done_multistates.add(result)
    return result

  transition = []
  character0 = 0
  character1 = n_characters
  root_multistate = advance(root_multistate, 0, set())
  return root_multistate, transition, character0, character1
+
@staticmethod
def multistate_accept(root_multistate):
  # Return the thread-list index of the first accepting thread in the
  # multistate tree, or -1 if no thread has reached the accepting state.
  # Fix: the original was declared without `self` yet lives in the class
  # body; it only worked when invoked as NFA.multistate_accept(...) on the
  # class. Marking it @staticmethod keeps that call form working and also
  # makes instance calls (self.multistate_accept(...)) safe.
  i = 0
  def accept(multistate):
    # pre-order walk; i accumulates the threads skipped so far
    nonlocal i
    if multistate[0] == NFA.MULTISTATE_ACCEPT:
      return True
    if multistate[0] == NFA.MULTISTATE_AND:
      # an AND node is still mid-expression: skip its child[1] threads
      _, _, _, child = multistate
      i += child[1]
      return False
    if multistate[0] == NFA.MULTISTATE_OR:
      _, _, child0, child1 = multistate
      return accept(child0) or accept(child1)
    assert False
  return i if accept(root_multistate) else -1
+
def match_text(self, text, i, start_index = 0):
  # Run the NFA (start_state[start_index]) against text[i:], simulating
  # all threads simultaneously. Returns the mark-list of the best match
  # (a linked chain of (position, mark_value, rest) tuples) or None when
  # nothing matches. Only complete matches and the best prefix match at
  # end-of-text are returned, never an incomplete match.
  def transit(transition):
    # replay the thread-list edits produced by multistate_next()
    nonlocal threads0, threads1, prefix_slop # note: also uses i
    j = prefix_slop
    for trans in transition:
      if trans[0] == dfa.DFA.TRANSITION_POP:
        # drop trans[1] failed threads
        j += trans[1]
      elif trans[0] == dfa.DFA.TRANSITION_DUP:
        # duplicate trans[1] threads into the slop area before j,
        # growing the slop (by doubling) when there is not enough room
        while j < trans[1]:
          threads0[:0] = [None] * prefix_slop
          threads1[:0] = [None] * prefix_slop
          j += prefix_slop
          prefix_slop *= 2
        threads0[j - trans[1]:j] = threads0[j:j + trans[1]]
        j -= trans[1]
      elif trans[0] == dfa.DFA.TRANSITION_MARK:
        # attach a group marker at position i to trans[1] threads
        threads0[j:j + trans[1]] = [
          (i, trans[2], thread)
          for thread in threads0[j:j + trans[1]]
        ]
      elif trans[0] == dfa.DFA.TRANSITION_MOVE:
        # carry trans[1] surviving threads over to the next generation
        threads1.extend(threads0[j:j + trans[1]])
        j += trans[1]
      #elif trans[0] == dfa.DFA.TRANSITION_DEL:
      #  del threads1[-trans[1]:]
      else:
        assert False
    assert j == len(threads0)
    threads0, threads1 = threads1, threads0
    del threads1[prefix_slop:]

  # threads0 holds the live threads after prefix_slop spare slots
  threads0 = [None, None]
  threads1 = [None]
  prefix_slop = 1

  start_state = self.start_state[start_index]
  if start_state == -1:
    return None
  # character -1 computes the start configuration without consuming input
  next_multistate, transition, _, _ = self.multistate_next(
    (NFA.MULTISTATE_AND, 1, start_state, NFA.accept_multistate),
    -1
  )
  while next_multistate is not None:
    transit(transition)
    assert len(threads0) == prefix_slop + next_multistate[1]
    if next_multistate == NFA.accept_multistate:
      # there is only one match, which is complete
      assert len(threads0) == prefix_slop + 1
      return threads0[prefix_slop]
    if i >= len(text):
      # return best match we have, but not incomplete match
      i = NFA.multistate_accept(next_multistate)
      return (None if i == -1 else threads0[prefix_slop + i])
    next_multistate, transition, _, _ = (
      self.multistate_next(next_multistate, ord(text[i]))
    )
    i += 1
  return None
+
def match_yychunk(self, root, pos, off, yychunk_iter, start_index = 0):
  # Chunked variant of match_text(): runs the NFA over text stored in the
  # element tree `root` starting at (pos, off), pulling further chunks from
  # yychunk_iter on demand. Returns the mark-list of the best match (a
  # linked chain of (pos, off, mark_value, rest) tuples) or None.
  if pos < 0:
    pos, off = element.to_start_relative(root, pos, off)

  def transit(transition):
    # replay the thread-list edits produced by multistate_next()
    nonlocal threads0, threads1, prefix_slop # note: also uses pos, off
    j = prefix_slop
    for trans in transition:
      if trans[0] == dfa.DFA.TRANSITION_POP:
        # drop trans[1] failed threads
        j += trans[1]
      elif trans[0] == dfa.DFA.TRANSITION_DUP:
        # duplicate trans[1] threads into the slop area before j,
        # growing the slop (by doubling) when there is not enough room
        while j < trans[1]:
          threads0[:0] = [None] * prefix_slop
          threads1[:0] = [None] * prefix_slop
          j += prefix_slop
          prefix_slop *= 2
        threads0[j - trans[1]:j] = threads0[j:j + trans[1]]
        j -= trans[1]
      elif trans[0] == dfa.DFA.TRANSITION_MARK:
        # attach a group marker at (pos, off) to trans[1] threads
        threads0[j:j + trans[1]] = [
          (pos, off, trans[2], thread)
          for thread in threads0[j:j + trans[1]]
        ]
      elif trans[0] == dfa.DFA.TRANSITION_MOVE:
        # carry trans[1] surviving threads over to the next generation
        threads1.extend(threads0[j:j + trans[1]])
        j += trans[1]
      #elif trans[0] == dfa.DFA.TRANSITION_DEL:
      #  del threads1[-trans[1]:]
      else:
        assert False
    assert j == len(threads0)
    threads0, threads1 = threads1, threads0
    del threads1[prefix_slop:]

  # threads0 holds the live threads after prefix_slop spare slots
  threads0 = [None, None]
  threads1 = [None]
  prefix_slop = 1

  start_state = self.start_state[start_index]
  if start_state == -1:
    return None
  # character -1 computes the start configuration without consuming input
  next_multistate, transition, _, _ = self.multistate_next(
    (NFA.MULTISTATE_AND, 1, start_state, NFA.accept_multistate),
    -1
  )
  text = element.get_text(root, pos)
  while next_multistate is not None:
    transit(transition)
    assert len(threads0) == prefix_slop + next_multistate[1]
    if next_multistate == NFA.accept_multistate:
      # there is only one match, which is complete
      assert len(threads0) == prefix_slop + 1
      return threads0[prefix_slop]
    # advance to the next character, fetching chunks as needed
    while off >= len(text):
      if pos < len(root):
        pos += 1
        off = 0
      else:
        try:
          next(yychunk_iter)
        except StopIteration:
          # return best match we have, but not incomplete match
          i = NFA.multistate_accept(next_multistate)
          return (None if i == -1 else threads0[prefix_slop + i])
      text = element.get_text(root, pos)
    next_multistate, transition, _, _ = (
      self.multistate_next(next_multistate, ord(text[off]))
    )
    off += 1
  return None
+
def to_dfa(self):
  # Subset-construct a dfa.DFA from this NFA via multistate simulation.
  # Bug fix: the result was stored in a local named `dfa`, which shadows
  # the imported `dfa` module — since `dfa` is assigned in this function,
  # the very first `dfa.DFA(...)` raised UnboundLocalError, and the later
  # `dfa.DFA.TRANSITION_*` lookups would have hit the instance. Renamed
  # the local to `_dfa` (matching `_lr1dfa` in lr1.py).
  _dfa = dfa.DFA(list(self.groups))

  # actions are (multistate, transition) pairs; states are multistates;
  # both are interned so equal configurations share one table entry
  accept_key = (NFA.accept_multistate, ())
  action_to_meaning = [accept_key]
  meaning_to_action = {accept_key: 0}
  state_to_meaning = [NFA.accept_multistate]
  meaning_to_state = {NFA.accept_multistate: 0}

  # compute the start action for each contained NFA
  for start_state in self.start_state:
    if start_state == -1:
      start_action = -1
    else:
      next_multistate, transition, _, _ = self.multistate_next(
        (NFA.MULTISTATE_AND, 1, start_state, NFA.accept_multistate),
        -1
      )
      if next_multistate is None:
        start_action = -1
      else:
        start_key = (next_multistate, tuple(transition))
        start_action = len(action_to_meaning)
        meaning_to_action[start_key] = start_action
        action_to_meaning.append(start_key)
    _dfa.start_action.append(start_action)

  # materialize actions, interning each target multistate as a state
  while len(_dfa.actions) < len(action_to_meaning):
    next_multistate, transition = action_to_meaning[len(_dfa.actions)]
    if next_multistate in meaning_to_state:
      next_state = meaning_to_state[next_multistate]
    else:
      next_state = len(state_to_meaning)
      state_to_meaning.append(next_multistate)
      meaning_to_state[next_multistate] = next_state
    _dfa.actions.append((next_state, list(transition)))

  # materialize states, sweeping the alphabet by same-result intervals
  while len(_dfa.states) < len(state_to_meaning):
    character = 0
    multistate = state_to_meaning[len(_dfa.states)]
    state_desc = ([], [], NFA.multistate_accept(multistate))
    while character < n_characters:
      next_multistate, transition, character0, character1 = self.multistate_next(
        multistate,
        character
      )
      assert character0 == character and character1 > character
      if next_multistate is None:
        action = -1
      else:
        # optimize transition (optional): coalesce adjacent POPs and
        # adjacent MOVEs into single edits, compacting in place
        i = 0
        j = 0
        while i < len(transition):
          if transition[i][0] == dfa.DFA.TRANSITION_POP:
            n = transition[i][1]
            i += 1
            while (
              i < len(transition) and
              transition[i][0] == dfa.DFA.TRANSITION_POP
            ):
              n += transition[i][1]
              i += 1
            transition[j] = (dfa.DFA.TRANSITION_POP, n)
          elif transition[i][0] == dfa.DFA.TRANSITION_MOVE:
            n = transition[i][1]
            i += 1
            while (
              i < len(transition) and
              transition[i][0] == dfa.DFA.TRANSITION_MOVE
            ):
              n += transition[i][1]
              i += 1
            transition[j] = (dfa.DFA.TRANSITION_MOVE, n)
          else:
            transition[j] = transition[i]
            i += 1
          j += 1
        del transition[j:]
        # end optimize transition
        key = (next_multistate, tuple(transition))
        if key in meaning_to_action:
          action = meaning_to_action[key]
        else:
          action = len(action_to_meaning)
          action_to_meaning.append(key)
          meaning_to_action[key] = action
      state_desc[0].append(character1)
      state_desc[1].append(action)
      character = character1
    _dfa.states.append(state_desc)
  return _dfa
+
def __repr__(self):
  # evaluable representation mirroring the constructor arguments
  return 'nfa.NFA(%s, %s, %s)' % (
    repr(self.groups),
    repr(self.states),
    repr(self.start_state)
  )
-import bisect
+import bisect_set
import element
-import work
-import sys
+import nfa
# defines the alphabet size, set this to 0x11000 for unicode
n_characters = 0x100
-def character_set_or(character_set0, character_set1):
- # calculate union of the child sets
- # we do this by calculating a series of breakpoints, at each breakpoint
- # evaluating the "or" (max) of the even/odd truth values of each child,
- # then making the output truth value even/odd by outputting if necessary
- result = []
- i = 0
- j = 0
- while True:
- if i < len(character_set0):
- k = character_set0[i]
- if j < len(character_set1):
- k = min(k, character_set1[j])
- elif j < len(character_set1):
- k = character_set1[j]
- else:
- break
- if i < len(character_set0) and character_set0[i] == k:
- i += 1
- if j < len(character_set1) and character_set1[j] == k:
- j += 1
- if (len(result) & 1) != max(i & 1, j & 1):
- result.append(k)
- assert (i & 1) == 0 and (j & 1) == 0
- return result
-
-def character_set_and(character_set0, character_set1):
- # calculate intersection of the child sets
- # we do this by calculating a series of breakpoints, at each breakpoint
- # evaluating the "and" (min) of the even/odd truth values of each child,
- # then making the output truth value even/odd by outputting if necessary
- result = []
- i = 0
- j = 0
- while True:
- if i < len(character_set0):
- k = character_set0[i]
- if j < len(character_set1):
- k = min(k, character_set1[j])
- elif j < len(character_set1):
- k = character_set1[j]
- else:
- break
- if i < len(character_set0) and character_set0[i] == k:
- i += 1
- if j < len(character_set1) and character_set1[j] == k:
- j += 1
- if (len(result) & 1) != min(i & 1, j & 1):
- result.append(k)
- assert (i & 1) == 0 and (j & 1) == 0
- return result
-
-def character_set_not(character_set):
- # calculate complement of the child set
- # if child set begins with [0], remove it, otherwise add [0] prefix
- # if child set ends with [n_characters], remove it, otherwise add [n_characters] suffix
- # the suffix part is not totally necessary, but makes sure length is even
- # (the evenness is so that single character sets can always be [c, c + 1])
- result = list(character_set)
- if result[:1] == [0]:
- del result[:1]
- else:
- result[:0] = [0]
- if result[-1:] == [n_characters]:
- del result[-1:]
- else:
- result.append(n_characters)
- return result
-
class Regex(element.Element):
# GENERATE ELEMENT() BEGIN
def __init__(
# GENERATE END
def to_nfa_state(self, nfa, next_state):
new_state = len(nfa.states)
- nfa.states.append((NFA.STATE_CHARACTER, self.character_set, next_state))
+ nfa.states.append((nfa.NFA.STATE_CHARACTER, self.character_set, next_state))
return new_state
#def to_lr1_symbols(self, n_terminals, symbols, lookaheads, group_bounds):
# terminal_set = []
# GENERATE END
def post_process(self, group_index = 0): #, rule_name_to_character_set = None):
group_index = RegexCharacter.post_process(self, group_index) #, rule_name_to_character_set)
- self.character_set = character_set_or(self[0].character_set, self[1].character_set)
+ self.character_set = bisect_set.bisect_set_or(self[0].character_set, self[1].character_set)
return group_index
class RegexCharacterAnd(RegexCharacter):
# GENERATE END
def post_process(self, group_index = 0): #, rule_name_to_character_set = None):
group_index = RegexCharacter.post_process(self, group_index) #, rule_name_to_character_set)
- self.character_set = character_set_and(self[0].character_set, self[1].character_set)
+ self.character_set = bisect_set.bisect_set_and(self[0].character_set, self[1].character_set)
return group_index
class RegexCharacterNot(RegexCharacter):
# GENERATE END
def post_process(self, group_index = 0): #, rule_name_to_character_set = None):
group_index = RegexCharacter.post_process(self, group_index) #, rule_name_to_character_set)
- self.character_set = character_set_not(self[0].character_set)
+ self.character_set = bisect_set.bisect_set_not(self[0].character_set)
return group_index
#class RegexCharacterRule(RegexCharacter):
if child1_state == -1:
return child0_state
new_state = len(nfa.states)
- nfa.states.append((NFA.STATE_OR, child0_state, child1_state))
+ nfa.states.append((nfa.STATE_OR, child0_state, child1_state))
return new_state
class RegexAnd(Regex):
# GENERATE END
def to_nfa_state(self, nfa, next_state):
join0_state = len(nfa.states)
- nfa.states.append(NFA.join0_state) # takes no arguments so use static one
+ nfa.states.append(nfa.join0_state) # takes no arguments so use static one
join1_state = len(nfa.states)
- nfa.states.append((NFA.STATE_JOIN1, next_state))
+ nfa.states.append((nfa.STATE_JOIN1, next_state))
child0_state = self[0].to_nfa_state(nfa, join0_state)
if child0_state == -1:
return -1
if child1_state == -1:
return -1
new_state = len(nfa.states)
- nfa.states.append((NFA.STATE_AND, child0_state, child1_state))
+ nfa.states.append((nfa.STATE_AND, child0_state, child1_state))
return new_state
class RegexSequence(Regex):
new_state = next_state # note: unreachable state remains invalid (None)
else:
nfa.states[new_state] = (
- (NFA.STATE_OR, next_state, child_state)
+ (nfa.STATE_OR, next_state, child_state)
if self.non_greedy else
- (NFA.STATE_OR, child_state, next_state)
+ (nfa.STATE_OR, child_state, next_state)
)
else:
new_state = next_state
break
new_state = len(nfa.states)
nfa.states.append(
- (NFA.STATE_OR, next_state, child_state)
+ (nfa.STATE_OR, next_state, child_state)
if self.non_greedy else
- (NFA.STATE_OR, child_state, next_state)
+ (nfa.STATE_OR, child_state, next_state)
)
for i in range(count0):
new_state = self[0].to_nfa_state(nfa, new_state)
return Regex.to_groups(self, groups)
def to_nfa_state(self, nfa, next_state):
mark_state = len(nfa.states)
- nfa.states.append((NFA.STATE_MARK, self.group_index * 2 + 1, next_state))
+ nfa.states.append((nfa.STATE_MARK, self.group_index * 2 + 1, next_state))
child_state = self[0].to_nfa_state(nfa, mark_state)
if child_state == -1:
return -1
new_state = len(nfa.states)
- nfa.states.append((NFA.STATE_MARK, self.group_index * 2, child_state))
+ nfa.states.append((nfa.STATE_MARK, self.group_index * 2, child_state))
return new_state
#def to_lr1_symbols(self, n_terminals, symbols, lookaheads, group_bounds):
# group_start = len(symbols)
# )
# return 1 # count of groups or ungrouped characters
-class Grammar(element.Element):
- class Production(element.Element):
- class Symbol(element.Element):
- # GENERATE ELEMENT(list(int) terminal_set, list(int) nonterminal_set) BEGIN
- def __init__(
- self,
- tag = 'Grammar_Production_Symbol',
- attrib = {},
- text = '',
- children = [],
- terminal_set = [],
- nonterminal_set = []
- ):
- element.Element.__init__(
- self,
- tag,
- attrib,
- text,
- children
- )
- self.terminal_set = (
- [element.deserialize_int(i) for i in terminal_set.split()]
- if isinstance(terminal_set, str) else
- terminal_set
- )
- self.nonterminal_set = (
- [element.deserialize_int(i) for i in nonterminal_set.split()]
- if isinstance(nonterminal_set, str) else
- nonterminal_set
- )
- def serialize(self, ref_list, indent = 0):
- element.Element.serialize(self, ref_list, indent)
- self.set(
- 'terminal_set',
- ' '.join([element.serialize_int(i) for i in self.terminal_set])
- )
- self.set(
- 'nonterminal_set',
- ' '.join([element.serialize_int(i) for i in self.nonterminal_set])
- )
- def deserialize(self, ref_list):
- element.Element.deserialize(self, ref_list)
- self.terminal_set = [
- element.deserialize_int(i)
- for i in self.get('terminal_set', '').split()
- ]
- self.nonterminal_set = [
- element.deserialize_int(i)
- for i in self.get('nonterminal_set', '').split()
- ]
- def copy(self, factory = None):
- result = element.Element.copy(
- self,
- Symbol if factory is None else factory
- )
- result.terminal_set = self.terminal_set
- result.nonterminal_set = self.nonterminal_set
- return result
- def repr_serialize(self, params):
- element.Element.repr_serialize(self, params)
- if len(self.terminal_set):
- params.append(
- 'terminal_set = [{0:s}]'.format(
- ', '.join([repr(i) for i in self.terminal_set])
- )
- )
- if len(self.nonterminal_set):
- params.append(
- 'nonterminal_set = [{0:s}]'.format(
- ', '.join([repr(i) for i in self.nonterminal_set])
- )
- )
- def __repr__(self):
- params = []
- self.repr_serialize(params)
- return 'regex.Grammar.Production.Symbol({0:s})'.format(', '.join(params))
- # GENERATE END
- def post_process(self, name_to_character_sets):
- pass
-
- class NamedSymbol(Symbol):
- # GENERATE ELEMENT(str name) BEGIN
- def __init__(
- self,
- tag = 'Grammar_Production_NamedSymbol',
- attrib = {},
- text = '',
- children = [],
- terminal_set = [],
- nonterminal_set = [],
- name = ''
- ):
- Grammar.Production.Symbol.__init__(
- self,
- tag,
- attrib,
- text,
- children,
- terminal_set,
- nonterminal_set
- )
- self.name = name
- def serialize(self, ref_list, indent = 0):
- Grammar.Production.Symbol.serialize(self, ref_list, indent)
- self.set('name', element.serialize_str(self.name))
- def deserialize(self, ref_list):
- Grammar.Production.Symbol.deserialize(self, ref_list)
- self.name = element.deserialize_str(self.get('name', ''))
- def copy(self, factory = None):
- result = Grammar.Production.Symbol.copy(
- self,
- NamedSymbol if factory is None else factory
- )
- result.name = self.name
- return result
- def repr_serialize(self, params):
- Grammar.Production.Symbol.repr_serialize(self, params)
- if self.name != '':
- params.append(
- 'name = {0:s}'.format(repr(self.name))
- )
- def __repr__(self):
- params = []
- self.repr_serialize(params)
- return 'regex.Grammar.Production.NamedSymbol({0:s})'.format(', '.join(params))
- # GENERATE END
- def post_process(self, name_to_character_sets):
- self.terminal_set, self.nonterminal_set = (
- name_to_character_sets[self.name]
- )
-
- # GENERATE ELEMENT(int nonterminal, int precedence, int associativity) BEGIN
- def __init__(
- self,
- tag = 'Grammar_Production',
- attrib = {},
- text = '',
- children = [],
- nonterminal = -1,
- precedence = -1,
- associativity = -1
- ):
- element.Element.__init__(
- self,
- tag,
- attrib,
- text,
- children
- )
- self.nonterminal = (
- element.deserialize_int(nonterminal)
- if isinstance(nonterminal, str) else
- nonterminal
- )
- self.precedence = (
- element.deserialize_int(precedence)
- if isinstance(precedence, str) else
- precedence
- )
- self.associativity = (
- element.deserialize_int(associativity)
- if isinstance(associativity, str) else
- associativity
- )
- def serialize(self, ref_list, indent = 0):
- element.Element.serialize(self, ref_list, indent)
- self.set('nonterminal', element.serialize_int(self.nonterminal))
- self.set('precedence', element.serialize_int(self.precedence))
- self.set('associativity', element.serialize_int(self.associativity))
- def deserialize(self, ref_list):
- element.Element.deserialize(self, ref_list)
- self.nonterminal = element.deserialize_int(self.get('nonterminal', '-1'))
- self.precedence = element.deserialize_int(self.get('precedence', '-1'))
- self.associativity = element.deserialize_int(self.get('associativity', '-1'))
- def copy(self, factory = None):
- result = element.Element.copy(
- self,
- Production if factory is None else factory
- )
- result.nonterminal = self.nonterminal
- result.precedence = self.precedence
- result.associativity = self.associativity
- return result
- def repr_serialize(self, params):
- element.Element.repr_serialize(self, params)
- if self.nonterminal != -1:
- params.append(
- 'nonterminal = {0:s}'.format(repr(self.nonterminal))
- )
- if self.precedence != -1:
- params.append(
- 'precedence = {0:s}'.format(repr(self.precedence))
- )
- if self.associativity != -1:
- params.append(
- 'associativity = {0:s}'.format(repr(self.associativity))
- )
- def __repr__(self):
- params = []
- self.repr_serialize(params)
- return 'regex.Grammar.Production({0:s})'.format(', '.join(params))
- # GENERATE END
- def post_process(self, nonterminal, name_to_character_sets):
- self.nonterminal = nonterminal
- for i in self:
- i.post_process(name_to_character_sets)
- def add_to_lr1(self, lr1):
- lr1.productions.append(
- (
- # precedence
- self.precedence * 2 + self.associativity,
- # symbols
- [(i.terminal_set, i.nonterminal_set) for i in self],
- # lookaheads (list of initial_set, can_be_empty)
- [([], False) for i in range(len(self))] + [([], True)],
- # group_bounds
- []
- )
- )
-
- # GENERATE ELEMENT(int n_terminals, int eof_terminal) BEGIN
- def __init__(
- self,
- tag = 'Grammar',
- attrib = {},
- text = '',
- children = [],
- n_terminals = -1,
- eof_terminal = -1
- ):
- element.Element.__init__(
- self,
- tag,
- attrib,
- text,
- children
- )
- self.n_terminals = (
- element.deserialize_int(n_terminals)
- if isinstance(n_terminals, str) else
- n_terminals
- )
- self.eof_terminal = (
- element.deserialize_int(eof_terminal)
- if isinstance(eof_terminal, str) else
- eof_terminal
- )
- def serialize(self, ref_list, indent = 0):
- element.Element.serialize(self, ref_list, indent)
- self.set('n_terminals', element.serialize_int(self.n_terminals))
- self.set('eof_terminal', element.serialize_int(self.eof_terminal))
- def deserialize(self, ref_list):
- element.Element.deserialize(self, ref_list)
- self.n_terminals = element.deserialize_int(self.get('n_terminals', '-1'))
- self.eof_terminal = element.deserialize_int(self.get('eof_terminal', '-1'))
- def copy(self, factory = None):
- result = element.Element.copy(
- self,
- Grammar if factory is None else factory
- )
- result.n_terminals = self.n_terminals
- result.eof_terminal = self.eof_terminal
- return result
- def repr_serialize(self, params):
- element.Element.repr_serialize(self, params)
- if self.n_terminals != -1:
- params.append(
- 'n_terminals = {0:s}'.format(repr(self.n_terminals))
- )
- if self.eof_terminal != -1:
- params.append(
- 'eof_terminal = {0:s}'.format(repr(self.eof_terminal))
- )
- def __repr__(self):
- params = []
- self.repr_serialize(params)
- return 'regex.Grammar({0:s})'.format(', '.join(params))
- # GENERATE END
- def post_process(self, name_to_character_sets):
- for i in range(len(self)):
- self[i].post_process(i, name_to_character_sets)
- def to_lr1(self):
- lr1 = LR1([], self.n_terminals, self.eof_terminal)
- for i in self:
- i.add_to_lr1(lr1)
- # propagate lookaheads
- modified = True
- while modified:
- modified = False
- for _, symbols, lookaheads, _ in lr1.productions:
- for i in range(len(symbols) - 1, -1, -1):
- initial_set, nonterminal_set = symbols[i]
- can_be_empty = False
- for j in range(0, len(nonterminal_set), 2):
- for k in range(nonterminal_set[j], nonterminal_set[j + 1]):
- child_initial_set, child_can_be_empty = lr1.productions[k][2][0]
- initial_set = character_set_or(initial_set, child_initial_set)
- can_be_empty = can_be_empty or child_can_be_empty
- # at this point can_be_empty refers to current symbol only
- if can_be_empty:
- next_initial_set, can_be_empty = lookaheads[i + 1]
- initial_set = character_set_or(initial_set, next_initial_set)
- # at this point can_be_empty refers to all remaining symbols
- if (initial_set, can_be_empty) != lookaheads[i]:
- lookaheads[i] = (initial_set, can_be_empty)
- modified = True
- return lr1
-
# GENERATE FACTORY(element.Element) BEGIN
tag_to_class = {
'Regex': Regex,
'RegexSequence': RegexSequence,
'RegexRepeat': RegexRepeat,
'RegexGroup': RegexGroup,
- 'RegexGroup_Attribute': RegexGroup.Attribute,
- 'Grammar': Grammar,
- 'Grammar_Production': Grammar.Production,
- 'Grammar_Production_Symbol': Grammar.Production.Symbol,
- 'Grammar_Production_NamedSymbol': Grammar.Production.NamedSymbol
+ 'RegexGroup_Attribute': RegexGroup.Attribute
}
def factory(tag, attrib = {}, *args, **kwargs):
return tag_to_class.get(tag, element.Element)(tag, attrib, *args, **kwargs)
# GENERATE END
-class NFA:
- # state_desc classes:
- # (STATE_CHARACTER, character_set, next_state)
- # (STATE_OR, next_state0, next_state1)
- # (STATE_AND, next_state0, next_state1)
- # (STATE_JOIN0,)
- # (STATE_JOIN1, next_state)
- # (STATE_MARK, mark_value, next_state)
-
- STATE_CHARACTER = 0
- STATE_OR = 1
- STATE_AND = 2
- STATE_JOIN0 = 3
- STATE_JOIN1 = 4
- STATE_MARK = 5
- join0_state = (STATE_JOIN0,)
-
- # multistate classes:
- # (MULTISTATE_ACCEPT, 1) accept, occupies one thread in list
- # (MULTISTATE_AND, n, state, child) and, occupies n threads in list
- # (MULTISTATE_OR, n, child0, child1)
- # n = sum of n from child states, that is, multistate[1] is the number of
- # (MULTISTATE_ACCEPT, 1) leaf states ultimately reachable from this subtree
-
- MULTISTATE_ACCEPT = 0
- MULTISTATE_AND = 1
- MULTISTATE_OR = 2
- accept_multistate = (MULTISTATE_ACCEPT, 1)
-
- def __init__(
- self,
- groups = [],
- states = [(STATE_CHARACTER, [0, n_characters], 0)],
- start_state = [] # can have multiple NFAs in same container
- ):
- # groups: list of group_desc
- # group_desc: (tag, kwargs)
- # tag, kwargs will be passed to apply_markup() hence factory()
- self.groups = groups
- self.states = states
- self.start_state = start_state
-
- def multistate_next(self, root_multistate, character):
- # the deduplication works as effectively a second pass which goes
- # over the multistate tree in pre-order, looking for OR-disjunctions
- # of any depth and configuration, e.g. (a OR b) or (c OR d), and
- # collecting the subexpressions e.g. a, b, c OR b, c, d, c OR d, and
- # failing any subexpression that's previously occurred in pre-order
- # (children of AND-conjunctions get a fresh empty subexpression set)
-
- # unfortunately it can't be done as a second pass literally, because
- # we have to generate the transition list, thus we need to know if a
- # subexpression is going to be failed, so we can delete it's threads
-
- # therefore the deduplication is tacked onto the end of the building
- # process, examining each built node just before the builder returns,
- # if already in the subexpression set we will rewrite the transition
- # list produced by the recursive calls, otherwise we will add it in
-
- # the problem is determining the subexpression set to pass into any
- # recursive calls, the caller's set will be sent unchanged by cases
- # that return the result unchanged or add an OR-disjuction, an empty
- # set will be sent by cases that add an AND-conjuction to the result
-
- # if we have added something to the node built by a recursive call,
- # we'll fall into the deduplication logic, otherwise just return it
-
- def advance1(multistate, join_count, done_multistates):
- # modifies nonlocal: transition
- assert multistate[0] == NFA.MULTISTATE_AND
- _, _, state, child = multistate
- if state == 0:
- # note that if we reach the accepting state, we must be a top-level
- # expression, and could not be part of an AND-conjunction (because
- # AND-conjunction terms always go to a join0 or join1 state first),
- # we remove ourselves to indicate to scanner that match is complete
- assert join_count == 0
- assert child == NFA.accept_multistate
- return advance(child, 0, done_multistates)
- state_desc = self.states[state]
- if state_desc[0] == NFA.STATE_CHARACTER:
- if join_count != 0:
- transition.append((DFA.TRANSITION_POP, child[1]))
- return None
- len_transition = len(transition)
- child = advance(child, 0, set())
- if child is None:
- return None
- result = (NFA.MULTISTATE_AND, child[1], state, child)
- elif state_desc[0] == NFA.STATE_OR:
- _, next_state0, next_state1 = state_desc
- len_transition = len(transition)
- transition.append((DFA.TRANSITION_DUP, child[1]))
- child0 = advance1(
- (NFA.MULTISTATE_AND, child[1], next_state0, child),
- join_count,
- done_multistates
- )
- child1 = advance1(
- (NFA.MULTISTATE_AND, child[1], next_state1, child),
- join_count,
- done_multistates
- )
- if child0 is None:
- return child1
- if child1 is None:
- return child0
- result = (NFA.MULTISTATE_OR, child0[1] + child1[1], child0, child1)
- elif state_desc[0] == NFA.STATE_AND:
- _, next_state0, next_state1 = state_desc
- return advance1(
- (
- NFA.MULTISTATE_AND,
- child[1],
- next_state0,
- (NFA.MULTISTATE_AND, child[1], next_state1, child)
- ),
- join_count,
- done_multistates
- )
- elif state_desc[0] == NFA.STATE_JOIN0:
- return advance(child, join_count + 1, done_multistates)
- elif state_desc[0] == NFA.STATE_JOIN1:
- _, next_state = state_desc
- if join_count == 0:
- transition.append((DFA.TRANSITION_POP, child[1]))
- return None
- return advance1(
- (NFA.MULTISTATE_AND, child[1], next_state, child),
- join_count - 1,
- done_multistates
- )
- elif state_desc[0] == NFA.STATE_MARK:
- _, mark_value, next_state = state_desc
- transition.append((DFA.TRANSITION_MARK, child[1], mark_value))
- return advance1(
- (NFA.MULTISTATE_AND, child[1], next_state, child),
- join_count,
- done_multistates
- )
- else:
- assert False
- if result in done_multistates:
- transition[len_transition:] = [(DFA.TRANSITION_POP, multistate[1])]
- return None
- done_multistates.add(result)
- return result
-
- def advance(multistate, join_count, done_multistates):
- nonlocal character0, character1 # modifies nonlocal: transition
- if multistate[0] == NFA.MULTISTATE_ACCEPT:
- assert join_count == 0
- len_transition = len(transition)
- transition.append((DFA.TRANSITION_MOVE, 1))
- result = NFA.accept_multistate # takes no arguments so use static one
- elif multistate[0] == NFA.MULTISTATE_AND:
- if character >= 0:
- _, _, state, child = multistate
- state_desc = self.states[state]
- assert state_desc[0] == NFA.STATE_CHARACTER
- _, character_set, next_state = state_desc
- k = bisect.bisect_right(character_set, character)
- if k > 0 and character0 < character_set[k - 1]:
- character0 = character_set[k - 1]
- if k < len(character_set) and character1 > character_set[k]:
- character1 = character_set[k]
- if (k & 1) == 0:
- transition.append((DFA.TRANSITION_POP, child[1]))
- return None
- multistate = (NFA.MULTISTATE_AND, child[1], next_state, child)
- return advance1(multistate, join_count, done_multistates)
- elif multistate[0] == NFA.MULTISTATE_OR:
- _, _, child0, child1 = multistate
- len_transition = len(transition)
- child0 = advance(child0, join_count, done_multistates)
- child1 = advance(child1, join_count, done_multistates)
- if child0 is None:
- return child1
- if child1 is None:
- return child0
- result = (NFA.MULTISTATE_OR, child0[1] + child1[1], child0, child1)
- else:
- assert False
- if result in done_multistates:
- transition[len_transition:] = [(DFA.TRANSITION_POP, multistate[1])]
- return None
- done_multistates.add(result)
- return result
-
- transition = []
- character0 = 0
- character1 = n_characters
- root_multistate = advance(root_multistate, 0, set())
- return root_multistate, transition, character0, character1
-
- def multistate_accept(root_multistate):
- i = 0
- def accept(multistate):
- nonlocal i
- if multistate[0] == NFA.MULTISTATE_ACCEPT:
- return True
- if multistate[0] == NFA.MULTISTATE_AND:
- _, _, _, child = multistate
- i += child[1]
- return False
- if multistate[0] == NFA.MULTISTATE_OR:
- _, _, child0, child1 = multistate
- return accept(child0) or accept(child1)
- assert False
- return i if accept(root_multistate) else -1
-
- def match_text(self, text, i, start_index = 0):
- def transit(transition):
- nonlocal threads0, threads1, prefix_slop # note: also uses i
- j = prefix_slop
- for trans in transition:
- if trans[0] == DFA.TRANSITION_POP:
- j += trans[1]
- elif trans[0] == DFA.TRANSITION_DUP:
- while j < trans[1]:
- threads0[:0] = [None] * prefix_slop
- threads1[:0] = [None] * prefix_slop
- j += prefix_slop
- prefix_slop *= 2
- threads0[j - trans[1]:j] = threads0[j:j + trans[1]]
- j -= trans[1]
- elif trans[0] == DFA.TRANSITION_MARK:
- threads0[j:j + trans[1]] = [
- (i, trans[2], thread)
- for thread in threads0[j:j + trans[1]]
- ]
- elif trans[0] == DFA.TRANSITION_MOVE:
- threads1.extend(threads0[j:j + trans[1]])
- j += trans[1]
- #elif trans[0] == DFA.TRANSITION_DEL:
- # del threads1[-trans[1]:]
- else:
- assert False
- assert j == len(threads0)
- threads0, threads1 = threads1, threads0
- del threads1[prefix_slop:]
-
- threads0 = [None, None]
- threads1 = [None]
- prefix_slop = 1
-
- start_state = self.start_state[start_index]
- if start_state == -1:
- return None
- next_multistate, transition, _, _ = self.multistate_next(
- (NFA.MULTISTATE_AND, 1, start_state, NFA.accept_multistate),
- -1
- )
- while next_multistate is not None:
- transit(transition)
- assert len(threads0) == prefix_slop + next_multistate[1]
- if next_multistate == NFA.accept_multistate:
- # there is only one match, which is complete
- assert len(threads0) == prefix_slop + 1
- return threads0[prefix_slop]
- if i >= len(text):
- # return best match we have, but not incomplete match
- i = NFA.multistate_accept(next_multistate)
- return (None if i == -1 else threads0[prefix_slop + i])
- next_multistate, transition, _, _ = (
- self.multistate_next(next_multistate, ord(text[i]))
- )
- i += 1
- return None
-
- def match_yychunk(self, root, pos, off, yychunk_iter, start_index = 0):
- if pos < 0:
- pos, off = element.to_start_relative(root, pos, off)
-
- def transit(transition):
- nonlocal threads0, threads1, prefix_slop # note: also uses pos, off
- j = prefix_slop
- for trans in transition:
- if trans[0] == DFA.TRANSITION_POP:
- j += trans[1]
- elif trans[0] == DFA.TRANSITION_DUP:
- while j < trans[1]:
- threads0[:0] = [None] * prefix_slop
- threads1[:0] = [None] * prefix_slop
- j += prefix_slop
- prefix_slop *= 2
- threads0[j - trans[1]:j] = threads0[j:j + trans[1]]
- j -= trans[1]
- elif trans[0] == DFA.TRANSITION_MARK:
- threads0[j:j + trans[1]] = [
- (pos, off, trans[2], thread)
- for thread in threads0[j:j + trans[1]]
- ]
- elif trans[0] == DFA.TRANSITION_MOVE:
- threads1.extend(threads0[j:j + trans[1]])
- j += trans[1]
- #elif trans[0] == DFA.TRANSITION_DEL:
- # del threads1[-trans[1]:]
- else:
- assert False
- assert j == len(threads0)
- threads0, threads1 = threads1, threads0
- del threads1[prefix_slop:]
-
- threads0 = [None, None]
- threads1 = [None]
- prefix_slop = 1
-
- start_state = self.start_state[start_index]
- if start_state == -1:
- return None
- next_multistate, transition, _, _ = self.multistate_next(
- (NFA.MULTISTATE_AND, 1, start_state, NFA.accept_multistate),
- -1
- )
- text = element.get_text(root, pos)
- while next_multistate is not None:
- transit(transition)
- assert len(threads0) == prefix_slop + next_multistate[1]
- if next_multistate == NFA.accept_multistate:
- # there is only one match, which is complete
- assert len(threads0) == prefix_slop + 1
- return threads0[prefix_slop]
- while off >= len(text):
- if pos < len(root):
- pos += 1
- off = 0
- else:
- try:
- next(yychunk_iter)
- except StopIteration:
- # return best match we have, but not incomplete match
- i = NFA.multistate_accept(next_multistate)
- return (None if i == -1 else threads0[prefix_slop + i])
- text = element.get_text(root, pos)
- next_multistate, transition, _, _ = (
- self.multistate_next(next_multistate, ord(text[off]))
- )
- off += 1
- return None
-
- def to_dfa(self):
- dfa = DFA(list(self.groups))
-
- accept_key = (NFA.accept_multistate, ())
- action_to_meaning = [accept_key]
- meaning_to_action = {accept_key: 0}
- state_to_meaning = [NFA.accept_multistate]
- meaning_to_state = {NFA.accept_multistate: 0}
-
- for start_state in self.start_state:
- if start_state == -1:
- start_action = -1
- else:
- next_multistate, transition, _, _ = self.multistate_next(
- (NFA.MULTISTATE_AND, 1, start_state, NFA.accept_multistate),
- -1
- )
- if next_multistate is None:
- start_action = -1
- else:
- start_key = (next_multistate, tuple(transition))
- start_action = len(action_to_meaning)
- meaning_to_action[start_key] = start_action
- action_to_meaning.append(start_key)
- dfa.start_action.append(start_action)
-
- while len(dfa.actions) < len(action_to_meaning):
- next_multistate, transition = action_to_meaning[len(dfa.actions)]
- if next_multistate in meaning_to_state:
- next_state = meaning_to_state[next_multistate]
- else:
- next_state = len(state_to_meaning)
- state_to_meaning.append(next_multistate)
- meaning_to_state[next_multistate] = next_state
- dfa.actions.append((next_state, list(transition)))
-
- while len(dfa.states) < len(state_to_meaning):
- character = 0
- multistate = state_to_meaning[len(dfa.states)]
- state_desc = ([], [], NFA.multistate_accept(multistate))
- while character < n_characters:
- next_multistate, transition, character0, character1 = self.multistate_next(
- multistate,
- character
- )
- assert character0 == character and character1 > character
- if next_multistate is None:
- action = -1
- else:
- # optimize transition (optional)
- i = 0
- j = 0
- while i < len(transition):
- if transition[i][0] == DFA.TRANSITION_POP:
- n = transition[i][1]
- i += 1
- while (
- i < len(transition) and
- transition[i][0] == DFA.TRANSITION_POP
- ):
- n += transition[i][1]
- i += 1
- transition[j] = (DFA.TRANSITION_POP, n)
- elif transition[i][0] == DFA.TRANSITION_MOVE:
- n = transition[i][1]
- i += 1
- while (
- i < len(transition) and
- transition[i][0] == DFA.TRANSITION_MOVE
- ):
- n += transition[i][1]
- i += 1
- transition[j] = (DFA.TRANSITION_MOVE, n)
- else:
- transition[j] = transition[i]
- i += 1
- j += 1
- del transition[j:]
- # end optimize transition
- key = (next_multistate, tuple(transition))
- if key in meaning_to_action:
- action = meaning_to_action[key]
- else:
- action = len(action_to_meaning)
- action_to_meaning.append(key)
- meaning_to_action[key] = action
- state_desc[0].append(character1)
- state_desc[1].append(action)
- character = character1
- dfa.states.append(state_desc)
- return dfa
-
- def __repr__(self):
- return 'regex.NFA({0:s}, {1:s}, {2:s})'.format(
- repr(self.groups),
- repr(self.states),
- repr(self.start_state)
- )
-
-class DFA:
- # transition classes:
- # instructions to transform thread list from a multistate to next multistate
- # (TRANSITION_POP, n) j += n
- # (TRANSITION_DUP, n) threads0[j - n:j] = threads0[j:j + n]
- # j -= n
- # (TRANSITION_MARK, n, mark_value) threads0[j:j + n] = [
- # (i, mark, thread)
- # for thread in threads0[j:j + n]
- # ]
- # (TRANSITION_MOVE, n) threads1.extend(threads0[j:j + n])
- # j += n
- # (TRANSITION_DEL, n) del threads1[-n:]
-
- TRANSITION_POP = 0
- TRANSITION_DUP = 1
- TRANSITION_MARK = 2
- TRANSITION_MOVE = 3
- #TRANSITION_DEL = 4
-
- def __init__(
- self,
- groups = [],
- states = [([n_characters], [0], 0)],
- actions = [(0, [])],
- start_action = [] # can have multiple DFAs in same container
- ):
- # groups: list of group_desc
- # group_desc: (tag, kwargs)
- # tag, kwargs will be passed to apply_markup() hence factory()
- # states: list of state_desc
- # state_desc: (list of breaks, list of action to do, accept_thread)
- # actions: list of action_desc
- # action_desc: (state to go to next, compiled transition to do first)
- # accept_thread: which thread of thread list to use, -1 don't accept
- self.groups = groups
- self.states = states
- self.actions = actions
- self.start_action = start_action
-
- def match_text(self, text, i, start_index = 0):
- def transit(transition):
- nonlocal threads0, threads1, prefix_slop # note: also uses i
- j = prefix_slop
- for trans in transition:
- if trans[0] == DFA.TRANSITION_POP:
- j += trans[1]
- elif trans[0] == DFA.TRANSITION_DUP:
- while j < trans[1]:
- threads0[:0] = [None] * prefix_slop
- threads1[:0] = [None] * prefix_slop
- j += prefix_slop
- prefix_slop *= 2
- threads0[j - trans[1]:j] = threads0[j:j + trans[1]]
- j -= trans[1]
- elif trans[0] == DFA.TRANSITION_MARK:
- threads0[j:j + trans[1]] = [
- (i, trans[2], thread)
- for thread in threads0[j:j + trans[1]]
- ]
- elif trans[0] == DFA.TRANSITION_MOVE:
- threads1.extend(threads0[j:j + trans[1]])
- j += trans[1]
- #elif trans[0] == DFA.TRANSITION_DEL:
- # del threads1[-trans[1]:]
- else:
- assert False
- assert j == len(threads0)
- threads0, threads1 = threads1, threads0
- del threads1[prefix_slop:]
-
- threads0 = [None, None]
- threads1 = [None]
- prefix_slop = 1
-
- action = self.start_action[start_index]
- while action != -1:
- state, transition = self.actions[action]
- #print('i', i, 'action', action, 'state', state, 'transition', transition)
- transit(transition)
- if state == 0:
- # there is only one match, which is complete
- assert len(threads0) == prefix_slop + 1
- return threads0[prefix_slop]
- if i >= len(text):
- # return best match we have, but not incomplete match
- i = self.states[state][2]
- return (None if i == -1 else threads0[prefix_slop + i])
- action = self.states[state][1][
- bisect.bisect_right(self.states[state][0], ord(text[i]))
- ]
- i += 1
- return None
-
- def match_yychunk(self, root, pos, off, yychunk_iter, start_index = 0):
- if pos < 0:
- pos, off = element.to_start_relative(root, pos, off)
-
- def transit(transition):
- nonlocal threads0, threads1, prefix_slop # note: also uses pos, off
- j = prefix_slop
- for trans in transition:
- if trans[0] == DFA.TRANSITION_POP:
- j += trans[1]
- elif trans[0] == DFA.TRANSITION_DUP:
- while j < trans[1]:
- threads0[:0] = [None] * prefix_slop
- threads1[:0] = [None] * prefix_slop
- j += prefix_slop
- prefix_slop *= 2
- threads0[j - trans[1]:j] = threads0[j:j + trans[1]]
- j -= trans[1]
- elif trans[0] == DFA.TRANSITION_MARK:
- threads0[j:j + trans[1]] = [
- (pos, off, trans[2], thread)
- for thread in threads0[j:j + trans[1]]
- ]
- elif trans[0] == DFA.TRANSITION_MOVE:
- threads1.extend(threads0[j:j + trans[1]])
- j += trans[1]
- #elif trans[0] == DFA.TRANSITION_DEL:
- # del threads1[-trans[1]:]
- else:
- assert False
- assert j == len(threads0)
- threads0, threads1 = threads1, threads0
- del threads1[prefix_slop:]
-
- threads0 = [None, None]
- threads1 = [None]
- prefix_slop = 1
-
- action = self.start_action[start_index]
- text = element.get_text(root, pos)
- while action != -1:
- state, transition = self.actions[action]
- transit(transition)
- if state == 0:
- # there is only one match, which is complete
- assert len(threads0) == prefix_slop + 1
- return threads0[prefix_slop]
- while off >= len(text):
- if pos < len(root):
- pos += 1
- off = 0
- else:
- try:
- next(yychunk_iter)
- except StopIteration:
- # return best match we have, but not incomplete match
- i = self.states[state][2]
- return (None if i == -1 else threads0[prefix_slop + i])
- text = element.get_text(root, pos)
- #print(
- # 'state {0:d} pos {1:d} off {2:d} text "{3:s}"'.format(
- # state,
- # pos,
- # off,
- # text.replace('\n', '$')
- # )
- #)
- action = self.states[state][1][
- bisect.bisect_right(self.states[state][0], ord(text[off]))
- ]
- off += 1
- return None
-
- def yylex(self, root, pos, off, factory, yychunk_iter):
- if pos < 0:
- pos, off = element.to_start_relative(root, pos, off)
-
- while True:
- # note: pointers must be kept start relative during the below call,
- # because it extends the following text by calling the yychunk_iter
- thread = self.match_yychunk(root, pos, off, yychunk_iter)
- if thread is None:
- break
- stack = []
- while True:
- pos, off, mark_value, thread = thread
- group_index = mark_value >> 1
- if (mark_value & 1) != 0:
- end_pos, end_off = element.to_end_relative(root, pos, off)
- stack.append((end_pos, end_off, group_index))
- else:
- end_pos, end_off, temp = stack.pop()
- assert temp == group_index
- if len(stack) == 0:
- break
- tag, kwargs = self.groups[group_index]
- if tag != '':
- work.apply_markup(
- root,
- pos,
- off,
- end_pos,
- end_off,
- factory,
- tag,
- **kwargs
- )
- # note: pointers must be kept end relative during the below call,
- # because it modifies the preceding text by calling apply_markup()
- yield end_pos, end_off, group_index
- pos, off = element.to_start_relative(root, end_pos, end_off)
-
- def __repr__(self):
- return 'regex.DFA({0:s}, {1:s}, {2:s}, {3:s})'.format(
- repr(self.groups),
- repr(self.states),
- repr(self.actions),
- repr(self.start_action)
- )
-
-class LR1:
- def __init__(
- self,
- productions = [],
- n_terminals = n_characters + 1,
- eof_terminal = n_characters
- ):
- # productions: list of production
- # production: (
- # priority,
- # symbols,
- # lookaheads,
- # group_bounds
- # )
- # priority: bit 0 = right to left, bits 1: = numeric priority
- # symbols: list of symbol_desc
- # symbol_desc: (terminal_set, nonterminal_set)
- # terminal_set: similar to character_set, even length list of pairs of breaks
- # nonterminal_set: as above but has n_terminals subtracted from breaks
- # lookaheads: list of lookahead_desc, len(lookaheads) = len(symbols) + 1
- # lookahead_desc: (initial_set, can_be_empty)
- # initial_set: what terminals can occur at this position in symbols array,
- # computed such that lookaheads[i][0] is symbols[i][0], plus initial set of
- # each nonterminal from symbols[i][1], plus lookaheads[i + 1][0] if any
- # nonterminal of symbols[i][1] can be empty (may pull in i + 2, i + 3 etc)
- # can_be_empty: whether all symbols from this position to end can be empty
- # note: last lookahead_desc is a sentinel consisting of ([], True), so that
- # initial_set is empty (will be augmented by context) and can_be_empty is
- # True (because all symbols from the end to the end can obviously be empty)
- # group_bounds: list of group_bound
- # group_bound: (start_index, end_index, tag, kwargs)
- # where start_index, end_index are indices into list of character_set,
- # and tag, kwargs will be passed to apply_markup() hence factory(),
- # noting that markup has to be applied in reverse order of the list
- # n_terminals: offset to apply to productions[] index to get symbol
- # (character set code), also symbol for productions[0] = start production
- # eof_terminal: usually == n_terminals - 1 (must be valid terminal value)
- self.productions = productions
- self.n_terminals = n_terminals
- self.eof_terminal = eof_terminal
-
- def lookahead_item_set_closure(self, items, item_to_index):
- in_queue = [True for i in range(len(items))]
- queue = list(range(len(items)))
-
- qhead = 0
- while qhead < len(queue):
- i = queue[qhead]
- in_queue[i] = False
- j, k, lookahead_set = items[i]
- _, symbols, lookaheads, _ = self.productions[j]
- if k < len(symbols):
- _, nonterminal_set = symbols[k]
- if len(nonterminal_set):
- next_lookahead_set, next_can_be_empty = lookaheads[k + 1]
- if next_can_be_empty:
- next_lookahead_set = character_set_or(
- next_lookahead_set,
- lookahead_set
- )
- for l in range(0, len(nonterminal_set), 2):
- for m in range(nonterminal_set[l], nonterminal_set[l + 1]):
- key = (m, 0)
- if key in item_to_index:
- n = item_to_index[key]
- child_lookahead_set = character_set_or(
- items[n][2],
- next_lookahead_set
- )
- if child_lookahead_set != items[n][2]:
- items[n] = (m, 0, child_lookahead_set)
- if not in_queue[n]:
- # optimization: do not re-queue unless it is transparent
- # to changes in the lookahead set wrt. further propagation
- _, child_symbols, child_lookaheads, _ = self.productions[m]
- if len(child_symbols) and child_lookaheads[1][1]:
- in_queue[n] = True
- queue.append(n)
- else:
- n = len(items)
- items.append((m, 0, next_lookahead_set))
- item_to_index[key] = n
- in_queue.append(True)
- queue.append(n)
- qhead += 1
-
- def lookahead_item_set_shift(self, items, terminal):
- next_items = []
- next_item_to_index = {}
- reductions = set()
- terminal0 = 0
- terminal1 = self.n_terminals
- for i, j, lookahead_set in items:
- _, symbols, _, _ = self.productions[i]
- if j < len(symbols):
- terminal_set, _ = symbols[j]
- k = bisect.bisect_right(terminal_set, terminal)
- if k > 0 and terminal0 < terminal_set[k - 1]:
- terminal0 = terminal_set[k - 1]
- if k < len(terminal_set) and terminal1 > terminal_set[k]:
- terminal1 = terminal_set[k]
- if (k & 1) == 1:
- next_item_to_index[(i, j + 1)] = len(next_items)
- next_items.append((i, j + 1, lookahead_set))
- else:
- k = bisect.bisect_right(lookahead_set, terminal)
- if k > 0 and terminal0 < lookahead_set[k - 1]:
- terminal0 = lookahead_set[k - 1]
- if k < len(lookahead_set) and terminal1 > lookahead_set[k]:
- terminal1 = lookahead_set[k]
- if (k & 1) == 1:
- reductions.add(i)
- return next_items, next_item_to_index, reductions, terminal0, terminal1
-
- def lookahead_item_set_goto(self, items, nonterminal):
- next_items = []
- next_item_to_index = {}
- reductions = set()
- nonterminal0 = 0
- nonterminal1 = len(self.productions)
- for i, j, lookahead_set in items:
- _, symbols, _, _ = self.productions[i]
- if j < len(symbols):
- _, nonterminal_set = symbols[j]
- k = bisect.bisect_right(nonterminal_set, nonterminal)
- if k > 0 and nonterminal0 < nonterminal_set[k - 1]:
- nonterminal0 = nonterminal_set[k - 1]
- if k < len(nonterminal_set) and nonterminal1 > nonterminal_set[k]:
- nonterminal1 = nonterminal_set[k]
- if (k & 1) == 1:
- next_item_to_index[(i, j + 1)] = len(next_items)
- next_items.append((i, j + 1, lookahead_set))
- return next_items, next_item_to_index, nonterminal0, nonterminal1
-
- def parse_text(self, text, i):
- items = [(0, 0, [self.eof_terminal, self.eof_terminal + 1])]
- item_to_index = {(0, 0): 0}
- value_stack = []
- state_stack = []
- lookahead_character = ord(text[i]) if i < len(text) else self.eof_terminal
- while True:
- self.lookahead_item_set_closure(items, item_to_index)
- value_stack.append(i)
- state_stack.append(items)
- items, item_to_index, reductions, _, _ = (
- self.lookahead_item_set_shift(items, lookahead_character)
- )
- if len(items) != 0:
- if len(reductions) != 0:
- sys.stderr.write(
- 'shift/reduce conflict: {0:s} vs {1:s}\n'.format(
- ','.join([str(i) for i, _, _ in next_items]),
- ','.join([str(i) for i in reductions])
- )
- )
- i += 1
- lookahead_character = ord(text[i]) if i < len(text) else self.eof_terminal
- elif len(reductions) != 0:
- if len(reductions) != 1:
- sys.stderr.write(
- 'reduce/reduce conflict: {0:s}\n'.format(
- ','.join([str(i) for i in reductions])
- )
- )
- reduce = min(reductions)
- _, symbols, _, group_bounds = self.productions[reduce]
- base = len(value_stack) - len(symbols) - 1
- for j in range(len(group_bounds) - 1, -1, -1):
- k, l, tag, _ = group_bounds[j]
- k += base
- if l != 1:
- value_stack[k + 1:k + l + 1] = [value_stack[k + l]]
- sys.stderr.write(
- 'text \'{0:s}\' tag \'{1:s}\'\n'.format(
- text[value_stack[k]:value_stack[k + 1]],
- tag
- )
- )
- del value_stack[base + 1:]
- del state_stack[base + 1:]
- if reduce == 0:
- assert base == 0
- return
- items, item_to_index, _, _ = (
- self.lookahead_item_set_goto(state_stack[-1], reduce)
- )
- assert len(items) != 0
- else:
- raise Exception(
- 'syntax error at {0:d}: {1:s}'.format(i, text[i:])
- )
-
- def parse_yychunk(self, root, pos, off, factory, yychunk_iter):
- if pos < 0:
- pos, off = element.to_start_relative(root, pos, off)
-
- items = [(0, 0, [self.eof_terminal, self.eof_terminal + 1])]
- item_to_index = {(0, 0): 0}
- value_stack = []
- state_stack = []
- text = element.get_text(root, pos)
- while off >= len(text):
- if pos < len(root):
- pos += 1
- off = 0
- else:
- try:
- next(yychunk_iter)
- except StopIteration:
- lookahead_character = self.eof_terminal
- break
- text = element.get_text(root, pos)
- else:
- lookahead_character = ord(text[off])
- while True:
- self.lookahead_item_set_closure(items, item_to_index)
- value_stack.append((pos, off))
- state_stack.append(items)
- items, item_to_index, reductions, _, _ = (
- self.lookahead_item_set_shift(items, lookahead_character)
- )
- if len(items) != 0:
- if len(reductions) != 0:
- sys.stderr.write(
- 'shift/reduce conflict: {0:s} vs {1:s}\n'.format(
- ','.join([str(i) for i, _ in next_lookahead_item_set.keys()]),
- ','.join([str(i) for i in reductions])
- )
- )
- off += 1
- while off >= len(text):
- if pos < len(root):
- pos += 1
- off = 0
- else:
- try:
- next(yychunk_iter)
- except StopIteration:
- lookahead_character = self.eof_terminal
- break
- text = element.get_text(root, pos)
- else:
- lookahead_character = ord(text[off])
- elif len(reductions) != 0:
- if len(reductions) != 1:
- sys.stderr.write(
- 'reduce/reduce conflict: {0:s}\n'.format(
- ','.join([str(i) for i in reductions])
- )
- )
- reduce = min(reductions)
- _, symbols, _, group_bounds = self.productions[reduce]
- base = len(value_stack) - len(symbols) - 1
- end_relative = len(value_stack)
- for j in range(len(group_bounds) - 1, -1, -1):
- k, l, tag, kwargs = group_bounds[j]
- k += base
- assert k < end_relative
- if l != 1:
- value_stack[k + 1:k + l + 1] = [value_stack[k + l]]
- end_relative = max(k + 1, end_relative + 1 - l)
- while end_relative > k + 1:
- end_relative -= 1
- pos1, off1 = value_stack[end_relative]
- value_stack[end_relative] = (
- element.to_end_relative(root, pos1, off1)
- )
- pos0, off0 = value_stack[k]
- pos1, off1 = value_stack[k + 1]
- work.apply_markup(
- root,
- pos0,
- off0,
- pos1,
- off1,
- factory,
- tag,
- **kwargs
- )
- if end_relative < len(value_stack):
- pos, off = value_stack[-1]
- pos, off = element.to_start_relative(root, pos, off)
- text = element.get_text(root, pos)
- del value_stack[base + 1:]
- del state_stack[base + 1:]
- if reduce == 0:
- assert base == 0
- return
- items, item_to_index, _, _ = (
- self.lookahead_item_set_goto(state_stack[-1], reduce)
- )
- assert len(items) != 0
- else:
- raise Exception(
- 'syntax error at {0:d},{1:d}: {2:s}'.format(pos, off, text[off:])
- )
-
- def to_clr1(self):
- lr1dfa = LR1DFA(
- [],
- [
- (len(symbols), group_bounds)
- for _, symbols, _, group_bounds in self.productions
- ],
- self.n_terminals,
- self.eof_terminal
- )
-
- items = [(0, 0, [self.eof_terminal, self.eof_terminal + 1])]
- item_to_index = {(0, 0): 0}
- self.lookahead_item_set_closure(items, item_to_index)
-
- items = sorted(items)
- key = tuple((i, j, tuple(k)) for i, j, k in items)
- state_to_items = [items]
- items_to_state = {key: 0}
-
- while len(lr1dfa.states) < len(state_to_items):
- items = state_to_items[len(lr1dfa.states)]
- state_desc = ([], [], [], [])
-
- def add_state(next_items, next_item_to_index):
- self.lookahead_item_set_closure(next_items, next_item_to_index)
- new_items = sorted(next_items)
- key = tuple((i, j, tuple(k)) for i, j, k in new_items)
- if key in items_to_state:
- state = items_to_state[key]
- else:
- state = len(state_to_items)
- state_to_items.append(new_items)
- items_to_state[key] = state
- return state
-
- terminal = 0
- while terminal < self.n_terminals:
- next_items, next_item_to_index, reductions, terminal0, terminal1 = (
- self.lookahead_item_set_shift(items, terminal)
- )
- assert terminal0 == terminal and terminal1 > terminal
- if len(next_items) != 0:
- if len(reductions) != 0:
- sys.stderr.write(
- 'state {0:d} shift/reduce conflict: {1:s} vs {2:s}\n'.format(
- len(lr1dfa.states),
- ','.join([str(i) for i, _, _ in next_items]),
- ','.join([str(i) for i in reductions])
- )
- )
- action = add_state(next_items, next_item_to_index) * 2
- elif len(reductions) != 0:
- if len(reductions) != 1:
- sys.stderr.write(
- 'state {0:d} reduce/reduce conflict: {1:s}\n'.format(
- len(lr1dfa.states),
- ','.join([str(i) for i in reductions])
- )
- )
- action = min(reductions) * 2 + 1 # odd means reduce
- else:
- action = -1
- state_desc[0].append(terminal1)
- state_desc[1].append(action)
- terminal = terminal1
-
- nonterminal = 0
- while nonterminal < len(self.productions):
- next_items, next_item_to_index, nonterminal0, nonterminal1 = (
- self.lookahead_item_set_goto(items, nonterminal)
- )
- assert nonterminal0 == nonterminal and nonterminal1 > nonterminal
- if len(next_items) != 0:
- goto = add_state(next_items, next_item_to_index)
- else:
- goto = -1
- state_desc[2].append(nonterminal1)
- state_desc[3].append(goto)
- nonterminal = nonterminal1
-
- lr1dfa.states.append(state_desc)
- return lr1dfa
-
- def to_lalr1(self):
- lr1dfa = LR1DFA(
- [],
- [
- (len(symbols), group_bounds)
- for _, symbols, _, group_bounds in self.productions
- ],
- self.n_terminals,
- self.eof_terminal
- )
-
- items = [(0, 0, [self.eof_terminal, self.eof_terminal + 1])]
- item_to_index = {(0, 0): 0}
- self.lookahead_item_set_closure(items, item_to_index)
-
- items = sorted(items)
- key = tuple((i, j) for i, j, _ in items) # ignore lookahead
- state_to_items = [items]
- items_to_state = {key: 0}
-
- in_queue = [True]
- queue = [0]
-
- qhead = 0
- while qhead < len(queue):
- i = queue[qhead]
- in_queue[i] = False
- items = state_to_items[i]
- state_desc = ([], [], [], [])
-
- def add_state(next_items, next_item_to_index):
- self.lookahead_item_set_closure(next_items, next_item_to_index)
- new_items = sorted(next_items)
- key = tuple((i, j) for i, j, _ in new_items) # ignore lookahead
- if key in items_to_state:
- state = items_to_state[key]
- state_items = state_to_items[state]
- for i in range(len(new_items)):
- j, k, lookahead_set = new_items[i]
- lookahead_set = character_set_or(lookahead_set, state_items[i][2])
- if lookahead_set != state_items[i][2]:
- state_items[i] = (j, k, lookahead_set)
- if not in_queue[state]:
- in_queue[state] = True
- queue.append(state)
- else:
- state = len(state_to_items)
- state_to_items.append(new_items)
- items_to_state[key] = state
- in_queue.append(True)
- queue.append(state)
- return state
-
- terminal = 0
- while terminal < self.n_terminals:
- next_items, next_item_to_index, reductions, terminal0, terminal1 = (
- self.lookahead_item_set_shift(items, terminal)
- )
- assert terminal0 == terminal and terminal1 > terminal
- if len(next_items) != 0:
- if len(reductions) != 0:
- sys.stderr.write(
- 'state {0:d} shift/reduce conflict: {1:s} vs {2:s}\n'.format(
- i,
- ','.join([str(j) for j, _, _ in next_items]),
- ','.join([str(j) for j in reductions])
- )
- )
- action = add_state(next_items, next_item_to_index) * 2
- elif len(reductions) != 0:
- if len(reductions) != 1:
- sys.stderr.write(
- 'state {0:d} reduce/reduce conflict: {1:s}\n'.format(
- i,
- ','.join([str(j) for j in reductions])
- )
- )
- action = min(reductions) * 2 + 1 # odd means reduce
- else:
- action = -1
- state_desc[0].append(terminal1)
- state_desc[1].append(action)
- terminal = terminal1
-
- nonterminal = 0
- while nonterminal < len(self.productions):
- next_items, next_item_to_index, nonterminal0, nonterminal1 = (
- self.lookahead_item_set_goto(items, nonterminal)
- )
- assert nonterminal0 == nonterminal and nonterminal1 > nonterminal
- if len(next_items) != 0:
- goto = add_state(next_items, next_item_to_index)
- else:
- goto = -1
- state_desc[2].append(nonterminal1)
- state_desc[3].append(goto)
- nonterminal = nonterminal1
-
- if i < len(lr1dfa.states):
- lr1dfa.states[i] = state_desc
- else:
- lr1dfa.states.append(state_desc)
- qhead += 1
- return lr1dfa
-
- def __repr__(self):
- return 'regex.LR1({0:s}, {1:d}, {2:d})'.format(
- repr(self.productions),
- self.n_terminals,
- self.eof_terminal
- )
-
-class LR1DFA:
- def __init__(
- self,
- states = [],
- productions = [],
- n_terminals = n_characters + 1,
- eof_terminal = n_characters
- ):
- # states: list of state_desc
- # state_desc: (terminal breaks, actions, nonterminal breaks, gotos)
- # action: shift = new state * 2, reduce = production * 2 + 1, error = -1
- # goto: reduce = production, error = -1 (but error can't really happen)
- # productions: list of production
- # production: (len(symbols), group_bounds)
- # len(symbols): how many states to pop stack to reduce this production
- # group_bounds: list of group_bound
- # group_bound: (start_index, end_index, tag, kwargs)
- # where start_index, end_index are indices into list of character_set,
- # and tag, kwargs will be passed to apply_markup() hence factory(),
- # noting that markup has to be applied in reverse order of the list
- # n_terminals: offset to apply to productions[] index to get symbol
- # (character set code), also symbol for productions[0] = start production
- # eof_terminal: usually == n_terminals - 1 (must be valid terminal value)
- self.states = states
- self.productions = productions
- self.n_terminals = n_terminals
- self.eof_terminal = eof_terminal
-
- def parse_text(self, text, i):
- state = 0
- value_stack = []
- state_stack = []
- lookahead_character = ord(text[i]) if i < len(text) else self.eof_terminal
- while True:
- value_stack.append(i)
- state_stack.append(state)
- action = self.states[state][1][
- bisect.bisect_right(self.states[state][0], lookahead_character)
- ]
- if action == -1:
- raise Exception(
- 'syntax error at {0:d}: {1:s}'.format(i, text[i:])
- )
- if (action & 1) == 0:
- state = action >> 1
- i += 1
- lookahead_character = ord(text[i]) if i < len(text) else self.eof_terminal
- else:
- reduce = action >> 1
- len_symbols, group_bounds = self.productions[reduce]
- base = len(value_stack) - len_symbols - 1
- for j in range(len(group_bounds) - 1, -1, -1):
- k, l, tag, _ = group_bounds[j]
- k += base
- if l != 1:
- value_stack[k + 1:k + l + 1] = [value_stack[k + l]]
- sys.stdout.write(
- 'text \'{0:s}\' tag \'{1:s}\'\n'.format(
- text[value_stack[k]:value_stack[k + 1]],
- tag
- )
- )
- del value_stack[base + 1:]
- del state_stack[base + 1:]
- if reduce == 0:
- assert base == 0
- return
- state = self.states[state_stack[-1]][3][
- bisect.bisect_right(self.states[state_stack[-1]][2], reduce)
- ]
- assert state != -1
-
- def parse_yychunk(self, root, pos, off, factory, yychunk_iter):
- if pos < 0:
- pos, off = element.to_start_relative(root, pos, off)
-
- state = 0
- value_stack = []
- state_stack = []
- text = element.get_text(root, pos)
- while off >= len(text):
- if pos < len(root):
- pos += 1
- off = 0
- else:
- try:
- next(yychunk_iter)
- except StopIteration:
- lookahead_character = self.eof_terminal
- break
- text = element.get_text(root, pos)
- else:
- lookahead_character = ord(text[off])
- while True:
- value_stack.append((pos, off))
- state_stack.append(state)
- action = self.states[state][1][
- bisect.bisect_right(self.states[state][0], lookahead_character)
- ]
- #print('lookahead_character', lookahead_character, 'action', action)
- if action == -1:
- raise Exception(
- 'syntax error at {0:d},{1:d}: {2:s}'.format(pos, off, text[off:])
- )
- if (action & 1) == 0:
- state = action >> 1
- off += 1
- while off >= len(text):
- if pos < len(root):
- pos += 1
- off = 0
- else:
- try:
- next(yychunk_iter)
- except StopIteration:
- lookahead_character = self.eof_terminal
- break
- text = element.get_text(root, pos)
- else:
- lookahead_character = ord(text[off])
- else:
- reduce = action >> 1
- len_symbols, group_bounds = self.productions[reduce]
- base = len(value_stack) - len_symbols - 1
- end_relative = len(value_stack)
- for j in range(len(group_bounds) - 1, -1, -1):
- k, l, tag, kwargs = group_bounds[j]
- k += base
- assert k < end_relative
- if l != 1:
- value_stack[k + 1:k + l + 1] = [value_stack[k + l]]
- end_relative = max(k + 1, end_relative + 1 - l)
- while end_relative > k + 1:
- end_relative -= 1
- pos1, off1 = value_stack[end_relative]
- value_stack[end_relative] = (
- element.to_end_relative(root, pos1, off1)
- )
- pos0, off0 = value_stack[k]
- pos1, off1 = value_stack[k + 1]
- work.apply_markup(
- root,
- pos0,
- off0,
- pos1,
- off1,
- factory,
- tag,
- **kwargs
- )
- if end_relative < len(value_stack):
- pos, off = value_stack[-1]
- pos, off = element.to_start_relative(root, pos, off)
- text = element.get_text(root, pos)
- del value_stack[base + 1:]
- del state_stack[base + 1:]
- if reduce == 0:
- assert base == 0
- return
- state = self.states[state_stack[-1]][3][
- bisect.bisect_right(self.states[state_stack[-1]][2], reduce)
- ]
- assert state != -1
-
- def yyparse(self, root, pos, off, factory, yylex_iter):
- if pos < 0:
- pos, off = element.to_start_relative(root, pos, off)
-
- state = 0
- value_stack = []
- state_stack = []
- try:
- end_pos, end_off, lookahead_character = next(yylex_iter)
- except StopIteration:
- lookahead_character = self.eof_terminal
- end_pos, end_off = element.to_end_relative(root, pos, off)
- while True:
- value_stack.append((pos, off))
- state_stack.append(state)
- action = self.states[state][1][
- bisect.bisect_right(self.states[state][0], lookahead_character)
- ]
- #print('lookahead_character', lookahead_character, 'action', action)
- if action == -1:
- raise Exception(
- 'syntax error at {0:d},{1:d}: {2:d}'.format(pos, off, lookahead_character)
- )
- if (action & 1) == 0:
- state = action >> 1
- pos, off = element.to_start_relative(root, end_pos, end_off)
- try:
- end_pos, end_off, lookahead_character = next(yylex_iter)
- except StopIteration:
- lookahead_character = self.eof_terminal
- #end_pos, end_off = element.to_end_relative(root, pos, off)
- else:
- reduce = action >> 1
- len_symbols, group_bounds = self.productions[reduce]
- base = len(value_stack) - len_symbols - 1
- end_relative = len(value_stack)
- for j in range(len(group_bounds) - 1, -1, -1):
- k, l, tag, kwargs = group_bounds[j]
- k += base
- assert k < end_relative
- if l != 1:
- value_stack[k + 1:k + l + 1] = [value_stack[k + l]]
- end_relative = max(k + 1, end_relative + 1 - l)
- while end_relative > k + 1:
- end_relative -= 1
- pos1, off1 = value_stack[end_relative]
- value_stack[end_relative] = (
- element.to_end_relative(root, pos1, off1)
- )
- pos0, off0 = value_stack[k]
- pos1, off1 = value_stack[k + 1]
- work.apply_markup(
- root,
- pos0,
- off0,
- pos1,
- off1,
- factory,
- tag,
- **kwargs
- )
- if end_relative < len(value_stack):
- pos, off = value_stack[-1]
- pos, off = element.to_start_relative(root, pos, off)
- del value_stack[base + 1:]
- del state_stack[base + 1:]
- if reduce == 0:
- assert base == 0
- return
- state = self.states[state_stack[-1]][3][
- bisect.bisect_right(self.states[state_stack[-1]][2], reduce)
- ]
- assert state != -1
-
- def __repr__(self):
- return 'regex.LR1DFA({0:s}, {1:s}, {2:d}, {3:d})'.format(
- repr(self.states),
- repr(self.productions),
- self.n_terminals,
- self.eof_terminal
- )
-
-def wrap_repr(text, width):
- lines = []
- i = 0
- while i < len(text):
- j = i + width
- if j < len(text):
- j = max(
- [
- text.rfind('(', i, j) + 1,
- text.rfind('[', i, j) + 1,
- text.rfind('{', i, j) + 1,
- text.rfind('.', i, j) + 1,
- text.rfind(')', i, j + 1),
- text.rfind(']', i, j + 1),
- text.rfind('}', i, j + 1),
- text.rfind(' ', i, j + 1),
- ]
- )
- assert j > 0
- lines.append(text[i:j] + '\n')
- i = j
- while text[i:i + 1] == ' ':
- i += 1
- return ''.join(lines)
-
if __name__ == '__main__':
+ import sys
import xml.etree.ElementTree
regex = RegexAnd(children = [RegexRepeat(children = [RegexCharacterNot(