n_characters = 0x100
class NFA:
- # state_desc classes:
+ # NFA state_desc classes:
# (STATE_CHARACTER, character_set, next_state)
# (STATE_OR, next_state0, next_state1)
# (STATE_AND, next_state0, next_state1)
# (STATE_JOIN1, next_state)
# (STATE_MARK, mark_value, next_state)
+ # the STATE_CHARACTER and STATE_OR are self-explanatory (standard for NFA)
+
+ # the STATE_AND is an innovation where the two branches lead to STATE_JOIN0
+ # and STATE_JOIN1 respectively and require the text to match both branches,
+ # but the groups recorded will be from the *right* branch only -- the left
+ # branch can be used to control the length of the match (minimum or maximum)
+ # -- after the first branch reaches STATE_JOIN0 and the second branch reaches
+ # STATE_JOIN1 on the *same* character input, we know both branches matched
+ # the same length of text, so the first branch can be deleted and the second
+ # branch continues to match text starting with the next_state recorded in it
+
+ # the STATE_MARK is used for recording groups, so basically when the opening
+ # or closing parenthesis of a group is encountered, that position in the text
+ # is marked with the given mark_value, and the marks are recorded in a linked
+ # list that works its way backwards through the text -- marks are recorded
+ # in "threads", where each thread is basically the head of a linked list of
+ # marks, and when STATE_OR is encountered, the thread is duplicated so that
+ # each thread collects its own mark list independently from that character on
+ # -- the collection of marks needs to be very lightweight (just creating an
+ # entry in a linked list, which is cheap) because threads can easily die if
+ # they reach a dead end in the NFA, causing their marks to be discarded -- we
+ # don't do the heavyweight group processing until we have reached the end of
+ # the text, and then we choose the first surviving thread and use its marks
+
+ # NFA state 0 links to itself and accepts any character repeatedly forever
+
STATE_CHARACTER = 0
STATE_OR = 1
STATE_AND = 2
join0_state = (STATE_JOIN0,)
# multistate classes:
- # (MULTISTATE_ACCEPT, 1) accept, occupies one thread in list
- # (MULTISTATE_AND, n, state, child) and, occupies n threads in list
+ # (MULTISTATE_ACCEPT, 1)
+ # (MULTISTATE_AND, n, state, child)
# (MULTISTATE_OR, n, child0, child1)
- # n = sum of n from child states, that is, multistate[1] is the number of
+
+ # a multistate is basically a set of state numbers we could be in, based on
+ # the character stream received so far, and the principle is that given a new
+ # character we can generate the next multistate by replacing each of the
+ # state numbers in the multistate with the (possibly empty) set of state
+ # numbers reachable in NFA-fashion from that original state by that character
+
+ # however, the storage format is as a tree rather than a set, which helps to
+ # implement our special hierarchical features such as STATE_AND matching --
+ # in the common case a multistate is a tree of MULTISTATE_OR nodes where each
+ # leaf corresponds to an entry in the multistate set (in the common case, a
+ # state number) -- but such a leaf is implemented as basically a linked list
+ # of MULTISTATE_AND nodes giving a number of states that we must be in at the
+ # same time (for STATE_AND matching), this linked list being terminated by a
+ # MULTISTATE_ACCEPT which matches anything -- so, suppose we only want to be
+ # in state k, the leaf is then (MULTISTATE_AND, 1, k, (MULTISTATE_ACCEPT, 1))
+
+ # to make the multistate work like a set, so that if the same state number is
+ # reachable in multiple ways then it is only recorded once in the multistate,
+ # we have a deduplication algorithm that works in post-order and makes sure
+ # that if there are duplicate subtrees in the multistate then the later one
+ # (in post-order) will be failed leaving only the original one which has the
+ # higher priority -- and the groups recorded are from the original one only
+
+ # processing a MULTISTATE_AND means generating a new (possibly empty) set of
+ # candidate states from a starting candidate state, but this has to be done
+ # separately for each entry in the MULTISTATE_AND linked list and the result
+ # is only returned if all entries in the list have at least one candidate --
+ # in this process we also handle the STATE_JOIN0 and STATE_JOIN1 nodes, as we
+ # know that MULTISTATE_AND to be joined will always be adjacent in the list,
+ # so if we see a STATE_JOIN0 we generate no candidate for that list-entry and
+ # instead pass a flag into the processing of the next entry to say that the
+ # previous MULTISTATE_AND wants to join with it -- when the flag is true it
+ # means that a candidate is only generated if a MULTISTATE_JOIN1 is seen --
+ # but it has to be a *count* not a flag because when the previous entry is
+ # requesting to join, the following entry might actually be several entries
+ # that need to join first before becoming able to join onto that requester
+ # (and note that when a MULTISTATE_AND node wants to join onto the next node
+ # in the list, that node might not be another MULTISTATE_AND node, e.g. if it
+ # is a MULTISTATE_OR, then the join count is passed into both branches, and
+ # each of them can either join the requester, or fail to join, independently)
+
+ # n = number of threads in thread list covered by a node, calculated as the
+ # sum of the n from child states, that is, multistate[1] is the number of
# (MULTISTATE_ACCEPT, 1) leaf states ultimately reachable from this subtree
+ # -- the n is needed so that if the entire subtree fails, we can pop all its
+ # threads from the thread list immediately without having to recurse into it
+
+ # MULTISTATE_ACCEPT means to accept any character forever (like NFA state 0)
MULTISTATE_ACCEPT = 0
MULTISTATE_AND = 1
MULTISTATE_OR = 2
accept_multistate = (MULTISTATE_ACCEPT, 1)
+ # note that NFA state 0 and MULTISTATE_ACCEPT are counterparts, they mean to
+ # keep accepting any character forever -- it works because the actual wanted
+ # text is protected by grouping so isn't affected by scanning more characters
+ # -- considering only one thread is at MULTISTATE_ACCEPT at any time because
+ # of deduplication, it means when the only thead is here then we know we have
+ # finished, or at least any further scanning will not record anything useful
+ # (the matching has failed or succeeded depending on if groups were recorded)
+
def __init__(
self,
states = [(STATE_CHARACTER, [0, n_characters], 0)],
self.start_state = start_state
def multistate_next(self, root_multistate, character):
+ # deduplication notes:
+
# the deduplication works as effectively a second pass which goes
# over the multistate tree in pre-order, looking for OR-disjunctions
# of any depth and configuration, e.g. (a OR b) or (c OR d), and
# if we have added something to the node built by a recursive call,
# we'll fall into the deduplication logic, otherwise just return it
+ # end of deduplication notes
+
+ # when we get to a MULTISTATE_AND node it is time to refer to the NFA, so
+ # this routine recurses through the NFA to find possibly multiple candidate
+ # states -- the recursion is terminated by finding a STATE_CHARACTER, as we
+ # do not know what character is coming next, so the STATE_CHARACTER is a
+ # candidate state to record in the set of states that we could be in now
+ # (when this routine returns a MULTISTATE_AND node, the state referred to
+ # is always a STATE_CHARACTER, but this invariant does not necessarily hold
+ # for the first multistate before we have passed the character -1 into it)
def advance1(multistate, join_count, done_multistates):
# modifies nonlocal: transition
assert multistate[0] == NFA.MULTISTATE_AND
transition.append((dfa.DFA.TRANSITION_POP, child[1]))
return None
len_transition = len(transition)
+ # to be a candidate, the rest of the MULTISTATE_AND chain must be too
child = advance(child, 0, set())
if child is None:
return None
_, next_state0, next_state1 = state_desc
len_transition = len(transition)
transition.append((dfa.DFA.TRANSITION_DUP, child[1]))
+ # to be a candidate, the rest of the MULTISTATE_AND chain must be too,
+ # but it can be a candidate using either or both MULTISTATE_OR children
child0 = advance1(
(NFA.MULTISTATE_AND, child[1], next_state0, child),
join_count,
done_multistates.add(result)
return result
+ # this routine recurses through the multistate tree
+ # process "multistate" to a new multistate based on character "character"
+ # being received, tighten the bounds character0, character1 as appropriate
+ # note: character = -1 is a dummy character used before passing in any real
+ # characters, to generate the starting actions for initial parentheses etc
def advance(multistate, join_count, done_multistates):
nonlocal character0, character1 # modifies nonlocal: transition
if multistate[0] == NFA.MULTISTATE_ACCEPT:
root_multistate = advance(root_multistate, 0, set())
return root_multistate, transition, character0, character1
+ # return a list of the thread numbers which are accepting
def multistate_accept(root_multistate):
i = 0
result = []
result.append(i)
i += 1
elif multistate[0] == NFA.MULTISTATE_AND:
+ # a MULTISTATE_AND can never be accepting because the state it points
+ # to is always a STATE_CHARACTER and so it is waiting for a character
+ # (cannot call this routine without having passed -1 character first)
i += multistate[1]
elif multistate[0] == NFA.MULTISTATE_OR:
_, _, child0, child1 = multistate
start_state = self.start_state[start_index]
if start_state == -1:
return None
+ # make an initial linked list containing only the start state
+ # (the NFA.accept_multistate is the terminator for the list)
next_multistate, transition, _, _ = self.multistate_next(
(NFA.MULTISTATE_AND, 1, start_state, NFA.accept_multistate),
-1