Make better use of numpy.nonzero in compression step, borrowing from pyacc2.git

author Nick Downing <downing.nick@gmail.com>

Sun, 22 Jul 2018 06:13:44 +0000 (16:13 +1000)

committer Nick Downing <downing.nick@gmail.com>

Sun, 22 Jul 2018 06:13:44 +0000 (16:13 +1000)
author Nick Downing <downing.nick@gmail.com>
Sun, 22 Jul 2018 06:13:44 +0000 (16:13 +1000)
committer Nick Downing <downing.nick@gmail.com>
Sun, 22 Jul 2018 06:13:44 +0000 (16:13 +1000)
diff --git a/flex_dfa.py b/flex_dfa.py

index ea85fa1..015f0b3 100644 (file)
--- a/flex_dfa.py
+++ b/flex_dfa.py
@@ -71,12 +71,12 @@ class FlexDFA:
      self.accept = numpy.zeros((0x100,), numpy.int16)
      n_accept = 1
  
-    # transition_table[i, j] is transition on character j in state i
+    # transitions[i, j] is transition on character j in state i
      # in our way of thinking, 0 is don't care and -1 is failure
      # however, in the flex way these are both 0 (don't care),
      # the distinction being that failure has no associated action
      # thus all entries of states are filled, with 0 as a catch-all
-    transition_table = numpy.zeros((0x100, 0x101), numpy.int16)
+    transitions = numpy.zeros((0x100, 0x101), numpy.int16)
  
      # generate acclist, accept and transition tables
      while n_accept < len(flex_state_to_action):
@@ -88,6 +88,7 @@ class FlexDFA:
        transit(transition)
        #print(threads0[prefix_slop:])
        if n_accept >= self.accept.shape[0]:
+        # extend accept
          new_accept = numpy.zeros(
            (self.accept.shape[0] * 2,),
            numpy.int16
@@ -95,7 +96,6 @@ class FlexDFA:
          new_accept[:self.accept.shape[0]] = self.accept
          self.accept = new_accept
        self.accept[n_accept] = n_acclist
-      flex_accept = []
        for k in [j for i in threads0[prefix_slop:] for j in i]:
          acc = k >> 1
          if k & 1:
@@ -110,6 +110,7 @@ class FlexDFA:
            # mark start of (hopefully safe) trailing context
            acc |= FlexDFA.YY_TRAILING_HEAD_MASK
          if n_acclist >= self.acclist.shape[0]:
+          # extend acclist
            new_acclist = numpy.zeros(
              (self.acclist.shape[0] * 2,),
              numpy.int16
@@ -120,13 +121,14 @@ class FlexDFA:
          n_acclist += 1
  
        # calculate transition row from _dfa.state character-to-action table
-      if n_accept >= transition_table.shape[0]:
-        new_transition_table = numpy.zeros(
-          (transition_table.shape[0] * 2, 0x101),
+      if n_accept >= transitions.shape[0]:
+        # extend transitions
+        new_transitions = numpy.zeros(
+          (transitions.shape[0] * 2, 0x101),
            numpy.int16
          )
-        new_transition_table[:transition_table.shape[0], :] = transition_table
-        transition_table = new_transition_table
+        new_transitions[:transitions.shape[0], :] = transitions
+        transitions = new_transitions
        breaks, actions, _ = _dfa.states[state]
        character0 = 0
        for i in range(len(breaks)):
@@ -138,15 +140,15 @@ class FlexDFA:
            next_flex_state = len(flex_state_to_action)
            action_to_flex_state[next_action] = next_flex_state
            flex_state_to_action.append(next_action)
-        transition_table[n_accept, character0:character1] = next_flex_state
+        transitions[n_accept, character0:character1] = next_flex_state
          character0 = character1
        assert character0 == 0x100
  
        n_accept += 1
  
      # remap NUL transitions to 0x100 and replace with EOB transitions
-    transition_table[:, 0x100] = transition_table[:, 0]
-    transition_table[:, 0] = eob_state
+    transitions[:, 0x100] = transitions[:, 0]
+    transitions[:, 0] = eob_state
  
      # finalize the acclist and accept tables
      self.acclist = self.acclist[:n_acclist]
@@ -174,46 +176,47 @@ class FlexDFA:
      while n_states < n_accept:
        # find most similar/earliest existing state to use as default
        mask = (
-        transition_table[n_states:n_states + 1, :] !=
-        transition_table[:n_states, :]
+        transitions[n_states:n_states + 1, :] !=
+        transitions[:n_states, :]
        )
-      diff = numpy.sum(mask, 1)
-      flex_def = numpy.argmin(diff, 0)
-      if diff[flex_def] == 0:
+      default_state = numpy.argmin(numpy.sum(mask, 1), 0)
+      indices = numpy.nonzero(mask[default_state, :])[0]
+      if indices.shape[0] == 0:
          # when copying another state, need to have the same base, though
          # the base will not matter since there will be no entries, it is
          # is because of the awkward way the compressed lookup is written
-        flex_base = self.states[flex_def, 0]
+        start_index = self.states[default_state, 0]
        else:
-        mask = mask[flex_def, :]
-
-        # make sure entries_used array is at least large enough to find a spot
+        # make sure entries array is at least large enough to find a spot
          while self.entries.shape[0] < n_entries + 0x101:
+          # extend entries, copying only n_entries entries
            new_entries = numpy.full(
              (self.entries.shape[0] * 2, 2),
              -1,
              numpy.int16
            )
-          new_entries[:self.entries.shape[0], :] = self.entries
+          new_entries[:n_entries, :] = self.entries[:n_entries, :]
            self.entries = new_entries
+
+          # extend entries_used, copying only n_entries entries
            new_entries_used = numpy.zeros(
              (entries_used.shape[0] * 2,),
              numpy.bool
            )
-          new_entries_used[:entries_used.shape[0]] = entries_used
+          new_entries_used[:n_entries] = entries_used[:n_entries]
            entries_used = new_entries_used
  
          # find a suitable spot and store differences from default state
-        flex_base = 0
-        while flex_base < n_entries:
-          if not numpy.any(entries_used[flex_base:flex_base + 0x101] & mask):
+        start_index = 0
+        while start_index < n_entries:
+          if not numpy.any(entries_used[indices + start_index]):
              break
-          flex_base += 1
-        entries_used[flex_base:flex_base + 0x101] |= mask
-        for i in numpy.nonzero(mask)[0]:
-          self.entries[flex_base + i, 0] = transition_table[n_states, i]
-          self.entries[flex_base + i, 1] = n_states
-        n_entries = max(n_entries, flex_base + 0x101)
+          start_index += 1
+        self.entries[indices + start_index, 0] = transitions[n_states, indices]
+        self.entries[indices + start_index, 1] = n_states
+        entries_used[indices + start_index] = True
+        if n_entries < start_index + 0x101:
+          n_entries = start_index + 0x101
  
        if n_states >= self.states.shape[0]:
          new_states = numpy.zeros(
@@ -222,8 +225,8 @@ class FlexDFA:
          )
          new_states[:self.states.shape[0], :] = self.states
          self.states = new_states
-      self.states[n_states, 0] = flex_base
-      self.states[n_states, 1] = flex_def
+      self.states[n_states, 0] = start_index
+      self.states[n_states, 1] = default_state
        n_states += 1
  
      # finalize states and entries tables
author	Nick Downing <downing.nick@gmail.com>
	Sun, 22 Jul 2018 06:13:44 +0000 (16:13 +1000)
committer	Nick Downing <downing.nick@gmail.com>
	Sun, 22 Jul 2018 06:13:44 +0000 (16:13 +1000)