assert hdr[2] == len(tree) & 0xff
assert hdr[3] == len(tree) >> 8
-assert tree[0x1a] == 0xa9 # lda #NN
-assert tree[0x1e] == 0xa9 # lda #NN
-src = tree[0x1b] | (tree[0x1f] << 8)
+assert tree[0x15] == 0xa9 # lda #NN
+assert tree[0x19] == 0xa9 # lda #NN
+src = tree[0x16] | (tree[0x1a] << 8)
-assert tree[0x22] == 0xa9 # lda #NN
-assert tree[0x26] == 0xa9 # lda #NN
-dest = tree[0x23] + (tree[0x27] << 8)
+assert tree[0x1d] == 0xa9 # lda #NN
+assert tree[0x21] == 0xa9 # lda #NN
+dest = tree[0x1e] + (tree[0x22] << 8)
-assert tree[0x2a] == 0xa9 # lda #NN
-assert tree[0x2e] == 0xa9 # lda #NN
-count = tree[0x2b] + (tree[0x2f] << 8)
+assert tree[0x25] == 0xa9 # lda #NN
+assert tree[0x29] == 0xa9 # lda #NN
+count = tree[0x26] + (tree[0x2a] << 8)
-assert tree[0x32] == 0xa9 # lda #NN
-assert tree[0x34] & ~0x20 == 0x18 # clc
-bits = tree[0x33] | ((tree[0x34] & 0x20) << 3)
+assert tree[0x2d] == 0xa9 # lda #NN
+assert tree[0x2f] & ~0x20 == 0x18 # clc
+bits = tree[0x2e] | ((tree[0x2f] & 0x20) << 3)
-assert tree[0x65] == 0x4c # jmp NNNN
-start = tree[0x66] | (tree[0x67] << 8)
+assert tree[0x60] == 0x4c # jmp NNNN
+start = tree[0x61] | (tree[0x62] << 8)
-assert src == load_addr + len(tree) - 0x300
+assert tree[0x65] == 0xe9 # sbc #NN
+break3 = tree[0x66]
+assert tree[0x67] == 0xc9 # cmp #NN
+break1 = (tree[0x68] + break3) & 0xff
+
+assert tree[0x7a] == 0xc9 # cmp #NN
+break2 = tree[0x7b]
+
+assert src == load_addr + len(tree) - 0x200
dest += 1
bin = [0] * (dest - start)
print(f'tree {len(tree):04x} bin {len(bin):04x}')
byte = _bytes.pop()
if bits & 0x100:
j = decode(
- [tree[-0x300 + byte], tree[-0x200 + byte]],
- 0x40 |
- ((tree[-0x100 + byte] & 1) << 7) |
- ((tree[-0x100 + byte] & 0x80) << 1),
+ [tree[-0x200 + byte], tree[-0x100 + byte]],
+ (
+ 0x40
+ if byte < break1 else
+ 0x140
+ if byte < break2 else
+ 0x1c0
+ if byte < break3 else
+ 0xc0
+ ),
j
)
else:
n = 8 - padding
count |= (-1 << 16)
-i = len(tree) - 0x300
+i = len(tree) - 0x200
j = len(bin)
while count < 0:
i -= n
- assert i >= 0x98
+ assert i >= 0x91
j = decode(tree[i:i + n], bits, j)
i -= 1
- assert i >= 0x98 - 1 # last one is garbage
+ assert i >= 0x91 - 1 # last one is garbage
bits = (tree[i] << 1) | 1
n = 8
count += 1
-assert i == 0x98 - 1 # last one is garbage
+assert i == 0x91 - 1 # last one is garbage
assert j == 0
load_addr = start
#!/usr/bin/env python3
+import bisect
import numpy
import sys
from heapdict import heapdict
with open(tree_loader_bin, 'rb') as fin:
tree_loader = list(fin.read())
-assert len(tree_loader) == 0x98
+assert len(tree_loader) == 0x91
with open(in_bin, 'rb') as fin:
data = list(fin.read())
freq[pair] = 0
freq[pair] -= 1
-pairs = []
-for i in range(0x100, 0x200):
+# define a gray code to circularly partition the token space 0x100..0x1ff
+# this makes tokens that have a left child occupy a contiguous region, and
+# tokens that have a right child a contiguous region, and they may overlap
+# we initially define the token space as 0x100..0x4ff and collapse it later
+gray_code = {
+ (False, False): 0x100,
+ (False, True ): 0x200,
+ (True , True ): 0x300,
+ (True , False): 0x400,
+}
+
+tokens = []
+for i in range(0x100):
best, _ = freq.peekitem()
#_, best = min([(v, k) for k, v in freq.items()])
- pairs.append(best)
+
+ token = i | gray_code[(best[0] >= 0x100, best[1] >= 0x100)]
+ tokens.append((token, best))
+
j = 0
while True:
try:
if j >= 1:
pair = (text[j - 1], best[0])
freq[pair] += 1
- pair = (pair[0], i)
+ pair = (pair[0], token)
if pair not in freq:
freq[pair] = 0
freq[pair] -= 1
if j + 2 < len(text):
pair = (best[1], text[j + 2])
freq[pair] += 1
- pair = (i, pair[1])
+ pair = (token, pair[1])
if pair not in freq:
freq[pair] = 0
freq[pair] -= 1
- text[j:j + 2] = [i]
+ text[j:j + 2] = [token]
j += 1
assert freq[best] == 0
del freq[best]
+# collapse the token space 0x100..0x4ff to 0x100..0x1ff
+tokens = sorted(tokens)
+xlate = list(range(0x100)) + [0x200] * 0x400
+for i in range(len(tokens)):
+ xlate[tokens[i][0]] = 0x100 + i
+pairs = [(xlate[left], xlate[right]) for _, (left, right) in tokens]
+tokens = [token for token, _ in tokens]
+text = [xlate[i] for i in text]
+
+# find the gray code transitions in the collapsed token space
+break1 = bisect.bisect_left(tokens, 0x200)
+break2 = bisect.bisect_left(tokens, 0x300)
+break3 = bisect.bisect_left(tokens, 0x400)
+
pairs = numpy.array(pairs, numpy.uint16)
text = numpy.array(text, numpy.uint16)
left = (pairs[:, 0] & 0xff).astype(numpy.uint8)
+assert not numpy.any(pairs[:break2, 0] >> 8)
+assert numpy.all(pairs[break2:, 0] >> 8)
+
right = (pairs[:, 1] & 0xff).astype(numpy.uint8)
-high = ((pairs[:, 0] >> 8) | ((pairs[:, 1] >> 1) & 0x80)).astype(numpy.uint8)
+assert not numpy.any(pairs[:break1, 1] >> 8)
+assert numpy.all(pairs[break1:break3, 1] >> 8)
+assert not numpy.any(pairs[break3:, 1] >> 8)
n = text.shape[0]
m = (n + 7) >> 3
text = list(text)
left = list(left)
right = list(right)
-high = list(high)
-tree = tree_loader + text + left + right + high
+tree = tree_loader + text + left + right
print('bin', f'{len(bin):04x}', 'tree', f'{len(tree):04x}')
src = load_addr + len(tree_loader) + len(text)
high1 = dest + 0x200
dest -= 1
-assert tree[5] == 0xb9 # lda NNNN,y
-tree[6] = left0 & 0xff
-tree[7] = left0 >> 8
-assert tree[8] == 0x99 # sta NNNN,y
-tree[9] = left1 & 0xff
-tree[0xa] = left1 >> 8
-
-assert tree[0xb] == 0xb9 # lda NNNN,y
-tree[0xc] = right0 & 0xff
-tree[0xd] = right0 >> 8
-assert tree[0xe] == 0x99 # sta NNNN,y
-tree[0xf] = right1 & 0xff
-tree[0x10] = right1 >> 8
-
-assert tree[0x11] == 0xb9 # lda NNNN,y
-tree[0x12] = high0 & 0xff
-tree[0x13] = high0 >> 8
-assert tree[0x14] == 0x99 # sta NNNN,y
-tree[0x15] = high1 & 0xff
-tree[0x16] = high1 >> 8
-
-assert tree[0x1a] == 0xa9 # lda #NN
-tree[0x1b] = src & 0xff
-assert tree[0x1e] == 0xa9 # lda #NN
-tree[0x1f] = src >> 8
-
-assert tree[0x22] == 0xa9 # lda #NN
-tree[0x23] = dest & 0xff
-assert tree[0x26] == 0xa9 # lda #NN
-tree[0x27] = dest >> 8
-
-assert tree[0x2a] == 0xa9 # lda #NN
-tree[0x2b] = count & 0xff
-assert tree[0x2e] == 0xa9 # lda #NN
-tree[0x2f] = count >> 8
-
-assert tree[0x32] == 0xa9 # lda #NN
-tree[0x33] = bits & 0xff
-assert tree[0x34] == 0x18 # clc
-tree[0x34] |= (bits >> 3) & 0x20 # clc or sec
-
-assert tree[0x65] == 0x4c # jmp NNNN
-tree[0x66] = start & 0xff
-tree[0x67] = start >> 8
-
-assert tree[0x6a] == 0xbd # lda NNNN,x
-tree[0x6b] = high1 & 0xff
-tree[0x6c] = high1 >> 8
-
-assert tree[0x6e] == 0xbd # lda NNNN,x
-tree[0x6f] = right1 & 0xff
-tree[0x70] = right1 >> 8
-
-assert tree[0x7f] == 0xbd # lda NNNN,x
-tree[0x80] = high1 & 0xff
-tree[0x81] = high1 >> 8
-
-assert tree[0x83] == 0xbd # lda NNNN,x
-tree[0x84] = left1 & 0xff
-tree[0x85] = left1 >> 8
+assert tree[6] == 0xb9 # lda NNNN,y
+tree[7] = left0 & 0xff
+tree[8] = left0 >> 8
+assert tree[9] == 0x99 # sta NNNN,y
+tree[0xa] = left1 & 0xff
+tree[0xb] = left1 >> 8
+
+assert tree[0xc] == 0xb9 # lda NNNN,y
+tree[0xd] = right0 & 0xff
+tree[0xe] = right0 >> 8
+assert tree[0xf] == 0x99 # sta NNNN,y
+tree[0x10] = right1 & 0xff
+tree[0x11] = right1 >> 8
+
+assert tree[0x15] == 0xa9 # lda #NN
+tree[0x16] = src & 0xff
+assert tree[0x19] == 0xa9 # lda #NN
+tree[0x1a] = src >> 8
+
+assert tree[0x1d] == 0xa9 # lda #NN
+tree[0x1e] = dest & 0xff
+assert tree[0x21] == 0xa9 # lda #NN
+tree[0x22] = dest >> 8
+
+assert tree[0x25] == 0xa9 # lda #NN
+tree[0x26] = count & 0xff
+assert tree[0x29] == 0xa9 # lda #NN
+tree[0x2a] = count >> 8
+
+assert tree[0x2d] == 0xa9 # lda #NN
+tree[0x2e] = bits & 0xff
+assert tree[0x2f] == 0x18 # clc
+tree[0x2f] |= (bits >> 3) & 0x20 # clc or sec
+
+assert tree[0x60] == 0x4c # jmp NNNN
+tree[0x61] = start & 0xff
+tree[0x62] = start >> 8
+
+assert tree[0x65] == 0xe9 # sbc #NN
+tree[0x66] = break3
+assert tree[0x67] == 0xc9 # cmp #NN
+tree[0x68] = (break1 - break3) & 0xff
+
+assert tree[0x69] == 0xbd # lda NNNN,x
+tree[0x6a] = right1 & 0xff
+tree[0x6b] = right1 >> 8
+
+assert tree[0x7a] == 0xc9 # cmp #NN
+tree[0x7b] = break2
+
+assert tree[0x7c] == 0xbd # lda NNNN,x
+tree[0x7d] = left1 & 0xff
+tree[0x7e] = left1 >> 8
load_size = len(tree)
hdr = [load_addr & 0xff, load_addr >> 8, load_size & 0xff, load_size >> 8]