--- /dev/null
+page0 = 1
+page1 = 2
+page2 = 4
+
+ .area SM (abs,ovr)
+
+; page 0 interpreter
+; stack and control transfer
+
+ .org page0 * 0x100
+
+; administrative
+
+page0_trace:
+ jp start ; will be overwritten with print_trace
+
+page0_esc:
+ ld l,c
+ ld h,b
+ jp (hl)
+
+page0_page2:
+ exx
+ pop hl
+ exx
+ inc h ; page 1
+page0_page1:
+ pop de
+ ld a,(bc)
+ inc bc
+ ld l,a
+ inc h ; page 1 (or 2 if came from page0_page2)
+ jp (hl)
+
+page0_jeq:
+ jr nz,not_taken
+page0_imm_jmp:
+ rst 8
+ ld c,l
+ ld b,h
+ jr page0_dispatch0
+
+page0_jne:
+ jr nz,page0_imm_jmp
+not_taken:
+ inc bc
+ inc bc
+ jr page0_dispatch0
+
+page0_jlt:
+ jr c,page0_imm_jmp
+ jr not_taken
+
+page0_jge:
+ jr nc,page0_imm_jmp
+ jr not_taken
+
+page0_peq:
+ call test_eq
+ jr page0_dispatch1
+
+page0_pne:
+ call test_ne
+ jr page0_dispatch1
+
+page0_plt:
+ call test_lt
+ jr page0_dispatch1
+
+page0_pge:
+ call test_ge
+ jr page0_dispatch1
+
+page0_imm_call:
+ rst 8
+ push bc
+ ld c,l
+ ld b,h
+ jr page0_dispatch0
+
+page0_ret:
+ pop bc
+page0_stkadj:
+ rst 8
+ add hl,sp
+ ld sp,hl
+page0_dispatch0:
+ ld a,(bc)
+ inc bc
+ ld l,a
+ ld h,page0
+ jp (hl)
+
+page0_stkptr:
+ rst 8
+ add hl,sp
+ ex de,hl
+page0_dispatch1:
+ ld a,(bc)
+ inc bc
+ ld l,a
+ ld h,page1
+ jp (hl)
+
+page0_stkld_w:
+ call math_stkld_w
+ jr page0_dispatch1
+
+page0_stkld_l:
+ call math_stkld_l
+page0_dispatch2:
+ ld a,(bc)
+ inc bc
+ ld l,a
+ ld h,page2
+ jp (hl)
+
+page0_imm_w:
+ call math_imm_w
+ jr page0_dispatch1
+
+page0_imm_l:
+ call math_imm_l
+ jr page0_dispatch2
+
+; page 1 interpreter
+; word arithmetic operations
+; top stack word cached in de
+
+ .org page1 * 0x100
+
+page1_trace:
+ jp print_trace
+
+page1_page0:
+ push de
+ ld a,(bc)
+ inc bc
+ ld l,a
+ dec h ; page 0
+ jp (hl)
+
+page1_call:
+ push bc
+page1_jmp:
+ ld c,e
+ ld b,d
+ jr page1_dispatch0
+
+page1_stkst_w:
+ call math_stkst_w
+page1_dispatch0:
+ ld a,(bc)
+ inc bc
+ ld l,a
+ ld h,page0
+ jp (hl)
+
+page1_stkst_l:
+ call math_stkst_l
+ jr page1_dispatch0
+
+page1_ld_w:
+ ex de,hl
+ ld e,(hl)
+ inc hl
+ ld d,(hl)
+ jr page1_dispatch1
+
+page1_ld_l:
+ ex de,hl
+ call math_ld_l
+page1_dispatch2:
+ ld a,(bc)
+ inc bc
+ ld l,a
+ ld h,page2
+ jp (hl)
+
+page1_st_w:
+ pop hl
+ ld (hl),e
+ inc hl
+ ld (hl),d
+ jr page1_dispatch0
+
+page1_imm_and_w:
+ call math_and_imm_w
+ jr page1_dispatch1
+
+page1_and_w:
+ call math_and_w
+ jr page1_dispatch1
+
+page1_imm_or_w:
+ call math_or_imm_w
+ jr page1_dispatch1
+
+page1_or_w:
+ call math_or_w
+ jr page1_dispatch1
+
+page1_imm_xor_w:
+ call math_xor_imm_w
+ jr page1_dispatch1
+
+page1_xor_w:
+ call math_xor_w
+ jr page1_dispatch1
+
+page1_imm_add_w:
+ rst 8
+ .db 0x3e ; ld a,
+page1_add_w:
+ pop hl
+ add hl,de
+ ex de,hl
+ jr page1_dispatch1
+
+page1_imm_subrev_w:
+ rst 8
+ .db 0x3e ; ld a,
+page1_sub_w:
+ pop hl
+ or a
+ sbc hl,de
+ ex de,hl
+ jr page1_dispatch1
+
+; use addition for page1_imm_sub_w
+page1_subrev_w:
+ pop hl
+ ex de,hl
+ or a
+ sbc hl,de
+ ex de,hl
+ jr page1_dispatch1
+
+page1_imm_cmprev_sw:
+ rst 8
+ .db 0x3e ; ld a,
+page1_cmp_sw:
+ pop hl
+ call math_cmprev_sw
+ jr page1_dispatch0
+
+page1_imm_cmp_sw:
+ rst 8
+ .db 0x3e ; ld a,
+page1_cmprev_sw:
+ pop hl
+ call math_cmp_sw
+ jr page1_dispatch0
+
+page1_imm_cmprev_uw:
+ rst 8
+ .db 0x3e ; ld a,
+page1_cmp_uw:
+ pop hl
+ or a
+ sbc hl,de
+ jr page1_dispatch0
+
+page1_imm_cmp_uw:
+ rst 8
+ .db 0x3e ; ld a,
+page1_cmprev_uw:
+ pop hl
+ ex de,hl
+ or a
+ sbc hl,de
+ jr page1_dispatch0
+
+page1_imm_slrev_w:
+ rst 8
+ .db 0x3e ; ld a,
+page1_sl_w:
+ pop hl
+ call math_slrev_w
+page1_dispatch1:
+ ld a,(bc)
+ inc bc
+ ld l,a
+ ld h,page1
+ jp (hl)
+
+page1_imm_sl_w:
+ call math_sl_imm_w
+ jr page1_dispatch1
+
+page1_slrev_w:
+ pop hl
+ call math_sl_w
+ jr page1_dispatch1
+
+page1_imm_srrev_sw:
+ rst 8
+ .db 0x3e ; ld a,
+page1_sr_sw:
+ pop hl
+ call math_srrev_sw
+ jr page1_dispatch1
+
+page1_imm_sr_sw:
+ ld a,(bc)
+ inc bc
+ ex de,hl
+ call math_srrev_sw0
+ jr page1_dispatch1
+
+page1_srrev_sw:
+ pop hl
+ call math_sr_sw
+ jr page1_dispatch1
+
+page1_imm_srrev_uw:
+ rst 8
+ .db 0x3e ; ld a,
+page1_sr_uw:
+ pop hl
+ call math_srrev_uw
+ jr page1_dispatch1
+
+page1_imm_sr_uw:
+ ld a,(bc)
+ inc bc
+ ex de,hl
+ call math_srrev_uw0
+ jr page1_dispatch1
+
+page1_srrev_uw:
+ pop hl
+ call math_sr_uw
+ jr page1_dispatch1
+
+page1_imm_mul_uw:
+ call math_mul_imm_w
+ jr page1_dispatch1
+
+page1_mul_w:
+ pop hl
+ call math_mul_w
+ jr page1_dispatch1
+
+page1_imm_divrev_sw:
+ rst 8
+ .db 0x3e ; ld a,
+page1_div_sw:
+ pop hl
+ call math_divrev_sw
+ ex de,hl
+ push hl
+ jr page1_dispatch1
+
+page1_imm_div_sw:
+ rst 8
+ .db 0x3e ; ld a,
+page1_divrev_sw:
+ pop hl
+ call math_div_sw
+ ex de,hl
+ push hl
+ jr page1_dispatch1
+
+page1_imm_divrev_uw:
+ rst 8
+ .db 0x3e ; ld a,
+page1_div_uw:
+ pop hl
+ call math_divrev_uw
+ ex de,hl
+ push hl
+ jr page1_dispatch1
+
+page1_imm_div_uw:
+ rst 8
+ .db 0x3e ; ld a,
+page1_divrev_uw:
+ pop hl
+ call math_div_uw
+ ex de,hl
+ push hl
+ jr page1_dispatch1
+
+; page 2 interpreter
+; long arithmetic operations
+; top stack long cached in de:hl'
+
+ .org page2 * 0x100
+
+; conditionals
+
+test_eq:
+ ld de,0
+ ret nz
+ inc de
+ ret
+
+test_ne:
+ ld de,0
+ ret z
+ inc de
+ ret
+
+test_ge:
+ ccf
+test_lt:
+ ld hl,0
+ adc hl,hl
+ ex de,hl
+ ret
+
+; math package
+
+math_imm_l: ; immediate to de:hl'
+ ld a,(bc)
+ inc bc
+ exx
+ ld l,a
+ exx
+ ld a,(bc)
+ inc bc
+ exx
+ ld h,a
+ exx
+math_imm_w: ; immediate to de
+ ld a,(bc)
+ inc bc
+ ld e,a
+ ld a,(bc)
+ inc bc
+ ld d,a
+ ret
+
+math_imm_l0: ; immediate to hl:de'
+ ld a,(bc)
+ inc bc
+ exx
+ ld e,a
+ exx
+ ld a,(bc)
+ inc bc
+ exx
+ ld d,a
+ exx
+; use rst 8 for math_imm_w0
+ ld a,(bc)
+ inc bc
+ ld l,a
+ ld a,(bc)
+ inc bc
+ ld h,a
+ ret
+
+math_stkld_w: ; sp(imm_w) to de
+ ld a,(bc)
+ inc bc
+ ld l,a
+ ld a,(bc)
+ inc bc
+ ld h,a
+ add hl,sp
+; use inline code for math_ld_w
+ ld e,(hl)
+ inc hl
+ ld d,(hl)
+ ret
+
+math_stkld_l: ; sp(imm_w) to de:hl'
+ ld a,(bc)
+ inc bc
+ ld l,a
+ ld a,(bc)
+ inc bc
+ ld h,a
+ add hl,sp
+math_ld_l: ; (hl) to de:hl'
+ ld a,(hl)
+ inc hl
+ exx
+ ld l,a
+ exx
+ ld a,(hl)
+ inc hl
+ exx
+ ld h,a
+ exx
+ ld e,(hl)
+ inc hl
+ ld d,(hl)
+ ret
+
+math_stkst_w: ; de to sp(imm_w)
+ ld a,(bc)
+ inc bc
+ ld l,a
+ ld a,(bc)
+ inc bc
+ ld h,a
+ add hl,sp
+; use inline code for math_st_w
+ ld (hl),e
+ inc hl
+ ld (hl),d
+ ret
+
+math_stkst_l: ; de:hl' to sp(imm_w)
+ ld a,(bc)
+ inc bc
+ ld l,a
+ ld a,(bc)
+ inc bc
+ ld h,a
+ add hl,sp
+math_st_l: ; de:hl' to (hl)
+ exx
+ ld a,l
+ exx
+ ld (hl),a
+ inc hl
+ exx
+ ld a,h
+ exx
+ ld (hl),a
+ inc hl
+ ld (hl),e
+ inc hl
+ ld (hl),d
+ ret
+
+math_and_imm_l: ; de:hl' &= imm_l
+ ld a,(bc)
+ inc bc
+ exx
+ and l
+ ld l,a
+ exx
+ ld a,(bc)
+ inc bc
+ exx
+ and h
+ ld h,a
+ exx
+math_and_imm_w: ; de &= imm_w
+ ld a,(bc)
+ inc bc
+ and e
+ ld e,a
+ ld a,(bc)
+ inc bc
+ and d
+ ld d,a
+ ret
+
+math_and_l: ; de:hl' &= hl:de'
+ exx
+ ld a,l
+ and e
+ ld l,a
+ ld a,h
+ and d
+ ld h,a
+ exx
+math_and_w: ; de &= hl
+ ld a,e
+ and l
+ ld e,a
+ ld a,d
+ and h
+ ld d,a
+ ret
+
+math_or_imm_l: ; de:hl' |= imm_l
+ ld a,(bc)
+ inc bc
+ exx
+ or l
+ ld l,a
+ exx
+ ld a,(bc)
+ inc bc
+ exx
+ or h
+ ld h,a
+ exx
+math_or_imm_w: ; de |= imm_w
+ ld a,(bc)
+ inc bc
+ or e
+ ld e,a
+ ld a,(bc)
+ inc bc
+ or d
+ ld d,a
+ ret
+
+math_or_l: ; de:hl' |= hl:de'
+ exx
+ ld a,l
+ or e
+ ld l,a
+ ld a,h
+ or d
+ ld h,a
+ exx
+math_or_w: ; de |= hl
+ ld a,e
+ or l
+ ld e,a
+ ld a,d
+ or h
+ ld d,a
+ ret
+
+math_xor_imm_l: ; de:hl' ^= imm_l
+ ld a,(bc)
+ inc bc
+ exx
+ xor l
+ ld l,a
+ exx
+ ld a,(bc)
+ inc bc
+ exx
+ xor h
+ ld h,a
+ exx
+math_xor_imm_w: ; de ^= imm_w
+ ld a,(bc)
+ inc bc
+ xor e
+ ld e,a
+ ld a,(bc)
+ inc bc
+ xor d
+ ld d,a
+ ret
+
+math_xor_l: ; de:hl' ^= hl:de'
+ exx
+ ld a,l
+ xor e
+ ld l,a
+ ld a,h
+ xor d
+ ld h,a
+ exx
+math_xor_w: ; de ^= hl
+ ld a,e
+ xor l
+ ld e,a
+ ld a,d
+ xor h
+ ld d,a
+ ret
+
+; use inline code for math_add_imm_w, math_add_w
+
+math_add_imm_l: ; de:hl' += imm_l
+ call math_imm_l0
+math_add_l: ; de:hl' += hl:de'
+ exx
+ add hl,de
+ exx
+ adc hl,de
+ ex de,hl
+ ret
+
+; use addition for math_sub_imm_w, math_sub_imm_l
+; use inline code for math_sub_w
+
+math_sub_l: ; de:hl' -= hl:de'
+ exx
+ or a
+ sbc hl,de
+ exx
+ ex de,hl
+ sbc hl,de
+ ex de,hl
+ ret
+
+; use inline code for math_subrev_imm_w, math_subrev_w
+
+math_subrev_imm_l: ; de:hl' = imm_l - de:hl'
+ call math_imm_l0
+math_subrev_l: ; de:hl' = hl:de' - de:hl'
+ exx
+ ex de,hl
+ or a
+ sbc hl,de
+ exx
+ sbc hl,de
+ ex de,hl
+ ret
+
+; use rst 8 then math_cmp_sw for math_cmp_imm_sw
+math_cmp_sw: ; cf=1 de < hl, zf=1 de == hl, signed
+ ex de,hl
+; use rst 8 then math_cmprev_sw for math_cmprev_imm_sw
+math_cmprev_sw: ; cf=1 hl < de, zf=1 hl == de, signed
+ or a
+ sbc hl,de
+ ld a,h
+ rla
+ ret po
+ ccf
+ ret
+
+; use rst 8 then inline code for math_cmp_imm_uw, math_cmprev_imm_uw
+; use inline code for math_cmp_uw, math_cmprev_uw
+
+math_cmp_imm_sl: ; cf=1 de:hl' < imm_l, zf=1 de:hl' == imm_l, signed
+ call math_imm_l0
+math_cmp_sl: ; cf=1 de:hl' < hl:de', zf=1 de:hl' == hl:de', signed
+ ex de,hl
+ or a
+ sbc hl,de
+ jr z,cmp_l_entry
+ ld a,h
+ rla
+ ret po
+ ccf
+ ret
+
+math_cmp_imm_ul: ; cf=1 de:hl' < imm_l, zf=1 de:hl' == imm_l, unsigned
+ call math_imm_l0
+math_cmp_ul: ; cf=1 de:hl' < hl:de', zf=1 de:hl' == hl:de', unsigned
+ ex de,hl
+ or a
+ sbc hl,de
+ ret nz
+cmp_l_entry:
+ exx
+ sbc hl,de
+ exx
+ ret
+
+math_cmprev_imm_sl: ; cf=1 hl:de' < imm_l, zf=1 hl:de' == imm_l, signed
+ call math_imm_l0
+math_cmprev_sl: ; cf=1 hl:de' < de:hl', zf=1 hl:de' == de:hl', signed
+ or a
+ sbc hl,de
+ jr z,cmprev_l_entry
+ ld a,h
+ rla
+ ret po
+ ccf
+ ret
+
+math_cmprev_imm_ul: ; cf=1 de:hl' < imm_l, zf=1 de:hl' == imm_l, unsigned
+ call math_imm_l0
+math_cmprev_ul: ; cf=1 de:hl' < de:de', zf=1 de:hl' == hl:de', unsigned
+ or a
+ sbc hl,de
+ ret nz
+cmprev_l_entry:
+ exx
+ ex de,hl
+ sbc hl,de
+ exx
+ ret
+
+math_sl_imm_w: ; de <<= imm_b, imm_b in [0, 0x10)
+ ld a,(bc)
+ inc bc
+ ld l,a
+math_sl_w: ; de <<= l & 0xf
+ ex de,hl
+math_slrev_w: ; de = hl << (e & 0xf)
+ ; by 1
+ bit 0,e
+ jr z,1$
+ add hl,hl
+1$: ; by 2
+ bit 1,e
+ jr z,2$
+ add hl,hl
+ add hl,hl
+2$: ; by 4
+ bit 2,e
+ jr z,3$
+ add hl,hl
+ add hl,hl
+ add hl,hl
+ add hl,hl
+3$: ; by 8
+ bit 3,e
+ ex de,hl
+ ret z
+ ld d,e
+ ld e,0
+ ret
+
+math_sl_imm_l: ; de:hl' << imm_b, imm_b in [0, 0x20)
+ ld a,(bc)
+ inc bc
+ ld l,a
+math_sl_l: ; de:hl' <<= l & 0x1f
+ ex de,hl
+ ; by 1
+ bit 0,e
+ jr z,1$
+ exx
+ add hl,hl
+ exx
+ adc hl,hl
+1$: ; by 2
+ bit 1,e
+ jr z,2$
+ exx
+ add hl,hl
+ exx
+ adc hl,hl
+ exx
+ add hl,hl
+ exx
+ adc hl,hl
+2$: ; by 4
+ bit 2,e
+ jr z,3$
+ exx
+ add hl,hl
+ exx
+ adc hl,hl
+ exx
+ add hl,hl
+ exx
+ adc hl,hl
+ exx
+ add hl,hl
+ exx
+ adc hl,hl
+ exx
+ add hl,hl
+ exx
+ adc hl,hl
+3$: ; by 8
+ bit 3,e
+ jr z,4$
+ exx
+ ld a,h
+ ld h,l
+ ld l,0
+ exx
+ ld h,l
+ ld l,a
+4$: ; by 16
+ bit 4,e
+ ex de,hl
+ ret z
+ exx
+ push hl
+ ld hl,0
+ exx
+ pop de
+ ret
+
+math_sr_uw: ; de >>= l & 0xf, logical
+ ex de,hl
+math_srrev_uw: ; de = hl >> (e & 0xf), logical
+ ld a,e
+ and 0x1f
+ add 7
+math_srrev_uw0: ; de = hl >> (a - 7), a in [7, 0x17), logical (a immediate)
+ ld e,a
+ sub a
+ jr sr_w_entry
+
+math_sr_sw: ; de >>= l & 0xf, arithmetic
+ ex de,hl
+math_srrev_sw: ; de = hl >> (e & 0xf), arithmetic
+ ld a,e
+ and 0xf
+ add 7
+math_srrev_sw0: ; de = hl >> (a - 7), a in [7, 0x17), arithmetic (a immediate)
+ ld e,a
+ ld a,h
+ rla
+ sbc a,a
+sr_w_entry:
+ ; by -1
+ bit 0,e
+ jr nz,1$
+ add hl,hl
+ rla
+1$: ; by -2
+ bit 1,e
+ jr nz,2$
+ add hl,hl
+ rla
+ add hl,hl
+ rla
+2$: ; by -4
+ bit 2,e
+ jr nz,3$
+ add hl,hl
+ rla
+ add hl,hl
+ rla
+ add hl,hl
+ rla
+ add hl,hl
+ rla
+3$: ; by 8
+ bit 3,e
+ jr z,4$
+ ld e,h
+ ld d,a
+ ret
+4$: ; by 16 (can't occur simultaneously with by 8)
+ bit 4,e
+ ex de,hl
+ ret z
+ ld e,a
+ rla
+ sbc a,a
+ ld d,a
+ ret
+
+math_mul_imm_l: ; de:hl' *= imm_l, big-endian imm_l
+ exx
+ ex de,hl
+ sub a
+ ld l,a
+ ld h,a
+ exx
+ ld l,a
+ ld h,a
+ ld a,(bc)
+ inc bc
+ call mul_l0
+ ld a,(bc)
+ inc bc
+ call mul_l
+ ld a,(bc)
+ inc bc
+ call mul_l
+ ld a,(bc)
+ inc bc
+ call mul_l
+ ex de,hl
+ ret
+
+math_sr_ul: ; de:hl' >>= l & 0x1f, logical
+ ld a,l
+ and 0x1f
+ add 7
+math_sr_ul0: ; de:hl' >>= a - 7, a in [7, 0x27), logical (for immediates)
+ ld l,a
+ sub a
+ jr sr_l_entry
+
+math_sr_sl: ; de:hl' >>= l & 0x1f, arithmetic
+ ld a,l
+ and 0x1f
+ add 7
+math_sr_sl0: ; de:hl' >>= a - 7, a in [7, 0x27), arithmetic (for immediates)
+ ld l,a
+ ld a,d
+ rla
+ sbc a,a
+sr_l_entry:
+ ex de,hl
+ ; by -1
+ bit 0,e
+ jr nz,1$
+ exx
+ add hl,hl
+ exx
+ adc hl,hl
+ rla
+1$: ; by -2
+ bit 1,e
+ jr nz,2$
+ exx
+ add hl,hl
+ exx
+ adc hl,hl
+ rla
+ exx
+ add hl,hl
+ exx
+ adc hl,hl
+ rla
+2$: ; by -4
+ bit 2,e
+ jr nz,3$
+ exx
+ add hl,hl
+ exx
+ adc hl,hl
+ rla
+ exx
+ add hl,hl
+ exx
+ adc hl,hl
+ rla
+ exx
+ add hl,hl
+ exx
+ adc hl,hl
+ rla
+ exx
+ add hl,hl
+ exx
+ adc hl,hl
+ rla
+3$: ; by 8
+ bit 3,e
+ jr z,4$
+ ld d,l
+ ld l,h
+ ld h,a
+ rla
+ ld a,d
+ exx
+ ld l,h
+ ld h,a
+ exx
+ sbc a,a
+4$: ; by 16
+ bit 4,e
+ jr z,5$
+ push hl
+ ld e,a
+ rla
+ sbc a,a
+ ld d,a
+ exx
+ pop hl
+ exx
+ ret
+5$: ; by 32 (can't occur simultaneously with by 16)
+ bit 5,e
+ ex de,hl
+ ret z
+ exx
+ ld l,a
+ rla
+ sbc a,a
+ ld h,a
+ exx
+ ld e,a
+ ld d,a
+ ret
+
+math_mul_imm_w: ; de *= imm_w, big-endian imm_w
+ ld hl,0
+ ld a,(bc)
+ inc bc
+ call mul_w0
+ ld a,(bc)
+ inc bc
+ call mul_w
+ ex de,hl
+ ret
+
+math_mul_w: ; de *= hl
+ ld a,l
+ push af
+ ld a,h
+ ld hl,0
+ call mul_w0
+ pop af
+ call mul_w
+ ex de,hl
+ ret
+
+mul_w: ; bit 0
+ add hl,hl
+mul_w0: rla
+ jr nc,1$
+ add hl,de
+1$: ; bit 1
+ add hl,hl
+ rla
+ jr nc,2$
+ add hl,de
+2$: ; bit 2
+ add hl,hl
+ rla
+ jr nc,3$
+ add hl,de
+3$: ; bit 3
+ add hl,hl
+ rla
+ jr nc,4$
+ add hl,de
+4$: ; bit 4
+ add hl,hl
+ rla
+ jr nc,5$
+ add hl,de
+5$: ; bit 5
+ add hl,hl
+ rla
+ jr nc,6$
+ add hl,de
+6$: ; bit 6
+ add hl,hl
+ rla
+ jr nc,7$
+ add hl,de
+7$: ; bit 7
+ add hl,hl
+ rla
+ ret nc
+ add hl,de
+ ret
+
+math_mul_l: ; de:hl' *= hl:de'
+ ex de,hl
+ exx
+ ld a,l
+ push af
+ push hl
+ ld hl,0
+ exx
+ ld a,l
+ push af
+ ld a,h
+ ld hl,0
+ call mul_l0
+ pop af
+ call mul_l
+ pop af
+ call mul_l
+ pop af
+ call mul_l
+ ex de,hl
+ ret
+
+mul_l: ; bit 0
+ exx
+ add hl,hl
+ exx
+ adc hl,hl
+mul_l0: rla
+ jr nc,1$
+ exx
+ add hl,de
+ exx
+ adc hl,de
+1$: ; bit 1
+ exx
+ add hl,hl
+ exx
+ adc hl,hl
+ rla
+ jr nc,2$
+ exx
+ add hl,de
+ exx
+ adc hl,de
+2$: ; bit 2
+ exx
+ add hl,hl
+ exx
+ adc hl,hl
+ rla
+ jr nc,3$
+ exx
+ add hl,de
+ exx
+ adc hl,de
+3$: ; bit 3
+ exx
+ add hl,hl
+ exx
+ adc hl,hl
+ rla
+ jr nc,4$
+ exx
+ add hl,de
+ exx
+ adc hl,de
+4$: ; bit 4
+ exx
+ add hl,hl
+ exx
+ adc hl,hl
+ rla
+ jr nc,5$
+ exx
+ add hl,de
+ exx
+ adc hl,de
+5$: ; bit 5
+ exx
+ add hl,hl
+ exx
+ adc hl,hl
+ rla
+ jr nc,6$
+ exx
+ add hl,de
+ exx
+ adc hl,de
+6$: ; bit 6
+ exx
+ add hl,hl
+ exx
+ adc hl,hl
+ rla
+ jr nc,7$
+ exx
+ add hl,de
+ exx
+ adc hl,de
+7$: ; bit 7
+ exx
+ add hl,hl
+ exx
+ adc hl,hl
+ rla
+ ret nc
+ exx
+ add hl,de
+ exx
+ adc hl,de
+ ret
+
+; use rst 8 then math_div_sw for math_div_imm_sw
+math_div_sw: ; de, hl = de / hl, de % hl, signed
+ ex de,hl
+; use rst 8 then math_divrev_sw for math_divrev_imm_sw
+math_divrev_sw: ; de, hl = hl / de, hl % de, signed
+ push bc
+ ld a,h
+ or a
+ ld a,d
+ rla
+ jp m,div_w_n ; positive dividend
+
+ ; positive dividend
+ ld a,h
+ ld c,l
+ ld hl,0
+ jr nc,div_w_pp ; positive dividend, positive divisor
+
+ ; positive dividend, negative divisor
+ call div_w_n1
+ ld b,a
+ ld a,c
+ call div_w_ncf
+ inc a
+ jr c,1$
+ sbc hl,de
+1$: ld d,b
+ ld e,a
+ pop bc
+ ret
+
+div_w_n:
+ ; negative dividend
+ dec hl ; reduces remainder by 1 (we inc later)
+ ld a,h
+ ld c,l
+ ld hl,-1
+ jr c,div_w_nn ; negative dividend, negative divisor
+
+ ; negative dividend, positive divisor
+ call div_w1
+ ld b,a
+ ld a,c
+ call div_wcf
+ inc a
+ jr c,1$
+ sbc hl,de
+1$: inc hl ; get into range -divisor+1..0
+ ld d,b
+ ld e,a
+ pop bc
+ ret
+
+div_w_nn: ; negative dividend, negative divisor
+ call div_w_n0
+ ld b,a
+ ld a,c
+ call div_w_ncf
+ jr nc,1$
+ add hl,de
+1$: inc hl ; get into range divisor+1..0
+ ld d,b
+ ld e,a
+ pop bc
+ ret
+
+math_div_imm_uw:
+ rst 8
+math_div_uw: ; de, hl = de / hl, de % hl, unsigned
+ ex de,hl
+; use rst 8 then math_divrev_uw for math_divrev_imm_uw
+math_divrev_uw: ; de, hl = hl / de, hl % de, unsigned
+ push bc
+ ld a,h
+ ld c,l
+ ld hl,0
+div_w_pp: ; positive dividend, positive divisor
+ call div_w0
+ ld b,a
+ ld a,c
+ call div_wcf
+ jr nc,1$
+ add hl,de
+1$: ld d,b
+ ld e,a
+ pop bc
+ ret
+
+; non-restoring division routine
+
+; de = divisor, hl:a = dividend with hl = previous remainder, a = next byte
+; enter at div0 with positive remainder in hl, such that hl < de
+; enter at div1 with negative remainder in hl, such that hl >= -de
+
+; div0/1 return a = 8-bit quotient as an odd number interpreted as -ff..ff,
+; by summing positive/negative place values, e.g. -80 +40 +20 -10 +8 -4 -2 +1
+
+; if entered at div0, there is a -80 and so quotient is in range -ff..-1
+; if entered at div1, there is a +80 and so quotient is in range 1..ff
+; falls out of loop after div01 with positive remainder, div11 with negative,
+; depending on this we should re-enter at div0 or div1, signalled by cf return
+
+; the successive quotient bytes can be concatenated into a full quotient,
+; but negative bytes require the next higher quotient byte to be decremented,
+; we know in advance if this will happen because the implied sign of the
+; quotient byte depends only on whether we entered at div0 or div1, hence,
+; before the div11 return we'll decrement to compensate for next negative byte
+
+; the decrement can also be seen as compensating for the extra add hl,de that
+; may be needed to make negative remainder positive before return to caller,
+; thus leaving quotient in a consistent state regardless of which exit taken,
+; remainder needs the add hl,de if cf=1 returned (equiv. return byte is even)
+
+; in the following code each sbc hl,de gets an inc a and each add hl,de gets
+; a dec a, guaranteeing the integrity of the division, the initial scf/rla is
+; needed to make the result 100 + -ff..ff or 1..1ff, so that the decrements
+; cannot borrow into the upcoming dividend bits also held in a, and there must
+; be another shift between the scf/rla and increment/decrement so that the scf
+; is implicitly in the 100s place, making the code awkward though it's correct
+
+; now optimized to only inc/dec a when doing zero-crossing, fix above analysis
+
+div_wcf:
+ jr c,div_w1
+div_w0: ; bit 0, above
+ scf
+ rla
+ adc hl,hl
+ sbc hl,de
+ jr nc,div_w01
+ dec a
+div_w11: ; bit 1, below
+ add a,a
+ adc hl,hl
+ add hl,de
+ jr nc,div_w12
+ inc a
+div_w02: ; bit 2, above
+ add a,a
+ adc hl,hl
+ sbc hl,de
+ jr nc,div_w03
+ dec a
+div_w13: ; bit 3, below
+ add a,a
+ adc hl,hl
+ add hl,de
+ jr nc,div_w14
+ inc a
+div_w04: ; bit 4, above
+ add a,a
+ adc hl,hl
+ sbc hl,de
+ jr nc,div_w05
+ dec a
+div_w15: ; bit 5, below
+ add a,a
+ adc hl,hl
+ add hl,de
+ jr nc,div_w16
+ inc a
+div_w06: ; bit 6, above
+ add a,a
+ adc hl,hl
+ sbc hl,de
+ jr nc,div_w07
+ dec a
+div_w17: ; bit 7, below
+ add a,a
+ adc hl,hl
+ add hl,de
+ jr nc,div_w18
+ inc a
+div_w08: ; done, above
+ add a,a
+ dec a
+ or a
+ ret
+
+div_w1: ; bit 0, below
+ add a,a
+ adc hl,hl
+ add hl,de
+ jr nc,div_w11
+ inc a
+div_w01: ; bit 1, above
+ add a,a
+ adc hl,hl
+ sbc hl,de
+ jr nc,div_w02
+ dec a
+div_w12: ; bit 2, below
+ add a,a
+ adc hl,hl
+ add hl,de
+ jr nc,div_w13
+ inc a
+div_w03: ; bit 3, above
+ add a,a
+ adc hl,hl
+ sbc hl,de
+ jr nc,div_w04
+ dec a
+div_w14: ; bit 4, below
+ add a,a
+ adc hl,hl
+ add hl,de
+ jr nc,div_w15
+ inc a
+div_w05: ; bit 5, above
+ add a,a
+ adc hl,hl
+ sbc hl,de
+ jr nc,div_w06
+ dec a
+div_w16: ; bit 6, below
+ add a,a
+ adc hl,hl
+ add hl,de
+ jr nc,div_w17
+ inc a
+div_w07: ; bit 7, above
+ add a,a
+ adc hl,hl
+ sbc hl,de
+ jr nc,div_w08
+ dec a
+div_w18: ; done, below
+ add a,a
+ ;inc a
+ ;dec a ; compensation
+ scf
+ ret
+
+; divn0/1 are the same as div0/1 but carry reversed after add/subtract divisor
+; this is for negative divisors where we expect carry (means no zero crossing)
+
+; when divisor negated, remainder also negated, so we expect to do subtraction
+; when remainder negative and vice versa, need to clear carry after add hl,hl
+
+div_w_ncf:
+ jr c,div_w_n1
+div_w_n0: ; bit 0, above
+ scf
+ rla
+ adc hl,hl
+ or a
+ sbc hl,de
+ jr c,div_w_n01
+ dec a
+div_w_n11: ; bit 1, below
+ add a,a
+ adc hl,hl
+ add hl,de
+ jr c,div_w_n12
+ inc a
+div_w_n02: ; bit 2, above
+ add a,a
+ adc hl,hl
+ or a
+ sbc hl,de
+ jr c,div_w_n03
+ dec a
+div_w_n13: ; bit 3, below
+ add a,a
+ adc hl,hl
+ add hl,de
+ jr c,div_w_n14
+ inc a
+div_w_n04: ; bit 4, above
+ add a,a
+ adc hl,hl
+ or a
+ sbc hl,de
+ jr c,div_w_n05
+ dec a
+div_w_n15: ; bit 5, below
+ add a,a
+ adc hl,hl
+ add hl,de
+ jr c,div_w_n16
+ inc a
+div_w_n06: ; bit 6, above
+ add a,a
+ adc hl,hl
+ or a
+ sbc hl,de
+ jr c,div_w_n07
+ dec a
+div_w_n17: ; bit 7, below
+ add a,a
+ adc hl,hl
+ add hl,de
+ jr c,div_w_n18
+ inc a
+div_w_n08: ; done, above
+ add a,a
+ dec a
+ or a
+ ret
+
+div_w_n1: ; bit 0, below
+ add a,a
+ adc hl,hl
+ add hl,de
+ jr c,div_w_n11
+ inc a
+div_w_n01: ; bit 1, above
+ add a,a
+ adc hl,hl
+ or a
+ sbc hl,de
+ jr c,div_w_n02
+ dec a
+div_w_n12: ; bit 2, below
+ add a,a
+ adc hl,hl
+ add hl,de
+ jr c,div_w_n13
+ inc a
+div_w_n03: ; bit 3, above
+ add a,a
+ adc hl,hl
+ or a
+ sbc hl,de
+ jr c,div_w_n04
+ dec a
+div_w_n14: ; bit 4, below
+ add a,a
+ adc hl,hl
+ add hl,de
+ jr c,div_w_n15
+ inc a
+div_w_n05: ; bit 5, above
+ add a,a
+ adc hl,hl
+ or a
+ sbc hl,de
+ jr c,div_w_n06
+ dec a
+div_w_n16: ; bit 6, below
+ add a,a
+ adc hl,hl
+ add hl,de
+ jr c,div_w_n17
+ inc a
+div_w_n07: ; bit 7, above
+ add a,a
+ adc hl,hl
+ or a
+ sbc hl,de
+ jr c,div_w_n08
+ dec a
+div_w_n18: ; done, below
+ add a,a
+ ;inc a
+ ;dec a ; compensation
+ scf
+ ret
+
+; debugging
+
+print_trace: ; print af, bc, de, hl, sp, (sp)
+ push hl
+ push af
+ pop hl
+ push hl
+ call print_word
+ ld a,'
+ call print_char
+ ld l,c
+ ld h,b
+ call print_word
+ ld a,'
+ call print_char
+ ld l,e
+ ld h,d
+ call print_word
+ ld a,'
+ call print_char
+ pop hl
+ push hl
+ call print_word
+ ld a,'
+ call print_char
+ ld hl,4
+ add hl,sp
+ call print_word
+ ld a,'
+ call print_char
+ ld a,(hl)
+ inc hl
+ ld h,(hl)
+ ld l,a
+ call print_word
+ ld a,0xd
+ call print_char
+ ld a,0xa
+ call print_char
+ pop af
+ pop hl
+ ld a,(bc)
+ inc bc
+ ld l,a
+ jp (hl)
+
+print_word:
+ push af
+ ld a,h
+ call print_byte
+ ld a,l
+ call print_byte
+ pop af
+ ret
+
+print_byte:
+ push af
+ push af
+ rrca
+ rrca
+ rrca
+ rrca
+ call print_digit
+ pop af
+ call print_digit
+ pop af
+ ret
+
+print_digit:
+ push de
+ push hl
+ and 0xf
+ ld e,a
+ ld d,0
+ ld hl,digits
+ add hl,de
+ ld a,(hl)
+ pop hl
+ pop de
+print_char:
+ push bc
+ push de
+ push hl
+ ld e,a
+ ld c,2
+ call 5
+ pop hl
+ pop de
+ pop bc
+ ret
+
+digits:
+ .ascii '0123456789abcdef'
+
+; sm code
+
+start:
+ ld hl,print_trace
+ ld (page0_trace + 1),hl
+
+ ld hl,rst8
+ ld de,8
+ ld bc,7
+ ldir
+
+ ld bc,sm_start
+ jp page0_dispatch0
+sm_start:
+ .db <page0_imm_call
+ .dw sm_main
+ .dw 0
+ .db <page0_esc
+ jp 0
+
+rst8: ; immediate to hl
+ ld a,(bc)
+ inc bc
+ ld l,a
+ ld a,(bc)
+ inc bc
+ ld h,a
+ ret
+
+sm_main:
+ ; create stack frame
+ .db <page0_stkadj
+ .dw -2
+
+ ; push argument
+ .db <page0_imm_w
+ .dw 5
+
+ ; push result pointer
+ .db <page1_page0
+ .db <page0_stkptr
+ .dw 2
+
+ ; call sm_factorial(argument)
+ .db <page1_page0
+ .db <page0_imm_call
+ .dw sm_factorial
+ .dw 4
+
+ ; print 10000s
+ .db <page0_stkld_w
+ .dw 0+2
+ .db <page1_imm_div_sw
+ .dw 10000
+ .db <page1_stkst_w
+ .dw 2+2
+ .db <page0_page1
+ .db <page1_imm_add_w
+ .dw '0
+ .db <page1_page0
+ .db <page0_imm_call
+ .dw sm_print_char
+ .dw 2
+
+ ; print 1000s
+ .db <page0_stkld_w
+ .dw 0+2
+ .db <page1_imm_div_sw
+ .dw 1000
+ .db <page1_stkst_w
+ .dw 2+2
+ .db <page0_page1
+ .db <page1_imm_add_w
+ .dw '0
+ .db <page1_page0
+ .db <page0_imm_call
+ .dw sm_print_char
+ .dw 2
+
+ ; print 100s
+ .db <page0_stkld_w
+ .dw 0+2
+ .db <page1_imm_div_sw
+ .dw 100
+ .db <page1_stkst_w
+ .dw 2+2
+ .db <page0_page1
+ .db <page1_imm_add_w
+ .dw '0
+ .db <page1_page0
+ .db <page0_imm_call
+ .dw sm_print_char
+ .dw 2
+
+ ; print 10s
+ .db <page0_stkld_w
+ .dw 0+2
+ .db <page1_imm_div_sw
+ .dw 10
+ .db <page1_stkst_w
+ .dw 2+2
+ .db <page0_page1
+ .db <page1_imm_add_w
+ .dw '0
+ .db <page1_page0
+ .db <page0_imm_call
+ .dw sm_print_char
+ .dw 2
+
+ ; print 1s
+ .db <page0_stkld_w
+ .dw 0+2
+ .db <page1_imm_add_w
+ .dw '0
+ .db <page1_page0
+ .db <page0_imm_call
+ .dw sm_print_char
+ .dw 2
+
+ ; print cr
+ .db <page0_imm_w
+ .dw 0xd
+ .db <page1_page0
+ .db <page0_imm_call
+ .dw sm_print_char
+ .dw 2
+
+ ; print lf
+ .db <page0_imm_w
+ .dw 0xa
+ .db <page1_page0
+ .db <page0_imm_call
+ .dw sm_print_char
+ .dw 2
+
+ ; destroy stack frame
+ .db <page0_stkadj
+ .dw 2
+
+ ; return
+ .db <page0_ret
+
+sm_factorial:
+ .db <page0_trace
+ ; get argument
+ .db <page0_stkld_w
+ .dw 4+2
+
+ ; is argument < 2?
+ .db <page1_imm_cmp_sw
+ .dw 2
+ .db <page0_jlt
+ .dw 1$
+
+ ; no, set up for *result =
+ .db <page0_stkld_w
+ .dw 2+2
+
+ ; get argument
+ .db <page1_page0
+ .db <page0_stkld_w
+ .dw 6+2
+
+ ; subtract 1
+ .db <page1_imm_add_w
+ .dw -1
+
+ ; push result pointer
+ .db <page1_page0
+ .db <page0_stkptr
+ .dw 0
+
+ ; call sm_factorial(argument - 1)
+ .db <page1_page0
+ .db <page0_imm_call
+ .dw sm_factorial
+ .dw 2
+
+ ; get argument
+ .db <page0_stkld_w
+ .dw 8+2
+
+ ; multiply
+ .db <page1_mul_w
+
+ ; set *result = sm_factorial(argument - 1) * argument
+ .db <page1_st_w
+
+ ; return
+ .db <page0_trace
+ .db <page0_ret
+
+1$:
+ ; yes, set up for *result =
+ .db <page0_stkld_w
+ .dw 2+2
+
+ ; set *result = 1
+ .db <page1_page0
+ .db <page0_imm_w
+ .dw 1
+ .db <page1_st_w
+
+ ; return
+ .db <page0_trace
+ .db <page0_ret
+
+sm_print_char:
+ .db <page0_esc
+ ld hl,2
+ add hl,sp
+ ld a,(hl)
+ call print_char
+ jp page0_ret