Revert sm2.asm to commit 557827e, reimplement in sm3.asm with a less optimal approach...
author Nick Downing <nick@ndcode.org>
Tue, 18 Jun 2019 08:22:37 +0000 (18:22 +1000)
committer Nick Downing <nick@ndcode.org>
Tue, 18 Jun 2019 08:22:37 +0000 (18:22 +1000)
p.sh [new file with mode: 0755]
sm2.asm
sm3.asm [new file with mode: 0644]

diff --git a/p.sh b/p.sh
new file mode 100755 (executable)
index 0000000..f87882a
--- /dev/null
+++ b/p.sh
@@ -0,0 +1,6 @@
+#!/bin/sh
+#pip3 install --user intelhex
+bin/asz80 -l -o sm3.asm
+bin/aslink -i sm3.ihx sm3.rel
+python3 ~/.local/bin/hex2bin.py sm3.ihx sm3.com
+../z80pack/cpmsim/srctools/cpmw.sh a sm3.com
diff --git a/sm2.asm b/sm2.asm
index 674c811..049c2e3 100644 (file)
--- a/sm2.asm
+++ b/sm2.asm
@@ -1,6 +1,5 @@
 page0  =       1
 page1  =       2
-page2  =       4
 
        .area   SM (abs,ovr)
 
@@ -9,100 +8,24 @@ page2        =       4
 
        .org    page0 * 0x100
 
-; administrative
+       jp      start
 
 page0_trace:
-       jp      start ; will be overwritten with print_trace
+       jp      print_trace
 
 page0_esc:
        ld      l,c
        ld      h,b
        jp      (hl)
 
-page0_page2:
-       exx
-       pop     hl
-       exx
-       inc     h ; page 1
 page0_page1:
        pop     de
        ld      a,(bc)
        inc     bc
        ld      l,a
-       inc     h ; page 1 (or 2 if came from page0_page2)
-       jp      (hl)
-
-; long ops
-
-page0_stkld_l:
-stkld_l:
-       rst     8 ; ld hl,(bc)+
-       add     hl,sp
-       call    ld_l0
-       jr      dispatch2
-
-page0_stkst_l:
-stkst_l:
-       rst     8 ; ld hl,(bc)+
-       add     hl,sp
-       call    st_l0
-       jr      dispatch0
-
-page0_imm_l:
-imm_l:
-;      ld      a,(bc)
-;      inc     bc
-;      exx
-;      ld      l,a
-;      exx
-;      ld      a,(bc)
-;      inc     bc
-;      exx
-;      ld      h,a
-;      exx
-;      inc     h ; page 1
-;      jr      page0_imm_w
-       call    imm_l0
-dispatch2:
-       ld      a,(bc)
-       inc     bc
-       ld      l,a
-       ld      h,page2
+       inc     h ; page 1
        jp      (hl)
 
-page0_ld_l:
-       pop     hl
-ld_l:
-       call    ld_l0
-       jr      dispatch2
-
-page0_st_l:
-       exx
-       pop     hl
-       exx
-       pop     de
-st_l:
-       pop     hl
-       call    st_l0
-       jr      dispatch0
-
-page0_imm_st_l:
-       pop     hl
-imm_st_l:
-       call    imm_l0
-       call    st_l0
-       jr      dispatch0
-
-; less used long ops
-
-page0_imm_xchg_l:
-       call    imm_l0
-       jp      xchg_l0
-page0_xchg_l:
-       jp      xchg_l
-
-; less used word ops
-
 page0_imm_xchg_w:
        ld      a,(bc)
        inc     bc
@@ -115,9 +38,11 @@ page0_xchg_w:
        pop     hl
        ex      (sp),hl
        push    hl
-       jr      dispatch0
-
-; control transfer ops
+       ld      a,(bc)
+       inc     bc
+       ld      l,a
+       ld      h,page0
+       jp      (hl)
 
 page0_call:
        pop     de
@@ -172,9 +97,7 @@ page0_jmp:
        ld      l,a
        jp      (hl)
 
-       .org    page0 * 0x100 + 0x86
-
-; stack ops (reachable from start of page 1)
+       .org    page0 * 0x100 + 0x84
 
 imm_call:
        push    de
@@ -193,10 +116,9 @@ page0_imm_call:
        ld      l,a
        jp      (hl)
 
-page0_ret:
-       pop     bc
-page0_stkadj:
-stkadj:
+stkptr:
+       push    de
+page0_stkptr:
        ld      a,(bc)
        inc     bc
        ld      l,a
@@ -204,16 +126,17 @@ stkadj:
        inc     bc
        ld      h,a
        add     hl,sp
-       ld      sp,hl
-dispatch0:
+       ex      de,hl
        ld      a,(bc)
        inc     bc
        ld      l,a
-       ld      h,page0
+       ld      h,page1
        jp      (hl)
 
-page0_stkptr:
-stkptr:
+page0_ret:
+       pop     bc
+page0_stkadj:
+stkadj:
        ld      a,(bc)
        inc     bc
        ld      l,a
@@ -221,17 +144,16 @@ stkptr:
        inc     bc
        ld      h,a
        add     hl,sp
-       ex      de,hl
+       ld      sp,hl
        ld      a,(bc)
        inc     bc
        ld      l,a
-       ld      h,page1
+       ld      h,page0
        jp      (hl)
 
-; word ops
-
-page0_stkld_w:
 stkld_w:
+       push    de
+page0_stkld_w:
        ld      a,(bc)
        inc     bc
        ld      l,a
@@ -268,7 +190,7 @@ stkst_w:
        jp      (hl)
 
 page0_imm_w:
-       inc     h ; page1 (or page2 if came from page0_imm_l)
+       inc     h ; page1
 imm_w:
        ld      a,(bc)
        inc     bc
@@ -332,14 +254,12 @@ st_w:
 
 page1_imm_call:
        jr      imm_call
+page1_stkptr:
+       jr      stkptr
 page1_stkadj:
        push    de
        jr      stkadj
-page1_stkptr:
-       push    de
-       jr      stkptr
 page1_stkld_w:
-       push    de
        jr      stkld_w
 page1_stkst_w:
        jr      stkst_w
@@ -366,13 +286,6 @@ page1_call:
        dec     h ; page 0
        jp      _call
 
-page1_ld_l:
-       ex      de,hl
-       jp      ld_l
-page1_imm_st_l:
-       ex      de,hl
-       jp      st_l
-
 page1_imm_and_w:
        ld      a,(bc)
        inc     bc
@@ -632,7 +545,7 @@ page1_lt_sw: ; put this at the end because it's the longest one
        ld      h,page0
        jp      (hl)
 
-; word math package
+; math package
 
 imm_sl_w:
        ex      de,hl
@@ -833,133 +746,6 @@ divnn:    ; negative dividend, negative divisor
        pop     bc
        ret
 
-; long math package (reachable from page 2)
-
-xchg_l:
-       exx
-       pop     hl
-       exx
-       pop     de
-xchg_l0:
-       exx
-       pop     de
-       exx
-       pop     hl
-       push    de
-       exx
-       push    hl
-       exx
-       push    hl
-       exx
-       push    de
-       exx
-       ld      a,(bc)
-       inc     bc
-       ld      l,a
-       ld      h,page2
-       jp      (hl)
-
-; page 2 interpreter
-; long arithmetic operations
-; top stack long cached in de:hl'
-
-       .org    page2 * 0x100
-
-page2_imm_call:
-       jp      imm_call
-page2_stkadj:
-       exx
-       push    hl
-       exx
-       push    de
-       jp      stkadj
-page2_stkptr:
-       exx
-       push    hl
-       exx
-       push    de
-       jp      stkptr
-page2_stkld_l:
-       exx
-       push    hl
-       exx
-       push    de
-       jp      stkld_l
-page2_stkst_l:
-       jp      stkst_l
-page2_imm_l:
-       exx
-       push    hl
-       exx
-       push    de
-       jp      imm_l
-page2_st_l:
-       jp      st_l
-
-page2_page0:
-       exx
-       push    hl
-       exx
-       push    de
-       ld      a,(bc)
-       inc     bc
-       ld      l,a
-       ld      h,page0
-       jp      (hl)
-
-; long math package
-
-imm_l0:
-       ld      a,(bc)
-       inc     bc
-       exx
-       ld      l,a
-       exx
-       ld      a,(bc)
-       inc     bc
-       exx
-       ld      h,a
-       exx
-       ld      a,(bc)
-       inc     bc
-       ld      e,a
-       ld      a,(bc)
-       inc     bc
-       ld      d,a
-       ret
-
-ld_l0:
-       ld      a,(hl)
-       inc     hl
-       exx
-       ld      l,a
-       exx
-       ld      a,(hl)
-       inc     hl
-       exx
-       ld      h,a
-       exx
-       ld      e,(hl)
-       inc     hl
-       ld      d,(hl)
-       ret
-
-st_l0:
-       exx
-       ld      a,l
-       exx
-       ld      (hl),a
-       inc     hl
-       exx
-       ld      a,h
-       exx
-       ld      (hl),a
-       inc     hl
-       ld      (hl),e
-       inc     hl
-       ld      (hl),d
-       ret
-
 ; non-restoring division routine
 
 ; de = divisor, hl:a = dividend with hl = previous remainder, a = next byte
@@ -1309,14 +1095,6 @@ digits:
 ; sm code
 
 start:
-       ld      hl,print_trace
-       ld      (page0_trace + 1),hl
-
-       ld      hl,rst8
-       ld      de,8
-       ld      bc,7
-       ldir
-
        ld      h,page0
        call    page0_jmp
        .db     <page0_imm_call
@@ -1325,15 +1103,6 @@ start:
        .db     <page0_esc
        jp      0
 
-rst8:  ; ld hl,(bc)+
-       ld      a,(bc)
-       inc     bc
-       ld      l,a
-       ld      a,(bc)
-       inc     bc
-       ld      h,a
-       ret
-
 sm_main:
        ; create stack frame
        .db     <page0_stkadj
diff --git a/sm3.asm b/sm3.asm
new file mode 100644 (file)
index 0000000..94ef9e1
--- /dev/null
+++ b/sm3.asm
@@ -0,0 +1,1890 @@
+page0  =       1
+page1  =       2
+page2  =       4
+
+       .area   SM (abs,ovr)
+
+; page 0 interpreter
+; stack and control transfer
+
+       .org    page0 * 0x100
+
+; administrative
+
+page0_trace:
+       jp      start ; will be overwritten with print_trace
+
+page0_esc:
+       ld      l,c
+       ld      h,b
+       jp      (hl)
+
+page0_page2: ; pop a long into de:hl', then dispatch one page further on than page0_page1 does
+       exx
+       pop     hl ; first word popped lands in hl'
+       exx
+       inc     h ; first of two increments; NOTE(review): from page0 (1) this path ends at 3, but page2 is defined as 4 -- confirm target
+page0_page1: ; pop a word into de, then dispatch the next opcode in the following page
+       pop     de
+       ld      a,(bc) ; next opcode byte becomes the low byte of the handler address
+       inc     bc
+       ld      l,a
+       inc     h ; page 1 (or one page beyond it if came from page0_page2)
+       jp      (hl)
+
+page0_jeq: ; take the jump if zf is set, otherwise skip the 2-byte operand
+       jr      nz,not_taken
+page0_imm_jmp: ; unconditional jump: the operand replaces bc
+       rst     8 ; presumably hl = word at (bc), bc += 2 -- the rst 8 handler is not visible here, confirm
+       ld      c,l ; bc is the pointer the dispatchers fetch opcodes through
+       ld      b,h
+       jr      page0_dispatch0
+
+page0_jne: ; take the jump if zf is clear
+       jr      nz,page0_imm_jmp
+not_taken: ; step bc over the unused 2-byte operand and carry on
+       inc     bc
+       inc     bc
+       jr      page0_dispatch0
+
+page0_jlt: ; take the jump if cf is set
+       jr      c,page0_imm_jmp
+       jr      not_taken
+
+page0_jge: ; take the jump if cf is clear
+       jr      nc,page0_imm_jmp
+       jr      not_taken
+
+page0_peq:
+       call    test_eq
+       jr      page0_dispatch1
+
+page0_pne:
+       call    test_ne
+       jr      page0_dispatch1
+
+page0_plt:
+       call    test_lt
+       jr      page0_dispatch1
+
+page0_pge:
+       call    test_ge
+       jr      page0_dispatch1
+
+page0_imm_call:
+       rst     8
+       push    bc
+       ld      c,l
+       ld      b,h
+       jr      page0_dispatch0
+
+page0_ret:
+       pop     bc
+page0_stkadj:
+       rst     8
+       add     hl,sp
+       ld      sp,hl
+page0_dispatch0:
+       ld      a,(bc)
+       inc     bc
+       ld      l,a
+       ld      h,page0
+       jp      (hl)
+
+page0_stkptr:
+       rst     8
+       add     hl,sp
+       ex      de,hl
+page0_dispatch1:
+       ld      a,(bc)
+       inc     bc
+       ld      l,a
+       ld      h,page1
+       jp      (hl)
+
+page0_stkld_w:
+       call    math_stkld_w
+       jr      page0_dispatch1
+
+page0_stkld_l:
+       call    math_stkld_l
+page0_dispatch2:
+       ld      a,(bc)
+       inc     bc
+       ld      l,a
+       ld      h,page2
+       jp      (hl)
+
+page0_imm_w:
+       call    math_imm_w
+       jr      page0_dispatch1
+
+page0_imm_l:
+       call    math_imm_l
+       jr      page0_dispatch2
+
+; page 1 interpreter
+; word arithmetic operations
+; top stack word cached in de
+
+       .org    page1 * 0x100
+
+page1_trace:
+       jp      print_trace
+
+page1_page0:
+       push    de
+       ld      a,(bc)
+       inc     bc
+       ld      l,a
+       dec     h ; page 0
+       jp      (hl)
+
+page1_call:
+       push    bc
+page1_jmp:
+       ld      c,e
+       ld      b,d
+       jr      page1_dispatch0
+
+page1_stkst_w:
+       call    math_stkst_w
+page1_dispatch0:
+       ld      a,(bc)
+       inc     bc
+       ld      l,a
+       ld      h,page0
+       jp      (hl)
+
+page1_stkst_l:
+       call    math_stkst_l
+       jr      page1_dispatch0
+
+page1_ld_w:
+       ex      de,hl
+       ld      e,(hl)
+       inc     hl
+       ld      d,(hl)
+       jr      page1_dispatch1
+
+page1_ld_l:
+       ex      de,hl
+       call    math_ld_l
+page1_dispatch2:
+       ld      a,(bc)
+       inc     bc
+       ld      l,a
+       ld      h,page2
+       jp      (hl)
+
+page1_st_w:
+       pop     hl
+       ld      (hl),e
+       inc     hl
+       ld      (hl),d
+       jr      page1_dispatch0
+
+page1_imm_and_w:
+       call    math_and_imm_w
+       jr      page1_dispatch1
+
+page1_and_w:
+       call    math_and_w
+       jr      page1_dispatch1
+
+page1_imm_or_w:
+       call    math_or_imm_w
+       jr      page1_dispatch1
+
+page1_or_w:
+       call    math_or_w
+       jr      page1_dispatch1
+
+page1_imm_xor_w:
+       call    math_xor_imm_w
+       jr      page1_dispatch1
+
+page1_xor_w:
+       call    math_xor_w
+       jr      page1_dispatch1
+
+page1_imm_add_w: ; de += operand word (rst 8 presumably fetches it into hl -- handler not visible here)
+       rst     8
+       .db     0x3e ; ld a,n opcode: swallows the one-byte pop hl below as its operand, so this path skips the pop
+page1_add_w: ; de = popped word + de
+       pop     hl
+       add     hl,de
+       ex      de,hl ; sum back into de
+       jr      page1_dispatch1
+
+page1_imm_subrev_w:
+       rst     8
+       .db     0x3e ; ld a,
+page1_sub_w:
+       pop     hl
+       or      a
+       sbc     hl,de
+       ex      de,hl
+       jr      page1_dispatch1
+
+; use addition for page1_imm_sub_w
+page1_subrev_w:
+       pop     hl
+       ex      de,hl
+       or      a
+       sbc     hl,de
+       ex      de,hl
+       jr      page1_dispatch1
+
+page1_imm_cmprev_sw:
+       rst     8
+       .db     0x3e ; ld a,
+page1_cmp_sw:
+       pop     hl
+       call    math_cmprev_sw
+       jr      page1_dispatch0
+
+page1_imm_cmp_sw:
+       rst     8
+       .db     0x3e ; ld a,
+page1_cmprev_sw:
+       pop     hl
+       call    math_cmp_sw
+       jr      page1_dispatch0
+
+page1_imm_cmprev_uw:
+       rst     8
+       .db     0x3e ; ld a,
+page1_cmp_uw:
+       pop     hl
+       or      a
+       sbc     hl,de
+       jr      page1_dispatch0
+
+page1_imm_cmp_uw:
+       rst     8
+       .db     0x3e ; ld a,
+page1_cmprev_uw:
+       pop     hl
+       ex      de,hl
+       or      a
+       sbc     hl,de
+       jr      page1_dispatch0
+
+page1_imm_slrev_w:
+       rst     8
+       .db     0x3e ; ld a,
+page1_sl_w:
+       pop     hl
+       call    math_slrev_w
+page1_dispatch1:
+       ld      a,(bc)
+       inc     bc
+       ld      l,a
+       ld      h,page1
+       jp      (hl)
+
+page1_imm_sl_w:
+       call    math_sl_imm_w
+       jr      page1_dispatch1
+
+page1_slrev_w:
+       pop     hl
+       call    math_sl_w
+       jr      page1_dispatch1
+
+page1_imm_srrev_sw:
+       rst     8
+       .db     0x3e ; ld a,
+page1_sr_sw:
+       pop     hl
+       call    math_srrev_sw
+       jr      page1_dispatch1
+
+page1_imm_sr_sw:
+       ld      a,(bc)
+       inc     bc
+       ex      de,hl
+       call    math_srrev_sw0
+       jr      page1_dispatch1
+
+page1_srrev_sw:
+       pop     hl
+       call    math_sr_sw
+       jr      page1_dispatch1
+
+page1_imm_srrev_uw:
+       rst     8
+       .db     0x3e ; ld a,
+page1_sr_uw:
+       pop     hl
+       call    math_srrev_uw
+       jr      page1_dispatch1
+
+page1_imm_sr_uw:
+       ld      a,(bc)
+       inc     bc
+       ex      de,hl
+       call    math_srrev_uw0
+       jr      page1_dispatch1
+
+page1_srrev_uw:
+       pop     hl
+       call    math_sr_uw
+       jr      page1_dispatch1
+
+page1_imm_mul_uw:
+       call    math_mul_imm_w
+       jr      page1_dispatch1
+
+page1_mul_w:
+       pop     hl
+       call    math_mul_w
+       jr      page1_dispatch1
+
+page1_imm_divrev_sw:
+       rst     8
+       .db     0x3e ; ld a,
+page1_div_sw:
+       pop     hl
+       call    math_divrev_sw
+       ex      de,hl
+       push    hl
+       jr      page1_dispatch1
+
+page1_imm_div_sw:
+       rst     8
+       .db     0x3e ; ld a,
+page1_divrev_sw:
+       pop     hl
+       call    math_div_sw
+       ex      de,hl
+       push    hl
+       jr      page1_dispatch1
+
+page1_imm_divrev_uw:
+       rst     8
+       .db     0x3e ; ld a,
+page1_div_uw:
+       pop     hl
+       call    math_divrev_uw
+       ex      de,hl
+       push    hl
+       jr      page1_dispatch1
+
+page1_imm_div_uw:
+       rst     8
+       .db     0x3e ; ld a,
+page1_divrev_uw:
+       pop     hl
+       call    math_div_uw
+       ex      de,hl
+       push    hl
+       jr      page1_dispatch1
+
+; page 2 interpreter
+; long arithmetic operations
+; top stack long cached in de:hl'
+
+       .org    page2 * 0x100
+
+; conditionals
+
+test_eq:
+       ld      de,0
+       ret     nz
+       inc     de
+       ret
+
+test_ne:
+       ld      de,0
+       ret     z
+       inc     de
+       ret
+
+test_ge: ; de = 1 if cf was clear on entry, else 0 (inverts cf, then shares test_lt)
+       ccf
+test_lt: ; de = 1 if cf was set on entry, else 0; hl receives the old de
+       ld      hl,0
+       adc     hl,hl ; hl = 0 + 0 + cf
+       ex      de,hl
+       ret
+
+; math package
+
+math_imm_l: ; immediate to de:hl'
+       ld      a,(bc)
+       inc     bc
+       exx
+       ld      l,a
+       exx
+       ld      a,(bc)
+       inc     bc
+       exx
+       ld      h,a
+       exx
+math_imm_w: ; immediate to de
+       ld      a,(bc)
+       inc     bc
+       ld      e,a
+       ld      a,(bc)
+       inc     bc
+       ld      d,a
+       ret
+
+math_imm_l0: ; immediate to hl:de'
+       ld      a,(bc)
+       inc     bc
+       exx
+       ld      e,a
+       exx
+       ld      a,(bc)
+       inc     bc
+       exx
+       ld      d,a
+       exx
+; use rst 8 for math_imm_w0
+       ld      a,(bc)
+       inc     bc
+       ld      l,a
+       ld      a,(bc)
+       inc     bc
+       ld      h,a
+       ret
+
+math_stkld_w: ; sp(imm_w) to de
+       ld      a,(bc)
+       inc     bc
+       ld      l,a
+       ld      a,(bc)
+       inc     bc
+       ld      h,a
+       add     hl,sp
+; use inline code for math_ld_w
+       ld      e,(hl)
+       inc     hl
+       ld      d,(hl)
+       ret
+
+math_stkld_l: ; sp(imm_w) to de:hl'
+       ld      a,(bc)
+       inc     bc
+       ld      l,a
+       ld      a,(bc)
+       inc     bc
+       ld      h,a
+       add     hl,sp
+math_ld_l: ; (hl) to de:hl'
+       ld      a,(hl)
+       inc     hl
+       exx
+       ld      l,a
+       exx
+       ld      a,(hl)
+       inc     hl
+       exx
+       ld      h,a
+       exx
+       ld      e,(hl)
+       inc     hl
+       ld      d,(hl)
+       ret
+
+math_stkst_w: ; de to sp(imm_w)
+       ld      a,(bc)
+       inc     bc
+       ld      l,a
+       ld      a,(bc)
+       inc     bc
+       ld      h,a
+       add     hl,sp
+; use inline code for math_st_w
+       ld      (hl),e
+       inc     hl
+       ld      (hl),d
+       ret
+
+math_stkst_l: ; de:hl' to sp(imm_w)
+       ld      a,(bc)
+       inc     bc
+       ld      l,a
+       ld      a,(bc)
+       inc     bc
+       ld      h,a
+       add     hl,sp
+math_st_l: ; de:hl' to (hl)
+       exx
+       ld      a,l
+       exx
+       ld      (hl),a
+       inc     hl
+       exx
+       ld      a,h
+       exx
+       ld      (hl),a
+       inc     hl
+       ld      (hl),e
+       inc     hl
+       ld      (hl),d
+       ret
+
+math_and_imm_l: ; de:hl' &= imm_l
+       ld      a,(bc)
+       inc     bc
+       exx
+       and     l
+       ld      l,a
+       exx
+       ld      a,(bc)
+       inc     bc
+       exx
+       and     h
+       ld      h,a
+       exx
+math_and_imm_w: ; de &= imm_w
+       ld      a,(bc)
+       inc     bc
+       and     e
+       ld      e,a
+       ld      a,(bc)
+       inc     bc
+       and     d
+       ld      d,a
+       ret
+
+math_and_l: ; de:hl' &= hl:de'
+       exx
+       ld      a,l
+       and     e
+       ld      l,a
+       ld      a,h
+       and     d
+       ld      h,a
+       exx
+math_and_w: ; de &= hl
+       ld      a,e
+       and     l
+       ld      e,a
+       ld      a,d
+       and     h
+       ld      d,a
+       ret
+
+math_or_imm_l: ; de:hl' |= imm_l
+       ld      a,(bc)
+       inc     bc
+       exx
+       or      l
+       ld      l,a
+       exx
+       ld      a,(bc)
+       inc     bc
+       exx
+       or      h
+       ld      h,a
+       exx
+math_or_imm_w: ; de |= imm_w
+       ld      a,(bc)
+       inc     bc
+       or      e
+       ld      e,a
+       ld      a,(bc)
+       inc     bc
+       or      d
+       ld      d,a
+       ret
+
+math_or_l: ; de:hl' |= hl:de'
+       exx
+       ld      a,l
+       or      e
+       ld      l,a
+       ld      a,h
+       or      d
+       ld      h,a
+       exx
+math_or_w: ; de |= hl
+       ld      a,e
+       or      l
+       ld      e,a
+       ld      a,d
+       or      h
+       ld      d,a
+       ret
+
+math_xor_imm_l: ; de:hl' ^= imm_l
+       ld      a,(bc)
+       inc     bc
+       exx
+       xor     l
+       ld      l,a
+       exx
+       ld      a,(bc)
+       inc     bc
+       exx
+       xor     h
+       ld      h,a
+       exx
+math_xor_imm_w: ; de ^= imm_w
+       ld      a,(bc)
+       inc     bc
+       xor     e
+       ld      e,a
+       ld      a,(bc)
+       inc     bc
+       xor     d
+       ld      d,a
+       ret
+
+math_xor_l: ; de:hl' ^= hl:de'
+       exx
+       ld      a,l
+       xor     e
+       ld      l,a
+       ld      a,h
+       xor     d
+       ld      h,a
+       exx
+math_xor_w: ; de ^= hl
+       ld      a,e
+       xor     l
+       ld      e,a
+       ld      a,d
+       xor     h
+       ld      d,a
+       ret
+
+; use inline code for math_add_imm_w, math_add_w
+
+math_add_imm_l: ; de:hl' += imm_l
+       call    math_imm_l0
+math_add_l: ; de:hl' += hl:de'
+       exx
+       add     hl,de
+       exx
+       adc     hl,de
+       ex      de,hl
+       ret
+
+; use addition for math_sub_imm_w, math_sub_imm_l
+; use inline code for math_sub_w
+
+math_sub_l: ; de:hl' -= hl:de'
+       exx
+       or      a
+       sbc     hl,de
+       exx
+       ex      de,hl
+       sbc     hl,de
+       ex      de,hl
+       ret
+
+; use inline code for math_subrev_imm_w, math_subrev_w
+
+math_subrev_imm_l: ; de:hl' = imm_l - de:hl'
+       call    math_imm_l0
+math_subrev_l: ; de:hl' = hl:de' - de:hl'
+       exx
+       ex      de,hl
+       or      a
+       sbc     hl,de
+       exx
+       sbc     hl,de
+       ex      de,hl
+       ret
+
+; use rst 8 then math_cmp_sw for math_cmp_imm_sw
+math_cmp_sw: ; cf=1 de < hl, zf=1 de == hl, signed
+       ex      de,hl ; swap operands, then fall into the reversed compare
+; use rst 8 then math_cmprev_sw for math_cmprev_imm_sw
+math_cmprev_sw: ; cf=1 hl < de, zf=1 hl == de, signed
+       or      a ; clear cf so sbc is a plain subtract
+       sbc     hl,de ; sets zf and the overflow (p/v) flag; hl is overwritten with the difference
+       ld      a,h
+       rla ; cf = sign bit of the difference; zf and p/v are left alone
+       ret     po ; no signed overflow: the sign bit is the answer
+       ccf ; overflow: the sign bit is inverted, so flip it
+       ret
+
+; use rst 8 then inline code for math_cmp_imm_uw, math_cmprev_imm_uw
+; use inline code for math_cmp_uw, math_cmprev_uw
+
+math_cmp_imm_sl: ; cf=1 de:hl' < imm_l, zf=1 de:hl' == imm_l, signed
+       call    math_imm_l0
+math_cmp_sl: ; cf=1 de:hl' < hl:de', zf=1 de:hl' == hl:de', signed
+       ex      de,hl
+       or      a
+       sbc     hl,de
+       jr      z,cmp_l_entry
+       ld      a,h
+       rla
+       ret     po
+       ccf
+       ret
+
+math_cmp_imm_ul: ; cf=1 de:hl' < imm_l, zf=1 de:hl' == imm_l, unsigned
+       call    math_imm_l0
+math_cmp_ul: ; cf=1 de:hl' < hl:de', zf=1 de:hl' == hl:de', unsigned
+       ex      de,hl
+       or      a
+       sbc     hl,de
+       ret     nz
+cmp_l_entry:
+       exx
+       sbc     hl,de
+       exx
+       ret
+
+math_cmprev_imm_sl: ; cf=1 imm_l < de:hl', zf=1 imm_l == de:hl', signed (imm_l is loaded into hl:de')
+       call    math_imm_l0
+math_cmprev_sl: ; cf=1 hl:de' < de:hl', zf=1 hl:de' == de:hl', signed
+       or      a
+       sbc     hl,de ; compare high words first
+       jr      z,cmprev_l_entry ; high words equal: the low words decide, compared unsigned
+       ld      a,h
+       rla ; cf = sign bit of the high-word difference
+       ret     po ; no signed overflow: the sign bit is the answer
+       ccf ; overflow: flip it
+       ret
+
+math_cmprev_imm_ul: ; cf=1 imm_l < de:hl', zf=1 imm_l == de:hl', unsigned (imm_l is loaded into hl:de')
+       call    math_imm_l0
+math_cmprev_ul: ; cf=1 hl:de' < de:hl', zf=1 hl:de' == de:hl', unsigned
+       or      a
+       sbc     hl,de ; compare high words first
+       ret     nz ; high words differ: cf from this subtract is the result
+cmprev_l_entry: ; reached with cf=0; compares the low words as de' - hl'
+       exx
+       ex      de,hl
+       sbc     hl,de
+       exx
+       ret
+
+math_sl_imm_w: ; de <<= imm_b, imm_b in [0, 0x10)
+       ld      a,(bc)
+       inc     bc
+       ld      l,a
+math_sl_w: ; de <<= l & 0xf
+       ex      de,hl
+math_slrev_w: ; de = hl << (e & 0xf)
+       ; by 1
+       bit     0,e
+       jr      z,1$
+       add     hl,hl
+1$:    ; by 2
+       bit     1,e
+       jr      z,2$
+       add     hl,hl
+       add     hl,hl
+2$:    ; by 4
+       bit     2,e
+       jr      z,3$
+       add     hl,hl
+       add     hl,hl
+       add     hl,hl
+       add     hl,hl
+3$:    ; by 8
+       bit     3,e
+       ex      de,hl
+       ret     z
+       ld      d,e
+       ld      e,0
+       ret
+
+math_sl_imm_l: ; de:hl' << imm_b, imm_b in [0, 0x20)
+       ld      a,(bc)
+       inc     bc
+       ld      l,a
+math_sl_l: ; de:hl' <<= l & 0x1f
+       ex      de,hl
+       ; by 1
+       bit     0,e
+       jr      z,1$
+       exx
+       add     hl,hl
+       exx
+       adc     hl,hl
+1$:    ; by 2
+       bit     1,e
+       jr      z,2$
+       exx
+       add     hl,hl
+       exx
+       adc     hl,hl
+       exx
+       add     hl,hl
+       exx
+       adc     hl,hl
+2$:    ; by 4
+       bit     2,e
+       jr      z,3$
+       exx
+       add     hl,hl
+       exx
+       adc     hl,hl
+       exx
+       add     hl,hl
+       exx
+       adc     hl,hl
+       exx
+       add     hl,hl
+       exx
+       adc     hl,hl
+       exx
+       add     hl,hl
+       exx
+       adc     hl,hl
+3$:    ; by 8
+       bit     3,e
+       jr      z,4$
+       exx
+       ld      a,h
+       ld      h,l
+       ld      l,0
+       exx
+       ld      h,l
+       ld      l,a
+4$:    ; by 16
+       bit     4,e
+       ex      de,hl
+       ret     z
+       exx
+       push    hl
+       ld      hl,0
+       exx
+       pop     de
+       ret
+
+math_sr_uw: ; de >>= l & 0xf, logical
+       ex      de,hl ; swap so the value is in hl and the count in e
+math_srrev_uw: ; de = hl >> (e & 0xf), logical
+       ld      a,e
+       and     0xf ; was 0x1f: counts above 16 left the [7, 0x17) range sr_w_entry decodes and gave wrong shifts
+       add     7 ; bias the count the way sr_w_entry expects
+math_srrev_uw0: ; de = hl >> (a - 7), a in [7, 0x17), logical (a immediate)
+       ld      e,a
+       sub     a ; extension byte = 0 for a logical shift
+       jr      sr_w_entry
+
+math_sr_sw: ; de >>= l & 0xf, arithmetic
+       ex      de,hl ; swap so the value is in hl and the count in e
+math_srrev_sw: ; de = hl >> (e & 0xf), arithmetic
+       ld      a,e
+       and     0xf
+       add     7 ; bias the count the way sr_w_entry expects
+math_srrev_sw0: ; de = hl >> (a - 7), a in [7, 0x17), arithmetic (a immediate)
+       ld      e,a
+       ld      a,h
+       rla ; cf = sign bit of the value
+       sbc     a,a ; extension byte = 0x00 or 0xff
+sr_w_entry: ; in: hl = value, a = extension byte, e = count + 7; out: de = result
+       ; by -1
+       bit     0,e ; each clear low bit of e shifts a:hl left; whole bytes are dropped afterwards
+       jr      nz,1$
+       add     hl,hl
+       rla
+1$:    ; by -2
+       bit     1,e
+       jr      nz,2$
+       add     hl,hl
+       rla
+       add     hl,hl
+       rla
+2$:    ; by -4
+       bit     2,e
+       jr      nz,3$
+       add     hl,hl
+       rla
+       add     hl,hl
+       rla
+       add     hl,hl
+       rla
+       add     hl,hl
+       rla
+3$:    ; by 8
+       bit     3,e
+       jr      z,4$
+       ld      e,h ; result = a:h, i.e. a:hl with the low byte dropped
+       ld      d,a
+       ret
+4$:    ; by 16 (can't occur simultaneously with by 8, given e in [7, 0x17))
+       bit     4,e
+       ex      de,hl ; unshifted-bytes case: result is hl as it stands
+       ret     z
+       ld      e,a ; result low byte = a, high byte = sign fill of a
+       rla
+       sbc     a,a
+       ld      d,a
+       ret
+
+math_mul_imm_l: ; de:hl' *= imm_l, big-endian imm_l
+       exx     
+       ex      de,hl
+       sub     a
+       ld      l,a
+       ld      h,a
+       exx
+       ld      l,a
+       ld      h,a
+       ld      a,(bc)
+       inc     bc
+       call    mul_l0
+       ld      a,(bc)
+       inc     bc
+       call    mul_l
+       ld      a,(bc)
+       inc     bc
+       call    mul_l
+       ld      a,(bc)
+       inc     bc
+       call    mul_l
+       ex      de,hl
+       ret
+
+math_sr_ul: ; de:hl' >>= l & 0x1f, logical
+       ld      a,l
+       and     0x1f
+       add     7
+math_sr_ul0: ; de:hl' >>= a - 7, a in [7, 0x27), logical (for immediates)
+       ld      l,a
+       sub     a
+       jr      sr_l_entry
+
+math_sr_sl: ; de:hl' >>= l & 0x1f, arithmetic
+       ld      a,l
+       and     0x1f
+       add     7
+math_sr_sl0: ; de:hl' >>= a - 7, a in [7, 0x27), arithmetic (for immediates)
+       ld      l,a
+       ld      a,d
+       rla
+       sbc     a,a
+sr_l_entry:
+       ex      de,hl
+       ; by -1
+       bit     0,e
+       jr      nz,1$
+       exx
+       add     hl,hl
+       exx
+       adc     hl,hl
+       rla
+1$:    ; by -2
+       bit     1,e
+       jr      nz,2$
+       exx
+       add     hl,hl
+       exx
+       adc     hl,hl
+       rla
+       exx
+       add     hl,hl
+       exx
+       adc     hl,hl
+       rla
+2$:    ; by -4
+       bit     2,e
+       jr      nz,3$
+       exx
+       add     hl,hl
+       exx
+       adc     hl,hl
+       rla
+       exx
+       add     hl,hl
+       exx
+       adc     hl,hl
+       rla
+       exx
+       add     hl,hl
+       exx
+       adc     hl,hl
+       rla
+       exx
+       add     hl,hl
+       exx
+       adc     hl,hl
+       rla
+3$:    ; by 8
+       bit     3,e
+       jr      z,4$
+       ld      d,l
+       ld      l,h
+       ld      h,a
+       rla
+       ld      a,d
+       exx
+       ld      l,h
+       ld      h,a
+       exx
+       sbc     a,a
+4$:    ; by 16
+       bit     4,e
+       jr      z,5$
+       push    hl
+       ld      e,a
+       rla
+       sbc     a,a
+       ld      d,a
+       exx
+       pop     hl
+       exx
+       ret
+5$:    ; by 32 (can't occur simultaneously with by 16)
+       bit     5,e
+       ex      de,hl
+       ret     z
+       exx
+       ld      l,a
+       rla
+       sbc     a,a
+       ld      h,a
+       exx
+       ld      e,a
+       ld      d,a
+       ret
+
+math_mul_imm_w: ; de *= imm_w, big-endian imm_w
+       ld      hl,0
+       ld      a,(bc)
+       inc     bc
+       call    mul_w0
+       ld      a,(bc)
+       inc     bc
+       call    mul_w
+       ex      de,hl
+       ret
+
+math_mul_w: ; de *= hl
+       ld      a,l
+       push    af
+       ld      a,h
+       ld      hl,0
+       call    mul_w0
+       pop     af
+       call    mul_w
+       ex      de,hl
+       ret
+
+mul_w: ; bit 0
+       add     hl,hl
+mul_w0:        rla
+       jr      nc,1$
+       add     hl,de
+1$:    ; bit 1
+       add     hl,hl
+       rla
+       jr      nc,2$
+       add     hl,de
+2$:    ; bit 2
+       add     hl,hl
+       rla
+       jr      nc,3$
+       add     hl,de
+3$:    ; bit 3
+       add     hl,hl
+       rla
+       jr      nc,4$
+       add     hl,de
+4$:    ; bit 4
+       add     hl,hl
+       rla
+       jr      nc,5$
+       add     hl,de
+5$:    ; bit 5
+       add     hl,hl
+       rla
+       jr      nc,6$
+       add     hl,de
+6$:    ; bit 6
+       add     hl,hl
+       rla
+       jr      nc,7$
+       add     hl,de
+7$:    ; bit 7
+       add     hl,hl
+       rla
+       ret     nc
+       add     hl,de
+       ret
+
+math_mul_l: ; de:hl' *= hl:de'
+       ex      de,hl ; de:de' = multiplicand (old hl:de'), hl:hl' = multiplier (old de:hl')
+       exx
+       ld      a,l
+       push    af ; multiplier byte 0 (least significant), consumed last
+       push    hl ; low word of multiplier; a later pop af takes byte 1 from its high byte
+       ld      hl,0 ; clear low word of the product accumulator
+       exx
+       ld      a,l
+       push    af ; multiplier byte 2
+       ld      a,h ; multiplier byte 3 (most significant) is processed first
+       ld      hl,0 ; clear high word of the product accumulator
+       call    mul_l0
+       pop     af ; byte 2
+       call    mul_l
+       pop     af ; byte 1 (flags hold junk; mul_l sets cf before reading it)
+       call    mul_l
+       pop     af ; byte 0
+       call    mul_l
+       ex      de,hl ; product high word into de; low word is already in hl'
+       ret
+
+; mul_l / mul_l0: one multiplier byte of a shift-and-add multiply, unrolled.
+; in:  a = 8 multiplier bits, de:de' = 32-bit multiplicand,
+;      hl:hl' = 32-bit accumulator (main set = high word, primed = low word)
+; out: hl:hl' = accumulator * 256 + multiplicand * a (low 32 bits), a destroyed
+; Each step doubles the accumulator, rotates the next multiplier bit out of
+; the top of a with rla, and adds the multiplicand if that bit was set.
+; The "bit n" comments number the steps in execution order; the first step
+; consumes the most significant bit of a.
+; mul_l0 enters after the first doubling, for use while the accumulator is 0.
+mul_l: ; bit 0
+       exx
+       add     hl,hl                   ; double low word
+       exx
+       adc     hl,hl                   ; double high word with carry from low word
+mul_l0:        rla                     ; next multiplier bit -> carry
+       jr      nc,1$
+       exx
+       add     hl,de                   ; low word += multiplicand low word
+       exx
+       adc     hl,de                   ; high word += multiplicand high word + carry
+1$:    ; bit 1
+       exx
+       add     hl,hl
+       exx
+       adc     hl,hl
+       rla
+       jr      nc,2$
+       exx
+       add     hl,de
+       exx
+       adc     hl,de
+2$:    ; bit 2
+       exx
+       add     hl,hl
+       exx
+       adc     hl,hl
+       rla
+       jr      nc,3$
+       exx
+       add     hl,de
+       exx
+       adc     hl,de
+3$:    ; bit 3
+       exx
+       add     hl,hl
+       exx
+       adc     hl,hl
+       rla
+       jr      nc,4$
+       exx
+       add     hl,de
+       exx
+       adc     hl,de
+4$:    ; bit 4
+       exx
+       add     hl,hl
+       exx
+       adc     hl,hl
+       rla
+       jr      nc,5$
+       exx
+       add     hl,de
+       exx
+       adc     hl,de
+5$:    ; bit 5
+       exx
+       add     hl,hl
+       exx
+       adc     hl,hl
+       rla
+       jr      nc,6$
+       exx
+       add     hl,de
+       exx
+       adc     hl,de
+6$:    ; bit 6
+       exx
+       add     hl,hl
+       exx
+       adc     hl,hl
+       rla
+       jr      nc,7$
+       exx
+       add     hl,de
+       exx
+       adc     hl,de
+7$:    ; bit 7
+       exx
+       add     hl,hl
+       exx
+       adc     hl,hl
+       rla
+       ret     nc                      ; last bit clear: done
+       exx
+       add     hl,de
+       exx
+       adc     hl,de
+       ret
+
+; Signed 16-bit division. The dividend is split into its high byte (a) and
+; low byte (c); each byte is run through one of the unrolled division
+; routines, chosen by the signs of dividend and divisor. The high quotient
+; byte is parked in b while the low byte is computed. bc is preserved.
+; Returns de = quotient, hl = remainder; clobbers a and flags.
+; use rst 8 then math_div_sw for math_div_imm_sw
+math_div_sw: ; de, hl = de / hl, de % hl, signed
+       ex      de,hl
+; use rst 8 then math_divrev_sw for math_divrev_imm_sw
+math_divrev_sw: ; de, hl = hl / de, hl % de, signed
+       push    bc
+       ld      a,h
+       or      a                       ; sign flag = sign of dividend, carry cleared
+       ld      a,d
+       rla                             ; carry = sign of divisor (rla leaves the sign flag alone)
+       jp      m,div_w_n               ; negative dividend
+
+       ; positive dividend
+       ld      a,h                     ; a = dividend high byte
+       ld      c,l                     ; c = dividend low byte
+       ld      hl,0                    ; initial remainder (ld does not touch the flags)
+       jr      nc,div_w_pp             ; positive dividend, positive divisor
+
+       ; positive dividend, negative divisor
+       call    div_w_n1
+       ld      b,a                     ; b = quotient high byte
+       ld      a,c
+       call    div_w_ncf               ; low byte, entry point chosen by returned carry
+       inc     a                       ; inc does not change carry
+       jr      c,1$
+       sbc     hl,de                   ; carry is clear here, so a plain subtract
+1$:    ld      d,b
+       ld      e,a
+       pop     bc
+       ret
+
+div_w_n:
+       ; negative dividend
+       dec     hl                      ; reduces remainder by 1 (we inc later)
+       ld      a,h
+       ld      c,l
+       ld      hl,-1
+       jr      c,div_w_nn              ; negative dividend, negative divisor
+
+       ; negative dividend, positive divisor
+       call    div_w1
+       ld      b,a                     ; b = quotient high byte
+       ld      a,c
+       call    div_wcf
+       inc     a
+       jr      c,1$
+       sbc     hl,de                   ; carry is clear here, so a plain subtract
+1$:    inc     hl                      ; get into range -divisor+1..0
+       ld      d,b
+       ld      e,a
+       pop     bc
+       ret
+
+div_w_nn: ; negative dividend, negative divisor
+       call    div_w_n0
+       ld      b,a                     ; b = quotient high byte
+       ld      a,c
+       call    div_w_ncf
+       jr      nc,1$
+       add     hl,de
+1$:    inc     hl                      ; get into range divisor+1..0
+       ld      d,b
+       ld      e,a
+       pop     bc
+       ret
+
+; Unsigned 16-bit division; math_div_imm_uw first fetches the divisor as an
+; inline 16-bit immediate with rst 8. Returns de = quotient, hl = remainder;
+; bc is preserved, a and flags are clobbered.
+; div_w_pp is also the tail of the signed routine for two positive operands.
+; NOTE(review): div_w0/div_w1 judge the remainder's sign from the 16-bit
+; carry alone; confirm the result for divisors >= 0x8000 (e.g. 0xffff/0xc000).
+math_div_imm_uw:
+       rst     8
+math_div_uw: ; de, hl = de / hl, de % hl, unsigned
+       ex      de,hl
+; use rst 8 then math_divrev_uw for math_divrev_imm_uw
+math_divrev_uw: ; de, hl = hl / de, hl % de, unsigned
+       push    bc
+       ld      a,h                     ; a = dividend high byte
+       ld      c,l                     ; c = dividend low byte
+       ld      hl,0                    ; initial remainder
+div_w_pp: ; positive dividend, positive divisor
+       call    div_w0
+       ld      b,a                     ; b = quotient high byte
+       ld      a,c
+       call    div_wcf                 ; low byte, entry point chosen by returned carry
+       jr      nc,1$
+       add     hl,de                   ; cf=1: remainder came back negative, correct it
+1$:    ld      d,b
+       ld      e,a
+       pop     bc
+       ret
+
+; non-restoring division routine
+
+; de = divisor, hl:a = dividend with hl = previous remainder, a = next byte
+; enter at div_w0 with positive remainder in hl, such that hl < de
+; enter at div_w1 with negative remainder in hl, such that hl >= -de
+; (below, div0/div1/div01/div11 are shorthand for div_w0/div_w1/div_w01/div_w11)
+
+; div0/1 return a = 8-bit quotient as an odd number interpreted as -ff..ff,
+; by summing positive/negative place values, e.g. -80 +40 +20 -10 +8 -4 -2 +1
+
+; if entered at div0, there is a -80 and so quotient is in range -ff..-1
+; if entered at div1, there is a +80 and so quotient is in range 1..ff
+; falls out of loop after div01 with positive remainder, div11 with negative,
+; depending on this we should re-enter at div0 or div1, signalled by cf return
+
+; the successive quotient bytes can be concatenated into a full quotient,
+; but negative bytes require the next higher quotient byte to be decremented,
+; we know in advance if this will happen because the implied sign of the
+; quotient byte depends only on whether we entered at div0 or div1, hence,
+; before the div11 return we'll decrement to compensate for next negative byte
+
+; the decrement can also be seen as compensating for the extra add hl,de that
+; may be needed to make negative remainder positive before return to caller,
+; thus leaving quotient in a consistent state regardless of which exit taken,
+; remainder needs the add hl,de if cf=1 returned (equiv. return byte is even)
+
+; in the following code each sbc hl,de gets an inc a and each add hl,de gets
+; a dec a, guaranteeing the integrity of the division, the initial scf/rla is
+; needed to make the result 100 + -ff..ff or 1..1ff, so that the decrements
+; cannot borrow into the upcoming dividend bits also held in a, and there must
+; be another shift between the scf/rla and increment/decrement so that the scf
+; is implicitly in the 100s place, making the code awkward though it's correct
+
+; NOTE: the code has since been optimized to only inc/dec a when the remainder
+; crosses zero, so the analysis above is partly out of date and needs fixing
+
+; Unrolled division of one dividend byte, for use with a positive divisor.
+; in:  de = divisor, hl = remainder so far, a = next dividend byte
+; out: a = quotient byte, hl = new remainder,
+;      cf = 0 when leaving through div_w08, cf = 1 through div_w18
+; div_wcf picks the entry from the carry returned by the previous byte:
+; cf = 0 enters at div_w0, cf = 1 at div_w1.
+; Label scheme: div_w0N is step N with the remainder "above" (subtract the
+; divisor next), div_w1N is step N with the remainder "below" (add it next).
+; A carry out of the add/subtract switches between the two chains.
+div_wcf:
+       jr      c,div_w1
+div_w0: ; bit 0, above
+       scf
+       rla                             ; shift 1 into a, top dividend bit -> carry
+       adc     hl,hl                   ; remainder = remainder * 2 + dividend bit
+       sbc     hl,de
+       jr      nc,div_w01
+       dec     a
+div_w11: ; bit 1, below
+       add     a,a                     ; next dividend bit -> carry
+       adc     hl,hl
+       add     hl,de
+       jr      nc,div_w12
+       inc     a
+div_w02: ; bit 2, above
+       add     a,a
+       adc     hl,hl
+       sbc     hl,de
+       jr      nc,div_w03
+       dec     a
+div_w13: ; bit 3, below
+       add     a,a
+       adc     hl,hl
+       add     hl,de
+       jr      nc,div_w14
+       inc     a
+div_w04: ; bit 4, above
+       add     a,a
+       adc     hl,hl
+       sbc     hl,de
+       jr      nc,div_w05
+       dec     a
+div_w15: ; bit 5, below
+       add     a,a
+       adc     hl,hl
+       add     hl,de
+       jr      nc,div_w16
+       inc     a
+div_w06: ; bit 6, above
+       add     a,a
+       adc     hl,hl
+       sbc     hl,de
+       jr      nc,div_w07
+       dec     a
+div_w17: ; bit 7, below
+       add     a,a
+       adc     hl,hl
+       add     hl,de
+       jr      nc,div_w18
+       inc     a
+div_w08: ; done, above
+       add     a,a
+       dec     a
+       or      a                       ; return cf = 0
+       ret
+
+div_w1: ; bit 0, below
+       add     a,a
+       adc     hl,hl
+       add     hl,de
+       jr      nc,div_w11
+       inc     a
+div_w01: ; bit 1, above
+       add     a,a
+       adc     hl,hl
+       sbc     hl,de
+       jr      nc,div_w02
+       dec     a
+div_w12: ; bit 2, below
+       add     a,a
+       adc     hl,hl
+       add     hl,de
+       jr      nc,div_w13
+       inc     a
+div_w03: ; bit 3, above
+       add     a,a
+       adc     hl,hl
+       sbc     hl,de
+       jr      nc,div_w04
+       dec     a
+div_w14: ; bit 4, below
+       add     a,a
+       adc     hl,hl
+       add     hl,de
+       jr      nc,div_w15
+       inc     a
+div_w05: ; bit 5, above
+       add     a,a
+       adc     hl,hl
+       sbc     hl,de
+       jr      nc,div_w06
+       dec     a
+div_w16: ; bit 6, below
+       add     a,a
+       adc     hl,hl
+       add     hl,de
+       jr      nc,div_w17
+       inc     a
+div_w07: ; bit 7, above
+       add     a,a
+       adc     hl,hl
+       sbc     hl,de
+       jr      nc,div_w08
+       dec     a
+div_w18: ; done, below
+       add     a,a
+       ;inc    a
+       ;dec    a                       ; compensation
+       scf                             ; return cf = 1
+       ret
+
+; div_w_n0/1 are the same as div_w0/1 but with the carry sense reversed after
+; adding/subtracting the divisor; this is for negative divisors, where we
+; expect a carry (a carry means no zero crossing)
+
+; when divisor negated, remainder also negated, so we expect to do subtraction
+; when remainder negative and vice versa, need to clear carry after add hl,hl
+
+; Unrolled division of one dividend byte, for use with a negative divisor.
+; Same register interface and label scheme as div_wcf/div_w0/div_w1:
+; in:  de = divisor, hl = remainder so far, a = next dividend byte
+; out: a = quotient byte, hl = new remainder,
+;      cf = 0 when leaving through div_w_n08, cf = 1 through div_w_n18
+; Differences: the branches after add/sbc test the opposite carry sense, and
+; each sbc is preceded by "or a" to clear the carry left by adc hl,hl.
+div_w_ncf:
+       jr      c,div_w_n1
+div_w_n0: ; bit 0, above
+       scf
+       rla                             ; shift 1 into a, top dividend bit -> carry
+       adc     hl,hl                   ; remainder = remainder * 2 + dividend bit
+       or      a                       ; clear carry before the subtract
+       sbc     hl,de
+       jr      c,div_w_n01
+       dec     a
+div_w_n11: ; bit 1, below
+       add     a,a                     ; next dividend bit -> carry
+       adc     hl,hl
+       add     hl,de
+       jr      c,div_w_n12
+       inc     a
+div_w_n02: ; bit 2, above
+       add     a,a
+       adc     hl,hl
+       or      a
+       sbc     hl,de
+       jr      c,div_w_n03
+       dec     a
+div_w_n13: ; bit 3, below
+       add     a,a
+       adc     hl,hl
+       add     hl,de
+       jr      c,div_w_n14
+       inc     a
+div_w_n04: ; bit 4, above
+       add     a,a
+       adc     hl,hl
+       or      a
+       sbc     hl,de
+       jr      c,div_w_n05
+       dec     a
+div_w_n15: ; bit 5, below
+       add     a,a
+       adc     hl,hl
+       add     hl,de
+       jr      c,div_w_n16
+       inc     a
+div_w_n06: ; bit 6, above
+       add     a,a
+       adc     hl,hl
+       or      a
+       sbc     hl,de
+       jr      c,div_w_n07
+       dec     a
+div_w_n17: ; bit 7, below
+       add     a,a
+       adc     hl,hl
+       add     hl,de
+       jr      c,div_w_n18
+       inc     a
+div_w_n08: ; done, above
+       add     a,a
+       dec     a
+       or      a                       ; return cf = 0
+       ret
+
+div_w_n1: ; bit 0, below
+       add     a,a
+       adc     hl,hl
+       add     hl,de
+       jr      c,div_w_n11
+       inc     a
+div_w_n01: ; bit 1, above
+       add     a,a
+       adc     hl,hl
+       or      a
+       sbc     hl,de
+       jr      c,div_w_n02
+       dec     a
+div_w_n12: ; bit 2, below
+       add     a,a
+       adc     hl,hl
+       add     hl,de
+       jr      c,div_w_n13
+       inc     a
+div_w_n03: ; bit 3, above
+       add     a,a
+       adc     hl,hl
+       or      a
+       sbc     hl,de
+       jr      c,div_w_n04
+       dec     a
+div_w_n14: ; bit 4, below
+       add     a,a
+       adc     hl,hl
+       add     hl,de
+       jr      c,div_w_n15
+       inc     a
+div_w_n05: ; bit 5, above
+       add     a,a
+       adc     hl,hl
+       or      a
+       sbc     hl,de
+       jr      c,div_w_n06
+       dec     a
+div_w_n16: ; bit 6, below
+       add     a,a
+       adc     hl,hl
+       add     hl,de
+       jr      c,div_w_n17
+       inc     a
+div_w_n07: ; bit 7, above
+       add     a,a
+       adc     hl,hl
+       or      a
+       sbc     hl,de
+       jr      c,div_w_n08
+       dec     a
+div_w_n18: ; done, below
+       add     a,a
+       ;inc    a
+       ;dec    a                       ; compensation
+       scf                             ; return cf = 1
+       ret
+
+; debugging
+
+; Debug opcode handler: prints af, bc, de, hl, sp and (sp) as six 4-digit hex
+; words separated by spaces and terminated by cr/lf, then dispatches the next
+; opcode. All values are those on entry; sp is the stack pointer on entry and
+; (sp) the word it points at.
+; f, bc (before the opcode fetch), de and hl are restored before the
+; dispatch; a is overwritten by the opcode fetch.
+print_trace: ; print af, bc, de, hl, sp, (sp)
+       push    hl                      ; stack: hl
+       push    af                      ; stack: af, hl
+       pop     hl
+       push    hl                      ; hl = af, stack unchanged
+       call    print_word              ; af
+       ld      a,0x20                  ; space
+       call    print_char
+       ld      l,c
+       ld      h,b
+       call    print_word              ; bc
+       ld      a,0x20
+       call    print_char
+       ld      l,e
+       ld      h,d
+       call    print_word              ; de
+       ld      a,0x20
+       call    print_char
+       pop     af                      ; the saved af is on top of the stack, so
+       pop     hl                      ; unstack both to reach the saved hl,
+       push    hl                      ; then put them back in the same order
+       push    af
+       call    print_word              ; hl
+       ld      a,0x20
+       call    print_char
+       ld      hl,4                    ; skip the saved af and hl
+       add     hl,sp                   ; hl = sp as it was on entry
+       call    print_word              ; sp
+       ld      a,0x20
+       call    print_char
+       ld      a,(hl)                  ; fetch the word at the entry sp
+       inc     hl
+       ld      h,(hl)
+       ld      l,a
+       call    print_word              ; (sp)
+       ld      a,0xd
+       call    print_char
+       ld      a,0xa
+       call    print_char
+       pop     af
+       pop     hl
+       ld      a,(bc)                  ; dispatch: next opcode byte is the low byte
+       inc     bc                      ; of the handler address, h supplies the page
+       ld      l,a
+       jp      (hl)
+
+; Print hl as four lowercase hex digits, high byte first.
+; Preserves af and hl.
+print_word:
+       push    af
+       ld      a,h
+       call    print_byte
+       ld      a,l
+       call    print_byte
+       pop     af
+       ret
+
+; Print a as two lowercase hex digits, high nibble first. Preserves af.
+print_byte:
+       push    af                      ; restored on exit
+       push    af                      ; second copy supplies the low nibble later
+       rrca
+       rrca
+       rrca
+       rrca                            ; high nibble -> low nibble
+       call    print_digit
+       pop     af
+       call    print_digit             ; print_digit masks off the high nibble itself
+       pop     af
+       ret
+
+; print_digit: print the low nibble of a as one hex digit from the digits
+; table, then fall through into print_char.
+; print_char: write the character in a to the console with a call to
+; address 5, passing c = 2 and e = character (CP/M BDOS console output).
+; Both preserve bc, de and hl; a is not saved.
+print_digit:
+       push    de
+       push    hl
+       and     0xf
+       ld      e,a
+       ld      d,0
+       ld      hl,digits
+       add     hl,de                   ; hl = &digits[a & 0xf]
+       ld      a,(hl)
+       pop     hl
+       pop     de
+print_char:
+       push    bc
+       push    de
+       push    hl
+       ld      e,a                     ; e = character to output
+       ld      c,2                     ; c = function number
+       call    5
+       pop     hl
+       pop     de
+       pop     bc
+       ret
+
+; hex digit lookup table indexed by nibble value, used by print_digit
+digits:
+       .ascii  '0123456789abcdef'
+
+; sm code
+
+; Program entry: patch the trace vector, install the rst 8 helper, then start
+; interpreting the bytecode at sm_start.
+start:
+       ld      hl,print_trace
+       ld      (page0_trace + 1),hl    ; overwrite the 16-bit operand at page0_trace (presumably a jp)
+
+       ld      hl,rst8
+       ld      de,8
+       ld      bc,7                    ; length of the rst8 routine in bytes
+       ldir                            ; copy it to address 8, the rst 8 vector
+
+       ld      bc,sm_start             ; bc = bytecode program counter
+       jp      page0_dispatch0
+; Top-level bytecode: call sm_main, then escape to machine code and jump to
+; address 0. Each ".db <label" is an opcode (low byte of its handler address).
+sm_start:
+       .db     <page0_imm_call
+       .dw     sm_main
+       .dw     0                       ; second operand of imm_call (presumably stack bytes to drop - TODO confirm)
+       .db     <page0_esc
+       jp      0
+
+; Load the 16-bit little-endian value at (bc) into hl and advance bc past it.
+; Clobbers a. start copies these 7 bytes to address 8 so that "rst 8" calls
+; it; the routine must stay 7 bytes long to match that copy.
+rst8:  ; immediate to hl
+       ld      a,(bc)
+       inc     bc
+       ld      l,a                     ; low byte
+       ld      a,(bc)
+       inc     bc
+       ld      h,a                     ; high byte
+       ret
+
+; Bytecode for the test program: computes sm_factorial(5) and prints the
+; result as five decimal digits followed by cr/lf.
+; Each ".db <label" is an opcode, i.e. the low byte of a handler address;
+; ".dw" values are the opcode's inline operands. The page0_/page1_ prefix
+; presumably names the 256-byte page the handler lives in, with
+; page0_page1/page1_page0 switching pages - confirm against the handlers.
+sm_main:
+       ; create stack frame
+       .db     <page0_stkadj
+       .dw     -2
+
+       ; push argument
+       .db     <page0_imm_w
+       .dw     5
+
+       ; push result pointer
+       .db     <page1_page0
+       .db     <page0_stkptr
+       .dw     2
+
+       ; call sm_factorial(argument)
+       .db     <page1_page0
+       .db     <page0_imm_call
+       .dw     sm_factorial
+       .dw     4
+
+       ; print 10000s
+       .db     <page0_stkld_w
+       .dw     0+2
+       .db     <page1_imm_div_sw
+       .dw     10000
+       .db     <page1_stkst_w
+       .dw     2+2
+       .db     <page0_page1
+       .db     <page1_imm_add_w
+       .dw     '0
+       .db     <page1_page0
+       .db     <page0_imm_call
+       .dw     sm_print_char
+       .dw     2
+
+       ; print 1000s
+       .db     <page0_stkld_w
+       .dw     0+2
+       .db     <page1_imm_div_sw
+       .dw     1000
+       .db     <page1_stkst_w
+       .dw     2+2
+       .db     <page0_page1
+       .db     <page1_imm_add_w
+       .dw     '0
+       .db     <page1_page0
+       .db     <page0_imm_call
+       .dw     sm_print_char
+       .dw     2
+
+       ; print 100s
+       .db     <page0_stkld_w
+       .dw     0+2
+       .db     <page1_imm_div_sw
+       .dw     100
+       .db     <page1_stkst_w
+       .dw     2+2
+       .db     <page0_page1
+       .db     <page1_imm_add_w
+       .dw     '0
+       .db     <page1_page0
+       .db     <page0_imm_call
+       .dw     sm_print_char
+       .dw     2
+
+       ; print 10s
+       .db     <page0_stkld_w
+       .dw     0+2
+       .db     <page1_imm_div_sw
+       .dw     10
+       .db     <page1_stkst_w
+       .dw     2+2
+       .db     <page0_page1
+       .db     <page1_imm_add_w
+       .dw     '0
+       .db     <page1_page0
+       .db     <page0_imm_call
+       .dw     sm_print_char
+       .dw     2
+
+       ; print 1s
+       .db     <page0_stkld_w
+       .dw     0+2
+       .db     <page1_imm_add_w
+       .dw     '0
+       .db     <page1_page0
+       .db     <page0_imm_call
+       .dw     sm_print_char
+       .dw     2
+
+       ; print cr
+       .db     <page0_imm_w
+       .dw     0xd
+       .db     <page1_page0
+       .db     <page0_imm_call
+       .dw     sm_print_char
+       .dw     2
+
+       ; print lf
+       .db     <page0_imm_w
+       .dw     0xa
+       .db     <page1_page0
+       .db     <page0_imm_call
+       .dw     sm_print_char
+       .dw     2
+
+       ; destroy stack frame
+       .db     <page0_stkadj
+       .dw     2
+
+       ; return
+       .db     <page0_ret
+
+; Bytecode for a recursive factorial: takes an argument and a result pointer
+; on the stack and stores the result through the pointer, i.e.
+;   *result = argument < 2 ? 1 : sm_factorial(argument - 1) * argument
+; The out-dented ".db <page0_trace" lines are debugging aids that dump the
+; registers via print_trace at entry and before each return.
+sm_factorial:
+ .db <page0_trace
+       ; get argument
+       .db     <page0_stkld_w
+       .dw     4+2
+
+       ; is argument < 2?
+       .db     <page1_imm_cmp_sw
+       .dw     2
+       .db     <page0_jlt
+       .dw     1$
+
+       ; no, set up for *result =
+       .db     <page0_stkld_w
+       .dw     2+2
+
+       ; get argument
+       .db     <page1_page0
+       .db     <page0_stkld_w
+       .dw     6+2
+
+       ; subtract 1
+       .db     <page1_imm_add_w
+       .dw     -1
+
+       ; push result pointer
+       .db     <page1_page0
+       .db     <page0_stkptr
+       .dw     0
+
+       ; call sm_factorial(argument - 1)
+       .db     <page1_page0
+       .db     <page0_imm_call
+       .dw     sm_factorial
+       .dw     2
+
+       ; get argument
+       .db     <page0_stkld_w
+       .dw     8+2
+
+       ; multiply
+       .db     <page1_mul_w
+
+       ; set *result = sm_factorial(argument - 1) * argument
+       .db     <page1_st_w
+
+       ; return
+ .db <page0_trace
+       .db     <page0_ret
+
+1$:
+       ; yes, set up for *result =
+       .db     <page0_stkld_w
+       .dw     2+2
+
+       ; set *result = 1
+       .db     <page1_page0
+       .db     <page0_imm_w
+       .dw     1
+       .db     <page1_st_w
+
+       ; return
+ .db <page0_trace
+       .db     <page0_ret
+
+; Bytecode-callable wrapper around print_char: escapes to machine code,
+; prints the byte found at sp+2 and returns through page0_ret.
+sm_print_char:
+       .db     <page0_esc              ; the rest of this routine is machine code
+       ld      hl,2
+       add     hl,sp
+       ld      a,(hl)                  ; a = byte at sp+2 (presumably the low byte of the argument - TODO confirm)
+       call    print_char
+       jp      page0_ret