Revamp register usage in math package for shifts, multiplies and divides to remove...

author Nick Downing <nick@ndcode.org>

Tue, 18 Jun 2019 14:23:09 +0000 (00:23 +1000)

committer Nick Downing <nick@ndcode.org>

Tue, 18 Jun 2019 14:24:13 +0000 (00:24 +1000)
author Nick Downing <nick@ndcode.org>
Tue, 18 Jun 2019 14:23:09 +0000 (00:23 +1000)
committer Nick Downing <nick@ndcode.org>
Tue, 18 Jun 2019 14:24:13 +0000 (00:24 +1000)
diff --git a/sm3.asm b/sm3.asm

index 56a26e5..06d1abe 100644 (file)
--- a/sm3.asm
+++ b/sm3.asm
@@ -35,7 +35,7 @@ page0_page1:
  page0_jeq:
         jr      nz,not_taken
  page0_imm_jmp:
-       rst     8
+       rst     0x28
         ld      c,l
         ld      b,h
         jr      page0_dispatch0
@@ -72,7 +72,7 @@ page0_pge:
         jr      page0_dispatch1
  
  page0_imm_call:
-       rst     8
+       rst     0x28
         push    bc
         ld      c,l
         ld      b,h
@@ -81,7 +81,7 @@ page0_imm_call:
  page0_ret:
         pop     bc
  page0_stkadj:
-       rst     8
+       rst     0x28
         add     hl,sp
         ld      sp,hl
  page0_dispatch0:
@@ -92,7 +92,7 @@ page0_dispatch0:
         jp      (hl)
  
  page0_stkptr:
-       rst     8
+       rst     0x28
         add     hl,sp
         ex      de,hl
  page0_dispatch1:
@@ -204,7 +204,7 @@ page1_xor_w:
         jr      page1_dispatch1
  
  page1_imm_add_w:
-       rst     8
+       rst     0x28
         .db     0x3e ; ld a,
  page1_add_w:
         pop     hl
@@ -213,7 +213,7 @@ page1_add_w:
         jr      page1_dispatch1
  
  page1_imm_subrev_w:
-       rst     8
+       rst     0x28
         .db     0x3e ; ld a,
  page1_sub_w:
         pop     hl
@@ -232,7 +232,7 @@ page1_subrev_w:
         jr      page1_dispatch1
  
  page1_imm_cmprev_sw:
-       rst     8
+       rst     0x28
         .db     0x3e ; ld a,
  page1_cmp_sw:
         pop     hl
@@ -240,7 +240,7 @@ page1_cmp_sw:
         jr      page1_dispatch0
  
  page1_imm_cmp_sw:
-       rst     8
+       rst     0x28
         .db     0x3e ; ld a,
  page1_cmprev_sw:
         pop     hl
@@ -248,7 +248,7 @@ page1_cmprev_sw:
         jr      page1_dispatch0
  
  page1_imm_cmprev_uw:
-       rst     8
+       rst     0x28
         .db     0x3e ; ld a,
  page1_cmp_uw:
         pop     hl
@@ -257,7 +257,7 @@ page1_cmp_uw:
         jr      page1_dispatch0
  
  page1_imm_cmp_uw:
-       rst     8
+       rst     0x28
         .db     0x3e ; ld a,
  page1_cmprev_uw:
         pop     hl
@@ -267,7 +267,7 @@ page1_cmprev_uw:
         jr      page1_dispatch0
  
  page1_imm_slrev_w:
-       rst     8
+       rst     0x28
         .db     0x3e ; ld a,
  page1_sl_w:
         pop     hl
@@ -280,9 +280,10 @@ page1_dispatch1:
         jp      (hl)
  
  page1_imm_sl_w:
-       call    math_sl_imm_w
-       jr      page1_dispatch1
-
+       ld      a,(bc)
+       inc     bc
+       ld      l,a
+       .db     0x3e ; ld a,
  page1_slrev_w:
         pop     hl
         call    math_sl_w
@@ -303,7 +304,7 @@ page1_sl_l:
         jr      page1_dispatch2
  
  page1_imm_srrev_sw:
-       rst     8
+       rst     0x28
         .db     0x3e ; ld a,
  page1_sr_sw:
         pop     hl
@@ -337,7 +338,7 @@ page1_sr_sl:
         jr      page1_dispatch2
  
  page1_imm_srrev_uw:
-       rst     8
+       rst     0x28
         .db     0x3e ; ld a,
  page1_sr_uw:
         pop     hl
@@ -380,7 +381,7 @@ page1_mul_w:
         jr      page1_dispatch1
  
  page1_imm_divrev_sw:
-       rst     8
+       rst     0x28
         .db     0x3e ; ld a,
  page1_div_sw:
         pop     hl
@@ -389,7 +390,7 @@ page1_div_sw:
         jr      page1_dispatch1
  
  page1_imm_div_sw:
-       rst     8
+       rst     0x28
         .db     0x3e ; ld a,
  page1_divrev_sw:
         pop     hl
@@ -398,7 +399,7 @@ page1_divrev_sw:
         jr      page1_dispatch1
  
  page1_imm_divrev_uw:
-       rst     8
+       rst     0x28
         .db     0x3e ; ld a,
  page1_div_uw:
         pop     hl
@@ -407,7 +408,7 @@ page1_div_uw:
         jr      page1_dispatch1
  
  page1_imm_div_uw:
-       rst     8
+       rst     0x28
         .db     0x3e ; ld a,
  page1_divrev_uw:
         pop     hl
@@ -482,100 +483,73 @@ page2_xor_l:
         jr      page2_dispatch2
  
  page2_imm_add_l:
-       call    math_add_imm_l
-       jr      page2_dispatch2
-
+       rst     0x38 ; inline immediate operand to hl:de'
+       .db     0x3e ; ld a,n opcode, swallows the one-byte rst 0x30 below as n
  page2_add_l:
-       exx
-       pop     de
-       exx
-       pop     hl
+       rst     0x30 ; pop the stacked operand to hl:de'
         call    math_add_l
         jr      page2_dispatch2
  
  page2_imm_subrev_l:
-       call    math_subrev_imm_l
-       jr      page2_dispatch2
-
+       rst     0x38 ; inline immediate operand to hl:de'
+       .db     0x3e ; ld a,n opcode, swallows the one-byte rst 0x30 below as n
  page2_sub_l:
-       exx
-       pop     de
-       exx
-       pop     hl
+       rst     0x30 ; pop the stacked operand to hl:de'
         call    math_subrev_l
         jr      page2_dispatch2
  
  ; use addition for page2_imm_sub_l
-
  page2_subrev_l:
-       exx
-       pop     de
-       exx
-       pop     hl
+       rst     0x30 ; pop the stacked operand to hl:de'
         call    math_sub_l
         jr      page2_dispatch2
  
  page2_imm_cmprev_sl:
-       call    math_cmprev_imm_sl
-       jr      page2_dispatch2
-
+       rst     0x38 ; inline immediate operand to hl:de'
+       .db     0x3e ; ld a,n opcode, swallows the one-byte rst 0x30 below as n
  page2_cmp_sl:
-       exx
-       pop     de
-       exx
-       pop     hl
+       rst     0x30 ; pop the stacked operand to hl:de'
         call    math_cmprev_sl
         jr      page2_dispatch2
  
  page2_imm_cmprev_ul:
-       call    math_cmprev_imm_ul
-       jr      page2_dispatch2
-
+       rst     0x38 ; inline immediate operand to hl:de'
+       .db     0x3e ; ld a,n opcode, swallows the one-byte rst 0x30 below as n
  page2_cmp_ul:
-       exx
-       pop     de
-       exx
-       pop     hl
+       rst     0x30 ; pop the stacked operand to hl:de'
         call    math_cmprev_ul
         jr      page2_dispatch2
  
  page2_imm_cmp_sl:
-       call    math_cmp_imm_sl
-       jr      page2_dispatch2
-
+       rst     0x38 ; inline immediate operand to hl:de'
+       .db     0x3e ; ld a,n opcode, swallows the one-byte rst 0x30 below as n
  page2_cmprev_sl:
-       exx
-       pop     de
-       exx
-       pop     hl
+       rst     0x30 ; pop the stacked operand to hl:de'
         call    math_cmp_sl
         jr      page2_dispatch2
  
  page2_imm_cmp_ul:
-       call    math_cmp_imm_ul
-       jr      page2_dispatch2
-
+       rst     0x38 ; inline immediate operand to hl:de'
+       .db     0x3e ; ld a,n opcode, swallows the one-byte rst 0x30 below as n
  page2_cmprev_ul:
-       exx
-       pop     de
-       exx
-       pop     hl
+       rst     0x30 ; pop the stacked operand to hl:de'
         call    math_cmp_ul
         jr      page2_dispatch2
  
-page2_imm_sl_l:
-       call    math_sl_imm_l
-page2_dispatch2:
+page2_imm_sl_l: ; long shift left, count is the next byte at (bc), loaded to l
         ld      a,(bc)
         inc     bc
         ld      l,a
-       ld      h,page2
-       jp      (hl)
-
+       .db     0x3e ; ld a,n opcode, swallows the one-byte pop hl below as n
  page2_slrev_l:
         pop     hl
         call    math_sl_l
-       jr      page2_dispatch2
+page2_dispatch2: ; fetch the next byte at (bc) and jump to that page2 handler
+       ld      a,(bc)
+       inc     bc
+       ld      l,a ; handler address, low byte
+       ld      h,page2 ; handler address, high byte
+       jp      (hl)
  
  page2_imm_sr_sl:
         ld      a,(bc)
@@ -705,6 +679,11 @@ test_lt:
  
  ; math package
  
+; lightweight routines
+
+; these can be duplicated without much cost, and will be called often, so the
+; calling convention is geared to what the interpreter needs (de:hl' and so on)
+
  math_imm_l: ; immediate to de:hl'
         ld      a,(bc)
         inc     bc
@@ -725,26 +704,6 @@ math_imm_w: ; immediate to de
         ld      d,a
         ret
  
-math_imm_l0: ; immediate to hl:de'
-       ld      a,(bc)
-       inc     bc
-       exx
-       ld      e,a
-       exx
-       ld      a,(bc)
-       inc     bc
-       exx
-       ld      d,a
-       exx
-; use rst 8 for math_imm_w0
-       ld      a,(bc)
-       inc     bc
-       ld      l,a
-       ld      a,(bc)
-       inc     bc
-       ld      h,a
-       ret
-
  math_stkld_w: ; sp(imm_w) to de
         ld      a,(bc)
         inc     bc
@@ -1053,13 +1012,35 @@ cmprev_l_entry:
         exx
         ret
  
-math_sl_imm_w: ; de <<= imm_b, imm_b in [0, 0x10)
-       ld      a,(bc)
-       inc     bc
-       ld      l,a
-math_sl_w: ; de <<= l & 0xf
+; heavyweight routines
+
+; these have unrolled loops and so there needs to be as much reuse as possible
+
+; for the multiply and divide the unrolled loops are placed in subroutines, so
+; there is some overhead to use them, but it allows the calling code to itself
+; be cheaply unrolled, so the amount left to do is determined by context not
+; by a counter; the tradeoff is do we count loops by push bc/pop bc/djnz or by
+; call/call/call, since both need a stack push/pop it should be about the same
+
+; for these routines the calling convention is geared to whatever the routine
+; needs to work most efficiently, this makes the usage more cumbersome as you
+; have to do something like ex de,hl/call/ex de,hl but it is done this way since
+; otherwise some callsites need to exchange registers into place on one side
+; of a call/ret boundary only for them to be immediately exchanged back again
+
+; as an exception to this we may provide an earlier entry point(s), before the
+; routine proper, which exchanges arguments into place (the caller must still
+; deal with exchanges afterwards), we do this because of callsites that are in
+; the interpreter pages and need to conserve code size, therefore the prefix
+; chosen need not occur more than once, and we'll use the longest such prefix
+
+; while some of these earlier entry points have a meaning, e.g. math_divrev_l
+; before math_div_l, many of them do not, so we will just number them instead
+; (this is because the prefix code can be a compromise between callers' needs)
+
+math_sl_w0: ; hl = de << (l & 0xf)
         ex      de,hl
-math_slrev_w: ; de = hl << (e & 0xf)
+math_sl_w: ; hl <<= e & 0xf
         ; by 1
         bit     0,e
         jr      z,1$
@@ -1078,19 +1059,16 @@ math_slrev_w: ; de = hl << (e & 0xf)
         add     hl,hl
  3$:    ; by 8
         bit     3,e
-       ex      de,hl
         ret     z
-       ld      d,e
-       ld      e,0
+       ld      h,l
+       ld      l,0
         ret
  
-math_sl_imm_l: ; de:hl' << imm_b, imm_b in [0, 0x20)
-       ld      a,(bc)
-       inc     bc
-       ld      l,a
-math_sl_l: ; de:hl' <<= l & 0x1f
+math_sl_l0: ; hl:de' <<= e & 0x1f
+       exx
         ex      de,hl
-math_sl_l0: ; hl:hl' <<= e & 0x1f, for slrev
+       exx
+math_sl_l: ; hl:hl' <<= e & 0x1f
         ; by 1
         bit     0,e
         jr      z,1$
@@ -1140,33 +1118,32 @@ math_sl_l0: ; hl:hl' <<= e & 0x1f, for slrev
         ld      l,a
  4$:    ; by 16
         bit     4,e
-       ex      de,hl
         ret     z
         exx
         push    hl
         ld      hl,0
         exx
-       pop     de
+       pop     hl
         ret
  
-math_sr_uw: ; de >>= l & 0xf, logical
+math_sr_uw0: ; hl = de >> (l & 0xf), logical
         ex      de,hl
-math_srrev_uw: ; de = hl >> (e & 0xf), logical
+math_sr_uw: ; hl >>= e & 0xf, logical
         ld      a,e
-       and     0x1f
+       and     0xf ; word shift count is 4 bits, so a + 7 stays in [7, 0x17)
         add     7
-math_srrev_uw0: ; de = hl >> (a - 7), immediate a in [7, 0x17), logical
+math_sr_uw1: ; hl >>= a - 7, immediate a in [7, 0x17), logical
         ld      e,a
         sub     a
         jr      sr_w_entry
  
-math_sr_sw: ; de >>= l & 0xf, arithmetic
+math_sr_sw0: ; hl = de >> (l & 0xf), arithmetic
         ex      de,hl
-math_srrev_sw: ; de = hl >> (e & 0xf), arithmetic
+math_sr_sw: ; hl >>= e & 0xf, arithmetic
         ld      a,e
         and     0xf
         add     7
-math_srrev_sw0: ; de = hl >> (a - 7), immediate a in [7, 0x17), arithmetic
+math_sr_sw1: ; hl >>= a - 7, immediate a in [7, 0x17), arithmetic
         ld      e,a
         ld      a,h
         rla
@@ -1198,61 +1175,40 @@ sr_w_entry:
  3$:    ; by 8
         bit     3,e
         jr      z,4$
-       ld      e,h
-       ld      d,a
+       ld      l,h
+       ld      h,a
         ret
  4$:    ; by 16 (can't occur simultaneously with by 8)
         bit     4,e
-       ex      de,hl
         ret     z
-       ld      e,a
+       ld      l,a
         rla
         sbc     a,a
-       ld      d,a
-       ret
-
-math_mul_imm_l: ; de:hl' *= imm_l, big-endian imm_l
-       exx     
-       ex      de,hl
-       sub     a
-       ld      l,a
-       ld      h,a
-       exx
-       ld      l,a
         ld      h,a
-       ld      a,(bc)
-       inc     bc
-       call    mul_l0
-       ld      a,(bc)
-       inc     bc
-       call    mul_l
-       ld      a,(bc)
-       inc     bc
-       call    mul_l
-       ld      a,(bc)
-       inc     bc
-       call    mul_l
-       ex      de,hl
         ret
  
-math_sr_ul: ; de:hl' >>= l & 0x1f, logical
+math_sr_ul0: ; hl:de' >>= e & 0x1f, logical
+       exx
         ex      de,hl
-math_sr_ul0: ; de:hl' = hl:hl' >> (e & 0x1f), logical
+       exx
+math_sr_ul: ; hl:hl' >>= e & 0x1f, logical
         ld      a,e
         and     0x1f
         add     7
-math_sr_ul1: ; de:hl' = hl:hl' >> (a - 7), immediate a in [7, 0x27), logical
+math_sr_ul1: ; hl:hl' >>= a - 7, immediate a in [7, 0x27), logical
         ld      e,a
         sub     a
         jr      sr_l_entry
  
-math_sr_sl: ; de:hl' >>= l & 0x1f, arithmetic
+math_sr_sl0: ; hl:de' >>= e & 0x1f, arithmetic
+       exx ; swap de' and hl' so the low word is in hl'
         ex      de,hl
-math_sr_sl0: ; de:hl' = hl:hl' >> (e & 0x1f), arithmetic
+       exx
+math_sr_sl: ; hl:hl' >>= e & 0x1f, arithmetic
         ld      a,e
-       and     0xf
+       and     0x1f ; long shift count is 5 bits, so a + 7 spans [7, 0x27)
         add     7
-math_sr_sl1: ; de:hl' = hl:hl' >> (a - 7), immediate a in [7, 0x17), arithmetic
+math_sr_sl1: ; hl:hl' >>= a - 7, immediate a in [7, 0x27), arithmetic
         ld      e,a
         ld      a,h
         rla
@@ -1319,17 +1275,16 @@ sr_l_entry:
         bit     4,e
         jr      z,5$
         push    hl
-       ld      e,a
+       ld      l,a
         rla
         sbc     a,a
-       ld      d,a
+       ld      h,a
         exx
         pop     hl
         exx
         ret
  5$:    ; by 32 (can't occur simultaneously with by 16)
         bit     5,e
-       ex      de,hl
         ret     z
         exx
         ld      l,a
@@ -1337,10 +1292,11 @@ sr_l_entry:
         sbc     a,a
         ld      h,a
         exx
-       ld      e,a
-       ld      d,a
+       ld      l,a
+       ld      h,a
         ret
  
+; this routine is just an optimization, therefore use interpreter registers
  math_mul_imm_w: ; de *= imm_w, big-endian imm_w
         ld      hl,0
         ld      a,(bc)
@@ -1352,7 +1308,7 @@ math_mul_imm_w: ; de *= imm_w, big-endian imm_w
         ex      de,hl
         ret
  
-math_mul_w: ; de *= hl
+math_mul_w: ; hl *= de
         ld      a,l
         push    af
         ld      a,h
@@ -1360,9 +1316,6 @@ math_mul_w: ; de *= hl
         call    mul_w0
         pop     af
-       call    mul_w
-       ex      de,hl
-       ret
-
+       ; low byte: fall into mul_w, its ret returns to our caller (as in math_mul_l)
  mul_w: ; bit 0
         add     hl,hl
  mul_w0:        rla
@@ -1405,8 +1358,32 @@ mul_w0:  rla
         add     hl,de
         ret
  
-math_mul_l: ; de:hl' *= hl:de'
+; this routine is just an optimization, therefore use interpreter registers
+math_mul_imm_l: ; de:hl' *= imm_l, big-endian imm_l
+       exx     
+       ex      de,hl
+       sub     a
+       ld      l,a
+       ld      h,a
+       exx
+       ld      l,a
+       ld      h,a
+       ld      a,(bc)
+       inc     bc
+       call    mul_l0
+       ld      a,(bc)
+       inc     bc
+       call    mul_l
+       ld      a,(bc)
+       inc     bc
+       call    mul_l
+       ld      a,(bc)
+       inc     bc
+       call    mul_l
         ex      de,hl
+       ret
+
+math_mul_l: ; hl:hl' *= de:de'
         exx
         ld      a,l
         push    af
@@ -1423,10 +1400,6 @@ math_mul_l: ; de:hl' *= hl:de'
         pop     af
         call    mul_l
         pop     af
-       call    mul_l
-       ex      de,hl
-       ret
-
  mul_l: ; bit 0
         exx
         add     hl,hl
@@ -1517,13 +1490,9 @@ mul_l0:  rla
         adc     hl,de
         ret
  
-; word division
-
-; use rst 8 then math_div_sw for math_div_imm_sw
-math_div_sw: ; de, hl = de % hl, de / hl, signed
+math_div_sw0: ; hl, de = de % hl, de / hl, signed
         ex      de,hl
-; use rst 8 then math_divrev_sw for math_divrev_imm_sw
-math_divrev_sw: ; de, hl = hl % de, hl / de, signed
+math_div_sw: ; hl, de = hl % de, hl / de, signed
         push    bc
         ld      a,h
         or      a
@@ -1548,7 +1517,6 @@ math_divrev_sw: ; de, hl = hl % de, hl / de, signed
  1$:    ld      d,b
         ld      e,a
         pop     bc
-       ex      de,hl
         ret
  
  div_w_n:
@@ -1571,7 +1539,6 @@ div_w_n:
         ld      d,b
         ld      e,a
         pop     bc
-       ex      de,hl
         ret
  
  div_w_nn: ; negative dividend, negative divisor
@@ -1585,14 +1552,11 @@ div_w_nn: ; negative dividend, negative divisor
         ld      d,b
         ld      e,a
         pop     bc
-       ex      de,hl
         ret
  
-; use rst 8 then math_div_uw for math_div_imm_uw
-math_div_uw: ; de, hl = de % hl, de / hl, unsigned
+math_div_uw0: ; hl, de = de % hl, de / hl, unsigned
         ex      de,hl
-; use rst 8 then math_divrev_uw for math_divrev_imm_uw
-math_divrev_uw: ; de, hl = hl % de, hl / de, unsigned
+math_div_uw: ; hl, de = hl % de, hl / de, unsigned
         push    bc
         ld      a,h
         ld      c,l
@@ -1607,7 +1571,6 @@ div_w_pp: ; positive dividend, positive divisor
  1$:    ld      d,b
         ld      e,a
         pop     bc
-       ex      de,hl
         ret
  
  ; non-restoring division routine
@@ -1882,13 +1845,11 @@ div_w_n18: ; done, below
         scf
         ret
  
-; long division
-
-math_div_imm_sl:
-       call    math_imm_l0
-math_div_sl: ; de:hl', hl:de' = de:hl' % hl:de', de:hl' / hl:de', signed
+math_div_sl0: ; hl:hl', de:de' = hl:de' % de:hl', hl:de' / de:hl', signed
+       exx
         ex      de,hl
-math_div_sl0: ; ; de:hl', hl:de' = hl:hl' % de:de', hl:hl' / de:de', signed
+       exx
+math_div_sl: ; ; hl:hl', de:de' = hl:hl' % de:de', hl:hl' / de:de', signed
         push    bc
         ld      a,h
         or      a
@@ -1897,6 +1858,11 @@ math_div_sl0: ; ; de:hl', hl:de' = hl:hl' % de:de', hl:hl' / de:de', signed
         jp      m,div_l_n               ; positive dividend
  
         ; positive dividend
+       exx
+       ld      c,l
+       ld      b,h
+       ld      hl,0
+       exx
         ld      a,h
         ld      c,l
         ld      hl,0
@@ -1907,18 +1873,32 @@ math_div_sl0: ; ; de:hl', hl:de' = hl:hl' % de:de', hl:hl' / de:de', signed
         ld      b,a
         ld      a,c
         call    div_l_ncf
+       ld      c,a
+       exx
+       ld      a,b
+       exx
+       call    div_l_ncf
+       exx
+       ld      b,a
+       ld      a,c
+       exx
+       call    div_l_ncf
         inc     a
-       jr      c,1$
+       jr      c,div_l_p_done
+       exx
         sbc     hl,de
-1$:    ld      d,b
-       ld      e,a
-       pop     bc
-       ex      de,hl
-       ret
+       exx
+       sbc     hl,de
+       jr      div_l_p_done
  
  div_l_n:
         ; negative dividend
-       dec     hl                      ; reduces remainder by 1 (we inc later)
+       call    dec_l                   ; reduces remainder by 1 (we inc later)
+       exx
+       ld      c,l
+       ld      b,h
+       ld      hl,-1
+       exx
         ld      a,h
         ld      c,l
         ld      hl,-1
@@ -1929,36 +1909,73 @@ div_l_n:
         ld      b,a
         ld      a,c
         call    div_lcf
+       ld      c,a
+       exx
+       ld      a,b
+       exx
+       call    div_lcf
+       exx
+       ld      b,a
+       ld      a,c
+       exx
+       call    div_lcf
         inc     a
-       jr      c,1$
+       jr      c,div_l_n_done
+       exx
         sbc     hl,de
-1$:    inc     hl                      ; get into range -divisor+1..0
-       ld      d,b
-       ld      e,a
-       pop     bc
-       ex      de,hl
-       ret
+       exx
+       sbc     hl,de
+       jr      div_l_n_done
  
  div_l_nn: ; negative dividend, negative divisor
         call    div_l_n0
         ld      b,a
         ld      a,c
         call    div_l_ncf
-       jr      nc,1$
+       ld      c,a
+       exx
+       ld      a,b
+       exx
+       call    div_l_ncf
+       exx
+       ld      b,a
+       ld      a,c
+       exx
+       call    div_l_ncf
+       jr      nc,div_l_n_done
+       exx
         add     hl,de
-1$:    inc     hl                      ; get into range divisor+1..0
-       ld      d,b
+       exx
+       adc     hl,de
+div_l_n_done:
+       exx
         ld      e,a
+       ld      d,b
+       exx
+       ld      e,c
+       ld      d,b
         pop     bc
-       ex      de,hl
-       ret
-
-math_div_imm_ul:
-       call    math_imm_l0
-math_div_ul: ; de:hl', hl:de' = de:hl' % hl:de', de:hl' / hl:de', unsigned
-       ex      de,hl
-math_div_ul0: ; ; de:hl', hl:de' = hl:hl' % de:de', hl:hl' / de:de', unsigned
+inc_l:                                 ; hl:hl' += 1, clobbers a; get into range divisor+1..0
+       exx
+       inc     hl ; low word (hl'); 16-bit inc leaves the flags alone
+       ld      a,l
+       or      h ; z if the low word wrapped to 0
+       exx
+       ret     nz ; no carry into the high word
+       inc     hl ; propagate the carry into the high word
+       ret
+
+math_div_ul0: ; hl:hl', de:de' = hl:de' % de:hl', hl:de' / de:hl', unsigned
+       exx
+       ex      de,hl
+       exx
+math_div_ul: ; ; hl:hl', de:de' = hl:hl' % de:de', hl:hl' / de:de', unsigned
         push    bc
+       exx
+       ld      c,l
+       ld      b,h
+       ld      hl,0
+       exx
         ld      a,h
         ld      c,l
         ld      hl,0
@@ -1967,97 +1984,141 @@ div_l_pp: ; positive dividend, positive divisor
         ld      b,a
         ld      a,c
         call    div_lcf
-       jr      nc,1$
+       ld      c,a
+       exx
+       ld      a,b
+       exx
+       call    div_lcf
+       exx
+       ld      b,a
+       ld      a,c
+       exx
+       call    div_lcf
+       jr      nc,div_l_p_done
+       exx
         add     hl,de
-1$:    ld      d,b
+       exx
+       adc     hl,de
+div_l_p_done:
+       exx
         ld      e,a
+       ld      d,b
+       exx
+       ld      e,c
+       ld      d,b
         pop     bc
-       ex      de,hl
         ret
  
-; non-restoring division routine
-
-; de = divisor, hl:a = dividend with hl = previous remainder, a = next byte
-; enter at div0 with positive remainder in hl, such that hl < de
-; enter at div1 with negative remainder in hl, such that hl >= -de
-
-; div0/1 return a = 8-bit quotient as an odd number interpreted as -ff..ff,
-; by summing positive/negative place values, e.g. -80 +40 +20 -10 +8 -4 -2 +1
-
-; if entered at div0, there is a -80 and so quotient is in range -ff..-1
-; if entered at div1, there is a +80 and so quotient is in range 1..ff
-; falls out of loop after div01 with positive remainder, div11 with negative,
-; depending on this we should re-enter at div0 or div1, signalled by cf return
-
-; the successive quotient bytes can be concatenated into a full quotient,
-; but negative bytes require the next higher quotient byte to be decremented,
-; we know in advance if this will happen because the implied sign of the
-; quotient byte depends only on whether we entered at div0 or div1, hence,
-; before the div11 return we'll decrement to compensate for next negative byte
-
-; the decrement can also be seen as compensating for the extra add hl,de that
-; may be needed to make negative remainder positive before return to caller,
-; thus leaving quotient in a consistent state regardless of which exit taken,
-; remainder needs the add hl,de if cf=1 returned (equiv. return byte is even)
-
-; in the following code each sbc hl,de gets an inc a and each add hl,de gets
-; a dec a, guaranteeing the integrity of the division, the initial scf/rla is
-; needed to make the result 100 + -ff..ff or 1..1ff, so that the decrements
-; cannot borrow into the upcoming dividend bits also held in a, and there must
-; be another shift between the scf/rla and increment/decrement so that the scf
-; is implicitly in the 100s place, making the code awkward though it's correct
+dec_l: ; hl:hl' -= 1, clobbers a and flags
+       exx
+       ld      a,l
+       or      h ; z if the low word (hl') is 0, i.e. the dec below will borrow
+       dec     hl ; 16-bit dec leaves the flags alone
+       exx
+       ret     nz ; no borrow from the high word
+       dec     hl ; propagate the borrow into the high word
+       ret
  
-; now optimized to only inc/dec a when doing zero-crossing, fix above analysis
+; non-restoring division routine
+; see earlier comments for the word version, this extends the concept to long
  
  div_lcf:
         jr      c,div_l1
  div_l0: ; bit 0, above
         scf
         rla
+       exx
+       adc     hl,hl
+       exx
         adc     hl,hl
+       exx
+       sbc     hl,de
+       exx
         sbc     hl,de
         jr      nc,div_l01
         dec     a
  div_l11: ; bit 1, below
         add     a,a
+       exx
         adc     hl,hl
+       exx
+       adc     hl,hl
+       exx
         add     hl,de
+       exx
+       adc     hl,de
         jr      nc,div_l12
         inc     a
  div_l02: ; bit 2, above
         add     a,a
+       exx
+       adc     hl,hl
+       exx
         adc     hl,hl
+       exx
+       sbc     hl,de
+       exx
         sbc     hl,de
         jr      nc,div_l03
         dec     a
  div_l13: ; bit 3, below
         add     a,a
+       exx
         adc     hl,hl
+       exx
+       adc     hl,hl
+       exx
         add     hl,de
+       exx
+       adc     hl,de
         jr      nc,div_l14
         inc     a
  div_l04: ; bit 4, above
         add     a,a
+       exx
+       adc     hl,hl
+       exx
         adc     hl,hl
+       exx
+       sbc     hl,de
+       exx
         sbc     hl,de
         jr      nc,div_l05
         dec     a
  div_l15: ; bit 5, below
         add     a,a
+       exx
+       adc     hl,hl
+       exx
         adc     hl,hl
+       exx
         add     hl,de
+       exx
+       adc     hl,de
         jr      nc,div_l16
         inc     a
  div_l06: ; bit 6, above
         add     a,a
+       exx
         adc     hl,hl
+       exx
+       adc     hl,hl
+       exx
+       sbc     hl,de
+       exx
         sbc     hl,de
         jr      nc,div_l07
         dec     a
  div_l17: ; bit 7, below
         add     a,a
+       exx
+       adc     hl,hl
+       exx
         adc     hl,hl
+       exx
         add     hl,de
+       exx
+       adc     hl,de
         jr      nc,div_l18
         inc     a
  div_l08: ; done, above
@@ -2068,49 +2129,97 @@ div_l08: ; done, above
  
  div_l1: ; bit 0, below
         add     a,a
+       exx
+       adc     hl,hl
+       exx
         adc     hl,hl
+       exx
         add     hl,de
+       exx
+       adc     hl,de
         jr      nc,div_l11
         inc     a
  div_l01: ; bit 1, above
         add     a,a
+       exx
         adc     hl,hl
+       exx
+       adc     hl,hl
+       exx
+       sbc     hl,de
+       exx
         sbc     hl,de
         jr      nc,div_l02
         dec     a
  div_l12: ; bit 2, below
         add     a,a
+       exx
+       adc     hl,hl
+       exx
         adc     hl,hl
+       exx
         add     hl,de
+       exx
+       adc     hl,de
         jr      nc,div_l13
         inc     a
  div_l03: ; bit 3, above
         add     a,a
+       exx
+       adc     hl,hl
+       exx
         adc     hl,hl
+       exx
+       sbc     hl,de
+       exx
         sbc     hl,de
         jr      nc,div_l04
         dec     a
  div_l14: ; bit 4, below
         add     a,a
+       exx
         adc     hl,hl
+       exx
+       adc     hl,hl
+       exx
         add     hl,de
+       exx
+       adc     hl,de
         jr      nc,div_l15
         inc     a
  div_l05: ; bit 5, above
         add     a,a
+       exx
+       adc     hl,hl
+       exx
         adc     hl,hl
+       exx
+       sbc     hl,de
+       exx
         sbc     hl,de
         jr      nc,div_l06
         dec     a
  div_l16: ; bit 6, below
         add     a,a
+       exx
         adc     hl,hl
+       exx
+       adc     hl,hl
+       exx
         add     hl,de
+       exx
+       adc     hl,de
         jr      nc,div_l17
         inc     a
  div_l07: ; bit 7, above
         add     a,a
+       exx
         adc     hl,hl
+       exx
+       adc     hl,hl
+       exx
+       sbc     hl,de
+       exx
         sbc     hl,de
         jr      nc,div_l08
         dec     a
@@ -2121,65 +2230,110 @@ div_l18: ; done, below
         scf
         ret
  
-; divn0/1 are the same as div0/1 but carry reversed after add/subtract divisor
-; this is for negative divisors where we expect carry (means no zero crossing)
-
-; when divisor negated, remainder also negated, so we expect to do subtraction
-; when remainder negative and vice versa, need to clear carry after add hl,hl
+; version for negative divisors
+; see earlier comments for the word version, this extends the concept to long
  
  div_l_ncf:
         jr      c,div_l_n1
  div_l_n0: ; bit 0, above
         scf
         rla
+       exx
+       adc     hl,hl
+       exx
         adc     hl,hl
         or      a
+       exx
+       sbc     hl,de
+       exx
         sbc     hl,de
         jr      c,div_l_n01
         dec     a
  div_l_n11: ; bit 1, below
         add     a,a
+       exx
+       adc     hl,hl
+       exx
         adc     hl,hl
+       exx
         add     hl,de
+       exx
+       adc     hl,de
         jr      c,div_l_n12
         inc     a
  div_l_n02: ; bit 2, above
         add     a,a
+       exx
+       adc     hl,hl
+       exx
         adc     hl,hl
         or      a
+       exx
+       sbc     hl,de
+       exx
         sbc     hl,de
         jr      c,div_l_n03
         dec     a
  div_l_n13: ; bit 3, below
         add     a,a
+       exx
         adc     hl,hl
+       exx
+       adc     hl,hl
+       exx
         add     hl,de
+       exx
+       adc     hl,de
         jr      c,div_l_n14
         inc     a
  div_l_n04: ; bit 4, above
         add     a,a
+       exx
+       adc     hl,hl
+       exx
         adc     hl,hl
         or      a
+       exx
+       sbc     hl,de
+       exx
         sbc     hl,de
         jr      c,div_l_n05
         dec     a
  div_l_n15: ; bit 5, below
         add     a,a
+       exx
+       adc     hl,hl
+       exx
         adc     hl,hl
+       exx
         add     hl,de
+       exx
+       adc     hl,de
         jr      c,div_l_n16
         inc     a
  div_l_n06: ; bit 6, above
         add     a,a
+       exx
+       adc     hl,hl
+       exx
         adc     hl,hl
         or      a
+       exx
+       sbc     hl,de
+       exx
         sbc     hl,de
         jr      c,div_l_n07
         dec     a
  div_l_n17: ; bit 7, below
         add     a,a
+       exx
+       adc     hl,hl
+       exx
         adc     hl,hl
+       exx
         add     hl,de
+       exx
+       adc     hl,de
         jr      c,div_l_n18
         inc     a
  div_l_n08: ; done, above
@@ -2190,54 +2344,102 @@ div_l_n08: ; done, above
  
  div_l_n1: ; bit 0, below
         add     a,a
+       exx
+       adc     hl,hl
+       exx
         adc     hl,hl
+       exx
         add     hl,de
+       exx
+       adc     hl,de
         jr      c,div_l_n11
         inc     a
  div_l_n01: ; bit 1, above
         add     a,a
+       exx
         adc     hl,hl
+       exx
+       adc     hl,hl
+       exx
         or      a
         sbc     hl,de
+       exx
+       sbc     hl,de
         jr      c,div_l_n02
         dec     a
  div_l_n12: ; bit 2, below
         add     a,a
+       exx
         adc     hl,hl
+       exx
+       adc     hl,hl
+       exx
         add     hl,de
+       exx
+       adc     hl,de
         jr      c,div_l_n13
         inc     a
  div_l_n03: ; bit 3, above
         add     a,a
+       exx
         adc     hl,hl
+       exx
+       adc     hl,hl
+       exx
         or      a
         sbc     hl,de
+       exx
+       sbc     hl,de
         jr      c,div_l_n04
         dec     a
  div_l_n14: ; bit 4, below
         add     a,a
+       exx
+       adc     hl,hl
+       exx
         adc     hl,hl
+       exx
         add     hl,de
+       exx
+       adc     hl,de
         jr      c,div_l_n15
         inc     a
  div_l_n05: ; bit 5, above
         add     a,a
+       exx
         adc     hl,hl
+       exx
+       adc     hl,hl
+       exx
         or      a
         sbc     hl,de
+       exx
+       sbc     hl,de
         jr      c,div_l_n06
         dec     a
  div_l_n16: ; bit 6, below
         add     a,a
+       exx
         adc     hl,hl
+       exx
+       adc     hl,hl
+       exx
         add     hl,de
+       exx
+       adc     hl,de
         jr      c,div_l_n17
         inc     a
  div_l_n07: ; bit 7, above
         add     a,a
+       exx
+       adc     hl,hl
+       exx
         adc     hl,hl
+       exx
         or      a
         sbc     hl,de
+       exx
+       sbc     hl,de
         jr      c,div_l_n08
         dec     a
  div_l_n18: ; done, below
@@ -2247,7 +2449,6 @@ div_l_n18: ; done, below
         scf
         ret
  
-
  ; debugging
  
  print_trace: ; print af, bc, de, hl, sp, (sp)
@@ -2348,9 +2549,9 @@ start:
         ld      hl,print_trace
         ld      (page0_trace + 1),hl
  
-       ld      hl,rst8
-       ld      de,8
-       ld      bc,7
+       ld      hl,restarts
+       ld      de,0x28
+       ld      bc,restarts_end - restarts
         ldir
  
         ld      bc,sm_start
@@ -2362,7 +2563,35 @@ sm_start:
         .db     <page0_esc
         jp      0
  
-rst8:  ; immediate to hl
+restarts: ; image of the rst handlers, start copies restarts..restarts_end to 0x28
+       ; rst 0x28, immediate to hl
+       ld      a,(bc)
+       inc     bc
+       ld      l,a
+       ld      a,(bc)
+       inc     bc
+       ld      h,a
+       ret
+       .db     0 ; pad the 7-byte routine to 8 so the next one lands on 0x30
+       ; rst 0x30, pop hl:de'
+       pop     hl ; our own return address
+       exx
+       pop     de ; de' = word on top of the stack
+       exx
+       ex      (sp),hl ; hl = next stacked word, return address goes back
+       ret
+       .db     0,0 ; pad the 6-byte routine to 8 so the next one lands on 0x38
+       ; rst 0x38, immediate to hl:de'
+       ld      a,(bc)
+       inc     bc
+       exx
+       ld      e,a ; first immediate byte to e'
+       exx
+       ld      a,(bc)
+       inc     bc
+       exx
+       ld      d,a ; second immediate byte to d'
+       exx
         ld      a,(bc)
         inc     bc
         ld      l,a
@@ -2370,6 +2599,7 @@ rst8:     ; immediate to hl
         inc     bc
         ld      h,a
         ret
+restarts_end: ; end marker, used by start to size the ldir copy
  
  sm_main:
         ; create stack frame
author	Nick Downing <nick@ndcode.org>
	Tue, 18 Jun 2019 14:23:09 +0000 (00:23 +1000)
committer	Nick Downing <nick@ndcode.org>
	Tue, 18 Jun 2019 14:24:13 +0000 (00:24 +1000)