Correct unsigned 16x16 to 32 bit multiply, need to add corrections for signed version

[stack_machine.git] / sm3.asm
diff --git a/sm3.asm b/sm3.asm

index 204cf2c..986ff44 100644 (file)
--- a/sm3.asm
+++ b/sm3.asm
@@ -1,11 +1,111 @@
-page0  =       2
-page1  =       4
-page2  =       6
+page0  =       3
+page1  =       5
+page2  =       7
  
         .area   SM (abs,ovr)
  
         .org    0x100
  
+       ld      hl,0x1234       ; test harness: each group loads hl and de, multiplies, prints hl:de
+       ld      de,0x56
+       call    math_mul_uw0    ; group 1: unsigned entry point, four operand pairs
+       call    print_hlde
+
+       ld      hl,0x6543
+       ld      de,0x21
+       call    math_mul_uw0
+       call    print_hlde
+
+       ld      hl,0xb975       ; bit 15 set
+       ld      de,0x31
+       call    math_mul_uw0
+       call    print_hlde
+
+       ld      hl,0xdb97       ; bit 15 set
+       ld      de,0x531
+       call    math_mul_uw0
+       call    print_hlde
+
+       ld      hl,0x1234       ; group 2: same four operand pairs through the signed entry point
+       ld      de,0x56
+       call    math_mul_sw0
+       call    print_hlde
+
+       ld      hl,0x6543
+       ld      de,0x21
+       call    math_mul_sw0
+       call    print_hlde
+
+       ld      hl,0xb975       ; bit 15 set, so negative when read as a signed word
+       ld      de,0x31
+       call    math_mul_sw0
+       call    print_hlde
+
+       ld      hl,0xdb97       ; bit 15 set, so negative when read as a signed word
+       ld      de,0x531
+       call    math_mul_sw0
+       call    print_hlde
+
+       ld      hl,-0x1234      ; group 3: hl operand negated, de operand as before
+       ld      de,0x56
+       call    math_mul_sw0
+       call    print_hlde
+
+       ld      hl,-0x6543
+       ld      de,0x21
+       call    math_mul_sw0
+       call    print_hlde
+
+       ld      hl,-0xb975      ; NOTE(review): 0x468b if the assembler wraps to 16 bits -- confirm
+       ld      de,0x31
+       call    math_mul_sw0
+       call    print_hlde
+
+       ld      hl,-0xdb97      ; NOTE(review): 0x2469 if the assembler wraps to 16 bits -- confirm
+       ld      de,0x531
+       call    math_mul_sw0
+       call    print_hlde
+
+       ld      hl,0x1234       ; group 4: de operand negated, hl operand as in group 2
+       ld      de,-0x56
+       call    math_mul_sw0
+       call    print_hlde
+
+       ld      hl,0x6543
+       ld      de,-0x21
+       call    math_mul_sw0
+       call    print_hlde
+
+       ld      hl,0xb975       ; bit 15 set
+       ld      de,-0x31
+       call    math_mul_sw0
+       call    print_hlde
+
+       ld      hl,0xdb97       ; bit 15 set
+       ld      de,-0x531
+       call    math_mul_sw0
+       call    print_hlde
+
+       ld      hl,-0x1234      ; group 5: both operands negated
+       ld      de,-0x56
+       call    math_mul_sw0
+       call    print_hlde
+
+       ld      hl,-0x6543
+       ld      de,-0x21
+       call    math_mul_sw0
+       call    print_hlde
+
+       ld      hl,-0xb975
+       ld      de,-0x31
+       call    math_mul_sw0
+       call    print_hlde
+
+       ld      hl,-0xdb97
+       ld      de,-0x531
+       call    math_mul_sw0
+       call    print_hlde      ; last test; execution then continues into the code that follows
+
         ld      hl,restarts
         ld      de,0x28
         ld      bc,restarts_end - restarts
@@ -402,7 +502,7 @@ page1_imm_mul_uw:
  page1_mul_w:
         pop     hl
         push    bc
-       call    math_mul_w0
+       call    math_smul_w0
         pop     bc
         jr      mul_w_done
  
@@ -412,7 +512,7 @@ page1_imm_divrev_sw:
  page1_div_sw:
         pop     hl      
         push    bc
-       call    math_div_sw0
+       call    math_sdiv_sw0
         jr      div_w_done
  
  page1_imm_div_sw:
@@ -421,7 +521,7 @@ page1_imm_div_sw:
  page1_divrev_sw:
         pop     hl
         push    bc
-       call    math_div_sw1
+       call    math_sdiv_sw
         jr      div_w_done
  
  page1_imm_divrev_uw:
@@ -430,7 +530,7 @@ page1_imm_divrev_uw:
  page1_div_uw:
         pop     hl
         push    bc
-       call    math_div_uw0
+       call    math_sdiv_uw0
         jr      div_w_done
  
  page1_imm_div_uw:
@@ -439,7 +539,7 @@ page1_imm_div_uw:
  page1_divrev_uw:
         pop     hl
         push    bc
-       call    math_div_uw1
+       call    math_sdiv_uw
         jr      div_w_done
  
  ; page 1 to 2
@@ -723,7 +823,7 @@ page2_mul_l:
  mul_l_entry:
         ex      de,hl
         push    bc
-       call    math_mul_l0
+       call    math_smul_l0
         pop     bc
         jr      mul_l_done
  
@@ -1640,17 +1740,32 @@ sr_l_entry:
         exx
         ret
  
-math_mul_w0: ; hl *= de
+; smul: short multiplication
+; 16 * 16 to 16 bit product (word)
+; 32 * 32 to 32 bit product (long)
+; mul: long multiplication
+; 16 + 16 * 16 to 32 bit product (word)
+; 32 + 32 * 32 to 64 bit product (long)
+
+; smul is implemented in a more efficient way that uses only left shifts
+; (left shifts are cheaper on the z80). This works for smul because there
+; is no need to worry about propagating carries into a high result word.
+
+; mul has the ability to initialize the product with some nonzero value,
+; which smul doesn't have because it only shifts zeros in from the left,
+; using this ability the long multiplication reverses the long division
+; (initialize product with a remainder, then add in quotient * divisor)
+
+math_smul_w0: ; hl *= de
         ld      c,l
         ld      b,h
-       ld      hl,0
-math_mul_w: ; hl += bc * de
+math_smul_w: ; hl = bc * de
         ld      a,d
-       call    mul_w0
+       call    smul_w0
         ld      a,e
-mul_w: ; bit 0
+smul_w:        ; bit 0
         add     hl,hl
-mul_w0:        rla
+smul_w0:       rla
         jr      nc,1$
         add     hl,bc
  1$:    ; bit 1
@@ -1690,33 +1805,31 @@ mul_w0: rla
         add     hl,bc
         ret
  
-math_mul_l0: ; hl':hl *= de':de
+math_smul_l0: ; hl':hl *= de':de
         ld      c,l
         ld      b,h
-       ld      hl,0
         exx
         ld      c,l
         ld      b,h
-       ld      hl,0
         exx
-math_mul_l: ; hl':hl += de':de * bc':bc
+math_smul_l: ; hl':hl = de':de * bc':bc
         exx
         ld      a,d
         exx
-       call    mul_l0
+       call    smul_l0
         exx
         ld      a,e
         exx
-       call    mul_l
+       call    smul_l
         ld      a,d
-       call    mul_l
+       call    smul_l
         ld      a,e
-mul_l: ; bit 0
+smul_l:        ; bit 0
         add     hl,hl
         exx
         adc     hl,hl
         exx
-mul_l0:        rla
+smul_l0:       rla
         jr      nc,1$
         add     hl,bc
         exx
@@ -1801,9 +1914,83 @@ mul_l0:  rla
         exx
         ret
  
-math_div_sw0: ; hl, de = hl % de, hl / de, signed
+; mul (word): 16 * 16 to 32 bit long multiplication, signed and unsigned
+;
+; in:  math_mul_uw0, math_mul_sw0: hl = multiplicand, de = multiplier
+;      math_mul_uw,  math_mul_sw:  bc = multiplicand, de = multiplier,
+;                                  hl = 16 bit value added into the product
+; out: hl:de = 32 bit result, hl = high word, de = low word
+;      a and flags are clobbered; the "0" entries also leave bc = multiplicand
+;
+; The signed entries run the unsigned multiply and then correct the high word.
+; Reading a negative 16 bit operand as unsigned makes it 65536 too large, so
+; the unsigned product is too large by 65536 * (the other operand); the other
+; operand is therefore subtracted from the high word once per negative operand.
+; NOTE(review): the value passed to math_mul_sw in hl is still zero extended,
+; exactly as in math_mul_uw -- confirm whether callers need it sign extended.
+
+math_mul_sw0: ; hl:de = hl * de, signed
+       ld      c,l
+       ld      b,h
+       ld      hl,0
+math_mul_sw: ; hl:de = hl + bc * de, signed
+       push    de              ; keep the multiplier, math_mul_uw overwrites de
+       call    math_mul_uw     ; hl:de = unsigned result, bc is left intact
+       ex      (sp),hl         ; hl = multiplier, (sp) = high word
+       ex      de,hl           ; de = multiplier, hl = low word
+       ex      (sp),hl         ; hl = high word, (sp) = low word
+       bit     7,d
+       jr      z,1$
+       or      a               ; clear carry, bit does not touch it
+       sbc     hl,bc           ; multiplier negative: high word -= multiplicand
+1$:    bit     7,b
+       jr      z,2$
+       or      a               ; clear carry
+       sbc     hl,de           ; multiplicand negative: high word -= multiplier
+2$:    pop     de              ; de = low word
+       ret
+
+math_mul_uw0: ; hl:de = hl * de, unsigned
+       ld      c,l
+       ld      b,h
+       ld      hl,0
+math_mul_uw: ; hl:de = hl + bc * de, unsigned
+       ld      a,e             ; low multiplier byte first
+       call    mul_uw
+       ld      e,a             ; low byte of the result
+       ld      a,d             ; then the high multiplier byte
+       call    mul_uw
+       ld      d,a             ; second byte of the result
+       ret
+
+; mul_uw: one byte (8 multiplier bits) of shift-right-and-add multiplication
+; in:  a = multiplier byte, hl = running high word, bc = multiplicand
+; out: hl = running high word after 8 steps, a = the 8 bits shifted out of hl
+; Each step: the next multiplier bit arrives in carry; if it is set bc is added
+; to hl, and the carry out of that add is shifted back in by rr h / rr l. The
+; bit that falls out of l is rotated into the top of a by the same rra that
+; fetches the next multiplier bit. The carry present on entry passes through
+; a and is back in carry on return, so it never reaches the result.
+mul_uw: rra
+       ; bit 0
+       jr      nc,1$
+       add     hl,bc
+1$:    rr      h
+       rr      l
+       rra
+       ; bit 1
+       jr      nc,2$
+       add     hl,bc
+2$:    rr      h
+       rr      l
+       rra
+       ; bit 2
+       jr      nc,3$
+       add     hl,bc
+3$:    rr      h
+       rr      l
+       rra
+       ; bit 3
+       jr      nc,4$
+       add     hl,bc
+4$:    rr      h
+       rr      l
+       rra
+       ; bit 4
+       jr      nc,5$
+       add     hl,bc
+5$:    rr      h
+       rr      l
+       rra
+       ; bit 5
+       jr      nc,6$
+       add     hl,bc
+6$:    rr      h
+       rr      l
+       rra
+       ; bit 6
+       jr      nc,7$
+       add     hl,bc
+7$:    rr      h
+       rr      l
+       rra
+       ; bit 7
+       jr      nc,8$
+       add     hl,bc
+8$:    rr      h
+       rr      l
+       rra
+       ret
+
+; sdiv: short division
+; 16 / 16 to 16 bit quotient, 16 bit remainder (word)
+; 32 / 32 to 32 bit quotient, 32 bit remainder (long)
+; div: long division
+; 32 / 16 to 16 bit quotient, 16 bit remainder (word)
+; 64 / 32 to 32 bit quotient, 32 bit remainder (long)
+
+; sdiv is implemented as sign/zero extension then div
+
+math_sdiv_sw0: ; hl, de = hl % de, hl / de, signed
         ex      de,hl
-math_div_sw1: ; hl, de = de % hl, de / hl, signed
+math_sdiv_sw: ; hl, de = de % hl, de / hl, signed
         ld      c,l
         ld      b,h
         ld      a,d
@@ -1862,9 +2049,9 @@ div_w_nn: ; negative dividend, negative divisor
         add     hl,bc
         ret
  
-math_div_uw0: ; hl, de = hl % de, hl / de, unsigned
+math_sdiv_uw0: ; hl, de = hl % de, hl / de, unsigned
         ex      de,hl
-math_div_uw1: ; hl, de = de % hl, de / hl, unsigned
+math_sdiv_uw: ; hl, de = de % hl, de / hl, unsigned
         ld      c,l
         ld      b,h
         ld      hl,0
@@ -2764,6 +2951,18 @@ div_l_n18: ; done, below
  
  ; debugging
  
+print_hlde:                    ; debug helper: emits hl, ':', de, then CR LF
+       call    print_word      ; defined elsewhere; presumably prints the word in hl -- confirm
+       ld      a,':            ; separator between the two words
+       call    print_char      ; defined elsewhere; presumably prints the character in a -- confirm
+       ex      de,hl           ; move the de value into hl for the second print_word
+       call    print_word
+       ex      de,hl           ; swap back so hl and de are in their original registers
+       ld      a,0xd           ; carriage return
+       call    print_char
+       ld      a,0xa           ; line feed
+       jp      print_char      ; tail call: print_char's ret returns to our caller
+
  print_trace: ; print af, bc, hl':de, de':hl, (sp+2):(sp), sp
         call    print_trace2
         ld      a,(bc)