206 ld bc,restarts_end - restarts
219 ; rst 0x28, immediate to hl
228 ; rst 0x30, pop hl:de'
237 ; rst 0x38, immediate to de':hl
258 ; stack and control transfer
260 .org page0 * 0x100 - 18
283 ; page 0 administrative
391 ; word arithmetic operations
392 ; top stack word cached in de
394 .org page1 * 0x100 - 13
412 ; page 1 administrative
538 ; use addition for page1_imm_sub_w
552 jr add_w_done ;mul_w_done
560 jr add_w_done ;mul_w_done
708 ; long arithmetic operations
709 ; top stack long cached in de:hl'
711 .org page2 * 0x100 - 13
729 ; page 2 administrative
839 add_l_entry: ; optimize this
856 sub_l_entry: ; optimize this
865 ; use addition for page2_imm_sub_l
1030 ; lightweight routines
1032 ; these can be duplicated without much cost, and will be called often, so the
1033 ; calling convention is geared to what the interpreter needs (de:hl' and so on)
1035 math_imm_w: ; immediate to de
1049 math_imm_l: ; immediate to hl':de
1073 math_stkld_w: ; sp(imm_w) to de
1081 ; use inline code for math_ld_w
1092 math_stkld_l: ; sp(imm_w) to de:hl'
1100 math_ld_l: ; (hl) to hl':de
1121 math_stkst_w: ; de to sp(imm_w)
1139 math_stkst_l: ; de:hl' to sp(imm_w)
1147 math_st_l: ; hl':de to (hl)
1185 math_and_imm_w: ; de &= imm_w
1201 math_and_imm_l: ; hl':de &= imm_l
1229 math_and_w: ; de &= hl
1248 math_and_l: ; hl':de &= de':hl
1270 math_or_imm_w: ; de |= imm_w
1286 math_or_imm_l: ; hl':de |= imm_l
1314 math_or_w: ; de |= hl
1333 math_or_l: ; hl':de |= de':hl
1355 math_xor_imm_w: ; de ^= imm_w
1371 math_xor_imm_l: ; hl':de ^= imm_l
1399 math_xor_w: ; de ^= hl
1418 math_xor_l: ; hl':de ^= de':hl
1440 math_cmp_sw: ; cf=1 de < hl, zf=1 de == hl, signed
1442 math_cmprev_sw: ; cf=1 hl < de, zf=1 hl == de, signed
1457 ; use inline code for math_cmp_uw, math_cmprev_uw
1464 math_cmp_sl: ; cf=1 hl':de < de':hl, zf=1 hl':de == de':hl, signed
1487 math_cmp_ul: ; cf=1 hl':de < de':hl, zf=1 hl':de == de':hl, unsigned
1493 jr nz,cmp_l_dispatch
1510 math_cmprev_sl: ; cf=1 de':hl < hl':de, zf=1 de':hl == hl':de, signed
1534 math_cmprev_ul: ; cf=1 de':hl < hl':de, zf=1 de':hl == hl':de, unsigned
1541 jr nz,cmprev_l_dispatch
1552 ; heavyweight routines
1554 ; these have unrolled loops and so there needs to be as much reuse as possible
1556 ; for the multiply and divide the unrolled loops are placed in subroutines, so
1557 ; there is some overhead to use them, but it allows the calling code to itself
1558 ; be cheaply unrolled, so the amount left to do is determined by context not
1559 ; by a counter; the tradeoff is do we count loops by push bc/pop bc/djnz or by
1560 ; call/call/call, since both need a stack push/pop it should be about the same
1562 ; for these routines the calling convention is geared to whatever the routine
1563 ; needs to work most efficiently, this makes the usage more cumbersome as you
1564 ; have to do something like ex de,hl/call/ex de,hl but it is done this way because
1565 ; otherwise some callsites need to exchange registers into place on one side
1566 ; of a call/ret boundary only for them to be immediately exchanged back again
1568 ; as an exception to this we may provide an earlier entry point(s), before the
1569 ; routine proper, which exchanges arguments into place (the caller must still
1570 ; deal with exchanges afterwards), we do this because of callsites that are in
1571 ; the interpreter pages and need to conserve code size, therefore the prefix
1572 ; chosen need not occur more than once, and we'll use the longest such prefix
1574 ; while some of these earlier entry points have a meaning, e.g. math_divrev_l
1575 ; before math_div_l, many of them do not, so we will just number them instead
1576 ; (this is because the prefix code can be a compromise between callers' needs)
1578 math_sl_w0: ; hl = de << (l & 0xf)
1580 math_sl_w: ; hl <<= e & 0xf
1604 math_sl_l0: ; de':hl <<= e & 0x1f
1608 math_sl_l: ; hl':hl <<= e & 0x1f
1666 math_sr_uw0: ; hl = de >> (l & 0xf), logical
1668 math_sr_uw: ; hl >>= e & 0xf, logical
1672 math_sr_uw1: ; hl >>= a - 7, immediate a in [7, 0x17), logical
1677 math_sr_sw0: ; hl = de >> (l & 0xf), arithmetic
1679 math_sr_sw: ; hl >>= e & 0xf, arithmetic
1683 math_sr_sw1: ; hl >>= a - 7, immediate a in [7, 0x17), arithmetic
1718 4$: ; by 16 (can't occur simultaneously with by 8)
1727 math_sr_ul0: ; hl:de' >>= e & 0x1f, logical
1731 math_sr_ul: ; hl:hl' >>= e & 0x1f, logical
1735 math_sr_ul1: ; hl:hl' >>= a - 7, immediate a in [7, 0x27), logical
1740 math_sr_sl0: ; de':hl >>= e & 0x1f, arithmetic
1744 math_sr_sl: ; hl':hl >>= e & 0x1f, arithmetic
1748 math_sr_sl1: ; hl':hl >>= a - 7, immediate a in [7, 0x27), arithmetic
1825 5$: ; by 32 (can't occur simultaneously with by 16)
1838 ; smul: short multiplication
1839 ; 16 * 16 to 16 bit product (word)
1840 ; 32 * 32 to 32 bit product (long)
1841 ; mul: long multiplication
1842 ; 16 + 16 * 16 to 32 bit product (word)
1843 ; 32 + 32 * 32 to 64 bit product (long)
1845 ; smul is implemented in a more optimal way that uses only left shifts,
1846 ; since left shifts are cheaper on the z80, this works for smul because
1847 ; there is no need to worry about propagating carries into high result
1849 ; mul has the ability to initialize the product with some nonzero value,
1850 ; which smul doesn't have because it only shifts zeros in from the left,
1851 ; using this ability the long multiplication reverses the long division
1852 ; (initialize product with a remainder, then add in quotient * divisor)
1854 math_smul_w0: ; hl *= de
1857 math_smul_w: ; hl = bc * de
1903 math_smul_l0: ; hl':hl *= de':de
1910 math_smul_l: ; hl':hl = de':de * bc':bc
2012 math_mul_sw0: ; hl:de = hl * de, signed
2016 math_mul_sw: ; hl:de = hl + bc * de, signed
2018 rla ; cf will be preserved through to the last rra below
2023 call mul_uw1 ; do only 7 bits, get sign of d into cf
2038 math_mul_uw0: ; hl:de = hl * de, unsigned
2042 math_mul_uw: ; hl:de = hl + bc * de, unsigned
2103 math_mul_sl0: ; hl':hl:de':de = hl':hl * de':de, signed
2114 exx ; hard to optimize this
2115 math_mul_sl: ; hl':hl:de':de = hl':hl + bc':bc * de':de, signed
2118 rla ; cf will be preserved through to the last rra below
2134 call mul_ul1 ; do only 7 bits, get sign of d into cf
2140 exx ; hard to optimize this
2153 exx ; hard to optimize this
2164 math_mul_ul0: ; hl':hl:de':de = hl':hl * de':de, unsigned
2176 math_mul_ul: ; hl':hl:de':de = hl':hl + bc':bc * de':de, unsigned
2305 ; sdiv: short division
2306 ; 16 / 16 to 16 bit quotient, 16 bit remainder (word)
2307 ; 32 / 32 to 32 bit quotient, 32 bit remainder (long)
2308 ; div: long division
2309 ; 32 / 16 to 16 bit quotient, 16 bit remainder (word)
2310 ; 64 / 32 to 32 bit quotient, 32 bit remainder (long)
2312 ; sdiv is implemented as sign/zero extension then div
2314 math_sdiv_sw0: ; hl, de = hl % de, hl / de, signed
2316 math_sdiv_sw: ; hl, de = de % hl, de / hl, signed
2324 math_div_sw: ; hl, de = hl:de % bc, hl:de / bc, signed
2329 jp m,div_w_n ; positive dividend
2333 jr nc,div_w_pp ; positive dividend, positive divisor
2335 ; positive dividend, negative divisor
2348 dec de ; reduces remainder by 1 (we inc later)
2350 jr c,div_w_nn ; negative dividend, negative divisor
2352 ; negative dividend, positive divisor
2359 inc hl ; get into range -divisor+1..0
2364 div_w_nn: ; negative dividend, negative divisor
2370 inc hl ; get into range divisor+1..0
2375 math_sdiv_uw0: ; hl, de = hl % de, hl / de, unsigned
2377 math_sdiv_uw: ; hl, de = de % hl, de / hl, unsigned
2381 math_div_uw: ; hl, de = hl:de % bc, hl:de / bc, unsigned
2383 div_w_pp: ; positive dividend, positive divisor
2393 ; non-restoring division routine
2395 ; de = divisor, hl:a = dividend with hl = previous remainder, a = next byte
2396 ; enter at div0 with positive remainder in hl, such that hl < de
2397 ; enter at div1 with negative remainder in hl, such that hl >= -de
2399 ; div0/1 return a = 8-bit quotient as an odd number interpreted as -ff..ff,
2400 ; by summing positive/negative place values, e.g. -80 +40 +20 -10 +8 -4 -2 +1
2402 ; if entered at div0, there is a -80 and so quotient is in range -ff..-1
2403 ; if entered at div1, there is a +80 and so quotient is in range 1..ff
2404 ; falls out of loop after div01 with positive remainder, div11 with negative,
2405 ; depending on this we should re-enter at div0 or div1, signalled by cf return
2407 ; the successive quotient bytes can be concatenated into a full quotient,
2408 ; but negative bytes require the next higher quotient byte to be decremented,
2409 ; we know in advance if this will happen because the implied sign of the
2410 ; quotient byte depends only on whether we entered at div0 or div1, hence,
2411 ; before the div11 return we'll decrement to compensate for next negative byte
2413 ; the decrement can also be seen as compensating for the extra add hl,de that
2414 ; may be needed to make negative remainder positive before return to caller,
2415 ; thus leaving quotient in a consistent state regardless of which exit taken,
2416 ; remainder needs the add hl,de if cf=1 returned (equiv. return byte is even)
2418 ; in the following code each sbc hl,de gets an inc a and each add hl,de gets
2419 ; a dec a, guaranteeing the integrity of the division, the initial scf/rla is
2420 ; needed to make the result 100 + -ff..ff or 1..1ff, so that the decrements
2421 ; cannot borrow into the upcoming dividend bits also held in a, and there must
2422 ; be another shift between the scf/rla and increment/decrement so that the scf
2423 ; is implicitly in the 100s place, making the code awkward though it's correct
2425 ; now optimized to only inc/dec a when doing zero-crossing, fix above analysis
2446 ;dec a ; compensation
2487 ;dec a ; compensation
2511 ;dec a ; compensation
2551 ;dec a ; compensation
2558 div_w0: ; bit 0, above
2565 div_w11: ; bit 1, below
2571 div_w02: ; bit 2, above
2577 div_w13: ; bit 3, below
2583 div_w04: ; bit 4, above
2589 div_w15: ; bit 5, below
2595 div_w06: ; bit 6, above
2601 div_w17: ; bit 7, below
2607 div_w08: ; done, above
2613 div_w1: ; bit 0, below
2619 div_w01: ; bit 1, above
2625 div_w12: ; bit 2, below
2631 div_w03: ; bit 3, above
2637 div_w14: ; bit 4, below
2643 div_w05: ; bit 5, above
2649 div_w16: ; bit 6, below
2655 div_w07: ; bit 7, above
2661 div_w18: ; done, below
2664 ;dec a ; compensation
2668 ; divn0/1 are the same as div0/1 but carry reversed after add/subtract divisor
2669 ; this is for negative divisors where we expect carry (means no zero crossing)
2671 ; when divisor negated, remainder also negated, so we expect to do subtraction
2672 ; when remainder negative and vice versa, need to clear carry after add hl,hl
2676 div_w_n0: ; bit 0, above
2684 div_w_n11: ; bit 1, below
2690 div_w_n02: ; bit 2, above
2697 div_w_n13: ; bit 3, below
2703 div_w_n04: ; bit 4, above
2710 div_w_n15: ; bit 5, below
2716 div_w_n06: ; bit 6, above
2723 div_w_n17: ; bit 7, below
2729 div_w_n08: ; done, above
2735 div_w_n1: ; bit 0, below
2741 div_w_n01: ; bit 1, above
2748 div_w_n12: ; bit 2, below
2754 div_w_n03: ; bit 3, above
2761 div_w_n14: ; bit 4, below
2767 div_w_n05: ; bit 5, above
2774 div_w_n16: ; bit 6, below
2780 div_w_n07: ; bit 7, above
2787 div_w_n18: ; done, below
2790 ;dec a ; compensation
2795 math_sdiv_sl0: ; hl':hl, de':de = hl':de % de':hl, hl':de / de':hl, signed
2799 math_sdiv_sl: ; hl':hl, de':de = de':de % hl':hl, de':de / hl':hl, signed
2815 ; hl':hl:de':de % bc':bc, hl':hl:de':de / bc':bc, signed
2819 jp m,div_l_n ; positive dividend
2826 jr nc,div_l_pp ; positive dividend, positive divisor
2828 ; positive dividend, negative divisor
2852 div_l_n: ; negative dividend
2856 dec de ; reduces remainder by 1 (we inc later)
2864 jr c,div_l_nn ; negative dividend, negative divisor
2866 ; negative dividend, positive divisor
2888 2$: inc hl ; get into range -divisor+1..0
2897 math_div_ul0: ; hl':hl, de':de = hl':de % de':hl, hl':de / de':hl, unsigned
2901 math_div_ul1: ; hl':hl, de':de = de':de % hl':hl, de':de / hl':hl, unsigned
2915 ; hl':hl:de':de % bc':bc, hl':hl:de':de / bc':bc, unsigned
2919 div_l_pp: ; positive dividend, positive divisor
2942 div_l_nn: ; negative dividend, negative divisor
2963 1$: inc hl ; get into range divisor+1..0
2972 ; non-restoring division routine
2973 ; see earlier comments for the word version, this extends the concept to long
3000 ;dec a ; compensation
3053 ;dec a ; compensation
3083 ;dec a ; compensation
3135 ;dec a ; compensation
3140 ; changed all jr to jp, revisit this
3144 div_l0: ; bit 0, above
3157 div_l11: ; bit 1, below
3169 div_l02: ; bit 2, above
3181 div_l13: ; bit 3, below
3193 div_l04: ; bit 4, above
3205 div_l15: ; bit 5, below
3217 div_l06: ; bit 6, above
3229 div_l17: ; bit 7, below
3241 div_l08: ; done, above
3247 div_l1: ; bit 0, below
3259 div_l01: ; bit 1, above
3271 div_l12: ; bit 2, below
3283 div_l03: ; bit 3, above
3295 div_l14: ; bit 4, below
3307 div_l05: ; bit 5, above
3319 div_l16: ; bit 6, below
3331 div_l07: ; bit 7, above
3343 div_l18: ; done, below
3346 ;dec a ; compensation
3350 ; version for negative divisors
3351 ; see earlier comments for the word version, this extends the concept to long
3355 div_l_n0: ; bit 0, above
3369 div_l_n11: ; bit 1, below
3381 div_l_n02: ; bit 2, above
3394 div_l_n13: ; bit 3, below
3406 div_l_n04: ; bit 4, above
3419 div_l_n15: ; bit 5, below
3431 div_l_n06: ; bit 6, above
3444 div_l_n17: ; bit 7, below
3456 div_l_n08: ; done, above
3462 div_l_n1: ; bit 0, below
3474 div_l_n01: ; bit 1, above
3487 div_l_n12: ; bit 2, below
3499 div_l_n03: ; bit 3, above
3512 div_l_n14: ; bit 4, below
3524 div_l_n05: ; bit 5, above
3537 div_l_n16: ; bit 6, below
3549 div_l_n07: ; bit 7, above
3562 div_l_n18: ; done, below
3565 ;dec a ; compensation
3604 print_trace: ; print af, bc, hl':de, de':hl, (sp+2):(sp), sp
3724 .ascii '0123456789abcdef'
3729 ; create stack frame
3737 ; push result pointer
3742 ; call sm_factorial(argument)
3758 .db <page1_imm_cmp_sw
3773 .db <page1_imm_add_w
3777 ; divide by place value
3780 ; replace current value with remainder
3784 ; print quotient plus '0
3786 .db <page1_imm_add_w
3796 .db <page1_imm_add_w
3822 ; enlarge stack frame
3830 ; push result pointer
3835 ; call sm_factorial(argument)
3851 .db <page1_imm_cmp_sw
3866 .db <page1_imm_add_w
3870 ; divide by place value
3873 ; replace current value with remainder
3877 ; print quotient plus '0
3879 .db <page1_imm_add_w
3884 .dw 4 ; cheating -- kill hi word of long too
3889 .db <page1_imm_add_w
3915 ; destroy stack frame
3923 .dw 10000,1000,100,10,1
3925 .dw 0xca00,0x3b9a ; 1000000000
3926 .dw 0xe100,0x5f5 ; 100000000
3927 .dw 0x9680,0x98 ; 10000000
3928 .dw 0x4240,0xf ; 1000000
3929 .dw 0x86a0,1 ; 100000
3942 .db <page1_imm_cmp_sw
3947 ; no, set up for *result =
3957 .db <page1_imm_add_w
3960 ; push result pointer
3965 ; call sm_factorial(argument - 1)
3978 ; set *result = sm_factorial(argument - 1) * argument
3985 ; yes, set up for *result =
4004 .db <page2_imm_cmp_sl
4010 ; no, set up for *result =
4020 .db <page2_imm_add_l
4023 ; push result pointer
4028 ; call sm_factorial(argument - 1)
4041 ; set *result = sm_factorial(argument - 1) * argument
4048 ; yes, set up for *result =