LIST
Makefile
compmodule
+em_end.s
+etext.s
+edata.s
end.s
libem_s.a
+READ_ME
sig.s
cms.s
gto.s
-ffp.s
+fp68881.s
fat.s
trp.s
dia.s
# $Header$
+# Builds the EM support library libem_o.a and the archive end.a, using the
+# $(MACH) driver to assemble and $(ASAR) to archive.
MACH=m68k2
-all: libem_o.a end.o
+ASAR=aal
+all: libem_o.a end.a
install: all
../../install libem_o.a tail_em
- ../../install end.o end_em
+ ../../install end.a end_em
cmp: all
-../../compare libem_o.a tail_em
- -../../compare end.o end_em
+ -../../compare end.a end_em
-end.o: end.s
+# end.a: assemble em_end.s, edata.s, etext.s and end.s, then archive the
+# four resulting objects.
+end.a: em_end.s etext.s edata.s end.s
+ $(MACH) -I../../../h -c em_end.s
+ $(MACH) -I../../../h -c edata.s
+ $(MACH) -I../../../h -c etext.s
$(MACH) -I../../../h -c end.s
+ $(ASAR) cr end.a em_end.o etext.o edata.o end.o
+# libem_o.a: ASAR is exported into the environment before march is run.
libem_o.a: libem_s.a
- ASAR=aal ; export ASAR ;\
+ ASAR=$(ASAR) ; export ASAR ;\
march . libem_o.a
clean:
- rm -f *.o libem_o.a
+ rm -f *.o libem_o.a end.a
+# NOTE(review): the recipe below invokes a literal "make"; $(MAKE) would pass
+# command-line flags through, if the make in use defines it -- confirm.
opr :
make pr | opr
pr:
@arch pv libem_s.a | pr -h `pwd`/libem_s.a
- @pr `pwd`/end.s
+ @pr `pwd`/em_end.s `pwd`/edata.s `pwd`/etext.s `pwd`/end.s
-The original EM library routines saved all registers
-(including scratch registers) in global data; hence they
-were not reentrant.
-The new routines do not save registers d0,d1,d2,a0 and a1.
-They are reentrant.
+The routines in mli.s, mlu.s, dvi.s, and dvu.s were written by
+Kai-Uwe Bloem and were published on the comp.os.minix newsgroup.
+He allowed us to use them for ACK, but requested that
+they not fall under the ACK copyright notice. So, they don't.
.sect .bss
! signed long divide
+ !-----------------------------------------------------------------------------
+ ! rewritten by Kai-Uwe Bloem (i5110401@dbstu1.bitnet) for speed.
+ ! #1 01/12/90 initial revision. Minor reduce of shift operations.
+ ! #2 03/07/90 use 68000 divu instruction wherever possible. This change
+ ! makes #1 superfluous. (derived from my GNU division routine)
+ !-----------------------------------------------------------------------------
+ ! Some common cases can be handled in a special, much faster way :
+ ! 1) divisor = 0
+ ! => cause trap, then return to user. Result is undefined
+ ! 2) dividend < divisor
+ ! => quotient = 0, remainder = dividend
+ ! 3) divisor < 0x10000 ( i.e. divisor is only 16 bits wide )
+ ! => quotient and remainder can be calculated quite fast by repeated
+ ! application of 68000 divu operations (ca. 400 cycles)
+ ! 4) otherwise (due to #2, #3 dividend, divisor both wider than 16 bits)
+ ! => do slow division by shift and subtract
+ !-----------------------------------------------------------------------------
+
+
+ ! register usage:
+ ! entry : return address, divisor and dividend on the stack (in that
+ ! order); popped into a1, d0 and d2
+ ! exit : d1 quotient
+ ! d2 remainder
+
.sect .text
+! .dvi: signed 32-bit divide. Pops the return address, then the divisor,
+! then the dividend from the stack; leaves the quotient in d1 and the
+! remainder in d2. Both operands are made non-negative first; d4 remembers
+! the signs (negative low word: negate the quotient, negative long: negate
+! the remainder). d3 is preserved via a0, d4 via the stack.
.dvi:
- move.l (sp)+,a0 ! return address
- move.l (sp)+,d0
- move.l (sp)+,d1
- move.l d3,-(sp) ! save d3 and d4
- move.l d4,-(sp)
+ move.l (sp)+,a1 ! return address
+ move.l (sp)+,d0 ! divisor
+ move.l (sp)+,d2 ! dividend
+ move.l d3,a0 ! save d3
+ move.l d4,-(sp) ! save result sign register
clr.l d4
- tst.l d0 ! divisor
- bpl 1f
- neg.l d0
- not d4
-1:
- tst.l d1 ! dividend
- bpl 2f
- neg.l d1
- not d4
- swap d4
- not d4
- swap d4
+ tst.l d2
+ bpl 0f ! dividend is negative ?
+ neg.l d2 ! yes - negate
+ not.l d4 ! and note negation in d4
+0:
+ tst.l d0
+ bpl 0f ! divisor is negative ?
+ neg.l d0 ! yes - negate
+ not.w d4 ! note negation
+0:
+ clr.l d1 ! prepare quotient
+! === case 1: divisor = 0
+ tst.l d0 ! divisor = 0 ?
+ beq 9f ! yes - divide by zero trap
+! === case 2: dividend < divisor
+ cmp.l d0,d2 ! dividend < divisor ?
+ bcs 8f ! yes - division already finished
+! === case 3: divisor <= 0x0ffff
+ cmp.l #0x0ffff,d0 ! is divisor only 16 bits wide ?
+ bhi 2f ! no - use the shift-and-subtract loop
+ move.w d2,d3 ! save dividend.l
+ clr.w d2 ! prepare dividend.h for divu operation
+ swap d2
+ beq 0f ! dividend.h is all zero, no divu necessary
+ divu d0,d2
+0: move.w d2,d1 ! save quotient.h
+ swap d1 ! move it to the upper word of the quotient
+ move.w d3,d2 ! divide dividend.l
+ divu d0,d2 ! (d2.h = remainder of prev divu)
+ move.w d2,d1 ! save quotient.l
+ clr.w d2 ! get remainder
+ swap d2
+ bra 8f
+! === case 4: divisor and dividend both > 0x0ffff
2:
- move.l d1,-(sp)
- move.l d0,-(sp)
- jsr .dvu
- tst d4
- beq 5f
- neg.l d1 ! quotient
+ move #32-1,d3 ! loop count
+4:
+ lsl.l #1,d2 ! shift dividend ...
+ roxl.l #1,d1 ! ... into d1
+ cmp.l d0,d1 ! compare with divisor
+ bcs 5f
+ sub.l d0,d1 ! bigger, subtract divisor
+ add #1,d2 ! note subtraction in result
5:
- tst.l d4
- bpl 6f
- neg.l d2 ! remainder
-6:
- move.l (sp)+,d4 ! restore d4 and d3
- move.l (sp)+,d3
- jmp (a0)
+ dbra d3,4b
+ exg d1,d2 ! get results in the correct registers
+8:
+ tst.w d4 ! quotient < 0 ?
+ bpl 0f
+ neg.l d1 ! yes - negate
+0: tst.l d4 ! remainder < 0 ?
+ bpl 0f
+ neg.l d2
+0: move.l (sp)+,d4 ! restore d4
+ move.l a0,d3 ! restore d3
+ jmp (a1)
+
+EIDIVZ = 6
+9: move.w #EIDIVZ,-(sp)
+ jsr .trp
+! NOTE(review): nothing follows the call above, so if .trp returns, execution
+! runs past the end of this routine with the saved d4 still on the stack.
+! Confirm whether .trp can return here and whether a branch to 8b is needed.
.sect .bss
! unsigned long divide
+ !-----------------------------------------------------------------------------
+ ! rewritten by Kai-Uwe Bloem (i5110401@dbstu1.bitnet) for speed.
+ ! #1 01/12/90 initial revision. Minor reduce of shift operations.
+ ! #2 03/07/90 use 68000 divu instruction wherever possible. This change
+ ! makes #1 superfluous. (derived from my GNU division routine)
+ !-----------------------------------------------------------------------------
+ ! Some common cases can be handled in a special, much faster way :
+ ! 1) divisor = 0
+ ! => cause trap, then return to user. Result is undefined
+ ! 2) dividend < divisor
+ ! => quotient = 0, remainder = dividend
+ ! 3) divisor < 0x10000 ( i.e. divisor is only 16 bits wide )
+ ! => quotient and remainder can be calculated quite fast by repeated
+ ! application of 68000 divu operations (ca. 400 cycles)
+ ! 4) otherwise (due to #2, #3 dividend, divisor both wider than 16 bits)
+ ! => do slow division by shift and subtract
+ !-----------------------------------------------------------------------------
+
+
! register usage:
! entry : return address, divisor and dividend on the stack (in that
! order); popped into a1, d0 and d2
! exit : d1 quotient
! d2 remainder
+
.sect .text
+! .dvu: unsigned 32-bit divide. Pops the return address, then the divisor,
+! then the dividend from the stack; leaves the quotient in d1 and the
+! remainder in d2. d3 is preserved via a0.
.dvu:
+ move.l d3,a0 ! save d3
move.l (sp)+,a1 ! return address
- move.l (sp)+,d0
- move.l (sp)+,d1
- move.l d3,-(sp) ! save d3
- tst.l d0
- bne 0f
- move.l (sp)+,d3
- move.w #EIDIVZ,-(sp)
- jsr .trp
-0:
- clr.l d2
- move.l #32,d3
-3:
- lsl.l #1,d1
- roxl.l #1,d2
- cmp.l d0,d2
- blt 4f
- sub.l d0,d2
- add #1,d1
+ move.l (sp)+,d0 ! divisor
+ move.l (sp)+,d2 ! dividend
+ clr.l d1 ! prepare quotient
+! === case 1: divisor = 0
+ tst.l d0 ! divisor = 0 ?
+ beq 9f ! yes - divide by zero trap
+! === case 2: dividend < divisor
+ cmp.l d0,d2 ! dividend < divisor ?
+ bcs 8f ! yes - division already finished
+! === case 3: divisor <= 0x0ffff
+ cmp.l #0x0ffff,d0 ! is divisor only 16 bits wide ?
+ bhi 2f ! no - use the shift-and-subtract loop
+ move.w d2,d3 ! save dividend.l
+ clr.w d2 ! prepare dividend.h for divu operation
+ swap d2
+ beq 0f ! dividend.h is all zero, no divu necessary
+ divu d0,d2
+0: move.w d2,d1 ! save quotient.h
+ swap d1 ! move it to the upper word of the quotient
+ move.w d3,d2 ! divide dividend.l
+ divu d0,d2 ! (d2.h = remainder of prev divu)
+ move.w d2,d1 ! save quotient.l
+ clr.w d2 ! get remainder
+ swap d2
+ bra 8f
+! === case 4: divisor and dividend both > 0x0ffff
+2:
+ move #32-1,d3 ! loop count
4:
- sub #1,d3
- bgt 3b
- move.l (sp)+,d3
+ lsl.l #1,d2 ! shift dividend ...
+ roxl.l #1,d1 ! ... into d1
+ cmp.l d0,d1 ! compare with divisor
+ bcs 5f
+ sub.l d0,d1 ! bigger, subtract divisor
+ add #1,d2 ! note subtraction in result
+5:
+ dbra d3,4b
+ exg d1,d2 ! get results in the correct registers
+8:
+ move.l a0,d3 ! restore d3
jmp (a1)
+
+EIDIVZ = 6
+9: move.w #EIDIVZ,-(sp)
+ jsr .trp
+! NOTE(review): nothing follows the call above, so if .trp returns, execution
+! runs past the end of this routine without restoring d3 or using a1.
+! Confirm whether .trp can return here and whether a branch to 8b is needed.
--- /dev/null
+! edata.s: declares the sections in order, aligns .data to 4 and defines
+! the exported label _edata there.
+! NOTE(review): presumably this marks the end of .data because the module
+! is linked after all others -- confirm against the link order.
+.sect .text
+.sect .rom
+.sect .data
+.sect .bss
+.define _edata
+.sect .data
+ .align 4
+ .sect .data
+_edata:
--- /dev/null
+! em_end.s: aligns .text, .rom, .data and .bss to 4 and defines the exported
+! labels endtext (in .text), enddata (in .data) and __end / endbss (in .end).
+.sect .text
+.sect .rom
+.sect .data
+.sect .bss
+.define endtext,enddata,endbss,__end
+.sect .text
+ .align 4
+.sect .rom
+ .align 4
+.sect .data
+ .align 4
+.sect .bss
+ .align 4
+.sect .end ! only for declaration of _end, __end and endbss.
+
+ .sect .text
+endtext:
+ .sect .data
+enddata:
+ .sect .end
+__end:
+endbss:
+! end.s: declares the sections in order and defines the exported label _end
+! in the .end section.
-.define endtext,enddata,endbss,_etext,_edata,_end
.sect .text
.sect .rom
.sect .data
.sect .bss
-.sect .end ! only for declaration of _end and endbss.
-
- .sect .text
-endtext:
-_etext:
- .sect .data
-enddata:
-_edata:
- .sect .end
+.define _end
+.sect .end ! only for declaration of _end, __end and endbss.
_end:
-endbss:
--- /dev/null
+! etext.s: declares the sections in order, aligns .text to 4 and defines
+! the exported label _etext there.
+! NOTE(review): presumably this marks the end of .text because the module
+! is linked after all others -- confirm against the link order.
+.sect .text
+.sect .rom
+.sect .data
+.sect .bss
+.define _etext
+.sect .text
+ .align 4
+ .sect .text
+_etext:
--- /dev/null
+.define .adf4, .adf8, .sbf4, .sbf8, .mlf4, .mlf8, .dvf4, .dvf8
+.define .ngf4, .ngf8, .fif4, .fif8, .fef4, .fef8
+.define .cif4, .cif8, .cuf4, .cuf8, .cfi, .cfu, .cff4, .cff8
+.define .cmf4, .cmf8
+.sect .text
+.sect .rom
+.sect .data
+.sect .bss
+
+! $Header$
+
+! Implement interface to floating point package for M68881
+
+ .sect .text
+! .adf4: single-precision add. Pops the return address, adds the single at
+! 4(sp) to the single at (sp) and stores the sum at 4(sp). sp is left at the
+! first operand. NOTE(review): presumably the caller discards it -- confirm.
+.adf4:
+ move.l (sp)+,a0 ! return address
+ fmove.s (sp),fp0
+ fadd.s 4(sp),fp0
+ fmove.s fp0,4(sp)
+ jmp (a0)
+
+! .adf8: double-precision add. Pops the return address, adds the double at
+! 8(sp) to the double at (sp) and stores the sum at 8(sp). sp is left at the
+! first operand. NOTE(review): presumably the caller discards it -- confirm.
+.adf8:
+ move.l (sp)+,a0 ! return address
+ fmove.d (sp),fp0
+ fadd.d 8(sp),fp0
+ fmove.d fp0,8(sp)
+ jmp (a0)
+
+! .sbf4: single-precision subtract. Pops the return address, computes the
+! single at 4(sp) minus the single at (sp) and stores the result at 4(sp).
+.sbf4:
+ move.l (sp)+,a0 ! return address
+ fmove.s (sp),fp0
+ fmove.s 4(sp),fp1
+ fsub fp0,fp1 ! fp1 = fp1 - fp0
+ fmove.s fp1,4(sp)
+ jmp (a0)
+
+! .sbf8: double-precision subtract. Pops the return address, computes the
+! double at 8(sp) minus the double at (sp) and stores the result at 8(sp).
+.sbf8:
+ move.l (sp)+,a0 ! return address
+ fmove.d (sp),fp0
+ fmove.d 8(sp),fp1
+ fsub fp0,fp1 ! fp1 = fp1 - fp0
+ fmove.d fp1,8(sp)
+ jmp (a0)
+
+! .mlf4: single-precision multiply. Pops the return address, multiplies the
+! single at (sp) by the single at 4(sp) and stores the product at 4(sp).
+.mlf4:
+ move.l (sp)+,a0 ! return address
+ fmove.s (sp),fp0
+ fmul.s 4(sp),fp0
+ fmove.s fp0,4(sp)
+ jmp (a0)
+
+! .mlf8: double-precision multiply. Pops the return address, multiplies the
+! double at (sp) by the double at 8(sp) and stores the product at 8(sp).
+.mlf8:
+ move.l (sp)+,a0 ! return address
+ fmove.d (sp),fp0
+ fmul.d 8(sp),fp0
+ fmove.d fp0,8(sp)
+ jmp (a0)
+
+! .dvf4: single-precision divide. Pops the return address, divides the
+! single at 4(sp) by the single at (sp) and stores the quotient at 4(sp).
+.dvf4:
+ move.l (sp)+,a0 ! return address
+ fmove.s (sp),fp0
+ fmove.s 4(sp),fp1
+ fdiv fp0,fp1 ! fp1 = fp1 / fp0
+ fmove.s fp1,4(sp)
+ jmp (a0)
+
+! .dvf8: double-precision divide. Pops the return address, divides the
+! double at 8(sp) by the double at (sp) and stores the quotient at 8(sp).
+.dvf8:
+ move.l (sp)+,a0 ! return address
+ fmove.d (sp),fp0
+ fmove.d 8(sp),fp1
+ fdiv fp0,fp1 ! fp1 = fp1 / fp0
+ fmove.d fp1,8(sp)
+ jmp (a0)
+
+! .ngf4: negate in place the single at 4(sp), i.e. just above the return
+! address, which stays on the stack and is consumed by rts.
+.ngf4:
+ fmove.s 4(sp),fp0
+ fneg fp0
+ fmove.s fp0,4(sp)
+ rts
+
+! .ngf8: negate in place the double at 4(sp), i.e. just above the return
+! address, which stays on the stack and is consumed by rts.
+.ngf8:
+ fmove.d 4(sp),fp0
+ fneg fp0
+ fmove.d fp0,4(sp)
+ rts
+
+! .fif4: multiply two singles and split the product. After the return
+! address is popped, (sp) holds a pointer (loaded into a1) and 4(sp), 8(sp)
+! the two single operands. The integer part of the product is stored at
+! (a1) and the fractional part at 4(a1).
+.fif4:
+ move.l (sp)+,a0 ! return address
+ move.l (sp),a1 ! result pointer
+ fmove.s 4(sp),fp0
+ fmove.s 8(sp),fp1
+ fmul fp0,fp1 ! fp1 = product
+ fintrz fp1,fp0 ! fp0 = product rounded toward zero
+ fsub fp0,fp1 ! fp1 = fractional part
+ fmove.s fp1,4(a1)
+ fmove.s fp0,(a1)
+ jmp (a0)
+
+! .fif8: multiply two doubles and split the product. After the return
+! address is popped, (sp) holds a pointer (loaded into a1) and 4(sp), 12(sp)
+! the two double operands. The integer part of the product is stored at
+! (a1) and the fractional part at 8(a1).
+.fif8:
+ move.l (sp)+,a0 ! return address
+ move.l (sp),a1 ! result pointer
+ fmove.d 4(sp),fp0
+ fmove.d 12(sp),fp1
+ fmul fp0,fp1 ! fp1 = product
+ fintrz fp1,fp0 ! fp0 = product rounded toward zero
+ fsub fp0,fp1 ! fp1 = fractional part
+ fmove.d fp1,8(a1)
+ fmove.d fp0,(a1)
+ jmp (a0)
+
+! .fef4: split a single into exponent and fraction. After the return address
+! is popped, (sp) holds a pointer (loaded into a1) and 4(sp) the operand.
+! A word exponent is stored at (a1) and a single fraction at 2(a1).
+.fef4:
+ move.l (sp)+,a0 ! return address
+ move.l (sp),a1 ! result pointer
+ fmove.s 4(sp),fp0
+ fgetexp fp0,fp1 ! fp1 = exponent of the operand
+ fmove.l fpsr,d0
+ and.l #0x2000,d0 ! set if Infinity
+ beq 1f
+ move.w #129,(a1) ! infinity: fixed exponent 129
+ fmove.s 4(sp),fp0 ! reload to set the condition codes
+ fblt 2f ! negative operand
+ move.l #0x3f000000,2(a1) ! fraction = +0.5
+ jmp (a0)
+2:
+ move.l #0xbf000000,2(a1) ! fraction = -0.5
+ jmp (a0)
+1:
+ fmove.l fp1,d0
+ add.l #1,d0 ! exponent + 1, matching the halved mantissa below
+ fgetman fp0 ! fp0 = mantissa of the operand
+ fbne 1f
+ clr.l d0 ! zero mantissa: exponent 0
+ bra 2f
+1:
+ fmove.l #2,fp1
+ fdiv fp1,fp0 ! halve the mantissa
+2:
+ fmove.s fp0,2(a1)
+ move.w d0,(a1)
+ jmp (a0)
+
+! .fef8: split a double into exponent and fraction. After the return address
+! is popped, (sp) holds a pointer (loaded into a1) and 4(sp) the operand.
+! A word exponent is stored at (a1) and a double fraction at 2(a1).
+.fef8:
+ move.l (sp)+,a0 ! return address
+ move.l (sp),a1 ! result pointer
+ fmove.d 4(sp),fp0
+ fgetexp fp0,fp1 ! fp1 = exponent of the operand
+ fmove.l fpsr,d0
+ and.l #0x2000,d0 ! set if Infinity
+ beq 1f
+ move.w #1025,(a1) ! infinity: fixed exponent 1025
+ fmove.d 4(sp),fp0 ! reload to set the condition codes
+ fblt 2f ! negative operand
+ move.l #0x3fe00000,2(a1) ! fraction = +0.5 (upper long)
+ clr.l 6(a1) ! lower long of the fraction
+ jmp (a0)
+2:
+ move.l #0xbfe00000,2(a1) ! fraction = -0.5 (upper long)
+ clr.l 6(a1) ! lower long of the fraction
+ jmp (a0)
+1:
+ fmove.l fp1,d0
+ add.l #1,d0 ! exponent + 1, matching the halved mantissa below
+ fgetman fp0 ! fp0 = mantissa of the operand
+ fbne 1f
+ clr.l d0 ! zero mantissa: exponent 0
+ bra 2f
+1:
+ fmove.l #2,fp1
+ fdiv fp1,fp0 ! halve the mantissa
+2:
+ fmove.d fp0,2(a1)
+ move.w d0,(a1)
+ jmp (a0)
+
+! .cif4: convert a signed integer to single precision. After the return
+! address is popped, the word at (sp) is compared with 2: if equal, the
+! 16-bit integer at 2(sp) is converted and the single stored at (sp);
+! otherwise the 32-bit integer at 2(sp) is converted and the single stored
+! at 2(sp).
+.cif4:
+ move.l (sp)+,a0 ! return address
+ cmp.w #2,(sp)
+ bne 1f
+ fmove.w 2(sp),fp0
+ fmove.s fp0,(sp)
+ jmp (a0)
+1:
+ fmove.l 2(sp),fp0
+ fmove.s fp0,2(sp)
+ jmp (a0)
+
+! .cif8: convert a signed integer to double precision. After the return
+! address is popped, the word at (sp) is compared with 2 to choose between
+! the 16-bit and the 32-bit integer at 2(sp); the double is stored at (sp).
+! NOTE(review): the 8-byte store covers more than the word at (sp) plus the
+! integer; presumably the caller has room reserved above -- confirm.
+.cif8:
+ move.l (sp)+,a0 ! return address
+ cmp.w #2,(sp)
+ bne 1f
+ fmove.w 2(sp),fp0
+ fmove.d fp0,(sp)
+ jmp (a0)
+1:
+ fmove.l 2(sp),fp0
+ fmove.d fp0,(sp)
+ jmp (a0)
+
+! .cuf4: convert an unsigned integer to single precision. After the return
+! address is popped, the word at (sp) is compared with 2 to choose between
+! the 16-bit and the 32-bit integer at 2(sp). The value is loaded as signed
+! and corrected when its sign bit is set. The single is stored at (sp) in
+! the 16-bit case and at 2(sp) in the 32-bit case.
+.cuf4:
+ move.l (sp)+,a0 ! return address
+ cmp.w #2,(sp)
+ bne 2f
+ fmove.w 2(sp),fp0
+ tst.w 2(sp)
+ bge 1f
+ fadd.l #65536,fp0 ! sign bit was set: add 2^16
+1:
+ fmove.s fp0,(sp)
+ jmp (a0)
+2:
+ fmove.l 2(sp),fp0
+ tst.l 2(sp)
+ bge 1f
+ fsub.l #-2147483648,fp0 ! sign bit was set: add 2^31 twice
+ fsub.l #-2147483648,fp0
+1:
+ fmove.s fp0,2(sp)
+ jmp (a0)
+
+! .cuf8: convert an unsigned integer to double precision. After the return
+! address is popped, the word at (sp) is compared with 2 to choose between
+! the 16-bit and the 32-bit integer at 2(sp). The value is loaded as signed
+! and corrected when its sign bit is set; the double is stored at (sp).
+! NOTE(review): the 8-byte store covers more than the word at (sp) plus the
+! integer; presumably the caller has room reserved above -- confirm.
+.cuf8:
+ move.l (sp)+,a0 ! return address
+ move.w (sp),d0
+ cmp.w #2,d0
+ bne 2f
+ fmove.w 2(sp),fp0
+ tst.w 2(sp)
+ bge 1f
+ fadd.l #65536,fp0 ! sign bit was set: add 2^16
+ bra 1f
+2:
+ fmove.l 2(sp),fp0
+ tst.l 2(sp)
+ bge 1f
+ fsub.l #-2147483648,fp0 ! sign bit was set: add 2^31 twice
+ fsub.l #-2147483648,fp0
+1:
+ fmove.d fp0,(sp)
+ jmp (a0)
+
+! .cfi: convert floating point to a signed integer. After the return address
+! is popped, the word at (sp) is read into d1 (2 selects a 16-bit store,
+! anything else a 32-bit store) and the word at 2(sp) into d0 (4 selects a
+! single operand, anything else a double); the operand starts at 4(sp).
+! The integer is written over the last bytes of the operand, and sp ends at
+! the value it had after the pop.
+! NOTE(review): the fmove stores convert with the FPU's current rounding
+! mode; no fintrz is applied here -- confirm truncation is not expected.
+.cfi:
+ move.l (sp)+,a0 ! return address
+ move.w (sp),d1
+ move.w 2(sp),d0
+ cmp.w #4,d0
+ bne 1f
+ fmove.s 4(sp),fp0
+ bra 2f
+1:
+ fmove.d 4(sp),fp0
+ add.l #4,sp ! double: address the stores relative to its second half
+2:
+ cmp.w #2,d1
+ bne 1f
+ fmove.w fp0,6(sp)
+ bra 2f
+1:
+ fmove.l fp0,4(sp)
+2:
+ cmp.w #4,d0
+ beq 1f
+ sub.l #4,sp ! undo the adjustment made for a double
+1:
+ jmp (a0)
+
+! .cfu: convert floating point to an unsigned integer. After the return
+! address is popped, the word at (sp) is read into d1 (2 selects a 16-bit
+! store, anything else a 32-bit store) and the word at 2(sp) into d2 (4
+! selects a single operand, anything else a double); the operand starts at
+! 4(sp). The absolute value is truncated with fintrz; when the operand's
+! upper long compares (signed) at or above the constant used below, 2^31 is
+! subtracted before the conversion and bit 31 of the result is flipped
+! afterwards. The integer is written over the last bytes of the operand,
+! and sp ends at the value it had after the pop.
+.cfu:
+ move.l (sp)+,a0 ! return address
+ move.w (sp),d1
+ move.w 2(sp),d2
+ cmp.w #4,d2
+ bne 1f
+ fmove.s 4(sp),fp0
+ fabs fp0
+ cmp.l #0x4f000000,4(sp) ! compare raw single bits
+ bge 2f
+ fintrz fp0,fp0
+ fmove.l fp0,d0
+ bra 3f
+2:
+ fadd.l #-2147483648,fp0 ! subtract 2^31 first
+ fintrz fp0,fp0
+ fmove.l fp0,d0
+ bchg #31,d0 ! flip bit 31 of the converted value
+ bra 3f
+1:
+ fmove.d 4(sp),fp0
+ add.l #4,sp ! double: address the stores relative to its second half
+ fabs fp0
+ cmp.l #0x41e00000,(sp) ! compare upper long of the double
+ bge 1f
+ fintrz fp0,fp0
+ fmove.l fp0,d0
+ bra 3f
+1:
+ fadd.l #-2147483648,fp0 ! subtract 2^31 first
+ fintrz fp0,fp0
+ fmove.l fp0,d0
+ bchg #31,d0 ! flip bit 31 of the converted value
+3:
+ cmp.w #2,d1
+ bne 1f
+ move.w d0,6(sp)
+ bra 2f
+1:
+ move.l d0,4(sp)
+2:
+ cmp.w #4,d2
+ beq 1f
+ sub.l #4,sp ! undo the adjustment made for a double
+1:
+ jmp (a0)
+
+! .cff4: convert the double at (sp) to single precision, stored at 4(sp)
+! (the second half of the double's slot). The return address is popped first.
+.cff4:
+ move.l (sp)+,a0 ! return address
+ fmove.d (sp),fp0
+ fmove.s fp0,4(sp)
+ jmp (a0)
+
+! .cff8: convert the single at (sp) to double precision, stored at (sp).
+! The return address is popped first.
+! NOTE(review): the 8-byte store extends 4 bytes past the single operand;
+! presumably the caller has room reserved above -- confirm.
+.cff8:
+ move.l (sp)+,a0 ! return address
+ fmove.s (sp),fp0
+ fmove.d fp0,(sp)
+ jmp (a0)
+
+! .cmf4: compare two singles. Pops the return address and returns in d0:
+! 0 if the single at 4(sp) equals the one at (sp), -1 if it is less, and +1
+! in every other case. The operands are left on the stack.
+.cmf4:
+ move.l (sp)+,a0 ! return address
+ clr.l d0
+ fmove.s (sp),fp0
+ fmove.s 4(sp),fp1
+ fcmp fp0,fp1 ! compares fp1 against fp0
+ fbeq 2f
+ fblt 1f
+ add.l #1,d0
+ jmp (a0)
+1:
+ sub.l #1,d0
+2:
+ jmp (a0)
+
+! .cmf8: compare two doubles. Pops the return address and returns in d0:
+! 0 if the double at 8(sp) equals the one at (sp), -1 if it is less, and +1
+! in every other case. The operands are left on the stack.
+.cmf8:
+ move.l (sp)+,a0 ! return address
+ clr.l d0
+ fmove.d (sp),fp0
+ fmove.d 8(sp),fp1
+ fcmp fp0,fp1 ! compares fp1 against fp0
+ fbeq 2f
+ fblt 1f
+ add.l #1,d0
+ jmp (a0)
+1:
+ sub.l #1,d0
+2:
+ jmp (a0)
.sect .data
.sect .bss
+ ! signed long multiply
+ !-----------------------------------------------------------------------------
+ ! rewritten by Kai-Uwe Bloem (i5110401@dbstu1.bitnet) for speed.
+ ! #1 01/12/90 initial revision
+ !-----------------------------------------------------------------------------
+ ! 3 cases worth recognizing :
+ ! 1) both the upper word of u and v are zero
+ ! => 1 mult : Low*Low
+ ! 2) only one of the upper words is zero
+ ! => 2 mult : Low*HighLow
+ ! 3) both upper words are not zero
+ ! => 4 mult : HighLow*HighLow
+ ! there are other cases (e.g. lower word is zero but high word is not, or
+ ! one operand is all zero). However, this seems not to be very common, so
+ ! they are ignored for the price of superfluous multiplications in these
+ ! cases.
+ !-----------------------------------------------------------------------------
+
+ ! entry : both operands on the stack above the return address; they are
+ ! popped as four words into d0-d3
+ ! exit : d0 high order result
+ ! d1 low order result
+ ! d2,a0,a1 : destroyed
.sect .text
+! .mli: signed 32-bit multiply. Pops the return address and then both
+! operands as four words (movem.w) into d0-d3: d0:d1 = high:low word of the
+! first long popped ("v"), d2:d3 = high:low word of the second ("u").
+! The low word of d5 collects the sign; at 9: the d0:d1 result is negated
+! when that word is negative. d3 is preserved via a0, d5.w via the stack.
+! NOTE(review): the path taken when d2 is negative negates d0:d1, not d2:d3,
+! although its comment says "negate u" -- confirm this is intended.
+! NOTE(review): the word-sized negations leave the upper half of d0 as
+! loaded by movem.w; confirm callers rely only on d1.
.mli:
- move.l (sp)+,a0
- move.l (sp)+,d1
- move.l (sp)+,d0
- move.l d5,-(sp)
- clr d5
- tst.l d0
- bpl 1f
- neg.l d0
- not d5
-1:
- tst.l d1
- bpl 2f
- neg.l d1
- not d5
-2:
- move.l d0,-(sp)
- move.l d1,-(sp)
- jsr .mlu
- tst d5
- beq 3f
+ move.l (sp)+,a1 ! return address
+ move.l d3,a0 ! save register
+ movem.w (sp)+,d0-d3 ! get v and u
+ move.w d5,-(sp) ! save sign register
+ move.w d2,d5
+ bge 0f ! negate u if necessary
+ neg.w d1
+ negx.w d0
+0: tst.w d0
+ bge 0f ! negate v if necessary
+ eor.w d0,d5
+ neg.w d1
+ negx.w d0
+0: bne 1f ! case 2) or 3)
+ tst.w d2
+ bne 2f ! case 2)
+! === case 1: _l x _l ===
+ mulu d3,d1 ! r.l = u.l x v.l
+9: ! (r.h is already zero)
+ tst.w d5 ! negate result if necessary
+ bpl 0f
neg.l d1
negx.l d0
+0: move.w (sp)+,d5 ! return
+ move.l a0,d3
+ jmp (a1)
+! === possibly case 2) or case 3) ===
+1:
+ tst.w d2
+ bne 3f ! case 3)
+! === case 2: _l x hl ===
+ exg d0,d2 ! exchange u and v
+ exg d1,d3 ! (minimizes number of distinct cases)
+2:
+ mulu d1,d2 ! a = v.l x u.h
+ mulu d3,d1 ! r.l = v.l x u.l
+ swap d2 ! a = a << 16
+ clr.l d3
+ move.w d2,d3
+ clr.w d2
+ add.l d2,d1 ! r += a
+ addx.l d3,d0
+ bra 9b
+! === case 3: hl x hl ===
3:
- move.l (sp)+,d5
- jmp (a0)
+ move.l d4,-(sp) ! need more registers
+ move.w d2,d4
+ mulu d1,d4 ! a = v.l x u.h
+ mulu d3,d1 ! r.l = u.l x v.l
+ mulu d0,d3 ! b = v.h x u.l
+ mulu d2,d0 ! r.h = u.h x v.h
+ swap d1 ! (just for simplicity)
+ add.w d4,d1 ! r += a << 16
+ clr.w d4
+ swap d4
+ addx.l d4,d0
+ add.w d3,d1 ! r += b << 16
+ clr.w d3
+ swap d3
+ addx.l d3,d0
+ swap d1
+ move.l (sp)+,d4 ! return
+ bra 9b
.sect .data
.sect .bss
+ ! unsigned long multiply
+ !-----------------------------------------------------------------------------
+ ! rewritten by Kai-Uwe Bloem (i5110401@dbstu1.bitnet) for speed.
+ ! #1 01/12/90 initial revision
+ !-----------------------------------------------------------------------------
+ ! 3 cases worth recognizing :
+ ! 1) both the upper word of u and v are zero
+ ! => 1 mult : Low*Low
+ ! 2) only one of the upper words is zero
+ ! => 2 mult : Low*HighLow
+ ! 3) both upper words are not zero
+ ! => 4 mult : HighLow*HighLow
+ ! there are other cases (e.g. lower word is zero but high word is not, or
+ ! one operand is all zero). However, this seems not to be very common, so
+ ! they are ignored for the price of superfluous multiplications in these
+ ! cases.
+ !-----------------------------------------------------------------------------
+
! entry : both operands on the stack above the return address; they are
! popped as four words into d0-d3
! exit : d0 high order result
! d1 low order result
+ ! d2,a0,a1 : destroyed
.sect .text
+! .mlu: unsigned 32-bit multiply. Pops the return address and then both
+! operands as four words (movem.w) into d0-d3: d0:d1 = high:low word of the
+! first long popped ("v"), d2:d3 = high:low word of the second ("u").
+! The result is left in d0 (high) and d1 (low). d3 is preserved via a0;
+! d4 is saved on the stack only in case 3.
.mlu:
- move.l (sp)+,a1
- move.l (sp)+,d1
- move.l (sp)+,d0
- movem.l d3/d4/d6,-(sp)
- move.l d1,d3
- move.l d1,d2
- swap d2
- move.l d2,d4
- mulu d0,d1
- mulu d0,d2
- swap d0
- mulu d0,d3
- mulu d4,d0
- clr.l d6
- swap d1
- add d2,d1
- addx.l d6,d0
- add d3,d1
- addx.l d6,d0
- swap d1
- clr d2
- clr d3
- swap d2
+ move.l (sp)+,a1 ! return address
+ move.l d3,a0 ! save register
+ movem.w (sp)+,d0-d3 ! get v and u
+ tst.w d0
+ bne 1f ! case 2) or 3)
+ tst.w d2
+ bne 2f ! case 2)
+! === case 1: _l x _l ===
+ mulu d3,d1 ! r.l = u.l x v.l
+ move.l a0,d3 ! (r.h is already zero)
+ jmp (a1) ! return
+! === possibly case 2) or case 3) ===
+1:
+ tst.w d2
+ bne 3f ! case 3)
+! === case 2: _l x hl ===
+ exg d0,d2 ! exchange u and v
+ exg d1,d3 ! (minimizes number of distinct cases)
+2:
+ mulu d1,d2 ! a = v.l x u.h
+ mulu d3,d1 ! r.l = v.l x u.l
+ swap d2 ! a = a << 16
+ clr.l d3
+ move.w d2,d3
+ clr.w d2
+ add.l d2,d1 ! r += a
+ addx.l d3,d0
+ move.l a0,d3 ! return
+ jmp (a1)
+! === case 3: hl x hl ===
+3:
+ move.l d4,-(sp) ! need more registers
+ move.w d2,d4
+ mulu d1,d4 ! a = v.l x u.h
+ mulu d3,d1 ! r.l = u.l x v.l
+ mulu d0,d3 ! b = v.h x u.l
+ mulu d2,d0 ! r.h = u.h x v.h
+ swap d1 ! (just for simplicity)
+ add.w d4,d1 ! r += a << 16
+ clr.w d4
+ swap d4
+ addx.l d4,d0
+ add.w d3,d1 ! r += b << 16
+ clr.w d3
swap d3
- add.l d2,d0
- add.l d3,d0
- movem.l (sp)+,d3/d4/d6
+ addx.l d3,d0
+ swap d1
+ move.l (sp)+,d4 ! return
+ move.l a0,d3
jmp (a1)