From: ceriel Date: Fri, 1 Feb 1991 15:09:58 +0000 (+0000) Subject: Added end library and floating point processor support X-Git-Tag: release-5-5~1266 X-Git-Url: https://git.ndcode.org/public/gitweb.cgi?a=commitdiff_plain;h=0f4e675b50be005efa684e23a22257a4b4431f36;p=ack.git Added end library and floating point processor support --- diff --git a/mach/m68k2/libem/.distr b/mach/m68k2/libem/.distr index e862ca95b..ce020b7c5 100644 --- a/mach/m68k2/libem/.distr +++ b/mach/m68k2/libem/.distr @@ -1,5 +1,9 @@ LIST Makefile compmodule +em_end.s +etext.s +edata.s end.s libem_s.a +READ_ME diff --git a/mach/m68k2/libem/LIST b/mach/m68k2/libem/LIST index 62ca99ccb..98e42ba4b 100644 --- a/mach/m68k2/libem/LIST +++ b/mach/m68k2/libem/LIST @@ -26,7 +26,7 @@ shp.s sig.s cms.s gto.s -ffp.s +fp68881.s fat.s trp.s dia.s diff --git a/mach/m68k2/libem/Makefile b/mach/m68k2/libem/Makefile index 3b6cf5386..b8a2346a2 100644 --- a/mach/m68k2/libem/Makefile +++ b/mach/m68k2/libem/Makefile @@ -1,28 +1,33 @@ # $Header$ MACH=m68k2 -all: libem_o.a end.o +ASAR=aal +all: libem_o.a end.a install: all ../../install libem_o.a tail_em - ../../install end.o end_em + ../../install end.a end_em cmp: all -../../compare libem_o.a tail_em - -../../compare end.o end_em + -../../compare end.a end_em -end.o: end.s +end.a: em_end.s etext.s edata.s end.s + $(MACH) -I../../../h -c em_end.s + $(MACH) -I../../../h -c edata.s + $(MACH) -I../../../h -c etext.s $(MACH) -I../../../h -c end.s + $(ASAR) cr end.a em_end.o etext.o edata.o end.o libem_o.a: libem_s.a - ASAR=aal ; export ASAR ;\ + ASAR=$(ASAR) ; export ASAR ;\ march . 
libem_o.a clean: - rm -f *.o libem_o.a + rm -f *.o libem_o.a end.a opr : make pr | opr pr: @arch pv libem_s.a | pr -h `pwd`/libem_s.a - @pr `pwd`/end.s + @pr `pwd`/em_end.s `pwd`/edata.s `pwd`/etext.s `pwd`/end.s diff --git a/mach/m68k2/libem/READ_ME b/mach/m68k2/libem/READ_ME index f0e7d9a76..ca48431ca 100644 --- a/mach/m68k2/libem/READ_ME +++ b/mach/m68k2/libem/READ_ME @@ -1,5 +1,4 @@ -The original EM library routines saved all registers -(including scratch registers) in global data; hence they -were not reentrant. -The new routines do not save registers d0,d1,d2,a0 and a1. -They are reentrant. +The routines in mli.s, mlu.s, dvi.s, and dvu.s are written by +Kai-Uwe Bloem and were published on the comp.os.minix newsgroup. +He allowed us to use them for ACK, but requested that +they do not fall under the ACK copyright notice. So, they don't. diff --git a/mach/m68k2/libem/dvi.s b/mach/m68k2/libem/dvi.s index a17ad0f71..82470c9e6 100644 --- a/mach/m68k2/libem/dvi.s +++ b/mach/m68k2/libem/dvi.s @@ -5,38 +5,96 @@ .sect .bss ! signed long divide + !----------------------------------------------------------------------------- + ! rewritten by Kai-Uwe Bloem (i5110401@dbstu1.bitnet) for speed. + ! #1 01/12/90 initial revision. Minor reduce of shift operations. + ! #2 03/07/90 use 68000 divu instruction whereever possible. This change + ! makes #1 superflous. (derived from my GNU division routine) + !----------------------------------------------------------------------------- + ! Some common cases can be handled in a special, much faster way : + ! 1) divisor = 0 + ! => cause trap, then return to user. Result is undefined + ! 2) dividend < divisor + ! => quotient = 0, remainder = dividend + ! 3) divisor < 0x10000 ( i.e. divisor is only 16 bits wide ) + ! => quotient and remainder can be calculated quite fast by repeated + ! application of 68000 divu operations (ca. 400 cycles) + ! 4) otherwise (due to #2, #3 dividend, divisor both wider then 16 bits) + ! 
=> do slow division by shift and subtract + !----------------------------------------------------------------------------- + + + ! register usage: + ! : d0 divisor + ! d1 dividend + ! exit : d1 quotient + ! d2 remainder + .sect .text .dvi: - move.l (sp)+,a0 ! return address - move.l (sp)+,d0 - move.l (sp)+,d1 - move.l d3,-(sp) ! save d3 and d4 - move.l d4,-(sp) + move.l (sp)+,a1 ! return address + move.l (sp)+,d0 ! divisor + move.l (sp)+,d2 ! dividend + move.l d3,a0 ! save d3 + move.l d4,-(sp) ! save result sign register clr.l d4 - tst.l d0 ! divisor - bpl 1f - neg.l d0 - not d4 -1: - tst.l d1 ! dividend - bpl 2f - neg.l d1 - not d4 - swap d4 - not d4 - swap d4 + tst.l d2 + bpl 0f ! dividend is negative ? + neg.l d2 ! yes - negate + not.l d4 ! and note negation in d4 +0: + tst.l d0 + bpl 0f ! divisor is negative ? + neg.l d0 ! yes - negate + not.w d4 ! note negation +0: + clr.l d1 ! prepare quotient +! === case 1: divisor = 0 + tst.l d0 ! divisor = 0 ? + beq 9f ! yes - divide by zero trap +! === case 2: dividend < divisor + cmp.l d0,d2 ! dividend < divisor ? + bcs 8f ! yes - division already finished +! === case 3: divisor <= 0x0ffff + cmp.l #0x0ffff,d0 ! is divisor only 16 bits wide ? + bhi 2f + move.w d2,d3 ! save dividend.l + clr.w d2 ! prepare dividend.h for divu operation + swap d2 + beq 0f ! dividend.h is all zero, no divu necessary + divu d0,d2 +0: move.w d2,d1 ! save quotient.h + swap d1 + move.w d3,d2 ! divide dividend.l + divu d0,d2 ! (d2.h = remainder of prev divu) + move.w d2,d1 ! save qoutient.l + clr.w d2 ! get remainder + swap d2 + bra 8f +! === case 4: divisor and dividend both > 0x0ffff 2: - move.l d1,-(sp) - move.l d0,-(sp) - jsr .dvu - tst d4 - beq 5f - neg.l d1 ! quotient + move #32-1,d3 ! loop count +4: + lsl.l #1,d2 ! shift dividend ... + roxl.l #1,d1 ! ... into d1 + cmp.l d0,d1 ! compare with divisor + bcs 5f + sub.l d0,d1 ! bigger, subtract divisor + add #1,d2 ! note subtraction in result 5: - tst.l d4 - bpl 6f - neg.l d2 ! 
remainder -6: - move.l (sp)+,d4 ! restore d4 and d3 - move.l (sp)+,d3 - jmp (a0) + dbra d3,4b + exg d1,d2 ! get results in the correct registers +8: + tst.w d4 ! quotient < 0 ? + bpl 0f + neg.l d1 ! yes - negate +0: tst.l d4 ! remainder < 0 ? + bpl 0f + neg.l d2 +0: move.l (sp)+,d4 ! restore d4 + move.l a0,d3 ! restore d3 + jmp (a1) + +EIDIVZ = 6 +9: move.w #EIDIVZ,-(sp) + jsr .trp diff --git a/mach/m68k2/libem/dvu.s b/mach/m68k2/libem/dvu.s index 7efdd441c..005c351e6 100644 --- a/mach/m68k2/libem/dvu.s +++ b/mach/m68k2/libem/dvu.s @@ -5,34 +5,77 @@ .sect .bss ! unsigned long divide + !----------------------------------------------------------------------------- + ! rewritten by Kai-Uwe Bloem (i5110401@dbstu1.bitnet) for speed. + ! #1 01/12/90 initial revision. Minor reduce of shift operations. + ! #2 03/07/90 use 68000 divu instruction whereever possible. This change + ! makes #1 superflous. (derived from my GNU division routine) + !----------------------------------------------------------------------------- + ! Some common cases can be handled in a special, much faster way : + ! 1) divisor = 0 + ! => cause trap, then return to user. Result is undefined + ! 2) dividend < divisor + ! => quotient = 0, remainder = dividend + ! 3) divisor < 0x10000 ( i.e. divisor is only 16 bits wide ) + ! => quotient and remainder can be calculated quite fast by repeated + ! application of 68000 divu operations (ca. 400 cycles) + ! 4) otherwise (due to #2, #3 dividend, divisor both wider then 16 bits) + ! => do slow division by shift and subtract + !----------------------------------------------------------------------------- + + ! register usage: ! : d0 divisor ! d1 dividend ! exit : d1 quotient ! d2 remainder + .sect .text .dvu: + move.l d3,a0 ! save d3 move.l (sp)+,a1 ! return address - move.l (sp)+,d0 - move.l (sp)+,d1 - move.l d3,-(sp) ! 
save d3 - tst.l d0 - bne 0f - move.l (sp)+,d3 - move.w #EIDIVZ,-(sp) - jsr .trp -0: - clr.l d2 - move.l #32,d3 -3: - lsl.l #1,d1 - roxl.l #1,d2 - cmp.l d0,d2 - blt 4f - sub.l d0,d2 - add #1,d1 + move.l (sp)+,d0 ! divisor + move.l (sp)+,d2 ! dividend + clr.l d1 ! prepare quotient +! === case 1: divisor = 0 + tst.l d0 ! divisor = 0 ? + beq 9f ! yes - divide by zero trap +! === case 2: dividend < divisor + cmp.l d0,d2 ! dividend < divisor ? + bcs 8f ! yes - division already finished +! === case 3: divisor <= 0x0ffff + cmp.l #0x0ffff,d0 ! is divisor only 16 bits wide ? + bhi 2f + move.w d2,d3 ! save dividend.l + clr.w d2 ! prepare dividend.h for divu operation + swap d2 + beq 0f ! dividend.h is all zero, no divu necessary + divu d0,d2 +0: move.w d2,d1 ! save quotient.h + swap d1 + move.w d3,d2 ! divide dividend.l + divu d0,d2 ! (d2.h = remainder of prev divu) + move.w d2,d1 ! save qoutient.l + clr.w d2 ! get remainder + swap d2 + bra 8f +! === case 4: divisor and dividend both > 0x0ffff +2: + move #32-1,d3 ! loop count 4: - sub #1,d3 - bgt 3b - move.l (sp)+,d3 + lsl.l #1,d2 ! shift dividend ... + roxl.l #1,d1 ! ... into d1 + cmp.l d0,d1 ! compare with divisor + bcs 5f + sub.l d0,d1 ! bigger, subtract divisor + add #1,d2 ! note subtraction in result +5: + dbra d3,4b + exg d1,d2 ! get results in the correct registers +8: + move.l a0,d3 ! 
restore d3 jmp (a1) + +EIDIVZ = 6 +9: move.w #EIDIVZ,-(sp) + jsr .trp diff --git a/mach/m68k2/libem/edata.s b/mach/m68k2/libem/edata.s new file mode 100644 index 000000000..f53adc109 --- /dev/null +++ b/mach/m68k2/libem/edata.s @@ -0,0 +1,9 @@ +.sect .text +.sect .rom +.sect .data +.sect .bss +.define _edata +.sect .data + .align 4 + .sect .data +_edata: diff --git a/mach/m68k2/libem/em_end.s b/mach/m68k2/libem/em_end.s new file mode 100644 index 000000000..a062368da --- /dev/null +++ b/mach/m68k2/libem/em_end.s @@ -0,0 +1,22 @@ +.sect .text +.sect .rom +.sect .data +.sect .bss +.define endtext,enddata,endbss,__end +.sect .text + .align 4 +.sect .rom + .align 4 +.sect .data + .align 4 +.sect .bss + .align 4 +.sect .end ! only for declaration of _end, __end and endbss. + + .sect .text +endtext: + .sect .data +enddata: + .sect .end +__end: +endbss: diff --git a/mach/m68k2/libem/end.s b/mach/m68k2/libem/end.s index 37e1cef2c..93a1e6e00 100644 --- a/mach/m68k2/libem/end.s +++ b/mach/m68k2/libem/end.s @@ -1,16 +1,7 @@ -.define endtext,enddata,endbss,_etext,_edata,_end .sect .text .sect .rom .sect .data .sect .bss -.sect .end ! only for declaration of _end and endbss. - - .sect .text -endtext: -_etext: - .sect .data -enddata: -_edata: - .sect .end +.define _end +.sect .end ! only for declaration of _end, __end and endbss. 
_end: -endbss: diff --git a/mach/m68k2/libem/etext.s b/mach/m68k2/libem/etext.s new file mode 100644 index 000000000..8c7453cb4 --- /dev/null +++ b/mach/m68k2/libem/etext.s @@ -0,0 +1,9 @@ +.sect .text +.sect .rom +.sect .data +.sect .bss +.define _etext +.sect .text + .align 4 + .sect .text +_etext: diff --git a/mach/m68k2/libem/fp68881.s b/mach/m68k2/libem/fp68881.s new file mode 100644 index 000000000..dd932fb43 --- /dev/null +++ b/mach/m68k2/libem/fp68881.s @@ -0,0 +1,352 @@ +.define .adf4, .adf8, .sbf4, .sbf8, .mlf4, .mlf8, .dvf4, .dvf8 +.define .ngf4, .ngf8, .fif4, .fif8, .fef4, .fef8 +.define .cif4, .cif8, .cuf4, .cuf8, .cfi, .cfu, .cff4, .cff8 +.define .cmf4, .cmf8 +.sect .text +.sect .rom +.sect .data +.sect .bss + +! $Header$ + +! Implement interface to floating point package for M68881 + + .sect .text +.adf4: + move.l (sp)+,a0 + fmove.s (sp),fp0 + fadd.s 4(sp),fp0 + fmove.s fp0,4(sp) + jmp (a0) + +.adf8: + move.l (sp)+,a0 + fmove.d (sp),fp0 + fadd.d 8(sp),fp0 + fmove.d fp0,8(sp) + jmp (a0) + +.sbf4: + move.l (sp)+,a0 + fmove.s (sp),fp0 + fmove.s 4(sp),fp1 + fsub fp0,fp1 + fmove.s fp1,4(sp) + jmp (a0) + +.sbf8: + move.l (sp)+,a0 + fmove.d (sp),fp0 + fmove.d 8(sp),fp1 + fsub fp0,fp1 + fmove.d fp1,8(sp) + jmp (a0) + +.mlf4: + move.l (sp)+,a0 + fmove.s (sp),fp0 + fmul.s 4(sp),fp0 + fmove.s fp0,4(sp) + jmp (a0) + +.mlf8: + move.l (sp)+,a0 + fmove.d (sp),fp0 + fmul.d 8(sp),fp0 + fmove.d fp0,8(sp) + jmp (a0) + +.dvf4: + move.l (sp)+,a0 + fmove.s (sp),fp0 + fmove.s 4(sp),fp1 + fdiv fp0,fp1 + fmove.s fp1,4(sp) + jmp (a0) + +.dvf8: + move.l (sp)+,a0 + fmove.d (sp),fp0 + fmove.d 8(sp),fp1 + fdiv fp0,fp1 + fmove.d fp1,8(sp) + jmp (a0) + +.ngf4: + fmove.s 4(sp),fp0 + fneg fp0 + fmove.s fp0,4(sp) + rts + +.ngf8: + fmove.d 4(sp),fp0 + fneg fp0 + fmove.d fp0,4(sp) + rts + +.fif4: + move.l (sp)+,a0 + move.l (sp),a1 + fmove.s 4(sp),fp0 + fmove.s 8(sp),fp1 + fmul fp0,fp1 + fintrz fp1,fp0 + fsub fp0,fp1 + fmove.s fp1,4(a1) + fmove.s fp0,(a1) + jmp (a0) + +.fif8: + move.l 
(sp)+,a0 + move.l (sp),a1 + fmove.d 4(sp),fp0 + fmove.d 12(sp),fp1 + fmul fp0,fp1 + fintrz fp1,fp0 + fsub fp0,fp1 + fmove.d fp1,8(a1) + fmove.d fp0,(a1) + jmp (a0) + +.fef4: + move.l (sp)+,a0 + move.l (sp),a1 + fmove.s 4(sp),fp0 + fgetexp fp0,fp1 + fmove.l fpsr,d0 + and.l #0x2000,d0 ! set if Infinity + beq 1f + move.w #129,(a1) + fmove.s 4(sp),fp0 + fblt 2f + move.l #0x3f000000,2(a1) + jmp (a0) +2: + move.l #0xbf000000,2(a1) + jmp (a0) +1: + fmove.l fp1,d0 + add.l #1,d0 + fgetman fp0 + fbne 1f + clr.l d0 + bra 2f +1: + fmove.l #2,fp1 + fdiv fp1,fp0 +2: + fmove.s fp0,2(a1) + move.w d0,(a1) + jmp (a0) + +.fef8: + move.l (sp)+,a0 + move.l (sp),a1 + fmove.d 4(sp),fp0 + fgetexp fp0,fp1 + fmove.l fpsr,d0 + and.l #0x2000,d0 ! set if Infinity + beq 1f + move.w #1025,(a1) + fmove.d 4(sp),fp0 + fblt 2f + move.l #0x3fe00000,2(a1) + clr.l 6(a1) + jmp (a0) +2: + move.l #0xbfe00000,2(a1) + clr.l 6(a1) + jmp (a0) +1: + fmove.l fp1,d0 + add.l #1,d0 + fgetman fp0 + fbne 1f + clr.l d0 + bra 2f +1: + fmove.l #2,fp1 + fdiv fp1,fp0 +2: + fmove.d fp0,2(a1) + move.w d0,(a1) + jmp (a0) + +.cif4: + move.l (sp)+,a0 + cmp.w #2,(sp) + bne 1f + fmove.w 2(sp),fp0 + fmove.s fp0,(sp) + jmp (a0) +1: + fmove.l 2(sp),fp0 + fmove.s fp0,2(sp) + jmp (a0) + +.cif8: + move.l (sp)+,a0 + cmp.w #2,(sp) + bne 1f + fmove.w 2(sp),fp0 + fmove.d fp0,(sp) + jmp (a0) +1: + fmove.l 2(sp),fp0 + fmove.d fp0,(sp) + jmp (a0) + +.cuf4: + move.l (sp)+,a0 + cmp.w #2,(sp) + bne 2f + fmove.w 2(sp),fp0 + tst.w 2(sp) + bge 1f + fadd.l #65536,fp0 +1: + fmove.s fp0,(sp) + jmp (a0) +2: + fmove.l 2(sp),fp0 + tst.l 2(sp) + bge 1f + fsub.l #-2147483648,fp0 + fsub.l #-2147483648,fp0 +1: + fmove.s fp0,2(sp) + jmp (a0) + +.cuf8: + move.l (sp)+,a0 + move.w (sp),d0 + cmp.w #2,d0 + bne 2f + fmove.w 2(sp),fp0 + tst.w 2(sp) + bge 1f + fadd.l #65536,fp0 + bra 1f +2: + fmove.l 2(sp),fp0 + tst.l 2(sp) + bge 1f + fsub.l #-2147483648,fp0 + fsub.l #-2147483648,fp0 +1: + fmove.d fp0,(sp) + jmp (a0) + +.cfi: + move.l (sp)+,a0 + move.w (sp),d1 + 
move.w 2(sp),d0 + cmp.w #4,d0 + bne 1f + fmove.s 4(sp),fp0 + bra 2f +1: + fmove.d 4(sp),fp0 + add.l #4,sp +2: + cmp.w #2,d1 + bne 1f + fmove.w fp0,6(sp) + bra 2f +1: + fmove.l fp0,4(sp) +2: + cmp.w #4,d0 + beq 1f + sub.l #4,sp +1: + jmp (a0) + +.cfu: + move.l (sp)+,a0 + move.w (sp),d1 + move.w 2(sp),d2 + cmp.w #4,d2 + bne 1f + fmove.s 4(sp),fp0 + fabs fp0 + cmp.l #0x4f000000,4(sp) + bge 2f + fintrz fp0,fp0 + fmove.l fp0,d0 + bra 3f +2: + fadd.l #-2147483648,fp0 + fintrz fp0,fp0 + fmove.l fp0,d0 + bchg #31,d0 + bra 3f +1: + fmove.d 4(sp),fp0 + add.l #4,sp + fabs fp0 + cmp.l #0x41e00000,(sp) + bge 1f + fintrz fp0,fp0 + fmove.l fp0,d0 + bra 3f +1: + fadd.l #-2147483648,fp0 + fintrz fp0,fp0 + fmove.l fp0,d0 + bchg #31,d0 +3: + cmp.w #2,d1 + bne 1f + move.w d0,6(sp) + bra 2f +1: + move.l d0,4(sp) +2: + cmp.w #4,d2 + beq 1f + sub.l #4,sp +1: + jmp (a0) + +.cff4: + move.l (sp)+,a0 + fmove.d (sp),fp0 + fmove.s fp0,4(sp) + jmp (a0) + +.cff8: + move.l (sp)+,a0 + fmove.s (sp),fp0 + fmove.d fp0,(sp) + jmp (a0) + +.cmf4: + move.l (sp)+,a0 + clr.l d0 + fmove.s (sp),fp0 + fmove.s 4(sp),fp1 + fcmp fp0,fp1 + fbeq 2f + fblt 1f + add.l #1,d0 + jmp (a0) +1: + sub.l #1,d0 +2: + jmp (a0) + +.cmf8: + move.l (sp)+,a0 + clr.l d0 + fmove.d (sp),fp0 + fmove.d 8(sp),fp1 + fcmp fp0,fp1 + fbeq 2f + fblt 1f + add.l #1,d0 + jmp (a0) +1: + sub.l #1,d0 +2: + jmp (a0) diff --git a/mach/m68k2/libem/mli.s b/mach/m68k2/libem/mli.s index a0aa9a4f2..660bda16a 100644 --- a/mach/m68k2/libem/mli.s +++ b/mach/m68k2/libem/mli.s @@ -4,31 +4,92 @@ .sect .data .sect .bss + ! signed long mulitply + !----------------------------------------------------------------------------- + ! rewritten by Kai-Uwe Bloem (i5110401@dbstu1.bitnet) for speed. + ! #1 01/12/90 initial revision + !----------------------------------------------------------------------------- + ! 3 cases worth to recognize : + ! 1) both the upper word of u and v are zero + ! => 1 mult : Low*Low + ! 2) only one of the upper words is zero + ! 
=> 2 mult : Low*HighLow + ! 3) both upper words are not zero + ! => 4 mult : HighLow*HighLow + ! there are other cases (e.g. lower word is zero but high word is not, or + ! one operand is all zero). However, this seems not to be very common, so + ! they are ignored for the price of superfluous multiplications in these + ! cases. + !----------------------------------------------------------------------------- + + ! entry : d0 multiplicand + ! d1 multiplier + ! exit : d0 high order result + ! d1 low order result + ! d2,a0,a1 : destroyed .sect .text .mli: - move.l (sp)+,a0 - move.l (sp)+,d1 - move.l (sp)+,d0 - move.l d5,-(sp) - clr d5 - tst.l d0 - bpl 1f - neg.l d0 - not d5 -1: - tst.l d1 - bpl 2f - neg.l d1 - not d5 -2: - move.l d0,-(sp) - move.l d1,-(sp) - jsr .mlu - tst d5 - beq 3f + move.l (sp)+,a1 ! return address + move.l d3,a0 ! save register + movem.w (sp)+,d0-d3 ! get v and u + move.w d5,-(sp) ! save sign register + move.w d2,d5 + bge 0f ! negate u if necessary + neg.w d3 ! u is held in d2:d3 (high:low word) + negx.w d2 +0: tst.w d0 + bge 0f ! negate v if necessary + eor.w d0,d5 + neg.w d1 + negx.w d0 +0: bne 1f ! case 2) or 3) + tst.w d2 + bne 2f ! case 2) +! === case 1: _l x _l === + mulu d3,d1 ! r.l = u.l x v.l +9: ! (r.h is already zero) + tst.w d5 ! negate result if necessary + bpl 0f neg.l d1 negx.l d0 +0: move.w (sp)+,d5 ! return + move.l a0,d3 + jmp (a1) +! === possibly case 2) or case 3) === +1: + tst.w d2 + bne 3f ! case 3) +! === case 2: _l x hl === + exg d0,d2 ! exchange u and v + exg d1,d3 ! (minimizes number of distinct cases) +2: + mulu d1,d2 ! a = v.l x u.h + mulu d3,d1 ! r.l = v.l x u.l + swap d2 ! a = a << 16 + clr.l d3 + move.w d2,d3 + clr.w d2 + add.l d2,d1 ! r += a + addx.l d3,d0 + bra 9b +! === case 3: hl x hl === 3: - move.l (sp)+,d5 - jmp (a0) + move.l d4,-(sp) ! need more registers + move.w d2,d4 + mulu d1,d4 ! a = v.l x u.h + mulu d3,d1 ! r.l = u.l x v.l + mulu d0,d3 ! b = v.h x u.l + mulu d2,d0 ! r.h = u.h x v.h + swap d1 ! (just for simplicity) + add.w d4,d1 ! 
r += a << 16 + clr.w d4 + swap d4 + addx.l d4,d0 + add.w d3,d1 ! r += b << 16 + clr.w d3 + swap d3 + addx.l d3,d0 + swap d1 + move.l (sp)+,d4 ! return + bra 9b diff --git a/mach/m68k2/libem/mlu.s b/mach/m68k2/libem/mlu.s index f3146c091..2554fe6e0 100644 --- a/mach/m68k2/libem/mlu.s +++ b/mach/m68k2/libem/mlu.s @@ -4,38 +4,79 @@ .sect .data .sect .bss + ! unsigned long mulitply + !----------------------------------------------------------------------------- + ! rewritten by Kai-Uwe Bloem (i5110401@dbstu1.bitnet) for speed. + ! #1 01/12/90 initial revision + !----------------------------------------------------------------------------- + ! 3 cases worth to recognize : + ! 1) both the upper word of u and v are zero + ! => 1 mult : Low*Low + ! 2) only one of the upper words is zero + ! => 2 mult : Low*HighLow + ! 3) both upper words are not zero + ! => 4 mult : HighLow*HighLow + ! there are other cases (e.g. lower word is zero but high word is not, or + ! one operand is all zero). However, this seems not to be very common, so + ! they are ignored for the price of superfluous multiplications in these + ! cases. + !----------------------------------------------------------------------------- + ! entry : d0 multiplicand ! d1 multiplier ! exit : d0 high order result ! d1 low order result + ! d2,a0,a1 : destroyed .sect .text .mlu: - move.l (sp)+,a1 - move.l (sp)+,d1 - move.l (sp)+,d0 - movem.l d3/d4/d6,-(sp) - move.l d1,d3 - move.l d1,d2 - swap d2 - move.l d2,d4 - mulu d0,d1 - mulu d0,d2 - swap d0 - mulu d0,d3 - mulu d4,d0 - clr.l d6 - swap d1 - add d2,d1 - addx.l d6,d0 - add d3,d1 - addx.l d6,d0 - swap d1 - clr d2 - clr d3 - swap d2 + move.l (sp)+,a1 ! return address + move.l d3,a0 ! save register + movem.w (sp)+,d0-d3 ! get v and u + tst.w d0 + bne 1f ! case 2) or 3) + tst.w d2 + bne 2f ! case 2) +! === case 1: _l x _l === + mulu d3,d1 ! r.l = u.l x v.l + move.l a0,d3 ! (r.h is already zero) + jmp (a1) ! return +! 
=== possibly case 2) or case 3) === +1: + tst.w d2 + bne 3f ! case 3) +! === case 2: _l x hl === + exg d0,d2 ! exchange u and v + exg d1,d3 ! (minimizes number of distinct cases) +2: + mulu d1,d2 ! a = v.l x u.h + mulu d3,d1 ! r.l = v.l x u.l + swap d2 ! a = a << 16 + clr.l d3 + move.w d2,d3 + clr.w d2 + add.l d2,d1 ! r += a + addx.l d3,d0 + move.l a0,d3 ! return + jmp (a1) +! === case 3: hl x hl === +3: + move.l d4,-(sp) ! need more registers + move.w d2,d4 + mulu d1,d4 ! a = v.l x u.h + mulu d3,d1 ! r.l = u.l x v.l + mulu d0,d3 ! b = v.h x u.l + mulu d2,d0 ! r.h = u.h x v.h + swap d1 ! (just for simplicity) + add.w d4,d1 ! r += a << 16 + clr.w d4 + swap d4 + addx.l d4,d0 + add.w d3,d1 ! r += b << 16 + clr.w d3 swap d3 - add.l d2,d0 - add.l d3,d0 - movem.l (sp)+,d3/d4/d6 + addx.l d3,d0 + swap d1 + move.l (sp)+,d4 ! return + move.l a0,d3 jmp (a1)