From: George Koehler Date: Wed, 18 Oct 2017 16:12:42 +0000 (-0400) Subject: Use lwzu, stwu to tighten more loops. X-Git-Url: https://git.ndcode.org/public/gitweb.cgi?a=commitdiff_plain;h=459a9b5949a3ab4396af2628aea0c229383ba6f2;p=ack.git Use lwzu, stwu to tighten more loops. Because lwzu or stwu moves the pointer, I can remove an addi instruction from the loop, so the loop is slightly faster. I wrote a benchmark in Modula-2 that exercises some of these loops. I measured its time on my old PowerPC Mac. Its user time decreases from 8.401s to 8.217s with the tighter loops. --- diff --git a/mach/powerpc/libem/and.s b/mach/powerpc/libem/and.s index cf5feee4d..2ab910804 100644 --- a/mach/powerpc/libem/and.s +++ b/mach/powerpc/libem/and.s @@ -1,22 +1,20 @@ .sect .text ! Set intersection. -! Stack: ( b a size -- a*b ) +! Stack: ( a b size -- a*b ) .define .and .and: lwz r3, 0(sp) ! r3 = size srwi r7, r3, 2 mtspr ctr, r7 ! ctr = size / 4 - addi r4, sp, 4 ! r4 = ptr to set a - add r5, r4, r3 ! r5 = ptr to set b - li r6, 0 ! r6 = index -1: - lwzx r7, r4, r6 - lwzx r8, r5, r6 - and r8, r7, r8 ! intersection of words - stwx r8, r5, r6 - addi r6, r6, 4 + add r4, sp, r3 ! r4 = pointer before set a + + ! Loop with r4 in set a and sp in set b. +1: lwzu r5, 4(r4) + lwzu r6, 4(sp) + and r7, r5, r6 ! intersection of words + stw r7, 0(r4) bdnz 1b ! loop ctr times - mr sp, r5 + addi sp, sp, 4 ! drop last word of set b blr diff --git a/mach/powerpc/libem/cms.s b/mach/powerpc/libem/cms.s index 0bcb1ab0b..a9eb6df97 100644 --- a/mach/powerpc/libem/cms.s +++ b/mach/powerpc/libem/cms.s @@ -1,7 +1,7 @@ .sect .text ! Compare sets a, b. -! Stack: ( b a size -- result ) +! Stack: ( a b size -- result ) ! Result is 0 if equal, nonzero if not equal. .define .cms @@ -9,22 +9,19 @@ lwz r3, 0(sp) ! r3 = size of each set srwi r7, r3, 2 mtspr ctr, r7 ! ctr = size / 4 - addi r4, sp, 4 ! r4 = ptr to set a - add r5, r4, r3 ! r5 = ptr to set b - li r6, 0 ! 
r6 = index -1: - lwzx r7, r4, r6 - lwzx r8, r5, r6 - cmpw cr0, r7, r8 ! compare words in sets - addi r6, r6, 4 - bne cr0, 2f ! branch if not equal + add r4, sp, r3 ! r4 = pointer before set a + add r7, r4, r3 ! r7 = pointer to store result + + ! Loop with r4 in set a and sp in set b. +1: lwzu r5, 4(r4) + lwzu r6, 4(sp) + cmpw r5, r6 ! compare words + bne 2f ! branch if not equal bdnz 1b ! loop ctr times - li r9, 0 ! equal: return 0 + + li r3, 0 ! equal: return 0 b 3f -2: - li r9, 1 ! not equal: return 1 -3: - slwi r7, r3, 1 - add sp, sp, r7 ! adjust stack pointer - stw r9, 0(sp) ! push result +2: li r3, 1 ! not equal: return 1 +3: mr sp, r7 + stw r3, 0(sp) ! push result blr diff --git a/mach/powerpc/libem/com.s b/mach/powerpc/libem/com.s index 3168cfe17..9e1acabda 100644 --- a/mach/powerpc/libem/com.s +++ b/mach/powerpc/libem/com.s @@ -5,16 +5,15 @@ .define .com .com: - lwz r3, 0 (sp) ! size - addi sp, sp, 4 + lwz r3, 0(sp) ! r3 = size + srwi r7, r3, 2 + mtspr ctr, r7 ! ctr = size / 4 + mr r4, sp ! r4 = pointer before set a - mr r4, sp ! r4 = pointer to set a - srwi r5, r3, 2 - mtspr ctr, r5 ! ctr = r3 / 4 -1: - lwz r6, 0(r4) - nor r6, r6, r6 ! complement of word - stw r6, 0(r4) - addi r4, r4, 4 + ! Loop with r4 in set a. +1: lwzu r5, 4(r4) + nor r7, r5, r5 ! complement of word + stw r7, 0(r4) bdnz 1b ! loop ctr times + addi sp, sp, 4 ! drop size from stack blr diff --git a/mach/powerpc/libem/ior.s b/mach/powerpc/libem/ior.s index b4b0b3fae..952c8b8ab 100644 --- a/mach/powerpc/libem/ior.s +++ b/mach/powerpc/libem/ior.s @@ -1,22 +1,20 @@ .sect .text ! Set union. -! Stack: ( b a size -- a+b ) +! Stack: ( a b size -- a+b ) .define .ior .ior: lwz r3, 0(sp) ! r3 = size srwi r7, r3, 2 mtspr ctr, r7 ! ctr = size / 4 - addi r4, sp, 4 ! r4 = ptr to set a - add r5, r4, r3 ! r5 = ptr to set b - li r6, 0 ! r6 = index -1: - lwzx r7, r4, r6 - lwzx r8, r5, r6 - or r8, r7, r8 ! union of words - stwx r8, r5, r6 - addi r6, r6, 4 + add r4, sp, r3 ! r4 = pointer before set a + + ! 
Loop with r4 in set a and sp in set b. +1: lwzu r5, 4(r4) + lwzu r6, 4(sp) + or r7, r5, r6 ! union of words + stw r7, 0(r4) bdnz 1b ! loop ctr times - mr sp, r5 + addi sp, sp, 4 ! drop last word of set b blr diff --git a/mach/powerpc/libem/xor.s b/mach/powerpc/libem/xor.s index 6dc4e7afc..ceb08b6ac 100644 --- a/mach/powerpc/libem/xor.s +++ b/mach/powerpc/libem/xor.s @@ -1,22 +1,20 @@ .sect .text ! Set symmetric difference. -! Stack: ( b a size -- a/b ) +! Stack: ( a b size -- a/b ) .define .xor .xor: lwz r3, 0(sp) ! r3 = size srwi r7, r3, 2 mtspr ctr, r7 ! ctr = size / 4 - addi r4, sp, 4 ! r4 = ptr to set a - add r5, r4, r3 ! r5 = ptr to set b - li r6, 0 ! r6 = index -1: - lwzx r7, r4, r6 - lwzx r8, r5, r6 - xor r8, r7, r8 ! symmetric difference of words - stwx r8, r5, r6 - addi r6, r6, 4 + add r4, sp, r3 ! r4 = pointer before set a + + ! Loop with r4 in set a and sp in set b. +1: lwzu r5, 4(r4) + lwzu r6, 4(sp) + xor r7, r5, r6 ! symmetric difference of words + stw r7, 0(r4) bdnz 1b ! loop ctr times - mr sp, r5 + addi sp, sp, 4 ! drop last word of set b blr diff --git a/mach/powerpc/libem/zer.s b/mach/powerpc/libem/zer.s index d35744bba..1c67da332 100644 --- a/mach/powerpc/libem/zer.s +++ b/mach/powerpc/libem/zer.s @@ -6,14 +6,11 @@ .define .zer .zer: lwz r3, 0(sp) ! r3 = size - srwi r5, r3, 2 - mtspr ctr, r5 ! ctr = word size - 4 - li r4, 0 ! r4 = 0 - addi sp, sp, 4 - subf sp, r3, sp ! sp = ptr to new set - li r6, 0 ! r6 = index -1: - stwx r4, sp, r6 ! store zero in set - addi r6, r6, 4 + srwi r7, r3, 2 + mtspr ctr, r7 ! ctr = size / 4 + addi sp, sp, 4 ! drop size from stack + li r4, 0 + +1: stwu r4, -4(sp) ! push zero bdnz 1b ! loop ctr times blr diff --git a/mach/powerpc/ncg/table b/mach/powerpc/ncg/table index daf52b385..61af44a62 100644 --- a/mach/powerpc/ncg/table +++ b/mach/powerpc/ncg/table @@ -1897,6 +1897,11 @@ PATTERNS gen move %2, r4 leaving ret 0 + /* + * These rules for blm/bls are wrong if length is zero. + * So are several procedures in libem. 
+ */ + pat blm /* Block move constant length */ leaving loc $1 @@ -1904,15 +1909,15 @@ PATTERNS pat bls /* Block move variable length */ with REG REG REG - uses reusing %1, REG, REG={CONST_0000_7FFF, 0} + /* ( src%3 dst%2 len%1 -- ) */ + uses reusing %1, REG, REG, REG gen - /* Wrong if size is zero */ - srwi %1, %1, {CONST, 2} - mtspr ctr, %1 - 1: - lwzx %a, %3, %b - stwx %a, %2, %b - addi %b, %b, {CONST, 4} + srwi %a, %1, {CONST, 2} + mtspr ctr, %a + addi %b, %3, {CONST, 0-4} + addi %c, %2, {CONST, 0-4} + 1: lwzu %a, {IND_RC_W, %b, 4} + stwu %a, {IND_RC_W, %c, 4} bdnz {LABEL, "1b"} pat csa /* Array-lookup switch */ @@ -1987,8 +1992,7 @@ PATTERNS REG={CONST_0000_7FFF, $1-1} gen mtspr ctr, %b - 1: - lwz %a, {IND_RC_W, %a, SL_OFFSET} + 1: lwz %a, {IND_RC_W, %a, SL_OFFSET} bdnz {LABEL, "1b"} yields %a