From 0c3f133ed106996172dd2c106b22e38ce695e63d Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Fri, 12 Feb 2010 16:41:49 +0100 Subject: Use unsigned comparison in sse memcpy/memset --- ChangeLog | 11 ++++ sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S | 71 +++++++++++++------------- sysdeps/i386/i686/multiarch/memcpy-ssse3.S | 71 +++++++++++++------------- sysdeps/i386/i686/multiarch/memset-sse2-rep.S | 6 +-- sysdeps/i386/i686/multiarch/memset-sse2.S | 12 ++--- 5 files changed, 90 insertions(+), 81 deletions(-) diff --git a/ChangeLog b/ChangeLog index 7812fa6a48..cc4855fc27 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,14 @@ +2010-02-12 H.J. Lu + + * sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S: Use unsigned + conditional jumps. + (shl_0_gobble_cache_loop_tail): Removed. + * sysdeps/i386/i686/multiarch/memcpy-ssse3.S: Likewise. + + * sysdeps/i386/i686/multiarch/memset-sse2-rep.S: Use unsigned + conditional jumps. + * sysdeps/i386/i686/multiarch/memset-sse2.S: Likewise. + 2009-10-27 Aurelien Jarno [BZ #10855] diff --git a/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S b/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S index b26037d279..f85049185c 100644 --- a/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S +++ b/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S @@ -128,7 +128,7 @@ ENTRY (MEMCPY) jb L(copy_forward) je L(fwd_write_0bytes) cmp $32, %ecx - jge L(memmove_bwd) + jae L(memmove_bwd) jmp L(bk_write_less32bytes_2) L(memmove_bwd): add %ecx, %eax @@ -139,12 +139,12 @@ L(memmove_bwd): L(copy_forward): #endif cmp $48, %ecx - jge L(48bytesormore) + jae L(48bytesormore) L(fwd_write_less32bytes): #ifndef USE_AS_MEMMOVE cmp %dl, %al - jl L(bk_write) + jb L(bk_write) #endif add %ecx, %edx add %ecx, %eax @@ -181,7 +181,7 @@ L(48bytesormore): #endif mov %eax, %edi - jge L(large_page) + jae L(large_page) and $0xf, %edi jz L(shl_0) @@ -201,7 +201,7 @@ L(shl_0_loop): movdqa %xmm0, (%edx, %edi) movdqa %xmm1, 16(%edx, %edi) lea 32(%edi), %edi - jl L(shl_0_end) + jb L(shl_0_end) movdqa (%eax, %edi), %xmm0 movdqa 16(%eax, %edi), %xmm1 @@ -209,7 +209,7 @@ L(shl_0_loop): movdqa %xmm0, (%edx, %edi) movdqa %xmm1, 16(%edx, %edi) lea 32(%edi), %edi - jl L(shl_0_end) + jb L(shl_0_end) movdqa (%eax, %edi), %xmm0 movdqa 16(%eax, %edi), %xmm1 @@ -217,7 +217,7 @@ L(shl_0_loop): movdqa %xmm0, (%edx, %edi) movdqa %xmm1, 16(%edx, %edi) lea 32(%edi), %edi - jl L(shl_0_end) + jb L(shl_0_end) movdqa (%eax, %edi), %xmm0 movdqa 16(%eax, %edi), %xmm1 @@ -251,7 +251,7 @@ L(shl_0_gobble): shr $3, %esi sub %esi, %edi cmp %edi, %ecx - jge L(shl_0_gobble_mem_start) + jae L(shl_0_gobble_mem_start) lea -128(%ecx), %ecx ALIGN (4) L(shl_0_gobble_cache_loop): @@ -275,8 +275,7 @@ L(shl_0_gobble_cache_loop): movaps %xmm7, 0x70(%edx) lea 0x80(%edx), %edx - jge L(shl_0_gobble_cache_loop) -L(shl_0_gobble_cache_loop_tail): + jae L(shl_0_gobble_cache_loop) cmp $-0x40, %ecx lea 0x80(%ecx), %ecx jl L(shl_0_cache_less_64bytes) @@ -297,7 +296,7 @@ L(shl_0_gobble_cache_loop_tail): add $0x40, %edx L(shl_0_cache_less_64bytes): cmp $0x20, %ecx - jl L(shl_0_cache_less_32bytes) + jb L(shl_0_cache_less_32bytes) movdqa (%eax), %xmm0 sub $0x20, %ecx movdqa 0x10(%eax), %xmm1 @@ -307,7 +306,7 @@ L(shl_0_cache_less_64bytes): add $0x20, %edx L(shl_0_cache_less_32bytes): cmp $0x10, %ecx - jl L(shl_0_cache_less_16bytes) + jb L(shl_0_cache_less_16bytes) sub $0x10, %ecx movdqa (%eax), %xmm0 add $0x10, %eax @@ -352,7 +351,7 @@ L(shl_0_gobble_mem_loop): movaps %xmm7, 0x70(%edx) lea 0x80(%edx), %edx - jge L(shl_0_gobble_mem_loop) + jae L(shl_0_gobble_mem_loop) cmp $-0x40, %ecx lea 0x80(%ecx), %ecx jl L(shl_0_mem_less_64bytes) @@ -373,7 +372,7 @@ L(shl_0_gobble_mem_loop): add $0x40, %edx L(shl_0_mem_less_64bytes): cmp $0x20, %ecx - jl L(shl_0_mem_less_32bytes) + jb L(shl_0_mem_less_32bytes) movdqa (%eax), %xmm0 sub $0x20, %ecx movdqa 0x10(%eax), %xmm1 @@ -383,7 +382,7 @@ L(shl_0_mem_less_64bytes): add $0x20, %edx L(shl_0_mem_less_32bytes): cmp $0x10, %ecx - jl L(shl_0_mem_less_16bytes) + jb L(shl_0_mem_less_16bytes) sub $0x10, %ecx movdqa (%eax), %xmm0 add $0x10, %eax @@ -418,7 +417,7 @@ L(shl_1_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_1_end) + jb L(shl_1_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -461,7 +460,7 @@ L(shl_2_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_2_end) + jb L(shl_2_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -504,7 +503,7 @@ L(shl_3_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_3_end) + jb L(shl_3_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -547,7 +546,7 @@ L(shl_4_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_4_end) + jb L(shl_4_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -590,7 +589,7 @@ L(shl_5_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_5_end) + jb L(shl_5_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -634,7 +633,7 @@ L(shl_6_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_6_end) + jb L(shl_6_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -677,7 +676,7 @@ L(shl_7_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_7_end) + jb L(shl_7_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -720,7 +719,7 @@ L(shl_8_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_8_end) + jb L(shl_8_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -763,7 +762,7 @@ L(shl_9_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_9_end) + jb L(shl_9_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -806,7 +805,7 @@ L(shl_10_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_10_end) + jb L(shl_10_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -849,7 +848,7 @@ L(shl_11_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_11_end) + jb L(shl_11_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -892,7 +891,7 @@ L(shl_12_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_12_end) + jb L(shl_12_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -935,7 +934,7 @@ L(shl_13_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_13_end) + jb L(shl_13_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -978,7 +977,7 @@ L(shl_14_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_14_end) + jb L(shl_14_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -1022,7 +1021,7 @@ L(shl_15_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_15_end) + jb L(shl_15_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -1298,7 +1297,7 @@ L(large_page_loop): sub $0x40, %ecx L(large_page_less_64bytes): cmp $32, %ecx - jl L(large_page_less_32bytes) + jb L(large_page_less_32bytes) movdqu (%eax), %xmm0 movdqu 0x10(%eax), %xmm1 lea 0x20(%eax), %eax @@ -1665,11 +1664,11 @@ L(copy_backward): L(bk_aligned_4): cmp $64, %ecx - jge L(bk_write_more64bytes) + jae L(bk_write_more64bytes) L(bk_write_64bytesless): cmp $32, %ecx - jl L(bk_write_less32bytes) + jb L(bk_write_less32bytes) L(bk_write_more32bytes): /* Copy 32 bytes at a time. */ @@ -1704,7 +1703,7 @@ L(bk_write_less32bytes_2): ALIGN (4) L(bk_align): cmp $8, %ecx - jle L(bk_write_less32bytes) + jbe L(bk_write_less32bytes) testl $1, %edx /* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0, then (EDX & 2) must be != 0. */ @@ -1760,7 +1759,7 @@ L(bk_ssse3_align): L(bk_ssse3_cpy_pre): cmp $64, %ecx - jl L(bk_write_more32bytes) + jb L(bk_write_more32bytes) L(bk_ssse3_cpy): sub $64, %esi @@ -1775,7 +1774,7 @@ L(bk_ssse3_cpy): movdqu (%esi), %xmm0 movdqa %xmm0, (%edx) cmp $64, %ecx - jge L(bk_ssse3_cpy) + jae L(bk_ssse3_cpy) jmp L(bk_write_64bytesless) #endif diff --git a/sysdeps/i386/i686/multiarch/memcpy-ssse3.S b/sysdeps/i386/i686/multiarch/memcpy-ssse3.S index 749c82d379..c512b0e812 100644 --- a/sysdeps/i386/i686/multiarch/memcpy-ssse3.S +++ b/sysdeps/i386/i686/multiarch/memcpy-ssse3.S @@ -128,7 +128,7 @@ ENTRY (MEMCPY) jb L(copy_forward) je L(fwd_write_0bytes) cmp $32, %ecx - jge L(memmove_bwd) + jae L(memmove_bwd) jmp L(bk_write_less32bytes_2) L(memmove_bwd): add %ecx, %eax @@ -139,12 +139,12 @@ L(memmove_bwd): L(copy_forward): #endif cmp $48, %ecx - jge L(48bytesormore) + jae L(48bytesormore) L(fwd_write_less32bytes): #ifndef USE_AS_MEMMOVE cmp %dl, %al - jl L(bk_write) + jb L(bk_write) #endif add %ecx, %edx add %ecx, %eax @@ -181,7 +181,7 @@ L(48bytesormore): #endif mov %eax, %edi - jge L(large_page) + jae L(large_page) and $0xf, %edi jz L(shl_0) @@ -202,7 +202,7 @@ L(shl_0_loop): movdqa %xmm0, (%edx, %edi) movdqa %xmm1, 16(%edx, %edi) lea 32(%edi), %edi - jl L(shl_0_end) + jb L(shl_0_end) movdqa (%eax, %edi), %xmm0 movdqa 16(%eax, %edi), %xmm1 @@ -210,7 +210,7 @@ L(shl_0_loop): movdqa %xmm0, (%edx, %edi) movdqa %xmm1, 16(%edx, %edi) lea 32(%edi), %edi - jl L(shl_0_end) + jb L(shl_0_end) movdqa (%eax, %edi), %xmm0 movdqa 16(%eax, %edi), %xmm1 @@ -218,7 +218,7 @@ L(shl_0_loop): movdqa %xmm0, (%edx, %edi) movdqa %xmm1, 16(%edx, %edi) lea 32(%edi), %edi - jl L(shl_0_end) + jb L(shl_0_end) movdqa (%eax, %edi), %xmm0 movdqa 16(%eax, %edi), %xmm1 @@ -250,7 +250,7 @@ L(shl_0_gobble): POP (%edi) lea -128(%ecx), %ecx - jge L(shl_0_gobble_mem_loop) + jae L(shl_0_gobble_mem_loop) L(shl_0_gobble_cache_loop): movdqa (%eax), %xmm0 movdqa 0x10(%eax), %xmm1 @@ -272,8 +272,7 @@ L(shl_0_gobble_cache_loop): movdqa %xmm7, 0x70(%edx) lea 0x80(%edx), %edx - jge L(shl_0_gobble_cache_loop) -L(shl_0_gobble_cache_loop_tail): + jae L(shl_0_gobble_cache_loop) cmp $-0x40, %ecx lea 0x80(%ecx), %ecx jl L(shl_0_cache_less_64bytes) @@ -294,7 +293,7 @@ L(shl_0_gobble_cache_loop_tail): add $0x40, %edx L(shl_0_cache_less_64bytes): cmp $0x20, %ecx - jl L(shl_0_cache_less_32bytes) + jb L(shl_0_cache_less_32bytes) movdqa (%eax), %xmm0 sub $0x20, %ecx movdqa 0x10(%eax), %xmm1 @@ -304,7 +303,7 @@ L(shl_0_cache_less_64bytes): add $0x20, %edx L(shl_0_cache_less_32bytes): cmp $0x10, %ecx - jl L(shl_0_cache_less_16bytes) + jb L(shl_0_cache_less_16bytes) sub $0x10, %ecx movdqa (%eax), %xmm0 add $0x10, %eax @@ -342,7 +341,7 @@ L(shl_0_gobble_mem_loop): movdqa %xmm7, 0x70(%edx) lea 0x80(%edx), %edx - jge L(shl_0_gobble_mem_loop) + jae L(shl_0_gobble_mem_loop) cmp $-0x40, %ecx lea 0x80(%ecx), %ecx jl L(shl_0_mem_less_64bytes) @@ -363,7 +362,7 @@ L(shl_0_gobble_mem_loop): add $0x40, %edx L(shl_0_mem_less_64bytes): cmp $0x20, %ecx - jl L(shl_0_mem_less_32bytes) + jb L(shl_0_mem_less_32bytes) movdqa (%eax), %xmm0 sub $0x20, %ecx movdqa 0x10(%eax), %xmm1 @@ -373,7 +372,7 @@ L(shl_0_mem_less_64bytes): add $0x20, %edx L(shl_0_mem_less_32bytes): cmp $0x10, %ecx - jl L(shl_0_mem_less_16bytes) + jb L(shl_0_mem_less_16bytes) sub $0x10, %ecx movdqa (%eax), %xmm0 add $0x10, %eax @@ -406,7 +405,7 @@ L(shl_1_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_1_end) + jb L(shl_1_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -449,7 +448,7 @@ L(shl_2_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_2_end) + jb L(shl_2_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -492,7 +491,7 @@ L(shl_3_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_3_end) + jb L(shl_3_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -535,7 +534,7 @@ L(shl_4_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_4_end) + jb L(shl_4_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -578,7 +577,7 @@ L(shl_5_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_5_end) + jb L(shl_5_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -622,7 +621,7 @@ L(shl_6_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_6_end) + jb L(shl_6_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -665,7 +664,7 @@ L(shl_7_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_7_end) + jb L(shl_7_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -708,7 +707,7 @@ L(shl_8_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_8_end) + jb L(shl_8_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -751,7 +750,7 @@ L(shl_9_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_9_end) + jb L(shl_9_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -794,7 +793,7 @@ L(shl_10_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_10_end) + jb L(shl_10_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -837,7 +836,7 @@ L(shl_11_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_11_end) + jb L(shl_11_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -880,7 +879,7 @@ L(shl_12_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_12_end) + jb L(shl_12_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -923,7 +922,7 @@ L(shl_13_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_13_end) + jb L(shl_13_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -966,7 +965,7 @@ L(shl_14_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_14_end) + jb L(shl_14_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -1010,7 +1009,7 @@ L(shl_15_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_15_end) + jb L(shl_15_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -1281,7 +1280,7 @@ L(large_page_loop): sub $0x40, %ecx L(large_page_less_64bytes): cmp $32, %ecx - jl L(large_page_less_32bytes) + jb L(large_page_less_32bytes) movdqu (%eax), %xmm0 movdqu 0x10(%eax), %xmm1 lea 0x20(%eax), %eax @@ -1617,11 +1616,11 @@ L(copy_backward): L(bk_aligned_4): cmp $64, %ecx - jge L(bk_write_more64bytes) + jae L(bk_write_more64bytes) L(bk_write_64bytesless): cmp $32, %ecx - jl L(bk_write_less32bytes) + jb L(bk_write_less32bytes) L(bk_write_more32bytes): /* Copy 32 bytes at a time. */ @@ -1656,7 +1655,7 @@ L(bk_write_less32bytes_2): ALIGN (4) L(bk_align): cmp $8, %ecx - jle L(bk_write_less32bytes) + jbe L(bk_write_less32bytes) testl $1, %edx /* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0, then (EDX & 2) must be != 0. */ @@ -1712,7 +1711,7 @@ L(bk_ssse3_align): L(bk_ssse3_cpy_pre): cmp $64, %ecx - jl L(bk_write_more32bytes) + jb L(bk_write_more32bytes) L(bk_ssse3_cpy): sub $64, %esi @@ -1727,7 +1726,7 @@ L(bk_ssse3_cpy): movdqu (%esi), %xmm0 movdqa %xmm0, (%edx) cmp $64, %ecx - jge L(bk_ssse3_cpy) + jae L(bk_ssse3_cpy) jmp L(bk_write_64bytesless) #endif diff --git a/sysdeps/i386/i686/multiarch/memset-sse2-rep.S b/sysdeps/i386/i686/multiarch/memset-sse2-rep.S index 84afffeb66..d4bf9b7d3e 100644 --- a/sysdeps/i386/i686/multiarch/memset-sse2-rep.S +++ b/sysdeps/i386/i686/multiarch/memset-sse2-rep.S @@ -261,7 +261,7 @@ L(not_aligned_16): ALIGN (4) L(aligned_16): cmp $128, %ecx - jge L(128bytesormore) + jae L(128bytesormore) L(aligned_16_less128bytes): BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) @@ -306,7 +306,7 @@ L(128bytesormore_normal): movdqa %xmm0, 0x60(%edx) movdqa %xmm0, 0x70(%edx) lea 128(%edx), %edx - jl L(128bytesless_normal) + jb L(128bytesless_normal) sub $128, %ecx @@ -319,7 +319,7 @@ L(128bytesormore_normal): movdqa %xmm0, 0x60(%edx) movdqa %xmm0, 0x70(%edx) lea 128(%edx), %edx - jge L(128bytesormore_normal) + jae L(128bytesormore_normal) L(128bytesless_normal): POP (%edi) diff --git a/sysdeps/i386/i686/multiarch/memset-sse2.S b/sysdeps/i386/i686/multiarch/memset-sse2.S index b2b979193e..00e552e44e 100644 --- a/sysdeps/i386/i686/multiarch/memset-sse2.S +++ b/sysdeps/i386/i686/multiarch/memset-sse2.S @@ -261,7 +261,7 @@ L(not_aligned_16): ALIGN (4) L(aligned_16): cmp $128, %ecx - jge L(128bytesormore) + jae L(128bytesormore) L(aligned_16_less128bytes): BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) @@ -312,7 +312,7 @@ L(128bytesormore_normal): movdqa %xmm0, 0x60(%edx) movdqa %xmm0, 0x70(%edx) lea 128(%edx), %edx - jl L(128bytesless_normal) + jb L(128bytesless_normal) sub $128, %ecx @@ -325,7 +325,7 @@ L(128bytesormore_normal): movdqa %xmm0, 0x60(%edx) movdqa %xmm0, 0x70(%edx) lea 128(%edx), %edx - jge L(128bytesormore_normal) + jae L(128bytesormore_normal) L(128bytesless_normal): lea 128(%ecx), %ecx @@ -346,7 +346,7 @@ L(128bytes_L2_normal): movaps %xmm0, 0x70(%edx) add $128, %edx cmp $128, %ecx - jge L(128bytes_L2_normal) + jae L(128bytes_L2_normal) L(128bytesless_L2_normal): BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) @@ -368,7 +368,7 @@ L(128bytesormore_shared_cache_loop): movdqa %xmm0, 0x70(%edx) add $0x80, %edx cmp $0x80, %ebx - jge L(128bytesormore_shared_cache_loop) + jae L(128bytesormore_shared_cache_loop) cmp $0x80, %ecx jb L(shared_cache_loop_end) ALIGN (4) @@ -384,7 +384,7 @@ L(128bytesormore_nt): movntdq %xmm0, 0x70(%edx) add $0x80, %edx cmp $0x80, %ecx - jge L(128bytesormore_nt) + jae L(128bytesormore_nt) sfence L(shared_cache_loop_end): #if defined DATA_CACHE_SIZE || !defined SHARED -- cgit v1.2.3