diff options
Diffstat (limited to 'sysdeps/x86_64/memchr.S')
-rw-r--r-- | sysdeps/x86_64/memchr.S | 103 |
1 files changed, 61 insertions, 42 deletions
diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S index 132eacba8f..feef5d4f24 100644 --- a/sysdeps/x86_64/memchr.S +++ b/sysdeps/x86_64/memchr.S @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. @@ -18,26 +18,40 @@ #include <sysdep.h> +#ifdef USE_AS_WMEMCHR +# define MEMCHR wmemchr +# define PCMPEQ pcmpeqd +#else +# define MEMCHR memchr +# define PCMPEQ pcmpeqb +#endif + /* fast SSE2 version with using pmaxub and 64 byte loop */ .text -ENTRY(memchr) - movd %rsi, %xmm1 - mov %rdi, %rcx +ENTRY(MEMCHR) + movd %esi, %xmm1 + mov %edi, %ecx +#ifdef USE_AS_WMEMCHR + test %rdx, %rdx + jz L(return_null) + shl $2, %rdx +#else punpcklbw %xmm1, %xmm1 test %rdx, %rdx jz L(return_null) punpcklbw %xmm1, %xmm1 +#endif - and $63, %rcx + and $63, %ecx pshufd $0, %xmm1, %xmm1 - cmp $48, %rcx + cmp $48, %ecx ja L(crosscache) movdqu (%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 + PCMPEQ %xmm1, %xmm0 pmovmskb %xmm0, %eax test %eax, %eax @@ -45,7 +59,7 @@ ENTRY(memchr) sub $16, %rdx jbe L(return_null) add $16, %rdi - and $15, %rcx + and $15, %ecx and $-16, %rdi add %rcx, %rdx sub $64, %rdx @@ -54,11 +68,11 @@ ENTRY(memchr) .p2align 4 L(crosscache): - and $15, %rcx + and $15, %ecx and $-16, %rdi movdqa (%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 + PCMPEQ %xmm1, %xmm0 /* Check if there is a match. */ pmovmskb %xmm0, %eax /* Remove the leading bytes. */ @@ -76,8 +90,12 @@ L(crosscache): .p2align 4 L(unaligned_no_match): - add %rcx, %rdx - sub $16, %rdx + /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using + "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to avoid + possible addition overflow. 
*/ + neg %rcx + add $16, %rcx + sub %rcx, %rdx jbe L(return_null) add $16, %rdi sub $64, %rdx @@ -86,25 +104,25 @@ L(unaligned_no_match): .p2align 4 L(loop_prolog): movdqa (%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 + PCMPEQ %xmm1, %xmm0 pmovmskb %xmm0, %eax test %eax, %eax jnz L(matches) movdqa 16(%rdi), %xmm2 - pcmpeqb %xmm1, %xmm2 + PCMPEQ %xmm1, %xmm2 pmovmskb %xmm2, %eax test %eax, %eax jnz L(matches16) movdqa 32(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 + PCMPEQ %xmm1, %xmm3 pmovmskb %xmm3, %eax test %eax, %eax jnz L(matches32) movdqa 48(%rdi), %xmm4 - pcmpeqb %xmm1, %xmm4 + PCMPEQ %xmm1, %xmm4 add $64, %rdi pmovmskb %xmm4, %eax test %eax, %eax @@ -117,25 +135,25 @@ L(loop_prolog): jbe L(exit_loop) movdqa (%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 + PCMPEQ %xmm1, %xmm0 pmovmskb %xmm0, %eax test %eax, %eax jnz L(matches) movdqa 16(%rdi), %xmm2 - pcmpeqb %xmm1, %xmm2 + PCMPEQ %xmm1, %xmm2 pmovmskb %xmm2, %eax test %eax, %eax jnz L(matches16) movdqa 32(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 + PCMPEQ %xmm1, %xmm3 pmovmskb %xmm3, %eax test %eax, %eax jnz L(matches32) movdqa 48(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 + PCMPEQ %xmm1, %xmm3 pmovmskb %xmm3, %eax add $64, %rdi @@ -144,7 +162,7 @@ L(loop_prolog): mov %rdi, %rcx and $-64, %rdi - and $63, %rcx + and $63, %ecx add %rcx, %rdx .p2align 4 @@ -156,10 +174,10 @@ L(align64_loop): movdqa 32(%rdi), %xmm3 movdqa 48(%rdi), %xmm4 - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm1, %xmm2 - pcmpeqb %xmm1, %xmm3 - pcmpeqb %xmm1, %xmm4 + PCMPEQ %xmm1, %xmm0 + PCMPEQ %xmm1, %xmm2 + PCMPEQ %xmm1, %xmm3 + PCMPEQ %xmm1, %xmm4 pmaxub %xmm0, %xmm3 pmaxub %xmm2, %xmm4 @@ -182,9 +200,9 @@ L(align64_loop): jnz L(matches16) movdqa 32(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 + PCMPEQ %xmm1, %xmm3 - pcmpeqb 48(%rdi), %xmm1 + PCMPEQ 48(%rdi), %xmm1 pmovmskb %xmm3, %eax test %eax, %eax jnz L(matches32) @@ -196,52 +214,52 @@ L(align64_loop): .p2align 4 L(exit_loop): - add $32, %rdx + add $32, %edx jle L(exit_loop_32) movdqa (%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 + PCMPEQ %xmm1, 
%xmm0 pmovmskb %xmm0, %eax test %eax, %eax jnz L(matches) movdqa 16(%rdi), %xmm2 - pcmpeqb %xmm1, %xmm2 + PCMPEQ %xmm1, %xmm2 pmovmskb %xmm2, %eax test %eax, %eax jnz L(matches16) movdqa 32(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 + PCMPEQ %xmm1, %xmm3 pmovmskb %xmm3, %eax test %eax, %eax jnz L(matches32_1) - sub $16, %rdx + sub $16, %edx jle L(return_null) - pcmpeqb 48(%rdi), %xmm1 + PCMPEQ 48(%rdi), %xmm1 pmovmskb %xmm1, %eax test %eax, %eax jnz L(matches48_1) - xor %rax, %rax + xor %eax, %eax ret .p2align 4 L(exit_loop_32): - add $32, %rdx + add $32, %edx movdqa (%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 + PCMPEQ %xmm1, %xmm0 pmovmskb %xmm0, %eax test %eax, %eax jnz L(matches_1) - sub $16, %rdx + sub $16, %edx jbe L(return_null) - pcmpeqb 16(%rdi), %xmm1 + PCMPEQ 16(%rdi), %xmm1 pmovmskb %xmm1, %eax test %eax, %eax jnz L(matches16_1) - xor %rax, %rax + xor %eax, %eax ret .p2align 4 @@ -302,10 +320,11 @@ L(matches48_1): .p2align 4 L(return_null): - xor %rax, %rax + xor %eax, %eax ret -END(memchr) +END(MEMCHR) +#ifndef USE_AS_WMEMCHR strong_alias (memchr, __memchr) - libc_hidden_builtin_def(memchr) +#endif |