author     H.J. Lu <hongjiu.lu@intel.com>       2009-07-16 07:00:34 -0700
committer  Ulrich Drepper <drepper@redhat.com>  2009-07-16 07:00:34 -0700
commit     e26c9b84155f31b37730fec7621f1d9a805b314d (patch)
tree       f0898b366e850d2a71a48bfac0deeefead4d6f97 /sysdeps
parent     ca419225a3c4f9f341eddf582b201211d1bf2aec (diff)
memcmp implementation for x86-64 using SSE2.
Diffstat (limited to 'sysdeps')
-rw-r--r--  sysdeps/x86_64/memcmp.S  |  359
1 file changed, 359 insertions, 0 deletions
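
For readers skimming the patch: the one SSE2 idiom it uses throughout is to compare 16 bytes with pcmpeqb, collapse the result into a 16-bit mask with pmovmskb, and locate the first mismatching byte with bsf. Below is a minimal C sketch of that building block using SSE2 intrinsics (illustrative only, not part of the patch; cmp16 is a made-up helper name, and the ctz on the inverted mask is equivalent to the assembly's bsf on mask - 0xffff):

#include <emmintrin.h>  /* SSE2 intrinsics */
#include <stdio.h>

/* Compare one 16-byte block the way the assembly does and return a
   memcmp-style result: 0 if equal, otherwise the difference of the
   first mismatching bytes.  */
int
cmp16 (const unsigned char *s1, const unsigned char *s2)
{
  __m128i a = _mm_loadu_si128 ((const __m128i *) s1);  /* movdqu (%rdi) */
  __m128i b = _mm_loadu_si128 ((const __m128i *) s2);  /* movdqu (%rdi,%rsi) */
  __m128i eq = _mm_cmpeq_epi8 (a, b);                  /* pcmpeqb */
  unsigned int mask = _mm_movemask_epi8 (eq);          /* pmovmskb */
  if (mask == 0xffff)                                  /* subl $0xffff, %edx; jz */
    return 0;
  unsigned int i = __builtin_ctz (~mask & 0xffff);     /* GCC/Clang builtin; like bsf */
  return s1[i] - s2[i];
}

int
main (void)
{
  unsigned char x[] = "0123456789abcdef";
  unsigned char y[] = "0123456789abcdeX";
  printf ("%d\n", cmp16 (x, y));  /* positive, since 'f' > 'X' */
  return 0;
}

The patch applies this step with movdqu loads while either pointer may be misaligned, and with movdqa once both pointers are known to be 16-byte aligned (the L(ATR) paths).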
diff --git a/sysdeps/x86_64/memcmp.S b/sysdeps/x86_64/memcmp.S
new file mode 100644
index 0000000000..165f42e17d
--- /dev/null
+++ b/sysdeps/x86_64/memcmp.S
@@ -0,0 +1,359 @@
+/* memcmp with SSE2
+ Copyright (C) 2009 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+
+ .text
+ENTRY (memcmp)
+ test %rdx, %rdx /* Zero length compares equal.  */
+ jz L(finz)
+ cmpq $1, %rdx
+ jle L(finr1b) /* Single byte: compare it directly.  */
+ subq %rdi, %rsi /* From now on %rsi = s2 - s1, so (%rdi,%rsi) addresses s2.  */
+ movq %rdx, %r10 /* %r10 = remaining byte count.  */
+ cmpq $32, %r10
+ jge L(gt32)
+ /* Handle small chunks and the last block of fewer than 32 bytes. */
+L(small):
+ testq $1, %r10
+ jz L(s2b)
+ movzbl (%rdi), %eax
+ movzbl (%rdi, %rsi), %edx
+ subq $1, %r10
+ je L(finz1)
+ addq $1, %rdi
+ subl %edx, %eax
+ jnz L(exit)
+L(s2b):
+ testq $2, %r10
+ jz L(s4b)
+ movzwl (%rdi), %eax
+ movzwl (%rdi, %rsi), %edx
+ subq $2, %r10
+ je L(fin2_7)
+ addq $2, %rdi
+ cmpl %edx, %eax
+ jnz L(fin2_7)
+L(s4b):
+ testq $4, %r10
+ jz L(s8b)
+ movl (%rdi), %eax
+ movl (%rdi, %rsi), %edx
+ subq $4, %r10
+ je L(fin2_7)
+ addq $4, %rdi
+ cmpl %edx, %eax
+ jnz L(fin2_7)
+L(s8b):
+ testq $8, %r10
+ jz L(s16b)
+ movq (%rdi), %rax
+ movq (%rdi, %rsi), %rdx
+ subq $8, %r10
+ je L(fin2_7)
+ addq $8, %rdi
+ cmpq %rdx, %rax
+ jnz L(fin2_7)
+L(s16b):
+ movdqu (%rdi), %xmm1
+ movdqu (%rdi, %rsi), %xmm0
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ xorl %eax, %eax
+ subl $0xffff, %edx
+ jz L(finz)
+ bsfl %edx, %ecx
+ leaq (%rdi, %rcx), %rcx
+ movzbl (%rcx), %eax
+ movzbl (%rsi, %rcx), %edx
+ jmp L(finz1)
+
+ .p2align 4,, 4
+L(finr1b):
+ movzbl (%rdi), %eax
+ movzbl (%rsi), %edx
+L(finz1):
+ subl %edx, %eax
+L(exit):
+ ret
+
+ .p2align 4,, 4
+L(fin2_7):
+ cmpq %rdx, %rax
+ jz L(finz)
+ movq %rax, %r11
+ subq %rdx, %r11
+ bsfq %r11, %rcx
+ sarq $3, %rcx
+ salq $3, %rcx
+ sarq %cl, %rax
+ movzbl %al, %eax
+ sarq %cl, %rdx
+ movzbl %dl, %edx
+ subl %edx, %eax
+ ret
+
+ .p2align 4,, 4
+L(finz):
+ xorl %eax, %eax
+ ret
+
+ /* For blocks bigger than 32 bytes
+    1. Advance one of the addr pointers to be 16B aligned.
+    2. Treat the case of both addr pointers aligned to 16B
+       separately to avoid movdqu.
+    3. Handle any blocks of greater than 64 consecutive bytes with
+       unrolling to reduce branches.
+    4. At least one addr pointer is 16B aligned, so use the memory
+       operand form of pcmpeqb.
+ */
+ .p2align 4,, 4
+L(gt32):
+ movq %rdx, %r11
+ addq %rdi, %r11 /* %r11 = s1 + n, one past the last byte.  */
+ movq %rdi, %r8
+
+ andq $15, %r8 /* %r8 = misalignment of s1 within a 16B block.  */
+ jz L(16am)
+ /* Both pointers may be misaligned. */
+ movdqu (%rdi), %xmm1
+ movdqu (%rdi, %rsi), %xmm0
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ neg %r8
+ leaq 16(%rdi, %r8), %rdi
+L(16am):
+ /* Handle two 16B aligned pointers separately. */
+ testq $15, %rsi
+ jz L(ATR)
+ testq $16, %rdi
+ jz L(A32)
+ movdqu (%rdi, %rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+L(A32):
+ movq %r11, %r10
+ andq $-32, %r10
+ cmpq %r10, %rdi
+ jge L(mt16)
+ /* Pre-unroll to be ready for unrolled 64B loop. */
+ testq $32, %rdi
+ jz L(A64)
+ movdqu (%rdi,%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+
+ movdqu (%rdi,%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+
+L(A64):
+ movq %r11, %r10
+ andq $-64, %r10
+ cmpq %r10, %rdi
+ jge L(mt32)
+
+L(A64main):
+ movdqu (%rdi,%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+
+ movdqu (%rdi,%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+
+ movdqu (%rdi,%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+
+ movdqu (%rdi,%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+
+ cmpq %rdi, %r10
+ jne L(A64main)
+
+L(mt32):
+ movq %r11, %r10
+ andq $-32, %r10
+ cmpq %r10, %rdi
+ jge L(mt16)
+
+L(A32main):
+ movdqu (%rdi,%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+
+ movdqu (%rdi,%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+
+ cmpq %rdi, %r10
+ jne L(A32main)
+L(mt16):
+ subq %rdi, %r11
+ je L(finz)
+ movq %r11, %r10
+ jmp L(small)
+
+ .p2align 4,, 4
+L(neq):
+ bsfl %edx, %ecx
+ movzbl (%rdi, %rcx), %eax
+ addq %rdi, %rsi
+ movzbl (%rsi,%rcx), %edx
+ jmp L(finz1)
+
+ .p2align 4,, 4
+L(ATR):
+ movq %r11, %r10
+ andq $-32, %r10
+ cmpq %r10, %rdi
+ jge L(mt16)
+ testq $16, %rdi
+ jz L(ATR32)
+
+ movdqa (%rdi,%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+ cmpq %rdi, %r10
+ je L(mt16)
+
+L(ATR32):
+ movq %r11, %r10
+ andq $-64, %r10
+ testq $32, %rdi
+ jz L(ATR64)
+
+ movdqa (%rdi,%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+
+ movdqa (%rdi,%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+
+L(ATR64):
+ cmpq %rdi, %r10
+ je L(mt32)
+
+L(ATR64main):
+ movdqa (%rdi,%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+
+ movdqa (%rdi,%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+
+ movdqa (%rdi,%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+
+ movdqa (%rdi,%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+ cmpq %rdi, %r10
+ jne L(ATR64main)
+
+ movq %r11, %r10
+ andq $-32, %r10
+ cmpq %r10, %rdi
+ jge L(mt16)
+
+L(ATR32res):
+ movdqa (%rdi,%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+
+ movdqa (%rdi,%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+
+ cmpq %r10, %rdi
+ jne L(ATR32res)
+
+ subq %rdi, %r11
+ je L(finz)
+ movq %r11, %r10
+ jmp L(small)
+ /* Align to 16 bytes to improve instruction fetch. */
+ .p2align 4,, 4
+END(memcmp)
+
+#undef bcmp
+weak_alias (memcmp, bcmp)
+libc_hidden_builtin_def (memcmp)
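
A note on the least obvious part of the new code, the L(fin2_7) exit path: when a 2-, 4- or 8-byte load pair differs, the routine takes bsf of the word difference, rounds that bit index down to a byte boundary (the sarq $3 / salq $3 pair), shifts both words right by it, and subtracts the zero-extended low bytes. This gives the correct memcmp result because x86-64 is little-endian, so the lowest differing bit belongs to the byte that comes first in memory. A small C sketch of the same trick (illustrative only; first_diff_byte is a made-up name, and it uses XOR, which has the same lowest set bit as the subtraction used in the assembly):

#include <stdint.h>
#include <stdio.h>

/* Given two unequal little-endian words, return the memcmp-style
   result for the first byte (in memory order) that differs.  */
int
first_diff_byte (uint64_t a, uint64_t b)
{
  unsigned int bit = __builtin_ctzll (a ^ b);  /* like bsfq %r11, %rcx */
  unsigned int shift = bit & ~7u;              /* sarq $3; salq $3: byte boundary */
  unsigned int ba = (a >> shift) & 0xff;       /* sarq %cl, %rax; movzbl %al */
  unsigned int bb = (b >> shift) & 0xff;       /* sarq %cl, %rdx; movzbl %dl */
  return (int) ba - (int) bb;                  /* subl %edx, %eax */
}

int
main (void)
{
  /* In memory: 01 02 03 04 ... versus 01 02 7f 04 ...; byte 2 differs.  */
  printf ("%d\n", first_diff_byte (0x0807060504030201ull,
                                   0x08070605047f0201ull));  /* prints -124 */
  return 0;
}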