summaryrefslogtreecommitdiff
path: root/sysdeps/x86_64/wcslen.S
diff options
context:
space:
mode:
authorLiubov Dmitrieva <liubov.dmitrieva@gmail.com>2011-10-23 14:56:04 -0400
committerUlrich Drepper <drepper@gmail.com>2011-10-23 14:56:04 -0400
commitce7dd29f2863820ba858fe06c2ff61417df40d75 (patch)
tree32b9c93de4777c2d8239d94d67b9257cfccf1745 /sysdeps/x86_64/wcslen.S
parent6ddf460fa5f7f96e96134f962f4b2ba708b1095b (diff)
Optimized strnlen and wcscmp for x86-64
Diffstat (limited to 'sysdeps/x86_64/wcslen.S')
-rw-r--r--sysdeps/x86_64/wcslen.S239
1 files changed, 239 insertions, 0 deletions
diff --git a/sysdeps/x86_64/wcslen.S b/sysdeps/x86_64/wcslen.S
new file mode 100644
index 0000000000..0343ac78d0
--- /dev/null
+++ b/sysdeps/x86_64/wcslen.S
@@ -0,0 +1,239 @@
+/* Optimized wcslen for x86-64 with SSE2.
+ Copyright (C) 2011 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+
+ .text
+ENTRY (__wcslen)
+ cmpl $0, (%rdi)
+ jz L(exit_tail0)
+ cmpl $0, 4(%rdi)
+ jz L(exit_tail1)
+ cmpl $0, 8(%rdi)
+ jz L(exit_tail2)
+ cmpl $0, 12(%rdi)
+ jz L(exit_tail3)
+ cmpl $0, 16(%rdi)
+ jz L(exit_tail4)
+ cmpl $0, 20(%rdi)
+ jz L(exit_tail5)
+ cmpl $0, 24(%rdi)
+ jz L(exit_tail6)
+ cmpl $0, 28(%rdi)
+ jz L(exit_tail7)
+
+ pxor %xmm0, %xmm0
+
+ lea 32(%rdi), %rax
+ lea 16(%rdi), %rcx
+ and $-16, %rax
+
+ pcmpeqd (%rax), %xmm0
+ pmovmskb %xmm0, %edx
+ pxor %xmm1, %xmm1
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqd (%rax), %xmm1
+ pmovmskb %xmm1, %edx
+ pxor %xmm2, %xmm2
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqd (%rax), %xmm2
+ pmovmskb %xmm2, %edx
+ pxor %xmm3, %xmm3
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqd (%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqd (%rax), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqd (%rax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqd (%rax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqd (%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqd (%rax), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqd (%rax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqd (%rax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqd (%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ and $-0x40, %rax
+
+ .p2align 4
+L(aligned_64_loop):
+ movaps (%rax), %xmm0
+ movaps 16(%rax), %xmm1
+ movaps 32(%rax), %xmm2
+ movaps 48(%rax), %xmm6
+
+ pminub %xmm1, %xmm0
+ pminub %xmm6, %xmm2
+ pminub %xmm0, %xmm2
+ pcmpeqd %xmm3, %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ lea 64(%rax), %rax
+ jz L(aligned_64_loop)
+
+ pcmpeqd -64(%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea 48(%rcx), %rcx
+ jnz L(exit)
+
+ pcmpeqd %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea -16(%rcx), %rcx
+ jnz L(exit)
+
+ pcmpeqd -32(%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea -16(%rcx), %rcx
+ jnz L(exit)
+
+ pcmpeqd %xmm6, %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea -16(%rcx), %rcx
+ jnz L(exit)
+
+ jmp L(aligned_64_loop)
+
+ .p2align 4
+L(exit):
+ sub %rcx, %rax
+ shr $2, %rax
+ test %dl, %dl
+ jz L(exit_high)
+
+ mov %dl, %cl
+ and $15, %cl
+ jz L(exit_1)
+ ret
+
+ .p2align 4
+L(exit_high):
+ mov %dh, %ch
+ and $15, %ch
+ jz L(exit_3)
+ add $2, %rax
+ ret
+
+ .p2align 4
+L(exit_1):
+ add $1, %rax
+ ret
+
+ .p2align 4
+L(exit_3):
+ add $3, %rax
+ ret
+
+ .p2align 4
+L(exit_tail0):
+ xor %rax, %rax
+ ret
+
+ .p2align 4
+L(exit_tail1):
+ mov $1, %rax
+ ret
+
+ .p2align 4
+L(exit_tail2):
+ mov $2, %rax
+ ret
+
+ .p2align 4
+L(exit_tail3):
+ mov $3, %rax
+ ret
+
+ .p2align 4
+L(exit_tail4):
+ mov $4, %rax
+ ret
+
+ .p2align 4
+L(exit_tail5):
+ mov $5, %rax
+ ret
+
+ .p2align 4
+L(exit_tail6):
+ mov $6, %rax
+ ret
+
+ .p2align 4
+L(exit_tail7):
+ mov $7, %rax
+ ret
+
+END (__wcslen)
+
+weak_alias(__wcslen, wcslen)