summaryrefslogtreecommitdiff
path: root/sysdeps/x86_64/multiarch
diff options
context:
space:
mode:
authorAndreas Schwab <schwab@redhat.com>2009-07-20 11:02:11 +0200
committerAndreas Schwab <schwab@redhat.com>2009-07-20 11:02:11 +0200
commit53924a77a2b827e7f9af6424a6a30224d09692d1 (patch)
treeba5d034a512524339fcfed113518eb83201fdc23 /sysdeps/x86_64/multiarch
parent8ecde8e8c2a8e77804f954afffd9efe0ab951e52 (diff)
parent42e69bcf1137fccfd7a95645a9d316c6490b9ff9 (diff)
Merge commit 'origin/master' into fedora/master
Diffstat (limited to 'sysdeps/x86_64/multiarch')
-rw-r--r--sysdeps/x86_64/multiarch/Makefile8
-rw-r--r--sysdeps/x86_64/multiarch/rawmemchr.S1
-rw-r--r--sysdeps/x86_64/multiarch/stpcpy.S7
-rw-r--r--sysdeps/x86_64/multiarch/stpncpy-c.c8
-rw-r--r--sysdeps/x86_64/multiarch/stpncpy.S6
-rw-r--r--sysdeps/x86_64/multiarch/strcmp.S1
-rw-r--r--sysdeps/x86_64/multiarch/strcpy.S1918
-rw-r--r--sysdeps/x86_64/multiarch/strcspn-c.c312
-rw-r--r--sysdeps/x86_64/multiarch/strcspn.S82
-rw-r--r--sysdeps/x86_64/multiarch/strlen.S1
-rw-r--r--sysdeps/x86_64/multiarch/strncpy-c.c8
-rw-r--r--sysdeps/x86_64/multiarch/strncpy.S3
-rw-r--r--sysdeps/x86_64/multiarch/strpbrk-c.c4
-rw-r--r--sysdeps/x86_64/multiarch/strpbrk.S3
-rw-r--r--sysdeps/x86_64/multiarch/strspn-c.c284
-rw-r--r--sysdeps/x86_64/multiarch/strspn.S63
16 files changed, 2708 insertions, 1 deletions
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 1c35e1ffb4..71e85f0652 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -4,5 +4,11 @@ gen-as-const-headers += ifunc-defines.sym
endif
ifeq ($(subdir),string)
-sysdep_routines += strncmp-c
+sysdep_routines += stpncpy-c strncpy-c strncmp-c
+ifeq (yes,$(config-cflags-sse4))
+sysdep_routines += strcspn-c strpbrk-c strspn-c
+CFLAGS-strcspn-c.c += -msse4
+CFLAGS-strpbrk-c.c += -msse4
+CFLAGS-strspn-c.c += -msse4
+endif
endif
diff --git a/sysdeps/x86_64/multiarch/rawmemchr.S b/sysdeps/x86_64/multiarch/rawmemchr.S
index 93ca631633..d4f265f430 100644
--- a/sysdeps/x86_64/multiarch/rawmemchr.S
+++ b/sysdeps/x86_64/multiarch/rawmemchr.S
@@ -77,6 +77,7 @@ __rawmemchr_sse42:
# undef ENTRY
# define ENTRY(name) \
.type __rawmemchr_sse2, @function; \
+ .align 16; \
__rawmemchr_sse2: cfi_startproc; \
CALL_MCOUNT
# undef END
diff --git a/sysdeps/x86_64/multiarch/stpcpy.S b/sysdeps/x86_64/multiarch/stpcpy.S
new file mode 100644
index 0000000000..b63d308edc
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpcpy.S
@@ -0,0 +1,7 @@
+#define USE_AS_STPCPY
+#define STRCPY __stpcpy
+#include "strcpy.S"
+
+weak_alias (__stpcpy, stpcpy)
+libc_hidden_def (__stpcpy)
+libc_hidden_builtin_def (stpcpy)
diff --git a/sysdeps/x86_64/multiarch/stpncpy-c.c b/sysdeps/x86_64/multiarch/stpncpy-c.c
new file mode 100644
index 0000000000..2fde77dcab
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpncpy-c.c
@@ -0,0 +1,8 @@
+#define STPNCPY __stpncpy_sse2
+#ifdef SHARED
+#undef libc_hidden_def
+#define libc_hidden_def(name) \
+ __hidden_ver1 (__stpncpy_sse2, __GI___stpncpy, __stpncpy_sse2);
+#endif
+
+#include "stpncpy.c"
diff --git a/sysdeps/x86_64/multiarch/stpncpy.S b/sysdeps/x86_64/multiarch/stpncpy.S
new file mode 100644
index 0000000000..ff89a89491
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpncpy.S
@@ -0,0 +1,6 @@
+#define STRCPY __stpncpy
+#define USE_AS_STPCPY
+#define USE_AS_STRNCPY
+#include "strcpy.S"
+
+weak_alias (__stpncpy, stpncpy)
diff --git a/sysdeps/x86_64/multiarch/strcmp.S b/sysdeps/x86_64/multiarch/strcmp.S
index 2f4bf17d95..37985036aa 100644
--- a/sysdeps/x86_64/multiarch/strcmp.S
+++ b/sysdeps/x86_64/multiarch/strcmp.S
@@ -1659,6 +1659,7 @@ LABEL(unaligned_table):
# undef ENTRY
# define ENTRY(name) \
.type STRCMP_SSE2, @function; \
+ .align 16; \
STRCMP_SSE2: cfi_startproc; \
CALL_MCOUNT
# undef END
diff --git a/sysdeps/x86_64/multiarch/strcpy.S b/sysdeps/x86_64/multiarch/strcpy.S
new file mode 100644
index 0000000000..25cd01307d
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcpy.S
@@ -0,0 +1,1918 @@
+/* strcpy with SSSE3
+ Copyright (C) 2009 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+#include <ifunc-defines.h>
+
+#if !defined (USE_AS_STPCPY) && !defined (USE_AS_STRNCPY)
+# ifndef STRCPY
+# define STRCPY strcpy
+# endif
+#endif
+
+#ifdef USE_AS_STPCPY
+# ifdef USE_AS_STRNCPY
+# define STRCPY_SSSE3 __stpncpy_ssse3
+# define STRCPY_SSE2 __stpncpy_sse2
+# define __GI_STRCPY __GI_stpncpy
+# else
+# define STRCPY_SSSE3 __stpcpy_ssse3
+# define STRCPY_SSE2 __stpcpy_sse2
+# define __GI_STRCPY __GI_stpcpy
+# define __GI___STRCPY __GI___stpcpy
+# endif
+#else
+# ifdef USE_AS_STRNCPY
+# define STRCPY_SSSE3 __strncpy_ssse3
+# define STRCPY_SSE2 __strncpy_sse2
+# define __GI_STRCPY __GI_strncpy
+# else
+# define STRCPY_SSSE3 __strcpy_ssse3
+# define STRCPY_SSE2 __strcpy_sse2
+# define __GI_STRCPY __GI_strcpy
+# endif
+#endif
+
+#ifndef LABEL
+#define LABEL(l) L(l)
+#endif
+
+/* Define multiple versions only for the definition in libc. */
+#ifndef NOT_IN_libc
+ .text
+ENTRY(STRCPY)
+ .type STRCPY, @gnu_indirect_function
+ cmpl $0, __cpu_features+KIND_OFFSET(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq STRCPY_SSE2(%rip), %rax
+ testl $(1<<9), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip)
+ jz 3f
+/* Avoid SSSE3 strcpy on Atom since it is slow. */
+ cmpl $1, __cpu_features+KIND_OFFSET(%rip)
+ jne 2f
+ cmpl $6, __cpu_features+FAMILY_OFFSET(%rip)
+ jne 2f
+ cmpl $28, __cpu_features+MODEL_OFFSET(%rip)
+ jz 3f
+2: leaq STRCPY_SSSE3(%rip), %rax
+3: ret
+END(STRCPY)
+
+ .section .text.ssse3,"ax",@progbits
+STRCPY_SSSE3:
+ cfi_startproc
+ CALL_MCOUNT
+
+/*
+ * This implementation uses SSE to copy up to 16 bytes at a time.
+ */
+#ifdef USE_AS_STRNCPY
+ test %rdx, %rdx
+ jz LABEL(strncpy_exitz)
+ mov %rdx, %r8
+#else
+ xor %edx, %edx
+#endif
+ mov %esi, %ecx
+ and $0xfffffffffffffff0, %rsi /*force rsi 16 byte align*/
+ and $15, %ecx
+ mov %rdi, %rax /*store return parameter*/
+
+
+ pxor %xmm0, %xmm0 /* clear %xmm0 */
+ pcmpeqb (%rsi), %xmm0 /* compare 16 bytes in (%rsi) and %xmm0 for equality, try to find null char*/
+ pmovmskb %xmm0, %edx /* move each byte mask of %xmm0 to edx*/
+ shr %cl, %edx /* get real bits left in edx*/
+ test %edx, %edx /* edx must be 0 if there is no null char from rsi+%rcx */
+ jnz LABEL(less16bytes)
+
+#ifdef USE_AS_STRNCPY
+ lea -16(%r8,%rcx), %r11
+ cmp $0, %r11
+ jle LABEL(less16bytes) /* if r8 + rcx <= 16, branch to less16bytes. */
+#endif
+
+ mov %rcx, %r9
+ or %edi, %ecx
+ and $15, %ecx
+ lea -16(%r9), %r10
+ jz LABEL(ashr_0) /* ecx must be 0 if offset of rsi and rdi is 16 byte align*/
+
+ neg %r10 /* store the rest in rsi aligned 16 bytes for unaligned_exit*/
+
+ pxor %xmm0, %xmm0 /* clear %xmm0, may be polluted by unaligned operation*/
+ pcmpeqb 16(%rsi), %xmm0 /* compare 16 bytes in (%rsi) and %xmm0 for equality, try to find null char*/
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(less32bytes)
+ /*
+ * at least 16 byte available to fill destination rdi
+ */
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(less32bytes_strncpy_truncation)
+#endif
+ mov (%rsi, %r9), %rdx
+ mov %rdx, (%rdi)
+ mov 8(%rsi, %r9), %rdx
+ mov %rdx, 8(%rdi)
+
+ /*
+ * so far destatination rdi may be aligned by 16, re-calculate rsi to jump
+ * crossponding case
+ * rcx is offset of rsi
+ * rax is offset of rdi
+ */
+
+ and $0xfffffffffffffff0, %rdi /* force rdi 16 byte align */
+ mov %rax, %rdx /* rax store orignal rdi */
+ xor %rdi, %rdx /* equal to and $15, %rdx */
+#ifdef USE_AS_STRNCPY
+ add %rdx, %r8
+#endif
+
+ add $16, %rdi /* next 16 bytes for rdi */
+ sub %rdx, %r9
+
+ lea 16(%r9, %rsi), %rsi /*re-calculate rsi by (16 - rdx)+ rcx */
+ mov %esi, %ecx /*store offset of rsi */
+ and $0xfffffffffffffff0, %rsi /* force rsi 16 byte align */
+
+ and $15, %ecx /* ecx must be 0 if rdx is equal to rcx*/
+ jz LABEL(ashr_0)
+
+ lea -16(%rcx), %r10
+ mov %rcx, %r9
+ neg %r10
+ lea LABEL(unaligned_table)(%rip), %r11
+ movslq (%r11, %rcx,4), %rcx
+ lea (%r11, %rcx), %rcx
+ jmp *%rcx
+
+ /*
+ * The following cases will be handled by ashr_0 & ashr_0_start
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * 0 0 0 ashr_0
+ * n(1~15) n(1~15) 0 ashr_0_start
+ *
+ */
+ .p2align 5
+LABEL(ashr_0):
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_aligned)
+#endif
+ movdqa (%rsi), %xmm1 /* fetch first 16 bytes from rsi */
+ movdqa %xmm1, (%rdi) /* store first 16 bytes into rdi */
+ add $16, %rsi
+ add $16, %rdi
+ pcmpeqb (%rsi), %xmm0 /* compare 16 bytes in (%rsi) and %xmm0 for equality, try to find null char */
+ pmovmskb %xmm0, %edx /* move each byte mask of %xmm0 to edx*/
+
+ test %edx, %edx /* edx must be 0 if there is no null char in rsi*/
+ jnz LABEL(aligned_16bytes)
+
+LABEL(ashr_0_loop):
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_aligned)
+#endif
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa %xmm1, (%rdi, %rcx)
+ add $16, %rcx
+ pcmpeqb (%rsi, %rcx), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(aligned_exit)
+
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_aligned)
+#endif
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa %xmm1, (%rdi, %rcx)
+ add $16, %rcx
+ pcmpeqb (%rsi, %rcx), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(aligned_exit)
+
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_aligned)
+#endif
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa %xmm1, (%rdi, %rcx)
+ add $16, %rcx
+ pcmpeqb (%rsi, %rcx), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(aligned_exit)
+
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_aligned)
+#endif
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa %xmm1, (%rdi, %rcx)
+ add $16, %rcx
+ pcmpeqb (%rsi, %rcx), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jz LABEL(ashr_0_loop)
+
+ jmp LABEL(aligned_exit)
+ .p2align 4
+
+/*
+ * The following cases will be handled by ashr_15
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(15) n - 15 15((16 - (n -15) + n)%16 ashr_15
+ *
+ * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+ .p2align 4
+LABEL(ashr_15):
+ xor %ecx, %ecx /*clear ecx */
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ .p2align 4
+LABEL(ashr_15_use_ssse3):
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $15, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $15, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_15_use_ssse3)
+
+/*
+ * The following cases will be handled by ashr_14
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(14~15) n - 14 14((16 - (n -14) + n)%16 ashr_14
+ *
+ * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+ .p2align 4
+LABEL(ashr_14):
+ xor %ecx, %ecx /*clear ecx */
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ .p2align 4
+LABEL(ashr_14_use_ssse3):
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $14, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $14, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_14_use_ssse3)
+
+/*
+ * The following cases will be handled by ashr_13
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(13~15) n - 13 13((16 - (n -13) + n)%16 ashr_13
+ *
+ * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+ .p2align 4
+LABEL(ashr_13):
+ xor %ecx, %ecx /*clear ecx */
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ .p2align 4
+LABEL(ashr_13_use_ssse3):
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $13, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $13, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_13_use_ssse3)
+
+/*
+ * The following cases will be handled by ashr_12
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(12~15) n - 12 12((16 - (n -12) + n)%16 ashr_12
+ *
+ * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+ .p2align 4
+LABEL(ashr_12):
+ xor %ecx, %ecx /*clear ecx */
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ .p2align 4
+LABEL(ashr_12_use_ssse3):
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $12, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $12, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_12_use_ssse3)
+
+/*
+ * The following cases will be handled by ashr_11
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(11~15) n - 11 11((16 - (n -11) + n)%16 ashr_11
+ *
+ * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+ .p2align 4
+LABEL(ashr_11):
+ xor %ecx, %ecx /*clear ecx */
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ .p2align 4
+LABEL(ashr_11_use_ssse3):
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $11, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $11, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_11_use_ssse3)
+
+/*
+ * The following cases will be handled by ashr_10
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(10~15) n - 10 10((16 - (n -10) + n)%16 ashr_10
+ *
+ * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+ .p2align 4
+LABEL(ashr_10):
+ xor %ecx, %ecx /*clear ecx */
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ .p2align 4
+LABEL(ashr_10_use_ssse3):
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $10, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $10, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_10_use_ssse3)
+
+/*
+ * The following cases will be handled by ashr_9
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(9~15) n - 9 9((16 - (n -9) + n)%16 ashr_9
+ *
+ * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+ .p2align 4
+LABEL(ashr_9):
+ xor %ecx, %ecx /*clear ecx */
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ .p2align 4
+LABEL(ashr_9_use_ssse3):
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $9, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $9, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_9_use_ssse3)
+
+/*
+ * The following cases will be handled by ashr_8
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(8~15) n - 8 8((16 - (n -8) + n)%16 ashr_8
+ *
+ * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+ .p2align 4
+LABEL(ashr_8):
+ xor %ecx, %ecx /*clear ecx */
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ .p2align 4
+LABEL(ashr_8_use_ssse3):
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $8, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $8, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_8_use_ssse3)
+
+/*
+ * The following cases will be handled by ashr_7
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(7~15) n - 7 7((16 - (n -7) + n)%16 ashr_7
+ *
+ * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+ .p2align 4
+LABEL(ashr_7):
+ xor %ecx, %ecx /*clear ecx */
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ .p2align 4
+
+LABEL(ashr_7_use_ssse3):
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $7, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $7, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_7_use_ssse3)
+
+/*
+ * The following cases will be handled by ashr_6
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(6~15) n - 6 6((16 - (n -6) + n)%16 ashr_6
+ *
+ * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+ .p2align 4
+LABEL(ashr_6):
+ xor %ecx, %ecx /*clear ecx */
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ .p2align 4
+LABEL(ashr_6_use_ssse3):
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $6, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $6, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_6_use_ssse3)
+
+ /*
+ * The following cases will be handled by ashr_5
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(5~15) n - 5 5((16 - (n -5) + n)%16 ashr_5
+ *
+ * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+ .p2align 4
+LABEL(ashr_5):
+ xor %ecx, %ecx /*clear ecx */
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ .p2align 4
+LABEL(ashr_5_use_ssse3):
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $5, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $5, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_5_use_ssse3)
+
+/*
+ *
+ * The following cases will be handled by ashr_4
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(4~15) n - 4 4((16 - (n -4) + n)%16 ashr_4
+ *
+ * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+ .p2align 4
+LABEL(ashr_4):
+ xor %ecx, %ecx /*clear ecx */
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ .p2align 4
+LABEL(ashr_4_use_ssse3):
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $4, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $4, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_4_use_ssse3)
+
+/*
+ *
+ * The following cases will be handled by ashr_3
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(3~15) n - 3 3((16 - (n -3) + n)%16 ashr_3
+ *
+ * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+ .p2align 4
+LABEL(ashr_3):
+ xor %ecx, %ecx /*clear ecx */
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ .p2align 4
+LABEL(ashr_3_use_ssse3):
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $3, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $3, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_3_use_ssse3)
+
+/*
+ *
+ * The following cases will be handled by ashr_2
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(2~15) n - 2 2((16 - (n -2) + n)%16 ashr_2
+ *
+ * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+ .p2align 4
+LABEL(ashr_2):
+ xor %ecx, %ecx /*clear ecx */
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ .p2align 4
+LABEL(ashr_2_use_ssse3):
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $2, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $2, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_2_use_ssse3)
+
+/*
+ *
+ * The following cases will be handled by ashr_1
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(1~15) n - 1 1 ((16 - (n -1) + n)%16 ashr_1
+ *
+ * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+ .p2align 4
+LABEL(ashr_1):
+ xor %ecx, %ecx /*clear ecx */
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ .p2align 4
+LABEL(ashr_1_use_ssse3):
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $1, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+ palignr $1, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_1_use_ssse3)
+
+ .p2align 4
+LABEL(less32bytes):
+ xor %ecx, %ecx
+LABEL(unaligned_exit):
+ add %r9, %rsi /* r9 stores original offset of rsi*/
+ mov %rcx, %r9
+ mov %r10, %rcx
+ shl %cl, %edx /* after shl, calculate the exact number to be filled*/
+ mov %r9, %rcx
+ .p2align 4
+LABEL(aligned_exit):
+ add %rcx, %rdi /*locate exact address for rdi */
+LABEL(less16bytes):
+ add %rcx, %rsi /*locate exact address for rsi */
+LABEL(aligned_16bytes):
+#ifdef USE_AS_STRNCPY
+ mov $1, %r9d
+ lea -1(%r8), %rcx
+ shl %cl, %r9d
+ cmp $32, %r8
+ ja LABEL(strncpy_tail)
+ or %r9d, %edx
+LABEL(strncpy_tail):
+#endif
+ bsf %rdx, %rcx /*If a least significant 1 bit in %rdx is found, its bit index is stored in %rcx*/
+ lea LABEL(tail_table)(%rip), %r11
+ movslq (%r11, %rcx,4), %rcx
+ lea (%r11, %rcx), %rcx
+ jmp *%rcx
+
+#ifdef USE_AS_STRNCPY
+ .p2align 4
+LABEL(less32bytes_strncpy_truncation):
+ xor %ecx, %ecx
+LABEL(strncpy_truncation_unaligned):
+ add %r9, %rsi
+LABEL(strncpy_truncation_aligned):
+ add %rcx, %rdi
+ add %rcx, %rsi
+ add $16, %r8
+ lea -1(%r8), %rcx
+ lea LABEL(tail_table)(%rip), %r11
+ movslq (%r11, %rcx,4), %rcx
+ lea (%r11, %rcx), %rcx
+ jmp *%rcx
+ .p2align 4
+LABEL(strncpy_exitz):
+ mov %rdi, %rax
+ ret
+#endif
+
+#ifdef USE_AS_STRNCPY
+ .p2align 4
+LABEL(strncpy_fill_tail):
+ mov %rax, %rdx
+ movzx %cl, %rax
+ mov %r8, %rcx
+ add %rax, %rdi
+ xor %eax, %eax
+ shr $3, %ecx
+ jz LABEL(strncpy_fill_less_8)
+
+ rep stosq
+LABEL(strncpy_fill_less_8):
+ mov %r8, %rcx
+ and $7, %ecx
+ jz LABEL(strncpy_fill_return)
+LABEL(strncpy_fill_less_7):
+ sub $1, %ecx
+ mov %al, (%rdi, %rcx)
+ jnz LABEL(strncpy_fill_less_7)
+LABEL(strncpy_fill_return):
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rdx)
+ sbb $-1, %rdx
+#endif
+ mov %rdx, %rax
+ ret
+#endif
+ .p2align 4
+LABEL(tail_0):
+ mov (%rsi), %cl
+ mov %cl, (%rdi)
+#ifdef USE_AS_STPCPY
+ mov %rdi, %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $1, %cl
+ sub $1, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+ .p2align 4
+LABEL(tail_1):
+ mov (%rsi), %cx
+ mov %cx, (%rdi)
+#ifdef USE_AS_STPCPY
+ lea 1(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $2, %cl
+ sub $2, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+ .p2align 4
+LABEL(tail_2):
+ mov (%rsi), %cx
+ mov %cx, (%rdi)
+ mov 1(%rsi), %cx
+ mov %cx, 1(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 2(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $3, %cl
+ sub $3, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+ .p2align 4
+LABEL(tail_3):
+ mov (%rsi), %ecx
+ mov %ecx, (%rdi)
+#ifdef USE_AS_STPCPY
+ lea 3(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $4, %cl
+ sub $4, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+ .p2align 4
+LABEL(tail_4):
+ mov (%rsi), %ecx
+ mov %ecx, (%rdi)
+ mov 1(%rsi), %edx
+ mov %edx, 1(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 4(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $5, %cl
+ sub $5, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+ .p2align 4
+LABEL(tail_5):
+ mov (%rsi), %ecx
+ mov %ecx, (%rdi)
+ mov 2(%rsi), %edx
+ mov %edx, 2(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 5(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $6, %cl
+ sub $6, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+ .p2align 4
+LABEL(tail_6):
+ mov (%rsi), %ecx
+ mov %ecx, (%rdi)
+ mov 3(%rsi), %edx
+ mov %edx,3(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 6(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $7, %cl
+ sub $7, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+
+ .p2align 4
+LABEL(tail_7):
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+#ifdef USE_AS_STPCPY
+ lea 7(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $8, %cl
+ sub $8, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+
+ .p2align 4
+LABEL(tail_8):
+
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 5(%rsi), %edx
+ mov %edx, 5(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 8(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $9, %cl
+ sub $9, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+
+ .p2align 4
+LABEL(tail_9):
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 6(%rsi), %edx
+ mov %edx, 6(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 9(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $10, %cl
+ sub $10, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+
+ .p2align 4
+LABEL(tail_10):
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 7(%rsi), %edx
+ mov %edx, 7(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 10(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $11, %cl
+ sub $11, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+ .p2align 4
+LABEL(tail_11):
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %edx
+ mov %edx, 8(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 11(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $12, %cl
+ sub $12, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+ .p2align 4
+LABEL(tail_12):
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 5(%rsi), %rcx
+ mov %rcx, 5(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 12(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $13, %cl
+ sub $13, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+
+ .p2align 4
+LABEL(tail_13):
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 6(%rsi), %rcx
+ mov %rcx, 6(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 13(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $14, %cl
+ sub $14, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+
+ .p2align 4
+LABEL(tail_14):
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 7(%rsi), %rcx
+ mov %rcx, 7(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 14(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $15, %cl
+ sub $15, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+
+LABEL(tail_15):
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %rdx
+ mov %rdx, 8(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 15(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $16, %cl
+ sub $16, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+
+ ret
+
+ .p2align 4
+LABEL(tail_16):
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %rdx
+ mov %rdx, 8(%rdi)
+ mov 16(%rsi), %cl
+ mov %cl, 16(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 16(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $17, %cl
+ sub $17, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+ .p2align 4
+LABEL(tail_17):
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %rdx
+ mov %rdx, 8(%rdi)
+ mov 16(%rsi), %cx
+ mov %cx, 16(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 17(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $18, %cl
+ sub $18, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+
+ .p2align 4
+LABEL(tail_18):
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %rdx
+ mov %rdx, 8(%rdi)
+ mov 15(%rsi), %ecx
+ mov %ecx,15(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 18(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $19, %cl
+ sub $19, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+
+ .p2align 4
+LABEL(tail_19):
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %rdx
+ mov %rdx, 8(%rdi)
+ mov 16(%rsi), %ecx
+ mov %ecx, 16(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 19(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $20, %cl
+ sub $20, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+ .p2align 4
+LABEL(tail_20):
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %rdx
+ mov %rdx, 8(%rdi)
+ mov 13(%rsi), %rcx
+ mov %rcx, 13(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 20(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $21, %cl
+ sub $21, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+ .p2align 4
+LABEL(tail_21):
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %rdx
+ mov %rdx, 8(%rdi)
+ mov 14(%rsi), %rcx
+ mov %rcx, 14(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 21(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $22, %cl
+ sub $22, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+
+ .p2align 4
+LABEL(tail_22):
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %rdx
+ mov %rdx, 8(%rdi)
+ mov 15(%rsi), %rcx
+ mov %rcx, 15(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 22(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $23, %cl
+ sub $23, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+
+ .p2align 4
+LABEL(tail_23):
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %rdx
+ mov %rdx, 8(%rdi)
+ mov 16(%rsi), %rcx
+ mov %rcx, 16(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 23(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $24, %cl
+ sub $24, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+
+ ret
+
+ .p2align 4
+LABEL(tail_24):
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %rdx
+ mov %rdx, 8(%rdi)
+ mov 16(%rsi), %rcx
+ mov %rcx, 16(%rdi)
+ mov 21(%rsi), %edx
+ mov %edx, 21(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 24(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $25, %cl
+ sub $25, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+
+ .p2align 4
+LABEL(tail_25):
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %rdx
+ mov %rdx, 8(%rdi)
+ mov 16(%rsi), %rcx
+ mov %rcx, 16(%rdi)
+ mov 22(%rsi), %edx
+ mov %edx, 22(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 25(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $26, %cl
+ sub $26, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+
+ .p2align 4
+LABEL(tail_26):
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %rdx
+ mov %rdx, 8(%rdi)
+ mov 16(%rsi), %rcx
+ mov %rcx, 16(%rdi)
+ mov 23(%rsi), %edx
+ mov %edx, 23(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 26(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $27, %cl
+ sub $27, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+
+ .p2align 4
+LABEL(tail_27):
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %rdx
+ mov %rdx, 8(%rdi)
+ mov 16(%rsi), %rcx
+ mov %rcx, 16(%rdi)
+ mov 24(%rsi), %edx
+ mov %edx, 24(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 27(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $28, %cl
+ sub $28, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+ .p2align 4
+LABEL(tail_28):
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %rdx
+ mov %rdx, 8(%rdi)
+ mov 16(%rsi), %rcx
+ mov %rcx, 16(%rdi)
+ mov 21(%rsi), %rdx
+ mov %rdx, 21(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 28(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $29, %cl
+ sub $29, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+
+ ret
+
+ .p2align 4
+LABEL(tail_29):
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %rdx
+ mov %rdx, 8(%rdi)
+ mov 16(%rsi), %rcx
+ mov %rcx, 16(%rdi)
+ mov 22(%rsi), %rdx
+ mov %rdx, 22(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 29(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $30, %cl
+ sub $30, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+
+ ret
+
+
+ .p2align 4
+LABEL(tail_30):
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %rdx
+ mov %rdx, 8(%rdi)
+ mov 16(%rsi), %rcx
+ mov %rcx, 16(%rdi)
+ mov 23(%rsi), %rdx
+ mov %rdx, 23(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 30(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $31, %cl
+ sub $31, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+
+ .p2align 4
+LABEL(tail_31):
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %rdx
+ mov %rdx, 8(%rdi)
+ mov 16(%rsi), %rcx
+ mov %rcx, 16(%rdi)
+ mov 24(%rsi), %rdx
+ mov %rdx, 24(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 31(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $32, %cl
+ sub $32, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+ cfi_endproc
+ .size STRCPY_SSSE3, .-STRCPY_SSSE3
+
+ .p2align 4
+ .section .rodata.ssse3,"a",@progbits
+LABEL(tail_table):
+ .int LABEL(tail_0) - LABEL(tail_table)
+ .int LABEL(tail_1) - LABEL(tail_table)
+ .int LABEL(tail_2) - LABEL(tail_table)
+ .int LABEL(tail_3) - LABEL(tail_table)
+ .int LABEL(tail_4) - LABEL(tail_table)
+ .int LABEL(tail_5) - LABEL(tail_table)
+ .int LABEL(tail_6) - LABEL(tail_table)
+ .int LABEL(tail_7) - LABEL(tail_table)
+ .int LABEL(tail_8) - LABEL(tail_table)
+ .int LABEL(tail_9) - LABEL(tail_table)
+ .int LABEL(tail_10) - LABEL(tail_table)
+ .int LABEL(tail_11) - LABEL(tail_table)
+ .int LABEL(tail_12) - LABEL(tail_table)
+ .int LABEL(tail_13) - LABEL(tail_table)
+ .int LABEL(tail_14) - LABEL(tail_table)
+ .int LABEL(tail_15) - LABEL(tail_table)
+ .int LABEL(tail_16) - LABEL(tail_table)
+ .int LABEL(tail_17) - LABEL(tail_table)
+ .int LABEL(tail_18) - LABEL(tail_table)
+ .int LABEL(tail_19) - LABEL(tail_table)
+ .int LABEL(tail_20) - LABEL(tail_table)
+ .int LABEL(tail_21) - LABEL(tail_table)
+ .int LABEL(tail_22) - LABEL(tail_table)
+ .int LABEL(tail_23) - LABEL(tail_table)
+ .int LABEL(tail_24) - LABEL(tail_table)
+ .int LABEL(tail_25) - LABEL(tail_table)
+ .int LABEL(tail_26) - LABEL(tail_table)
+ .int LABEL(tail_27) - LABEL(tail_table)
+ .int LABEL(tail_28) - LABEL(tail_table)
+ .int LABEL(tail_29) - LABEL(tail_table)
+ .int LABEL(tail_30) - LABEL(tail_table)
+ .int LABEL(tail_31) - LABEL(tail_table)
+
+ .p2align 4
+LABEL(unaligned_table):
+ .int LABEL(ashr_0) - LABEL(unaligned_table)
+ .int LABEL(ashr_1) - LABEL(unaligned_table)
+ .int LABEL(ashr_2) - LABEL(unaligned_table)
+ .int LABEL(ashr_3) - LABEL(unaligned_table)
+ .int LABEL(ashr_4) - LABEL(unaligned_table)
+ .int LABEL(ashr_5) - LABEL(unaligned_table)
+ .int LABEL(ashr_6) - LABEL(unaligned_table)
+ .int LABEL(ashr_7) - LABEL(unaligned_table)
+ .int LABEL(ashr_8) - LABEL(unaligned_table)
+ .int LABEL(ashr_9) - LABEL(unaligned_table)
+ .int LABEL(ashr_10) - LABEL(unaligned_table)
+ .int LABEL(ashr_11) - LABEL(unaligned_table)
+ .int LABEL(ashr_12) - LABEL(unaligned_table)
+ .int LABEL(ashr_13) - LABEL(unaligned_table)
+ .int LABEL(ashr_14) - LABEL(unaligned_table)
+ .int LABEL(ashr_15) - LABEL(unaligned_table)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type STRCPY_SSE2, @function; \
+ .align 16; \
+ STRCPY_SSE2: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size STRCPY_SSE2, .-STRCPY_SSE2
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal strcpy calls through a PLT.
+ The speedup we get from using SSSE3 instruction is likely eaten away
+ by the indirect call in the PLT. */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_STRCPY; __GI_STRCPY = STRCPY_SSE2
+# undef libc_hidden_def
+# define libc_hidden_def(name) \
+ .globl __GI___STRCPY; __GI___STRCPY = STRCPY_SSE2
+#endif
+
+#ifndef USE_AS_STRNCPY
+#include "../strcpy.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c
new file mode 100644
index 0000000000..4512267d3f
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcspn-c.c
@@ -0,0 +1,312 @@
+/* strcspn with SSE4.2 intrinsics
+ Copyright (C) 2009 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <nmmintrin.h>
+#include <string.h>
+
+/* We use 0x2:
+ _SIDD_SBYTE_OPS
+ | _SIDD_CMP_EQUAL_ANY
+ | _SIDD_POSITIVE_POLARITY
+ | _SIDD_LEAST_SIGNIFICANT
+ on pcmpistri to compare xmm/mem128
+
+ 0 1 2 3 4 5 6 7 8 9 A B C D E F
+ X X X X X X X X X X X X X X X X
+
+ against xmm
+
+ 0 1 2 3 4 5 6 7 8 9 A B C D E F
+ A A A A A A A A A A A A A A A A
+
+ to find out if the first 16byte data element has any byte A and
+ the offset of the first byte. There are 3 cases:
+
+ 1. The first 16byte data element has the byte A at the offset X.
+ 2. The first 16byte data element has EOS and doesn't have the byte A.
+ 3. The first 16byte data element is valid and doesn't have the byte A.
+
+ Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases:
+
+ 1 X 1 0/1 0
+ 2 16 0 1 0
+ 3 16 0 0 0
+
+ We exit from the loop for cases 1 and 2 with jbe which branches
+ when either CFlag or ZFlag is 1. If CFlag == 1, ECX has the offset
+ X for case 1. */
+
+#ifndef STRCSPN_SSE2
+# define STRCSPN_SSE2 __strcspn_sse2
+# define STRCSPN_SSE42 __strcspn_sse42
+#endif
+
+#ifdef USE_AS_STRPBRK
+# define RETURN(val1, val2) return val1
+#else
+# define RETURN(val1, val2) return val2
+#endif
+
+extern
+#ifdef USE_AS_STRPBRK
+char *
+#else
+size_t
+#endif
+STRCSPN_SSE2 (const char *, const char *);
+
+
+#ifdef USE_AS_STRPBRK
+char *
+#else
+size_t
+#endif
+__attribute__ ((section (".text.sse4.2")))
+STRCSPN_SSE42 (const char *s, const char *a)
+{
+ if (*a == 0)
+ RETURN (NULL, strlen (s));
+
+ const char *aligned;
+ __m128i mask;
+ int offset = (int) ((size_t) a & 15);
+ if (offset != 0)
+ {
+ /* Load masks. */
+ aligned = (const char *) ((size_t) a & 0xfffffffffffffff0L);
+ __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
+
+ switch (offset)
+ {
+ case 1:
+ mask = _mm_srli_si128 (mask0, 1);
+ break;
+ case 2:
+ mask = _mm_srli_si128 (mask0, 2);
+ break;
+ case 3:
+ mask = _mm_srli_si128 (mask0, 3);
+ break;
+ case 4:
+ mask = _mm_srli_si128 (mask0, 4);
+ break;
+ case 5:
+ mask = _mm_srli_si128 (mask0, 5);
+ break;
+ case 6:
+ mask = _mm_srli_si128 (mask0, 6);
+ break;
+ case 7:
+ mask = _mm_srli_si128 (mask0, 7);
+ break;
+ case 8:
+ mask = _mm_srli_si128 (mask0, 8);
+ break;
+ case 9:
+ mask = _mm_srli_si128 (mask0, 9);
+ break;
+ case 10:
+ mask = _mm_srli_si128 (mask0, 10);
+ break;
+ case 11:
+ mask = _mm_srli_si128 (mask0, 11);
+ break;
+ case 12:
+ mask = _mm_srli_si128 (mask0, 12);
+ break;
+ case 13:
+ mask = _mm_srli_si128 (mask0, 13);
+ break;
+ case 14:
+ mask = _mm_srli_si128 (mask0, 14);
+ break;
+ case 15:
+ mask = _mm_srli_si128 (mask0, 15);
+ break;
+ }
+
+ /* Find where the NULL terminator is. */
+ int length = _mm_cmpistri (mask, mask, 0x3a);
+ if (length == 16 - offset)
+ {
+ /* There is no NULL terminator. */
+ __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
+ int index = _mm_cmpistri (mask1, mask1, 0x3a);
+ length += index;
+
+ /* Don't use SSE4.2 if the length of A > 16. */
+ if (length > 16)
+ return STRCSPN_SSE2 (s, a);
+
+ if (index != 0)
+ {
+ /* Combine mask0 and mask1. */
+ switch (offset)
+ {
+ case 1:
+ mask = _mm_alignr_epi8 (mask1, mask0, 1);
+ break;
+ case 2:
+ mask = _mm_alignr_epi8 (mask1, mask0, 2);
+ break;
+ case 3:
+ mask = _mm_alignr_epi8 (mask1, mask0, 3);
+ break;
+ case 4:
+ mask = _mm_alignr_epi8 (mask1, mask0, 4);
+ break;
+ case 5:
+ mask = _mm_alignr_epi8 (mask1, mask0, 5);
+ break;
+ case 6:
+ mask = _mm_alignr_epi8 (mask1, mask0, 6);
+ break;
+ case 7:
+ mask = _mm_alignr_epi8 (mask1, mask0, 7);
+ break;
+ case 8:
+ mask = _mm_alignr_epi8 (mask1, mask0, 8);
+ break;
+ case 9:
+ mask = _mm_alignr_epi8 (mask1, mask0, 9);
+ break;
+ case 10:
+ mask = _mm_alignr_epi8 (mask1, mask0, 10);
+ break;
+ case 11:
+ mask = _mm_alignr_epi8 (mask1, mask0, 11);
+ break;
+ case 12:
+ mask = _mm_alignr_epi8 (mask1, mask0, 12);
+ break;
+ case 13:
+ mask = _mm_alignr_epi8 (mask1, mask0, 13);
+ break;
+ case 14:
+ mask = _mm_alignr_epi8 (mask1, mask0, 14);
+ break;
+ case 15:
+ mask = _mm_alignr_epi8 (mask1, mask0, 15);
+ break;
+ }
+ }
+ }
+ }
+ else
+ {
+ /* A is aligned. */
+ mask = _mm_load_si128 ((__m128i *) a);
+
+ /* Find where the NULL terminator is. */
+ int length = _mm_cmpistri (mask, mask, 0x3a);
+ if (length == 16)
+ {
+ /* There is no NULL terminator. Don't use SSE4.2 if the length
+ of A > 16. */
+ if (a[16] != 0)
+ return STRCSPN_SSE2 (s, a);
+ }
+ }
+
+ offset = (int) ((size_t) s & 15);
+ if (offset != 0)
+ {
+ /* Check partial string. */
+ aligned = (const char *) ((size_t) s & 0xfffffffffffffff0L);
+ __m128i value = _mm_load_si128 ((__m128i *) aligned);
+
+ switch (offset)
+ {
+ case 1:
+ value = _mm_srli_si128 (value, 1);
+ break;
+ case 2:
+ value = _mm_srli_si128 (value, 2);
+ break;
+ case 3:
+ value = _mm_srli_si128 (value, 3);
+ break;
+ case 4:
+ value = _mm_srli_si128 (value, 4);
+ break;
+ case 5:
+ value = _mm_srli_si128 (value, 5);
+ break;
+ case 6:
+ value = _mm_srli_si128 (value, 6);
+ break;
+ case 7:
+ value = _mm_srli_si128 (value, 7);
+ break;
+ case 8:
+ value = _mm_srli_si128 (value, 8);
+ break;
+ case 9:
+ value = _mm_srli_si128 (value, 9);
+ break;
+ case 10:
+ value = _mm_srli_si128 (value, 10);
+ break;
+ case 11:
+ value = _mm_srli_si128 (value, 11);
+ break;
+ case 12:
+ value = _mm_srli_si128 (value, 12);
+ break;
+ case 13:
+ value = _mm_srli_si128 (value, 13);
+ break;
+ case 14:
+ value = _mm_srli_si128 (value, 14);
+ break;
+ case 15:
+ value = _mm_srli_si128 (value, 15);
+ break;
+ }
+
+ int length = _mm_cmpistri (mask, value, 0x2);
+ /* No need to check ZFlag since ZFlag is always 1. */
+ int cflag = _mm_cmpistrc (mask, value, 0x2);
+ if (cflag)
+ RETURN ((char *) (s + length), length);
+ /* Find where the NULL terminator is. */
+ int index = _mm_cmpistri (value, value, 0x3a);
+ if (index < 16 - offset)
+ RETURN (NULL, index);
+ aligned += 16;
+ }
+ else
+ aligned = s;
+
+ while (1)
+ {
+ __m128i value = _mm_load_si128 ((__m128i *) aligned);
+ int index = _mm_cmpistri (mask, value, 0x2);
+ int cflag = _mm_cmpistrc (mask, value, 0x2);
+ int zflag = _mm_cmpistrz (mask, value, 0x2);
+ if (cflag)
+ RETURN ((char *) (aligned + index), (size_t) (aligned + index - s));
+ if (zflag)
+ RETURN (NULL,
+ /* Find where the NULL terminator is. */
+ (size_t) (aligned + _mm_cmpistri (value, value, 0x3a) - s));
+ aligned += 16;
+ }
+}
diff --git a/sysdeps/x86_64/multiarch/strcspn.S b/sysdeps/x86_64/multiarch/strcspn.S
new file mode 100644
index 0000000000..cc75ab70e6
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcspn.S
@@ -0,0 +1,82 @@
+/* Multiple versions of strcspn
+ Copyright (C) 2009 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <config.h>
+
+#ifdef HAVE_SSE4_SUPPORT
+
+#include <sysdep.h>
+#include <ifunc-defines.h>
+
+#ifdef USE_AS_STRPBRK
+#define STRCSPN_SSE42 __strpbrk_sse42
+#define STRCSPN_SSE2 __strpbrk_sse2
+#define __GI_STRCSPN __GI_strpbrk
+#else
+#ifndef STRCSPN
+#define STRCSPN strcspn
+#define STRCSPN_SSE42 __strcspn_sse42
+#define STRCSPN_SSE2 __strcspn_sse2
+#define __GI_STRCSPN __GI_strcspn
+#endif
+#endif
+
+/* Define multiple versions only for the definition in libc. Don't
+ define multiple versions for strpbrk in static library since we
+ need strpbrk before the initialization happened. */
+#if (defined SHARED || !defined USE_AS_STRPBRK) && !defined NOT_IN_libc
+ .text
+ENTRY(STRCSPN)
+ .type STRCSPN, @gnu_indirect_function
+ cmpl $0, __cpu_features+KIND_OFFSET(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq STRCSPN_SSE2(%rip), %rax
+ testl $(1<<20), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip)
+ jz 2f
+ leaq STRCSPN_SSE42(%rip), %rax
+2: ret
+END(STRCSPN)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type STRCSPN_SSE2, @function; \
+ .globl STRCSPN_SSE2; \
+ .align 16; \
+ STRCSPN_SSE2: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size STRCSPN_SSE2, .-STRCSPN_SSE2
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal strcspn calls through a PLT.
+ The speedup we get from using SSE4.2 instruction is likely eaten away
+ by the indirect call in the PLT. */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_STRCSPN; __GI_STRCSPN = STRCSPN_SSE2
+#endif
+
+#endif /* HAVE_SSE4_SUPPORT */
+
+#ifdef USE_AS_STRPBRK
+#include "../strpbrk.S"
+#else
+#include "../strcspn.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/strlen.S b/sysdeps/x86_64/multiarch/strlen.S
index 79e6a977ec..82b03ccc28 100644
--- a/sysdeps/x86_64/multiarch/strlen.S
+++ b/sysdeps/x86_64/multiarch/strlen.S
@@ -77,6 +77,7 @@ __strlen_sse42:
# undef ENTRY
# define ENTRY(name) \
.type __strlen_sse2, @function; \
+ .align 16; \
__strlen_sse2: cfi_startproc; \
CALL_MCOUNT
# undef END
diff --git a/sysdeps/x86_64/multiarch/strncpy-c.c b/sysdeps/x86_64/multiarch/strncpy-c.c
new file mode 100644
index 0000000000..296c32cb5d
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncpy-c.c
@@ -0,0 +1,8 @@
+#define STRNCPY __strncpy_sse2
+#ifdef SHARED
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name) \
+ __hidden_ver1 (__strncpy_sse2, __GI_strncpy, __strncpy_sse2);
+#endif
+
+#include "strncpy.c"
diff --git a/sysdeps/x86_64/multiarch/strncpy.S b/sysdeps/x86_64/multiarch/strncpy.S
new file mode 100644
index 0000000000..327a4ce447
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncpy.S
@@ -0,0 +1,3 @@
+#define STRCPY strncpy
+#define USE_AS_STRNCPY
+#include "strcpy.S"
diff --git a/sysdeps/x86_64/multiarch/strpbrk-c.c b/sysdeps/x86_64/multiarch/strpbrk-c.c
new file mode 100644
index 0000000000..c58dcb5605
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strpbrk-c.c
@@ -0,0 +1,4 @@
+#define USE_AS_STRPBRK
+#define STRCSPN_SSE2 __strpbrk_sse2
+#define STRCSPN_SSE42 __strpbrk_sse42
+#include "strcspn-c.c"
diff --git a/sysdeps/x86_64/multiarch/strpbrk.S b/sysdeps/x86_64/multiarch/strpbrk.S
new file mode 100644
index 0000000000..ed5bca6a94
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strpbrk.S
@@ -0,0 +1,3 @@
+#define STRCSPN strpbrk
+#define USE_AS_STRPBRK
+#include "strcspn.S"
diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c
new file mode 100644
index 0000000000..5b99f0d383
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strspn-c.c
@@ -0,0 +1,284 @@
+/* strspn with SSE4.2 intrinsics
+ Copyright (C) 2009 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <nmmintrin.h>
+#include <string.h>
+
+/* We use 0x12:
+ _SIDD_SBYTE_OPS
+ | _SIDD_CMP_EQUAL_ANY
+ | _SIDD_NEGATIVE_POLARITY
+ | _SIDD_LEAST_SIGNIFICANT
+ on pcmpistri to compare xmm/mem128
+
+ 0 1 2 3 4 5 6 7 8 9 A B C D E F
+ X X X X X X X X X X X X X X X X
+
+ against xmm
+
+ 0 1 2 3 4 5 6 7 8 9 A B C D E F
+ A A A A A A A A A A A A A A A A
+
+ to find out if the first 16byte data element has any non-A byte and
+ the offset of the first byte. There are 2 cases:
+
+ 1. The first 16byte data element has the non-A byte, including
+ EOS, at the offset X.
+ 2. The first 16byte data element is valid and doesn't have the non-A
+ byte.
+
+ Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases:
+
+ case ECX CFlag ZFlag SFlag
+ 1 X 1 0/1 0
+ 2 16 0 0 0
+
+ We exit from the loop for case 1. */
+
+extern size_t __strspn_sse2 (const char *, const char *);
+
+
+size_t
+__attribute__ ((section (".text.sse4.2")))
+__strspn_sse42 (const char *s, const char *a)
+{
+ if (*a == 0)
+ return 0;
+
+ const char *aligned;
+ __m128i mask;
+ int offset = (int) ((size_t) a & 15);
+ if (offset != 0)
+ {
+ /* Load masks. */
+ aligned = (const char *) ((size_t) a & 0xfffffffffffffff0L);
+ __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
+
+ switch (offset)
+ {
+ case 1:
+ mask = _mm_srli_si128 (mask0, 1);
+ break;
+ case 2:
+ mask = _mm_srli_si128 (mask0, 2);
+ break;
+ case 3:
+ mask = _mm_srli_si128 (mask0, 3);
+ break;
+ case 4:
+ mask = _mm_srli_si128 (mask0, 4);
+ break;
+ case 5:
+ mask = _mm_srli_si128 (mask0, 5);
+ break;
+ case 6:
+ mask = _mm_srli_si128 (mask0, 6);
+ break;
+ case 7:
+ mask = _mm_srli_si128 (mask0, 7);
+ break;
+ case 8:
+ mask = _mm_srli_si128 (mask0, 8);
+ break;
+ case 9:
+ mask = _mm_srli_si128 (mask0, 9);
+ break;
+ case 10:
+ mask = _mm_srli_si128 (mask0, 10);
+ break;
+ case 11:
+ mask = _mm_srli_si128 (mask0, 11);
+ break;
+ case 12:
+ mask = _mm_srli_si128 (mask0, 12);
+ break;
+ case 13:
+ mask = _mm_srli_si128 (mask0, 13);
+ break;
+ case 14:
+ mask = _mm_srli_si128 (mask0, 14);
+ break;
+ case 15:
+ mask = _mm_srli_si128 (mask0, 15);
+ break;
+ }
+
+ /* Find where the NULL terminator is. */
+ int length = _mm_cmpistri (mask, mask, 0x3a);
+ if (length == 16 - offset)
+ {
+ /* There is no NULL terminator. */
+ __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
+ int index = _mm_cmpistri (mask1, mask1, 0x3a);
+ length += index;
+
+ /* Don't use SSE4.2 if the length of A > 16. */
+ if (length > 16)
+ return __strspn_sse2 (s, a);
+
+ if (index != 0)
+ {
+ /* Combine mask0 and mask1. */
+ switch (offset)
+ {
+ case 1:
+ mask = _mm_alignr_epi8 (mask1, mask0, 1);
+ break;
+ case 2:
+ mask = _mm_alignr_epi8 (mask1, mask0, 2);
+ break;
+ case 3:
+ mask = _mm_alignr_epi8 (mask1, mask0, 3);
+ break;
+ case 4:
+ mask = _mm_alignr_epi8 (mask1, mask0, 4);
+ break;
+ case 5:
+ mask = _mm_alignr_epi8 (mask1, mask0, 5);
+ break;
+ case 6:
+ mask = _mm_alignr_epi8 (mask1, mask0, 6);
+ break;
+ case 7:
+ mask = _mm_alignr_epi8 (mask1, mask0, 7);
+ break;
+ case 8:
+ mask = _mm_alignr_epi8 (mask1, mask0, 8);
+ break;
+ case 9:
+ mask = _mm_alignr_epi8 (mask1, mask0, 9);
+ break;
+ case 10:
+ mask = _mm_alignr_epi8 (mask1, mask0, 10);
+ break;
+ case 11:
+ mask = _mm_alignr_epi8 (mask1, mask0, 11);
+ break;
+ case 12:
+ mask = _mm_alignr_epi8 (mask1, mask0, 12);
+ break;
+ case 13:
+ mask = _mm_alignr_epi8 (mask1, mask0, 13);
+ break;
+ case 14:
+ mask = _mm_alignr_epi8 (mask1, mask0, 14);
+ break;
+ case 15:
+ mask = _mm_alignr_epi8 (mask1, mask0, 15);
+ break;
+ }
+ }
+ }
+ }
+ else
+ {
+ /* A is aligned. */
+ mask = _mm_load_si128 ((__m128i *) a);
+
+ /* Find where the NULL terminator is. */
+ int length = _mm_cmpistri (mask, mask, 0x3a);
+ if (length == 16)
+ {
+ /* There is no NULL terminator. Don't use SSE4.2 if the length
+ of A > 16. */
+ if (a[16] != 0)
+ return __strspn_sse2 (s, a);
+ }
+ }
+
+ offset = (int) ((size_t) s & 15);
+ if (offset != 0)
+ {
+ /* Check partial string. */
+ aligned = (const char *) ((size_t) s & 0xfffffffffffffff0L);
+ __m128i value = _mm_load_si128 ((__m128i *) aligned);
+
+ switch (offset)
+ {
+ case 1:
+ value = _mm_srli_si128 (value, 1);
+ break;
+ case 2:
+ value = _mm_srli_si128 (value, 2);
+ break;
+ case 3:
+ value = _mm_srli_si128 (value, 3);
+ break;
+ case 4:
+ value = _mm_srli_si128 (value, 4);
+ break;
+ case 5:
+ value = _mm_srli_si128 (value, 5);
+ break;
+ case 6:
+ value = _mm_srli_si128 (value, 6);
+ break;
+ case 7:
+ value = _mm_srli_si128 (value, 7);
+ break;
+ case 8:
+ value = _mm_srli_si128 (value, 8);
+ break;
+ case 9:
+ value = _mm_srli_si128 (value, 9);
+ break;
+ case 10:
+ value = _mm_srli_si128 (value, 10);
+ break;
+ case 11:
+ value = _mm_srli_si128 (value, 11);
+ break;
+ case 12:
+ value = _mm_srli_si128 (value, 12);
+ break;
+ case 13:
+ value = _mm_srli_si128 (value, 13);
+ break;
+ case 14:
+ value = _mm_srli_si128 (value, 14);
+ break;
+ case 15:
+ value = _mm_srli_si128 (value, 15);
+ break;
+ }
+
+ int length = _mm_cmpistri (mask, value, 0x12);
+ /* No need to check CFlag since it is always 1. */
+ if (length < 16 - offset)
+ return length;
+ /* Find where the NULL terminator is. */
+ int index = _mm_cmpistri (value, value, 0x3a);
+ if (index < 16 - offset)
+ return length;
+ aligned += 16;
+ }
+ else
+ aligned = s;
+
+ while (1)
+ {
+ __m128i value = _mm_load_si128 ((__m128i *) aligned);
+ int index = _mm_cmpistri (mask, value, 0x12);
+ int cflag = _mm_cmpistrc (mask, value, 0x12);
+ if (cflag)
+ return (size_t) (aligned + index - s);
+ aligned += 16;
+ }
+}
diff --git a/sysdeps/x86_64/multiarch/strspn.S b/sysdeps/x86_64/multiarch/strspn.S
new file mode 100644
index 0000000000..4183a2cf60
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strspn.S
@@ -0,0 +1,63 @@
+/* Multiple versions of strspn
+ Copyright (C) 2009 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <config.h>
+
+#ifdef HAVE_SSE4_SUPPORT
+
+#include <sysdep.h>
+#include <ifunc-defines.h>
+
+/* Define multiple versions only for the definition in libc. */
+#ifndef NOT_IN_libc
+ .text
+ENTRY(strspn)
+ .type strspn, @gnu_indirect_function
+ cmpl $0, __cpu_features+KIND_OFFSET(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq __strspn_sse2(%rip), %rax
+ testl $(1<<20), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip)
+ jz 2f
+ leaq __strspn_sse42(%rip), %rax
+2: ret
+END(strspn)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __strspn_sse2, @function; \
+ .globl __strspn_sse2; \
+ .align 16; \
+ __strspn_sse2: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __strspn_sse2, .-__strspn_sse2
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal strspn calls through a PLT.
+ The speedup we get from using SSE4.2 instruction is likely eaten away
+ by the indirect call in the PLT. */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_strspn; __GI_strspn = __strspn_sse2
+#endif
+
+#endif /* HAVE_SSE4_SUPPORT */
+
+#include "../strspn.S"