Diffstat (limited to 'sysdeps/x86_64')
-rw-r--r--  sysdeps/x86_64/Makefile                  |    1
-rw-r--r--  sysdeps/x86_64/bits/link.h               |   12
-rw-r--r--  sysdeps/x86_64/dl-trampoline.S           |  267
-rwxr-xr-x  sysdeps/x86_64/elf/configure             |   25
-rw-r--r--  sysdeps/x86_64/elf/configure.in          |   11
-rw-r--r--  sysdeps/x86_64/link-defines.sym          |   28
-rw-r--r--  sysdeps/x86_64/memcmp.S                  |  359
-rw-r--r--  sysdeps/x86_64/multiarch/Makefile        |    8
-rw-r--r--  sysdeps/x86_64/multiarch/rawmemchr.S     |    1
-rw-r--r--  sysdeps/x86_64/multiarch/stpcpy.S        |    7
-rw-r--r--  sysdeps/x86_64/multiarch/stpncpy-c.c     |    8
-rw-r--r--  sysdeps/x86_64/multiarch/stpncpy.S       |    6
-rw-r--r--  sysdeps/x86_64/multiarch/strcmp.S        |    1
-rw-r--r--  sysdeps/x86_64/multiarch/strcpy.S        | 1918
-rw-r--r--  sysdeps/x86_64/multiarch/strcspn-c.c     |  312
-rw-r--r--  sysdeps/x86_64/multiarch/strcspn.S       |   82
-rw-r--r--  sysdeps/x86_64/multiarch/strlen.S        |    1
-rw-r--r--  sysdeps/x86_64/multiarch/strncpy-c.c     |    8
-rw-r--r--  sysdeps/x86_64/multiarch/strncpy.S       |    3
-rw-r--r--  sysdeps/x86_64/multiarch/strpbrk-c.c     |    4
-rw-r--r--  sysdeps/x86_64/multiarch/strpbrk.S       |    3
-rw-r--r--  sysdeps/x86_64/multiarch/strspn-c.c      |  284
-rw-r--r--  sysdeps/x86_64/multiarch/strspn.S        |   63
23 files changed, 3363 insertions(+), 49 deletions(-)
diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile
index da82093381..78fdb04fcb 100644
--- a/sysdeps/x86_64/Makefile
+++ b/sysdeps/x86_64/Makefile
@@ -4,6 +4,7 @@ long-double-fcts = yes
ifeq ($(subdir),csu)
sysdep_routines += hp-timing
elide-routines.os += hp-timing
+gen-as-const-headers += link-defines.sym
endif
ifeq ($(subdir),gmon)
diff --git a/sysdeps/x86_64/bits/link.h b/sysdeps/x86_64/bits/link.h
index 5676b78753..643a293bb0 100644
--- a/sysdeps/x86_64/bits/link.h
+++ b/sysdeps/x86_64/bits/link.h
@@ -65,10 +65,19 @@ __END_DECLS
/* Registers for entry into PLT on x86-64. */
# if __GNUC_PREREQ (4,0)
typedef float La_x86_64_xmm __attribute__ ((__vector_size__ (16)));
+typedef float La_x86_64_ymm __attribute__ ((__vector_size__ (32)));
# else
typedef float La_x86_64_xmm __attribute__ ((__mode__ (__V4SF__)));
# endif
+typedef union
+{
+# if __GNUC_PREREQ (4,0)
+ La_x86_64_ymm ymm[2];
+# endif
+ La_x86_64_xmm xmm[4];
+} La_x86_64_vector __attribute__ ((aligned(16)));
+
typedef struct La_x86_64_regs
{
uint64_t lr_rdx;
@@ -80,6 +89,7 @@ typedef struct La_x86_64_regs
uint64_t lr_rbp;
uint64_t lr_rsp;
La_x86_64_xmm lr_xmm[8];
+ La_x86_64_vector lr_vector[8];
} La_x86_64_regs;
/* Return values for calls from PLT on x86-64. */
@@ -91,6 +101,8 @@ typedef struct La_x86_64_retval
La_x86_64_xmm lrv_xmm1;
long double lrv_st0;
long double lrv_st1;
+ La_x86_64_vector lrv_vector0;
+ La_x86_64_vector lrv_vector1;
} La_x86_64_retval;
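
The new lr_vector / lrv_vector0 / lrv_vector1 members above are what let an
LD_AUDIT module observe AVX-sized argument and return registers.  As a rough
illustration (not part of this patch; apart from the documented la_version and
la_x86_64_gnu_pltexit hooks, every name below is made up), a minimal audit
module reading the extended return-value structure could look like this:

    /* audit-sketch.c -- illustrative only; build as a shared object and
       load it with LD_AUDIT.  Assumes <link.h> exposes the extended
       La_x86_64_retval shown above.  */
    #define _GNU_SOURCE 1
    #include <link.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    unsigned int
    la_version (unsigned int version)
    {
      return version;           /* accept whatever the dynamic linker offers */
    }

    unsigned int
    la_x86_64_gnu_pltexit (Elf64_Sym *sym, unsigned int ndx,
                           uintptr_t *refcook, uintptr_t *defcook,
                           const La_x86_64_regs *inregs,
                           La_x86_64_retval *outregs, const char *symname)
    {
      /* The low 16 bytes of lrv_vector0 alias lrv_xmm0; the upper half is
         only meaningful when the callee returned a value in %ymm0.  */
      float lane0;
      memcpy (&lane0, &outregs->lrv_vector0, sizeof lane0);
      printf ("%s returned; lane 0 of vector0 = %f\n", symname, lane0);
      return 0;
    }
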
diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
index d8d9bc12a4..49d239f075 100644
--- a/sysdeps/x86_64/dl-trampoline.S
+++ b/sysdeps/x86_64/dl-trampoline.S
@@ -17,7 +17,9 @@
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */
+#include <config.h>
#include <sysdep.h>
+#include <link-defines.h>
.text
.globl _dl_runtime_resolve
@@ -89,25 +91,85 @@ _dl_runtime_profile:
/* Actively align the La_x86_64_regs structure. */
andq $0xfffffffffffffff0, %rsp
- subq $192, %rsp # sizeof(La_x86_64_regs)
+# ifdef HAVE_AVX_SUPPORT
+ /* sizeof(La_x86_64_regs). Need extra space for 8 SSE registers
+ to detect if any xmm0-xmm7 registers are changed by audit
+ module. */
+ subq $(LR_SIZE + XMM_SIZE*8), %rsp
+# else
+ subq $LR_SIZE, %rsp # sizeof(La_x86_64_regs)
+# endif
movq %rsp, 24(%rbx)
- movq %rdx, (%rsp) # Fill the La_x86_64_regs structure.
- movq %r8, 8(%rsp)
- movq %r9, 16(%rsp)
- movq %rcx, 24(%rsp)
- movq %rsi, 32(%rsp)
- movq %rdi, 40(%rsp)
- movq %rbp, 48(%rsp)
+ /* Fill the La_x86_64_regs structure. */
+ movq %rdx, LR_RDX_OFFSET(%rsp)
+ movq %r8, LR_R8_OFFSET(%rsp)
+ movq %r9, LR_R9_OFFSET(%rsp)
+ movq %rcx, LR_RCX_OFFSET(%rsp)
+ movq %rsi, LR_RSI_OFFSET(%rsp)
+ movq %rdi, LR_RDI_OFFSET(%rsp)
+ movq %rbp, LR_RBP_OFFSET(%rsp)
+
leaq 48(%rbx), %rax
- movq %rax, 56(%rsp)
- movaps %xmm0, 64(%rsp)
- movaps %xmm1, 80(%rsp)
- movaps %xmm2, 96(%rsp)
- movaps %xmm3, 112(%rsp)
- movaps %xmm4, 128(%rsp)
- movaps %xmm5, 144(%rsp)
- movaps %xmm7, 160(%rsp)
+ movq %rax, LR_RSP_OFFSET(%rsp)
+
+ /* We always store the XMM registers even if AVX is available.
+	   This is to provide backward binary compatibility for existing
+ audit modules. */
+ movaps %xmm0, (LR_XMM_OFFSET)(%rsp)
+ movaps %xmm1, (LR_XMM_OFFSET + XMM_SIZE)(%rsp)
+ movaps %xmm2, (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp)
+ movaps %xmm3, (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp)
+ movaps %xmm4, (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp)
+ movaps %xmm5, (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp)
+ movaps %xmm6, (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp)
+ movaps %xmm7, (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp)
+
+# ifdef HAVE_AVX_SUPPORT
+ .data
+L(have_avx):
+ .zero 4
+ .size L(have_avx), 4
+ .previous
+
+ cmpl $0, L(have_avx)(%rip)
+ jne 1f
+ movq %rbx, %r11 # Save rbx
+ movl $1, %eax
+ cpuid
+ movq %r11,%rbx # Restore rbx
+ movl $1, %eax
+ testl $(1 << 28), %ecx
+ jne 2f
+ negl %eax
+2: movl %eax, L(have_avx)(%rip)
+ cmpl $0, %eax
+
+1: js L(no_avx1)
+
+ /* This is to support AVX audit modules. */
+ vmovdqu %ymm0, (LR_VECTOR_OFFSET)(%rsp)
+ vmovdqu %ymm1, (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp)
+ vmovdqu %ymm2, (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp)
+ vmovdqu %ymm3, (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp)
+ vmovdqu %ymm4, (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp)
+ vmovdqu %ymm5, (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp)
+ vmovdqu %ymm6, (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp)
+ vmovdqu %ymm7, (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp)
+
+ /* Save xmm0-xmm7 registers to detect if any of them are
+ changed by audit module. */
+ vmovdqa %xmm0, (LR_SIZE)(%rsp)
+ vmovdqa %xmm1, (LR_SIZE + XMM_SIZE)(%rsp)
+ vmovdqa %xmm2, (LR_SIZE + XMM_SIZE*2)(%rsp)
+ vmovdqa %xmm3, (LR_SIZE + XMM_SIZE*3)(%rsp)
+ vmovdqa %xmm4, (LR_SIZE + XMM_SIZE*4)(%rsp)
+ vmovdqa %xmm5, (LR_SIZE + XMM_SIZE*5)(%rsp)
+ vmovdqa %xmm6, (LR_SIZE + XMM_SIZE*6)(%rsp)
+ vmovdqa %xmm7, (LR_SIZE + XMM_SIZE*7)(%rsp)
+
+L(no_avx1):
+# endif
movq %rsp, %rcx # La_x86_64_regs pointer to %rcx.
movq 48(%rbx), %rdx # Load return address if needed.
@@ -119,27 +181,87 @@ _dl_runtime_profile:
movq %rax, %r11 # Save return value.
movq 8(%rbx), %rax # Get back register content.
- movq (%rsp), %rdx
- movq 8(%rsp), %r8
- movq 16(%rsp), %r9
- movaps 64(%rsp), %xmm0
- movaps 80(%rsp), %xmm1
- movaps 96(%rsp), %xmm2
- movaps 112(%rsp), %xmm3
- movaps 128(%rsp), %xmm4
- movaps 144(%rsp), %xmm5
- movaps 160(%rsp), %xmm7
-
+ movq LR_RDX_OFFSET(%rsp), %rdx
+ movq LR_R8_OFFSET(%rsp), %r8
+ movq LR_R9_OFFSET(%rsp), %r9
+
+ movaps (LR_XMM_OFFSET)(%rsp), %xmm0
+ movaps (LR_XMM_OFFSET + XMM_SIZE)(%rsp), %xmm1
+ movaps (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp), %xmm2
+ movaps (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp), %xmm3
+ movaps (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp), %xmm4
+ movaps (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp), %xmm5
+ movaps (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm6
+ movaps (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm7
+
+# ifdef HAVE_AVX_SUPPORT
+ cmpl $0, L(have_avx)(%rip)
+ js L(no_avx2)
+
+ /* Check if any xmm0-xmm7 registers are changed by audit
+ module. */
+ vpcmpeqq (LR_SIZE)(%rsp), %xmm0, %xmm8
+ vpmovmskb %xmm8, %esi
+ cmpl $0xffff, %esi
+ je 1f
+ vmovdqu (LR_VECTOR_OFFSET)(%rsp), %ymm0
+
+1: vpcmpeqq (LR_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm8
+ vpmovmskb %xmm8, %esi
+ cmpl $0xffff, %esi
+ je 1f
+ vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp), %ymm1
+
+1: vpcmpeqq (LR_SIZE + XMM_SIZE*2)(%rsp), %xmm2, %xmm8
+ vpmovmskb %xmm8, %esi
+ cmpl $0xffff, %esi
+ je 1f
+ vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp), %ymm2
+
+1: vpcmpeqq (LR_SIZE + XMM_SIZE*3)(%rsp), %xmm3, %xmm8
+ vpmovmskb %xmm8, %esi
+ cmpl $0xffff, %esi
+ je 1f
+ vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp), %ymm3
+
+1: vpcmpeqq (LR_SIZE + XMM_SIZE*4)(%rsp), %xmm4, %xmm8
+ vpmovmskb %xmm8, %esi
+ cmpl $0xffff, %esi
+ je 1f
+ vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp), %ymm4
+
+1: vpcmpeqq (LR_SIZE + XMM_SIZE*5)(%rsp), %xmm5, %xmm8
+ vpmovmskb %xmm8, %esi
+ cmpl $0xffff, %esi
+ je 1f
+ vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp), %ymm5
+
+1: vpcmpeqq (LR_SIZE + XMM_SIZE*6)(%rsp), %xmm6, %xmm8
+ vpmovmskb %xmm8, %esi
+ cmpl $0xffff, %esi
+ je 1f
+ vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp), %ymm6
+
+1: vpcmpeqq (LR_SIZE + XMM_SIZE*7)(%rsp), %xmm7, %xmm8
+ vpmovmskb %xmm8, %esi
+ cmpl $0xffff, %esi
+ je 1f
+ vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp), %ymm7
+
+L(no_avx2):
+1:
+# endif
movq 16(%rbx), %r10 # Anything in framesize?
testq %r10, %r10
- jns 1f
+ jns 3f
/* There's nothing in the frame size, so there
will be no call to the _dl_call_pltexit. */
- movq 24(%rsp), %rcx # Get back registers content.
- movq 32(%rsp), %rsi
- movq 40(%rsp), %rdi
+ /* Get back registers content. */
+ movq LR_RCX_OFFSET(%rsp), %rcx
+ movq LR_RSI_OFFSET(%rsp), %rsi
+ movq LR_RDI_OFFSET(%rsp), %rdi
movq %rbx, %rsp
movq (%rsp), %rbx
@@ -151,7 +273,7 @@ _dl_runtime_profile:
cfi_adjust_cfa_offset(-48)
jmp *%r11 # Jump to function address.
-1:
+3:
cfi_adjust_cfa_offset(48)
cfi_rel_offset(%rbx, 0)
cfi_def_cfa_register(%rbx)
@@ -161,7 +283,7 @@ _dl_runtime_profile:
temporary buffer of the size specified by the 'framesize'
returned from _dl_profile_fixup */
- leaq 56(%rbx), %rsi # stack
+ leaq LR_RSP_OFFSET(%rbx), %rsi # stack
addq $8, %r10
andq $0xfffffffffffffff0, %r10
movq %r10, %rcx
@@ -183,31 +305,80 @@ _dl_runtime_profile:
_dl_call_pltexit. The La_x86_64_regs is being pointed by rsp now,
so we just need to allocate the sizeof(La_x86_64_retval) space on
the stack, since the alignment has already been taken care of. */
-
- subq $80, %rsp # sizeof(La_x86_64_retval)
+# ifdef HAVE_AVX_SUPPORT
+ /* sizeof(La_x86_64_retval). Need extra space for 2 SSE
+ registers to detect if xmm0/xmm1 registers are changed
+ by audit module. */
+ subq $(LRV_SIZE + XMM_SIZE*2), %rsp
+# else
+ subq $LRV_SIZE, %rsp # sizeof(La_x86_64_retval)
+# endif
movq %rsp, %rcx # La_x86_64_retval argument to %rcx.
- movq %rax, (%rcx) # Fill in the La_x86_64_retval structure.
- movq %rdx, 8(%rcx)
- movaps %xmm0, 16(%rcx)
- movaps %xmm1, 32(%rcx)
- fstpt 48(%rcx)
- fstpt 64(%rcx)
+ /* Fill in the La_x86_64_retval structure. */
+ movq %rax, LRV_RAX_OFFSET(%rcx)
+ movq %rdx, LRV_RDX_OFFSET(%rcx)
+
+ movaps %xmm0, LRV_XMM0_OFFSET(%rcx)
+ movaps %xmm1, LRV_XMM1_OFFSET(%rcx)
+
+# ifdef HAVE_AVX_SUPPORT
+ cmpl $0, L(have_avx)(%rip)
+ js L(no_avx3)
+
+ /* This is to support AVX audit modules. */
+ vmovdqu %ymm0, LRV_VECTOR0_OFFSET(%rcx)
+ vmovdqu %ymm1, LRV_VECTOR1_OFFSET(%rcx)
+
+ /* Save xmm0/xmm1 registers to detect if they are changed
+ by audit module. */
+ vmovdqa %xmm0, (LRV_SIZE)(%rcx)
+ vmovdqa %xmm1, (LRV_SIZE + XMM_SIZE)(%rcx)
+
+L(no_avx3):
+# endif
+
+ fstpt LRV_ST0_OFFSET(%rcx)
+ fstpt LRV_ST1_OFFSET(%rcx)
movq 24(%rbx), %rdx # La_x86_64_regs argument to %rdx.
movq 40(%rbx), %rsi # Copy args pushed by PLT in register.
movq 32(%rbx), %rdi # %rdi: link_map, %rsi: reloc_index
call _dl_call_pltexit
- movq (%rsp), %rax # Restore return registers.
- movq 8(%rsp), %rdx
- movaps 16(%rsp), %xmm0
- movaps 32(%rsp), %xmm1
- fldt 64(%rsp)
- fldt 48(%rsp)
+ /* Restore return registers. */
+ movq LRV_RAX_OFFSET(%rsp), %rax
+ movq LRV_RDX_OFFSET(%rsp), %rdx
+
+ movaps LRV_XMM0_OFFSET(%rsp), %xmm0
+ movaps LRV_XMM1_OFFSET(%rsp), %xmm1
+
+# ifdef HAVE_AVX_SUPPORT
+ cmpl $0, L(have_avx)(%rip)
+ js L(no_avx4)
+
+ /* Check if xmm0/xmm1 registers are changed by audit module. */
+ vpcmpeqq (LRV_SIZE)(%rsp), %xmm0, %xmm2
+ vpmovmskb %xmm2, %esi
+ cmpl $0xffff, %esi
+ je 1f
+ vmovdqu LRV_VECTOR0_OFFSET(%rsp), %ymm0
+
+1: vpcmpeqq (LRV_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm2
+ vpmovmskb %xmm2, %esi
+ cmpl $0xffff, %esi
+ je 1f
+ vmovdqu LRV_VECTOR1_OFFSET(%rsp), %ymm1
+
+L(no_avx4):
+1:
+# endif
+
+ fldt LRV_ST1_OFFSET(%rsp)
+ fldt LRV_ST0_OFFSET(%rsp)
movq %rbx, %rsp
- movq (%rsp), %rbx
+ movq (%rsp), %rbx
cfi_restore(rbx)
cfi_def_cfa_register(%rsp)
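
The trampoline above probes for AVX exactly once, caching the result of CPUID
leaf 1, ECX bit 28 in L(have_avx) (positive for present, negative for absent).
A rough C analogue of that check, using GCC's <cpuid.h> (sketch only; the
helper name is made up):

    /* avx-check.c -- C analogue of the L(have_avx) probe above.  */
    #include <cpuid.h>

    static int have_avx;        /* 0 = not probed yet, 1 = yes, -1 = no */

    static int
    cpu_has_avx (void)
    {
      if (have_avx == 0)
        {
          unsigned int eax, ebx, ecx, edx;
          /* Leaf 1, ECX bit 28 is the AVX feature flag the trampoline tests.  */
          if (__get_cpuid (1, &eax, &ebx, &ecx, &edx) && (ecx & (1u << 28)))
            have_avx = 1;
          else
            have_avx = -1;
        }
      return have_avx > 0;
    }
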
diff --git a/sysdeps/x86_64/elf/configure b/sysdeps/x86_64/elf/configure
index 774654997d..221e74c2b8 100755
--- a/sysdeps/x86_64/elf/configure
+++ b/sysdeps/x86_64/elf/configure
@@ -79,3 +79,28 @@ cat >>confdefs.h <<\_ACEOF
#define PI_STATIC_AND_HIDDEN 1
_ACEOF
+
+{ $as_echo "$as_me:$LINENO: checking for AVX support" >&5
+$as_echo_n "checking for AVX support... " >&6; }
+if test "${libc_cv_cc_avx+set}" = set; then
+ $as_echo_n "(cached) " >&6
+else
+ if { ac_try='${CC-cc} -mavx -xc /dev/null -S -o /dev/null'
+ { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5
+ (eval $ac_try) 2>&5
+ ac_status=$?
+ $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); }; }; then
+ libc_cv_cc_avx=yes
+else
+ libc_cv_cc_avx=no
+fi
+fi
+{ $as_echo "$as_me:$LINENO: result: $libc_cv_cc_avx" >&5
+$as_echo "$libc_cv_cc_avx" >&6; }
+if test $libc_cv_cc_avx = yes; then
+ cat >>confdefs.h <<\_ACEOF
+#define HAVE_AVX_SUPPORT 1
+_ACEOF
+
+fi
diff --git a/sysdeps/x86_64/elf/configure.in b/sysdeps/x86_64/elf/configure.in
index 9cb59d009c..14d1875302 100644
--- a/sysdeps/x86_64/elf/configure.in
+++ b/sysdeps/x86_64/elf/configure.in
@@ -32,3 +32,14 @@ fi
dnl It is always possible to access static and hidden symbols in an
dnl position independent way.
AC_DEFINE(PI_STATIC_AND_HIDDEN)
+
+dnl Check if -mavx works.
+AC_CACHE_CHECK(for AVX support, libc_cv_cc_avx, [dnl
+if AC_TRY_COMMAND([${CC-cc} -mavx -xc /dev/null -S -o /dev/null]); then
+ libc_cv_cc_avx=yes
+else
+ libc_cv_cc_avx=no
+fi])
+if test $libc_cv_cc_avx = yes; then
+ AC_DEFINE(HAVE_AVX_SUPPORT)
+fi
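
Both the generated configure script and the configure.in fragment above only
ask whether the compiler accepts -mavx (compiling /dev/null with -S).  A
slightly stricter hand-run probe in the same spirit would compile an actual
AVX intrinsic; this file and its name are illustrative, not part of the patch:

    /* avx-probe.c -- builds only if the compiler understands -mavx,
       e.g. "cc -mavx -S avx-probe.c".  */
    #include <immintrin.h>

    __m256
    probe (void)
    {
      return _mm256_setzero_ps ();
    }
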
diff --git a/sysdeps/x86_64/link-defines.sym b/sysdeps/x86_64/link-defines.sym
new file mode 100644
index 0000000000..1694d883ad
--- /dev/null
+++ b/sysdeps/x86_64/link-defines.sym
@@ -0,0 +1,28 @@
+#include "link.h"
+#include <stddef.h>
+
+--
+VECTOR_SIZE sizeof (La_x86_64_vector)
+XMM_SIZE sizeof (La_x86_64_xmm)
+
+LR_SIZE sizeof (struct La_x86_64_regs)
+LR_RDX_OFFSET offsetof (struct La_x86_64_regs, lr_rdx)
+LR_R8_OFFSET offsetof (struct La_x86_64_regs, lr_r8)
+LR_R9_OFFSET offsetof (struct La_x86_64_regs, lr_r9)
+LR_RCX_OFFSET offsetof (struct La_x86_64_regs, lr_rcx)
+LR_RSI_OFFSET offsetof (struct La_x86_64_regs, lr_rsi)
+LR_RDI_OFFSET offsetof (struct La_x86_64_regs, lr_rdi)
+LR_RBP_OFFSET offsetof (struct La_x86_64_regs, lr_rbp)
+LR_RSP_OFFSET offsetof (struct La_x86_64_regs, lr_rsp)
+LR_XMM_OFFSET offsetof (struct La_x86_64_regs, lr_xmm)
+LR_VECTOR_OFFSET offsetof (struct La_x86_64_regs, lr_vector)
+
+LRV_SIZE sizeof (struct La_x86_64_retval)
+LRV_RAX_OFFSET offsetof (struct La_x86_64_retval, lrv_rax)
+LRV_RDX_OFFSET offsetof (struct La_x86_64_retval, lrv_rdx)
+LRV_XMM0_OFFSET offsetof (struct La_x86_64_retval, lrv_xmm0)
+LRV_XMM1_OFFSET offsetof (struct La_x86_64_retval, lrv_xmm1)
+LRV_ST0_OFFSET offsetof (struct La_x86_64_retval, lrv_st0)
+LRV_ST1_OFFSET offsetof (struct La_x86_64_retval, lrv_st1)
+LRV_VECTOR0_OFFSET offsetof (struct La_x86_64_retval, lrv_vector0)
+LRV_VECTOR1_OFFSET offsetof (struct La_x86_64_retval, lrv_vector1)
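
Each line of link-defines.sym is turned by gen-as-const into an assembler
constant (the LR_*, LRV_* and *_SIZE symbols used in dl-trampoline.S),
computed from the C structures in bits/link.h.  The values can be
cross-checked from C with sizeof/offsetof; a small, hypothetical test program
(not part of the build):

    /* print-link-defines.c -- prints a few of the constants that
       link-defines.sym encodes (sanity-check sketch only).  */
    #include <stddef.h>
    #include <stdio.h>
    #include <link.h>

    int
    main (void)
    {
      printf ("LR_SIZE            = %zu\n", sizeof (struct La_x86_64_regs));
      printf ("LR_XMM_OFFSET      = %zu\n",
              offsetof (struct La_x86_64_regs, lr_xmm));
      printf ("LR_VECTOR_OFFSET   = %zu\n",
              offsetof (struct La_x86_64_regs, lr_vector));
      printf ("LRV_SIZE           = %zu\n", sizeof (struct La_x86_64_retval));
      printf ("LRV_VECTOR0_OFFSET = %zu\n",
              offsetof (struct La_x86_64_retval, lrv_vector0));
      return 0;
    }
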
diff --git a/sysdeps/x86_64/memcmp.S b/sysdeps/x86_64/memcmp.S
new file mode 100644
index 0000000000..a9fe13ae58
--- /dev/null
+++ b/sysdeps/x86_64/memcmp.S
@@ -0,0 +1,359 @@
+/* memcmp with SSE2
+ Copyright (C) 2009 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+
+ .text
+ENTRY (memcmp)
+ test %rdx, %rdx
+ jz L(finz)
+ cmpq $1, %rdx
+ jle L(finr1b)
+ subq %rdi, %rsi
+ movq %rdx, %r10
+ cmpq $32, %r10
+ jge L(gt32)
+ /* Handle small chunks and last block of less than 32 bytes. */
+L(small):
+ testq $1, %r10
+ jz L(s2b)
+ movzbl (%rdi), %eax
+ movzbl (%rdi, %rsi), %edx
+ subq $1, %r10
+ je L(finz1)
+ addq $1, %rdi
+ subl %edx, %eax
+ jnz L(exit)
+L(s2b):
+ testq $2, %r10
+ jz L(s4b)
+ movzwl (%rdi), %eax
+ movzwl (%rdi, %rsi), %edx
+ subq $2, %r10
+ je L(fin2_7)
+ addq $2, %rdi
+ cmpl %edx, %eax
+ jnz L(fin2_7)
+L(s4b):
+ testq $4, %r10
+ jz L(s8b)
+ movl (%rdi), %eax
+ movl (%rdi, %rsi), %edx
+ subq $4, %r10
+ je L(fin2_7)
+ addq $4, %rdi
+ cmpl %edx, %eax
+ jnz L(fin2_7)
+L(s8b):
+ testq $8, %r10
+ jz L(s16b)
+ movq (%rdi), %rax
+ movq (%rdi, %rsi), %rdx
+ subq $8, %r10
+ je L(fin2_7)
+ addq $8, %rdi
+ cmpq %rdx, %rax
+ jnz L(fin2_7)
+L(s16b):
+ movdqu (%rdi), %xmm1
+ movdqu (%rdi, %rsi), %xmm0
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ xorl %eax, %eax
+ subl $0xffff, %edx
+ jz L(finz)
+ bsfl %edx, %ecx
+ leaq (%rdi, %rcx), %rcx
+ movzbl (%rcx), %eax
+ movzbl (%rsi, %rcx), %edx
+ jmp L(finz1)
+
+ .p2align 4,, 4
+L(finr1b):
+ movzbl (%rdi), %eax
+ movzbl (%rsi), %edx
+L(finz1):
+ subl %edx, %eax
+L(exit):
+ ret
+
+ .p2align 4,, 4
+L(fin2_7):
+ cmpq %rdx, %rax
+ jz L(finz)
+ movq %rax, %r11
+ subq %rdx, %r11
+ bsfq %r11, %rcx
+ sarq $3, %rcx
+ salq $3, %rcx
+ sarq %cl, %rax
+ movzbl %al, %eax
+ sarq %cl, %rdx
+ movzbl %dl, %edx
+ subl %edx, %eax
+ ret
+
+ .p2align 4,, 4
+L(finz):
+ xorl %eax, %eax
+ ret
+
+	/* For blocks bigger than 32 bytes:
+		1. Advance one of the addr pointers to be 16B aligned.
+		2. Treat the case of both addr pointers aligned to 16B
+		   separately to avoid movdqu.
+		3. Handle any blocks of greater than 64 consecutive bytes with
+		   unrolling to reduce branches.
+		4. At least one addr pointer is 16B aligned, use memory version
+		   of pcmpeqb.
+	*/
+ .p2align 4,, 4
+L(gt32):
+ movq %rdx, %r11
+ addq %rdi, %r11
+ movq %rdi, %r8
+
+ andq $15, %r8
+ jz L(16am)
+ /* Both pointers may be misaligned. */
+ movdqu (%rdi), %xmm1
+ movdqu (%rdi, %rsi), %xmm0
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ neg %r8
+ leaq 16(%rdi, %r8), %rdi
+L(16am):
+ /* Handle two 16B aligned pointers separately. */
+ testq $15, %rsi
+ jz L(ATR)
+ testq $16, %rdi
+ jz L(A32)
+ movdqu (%rdi, %rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+L(A32):
+ movq %r11, %r10
+ andq $-32, %r10
+ cmpq %r10, %rdi
+ jge L(mt16)
+ /* Pre-unroll to be ready for unrolled 64B loop. */
+ testq $32, %rdi
+ jz L(A64)
+ movdqu (%rdi,%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+
+ movdqu (%rdi,%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+
+L(A64):
+ movq %r11, %r10
+ andq $-64, %r10
+ cmpq %r10, %rdi
+ jge L(mt32)
+
+L(A64main):
+ movdqu (%rdi,%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+
+ movdqu (%rdi,%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+
+ movdqu (%rdi,%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+
+ movdqu (%rdi,%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+
+ cmpq %rdi, %r10
+ jne L(A64main)
+
+L(mt32):
+ movq %r11, %r10
+ andq $-32, %r10
+ cmpq %r10, %rdi
+ jge L(mt16)
+
+L(A32main):
+ movdqu (%rdi,%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+
+ movdqu (%rdi,%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+
+ cmpq %rdi, %r10
+ jne L(A32main)
+L(mt16):
+ subq %rdi, %r11
+ je L(finz)
+ movq %r11, %r10
+ jmp L(small)
+
+ .p2align 4,, 4
+L(neq):
+ bsfl %edx, %ecx
+ movzbl (%rdi, %rcx), %eax
+ addq %rdi, %rsi
+ movzbl (%rsi,%rcx), %edx
+ jmp L(finz1)
+
+ .p2align 4,, 4
+L(ATR):
+ movq %r11, %r10
+ andq $-32, %r10
+ cmpq %r10, %rdi
+ jge L(mt16)
+ testq $16, %rdi
+ jz L(ATR32)
+
+ movdqa (%rdi,%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+ cmpq %rdi, %r10
+ je L(mt16)
+
+L(ATR32):
+ movq %r11, %r10
+ andq $-64, %r10
+ testq $32, %rdi
+ jz L(ATR64)
+
+ movdqa (%rdi,%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+
+ movdqa (%rdi,%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+
+L(ATR64):
+ cmpq %rdi, %r10
+ je L(mt32)
+
+L(ATR64main):
+ movdqa (%rdi,%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+
+ movdqa (%rdi,%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+
+ movdqa (%rdi,%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+
+ movdqa (%rdi,%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+ cmpq %rdi, %r10
+ jne L(ATR64main)
+
+ movq %r11, %r10
+ andq $-32, %r10
+ cmpq %r10, %rdi
+ jge L(mt16)
+
+L(ATR32res):
+ movdqa (%rdi,%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+
+ movdqa (%rdi,%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+
+ cmpq %r10, %rdi
+ jne L(ATR32res)
+
+ subq %rdi, %r11
+ je L(finz)
+ movq %r11, %r10
+ jmp L(small)
+	/* Align to 16 bytes to improve instruction fetch.  */
+ .p2align 4,, 4
+END(memcmp)
+
+#undef bcmp
+weak_alias (memcmp, bcmp)
+libc_hidden_builtin_def (memcmp)
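
The core step of the SSE2 memcmp above is pcmpeqb followed by pmovmskb: 16
bytes are compared at once and the first mismatching byte is located from the
resulting mask.  A self-contained intrinsics sketch of one such 16-byte step
(illustrative, not the glibc code; both buffers are assumed to have at least
16 readable bytes):

    /* cmp16-sketch.c -- one 16-byte comparison in the style of the
       pcmpeqb/pmovmskb sequence above.  */
    #include <emmintrin.h>

    /* Return 0 if the 16 bytes at a and b are equal, otherwise the signed
       difference of the first differing byte pair.  */
    static int
    cmp16 (const unsigned char *a, const unsigned char *b)
    {
      __m128i va = _mm_loadu_si128 ((const __m128i *) a);
      __m128i vb = _mm_loadu_si128 ((const __m128i *) b);
      /* 0xff in every byte lane that matches, 0x00 where they differ.  */
      unsigned int mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (va, vb));
      if (mask == 0xffff)
        return 0;                               /* all 16 bytes equal */
      unsigned int idx = __builtin_ctz (~mask & 0xffff);  /* first mismatch */
      return (int) a[idx] - (int) b[idx];
    }
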
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 1c35e1ffb4..71e85f0652 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -4,5 +4,11 @@ gen-as-const-headers += ifunc-defines.sym
endif
ifeq ($(subdir),string)
-sysdep_routines += strncmp-c
+sysdep_routines += stpncpy-c strncpy-c strncmp-c
+ifeq (yes,$(config-cflags-sse4))
+sysdep_routines += strcspn-c strpbrk-c strspn-c
+CFLAGS-strcspn-c.c += -msse4
+CFLAGS-strpbrk-c.c += -msse4
+CFLAGS-strspn-c.c += -msse4
+endif
endif
diff --git a/sysdeps/x86_64/multiarch/rawmemchr.S b/sysdeps/x86_64/multiarch/rawmemchr.S
index 93ca631633..d4f265f430 100644
--- a/sysdeps/x86_64/multiarch/rawmemchr.S
+++ b/sysdeps/x86_64/multiarch/rawmemchr.S
@@ -77,6 +77,7 @@ __rawmemchr_sse42:
# undef ENTRY
# define ENTRY(name) \
.type __rawmemchr_sse2, @function; \
+ .align 16; \
__rawmemchr_sse2: cfi_startproc; \
CALL_MCOUNT
# undef END
diff --git a/sysdeps/x86_64/multiarch/stpcpy.S b/sysdeps/x86_64/multiarch/stpcpy.S
new file mode 100644
index 0000000000..b63d308edc
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpcpy.S
@@ -0,0 +1,7 @@
+#define USE_AS_STPCPY
+#define STRCPY __stpcpy
+#include "strcpy.S"
+
+weak_alias (__stpcpy, stpcpy)
+libc_hidden_def (__stpcpy)
+libc_hidden_builtin_def (stpcpy)
diff --git a/sysdeps/x86_64/multiarch/stpncpy-c.c b/sysdeps/x86_64/multiarch/stpncpy-c.c
new file mode 100644
index 0000000000..2fde77dcab
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpncpy-c.c
@@ -0,0 +1,8 @@
+#define STPNCPY __stpncpy_sse2
+#ifdef SHARED
+#undef libc_hidden_def
+#define libc_hidden_def(name) \
+ __hidden_ver1 (__stpncpy_sse2, __GI___stpncpy, __stpncpy_sse2);
+#endif
+
+#include "stpncpy.c"
diff --git a/sysdeps/x86_64/multiarch/stpncpy.S b/sysdeps/x86_64/multiarch/stpncpy.S
new file mode 100644
index 0000000000..ff89a89491
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpncpy.S
@@ -0,0 +1,6 @@
+#define STRCPY __stpncpy
+#define USE_AS_STPCPY
+#define USE_AS_STRNCPY
+#include "strcpy.S"
+
+weak_alias (__stpncpy, stpncpy)
diff --git a/sysdeps/x86_64/multiarch/strcmp.S b/sysdeps/x86_64/multiarch/strcmp.S
index 2f4bf17d95..37985036aa 100644
--- a/sysdeps/x86_64/multiarch/strcmp.S
+++ b/sysdeps/x86_64/multiarch/strcmp.S
@@ -1659,6 +1659,7 @@ LABEL(unaligned_table):
# undef ENTRY
# define ENTRY(name) \
.type STRCMP_SSE2, @function; \
+ .align 16; \
STRCMP_SSE2: cfi_startproc; \
CALL_MCOUNT
# undef END
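
The new strcpy.S below selects between __strcpy_sse2 and __strcpy_ssse3 at
relocation time through a gnu_indirect_function (IFUNC) resolver, and also
skips SSSE3 on Atom.  The same dispatch pattern can be written in C with GCC's
ifunc attribute; in this sketch everything except the attribute and the
__builtin_cpu_* builtins is a made-up name:

    /* ifunc-sketch.c -- C-level analogue of the STRCPY IFUNC dispatch.  */
    #include <string.h>

    typedef char *strcpy_fn_t (char *, const char *);

    /* Stand-ins for the __strcpy_sse2 / __strcpy_ssse3 entry points.  */
    static char *my_strcpy_sse2 (char *d, const char *s)  { return strcpy (d, s); }
    static char *my_strcpy_ssse3 (char *d, const char *s) { return strcpy (d, s); }

    /* The resolver runs once while relocations are processed; whatever it
       returns becomes the target of every later call through the PLT.  */
    static strcpy_fn_t *
    resolve_my_strcpy (void)
    {
      __builtin_cpu_init ();
      return __builtin_cpu_supports ("ssse3") ? my_strcpy_ssse3 : my_strcpy_sse2;
    }

    char *my_strcpy (char *, const char *)
         __attribute__ ((ifunc ("resolve_my_strcpy")));
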
diff --git a/sysdeps/x86_64/multiarch/strcpy.S b/sysdeps/x86_64/multiarch/strcpy.S
new file mode 100644
index 0000000000..25cd01307d
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcpy.S
@@ -0,0 +1,1918 @@
+/* strcpy with SSSE3
+ Copyright (C) 2009 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+#include <ifunc-defines.h>
+
+#if !defined (USE_AS_STPCPY) && !defined (USE_AS_STRNCPY)
+# ifndef STRCPY
+# define STRCPY strcpy
+# endif
+#endif
+
+#ifdef USE_AS_STPCPY
+# ifdef USE_AS_STRNCPY
+# define STRCPY_SSSE3 __stpncpy_ssse3
+# define STRCPY_SSE2 __stpncpy_sse2
+# define __GI_STRCPY __GI_stpncpy
+# else
+# define STRCPY_SSSE3 __stpcpy_ssse3
+# define STRCPY_SSE2 __stpcpy_sse2
+# define __GI_STRCPY __GI_stpcpy
+# define __GI___STRCPY __GI___stpcpy
+# endif
+#else
+# ifdef USE_AS_STRNCPY
+# define STRCPY_SSSE3 __strncpy_ssse3
+# define STRCPY_SSE2 __strncpy_sse2
+# define __GI_STRCPY __GI_strncpy
+# else
+# define STRCPY_SSSE3 __strcpy_ssse3
+# define STRCPY_SSE2 __strcpy_sse2
+# define __GI_STRCPY __GI_strcpy
+# endif
+#endif
+
+#ifndef LABEL
+#define LABEL(l) L(l)
+#endif
+
+/* Define multiple versions only for the definition in libc. */
+#ifndef NOT_IN_libc
+ .text
+ENTRY(STRCPY)
+ .type STRCPY, @gnu_indirect_function
+ cmpl $0, __cpu_features+KIND_OFFSET(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq STRCPY_SSE2(%rip), %rax
+ testl $(1<<9), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip)
+ jz 3f
+/* Avoid SSSE3 strcpy on Atom since it is slow. */
+ cmpl $1, __cpu_features+KIND_OFFSET(%rip)
+ jne 2f
+ cmpl $6, __cpu_features+FAMILY_OFFSET(%rip)
+ jne 2f
+ cmpl $28, __cpu_features+MODEL_OFFSET(%rip)
+ jz 3f
+2: leaq STRCPY_SSSE3(%rip), %rax
+3: ret
+END(STRCPY)
+
+ .section .text.ssse3,"ax",@progbits
+STRCPY_SSSE3:
+ cfi_startproc
+ CALL_MCOUNT
+
+/*
+ * This implementation uses SSE to copy up to 16 bytes at a time.
+ */
+#ifdef USE_AS_STRNCPY
+ test %rdx, %rdx
+ jz LABEL(strncpy_exitz)
+ mov %rdx, %r8
+#else
+ xor %edx, %edx
+#endif
+ mov %esi, %ecx
+ and $0xfffffffffffffff0, %rsi /*force rsi 16 byte align*/
+ and $15, %ecx
+ mov %rdi, %rax /*store return parameter*/
+
+
+ pxor %xmm0, %xmm0 /* clear %xmm0 */
+ pcmpeqb (%rsi), %xmm0 /* compare 16 bytes in (%rsi) and %xmm0 for equality, try to find null char*/
+ pmovmskb %xmm0, %edx /* move each byte mask of %xmm0 to edx*/
+ shr %cl, %edx /* get real bits left in edx*/
+ test %edx, %edx /* edx must be 0 if there is no null char from rsi+%rcx */
+ jnz LABEL(less16bytes)
+
+#ifdef USE_AS_STRNCPY
+ lea -16(%r8,%rcx), %r11
+ cmp $0, %r11
+ jle LABEL(less16bytes) /* if r8 + rcx <= 16, branch to less16bytes. */
+#endif
+
+ mov %rcx, %r9
+ or %edi, %ecx
+ and $15, %ecx
+ lea -16(%r9), %r10
+ jz LABEL(ashr_0) /* ecx must be 0 if offset of rsi and rdi is 16 byte align*/
+
+ neg %r10 /* store the rest in rsi aligned 16 bytes for unaligned_exit*/
+
+ pxor %xmm0, %xmm0 /* clear %xmm0, may be polluted by unaligned operation*/
+ pcmpeqb 16(%rsi), %xmm0 /* compare 16 bytes in (%rsi) and %xmm0 for equality, try to find null char*/
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(less32bytes)
+ /*
+ * at least 16 byte available to fill destination rdi
+ */
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(less32bytes_strncpy_truncation)
+#endif
+ mov (%rsi, %r9), %rdx
+ mov %rdx, (%rdi)
+ mov 8(%rsi, %r9), %rdx
+ mov %rdx, 8(%rdi)
+
+ /*
+	 * so far destination rdi may be aligned to 16 bytes; re-calculate rsi
+	 * to jump to the corresponding case
+ * rcx is offset of rsi
+ * rax is offset of rdi
+ */
+
+ and $0xfffffffffffffff0, %rdi /* force rdi 16 byte align */
+	mov %rax, %rdx	/* rax stores original rdi */
+ xor %rdi, %rdx /* equal to and $15, %rdx */
+#ifdef USE_AS_STRNCPY
+ add %rdx, %r8
+#endif
+
+ add $16, %rdi /* next 16 bytes for rdi */
+ sub %rdx, %r9
+
+ lea 16(%r9, %rsi), %rsi /*re-calculate rsi by (16 - rdx)+ rcx */
+ mov %esi, %ecx /*store offset of rsi */
+ and $0xfffffffffffffff0, %rsi /* force rsi 16 byte align */
+
+ and $15, %ecx /* ecx must be 0 if rdx is equal to rcx*/
+ jz LABEL(ashr_0)
+
+ lea -16(%rcx), %r10
+ mov %rcx, %r9
+ neg %r10
+ lea LABEL(unaligned_table)(%rip), %r11
+ movslq (%r11, %rcx,4), %rcx
+ lea (%r11, %rcx), %rcx
+ jmp *%rcx
+
+ /*
+ * The following cases will be handled by ashr_0 & ashr_0_start
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * 0 0 0 ashr_0
+ * n(1~15) n(1~15) 0 ashr_0_start
+ *
+ */
+ .p2align 5
+LABEL(ashr_0):
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_aligned)
+#endif
+ movdqa (%rsi), %xmm1 /* fetch first 16 bytes from rsi */
+ movdqa %xmm1, (%rdi) /* store first 16 bytes into rdi */
+ add $16, %rsi
+ add $16, %rdi
+ pcmpeqb (%rsi), %xmm0 /* compare 16 bytes in (%rsi) and %xmm0 for equality, try to find null char */
+ pmovmskb %xmm0, %edx /* move each byte mask of %xmm0 to edx*/
+
+ test %edx, %edx /* edx must be 0 if there is no null char in rsi*/
+ jnz LABEL(aligned_16bytes)
+
+LABEL(ashr_0_loop):
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_aligned)
+#endif
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa %xmm1, (%rdi, %rcx)
+ add $16, %rcx
+ pcmpeqb (%rsi, %rcx), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(aligned_exit)
+
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_aligned)
+#endif
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa %xmm1, (%rdi, %rcx)
+ add $16, %rcx
+ pcmpeqb (%rsi, %rcx), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(aligned_exit)
+
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_aligned)
+#endif
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa %xmm1, (%rdi, %rcx)
+ add $16, %rcx
+ pcmpeqb (%rsi, %rcx), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(aligned_exit)
+
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_aligned)
+#endif
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa %xmm1, (%rdi, %rcx)
+ add $16, %rcx
+ pcmpeqb (%rsi, %rcx), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jz LABEL(ashr_0_loop)
+
+ jmp LABEL(aligned_exit)
+ .p2align 4
+
+/*
+ * The following cases will be handled by ashr_15
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(15) n - 15 15((16 - (n -15) + n)%16 ashr_15
+ *
+ * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+ .p2align 4
+LABEL(ashr_15):
+ xor %ecx, %ecx /*clear ecx */
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ .p2align 4
+LABEL(ashr_15_use_ssse3):
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $15, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $15, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_15_use_ssse3)
+
+/*
+ * The following cases will be handled by ashr_14
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(14~15) n - 14 14((16 - (n -14) + n)%16 ashr_14
+ *
+ * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+ .p2align 4
+LABEL(ashr_14):
+ xor %ecx, %ecx /*clear ecx */
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ .p2align 4
+LABEL(ashr_14_use_ssse3):
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $14, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $14, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_14_use_ssse3)
+
+/*
+ * The following cases will be handled by ashr_13
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(13~15) n - 13 13((16 - (n -13) + n)%16 ashr_13
+ *
+ * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+ .p2align 4
+LABEL(ashr_13):
+ xor %ecx, %ecx /*clear ecx */
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ .p2align 4
+LABEL(ashr_13_use_ssse3):
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $13, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $13, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_13_use_ssse3)
+
+/*
+ * The following cases will be handled by ashr_12
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(12~15) n - 12 12((16 - (n -12) + n)%16 ashr_12
+ *
+ * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+ .p2align 4
+LABEL(ashr_12):
+ xor %ecx, %ecx /*clear ecx */
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ .p2align 4
+LABEL(ashr_12_use_ssse3):
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $12, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $12, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_12_use_ssse3)
+
+/*
+ * The following cases will be handled by ashr_11
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(11~15) n - 11 11((16 - (n -11) + n)%16 ashr_11
+ *
+ * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+ .p2align 4
+LABEL(ashr_11):
+ xor %ecx, %ecx /*clear ecx */
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ .p2align 4
+LABEL(ashr_11_use_ssse3):
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $11, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $11, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_11_use_ssse3)
+
+/*
+ * The following cases will be handled by ashr_10
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(10~15) n - 10 10((16 - (n -10) + n)%16 ashr_10
+ *
+ * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+ .p2align 4
+LABEL(ashr_10):
+ xor %ecx, %ecx /*clear ecx */
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ .p2align 4
+LABEL(ashr_10_use_ssse3):
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $10, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $10, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_10_use_ssse3)
+
+/*
+ * The following cases will be handled by ashr_9
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(9~15) n - 9 9((16 - (n -9) + n)%16 ashr_9
+ *
+ * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+ .p2align 4
+LABEL(ashr_9):
+ xor %ecx, %ecx /*clear ecx */
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ .p2align 4
+LABEL(ashr_9_use_ssse3):
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $9, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $9, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_9_use_ssse3)
+
+/*
+ * The following cases will be handled by ashr_8
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(8~15) n - 8 8((16 - (n -8) + n)%16 ashr_8
+ *
+ * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+ .p2align 4
+LABEL(ashr_8):
+ xor %ecx, %ecx /*clear ecx */
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ .p2align 4
+LABEL(ashr_8_use_ssse3):
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $8, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $8, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_8_use_ssse3)
+
+/*
+ * The following cases will be handled by ashr_7
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(7~15) n - 7 7((16 - (n -7) + n)%16 ashr_7
+ *
+ * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+ .p2align 4
+LABEL(ashr_7):
+ xor %ecx, %ecx /*clear ecx */
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ .p2align 4
+
+LABEL(ashr_7_use_ssse3):
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $7, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $7, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_7_use_ssse3)
+
+/*
+ * The following cases will be handled by ashr_6
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(6~15) n - 6 6((16 - (n -6) + n)%16 ashr_6
+ *
+ * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+ .p2align 4
+LABEL(ashr_6):
+ xor %ecx, %ecx /*clear ecx */
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ .p2align 4
+LABEL(ashr_6_use_ssse3):
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $6, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $6, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_6_use_ssse3)
+
+ /*
+ * The following cases will be handled by ashr_5
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(5~15) n - 5 5((16 - (n -5) + n)%16 ashr_5
+ *
+ * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+ .p2align 4
+LABEL(ashr_5):
+ xor %ecx, %ecx /*clear ecx */
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ .p2align 4
+LABEL(ashr_5_use_ssse3):
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $5, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $5, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_5_use_ssse3)
+
+/*
+ *
+ * The following cases will be handled by ashr_4
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(4~15) n - 4 4((16 - (n -4) + n)%16 ashr_4
+ *
+ * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+ .p2align 4
+LABEL(ashr_4):
+ xor %ecx, %ecx /*clear ecx */
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ .p2align 4
+LABEL(ashr_4_use_ssse3):
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $4, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $4, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_4_use_ssse3)
+
+/*
+ *
+ * The following cases will be handled by ashr_3
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(3~15) n - 3 3((16 - (n -3) + n)%16 ashr_3
+ *
+ * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+ .p2align 4
+LABEL(ashr_3):
+ xor %ecx, %ecx /*clear ecx */
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ .p2align 4
+LABEL(ashr_3_use_ssse3):
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $3, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $3, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_3_use_ssse3)
+
+/*
+ *
+ * The following cases will be handled by ashr_2
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(2~15) n - 2 2((16 - (n -2) + n)%16 ashr_2
+ *
+ * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+ .p2align 4
+LABEL(ashr_2):
+ xor %ecx, %ecx /*clear ecx */
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ .p2align 4
+LABEL(ashr_2_use_ssse3):
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $2, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $2, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_2_use_ssse3)
+
+/*
+ *
+ * The following cases will be handled by ashr_1
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(1~15) n - 1 1 ((16 - (n -1) + n)%16 ashr_1
+ *
+ * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+ .p2align 4
+LABEL(ashr_1):
+ xor %ecx, %ecx /*clear ecx */
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ .p2align 4
+LABEL(ashr_1_use_ssse3):
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+
+ palignr $1, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+
+ movdqa 16(%rsi, %rcx), %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe LABEL(strncpy_truncation_unaligned)
+#endif
+ palignr $1, (%rsi, %rcx), %xmm3
+ movdqa %xmm3, (%rdi, %rcx)
+ add $16, %rcx
+
+#ifdef USE_AS_STRNCPY
+ cmp %r10, %r8
+ jbe LABEL(unaligned_exit)
+#endif
+ jmp LABEL(ashr_1_use_ssse3)
+
+ .p2align 4
+LABEL(less32bytes):
+ xor %ecx, %ecx
+LABEL(unaligned_exit):
+ add %r9, %rsi /* r9 stores original offset of rsi*/
+ mov %rcx, %r9
+ mov %r10, %rcx
+ shl %cl, %edx /* after shl, calculate the exact number to be filled*/
+ mov %r9, %rcx
+ .p2align 4
+LABEL(aligned_exit):
+ add %rcx, %rdi /*locate exact address for rdi */
+LABEL(less16bytes):
+ add %rcx, %rsi /*locate exact address for rsi */
+LABEL(aligned_16bytes):
+#ifdef USE_AS_STRNCPY
+ mov $1, %r9d
+ lea -1(%r8), %rcx
+ shl %cl, %r9d
+ cmp $32, %r8
+ ja LABEL(strncpy_tail)
+ or %r9d, %edx
+LABEL(strncpy_tail):
+#endif
+	bsf %rdx, %rcx	/* If the least significant 1 bit in %rdx is found, its bit index is stored in %rcx */
+ lea LABEL(tail_table)(%rip), %r11
+ movslq (%r11, %rcx,4), %rcx
+ lea (%r11, %rcx), %rcx
+ jmp *%rcx
+
+#ifdef USE_AS_STRNCPY
+ .p2align 4
+LABEL(less32bytes_strncpy_truncation):
+ xor %ecx, %ecx
+LABEL(strncpy_truncation_unaligned):
+ add %r9, %rsi
+LABEL(strncpy_truncation_aligned):
+ add %rcx, %rdi
+ add %rcx, %rsi
+ add $16, %r8
+ lea -1(%r8), %rcx
+ lea LABEL(tail_table)(%rip), %r11
+ movslq (%r11, %rcx,4), %rcx
+ lea (%r11, %rcx), %rcx
+ jmp *%rcx
+ .p2align 4
+LABEL(strncpy_exitz):
+ mov %rdi, %rax
+ ret
+#endif
+
+#ifdef USE_AS_STRNCPY
+ .p2align 4
+LABEL(strncpy_fill_tail):
+ mov %rax, %rdx
+ movzx %cl, %rax
+ mov %r8, %rcx
+ add %rax, %rdi
+ xor %eax, %eax
+ shr $3, %ecx
+ jz LABEL(strncpy_fill_less_8)
+
+ rep stosq
+LABEL(strncpy_fill_less_8):
+ mov %r8, %rcx
+ and $7, %ecx
+ jz LABEL(strncpy_fill_return)
+LABEL(strncpy_fill_less_7):
+ sub $1, %ecx
+ mov %al, (%rdi, %rcx)
+ jnz LABEL(strncpy_fill_less_7)
+LABEL(strncpy_fill_return):
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rdx)
+ sbb $-1, %rdx
+#endif
+ mov %rdx, %rax
+ ret
+#endif
+ .p2align 4
+LABEL(tail_0):
+ mov (%rsi), %cl
+ mov %cl, (%rdi)
+#ifdef USE_AS_STPCPY
+ mov %rdi, %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $1, %cl
+ sub $1, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+ .p2align 4
+LABEL(tail_1):
+ mov (%rsi), %cx
+ mov %cx, (%rdi)
+#ifdef USE_AS_STPCPY
+ lea 1(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $2, %cl
+ sub $2, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+ .p2align 4
+LABEL(tail_2):
+ mov (%rsi), %cx
+ mov %cx, (%rdi)
+ mov 1(%rsi), %cx
+ mov %cx, 1(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 2(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $3, %cl
+ sub $3, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+ .p2align 4
+LABEL(tail_3):
+ mov (%rsi), %ecx
+ mov %ecx, (%rdi)
+#ifdef USE_AS_STPCPY
+ lea 3(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $4, %cl
+ sub $4, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+ .p2align 4
+LABEL(tail_4):
+ mov (%rsi), %ecx
+ mov %ecx, (%rdi)
+ mov 1(%rsi), %edx
+ mov %edx, 1(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 4(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $5, %cl
+ sub $5, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+ .p2align 4
+LABEL(tail_5):
+ mov (%rsi), %ecx
+ mov %ecx, (%rdi)
+ mov 2(%rsi), %edx
+ mov %edx, 2(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 5(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $6, %cl
+ sub $6, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+ .p2align 4
+LABEL(tail_6):
+ mov (%rsi), %ecx
+ mov %ecx, (%rdi)
+ mov 3(%rsi), %edx
+ mov %edx,3(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 6(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $7, %cl
+ sub $7, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+
+ .p2align 4
+LABEL(tail_7):
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+#ifdef USE_AS_STPCPY
+ lea 7(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $8, %cl
+ sub $8, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+
+ .p2align 4
+LABEL(tail_8):
+
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 5(%rsi), %edx
+ mov %edx, 5(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 8(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $9, %cl
+ sub $9, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+
+ .p2align 4
+LABEL(tail_9):
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 6(%rsi), %edx
+ mov %edx, 6(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 9(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $10, %cl
+ sub $10, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+
+ .p2align 4
+LABEL(tail_10):
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 7(%rsi), %edx
+ mov %edx, 7(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 10(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $11, %cl
+ sub $11, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+ .p2align 4
+LABEL(tail_11):
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %edx
+ mov %edx, 8(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 11(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $12, %cl
+ sub $12, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+ .p2align 4
+LABEL(tail_12):
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 5(%rsi), %rcx
+ mov %rcx, 5(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 12(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $13, %cl
+ sub $13, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+
+ .p2align 4
+LABEL(tail_13):
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 6(%rsi), %rcx
+ mov %rcx, 6(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 13(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $14, %cl
+ sub $14, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+
+ .p2align 4
+LABEL(tail_14):
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 7(%rsi), %rcx
+ mov %rcx, 7(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 14(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $15, %cl
+ sub $15, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+
+LABEL(tail_15):
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %rdx
+ mov %rdx, 8(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 15(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $16, %cl
+ sub $16, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+
+ ret
+
+ .p2align 4
+LABEL(tail_16):
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %rdx
+ mov %rdx, 8(%rdi)
+ mov 16(%rsi), %cl
+ mov %cl, 16(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 16(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $17, %cl
+ sub $17, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+ .p2align 4
+LABEL(tail_17):
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %rdx
+ mov %rdx, 8(%rdi)
+ mov 16(%rsi), %cx
+ mov %cx, 16(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 17(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $18, %cl
+ sub $18, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+
+ .p2align 4
+LABEL(tail_18):
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %rdx
+ mov %rdx, 8(%rdi)
+ mov 15(%rsi), %ecx
+	mov %ecx, 15(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 18(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $19, %cl
+ sub $19, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+
+ .p2align 4
+LABEL(tail_19):
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %rdx
+ mov %rdx, 8(%rdi)
+ mov 16(%rsi), %ecx
+ mov %ecx, 16(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 19(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $20, %cl
+ sub $20, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+ .p2align 4
+LABEL(tail_20):
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %rdx
+ mov %rdx, 8(%rdi)
+ mov 13(%rsi), %rcx
+ mov %rcx, 13(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 20(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $21, %cl
+ sub $21, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+ .p2align 4
+LABEL(tail_21):
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %rdx
+ mov %rdx, 8(%rdi)
+ mov 14(%rsi), %rcx
+ mov %rcx, 14(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 21(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $22, %cl
+ sub $22, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+
+ .p2align 4
+LABEL(tail_22):
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %rdx
+ mov %rdx, 8(%rdi)
+ mov 15(%rsi), %rcx
+ mov %rcx, 15(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 22(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $23, %cl
+ sub $23, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+
+ .p2align 4
+LABEL(tail_23):
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %rdx
+ mov %rdx, 8(%rdi)
+ mov 16(%rsi), %rcx
+ mov %rcx, 16(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 23(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $24, %cl
+ sub $24, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+
+ .p2align 4
+LABEL(tail_24):
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %rdx
+ mov %rdx, 8(%rdi)
+ mov 16(%rsi), %rcx
+ mov %rcx, 16(%rdi)
+ mov 21(%rsi), %edx
+ mov %edx, 21(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 24(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $25, %cl
+ sub $25, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+
+ .p2align 4
+LABEL(tail_25):
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %rdx
+ mov %rdx, 8(%rdi)
+ mov 16(%rsi), %rcx
+ mov %rcx, 16(%rdi)
+ mov 22(%rsi), %edx
+ mov %edx, 22(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 25(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $26, %cl
+ sub $26, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+
+ .p2align 4
+LABEL(tail_26):
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %rdx
+ mov %rdx, 8(%rdi)
+ mov 16(%rsi), %rcx
+ mov %rcx, 16(%rdi)
+ mov 23(%rsi), %edx
+ mov %edx, 23(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 26(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $27, %cl
+ sub $27, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+
+ .p2align 4
+LABEL(tail_27):
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %rdx
+ mov %rdx, 8(%rdi)
+ mov 16(%rsi), %rcx
+ mov %rcx, 16(%rdi)
+ mov 24(%rsi), %edx
+ mov %edx, 24(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 27(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $28, %cl
+ sub $28, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+ .p2align 4
+LABEL(tail_28):
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %rdx
+ mov %rdx, 8(%rdi)
+ mov 16(%rsi), %rcx
+ mov %rcx, 16(%rdi)
+ mov 21(%rsi), %rdx
+ mov %rdx, 21(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 28(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $29, %cl
+ sub $29, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+
+ .p2align 4
+LABEL(tail_29):
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %rdx
+ mov %rdx, 8(%rdi)
+ mov 16(%rsi), %rcx
+ mov %rcx, 16(%rdi)
+ mov 22(%rsi), %rdx
+ mov %rdx, 22(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 29(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $30, %cl
+ sub $30, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+	ret
+
+ .p2align 4
+LABEL(tail_30):
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %rdx
+ mov %rdx, 8(%rdi)
+ mov 16(%rsi), %rcx
+ mov %rcx, 16(%rdi)
+ mov 23(%rsi), %rdx
+ mov %rdx, 23(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 30(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $31, %cl
+ sub $31, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+
+ .p2align 4
+LABEL(tail_31):
+ mov (%rsi), %rcx
+ mov %rcx, (%rdi)
+ mov 8(%rsi), %rdx
+ mov %rdx, 8(%rdi)
+ mov 16(%rsi), %rcx
+ mov %rcx, 16(%rdi)
+ mov 24(%rsi), %rdx
+ mov %rdx, 24(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 31(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+ mov $32, %cl
+ sub $32, %r8
+ jnz LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+ cmpb $1, (%rax)
+ sbb $-1, %rax
+#endif
+#endif
+ ret
+ cfi_endproc
+ .size STRCPY_SSSE3, .-STRCPY_SSSE3
+
+ .p2align 4
+ .section .rodata.ssse3,"a",@progbits
+LABEL(tail_table):
+ .int LABEL(tail_0) - LABEL(tail_table)
+ .int LABEL(tail_1) - LABEL(tail_table)
+ .int LABEL(tail_2) - LABEL(tail_table)
+ .int LABEL(tail_3) - LABEL(tail_table)
+ .int LABEL(tail_4) - LABEL(tail_table)
+ .int LABEL(tail_5) - LABEL(tail_table)
+ .int LABEL(tail_6) - LABEL(tail_table)
+ .int LABEL(tail_7) - LABEL(tail_table)
+ .int LABEL(tail_8) - LABEL(tail_table)
+ .int LABEL(tail_9) - LABEL(tail_table)
+ .int LABEL(tail_10) - LABEL(tail_table)
+ .int LABEL(tail_11) - LABEL(tail_table)
+ .int LABEL(tail_12) - LABEL(tail_table)
+ .int LABEL(tail_13) - LABEL(tail_table)
+ .int LABEL(tail_14) - LABEL(tail_table)
+ .int LABEL(tail_15) - LABEL(tail_table)
+ .int LABEL(tail_16) - LABEL(tail_table)
+ .int LABEL(tail_17) - LABEL(tail_table)
+ .int LABEL(tail_18) - LABEL(tail_table)
+ .int LABEL(tail_19) - LABEL(tail_table)
+ .int LABEL(tail_20) - LABEL(tail_table)
+ .int LABEL(tail_21) - LABEL(tail_table)
+ .int LABEL(tail_22) - LABEL(tail_table)
+ .int LABEL(tail_23) - LABEL(tail_table)
+ .int LABEL(tail_24) - LABEL(tail_table)
+ .int LABEL(tail_25) - LABEL(tail_table)
+ .int LABEL(tail_26) - LABEL(tail_table)
+ .int LABEL(tail_27) - LABEL(tail_table)
+ .int LABEL(tail_28) - LABEL(tail_table)
+ .int LABEL(tail_29) - LABEL(tail_table)
+ .int LABEL(tail_30) - LABEL(tail_table)
+ .int LABEL(tail_31) - LABEL(tail_table)
+
+ .p2align 4
+LABEL(unaligned_table):
+ .int LABEL(ashr_0) - LABEL(unaligned_table)
+ .int LABEL(ashr_1) - LABEL(unaligned_table)
+ .int LABEL(ashr_2) - LABEL(unaligned_table)
+ .int LABEL(ashr_3) - LABEL(unaligned_table)
+ .int LABEL(ashr_4) - LABEL(unaligned_table)
+ .int LABEL(ashr_5) - LABEL(unaligned_table)
+ .int LABEL(ashr_6) - LABEL(unaligned_table)
+ .int LABEL(ashr_7) - LABEL(unaligned_table)
+ .int LABEL(ashr_8) - LABEL(unaligned_table)
+ .int LABEL(ashr_9) - LABEL(unaligned_table)
+ .int LABEL(ashr_10) - LABEL(unaligned_table)
+ .int LABEL(ashr_11) - LABEL(unaligned_table)
+ .int LABEL(ashr_12) - LABEL(unaligned_table)
+ .int LABEL(ashr_13) - LABEL(unaligned_table)
+ .int LABEL(ashr_14) - LABEL(unaligned_table)
+ .int LABEL(ashr_15) - LABEL(unaligned_table)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type STRCPY_SSE2, @function; \
+ .align 16; \
+ STRCPY_SSE2: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size STRCPY_SSE2, .-STRCPY_SSE2
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal strcpy calls through a PLT.
+   The speedup we get from using SSSE3 instructions is likely eaten away
+ by the indirect call in the PLT. */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_STRCPY; __GI_STRCPY = STRCPY_SSE2
+# undef libc_hidden_def
+# define libc_hidden_def(name) \
+ .globl __GI___STRCPY; __GI___STRCPY = STRCPY_SSE2
+#endif
+
+#ifndef USE_AS_STRNCPY
+#include "../strcpy.S"
+#endif
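
The comment above about keeping libc-internal strcpy calls out of the PLT relies on glibc's hidden __GI_* aliases: internal references bind to the hidden name, so only external callers pay for the indirect dispatch. As a rough illustration of that general mechanism in plain GCC C (not the glibc macros themselves; my_strlen and my_strlen_internal are made-up names):

#include <stddef.h>

/* Public, possibly IFUNC/PLT-dispatched entry point.  */
size_t
my_strlen (const char *s)
{
  size_t n = 0;
  while (s[n] != '\0')
    n++;
  return n;
}

/* Hidden alias: internal callers reference this name, so their calls
   bind directly at link time instead of going through the PLT.  */
extern __typeof (my_strlen) my_strlen_internal
  __attribute__ ((alias ("my_strlen"), visibility ("hidden")));

In the patch the same effect comes from pointing __GI_STRCPY at STRCPY_SSE2, so libc-internal strcpy calls skip both the PLT and the IFUNC selection.
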
diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c
new file mode 100644
index 0000000000..4512267d3f
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcspn-c.c
@@ -0,0 +1,312 @@
+/* strcspn with SSE4.2 intrinsics
+ Copyright (C) 2009 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <nmmintrin.h>
+#include <string.h>
+
+/* We use 0x2:
+ _SIDD_SBYTE_OPS
+ | _SIDD_CMP_EQUAL_ANY
+ | _SIDD_POSITIVE_POLARITY
+ | _SIDD_LEAST_SIGNIFICANT
+ on pcmpistri to compare xmm/mem128
+
+ 0 1 2 3 4 5 6 7 8 9 A B C D E F
+ X X X X X X X X X X X X X X X X
+
+ against xmm
+
+ 0 1 2 3 4 5 6 7 8 9 A B C D E F
+ A A A A A A A A A A A A A A A A
+
+   to find out if the first 16-byte data element has any byte A and,
+   if so, the offset of the first such byte.  There are 3 cases:
+
+ 1. The first 16byte data element has the byte A at the offset X.
+ 2. The first 16byte data element has EOS and doesn't have the byte A.
+ 3. The first 16byte data element is valid and doesn't have the byte A.
+
+   Here is the table of ECX, CFlag, ZFlag and SFlag for the 3 cases:
+
+   case  ECX  CFlag  ZFlag  SFlag
+    1     X     1     0/1     0
+    2    16     0      1      0
+    3    16     0      0      0
+
+   We exit from the loop for cases 1 and 2, i.e. whenever either CFlag
+   or ZFlag is 1.  If CFlag == 1, ECX has the offset X for case 1.  */
+
+#ifndef STRCSPN_SSE2
+# define STRCSPN_SSE2 __strcspn_sse2
+# define STRCSPN_SSE42 __strcspn_sse42
+#endif
+
+#ifdef USE_AS_STRPBRK
+# define RETURN(val1, val2) return val1
+#else
+# define RETURN(val1, val2) return val2
+#endif
+
+extern
+#ifdef USE_AS_STRPBRK
+char *
+#else
+size_t
+#endif
+STRCSPN_SSE2 (const char *, const char *);
+
+
+#ifdef USE_AS_STRPBRK
+char *
+#else
+size_t
+#endif
+__attribute__ ((section (".text.sse4.2")))
+STRCSPN_SSE42 (const char *s, const char *a)
+{
+ if (*a == 0)
+ RETURN (NULL, strlen (s));
+
+ const char *aligned;
+ __m128i mask;
+ int offset = (int) ((size_t) a & 15);
+ if (offset != 0)
+ {
+ /* Load masks. */
+ aligned = (const char *) ((size_t) a & 0xfffffffffffffff0L);
+ __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
+
+ switch (offset)
+ {
+ case 1:
+ mask = _mm_srli_si128 (mask0, 1);
+ break;
+ case 2:
+ mask = _mm_srli_si128 (mask0, 2);
+ break;
+ case 3:
+ mask = _mm_srli_si128 (mask0, 3);
+ break;
+ case 4:
+ mask = _mm_srli_si128 (mask0, 4);
+ break;
+ case 5:
+ mask = _mm_srli_si128 (mask0, 5);
+ break;
+ case 6:
+ mask = _mm_srli_si128 (mask0, 6);
+ break;
+ case 7:
+ mask = _mm_srli_si128 (mask0, 7);
+ break;
+ case 8:
+ mask = _mm_srli_si128 (mask0, 8);
+ break;
+ case 9:
+ mask = _mm_srli_si128 (mask0, 9);
+ break;
+ case 10:
+ mask = _mm_srli_si128 (mask0, 10);
+ break;
+ case 11:
+ mask = _mm_srli_si128 (mask0, 11);
+ break;
+ case 12:
+ mask = _mm_srli_si128 (mask0, 12);
+ break;
+ case 13:
+ mask = _mm_srli_si128 (mask0, 13);
+ break;
+ case 14:
+ mask = _mm_srli_si128 (mask0, 14);
+ break;
+ case 15:
+ mask = _mm_srli_si128 (mask0, 15);
+ break;
+ }
+
+ /* Find where the NULL terminator is. */
+ int length = _mm_cmpistri (mask, mask, 0x3a);
+ if (length == 16 - offset)
+ {
+ /* There is no NULL terminator. */
+ __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
+ int index = _mm_cmpistri (mask1, mask1, 0x3a);
+ length += index;
+
+ /* Don't use SSE4.2 if the length of A > 16. */
+ if (length > 16)
+ return STRCSPN_SSE2 (s, a);
+
+ if (index != 0)
+ {
+ /* Combine mask0 and mask1. */
+ switch (offset)
+ {
+ case 1:
+ mask = _mm_alignr_epi8 (mask1, mask0, 1);
+ break;
+ case 2:
+ mask = _mm_alignr_epi8 (mask1, mask0, 2);
+ break;
+ case 3:
+ mask = _mm_alignr_epi8 (mask1, mask0, 3);
+ break;
+ case 4:
+ mask = _mm_alignr_epi8 (mask1, mask0, 4);
+ break;
+ case 5:
+ mask = _mm_alignr_epi8 (mask1, mask0, 5);
+ break;
+ case 6:
+ mask = _mm_alignr_epi8 (mask1, mask0, 6);
+ break;
+ case 7:
+ mask = _mm_alignr_epi8 (mask1, mask0, 7);
+ break;
+ case 8:
+ mask = _mm_alignr_epi8 (mask1, mask0, 8);
+ break;
+ case 9:
+ mask = _mm_alignr_epi8 (mask1, mask0, 9);
+ break;
+ case 10:
+ mask = _mm_alignr_epi8 (mask1, mask0, 10);
+ break;
+ case 11:
+ mask = _mm_alignr_epi8 (mask1, mask0, 11);
+ break;
+ case 12:
+ mask = _mm_alignr_epi8 (mask1, mask0, 12);
+ break;
+ case 13:
+ mask = _mm_alignr_epi8 (mask1, mask0, 13);
+ break;
+ case 14:
+ mask = _mm_alignr_epi8 (mask1, mask0, 14);
+ break;
+ case 15:
+ mask = _mm_alignr_epi8 (mask1, mask0, 15);
+ break;
+ }
+ }
+ }
+ }
+ else
+ {
+ /* A is aligned. */
+ mask = _mm_load_si128 ((__m128i *) a);
+
+ /* Find where the NULL terminator is. */
+ int length = _mm_cmpistri (mask, mask, 0x3a);
+ if (length == 16)
+ {
+ /* There is no NULL terminator. Don't use SSE4.2 if the length
+ of A > 16. */
+ if (a[16] != 0)
+ return STRCSPN_SSE2 (s, a);
+ }
+ }
+
+ offset = (int) ((size_t) s & 15);
+ if (offset != 0)
+ {
+ /* Check partial string. */
+ aligned = (const char *) ((size_t) s & 0xfffffffffffffff0L);
+ __m128i value = _mm_load_si128 ((__m128i *) aligned);
+
+ switch (offset)
+ {
+ case 1:
+ value = _mm_srli_si128 (value, 1);
+ break;
+ case 2:
+ value = _mm_srli_si128 (value, 2);
+ break;
+ case 3:
+ value = _mm_srli_si128 (value, 3);
+ break;
+ case 4:
+ value = _mm_srli_si128 (value, 4);
+ break;
+ case 5:
+ value = _mm_srli_si128 (value, 5);
+ break;
+ case 6:
+ value = _mm_srli_si128 (value, 6);
+ break;
+ case 7:
+ value = _mm_srli_si128 (value, 7);
+ break;
+ case 8:
+ value = _mm_srli_si128 (value, 8);
+ break;
+ case 9:
+ value = _mm_srli_si128 (value, 9);
+ break;
+ case 10:
+ value = _mm_srli_si128 (value, 10);
+ break;
+ case 11:
+ value = _mm_srli_si128 (value, 11);
+ break;
+ case 12:
+ value = _mm_srli_si128 (value, 12);
+ break;
+ case 13:
+ value = _mm_srli_si128 (value, 13);
+ break;
+ case 14:
+ value = _mm_srli_si128 (value, 14);
+ break;
+ case 15:
+ value = _mm_srli_si128 (value, 15);
+ break;
+ }
+
+ int length = _mm_cmpistri (mask, value, 0x2);
+ /* No need to check ZFlag since ZFlag is always 1. */
+ int cflag = _mm_cmpistrc (mask, value, 0x2);
+ if (cflag)
+ RETURN ((char *) (s + length), length);
+ /* Find where the NULL terminator is. */
+ int index = _mm_cmpistri (value, value, 0x3a);
+ if (index < 16 - offset)
+ RETURN (NULL, index);
+ aligned += 16;
+ }
+ else
+ aligned = s;
+
+ while (1)
+ {
+ __m128i value = _mm_load_si128 ((__m128i *) aligned);
+ int index = _mm_cmpistri (mask, value, 0x2);
+ int cflag = _mm_cmpistrc (mask, value, 0x2);
+ int zflag = _mm_cmpistrz (mask, value, 0x2);
+ if (cflag)
+ RETURN ((char *) (aligned + index), (size_t) (aligned + index - s));
+ if (zflag)
+ RETURN (NULL,
+ /* Find where the NULL terminator is. */
+ (size_t) (aligned + _mm_cmpistri (value, value, 0x3a) - s));
+ aligned += 16;
+ }
+}
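
To make the PCMPISTRI usage above concrete, here is a minimal, hypothetical C sketch (not the glibc code) of the core strcspn idea with mode 0x02, using only the <nmmintrin.h> intrinsics already used in this file. It assumes the set fits in one 16-byte register and that both buffers are padded so 16-byte loads stay in bounds; compile with -msse4.2.

#include <nmmintrin.h>
#include <stddef.h>
#include <stdio.h>

/* Toy strcspn: index of the first byte of S that occurs in SET.  */
static size_t
toy_strcspn (const char *s, const char *set)
{
  __m128i mask = _mm_loadu_si128 ((const __m128i *) set);
  for (const char *p = s; ; p += 16)
    {
      __m128i chunk = _mm_loadu_si128 ((const __m128i *) p);
      if (_mm_cmpistrc (mask, chunk, 0x02))	/* CFlag: a set byte found.  */
	return (size_t) (p - s) + _mm_cmpistri (mask, chunk, 0x02);
      if (_mm_cmpistrz (mask, chunk, 0x02))	/* ZFlag: chunk has the NUL.  */
	/* 0x3a gives the offset of the NUL, the same idiom as above.  */
	return (size_t) (p - s) + _mm_cmpistri (chunk, chunk, 0x3a);
    }
}

int
main (void)
{
  char s[32] = "hello, world";	/* zero padding keeps 16-byte loads safe */
  char set[16] = ",!";
  printf ("%zu\n", toy_strcspn (s, set));	/* prints 5 */
  return 0;
}

The real implementation above additionally handles unaligned heads with _mm_srli_si128/_mm_alignr_epi8 and falls back to the SSE2 version when the set is longer than 16 bytes.
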
diff --git a/sysdeps/x86_64/multiarch/strcspn.S b/sysdeps/x86_64/multiarch/strcspn.S
new file mode 100644
index 0000000000..cc75ab70e6
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcspn.S
@@ -0,0 +1,82 @@
+/* Multiple versions of strcspn
+ Copyright (C) 2009 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <config.h>
+
+#ifdef HAVE_SSE4_SUPPORT
+
+#include <sysdep.h>
+#include <ifunc-defines.h>
+
+#ifdef USE_AS_STRPBRK
+#define STRCSPN_SSE42 __strpbrk_sse42
+#define STRCSPN_SSE2 __strpbrk_sse2
+#define __GI_STRCSPN __GI_strpbrk
+#else
+#ifndef STRCSPN
+#define STRCSPN strcspn
+#define STRCSPN_SSE42 __strcspn_sse42
+#define STRCSPN_SSE2 __strcspn_sse2
+#define __GI_STRCSPN __GI_strcspn
+#endif
+#endif
+
+/* Define multiple versions only for the definition in libc. Don't
+   define multiple versions for strpbrk in the static library, since
+   strpbrk may be needed before the CPU feature initialization has
+   happened.  */
+#if (defined SHARED || !defined USE_AS_STRPBRK) && !defined NOT_IN_libc
+ .text
+ENTRY(STRCSPN)
+ .type STRCSPN, @gnu_indirect_function
+ cmpl $0, __cpu_features+KIND_OFFSET(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq STRCSPN_SSE2(%rip), %rax
+ testl $(1<<20), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip)
+ jz 2f
+ leaq STRCSPN_SSE42(%rip), %rax
+2: ret
+END(STRCSPN)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type STRCSPN_SSE2, @function; \
+ .globl STRCSPN_SSE2; \
+ .align 16; \
+ STRCSPN_SSE2: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size STRCSPN_SSE2, .-STRCSPN_SSE2
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal strcspn calls through a PLT.
+   The speedup we get from using SSE4.2 instructions is likely eaten away
+ by the indirect call in the PLT. */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_STRCSPN; __GI_STRCSPN = STRCSPN_SSE2
+#endif
+
+#endif /* HAVE_SSE4_SUPPORT */
+
+#ifdef USE_AS_STRPBRK
+#include "../strpbrk.S"
+#else
+#include "../strcspn.S"
+#endif
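
The ENTRY(STRCSPN) block above is an STT_GNU_IFUNC resolver written in assembly: it selects the SSE4.2 variant when CPUID.1:ECX bit 20 is set and the SSE2 variant otherwise. The same dispatch pattern can be sketched in C with GCC's ifunc attribute and CPU builtins; this is an equivalent-in-spirit illustration only, not the patch's mechanism (which reads glibc's internal __cpu_features), and the my_strcspn* names and stand-in bodies are hypothetical. Requires a reasonably recent GCC on an ELF x86 target with IFUNC support.

#include <stddef.h>
#include <string.h>

typedef size_t (*strcspn_fn) (const char *, const char *);

static size_t
my_strcspn_sse2 (const char *s, const char *a)
{
  return strcspn (s, a);	/* stand-in for __strcspn_sse2 */
}

static size_t
my_strcspn_sse42 (const char *s, const char *a)
{
  return strcspn (s, a);	/* stand-in for __strcspn_sse42 */
}

/* Resolver, run once by the dynamic linker; like the assembly version it
   must not assume libc is fully initialized, hence __builtin_cpu_init.  */
static strcspn_fn
resolve_strcspn (void)
{
  __builtin_cpu_init ();
  return __builtin_cpu_supports ("sse4.2")
	 ? my_strcspn_sse42 : my_strcspn_sse2;
}

size_t my_strcspn (const char *, const char *)
  __attribute__ ((ifunc ("resolve_strcspn")));
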
diff --git a/sysdeps/x86_64/multiarch/strlen.S b/sysdeps/x86_64/multiarch/strlen.S
index 79e6a977ec..82b03ccc28 100644
--- a/sysdeps/x86_64/multiarch/strlen.S
+++ b/sysdeps/x86_64/multiarch/strlen.S
@@ -77,6 +77,7 @@ __strlen_sse42:
# undef ENTRY
# define ENTRY(name) \
.type __strlen_sse2, @function; \
+ .align 16; \
__strlen_sse2: cfi_startproc; \
CALL_MCOUNT
# undef END
diff --git a/sysdeps/x86_64/multiarch/strncpy-c.c b/sysdeps/x86_64/multiarch/strncpy-c.c
new file mode 100644
index 0000000000..296c32cb5d
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncpy-c.c
@@ -0,0 +1,8 @@
+#define STRNCPY __strncpy_sse2
+#ifdef SHARED
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name) \
+ __hidden_ver1 (__strncpy_sse2, __GI_strncpy, __strncpy_sse2);
+#endif
+
+#include "strncpy.c"
diff --git a/sysdeps/x86_64/multiarch/strncpy.S b/sysdeps/x86_64/multiarch/strncpy.S
new file mode 100644
index 0000000000..327a4ce447
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncpy.S
@@ -0,0 +1,3 @@
+#define STRCPY strncpy
+#define USE_AS_STRNCPY
+#include "strcpy.S"
diff --git a/sysdeps/x86_64/multiarch/strpbrk-c.c b/sysdeps/x86_64/multiarch/strpbrk-c.c
new file mode 100644
index 0000000000..c58dcb5605
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strpbrk-c.c
@@ -0,0 +1,4 @@
+#define USE_AS_STRPBRK
+#define STRCSPN_SSE2 __strpbrk_sse2
+#define STRCSPN_SSE42 __strpbrk_sse42
+#include "strcspn-c.c"
diff --git a/sysdeps/x86_64/multiarch/strpbrk.S b/sysdeps/x86_64/multiarch/strpbrk.S
new file mode 100644
index 0000000000..ed5bca6a94
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strpbrk.S
@@ -0,0 +1,3 @@
+#define STRCSPN strpbrk
+#define USE_AS_STRPBRK
+#include "strcspn.S"
diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c
new file mode 100644
index 0000000000..5b99f0d383
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strspn-c.c
@@ -0,0 +1,284 @@
+/* strspn with SSE4.2 intrinsics
+ Copyright (C) 2009 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <nmmintrin.h>
+#include <string.h>
+
+/* We use 0x12:
+ _SIDD_SBYTE_OPS
+ | _SIDD_CMP_EQUAL_ANY
+ | _SIDD_NEGATIVE_POLARITY
+ | _SIDD_LEAST_SIGNIFICANT
+ on pcmpistri to compare xmm/mem128
+
+ 0 1 2 3 4 5 6 7 8 9 A B C D E F
+ X X X X X X X X X X X X X X X X
+
+ against xmm
+
+ 0 1 2 3 4 5 6 7 8 9 A B C D E F
+ A A A A A A A A A A A A A A A A
+
+   to find out if the first 16-byte data element has any non-A byte and,
+   if so, the offset of the first such byte.  There are 2 cases:
+
+ 1. The first 16byte data element has the non-A byte, including
+ EOS, at the offset X.
+ 2. The first 16byte data element is valid and doesn't have the non-A
+ byte.
+
+ Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases:
+
+ case ECX CFlag ZFlag SFlag
+ 1 X 1 0/1 0
+ 2 16 0 0 0
+
+ We exit from the loop for case 1. */
+
+extern size_t __strspn_sse2 (const char *, const char *);
+
+
+size_t
+__attribute__ ((section (".text.sse4.2")))
+__strspn_sse42 (const char *s, const char *a)
+{
+ if (*a == 0)
+ return 0;
+
+ const char *aligned;
+ __m128i mask;
+ int offset = (int) ((size_t) a & 15);
+ if (offset != 0)
+ {
+ /* Load masks. */
+ aligned = (const char *) ((size_t) a & 0xfffffffffffffff0L);
+ __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
+
+ switch (offset)
+ {
+ case 1:
+ mask = _mm_srli_si128 (mask0, 1);
+ break;
+ case 2:
+ mask = _mm_srli_si128 (mask0, 2);
+ break;
+ case 3:
+ mask = _mm_srli_si128 (mask0, 3);
+ break;
+ case 4:
+ mask = _mm_srli_si128 (mask0, 4);
+ break;
+ case 5:
+ mask = _mm_srli_si128 (mask0, 5);
+ break;
+ case 6:
+ mask = _mm_srli_si128 (mask0, 6);
+ break;
+ case 7:
+ mask = _mm_srli_si128 (mask0, 7);
+ break;
+ case 8:
+ mask = _mm_srli_si128 (mask0, 8);
+ break;
+ case 9:
+ mask = _mm_srli_si128 (mask0, 9);
+ break;
+ case 10:
+ mask = _mm_srli_si128 (mask0, 10);
+ break;
+ case 11:
+ mask = _mm_srli_si128 (mask0, 11);
+ break;
+ case 12:
+ mask = _mm_srli_si128 (mask0, 12);
+ break;
+ case 13:
+ mask = _mm_srli_si128 (mask0, 13);
+ break;
+ case 14:
+ mask = _mm_srli_si128 (mask0, 14);
+ break;
+ case 15:
+ mask = _mm_srli_si128 (mask0, 15);
+ break;
+ }
+
+ /* Find where the NULL terminator is. */
+ int length = _mm_cmpistri (mask, mask, 0x3a);
+ if (length == 16 - offset)
+ {
+ /* There is no NULL terminator. */
+ __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
+ int index = _mm_cmpistri (mask1, mask1, 0x3a);
+ length += index;
+
+ /* Don't use SSE4.2 if the length of A > 16. */
+ if (length > 16)
+ return __strspn_sse2 (s, a);
+
+ if (index != 0)
+ {
+ /* Combine mask0 and mask1. */
+ switch (offset)
+ {
+ case 1:
+ mask = _mm_alignr_epi8 (mask1, mask0, 1);
+ break;
+ case 2:
+ mask = _mm_alignr_epi8 (mask1, mask0, 2);
+ break;
+ case 3:
+ mask = _mm_alignr_epi8 (mask1, mask0, 3);
+ break;
+ case 4:
+ mask = _mm_alignr_epi8 (mask1, mask0, 4);
+ break;
+ case 5:
+ mask = _mm_alignr_epi8 (mask1, mask0, 5);
+ break;
+ case 6:
+ mask = _mm_alignr_epi8 (mask1, mask0, 6);
+ break;
+ case 7:
+ mask = _mm_alignr_epi8 (mask1, mask0, 7);
+ break;
+ case 8:
+ mask = _mm_alignr_epi8 (mask1, mask0, 8);
+ break;
+ case 9:
+ mask = _mm_alignr_epi8 (mask1, mask0, 9);
+ break;
+ case 10:
+ mask = _mm_alignr_epi8 (mask1, mask0, 10);
+ break;
+ case 11:
+ mask = _mm_alignr_epi8 (mask1, mask0, 11);
+ break;
+ case 12:
+ mask = _mm_alignr_epi8 (mask1, mask0, 12);
+ break;
+ case 13:
+ mask = _mm_alignr_epi8 (mask1, mask0, 13);
+ break;
+ case 14:
+ mask = _mm_alignr_epi8 (mask1, mask0, 14);
+ break;
+ case 15:
+ mask = _mm_alignr_epi8 (mask1, mask0, 15);
+ break;
+ }
+ }
+ }
+ }
+ else
+ {
+ /* A is aligned. */
+ mask = _mm_load_si128 ((__m128i *) a);
+
+ /* Find where the NULL terminator is. */
+ int length = _mm_cmpistri (mask, mask, 0x3a);
+ if (length == 16)
+ {
+ /* There is no NULL terminator. Don't use SSE4.2 if the length
+ of A > 16. */
+ if (a[16] != 0)
+ return __strspn_sse2 (s, a);
+ }
+ }
+
+ offset = (int) ((size_t) s & 15);
+ if (offset != 0)
+ {
+ /* Check partial string. */
+ aligned = (const char *) ((size_t) s & 0xfffffffffffffff0L);
+ __m128i value = _mm_load_si128 ((__m128i *) aligned);
+
+ switch (offset)
+ {
+ case 1:
+ value = _mm_srli_si128 (value, 1);
+ break;
+ case 2:
+ value = _mm_srli_si128 (value, 2);
+ break;
+ case 3:
+ value = _mm_srli_si128 (value, 3);
+ break;
+ case 4:
+ value = _mm_srli_si128 (value, 4);
+ break;
+ case 5:
+ value = _mm_srli_si128 (value, 5);
+ break;
+ case 6:
+ value = _mm_srli_si128 (value, 6);
+ break;
+ case 7:
+ value = _mm_srli_si128 (value, 7);
+ break;
+ case 8:
+ value = _mm_srli_si128 (value, 8);
+ break;
+ case 9:
+ value = _mm_srli_si128 (value, 9);
+ break;
+ case 10:
+ value = _mm_srli_si128 (value, 10);
+ break;
+ case 11:
+ value = _mm_srli_si128 (value, 11);
+ break;
+ case 12:
+ value = _mm_srli_si128 (value, 12);
+ break;
+ case 13:
+ value = _mm_srli_si128 (value, 13);
+ break;
+ case 14:
+ value = _mm_srli_si128 (value, 14);
+ break;
+ case 15:
+ value = _mm_srli_si128 (value, 15);
+ break;
+ }
+
+ int length = _mm_cmpistri (mask, value, 0x12);
+ /* No need to check CFlag since it is always 1. */
+ if (length < 16 - offset)
+ return length;
+ /* Find where the NULL terminator is. */
+ int index = _mm_cmpistri (value, value, 0x3a);
+ if (index < 16 - offset)
+ return length;
+ aligned += 16;
+ }
+ else
+ aligned = s;
+
+ while (1)
+ {
+ __m128i value = _mm_load_si128 ((__m128i *) aligned);
+ int index = _mm_cmpistri (mask, value, 0x12);
+ int cflag = _mm_cmpistrc (mask, value, 0x12);
+ if (cflag)
+ return (size_t) (aligned + index - s);
+ aligned += 16;
+ }
+}
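
The immediates 0x02 (strcspn/strpbrk) and 0x12 (strspn) are exactly the flag compositions spelled out in the comments of these two files. As a quick compile-time check against the standard <nmmintrin.h> constants (C11 _Static_assert; compile with -msse4.2):

#include <nmmintrin.h>

/* Verify the pcmpistri modes used by strcspn-c.c and strspn-c.c.  */
_Static_assert ((_SIDD_SBYTE_OPS | _SIDD_CMP_EQUAL_ANY
		 | _SIDD_POSITIVE_POLARITY | _SIDD_LEAST_SIGNIFICANT) == 0x02,
		"strcspn/strpbrk pcmpistri mode");
_Static_assert ((_SIDD_SBYTE_OPS | _SIDD_CMP_EQUAL_ANY
		 | _SIDD_NEGATIVE_POLARITY | _SIDD_LEAST_SIGNIFICANT) == 0x12,
		"strspn pcmpistri mode");
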
diff --git a/sysdeps/x86_64/multiarch/strspn.S b/sysdeps/x86_64/multiarch/strspn.S
new file mode 100644
index 0000000000..4183a2cf60
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strspn.S
@@ -0,0 +1,63 @@
+/* Multiple versions of strspn
+ Copyright (C) 2009 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <config.h>
+
+#ifdef HAVE_SSE4_SUPPORT
+
+#include <sysdep.h>
+#include <ifunc-defines.h>
+
+/* Define multiple versions only for the definition in libc. */
+#ifndef NOT_IN_libc
+ .text
+ENTRY(strspn)
+ .type strspn, @gnu_indirect_function
+ cmpl $0, __cpu_features+KIND_OFFSET(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq __strspn_sse2(%rip), %rax
+ testl $(1<<20), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip)
+ jz 2f
+ leaq __strspn_sse42(%rip), %rax
+2: ret
+END(strspn)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __strspn_sse2, @function; \
+ .globl __strspn_sse2; \
+ .align 16; \
+ __strspn_sse2: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __strspn_sse2, .-__strspn_sse2
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal strspn calls through a PLT.
+   The speedup we get from using SSE4.2 instructions is likely eaten away
+ by the indirect call in the PLT. */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_strspn; __GI_strspn = __strspn_sse2
+#endif
+
+#endif /* HAVE_SSE4_SUPPORT */
+
+#include "../strspn.S"