summaryrefslogtreecommitdiff
path: root/sysdeps/x86_64/multiarch/strcpy.S
diff options
context:
space:
mode:
authorH.J. Lu <hongjiu.lu@intel.com>2011-06-24 15:14:22 -0400
committerUlrich Drepper <drepper@gmail.com>2011-06-24 15:14:22 -0400
commit8912479f9ea9f56dc188d3d00c4ba4259f600661 (patch)
treefc91331de86b054859ce0dfe3fdec2a06812aa4c /sysdeps/x86_64/multiarch/strcpy.S
parentd5495a116c6271c0ae8f6955b64b7b010b1b341a (diff)
Improved st{r,p}{,n}cpy for SSE2 and SSSE3 on x86-64
Diffstat (limited to 'sysdeps/x86_64/multiarch/strcpy.S')
-rw-r--r--sysdeps/x86_64/multiarch/strcpy.S1860
1 files changed, 24 insertions, 1836 deletions
diff --git a/sysdeps/x86_64/multiarch/strcpy.S b/sysdeps/x86_64/multiarch/strcpy.S
index 02fa8d0710..381060f643 100644
--- a/sysdeps/x86_64/multiarch/strcpy.S
+++ b/sysdeps/x86_64/multiarch/strcpy.S
@@ -1,5 +1,5 @@
-/* strcpy with SSSE3
- Copyright (C) 2009 Free Software Foundation, Inc.
+/* Multiple versions of strcpy
+ Copyright (C) 2009, 2011 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
@@ -29,30 +29,32 @@
#ifdef USE_AS_STPCPY
# ifdef USE_AS_STRNCPY
-# define STRCPY_SSSE3 __stpncpy_ssse3
-# define STRCPY_SSE2 __stpncpy_sse2
-# define __GI_STRCPY __GI_stpncpy
+# define STRCPY_SSSE3 __stpncpy_ssse3
+# define STRCPY_SSE2 __stpncpy_sse2
+# define STRCPY_SSE2_UNALIGNED __stpncpy_sse2_unaligned
+# define __GI_STRCPY __GI_stpncpy
+# define __GI___STRCPY __GI___stpncpy
# else
-# define STRCPY_SSSE3 __stpcpy_ssse3
-# define STRCPY_SSE2 __stpcpy_sse2
-# define __GI_STRCPY __GI_stpcpy
-# define __GI___STRCPY __GI___stpcpy
+# define STRCPY_SSSE3 __stpcpy_ssse3
+# define STRCPY_SSE2 __stpcpy_sse2
+# define STRCPY_SSE2_UNALIGNED __stpcpy_sse2_unaligned
+# define __GI_STRCPY __GI_stpcpy
+# define __GI___STRCPY __GI___stpcpy
# endif
#else
# ifdef USE_AS_STRNCPY
-# define STRCPY_SSSE3 __strncpy_ssse3
-# define STRCPY_SSE2 __strncpy_sse2
-# define __GI_STRCPY __GI_strncpy
+# define STRCPY_SSSE3 __strncpy_ssse3
+# define STRCPY_SSE2 __strncpy_sse2
+# define STRCPY_SSE2_UNALIGNED __strncpy_sse2_unaligned
+# define __GI_STRCPY __GI_strncpy
# else
-# define STRCPY_SSSE3 __strcpy_ssse3
-# define STRCPY_SSE2 __strcpy_sse2
-# define __GI_STRCPY __GI_strcpy
+# define STRCPY_SSSE3 __strcpy_ssse3
+# define STRCPY_SSE2 __strcpy_sse2
+# define STRCPY_SSE2_UNALIGNED __strcpy_sse2_unaligned
+# define __GI_STRCPY __GI_strcpy
# endif
#endif
-#ifndef LABEL
-#define LABEL(l) L(l)
-#endif
/* Define multiple versions only for the definition in libc. */
#ifndef NOT_IN_libc
@@ -62,1830 +64,16 @@ ENTRY(STRCPY)
cmpl $0, __cpu_features+KIND_OFFSET(%rip)
jne 1f
call __init_cpu_features
-1: leaq STRCPY_SSE2(%rip), %rax
+1: leaq STRCPY_SSE2_UNALIGNED(%rip), %rax
+ testl $bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
+ jnz 2f
+ leaq STRCPY_SSE2(%rip), %rax
testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
jz 2f
leaq STRCPY_SSSE3(%rip), %rax
2: ret
END(STRCPY)
- .section .text.ssse3,"ax",@progbits
-STRCPY_SSSE3:
- cfi_startproc
- CALL_MCOUNT
-
-/*
- * This implementation uses SSE to copy up to 16 bytes at a time.
- */
-#ifdef USE_AS_STRNCPY
- test %rdx, %rdx
- jz LABEL(strncpy_exitz)
- mov %rdx, %r8
-#else
- xor %edx, %edx
-#endif
- mov %esi, %ecx
- and $0xfffffffffffffff0, %rsi /*force rsi 16 byte align*/
- and $15, %ecx
- mov %rdi, %rax /*store return parameter*/
-
-
- pxor %xmm0, %xmm0 /* clear %xmm0 */
- pcmpeqb (%rsi), %xmm0 /* compare 16 bytes in (%rsi) and %xmm0 for equality, try to find null char*/
- pmovmskb %xmm0, %edx /* move each byte mask of %xmm0 to edx*/
- shr %cl, %edx /* get real bits left in edx*/
- test %edx, %edx /* edx must be 0 if there is no null char from rsi+%rcx */
- jnz LABEL(less16bytes)
-
-#ifdef USE_AS_STRNCPY
- lea -16(%r8,%rcx), %r11
- cmp $0, %r11
- jle LABEL(less16bytes) /* if r8 + rcx <= 16, branch to less16bytes. */
-#endif
-
- mov %rcx, %r9
- or %edi, %ecx
- and $15, %ecx
- lea -16(%r9), %r10
- jz LABEL(ashr_0) /* ecx must be 0 if offset of rsi and rdi is 16 byte align*/
-
- neg %r10 /* store the rest in rsi aligned 16 bytes for unaligned_exit*/
-
- pxor %xmm0, %xmm0 /* clear %xmm0, may be polluted by unaligned operation*/
- pcmpeqb 16(%rsi), %xmm0 /* compare 16 bytes in (%rsi) and %xmm0 for equality, try to find null char*/
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(less32bytes)
- /*
- * at least 16 byte available to fill destination rdi
- */
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(less32bytes_strncpy_truncation)
-#endif
- mov (%rsi, %r9), %rdx
- mov %rdx, (%rdi)
- mov 8(%rsi, %r9), %rdx
- mov %rdx, 8(%rdi)
-
- /*
- * so far destatination rdi may be aligned by 16, re-calculate rsi to jump
- * crossponding case
- * rcx is offset of rsi
- * rax is offset of rdi
- */
-
- and $0xfffffffffffffff0, %rdi /* force rdi 16 byte align */
- mov %rax, %rdx /* rax store orignal rdi */
- xor %rdi, %rdx /* equal to and $15, %rdx */
-#ifdef USE_AS_STRNCPY
- add %rdx, %r8
-#endif
-
- add $16, %rdi /* next 16 bytes for rdi */
- sub %rdx, %r9
-
- lea 16(%r9, %rsi), %rsi /*re-calculate rsi by (16 - rdx)+ rcx */
- mov %esi, %ecx /*store offset of rsi */
- and $0xfffffffffffffff0, %rsi /* force rsi 16 byte align */
-
- and $15, %ecx /* ecx must be 0 if rdx is equal to rcx*/
- jz LABEL(ashr_0)
-
- lea -16(%rcx), %r10
- mov %rcx, %r9
- neg %r10
- lea LABEL(unaligned_table)(%rip), %r11
- movslq (%r11, %rcx,4), %rcx
- lea (%r11, %rcx), %rcx
- jmp *%rcx
-
- /*
- * The following cases will be handled by ashr_0 & ashr_0_start
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * 0 0 0 ashr_0
- * n(1~15) n(1~15) 0 ashr_0_start
- *
- */
- .p2align 5
-LABEL(ashr_0):
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_aligned)
-#endif
- movdqa (%rsi), %xmm1 /* fetch first 16 bytes from rsi */
- movdqa %xmm1, (%rdi) /* store first 16 bytes into rdi */
- add $16, %rsi
- add $16, %rdi
- pcmpeqb (%rsi), %xmm0 /* compare 16 bytes in (%rsi) and %xmm0 for equality, try to find null char */
- pmovmskb %xmm0, %edx /* move each byte mask of %xmm0 to edx*/
-
- test %edx, %edx /* edx must be 0 if there is no null char in rsi*/
- jnz LABEL(aligned_16bytes)
-
-LABEL(ashr_0_loop):
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_aligned)
-#endif
- movdqa (%rsi, %rcx), %xmm1
- movdqa %xmm1, (%rdi, %rcx)
- add $16, %rcx
- pcmpeqb (%rsi, %rcx), %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(aligned_exit)
-
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_aligned)
-#endif
- movdqa (%rsi, %rcx), %xmm1
- movdqa %xmm1, (%rdi, %rcx)
- add $16, %rcx
- pcmpeqb (%rsi, %rcx), %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(aligned_exit)
-
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_aligned)
-#endif
- movdqa (%rsi, %rcx), %xmm1
- movdqa %xmm1, (%rdi, %rcx)
- add $16, %rcx
- pcmpeqb (%rsi, %rcx), %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(aligned_exit)
-
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_aligned)
-#endif
- movdqa (%rsi, %rcx), %xmm1
- movdqa %xmm1, (%rdi, %rcx)
- add $16, %rcx
- pcmpeqb (%rsi, %rcx), %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jz LABEL(ashr_0_loop)
-
- jmp LABEL(aligned_exit)
- .p2align 4
-
-/*
- * The following cases will be handled by ashr_15
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(15) n - 15 15((16 - (n -15) + n)%16 ashr_15
- *
- * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
- .p2align 4
-LABEL(ashr_15):
- xor %ecx, %ecx /*clear ecx */
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- .p2align 4
-LABEL(ashr_15_use_ssse3):
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $15, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $15, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
- jmp LABEL(ashr_15_use_ssse3)
-
-/*
- * The following cases will be handled by ashr_14
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(14~15) n - 14 14((16 - (n -14) + n)%16 ashr_14
- *
- * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
- .p2align 4
-LABEL(ashr_14):
- xor %ecx, %ecx /*clear ecx */
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- .p2align 4
-LABEL(ashr_14_use_ssse3):
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $14, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $14, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
- jmp LABEL(ashr_14_use_ssse3)
-
-/*
- * The following cases will be handled by ashr_13
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(13~15) n - 13 13((16 - (n -13) + n)%16 ashr_13
- *
- * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
- .p2align 4
-LABEL(ashr_13):
- xor %ecx, %ecx /*clear ecx */
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- .p2align 4
-LABEL(ashr_13_use_ssse3):
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $13, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $13, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
- jmp LABEL(ashr_13_use_ssse3)
-
-/*
- * The following cases will be handled by ashr_12
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(12~15) n - 12 12((16 - (n -12) + n)%16 ashr_12
- *
- * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
- .p2align 4
-LABEL(ashr_12):
- xor %ecx, %ecx /*clear ecx */
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- .p2align 4
-LABEL(ashr_12_use_ssse3):
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $12, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $12, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
- jmp LABEL(ashr_12_use_ssse3)
-
-/*
- * The following cases will be handled by ashr_11
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(11~15) n - 11 11((16 - (n -11) + n)%16 ashr_11
- *
- * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
- .p2align 4
-LABEL(ashr_11):
- xor %ecx, %ecx /*clear ecx */
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- .p2align 4
-LABEL(ashr_11_use_ssse3):
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $11, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $11, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
- jmp LABEL(ashr_11_use_ssse3)
-
-/*
- * The following cases will be handled by ashr_10
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(10~15) n - 10 10((16 - (n -10) + n)%16 ashr_10
- *
- * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
- .p2align 4
-LABEL(ashr_10):
- xor %ecx, %ecx /*clear ecx */
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- .p2align 4
-LABEL(ashr_10_use_ssse3):
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $10, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $10, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
- jmp LABEL(ashr_10_use_ssse3)
-
-/*
- * The following cases will be handled by ashr_9
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(9~15) n - 9 9((16 - (n -9) + n)%16 ashr_9
- *
- * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
- .p2align 4
-LABEL(ashr_9):
- xor %ecx, %ecx /*clear ecx */
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- .p2align 4
-LABEL(ashr_9_use_ssse3):
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $9, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $9, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
- jmp LABEL(ashr_9_use_ssse3)
-
-/*
- * The following cases will be handled by ashr_8
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(8~15) n - 8 8((16 - (n -8) + n)%16 ashr_8
- *
- * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
- .p2align 4
-LABEL(ashr_8):
- xor %ecx, %ecx /*clear ecx */
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- .p2align 4
-LABEL(ashr_8_use_ssse3):
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $8, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $8, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
- jmp LABEL(ashr_8_use_ssse3)
-
-/*
- * The following cases will be handled by ashr_7
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(7~15) n - 7 7((16 - (n -7) + n)%16 ashr_7
- *
- * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
- .p2align 4
-LABEL(ashr_7):
- xor %ecx, %ecx /*clear ecx */
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
- .p2align 4
-
-LABEL(ashr_7_use_ssse3):
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $7, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $7, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
- jmp LABEL(ashr_7_use_ssse3)
-
-/*
- * The following cases will be handled by ashr_6
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(6~15) n - 6 6((16 - (n -6) + n)%16 ashr_6
- *
- * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
- .p2align 4
-LABEL(ashr_6):
- xor %ecx, %ecx /*clear ecx */
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- .p2align 4
-LABEL(ashr_6_use_ssse3):
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $6, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $6, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
- jmp LABEL(ashr_6_use_ssse3)
-
- /*
- * The following cases will be handled by ashr_5
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(5~15) n - 5 5((16 - (n -5) + n)%16 ashr_5
- *
- * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
- .p2align 4
-LABEL(ashr_5):
- xor %ecx, %ecx /*clear ecx */
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- .p2align 4
-LABEL(ashr_5_use_ssse3):
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $5, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $5, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
- jmp LABEL(ashr_5_use_ssse3)
-
-/*
- *
- * The following cases will be handled by ashr_4
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(4~15) n - 4 4((16 - (n -4) + n)%16 ashr_4
- *
- * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
- .p2align 4
-LABEL(ashr_4):
- xor %ecx, %ecx /*clear ecx */
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- .p2align 4
-LABEL(ashr_4_use_ssse3):
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $4, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $4, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
- jmp LABEL(ashr_4_use_ssse3)
-
-/*
- *
- * The following cases will be handled by ashr_3
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(3~15) n - 3 3((16 - (n -3) + n)%16 ashr_3
- *
- * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
- .p2align 4
-LABEL(ashr_3):
- xor %ecx, %ecx /*clear ecx */
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- .p2align 4
-LABEL(ashr_3_use_ssse3):
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $3, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $3, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
- jmp LABEL(ashr_3_use_ssse3)
-
-/*
- *
- * The following cases will be handled by ashr_2
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(2~15) n - 2 2((16 - (n -2) + n)%16 ashr_2
- *
- * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
- .p2align 4
-LABEL(ashr_2):
- xor %ecx, %ecx /*clear ecx */
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- .p2align 4
-LABEL(ashr_2_use_ssse3):
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $2, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $2, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
- jmp LABEL(ashr_2_use_ssse3)
-
-/*
- *
- * The following cases will be handled by ashr_1
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(1~15) n - 1 1 ((16 - (n -1) + n)%16 ashr_1
- *
- * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
- .p2align 4
-LABEL(ashr_1):
- xor %ecx, %ecx /*clear ecx */
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- .p2align 4
-LABEL(ashr_1_use_ssse3):
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $1, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
- palignr $1, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
- jmp LABEL(ashr_1_use_ssse3)
-
- .p2align 4
-LABEL(less32bytes):
- xor %ecx, %ecx
-LABEL(unaligned_exit):
- add %r9, %rsi /* r9 stores original offset of rsi*/
- mov %rcx, %r9
- mov %r10, %rcx
- shl %cl, %edx /* after shl, calculate the exact number to be filled*/
- mov %r9, %rcx
- .p2align 4
-LABEL(aligned_exit):
- add %rcx, %rdi /*locate exact address for rdi */
-LABEL(less16bytes):
- add %rcx, %rsi /*locate exact address for rsi */
-LABEL(aligned_16bytes):
-#ifdef USE_AS_STRNCPY
- mov $1, %r9d
- lea -1(%r8), %rcx
- shl %cl, %r9d
- cmp $32, %r8
- ja LABEL(strncpy_tail)
- or %r9d, %edx
-LABEL(strncpy_tail):
-#endif
- bsf %rdx, %rcx /*If a least significant 1 bit in %rdx is found, its bit index is stored in %rcx*/
- lea LABEL(tail_table)(%rip), %r11
- movslq (%r11, %rcx,4), %rcx
- lea (%r11, %rcx), %rcx
- jmp *%rcx
-
-#ifdef USE_AS_STRNCPY
- .p2align 4
-LABEL(less32bytes_strncpy_truncation):
- xor %ecx, %ecx
-LABEL(strncpy_truncation_unaligned):
- add %r9, %rsi
-LABEL(strncpy_truncation_aligned):
- add %rcx, %rdi
- add %rcx, %rsi
- add $16, %r8
- lea -1(%r8), %rcx
- lea LABEL(tail_table)(%rip), %r11
- movslq (%r11, %rcx,4), %rcx
- lea (%r11, %rcx), %rcx
- jmp *%rcx
- .p2align 4
-LABEL(strncpy_exitz):
- mov %rdi, %rax
- ret
-#endif
-
-#ifdef USE_AS_STRNCPY
- .p2align 4
-LABEL(strncpy_fill_tail):
- mov %rax, %rdx
- movzx %cl, %rax
- mov %r8, %rcx
- add %rax, %rdi
- xor %eax, %eax
- shr $3, %ecx
- jz LABEL(strncpy_fill_less_8)
-
- rep stosq
-LABEL(strncpy_fill_less_8):
- mov %r8, %rcx
- and $7, %ecx
- jz LABEL(strncpy_fill_return)
-LABEL(strncpy_fill_less_7):
- sub $1, %ecx
- mov %al, (%rdi, %rcx)
- jnz LABEL(strncpy_fill_less_7)
-LABEL(strncpy_fill_return):
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rdx)
- sbb $-1, %rdx
-#endif
- mov %rdx, %rax
- ret
-#endif
- .p2align 4
-LABEL(tail_0):
- mov (%rsi), %cl
- mov %cl, (%rdi)
-#ifdef USE_AS_STPCPY
- mov %rdi, %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $1, %cl
- sub $1, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
- .p2align 4
-LABEL(tail_1):
- mov (%rsi), %cx
- mov %cx, (%rdi)
-#ifdef USE_AS_STPCPY
- lea 1(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $2, %cl
- sub $2, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
- .p2align 4
-LABEL(tail_2):
- mov (%rsi), %cx
- mov %cx, (%rdi)
- mov 1(%rsi), %cx
- mov %cx, 1(%rdi)
-#ifdef USE_AS_STPCPY
- lea 2(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $3, %cl
- sub $3, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
- .p2align 4
-LABEL(tail_3):
- mov (%rsi), %ecx
- mov %ecx, (%rdi)
-#ifdef USE_AS_STPCPY
- lea 3(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $4, %cl
- sub $4, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
- .p2align 4
-LABEL(tail_4):
- mov (%rsi), %ecx
- mov %ecx, (%rdi)
- mov 1(%rsi), %edx
- mov %edx, 1(%rdi)
-#ifdef USE_AS_STPCPY
- lea 4(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $5, %cl
- sub $5, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
- .p2align 4
-LABEL(tail_5):
- mov (%rsi), %ecx
- mov %ecx, (%rdi)
- mov 2(%rsi), %edx
- mov %edx, 2(%rdi)
-#ifdef USE_AS_STPCPY
- lea 5(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $6, %cl
- sub $6, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
- .p2align 4
-LABEL(tail_6):
- mov (%rsi), %ecx
- mov %ecx, (%rdi)
- mov 3(%rsi), %edx
- mov %edx,3(%rdi)
-#ifdef USE_AS_STPCPY
- lea 6(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $7, %cl
- sub $7, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
-
- .p2align 4
-LABEL(tail_7):
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
-#ifdef USE_AS_STPCPY
- lea 7(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $8, %cl
- sub $8, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
-
- .p2align 4
-LABEL(tail_8):
-
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 5(%rsi), %edx
- mov %edx, 5(%rdi)
-#ifdef USE_AS_STPCPY
- lea 8(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $9, %cl
- sub $9, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
-
- .p2align 4
-LABEL(tail_9):
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 6(%rsi), %edx
- mov %edx, 6(%rdi)
-#ifdef USE_AS_STPCPY
- lea 9(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $10, %cl
- sub $10, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
-
- .p2align 4
-LABEL(tail_10):
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 7(%rsi), %edx
- mov %edx, 7(%rdi)
-#ifdef USE_AS_STPCPY
- lea 10(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $11, %cl
- sub $11, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
- .p2align 4
-LABEL(tail_11):
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %edx
- mov %edx, 8(%rdi)
-#ifdef USE_AS_STPCPY
- lea 11(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $12, %cl
- sub $12, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
- .p2align 4
-LABEL(tail_12):
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 5(%rsi), %rcx
- mov %rcx, 5(%rdi)
-#ifdef USE_AS_STPCPY
- lea 12(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $13, %cl
- sub $13, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
-
- .p2align 4
-LABEL(tail_13):
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 6(%rsi), %rcx
- mov %rcx, 6(%rdi)
-#ifdef USE_AS_STPCPY
- lea 13(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $14, %cl
- sub $14, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
-
- .p2align 4
-LABEL(tail_14):
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 7(%rsi), %rcx
- mov %rcx, 7(%rdi)
-#ifdef USE_AS_STPCPY
- lea 14(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $15, %cl
- sub $15, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
-
-LABEL(tail_15):
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %rdx
- mov %rdx, 8(%rdi)
-#ifdef USE_AS_STPCPY
- lea 15(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $16, %cl
- sub $16, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
-
- ret
-
- .p2align 4
-LABEL(tail_16):
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %rdx
- mov %rdx, 8(%rdi)
- mov 16(%rsi), %cl
- mov %cl, 16(%rdi)
-#ifdef USE_AS_STPCPY
- lea 16(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $17, %cl
- sub $17, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
- .p2align 4
-LABEL(tail_17):
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %rdx
- mov %rdx, 8(%rdi)
- mov 16(%rsi), %cx
- mov %cx, 16(%rdi)
-#ifdef USE_AS_STPCPY
- lea 17(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $18, %cl
- sub $18, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
-
- .p2align 4
-LABEL(tail_18):
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %rdx
- mov %rdx, 8(%rdi)
- mov 15(%rsi), %ecx
- mov %ecx,15(%rdi)
-#ifdef USE_AS_STPCPY
- lea 18(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $19, %cl
- sub $19, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
-
- .p2align 4
-LABEL(tail_19):
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %rdx
- mov %rdx, 8(%rdi)
- mov 16(%rsi), %ecx
- mov %ecx, 16(%rdi)
-#ifdef USE_AS_STPCPY
- lea 19(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $20, %cl
- sub $20, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
- .p2align 4
-LABEL(tail_20):
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %rdx
- mov %rdx, 8(%rdi)
- mov 13(%rsi), %rcx
- mov %rcx, 13(%rdi)
-#ifdef USE_AS_STPCPY
- lea 20(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $21, %cl
- sub $21, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
- .p2align 4
-LABEL(tail_21):
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %rdx
- mov %rdx, 8(%rdi)
- mov 14(%rsi), %rcx
- mov %rcx, 14(%rdi)
-#ifdef USE_AS_STPCPY
- lea 21(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $22, %cl
- sub $22, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
-
- .p2align 4
-LABEL(tail_22):
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %rdx
- mov %rdx, 8(%rdi)
- mov 15(%rsi), %rcx
- mov %rcx, 15(%rdi)
-#ifdef USE_AS_STPCPY
- lea 22(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $23, %cl
- sub $23, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
-
- .p2align 4
-LABEL(tail_23):
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %rdx
- mov %rdx, 8(%rdi)
- mov 16(%rsi), %rcx
- mov %rcx, 16(%rdi)
-#ifdef USE_AS_STPCPY
- lea 23(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $24, %cl
- sub $24, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
-
- ret
-
- .p2align 4
-LABEL(tail_24):
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %rdx
- mov %rdx, 8(%rdi)
- mov 16(%rsi), %rcx
- mov %rcx, 16(%rdi)
- mov 21(%rsi), %edx
- mov %edx, 21(%rdi)
-#ifdef USE_AS_STPCPY
- lea 24(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $25, %cl
- sub $25, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
-
- .p2align 4
-LABEL(tail_25):
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %rdx
- mov %rdx, 8(%rdi)
- mov 16(%rsi), %rcx
- mov %rcx, 16(%rdi)
- mov 22(%rsi), %edx
- mov %edx, 22(%rdi)
-#ifdef USE_AS_STPCPY
- lea 25(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $26, %cl
- sub $26, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
-
- .p2align 4
-LABEL(tail_26):
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %rdx
- mov %rdx, 8(%rdi)
- mov 16(%rsi), %rcx
- mov %rcx, 16(%rdi)
- mov 23(%rsi), %edx
- mov %edx, 23(%rdi)
-#ifdef USE_AS_STPCPY
- lea 26(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $27, %cl
- sub $27, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
-
- .p2align 4
-LABEL(tail_27):
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %rdx
- mov %rdx, 8(%rdi)
- mov 16(%rsi), %rcx
- mov %rcx, 16(%rdi)
- mov 24(%rsi), %edx
- mov %edx, 24(%rdi)
-#ifdef USE_AS_STPCPY
- lea 27(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $28, %cl
- sub $28, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
- .p2align 4
-LABEL(tail_28):
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %rdx
- mov %rdx, 8(%rdi)
- mov 16(%rsi), %rcx
- mov %rcx, 16(%rdi)
- mov 21(%rsi), %rdx
- mov %rdx, 21(%rdi)
-#ifdef USE_AS_STPCPY
- lea 28(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $29, %cl
- sub $29, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
-
- ret
-
- .p2align 4
-LABEL(tail_29):
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %rdx
- mov %rdx, 8(%rdi)
- mov 16(%rsi), %rcx
- mov %rcx, 16(%rdi)
- mov 22(%rsi), %rdx
- mov %rdx, 22(%rdi)
-#ifdef USE_AS_STPCPY
- lea 29(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $30, %cl
- sub $30, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
-
- ret
-
-
- .p2align 4
-LABEL(tail_30):
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %rdx
- mov %rdx, 8(%rdi)
- mov 16(%rsi), %rcx
- mov %rcx, 16(%rdi)
- mov 23(%rsi), %rdx
- mov %rdx, 23(%rdi)
-#ifdef USE_AS_STPCPY
- lea 30(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $31, %cl
- sub $31, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
-
- .p2align 4
-LABEL(tail_31):
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %rdx
- mov %rdx, 8(%rdi)
- mov 16(%rsi), %rcx
- mov %rcx, 16(%rdi)
- mov 24(%rsi), %rdx
- mov %rdx, 24(%rdi)
-#ifdef USE_AS_STPCPY
- lea 31(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $32, %cl
- sub $32, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
- cfi_endproc
- .size STRCPY_SSSE3, .-STRCPY_SSSE3
-
- .p2align 4
- .section .rodata.ssse3,"a",@progbits
-LABEL(tail_table):
- .int LABEL(tail_0) - LABEL(tail_table)
- .int LABEL(tail_1) - LABEL(tail_table)
- .int LABEL(tail_2) - LABEL(tail_table)
- .int LABEL(tail_3) - LABEL(tail_table)
- .int LABEL(tail_4) - LABEL(tail_table)
- .int LABEL(tail_5) - LABEL(tail_table)
- .int LABEL(tail_6) - LABEL(tail_table)
- .int LABEL(tail_7) - LABEL(tail_table)
- .int LABEL(tail_8) - LABEL(tail_table)
- .int LABEL(tail_9) - LABEL(tail_table)
- .int LABEL(tail_10) - LABEL(tail_table)
- .int LABEL(tail_11) - LABEL(tail_table)
- .int LABEL(tail_12) - LABEL(tail_table)
- .int LABEL(tail_13) - LABEL(tail_table)
- .int LABEL(tail_14) - LABEL(tail_table)
- .int LABEL(tail_15) - LABEL(tail_table)
- .int LABEL(tail_16) - LABEL(tail_table)
- .int LABEL(tail_17) - LABEL(tail_table)
- .int LABEL(tail_18) - LABEL(tail_table)
- .int LABEL(tail_19) - LABEL(tail_table)
- .int LABEL(tail_20) - LABEL(tail_table)
- .int LABEL(tail_21) - LABEL(tail_table)
- .int LABEL(tail_22) - LABEL(tail_table)
- .int LABEL(tail_23) - LABEL(tail_table)
- .int LABEL(tail_24) - LABEL(tail_table)
- .int LABEL(tail_25) - LABEL(tail_table)
- .int LABEL(tail_26) - LABEL(tail_table)
- .int LABEL(tail_27) - LABEL(tail_table)
- .int LABEL(tail_28) - LABEL(tail_table)
- .int LABEL(tail_29) - LABEL(tail_table)
- .int LABEL(tail_30) - LABEL(tail_table)
- .int LABEL(tail_31) - LABEL(tail_table)
-
- .p2align 4
-LABEL(unaligned_table):
- .int LABEL(ashr_0) - LABEL(unaligned_table)
- .int LABEL(ashr_1) - LABEL(unaligned_table)
- .int LABEL(ashr_2) - LABEL(unaligned_table)
- .int LABEL(ashr_3) - LABEL(unaligned_table)
- .int LABEL(ashr_4) - LABEL(unaligned_table)
- .int LABEL(ashr_5) - LABEL(unaligned_table)
- .int LABEL(ashr_6) - LABEL(unaligned_table)
- .int LABEL(ashr_7) - LABEL(unaligned_table)
- .int LABEL(ashr_8) - LABEL(unaligned_table)
- .int LABEL(ashr_9) - LABEL(unaligned_table)
- .int LABEL(ashr_10) - LABEL(unaligned_table)
- .int LABEL(ashr_11) - LABEL(unaligned_table)
- .int LABEL(ashr_12) - LABEL(unaligned_table)
- .int LABEL(ashr_13) - LABEL(unaligned_table)
- .int LABEL(ashr_14) - LABEL(unaligned_table)
- .int LABEL(ashr_15) - LABEL(unaligned_table)
-
# undef ENTRY
# define ENTRY(name) \
.type STRCPY_SSE2, @function; \