diff options
author | Andreas Schwab <schwab@redhat.com> | 2009-07-20 11:02:11 +0200 |
---|---|---|
committer | Andreas Schwab <schwab@redhat.com> | 2009-07-20 11:02:11 +0200 |
commit | 53924a77a2b827e7f9af6424a6a30224d09692d1 (patch) | |
tree | ba5d034a512524339fcfed113518eb83201fdc23 /sysdeps/x86_64/multiarch | |
parent | 8ecde8e8c2a8e77804f954afffd9efe0ab951e52 (diff) | |
parent | 42e69bcf1137fccfd7a95645a9d316c6490b9ff9 (diff) |
Merge commit 'origin/master' into fedora/master
Diffstat (limited to 'sysdeps/x86_64/multiarch')
-rw-r--r-- | sysdeps/x86_64/multiarch/Makefile | 8 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/rawmemchr.S | 1 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/stpcpy.S | 7 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/stpncpy-c.c | 8 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/stpncpy.S | 6 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/strcmp.S | 1 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/strcpy.S | 1918 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/strcspn-c.c | 312 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/strcspn.S | 82 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/strlen.S | 1 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/strncpy-c.c | 8 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/strncpy.S | 3 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/strpbrk-c.c | 4 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/strpbrk.S | 3 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/strspn-c.c | 284 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/strspn.S | 63 |
16 files changed, 2708 insertions, 1 deletions
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 1c35e1ffb4..71e85f0652 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -4,5 +4,11 @@ gen-as-const-headers += ifunc-defines.sym endif ifeq ($(subdir),string) -sysdep_routines += strncmp-c +sysdep_routines += stpncpy-c strncpy-c strncmp-c +ifeq (yes,$(config-cflags-sse4)) +sysdep_routines += strcspn-c strpbrk-c strspn-c +CFLAGS-strcspn-c.c += -msse4 +CFLAGS-strpbrk-c.c += -msse4 +CFLAGS-strspn-c.c += -msse4 +endif endif diff --git a/sysdeps/x86_64/multiarch/rawmemchr.S b/sysdeps/x86_64/multiarch/rawmemchr.S index 93ca631633..d4f265f430 100644 --- a/sysdeps/x86_64/multiarch/rawmemchr.S +++ b/sysdeps/x86_64/multiarch/rawmemchr.S @@ -77,6 +77,7 @@ __rawmemchr_sse42: # undef ENTRY # define ENTRY(name) \ .type __rawmemchr_sse2, @function; \ + .align 16; \ __rawmemchr_sse2: cfi_startproc; \ CALL_MCOUNT # undef END diff --git a/sysdeps/x86_64/multiarch/stpcpy.S b/sysdeps/x86_64/multiarch/stpcpy.S new file mode 100644 index 0000000000..b63d308edc --- /dev/null +++ b/sysdeps/x86_64/multiarch/stpcpy.S @@ -0,0 +1,7 @@ +#define USE_AS_STPCPY +#define STRCPY __stpcpy +#include "strcpy.S" + +weak_alias (__stpcpy, stpcpy) +libc_hidden_def (__stpcpy) +libc_hidden_builtin_def (stpcpy) diff --git a/sysdeps/x86_64/multiarch/stpncpy-c.c b/sysdeps/x86_64/multiarch/stpncpy-c.c new file mode 100644 index 0000000000..2fde77dcab --- /dev/null +++ b/sysdeps/x86_64/multiarch/stpncpy-c.c @@ -0,0 +1,8 @@ +#define STPNCPY __stpncpy_sse2 +#ifdef SHARED +#undef libc_hidden_def +#define libc_hidden_def(name) \ + __hidden_ver1 (__stpncpy_sse2, __GI___stpncpy, __stpncpy_sse2); +#endif + +#include "stpncpy.c" diff --git a/sysdeps/x86_64/multiarch/stpncpy.S b/sysdeps/x86_64/multiarch/stpncpy.S new file mode 100644 index 0000000000..ff89a89491 --- /dev/null +++ b/sysdeps/x86_64/multiarch/stpncpy.S @@ -0,0 +1,6 @@ +#define STRCPY __stpncpy +#define USE_AS_STPCPY +#define USE_AS_STRNCPY +#include "strcpy.S" + +weak_alias (__stpncpy, stpncpy) diff --git a/sysdeps/x86_64/multiarch/strcmp.S b/sysdeps/x86_64/multiarch/strcmp.S index 2f4bf17d95..37985036aa 100644 --- a/sysdeps/x86_64/multiarch/strcmp.S +++ b/sysdeps/x86_64/multiarch/strcmp.S @@ -1659,6 +1659,7 @@ LABEL(unaligned_table): # undef ENTRY # define ENTRY(name) \ .type STRCMP_SSE2, @function; \ + .align 16; \ STRCMP_SSE2: cfi_startproc; \ CALL_MCOUNT # undef END diff --git a/sysdeps/x86_64/multiarch/strcpy.S b/sysdeps/x86_64/multiarch/strcpy.S new file mode 100644 index 0000000000..25cd01307d --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcpy.S @@ -0,0 +1,1918 @@ +/* strcpy with SSSE3 + Copyright (C) 2009 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <sysdep.h> +#include <ifunc-defines.h> + +#if !defined (USE_AS_STPCPY) && !defined (USE_AS_STRNCPY) +# ifndef STRCPY +# define STRCPY strcpy +# endif +#endif + +#ifdef USE_AS_STPCPY +# ifdef USE_AS_STRNCPY +# define STRCPY_SSSE3 __stpncpy_ssse3 +# define STRCPY_SSE2 __stpncpy_sse2 +# define __GI_STRCPY __GI_stpncpy +# else +# define STRCPY_SSSE3 __stpcpy_ssse3 +# define STRCPY_SSE2 __stpcpy_sse2 +# define __GI_STRCPY __GI_stpcpy +# define __GI___STRCPY __GI___stpcpy +# endif +#else +# ifdef USE_AS_STRNCPY +# define STRCPY_SSSE3 __strncpy_ssse3 +# define STRCPY_SSE2 __strncpy_sse2 +# define __GI_STRCPY __GI_strncpy +# else +# define STRCPY_SSSE3 __strcpy_ssse3 +# define STRCPY_SSE2 __strcpy_sse2 +# define __GI_STRCPY __GI_strcpy +# endif +#endif + +#ifndef LABEL +#define LABEL(l) L(l) +#endif + +/* Define multiple versions only for the definition in libc. */ +#ifndef NOT_IN_libc + .text +ENTRY(STRCPY) + .type STRCPY, @gnu_indirect_function + cmpl $0, __cpu_features+KIND_OFFSET(%rip) + jne 1f + call __init_cpu_features +1: leaq STRCPY_SSE2(%rip), %rax + testl $(1<<9), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip) + jz 3f +/* Avoid SSSE3 strcpy on Atom since it is slow. */ + cmpl $1, __cpu_features+KIND_OFFSET(%rip) + jne 2f + cmpl $6, __cpu_features+FAMILY_OFFSET(%rip) + jne 2f + cmpl $28, __cpu_features+MODEL_OFFSET(%rip) + jz 3f +2: leaq STRCPY_SSSE3(%rip), %rax +3: ret +END(STRCPY) + + .section .text.ssse3,"ax",@progbits +STRCPY_SSSE3: + cfi_startproc + CALL_MCOUNT + +/* + * This implementation uses SSE to copy up to 16 bytes at a time. + */ +#ifdef USE_AS_STRNCPY + test %rdx, %rdx + jz LABEL(strncpy_exitz) + mov %rdx, %r8 +#else + xor %edx, %edx +#endif + mov %esi, %ecx + and $0xfffffffffffffff0, %rsi /*force rsi 16 byte align*/ + and $15, %ecx + mov %rdi, %rax /*store return parameter*/ + + + pxor %xmm0, %xmm0 /* clear %xmm0 */ + pcmpeqb (%rsi), %xmm0 /* compare 16 bytes in (%rsi) and %xmm0 for equality, try to find null char*/ + pmovmskb %xmm0, %edx /* move each byte mask of %xmm0 to edx*/ + shr %cl, %edx /* get real bits left in edx*/ + test %edx, %edx /* edx must be 0 if there is no null char from rsi+%rcx */ + jnz LABEL(less16bytes) + +#ifdef USE_AS_STRNCPY + lea -16(%r8,%rcx), %r11 + cmp $0, %r11 + jle LABEL(less16bytes) /* if r8 + rcx <= 16, branch to less16bytes. */ +#endif + + mov %rcx, %r9 + or %edi, %ecx + and $15, %ecx + lea -16(%r9), %r10 + jz LABEL(ashr_0) /* ecx must be 0 if offset of rsi and rdi is 16 byte align*/ + + neg %r10 /* store the rest in rsi aligned 16 bytes for unaligned_exit*/ + + pxor %xmm0, %xmm0 /* clear %xmm0, may be polluted by unaligned operation*/ + pcmpeqb 16(%rsi), %xmm0 /* compare 16 bytes in (%rsi) and %xmm0 for equality, try to find null char*/ + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(less32bytes) + /* + * at least 16 byte available to fill destination rdi + */ +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(less32bytes_strncpy_truncation) +#endif + mov (%rsi, %r9), %rdx + mov %rdx, (%rdi) + mov 8(%rsi, %r9), %rdx + mov %rdx, 8(%rdi) + + /* + * so far destatination rdi may be aligned by 16, re-calculate rsi to jump + * crossponding case + * rcx is offset of rsi + * rax is offset of rdi + */ + + and $0xfffffffffffffff0, %rdi /* force rdi 16 byte align */ + mov %rax, %rdx /* rax store orignal rdi */ + xor %rdi, %rdx /* equal to and $15, %rdx */ +#ifdef USE_AS_STRNCPY + add %rdx, %r8 +#endif + + add $16, %rdi /* next 16 bytes for rdi */ + sub %rdx, %r9 + + lea 16(%r9, %rsi), %rsi /*re-calculate rsi by (16 - rdx)+ rcx */ + mov %esi, %ecx /*store offset of rsi */ + and $0xfffffffffffffff0, %rsi /* force rsi 16 byte align */ + + and $15, %ecx /* ecx must be 0 if rdx is equal to rcx*/ + jz LABEL(ashr_0) + + lea -16(%rcx), %r10 + mov %rcx, %r9 + neg %r10 + lea LABEL(unaligned_table)(%rip), %r11 + movslq (%r11, %rcx,4), %rcx + lea (%r11, %rcx), %rcx + jmp *%rcx + + /* + * The following cases will be handled by ashr_0 & ashr_0_start + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * 0 0 0 ashr_0 + * n(1~15) n(1~15) 0 ashr_0_start + * + */ + .p2align 5 +LABEL(ashr_0): +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_aligned) +#endif + movdqa (%rsi), %xmm1 /* fetch first 16 bytes from rsi */ + movdqa %xmm1, (%rdi) /* store first 16 bytes into rdi */ + add $16, %rsi + add $16, %rdi + pcmpeqb (%rsi), %xmm0 /* compare 16 bytes in (%rsi) and %xmm0 for equality, try to find null char */ + pmovmskb %xmm0, %edx /* move each byte mask of %xmm0 to edx*/ + + test %edx, %edx /* edx must be 0 if there is no null char in rsi*/ + jnz LABEL(aligned_16bytes) + +LABEL(ashr_0_loop): +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_aligned) +#endif + movdqa (%rsi, %rcx), %xmm1 + movdqa %xmm1, (%rdi, %rcx) + add $16, %rcx + pcmpeqb (%rsi, %rcx), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(aligned_exit) + +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_aligned) +#endif + movdqa (%rsi, %rcx), %xmm1 + movdqa %xmm1, (%rdi, %rcx) + add $16, %rcx + pcmpeqb (%rsi, %rcx), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(aligned_exit) + +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_aligned) +#endif + movdqa (%rsi, %rcx), %xmm1 + movdqa %xmm1, (%rdi, %rcx) + add $16, %rcx + pcmpeqb (%rsi, %rcx), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(aligned_exit) + +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_aligned) +#endif + movdqa (%rsi, %rcx), %xmm1 + movdqa %xmm1, (%rdi, %rcx) + add $16, %rcx + pcmpeqb (%rsi, %rcx), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jz LABEL(ashr_0_loop) + + jmp LABEL(aligned_exit) + .p2align 4 + +/* + * The following cases will be handled by ashr_15 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(15) n - 15 15((16 - (n -15) + n)%16 ashr_15 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_15): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_15_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $15, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $15, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_15_use_ssse3) + +/* + * The following cases will be handled by ashr_14 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(14~15) n - 14 14((16 - (n -14) + n)%16 ashr_14 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_14): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_14_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $14, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $14, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_14_use_ssse3) + +/* + * The following cases will be handled by ashr_13 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(13~15) n - 13 13((16 - (n -13) + n)%16 ashr_13 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_13): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_13_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $13, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $13, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_13_use_ssse3) + +/* + * The following cases will be handled by ashr_12 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(12~15) n - 12 12((16 - (n -12) + n)%16 ashr_12 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_12): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_12_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $12, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $12, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_12_use_ssse3) + +/* + * The following cases will be handled by ashr_11 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(11~15) n - 11 11((16 - (n -11) + n)%16 ashr_11 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_11): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_11_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $11, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $11, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_11_use_ssse3) + +/* + * The following cases will be handled by ashr_10 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(10~15) n - 10 10((16 - (n -10) + n)%16 ashr_10 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_10): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_10_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $10, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $10, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_10_use_ssse3) + +/* + * The following cases will be handled by ashr_9 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(9~15) n - 9 9((16 - (n -9) + n)%16 ashr_9 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_9): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_9_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $9, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $9, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_9_use_ssse3) + +/* + * The following cases will be handled by ashr_8 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(8~15) n - 8 8((16 - (n -8) + n)%16 ashr_8 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_8): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_8_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $8, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $8, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_8_use_ssse3) + +/* + * The following cases will be handled by ashr_7 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(7~15) n - 7 7((16 - (n -7) + n)%16 ashr_7 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_7): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + .p2align 4 + +LABEL(ashr_7_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $7, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $7, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_7_use_ssse3) + +/* + * The following cases will be handled by ashr_6 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(6~15) n - 6 6((16 - (n -6) + n)%16 ashr_6 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_6): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_6_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $6, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $6, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_6_use_ssse3) + + /* + * The following cases will be handled by ashr_5 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(5~15) n - 5 5((16 - (n -5) + n)%16 ashr_5 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_5): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_5_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $5, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $5, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_5_use_ssse3) + +/* + * + * The following cases will be handled by ashr_4 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(4~15) n - 4 4((16 - (n -4) + n)%16 ashr_4 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_4): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_4_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $4, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $4, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_4_use_ssse3) + +/* + * + * The following cases will be handled by ashr_3 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(3~15) n - 3 3((16 - (n -3) + n)%16 ashr_3 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_3): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_3_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $3, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $3, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_3_use_ssse3) + +/* + * + * The following cases will be handled by ashr_2 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(2~15) n - 2 2((16 - (n -2) + n)%16 ashr_2 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_2): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_2_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $2, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $2, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_2_use_ssse3) + +/* + * + * The following cases will be handled by ashr_1 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(1~15) n - 1 1 ((16 - (n -1) + n)%16 ashr_1 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_1): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_1_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $1, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + palignr $1, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_1_use_ssse3) + + .p2align 4 +LABEL(less32bytes): + xor %ecx, %ecx +LABEL(unaligned_exit): + add %r9, %rsi /* r9 stores original offset of rsi*/ + mov %rcx, %r9 + mov %r10, %rcx + shl %cl, %edx /* after shl, calculate the exact number to be filled*/ + mov %r9, %rcx + .p2align 4 +LABEL(aligned_exit): + add %rcx, %rdi /*locate exact address for rdi */ +LABEL(less16bytes): + add %rcx, %rsi /*locate exact address for rsi */ +LABEL(aligned_16bytes): +#ifdef USE_AS_STRNCPY + mov $1, %r9d + lea -1(%r8), %rcx + shl %cl, %r9d + cmp $32, %r8 + ja LABEL(strncpy_tail) + or %r9d, %edx +LABEL(strncpy_tail): +#endif + bsf %rdx, %rcx /*If a least significant 1 bit in %rdx is found, its bit index is stored in %rcx*/ + lea LABEL(tail_table)(%rip), %r11 + movslq (%r11, %rcx,4), %rcx + lea (%r11, %rcx), %rcx + jmp *%rcx + +#ifdef USE_AS_STRNCPY + .p2align 4 +LABEL(less32bytes_strncpy_truncation): + xor %ecx, %ecx +LABEL(strncpy_truncation_unaligned): + add %r9, %rsi +LABEL(strncpy_truncation_aligned): + add %rcx, %rdi + add %rcx, %rsi + add $16, %r8 + lea -1(%r8), %rcx + lea LABEL(tail_table)(%rip), %r11 + movslq (%r11, %rcx,4), %rcx + lea (%r11, %rcx), %rcx + jmp *%rcx + .p2align 4 +LABEL(strncpy_exitz): + mov %rdi, %rax + ret +#endif + +#ifdef USE_AS_STRNCPY + .p2align 4 +LABEL(strncpy_fill_tail): + mov %rax, %rdx + movzx %cl, %rax + mov %r8, %rcx + add %rax, %rdi + xor %eax, %eax + shr $3, %ecx + jz LABEL(strncpy_fill_less_8) + + rep stosq +LABEL(strncpy_fill_less_8): + mov %r8, %rcx + and $7, %ecx + jz LABEL(strncpy_fill_return) +LABEL(strncpy_fill_less_7): + sub $1, %ecx + mov %al, (%rdi, %rcx) + jnz LABEL(strncpy_fill_less_7) +LABEL(strncpy_fill_return): +#ifdef USE_AS_STPCPY + cmpb $1, (%rdx) + sbb $-1, %rdx +#endif + mov %rdx, %rax + ret +#endif + .p2align 4 +LABEL(tail_0): + mov (%rsi), %cl + mov %cl, (%rdi) +#ifdef USE_AS_STPCPY + mov %rdi, %rax +#endif +#ifdef USE_AS_STRNCPY + mov $1, %cl + sub $1, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_1): + mov (%rsi), %cx + mov %cx, (%rdi) +#ifdef USE_AS_STPCPY + lea 1(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $2, %cl + sub $2, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_2): + mov (%rsi), %cx + mov %cx, (%rdi) + mov 1(%rsi), %cx + mov %cx, 1(%rdi) +#ifdef USE_AS_STPCPY + lea 2(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $3, %cl + sub $3, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_3): + mov (%rsi), %ecx + mov %ecx, (%rdi) +#ifdef USE_AS_STPCPY + lea 3(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $4, %cl + sub $4, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_4): + mov (%rsi), %ecx + mov %ecx, (%rdi) + mov 1(%rsi), %edx + mov %edx, 1(%rdi) +#ifdef USE_AS_STPCPY + lea 4(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $5, %cl + sub $5, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_5): + mov (%rsi), %ecx + mov %ecx, (%rdi) + mov 2(%rsi), %edx + mov %edx, 2(%rdi) +#ifdef USE_AS_STPCPY + lea 5(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $6, %cl + sub $6, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_6): + mov (%rsi), %ecx + mov %ecx, (%rdi) + mov 3(%rsi), %edx + mov %edx,3(%rdi) +#ifdef USE_AS_STPCPY + lea 6(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $7, %cl + sub $7, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_7): + mov (%rsi), %rcx + mov %rcx, (%rdi) +#ifdef USE_AS_STPCPY + lea 7(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $8, %cl + sub $8, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_8): + + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 5(%rsi), %edx + mov %edx, 5(%rdi) +#ifdef USE_AS_STPCPY + lea 8(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $9, %cl + sub $9, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_9): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 6(%rsi), %edx + mov %edx, 6(%rdi) +#ifdef USE_AS_STPCPY + lea 9(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $10, %cl + sub $10, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_10): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 7(%rsi), %edx + mov %edx, 7(%rdi) +#ifdef USE_AS_STPCPY + lea 10(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $11, %cl + sub $11, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_11): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %edx + mov %edx, 8(%rdi) +#ifdef USE_AS_STPCPY + lea 11(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $12, %cl + sub $12, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_12): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 5(%rsi), %rcx + mov %rcx, 5(%rdi) +#ifdef USE_AS_STPCPY + lea 12(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $13, %cl + sub $13, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_13): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 6(%rsi), %rcx + mov %rcx, 6(%rdi) +#ifdef USE_AS_STPCPY + lea 13(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $14, %cl + sub $14, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_14): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 7(%rsi), %rcx + mov %rcx, 7(%rdi) +#ifdef USE_AS_STPCPY + lea 14(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $15, %cl + sub $15, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + +LABEL(tail_15): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) +#ifdef USE_AS_STPCPY + lea 15(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $16, %cl + sub $16, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + + ret + + .p2align 4 +LABEL(tail_16): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %cl + mov %cl, 16(%rdi) +#ifdef USE_AS_STPCPY + lea 16(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $17, %cl + sub $17, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_17): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %cx + mov %cx, 16(%rdi) +#ifdef USE_AS_STPCPY + lea 17(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $18, %cl + sub $18, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_18): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 15(%rsi), %ecx + mov %ecx,15(%rdi) +#ifdef USE_AS_STPCPY + lea 18(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $19, %cl + sub $19, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_19): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %ecx + mov %ecx, 16(%rdi) +#ifdef USE_AS_STPCPY + lea 19(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $20, %cl + sub $20, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_20): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 13(%rsi), %rcx + mov %rcx, 13(%rdi) +#ifdef USE_AS_STPCPY + lea 20(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $21, %cl + sub $21, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_21): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 14(%rsi), %rcx + mov %rcx, 14(%rdi) +#ifdef USE_AS_STPCPY + lea 21(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $22, %cl + sub $22, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_22): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 15(%rsi), %rcx + mov %rcx, 15(%rdi) +#ifdef USE_AS_STPCPY + lea 22(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $23, %cl + sub $23, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_23): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) +#ifdef USE_AS_STPCPY + lea 23(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $24, %cl + sub $24, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + + ret + + .p2align 4 +LABEL(tail_24): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) + mov 21(%rsi), %edx + mov %edx, 21(%rdi) +#ifdef USE_AS_STPCPY + lea 24(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $25, %cl + sub $25, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_25): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) + mov 22(%rsi), %edx + mov %edx, 22(%rdi) +#ifdef USE_AS_STPCPY + lea 25(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $26, %cl + sub $26, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_26): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) + mov 23(%rsi), %edx + mov %edx, 23(%rdi) +#ifdef USE_AS_STPCPY + lea 26(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $27, %cl + sub $27, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_27): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) + mov 24(%rsi), %edx + mov %edx, 24(%rdi) +#ifdef USE_AS_STPCPY + lea 27(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $28, %cl + sub $28, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_28): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) + mov 21(%rsi), %rdx + mov %rdx, 21(%rdi) +#ifdef USE_AS_STPCPY + lea 28(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $29, %cl + sub $29, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + + ret + + .p2align 4 +LABEL(tail_29): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) + mov 22(%rsi), %rdx + mov %rdx, 22(%rdi) +#ifdef USE_AS_STPCPY + lea 29(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $30, %cl + sub $30, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + + ret + + + .p2align 4 +LABEL(tail_30): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) + mov 23(%rsi), %rdx + mov %rdx, 23(%rdi) +#ifdef USE_AS_STPCPY + lea 30(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $31, %cl + sub $31, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_31): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) + mov 24(%rsi), %rdx + mov %rdx, 24(%rdi) +#ifdef USE_AS_STPCPY + lea 31(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $32, %cl + sub $32, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + cfi_endproc + .size STRCPY_SSSE3, .-STRCPY_SSSE3 + + .p2align 4 + .section .rodata.ssse3,"a",@progbits +LABEL(tail_table): + .int LABEL(tail_0) - LABEL(tail_table) + .int LABEL(tail_1) - LABEL(tail_table) + .int LABEL(tail_2) - LABEL(tail_table) + .int LABEL(tail_3) - LABEL(tail_table) + .int LABEL(tail_4) - LABEL(tail_table) + .int LABEL(tail_5) - LABEL(tail_table) + .int LABEL(tail_6) - LABEL(tail_table) + .int LABEL(tail_7) - LABEL(tail_table) + .int LABEL(tail_8) - LABEL(tail_table) + .int LABEL(tail_9) - LABEL(tail_table) + .int LABEL(tail_10) - LABEL(tail_table) + .int LABEL(tail_11) - LABEL(tail_table) + .int LABEL(tail_12) - LABEL(tail_table) + .int LABEL(tail_13) - LABEL(tail_table) + .int LABEL(tail_14) - LABEL(tail_table) + .int LABEL(tail_15) - LABEL(tail_table) + .int LABEL(tail_16) - LABEL(tail_table) + .int LABEL(tail_17) - LABEL(tail_table) + .int LABEL(tail_18) - LABEL(tail_table) + .int LABEL(tail_19) - LABEL(tail_table) + .int LABEL(tail_20) - LABEL(tail_table) + .int LABEL(tail_21) - LABEL(tail_table) + .int LABEL(tail_22) - LABEL(tail_table) + .int LABEL(tail_23) - LABEL(tail_table) + .int LABEL(tail_24) - LABEL(tail_table) + .int LABEL(tail_25) - LABEL(tail_table) + .int LABEL(tail_26) - LABEL(tail_table) + .int LABEL(tail_27) - LABEL(tail_table) + .int LABEL(tail_28) - LABEL(tail_table) + .int LABEL(tail_29) - LABEL(tail_table) + .int LABEL(tail_30) - LABEL(tail_table) + .int LABEL(tail_31) - LABEL(tail_table) + + .p2align 4 +LABEL(unaligned_table): + .int LABEL(ashr_0) - LABEL(unaligned_table) + .int LABEL(ashr_1) - LABEL(unaligned_table) + .int LABEL(ashr_2) - LABEL(unaligned_table) + .int LABEL(ashr_3) - LABEL(unaligned_table) + .int LABEL(ashr_4) - LABEL(unaligned_table) + .int LABEL(ashr_5) - LABEL(unaligned_table) + .int LABEL(ashr_6) - LABEL(unaligned_table) + .int LABEL(ashr_7) - LABEL(unaligned_table) + .int LABEL(ashr_8) - LABEL(unaligned_table) + .int LABEL(ashr_9) - LABEL(unaligned_table) + .int LABEL(ashr_10) - LABEL(unaligned_table) + .int LABEL(ashr_11) - LABEL(unaligned_table) + .int LABEL(ashr_12) - LABEL(unaligned_table) + .int LABEL(ashr_13) - LABEL(unaligned_table) + .int LABEL(ashr_14) - LABEL(unaligned_table) + .int LABEL(ashr_15) - LABEL(unaligned_table) + +# undef ENTRY +# define ENTRY(name) \ + .type STRCPY_SSE2, @function; \ + .align 16; \ + STRCPY_SSE2: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size STRCPY_SSE2, .-STRCPY_SSE2 +# undef libc_hidden_builtin_def +/* It doesn't make sense to send libc-internal strcpy calls through a PLT. + The speedup we get from using SSSE3 instruction is likely eaten away + by the indirect call in the PLT. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_STRCPY; __GI_STRCPY = STRCPY_SSE2 +# undef libc_hidden_def +# define libc_hidden_def(name) \ + .globl __GI___STRCPY; __GI___STRCPY = STRCPY_SSE2 +#endif + +#ifndef USE_AS_STRNCPY +#include "../strcpy.S" +#endif diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c new file mode 100644 index 0000000000..4512267d3f --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcspn-c.c @@ -0,0 +1,312 @@ +/* strcspn with SSE4.2 intrinsics + Copyright (C) 2009 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <nmmintrin.h> +#include <string.h> + +/* We use 0x2: + _SIDD_SBYTE_OPS + | _SIDD_CMP_EQUAL_ANY + | _SIDD_POSITIVE_POLARITY + | _SIDD_LEAST_SIGNIFICANT + on pcmpistri to compare xmm/mem128 + + 0 1 2 3 4 5 6 7 8 9 A B C D E F + X X X X X X X X X X X X X X X X + + against xmm + + 0 1 2 3 4 5 6 7 8 9 A B C D E F + A A A A A A A A A A A A A A A A + + to find out if the first 16byte data element has any byte A and + the offset of the first byte. There are 3 cases: + + 1. The first 16byte data element has the byte A at the offset X. + 2. The first 16byte data element has EOS and doesn't have the byte A. + 3. The first 16byte data element is valid and doesn't have the byte A. + + Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases: + + 1 X 1 0/1 0 + 2 16 0 1 0 + 3 16 0 0 0 + + We exit from the loop for cases 1 and 2 with jbe which branches + when either CFlag or ZFlag is 1. If CFlag == 1, ECX has the offset + X for case 1. */ + +#ifndef STRCSPN_SSE2 +# define STRCSPN_SSE2 __strcspn_sse2 +# define STRCSPN_SSE42 __strcspn_sse42 +#endif + +#ifdef USE_AS_STRPBRK +# define RETURN(val1, val2) return val1 +#else +# define RETURN(val1, val2) return val2 +#endif + +extern +#ifdef USE_AS_STRPBRK +char * +#else +size_t +#endif +STRCSPN_SSE2 (const char *, const char *); + + +#ifdef USE_AS_STRPBRK +char * +#else +size_t +#endif +__attribute__ ((section (".text.sse4.2"))) +STRCSPN_SSE42 (const char *s, const char *a) +{ + if (*a == 0) + RETURN (NULL, strlen (s)); + + const char *aligned; + __m128i mask; + int offset = (int) ((size_t) a & 15); + if (offset != 0) + { + /* Load masks. */ + aligned = (const char *) ((size_t) a & 0xfffffffffffffff0L); + __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); + + switch (offset) + { + case 1: + mask = _mm_srli_si128 (mask0, 1); + break; + case 2: + mask = _mm_srli_si128 (mask0, 2); + break; + case 3: + mask = _mm_srli_si128 (mask0, 3); + break; + case 4: + mask = _mm_srli_si128 (mask0, 4); + break; + case 5: + mask = _mm_srli_si128 (mask0, 5); + break; + case 6: + mask = _mm_srli_si128 (mask0, 6); + break; + case 7: + mask = _mm_srli_si128 (mask0, 7); + break; + case 8: + mask = _mm_srli_si128 (mask0, 8); + break; + case 9: + mask = _mm_srli_si128 (mask0, 9); + break; + case 10: + mask = _mm_srli_si128 (mask0, 10); + break; + case 11: + mask = _mm_srli_si128 (mask0, 11); + break; + case 12: + mask = _mm_srli_si128 (mask0, 12); + break; + case 13: + mask = _mm_srli_si128 (mask0, 13); + break; + case 14: + mask = _mm_srli_si128 (mask0, 14); + break; + case 15: + mask = _mm_srli_si128 (mask0, 15); + break; + } + + /* Find where the NULL terminator is. */ + int length = _mm_cmpistri (mask, mask, 0x3a); + if (length == 16 - offset) + { + /* There is no NULL terminator. */ + __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16)); + int index = _mm_cmpistri (mask1, mask1, 0x3a); + length += index; + + /* Don't use SSE4.2 if the length of A > 16. */ + if (length > 16) + return STRCSPN_SSE2 (s, a); + + if (index != 0) + { + /* Combine mask0 and mask1. */ + switch (offset) + { + case 1: + mask = _mm_alignr_epi8 (mask1, mask0, 1); + break; + case 2: + mask = _mm_alignr_epi8 (mask1, mask0, 2); + break; + case 3: + mask = _mm_alignr_epi8 (mask1, mask0, 3); + break; + case 4: + mask = _mm_alignr_epi8 (mask1, mask0, 4); + break; + case 5: + mask = _mm_alignr_epi8 (mask1, mask0, 5); + break; + case 6: + mask = _mm_alignr_epi8 (mask1, mask0, 6); + break; + case 7: + mask = _mm_alignr_epi8 (mask1, mask0, 7); + break; + case 8: + mask = _mm_alignr_epi8 (mask1, mask0, 8); + break; + case 9: + mask = _mm_alignr_epi8 (mask1, mask0, 9); + break; + case 10: + mask = _mm_alignr_epi8 (mask1, mask0, 10); + break; + case 11: + mask = _mm_alignr_epi8 (mask1, mask0, 11); + break; + case 12: + mask = _mm_alignr_epi8 (mask1, mask0, 12); + break; + case 13: + mask = _mm_alignr_epi8 (mask1, mask0, 13); + break; + case 14: + mask = _mm_alignr_epi8 (mask1, mask0, 14); + break; + case 15: + mask = _mm_alignr_epi8 (mask1, mask0, 15); + break; + } + } + } + } + else + { + /* A is aligned. */ + mask = _mm_load_si128 ((__m128i *) a); + + /* Find where the NULL terminator is. */ + int length = _mm_cmpistri (mask, mask, 0x3a); + if (length == 16) + { + /* There is no NULL terminator. Don't use SSE4.2 if the length + of A > 16. */ + if (a[16] != 0) + return STRCSPN_SSE2 (s, a); + } + } + + offset = (int) ((size_t) s & 15); + if (offset != 0) + { + /* Check partial string. */ + aligned = (const char *) ((size_t) s & 0xfffffffffffffff0L); + __m128i value = _mm_load_si128 ((__m128i *) aligned); + + switch (offset) + { + case 1: + value = _mm_srli_si128 (value, 1); + break; + case 2: + value = _mm_srli_si128 (value, 2); + break; + case 3: + value = _mm_srli_si128 (value, 3); + break; + case 4: + value = _mm_srli_si128 (value, 4); + break; + case 5: + value = _mm_srli_si128 (value, 5); + break; + case 6: + value = _mm_srli_si128 (value, 6); + break; + case 7: + value = _mm_srli_si128 (value, 7); + break; + case 8: + value = _mm_srli_si128 (value, 8); + break; + case 9: + value = _mm_srli_si128 (value, 9); + break; + case 10: + value = _mm_srli_si128 (value, 10); + break; + case 11: + value = _mm_srli_si128 (value, 11); + break; + case 12: + value = _mm_srli_si128 (value, 12); + break; + case 13: + value = _mm_srli_si128 (value, 13); + break; + case 14: + value = _mm_srli_si128 (value, 14); + break; + case 15: + value = _mm_srli_si128 (value, 15); + break; + } + + int length = _mm_cmpistri (mask, value, 0x2); + /* No need to check ZFlag since ZFlag is always 1. */ + int cflag = _mm_cmpistrc (mask, value, 0x2); + if (cflag) + RETURN ((char *) (s + length), length); + /* Find where the NULL terminator is. */ + int index = _mm_cmpistri (value, value, 0x3a); + if (index < 16 - offset) + RETURN (NULL, index); + aligned += 16; + } + else + aligned = s; + + while (1) + { + __m128i value = _mm_load_si128 ((__m128i *) aligned); + int index = _mm_cmpistri (mask, value, 0x2); + int cflag = _mm_cmpistrc (mask, value, 0x2); + int zflag = _mm_cmpistrz (mask, value, 0x2); + if (cflag) + RETURN ((char *) (aligned + index), (size_t) (aligned + index - s)); + if (zflag) + RETURN (NULL, + /* Find where the NULL terminator is. */ + (size_t) (aligned + _mm_cmpistri (value, value, 0x3a) - s)); + aligned += 16; + } +} diff --git a/sysdeps/x86_64/multiarch/strcspn.S b/sysdeps/x86_64/multiarch/strcspn.S new file mode 100644 index 0000000000..cc75ab70e6 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcspn.S @@ -0,0 +1,82 @@ +/* Multiple versions of strcspn + Copyright (C) 2009 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <config.h> + +#ifdef HAVE_SSE4_SUPPORT + +#include <sysdep.h> +#include <ifunc-defines.h> + +#ifdef USE_AS_STRPBRK +#define STRCSPN_SSE42 __strpbrk_sse42 +#define STRCSPN_SSE2 __strpbrk_sse2 +#define __GI_STRCSPN __GI_strpbrk +#else +#ifndef STRCSPN +#define STRCSPN strcspn +#define STRCSPN_SSE42 __strcspn_sse42 +#define STRCSPN_SSE2 __strcspn_sse2 +#define __GI_STRCSPN __GI_strcspn +#endif +#endif + +/* Define multiple versions only for the definition in libc. Don't + define multiple versions for strpbrk in static library since we + need strpbrk before the initialization happened. */ +#if (defined SHARED || !defined USE_AS_STRPBRK) && !defined NOT_IN_libc + .text +ENTRY(STRCSPN) + .type STRCSPN, @gnu_indirect_function + cmpl $0, __cpu_features+KIND_OFFSET(%rip) + jne 1f + call __init_cpu_features +1: leaq STRCSPN_SSE2(%rip), %rax + testl $(1<<20), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip) + jz 2f + leaq STRCSPN_SSE42(%rip), %rax +2: ret +END(STRCSPN) + +# undef ENTRY +# define ENTRY(name) \ + .type STRCSPN_SSE2, @function; \ + .globl STRCSPN_SSE2; \ + .align 16; \ + STRCSPN_SSE2: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size STRCSPN_SSE2, .-STRCSPN_SSE2 +# undef libc_hidden_builtin_def +/* It doesn't make sense to send libc-internal strcspn calls through a PLT. + The speedup we get from using SSE4.2 instruction is likely eaten away + by the indirect call in the PLT. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_STRCSPN; __GI_STRCSPN = STRCSPN_SSE2 +#endif + +#endif /* HAVE_SSE4_SUPPORT */ + +#ifdef USE_AS_STRPBRK +#include "../strpbrk.S" +#else +#include "../strcspn.S" +#endif diff --git a/sysdeps/x86_64/multiarch/strlen.S b/sysdeps/x86_64/multiarch/strlen.S index 79e6a977ec..82b03ccc28 100644 --- a/sysdeps/x86_64/multiarch/strlen.S +++ b/sysdeps/x86_64/multiarch/strlen.S @@ -77,6 +77,7 @@ __strlen_sse42: # undef ENTRY # define ENTRY(name) \ .type __strlen_sse2, @function; \ + .align 16; \ __strlen_sse2: cfi_startproc; \ CALL_MCOUNT # undef END diff --git a/sysdeps/x86_64/multiarch/strncpy-c.c b/sysdeps/x86_64/multiarch/strncpy-c.c new file mode 100644 index 0000000000..296c32cb5d --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncpy-c.c @@ -0,0 +1,8 @@ +#define STRNCPY __strncpy_sse2 +#ifdef SHARED +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(name) \ + __hidden_ver1 (__strncpy_sse2, __GI_strncpy, __strncpy_sse2); +#endif + +#include "strncpy.c" diff --git a/sysdeps/x86_64/multiarch/strncpy.S b/sysdeps/x86_64/multiarch/strncpy.S new file mode 100644 index 0000000000..327a4ce447 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncpy.S @@ -0,0 +1,3 @@ +#define STRCPY strncpy +#define USE_AS_STRNCPY +#include "strcpy.S" diff --git a/sysdeps/x86_64/multiarch/strpbrk-c.c b/sysdeps/x86_64/multiarch/strpbrk-c.c new file mode 100644 index 0000000000..c58dcb5605 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strpbrk-c.c @@ -0,0 +1,4 @@ +#define USE_AS_STRPBRK +#define STRCSPN_SSE2 __strpbrk_sse2 +#define STRCSPN_SSE42 __strpbrk_sse42 +#include "strcspn-c.c" diff --git a/sysdeps/x86_64/multiarch/strpbrk.S b/sysdeps/x86_64/multiarch/strpbrk.S new file mode 100644 index 0000000000..ed5bca6a94 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strpbrk.S @@ -0,0 +1,3 @@ +#define STRCSPN strpbrk +#define USE_AS_STRPBRK +#include "strcspn.S" diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c new file mode 100644 index 0000000000..5b99f0d383 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strspn-c.c @@ -0,0 +1,284 @@ +/* strspn with SSE4.2 intrinsics + Copyright (C) 2009 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <nmmintrin.h> +#include <string.h> + +/* We use 0x12: + _SIDD_SBYTE_OPS + | _SIDD_CMP_EQUAL_ANY + | _SIDD_NEGATIVE_POLARITY + | _SIDD_LEAST_SIGNIFICANT + on pcmpistri to compare xmm/mem128 + + 0 1 2 3 4 5 6 7 8 9 A B C D E F + X X X X X X X X X X X X X X X X + + against xmm + + 0 1 2 3 4 5 6 7 8 9 A B C D E F + A A A A A A A A A A A A A A A A + + to find out if the first 16byte data element has any non-A byte and + the offset of the first byte. There are 2 cases: + + 1. The first 16byte data element has the non-A byte, including + EOS, at the offset X. + 2. The first 16byte data element is valid and doesn't have the non-A + byte. + + Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases: + + case ECX CFlag ZFlag SFlag + 1 X 1 0/1 0 + 2 16 0 0 0 + + We exit from the loop for case 1. */ + +extern size_t __strspn_sse2 (const char *, const char *); + + +size_t +__attribute__ ((section (".text.sse4.2"))) +__strspn_sse42 (const char *s, const char *a) +{ + if (*a == 0) + return 0; + + const char *aligned; + __m128i mask; + int offset = (int) ((size_t) a & 15); + if (offset != 0) + { + /* Load masks. */ + aligned = (const char *) ((size_t) a & 0xfffffffffffffff0L); + __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); + + switch (offset) + { + case 1: + mask = _mm_srli_si128 (mask0, 1); + break; + case 2: + mask = _mm_srli_si128 (mask0, 2); + break; + case 3: + mask = _mm_srli_si128 (mask0, 3); + break; + case 4: + mask = _mm_srli_si128 (mask0, 4); + break; + case 5: + mask = _mm_srli_si128 (mask0, 5); + break; + case 6: + mask = _mm_srli_si128 (mask0, 6); + break; + case 7: + mask = _mm_srli_si128 (mask0, 7); + break; + case 8: + mask = _mm_srli_si128 (mask0, 8); + break; + case 9: + mask = _mm_srli_si128 (mask0, 9); + break; + case 10: + mask = _mm_srli_si128 (mask0, 10); + break; + case 11: + mask = _mm_srli_si128 (mask0, 11); + break; + case 12: + mask = _mm_srli_si128 (mask0, 12); + break; + case 13: + mask = _mm_srli_si128 (mask0, 13); + break; + case 14: + mask = _mm_srli_si128 (mask0, 14); + break; + case 15: + mask = _mm_srli_si128 (mask0, 15); + break; + } + + /* Find where the NULL terminator is. */ + int length = _mm_cmpistri (mask, mask, 0x3a); + if (length == 16 - offset) + { + /* There is no NULL terminator. */ + __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16)); + int index = _mm_cmpistri (mask1, mask1, 0x3a); + length += index; + + /* Don't use SSE4.2 if the length of A > 16. */ + if (length > 16) + return __strspn_sse2 (s, a); + + if (index != 0) + { + /* Combine mask0 and mask1. */ + switch (offset) + { + case 1: + mask = _mm_alignr_epi8 (mask1, mask0, 1); + break; + case 2: + mask = _mm_alignr_epi8 (mask1, mask0, 2); + break; + case 3: + mask = _mm_alignr_epi8 (mask1, mask0, 3); + break; + case 4: + mask = _mm_alignr_epi8 (mask1, mask0, 4); + break; + case 5: + mask = _mm_alignr_epi8 (mask1, mask0, 5); + break; + case 6: + mask = _mm_alignr_epi8 (mask1, mask0, 6); + break; + case 7: + mask = _mm_alignr_epi8 (mask1, mask0, 7); + break; + case 8: + mask = _mm_alignr_epi8 (mask1, mask0, 8); + break; + case 9: + mask = _mm_alignr_epi8 (mask1, mask0, 9); + break; + case 10: + mask = _mm_alignr_epi8 (mask1, mask0, 10); + break; + case 11: + mask = _mm_alignr_epi8 (mask1, mask0, 11); + break; + case 12: + mask = _mm_alignr_epi8 (mask1, mask0, 12); + break; + case 13: + mask = _mm_alignr_epi8 (mask1, mask0, 13); + break; + case 14: + mask = _mm_alignr_epi8 (mask1, mask0, 14); + break; + case 15: + mask = _mm_alignr_epi8 (mask1, mask0, 15); + break; + } + } + } + } + else + { + /* A is aligned. */ + mask = _mm_load_si128 ((__m128i *) a); + + /* Find where the NULL terminator is. */ + int length = _mm_cmpistri (mask, mask, 0x3a); + if (length == 16) + { + /* There is no NULL terminator. Don't use SSE4.2 if the length + of A > 16. */ + if (a[16] != 0) + return __strspn_sse2 (s, a); + } + } + + offset = (int) ((size_t) s & 15); + if (offset != 0) + { + /* Check partial string. */ + aligned = (const char *) ((size_t) s & 0xfffffffffffffff0L); + __m128i value = _mm_load_si128 ((__m128i *) aligned); + + switch (offset) + { + case 1: + value = _mm_srli_si128 (value, 1); + break; + case 2: + value = _mm_srli_si128 (value, 2); + break; + case 3: + value = _mm_srli_si128 (value, 3); + break; + case 4: + value = _mm_srli_si128 (value, 4); + break; + case 5: + value = _mm_srli_si128 (value, 5); + break; + case 6: + value = _mm_srli_si128 (value, 6); + break; + case 7: + value = _mm_srli_si128 (value, 7); + break; + case 8: + value = _mm_srli_si128 (value, 8); + break; + case 9: + value = _mm_srli_si128 (value, 9); + break; + case 10: + value = _mm_srli_si128 (value, 10); + break; + case 11: + value = _mm_srli_si128 (value, 11); + break; + case 12: + value = _mm_srli_si128 (value, 12); + break; + case 13: + value = _mm_srli_si128 (value, 13); + break; + case 14: + value = _mm_srli_si128 (value, 14); + break; + case 15: + value = _mm_srli_si128 (value, 15); + break; + } + + int length = _mm_cmpistri (mask, value, 0x12); + /* No need to check CFlag since it is always 1. */ + if (length < 16 - offset) + return length; + /* Find where the NULL terminator is. */ + int index = _mm_cmpistri (value, value, 0x3a); + if (index < 16 - offset) + return length; + aligned += 16; + } + else + aligned = s; + + while (1) + { + __m128i value = _mm_load_si128 ((__m128i *) aligned); + int index = _mm_cmpistri (mask, value, 0x12); + int cflag = _mm_cmpistrc (mask, value, 0x12); + if (cflag) + return (size_t) (aligned + index - s); + aligned += 16; + } +} diff --git a/sysdeps/x86_64/multiarch/strspn.S b/sysdeps/x86_64/multiarch/strspn.S new file mode 100644 index 0000000000..4183a2cf60 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strspn.S @@ -0,0 +1,63 @@ +/* Multiple versions of strspn + Copyright (C) 2009 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <config.h> + +#ifdef HAVE_SSE4_SUPPORT + +#include <sysdep.h> +#include <ifunc-defines.h> + +/* Define multiple versions only for the definition in libc. */ +#ifndef NOT_IN_libc + .text +ENTRY(strspn) + .type strspn, @gnu_indirect_function + cmpl $0, __cpu_features+KIND_OFFSET(%rip) + jne 1f + call __init_cpu_features +1: leaq __strspn_sse2(%rip), %rax + testl $(1<<20), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip) + jz 2f + leaq __strspn_sse42(%rip), %rax +2: ret +END(strspn) + +# undef ENTRY +# define ENTRY(name) \ + .type __strspn_sse2, @function; \ + .globl __strspn_sse2; \ + .align 16; \ + __strspn_sse2: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __strspn_sse2, .-__strspn_sse2 +# undef libc_hidden_builtin_def +/* It doesn't make sense to send libc-internal strspn calls through a PLT. + The speedup we get from using SSE4.2 instruction is likely eaten away + by the indirect call in the PLT. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_strspn; __GI_strspn = __strspn_sse2 +#endif + +#endif /* HAVE_SSE4_SUPPORT */ + +#include "../strspn.S" |