From 772f4e6a1be0e3fbd8c4e9edacf1887269f598dc Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Mon, 22 Jun 2009 20:38:41 -0700 Subject: Add SSE4.2 support for strcmp and strncmp on x86-64. --- ChangeLog | 12 + string/strncmp.c | 12 +- sysdeps/x86_64/multiarch/Makefile | 4 + sysdeps/x86_64/multiarch/init-arch.h | 3 + sysdeps/x86_64/multiarch/strcmp.S | 1677 ++++++++++++++++++++++++++++++++++ sysdeps/x86_64/multiarch/strncmp-c.c | 8 + sysdeps/x86_64/multiarch/strncmp.S | 3 + 7 files changed, 1714 insertions(+), 5 deletions(-) create mode 100644 sysdeps/x86_64/multiarch/strcmp.S create mode 100644 sysdeps/x86_64/multiarch/strncmp-c.c create mode 100644 sysdeps/x86_64/multiarch/strncmp.S diff --git a/ChangeLog b/ChangeLog index 08e4b863ef..8ddbd84af5 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,15 @@ +2009-06-22 H.J. Lu + + * string/strncmp.c (STRNCMP): New. Defined to strncmp if not + defined. + (strncmp): Renamed to STRNCMP. + * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add + strncmp-c for string. + * sysdeps/x86_64/multiarch/init-arch.h (HAS_SSE4_2): Define. + * sysdeps/x86_64/multiarch/strcmp.S: New file. + * sysdeps/x86_64/multiarch/strncmp.S: New file. + * sysdeps/x86_64/multiarch/strncmp-c.c: New file. + 2009-06-19 H.J. Lu * elf/Makefile (distribute): Add ifuncmain1staticpie.c, diff --git a/string/strncmp.c b/string/strncmp.c index 1adb2c0ebd..bb0cbfdf65 100644 --- a/string/strncmp.c +++ b/string/strncmp.c @@ -21,15 +21,16 @@ #undef strncmp +#ifndef STRNCMP +#define STRNCMP strncmp +#endif + /* Compare no more than N characters of S1 and S2, returning less than, equal to or greater than zero if S1 is lexicographically less than, equal to or greater than S2. */ int -strncmp (s1, s2, n) - const char *s1; - const char *s2; - size_t n; +STRNCMP (const char *s1, const char *s2, size_t n) { unsigned reg_char c1 = '\0'; unsigned reg_char c2 = '\0'; @@ -70,4 +71,5 @@ strncmp (s1, s2, n) return c1 - c2; } -libc_hidden_builtin_def (strncmp) + +libc_hidden_builtin_def (STRNCMP) diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 33d98c36e6..1c35e1ffb4 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -2,3 +2,7 @@ ifeq ($(subdir),csu) aux += init-arch gen-as-const-headers += ifunc-defines.sym endif + +ifeq ($(subdir),string) +sysdep_routines += strncmp-c +endif diff --git a/sysdeps/x86_64/multiarch/init-arch.h b/sysdeps/x86_64/multiarch/init-arch.h index f160ba2a94..5c4892de38 100644 --- a/sysdeps/x86_64/multiarch/init-arch.h +++ b/sysdeps/x86_64/multiarch/init-arch.h @@ -56,3 +56,6 @@ extern void __init_cpu_features (void) attribute_hidden; #define HAS_POPCOUNT \ ((__cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx & (1 << 23)) != 0) + +#define HAS_SSE4_2 \ + ((__cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx & (1 << 20)) != 0) diff --git a/sysdeps/x86_64/multiarch/strcmp.S b/sysdeps/x86_64/multiarch/strcmp.S new file mode 100644 index 0000000000..8a6aee673a --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcmp.S @@ -0,0 +1,1677 @@ +/* strcmp with SSE4.2 + Copyright (C) 2009 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include +#include + +#ifdef USE_AS_STRNCMP +/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz + if the new counter > the old one or is 0. */ +#define UPDATE_STRNCMP_COUNTER \ + /* calculate left number to compare */ \ + lea -16(%rcx, %r11), %r9; \ + cmp %r9, %r11; \ + jb LABEL(strcmp_exitz); \ + test %r9, %r9; \ + je LABEL(strcmp_exitz); \ + mov %r9, %r11 + +#define STRCMP_SSE42 __strncmp_sse42 +#define STRCMP_SSE2 __strncmp_sse2 +#define __GI_STRCMP __GI_strncmp +#else +#define UPDATE_STRNCMP_COUNTER +#ifndef STRCMP +#define STRCMP strcmp +#define STRCMP_SSE42 __strcmp_sse42 +#define STRCMP_SSE2 __strcmp_sse2 +#define __GI_STRCMP __GI_strcmp +#endif +#endif + +#ifndef LABEL +#define LABEL(l) L(l) +#endif + +/* Define multiple versions only for the definition in libc. Don't + define multiple versions for strncmp in static library since we + need strncmp before the initialization happened. */ +#if (defined SHARED || !defined USE_AS_STRNCMP) && !defined NOT_IN_libc + .text +ENTRY(STRCMP) + .type STRCMP, @gnu_indirect_function + cmpl $0, __cpu_features+KIND_OFFSET(%rip) + jne 1f + call __init_cpu_features +1: leaq STRCMP_SSE2(%rip), %rax + testl $(1<<20), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip) + jz 2f + leaq STRCMP_SSE42(%rip), %rax +2: ret +END(STRCMP) + +/* We use 0x1a: + _SIDD_SBYTE_OPS + | _SIDD_CMP_EQUAL_EACH + | _SIDD_NEGATIVE_POLARITY + | _SIDD_LEAST_SIGNIFICANT + on pcmpistri to find out if two 16byte data elements are the same + and the offset of the first different byte. There are 4 cases: + + 1. Both 16byte data elements are valid and identical. + 2. Both 16byte data elements have EOS and identical. + 3. Both 16byte data elements are valid and they differ at offset X. + 4. At least one 16byte data element has EOS at offset X. Two 16byte + data elements must differ at or before offset X. + + Here is the table of ECX, CFlag, ZFlag and SFlag for 4 cases: + + case ECX CFlag ZFlag SFlag + 1 16 0 0 0 + 2 16 0 1 1 + 3 X 1 0 0 + 4 0 <= X 1 0/1 0/1 + + We exit from the loop for cases 2, 3 and 4 with jbe which branches + when either CFlag or ZFlag is 1. If CFlag == 0, we return 0 for + case 2. */ + + /* Put all SSE 4.2 functions together. */ + .section .text.sse4.2,"ax",@progbits + .align 16 + .type STRCMP_SSE42, @function +STRCMP_SSE42: + cfi_startproc + CALL_MCOUNT + +/* + * This implementation uses SSE to compare up to 16 bytes at a time. + */ +#ifdef USE_AS_STRNCMP + test %rdx, %rdx + je LABEL(strcmp_exitz) + cmp $1, %rdx + je LABEL(Byte0) + mov %rdx, %r11 +#endif + mov %esi, %ecx + mov %edi, %eax +/* Use 64bit AND here to avoid long NOP padding. */ + and $0x3f, %rcx /* rsi alignment in cache line */ + and $0x3f, %rax /* rdi alignment in cache line */ + cmp $0x30, %ecx + ja LABEL(crosscache) /* rsi: 16-byte load will cross cache line */ + cmp $0x30, %eax + ja LABEL(crosscache) /* rdi: 16-byte load will cross cache line */ + movlpd (%rdi), %xmm1 + movlpd (%rsi), %xmm2 + movhpd 8(%rdi), %xmm1 + movhpd 8(%rsi), %xmm2 + pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */ + pcmpeqb %xmm1, %xmm0 /* Any null chars? */ + pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */ + jnz LABEL(less16bytes) /* If not, find different value or null char */ +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) /* finish comparision */ +#endif + add $16, %rsi /* prepare to search next 16 bytes */ + add $16, %rdi /* prepare to search next 16 bytes */ + + /* + * Determine source and destination string offsets from 16-byte alignment. + * Use relative offset difference between the two to determine which case + * below to use. + */ + .p2align 4 +LABEL(crosscache): + and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */ + and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */ + mov $0xffff, %edx /* for equivalent offset */ + xor %r8d, %r8d + and $0xf, %ecx /* offset of rsi */ + and $0xf, %eax /* offset of rdi */ + cmp %eax, %ecx + je LABEL(ashr_0) /* rsi and rdi relative offset same */ + ja LABEL(bigger) + mov %edx, %r8d /* r8d is offset flag for exit tail */ + xchg %ecx, %eax + xchg %rsi, %rdi +LABEL(bigger): + lea 15(%rax), %r9 + sub %rcx, %r9 + lea LABEL(unaligned_table)(%rip), %r10 + movslq (%r10, %r9,4), %r9 + lea (%r10, %r9), %r10 + jmp *%r10 /* jump to corresponding case */ + +/* + * The following cases will be handled by ashr_0 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(0~15) n(0~15) 15(15+ n-n) ashr_0 + */ + .p2align 4 +LABEL(ashr_0): + + movdqa (%rsi), %xmm1 + pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */ + pcmpeqb %xmm1, %xmm0 /* Any null chars? */ + pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %r9d + shr %cl, %edx /* adjust 0xffff for offset */ + shr %cl, %r9d /* adjust for 16-byte offset */ + sub %r9d, %edx + /* + * edx must be the same with r9d if in left byte (16-rcx) is equal to + * the start from (16-rax) and no null char was seen. + */ + jne LABEL(less32bytes) /* mismatch or null char */ + UPDATE_STRNCMP_COUNTER + mov $16, %rcx + mov $16, %r9 + pxor %xmm0, %xmm0 /* clear xmm0, may have changed above */ + + /* + * Now both strings are aligned at 16-byte boundary. Loop over strings + * checking 32-bytes per iteration. + */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + .p2align 4 +LABEL(ashr_0_use_sse4_2): + movdqa (%rdi,%rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + lea 16(%rdx), %rdx + jbe LABEL(ashr_0_use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + movdqa (%rdi,%rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + lea 16(%rdx), %rdx + jbe LABEL(ashr_0_use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + jmp LABEL(ashr_0_use_sse4_2) + + + .p2align 4 +LABEL(ashr_0_use_sse4_2_exit): + jnc LABEL(strcmp_exitz) +#ifdef USE_AS_STRNCMP + sub %rcx, %r11 + jbe LABEL(strcmp_exitz) +#endif + lea -16(%rdx, %rcx), %rcx + movzbl (%rdi, %rcx), %eax + movzbl (%rsi, %rcx), %edx + sub %edx, %eax + ret + + + + +/* + * The following cases will be handled by ashr_1 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(15) n -15 0(15 +(n-15) - n) ashr_1 + */ + .p2align 4 +LABEL(ashr_1): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 /* Any null chars? */ + pslldq $15, %xmm2 /* shift first string to align with second */ + pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %r9d + shr %cl, %edx /* adjust 0xffff for offset */ + shr %cl, %r9d /* adjust for 16-byte offset */ + sub %r9d, %edx + jnz LABEL(less32bytes) /* mismatch or null char seen */ + movdqa (%rdi), %xmm3 + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads*/ + mov $1, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 1(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_1_use_sse4_2): + add $16, %r10 + jg LABEL(nibble_ashr_1_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $1, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_1_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $1, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_1_use_sse4_2) + + .p2align 4 +LABEL(nibble_ashr_1_use_sse4_2): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $1, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 +#ifdef USE_AS_STRNCMP + cmp %r11, %rcx + jae LABEL(nibble_ashr_use_sse4_2_exit) +#endif + cmp $14, %ecx + ja LABEL(loop_ashr_1_use_sse4_2) + + jmp LABEL(nibble_ashr_use_sse4_2_exit) + +/* + * The following cases will be handled by ashr_2 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(14~15) n -14 1(15 +(n-14) - n) ashr_2 + */ + .p2align 4 +LABEL(ashr_2): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $14, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $2, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 2(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_2_use_sse4_2): + add $16, %r10 + jg LABEL(nibble_ashr_2_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $2, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_2_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $2, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_2_use_sse4_2) + + .p2align 4 +LABEL(nibble_ashr_2_use_sse4_2): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $2, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 +#ifdef USE_AS_STRNCMP + cmp %r11, %rcx + jae LABEL(nibble_ashr_use_sse4_2_exit) +#endif + cmp $13, %ecx + ja LABEL(loop_ashr_2_use_sse4_2) + + jmp LABEL(nibble_ashr_use_sse4_2_exit) + +/* + * The following cases will be handled by ashr_3 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(13~15) n -13 2(15 +(n-13) - n) ashr_3 + */ + .p2align 4 +LABEL(ashr_3): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $13, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $3, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 3(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + +LABEL(loop_ashr_3_use_sse4_2): + add $16, %r10 + jg LABEL(nibble_ashr_3_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $3, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_3_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $3, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_3_use_sse4_2) + + .p2align 4 +LABEL(nibble_ashr_3_use_sse4_2): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $3, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 +#ifdef USE_AS_STRNCMP + cmp %r11, %rcx + jae LABEL(nibble_ashr_use_sse4_2_exit) +#endif + cmp $12, %ecx + ja LABEL(loop_ashr_3_use_sse4_2) + + jmp LABEL(nibble_ashr_use_sse4_2_exit) + +/* + * The following cases will be handled by ashr_4 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(12~15) n -12 3(15 +(n-12) - n) ashr_4 + */ + .p2align 4 +LABEL(ashr_4): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $12, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $4, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 4(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_4_use_sse4_2): + add $16, %r10 + jg LABEL(nibble_ashr_4_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $4, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_4_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $4, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_4_use_sse4_2) + + .p2align 4 +LABEL(nibble_ashr_4_use_sse4_2): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $4, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 +#ifdef USE_AS_STRNCMP + cmp %r11, %rcx + jae LABEL(nibble_ashr_use_sse4_2_exit) +#endif + cmp $11, %ecx + ja LABEL(loop_ashr_4_use_sse4_2) + + jmp LABEL(nibble_ashr_use_sse4_2_exit) + +/* + * The following cases will be handled by ashr_5 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5 + */ + .p2align 4 +LABEL(ashr_5): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $11, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $5, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 5(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_5_use_sse4_2): + add $16, %r10 + jg LABEL(nibble_ashr_5_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $5, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_5_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + + palignr $5, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_5_use_sse4_2) + + .p2align 4 +LABEL(nibble_ashr_5_use_sse4_2): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $5, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 +#ifdef USE_AS_STRNCMP + cmp %r11, %rcx + jae LABEL(nibble_ashr_use_sse4_2_exit) +#endif + cmp $10, %ecx + ja LABEL(loop_ashr_5_use_sse4_2) + + jmp LABEL(nibble_ashr_use_sse4_2_exit) + +/* + * The following cases will be handled by ashr_6 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6 + */ + .p2align 4 +LABEL(ashr_6): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $10, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $6, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 6(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_6_use_sse4_2): + add $16, %r10 + jg LABEL(nibble_ashr_6_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $6, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_6_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $6, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_6_use_sse4_2) + + .p2align 4 +LABEL(nibble_ashr_6_use_sse4_2): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $6, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 +#ifdef USE_AS_STRNCMP + cmp %r11, %rcx + jae LABEL(nibble_ashr_use_sse4_2_exit) +#endif + cmp $9, %ecx + ja LABEL(loop_ashr_6_use_sse4_2) + + jmp LABEL(nibble_ashr_use_sse4_2_exit) + +/* + * The following cases will be handled by ashr_7 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7 + */ + .p2align 4 +LABEL(ashr_7): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $9, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $7, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 7(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_7_use_sse4_2): + add $16, %r10 + jg LABEL(nibble_ashr_7_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $7, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_7_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $7, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_7_use_sse4_2) + + .p2align 4 +LABEL(nibble_ashr_7_use_sse4_2): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $7, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 +#ifdef USE_AS_STRNCMP + cmp %r11, %rcx + jae LABEL(nibble_ashr_use_sse4_2_exit) +#endif + cmp $8, %ecx + ja LABEL(loop_ashr_7_use_sse4_2) + + jmp LABEL(nibble_ashr_use_sse4_2_exit) + +/* + * The following cases will be handled by ashr_8 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8 + */ + .p2align 4 +LABEL(ashr_8): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $8, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $8, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 8(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_8_use_sse4_2): + add $16, %r10 + jg LABEL(nibble_ashr_8_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $8, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_8_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $8, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_8_use_sse4_2) + + .p2align 4 +LABEL(nibble_ashr_8_use_sse4_2): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $8, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 +#ifdef USE_AS_STRNCMP + cmp %r11, %rcx + jae LABEL(nibble_ashr_use_sse4_2_exit) +#endif + cmp $7, %ecx + ja LABEL(loop_ashr_8_use_sse4_2) + + jmp LABEL(nibble_ashr_use_sse4_2_exit) + +/* + * The following cases will be handled by ashr_9 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9 + */ + .p2align 4 +LABEL(ashr_9): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $7, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $9, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 9(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_9_use_sse4_2): + add $16, %r10 + jg LABEL(nibble_ashr_9_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + + palignr $9, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_9_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $9, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_9_use_sse4_2) + + .p2align 4 +LABEL(nibble_ashr_9_use_sse4_2): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $9, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 +#ifdef USE_AS_STRNCMP + cmp %r11, %rcx + jae LABEL(nibble_ashr_use_sse4_2_exit) +#endif + cmp $6, %ecx + ja LABEL(loop_ashr_9_use_sse4_2) + + jmp LABEL(nibble_ashr_use_sse4_2_exit) + +/* + * The following cases will be handled by ashr_10 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10 + */ + .p2align 4 +LABEL(ashr_10): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $6, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $10, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 10(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_10_use_sse4_2): + add $16, %r10 + jg LABEL(nibble_ashr_10_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $10, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_10_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $10, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_10_use_sse4_2) + + .p2align 4 +LABEL(nibble_ashr_10_use_sse4_2): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $10, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 +#ifdef USE_AS_STRNCMP + cmp %r11, %rcx + jae LABEL(nibble_ashr_use_sse4_2_exit) +#endif + cmp $5, %ecx + ja LABEL(loop_ashr_10_use_sse4_2) + + jmp LABEL(nibble_ashr_use_sse4_2_exit) + +/* + * The following cases will be handled by ashr_11 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11 + */ + .p2align 4 +LABEL(ashr_11): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $5, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $11, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 11(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_11_use_sse4_2): + add $16, %r10 + jg LABEL(nibble_ashr_11_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $11, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_11_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $11, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_11_use_sse4_2) + + .p2align 4 +LABEL(nibble_ashr_11_use_sse4_2): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $11, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 +#ifdef USE_AS_STRNCMP + cmp %r11, %rcx + jae LABEL(nibble_ashr_use_sse4_2_exit) +#endif + cmp $4, %ecx + ja LABEL(loop_ashr_11_use_sse4_2) + + jmp LABEL(nibble_ashr_use_sse4_2_exit) + +/* + * The following cases will be handled by ashr_12 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12 + */ + .p2align 4 +LABEL(ashr_12): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $4, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $12, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 12(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_12_use_sse4_2): + add $16, %r10 + jg LABEL(nibble_ashr_12_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $12, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_12_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $12, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_12_use_sse4_2) + + .p2align 4 +LABEL(nibble_ashr_12_use_sse4_2): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $12, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 +#ifdef USE_AS_STRNCMP + cmp %r11, %rcx + jae LABEL(nibble_ashr_use_sse4_2_exit) +#endif + cmp $3, %ecx + ja LABEL(loop_ashr_12_use_sse4_2) + + jmp LABEL(nibble_ashr_use_sse4_2_exit) + +/* + * The following cases will be handled by ashr_13 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13 + */ + .p2align 4 +LABEL(ashr_13): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $3, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $13, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 13(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_13_use_sse4_2): + add $16, %r10 + jg LABEL(nibble_ashr_13_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $13, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_13_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $13, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_13_use_sse4_2) + + .p2align 4 +LABEL(nibble_ashr_13_use_sse4_2): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $13, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 +#ifdef USE_AS_STRNCMP + cmp %r11, %rcx + jae LABEL(nibble_ashr_use_sse4_2_exit) +#endif + cmp $2, %ecx + ja LABEL(loop_ashr_13_use_sse4_2) + + jmp LABEL(nibble_ashr_use_sse4_2_exit) + +/* + * The following cases will be handled by ashr_14 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14 + */ + .p2align 4 +LABEL(ashr_14): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $2, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $14, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 14(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_14_use_sse4_2): + add $16, %r10 + jg LABEL(nibble_ashr_14_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $14, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_14_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $14, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_14_use_sse4_2) + + .p2align 4 +LABEL(nibble_ashr_14_use_sse4_2): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $14, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 +#ifdef USE_AS_STRNCMP + cmp %r11, %rcx + jae LABEL(nibble_ashr_use_sse4_2_exit) +#endif + cmp $1, %ecx + ja LABEL(loop_ashr_14_use_sse4_2) + + jmp LABEL(nibble_ashr_use_sse4_2_exit) + +/* + * The following cases will be handled by ashr_15 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15 + */ + .p2align 4 +LABEL(ashr_15): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $1, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $15, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 15(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + + sub $0x1000, %r10 /* subtract 4K pagesize */ + + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_15_use_sse4_2): + add $16, %r10 + jg LABEL(nibble_ashr_15_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $15, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_15_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $15, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_15_use_sse4_2) + + .p2align 4 +LABEL(nibble_ashr_15_use_sse4_2): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $15, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 +#ifdef USE_AS_STRNCMP + cmp %r11, %rcx + jae LABEL(nibble_ashr_use_sse4_2_exit) +#endif + cmp $0, %ecx + ja LABEL(loop_ashr_15_use_sse4_2) + +LABEL(nibble_ashr_use_sse4_2_exit): + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + .p2align 4 +LABEL(use_sse4_2_exit): + jnc LABEL(strcmp_exitz) +#ifdef USE_AS_STRNCMP + sub %rcx, %r11 + jbe LABEL(strcmp_exitz) +#endif + add %rcx, %rdx + lea -16(%rdi, %r9), %rdi + movzbl (%rdi, %rdx), %eax + movzbl (%rsi, %rdx), %edx + test %r8d, %r8d + jz LABEL(use_sse4_2_ret) + xchg %eax, %edx +LABEL(use_sse4_2_ret): + sub %edx, %eax + ret + + .p2align 4 +LABEL(aftertail): + pcmpeqb %xmm3, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + not %edx + + .p2align 4 +LABEL(exit): + lea -16(%r9, %rcx), %rax /* locate the exact offset for rdi */ +LABEL(less32bytes): + lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */ + lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */ + test %r8d, %r8d + jz LABEL(ret) + xchg %rsi, %rdi /* recover original order according to flag(%r8d) */ + + .p2align 4 +LABEL(ret): +LABEL(less16bytes): + /* + * Check to see if BSF is fast on this processor. If not, use a different + * exit tail. + */ + bsf %rdx, %rdx /* find and store bit index in %rdx */ + +#ifdef USE_AS_STRNCMP + sub %rdx, %r11 + jbe LABEL(strcmp_exitz) +#endif + xor %ecx, %ecx /* clear %ecx */ + xor %eax, %eax /* clear %eax */ + + movb (%rsi, %rdx), %cl + movb (%rdi, %rdx), %al + + sub %ecx, %eax + ret + +LABEL(strcmp_exitz): + xor %eax, %eax + ret + + .p2align 4 +LABEL(Byte0): + /* + * never need to handle byte 0 for strncmpy +#ifdef USE_AS_STRNCMP + sub $0, %r11 + jbe LABEL(strcmp_exitz) +#endif + */ + movzx (%rsi), %ecx + movzx (%rdi), %eax + + sub %ecx, %eax + ret + + .p2align 4 +LABEL(Byte1): + +#ifdef USE_AS_STRNCMP + sub $1, %r11 + jbe LABEL(strcmp_exitz) +#endif + movzx 1(%rsi), %ecx + movzx 1(%rdi), %eax + + sub %ecx, %eax + ret + + .p2align 4 +LABEL(Byte2): + +#ifdef USE_AS_STRNCMP + sub $2, %r11 + jbe LABEL(strcmp_exitz) +#endif + movzx 2(%rsi), %ecx + movzx 2(%rdi), %eax + + sub %ecx, %eax + ret + + .p2align 4 +LABEL(Byte3): + +#ifdef USE_AS_STRNCMP + sub $3, %r11 + jbe LABEL(strcmp_exitz) +#endif + movzx 3(%rsi), %ecx + movzx 3(%rdi), %eax + + sub %ecx, %eax + ret + + .p2align 4 +LABEL(Byte4): + +#ifdef USE_AS_STRNCMP + sub $4, %r11 + jbe LABEL(strcmp_exitz) +#endif + movzx 4(%rsi), %ecx + movzx 4(%rdi), %eax + + sub %ecx, %eax + ret + + .p2align 4 +LABEL(Byte5): + +#ifdef USE_AS_STRNCMP + sub $5, %r11 + jbe LABEL(strcmp_exitz) +#endif + movzx 5(%rsi), %ecx + movzx 5(%rdi), %eax + + sub %ecx, %eax + ret + + .p2align 4 +LABEL(Byte6): + +#ifdef USE_AS_STRNCMP + sub $6, %r11 + jbe LABEL(strcmp_exitz) +#endif + movzx 6(%rsi), %ecx + movzx 6(%rdi), %eax + + sub %ecx, %eax + ret + + .p2align 4 +LABEL(next_8_bytes): + add $8, %rdi + add $8, %rsi +#ifdef USE_AS_STRNCMP + sub $8, %r11 + jbe LABEL(strcmp_exitz) +#endif + test $0x01, %dh + jnz LABEL(Byte0) + + test $0x02, %dh + jnz LABEL(Byte1) + + test $0x04, %dh + jnz LABEL(Byte2) + + test $0x08, %dh + jnz LABEL(Byte3) + + test $0x10, %dh + jnz LABEL(Byte4) + + test $0x20, %dh + jnz LABEL(Byte5) + + test $0x40, %dh + jnz LABEL(Byte6) + +#ifdef USE_AS_STRNCMP + sub $7, %r11 + jbe LABEL(strcmp_exitz) +#endif + movzx 7(%rsi), %ecx + movzx 7(%rdi), %eax + + sub %ecx, %eax + ret + cfi_endproc + .size STRCMP_SSE42, .-STRCMP_SSE42 + + /* Put all SSE 4.2 functions together. */ + .section .rodata.sse4.2,"a",@progbits + .p2align 4 +LABEL(unaligned_table): + .int LABEL(ashr_1) - LABEL(unaligned_table) + .int LABEL(ashr_2) - LABEL(unaligned_table) + .int LABEL(ashr_3) - LABEL(unaligned_table) + .int LABEL(ashr_4) - LABEL(unaligned_table) + .int LABEL(ashr_5) - LABEL(unaligned_table) + .int LABEL(ashr_6) - LABEL(unaligned_table) + .int LABEL(ashr_7) - LABEL(unaligned_table) + .int LABEL(ashr_8) - LABEL(unaligned_table) + .int LABEL(ashr_9) - LABEL(unaligned_table) + .int LABEL(ashr_10) - LABEL(unaligned_table) + .int LABEL(ashr_11) - LABEL(unaligned_table) + .int LABEL(ashr_12) - LABEL(unaligned_table) + .int LABEL(ashr_13) - LABEL(unaligned_table) + .int LABEL(ashr_14) - LABEL(unaligned_table) + .int LABEL(ashr_15) - LABEL(unaligned_table) + .int LABEL(ashr_0) - LABEL(unaligned_table) + + +# undef ENTRY +# define ENTRY(name) \ + .type STRCMP_SSE2, @function; \ + STRCMP_SSE2: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size STRCMP_SSE2, .-STRCMP_SSE2 +# undef libc_hidden_builtin_def +/* It doesn't make sense to send libc-internal strcmp calls through a PLT. + The speedup we get from using SSE4.2 instruction is likely eaten away + by the indirect call in the PLT. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_STRCMP; __GI_STRCMP = STRCMP_SSE2 +#endif + +#ifndef USE_AS_STRNCMP +#include "../strcmp.S" +#endif diff --git a/sysdeps/x86_64/multiarch/strncmp-c.c b/sysdeps/x86_64/multiarch/strncmp-c.c new file mode 100644 index 0000000000..d4f74a418d --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncmp-c.c @@ -0,0 +1,8 @@ +#ifdef SHARED +#define STRNCMP __strncmp_sse2 +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(name) \ + __hidden_ver1 (__strncmp_sse2, __GI_strncmp, __strncmp_sse2); +#endif + +#include "strncmp.c" diff --git a/sysdeps/x86_64/multiarch/strncmp.S b/sysdeps/x86_64/multiarch/strncmp.S new file mode 100644 index 0000000000..0af34e7f15 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncmp.S @@ -0,0 +1,3 @@ +#define STRCMP strncmp +#define USE_AS_STRNCMP +#include "strcmp.S" -- cgit v1.2.3