From 0dabf204ef2efaff8fa01a8d5d0f17eb8c0db796 Mon Sep 17 00:00:00 2001 From: Andreas Schwab Date: Tue, 13 Apr 2010 10:23:22 -0700 Subject: Fix makecontext on s390/s390x --- sysdeps/unix/sysv/linux/s390/s390-32/makecontext.c | 64 +++++++++++----------- sysdeps/unix/sysv/linux/s390/s390-64/makecontext.c | 62 ++++++++++----------- 2 files changed, 63 insertions(+), 63 deletions(-) (limited to 'sysdeps') diff --git a/sysdeps/unix/sysv/linux/s390/s390-32/makecontext.c b/sysdeps/unix/sysv/linux/s390/s390-32/makecontext.c index 94760e0c2b..0e309c3e29 100644 --- a/sysdeps/unix/sysv/linux/s390/s390-32/makecontext.c +++ b/sysdeps/unix/sysv/linux/s390/s390-32/makecontext.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2001 Free Software Foundation, Inc. +/* Copyright (C) 2001, 2010 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Martin Schwidefsky (schwidefsky@de.ibm.com). @@ -28,15 +28,15 @@ double, complex and structure with sizes 0, 2, 4 or 8 won't work. makecontext sets up a stack and the registers for the - context. The stack looks like this: - size offset + user context. The stack looks like this: + size offset %r15 -> +-----------------------+ - 4 | back chain (zero) | 0 - 4 | reserved | 4 - 88 | save area for (*func) | 8 - +-----------------------+ - n | overflow parameters | 96 - +-----------------------+ + 4 | back chain (zero) | 0 + 4 | reserved | 4 + 88 | save area for (*func) | 8 + +-----------------------+ + n | overflow parameters | 96 + +-----------------------+ The registers are set up like this: %r2-%r6: parameters 1 to 5 %r7 : (*func) pointer @@ -54,27 +54,27 @@ void __makecontext (ucontext_t *ucp, void (*func) (void), int argc, ...) { extern void __makecontext_ret (void); - unsigned long *sp; + unsigned long int *sp; va_list ap; - int i; - sp = (unsigned long *) (((unsigned long) ucp->uc_stack.ss_sp - + ucp->uc_stack.ss_size) & -8L); + sp = (unsigned long int *) (((unsigned long int) ucp->uc_stack.ss_sp + + ucp->uc_stack.ss_size) & -8L); /* Set the return address to trampoline. */ - ucp->uc_mcontext.gregs[14] = (long) __makecontext_ret; + ucp->uc_mcontext.gregs[14] = (long int) __makecontext_ret; /* Set register parameters. */ va_start (ap, argc); - for (i = 0; (i < argc) && (i < 5); i++) - ucp->uc_mcontext.gregs[2+i] = va_arg (ap, long); + for (int i = 0; i < argc && i < 5; ++i) + ucp->uc_mcontext.gregs[2 + i] = va_arg (ap, long int); /* The remaining arguments go to the overflow area. */ - if (argc > 5) { - sp -= argc - 5; - for (i = 5; i < argc; i++) - sp[i] = va_arg(ap, long); - } + if (argc > 5) + { + sp -= argc - 5; + for (int i = 5; i < argc; ++i) + sp[i - 5] = va_arg (ap, long int); + } va_end (ap); /* Make room for the save area and set the backchain. */ @@ -82,24 +82,24 @@ __makecontext (ucontext_t *ucp, void (*func) (void), int argc, ...) *sp = 0; /* Pass (*func) to __start_context in %r7. */ - ucp->uc_mcontext.gregs[7] = (long) func; + ucp->uc_mcontext.gregs[7] = (long int) func; /* Pass ucp->uc_link to __start_context in %r8. */ - ucp->uc_mcontext.gregs[8] = (long) ucp->uc_link; + ucp->uc_mcontext.gregs[8] = (long int) ucp->uc_link; /* Pass address of setcontext in %r9. */ - ucp->uc_mcontext.gregs[9] = (long) &setcontext; + ucp->uc_mcontext.gregs[9] = (long int) &setcontext; /* Set stack pointer. 
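   (Editorial note: the registers loaded above implement the
   trampoline contract of __makecontext_ret, defined at the bottom of
   this file: basr through %r7 calls (*func), lr copies uc_link from
   %r8 into the argument register %r2, and br %r9 enters setcontext,
   so control passes to the successor context when func returns.)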
*/ - ucp->uc_mcontext.gregs[15] = (long) sp; + ucp->uc_mcontext.gregs[15] = (long int) sp; } -asm(".text\n" - ".type __makecontext_ret,@function\n" - "__makecontext_ret:\n" - " basr %r14,%r7\n" - " lr %r2,%r8\n" - " br %r9\n" - ".size __makecontext_ret, .-__makecontext_ret"); +asm (".text\n" + ".type __makecontext_ret,@function\n" + "__makecontext_ret:\n" + " basr %r14,%r7\n" + " lr %r2,%r8\n" + " br %r9\n" + ".size __makecontext_ret, .-__makecontext_ret"); weak_alias (__makecontext, makecontext) diff --git a/sysdeps/unix/sysv/linux/s390/s390-64/makecontext.c b/sysdeps/unix/sysv/linux/s390/s390-64/makecontext.c index b08f1b4047..40ff3eefb4 100644 --- a/sysdeps/unix/sysv/linux/s390/s390-64/makecontext.c +++ b/sysdeps/unix/sysv/linux/s390/s390-64/makecontext.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2001 Free Software Foundation, Inc. +/* Copyright (C) 2001, 2010 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Martin Schwidefsky (schwidefsky@de.ibm.com). @@ -29,14 +29,14 @@ won't work. makecontext sets up a stack and the registers for the user context. The stack looks like this: - size offset + size offset %r15 -> +-----------------------+ - 8 | back chain (zero) | 0 - 8 | reserved | 8 - 144 | save area for (*func) | 16 - +-----------------------+ - n | overflow parameters | 160 - +-----------------------+ + 8 | back chain (zero) | 0 + 8 | reserved | 8 + 144 | save area for (*func) | 16 + +-----------------------+ + n | overflow parameters | 160 + +-----------------------+ The registers are set up like this: %r2-%r6: parameters 1 to 5 %r7 : (*func) pointer @@ -54,27 +54,27 @@ void __makecontext (ucontext_t *ucp, void (*func) (void), int argc, ...) { extern void __makecontext_ret (void); - unsigned long *sp; + unsigned long int *sp; va_list ap; - int i; - sp = (unsigned long *) (((unsigned long) ucp->uc_stack.ss_sp - + ucp->uc_stack.ss_size) & -8L); + sp = (unsigned long int *) (((unsigned long int) ucp->uc_stack.ss_sp + + ucp->uc_stack.ss_size) & -8L); /* Set the return address to trampoline. */ - ucp->uc_mcontext.gregs[14] = (long) __makecontext_ret; + ucp->uc_mcontext.gregs[14] = (long int) __makecontext_ret; /* Set register parameters. */ va_start (ap, argc); - for (i = 0; (i < argc) && (i < 5); i++) - ucp->uc_mcontext.gregs[2+i] = va_arg (ap, long); + for (int i = 0; i < argc && i < 5; ++i) + ucp->uc_mcontext.gregs[2 + i] = va_arg (ap, long int); /* The remaining arguments go to the overflow area. */ - if (argc > 5) { - sp -= argc - 5; - for (i = 5; i < argc; i++) - sp[i] = va_arg(ap, long); - } + if (argc > 5) + { + sp -= argc - 5; + for (int i = 5; i < argc; ++i) + sp[i - 5] = va_arg (ap, long int); + } va_end (ap); /* Make room for the save area and set the backchain. */ @@ -82,24 +82,24 @@ __makecontext (ucontext_t *ucp, void (*func) (void), int argc, ...) *sp = 0; /* Pass (*func) to __start_context in %r7. */ - ucp->uc_mcontext.gregs[7] = (long) func; + ucp->uc_mcontext.gregs[7] = (long int) func; /* Pass ucp->uc_link to __start_context in %r8. */ - ucp->uc_mcontext.gregs[8] = (long) ucp->uc_link; + ucp->uc_mcontext.gregs[8] = (long int) ucp->uc_link; /* Pass address of setcontext in %r9. */ - ucp->uc_mcontext.gregs[9] = (long) &setcontext; + ucp->uc_mcontext.gregs[9] = (long int) &setcontext; /* Set stack pointer. 
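   (Editorial note on the fix common to this file and the 32-bit
   variant above: the old overflow-parameter loop reserved argc - 5
   slots but then stored through sp[i] for i = 5 .. argc - 1, so a
   hypothetical call such as

     makecontext (&ctx, (void (*) (void)) fn, 7, a1, a2, a3, a4, a5,
                  a6, a7);

   wrote a6 and a7 past the aligned top of the stack while leaving
   the reserved slots uninitialized.  The corrected loop stores them
   at sp[i - 5], i.e. at offsets 0 and 1 of the overflow area that
   (*func) actually reads.)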
*/ - ucp->uc_mcontext.gregs[15] = (long) sp; + ucp->uc_mcontext.gregs[15] = (long int) sp; } -asm(".text\n" - ".type __makecontext_ret,@function\n" - "__makecontext_ret:\n" - " basr %r14,%r7\n" - " lgr %r2,%r8\n" - " br %r9\n" - ".size __makecontext_ret, .-__makecontext_ret"); +asm (".text\n" + ".type __makecontext_ret,@function\n" + "__makecontext_ret:\n" + " basr %r14,%r7\n" + " lgr %r2,%r8\n" + " br %r9\n" + ".size __makecontext_ret, .-__makecontext_ret"); weak_alias (__makecontext, makecontext) -- cgit v1.2.3 From bbbdd7780916358c65e8bc98c989126a7db2f43e Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Tue, 13 Apr 2010 19:17:10 -0700 Subject: Update x86-64 cpu multiarch selection header. --- ChangeLog | 5 +++++ sysdeps/x86_64/multiarch/init-arch.h | 38 ++++++++++++++++++++---------------- 2 files changed, 26 insertions(+), 17 deletions(-) (limited to 'sysdeps') diff --git a/ChangeLog b/ChangeLog index 448eeec991..44debe81a8 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2010-04-13 Ulrich Drepper + + * sysdeps/x86_64/multiarch/init-arch.h: Pretty printing. + Add SSE 4.1 macros. + 2010-04-10 Matt Fleming * elf/elf.h: Add SH specific ELF header flags. diff --git a/sysdeps/x86_64/multiarch/init-arch.h b/sysdeps/x86_64/multiarch/init-arch.h index 5c73813404..b2f2de3796 100644 --- a/sysdeps/x86_64/multiarch/init-arch.h +++ b/sysdeps/x86_64/multiarch/init-arch.h @@ -20,21 +20,23 @@ #ifdef __ASSEMBLER__ -#include +# include -#define bit_SSE2 (1 << 26) -#define bit_SSSE3 (1 << 9) -#define bit_SSE4_2 (1 << 20) +# define bit_SSE2 (1 << 26) +# define bit_SSSE3 (1 << 9) +# define bit_SSE4_1 (1 << 19) +# define bit_SSE4_2 (1 << 20) -#define index_SSE2 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_EDX_OFFSET -#define index_SSSE3 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET -#define index_SSE4_2 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET +# define index_SSE2 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_EDX_OFFSET +# define index_SSSE3 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET +# define index_SSE4_1 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET +# define index_SSE4_2 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET #define index_Fast_Rep_String FEATURE_INDEX_1*FEATURE_SIZE #else /* __ASSEMBLER__ */ -#include +# include enum { @@ -84,20 +86,22 @@ extern void __init_cpu_features (void) attribute_hidden; extern const struct cpu_features *__get_cpu_features (void) __attribute__ ((const)); -#ifndef NOT_IN_libc -# define __get_cpu_features() (&__cpu_features) -#endif +# ifndef NOT_IN_libc +# define __get_cpu_features() (&__cpu_features) +# endif -#define HAS_CPU_FEATURE(idx, reg, bit) \ +# define HAS_CPU_FEATURE(idx, reg, bit) \ ((__get_cpu_features ()->cpuid[idx].reg & (1 << (bit))) != 0) /* Following are the feature tests used throughout libc. 
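   (Editorial note: each HAS_* macro here expands to a single bit
   test against the cached __cpu_features CPUID data, so the
   multiarch dispatchers in the commits that follow can select an
   implementation -- e.g. HAS_SSE4_1 ? fast_version : generic_version
   inside a resolver -- without re-executing CPUID; the names
   fast_version and generic_version are illustrative only.)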
*/ -#define HAS_SSE2 HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, edx, 26) -#define HAS_POPCOUNT HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 23) -#define HAS_SSE4_2 HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 20) -#define HAS_FMA HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 12) +# define HAS_SSE2 HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, edx, 26) +# define HAS_POPCOUNT HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 23) +# define HAS_SSSE3 HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 9) +# define HAS_SSE4_1 HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 19) +# define HAS_SSE4_2 HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 20) +# define HAS_FMA HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 12) -#define index_Fast_Rep_String FEATURE_INDEX_1 +# define index_Fast_Rep_String FEATURE_INDEX_1 #endif /* __ASSEMBLER__ */ -- cgit v1.2.3 From 404a6e32012cfef041709639e13d200ae0b2c0c6 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Wed, 14 Apr 2010 00:12:53 -0700 Subject: x86-64 SSE4 optimized memcmp This is 64bit SSE4 optimized memcmp. It improves memcmp by upto 3X on Intel Core i7. --- ChangeLog | 8 + sysdeps/x86_64/multiarch/Makefile | 2 +- sysdeps/x86_64/multiarch/memcmp-sse4.S | 1270 ++++++++++++++++++++++++++++++++ sysdeps/x86_64/multiarch/memcmp.S | 59 ++ sysdeps/x86_64/multiarch/rtld-memcmp.c | 1 + 5 files changed, 1339 insertions(+), 1 deletion(-) create mode 100644 sysdeps/x86_64/multiarch/memcmp-sse4.S create mode 100644 sysdeps/x86_64/multiarch/memcmp.S create mode 100644 sysdeps/x86_64/multiarch/rtld-memcmp.c (limited to 'sysdeps') diff --git a/ChangeLog b/ChangeLog index 44debe81a8..b3705f001a 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,11 @@ +2010-04-12 H.J. Lu + + * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add + memcmp-sse4. + * sysdeps/x86_64/multiarch/memcmp-sse4.S: New file. + * sysdeps/x86_64/multiarch/memcmp.S: New file. + * sysdeps/x86_64/multiarch/rtld-memcmp.c: New file. + 2010-04-13 Ulrich Drepper * sysdeps/x86_64/multiarch/init-arch.h: Pretty printing. diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 364e7bbbd2..c61cf70345 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -5,7 +5,7 @@ endif ifeq ($(subdir),string) sysdep_routines += stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \ - strend-sse4 + strend-sse4 memcmp-sse4 ifeq (yes,$(config-cflags-sse4)) sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c CFLAGS-strcspn-c.c += -msse4 diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S new file mode 100644 index 0000000000..886c8934d0 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S @@ -0,0 +1,1270 @@ +/* memcmp with SSE4.1 + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#ifndef NOT_IN_libc + +#include +#include "asm-syntax.h" + +#ifndef MEMCMP +# define MEMCMP __memcmp_sse4_1 +#endif + +#ifndef ALIGN +# define ALIGN(n) .p2align n +#endif + +#define JMPTBL(I, B) (I - B) + +#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + lea TABLE(%rip), %r11; \ + movslq (%r11, INDEX, SCALE), %rcx; \ + add %r11, %rcx; \ + jmp *%rcx; \ + ud2 + + .section .text.sse4.1,"ax",@progbits +ENTRY (MEMCMP) + pxor %xmm0, %xmm0 + cmp $79, %rdx + ja L(79bytesormore) + cmp $1, %rdx + je L(firstbyte) + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + + ALIGN (4) +L(firstbyte): + movzbl (%rdi), %eax + movzbl (%rsi), %ecx + sub %ecx, %eax + ret + + ALIGN (4) +L(79bytesormore): + movdqu (%rsi), %xmm1 + movdqu (%rdi), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + mov %rsi, %rcx + and $-16, %rsi + add $16, %rsi + sub %rsi, %rcx + + sub %rcx, %rdi + add %rcx, %rdx + + cmp $128, %rdx + ja L(128bytesormore) +L(less128bytes): + sub $64, %rdx + + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqu 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + + movdqu 32(%rdi), %xmm2 + pxor 32(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(48bytesin256) + + movdqu 48(%rdi), %xmm2 + pxor 48(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(64bytesin256) + cmp $32, %rdx + jb L(less32bytesin64) + + movdqu 64(%rdi), %xmm2 + pxor 64(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(80bytesin256) + + movdqu 80(%rdi), %xmm2 + pxor 80(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(96bytesin256) + sub $32, %rdx + add $32, %rdi + add $32, %rsi +L(less32bytesin64): + add $64, %rdi + add $64, %rsi + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + +L(128bytesormore): + cmp $256, %rdx + ja L(256bytesormore) +L(less256bytes): + sub $128, %rdx + + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqu 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + + movdqu 32(%rdi), %xmm2 + pxor 32(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(48bytesin256) + + movdqu 48(%rdi), %xmm2 + pxor 48(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(64bytesin256) + + movdqu 64(%rdi), %xmm2 + pxor 64(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(80bytesin256) + + movdqu 80(%rdi), %xmm2 + pxor 80(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(96bytesin256) + + movdqu 96(%rdi), %xmm2 + pxor 96(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(112bytesin256) + + movdqu 112(%rdi), %xmm2 + pxor 112(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(128bytesin256) + + add $128, %rsi + add $128, %rdi + + cmp $64, %rdx + jae L(less128bytes) + + cmp $32, %rdx + jb L(less32bytesin128) + + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqu 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + sub $32, %rdx + add $32, %rdi + add $32, %rsi +L(less32bytesin128): + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + +L(256bytesormore): + cmp $512, %rdx + ja L(512bytesormore) +L(less512bytes): + sub $256, %rdx + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqu 16(%rdi), %xmm2 + pxor 16(%rsi), 
%xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + + movdqu 32(%rdi), %xmm2 + pxor 32(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(48bytesin256) + + movdqu 48(%rdi), %xmm2 + pxor 48(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(64bytesin256) + + movdqu 64(%rdi), %xmm2 + pxor 64(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(80bytesin256) + + movdqu 80(%rdi), %xmm2 + pxor 80(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(96bytesin256) + + movdqu 96(%rdi), %xmm2 + pxor 96(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(112bytesin256) + + movdqu 112(%rdi), %xmm2 + pxor 112(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(128bytesin256) + + movdqu 128(%rdi), %xmm2 + pxor 128(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(144bytesin256) + + movdqu 144(%rdi), %xmm2 + pxor 144(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(160bytesin256) + + movdqu 160(%rdi), %xmm2 + pxor 160(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(176bytesin256) + + movdqu 176(%rdi), %xmm2 + pxor 176(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(192bytesin256) + + movdqu 192(%rdi), %xmm2 + pxor 192(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(208bytesin256) + + movdqu 208(%rdi), %xmm2 + pxor 208(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(224bytesin256) + + movdqu 224(%rdi), %xmm2 + pxor 224(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(240bytesin256) + + movdqu 240(%rdi), %xmm2 + pxor 240(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(256bytesin256) + + add $256, %rsi + add $256, %rdi + + cmp $128, %rdx + jae L(less256bytes) + + cmp $64, %rdx + jae L(less128bytes) + + cmp $32, %rdx + jb L(less32bytesin256) + + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqu 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + sub $32, %rdx + add $32, %rdi + add $32, %rsi +L(less32bytesin256): + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + + ALIGN (4) +L(512bytesormore): + sub $64, %rdx +L(64bytesormore_loop): + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + movdqa %xmm2, %xmm1 + + movdqu 16(%rdi), %xmm3 + pxor 16(%rsi), %xmm3 + por %xmm3, %xmm1 + + movdqu 32(%rdi), %xmm4 + pxor 32(%rsi), %xmm4 + por %xmm4, %xmm1 + + movdqu 48(%rdi), %xmm5 + pxor 48(%rsi), %xmm5 + por %xmm5, %xmm1 + + ptest %xmm1, %xmm0 + jnc L(64bytesormore_loop_end) + add $64, %rsi + add $64, %rdi + sub $64, %rdx + jae L(64bytesormore_loop) + + add $64, %rdx + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + + ALIGN (4) +L(64bytesormore_loop_end): + add $16, %rdi + add $16, %rsi + ptest %xmm2, %xmm0 + jnc L(16bytes) + + add $16, %rdi + add $16, %rsi + ptest %xmm3, %xmm0 + jnc L(16bytes) + + add $16, %rdi + add $16, %rsi + ptest %xmm4, %xmm0 + jnc L(16bytes) + + add $16, %rdi + add $16, %rsi + jmp L(16bytes) + +L(256bytesin256): + add $256, %rdi + add $256, %rsi + jmp L(16bytes) +L(240bytesin256): + add $240, %rdi + add $240, %rsi + jmp L(16bytes) +L(224bytesin256): + add $224, %rdi + add $224, %rsi + jmp L(16bytes) +L(208bytesin256): + add $208, %rdi + add $208, %rsi + jmp L(16bytes) +L(192bytesin256): + add $192, %rdi + add $192, %rsi + jmp L(16bytes) +L(176bytesin256): + add $176, %rdi + add $176, %rsi + jmp L(16bytes) +L(160bytesin256): + add $160, %rdi + add $160, %rsi + jmp L(16bytes) +L(144bytesin256): + add $144, %rdi + add $144, %rsi + jmp L(16bytes) +L(128bytesin256): + add $128, %rdi + add $128, %rsi + jmp L(16bytes) +L(112bytesin256): + add $112, %rdi + add $112, %rsi + jmp L(16bytes) +L(96bytesin256): + add $96, %rdi + add $96, %rsi + jmp L(16bytes) +L(80bytesin256): + 
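/* Editorial note: the L(<N>bytesin256) stubs in this block are the
   mismatch targets of the jnc branches above.  Each one advances
   both pointers just past the 16-byte chunk that failed its ptest
   and reaches L(16bytes), which reloads those trailing bytes as two
   8-byte words to compute the signed result.  */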
add $80, %rdi + add $80, %rsi + jmp L(16bytes) +L(64bytesin256): + add $64, %rdi + add $64, %rsi + jmp L(16bytes) +L(48bytesin256): + add $16, %rdi + add $16, %rsi +L(32bytesin256): + add $16, %rdi + add $16, %rsi +L(16bytesin256): + add $16, %rdi + add $16, %rsi +L(16bytes): + mov -16(%rdi), %rax + mov -16(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) +L(8bytes): + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(12bytes): + mov -12(%rdi), %rax + mov -12(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) +L(4bytes): + mov -4(%rsi), %ecx + mov -4(%rdi), %eax + cmp %eax, %ecx + jne L(diffin4bytes) +L(0bytes): + xor %eax, %eax + ret + + ALIGN (4) +L(65bytes): + movdqu -65(%rdi), %xmm1 + movdqu -65(%rsi), %xmm2 + mov $-65, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(49bytes): + movdqu -49(%rdi), %xmm1 + movdqu -49(%rsi), %xmm2 + mov $-49, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(33bytes): + movdqu -33(%rdi), %xmm1 + movdqu -33(%rsi), %xmm2 + mov $-33, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(17bytes): + mov -17(%rdi), %rax + mov -17(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) +L(9bytes): + mov -9(%rdi), %rax + mov -9(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + movzbl -1(%rdi), %eax + movzbl -1(%rsi), %edx + sub %edx, %eax + ret + + ALIGN (4) +L(13bytes): + mov -13(%rdi), %rax + mov -13(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(5bytes): + mov -5(%rdi), %eax + mov -5(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin4bytes) + movzbl -1(%rdi), %eax + movzbl -1(%rsi), %edx + sub %edx, %eax + ret + + ALIGN (4) +L(66bytes): + movdqu -66(%rdi), %xmm1 + movdqu -66(%rsi), %xmm2 + mov $-66, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(50bytes): + movdqu -50(%rdi), %xmm1 + movdqu -50(%rsi), %xmm2 + mov $-50, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(34bytes): + movdqu -34(%rdi), %xmm1 + movdqu -34(%rsi), %xmm2 + mov $-34, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(18bytes): + mov -18(%rdi), %rax + mov -18(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) +L(10bytes): + mov -10(%rdi), %rax + mov -10(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + movzwl -2(%rdi), %eax + movzwl -2(%rsi), %ecx + cmp %cl, %al + jne L(end) + and $0xffff, %eax + and $0xffff, %ecx + sub %ecx, %eax + ret + + ALIGN (4) +L(14bytes): + mov -14(%rdi), %rax + mov -14(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(6bytes): + mov -6(%rdi), %eax + mov -6(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin4bytes) +L(2bytes): + movzwl -2(%rsi), %ecx + movzwl -2(%rdi), %eax + cmp %cl, %al + jne L(end) + and $0xffff, %eax + and $0xffff, %ecx + sub %ecx, %eax + ret + + ALIGN (4) +L(67bytes): + movdqu -67(%rdi), %xmm2 + movdqu -67(%rsi), %xmm1 + mov $-67, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(51bytes): + movdqu -51(%rdi), %xmm2 + movdqu -51(%rsi), %xmm1 + mov $-51, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(35bytes): + movdqu -35(%rsi), %xmm1 + movdqu -35(%rdi), %xmm2 + mov $-35, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(19bytes): + mov -19(%rdi), %rax + mov -19(%rsi), %rcx 
+ cmp %rax, %rcx + jne L(diffin8bytes) +L(11bytes): + mov -11(%rdi), %rax + mov -11(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + mov -4(%rdi), %eax + mov -4(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin4bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(15bytes): + mov -15(%rdi), %rax + mov -15(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(7bytes): + mov -7(%rdi), %eax + mov -7(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin4bytes) + mov -4(%rdi), %eax + mov -4(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin4bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(3bytes): + movzwl -3(%rdi), %eax + movzwl -3(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin2bytes) +L(1bytes): + movzbl -1(%rdi), %eax + movzbl -1(%rsi), %ecx + sub %ecx, %eax + ret + + ALIGN (4) +L(68bytes): + movdqu -68(%rdi), %xmm2 + movdqu -68(%rsi), %xmm1 + mov $-68, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(52bytes): + movdqu -52(%rdi), %xmm2 + movdqu -52(%rsi), %xmm1 + mov $-52, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(36bytes): + movdqu -36(%rdi), %xmm2 + movdqu -36(%rsi), %xmm1 + mov $-36, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(20bytes): + movdqu -20(%rdi), %xmm2 + movdqu -20(%rsi), %xmm1 + mov $-20, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -4(%rdi), %eax + mov -4(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin4bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(69bytes): + movdqu -69(%rsi), %xmm1 + movdqu -69(%rdi), %xmm2 + mov $-69, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(53bytes): + movdqu -53(%rsi), %xmm1 + movdqu -53(%rdi), %xmm2 + mov $-53, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(37bytes): + movdqu -37(%rsi), %xmm1 + movdqu -37(%rdi), %xmm2 + mov $-37, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(21bytes): + movdqu -21(%rsi), %xmm1 + movdqu -21(%rdi), %xmm2 + mov $-21, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(70bytes): + movdqu -70(%rsi), %xmm1 + movdqu -70(%rdi), %xmm2 + mov $-70, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(54bytes): + movdqu -54(%rsi), %xmm1 + movdqu -54(%rdi), %xmm2 + mov $-54, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(38bytes): + movdqu -38(%rsi), %xmm1 + movdqu -38(%rdi), %xmm2 + mov $-38, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(22bytes): + movdqu -22(%rsi), %xmm1 + movdqu -22(%rdi), %xmm2 + mov $-22, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(71bytes): + movdqu -71(%rsi), %xmm1 + movdqu -71(%rdi), %xmm2 + mov $-71, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(55bytes): + movdqu -55(%rdi), %xmm2 + movdqu -55(%rsi), %xmm1 + mov $-55, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(39bytes): + movdqu -39(%rdi), %xmm2 + movdqu -39(%rsi), %xmm1 + mov $-39, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(23bytes): + movdqu -23(%rdi), %xmm2 + movdqu -23(%rsi), %xmm1 + mov $-23, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -8(%rdi), %rax + mov -8(%rsi), 
%rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(72bytes): + movdqu -72(%rsi), %xmm1 + movdqu -72(%rdi), %xmm2 + mov $-72, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(56bytes): + movdqu -56(%rdi), %xmm2 + movdqu -56(%rsi), %xmm1 + mov $-56, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(40bytes): + movdqu -40(%rdi), %xmm2 + movdqu -40(%rsi), %xmm1 + mov $-40, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(24bytes): + movdqu -24(%rdi), %xmm2 + movdqu -24(%rsi), %xmm1 + mov $-24, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(73bytes): + movdqu -73(%rsi), %xmm1 + movdqu -73(%rdi), %xmm2 + mov $-73, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(57bytes): + movdqu -57(%rdi), %xmm2 + movdqu -57(%rsi), %xmm1 + mov $-57, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(41bytes): + movdqu -41(%rdi), %xmm2 + movdqu -41(%rsi), %xmm1 + mov $-41, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(25bytes): + movdqu -25(%rdi), %xmm2 + movdqu -25(%rsi), %xmm1 + mov $-25, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -9(%rdi), %rax + mov -9(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + movzbl -1(%rdi), %eax + movzbl -1(%rsi), %ecx + sub %ecx, %eax + ret + + ALIGN (4) +L(74bytes): + movdqu -74(%rsi), %xmm1 + movdqu -74(%rdi), %xmm2 + mov $-74, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(58bytes): + movdqu -58(%rdi), %xmm2 + movdqu -58(%rsi), %xmm1 + mov $-58, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(42bytes): + movdqu -42(%rdi), %xmm2 + movdqu -42(%rsi), %xmm1 + mov $-42, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(26bytes): + movdqu -26(%rdi), %xmm2 + movdqu -26(%rsi), %xmm1 + mov $-26, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -10(%rdi), %rax + mov -10(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + movzwl -2(%rdi), %eax + movzwl -2(%rsi), %ecx + jmp L(diffin2bytes) + + ALIGN (4) +L(75bytes): + movdqu -75(%rsi), %xmm1 + movdqu -75(%rdi), %xmm2 + mov $-75, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(59bytes): + movdqu -59(%rdi), %xmm2 + movdqu -59(%rsi), %xmm1 + mov $-59, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(43bytes): + movdqu -43(%rdi), %xmm2 + movdqu -43(%rsi), %xmm1 + mov $-43, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(27bytes): + movdqu -27(%rdi), %xmm2 + movdqu -27(%rsi), %xmm1 + mov $-27, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -11(%rdi), %rax + mov -11(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + mov -4(%rdi), %eax + mov -4(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin4bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(76bytes): + movdqu -76(%rsi), %xmm1 + movdqu -76(%rdi), %xmm2 + mov $-76, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(60bytes): + movdqu -60(%rdi), %xmm2 + movdqu -60(%rsi), %xmm1 + mov $-60, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(44bytes): + movdqu -44(%rdi), %xmm2 + movdqu -44(%rsi), %xmm1 + mov $-44, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(28bytes): + movdqu -28(%rdi), %xmm2 + movdqu -28(%rsi), %xmm1 + mov $-28, %dl + pxor %xmm1, %xmm2 + 
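/* Editorial note on the comparison idiom used throughout this file:
   pxor leaves the byte-wise XOR of two 16-byte chunks in %xmm2, and
   since %xmm0 is kept all-zero, ptest sets CF exactly when that XOR
   is zero, i.e. when the chunks are equal -- so jnc branches only
   when a mismatch has to be located.  */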
ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -12(%rdi), %rax + mov -12(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + mov -4(%rdi), %eax + mov -4(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin4bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(77bytes): + movdqu -77(%rsi), %xmm1 + movdqu -77(%rdi), %xmm2 + mov $-77, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(61bytes): + movdqu -61(%rdi), %xmm2 + movdqu -61(%rsi), %xmm1 + mov $-61, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(45bytes): + movdqu -45(%rdi), %xmm2 + movdqu -45(%rsi), %xmm1 + mov $-45, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(29bytes): + movdqu -29(%rdi), %xmm2 + movdqu -29(%rsi), %xmm1 + mov $-29, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -13(%rdi), %rax + mov -13(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(78bytes): + movdqu -78(%rsi), %xmm1 + movdqu -78(%rdi), %xmm2 + mov $-78, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(62bytes): + movdqu -62(%rdi), %xmm2 + movdqu -62(%rsi), %xmm1 + mov $-62, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(46bytes): + movdqu -46(%rdi), %xmm2 + movdqu -46(%rsi), %xmm1 + mov $-46, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(30bytes): + movdqu -30(%rdi), %xmm2 + movdqu -30(%rsi), %xmm1 + mov $-30, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -14(%rdi), %rax + mov -14(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(79bytes): + movdqu -79(%rsi), %xmm1 + movdqu -79(%rdi), %xmm2 + mov $-79, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(63bytes): + movdqu -63(%rdi), %xmm2 + movdqu -63(%rsi), %xmm1 + mov $-63, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(47bytes): + movdqu -47(%rdi), %xmm2 + movdqu -47(%rsi), %xmm1 + mov $-47, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(31bytes): + movdqu -31(%rdi), %xmm2 + movdqu -31(%rsi), %xmm1 + mov $-31, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -15(%rdi), %rax + mov -15(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(64bytes): + movdqu -64(%rdi), %xmm2 + movdqu -64(%rsi), %xmm1 + mov $-64, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(48bytes): + movdqu -48(%rdi), %xmm2 + movdqu -48(%rsi), %xmm1 + mov $-48, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(32bytes): + movdqu -32(%rdi), %xmm2 + movdqu -32(%rsi), %xmm1 + mov $-32, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -16(%rdi), %rax + mov -16(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(less16bytes): + movsbq %dl, %rdx + mov (%rsi, %rdx), %rcx + mov (%rdi, %rdx), %rax + cmp %rax, %rcx + jne L(diffin8bytes) + mov 8(%rsi, %rdx), %rcx + mov 8(%rdi, %rdx), %rax +L(diffin8bytes): + cmp %eax, %ecx + jne L(diffin4bytes) + shr $32, %rcx + shr $32, %rax +L(diffin4bytes): + cmp %cx, %ax + jne L(diffin2bytes) + shr $16, %ecx + shr $16, %eax + 
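/* Editorial note: the L(diffin{8,4,2}bytes) cascade above and below
   narrows in on the first differing byte: if the low halves of the
   two loaded words differ, it keeps narrowing inside them; otherwise
   it shifts the high halves down and continues.  Because x86-64 is
   little-endian, the low half holds the bytes that come first in
   memory, so the byte that finally decides the sign of the result is
   the first mismatching byte, as memcmp requires.  */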
xor %rdx, %rdx +L(diffin2bytes): + cmp %cl, %al + jne L(end) + and $0xffff, %eax + and $0xffff, %ecx + sub %ecx, %eax + ret + + ALIGN (4) +L(end): + and $0xff, %eax + and $0xff, %ecx + sub %ecx, %eax + ret + +END (MEMCMP) + + .section .rodata.sse4.1,"a",@progbits + ALIGN (3) +L(table_64bytes): + .int JMPTBL (L(0bytes), L(table_64bytes)) + .int JMPTBL (L(1bytes), L(table_64bytes)) + .int JMPTBL (L(2bytes), L(table_64bytes)) + .int JMPTBL (L(3bytes), L(table_64bytes)) + .int JMPTBL (L(4bytes), L(table_64bytes)) + .int JMPTBL (L(5bytes), L(table_64bytes)) + .int JMPTBL (L(6bytes), L(table_64bytes)) + .int JMPTBL (L(7bytes), L(table_64bytes)) + .int JMPTBL (L(8bytes), L(table_64bytes)) + .int JMPTBL (L(9bytes), L(table_64bytes)) + .int JMPTBL (L(10bytes), L(table_64bytes)) + .int JMPTBL (L(11bytes), L(table_64bytes)) + .int JMPTBL (L(12bytes), L(table_64bytes)) + .int JMPTBL (L(13bytes), L(table_64bytes)) + .int JMPTBL (L(14bytes), L(table_64bytes)) + .int JMPTBL (L(15bytes), L(table_64bytes)) + .int JMPTBL (L(16bytes), L(table_64bytes)) + .int JMPTBL (L(17bytes), L(table_64bytes)) + .int JMPTBL (L(18bytes), L(table_64bytes)) + .int JMPTBL (L(19bytes), L(table_64bytes)) + .int JMPTBL (L(20bytes), L(table_64bytes)) + .int JMPTBL (L(21bytes), L(table_64bytes)) + .int JMPTBL (L(22bytes), L(table_64bytes)) + .int JMPTBL (L(23bytes), L(table_64bytes)) + .int JMPTBL (L(24bytes), L(table_64bytes)) + .int JMPTBL (L(25bytes), L(table_64bytes)) + .int JMPTBL (L(26bytes), L(table_64bytes)) + .int JMPTBL (L(27bytes), L(table_64bytes)) + .int JMPTBL (L(28bytes), L(table_64bytes)) + .int JMPTBL (L(29bytes), L(table_64bytes)) + .int JMPTBL (L(30bytes), L(table_64bytes)) + .int JMPTBL (L(31bytes), L(table_64bytes)) + .int JMPTBL (L(32bytes), L(table_64bytes)) + .int JMPTBL (L(33bytes), L(table_64bytes)) + .int JMPTBL (L(34bytes), L(table_64bytes)) + .int JMPTBL (L(35bytes), L(table_64bytes)) + .int JMPTBL (L(36bytes), L(table_64bytes)) + .int JMPTBL (L(37bytes), L(table_64bytes)) + .int JMPTBL (L(38bytes), L(table_64bytes)) + .int JMPTBL (L(39bytes), L(table_64bytes)) + .int JMPTBL (L(40bytes), L(table_64bytes)) + .int JMPTBL (L(41bytes), L(table_64bytes)) + .int JMPTBL (L(42bytes), L(table_64bytes)) + .int JMPTBL (L(43bytes), L(table_64bytes)) + .int JMPTBL (L(44bytes), L(table_64bytes)) + .int JMPTBL (L(45bytes), L(table_64bytes)) + .int JMPTBL (L(46bytes), L(table_64bytes)) + .int JMPTBL (L(47bytes), L(table_64bytes)) + .int JMPTBL (L(48bytes), L(table_64bytes)) + .int JMPTBL (L(49bytes), L(table_64bytes)) + .int JMPTBL (L(50bytes), L(table_64bytes)) + .int JMPTBL (L(51bytes), L(table_64bytes)) + .int JMPTBL (L(52bytes), L(table_64bytes)) + .int JMPTBL (L(53bytes), L(table_64bytes)) + .int JMPTBL (L(54bytes), L(table_64bytes)) + .int JMPTBL (L(55bytes), L(table_64bytes)) + .int JMPTBL (L(56bytes), L(table_64bytes)) + .int JMPTBL (L(57bytes), L(table_64bytes)) + .int JMPTBL (L(58bytes), L(table_64bytes)) + .int JMPTBL (L(59bytes), L(table_64bytes)) + .int JMPTBL (L(60bytes), L(table_64bytes)) + .int JMPTBL (L(61bytes), L(table_64bytes)) + .int JMPTBL (L(62bytes), L(table_64bytes)) + .int JMPTBL (L(63bytes), L(table_64bytes)) + .int JMPTBL (L(64bytes), L(table_64bytes)) + .int JMPTBL (L(65bytes), L(table_64bytes)) + .int JMPTBL (L(66bytes), L(table_64bytes)) + .int JMPTBL (L(67bytes), L(table_64bytes)) + .int JMPTBL (L(68bytes), L(table_64bytes)) + .int JMPTBL (L(69bytes), L(table_64bytes)) + .int JMPTBL (L(70bytes), L(table_64bytes)) + .int JMPTBL (L(71bytes), L(table_64bytes)) + .int JMPTBL (L(72bytes), 
L(table_64bytes)) + .int JMPTBL (L(73bytes), L(table_64bytes)) + .int JMPTBL (L(74bytes), L(table_64bytes)) + .int JMPTBL (L(75bytes), L(table_64bytes)) + .int JMPTBL (L(76bytes), L(table_64bytes)) + .int JMPTBL (L(77bytes), L(table_64bytes)) + .int JMPTBL (L(78bytes), L(table_64bytes)) + .int JMPTBL (L(79bytes), L(table_64bytes)) +#endif diff --git a/sysdeps/x86_64/multiarch/memcmp.S b/sysdeps/x86_64/multiarch/memcmp.S new file mode 100644 index 0000000000..301ab287f5 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memcmp.S @@ -0,0 +1,59 @@ +/* Multiple versions of memcmp + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include +#include + +/* Define multiple versions only for the definition in libc. */ +#ifndef NOT_IN_libc + .text +ENTRY(memcmp) + .type memcmp, @gnu_indirect_function + cmpl $0, KIND_OFFSET+__cpu_features(%rip) + jne 1f + call __init_cpu_features +1: leaq __memcmp_sse2(%rip), %rax + testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip) + jz 2f + leaq __memcmp_sse4_1(%rip), %rax +2: ret +END(memcmp) + +# undef ENTRY +# define ENTRY(name) \ + .type __memcmp_sse2, @function; \ + .p2align 4; \ + __memcmp_sse2: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __memcmp_sse2, .-__memcmp_sse2 + +# ifdef SHARED +# undef libc_hidden_builtin_def +/* IFUNC doesn't work with the hidden functions in shared library since + they will be called without setting up EBX needed for PLT which is + used by IFUNC. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_memcmp; __GI_memcmp = __memcmp_sse2 +# endif +#endif + +#include "../memcmp.S" diff --git a/sysdeps/x86_64/multiarch/rtld-memcmp.c b/sysdeps/x86_64/multiarch/rtld-memcmp.c new file mode 100644 index 0000000000..0f271356c2 --- /dev/null +++ b/sysdeps/x86_64/multiarch/rtld-memcmp.c @@ -0,0 +1 @@ +#include "../rtld-memcmp.c" -- cgit v1.2.3 From dd37cd1a1243fad34d67ea54ebc36fc412da6e88 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Wed, 14 Apr 2010 17:53:44 -0700 Subject: Optimie x86-64 SSE4 memcmp for unaligned data. --- ChangeLog | 5 + malloc/malloc.c | 4 + sysdeps/x86_64/multiarch/memcmp-sse4.S | 377 ++++++++++++++++++++++++++++++++- 3 files changed, 380 insertions(+), 6 deletions(-) (limited to 'sysdeps') diff --git a/ChangeLog b/ChangeLog index b3705f001a..0196c24be8 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2010-04-14 H.J. Lu + + * sysdeps/x86_64/multiarch/memcmp-sse4.S: Optimized for unaligned + data. + 2010-04-12 H.J. 
Lu * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add diff --git a/malloc/malloc.c b/malloc/malloc.c index 722b1d4961..b067b65bdc 100644 --- a/malloc/malloc.c +++ b/malloc/malloc.c @@ -3168,6 +3168,10 @@ static Void_t* sYSMALLOc(nb, av) INTERNAL_SIZE_T nb; mstate av; size = nb + mp_.top_pad + MINSIZE; +#define TWOM (2*1024*1024) + char *cur = (char*)MORECORE(0); + size = (char*)((size_t)(cur + size + TWOM - 1)&~(TWOM-1))-cur; + /* If contiguous, we can subtract out existing space that we hope to combine with new space. We add it back later only if diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S index 886c8934d0..25dba864f8 100644 --- a/sysdeps/x86_64/multiarch/memcmp-sse4.S +++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S @@ -72,6 +72,8 @@ L(79bytesormore): sub %rcx, %rdi add %rcx, %rdx + test $0xf, %rdi + jz L(2aligned) cmp $128, %rdx ja L(128bytesormore) @@ -120,8 +122,10 @@ L(less32bytesin64): BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) L(128bytesormore): + cmp $512, %rdx + ja L(512bytesormore) cmp $256, %rdx - ja L(256bytesormore) + ja L(less512bytes) L(less256bytes): sub $128, %rdx @@ -191,9 +195,6 @@ L(less32bytesin128): add %rdx, %rdi BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) -L(256bytesormore): - cmp $512, %rdx - ja L(512bytesormore) L(less512bytes): sub $256, %rdx movdqu (%rdi), %xmm2 @@ -307,7 +308,18 @@ L(less32bytesin256): ALIGN (4) L(512bytesormore): +#ifdef DATA_CACHE_SIZE_HALF + mov $DATA_CACHE_SIZE_HALF, %r8 +#else + mov __x86_64_data_cache_size_half(%rip), %r8 +#endif + mov %r8, %r9 + shr $1, %r8 + add %r9, %r8 + cmp %r8, %rdx + ja L(L2_L3_cache_unaglined) sub $64, %rdx + ALIGN (4) L(64bytesormore_loop): movdqu (%rdi), %xmm2 pxor (%rsi), %xmm2 @@ -337,6 +349,357 @@ L(64bytesormore_loop): add %rdx, %rdi BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) +L(L2_L3_cache_unaglined): + sub $64, %rdx + ALIGN (4) +L(L2_L3_unaligned_128bytes_loop): + prefetchnta 0x1c0(%rdi) + prefetchnta 0x1c0(%rsi) + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + movdqa %xmm2, %xmm1 + + movdqu 16(%rdi), %xmm3 + pxor 16(%rsi), %xmm3 + por %xmm3, %xmm1 + + movdqu 32(%rdi), %xmm4 + pxor 32(%rsi), %xmm4 + por %xmm4, %xmm1 + + movdqu 48(%rdi), %xmm5 + pxor 48(%rsi), %xmm5 + por %xmm5, %xmm1 + + ptest %xmm1, %xmm0 + jnc L(64bytesormore_loop_end) + add $64, %rsi + add $64, %rdi + sub $64, %rdx + jae L(L2_L3_unaligned_128bytes_loop) + + add $64, %rdx + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + +/* + * This case is for machines which are sensitive for unaligned instructions. 
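 * (Editorial note: this aligned path is entered when, after %rsi has
 * been rounded up to a 16-byte boundary, %rdi turns out to be
 * 16-byte aligned as well, so both inputs can be read with aligned
 * movdqa loads in place of the movdqu used on the path above.)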
+ */ + ALIGN (4) +L(2aligned): + cmp $128, %rdx + ja L(128bytesormorein2aligned) +L(less128bytesin2aligned): + sub $64, %rdx + + movdqa (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqa 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + + movdqa 32(%rdi), %xmm2 + pxor 32(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(48bytesin256) + + movdqa 48(%rdi), %xmm2 + pxor 48(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(64bytesin256) + cmp $32, %rdx + jb L(less32bytesin64in2alinged) + + movdqa 64(%rdi), %xmm2 + pxor 64(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(80bytesin256) + + movdqa 80(%rdi), %xmm2 + pxor 80(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(96bytesin256) + sub $32, %rdx + add $32, %rdi + add $32, %rsi +L(less32bytesin64in2alinged): + add $64, %rdi + add $64, %rsi + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + + ALIGN (4) +L(128bytesormorein2aligned): + cmp $512, %rdx + ja L(512bytesormorein2aligned) + cmp $256, %rdx + ja L(256bytesormorein2aligned) +L(less256bytesin2alinged): + sub $128, %rdx + + movdqa (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqa 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + + movdqa 32(%rdi), %xmm2 + pxor 32(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(48bytesin256) + + movdqa 48(%rdi), %xmm2 + pxor 48(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(64bytesin256) + + movdqa 64(%rdi), %xmm2 + pxor 64(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(80bytesin256) + + movdqa 80(%rdi), %xmm2 + pxor 80(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(96bytesin256) + + movdqa 96(%rdi), %xmm2 + pxor 96(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(112bytesin256) + + movdqa 112(%rdi), %xmm2 + pxor 112(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(128bytesin256) + + add $128, %rsi + add $128, %rdi + + cmp $64, %rdx + jae L(less128bytesin2aligned) + + cmp $32, %rdx + jb L(less32bytesin128in2aligned) + + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqu 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + sub $32, %rdx + add $32, %rdi + add $32, %rsi +L(less32bytesin128in2aligned): + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + + ALIGN (4) +L(256bytesormorein2aligned): + + sub $256, %rdx + movdqa (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqa 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + + movdqa 32(%rdi), %xmm2 + pxor 32(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(48bytesin256) + + movdqa 48(%rdi), %xmm2 + pxor 48(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(64bytesin256) + + movdqa 64(%rdi), %xmm2 + pxor 64(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(80bytesin256) + + movdqa 80(%rdi), %xmm2 + pxor 80(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(96bytesin256) + + movdqa 96(%rdi), %xmm2 + pxor 96(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(112bytesin256) + + movdqa 112(%rdi), %xmm2 + pxor 112(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(128bytesin256) + + movdqa 128(%rdi), %xmm2 + pxor 128(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(144bytesin256) + + movdqa 144(%rdi), %xmm2 + pxor 144(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(160bytesin256) + + movdqa 160(%rdi), %xmm2 + pxor 160(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(176bytesin256) + + movdqa 176(%rdi), %xmm2 + pxor 176(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(192bytesin256) + + movdqa 
192(%rdi), %xmm2 + pxor 192(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(208bytesin256) + + movdqa 208(%rdi), %xmm2 + pxor 208(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(224bytesin256) + + movdqa 224(%rdi), %xmm2 + pxor 224(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(240bytesin256) + + movdqa 240(%rdi), %xmm2 + pxor 240(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(256bytesin256) + + add $256, %rsi + add $256, %rdi + + cmp $128, %rdx + jae L(less256bytesin2alinged) + + cmp $64, %rdx + jae L(less128bytesin2aligned) + + cmp $32, %rdx + jb L(less32bytesin256in2alinged) + + movdqa (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqa 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + sub $32, %rdx + add $32, %rdi + add $32, %rsi +L(less32bytesin256in2alinged): + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + + ALIGN (4) +L(512bytesormorein2aligned): +#ifdef SHARED_CACHE_SIZE_HALF + mov $DATA_CACHE_SIZE_HALF, %r8 +#else + mov __x86_64_data_cache_size_half(%rip), %r8 +#endif + mov %r8, %r9 + shr $1, %r8 + add %r9, %r8 + cmp %r8, %rdx + ja L(L2_L3_cache_aglined) + + sub $64, %rdx + ALIGN (4) +L(64bytesormore_loopin2aligned): + movdqa (%rdi), %xmm2 + pxor (%rsi), %xmm2 + movdqa %xmm2, %xmm1 + + movdqa 16(%rdi), %xmm3 + pxor 16(%rsi), %xmm3 + por %xmm3, %xmm1 + + movdqa 32(%rdi), %xmm4 + pxor 32(%rsi), %xmm4 + por %xmm4, %xmm1 + + movdqa 48(%rdi), %xmm5 + pxor 48(%rsi), %xmm5 + por %xmm5, %xmm1 + + ptest %xmm1, %xmm0 + jnc L(64bytesormore_loop_end) + add $64, %rsi + add $64, %rdi + sub $64, %rdx + jae L(64bytesormore_loopin2aligned) + + add $64, %rdx + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) +L(L2_L3_cache_aglined): + sub $64, %rdx + ALIGN (4) +L(L2_L3_aligned_128bytes_loop): + prefetchnta 0x1c0(%rdi) + prefetchnta 0x1c0(%rsi) + movdqa (%rdi), %xmm2 + pxor (%rsi), %xmm2 + movdqa %xmm2, %xmm1 + + movdqa 16(%rdi), %xmm3 + pxor 16(%rsi), %xmm3 + por %xmm3, %xmm1 + + movdqa 32(%rdi), %xmm4 + pxor 32(%rsi), %xmm4 + por %xmm4, %xmm1 + + movdqa 48(%rdi), %xmm5 + pxor 48(%rsi), %xmm5 + por %xmm5, %xmm1 + + ptest %xmm1, %xmm0 + jnc L(64bytesormore_loop_end) + add $64, %rsi + add $64, %rdi + sub $64, %rdx + jae L(L2_L3_aligned_128bytes_loop) + + add $64, %rdx + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + + ALIGN (4) L(64bytesormore_loop_end): add $16, %rdi @@ -1147,7 +1510,10 @@ L(32bytes): xor %eax, %eax ret - ALIGN (4) +/* + * Aligned 8 bytes to avoid 2 branch "taken" in one 16 alinged code block. + */ + ALIGN (3) L(less16bytes): movsbq %dl, %rdx mov (%rsi, %rdx), %rcx @@ -1166,7 +1532,6 @@ L(diffin4bytes): jne L(diffin2bytes) shr $16, %ecx shr $16, %eax - xor %rdx, %rdx L(diffin2bytes): cmp %cl, %al jne L(end) -- cgit v1.2.3 From df87f54923da6cb94e1a7d65c3f2349c95d49700 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Wed, 14 Apr 2010 22:18:27 -0700 Subject: Check DATA_CACHE_SIZE_HALF --- ChangeLog | 5 +++++ sysdeps/x86_64/multiarch/memcmp-sse4.S | 4 ++-- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'sysdeps') diff --git a/ChangeLog b/ChangeLog index 9b23fd979d..a780308f20 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2010-04-14 H.J. Lu + + * sysdeps/x86_64/multiarch/memcmp-sse4.S: Check + DATA_CACHE_SIZE_HALF instead of SHARED_CACHE_SIZE_HALF. 
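[Editorial note: the one-line fix below is to the aligned 512-byte
path added in the previous commit, whose #ifdef tested
SHARED_CACHE_SIZE_HALF while the guarded instruction loads
$DATA_CACHE_SIZE_HALF, so the compile-time and runtime cache-size
paths could not be selected consistently.  The computed value --
half the data cache plus half of that again, i.e. three quarters of
it -- is the threshold beyond which both the aligned and unaligned
loops switch to prefetchnta to limit cache pollution.]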
+ 2010-04-14 Andreas Schwab * elf/dl-version.c (_dl_check_map_versions): Avoid index overflow diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S index 25dba864f8..fc439bb013 100644 --- a/sysdeps/x86_64/multiarch/memcmp-sse4.S +++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S @@ -34,7 +34,7 @@ #define JMPTBL(I, B) (I - B) #define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ - lea TABLE(%rip), %r11; \ + lea TABLE(%rip), %r11; \ movslq (%r11, INDEX, SCALE), %rcx; \ add %r11, %rcx; \ jmp *%rcx; \ @@ -624,7 +624,7 @@ L(less32bytesin256in2alinged): ALIGN (4) L(512bytesormorein2aligned): -#ifdef SHARED_CACHE_SIZE_HALF +#ifdef DATA_CACHE_SIZE_HALF mov $DATA_CACHE_SIZE_HALF, %r8 #else mov __x86_64_data_cache_size_half(%rip), %r8 -- cgit v1.2.3 From a11ec63713ea3903c482dc907a108be404191a02 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Wed, 14 Apr 2010 22:27:59 -0700 Subject: Add x86-32 FMA support --- ChangeLog | 11 ++++++++++ sysdeps/i386/configure | 25 ++++++++++++++++++++++ sysdeps/i386/configure.in | 11 ++++++++++ sysdeps/i386/i686/multiarch/Makefile | 6 ++++++ sysdeps/i386/i686/multiarch/Versions | 5 +++++ sysdeps/i386/i686/multiarch/s_fma-fma.c | 30 ++++++++++++++++++++++++++ sysdeps/i386/i686/multiarch/s_fma.c | 36 ++++++++++++++++++++++++++++++++ sysdeps/i386/i686/multiarch/s_fmaf-fma.c | 30 ++++++++++++++++++++++++++ sysdeps/i386/i686/multiarch/s_fmaf.c | 36 ++++++++++++++++++++++++++++++++ sysdeps/x86_64/elf/configure | 25 ---------------------- sysdeps/x86_64/elf/configure.in | 11 ---------- 11 files changed, 190 insertions(+), 36 deletions(-) create mode 100644 sysdeps/i386/i686/multiarch/Versions create mode 100644 sysdeps/i386/i686/multiarch/s_fma-fma.c create mode 100644 sysdeps/i386/i686/multiarch/s_fma.c create mode 100644 sysdeps/i386/i686/multiarch/s_fmaf-fma.c create mode 100644 sysdeps/i386/i686/multiarch/s_fmaf.c (limited to 'sysdeps') diff --git a/ChangeLog b/ChangeLog index a780308f20..da0dc3fe89 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,16 @@ 2010-04-14 H.J. Lu + * sysdeps/x86_64/elf/configure.in: Move AVX test to .... + * sysdeps/i386/configure.in: ...here. + * sysdeps/i386/i686/multiarch/Makefile (libm-sysdep_routines): Define. + (CFLAGS-s_fma-fma.c): Define. + (CFLAGS-s_fmaf-fma.c): Define. + * sysdeps/i386/i686/multiarch/Versions: New file. + * sysdeps/i386/i686/multiarch/s_fma-fma.c: New file. + * sysdeps/i386/i686/multiarch/s_fma.c: New file. + * sysdeps/i386/i686/multiarch/s_fmaf-fma.c: New file. + * sysdeps/i386/i686/multiarch/s_fmaf.c: New file. + * sysdeps/x86_64/multiarch/memcmp-sse4.S: Check DATA_CACHE_SIZE_HALF instead of SHARED_CACHE_SIZE_HALF. diff --git a/sysdeps/i386/configure b/sysdeps/i386/configure index 7814b3b313..21225cd9c9 100644 --- a/sysdeps/i386/configure +++ b/sysdeps/i386/configure @@ -656,3 +656,28 @@ fi fi { $as_echo "$as_me:$LINENO: result: $libc_cv_as_i686" >&5 $as_echo "$libc_cv_as_i686" >&6; } + +{ $as_echo "$as_me:$LINENO: checking for AVX support" >&5 +$as_echo_n "checking for AVX support... " >&6; } +if test "${libc_cv_cc_avx+set}" = set; then + $as_echo_n "(cached) " >&6 +else + if { ac_try='${CC-cc} -mavx -xc /dev/null -S -o /dev/null' + { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 + (eval $ac_try) 2>&5 + ac_status=$? + $as_echo "$as_me:$LINENO: \$? 
= $ac_status" >&5 + (exit $ac_status); }; }; then + libc_cv_cc_avx=yes +else + libc_cv_cc_avx=no +fi +fi +{ $as_echo "$as_me:$LINENO: result: $libc_cv_cc_avx" >&5 +$as_echo "$libc_cv_cc_avx" >&6; } +if test $libc_cv_cc_avx = yes; then + cat >>confdefs.h <<\_ACEOF +#define HAVE_AVX_SUPPORT 1 +_ACEOF + +fi diff --git a/sysdeps/i386/configure.in b/sysdeps/i386/configure.in index 9fc7fa59fe..d8dd648f80 100644 --- a/sysdeps/i386/configure.in +++ b/sysdeps/i386/configure.in @@ -55,3 +55,14 @@ if AC_TRY_COMMAND([${CC-cc} -Wa,-mtune=i686 -xc /dev/null -S -o /dev/null]); the else libc_cv_as_i686=no fi]) + +dnl Check if -mavx works. +AC_CACHE_CHECK(for AVX support, libc_cv_cc_avx, [dnl +if AC_TRY_COMMAND([${CC-cc} -mavx -xc /dev/null -S -o /dev/null]); then + libc_cv_cc_avx=yes +else + libc_cv_cc_avx=no +fi]) +if test $libc_cv_cc_avx = yes; then + AC_DEFINE(HAVE_AVX_SUPPORT) +fi diff --git a/sysdeps/i386/i686/multiarch/Makefile b/sysdeps/i386/i686/multiarch/Makefile index e8847d6fc4..124595068d 100644 --- a/sysdeps/i386/i686/multiarch/Makefile +++ b/sysdeps/i386/i686/multiarch/Makefile @@ -19,3 +19,9 @@ CFLAGS-strstr.c += -msse4 CFLAGS-strcasestr.c += -msse4 endif endif + +ifeq (mathyes,$(subdir)$(config-cflags-avx)) +libm-sysdep_routines += s_fma-fma s_fmaf-fma +CFLAGS-s_fma-fma.c += -mavx -mfpmath=sse +CFLAGS-s_fmaf-fma.c += -mavx -mfpmath=sse +endif diff --git a/sysdeps/i386/i686/multiarch/Versions b/sysdeps/i386/i686/multiarch/Versions new file mode 100644 index 0000000000..59b185ac8d --- /dev/null +++ b/sysdeps/i386/i686/multiarch/Versions @@ -0,0 +1,5 @@ +libc { + GLIBC_PRIVATE { + __get_cpu_features; + } +} diff --git a/sysdeps/i386/i686/multiarch/s_fma-fma.c b/sysdeps/i386/i686/multiarch/s_fma-fma.c new file mode 100644 index 0000000000..e6f77aec77 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/s_fma-fma.c @@ -0,0 +1,30 @@ +/* FMA version of fma. + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include + +#ifdef HAVE_AVX_SUPPORT +double +__fma_fma (double x, double y, double z) +{ + asm ("vfmadd213sd %3, %2, %0" : "=x" (x) : "0" (x), "x" (y), "xm" (z)); + return x; +} +#endif diff --git a/sysdeps/i386/i686/multiarch/s_fma.c b/sysdeps/i386/i686/multiarch/s_fma.c new file mode 100644 index 0000000000..d9291b0be8 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/s_fma.c @@ -0,0 +1,36 @@ +/* Multiple versions of fma. + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. 
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <config.h>
+
+#ifdef HAVE_AVX_SUPPORT
+#include <math.h>
+#include <init-arch.h>
+
+extern double __fma_ia32 (double x, double y, double z) attribute_hidden;
+extern double __fma_fma (double x, double y, double z) attribute_hidden;
+
+libm_ifunc (__fma, HAS_FMA ? __fma_fma : __fma_ia32);
+weak_alias (__fma, fma)
+
+# define __fma __fma_ia32
+#endif
+
+#include <math/s_fma.c>
diff --git a/sysdeps/i386/i686/multiarch/s_fmaf-fma.c b/sysdeps/i386/i686/multiarch/s_fmaf-fma.c
new file mode 100644
index 0000000000..887e9c3829
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/s_fmaf-fma.c
@@ -0,0 +1,30 @@
+/* FMA version of fmaf.
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <config.h>
+
+#ifdef HAVE_AVX_SUPPORT
+float
+__fmaf_fma (float x, float y, float z)
+{
+ asm ("vfmadd213ss %3, %2, %0" : "=x" (x) : "0" (x), "x" (y), "xm" (z));
+ return x;
+}
+#endif
diff --git a/sysdeps/i386/i686/multiarch/s_fmaf.c b/sysdeps/i386/i686/multiarch/s_fmaf.c
new file mode 100644
index 0000000000..4ea9be48ac
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/s_fmaf.c
@@ -0,0 +1,36 @@
+/* Multiple versions of fmaf.
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <config.h>
+
+#ifdef HAVE_AVX_SUPPORT
+#include <math.h>
+#include <init-arch.h>
+
+extern float __fmaf_ia32 (float x, float y, float z) attribute_hidden;
+extern float __fmaf_fma (float x, float y, float z) attribute_hidden;
+
+libm_ifunc (__fmaf, HAS_FMA ? __fmaf_fma : __fmaf_ia32);
+weak_alias (__fmaf, fmaf)
+
+# define __fmaf __fmaf_ia32
+#endif
+
+#include <math/s_fmaf.c>
diff --git a/sysdeps/x86_64/elf/configure b/sysdeps/x86_64/elf/configure
index 0b93b0424e..50cf2fc453 100644
--- a/sysdeps/x86_64/elf/configure
+++ b/sysdeps/x86_64/elf/configure
@@ -47,28 +47,3 @@ cat >>confdefs.h <<\_ACEOF
 #define PI_STATIC_AND_HIDDEN 1
 _ACEOF
 
-
-{ $as_echo "$as_me:$LINENO: checking for AVX support" >&5
-$as_echo_n "checking for AVX support... " >&6; }
-if test "${libc_cv_cc_avx+set}" = set; then
- $as_echo_n "(cached) " >&6
-else
- if { ac_try='${CC-cc} -mavx -xc /dev/null -S -o /dev/null'
- { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5
- (eval $ac_try) 2>&5
- ac_status=$?
- $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
- (exit $ac_status); }; }; then
- libc_cv_cc_avx=yes
-else
- libc_cv_cc_avx=no
-fi
-fi
-{ $as_echo "$as_me:$LINENO: result: $libc_cv_cc_avx" >&5
-$as_echo "$libc_cv_cc_avx" >&6; }
-if test $libc_cv_cc_avx = yes; then
- cat >>confdefs.h <<\_ACEOF
-#define HAVE_AVX_SUPPORT 1
-_ACEOF
-
-fi
diff --git a/sysdeps/x86_64/elf/configure.in b/sysdeps/x86_64/elf/configure.in
index 14d1875302..9cb59d009c 100644
--- a/sysdeps/x86_64/elf/configure.in
+++ b/sysdeps/x86_64/elf/configure.in
@@ -32,14 +32,3 @@ fi
 dnl It is always possible to access static and hidden symbols in an
 dnl position independent way.
 AC_DEFINE(PI_STATIC_AND_HIDDEN)
-
-dnl Check if -mavx works.
-AC_CACHE_CHECK(for AVX support, libc_cv_cc_avx, [dnl
-if AC_TRY_COMMAND([${CC-cc} -mavx -xc /dev/null -S -o /dev/null]); then
- libc_cv_cc_avx=yes
-else
- libc_cv_cc_avx=no
-fi])
-if test $libc_cv_cc_avx = yes; then
- AC_DEFINE(HAVE_AVX_SUPPORT)
-fi
-- cgit v1.2.3
From 94a27fabebf851f4722654aa118a7d5f29a9c2ee Mon Sep 17 00:00:00 2001
From: Ulrich Drepper
Date: Wed, 14 Apr 2010 22:29:51 -0700
Subject: Whitespace fix.

---
 sysdeps/x86_64/elf/configure | 1 -
 1 file changed, 1 deletion(-)
(limited to 'sysdeps')

diff --git a/sysdeps/x86_64/elf/configure b/sysdeps/x86_64/elf/configure
index 50cf2fc453..f722b9e600 100644
--- a/sysdeps/x86_64/elf/configure
+++ b/sysdeps/x86_64/elf/configure
@@ -46,4 +46,3 @@ fi
 cat >>confdefs.h <<\_ACEOF
 #define PI_STATIC_AND_HIDDEN 1
 _ACEOF
-
-- cgit v1.2.3
From 6cc2b8a6dfc93212ce8437a2fb734a9225a4ba1c Mon Sep 17 00:00:00 2001
From: "H.J. Lu"
Date: Thu, 15 Apr 2010 07:49:30 -0700
Subject: Fix bugs in x86-32 strcmp-sse4.S and strcmp-ssse3.S

---
 string/test-strncmp.c | 39 +++++++++++++++++++++++++++---
 sysdeps/i386/i686/multiarch/strcmp-sse4.S | 4 +--
 sysdeps/i386/i686/multiarch/strcmp-ssse3.S | 11 +++++----
 3 files changed, 43 insertions(+), 11 deletions(-)
(limited to 'sysdeps')

diff --git a/string/test-strncmp.c b/string/test-strncmp.c
index 5adf0eb311..3687879c25 100644
--- a/string/test-strncmp.c
+++ b/string/test-strncmp.c
@@ -1,5 +1,5 @@
 /* Test and measure strncmp functions.
- Copyright (C) 1999, 2002, 2003 Free Software Foundation, Inc.
+ Copyright (C) 1999, 2002, 2003, 2010 Free Software Foundation, Inc.
 This file is part of the GNU C Library.
 Written by Jakub Jelinek <jakub@redhat.com>, 1999.
@@ -51,8 +51,8 @@ stupid_strncmp (const char *s1, const char *s2, size_t n) return ret; } -static void -do_one_test (impl_t *impl, const char *s1, const char *s2, size_t n, +static int +check_result (impl_t *impl, const char *s1, const char *s2, size_t n, int exp_result) { int result = CALL (impl, s1, s2, n); @@ -63,9 +63,19 @@ do_one_test (impl_t *impl, const char *s1, const char *s2, size_t n, error (0, 0, "Wrong result in function %s %d %d", impl->name, result, exp_result); ret = 1; - return; + return -1; } + return 0; +} + +static void +do_one_test (impl_t *impl, const char *s1, const char *s2, size_t n, + int exp_result) +{ + if (check_result (impl, s1, s2, n, exp_result) < 0) + return; + if (HP_TIMING_AVAIL) { hp_timing_t start __attribute ((unused)); @@ -283,6 +293,25 @@ do_random_tests (void) } } +static void +check1 (void) +{ + char *s1 = (char *)(buf1 + 0xb2c); + char *s2 = (char *)(buf1 + 0xfd8); + size_t i; + int exp_result; + + strcpy(s1, "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrs"); + strcpy(s2, "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijkLMNOPQRSTUV"); + + for (i = 0; i < 80; i++) + { + exp_result = simple_strncmp (s1, s2, i); + FOR_EACH_IMPL (impl, 0) + check_result (impl, s1, s2, i, exp_result); + } +} + int test_main (void) { @@ -290,6 +319,8 @@ test_main (void) test_init (); + check1 (); + printf ("%23s", ""); FOR_EACH_IMPL (impl, 0) printf ("\t%s", impl->name); diff --git a/sysdeps/i386/i686/multiarch/strcmp-sse4.S b/sysdeps/i386/i686/multiarch/strcmp-sse4.S index 81d6ec66f7..0de0a113c0 100644 --- a/sysdeps/i386/i686/multiarch/strcmp-sse4.S +++ b/sysdeps/i386/i686/multiarch/strcmp-sse4.S @@ -223,8 +223,8 @@ L(crosspage): inc %edx cmp $15, %edx jle L(crosspage) - add $16, %edi - add $16, %esi + add %edx, %edi + add %edx, %esi jmp L(check_offset) .p2align 4 diff --git a/sysdeps/i386/i686/multiarch/strcmp-ssse3.S b/sysdeps/i386/i686/multiarch/strcmp-ssse3.S index 40994c05b1..a4de2259d2 100644 --- a/sysdeps/i386/i686/multiarch/strcmp-ssse3.S +++ b/sysdeps/i386/i686/multiarch/strcmp-ssse3.S @@ -1484,17 +1484,18 @@ L(gobble_ashr_12): sub $0xffff, %esi jnz L(exit) +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + add $16, %ecx movdqa %xmm4, %xmm3 add $16, %edi jg L(nibble_ashr_12) -#ifdef USE_AS_STRNCMP - cmp $16, %ebp - lea -16(%ebp), %ebp - jbe L(more8byteseq) -#endif movdqa (%eax, %ecx), %xmm1 movdqa (%edx, %ecx), %xmm2 movdqa %xmm2, %xmm4 -- cgit v1.2.3
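Note on the dispatch mechanism in the FMA patch above: s_fma.c and s_fmaf.c route the symbols through glibc's libm_ifunc macro, so an IFUNC resolver runs when the dynamic linker binds __fma/__fmaf and selects the __fma_fma or __fma_ia32 variant according to the HAS_FMA bit from init-arch. The stand-alone sketch below shows the same mechanism in miniature. It is only an illustration, not glibc code: it assumes GCC on x86 GNU/Linux with binutils able to assemble vfmadd213sd, and the names my_fma, resolve_fma, fma_hw and fma_generic are invented for the example.

#include <stdio.h>

/* Fallback used when the CPU lacks FMA.  Note that x * y + z rounds
   twice, so this is only a placeholder, not a correct fma substitute.  */
static double
fma_generic (double x, double y, double z)
{
  return x * y + z;
}

/* Hardware version using the same instruction as the patch; only ever
   reached once the resolver has confirmed FMA support.  */
static double
fma_hw (double x, double y, double z)
{
  asm ("vfmadd213sd %3, %2, %0" : "=x" (x) : "0" (x), "x" (y), "xm" (z));
  return x;
}

/* The resolver runs once, when the dynamic linker processes the
   IRELATIVE relocation for my_fma, and returns the function to bind.  */
static double (*resolve_fma (void)) (double, double, double)
{
  __builtin_cpu_init ();
  return __builtin_cpu_supports ("fma") ? fma_hw : fma_generic;
}

double my_fma (double, double, double)
  __attribute__ ((ifunc ("resolve_fma")));

int
main (void)
{
  printf ("%g\n", my_fma (2.0, 3.0, 1.0));  /* Prints 7.  */
  return 0;
}

Because the binding happens once, at relocation time, later calls jump straight to the chosen variant with no per-call check; that is the point of routing fma through libm_ifunc rather than testing HAS_FMA inside the function body.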
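Note on the check1 test added to test-strncmp.c above: the hard-coded offsets 0xb2c and 0xfd8 make s2 (0xfd8 is byte 4056, and the 70-odd-byte string runs past byte 4096) cross a page boundary, assuming buf1 is page-aligned as the test framework maps it. That is exactly the case the strcmp-sse4.S hunk fixes, where the cross-page path previously advanced %edi/%esi by a constant 16 instead of the byte count accumulated in %edx. A common way to make such over-reads fail deterministically is to place the string flush against an unmapped guard page. The sketch below is not the glibc harness, just a minimal stand-alone version of that technique; it assumes POSIX mmap/mprotect, and all names are local to the example.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>

int
main (void)
{
  size_t page = (size_t) sysconf (_SC_PAGESIZE);

  /* Map two pages and revoke all access to the second, so any read
     past the first page faults instead of silently succeeding.  */
  char *buf = mmap (NULL, 2 * page, PROT_READ | PROT_WRITE,
                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (buf == MAP_FAILED || mprotect (buf + page, page, PROT_NONE) != 0)
    return 1;

  const char *ref = "abcdefghijklmnopqrstuvwxyz";
  size_t len = strlen (ref) + 1;  /* Include the terminating NUL.  */

  /* Place the string so its NUL is the last accessible byte; reading
     even one byte further hits the PROT_NONE page.  */
  char *s = buf + page - len;
  memcpy (s, ref, len);

  for (size_t n = 0; n <= len; n++)
    if (strncmp (s, ref, n) != 0)
      {
        printf ("mismatch at n=%zu\n", n);
        return 1;
      }

  puts ("ok");
  return 0;
}

An implementation that reads beyond its buffer, as the pre-fix cross-page path could, crashes on the guard page here instead of passing by luck.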