diff options
author | Andreas Schwab <schwab@redhat.com> | 2010-04-16 12:49:09 +0200 |
---|---|---|
committer | Andreas Schwab <schwab@redhat.com> | 2010-04-16 12:49:09 +0200 |
commit | 54627f3c15f4a6e2ff02e60afacdb5f8008d399a (patch) | |
tree | ca0a25e66c8e6d48c7db07e6152a0549c27a1916 | |
parent | 1ef17274decb5381a50a2146cb3444630f67b527 (diff) | |
parent | 1cdb2151fbad6bff650e85a0476972881bbc027b (diff) |
Merge remote branch 'origin/master' into fedora/master
24 files changed, 2133 insertions, 162 deletions
@@ -1,3 +1,69 @@ +2010-04-15 H.J. Lu <hongjiu.lu@intel.com> + + * string/test-strncmp.c (check_result): New function. + (do_one_test): Use it. + (check1): New function. + (test_main): Use it. + * sysdeps/i386/i686/multiarch/strcmp-sse4.S (crosspage): Properly + update source and destination. + * sysdeps/i386/i686/multiarch/strcmp-ssse3.S (gobble_ashr_12): + Properly check and update counter. + +2010-04-14 H.J. Lu <hongjiu.lu@intel.com> + + * sysdeps/x86_64/elf/configure.in: Move AVX test to .... + * sysdeps/i386/configure.in: ...here. + * sysdeps/i386/i686/multiarch/Makefile (libm-sysdep_routines): Define. + (CFLAGS-s_fma-fma.c): Define. + (CFLAGS-s_fmaf-fma.c): Define. + * sysdeps/i386/i686/multiarch/Versions: New file. + * sysdeps/i386/i686/multiarch/s_fma-fma.c: New file. + * sysdeps/i386/i686/multiarch/s_fma.c: New file. + * sysdeps/i386/i686/multiarch/s_fmaf-fma.c: New file. + * sysdeps/i386/i686/multiarch/s_fmaf.c: New file. + + * sysdeps/x86_64/multiarch/memcmp-sse4.S: Check + DATA_CACHE_SIZE_HALF instead of SHARED_CACHE_SIZE_HALF. + +2010-04-14 Andreas Schwab <schwab@redhat.com> + + * elf/dl-version.c (_dl_check_map_versions): Avoid index overflow + when dependencies are missing. + +2010-04-14 H.J. Lu <hongjiu.lu@intel.com> + + * sysdeps/x86_64/multiarch/memcmp-sse4.S: Optimized for unaligned + data. + +2010-04-12 H.J. Lu <hongjiu.lu@intel.com> + + * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add + memcmp-sse4. + * sysdeps/x86_64/multiarch/memcmp-sse4.S: New file. + * sysdeps/x86_64/multiarch/memcmp.S: New file. + * sysdeps/x86_64/multiarch/rtld-memcmp.c: New file. + +2010-04-13 Ulrich Drepper <drepper@redhat.com> + + * sysdeps/x86_64/multiarch/init-arch.h: Pretty printing. + Add SSE 4.1 macros. + +2010-04-10 Matt Fleming <matt@console-pimps.org> + + * elf/elf.h: Add SH specific ELF header flags. + +2010-04-13 Andreas Schwab <schwab@redhat.com> + + * sysdeps/unix/sysv/linux/s390/s390-32/makecontext.c: Fix setup of + overflow area. + * sysdeps/unix/sysv/linux/s390/s390-64/makecontext.c: Likewise. + +2010-04-12 Andreas Schwab <schwab@redhat.com> + + * stdlib/tst-makecontext3.c (main): Initialize ucontext_t objects + only with getcontext. Test for unimplemented makecontext by + checking errno. + 2010-04-09 Ulrich Drepper <drepper@redhat.com> * nscd/aicache.c (addhstaiX): Correct passing memory to address diff --git a/elf/dl-version.c b/elf/dl-version.c index 9e881162a6..c59a6c3cd3 100644 --- a/elf/dl-version.c +++ b/elf/dl-version.c @@ -322,10 +322,14 @@ _dl_check_map_versions (struct link_map *map, int verbose, int trace_mode) while (1) { ElfW(Half) ndx = aux->vna_other & 0x7fff; - map->l_versions[ndx].hash = aux->vna_hash; - map->l_versions[ndx].hidden = aux->vna_other & 0x8000; - map->l_versions[ndx].name = &strtab[aux->vna_name]; - map->l_versions[ndx].filename = &strtab[ent->vn_file]; + /* In trace mode, dependencies may be missing. */ + if (__builtin_expect (ndx < map->l_nversions, 1)) + { + map->l_versions[ndx].hash = aux->vna_hash; + map->l_versions[ndx].hidden = aux->vna_other & 0x8000; + map->l_versions[ndx].name = &strtab[aux->vna_name]; + map->l_versions[ndx].filename = &strtab[ent->vn_file]; + } if (aux->vna_next == 0) /* No more symbols. */ @@ -2477,6 +2477,30 @@ typedef Elf32_Addr Elf32_Conflict; /* SH specific declarations */ +/* Processor specific flags for the ELF header e_flags field. */ +#define EF_SH_MACH_MASK 0x1f +#define EF_SH_UNKNOWN 0x0 +#define EF_SH1 0x1 +#define EF_SH2 0x2 +#define EF_SH3 0x3 +#define EF_SH_DSP 0x4 +#define EF_SH3_DSP 0x5 +#define EF_SH4AL_DSP 0x6 +#define EF_SH3E 0x8 +#define EF_SH4 0x9 +#define EF_SH2E 0xb +#define EF_SH4A 0xc +#define EF_SH2A 0xd +#define EF_SH4_NOFPU 0x10 +#define EF_SH4A_NOFPU 0x11 +#define EF_SH4_NOMMU_NOFPU 0x12 +#define EF_SH2A_NOFPU 0x13 +#define EF_SH3_NOMMU 0x14 +#define EF_SH2A_SH4_NOFPU 0x15 +#define EF_SH2A_SH3_NOFPU 0x16 +#define EF_SH2A_SH4 0x17 +#define EF_SH2A_SH3E 0x18 + /* SH relocs. */ #define R_SH_NONE 0 #define R_SH_DIR32 1 diff --git a/stdlib/tst-makecontext3.c b/stdlib/tst-makecontext3.c index f127c6a579..a44169ae36 100644 --- a/stdlib/tst-makecontext3.c +++ b/stdlib/tst-makecontext3.c @@ -136,38 +136,42 @@ main (void) exit (1); } - ctx[1] = ctx[0]; + if (getcontext (&ctx[1]) != 0) + { + printf ("%s: getcontext: %m\n", __FUNCTION__); + exit (1); + } + ctx[1].uc_stack.ss_sp = st1; ctx[1].uc_stack.ss_size = sizeof st1; ctx[1].uc_link = &ctx[0]; - { - ucontext_t tempctx = ctx[1]; - makecontext (&ctx[1], (void (*) (void)) f1, 33, - 0x00000001 << flag, 0x00000004 << flag, - 0x00000012 << flag, 0x00000048 << flag, - 0x00000123 << flag, 0x0000048d << flag, - 0x00001234 << flag, 0x000048d1 << flag, - 0x00012345 << flag, 0x00048d15 << flag, - 0x00123456 << flag, 0x0048d159 << flag, - 0x01234567 << flag, 0x048d159e << flag, - 0x12345678 << flag, 0x48d159e2 << flag, - 0x23456789 << flag, 0x8d159e26 << flag, - 0x3456789a << flag, 0xd159e26a << flag, - 0x456789ab << flag, 0x159e26af << flag, - 0x56789abc << flag, 0x59e26af3 << flag, - 0x6789abcd << flag, 0x9e26af37 << flag, - 0x789abcde << flag, 0xe26af37b << flag, - 0x89abcdef << flag, 0x26af37bc << flag, - 0x9abcdef0 << flag, 0x6af37bc3 << flag, - 0xabcdef0f << flag); - - /* Without this check, a stub makecontext can make us spin forever. */ - if (memcmp (&tempctx, &ctx[1], sizeof ctx[1]) == 0) - { - puts ("makecontext was a no-op, presuming not implemented"); - return 0; - } - } + errno = 0; + makecontext (&ctx[1], (void (*) (void)) f1, 33, + 0x00000001 << flag, 0x00000004 << flag, + 0x00000012 << flag, 0x00000048 << flag, + 0x00000123 << flag, 0x0000048d << flag, + 0x00001234 << flag, 0x000048d1 << flag, + 0x00012345 << flag, 0x00048d15 << flag, + 0x00123456 << flag, 0x0048d159 << flag, + 0x01234567 << flag, 0x048d159e << flag, + 0x12345678 << flag, 0x48d159e2 << flag, + 0x23456789 << flag, 0x8d159e26 << flag, + 0x3456789a << flag, 0xd159e26a << flag, + 0x456789ab << flag, 0x159e26af << flag, + 0x56789abc << flag, 0x59e26af3 << flag, + 0x6789abcd << flag, 0x9e26af37 << flag, + 0x789abcde << flag, 0xe26af37b << flag, + 0x89abcdef << flag, 0x26af37bc << flag, + 0x9abcdef0 << flag, 0x6af37bc3 << flag, + 0xabcdef0f << flag); + + /* Without this check, a stub makecontext can make us spin forever. */ + if (errno == ENOSYS) + { + puts ("makecontext not implemented"); + back_in_main = 1; + return 0; + } /* Play some tricks with this context. */ if (++global == 1) diff --git a/string/test-strncmp.c b/string/test-strncmp.c index 5adf0eb311..3687879c25 100644 --- a/string/test-strncmp.c +++ b/string/test-strncmp.c @@ -1,5 +1,5 @@ /* Test and measure strncmp functions. - Copyright (C) 1999, 2002, 2003 Free Software Foundation, Inc. + Copyright (C) 1999, 2002, 2003, 2010 Free Software Foundation, Inc. This file is part of the GNU C Library. Written by Jakub Jelinek <jakub@redhat.com>, 1999. @@ -51,8 +51,8 @@ stupid_strncmp (const char *s1, const char *s2, size_t n) return ret; } -static void -do_one_test (impl_t *impl, const char *s1, const char *s2, size_t n, +static int +check_result (impl_t *impl, const char *s1, const char *s2, size_t n, int exp_result) { int result = CALL (impl, s1, s2, n); @@ -63,9 +63,19 @@ do_one_test (impl_t *impl, const char *s1, const char *s2, size_t n, error (0, 0, "Wrong result in function %s %d %d", impl->name, result, exp_result); ret = 1; - return; + return -1; } + return 0; +} + +static void +do_one_test (impl_t *impl, const char *s1, const char *s2, size_t n, + int exp_result) +{ + if (check_result (impl, s1, s2, n, exp_result) < 0) + return; + if (HP_TIMING_AVAIL) { hp_timing_t start __attribute ((unused)); @@ -283,6 +293,25 @@ do_random_tests (void) } } +static void +check1 (void) +{ + char *s1 = (char *)(buf1 + 0xb2c); + char *s2 = (char *)(buf1 + 0xfd8); + size_t i; + int exp_result; + + strcpy(s1, "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrs"); + strcpy(s2, "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijkLMNOPQRSTUV"); + + for (i = 0; i < 80; i++) + { + exp_result = simple_strncmp (s1, s2, i); + FOR_EACH_IMPL (impl, 0) + check_result (impl, s1, s2, i, exp_result); + } +} + int test_main (void) { @@ -290,6 +319,8 @@ test_main (void) test_init (); + check1 (); + printf ("%23s", ""); FOR_EACH_IMPL (impl, 0) printf ("\t%s", impl->name); diff --git a/sysdeps/i386/configure b/sysdeps/i386/configure index 7814b3b313..21225cd9c9 100644 --- a/sysdeps/i386/configure +++ b/sysdeps/i386/configure @@ -656,3 +656,28 @@ fi fi { $as_echo "$as_me:$LINENO: result: $libc_cv_as_i686" >&5 $as_echo "$libc_cv_as_i686" >&6; } + +{ $as_echo "$as_me:$LINENO: checking for AVX support" >&5 +$as_echo_n "checking for AVX support... " >&6; } +if test "${libc_cv_cc_avx+set}" = set; then + $as_echo_n "(cached) " >&6 +else + if { ac_try='${CC-cc} -mavx -xc /dev/null -S -o /dev/null' + { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 + (eval $ac_try) 2>&5 + ac_status=$? + $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); }; }; then + libc_cv_cc_avx=yes +else + libc_cv_cc_avx=no +fi +fi +{ $as_echo "$as_me:$LINENO: result: $libc_cv_cc_avx" >&5 +$as_echo "$libc_cv_cc_avx" >&6; } +if test $libc_cv_cc_avx = yes; then + cat >>confdefs.h <<\_ACEOF +#define HAVE_AVX_SUPPORT 1 +_ACEOF + +fi diff --git a/sysdeps/i386/configure.in b/sysdeps/i386/configure.in index 9fc7fa59fe..d8dd648f80 100644 --- a/sysdeps/i386/configure.in +++ b/sysdeps/i386/configure.in @@ -55,3 +55,14 @@ if AC_TRY_COMMAND([${CC-cc} -Wa,-mtune=i686 -xc /dev/null -S -o /dev/null]); the else libc_cv_as_i686=no fi]) + +dnl Check if -mavx works. +AC_CACHE_CHECK(for AVX support, libc_cv_cc_avx, [dnl +if AC_TRY_COMMAND([${CC-cc} -mavx -xc /dev/null -S -o /dev/null]); then + libc_cv_cc_avx=yes +else + libc_cv_cc_avx=no +fi]) +if test $libc_cv_cc_avx = yes; then + AC_DEFINE(HAVE_AVX_SUPPORT) +fi diff --git a/sysdeps/i386/i686/multiarch/Makefile b/sysdeps/i386/i686/multiarch/Makefile index e8847d6fc4..124595068d 100644 --- a/sysdeps/i386/i686/multiarch/Makefile +++ b/sysdeps/i386/i686/multiarch/Makefile @@ -19,3 +19,9 @@ CFLAGS-strstr.c += -msse4 CFLAGS-strcasestr.c += -msse4 endif endif + +ifeq (mathyes,$(subdir)$(config-cflags-avx)) +libm-sysdep_routines += s_fma-fma s_fmaf-fma +CFLAGS-s_fma-fma.c += -mavx -mfpmath=sse +CFLAGS-s_fmaf-fma.c += -mavx -mfpmath=sse +endif diff --git a/sysdeps/i386/i686/multiarch/Versions b/sysdeps/i386/i686/multiarch/Versions new file mode 100644 index 0000000000..59b185ac8d --- /dev/null +++ b/sysdeps/i386/i686/multiarch/Versions @@ -0,0 +1,5 @@ +libc { + GLIBC_PRIVATE { + __get_cpu_features; + } +} diff --git a/sysdeps/i386/i686/multiarch/s_fma-fma.c b/sysdeps/i386/i686/multiarch/s_fma-fma.c new file mode 100644 index 0000000000..e6f77aec77 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/s_fma-fma.c @@ -0,0 +1,30 @@ +/* FMA version of fma. + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <config.h> + +#ifdef HAVE_AVX_SUPPORT +double +__fma_fma (double x, double y, double z) +{ + asm ("vfmadd213sd %3, %2, %0" : "=x" (x) : "0" (x), "x" (y), "xm" (z)); + return x; +} +#endif diff --git a/sysdeps/i386/i686/multiarch/s_fma.c b/sysdeps/i386/i686/multiarch/s_fma.c new file mode 100644 index 0000000000..d9291b0be8 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/s_fma.c @@ -0,0 +1,36 @@ +/* Multiple versions of fma. + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <config.h> + +#ifdef HAVE_AVX_SUPPORT +#include <math.h> +#include <init-arch.h> + +extern double __fma_ia32 (double x, double y, double z) attribute_hidden; +extern double __fma_fma (double x, double y, double z) attribute_hidden; + +libm_ifunc (__fma, HAS_FMA ? __fma_fma : __fma_ia32); +weak_alias (__fma, fma) + +# define __fma __fma_ia32 +#endif + +#include <math/s_fma.c> diff --git a/sysdeps/i386/i686/multiarch/s_fmaf-fma.c b/sysdeps/i386/i686/multiarch/s_fmaf-fma.c new file mode 100644 index 0000000000..887e9c3829 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/s_fmaf-fma.c @@ -0,0 +1,30 @@ +/* FMA version of fmaf. + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <config.h> + +#ifdef HAVE_AVX_SUPPORT +float +__fmaf_fma (float x, float y, float z) +{ + asm ("vfmadd213ss %3, %2, %0" : "=x" (x) : "0" (x), "x" (y), "xm" (z)); + return x; +} +#endif diff --git a/sysdeps/i386/i686/multiarch/s_fmaf.c b/sysdeps/i386/i686/multiarch/s_fmaf.c new file mode 100644 index 0000000000..4ea9be48ac --- /dev/null +++ b/sysdeps/i386/i686/multiarch/s_fmaf.c @@ -0,0 +1,36 @@ +/* Multiple versions of fmaf. + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <config.h> + +#ifdef HAVE_AVX_SUPPORT +#include <math.h> +#include <init-arch.h> + +extern float __fmaf_ia32 (float x, float y, float z) attribute_hidden; +extern float __fmaf_fma (float x, float y, float z) attribute_hidden; + +libm_ifunc (__fmaf, HAS_FMA ? __fmaf_fma : __fmaf_ia32); +weak_alias (__fmaf, fmaf) + +# define __fmaf __fmaf_ia32 +#endif + +#include <math/s_fmaf.c> diff --git a/sysdeps/i386/i686/multiarch/strcmp-sse4.S b/sysdeps/i386/i686/multiarch/strcmp-sse4.S index 81d6ec66f7..0de0a113c0 100644 --- a/sysdeps/i386/i686/multiarch/strcmp-sse4.S +++ b/sysdeps/i386/i686/multiarch/strcmp-sse4.S @@ -223,8 +223,8 @@ L(crosspage): inc %edx cmp $15, %edx jle L(crosspage) - add $16, %edi - add $16, %esi + add %edx, %edi + add %edx, %esi jmp L(check_offset) .p2align 4 diff --git a/sysdeps/i386/i686/multiarch/strcmp-ssse3.S b/sysdeps/i386/i686/multiarch/strcmp-ssse3.S index 40994c05b1..a4de2259d2 100644 --- a/sysdeps/i386/i686/multiarch/strcmp-ssse3.S +++ b/sysdeps/i386/i686/multiarch/strcmp-ssse3.S @@ -1484,17 +1484,18 @@ L(gobble_ashr_12): sub $0xffff, %esi jnz L(exit) +#ifdef USE_AS_STRNCMP + cmp $16, %ebp + lea -16(%ebp), %ebp + jbe L(more8byteseq) +#endif + add $16, %ecx movdqa %xmm4, %xmm3 add $16, %edi jg L(nibble_ashr_12) -#ifdef USE_AS_STRNCMP - cmp $16, %ebp - lea -16(%ebp), %ebp - jbe L(more8byteseq) -#endif movdqa (%eax, %ecx), %xmm1 movdqa (%edx, %ecx), %xmm2 movdqa %xmm2, %xmm4 diff --git a/sysdeps/unix/sysv/linux/s390/s390-32/makecontext.c b/sysdeps/unix/sysv/linux/s390/s390-32/makecontext.c index 94760e0c2b..0e309c3e29 100644 --- a/sysdeps/unix/sysv/linux/s390/s390-32/makecontext.c +++ b/sysdeps/unix/sysv/linux/s390/s390-32/makecontext.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2001 Free Software Foundation, Inc. +/* Copyright (C) 2001, 2010 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Martin Schwidefsky (schwidefsky@de.ibm.com). @@ -28,15 +28,15 @@ double, complex and structure with sizes 0, 2, 4 or 8 won't work. makecontext sets up a stack and the registers for the - context. The stack looks like this: - size offset + user context. The stack looks like this: + size offset %r15 -> +-----------------------+ - 4 | back chain (zero) | 0 - 4 | reserved | 4 - 88 | save area for (*func) | 8 - +-----------------------+ - n | overflow parameters | 96 - +-----------------------+ + 4 | back chain (zero) | 0 + 4 | reserved | 4 + 88 | save area for (*func) | 8 + +-----------------------+ + n | overflow parameters | 96 + +-----------------------+ The registers are set up like this: %r2-%r6: parameters 1 to 5 %r7 : (*func) pointer @@ -54,27 +54,27 @@ void __makecontext (ucontext_t *ucp, void (*func) (void), int argc, ...) { extern void __makecontext_ret (void); - unsigned long *sp; + unsigned long int *sp; va_list ap; - int i; - sp = (unsigned long *) (((unsigned long) ucp->uc_stack.ss_sp - + ucp->uc_stack.ss_size) & -8L); + sp = (unsigned long int *) (((unsigned long int) ucp->uc_stack.ss_sp + + ucp->uc_stack.ss_size) & -8L); /* Set the return address to trampoline. */ - ucp->uc_mcontext.gregs[14] = (long) __makecontext_ret; + ucp->uc_mcontext.gregs[14] = (long int) __makecontext_ret; /* Set register parameters. */ va_start (ap, argc); - for (i = 0; (i < argc) && (i < 5); i++) - ucp->uc_mcontext.gregs[2+i] = va_arg (ap, long); + for (int i = 0; i < argc && i < 5; ++i) + ucp->uc_mcontext.gregs[2 + i] = va_arg (ap, long int); /* The remaining arguments go to the overflow area. */ - if (argc > 5) { - sp -= argc - 5; - for (i = 5; i < argc; i++) - sp[i] = va_arg(ap, long); - } + if (argc > 5) + { + sp -= argc - 5; + for (int i = 5; i < argc; ++i) + sp[i - 5] = va_arg (ap, long int); + } va_end (ap); /* Make room for the save area and set the backchain. */ @@ -82,24 +82,24 @@ __makecontext (ucontext_t *ucp, void (*func) (void), int argc, ...) *sp = 0; /* Pass (*func) to __start_context in %r7. */ - ucp->uc_mcontext.gregs[7] = (long) func; + ucp->uc_mcontext.gregs[7] = (long int) func; /* Pass ucp->uc_link to __start_context in %r8. */ - ucp->uc_mcontext.gregs[8] = (long) ucp->uc_link; + ucp->uc_mcontext.gregs[8] = (long int) ucp->uc_link; /* Pass address of setcontext in %r9. */ - ucp->uc_mcontext.gregs[9] = (long) &setcontext; + ucp->uc_mcontext.gregs[9] = (long int) &setcontext; /* Set stack pointer. */ - ucp->uc_mcontext.gregs[15] = (long) sp; + ucp->uc_mcontext.gregs[15] = (long int) sp; } -asm(".text\n" - ".type __makecontext_ret,@function\n" - "__makecontext_ret:\n" - " basr %r14,%r7\n" - " lr %r2,%r8\n" - " br %r9\n" - ".size __makecontext_ret, .-__makecontext_ret"); +asm (".text\n" + ".type __makecontext_ret,@function\n" + "__makecontext_ret:\n" + " basr %r14,%r7\n" + " lr %r2,%r8\n" + " br %r9\n" + ".size __makecontext_ret, .-__makecontext_ret"); weak_alias (__makecontext, makecontext) diff --git a/sysdeps/unix/sysv/linux/s390/s390-64/makecontext.c b/sysdeps/unix/sysv/linux/s390/s390-64/makecontext.c index b08f1b4047..40ff3eefb4 100644 --- a/sysdeps/unix/sysv/linux/s390/s390-64/makecontext.c +++ b/sysdeps/unix/sysv/linux/s390/s390-64/makecontext.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2001 Free Software Foundation, Inc. +/* Copyright (C) 2001, 2010 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Martin Schwidefsky (schwidefsky@de.ibm.com). @@ -29,14 +29,14 @@ won't work. makecontext sets up a stack and the registers for the user context. The stack looks like this: - size offset + size offset %r15 -> +-----------------------+ - 8 | back chain (zero) | 0 - 8 | reserved | 8 - 144 | save area for (*func) | 16 - +-----------------------+ - n | overflow parameters | 160 - +-----------------------+ + 8 | back chain (zero) | 0 + 8 | reserved | 8 + 144 | save area for (*func) | 16 + +-----------------------+ + n | overflow parameters | 160 + +-----------------------+ The registers are set up like this: %r2-%r6: parameters 1 to 5 %r7 : (*func) pointer @@ -54,27 +54,27 @@ void __makecontext (ucontext_t *ucp, void (*func) (void), int argc, ...) { extern void __makecontext_ret (void); - unsigned long *sp; + unsigned long int *sp; va_list ap; - int i; - sp = (unsigned long *) (((unsigned long) ucp->uc_stack.ss_sp - + ucp->uc_stack.ss_size) & -8L); + sp = (unsigned long int *) (((unsigned long int) ucp->uc_stack.ss_sp + + ucp->uc_stack.ss_size) & -8L); /* Set the return address to trampoline. */ - ucp->uc_mcontext.gregs[14] = (long) __makecontext_ret; + ucp->uc_mcontext.gregs[14] = (long int) __makecontext_ret; /* Set register parameters. */ va_start (ap, argc); - for (i = 0; (i < argc) && (i < 5); i++) - ucp->uc_mcontext.gregs[2+i] = va_arg (ap, long); + for (int i = 0; i < argc && i < 5; ++i) + ucp->uc_mcontext.gregs[2 + i] = va_arg (ap, long int); /* The remaining arguments go to the overflow area. */ - if (argc > 5) { - sp -= argc - 5; - for (i = 5; i < argc; i++) - sp[i] = va_arg(ap, long); - } + if (argc > 5) + { + sp -= argc - 5; + for (int i = 5; i < argc; ++i) + sp[i - 5] = va_arg (ap, long int); + } va_end (ap); /* Make room for the save area and set the backchain. */ @@ -82,24 +82,24 @@ __makecontext (ucontext_t *ucp, void (*func) (void), int argc, ...) *sp = 0; /* Pass (*func) to __start_context in %r7. */ - ucp->uc_mcontext.gregs[7] = (long) func; + ucp->uc_mcontext.gregs[7] = (long int) func; /* Pass ucp->uc_link to __start_context in %r8. */ - ucp->uc_mcontext.gregs[8] = (long) ucp->uc_link; + ucp->uc_mcontext.gregs[8] = (long int) ucp->uc_link; /* Pass address of setcontext in %r9. */ - ucp->uc_mcontext.gregs[9] = (long) &setcontext; + ucp->uc_mcontext.gregs[9] = (long int) &setcontext; /* Set stack pointer. */ - ucp->uc_mcontext.gregs[15] = (long) sp; + ucp->uc_mcontext.gregs[15] = (long int) sp; } -asm(".text\n" - ".type __makecontext_ret,@function\n" - "__makecontext_ret:\n" - " basr %r14,%r7\n" - " lgr %r2,%r8\n" - " br %r9\n" - ".size __makecontext_ret, .-__makecontext_ret"); +asm (".text\n" + ".type __makecontext_ret,@function\n" + "__makecontext_ret:\n" + " basr %r14,%r7\n" + " lgr %r2,%r8\n" + " br %r9\n" + ".size __makecontext_ret, .-__makecontext_ret"); weak_alias (__makecontext, makecontext) diff --git a/sysdeps/x86_64/elf/configure b/sysdeps/x86_64/elf/configure index 0b93b0424e..f722b9e600 100644 --- a/sysdeps/x86_64/elf/configure +++ b/sysdeps/x86_64/elf/configure @@ -46,29 +46,3 @@ fi cat >>confdefs.h <<\_ACEOF #define PI_STATIC_AND_HIDDEN 1 _ACEOF - - -{ $as_echo "$as_me:$LINENO: checking for AVX support" >&5 -$as_echo_n "checking for AVX support... " >&6; } -if test "${libc_cv_cc_avx+set}" = set; then - $as_echo_n "(cached) " >&6 -else - if { ac_try='${CC-cc} -mavx -xc /dev/null -S -o /dev/null' - { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 - (eval $ac_try) 2>&5 - ac_status=$? - $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 - (exit $ac_status); }; }; then - libc_cv_cc_avx=yes -else - libc_cv_cc_avx=no -fi -fi -{ $as_echo "$as_me:$LINENO: result: $libc_cv_cc_avx" >&5 -$as_echo "$libc_cv_cc_avx" >&6; } -if test $libc_cv_cc_avx = yes; then - cat >>confdefs.h <<\_ACEOF -#define HAVE_AVX_SUPPORT 1 -_ACEOF - -fi diff --git a/sysdeps/x86_64/elf/configure.in b/sysdeps/x86_64/elf/configure.in index 14d1875302..9cb59d009c 100644 --- a/sysdeps/x86_64/elf/configure.in +++ b/sysdeps/x86_64/elf/configure.in @@ -32,14 +32,3 @@ fi dnl It is always possible to access static and hidden symbols in an dnl position independent way. AC_DEFINE(PI_STATIC_AND_HIDDEN) - -dnl Check if -mavx works. -AC_CACHE_CHECK(for AVX support, libc_cv_cc_avx, [dnl -if AC_TRY_COMMAND([${CC-cc} -mavx -xc /dev/null -S -o /dev/null]); then - libc_cv_cc_avx=yes -else - libc_cv_cc_avx=no -fi]) -if test $libc_cv_cc_avx = yes; then - AC_DEFINE(HAVE_AVX_SUPPORT) -fi diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 364e7bbbd2..c61cf70345 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -5,7 +5,7 @@ endif ifeq ($(subdir),string) sysdep_routines += stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \ - strend-sse4 + strend-sse4 memcmp-sse4 ifeq (yes,$(config-cflags-sse4)) sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c CFLAGS-strcspn-c.c += -msse4 diff --git a/sysdeps/x86_64/multiarch/init-arch.h b/sysdeps/x86_64/multiarch/init-arch.h index 5c73813404..b2f2de3796 100644 --- a/sysdeps/x86_64/multiarch/init-arch.h +++ b/sysdeps/x86_64/multiarch/init-arch.h @@ -20,21 +20,23 @@ #ifdef __ASSEMBLER__ -#include <ifunc-defines.h> +# include <ifunc-defines.h> -#define bit_SSE2 (1 << 26) -#define bit_SSSE3 (1 << 9) -#define bit_SSE4_2 (1 << 20) +# define bit_SSE2 (1 << 26) +# define bit_SSSE3 (1 << 9) +# define bit_SSE4_1 (1 << 19) +# define bit_SSE4_2 (1 << 20) -#define index_SSE2 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_EDX_OFFSET -#define index_SSSE3 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET -#define index_SSE4_2 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET +# define index_SSE2 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_EDX_OFFSET +# define index_SSSE3 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET +# define index_SSE4_1 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET +# define index_SSE4_2 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET #define index_Fast_Rep_String FEATURE_INDEX_1*FEATURE_SIZE #else /* __ASSEMBLER__ */ -#include <sys/param.h> +# include <sys/param.h> enum { @@ -84,20 +86,22 @@ extern void __init_cpu_features (void) attribute_hidden; extern const struct cpu_features *__get_cpu_features (void) __attribute__ ((const)); -#ifndef NOT_IN_libc -# define __get_cpu_features() (&__cpu_features) -#endif +# ifndef NOT_IN_libc +# define __get_cpu_features() (&__cpu_features) +# endif -#define HAS_CPU_FEATURE(idx, reg, bit) \ +# define HAS_CPU_FEATURE(idx, reg, bit) \ ((__get_cpu_features ()->cpuid[idx].reg & (1 << (bit))) != 0) /* Following are the feature tests used throughout libc. */ -#define HAS_SSE2 HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, edx, 26) -#define HAS_POPCOUNT HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 23) -#define HAS_SSE4_2 HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 20) -#define HAS_FMA HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 12) +# define HAS_SSE2 HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, edx, 26) +# define HAS_POPCOUNT HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 23) +# define HAS_SSSE3 HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 9) +# define HAS_SSE4_1 HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 19) +# define HAS_SSE4_2 HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 20) +# define HAS_FMA HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 12) -#define index_Fast_Rep_String FEATURE_INDEX_1 +# define index_Fast_Rep_String FEATURE_INDEX_1 #endif /* __ASSEMBLER__ */ diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S new file mode 100644 index 0000000000..fc439bb013 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S @@ -0,0 +1,1635 @@ +/* memcmp with SSE4.1 + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#ifndef NOT_IN_libc + +#include <sysdep.h> +#include "asm-syntax.h" + +#ifndef MEMCMP +# define MEMCMP __memcmp_sse4_1 +#endif + +#ifndef ALIGN +# define ALIGN(n) .p2align n +#endif + +#define JMPTBL(I, B) (I - B) + +#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + lea TABLE(%rip), %r11; \ + movslq (%r11, INDEX, SCALE), %rcx; \ + add %r11, %rcx; \ + jmp *%rcx; \ + ud2 + + .section .text.sse4.1,"ax",@progbits +ENTRY (MEMCMP) + pxor %xmm0, %xmm0 + cmp $79, %rdx + ja L(79bytesormore) + cmp $1, %rdx + je L(firstbyte) + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + + ALIGN (4) +L(firstbyte): + movzbl (%rdi), %eax + movzbl (%rsi), %ecx + sub %ecx, %eax + ret + + ALIGN (4) +L(79bytesormore): + movdqu (%rsi), %xmm1 + movdqu (%rdi), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + mov %rsi, %rcx + and $-16, %rsi + add $16, %rsi + sub %rsi, %rcx + + sub %rcx, %rdi + add %rcx, %rdx + test $0xf, %rdi + jz L(2aligned) + + cmp $128, %rdx + ja L(128bytesormore) +L(less128bytes): + sub $64, %rdx + + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqu 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + + movdqu 32(%rdi), %xmm2 + pxor 32(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(48bytesin256) + + movdqu 48(%rdi), %xmm2 + pxor 48(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(64bytesin256) + cmp $32, %rdx + jb L(less32bytesin64) + + movdqu 64(%rdi), %xmm2 + pxor 64(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(80bytesin256) + + movdqu 80(%rdi), %xmm2 + pxor 80(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(96bytesin256) + sub $32, %rdx + add $32, %rdi + add $32, %rsi +L(less32bytesin64): + add $64, %rdi + add $64, %rsi + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + +L(128bytesormore): + cmp $512, %rdx + ja L(512bytesormore) + cmp $256, %rdx + ja L(less512bytes) +L(less256bytes): + sub $128, %rdx + + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqu 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + + movdqu 32(%rdi), %xmm2 + pxor 32(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(48bytesin256) + + movdqu 48(%rdi), %xmm2 + pxor 48(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(64bytesin256) + + movdqu 64(%rdi), %xmm2 + pxor 64(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(80bytesin256) + + movdqu 80(%rdi), %xmm2 + pxor 80(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(96bytesin256) + + movdqu 96(%rdi), %xmm2 + pxor 96(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(112bytesin256) + + movdqu 112(%rdi), %xmm2 + pxor 112(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(128bytesin256) + + add $128, %rsi + add $128, %rdi + + cmp $64, %rdx + jae L(less128bytes) + + cmp $32, %rdx + jb L(less32bytesin128) + + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqu 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + sub $32, %rdx + add $32, %rdi + add $32, %rsi +L(less32bytesin128): + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + +L(less512bytes): + sub $256, %rdx + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqu 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + + movdqu 32(%rdi), %xmm2 + pxor 32(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(48bytesin256) + + movdqu 48(%rdi), %xmm2 + pxor 48(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(64bytesin256) + + movdqu 64(%rdi), %xmm2 + pxor 64(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(80bytesin256) + + movdqu 80(%rdi), %xmm2 + pxor 80(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(96bytesin256) + + movdqu 96(%rdi), %xmm2 + pxor 96(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(112bytesin256) + + movdqu 112(%rdi), %xmm2 + pxor 112(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(128bytesin256) + + movdqu 128(%rdi), %xmm2 + pxor 128(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(144bytesin256) + + movdqu 144(%rdi), %xmm2 + pxor 144(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(160bytesin256) + + movdqu 160(%rdi), %xmm2 + pxor 160(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(176bytesin256) + + movdqu 176(%rdi), %xmm2 + pxor 176(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(192bytesin256) + + movdqu 192(%rdi), %xmm2 + pxor 192(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(208bytesin256) + + movdqu 208(%rdi), %xmm2 + pxor 208(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(224bytesin256) + + movdqu 224(%rdi), %xmm2 + pxor 224(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(240bytesin256) + + movdqu 240(%rdi), %xmm2 + pxor 240(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(256bytesin256) + + add $256, %rsi + add $256, %rdi + + cmp $128, %rdx + jae L(less256bytes) + + cmp $64, %rdx + jae L(less128bytes) + + cmp $32, %rdx + jb L(less32bytesin256) + + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqu 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + sub $32, %rdx + add $32, %rdi + add $32, %rsi +L(less32bytesin256): + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + + ALIGN (4) +L(512bytesormore): +#ifdef DATA_CACHE_SIZE_HALF + mov $DATA_CACHE_SIZE_HALF, %r8 +#else + mov __x86_64_data_cache_size_half(%rip), %r8 +#endif + mov %r8, %r9 + shr $1, %r8 + add %r9, %r8 + cmp %r8, %rdx + ja L(L2_L3_cache_unaglined) + sub $64, %rdx + ALIGN (4) +L(64bytesormore_loop): + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + movdqa %xmm2, %xmm1 + + movdqu 16(%rdi), %xmm3 + pxor 16(%rsi), %xmm3 + por %xmm3, %xmm1 + + movdqu 32(%rdi), %xmm4 + pxor 32(%rsi), %xmm4 + por %xmm4, %xmm1 + + movdqu 48(%rdi), %xmm5 + pxor 48(%rsi), %xmm5 + por %xmm5, %xmm1 + + ptest %xmm1, %xmm0 + jnc L(64bytesormore_loop_end) + add $64, %rsi + add $64, %rdi + sub $64, %rdx + jae L(64bytesormore_loop) + + add $64, %rdx + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + +L(L2_L3_cache_unaglined): + sub $64, %rdx + ALIGN (4) +L(L2_L3_unaligned_128bytes_loop): + prefetchnta 0x1c0(%rdi) + prefetchnta 0x1c0(%rsi) + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + movdqa %xmm2, %xmm1 + + movdqu 16(%rdi), %xmm3 + pxor 16(%rsi), %xmm3 + por %xmm3, %xmm1 + + movdqu 32(%rdi), %xmm4 + pxor 32(%rsi), %xmm4 + por %xmm4, %xmm1 + + movdqu 48(%rdi), %xmm5 + pxor 48(%rsi), %xmm5 + por %xmm5, %xmm1 + + ptest %xmm1, %xmm0 + jnc L(64bytesormore_loop_end) + add $64, %rsi + add $64, %rdi + sub $64, %rdx + jae L(L2_L3_unaligned_128bytes_loop) + + add $64, %rdx + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + +/* + * This case is for machines which are sensitive for unaligned instructions. + */ + ALIGN (4) +L(2aligned): + cmp $128, %rdx + ja L(128bytesormorein2aligned) +L(less128bytesin2aligned): + sub $64, %rdx + + movdqa (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqa 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + + movdqa 32(%rdi), %xmm2 + pxor 32(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(48bytesin256) + + movdqa 48(%rdi), %xmm2 + pxor 48(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(64bytesin256) + cmp $32, %rdx + jb L(less32bytesin64in2alinged) + + movdqa 64(%rdi), %xmm2 + pxor 64(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(80bytesin256) + + movdqa 80(%rdi), %xmm2 + pxor 80(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(96bytesin256) + sub $32, %rdx + add $32, %rdi + add $32, %rsi +L(less32bytesin64in2alinged): + add $64, %rdi + add $64, %rsi + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + + ALIGN (4) +L(128bytesormorein2aligned): + cmp $512, %rdx + ja L(512bytesormorein2aligned) + cmp $256, %rdx + ja L(256bytesormorein2aligned) +L(less256bytesin2alinged): + sub $128, %rdx + + movdqa (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqa 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + + movdqa 32(%rdi), %xmm2 + pxor 32(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(48bytesin256) + + movdqa 48(%rdi), %xmm2 + pxor 48(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(64bytesin256) + + movdqa 64(%rdi), %xmm2 + pxor 64(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(80bytesin256) + + movdqa 80(%rdi), %xmm2 + pxor 80(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(96bytesin256) + + movdqa 96(%rdi), %xmm2 + pxor 96(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(112bytesin256) + + movdqa 112(%rdi), %xmm2 + pxor 112(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(128bytesin256) + + add $128, %rsi + add $128, %rdi + + cmp $64, %rdx + jae L(less128bytesin2aligned) + + cmp $32, %rdx + jb L(less32bytesin128in2aligned) + + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqu 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + sub $32, %rdx + add $32, %rdi + add $32, %rsi +L(less32bytesin128in2aligned): + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + + ALIGN (4) +L(256bytesormorein2aligned): + + sub $256, %rdx + movdqa (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqa 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + + movdqa 32(%rdi), %xmm2 + pxor 32(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(48bytesin256) + + movdqa 48(%rdi), %xmm2 + pxor 48(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(64bytesin256) + + movdqa 64(%rdi), %xmm2 + pxor 64(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(80bytesin256) + + movdqa 80(%rdi), %xmm2 + pxor 80(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(96bytesin256) + + movdqa 96(%rdi), %xmm2 + pxor 96(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(112bytesin256) + + movdqa 112(%rdi), %xmm2 + pxor 112(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(128bytesin256) + + movdqa 128(%rdi), %xmm2 + pxor 128(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(144bytesin256) + + movdqa 144(%rdi), %xmm2 + pxor 144(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(160bytesin256) + + movdqa 160(%rdi), %xmm2 + pxor 160(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(176bytesin256) + + movdqa 176(%rdi), %xmm2 + pxor 176(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(192bytesin256) + + movdqa 192(%rdi), %xmm2 + pxor 192(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(208bytesin256) + + movdqa 208(%rdi), %xmm2 + pxor 208(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(224bytesin256) + + movdqa 224(%rdi), %xmm2 + pxor 224(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(240bytesin256) + + movdqa 240(%rdi), %xmm2 + pxor 240(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(256bytesin256) + + add $256, %rsi + add $256, %rdi + + cmp $128, %rdx + jae L(less256bytesin2alinged) + + cmp $64, %rdx + jae L(less128bytesin2aligned) + + cmp $32, %rdx + jb L(less32bytesin256in2alinged) + + movdqa (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqa 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + sub $32, %rdx + add $32, %rdi + add $32, %rsi +L(less32bytesin256in2alinged): + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + + ALIGN (4) +L(512bytesormorein2aligned): +#ifdef DATA_CACHE_SIZE_HALF + mov $DATA_CACHE_SIZE_HALF, %r8 +#else + mov __x86_64_data_cache_size_half(%rip), %r8 +#endif + mov %r8, %r9 + shr $1, %r8 + add %r9, %r8 + cmp %r8, %rdx + ja L(L2_L3_cache_aglined) + + sub $64, %rdx + ALIGN (4) +L(64bytesormore_loopin2aligned): + movdqa (%rdi), %xmm2 + pxor (%rsi), %xmm2 + movdqa %xmm2, %xmm1 + + movdqa 16(%rdi), %xmm3 + pxor 16(%rsi), %xmm3 + por %xmm3, %xmm1 + + movdqa 32(%rdi), %xmm4 + pxor 32(%rsi), %xmm4 + por %xmm4, %xmm1 + + movdqa 48(%rdi), %xmm5 + pxor 48(%rsi), %xmm5 + por %xmm5, %xmm1 + + ptest %xmm1, %xmm0 + jnc L(64bytesormore_loop_end) + add $64, %rsi + add $64, %rdi + sub $64, %rdx + jae L(64bytesormore_loopin2aligned) + + add $64, %rdx + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) +L(L2_L3_cache_aglined): + sub $64, %rdx + ALIGN (4) +L(L2_L3_aligned_128bytes_loop): + prefetchnta 0x1c0(%rdi) + prefetchnta 0x1c0(%rsi) + movdqa (%rdi), %xmm2 + pxor (%rsi), %xmm2 + movdqa %xmm2, %xmm1 + + movdqa 16(%rdi), %xmm3 + pxor 16(%rsi), %xmm3 + por %xmm3, %xmm1 + + movdqa 32(%rdi), %xmm4 + pxor 32(%rsi), %xmm4 + por %xmm4, %xmm1 + + movdqa 48(%rdi), %xmm5 + pxor 48(%rsi), %xmm5 + por %xmm5, %xmm1 + + ptest %xmm1, %xmm0 + jnc L(64bytesormore_loop_end) + add $64, %rsi + add $64, %rdi + sub $64, %rdx + jae L(L2_L3_aligned_128bytes_loop) + + add $64, %rdx + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + + + ALIGN (4) +L(64bytesormore_loop_end): + add $16, %rdi + add $16, %rsi + ptest %xmm2, %xmm0 + jnc L(16bytes) + + add $16, %rdi + add $16, %rsi + ptest %xmm3, %xmm0 + jnc L(16bytes) + + add $16, %rdi + add $16, %rsi + ptest %xmm4, %xmm0 + jnc L(16bytes) + + add $16, %rdi + add $16, %rsi + jmp L(16bytes) + +L(256bytesin256): + add $256, %rdi + add $256, %rsi + jmp L(16bytes) +L(240bytesin256): + add $240, %rdi + add $240, %rsi + jmp L(16bytes) +L(224bytesin256): + add $224, %rdi + add $224, %rsi + jmp L(16bytes) +L(208bytesin256): + add $208, %rdi + add $208, %rsi + jmp L(16bytes) +L(192bytesin256): + add $192, %rdi + add $192, %rsi + jmp L(16bytes) +L(176bytesin256): + add $176, %rdi + add $176, %rsi + jmp L(16bytes) +L(160bytesin256): + add $160, %rdi + add $160, %rsi + jmp L(16bytes) +L(144bytesin256): + add $144, %rdi + add $144, %rsi + jmp L(16bytes) +L(128bytesin256): + add $128, %rdi + add $128, %rsi + jmp L(16bytes) +L(112bytesin256): + add $112, %rdi + add $112, %rsi + jmp L(16bytes) +L(96bytesin256): + add $96, %rdi + add $96, %rsi + jmp L(16bytes) +L(80bytesin256): + add $80, %rdi + add $80, %rsi + jmp L(16bytes) +L(64bytesin256): + add $64, %rdi + add $64, %rsi + jmp L(16bytes) +L(48bytesin256): + add $16, %rdi + add $16, %rsi +L(32bytesin256): + add $16, %rdi + add $16, %rsi +L(16bytesin256): + add $16, %rdi + add $16, %rsi +L(16bytes): + mov -16(%rdi), %rax + mov -16(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) +L(8bytes): + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(12bytes): + mov -12(%rdi), %rax + mov -12(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) +L(4bytes): + mov -4(%rsi), %ecx + mov -4(%rdi), %eax + cmp %eax, %ecx + jne L(diffin4bytes) +L(0bytes): + xor %eax, %eax + ret + + ALIGN (4) +L(65bytes): + movdqu -65(%rdi), %xmm1 + movdqu -65(%rsi), %xmm2 + mov $-65, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(49bytes): + movdqu -49(%rdi), %xmm1 + movdqu -49(%rsi), %xmm2 + mov $-49, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(33bytes): + movdqu -33(%rdi), %xmm1 + movdqu -33(%rsi), %xmm2 + mov $-33, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(17bytes): + mov -17(%rdi), %rax + mov -17(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) +L(9bytes): + mov -9(%rdi), %rax + mov -9(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + movzbl -1(%rdi), %eax + movzbl -1(%rsi), %edx + sub %edx, %eax + ret + + ALIGN (4) +L(13bytes): + mov -13(%rdi), %rax + mov -13(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(5bytes): + mov -5(%rdi), %eax + mov -5(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin4bytes) + movzbl -1(%rdi), %eax + movzbl -1(%rsi), %edx + sub %edx, %eax + ret + + ALIGN (4) +L(66bytes): + movdqu -66(%rdi), %xmm1 + movdqu -66(%rsi), %xmm2 + mov $-66, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(50bytes): + movdqu -50(%rdi), %xmm1 + movdqu -50(%rsi), %xmm2 + mov $-50, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(34bytes): + movdqu -34(%rdi), %xmm1 + movdqu -34(%rsi), %xmm2 + mov $-34, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(18bytes): + mov -18(%rdi), %rax + mov -18(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) +L(10bytes): + mov -10(%rdi), %rax + mov -10(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + movzwl -2(%rdi), %eax + movzwl -2(%rsi), %ecx + cmp %cl, %al + jne L(end) + and $0xffff, %eax + and $0xffff, %ecx + sub %ecx, %eax + ret + + ALIGN (4) +L(14bytes): + mov -14(%rdi), %rax + mov -14(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(6bytes): + mov -6(%rdi), %eax + mov -6(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin4bytes) +L(2bytes): + movzwl -2(%rsi), %ecx + movzwl -2(%rdi), %eax + cmp %cl, %al + jne L(end) + and $0xffff, %eax + and $0xffff, %ecx + sub %ecx, %eax + ret + + ALIGN (4) +L(67bytes): + movdqu -67(%rdi), %xmm2 + movdqu -67(%rsi), %xmm1 + mov $-67, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(51bytes): + movdqu -51(%rdi), %xmm2 + movdqu -51(%rsi), %xmm1 + mov $-51, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(35bytes): + movdqu -35(%rsi), %xmm1 + movdqu -35(%rdi), %xmm2 + mov $-35, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(19bytes): + mov -19(%rdi), %rax + mov -19(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) +L(11bytes): + mov -11(%rdi), %rax + mov -11(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + mov -4(%rdi), %eax + mov -4(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin4bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(15bytes): + mov -15(%rdi), %rax + mov -15(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(7bytes): + mov -7(%rdi), %eax + mov -7(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin4bytes) + mov -4(%rdi), %eax + mov -4(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin4bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(3bytes): + movzwl -3(%rdi), %eax + movzwl -3(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin2bytes) +L(1bytes): + movzbl -1(%rdi), %eax + movzbl -1(%rsi), %ecx + sub %ecx, %eax + ret + + ALIGN (4) +L(68bytes): + movdqu -68(%rdi), %xmm2 + movdqu -68(%rsi), %xmm1 + mov $-68, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(52bytes): + movdqu -52(%rdi), %xmm2 + movdqu -52(%rsi), %xmm1 + mov $-52, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(36bytes): + movdqu -36(%rdi), %xmm2 + movdqu -36(%rsi), %xmm1 + mov $-36, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(20bytes): + movdqu -20(%rdi), %xmm2 + movdqu -20(%rsi), %xmm1 + mov $-20, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -4(%rdi), %eax + mov -4(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin4bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(69bytes): + movdqu -69(%rsi), %xmm1 + movdqu -69(%rdi), %xmm2 + mov $-69, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(53bytes): + movdqu -53(%rsi), %xmm1 + movdqu -53(%rdi), %xmm2 + mov $-53, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(37bytes): + movdqu -37(%rsi), %xmm1 + movdqu -37(%rdi), %xmm2 + mov $-37, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(21bytes): + movdqu -21(%rsi), %xmm1 + movdqu -21(%rdi), %xmm2 + mov $-21, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(70bytes): + movdqu -70(%rsi), %xmm1 + movdqu -70(%rdi), %xmm2 + mov $-70, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(54bytes): + movdqu -54(%rsi), %xmm1 + movdqu -54(%rdi), %xmm2 + mov $-54, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(38bytes): + movdqu -38(%rsi), %xmm1 + movdqu -38(%rdi), %xmm2 + mov $-38, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(22bytes): + movdqu -22(%rsi), %xmm1 + movdqu -22(%rdi), %xmm2 + mov $-22, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(71bytes): + movdqu -71(%rsi), %xmm1 + movdqu -71(%rdi), %xmm2 + mov $-71, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(55bytes): + movdqu -55(%rdi), %xmm2 + movdqu -55(%rsi), %xmm1 + mov $-55, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(39bytes): + movdqu -39(%rdi), %xmm2 + movdqu -39(%rsi), %xmm1 + mov $-39, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(23bytes): + movdqu -23(%rdi), %xmm2 + movdqu -23(%rsi), %xmm1 + mov $-23, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(72bytes): + movdqu -72(%rsi), %xmm1 + movdqu -72(%rdi), %xmm2 + mov $-72, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(56bytes): + movdqu -56(%rdi), %xmm2 + movdqu -56(%rsi), %xmm1 + mov $-56, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(40bytes): + movdqu -40(%rdi), %xmm2 + movdqu -40(%rsi), %xmm1 + mov $-40, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(24bytes): + movdqu -24(%rdi), %xmm2 + movdqu -24(%rsi), %xmm1 + mov $-24, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(73bytes): + movdqu -73(%rsi), %xmm1 + movdqu -73(%rdi), %xmm2 + mov $-73, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(57bytes): + movdqu -57(%rdi), %xmm2 + movdqu -57(%rsi), %xmm1 + mov $-57, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(41bytes): + movdqu -41(%rdi), %xmm2 + movdqu -41(%rsi), %xmm1 + mov $-41, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(25bytes): + movdqu -25(%rdi), %xmm2 + movdqu -25(%rsi), %xmm1 + mov $-25, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -9(%rdi), %rax + mov -9(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + movzbl -1(%rdi), %eax + movzbl -1(%rsi), %ecx + sub %ecx, %eax + ret + + ALIGN (4) +L(74bytes): + movdqu -74(%rsi), %xmm1 + movdqu -74(%rdi), %xmm2 + mov $-74, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(58bytes): + movdqu -58(%rdi), %xmm2 + movdqu -58(%rsi), %xmm1 + mov $-58, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(42bytes): + movdqu -42(%rdi), %xmm2 + movdqu -42(%rsi), %xmm1 + mov $-42, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(26bytes): + movdqu -26(%rdi), %xmm2 + movdqu -26(%rsi), %xmm1 + mov $-26, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -10(%rdi), %rax + mov -10(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + movzwl -2(%rdi), %eax + movzwl -2(%rsi), %ecx + jmp L(diffin2bytes) + + ALIGN (4) +L(75bytes): + movdqu -75(%rsi), %xmm1 + movdqu -75(%rdi), %xmm2 + mov $-75, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(59bytes): + movdqu -59(%rdi), %xmm2 + movdqu -59(%rsi), %xmm1 + mov $-59, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(43bytes): + movdqu -43(%rdi), %xmm2 + movdqu -43(%rsi), %xmm1 + mov $-43, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(27bytes): + movdqu -27(%rdi), %xmm2 + movdqu -27(%rsi), %xmm1 + mov $-27, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -11(%rdi), %rax + mov -11(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + mov -4(%rdi), %eax + mov -4(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin4bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(76bytes): + movdqu -76(%rsi), %xmm1 + movdqu -76(%rdi), %xmm2 + mov $-76, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(60bytes): + movdqu -60(%rdi), %xmm2 + movdqu -60(%rsi), %xmm1 + mov $-60, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(44bytes): + movdqu -44(%rdi), %xmm2 + movdqu -44(%rsi), %xmm1 + mov $-44, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(28bytes): + movdqu -28(%rdi), %xmm2 + movdqu -28(%rsi), %xmm1 + mov $-28, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -12(%rdi), %rax + mov -12(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + mov -4(%rdi), %eax + mov -4(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin4bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(77bytes): + movdqu -77(%rsi), %xmm1 + movdqu -77(%rdi), %xmm2 + mov $-77, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(61bytes): + movdqu -61(%rdi), %xmm2 + movdqu -61(%rsi), %xmm1 + mov $-61, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(45bytes): + movdqu -45(%rdi), %xmm2 + movdqu -45(%rsi), %xmm1 + mov $-45, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(29bytes): + movdqu -29(%rdi), %xmm2 + movdqu -29(%rsi), %xmm1 + mov $-29, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -13(%rdi), %rax + mov -13(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(78bytes): + movdqu -78(%rsi), %xmm1 + movdqu -78(%rdi), %xmm2 + mov $-78, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(62bytes): + movdqu -62(%rdi), %xmm2 + movdqu -62(%rsi), %xmm1 + mov $-62, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(46bytes): + movdqu -46(%rdi), %xmm2 + movdqu -46(%rsi), %xmm1 + mov $-46, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(30bytes): + movdqu -30(%rdi), %xmm2 + movdqu -30(%rsi), %xmm1 + mov $-30, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -14(%rdi), %rax + mov -14(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(79bytes): + movdqu -79(%rsi), %xmm1 + movdqu -79(%rdi), %xmm2 + mov $-79, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(63bytes): + movdqu -63(%rdi), %xmm2 + movdqu -63(%rsi), %xmm1 + mov $-63, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(47bytes): + movdqu -47(%rdi), %xmm2 + movdqu -47(%rsi), %xmm1 + mov $-47, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(31bytes): + movdqu -31(%rdi), %xmm2 + movdqu -31(%rsi), %xmm1 + mov $-31, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -15(%rdi), %rax + mov -15(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(64bytes): + movdqu -64(%rdi), %xmm2 + movdqu -64(%rsi), %xmm1 + mov $-64, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(48bytes): + movdqu -48(%rdi), %xmm2 + movdqu -48(%rsi), %xmm1 + mov $-48, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(32bytes): + movdqu -32(%rdi), %xmm2 + movdqu -32(%rsi), %xmm1 + mov $-32, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -16(%rdi), %rax + mov -16(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + +/* + * Aligned 8 bytes to avoid 2 branch "taken" in one 16 alinged code block. + */ + ALIGN (3) +L(less16bytes): + movsbq %dl, %rdx + mov (%rsi, %rdx), %rcx + mov (%rdi, %rdx), %rax + cmp %rax, %rcx + jne L(diffin8bytes) + mov 8(%rsi, %rdx), %rcx + mov 8(%rdi, %rdx), %rax +L(diffin8bytes): + cmp %eax, %ecx + jne L(diffin4bytes) + shr $32, %rcx + shr $32, %rax +L(diffin4bytes): + cmp %cx, %ax + jne L(diffin2bytes) + shr $16, %ecx + shr $16, %eax +L(diffin2bytes): + cmp %cl, %al + jne L(end) + and $0xffff, %eax + and $0xffff, %ecx + sub %ecx, %eax + ret + + ALIGN (4) +L(end): + and $0xff, %eax + and $0xff, %ecx + sub %ecx, %eax + ret + +END (MEMCMP) + + .section .rodata.sse4.1,"a",@progbits + ALIGN (3) +L(table_64bytes): + .int JMPTBL (L(0bytes), L(table_64bytes)) + .int JMPTBL (L(1bytes), L(table_64bytes)) + .int JMPTBL (L(2bytes), L(table_64bytes)) + .int JMPTBL (L(3bytes), L(table_64bytes)) + .int JMPTBL (L(4bytes), L(table_64bytes)) + .int JMPTBL (L(5bytes), L(table_64bytes)) + .int JMPTBL (L(6bytes), L(table_64bytes)) + .int JMPTBL (L(7bytes), L(table_64bytes)) + .int JMPTBL (L(8bytes), L(table_64bytes)) + .int JMPTBL (L(9bytes), L(table_64bytes)) + .int JMPTBL (L(10bytes), L(table_64bytes)) + .int JMPTBL (L(11bytes), L(table_64bytes)) + .int JMPTBL (L(12bytes), L(table_64bytes)) + .int JMPTBL (L(13bytes), L(table_64bytes)) + .int JMPTBL (L(14bytes), L(table_64bytes)) + .int JMPTBL (L(15bytes), L(table_64bytes)) + .int JMPTBL (L(16bytes), L(table_64bytes)) + .int JMPTBL (L(17bytes), L(table_64bytes)) + .int JMPTBL (L(18bytes), L(table_64bytes)) + .int JMPTBL (L(19bytes), L(table_64bytes)) + .int JMPTBL (L(20bytes), L(table_64bytes)) + .int JMPTBL (L(21bytes), L(table_64bytes)) + .int JMPTBL (L(22bytes), L(table_64bytes)) + .int JMPTBL (L(23bytes), L(table_64bytes)) + .int JMPTBL (L(24bytes), L(table_64bytes)) + .int JMPTBL (L(25bytes), L(table_64bytes)) + .int JMPTBL (L(26bytes), L(table_64bytes)) + .int JMPTBL (L(27bytes), L(table_64bytes)) + .int JMPTBL (L(28bytes), L(table_64bytes)) + .int JMPTBL (L(29bytes), L(table_64bytes)) + .int JMPTBL (L(30bytes), L(table_64bytes)) + .int JMPTBL (L(31bytes), L(table_64bytes)) + .int JMPTBL (L(32bytes), L(table_64bytes)) + .int JMPTBL (L(33bytes), L(table_64bytes)) + .int JMPTBL (L(34bytes), L(table_64bytes)) + .int JMPTBL (L(35bytes), L(table_64bytes)) + .int JMPTBL (L(36bytes), L(table_64bytes)) + .int JMPTBL (L(37bytes), L(table_64bytes)) + .int JMPTBL (L(38bytes), L(table_64bytes)) + .int JMPTBL (L(39bytes), L(table_64bytes)) + .int JMPTBL (L(40bytes), L(table_64bytes)) + .int JMPTBL (L(41bytes), L(table_64bytes)) + .int JMPTBL (L(42bytes), L(table_64bytes)) + .int JMPTBL (L(43bytes), L(table_64bytes)) + .int JMPTBL (L(44bytes), L(table_64bytes)) + .int JMPTBL (L(45bytes), L(table_64bytes)) + .int JMPTBL (L(46bytes), L(table_64bytes)) + .int JMPTBL (L(47bytes), L(table_64bytes)) + .int JMPTBL (L(48bytes), L(table_64bytes)) + .int JMPTBL (L(49bytes), L(table_64bytes)) + .int JMPTBL (L(50bytes), L(table_64bytes)) + .int JMPTBL (L(51bytes), L(table_64bytes)) + .int JMPTBL (L(52bytes), L(table_64bytes)) + .int JMPTBL (L(53bytes), L(table_64bytes)) + .int JMPTBL (L(54bytes), L(table_64bytes)) + .int JMPTBL (L(55bytes), L(table_64bytes)) + .int JMPTBL (L(56bytes), L(table_64bytes)) + .int JMPTBL (L(57bytes), L(table_64bytes)) + .int JMPTBL (L(58bytes), L(table_64bytes)) + .int JMPTBL (L(59bytes), L(table_64bytes)) + .int JMPTBL (L(60bytes), L(table_64bytes)) + .int JMPTBL (L(61bytes), L(table_64bytes)) + .int JMPTBL (L(62bytes), L(table_64bytes)) + .int JMPTBL (L(63bytes), L(table_64bytes)) + .int JMPTBL (L(64bytes), L(table_64bytes)) + .int JMPTBL (L(65bytes), L(table_64bytes)) + .int JMPTBL (L(66bytes), L(table_64bytes)) + .int JMPTBL (L(67bytes), L(table_64bytes)) + .int JMPTBL (L(68bytes), L(table_64bytes)) + .int JMPTBL (L(69bytes), L(table_64bytes)) + .int JMPTBL (L(70bytes), L(table_64bytes)) + .int JMPTBL (L(71bytes), L(table_64bytes)) + .int JMPTBL (L(72bytes), L(table_64bytes)) + .int JMPTBL (L(73bytes), L(table_64bytes)) + .int JMPTBL (L(74bytes), L(table_64bytes)) + .int JMPTBL (L(75bytes), L(table_64bytes)) + .int JMPTBL (L(76bytes), L(table_64bytes)) + .int JMPTBL (L(77bytes), L(table_64bytes)) + .int JMPTBL (L(78bytes), L(table_64bytes)) + .int JMPTBL (L(79bytes), L(table_64bytes)) +#endif diff --git a/sysdeps/x86_64/multiarch/memcmp.S b/sysdeps/x86_64/multiarch/memcmp.S new file mode 100644 index 0000000000..301ab287f5 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memcmp.S @@ -0,0 +1,59 @@ +/* Multiple versions of memcmp + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in libc. */ +#ifndef NOT_IN_libc + .text +ENTRY(memcmp) + .type memcmp, @gnu_indirect_function + cmpl $0, KIND_OFFSET+__cpu_features(%rip) + jne 1f + call __init_cpu_features +1: leaq __memcmp_sse2(%rip), %rax + testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip) + jz 2f + leaq __memcmp_sse4_1(%rip), %rax +2: ret +END(memcmp) + +# undef ENTRY +# define ENTRY(name) \ + .type __memcmp_sse2, @function; \ + .p2align 4; \ + __memcmp_sse2: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __memcmp_sse2, .-__memcmp_sse2 + +# ifdef SHARED +# undef libc_hidden_builtin_def +/* IFUNC doesn't work with the hidden functions in shared library since + they will be called without setting up EBX needed for PLT which is + used by IFUNC. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_memcmp; __GI_memcmp = __memcmp_sse2 +# endif +#endif + +#include "../memcmp.S" diff --git a/sysdeps/x86_64/multiarch/rtld-memcmp.c b/sysdeps/x86_64/multiarch/rtld-memcmp.c new file mode 100644 index 0000000000..0f271356c2 --- /dev/null +++ b/sysdeps/x86_64/multiarch/rtld-memcmp.c @@ -0,0 +1 @@ +#include "../rtld-memcmp.c" |