author    Andreas Schwab <schwab@redhat.com>    2010-04-16 12:49:09 +0200
committer Andreas Schwab <schwab@redhat.com>    2010-04-16 12:49:09 +0200
commit    54627f3c15f4a6e2ff02e60afacdb5f8008d399a (patch)
tree      ca0a25e66c8e6d48c7db07e6152a0549c27a1916
parent    1ef17274decb5381a50a2146cb3444630f67b527 (diff)
parent    1cdb2151fbad6bff650e85a0476972881bbc027b (diff)
Merge remote branch 'origin/master' into fedora/master
-rw-r--r--  ChangeLog                                          |   66
-rw-r--r--  elf/dl-version.c                                   |   12
-rw-r--r--  elf/elf.h                                          |   24
-rw-r--r--  stdlib/tst-makecontext3.c                          |   62
-rw-r--r--  string/test-strncmp.c                              |   39
-rw-r--r--  sysdeps/i386/configure                             |   25
-rw-r--r--  sysdeps/i386/configure.in                          |   11
-rw-r--r--  sysdeps/i386/i686/multiarch/Makefile               |    6
-rw-r--r--  sysdeps/i386/i686/multiarch/Versions               |    5
-rw-r--r--  sysdeps/i386/i686/multiarch/s_fma-fma.c            |   30
-rw-r--r--  sysdeps/i386/i686/multiarch/s_fma.c                |   36
-rw-r--r--  sysdeps/i386/i686/multiarch/s_fmaf-fma.c           |   30
-rw-r--r--  sysdeps/i386/i686/multiarch/s_fmaf.c               |   36
-rw-r--r--  sysdeps/i386/i686/multiarch/strcmp-sse4.S          |    4
-rw-r--r--  sysdeps/i386/i686/multiarch/strcmp-ssse3.S         |   11
-rw-r--r--  sysdeps/unix/sysv/linux/s390/s390-32/makecontext.c |   64
-rw-r--r--  sysdeps/unix/sysv/linux/s390/s390-64/makecontext.c |   62
-rw-r--r--  sysdeps/x86_64/elf/configure                       |   26
-rw-r--r--  sysdeps/x86_64/elf/configure.in                    |   11
-rw-r--r--  sysdeps/x86_64/multiarch/Makefile                  |    2
-rw-r--r--  sysdeps/x86_64/multiarch/init-arch.h               |   38
-rw-r--r--  sysdeps/x86_64/multiarch/memcmp-sse4.S             | 1635
-rw-r--r--  sysdeps/x86_64/multiarch/memcmp.S                  |   59
-rw-r--r--  sysdeps/x86_64/multiarch/rtld-memcmp.c             |    1
24 files changed, 2133 insertions(+), 162 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index bed6dfea8c..c22e562774 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,69 @@
+2010-04-15 H.J. Lu <hongjiu.lu@intel.com>
+
+ * string/test-strncmp.c (check_result): New function.
+ (do_one_test): Use it.
+ (check1): New function.
+ (test_main): Use it.
+ * sysdeps/i386/i686/multiarch/strcmp-sse4.S (crosspage): Properly
+ update source and destination.
+ * sysdeps/i386/i686/multiarch/strcmp-ssse3.S (gobble_ashr_12):
+ Properly check and update counter.
+
+2010-04-14 H.J. Lu <hongjiu.lu@intel.com>
+
+ * sysdeps/x86_64/elf/configure.in: Move AVX test to ....
+ * sysdeps/i386/configure.in: ...here.
+ * sysdeps/i386/i686/multiarch/Makefile (libm-sysdep_routines): Define.
+ (CFLAGS-s_fma-fma.c): Define.
+ (CFLAGS-s_fmaf-fma.c): Define.
+ * sysdeps/i386/i686/multiarch/Versions: New file.
+ * sysdeps/i386/i686/multiarch/s_fma-fma.c: New file.
+ * sysdeps/i386/i686/multiarch/s_fma.c: New file.
+ * sysdeps/i386/i686/multiarch/s_fmaf-fma.c: New file.
+ * sysdeps/i386/i686/multiarch/s_fmaf.c: New file.
+
+ * sysdeps/x86_64/multiarch/memcmp-sse4.S: Check
+ DATA_CACHE_SIZE_HALF instead of SHARED_CACHE_SIZE_HALF.
+
+2010-04-14 Andreas Schwab <schwab@redhat.com>
+
+ * elf/dl-version.c (_dl_check_map_versions): Avoid index overflow
+ when dependencies are missing.
+
+2010-04-14 H.J. Lu <hongjiu.lu@intel.com>
+
+ * sysdeps/x86_64/multiarch/memcmp-sse4.S: Optimized for unaligned
+ data.
+
+2010-04-12 H.J. Lu <hongjiu.lu@intel.com>
+
+ * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
+ memcmp-sse4.
+ * sysdeps/x86_64/multiarch/memcmp-sse4.S: New file.
+ * sysdeps/x86_64/multiarch/memcmp.S: New file.
+ * sysdeps/x86_64/multiarch/rtld-memcmp.c: New file.
+
+2010-04-13 Ulrich Drepper <drepper@redhat.com>
+
+ * sysdeps/x86_64/multiarch/init-arch.h: Pretty printing.
+ Add SSE 4.1 macros.
+
+2010-04-10 Matt Fleming <matt@console-pimps.org>
+
+ * elf/elf.h: Add SH specific ELF header flags.
+
+2010-04-13 Andreas Schwab <schwab@redhat.com>
+
+ * sysdeps/unix/sysv/linux/s390/s390-32/makecontext.c: Fix setup of
+ overflow area.
+ * sysdeps/unix/sysv/linux/s390/s390-64/makecontext.c: Likewise.
+
+2010-04-12 Andreas Schwab <schwab@redhat.com>
+
+ * stdlib/tst-makecontext3.c (main): Initialize ucontext_t objects
+ only with getcontext. Test for unimplemented makecontext by
+ checking errno.
+
2010-04-09 Ulrich Drepper <drepper@redhat.com>
* nscd/aicache.c (addhstaiX): Correct passing memory to address
diff --git a/elf/dl-version.c b/elf/dl-version.c
index 9e881162a6..c59a6c3cd3 100644
--- a/elf/dl-version.c
+++ b/elf/dl-version.c
@@ -322,10 +322,14 @@ _dl_check_map_versions (struct link_map *map, int verbose, int trace_mode)
while (1)
{
ElfW(Half) ndx = aux->vna_other & 0x7fff;
- map->l_versions[ndx].hash = aux->vna_hash;
- map->l_versions[ndx].hidden = aux->vna_other & 0x8000;
- map->l_versions[ndx].name = &strtab[aux->vna_name];
- map->l_versions[ndx].filename = &strtab[ent->vn_file];
+ /* In trace mode, dependencies may be missing. */
+ if (__builtin_expect (ndx < map->l_nversions, 1))
+ {
+ map->l_versions[ndx].hash = aux->vna_hash;
+ map->l_versions[ndx].hidden = aux->vna_other & 0x8000;
+ map->l_versions[ndx].name = &strtab[aux->vna_name];
+ map->l_versions[ndx].filename = &strtab[ent->vn_file];
+ }
if (aux->vna_next == 0)
/* No more symbols. */
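A minimal standalone sketch of the pattern this hunk applies: an index read
from a dependency's version records is validated against the array bound
before it is used for a write, so a missing dependency (possible in trace
mode, per the new comment) cannot push the store past l_versions.  The names
below are illustrative, not glibc's internals.

    struct version_entry { unsigned int hash; const char *name; };

    void
    record_version (struct version_entry *tab, unsigned int ntab,
                    unsigned int ndx, unsigned int hash, const char *name)
    {
      /* Skip entries whose index is out of range instead of writing
         past the end of TAB.  */
      if (ndx < ntab)
        {
          tab[ndx].hash = hash;
          tab[ndx].name = name;
        }
    }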
diff --git a/elf/elf.h b/elf/elf.h
index 1efe359119..a9558a3ddd 100644
--- a/elf/elf.h
+++ b/elf/elf.h
@@ -2477,6 +2477,30 @@ typedef Elf32_Addr Elf32_Conflict;
/* SH specific declarations */
+/* Processor specific flags for the ELF header e_flags field. */
+#define EF_SH_MACH_MASK 0x1f
+#define EF_SH_UNKNOWN 0x0
+#define EF_SH1 0x1
+#define EF_SH2 0x2
+#define EF_SH3 0x3
+#define EF_SH_DSP 0x4
+#define EF_SH3_DSP 0x5
+#define EF_SH4AL_DSP 0x6
+#define EF_SH3E 0x8
+#define EF_SH4 0x9
+#define EF_SH2E 0xb
+#define EF_SH4A 0xc
+#define EF_SH2A 0xd
+#define EF_SH4_NOFPU 0x10
+#define EF_SH4A_NOFPU 0x11
+#define EF_SH4_NOMMU_NOFPU 0x12
+#define EF_SH2A_NOFPU 0x13
+#define EF_SH3_NOMMU 0x14
+#define EF_SH2A_SH4_NOFPU 0x15
+#define EF_SH2A_SH3_NOFPU 0x16
+#define EF_SH2A_SH4 0x17
+#define EF_SH2A_SH3E 0x18
+
/* SH relocs. */
#define R_SH_NONE 0
#define R_SH_DIR32 1
diff --git a/stdlib/tst-makecontext3.c b/stdlib/tst-makecontext3.c
index f127c6a579..a44169ae36 100644
--- a/stdlib/tst-makecontext3.c
+++ b/stdlib/tst-makecontext3.c
@@ -136,38 +136,42 @@ main (void)
exit (1);
}
- ctx[1] = ctx[0];
+ if (getcontext (&ctx[1]) != 0)
+ {
+ printf ("%s: getcontext: %m\n", __FUNCTION__);
+ exit (1);
+ }
+
ctx[1].uc_stack.ss_sp = st1;
ctx[1].uc_stack.ss_size = sizeof st1;
ctx[1].uc_link = &ctx[0];
- {
- ucontext_t tempctx = ctx[1];
- makecontext (&ctx[1], (void (*) (void)) f1, 33,
- 0x00000001 << flag, 0x00000004 << flag,
- 0x00000012 << flag, 0x00000048 << flag,
- 0x00000123 << flag, 0x0000048d << flag,
- 0x00001234 << flag, 0x000048d1 << flag,
- 0x00012345 << flag, 0x00048d15 << flag,
- 0x00123456 << flag, 0x0048d159 << flag,
- 0x01234567 << flag, 0x048d159e << flag,
- 0x12345678 << flag, 0x48d159e2 << flag,
- 0x23456789 << flag, 0x8d159e26 << flag,
- 0x3456789a << flag, 0xd159e26a << flag,
- 0x456789ab << flag, 0x159e26af << flag,
- 0x56789abc << flag, 0x59e26af3 << flag,
- 0x6789abcd << flag, 0x9e26af37 << flag,
- 0x789abcde << flag, 0xe26af37b << flag,
- 0x89abcdef << flag, 0x26af37bc << flag,
- 0x9abcdef0 << flag, 0x6af37bc3 << flag,
- 0xabcdef0f << flag);
-
- /* Without this check, a stub makecontext can make us spin forever. */
- if (memcmp (&tempctx, &ctx[1], sizeof ctx[1]) == 0)
- {
- puts ("makecontext was a no-op, presuming not implemented");
- return 0;
- }
- }
+ errno = 0;
+ makecontext (&ctx[1], (void (*) (void)) f1, 33,
+ 0x00000001 << flag, 0x00000004 << flag,
+ 0x00000012 << flag, 0x00000048 << flag,
+ 0x00000123 << flag, 0x0000048d << flag,
+ 0x00001234 << flag, 0x000048d1 << flag,
+ 0x00012345 << flag, 0x00048d15 << flag,
+ 0x00123456 << flag, 0x0048d159 << flag,
+ 0x01234567 << flag, 0x048d159e << flag,
+ 0x12345678 << flag, 0x48d159e2 << flag,
+ 0x23456789 << flag, 0x8d159e26 << flag,
+ 0x3456789a << flag, 0xd159e26a << flag,
+ 0x456789ab << flag, 0x159e26af << flag,
+ 0x56789abc << flag, 0x59e26af3 << flag,
+ 0x6789abcd << flag, 0x9e26af37 << flag,
+ 0x789abcde << flag, 0xe26af37b << flag,
+ 0x89abcdef << flag, 0x26af37bc << flag,
+ 0x9abcdef0 << flag, 0x6af37bc3 << flag,
+ 0xabcdef0f << flag);
+
+ /* Without this check, a stub makecontext can make us spin forever. */
+ if (errno == ENOSYS)
+ {
+ puts ("makecontext not implemented");
+ back_in_main = 1;
+ return 0;
+ }
/* Play some tricks with this context. */
if (++global == 1)
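A self-contained sketch of the detection idiom the test now uses:
makecontext returns void, so a stub implementation can only report
"unimplemented" through errno, and clearing errno before the call makes the
ENOSYS check unambiguous.  This is an illustrative program, not part of the
test suite.

    #include <errno.h>
    #include <stdio.h>
    #include <ucontext.h>

    static char st[16384];

    static void f (void) { }

    int
    main (void)
    {
      ucontext_t uc;
      if (getcontext (&uc) != 0)
        return 1;
      uc.uc_stack.ss_sp = st;
      uc.uc_stack.ss_size = sizeof st;
      uc.uc_link = NULL;
      errno = 0;
      makecontext (&uc, f, 0);
      if (errno == ENOSYS)
        puts ("makecontext not implemented");
      return 0;
    }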
diff --git a/string/test-strncmp.c b/string/test-strncmp.c
index 5adf0eb311..3687879c25 100644
--- a/string/test-strncmp.c
+++ b/string/test-strncmp.c
@@ -1,5 +1,5 @@
/* Test and measure strncmp functions.
- Copyright (C) 1999, 2002, 2003 Free Software Foundation, Inc.
+ Copyright (C) 1999, 2002, 2003, 2010 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Written by Jakub Jelinek <jakub@redhat.com>, 1999.
@@ -51,8 +51,8 @@ stupid_strncmp (const char *s1, const char *s2, size_t n)
return ret;
}
-static void
-do_one_test (impl_t *impl, const char *s1, const char *s2, size_t n,
+static int
+check_result (impl_t *impl, const char *s1, const char *s2, size_t n,
int exp_result)
{
int result = CALL (impl, s1, s2, n);
@@ -63,9 +63,19 @@ do_one_test (impl_t *impl, const char *s1, const char *s2, size_t n,
error (0, 0, "Wrong result in function %s %d %d", impl->name,
result, exp_result);
ret = 1;
- return;
+ return -1;
}
+ return 0;
+}
+
+static void
+do_one_test (impl_t *impl, const char *s1, const char *s2, size_t n,
+ int exp_result)
+{
+ if (check_result (impl, s1, s2, n, exp_result) < 0)
+ return;
+
if (HP_TIMING_AVAIL)
{
hp_timing_t start __attribute ((unused));
@@ -283,6 +293,25 @@ do_random_tests (void)
}
}
+static void
+check1 (void)
+{
+ char *s1 = (char *)(buf1 + 0xb2c);
+ char *s2 = (char *)(buf1 + 0xfd8);
+ size_t i;
+ int exp_result;
+
+ strcpy(s1, "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrs");
+ strcpy(s2, "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijkLMNOPQRSTUV");
+
+ for (i = 0; i < 80; i++)
+ {
+ exp_result = simple_strncmp (s1, s2, i);
+ FOR_EACH_IMPL (impl, 0)
+ check_result (impl, s1, s2, i, exp_result);
+ }
+}
+
int
test_main (void)
{
@@ -290,6 +319,8 @@ test_main (void)
test_init ();
+ check1 ();
+
printf ("%23s", "");
FOR_EACH_IMPL (impl, 0)
printf ("\t%s", impl->name);
diff --git a/sysdeps/i386/configure b/sysdeps/i386/configure
index 7814b3b313..21225cd9c9 100644
--- a/sysdeps/i386/configure
+++ b/sysdeps/i386/configure
@@ -656,3 +656,28 @@ fi
fi
{ $as_echo "$as_me:$LINENO: result: $libc_cv_as_i686" >&5
$as_echo "$libc_cv_as_i686" >&6; }
+
+{ $as_echo "$as_me:$LINENO: checking for AVX support" >&5
+$as_echo_n "checking for AVX support... " >&6; }
+if test "${libc_cv_cc_avx+set}" = set; then
+ $as_echo_n "(cached) " >&6
+else
+ if { ac_try='${CC-cc} -mavx -xc /dev/null -S -o /dev/null'
+ { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5
+ (eval $ac_try) 2>&5
+ ac_status=$?
+ $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); }; }; then
+ libc_cv_cc_avx=yes
+else
+ libc_cv_cc_avx=no
+fi
+fi
+{ $as_echo "$as_me:$LINENO: result: $libc_cv_cc_avx" >&5
+$as_echo "$libc_cv_cc_avx" >&6; }
+if test $libc_cv_cc_avx = yes; then
+ cat >>confdefs.h <<\_ACEOF
+#define HAVE_AVX_SUPPORT 1
+_ACEOF
+
+fi
diff --git a/sysdeps/i386/configure.in b/sysdeps/i386/configure.in
index 9fc7fa59fe..d8dd648f80 100644
--- a/sysdeps/i386/configure.in
+++ b/sysdeps/i386/configure.in
@@ -55,3 +55,14 @@ if AC_TRY_COMMAND([${CC-cc} -Wa,-mtune=i686 -xc /dev/null -S -o /dev/null]); the
else
libc_cv_as_i686=no
fi])
+
+dnl Check if -mavx works.
+AC_CACHE_CHECK(for AVX support, libc_cv_cc_avx, [dnl
+if AC_TRY_COMMAND([${CC-cc} -mavx -xc /dev/null -S -o /dev/null]); then
+ libc_cv_cc_avx=yes
+else
+ libc_cv_cc_avx=no
+fi])
+if test $libc_cv_cc_avx = yes; then
+ AC_DEFINE(HAVE_AVX_SUPPORT)
+fi
diff --git a/sysdeps/i386/i686/multiarch/Makefile b/sysdeps/i386/i686/multiarch/Makefile
index e8847d6fc4..124595068d 100644
--- a/sysdeps/i386/i686/multiarch/Makefile
+++ b/sysdeps/i386/i686/multiarch/Makefile
@@ -19,3 +19,9 @@ CFLAGS-strstr.c += -msse4
CFLAGS-strcasestr.c += -msse4
endif
endif
+
+ifeq (mathyes,$(subdir)$(config-cflags-avx))
+libm-sysdep_routines += s_fma-fma s_fmaf-fma
+CFLAGS-s_fma-fma.c += -mavx -mfpmath=sse
+CFLAGS-s_fmaf-fma.c += -mavx -mfpmath=sse
+endif
diff --git a/sysdeps/i386/i686/multiarch/Versions b/sysdeps/i386/i686/multiarch/Versions
new file mode 100644
index 0000000000..59b185ac8d
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/Versions
@@ -0,0 +1,5 @@
+libc {
+ GLIBC_PRIVATE {
+ __get_cpu_features;
+ }
+}
diff --git a/sysdeps/i386/i686/multiarch/s_fma-fma.c b/sysdeps/i386/i686/multiarch/s_fma-fma.c
new file mode 100644
index 0000000000..e6f77aec77
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/s_fma-fma.c
@@ -0,0 +1,30 @@
+/* FMA version of fma.
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <config.h>
+
+#ifdef HAVE_AVX_SUPPORT
+double
+__fma_fma (double x, double y, double z)
+{
+ asm ("vfmadd213sd %3, %2, %0" : "=x" (x) : "0" (x), "x" (y), "xm" (z));
+ return x;
+}
+#endif
diff --git a/sysdeps/i386/i686/multiarch/s_fma.c b/sysdeps/i386/i686/multiarch/s_fma.c
new file mode 100644
index 0000000000..d9291b0be8
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/s_fma.c
@@ -0,0 +1,36 @@
+/* Multiple versions of fma.
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <config.h>
+
+#ifdef HAVE_AVX_SUPPORT
+#include <math.h>
+#include <init-arch.h>
+
+extern double __fma_ia32 (double x, double y, double z) attribute_hidden;
+extern double __fma_fma (double x, double y, double z) attribute_hidden;
+
+libm_ifunc (__fma, HAS_FMA ? __fma_fma : __fma_ia32);
+weak_alias (__fma, fma)
+
+# define __fma __fma_ia32
+#endif
+
+#include <math/s_fma.c>
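libm_ifunc is glibc's internal wrapper around GNU indirect functions: the
dynamic linker runs a resolver once, at relocation time, and binds the
symbol to the implementation it returns, so HAS_FMA is evaluated only once
per process.  Below is a hedged standalone sketch of the same dispatch
using GCC's public ifunc attribute and __builtin_cpu_supports; the names
my_fma and resolve_fma are illustrative, and this is not glibc's
libm_ifunc macro.

    /* Build with GCC on x86: gcc -O2 fma-ifunc.c -c */
    static double
    fma_generic (double x, double y, double z)
    {
      return x * y + z;               /* Double rounding; fallback only.  */
    }

    static double
    fma_fma3 (double x, double y, double z)
    {
      return __builtin_fma (x, y, z); /* Correctly rounded fused op.  */
    }

    /* Resolver: runs once at load time and picks the implementation.  */
    static double (*resolve_fma (void)) (double, double, double)
    {
      __builtin_cpu_init ();
      return __builtin_cpu_supports ("fma") ? fma_fma3 : fma_generic;
    }

    double my_fma (double, double, double)
      __attribute__ ((ifunc ("resolve_fma")));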
diff --git a/sysdeps/i386/i686/multiarch/s_fmaf-fma.c b/sysdeps/i386/i686/multiarch/s_fmaf-fma.c
new file mode 100644
index 0000000000..887e9c3829
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/s_fmaf-fma.c
@@ -0,0 +1,30 @@
+/* FMA version of fmaf.
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <config.h>
+
+#ifdef HAVE_AVX_SUPPORT
+float
+__fmaf_fma (float x, float y, float z)
+{
+ asm ("vfmadd213ss %3, %2, %0" : "=x" (x) : "0" (x), "x" (y), "xm" (z));
+ return x;
+}
+#endif
diff --git a/sysdeps/i386/i686/multiarch/s_fmaf.c b/sysdeps/i386/i686/multiarch/s_fmaf.c
new file mode 100644
index 0000000000..4ea9be48ac
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/s_fmaf.c
@@ -0,0 +1,36 @@
+/* Multiple versions of fmaf.
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <config.h>
+
+#ifdef HAVE_AVX_SUPPORT
+#include <math.h>
+#include <init-arch.h>
+
+extern float __fmaf_ia32 (float x, float y, float z) attribute_hidden;
+extern float __fmaf_fma (float x, float y, float z) attribute_hidden;
+
+libm_ifunc (__fmaf, HAS_FMA ? __fmaf_fma : __fmaf_ia32);
+weak_alias (__fmaf, fmaf)
+
+# define __fmaf __fmaf_ia32
+#endif
+
+#include <math/s_fmaf.c>
diff --git a/sysdeps/i386/i686/multiarch/strcmp-sse4.S b/sysdeps/i386/i686/multiarch/strcmp-sse4.S
index 81d6ec66f7..0de0a113c0 100644
--- a/sysdeps/i386/i686/multiarch/strcmp-sse4.S
+++ b/sysdeps/i386/i686/multiarch/strcmp-sse4.S
@@ -223,8 +223,8 @@ L(crosspage):
inc %edx
cmp $15, %edx
jle L(crosspage)
- add $16, %edi
- add $16, %esi
+ add %edx, %edi
+ add %edx, %esi
jmp L(check_offset)
.p2align 4
diff --git a/sysdeps/i386/i686/multiarch/strcmp-ssse3.S b/sysdeps/i386/i686/multiarch/strcmp-ssse3.S
index 40994c05b1..a4de2259d2 100644
--- a/sysdeps/i386/i686/multiarch/strcmp-ssse3.S
+++ b/sysdeps/i386/i686/multiarch/strcmp-ssse3.S
@@ -1484,17 +1484,18 @@ L(gobble_ashr_12):
sub $0xffff, %esi
jnz L(exit)
+#ifdef USE_AS_STRNCMP
+ cmp $16, %ebp
+ lea -16(%ebp), %ebp
+ jbe L(more8byteseq)
+#endif
+
add $16, %ecx
movdqa %xmm4, %xmm3
add $16, %edi
jg L(nibble_ashr_12)
-#ifdef USE_AS_STRNCMP
- cmp $16, %ebp
- lea -16(%ebp), %ebp
- jbe L(more8byteseq)
-#endif
movdqa (%eax, %ecx), %xmm1
movdqa (%edx, %ecx), %xmm2
movdqa %xmm2, %xmm4
diff --git a/sysdeps/unix/sysv/linux/s390/s390-32/makecontext.c b/sysdeps/unix/sysv/linux/s390/s390-32/makecontext.c
index 94760e0c2b..0e309c3e29 100644
--- a/sysdeps/unix/sysv/linux/s390/s390-32/makecontext.c
+++ b/sysdeps/unix/sysv/linux/s390/s390-32/makecontext.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2001 Free Software Foundation, Inc.
+/* Copyright (C) 2001, 2010 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Martin Schwidefsky (schwidefsky@de.ibm.com).
@@ -28,15 +28,15 @@
double, complex and structure with sizes 0, 2, 4 or 8
won't work.
makecontext sets up a stack and the registers for the
- context. The stack looks like this:
- size offset
+ user context. The stack looks like this:
+ size offset
%r15 -> +-----------------------+
- 4 | back chain (zero) | 0
- 4 | reserved | 4
- 88 | save area for (*func) | 8
- +-----------------------+
- n | overflow parameters | 96
- +-----------------------+
+ 4 | back chain (zero) | 0
+ 4 | reserved | 4
+ 88 | save area for (*func) | 8
+ +-----------------------+
+ n | overflow parameters | 96
+ +-----------------------+
The registers are set up like this:
%r2-%r6: parameters 1 to 5
%r7 : (*func) pointer
@@ -54,27 +54,27 @@ void
__makecontext (ucontext_t *ucp, void (*func) (void), int argc, ...)
{
extern void __makecontext_ret (void);
- unsigned long *sp;
+ unsigned long int *sp;
va_list ap;
- int i;
- sp = (unsigned long *) (((unsigned long) ucp->uc_stack.ss_sp
- + ucp->uc_stack.ss_size) & -8L);
+ sp = (unsigned long int *) (((unsigned long int) ucp->uc_stack.ss_sp
+ + ucp->uc_stack.ss_size) & -8L);
/* Set the return address to trampoline. */
- ucp->uc_mcontext.gregs[14] = (long) __makecontext_ret;
+ ucp->uc_mcontext.gregs[14] = (long int) __makecontext_ret;
/* Set register parameters. */
va_start (ap, argc);
- for (i = 0; (i < argc) && (i < 5); i++)
- ucp->uc_mcontext.gregs[2+i] = va_arg (ap, long);
+ for (int i = 0; i < argc && i < 5; ++i)
+ ucp->uc_mcontext.gregs[2 + i] = va_arg (ap, long int);
/* The remaining arguments go to the overflow area. */
- if (argc > 5) {
- sp -= argc - 5;
- for (i = 5; i < argc; i++)
- sp[i] = va_arg(ap, long);
- }
+ if (argc > 5)
+ {
+ sp -= argc - 5;
+ for (int i = 5; i < argc; ++i)
+ sp[i - 5] = va_arg (ap, long int);
+ }
va_end (ap);
/* Make room for the save area and set the backchain. */
@@ -82,24 +82,24 @@ __makecontext (ucontext_t *ucp, void (*func) (void), int argc, ...)
*sp = 0;
/* Pass (*func) to __start_context in %r7. */
- ucp->uc_mcontext.gregs[7] = (long) func;
+ ucp->uc_mcontext.gregs[7] = (long int) func;
/* Pass ucp->uc_link to __start_context in %r8. */
- ucp->uc_mcontext.gregs[8] = (long) ucp->uc_link;
+ ucp->uc_mcontext.gregs[8] = (long int) ucp->uc_link;
/* Pass address of setcontext in %r9. */
- ucp->uc_mcontext.gregs[9] = (long) &setcontext;
+ ucp->uc_mcontext.gregs[9] = (long int) &setcontext;
/* Set stack pointer. */
- ucp->uc_mcontext.gregs[15] = (long) sp;
+ ucp->uc_mcontext.gregs[15] = (long int) sp;
}
-asm(".text\n"
- ".type __makecontext_ret,@function\n"
- "__makecontext_ret:\n"
- " basr %r14,%r7\n"
- " lr %r2,%r8\n"
- " br %r9\n"
- ".size __makecontext_ret, .-__makecontext_ret");
+asm (".text\n"
+ ".type __makecontext_ret,@function\n"
+ "__makecontext_ret:\n"
+ " basr %r14,%r7\n"
+ " lr %r2,%r8\n"
+ " br %r9\n"
+ ".size __makecontext_ret, .-__makecontext_ret");
weak_alias (__makecontext, makecontext)
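The substantive fix in this hunk (repeated for s390-64 below) is the
overflow-area indexing: after sp -= argc - 5, the extra arguments occupy
sp[0] through sp[argc - 6], so they must be stored at sp[i - 5]; the old
sp[i] wrote past the reserved slots.  A minimal illustration, with made-up
names rather than the s390 code:

    /* Copy arguments 5..argc-1 into a freshly reserved block below SP.  */
    long int *
    push_overflow_args (long int *sp, const long int *args, int argc)
    {
      if (argc > 5)
        {
          sp -= argc - 5;             /* Reserve argc - 5 slots.  */
          for (int i = 5; i < argc; ++i)
            sp[i - 5] = args[i];      /* sp[i] would overshoot.  */
        }
      return sp;
    }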
diff --git a/sysdeps/unix/sysv/linux/s390/s390-64/makecontext.c b/sysdeps/unix/sysv/linux/s390/s390-64/makecontext.c
index b08f1b4047..40ff3eefb4 100644
--- a/sysdeps/unix/sysv/linux/s390/s390-64/makecontext.c
+++ b/sysdeps/unix/sysv/linux/s390/s390-64/makecontext.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2001 Free Software Foundation, Inc.
+/* Copyright (C) 2001, 2010 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Martin Schwidefsky (schwidefsky@de.ibm.com).
@@ -29,14 +29,14 @@
won't work.
makecontext sets up a stack and the registers for the
user context. The stack looks like this:
- size offset
+ size offset
%r15 -> +-----------------------+
- 8 | back chain (zero) | 0
- 8 | reserved | 8
- 144 | save area for (*func) | 16
- +-----------------------+
- n | overflow parameters | 160
- +-----------------------+
+ 8 | back chain (zero) | 0
+ 8 | reserved | 8
+ 144 | save area for (*func) | 16
+ +-----------------------+
+ n | overflow parameters | 160
+ +-----------------------+
The registers are set up like this:
%r2-%r6: parameters 1 to 5
%r7 : (*func) pointer
@@ -54,27 +54,27 @@ void
__makecontext (ucontext_t *ucp, void (*func) (void), int argc, ...)
{
extern void __makecontext_ret (void);
- unsigned long *sp;
+ unsigned long int *sp;
va_list ap;
- int i;
- sp = (unsigned long *) (((unsigned long) ucp->uc_stack.ss_sp
- + ucp->uc_stack.ss_size) & -8L);
+ sp = (unsigned long int *) (((unsigned long int) ucp->uc_stack.ss_sp
+ + ucp->uc_stack.ss_size) & -8L);
/* Set the return address to trampoline. */
- ucp->uc_mcontext.gregs[14] = (long) __makecontext_ret;
+ ucp->uc_mcontext.gregs[14] = (long int) __makecontext_ret;
/* Set register parameters. */
va_start (ap, argc);
- for (i = 0; (i < argc) && (i < 5); i++)
- ucp->uc_mcontext.gregs[2+i] = va_arg (ap, long);
+ for (int i = 0; i < argc && i < 5; ++i)
+ ucp->uc_mcontext.gregs[2 + i] = va_arg (ap, long int);
/* The remaining arguments go to the overflow area. */
- if (argc > 5) {
- sp -= argc - 5;
- for (i = 5; i < argc; i++)
- sp[i] = va_arg(ap, long);
- }
+ if (argc > 5)
+ {
+ sp -= argc - 5;
+ for (int i = 5; i < argc; ++i)
+ sp[i - 5] = va_arg (ap, long int);
+ }
va_end (ap);
/* Make room for the save area and set the backchain. */
@@ -82,24 +82,24 @@ __makecontext (ucontext_t *ucp, void (*func) (void), int argc, ...)
*sp = 0;
/* Pass (*func) to __start_context in %r7. */
- ucp->uc_mcontext.gregs[7] = (long) func;
+ ucp->uc_mcontext.gregs[7] = (long int) func;
/* Pass ucp->uc_link to __start_context in %r8. */
- ucp->uc_mcontext.gregs[8] = (long) ucp->uc_link;
+ ucp->uc_mcontext.gregs[8] = (long int) ucp->uc_link;
/* Pass address of setcontext in %r9. */
- ucp->uc_mcontext.gregs[9] = (long) &setcontext;
+ ucp->uc_mcontext.gregs[9] = (long int) &setcontext;
/* Set stack pointer. */
- ucp->uc_mcontext.gregs[15] = (long) sp;
+ ucp->uc_mcontext.gregs[15] = (long int) sp;
}
-asm(".text\n"
- ".type __makecontext_ret,@function\n"
- "__makecontext_ret:\n"
- " basr %r14,%r7\n"
- " lgr %r2,%r8\n"
- " br %r9\n"
- ".size __makecontext_ret, .-__makecontext_ret");
+asm (".text\n"
+ ".type __makecontext_ret,@function\n"
+ "__makecontext_ret:\n"
+ " basr %r14,%r7\n"
+ " lgr %r2,%r8\n"
+ " br %r9\n"
+ ".size __makecontext_ret, .-__makecontext_ret");
weak_alias (__makecontext, makecontext)
diff --git a/sysdeps/x86_64/elf/configure b/sysdeps/x86_64/elf/configure
index 0b93b0424e..f722b9e600 100644
--- a/sysdeps/x86_64/elf/configure
+++ b/sysdeps/x86_64/elf/configure
@@ -46,29 +46,3 @@ fi
cat >>confdefs.h <<\_ACEOF
#define PI_STATIC_AND_HIDDEN 1
_ACEOF
-
-
-{ $as_echo "$as_me:$LINENO: checking for AVX support" >&5
-$as_echo_n "checking for AVX support... " >&6; }
-if test "${libc_cv_cc_avx+set}" = set; then
- $as_echo_n "(cached) " >&6
-else
- if { ac_try='${CC-cc} -mavx -xc /dev/null -S -o /dev/null'
- { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5
- (eval $ac_try) 2>&5
- ac_status=$?
- $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
- (exit $ac_status); }; }; then
- libc_cv_cc_avx=yes
-else
- libc_cv_cc_avx=no
-fi
-fi
-{ $as_echo "$as_me:$LINENO: result: $libc_cv_cc_avx" >&5
-$as_echo "$libc_cv_cc_avx" >&6; }
-if test $libc_cv_cc_avx = yes; then
- cat >>confdefs.h <<\_ACEOF
-#define HAVE_AVX_SUPPORT 1
-_ACEOF
-
-fi
diff --git a/sysdeps/x86_64/elf/configure.in b/sysdeps/x86_64/elf/configure.in
index 14d1875302..9cb59d009c 100644
--- a/sysdeps/x86_64/elf/configure.in
+++ b/sysdeps/x86_64/elf/configure.in
@@ -32,14 +32,3 @@ fi
dnl It is always possible to access static and hidden symbols in an
dnl position independent way.
AC_DEFINE(PI_STATIC_AND_HIDDEN)
-
-dnl Check if -mavx works.
-AC_CACHE_CHECK(for AVX support, libc_cv_cc_avx, [dnl
-if AC_TRY_COMMAND([${CC-cc} -mavx -xc /dev/null -S -o /dev/null]); then
- libc_cv_cc_avx=yes
-else
- libc_cv_cc_avx=no
-fi])
-if test $libc_cv_cc_avx = yes; then
- AC_DEFINE(HAVE_AVX_SUPPORT)
-fi
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 364e7bbbd2..c61cf70345 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -5,7 +5,7 @@ endif
ifeq ($(subdir),string)
sysdep_routines += stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
- strend-sse4
+ strend-sse4 memcmp-sse4
ifeq (yes,$(config-cflags-sse4))
sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c
CFLAGS-strcspn-c.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/init-arch.h b/sysdeps/x86_64/multiarch/init-arch.h
index 5c73813404..b2f2de3796 100644
--- a/sysdeps/x86_64/multiarch/init-arch.h
+++ b/sysdeps/x86_64/multiarch/init-arch.h
@@ -20,21 +20,23 @@
#ifdef __ASSEMBLER__
-#include <ifunc-defines.h>
+# include <ifunc-defines.h>
-#define bit_SSE2 (1 << 26)
-#define bit_SSSE3 (1 << 9)
-#define bit_SSE4_2 (1 << 20)
+# define bit_SSE2 (1 << 26)
+# define bit_SSSE3 (1 << 9)
+# define bit_SSE4_1 (1 << 19)
+# define bit_SSE4_2 (1 << 20)
-#define index_SSE2 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_EDX_OFFSET
-#define index_SSSE3 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
-#define index_SSE4_2 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
+# define index_SSE2 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_EDX_OFFSET
+# define index_SSSE3 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
+# define index_SSE4_1 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
+# define index_SSE4_2 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
#define index_Fast_Rep_String FEATURE_INDEX_1*FEATURE_SIZE
#else /* __ASSEMBLER__ */
-#include <sys/param.h>
+# include <sys/param.h>
enum
{
@@ -84,20 +86,22 @@ extern void __init_cpu_features (void) attribute_hidden;
extern const struct cpu_features *__get_cpu_features (void)
__attribute__ ((const));
-#ifndef NOT_IN_libc
-# define __get_cpu_features() (&__cpu_features)
-#endif
+# ifndef NOT_IN_libc
+# define __get_cpu_features() (&__cpu_features)
+# endif
-#define HAS_CPU_FEATURE(idx, reg, bit) \
+# define HAS_CPU_FEATURE(idx, reg, bit) \
((__get_cpu_features ()->cpuid[idx].reg & (1 << (bit))) != 0)
/* Following are the feature tests used throughout libc. */
-#define HAS_SSE2 HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, edx, 26)
-#define HAS_POPCOUNT HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 23)
-#define HAS_SSE4_2 HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 20)
-#define HAS_FMA HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 12)
+# define HAS_SSE2 HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, edx, 26)
+# define HAS_POPCOUNT HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 23)
+# define HAS_SSSE3 HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 9)
+# define HAS_SSE4_1 HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 19)
+# define HAS_SSE4_2 HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 20)
+# define HAS_FMA HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 12)
-#define index_Fast_Rep_String FEATURE_INDEX_1
+# define index_Fast_Rep_String FEATURE_INDEX_1
#endif /* __ASSEMBLER__ */
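The new bit_SSE4_1/index_SSE4_1 pair encodes where the feature lives: CPUID
leaf 1 reports SSE4.1 in bit 19 of ECX (SSE4.2 in bit 20, FMA in bit 12),
which is what HAS_SSE4_1 tests through the cached cpu_features data.  A
hedged standalone check using GCC's <cpuid.h> helper, outside glibc:

    #include <cpuid.h>
    #include <stdio.h>

    int
    main (void)
    {
      unsigned int eax, ebx, ecx, edx;
      /* Leaf 1: processor info and feature bits.  */
      if (!__get_cpuid (1, &eax, &ebx, &ecx, &edx))
        return 1;
      printf ("SSE4.1: %d\n", (ecx >> 19) & 1);
      printf ("SSE4.2: %d\n", (ecx >> 20) & 1);
      printf ("FMA:    %d\n", (ecx >> 12) & 1);
      return 0;
    }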
diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
new file mode 100644
index 0000000000..fc439bb013
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S
@@ -0,0 +1,1635 @@
+/* memcmp with SSE4.1
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#ifndef NOT_IN_libc
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#ifndef MEMCMP
+# define MEMCMP __memcmp_sse4_1
+#endif
+
+#ifndef ALIGN
+# define ALIGN(n) .p2align n
+#endif
+
+#define JMPTBL(I, B) (I - B)
+
+#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ lea TABLE(%rip), %r11; \
+ movslq (%r11, INDEX, SCALE), %rcx; \
+ add %r11, %rcx; \
+ jmp *%rcx; \
+ ud2
+
+ .section .text.sse4.1,"ax",@progbits
+ENTRY (MEMCMP)
+ pxor %xmm0, %xmm0
+ cmp $79, %rdx
+ ja L(79bytesormore)
+ cmp $1, %rdx
+ je L(firstbyte)
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+ ALIGN (4)
+L(firstbyte):
+ movzbl (%rdi), %eax
+ movzbl (%rsi), %ecx
+ sub %ecx, %eax
+ ret
+
+ ALIGN (4)
+L(79bytesormore):
+ movdqu (%rsi), %xmm1
+ movdqu (%rdi), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+ mov %rsi, %rcx
+ and $-16, %rsi
+ add $16, %rsi
+ sub %rsi, %rcx
+
+ sub %rcx, %rdi
+ add %rcx, %rdx
+ test $0xf, %rdi
+ jz L(2aligned)
+
+ cmp $128, %rdx
+ ja L(128bytesormore)
+L(less128bytes):
+ sub $64, %rdx
+
+ movdqu (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqu 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+
+ movdqu 32(%rdi), %xmm2
+ pxor 32(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(48bytesin256)
+
+ movdqu 48(%rdi), %xmm2
+ pxor 48(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(64bytesin256)
+ cmp $32, %rdx
+ jb L(less32bytesin64)
+
+ movdqu 64(%rdi), %xmm2
+ pxor 64(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(80bytesin256)
+
+ movdqu 80(%rdi), %xmm2
+ pxor 80(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(96bytesin256)
+ sub $32, %rdx
+ add $32, %rdi
+ add $32, %rsi
+L(less32bytesin64):
+ add $64, %rdi
+ add $64, %rsi
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+L(128bytesormore):
+ cmp $512, %rdx
+ ja L(512bytesormore)
+ cmp $256, %rdx
+ ja L(less512bytes)
+L(less256bytes):
+ sub $128, %rdx
+
+ movdqu (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqu 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+
+ movdqu 32(%rdi), %xmm2
+ pxor 32(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(48bytesin256)
+
+ movdqu 48(%rdi), %xmm2
+ pxor 48(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(64bytesin256)
+
+ movdqu 64(%rdi), %xmm2
+ pxor 64(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(80bytesin256)
+
+ movdqu 80(%rdi), %xmm2
+ pxor 80(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(96bytesin256)
+
+ movdqu 96(%rdi), %xmm2
+ pxor 96(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(112bytesin256)
+
+ movdqu 112(%rdi), %xmm2
+ pxor 112(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(128bytesin256)
+
+ add $128, %rsi
+ add $128, %rdi
+
+ cmp $64, %rdx
+ jae L(less128bytes)
+
+ cmp $32, %rdx
+ jb L(less32bytesin128)
+
+ movdqu (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqu 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+ sub $32, %rdx
+ add $32, %rdi
+ add $32, %rsi
+L(less32bytesin128):
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+L(less512bytes):
+ sub $256, %rdx
+ movdqu (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqu 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+
+ movdqu 32(%rdi), %xmm2
+ pxor 32(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(48bytesin256)
+
+ movdqu 48(%rdi), %xmm2
+ pxor 48(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(64bytesin256)
+
+ movdqu 64(%rdi), %xmm2
+ pxor 64(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(80bytesin256)
+
+ movdqu 80(%rdi), %xmm2
+ pxor 80(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(96bytesin256)
+
+ movdqu 96(%rdi), %xmm2
+ pxor 96(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(112bytesin256)
+
+ movdqu 112(%rdi), %xmm2
+ pxor 112(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(128bytesin256)
+
+ movdqu 128(%rdi), %xmm2
+ pxor 128(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(144bytesin256)
+
+ movdqu 144(%rdi), %xmm2
+ pxor 144(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(160bytesin256)
+
+ movdqu 160(%rdi), %xmm2
+ pxor 160(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(176bytesin256)
+
+ movdqu 176(%rdi), %xmm2
+ pxor 176(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(192bytesin256)
+
+ movdqu 192(%rdi), %xmm2
+ pxor 192(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(208bytesin256)
+
+ movdqu 208(%rdi), %xmm2
+ pxor 208(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(224bytesin256)
+
+ movdqu 224(%rdi), %xmm2
+ pxor 224(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(240bytesin256)
+
+ movdqu 240(%rdi), %xmm2
+ pxor 240(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(256bytesin256)
+
+ add $256, %rsi
+ add $256, %rdi
+
+ cmp $128, %rdx
+ jae L(less256bytes)
+
+ cmp $64, %rdx
+ jae L(less128bytes)
+
+ cmp $32, %rdx
+ jb L(less32bytesin256)
+
+ movdqu (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqu 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+ sub $32, %rdx
+ add $32, %rdi
+ add $32, %rsi
+L(less32bytesin256):
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+ ALIGN (4)
+L(512bytesormore):
+#ifdef DATA_CACHE_SIZE_HALF
+ mov $DATA_CACHE_SIZE_HALF, %r8
+#else
+ mov __x86_64_data_cache_size_half(%rip), %r8
+#endif
+ mov %r8, %r9
+ shr $1, %r8
+ add %r9, %r8
+ cmp %r8, %rdx
+ ja L(L2_L3_cache_unaglined)
+ sub $64, %rdx
+ ALIGN (4)
+L(64bytesormore_loop):
+ movdqu (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ movdqa %xmm2, %xmm1
+
+ movdqu 16(%rdi), %xmm3
+ pxor 16(%rsi), %xmm3
+ por %xmm3, %xmm1
+
+ movdqu 32(%rdi), %xmm4
+ pxor 32(%rsi), %xmm4
+ por %xmm4, %xmm1
+
+ movdqu 48(%rdi), %xmm5
+ pxor 48(%rsi), %xmm5
+ por %xmm5, %xmm1
+
+ ptest %xmm1, %xmm0
+ jnc L(64bytesormore_loop_end)
+ add $64, %rsi
+ add $64, %rdi
+ sub $64, %rdx
+ jae L(64bytesormore_loop)
+
+ add $64, %rdx
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+L(L2_L3_cache_unaglined):
+ sub $64, %rdx
+ ALIGN (4)
+L(L2_L3_unaligned_128bytes_loop):
+ prefetchnta 0x1c0(%rdi)
+ prefetchnta 0x1c0(%rsi)
+ movdqu (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ movdqa %xmm2, %xmm1
+
+ movdqu 16(%rdi), %xmm3
+ pxor 16(%rsi), %xmm3
+ por %xmm3, %xmm1
+
+ movdqu 32(%rdi), %xmm4
+ pxor 32(%rsi), %xmm4
+ por %xmm4, %xmm1
+
+ movdqu 48(%rdi), %xmm5
+ pxor 48(%rsi), %xmm5
+ por %xmm5, %xmm1
+
+ ptest %xmm1, %xmm0
+ jnc L(64bytesormore_loop_end)
+ add $64, %rsi
+ add $64, %rdi
+ sub $64, %rdx
+ jae L(L2_L3_unaligned_128bytes_loop)
+
+ add $64, %rdx
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+/*
+ * This case is for machines which are sensitive to unaligned instructions.
+ */
+ ALIGN (4)
+L(2aligned):
+ cmp $128, %rdx
+ ja L(128bytesormorein2aligned)
+L(less128bytesin2aligned):
+ sub $64, %rdx
+
+ movdqa (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqa 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+
+ movdqa 32(%rdi), %xmm2
+ pxor 32(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(48bytesin256)
+
+ movdqa 48(%rdi), %xmm2
+ pxor 48(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(64bytesin256)
+ cmp $32, %rdx
+ jb L(less32bytesin64in2alinged)
+
+ movdqa 64(%rdi), %xmm2
+ pxor 64(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(80bytesin256)
+
+ movdqa 80(%rdi), %xmm2
+ pxor 80(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(96bytesin256)
+ sub $32, %rdx
+ add $32, %rdi
+ add $32, %rsi
+L(less32bytesin64in2alinged):
+ add $64, %rdi
+ add $64, %rsi
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+ ALIGN (4)
+L(128bytesormorein2aligned):
+ cmp $512, %rdx
+ ja L(512bytesormorein2aligned)
+ cmp $256, %rdx
+ ja L(256bytesormorein2aligned)
+L(less256bytesin2alinged):
+ sub $128, %rdx
+
+ movdqa (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqa 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+
+ movdqa 32(%rdi), %xmm2
+ pxor 32(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(48bytesin256)
+
+ movdqa 48(%rdi), %xmm2
+ pxor 48(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(64bytesin256)
+
+ movdqa 64(%rdi), %xmm2
+ pxor 64(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(80bytesin256)
+
+ movdqa 80(%rdi), %xmm2
+ pxor 80(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(96bytesin256)
+
+ movdqa 96(%rdi), %xmm2
+ pxor 96(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(112bytesin256)
+
+ movdqa 112(%rdi), %xmm2
+ pxor 112(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(128bytesin256)
+
+ add $128, %rsi
+ add $128, %rdi
+
+ cmp $64, %rdx
+ jae L(less128bytesin2aligned)
+
+ cmp $32, %rdx
+ jb L(less32bytesin128in2aligned)
+
+ movdqu (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqu 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+ sub $32, %rdx
+ add $32, %rdi
+ add $32, %rsi
+L(less32bytesin128in2aligned):
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+ ALIGN (4)
+L(256bytesormorein2aligned):
+
+ sub $256, %rdx
+ movdqa (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqa 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+
+ movdqa 32(%rdi), %xmm2
+ pxor 32(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(48bytesin256)
+
+ movdqa 48(%rdi), %xmm2
+ pxor 48(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(64bytesin256)
+
+ movdqa 64(%rdi), %xmm2
+ pxor 64(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(80bytesin256)
+
+ movdqa 80(%rdi), %xmm2
+ pxor 80(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(96bytesin256)
+
+ movdqa 96(%rdi), %xmm2
+ pxor 96(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(112bytesin256)
+
+ movdqa 112(%rdi), %xmm2
+ pxor 112(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(128bytesin256)
+
+ movdqa 128(%rdi), %xmm2
+ pxor 128(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(144bytesin256)
+
+ movdqa 144(%rdi), %xmm2
+ pxor 144(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(160bytesin256)
+
+ movdqa 160(%rdi), %xmm2
+ pxor 160(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(176bytesin256)
+
+ movdqa 176(%rdi), %xmm2
+ pxor 176(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(192bytesin256)
+
+ movdqa 192(%rdi), %xmm2
+ pxor 192(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(208bytesin256)
+
+ movdqa 208(%rdi), %xmm2
+ pxor 208(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(224bytesin256)
+
+ movdqa 224(%rdi), %xmm2
+ pxor 224(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(240bytesin256)
+
+ movdqa 240(%rdi), %xmm2
+ pxor 240(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(256bytesin256)
+
+ add $256, %rsi
+ add $256, %rdi
+
+ cmp $128, %rdx
+ jae L(less256bytesin2alinged)
+
+ cmp $64, %rdx
+ jae L(less128bytesin2aligned)
+
+ cmp $32, %rdx
+ jb L(less32bytesin256in2alinged)
+
+ movdqa (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqa 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+ sub $32, %rdx
+ add $32, %rdi
+ add $32, %rsi
+L(less32bytesin256in2alinged):
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+ ALIGN (4)
+L(512bytesormorein2aligned):
+#ifdef DATA_CACHE_SIZE_HALF
+ mov $DATA_CACHE_SIZE_HALF, %r8
+#else
+ mov __x86_64_data_cache_size_half(%rip), %r8
+#endif
+ mov %r8, %r9
+ shr $1, %r8
+ add %r9, %r8
+ cmp %r8, %rdx
+ ja L(L2_L3_cache_aglined)
+
+ sub $64, %rdx
+ ALIGN (4)
+L(64bytesormore_loopin2aligned):
+ movdqa (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ movdqa %xmm2, %xmm1
+
+ movdqa 16(%rdi), %xmm3
+ pxor 16(%rsi), %xmm3
+ por %xmm3, %xmm1
+
+ movdqa 32(%rdi), %xmm4
+ pxor 32(%rsi), %xmm4
+ por %xmm4, %xmm1
+
+ movdqa 48(%rdi), %xmm5
+ pxor 48(%rsi), %xmm5
+ por %xmm5, %xmm1
+
+ ptest %xmm1, %xmm0
+ jnc L(64bytesormore_loop_end)
+ add $64, %rsi
+ add $64, %rdi
+ sub $64, %rdx
+ jae L(64bytesormore_loopin2aligned)
+
+ add $64, %rdx
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+L(L2_L3_cache_aglined):
+ sub $64, %rdx
+ ALIGN (4)
+L(L2_L3_aligned_128bytes_loop):
+ prefetchnta 0x1c0(%rdi)
+ prefetchnta 0x1c0(%rsi)
+ movdqa (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ movdqa %xmm2, %xmm1
+
+ movdqa 16(%rdi), %xmm3
+ pxor 16(%rsi), %xmm3
+ por %xmm3, %xmm1
+
+ movdqa 32(%rdi), %xmm4
+ pxor 32(%rsi), %xmm4
+ por %xmm4, %xmm1
+
+ movdqa 48(%rdi), %xmm5
+ pxor 48(%rsi), %xmm5
+ por %xmm5, %xmm1
+
+ ptest %xmm1, %xmm0
+ jnc L(64bytesormore_loop_end)
+ add $64, %rsi
+ add $64, %rdi
+ sub $64, %rdx
+ jae L(L2_L3_aligned_128bytes_loop)
+
+ add $64, %rdx
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+
+ ALIGN (4)
+L(64bytesormore_loop_end):
+ add $16, %rdi
+ add $16, %rsi
+ ptest %xmm2, %xmm0
+ jnc L(16bytes)
+
+ add $16, %rdi
+ add $16, %rsi
+ ptest %xmm3, %xmm0
+ jnc L(16bytes)
+
+ add $16, %rdi
+ add $16, %rsi
+ ptest %xmm4, %xmm0
+ jnc L(16bytes)
+
+ add $16, %rdi
+ add $16, %rsi
+ jmp L(16bytes)
+
+L(256bytesin256):
+ add $256, %rdi
+ add $256, %rsi
+ jmp L(16bytes)
+L(240bytesin256):
+ add $240, %rdi
+ add $240, %rsi
+ jmp L(16bytes)
+L(224bytesin256):
+ add $224, %rdi
+ add $224, %rsi
+ jmp L(16bytes)
+L(208bytesin256):
+ add $208, %rdi
+ add $208, %rsi
+ jmp L(16bytes)
+L(192bytesin256):
+ add $192, %rdi
+ add $192, %rsi
+ jmp L(16bytes)
+L(176bytesin256):
+ add $176, %rdi
+ add $176, %rsi
+ jmp L(16bytes)
+L(160bytesin256):
+ add $160, %rdi
+ add $160, %rsi
+ jmp L(16bytes)
+L(144bytesin256):
+ add $144, %rdi
+ add $144, %rsi
+ jmp L(16bytes)
+L(128bytesin256):
+ add $128, %rdi
+ add $128, %rsi
+ jmp L(16bytes)
+L(112bytesin256):
+ add $112, %rdi
+ add $112, %rsi
+ jmp L(16bytes)
+L(96bytesin256):
+ add $96, %rdi
+ add $96, %rsi
+ jmp L(16bytes)
+L(80bytesin256):
+ add $80, %rdi
+ add $80, %rsi
+ jmp L(16bytes)
+L(64bytesin256):
+ add $64, %rdi
+ add $64, %rsi
+ jmp L(16bytes)
+L(48bytesin256):
+ add $16, %rdi
+ add $16, %rsi
+L(32bytesin256):
+ add $16, %rdi
+ add $16, %rsi
+L(16bytesin256):
+ add $16, %rdi
+ add $16, %rsi
+L(16bytes):
+ mov -16(%rdi), %rax
+ mov -16(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+L(8bytes):
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(12bytes):
+ mov -12(%rdi), %rax
+ mov -12(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+L(4bytes):
+ mov -4(%rsi), %ecx
+ mov -4(%rdi), %eax
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+L(0bytes):
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(65bytes):
+ movdqu -65(%rdi), %xmm1
+ movdqu -65(%rsi), %xmm2
+ mov $-65, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(49bytes):
+ movdqu -49(%rdi), %xmm1
+ movdqu -49(%rsi), %xmm2
+ mov $-49, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(33bytes):
+ movdqu -33(%rdi), %xmm1
+ movdqu -33(%rsi), %xmm2
+ mov $-33, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(17bytes):
+ mov -17(%rdi), %rax
+ mov -17(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+L(9bytes):
+ mov -9(%rdi), %rax
+ mov -9(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ movzbl -1(%rdi), %eax
+ movzbl -1(%rsi), %edx
+ sub %edx, %eax
+ ret
+
+ ALIGN (4)
+L(13bytes):
+ mov -13(%rdi), %rax
+ mov -13(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(5bytes):
+ mov -5(%rdi), %eax
+ mov -5(%rsi), %ecx
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+ movzbl -1(%rdi), %eax
+ movzbl -1(%rsi), %edx
+ sub %edx, %eax
+ ret
+
+ ALIGN (4)
+L(66bytes):
+ movdqu -66(%rdi), %xmm1
+ movdqu -66(%rsi), %xmm2
+ mov $-66, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(50bytes):
+ movdqu -50(%rdi), %xmm1
+ movdqu -50(%rsi), %xmm2
+ mov $-50, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(34bytes):
+ movdqu -34(%rdi), %xmm1
+ movdqu -34(%rsi), %xmm2
+ mov $-34, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(18bytes):
+ mov -18(%rdi), %rax
+ mov -18(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+L(10bytes):
+ mov -10(%rdi), %rax
+ mov -10(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ movzwl -2(%rdi), %eax
+ movzwl -2(%rsi), %ecx
+ cmp %cl, %al
+ jne L(end)
+ and $0xffff, %eax
+ and $0xffff, %ecx
+ sub %ecx, %eax
+ ret
+
+ ALIGN (4)
+L(14bytes):
+ mov -14(%rdi), %rax
+ mov -14(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(6bytes):
+ mov -6(%rdi), %eax
+ mov -6(%rsi), %ecx
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+L(2bytes):
+ movzwl -2(%rsi), %ecx
+ movzwl -2(%rdi), %eax
+ cmp %cl, %al
+ jne L(end)
+ and $0xffff, %eax
+ and $0xffff, %ecx
+ sub %ecx, %eax
+ ret
+
+ ALIGN (4)
+L(67bytes):
+ movdqu -67(%rdi), %xmm2
+ movdqu -67(%rsi), %xmm1
+ mov $-67, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(51bytes):
+ movdqu -51(%rdi), %xmm2
+ movdqu -51(%rsi), %xmm1
+ mov $-51, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(35bytes):
+ movdqu -35(%rsi), %xmm1
+ movdqu -35(%rdi), %xmm2
+ mov $-35, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(19bytes):
+ mov -19(%rdi), %rax
+ mov -19(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+L(11bytes):
+ mov -11(%rdi), %rax
+ mov -11(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ mov -4(%rdi), %eax
+ mov -4(%rsi), %ecx
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(15bytes):
+ mov -15(%rdi), %rax
+ mov -15(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(7bytes):
+ mov -7(%rdi), %eax
+ mov -7(%rsi), %ecx
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+ mov -4(%rdi), %eax
+ mov -4(%rsi), %ecx
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(3bytes):
+ movzwl -3(%rdi), %eax
+ movzwl -3(%rsi), %ecx
+ cmp %eax, %ecx
+ jne L(diffin2bytes)
+L(1bytes):
+ movzbl -1(%rdi), %eax
+ movzbl -1(%rsi), %ecx
+ sub %ecx, %eax
+ ret
+
+ ALIGN (4)
+L(68bytes):
+ movdqu -68(%rdi), %xmm2
+ movdqu -68(%rsi), %xmm1
+ mov $-68, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(52bytes):
+ movdqu -52(%rdi), %xmm2
+ movdqu -52(%rsi), %xmm1
+ mov $-52, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(36bytes):
+ movdqu -36(%rdi), %xmm2
+ movdqu -36(%rsi), %xmm1
+ mov $-36, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(20bytes):
+ movdqu -20(%rdi), %xmm2
+ movdqu -20(%rsi), %xmm1
+ mov $-20, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -4(%rdi), %eax
+ mov -4(%rsi), %ecx
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(69bytes):
+ movdqu -69(%rsi), %xmm1
+ movdqu -69(%rdi), %xmm2
+ mov $-69, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(53bytes):
+ movdqu -53(%rsi), %xmm1
+ movdqu -53(%rdi), %xmm2
+ mov $-53, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(37bytes):
+ movdqu -37(%rsi), %xmm1
+ movdqu -37(%rdi), %xmm2
+ mov $-37, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(21bytes):
+ movdqu -21(%rsi), %xmm1
+ movdqu -21(%rdi), %xmm2
+ mov $-21, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(70bytes):
+ movdqu -70(%rsi), %xmm1
+ movdqu -70(%rdi), %xmm2
+ mov $-70, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(54bytes):
+ movdqu -54(%rsi), %xmm1
+ movdqu -54(%rdi), %xmm2
+ mov $-54, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(38bytes):
+ movdqu -38(%rsi), %xmm1
+ movdqu -38(%rdi), %xmm2
+ mov $-38, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(22bytes):
+ movdqu -22(%rsi), %xmm1
+ movdqu -22(%rdi), %xmm2
+ mov $-22, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(71bytes):
+ movdqu -71(%rsi), %xmm1
+ movdqu -71(%rdi), %xmm2
+ mov $-71, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(55bytes):
+ movdqu -55(%rdi), %xmm2
+ movdqu -55(%rsi), %xmm1
+ mov $-55, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(39bytes):
+ movdqu -39(%rdi), %xmm2
+ movdqu -39(%rsi), %xmm1
+ mov $-39, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(23bytes):
+ movdqu -23(%rdi), %xmm2
+ movdqu -23(%rsi), %xmm1
+ mov $-23, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(72bytes):
+ movdqu -72(%rsi), %xmm1
+ movdqu -72(%rdi), %xmm2
+ mov $-72, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(56bytes):
+ movdqu -56(%rdi), %xmm2
+ movdqu -56(%rsi), %xmm1
+ mov $-56, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(40bytes):
+ movdqu -40(%rdi), %xmm2
+ movdqu -40(%rsi), %xmm1
+ mov $-40, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(24bytes):
+ movdqu -24(%rdi), %xmm2
+ movdqu -24(%rsi), %xmm1
+ mov $-24, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(73bytes):
+ movdqu -73(%rsi), %xmm1
+ movdqu -73(%rdi), %xmm2
+ mov $-73, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(57bytes):
+ movdqu -57(%rdi), %xmm2
+ movdqu -57(%rsi), %xmm1
+ mov $-57, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(41bytes):
+ movdqu -41(%rdi), %xmm2
+ movdqu -41(%rsi), %xmm1
+ mov $-41, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(25bytes):
+ movdqu -25(%rdi), %xmm2
+ movdqu -25(%rsi), %xmm1
+ mov $-25, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -9(%rdi), %rax
+ mov -9(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ movzbl -1(%rdi), %eax
+ movzbl -1(%rsi), %ecx
+ sub %ecx, %eax
+ ret
+
+ ALIGN (4)
+L(74bytes):
+ movdqu -74(%rsi), %xmm1
+ movdqu -74(%rdi), %xmm2
+ mov $-74, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(58bytes):
+ movdqu -58(%rdi), %xmm2
+ movdqu -58(%rsi), %xmm1
+ mov $-58, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(42bytes):
+ movdqu -42(%rdi), %xmm2
+ movdqu -42(%rsi), %xmm1
+ mov $-42, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(26bytes):
+ movdqu -26(%rdi), %xmm2
+ movdqu -26(%rsi), %xmm1
+ mov $-26, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -10(%rdi), %rax
+ mov -10(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ movzwl -2(%rdi), %eax
+ movzwl -2(%rsi), %ecx
+ jmp L(diffin2bytes)
+
+ ALIGN (4)
+L(75bytes):
+ movdqu -75(%rsi), %xmm1
+ movdqu -75(%rdi), %xmm2
+ mov $-75, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(59bytes):
+ movdqu -59(%rdi), %xmm2
+ movdqu -59(%rsi), %xmm1
+ mov $-59, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(43bytes):
+ movdqu -43(%rdi), %xmm2
+ movdqu -43(%rsi), %xmm1
+ mov $-43, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(27bytes):
+ movdqu -27(%rdi), %xmm2
+ movdqu -27(%rsi), %xmm1
+ mov $-27, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -11(%rdi), %rax
+ mov -11(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ mov -4(%rdi), %eax
+ mov -4(%rsi), %ecx
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(76bytes):
+ movdqu -76(%rsi), %xmm1
+ movdqu -76(%rdi), %xmm2
+ mov $-76, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(60bytes):
+ movdqu -60(%rdi), %xmm2
+ movdqu -60(%rsi), %xmm1
+ mov $-60, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(44bytes):
+ movdqu -44(%rdi), %xmm2
+ movdqu -44(%rsi), %xmm1
+ mov $-44, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(28bytes):
+ movdqu -28(%rdi), %xmm2
+ movdqu -28(%rsi), %xmm1
+ mov $-28, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -12(%rdi), %rax
+ mov -12(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ mov -4(%rdi), %eax
+ mov -4(%rsi), %ecx
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(77bytes):
+ movdqu -77(%rsi), %xmm1
+ movdqu -77(%rdi), %xmm2
+ mov $-77, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(61bytes):
+ movdqu -61(%rdi), %xmm2
+ movdqu -61(%rsi), %xmm1
+ mov $-61, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(45bytes):
+ movdqu -45(%rdi), %xmm2
+ movdqu -45(%rsi), %xmm1
+ mov $-45, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(29bytes):
+ movdqu -29(%rdi), %xmm2
+ movdqu -29(%rsi), %xmm1
+ mov $-29, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+
+ mov -13(%rdi), %rax
+ mov -13(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(78bytes):
+ movdqu -78(%rsi), %xmm1
+ movdqu -78(%rdi), %xmm2
+ mov $-78, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(62bytes):
+ movdqu -62(%rdi), %xmm2
+ movdqu -62(%rsi), %xmm1
+ mov $-62, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(46bytes):
+ movdqu -46(%rdi), %xmm2
+ movdqu -46(%rsi), %xmm1
+ mov $-46, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(30bytes):
+ movdqu -30(%rdi), %xmm2
+ movdqu -30(%rsi), %xmm1
+ mov $-30, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -14(%rdi), %rax
+ mov -14(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(79bytes):
+ movdqu -79(%rsi), %xmm1
+ movdqu -79(%rdi), %xmm2
+ mov $-79, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(63bytes):
+ movdqu -63(%rdi), %xmm2
+ movdqu -63(%rsi), %xmm1
+ mov $-63, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(47bytes):
+ movdqu -47(%rdi), %xmm2
+ movdqu -47(%rsi), %xmm1
+ mov $-47, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(31bytes):
+ movdqu -31(%rdi), %xmm2
+ movdqu -31(%rsi), %xmm1
+ mov $-31, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -15(%rdi), %rax
+ mov -15(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(64bytes):
+ movdqu -64(%rdi), %xmm2
+ movdqu -64(%rsi), %xmm1
+ mov $-64, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(48bytes):
+ movdqu -48(%rdi), %xmm2
+ movdqu -48(%rsi), %xmm1
+ mov $-48, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(32bytes):
+ movdqu -32(%rdi), %xmm2
+ movdqu -32(%rsi), %xmm1
+ mov $-32, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+
+ mov -16(%rdi), %rax
+ mov -16(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
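[Every L(Nbytes) block above follows one template: load the 16-byte chunks starting N bytes before the end of each input, XOR them, and ptest the result against %xmm0 (which appears to be zeroed earlier in this file) so that the carry flag is set only when the chunks match; jnc then exits to L(less16bytes) with the chunk's negative offset already staged in %dl. Each block falls through to the next case 16 bytes closer to the end, so one jump into the chain compares the whole tail. A hedged C sketch of a single step, with a hypothetical helper name and SSE4.1 intrinsics:]

    #include <smmintrin.h>   /* SSE4.1 intrinsics (ptest) */
    #include <stddef.h>

    /* Hypothetical helper mirroring one L(Nbytes) step: test the
       16-byte chunk starting `off` bytes before the end of both
       inputs.  `s1_end` and `s2_end` point one past the last byte,
       as %rdi/%rsi do in the assembly.  */
    static int
    chunk_differs (const unsigned char *s1_end,
                   const unsigned char *s2_end, size_t off)
    {
      __m128i a = _mm_loadu_si128 ((const __m128i *) (s1_end - off));
      __m128i b = _mm_loadu_si128 ((const __m128i *) (s2_end - off));
      __m128i x = _mm_xor_si128 (a, b);                  /* pxor */
      /* ptest against zero: CF is set iff x is all zero, so a clear
         CF (the jnc path) means the chunks differ.  */
      return !_mm_testc_si128 (_mm_setzero_si128 (), x);
    }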
+/*
+ * Align to 8 bytes to avoid taking two branches within one 16-byte
+ * aligned code block.
+ */
+ ALIGN (3)
+L(less16bytes):
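+	/* %dl holds the negative offset, relative to the end of the
+	   inputs, of the 16-byte chunk known to differ; sign-extend it
+	   so the loads below reread that chunk as two 8-byte words.  */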
+ movsbq %dl, %rdx
+ mov (%rsi, %rdx), %rcx
+ mov (%rdi, %rdx), %rax
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ mov 8(%rsi, %rdx), %rcx
+ mov 8(%rdi, %rdx), %rax
+L(diffin8bytes):
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+ shr $32, %rcx
+ shr $32, %rax
+L(diffin4bytes):
+ cmp %cx, %ax
+ jne L(diffin2bytes)
+ shr $16, %ecx
+ shr $16, %eax
+L(diffin2bytes):
+ cmp %cl, %al
+ jne L(end)
+ and $0xffff, %eax
+ and $0xffff, %ecx
+ sub %ecx, %eax
+ ret
+
+ ALIGN (4)
+L(end):
+ and $0xff, %eax
+ and $0xff, %ecx
+ sub %ecx, %eax
+ ret
+
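[The L(diffin8bytes) cascade narrows a differing 8-byte word down to the first differing byte by repeated halving: whenever the low halves match, the difference must lie in the high halves, which are shifted down. Because the loads are little-endian, the low bits hold the lower-addressed bytes, so the cascade reports the first difference in memory order, as memcmp requires. A hypothetical C rendering of the same logic:]

    #include <stdint.h>

    /* Given two 8-byte words from little-endian loads that may
       differ, return memcmp's result: the unsigned difference at the
       first differing byte.  Mirrors L(diffin8bytes)..L(end).  */
    static int
    diff_in_8bytes (uint64_t a, uint64_t c)
    {
      if ((uint32_t) a == (uint32_t) c)      /* cmp %eax, %ecx */
        {
          a >>= 32;                          /* shr $32 */
          c >>= 32;
        }
      if ((uint16_t) a == (uint16_t) c)      /* cmp %cx, %ax */
        {
          a >>= 16;                          /* shr $16 */
          c >>= 16;
        }
      if ((uint8_t) a != (uint8_t) c)        /* cmp %cl, %al */
        return (int) (a & 0xff) - (int) (c & 0xff);    /* L(end) */
      /* Low bytes equal: a 16-bit subtraction takes its sign from
         the second byte, the actual point of difference.  */
      return (int) (a & 0xffff) - (int) (c & 0xffff);
    }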
+END (MEMCMP)
+
+ .section .rodata.sse4.1,"a",@progbits
+ ALIGN (3)
+L(table_64bytes):
+ .int JMPTBL (L(0bytes), L(table_64bytes))
+ .int JMPTBL (L(1bytes), L(table_64bytes))
+ .int JMPTBL (L(2bytes), L(table_64bytes))
+ .int JMPTBL (L(3bytes), L(table_64bytes))
+ .int JMPTBL (L(4bytes), L(table_64bytes))
+ .int JMPTBL (L(5bytes), L(table_64bytes))
+ .int JMPTBL (L(6bytes), L(table_64bytes))
+ .int JMPTBL (L(7bytes), L(table_64bytes))
+ .int JMPTBL (L(8bytes), L(table_64bytes))
+ .int JMPTBL (L(9bytes), L(table_64bytes))
+ .int JMPTBL (L(10bytes), L(table_64bytes))
+ .int JMPTBL (L(11bytes), L(table_64bytes))
+ .int JMPTBL (L(12bytes), L(table_64bytes))
+ .int JMPTBL (L(13bytes), L(table_64bytes))
+ .int JMPTBL (L(14bytes), L(table_64bytes))
+ .int JMPTBL (L(15bytes), L(table_64bytes))
+ .int JMPTBL (L(16bytes), L(table_64bytes))
+ .int JMPTBL (L(17bytes), L(table_64bytes))
+ .int JMPTBL (L(18bytes), L(table_64bytes))
+ .int JMPTBL (L(19bytes), L(table_64bytes))
+ .int JMPTBL (L(20bytes), L(table_64bytes))
+ .int JMPTBL (L(21bytes), L(table_64bytes))
+ .int JMPTBL (L(22bytes), L(table_64bytes))
+ .int JMPTBL (L(23bytes), L(table_64bytes))
+ .int JMPTBL (L(24bytes), L(table_64bytes))
+ .int JMPTBL (L(25bytes), L(table_64bytes))
+ .int JMPTBL (L(26bytes), L(table_64bytes))
+ .int JMPTBL (L(27bytes), L(table_64bytes))
+ .int JMPTBL (L(28bytes), L(table_64bytes))
+ .int JMPTBL (L(29bytes), L(table_64bytes))
+ .int JMPTBL (L(30bytes), L(table_64bytes))
+ .int JMPTBL (L(31bytes), L(table_64bytes))
+ .int JMPTBL (L(32bytes), L(table_64bytes))
+ .int JMPTBL (L(33bytes), L(table_64bytes))
+ .int JMPTBL (L(34bytes), L(table_64bytes))
+ .int JMPTBL (L(35bytes), L(table_64bytes))
+ .int JMPTBL (L(36bytes), L(table_64bytes))
+ .int JMPTBL (L(37bytes), L(table_64bytes))
+ .int JMPTBL (L(38bytes), L(table_64bytes))
+ .int JMPTBL (L(39bytes), L(table_64bytes))
+ .int JMPTBL (L(40bytes), L(table_64bytes))
+ .int JMPTBL (L(41bytes), L(table_64bytes))
+ .int JMPTBL (L(42bytes), L(table_64bytes))
+ .int JMPTBL (L(43bytes), L(table_64bytes))
+ .int JMPTBL (L(44bytes), L(table_64bytes))
+ .int JMPTBL (L(45bytes), L(table_64bytes))
+ .int JMPTBL (L(46bytes), L(table_64bytes))
+ .int JMPTBL (L(47bytes), L(table_64bytes))
+ .int JMPTBL (L(48bytes), L(table_64bytes))
+ .int JMPTBL (L(49bytes), L(table_64bytes))
+ .int JMPTBL (L(50bytes), L(table_64bytes))
+ .int JMPTBL (L(51bytes), L(table_64bytes))
+ .int JMPTBL (L(52bytes), L(table_64bytes))
+ .int JMPTBL (L(53bytes), L(table_64bytes))
+ .int JMPTBL (L(54bytes), L(table_64bytes))
+ .int JMPTBL (L(55bytes), L(table_64bytes))
+ .int JMPTBL (L(56bytes), L(table_64bytes))
+ .int JMPTBL (L(57bytes), L(table_64bytes))
+ .int JMPTBL (L(58bytes), L(table_64bytes))
+ .int JMPTBL (L(59bytes), L(table_64bytes))
+ .int JMPTBL (L(60bytes), L(table_64bytes))
+ .int JMPTBL (L(61bytes), L(table_64bytes))
+ .int JMPTBL (L(62bytes), L(table_64bytes))
+ .int JMPTBL (L(63bytes), L(table_64bytes))
+ .int JMPTBL (L(64bytes), L(table_64bytes))
+ .int JMPTBL (L(65bytes), L(table_64bytes))
+ .int JMPTBL (L(66bytes), L(table_64bytes))
+ .int JMPTBL (L(67bytes), L(table_64bytes))
+ .int JMPTBL (L(68bytes), L(table_64bytes))
+ .int JMPTBL (L(69bytes), L(table_64bytes))
+ .int JMPTBL (L(70bytes), L(table_64bytes))
+ .int JMPTBL (L(71bytes), L(table_64bytes))
+ .int JMPTBL (L(72bytes), L(table_64bytes))
+ .int JMPTBL (L(73bytes), L(table_64bytes))
+ .int JMPTBL (L(74bytes), L(table_64bytes))
+ .int JMPTBL (L(75bytes), L(table_64bytes))
+ .int JMPTBL (L(76bytes), L(table_64bytes))
+ .int JMPTBL (L(77bytes), L(table_64bytes))
+ .int JMPTBL (L(78bytes), L(table_64bytes))
+ .int JMPTBL (L(79bytes), L(table_64bytes))
+#endif
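[The table entries are 32-bit offsets rather than absolute addresses: JMPTBL presumably expands to "label - table", so the dispatcher earlier in the file can index the table by length, add the table's own address back, and jump indirectly, while the .rodata section stays position-independent and needs no load-time relocations. GCC's labels-as-values extension allows a C analogue of the same trick; the sketch below is hypothetical and covers only n <= 2, where the real table covers 0..79:]

    #include <stddef.h>

    static int
    tail_compare (const unsigned char *a, const unsigned char *b,
                  size_t n)
    {
      /* Like JMPTBL: store "label - anchor", not absolute pointers.  */
      static const int offsets[] = {
        0,                        /* n == 0 lands on the anchor itself */
        &&bytes1 - &&bytes0,
        &&bytes2 - &&bytes0,
      };
      goto *(&&bytes0 + offsets[n]);
     bytes0:
      return 0;                   /* zero-length inputs compare equal */
     bytes1:
      return a[0] - b[0];
     bytes2:
      if (a[0] != b[0])
        return a[0] - b[0];
      return a[1] - b[1];
    }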
diff --git a/sysdeps/x86_64/multiarch/memcmp.S b/sysdeps/x86_64/multiarch/memcmp.S
new file mode 100644
index 0000000000..301ab287f5
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcmp.S
@@ -0,0 +1,59 @@
+/* Multiple versions of memcmp
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#ifndef NOT_IN_libc
+ .text
+ENTRY(memcmp)
+ .type memcmp, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq __memcmp_sse2(%rip), %rax
+ testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+ jz 2f
+ leaq __memcmp_sse4_1(%rip), %rax
+2: ret
+END(memcmp)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __memcmp_sse2, @function; \
+ .p2align 4; \
+ __memcmp_sse2: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __memcmp_sse2, .-__memcmp_sse2
+
+# ifdef SHARED
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with hidden functions in a shared library, since
+   they would be called without setting up EBX, which is needed for the
+   PLT that IFUNC relies on.  */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_memcmp; __GI_memcmp = __memcmp_sse2
+# endif
+#endif
+
+#include "../memcmp.S"
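[The ENTRY/END redefinitions above make the trailing #include assemble the generic ../memcmp.S under the name __memcmp_sse2, while the gnu_indirect_function stub initializes __cpu_features lazily and picks between that and __memcmp_sse4_1 when the dynamic linker resolves the symbol. The same dispatch pattern can be written in C with GCC's ifunc attribute; this is a hedged sketch, and cpu_has_sse4_1 is a hypothetical stand-in for the __cpu_features check:]

    #include <stddef.h>

    extern int __memcmp_sse2 (const void *, const void *, size_t);
    extern int __memcmp_sse4_1 (const void *, const void *, size_t);
    extern int cpu_has_sse4_1 (void);  /* hypothetical feature probe */

    typedef int (*memcmp_fn) (const void *, const void *, size_t);

    /* The resolver runs once, when the dynamic linker resolves the
       symbol; every later call goes straight to its return value.  */
    static memcmp_fn
    resolve_memcmp (void)
    {
      return cpu_has_sse4_1 () ? __memcmp_sse4_1 : __memcmp_sse2;
    }

    int memcmp (const void *, const void *, size_t)
      __attribute__ ((ifunc ("resolve_memcmp")));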
diff --git a/sysdeps/x86_64/multiarch/rtld-memcmp.c b/sysdeps/x86_64/multiarch/rtld-memcmp.c
new file mode 100644
index 0000000000..0f271356c2
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/rtld-memcmp.c
@@ -0,0 +1 @@
+#include "../rtld-memcmp.c"