diff options
Diffstat (limited to 'sysdeps/x86_64/multiarch')
56 files changed, 1016 insertions, 733 deletions
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index d7002a9df3..d234f4ab66 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -1,5 +1,4 @@ ifeq ($(subdir),csu) -aux += init-arch tests += test-multiarch gen-as-const-headers += ifunc-defines.sym endif @@ -8,31 +7,26 @@ ifeq ($(subdir),string) sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \ strcmp-sse2-unaligned strncmp-ssse3 \ - memcmp-sse4 memcpy-ssse3 \ - memcpy-sse2-unaligned mempcpy-ssse3 \ - memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \ - memmove-avx-unaligned memcpy-avx-unaligned mempcpy-avx-unaligned \ - memmove-ssse3-back strcasecmp_l-ssse3 \ + memcmp-sse4 memcpy-ssse3 memcpy-sse2-unaligned \ + memcpy-avx512-no-vzeroupper mempcpy-ssse3 memmove-ssse3 \ + memcpy-ssse3-back mempcpy-ssse3-back memmove-avx-unaligned \ + memcpy-avx-unaligned mempcpy-avx-unaligned \ + mempcpy-avx512-no-vzeroupper memmove-ssse3-back \ + memmove-avx512-no-vzeroupper strcasecmp_l-ssse3 \ strncase_l-ssse3 strcat-ssse3 strncat-ssse3\ strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \ strcpy-sse2-unaligned strncpy-sse2-unaligned \ stpcpy-sse2-unaligned stpncpy-sse2-unaligned \ strcat-sse2-unaligned strncat-sse2-unaligned \ - strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned - -ifeq (yes,$(config-cflags-sse4)) -sysdep_routines += strcspn-c strpbrk-c strspn-c varshift + strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \ + strcspn-c strpbrk-c strspn-c varshift memset-avx2 \ + memset-avx512-no-vzeroupper CFLAGS-varshift.c += -msse4 CFLAGS-strcspn-c.c += -msse4 CFLAGS-strpbrk-c.c += -msse4 CFLAGS-strspn-c.c += -msse4 endif -ifeq (yes,$(config-cflags-avx2)) -sysdep_routines += memset-avx2 -endif -endif - ifeq ($(subdir),wcsmbs) sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c wcscpy-ssse3 wcscpy-c endif diff --git a/sysdeps/x86_64/multiarch/Versions b/sysdeps/x86_64/multiarch/Versions deleted file mode 100644 index 59b185ac8d..0000000000 --- a/sysdeps/x86_64/multiarch/Versions +++ /dev/null @@ -1,5 +0,0 @@ -libc { - GLIBC_PRIVATE { - __get_cpu_features; - } -} diff --git a/sysdeps/x86_64/multiarch/cacheinfo.c b/sysdeps/x86_64/multiarch/cacheinfo.c deleted file mode 100644 index f87b8dce6b..0000000000 --- a/sysdeps/x86_64/multiarch/cacheinfo.c +++ /dev/null @@ -1,2 +0,0 @@ -#define DISABLE_PREFERRED_MEMORY_INSTRUCTION -#include "../cacheinfo.c" diff --git a/sysdeps/x86_64/multiarch/ifunc-defines.sym b/sysdeps/x86_64/multiarch/ifunc-defines.sym index a410d8808f..3df946f343 100644 --- a/sysdeps/x86_64/multiarch/ifunc-defines.sym +++ b/sysdeps/x86_64/multiarch/ifunc-defines.sym @@ -4,7 +4,6 @@ -- CPU_FEATURES_SIZE sizeof (struct cpu_features) -KIND_OFFSET offsetof (struct cpu_features, kind) CPUID_OFFSET offsetof (struct cpu_features, cpuid) CPUID_SIZE sizeof (struct cpuid_registers) CPUID_EAX_OFFSET offsetof (struct cpuid_registers, eax) diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index b64e4f1532..188b6d36c6 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -1,5 +1,5 @@ /* Enumerate available IFUNC implementations of a function. x86-64 version. - Copyright (C) 2012-2015 Free Software Foundation, Inc. + Copyright (C) 2012-2016 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -20,10 +20,11 @@ #include <string.h> #include <wchar.h> #include <ifunc-impl-list.h> +#include <sysdep.h> #include "init-arch.h" /* Maximum number of IFUNC implementations. */ -#define MAX_IFUNC 4 +#define MAX_IFUNC 5 /* Fill ARRAY of MAX elements with IFUNC implementations for function NAME supported on target machine and return the number of valid @@ -39,48 +40,77 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/memcmp.S. */ IFUNC_IMPL (i, name, memcmp, - IFUNC_IMPL_ADD (array, i, memcmp, HAS_SSE4_1, + IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSE4_1), __memcmp_sse4_1) - IFUNC_IMPL_ADD (array, i, memcmp, HAS_SSSE3, __memcmp_ssse3) + IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSSE3), + __memcmp_ssse3) IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2)) - /* Support sysdeps/x86_64/multiarch/memmove_chk.S. */ + /* Support sysdeps/x86_64/multiarch/memmove_chk.c. */ IFUNC_IMPL (i, name, __memmove_chk, - IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_AVX, +#ifdef HAVE_AVX512_ASM_SUPPORT + IFUNC_IMPL_ADD (array, i, __memmove_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memmove_chk_avx512_no_vzeroupper) +#endif + IFUNC_IMPL_ADD (array, i, __memmove_chk, + HAS_ARCH_FEATURE (AVX_Usable), __memmove_chk_avx_unaligned) - IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_SSSE3, + IFUNC_IMPL_ADD (array, i, __memmove_chk, + HAS_CPU_FEATURE (SSSE3), __memmove_chk_ssse3_back) - IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_SSSE3, + IFUNC_IMPL_ADD (array, i, __memmove_chk, + HAS_CPU_FEATURE (SSSE3), __memmove_chk_ssse3) IFUNC_IMPL_ADD (array, i, __memmove_chk, 1, __memmove_chk_sse2)) /* Support sysdeps/x86_64/multiarch/memmove.S. */ IFUNC_IMPL (i, name, memmove, - IFUNC_IMPL_ADD (array, i, memmove, HAS_AVX, + IFUNC_IMPL_ADD (array, i, memmove, + HAS_ARCH_FEATURE (AVX_Usable), __memmove_avx_unaligned) - IFUNC_IMPL_ADD (array, i, memmove, HAS_SSSE3, +#ifdef HAVE_AVX512_ASM_SUPPORT + IFUNC_IMPL_ADD (array, i, memmove, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memmove_avx512_no_vzeroupper) +#endif + IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3), __memmove_ssse3_back) - IFUNC_IMPL_ADD (array, i, memmove, HAS_SSSE3, + IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3), __memmove_ssse3) IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_sse2)) -#ifdef HAVE_AVX2_SUPPORT /* Support sysdeps/x86_64/multiarch/memset_chk.S. */ IFUNC_IMPL (i, name, __memset_chk, - IFUNC_IMPL_ADD (array, i, __memset_chk, 1, __memset_chk_sse2) - IFUNC_IMPL_ADD (array, i, __memset_chk, HAS_AVX2, - __memset_chk_avx2)) + IFUNC_IMPL_ADD (array, i, __memset_chk, 1, + __memset_chk_sse2) + IFUNC_IMPL_ADD (array, i, __memset_chk, + HAS_ARCH_FEATURE (AVX2_Usable), + __memset_chk_avx2) +#ifdef HAVE_AVX512_ASM_SUPPORT + IFUNC_IMPL_ADD (array, i, __memset_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memset_chk_avx512_no_vzeroupper) +#endif + ) /* Support sysdeps/x86_64/multiarch/memset.S. */ IFUNC_IMPL (i, name, memset, IFUNC_IMPL_ADD (array, i, memset, 1, __memset_sse2) - IFUNC_IMPL_ADD (array, i, memset, HAS_AVX2, __memset_avx2)) + IFUNC_IMPL_ADD (array, i, memset, + HAS_ARCH_FEATURE (AVX2_Usable), + __memset_avx2) +#ifdef HAVE_AVX512_ASM_SUPPORT + IFUNC_IMPL_ADD (array, i, memset, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memset_avx512_no_vzeroupper) #endif + ) /* Support sysdeps/x86_64/multiarch/stpncpy.S. */ IFUNC_IMPL (i, name, stpncpy, - IFUNC_IMPL_ADD (array, i, stpncpy, HAS_SSSE3, + IFUNC_IMPL_ADD (array, i, stpncpy, HAS_CPU_FEATURE (SSSE3), __stpncpy_ssse3) IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_sse2_unaligned) @@ -88,38 +118,42 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/stpcpy.S. */ IFUNC_IMPL (i, name, stpcpy, - IFUNC_IMPL_ADD (array, i, stpcpy, HAS_SSSE3, __stpcpy_ssse3) + IFUNC_IMPL_ADD (array, i, stpcpy, HAS_CPU_FEATURE (SSSE3), + __stpcpy_ssse3) IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2_unaligned) IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2)) /* Support sysdeps/x86_64/multiarch/strcasecmp_l.S. */ IFUNC_IMPL (i, name, strcasecmp, - IFUNC_IMPL_ADD (array, i, strcasecmp, HAS_AVX, + IFUNC_IMPL_ADD (array, i, strcasecmp, + HAS_ARCH_FEATURE (AVX_Usable), __strcasecmp_avx) - IFUNC_IMPL_ADD (array, i, strcasecmp, HAS_SSE4_2, + IFUNC_IMPL_ADD (array, i, strcasecmp, + HAS_CPU_FEATURE (SSE4_2), __strcasecmp_sse42) - IFUNC_IMPL_ADD (array, i, strcasecmp, HAS_SSSE3, + IFUNC_IMPL_ADD (array, i, strcasecmp, + HAS_CPU_FEATURE (SSSE3), __strcasecmp_ssse3) IFUNC_IMPL_ADD (array, i, strcasecmp, 1, __strcasecmp_sse2)) /* Support sysdeps/x86_64/multiarch/strcasecmp_l.S. */ IFUNC_IMPL (i, name, strcasecmp_l, - IFUNC_IMPL_ADD (array, i, strcasecmp_l, HAS_AVX, + IFUNC_IMPL_ADD (array, i, strcasecmp_l, + HAS_ARCH_FEATURE (AVX_Usable), __strcasecmp_l_avx) - IFUNC_IMPL_ADD (array, i, strcasecmp_l, HAS_SSE4_2, + IFUNC_IMPL_ADD (array, i, strcasecmp_l, + HAS_CPU_FEATURE (SSE4_2), __strcasecmp_l_sse42) - IFUNC_IMPL_ADD (array, i, strcasecmp_l, HAS_SSSE3, + IFUNC_IMPL_ADD (array, i, strcasecmp_l, + HAS_CPU_FEATURE (SSSE3), __strcasecmp_l_ssse3) IFUNC_IMPL_ADD (array, i, strcasecmp_l, 1, __strcasecmp_l_sse2)) - /* Support sysdeps/x86_64/multiarch/strcasestr.c. */ - IFUNC_IMPL (i, name, strcasestr, - IFUNC_IMPL_ADD (array, i, strcasestr, 1, __strcasestr_sse2)) - /* Support sysdeps/x86_64/multiarch/strcat.S. */ IFUNC_IMPL (i, name, strcat, - IFUNC_IMPL_ADD (array, i, strcat, HAS_SSSE3, __strcat_ssse3) + IFUNC_IMPL_ADD (array, i, strcat, HAS_CPU_FEATURE (SSSE3), + __strcat_ssse3) IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2_unaligned) IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2)) @@ -130,48 +164,57 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/strcmp.S. */ IFUNC_IMPL (i, name, strcmp, - IFUNC_IMPL_ADD (array, i, strcmp, HAS_SSE4_2, __strcmp_sse42) - IFUNC_IMPL_ADD (array, i, strcmp, HAS_SSSE3, __strcmp_ssse3) + IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSE4_2), + __strcmp_sse42) + IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSSE3), + __strcmp_ssse3) IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2_unaligned) IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2)) /* Support sysdeps/x86_64/multiarch/strcpy.S. */ IFUNC_IMPL (i, name, strcpy, - IFUNC_IMPL_ADD (array, i, strcpy, HAS_SSSE3, __strcpy_ssse3) + IFUNC_IMPL_ADD (array, i, strcpy, HAS_CPU_FEATURE (SSSE3), + __strcpy_ssse3) IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned) IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2)) /* Support sysdeps/x86_64/multiarch/strcspn.S. */ IFUNC_IMPL (i, name, strcspn, - IFUNC_IMPL_ADD (array, i, strcspn, HAS_SSE4_2, + IFUNC_IMPL_ADD (array, i, strcspn, HAS_CPU_FEATURE (SSE4_2), __strcspn_sse42) IFUNC_IMPL_ADD (array, i, strcspn, 1, __strcspn_sse2)) /* Support sysdeps/x86_64/multiarch/strncase_l.S. */ IFUNC_IMPL (i, name, strncasecmp, - IFUNC_IMPL_ADD (array, i, strncasecmp, HAS_AVX, + IFUNC_IMPL_ADD (array, i, strncasecmp, + HAS_ARCH_FEATURE (AVX_Usable), __strncasecmp_avx) - IFUNC_IMPL_ADD (array, i, strncasecmp, HAS_SSE4_2, + IFUNC_IMPL_ADD (array, i, strncasecmp, + HAS_CPU_FEATURE (SSE4_2), __strncasecmp_sse42) - IFUNC_IMPL_ADD (array, i, strncasecmp, HAS_SSSE3, + IFUNC_IMPL_ADD (array, i, strncasecmp, + HAS_CPU_FEATURE (SSSE3), __strncasecmp_ssse3) IFUNC_IMPL_ADD (array, i, strncasecmp, 1, __strncasecmp_sse2)) /* Support sysdeps/x86_64/multiarch/strncase_l.S. */ IFUNC_IMPL (i, name, strncasecmp_l, - IFUNC_IMPL_ADD (array, i, strncasecmp_l, HAS_AVX, + IFUNC_IMPL_ADD (array, i, strncasecmp_l, + HAS_ARCH_FEATURE (AVX_Usable), __strncasecmp_l_avx) - IFUNC_IMPL_ADD (array, i, strncasecmp_l, HAS_SSE4_2, + IFUNC_IMPL_ADD (array, i, strncasecmp_l, + HAS_CPU_FEATURE (SSE4_2), __strncasecmp_l_sse42) - IFUNC_IMPL_ADD (array, i, strncasecmp_l, HAS_SSSE3, + IFUNC_IMPL_ADD (array, i, strncasecmp_l, + HAS_CPU_FEATURE (SSSE3), __strncasecmp_l_ssse3) IFUNC_IMPL_ADD (array, i, strncasecmp_l, 1, __strncasecmp_l_sse2)) /* Support sysdeps/x86_64/multiarch/strncat.S. */ IFUNC_IMPL (i, name, strncat, - IFUNC_IMPL_ADD (array, i, strncat, HAS_SSSE3, + IFUNC_IMPL_ADD (array, i, strncat, HAS_CPU_FEATURE (SSSE3), __strncat_ssse3) IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_sse2_unaligned) @@ -179,7 +222,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/strncpy.S. */ IFUNC_IMPL (i, name, strncpy, - IFUNC_IMPL_ADD (array, i, strncpy, HAS_SSSE3, + IFUNC_IMPL_ADD (array, i, strncpy, HAS_CPU_FEATURE (SSSE3), __strncpy_ssse3) IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2_unaligned) @@ -187,14 +230,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/strpbrk.S. */ IFUNC_IMPL (i, name, strpbrk, - IFUNC_IMPL_ADD (array, i, strpbrk, HAS_SSE4_2, + IFUNC_IMPL_ADD (array, i, strpbrk, HAS_CPU_FEATURE (SSE4_2), __strpbrk_sse42) IFUNC_IMPL_ADD (array, i, strpbrk, 1, __strpbrk_sse2)) /* Support sysdeps/x86_64/multiarch/strspn.S. */ IFUNC_IMPL (i, name, strspn, - IFUNC_IMPL_ADD (array, i, strspn, HAS_SSE4_2, __strspn_sse42) + IFUNC_IMPL_ADD (array, i, strspn, HAS_CPU_FEATURE (SSE4_2), + __strspn_sse42) IFUNC_IMPL_ADD (array, i, strspn, 1, __strspn_sse2)) /* Support sysdeps/x86_64/multiarch/strstr.c. */ @@ -204,65 +248,95 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/wcscpy.S. */ IFUNC_IMPL (i, name, wcscpy, - IFUNC_IMPL_ADD (array, i, wcscpy, HAS_SSSE3, __wcscpy_ssse3) + IFUNC_IMPL_ADD (array, i, wcscpy, HAS_CPU_FEATURE (SSSE3), + __wcscpy_ssse3) IFUNC_IMPL_ADD (array, i, wcscpy, 1, __wcscpy_sse2)) /* Support sysdeps/x86_64/multiarch/wmemcmp.S. */ IFUNC_IMPL (i, name, wmemcmp, - IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_SSE4_1, + IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSE4_1), __wmemcmp_sse4_1) - IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_SSSE3, + IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSSE3), __wmemcmp_ssse3) IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2)) #ifdef SHARED /* Support sysdeps/x86_64/multiarch/memcpy_chk.S. */ IFUNC_IMPL (i, name, __memcpy_chk, - IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_AVX, +#ifdef HAVE_AVX512_ASM_SUPPORT + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memcpy_chk_avx512_no_vzeroupper) +#endif + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + HAS_ARCH_FEATURE (AVX_Usable), __memcpy_chk_avx_unaligned) - IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_SSSE3, + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + HAS_CPU_FEATURE (SSSE3), __memcpy_chk_ssse3_back) - IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_SSSE3, + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + HAS_CPU_FEATURE (SSSE3), __memcpy_chk_ssse3) IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1, __memcpy_chk_sse2)) /* Support sysdeps/x86_64/multiarch/memcpy.S. */ IFUNC_IMPL (i, name, memcpy, - IFUNC_IMPL_ADD (array, i, memcpy, HAS_AVX, + IFUNC_IMPL_ADD (array, i, memcpy, + HAS_ARCH_FEATURE (AVX_Usable), __memcpy_avx_unaligned) - IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3, + IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3), __memcpy_ssse3_back) - IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3, __memcpy_ssse3) + IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3), + __memcpy_ssse3) +#ifdef HAVE_AVX512_ASM_SUPPORT + IFUNC_IMPL_ADD (array, i, memcpy, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memcpy_avx512_no_vzeroupper) +#endif IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned) IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2)) /* Support sysdeps/x86_64/multiarch/mempcpy_chk.S. */ IFUNC_IMPL (i, name, __mempcpy_chk, - IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_AVX, +#ifdef HAVE_AVX512_ASM_SUPPORT + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __mempcpy_chk_avx512_no_vzeroupper) +#endif + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + HAS_ARCH_FEATURE (AVX_Usable), __mempcpy_chk_avx_unaligned) - IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_SSSE3, + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + HAS_CPU_FEATURE (SSSE3), __mempcpy_chk_ssse3_back) - IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_SSSE3, + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + HAS_CPU_FEATURE (SSSE3), __mempcpy_chk_ssse3) IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1, __mempcpy_chk_sse2)) /* Support sysdeps/x86_64/multiarch/mempcpy.S. */ IFUNC_IMPL (i, name, mempcpy, - IFUNC_IMPL_ADD (array, i, mempcpy, HAS_AVX, +#ifdef HAVE_AVX512_ASM_SUPPORT + IFUNC_IMPL_ADD (array, i, mempcpy, + HAS_ARCH_FEATURE (AVX512F_Usable), + __mempcpy_avx512_no_vzeroupper) +#endif + IFUNC_IMPL_ADD (array, i, mempcpy, + HAS_ARCH_FEATURE (AVX_Usable), __mempcpy_avx_unaligned) - IFUNC_IMPL_ADD (array, i, mempcpy, HAS_SSSE3, + IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3), __mempcpy_ssse3_back) - IFUNC_IMPL_ADD (array, i, mempcpy, HAS_SSSE3, + IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3), __mempcpy_ssse3) IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_sse2)) /* Support sysdeps/x86_64/multiarch/strncmp.S. */ IFUNC_IMPL (i, name, strncmp, - IFUNC_IMPL_ADD (array, i, strncmp, HAS_SSE4_2, + IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSE4_2), __strncmp_sse42) - IFUNC_IMPL_ADD (array, i, strncmp, HAS_SSSE3, + IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSSE3), __strncmp_ssse3) IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_sse2)) #endif diff --git a/sysdeps/x86_64/multiarch/init-arch.c b/sysdeps/x86_64/multiarch/init-arch.c deleted file mode 100644 index aaad5fa841..0000000000 --- a/sysdeps/x86_64/multiarch/init-arch.c +++ /dev/null @@ -1,223 +0,0 @@ -/* Initialize CPU feature data. - This file is part of the GNU C Library. - Copyright (C) 2008-2015 Free Software Foundation, Inc. - Contributed by Ulrich Drepper <drepper@redhat.com>. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <atomic.h> -#include <cpuid.h> -#include "init-arch.h" - - -struct cpu_features __cpu_features attribute_hidden; - - -static void -get_common_indeces (unsigned int *family, unsigned int *model) -{ - __cpuid (1, __cpu_features.cpuid[COMMON_CPUID_INDEX_1].eax, - __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ebx, - __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx, - __cpu_features.cpuid[COMMON_CPUID_INDEX_1].edx); - - unsigned int eax = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].eax; - *family = (eax >> 8) & 0x0f; - *model = (eax >> 4) & 0x0f; -} - - -void -__init_cpu_features (void) -{ - unsigned int ebx; - unsigned int ecx; - unsigned int edx; - unsigned int family = 0; - unsigned int model = 0; - enum cpu_features_kind kind; - - __cpuid (0, __cpu_features.max_cpuid, ebx, ecx, edx); - - /* This spells out "GenuineIntel". */ - if (ebx == 0x756e6547 && ecx == 0x6c65746e && edx == 0x49656e69) - { - kind = arch_kind_intel; - - get_common_indeces (&family, &model); - - unsigned int eax = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].eax; - unsigned int extended_family = (eax >> 20) & 0xff; - unsigned int extended_model = (eax >> 12) & 0xf0; - if (family == 0x0f) - { - family += extended_family; - model += extended_model; - } - else if (family == 0x06) - { - ecx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx; - model += extended_model; - switch (model) - { - case 0x1c: - case 0x26: - /* BSF is slow on Atom. */ - __cpu_features.feature[index_Slow_BSF] |= bit_Slow_BSF; - break; - - case 0x37: - case 0x4a: - case 0x4d: - case 0x5a: - case 0x5d: - /* Unaligned load versions are faster than SSSE3 - on Silvermont. */ -#if index_Fast_Unaligned_Load != index_Prefer_PMINUB_for_stringop -# error index_Fast_Unaligned_Load != index_Prefer_PMINUB_for_stringop -#endif -#if index_Fast_Unaligned_Load != index_Slow_SSE4_2 -# error index_Fast_Unaligned_Load != index_Slow_SSE4_2 -#endif - __cpu_features.feature[index_Fast_Unaligned_Load] - |= (bit_Fast_Unaligned_Load - | bit_Prefer_PMINUB_for_stringop - | bit_Slow_SSE4_2); - break; - - default: - /* Unknown family 0x06 processors. Assuming this is one - of Core i3/i5/i7 processors if AVX is available. */ - if ((ecx & bit_AVX) == 0) - break; - - case 0x1a: - case 0x1e: - case 0x1f: - case 0x25: - case 0x2c: - case 0x2e: - case 0x2f: - /* Rep string instructions, copy backward, unaligned loads - and pminub are fast on Intel Core i3, i5 and i7. */ -#if index_Fast_Rep_String != index_Fast_Copy_Backward -# error index_Fast_Rep_String != index_Fast_Copy_Backward -#endif -#if index_Fast_Rep_String != index_Fast_Unaligned_Load -# error index_Fast_Rep_String != index_Fast_Unaligned_Load -#endif -#if index_Fast_Rep_String != index_Prefer_PMINUB_for_stringop -# error index_Fast_Rep_String != index_Prefer_PMINUB_for_stringop -#endif - __cpu_features.feature[index_Fast_Rep_String] - |= (bit_Fast_Rep_String - | bit_Fast_Copy_Backward - | bit_Fast_Unaligned_Load - | bit_Prefer_PMINUB_for_stringop); - break; - } - } - } - /* This spells out "AuthenticAMD". */ - else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65) - { - kind = arch_kind_amd; - - get_common_indeces (&family, &model); - - ecx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx; - - unsigned int eax; - __cpuid (0x80000000, eax, ebx, ecx, edx); - if (eax >= 0x80000001) - __cpuid (0x80000001, - __cpu_features.cpuid[COMMON_CPUID_INDEX_80000001].eax, - __cpu_features.cpuid[COMMON_CPUID_INDEX_80000001].ebx, - __cpu_features.cpuid[COMMON_CPUID_INDEX_80000001].ecx, - __cpu_features.cpuid[COMMON_CPUID_INDEX_80000001].edx); - } - else - kind = arch_kind_other; - - if (__cpu_features.max_cpuid >= 7) - __cpuid_count (7, 0, - __cpu_features.cpuid[COMMON_CPUID_INDEX_7].eax, - __cpu_features.cpuid[COMMON_CPUID_INDEX_7].ebx, - __cpu_features.cpuid[COMMON_CPUID_INDEX_7].ecx, - __cpu_features.cpuid[COMMON_CPUID_INDEX_7].edx); - - /* Can we call xgetbv? */ - if (CPUID_OSXSAVE) - { - unsigned int xcrlow; - unsigned int xcrhigh; - asm ("xgetbv" : "=a" (xcrlow), "=d" (xcrhigh) : "c" (0)); - /* Is YMM and XMM state usable? */ - if ((xcrlow & (bit_YMM_state | bit_XMM_state)) == - (bit_YMM_state | bit_XMM_state)) - { - /* Determine if AVX is usable. */ - if (CPUID_AVX) - __cpu_features.feature[index_AVX_Usable] |= bit_AVX_Usable; -#if index_AVX2_Usable != index_AVX_Fast_Unaligned_Load -# error index_AVX2_Usable != index_AVX_Fast_Unaligned_Load -#endif - /* Determine if AVX2 is usable. Unaligned load with 256-bit - AVX registers are faster on processors with AVX2. */ - if (CPUID_AVX2) - __cpu_features.feature[index_AVX2_Usable] - |= bit_AVX2_Usable | bit_AVX_Fast_Unaligned_Load; - /* Check if OPMASK state, upper 256-bit of ZMM0-ZMM15 and - ZMM16-ZMM31 state are enabled. */ - if ((xcrlow & (bit_Opmask_state | bit_ZMM0_15_state - | bit_ZMM16_31_state)) == - (bit_Opmask_state | bit_ZMM0_15_state | bit_ZMM16_31_state)) - { - /* Determine if AVX512F is usable. */ - if (CPUID_AVX512F) - { - __cpu_features.feature[index_AVX512F_Usable] - |= bit_AVX512F_Usable; - /* Determine if AVX512DQ is usable. */ - if (CPUID_AVX512DQ) - __cpu_features.feature[index_AVX512DQ_Usable] - |= bit_AVX512DQ_Usable; - } - } - /* Determine if FMA is usable. */ - if (CPUID_FMA) - __cpu_features.feature[index_FMA_Usable] |= bit_FMA_Usable; - /* Determine if FMA4 is usable. */ - if (CPUID_FMA4) - __cpu_features.feature[index_FMA4_Usable] |= bit_FMA4_Usable; - } - } - - __cpu_features.family = family; - __cpu_features.model = model; - atomic_write_barrier (); - __cpu_features.kind = kind; -} - -#undef __get_cpu_features - -const struct cpu_features * -__get_cpu_features (void) -{ - if (__cpu_features.kind == arch_kind_unknown) - __init_cpu_features (); - - return &__cpu_features; -} diff --git a/sysdeps/x86_64/multiarch/init-arch.h b/sysdeps/x86_64/multiarch/init-arch.h deleted file mode 100644 index cfc6e7049e..0000000000 --- a/sysdeps/x86_64/multiarch/init-arch.h +++ /dev/null @@ -1,206 +0,0 @@ -/* This file is part of the GNU C Library. - Copyright (C) 2008-2015 Free Software Foundation, Inc. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#define bit_Fast_Rep_String (1 << 0) -#define bit_Fast_Copy_Backward (1 << 1) -#define bit_Slow_BSF (1 << 2) -#define bit_Fast_Unaligned_Load (1 << 4) -#define bit_Prefer_PMINUB_for_stringop (1 << 5) -#define bit_AVX_Usable (1 << 6) -#define bit_FMA_Usable (1 << 7) -#define bit_FMA4_Usable (1 << 8) -#define bit_Slow_SSE4_2 (1 << 9) -#define bit_AVX2_Usable (1 << 10) -#define bit_AVX_Fast_Unaligned_Load (1 << 11) -#define bit_AVX512F_Usable (1 << 12) -#define bit_AVX512DQ_Usable (1 << 13) - -/* CPUID Feature flags. */ - -/* COMMON_CPUID_INDEX_1. */ -#define bit_SSE2 (1 << 26) -#define bit_SSSE3 (1 << 9) -#define bit_SSE4_1 (1 << 19) -#define bit_SSE4_2 (1 << 20) -#define bit_OSXSAVE (1 << 27) -#define bit_AVX (1 << 28) -#define bit_POPCOUNT (1 << 23) -#define bit_FMA (1 << 12) -#define bit_FMA4 (1 << 16) - -/* COMMON_CPUID_INDEX_7. */ -#define bit_RTM (1 << 11) -#define bit_AVX2 (1 << 5) -#define bit_AVX512F (1 << 16) -#define bit_AVX512DQ (1 << 17) - -/* XCR0 Feature flags. */ -#define bit_XMM_state (1 << 1) -#define bit_YMM_state (2 << 1) -#define bit_Opmask_state (1 << 5) -#define bit_ZMM0_15_state (1 << 6) -#define bit_ZMM16_31_state (1 << 7) - -/* The integer bit array index for the first set of internal feature bits. */ -# define FEATURE_INDEX_1 0 - -/* The current maximum size of the feature integer bit array. */ -# define FEATURE_INDEX_MAX 1 - -#ifdef __ASSEMBLER__ - -# include <ifunc-defines.h> - -# define index_SSE2 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_EDX_OFFSET -# define index_SSSE3 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET -# define index_SSE4_1 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET -# define index_SSE4_2 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET -# define index_AVX COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET -# define index_AVX2 COMMON_CPUID_INDEX_7*CPUID_SIZE+CPUID_EBX_OFFSET - -# define index_Fast_Rep_String FEATURE_INDEX_1*FEATURE_SIZE -# define index_Fast_Copy_Backward FEATURE_INDEX_1*FEATURE_SIZE -# define index_Slow_BSF FEATURE_INDEX_1*FEATURE_SIZE -# define index_Fast_Unaligned_Load FEATURE_INDEX_1*FEATURE_SIZE -# define index_Prefer_PMINUB_for_stringop FEATURE_INDEX_1*FEATURE_SIZE -# define index_AVX_Usable FEATURE_INDEX_1*FEATURE_SIZE -# define index_FMA_Usable FEATURE_INDEX_1*FEATURE_SIZE -# define index_FMA4_Usable FEATURE_INDEX_1*FEATURE_SIZE -# define index_Slow_SSE4_2 FEATURE_INDEX_1*FEATURE_SIZE -# define index_AVX2_Usable FEATURE_INDEX_1*FEATURE_SIZE -# define index_AVX_Fast_Unaligned_Load FEATURE_INDEX_1*FEATURE_SIZE -# define index_AVX512F_Usable FEATURE_INDEX_1*FEATURE_SIZE -# define index_AVX512DQ_Usable FEATURE_INDEX_1*FEATURE_SIZE - -#else /* __ASSEMBLER__ */ - -# include <sys/param.h> - -enum - { - COMMON_CPUID_INDEX_1 = 0, - COMMON_CPUID_INDEX_7, - COMMON_CPUID_INDEX_80000001, /* for AMD */ - /* Keep the following line at the end. */ - COMMON_CPUID_INDEX_MAX - }; - -extern struct cpu_features -{ - enum cpu_features_kind - { - arch_kind_unknown = 0, - arch_kind_intel, - arch_kind_amd, - arch_kind_other - } kind; - int max_cpuid; - struct cpuid_registers - { - unsigned int eax; - unsigned int ebx; - unsigned int ecx; - unsigned int edx; - } cpuid[COMMON_CPUID_INDEX_MAX]; - unsigned int family; - unsigned int model; - unsigned int feature[FEATURE_INDEX_MAX]; -} __cpu_features attribute_hidden; - - -extern void __init_cpu_features (void) attribute_hidden; -# define INIT_ARCH() \ - do \ - if (__cpu_features.kind == arch_kind_unknown) \ - __init_cpu_features (); \ - while (0) - -/* Used from outside libc.so to get access to the CPU features structure. */ -extern const struct cpu_features *__get_cpu_features (void) - __attribute__ ((const)); - -# if IS_IN (libc) -# define __get_cpu_features() (&__cpu_features) -# endif - -# define HAS_CPU_FEATURE(idx, reg, bit) \ - ((__get_cpu_features ()->cpuid[idx].reg & (bit)) != 0) - -/* Following are the feature tests used throughout libc. */ - -/* CPUID_* evaluates to true if the feature flag is enabled. - We always use &__cpu_features because the HAS_CPUID_* macros - are called only within __init_cpu_features, where we can't - call __get_cpu_features without infinite recursion. */ -# define HAS_CPUID_FLAG(idx, reg, bit) \ - (((&__cpu_features)->cpuid[idx].reg & (bit)) != 0) - -# define CPUID_OSXSAVE \ - HAS_CPUID_FLAG (COMMON_CPUID_INDEX_1, ecx, bit_OSXSAVE) -# define CPUID_AVX \ - HAS_CPUID_FLAG (COMMON_CPUID_INDEX_1, ecx, bit_AVX) -# define CPUID_FMA \ - HAS_CPUID_FLAG (COMMON_CPUID_INDEX_1, ecx, bit_FMA) -# define CPUID_FMA4 \ - HAS_CPUID_FLAG (COMMON_CPUID_INDEX_80000001, ecx, bit_FMA4) -# define CPUID_RTM \ - HAS_CPUID_FLAG (COMMON_CPUID_INDEX_7, ebx, bit_RTM) -# define CPUID_AVX2 \ - HAS_CPUID_FLAG (COMMON_CPUID_INDEX_7, ebx, bit_AVX2) -# define CPUID_AVX512F \ - HAS_CPUID_FLAG (COMMON_CPUID_INDEX_7, ebx, bit_AVX512F) -# define CPUID_AVX512DQ \ - HAS_CPUID_FLAG (COMMON_CPUID_INDEX_7, ebx, bit_AVX512DQ) - -/* HAS_* evaluates to true if we may use the feature at runtime. */ -# define HAS_SSE2 HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, edx, bit_SSE2) -# define HAS_POPCOUNT HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, bit_POPCOUNT) -# define HAS_SSSE3 HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, bit_SSSE3) -# define HAS_SSE4_1 HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, bit_SSE4_1) -# define HAS_SSE4_2 HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, bit_SSE4_2) -# define HAS_RTM HAS_CPU_FEATURE (COMMON_CPUID_INDEX_7, ebx, bit_RTM) - -# define index_Fast_Rep_String FEATURE_INDEX_1 -# define index_Fast_Copy_Backward FEATURE_INDEX_1 -# define index_Slow_BSF FEATURE_INDEX_1 -# define index_Fast_Unaligned_Load FEATURE_INDEX_1 -# define index_Prefer_PMINUB_for_stringop FEATURE_INDEX_1 -# define index_AVX_Usable FEATURE_INDEX_1 -# define index_FMA_Usable FEATURE_INDEX_1 -# define index_FMA4_Usable FEATURE_INDEX_1 -# define index_Slow_SSE4_2 FEATURE_INDEX_1 -# define index_AVX2_Usable FEATURE_INDEX_1 -# define index_AVX_Fast_Unaligned_Load FEATURE_INDEX_1 -# define index_AVX512F_Usable FEATURE_INDEX_1 -# define index_AVX512DQ_Usable FEATURE_INDEX_1 - -# define HAS_ARCH_FEATURE(name) \ - ((__get_cpu_features ()->feature[index_##name] & (bit_##name)) != 0) - -# define HAS_FAST_REP_STRING HAS_ARCH_FEATURE (Fast_Rep_String) -# define HAS_FAST_COPY_BACKWARD HAS_ARCH_FEATURE (Fast_Copy_Backward) -# define HAS_SLOW_BSF HAS_ARCH_FEATURE (Slow_BSF) -# define HAS_FAST_UNALIGNED_LOAD HAS_ARCH_FEATURE (Fast_Unaligned_Load) -# define HAS_AVX HAS_ARCH_FEATURE (AVX_Usable) -# define HAS_AVX2 HAS_ARCH_FEATURE (AVX2_Usable) -# define HAS_AVX512F HAS_ARCH_FEATURE (AVX512F_Usable) -# define HAS_AVX512DQ HAS_ARCH_FEATURE (AVX512DQ_Usable) -# define HAS_FMA HAS_ARCH_FEATURE (FMA_Usable) -# define HAS_FMA4 HAS_ARCH_FEATURE (FMA4_Usable) -# define HAS_AVX_FAST_UNALIGNED_LOAD HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) - -#endif /* __ASSEMBLER__ */ diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S index 533fece51a..786f87282c 100644 --- a/sysdeps/x86_64/multiarch/memcmp-sse4.S +++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S @@ -1,5 +1,5 @@ /* memcmp with SSE4.1, wmemcmp with SSE4.1 - Copyright (C) 2010-2015 Free Software Foundation, Inc. + Copyright (C) 2010-2016 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. diff --git a/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/sysdeps/x86_64/multiarch/memcmp-ssse3.S index 948148b1cd..a22f399e02 100644 --- a/sysdeps/x86_64/multiarch/memcmp-ssse3.S +++ b/sysdeps/x86_64/multiarch/memcmp-ssse3.S @@ -1,5 +1,5 @@ /* memcmp with SSSE3, wmemcmp with SSSE3 - Copyright (C) 2011-2015 Free Software Foundation, Inc. + Copyright (C) 2011-2016 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. diff --git a/sysdeps/x86_64/multiarch/memcmp.S b/sysdeps/x86_64/multiarch/memcmp.S index f8b46363d0..b5a1cc202e 100644 --- a/sysdeps/x86_64/multiarch/memcmp.S +++ b/sysdeps/x86_64/multiarch/memcmp.S @@ -1,6 +1,6 @@ /* Multiple versions of memcmp All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2010-2015 Free Software Foundation, Inc. + Copyright (C) 2010-2016 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. @@ -26,16 +26,13 @@ .text ENTRY(memcmp) .type memcmp, @gnu_indirect_function - cmpl $0, KIND_OFFSET+__cpu_features(%rip) - jne 1f - call __init_cpu_features - -1: testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip) + LOAD_RTLD_GLOBAL_RO_RDX + HAS_CPU_FEATURE (SSSE3) jnz 2f leaq __memcmp_sse2(%rip), %rax ret -2: testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip) +2: HAS_CPU_FEATURE (SSE4_1) jz 3f leaq __memcmp_sse4_1(%rip), %rax ret diff --git a/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S index 9f033f5456..74fed186e9 100644 --- a/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S +++ b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S @@ -1,5 +1,5 @@ /* memcpy with AVX - Copyright (C) 2014-2015 Free Software Foundation, Inc. + Copyright (C) 2014-2016 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S new file mode 100644 index 0000000000..1bb12e81b0 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S @@ -0,0 +1,408 @@ +/* memcpy optimized with AVX512 for KNL hardware. + Copyright (C) 2016 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#if defined HAVE_AVX512_ASM_SUPPORT && IS_IN (libc) \ + && (defined SHARED \ + || defined USE_AS_MEMMOVE \ + || !defined USE_MULTIARCH) + +#include "asm-syntax.h" +#ifndef MEMCPY +# define MEMCPY __memcpy_avx512_no_vzeroupper +# define MEMCPY_CHK __memcpy_chk_avx512_no_vzeroupper +#endif + + .section .text,"ax",@progbits +#if !defined USE_AS_BCOPY +ENTRY (MEMCPY_CHK) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMCPY_CHK) +#endif + +ENTRY (MEMCPY) + mov %rdi, %rax +#ifdef USE_AS_MEMPCPY + add %rdx, %rax +#endif + lea (%rsi, %rdx), %rcx + lea (%rdi, %rdx), %r9 + cmp $512, %rdx + ja L(512bytesormore) + +L(check): + cmp $16, %rdx + jbe L(less_16bytes) + cmp $256, %rdx + jb L(less_256bytes) + vmovups (%rsi), %zmm0 + vmovups 0x40(%rsi), %zmm1 + vmovups 0x80(%rsi), %zmm2 + vmovups 0xC0(%rsi), %zmm3 + vmovups -0x100(%rcx), %zmm4 + vmovups -0xC0(%rcx), %zmm5 + vmovups -0x80(%rcx), %zmm6 + vmovups -0x40(%rcx), %zmm7 + vmovups %zmm0, (%rdi) + vmovups %zmm1, 0x40(%rdi) + vmovups %zmm2, 0x80(%rdi) + vmovups %zmm3, 0xC0(%rdi) + vmovups %zmm4, -0x100(%r9) + vmovups %zmm5, -0xC0(%r9) + vmovups %zmm6, -0x80(%r9) + vmovups %zmm7, -0x40(%r9) + ret + +L(less_256bytes): + cmp $128, %dl + jb L(less_128bytes) + vmovups (%rsi), %zmm0 + vmovups 0x40(%rsi), %zmm1 + vmovups -0x80(%rcx), %zmm2 + vmovups -0x40(%rcx), %zmm3 + vmovups %zmm0, (%rdi) + vmovups %zmm1, 0x40(%rdi) + vmovups %zmm2, -0x80(%r9) + vmovups %zmm3, -0x40(%r9) + ret + +L(less_128bytes): + cmp $64, %dl + jb L(less_64bytes) + vmovdqu (%rsi), %ymm0 + vmovdqu 0x20(%rsi), %ymm1 + vmovdqu -0x40(%rcx), %ymm2 + vmovdqu -0x20(%rcx), %ymm3 + vmovdqu %ymm0, (%rdi) + vmovdqu %ymm1, 0x20(%rdi) + vmovdqu %ymm2, -0x40(%r9) + vmovdqu %ymm3, -0x20(%r9) + ret + +L(less_64bytes): + cmp $32, %dl + jb L(less_32bytes) + vmovdqu (%rsi), %ymm0 + vmovdqu -0x20(%rcx), %ymm1 + vmovdqu %ymm0, (%rdi) + vmovdqu %ymm1, -0x20(%r9) + ret + +L(less_32bytes): + vmovdqu (%rsi), %xmm0 + vmovdqu -0x10(%rcx), %xmm1 + vmovdqu %xmm0, (%rdi) + vmovdqu %xmm1, -0x10(%r9) + ret + +L(less_16bytes): + cmp $8, %dl + jb L(less_8bytes) + movq (%rsi), %rsi + movq -0x8(%rcx), %rcx + movq %rsi, (%rdi) + movq %rcx, -0x8(%r9) + ret + +L(less_8bytes): + cmp $4, %dl + jb L(less_4bytes) + mov (%rsi), %esi + mov -0x4(%rcx), %ecx + mov %esi, (%rdi) + mov %ecx, -0x4(%r9) + ret + +L(less_4bytes): + cmp $2, %dl + jb L(less_2bytes) + mov (%rsi), %si + mov -0x2(%rcx), %cx + mov %si, (%rdi) + mov %cx, -0x2(%r9) + ret + +L(less_2bytes): + cmp $1, %dl + jb L(less_1bytes) + mov (%rsi), %cl + mov %cl, (%rdi) +L(less_1bytes): + ret + +L(512bytesormore): +#ifdef SHARED_CACHE_SIZE_HALF + mov $SHARED_CACHE_SIZE_HALF, %r8 +#else + mov __x86_shared_cache_size_half(%rip), %r8 +#endif + cmp %r8, %rdx + jae L(preloop_large) + cmp $1024, %rdx + ja L(1024bytesormore) + prefetcht1 (%rsi) + prefetcht1 0x40(%rsi) + prefetcht1 0x80(%rsi) + prefetcht1 0xC0(%rsi) + prefetcht1 0x100(%rsi) + prefetcht1 0x140(%rsi) + prefetcht1 0x180(%rsi) + prefetcht1 0x1C0(%rsi) + prefetcht1 -0x200(%rcx) + prefetcht1 -0x1C0(%rcx) + prefetcht1 -0x180(%rcx) + prefetcht1 -0x140(%rcx) + prefetcht1 -0x100(%rcx) + prefetcht1 -0xC0(%rcx) + prefetcht1 -0x80(%rcx) + prefetcht1 -0x40(%rcx) + vmovups (%rsi), %zmm0 + vmovups 0x40(%rsi), %zmm1 + vmovups 0x80(%rsi), %zmm2 + vmovups 0xC0(%rsi), %zmm3 + vmovups 0x100(%rsi), %zmm4 + vmovups 0x140(%rsi), %zmm5 + vmovups 0x180(%rsi), %zmm6 + vmovups 0x1C0(%rsi), %zmm7 + vmovups -0x200(%rcx), %zmm8 + vmovups -0x1C0(%rcx), %zmm9 + vmovups -0x180(%rcx), %zmm10 + vmovups -0x140(%rcx), %zmm11 + vmovups -0x100(%rcx), %zmm12 + vmovups -0xC0(%rcx), %zmm13 + vmovups -0x80(%rcx), %zmm14 + vmovups -0x40(%rcx), %zmm15 + vmovups %zmm0, (%rdi) + vmovups %zmm1, 0x40(%rdi) + vmovups %zmm2, 0x80(%rdi) + vmovups %zmm3, 0xC0(%rdi) + vmovups %zmm4, 0x100(%rdi) + vmovups %zmm5, 0x140(%rdi) + vmovups %zmm6, 0x180(%rdi) + vmovups %zmm7, 0x1C0(%rdi) + vmovups %zmm8, -0x200(%r9) + vmovups %zmm9, -0x1C0(%r9) + vmovups %zmm10, -0x180(%r9) + vmovups %zmm11, -0x140(%r9) + vmovups %zmm12, -0x100(%r9) + vmovups %zmm13, -0xC0(%r9) + vmovups %zmm14, -0x80(%r9) + vmovups %zmm15, -0x40(%r9) + ret + +L(1024bytesormore): + cmp %rsi, %rdi + ja L(1024bytesormore_bkw) + sub $512, %r9 + vmovups -0x200(%rcx), %zmm8 + vmovups -0x1C0(%rcx), %zmm9 + vmovups -0x180(%rcx), %zmm10 + vmovups -0x140(%rcx), %zmm11 + vmovups -0x100(%rcx), %zmm12 + vmovups -0xC0(%rcx), %zmm13 + vmovups -0x80(%rcx), %zmm14 + vmovups -0x40(%rcx), %zmm15 + prefetcht1 (%rsi) + prefetcht1 0x40(%rsi) + prefetcht1 0x80(%rsi) + prefetcht1 0xC0(%rsi) + prefetcht1 0x100(%rsi) + prefetcht1 0x140(%rsi) + prefetcht1 0x180(%rsi) + prefetcht1 0x1C0(%rsi) + +/* Loop with unaligned memory access. */ +L(gobble_512bytes_loop): + vmovups (%rsi), %zmm0 + vmovups 0x40(%rsi), %zmm1 + vmovups 0x80(%rsi), %zmm2 + vmovups 0xC0(%rsi), %zmm3 + vmovups 0x100(%rsi), %zmm4 + vmovups 0x140(%rsi), %zmm5 + vmovups 0x180(%rsi), %zmm6 + vmovups 0x1C0(%rsi), %zmm7 + add $512, %rsi + prefetcht1 (%rsi) + prefetcht1 0x40(%rsi) + prefetcht1 0x80(%rsi) + prefetcht1 0xC0(%rsi) + prefetcht1 0x100(%rsi) + prefetcht1 0x140(%rsi) + prefetcht1 0x180(%rsi) + prefetcht1 0x1C0(%rsi) + vmovups %zmm0, (%rdi) + vmovups %zmm1, 0x40(%rdi) + vmovups %zmm2, 0x80(%rdi) + vmovups %zmm3, 0xC0(%rdi) + vmovups %zmm4, 0x100(%rdi) + vmovups %zmm5, 0x140(%rdi) + vmovups %zmm6, 0x180(%rdi) + vmovups %zmm7, 0x1C0(%rdi) + add $512, %rdi + cmp %r9, %rdi + jb L(gobble_512bytes_loop) + vmovups %zmm8, (%r9) + vmovups %zmm9, 0x40(%r9) + vmovups %zmm10, 0x80(%r9) + vmovups %zmm11, 0xC0(%r9) + vmovups %zmm12, 0x100(%r9) + vmovups %zmm13, 0x140(%r9) + vmovups %zmm14, 0x180(%r9) + vmovups %zmm15, 0x1C0(%r9) + ret + +L(1024bytesormore_bkw): + add $512, %rdi + vmovups 0x1C0(%rsi), %zmm8 + vmovups 0x180(%rsi), %zmm9 + vmovups 0x140(%rsi), %zmm10 + vmovups 0x100(%rsi), %zmm11 + vmovups 0xC0(%rsi), %zmm12 + vmovups 0x80(%rsi), %zmm13 + vmovups 0x40(%rsi), %zmm14 + vmovups (%rsi), %zmm15 + prefetcht1 -0x40(%rcx) + prefetcht1 -0x80(%rcx) + prefetcht1 -0xC0(%rcx) + prefetcht1 -0x100(%rcx) + prefetcht1 -0x140(%rcx) + prefetcht1 -0x180(%rcx) + prefetcht1 -0x1C0(%rcx) + prefetcht1 -0x200(%rcx) + +/* Backward loop with unaligned memory access. */ +L(gobble_512bytes_loop_bkw): + vmovups -0x40(%rcx), %zmm0 + vmovups -0x80(%rcx), %zmm1 + vmovups -0xC0(%rcx), %zmm2 + vmovups -0x100(%rcx), %zmm3 + vmovups -0x140(%rcx), %zmm4 + vmovups -0x180(%rcx), %zmm5 + vmovups -0x1C0(%rcx), %zmm6 + vmovups -0x200(%rcx), %zmm7 + sub $512, %rcx + prefetcht1 -0x40(%rcx) + prefetcht1 -0x80(%rcx) + prefetcht1 -0xC0(%rcx) + prefetcht1 -0x100(%rcx) + prefetcht1 -0x140(%rcx) + prefetcht1 -0x180(%rcx) + prefetcht1 -0x1C0(%rcx) + prefetcht1 -0x200(%rcx) + vmovups %zmm0, -0x40(%r9) + vmovups %zmm1, -0x80(%r9) + vmovups %zmm2, -0xC0(%r9) + vmovups %zmm3, -0x100(%r9) + vmovups %zmm4, -0x140(%r9) + vmovups %zmm5, -0x180(%r9) + vmovups %zmm6, -0x1C0(%r9) + vmovups %zmm7, -0x200(%r9) + sub $512, %r9 + cmp %rdi, %r9 + ja L(gobble_512bytes_loop_bkw) + vmovups %zmm8, -0x40(%rdi) + vmovups %zmm9, -0x80(%rdi) + vmovups %zmm10, -0xC0(%rdi) + vmovups %zmm11, -0x100(%rdi) + vmovups %zmm12, -0x140(%rdi) + vmovups %zmm13, -0x180(%rdi) + vmovups %zmm14, -0x1C0(%rdi) + vmovups %zmm15, -0x200(%rdi) + ret + +L(preloop_large): + cmp %rsi, %rdi + ja L(preloop_large_bkw) + vmovups (%rsi), %zmm4 + vmovups 0x40(%rsi), %zmm5 + +/* Align destination for access with non-temporal stores in the loop. */ + mov %rdi, %r8 + and $-0x80, %rdi + add $0x80, %rdi + sub %rdi, %r8 + sub %r8, %rsi + add %r8, %rdx +L(gobble_256bytes_nt_loop): + prefetcht1 0x200(%rsi) + prefetcht1 0x240(%rsi) + prefetcht1 0x280(%rsi) + prefetcht1 0x2C0(%rsi) + prefetcht1 0x300(%rsi) + prefetcht1 0x340(%rsi) + prefetcht1 0x380(%rsi) + prefetcht1 0x3C0(%rsi) + vmovdqu64 (%rsi), %zmm0 + vmovdqu64 0x40(%rsi), %zmm1 + vmovdqu64 0x80(%rsi), %zmm2 + vmovdqu64 0xC0(%rsi), %zmm3 + vmovntdq %zmm0, (%rdi) + vmovntdq %zmm1, 0x40(%rdi) + vmovntdq %zmm2, 0x80(%rdi) + vmovntdq %zmm3, 0xC0(%rdi) + sub $256, %rdx + add $256, %rsi + add $256, %rdi + cmp $256, %rdx + ja L(gobble_256bytes_nt_loop) + sfence + vmovups %zmm4, (%rax) + vmovups %zmm5, 0x40(%rax) + jmp L(check) + +L(preloop_large_bkw): + vmovups -0x80(%rcx), %zmm4 + vmovups -0x40(%rcx), %zmm5 + +/* Align end of destination for access with non-temporal stores. */ + mov %r9, %r8 + and $-0x80, %r9 + sub %r9, %r8 + sub %r8, %rcx + sub %r8, %rdx + add %r9, %r8 +L(gobble_256bytes_nt_loop_bkw): + prefetcht1 -0x400(%rcx) + prefetcht1 -0x3C0(%rcx) + prefetcht1 -0x380(%rcx) + prefetcht1 -0x340(%rcx) + prefetcht1 -0x300(%rcx) + prefetcht1 -0x2C0(%rcx) + prefetcht1 -0x280(%rcx) + prefetcht1 -0x240(%rcx) + vmovdqu64 -0x100(%rcx), %zmm0 + vmovdqu64 -0xC0(%rcx), %zmm1 + vmovdqu64 -0x80(%rcx), %zmm2 + vmovdqu64 -0x40(%rcx), %zmm3 + vmovntdq %zmm0, -0x100(%r9) + vmovntdq %zmm1, -0xC0(%r9) + vmovntdq %zmm2, -0x80(%r9) + vmovntdq %zmm3, -0x40(%r9) + sub $256, %rdx + sub $256, %rcx + sub $256, %r9 + cmp $256, %rdx + ja L(gobble_256bytes_nt_loop_bkw) + sfence + vmovups %zmm4, -0x80(%r8) + vmovups %zmm5, -0x40(%r8) + jmp L(check) +END (MEMCPY) +#endif diff --git a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S index c5450af25a..c4509831fa 100644 --- a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S +++ b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S @@ -1,5 +1,5 @@ /* memcpy with unaliged loads - Copyright (C) 2013-2015 Free Software Foundation, Inc. + Copyright (C) 2013-2016 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -16,6 +16,8 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ +#if IS_IN (libc) + #include <sysdep.h> #include "asm-syntax.h" @@ -169,3 +171,5 @@ L(between_5_8): movl %eax, -4(%rdi,%rdx) jmp L(return) END(__memcpy_sse2_unaligned) + +#endif diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S index 30e0d1c575..08b41e9e5a 100644 --- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S +++ b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S @@ -1,5 +1,5 @@ /* memcpy with SSSE3 and REP string - Copyright (C) 2010-2015 Free Software Foundation, Inc. + Copyright (C) 2010-2016 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S index 33cc493dd4..95de9695f9 100644 --- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S +++ b/sysdeps/x86_64/multiarch/memcpy-ssse3.S @@ -1,5 +1,5 @@ /* memcpy with SSSE3 - Copyright (C) 2010-2015 Free Software Foundation, Inc. + Copyright (C) 2010-2016 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S index 4e18cd3070..64a1bcd137 100644 --- a/sysdeps/x86_64/multiarch/memcpy.S +++ b/sysdeps/x86_64/multiarch/memcpy.S @@ -1,6 +1,6 @@ /* Multiple versions of memcpy All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2010-2015 Free Software Foundation, Inc. + Copyright (C) 2010-2016 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. @@ -29,22 +29,28 @@ .text ENTRY(__new_memcpy) .type __new_memcpy, @gnu_indirect_function - cmpl $0, KIND_OFFSET+__cpu_features(%rip) - jne 1f - call __init_cpu_features + LOAD_RTLD_GLOBAL_RO_RDX +#ifdef HAVE_AVX512_ASM_SUPPORT + HAS_ARCH_FEATURE (AVX512F_Usable) + jz 1f + HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) + jz 1f + leaq __memcpy_avx512_no_vzeroupper(%rip), %rax + ret +#endif 1: leaq __memcpy_avx_unaligned(%rip), %rax - testl $bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip) - jz 1f + HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) + jz 2f ret -1: leaq __memcpy_sse2(%rip), %rax - testl $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip) - jnz 2f +2: leaq __memcpy_sse2(%rip), %rax + HAS_ARCH_FEATURE (Slow_BSF) + jnz 3f leaq __memcpy_sse2_unaligned(%rip), %rax ret -2: testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip) - jz 3f +3: HAS_CPU_FEATURE (SSSE3) + jz 4f leaq __memcpy_ssse3(%rip), %rax -3: ret +4: ret END(__new_memcpy) # undef ENTRY diff --git a/sysdeps/x86_64/multiarch/memcpy_chk.S b/sysdeps/x86_64/multiarch/memcpy_chk.S index 1e756ea0c2..648217e971 100644 --- a/sysdeps/x86_64/multiarch/memcpy_chk.S +++ b/sysdeps/x86_64/multiarch/memcpy_chk.S @@ -1,6 +1,6 @@ /* Multiple versions of __memcpy_chk All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2010-2015 Free Software Foundation, Inc. + Copyright (C) 2010-2016 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. @@ -29,17 +29,23 @@ .text ENTRY(__memcpy_chk) .type __memcpy_chk, @gnu_indirect_function - cmpl $0, KIND_OFFSET+__cpu_features(%rip) - jne 1f - call __init_cpu_features + LOAD_RTLD_GLOBAL_RO_RDX +#ifdef HAVE_AVX512_ASM_SUPPORT + HAS_ARCH_FEATURE (AVX512F_Usable) + jz 1f + HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) + jz 1f + leaq __memcpy_chk_avx512_no_vzeroupper(%rip), %rax + ret +#endif 1: leaq __memcpy_chk_sse2(%rip), %rax - testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip) + HAS_CPU_FEATURE (SSSE3) jz 2f leaq __memcpy_chk_ssse3(%rip), %rax - testl $bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip) + HAS_ARCH_FEATURE (Fast_Copy_Backward) jz 2f leaq __memcpy_chk_ssse3_back(%rip), %rax - testl $bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip) + HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) jz 2f leaq __memcpy_chk_avx_unaligned(%rip), %rax 2: ret diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned.S index 01eac94889..75e35f2957 100644 --- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned.S +++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned.S @@ -1,5 +1,5 @@ /* memmove with AVX - Copyright (C) 2014-2015 Free Software Foundation, Inc. + Copyright (C) 2014-2016 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S new file mode 100644 index 0000000000..518d1fec35 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S @@ -0,0 +1,22 @@ +/* memmove optimized with AVX512 for KNL hardware. + Copyright (C) 2016 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define USE_AS_MEMMOVE +#define MEMCPY __memmove_avx512_no_vzeroupper +#define MEMCPY_CHK __memmove_chk_avx512_no_vzeroupper +#include "memcpy-avx512-no-vzeroupper.S" diff --git a/sysdeps/x86_64/multiarch/memmove.c b/sysdeps/x86_64/multiarch/memmove.c index dd153a3eaa..8da5640bb0 100644 --- a/sysdeps/x86_64/multiarch/memmove.c +++ b/sysdeps/x86_64/multiarch/memmove.c @@ -1,6 +1,6 @@ /* Multiple versions of memmove. All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2010-2015 Free Software Foundation, Inc. + Copyright (C) 2010-2016 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -36,6 +36,9 @@ extern __typeof (__redirect_memmove) __memmove_sse2 attribute_hidden; extern __typeof (__redirect_memmove) __memmove_ssse3 attribute_hidden; extern __typeof (__redirect_memmove) __memmove_ssse3_back attribute_hidden; extern __typeof (__redirect_memmove) __memmove_avx_unaligned attribute_hidden; +# ifdef HAVE_AVX512_ASM_SUPPORT + extern __typeof (__redirect_memmove) __memmove_avx512_no_vzeroupper attribute_hidden; +# endif #endif @@ -49,12 +52,18 @@ extern __typeof (__redirect_memmove) __memmove_avx_unaligned attribute_hidden; ifunc symbol properly. */ extern __typeof (__redirect_memmove) __libc_memmove; libc_ifunc (__libc_memmove, - HAS_AVX_FAST_UNALIGNED_LOAD +#ifdef HAVE_AVX512_ASM_SUPPORT + HAS_ARCH_FEATURE (AVX512F_Usable) + && HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) + ? __memmove_avx512_no_vzeroupper + : +#endif + (HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) ? __memmove_avx_unaligned - : (HAS_SSSE3 - ? (HAS_FAST_COPY_BACKWARD + : (HAS_CPU_FEATURE (SSSE3) + ? (HAS_ARCH_FEATURE (Fast_Copy_Backward) ? __memmove_ssse3_back : __memmove_ssse3) - : __memmove_sse2)); + : __memmove_sse2))); strong_alias (__libc_memmove, memmove) diff --git a/sysdeps/x86_64/multiarch/memmove_chk.c b/sysdeps/x86_64/multiarch/memmove_chk.c index 8b12d002dc..f64da63180 100644 --- a/sysdeps/x86_64/multiarch/memmove_chk.c +++ b/sysdeps/x86_64/multiarch/memmove_chk.c @@ -1,6 +1,6 @@ /* Multiple versions of __memmove_chk. All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2010-2015 Free Software Foundation, Inc. + Copyright (C) 2010-2016 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -26,12 +26,21 @@ extern __typeof (__memmove_chk) __memmove_chk_sse2 attribute_hidden; extern __typeof (__memmove_chk) __memmove_chk_ssse3 attribute_hidden; extern __typeof (__memmove_chk) __memmove_chk_ssse3_back attribute_hidden; extern __typeof (__memmove_chk) __memmove_chk_avx_unaligned attribute_hidden; +# ifdef HAVE_AVX512_ASM_SUPPORT + extern __typeof (__memmove_chk) __memmove_chk_avx512_no_vzeroupper attribute_hidden; +# endif #include "debug/memmove_chk.c" libc_ifunc (__memmove_chk, - HAS_AVX_FAST_UNALIGNED_LOAD ? __memmove_chk_avx_unaligned : - (HAS_SSSE3 - ? (HAS_FAST_COPY_BACKWARD +#ifdef HAVE_AVX512_ASM_SUPPORT + HAS_ARCH_FEATURE (AVX512F_Usable) + && HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) + ? __memmove_chk_avx512_no_vzeroupper + : +#endif + HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) ? __memmove_chk_avx_unaligned : + (HAS_CPU_FEATURE (SSSE3) + ? (HAS_ARCH_FEATURE (Fast_Copy_Backward) ? __memmove_chk_ssse3_back : __memmove_chk_ssse3) : __memmove_chk_sse2)); diff --git a/sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S b/sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S index 128ff832fb..241378e770 100644 --- a/sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S +++ b/sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S @@ -1,5 +1,5 @@ /* mempcpy with AVX - Copyright (C) 2014-2015 Free Software Foundation, Inc. + Copyright (C) 2014-2016 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/multiarch/mempcpy-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/mempcpy-avx512-no-vzeroupper.S new file mode 100644 index 0000000000..fcc0945ea7 --- /dev/null +++ b/sysdeps/x86_64/multiarch/mempcpy-avx512-no-vzeroupper.S @@ -0,0 +1,22 @@ +/* mempcpy optimized with AVX512 for KNL hardware. + Copyright (C) 2016 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define USE_AS_MEMPCPY +#define MEMCPY __mempcpy_avx512_no_vzeroupper +#define MEMCPY_CHK __mempcpy_chk_avx512_no_vzeroupper +#include "memcpy-avx512-no-vzeroupper.S" diff --git a/sysdeps/x86_64/multiarch/mempcpy.S b/sysdeps/x86_64/multiarch/mempcpy.S index 2eaacdf049..ed78623565 100644 --- a/sysdeps/x86_64/multiarch/mempcpy.S +++ b/sysdeps/x86_64/multiarch/mempcpy.S @@ -1,6 +1,6 @@ /* Multiple versions of mempcpy All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2010-2015 Free Software Foundation, Inc. + Copyright (C) 2010-2016 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. @@ -27,17 +27,23 @@ #if defined SHARED && IS_IN (libc) ENTRY(__mempcpy) .type __mempcpy, @gnu_indirect_function - cmpl $0, KIND_OFFSET+__cpu_features(%rip) - jne 1f - call __init_cpu_features + LOAD_RTLD_GLOBAL_RO_RDX +#ifdef HAVE_AVX512_ASM_SUPPORT + HAS_ARCH_FEATURE (AVX512F_Usable) + jz 1f + HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) + jz 1f + leaq __mempcpy_avx512_no_vzeroupper(%rip), %rax + ret +#endif 1: leaq __mempcpy_sse2(%rip), %rax - testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip) + HAS_CPU_FEATURE (SSSE3) jz 2f leaq __mempcpy_ssse3(%rip), %rax - testl $bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip) + HAS_ARCH_FEATURE (Fast_Copy_Backward) jz 2f leaq __mempcpy_ssse3_back(%rip), %rax - testl $bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip) + HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) jz 2f leaq __mempcpy_avx_unaligned(%rip), %rax 2: ret diff --git a/sysdeps/x86_64/multiarch/mempcpy_chk.S b/sysdeps/x86_64/multiarch/mempcpy_chk.S index 17b84701b0..6e8a89d38c 100644 --- a/sysdeps/x86_64/multiarch/mempcpy_chk.S +++ b/sysdeps/x86_64/multiarch/mempcpy_chk.S @@ -1,6 +1,6 @@ /* Multiple versions of __mempcpy_chk All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2010-2015 Free Software Foundation, Inc. + Copyright (C) 2010-2016 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. @@ -29,17 +29,23 @@ .text ENTRY(__mempcpy_chk) .type __mempcpy_chk, @gnu_indirect_function - cmpl $0, KIND_OFFSET+__cpu_features(%rip) - jne 1f - call __init_cpu_features + LOAD_RTLD_GLOBAL_RO_RDX +#ifdef HAVE_AVX512_ASM_SUPPORT + HAS_ARCH_FEATURE (AVX512F_Usable) + jz 1f + HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) + jz 1f + leaq __mempcpy_chk_avx512_no_vzeroupper(%rip), %rax + ret +#endif 1: leaq __mempcpy_chk_sse2(%rip), %rax - testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip) + HAS_CPU_FEATURE (SSSE3) jz 2f leaq __mempcpy_chk_ssse3(%rip), %rax - testl $bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip) + HAS_ARCH_FEATURE (Fast_Copy_Backward) jz 2f leaq __mempcpy_chk_ssse3_back(%rip), %rax - testl $bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip) + HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) jz 2f leaq __mempcpy_chk_avx_unaligned(%rip), %rax 2: ret diff --git a/sysdeps/x86_64/multiarch/memset-avx2.S b/sysdeps/x86_64/multiarch/memset-avx2.S index 28eabade35..df634728d4 100644 --- a/sysdeps/x86_64/multiarch/memset-avx2.S +++ b/sysdeps/x86_64/multiarch/memset-avx2.S @@ -1,5 +1,5 @@ /* memset with AVX2 - Copyright (C) 2014-2015 Free Software Foundation, Inc. + Copyright (C) 2014-2016 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S new file mode 100644 index 0000000000..1e638d7ac2 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S @@ -0,0 +1,194 @@ +/* memset optimized with AVX512 for KNL hardware. + Copyright (C) 2015-2016 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#if defined HAVE_AVX512_ASM_SUPPORT && IS_IN (libc) + +#include "asm-syntax.h" +#ifndef MEMSET +# define MEMSET __memset_avx512_no_vzeroupper +# define MEMSET_CHK __memset_chk_avx512_no_vzeroupper +#endif + + .section .text,"ax",@progbits +#if defined PIC +ENTRY (MEMSET_CHK) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMSET_CHK) +#endif + +ENTRY (MEMSET) + vpxor %xmm0, %xmm0, %xmm0 + vmovd %esi, %xmm1 + lea (%rdi, %rdx), %rsi + mov %rdi, %rax + vpshufb %xmm0, %xmm1, %xmm0 + cmp $16, %rdx + jb L(less_16bytes) + cmp $512, %rdx + vbroadcastss %xmm0, %zmm2 + ja L(512bytesormore) + cmp $256, %rdx + jb L(less_256bytes) + vmovups %zmm2, (%rdi) + vmovups %zmm2, 0x40(%rdi) + vmovups %zmm2, 0x80(%rdi) + vmovups %zmm2, 0xC0(%rdi) + vmovups %zmm2, -0x100(%rsi) + vmovups %zmm2, -0xC0(%rsi) + vmovups %zmm2, -0x80(%rsi) + vmovups %zmm2, -0x40(%rsi) + ret + +L(less_256bytes): + cmp $128, %dl + jb L(less_128bytes) + vmovups %zmm2, (%rdi) + vmovups %zmm2, 0x40(%rdi) + vmovups %zmm2, -0x80(%rsi) + vmovups %zmm2, -0x40(%rsi) + ret + +L(less_128bytes): + cmp $64, %dl + jb L(less_64bytes) + vmovups %zmm2, (%rdi) + vmovups %zmm2, -0x40(%rsi) + ret + +L(less_64bytes): + cmp $32, %dl + jb L(less_32bytes) + vmovdqu %ymm2, (%rdi) + vmovdqu %ymm2, -0x20(%rsi) + ret + +L(less_32bytes): + vmovdqu %xmm0, (%rdi) + vmovdqu %xmm0, -0x10(%rsi) + ret + +L(less_16bytes): + cmp $8, %dl + jb L(less_8bytes) + vmovq %xmm0, (%rdi) + vmovq %xmm0, -0x08(%rsi) + ret + +L(less_8bytes): + vmovd %xmm0, %ecx + cmp $4, %dl + jb L(less_4bytes) + mov %ecx, (%rdi) + mov %ecx, -0x04(%rsi) + ret + +L(less_4bytes): + cmp $2, %dl + jb L(less_2bytes) + mov %cx, (%rdi) + mov %cx, -0x02(%rsi) + ret + +L(less_2bytes): + cmp $1, %dl + jb L(less_1bytes) + mov %cl, (%rdi) +L(less_1bytes): + ret + +L(512bytesormore): + mov __x86_shared_cache_size_half(%rip), %rcx + cmp %rcx, %rdx + ja L(preloop_large) + cmp $1024, %rdx + ja L(1024bytesormore) + + vmovups %zmm2, (%rdi) + vmovups %zmm2, 0x40(%rdi) + vmovups %zmm2, 0x80(%rdi) + vmovups %zmm2, 0xC0(%rdi) + vmovups %zmm2, 0x100(%rdi) + vmovups %zmm2, 0x140(%rdi) + vmovups %zmm2, 0x180(%rdi) + vmovups %zmm2, 0x1C0(%rdi) + vmovups %zmm2, -0x200(%rsi) + vmovups %zmm2, -0x1C0(%rsi) + vmovups %zmm2, -0x180(%rsi) + vmovups %zmm2, -0x140(%rsi) + vmovups %zmm2, -0x100(%rsi) + vmovups %zmm2, -0xC0(%rsi) + vmovups %zmm2, -0x80(%rsi) + vmovups %zmm2, -0x40(%rsi) + ret + +/* Align on 64 and loop with aligned stores. */ +L(1024bytesormore): + sub $0x100, %rsi + vmovups %zmm2, (%rax) + and $-0x40, %rdi + add $0x40, %rdi + +L(gobble_256bytes_loop): + vmovaps %zmm2, (%rdi) + vmovaps %zmm2, 0x40(%rdi) + vmovaps %zmm2, 0x80(%rdi) + vmovaps %zmm2, 0xC0(%rdi) + add $0x100, %rdi + cmp %rsi, %rdi + jb L(gobble_256bytes_loop) + vmovups %zmm2, (%rsi) + vmovups %zmm2, 0x40(%rsi) + vmovups %zmm2, 0x80(%rsi) + vmovups %zmm2, 0xC0(%rsi) + ret + +/* Align on 128 and loop with non-temporal stores. */ +L(preloop_large): + and $-0x80, %rdi + add $0x80, %rdi + vmovups %zmm2, (%rax) + vmovups %zmm2, 0x40(%rax) + sub $0x200, %rsi + +L(gobble_512bytes_nt_loop): + vmovntdq %zmm2, (%rdi) + vmovntdq %zmm2, 0x40(%rdi) + vmovntdq %zmm2, 0x80(%rdi) + vmovntdq %zmm2, 0xC0(%rdi) + vmovntdq %zmm2, 0x100(%rdi) + vmovntdq %zmm2, 0x140(%rdi) + vmovntdq %zmm2, 0x180(%rdi) + vmovntdq %zmm2, 0x1C0(%rdi) + add $0x200, %rdi + cmp %rsi, %rdi + jb L(gobble_512bytes_nt_loop) + sfence + vmovups %zmm2, (%rsi) + vmovups %zmm2, 0x40(%rsi) + vmovups %zmm2, 0x80(%rsi) + vmovups %zmm2, 0xC0(%rsi) + vmovups %zmm2, 0x100(%rsi) + vmovups %zmm2, 0x140(%rsi) + vmovups %zmm2, 0x180(%rsi) + vmovups %zmm2, 0x1C0(%rsi) + ret +END (MEMSET) +#endif diff --git a/sysdeps/x86_64/multiarch/memset.S b/sysdeps/x86_64/multiarch/memset.S index c5f1fb340e..8e3b9b9764 100644 --- a/sysdeps/x86_64/multiarch/memset.S +++ b/sysdeps/x86_64/multiarch/memset.S @@ -1,6 +1,6 @@ /* Multiple versions of memset All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2014-2015 Free Software Foundation, Inc. + Copyright (C) 2014-2016 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -17,45 +17,48 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#ifdef HAVE_AVX2_SUPPORT #include <sysdep.h> #include <shlib-compat.h> #include <init-arch.h> /* Define multiple versions only for the definition in lib. */ -# if IS_IN (libc) +#if IS_IN (libc) ENTRY(memset) .type memset, @gnu_indirect_function - cmpl $0, __cpu_features+KIND_OFFSET(%rip) - jne 1f - call __init_cpu_features -1: leaq __memset_sse2(%rip), %rax - testl $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip) + LOAD_RTLD_GLOBAL_RO_RDX + leaq __memset_sse2(%rip), %rax + HAS_ARCH_FEATURE (AVX2_Usable) jz 2f leaq __memset_avx2(%rip), %rax +#ifdef HAVE_AVX512_ASM_SUPPORT + HAS_ARCH_FEATURE (AVX512F_Usable) + jz 2f + HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) + jz 2f + leaq __memset_avx512_no_vzeroupper(%rip), %rax +#endif 2: ret END(memset) -# endif +#endif -# if IS_IN (libc) -# undef memset -# define memset __memset_sse2 +#if IS_IN (libc) +# undef memset +# define memset __memset_sse2 -# undef __memset_chk -# define __memset_chk __memset_chk_sse2 +# undef __memset_chk +# define __memset_chk __memset_chk_sse2 -# ifdef SHARED -# undef libc_hidden_builtin_def +# ifdef SHARED +# undef libc_hidden_builtin_def /* It doesn't make sense to send libc-internal memset calls through a PLT. The speedup we get from using GPR instruction is likely eaten away by the indirect call in the PLT. */ -# define libc_hidden_builtin_def(name) \ +# define libc_hidden_builtin_def(name) \ .globl __GI_memset; __GI_memset = __memset_sse2 -# endif - -# undef strong_alias -# define strong_alias(original, alias) # endif + +# undef strong_alias +# define strong_alias(original, alias) #endif #include "../memset.S" diff --git a/sysdeps/x86_64/multiarch/memset_chk.S b/sysdeps/x86_64/multiarch/memset_chk.S index 64fed3118a..9a7b270274 100644 --- a/sysdeps/x86_64/multiarch/memset_chk.S +++ b/sysdeps/x86_64/multiarch/memset_chk.S @@ -1,6 +1,6 @@ /* Multiple versions of memset_chk All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2014-2015 Free Software Foundation, Inc. + Copyright (C) 2014-2016 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -22,16 +22,21 @@ /* Define multiple versions only for the definition in lib. */ #if IS_IN (libc) -# if defined SHARED && defined HAVE_AVX2_SUPPORT +# ifdef SHARED ENTRY(__memset_chk) .type __memset_chk, @gnu_indirect_function - cmpl $0, __cpu_features+KIND_OFFSET(%rip) - jne 1f - call __init_cpu_features -1: leaq __memset_chk_sse2(%rip), %rax - testl $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip) + LOAD_RTLD_GLOBAL_RO_RDX + leaq __memset_chk_sse2(%rip), %rax + HAS_ARCH_FEATURE (AVX2_Usable) jz 2f leaq __memset_chk_avx2(%rip), %rax +#ifdef HAVE_AVX512_ASM_SUPPORT + HAS_ARCH_FEATURE (AVX512F_Usable) + jz 2f + HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) + jz 2f + leaq __memset_chk_avx512_no_vzeroupper(%rip), %rax +#endif 2: ret END(__memset_chk) diff --git a/sysdeps/x86_64/multiarch/rtld-memcmp.c b/sysdeps/x86_64/multiarch/rtld-memcmp.c deleted file mode 100644 index 0f271356c2..0000000000 --- a/sysdeps/x86_64/multiarch/rtld-memcmp.c +++ /dev/null @@ -1 +0,0 @@ -#include "../rtld-memcmp.c" diff --git a/sysdeps/x86_64/multiarch/rtld-memset.S b/sysdeps/x86_64/multiarch/rtld-memset.S deleted file mode 100644 index 8092aa07da..0000000000 --- a/sysdeps/x86_64/multiarch/rtld-memset.S +++ /dev/null @@ -1 +0,0 @@ -#include "../rtld-memset.S" diff --git a/sysdeps/x86_64/multiarch/sched_cpucount.c b/sysdeps/x86_64/multiarch/sched_cpucount.c index 72ad7b01a8..b75aeb79b2 100644 --- a/sysdeps/x86_64/multiarch/sched_cpucount.c +++ b/sysdeps/x86_64/multiarch/sched_cpucount.c @@ -1,6 +1,6 @@ /* Count bits in CPU set. x86-64 multi-arch version. This file is part of the GNU C Library. - Copyright (C) 2008-2015 Free Software Foundation, Inc. + Copyright (C) 2008-2016 Free Software Foundation, Inc. Contributed by Ulrich Drepper <drepper@redhat.com>. The GNU C Library is free software; you can redistribute it and/or @@ -33,4 +33,4 @@ #undef __sched_cpucount libc_ifunc (__sched_cpucount, - HAS_POPCOUNT ? popcount_cpucount : generic_cpucount); + HAS_CPU_FEATURE (POPCOUNT) ? popcount_cpucount : generic_cpucount); diff --git a/sysdeps/x86_64/multiarch/strcasestr.c b/sysdeps/x86_64/multiarch/strcasestr.c deleted file mode 100644 index 834e656a2c..0000000000 --- a/sysdeps/x86_64/multiarch/strcasestr.c +++ /dev/null @@ -1,13 +0,0 @@ -/* Multiple versions of strcasestr - All versions must be listed in ifunc-impl-list.c. */ - -#include "init-arch.h" - -#define STRCASESTR __strcasestr_sse2 - -#include "string/strcasestr.c" - -extern __typeof (__strcasestr_sse2) __strcasestr_sse2 attribute_hidden; - -libc_ifunc (__strcasestr, - __strcasestr_sse2); diff --git a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S index 81f1b40ef6..3a694d45c2 100644 --- a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S +++ b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S @@ -1,5 +1,5 @@ /* strcat with SSE2 - Copyright (C) 2011-2015 Free Software Foundation, Inc. + Copyright (C) 2011-2016 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. diff --git a/sysdeps/x86_64/multiarch/strcat-ssse3.S b/sysdeps/x86_64/multiarch/strcat-ssse3.S index d7b990725a..96184d0f0f 100644 --- a/sysdeps/x86_64/multiarch/strcat-ssse3.S +++ b/sysdeps/x86_64/multiarch/strcat-ssse3.S @@ -1,5 +1,5 @@ /* strcat with SSSE3 - Copyright (C) 2011-2015 Free Software Foundation, Inc. + Copyright (C) 2011-2016 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. diff --git a/sysdeps/x86_64/multiarch/strcat.S b/sysdeps/x86_64/multiarch/strcat.S index 44993fade5..7bb38e68ad 100644 --- a/sysdeps/x86_64/multiarch/strcat.S +++ b/sysdeps/x86_64/multiarch/strcat.S @@ -1,6 +1,6 @@ /* Multiple versions of strcat All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2009-2015 Free Software Foundation, Inc. + Copyright (C) 2009-2016 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. @@ -47,14 +47,12 @@ .text ENTRY(STRCAT) .type STRCAT, @gnu_indirect_function - cmpl $0, __cpu_features+KIND_OFFSET(%rip) - jne 1f - call __init_cpu_features -1: leaq STRCAT_SSE2_UNALIGNED(%rip), %rax - testl $bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip) + LOAD_RTLD_GLOBAL_RO_RDX + leaq STRCAT_SSE2_UNALIGNED(%rip), %rax + HAS_ARCH_FEATURE (Fast_Unaligned_Load) jnz 2f leaq STRCAT_SSE2(%rip), %rax - testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip) + HAS_CPU_FEATURE (SSSE3) jz 2f leaq STRCAT_SSSE3(%rip), %rax 2: ret diff --git a/sysdeps/x86_64/multiarch/strchr-sse2-no-bsf.S b/sysdeps/x86_64/multiarch/strchr-sse2-no-bsf.S index 0398650a01..979d112b28 100644 --- a/sysdeps/x86_64/multiarch/strchr-sse2-no-bsf.S +++ b/sysdeps/x86_64/multiarch/strchr-sse2-no-bsf.S @@ -1,5 +1,5 @@ /* strchr with SSE2 without bsf - Copyright (C) 2011-2015 Free Software Foundation, Inc. + Copyright (C) 2011-2016 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. diff --git a/sysdeps/x86_64/multiarch/strchr.S b/sysdeps/x86_64/multiarch/strchr.S index af55fac398..40683ad32b 100644 --- a/sysdeps/x86_64/multiarch/strchr.S +++ b/sysdeps/x86_64/multiarch/strchr.S @@ -1,5 +1,5 @@ /* Multiple versions of strchr - Copyright (C) 2009-2015 Free Software Foundation, Inc. + Copyright (C) 2009-2016 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -25,11 +25,9 @@ .text ENTRY(strchr) .type strchr, @gnu_indirect_function - cmpl $0, __cpu_features+KIND_OFFSET(%rip) - jne 1f - call __init_cpu_features -1: leaq __strchr_sse2(%rip), %rax -2: testl $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip) + LOAD_RTLD_GLOBAL_RO_RDX + leaq __strchr_sse2(%rip), %rax +2: HAS_ARCH_FEATURE (Slow_BSF) jz 3f leaq __strchr_sse2_no_bsf(%rip), %rax 3: ret diff --git a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S index 20b65fa775..bf555b4066 100644 --- a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S +++ b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S @@ -1,5 +1,5 @@ /* strcmp with unaligned loads - Copyright (C) 2013-2015 Free Software Foundation, Inc. + Copyright (C) 2013-2016 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -16,6 +16,8 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ +#if IS_IN (libc) + #include "sysdep.h" ENTRY ( __strcmp_sse2_unaligned) @@ -207,3 +209,5 @@ L(different): subl %ecx, %eax ret END (__strcmp_sse2_unaligned) + +#endif diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S index 4dff0a564b..70df84ae32 100644 --- a/sysdeps/x86_64/multiarch/strcmp-sse42.S +++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S @@ -1,5 +1,5 @@ /* strcmp with SSE4.2 - Copyright (C) 2009-2015 Free Software Foundation, Inc. + Copyright (C) 2009-2016 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. diff --git a/sysdeps/x86_64/multiarch/strcmp.S b/sysdeps/x86_64/multiarch/strcmp.S index f50f26c393..0e4a113f61 100644 --- a/sysdeps/x86_64/multiarch/strcmp.S +++ b/sysdeps/x86_64/multiarch/strcmp.S @@ -1,5 +1,5 @@ /* Multiple versions of strcmp - Copyright (C) 2009-2015 Free Software Foundation, Inc. + Copyright (C) 2009-2016 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. @@ -84,24 +84,20 @@ .text ENTRY(STRCMP) .type STRCMP, @gnu_indirect_function - /* Manually inlined call to __get_cpu_features. */ - cmpl $0, __cpu_features+KIND_OFFSET(%rip) - jne 1f - call __init_cpu_features -1: + LOAD_RTLD_GLOBAL_RO_RDX #ifdef USE_AS_STRCMP leaq __strcmp_sse2_unaligned(%rip), %rax - testl $bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip) + HAS_ARCH_FEATURE (Fast_Unaligned_Load) jnz 3f #else - testl $bit_Slow_SSE4_2, __cpu_features+FEATURE_OFFSET+index_Slow_SSE4_2(%rip) + HAS_ARCH_FEATURE (Slow_SSE4_2) jnz 2f leaq STRCMP_SSE42(%rip), %rax - testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip) + HAS_CPU_FEATURE (SSE4_2) jnz 3f #endif 2: leaq STRCMP_SSSE3(%rip), %rax - testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip) + HAS_CPU_FEATURE (SSSE3) jnz 3f leaq STRCMP_SSE2(%rip), %rax 3: ret @@ -110,23 +106,17 @@ END(STRCMP) # ifdef USE_AS_STRCASECMP_L ENTRY(__strcasecmp) .type __strcasecmp, @gnu_indirect_function - /* Manually inlined call to __get_cpu_features. */ - cmpl $0, __cpu_features+KIND_OFFSET(%rip) - jne 1f - call __init_cpu_features -1: -# ifdef HAVE_AVX_SUPPORT + LOAD_RTLD_GLOBAL_RO_RDX leaq __strcasecmp_avx(%rip), %rax - testl $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip) + HAS_ARCH_FEATURE (AVX_Usable) jnz 3f -# endif - testl $bit_Slow_SSE4_2, __cpu_features+FEATURE_OFFSET+index_Slow_SSE4_2(%rip) + HAS_ARCH_FEATURE (Slow_SSE4_2) jnz 2f leaq __strcasecmp_sse42(%rip), %rax - testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip) + HAS_CPU_FEATURE (SSE4_2) jnz 3f 2: leaq __strcasecmp_ssse3(%rip), %rax - testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip) + HAS_CPU_FEATURE (SSSE3) jnz 3f leaq __strcasecmp_sse2(%rip), %rax 3: ret @@ -136,23 +126,17 @@ weak_alias (__strcasecmp, strcasecmp) # ifdef USE_AS_STRNCASECMP_L ENTRY(__strncasecmp) .type __strncasecmp, @gnu_indirect_function - /* Manually inlined call to __get_cpu_features. */ - cmpl $0, __cpu_features+KIND_OFFSET(%rip) - jne 1f - call __init_cpu_features -1: -# ifdef HAVE_AVX_SUPPORT + LOAD_RTLD_GLOBAL_RO_RDX leaq __strncasecmp_avx(%rip), %rax - testl $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip) + HAS_ARCH_FEATURE (AVX_Usable) jnz 3f -# endif - testl $bit_Slow_SSE4_2, __cpu_features+FEATURE_OFFSET+index_Slow_SSE4_2(%rip) + HAS_ARCH_FEATURE (Slow_SSE4_2) jnz 2f leaq __strncasecmp_sse42(%rip), %rax - testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip) + HAS_CPU_FEATURE (SSE4_2) jnz 3f 2: leaq __strncasecmp_ssse3(%rip), %rax - testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip) + HAS_CPU_FEATURE (SSSE3) jnz 3f leaq __strncasecmp_sse2(%rip), %rax 3: ret @@ -167,16 +151,14 @@ weak_alias (__strncasecmp, strncasecmp) # include "strcmp-sse42.S" -# ifdef HAVE_AVX_SUPPORT -# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L -# define LABEL(l) .L##l##_avx -# define GLABEL(l) l##_avx -# define USE_AVX 1 -# undef STRCMP_SSE42 -# define STRCMP_SSE42 STRCMP_AVX -# define SECTION avx -# include "strcmp-sse42.S" -# endif +# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# define LABEL(l) .L##l##_avx +# define GLABEL(l) l##_avx +# define USE_AVX 1 +# undef STRCMP_SSE42 +# define STRCMP_SSE42 STRCMP_AVX +# define SECTION avx +# include "strcmp-sse42.S" # endif diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S index 8f03d1db24..caa74be2c2 100644 --- a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S +++ b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S @@ -1,5 +1,5 @@ /* strcpy with SSE2 and unaligned load - Copyright (C) 2011-2015 Free Software Foundation, Inc. + Copyright (C) 2011-2016 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S index 1f22c9a918..5bdb7671cf 100644 --- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S +++ b/sysdeps/x86_64/multiarch/strcpy-ssse3.S @@ -1,5 +1,5 @@ /* strcpy with SSSE3 - Copyright (C) 2011-2015 Free Software Foundation, Inc. + Copyright (C) 2011-2016 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. diff --git a/sysdeps/x86_64/multiarch/strcpy.S b/sysdeps/x86_64/multiarch/strcpy.S index 9464ee8b63..024f6ef899 100644 --- a/sysdeps/x86_64/multiarch/strcpy.S +++ b/sysdeps/x86_64/multiarch/strcpy.S @@ -1,6 +1,6 @@ /* Multiple versions of strcpy All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2009-2015 Free Software Foundation, Inc. + Copyright (C) 2009-2016 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. @@ -61,14 +61,12 @@ .text ENTRY(STRCPY) .type STRCPY, @gnu_indirect_function - cmpl $0, __cpu_features+KIND_OFFSET(%rip) - jne 1f - call __init_cpu_features -1: leaq STRCPY_SSE2_UNALIGNED(%rip), %rax - testl $bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip) + LOAD_RTLD_GLOBAL_RO_RDX + leaq STRCPY_SSE2_UNALIGNED(%rip), %rax + HAS_ARCH_FEATURE (Fast_Unaligned_Load) jnz 2f leaq STRCPY_SSE2(%rip), %rax - testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip) + HAS_CPU_FEATURE (SSSE3) jz 2f leaq STRCPY_SSSE3(%rip), %rax 2: ret diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c index 60b2ed7a3f..91b804ddd6 100644 --- a/sysdeps/x86_64/multiarch/strcspn-c.c +++ b/sysdeps/x86_64/multiarch/strcspn-c.c @@ -1,5 +1,5 @@ /* strcspn with SSE4.2 intrinsics - Copyright (C) 2009-2015 Free Software Foundation, Inc. + Copyright (C) 2009-2016 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. diff --git a/sysdeps/x86_64/multiarch/strcspn.S b/sysdeps/x86_64/multiarch/strcspn.S index 95e882c443..8e7ff1c663 100644 --- a/sysdeps/x86_64/multiarch/strcspn.S +++ b/sysdeps/x86_64/multiarch/strcspn.S @@ -1,6 +1,6 @@ /* Multiple versions of strcspn All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2009-2015 Free Software Foundation, Inc. + Copyright (C) 2009-2016 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. @@ -19,9 +19,6 @@ <http://www.gnu.org/licenses/>. */ #include <config.h> - -#ifdef HAVE_SSE4_SUPPORT - #include <sysdep.h> #include <init-arch.h> @@ -45,11 +42,9 @@ .text ENTRY(STRCSPN) .type STRCSPN, @gnu_indirect_function - cmpl $0, __cpu_features+KIND_OFFSET(%rip) - jne 1f - call __init_cpu_features -1: leaq STRCSPN_SSE2(%rip), %rax - testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip) + LOAD_RTLD_GLOBAL_RO_RDX + leaq STRCSPN_SSE2(%rip), %rax + HAS_CPU_FEATURE (SSE4_2) jz 2f leaq STRCSPN_SSE42(%rip), %rax 2: ret @@ -66,7 +61,6 @@ END(STRCSPN) # define END(name) \ cfi_endproc; .size STRCSPN_SSE2, .-STRCSPN_SSE2 #endif -#endif /* HAVE_SSE4_SUPPORT */ #ifdef USE_AS_STRPBRK #include "../strpbrk.S" diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c index 6b0c80aa43..9675f9360e 100644 --- a/sysdeps/x86_64/multiarch/strspn-c.c +++ b/sysdeps/x86_64/multiarch/strspn-c.c @@ -1,5 +1,5 @@ /* strspn with SSE4.2 intrinsics - Copyright (C) 2009-2015 Free Software Foundation, Inc. + Copyright (C) 2009-2016 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. diff --git a/sysdeps/x86_64/multiarch/strspn.S b/sysdeps/x86_64/multiarch/strspn.S index b734c1729a..4942826b24 100644 --- a/sysdeps/x86_64/multiarch/strspn.S +++ b/sysdeps/x86_64/multiarch/strspn.S @@ -1,6 +1,6 @@ /* Multiple versions of strspn All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2009-2015 Free Software Foundation, Inc. + Copyright (C) 2009-2016 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. @@ -19,9 +19,6 @@ <http://www.gnu.org/licenses/>. */ #include <config.h> - -#ifdef HAVE_SSE4_SUPPORT - #include <sysdep.h> #include <init-arch.h> @@ -30,11 +27,9 @@ .text ENTRY(strspn) .type strspn, @gnu_indirect_function - cmpl $0, __cpu_features+KIND_OFFSET(%rip) - jne 1f - call __init_cpu_features -1: leaq __strspn_sse2(%rip), %rax - testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip) + LOAD_RTLD_GLOBAL_RO_RDX + leaq __strspn_sse2(%rip), %rax + HAS_CPU_FEATURE (SSE4_2) jz 2f leaq __strspn_sse42(%rip), %rax 2: ret @@ -52,6 +47,4 @@ END(strspn) cfi_endproc; .size __strspn_sse2, .-__strspn_sse2 #endif -#endif /* HAVE_SSE4_SUPPORT */ - #include "../strspn.S" diff --git a/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S index 4f0e2ebdab..4ead1dfaf5 100644 --- a/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S +++ b/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S @@ -1,5 +1,5 @@ /* strstr with unaligned loads - Copyright (C) 2009-2015 Free Software Foundation, Inc. + Copyright (C) 2009-2016 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c index 507994bd38..eecba2243e 100644 --- a/sysdeps/x86_64/multiarch/strstr.c +++ b/sysdeps/x86_64/multiarch/strstr.c @@ -1,6 +1,6 @@ /* Multiple versions of strstr. All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2012-2015 Free Software Foundation, Inc. + Copyright (C) 2012-2016 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -41,7 +41,10 @@ extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden; /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle ifunc symbol properly. */ extern __typeof (__redirect_strstr) __libc_strstr; -libc_ifunc (__libc_strstr, HAS_FAST_UNALIGNED_LOAD ? __strstr_sse2_unaligned : __strstr_sse2) +libc_ifunc (__libc_strstr, + HAS_ARCH_FEATURE (Fast_Unaligned_Load) + ? __strstr_sse2_unaligned + : __strstr_sse2) #undef strstr strong_alias (__libc_strstr, strstr) diff --git a/sysdeps/x86_64/multiarch/test-multiarch.c b/sysdeps/x86_64/multiarch/test-multiarch.c index 949d26e550..4eb0c16cd8 100644 --- a/sysdeps/x86_64/multiarch/test-multiarch.c +++ b/sysdeps/x86_64/multiarch/test-multiarch.c @@ -1,6 +1,6 @@ /* Test CPU feature data. This file is part of the GNU C Library. - Copyright (C) 2012-2015 Free Software Foundation, Inc. + Copyright (C) 2012-2016 Free Software Foundation, Inc. The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public @@ -75,12 +75,18 @@ do_test (int argc, char **argv) int fails; get_cpuinfo (); - fails = check_proc ("avx", HAS_AVX, "HAS_AVX"); - fails += check_proc ("fma4", HAS_FMA4, "HAS_FMA4"); - fails += check_proc ("sse4_2", HAS_SSE4_2, "HAS_SSE4_2"); - fails += check_proc ("sse4_1", HAS_SSE4_1, "HAS_SSE4_1"); - fails += check_proc ("ssse3", HAS_SSSE3, "HAS_SSSE3"); - fails += check_proc ("popcnt", HAS_POPCOUNT, "HAS_POPCOUNT"); + fails = check_proc ("avx", HAS_ARCH_FEATURE (AVX_Usable), + "HAS_ARCH_FEATURE (AVX_Usable)"); + fails += check_proc ("fma4", HAS_ARCH_FEATURE (FMA4_Usable), + "HAS_ARCH_FEATURE (FMA4_Usable)"); + fails += check_proc ("sse4_2", HAS_CPU_FEATURE (SSE4_2), + "HAS_CPU_FEATURE (SSE4_2)"); + fails += check_proc ("sse4_1", HAS_CPU_FEATURE (SSE4_1) + , "HAS_CPU_FEATURE (SSE4_1)"); + fails += check_proc ("ssse3", HAS_CPU_FEATURE (SSSE3), + "HAS_CPU_FEATURE (SSSE3)"); + fails += check_proc ("popcnt", HAS_CPU_FEATURE (POPCOUNT), + "HAS_CPU_FEATURE (POPCOUNT)"); printf ("%d differences between /proc/cpuinfo and glibc code.\n", fails); diff --git a/sysdeps/x86_64/multiarch/varshift.c b/sysdeps/x86_64/multiarch/varshift.c index 0007ef79e5..7921be5b57 100644 --- a/sysdeps/x86_64/multiarch/varshift.c +++ b/sysdeps/x86_64/multiarch/varshift.c @@ -1,5 +1,5 @@ /* Helper for variable shifts of SSE registers. - Copyright (C) 2010-2015 Free Software Foundation, Inc. + Copyright (C) 2010-2016 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/multiarch/varshift.h b/sysdeps/x86_64/multiarch/varshift.h index 30ace3d914..7b27d0e9dd 100644 --- a/sysdeps/x86_64/multiarch/varshift.h +++ b/sysdeps/x86_64/multiarch/varshift.h @@ -1,5 +1,5 @@ /* Helper for variable shifts of SSE registers. - Copyright (C) 2010-2015 Free Software Foundation, Inc. + Copyright (C) 2010-2016 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/multiarch/wcscpy-ssse3.S b/sysdeps/x86_64/multiarch/wcscpy-ssse3.S index 8097862574..341e57a5ca 100644 --- a/sysdeps/x86_64/multiarch/wcscpy-ssse3.S +++ b/sysdeps/x86_64/multiarch/wcscpy-ssse3.S @@ -1,5 +1,5 @@ /* wcscpy with SSSE3 - Copyright (C) 2011-2015 Free Software Foundation, Inc. + Copyright (C) 2011-2016 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. diff --git a/sysdeps/x86_64/multiarch/wcscpy.S b/sysdeps/x86_64/multiarch/wcscpy.S index ff2f5a73d1..8e7270b9c7 100644 --- a/sysdeps/x86_64/multiarch/wcscpy.S +++ b/sysdeps/x86_64/multiarch/wcscpy.S @@ -1,6 +1,6 @@ /* Multiple versions of wcscpy All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2011-2015 Free Software Foundation, Inc. + Copyright (C) 2011-2016 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. @@ -27,11 +27,8 @@ .text ENTRY(wcscpy) .type wcscpy, @gnu_indirect_function - cmpl $0, KIND_OFFSET+__cpu_features(%rip) - jne 1f - call __init_cpu_features - -1: testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip) + LOAD_RTLD_GLOBAL_RO_RDX + HAS_CPU_FEATURE (SSSE3) jnz 2f leaq __wcscpy_sse2(%rip), %rax ret diff --git a/sysdeps/x86_64/multiarch/wmemcmp.S b/sysdeps/x86_64/multiarch/wmemcmp.S index 109e2457fe..b510f756e2 100644 --- a/sysdeps/x86_64/multiarch/wmemcmp.S +++ b/sysdeps/x86_64/multiarch/wmemcmp.S @@ -1,6 +1,6 @@ /* Multiple versions of wmemcmp All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2011-2015 Free Software Foundation, Inc. + Copyright (C) 2011-2016 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. @@ -26,16 +26,13 @@ .text ENTRY(wmemcmp) .type wmemcmp, @gnu_indirect_function - cmpl $0, KIND_OFFSET+__cpu_features(%rip) - jne 1f - call __init_cpu_features - -1: testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip) + LOAD_RTLD_GLOBAL_RO_RDX + HAS_CPU_FEATURE (SSSE3) jnz 2f leaq __wmemcmp_sse2(%rip), %rax ret -2: testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip) +2: HAS_CPU_FEATURE (SSE4_1) jz 3f leaq __wmemcmp_sse4_1(%rip), %rax ret |