From 5f3d0b78e011d2a72f9e88b0e9ef5bc081d18f97 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Fri, 30 Jan 2015 06:50:20 -0800 Subject: Use AVX unaligned memcpy only if AVX2 is available memcpy with unaligned 256-bit AVX register loads/stores are slow on older processorsl like Sandy Bridge. This patch adds bit_AVX_Fast_Unaligned_Load and sets it only when AVX2 is available. [BZ #17801] * sysdeps/x86_64/multiarch/init-arch.c (__init_cpu_features): Set the bit_AVX_Fast_Unaligned_Load bit for AVX2. * sysdeps/x86_64/multiarch/init-arch.h (bit_AVX_Fast_Unaligned_Load): New. (index_AVX_Fast_Unaligned_Load): Likewise. (HAS_AVX_FAST_UNALIGNED_LOAD): Likewise. * sysdeps/x86_64/multiarch/memcpy.S (__new_memcpy): Check the bit_AVX_Fast_Unaligned_Load bit instead of the bit_AVX_Usable bit. * sysdeps/x86_64/multiarch/memcpy_chk.S (__memcpy_chk): Likewise. * sysdeps/x86_64/multiarch/mempcpy.S (__mempcpy): Likewise. * sysdeps/x86_64/multiarch/mempcpy_chk.S (__mempcpy_chk): Likewise. * sysdeps/x86_64/multiarch/memmove.c (__libc_memmove): Replace HAS_AVX with HAS_AVX_FAST_UNALIGNED_LOAD. * sysdeps/x86_64/multiarch/memmove_chk.c (__memmove_chk): Likewise. --- ChangeLog | 18 ++++++++++++++++++ NEWS | 4 ++-- sysdeps/x86_64/multiarch/init-arch.c | 9 +++++++-- sysdeps/x86_64/multiarch/init-arch.h | 4 ++++ sysdeps/x86_64/multiarch/memcpy.S | 2 +- sysdeps/x86_64/multiarch/memcpy_chk.S | 2 +- sysdeps/x86_64/multiarch/memmove.c | 2 +- sysdeps/x86_64/multiarch/memmove_chk.c | 2 +- sysdeps/x86_64/multiarch/mempcpy.S | 2 +- sysdeps/x86_64/multiarch/mempcpy_chk.S | 2 +- 10 files changed, 37 insertions(+), 10 deletions(-) diff --git a/ChangeLog b/ChangeLog index 26f7f3f3b1..a696e396b2 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,21 @@ +2015-01-30 H.J. Lu + + [BZ #17801] + * sysdeps/x86_64/multiarch/init-arch.c (__init_cpu_features): + Set the bit_AVX_Fast_Unaligned_Load bit for AVX2. + * sysdeps/x86_64/multiarch/init-arch.h (bit_AVX_Fast_Unaligned_Load): + New. + (index_AVX_Fast_Unaligned_Load): Likewise. + (HAS_AVX_FAST_UNALIGNED_LOAD): Likewise. + * sysdeps/x86_64/multiarch/memcpy.S (__new_memcpy): Check the + bit_AVX_Fast_Unaligned_Load bit instead of the bit_AVX_Usable bit. + * sysdeps/x86_64/multiarch/memcpy_chk.S (__memcpy_chk): Likewise. + * sysdeps/x86_64/multiarch/mempcpy.S (__mempcpy): Likewise. + * sysdeps/x86_64/multiarch/mempcpy_chk.S (__mempcpy_chk): Likewise. + * sysdeps/x86_64/multiarch/memmove.c (__libc_memmove): Replace + HAS_AVX with HAS_AVX_FAST_UNALIGNED_LOAD. + * sysdeps/x86_64/multiarch/memmove_chk.c (__memmove_chk): Likewise. + 2015-01-29 Andreas Schwab * sysdeps/nptl/allocrtsig.c: Include . diff --git a/NEWS b/NEWS index 8e2729bddd..c91b9fc58a 100644 --- a/NEWS +++ b/NEWS @@ -17,8 +17,8 @@ Version 2.21 17601, 17608, 17616, 17625, 17630, 17633, 17634, 17635, 17647, 17653, 17657, 17658, 17664, 17665, 17668, 17682, 17702, 17717, 17719, 17722, 17723, 17724, 17725, 17732, 17733, 17744, 17745, 17746, 17747, 17748, - 17775, 17777, 17780, 17781, 17782, 17791, 17793, 17796, 17797, 17803, - 17806, 17834, 17844, 17848, 17868, 17869, 17870, 17885, 17892. + 17775, 17777, 17780, 17781, 17782, 17791, 17793, 17796, 17797, 17801, + 17803, 17806, 17834, 17844, 17848, 17868, 17869, 17870, 17885, 17892. * A new semaphore algorithm has been implemented in generic C code for all machines. Previous custom assembly implementations of semaphore were diff --git a/sysdeps/x86_64/multiarch/init-arch.c b/sysdeps/x86_64/multiarch/init-arch.c index 9299360612..7dec21884d 100644 --- a/sysdeps/x86_64/multiarch/init-arch.c +++ b/sysdeps/x86_64/multiarch/init-arch.c @@ -171,9 +171,14 @@ __init_cpu_features (void) /* Determine if AVX is usable. */ if (CPUID_AVX) __cpu_features.feature[index_AVX_Usable] |= bit_AVX_Usable; - /* Determine if AVX2 is usable. */ +#if index_AVX2_Usable != index_AVX_Fast_Unaligned_Load +# error index_AVX2_Usable != index_AVX_Fast_Unaligned_Load +#endif + /* Determine if AVX2 is usable. Unaligned load with 256-bit + AVX registers are faster on processors with AVX2. */ if (CPUID_AVX2) - __cpu_features.feature[index_AVX2_Usable] |= bit_AVX2_Usable; + __cpu_features.feature[index_AVX2_Usable] + |= bit_AVX2_Usable | bit_AVX_Fast_Unaligned_Load; /* Determine if FMA is usable. */ if (CPUID_FMA) __cpu_features.feature[index_FMA_Usable] |= bit_FMA_Usable; diff --git a/sysdeps/x86_64/multiarch/init-arch.h b/sysdeps/x86_64/multiarch/init-arch.h index 55f1c5b34c..e6b5ba5530 100644 --- a/sysdeps/x86_64/multiarch/init-arch.h +++ b/sysdeps/x86_64/multiarch/init-arch.h @@ -25,6 +25,7 @@ #define bit_FMA4_Usable (1 << 8) #define bit_Slow_SSE4_2 (1 << 9) #define bit_AVX2_Usable (1 << 10) +#define bit_AVX_Fast_Unaligned_Load (1 << 11) /* CPUID Feature flags. */ @@ -74,6 +75,7 @@ # define index_FMA4_Usable FEATURE_INDEX_1*FEATURE_SIZE # define index_Slow_SSE4_2 FEATURE_INDEX_1*FEATURE_SIZE # define index_AVX2_Usable FEATURE_INDEX_1*FEATURE_SIZE +# define index_AVX_Fast_Unaligned_Load FEATURE_INDEX_1*FEATURE_SIZE #else /* __ASSEMBLER__ */ @@ -169,6 +171,7 @@ extern const struct cpu_features *__get_cpu_features (void) # define index_FMA4_Usable FEATURE_INDEX_1 # define index_Slow_SSE4_2 FEATURE_INDEX_1 # define index_AVX2_Usable FEATURE_INDEX_1 +# define index_AVX_Fast_Unaligned_Load FEATURE_INDEX_1 # define HAS_ARCH_FEATURE(name) \ ((__get_cpu_features ()->feature[index_##name] & (bit_##name)) != 0) @@ -181,5 +184,6 @@ extern const struct cpu_features *__get_cpu_features (void) # define HAS_AVX2 HAS_ARCH_FEATURE (AVX2_Usable) # define HAS_FMA HAS_ARCH_FEATURE (FMA_Usable) # define HAS_FMA4 HAS_ARCH_FEATURE (FMA4_Usable) +# define HAS_AVX_FAST_UNALIGNED_LOAD HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) #endif /* __ASSEMBLER__ */ diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S index 992e40db81..4e18cd3070 100644 --- a/sysdeps/x86_64/multiarch/memcpy.S +++ b/sysdeps/x86_64/multiarch/memcpy.S @@ -33,7 +33,7 @@ ENTRY(__new_memcpy) jne 1f call __init_cpu_features 1: leaq __memcpy_avx_unaligned(%rip), %rax - testl $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip) + testl $bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip) jz 1f ret 1: leaq __memcpy_sse2(%rip), %rax diff --git a/sysdeps/x86_64/multiarch/memcpy_chk.S b/sysdeps/x86_64/multiarch/memcpy_chk.S index 5e9cf004b0..1e756ea0c2 100644 --- a/sysdeps/x86_64/multiarch/memcpy_chk.S +++ b/sysdeps/x86_64/multiarch/memcpy_chk.S @@ -39,7 +39,7 @@ ENTRY(__memcpy_chk) testl $bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip) jz 2f leaq __memcpy_chk_ssse3_back(%rip), %rax - testl $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip) + testl $bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip) jz 2f leaq __memcpy_chk_avx_unaligned(%rip), %rax 2: ret diff --git a/sysdeps/x86_64/multiarch/memmove.c b/sysdeps/x86_64/multiarch/memmove.c index d93bfd05c0..dd153a3eaa 100644 --- a/sysdeps/x86_64/multiarch/memmove.c +++ b/sysdeps/x86_64/multiarch/memmove.c @@ -49,7 +49,7 @@ extern __typeof (__redirect_memmove) __memmove_avx_unaligned attribute_hidden; ifunc symbol properly. */ extern __typeof (__redirect_memmove) __libc_memmove; libc_ifunc (__libc_memmove, - HAS_AVX + HAS_AVX_FAST_UNALIGNED_LOAD ? __memmove_avx_unaligned : (HAS_SSSE3 ? (HAS_FAST_COPY_BACKWARD diff --git a/sysdeps/x86_64/multiarch/memmove_chk.c b/sysdeps/x86_64/multiarch/memmove_chk.c index 743ca2a460..8b12d002dc 100644 --- a/sysdeps/x86_64/multiarch/memmove_chk.c +++ b/sysdeps/x86_64/multiarch/memmove_chk.c @@ -30,7 +30,7 @@ extern __typeof (__memmove_chk) __memmove_chk_avx_unaligned attribute_hidden; #include "debug/memmove_chk.c" libc_ifunc (__memmove_chk, - HAS_AVX ? __memmove_chk_avx_unaligned : + HAS_AVX_FAST_UNALIGNED_LOAD ? __memmove_chk_avx_unaligned : (HAS_SSSE3 ? (HAS_FAST_COPY_BACKWARD ? __memmove_chk_ssse3_back : __memmove_chk_ssse3) diff --git a/sysdeps/x86_64/multiarch/mempcpy.S b/sysdeps/x86_64/multiarch/mempcpy.S index cdf1dab62b..2eaacdf049 100644 --- a/sysdeps/x86_64/multiarch/mempcpy.S +++ b/sysdeps/x86_64/multiarch/mempcpy.S @@ -37,7 +37,7 @@ ENTRY(__mempcpy) testl $bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip) jz 2f leaq __mempcpy_ssse3_back(%rip), %rax - testl $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip) + testl $bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip) jz 2f leaq __mempcpy_avx_unaligned(%rip), %rax 2: ret diff --git a/sysdeps/x86_64/multiarch/mempcpy_chk.S b/sysdeps/x86_64/multiarch/mempcpy_chk.S index b7f9e89ea2..17b84701b0 100644 --- a/sysdeps/x86_64/multiarch/mempcpy_chk.S +++ b/sysdeps/x86_64/multiarch/mempcpy_chk.S @@ -39,7 +39,7 @@ ENTRY(__mempcpy_chk) testl $bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip) jz 2f leaq __mempcpy_chk_ssse3_back(%rip), %rax - testl $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip) + testl $bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip) jz 2f leaq __mempcpy_chk_avx_unaligned(%rip), %rax 2: ret -- cgit v1.2.3