Diffstat (limited to 'sysdeps/x86_64/multiarch')
48 files changed, 987 insertions, 1974 deletions
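The substantive change in this series is the CPU dispatch logic: init-arch.c below starts recognizing Silvermont (family 0x06, model 0x37) and sets a new Slow_SSE4_2 feature bit, so the string routines can avoid SSE4.2 code (such as the PCMPESTRI loop visible in the deleted rawmemchr.S further down) on cores where it loses. What follows is a minimal, self-contained C sketch of the feature-bit scheme the headers below define; the feature array and the first two bit values are illustrative stand-ins, and only bit_Slow_SSE4_2 (1 << 9) comes from the patch itself:

#include <stdio.h>

enum { FEATURE_INDEX_1 = 0, FEATURE_INDEX_MAX };

/* bit_Slow_SSE4_2 matches the patch below; the other two values are
   illustrative stand-ins, not glibc's real layout.  */
#define bit_Fast_Unaligned_Load			(1 << 4)
#define bit_Prefer_PMINUB_for_stringop		(1 << 5)
#define bit_Slow_SSE4_2				(1 << 9)

#define index_Fast_Unaligned_Load		FEATURE_INDEX_1
#define index_Prefer_PMINUB_for_stringop	FEATURE_INDEX_1
#define index_Slow_SSE4_2			FEATURE_INDEX_1

static unsigned int feature[FEATURE_INDEX_MAX];

/* Same shape as the HAS_ARCH_FEATURE macro in init-arch.h below.  */
#define HAS_ARCH_FEATURE(name) \
  ((feature[index_##name] & (bit_##name)) != 0)

int
main (void)
{
  /* What the new Silvermont (model 0x37) case in init-arch.c does:
     a single OR sets all three bits at once.  */
  feature[index_Fast_Unaligned_Load] |= (bit_Fast_Unaligned_Load
					 | bit_Prefer_PMINUB_for_stringop
					 | bit_Slow_SSE4_2);

  printf ("Slow_SSE4_2: %d\n", HAS_ARCH_FEATURE (Slow_SSE4_2));
  return 0;
}

The pair of #if/#error guards added in init-arch.c exists precisely because that single OR is valid only while all three index_* macros name the same word of the feature array.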
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 203d16eed3..57a3c13e8a 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -6,25 +6,24 @@ endif
 ifeq ($(subdir),string)
-sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
-	strend-sse4 memcmp-sse4 memcpy-ssse3 memcpy-sse2-unaligned mempcpy-ssse3 \
+sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
+	strcmp-sse2-unaligned strncmp-ssse3 \
+	memcmp-sse4 memcpy-ssse3 \
+	memcpy-sse2-unaligned mempcpy-ssse3 \
 	memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
-	memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
+	memmove-ssse3-back strcasecmp_l-ssse3 \
 	strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
 	strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
 	strcpy-sse2-unaligned strncpy-sse2-unaligned \
 	stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
 	strcat-sse2-unaligned strncat-sse2-unaligned \
-	strrchr-sse2-no-bsf strchr-sse2-no-bsf memcmp-ssse3
+	strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned
 ifeq (yes,$(config-cflags-sse4))
-sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
+sysdep_routines += strcspn-c strpbrk-c strspn-c varshift
 CFLAGS-varshift.c += -msse4
 CFLAGS-strcspn-c.c += -msse4
 CFLAGS-strpbrk-c.c += -msse4
 CFLAGS-strspn-c.c += -msse4
-CFLAGS-strstr.c += -msse4
-CFLAGS-strcasestr.c += -msse4
-CFLAGS-strcasestr-nonascii.c += -msse4
 endif
 endif
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 28d35793c5..6da9be1420 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -1,5 +1,5 @@
 /* Enumerate available IFUNC implementations of a function.  x86-64 version.
-   Copyright (C) 2012-2013 Free Software Foundation, Inc.
+   Copyright (C) 2012-2014 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -61,12 +61,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __memmove_ssse3)
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_sse2))
 
-  /* Support sysdeps/x86_64/multiarch/rawmemchr.S.  */
-  IFUNC_IMPL (i, name, rawmemchr,
-	      IFUNC_IMPL_ADD (array, i, rawmemchr, HAS_SSE4_2,
-			      __rawmemchr_sse42)
-	      IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_sse2))
-
   /* Support sysdeps/x86_64/multiarch/stpncpy.S.  */
   IFUNC_IMPL (i, name, stpncpy,
 	      IFUNC_IMPL_ADD (array, i, stpncpy, HAS_SSSE3,
@@ -104,8 +98,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strcasestr.c.  */
   IFUNC_IMPL (i, name, strcasestr,
-	      IFUNC_IMPL_ADD (array, i, strcasestr, HAS_SSE4_2,
-			      __strcasestr_sse42)
 	      IFUNC_IMPL_ADD (array, i, strcasestr, 1, __strcasestr_sse2))
 
   /* Support sysdeps/x86_64/multiarch/strcat.S.  */
@@ -116,7 +108,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strchr.S.  */
   IFUNC_IMPL (i, name, strchr,
-	      IFUNC_IMPL_ADD (array, i, strchr, HAS_SSE4_2, __strchr_sse42)
 	      IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2_no_bsf)
 	      IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2))
 
@@ -124,6 +115,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   IFUNC_IMPL (i, name, strcmp,
 	      IFUNC_IMPL_ADD (array, i, strcmp, HAS_SSE4_2, __strcmp_sse42)
 	      IFUNC_IMPL_ADD (array, i, strcmp, HAS_SSSE3, __strcmp_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2))
 
   /* Support sysdeps/x86_64/multiarch/strcpy.S.  */
@@ -182,21 +174,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __strpbrk_sse42)
 	      IFUNC_IMPL_ADD (array, i, strpbrk, 1, __strpbrk_sse2))
 
-  /* Support sysdeps/x86_64/multiarch/strrchr.S.  */
-  IFUNC_IMPL (i, name, strrchr,
-	      IFUNC_IMPL_ADD (array, i, strrchr, HAS_SSE4_2,
-			      __strrchr_sse42)
-	      IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_sse2_no_bsf)
-	      IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_sse2))
-
   /* Support sysdeps/x86_64/multiarch/strspn.S.  */
   IFUNC_IMPL (i, name, strspn,
 	      IFUNC_IMPL_ADD (array, i, strspn, HAS_SSE4_2, __strspn_sse42)
 	      IFUNC_IMPL_ADD (array, i, strspn, 1, __strspn_sse2))
 
-  /* Support sysdeps/x86_64/multiarch/strstr-c.c.  */
+  /* Support sysdeps/x86_64/multiarch/strstr.c.  */
   IFUNC_IMPL (i, name, strstr,
-	      IFUNC_IMPL_ADD (array, i, strstr, HAS_SSE4_2, __strstr_sse42)
+	      IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2))
 
   /* Support sysdeps/x86_64/multiarch/wcscpy.S.  */
diff --git a/sysdeps/x86_64/multiarch/init-arch.c b/sysdeps/x86_64/multiarch/init-arch.c
index 7daaf46099..db74d977f2 100644
--- a/sysdeps/x86_64/multiarch/init-arch.c
+++ b/sysdeps/x86_64/multiarch/init-arch.c
@@ -1,6 +1,6 @@
 /* Initialize CPU feature data.
    This file is part of the GNU C Library.
-   Copyright (C) 2008-2013 Free Software Foundation, Inc.
+   Copyright (C) 2008-2014 Free Software Foundation, Inc.
    Contributed by Ulrich Drepper <drepper@redhat.com>.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -78,6 +78,21 @@ __init_cpu_features (void)
 		__cpu_features.feature[index_Slow_BSF] |= bit_Slow_BSF;
 	      break;
 
+	    case 0x37:
+	      /* Unaligned load versions are faster than SSSE3
+		 on Silvermont.  */
+#if index_Fast_Unaligned_Load != index_Prefer_PMINUB_for_stringop
+# error index_Fast_Unaligned_Load != index_Prefer_PMINUB_for_stringop
+#endif
+#if index_Fast_Unaligned_Load != index_Slow_SSE4_2
+# error index_Fast_Unaligned_Load != index_Slow_SSE4_2
+#endif
+	      __cpu_features.feature[index_Fast_Unaligned_Load]
+		|= (bit_Fast_Unaligned_Load
+		    | bit_Prefer_PMINUB_for_stringop
+		    | bit_Slow_SSE4_2);
+	      break;
+
 	    default:
 	      /* Unknown family 0x06 processors.  Assuming this is one
 		 of Core i3/i5/i7 processors if AVX is available.  */
diff --git a/sysdeps/x86_64/multiarch/init-arch.h b/sysdeps/x86_64/multiarch/init-arch.h
index 28edbf7d07..793707a4da 100644
--- a/sysdeps/x86_64/multiarch/init-arch.h
+++ b/sysdeps/x86_64/multiarch/init-arch.h
@@ -1,5 +1,5 @@
 /* This file is part of the GNU C Library.
-   Copyright (C) 2008-2013 Free Software Foundation, Inc.
+   Copyright (C) 2008-2014 Free Software Foundation, Inc.
 
    The GNU C Library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
@@ -23,6 +23,7 @@
 #define bit_AVX_Usable			(1 << 6)
 #define bit_FMA_Usable			(1 << 7)
 #define bit_FMA4_Usable			(1 << 8)
+#define bit_Slow_SSE4_2			(1 << 9)
 
 /* CPUID Feature flags.  */
@@ -62,6 +63,7 @@
 # define index_AVX_Usable		FEATURE_INDEX_1*FEATURE_SIZE
 # define index_FMA_Usable		FEATURE_INDEX_1*FEATURE_SIZE
 # define index_FMA4_Usable		FEATURE_INDEX_1*FEATURE_SIZE
+# define index_Slow_SSE4_2		FEATURE_INDEX_1*FEATURE_SIZE
 
 #else	/* __ASSEMBLER__ */
@@ -156,9 +158,11 @@ extern const struct cpu_features *__get_cpu_features (void)
 # define index_Fast_Copy_Backward	FEATURE_INDEX_1
 # define index_Slow_BSF			FEATURE_INDEX_1
 # define index_Fast_Unaligned_Load	FEATURE_INDEX_1
+# define index_Prefer_PMINUB_for_stringop FEATURE_INDEX_1
 # define index_AVX_Usable		FEATURE_INDEX_1
 # define index_FMA_Usable		FEATURE_INDEX_1
 # define index_FMA4_Usable		FEATURE_INDEX_1
+# define index_Slow_SSE4_2		FEATURE_INDEX_1
 
 # define HAS_ARCH_FEATURE(name) \
   ((__get_cpu_features ()->feature[index_##name] & (bit_##name)) != 0)
diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
index 1ed4200f4c..e753d62bf4 100644
--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
+++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S
@@ -1,5 +1,5 @@
 /* memcmp with SSE4.1, wmemcmp with SSE4.1
-   Copyright (C) 2010-2013 Free Software Foundation, Inc.
+   Copyright (C) 2010-2014 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
 
@@ -25,10 +25,6 @@
 # define MEMCMP		__memcmp_sse4_1
 # endif
 
-# ifndef ALIGN
-# define ALIGN(n)	.p2align n
-# endif
-
 # define JMPTBL(I, B)	(I - B)
 
 # define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
@@ -60,7 +56,7 @@ ENTRY (MEMCMP)
 	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
 
 # ifndef USE_AS_WMEMCMP
-	ALIGN (4)
+	.p2align 4
 L(firstbyte):
 	movzbl	(%rdi), %eax
 	movzbl	(%rsi), %ecx
@@ -68,7 +64,7 @@ L(firstbyte):
 	ret
 # endif
 
-	ALIGN (4)
+	.p2align 4
 L(79bytesormore):
 	movdqu	(%rsi), %xmm1
 	movdqu	(%rdi), %xmm2
@@ -316,7 +312,7 @@ L(less32bytesin256):
 	add	%rdx, %rdi
 	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
 
-	ALIGN (4)
+	.p2align 4
 L(512bytesormore):
 # ifdef DATA_CACHE_SIZE_HALF
 	mov	$DATA_CACHE_SIZE_HALF, %R8_LP
@@ -329,7 +325,7 @@ L(512bytesormore):
 	cmp	%r8, %rdx
 	ja	L(L2_L3_cache_unaglined)
 	sub	$64, %rdx
-	ALIGN (4)
+	.p2align 4
 L(64bytesormore_loop):
 	movdqu	(%rdi), %xmm2
 	pxor	(%rsi), %xmm2
@@ -361,7 +357,7 @@ L(64bytesormore_loop):
 
 L(L2_L3_cache_unaglined):
 	sub	$64, %rdx
-	ALIGN (4)
+	.p2align 4
 L(L2_L3_unaligned_128bytes_loop):
 	prefetchnta 0x1c0(%rdi)
 	prefetchnta 0x1c0(%rsi)
@@ -396,7 +392,7 @@ L(L2_L3_unaligned_128bytes_loop):
 
 /*
  * This case is for machines which are sensitive for unaligned instructions.
*/ - ALIGN (4) + .p2align 4 L(2aligned): cmp $128, %rdx ja L(128bytesormorein2aligned) @@ -444,7 +440,7 @@ L(less32bytesin64in2alinged): add %rdx, %rdi BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(128bytesormorein2aligned): cmp $512, %rdx ja L(512bytesormorein2aligned) @@ -519,7 +515,7 @@ L(less32bytesin128in2aligned): add %rdx, %rdi BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(256bytesormorein2aligned): sub $256, %rdx @@ -632,7 +628,7 @@ L(less32bytesin256in2alinged): add %rdx, %rdi BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(512bytesormorein2aligned): # ifdef DATA_CACHE_SIZE_HALF mov $DATA_CACHE_SIZE_HALF, %R8_LP @@ -646,7 +642,7 @@ L(512bytesormorein2aligned): ja L(L2_L3_cache_aglined) sub $64, %rdx - ALIGN (4) + .p2align 4 L(64bytesormore_loopin2aligned): movdqa (%rdi), %xmm2 pxor (%rsi), %xmm2 @@ -678,7 +674,7 @@ L(64bytesormore_loopin2aligned): L(L2_L3_cache_aglined): sub $64, %rdx - ALIGN (4) + .p2align 4 L(L2_L3_aligned_128bytes_loop): prefetchnta 0x1c0(%rdi) prefetchnta 0x1c0(%rsi) @@ -711,7 +707,7 @@ L(L2_L3_aligned_128bytes_loop): BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(64bytesormore_loop_end): add $16, %rdi add $16, %rsi @@ -806,7 +802,7 @@ L(8bytes): xor %eax, %eax ret - ALIGN (4) + .p2align 4 L(12bytes): mov -12(%rdi), %rax mov -12(%rsi), %rcx @@ -827,7 +823,7 @@ L(0bytes): # ifndef USE_AS_WMEMCMP /* unreal case for wmemcmp */ - ALIGN (4) + .p2align 4 L(65bytes): movdqu -65(%rdi), %xmm1 movdqu -65(%rsi), %xmm2 @@ -864,7 +860,7 @@ L(9bytes): sub %edx, %eax ret - ALIGN (4) + .p2align 4 L(13bytes): mov -13(%rdi), %rax mov -13(%rsi), %rcx @@ -877,7 +873,7 @@ L(13bytes): xor %eax, %eax ret - ALIGN (4) + .p2align 4 L(5bytes): mov -5(%rdi), %eax mov -5(%rsi), %ecx @@ -888,7 +884,7 @@ L(5bytes): sub %edx, %eax ret - ALIGN (4) + .p2align 4 L(66bytes): movdqu -66(%rdi), %xmm1 movdqu -66(%rsi), %xmm2 @@ -929,7 +925,7 @@ L(10bytes): sub %ecx, %eax ret - ALIGN (4) + .p2align 4 L(14bytes): mov -14(%rdi), %rax mov -14(%rsi), %rcx @@ -942,7 +938,7 @@ L(14bytes): xor %eax, %eax ret - ALIGN (4) + .p2align 4 L(6bytes): mov -6(%rdi), %eax mov -6(%rsi), %ecx @@ -958,7 +954,7 @@ L(2bytes): sub %ecx, %eax ret - ALIGN (4) + .p2align 4 L(67bytes): movdqu -67(%rdi), %xmm2 movdqu -67(%rsi), %xmm1 @@ -997,7 +993,7 @@ L(11bytes): xor %eax, %eax ret - ALIGN (4) + .p2align 4 L(15bytes): mov -15(%rdi), %rax mov -15(%rsi), %rcx @@ -1010,7 +1006,7 @@ L(15bytes): xor %eax, %eax ret - ALIGN (4) + .p2align 4 L(7bytes): mov -7(%rdi), %eax mov -7(%rsi), %ecx @@ -1023,7 +1019,7 @@ L(7bytes): xor %eax, %eax ret - ALIGN (4) + .p2align 4 L(3bytes): movzwl -3(%rdi), %eax movzwl -3(%rsi), %ecx @@ -1036,7 +1032,7 @@ L(1bytes): ret # endif - ALIGN (4) + .p2align 4 L(68bytes): movdqu -68(%rdi), %xmm2 movdqu -68(%rsi), %xmm1 @@ -1079,7 +1075,7 @@ L(20bytes): # ifndef USE_AS_WMEMCMP /* unreal cases for wmemcmp */ - ALIGN (4) + .p2align 4 L(69bytes): movdqu -69(%rsi), %xmm1 movdqu -69(%rdi), %xmm2 @@ -1115,7 +1111,7 @@ L(21bytes): xor %eax, %eax ret - ALIGN (4) + .p2align 4 L(70bytes): movdqu -70(%rsi), %xmm1 movdqu -70(%rdi), %xmm2 @@ -1151,7 +1147,7 @@ L(22bytes): xor %eax, %eax ret - ALIGN (4) + .p2align 4 L(71bytes): movdqu -71(%rsi), %xmm1 movdqu -71(%rdi), %xmm2 @@ -1188,7 +1184,7 @@ L(23bytes): ret # endif - ALIGN (4) + .p2align 4 L(72bytes): movdqu -72(%rsi), %xmm1 movdqu -72(%rdi), %xmm2 @@ -1227,7 +1223,7 @@ L(24bytes): # ifndef USE_AS_WMEMCMP /* unreal cases for wmemcmp */ - ALIGN 
(4) + .p2align 4 L(73bytes): movdqu -73(%rsi), %xmm1 movdqu -73(%rdi), %xmm2 @@ -1265,7 +1261,7 @@ L(25bytes): sub %ecx, %eax ret - ALIGN (4) + .p2align 4 L(74bytes): movdqu -74(%rsi), %xmm1 movdqu -74(%rdi), %xmm2 @@ -1302,7 +1298,7 @@ L(26bytes): movzwl -2(%rsi), %ecx jmp L(diffin2bytes) - ALIGN (4) + .p2align 4 L(75bytes): movdqu -75(%rsi), %xmm1 movdqu -75(%rdi), %xmm2 @@ -1342,7 +1338,7 @@ L(27bytes): xor %eax, %eax ret # endif - ALIGN (4) + .p2align 4 L(76bytes): movdqu -76(%rsi), %xmm1 movdqu -76(%rdi), %xmm2 @@ -1388,7 +1384,7 @@ L(28bytes): # ifndef USE_AS_WMEMCMP /* unreal cases for wmemcmp */ - ALIGN (4) + .p2align 4 L(77bytes): movdqu -77(%rsi), %xmm1 movdqu -77(%rdi), %xmm2 @@ -1430,7 +1426,7 @@ L(29bytes): xor %eax, %eax ret - ALIGN (4) + .p2align 4 L(78bytes): movdqu -78(%rsi), %xmm1 movdqu -78(%rdi), %xmm2 @@ -1470,7 +1466,7 @@ L(30bytes): xor %eax, %eax ret - ALIGN (4) + .p2align 4 L(79bytes): movdqu -79(%rsi), %xmm1 movdqu -79(%rdi), %xmm2 @@ -1510,7 +1506,7 @@ L(31bytes): xor %eax, %eax ret # endif - ALIGN (4) + .p2align 4 L(64bytes): movdqu -64(%rdi), %xmm2 movdqu -64(%rsi), %xmm1 @@ -1548,7 +1544,7 @@ L(32bytes): /* * Aligned 8 bytes to avoid 2 branch "taken" in one 16 alinged code block. */ - ALIGN (3) + .p2align 3 L(less16bytes): movsbq %dl, %rdx mov (%rsi, %rdx), %rcx @@ -1585,7 +1581,7 @@ L(diffin2bytes): sub %ecx, %eax ret - ALIGN (4) + .p2align 4 L(end): and $0xff, %eax and $0xff, %ecx @@ -1599,7 +1595,7 @@ L(end): neg %eax ret - ALIGN (4) + .p2align 4 L(nequal_bigger): ret @@ -1611,7 +1607,7 @@ L(unreal_case): END (MEMCMP) .section .rodata.sse4.1,"a",@progbits - ALIGN (3) + .p2align 3 # ifndef USE_AS_WMEMCMP L(table_64bytes): .int JMPTBL (L(0bytes), L(table_64bytes)) diff --git a/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/sysdeps/x86_64/multiarch/memcmp-ssse3.S index bdd2ed213c..5f7572fbab 100644 --- a/sysdeps/x86_64/multiarch/memcmp-ssse3.S +++ b/sysdeps/x86_64/multiarch/memcmp-ssse3.S @@ -1,5 +1,5 @@ /* memcmp with SSSE3, wmemcmp with SSSE3 - Copyright (C) 2011-2013 Free Software Foundation, Inc. + Copyright (C) 2011-2014 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. @@ -25,10 +25,6 @@ # define MEMCMP __memcmp_ssse3 # endif -# ifndef ALIGN -# define ALIGN(n) .p2align n -# endif - /* Warning! wmemcmp has to use SIGNED comparison for elements. memcmp has to use UNSIGNED comparison for elemnts. @@ -50,7 +46,7 @@ ENTRY (MEMCMP) add %rcx, %rdi jmp L(less48bytes) - ALIGN (4) + .p2align 4 /* ECX >= 32. 
*/ L(48bytesormore): movdqu (%rdi), %xmm3 @@ -90,7 +86,7 @@ L(48bytesormore): je L(shr_6) jmp L(shr_7) - ALIGN (2) + .p2align 2 L(next_unaligned_table): cmp $8, %edx je L(shr_8) @@ -117,7 +113,7 @@ L(next_unaligned_table): jmp L(shr_12) # endif - ALIGN (4) + .p2align 4 L(shr_0): cmp $80, %rcx lea -48(%rcx), %rcx @@ -137,7 +133,7 @@ L(shr_0): add %rcx, %rdi jmp L(less48bytes) - ALIGN (4) + .p2align 4 L(shr_0_gobble): movdqa (%rsi), %xmm0 xor %eax, %eax @@ -180,7 +176,7 @@ L(next): # ifndef USE_AS_WMEMCMP - ALIGN (4) + .p2align 4 L(shr_1): cmp $80, %rcx lea -48(%rcx), %rcx @@ -207,7 +203,7 @@ L(shr_1): add %rcx, %rdi jmp L(less48bytes) - ALIGN (4) + .p2align 4 L(shr_1_gobble): sub $32, %rcx movdqa 16(%rsi), %xmm0 @@ -258,7 +254,7 @@ L(shr_1_gobble_next): jmp L(less48bytes) - ALIGN (4) + .p2align 4 L(shr_2): cmp $80, %rcx lea -48(%rcx), %rcx @@ -285,7 +281,7 @@ L(shr_2): add %rcx, %rdi jmp L(less48bytes) - ALIGN (4) + .p2align 4 L(shr_2_gobble): sub $32, %rcx movdqa 16(%rsi), %xmm0 @@ -335,7 +331,7 @@ L(shr_2_gobble_next): add %rcx, %rdi jmp L(less48bytes) - ALIGN (4) + .p2align 4 L(shr_3): cmp $80, %rcx lea -48(%rcx), %rcx @@ -362,7 +358,7 @@ L(shr_3): add %rcx, %rdi jmp L(less48bytes) - ALIGN (4) + .p2align 4 L(shr_3_gobble): sub $32, %rcx movdqa 16(%rsi), %xmm0 @@ -414,7 +410,7 @@ L(shr_3_gobble_next): # endif - ALIGN (4) + .p2align 4 L(shr_4): cmp $80, %rcx lea -48(%rcx), %rcx @@ -441,7 +437,7 @@ L(shr_4): add %rcx, %rdi jmp L(less48bytes) - ALIGN (4) + .p2align 4 L(shr_4_gobble): sub $32, %rcx movdqa 16(%rsi), %xmm0 @@ -493,7 +489,7 @@ L(shr_4_gobble_next): # ifndef USE_AS_WMEMCMP - ALIGN (4) + .p2align 4 L(shr_5): cmp $80, %rcx lea -48(%rcx), %rcx @@ -520,7 +516,7 @@ L(shr_5): add %rcx, %rdi jmp L(less48bytes) - ALIGN (4) + .p2align 4 L(shr_5_gobble): sub $32, %rcx movdqa 16(%rsi), %xmm0 @@ -570,7 +566,7 @@ L(shr_5_gobble_next): add %rcx, %rdi jmp L(less48bytes) - ALIGN (4) + .p2align 4 L(shr_6): cmp $80, %rcx lea -48(%rcx), %rcx @@ -597,7 +593,7 @@ L(shr_6): add %rcx, %rdi jmp L(less48bytes) - ALIGN (4) + .p2align 4 L(shr_6_gobble): sub $32, %rcx movdqa 16(%rsi), %xmm0 @@ -647,7 +643,7 @@ L(shr_6_gobble_next): add %rcx, %rdi jmp L(less48bytes) - ALIGN (4) + .p2align 4 L(shr_7): cmp $80, %rcx lea -48(%rcx), %rcx @@ -674,7 +670,7 @@ L(shr_7): add %rcx, %rdi jmp L(less48bytes) - ALIGN (4) + .p2align 4 L(shr_7_gobble): sub $32, %rcx movdqa 16(%rsi), %xmm0 @@ -726,7 +722,7 @@ L(shr_7_gobble_next): # endif - ALIGN (4) + .p2align 4 L(shr_8): cmp $80, %rcx lea -48(%rcx), %rcx @@ -753,7 +749,7 @@ L(shr_8): add %rcx, %rdi jmp L(less48bytes) - ALIGN (4) + .p2align 4 L(shr_8_gobble): sub $32, %rcx movdqa 16(%rsi), %xmm0 @@ -805,7 +801,7 @@ L(shr_8_gobble_next): # ifndef USE_AS_WMEMCMP - ALIGN (4) + .p2align 4 L(shr_9): cmp $80, %rcx lea -48(%rcx), %rcx @@ -832,7 +828,7 @@ L(shr_9): add %rcx, %rdi jmp L(less48bytes) - ALIGN (4) + .p2align 4 L(shr_9_gobble): sub $32, %rcx movdqa 16(%rsi), %xmm0 @@ -882,7 +878,7 @@ L(shr_9_gobble_next): add %rcx, %rdi jmp L(less48bytes) - ALIGN (4) + .p2align 4 L(shr_10): cmp $80, %rcx lea -48(%rcx), %rcx @@ -909,7 +905,7 @@ L(shr_10): add %rcx, %rdi jmp L(less48bytes) - ALIGN (4) + .p2align 4 L(shr_10_gobble): sub $32, %rcx movdqa 16(%rsi), %xmm0 @@ -959,7 +955,7 @@ L(shr_10_gobble_next): add %rcx, %rdi jmp L(less48bytes) - ALIGN (4) + .p2align 4 L(shr_11): cmp $80, %rcx lea -48(%rcx), %rcx @@ -986,7 +982,7 @@ L(shr_11): add %rcx, %rdi jmp L(less48bytes) - ALIGN (4) + .p2align 4 L(shr_11_gobble): sub $32, %rcx movdqa 16(%rsi), %xmm0 @@ -1038,7 +1034,7 @@ 
L(shr_11_gobble_next): # endif - ALIGN (4) + .p2align 4 L(shr_12): cmp $80, %rcx lea -48(%rcx), %rcx @@ -1065,7 +1061,7 @@ L(shr_12): add %rcx, %rdi jmp L(less48bytes) - ALIGN (4) + .p2align 4 L(shr_12_gobble): sub $32, %rcx movdqa 16(%rsi), %xmm0 @@ -1117,7 +1113,7 @@ L(shr_12_gobble_next): # ifndef USE_AS_WMEMCMP - ALIGN (4) + .p2align 4 L(shr_13): cmp $80, %rcx lea -48(%rcx), %rcx @@ -1144,7 +1140,7 @@ L(shr_13): add %rcx, %rdi jmp L(less48bytes) - ALIGN (4) + .p2align 4 L(shr_13_gobble): sub $32, %rcx movdqa 16(%rsi), %xmm0 @@ -1194,7 +1190,7 @@ L(shr_13_gobble_next): add %rcx, %rdi jmp L(less48bytes) - ALIGN (4) + .p2align 4 L(shr_14): cmp $80, %rcx lea -48(%rcx), %rcx @@ -1221,7 +1217,7 @@ L(shr_14): add %rcx, %rdi jmp L(less48bytes) - ALIGN (4) + .p2align 4 L(shr_14_gobble): sub $32, %rcx movdqa 16(%rsi), %xmm0 @@ -1271,7 +1267,7 @@ L(shr_14_gobble_next): add %rcx, %rdi jmp L(less48bytes) - ALIGN (4) + .p2align 4 L(shr_15): cmp $80, %rcx lea -48(%rcx), %rcx @@ -1298,7 +1294,7 @@ L(shr_15): add %rcx, %rdi jmp L(less48bytes) - ALIGN (4) + .p2align 4 L(shr_15_gobble): sub $32, %rcx movdqa 16(%rsi), %xmm0 @@ -1348,7 +1344,7 @@ L(shr_15_gobble_next): add %rcx, %rdi jmp L(less48bytes) # endif - ALIGN (4) + .p2align 4 L(exit): pmovmskb %xmm1, %r8d sub $0xffff, %r8d @@ -1389,56 +1385,56 @@ L(less16bytes): sub %edx, %eax ret - ALIGN (4) + .p2align 4 L(Byte16): movzbl -16(%rdi), %eax movzbl -16(%rsi), %edx sub %edx, %eax ret - ALIGN (4) + .p2align 4 L(Byte17): movzbl -15(%rdi), %eax movzbl -15(%rsi), %edx sub %edx, %eax ret - ALIGN (4) + .p2align 4 L(Byte18): movzbl -14(%rdi), %eax movzbl -14(%rsi), %edx sub %edx, %eax ret - ALIGN (4) + .p2align 4 L(Byte19): movzbl -13(%rdi), %eax movzbl -13(%rsi), %edx sub %edx, %eax ret - ALIGN (4) + .p2align 4 L(Byte20): movzbl -12(%rdi), %eax movzbl -12(%rsi), %edx sub %edx, %eax ret - ALIGN (4) + .p2align 4 L(Byte21): movzbl -11(%rdi), %eax movzbl -11(%rsi), %edx sub %edx, %eax ret - ALIGN (4) + .p2align 4 L(Byte22): movzbl -10(%rdi), %eax movzbl -10(%rsi), %edx sub %edx, %eax ret - ALIGN (4) + .p2align 4 L(next_24_bytes): lea 8(%rdi), %rdi lea 8(%rsi), %rsi @@ -1463,10 +1459,8 @@ L(next_24_bytes): test $0x40, %dh jnz L(Byte22) - mov -9(%rdi), %eax - and $0xff, %eax - mov -9(%rsi), %edx - and $0xff, %edx + movzbl -9(%rdi), %eax + movzbl -9(%rsi), %edx sub %edx, %eax ret # else @@ -1481,14 +1475,14 @@ L(next_24_bytes): jne L(find_diff) ret - ALIGN (4) + .p2align 4 L(second_double_word): mov -12(%rdi), %eax cmp -12(%rsi), %eax jne L(find_diff) ret - ALIGN (4) + .p2align 4 L(next_two_double_words): and $15, %dh jz L(fourth_double_word) @@ -1497,7 +1491,7 @@ L(next_two_double_words): jne L(find_diff) ret - ALIGN (4) + .p2align 4 L(fourth_double_word): mov -4(%rdi), %eax cmp -4(%rsi), %eax @@ -1505,7 +1499,7 @@ L(fourth_double_word): ret # endif - ALIGN (4) + .p2align 4 L(less48bytes): cmp $8, %ecx jae L(more8bytes) @@ -1529,7 +1523,7 @@ L(less48bytes): jmp L(4bytes) # endif - ALIGN (4) + .p2align 4 L(more8bytes): cmp $16, %ecx jae L(more16bytes) @@ -1553,7 +1547,7 @@ L(more8bytes): jmp L(12bytes) # endif - ALIGN (4) + .p2align 4 L(more16bytes): cmp $24, %ecx jae L(more24bytes) @@ -1577,7 +1571,7 @@ L(more16bytes): jmp L(20bytes) # endif - ALIGN (4) + .p2align 4 L(more24bytes): cmp $32, %ecx jae L(more32bytes) @@ -1601,7 +1595,7 @@ L(more24bytes): jmp L(28bytes) # endif - ALIGN (4) + .p2align 4 L(more32bytes): cmp $40, %ecx jae L(more40bytes) @@ -1625,7 +1619,7 @@ L(more32bytes): jmp L(36bytes) # endif - ALIGN (4) + .p2align 4 L(more40bytes): cmp $40, %ecx je 
L(40bytes) @@ -1644,7 +1638,7 @@ L(more40bytes): je L(46bytes) jmp L(47bytes) - ALIGN (4) + .p2align 4 L(44bytes): movl -44(%rdi), %eax movl -44(%rsi), %ecx @@ -1704,7 +1698,7 @@ L(0bytes): xor %eax, %eax ret # else - ALIGN (4) + .p2align 4 L(44bytes): movl -44(%rdi), %eax cmp -44(%rsi), %eax @@ -1755,7 +1749,7 @@ L(0bytes): # endif # ifndef USE_AS_WMEMCMP - ALIGN (4) + .p2align 4 L(45bytes): movl -45(%rdi), %eax movl -45(%rsi), %ecx @@ -1818,7 +1812,7 @@ L(1bytes): xor %eax, %eax ret - ALIGN (4) + .p2align 4 L(46bytes): movl -46(%rdi), %eax movl -46(%rsi), %ecx @@ -1884,7 +1878,7 @@ L(2bytes): xor %eax, %eax ret - ALIGN (4) + .p2align 4 L(47bytes): movl -47(%rdi), %eax movl -47(%rsi), %ecx @@ -1953,7 +1947,7 @@ L(3bytes): xor %eax, %eax ret - ALIGN (4) + .p2align 4 L(find_diff): cmpb %cl, %al jne L(set) @@ -1975,19 +1969,19 @@ L(set): # else /* for wmemcmp */ - ALIGN (4) + .p2align 4 L(find_diff): mov $1, %eax jg L(find_diff_bigger) neg %eax ret - ALIGN (4) + .p2align 4 L(find_diff_bigger): ret # endif - ALIGN (4) + .p2align 4 L(equal): xor %eax, %eax ret diff --git a/sysdeps/x86_64/multiarch/memcmp.S b/sysdeps/x86_64/multiarch/memcmp.S index da88af248a..627d8d05cf 100644 --- a/sysdeps/x86_64/multiarch/memcmp.S +++ b/sysdeps/x86_64/multiarch/memcmp.S @@ -1,6 +1,6 @@ /* Multiple versions of memcmp All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2010-2013 Free Software Foundation, Inc. + Copyright (C) 2010-2014 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. diff --git a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S index efdfea238f..07241b8e2b 100644 --- a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S +++ b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S @@ -1,5 +1,5 @@ /* memcpy with unaliged loads - Copyright (C) 2013 Free Software Foundation, Inc. + Copyright (C) 2013-2014 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -20,10 +20,6 @@ #include "asm-syntax.h" -#ifndef ALIGN -# define ALIGN(n) .p2align n -#endif - ENTRY(__memcpy_sse2_unaligned) movq %rsi, %rax @@ -44,7 +40,7 @@ L(return): movq %rdi, %rax ret .p2align 4,,10 - ALIGN(4) + .p2align 4 .L31: movdqu 16(%rsi), %xmm8 cmpq $64, %rdx @@ -77,7 +73,7 @@ L(return): leaq 32(%r10), %r8 leaq 48(%r10), %rax .p2align 4,,10 - ALIGN(4) + .p2align 4 L(loop): movdqu (%rcx,%r10), %xmm8 movdqa %xmm8, (%rcx) @@ -151,7 +147,7 @@ L(less_16): .L3: leaq -1(%rdx), %rax .p2align 4,,10 - ALIGN(4) + .p2align 4 .L11: movzbl (%rsi,%rax), %edx movb %dl, (%rdi,%rax) diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S index fc9fcef27d..899ccbc34b 100644 --- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S +++ b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S @@ -1,5 +1,5 @@ /* memcpy with SSSE3 and REP string - Copyright (C) 2010-2013 Free Software Foundation, Inc. + Copyright (C) 2010-2014 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. @@ -31,10 +31,6 @@ # define MEMCPY_CHK __memcpy_chk_ssse3_back #endif -#ifndef ALIGN -# define ALIGN(n) .p2align n -#endif - #define JMPTBL(I, B) I - B /* Branch to an entry in a jump table. 
TABLE is a jump table with @@ -87,7 +83,7 @@ L(bk_write): BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) #endif - ALIGN (4) + .p2align 4 L(144bytesormore): #ifndef USE_AS_MEMMOVE @@ -119,7 +115,7 @@ L(144bytesormore): jmp *%r9 ud2 - ALIGN (4) + .p2align 4 L(copy_backward): #ifdef DATA_CACHE_SIZE mov $DATA_CACHE_SIZE, %RCX_LP @@ -149,7 +145,7 @@ L(copy_backward): jmp *%r9 ud2 - ALIGN (4) + .p2align 4 L(shl_0): mov %rdx, %r9 @@ -162,7 +158,7 @@ L(shl_0): #endif jae L(gobble_mem_fwd) sub $0x80, %rdx - ALIGN (4) + .p2align 4 L(shl_0_loop): movdqa (%rsi), %xmm1 movdqa %xmm1, (%rdi) @@ -190,7 +186,7 @@ L(shl_0_loop): add %rdx, %rdi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_0_bwd): sub $0x80, %rdx L(copy_backward_loop): @@ -221,7 +217,7 @@ L(copy_backward_loop): sub %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_1): sub $0x80, %rdx movaps -0x01(%rsi), %xmm1 @@ -258,7 +254,7 @@ L(shl_1): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_1_bwd): movaps -0x01(%rsi), %xmm1 @@ -304,7 +300,7 @@ L(shl_1_bwd): sub %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_2): sub $0x80, %rdx movaps -0x02(%rsi), %xmm1 @@ -341,7 +337,7 @@ L(shl_2): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_2_bwd): movaps -0x02(%rsi), %xmm1 @@ -387,7 +383,7 @@ L(shl_2_bwd): sub %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_3): sub $0x80, %rdx movaps -0x03(%rsi), %xmm1 @@ -424,7 +420,7 @@ L(shl_3): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_3_bwd): movaps -0x03(%rsi), %xmm1 @@ -470,7 +466,7 @@ L(shl_3_bwd): sub %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_4): sub $0x80, %rdx movaps -0x04(%rsi), %xmm1 @@ -507,7 +503,7 @@ L(shl_4): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_4_bwd): movaps -0x04(%rsi), %xmm1 @@ -553,7 +549,7 @@ L(shl_4_bwd): sub %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_5): sub $0x80, %rdx movaps -0x05(%rsi), %xmm1 @@ -590,7 +586,7 @@ L(shl_5): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_5_bwd): movaps -0x05(%rsi), %xmm1 @@ -636,7 +632,7 @@ L(shl_5_bwd): sub %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_6): sub $0x80, %rdx movaps -0x06(%rsi), %xmm1 @@ -673,7 +669,7 @@ L(shl_6): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_6_bwd): movaps -0x06(%rsi), %xmm1 @@ -719,7 +715,7 @@ L(shl_6_bwd): sub %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_7): sub $0x80, %rdx movaps -0x07(%rsi), %xmm1 @@ -756,7 +752,7 @@ L(shl_7): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_7_bwd): movaps -0x07(%rsi), %xmm1 @@ -802,7 +798,7 @@ L(shl_7_bwd): sub %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_8): sub $0x80, %rdx movaps -0x08(%rsi), %xmm1 @@ -839,7 +835,7 @@ L(shl_8): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - ALIGN (4) + .p2align 4 
L(shl_8_bwd): movaps -0x08(%rsi), %xmm1 @@ -886,7 +882,7 @@ L(shl_8_end_bwd): sub %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_9): sub $0x80, %rdx movaps -0x09(%rsi), %xmm1 @@ -923,7 +919,7 @@ L(shl_9): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_9_bwd): movaps -0x09(%rsi), %xmm1 @@ -969,7 +965,7 @@ L(shl_9_bwd): sub %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_10): sub $0x80, %rdx movaps -0x0a(%rsi), %xmm1 @@ -1006,7 +1002,7 @@ L(shl_10): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_10_bwd): movaps -0x0a(%rsi), %xmm1 @@ -1052,7 +1048,7 @@ L(shl_10_bwd): sub %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_11): sub $0x80, %rdx movaps -0x0b(%rsi), %xmm1 @@ -1089,7 +1085,7 @@ L(shl_11): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_11_bwd): movaps -0x0b(%rsi), %xmm1 @@ -1135,7 +1131,7 @@ L(shl_11_bwd): sub %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_12): sub $0x80, %rdx movdqa -0x0c(%rsi), %xmm1 @@ -1173,7 +1169,7 @@ L(shl_12): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_12_bwd): movaps -0x0c(%rsi), %xmm1 @@ -1219,7 +1215,7 @@ L(shl_12_bwd): sub %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_13): sub $0x80, %rdx movaps -0x0d(%rsi), %xmm1 @@ -1256,7 +1252,7 @@ L(shl_13): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_13_bwd): movaps -0x0d(%rsi), %xmm1 @@ -1302,7 +1298,7 @@ L(shl_13_bwd): sub %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_14): sub $0x80, %rdx movaps -0x0e(%rsi), %xmm1 @@ -1339,7 +1335,7 @@ L(shl_14): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_14_bwd): movaps -0x0e(%rsi), %xmm1 @@ -1385,7 +1381,7 @@ L(shl_14_bwd): sub %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_15): sub $0x80, %rdx movaps -0x0f(%rsi), %xmm1 @@ -1422,7 +1418,7 @@ L(shl_15): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_15_bwd): movaps -0x0f(%rsi), %xmm1 @@ -1468,7 +1464,7 @@ L(shl_15_bwd): sub %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(gobble_mem_fwd): movdqu (%rsi), %xmm1 movdqu %xmm0, (%r8) @@ -1570,7 +1566,7 @@ L(gobble_mem_fwd_end): add %rdx, %rdi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(gobble_mem_bwd): add %rdx, %rsi add %rdx, %rdi @@ -2833,7 +2829,7 @@ L(bwd_write_1bytes): END (MEMCPY) .section .rodata.ssse3,"a",@progbits - ALIGN (3) + .p2align 3 L(table_144_bytes_bwd): .int JMPTBL (L(bwd_write_0bytes), L(table_144_bytes_bwd)) .int JMPTBL (L(bwd_write_1bytes), L(table_144_bytes_bwd)) @@ -2980,7 +2976,7 @@ L(table_144_bytes_bwd): .int JMPTBL (L(bwd_write_142bytes), L(table_144_bytes_bwd)) .int JMPTBL (L(bwd_write_143bytes), L(table_144_bytes_bwd)) - ALIGN (3) + .p2align 3 L(table_144_bytes_fwd): .int JMPTBL (L(fwd_write_0bytes), L(table_144_bytes_fwd)) .int JMPTBL (L(fwd_write_1bytes), L(table_144_bytes_fwd)) @@ -3127,7 +3123,7 @@ 
L(table_144_bytes_fwd): .int JMPTBL (L(fwd_write_142bytes), L(table_144_bytes_fwd)) .int JMPTBL (L(fwd_write_143bytes), L(table_144_bytes_fwd)) - ALIGN (3) + .p2align 3 L(shl_table_fwd): .int JMPTBL (L(shl_0), L(shl_table_fwd)) .int JMPTBL (L(shl_1), L(shl_table_fwd)) @@ -3146,7 +3142,7 @@ L(shl_table_fwd): .int JMPTBL (L(shl_14), L(shl_table_fwd)) .int JMPTBL (L(shl_15), L(shl_table_fwd)) - ALIGN (3) + .p2align 3 L(shl_table_bwd): .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd)) .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd)) diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S index 9642ceecd9..0ad9a0008a 100644 --- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S +++ b/sysdeps/x86_64/multiarch/memcpy-ssse3.S @@ -1,5 +1,5 @@ /* memcpy with SSSE3 - Copyright (C) 2010-2013 Free Software Foundation, Inc. + Copyright (C) 2010-2014 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. @@ -31,10 +31,6 @@ # define MEMCPY_CHK __memcpy_chk_ssse3 #endif -#ifndef ALIGN -# define ALIGN(n) .p2align n -#endif - #define JMPTBL(I, B) I - B /* Branch to an entry in a jump table. TABLE is a jump table with @@ -80,7 +76,7 @@ L(copy_forward): jmp *%r9 ud2 - ALIGN (4) + .p2align 4 L(80bytesormore): #ifndef USE_AS_MEMMOVE cmp %dil, %sil @@ -113,7 +109,7 @@ L(80bytesormore): #endif BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %r9, 4) - ALIGN (4) + .p2align 4 L(copy_backward): movdqu -16(%rsi, %rdx), %xmm0 add %rdx, %rsi @@ -144,7 +140,7 @@ L(copy_backward): #endif BRANCH_TO_JMPTBL_ENTRY (L(shl_table_bwd), %r9, 4) - ALIGN (4) + .p2align 4 L(shl_0): sub $16, %rdx movdqa (%rsi), %xmm1 @@ -172,7 +168,7 @@ L(shl_0_less_64bytes): add %rdx, %rdi BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_0_gobble): #ifdef DATA_CACHE_SIZE_HALF cmp $DATA_CACHE_SIZE_HALF, %RDX_LP @@ -228,7 +224,7 @@ L(shl_0_cache_less_64bytes): add %rdx, %rdi BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_0_gobble_mem_loop): prefetcht0 0x1c0(%rsi) prefetcht0 0x280(%rsi) @@ -287,7 +283,7 @@ L(shl_0_mem_less_32bytes): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_0_bwd): sub $16, %rdx movdqa -0x10(%rsi), %xmm1 @@ -313,7 +309,7 @@ L(shl_0_bwd): L(shl_0_less_64bytes_bwd): BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_0_gobble_bwd): #ifdef DATA_CACHE_SIZE_HALF cmp $DATA_CACHE_SIZE_HALF, %RDX_LP @@ -367,7 +363,7 @@ L(shl_0_gobble_bwd_loop): L(shl_0_gobble_bwd_less_64bytes): BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_0_gobble_mem_bwd_loop): prefetcht0 -0x1c0(%rsi) prefetcht0 -0x280(%rsi) @@ -423,7 +419,7 @@ L(shl_0_mem_bwd_less_64bytes): L(shl_0_mem_bwd_less_32bytes): BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_1): lea (L(shl_1_loop_L1)-L(shl_1))(%r9), %r9 cmp %rcx, %rdx @@ -466,7 +462,7 @@ L(shl_1_end): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_1_bwd): lea (L(shl_1_bwd_loop_L1)-L(shl_1_bwd))(%r9), %r9 cmp %rcx, %rdx @@ -508,7 +504,7 @@ L(shl_1_bwd_end): movdqu %xmm0, (%r8) BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_2): lea (L(shl_2_loop_L1)-L(shl_2))(%r9), %r9 cmp %rcx, %rdx @@ -551,7 +547,7 @@ L(shl_2_end): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_2_bwd): lea 
(L(shl_2_bwd_loop_L1)-L(shl_2_bwd))(%r9), %r9 cmp %rcx, %rdx @@ -593,7 +589,7 @@ L(shl_2_bwd_end): movdqu %xmm0, (%r8) BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_3): lea (L(shl_3_loop_L1)-L(shl_3))(%r9), %r9 cmp %rcx, %rdx @@ -636,7 +632,7 @@ L(shl_3_end): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_3_bwd): lea (L(shl_3_bwd_loop_L1)-L(shl_3_bwd))(%r9), %r9 cmp %rcx, %rdx @@ -678,7 +674,7 @@ L(shl_3_bwd_end): movdqu %xmm0, (%r8) BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_4): lea (L(shl_4_loop_L1)-L(shl_4))(%r9), %r9 cmp %rcx, %rdx @@ -721,7 +717,7 @@ L(shl_4_end): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_4_bwd): lea (L(shl_4_bwd_loop_L1)-L(shl_4_bwd))(%r9), %r9 cmp %rcx, %rdx @@ -763,7 +759,7 @@ L(shl_4_bwd_end): movdqu %xmm0, (%r8) BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_5): lea (L(shl_5_loop_L1)-L(shl_5))(%r9), %r9 cmp %rcx, %rdx @@ -806,7 +802,7 @@ L(shl_5_end): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_5_bwd): lea (L(shl_5_bwd_loop_L1)-L(shl_5_bwd))(%r9), %r9 cmp %rcx, %rdx @@ -848,7 +844,7 @@ L(shl_5_bwd_end): movdqu %xmm0, (%r8) BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_6): lea (L(shl_6_loop_L1)-L(shl_6))(%r9), %r9 cmp %rcx, %rdx @@ -891,7 +887,7 @@ L(shl_6_end): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_6_bwd): lea (L(shl_6_bwd_loop_L1)-L(shl_6_bwd))(%r9), %r9 cmp %rcx, %rdx @@ -933,7 +929,7 @@ L(shl_6_bwd_end): movdqu %xmm0, (%r8) BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_7): lea (L(shl_7_loop_L1)-L(shl_7))(%r9), %r9 cmp %rcx, %rdx @@ -976,7 +972,7 @@ L(shl_7_end): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_7_bwd): lea (L(shl_7_bwd_loop_L1)-L(shl_7_bwd))(%r9), %r9 cmp %rcx, %rdx @@ -1018,7 +1014,7 @@ L(shl_7_bwd_end): movdqu %xmm0, (%r8) BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_8): lea (L(shl_8_loop_L1)-L(shl_8))(%r9), %r9 cmp %rcx, %rdx @@ -1051,7 +1047,7 @@ L(shl_8_loop_L1): movaps %xmm5, -0x10(%rdi) jmp *%r9 ud2 - ALIGN (4) + .p2align 4 L(shl_8_end): lea 64(%rdx), %rdx movaps %xmm4, -0x20(%rdi) @@ -1061,7 +1057,7 @@ L(shl_8_end): movdqu %xmm0, (%r8) BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_8_bwd): lea (L(shl_8_bwd_loop_L1)-L(shl_8_bwd))(%r9), %r9 cmp %rcx, %rdx @@ -1103,7 +1099,7 @@ L(shl_8_bwd_end): movdqu %xmm0, (%r8) BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_9): lea (L(shl_9_loop_L1)-L(shl_9))(%r9), %r9 cmp %rcx, %rdx @@ -1146,7 +1142,7 @@ L(shl_9_end): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_9_bwd): lea (L(shl_9_bwd_loop_L1)-L(shl_9_bwd))(%r9), %r9 cmp %rcx, %rdx @@ -1188,7 +1184,7 @@ L(shl_9_bwd_end): movdqu %xmm0, (%r8) BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_10): lea (L(shl_10_loop_L1)-L(shl_10))(%r9), %r9 cmp %rcx, %rdx @@ -1231,7 +1227,7 @@ L(shl_10_end): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_10_bwd): lea (L(shl_10_bwd_loop_L1)-L(shl_10_bwd))(%r9), %r9 cmp %rcx, %rdx @@ 
-1273,7 +1269,7 @@ L(shl_10_bwd_end): movdqu %xmm0, (%r8) BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_11): lea (L(shl_11_loop_L1)-L(shl_11))(%r9), %r9 cmp %rcx, %rdx @@ -1316,7 +1312,7 @@ L(shl_11_end): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_11_bwd): lea (L(shl_11_bwd_loop_L1)-L(shl_11_bwd))(%r9), %r9 cmp %rcx, %rdx @@ -1358,7 +1354,7 @@ L(shl_11_bwd_end): movdqu %xmm0, (%r8) BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_12): lea (L(shl_12_loop_L1)-L(shl_12))(%r9), %r9 cmp %rcx, %rdx @@ -1401,7 +1397,7 @@ L(shl_12_end): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_12_bwd): lea (L(shl_12_bwd_loop_L1)-L(shl_12_bwd))(%r9), %r9 cmp %rcx, %rdx @@ -1443,7 +1439,7 @@ L(shl_12_bwd_end): movdqu %xmm0, (%r8) BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_13): lea (L(shl_13_loop_L1)-L(shl_13))(%r9), %r9 cmp %rcx, %rdx @@ -1486,7 +1482,7 @@ L(shl_13_end): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_13_bwd): lea (L(shl_13_bwd_loop_L1)-L(shl_13_bwd))(%r9), %r9 cmp %rcx, %rdx @@ -1528,7 +1524,7 @@ L(shl_13_bwd_end): movdqu %xmm0, (%r8) BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_14): lea (L(shl_14_loop_L1)-L(shl_14))(%r9), %r9 cmp %rcx, %rdx @@ -1571,7 +1567,7 @@ L(shl_14_end): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_14_bwd): lea (L(shl_14_bwd_loop_L1)-L(shl_14_bwd))(%r9), %r9 cmp %rcx, %rdx @@ -1613,7 +1609,7 @@ L(shl_14_bwd_end): movdqu %xmm0, (%r8) BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_15): lea (L(shl_15_loop_L1)-L(shl_15))(%r9), %r9 cmp %rcx, %rdx @@ -1656,7 +1652,7 @@ L(shl_15_end): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_15_bwd): lea (L(shl_15_bwd_loop_L1)-L(shl_15_bwd))(%r9), %r9 cmp %rcx, %rdx @@ -1698,7 +1694,7 @@ L(shl_15_bwd_end): movdqu %xmm0, (%r8) BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(write_72bytes): movdqu -72(%rsi), %xmm0 movdqu -56(%rsi), %xmm1 @@ -1716,7 +1712,7 @@ L(write_72bytes): mov %rcx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_64bytes): movdqu -64(%rsi), %xmm0 mov -48(%rsi), %rcx @@ -1734,7 +1730,7 @@ L(write_64bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_56bytes): movdqu -56(%rsi), %xmm0 mov -40(%rsi), %r8 @@ -1750,7 +1746,7 @@ L(write_56bytes): mov %rcx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_48bytes): mov -48(%rsi), %rcx mov -40(%rsi), %r8 @@ -1766,7 +1762,7 @@ L(write_48bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_40bytes): mov -40(%rsi), %r8 mov -32(%rsi), %r9 @@ -1780,7 +1776,7 @@ L(write_40bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_32bytes): mov -32(%rsi), %r9 mov -24(%rsi), %r10 @@ -1792,7 +1788,7 @@ L(write_32bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_24bytes): mov -24(%rsi), %r10 mov -16(%rsi), %r11 @@ -1802,7 +1798,7 @@ L(write_24bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_16bytes): mov -16(%rsi), %r11 mov -8(%rsi), %rdx @@ -1810,14 +1806,14 @@ L(write_16bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_8bytes): mov -8(%rsi), %rdx mov %rdx, -8(%rdi) L(write_0bytes): ret - ALIGN (4) + .p2align 4 
L(write_73bytes): movdqu -73(%rsi), %xmm0 movdqu -57(%rsi), %xmm1 @@ -1837,7 +1833,7 @@ L(write_73bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_65bytes): movdqu -65(%rsi), %xmm0 movdqu -49(%rsi), %xmm1 @@ -1855,7 +1851,7 @@ L(write_65bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_57bytes): movdqu -57(%rsi), %xmm0 mov -41(%rsi), %r8 @@ -1873,7 +1869,7 @@ L(write_57bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_49bytes): movdqu -49(%rsi), %xmm0 mov -33(%rsi), %r9 @@ -1889,7 +1885,7 @@ L(write_49bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_41bytes): mov -41(%rsi), %r8 mov -33(%rsi), %r9 @@ -1905,7 +1901,7 @@ L(write_41bytes): mov %dl, -1(%rdi) ret - ALIGN (4) + .p2align 4 L(write_33bytes): mov -33(%rsi), %r9 mov -25(%rsi), %r10 @@ -1919,7 +1915,7 @@ L(write_33bytes): mov %dl, -1(%rdi) ret - ALIGN (4) + .p2align 4 L(write_25bytes): mov -25(%rsi), %r10 mov -17(%rsi), %r11 @@ -1931,7 +1927,7 @@ L(write_25bytes): mov %dl, -1(%rdi) ret - ALIGN (4) + .p2align 4 L(write_17bytes): mov -17(%rsi), %r11 mov -9(%rsi), %rcx @@ -1941,7 +1937,7 @@ L(write_17bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_9bytes): mov -9(%rsi), %rcx mov -4(%rsi), %edx @@ -1949,13 +1945,13 @@ L(write_9bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_1bytes): mov -1(%rsi), %dl mov %dl, -1(%rdi) ret - ALIGN (4) + .p2align 4 L(write_74bytes): movdqu -74(%rsi), %xmm0 movdqu -58(%rsi), %xmm1 @@ -1975,7 +1971,7 @@ L(write_74bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_66bytes): movdqu -66(%rsi), %xmm0 movdqu -50(%rsi), %xmm1 @@ -1995,7 +1991,7 @@ L(write_66bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_58bytes): movdqu -58(%rsi), %xmm1 mov -42(%rsi), %r8 @@ -2013,7 +2009,7 @@ L(write_58bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_50bytes): movdqu -50(%rsi), %xmm0 mov -34(%rsi), %r9 @@ -2029,7 +2025,7 @@ L(write_50bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_42bytes): mov -42(%rsi), %r8 mov -34(%rsi), %r9 @@ -2045,7 +2041,7 @@ L(write_42bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_34bytes): mov -34(%rsi), %r9 mov -26(%rsi), %r10 @@ -2059,7 +2055,7 @@ L(write_34bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_26bytes): mov -26(%rsi), %r10 mov -18(%rsi), %r11 @@ -2071,7 +2067,7 @@ L(write_26bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_18bytes): mov -18(%rsi), %r11 mov -10(%rsi), %rcx @@ -2081,7 +2077,7 @@ L(write_18bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_10bytes): mov -10(%rsi), %rcx mov -4(%rsi), %edx @@ -2089,13 +2085,13 @@ L(write_10bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_2bytes): mov -2(%rsi), %dx mov %dx, -2(%rdi) ret - ALIGN (4) + .p2align 4 L(write_75bytes): movdqu -75(%rsi), %xmm0 movdqu -59(%rsi), %xmm1 @@ -2115,7 +2111,7 @@ L(write_75bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_67bytes): movdqu -67(%rsi), %xmm0 movdqu -59(%rsi), %xmm1 @@ -2135,7 +2131,7 @@ L(write_67bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_59bytes): movdqu -59(%rsi), %xmm0 mov -43(%rsi), %r8 @@ -2153,7 +2149,7 @@ L(write_59bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_51bytes): movdqu -51(%rsi), %xmm0 mov -35(%rsi), %r9 @@ -2169,7 +2165,7 @@ L(write_51bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_43bytes): mov -43(%rsi), %r8 mov -35(%rsi), %r9 @@ -2185,7 +2181,7 @@ L(write_43bytes): mov %edx, -4(%rdi) ret - 
ALIGN (4) + .p2align 4 L(write_35bytes): mov -35(%rsi), %r9 mov -27(%rsi), %r10 @@ -2199,7 +2195,7 @@ L(write_35bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_27bytes): mov -27(%rsi), %r10 mov -19(%rsi), %r11 @@ -2211,7 +2207,7 @@ L(write_27bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_19bytes): mov -19(%rsi), %r11 mov -11(%rsi), %rcx @@ -2221,7 +2217,7 @@ L(write_19bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_11bytes): mov -11(%rsi), %rcx mov -4(%rsi), %edx @@ -2229,7 +2225,7 @@ L(write_11bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_3bytes): mov -3(%rsi), %dx mov -2(%rsi), %cx @@ -2237,7 +2233,7 @@ L(write_3bytes): mov %cx, -2(%rdi) ret - ALIGN (4) + .p2align 4 L(write_76bytes): movdqu -76(%rsi), %xmm0 movdqu -60(%rsi), %xmm1 @@ -2257,7 +2253,7 @@ L(write_76bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_68bytes): movdqu -68(%rsi), %xmm0 movdqu -52(%rsi), %xmm1 @@ -2275,7 +2271,7 @@ L(write_68bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_60bytes): movdqu -60(%rsi), %xmm0 mov -44(%rsi), %r8 @@ -2293,7 +2289,7 @@ L(write_60bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_52bytes): movdqu -52(%rsi), %xmm0 mov -36(%rsi), %r9 @@ -2309,7 +2305,7 @@ L(write_52bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_44bytes): mov -44(%rsi), %r8 mov -36(%rsi), %r9 @@ -2325,7 +2321,7 @@ L(write_44bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_36bytes): mov -36(%rsi), %r9 mov -28(%rsi), %r10 @@ -2339,7 +2335,7 @@ L(write_36bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_28bytes): mov -28(%rsi), %r10 mov -20(%rsi), %r11 @@ -2351,7 +2347,7 @@ L(write_28bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_20bytes): mov -20(%rsi), %r11 mov -12(%rsi), %rcx @@ -2361,7 +2357,7 @@ L(write_20bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_12bytes): mov -12(%rsi), %rcx mov -4(%rsi), %edx @@ -2369,13 +2365,13 @@ L(write_12bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_4bytes): mov -4(%rsi), %edx mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_77bytes): movdqu -77(%rsi), %xmm0 movdqu -61(%rsi), %xmm1 @@ -2395,7 +2391,7 @@ L(write_77bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_69bytes): movdqu -69(%rsi), %xmm0 movdqu -53(%rsi), %xmm1 @@ -2413,7 +2409,7 @@ L(write_69bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_61bytes): movdqu -61(%rsi), %xmm0 mov -45(%rsi), %r8 @@ -2431,7 +2427,7 @@ L(write_61bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_53bytes): movdqu -53(%rsi), %xmm0 mov -45(%rsi), %r8 @@ -2448,7 +2444,7 @@ L(write_53bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_45bytes): mov -45(%rsi), %r8 mov -37(%rsi), %r9 @@ -2464,7 +2460,7 @@ L(write_45bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_37bytes): mov -37(%rsi), %r9 mov -29(%rsi), %r10 @@ -2478,7 +2474,7 @@ L(write_37bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_29bytes): mov -29(%rsi), %r10 mov -21(%rsi), %r11 @@ -2490,7 +2486,7 @@ L(write_29bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_21bytes): mov -21(%rsi), %r11 mov -13(%rsi), %rcx @@ -2500,7 +2496,7 @@ L(write_21bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_13bytes): mov -13(%rsi), %rcx mov -8(%rsi), %rdx @@ -2508,7 +2504,7 @@ L(write_13bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_5bytes): mov -5(%rsi), %edx mov -4(%rsi), %ecx @@ 
-2516,7 +2512,7 @@ L(write_5bytes): mov %ecx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_78bytes): movdqu -78(%rsi), %xmm0 movdqu -62(%rsi), %xmm1 @@ -2536,7 +2532,7 @@ L(write_78bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_70bytes): movdqu -70(%rsi), %xmm0 movdqu -54(%rsi), %xmm1 @@ -2554,7 +2550,7 @@ L(write_70bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_62bytes): movdqu -62(%rsi), %xmm0 mov -46(%rsi), %r8 @@ -2572,7 +2568,7 @@ L(write_62bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_54bytes): movdqu -54(%rsi), %xmm0 mov -38(%rsi), %r9 @@ -2588,7 +2584,7 @@ L(write_54bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_46bytes): mov -46(%rsi), %r8 mov -38(%rsi), %r9 @@ -2604,7 +2600,7 @@ L(write_46bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_38bytes): mov -38(%rsi), %r9 mov -30(%rsi), %r10 @@ -2618,7 +2614,7 @@ L(write_38bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_30bytes): mov -30(%rsi), %r10 mov -22(%rsi), %r11 @@ -2630,7 +2626,7 @@ L(write_30bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_22bytes): mov -22(%rsi), %r11 mov -14(%rsi), %rcx @@ -2640,7 +2636,7 @@ L(write_22bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_14bytes): mov -14(%rsi), %rcx mov -8(%rsi), %rdx @@ -2648,7 +2644,7 @@ L(write_14bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_6bytes): mov -6(%rsi), %edx mov -4(%rsi), %ecx @@ -2656,7 +2652,7 @@ L(write_6bytes): mov %ecx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_79bytes): movdqu -79(%rsi), %xmm0 movdqu -63(%rsi), %xmm1 @@ -2676,7 +2672,7 @@ L(write_79bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_71bytes): movdqu -71(%rsi), %xmm0 movdqu -55(%rsi), %xmm1 @@ -2694,7 +2690,7 @@ L(write_71bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_63bytes): movdqu -63(%rsi), %xmm0 mov -47(%rsi), %r8 @@ -2712,7 +2708,7 @@ L(write_63bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_55bytes): movdqu -55(%rsi), %xmm0 mov -39(%rsi), %r9 @@ -2728,7 +2724,7 @@ L(write_55bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_47bytes): mov -47(%rsi), %r8 mov -39(%rsi), %r9 @@ -2744,7 +2740,7 @@ L(write_47bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_39bytes): mov -39(%rsi), %r9 mov -31(%rsi), %r10 @@ -2758,7 +2754,7 @@ L(write_39bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_31bytes): mov -31(%rsi), %r10 mov -23(%rsi), %r11 @@ -2770,7 +2766,7 @@ L(write_31bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_23bytes): mov -23(%rsi), %r11 mov -15(%rsi), %rcx @@ -2780,7 +2776,7 @@ L(write_23bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_15bytes): mov -15(%rsi), %rcx mov -8(%rsi), %rdx @@ -2788,7 +2784,7 @@ L(write_15bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_7bytes): mov -7(%rsi), %edx mov -4(%rsi), %ecx @@ -2796,7 +2792,7 @@ L(write_7bytes): mov %ecx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(large_page_fwd): movdqu (%rsi), %xmm1 lea 16(%rsi), %rsi @@ -2859,7 +2855,7 @@ L(large_page_less_64bytes): BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) #ifdef USE_AS_MEMMOVE - ALIGN (4) + .p2align 4 L(ll_cache_copy_fwd_start): prefetcht0 0x1c0(%rsi) prefetcht0 0x200(%rsi) @@ -2906,7 +2902,7 @@ L(large_page_ll_less_fwd_64bytes): BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) #endif - ALIGN (4) + .p2align 4 L(large_page_bwd): movdqu -0x10(%rsi), %xmm1 lea -16(%rsi), %rsi @@ -2966,7 +2962,7 @@ 
L(large_page_less_bwd_64bytes): BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) #ifdef USE_AS_MEMMOVE - ALIGN (4) + .p2align 4 L(ll_cache_copy_bwd_start): prefetcht0 -0x1c0(%rsi) prefetcht0 -0x200(%rsi) @@ -3014,7 +3010,7 @@ L(large_page_ll_less_bwd_64bytes): END (MEMCPY) .section .rodata.ssse3,"a",@progbits - ALIGN (3) + .p2align 3 L(table_less_80bytes): .int JMPTBL (L(write_0bytes), L(table_less_80bytes)) .int JMPTBL (L(write_1bytes), L(table_less_80bytes)) @@ -3097,7 +3093,7 @@ L(table_less_80bytes): .int JMPTBL (L(write_78bytes), L(table_less_80bytes)) .int JMPTBL (L(write_79bytes), L(table_less_80bytes)) - ALIGN (3) + .p2align 3 L(shl_table): .int JMPTBL (L(shl_0), L(shl_table)) .int JMPTBL (L(shl_1), L(shl_table)) @@ -3116,7 +3112,7 @@ L(shl_table): .int JMPTBL (L(shl_14), L(shl_table)) .int JMPTBL (L(shl_15), L(shl_table)) - ALIGN (3) + .p2align 3 L(shl_table_bwd): .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd)) .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd)) diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S index a1e5031376..40ae926386 100644 --- a/sysdeps/x86_64/multiarch/memcpy.S +++ b/sysdeps/x86_64/multiarch/memcpy.S @@ -1,6 +1,6 @@ /* Multiple versions of memcpy All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2010-2013 Free Software Foundation, Inc. + Copyright (C) 2010-2014 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. diff --git a/sysdeps/x86_64/multiarch/memcpy_chk.S b/sysdeps/x86_64/multiarch/memcpy_chk.S index ad01d8cd9f..3c0270fd23 100644 --- a/sysdeps/x86_64/multiarch/memcpy_chk.S +++ b/sysdeps/x86_64/multiarch/memcpy_chk.S @@ -1,6 +1,6 @@ /* Multiple versions of __memcpy_chk All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2010-2013 Free Software Foundation, Inc. + Copyright (C) 2010-2014 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. diff --git a/sysdeps/x86_64/multiarch/memmove.c b/sysdeps/x86_64/multiarch/memmove.c index 8149c487d5..ba86e7bbb1 100644 --- a/sysdeps/x86_64/multiarch/memmove.c +++ b/sysdeps/x86_64/multiarch/memmove.c @@ -1,6 +1,6 @@ /* Multiple versions of memmove. All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2010-2013 Free Software Foundation, Inc. + Copyright (C) 2010-2014 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/multiarch/memmove_chk.c b/sysdeps/x86_64/multiarch/memmove_chk.c index 17ed460324..cb1acb6598 100644 --- a/sysdeps/x86_64/multiarch/memmove_chk.c +++ b/sysdeps/x86_64/multiarch/memmove_chk.c @@ -1,6 +1,6 @@ /* Multiple versions of __memmove_chk. All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2010-2013 Free Software Foundation, Inc. + Copyright (C) 2010-2014 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/multiarch/mempcpy.S b/sysdeps/x86_64/multiarch/mempcpy.S index b8b7fcd121..b9f04c2ec4 100644 --- a/sysdeps/x86_64/multiarch/mempcpy.S +++ b/sysdeps/x86_64/multiarch/mempcpy.S @@ -1,6 +1,6 @@ /* Multiple versions of mempcpy All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2010-2013 Free Software Foundation, Inc. + Copyright (C) 2010-2014 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. 
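Most of the churn in the assembly files above is mechanical: each file used to define a private ALIGN(n) macro expanding to .p2align n, and this patch deletes the macro and writes the directive directly. The argument is a power-of-two exponent rather than a byte count, so ALIGN (4) meant 16-byte alignment and the jump tables' ALIGN (3) meant 8; the substitution does not change the generated code. A quick C check of that arithmetic (the p2align helper here is ours, purely illustrative):

#include <assert.h>
#include <stdint.h>

/* .p2align n pads the location counter up to a 2^n-byte boundary,
   which is exactly what ALIGN (n) expanded to before this cleanup.  */
static uintptr_t
p2align (uintptr_t pc, unsigned int n)
{
  uintptr_t a = (uintptr_t) 1 << n;
  return (pc + a - 1) & ~(a - 1);
}

int
main (void)
{
  assert (p2align (0x1234, 4) == 0x1240);  /* ALIGN (4): 16 bytes */
  assert (p2align (0x1238, 3) == 0x1238);  /* ALIGN (3): already 8-aligned */
  return 0;
}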
diff --git a/sysdeps/x86_64/multiarch/mempcpy_chk.S b/sysdeps/x86_64/multiarch/mempcpy_chk.S index 3801db399b..c28473a669 100644 --- a/sysdeps/x86_64/multiarch/mempcpy_chk.S +++ b/sysdeps/x86_64/multiarch/mempcpy_chk.S @@ -1,6 +1,6 @@ /* Multiple versions of __mempcpy_chk All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2010-2013 Free Software Foundation, Inc. + Copyright (C) 2010-2014 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. diff --git a/sysdeps/x86_64/multiarch/rawmemchr.S b/sysdeps/x86_64/multiarch/rawmemchr.S deleted file mode 100644 index 50de38ffb5..0000000000 --- a/sysdeps/x86_64/multiarch/rawmemchr.S +++ /dev/null @@ -1,103 +0,0 @@ -/* Multiple versions of rawmemchr - All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2009-2013 Free Software Foundation, Inc. - Contributed by Ulrich Drepper <drepper@redhat.com>. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - -/* Define multiple versions only for the definition in lib. */ -#ifndef NOT_IN_libc - .text -ENTRY(rawmemchr) - .type rawmemchr, @gnu_indirect_function - cmpl $0, __cpu_features+KIND_OFFSET(%rip) - jne 1f - call __init_cpu_features -1: testl $bit_Prefer_PMINUB_for_stringop, __cpu_features+FEATURE_OFFSET+index_Prefer_PMINUB_for_stringop(%rip) - jnz 2f - testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip) - jz 2f - leaq __rawmemchr_sse42(%rip), %rax - ret -2: leaq __rawmemchr_sse2(%rip), %rax - ret - -END(rawmemchr) -strong_alias (rawmemchr, __rawmemchr) - - - .section .text.sse4.2,"ax",@progbits - .align 16 - .type __rawmemchr_sse42, @function - .globl __rawmemchr_sse42 - .hidden __rawmemchr_sse42 -__rawmemchr_sse42: - cfi_startproc - CALL_MCOUNT - movd %esi, %xmm1 - movq %rdi, %rcx - pxor %xmm2, %xmm2 - andq $~15, %rdi - orl $0xffffffff, %esi - pshufb %xmm2, %xmm1 - movdqa (%rdi), %xmm0 - subq %rdi, %rcx - pcmpeqb %xmm1, %xmm0 - shl %cl, %esi - pmovmskb %xmm0, %ecx - movl $16, %eax - movl $16, %edx - andl %esi, %ecx - jnz 1f - -2: pcmpestri $0x08, 16(%rdi), %xmm1 - leaq 16(%rdi), %rdi - jnc 2b - - leaq (%rdi,%rcx), %rax - ret - -1: bsfl %ecx, %eax - addq %rdi, %rax - ret - cfi_endproc - .size __rawmemchr_sse42, .-__rawmemchr_sse42 - - -# undef ENTRY -# define ENTRY(name) \ - .type __rawmemchr_sse2, @function; \ - .align 16; \ - .globl __rawmemchr_sse2; \ - .hidden __rawmemchr_sse2; \ - __rawmemchr_sse2: cfi_startproc; \ - CALL_MCOUNT -# undef END -# define END(name) \ - cfi_endproc; .size __rawmemchr_sse2, .-__rawmemchr_sse2 -# undef libc_hidden_builtin_def -/* It doesn't make sense to send libc-internal rawmemchr calls through a PLT. - The speedup we get from using SSE4.2 instruction is likely eaten away - by the indirect call in the PLT. 
*/ -# define libc_hidden_builtin_def(name) \ - .globl __GI___rawmemchr; __GI___rawmemchr = __rawmemchr_sse2 -#endif - -#include "../rawmemchr.S" diff --git a/sysdeps/x86_64/multiarch/sched_cpucount.c b/sysdeps/x86_64/multiarch/sched_cpucount.c index cd127cdc69..68a043a169 100644 --- a/sysdeps/x86_64/multiarch/sched_cpucount.c +++ b/sysdeps/x86_64/multiarch/sched_cpucount.c @@ -1,6 +1,6 @@ /* Count bits in CPU set. x86-64 multi-arch version. This file is part of the GNU C Library. - Copyright (C) 2008-2013 Free Software Foundation, Inc. + Copyright (C) 2008-2014 Free Software Foundation, Inc. Contributed by Ulrich Drepper <drepper@redhat.com>. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/multiarch/strcasestr-c.c b/sysdeps/x86_64/multiarch/strcasestr-c.c deleted file mode 100644 index c13a4c44f3..0000000000 --- a/sysdeps/x86_64/multiarch/strcasestr-c.c +++ /dev/null @@ -1,19 +0,0 @@ -/* Multiple versions of strcasestr - All versions must be listed in ifunc-impl-list.c. */ - -#include "init-arch.h" - -#define STRCASESTR __strcasestr_sse2 - -#include "string/strcasestr.c" - -extern char *__strcasestr_sse42 (const char *, const char *) attribute_hidden; -extern __typeof (__strcasestr_sse2) __strcasestr_sse2 attribute_hidden; - -#if 1 -libc_ifunc (__strcasestr, - HAS_SSE4_2 ? __strcasestr_sse42 : __strcasestr_sse2); -#else -libc_ifunc (__strcasestr, - 0 ? __strcasestr_sse42 : __strcasestr_sse2); -#endif diff --git a/sysdeps/x86_64/multiarch/strcasestr-nonascii.c b/sysdeps/x86_64/multiarch/strcasestr-nonascii.c deleted file mode 100644 index 032a6420d6..0000000000 --- a/sysdeps/x86_64/multiarch/strcasestr-nonascii.c +++ /dev/null @@ -1,50 +0,0 @@ -/* strstr with SSE4.2 intrinsics - Copyright (C) 2010-2013 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <ctype.h> -#include <xmmintrin.h> - - -/* Similar to __m128i_strloadu. Convert to lower case for none-POSIX/C - locale. */ -static __m128i -__m128i_strloadu_tolower (const unsigned char *p) -{ - union - { - char b[16]; - __m128i x; - } u; - - for (int i = 0; i < 16; ++i) - if (p[i] == 0) - { - u.b[i] = 0; - break; - } - else - u.b[i] = tolower (p[i]); - - return u.x; -} - - -#define STRCASESTR_NONASCII -#define USE_AS_STRCASESTR -#define STRSTR_SSE42 __strcasestr_sse42_nonascii -#include "strstr.c" diff --git a/sysdeps/x86_64/multiarch/strcasestr.c b/sysdeps/x86_64/multiarch/strcasestr.c index d1cfb3b264..834e656a2c 100644 --- a/sysdeps/x86_64/multiarch/strcasestr.c +++ b/sysdeps/x86_64/multiarch/strcasestr.c @@ -1,7 +1,13 @@ -extern char *__strcasestr_sse42_nonascii (const unsigned char *s1, - const unsigned char *s2) - attribute_hidden; +/* Multiple versions of strcasestr + All versions must be listed in ifunc-impl-list.c. 
*/ -#define USE_AS_STRCASESTR -#define STRSTR_SSE42 __strcasestr_sse42 -#include "strstr.c" +#include "init-arch.h" + +#define STRCASESTR __strcasestr_sse2 + +#include "string/strcasestr.c" + +extern __typeof (__strcasestr_sse2) __strcasestr_sse2 attribute_hidden; + +libc_ifunc (__strcasestr, + __strcasestr_sse2); diff --git a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S index 028c6d3d74..dc782f2c23 100644 --- a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S +++ b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S @@ -1,5 +1,5 @@ /* strcat with SSE2 - Copyright (C) 2011-2013 Free Software Foundation, Inc. + Copyright (C) 2011-2014 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. diff --git a/sysdeps/x86_64/multiarch/strcat-ssse3.S b/sysdeps/x86_64/multiarch/strcat-ssse3.S index 8101b91e59..fde7b90822 100644 --- a/sysdeps/x86_64/multiarch/strcat-ssse3.S +++ b/sysdeps/x86_64/multiarch/strcat-ssse3.S @@ -1,5 +1,5 @@ /* strcat with SSSE3 - Copyright (C) 2011-2013 Free Software Foundation, Inc. + Copyright (C) 2011-2014 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. diff --git a/sysdeps/x86_64/multiarch/strcat.S b/sysdeps/x86_64/multiarch/strcat.S index f94dc709be..d5c9d847d4 100644 --- a/sysdeps/x86_64/multiarch/strcat.S +++ b/sysdeps/x86_64/multiarch/strcat.S @@ -1,6 +1,6 @@ /* Multiple versions of strcat All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2009-2013 Free Software Foundation, Inc. + Copyright (C) 2009-2014 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. diff --git a/sysdeps/x86_64/multiarch/strchr-sse2-no-bsf.S b/sysdeps/x86_64/multiarch/strchr-sse2-no-bsf.S index 72da62f3d2..0b3f0961c3 100644 --- a/sysdeps/x86_64/multiarch/strchr-sse2-no-bsf.S +++ b/sysdeps/x86_64/multiarch/strchr-sse2-no-bsf.S @@ -1,5 +1,5 @@ /* strchr with SSE2 without bsf - Copyright (C) 2011-2013 Free Software Foundation, Inc. + Copyright (C) 2011-2014 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. diff --git a/sysdeps/x86_64/multiarch/strchr.S b/sysdeps/x86_64/multiarch/strchr.S index 6860329449..63a35fa62f 100644 --- a/sysdeps/x86_64/multiarch/strchr.S +++ b/sysdeps/x86_64/multiarch/strchr.S @@ -1,5 +1,5 @@ /* Multiple versions of strchr - Copyright (C) 2009-2013 Free Software Foundation, Inc. + Copyright (C) 2009-2014 Free Software Foundation, Inc. This file is part of the GNU C Library. 
The GNU C Library is free software; you can redistribute it and/or @@ -29,10 +29,6 @@ ENTRY(strchr) jne 1f call __init_cpu_features 1: leaq __strchr_sse2(%rip), %rax - testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip) - jz 2f - leaq __strchr_sse42(%rip), %rax - ret 2: testl $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip) jz 3f leaq __strchr_sse2_no_bsf(%rip), %rax @@ -40,127 +36,6 @@ ENTRY(strchr) END(strchr) -/* - This implementation uses SSE4 instructions to compare up to 16 bytes - at a time looking for the first occurrence of the character c in the - string s: - - char *strchr (const char *s, int c); - - We use 0xa: - _SIDD_SBYTE_OPS - | _SIDD_CMP_EQUAL_EACH - | _SIDD_LEAST_SIGNIFICANT - on pcmpistri to compare xmm/mem128 - - 0 1 2 3 4 5 6 7 8 9 A B C D E F - X X X X X X X X X X X X X X X X - - against xmm - - 0 1 2 3 4 5 6 7 8 9 A B C D E F - C C C C C C C C C C C C C C C C - - to find out if the first 16byte data element has a byte C and the - offset of the first byte. There are 3 cases: - - 1. The first 16byte data element has the byte C at the offset X. - 2. The first 16byte data element has EOS and doesn't have the byte C. - 3. The first 16byte data element is valid and doesn't have the byte C. - - Here is the table of ECX, CFlag, ZFlag and SFlag for 3 cases: - - case ECX CFlag ZFlag SFlag - 1 X 1 0/1 0 - 2 16 0 1 0 - 3 16 0 0 0 - - We exit from the loop for cases 1 and 2 with jbe which branches - when either CFlag or ZFlag is 1. If CFlag == 1, ECX has the offset - X for case 1. */ - - .section .text.sse4.2,"ax",@progbits - .align 16 - .type __strchr_sse42, @function - .globl __strchr_sse42 - .hidden __strchr_sse42 -__strchr_sse42: - cfi_startproc - CALL_MCOUNT - testb %sil, %sil - je __strend_sse4 - pxor %xmm2, %xmm2 - movd %esi, %xmm1 - movl %edi, %ecx - pshufb %xmm2, %xmm1 - andl $15, %ecx - movq %rdi, %r8 - je L(aligned_start) - -/* Handle unaligned string. */ - andq $-16, %r8 - movdqa (%r8), %xmm0 - pcmpeqb %xmm0, %xmm2 - pcmpeqb %xmm1, %xmm0 - /* Find where NULL is. */ - pmovmskb %xmm2, %edx - /* Check if there is a match. */ - pmovmskb %xmm0, %esi - /* Remove the leading bytes. */ - sarl %cl, %edx - sarl %cl, %esi - testl %esi, %esi - je L(unaligned_no_match) - /* Check which byte is a match. */ - bsfl %esi, %eax - /* Is there a NULL? */ - testl %edx, %edx - je L(unaligned_match) - bsfl %edx, %esi - cmpl %esi, %eax - /* Return NULL if NULL comes first. */ - ja L(return_null) -L(unaligned_match): - addq %rdi, %rax - ret - - .p2align 4 -L(unaligned_no_match): - testl %edx, %edx - jne L(return_null) - -/* Loop start on aligned string. */ -L(loop): - addq $16, %r8 -L(aligned_start): - pcmpistri $0x2, (%r8), %xmm1 - jbe L(wrap) - addq $16, %r8 - pcmpistri $0x2, (%r8), %xmm1 - jbe L(wrap) - addq $16, %r8 - pcmpistri $0x2, (%r8), %xmm1 - jbe L(wrap) - addq $16, %r8 - pcmpistri $0x2, (%r8), %xmm1 - jbe L(wrap) - jmp L(loop) -L(wrap): - jc L(loop_exit) - -/* Return NULL. */ -L(return_null): - xorl %eax, %eax - ret - -/* Loop exit. */ - .p2align 4 -L(loop_exit): - leaq (%r8,%rcx), %rax - ret - cfi_endproc - .size __strchr_sse42, .-__strchr_sse42 - # undef ENTRY # define ENTRY(name) \ diff --git a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S new file mode 100644 index 0000000000..b133ffc3ea --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S @@ -0,0 +1,209 @@ +/* strcmp with unaligned loads + Copyright (C) 2013-2014 Free Software Foundation, Inc. 
+ This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include "sysdep.h" + +ENTRY ( __strcmp_sse2_unaligned) + movl %edi, %eax + xorl %edx, %edx + pxor %xmm7, %xmm7 + orl %esi, %eax + andl $4095, %eax + cmpl $4032, %eax + jg L(cross_page) + movdqu (%rdi), %xmm1 + movdqu (%rsi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pminub %xmm1, %xmm0 + pxor %xmm1, %xmm1 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + testq %rax, %rax + je L(next_48_bytes) +L(return): + bsfq %rax, %rdx + movzbl (%rdi, %rdx), %eax + movzbl (%rsi, %rdx), %edx + subl %edx, %eax + ret + + .p2align 4 +L(next_48_bytes): + movdqu 16(%rdi), %xmm6 + movdqu 16(%rsi), %xmm3 + movdqu 32(%rdi), %xmm5 + pcmpeqb %xmm6, %xmm3 + movdqu 32(%rsi), %xmm2 + pminub %xmm6, %xmm3 + pcmpeqb %xmm1, %xmm3 + movdqu 48(%rdi), %xmm4 + pcmpeqb %xmm5, %xmm2 + pmovmskb %xmm3, %edx + movdqu 48(%rsi), %xmm0 + pminub %xmm5, %xmm2 + pcmpeqb %xmm1, %xmm2 + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm2, %eax + salq $16, %rdx + pminub %xmm4, %xmm0 + pcmpeqb %xmm1, %xmm0 + salq $32, %rax + orq %rdx, %rax + pmovmskb %xmm0, %ecx + movq %rcx, %rdx + salq $48, %rdx + orq %rdx, %rax + jne L(return) +L(main_loop_header): + leaq 64(%rdi), %rdx + movl $4096, %ecx + pxor %xmm9, %xmm9 + andq $-64, %rdx + subq %rdi, %rdx + leaq (%rdi, %rdx), %rax + addq %rsi, %rdx + movq %rdx, %rsi + andl $4095, %esi + subq %rsi, %rcx + shrq $6, %rcx + movq %rcx, %rsi + jmp L(loop_start) + + .p2align 4 +L(loop): + addq $64, %rax + addq $64, %rdx +L(loop_start): + testq %rsi, %rsi + leaq -1(%rsi), %rsi + je L(loop_cross_page) +L(back_to_loop): + movdqu (%rdx), %xmm0 + movdqu 16(%rdx), %xmm1 + movdqa (%rax), %xmm2 + movdqa 16(%rax), %xmm3 + pcmpeqb %xmm2, %xmm0 + movdqu 32(%rdx), %xmm5 + pcmpeqb %xmm3, %xmm1 + pminub %xmm2, %xmm0 + movdqu 48(%rdx), %xmm6 + pminub %xmm3, %xmm1 + movdqa 32(%rax), %xmm2 + pminub %xmm1, %xmm0 + movdqa 48(%rax), %xmm3 + pcmpeqb %xmm2, %xmm5 + pcmpeqb %xmm3, %xmm6 + pminub %xmm2, %xmm5 + pminub %xmm3, %xmm6 + pminub %xmm5, %xmm0 + pminub %xmm6, %xmm0 + pcmpeqb %xmm7, %xmm0 + pmovmskb %xmm0, %ecx + testl %ecx, %ecx + je L(loop) + pcmpeqb %xmm7, %xmm5 + movdqu (%rdx), %xmm0 + pcmpeqb %xmm7, %xmm1 + movdqa (%rax), %xmm2 + pcmpeqb %xmm2, %xmm0 + pminub %xmm2, %xmm0 + pcmpeqb %xmm7, %xmm6 + pcmpeqb %xmm7, %xmm0 + pmovmskb %xmm1, %ecx + pmovmskb %xmm5, %r8d + pmovmskb %xmm0, %edi + salq $16, %rcx + salq $32, %r8 + pmovmskb %xmm6, %esi + orq %r8, %rcx + orq %rdi, %rcx + salq $48, %rsi + orq %rsi, %rcx + bsfq %rcx, %rcx + movzbl (%rax, %rcx), %eax + movzbl (%rdx, %rcx), %edx + subl %edx, %eax + ret + + .p2align 4 +L(loop_cross_page): + xor %r10, %r10 + movq %rdx, %r9 + and $63, %r9 + subq %r9, %r10 + + movdqa (%rdx, %r10), %xmm0 + movdqa 16(%rdx, %r10), %xmm1 + movdqu (%rax, %r10), %xmm2 + movdqu 16(%rax, %r10), %xmm3 + pcmpeqb %xmm2, %xmm0 + movdqa 32(%rdx, %r10), %xmm5 + pcmpeqb %xmm3, %xmm1 + pminub %xmm2, %xmm0 + movdqa 
48(%rdx, %r10), %xmm6 + pminub %xmm3, %xmm1 + movdqu 32(%rax, %r10), %xmm2 + movdqu 48(%rax, %r10), %xmm3 + pcmpeqb %xmm2, %xmm5 + pcmpeqb %xmm3, %xmm6 + pminub %xmm2, %xmm5 + pminub %xmm3, %xmm6 + + pcmpeqb %xmm7, %xmm0 + pcmpeqb %xmm7, %xmm1 + pcmpeqb %xmm7, %xmm5 + pcmpeqb %xmm7, %xmm6 + + pmovmskb %xmm1, %ecx + pmovmskb %xmm5, %r8d + pmovmskb %xmm0, %edi + salq $16, %rcx + salq $32, %r8 + pmovmskb %xmm6, %esi + orq %r8, %rdi + orq %rcx, %rdi + salq $48, %rsi + orq %rsi, %rdi + movq %r9, %rcx + movq $63, %rsi + shrq %cl, %rdi + test %rdi, %rdi + je L(back_to_loop) + bsfq %rdi, %rcx + movzbl (%rax, %rcx), %eax + movzbl (%rdx, %rcx), %edx + subl %edx, %eax + ret + + .p2align 4 +L(cross_page_loop): + cmpb %cl, %al + jne L(different) + addq $1, %rdx + cmpq $64, %rdx + je L(main_loop_header) +L(cross_page): + movzbl (%rdi, %rdx), %eax + movzbl (%rsi, %rdx), %ecx + testb %al, %al + jne L(cross_page_loop) + xorl %eax, %eax +L(different): + subl %ecx, %eax + ret +END (__strcmp_sse2_unaligned) diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S index a503e92115..2d0758a656 100644 --- a/sysdeps/x86_64/multiarch/strcmp-sse42.S +++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S @@ -1,5 +1,5 @@ /* strcmp with SSE4.2 - Copyright (C) 2009-2013 Free Software Foundation, Inc. + Copyright (C) 2009-2014 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. @@ -206,7 +206,7 @@ LABEL(touppermask): jnz LABEL(less16bytes)/* If not, find different value or null char */ #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L sub $16, %r11 - jbe LABEL(strcmp_exitz)/* finish comparision */ + jbe LABEL(strcmp_exitz)/* finish comparison */ #endif add $16, %rsi /* prepare to search next 16 bytes */ add $16, %rdi /* prepare to search next 16 bytes */ diff --git a/sysdeps/x86_64/multiarch/strcmp.S b/sysdeps/x86_64/multiarch/strcmp.S index f69aaf42b3..f3e0ca1259 100644 --- a/sysdeps/x86_64/multiarch/strcmp.S +++ b/sysdeps/x86_64/multiarch/strcmp.S @@ -1,5 +1,5 @@ /* Multiple versions of strcmp - Copyright (C) 2009-2013 Free Software Foundation, Inc. + Copyright (C) 2009-2014 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. 
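The core of __strcmp_sse2_unaligned, added above, is the pcmpeqb/pminub idiom: pcmpeqb leaves 0xff in every byte position where the two blocks agree, and taking the unsigned byte minimum of that mask with the source block forces a zero wherever the bytes differ or the source byte is NUL. One more pcmpeqb against zero plus pmovmskb/bsf then yields the index of the first mismatch or terminator. The entry test ("orl %esi, %eax; andl $4095, %eax; cmpl $4032, %eax") diverts to a byte-at-a-time loop whenever either string starts in the last 64 bytes of a page, where reading the first 64 bytes unaligned could cross into an unmapped page. A minimal sketch of the 16-byte step with SSE2 intrinsics — a reading aid, not the glibc code, which is assembly:

#include <emmintrin.h>

/* Compare 16 bytes of S1 and S2.  On a mismatch or NUL, sets *done and
   returns the byte difference; otherwise clears *done so the caller
   advances both pointers by 16.  */
static int
cmp16 (const unsigned char *s1, const unsigned char *s2, int *done)
{
  __m128i a = _mm_loadu_si128 ((const __m128i *) s1);
  __m128i b = _mm_loadu_si128 ((const __m128i *) s2);

  __m128i eq = _mm_cmpeq_epi8 (a, b);  /* 0xff where s1[i] == s2[i] */
  __m128i m = _mm_min_epu8 (eq, a);    /* 0x00 where they differ or s1[i] == 0 */
  int mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (m, _mm_setzero_si128 ()));

  if (mask == 0)
    {
      *done = 0;                       /* 16 equal bytes, no terminator */
      return 0;
    }
  *done = 1;
  int i = __builtin_ctz (mask);        /* bsf: first mismatch or NUL */
  return s1[i] - s2[i];
}

The main loop above applies the same idea to 64 bytes at a time, folding four blocks together with pminub so that a single pcmpeqb/pmovmskb test covers all of them before the slower per-block analysis runs.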
@@ -66,6 +66,7 @@ # define STRCMP_SSE2 __strncasecmp_l_sse2 # define __GI_STRCMP __GI___strncasecmp_l #else +# define USE_AS_STRCMP # define UPDATE_STRNCMP_COUNTER # ifndef STRCMP # define STRCMP strcmp @@ -88,14 +89,22 @@ ENTRY(STRCMP) jne 1f call __init_cpu_features 1: +#ifdef USE_AS_STRCMP + leaq __strcmp_sse2_unaligned(%rip), %rax + testl $bit_Fast_Unaligned_Load, __cpu_features+CPUID_OFFSET+index_Fast_Unaligned_Load(%rip) + jnz 3f +#else + testl $bit_Slow_SSE4_2, __cpu_features+CPUID_OFFSET+index_Slow_SSE4_2(%rip) + jnz 2f leaq STRCMP_SSE42(%rip), %rax testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip) - jnz 2f - leaq STRCMP_SSSE3(%rip), %rax + jnz 3f +#endif +2: leaq STRCMP_SSSE3(%rip), %rax testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip) - jnz 2f + jnz 3f leaq STRCMP_SSE2(%rip), %rax -2: ret +3: ret END(STRCMP) # ifdef USE_AS_STRCASECMP_L @@ -109,16 +118,18 @@ ENTRY(__strcasecmp) # ifdef HAVE_AVX_SUPPORT leaq __strcasecmp_avx(%rip), %rax testl $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip) - jnz 2f + jnz 3f # endif + testl $bit_Slow_SSE4_2, __cpu_features+CPUID_OFFSET+index_Slow_SSE4_2(%rip) + jnz 2f leaq __strcasecmp_sse42(%rip), %rax testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip) - jnz 2f - leaq __strcasecmp_ssse3(%rip), %rax + jnz 3f +2: leaq __strcasecmp_ssse3(%rip), %rax testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip) - jnz 2f + jnz 3f leaq __strcasecmp_sse2(%rip), %rax -2: ret +3: ret END(__strcasecmp) weak_alias (__strcasecmp, strcasecmp) # endif @@ -133,16 +144,18 @@ ENTRY(__strncasecmp) # ifdef HAVE_AVX_SUPPORT leaq __strncasecmp_avx(%rip), %rax testl $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip) - jnz 2f + jnz 3f # endif + testl $bit_Slow_SSE4_2, __cpu_features+CPUID_OFFSET+index_Slow_SSE4_2(%rip) + jnz 2f leaq __strncasecmp_sse42(%rip), %rax testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip) - jnz 2f - leaq __strncasecmp_ssse3(%rip), %rax + jnz 3f +2: leaq __strncasecmp_ssse3(%rip), %rax testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip) - jnz 2f + jnz 3f leaq __strncasecmp_sse2(%rip), %rax -2: ret +3: ret END(__strncasecmp) weak_alias (__strncasecmp, strncasecmp) # endif diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S index cd56e5637a..be7513d480 100644 --- a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S +++ b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S @@ -1,5 +1,5 @@ /* strcpy with SSE2 and unaligned load - Copyright (C) 2011-2013 Free Software Foundation, Inc. + Copyright (C) 2011-2014 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. 
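The strcmp.S selector hunks above change dispatch order rather than any string code: plain strcmp now returns __strcmp_sse2_unaligned outright when bit_Fast_Unaligned_Load is set (Silvermont, per the init-arch.c change), and the case-insensitive entry points skip their SSE4.2 version whenever bit_Slow_SSE4_2 is set instead of trusting the SSE4.2 CPUID bit alone. (As an aside, the new tests read these flags through CPUID_OFFSET even though init-arch.c sets them in the feature array, which suggests FEATURE_OFFSET was intended. The strcpy-sse2-unaligned.S hunks that follow are comment spelling fixes only.) Schematically, in C — the real selectors are assembly, and the AVX check that precedes the SSE4.2 test for strcasecmp is omitted here:

typedef int (*cmp_fn) (const char *, const char *);

extern int __strcmp_sse2_unaligned (const char *, const char *);
extern int __strcmp_ssse3 (const char *, const char *);
extern int __strcmp_sse2 (const char *, const char *);
extern int __strcasecmp_sse42 (const char *, const char *);
extern int __strcasecmp_ssse3 (const char *, const char *);
extern int __strcasecmp_sse2 (const char *, const char *);

/* strcmp itself: prefer unaligned loads; SSE4.2 is no longer used.  */
static cmp_fn
select_strcmp (int fast_unaligned_load, int has_ssse3)
{
  if (fast_unaligned_load)
    return __strcmp_sse2_unaligned;
  if (has_ssse3)
    return __strcmp_ssse3;
  return __strcmp_sse2;
}

/* strcasecmp and friends: SSE4.2 only when present and not flagged
   slow; otherwise fall back to SSSE3, then plain SSE2.  */
static cmp_fn
select_strcasecmp (int has_sse4_2, int slow_sse4_2, int has_ssse3)
{
  if (has_sse4_2 && !slow_sse4_2)
    return __strcasecmp_sse42;
  if (has_ssse3)
    return __strcasecmp_ssse3;
  return __strcasecmp_sse2;
}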
@@ -93,7 +93,7 @@ ENTRY (STRCPY) movdqu (%rsi, %rcx), %xmm1 /* copy 16 bytes */ movdqu %xmm1, (%rdi) -/* If source adress alignment != destination adress alignment */ +/* If source address alignment != destination address alignment */ .p2align 4 L(Unalign16Both): sub %rcx, %rdi @@ -289,7 +289,7 @@ L(Unaligned64Leave): BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) # endif -/* If source adress alignment == destination adress alignment */ +/* If source address alignment == destination address alignment */ L(SourceStringAlignmentLess32): pxor %xmm0, %xmm0 diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S index 42ee00bd5c..86569ff54a 100644 --- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S +++ b/sysdeps/x86_64/multiarch/strcpy-ssse3.S @@ -1,5 +1,5 @@ /* strcpy with SSSE3 - Copyright (C) 2011-2013 Free Software Foundation, Inc. + Copyright (C) 2011-2014 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. diff --git a/sysdeps/x86_64/multiarch/strcpy.S b/sysdeps/x86_64/multiarch/strcpy.S index 919a411a9e..80ed98b30a 100644 --- a/sysdeps/x86_64/multiarch/strcpy.S +++ b/sysdeps/x86_64/multiarch/strcpy.S @@ -1,6 +1,6 @@ /* Multiple versions of strcpy All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2009-2013 Free Software Foundation, Inc. + Copyright (C) 2009-2014 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c index 9c0dcf0e8f..a9a6c8ae74 100644 --- a/sysdeps/x86_64/multiarch/strcspn-c.c +++ b/sysdeps/x86_64/multiarch/strcspn-c.c @@ -1,5 +1,5 @@ /* strcspn with SSE4.2 intrinsics - Copyright (C) 2009-2013 Free Software Foundation, Inc. + Copyright (C) 2009-2014 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. diff --git a/sysdeps/x86_64/multiarch/strcspn.S b/sysdeps/x86_64/multiarch/strcspn.S index df9616510b..24f55e9579 100644 --- a/sysdeps/x86_64/multiarch/strcspn.S +++ b/sysdeps/x86_64/multiarch/strcspn.S @@ -1,6 +1,6 @@ /* Multiple versions of strcspn All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2009-2013 Free Software Foundation, Inc. + Copyright (C) 2009-2014 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. diff --git a/sysdeps/x86_64/multiarch/strend-sse4.S b/sysdeps/x86_64/multiarch/strend-sse4.S deleted file mode 100644 index c5a7ae28a6..0000000000 --- a/sysdeps/x86_64/multiarch/strend-sse4.S +++ /dev/null @@ -1,48 +0,0 @@ -/* Return the pointer to the end of string, using SSE4.2 - Copyright (C) 2009-2013 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -#include <sysdep.h> -#include "asm-syntax.h" - - .section .text.sse4.2,"ax",@progbits -ENTRY (__strend_sse4) - pxor %xmm2, %xmm2 - movq %rdi, %rcx - andq $~15, %rdi - movdqa %xmm2, %xmm1 - pcmpeqb (%rdi), %xmm2 - orl $0xffffffff, %esi - subq %rdi, %rcx - shll %cl, %esi - pmovmskb %xmm2, %edx - andl %esi, %edx - jnz 1f - -2: pcmpistri $0x08, 16(%rdi), %xmm1 - leaq 16(%rdi), %rdi - jnz 2b - - leaq (%rdi,%rcx), %rax - ret - -1: bsfl %edx, %eax - addq %rdi, %rax - ret - -END (__strend_sse4) diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S b/sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S deleted file mode 100644 index fcef610dbc..0000000000 --- a/sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S +++ /dev/null @@ -1,555 +0,0 @@ -/* strrchr with SSE2 without bsf and bsr - Copyright (C) 2011-2013 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#if defined SHARED && !defined NOT_IN_libc - -# include <sysdep.h> -# include "asm-syntax.h" - - atom_text_section -ENTRY (__strrchr_sse2_no_bsf) - - movd %rsi, %xmm1 - pxor %xmm2, %xmm2 - mov %rdi, %rcx - punpcklbw %xmm1, %xmm1 - punpcklbw %xmm1, %xmm1 - /* ECX has OFFSET. */ - and $63, %rcx - cmp $48, %rcx - pshufd $0, %xmm1, %xmm1 - ja L(crosscache) - -/* unaligned string. */ - movdqu (%rdi), %xmm0 - pcmpeqb %xmm0, %xmm2 - pcmpeqb %xmm1, %xmm0 - /* Find where NULL is. */ - pmovmskb %xmm2, %rcx - /* Check if there is a match. */ - pmovmskb %xmm0, %rax - add $16, %rdi - - test %rax, %rax - jnz L(unaligned_match1) - - test %rcx, %rcx - jnz L(return_null) - - and $-16, %rdi - xor %r8, %r8 - jmp L(loop) - - .p2align 4 -L(unaligned_match1): - test %rcx, %rcx - jnz L(prolog_find_zero_1) - - mov %rax, %r8 - mov %rdi, %rsi - and $-16, %rdi - jmp L(loop) - - .p2align 4 -L(crosscache): -/* Hancle unaligned string. */ - and $15, %rcx - and $-16, %rdi - pxor %xmm3, %xmm3 - movdqa (%rdi), %xmm0 - pcmpeqb %xmm0, %xmm3 - pcmpeqb %xmm1, %xmm0 - /* Find where NULL is. */ - pmovmskb %xmm3, %rdx - /* Check if there is a match. */ - pmovmskb %xmm0, %rax - /* Remove the leading bytes. */ - shr %cl, %rdx - shr %cl, %rax - add $16, %rdi - - test %rax, %rax - jnz L(unaligned_match) - - test %rdx, %rdx - jnz L(return_null) - - xor %r8, %r8 - jmp L(loop) - - .p2align 4 -L(unaligned_match): - test %rdx, %rdx - jnz L(prolog_find_zero) - - mov %rax, %r8 - lea (%rdi, %rcx), %rsi - -/* Loop start on aligned string. 
*/ - .p2align 4 -L(loop): - movdqa (%rdi), %xmm0 - pcmpeqb %xmm0, %xmm2 - add $16, %rdi - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm2, %rcx - pmovmskb %xmm0, %rax - or %rax, %rcx - jnz L(matches) - - movdqa (%rdi), %xmm0 - pcmpeqb %xmm0, %xmm2 - add $16, %rdi - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm2, %rcx - pmovmskb %xmm0, %rax - or %rax, %rcx - jnz L(matches) - - movdqa (%rdi), %xmm0 - pcmpeqb %xmm0, %xmm2 - add $16, %rdi - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm2, %rcx - pmovmskb %xmm0, %rax - or %rax, %rcx - jnz L(matches) - - movdqa (%rdi), %xmm0 - pcmpeqb %xmm0, %xmm2 - add $16, %rdi - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm2, %rcx - pmovmskb %xmm0, %rax - or %rax, %rcx - jz L(loop) - -L(matches): - test %rax, %rax - jnz L(match) -L(return_value): - test %r8, %r8 - jz L(return_null) - mov %r8, %rax - mov %rsi, %rdi - jmp L(match_exit) - - .p2align 4 -L(match): - pmovmskb %xmm2, %rcx - test %rcx, %rcx - jnz L(find_zero) - mov %rax, %r8 - mov %rdi, %rsi - jmp L(loop) - - .p2align 4 -L(find_zero): - test %cl, %cl - jz L(find_zero_high) - mov %cl, %dl - and $15, %dl - jz L(find_zero_8) - test $0x01, %cl - jnz L(FindZeroExit1) - test $0x02, %cl - jnz L(FindZeroExit2) - test $0x04, %cl - jnz L(FindZeroExit3) - and $1 << 4 - 1, %rax - jz L(return_value) - jmp L(match_exit) - - .p2align 4 -L(find_zero_8): - test $0x10, %cl - jnz L(FindZeroExit5) - test $0x20, %cl - jnz L(FindZeroExit6) - test $0x40, %cl - jnz L(FindZeroExit7) - and $1 << 8 - 1, %rax - jz L(return_value) - jmp L(match_exit) - - .p2align 4 -L(find_zero_high): - mov %ch, %dh - and $15, %dh - jz L(find_zero_high_8) - test $0x01, %ch - jnz L(FindZeroExit9) - test $0x02, %ch - jnz L(FindZeroExit10) - test $0x04, %ch - jnz L(FindZeroExit11) - and $1 << 12 - 1, %rax - jz L(return_value) - jmp L(match_exit) - - .p2align 4 -L(find_zero_high_8): - test $0x10, %ch - jnz L(FindZeroExit13) - test $0x20, %ch - jnz L(FindZeroExit14) - test $0x40, %ch - jnz L(FindZeroExit15) - and $1 << 16 - 1, %rax - jz L(return_value) - jmp L(match_exit) - - .p2align 4 -L(FindZeroExit1): - and $1, %rax - jz L(return_value) - jmp L(match_exit) - - .p2align 4 -L(FindZeroExit2): - and $1 << 2 - 1, %rax - jz L(return_value) - jmp L(match_exit) - - .p2align 4 -L(FindZeroExit3): - and $1 << 3 - 1, %rax - jz L(return_value) - jmp L(match_exit) - - .p2align 4 -L(FindZeroExit5): - and $1 << 5 - 1, %rax - jz L(return_value) - jmp L(match_exit) - - .p2align 4 -L(FindZeroExit6): - and $1 << 6 - 1, %rax - jz L(return_value) - jmp L(match_exit) - - .p2align 4 -L(FindZeroExit7): - and $1 << 7 - 1, %rax - jz L(return_value) - jmp L(match_exit) - - .p2align 4 -L(FindZeroExit9): - and $1 << 9 - 1, %rax - jz L(return_value) - jmp L(match_exit) - - .p2align 4 -L(FindZeroExit10): - and $1 << 10 - 1, %rax - jz L(return_value) - jmp L(match_exit) - - .p2align 4 -L(FindZeroExit11): - and $1 << 11 - 1, %rax - jz L(return_value) - jmp L(match_exit) - - .p2align 4 -L(FindZeroExit13): - and $1 << 13 - 1, %rax - jz L(return_value) - jmp L(match_exit) - - .p2align 4 -L(FindZeroExit14): - and $1 << 14 - 1, %rax - jz L(return_value) - jmp L(match_exit) - - .p2align 4 -L(FindZeroExit15): - and $1 << 15 - 1, %rax - jz L(return_value) - - .p2align 4 -L(match_exit): - test %ah, %ah - jnz L(match_exit_high) - mov %al, %dl - and $15 << 4, %dl - jnz L(match_exit_8) - test $0x08, %al - jnz L(Exit4) - test $0x04, %al - jnz L(Exit3) - test $0x02, %al - jnz L(Exit2) - lea -16(%rdi), %rax - ret - - .p2align 4 -L(match_exit_8): - test $0x80, %al - jnz L(Exit8) - test $0x40, %al - jnz L(Exit7) - test $0x20, %al - 
jnz L(Exit6) - lea -12(%rdi), %rax - ret - - .p2align 4 -L(match_exit_high): - mov %ah, %dh - and $15 << 4, %dh - jnz L(match_exit_high_8) - test $0x08, %ah - jnz L(Exit12) - test $0x04, %ah - jnz L(Exit11) - test $0x02, %ah - jnz L(Exit10) - lea -8(%rdi), %rax - ret - - .p2align 4 -L(match_exit_high_8): - test $0x80, %ah - jnz L(Exit16) - test $0x40, %ah - jnz L(Exit15) - test $0x20, %ah - jnz L(Exit14) - lea -4(%rdi), %rax - ret - - .p2align 4 -L(Exit2): - lea -15(%rdi), %rax - ret - - .p2align 4 -L(Exit3): - lea -14(%rdi), %rax - ret - - .p2align 4 -L(Exit4): - lea -13(%rdi), %rax - ret - - .p2align 4 -L(Exit6): - lea -11(%rdi), %rax - ret - - .p2align 4 -L(Exit7): - lea -10(%rdi), %rax - ret - - .p2align 4 -L(Exit8): - lea -9(%rdi), %rax - ret - - .p2align 4 -L(Exit10): - lea -7(%rdi), %rax - ret - - .p2align 4 -L(Exit11): - lea -6(%rdi), %rax - ret - - .p2align 4 -L(Exit12): - lea -5(%rdi), %rax - ret - - .p2align 4 -L(Exit14): - lea -3(%rdi), %rax - ret - - .p2align 4 -L(Exit15): - lea -2(%rdi), %rax - ret - - .p2align 4 -L(Exit16): - lea -1(%rdi), %rax - ret - -/* Return NULL. */ - .p2align 4 -L(return_null): - xor %rax, %rax - ret - - .p2align 4 -L(prolog_find_zero): - add %rcx, %rdi - mov %rdx, %rcx -L(prolog_find_zero_1): - test %cl, %cl - jz L(prolog_find_zero_high) - mov %cl, %dl - and $15, %dl - jz L(prolog_find_zero_8) - test $0x01, %cl - jnz L(PrologFindZeroExit1) - test $0x02, %cl - jnz L(PrologFindZeroExit2) - test $0x04, %cl - jnz L(PrologFindZeroExit3) - and $1 << 4 - 1, %rax - jnz L(match_exit) - xor %rax, %rax - ret - - .p2align 4 -L(prolog_find_zero_8): - test $0x10, %cl - jnz L(PrologFindZeroExit5) - test $0x20, %cl - jnz L(PrologFindZeroExit6) - test $0x40, %cl - jnz L(PrologFindZeroExit7) - and $1 << 8 - 1, %rax - jnz L(match_exit) - xor %rax, %rax - ret - - .p2align 4 -L(prolog_find_zero_high): - mov %ch, %dh - and $15, %dh - jz L(prolog_find_zero_high_8) - test $0x01, %ch - jnz L(PrologFindZeroExit9) - test $0x02, %ch - jnz L(PrologFindZeroExit10) - test $0x04, %ch - jnz L(PrologFindZeroExit11) - and $1 << 12 - 1, %rax - jnz L(match_exit) - xor %rax, %rax - ret - - .p2align 4 -L(prolog_find_zero_high_8): - test $0x10, %ch - jnz L(PrologFindZeroExit13) - test $0x20, %ch - jnz L(PrologFindZeroExit14) - test $0x40, %ch - jnz L(PrologFindZeroExit15) - and $1 << 16 - 1, %rax - jnz L(match_exit) - xor %rax, %rax - ret - - .p2align 4 -L(PrologFindZeroExit1): - and $1, %rax - jnz L(match_exit) - xor %rax, %rax - ret - - .p2align 4 -L(PrologFindZeroExit2): - and $1 << 2 - 1, %rax - jnz L(match_exit) - xor %rax, %rax - ret - - .p2align 4 -L(PrologFindZeroExit3): - and $1 << 3 - 1, %rax - jnz L(match_exit) - xor %rax, %rax - ret - - .p2align 4 -L(PrologFindZeroExit5): - and $1 << 5 - 1, %rax - jnz L(match_exit) - xor %rax, %rax - ret - - .p2align 4 -L(PrologFindZeroExit6): - and $1 << 6 - 1, %rax - jnz L(match_exit) - xor %rax, %rax - ret - - .p2align 4 -L(PrologFindZeroExit7): - and $1 << 7 - 1, %rax - jnz L(match_exit) - xor %rax, %rax - ret - - .p2align 4 -L(PrologFindZeroExit9): - and $1 << 9 - 1, %rax - jnz L(match_exit) - xor %rax, %rax - ret - - .p2align 4 -L(PrologFindZeroExit10): - and $1 << 10 - 1, %rax - jnz L(match_exit) - xor %rax, %rax - ret - - .p2align 4 -L(PrologFindZeroExit11): - and $1 << 11 - 1, %rax - jnz L(match_exit) - xor %rax, %rax - ret - - .p2align 4 -L(PrologFindZeroExit13): - and $1 << 13 - 1, %rax - jnz L(match_exit) - xor %rax, %rax - ret - - .p2align 4 -L(PrologFindZeroExit14): - and $1 << 14 - 1, %rax - jnz L(match_exit) - xor %rax, %rax - 
ret - - .p2align 4 -L(PrologFindZeroExit15): - and $1 << 15 - 1, %rax - jnz L(match_exit) - xor %rax, %rax - ret - -END (__strrchr_sse2_no_bsf) -#endif diff --git a/sysdeps/x86_64/multiarch/strrchr.S b/sysdeps/x86_64/multiarch/strrchr.S deleted file mode 100644 index ee6af6e9dd..0000000000 --- a/sysdeps/x86_64/multiarch/strrchr.S +++ /dev/null @@ -1,286 +0,0 @@ -/* Multiple versions of strrchr - All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2009-2013 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - -/* Define multiple versions only for the definition in libc and for - the DSO. In static binaries we need strrchr before the initialization - happened. */ -#if defined SHARED && !defined NOT_IN_libc - .text -ENTRY(strrchr) - .type strrchr, @gnu_indirect_function - cmpl $0, __cpu_features+KIND_OFFSET(%rip) - jne 1f - call __init_cpu_features -1: leaq __strrchr_sse2(%rip), %rax - testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip) - jz 2f - leaq __strrchr_sse42(%rip), %rax - ret -2: testl $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip) - jz 3f - leaq __strrchr_sse2_no_bsf(%rip), %rax -3: ret -END(strrchr) - -/* - This implementation uses SSE4 instructions to compare up to 16 bytes - at a time looking for the last occurrence of the character c in the - string s: - - char *strrchr (const char *s, int c); - - We use 0x4a: - _SIDD_SBYTE_OPS - | _SIDD_CMP_EQUAL_EACH - | _SIDD_MOST_SIGNIFICANT - on pcmpistri to compare xmm/mem128 - - 0 1 2 3 4 5 6 7 8 9 A B C D E F - X X X X X X X X X X X X X X X X - - against xmm - - 0 1 2 3 4 5 6 7 8 9 A B C D E F - C C C C C C C C C C C C C C C C - - to find out if the first 16byte data element has a byte C and the - last offset. There are 4 cases: - - 1. The first 16byte data element has EOS and has the byte C at the - last offset X. - 2. The first 16byte data element is valid and has the byte C at the - last offset X. - 3. The first 16byte data element has EOS and doesn't have the byte C. - 4. The first 16byte data element is valid and doesn't have the byte C. - - Here is the table of ECX, CFlag, ZFlag and SFlag for 3 cases: - - case ECX CFlag ZFlag SFlag - 1 X 1 1 0 - 2 X 1 0 0 - 3 16 0 1 0 - 4 16 0 0 0 - - We exit from the loop for cases 1 and 3 with jz which branches - when ZFlag is 1. If CFlag == 1, ECX has the offset X for case 1. */ - - - .section .text.sse4.2,"ax",@progbits - .align 16 - .type __strrchr_sse42, @function - .globl __strrchr_sse42 - .hidden __strrchr_sse42 -__strrchr_sse42: - cfi_startproc - CALL_MCOUNT - testb %sil, %sil - je __strend_sse4 - xor %eax,%eax /* RAX has the last occurrence of s. 
*/ - movd %esi, %xmm1 - punpcklbw %xmm1, %xmm1 - movl %edi, %esi - punpcklbw %xmm1, %xmm1 - andl $15, %esi - pshufd $0, %xmm1, %xmm1 - movq %rdi, %r8 - je L(loop) - -/* Handle unaligned string using psrldq. */ - leaq L(psrldq_table)(%rip), %rdx - andq $-16, %r8 - movslq (%rdx,%rsi,4),%r9 - movdqa (%r8), %xmm0 - addq %rdx, %r9 - jmp *%r9 - -/* Handle unaligned string with offset 1 using psrldq. */ - .p2align 4 -L(psrldq_1): - psrldq $1, %xmm0 - - .p2align 4 -L(unaligned_pcmpistri): - pcmpistri $0x4a, %xmm1, %xmm0 - jnc L(unaligned_no_byte) - leaq (%rdi,%rcx), %rax -L(unaligned_no_byte): - /* Find the length of the unaligned string. */ - pcmpistri $0x3a, %xmm0, %xmm0 - movl $16, %edx - subl %esi, %edx - cmpl %ecx, %edx - /* Return RAX if the unaligned fragment to next 16B already - contain the NULL terminator. */ - jg L(exit) - addq $16, %r8 - -/* Loop start on aligned string. */ - .p2align 4 -L(loop): - pcmpistri $0x4a, (%r8), %xmm1 - jbe L(match_or_eos) - addq $16, %r8 - jmp L(loop) - .p2align 4 -L(match_or_eos): - je L(had_eos) -L(match_no_eos): - leaq (%r8,%rcx), %rax - addq $16, %r8 - jmp L(loop) - .p2align 4 -L(had_eos): - jnc L(exit) - leaq (%r8,%rcx), %rax - .p2align 4 -L(exit): - ret - -/* Handle unaligned string with offset 15 using psrldq. */ - .p2align 4 -L(psrldq_15): - psrldq $15, %xmm0 - jmp L(unaligned_pcmpistri) - -/* Handle unaligned string with offset 14 using psrldq. */ - .p2align 4 -L(psrldq_14): - psrldq $14, %xmm0 - jmp L(unaligned_pcmpistri) - -/* Handle unaligned string with offset 13 using psrldq. */ - .p2align 4 -L(psrldq_13): - psrldq $13, %xmm0 - jmp L(unaligned_pcmpistri) - -/* Handle unaligned string with offset 12 using psrldq. */ - .p2align 4 -L(psrldq_12): - psrldq $12, %xmm0 - jmp L(unaligned_pcmpistri) - -/* Handle unaligned string with offset 11 using psrldq. */ - .p2align 4 -L(psrldq_11): - psrldq $11, %xmm0 - jmp L(unaligned_pcmpistri) - -/* Handle unaligned string with offset 10 using psrldq. */ - .p2align 4 -L(psrldq_10): - psrldq $10, %xmm0 - jmp L(unaligned_pcmpistri) - -/* Handle unaligned string with offset 9 using psrldq. */ - .p2align 4 -L(psrldq_9): - psrldq $9, %xmm0 - jmp L(unaligned_pcmpistri) - -/* Handle unaligned string with offset 8 using psrldq. */ - .p2align 4 -L(psrldq_8): - psrldq $8, %xmm0 - jmp L(unaligned_pcmpistri) - -/* Handle unaligned string with offset 7 using psrldq. */ - .p2align 4 -L(psrldq_7): - psrldq $7, %xmm0 - jmp L(unaligned_pcmpistri) - -/* Handle unaligned string with offset 6 using psrldq. */ - .p2align 4 -L(psrldq_6): - psrldq $6, %xmm0 - jmp L(unaligned_pcmpistri) - -/* Handle unaligned string with offset 5 using psrldq. */ - .p2align 4 -L(psrldq_5): - psrldq $5, %xmm0 - jmp L(unaligned_pcmpistri) - -/* Handle unaligned string with offset 4 using psrldq. */ - .p2align 4 -L(psrldq_4): - psrldq $4, %xmm0 - jmp L(unaligned_pcmpistri) - -/* Handle unaligned string with offset 3 using psrldq. */ - .p2align 4 -L(psrldq_3): - psrldq $3, %xmm0 - jmp L(unaligned_pcmpistri) - -/* Handle unaligned string with offset 2 using psrldq. 
*/ - .p2align 4 -L(psrldq_2): - psrldq $2, %xmm0 - jmp L(unaligned_pcmpistri) - - cfi_endproc - .size __strrchr_sse42, .-__strrchr_sse42 - - .section .rodata.sse4.2,"a",@progbits - .p2align 4 -L(psrldq_table): - .int L(loop) - L(psrldq_table) - .int L(psrldq_1) - L(psrldq_table) - .int L(psrldq_2) - L(psrldq_table) - .int L(psrldq_3) - L(psrldq_table) - .int L(psrldq_4) - L(psrldq_table) - .int L(psrldq_5) - L(psrldq_table) - .int L(psrldq_6) - L(psrldq_table) - .int L(psrldq_7) - L(psrldq_table) - .int L(psrldq_8) - L(psrldq_table) - .int L(psrldq_9) - L(psrldq_table) - .int L(psrldq_10) - L(psrldq_table) - .int L(psrldq_11) - L(psrldq_table) - .int L(psrldq_12) - L(psrldq_table) - .int L(psrldq_13) - L(psrldq_table) - .int L(psrldq_14) - L(psrldq_table) - .int L(psrldq_15) - L(psrldq_table) - - -# undef ENTRY -# define ENTRY(name) \ - .type __strrchr_sse2, @function; \ - .align 16; \ - .globl __strrchr_sse2; \ - .hidden __strrchr_sse2; \ - __strrchr_sse2: cfi_startproc; \ - CALL_MCOUNT -# undef END -# define END(name) \ - cfi_endproc; .size __strrchr_sse2, .-__strrchr_sse2 -# undef libc_hidden_builtin_def -/* It doesn't make sense to send libc-internal strrchr calls through a PLT. - The speedup we get from using SSE4.2 instruction is likely eaten away - by the indirect call in the PLT. */ -# define libc_hidden_builtin_def(name) \ - .globl __GI_strrchr; __GI_strrchr = __strrchr_sse2 -#endif - -#include "../strrchr.S" diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c index 8128cb9769..8d19e5ca36 100644 --- a/sysdeps/x86_64/multiarch/strspn-c.c +++ b/sysdeps/x86_64/multiarch/strspn-c.c @@ -1,5 +1,5 @@ /* strspn with SSE4.2 intrinsics - Copyright (C) 2009-2013 Free Software Foundation, Inc. + Copyright (C) 2009-2014 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. diff --git a/sysdeps/x86_64/multiarch/strspn.S b/sysdeps/x86_64/multiarch/strspn.S index 79fbf3c574..bf7308eade 100644 --- a/sysdeps/x86_64/multiarch/strspn.S +++ b/sysdeps/x86_64/multiarch/strspn.S @@ -1,6 +1,6 @@ /* Multiple versions of strspn All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2009-2013 Free Software Foundation, Inc. + Copyright (C) 2009-2014 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. diff --git a/sysdeps/x86_64/multiarch/strstr-c.c b/sysdeps/x86_64/multiarch/strstr-c.c deleted file mode 100644 index 42bbe48172..0000000000 --- a/sysdeps/x86_64/multiarch/strstr-c.c +++ /dev/null @@ -1,47 +0,0 @@ -/* Multiple versions of strstr. - All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2012-2013 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -/* Redefine strstr so that the compiler won't complain about the type - mismatch with the IFUNC selector in strong_alias, below. */ -#undef strstr -#define strstr __redirect_strstr -#include <string.h> -#undef strstr - -#define STRSTR __strstr_sse2 -#ifdef SHARED -# undef libc_hidden_builtin_def -# define libc_hidden_builtin_def(name) \ - __hidden_ver1 (__strstr_sse2, __GI_strstr, __strstr_sse2); -#endif - -#include "string/strstr.c" - -extern __typeof (__redirect_strstr) __strstr_sse42 attribute_hidden; -extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden; - -#include "init-arch.h" - -/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle - ifunc symbol properly. */ -extern __typeof (__redirect_strstr) __libc_strstr; -libc_ifunc (__libc_strstr, HAS_SSE4_2 ? __strstr_sse42 : __strstr_sse2) - -#undef strstr -strong_alias (__libc_strstr, strstr) diff --git a/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S new file mode 100644 index 0000000000..5b8009c733 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S @@ -0,0 +1,374 @@ +/* strstr with unaligned loads + Copyright (C) 2009-2014 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> + +ENTRY(__strstr_sse2_unaligned) + movzbl (%rsi), %eax + testb %al, %al + je L(empty) + movzbl 1(%rsi), %edx + testb %dl, %dl + je L(strchr) + movd %eax, %xmm1 + movd %edx, %xmm2 + movq %rdi, %rax + andl $4095, %eax + punpcklbw %xmm1, %xmm1 + cmpq $4031, %rax + punpcklbw %xmm2, %xmm2 + punpcklwd %xmm1, %xmm1 + punpcklwd %xmm2, %xmm2 + pshufd $0, %xmm1, %xmm1 + pshufd $0, %xmm2, %xmm2 + ja L(cross_page) + movdqu (%rdi), %xmm3 + pxor %xmm5, %xmm5 + movdqu 1(%rdi), %xmm4 + movdqa %xmm3, %xmm6 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm2, %xmm4 + movdqu 16(%rdi), %xmm0 + pcmpeqb %xmm5, %xmm6 + pminub %xmm4, %xmm3 + movdqa %xmm3, %xmm4 + movdqu 17(%rdi), %xmm3 + pcmpeqb %xmm0, %xmm5 + pcmpeqb %xmm2, %xmm3 + por %xmm6, %xmm4 + pcmpeqb %xmm1, %xmm0 + pminub %xmm3, %xmm0 + por %xmm5, %xmm0 + pmovmskb %xmm4, %r8d + pmovmskb %xmm0, %eax + salq $16, %rax + orq %rax, %r8 + je L(next_32_bytes) +L(next_pair_index): + bsf %r8, %rax + addq %rdi, %rax + cmpb $0, (%rax) + je L(zero1) + movzbl 2(%rsi), %edx + testb %dl, %dl + je L(found1) + cmpb 2(%rax), %dl + jne L(next_pair) + xorl %edx, %edx + jmp L(pair_loop_start) + + .p2align 4 +L(strchr): + movzbl %al, %esi + jmp __strchr_sse2 + + .p2align 4 +L(pair_loop): + addq $1, %rdx + cmpb 2(%rax,%rdx), %cl + jne L(next_pair) +L(pair_loop_start): + movzbl 3(%rsi,%rdx), %ecx + testb %cl, %cl + jne L(pair_loop) +L(found1): + ret +L(zero1): + xorl %eax, %eax + ret + + .p2align 4 +L(next_pair): + leaq -1(%r8), %rax + andq %rax, %r8 + jne L(next_pair_index) + + .p2align 4 +L(next_32_bytes): + movdqu 32(%rdi), %xmm3 + pxor %xmm5, %xmm5 + movdqu 33(%rdi), %xmm4 + movdqa %xmm3, %xmm6 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm2, %xmm4 + movdqu 48(%rdi), %xmm0 + pcmpeqb %xmm5, %xmm6 + pminub %xmm4, %xmm3 + movdqa %xmm3, %xmm4 + movdqu 49(%rdi), %xmm3 + pcmpeqb %xmm0, %xmm5 + pcmpeqb %xmm2, %xmm3 + por %xmm6, %xmm4 + pcmpeqb %xmm1, %xmm0 + pminub %xmm3, %xmm0 + por %xmm5, %xmm0 + pmovmskb %xmm4, %eax + salq $32, %rax + pmovmskb %xmm0, %r8d + salq $48, %r8 + orq %rax, %r8 + je L(loop_header) +L(next_pair2_index): + bsfq %r8, %rax + addq %rdi, %rax + cmpb $0, (%rax) + je L(zero2) + movzbl 2(%rsi), %edx + testb %dl, %dl + je L(found2) + cmpb 2(%rax), %dl + jne L(next_pair2) + xorl %edx, %edx + jmp L(pair_loop2_start) + + .p2align 4 +L(pair_loop2): + addq $1, %rdx + cmpb 2(%rax,%rdx), %cl + jne L(next_pair2) +L(pair_loop2_start): + movzbl 3(%rsi,%rdx), %ecx + testb %cl, %cl + jne L(pair_loop2) +L(found2): + ret + L(zero2): + xorl %eax, %eax + ret +L(empty): + mov %rdi, %rax + ret + + .p2align 4 +L(next_pair2): + leaq -1(%r8), %rax + andq %rax, %r8 + jne L(next_pair2_index) +L(loop_header): + movq $-512, %r11 + movq %rdi, %r9 + + pxor %xmm7, %xmm7 + andq $-64, %rdi + + .p2align 4 +L(loop): + movdqa 64(%rdi), %xmm3 + movdqu 63(%rdi), %xmm6 + movdqa %xmm3, %xmm0 + pxor %xmm2, %xmm3 + pxor %xmm1, %xmm6 + movdqa 80(%rdi), %xmm10 + por %xmm3, %xmm6 + pminub %xmm10, %xmm0 + movdqu 79(%rdi), %xmm3 + pxor %xmm2, %xmm10 + pxor %xmm1, %xmm3 + movdqa 96(%rdi), %xmm9 + por %xmm10, %xmm3 + pminub %xmm9, %xmm0 + pxor %xmm2, %xmm9 + movdqa 112(%rdi), %xmm8 + addq $64, %rdi + pminub %xmm6, %xmm3 + movdqu 31(%rdi), %xmm4 + pminub %xmm8, %xmm0 + pxor %xmm2, %xmm8 + pxor %xmm1, %xmm4 + por %xmm9, %xmm4 + pminub %xmm4, %xmm3 + movdqu 47(%rdi), %xmm5 + pxor %xmm1, %xmm5 + por %xmm8, %xmm5 + pminub %xmm5, %xmm3 + pminub %xmm3, %xmm0 + pcmpeqb %xmm7, %xmm0 + pmovmskb %xmm0, %eax + testl %eax, %eax + je L(loop) + pminub (%rdi), %xmm6 + pminub 32(%rdi),%xmm4 + pminub 48(%rdi),%xmm5 + 
pcmpeqb %xmm7, %xmm6 + pcmpeqb %xmm7, %xmm5 + pmovmskb %xmm6, %edx + movdqa 16(%rdi), %xmm8 + pcmpeqb %xmm7, %xmm4 + movdqu 15(%rdi), %xmm0 + pmovmskb %xmm5, %r8d + movdqa %xmm8, %xmm3 + pmovmskb %xmm4, %ecx + pcmpeqb %xmm1,%xmm0 + pcmpeqb %xmm2,%xmm3 + salq $32, %rcx + pcmpeqb %xmm7,%xmm8 + salq $48, %r8 + pminub %xmm0,%xmm3 + orq %rcx, %rdx + por %xmm3,%xmm8 + orq %rdx, %r8 + pmovmskb %xmm8, %eax + salq $16, %rax + orq %rax, %r8 + je L(loop) +L(next_pair_index3): + bsfq %r8, %rcx + addq %rdi, %rcx + cmpb $0, (%rcx) + je L(zero) + xorl %eax, %eax + movzbl 2(%rsi), %edx + testb %dl, %dl + je L(success3) + cmpb 1(%rcx), %dl + jne L(next_pair3) + jmp L(pair_loop_start3) + + .p2align 4 +L(pair_loop3): + addq $1, %rax + cmpb 1(%rcx,%rax), %dl + jne L(next_pair3) +L(pair_loop_start3): + movzbl 3(%rsi,%rax), %edx + testb %dl, %dl + jne L(pair_loop3) +L(success3): + lea -1(%rcx), %rax + ret + + .p2align 4 +L(next_pair3): + addq %rax, %r11 + movq %rdi, %rax + subq %r9, %rax + cmpq %r11, %rax + jl L(switch_strstr) + leaq -1(%r8), %rax + andq %rax, %r8 + jne L(next_pair_index3) + jmp L(loop) + + .p2align 4 +L(switch_strstr): + movq %rdi, %rdi + jmp __strstr_sse2 + + .p2align 4 +L(cross_page): + + movq %rdi, %rax + pxor %xmm0, %xmm0 + andq $-64, %rax + movdqa (%rax), %xmm3 + movdqu -1(%rax), %xmm4 + movdqa %xmm3, %xmm8 + movdqa 16(%rax), %xmm5 + pcmpeqb %xmm1, %xmm4 + pcmpeqb %xmm0, %xmm8 + pcmpeqb %xmm2, %xmm3 + movdqa %xmm5, %xmm7 + pminub %xmm4, %xmm3 + movdqu 15(%rax), %xmm4 + pcmpeqb %xmm0, %xmm7 + por %xmm3, %xmm8 + movdqa %xmm5, %xmm3 + movdqa 32(%rax), %xmm5 + pcmpeqb %xmm1, %xmm4 + pcmpeqb %xmm2, %xmm3 + movdqa %xmm5, %xmm6 + pmovmskb %xmm8, %ecx + pminub %xmm4, %xmm3 + movdqu 31(%rax), %xmm4 + por %xmm3, %xmm7 + movdqa %xmm5, %xmm3 + pcmpeqb %xmm0, %xmm6 + movdqa 48(%rax), %xmm5 + pcmpeqb %xmm1, %xmm4 + pmovmskb %xmm7, %r8d + pcmpeqb %xmm2, %xmm3 + pcmpeqb %xmm5, %xmm0 + pminub %xmm4, %xmm3 + movdqu 47(%rax), %xmm4 + por %xmm3, %xmm6 + movdqa %xmm5, %xmm3 + salq $16, %r8 + pcmpeqb %xmm1, %xmm4 + pcmpeqb %xmm2, %xmm3 + pmovmskb %xmm6, %r10d + pminub %xmm4, %xmm3 + por %xmm3, %xmm0 + salq $32, %r10 + orq %r10, %r8 + orq %rcx, %r8 + movl %edi, %ecx + pmovmskb %xmm0, %edx + subl %eax, %ecx + salq $48, %rdx + orq %rdx, %r8 + shrq %cl, %r8 + je L(loop_header) +L(next_pair_index4): + bsfq %r8, %rax + addq %rdi, %rax + cmpb $0, (%rax) + je L(zero) + + cmpq %rax,%rdi + je L(next_pair4) + + movzbl 2(%rsi), %edx + testb %dl, %dl + je L(found3) + cmpb 1(%rax), %dl + jne L(next_pair4) + xorl %edx, %edx + jmp L(pair_loop_start4) + + .p2align 4 +L(pair_loop4): + addq $1, %rdx + cmpb 1(%rax,%rdx), %cl + jne L(next_pair4) +L(pair_loop_start4): + movzbl 3(%rsi,%rdx), %ecx + testb %cl, %cl + jne L(pair_loop4) +L(found3): + subq $1, %rax + ret + + .p2align 4 +L(next_pair4): + leaq -1(%r8), %rax + andq %rax, %r8 + jne L(next_pair_index4) + jmp L(loop_header) + + .p2align 4 +L(found): + rep + ret + + .p2align 4 +L(zero): + xorl %eax, %eax + ret + + +END(__strstr_sse2_unaligned) diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c index cd63b68c01..b41374d754 100644 --- a/sysdeps/x86_64/multiarch/strstr.c +++ b/sysdeps/x86_64/multiarch/strstr.c @@ -1,6 +1,6 @@ -/* strstr with SSE4.2 intrinsics - Copyright (C) 2009-2013 Free Software Foundation, Inc. - Contributed by Intel Corporation. +/* Multiple versions of strstr. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2012-2014 Free Software Foundation, Inc. This file is part of the GNU C Library. 
The GNU C Library is free software; you can redistribute it and/or @@ -17,369 +17,31 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#include <nmmintrin.h> -#include "varshift.h" - -#ifndef STRSTR_SSE42 -# define STRSTR_SSE42 __strstr_sse42 -#endif - -#ifdef USE_AS_STRCASESTR -# include <ctype.h> -# include <locale/localeinfo.h> - -# define LOADBYTE(C) tolower (C) -# define CMPBYTE(C1, C2) (tolower (C1) == tolower (C2)) -#else -# define LOADBYTE(C) (C) -# define CMPBYTE(C1, C2) ((C1) == (C2)) -#endif - -/* We use 0xe ordered-compare: - _SIDD_SBYTE_OPS - | _SIDD_CMP_EQUAL_ORDER - | _SIDD_LEAST_SIGNIFICANT - on pcmpistri to do the scanning and string comparsion requirements of - sub-string match. In the scanning phase, we process Cflag and ECX - index to locate the first fragment match; once the first fragment - match position has been identified, we do comparison of subsequent - string fragments until we can conclude false or true match; whe - n concluding a false match, we may need to repeat scanning process - from next relevant offset in the target string. - - In the scanning phase we have 4 cases: - case ECX CFlag ZFlag SFlag - 1 16 0 0 0 - 2a 16 0 0 1 - 2b 16 0 1 0 - 2c 16 0 1 1 - - 1. No ordered-comparison match, both 16B fragments are valid, so - continue to next fragment. - 2. No ordered-comparison match, there is EOS in either fragment, - 2a. Zflg = 0, Sflg = 1, we continue - 2b. Zflg = 1, Sflg = 0, we conclude no match and return. - 2c. Zflg = 1, sflg = 1, lenth determine match or no match - - In the string comparison phase, the 1st fragment match is fixed up - to produce ECX = 0. Subsequent fragment compare of nonzero index - and no match conclude a false match. - - case ECX CFlag ZFlag SFlag - 3 X 1 0 0/1 - 4a 0 1 0 0 - 4b 0 1 0 1 - 4c 0 < X 1 0 0/1 - 5 16 0 1 0 - - 3. An initial ordered-comparison fragment match, we fix up to do - subsequent string comparison - 4a. Continuation of fragment comparison of a string compare. - 4b. EOS reached in the reference string, we conclude true match and - return - 4c. String compare failed if index is nonzero, we need to go back to - scanning - 5. failed string compare, go back to scanning - */ - -#if !(defined USE_AS_STRCASESTR && defined STRCASESTR_NONASCII) -/* Simple replacement of movdqu to address 4KB boundary cross issue. - If EOS occurs within less than 16B before 4KB boundary, we don't - cross to next page. */ -static __m128i -__m128i_strloadu (const unsigned char * p, __m128i zero) -{ - if (__builtin_expect ((int) ((size_t) p & 0xfff) > 0xff0, 0)) - { - size_t offset = ((size_t) p & (16 - 1)); - __m128i a = _mm_load_si128 ((__m128i *) (p - offset)); - int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (a, zero)); - if ((bmsk >> offset) != 0) - return __m128i_shift_right (a, offset); - } - return _mm_loadu_si128 ((__m128i *) p); -} -#endif - -#if defined USE_AS_STRCASESTR && !defined STRCASESTR_NONASCII - -/* Similar to __m128i_strloadu. Convert to lower case for POSIX/C - locale and other which have single-byte letters only in the ASCII - range. */ -static __m128i -__m128i_strloadu_tolower (const unsigned char *p, __m128i zero, __m128i uclow, - __m128i uchigh, __m128i lcqword) -{ - __m128i frag = __m128i_strloadu (p, zero); - - /* Compare if 'Z' > bytes. Inverted way to get a mask for byte <= 'Z'. */ - __m128i r2 = _mm_cmpgt_epi8 (uchigh, frag); - /* Compare if bytes are > 'A' - 1. */ - __m128i r1 = _mm_cmpgt_epi8 (frag, uclow); - /* Mask byte == ff if byte(r2) <= 'Z' and byte(r1) > 'A' - 1. 
-#if defined USE_AS_STRCASESTR && !defined STRCASESTR_NONASCII
-
-/* Similar to __m128i_strloadu.  Convert to lower case for POSIX/C
-   locale and others which have single-byte letters only in the ASCII
-   range.  */
-static __m128i
-__m128i_strloadu_tolower (const unsigned char *p, __m128i zero, __m128i uclow,
-			  __m128i uchigh, __m128i lcqword)
-{
-  __m128i frag = __m128i_strloadu (p, zero);
-
-  /* Compare if 'Z' > bytes.  Inverted way to get a mask for byte <= 'Z'.  */
-  __m128i r2 = _mm_cmpgt_epi8 (uchigh, frag);
-  /* Compare if bytes are > 'A' - 1.  */
-  __m128i r1 = _mm_cmpgt_epi8 (frag, uclow);
-  /* Mask byte == ff if byte(r2) <= 'Z' and byte(r1) > 'A' - 1.  */
-  __m128i mask = _mm_and_si128 (r2, r1);
-  /* Apply the lowercase bit 6 mask for the above mask bytes == ff.  */
-  return _mm_or_si128 (frag, _mm_and_si128 (mask, lcqword));
-}
-
-#endif
-
-/* Calculate the Knuth-Morris-Pratt (KMP) string searching algorithm
-   overlap for a fully populated 16B vector.
-   Input parameter: the 1st 16 bytes loaded from the reference string of
-   a strstr function.
-   We don't use the KMP algorithm if the reference string is less than
-   16B.  */
-static int
-__inline__ __attribute__ ((__always_inline__,))
-KMP16Bovrlap (__m128i s2)
-{
-  __m128i b = _mm_unpacklo_epi8 (s2, s2);
-  __m128i a = _mm_unpacklo_epi8 (b, b);
-  a = _mm_shuffle_epi32 (a, 0);
-  b = _mm_srli_si128 (s2, sizeof (char));
-  int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (b, a));
-
-  /* _BitScanForward(&k1, bmsk); */
-  int k1;
-  __asm ("bsfl %[bmsk], %[k1]" : [k1] "=r" (k1) : [bmsk] "r" (bmsk));
-  if (!bmsk)
-    return 16;
-  else if (bmsk == 0x7fff)
-    return 1;
-  else if (!k1)
-    {
-      /* There are at least two distinct chars in s2.  If bytes 0 and 1
-	 are identical and the distinct value lies farther down, we can
-	 deduce that the next byte offset to restart a full compare is
-	 no earlier than byte 3.  */
-      return 3;
-    }
-  else
-    {
-      /* Byte 1 does not degenerate to byte 0.  */
-      return k1 + 1;
-    }
-}
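KMP16Bovrlap is dense: the two unpacks plus the shuffle splat byte 0 of the needle across a vector, the srli shifts the needle down by one, and the compare mask therefore has bit i set exactly when needle[i + 1] == needle[0]. The bsf then finds the first recurrence of the leading byte, which bounds how far the search may safely skip after a false match. A scalar restatement, as an editor's sketch only (it assumes, as the comment above does, that the needle's first 16 bytes are all populated and loadable):

    /* Scalar equivalent of the vector mask + bsf in KMP16Bovrlap.  */
    static int
    kmp16_overlap (const unsigned char *needle)
    {
      int first = -1, nmatch = 0;
      for (int i = 0; i < 15; i++)
        if (needle[i + 1] == needle[0])
          {
            if (first < 0)
              first = i;
            nmatch++;
          }
      if (nmatch == 0)
        return 16;        /* Byte 0 never recurs: skip a whole fragment.  */
      if (nmatch == 15)
        return 1;         /* All bytes equal: only a shift of 1 is safe.  */
      if (first == 0)
        return 3;         /* Bytes 0 and 1 equal, a distinct byte follows.  */
      return first + 1;   /* First recurrence of byte 0 bounds the skip.  */
    }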
-
-char *
-__attribute__ ((section (".text.sse4.2")))
-STRSTR_SSE42 (const unsigned char *s1, const unsigned char *s2)
-{
-#define p1 s1
-  const unsigned char *p2 = s2;
-
-#ifndef STRCASESTR_NONASCII
-  if (__builtin_expect (p2[0] == '\0', 0))
-    return (char *) p1;
-
-  if (__builtin_expect (p1[0] == '\0', 0))
-    return NULL;
-
-  /* Check if p1 is only 1 byte long.  */
-  if (__builtin_expect (p1[1] == '\0', 0))
-    return p2[1] == '\0' && CMPBYTE (p1[0], p2[0]) ? (char *) p1 : NULL;
-#endif
-
-#ifdef USE_AS_STRCASESTR
-# ifndef STRCASESTR_NONASCII
-  if (__builtin_expect (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_NONASCII_CASE)
-			!= 0, 0))
-    return __strcasestr_sse42_nonascii (s1, s2);
-
-  const __m128i uclow = _mm_set1_epi8 (0x40);
-  const __m128i uchigh = _mm_set1_epi8 (0x5b);
-  const __m128i lcqword = _mm_set1_epi8 (0x20);
-  const __m128i zero = _mm_setzero_si128 ();
-# define strloadu(p) __m128i_strloadu_tolower (p, zero, uclow, uchigh, lcqword)
-# else
-#  define strloadu __m128i_strloadu_tolower
-#  define zero _mm_setzero_si128 ()
-# endif
-#else
-# define strloadu(p) __m128i_strloadu (p, zero)
-  const __m128i zero = _mm_setzero_si128 ();
+/* Redefine strstr so that the compiler won't complain about the type
+   mismatch with the IFUNC selector in strong_alias, below.  */
+#undef strstr
+#define strstr __redirect_strstr
+#include <string.h>
+#undef strstr
+
+#define STRSTR __strstr_sse2
+#ifdef SHARED
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(name) \
+  __hidden_ver1 (__strstr_sse2, __GI_strstr, __strstr_sse2);
 #endif
-
-  /* p1 is > 1 byte long.  Load up to 16 bytes of fragment.  */
-  __m128i frag1 = strloadu (p1);
-
-  __m128i frag2;
-  if (p2[1] != '\0')
-    /* p2 is > 1 byte long.  */
-    frag2 = strloadu (p2);
-  else
-    frag2 = _mm_insert_epi8 (zero, LOADBYTE (p2[0]), 0);
-
-  /* Unsigned bytes, equal order, does frag2 have a null?  */
-  int cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
-  int cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
-  int cmp = _mm_cmpistri (frag2, frag1, 0x0c);
-  int cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
-  if (cmp_s & cmp_c)
-    {
-      int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (frag2, zero));
-      int len;
-      __asm ("bsfl %[bmsk], %[len]"
-	     : [len] "=r" (len) : [bmsk] "r" (bmsk));
-      p1 += cmp;
-      if ((len + cmp) <= 16)
-	return (char *) p1;
-
-      /* Load up to 16 bytes of fragment.  */
-      frag1 = strloadu (p1);
-      cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
-      cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
-      cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
-      cmp = _mm_cmpistri (frag2, frag1, 0x0c);
-      if ((len + cmp) <= 16)
-	return (char *) p1 + cmp;
-    }
-
-  if (cmp_s)
-    {
-      /* Adjust the address for 16B alignment in the ensuing loop.  */
-      while (!cmp_z)
-	{
-	  p1 += cmp;
-	  /* Load up to 16 bytes of fragment.  */
-	  frag1 = strloadu (p1);
-	  cmp = _mm_cmpistri (frag2, frag1, 0x0c);
-	  cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
-	  cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
-	  /* Because s2 < 16 bytes and we adjusted p1 by a non-zero cmp
-	     once already, this time cmp will be zero and we can exit.  */
-	  if ((!cmp) & cmp_c)
-	    break;
-	}
-
-      if (!cmp_c)
-	return NULL;
-
-      /* Since s2 is less than 16 bytes, cmp_c is a definitive
-	 determination of a full match.  */
-      return (char *) p1 + cmp;
-    }
-
-  /* General case: s2 is at least 16 bytes long.
-     First, the common case of a false match at the first byte of p2.  */
-  const unsigned char *pt = NULL;
-  int kmp_fwd = 0;
-re_trace:
-  while (!cmp_c)
-    {
-      /* frag1 has a null.  */
-      if (cmp_z)
-	return NULL;
-
-      /* frag1 has no null; advance 16 bytes.  */
-      p1 += 16;
-      /* Load up to 16 bytes of fragment.  */
-      frag1 = strloadu (p1);
-      /* Unsigned bytes, equal order, is there a partial match?  */
-      cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
-      cmp = _mm_cmpistri (frag2, frag1, 0x0c);
-      cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
-    }
-
-  /* Next, handle an initial positive match as the first byte of p2.  We
-     have a partial fragment match; make a full determination until we
-     reach the end of s2.  */
-  if (!cmp)
-    {
-      if (cmp_z)
-	return (char *) p1;
-
-      pt = p1;
-      p1 += 16;
-      p2 += 16;
-      /* Load up to 16 bytes of fragment.  */
-      frag2 = strloadu (p2);
-    }
-  else
-    {
-      /* Adjust for 16B alignment.  */
-      p1 += cmp;
-      pt = p1;
-    }
-
-  /* Load up to 16 bytes of fragment.  */
-  frag1 = strloadu (p1);
-
-  /* Unsigned bytes, equal order, does frag2 have a null?  */
-  cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
-  cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
-  cmp = _mm_cmpistri (frag2, frag1, 0x0c);
-  cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
-  while (!(cmp | cmp_z | cmp_s))
-    {
-      p1 += 16;
-      p2 += 16;
-      /* Load up to 16 bytes of fragment.  */
-      frag2 = strloadu (p2);
-      /* Load up to 16 bytes of fragment.  */
-      frag1 = strloadu (p1);
-      /* Unsigned bytes, equal order, does frag2 have a null?  */
-      cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
-      cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
-      cmp = _mm_cmpistri (frag2, frag1, 0x0c);
-      cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
-    }
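Every loop in the code being removed funnels through pcmpistri with immediate 0x0c, i.e. _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ORDERED | _SIDD_LEAST_SIGNIFICANT: report the lowest offset in the second operand at which the first operand begins an ordered match. For readers who have not used these intrinsics, a minimal self-contained demonstration (editor's example, compile with -msse4.2; the strings are arbitrary):

    #include <nmmintrin.h>
    #include <stdio.h>

    int
    main (void)
    {
      /* Needle first, haystack second, exactly as in the calls above.
         Both literals are padded to 16 bytes.  */
      __m128i needle = _mm_loadu_si128 ((const __m128i *) "herd\0...........");
      __m128i text   = _mm_loadu_si128 ((const __m128i *) "sheepdog herders");
      int idx  = _mm_cmpistri (needle, text, 0x0c);  /* 9: "herd" at offset 9.  */
      int cflg = _mm_cmpistrc (needle, text, 0x0c);  /* 1: a match was found.  */
      printf ("CFlag=%d index=%d\n", cflg, idx);
      return 0;
    }

The Z and S flags read by the surrounding loops report a terminator in the haystack and needle fragments respectively, which is what drives the case analysis in the header comment further up.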
-
-  /* Full determination yielded a false result; retrace s1 to the next
-     starting position.
-	Zflg	 1	 0	 1	0/1
-	Sflg	 0	 1	 1	0/1
-	cmp	na	 0	 0	>0
-	action	done	done	continue	continue if s2 < s1
-		false	match	retrace s1	else false
-   */
-
-  if (cmp_s & !cmp)
-    return (char *) pt;
-  if (cmp_z)
-    {
-      if (!cmp_s)
-	return NULL;
-
-      /* Handle the case of both the zero and sign flags set and s1
-	 shorter in length.  */
-      int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag2));
-      int bmsk1 = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag1));
-      int len;
-      int len1;
-      __asm ("bsfl %[bmsk], %[len]"
-	     : [len] "=r" (len) : [bmsk] "r" (bmsk));
-      __asm ("bsfl %[bmsk1], %[len1]"
-	     : [len1] "=r" (len1) : [bmsk1] "r" (bmsk1));
-      if (len >= len1)
-	return NULL;
-    }
-  else if (!cmp)
-    return (char *) pt;
-
-  /* Otherwise, we have to retrace and continue.  This is the default
-     for the multiple paths that need to retrace from the next byte in
-     s1.  */
-  p2 = s2;
-  frag2 = strloadu (p2);
-
-  if (!kmp_fwd)
-    kmp_fwd = KMP16Bovrlap (frag2);
+#include "string/strstr.c"
-
-  /* The KMP-predicted overlap needs to be corrected for the partial
-     fragment compare.  */
-  p1 = pt + (kmp_fwd > cmp ? cmp : kmp_fwd);
+extern __typeof (__redirect_strstr) __strstr_sse2_unaligned attribute_hidden;
+extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden;
-
-  /* Since s2 is at least 16 bytes long, we're certain there is no
-     match.  */
-  if (p1[0] == '\0')
-    return NULL;
+#include "init-arch.h"
-
-  /* Load up to 16 bytes of fragment.  */
-  frag1 = strloadu (p1);
+/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
+   ifunc symbol properly.  */
+extern __typeof (__redirect_strstr) __libc_strstr;
+libc_ifunc (__libc_strstr, HAS_FAST_UNALIGNED_LOAD ? __strstr_sse2_unaligned : __strstr_sse2)
-
-  /* Unsigned bytes, equal order, is there a partial match?  */
-  cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
-  cmp = _mm_cmpistri (frag2, frag1, 0x0c);
-  cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
-  goto re_trace;
-}
+#undef strstr
+strong_alias (__libc_strstr, strstr)
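The __strstr_sse2_unaligned implementation selected above is the assembly earlier in this patch. As far as one can read from those hunks, it screens candidate positions by comparing the haystack against broadcasts of the needle's first two bytes (the paired pcmpeqb on loads at offset and offset + 1, combined with pminub and the zero-byte checks), then verifies the remaining needle bytes one at a time in the pair_loop labels; the L(next_pair3)/L(switch_strstr) path additionally caps the work spent on adversarial needles and bails out to __strstr_sse2. A rough C restatement of the filter, as an editor's sketch under simplifying assumptions (hypothetical helper name, no page-boundary handling, no quadratic guard):

    #include <emmintrin.h>
    #include <string.h>

    static const char *
    pair_filter_strstr (const char *s, const char *needle)
    {
      size_t nlen = strlen (needle);
      if (nlen == 0)
        return s;
      if (nlen == 1)
        return strchr (s, needle[0]);

      const __m128i zero = _mm_setzero_si128 ();
      const __m128i c0 = _mm_set1_epi8 (needle[0]);
      const __m128i c1 = _mm_set1_epi8 (needle[1]);

      for (;; s += 16)
        {
          /* Assumes S + 16 is readable; the real code aligns its loads
             and special-cases page crossings.  */
          __m128i a = _mm_loadu_si128 ((const __m128i *) s);
          __m128i b = _mm_loadu_si128 ((const __m128i *) (s + 1));
          unsigned int hit = _mm_movemask_epi8
            (_mm_and_si128 (_mm_cmpeq_epi8 (a, c0), _mm_cmpeq_epi8 (b, c1)));
          unsigned int eos = _mm_movemask_epi8 (_mm_cmpeq_epi8 (a, zero));
          if (eos)
            hit &= eos - 1;          /* Drop candidates at or past the NUL.  */
          while (hit)
            {
              int i = __builtin_ctz (hit);
              if (strncmp (s + i, needle, nlen) == 0)
                return s + i;        /* Pair and tail all match.  */
              hit &= hit - 1;        /* Clear the lowest candidate bit.  */
            }
          if (eos)
            return NULL;
        }
    }

The point of the two-byte filter is that a single vector pass eliminates almost all positions for typical text, so the byte-by-byte verification loop runs rarely; this is what lets the unaligned version beat the SSE4.2 pcmpistri approach on Silvermont-class cores.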
diff --git a/sysdeps/x86_64/multiarch/test-multiarch.c b/sysdeps/x86_64/multiarch/test-multiarch.c
index 7ad7cca21e..0b144bc06d 100644
--- a/sysdeps/x86_64/multiarch/test-multiarch.c
+++ b/sysdeps/x86_64/multiarch/test-multiarch.c
@@ -1,6 +1,6 @@
 /* Test CPU feature data.
    This file is part of the GNU C Library.
-   Copyright (C) 2012-2013 Free Software Foundation, Inc.
+   Copyright (C) 2012-2014 Free Software Foundation, Inc.
 
    The GNU C Library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
diff --git a/sysdeps/x86_64/multiarch/varshift.c b/sysdeps/x86_64/multiarch/varshift.c
index cdb0efb187..9761fb20c3 100644
--- a/sysdeps/x86_64/multiarch/varshift.c
+++ b/sysdeps/x86_64/multiarch/varshift.c
@@ -1,5 +1,5 @@
 /* Helper for variable shifts of SSE registers.
-   Copyright (C) 2010-2013 Free Software Foundation, Inc.
+   Copyright (C) 2010-2014 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/multiarch/varshift.h b/sysdeps/x86_64/multiarch/varshift.h
index 5b7e910eb2..4436a605bd 100644
--- a/sysdeps/x86_64/multiarch/varshift.h
+++ b/sysdeps/x86_64/multiarch/varshift.h
@@ -1,5 +1,5 @@
 /* Helper for variable shifts of SSE registers.
-   Copyright (C) 2010-2013 Free Software Foundation, Inc.
+   Copyright (C) 2010-2014 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/multiarch/wcscpy-ssse3.S b/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
index b7de092228..c79389ec3b 100644
--- a/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
+++ b/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
@@ -1,5 +1,5 @@
 /* wcscpy with SSSE3
-   Copyright (C) 2011-2013 Free Software Foundation, Inc.
+   Copyright (C) 2011-2014 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
diff --git a/sysdeps/x86_64/multiarch/wcscpy.S b/sysdeps/x86_64/multiarch/wcscpy.S
index e5ac97e558..f12ba27d60 100644
--- a/sysdeps/x86_64/multiarch/wcscpy.S
+++ b/sysdeps/x86_64/multiarch/wcscpy.S
@@ -1,6 +1,6 @@
 /* Multiple versions of wcscpy
    All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2011-2013 Free Software Foundation, Inc.
+   Copyright (C) 2011-2014 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
diff --git a/sysdeps/x86_64/multiarch/wmemcmp.S b/sysdeps/x86_64/multiarch/wmemcmp.S
index f7c8040527..37b9bbaeea 100644
--- a/sysdeps/x86_64/multiarch/wmemcmp.S
+++ b/sysdeps/x86_64/multiarch/wmemcmp.S
@@ -1,6 +1,6 @@
 /* Multiple versions of wmemcmp
    All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2011-2013 Free Software Foundation, Inc.
+   Copyright (C) 2011-2014 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
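The remaining hunks are routine copyright-year bumps. One closing note on the libc_ifunc macro used in strstr.c above: it expands to a GNU indirect function, whose resolver runs once at relocation time and returns the implementation every later call will use. A minimal user-space analogue, as an editor's illustration only (names are invented, and the predicate is a placeholder where glibc consults its cpu-features data):

    #include <string.h>

    static char *
    my_strstr_generic (const char *h, const char *n)
    {
      return strstr (h, n);
    }

    static char *
    my_strstr_unaligned (const char *h, const char *n)
    {
      /* Stand-in for an unaligned-load-optimized version.  */
      return strstr (h, n);
    }

    static int
    fast_unaligned_load_p (void)
    {
      return 1;  /* Placeholder; glibc tests bit_Fast_Unaligned_Load.  */
    }

    /* Resolver: evaluated once, at load time.  */
    static char *(*resolve_my_strstr (void)) (const char *, const char *)
    {
      return fast_unaligned_load_p () ? my_strstr_unaligned : my_strstr_generic;
    }

    char *my_strstr (const char *, const char *)
         __attribute__ ((ifunc ("resolve_my_strstr")));

This is the same shape as the __libc_strstr selector above; the strong_alias then exports the chosen symbol under the public strstr name.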