Diffstat (limited to 'sysdeps/x86_64/multiarch')
-rw-r--r--  sysdeps/x86_64/multiarch/Makefile                  |   15
-rw-r--r--  sysdeps/x86_64/multiarch/ifunc-impl-list.c         |   22
-rw-r--r--  sysdeps/x86_64/multiarch/init-arch.c               |   17
-rw-r--r--  sysdeps/x86_64/multiarch/init-arch.h               |    6
-rw-r--r--  sysdeps/x86_64/multiarch/memcmp-sse4.S             |   86
-rw-r--r--  sysdeps/x86_64/multiarch/memcmp-ssse3.S            |  134
-rw-r--r--  sysdeps/x86_64/multiarch/memcmp.S                  |    2
-rw-r--r--  sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S   |   12
-rw-r--r--  sysdeps/x86_64/multiarch/memcpy-ssse3-back.S       |   88
-rw-r--r--  sysdeps/x86_64/multiarch/memcpy-ssse3.S            |  256
-rw-r--r--  sysdeps/x86_64/multiarch/memcpy.S                  |    2
-rw-r--r--  sysdeps/x86_64/multiarch/memcpy_chk.S              |    2
-rw-r--r--  sysdeps/x86_64/multiarch/memmove.c                 |    2
-rw-r--r--  sysdeps/x86_64/multiarch/memmove_chk.c             |    2
-rw-r--r--  sysdeps/x86_64/multiarch/mempcpy.S                 |    2
-rw-r--r--  sysdeps/x86_64/multiarch/mempcpy_chk.S             |    2
-rw-r--r--  sysdeps/x86_64/multiarch/rawmemchr.S               |  103
-rw-r--r--  sysdeps/x86_64/multiarch/sched_cpucount.c          |    2
-rw-r--r--  sysdeps/x86_64/multiarch/strcasestr-c.c            |   19
-rw-r--r--  sysdeps/x86_64/multiarch/strcasestr-nonascii.c     |   50
-rw-r--r--  sysdeps/x86_64/multiarch/strcasestr.c              |   18
-rw-r--r--  sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S   |    2
-rw-r--r--  sysdeps/x86_64/multiarch/strcat-ssse3.S            |    2
-rw-r--r--  sysdeps/x86_64/multiarch/strcat.S                  |    2
-rw-r--r--  sysdeps/x86_64/multiarch/strchr-sse2-no-bsf.S      |    2
-rw-r--r--  sysdeps/x86_64/multiarch/strchr.S                  |  127
-rw-r--r--  sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S   |  209
-rw-r--r--  sysdeps/x86_64/multiarch/strcmp-sse42.S            |    4
-rw-r--r--  sysdeps/x86_64/multiarch/strcmp.S                  |   43
-rw-r--r--  sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S   |    6
-rw-r--r--  sysdeps/x86_64/multiarch/strcpy-ssse3.S            |    2
-rw-r--r--  sysdeps/x86_64/multiarch/strcpy.S                  |    2
-rw-r--r--  sysdeps/x86_64/multiarch/strcspn-c.c               |    2
-rw-r--r--  sysdeps/x86_64/multiarch/strcspn.S                 |    2
-rw-r--r--  sysdeps/x86_64/multiarch/strend-sse4.S             |   48
-rw-r--r--  sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S     |  555
-rw-r--r--  sysdeps/x86_64/multiarch/strrchr.S                 |  286
-rw-r--r--  sysdeps/x86_64/multiarch/strspn-c.c                |    2
-rw-r--r--  sysdeps/x86_64/multiarch/strspn.S                  |    2
-rw-r--r--  sysdeps/x86_64/multiarch/strstr-c.c                |   47
-rw-r--r--  sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S   |  374
-rw-r--r--  sysdeps/x86_64/multiarch/strstr.c                  |  388
-rw-r--r--  sysdeps/x86_64/multiarch/test-multiarch.c          |    2
-rw-r--r--  sysdeps/x86_64/multiarch/varshift.c                |    2
-rw-r--r--  sysdeps/x86_64/multiarch/varshift.h                |    2
-rw-r--r--  sysdeps/x86_64/multiarch/wcscpy-ssse3.S            |    2
-rw-r--r--  sysdeps/x86_64/multiarch/wcscpy.S                  |    2
-rw-r--r--  sysdeps/x86_64/multiarch/wmemcmp.S                 |    2
48 files changed, 987 insertions, 1974 deletions
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 203d16eed3..57a3c13e8a 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -6,25 +6,24 @@ endif
ifeq ($(subdir),string)
-sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
- strend-sse4 memcmp-sse4 memcpy-ssse3 memcpy-sse2-unaligned mempcpy-ssse3 \
+sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
+ strcmp-sse2-unaligned strncmp-ssse3 \
+ memcmp-sse4 memcpy-ssse3 \
+ memcpy-sse2-unaligned mempcpy-ssse3 \
memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
- memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
+ memmove-ssse3-back strcasecmp_l-ssse3 \
strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
strcpy-sse2-unaligned strncpy-sse2-unaligned \
stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
strcat-sse2-unaligned strncat-sse2-unaligned \
- strrchr-sse2-no-bsf strchr-sse2-no-bsf memcmp-ssse3
+ strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned
ifeq (yes,$(config-cflags-sse4))
-sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
+sysdep_routines += strcspn-c strpbrk-c strspn-c varshift
CFLAGS-varshift.c += -msse4
CFLAGS-strcspn-c.c += -msse4
CFLAGS-strpbrk-c.c += -msse4
CFLAGS-strspn-c.c += -msse4
-CFLAGS-strstr.c += -msse4
-CFLAGS-strcasestr.c += -msse4
-CFLAGS-strcasestr-nonascii.c += -msse4
endif
endif
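
The per-file CFLAGS lines above are the standard glibc pattern for compiling one routine with an ISA extension the rest of libc must not assume: the object is listed in sysdep_routines, and only its own compilation gets -msse4. A minimal sketch of the pattern (the routine name "foo-sse4" is hypothetical):

    ifeq ($(subdir),string)
    # foo-sse4.c is linked into libc, but only this object may use SSE4.
    sysdep_routines += foo-sse4
    CFLAGS-foo-sse4.c += -msse4
    endif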
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 28d35793c5..6da9be1420 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -1,5 +1,5 @@
/* Enumerate available IFUNC implementations of a function. x86-64 version.
- Copyright (C) 2012-2013 Free Software Foundation, Inc.
+ Copyright (C) 2012-2014 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -61,12 +61,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__memmove_ssse3)
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_sse2))
- /* Support sysdeps/x86_64/multiarch/rawmemchr.S. */
- IFUNC_IMPL (i, name, rawmemchr,
- IFUNC_IMPL_ADD (array, i, rawmemchr, HAS_SSE4_2,
- __rawmemchr_sse42)
- IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_sse2))
-
/* Support sysdeps/x86_64/multiarch/stpncpy.S. */
IFUNC_IMPL (i, name, stpncpy,
IFUNC_IMPL_ADD (array, i, stpncpy, HAS_SSSE3,
@@ -104,8 +98,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strcasestr.c. */
IFUNC_IMPL (i, name, strcasestr,
- IFUNC_IMPL_ADD (array, i, strcasestr, HAS_SSE4_2,
- __strcasestr_sse42)
IFUNC_IMPL_ADD (array, i, strcasestr, 1, __strcasestr_sse2))
/* Support sysdeps/x86_64/multiarch/strcat.S. */
@@ -116,7 +108,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strchr.S. */
IFUNC_IMPL (i, name, strchr,
- IFUNC_IMPL_ADD (array, i, strchr, HAS_SSE4_2, __strchr_sse42)
IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2_no_bsf)
IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2))
@@ -124,6 +115,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL (i, name, strcmp,
IFUNC_IMPL_ADD (array, i, strcmp, HAS_SSE4_2, __strcmp_sse42)
IFUNC_IMPL_ADD (array, i, strcmp, HAS_SSSE3, __strcmp_ssse3)
+ IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2))
/* Support sysdeps/x86_64/multiarch/strcpy.S. */
@@ -182,21 +174,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__strpbrk_sse42)
IFUNC_IMPL_ADD (array, i, strpbrk, 1, __strpbrk_sse2))
- /* Support sysdeps/x86_64/multiarch/strrchr.S. */
- IFUNC_IMPL (i, name, strrchr,
- IFUNC_IMPL_ADD (array, i, strrchr, HAS_SSE4_2,
- __strrchr_sse42)
- IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_sse2_no_bsf)
- IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_sse2))
/* Support sysdeps/x86_64/multiarch/strspn.S. */
IFUNC_IMPL (i, name, strspn,
IFUNC_IMPL_ADD (array, i, strspn, HAS_SSE4_2, __strspn_sse42)
IFUNC_IMPL_ADD (array, i, strspn, 1, __strspn_sse2))
- /* Support sysdeps/x86_64/multiarch/strstr-c.c. */
+ /* Support sysdeps/x86_64/multiarch/strstr.c. */
IFUNC_IMPL (i, name, strstr,
- IFUNC_IMPL_ADD (array, i, strstr, HAS_SSE4_2, __strstr_sse42)
+ IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2))
/* Support sysdeps/x86_64/multiarch/wcscpy.S. */
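
Each IFUNC_IMPL block above registers the candidate implementations of one function so the testsuite can exercise all of them; at run time an IFUNC resolver binds exactly one. A minimal self-contained sketch of the GNU indirect-function mechanism these lists describe (function names are illustrative, not glibc internals):

    /* ifunc-sketch.c: pick an implementation once, at relocation time.
       The two implementations are deliberately identical placeholders.  */
    #include <stddef.h>
    #include <stdio.h>

    static size_t my_strlen_sse2 (const char *s)
    { size_t n = 0; while (s[n]) n++; return n; }
    static size_t my_strlen_sse42 (const char *s)
    { size_t n = 0; while (s[n]) n++; return n; }

    static void *
    my_strlen_resolver (void)
    {
      __builtin_cpu_init ();   /* resolvers run before constructors */
      return __builtin_cpu_supports ("sse4.2")
	     ? (void *) my_strlen_sse42 : (void *) my_strlen_sse2;
    }

    size_t my_strlen (const char *s)
	 __attribute__ ((ifunc ("my_strlen_resolver")));

    int main (void) { printf ("%zu\n", my_strlen ("silvermont")); return 0; }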
diff --git a/sysdeps/x86_64/multiarch/init-arch.c b/sysdeps/x86_64/multiarch/init-arch.c
index 7daaf46099..db74d977f2 100644
--- a/sysdeps/x86_64/multiarch/init-arch.c
+++ b/sysdeps/x86_64/multiarch/init-arch.c
@@ -1,6 +1,6 @@
/* Initialize CPU feature data.
This file is part of the GNU C Library.
- Copyright (C) 2008-2013 Free Software Foundation, Inc.
+ Copyright (C) 2008-2014 Free Software Foundation, Inc.
Contributed by Ulrich Drepper <drepper@redhat.com>.
The GNU C Library is free software; you can redistribute it and/or
@@ -78,6 +78,21 @@ __init_cpu_features (void)
__cpu_features.feature[index_Slow_BSF] |= bit_Slow_BSF;
break;
+ case 0x37:
+ /* Unaligned load versions are faster than SSSE3
+ on Silvermont. */
+#if index_Fast_Unaligned_Load != index_Prefer_PMINUB_for_stringop
+# error index_Fast_Unaligned_Load != index_Prefer_PMINUB_for_stringop
+#endif
+#if index_Fast_Unaligned_Load != index_Slow_SSE4_2
+# error index_Fast_Unaligned_Load != index_Slow_SSE4_2
+#endif
+ __cpu_features.feature[index_Fast_Unaligned_Load]
+ |= (bit_Fast_Unaligned_Load
+ | bit_Prefer_PMINUB_for_stringop
+ | bit_Slow_SSE4_2);
+ break;
+
default:
/* Unknown family 0x06 processors. Assuming this is one
of Core i3/i5/i7 processors if AVX is available. */
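
The case label 0x37 above comes from CPUID leaf 1: on family 0x06 parts the displayed model is the 4-bit model field extended by the extended-model field, and 0x37 identifies Silvermont. A standalone sketch of that decoding using GCC's <cpuid.h>:

    #include <cpuid.h>
    #include <stdio.h>

    int main (void)
    {
      unsigned int eax, ebx, ecx, edx;
      if (!__get_cpuid (1, &eax, &ebx, &ecx, &edx))
	return 1;
      unsigned int family = (eax >> 8) & 0xf;
      unsigned int model = (eax >> 4) & 0xf;
      if (family == 0x06)   /* extended model applies to families 6 and 15 */
	model += ((eax >> 16) & 0xf) << 4;
      /* family 0x06, model 0x37 => Silvermont */
      printf ("family 0x%02x, model 0x%02x\n", family, model);
      return 0;
    }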
diff --git a/sysdeps/x86_64/multiarch/init-arch.h b/sysdeps/x86_64/multiarch/init-arch.h
index 28edbf7d07..793707a4da 100644
--- a/sysdeps/x86_64/multiarch/init-arch.h
+++ b/sysdeps/x86_64/multiarch/init-arch.h
@@ -1,5 +1,5 @@
/* This file is part of the GNU C Library.
- Copyright (C) 2008-2013 Free Software Foundation, Inc.
+ Copyright (C) 2008-2014 Free Software Foundation, Inc.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
@@ -23,6 +23,7 @@
#define bit_AVX_Usable (1 << 6)
#define bit_FMA_Usable (1 << 7)
#define bit_FMA4_Usable (1 << 8)
+#define bit_Slow_SSE4_2 (1 << 9)
/* CPUID Feature flags. */
@@ -62,6 +63,7 @@
# define index_AVX_Usable FEATURE_INDEX_1*FEATURE_SIZE
# define index_FMA_Usable FEATURE_INDEX_1*FEATURE_SIZE
# define index_FMA4_Usable FEATURE_INDEX_1*FEATURE_SIZE
+# define index_Slow_SSE4_2 FEATURE_INDEX_1*FEATURE_SIZE
#else /* __ASSEMBLER__ */
@@ -156,9 +158,11 @@ extern const struct cpu_features *__get_cpu_features (void)
# define index_Fast_Copy_Backward FEATURE_INDEX_1
# define index_Slow_BSF FEATURE_INDEX_1
# define index_Fast_Unaligned_Load FEATURE_INDEX_1
+# define index_Prefer_PMINUB_for_stringop FEATURE_INDEX_1
# define index_AVX_Usable FEATURE_INDEX_1
# define index_FMA_Usable FEATURE_INDEX_1
# define index_FMA4_Usable FEATURE_INDEX_1
+# define index_Slow_SSE4_2 FEATURE_INDEX_1
# define HAS_ARCH_FEATURE(name) \
((__get_cpu_features ()->feature[index_##name] & (bit_##name)) != 0)
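
HAS_ARCH_FEATURE works because every feature is described by a matching (index_*, bit_*) pair: the index selects a word in the feature array and the bit selects a flag within it. That is also why the #error checks in init-arch.c assert that bits OR-ed together in a single store share one index. A compact C sketch of the scheme (the scaffolding is illustrative):

    /* One feature word; each feature is an (index, bit) pair.  */
    enum { FEATURE_INDEX_1 = 0, FEATURE_INDEX_MAX };
    static unsigned int feature[FEATURE_INDEX_MAX];

    #define bit_Slow_SSE4_2    (1 << 9)
    #define index_Slow_SSE4_2  FEATURE_INDEX_1

    #define HAS_ARCH_FEATURE(name) \
      ((feature[index_##name] & (bit_##name)) != 0)

    /* During CPU detection:  feature[index_Slow_SSE4_2] |= bit_Slow_SSE4_2;
       In a selector:         if (HAS_ARCH_FEATURE (Slow_SSE4_2)) use SSE2.  */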
diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
index 1ed4200f4c..e753d62bf4 100644
--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
+++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S
@@ -1,5 +1,5 @@
/* memcmp with SSE4.1, wmemcmp with SSE4.1
- Copyright (C) 2010-2013 Free Software Foundation, Inc.
+ Copyright (C) 2010-2014 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
@@ -25,10 +25,6 @@
# define MEMCMP __memcmp_sse4_1
# endif
-# ifndef ALIGN
-# define ALIGN(n) .p2align n
-# endif
-
# define JMPTBL(I, B) (I - B)
# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
@@ -60,7 +56,7 @@ ENTRY (MEMCMP)
BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
# ifndef USE_AS_WMEMCMP
- ALIGN (4)
+ .p2align 4
L(firstbyte):
movzbl (%rdi), %eax
movzbl (%rsi), %ecx
@@ -68,7 +64,7 @@ L(firstbyte):
ret
# endif
- ALIGN (4)
+ .p2align 4
L(79bytesormore):
movdqu (%rsi), %xmm1
movdqu (%rdi), %xmm2
@@ -316,7 +312,7 @@ L(less32bytesin256):
add %rdx, %rdi
BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(512bytesormore):
# ifdef DATA_CACHE_SIZE_HALF
mov $DATA_CACHE_SIZE_HALF, %R8_LP
@@ -329,7 +325,7 @@ L(512bytesormore):
cmp %r8, %rdx
ja L(L2_L3_cache_unaglined)
sub $64, %rdx
- ALIGN (4)
+ .p2align 4
L(64bytesormore_loop):
movdqu (%rdi), %xmm2
pxor (%rsi), %xmm2
@@ -361,7 +357,7 @@ L(64bytesormore_loop):
L(L2_L3_cache_unaglined):
sub $64, %rdx
- ALIGN (4)
+ .p2align 4
L(L2_L3_unaligned_128bytes_loop):
prefetchnta 0x1c0(%rdi)
prefetchnta 0x1c0(%rsi)
@@ -396,7 +392,7 @@ L(L2_L3_unaligned_128bytes_loop):
/*
* This case is for machines which are sensitive to unaligned instructions.
*/
- ALIGN (4)
+ .p2align 4
L(2aligned):
cmp $128, %rdx
ja L(128bytesormorein2aligned)
@@ -444,7 +440,7 @@ L(less32bytesin64in2alinged):
add %rdx, %rdi
BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(128bytesormorein2aligned):
cmp $512, %rdx
ja L(512bytesormorein2aligned)
@@ -519,7 +515,7 @@ L(less32bytesin128in2aligned):
add %rdx, %rdi
BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(256bytesormorein2aligned):
sub $256, %rdx
@@ -632,7 +628,7 @@ L(less32bytesin256in2alinged):
add %rdx, %rdi
BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(512bytesormorein2aligned):
# ifdef DATA_CACHE_SIZE_HALF
mov $DATA_CACHE_SIZE_HALF, %R8_LP
@@ -646,7 +642,7 @@ L(512bytesormorein2aligned):
ja L(L2_L3_cache_aglined)
sub $64, %rdx
- ALIGN (4)
+ .p2align 4
L(64bytesormore_loopin2aligned):
movdqa (%rdi), %xmm2
pxor (%rsi), %xmm2
@@ -678,7 +674,7 @@ L(64bytesormore_loopin2aligned):
L(L2_L3_cache_aglined):
sub $64, %rdx
- ALIGN (4)
+ .p2align 4
L(L2_L3_aligned_128bytes_loop):
prefetchnta 0x1c0(%rdi)
prefetchnta 0x1c0(%rsi)
@@ -711,7 +707,7 @@ L(L2_L3_aligned_128bytes_loop):
BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(64bytesormore_loop_end):
add $16, %rdi
add $16, %rsi
@@ -806,7 +802,7 @@ L(8bytes):
xor %eax, %eax
ret
- ALIGN (4)
+ .p2align 4
L(12bytes):
mov -12(%rdi), %rax
mov -12(%rsi), %rcx
@@ -827,7 +823,7 @@ L(0bytes):
# ifndef USE_AS_WMEMCMP
/* unreal case for wmemcmp */
- ALIGN (4)
+ .p2align 4
L(65bytes):
movdqu -65(%rdi), %xmm1
movdqu -65(%rsi), %xmm2
@@ -864,7 +860,7 @@ L(9bytes):
sub %edx, %eax
ret
- ALIGN (4)
+ .p2align 4
L(13bytes):
mov -13(%rdi), %rax
mov -13(%rsi), %rcx
@@ -877,7 +873,7 @@ L(13bytes):
xor %eax, %eax
ret
- ALIGN (4)
+ .p2align 4
L(5bytes):
mov -5(%rdi), %eax
mov -5(%rsi), %ecx
@@ -888,7 +884,7 @@ L(5bytes):
sub %edx, %eax
ret
- ALIGN (4)
+ .p2align 4
L(66bytes):
movdqu -66(%rdi), %xmm1
movdqu -66(%rsi), %xmm2
@@ -929,7 +925,7 @@ L(10bytes):
sub %ecx, %eax
ret
- ALIGN (4)
+ .p2align 4
L(14bytes):
mov -14(%rdi), %rax
mov -14(%rsi), %rcx
@@ -942,7 +938,7 @@ L(14bytes):
xor %eax, %eax
ret
- ALIGN (4)
+ .p2align 4
L(6bytes):
mov -6(%rdi), %eax
mov -6(%rsi), %ecx
@@ -958,7 +954,7 @@ L(2bytes):
sub %ecx, %eax
ret
- ALIGN (4)
+ .p2align 4
L(67bytes):
movdqu -67(%rdi), %xmm2
movdqu -67(%rsi), %xmm1
@@ -997,7 +993,7 @@ L(11bytes):
xor %eax, %eax
ret
- ALIGN (4)
+ .p2align 4
L(15bytes):
mov -15(%rdi), %rax
mov -15(%rsi), %rcx
@@ -1010,7 +1006,7 @@ L(15bytes):
xor %eax, %eax
ret
- ALIGN (4)
+ .p2align 4
L(7bytes):
mov -7(%rdi), %eax
mov -7(%rsi), %ecx
@@ -1023,7 +1019,7 @@ L(7bytes):
xor %eax, %eax
ret
- ALIGN (4)
+ .p2align 4
L(3bytes):
movzwl -3(%rdi), %eax
movzwl -3(%rsi), %ecx
@@ -1036,7 +1032,7 @@ L(1bytes):
ret
# endif
- ALIGN (4)
+ .p2align 4
L(68bytes):
movdqu -68(%rdi), %xmm2
movdqu -68(%rsi), %xmm1
@@ -1079,7 +1075,7 @@ L(20bytes):
# ifndef USE_AS_WMEMCMP
/* unreal cases for wmemcmp */
- ALIGN (4)
+ .p2align 4
L(69bytes):
movdqu -69(%rsi), %xmm1
movdqu -69(%rdi), %xmm2
@@ -1115,7 +1111,7 @@ L(21bytes):
xor %eax, %eax
ret
- ALIGN (4)
+ .p2align 4
L(70bytes):
movdqu -70(%rsi), %xmm1
movdqu -70(%rdi), %xmm2
@@ -1151,7 +1147,7 @@ L(22bytes):
xor %eax, %eax
ret
- ALIGN (4)
+ .p2align 4
L(71bytes):
movdqu -71(%rsi), %xmm1
movdqu -71(%rdi), %xmm2
@@ -1188,7 +1184,7 @@ L(23bytes):
ret
# endif
- ALIGN (4)
+ .p2align 4
L(72bytes):
movdqu -72(%rsi), %xmm1
movdqu -72(%rdi), %xmm2
@@ -1227,7 +1223,7 @@ L(24bytes):
# ifndef USE_AS_WMEMCMP
/* unreal cases for wmemcmp */
- ALIGN (4)
+ .p2align 4
L(73bytes):
movdqu -73(%rsi), %xmm1
movdqu -73(%rdi), %xmm2
@@ -1265,7 +1261,7 @@ L(25bytes):
sub %ecx, %eax
ret
- ALIGN (4)
+ .p2align 4
L(74bytes):
movdqu -74(%rsi), %xmm1
movdqu -74(%rdi), %xmm2
@@ -1302,7 +1298,7 @@ L(26bytes):
movzwl -2(%rsi), %ecx
jmp L(diffin2bytes)
- ALIGN (4)
+ .p2align 4
L(75bytes):
movdqu -75(%rsi), %xmm1
movdqu -75(%rdi), %xmm2
@@ -1342,7 +1338,7 @@ L(27bytes):
xor %eax, %eax
ret
# endif
- ALIGN (4)
+ .p2align 4
L(76bytes):
movdqu -76(%rsi), %xmm1
movdqu -76(%rdi), %xmm2
@@ -1388,7 +1384,7 @@ L(28bytes):
# ifndef USE_AS_WMEMCMP
/* unreal cases for wmemcmp */
- ALIGN (4)
+ .p2align 4
L(77bytes):
movdqu -77(%rsi), %xmm1
movdqu -77(%rdi), %xmm2
@@ -1430,7 +1426,7 @@ L(29bytes):
xor %eax, %eax
ret
- ALIGN (4)
+ .p2align 4
L(78bytes):
movdqu -78(%rsi), %xmm1
movdqu -78(%rdi), %xmm2
@@ -1470,7 +1466,7 @@ L(30bytes):
xor %eax, %eax
ret
- ALIGN (4)
+ .p2align 4
L(79bytes):
movdqu -79(%rsi), %xmm1
movdqu -79(%rdi), %xmm2
@@ -1510,7 +1506,7 @@ L(31bytes):
xor %eax, %eax
ret
# endif
- ALIGN (4)
+ .p2align 4
L(64bytes):
movdqu -64(%rdi), %xmm2
movdqu -64(%rsi), %xmm1
@@ -1548,7 +1544,7 @@ L(32bytes):
/*
* Aligned 8 bytes to avoid 2 branch "taken" in one 16-byte aligned code block.
*/
- ALIGN (3)
+ .p2align 3
L(less16bytes):
movsbq %dl, %rdx
mov (%rsi, %rdx), %rcx
@@ -1585,7 +1581,7 @@ L(diffin2bytes):
sub %ecx, %eax
ret
- ALIGN (4)
+ .p2align 4
L(end):
and $0xff, %eax
and $0xff, %ecx
@@ -1599,7 +1595,7 @@ L(end):
neg %eax
ret
- ALIGN (4)
+ .p2align 4
L(nequal_bigger):
ret
@@ -1611,7 +1607,7 @@ L(unreal_case):
END (MEMCMP)
.section .rodata.sse4.1,"a",@progbits
- ALIGN (3)
+ .p2align 3
# ifndef USE_AS_WMEMCMP
L(table_64bytes):
.int JMPTBL (L(0bytes), L(table_64bytes))
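
JMPTBL (I, B) stores I - B, the 32-bit offset of each target label relative to the table itself, so the dispatch needs no absolute addresses and stays position independent. A sketch of how BRANCH_TO_JMPTBL_ENTRY consumes such a table (register choices are illustrative; the byte count is in %rdx and each entry is 4 bytes):

	lea	L(table_64bytes)(%rip), %r11	/* table base address */
	movslq	(%r11, %rdx, 4), %rcx		/* sign-extend entry %rdx */
	add	%r11, %rcx			/* base + offset = target */
	jmp	*%rcx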
diff --git a/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
index bdd2ed213c..5f7572fbab 100644
--- a/sysdeps/x86_64/multiarch/memcmp-ssse3.S
+++ b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
@@ -1,5 +1,5 @@
/* memcmp with SSSE3, wmemcmp with SSSE3
- Copyright (C) 2011-2013 Free Software Foundation, Inc.
+ Copyright (C) 2011-2014 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
@@ -25,10 +25,6 @@
# define MEMCMP __memcmp_ssse3
# endif
-# ifndef ALIGN
-# define ALIGN(n) .p2align n
-# endif
-
/* Warning!
wmemcmp has to use SIGNED comparison for elements.
memcmp has to use UNSIGNED comparison for elements.
@@ -50,7 +46,7 @@ ENTRY (MEMCMP)
add %rcx, %rdi
jmp L(less48bytes)
- ALIGN (4)
+ .p2align 4
/* ECX >= 32. */
L(48bytesormore):
movdqu (%rdi), %xmm3
@@ -90,7 +86,7 @@ L(48bytesormore):
je L(shr_6)
jmp L(shr_7)
- ALIGN (2)
+ .p2align 2
L(next_unaligned_table):
cmp $8, %edx
je L(shr_8)
@@ -117,7 +113,7 @@ L(next_unaligned_table):
jmp L(shr_12)
# endif
- ALIGN (4)
+ .p2align 4
L(shr_0):
cmp $80, %rcx
lea -48(%rcx), %rcx
@@ -137,7 +133,7 @@ L(shr_0):
add %rcx, %rdi
jmp L(less48bytes)
- ALIGN (4)
+ .p2align 4
L(shr_0_gobble):
movdqa (%rsi), %xmm0
xor %eax, %eax
@@ -180,7 +176,7 @@ L(next):
# ifndef USE_AS_WMEMCMP
- ALIGN (4)
+ .p2align 4
L(shr_1):
cmp $80, %rcx
lea -48(%rcx), %rcx
@@ -207,7 +203,7 @@ L(shr_1):
add %rcx, %rdi
jmp L(less48bytes)
- ALIGN (4)
+ .p2align 4
L(shr_1_gobble):
sub $32, %rcx
movdqa 16(%rsi), %xmm0
@@ -258,7 +254,7 @@ L(shr_1_gobble_next):
jmp L(less48bytes)
- ALIGN (4)
+ .p2align 4
L(shr_2):
cmp $80, %rcx
lea -48(%rcx), %rcx
@@ -285,7 +281,7 @@ L(shr_2):
add %rcx, %rdi
jmp L(less48bytes)
- ALIGN (4)
+ .p2align 4
L(shr_2_gobble):
sub $32, %rcx
movdqa 16(%rsi), %xmm0
@@ -335,7 +331,7 @@ L(shr_2_gobble_next):
add %rcx, %rdi
jmp L(less48bytes)
- ALIGN (4)
+ .p2align 4
L(shr_3):
cmp $80, %rcx
lea -48(%rcx), %rcx
@@ -362,7 +358,7 @@ L(shr_3):
add %rcx, %rdi
jmp L(less48bytes)
- ALIGN (4)
+ .p2align 4
L(shr_3_gobble):
sub $32, %rcx
movdqa 16(%rsi), %xmm0
@@ -414,7 +410,7 @@ L(shr_3_gobble_next):
# endif
- ALIGN (4)
+ .p2align 4
L(shr_4):
cmp $80, %rcx
lea -48(%rcx), %rcx
@@ -441,7 +437,7 @@ L(shr_4):
add %rcx, %rdi
jmp L(less48bytes)
- ALIGN (4)
+ .p2align 4
L(shr_4_gobble):
sub $32, %rcx
movdqa 16(%rsi), %xmm0
@@ -493,7 +489,7 @@ L(shr_4_gobble_next):
# ifndef USE_AS_WMEMCMP
- ALIGN (4)
+ .p2align 4
L(shr_5):
cmp $80, %rcx
lea -48(%rcx), %rcx
@@ -520,7 +516,7 @@ L(shr_5):
add %rcx, %rdi
jmp L(less48bytes)
- ALIGN (4)
+ .p2align 4
L(shr_5_gobble):
sub $32, %rcx
movdqa 16(%rsi), %xmm0
@@ -570,7 +566,7 @@ L(shr_5_gobble_next):
add %rcx, %rdi
jmp L(less48bytes)
- ALIGN (4)
+ .p2align 4
L(shr_6):
cmp $80, %rcx
lea -48(%rcx), %rcx
@@ -597,7 +593,7 @@ L(shr_6):
add %rcx, %rdi
jmp L(less48bytes)
- ALIGN (4)
+ .p2align 4
L(shr_6_gobble):
sub $32, %rcx
movdqa 16(%rsi), %xmm0
@@ -647,7 +643,7 @@ L(shr_6_gobble_next):
add %rcx, %rdi
jmp L(less48bytes)
- ALIGN (4)
+ .p2align 4
L(shr_7):
cmp $80, %rcx
lea -48(%rcx), %rcx
@@ -674,7 +670,7 @@ L(shr_7):
add %rcx, %rdi
jmp L(less48bytes)
- ALIGN (4)
+ .p2align 4
L(shr_7_gobble):
sub $32, %rcx
movdqa 16(%rsi), %xmm0
@@ -726,7 +722,7 @@ L(shr_7_gobble_next):
# endif
- ALIGN (4)
+ .p2align 4
L(shr_8):
cmp $80, %rcx
lea -48(%rcx), %rcx
@@ -753,7 +749,7 @@ L(shr_8):
add %rcx, %rdi
jmp L(less48bytes)
- ALIGN (4)
+ .p2align 4
L(shr_8_gobble):
sub $32, %rcx
movdqa 16(%rsi), %xmm0
@@ -805,7 +801,7 @@ L(shr_8_gobble_next):
# ifndef USE_AS_WMEMCMP
- ALIGN (4)
+ .p2align 4
L(shr_9):
cmp $80, %rcx
lea -48(%rcx), %rcx
@@ -832,7 +828,7 @@ L(shr_9):
add %rcx, %rdi
jmp L(less48bytes)
- ALIGN (4)
+ .p2align 4
L(shr_9_gobble):
sub $32, %rcx
movdqa 16(%rsi), %xmm0
@@ -882,7 +878,7 @@ L(shr_9_gobble_next):
add %rcx, %rdi
jmp L(less48bytes)
- ALIGN (4)
+ .p2align 4
L(shr_10):
cmp $80, %rcx
lea -48(%rcx), %rcx
@@ -909,7 +905,7 @@ L(shr_10):
add %rcx, %rdi
jmp L(less48bytes)
- ALIGN (4)
+ .p2align 4
L(shr_10_gobble):
sub $32, %rcx
movdqa 16(%rsi), %xmm0
@@ -959,7 +955,7 @@ L(shr_10_gobble_next):
add %rcx, %rdi
jmp L(less48bytes)
- ALIGN (4)
+ .p2align 4
L(shr_11):
cmp $80, %rcx
lea -48(%rcx), %rcx
@@ -986,7 +982,7 @@ L(shr_11):
add %rcx, %rdi
jmp L(less48bytes)
- ALIGN (4)
+ .p2align 4
L(shr_11_gobble):
sub $32, %rcx
movdqa 16(%rsi), %xmm0
@@ -1038,7 +1034,7 @@ L(shr_11_gobble_next):
# endif
- ALIGN (4)
+ .p2align 4
L(shr_12):
cmp $80, %rcx
lea -48(%rcx), %rcx
@@ -1065,7 +1061,7 @@ L(shr_12):
add %rcx, %rdi
jmp L(less48bytes)
- ALIGN (4)
+ .p2align 4
L(shr_12_gobble):
sub $32, %rcx
movdqa 16(%rsi), %xmm0
@@ -1117,7 +1113,7 @@ L(shr_12_gobble_next):
# ifndef USE_AS_WMEMCMP
- ALIGN (4)
+ .p2align 4
L(shr_13):
cmp $80, %rcx
lea -48(%rcx), %rcx
@@ -1144,7 +1140,7 @@ L(shr_13):
add %rcx, %rdi
jmp L(less48bytes)
- ALIGN (4)
+ .p2align 4
L(shr_13_gobble):
sub $32, %rcx
movdqa 16(%rsi), %xmm0
@@ -1194,7 +1190,7 @@ L(shr_13_gobble_next):
add %rcx, %rdi
jmp L(less48bytes)
- ALIGN (4)
+ .p2align 4
L(shr_14):
cmp $80, %rcx
lea -48(%rcx), %rcx
@@ -1221,7 +1217,7 @@ L(shr_14):
add %rcx, %rdi
jmp L(less48bytes)
- ALIGN (4)
+ .p2align 4
L(shr_14_gobble):
sub $32, %rcx
movdqa 16(%rsi), %xmm0
@@ -1271,7 +1267,7 @@ L(shr_14_gobble_next):
add %rcx, %rdi
jmp L(less48bytes)
- ALIGN (4)
+ .p2align 4
L(shr_15):
cmp $80, %rcx
lea -48(%rcx), %rcx
@@ -1298,7 +1294,7 @@ L(shr_15):
add %rcx, %rdi
jmp L(less48bytes)
- ALIGN (4)
+ .p2align 4
L(shr_15_gobble):
sub $32, %rcx
movdqa 16(%rsi), %xmm0
@@ -1348,7 +1344,7 @@ L(shr_15_gobble_next):
add %rcx, %rdi
jmp L(less48bytes)
# endif
- ALIGN (4)
+ .p2align 4
L(exit):
pmovmskb %xmm1, %r8d
sub $0xffff, %r8d
@@ -1389,56 +1385,56 @@ L(less16bytes):
sub %edx, %eax
ret
- ALIGN (4)
+ .p2align 4
L(Byte16):
movzbl -16(%rdi), %eax
movzbl -16(%rsi), %edx
sub %edx, %eax
ret
- ALIGN (4)
+ .p2align 4
L(Byte17):
movzbl -15(%rdi), %eax
movzbl -15(%rsi), %edx
sub %edx, %eax
ret
- ALIGN (4)
+ .p2align 4
L(Byte18):
movzbl -14(%rdi), %eax
movzbl -14(%rsi), %edx
sub %edx, %eax
ret
- ALIGN (4)
+ .p2align 4
L(Byte19):
movzbl -13(%rdi), %eax
movzbl -13(%rsi), %edx
sub %edx, %eax
ret
- ALIGN (4)
+ .p2align 4
L(Byte20):
movzbl -12(%rdi), %eax
movzbl -12(%rsi), %edx
sub %edx, %eax
ret
- ALIGN (4)
+ .p2align 4
L(Byte21):
movzbl -11(%rdi), %eax
movzbl -11(%rsi), %edx
sub %edx, %eax
ret
- ALIGN (4)
+ .p2align 4
L(Byte22):
movzbl -10(%rdi), %eax
movzbl -10(%rsi), %edx
sub %edx, %eax
ret
- ALIGN (4)
+ .p2align 4
L(next_24_bytes):
lea 8(%rdi), %rdi
lea 8(%rsi), %rsi
@@ -1463,10 +1459,8 @@ L(next_24_bytes):
test $0x40, %dh
jnz L(Byte22)
- mov -9(%rdi), %eax
- and $0xff, %eax
- mov -9(%rsi), %edx
- and $0xff, %edx
+ movzbl -9(%rdi), %eax
+ movzbl -9(%rsi), %edx
sub %edx, %eax
ret
# else
@@ -1481,14 +1475,14 @@ L(next_24_bytes):
jne L(find_diff)
ret
- ALIGN (4)
+ .p2align 4
L(second_double_word):
mov -12(%rdi), %eax
cmp -12(%rsi), %eax
jne L(find_diff)
ret
- ALIGN (4)
+ .p2align 4
L(next_two_double_words):
and $15, %dh
jz L(fourth_double_word)
@@ -1497,7 +1491,7 @@ L(next_two_double_words):
jne L(find_diff)
ret
- ALIGN (4)
+ .p2align 4
L(fourth_double_word):
mov -4(%rdi), %eax
cmp -4(%rsi), %eax
@@ -1505,7 +1499,7 @@ L(fourth_double_word):
ret
# endif
- ALIGN (4)
+ .p2align 4
L(less48bytes):
cmp $8, %ecx
jae L(more8bytes)
@@ -1529,7 +1523,7 @@ L(less48bytes):
jmp L(4bytes)
# endif
- ALIGN (4)
+ .p2align 4
L(more8bytes):
cmp $16, %ecx
jae L(more16bytes)
@@ -1553,7 +1547,7 @@ L(more8bytes):
jmp L(12bytes)
# endif
- ALIGN (4)
+ .p2align 4
L(more16bytes):
cmp $24, %ecx
jae L(more24bytes)
@@ -1577,7 +1571,7 @@ L(more16bytes):
jmp L(20bytes)
# endif
- ALIGN (4)
+ .p2align 4
L(more24bytes):
cmp $32, %ecx
jae L(more32bytes)
@@ -1601,7 +1595,7 @@ L(more24bytes):
jmp L(28bytes)
# endif
- ALIGN (4)
+ .p2align 4
L(more32bytes):
cmp $40, %ecx
jae L(more40bytes)
@@ -1625,7 +1619,7 @@ L(more32bytes):
jmp L(36bytes)
# endif
- ALIGN (4)
+ .p2align 4
L(more40bytes):
cmp $40, %ecx
je L(40bytes)
@@ -1644,7 +1638,7 @@ L(more40bytes):
je L(46bytes)
jmp L(47bytes)
- ALIGN (4)
+ .p2align 4
L(44bytes):
movl -44(%rdi), %eax
movl -44(%rsi), %ecx
@@ -1704,7 +1698,7 @@ L(0bytes):
xor %eax, %eax
ret
# else
- ALIGN (4)
+ .p2align 4
L(44bytes):
movl -44(%rdi), %eax
cmp -44(%rsi), %eax
@@ -1755,7 +1749,7 @@ L(0bytes):
# endif
# ifndef USE_AS_WMEMCMP
- ALIGN (4)
+ .p2align 4
L(45bytes):
movl -45(%rdi), %eax
movl -45(%rsi), %ecx
@@ -1818,7 +1812,7 @@ L(1bytes):
xor %eax, %eax
ret
- ALIGN (4)
+ .p2align 4
L(46bytes):
movl -46(%rdi), %eax
movl -46(%rsi), %ecx
@@ -1884,7 +1878,7 @@ L(2bytes):
xor %eax, %eax
ret
- ALIGN (4)
+ .p2align 4
L(47bytes):
movl -47(%rdi), %eax
movl -47(%rsi), %ecx
@@ -1953,7 +1947,7 @@ L(3bytes):
xor %eax, %eax
ret
- ALIGN (4)
+ .p2align 4
L(find_diff):
cmpb %cl, %al
jne L(set)
@@ -1975,19 +1969,19 @@ L(set):
# else
/* for wmemcmp */
- ALIGN (4)
+ .p2align 4
L(find_diff):
mov $1, %eax
jg L(find_diff_bigger)
neg %eax
ret
- ALIGN (4)
+ .p2align 4
L(find_diff_bigger):
ret
# endif
- ALIGN (4)
+ .p2align 4
L(equal):
xor %eax, %eax
ret
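
The mechanical change running through all of these files replaces each file's private ALIGN(n) wrapper with the .p2align directive it always expanded to; the argument is a power-of-two exponent, not a byte count. An illustration:

	/* ALIGN (4) -- old spelling, via the removed per-file macro */
	.p2align 4		/* same thing: pad to a 16-byte (2^4) boundary */
	.p2align 3		/* 8-byte boundary, as used for the jump tables */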
diff --git a/sysdeps/x86_64/multiarch/memcmp.S b/sysdeps/x86_64/multiarch/memcmp.S
index da88af248a..627d8d05cf 100644
--- a/sysdeps/x86_64/multiarch/memcmp.S
+++ b/sysdeps/x86_64/multiarch/memcmp.S
@@ -1,6 +1,6 @@
/* Multiple versions of memcmp
All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2010-2013 Free Software Foundation, Inc.
+ Copyright (C) 2010-2014 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
diff --git a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
index efdfea238f..07241b8e2b 100644
--- a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
@@ -1,5 +1,5 @@
/* memcpy with unaligned loads
- Copyright (C) 2013 Free Software Foundation, Inc.
+ Copyright (C) 2013-2014 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -20,10 +20,6 @@
#include "asm-syntax.h"
-#ifndef ALIGN
-# define ALIGN(n) .p2align n
-#endif
-
ENTRY(__memcpy_sse2_unaligned)
movq %rsi, %rax
@@ -44,7 +40,7 @@ L(return):
movq %rdi, %rax
ret
.p2align 4,,10
- ALIGN(4)
+ .p2align 4
.L31:
movdqu 16(%rsi), %xmm8
cmpq $64, %rdx
@@ -77,7 +73,7 @@ L(return):
leaq 32(%r10), %r8
leaq 48(%r10), %rax
.p2align 4,,10
- ALIGN(4)
+ .p2align 4
L(loop):
movdqu (%rcx,%r10), %xmm8
movdqa %xmm8, (%rcx)
@@ -151,7 +147,7 @@ L(less_16):
.L3:
leaq -1(%rdx), %rax
.p2align 4,,10
- ALIGN(4)
+ .p2align 4
.L11:
movzbl (%rsi,%rax), %edx
movb %dl, (%rdi,%rax)
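
__memcpy_sse2_unaligned tolerates any source alignment by pairing unaligned 16-byte loads (movdqu) with aligned stores (movdqa) once the destination pointer has been rounded up, as in L(loop) above. The same idea in a C sketch with SSE2 intrinsics (illustrative, not the glibc code):

    #include <emmintrin.h>
    #include <stddef.h>

    /* Assumes dst is already 16-byte aligned, as after the .S prologue.  */
    static void
    copy_fwd (char *dst, const char *src, size_t n)
    {
      size_t i = 0;
      for (; i + 16 <= n; i += 16)
	_mm_store_si128 ((__m128i *) (dst + i),		/* movdqa */
			 _mm_loadu_si128 ((const __m128i *) (src + i)));  /* movdqu */
      for (; i < n; i++)	/* 0-15 trailing bytes */
	dst[i] = src[i];
    }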
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
index fc9fcef27d..899ccbc34b 100644
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
@@ -1,5 +1,5 @@
/* memcpy with SSSE3 and REP string
- Copyright (C) 2010-2013 Free Software Foundation, Inc.
+ Copyright (C) 2010-2014 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
@@ -31,10 +31,6 @@
# define MEMCPY_CHK __memcpy_chk_ssse3_back
#endif
-#ifndef ALIGN
-# define ALIGN(n) .p2align n
-#endif
-
#define JMPTBL(I, B) I - B
/* Branch to an entry in a jump table. TABLE is a jump table with
@@ -87,7 +83,7 @@ L(bk_write):
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
#endif
- ALIGN (4)
+ .p2align 4
L(144bytesormore):
#ifndef USE_AS_MEMMOVE
@@ -119,7 +115,7 @@ L(144bytesormore):
jmp *%r9
ud2
- ALIGN (4)
+ .p2align 4
L(copy_backward):
#ifdef DATA_CACHE_SIZE
mov $DATA_CACHE_SIZE, %RCX_LP
@@ -149,7 +145,7 @@ L(copy_backward):
jmp *%r9
ud2
- ALIGN (4)
+ .p2align 4
L(shl_0):
mov %rdx, %r9
@@ -162,7 +158,7 @@ L(shl_0):
#endif
jae L(gobble_mem_fwd)
sub $0x80, %rdx
- ALIGN (4)
+ .p2align 4
L(shl_0_loop):
movdqa (%rsi), %xmm1
movdqa %xmm1, (%rdi)
@@ -190,7 +186,7 @@ L(shl_0_loop):
add %rdx, %rdi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_0_bwd):
sub $0x80, %rdx
L(copy_backward_loop):
@@ -221,7 +217,7 @@ L(copy_backward_loop):
sub %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_1):
sub $0x80, %rdx
movaps -0x01(%rsi), %xmm1
@@ -258,7 +254,7 @@ L(shl_1):
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_1_bwd):
movaps -0x01(%rsi), %xmm1
@@ -304,7 +300,7 @@ L(shl_1_bwd):
sub %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_2):
sub $0x80, %rdx
movaps -0x02(%rsi), %xmm1
@@ -341,7 +337,7 @@ L(shl_2):
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_2_bwd):
movaps -0x02(%rsi), %xmm1
@@ -387,7 +383,7 @@ L(shl_2_bwd):
sub %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_3):
sub $0x80, %rdx
movaps -0x03(%rsi), %xmm1
@@ -424,7 +420,7 @@ L(shl_3):
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_3_bwd):
movaps -0x03(%rsi), %xmm1
@@ -470,7 +466,7 @@ L(shl_3_bwd):
sub %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_4):
sub $0x80, %rdx
movaps -0x04(%rsi), %xmm1
@@ -507,7 +503,7 @@ L(shl_4):
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_4_bwd):
movaps -0x04(%rsi), %xmm1
@@ -553,7 +549,7 @@ L(shl_4_bwd):
sub %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_5):
sub $0x80, %rdx
movaps -0x05(%rsi), %xmm1
@@ -590,7 +586,7 @@ L(shl_5):
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_5_bwd):
movaps -0x05(%rsi), %xmm1
@@ -636,7 +632,7 @@ L(shl_5_bwd):
sub %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_6):
sub $0x80, %rdx
movaps -0x06(%rsi), %xmm1
@@ -673,7 +669,7 @@ L(shl_6):
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_6_bwd):
movaps -0x06(%rsi), %xmm1
@@ -719,7 +715,7 @@ L(shl_6_bwd):
sub %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_7):
sub $0x80, %rdx
movaps -0x07(%rsi), %xmm1
@@ -756,7 +752,7 @@ L(shl_7):
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_7_bwd):
movaps -0x07(%rsi), %xmm1
@@ -802,7 +798,7 @@ L(shl_7_bwd):
sub %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_8):
sub $0x80, %rdx
movaps -0x08(%rsi), %xmm1
@@ -839,7 +835,7 @@ L(shl_8):
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_8_bwd):
movaps -0x08(%rsi), %xmm1
@@ -886,7 +882,7 @@ L(shl_8_end_bwd):
sub %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_9):
sub $0x80, %rdx
movaps -0x09(%rsi), %xmm1
@@ -923,7 +919,7 @@ L(shl_9):
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_9_bwd):
movaps -0x09(%rsi), %xmm1
@@ -969,7 +965,7 @@ L(shl_9_bwd):
sub %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_10):
sub $0x80, %rdx
movaps -0x0a(%rsi), %xmm1
@@ -1006,7 +1002,7 @@ L(shl_10):
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_10_bwd):
movaps -0x0a(%rsi), %xmm1
@@ -1052,7 +1048,7 @@ L(shl_10_bwd):
sub %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_11):
sub $0x80, %rdx
movaps -0x0b(%rsi), %xmm1
@@ -1089,7 +1085,7 @@ L(shl_11):
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_11_bwd):
movaps -0x0b(%rsi), %xmm1
@@ -1135,7 +1131,7 @@ L(shl_11_bwd):
sub %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_12):
sub $0x80, %rdx
movdqa -0x0c(%rsi), %xmm1
@@ -1173,7 +1169,7 @@ L(shl_12):
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_12_bwd):
movaps -0x0c(%rsi), %xmm1
@@ -1219,7 +1215,7 @@ L(shl_12_bwd):
sub %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_13):
sub $0x80, %rdx
movaps -0x0d(%rsi), %xmm1
@@ -1256,7 +1252,7 @@ L(shl_13):
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_13_bwd):
movaps -0x0d(%rsi), %xmm1
@@ -1302,7 +1298,7 @@ L(shl_13_bwd):
sub %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_14):
sub $0x80, %rdx
movaps -0x0e(%rsi), %xmm1
@@ -1339,7 +1335,7 @@ L(shl_14):
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_14_bwd):
movaps -0x0e(%rsi), %xmm1
@@ -1385,7 +1381,7 @@ L(shl_14_bwd):
sub %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_15):
sub $0x80, %rdx
movaps -0x0f(%rsi), %xmm1
@@ -1422,7 +1418,7 @@ L(shl_15):
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_15_bwd):
movaps -0x0f(%rsi), %xmm1
@@ -1468,7 +1464,7 @@ L(shl_15_bwd):
sub %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(gobble_mem_fwd):
movdqu (%rsi), %xmm1
movdqu %xmm0, (%r8)
@@ -1570,7 +1566,7 @@ L(gobble_mem_fwd_end):
add %rdx, %rdi
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(gobble_mem_bwd):
add %rdx, %rsi
add %rdx, %rdi
@@ -2833,7 +2829,7 @@ L(bwd_write_1bytes):
END (MEMCPY)
.section .rodata.ssse3,"a",@progbits
- ALIGN (3)
+ .p2align 3
L(table_144_bytes_bwd):
.int JMPTBL (L(bwd_write_0bytes), L(table_144_bytes_bwd))
.int JMPTBL (L(bwd_write_1bytes), L(table_144_bytes_bwd))
@@ -2980,7 +2976,7 @@ L(table_144_bytes_bwd):
.int JMPTBL (L(bwd_write_142bytes), L(table_144_bytes_bwd))
.int JMPTBL (L(bwd_write_143bytes), L(table_144_bytes_bwd))
- ALIGN (3)
+ .p2align 3
L(table_144_bytes_fwd):
.int JMPTBL (L(fwd_write_0bytes), L(table_144_bytes_fwd))
.int JMPTBL (L(fwd_write_1bytes), L(table_144_bytes_fwd))
@@ -3127,7 +3123,7 @@ L(table_144_bytes_fwd):
.int JMPTBL (L(fwd_write_142bytes), L(table_144_bytes_fwd))
.int JMPTBL (L(fwd_write_143bytes), L(table_144_bytes_fwd))
- ALIGN (3)
+ .p2align 3
L(shl_table_fwd):
.int JMPTBL (L(shl_0), L(shl_table_fwd))
.int JMPTBL (L(shl_1), L(shl_table_fwd))
@@ -3146,7 +3142,7 @@ L(shl_table_fwd):
.int JMPTBL (L(shl_14), L(shl_table_fwd))
.int JMPTBL (L(shl_15), L(shl_table_fwd))
- ALIGN (3)
+ .p2align 3
L(shl_table_bwd):
.int JMPTBL (L(shl_0_bwd), L(shl_table_bwd))
.int JMPTBL (L(shl_1_bwd), L(shl_table_bwd))
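
The L(shl_1) ... L(shl_15) cases serve a source that sits N bytes past a 16-byte boundary: every load stays aligned, and palignr stitches two adjacent blocks back into one correctly shifted store. One iteration sketched for N = 1 (illustrative registers; %rsi is the unaligned source):

	movaps	-0x01(%rsi), %xmm1	/* aligned; holds src[-1..14] */
	movaps	0x0f(%rsi), %xmm2	/* aligned; holds src[15..30] */
	palignr	$1, %xmm1, %xmm2	/* %xmm2 = src[0..15], realigned */
	movaps	%xmm2, (%rdi)		/* one aligned 16-byte store */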
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
index 9642ceecd9..0ad9a0008a 100644
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
@@ -1,5 +1,5 @@
/* memcpy with SSSE3
- Copyright (C) 2010-2013 Free Software Foundation, Inc.
+ Copyright (C) 2010-2014 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
@@ -31,10 +31,6 @@
# define MEMCPY_CHK __memcpy_chk_ssse3
#endif
-#ifndef ALIGN
-# define ALIGN(n) .p2align n
-#endif
-
#define JMPTBL(I, B) I - B
/* Branch to an entry in a jump table. TABLE is a jump table with
@@ -80,7 +76,7 @@ L(copy_forward):
jmp *%r9
ud2
- ALIGN (4)
+ .p2align 4
L(80bytesormore):
#ifndef USE_AS_MEMMOVE
cmp %dil, %sil
@@ -113,7 +109,7 @@ L(80bytesormore):
#endif
BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %r9, 4)
- ALIGN (4)
+ .p2align 4
L(copy_backward):
movdqu -16(%rsi, %rdx), %xmm0
add %rdx, %rsi
@@ -144,7 +140,7 @@ L(copy_backward):
#endif
BRANCH_TO_JMPTBL_ENTRY (L(shl_table_bwd), %r9, 4)
- ALIGN (4)
+ .p2align 4
L(shl_0):
sub $16, %rdx
movdqa (%rsi), %xmm1
@@ -172,7 +168,7 @@ L(shl_0_less_64bytes):
add %rdx, %rdi
BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_0_gobble):
#ifdef DATA_CACHE_SIZE_HALF
cmp $DATA_CACHE_SIZE_HALF, %RDX_LP
@@ -228,7 +224,7 @@ L(shl_0_cache_less_64bytes):
add %rdx, %rdi
BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_0_gobble_mem_loop):
prefetcht0 0x1c0(%rsi)
prefetcht0 0x280(%rsi)
@@ -287,7 +283,7 @@ L(shl_0_mem_less_32bytes):
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_0_bwd):
sub $16, %rdx
movdqa -0x10(%rsi), %xmm1
@@ -313,7 +309,7 @@ L(shl_0_bwd):
L(shl_0_less_64bytes_bwd):
BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_0_gobble_bwd):
#ifdef DATA_CACHE_SIZE_HALF
cmp $DATA_CACHE_SIZE_HALF, %RDX_LP
@@ -367,7 +363,7 @@ L(shl_0_gobble_bwd_loop):
L(shl_0_gobble_bwd_less_64bytes):
BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_0_gobble_mem_bwd_loop):
prefetcht0 -0x1c0(%rsi)
prefetcht0 -0x280(%rsi)
@@ -423,7 +419,7 @@ L(shl_0_mem_bwd_less_64bytes):
L(shl_0_mem_bwd_less_32bytes):
BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_1):
lea (L(shl_1_loop_L1)-L(shl_1))(%r9), %r9
cmp %rcx, %rdx
@@ -466,7 +462,7 @@ L(shl_1_end):
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_1_bwd):
lea (L(shl_1_bwd_loop_L1)-L(shl_1_bwd))(%r9), %r9
cmp %rcx, %rdx
@@ -508,7 +504,7 @@ L(shl_1_bwd_end):
movdqu %xmm0, (%r8)
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_2):
lea (L(shl_2_loop_L1)-L(shl_2))(%r9), %r9
cmp %rcx, %rdx
@@ -551,7 +547,7 @@ L(shl_2_end):
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_2_bwd):
lea (L(shl_2_bwd_loop_L1)-L(shl_2_bwd))(%r9), %r9
cmp %rcx, %rdx
@@ -593,7 +589,7 @@ L(shl_2_bwd_end):
movdqu %xmm0, (%r8)
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_3):
lea (L(shl_3_loop_L1)-L(shl_3))(%r9), %r9
cmp %rcx, %rdx
@@ -636,7 +632,7 @@ L(shl_3_end):
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_3_bwd):
lea (L(shl_3_bwd_loop_L1)-L(shl_3_bwd))(%r9), %r9
cmp %rcx, %rdx
@@ -678,7 +674,7 @@ L(shl_3_bwd_end):
movdqu %xmm0, (%r8)
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_4):
lea (L(shl_4_loop_L1)-L(shl_4))(%r9), %r9
cmp %rcx, %rdx
@@ -721,7 +717,7 @@ L(shl_4_end):
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_4_bwd):
lea (L(shl_4_bwd_loop_L1)-L(shl_4_bwd))(%r9), %r9
cmp %rcx, %rdx
@@ -763,7 +759,7 @@ L(shl_4_bwd_end):
movdqu %xmm0, (%r8)
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_5):
lea (L(shl_5_loop_L1)-L(shl_5))(%r9), %r9
cmp %rcx, %rdx
@@ -806,7 +802,7 @@ L(shl_5_end):
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_5_bwd):
lea (L(shl_5_bwd_loop_L1)-L(shl_5_bwd))(%r9), %r9
cmp %rcx, %rdx
@@ -848,7 +844,7 @@ L(shl_5_bwd_end):
movdqu %xmm0, (%r8)
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_6):
lea (L(shl_6_loop_L1)-L(shl_6))(%r9), %r9
cmp %rcx, %rdx
@@ -891,7 +887,7 @@ L(shl_6_end):
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_6_bwd):
lea (L(shl_6_bwd_loop_L1)-L(shl_6_bwd))(%r9), %r9
cmp %rcx, %rdx
@@ -933,7 +929,7 @@ L(shl_6_bwd_end):
movdqu %xmm0, (%r8)
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_7):
lea (L(shl_7_loop_L1)-L(shl_7))(%r9), %r9
cmp %rcx, %rdx
@@ -976,7 +972,7 @@ L(shl_7_end):
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_7_bwd):
lea (L(shl_7_bwd_loop_L1)-L(shl_7_bwd))(%r9), %r9
cmp %rcx, %rdx
@@ -1018,7 +1014,7 @@ L(shl_7_bwd_end):
movdqu %xmm0, (%r8)
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_8):
lea (L(shl_8_loop_L1)-L(shl_8))(%r9), %r9
cmp %rcx, %rdx
@@ -1051,7 +1047,7 @@ L(shl_8_loop_L1):
movaps %xmm5, -0x10(%rdi)
jmp *%r9
ud2
- ALIGN (4)
+ .p2align 4
L(shl_8_end):
lea 64(%rdx), %rdx
movaps %xmm4, -0x20(%rdi)
@@ -1061,7 +1057,7 @@ L(shl_8_end):
movdqu %xmm0, (%r8)
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_8_bwd):
lea (L(shl_8_bwd_loop_L1)-L(shl_8_bwd))(%r9), %r9
cmp %rcx, %rdx
@@ -1103,7 +1099,7 @@ L(shl_8_bwd_end):
movdqu %xmm0, (%r8)
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_9):
lea (L(shl_9_loop_L1)-L(shl_9))(%r9), %r9
cmp %rcx, %rdx
@@ -1146,7 +1142,7 @@ L(shl_9_end):
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_9_bwd):
lea (L(shl_9_bwd_loop_L1)-L(shl_9_bwd))(%r9), %r9
cmp %rcx, %rdx
@@ -1188,7 +1184,7 @@ L(shl_9_bwd_end):
movdqu %xmm0, (%r8)
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_10):
lea (L(shl_10_loop_L1)-L(shl_10))(%r9), %r9
cmp %rcx, %rdx
@@ -1231,7 +1227,7 @@ L(shl_10_end):
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_10_bwd):
lea (L(shl_10_bwd_loop_L1)-L(shl_10_bwd))(%r9), %r9
cmp %rcx, %rdx
@@ -1273,7 +1269,7 @@ L(shl_10_bwd_end):
movdqu %xmm0, (%r8)
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_11):
lea (L(shl_11_loop_L1)-L(shl_11))(%r9), %r9
cmp %rcx, %rdx
@@ -1316,7 +1312,7 @@ L(shl_11_end):
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_11_bwd):
lea (L(shl_11_bwd_loop_L1)-L(shl_11_bwd))(%r9), %r9
cmp %rcx, %rdx
@@ -1358,7 +1354,7 @@ L(shl_11_bwd_end):
movdqu %xmm0, (%r8)
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_12):
lea (L(shl_12_loop_L1)-L(shl_12))(%r9), %r9
cmp %rcx, %rdx
@@ -1401,7 +1397,7 @@ L(shl_12_end):
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_12_bwd):
lea (L(shl_12_bwd_loop_L1)-L(shl_12_bwd))(%r9), %r9
cmp %rcx, %rdx
@@ -1443,7 +1439,7 @@ L(shl_12_bwd_end):
movdqu %xmm0, (%r8)
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_13):
lea (L(shl_13_loop_L1)-L(shl_13))(%r9), %r9
cmp %rcx, %rdx
@@ -1486,7 +1482,7 @@ L(shl_13_end):
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_13_bwd):
lea (L(shl_13_bwd_loop_L1)-L(shl_13_bwd))(%r9), %r9
cmp %rcx, %rdx
@@ -1528,7 +1524,7 @@ L(shl_13_bwd_end):
movdqu %xmm0, (%r8)
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_14):
lea (L(shl_14_loop_L1)-L(shl_14))(%r9), %r9
cmp %rcx, %rdx
@@ -1571,7 +1567,7 @@ L(shl_14_end):
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_14_bwd):
lea (L(shl_14_bwd_loop_L1)-L(shl_14_bwd))(%r9), %r9
cmp %rcx, %rdx
@@ -1613,7 +1609,7 @@ L(shl_14_bwd_end):
movdqu %xmm0, (%r8)
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_15):
lea (L(shl_15_loop_L1)-L(shl_15))(%r9), %r9
cmp %rcx, %rdx
@@ -1656,7 +1652,7 @@ L(shl_15_end):
add %rdx, %rsi
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(shl_15_bwd):
lea (L(shl_15_bwd_loop_L1)-L(shl_15_bwd))(%r9), %r9
cmp %rcx, %rdx
@@ -1698,7 +1694,7 @@ L(shl_15_bwd_end):
movdqu %xmm0, (%r8)
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
- ALIGN (4)
+ .p2align 4
L(write_72bytes):
movdqu -72(%rsi), %xmm0
movdqu -56(%rsi), %xmm1
@@ -1716,7 +1712,7 @@ L(write_72bytes):
mov %rcx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_64bytes):
movdqu -64(%rsi), %xmm0
mov -48(%rsi), %rcx
@@ -1734,7 +1730,7 @@ L(write_64bytes):
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_56bytes):
movdqu -56(%rsi), %xmm0
mov -40(%rsi), %r8
@@ -1750,7 +1746,7 @@ L(write_56bytes):
mov %rcx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_48bytes):
mov -48(%rsi), %rcx
mov -40(%rsi), %r8
@@ -1766,7 +1762,7 @@ L(write_48bytes):
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_40bytes):
mov -40(%rsi), %r8
mov -32(%rsi), %r9
@@ -1780,7 +1776,7 @@ L(write_40bytes):
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_32bytes):
mov -32(%rsi), %r9
mov -24(%rsi), %r10
@@ -1792,7 +1788,7 @@ L(write_32bytes):
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_24bytes):
mov -24(%rsi), %r10
mov -16(%rsi), %r11
@@ -1802,7 +1798,7 @@ L(write_24bytes):
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_16bytes):
mov -16(%rsi), %r11
mov -8(%rsi), %rdx
@@ -1810,14 +1806,14 @@ L(write_16bytes):
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_8bytes):
mov -8(%rsi), %rdx
mov %rdx, -8(%rdi)
L(write_0bytes):
ret
- ALIGN (4)
+ .p2align 4
L(write_73bytes):
movdqu -73(%rsi), %xmm0
movdqu -57(%rsi), %xmm1
@@ -1837,7 +1833,7 @@ L(write_73bytes):
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_65bytes):
movdqu -65(%rsi), %xmm0
movdqu -49(%rsi), %xmm1
@@ -1855,7 +1851,7 @@ L(write_65bytes):
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_57bytes):
movdqu -57(%rsi), %xmm0
mov -41(%rsi), %r8
@@ -1873,7 +1869,7 @@ L(write_57bytes):
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_49bytes):
movdqu -49(%rsi), %xmm0
mov -33(%rsi), %r9
@@ -1889,7 +1885,7 @@ L(write_49bytes):
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_41bytes):
mov -41(%rsi), %r8
mov -33(%rsi), %r9
@@ -1905,7 +1901,7 @@ L(write_41bytes):
mov %dl, -1(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_33bytes):
mov -33(%rsi), %r9
mov -25(%rsi), %r10
@@ -1919,7 +1915,7 @@ L(write_33bytes):
mov %dl, -1(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_25bytes):
mov -25(%rsi), %r10
mov -17(%rsi), %r11
@@ -1931,7 +1927,7 @@ L(write_25bytes):
mov %dl, -1(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_17bytes):
mov -17(%rsi), %r11
mov -9(%rsi), %rcx
@@ -1941,7 +1937,7 @@ L(write_17bytes):
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_9bytes):
mov -9(%rsi), %rcx
mov -4(%rsi), %edx
@@ -1949,13 +1945,13 @@ L(write_9bytes):
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_1bytes):
mov -1(%rsi), %dl
mov %dl, -1(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_74bytes):
movdqu -74(%rsi), %xmm0
movdqu -58(%rsi), %xmm1
@@ -1975,7 +1971,7 @@ L(write_74bytes):
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_66bytes):
movdqu -66(%rsi), %xmm0
movdqu -50(%rsi), %xmm1
@@ -1995,7 +1991,7 @@ L(write_66bytes):
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_58bytes):
movdqu -58(%rsi), %xmm1
mov -42(%rsi), %r8
@@ -2013,7 +2009,7 @@ L(write_58bytes):
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_50bytes):
movdqu -50(%rsi), %xmm0
mov -34(%rsi), %r9
@@ -2029,7 +2025,7 @@ L(write_50bytes):
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_42bytes):
mov -42(%rsi), %r8
mov -34(%rsi), %r9
@@ -2045,7 +2041,7 @@ L(write_42bytes):
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_34bytes):
mov -34(%rsi), %r9
mov -26(%rsi), %r10
@@ -2059,7 +2055,7 @@ L(write_34bytes):
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_26bytes):
mov -26(%rsi), %r10
mov -18(%rsi), %r11
@@ -2071,7 +2067,7 @@ L(write_26bytes):
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_18bytes):
mov -18(%rsi), %r11
mov -10(%rsi), %rcx
@@ -2081,7 +2077,7 @@ L(write_18bytes):
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_10bytes):
mov -10(%rsi), %rcx
mov -4(%rsi), %edx
@@ -2089,13 +2085,13 @@ L(write_10bytes):
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_2bytes):
mov -2(%rsi), %dx
mov %dx, -2(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_75bytes):
movdqu -75(%rsi), %xmm0
movdqu -59(%rsi), %xmm1
@@ -2115,7 +2111,7 @@ L(write_75bytes):
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_67bytes):
movdqu -67(%rsi), %xmm0
movdqu -59(%rsi), %xmm1
@@ -2135,7 +2131,7 @@ L(write_67bytes):
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_59bytes):
movdqu -59(%rsi), %xmm0
mov -43(%rsi), %r8
@@ -2153,7 +2149,7 @@ L(write_59bytes):
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_51bytes):
movdqu -51(%rsi), %xmm0
mov -35(%rsi), %r9
@@ -2169,7 +2165,7 @@ L(write_51bytes):
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_43bytes):
mov -43(%rsi), %r8
mov -35(%rsi), %r9
@@ -2185,7 +2181,7 @@ L(write_43bytes):
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_35bytes):
mov -35(%rsi), %r9
mov -27(%rsi), %r10
@@ -2199,7 +2195,7 @@ L(write_35bytes):
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_27bytes):
mov -27(%rsi), %r10
mov -19(%rsi), %r11
@@ -2211,7 +2207,7 @@ L(write_27bytes):
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_19bytes):
mov -19(%rsi), %r11
mov -11(%rsi), %rcx
@@ -2221,7 +2217,7 @@ L(write_19bytes):
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_11bytes):
mov -11(%rsi), %rcx
mov -4(%rsi), %edx
@@ -2229,7 +2225,7 @@ L(write_11bytes):
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_3bytes):
mov -3(%rsi), %dx
mov -2(%rsi), %cx
@@ -2237,7 +2233,7 @@ L(write_3bytes):
mov %cx, -2(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_76bytes):
movdqu -76(%rsi), %xmm0
movdqu -60(%rsi), %xmm1
@@ -2257,7 +2253,7 @@ L(write_76bytes):
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_68bytes):
movdqu -68(%rsi), %xmm0
movdqu -52(%rsi), %xmm1
@@ -2275,7 +2271,7 @@ L(write_68bytes):
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_60bytes):
movdqu -60(%rsi), %xmm0
mov -44(%rsi), %r8
@@ -2293,7 +2289,7 @@ L(write_60bytes):
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_52bytes):
movdqu -52(%rsi), %xmm0
mov -36(%rsi), %r9
@@ -2309,7 +2305,7 @@ L(write_52bytes):
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_44bytes):
mov -44(%rsi), %r8
mov -36(%rsi), %r9
@@ -2325,7 +2321,7 @@ L(write_44bytes):
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_36bytes):
mov -36(%rsi), %r9
mov -28(%rsi), %r10
@@ -2339,7 +2335,7 @@ L(write_36bytes):
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_28bytes):
mov -28(%rsi), %r10
mov -20(%rsi), %r11
@@ -2351,7 +2347,7 @@ L(write_28bytes):
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_20bytes):
mov -20(%rsi), %r11
mov -12(%rsi), %rcx
@@ -2361,7 +2357,7 @@ L(write_20bytes):
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_12bytes):
mov -12(%rsi), %rcx
mov -4(%rsi), %edx
@@ -2369,13 +2365,13 @@ L(write_12bytes):
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_4bytes):
mov -4(%rsi), %edx
mov %edx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_77bytes):
movdqu -77(%rsi), %xmm0
movdqu -61(%rsi), %xmm1
@@ -2395,7 +2391,7 @@ L(write_77bytes):
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_69bytes):
movdqu -69(%rsi), %xmm0
movdqu -53(%rsi), %xmm1
@@ -2413,7 +2409,7 @@ L(write_69bytes):
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_61bytes):
movdqu -61(%rsi), %xmm0
mov -45(%rsi), %r8
@@ -2431,7 +2427,7 @@ L(write_61bytes):
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_53bytes):
movdqu -53(%rsi), %xmm0
mov -45(%rsi), %r8
@@ -2448,7 +2444,7 @@ L(write_53bytes):
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_45bytes):
mov -45(%rsi), %r8
mov -37(%rsi), %r9
@@ -2464,7 +2460,7 @@ L(write_45bytes):
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_37bytes):
mov -37(%rsi), %r9
mov -29(%rsi), %r10
@@ -2478,7 +2474,7 @@ L(write_37bytes):
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_29bytes):
mov -29(%rsi), %r10
mov -21(%rsi), %r11
@@ -2490,7 +2486,7 @@ L(write_29bytes):
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_21bytes):
mov -21(%rsi), %r11
mov -13(%rsi), %rcx
@@ -2500,7 +2496,7 @@ L(write_21bytes):
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_13bytes):
mov -13(%rsi), %rcx
mov -8(%rsi), %rdx
@@ -2508,7 +2504,7 @@ L(write_13bytes):
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_5bytes):
mov -5(%rsi), %edx
mov -4(%rsi), %ecx
@@ -2516,7 +2512,7 @@ L(write_5bytes):
mov %ecx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_78bytes):
movdqu -78(%rsi), %xmm0
movdqu -62(%rsi), %xmm1
@@ -2536,7 +2532,7 @@ L(write_78bytes):
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_70bytes):
movdqu -70(%rsi), %xmm0
movdqu -54(%rsi), %xmm1
@@ -2554,7 +2550,7 @@ L(write_70bytes):
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_62bytes):
movdqu -62(%rsi), %xmm0
mov -46(%rsi), %r8
@@ -2572,7 +2568,7 @@ L(write_62bytes):
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_54bytes):
movdqu -54(%rsi), %xmm0
mov -38(%rsi), %r9
@@ -2588,7 +2584,7 @@ L(write_54bytes):
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_46bytes):
mov -46(%rsi), %r8
mov -38(%rsi), %r9
@@ -2604,7 +2600,7 @@ L(write_46bytes):
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_38bytes):
mov -38(%rsi), %r9
mov -30(%rsi), %r10
@@ -2618,7 +2614,7 @@ L(write_38bytes):
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_30bytes):
mov -30(%rsi), %r10
mov -22(%rsi), %r11
@@ -2630,7 +2626,7 @@ L(write_30bytes):
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_22bytes):
mov -22(%rsi), %r11
mov -14(%rsi), %rcx
@@ -2640,7 +2636,7 @@ L(write_22bytes):
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_14bytes):
mov -14(%rsi), %rcx
mov -8(%rsi), %rdx
@@ -2648,7 +2644,7 @@ L(write_14bytes):
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_6bytes):
mov -6(%rsi), %edx
mov -4(%rsi), %ecx
@@ -2656,7 +2652,7 @@ L(write_6bytes):
mov %ecx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_79bytes):
movdqu -79(%rsi), %xmm0
movdqu -63(%rsi), %xmm1
@@ -2676,7 +2672,7 @@ L(write_79bytes):
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_71bytes):
movdqu -71(%rsi), %xmm0
movdqu -55(%rsi), %xmm1
@@ -2694,7 +2690,7 @@ L(write_71bytes):
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_63bytes):
movdqu -63(%rsi), %xmm0
mov -47(%rsi), %r8
@@ -2712,7 +2708,7 @@ L(write_63bytes):
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_55bytes):
movdqu -55(%rsi), %xmm0
mov -39(%rsi), %r9
@@ -2728,7 +2724,7 @@ L(write_55bytes):
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_47bytes):
mov -47(%rsi), %r8
mov -39(%rsi), %r9
@@ -2744,7 +2740,7 @@ L(write_47bytes):
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_39bytes):
mov -39(%rsi), %r9
mov -31(%rsi), %r10
@@ -2758,7 +2754,7 @@ L(write_39bytes):
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_31bytes):
mov -31(%rsi), %r10
mov -23(%rsi), %r11
@@ -2770,7 +2766,7 @@ L(write_31bytes):
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_23bytes):
mov -23(%rsi), %r11
mov -15(%rsi), %rcx
@@ -2780,7 +2776,7 @@ L(write_23bytes):
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_15bytes):
mov -15(%rsi), %rcx
mov -8(%rsi), %rdx
@@ -2788,7 +2784,7 @@ L(write_15bytes):
mov %rdx, -8(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(write_7bytes):
mov -7(%rsi), %edx
mov -4(%rsi), %ecx
@@ -2796,7 +2792,7 @@ L(write_7bytes):
mov %ecx, -4(%rdi)
ret
- ALIGN (4)
+ .p2align 4
L(large_page_fwd):
movdqu (%rsi), %xmm1
lea 16(%rsi), %rsi
@@ -2859,7 +2855,7 @@ L(large_page_less_64bytes):
BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
#ifdef USE_AS_MEMMOVE
- ALIGN (4)
+ .p2align 4
L(ll_cache_copy_fwd_start):
prefetcht0 0x1c0(%rsi)
prefetcht0 0x200(%rsi)
@@ -2906,7 +2902,7 @@ L(large_page_ll_less_fwd_64bytes):
BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
#endif
- ALIGN (4)
+ .p2align 4
L(large_page_bwd):
movdqu -0x10(%rsi), %xmm1
lea -16(%rsi), %rsi
@@ -2966,7 +2962,7 @@ L(large_page_less_bwd_64bytes):
BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
#ifdef USE_AS_MEMMOVE
- ALIGN (4)
+ .p2align 4
L(ll_cache_copy_bwd_start):
prefetcht0 -0x1c0(%rsi)
prefetcht0 -0x200(%rsi)
@@ -3014,7 +3010,7 @@ L(large_page_ll_less_bwd_64bytes):
END (MEMCPY)
.section .rodata.ssse3,"a",@progbits
- ALIGN (3)
+ .p2align 3
L(table_less_80bytes):
.int JMPTBL (L(write_0bytes), L(table_less_80bytes))
.int JMPTBL (L(write_1bytes), L(table_less_80bytes))
@@ -3097,7 +3093,7 @@ L(table_less_80bytes):
.int JMPTBL (L(write_78bytes), L(table_less_80bytes))
.int JMPTBL (L(write_79bytes), L(table_less_80bytes))
- ALIGN (3)
+ .p2align 3
L(shl_table):
.int JMPTBL (L(shl_0), L(shl_table))
.int JMPTBL (L(shl_1), L(shl_table))
@@ -3116,7 +3112,7 @@ L(shl_table):
.int JMPTBL (L(shl_14), L(shl_table))
.int JMPTBL (L(shl_15), L(shl_table))
- ALIGN (3)
+ .p2align 3
L(shl_table_bwd):
.int JMPTBL (L(shl_0_bwd), L(shl_table_bwd))
.int JMPTBL (L(shl_1_bwd), L(shl_table_bwd))
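The jump tables above hold 32-bit offsets relative to the table itself: BRANCH_TO_JMPTBL_ENTRY loads an entry, adds the table's address, and jumps, so one indirect branch replaces a compare ladder while staying position-independent. A minimal C restatement of the dispatch idea, using the GNU computed-goto extension (the function name and the tiny 0-3 byte table are illustrative, not from this file):

#include <stddef.h>

/* Sketch: dispatch a short tail copy through a table of label
   addresses.  The assembly stores table-relative 32-bit offsets for
   position independence; &&label is the closest C analogue.  */
static void
copy_tail (char *dst, const char *src, size_t len)  /* len <= 3 */
{
  static void *const table[] = { &&w0, &&w1, &&w2, &&w3 };
  goto *table[len];
 w3: dst[2] = src[2];  /* fall through, copying from the end down */
 w2: dst[1] = src[1];
 w1: dst[0] = src[0];
 w0: return;
}

L(table_less_80bytes) indexes every residual length from 0 to 79 the same way; the sketch keeps four entries only to show the shape.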
diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S
index a1e5031376..40ae926386 100644
--- a/sysdeps/x86_64/multiarch/memcpy.S
+++ b/sysdeps/x86_64/multiarch/memcpy.S
@@ -1,6 +1,6 @@
/* Multiple versions of memcpy
All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2010-2013 Free Software Foundation, Inc.
+ Copyright (C) 2010-2014 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
diff --git a/sysdeps/x86_64/multiarch/memcpy_chk.S b/sysdeps/x86_64/multiarch/memcpy_chk.S
index ad01d8cd9f..3c0270fd23 100644
--- a/sysdeps/x86_64/multiarch/memcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/memcpy_chk.S
@@ -1,6 +1,6 @@
/* Multiple versions of __memcpy_chk
All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2010-2013 Free Software Foundation, Inc.
+ Copyright (C) 2010-2014 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
diff --git a/sysdeps/x86_64/multiarch/memmove.c b/sysdeps/x86_64/multiarch/memmove.c
index 8149c487d5..ba86e7bbb1 100644
--- a/sysdeps/x86_64/multiarch/memmove.c
+++ b/sysdeps/x86_64/multiarch/memmove.c
@@ -1,6 +1,6 @@
/* Multiple versions of memmove.
All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2010-2013 Free Software Foundation, Inc.
+ Copyright (C) 2010-2014 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/multiarch/memmove_chk.c b/sysdeps/x86_64/multiarch/memmove_chk.c
index 17ed460324..cb1acb6598 100644
--- a/sysdeps/x86_64/multiarch/memmove_chk.c
+++ b/sysdeps/x86_64/multiarch/memmove_chk.c
@@ -1,6 +1,6 @@
/* Multiple versions of __memmove_chk.
All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2010-2013 Free Software Foundation, Inc.
+ Copyright (C) 2010-2014 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/multiarch/mempcpy.S b/sysdeps/x86_64/multiarch/mempcpy.S
index b8b7fcd121..b9f04c2ec4 100644
--- a/sysdeps/x86_64/multiarch/mempcpy.S
+++ b/sysdeps/x86_64/multiarch/mempcpy.S
@@ -1,6 +1,6 @@
/* Multiple versions of mempcpy
All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2010-2013 Free Software Foundation, Inc.
+ Copyright (C) 2010-2014 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
diff --git a/sysdeps/x86_64/multiarch/mempcpy_chk.S b/sysdeps/x86_64/multiarch/mempcpy_chk.S
index 3801db399b..c28473a669 100644
--- a/sysdeps/x86_64/multiarch/mempcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/mempcpy_chk.S
@@ -1,6 +1,6 @@
/* Multiple versions of __mempcpy_chk
All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2010-2013 Free Software Foundation, Inc.
+ Copyright (C) 2010-2014 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
diff --git a/sysdeps/x86_64/multiarch/rawmemchr.S b/sysdeps/x86_64/multiarch/rawmemchr.S
deleted file mode 100644
index 50de38ffb5..0000000000
--- a/sysdeps/x86_64/multiarch/rawmemchr.S
+++ /dev/null
@@ -1,103 +0,0 @@
-/* Multiple versions of rawmemchr
- All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2009-2013 Free Software Foundation, Inc.
- Contributed by Ulrich Drepper <drepper@redhat.com>.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-
-/* Define multiple versions only for the definition in libc. */
-#ifndef NOT_IN_libc
- .text
-ENTRY(rawmemchr)
- .type rawmemchr, @gnu_indirect_function
- cmpl $0, __cpu_features+KIND_OFFSET(%rip)
- jne 1f
- call __init_cpu_features
-1: testl $bit_Prefer_PMINUB_for_stringop, __cpu_features+FEATURE_OFFSET+index_Prefer_PMINUB_for_stringop(%rip)
- jnz 2f
- testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
- jz 2f
- leaq __rawmemchr_sse42(%rip), %rax
- ret
-2: leaq __rawmemchr_sse2(%rip), %rax
- ret
-
-END(rawmemchr)
-strong_alias (rawmemchr, __rawmemchr)
-
-
- .section .text.sse4.2,"ax",@progbits
- .align 16
- .type __rawmemchr_sse42, @function
- .globl __rawmemchr_sse42
- .hidden __rawmemchr_sse42
-__rawmemchr_sse42:
- cfi_startproc
- CALL_MCOUNT
- movd %esi, %xmm1
- movq %rdi, %rcx
- pxor %xmm2, %xmm2
- andq $~15, %rdi
- orl $0xffffffff, %esi
- pshufb %xmm2, %xmm1
- movdqa (%rdi), %xmm0
- subq %rdi, %rcx
- pcmpeqb %xmm1, %xmm0
- shl %cl, %esi
- pmovmskb %xmm0, %ecx
- movl $16, %eax
- movl $16, %edx
- andl %esi, %ecx
- jnz 1f
-
-2: pcmpestri $0x08, 16(%rdi), %xmm1
- leaq 16(%rdi), %rdi
- jnc 2b
-
- leaq (%rdi,%rcx), %rax
- ret
-
-1: bsfl %ecx, %eax
- addq %rdi, %rax
- ret
- cfi_endproc
- .size __rawmemchr_sse42, .-__rawmemchr_sse42
-
-
-# undef ENTRY
-# define ENTRY(name) \
- .type __rawmemchr_sse2, @function; \
- .align 16; \
- .globl __rawmemchr_sse2; \
- .hidden __rawmemchr_sse2; \
- __rawmemchr_sse2: cfi_startproc; \
- CALL_MCOUNT
-# undef END
-# define END(name) \
- cfi_endproc; .size __rawmemchr_sse2, .-__rawmemchr_sse2
-# undef libc_hidden_builtin_def
-/* It doesn't make sense to send libc-internal rawmemchr calls through a PLT.
- The speedup we get from using SSE4.2 instructions is likely eaten away
- by the indirect call in the PLT. */
-# define libc_hidden_builtin_def(name) \
- .globl __GI___rawmemchr; __GI___rawmemchr = __rawmemchr_sse2
-#endif
-
-#include "../rawmemchr.S"
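After this change the plain sysdeps/x86_64 rawmemchr is built directly; the deleted selector had picked the pcmpestri-based SSE4.2 code only when SSE4.2 was available and Prefer_PMINUB_for_stringop was unset. For reference, a sketch of the SSE2-style core such a rawmemchr relies on (not the glibc code itself): align down to 16 bytes, mask off the lanes before the start, then test a full chunk per iteration — rawmemchr may assume the byte is present, so no NUL check is needed.

#include <emmintrin.h>
#include <stdint.h>

static const char *
rawmemchr_sse2_sketch (const char *s, int c)
{
  __m128i needle = _mm_set1_epi8 ((char) c);
  uintptr_t misalign = (uintptr_t) s & 15;
  const char *p = s - misalign;           /* aligned, may precede S */
  __m128i chunk = _mm_load_si128 ((const __m128i *) p);
  unsigned int mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (chunk, needle));
  mask &= ~0u << misalign;                /* ignore bytes before S */
  while (mask == 0)
    {
      p += 16;
      chunk = _mm_load_si128 ((const __m128i *) p);
      mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (chunk, needle));
    }
  return p + __builtin_ctz (mask);
}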
diff --git a/sysdeps/x86_64/multiarch/sched_cpucount.c b/sysdeps/x86_64/multiarch/sched_cpucount.c
index cd127cdc69..68a043a169 100644
--- a/sysdeps/x86_64/multiarch/sched_cpucount.c
+++ b/sysdeps/x86_64/multiarch/sched_cpucount.c
@@ -1,6 +1,6 @@
/* Count bits in CPU set. x86-64 multi-arch version.
This file is part of the GNU C Library.
- Copyright (C) 2008-2013 Free Software Foundation, Inc.
+ Copyright (C) 2008-2014 Free Software Foundation, Inc.
Contributed by Ulrich Drepper <drepper@redhat.com>.
The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/multiarch/strcasestr-c.c b/sysdeps/x86_64/multiarch/strcasestr-c.c
deleted file mode 100644
index c13a4c44f3..0000000000
--- a/sysdeps/x86_64/multiarch/strcasestr-c.c
+++ /dev/null
@@ -1,19 +0,0 @@
-/* Multiple versions of strcasestr
- All versions must be listed in ifunc-impl-list.c. */
-
-#include "init-arch.h"
-
-#define STRCASESTR __strcasestr_sse2
-
-#include "string/strcasestr.c"
-
-extern char *__strcasestr_sse42 (const char *, const char *) attribute_hidden;
-extern __typeof (__strcasestr_sse2) __strcasestr_sse2 attribute_hidden;
-
-#if 1
-libc_ifunc (__strcasestr,
- HAS_SSE4_2 ? __strcasestr_sse42 : __strcasestr_sse2);
-#else
-libc_ifunc (__strcasestr,
- 0 ? __strcasestr_sse42 : __strcasestr_sse2);
-#endif
diff --git a/sysdeps/x86_64/multiarch/strcasestr-nonascii.c b/sysdeps/x86_64/multiarch/strcasestr-nonascii.c
deleted file mode 100644
index 032a6420d6..0000000000
--- a/sysdeps/x86_64/multiarch/strcasestr-nonascii.c
+++ /dev/null
@@ -1,50 +0,0 @@
-/* strstr with SSE4.2 intrinsics
- Copyright (C) 2010-2013 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <ctype.h>
-#include <xmmintrin.h>
-
-
-/* Similar to __m128i_strloadu. Convert to lower case for non-POSIX/C
- locale. */
-static __m128i
-__m128i_strloadu_tolower (const unsigned char *p)
-{
- union
- {
- char b[16];
- __m128i x;
- } u;
-
- for (int i = 0; i < 16; ++i)
- if (p[i] == 0)
- {
- u.b[i] = 0;
- break;
- }
- else
- u.b[i] = tolower (p[i]);
-
- return u.x;
-}
-
-
-#define STRCASESTR_NONASCII
-#define USE_AS_STRCASESTR
-#define STRSTR_SSE42 __strcasestr_sse42_nonascii
-#include "strstr.c"
diff --git a/sysdeps/x86_64/multiarch/strcasestr.c b/sysdeps/x86_64/multiarch/strcasestr.c
index d1cfb3b264..834e656a2c 100644
--- a/sysdeps/x86_64/multiarch/strcasestr.c
+++ b/sysdeps/x86_64/multiarch/strcasestr.c
@@ -1,7 +1,13 @@
-extern char *__strcasestr_sse42_nonascii (const unsigned char *s1,
- const unsigned char *s2)
- attribute_hidden;
+/* Multiple versions of strcasestr
+ All versions must be listed in ifunc-impl-list.c. */
-#define USE_AS_STRCASESTR
-#define STRSTR_SSE42 __strcasestr_sse42
-#include "strstr.c"
+#include "init-arch.h"
+
+#define STRCASESTR __strcasestr_sse2
+
+#include "string/strcasestr.c"
+
+extern __typeof (__strcasestr_sse2) __strcasestr_sse2 attribute_hidden;
+
+libc_ifunc (__strcasestr,
+ __strcasestr_sse2);
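With the SSE4.2 implementation gone, the libc_ifunc call above has exactly one candidate, so the indirect function always resolves to __strcasestr_sse2. A rough sketch of what such an IFUNC amounts to, written with the GCC attribute instead of glibc's internal macro (my_strcasestr and its resolver are illustrative names):

extern char *__strcasestr_sse2 (const char *haystack, const char *needle);

/* The resolver runs once, at relocation time, and returns the chosen
   implementation; here only one is left to choose from.  */
static void *
my_strcasestr_resolver (void)
{
  return __strcasestr_sse2;
}

char *my_strcasestr (const char *haystack, const char *needle)
     __attribute__ ((ifunc ("my_strcasestr_resolver")));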
diff --git a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
index 028c6d3d74..dc782f2c23 100644
--- a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
@@ -1,5 +1,5 @@
/* strcat with SSE2
- Copyright (C) 2011-2013 Free Software Foundation, Inc.
+ Copyright (C) 2011-2014 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
diff --git a/sysdeps/x86_64/multiarch/strcat-ssse3.S b/sysdeps/x86_64/multiarch/strcat-ssse3.S
index 8101b91e59..fde7b90822 100644
--- a/sysdeps/x86_64/multiarch/strcat-ssse3.S
+++ b/sysdeps/x86_64/multiarch/strcat-ssse3.S
@@ -1,5 +1,5 @@
/* strcat with SSSE3
- Copyright (C) 2011-2013 Free Software Foundation, Inc.
+ Copyright (C) 2011-2014 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
diff --git a/sysdeps/x86_64/multiarch/strcat.S b/sysdeps/x86_64/multiarch/strcat.S
index f94dc709be..d5c9d847d4 100644
--- a/sysdeps/x86_64/multiarch/strcat.S
+++ b/sysdeps/x86_64/multiarch/strcat.S
@@ -1,6 +1,6 @@
/* Multiple versions of strcat
All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2009-2013 Free Software Foundation, Inc.
+ Copyright (C) 2009-2014 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
diff --git a/sysdeps/x86_64/multiarch/strchr-sse2-no-bsf.S b/sysdeps/x86_64/multiarch/strchr-sse2-no-bsf.S
index 72da62f3d2..0b3f0961c3 100644
--- a/sysdeps/x86_64/multiarch/strchr-sse2-no-bsf.S
+++ b/sysdeps/x86_64/multiarch/strchr-sse2-no-bsf.S
@@ -1,5 +1,5 @@
/* strchr with SSE2 without bsf
- Copyright (C) 2011-2013 Free Software Foundation, Inc.
+ Copyright (C) 2011-2014 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
diff --git a/sysdeps/x86_64/multiarch/strchr.S b/sysdeps/x86_64/multiarch/strchr.S
index 6860329449..63a35fa62f 100644
--- a/sysdeps/x86_64/multiarch/strchr.S
+++ b/sysdeps/x86_64/multiarch/strchr.S
@@ -1,5 +1,5 @@
/* Multiple versions of strchr
- Copyright (C) 2009-2013 Free Software Foundation, Inc.
+ Copyright (C) 2009-2014 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -29,10 +29,6 @@ ENTRY(strchr)
jne 1f
call __init_cpu_features
1: leaq __strchr_sse2(%rip), %rax
- testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
- jz 2f
- leaq __strchr_sse42(%rip), %rax
- ret
2: testl $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
jz 3f
leaq __strchr_sse2_no_bsf(%rip), %rax
@@ -40,127 +36,6 @@ ENTRY(strchr)
END(strchr)
-/*
- This implementation uses SSE4 instructions to compare up to 16 bytes
- at a time looking for the first occurrence of the character c in the
- string s:
-
- char *strchr (const char *s, int c);
-
- We use 0xa:
- _SIDD_SBYTE_OPS
- | _SIDD_CMP_EQUAL_EACH
- | _SIDD_LEAST_SIGNIFICANT
- on pcmpistri to compare xmm/mem128
-
- 0 1 2 3 4 5 6 7 8 9 A B C D E F
- X X X X X X X X X X X X X X X X
-
- against xmm
-
- 0 1 2 3 4 5 6 7 8 9 A B C D E F
- C C C C C C C C C C C C C C C C
-
- to find out if the first 16byte data element has a byte C and the
- offset of the first byte. There are 3 cases:
-
- 1. The first 16byte data element has the byte C at the offset X.
- 2. The first 16byte data element has EOS and doesn't have the byte C.
- 3. The first 16byte data element is valid and doesn't have the byte C.
-
- Here is the table of ECX, CFlag, ZFlag and SFlag for 3 cases:
-
- case ECX CFlag ZFlag SFlag
- 1 X 1 0/1 0
- 2 16 0 1 0
- 3 16 0 0 0
-
- We exit from the loop for cases 1 and 2 with jbe which branches
- when either CFlag or ZFlag is 1. If CFlag == 1, ECX has the offset
- X for case 1. */
-
- .section .text.sse4.2,"ax",@progbits
- .align 16
- .type __strchr_sse42, @function
- .globl __strchr_sse42
- .hidden __strchr_sse42
-__strchr_sse42:
- cfi_startproc
- CALL_MCOUNT
- testb %sil, %sil
- je __strend_sse4
- pxor %xmm2, %xmm2
- movd %esi, %xmm1
- movl %edi, %ecx
- pshufb %xmm2, %xmm1
- andl $15, %ecx
- movq %rdi, %r8
- je L(aligned_start)
-
-/* Handle unaligned string. */
- andq $-16, %r8
- movdqa (%r8), %xmm0
- pcmpeqb %xmm0, %xmm2
- pcmpeqb %xmm1, %xmm0
- /* Find where NULL is. */
- pmovmskb %xmm2, %edx
- /* Check if there is a match. */
- pmovmskb %xmm0, %esi
- /* Remove the leading bytes. */
- sarl %cl, %edx
- sarl %cl, %esi
- testl %esi, %esi
- je L(unaligned_no_match)
- /* Check which byte is a match. */
- bsfl %esi, %eax
- /* Is there a NULL? */
- testl %edx, %edx
- je L(unaligned_match)
- bsfl %edx, %esi
- cmpl %esi, %eax
- /* Return NULL if NULL comes first. */
- ja L(return_null)
-L(unaligned_match):
- addq %rdi, %rax
- ret
-
- .p2align 4
-L(unaligned_no_match):
- testl %edx, %edx
- jne L(return_null)
-
-/* Loop start on aligned string. */
-L(loop):
- addq $16, %r8
-L(aligned_start):
- pcmpistri $0x2, (%r8), %xmm1
- jbe L(wrap)
- addq $16, %r8
- pcmpistri $0x2, (%r8), %xmm1
- jbe L(wrap)
- addq $16, %r8
- pcmpistri $0x2, (%r8), %xmm1
- jbe L(wrap)
- addq $16, %r8
- pcmpistri $0x2, (%r8), %xmm1
- jbe L(wrap)
- jmp L(loop)
-L(wrap):
- jc L(loop_exit)
-
-/* Return NULL. */
-L(return_null):
- xorl %eax, %eax
- ret
-
-/* Loop exit. */
- .p2align 4
-L(loop_exit):
- leaq (%r8,%rcx), %rax
- ret
- cfi_endproc
- .size __strchr_sse42, .-__strchr_sse42
-
# undef ENTRY
# define ENTRY(name) \
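The deleted comment's ECX/CFlag/ZFlag table maps directly onto the SSE4.2 string intrinsics. A sketch of the aligned scan loop in C, under the same mode 0x0a (it assumes c != 0 — the real code branched to __strend_sse4 for c == 0 — and omits the unaligned prologue):

#include <nmmintrin.h>

static char *
strchr_sse42_sketch (const char *s, int c)  /* S 16-byte aligned */
{
  __m128i set = _mm_set1_epi8 ((char) c);
  for (;; s += 16)
    {
      __m128i chunk = _mm_load_si128 ((const __m128i *) s);
      if (_mm_cmpistrc (set, chunk, 0x0a))   /* CF: case 1, match */
        return (char *) s + _mm_cmpistri (set, chunk, 0x0a);
      if (_mm_cmpistrz (set, chunk, 0x0a))   /* ZF: case 2, hit NUL */
        return 0;
    }
}

Bytes past the terminator are invalid elements and compare false, so CF can only report a match that precedes the NUL, exactly as the table states.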
diff --git a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
new file mode 100644
index 0000000000..b133ffc3ea
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
@@ -0,0 +1,209 @@
+/* strcmp with unaligned loads
+ Copyright (C) 2013-2014 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include "sysdep.h"
+
+ENTRY (__strcmp_sse2_unaligned)
+ movl %edi, %eax
+ xorl %edx, %edx
+ pxor %xmm7, %xmm7
+ orl %esi, %eax
+ andl $4095, %eax
+ cmpl $4032, %eax
+ jg L(cross_page)
+ movdqu (%rdi), %xmm1
+ movdqu (%rsi), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ pminub %xmm1, %xmm0
+ pxor %xmm1, %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ testq %rax, %rax
+ je L(next_48_bytes)
+L(return):
+ bsfq %rax, %rdx
+ movzbl (%rdi, %rdx), %eax
+ movzbl (%rsi, %rdx), %edx
+ subl %edx, %eax
+ ret
+
+ .p2align 4
+L(next_48_bytes):
+ movdqu 16(%rdi), %xmm6
+ movdqu 16(%rsi), %xmm3
+ movdqu 32(%rdi), %xmm5
+ pcmpeqb %xmm6, %xmm3
+ movdqu 32(%rsi), %xmm2
+ pminub %xmm6, %xmm3
+ pcmpeqb %xmm1, %xmm3
+ movdqu 48(%rdi), %xmm4
+ pcmpeqb %xmm5, %xmm2
+ pmovmskb %xmm3, %edx
+ movdqu 48(%rsi), %xmm0
+ pminub %xmm5, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm2, %eax
+ salq $16, %rdx
+ pminub %xmm4, %xmm0
+ pcmpeqb %xmm1, %xmm0
+ salq $32, %rax
+ orq %rdx, %rax
+ pmovmskb %xmm0, %ecx
+ movq %rcx, %rdx
+ salq $48, %rdx
+ orq %rdx, %rax
+ jne L(return)
+L(main_loop_header):
+ leaq 64(%rdi), %rdx
+ movl $4096, %ecx
+ pxor %xmm9, %xmm9
+ andq $-64, %rdx
+ subq %rdi, %rdx
+ leaq (%rdi, %rdx), %rax
+ addq %rsi, %rdx
+ movq %rdx, %rsi
+ andl $4095, %esi
+ subq %rsi, %rcx
+ shrq $6, %rcx
+ movq %rcx, %rsi
+ jmp L(loop_start)
+
+ .p2align 4
+L(loop):
+ addq $64, %rax
+ addq $64, %rdx
+L(loop_start):
+ testq %rsi, %rsi
+ leaq -1(%rsi), %rsi
+ je L(loop_cross_page)
+L(back_to_loop):
+ movdqu (%rdx), %xmm0
+ movdqu 16(%rdx), %xmm1
+ movdqa (%rax), %xmm2
+ movdqa 16(%rax), %xmm3
+ pcmpeqb %xmm2, %xmm0
+ movdqu 32(%rdx), %xmm5
+ pcmpeqb %xmm3, %xmm1
+ pminub %xmm2, %xmm0
+ movdqu 48(%rdx), %xmm6
+ pminub %xmm3, %xmm1
+ movdqa 32(%rax), %xmm2
+ pminub %xmm1, %xmm0
+ movdqa 48(%rax), %xmm3
+ pcmpeqb %xmm2, %xmm5
+ pcmpeqb %xmm3, %xmm6
+ pminub %xmm2, %xmm5
+ pminub %xmm3, %xmm6
+ pminub %xmm5, %xmm0
+ pminub %xmm6, %xmm0
+ pcmpeqb %xmm7, %xmm0
+ pmovmskb %xmm0, %ecx
+ testl %ecx, %ecx
+ je L(loop)
+ pcmpeqb %xmm7, %xmm5
+ movdqu (%rdx), %xmm0
+ pcmpeqb %xmm7, %xmm1
+ movdqa (%rax), %xmm2
+ pcmpeqb %xmm2, %xmm0
+ pminub %xmm2, %xmm0
+ pcmpeqb %xmm7, %xmm6
+ pcmpeqb %xmm7, %xmm0
+ pmovmskb %xmm1, %ecx
+ pmovmskb %xmm5, %r8d
+ pmovmskb %xmm0, %edi
+ salq $16, %rcx
+ salq $32, %r8
+ pmovmskb %xmm6, %esi
+ orq %r8, %rcx
+ orq %rdi, %rcx
+ salq $48, %rsi
+ orq %rsi, %rcx
+ bsfq %rcx, %rcx
+ movzbl (%rax, %rcx), %eax
+ movzbl (%rdx, %rcx), %edx
+ subl %edx, %eax
+ ret
+
+ .p2align 4
+L(loop_cross_page):
+ xor %r10, %r10
+ movq %rdx, %r9
+ and $63, %r9
+ subq %r9, %r10
+
+ movdqa (%rdx, %r10), %xmm0
+ movdqa 16(%rdx, %r10), %xmm1
+ movdqu (%rax, %r10), %xmm2
+ movdqu 16(%rax, %r10), %xmm3
+ pcmpeqb %xmm2, %xmm0
+ movdqa 32(%rdx, %r10), %xmm5
+ pcmpeqb %xmm3, %xmm1
+ pminub %xmm2, %xmm0
+ movdqa 48(%rdx, %r10), %xmm6
+ pminub %xmm3, %xmm1
+ movdqu 32(%rax, %r10), %xmm2
+ movdqu 48(%rax, %r10), %xmm3
+ pcmpeqb %xmm2, %xmm5
+ pcmpeqb %xmm3, %xmm6
+ pminub %xmm2, %xmm5
+ pminub %xmm3, %xmm6
+
+ pcmpeqb %xmm7, %xmm0
+ pcmpeqb %xmm7, %xmm1
+ pcmpeqb %xmm7, %xmm5
+ pcmpeqb %xmm7, %xmm6
+
+ pmovmskb %xmm1, %ecx
+ pmovmskb %xmm5, %r8d
+ pmovmskb %xmm0, %edi
+ salq $16, %rcx
+ salq $32, %r8
+ pmovmskb %xmm6, %esi
+ orq %r8, %rdi
+ orq %rcx, %rdi
+ salq $48, %rsi
+ orq %rsi, %rdi
+ movq %r9, %rcx
+ movq $63, %rsi
+ shrq %cl, %rdi
+ test %rdi, %rdi
+ je L(back_to_loop)
+ bsfq %rdi, %rcx
+ movzbl (%rax, %rcx), %eax
+ movzbl (%rdx, %rcx), %edx
+ subl %edx, %eax
+ ret
+
+ .p2align 4
+L(cross_page_loop):
+ cmpb %cl, %al
+ jne L(different)
+ addq $1, %rdx
+ cmpq $64, %rdx
+ je L(main_loop_header)
+L(cross_page):
+ movzbl (%rdi, %rdx), %eax
+ movzbl (%rsi, %rdx), %ecx
+ testb %al, %al
+ jne L(cross_page_loop)
+ xorl %eax, %eax
+L(different):
+ subl %ecx, %eax
+ ret
+END (__strcmp_sse2_unaligned)
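The heart of the new routine is the pcmpeqb/pminub/pcmpeqb sequence repeated throughout: min(a, (a == b)) is zero exactly where the bytes differ or where a holds a NUL, so a single pmovmskb mask terminates the scan for both reasons at once. One 16-byte step restated in C (a sketch only — the real code first checks the page offset, as in the andl $4095 test at entry, so its unaligned loads never cross into an unmapped page; this loop ignores that):

#include <emmintrin.h>

static int
strcmp_unaligned_sketch (const char *s1, const char *s2)
{
  const unsigned char *a = (const unsigned char *) s1;
  const unsigned char *b = (const unsigned char *) s2;
  for (;; a += 16, b += 16)
    {
      __m128i va = _mm_loadu_si128 ((const __m128i *) a);
      __m128i vb = _mm_loadu_si128 ((const __m128i *) b);
      __m128i eq = _mm_cmpeq_epi8 (va, vb);  /* 0xff where a[i] == b[i] */
      __m128i t = _mm_min_epu8 (va, eq);     /* 0 where differ or a[i] == 0 */
      __m128i z = _mm_cmpeq_epi8 (t, _mm_setzero_si128 ());
      unsigned int mask = _mm_movemask_epi8 (z);
      if (mask != 0)
        {
          unsigned int i = __builtin_ctz (mask);  /* first diff-or-NUL */
          return a[i] - b[i];
        }
    }
}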
diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
index a503e92115..2d0758a656 100644
--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
+++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S
@@ -1,5 +1,5 @@
/* strcmp with SSE4.2
- Copyright (C) 2009-2013 Free Software Foundation, Inc.
+ Copyright (C) 2009-2014 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
@@ -206,7 +206,7 @@ LABEL(touppermask):
jnz LABEL(less16bytes)/* If not, find different value or null char */
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
sub $16, %r11
- jbe LABEL(strcmp_exitz)/* finish comparision */
+ jbe LABEL(strcmp_exitz)/* finish comparison */
#endif
add $16, %rsi /* prepare to search next 16 bytes */
add $16, %rdi /* prepare to search next 16 bytes */
diff --git a/sysdeps/x86_64/multiarch/strcmp.S b/sysdeps/x86_64/multiarch/strcmp.S
index f69aaf42b3..f3e0ca1259 100644
--- a/sysdeps/x86_64/multiarch/strcmp.S
+++ b/sysdeps/x86_64/multiarch/strcmp.S
@@ -1,5 +1,5 @@
/* Multiple versions of strcmp
- Copyright (C) 2009-2013 Free Software Foundation, Inc.
+ Copyright (C) 2009-2014 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
@@ -66,6 +66,7 @@
# define STRCMP_SSE2 __strncasecmp_l_sse2
# define __GI_STRCMP __GI___strncasecmp_l
#else
+# define USE_AS_STRCMP
# define UPDATE_STRNCMP_COUNTER
# ifndef STRCMP
# define STRCMP strcmp
@@ -88,14 +89,22 @@ ENTRY(STRCMP)
jne 1f
call __init_cpu_features
1:
+#ifdef USE_AS_STRCMP
+ leaq __strcmp_sse2_unaligned(%rip), %rax
+ testl $bit_Fast_Unaligned_Load, __cpu_features+CPUID_OFFSET+index_Fast_Unaligned_Load(%rip)
+ jnz 3f
+#else
+ testl $bit_Slow_SSE4_2, __cpu_features+CPUID_OFFSET+index_Slow_SSE4_2(%rip)
+ jnz 2f
leaq STRCMP_SSE42(%rip), %rax
testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
- jnz 2f
- leaq STRCMP_SSSE3(%rip), %rax
+ jnz 3f
+#endif
+2: leaq STRCMP_SSSE3(%rip), %rax
testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
- jnz 2f
+ jnz 3f
leaq STRCMP_SSE2(%rip), %rax
-2: ret
+3: ret
END(STRCMP)
# ifdef USE_AS_STRCASECMP_L
@@ -109,16 +118,18 @@ ENTRY(__strcasecmp)
# ifdef HAVE_AVX_SUPPORT
leaq __strcasecmp_avx(%rip), %rax
testl $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
- jnz 2f
+ jnz 3f
# endif
+ testl $bit_Slow_SSE4_2, __cpu_features+CPUID_OFFSET+index_Slow_SSE4_2(%rip)
+ jnz 2f
leaq __strcasecmp_sse42(%rip), %rax
testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
- jnz 2f
- leaq __strcasecmp_ssse3(%rip), %rax
+ jnz 3f
+2: leaq __strcasecmp_ssse3(%rip), %rax
testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
- jnz 2f
+ jnz 3f
leaq __strcasecmp_sse2(%rip), %rax
-2: ret
+3: ret
END(__strcasecmp)
weak_alias (__strcasecmp, strcasecmp)
# endif
@@ -133,16 +144,18 @@ ENTRY(__strncasecmp)
# ifdef HAVE_AVX_SUPPORT
leaq __strncasecmp_avx(%rip), %rax
testl $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
- jnz 2f
+ jnz 3f
# endif
+ testl $bit_Slow_SSE4_2, __cpu_features+CPUID_OFFSET+index_Slow_SSE4_2(%rip)
+ jnz 2f
leaq __strncasecmp_sse42(%rip), %rax
testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
- jnz 2f
- leaq __strncasecmp_ssse3(%rip), %rax
+ jnz 3f
+2: leaq __strncasecmp_ssse3(%rip), %rax
testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
- jnz 2f
+ jnz 3f
leaq __strncasecmp_sse2(%rip), %rax
-2: ret
+3: ret
END(__strncasecmp)
weak_alias (__strncasecmp, strncasecmp)
# endif
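Restated in C, the strcmp selector now prefers the new unaligned-load variant outright, while the strncmp and str*casecmp entry points keep SSE4.2 but skip it on CPUs flagged Slow_SSE4_2. A sketch of the plain-strcmp path (the cpu_* flags are hypothetical stand-ins for the __cpu_features bit tests done in assembly above):

typedef int (*strcmp_fn) (const char *, const char *);

extern int __strcmp_sse2_unaligned (const char *, const char *);
extern int __strcmp_ssse3 (const char *, const char *);
extern int __strcmp_sse2 (const char *, const char *);
extern int cpu_fast_unaligned_load, cpu_ssse3;  /* stand-in flags */

static strcmp_fn
select_strcmp (void)
{
  if (cpu_fast_unaligned_load)        /* bit_Fast_Unaligned_Load */
    return __strcmp_sse2_unaligned;   /* new in this patch */
  if (cpu_ssse3)                      /* bit_SSSE3 */
    return __strcmp_ssse3;
  return __strcmp_sse2;
}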
diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
index cd56e5637a..be7513d480 100644
--- a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
@@ -1,5 +1,5 @@
/* strcpy with SSE2 and unaligned load
- Copyright (C) 2011-2013 Free Software Foundation, Inc.
+ Copyright (C) 2011-2014 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
@@ -93,7 +93,7 @@ ENTRY (STRCPY)
movdqu (%rsi, %rcx), %xmm1 /* copy 16 bytes */
movdqu %xmm1, (%rdi)
-/* If source adress alignment != destination adress alignment */
+/* If source address alignment != destination address alignment */
.p2align 4
L(Unalign16Both):
sub %rcx, %rdi
@@ -289,7 +289,7 @@ L(Unaligned64Leave):
BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
# endif
-/* If source adress alignment == destination adress alignment */
+/* If source address alignment == destination address alignment */
L(SourceStringAlignmentLess32):
pxor %xmm0, %xmm0
diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
index 42ee00bd5c..86569ff54a 100644
--- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S
+++ b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
@@ -1,5 +1,5 @@
/* strcpy with SSSE3
- Copyright (C) 2011-2013 Free Software Foundation, Inc.
+ Copyright (C) 2011-2014 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
diff --git a/sysdeps/x86_64/multiarch/strcpy.S b/sysdeps/x86_64/multiarch/strcpy.S
index 919a411a9e..80ed98b30a 100644
--- a/sysdeps/x86_64/multiarch/strcpy.S
+++ b/sysdeps/x86_64/multiarch/strcpy.S
@@ -1,6 +1,6 @@
/* Multiple versions of strcpy
All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2009-2013 Free Software Foundation, Inc.
+ Copyright (C) 2009-2014 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c
index 9c0dcf0e8f..a9a6c8ae74 100644
--- a/sysdeps/x86_64/multiarch/strcspn-c.c
+++ b/sysdeps/x86_64/multiarch/strcspn-c.c
@@ -1,5 +1,5 @@
/* strcspn with SSE4.2 intrinsics
- Copyright (C) 2009-2013 Free Software Foundation, Inc.
+ Copyright (C) 2009-2014 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
diff --git a/sysdeps/x86_64/multiarch/strcspn.S b/sysdeps/x86_64/multiarch/strcspn.S
index df9616510b..24f55e9579 100644
--- a/sysdeps/x86_64/multiarch/strcspn.S
+++ b/sysdeps/x86_64/multiarch/strcspn.S
@@ -1,6 +1,6 @@
/* Multiple versions of strcspn
All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2009-2013 Free Software Foundation, Inc.
+ Copyright (C) 2009-2014 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
diff --git a/sysdeps/x86_64/multiarch/strend-sse4.S b/sysdeps/x86_64/multiarch/strend-sse4.S
deleted file mode 100644
index c5a7ae28a6..0000000000
--- a/sysdeps/x86_64/multiarch/strend-sse4.S
+++ /dev/null
@@ -1,48 +0,0 @@
-/* Return the pointer to the end of string, using SSE4.2
- Copyright (C) 2009-2013 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include "asm-syntax.h"
-
- .section .text.sse4.2,"ax",@progbits
-ENTRY (__strend_sse4)
- pxor %xmm2, %xmm2
- movq %rdi, %rcx
- andq $~15, %rdi
- movdqa %xmm2, %xmm1
- pcmpeqb (%rdi), %xmm2
- orl $0xffffffff, %esi
- subq %rdi, %rcx
- shll %cl, %esi
- pmovmskb %xmm2, %edx
- andl %esi, %edx
- jnz 1f
-
-2: pcmpistri $0x08, 16(%rdi), %xmm1
- leaq 16(%rdi), %rdi
- jnz 2b
-
- leaq (%rdi,%rcx), %rax
- ret
-
-1: bsfl %edx, %eax
- addq %rdi, %rax
- ret
-
-END (__strend_sse4)
diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S b/sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S
deleted file mode 100644
index fcef610dbc..0000000000
--- a/sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S
+++ /dev/null
@@ -1,555 +0,0 @@
-/* strrchr with SSE2 without bsf and bsr
- Copyright (C) 2011-2013 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#if defined SHARED && !defined NOT_IN_libc
-
-# include <sysdep.h>
-# include "asm-syntax.h"
-
- atom_text_section
-ENTRY (__strrchr_sse2_no_bsf)
-
- movd %rsi, %xmm1
- pxor %xmm2, %xmm2
- mov %rdi, %rcx
- punpcklbw %xmm1, %xmm1
- punpcklbw %xmm1, %xmm1
- /* ECX has OFFSET. */
- and $63, %rcx
- cmp $48, %rcx
- pshufd $0, %xmm1, %xmm1
- ja L(crosscache)
-
-/* unaligned string. */
- movdqu (%rdi), %xmm0
- pcmpeqb %xmm0, %xmm2
- pcmpeqb %xmm1, %xmm0
- /* Find where NULL is. */
- pmovmskb %xmm2, %rcx
- /* Check if there is a match. */
- pmovmskb %xmm0, %rax
- add $16, %rdi
-
- test %rax, %rax
- jnz L(unaligned_match1)
-
- test %rcx, %rcx
- jnz L(return_null)
-
- and $-16, %rdi
- xor %r8, %r8
- jmp L(loop)
-
- .p2align 4
-L(unaligned_match1):
- test %rcx, %rcx
- jnz L(prolog_find_zero_1)
-
- mov %rax, %r8
- mov %rdi, %rsi
- and $-16, %rdi
- jmp L(loop)
-
- .p2align 4
-L(crosscache):
-/* Handle unaligned string. */
- and $15, %rcx
- and $-16, %rdi
- pxor %xmm3, %xmm3
- movdqa (%rdi), %xmm0
- pcmpeqb %xmm0, %xmm3
- pcmpeqb %xmm1, %xmm0
- /* Find where NULL is. */
- pmovmskb %xmm3, %rdx
- /* Check if there is a match. */
- pmovmskb %xmm0, %rax
- /* Remove the leading bytes. */
- shr %cl, %rdx
- shr %cl, %rax
- add $16, %rdi
-
- test %rax, %rax
- jnz L(unaligned_match)
-
- test %rdx, %rdx
- jnz L(return_null)
-
- xor %r8, %r8
- jmp L(loop)
-
- .p2align 4
-L(unaligned_match):
- test %rdx, %rdx
- jnz L(prolog_find_zero)
-
- mov %rax, %r8
- lea (%rdi, %rcx), %rsi
-
-/* Loop start on aligned string. */
- .p2align 4
-L(loop):
- movdqa (%rdi), %xmm0
- pcmpeqb %xmm0, %xmm2
- add $16, %rdi
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm2, %rcx
- pmovmskb %xmm0, %rax
- or %rax, %rcx
- jnz L(matches)
-
- movdqa (%rdi), %xmm0
- pcmpeqb %xmm0, %xmm2
- add $16, %rdi
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm2, %rcx
- pmovmskb %xmm0, %rax
- or %rax, %rcx
- jnz L(matches)
-
- movdqa (%rdi), %xmm0
- pcmpeqb %xmm0, %xmm2
- add $16, %rdi
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm2, %rcx
- pmovmskb %xmm0, %rax
- or %rax, %rcx
- jnz L(matches)
-
- movdqa (%rdi), %xmm0
- pcmpeqb %xmm0, %xmm2
- add $16, %rdi
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm2, %rcx
- pmovmskb %xmm0, %rax
- or %rax, %rcx
- jz L(loop)
-
-L(matches):
- test %rax, %rax
- jnz L(match)
-L(return_value):
- test %r8, %r8
- jz L(return_null)
- mov %r8, %rax
- mov %rsi, %rdi
- jmp L(match_exit)
-
- .p2align 4
-L(match):
- pmovmskb %xmm2, %rcx
- test %rcx, %rcx
- jnz L(find_zero)
- mov %rax, %r8
- mov %rdi, %rsi
- jmp L(loop)
-
- .p2align 4
-L(find_zero):
- test %cl, %cl
- jz L(find_zero_high)
- mov %cl, %dl
- and $15, %dl
- jz L(find_zero_8)
- test $0x01, %cl
- jnz L(FindZeroExit1)
- test $0x02, %cl
- jnz L(FindZeroExit2)
- test $0x04, %cl
- jnz L(FindZeroExit3)
- and $1 << 4 - 1, %rax
- jz L(return_value)
- jmp L(match_exit)
-
- .p2align 4
-L(find_zero_8):
- test $0x10, %cl
- jnz L(FindZeroExit5)
- test $0x20, %cl
- jnz L(FindZeroExit6)
- test $0x40, %cl
- jnz L(FindZeroExit7)
- and $1 << 8 - 1, %rax
- jz L(return_value)
- jmp L(match_exit)
-
- .p2align 4
-L(find_zero_high):
- mov %ch, %dh
- and $15, %dh
- jz L(find_zero_high_8)
- test $0x01, %ch
- jnz L(FindZeroExit9)
- test $0x02, %ch
- jnz L(FindZeroExit10)
- test $0x04, %ch
- jnz L(FindZeroExit11)
- and $1 << 12 - 1, %rax
- jz L(return_value)
- jmp L(match_exit)
-
- .p2align 4
-L(find_zero_high_8):
- test $0x10, %ch
- jnz L(FindZeroExit13)
- test $0x20, %ch
- jnz L(FindZeroExit14)
- test $0x40, %ch
- jnz L(FindZeroExit15)
- and $1 << 16 - 1, %rax
- jz L(return_value)
- jmp L(match_exit)
-
- .p2align 4
-L(FindZeroExit1):
- and $1, %rax
- jz L(return_value)
- jmp L(match_exit)
-
- .p2align 4
-L(FindZeroExit2):
- and $1 << 2 - 1, %rax
- jz L(return_value)
- jmp L(match_exit)
-
- .p2align 4
-L(FindZeroExit3):
- and $1 << 3 - 1, %rax
- jz L(return_value)
- jmp L(match_exit)
-
- .p2align 4
-L(FindZeroExit5):
- and $1 << 5 - 1, %rax
- jz L(return_value)
- jmp L(match_exit)
-
- .p2align 4
-L(FindZeroExit6):
- and $1 << 6 - 1, %rax
- jz L(return_value)
- jmp L(match_exit)
-
- .p2align 4
-L(FindZeroExit7):
- and $1 << 7 - 1, %rax
- jz L(return_value)
- jmp L(match_exit)
-
- .p2align 4
-L(FindZeroExit9):
- and $1 << 9 - 1, %rax
- jz L(return_value)
- jmp L(match_exit)
-
- .p2align 4
-L(FindZeroExit10):
- and $1 << 10 - 1, %rax
- jz L(return_value)
- jmp L(match_exit)
-
- .p2align 4
-L(FindZeroExit11):
- and $1 << 11 - 1, %rax
- jz L(return_value)
- jmp L(match_exit)
-
- .p2align 4
-L(FindZeroExit13):
- and $1 << 13 - 1, %rax
- jz L(return_value)
- jmp L(match_exit)
-
- .p2align 4
-L(FindZeroExit14):
- and $1 << 14 - 1, %rax
- jz L(return_value)
- jmp L(match_exit)
-
- .p2align 4
-L(FindZeroExit15):
- and $1 << 15 - 1, %rax
- jz L(return_value)
-
- .p2align 4
-L(match_exit):
- test %ah, %ah
- jnz L(match_exit_high)
- mov %al, %dl
- and $15 << 4, %dl
- jnz L(match_exit_8)
- test $0x08, %al
- jnz L(Exit4)
- test $0x04, %al
- jnz L(Exit3)
- test $0x02, %al
- jnz L(Exit2)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(match_exit_8):
- test $0x80, %al
- jnz L(Exit8)
- test $0x40, %al
- jnz L(Exit7)
- test $0x20, %al
- jnz L(Exit6)
- lea -12(%rdi), %rax
- ret
-
- .p2align 4
-L(match_exit_high):
- mov %ah, %dh
- and $15 << 4, %dh
- jnz L(match_exit_high_8)
- test $0x08, %ah
- jnz L(Exit12)
- test $0x04, %ah
- jnz L(Exit11)
- test $0x02, %ah
- jnz L(Exit10)
- lea -8(%rdi), %rax
- ret
-
- .p2align 4
-L(match_exit_high_8):
- test $0x80, %ah
- jnz L(Exit16)
- test $0x40, %ah
- jnz L(Exit15)
- test $0x20, %ah
- jnz L(Exit14)
- lea -4(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit2):
- lea -15(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit3):
- lea -14(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit4):
- lea -13(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit6):
- lea -11(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit7):
- lea -10(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit8):
- lea -9(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit10):
- lea -7(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit11):
- lea -6(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit12):
- lea -5(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit14):
- lea -3(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit15):
- lea -2(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit16):
- lea -1(%rdi), %rax
- ret
-
-/* Return NULL. */
- .p2align 4
-L(return_null):
- xor %rax, %rax
- ret
-
- .p2align 4
-L(prolog_find_zero):
- add %rcx, %rdi
- mov %rdx, %rcx
-L(prolog_find_zero_1):
- test %cl, %cl
- jz L(prolog_find_zero_high)
- mov %cl, %dl
- and $15, %dl
- jz L(prolog_find_zero_8)
- test $0x01, %cl
- jnz L(PrologFindZeroExit1)
- test $0x02, %cl
- jnz L(PrologFindZeroExit2)
- test $0x04, %cl
- jnz L(PrologFindZeroExit3)
- and $1 << 4 - 1, %rax
- jnz L(match_exit)
- xor %rax, %rax
- ret
-
- .p2align 4
-L(prolog_find_zero_8):
- test $0x10, %cl
- jnz L(PrologFindZeroExit5)
- test $0x20, %cl
- jnz L(PrologFindZeroExit6)
- test $0x40, %cl
- jnz L(PrologFindZeroExit7)
- and $1 << 8 - 1, %rax
- jnz L(match_exit)
- xor %rax, %rax
- ret
-
- .p2align 4
-L(prolog_find_zero_high):
- mov %ch, %dh
- and $15, %dh
- jz L(prolog_find_zero_high_8)
- test $0x01, %ch
- jnz L(PrologFindZeroExit9)
- test $0x02, %ch
- jnz L(PrologFindZeroExit10)
- test $0x04, %ch
- jnz L(PrologFindZeroExit11)
- and $1 << 12 - 1, %rax
- jnz L(match_exit)
- xor %rax, %rax
- ret
-
- .p2align 4
-L(prolog_find_zero_high_8):
- test $0x10, %ch
- jnz L(PrologFindZeroExit13)
- test $0x20, %ch
- jnz L(PrologFindZeroExit14)
- test $0x40, %ch
- jnz L(PrologFindZeroExit15)
- and $1 << 16 - 1, %rax
- jnz L(match_exit)
- xor %rax, %rax
- ret
-
- .p2align 4
-L(PrologFindZeroExit1):
- and $1, %rax
- jnz L(match_exit)
- xor %rax, %rax
- ret
-
- .p2align 4
-L(PrologFindZeroExit2):
- and $1 << 2 - 1, %rax
- jnz L(match_exit)
- xor %rax, %rax
- ret
-
- .p2align 4
-L(PrologFindZeroExit3):
- and $1 << 3 - 1, %rax
- jnz L(match_exit)
- xor %rax, %rax
- ret
-
- .p2align 4
-L(PrologFindZeroExit5):
- and $1 << 5 - 1, %rax
- jnz L(match_exit)
- xor %rax, %rax
- ret
-
- .p2align 4
-L(PrologFindZeroExit6):
- and $1 << 6 - 1, %rax
- jnz L(match_exit)
- xor %rax, %rax
- ret
-
- .p2align 4
-L(PrologFindZeroExit7):
- and $1 << 7 - 1, %rax
- jnz L(match_exit)
- xor %rax, %rax
- ret
-
- .p2align 4
-L(PrologFindZeroExit9):
- and $1 << 9 - 1, %rax
- jnz L(match_exit)
- xor %rax, %rax
- ret
-
- .p2align 4
-L(PrologFindZeroExit10):
- and $1 << 10 - 1, %rax
- jnz L(match_exit)
- xor %rax, %rax
- ret
-
- .p2align 4
-L(PrologFindZeroExit11):
- and $1 << 11 - 1, %rax
- jnz L(match_exit)
- xor %rax, %rax
- ret
-
- .p2align 4
-L(PrologFindZeroExit13):
- and $1 << 13 - 1, %rax
- jnz L(match_exit)
- xor %rax, %rax
- ret
-
- .p2align 4
-L(PrologFindZeroExit14):
- and $1 << 14 - 1, %rax
- jnz L(match_exit)
- xor %rax, %rax
- ret
-
- .p2align 4
-L(PrologFindZeroExit15):
- and $1 << 15 - 1, %rax
- jnz L(match_exit)
- xor %rax, %rax
- ret
-
-END (__strrchr_sse2_no_bsf)
-#endif
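All the branch ladders above exist because this variant targets CPUs where bsf/bsr are slow (the Slow_BSF case): the byte and nibble tests emulate "mask the matches below the first NUL bit, then take the highest surviving bit" without either instruction. The same computation with compiler builtins, for comparison (a sketch; match and nul are the 16-bit pmovmskb masks of one chunk at p):

static const char *
last_match_in_chunk (const char *p, unsigned int match, unsigned int nul)
{
  if (nul != 0)
    match &= (nul & -nul) - 1;  /* keep bits below the lowest NUL bit */
  if (match == 0)
    return 0;   /* caller falls back to the last match seen earlier */
  return p + (31 - __builtin_clz (match));  /* highest remaining bit */
}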
diff --git a/sysdeps/x86_64/multiarch/strrchr.S b/sysdeps/x86_64/multiarch/strrchr.S
deleted file mode 100644
index ee6af6e9dd..0000000000
--- a/sysdeps/x86_64/multiarch/strrchr.S
+++ /dev/null
@@ -1,286 +0,0 @@
-/* Multiple versions of strrchr
- All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2009-2013 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-
-/* Define multiple versions only for the definition in libc and for
- the DSO. In static binaries we need strrchr before the initialization
- happened. */
-#if defined SHARED && !defined NOT_IN_libc
- .text
-ENTRY(strrchr)
- .type strrchr, @gnu_indirect_function
- cmpl $0, __cpu_features+KIND_OFFSET(%rip)
- jne 1f
- call __init_cpu_features
-1: leaq __strrchr_sse2(%rip), %rax
- testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
- jz 2f
- leaq __strrchr_sse42(%rip), %rax
- ret
-2: testl $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
- jz 3f
- leaq __strrchr_sse2_no_bsf(%rip), %rax
-3: ret
-END(strrchr)
-
-/*
- This implementation uses SSE4 instructions to compare up to 16 bytes
- at a time looking for the last occurrence of the character c in the
- string s:
-
- char *strrchr (const char *s, int c);
-
- We use 0x4a:
- _SIDD_SBYTE_OPS
- | _SIDD_CMP_EQUAL_EACH
- | _SIDD_MOST_SIGNIFICANT
- on pcmpistri to compare xmm/mem128
-
- 0 1 2 3 4 5 6 7 8 9 A B C D E F
- X X X X X X X X X X X X X X X X
-
- against xmm
-
- 0 1 2 3 4 5 6 7 8 9 A B C D E F
- C C C C C C C C C C C C C C C C
-
- to find out if the first 16byte data element has a byte C and the
- last offset. There are 4 cases:
-
- 1. The first 16byte data element has EOS and has the byte C at the
- last offset X.
- 2. The first 16byte data element is valid and has the byte C at the
- last offset X.
- 3. The first 16byte data element has EOS and doesn't have the byte C.
- 4. The first 16byte data element is valid and doesn't have the byte C.
-
- Here is the table of ECX, CFlag, ZFlag and SFlag for the 4 cases:
-
- case ECX CFlag ZFlag SFlag
- 1 X 1 1 0
- 2 X 1 0 0
- 3 16 0 1 0
- 4 16 0 0 0
-
- We exit from the loop for cases 1 and 3 with jz which branches
- when ZFlag is 1. If CFlag == 1, ECX has the offset X for case 1. */
-
-
- .section .text.sse4.2,"ax",@progbits
- .align 16
- .type __strrchr_sse42, @function
- .globl __strrchr_sse42
- .hidden __strrchr_sse42
-__strrchr_sse42:
- cfi_startproc
- CALL_MCOUNT
- testb %sil, %sil
- je __strend_sse4
- xor %eax,%eax /* RAX has the last occurrence of C in S. */
- movd %esi, %xmm1
- punpcklbw %xmm1, %xmm1
- movl %edi, %esi
- punpcklbw %xmm1, %xmm1
- andl $15, %esi
- pshufd $0, %xmm1, %xmm1
- movq %rdi, %r8
- je L(loop)
-
-/* Handle unaligned string using psrldq. */
- leaq L(psrldq_table)(%rip), %rdx
- andq $-16, %r8
- movslq (%rdx,%rsi,4),%r9
- movdqa (%r8), %xmm0
- addq %rdx, %r9
- jmp *%r9
-
-/* Handle unaligned string with offset 1 using psrldq. */
- .p2align 4
-L(psrldq_1):
- psrldq $1, %xmm0
-
- .p2align 4
-L(unaligned_pcmpistri):
- pcmpistri $0x4a, %xmm1, %xmm0
- jnc L(unaligned_no_byte)
- leaq (%rdi,%rcx), %rax
-L(unaligned_no_byte):
- /* Find the length of the unaligned string. */
- pcmpistri $0x3a, %xmm0, %xmm0
- movl $16, %edx
- subl %esi, %edx
- cmpl %ecx, %edx
- /* Return RAX if the unaligned fragment up to the next 16B already
- contains the NULL terminator. */
- jg L(exit)
- addq $16, %r8
-
-/* Loop start on aligned string. */
- .p2align 4
-L(loop):
- pcmpistri $0x4a, (%r8), %xmm1
- jbe L(match_or_eos)
- addq $16, %r8
- jmp L(loop)
- .p2align 4
-L(match_or_eos):
- je L(had_eos)
-L(match_no_eos):
- leaq (%r8,%rcx), %rax
- addq $16, %r8
- jmp L(loop)
- .p2align 4
-L(had_eos):
- jnc L(exit)
- leaq (%r8,%rcx), %rax
- .p2align 4
-L(exit):
- ret
-
-/* Handle unaligned string with offset 15 using psrldq. */
- .p2align 4
-L(psrldq_15):
- psrldq $15, %xmm0
- jmp L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 14 using psrldq. */
- .p2align 4
-L(psrldq_14):
- psrldq $14, %xmm0
- jmp L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 13 using psrldq. */
- .p2align 4
-L(psrldq_13):
- psrldq $13, %xmm0
- jmp L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 12 using psrldq. */
- .p2align 4
-L(psrldq_12):
- psrldq $12, %xmm0
- jmp L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 11 using psrldq. */
- .p2align 4
-L(psrldq_11):
- psrldq $11, %xmm0
- jmp L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 10 using psrldq. */
- .p2align 4
-L(psrldq_10):
- psrldq $10, %xmm0
- jmp L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 9 using psrldq. */
- .p2align 4
-L(psrldq_9):
- psrldq $9, %xmm0
- jmp L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 8 using psrldq. */
- .p2align 4
-L(psrldq_8):
- psrldq $8, %xmm0
- jmp L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 7 using psrldq. */
- .p2align 4
-L(psrldq_7):
- psrldq $7, %xmm0
- jmp L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 6 using psrldq. */
- .p2align 4
-L(psrldq_6):
- psrldq $6, %xmm0
- jmp L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 5 using psrldq. */
- .p2align 4
-L(psrldq_5):
- psrldq $5, %xmm0
- jmp L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 4 using psrldq. */
- .p2align 4
-L(psrldq_4):
- psrldq $4, %xmm0
- jmp L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 3 using psrldq. */
- .p2align 4
-L(psrldq_3):
- psrldq $3, %xmm0
- jmp L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 2 using psrldq. */
- .p2align 4
-L(psrldq_2):
- psrldq $2, %xmm0
- jmp L(unaligned_pcmpistri)
-
- cfi_endproc
- .size __strrchr_sse42, .-__strrchr_sse42
-
- .section .rodata.sse4.2,"a",@progbits
- .p2align 4
-L(psrldq_table):
- .int L(loop) - L(psrldq_table)
- .int L(psrldq_1) - L(psrldq_table)
- .int L(psrldq_2) - L(psrldq_table)
- .int L(psrldq_3) - L(psrldq_table)
- .int L(psrldq_4) - L(psrldq_table)
- .int L(psrldq_5) - L(psrldq_table)
- .int L(psrldq_6) - L(psrldq_table)
- .int L(psrldq_7) - L(psrldq_table)
- .int L(psrldq_8) - L(psrldq_table)
- .int L(psrldq_9) - L(psrldq_table)
- .int L(psrldq_10) - L(psrldq_table)
- .int L(psrldq_11) - L(psrldq_table)
- .int L(psrldq_12) - L(psrldq_table)
- .int L(psrldq_13) - L(psrldq_table)
- .int L(psrldq_14) - L(psrldq_table)
- .int L(psrldq_15) - L(psrldq_table)
-
-
-# undef ENTRY
-# define ENTRY(name) \
- .type __strrchr_sse2, @function; \
- .align 16; \
- .globl __strrchr_sse2; \
- .hidden __strrchr_sse2; \
- __strrchr_sse2: cfi_startproc; \
- CALL_MCOUNT
-# undef END
-# define END(name) \
- cfi_endproc; .size __strrchr_sse2, .-__strrchr_sse2
-# undef libc_hidden_builtin_def
-/* It doesn't make sense to send libc-internal strrchr calls through a PLT.
- The speedup we get from using SSE4.2 instructions is likely eaten away
- by the indirect call in the PLT. */
-# define libc_hidden_builtin_def(name) \
- .globl __GI_strrchr; __GI_strrchr = __strrchr_sse2
-#endif
-
-#include "../strrchr.S"
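The mode here, 0x4a, differs from strchr's 0x0a only in _SIDD_MOST_SIGNIFICANT, so ECX reports the last matching byte of each 16-byte chunk; the loop then only has to keep overwriting a "last seen" pointer until the chunk containing the terminator. A sketch of that scan (same caveats as before: c != 0, an aligned start, and no unaligned prologue):

#include <nmmintrin.h>

static char *
strrchr_sse42_sketch (const char *s, int c)
{
  __m128i set = _mm_set1_epi8 ((char) c);
  char *last = 0;
  for (;; s += 16)
    {
      __m128i chunk = _mm_load_si128 ((const __m128i *) s);
      if (_mm_cmpistrc (set, chunk, 0x4a))  /* CF: cases 1 and 2 */
        last = (char *) s + _mm_cmpistri (set, chunk, 0x4a);
      if (_mm_cmpistrz (set, chunk, 0x4a))  /* ZF: cases 1 and 3 */
        return last;
    }
}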
diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c
index 8128cb9769..8d19e5ca36 100644
--- a/sysdeps/x86_64/multiarch/strspn-c.c
+++ b/sysdeps/x86_64/multiarch/strspn-c.c
@@ -1,5 +1,5 @@
/* strspn with SSE4.2 intrinsics
- Copyright (C) 2009-2013 Free Software Foundation, Inc.
+ Copyright (C) 2009-2014 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
diff --git a/sysdeps/x86_64/multiarch/strspn.S b/sysdeps/x86_64/multiarch/strspn.S
index 79fbf3c574..bf7308eade 100644
--- a/sysdeps/x86_64/multiarch/strspn.S
+++ b/sysdeps/x86_64/multiarch/strspn.S
@@ -1,6 +1,6 @@
/* Multiple versions of strspn
All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2009-2013 Free Software Foundation, Inc.
+ Copyright (C) 2009-2014 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
diff --git a/sysdeps/x86_64/multiarch/strstr-c.c b/sysdeps/x86_64/multiarch/strstr-c.c
deleted file mode 100644
index 42bbe48172..0000000000
--- a/sysdeps/x86_64/multiarch/strstr-c.c
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Multiple versions of strstr.
- All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2012-2013 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-/* Redefine strstr so that the compiler won't complain about the type
- mismatch with the IFUNC selector in strong_alias, below. */
-#undef strstr
-#define strstr __redirect_strstr
-#include <string.h>
-#undef strstr
-
-#define STRSTR __strstr_sse2
-#ifdef SHARED
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(name) \
- __hidden_ver1 (__strstr_sse2, __GI_strstr, __strstr_sse2);
-#endif
-
-#include "string/strstr.c"
-
-extern __typeof (__redirect_strstr) __strstr_sse42 attribute_hidden;
-extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden;
-
-#include "init-arch.h"
-
-/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
- ifunc symbol properly. */
-extern __typeof (__redirect_strstr) __libc_strstr;
-libc_ifunc (__libc_strstr, HAS_SSE4_2 ? __strstr_sse42 : __strstr_sse2)
-
-#undef strstr
-strong_alias (__libc_strstr, strstr)
diff --git a/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S
new file mode 100644
index 0000000000..5b8009c733
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S
@@ -0,0 +1,374 @@
+/* strstr with unaligned loads
+ Copyright (C) 2009-2014 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+ENTRY(__strstr_sse2_unaligned)
+ movzbl (%rsi), %eax
+ testb %al, %al
+ je L(empty)
+ movzbl 1(%rsi), %edx
+ testb %dl, %dl
+ je L(strchr)
+ movd %eax, %xmm1
+ movd %edx, %xmm2
+ movq %rdi, %rax
+ andl $4095, %eax
+ punpcklbw %xmm1, %xmm1
+ cmpq $4031, %rax
+ punpcklbw %xmm2, %xmm2
+ punpcklwd %xmm1, %xmm1
+ punpcklwd %xmm2, %xmm2
+ pshufd $0, %xmm1, %xmm1
+ pshufd $0, %xmm2, %xmm2
+ ja L(cross_page)
+ movdqu (%rdi), %xmm3
+ pxor %xmm5, %xmm5
+ movdqu 1(%rdi), %xmm4
+ movdqa %xmm3, %xmm6
+ pcmpeqb %xmm1, %xmm3
+ pcmpeqb %xmm2, %xmm4
+ movdqu 16(%rdi), %xmm0
+ pcmpeqb %xmm5, %xmm6
+ pminub %xmm4, %xmm3
+ movdqa %xmm3, %xmm4
+ movdqu 17(%rdi), %xmm3
+ pcmpeqb %xmm0, %xmm5
+ pcmpeqb %xmm2, %xmm3
+ por %xmm6, %xmm4
+ pcmpeqb %xmm1, %xmm0
+ pminub %xmm3, %xmm0
+ por %xmm5, %xmm0
+ pmovmskb %xmm4, %r8d
+ pmovmskb %xmm0, %eax
+ salq $16, %rax
+ orq %rax, %r8
+ je L(next_32_bytes)
+L(next_pair_index):
+ bsf %r8, %rax
+ addq %rdi, %rax
+ cmpb $0, (%rax)
+ je L(zero1)
+ movzbl 2(%rsi), %edx
+ testb %dl, %dl
+ je L(found1)
+ cmpb 2(%rax), %dl
+ jne L(next_pair)
+ xorl %edx, %edx
+ jmp L(pair_loop_start)
+
+ .p2align 4
+L(strchr):
+ movzbl %al, %esi
+ jmp __strchr_sse2
+
+ .p2align 4
+L(pair_loop):
+ addq $1, %rdx
+ cmpb 2(%rax,%rdx), %cl
+ jne L(next_pair)
+L(pair_loop_start):
+ movzbl 3(%rsi,%rdx), %ecx
+ testb %cl, %cl
+ jne L(pair_loop)
+L(found1):
+ ret
+L(zero1):
+ xorl %eax, %eax
+ ret
+
+ .p2align 4
+L(next_pair):
+ leaq -1(%r8), %rax
+ andq %rax, %r8
+ jne L(next_pair_index)
+
+ .p2align 4
+L(next_32_bytes):
+ movdqu 32(%rdi), %xmm3
+ pxor %xmm5, %xmm5
+ movdqu 33(%rdi), %xmm4
+ movdqa %xmm3, %xmm6
+ pcmpeqb %xmm1, %xmm3
+ pcmpeqb %xmm2, %xmm4
+ movdqu 48(%rdi), %xmm0
+ pcmpeqb %xmm5, %xmm6
+ pminub %xmm4, %xmm3
+ movdqa %xmm3, %xmm4
+ movdqu 49(%rdi), %xmm3
+ pcmpeqb %xmm0, %xmm5
+ pcmpeqb %xmm2, %xmm3
+ por %xmm6, %xmm4
+ pcmpeqb %xmm1, %xmm0
+ pminub %xmm3, %xmm0
+ por %xmm5, %xmm0
+ pmovmskb %xmm4, %eax
+ salq $32, %rax
+ pmovmskb %xmm0, %r8d
+ salq $48, %r8
+ orq %rax, %r8
+ je L(loop_header)
+L(next_pair2_index):
+ bsfq %r8, %rax
+ addq %rdi, %rax
+ cmpb $0, (%rax)
+ je L(zero2)
+ movzbl 2(%rsi), %edx
+ testb %dl, %dl
+ je L(found2)
+ cmpb 2(%rax), %dl
+ jne L(next_pair2)
+ xorl %edx, %edx
+ jmp L(pair_loop2_start)
+
+ .p2align 4
+L(pair_loop2):
+ addq $1, %rdx
+ cmpb 2(%rax,%rdx), %cl
+ jne L(next_pair2)
+L(pair_loop2_start):
+ movzbl 3(%rsi,%rdx), %ecx
+ testb %cl, %cl
+ jne L(pair_loop2)
+L(found2):
+ ret
+L(zero2):
+ xorl %eax, %eax
+ ret
+L(empty):
+ mov %rdi, %rax
+ ret
+
+ .p2align 4
+L(next_pair2):
+ leaq -1(%r8), %rax
+ andq %rax, %r8
+ jne L(next_pair2_index)
+L(loop_header):
+ movq $-512, %r11
+ movq %rdi, %r9
+
+ pxor %xmm7, %xmm7
+ andq $-64, %rdi
+
+ .p2align 4
+L(loop):
+ movdqa 64(%rdi), %xmm3
+ movdqu 63(%rdi), %xmm6
+ movdqa %xmm3, %xmm0
+ pxor %xmm2, %xmm3
+ pxor %xmm1, %xmm6
+ movdqa 80(%rdi), %xmm10
+ por %xmm3, %xmm6
+ pminub %xmm10, %xmm0
+ movdqu 79(%rdi), %xmm3
+ pxor %xmm2, %xmm10
+ pxor %xmm1, %xmm3
+ movdqa 96(%rdi), %xmm9
+ por %xmm10, %xmm3
+ pminub %xmm9, %xmm0
+ pxor %xmm2, %xmm9
+ movdqa 112(%rdi), %xmm8
+ addq $64, %rdi
+ pminub %xmm6, %xmm3
+ movdqu 31(%rdi), %xmm4
+ pminub %xmm8, %xmm0
+ pxor %xmm2, %xmm8
+ pxor %xmm1, %xmm4
+ por %xmm9, %xmm4
+ pminub %xmm4, %xmm3
+ movdqu 47(%rdi), %xmm5
+ pxor %xmm1, %xmm5
+ por %xmm8, %xmm5
+ pminub %xmm5, %xmm3
+ pminub %xmm3, %xmm0
+ pcmpeqb %xmm7, %xmm0
+ pmovmskb %xmm0, %eax
+ testl %eax, %eax
+ je L(loop)
+ pminub (%rdi), %xmm6
+ pminub 32(%rdi),%xmm4
+ pminub 48(%rdi),%xmm5
+ pcmpeqb %xmm7, %xmm6
+ pcmpeqb %xmm7, %xmm5
+ pmovmskb %xmm6, %edx
+ movdqa 16(%rdi), %xmm8
+ pcmpeqb %xmm7, %xmm4
+ movdqu 15(%rdi), %xmm0
+ pmovmskb %xmm5, %r8d
+ movdqa %xmm8, %xmm3
+ pmovmskb %xmm4, %ecx
+ pcmpeqb %xmm1,%xmm0
+ pcmpeqb %xmm2,%xmm3
+ salq $32, %rcx
+ pcmpeqb %xmm7,%xmm8
+ salq $48, %r8
+ pminub %xmm0,%xmm3
+ orq %rcx, %rdx
+ por %xmm3,%xmm8
+ orq %rdx, %r8
+ pmovmskb %xmm8, %eax
+ salq $16, %rax
+ orq %rax, %r8
+ je L(loop)
+L(next_pair_index3):
+ bsfq %r8, %rcx
+ addq %rdi, %rcx
+ cmpb $0, (%rcx)
+ je L(zero)
+ xorl %eax, %eax
+ movzbl 2(%rsi), %edx
+ testb %dl, %dl
+ je L(success3)
+ cmpb 1(%rcx), %dl
+ jne L(next_pair3)
+ jmp L(pair_loop_start3)
+
+ .p2align 4
+L(pair_loop3):
+ addq $1, %rax
+ cmpb 1(%rcx,%rax), %dl
+ jne L(next_pair3)
+L(pair_loop_start3):
+ movzbl 3(%rsi,%rax), %edx
+ testb %dl, %dl
+ jne L(pair_loop3)
+L(success3):
+ lea -1(%rcx), %rax
+ ret
+
+ .p2align 4
+L(next_pair3):
+ addq %rax, %r11
+ movq %rdi, %rax
+ subq %r9, %rax
+ cmpq %r11, %rax
+ jl L(switch_strstr)
+ leaq -1(%r8), %rax
+ andq %rax, %r8
+ jne L(next_pair_index3)
+ jmp L(loop)
+
+ .p2align 4
+L(switch_strstr):
+ movq %rdi, %rdi
+ jmp __strstr_sse2
+
+ .p2align 4
+L(cross_page):
+
+ movq %rdi, %rax
+ pxor %xmm0, %xmm0
+ andq $-64, %rax
+ movdqa (%rax), %xmm3
+ movdqu -1(%rax), %xmm4
+ movdqa %xmm3, %xmm8
+ movdqa 16(%rax), %xmm5
+ pcmpeqb %xmm1, %xmm4
+ pcmpeqb %xmm0, %xmm8
+ pcmpeqb %xmm2, %xmm3
+ movdqa %xmm5, %xmm7
+ pminub %xmm4, %xmm3
+ movdqu 15(%rax), %xmm4
+ pcmpeqb %xmm0, %xmm7
+ por %xmm3, %xmm8
+ movdqa %xmm5, %xmm3
+ movdqa 32(%rax), %xmm5
+ pcmpeqb %xmm1, %xmm4
+ pcmpeqb %xmm2, %xmm3
+ movdqa %xmm5, %xmm6
+ pmovmskb %xmm8, %ecx
+ pminub %xmm4, %xmm3
+ movdqu 31(%rax), %xmm4
+ por %xmm3, %xmm7
+ movdqa %xmm5, %xmm3
+ pcmpeqb %xmm0, %xmm6
+ movdqa 48(%rax), %xmm5
+ pcmpeqb %xmm1, %xmm4
+ pmovmskb %xmm7, %r8d
+ pcmpeqb %xmm2, %xmm3
+ pcmpeqb %xmm5, %xmm0
+ pminub %xmm4, %xmm3
+ movdqu 47(%rax), %xmm4
+ por %xmm3, %xmm6
+ movdqa %xmm5, %xmm3
+ salq $16, %r8
+ pcmpeqb %xmm1, %xmm4
+ pcmpeqb %xmm2, %xmm3
+ pmovmskb %xmm6, %r10d
+ pminub %xmm4, %xmm3
+ por %xmm3, %xmm0
+ salq $32, %r10
+ orq %r10, %r8
+ orq %rcx, %r8
+ movl %edi, %ecx
+ pmovmskb %xmm0, %edx
+ subl %eax, %ecx
+ salq $48, %rdx
+ orq %rdx, %r8
+ shrq %cl, %r8
+ je L(loop_header)
+L(next_pair_index4):
+ bsfq %r8, %rax
+ addq %rdi, %rax
+ cmpb $0, (%rax)
+ je L(zero)
+
+ cmpq %rax,%rdi
+ je L(next_pair4)
+
+ movzbl 2(%rsi), %edx
+ testb %dl, %dl
+ je L(found3)
+ cmpb 1(%rax), %dl
+ jne L(next_pair4)
+ xorl %edx, %edx
+ jmp L(pair_loop_start4)
+
+ .p2align 4
+L(pair_loop4):
+ addq $1, %rdx
+ cmpb 1(%rax,%rdx), %cl
+ jne L(next_pair4)
+L(pair_loop_start4):
+ movzbl 3(%rsi,%rdx), %ecx
+ testb %cl, %cl
+ jne L(pair_loop4)
+L(found3):
+ subq $1, %rax
+ ret
+
+ .p2align 4
+L(next_pair4):
+ leaq -1(%r8), %rax
+ andq %rax, %r8
+ jne L(next_pair_index4)
+ jmp L(loop_header)
+
+ .p2align 4
+L(found):
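+	/* Two-byte "rep ret" avoids the branch-prediction penalty some AMD
+	   cores incur when a plain ret is the target of a jump.  */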
+ rep
+ ret
+
+ .p2align 4
+L(zero):
+ xorl %eax, %eax
+ ret
+
+
+END(__strstr_sse2_unaligned)
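In outline, the routine added above is a "first two bytes" filter with a work
budget.  The scalar sketch below (hypothetical helper name; plain C standing
in for the SIMD mask arithmetic) shows the control flow the assembly
implements, including the budget check performed at L(next_pair3):

#include <stddef.h>
#include <string.h>

/* Sketch: scan for positions where the first two needle bytes match,
   verify the rest byte by byte, and switch to the generic algorithm
   once verification work exceeds the amount of text scanned.  */
static const char *
two_byte_filter_sketch (const char *hay, const char *ne)
{
  if (ne[0] == '\0')
    return hay;
  size_t scanned = 0, verified = 0;
  for (; *hay != '\0'; hay++, scanned++)
    if (hay[0] == ne[0] && (ne[1] == '\0' || hay[1] == ne[1]))
      {
        size_t i = 1;
        while (ne[i] != '\0' && hay[i] == ne[i])
          i++;
        if (ne[i] == '\0')
          return hay;
        verified += i;
        if (verified > scanned)        /* the L(switch_strstr) budget check */
          return strstr (hay, ne);
      }
  return NULL;
}

The budget check is what keeps the filter linear on adversarial inputs such
as long runs of the first needle byte.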
diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c
index cd63b68c01..b41374d754 100644
--- a/sysdeps/x86_64/multiarch/strstr.c
+++ b/sysdeps/x86_64/multiarch/strstr.c
@@ -1,6 +1,6 @@
-/* strstr with SSE4.2 intrinsics
- Copyright (C) 2009-2013 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
+/* Multiple versions of strstr.
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2012-2014 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -17,369 +17,31 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#include <nmmintrin.h>
-#include "varshift.h"
-
-#ifndef STRSTR_SSE42
-# define STRSTR_SSE42 __strstr_sse42
-#endif
-
-#ifdef USE_AS_STRCASESTR
-# include <ctype.h>
-# include <locale/localeinfo.h>
-
-# define LOADBYTE(C) tolower (C)
-# define CMPBYTE(C1, C2) (tolower (C1) == tolower (C2))
-#else
-# define LOADBYTE(C) (C)
-# define CMPBYTE(C1, C2) ((C1) == (C2))
-#endif
-
-/* We use 0xc ordered-compare:
-   _SIDD_UBYTE_OPS
-   | _SIDD_CMP_EQUAL_ORDERED
-   | _SIDD_LEAST_SIGNIFICANT
-   on pcmpistri to do the scanning and string comparison requirements of
-   sub-string match.  In the scanning phase, we process the CFlag and ECX
-   index to locate the first fragment match; once the first fragment
-   match position has been identified, we compare subsequent string
-   fragments until we can conclude a false or true match; when
-   concluding a false match, we may need to repeat the scanning process
-   from the next relevant offset in the target string.
-
- In the scanning phase we have 4 cases:
-     case   ECX   CFlag   ZFlag   SFlag
-      1      16     0       0       0
-      2a     16     0       0       1
-      2b     16     0       1       0
-      2c     16     0       1       1
-
- 1. No ordered-comparison match, both 16B fragments are valid, so
- continue to next fragment.
- 2. No ordered-comparison match, there is EOS in either fragment,
- 2a. Zflg = 0, Sflg = 1, we continue
- 2b. Zflg = 1, Sflg = 0, we conclude no match and return.
-     2c. Zflg = 1, Sflg = 1, length determines match or no match
-
- In the string comparison phase, the 1st fragment match is fixed up
- to produce ECX = 0. Subsequent fragment compare of nonzero index
- and no match conclude a false match.
-
-     case    ECX     CFlag   ZFlag   SFlag
-      3       X        1       0      0/1
-      4a      0        1       0       0
-      4b      0        1       0       1
-      4c    0 < X      1       0      0/1
-      5      16        0       1       0
-
- 3. An initial ordered-comparison fragment match, we fix up to do
- subsequent string comparison
- 4a. Continuation of fragment comparison of a string compare.
- 4b. EOS reached in the reference string, we conclude true match and
- return
- 4c. String compare failed if index is nonzero, we need to go back to
- scanning
-   5. Failed string compare, go back to scanning.
- */
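To make the case analysis above concrete, here is a minimal sketch of the
scanning phase using the same intrinsics (compiled with -msse4.2).  It
assumes page-safe 16-byte loads, which the removed code obtains via
__m128i_strloadu, and omits the EOS subtleties of cases 2a/2c:

#include <nmmintrin.h>

/* Sketch of the scanning phase for a needle of at most 16 bytes, using
   mode 0x0c = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ORDERED
   | _SIDD_LEAST_SIGNIFICANT.  */
static const char *
scan_phase_sketch (const char *s1, const char *s2)
{
  __m128i frag2 = _mm_loadu_si128 ((const __m128i *) s2);
  for (;;)
    {
      __m128i frag1 = _mm_loadu_si128 ((const __m128i *) s1);
      if (_mm_cmpistrc (frag2, frag1, 0x0c))   /* CFlag: fragment match */
        return s1 + _mm_cmpistri (frag2, frag1, 0x0c);
      if (_mm_cmpistrz (frag2, frag1, 0x0c))   /* case 2b: EOS, no match */
        return NULL;
      s1 += 16;                                /* case 1: keep scanning */
    }
}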
-
-#if !(defined USE_AS_STRCASESTR && defined STRCASESTR_NONASCII)
-/* Simple replacement for movdqu to address the 4KB boundary-cross issue.
-   If EOS occurs within the last 16B before a 4KB boundary, we don't
-   cross into the next page.  */
-static __m128i
-__m128i_strloadu (const unsigned char * p, __m128i zero)
-{
- if (__builtin_expect ((int) ((size_t) p & 0xfff) > 0xff0, 0))
- {
- size_t offset = ((size_t) p & (16 - 1));
- __m128i a = _mm_load_si128 ((__m128i *) (p - offset));
- int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (a, zero));
- if ((bmsk >> offset) != 0)
- return __m128i_shift_right (a, offset);
- }
- return _mm_loadu_si128 ((__m128i *) p);
-}
-#endif
-
-#if defined USE_AS_STRCASESTR && !defined STRCASESTR_NONASCII
-
-/* Similar to __m128i_strloadu.  Convert to lower case for the POSIX/C
-   locale and others that have single-byte letters only in the ASCII
-   range.  */
-static __m128i
-__m128i_strloadu_tolower (const unsigned char *p, __m128i zero, __m128i uclow,
- __m128i uchigh, __m128i lcqword)
-{
- __m128i frag = __m128i_strloadu (p, zero);
-
- /* Compare if 'Z' > bytes. Inverted way to get a mask for byte <= 'Z'. */
- __m128i r2 = _mm_cmpgt_epi8 (uchigh, frag);
- /* Compare if bytes are > 'A' - 1. */
- __m128i r1 = _mm_cmpgt_epi8 (frag, uclow);
- /* Mask byte == ff if byte(r2) <= 'Z' and byte(r1) > 'A' - 1. */
- __m128i mask = _mm_and_si128 (r2, r1);
- /* Apply lowercase bit 6 mask for above mask bytes == ff. */
- return _mm_or_si128 (frag, _mm_and_si128 (mask, lcqword));
-}
-
-#endif
-
-/* Calculate the Knuth-Morris-Pratt (KMP) string-searching overlap for a
-   fully populated 16B vector.
-   Input parameter: the first 16 bytes loaded from the reference string
-   of a strstr call.
-   We don't use the KMP algorithm if the reference string is shorter
-   than 16B.  */
-static int
-__inline__ __attribute__ ((__always_inline__))
-KMP16Bovrlap (__m128i s2)
-{
- __m128i b = _mm_unpacklo_epi8 (s2, s2);
- __m128i a = _mm_unpacklo_epi8 (b, b);
- a = _mm_shuffle_epi32 (a, 0);
- b = _mm_srli_si128 (s2, sizeof (char));
- int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (b, a));
-
- /* _BitScanForward(&k1, bmsk); */
- int k1;
- __asm ("bsfl %[bmsk], %[k1]" : [k1] "=r" (k1) : [bmsk] "r" (bmsk));
- if (!bmsk)
- return 16;
- else if (bmsk == 0x7fff)
- return 1;
- else if (!k1)
- {
-      /* There are at least two distinct chars in s2.  If bytes 0 and 1
-	 are identical and the distinct value lies farther down, we can
-	 deduce that the next byte offset at which to restart a full
-	 compare is no earlier than byte 3.  */
- return 3;
- }
- else
- {
- /* Byte 1 is not degenerated to byte 0. */
- return k1 + 1;
- }
-}
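The SIMD mask manipulation above reduces to a small scalar computation.  The
following sketch (hypothetical name; it assumes s2 holds at least 16 bytes,
as the caller guarantees) mirrors the four return paths:

/* Scalar equivalent of KMP16Bovrlap: a conservative forward shift
   derived from where s2[0] first repeats within s2[1..15].  */
static int
kmp16_overlap_sketch (const unsigned char *s2)
{
  unsigned int bmsk = 0;
  for (int i = 1; i < 16; i++)
    if (s2[i] == s2[0])
      bmsk |= 1u << (i - 1);    /* same bit layout as the pcmpeqb mask */
  if (bmsk == 0)
    return 16;                  /* s2[0] never repeats: full-fragment shift */
  if (bmsk == 0x7fff)
    return 1;                   /* all bytes identical: shift by one */
  int k1 = __builtin_ctz (bmsk);        /* the bsf in the removed code */
  return k1 == 0 ? 3 : k1 + 1;
}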
-
-char *
-__attribute__ ((section (".text.sse4.2")))
-STRSTR_SSE42 (const unsigned char *s1, const unsigned char *s2)
-{
-#define p1 s1
- const unsigned char *p2 = s2;
-
-#ifndef STRCASESTR_NONASCII
- if (__builtin_expect (p2[0] == '\0', 0))
- return (char *) p1;
-
- if (__builtin_expect (p1[0] == '\0', 0))
- return NULL;
-
-  /* Check whether p1 is exactly 1 byte long.  */
- if (__builtin_expect (p1[1] == '\0', 0))
- return p2[1] == '\0' && CMPBYTE (p1[0], p2[0]) ? (char *) p1 : NULL;
-#endif
-
-#ifdef USE_AS_STRCASESTR
-# ifndef STRCASESTR_NONASCII
- if (__builtin_expect (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_NONASCII_CASE)
- != 0, 0))
- return __strcasestr_sse42_nonascii (s1, s2);
-
- const __m128i uclow = _mm_set1_epi8 (0x40);
- const __m128i uchigh = _mm_set1_epi8 (0x5b);
- const __m128i lcqword = _mm_set1_epi8 (0x20);
- const __m128i zero = _mm_setzero_si128 ();
-# define strloadu(p) __m128i_strloadu_tolower (p, zero, uclow, uchigh, lcqword)
-# else
-# define strloadu __m128i_strloadu_tolower
-# define zero _mm_setzero_si128 ()
-# endif
-#else
-# define strloadu(p) __m128i_strloadu (p, zero)
- const __m128i zero = _mm_setzero_si128 ();
+/* Redefine strstr so that the compiler won't complain about the type
+ mismatch with the IFUNC selector in strong_alias, below. */
+#undef strstr
+#define strstr __redirect_strstr
+#include <string.h>
+#undef strstr
+
+#define STRSTR __strstr_sse2
+#ifdef SHARED
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(name) \
+ __hidden_ver1 (__strstr_sse2, __GI_strstr, __strstr_sse2);
#endif
- /* p1 > 1 byte long. Load up to 16 bytes of fragment. */
- __m128i frag1 = strloadu (p1);
-
- __m128i frag2;
- if (p2[1] != '\0')
- /* p2 is > 1 byte long. */
- frag2 = strloadu (p2);
- else
- frag2 = _mm_insert_epi8 (zero, LOADBYTE (p2[0]), 0);
-
-  /* Unsigned bytes, equal order, does frag2 have a null?  */
- int cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
- int cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
- int cmp = _mm_cmpistri (frag2, frag1, 0x0c);
- int cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
- if (cmp_s & cmp_c)
- {
- int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (frag2, zero));
- int len;
- __asm ("bsfl %[bmsk], %[len]"
- : [len] "=r" (len) : [bmsk] "r" (bmsk));
- p1 += cmp;
- if ((len + cmp) <= 16)
- return (char *) p1;
-
- /* Load up to 16 bytes of fragment. */
- frag1 = strloadu (p1);
- cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
- cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
- cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
- cmp = _mm_cmpistri (frag2, frag1, 0x0c);
- if ((len + cmp) <= 16)
- return (char *) p1 + cmp;
- }
-
- if (cmp_s)
- {
-      /* Adjust addr for 16B alignment in the ensuing loop.  */
- while (!cmp_z)
- {
- p1 += cmp;
- /* Load up to 16 bytes of fragment. */
- frag1 = strloadu (p1);
- cmp = _mm_cmpistri (frag2, frag1, 0x0c);
- cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
- cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
- /* Because s2 < 16 bytes and we adjusted p1 by non-zero cmp
- once already, this time cmp will be zero and we can exit. */
- if ((!cmp) & cmp_c)
- break;
- }
-
- if (!cmp_c)
- return NULL;
-
-      /* Since s2 is less than 16 bytes, cmp_c is the definitive
-	 determination of a full match.  */
- return (char *) p1 + cmp;
- }
-
-  /* General case: s2 is at least 16 bytes long.
-     First, the common case of a false match at the first byte of p2.  */
- const unsigned char *pt = NULL;
- int kmp_fwd = 0;
-re_trace:
- while (!cmp_c)
- {
- /* frag1 has null. */
- if (cmp_z)
- return NULL;
-
-      /* frag1 has no null, advance 16 bytes.  */
- p1 += 16;
- /* Load up to 16 bytes of fragment. */
- frag1 = strloadu (p1);
- /* Unsigned bytes, equal order, is there a partial match? */
- cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
- cmp = _mm_cmpistri (frag2, frag1, 0x0c);
- cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
- }
-
-  /* Next, handle an initial positive match at the first byte of p2.  We
-     have a partial fragment match; continue the full determination until
-     we reach the end of s2.  */
- if (!cmp)
- {
- if (cmp_z)
- return (char *) p1;
-
- pt = p1;
- p1 += 16;
- p2 += 16;
- /* Load up to 16 bytes of fragment. */
- frag2 = strloadu (p2);
- }
- else
- {
- /* Adjust 16B alignment. */
- p1 += cmp;
- pt = p1;
- }
-
- /* Load up to 16 bytes of fragment. */
- frag1 = strloadu (p1);
-
-  /* Unsigned bytes, equal order, does frag2 have a null?  */
- cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
- cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
- cmp = _mm_cmpistri (frag2, frag1, 0x0c);
- cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
- while (!(cmp | cmp_z | cmp_s))
- {
- p1 += 16;
- p2 += 16;
- /* Load up to 16 bytes of fragment. */
- frag2 = strloadu (p2);
- /* Load up to 16 bytes of fragment. */
- frag1 = strloadu (p1);
-      /* Unsigned bytes, equal order, does frag2 have a null?  */
- cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
- cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
- cmp = _mm_cmpistri (frag2, frag1, 0x0c);
- cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
- }
-
-  /* Full determination yielded a false result; retrace s1 to the next
-     starting position.
-     Zflg     1      0      1            0/1
-     Sflg     0      1      1            0/1
-     cmp      na     0      0            >0
-     action   done   done   continue     continue if s2 < s1
-              false  match  retrace s1   else false
-   */
-
- if (cmp_s & !cmp)
- return (char *) pt;
- if (cmp_z)
- {
- if (!cmp_s)
- return NULL;
-
-      /* Handle the case where both the zero and sign flags are set and
-	 s1 is the shorter string.  */
- int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag2));
- int bmsk1 = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag1));
- int len;
- int len1;
- __asm ("bsfl %[bmsk], %[len]"
- : [len] "=r" (len) : [bmsk] "r" (bmsk));
- __asm ("bsfl %[bmsk1], %[len1]"
- : [len1] "=r" (len1) : [bmsk1] "r" (bmsk1));
- if (len >= len1)
- return NULL;
- }
- else if (!cmp)
- return (char *) pt;
-
-  /* Otherwise, we have to retrace and continue; this is the default for
-     the several paths that must retrace from the next byte in s1.  */
- p2 = s2;
- frag2 = strloadu (p2);
-
- if (!kmp_fwd)
- kmp_fwd = KMP16Bovrlap (frag2);
+#include "string/strstr.c"
-  /* The KMP-predicted overlap needs to be corrected for the partial
-     fragment compare.  */
- p1 = pt + (kmp_fwd > cmp ? cmp : kmp_fwd);
+extern __typeof (__redirect_strstr) __strstr_sse2_unaligned attribute_hidden;
+extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden;
- /* Since s2 is at least 16 bytes long, we're certain there is no
- match. */
- if (p1[0] == '\0')
- return NULL;
+#include "init-arch.h"
- /* Load up to 16 bytes of fragment. */
- frag1 = strloadu (p1);
+/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
+ ifunc symbol properly. */
+extern __typeof (__redirect_strstr) __libc_strstr;
+libc_ifunc (__libc_strstr, HAS_FAST_UNALIGNED_LOAD ? __strstr_sse2_unaligned : __strstr_sse2)
- /* Unsigned bytes, equal order, is there a partial match? */
- cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
- cmp = _mm_cmpistri (frag2, frag1, 0x0c);
- cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
- goto re_trace;
-}
+#undef strstr
+strong_alias (__libc_strstr, strstr)
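For reference, the libc_ifunc selector above corresponds roughly to a plain
GCC ifunc attribute.  The following is a simplified sketch of the mechanism,
not the macro's actual expansion (it assumes init-arch.h for
HAS_FAST_UNALIGNED_LOAD and a target where GCC supports the ifunc attribute):

#include "init-arch.h"

extern char *__strstr_sse2 (const char *, const char *);
extern char *__strstr_sse2_unaligned (const char *, const char *);

/* The resolver runs once, during relocation, before strstr is first
   called; its result is patched into the PLT/GOT entry.  */
static char *(*strstr_resolver (void)) (const char *, const char *)
{
  return HAS_FAST_UNALIGNED_LOAD ? __strstr_sse2_unaligned : __strstr_sse2;
}

char *strstr (const char *, const char *)
     __attribute__ ((ifunc ("strstr_resolver")));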
diff --git a/sysdeps/x86_64/multiarch/test-multiarch.c b/sysdeps/x86_64/multiarch/test-multiarch.c
index 7ad7cca21e..0b144bc06d 100644
--- a/sysdeps/x86_64/multiarch/test-multiarch.c
+++ b/sysdeps/x86_64/multiarch/test-multiarch.c
@@ -1,6 +1,6 @@
/* Test CPU feature data.
This file is part of the GNU C Library.
- Copyright (C) 2012-2013 Free Software Foundation, Inc.
+ Copyright (C) 2012-2014 Free Software Foundation, Inc.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
diff --git a/sysdeps/x86_64/multiarch/varshift.c b/sysdeps/x86_64/multiarch/varshift.c
index cdb0efb187..9761fb20c3 100644
--- a/sysdeps/x86_64/multiarch/varshift.c
+++ b/sysdeps/x86_64/multiarch/varshift.c
@@ -1,5 +1,5 @@
/* Helper for variable shifts of SSE registers.
- Copyright (C) 2010-2013 Free Software Foundation, Inc.
+ Copyright (C) 2010-2014 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/multiarch/varshift.h b/sysdeps/x86_64/multiarch/varshift.h
index 5b7e910eb2..4436a605bd 100644
--- a/sysdeps/x86_64/multiarch/varshift.h
+++ b/sysdeps/x86_64/multiarch/varshift.h
@@ -1,5 +1,5 @@
/* Helper for variable shifts of SSE registers.
- Copyright (C) 2010-2013 Free Software Foundation, Inc.
+ Copyright (C) 2010-2014 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/multiarch/wcscpy-ssse3.S b/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
index b7de092228..c79389ec3b 100644
--- a/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
+++ b/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
@@ -1,5 +1,5 @@
/* wcscpy with SSSE3
- Copyright (C) 2011-2013 Free Software Foundation, Inc.
+ Copyright (C) 2011-2014 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
diff --git a/sysdeps/x86_64/multiarch/wcscpy.S b/sysdeps/x86_64/multiarch/wcscpy.S
index e5ac97e558..f12ba27d60 100644
--- a/sysdeps/x86_64/multiarch/wcscpy.S
+++ b/sysdeps/x86_64/multiarch/wcscpy.S
@@ -1,6 +1,6 @@
/* Multiple versions of wcscpy
All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2011-2013 Free Software Foundation, Inc.
+ Copyright (C) 2011-2014 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
diff --git a/sysdeps/x86_64/multiarch/wmemcmp.S b/sysdeps/x86_64/multiarch/wmemcmp.S
index f7c8040527..37b9bbaeea 100644
--- a/sysdeps/x86_64/multiarch/wmemcmp.S
+++ b/sysdeps/x86_64/multiarch/wmemcmp.S
@@ -1,6 +1,6 @@
/* Multiple versions of wmemcmp
All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2011-2013 Free Software Foundation, Inc.
+ Copyright (C) 2011-2014 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.