diff options
field | value | date |
---|---|---|
author | Andreas Schwab <schwab@redhat.com> | 2010-01-19 17:01:59 +0100 |
committer | Andreas Schwab <schwab@redhat.com> | 2010-01-20 17:35:37 +0100 |
commit | 7d16e875fe9092378b7d61d6c749fbb87e7d1e7e (patch) | |
tree | 422963be75b1fa594d0ea4c8f9698039be20c0af | |
parent | f0dfc72f9d9bae2e8b210113ca2c4a4428d4cb5b (diff) |
Revert "Optimize 32bit memset/memcpy with SSE2/SSSE3."
This reverts commit 3af48cbdfaeb8bc389de1caeb33bc29811da80e8.
36 files changed, 15 insertions, 6339 deletions
@@ -169,68 +169,6 @@ * posix/sched.h: Define time_t and pid_t for XPG7. -2010-01-12 H.J. Lu <hongjiu.lu@intel.com> - - * sysdeps/i386/i686/bcopy.S: New file. - - * sysdeps/i386/i686/cacheinfo.c (__x86_64_data_cache_size): Define. - - * sysdeps/i386/i686/memcpy.S (__memcpy_chk): Use ENTRY_CHK - and END_CHK. - * sysdeps/i386/i686/memmove.S (__memmove_chk): Likewise. - * sysdeps/i386/i686/mempcpy.S (__mempcpy_chk): Likewise. - * sysdeps/i386/i686/memset.S (__memset_chk): Likewise. - - * sysdeps/i386/i686/memmove.S: Support USE_AS_BCOPY. - - * sysdeps/i386/i686/multiarch/Makefile (sysdep_routines): Add - bzero-sse2 memset-sse2 memcpy-ssse3 mempcpy-ssse3 memmove-ssse3 - memcpy-ssse3-rep mempcpy-ssse3-rep memmove-ssse3-rep - bcopy-ssse3 bcopy-ssse3-rep memset-sse2-rep bzero-sse2-rep - * sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S: New file. - * sysdeps/i386/i686/multiarch/bcopy-ssse3.S: New file. - * sysdeps/i386/i686/multiarch/bcopy.S: New file. - * sysdeps/i386/i686/multiarch/bzero-sse2-rep.S: New file. - * sysdeps/i386/i686/multiarch/bzero-sse2.S: New file. - * sysdeps/i386/i686/multiarch/bzero.S: New file. - * sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S: New file. - * sysdeps/i386/i686/multiarch/memcpy-ssse3.S: New file. - * sysdeps/i386/i686/multiarch/memcpy.S: New file. - * sysdeps/i386/i686/multiarch/memcpy_chk.S: New file. - * sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S: New file. - * sysdeps/i386/i686/multiarch/memmove-ssse3.S: New file. - * sysdeps/i386/i686/multiarch/memmove.S: New file. - * sysdeps/i386/i686/multiarch/memmove_chk.S: New file. - * sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S: New file. - * sysdeps/i386/i686/multiarch/mempcpy-ssse3.S: New file. - * sysdeps/i386/i686/multiarch/mempcpy.S: New file. - * sysdeps/i386/i686/multiarch/mempcpy_chk.S: New file. - * sysdeps/i386/i686/multiarch/memset-sse2-rep.S: New file. - * sysdeps/i386/i686/multiarch/memset-sse2.S: New file. - * sysdeps/i386/i686/multiarch/memset.S: New file. 
- * sysdeps/i386/i686/multiarch/memset_chk.S: New file. - - * sysdeps/i386/sysdep.h (ENTRY_CHK): New. - (END_CHK): Likewise. - - * sysdeps/i386/i686/multiarch/ifunc-defines.sym: Add - FEATURE_OFFSET, FEATURE_SIZE and FEATURE_INDEX_1. - * sysdeps/x86_64/multiarch/ifunc-defines.sym: Likewise. - - * sysdeps/x86_64/cacheinfo.c (intel_02_cache_info): Add entries - for 0x0e and 0x80. - (__x86_64_data_cache_size): New. - (init_cacheinfo): Set __x86_64_data_cache_size. - - * sysdeps/x86_64/multiarch/init-arch.c (__init_cpu_features): Turn - on bit_Fast_Rep_String for Intel Core i7. - - * sysdeps/x86_64/multiarch/init-arch.h (bit_Fast_Rep_String): New. - (index_Fast_Rep_String): Likewise. - (FEATURE_INDEX_1): Likewise. - (FEATURE_INDEX_MAX): Likewise. - (cpu_features): Add feature. - 2010-01-12 Ulrich Drepper <drepper@redhat.com> * conform/data/sys/select.h-data: Fix up for XPG7. diff --git a/sysdeps/i386/i686/bcopy.S b/sysdeps/i386/i686/bcopy.S deleted file mode 100644 index 15ef9419a4..0000000000 --- a/sysdeps/i386/i686/bcopy.S +++ /dev/null @@ -1,3 +0,0 @@ -#define USE_AS_BCOPY -#define memmove bcopy -#include <sysdeps/i386/i686/memmove.S> diff --git a/sysdeps/i386/i686/cacheinfo.c b/sysdeps/i386/i686/cacheinfo.c index f8b7f521ca..82e4cd223e 100644 --- a/sysdeps/i386/i686/cacheinfo.c +++ b/sysdeps/i386/i686/cacheinfo.c @@ -1,4 +1,3 @@ -#define __x86_64_data_cache_size __x86_data_cache_size #define __x86_64_data_cache_size_half __x86_data_cache_size_half #define __x86_64_shared_cache_size __x86_shared_cache_size #define __x86_64_shared_cache_size_half __x86_shared_cache_size_half diff --git a/sysdeps/i386/i686/memcpy.S b/sysdeps/i386/i686/memcpy.S index 86ee082beb..0b2da1ea27 100644 --- a/sysdeps/i386/i686/memcpy.S +++ b/sysdeps/i386/i686/memcpy.S @@ -32,11 +32,11 @@ .text #if defined PIC && !defined NOT_IN_libc -ENTRY_CHK (__memcpy_chk) +ENTRY (__memcpy_chk) movl 12(%esp), %eax cmpl %eax, 16(%esp) jb HIDDEN_JUMPTARGET (__chk_fail) -END_CHK (__memcpy_chk) +END 
(__memcpy_chk) #endif ENTRY (BP_SYM (memcpy)) ENTER diff --git a/sysdeps/i386/i686/memmove.S b/sysdeps/i386/i686/memmove.S index 981f14f4e0..b93b5c729f 100644 --- a/sysdeps/i386/i686/memmove.S +++ b/sysdeps/i386/i686/memmove.S @@ -26,27 +26,18 @@ #define PARMS LINKAGE+4 /* one spilled register */ #define RTN PARMS +#define DEST RTN+RTN_SIZE +#define SRC DEST+PTR_SIZE +#define LEN SRC+PTR_SIZE .text - -#ifdef USE_AS_BCOPY -# define SRC RTN+RTN_SIZE -# define DEST SRC+PTR_SIZE -# define LEN DEST+PTR_SIZE -#else -# define DEST RTN+RTN_SIZE -# define SRC DEST+PTR_SIZE -# define LEN SRC+PTR_SIZE - -# if defined PIC && !defined NOT_IN_libc -ENTRY_CHK (__memmove_chk) +#if defined PIC && !defined NOT_IN_libc +ENTRY (__memmove_chk) movl 12(%esp), %eax cmpl %eax, 16(%esp) jb HIDDEN_JUMPTARGET (__chk_fail) -END_CHK (__memmove_chk) -# endif +END (__memmove_chk) #endif - ENTRY (BP_SYM (memmove)) ENTER @@ -78,10 +69,8 @@ ENTRY (BP_SYM (memmove)) movsl movl %edx, %esi cfi_restore (esi) -#ifndef USE_AS_BCOPY movl DEST(%esp), %eax RETURN_BOUNDED_POINTER (DEST(%esp)) -#endif popl %edi cfi_adjust_cfa_offset (-4) @@ -112,10 +101,8 @@ ENTRY (BP_SYM (memmove)) movsl movl %edx, %esi cfi_restore (esi) -#ifndef USE_AS_BCOPY movl DEST(%esp), %eax RETURN_BOUNDED_POINTER (DEST(%esp)) -#endif cld popl %edi @@ -125,6 +112,4 @@ ENTRY (BP_SYM (memmove)) LEAVE RET_PTR END (BP_SYM (memmove)) -#ifndef USE_AS_BCOPY libc_hidden_builtin_def (memmove) -#endif diff --git a/sysdeps/i386/i686/mempcpy.S b/sysdeps/i386/i686/mempcpy.S index c10686fb3d..6437e4a5d4 100644 --- a/sysdeps/i386/i686/mempcpy.S +++ b/sysdeps/i386/i686/mempcpy.S @@ -32,11 +32,11 @@ .text #if defined PIC && !defined NOT_IN_libc -ENTRY_CHK (__mempcpy_chk) +ENTRY (__mempcpy_chk) movl 12(%esp), %eax cmpl %eax, 16(%esp) jb HIDDEN_JUMPTARGET (__chk_fail) -END_CHK (__mempcpy_chk) +END (__mempcpy_chk) #endif ENTRY (BP_SYM (__mempcpy)) ENTER diff --git a/sysdeps/i386/i686/memset.S b/sysdeps/i386/i686/memset.S index b343af7b64..dfa1aa7019 
100644 --- a/sysdeps/i386/i686/memset.S +++ b/sysdeps/i386/i686/memset.S @@ -40,11 +40,11 @@ .text #if defined PIC && !defined NOT_IN_libc && !BZERO_P -ENTRY_CHK (__memset_chk) +ENTRY (__memset_chk) movl 12(%esp), %eax cmpl %eax, 16(%esp) jb HIDDEN_JUMPTARGET (__chk_fail) -END_CHK (__memset_chk) +END (__memset_chk) #endif ENTRY (BP_SYM (memset)) ENTER diff --git a/sysdeps/i386/i686/multiarch/Makefile b/sysdeps/i386/i686/multiarch/Makefile index fbad9ae734..e1553b284e 100644 --- a/sysdeps/i386/i686/multiarch/Makefile +++ b/sysdeps/i386/i686/multiarch/Makefile @@ -4,10 +4,6 @@ gen-as-const-headers += ifunc-defines.sym endif ifeq ($(subdir),string) -sysdep_routines += bzero-sse2 memset-sse2 memcpy-ssse3 mempcpy-ssse3 \ - memmove-ssse3 memcpy-ssse3-rep mempcpy-ssse3-rep \ - memmove-ssse3-rep bcopy-ssse3 bcopy-ssse3-rep \ - memset-sse2-rep bzero-sse2-rep ifeq (yes,$(config-cflags-sse4)) sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c CFLAGS-strcspn-c.c += -msse4 diff --git a/sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S b/sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S deleted file mode 100644 index cbc8b420e8..0000000000 --- a/sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S +++ /dev/null @@ -1,4 +0,0 @@ -#define USE_AS_MEMMOVE -#define USE_AS_BCOPY -#define MEMCPY __bcopy_ssse3_rep -#include "memcpy-ssse3-rep.S" diff --git a/sysdeps/i386/i686/multiarch/bcopy-ssse3.S b/sysdeps/i386/i686/multiarch/bcopy-ssse3.S deleted file mode 100644 index 36aac44b9c..0000000000 --- a/sysdeps/i386/i686/multiarch/bcopy-ssse3.S +++ /dev/null @@ -1,4 +0,0 @@ -#define USE_AS_MEMMOVE -#define USE_AS_BCOPY -#define MEMCPY __bcopy_ssse3 -#include "memcpy-ssse3.S" diff --git a/sysdeps/i386/i686/multiarch/bcopy.S b/sysdeps/i386/i686/multiarch/bcopy.S deleted file mode 100644 index 8671bf684e..0000000000 --- a/sysdeps/i386/i686/multiarch/bcopy.S +++ /dev/null @@ -1,89 +0,0 @@ -/* Multiple versions of bcopy - Copyright (C) 2010 Free Software Foundation, Inc. 
- Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ - -#include <sysdep.h> -#include <init-arch.h> - -/* Define multiple versions only for the definition in lib. */ -#ifndef NOT_IN_libc -# ifdef SHARED - .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits - .globl __i686.get_pc_thunk.bx - .hidden __i686.get_pc_thunk.bx - .p2align 4 - .type __i686.get_pc_thunk.bx,@function -__i686.get_pc_thunk.bx: - movl (%esp), %ebx - ret - - .text -ENTRY(bcopy) - .type bcopy, @gnu_indirect_function - pushl %ebx - cfi_adjust_cfa_offset (4) - cfi_rel_offset (ebx, 0) - call __i686.get_pc_thunk.bx - addl $_GLOBAL_OFFSET_TABLE_, %ebx - cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx) - jne 1f - call __init_cpu_features -1: leal __bcopy_ia32@GOTOFF(%ebx), %eax - testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx) - jz 2f - leal __bcopy_ssse3@GOTOFF(%ebx), %eax - testl $bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx) - jz 2f - leal __bcopy_ssse3_rep@GOTOFF(%ebx), %eax -2: popl %ebx - cfi_adjust_cfa_offset (-4) - cfi_restore (ebx) - ret -END(bcopy) -# else - .text -ENTRY(bcopy) - .type bcopy, @gnu_indirect_function - cmpl $0, KIND_OFFSET+__cpu_features - jne 1f - call 
__init_cpu_features -1: leal __bcopy_ia32, %eax - testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features - jz 2f - leal __bcopy_ssse3, %eax - testl $bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features - jz 2f - leal __bcopy_ssse3_rep, %eax -2: ret -END(bcopy) -# endif - -# undef ENTRY -# define ENTRY(name) \ - .type __bcopy_ia32, @function; \ - .p2align 4; \ - __bcopy_ia32: cfi_startproc; \ - CALL_MCOUNT -# undef END -# define END(name) \ - cfi_endproc; .size __bcopy_ia32, .-__bcopy_ia32 - -#endif - -#include "../bcopy.S" diff --git a/sysdeps/i386/i686/multiarch/bzero-sse2-rep.S b/sysdeps/i386/i686/multiarch/bzero-sse2-rep.S deleted file mode 100644 index 507b288bb3..0000000000 --- a/sysdeps/i386/i686/multiarch/bzero-sse2-rep.S +++ /dev/null @@ -1,3 +0,0 @@ -#define USE_AS_BZERO -#define __memset_sse2_rep __bzero_sse2_rep -#include "memset-sse2-rep.S" diff --git a/sysdeps/i386/i686/multiarch/bzero-sse2.S b/sysdeps/i386/i686/multiarch/bzero-sse2.S deleted file mode 100644 index 8d04512e4e..0000000000 --- a/sysdeps/i386/i686/multiarch/bzero-sse2.S +++ /dev/null @@ -1,3 +0,0 @@ -#define USE_AS_BZERO -#define __memset_sse2 __bzero_sse2 -#include "memset-sse2.S" diff --git a/sysdeps/i386/i686/multiarch/bzero.S b/sysdeps/i386/i686/multiarch/bzero.S deleted file mode 100644 index 8c740a42dc..0000000000 --- a/sysdeps/i386/i686/multiarch/bzero.S +++ /dev/null @@ -1,97 +0,0 @@ -/* Multiple versions of bzero - Copyright (C) 2010 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. 
- - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ - -#include <sysdep.h> -#include <init-arch.h> - -/* Define multiple versions only for the definition in lib. */ -#ifndef NOT_IN_libc -# ifdef SHARED - .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits - .globl __i686.get_pc_thunk.bx - .hidden __i686.get_pc_thunk.bx - .p2align 4 - .type __i686.get_pc_thunk.bx,@function -__i686.get_pc_thunk.bx: - movl (%esp), %ebx - ret - - .text -ENTRY(__bzero) - .type __bzero, @gnu_indirect_function - pushl %ebx - cfi_adjust_cfa_offset (4) - cfi_rel_offset (ebx, 0) - call __i686.get_pc_thunk.bx - addl $_GLOBAL_OFFSET_TABLE_, %ebx - cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx) - jne 1f - call __init_cpu_features -1: leal __bzero_ia32@GOTOFF(%ebx), %eax - testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx) - jz 2f - leal __bzero_sse2@GOTOFF(%ebx), %eax - testl $bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx) - jz 2f - leal __bzero_sse2_rep@GOTOFF(%ebx), %eax -2: popl %ebx - cfi_adjust_cfa_offset (-4) - cfi_restore (ebx) - ret -END(__bzero) -# else - .text -ENTRY(__bzero) - .type __bzero, @gnu_indirect_function - cmpl $0, KIND_OFFSET+__cpu_features - jne 1f - call __init_cpu_features -1: leal __bzero_ia32, %eax - testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features - jz 2f - leal __bzero_sse2, %eax - testl $bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features - jz 2f - leal __bzero_sse2_rep, %eax -2: ret -END(__bzero) -# endif - -# undef ENTRY -# define ENTRY(name) \ - 
.type __bzero_ia32, @function; \ - .p2align 4; \ - __bzero_ia32: cfi_startproc; \ - CALL_MCOUNT -# undef END -# define END(name) \ - cfi_endproc; .size __bzero_ia32, .-__bzero_ia32 - -# ifdef SHARED -# undef libc_hidden_builtin_def -/* IFUNC doesn't work with the hidden functions in shared library since - they will be called without setting up EBX needed for PLT which is - used by IFUNC. */ -# define libc_hidden_builtin_def(name) \ - .globl __GI___bzero; __GI___bzero = __bzero_ia32 -# endif -#endif - -#include "../bzero.S" diff --git a/sysdeps/i386/i686/multiarch/ifunc-defines.sym b/sysdeps/i386/i686/multiarch/ifunc-defines.sym index eb1538abcc..e2021cdf87 100644 --- a/sysdeps/i386/i686/multiarch/ifunc-defines.sym +++ b/sysdeps/i386/i686/multiarch/ifunc-defines.sym @@ -13,8 +13,5 @@ CPUID_ECX_OFFSET offsetof (struct cpuid_registers, ecx) CPUID_EDX_OFFSET offsetof (struct cpuid_registers, edx) FAMILY_OFFSET offsetof (struct cpu_features, family) MODEL_OFFSET offsetof (struct cpu_features, model) -FEATURE_OFFSET offsetof (struct cpu_features, feature) -FEATURE_SIZE sizeof (unsigned int) COMMON_CPUID_INDEX_1 -FEATURE_INDEX_1 diff --git a/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S b/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S deleted file mode 100644 index b26037d279..0000000000 --- a/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S +++ /dev/null @@ -1,1785 +0,0 @@ -/* memcpy with SSSE3 and REP string. - Copyright (C) 2010 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. 
- - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ - -#include <sysdep.h> - -#if !defined NOT_IN_libc \ - && (defined SHARED \ - || defined USE_AS_MEMMOVE \ - || !defined USE_MULTIARCH) - -#include "asm-syntax.h" - -#ifndef MEMCPY -# define MEMCPY __memcpy_ssse3_rep -# define MEMCPY_CHK __memcpy_chk_ssse3_rep -#endif - -#ifdef USE_AS_BCOPY -# define SRC PARMS -# define DEST SRC+4 -# define LEN DEST+4 -#else -# define DEST PARMS -# define SRC DEST+4 -# define LEN SRC+4 -#endif - -#define CFI_PUSH(REG) \ - cfi_adjust_cfa_offset (4); \ - cfi_rel_offset (REG, 0) - -#define CFI_POP(REG) \ - cfi_adjust_cfa_offset (-4); \ - cfi_restore (REG) - -#define PUSH(REG) pushl REG; CFI_PUSH (REG) -#define POP(REG) popl REG; CFI_POP (REG) - -#ifdef SHARED -# define PARMS 8 /* Preserve EBX. */ -# define ENTRANCE PUSH (%ebx); -# define RETURN_END POP (%ebx); ret -# define RETURN RETURN_END; CFI_PUSH (%ebx) -# define JMPTBL(I, B) I - B - -/* Load an entry in a jump table into EBX and branch to it. TABLE is a - jump table with relative offsets. INDEX is a register contains the - index into the jump table. SCALE is the scale of INDEX. */ -# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ - /* We first load PC into EBX. */ \ - call __i686.get_pc_thunk.bx; \ - /* Get the address of the jump table. */ \ - addl $(TABLE - .), %ebx; \ - /* Get the entry and convert the relative offset to the \ - absolute address. */ \ - addl (%ebx,INDEX,SCALE), %ebx; \ - /* We loaded the jump table. Go. 
*/ \ - jmp *%ebx - -# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE) \ - addl $(TABLE - .), %ebx - -# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \ - addl (%ebx,INDEX,SCALE), %ebx; \ - /* We loaded the jump table. Go. */ \ - jmp *%ebx - - .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits - .globl __i686.get_pc_thunk.bx - .hidden __i686.get_pc_thunk.bx - ALIGN (4) - .type __i686.get_pc_thunk.bx,@function -__i686.get_pc_thunk.bx: - movl (%esp), %ebx - ret -#else -# define PARMS 4 -# define ENTRANCE -# define RETURN_END ret -# define RETURN RETURN_END -# define JMPTBL(I, B) I - -/* Branch to an entry in a jump table. TABLE is a jump table with - absolute offsets. INDEX is a register contains the index into the - jump table. SCALE is the scale of INDEX. */ -# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ - jmp *TABLE(,INDEX,SCALE) - -# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE) - -# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \ - jmp *TABLE(,INDEX,SCALE) -#endif - - .section .text.ssse3,"ax",@progbits -#if defined SHARED && !defined NOT_IN_libc && !defined USE_AS_BCOPY -ENTRY (MEMCPY_CHK) - movl 12(%esp), %eax - cmpl %eax, 16(%esp) - jb HIDDEN_JUMPTARGET (__chk_fail) -END (MEMCPY_CHK) -#endif -ENTRY (MEMCPY) - ENTRANCE - movl LEN(%esp), %ecx - movl SRC(%esp), %eax - movl DEST(%esp), %edx - -#ifdef USE_AS_MEMMOVE - cmp %eax, %edx - jb L(copy_forward) - je L(fwd_write_0bytes) - cmp $32, %ecx - jge L(memmove_bwd) - jmp L(bk_write_less32bytes_2) -L(memmove_bwd): - add %ecx, %eax - cmp %eax, %edx - movl SRC(%esp), %eax - jb L(copy_backward) - -L(copy_forward): -#endif - cmp $48, %ecx - jge L(48bytesormore) - -L(fwd_write_less32bytes): -#ifndef USE_AS_MEMMOVE - cmp %dl, %al - jl L(bk_write) -#endif - add %ecx, %edx - add %ecx, %eax - BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) -#ifndef USE_AS_MEMMOVE -L(bk_write): - BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4) -#endif - - ALIGN (4) -/* ECX > 32 and EDX is 4 byte 
aligned. */ -L(48bytesormore): - movdqu (%eax), %xmm0 - PUSH (%edi) - movl %edx, %edi - and $-16, %edx - PUSH (%esi) - add $16, %edx - movl %edi, %esi - sub %edx, %edi - add %edi, %ecx - sub %edi, %eax - -#ifdef SHARED_CACHE_SIZE_HALF - cmp $SHARED_CACHE_SIZE_HALF, %ecx -#else -# ifdef SHARED - call __i686.get_pc_thunk.bx - add $_GLOBAL_OFFSET_TABLE_, %ebx - cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx -# else - cmp __x86_shared_cache_size_half, %ecx -# endif -#endif - - mov %eax, %edi - jge L(large_page) - and $0xf, %edi - jz L(shl_0) - - BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4) - - ALIGN (4) -L(shl_0): - movdqu %xmm0, (%esi) - xor %edi, %edi - cmp $127, %ecx - ja L(shl_0_gobble) - lea -32(%ecx), %ecx -L(shl_0_loop): - movdqa (%eax, %edi), %xmm0 - movdqa 16(%eax, %edi), %xmm1 - sub $32, %ecx - movdqa %xmm0, (%edx, %edi) - movdqa %xmm1, 16(%edx, %edi) - lea 32(%edi), %edi - jl L(shl_0_end) - - movdqa (%eax, %edi), %xmm0 - movdqa 16(%eax, %edi), %xmm1 - sub $32, %ecx - movdqa %xmm0, (%edx, %edi) - movdqa %xmm1, 16(%edx, %edi) - lea 32(%edi), %edi - jl L(shl_0_end) - - movdqa (%eax, %edi), %xmm0 - movdqa 16(%eax, %edi), %xmm1 - sub $32, %ecx - movdqa %xmm0, (%edx, %edi) - movdqa %xmm1, 16(%edx, %edi) - lea 32(%edi), %edi - jl L(shl_0_end) - - movdqa (%eax, %edi), %xmm0 - movdqa 16(%eax, %edi), %xmm1 - sub $32, %ecx - movdqa %xmm0, (%edx, %edi) - movdqa %xmm1, 16(%edx, %edi) - lea 32(%edi), %edi -L(shl_0_end): - lea 32(%ecx), %ecx - add %ecx, %edi - add %edi, %edx - add %edi, %eax - POP (%esi) - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) - -L(shl_0_gobble): - -#ifdef DATA_CACHE_SIZE_HALF - cmp $DATA_CACHE_SIZE_HALF, %ecx -#else -# ifdef SHARED - call __i686.get_pc_thunk.bx - add $_GLOBAL_OFFSET_TABLE_, %ebx - mov __x86_data_cache_size_half@GOTOFF(%ebx), %edi -# else - mov __x86_data_cache_size_half, %edi -# endif -#endif - mov %edi, %esi - shr $3, %esi - sub %esi, %edi - cmp %edi, %ecx - jge L(shl_0_gobble_mem_start) - lea 
-128(%ecx), %ecx - ALIGN (4) -L(shl_0_gobble_cache_loop): - movdqa (%eax), %xmm0 - movaps 0x10(%eax), %xmm1 - movaps 0x20(%eax), %xmm2 - movaps 0x30(%eax), %xmm3 - movaps 0x40(%eax), %xmm4 - movaps 0x50(%eax), %xmm5 - movaps 0x60(%eax), %xmm6 - movaps 0x70(%eax), %xmm7 - lea 0x80(%eax), %eax - sub $128, %ecx - movdqa %xmm0, (%edx) - movaps %xmm1, 0x10(%edx) - movaps %xmm2, 0x20(%edx) - movaps %xmm3, 0x30(%edx) - movaps %xmm4, 0x40(%edx) - movaps %xmm5, 0x50(%edx) - movaps %xmm6, 0x60(%edx) - movaps %xmm7, 0x70(%edx) - lea 0x80(%edx), %edx - - jge L(shl_0_gobble_cache_loop) -L(shl_0_gobble_cache_loop_tail): - cmp $-0x40, %ecx - lea 0x80(%ecx), %ecx - jl L(shl_0_cache_less_64bytes) - - movdqa (%eax), %xmm0 - sub $0x40, %ecx - movdqa 0x10(%eax), %xmm1 - - movdqa %xmm0, (%edx) - movdqa %xmm1, 0x10(%edx) - - movdqa 0x20(%eax), %xmm0 - movdqa 0x30(%eax), %xmm1 - add $0x40, %eax - - movdqa %xmm0, 0x20(%edx) - movdqa %xmm1, 0x30(%edx) - add $0x40, %edx -L(shl_0_cache_less_64bytes): - cmp $0x20, %ecx - jl L(shl_0_cache_less_32bytes) - movdqa (%eax), %xmm0 - sub $0x20, %ecx - movdqa 0x10(%eax), %xmm1 - add $0x20, %eax - movdqa %xmm0, (%edx) - movdqa %xmm1, 0x10(%edx) - add $0x20, %edx -L(shl_0_cache_less_32bytes): - cmp $0x10, %ecx - jl L(shl_0_cache_less_16bytes) - sub $0x10, %ecx - movdqa (%eax), %xmm0 - add $0x10, %eax - movdqa %xmm0, (%edx) - add $0x10, %edx -L(shl_0_cache_less_16bytes): - add %ecx, %edx - add %ecx, %eax - POP (%esi) - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) - - - ALIGN (4) -L(shl_0_gobble_mem_start): - cmp %al, %dl - je L(copy_page_by_rep) - lea -128(%ecx), %ecx -L(shl_0_gobble_mem_loop): - prefetchnta 0x1c0(%eax) - prefetchnta 0x280(%eax) - prefetchnta 0x1c0(%edx) - prefetchnta 0x280(%edx) - - movdqa (%eax), %xmm0 - movaps 0x10(%eax), %xmm1 - movaps 0x20(%eax), %xmm2 - movaps 0x30(%eax), %xmm3 - movaps 0x40(%eax), %xmm4 - movaps 0x50(%eax), %xmm5 - movaps 0x60(%eax), %xmm6 - movaps 0x70(%eax), %xmm7 - lea 0x80(%eax), %eax - 
sub $0x80, %ecx - movdqa %xmm0, (%edx) - movaps %xmm1, 0x10(%edx) - movaps %xmm2, 0x20(%edx) - movaps %xmm3, 0x30(%edx) - movaps %xmm4, 0x40(%edx) - movaps %xmm5, 0x50(%edx) - movaps %xmm6, 0x60(%edx) - movaps %xmm7, 0x70(%edx) - lea 0x80(%edx), %edx - - jge L(shl_0_gobble_mem_loop) - cmp $-0x40, %ecx - lea 0x80(%ecx), %ecx - jl L(shl_0_mem_less_64bytes) - - movdqa (%eax), %xmm0 - sub $0x40, %ecx - movdqa 0x10(%eax), %xmm1 - - movdqa %xmm0, (%edx) - movdqa %xmm1, 0x10(%edx) - - movdqa 0x20(%eax), %xmm0 - movdqa 0x30(%eax), %xmm1 - add $0x40, %eax - - movdqa %xmm0, 0x20(%edx) - movdqa %xmm1, 0x30(%edx) - add $0x40, %edx -L(shl_0_mem_less_64bytes): - cmp $0x20, %ecx - jl L(shl_0_mem_less_32bytes) - movdqa (%eax), %xmm0 - sub $0x20, %ecx - movdqa 0x10(%eax), %xmm1 - add $0x20, %eax - movdqa %xmm0, (%edx) - movdqa %xmm1, 0x10(%edx) - add $0x20, %edx -L(shl_0_mem_less_32bytes): - cmp $0x10, %ecx - jl L(shl_0_mem_less_16bytes) - sub $0x10, %ecx - movdqa (%eax), %xmm0 - add $0x10, %eax - movdqa %xmm0, (%edx) - add $0x10, %edx -L(shl_0_mem_less_16bytes): - add %ecx, %edx - add %ecx, %eax - POP (%esi) - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) - - - ALIGN (4) -L(shl_1): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -1(%eax), %eax - movaps (%eax), %xmm1 - xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_1_loop): - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $1, %xmm2, %xmm3 - palignr $1, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jl L(shl_1_end) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $1, %xmm2, %xmm3 - palignr $1, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jae L(shl_1_loop) - -L(shl_1_end): - lea 32(%ecx), %ecx - add %ecx, %edi - add %edi, %edx - lea 
1(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) - - ALIGN (4) -L(shl_2): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -2(%eax), %eax - movaps (%eax), %xmm1 - xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_2_loop): - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $2, %xmm2, %xmm3 - palignr $2, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jl L(shl_2_end) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $2, %xmm2, %xmm3 - palignr $2, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jae L(shl_2_loop) - -L(shl_2_end): - lea 32(%ecx), %ecx - add %ecx, %edi - add %edi, %edx - lea 2(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) - - ALIGN (4) -L(shl_3): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -3(%eax), %eax - movaps (%eax), %xmm1 - xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_3_loop): - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $3, %xmm2, %xmm3 - palignr $3, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jl L(shl_3_end) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $3, %xmm2, %xmm3 - palignr $3, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jae L(shl_3_loop) - -L(shl_3_end): - lea 32(%ecx), %ecx - add %ecx, %edi - add %edi, %edx - lea 3(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) - - ALIGN (4) -L(shl_4): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea 
-4(%eax), %eax - movaps (%eax), %xmm1 - xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_4_loop): - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $4, %xmm2, %xmm3 - palignr $4, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jl L(shl_4_end) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $4, %xmm2, %xmm3 - palignr $4, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jae L(shl_4_loop) - -L(shl_4_end): - lea 32(%ecx), %ecx - add %ecx, %edi - add %edi, %edx - lea 4(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) - - ALIGN (4) -L(shl_5): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -5(%eax), %eax - movaps (%eax), %xmm1 - xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_5_loop): - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $5, %xmm2, %xmm3 - palignr $5, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jl L(shl_5_end) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $5, %xmm2, %xmm3 - palignr $5, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jae L(shl_5_loop) - -L(shl_5_end): - lea 32(%ecx), %ecx - add %ecx, %edi - add %edi, %edx - lea 5(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) - - - ALIGN (4) -L(shl_6): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -6(%eax), %eax - movaps (%eax), %xmm1 - xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_6_loop): - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - 
movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $6, %xmm2, %xmm3 - palignr $6, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jl L(shl_6_end) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $6, %xmm2, %xmm3 - palignr $6, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jae L(shl_6_loop) - -L(shl_6_end): - lea 32(%ecx), %ecx - add %ecx, %edi - add %edi, %edx - lea 6(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) - - ALIGN (4) -L(shl_7): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -7(%eax), %eax - movaps (%eax), %xmm1 - xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_7_loop): - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $7, %xmm2, %xmm3 - palignr $7, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jl L(shl_7_end) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $7, %xmm2, %xmm3 - palignr $7, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jae L(shl_7_loop) - -L(shl_7_end): - lea 32(%ecx), %ecx - add %ecx, %edi - add %edi, %edx - lea 7(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) - - ALIGN (4) -L(shl_8): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -8(%eax), %eax - movaps (%eax), %xmm1 - xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_8_loop): - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $8, %xmm2, %xmm3 - palignr $8, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, 
-16(%edx, %edi) - - jl L(shl_8_end) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $8, %xmm2, %xmm3 - palignr $8, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jae L(shl_8_loop) - -L(shl_8_end): - lea 32(%ecx), %ecx - add %ecx, %edi - add %edi, %edx - lea 8(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) - - ALIGN (4) -L(shl_9): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -9(%eax), %eax - movaps (%eax), %xmm1 - xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_9_loop): - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $9, %xmm2, %xmm3 - palignr $9, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jl L(shl_9_end) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $9, %xmm2, %xmm3 - palignr $9, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jae L(shl_9_loop) - -L(shl_9_end): - lea 32(%ecx), %ecx - add %ecx, %edi - add %edi, %edx - lea 9(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) - - ALIGN (4) -L(shl_10): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -10(%eax), %eax - movaps (%eax), %xmm1 - xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_10_loop): - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $10, %xmm2, %xmm3 - palignr $10, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jl L(shl_10_end) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $10, %xmm2, %xmm3 - palignr 
$10, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jae L(shl_10_loop) - -L(shl_10_end): - lea 32(%ecx), %ecx - add %ecx, %edi - add %edi, %edx - lea 10(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) - - ALIGN (4) -L(shl_11): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -11(%eax), %eax - movaps (%eax), %xmm1 - xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_11_loop): - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $11, %xmm2, %xmm3 - palignr $11, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jl L(shl_11_end) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $11, %xmm2, %xmm3 - palignr $11, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jae L(shl_11_loop) - -L(shl_11_end): - lea 32(%ecx), %ecx - add %ecx, %edi - add %edi, %edx - lea 11(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) - - ALIGN (4) -L(shl_12): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -12(%eax), %eax - movaps (%eax), %xmm1 - xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_12_loop): - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $12, %xmm2, %xmm3 - palignr $12, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jl L(shl_12_end) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $12, %xmm2, %xmm3 - palignr $12, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jae L(shl_12_loop) - -L(shl_12_end): - lea 32(%ecx), 
%ecx - add %ecx, %edi - add %edi, %edx - lea 12(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) - - ALIGN (4) -L(shl_13): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -13(%eax), %eax - movaps (%eax), %xmm1 - xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_13_loop): - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $13, %xmm2, %xmm3 - palignr $13, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jl L(shl_13_end) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $13, %xmm2, %xmm3 - palignr $13, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jae L(shl_13_loop) - -L(shl_13_end): - lea 32(%ecx), %ecx - add %ecx, %edi - add %edi, %edx - lea 13(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) - - ALIGN (4) -L(shl_14): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -14(%eax), %eax - movaps (%eax), %xmm1 - xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_14_loop): - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $14, %xmm2, %xmm3 - palignr $14, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jl L(shl_14_end) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $14, %xmm2, %xmm3 - palignr $14, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jae L(shl_14_loop) - -L(shl_14_end): - lea 32(%ecx), %ecx - add %ecx, %edi - add %edi, %edx - lea 14(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) - - - ALIGN (4) 
-L(shl_15): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -15(%eax), %eax - movaps (%eax), %xmm1 - xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_15_loop): - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $15, %xmm2, %xmm3 - palignr $15, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jl L(shl_15_end) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $15, %xmm2, %xmm3 - palignr $15, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jae L(shl_15_loop) - -L(shl_15_end): - lea 32(%ecx), %ecx - add %ecx, %edi - add %edi, %edx - lea 15(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) - - - ALIGN (4) -L(fwd_write_44bytes): - movl -44(%eax), %ecx - movl %ecx, -44(%edx) -L(fwd_write_40bytes): - movl -40(%eax), %ecx - movl %ecx, -40(%edx) -L(fwd_write_36bytes): - movl -36(%eax), %ecx - movl %ecx, -36(%edx) -L(fwd_write_32bytes): - movl -32(%eax), %ecx - movl %ecx, -32(%edx) -L(fwd_write_28bytes): - movl -28(%eax), %ecx - movl %ecx, -28(%edx) -L(fwd_write_24bytes): - movl -24(%eax), %ecx - movl %ecx, -24(%edx) -L(fwd_write_20bytes): - movl -20(%eax), %ecx - movl %ecx, -20(%edx) -L(fwd_write_16bytes): - movl -16(%eax), %ecx - movl %ecx, -16(%edx) -L(fwd_write_12bytes): - movl -12(%eax), %ecx - movl %ecx, -12(%edx) -L(fwd_write_8bytes): - movl -8(%eax), %ecx - movl %ecx, -8(%edx) -L(fwd_write_4bytes): - movl -4(%eax), %ecx - movl %ecx, -4(%edx) -L(fwd_write_0bytes): -#ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY - movl %edx, %eax -# else - movl DEST(%esp), %eax -# endif -#endif - RETURN - - ALIGN (4) -L(fwd_write_5bytes): - movl -5(%eax), %ecx - movl -4(%eax), %eax - movl %ecx, -5(%edx) - movl %eax, -4(%edx) -#ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY - movl 
%edx, %eax -# else - movl DEST(%esp), %eax -# endif -#endif - RETURN - - ALIGN (4) -L(fwd_write_45bytes): - movl -45(%eax), %ecx - movl %ecx, -45(%edx) -L(fwd_write_41bytes): - movl -41(%eax), %ecx - movl %ecx, -41(%edx) -L(fwd_write_37bytes): - movl -37(%eax), %ecx - movl %ecx, -37(%edx) -L(fwd_write_33bytes): - movl -33(%eax), %ecx - movl %ecx, -33(%edx) -L(fwd_write_29bytes): - movl -29(%eax), %ecx - movl %ecx, -29(%edx) -L(fwd_write_25bytes): - movl -25(%eax), %ecx - movl %ecx, -25(%edx) -L(fwd_write_21bytes): - movl -21(%eax), %ecx - movl %ecx, -21(%edx) -L(fwd_write_17bytes): - movl -17(%eax), %ecx - movl %ecx, -17(%edx) -L(fwd_write_13bytes): - movl -13(%eax), %ecx - movl %ecx, -13(%edx) -L(fwd_write_9bytes): - movl -9(%eax), %ecx - movl %ecx, -9(%edx) - movl -5(%eax), %ecx - movl %ecx, -5(%edx) -L(fwd_write_1bytes): - movzbl -1(%eax), %ecx - movb %cl, -1(%edx) -#ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY - movl %edx, %eax -# else - movl DEST(%esp), %eax -# endif -#endif - RETURN - - ALIGN (4) -L(fwd_write_46bytes): - movl -46(%eax), %ecx - movl %ecx, -46(%edx) -L(fwd_write_42bytes): - movl -42(%eax), %ecx - movl %ecx, -42(%edx) -L(fwd_write_38bytes): - movl -38(%eax), %ecx - movl %ecx, -38(%edx) -L(fwd_write_34bytes): - movl -34(%eax), %ecx - movl %ecx, -34(%edx) -L(fwd_write_30bytes): - movl -30(%eax), %ecx - movl %ecx, -30(%edx) -L(fwd_write_26bytes): - movl -26(%eax), %ecx - movl %ecx, -26(%edx) -L(fwd_write_22bytes): - movl -22(%eax), %ecx - movl %ecx, -22(%edx) -L(fwd_write_18bytes): - movl -18(%eax), %ecx - movl %ecx, -18(%edx) -L(fwd_write_14bytes): - movl -14(%eax), %ecx - movl %ecx, -14(%edx) -L(fwd_write_10bytes): - movl -10(%eax), %ecx - movl %ecx, -10(%edx) -L(fwd_write_6bytes): - movl -6(%eax), %ecx - movl %ecx, -6(%edx) -L(fwd_write_2bytes): - movzwl -2(%eax), %ecx - movw %cx, -2(%edx) -#ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY - movl %edx, %eax -# else - movl DEST(%esp), %eax -# endif -#endif - RETURN - - ALIGN (4) -L(fwd_write_47bytes): 
- movl -47(%eax), %ecx - movl %ecx, -47(%edx) -L(fwd_write_43bytes): - movl -43(%eax), %ecx - movl %ecx, -43(%edx) -L(fwd_write_39bytes): - movl -39(%eax), %ecx - movl %ecx, -39(%edx) -L(fwd_write_35bytes): - movl -35(%eax), %ecx - movl %ecx, -35(%edx) -L(fwd_write_31bytes): - movl -31(%eax), %ecx - movl %ecx, -31(%edx) -L(fwd_write_27bytes): - movl -27(%eax), %ecx - movl %ecx, -27(%edx) -L(fwd_write_23bytes): - movl -23(%eax), %ecx - movl %ecx, -23(%edx) -L(fwd_write_19bytes): - movl -19(%eax), %ecx - movl %ecx, -19(%edx) -L(fwd_write_15bytes): - movl -15(%eax), %ecx - movl %ecx, -15(%edx) -L(fwd_write_11bytes): - movl -11(%eax), %ecx - movl %ecx, -11(%edx) -L(fwd_write_7bytes): - movl -7(%eax), %ecx - movl %ecx, -7(%edx) -L(fwd_write_3bytes): - movzwl -3(%eax), %ecx - movzbl -1(%eax), %eax - movw %cx, -3(%edx) - movb %al, -1(%edx) -#ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY - movl %edx, %eax -# else - movl DEST(%esp), %eax -# endif -#endif - RETURN - - ALIGN (4) -L(large_page): - movdqu (%eax), %xmm1 - lea 16(%eax), %eax - movdqu %xmm0, (%esi) - movntdq %xmm1, (%edx) - lea 16(%edx), %edx - cmp %al, %dl - je L(copy_page_by_rep) -L(large_page_loop_init): - POP (%esi) - lea -0x90(%ecx), %ecx - POP (%edi) -L(large_page_loop): - prefetchnta 0x1c0(%eax) - prefetchnta 0x280(%eax) - movdqu (%eax), %xmm0 - movdqu 0x10(%eax), %xmm1 - movdqu 0x20(%eax), %xmm2 - movdqu 0x30(%eax), %xmm3 - movdqu 0x40(%eax), %xmm4 - movdqu 0x50(%eax), %xmm5 - movdqu 0x60(%eax), %xmm6 - movdqu 0x70(%eax), %xmm7 - lea 0x80(%eax), %eax - lfence - sub $0x80, %ecx - movntdq %xmm0, (%edx) - movntdq %xmm1, 0x10(%edx) - movntdq %xmm2, 0x20(%edx) - movntdq %xmm3, 0x30(%edx) - movntdq %xmm4, 0x40(%edx) - movntdq %xmm5, 0x50(%edx) - movntdq %xmm6, 0x60(%edx) - movntdq %xmm7, 0x70(%edx) - lea 0x80(%edx), %edx - jae L(large_page_loop) - cmp $-0x40, %ecx - lea 0x80(%ecx), %ecx - jl L(large_page_less_64bytes) - - movdqu (%eax), %xmm0 - movdqu 0x10(%eax), %xmm1 - movdqu 0x20(%eax), %xmm2 - movdqu 
0x30(%eax), %xmm3 - lea 0x40(%eax), %eax - - movntdq %xmm0, (%edx) - movntdq %xmm1, 0x10(%edx) - movntdq %xmm2, 0x20(%edx) - movntdq %xmm3, 0x30(%edx) - lea 0x40(%edx), %edx - sub $0x40, %ecx -L(large_page_less_64bytes): - cmp $32, %ecx - jl L(large_page_less_32bytes) - movdqu (%eax), %xmm0 - movdqu 0x10(%eax), %xmm1 - lea 0x20(%eax), %eax - movntdq %xmm0, (%edx) - movntdq %xmm1, 0x10(%edx) - lea 0x20(%edx), %edx - sub $0x20, %ecx -L(large_page_less_32bytes): - add %ecx, %edx - add %ecx, %eax - sfence - BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) - - ALIGN (4) -L(copy_page_by_rep): - mov %eax, %esi - mov %edx, %edi - mov %ecx, %edx - shr $2, %ecx - and $3, %edx - rep movsl - jz L(copy_page_by_rep_exit) - cmp $2, %edx - jb L(copy_page_by_rep_left_1) - movzwl (%esi), %eax - movw %ax, (%edi) - add $2, %esi - add $2, %edi - sub $2, %edx - jz L(copy_page_by_rep_exit) -L(copy_page_by_rep_left_1): - movzbl (%esi), %eax - movb %al, (%edi) -L(copy_page_by_rep_exit): - POP (%esi) - POP (%edi) -#ifndef USE_AS_BCOPY - movl DEST(%esp), %eax -# ifdef USE_AS_MEMPCPY - movl LEN(%esp), %ecx - add %ecx, %eax -# endif -#endif - RETURN - - ALIGN (4) -L(bk_write_44bytes): - movl 40(%eax), %ecx - movl %ecx, 40(%edx) -L(bk_write_40bytes): - movl 36(%eax), %ecx - movl %ecx, 36(%edx) -L(bk_write_36bytes): - movl 32(%eax), %ecx - movl %ecx, 32(%edx) -L(bk_write_32bytes): - movl 28(%eax), %ecx - movl %ecx, 28(%edx) -L(bk_write_28bytes): - movl 24(%eax), %ecx - movl %ecx, 24(%edx) -L(bk_write_24bytes): - movl 20(%eax), %ecx - movl %ecx, 20(%edx) -L(bk_write_20bytes): - movl 16(%eax), %ecx - movl %ecx, 16(%edx) -L(bk_write_16bytes): - movl 12(%eax), %ecx - movl %ecx, 12(%edx) -L(bk_write_12bytes): - movl 8(%eax), %ecx - movl %ecx, 8(%edx) -L(bk_write_8bytes): - movl 4(%eax), %ecx - movl %ecx, 4(%edx) -L(bk_write_4bytes): - movl (%eax), %ecx - movl %ecx, (%edx) -L(bk_write_0bytes): -#ifndef USE_AS_BCOPY - movl DEST(%esp), %eax -# ifdef USE_AS_MEMPCPY - movl LEN(%esp), %ecx - add 
%ecx, %eax -# endif -#endif - RETURN - - ALIGN (4) -L(bk_write_45bytes): - movl 41(%eax), %ecx - movl %ecx, 41(%edx) -L(bk_write_41bytes): - movl 37(%eax), %ecx - movl %ecx, 37(%edx) -L(bk_write_37bytes): - movl 33(%eax), %ecx - movl %ecx, 33(%edx) -L(bk_write_33bytes): - movl 29(%eax), %ecx - movl %ecx, 29(%edx) -L(bk_write_29bytes): - movl 25(%eax), %ecx - movl %ecx, 25(%edx) -L(bk_write_25bytes): - movl 21(%eax), %ecx - movl %ecx, 21(%edx) -L(bk_write_21bytes): - movl 17(%eax), %ecx - movl %ecx, 17(%edx) -L(bk_write_17bytes): - movl 13(%eax), %ecx - movl %ecx, 13(%edx) -L(bk_write_13bytes): - movl 9(%eax), %ecx - movl %ecx, 9(%edx) -L(bk_write_9bytes): - movl 5(%eax), %ecx - movl %ecx, 5(%edx) -L(bk_write_5bytes): - movl 1(%eax), %ecx - movl %ecx, 1(%edx) -L(bk_write_1bytes): - movzbl (%eax), %ecx - movb %cl, (%edx) -#ifndef USE_AS_BCOPY - movl DEST(%esp), %eax -# ifdef USE_AS_MEMPCPY - movl LEN(%esp), %ecx - add %ecx, %eax -# endif -#endif - RETURN - - ALIGN (4) -L(bk_write_46bytes): - movl 42(%eax), %ecx - movl %ecx, 42(%edx) -L(bk_write_42bytes): - movl 38(%eax), %ecx - movl %ecx, 38(%edx) -L(bk_write_38bytes): - movl 34(%eax), %ecx - movl %ecx, 34(%edx) -L(bk_write_34bytes): - movl 30(%eax), %ecx - movl %ecx, 30(%edx) -L(bk_write_30bytes): - movl 26(%eax), %ecx - movl %ecx, 26(%edx) -L(bk_write_26bytes): - movl 22(%eax), %ecx - movl %ecx, 22(%edx) -L(bk_write_22bytes): - movl 18(%eax), %ecx - movl %ecx, 18(%edx) -L(bk_write_18bytes): - movl 14(%eax), %ecx - movl %ecx, 14(%edx) -L(bk_write_14bytes): - movl 10(%eax), %ecx - movl %ecx, 10(%edx) -L(bk_write_10bytes): - movl 6(%eax), %ecx - movl %ecx, 6(%edx) -L(bk_write_6bytes): - movl 2(%eax), %ecx - movl %ecx, 2(%edx) -L(bk_write_2bytes): - movzwl (%eax), %ecx - movw %cx, (%edx) -#ifndef USE_AS_BCOPY - movl DEST(%esp), %eax -# ifdef USE_AS_MEMPCPY - movl LEN(%esp), %ecx - add %ecx, %eax -# endif -#endif - RETURN - - ALIGN (4) -L(bk_write_47bytes): - movl 43(%eax), %ecx - movl %ecx, 43(%edx) 
-L(bk_write_43bytes): - movl 39(%eax), %ecx - movl %ecx, 39(%edx) -L(bk_write_39bytes): - movl 35(%eax), %ecx - movl %ecx, 35(%edx) -L(bk_write_35bytes): - movl 31(%eax), %ecx - movl %ecx, 31(%edx) -L(bk_write_31bytes): - movl 27(%eax), %ecx - movl %ecx, 27(%edx) -L(bk_write_27bytes): - movl 23(%eax), %ecx - movl %ecx, 23(%edx) -L(bk_write_23bytes): - movl 19(%eax), %ecx - movl %ecx, 19(%edx) -L(bk_write_19bytes): - movl 15(%eax), %ecx - movl %ecx, 15(%edx) -L(bk_write_15bytes): - movl 11(%eax), %ecx - movl %ecx, 11(%edx) -L(bk_write_11bytes): - movl 7(%eax), %ecx - movl %ecx, 7(%edx) -L(bk_write_7bytes): - movl 3(%eax), %ecx - movl %ecx, 3(%edx) -L(bk_write_3bytes): - movzwl 1(%eax), %ecx - movw %cx, 1(%edx) - movzbl (%eax), %eax - movb %al, (%edx) -#ifndef USE_AS_BCOPY - movl DEST(%esp), %eax -# ifdef USE_AS_MEMPCPY - movl LEN(%esp), %ecx - add %ecx, %eax -# endif -#endif - RETURN_END - - - .pushsection .rodata.ssse3,"a",@progbits - ALIGN (2) -L(table_48bytes_fwd): - .int JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd)) - .int JMPTBL 
(L(fwd_write_17bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd)) - - ALIGN (2) -L(shl_table): - .int JMPTBL (L(shl_0), L(shl_table)) - .int JMPTBL (L(shl_1), L(shl_table)) - .int JMPTBL (L(shl_2), L(shl_table)) - .int JMPTBL (L(shl_3), L(shl_table)) 
- .int JMPTBL (L(shl_4), L(shl_table)) - .int JMPTBL (L(shl_5), L(shl_table)) - .int JMPTBL (L(shl_6), L(shl_table)) - .int JMPTBL (L(shl_7), L(shl_table)) - .int JMPTBL (L(shl_8), L(shl_table)) - .int JMPTBL (L(shl_9), L(shl_table)) - .int JMPTBL (L(shl_10), L(shl_table)) - .int JMPTBL (L(shl_11), L(shl_table)) - .int JMPTBL (L(shl_12), L(shl_table)) - .int JMPTBL (L(shl_13), L(shl_table)) - .int JMPTBL (L(shl_14), L(shl_table)) - .int JMPTBL (L(shl_15), L(shl_table)) - - ALIGN (2) -L(table_48_bytes_bwd): - .int JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd)) - .int JMPTBL 
(L(bk_write_25bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd)) - - .popsection - -#ifdef USE_AS_MEMMOVE - ALIGN (4) -L(copy_backward): - PUSH (%esi) - movl %eax, %esi - lea (%ecx,%edx,1),%edx - lea (%ecx,%esi,1),%esi - testl $0x3, %edx - jnz L(bk_align) - -L(bk_aligned_4): - cmp $64, %ecx - jge L(bk_write_more64bytes) - -L(bk_write_64bytesless): - cmp $32, %ecx - jl L(bk_write_less32bytes) - -L(bk_write_more32bytes): - /* Copy 32 bytes at a time. 
*/ - sub $32, %ecx - movl -4(%esi), %eax - movl %eax, -4(%edx) - movl -8(%esi), %eax - movl %eax, -8(%edx) - movl -12(%esi), %eax - movl %eax, -12(%edx) - movl -16(%esi), %eax - movl %eax, -16(%edx) - movl -20(%esi), %eax - movl %eax, -20(%edx) - movl -24(%esi), %eax - movl %eax, -24(%edx) - movl -28(%esi), %eax - movl %eax, -28(%edx) - movl -32(%esi), %eax - movl %eax, -32(%edx) - sub $32, %edx - sub $32, %esi - -L(bk_write_less32bytes): - movl %esi, %eax - sub %ecx, %edx - sub %ecx, %eax - POP (%esi) -L(bk_write_less32bytes_2): - BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4) - - ALIGN (4) -L(bk_align): - cmp $8, %ecx - jle L(bk_write_less32bytes) - testl $1, %edx - /* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0, - then (EDX & 2) must be != 0. */ - jz L(bk_got2) - sub $1, %esi - sub $1, %ecx - sub $1, %edx - movzbl (%esi), %eax - movb %al, (%edx) - - testl $2, %edx - jz L(bk_aligned_4) - -L(bk_got2): - sub $2, %esi - sub $2, %ecx - sub $2, %edx - movzwl (%esi), %eax - movw %ax, (%edx) - jmp L(bk_aligned_4) - - ALIGN (4) -L(bk_write_more64bytes): - /* Check alignment of last byte. */ - testl $15, %edx - jz L(bk_ssse3_cpy_pre) - -/* EDX is aligned 4 bytes, but not 16 bytes. 
*/ -L(bk_ssse3_align): - sub $4, %esi - sub $4, %ecx - sub $4, %edx - movl (%esi), %eax - movl %eax, (%edx) - - testl $15, %edx - jz L(bk_ssse3_cpy_pre) - - sub $4, %esi - sub $4, %ecx - sub $4, %edx - movl (%esi), %eax - movl %eax, (%edx) - - testl $15, %edx - jz L(bk_ssse3_cpy_pre) - - sub $4, %esi - sub $4, %ecx - sub $4, %edx - movl (%esi), %eax - movl %eax, (%edx) - -L(bk_ssse3_cpy_pre): - cmp $64, %ecx - jl L(bk_write_more32bytes) - -L(bk_ssse3_cpy): - sub $64, %esi - sub $64, %ecx - sub $64, %edx - movdqu 0x30(%esi), %xmm3 - movdqa %xmm3, 0x30(%edx) - movdqu 0x20(%esi), %xmm2 - movdqa %xmm2, 0x20(%edx) - movdqu 0x10(%esi), %xmm1 - movdqa %xmm1, 0x10(%edx) - movdqu (%esi), %xmm0 - movdqa %xmm0, (%edx) - cmp $64, %ecx - jge L(bk_ssse3_cpy) - jmp L(bk_write_64bytesless) - -#endif - -END (MEMCPY) - -#endif diff --git a/sysdeps/i386/i686/multiarch/memcpy-ssse3.S b/sysdeps/i386/i686/multiarch/memcpy-ssse3.S deleted file mode 100644 index 749c82d379..0000000000 --- a/sysdeps/i386/i686/multiarch/memcpy-ssse3.S +++ /dev/null @@ -1,1737 +0,0 @@ -/* memcpy with SSSE3 - Copyright (C) 2010 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. 
*/ - -#include <sysdep.h> - -#if !defined NOT_IN_libc \ - && (defined SHARED \ - || defined USE_AS_MEMMOVE \ - || !defined USE_MULTIARCH) - -#include "asm-syntax.h" - -#ifndef MEMCPY -# define MEMCPY __memcpy_ssse3 -# define MEMCPY_CHK __memcpy_chk_ssse3 -#endif - -#ifdef USE_AS_BCOPY -# define SRC PARMS -# define DEST SRC+4 -# define LEN DEST+4 -#else -# define DEST PARMS -# define SRC DEST+4 -# define LEN SRC+4 -#endif - -#define CFI_PUSH(REG) \ - cfi_adjust_cfa_offset (4); \ - cfi_rel_offset (REG, 0) - -#define CFI_POP(REG) \ - cfi_adjust_cfa_offset (-4); \ - cfi_restore (REG) - -#define PUSH(REG) pushl REG; CFI_PUSH (REG) -#define POP(REG) popl REG; CFI_POP (REG) - -#ifdef SHARED -# define PARMS 8 /* Preserve EBX. */ -# define ENTRANCE PUSH (%ebx); -# define RETURN_END POP (%ebx); ret -# define RETURN RETURN_END; CFI_PUSH (%ebx) -# define JMPTBL(I, B) I - B - -/* Load an entry in a jump table into EBX and branch to it. TABLE is a - jump table with relative offsets. INDEX is a register contains the - index into the jump table. SCALE is the scale of INDEX. */ -# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ - /* We first load PC into EBX. */ \ - call __i686.get_pc_thunk.bx; \ - /* Get the address of the jump table. */ \ - addl $(TABLE - .), %ebx; \ - /* Get the entry and convert the relative offset to the \ - absolute address. */ \ - addl (%ebx,INDEX,SCALE), %ebx; \ - /* We loaded the jump table. Go. */ \ - jmp *%ebx - -# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE) \ - addl $(TABLE - .), %ebx - -# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \ - addl (%ebx,INDEX,SCALE), %ebx; \ - /* We loaded the jump table. Go. 
*/ \ - jmp *%ebx - - .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits - .globl __i686.get_pc_thunk.bx - .hidden __i686.get_pc_thunk.bx - ALIGN (4) - .type __i686.get_pc_thunk.bx,@function -__i686.get_pc_thunk.bx: - movl (%esp), %ebx - ret -#else -# define PARMS 4 -# define ENTRANCE -# define RETURN_END ret -# define RETURN RETURN_END -# define JMPTBL(I, B) I - -/* Branch to an entry in a jump table. TABLE is a jump table with - absolute offsets. INDEX is a register contains the index into the - jump table. SCALE is the scale of INDEX. */ -# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ - jmp *TABLE(,INDEX,SCALE) - -# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE) - -# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \ - jmp *TABLE(,INDEX,SCALE) -#endif - - .section .text.ssse3,"ax",@progbits -#if defined SHARED && !defined NOT_IN_libc && !defined USE_AS_BCOPY -ENTRY (MEMCPY_CHK) - movl 12(%esp), %eax - cmpl %eax, 16(%esp) - jb HIDDEN_JUMPTARGET (__chk_fail) -END (MEMCPY_CHK) -#endif -ENTRY (MEMCPY) - ENTRANCE - movl LEN(%esp), %ecx - movl SRC(%esp), %eax - movl DEST(%esp), %edx - -#ifdef USE_AS_MEMMOVE - cmp %eax, %edx - jb L(copy_forward) - je L(fwd_write_0bytes) - cmp $32, %ecx - jge L(memmove_bwd) - jmp L(bk_write_less32bytes_2) -L(memmove_bwd): - add %ecx, %eax - cmp %eax, %edx - movl SRC(%esp), %eax - jb L(copy_backward) - -L(copy_forward): -#endif - cmp $48, %ecx - jge L(48bytesormore) - -L(fwd_write_less32bytes): -#ifndef USE_AS_MEMMOVE - cmp %dl, %al - jl L(bk_write) -#endif - add %ecx, %edx - add %ecx, %eax - BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) -#ifndef USE_AS_MEMMOVE -L(bk_write): - BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4) -#endif - - ALIGN (4) -/* ECX > 32 and EDX is 4 byte aligned. 
*/ -L(48bytesormore): - movdqu (%eax), %xmm0 - PUSH (%edi) - movl %edx, %edi - and $-16, %edx - PUSH (%esi) - add $16, %edx - movl %edi, %esi - sub %edx, %edi - add %edi, %ecx - sub %edi, %eax - -#ifdef SHARED_CACHE_SIZE_HALF - cmp $SHARED_CACHE_SIZE_HALF, %ecx -#else -# ifdef SHARED - call __i686.get_pc_thunk.bx - add $_GLOBAL_OFFSET_TABLE_, %ebx - cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx -# else - cmp __x86_shared_cache_size_half, %ecx -# endif -#endif - - mov %eax, %edi - jge L(large_page) - and $0xf, %edi - jz L(shl_0) - - BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4) - - ALIGN (4) -L(shl_0): - movdqu %xmm0, (%esi) - xor %edi, %edi - POP (%esi) - cmp $127, %ecx - ja L(shl_0_gobble) - lea -32(%ecx), %ecx -L(shl_0_loop): - movdqa (%eax, %edi), %xmm0 - movdqa 16(%eax, %edi), %xmm1 - sub $32, %ecx - movdqa %xmm0, (%edx, %edi) - movdqa %xmm1, 16(%edx, %edi) - lea 32(%edi), %edi - jl L(shl_0_end) - - movdqa (%eax, %edi), %xmm0 - movdqa 16(%eax, %edi), %xmm1 - sub $32, %ecx - movdqa %xmm0, (%edx, %edi) - movdqa %xmm1, 16(%edx, %edi) - lea 32(%edi), %edi - jl L(shl_0_end) - - movdqa (%eax, %edi), %xmm0 - movdqa 16(%eax, %edi), %xmm1 - sub $32, %ecx - movdqa %xmm0, (%edx, %edi) - movdqa %xmm1, 16(%edx, %edi) - lea 32(%edi), %edi - jl L(shl_0_end) - - movdqa (%eax, %edi), %xmm0 - movdqa 16(%eax, %edi), %xmm1 - sub $32, %ecx - movdqa %xmm0, (%edx, %edi) - movdqa %xmm1, 16(%edx, %edi) - lea 32(%edi), %edi -L(shl_0_end): - lea 32(%ecx), %ecx - add %ecx, %edi - add %edi, %edx - add %edi, %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) - -L(shl_0_gobble): - -#ifdef DATA_CACHE_SIZE_HALF - cmp $DATA_CACHE_SIZE_HALF, %ecx -#else -# ifdef SHARED - call __i686.get_pc_thunk.bx - add $_GLOBAL_OFFSET_TABLE_, %ebx - cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx -# else - cmp __x86_data_cache_size_half, %ecx -# endif -#endif - - POP (%edi) - lea -128(%ecx), %ecx - jge L(shl_0_gobble_mem_loop) -L(shl_0_gobble_cache_loop): - movdqa (%eax), %xmm0 
- movdqa 0x10(%eax), %xmm1 - movdqa 0x20(%eax), %xmm2 - movdqa 0x30(%eax), %xmm3 - movdqa 0x40(%eax), %xmm4 - movdqa 0x50(%eax), %xmm5 - movdqa 0x60(%eax), %xmm6 - movdqa 0x70(%eax), %xmm7 - lea 0x80(%eax), %eax - sub $128, %ecx - movdqa %xmm0, (%edx) - movdqa %xmm1, 0x10(%edx) - movdqa %xmm2, 0x20(%edx) - movdqa %xmm3, 0x30(%edx) - movdqa %xmm4, 0x40(%edx) - movdqa %xmm5, 0x50(%edx) - movdqa %xmm6, 0x60(%edx) - movdqa %xmm7, 0x70(%edx) - lea 0x80(%edx), %edx - - jge L(shl_0_gobble_cache_loop) -L(shl_0_gobble_cache_loop_tail): - cmp $-0x40, %ecx - lea 0x80(%ecx), %ecx - jl L(shl_0_cache_less_64bytes) - - movdqa (%eax), %xmm0 - sub $0x40, %ecx - movdqa 0x10(%eax), %xmm1 - - movdqa %xmm0, (%edx) - movdqa %xmm1, 0x10(%edx) - - movdqa 0x20(%eax), %xmm0 - movdqa 0x30(%eax), %xmm1 - add $0x40, %eax - - movdqa %xmm0, 0x20(%edx) - movdqa %xmm1, 0x30(%edx) - add $0x40, %edx -L(shl_0_cache_less_64bytes): - cmp $0x20, %ecx - jl L(shl_0_cache_less_32bytes) - movdqa (%eax), %xmm0 - sub $0x20, %ecx - movdqa 0x10(%eax), %xmm1 - add $0x20, %eax - movdqa %xmm0, (%edx) - movdqa %xmm1, 0x10(%edx) - add $0x20, %edx -L(shl_0_cache_less_32bytes): - cmp $0x10, %ecx - jl L(shl_0_cache_less_16bytes) - sub $0x10, %ecx - movdqa (%eax), %xmm0 - add $0x10, %eax - movdqa %xmm0, (%edx) - add $0x10, %edx -L(shl_0_cache_less_16bytes): - add %ecx, %edx - add %ecx, %eax - BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) - - - ALIGN (4) -L(shl_0_gobble_mem_loop): - prefetcht0 0x1c0(%eax) - prefetcht0 0x280(%eax) - prefetcht0 0x1c0(%edx) - - movdqa (%eax), %xmm0 - movdqa 0x10(%eax), %xmm1 - movdqa 0x20(%eax), %xmm2 - movdqa 0x30(%eax), %xmm3 - movdqa 0x40(%eax), %xmm4 - movdqa 0x50(%eax), %xmm5 - movdqa 0x60(%eax), %xmm6 - movdqa 0x70(%eax), %xmm7 - lea 0x80(%eax), %eax - sub $0x80, %ecx - movdqa %xmm0, (%edx) - movdqa %xmm1, 0x10(%edx) - movdqa %xmm2, 0x20(%edx) - movdqa %xmm3, 0x30(%edx) - movdqa %xmm4, 0x40(%edx) - movdqa %xmm5, 0x50(%edx) - movdqa %xmm6, 0x60(%edx) - movdqa %xmm7, 0x70(%edx) 
- lea 0x80(%edx), %edx - - jge L(shl_0_gobble_mem_loop) - cmp $-0x40, %ecx - lea 0x80(%ecx), %ecx - jl L(shl_0_mem_less_64bytes) - - movdqa (%eax), %xmm0 - sub $0x40, %ecx - movdqa 0x10(%eax), %xmm1 - - movdqa %xmm0, (%edx) - movdqa %xmm1, 0x10(%edx) - - movdqa 0x20(%eax), %xmm0 - movdqa 0x30(%eax), %xmm1 - add $0x40, %eax - - movdqa %xmm0, 0x20(%edx) - movdqa %xmm1, 0x30(%edx) - add $0x40, %edx -L(shl_0_mem_less_64bytes): - cmp $0x20, %ecx - jl L(shl_0_mem_less_32bytes) - movdqa (%eax), %xmm0 - sub $0x20, %ecx - movdqa 0x10(%eax), %xmm1 - add $0x20, %eax - movdqa %xmm0, (%edx) - movdqa %xmm1, 0x10(%edx) - add $0x20, %edx -L(shl_0_mem_less_32bytes): - cmp $0x10, %ecx - jl L(shl_0_mem_less_16bytes) - sub $0x10, %ecx - movdqa (%eax), %xmm0 - add $0x10, %eax - movdqa %xmm0, (%edx) - add $0x10, %edx -L(shl_0_mem_less_16bytes): - add %ecx, %edx - add %ecx, %eax - BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) - - - ALIGN (4) -L(shl_1): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -1(%eax), %eax - movaps (%eax), %xmm1 - xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_1_loop): - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $1, %xmm2, %xmm3 - palignr $1, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jl L(shl_1_end) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $1, %xmm2, %xmm3 - palignr $1, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jae L(shl_1_loop) - -L(shl_1_end): - lea 32(%ecx), %ecx - add %ecx, %edi - add %edi, %edx - lea 1(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) - - ALIGN (4) -L(shl_2): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -2(%eax), %eax - movaps (%eax), %xmm1 - xor %edi, %edi - lea -32(%ecx), %ecx - 
movdqu %xmm0, (%esi) - POP (%esi) -L(shl_2_loop): - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $2, %xmm2, %xmm3 - palignr $2, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jl L(shl_2_end) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $2, %xmm2, %xmm3 - palignr $2, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jae L(shl_2_loop) - -L(shl_2_end): - lea 32(%ecx), %ecx - add %ecx, %edi - add %edi, %edx - lea 2(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) - - ALIGN (4) -L(shl_3): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -3(%eax), %eax - movaps (%eax), %xmm1 - xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_3_loop): - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $3, %xmm2, %xmm3 - palignr $3, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jl L(shl_3_end) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $3, %xmm2, %xmm3 - palignr $3, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jae L(shl_3_loop) - -L(shl_3_end): - lea 32(%ecx), %ecx - add %ecx, %edi - add %edi, %edx - lea 3(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) - - ALIGN (4) -L(shl_4): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -4(%eax), %eax - movaps (%eax), %xmm1 - xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_4_loop): - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $4, %xmm2, %xmm3 - 
palignr $4, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jl L(shl_4_end) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $4, %xmm2, %xmm3 - palignr $4, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jae L(shl_4_loop) - -L(shl_4_end): - lea 32(%ecx), %ecx - add %ecx, %edi - add %edi, %edx - lea 4(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) - - ALIGN (4) -L(shl_5): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -5(%eax), %eax - movaps (%eax), %xmm1 - xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_5_loop): - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $5, %xmm2, %xmm3 - palignr $5, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jl L(shl_5_end) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $5, %xmm2, %xmm3 - palignr $5, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jae L(shl_5_loop) - -L(shl_5_end): - lea 32(%ecx), %ecx - add %ecx, %edi - add %edi, %edx - lea 5(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) - - - ALIGN (4) -L(shl_6): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -6(%eax), %eax - movaps (%eax), %xmm1 - xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_6_loop): - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $6, %xmm2, %xmm3 - palignr $6, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jl L(shl_6_end) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx 
- movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $6, %xmm2, %xmm3 - palignr $6, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jae L(shl_6_loop) - -L(shl_6_end): - lea 32(%ecx), %ecx - add %ecx, %edi - add %edi, %edx - lea 6(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) - - ALIGN (4) -L(shl_7): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -7(%eax), %eax - movaps (%eax), %xmm1 - xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_7_loop): - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $7, %xmm2, %xmm3 - palignr $7, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jl L(shl_7_end) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $7, %xmm2, %xmm3 - palignr $7, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jae L(shl_7_loop) - -L(shl_7_end): - lea 32(%ecx), %ecx - add %ecx, %edi - add %edi, %edx - lea 7(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) - - ALIGN (4) -L(shl_8): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -8(%eax), %eax - movaps (%eax), %xmm1 - xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_8_loop): - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $8, %xmm2, %xmm3 - palignr $8, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jl L(shl_8_end) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $8, %xmm2, %xmm3 - palignr $8, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, 
-16(%edx, %edi) - - jae L(shl_8_loop) - -L(shl_8_end): - lea 32(%ecx), %ecx - add %ecx, %edi - add %edi, %edx - lea 8(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) - - ALIGN (4) -L(shl_9): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -9(%eax), %eax - movaps (%eax), %xmm1 - xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_9_loop): - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $9, %xmm2, %xmm3 - palignr $9, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jl L(shl_9_end) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $9, %xmm2, %xmm3 - palignr $9, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jae L(shl_9_loop) - -L(shl_9_end): - lea 32(%ecx), %ecx - add %ecx, %edi - add %edi, %edx - lea 9(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) - - ALIGN (4) -L(shl_10): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -10(%eax), %eax - movaps (%eax), %xmm1 - xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_10_loop): - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $10, %xmm2, %xmm3 - palignr $10, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jl L(shl_10_end) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $10, %xmm2, %xmm3 - palignr $10, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jae L(shl_10_loop) - -L(shl_10_end): - lea 32(%ecx), %ecx - add %ecx, %edi - add %edi, %edx - lea 10(%edi, %eax), %eax - POP (%edi) - 
BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) - - ALIGN (4) -L(shl_11): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -11(%eax), %eax - movaps (%eax), %xmm1 - xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_11_loop): - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $11, %xmm2, %xmm3 - palignr $11, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jl L(shl_11_end) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $11, %xmm2, %xmm3 - palignr $11, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jae L(shl_11_loop) - -L(shl_11_end): - lea 32(%ecx), %ecx - add %ecx, %edi - add %edi, %edx - lea 11(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) - - ALIGN (4) -L(shl_12): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -12(%eax), %eax - movaps (%eax), %xmm1 - xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_12_loop): - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $12, %xmm2, %xmm3 - palignr $12, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jl L(shl_12_end) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $12, %xmm2, %xmm3 - palignr $12, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jae L(shl_12_loop) - -L(shl_12_end): - lea 32(%ecx), %ecx - add %ecx, %edi - add %edi, %edx - lea 12(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) - - ALIGN (4) -L(shl_13): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -13(%eax), %eax 
- movaps (%eax), %xmm1 - xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_13_loop): - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $13, %xmm2, %xmm3 - palignr $13, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jl L(shl_13_end) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $13, %xmm2, %xmm3 - palignr $13, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jae L(shl_13_loop) - -L(shl_13_end): - lea 32(%ecx), %ecx - add %ecx, %edi - add %edi, %edx - lea 13(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) - - ALIGN (4) -L(shl_14): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -14(%eax), %eax - movaps (%eax), %xmm1 - xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_14_loop): - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $14, %xmm2, %xmm3 - palignr $14, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jl L(shl_14_end) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $14, %xmm2, %xmm3 - palignr $14, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jae L(shl_14_loop) - -L(shl_14_end): - lea 32(%ecx), %ecx - add %ecx, %edi - add %edi, %edx - lea 14(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) - - - ALIGN (4) -L(shl_15): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -15(%eax), %eax - movaps (%eax), %xmm1 - xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_15_loop): - - movdqa 16(%eax, %edi), %xmm2 - sub 
$32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $15, %xmm2, %xmm3 - palignr $15, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jl L(shl_15_end) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $15, %xmm2, %xmm3 - palignr $15, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jae L(shl_15_loop) - -L(shl_15_end): - lea 32(%ecx), %ecx - add %ecx, %edi - add %edi, %edx - lea 15(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) - - - ALIGN (4) -L(fwd_write_44bytes): - movl -44(%eax), %ecx - movl %ecx, -44(%edx) -L(fwd_write_40bytes): - movl -40(%eax), %ecx - movl %ecx, -40(%edx) -L(fwd_write_36bytes): - movl -36(%eax), %ecx - movl %ecx, -36(%edx) -L(fwd_write_32bytes): - movl -32(%eax), %ecx - movl %ecx, -32(%edx) -L(fwd_write_28bytes): - movl -28(%eax), %ecx - movl %ecx, -28(%edx) -L(fwd_write_24bytes): - movl -24(%eax), %ecx - movl %ecx, -24(%edx) -L(fwd_write_20bytes): - movl -20(%eax), %ecx - movl %ecx, -20(%edx) -L(fwd_write_16bytes): - movl -16(%eax), %ecx - movl %ecx, -16(%edx) -L(fwd_write_12bytes): - movl -12(%eax), %ecx - movl %ecx, -12(%edx) -L(fwd_write_8bytes): - movl -8(%eax), %ecx - movl %ecx, -8(%edx) -L(fwd_write_4bytes): - movl -4(%eax), %ecx - movl %ecx, -4(%edx) -L(fwd_write_0bytes): -#ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY - movl %edx, %eax -# else - movl DEST(%esp), %eax -# endif -#endif - RETURN - - ALIGN (4) -L(fwd_write_5bytes): - movl -5(%eax), %ecx - movl -4(%eax), %eax - movl %ecx, -5(%edx) - movl %eax, -4(%edx) -#ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY - movl %edx, %eax -# else - movl DEST(%esp), %eax -# endif -#endif - RETURN - - ALIGN (4) -L(fwd_write_45bytes): - movl -45(%eax), %ecx - movl %ecx, -45(%edx) -L(fwd_write_41bytes): - movl -41(%eax), %ecx - movl %ecx, -41(%edx) -L(fwd_write_37bytes): 
- movl -37(%eax), %ecx - movl %ecx, -37(%edx) -L(fwd_write_33bytes): - movl -33(%eax), %ecx - movl %ecx, -33(%edx) -L(fwd_write_29bytes): - movl -29(%eax), %ecx - movl %ecx, -29(%edx) -L(fwd_write_25bytes): - movl -25(%eax), %ecx - movl %ecx, -25(%edx) -L(fwd_write_21bytes): - movl -21(%eax), %ecx - movl %ecx, -21(%edx) -L(fwd_write_17bytes): - movl -17(%eax), %ecx - movl %ecx, -17(%edx) -L(fwd_write_13bytes): - movl -13(%eax), %ecx - movl %ecx, -13(%edx) -L(fwd_write_9bytes): - movl -9(%eax), %ecx - movl %ecx, -9(%edx) - movl -5(%eax), %ecx - movl %ecx, -5(%edx) -L(fwd_write_1bytes): - movzbl -1(%eax), %ecx - movb %cl, -1(%edx) -#ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY - movl %edx, %eax -# else - movl DEST(%esp), %eax -# endif -#endif - RETURN - - ALIGN (4) -L(fwd_write_46bytes): - movl -46(%eax), %ecx - movl %ecx, -46(%edx) -L(fwd_write_42bytes): - movl -42(%eax), %ecx - movl %ecx, -42(%edx) -L(fwd_write_38bytes): - movl -38(%eax), %ecx - movl %ecx, -38(%edx) -L(fwd_write_34bytes): - movl -34(%eax), %ecx - movl %ecx, -34(%edx) -L(fwd_write_30bytes): - movl -30(%eax), %ecx - movl %ecx, -30(%edx) -L(fwd_write_26bytes): - movl -26(%eax), %ecx - movl %ecx, -26(%edx) -L(fwd_write_22bytes): - movl -22(%eax), %ecx - movl %ecx, -22(%edx) -L(fwd_write_18bytes): - movl -18(%eax), %ecx - movl %ecx, -18(%edx) -L(fwd_write_14bytes): - movl -14(%eax), %ecx - movl %ecx, -14(%edx) -L(fwd_write_10bytes): - movl -10(%eax), %ecx - movl %ecx, -10(%edx) -L(fwd_write_6bytes): - movl -6(%eax), %ecx - movl %ecx, -6(%edx) -L(fwd_write_2bytes): - movzwl -2(%eax), %ecx - movw %cx, -2(%edx) -#ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY - movl %edx, %eax -# else - movl DEST(%esp), %eax -# endif -#endif - RETURN - - ALIGN (4) -L(fwd_write_47bytes): - movl -47(%eax), %ecx - movl %ecx, -47(%edx) -L(fwd_write_43bytes): - movl -43(%eax), %ecx - movl %ecx, -43(%edx) -L(fwd_write_39bytes): - movl -39(%eax), %ecx - movl %ecx, -39(%edx) -L(fwd_write_35bytes): - movl -35(%eax), %ecx - movl %ecx, 
-35(%edx) -L(fwd_write_31bytes): - movl -31(%eax), %ecx - movl %ecx, -31(%edx) -L(fwd_write_27bytes): - movl -27(%eax), %ecx - movl %ecx, -27(%edx) -L(fwd_write_23bytes): - movl -23(%eax), %ecx - movl %ecx, -23(%edx) -L(fwd_write_19bytes): - movl -19(%eax), %ecx - movl %ecx, -19(%edx) -L(fwd_write_15bytes): - movl -15(%eax), %ecx - movl %ecx, -15(%edx) -L(fwd_write_11bytes): - movl -11(%eax), %ecx - movl %ecx, -11(%edx) -L(fwd_write_7bytes): - movl -7(%eax), %ecx - movl %ecx, -7(%edx) -L(fwd_write_3bytes): - movzwl -3(%eax), %ecx - movzbl -1(%eax), %eax - movw %cx, -3(%edx) - movb %al, -1(%edx) -#ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY - movl %edx, %eax -# else - movl DEST(%esp), %eax -# endif -#endif - RETURN - - ALIGN (4) -L(large_page): - movdqu (%eax), %xmm1 - lea 16(%eax), %eax - movdqu %xmm0, (%esi) - movntdq %xmm1, (%edx) - lea 16(%edx), %edx - POP (%esi) - lea -0x90(%ecx), %ecx - POP (%edi) -L(large_page_loop): - movdqu (%eax), %xmm0 - movdqu 0x10(%eax), %xmm1 - movdqu 0x20(%eax), %xmm2 - movdqu 0x30(%eax), %xmm3 - movdqu 0x40(%eax), %xmm4 - movdqu 0x50(%eax), %xmm5 - movdqu 0x60(%eax), %xmm6 - movdqu 0x70(%eax), %xmm7 - lea 0x80(%eax), %eax - - sub $0x80, %ecx - movntdq %xmm0, (%edx) - movntdq %xmm1, 0x10(%edx) - movntdq %xmm2, 0x20(%edx) - movntdq %xmm3, 0x30(%edx) - movntdq %xmm4, 0x40(%edx) - movntdq %xmm5, 0x50(%edx) - movntdq %xmm6, 0x60(%edx) - movntdq %xmm7, 0x70(%edx) - lea 0x80(%edx), %edx - jae L(large_page_loop) - cmp $-0x40, %ecx - lea 0x80(%ecx), %ecx - jl L(large_page_less_64bytes) - - movdqu (%eax), %xmm0 - movdqu 0x10(%eax), %xmm1 - movdqu 0x20(%eax), %xmm2 - movdqu 0x30(%eax), %xmm3 - lea 0x40(%eax), %eax - - movntdq %xmm0, (%edx) - movntdq %xmm1, 0x10(%edx) - movntdq %xmm2, 0x20(%edx) - movntdq %xmm3, 0x30(%edx) - lea 0x40(%edx), %edx - sub $0x40, %ecx -L(large_page_less_64bytes): - cmp $32, %ecx - jl L(large_page_less_32bytes) - movdqu (%eax), %xmm0 - movdqu 0x10(%eax), %xmm1 - lea 0x20(%eax), %eax - movntdq %xmm0, (%edx) - movntdq 
%xmm1, 0x10(%edx) - lea 0x20(%edx), %edx - sub $0x20, %ecx -L(large_page_less_32bytes): - add %ecx, %edx - add %ecx, %eax - sfence - BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) - - - ALIGN (4) -L(bk_write_44bytes): - movl 40(%eax), %ecx - movl %ecx, 40(%edx) -L(bk_write_40bytes): - movl 36(%eax), %ecx - movl %ecx, 36(%edx) -L(bk_write_36bytes): - movl 32(%eax), %ecx - movl %ecx, 32(%edx) -L(bk_write_32bytes): - movl 28(%eax), %ecx - movl %ecx, 28(%edx) -L(bk_write_28bytes): - movl 24(%eax), %ecx - movl %ecx, 24(%edx) -L(bk_write_24bytes): - movl 20(%eax), %ecx - movl %ecx, 20(%edx) -L(bk_write_20bytes): - movl 16(%eax), %ecx - movl %ecx, 16(%edx) -L(bk_write_16bytes): - movl 12(%eax), %ecx - movl %ecx, 12(%edx) -L(bk_write_12bytes): - movl 8(%eax), %ecx - movl %ecx, 8(%edx) -L(bk_write_8bytes): - movl 4(%eax), %ecx - movl %ecx, 4(%edx) -L(bk_write_4bytes): - movl (%eax), %ecx - movl %ecx, (%edx) -L(bk_write_0bytes): -#ifndef USE_AS_BCOPY - movl DEST(%esp), %eax -# ifdef USE_AS_MEMPCPY - movl LEN(%esp), %ecx - add %ecx, %eax -# endif -#endif - RETURN - - ALIGN (4) -L(bk_write_45bytes): - movl 41(%eax), %ecx - movl %ecx, 41(%edx) -L(bk_write_41bytes): - movl 37(%eax), %ecx - movl %ecx, 37(%edx) -L(bk_write_37bytes): - movl 33(%eax), %ecx - movl %ecx, 33(%edx) -L(bk_write_33bytes): - movl 29(%eax), %ecx - movl %ecx, 29(%edx) -L(bk_write_29bytes): - movl 25(%eax), %ecx - movl %ecx, 25(%edx) -L(bk_write_25bytes): - movl 21(%eax), %ecx - movl %ecx, 21(%edx) -L(bk_write_21bytes): - movl 17(%eax), %ecx - movl %ecx, 17(%edx) -L(bk_write_17bytes): - movl 13(%eax), %ecx - movl %ecx, 13(%edx) -L(bk_write_13bytes): - movl 9(%eax), %ecx - movl %ecx, 9(%edx) -L(bk_write_9bytes): - movl 5(%eax), %ecx - movl %ecx, 5(%edx) -L(bk_write_5bytes): - movl 1(%eax), %ecx - movl %ecx, 1(%edx) -L(bk_write_1bytes): - movzbl (%eax), %ecx - movb %cl, (%edx) -#ifndef USE_AS_BCOPY - movl DEST(%esp), %eax -# ifdef USE_AS_MEMPCPY - movl LEN(%esp), %ecx - add %ecx, %eax -# endif -#endif - 
RETURN - - ALIGN (4) -L(bk_write_46bytes): - movl 42(%eax), %ecx - movl %ecx, 42(%edx) -L(bk_write_42bytes): - movl 38(%eax), %ecx - movl %ecx, 38(%edx) -L(bk_write_38bytes): - movl 34(%eax), %ecx - movl %ecx, 34(%edx) -L(bk_write_34bytes): - movl 30(%eax), %ecx - movl %ecx, 30(%edx) -L(bk_write_30bytes): - movl 26(%eax), %ecx - movl %ecx, 26(%edx) -L(bk_write_26bytes): - movl 22(%eax), %ecx - movl %ecx, 22(%edx) -L(bk_write_22bytes): - movl 18(%eax), %ecx - movl %ecx, 18(%edx) -L(bk_write_18bytes): - movl 14(%eax), %ecx - movl %ecx, 14(%edx) -L(bk_write_14bytes): - movl 10(%eax), %ecx - movl %ecx, 10(%edx) -L(bk_write_10bytes): - movl 6(%eax), %ecx - movl %ecx, 6(%edx) -L(bk_write_6bytes): - movl 2(%eax), %ecx - movl %ecx, 2(%edx) -L(bk_write_2bytes): - movzwl (%eax), %ecx - movw %cx, (%edx) -#ifndef USE_AS_BCOPY - movl DEST(%esp), %eax -# ifdef USE_AS_MEMPCPY - movl LEN(%esp), %ecx - add %ecx, %eax -# endif -#endif - RETURN - - ALIGN (4) -L(bk_write_47bytes): - movl 43(%eax), %ecx - movl %ecx, 43(%edx) -L(bk_write_43bytes): - movl 39(%eax), %ecx - movl %ecx, 39(%edx) -L(bk_write_39bytes): - movl 35(%eax), %ecx - movl %ecx, 35(%edx) -L(bk_write_35bytes): - movl 31(%eax), %ecx - movl %ecx, 31(%edx) -L(bk_write_31bytes): - movl 27(%eax), %ecx - movl %ecx, 27(%edx) -L(bk_write_27bytes): - movl 23(%eax), %ecx - movl %ecx, 23(%edx) -L(bk_write_23bytes): - movl 19(%eax), %ecx - movl %ecx, 19(%edx) -L(bk_write_19bytes): - movl 15(%eax), %ecx - movl %ecx, 15(%edx) -L(bk_write_15bytes): - movl 11(%eax), %ecx - movl %ecx, 11(%edx) -L(bk_write_11bytes): - movl 7(%eax), %ecx - movl %ecx, 7(%edx) -L(bk_write_7bytes): - movl 3(%eax), %ecx - movl %ecx, 3(%edx) -L(bk_write_3bytes): - movzwl 1(%eax), %ecx - movw %cx, 1(%edx) - movzbl (%eax), %eax - movb %al, (%edx) -#ifndef USE_AS_BCOPY - movl DEST(%esp), %eax -# ifdef USE_AS_MEMPCPY - movl LEN(%esp), %ecx - add %ecx, %eax -# endif -#endif - RETURN_END - - - .pushsection .rodata.ssse3,"a",@progbits - ALIGN (2) 
-L(table_48bytes_fwd): - .int JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_33bytes), 
L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd)) - - ALIGN (2) -L(shl_table): - .int JMPTBL (L(shl_0), L(shl_table)) - .int JMPTBL (L(shl_1), L(shl_table)) - .int JMPTBL (L(shl_2), L(shl_table)) - .int JMPTBL (L(shl_3), L(shl_table)) - .int JMPTBL (L(shl_4), L(shl_table)) - .int JMPTBL (L(shl_5), L(shl_table)) - .int JMPTBL (L(shl_6), L(shl_table)) - .int JMPTBL (L(shl_7), L(shl_table)) - .int JMPTBL (L(shl_8), L(shl_table)) - .int JMPTBL (L(shl_9), L(shl_table)) - .int JMPTBL (L(shl_10), L(shl_table)) - .int JMPTBL (L(shl_11), L(shl_table)) - .int JMPTBL (L(shl_12), L(shl_table)) - .int JMPTBL (L(shl_13), L(shl_table)) - .int JMPTBL (L(shl_14), L(shl_table)) - .int JMPTBL (L(shl_15), L(shl_table)) - - ALIGN (2) -L(table_48_bytes_bwd): - .int JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_7bytes), 
L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd)) - .int JMPTBL 
(L(bk_write_41bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd)) - - .popsection - -#ifdef USE_AS_MEMMOVE - ALIGN (4) -L(copy_backward): - PUSH (%esi) - movl %eax, %esi - lea (%ecx,%edx,1),%edx - lea (%ecx,%esi,1),%esi - testl $0x3, %edx - jnz L(bk_align) - -L(bk_aligned_4): - cmp $64, %ecx - jge L(bk_write_more64bytes) - -L(bk_write_64bytesless): - cmp $32, %ecx - jl L(bk_write_less32bytes) - -L(bk_write_more32bytes): - /* Copy 32 bytes at a time. */ - sub $32, %ecx - movl -4(%esi), %eax - movl %eax, -4(%edx) - movl -8(%esi), %eax - movl %eax, -8(%edx) - movl -12(%esi), %eax - movl %eax, -12(%edx) - movl -16(%esi), %eax - movl %eax, -16(%edx) - movl -20(%esi), %eax - movl %eax, -20(%edx) - movl -24(%esi), %eax - movl %eax, -24(%edx) - movl -28(%esi), %eax - movl %eax, -28(%edx) - movl -32(%esi), %eax - movl %eax, -32(%edx) - sub $32, %edx - sub $32, %esi - -L(bk_write_less32bytes): - movl %esi, %eax - sub %ecx, %edx - sub %ecx, %eax - POP (%esi) -L(bk_write_less32bytes_2): - BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4) - - ALIGN (4) -L(bk_align): - cmp $8, %ecx - jle L(bk_write_less32bytes) - testl $1, %edx - /* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0, - then (EDX & 2) must be != 0. */ - jz L(bk_got2) - sub $1, %esi - sub $1, %ecx - sub $1, %edx - movzbl (%esi), %eax - movb %al, (%edx) - - testl $2, %edx - jz L(bk_aligned_4) - -L(bk_got2): - sub $2, %esi - sub $2, %ecx - sub $2, %edx - movzwl (%esi), %eax - movw %ax, (%edx) - jmp L(bk_aligned_4) - - ALIGN (4) -L(bk_write_more64bytes): - /* Check alignment of last byte. 
*/ - testl $15, %edx - jz L(bk_ssse3_cpy_pre) - -/* EDX is aligned 4 bytes, but not 16 bytes. */ -L(bk_ssse3_align): - sub $4, %esi - sub $4, %ecx - sub $4, %edx - movl (%esi), %eax - movl %eax, (%edx) - - testl $15, %edx - jz L(bk_ssse3_cpy_pre) - - sub $4, %esi - sub $4, %ecx - sub $4, %edx - movl (%esi), %eax - movl %eax, (%edx) - - testl $15, %edx - jz L(bk_ssse3_cpy_pre) - - sub $4, %esi - sub $4, %ecx - sub $4, %edx - movl (%esi), %eax - movl %eax, (%edx) - -L(bk_ssse3_cpy_pre): - cmp $64, %ecx - jl L(bk_write_more32bytes) - -L(bk_ssse3_cpy): - sub $64, %esi - sub $64, %ecx - sub $64, %edx - movdqu 0x30(%esi), %xmm3 - movdqa %xmm3, 0x30(%edx) - movdqu 0x20(%esi), %xmm2 - movdqa %xmm2, 0x20(%edx) - movdqu 0x10(%esi), %xmm1 - movdqa %xmm1, 0x10(%edx) - movdqu (%esi), %xmm0 - movdqa %xmm0, (%edx) - cmp $64, %ecx - jge L(bk_ssse3_cpy) - jmp L(bk_write_64bytesless) - -#endif - -END (MEMCPY) - -#endif diff --git a/sysdeps/i386/i686/multiarch/memcpy.S b/sysdeps/i386/i686/multiarch/memcpy.S deleted file mode 100644 index bf1c7cc2d2..0000000000 --- a/sysdeps/i386/i686/multiarch/memcpy.S +++ /dev/null @@ -1,90 +0,0 @@ -/* Multiple versions of memcpy - Copyright (C) 2010 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ - -#include <sysdep.h> -#include <init-arch.h> - -/* Define multiple versions only for the definition in lib and for - DSO. In static binaries we need memcpy before the initialization - happened. */ -#if defined SHARED && !defined NOT_IN_libc - .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits - .globl __i686.get_pc_thunk.bx - .hidden __i686.get_pc_thunk.bx - .p2align 4 - .type __i686.get_pc_thunk.bx,@function -__i686.get_pc_thunk.bx: - movl (%esp), %ebx - ret - - .text -ENTRY(memcpy) - .type memcpy, @gnu_indirect_function - pushl %ebx - cfi_adjust_cfa_offset (4) - cfi_rel_offset (ebx, 0) - call __i686.get_pc_thunk.bx - addl $_GLOBAL_OFFSET_TABLE_, %ebx - cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx) - jne 1f - call __init_cpu_features -1: leal __memcpy_ia32@GOTOFF(%ebx), %eax - testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx) - jz 2f - leal __memcpy_ssse3@GOTOFF(%ebx), %eax - testl $bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx) - jz 2f - leal __memcpy_ssse3_rep@GOTOFF(%ebx), %eax -2: popl %ebx - cfi_adjust_cfa_offset (-4) - cfi_restore (ebx) - ret -END(memcpy) - -# undef ENTRY -# define ENTRY(name) \ - .type __memcpy_ia32, @function; \ - .p2align 4; \ - __memcpy_ia32: cfi_startproc; \ - CALL_MCOUNT -# undef END -# define END(name) \ - cfi_endproc; .size __memcpy_ia32, .-__memcpy_ia32 - -# undef ENTRY_CHK -# define ENTRY_CHK(name) \ - .type __memcpy_chk_ia32, @function; \ - .globl __memcpy_chk_ia32; \ - .p2align 4; \ - __memcpy_chk_ia32: cfi_startproc; \ - CALL_MCOUNT -# undef END_CHK -# define END_CHK(name) \ - cfi_endproc; .size __memcpy_chk_ia32, .-__memcpy_chk_ia32 - -# undef libc_hidden_builtin_def -/* IFUNC doesn't work with the hidden functions in shared library 
since - they will be called without setting up EBX needed for PLT which is - used by IFUNC. */ -# define libc_hidden_builtin_def(name) \ - .globl __GI_memcpy; __GI_memcpy = __memcpy_ia32 -#endif - -#include "../memcpy.S" diff --git a/sysdeps/i386/i686/multiarch/memcpy_chk.S b/sysdeps/i386/i686/multiarch/memcpy_chk.S deleted file mode 100644 index 171ac8adef..0000000000 --- a/sysdeps/i386/i686/multiarch/memcpy_chk.S +++ /dev/null @@ -1,64 +0,0 @@ -/* Multiple versions of __memcpy_chk - Copyright (C) 2010 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ - -#include <sysdep.h> -#include <init-arch.h> - -/* Define multiple versions only for the definition in lib and for - DSO. There are no multiarch memcpy functions for static binaries. 
- */ -#ifndef NOT_IN_libc -# ifdef SHARED - .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits - .globl __i686.get_pc_thunk.bx - .hidden __i686.get_pc_thunk.bx - .p2align 4 - .type __i686.get_pc_thunk.bx,@function -__i686.get_pc_thunk.bx: - movl (%esp), %ebx - ret - - .text -ENTRY(__memcpy_chk) - .type __memcpy_chk, @gnu_indirect_function - pushl %ebx - cfi_adjust_cfa_offset (4) - cfi_rel_offset (ebx, 0) - call __i686.get_pc_thunk.bx - addl $_GLOBAL_OFFSET_TABLE_, %ebx - cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx) - jne 1f - call __init_cpu_features -1: leal __memcpy_chk_ia32@GOTOFF(%ebx), %eax - testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx) - jz 2f - leal __memcpy_chk_ssse3@GOTOFF(%ebx), %eax - testl $bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx) - jz 2f - leal __memcpy_chk_ssse3_rep@GOTOFF(%ebx), %eax -2: popl %ebx - cfi_adjust_cfa_offset (-4) - cfi_restore (ebx) - ret -END(__memcpy_chk) -# else -# include "../memcpy_chk.S" -# endif -#endif diff --git a/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S b/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S deleted file mode 100644 index d202fc4a13..0000000000 --- a/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S +++ /dev/null @@ -1,4 +0,0 @@ -#define USE_AS_MEMMOVE -#define MEMCPY __memmove_ssse3_rep -#define MEMCPY_CHK __memmove_chk_ssse3_rep -#include "memcpy-ssse3-rep.S" diff --git a/sysdeps/i386/i686/multiarch/memmove-ssse3.S b/sysdeps/i386/i686/multiarch/memmove-ssse3.S deleted file mode 100644 index 295430b1ef..0000000000 --- a/sysdeps/i386/i686/multiarch/memmove-ssse3.S +++ /dev/null @@ -1,4 +0,0 @@ -#define USE_AS_MEMMOVE -#define MEMCPY __memmove_ssse3 -#define MEMCPY_CHK __memmove_chk_ssse3 -#include "memcpy-ssse3.S" diff --git a/sysdeps/i386/i686/multiarch/memmove.S b/sysdeps/i386/i686/multiarch/memmove.S deleted file mode 100644 index e0529c0126..0000000000 --- a/sysdeps/i386/i686/multiarch/memmove.S +++ /dev/null @@ -1,117 
+0,0 @@ -/* Multiple versions of memmove - Copyright (C) 2010 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ - -#include <sysdep.h> -#include <init-arch.h> - -/* Define multiple versions only for the definition in lib. 
*/ -#ifndef NOT_IN_libc -# ifdef SHARED - .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits - .globl __i686.get_pc_thunk.bx - .hidden __i686.get_pc_thunk.bx - .p2align 4 - .type __i686.get_pc_thunk.bx,@function -__i686.get_pc_thunk.bx: - movl (%esp), %ebx - ret - - .text -ENTRY(memmove) - .type memmove, @gnu_indirect_function - pushl %ebx - cfi_adjust_cfa_offset (4) - cfi_rel_offset (ebx, 0) - call __i686.get_pc_thunk.bx - addl $_GLOBAL_OFFSET_TABLE_, %ebx - cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx) - jne 1f - call __init_cpu_features -1: leal __memmove_ia32@GOTOFF(%ebx), %eax - testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx) - jz 2f - leal __memmove_ssse3@GOTOFF(%ebx), %eax - testl $bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx) - jz 2f - leal __memmove_ssse3_rep@GOTOFF(%ebx), %eax -2: popl %ebx - cfi_adjust_cfa_offset (-4) - cfi_restore (ebx) - ret -END(memmove) - -# undef ENTRY -# define ENTRY(name) \ - .type __memmove_ia32, @function; \ - .p2align 4; \ - __memmove_ia32: cfi_startproc; \ - CALL_MCOUNT -# else - .text -ENTRY(memmove) - .type memmove, @gnu_indirect_function - cmpl $0, KIND_OFFSET+__cpu_features - jne 1f - call __init_cpu_features -1: leal __memmove_ia32, %eax - testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features - jz 2f - leal __memmove_ssse3, %eax - testl $bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features - jz 2f - leal __memmove_ssse3_rep, %eax -2: ret -END(memmove) - -# undef ENTRY -# define ENTRY(name) \ - .type __memmove_ia32, @function; \ - .globl __memmove_ia32; \ - .p2align 4; \ - __memmove_ia32: cfi_startproc; \ - CALL_MCOUNT -# endif - -# undef END -# define END(name) \ - cfi_endproc; .size __memmove_ia32, .-__memmove_ia32 - -# undef ENTRY_CHK -# define ENTRY_CHK(name) \ - .type __memmove_chk_ia32, @function; \ - .globl __memmove_chk_ia32; \ - .p2align 4; \ - __memmove_chk_ia32: cfi_startproc; \ - CALL_MCOUNT -# undef END_CHK -# 
define END_CHK(name) \ - cfi_endproc; .size __memmove_chk_ia32, .-__memmove_chk_ia32 - -# ifdef SHARED -# undef libc_hidden_builtin_def -/* IFUNC doesn't work with the hidden functions in shared library since - they will be called without setting up EBX needed for PLT which is - used by IFUNC. */ -# define libc_hidden_builtin_def(name) \ - .globl __GI_memmove; __GI_memmove = __memmove_ia32 -# endif -#endif - -#include "../memmove.S" diff --git a/sysdeps/i386/i686/multiarch/memmove_chk.S b/sysdeps/i386/i686/multiarch/memmove_chk.S deleted file mode 100644 index e33f2a31b3..0000000000 --- a/sysdeps/i386/i686/multiarch/memmove_chk.S +++ /dev/null @@ -1,112 +0,0 @@ -/* Multiple versions of __memmove_chk - Copyright (C) 2010 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ - -#include <sysdep.h> -#include <init-arch.h> - -/* Define multiple versions only for the definition in lib. 
*/ -#ifndef NOT_IN_libc -# ifdef SHARED - .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits - .globl __i686.get_pc_thunk.bx - .hidden __i686.get_pc_thunk.bx - .p2align 4 - .type __i686.get_pc_thunk.bx,@function -__i686.get_pc_thunk.bx: - movl (%esp), %ebx - ret - - .text -ENTRY(__memmove_chk) - .type __memmove_chk, @gnu_indirect_function - pushl %ebx - cfi_adjust_cfa_offset (4) - cfi_rel_offset (ebx, 0) - call __i686.get_pc_thunk.bx - addl $_GLOBAL_OFFSET_TABLE_, %ebx - cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx) - jne 1f - call __init_cpu_features -1: leal __memmove_chk_ia32@GOTOFF(%ebx), %eax - testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx) - jz 2f - leal __memmove_chk_ssse3@GOTOFF(%ebx), %eax - testl $bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx) - jz 2f - leal __memmove_chk_ssse3_rep@GOTOFF(%ebx), %eax -2: popl %ebx - cfi_adjust_cfa_offset (-4) - cfi_restore (ebx) - ret -END(__memmove_chk) -# else - .text -ENTRY(__memmove_chk) - .type __memmove_chk, @gnu_indirect_function - cmpl $0, KIND_OFFSET+__cpu_features - jne 1f - call __init_cpu_features -1: leal __memmove_chk_ia32, %eax - testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features - jz 2f - leal __memmove_chk_ssse3, %eax - testl $bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features - jz 2f - leal __memmove_chk_ssse3_rep, %eax -2: ret -END(__memmove_chk) - - .type __memmove_chk_ssse3, @function - .p2align 4; -__memmove_chk_ssse3: - cfi_startproc - CALL_MCOUNT - movl 12(%esp), %eax - cmpl %eax, 16(%esp) - jb __chk_fail - jmp __memmove_ssse3 - cfi_endproc - .size __memmove_chk_ssse3, .-__memmove_chk_ssse3 - - .type __memmove_chk_ssse3_rep, @function - .p2align 4; -__memmove_chk_ssse3_rep: - cfi_startproc - CALL_MCOUNT - movl 12(%esp), %eax - cmpl %eax, 16(%esp) - jb __chk_fail - jmp __memmove_ssse3_rep - cfi_endproc - .size __memmove_chk_ssse3_rep, .-__memmove_chk_ssse3_rep - - .type __memmove_chk_ia32, 
@function - .p2align 4; -__memmove_chk_ia32: - cfi_startproc - CALL_MCOUNT - movl 12(%esp), %eax - cmpl %eax, 16(%esp) - jb __chk_fail - jmp __memmove_ia32 - cfi_endproc - .size __memmove_chk_ia32, .-__memmove_chk_ia32 -# endif -#endif diff --git a/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S b/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S deleted file mode 100644 index 5357b33e18..0000000000 --- a/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S +++ /dev/null @@ -1,4 +0,0 @@ -#define USE_AS_MEMPCPY -#define MEMCPY __mempcpy_ssse3_rep -#define MEMCPY_CHK __mempcpy_chk_ssse3_rep -#include "memcpy-ssse3-rep.S" diff --git a/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S b/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S deleted file mode 100644 index 822d98e954..0000000000 --- a/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S +++ /dev/null @@ -1,4 +0,0 @@ -#define USE_AS_MEMPCPY -#define MEMCPY __mempcpy_ssse3 -#define MEMCPY_CHK __mempcpy_chk_ssse3 -#include "memcpy-ssse3.S" diff --git a/sysdeps/i386/i686/multiarch/mempcpy.S b/sysdeps/i386/i686/multiarch/mempcpy.S deleted file mode 100644 index df830d2e63..0000000000 --- a/sysdeps/i386/i686/multiarch/mempcpy.S +++ /dev/null @@ -1,93 +0,0 @@ -/* Multiple versions of mempcpy - Copyright (C) 2010 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ - -#include <sysdep.h> -#include <init-arch.h> - -/* Define multiple versions only for the definition in lib and for - DSO. In static binaries we need mempcpy before the initialization - happened. */ -#if defined SHARED && !defined NOT_IN_libc - .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits - .globl __i686.get_pc_thunk.bx - .hidden __i686.get_pc_thunk.bx - .p2align 4 - .type __i686.get_pc_thunk.bx,@function -__i686.get_pc_thunk.bx: - movl (%esp), %ebx - ret - - .text -ENTRY(__mempcpy) - .type __mempcpy, @gnu_indirect_function - pushl %ebx - cfi_adjust_cfa_offset (4) - cfi_rel_offset (ebx, 0) - call __i686.get_pc_thunk.bx - addl $_GLOBAL_OFFSET_TABLE_, %ebx - cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx) - jne 1f - call __init_cpu_features -1: leal __mempcpy_ia32@GOTOFF(%ebx), %eax - testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx) - jz 2f - leal __mempcpy_ssse3@GOTOFF(%ebx), %eax - testl $bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx) - jz 2f - leal __mempcpy_ssse3_rep@GOTOFF(%ebx), %eax -2: popl %ebx - cfi_adjust_cfa_offset (-4) - cfi_restore (ebx) - ret -END(__mempcpy) - -# undef ENTRY -# define ENTRY(name) \ - .type __mempcpy_ia32, @function; \ - .p2align 4; \ - __mempcpy_ia32: cfi_startproc; \ - CALL_MCOUNT -# undef END -# define END(name) \ - cfi_endproc; .size __mempcpy_ia32, .-__mempcpy_ia32 - -# undef ENTRY_CHK -# define ENTRY_CHK(name) \ - .type __mempcpy_chk_ia32, @function; \ - .globl __mempcpy_chk_ia32; \ - .p2align 4; \ - __mempcpy_chk_ia32: cfi_startproc; \ - CALL_MCOUNT -# undef END_CHK -# define END_CHK(name) \ - cfi_endproc; .size __mempcpy_chk_ia32, .-__mempcpy_chk_ia32 - -# undef libc_hidden_def -# undef libc_hidden_builtin_def -/* IFUNC doesn't 
work with the hidden functions in shared library since - they will be called without setting up EBX needed for PLT which is - used by IFUNC. */ -# define libc_hidden_def(name) \ - .globl __GI_mempcpy; __GI_mempcpy = __mempcpy_ia32 -# define libc_hidden_builtin_def(name) \ - .globl __GI___mempcpy; __GI___mempcpy = __mempcpy_ia32 -#endif - -#include "../mempcpy.S" diff --git a/sysdeps/i386/i686/multiarch/mempcpy_chk.S b/sysdeps/i386/i686/multiarch/mempcpy_chk.S deleted file mode 100644 index 828fb5e608..0000000000 --- a/sysdeps/i386/i686/multiarch/mempcpy_chk.S +++ /dev/null @@ -1,64 +0,0 @@ -/* Multiple versions of __mempcpy_chk - Copyright (C) 2010 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ - -#include <sysdep.h> -#include <init-arch.h> - -/* Define multiple versions only for the definition in lib and for - DSO. There are no multiarch mempcpy functions for static binaries. 
- */ -#ifndef NOT_IN_libc -# ifdef SHARED - .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits - .globl __i686.get_pc_thunk.bx - .hidden __i686.get_pc_thunk.bx - .p2align 4 - .type __i686.get_pc_thunk.bx,@function -__i686.get_pc_thunk.bx: - movl (%esp), %ebx - ret - - .text -ENTRY(__mempcpy_chk) - .type __mempcpy_chk, @gnu_indirect_function - pushl %ebx - cfi_adjust_cfa_offset (4) - cfi_rel_offset (ebx, 0) - call __i686.get_pc_thunk.bx - addl $_GLOBAL_OFFSET_TABLE_, %ebx - cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx) - jne 1f - call __init_cpu_features -1: leal __mempcpy_chk_ia32@GOTOFF(%ebx), %eax - testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx) - jz 2f - leal __mempcpy_chk_ssse3@GOTOFF(%ebx), %eax - testl $bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx) - jz 2f - leal __mempcpy_chk_ssse3_rep@GOTOFF(%ebx), %eax -2: popl %ebx - cfi_adjust_cfa_offset (-4) - cfi_restore (ebx) - ret -END(__mempcpy_chk) -# else -# include "../mempcpy_chk.S" -# endif -#endif diff --git a/sysdeps/i386/i686/multiarch/memset-sse2-rep.S b/sysdeps/i386/i686/multiarch/memset-sse2-rep.S deleted file mode 100644 index 84afffeb66..0000000000 --- a/sysdeps/i386/i686/multiarch/memset-sse2-rep.S +++ /dev/null @@ -1,821 +0,0 @@ -/* memset with SSE2 and REP string. - Copyright (C) 2010 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ - -#ifndef NOT_IN_libc - -#include <sysdep.h> -#include "asm-syntax.h" - -#define CFI_PUSH(REG) \ - cfi_adjust_cfa_offset (4); \ - cfi_rel_offset (REG, 0) - -#define CFI_POP(REG) \ - cfi_adjust_cfa_offset (-4); \ - cfi_restore (REG) - -#define PUSH(REG) pushl REG; CFI_PUSH (REG) -#define POP(REG) popl REG; CFI_POP (REG) - -#ifdef USE_AS_BZERO -# define DEST PARMS -# define LEN DEST+4 -# define SETRTNVAL -#else -# define DEST PARMS -# define CHR DEST+4 -# define LEN CHR+4 -# define SETRTNVAL movl DEST(%esp), %eax -#endif - -#ifdef SHARED -# define ENTRANCE PUSH (%ebx); -# define RETURN_END POP (%ebx); ret -# define RETURN RETURN_END; CFI_PUSH (%ebx) -# define PARMS 8 /* Preserve EBX. */ -# define JMPTBL(I, B) I - B - -/* Load an entry in a jump table into EBX and branch to it. TABLE is a - jump table with relative offsets. */ -# define BRANCH_TO_JMPTBL_ENTRY(TABLE) \ - /* We first load PC into EBX. */ \ - call __i686.get_pc_thunk.bx; \ - /* Get the address of the jump table. */ \ - add $(TABLE - .), %ebx; \ - /* Get the entry and convert the relative offset to the \ - absolute address. */ \ - add (%ebx,%ecx,4), %ebx; \ - add %ecx, %edx; \ - /* We loaded the jump table and adjuested EDX. Go. */ \ - jmp *%ebx - - .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits - .globl __i686.get_pc_thunk.bx - .hidden __i686.get_pc_thunk.bx - ALIGN (4) - .type __i686.get_pc_thunk.bx,@function -__i686.get_pc_thunk.bx: - movl (%esp), %ebx - ret -#else -# define ENTRANCE -# define RETURN_END ret -# define RETURN RETURN_END -# define PARMS 4 -# define JMPTBL(I, B) I - -/* Branch to an entry in a jump table. TABLE is a jump table with - absolute offsets. 
*/ -# define BRANCH_TO_JMPTBL_ENTRY(TABLE) \ - add %ecx, %edx; \ - jmp *TABLE(,%ecx,4) -#endif - - .section .text.sse2,"ax",@progbits -#if defined SHARED && !defined NOT_IN_libc && !defined USE_AS_BZERO -ENTRY (__memset_chk_sse2_rep) - movl 12(%esp), %eax - cmpl %eax, 16(%esp) - jb HIDDEN_JUMPTARGET (__chk_fail) -END (__memset_chk_sse2_rep) -#endif -ENTRY (__memset_sse2_rep) - ENTRANCE - - movl LEN(%esp), %ecx -#ifdef USE_AS_BZERO - xor %eax, %eax -#else - movzbl CHR(%esp), %eax - movb %al, %ah - /* Fill the whole EAX with pattern. */ - movl %eax, %edx - shl $16, %eax - or %edx, %eax -#endif - movl DEST(%esp), %edx - cmp $32, %ecx - jae L(32bytesormore) - -L(write_less32bytes): - BRANCH_TO_JMPTBL_ENTRY (L(table_less_32bytes)) - - - .pushsection .rodata.sse2,"a",@progbits - ALIGN (2) -L(table_less_32bytes): - .int JMPTBL (L(write_0bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_1bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_2bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_3bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_4bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_5bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_6bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_7bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_8bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_9bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_10bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_11bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_12bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_13bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_14bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_15bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_16bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_17bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_18bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_19bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_20bytes), L(table_less_32bytes)) - .int JMPTBL 
(L(write_21bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_22bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_23bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_24bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_25bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_26bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_27bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_28bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_29bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_30bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_31bytes), L(table_less_32bytes)) - .popsection - - ALIGN (4) -L(write_28bytes): - movl %eax, -28(%edx) -L(write_24bytes): - movl %eax, -24(%edx) -L(write_20bytes): - movl %eax, -20(%edx) -L(write_16bytes): - movl %eax, -16(%edx) -L(write_12bytes): - movl %eax, -12(%edx) -L(write_8bytes): - movl %eax, -8(%edx) -L(write_4bytes): - movl %eax, -4(%edx) -L(write_0bytes): - SETRTNVAL - RETURN - - ALIGN (4) -L(write_29bytes): - movl %eax, -29(%edx) -L(write_25bytes): - movl %eax, -25(%edx) -L(write_21bytes): - movl %eax, -21(%edx) -L(write_17bytes): - movl %eax, -17(%edx) -L(write_13bytes): - movl %eax, -13(%edx) -L(write_9bytes): - movl %eax, -9(%edx) -L(write_5bytes): - movl %eax, -5(%edx) -L(write_1bytes): - movb %al, -1(%edx) - SETRTNVAL - RETURN - - ALIGN (4) -L(write_30bytes): - movl %eax, -30(%edx) -L(write_26bytes): - movl %eax, -26(%edx) -L(write_22bytes): - movl %eax, -22(%edx) -L(write_18bytes): - movl %eax, -18(%edx) -L(write_14bytes): - movl %eax, -14(%edx) -L(write_10bytes): - movl %eax, -10(%edx) -L(write_6bytes): - movl %eax, -6(%edx) -L(write_2bytes): - movw %ax, -2(%edx) - SETRTNVAL - RETURN - - ALIGN (4) -L(write_31bytes): - movl %eax, -31(%edx) -L(write_27bytes): - movl %eax, -27(%edx) -L(write_23bytes): - movl %eax, -23(%edx) -L(write_19bytes): - movl %eax, -19(%edx) -L(write_15bytes): - movl %eax, -15(%edx) -L(write_11bytes): - movl %eax, -11(%edx) -L(write_7bytes): - movl %eax, -7(%edx) 
-L(write_3bytes): - movw %ax, -3(%edx) - movb %al, -1(%edx) - SETRTNVAL - RETURN - - ALIGN (4) -/* ECX > 32 and EDX is 4 byte aligned. */ -L(32bytesormore): - /* Fill xmm0 with the pattern. */ -#ifdef USE_AS_BZERO - pxor %xmm0, %xmm0 -#else - movd %eax, %xmm0 - punpcklbw %xmm0, %xmm0 - pshufd $0, %xmm0, %xmm0 -#endif - testl $0xf, %edx - jz L(aligned_16) -/* ECX > 32 and EDX is not 16 byte aligned. */ -L(not_aligned_16): - movdqu %xmm0, (%edx) - movl %edx, %eax - and $-16, %edx - add $16, %edx - sub %edx, %eax - add %eax, %ecx - movd %xmm0, %eax - - ALIGN (4) -L(aligned_16): - cmp $128, %ecx - jge L(128bytesormore) - -L(aligned_16_less128bytes): - BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) - - ALIGN (4) -L(128bytesormore): - PUSH (%edi) -#ifdef DATA_CACHE_SIZE - PUSH (%ebx) - mov $DATA_CACHE_SIZE, %ebx -#else -# ifdef SHARED - call __i686.get_pc_thunk.bx - add $_GLOBAL_OFFSET_TABLE_, %ebx - mov __x86_data_cache_size@GOTOFF(%ebx), %ebx -# else - PUSH (%ebx) - mov __x86_data_cache_size, %ebx -# endif -#endif - mov %ebx, %edi - shr $4, %ebx - sub %ebx, %edi -#if defined DATA_CACHE_SIZE || !defined SHARED - POP (%ebx) -#endif -/* - * When data size approximate the end of L1 cache, - * fast string will prefetch and combine data efficiently. 
- */ - cmp %edi, %ecx - jae L(128bytesormore_nt) - subl $128, %ecx -L(128bytesormore_normal): - sub $128, %ecx - movdqa %xmm0, (%edx) - movdqa %xmm0, 0x10(%edx) - movdqa %xmm0, 0x20(%edx) - movdqa %xmm0, 0x30(%edx) - movdqa %xmm0, 0x40(%edx) - movdqa %xmm0, 0x50(%edx) - movdqa %xmm0, 0x60(%edx) - movdqa %xmm0, 0x70(%edx) - lea 128(%edx), %edx - jl L(128bytesless_normal) - - - sub $128, %ecx - movdqa %xmm0, (%edx) - movdqa %xmm0, 0x10(%edx) - movdqa %xmm0, 0x20(%edx) - movdqa %xmm0, 0x30(%edx) - movdqa %xmm0, 0x40(%edx) - movdqa %xmm0, 0x50(%edx) - movdqa %xmm0, 0x60(%edx) - movdqa %xmm0, 0x70(%edx) - lea 128(%edx), %edx - jge L(128bytesormore_normal) - -L(128bytesless_normal): - POP (%edi) - lea 128(%ecx), %ecx - BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) - - ALIGN (4) -L(128bytesormore_nt): - mov %edx, %edi - mov %ecx, %edx - shr $2, %ecx - and $3, %edx - rep stosl - jz L(copy_page_by_rep_exit) - cmp $2, %edx - jb L(copy_page_by_rep_left_1) - movw %ax, (%edi) - add $2, %edi - sub $2, %edx - jz L(copy_page_by_rep_exit) -L(copy_page_by_rep_left_1): - movb %al, (%edi) -L(copy_page_by_rep_exit): - POP (%edi) - SETRTNVAL - RETURN - - .pushsection .rodata.sse2,"a",@progbits - ALIGN (2) -L(table_16_128bytes): - .int JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_1bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_2bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_3bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_5bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_6bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_7bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_9bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_10bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_11bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes)) - 
.int JMPTBL (L(aligned_16_13bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_14bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_15bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_17bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_18bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_19bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_21bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_22bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_23bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_25bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_26bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_27bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_29bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_30bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_31bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_33bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_34bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_35bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_37bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_38bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_39bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_41bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_42bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_43bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_45bytes), L(table_16_128bytes)) - .int JMPTBL 
(L(aligned_16_46bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_47bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_49bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_50bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_51bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_53bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_54bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_55bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_57bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_58bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_59bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_61bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_62bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_63bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_65bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_66bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_67bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_69bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_70bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_71bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_73bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_74bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_75bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_77bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_78bytes), L(table_16_128bytes)) - .int JMPTBL 
(L(aligned_16_79bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_81bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_82bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_83bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_85bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_86bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_87bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_89bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_90bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_91bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_93bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_94bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_95bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_97bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_98bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_99bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_101bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_102bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_103bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_105bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_106bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_107bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_109bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_110bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_111bytes), L(table_16_128bytes)) - .int JMPTBL 
(L(aligned_16_112bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_113bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_114bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_115bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_117bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_118bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_119bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_121bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_122bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_123bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_125bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_126bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_127bytes), L(table_16_128bytes)) - .popsection - - ALIGN (4) -L(aligned_16_112bytes): - movdqa %xmm0, -112(%edx) -L(aligned_16_96bytes): - movdqa %xmm0, -96(%edx) -L(aligned_16_80bytes): - movdqa %xmm0, -80(%edx) -L(aligned_16_64bytes): - movdqa %xmm0, -64(%edx) -L(aligned_16_48bytes): - movdqa %xmm0, -48(%edx) -L(aligned_16_32bytes): - movdqa %xmm0, -32(%edx) -L(aligned_16_16bytes): - movdqa %xmm0, -16(%edx) -L(aligned_16_0bytes): - SETRTNVAL - RETURN - - ALIGN (4) -L(aligned_16_113bytes): - movdqa %xmm0, -113(%edx) -L(aligned_16_97bytes): - movdqa %xmm0, -97(%edx) -L(aligned_16_81bytes): - movdqa %xmm0, -81(%edx) -L(aligned_16_65bytes): - movdqa %xmm0, -65(%edx) -L(aligned_16_49bytes): - movdqa %xmm0, -49(%edx) -L(aligned_16_33bytes): - movdqa %xmm0, -33(%edx) -L(aligned_16_17bytes): - movdqa %xmm0, -17(%edx) -L(aligned_16_1bytes): - movb %al, -1(%edx) - SETRTNVAL - RETURN - - ALIGN (4) -L(aligned_16_114bytes): - movdqa %xmm0, -114(%edx) -L(aligned_16_98bytes): - movdqa %xmm0, -98(%edx) -L(aligned_16_82bytes): - movdqa %xmm0, -82(%edx) 
-L(aligned_16_66bytes): - movdqa %xmm0, -66(%edx) -L(aligned_16_50bytes): - movdqa %xmm0, -50(%edx) -L(aligned_16_34bytes): - movdqa %xmm0, -34(%edx) -L(aligned_16_18bytes): - movdqa %xmm0, -18(%edx) -L(aligned_16_2bytes): - movw %ax, -2(%edx) - SETRTNVAL - RETURN - - ALIGN (4) -L(aligned_16_115bytes): - movdqa %xmm0, -115(%edx) -L(aligned_16_99bytes): - movdqa %xmm0, -99(%edx) -L(aligned_16_83bytes): - movdqa %xmm0, -83(%edx) -L(aligned_16_67bytes): - movdqa %xmm0, -67(%edx) -L(aligned_16_51bytes): - movdqa %xmm0, -51(%edx) -L(aligned_16_35bytes): - movdqa %xmm0, -35(%edx) -L(aligned_16_19bytes): - movdqa %xmm0, -19(%edx) -L(aligned_16_3bytes): - movw %ax, -3(%edx) - movb %al, -1(%edx) - SETRTNVAL - RETURN - - ALIGN (4) -L(aligned_16_116bytes): - movdqa %xmm0, -116(%edx) -L(aligned_16_100bytes): - movdqa %xmm0, -100(%edx) -L(aligned_16_84bytes): - movdqa %xmm0, -84(%edx) -L(aligned_16_68bytes): - movdqa %xmm0, -68(%edx) -L(aligned_16_52bytes): - movdqa %xmm0, -52(%edx) -L(aligned_16_36bytes): - movdqa %xmm0, -36(%edx) -L(aligned_16_20bytes): - movdqa %xmm0, -20(%edx) -L(aligned_16_4bytes): - movl %eax, -4(%edx) - SETRTNVAL - RETURN - - ALIGN (4) -L(aligned_16_117bytes): - movdqa %xmm0, -117(%edx) -L(aligned_16_101bytes): - movdqa %xmm0, -101(%edx) -L(aligned_16_85bytes): - movdqa %xmm0, -85(%edx) -L(aligned_16_69bytes): - movdqa %xmm0, -69(%edx) -L(aligned_16_53bytes): - movdqa %xmm0, -53(%edx) -L(aligned_16_37bytes): - movdqa %xmm0, -37(%edx) -L(aligned_16_21bytes): - movdqa %xmm0, -21(%edx) -L(aligned_16_5bytes): - movl %eax, -5(%edx) - movb %al, -1(%edx) - SETRTNVAL - RETURN - - ALIGN (4) -L(aligned_16_118bytes): - movdqa %xmm0, -118(%edx) -L(aligned_16_102bytes): - movdqa %xmm0, -102(%edx) -L(aligned_16_86bytes): - movdqa %xmm0, -86(%edx) -L(aligned_16_70bytes): - movdqa %xmm0, -70(%edx) -L(aligned_16_54bytes): - movdqa %xmm0, -54(%edx) -L(aligned_16_38bytes): - movdqa %xmm0, -38(%edx) -L(aligned_16_22bytes): - movdqa %xmm0, -22(%edx) -L(aligned_16_6bytes): - 
movl %eax, -6(%edx) - movw %ax, -2(%edx) - SETRTNVAL - RETURN - - ALIGN (4) -L(aligned_16_119bytes): - movdqa %xmm0, -119(%edx) -L(aligned_16_103bytes): - movdqa %xmm0, -103(%edx) -L(aligned_16_87bytes): - movdqa %xmm0, -87(%edx) -L(aligned_16_71bytes): - movdqa %xmm0, -71(%edx) -L(aligned_16_55bytes): - movdqa %xmm0, -55(%edx) -L(aligned_16_39bytes): - movdqa %xmm0, -39(%edx) -L(aligned_16_23bytes): - movdqa %xmm0, -23(%edx) -L(aligned_16_7bytes): - movl %eax, -7(%edx) - movw %ax, -3(%edx) - movb %al, -1(%edx) - SETRTNVAL - RETURN - - ALIGN (4) -L(aligned_16_120bytes): - movdqa %xmm0, -120(%edx) -L(aligned_16_104bytes): - movdqa %xmm0, -104(%edx) -L(aligned_16_88bytes): - movdqa %xmm0, -88(%edx) -L(aligned_16_72bytes): - movdqa %xmm0, -72(%edx) -L(aligned_16_56bytes): - movdqa %xmm0, -56(%edx) -L(aligned_16_40bytes): - movdqa %xmm0, -40(%edx) -L(aligned_16_24bytes): - movdqa %xmm0, -24(%edx) -L(aligned_16_8bytes): - movq %xmm0, -8(%edx) - SETRTNVAL - RETURN - - ALIGN (4) -L(aligned_16_121bytes): - movdqa %xmm0, -121(%edx) -L(aligned_16_105bytes): - movdqa %xmm0, -105(%edx) -L(aligned_16_89bytes): - movdqa %xmm0, -89(%edx) -L(aligned_16_73bytes): - movdqa %xmm0, -73(%edx) -L(aligned_16_57bytes): - movdqa %xmm0, -57(%edx) -L(aligned_16_41bytes): - movdqa %xmm0, -41(%edx) -L(aligned_16_25bytes): - movdqa %xmm0, -25(%edx) -L(aligned_16_9bytes): - movq %xmm0, -9(%edx) - movb %al, -1(%edx) - SETRTNVAL - RETURN - - ALIGN (4) -L(aligned_16_122bytes): - movdqa %xmm0, -122(%edx) -L(aligned_16_106bytes): - movdqa %xmm0, -106(%edx) -L(aligned_16_90bytes): - movdqa %xmm0, -90(%edx) -L(aligned_16_74bytes): - movdqa %xmm0, -74(%edx) -L(aligned_16_58bytes): - movdqa %xmm0, -58(%edx) -L(aligned_16_42bytes): - movdqa %xmm0, -42(%edx) -L(aligned_16_26bytes): - movdqa %xmm0, -26(%edx) -L(aligned_16_10bytes): - movq %xmm0, -10(%edx) - movw %ax, -2(%edx) - SETRTNVAL - RETURN - - ALIGN (4) -L(aligned_16_123bytes): - movdqa %xmm0, -123(%edx) -L(aligned_16_107bytes): - movdqa %xmm0, 
-107(%edx) -L(aligned_16_91bytes): - movdqa %xmm0, -91(%edx) -L(aligned_16_75bytes): - movdqa %xmm0, -75(%edx) -L(aligned_16_59bytes): - movdqa %xmm0, -59(%edx) -L(aligned_16_43bytes): - movdqa %xmm0, -43(%edx) -L(aligned_16_27bytes): - movdqa %xmm0, -27(%edx) -L(aligned_16_11bytes): - movq %xmm0, -11(%edx) - movw %ax, -3(%edx) - movb %al, -1(%edx) - SETRTNVAL - RETURN - - ALIGN (4) -L(aligned_16_124bytes): - movdqa %xmm0, -124(%edx) -L(aligned_16_108bytes): - movdqa %xmm0, -108(%edx) -L(aligned_16_92bytes): - movdqa %xmm0, -92(%edx) -L(aligned_16_76bytes): - movdqa %xmm0, -76(%edx) -L(aligned_16_60bytes): - movdqa %xmm0, -60(%edx) -L(aligned_16_44bytes): - movdqa %xmm0, -44(%edx) -L(aligned_16_28bytes): - movdqa %xmm0, -28(%edx) -L(aligned_16_12bytes): - movq %xmm0, -12(%edx) - movl %eax, -4(%edx) - SETRTNVAL - RETURN - - ALIGN (4) -L(aligned_16_125bytes): - movdqa %xmm0, -125(%edx) -L(aligned_16_109bytes): - movdqa %xmm0, -109(%edx) -L(aligned_16_93bytes): - movdqa %xmm0, -93(%edx) -L(aligned_16_77bytes): - movdqa %xmm0, -77(%edx) -L(aligned_16_61bytes): - movdqa %xmm0, -61(%edx) -L(aligned_16_45bytes): - movdqa %xmm0, -45(%edx) -L(aligned_16_29bytes): - movdqa %xmm0, -29(%edx) -L(aligned_16_13bytes): - movq %xmm0, -13(%edx) - movl %eax, -5(%edx) - movb %al, -1(%edx) - SETRTNVAL - RETURN - - ALIGN (4) -L(aligned_16_126bytes): - movdqa %xmm0, -126(%edx) -L(aligned_16_110bytes): - movdqa %xmm0, -110(%edx) -L(aligned_16_94bytes): - movdqa %xmm0, -94(%edx) -L(aligned_16_78bytes): - movdqa %xmm0, -78(%edx) -L(aligned_16_62bytes): - movdqa %xmm0, -62(%edx) -L(aligned_16_46bytes): - movdqa %xmm0, -46(%edx) -L(aligned_16_30bytes): - movdqa %xmm0, -30(%edx) -L(aligned_16_14bytes): - movq %xmm0, -14(%edx) - movl %eax, -6(%edx) - movw %ax, -2(%edx) - SETRTNVAL - RETURN - - ALIGN (4) -L(aligned_16_127bytes): - movdqa %xmm0, -127(%edx) -L(aligned_16_111bytes): - movdqa %xmm0, -111(%edx) -L(aligned_16_95bytes): - movdqa %xmm0, -95(%edx) -L(aligned_16_79bytes): - movdqa %xmm0, 
-79(%edx) -L(aligned_16_63bytes): - movdqa %xmm0, -63(%edx) -L(aligned_16_47bytes): - movdqa %xmm0, -47(%edx) -L(aligned_16_31bytes): - movdqa %xmm0, -31(%edx) -L(aligned_16_15bytes): - movq %xmm0, -15(%edx) - movl %eax, -7(%edx) - movw %ax, -3(%edx) - movb %al, -1(%edx) - SETRTNVAL - RETURN_END - -END (__memset_sse2_rep) - -#endif diff --git a/sysdeps/i386/i686/multiarch/memset-sse2.S b/sysdeps/i386/i686/multiarch/memset-sse2.S deleted file mode 100644 index b2b979193e..0000000000 --- a/sysdeps/i386/i686/multiarch/memset-sse2.S +++ /dev/null @@ -1,867 +0,0 @@ -/* memset with SSE2 - Copyright (C) 2010 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. 
*/ - -#ifndef NOT_IN_libc - -#include <sysdep.h> -#include "asm-syntax.h" - -#define CFI_PUSH(REG) \ - cfi_adjust_cfa_offset (4); \ - cfi_rel_offset (REG, 0) - -#define CFI_POP(REG) \ - cfi_adjust_cfa_offset (-4); \ - cfi_restore (REG) - -#define PUSH(REG) pushl REG; CFI_PUSH (REG) -#define POP(REG) popl REG; CFI_POP (REG) - -#ifdef USE_AS_BZERO -# define DEST PARMS -# define LEN DEST+4 -# define SETRTNVAL -#else -# define DEST PARMS -# define CHR DEST+4 -# define LEN CHR+4 -# define SETRTNVAL movl DEST(%esp), %eax -#endif - -#ifdef SHARED -# define ENTRANCE PUSH (%ebx); -# define RETURN_END POP (%ebx); ret -# define RETURN RETURN_END; CFI_PUSH (%ebx) -# define PARMS 8 /* Preserve EBX. */ -# define JMPTBL(I, B) I - B - -/* Load an entry in a jump table into EBX and branch to it. TABLE is a - jump table with relative offsets. */ -# define BRANCH_TO_JMPTBL_ENTRY(TABLE) \ - /* We first load PC into EBX. */ \ - call __i686.get_pc_thunk.bx; \ - /* Get the address of the jump table. */ \ - add $(TABLE - .), %ebx; \ - /* Get the entry and convert the relative offset to the \ - absolute address. */ \ - add (%ebx,%ecx,4), %ebx; \ - add %ecx, %edx; \ - /* We loaded the jump table and adjuested EDX. Go. */ \ - jmp *%ebx - - .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits - .globl __i686.get_pc_thunk.bx - .hidden __i686.get_pc_thunk.bx - ALIGN (4) - .type __i686.get_pc_thunk.bx,@function -__i686.get_pc_thunk.bx: - movl (%esp), %ebx - ret -#else -# define ENTRANCE -# define RETURN_END ret -# define RETURN RETURN_END -# define PARMS 4 -# define JMPTBL(I, B) I - -/* Branch to an entry in a jump table. TABLE is a jump table with - absolute offsets. 
*/ -# define BRANCH_TO_JMPTBL_ENTRY(TABLE) \ - add %ecx, %edx; \ - jmp *TABLE(,%ecx,4) -#endif - - .section .text.sse2,"ax",@progbits -#if defined SHARED && !defined NOT_IN_libc && !defined USE_AS_BZERO -ENTRY (__memset_chk_sse2) - movl 12(%esp), %eax - cmpl %eax, 16(%esp) - jb HIDDEN_JUMPTARGET (__chk_fail) -END (__memset_chk_sse2) -#endif -ENTRY (__memset_sse2) - ENTRANCE - - movl LEN(%esp), %ecx -#ifdef USE_AS_BZERO - xor %eax, %eax -#else - movzbl CHR(%esp), %eax - movb %al, %ah - /* Fill the whole EAX with pattern. */ - movl %eax, %edx - shl $16, %eax - or %edx, %eax -#endif - movl DEST(%esp), %edx - cmp $32, %ecx - jae L(32bytesormore) - -L(write_less32bytes): - BRANCH_TO_JMPTBL_ENTRY (L(table_less_32bytes)) - - - .pushsection .rodata.sse2,"a",@progbits - ALIGN (2) -L(table_less_32bytes): - .int JMPTBL (L(write_0bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_1bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_2bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_3bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_4bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_5bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_6bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_7bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_8bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_9bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_10bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_11bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_12bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_13bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_14bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_15bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_16bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_17bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_18bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_19bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_20bytes), L(table_less_32bytes)) - .int JMPTBL 
(L(write_21bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_22bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_23bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_24bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_25bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_26bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_27bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_28bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_29bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_30bytes), L(table_less_32bytes)) - .int JMPTBL (L(write_31bytes), L(table_less_32bytes)) - .popsection - - ALIGN (4) -L(write_28bytes): - movl %eax, -28(%edx) -L(write_24bytes): - movl %eax, -24(%edx) -L(write_20bytes): - movl %eax, -20(%edx) -L(write_16bytes): - movl %eax, -16(%edx) -L(write_12bytes): - movl %eax, -12(%edx) -L(write_8bytes): - movl %eax, -8(%edx) -L(write_4bytes): - movl %eax, -4(%edx) -L(write_0bytes): - SETRTNVAL - RETURN - - ALIGN (4) -L(write_29bytes): - movl %eax, -29(%edx) -L(write_25bytes): - movl %eax, -25(%edx) -L(write_21bytes): - movl %eax, -21(%edx) -L(write_17bytes): - movl %eax, -17(%edx) -L(write_13bytes): - movl %eax, -13(%edx) -L(write_9bytes): - movl %eax, -9(%edx) -L(write_5bytes): - movl %eax, -5(%edx) -L(write_1bytes): - movb %al, -1(%edx) - SETRTNVAL - RETURN - - ALIGN (4) -L(write_30bytes): - movl %eax, -30(%edx) -L(write_26bytes): - movl %eax, -26(%edx) -L(write_22bytes): - movl %eax, -22(%edx) -L(write_18bytes): - movl %eax, -18(%edx) -L(write_14bytes): - movl %eax, -14(%edx) -L(write_10bytes): - movl %eax, -10(%edx) -L(write_6bytes): - movl %eax, -6(%edx) -L(write_2bytes): - movw %ax, -2(%edx) - SETRTNVAL - RETURN - - ALIGN (4) -L(write_31bytes): - movl %eax, -31(%edx) -L(write_27bytes): - movl %eax, -27(%edx) -L(write_23bytes): - movl %eax, -23(%edx) -L(write_19bytes): - movl %eax, -19(%edx) -L(write_15bytes): - movl %eax, -15(%edx) -L(write_11bytes): - movl %eax, -11(%edx) -L(write_7bytes): - movl %eax, -7(%edx) 
-L(write_3bytes): - movw %ax, -3(%edx) - movb %al, -1(%edx) - SETRTNVAL - RETURN - - ALIGN (4) -/* ECX > 32 and EDX is 4 byte aligned. */ -L(32bytesormore): - /* Fill xmm0 with the pattern. */ -#ifdef USE_AS_BZERO - pxor %xmm0, %xmm0 -#else - movd %eax, %xmm0 - punpcklbw %xmm0, %xmm0 - pshufd $0, %xmm0, %xmm0 -#endif - testl $0xf, %edx - jz L(aligned_16) -/* ECX > 32 and EDX is not 16 byte aligned. */ -L(not_aligned_16): - movdqu %xmm0, (%edx) - movl %edx, %eax - and $-16, %edx - add $16, %edx - sub %edx, %eax - add %eax, %ecx - movd %xmm0, %eax - - ALIGN (4) -L(aligned_16): - cmp $128, %ecx - jge L(128bytesormore) - -L(aligned_16_less128bytes): - BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) - - ALIGN (4) -L(128bytesormore): -#ifdef SHARED_CACHE_SIZE - PUSH (%ebx) - mov $SHARED_CACHE_SIZE, %ebx -#else -# ifdef SHARED - call __i686.get_pc_thunk.bx - add $_GLOBAL_OFFSET_TABLE_, %ebx - mov __x86_shared_cache_size@GOTOFF(%ebx), %ebx -# else - PUSH (%ebx) - mov __x86_shared_cache_size, %ebx -# endif -#endif - cmp %ebx, %ecx - jae L(128bytesormore_nt_start) - - -#ifdef DATA_CACHE_SIZE - POP (%ebx) - cmp $DATA_CACHE_SIZE, %ecx -#else -# ifdef SHARED - call __i686.get_pc_thunk.bx - add $_GLOBAL_OFFSET_TABLE_, %ebx - cmp __x86_data_cache_size@GOTOFF(%ebx), %ecx -# else - POP (%ebx) - cmp __x86_data_cache_size, %ecx -# endif -#endif - - jae L(128bytes_L2_normal) - subl $128, %ecx -L(128bytesormore_normal): - sub $128, %ecx - movdqa %xmm0, (%edx) - movdqa %xmm0, 0x10(%edx) - movdqa %xmm0, 0x20(%edx) - movdqa %xmm0, 0x30(%edx) - movdqa %xmm0, 0x40(%edx) - movdqa %xmm0, 0x50(%edx) - movdqa %xmm0, 0x60(%edx) - movdqa %xmm0, 0x70(%edx) - lea 128(%edx), %edx - jl L(128bytesless_normal) - - - sub $128, %ecx - movdqa %xmm0, (%edx) - movdqa %xmm0, 0x10(%edx) - movdqa %xmm0, 0x20(%edx) - movdqa %xmm0, 0x30(%edx) - movdqa %xmm0, 0x40(%edx) - movdqa %xmm0, 0x50(%edx) - movdqa %xmm0, 0x60(%edx) - movdqa %xmm0, 0x70(%edx) - lea 128(%edx), %edx - jge L(128bytesormore_normal) - 
-L(128bytesless_normal): - lea 128(%ecx), %ecx - BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) - - ALIGN (4) -L(128bytes_L2_normal): - prefetcht0 0x380(%edx) - prefetcht0 0x3c0(%edx) - sub $128, %ecx - movdqa %xmm0, (%edx) - movaps %xmm0, 0x10(%edx) - movaps %xmm0, 0x20(%edx) - movaps %xmm0, 0x30(%edx) - movaps %xmm0, 0x40(%edx) - movaps %xmm0, 0x50(%edx) - movaps %xmm0, 0x60(%edx) - movaps %xmm0, 0x70(%edx) - add $128, %edx - cmp $128, %ecx - jge L(128bytes_L2_normal) - -L(128bytesless_L2_normal): - BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) - -L(128bytesormore_nt_start): - sub %ebx, %ecx - ALIGN (4) -L(128bytesormore_shared_cache_loop): - prefetcht0 0x3c0(%edx) - prefetcht0 0x380(%edx) - sub $0x80, %ebx - movdqa %xmm0, (%edx) - movdqa %xmm0, 0x10(%edx) - movdqa %xmm0, 0x20(%edx) - movdqa %xmm0, 0x30(%edx) - movdqa %xmm0, 0x40(%edx) - movdqa %xmm0, 0x50(%edx) - movdqa %xmm0, 0x60(%edx) - movdqa %xmm0, 0x70(%edx) - add $0x80, %edx - cmp $0x80, %ebx - jge L(128bytesormore_shared_cache_loop) - cmp $0x80, %ecx - jb L(shared_cache_loop_end) - ALIGN (4) -L(128bytesormore_nt): - sub $0x80, %ecx - movntdq %xmm0, (%edx) - movntdq %xmm0, 0x10(%edx) - movntdq %xmm0, 0x20(%edx) - movntdq %xmm0, 0x30(%edx) - movntdq %xmm0, 0x40(%edx) - movntdq %xmm0, 0x50(%edx) - movntdq %xmm0, 0x60(%edx) - movntdq %xmm0, 0x70(%edx) - add $0x80, %edx - cmp $0x80, %ecx - jge L(128bytesormore_nt) - sfence -L(shared_cache_loop_end): -#if defined DATA_CACHE_SIZE || !defined SHARED - POP (%ebx) -#endif - BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) - - - .pushsection .rodata.sse2,"a",@progbits - ALIGN (2) -L(table_16_128bytes): - .int JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_1bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_2bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_3bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_5bytes), L(table_16_128bytes)) - .int JMPTBL 
(L(aligned_16_6bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_7bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_9bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_10bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_11bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_13bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_14bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_15bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_17bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_18bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_19bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_21bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_22bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_23bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_25bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_26bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_27bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_29bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_30bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_31bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_33bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_34bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_35bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_37bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_38bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_39bytes), 
L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_41bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_42bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_43bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_45bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_46bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_47bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_49bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_50bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_51bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_53bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_54bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_55bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_57bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_58bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_59bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_61bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_62bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_63bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_65bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_66bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_67bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_69bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_70bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_71bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_72bytes), 
L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_73bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_74bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_75bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_77bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_78bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_79bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_81bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_82bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_83bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_85bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_86bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_87bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_89bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_90bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_91bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_93bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_94bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_95bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_97bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_98bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_99bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_101bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_102bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_103bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_105bytes), 
L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_106bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_107bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_109bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_110bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_111bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_113bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_114bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_115bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_117bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_118bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_119bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_121bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_122bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_123bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_125bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_126bytes), L(table_16_128bytes)) - .int JMPTBL (L(aligned_16_127bytes), L(table_16_128bytes)) - .popsection - - ALIGN (4) -L(aligned_16_112bytes): - movdqa %xmm0, -112(%edx) -L(aligned_16_96bytes): - movdqa %xmm0, -96(%edx) -L(aligned_16_80bytes): - movdqa %xmm0, -80(%edx) -L(aligned_16_64bytes): - movdqa %xmm0, -64(%edx) -L(aligned_16_48bytes): - movdqa %xmm0, -48(%edx) -L(aligned_16_32bytes): - movdqa %xmm0, -32(%edx) -L(aligned_16_16bytes): - movdqa %xmm0, -16(%edx) -L(aligned_16_0bytes): - SETRTNVAL - RETURN - - ALIGN (4) -L(aligned_16_113bytes): - movdqa %xmm0, -113(%edx) -L(aligned_16_97bytes): - movdqa %xmm0, -97(%edx) -L(aligned_16_81bytes): - movdqa %xmm0, -81(%edx) -L(aligned_16_65bytes): - movdqa %xmm0, 
-65(%edx) -L(aligned_16_49bytes): - movdqa %xmm0, -49(%edx) -L(aligned_16_33bytes): - movdqa %xmm0, -33(%edx) -L(aligned_16_17bytes): - movdqa %xmm0, -17(%edx) -L(aligned_16_1bytes): - movb %al, -1(%edx) - SETRTNVAL - RETURN - - ALIGN (4) -L(aligned_16_114bytes): - movdqa %xmm0, -114(%edx) -L(aligned_16_98bytes): - movdqa %xmm0, -98(%edx) -L(aligned_16_82bytes): - movdqa %xmm0, -82(%edx) -L(aligned_16_66bytes): - movdqa %xmm0, -66(%edx) -L(aligned_16_50bytes): - movdqa %xmm0, -50(%edx) -L(aligned_16_34bytes): - movdqa %xmm0, -34(%edx) -L(aligned_16_18bytes): - movdqa %xmm0, -18(%edx) -L(aligned_16_2bytes): - movw %ax, -2(%edx) - SETRTNVAL - RETURN - - ALIGN (4) -L(aligned_16_115bytes): - movdqa %xmm0, -115(%edx) -L(aligned_16_99bytes): - movdqa %xmm0, -99(%edx) -L(aligned_16_83bytes): - movdqa %xmm0, -83(%edx) -L(aligned_16_67bytes): - movdqa %xmm0, -67(%edx) -L(aligned_16_51bytes): - movdqa %xmm0, -51(%edx) -L(aligned_16_35bytes): - movdqa %xmm0, -35(%edx) -L(aligned_16_19bytes): - movdqa %xmm0, -19(%edx) -L(aligned_16_3bytes): - movw %ax, -3(%edx) - movb %al, -1(%edx) - SETRTNVAL - RETURN - - ALIGN (4) -L(aligned_16_116bytes): - movdqa %xmm0, -116(%edx) -L(aligned_16_100bytes): - movdqa %xmm0, -100(%edx) -L(aligned_16_84bytes): - movdqa %xmm0, -84(%edx) -L(aligned_16_68bytes): - movdqa %xmm0, -68(%edx) -L(aligned_16_52bytes): - movdqa %xmm0, -52(%edx) -L(aligned_16_36bytes): - movdqa %xmm0, -36(%edx) -L(aligned_16_20bytes): - movdqa %xmm0, -20(%edx) -L(aligned_16_4bytes): - movl %eax, -4(%edx) - SETRTNVAL - RETURN - - ALIGN (4) -L(aligned_16_117bytes): - movdqa %xmm0, -117(%edx) -L(aligned_16_101bytes): - movdqa %xmm0, -101(%edx) -L(aligned_16_85bytes): - movdqa %xmm0, -85(%edx) -L(aligned_16_69bytes): - movdqa %xmm0, -69(%edx) -L(aligned_16_53bytes): - movdqa %xmm0, -53(%edx) -L(aligned_16_37bytes): - movdqa %xmm0, -37(%edx) -L(aligned_16_21bytes): - movdqa %xmm0, -21(%edx) -L(aligned_16_5bytes): - movl %eax, -5(%edx) - movb %al, -1(%edx) - SETRTNVAL - RETURN - 
- ALIGN (4) -L(aligned_16_118bytes): - movdqa %xmm0, -118(%edx) -L(aligned_16_102bytes): - movdqa %xmm0, -102(%edx) -L(aligned_16_86bytes): - movdqa %xmm0, -86(%edx) -L(aligned_16_70bytes): - movdqa %xmm0, -70(%edx) -L(aligned_16_54bytes): - movdqa %xmm0, -54(%edx) -L(aligned_16_38bytes): - movdqa %xmm0, -38(%edx) -L(aligned_16_22bytes): - movdqa %xmm0, -22(%edx) -L(aligned_16_6bytes): - movl %eax, -6(%edx) - movw %ax, -2(%edx) - SETRTNVAL - RETURN - - ALIGN (4) -L(aligned_16_119bytes): - movdqa %xmm0, -119(%edx) -L(aligned_16_103bytes): - movdqa %xmm0, -103(%edx) -L(aligned_16_87bytes): - movdqa %xmm0, -87(%edx) -L(aligned_16_71bytes): - movdqa %xmm0, -71(%edx) -L(aligned_16_55bytes): - movdqa %xmm0, -55(%edx) -L(aligned_16_39bytes): - movdqa %xmm0, -39(%edx) -L(aligned_16_23bytes): - movdqa %xmm0, -23(%edx) -L(aligned_16_7bytes): - movl %eax, -7(%edx) - movw %ax, -3(%edx) - movb %al, -1(%edx) - SETRTNVAL - RETURN - - ALIGN (4) -L(aligned_16_120bytes): - movdqa %xmm0, -120(%edx) -L(aligned_16_104bytes): - movdqa %xmm0, -104(%edx) -L(aligned_16_88bytes): - movdqa %xmm0, -88(%edx) -L(aligned_16_72bytes): - movdqa %xmm0, -72(%edx) -L(aligned_16_56bytes): - movdqa %xmm0, -56(%edx) -L(aligned_16_40bytes): - movdqa %xmm0, -40(%edx) -L(aligned_16_24bytes): - movdqa %xmm0, -24(%edx) -L(aligned_16_8bytes): - movq %xmm0, -8(%edx) - SETRTNVAL - RETURN - - ALIGN (4) -L(aligned_16_121bytes): - movdqa %xmm0, -121(%edx) -L(aligned_16_105bytes): - movdqa %xmm0, -105(%edx) -L(aligned_16_89bytes): - movdqa %xmm0, -89(%edx) -L(aligned_16_73bytes): - movdqa %xmm0, -73(%edx) -L(aligned_16_57bytes): - movdqa %xmm0, -57(%edx) -L(aligned_16_41bytes): - movdqa %xmm0, -41(%edx) -L(aligned_16_25bytes): - movdqa %xmm0, -25(%edx) -L(aligned_16_9bytes): - movq %xmm0, -9(%edx) - movb %al, -1(%edx) - SETRTNVAL - RETURN - - ALIGN (4) -L(aligned_16_122bytes): - movdqa %xmm0, -122(%edx) -L(aligned_16_106bytes): - movdqa %xmm0, -106(%edx) -L(aligned_16_90bytes): - movdqa %xmm0, -90(%edx) 
-L(aligned_16_74bytes): - movdqa %xmm0, -74(%edx) -L(aligned_16_58bytes): - movdqa %xmm0, -58(%edx) -L(aligned_16_42bytes): - movdqa %xmm0, -42(%edx) -L(aligned_16_26bytes): - movdqa %xmm0, -26(%edx) -L(aligned_16_10bytes): - movq %xmm0, -10(%edx) - movw %ax, -2(%edx) - SETRTNVAL - RETURN - - ALIGN (4) -L(aligned_16_123bytes): - movdqa %xmm0, -123(%edx) -L(aligned_16_107bytes): - movdqa %xmm0, -107(%edx) -L(aligned_16_91bytes): - movdqa %xmm0, -91(%edx) -L(aligned_16_75bytes): - movdqa %xmm0, -75(%edx) -L(aligned_16_59bytes): - movdqa %xmm0, -59(%edx) -L(aligned_16_43bytes): - movdqa %xmm0, -43(%edx) -L(aligned_16_27bytes): - movdqa %xmm0, -27(%edx) -L(aligned_16_11bytes): - movq %xmm0, -11(%edx) - movw %ax, -3(%edx) - movb %al, -1(%edx) - SETRTNVAL - RETURN - - ALIGN (4) -L(aligned_16_124bytes): - movdqa %xmm0, -124(%edx) -L(aligned_16_108bytes): - movdqa %xmm0, -108(%edx) -L(aligned_16_92bytes): - movdqa %xmm0, -92(%edx) -L(aligned_16_76bytes): - movdqa %xmm0, -76(%edx) -L(aligned_16_60bytes): - movdqa %xmm0, -60(%edx) -L(aligned_16_44bytes): - movdqa %xmm0, -44(%edx) -L(aligned_16_28bytes): - movdqa %xmm0, -28(%edx) -L(aligned_16_12bytes): - movq %xmm0, -12(%edx) - movl %eax, -4(%edx) - SETRTNVAL - RETURN - - ALIGN (4) -L(aligned_16_125bytes): - movdqa %xmm0, -125(%edx) -L(aligned_16_109bytes): - movdqa %xmm0, -109(%edx) -L(aligned_16_93bytes): - movdqa %xmm0, -93(%edx) -L(aligned_16_77bytes): - movdqa %xmm0, -77(%edx) -L(aligned_16_61bytes): - movdqa %xmm0, -61(%edx) -L(aligned_16_45bytes): - movdqa %xmm0, -45(%edx) -L(aligned_16_29bytes): - movdqa %xmm0, -29(%edx) -L(aligned_16_13bytes): - movq %xmm0, -13(%edx) - movl %eax, -5(%edx) - movb %al, -1(%edx) - SETRTNVAL - RETURN - - ALIGN (4) -L(aligned_16_126bytes): - movdqa %xmm0, -126(%edx) -L(aligned_16_110bytes): - movdqa %xmm0, -110(%edx) -L(aligned_16_94bytes): - movdqa %xmm0, -94(%edx) -L(aligned_16_78bytes): - movdqa %xmm0, -78(%edx) -L(aligned_16_62bytes): - movdqa %xmm0, -62(%edx) -L(aligned_16_46bytes): 
- movdqa %xmm0, -46(%edx) -L(aligned_16_30bytes): - movdqa %xmm0, -30(%edx) -L(aligned_16_14bytes): - movq %xmm0, -14(%edx) - movl %eax, -6(%edx) - movw %ax, -2(%edx) - SETRTNVAL - RETURN - - ALIGN (4) -L(aligned_16_127bytes): - movdqa %xmm0, -127(%edx) -L(aligned_16_111bytes): - movdqa %xmm0, -111(%edx) -L(aligned_16_95bytes): - movdqa %xmm0, -95(%edx) -L(aligned_16_79bytes): - movdqa %xmm0, -79(%edx) -L(aligned_16_63bytes): - movdqa %xmm0, -63(%edx) -L(aligned_16_47bytes): - movdqa %xmm0, -47(%edx) -L(aligned_16_31bytes): - movdqa %xmm0, -31(%edx) -L(aligned_16_15bytes): - movq %xmm0, -15(%edx) - movl %eax, -7(%edx) - movw %ax, -3(%edx) - movb %al, -1(%edx) - SETRTNVAL - RETURN_END - -END (__memset_sse2) - -#endif diff --git a/sysdeps/i386/i686/multiarch/memset.S b/sysdeps/i386/i686/multiarch/memset.S deleted file mode 100644 index 34dddcef7b..0000000000 --- a/sysdeps/i386/i686/multiarch/memset.S +++ /dev/null @@ -1,112 +0,0 @@ -/* Multiple versions of memset - Copyright (C) 2010 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ - -#include <sysdep.h> -#include <init-arch.h> - -/* Define multiple versions only for the definition in lib. 
*/ -#ifndef NOT_IN_libc -# ifdef SHARED - .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits - .globl __i686.get_pc_thunk.bx - .hidden __i686.get_pc_thunk.bx - .p2align 4 - .type __i686.get_pc_thunk.bx,@function -__i686.get_pc_thunk.bx: - movl (%esp), %ebx - ret - - .text -ENTRY(memset) - .type memset, @gnu_indirect_function - pushl %ebx - cfi_adjust_cfa_offset (4) - cfi_rel_offset (ebx, 0) - call __i686.get_pc_thunk.bx - addl $_GLOBAL_OFFSET_TABLE_, %ebx - cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx) - jne 1f - call __init_cpu_features -1: leal __memset_ia32@GOTOFF(%ebx), %eax - testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx) - jz 2f - leal __memset_sse2@GOTOFF(%ebx), %eax - testl $bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx) - jz 2f - leal __memset_sse2_rep@GOTOFF(%ebx), %eax -2: popl %ebx - cfi_adjust_cfa_offset (-4) - cfi_restore (ebx) - ret -END(memset) -# else - .text -ENTRY(memset) - .type memset, @gnu_indirect_function - cmpl $0, KIND_OFFSET+__cpu_features - jne 1f - call __init_cpu_features -1: leal __memset_ia32, %eax - testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features - jz 2f - leal __memset_sse2, %eax - testl $bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features - jz 2f - leal __memset_sse2_rep, %eax -2: ret -END(memset) -# endif - -# undef ENTRY -# define ENTRY(name) \ - .type __memset_ia32, @function; \ - .globl __memset_ia32; \ - .p2align 4; \ - __memset_ia32: cfi_startproc; \ - CALL_MCOUNT -# undef END -# define END(name) \ - cfi_endproc; .size __memset_ia32, .-__memset_ia32 - -# undef ENTRY_CHK -# define ENTRY_CHK(name) \ - .type __memset_chk_ia32, @function; \ - .globl __memset_chk_ia32; \ - .p2align 4; \ - __memset_chk_ia32: cfi_startproc; \ - CALL_MCOUNT -# undef END_CHK -# define END_CHK(name) \ - cfi_endproc; .size __memset_chk_ia32, .-__memset_chk_ia32 - -# ifdef SHARED -# undef libc_hidden_builtin_def -/* IFUNC doesn't work with the hidden 
functions in shared library since - they will be called without setting up EBX needed for PLT which is - used by IFUNC. */ -# define libc_hidden_builtin_def(name) \ - .globl __GI_memset; __GI_memset = __memset_ia32 -# endif - -# undef strong_alias -# define strong_alias(original, alias) -#endif - -#include "../memset.S" diff --git a/sysdeps/i386/i686/multiarch/memset_chk.S b/sysdeps/i386/i686/multiarch/memset_chk.S deleted file mode 100644 index d659c7e56d..0000000000 --- a/sysdeps/i386/i686/multiarch/memset_chk.S +++ /dev/null @@ -1,116 +0,0 @@ -/* Multiple versions of __memset_chk - Copyright (C) 2010 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ - -#include <sysdep.h> -#include <init-arch.h> - -/* Define multiple versions only for the definition in lib. 
*/ -#ifndef NOT_IN_libc -# ifdef SHARED - .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits - .globl __i686.get_pc_thunk.bx - .hidden __i686.get_pc_thunk.bx - .p2align 4 - .type __i686.get_pc_thunk.bx,@function -__i686.get_pc_thunk.bx: - movl (%esp), %ebx - ret - - .text -ENTRY(__memset_chk) - .type __memset_chk, @gnu_indirect_function - pushl %ebx - cfi_adjust_cfa_offset (4) - cfi_rel_offset (ebx, 0) - call __i686.get_pc_thunk.bx - addl $_GLOBAL_OFFSET_TABLE_, %ebx - cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx) - jne 1f - call __init_cpu_features -1: leal __memset_chk_ia32@GOTOFF(%ebx), %eax - testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx) - jz 2f - leal __memset_chk_sse2@GOTOFF(%ebx), %eax - testl $bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx) - jz 2f - leal __memset_chk_sse2_rep@GOTOFF(%ebx), %eax -2: popl %ebx - cfi_adjust_cfa_offset (-4) - cfi_restore (ebx) - ret -END(__memset_chk) - -strong_alias (__memset_chk, __memset_zero_constant_len_parameter) - .section .gnu.warning.__memset_zero_constant_len_parameter - .string "memset used with constant zero length parameter; this could be due to transposed parameters" -# else - .text -ENTRY(__memset_chk) - .type __memset_chk, @gnu_indirect_function - cmpl $0, KIND_OFFSET+__cpu_features - jne 1f - call __init_cpu_features -1: leal __memset_chk_ia32, %eax - testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features - jz 2f - leal __memset_chk_sse2, %eax - testl $bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features - jz 2f - leal __memset_chk_sse2_rep, %eax -2: ret -END(__memset_chk) - - .type __memset_chk_sse2, @function - .p2align 4; -__memset_chk_sse2: - cfi_startproc - CALL_MCOUNT - movl 12(%esp), %eax - cmpl %eax, 16(%esp) - jb __chk_fail - jmp __memset_sse2 - cfi_endproc - .size __memset_chk_sse2, .-__memset_chk_sse2 - - .type __memset_chk_sse2_rep, @function - .p2align 4; -__memset_chk_sse2_rep: - cfi_startproc - 
CALL_MCOUNT - movl 12(%esp), %eax - cmpl %eax, 16(%esp) - jb __chk_fail - jmp __memset_sse2_rep - cfi_endproc - .size __memset_chk_sse2_rep, .-__memset_chk_sse2_rep - - .type __memset_chk_ia32, @function - .p2align 4; -__memset_chk_ia32: - cfi_startproc - CALL_MCOUNT - movl 12(%esp), %eax - cmpl %eax, 16(%esp) - jb __chk_fail - jmp __memset_ia32 - cfi_endproc - .size __memset_chk_ia32, .-__memset_chk_ia32 -# endif -#endif diff --git a/sysdeps/i386/sysdep.h b/sysdeps/i386/sysdep.h index efdc82dde7..e03a8e926d 100644 --- a/sysdeps/i386/sysdep.h +++ b/sysdeps/i386/sysdep.h @@ -67,9 +67,6 @@ ASM_SIZE_DIRECTIVE(name) \ STABS_FUN_END(name) -#define ENTRY_CHK(name) ENTRY (name) -#define END_CHK(name) END (name) - #ifdef HAVE_CPP_ASM_DEBUGINFO /* Disable that goop, because we just pass -g through to the assembler and it generates proper line number information directly. */ diff --git a/sysdeps/x86_64/cacheinfo.c b/sysdeps/x86_64/cacheinfo.c index 54220379ec..5b66c62eb3 100644 --- a/sysdeps/x86_64/cacheinfo.c +++ b/sysdeps/x86_64/cacheinfo.c @@ -74,7 +74,6 @@ static const struct intel_02_cache_info { 0x0a, 2, 32, M(_SC_LEVEL1_DCACHE_SIZE), 8192 }, { 0x0c, 4, 32, M(_SC_LEVEL1_DCACHE_SIZE), 16384 }, { 0x0d, 4, 64, M(_SC_LEVEL1_DCACHE_SIZE), 16384 }, - { 0x0e, 6, 64, M(_SC_LEVEL1_DCACHE_SIZE), 24576 }, { 0x21, 8, 64, M(_SC_LEVEL2_CACHE_SIZE), 262144 }, { 0x22, 4, 64, M(_SC_LEVEL3_CACHE_SIZE), 524288 }, { 0x23, 8, 64, M(_SC_LEVEL3_CACHE_SIZE), 1048576 }, @@ -114,7 +113,6 @@ static const struct intel_02_cache_info { 0x7c, 8, 64, M(_SC_LEVEL2_CACHE_SIZE), 1048576 }, { 0x7d, 8, 64, M(_SC_LEVEL2_CACHE_SIZE), 2097152 }, { 0x7f, 2, 64, M(_SC_LEVEL2_CACHE_SIZE), 524288 }, - { 0x80, 8, 64, M(_SC_LEVEL2_CACHE_SIZE), 524288 }, { 0x82, 8, 32, M(_SC_LEVEL2_CACHE_SIZE), 262144 }, { 0x83, 8, 32, M(_SC_LEVEL2_CACHE_SIZE), 524288 }, { 0x84, 8, 32, M(_SC_LEVEL2_CACHE_SIZE), 1048576 }, @@ -454,10 +452,9 @@ __cache_sysconf (int name) } -/* Data cache size for use in memory and string routines, 
typically +/* Half the data cache size for use in memory and string routines, typically L1 size. */ long int __x86_64_data_cache_size_half attribute_hidden = 32 * 1024 / 2; -long int __x86_64_data_cache_size attribute_hidden = 32 * 1024; /* Shared cache size for use in memory and string routines, typically L2 or L3 size. */ long int __x86_64_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2; @@ -660,10 +657,7 @@ init_cacheinfo (void) } if (data > 0) - { - __x86_64_data_cache_size_half = data / 2; - __x86_64_data_cache_size = data; - } + __x86_64_data_cache_size_half = data / 2; if (shared > 0) { diff --git a/sysdeps/x86_64/multiarch/ifunc-defines.sym b/sysdeps/x86_64/multiarch/ifunc-defines.sym index eb1538abcc..e2021cdf87 100644 --- a/sysdeps/x86_64/multiarch/ifunc-defines.sym +++ b/sysdeps/x86_64/multiarch/ifunc-defines.sym @@ -13,8 +13,5 @@ CPUID_ECX_OFFSET offsetof (struct cpuid_registers, ecx) CPUID_EDX_OFFSET offsetof (struct cpuid_registers, edx) FAMILY_OFFSET offsetof (struct cpu_features, family) MODEL_OFFSET offsetof (struct cpu_features, model) -FEATURE_OFFSET offsetof (struct cpu_features, feature) -FEATURE_SIZE sizeof (unsigned int) COMMON_CPUID_INDEX_1 -FEATURE_INDEX_1 diff --git a/sysdeps/x86_64/multiarch/init-arch.c b/sysdeps/x86_64/multiarch/init-arch.c index 50b2a38fbd..7823aceb9b 100644 --- a/sysdeps/x86_64/multiarch/init-arch.c +++ b/sysdeps/x86_64/multiarch/init-arch.c @@ -64,23 +64,7 @@ __init_cpu_features (void) __cpu_features.model += extended_model; } else if (__cpu_features.family == 0x06) - { - __cpu_features.model += extended_model; - switch (__cpu_features.model) - { - case 0x1a: - case 0x1e: - case 0x1f: - case 0x25: - case 0x2e: - case 0x2f: - /* Rep string instructions are fast on Intel Core i3, i5 - and i7. */ - __cpu_features.feature[index_Fast_Rep_String] - |= bit_Fast_Rep_String; - break; - } - } + __cpu_features.model += extended_model; } /* This spells out "AuthenticAMD". 
*/ else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65) diff --git a/sysdeps/x86_64/multiarch/init-arch.h b/sysdeps/x86_64/multiarch/init-arch.h index 69492cb3bf..0f8f77a8a1 100644 --- a/sysdeps/x86_64/multiarch/init-arch.h +++ b/sysdeps/x86_64/multiarch/init-arch.h @@ -16,8 +16,6 @@ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. */ -#define bit_Fast_Rep_String (1 << 0) - #ifdef __ASSEMBLER__ #include <ifunc-defines.h> @@ -30,8 +28,6 @@ #define index_SSSE3 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET #define index_SSE4_2 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET -#define index_Fast_Rep_String FEATURE_INDEX_1*FEATURE_SIZE - #else /* __ASSEMBLER__ */ #include <sys/param.h> @@ -43,13 +39,6 @@ enum COMMON_CPUID_INDEX_MAX }; -enum - { - FEATURE_INDEX_1 = 0, - /* Keep the following line at the end. */ - FEATURE_INDEX_MAX - }; - extern struct cpu_features { enum @@ -69,7 +58,6 @@ extern struct cpu_features } cpuid[COMMON_CPUID_INDEX_MAX]; unsigned int family; unsigned int model; - unsigned int feature[FEATURE_INDEX_MAX]; } __cpu_features attribute_hidden; @@ -98,6 +86,4 @@ extern const struct cpu_features *__get_cpu_features (void) #define HAS_SSE4_2 HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 20) #define HAS_FMA HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 12) -#define index_Fast_Rep_String FEATURE_INDEX_1 - #endif /* __ASSEMBLER__ */ |