diff options
Diffstat (limited to 'sysdeps/x86_64')
724 files changed, 16693 insertions, 8898 deletions
diff --git a/sysdeps/x86_64/Implies b/sysdeps/x86_64/Implies index 811c19a8f2..3d7ded70d2 100644 --- a/sysdeps/x86_64/Implies +++ b/sysdeps/x86_64/Implies @@ -1,4 +1,5 @@ x86 +ieee754/float128 ieee754/ldbl-96 ieee754/dbl-64/wordsize-64 ieee754/dbl-64 diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile index 67ed5ba213..9f1562f1b2 100644 --- a/sysdeps/x86_64/Makefile +++ b/sysdeps/x86_64/Makefile @@ -7,6 +7,10 @@ endif ifeq ($(subdir),gmon) sysdep_routines += _mcount +# We cannot compile _mcount.S with -pg because that would create +# recursive calls when ENTRY is used. Just copy the normal static +# object. +sysdep_noprof += _mcount endif ifeq ($(subdir),malloc) @@ -23,7 +27,7 @@ ifeq ($(subdir),elf) CFLAGS-.os += $(if $(filter $(@F),$(patsubst %,%.os,$(all-rtld-routines))),\ -mno-mmx) -sysdep-dl-routines += tlsdesc dl-tlsdesc +sysdep-dl-routines += tlsdesc dl-tlsdesc tls_get_addr tests += ifuncmain8 modules-names += ifuncmod8 @@ -39,28 +43,66 @@ $(objpfx)tst-quad2: $(objpfx)tst-quadmod2.so quad-pie-test += tst-quad1pie tst-quad2pie tests += $(quad-pie-test) tests-pie += $(quad-pie-test) +test-extras += tst-quadmod1pie tst-quadmod2pie +extra-test-objs += tst-quadmod1pie.o tst-quadmod2pie.o $(objpfx)tst-quad1pie: $(objpfx)tst-quadmod1pie.o $(objpfx)tst-quad2pie: $(objpfx)tst-quadmod2pie.o -tests += tst-audit3 tst-audit4 tst-audit5 tst-audit6 tst-audit7 tst-audit10 +CFLAGS-tst-quad1pie.c = $(PIE-ccflag) +CFLAGS-tst-quad2pie.c = $(PIE-ccflag) +tests += tst-x86_64-1 +modules-names += x86_64/tst-x86_64mod-1 +LDFLAGS-tst-x86_64mod-1.so = -Wl,-soname,tst-x86_64mod-1.so +ifneq (no,$(have-tunables)) +# Test the state size for XSAVE when XSAVEC is disabled. +tst-x86_64-1-ENV = GLIBC_TUNABLES=glibc.tune.hwcaps=-XSAVEC_Usable +endif + +$(objpfx)tst-x86_64-1: $(objpfx)x86_64/tst-x86_64mod-1.so + +ifneq (no,$(have-tunables)) +tests += tst-platform-1 +modules-names += tst-platformmod-1 x86_64/tst-platformmod-2 +CFLAGS-tst-platform-1.c = -mno-avx +CFLAGS-tst-platformmod-1.c = -mno-avx +CFLAGS-tst-platformmod-2.c = -mno-avx +LDFLAGS-tst-platformmod-2.so = -Wl,-soname,tst-platformmod-2.so +$(objpfx)tst-platform-1: $(objpfx)tst-platformmod-1.so +$(objpfx)tst-platform-1.out: $(objpfx)x86_64/tst-platformmod-2.so +# Turn off AVX512F_Usable and AVX2_Usable so that GLRO(dl_platform) is +# always set to x86_64. +tst-platform-1-ENV = LD_PRELOAD=$(objpfx)\$$PLATFORM/tst-platformmod-2.so \ + GLIBC_TUNABLES=glibc.tune.hwcaps=-AVX512F_Usable,-AVX2_Usable +endif + +tests += tst-audit3 tst-audit4 tst-audit5 tst-audit6 tst-audit7 \ + tst-audit10 tst-sse tst-avx tst-avx512 +test-extras += tst-audit4-aux tst-audit10-aux \ + tst-avx-aux tst-avx512-aux +extra-test-objs += tst-audit4-aux.o tst-audit10-aux.o \ + tst-avx-aux.o tst-avx512-aux.o + +ifeq ($(have-insert),yes) tests += tst-split-dynreloc LDFLAGS-tst-split-dynreloc = -Wl,-T,$(..)sysdeps/x86_64/tst-split-dynreloc.lds tst-split-dynreloc-ENV = LD_BIND_NOW=1 +endif modules-names += tst-auditmod3a tst-auditmod3b \ tst-auditmod4a tst-auditmod4b \ tst-auditmod5a tst-auditmod5b \ tst-auditmod6a tst-auditmod6b tst-auditmod6c \ tst-auditmod7a tst-auditmod7b \ - tst-auditmod10a tst-auditmod10b + tst-auditmod10a tst-auditmod10b \ + tst-ssemod tst-avxmod tst-avx512mod $(objpfx)tst-audit3: $(objpfx)tst-auditmod3a.so $(objpfx)tst-audit3.out: $(objpfx)tst-auditmod3b.so tst-audit3-ENV = LD_AUDIT=$(objpfx)tst-auditmod3b.so -$(objpfx)tst-audit4: $(objpfx)tst-auditmod4a.so +$(objpfx)tst-audit4: $(objpfx)tst-audit4-aux.o $(objpfx)tst-auditmod4a.so $(objpfx)tst-audit4.out: $(objpfx)tst-auditmod4b.so tst-audit4-ENV = LD_AUDIT=$(objpfx)tst-auditmod4b.so @@ -77,25 +119,49 @@ $(objpfx)tst-audit7: $(objpfx)tst-auditmod7a.so $(objpfx)tst-audit7.out: $(objpfx)tst-auditmod7b.so tst-audit7-ENV = LD_AUDIT=$(objpfx)tst-auditmod7b.so -$(objpfx)tst-audit10: $(objpfx)tst-auditmod10a.so +$(objpfx)tst-audit10: $(objpfx)tst-audit10-aux.o $(objpfx)tst-auditmod10a.so $(objpfx)tst-audit10.out: $(objpfx)tst-auditmod10b.so tst-audit10-ENV = LD_AUDIT=$(objpfx)tst-auditmod10b.so +$(objpfx)tst-sse: $(objpfx)tst-ssemod.so +$(objpfx)tst-avx: $(objpfx)tst-avx-aux.o $(objpfx)tst-avxmod.so +$(objpfx)tst-avx512: $(objpfx)tst-avx512-aux.o $(objpfx)tst-avx512mod.so + AVX-CFLAGS=-mavx -mno-vzeroupper -CFLAGS-tst-audit4.c += $(AVX-CFLAGS) +CFLAGS-tst-audit4-aux.c += $(AVX-CFLAGS) CFLAGS-tst-auditmod4a.c += $(AVX-CFLAGS) CFLAGS-tst-auditmod4b.c += $(AVX-CFLAGS) CFLAGS-tst-auditmod6b.c += $(AVX-CFLAGS) CFLAGS-tst-auditmod6c.c += $(AVX-CFLAGS) CFLAGS-tst-auditmod7b.c += $(AVX-CFLAGS) +CFLAGS-tst-avx-aux.c += $(AVX-CFLAGS) +CFLAGS-tst-avxmod.c += $(AVX-CFLAGS) ifeq (yes,$(config-cflags-avx512)) AVX512-CFLAGS = -mavx512f -CFLAGS-tst-audit10.c += $(AVX512-CFLAGS) +CFLAGS-tst-audit10-aux.c += $(AVX512-CFLAGS) CFLAGS-tst-auditmod10a.c += $(AVX512-CFLAGS) CFLAGS-tst-auditmod10b.c += $(AVX512-CFLAGS) +CFLAGS-tst-avx512-aux.c += $(AVX512-CFLAGS) +CFLAGS-tst-avx512mod.c += $(AVX512-CFLAGS) endif endif ifeq ($(subdir),csu) -gen-as-const-headers += tlsdesc.sym +gen-as-const-headers += tlsdesc.sym rtld-offsets.sym endif + +$(objpfx)x86_64/tst-x86_64mod-1.os: $(objpfx)tst-x86_64mod-1.os + $(make-target-directory) + rm -f $@ + ln $< $@ + +do-tests-clean common-mostlyclean: tst-x86_64-1-clean + +.PHONY: tst-x86_64-1-clean +tst-x86_64-1-clean: + -rm -rf $(objpfx)x86_64 + +$(objpfx)x86_64/tst-platformmod-2.os: $(objpfx)tst-platformmod-2.os + $(make-target-directory) + rm -f $@ + ln $< $@ diff --git a/sysdeps/x86_64/__longjmp.S b/sysdeps/x86_64/__longjmp.S index c164626577..d7d123e4bc 100644 --- a/sysdeps/x86_64/__longjmp.S +++ b/sysdeps/x86_64/__longjmp.S @@ -1,4 +1,4 @@ -/* Copyright (C) 2001-2016 Free Software Foundation, Inc. +/* Copyright (C) 2001-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -17,9 +17,18 @@ #include <sysdep.h> #include <jmpbuf-offsets.h> +#include <jmp_buf-ssp.h> #include <asm-syntax.h> #include <stap-probe.h> +/* Don't restore shadow stack register if + 1. Shadow stack isn't enabled. Or + 2. __longjmp is defined for __longjmp_cancel. + */ +#if !SHSTK_ENABLED || defined __longjmp +# undef SHADOW_STACK_POINTER_OFFSET +#endif + /* Jump to the position specified by ENV, causing the setjmp call there to return VAL, or 1 if VAL is 0. void __longjmp (__jmp_buf env, int val). */ @@ -42,6 +51,41 @@ ENTRY(__longjmp) orq %rax, %r9 # endif #endif +#ifdef SHADOW_STACK_POINTER_OFFSET +# if IS_IN (libc) && defined SHARED && defined FEATURE_1_OFFSET + /* Check if Shadow Stack is enabled. */ + testl $X86_FEATURE_1_SHSTK, %fs:FEATURE_1_OFFSET + jz L(skip_ssp) +# else + xorl %eax, %eax +# endif + /* Check and adjust the Shadow-Stack-Pointer. */ + /* Get the current ssp. */ + rdsspq %rax + /* And compare it with the saved ssp value. */ + subq SHADOW_STACK_POINTER_OFFSET(%rdi), %rax + je L(skip_ssp) + /* Count the number of frames to adjust and adjust it + with incssp instruction. The instruction can adjust + the ssp by [0..255] value only thus use a loop if + the number of frames is bigger than 255. */ + negq %rax + shrq $3, %rax + /* NB: We saved Shadow-Stack-Pointer of setjmp. Since we are + restoring Shadow-Stack-Pointer of setjmp's caller, we + need to unwind shadow stack by one more frame. */ + addq $1, %rax + + movl $255, %ebx +L(loop): + cmpq %rbx, %rax + cmovb %rax, %rbx + incsspq %rbx + subq %rbx, %rax + ja L(loop) + +L(skip_ssp): +#endif LIBC_PROBE (longjmp, 3, LP_SIZE@%RDI_LP, -4@%esi, LP_SIZE@%RDX_LP) /* We add unwind information for the target here. */ cfi_def_cfa(%rdi, 0) diff --git a/sysdeps/x86_64/_mcount.S b/sysdeps/x86_64/_mcount.S index 5d7edd2a29..a2f4068b61 100644 --- a/sysdeps/x86_64/_mcount.S +++ b/sysdeps/x86_64/_mcount.S @@ -1,5 +1,5 @@ /* Machine-specific calling sequence for `mcount' profiling function. x86-64 version. - Copyright (C) 2002-2016 Free Software Foundation, Inc. + Copyright (C) 2002-2018 Free Software Foundation, Inc. Contributed by Andreas Jaeger <aj@suse.de>. This file is part of the GNU C Library. @@ -24,81 +24,102 @@ #include <sysdep.h> - .globl C_SYMBOL_NAME(_mcount) - .type C_SYMBOL_NAME(_mcount), @function - .align ALIGNARG(4) -C_LABEL(_mcount) +ENTRY(_mcount) /* Allocate space for 7 registers. */ subq $56,%rsp + cfi_adjust_cfa_offset (56) movq %rax,(%rsp) + cfi_rel_offset (rax, 0) movq %rcx,8(%rsp) + cfi_rel_offset (rcx, 8) movq %rdx,16(%rsp) + cfi_rel_offset (rdx, 16) movq %rsi,24(%rsp) + cfi_rel_offset (rsi, 24) movq %rdi,32(%rsp) + cfi_rel_offset (rdi, 32) movq %r8,40(%rsp) + cfi_rel_offset (r8, 40) movq %r9,48(%rsp) + cfi_rel_offset (r9, 48) /* Setup parameter for __mcount_internal. */ /* selfpc is the return address on the stack. */ movq 56(%rsp),%rsi /* Get frompc via the frame pointer. */ movq 8(%rbp),%rdi -#ifdef PIC - call C_SYMBOL_NAME(__mcount_internal)@PLT -#else call C_SYMBOL_NAME(__mcount_internal) -#endif /* Pop the saved registers. Please note that `mcount' has no return value. */ movq 48(%rsp),%r9 + cfi_restore (r9) movq 40(%rsp),%r8 + cfi_restore (r8) movq 32(%rsp),%rdi + cfi_restore (rdi) movq 24(%rsp),%rsi + cfi_restore (rsi) movq 16(%rsp),%rdx + cfi_restore (rdx) movq 8(%rsp),%rcx + cfi_restore (rcx) movq (%rsp),%rax + cfi_restore (rax) addq $56,%rsp + cfi_adjust_cfa_offset (-56) ret - - ASM_SIZE_DIRECTIVE(C_SYMBOL_NAME(_mcount)) +END(_mcount) #undef mcount weak_alias (_mcount, mcount) - .globl C_SYMBOL_NAME(__fentry__) - .type C_SYMBOL_NAME(__fentry__), @function - .align ALIGNARG(4) -C_LABEL(__fentry__) - /* Allocate space for 7 registers. */ +/* __fentry__ is different from _mcount in that it is called before + function prolog. This means (among other things) that it has non-standard + stack alignment on entry: (%RSP & 0xF) == 0. */ + +ENTRY(__fentry__) + /* Allocate space for 7 registers + (+8 bytes for proper stack alignment). */ subq $64,%rsp + cfi_adjust_cfa_offset (64) movq %rax,(%rsp) + cfi_rel_offset (rax, 0) movq %rcx,8(%rsp) + cfi_rel_offset (rcx, 8) movq %rdx,16(%rsp) + cfi_rel_offset (rdx, 16) movq %rsi,24(%rsp) + cfi_rel_offset (rsi, 24) movq %rdi,32(%rsp) + cfi_rel_offset (rdi, 32) movq %r8,40(%rsp) + cfi_rel_offset (r8, 40) movq %r9,48(%rsp) + cfi_rel_offset (r9, 48) /* Setup parameter for __mcount_internal. */ /* selfpc is the return address on the stack. */ movq 64(%rsp),%rsi /* caller is the return address above it */ movq 72(%rsp),%rdi -#ifdef PIC - call C_SYMBOL_NAME(__mcount_internal)@PLT -#else call C_SYMBOL_NAME(__mcount_internal) -#endif /* Pop the saved registers. Please note that `__fentry__' has no return value. */ movq 48(%rsp),%r9 + cfi_restore (r9) movq 40(%rsp),%r8 + cfi_restore (r8) movq 32(%rsp),%rdi + cfi_restore (rdi) movq 24(%rsp),%rsi + cfi_restore (rsi) movq 16(%rsp),%rdx + cfi_restore (rdx) movq 8(%rsp),%rcx + cfi_restore (rcx) movq (%rsp),%rax + cfi_restore (rax) addq $64,%rsp + cfi_adjust_cfa_offset (-64) ret - - ASM_SIZE_DIRECTIVE(C_SYMBOL_NAME(__fentry__)) +END(__fentry__) diff --git a/sysdeps/x86_64/add_n.S b/sysdeps/x86_64/add_n.S index fc99811476..ba9699a2f0 100644 --- a/sysdeps/x86_64/add_n.S +++ b/sysdeps/x86_64/add_n.S @@ -1,6 +1,6 @@ /* x86-64 __mpn_add_n -- Add two limb vectors of the same length > 0 and store sum in a third limb vector. - Copyright (C) 2006-2016 Free Software Foundation, Inc. + Copyright (C) 2006-2018 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify diff --git a/sysdeps/x86_64/addmul_1.S b/sysdeps/x86_64/addmul_1.S index ab7c2fa701..eefe8004b0 100644 --- a/sysdeps/x86_64/addmul_1.S +++ b/sysdeps/x86_64/addmul_1.S @@ -1,6 +1,6 @@ /* x86-64 __mpn_addmul_1 -- Multiply a limb vector with a limb and add the result to a second limb vector. - Copyright (C) 2003-2016 Free Software Foundation, Inc. + Copyright (C) 2003-2018 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify diff --git a/sysdeps/x86_64/atomic-machine.h b/sysdeps/x86_64/atomic-machine.h index a5b86eb3ce..9d31c64962 100644 --- a/sysdeps/x86_64/atomic-machine.h +++ b/sysdeps/x86_64/atomic-machine.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2002-2016 Free Software Foundation, Inc. +/* Copyright (C) 2002-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper <drepper@redhat.com>, 2002. @@ -16,10 +16,12 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#include <stdint.h> -#include <tls.h> /* For tcbhead_t. */ -#include <libc-internal.h> +#ifndef _X86_64_ATOMIC_MACHINE_H +#define _X86_64_ATOMIC_MACHINE_H 1 +#include <stdint.h> +#include <tls.h> /* For tcbhead_t. */ +#include <libc-pointer-arith.h> /* For cast_to_integer. */ typedef int8_t atomic8_t; typedef uint8_t uatomic8_t; @@ -57,6 +59,7 @@ typedef uintmax_t uatomic_max_t; #define __HAVE_64B_ATOMICS 1 #define USE_ATOMIC_COMPILER_BUILTINS 1 +#define ATOMIC_EXCHANGE_USES_CAS 0 #define atomic_compare_and_exchange_val_acq(mem, newval, oldval) \ __sync_val_compare_and_swap (mem, oldval, newval) @@ -475,3 +478,5 @@ typedef uintmax_t uatomic_max_t; __asm __volatile (LOCK_PREFIX "orl $0, (%%rsp)" ::: "memory") #define atomic_read_barrier() __asm ("" ::: "memory") #define atomic_write_barrier() __asm ("" ::: "memory") + +#endif /* atomic-machine.h */ diff --git a/sysdeps/x86_64/backtrace.c b/sysdeps/x86_64/backtrace.c deleted file mode 100644 index e04407c516..0000000000 --- a/sysdeps/x86_64/backtrace.c +++ /dev/null @@ -1,133 +0,0 @@ -/* Return backtrace of current program state. - Copyright (C) 2003-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - Contributed by Jakub Jelinek <jakub@redhat.com>, 2003. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <libc-lock.h> -#include <dlfcn.h> -#include <execinfo.h> -#include <stdlib.h> -#include <unwind.h> - -struct trace_arg -{ - void **array; - _Unwind_Word cfa; - int cnt; - int size; -}; - -#ifdef SHARED -static _Unwind_Reason_Code (*unwind_backtrace) (_Unwind_Trace_Fn, void *); -static _Unwind_Ptr (*unwind_getip) (struct _Unwind_Context *); -static _Unwind_Word (*unwind_getcfa) (struct _Unwind_Context *); -static void *libgcc_handle; - - -/* Dummy version in case libgcc_s does not contain the real code. */ -static _Unwind_Word -dummy_getcfa (struct _Unwind_Context *ctx __attribute__ ((unused))) -{ - return 0; -} - - -static void -init (void) -{ - libgcc_handle = __libc_dlopen ("libgcc_s.so.1"); - - if (libgcc_handle == NULL) - return; - - unwind_backtrace = __libc_dlsym (libgcc_handle, "_Unwind_Backtrace"); - unwind_getip = __libc_dlsym (libgcc_handle, "_Unwind_GetIP"); - if (unwind_getip == NULL) - unwind_backtrace = NULL; - unwind_getcfa = (__libc_dlsym (libgcc_handle, "_Unwind_GetCFA") - ?: dummy_getcfa); -} -#else -# define unwind_backtrace _Unwind_Backtrace -# define unwind_getip _Unwind_GetIP -# define unwind_getcfa _Unwind_GetCFA -#endif - -static _Unwind_Reason_Code -backtrace_helper (struct _Unwind_Context *ctx, void *a) -{ - struct trace_arg *arg = a; - - /* We are first called with address in the __backtrace function. - Skip it. */ - if (arg->cnt != -1) - { - arg->array[arg->cnt] = (void *) unwind_getip (ctx); - - /* Check whether we make any progress. */ - _Unwind_Word cfa = unwind_getcfa (ctx); - - if (arg->cnt > 0 && arg->array[arg->cnt - 1] == arg->array[arg->cnt] - && cfa == arg->cfa) - return _URC_END_OF_STACK; - arg->cfa = cfa; - } - if (++arg->cnt == arg->size) - return _URC_END_OF_STACK; - return _URC_NO_REASON; -} - -int -__backtrace (void **array, int size) -{ - struct trace_arg arg = { .array = array, .cfa = 0, .size = size, .cnt = -1 }; - - if (size <= 0) - return 0; - -#ifdef SHARED - __libc_once_define (static, once); - - __libc_once (once, init); - if (unwind_backtrace == NULL) - return 0; -#endif - - unwind_backtrace (backtrace_helper, &arg); - - /* _Unwind_Backtrace seems to put NULL address above - _start. Fix it up here. */ - if (arg.cnt > 1 && arg.array[arg.cnt - 1] == NULL) - --arg.cnt; - return arg.cnt != -1 ? arg.cnt : 0; -} -weak_alias (__backtrace, backtrace) -libc_hidden_def (__backtrace) - - -#ifdef SHARED -/* Free all resources if necessary. */ -libc_freeres_fn (free_mem) -{ - unwind_backtrace = NULL; - if (libgcc_handle != NULL) - { - __libc_dlclose (libgcc_handle); - libgcc_handle = NULL; - } -} -#endif diff --git a/sysdeps/x86_64/bsd-_setjmp.S b/sysdeps/x86_64/bsd-_setjmp.S index 1a2a94f1a6..58c997de59 100644 --- a/sysdeps/x86_64/bsd-_setjmp.S +++ b/sysdeps/x86_64/bsd-_setjmp.S @@ -1,5 +1,5 @@ /* BSD `_setjmp' entry point to `sigsetjmp (..., 0)'. x86-64 version. - Copyright (C) 1994-2016 Free Software Foundation, Inc. + Copyright (C) 1994-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/bsd-setjmp.S b/sysdeps/x86_64/bsd-setjmp.S index 11d9d8daa0..8e3c430dac 100644 --- a/sysdeps/x86_64/bsd-setjmp.S +++ b/sysdeps/x86_64/bsd-setjmp.S @@ -1,5 +1,5 @@ /* BSD `setjmp' entry point to `sigsetjmp (..., 1)'. x86-64 version. - Copyright (C) 1994-2016 Free Software Foundation, Inc. + Copyright (C) 1994-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/cacheinfo.c b/sysdeps/x86_64/cacheinfo.c deleted file mode 100644 index 96463df064..0000000000 --- a/sysdeps/x86_64/cacheinfo.c +++ /dev/null @@ -1,665 +0,0 @@ -/* x86_64 cache info. - Copyright (C) 2003-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <assert.h> -#include <stdbool.h> -#include <stdlib.h> -#include <unistd.h> -#include <cpuid.h> -#include <init-arch.h> - -#define is_intel GLRO(dl_x86_cpu_features).kind == arch_kind_intel -#define is_amd GLRO(dl_x86_cpu_features).kind == arch_kind_amd -#define max_cpuid GLRO(dl_x86_cpu_features).max_cpuid - -static const struct intel_02_cache_info -{ - unsigned char idx; - unsigned char assoc; - unsigned char linesize; - unsigned char rel_name; - unsigned int size; -} intel_02_known [] = - { -#define M(sc) ((sc) - _SC_LEVEL1_ICACHE_SIZE) - { 0x06, 4, 32, M(_SC_LEVEL1_ICACHE_SIZE), 8192 }, - { 0x08, 4, 32, M(_SC_LEVEL1_ICACHE_SIZE), 16384 }, - { 0x09, 4, 32, M(_SC_LEVEL1_ICACHE_SIZE), 32768 }, - { 0x0a, 2, 32, M(_SC_LEVEL1_DCACHE_SIZE), 8192 }, - { 0x0c, 4, 32, M(_SC_LEVEL1_DCACHE_SIZE), 16384 }, - { 0x0d, 4, 64, M(_SC_LEVEL1_DCACHE_SIZE), 16384 }, - { 0x0e, 6, 64, M(_SC_LEVEL1_DCACHE_SIZE), 24576 }, - { 0x21, 8, 64, M(_SC_LEVEL2_CACHE_SIZE), 262144 }, - { 0x22, 4, 64, M(_SC_LEVEL3_CACHE_SIZE), 524288 }, - { 0x23, 8, 64, M(_SC_LEVEL3_CACHE_SIZE), 1048576 }, - { 0x25, 8, 64, M(_SC_LEVEL3_CACHE_SIZE), 2097152 }, - { 0x29, 8, 64, M(_SC_LEVEL3_CACHE_SIZE), 4194304 }, - { 0x2c, 8, 64, M(_SC_LEVEL1_DCACHE_SIZE), 32768 }, - { 0x30, 8, 64, M(_SC_LEVEL1_ICACHE_SIZE), 32768 }, - { 0x39, 4, 64, M(_SC_LEVEL2_CACHE_SIZE), 131072 }, - { 0x3a, 6, 64, M(_SC_LEVEL2_CACHE_SIZE), 196608 }, - { 0x3b, 2, 64, M(_SC_LEVEL2_CACHE_SIZE), 131072 }, - { 0x3c, 4, 64, M(_SC_LEVEL2_CACHE_SIZE), 262144 }, - { 0x3d, 6, 64, M(_SC_LEVEL2_CACHE_SIZE), 393216 }, - { 0x3e, 4, 64, M(_SC_LEVEL2_CACHE_SIZE), 524288 }, - { 0x3f, 2, 64, M(_SC_LEVEL2_CACHE_SIZE), 262144 }, - { 0x41, 4, 32, M(_SC_LEVEL2_CACHE_SIZE), 131072 }, - { 0x42, 4, 32, M(_SC_LEVEL2_CACHE_SIZE), 262144 }, - { 0x43, 4, 32, M(_SC_LEVEL2_CACHE_SIZE), 524288 }, - { 0x44, 4, 32, M(_SC_LEVEL2_CACHE_SIZE), 1048576 }, - { 0x45, 4, 32, M(_SC_LEVEL2_CACHE_SIZE), 2097152 }, - { 0x46, 4, 64, M(_SC_LEVEL3_CACHE_SIZE), 4194304 }, - { 0x47, 8, 64, M(_SC_LEVEL3_CACHE_SIZE), 8388608 }, - { 0x48, 12, 64, M(_SC_LEVEL2_CACHE_SIZE), 3145728 }, - { 0x49, 16, 64, M(_SC_LEVEL2_CACHE_SIZE), 4194304 }, - { 0x4a, 12, 64, M(_SC_LEVEL3_CACHE_SIZE), 6291456 }, - { 0x4b, 16, 64, M(_SC_LEVEL3_CACHE_SIZE), 8388608 }, - { 0x4c, 12, 64, M(_SC_LEVEL3_CACHE_SIZE), 12582912 }, - { 0x4d, 16, 64, M(_SC_LEVEL3_CACHE_SIZE), 16777216 }, - { 0x4e, 24, 64, M(_SC_LEVEL2_CACHE_SIZE), 6291456 }, - { 0x60, 8, 64, M(_SC_LEVEL1_DCACHE_SIZE), 16384 }, - { 0x66, 4, 64, M(_SC_LEVEL1_DCACHE_SIZE), 8192 }, - { 0x67, 4, 64, M(_SC_LEVEL1_DCACHE_SIZE), 16384 }, - { 0x68, 4, 64, M(_SC_LEVEL1_DCACHE_SIZE), 32768 }, - { 0x78, 8, 64, M(_SC_LEVEL2_CACHE_SIZE), 1048576 }, - { 0x79, 8, 64, M(_SC_LEVEL2_CACHE_SIZE), 131072 }, - { 0x7a, 8, 64, M(_SC_LEVEL2_CACHE_SIZE), 262144 }, - { 0x7b, 8, 64, M(_SC_LEVEL2_CACHE_SIZE), 524288 }, - { 0x7c, 8, 64, M(_SC_LEVEL2_CACHE_SIZE), 1048576 }, - { 0x7d, 8, 64, M(_SC_LEVEL2_CACHE_SIZE), 2097152 }, - { 0x7f, 2, 64, M(_SC_LEVEL2_CACHE_SIZE), 524288 }, - { 0x80, 8, 64, M(_SC_LEVEL2_CACHE_SIZE), 524288 }, - { 0x82, 8, 32, M(_SC_LEVEL2_CACHE_SIZE), 262144 }, - { 0x83, 8, 32, M(_SC_LEVEL2_CACHE_SIZE), 524288 }, - { 0x84, 8, 32, M(_SC_LEVEL2_CACHE_SIZE), 1048576 }, - { 0x85, 8, 32, M(_SC_LEVEL2_CACHE_SIZE), 2097152 }, - { 0x86, 4, 64, M(_SC_LEVEL2_CACHE_SIZE), 524288 }, - { 0x87, 8, 64, M(_SC_LEVEL2_CACHE_SIZE), 1048576 }, - { 0xd0, 4, 64, M(_SC_LEVEL3_CACHE_SIZE), 524288 }, - { 0xd1, 4, 64, M(_SC_LEVEL3_CACHE_SIZE), 1048576 }, - { 0xd2, 4, 64, M(_SC_LEVEL3_CACHE_SIZE), 2097152 }, - { 0xd6, 8, 64, M(_SC_LEVEL3_CACHE_SIZE), 1048576 }, - { 0xd7, 8, 64, M(_SC_LEVEL3_CACHE_SIZE), 2097152 }, - { 0xd8, 8, 64, M(_SC_LEVEL3_CACHE_SIZE), 4194304 }, - { 0xdc, 12, 64, M(_SC_LEVEL3_CACHE_SIZE), 2097152 }, - { 0xdd, 12, 64, M(_SC_LEVEL3_CACHE_SIZE), 4194304 }, - { 0xde, 12, 64, M(_SC_LEVEL3_CACHE_SIZE), 8388608 }, - { 0xe2, 16, 64, M(_SC_LEVEL3_CACHE_SIZE), 2097152 }, - { 0xe3, 16, 64, M(_SC_LEVEL3_CACHE_SIZE), 4194304 }, - { 0xe4, 16, 64, M(_SC_LEVEL3_CACHE_SIZE), 8388608 }, - { 0xea, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 12582912 }, - { 0xeb, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 18874368 }, - { 0xec, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 25165824 }, - }; - -#define nintel_02_known (sizeof (intel_02_known) / sizeof (intel_02_known [0])) - -static int -intel_02_known_compare (const void *p1, const void *p2) -{ - const struct intel_02_cache_info *i1; - const struct intel_02_cache_info *i2; - - i1 = (const struct intel_02_cache_info *) p1; - i2 = (const struct intel_02_cache_info *) p2; - - if (i1->idx == i2->idx) - return 0; - - return i1->idx < i2->idx ? -1 : 1; -} - - -static long int -__attribute__ ((noinline)) -intel_check_word (int name, unsigned int value, bool *has_level_2, - bool *no_level_2_or_3) -{ - if ((value & 0x80000000) != 0) - /* The register value is reserved. */ - return 0; - - /* Fold the name. The _SC_ constants are always in the order SIZE, - ASSOC, LINESIZE. */ - int folded_rel_name = (M(name) / 3) * 3; - - while (value != 0) - { - unsigned int byte = value & 0xff; - - if (byte == 0x40) - { - *no_level_2_or_3 = true; - - if (folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE)) - /* No need to look further. */ - break; - } - else if (byte == 0xff) - { - /* CPUID leaf 0x4 contains all the information. We need to - iterate over it. */ - unsigned int eax; - unsigned int ebx; - unsigned int ecx; - unsigned int edx; - - unsigned int round = 0; - while (1) - { - __cpuid_count (4, round, eax, ebx, ecx, edx); - - enum { null = 0, data = 1, inst = 2, uni = 3 } type = eax & 0x1f; - if (type == null) - /* That was the end. */ - break; - - unsigned int level = (eax >> 5) & 0x7; - - if ((level == 1 && type == data - && folded_rel_name == M(_SC_LEVEL1_DCACHE_SIZE)) - || (level == 1 && type == inst - && folded_rel_name == M(_SC_LEVEL1_ICACHE_SIZE)) - || (level == 2 && folded_rel_name == M(_SC_LEVEL2_CACHE_SIZE)) - || (level == 3 && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE)) - || (level == 4 && folded_rel_name == M(_SC_LEVEL4_CACHE_SIZE))) - { - unsigned int offset = M(name) - folded_rel_name; - - if (offset == 0) - /* Cache size. */ - return (((ebx >> 22) + 1) - * (((ebx >> 12) & 0x3ff) + 1) - * ((ebx & 0xfff) + 1) - * (ecx + 1)); - if (offset == 1) - return (ebx >> 22) + 1; - - assert (offset == 2); - return (ebx & 0xfff) + 1; - } - - ++round; - } - /* There is no other cache information anywhere else. */ - break; - } - else - { - if (byte == 0x49 && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE)) - { - /* Intel reused this value. For family 15, model 6 it - specifies the 3rd level cache. Otherwise the 2nd - level cache. */ - unsigned int family = GLRO(dl_x86_cpu_features).family; - unsigned int model = GLRO(dl_x86_cpu_features).model; - - if (family == 15 && model == 6) - { - /* The level 3 cache is encoded for this model like - the level 2 cache is for other models. Pretend - the caller asked for the level 2 cache. */ - name = (_SC_LEVEL2_CACHE_SIZE - + (name - _SC_LEVEL3_CACHE_SIZE)); - folded_rel_name = M(_SC_LEVEL2_CACHE_SIZE); - } - } - - struct intel_02_cache_info *found; - struct intel_02_cache_info search; - - search.idx = byte; - found = bsearch (&search, intel_02_known, nintel_02_known, - sizeof (intel_02_known[0]), intel_02_known_compare); - if (found != NULL) - { - if (found->rel_name == folded_rel_name) - { - unsigned int offset = M(name) - folded_rel_name; - - if (offset == 0) - /* Cache size. */ - return found->size; - if (offset == 1) - return found->assoc; - - assert (offset == 2); - return found->linesize; - } - - if (found->rel_name == M(_SC_LEVEL2_CACHE_SIZE)) - *has_level_2 = true; - } - } - - /* Next byte for the next round. */ - value >>= 8; - } - - /* Nothing found. */ - return 0; -} - - -static long int __attribute__ ((noinline)) -handle_intel (int name, unsigned int maxidx) -{ - assert (maxidx >= 2); - - /* OK, we can use the CPUID instruction to get all info about the - caches. */ - unsigned int cnt = 0; - unsigned int max = 1; - long int result = 0; - bool no_level_2_or_3 = false; - bool has_level_2 = false; - - while (cnt++ < max) - { - unsigned int eax; - unsigned int ebx; - unsigned int ecx; - unsigned int edx; - __cpuid (2, eax, ebx, ecx, edx); - - /* The low byte of EAX in the first round contain the number of - rounds we have to make. At least one, the one we are already - doing. */ - if (cnt == 1) - { - max = eax & 0xff; - eax &= 0xffffff00; - } - - /* Process the individual registers' value. */ - result = intel_check_word (name, eax, &has_level_2, &no_level_2_or_3); - if (result != 0) - return result; - - result = intel_check_word (name, ebx, &has_level_2, &no_level_2_or_3); - if (result != 0) - return result; - - result = intel_check_word (name, ecx, &has_level_2, &no_level_2_or_3); - if (result != 0) - return result; - - result = intel_check_word (name, edx, &has_level_2, &no_level_2_or_3); - if (result != 0) - return result; - } - - if (name >= _SC_LEVEL2_CACHE_SIZE && name <= _SC_LEVEL3_CACHE_LINESIZE - && no_level_2_or_3) - return -1; - - return 0; -} - - -static long int __attribute__ ((noinline)) -handle_amd (int name) -{ - unsigned int eax; - unsigned int ebx; - unsigned int ecx; - unsigned int edx; - __cpuid (0x80000000, eax, ebx, ecx, edx); - - /* No level 4 cache (yet). */ - if (name > _SC_LEVEL3_CACHE_LINESIZE) - return 0; - - unsigned int fn = 0x80000005 + (name >= _SC_LEVEL2_CACHE_SIZE); - if (eax < fn) - return 0; - - __cpuid (fn, eax, ebx, ecx, edx); - - if (name < _SC_LEVEL1_DCACHE_SIZE) - { - name += _SC_LEVEL1_DCACHE_SIZE - _SC_LEVEL1_ICACHE_SIZE; - ecx = edx; - } - - switch (name) - { - case _SC_LEVEL1_DCACHE_SIZE: - return (ecx >> 14) & 0x3fc00; - - case _SC_LEVEL1_DCACHE_ASSOC: - ecx >>= 16; - if ((ecx & 0xff) == 0xff) - /* Fully associative. */ - return (ecx << 2) & 0x3fc00; - return ecx & 0xff; - - case _SC_LEVEL1_DCACHE_LINESIZE: - return ecx & 0xff; - - case _SC_LEVEL2_CACHE_SIZE: - return (ecx & 0xf000) == 0 ? 0 : (ecx >> 6) & 0x3fffc00; - - case _SC_LEVEL2_CACHE_ASSOC: - switch ((ecx >> 12) & 0xf) - { - case 0: - case 1: - case 2: - case 4: - return (ecx >> 12) & 0xf; - case 6: - return 8; - case 8: - return 16; - case 10: - return 32; - case 11: - return 48; - case 12: - return 64; - case 13: - return 96; - case 14: - return 128; - case 15: - return ((ecx >> 6) & 0x3fffc00) / (ecx & 0xff); - default: - return 0; - } - /* NOTREACHED */ - - case _SC_LEVEL2_CACHE_LINESIZE: - return (ecx & 0xf000) == 0 ? 0 : ecx & 0xff; - - case _SC_LEVEL3_CACHE_SIZE: - return (edx & 0xf000) == 0 ? 0 : (edx & 0x3ffc0000) << 1; - - case _SC_LEVEL3_CACHE_ASSOC: - switch ((edx >> 12) & 0xf) - { - case 0: - case 1: - case 2: - case 4: - return (edx >> 12) & 0xf; - case 6: - return 8; - case 8: - return 16; - case 10: - return 32; - case 11: - return 48; - case 12: - return 64; - case 13: - return 96; - case 14: - return 128; - case 15: - return ((edx & 0x3ffc0000) << 1) / (edx & 0xff); - default: - return 0; - } - /* NOTREACHED */ - - case _SC_LEVEL3_CACHE_LINESIZE: - return (edx & 0xf000) == 0 ? 0 : edx & 0xff; - - default: - assert (! "cannot happen"); - } - return -1; -} - - -/* Get the value of the system variable NAME. */ -long int -attribute_hidden -__cache_sysconf (int name) -{ - if (is_intel) - return handle_intel (name, max_cpuid); - - if (is_amd) - return handle_amd (name); - - // XXX Fill in more vendors. - - /* CPU not known, we have no information. */ - return 0; -} - - -/* Data cache size for use in memory and string routines, typically - L1 size, rounded to multiple of 256 bytes. */ -long int __x86_data_cache_size_half attribute_hidden = 32 * 1024 / 2; -long int __x86_data_cache_size attribute_hidden = 32 * 1024; -/* Similar to __x86_data_cache_size_half, but not rounded. */ -long int __x86_raw_data_cache_size_half attribute_hidden = 32 * 1024 / 2; -/* Similar to __x86_data_cache_size, but not rounded. */ -long int __x86_raw_data_cache_size attribute_hidden = 32 * 1024; -/* Shared cache size for use in memory and string routines, typically - L2 or L3 size, rounded to multiple of 256 bytes. */ -long int __x86_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2; -long int __x86_shared_cache_size attribute_hidden = 1024 * 1024; -/* Similar to __x86_shared_cache_size_half, but not rounded. */ -long int __x86_raw_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2; -/* Similar to __x86_shared_cache_size, but not rounded. */ -long int __x86_raw_shared_cache_size attribute_hidden = 1024 * 1024; - -#ifndef DISABLE_PREFETCHW -/* PREFETCHW support flag for use in memory and string routines. */ -int __x86_prefetchw attribute_hidden; -#endif - - -static void -__attribute__((constructor)) -init_cacheinfo (void) -{ - /* Find out what brand of processor. */ - unsigned int eax; - unsigned int ebx; - unsigned int ecx; - unsigned int edx; - int max_cpuid_ex; - long int data = -1; - long int shared = -1; - unsigned int level; - unsigned int threads = 0; - - if (is_intel) - { - data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, max_cpuid); - - /* Try L3 first. */ - level = 3; - shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, max_cpuid); - - if (shared <= 0) - { - /* Try L2 otherwise. */ - level = 2; - shared = handle_intel (_SC_LEVEL2_CACHE_SIZE, max_cpuid); - } - - /* Figure out the number of logical threads that share the - highest cache level. */ - if (max_cpuid >= 4) - { - unsigned int family = GLRO(dl_x86_cpu_features).family; - unsigned int model = GLRO(dl_x86_cpu_features).model; - - int i = 0; - - /* Query until desired cache level is enumerated. */ - do - { - __cpuid_count (4, i++, eax, ebx, ecx, edx); - - /* There seems to be a bug in at least some Pentium Ds - which sometimes fail to iterate all cache parameters. - Do not loop indefinitely here, stop in this case and - assume there is no such information. */ - if ((eax & 0x1f) == 0) - goto intel_bug_no_cache_info; - } - while (((eax >> 5) & 0x7) != level); - - threads = (eax >> 14) & 0x3ff; - - /* If max_cpuid >= 11, THREADS is the maximum number of - addressable IDs for logical processors sharing the - cache, instead of the maximum number of threads - sharing the cache. */ - if (threads && max_cpuid >= 11) - { - /* Find the number of logical processors shipped in - one core and apply count mask. */ - i = 0; - while (1) - { - __cpuid_count (11, i++, eax, ebx, ecx, edx); - - int shipped = ebx & 0xff; - int type = ecx & 0xff0; - if (shipped == 0 || type == 0) - break; - else if (type == 0x200) - { - int count_mask; - - /* Compute count mask. */ - asm ("bsr %1, %0" - : "=r" (count_mask) : "g" (threads)); - count_mask = ~(-1 << (count_mask + 1)); - threads = (shipped - 1) & count_mask; - break; - } - } - } - threads += 1; - if (threads > 2 && level == 2 && family == 6) - { - switch (model) - { - case 0x57: - /* Knights Landing has L2 cache shared by 2 cores. */ - case 0x37: - case 0x4a: - case 0x4d: - case 0x5a: - case 0x5d: - /* Silvermont has L2 cache shared by 2 cores. */ - threads = 2; - break; - default: - break; - } - } - } - else - { - intel_bug_no_cache_info: - /* Assume that all logical threads share the highest cache level. */ - - threads - = ((GLRO(dl_x86_cpu_features).cpuid[COMMON_CPUID_INDEX_1].ebx - >> 16) & 0xff); - } - - /* Cap usage of highest cache level to the number of supported - threads. */ - if (shared > 0 && threads > 0) - shared /= threads; - } - /* This spells out "AuthenticAMD". */ - else if (is_amd) - { - data = handle_amd (_SC_LEVEL1_DCACHE_SIZE); - long int core = handle_amd (_SC_LEVEL2_CACHE_SIZE); - shared = handle_amd (_SC_LEVEL3_CACHE_SIZE); - - /* Get maximum extended function. */ - __cpuid (0x80000000, max_cpuid_ex, ebx, ecx, edx); - - if (shared <= 0) - /* No shared L3 cache. All we have is the L2 cache. */ - shared = core; - else - { - /* Figure out the number of logical threads that share L3. */ - if (max_cpuid_ex >= 0x80000008) - { - /* Get width of APIC ID. */ - __cpuid (0x80000008, max_cpuid_ex, ebx, ecx, edx); - threads = 1 << ((ecx >> 12) & 0x0f); - } - - if (threads == 0) - { - /* If APIC ID width is not available, use logical - processor count. */ - __cpuid (0x00000001, max_cpuid_ex, ebx, ecx, edx); - - if ((edx & (1 << 28)) != 0) - threads = (ebx >> 16) & 0xff; - } - - /* Cap usage of highest cache level to the number of - supported threads. */ - if (threads > 0) - shared /= threads; - - /* Account for exclusive L2 and L3 caches. */ - shared += core; - } - -#ifndef DISABLE_PREFETCHW - if (max_cpuid_ex >= 0x80000001) - { - __cpuid (0x80000001, eax, ebx, ecx, edx); - /* PREFETCHW || 3DNow! */ - if ((ecx & 0x100) || (edx & 0x80000000)) - __x86_prefetchw = -1; - } -#endif - } - - if (data > 0) - { - __x86_raw_data_cache_size_half = data / 2; - __x86_raw_data_cache_size = data; - /* Round data cache size to multiple of 256 bytes. */ - data = data & ~255L; - __x86_data_cache_size_half = data / 2; - __x86_data_cache_size = data; - } - - if (shared > 0) - { - __x86_raw_shared_cache_size_half = shared / 2; - __x86_raw_shared_cache_size = shared; - /* Round shared cache size to multiple of 256 bytes. */ - shared = shared & ~255L; - __x86_shared_cache_size_half = shared / 2; - __x86_shared_cache_size = shared; - } -} diff --git a/sysdeps/x86_64/configure b/sysdeps/x86_64/configure index c72b9d3184..8674d14569 100644 --- a/sysdeps/x86_64/configure +++ b/sysdeps/x86_64/configure @@ -1,13 +1,12 @@ # This file is generated from configure.ac by Autoconf. DO NOT EDIT! # Local configure fragment for sysdeps/x86_64. -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for AVX512 support in assembler" >&5 -$as_echo_n "checking for AVX512 support in assembler... " >&6; } -if ${libc_cv_asm_avx512+:} false; then : +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for AVX512DQ support in assembler" >&5 +$as_echo_n "checking for AVX512DQ support in assembler... " >&6; } +if ${libc_cv_asm_avx512dq+:} false; then : $as_echo_n "(cached) " >&6 else cat > conftest.s <<\EOF - vmovdqu64 %zmm0, (%rsp) vandpd (%rax), %zmm6, %zmm1 EOF if { ac_try='${CC-cc} -c $ASFLAGS conftest.s 1>&5' @@ -16,16 +15,16 @@ if { ac_try='${CC-cc} -c $ASFLAGS conftest.s 1>&5' ac_status=$? $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 test $ac_status = 0; }; }; then - libc_cv_asm_avx512=yes + libc_cv_asm_avx512dq=yes else - libc_cv_asm_avx512=no + libc_cv_asm_avx512dq=no fi rm -f conftest* fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_asm_avx512" >&5 -$as_echo "$libc_cv_asm_avx512" >&6; } -if test $libc_cv_asm_avx512 == yes; then - $as_echo "#define HAVE_AVX512_ASM_SUPPORT 1" >>confdefs.h +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_asm_avx512dq" >&5 +$as_echo "$libc_cv_asm_avx512dq" >&6; } +if test $libc_cv_asm_avx512dq = yes; then + $as_echo "#define HAVE_AVX512DQ_ASM_SUPPORT 1" >>confdefs.h fi @@ -40,7 +39,7 @@ else ac_status=$? $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 test $ac_status = 0; }; }; then : - libc_cv_cc_avx512=$libc_cv_asm_avx512 + libc_cv_cc_avx512=$libc_cv_asm_avx512dq else libc_cv_cc_avx512=no fi @@ -77,7 +76,7 @@ rm -f conftest* fi { $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_asm_mpx" >&5 $as_echo "$libc_cv_asm_mpx" >&6; } -if test $libc_cv_asm_mpx == yes; then +if test $libc_cv_asm_mpx = yes; then $as_echo "#define HAVE_MPX_SUPPORT 1" >>confdefs.h fi @@ -86,6 +85,41 @@ if test x"$build_mathvec" = xnotset; then build_mathvec=yes fi +if test "$static_pie" = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for linker static PIE support" >&5 +$as_echo_n "checking for linker static PIE support... " >&6; } +if ${libc_cv_ld_static_pie+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat > conftest.s <<\EOF + .text + .global _start + .weak foo +_start: + leaq foo(%rip), %rax +EOF + libc_cv_pie_option="-Wl,-pie" + if { ac_try='${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS -nostartfiles -nostdlib $no_ssp $libc_cv_pie_option -o conftest conftest.s 1>&5' + { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5 + (eval $ac_try) 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; }; then + libc_cv_ld_static_pie=yes + else + libc_cv_ld_static_pie=no + fi +rm -f conftest* +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_ld_static_pie" >&5 +$as_echo "$libc_cv_ld_static_pie" >&6; } + if test "$libc_cv_ld_static_pie" != yes; then + as_fn_error $? "linker support for static PIE needed" "$LINENO" 5 + fi +fi + $as_echo "#define PI_STATIC_AND_HIDDEN 1" >>confdefs.h -# work around problem with autoconf and empty lines at the end of files + +test -n "$critic_missing" && as_fn_error $? " +*** $critic_missing" "$LINENO" 5 diff --git a/sysdeps/x86_64/configure.ac b/sysdeps/x86_64/configure.ac index 37b1059af3..b7d2c0124f 100644 --- a/sysdeps/x86_64/configure.ac +++ b/sysdeps/x86_64/configure.ac @@ -1,25 +1,24 @@ GLIBC_PROVIDES dnl See aclocal.m4 in the top level source directory. # Local configure fragment for sysdeps/x86_64. -dnl Check if asm supports AVX512. -AC_CACHE_CHECK(for AVX512 support in assembler, libc_cv_asm_avx512, [dnl +dnl Check if asm supports AVX512DQ. +AC_CACHE_CHECK(for AVX512DQ support in assembler, libc_cv_asm_avx512dq, [dnl cat > conftest.s <<\EOF - vmovdqu64 %zmm0, (%rsp) vandpd (%rax), %zmm6, %zmm1 EOF if AC_TRY_COMMAND(${CC-cc} -c $ASFLAGS conftest.s 1>&AS_MESSAGE_LOG_FD); then - libc_cv_asm_avx512=yes + libc_cv_asm_avx512dq=yes else - libc_cv_asm_avx512=no + libc_cv_asm_avx512dq=no fi rm -f conftest*]) -if test $libc_cv_asm_avx512 == yes; then - AC_DEFINE(HAVE_AVX512_ASM_SUPPORT) +if test $libc_cv_asm_avx512dq = yes; then + AC_DEFINE(HAVE_AVX512DQ_ASM_SUPPORT) fi dnl Check if -mavx512f works. AC_CACHE_CHECK(for AVX512 support, libc_cv_cc_avx512, [dnl -LIBC_TRY_CC_OPTION([-mavx512f], [libc_cv_cc_avx512=$libc_cv_asm_avx512], [libc_cv_cc_avx512=no]) +LIBC_TRY_CC_OPTION([-mavx512f], [libc_cv_cc_avx512=$libc_cv_asm_avx512dq], [libc_cv_cc_avx512=no]) ]) if test $libc_cv_cc_avx512 = yes; then AC_DEFINE(HAVE_AVX512_SUPPORT) @@ -37,7 +36,7 @@ else libc_cv_asm_mpx=no fi rm -f conftest*]) -if test $libc_cv_asm_mpx == yes; then +if test $libc_cv_asm_mpx = yes; then AC_DEFINE(HAVE_MPX_SUPPORT) fi @@ -45,7 +44,34 @@ if test x"$build_mathvec" = xnotset; then build_mathvec=yes fi +dnl Check if linker supports static PIE with the fix for +dnl +dnl https://sourceware.org/bugzilla/show_bug.cgi?id=21782 +dnl +if test "$static_pie" = yes; then + AC_CACHE_CHECK(for linker static PIE support, libc_cv_ld_static_pie, [dnl +cat > conftest.s <<\EOF + .text + .global _start + .weak foo +_start: + leaq foo(%rip), %rax +EOF + libc_cv_pie_option="-Wl,-pie" + if AC_TRY_COMMAND(${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS -nostartfiles -nostdlib $no_ssp $libc_cv_pie_option -o conftest conftest.s 1>&AS_MESSAGE_LOG_FD); then + libc_cv_ld_static_pie=yes + else + libc_cv_ld_static_pie=no + fi +rm -f conftest*]) + if test "$libc_cv_ld_static_pie" != yes; then + AC_MSG_ERROR([linker support for static PIE needed]) + fi +fi + dnl It is always possible to access static and hidden symbols in an dnl position independent way. AC_DEFINE(PI_STATIC_AND_HIDDEN) -# work around problem with autoconf and empty lines at the end of files + +test -n "$critic_missing" && AC_MSG_ERROR([ +*** $critic_missing]) diff --git a/sysdeps/x86_64/crti.S b/sysdeps/x86_64/crti.S index a34525974a..067ac14884 100644 --- a/sysdeps/x86_64/crti.S +++ b/sysdeps/x86_64/crti.S @@ -1,5 +1,5 @@ /* Special .init and .fini section support for x86-64. - Copyright (C) 2012-2016 Free Software Foundation, Inc. + Copyright (C) 2012-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -58,15 +58,17 @@ .section .init,"ax",@progbits .p2align 2 .globl _init + .hidden _init .type _init, @function _init: + _CET_ENDBR /* Maintain 16-byte stack alignment for called functions. */ subq $8, %rsp #if PREINIT_FUNCTION_WEAK movq PREINIT_FUNCTION@GOTPCREL(%rip), %rax testq %rax, %rax je .Lno_weak_fn - call PREINIT_FUNCTION@PLT + call *%rax .Lno_weak_fn: #else call PREINIT_FUNCTION @@ -75,6 +77,8 @@ _init: .section .fini,"ax",@progbits .p2align 2 .globl _fini + .hidden _fini .type _fini, @function _fini: + _CET_ENDBR subq $8, %rsp diff --git a/sysdeps/x86_64/crtn.S b/sysdeps/x86_64/crtn.S index b2fa0c6765..2463d742fd 100644 --- a/sysdeps/x86_64/crtn.S +++ b/sysdeps/x86_64/crtn.S @@ -1,5 +1,5 @@ /* Special .init and .fini section support for x86-64. - Copyright (C) 2012-2016 Free Software Foundation, Inc. + Copyright (C) 2012-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/dl-irel.h b/sysdeps/x86_64/dl-irel.h index 80d7d1dd78..6ecc50fb42 100644 --- a/sysdeps/x86_64/dl-irel.h +++ b/sysdeps/x86_64/dl-irel.h @@ -1,6 +1,6 @@ /* Machine-dependent ELF indirect relocation inline functions. x86-64 version. - Copyright (C) 2009-2016 Free Software Foundation, Inc. + Copyright (C) 2009-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/dl-lookupcfg.h b/sysdeps/x86_64/dl-lookupcfg.h index 033b475889..5399cf25ab 100644 --- a/sysdeps/x86_64/dl-lookupcfg.h +++ b/sysdeps/x86_64/dl-lookupcfg.h @@ -1,5 +1,5 @@ /* Configuration of lookup functions. - Copyright (C) 2005-2016 Free Software Foundation, Inc. + Copyright (C) 2005-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -26,7 +26,6 @@ struct link_map; -extern void _dl_unmap (struct link_map *map) - internal_function attribute_hidden; +extern void _dl_unmap (struct link_map *map) attribute_hidden; #define DL_UNMAP(map) _dl_unmap (map) diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h index 980ca73cf2..1942ed5061 100644 --- a/sysdeps/x86_64/dl-machine.h +++ b/sysdeps/x86_64/dl-machine.h @@ -1,5 +1,5 @@ /* Machine-dependent ELF dynamic relocation inline functions. x86-64 version. - Copyright (C) 2001-2016 Free Software Foundation, Inc. + Copyright (C) 2001-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Andreas Jaeger <aj@suse.de>. @@ -66,9 +66,9 @@ static inline int __attribute__ ((unused, always_inline)) elf_machine_runtime_setup (struct link_map *l, int lazy, int profile) { Elf64_Addr *got; - extern void _dl_runtime_resolve_sse (ElfW(Word)) attribute_hidden; - extern void _dl_runtime_resolve_avx (ElfW(Word)) attribute_hidden; - extern void _dl_runtime_resolve_avx512 (ElfW(Word)) attribute_hidden; + extern void _dl_runtime_resolve_fxsave (ElfW(Word)) attribute_hidden; + extern void _dl_runtime_resolve_xsave (ElfW(Word)) attribute_hidden; + extern void _dl_runtime_resolve_xsavec (ElfW(Word)) attribute_hidden; extern void _dl_runtime_profile_sse (ElfW(Word)) attribute_hidden; extern void _dl_runtime_profile_avx (ElfW(Word)) attribute_hidden; extern void _dl_runtime_profile_avx512 (ElfW(Word)) attribute_hidden; @@ -117,12 +117,14 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile) /* This function will get called to fix up the GOT entry indicated by the offset on the stack, and then jump to the resolved address. */ - if (HAS_ARCH_FEATURE (AVX512F_Usable)) - *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve_avx512; - else if (HAS_ARCH_FEATURE (AVX_Usable)) - *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve_avx; + if (GLRO(dl_x86_cpu_features).xsave_state_size != 0) + *(ElfW(Addr) *) (got + 2) + = (HAS_ARCH_FEATURE (XSAVEC_Usable) + ? (ElfW(Addr)) &_dl_runtime_resolve_xsavec + : (ElfW(Addr)) &_dl_runtime_resolve_xsave); else - *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve_sse; + *(ElfW(Addr) *) (got + 2) + = (ElfW(Addr)) &_dl_runtime_resolve_fxsave; } } @@ -220,15 +222,20 @@ _dl_start_user:\n\ static inline void __attribute__ ((unused)) dl_platform_init (void) { +#if IS_IN (rtld) + /* init_cpu_features has been called early from __libc_start_main in + static executable. */ + init_cpu_features (&GLRO(dl_x86_cpu_features)); +#else if (GLRO(dl_platform) != NULL && *GLRO(dl_platform) == '\0') /* Avoid an empty string which would disturb us. */ GLRO(dl_platform) = NULL; - - init_cpu_features (&GLRO(dl_x86_cpu_features)); +#endif } static inline ElfW(Addr) elf_machine_fixup_plt (struct link_map *map, lookup_t t, + const ElfW(Sym) *refsym, const ElfW(Sym) *sym, const ElfW(Rela) *reloc, ElfW(Addr) *reloc_addr, ElfW(Addr) value) { @@ -299,15 +306,29 @@ elf_machine_rela (struct link_map *map, const ElfW(Rela) *reloc, const ElfW(Sym) *const refsym = sym; # endif struct link_map *sym_map = RESOLVE_MAP (&sym, version, r_type); - ElfW(Addr) value = (sym == NULL ? 0 - : (ElfW(Addr)) sym_map->l_addr + sym->st_value); + ElfW(Addr) value = SYMBOL_ADDRESS (sym_map, sym, true); if (sym != NULL - && __builtin_expect (ELFW(ST_TYPE) (sym->st_info) == STT_GNU_IFUNC, - 0) - && __builtin_expect (sym->st_shndx != SHN_UNDEF, 1) - && __builtin_expect (!skip_ifunc, 1)) - value = ((ElfW(Addr) (*) (void)) value) (); + && __glibc_unlikely (ELFW(ST_TYPE) (sym->st_info) == STT_GNU_IFUNC) + && __glibc_likely (sym->st_shndx != SHN_UNDEF) + && __glibc_likely (!skip_ifunc)) + { +# ifndef RTLD_BOOTSTRAP + if (sym_map != map + && sym_map->l_type != lt_executable + && !sym_map->l_relocated) + { + const char *strtab + = (const char *) D_PTR (map, l_info[DT_STRTAB]); + _dl_error_printf ("\ +%s: Relink `%s' with `%s' for IFUNC symbol `%s'\n", + RTLD_PROGNAME, map->l_name, + sym_map->l_name, + strtab + refsym->st_name); + } +# endif + value = ((ElfW(Addr) (*) (void)) value) (); + } switch (r_type) { @@ -477,8 +498,8 @@ elf_machine_rela (struct link_map *map, const ElfW(Rela) *reloc, break; memcpy (reloc_addr_arg, (void *) value, MIN (sym->st_size, refsym->st_size)); - if (__builtin_expect (sym->st_size > refsym->st_size, 0) - || (__builtin_expect (sym->st_size < refsym->st_size, 0) + if (__glibc_unlikely (sym->st_size > refsym->st_size) + || (__glibc_unlikely (sym->st_size < refsym->st_size) && GLRO(dl_verbose))) { fmt = "\ @@ -531,7 +552,8 @@ elf_machine_lazy_rel (struct link_map *map, /* Check for unexpected PLT reloc type. */ if (__glibc_likely (r_type == R_X86_64_JUMP_SLOT)) { - if (__builtin_expect (map->l_mach.plt, 0) == 0) + /* Prelink has been deprecated. */ + if (__glibc_likely (map->l_mach.plt == 0)) *reloc_addr += l_addr; else *reloc_addr = diff --git a/sysdeps/x86_64/dl-procinfo.c b/sysdeps/x86_64/dl-procinfo.c index 4625695dfb..269ce7e87d 100644 --- a/sysdeps/x86_64/dl-procinfo.c +++ b/sysdeps/x86_64/dl-procinfo.c @@ -1,5 +1,5 @@ /* Data for x86-64 version of processor capability information. - Copyright (C) 2015-2016 Free Software Foundation, Inc. + Copyright (C) 2015-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -39,19 +39,7 @@ # define PROCINFO_CLASS #endif -#if !defined PROCINFO_DECL && defined SHARED - ._dl_x86_cpu_features -#else -PROCINFO_CLASS struct cpu_features _dl_x86_cpu_features -#endif -#ifndef PROCINFO_DECL -= { } -#endif -#if !defined SHARED || defined PROCINFO_DECL -; -#else -, -#endif +#include <sysdeps/x86/dl-procinfo.c> #undef PROCINFO_DECL #undef PROCINFO_CLASS diff --git a/sysdeps/x86_64/dl-tls.c b/sysdeps/x86_64/dl-tls.c new file mode 100644 index 0000000000..533ee2b3a6 --- /dev/null +++ b/sysdeps/x86_64/dl-tls.c @@ -0,0 +1,53 @@ +/* Thread-local storage handling in the ELF dynamic linker. x86-64 version. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifdef SHARED +/* Work around GCC PR58066, due to which __tls_get_addr may be called + with an unaligned stack. The compat implementation is in + tls_get_addr-compat.S. */ + +# include <dl-tls.h> + +/* Define __tls_get_addr within elf/dl-tls.c under a different + name. */ +extern __typeof__ (__tls_get_addr) ___tls_get_addr; + +# define __tls_get_addr ___tls_get_addr +# include <elf/dl-tls.c> +# undef __tls_get_addr + +hidden_ver (___tls_get_addr, __tls_get_addr) + +/* Only handle slow paths for __tls_get_addr. */ +attribute_hidden +void * +__tls_get_addr_slow (GET_ADDR_ARGS) +{ + dtv_t *dtv = THREAD_DTV (); + + if (__glibc_unlikely (dtv[0].counter != GL(dl_tls_generation))) + return update_get_addr (GET_ADDR_PARAM); + + return tls_get_addr_tail (GET_ADDR_PARAM, dtv, NULL); +} +#else + +/* No compatibility symbol needed. */ +# include <elf/dl-tls.c> + +#endif diff --git a/sysdeps/x86_64/dl-tls.h b/sysdeps/x86_64/dl-tls.h index 0f101e6ac6..bc18e70b23 100644 --- a/sysdeps/x86_64/dl-tls.h +++ b/sysdeps/x86_64/dl-tls.h @@ -1,5 +1,5 @@ /* Thread-local storage handling in the ELF dynamic linker. x86-64 version. - Copyright (C) 2002-2016 Free Software Foundation, Inc. + Copyright (C) 2002-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -16,6 +16,9 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ +#ifndef _X86_64_DL_TLS_H +#define _X86_64_DL_TLS_H + #include <stdint.h> /* Type used for the representation of TLS information in the GOT. */ @@ -28,5 +31,4 @@ typedef struct dl_tls_index extern void *__tls_get_addr (tls_index *ti); -/* Value used for dtv entries for which the allocation is delayed. */ -#define TLS_DTV_UNALLOCATED ((void *) -1l) +#endif /* _X86_64_DL_TLS_H */ diff --git a/sysdeps/x86_64/dl-tlsdesc.S b/sysdeps/x86_64/dl-tlsdesc.S index 3cb7c3d031..80d771cd88 100644 --- a/sysdeps/x86_64/dl-tlsdesc.S +++ b/sysdeps/x86_64/dl-tlsdesc.S @@ -1,5 +1,5 @@ /* Thread-local storage handling in the ELF dynamic linker. x86_64 version. - Copyright (C) 2004-2016 Free Software Foundation, Inc. + Copyright (C) 2004-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -37,6 +37,7 @@ cfi_startproc .align 16 _dl_tlsdesc_return: + _CET_ENDBR movq 8(%rax), %rax ret cfi_endproc @@ -58,6 +59,7 @@ _dl_tlsdesc_return: cfi_startproc .align 16 _dl_tlsdesc_undefweak: + _CET_ENDBR movq 8(%rax), %rax subq %fs:0, %rax ret @@ -96,6 +98,7 @@ _dl_tlsdesc_dynamic (register struct tlsdesc *tdp asm ("%rax")) cfi_startproc .align 16 _dl_tlsdesc_dynamic: + _CET_ENDBR /* Preserve call-clobbered registers that we modify. We need two scratch regs anyway. */ movq %rsi, -16(%rsp) @@ -128,7 +131,11 @@ _dl_tlsdesc_dynamic: movq %r10, 40(%rsp) movq %r11, 48(%rsp) /* %rdi already points to the tlsinfo data structure. */ +#ifdef NO_RTLD_HIDDEN + call JUMPTARGET (__tls_get_addr) +#else call HIDDEN_JUMPTARGET (__tls_get_addr) +#endif movq 8(%rsp), %rdx movq 16(%rsp), %rcx movq 24(%rsp), %r8 @@ -162,15 +169,17 @@ _dl_tlsdesc_dynamic: .align 16 /* The PLT entry will have pushed the link_map pointer. */ _dl_tlsdesc_resolve_rela: + _CET_ENDBR cfi_adjust_cfa_offset (8) - /* Save all call-clobbered registers. */ - subq $72, %rsp - cfi_adjust_cfa_offset (72) + /* Save all call-clobbered registers. Add 8 bytes for push in + the PLT entry to align the stack. */ + subq $80, %rsp + cfi_adjust_cfa_offset (80) movq %rax, (%rsp) movq %rdi, 8(%rsp) movq %rax, %rdi /* Pass tlsdesc* in %rdi. */ movq %rsi, 16(%rsp) - movq 72(%rsp), %rsi /* Pass link_map* in %rsi. */ + movq 80(%rsp), %rsi /* Pass link_map* in %rsi. */ movq %r8, 24(%rsp) movq %r9, 32(%rsp) movq %r10, 40(%rsp) @@ -187,8 +196,8 @@ _dl_tlsdesc_resolve_rela: movq 48(%rsp), %r11 movq 56(%rsp), %rdx movq 64(%rsp), %rcx - addq $80, %rsp - cfi_adjust_cfa_offset (-80) + addq $88, %rsp + cfi_adjust_cfa_offset (-88) jmp *(%rax) cfi_endproc .size _dl_tlsdesc_resolve_rela, .-_dl_tlsdesc_resolve_rela @@ -211,6 +220,7 @@ _dl_tlsdesc_resolve_rela: .align 16 _dl_tlsdesc_resolve_hold: 0: + _CET_ENDBR /* Save all call-clobbered registers. */ subq $72, %rsp cfi_adjust_cfa_offset (72) diff --git a/sysdeps/x86_64/dl-tlsdesc.h b/sysdeps/x86_64/dl-tlsdesc.h index 11e1a50b8f..66e659bb5c 100644 --- a/sysdeps/x86_64/dl-tlsdesc.h +++ b/sysdeps/x86_64/dl-tlsdesc.h @@ -1,6 +1,6 @@ /* Thread-local storage descriptor handling in the ELF dynamic linker. x86_64 version. - Copyright (C) 2005-2016 Free Software Foundation, Inc. + Copyright (C) 2005-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -62,7 +62,7 @@ extern ptrdiff_t attribute_hidden # ifdef SHARED extern void *_dl_make_tlsdesc_dynamic (struct link_map *map, size_t ti_offset) - internal_function attribute_hidden; + attribute_hidden; extern ptrdiff_t attribute_hidden _dl_tlsdesc_dynamic(struct tlsdesc *); # endif diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S index 9fb6b13983..ef1425cbb9 100644 --- a/sysdeps/x86_64/dl-trampoline.S +++ b/sysdeps/x86_64/dl-trampoline.S @@ -1,5 +1,5 @@ /* PLT trampolines. x86-64 version. - Copyright (C) 2004-2016 Free Software Foundation, Inc. + Copyright (C) 2004-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -18,6 +18,7 @@ #include <config.h> #include <sysdep.h> +#include <cpu-features.h> #include <link-defines.h> #ifndef DL_STACK_ALIGNMENT @@ -33,37 +34,24 @@ # define DL_STACK_ALIGNMENT 8 #endif -#ifndef DL_RUNIME_UNALIGNED_VEC_SIZE -/* The maximum size of unaligned vector load and store. */ -# define DL_RUNIME_UNALIGNED_VEC_SIZE 16 -#endif - -/* True if _dl_runtime_resolve should align stack to VEC_SIZE bytes. */ -#define DL_RUNIME_RESOLVE_REALIGN_STACK \ - (VEC_SIZE > DL_STACK_ALIGNMENT \ - && VEC_SIZE > DL_RUNIME_UNALIGNED_VEC_SIZE) - -/* Align vector register save area to 16 bytes. */ -#define REGISTER_SAVE_VEC_OFF 0 +/* True if _dl_runtime_resolve should align stack for STATE_SAVE or align + stack to 16 bytes before calling _dl_fixup. */ +#define DL_RUNTIME_RESOLVE_REALIGN_STACK \ + (STATE_SAVE_ALIGNMENT > DL_STACK_ALIGNMENT \ + || 16 > DL_STACK_ALIGNMENT) /* Area on stack to save and restore registers used for parameter passing when calling _dl_fixup. */ #ifdef __ILP32__ -# define REGISTER_SAVE_RAX (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 8) # define PRESERVE_BND_REGS_PREFIX #else -/* Align bound register save area to 16 bytes. */ -# define REGISTER_SAVE_BND0 (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 8) -# define REGISTER_SAVE_BND1 (REGISTER_SAVE_BND0 + 16) -# define REGISTER_SAVE_BND2 (REGISTER_SAVE_BND1 + 16) -# define REGISTER_SAVE_BND3 (REGISTER_SAVE_BND2 + 16) -# define REGISTER_SAVE_RAX (REGISTER_SAVE_BND3 + 16) # ifdef HAVE_MPX_SUPPORT # define PRESERVE_BND_REGS_PREFIX bnd # else # define PRESERVE_BND_REGS_PREFIX .byte 0xf2 # endif #endif +#define REGISTER_SAVE_RAX 0 #define REGISTER_SAVE_RCX (REGISTER_SAVE_RAX + 8) #define REGISTER_SAVE_RDX (REGISTER_SAVE_RCX + 8) #define REGISTER_SAVE_RSI (REGISTER_SAVE_RDX + 8) @@ -73,59 +61,58 @@ #define RESTORE_AVX -#ifdef HAVE_AVX512_ASM_SUPPORT -# define VEC_SIZE 64 -# define VMOVA vmovdqa64 -# if DL_RUNIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT -# define VMOV vmovdqa64 -# else -# define VMOV vmovdqu64 -# endif -# define VEC(i) zmm##i -# define _dl_runtime_resolve _dl_runtime_resolve_avx512 -# define _dl_runtime_profile _dl_runtime_profile_avx512 -# include "dl-trampoline.h" -# undef _dl_runtime_resolve -# undef _dl_runtime_profile -# undef VEC -# undef VMOV -# undef VMOVA -# undef VEC_SIZE -#else -strong_alias (_dl_runtime_resolve_avx, _dl_runtime_resolve_avx512) - .hidden _dl_runtime_resolve_avx512 -strong_alias (_dl_runtime_profile_avx, _dl_runtime_profile_avx512) - .hidden _dl_runtime_profile_avx512 -#endif +#define VEC_SIZE 64 +#define VMOVA vmovdqa64 +#define VEC(i) zmm##i +#define _dl_runtime_profile _dl_runtime_profile_avx512 +#include "dl-trampoline.h" +#undef _dl_runtime_profile +#undef VEC +#undef VMOVA +#undef VEC_SIZE #define VEC_SIZE 32 #define VMOVA vmovdqa -#if DL_RUNIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT -# define VMOV vmovdqa -#else -# define VMOV vmovdqu -#endif #define VEC(i) ymm##i -#define _dl_runtime_resolve _dl_runtime_resolve_avx #define _dl_runtime_profile _dl_runtime_profile_avx #include "dl-trampoline.h" -#undef _dl_runtime_resolve #undef _dl_runtime_profile #undef VEC -#undef VMOV #undef VMOVA #undef VEC_SIZE /* movaps/movups is 1-byte shorter. */ #define VEC_SIZE 16 #define VMOVA movaps -#if DL_RUNIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT -# define VMOV movaps -#else -# define VMOV movups -#endif #define VEC(i) xmm##i -#define _dl_runtime_resolve _dl_runtime_resolve_sse #define _dl_runtime_profile _dl_runtime_profile_sse #undef RESTORE_AVX #include "dl-trampoline.h" +#undef _dl_runtime_profile +#undef VEC +#undef VMOVA +#undef VEC_SIZE + +#define USE_FXSAVE +#define STATE_SAVE_ALIGNMENT 16 +#define _dl_runtime_resolve _dl_runtime_resolve_fxsave +#include "dl-trampoline.h" +#undef _dl_runtime_resolve +#undef USE_FXSAVE +#undef STATE_SAVE_ALIGNMENT + +#define USE_XSAVE +#define STATE_SAVE_ALIGNMENT 64 +#define _dl_runtime_resolve _dl_runtime_resolve_xsave +#include "dl-trampoline.h" +#undef _dl_runtime_resolve +#undef USE_XSAVE +#undef STATE_SAVE_ALIGNMENT + +#define USE_XSAVEC +#define STATE_SAVE_ALIGNMENT 64 +#define _dl_runtime_resolve _dl_runtime_resolve_xsavec +#include "dl-trampoline.h" +#undef _dl_runtime_resolve +#undef USE_XSAVEC +#undef STATE_SAVE_ALIGNMENT diff --git a/sysdeps/x86_64/dl-trampoline.h b/sysdeps/x86_64/dl-trampoline.h index f4191833ab..a28b1e73a4 100644 --- a/sysdeps/x86_64/dl-trampoline.h +++ b/sysdeps/x86_64/dl-trampoline.h @@ -1,5 +1,5 @@ /* PLT trampolines. x86-64 version. - Copyright (C) 2009-2016 Free Software Foundation, Inc. + Copyright (C) 2009-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -16,40 +16,47 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#undef REGISTER_SAVE_AREA_RAW -#ifdef __ILP32__ -/* X32 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as VEC0 to - VEC7. */ -# define REGISTER_SAVE_AREA_RAW (8 * 7 + VEC_SIZE * 8) -#else -/* X86-64 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as - BND0, BND1, BND2, BND3 and VEC0 to VEC7. */ -# define REGISTER_SAVE_AREA_RAW (8 * 7 + 16 * 4 + VEC_SIZE * 8) -#endif + .text +#ifdef _dl_runtime_resolve -#undef REGISTER_SAVE_AREA -#undef LOCAL_STORAGE_AREA -#undef BASE -#if DL_RUNIME_RESOLVE_REALIGN_STACK -# define REGISTER_SAVE_AREA (REGISTER_SAVE_AREA_RAW + 8) -/* Local stack area before jumping to function address: RBX. */ -# define LOCAL_STORAGE_AREA 8 -# define BASE rbx -# if (REGISTER_SAVE_AREA % VEC_SIZE) != 0 -# error REGISTER_SAVE_AREA must be multples of VEC_SIZE +# undef REGISTER_SAVE_AREA +# undef LOCAL_STORAGE_AREA +# undef BASE + +# if (STATE_SAVE_ALIGNMENT % 16) != 0 +# error STATE_SAVE_ALIGNMENT must be multples of 16 +# endif + +# if (STATE_SAVE_OFFSET % STATE_SAVE_ALIGNMENT) != 0 +# error STATE_SAVE_OFFSET must be multples of STATE_SAVE_ALIGNMENT # endif -#else -# define REGISTER_SAVE_AREA REGISTER_SAVE_AREA_RAW + +# if DL_RUNTIME_RESOLVE_REALIGN_STACK +/* Local stack area before jumping to function address: RBX. */ +# define LOCAL_STORAGE_AREA 8 +# define BASE rbx +# ifdef USE_FXSAVE +/* Use fxsave to save XMM registers. */ +# define REGISTER_SAVE_AREA (512 + STATE_SAVE_OFFSET) +# if (REGISTER_SAVE_AREA % 16) != 0 +# error REGISTER_SAVE_AREA must be multples of 16 +# endif +# endif +# else +# ifndef USE_FXSAVE +# error USE_FXSAVE must be defined +# endif +/* Use fxsave to save XMM registers. */ +# define REGISTER_SAVE_AREA (512 + STATE_SAVE_OFFSET + 8) /* Local stack area before jumping to function address: All saved registers. */ -# define LOCAL_STORAGE_AREA REGISTER_SAVE_AREA -# define BASE rsp -# if (REGISTER_SAVE_AREA % 16) != 8 -# error REGISTER_SAVE_AREA must be odd multples of 8 +# define LOCAL_STORAGE_AREA REGISTER_SAVE_AREA +# define BASE rsp +# if (REGISTER_SAVE_AREA % 16) != 8 +# error REGISTER_SAVE_AREA must be odd multples of 8 +# endif # endif -#endif - .text .globl _dl_runtime_resolve .hidden _dl_runtime_resolve .type _dl_runtime_resolve, @function @@ -57,19 +64,31 @@ cfi_startproc _dl_runtime_resolve: cfi_adjust_cfa_offset(16) # Incorporate PLT -#if DL_RUNIME_RESOLVE_REALIGN_STACK -# if LOCAL_STORAGE_AREA != 8 -# error LOCAL_STORAGE_AREA must be 8 -# endif + _CET_ENDBR +# if DL_RUNTIME_RESOLVE_REALIGN_STACK +# if LOCAL_STORAGE_AREA != 8 +# error LOCAL_STORAGE_AREA must be 8 +# endif pushq %rbx # push subtracts stack by 8. cfi_adjust_cfa_offset(8) cfi_rel_offset(%rbx, 0) mov %RSP_LP, %RBX_LP cfi_def_cfa_register(%rbx) - and $-VEC_SIZE, %RSP_LP -#endif + and $-STATE_SAVE_ALIGNMENT, %RSP_LP +# endif +# ifdef REGISTER_SAVE_AREA sub $REGISTER_SAVE_AREA, %RSP_LP +# if !DL_RUNTIME_RESOLVE_REALIGN_STACK cfi_adjust_cfa_offset(REGISTER_SAVE_AREA) +# endif +# else + # Allocate stack space of the required size to save the state. +# if IS_IN (rtld) + sub _rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET(%rip), %RSP_LP +# else + sub _dl_x86_cpu_features+XSAVE_STATE_SIZE_OFFSET(%rip), %RSP_LP +# endif +# endif # Preserve registers otherwise clobbered. movq %rax, REGISTER_SAVE_RAX(%rsp) movq %rcx, REGISTER_SAVE_RCX(%rsp) @@ -78,59 +97,42 @@ _dl_runtime_resolve: movq %rdi, REGISTER_SAVE_RDI(%rsp) movq %r8, REGISTER_SAVE_R8(%rsp) movq %r9, REGISTER_SAVE_R9(%rsp) - VMOV %VEC(0), (REGISTER_SAVE_VEC_OFF)(%rsp) - VMOV %VEC(1), (REGISTER_SAVE_VEC_OFF + VEC_SIZE)(%rsp) - VMOV %VEC(2), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 2)(%rsp) - VMOV %VEC(3), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 3)(%rsp) - VMOV %VEC(4), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 4)(%rsp) - VMOV %VEC(5), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 5)(%rsp) - VMOV %VEC(6), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 6)(%rsp) - VMOV %VEC(7), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 7)(%rsp) -#ifndef __ILP32__ - # We also have to preserve bound registers. These are nops if - # Intel MPX isn't available or disabled. -# ifdef HAVE_MPX_SUPPORT - bndmov %bnd0, REGISTER_SAVE_BND0(%rsp) - bndmov %bnd1, REGISTER_SAVE_BND1(%rsp) - bndmov %bnd2, REGISTER_SAVE_BND2(%rsp) - bndmov %bnd3, REGISTER_SAVE_BND3(%rsp) +# ifdef USE_FXSAVE + fxsave STATE_SAVE_OFFSET(%rsp) # else -# if REGISTER_SAVE_BND0 == 0 - .byte 0x66,0x0f,0x1b,0x04,0x24 + movl $STATE_SAVE_MASK, %eax + xorl %edx, %edx + # Clear the XSAVE Header. +# ifdef USE_XSAVE + movq %rdx, (STATE_SAVE_OFFSET + 512)(%rsp) + movq %rdx, (STATE_SAVE_OFFSET + 512 + 8)(%rsp) +# endif + movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 2)(%rsp) + movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 3)(%rsp) + movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 4)(%rsp) + movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 5)(%rsp) + movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 6)(%rsp) + movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 7)(%rsp) +# ifdef USE_XSAVE + xsave STATE_SAVE_OFFSET(%rsp) # else - .byte 0x66,0x0f,0x1b,0x44,0x24,REGISTER_SAVE_BND0 + xsavec STATE_SAVE_OFFSET(%rsp) # endif - .byte 0x66,0x0f,0x1b,0x4c,0x24,REGISTER_SAVE_BND1 - .byte 0x66,0x0f,0x1b,0x54,0x24,REGISTER_SAVE_BND2 - .byte 0x66,0x0f,0x1b,0x5c,0x24,REGISTER_SAVE_BND3 # endif -#endif # Copy args pushed by PLT in register. # %rdi: link_map, %rsi: reloc_index mov (LOCAL_STORAGE_AREA + 8)(%BASE), %RSI_LP mov LOCAL_STORAGE_AREA(%BASE), %RDI_LP call _dl_fixup # Call resolver. mov %RAX_LP, %R11_LP # Save return value -#ifndef __ILP32__ - # Restore bound registers. These are nops if Intel MPX isn't - # avaiable or disabled. -# ifdef HAVE_MPX_SUPPORT - bndmov REGISTER_SAVE_BND3(%rsp), %bnd3 - bndmov REGISTER_SAVE_BND2(%rsp), %bnd2 - bndmov REGISTER_SAVE_BND1(%rsp), %bnd1 - bndmov REGISTER_SAVE_BND0(%rsp), %bnd0 + # Get register content back. +# ifdef USE_FXSAVE + fxrstor STATE_SAVE_OFFSET(%rsp) # else - .byte 0x66,0x0f,0x1a,0x5c,0x24,REGISTER_SAVE_BND3 - .byte 0x66,0x0f,0x1a,0x54,0x24,REGISTER_SAVE_BND2 - .byte 0x66,0x0f,0x1a,0x4c,0x24,REGISTER_SAVE_BND1 -# if REGISTER_SAVE_BND0 == 0 - .byte 0x66,0x0f,0x1a,0x04,0x24 -# else - .byte 0x66,0x0f,0x1a,0x44,0x24,REGISTER_SAVE_BND0 -# endif + movl $STATE_SAVE_MASK, %eax + xorl %edx, %edx + xrstor STATE_SAVE_OFFSET(%rsp) # endif -#endif - # Get register content back. movq REGISTER_SAVE_R9(%rsp), %r9 movq REGISTER_SAVE_R8(%rsp), %r8 movq REGISTER_SAVE_RDI(%rsp), %rdi @@ -138,20 +140,12 @@ _dl_runtime_resolve: movq REGISTER_SAVE_RDX(%rsp), %rdx movq REGISTER_SAVE_RCX(%rsp), %rcx movq REGISTER_SAVE_RAX(%rsp), %rax - VMOV (REGISTER_SAVE_VEC_OFF)(%rsp), %VEC(0) - VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE)(%rsp), %VEC(1) - VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 2)(%rsp), %VEC(2) - VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 3)(%rsp), %VEC(3) - VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 4)(%rsp), %VEC(4) - VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 5)(%rsp), %VEC(5) - VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 6)(%rsp), %VEC(6) - VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 7)(%rsp), %VEC(7) -#if DL_RUNIME_RESOLVE_REALIGN_STACK +# if DL_RUNTIME_RESOLVE_REALIGN_STACK mov %RBX_LP, %RSP_LP cfi_def_cfa_register(%rsp) movq (%rsp), %rbx cfi_restore(%rbx) -#endif +# endif # Adjust stack(PLT did 2 pushes) add $(LOCAL_STORAGE_AREA + 16), %RSP_LP cfi_adjust_cfa_offset(-(LOCAL_STORAGE_AREA + 16)) @@ -160,9 +154,10 @@ _dl_runtime_resolve: jmp *%r11 # Jump to function address. cfi_endproc .size _dl_runtime_resolve, .-_dl_runtime_resolve +#endif -#ifndef PROF +#if !defined PROF && defined _dl_runtime_profile # if (LR_VECTOR_OFFSET % VEC_SIZE) != 0 # error LR_VECTOR_OFFSET must be multples of VEC_SIZE # endif @@ -174,6 +169,7 @@ _dl_runtime_resolve: _dl_runtime_profile: cfi_startproc cfi_adjust_cfa_offset(16) # Incorporate PLT + _CET_ENDBR /* The La_x86_64_regs data structure pointed to by the fourth paramater must be VEC_SIZE-byte aligned. This must be explicitly enforced. We have the set up a dynamically @@ -446,8 +442,16 @@ _dl_runtime_profile: # ifdef RESTORE_AVX /* sizeof(La_x86_64_retval). Need extra space for 2 SSE registers to detect if xmm0/xmm1 registers are changed - by audit module. */ - sub $(LRV_SIZE + XMM_SIZE*2), %RSP_LP + by audit module. Since rsp is aligned to VEC_SIZE, we + need to make sure that the address of La_x86_64_retval + + LRV_VECTOR0_OFFSET is aligned to VEC_SIZE. */ +# define LRV_SPACE (LRV_SIZE + XMM_SIZE*2) +# define LRV_MISALIGNED ((LRV_SIZE + LRV_VECTOR0_OFFSET) & (VEC_SIZE - 1)) +# if LRV_MISALIGNED == 0 + sub $LRV_SPACE, %RSP_LP +# else + sub $(LRV_SPACE + VEC_SIZE - LRV_MISALIGNED), %RSP_LP +# endif # else sub $LRV_SIZE, %RSP_LP # sizeof(La_x86_64_retval) # endif diff --git a/sysdeps/x86_64/ffs.c b/sysdeps/x86_64/ffs.c index be5b6c8589..fa5b20544d 100644 --- a/sysdeps/x86_64/ffs.c +++ b/sysdeps/x86_64/ffs.c @@ -1,7 +1,7 @@ /* ffs -- find first set bit in a word, counted from least significant end. For AMD x86-64. This file is part of the GNU C Library. - Copyright (C) 1991-2016 Free Software Foundation, Inc. + Copyright (C) 1991-2018 Free Software Foundation, Inc. Contributed by Ulrich Drepper <drepper@cygnus.com>. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/ffsll.c b/sysdeps/x86_64/ffsll.c index c0f5abc446..206deb6810 100644 --- a/sysdeps/x86_64/ffsll.c +++ b/sysdeps/x86_64/ffsll.c @@ -1,7 +1,7 @@ /* ffsll -- find first set bit in a word, counted from least significant end. For AMD x86-64. This file is part of the GNU C Library. - Copyright (C) 1991-2016 Free Software Foundation, Inc. + Copyright (C) 1991-2018 Free Software Foundation, Inc. Contributed by Ulrich Drepper <drepper@cygnus.com>. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/Makefile b/sysdeps/x86_64/fpu/Makefile index 88742faff1..2b7d69bb50 100644 --- a/sysdeps/x86_64/fpu/Makefile +++ b/sysdeps/x86_64/fpu/Makefile @@ -30,11 +30,171 @@ ifeq ($(subdir),math) ifeq ($(build-mathvec),yes) libmvec-tests += double-vlen2 double-vlen4 double-vlen4-avx2 \ float-vlen4 float-vlen8 float-vlen8-avx2 +tests += test-double-libmvec-alias test-double-libmvec-alias-avx \ + test-double-libmvec-alias-avx2 test-double-libmvec-alias-main \ + test-double-libmvec-alias-avx-main test-double-libmvec-alias-avx2-main \ + test-float-libmvec-alias test-float-libmvec-alias-avx \ + test-float-libmvec-alias-avx2 test-float-libmvec-alias-main \ + test-float-libmvec-alias-avx-main test-float-libmvec-alias-avx2-main \ + test-double-libmvec-sincos test-double-libmvec-sincos-avx \ + test-double-libmvec-sincos-avx2 test-float-libmvec-sincosf \ + test-float-libmvec-sincosf-avx test-float-libmvec-sincosf-avx2 +modules-names += test-double-libmvec-alias-mod \ + test-double-libmvec-alias-avx-mod \ + test-double-libmvec-alias-avx2-mod \ + test-float-libmvec-alias-mod \ + test-float-libmvec-alias-avx-mod \ + test-float-libmvec-alias-avx2-mod +modules-names-tests += test-double-libmvec-alias-mod \ + test-double-libmvec-alias-avx-mod \ + test-double-libmvec-alias-avx2-mod \ + test-float-libmvec-alias-mod \ + test-float-libmvec-alias-avx-mod \ + test-float-libmvec-alias-avx2-mod +extra-test-objs += test-double-libmvec-sincos-avx-main.o \ + test-double-libmvec-sincos-avx2-main.o \ + test-double-libmvec-sincos-main.o \ + test-float-libmvec-sincosf-avx-main.o \ + test-float-libmvec-sincosf-avx2-main.o\ + test-float-libmvec-sincosf-main.o +test-double-libmvec-alias-mod.so-no-z-defs = yes +test-double-libmvec-alias-avx-mod.so-no-z-defs = yes +test-double-libmvec-alias-avx2-mod.so-no-z-defs = yes +test-float-libmvec-alias-mod.so-no-z-defs = yes +test-float-libmvec-alias-avx-mod.so-no-z-defs = yes +test-float-libmvec-alias-avx2-mod.so-no-z-defs = yes + +$(objpfx)test-double-libmvec-alias: \ + $(objpfx)test-double-libmvec-alias-mod.so +$(objpfx)test-double-libmvec-alias-mod.so: \ + $(objpfx)../mathvec/libmvec_nonshared.a $(libmvec) + +$(objpfx)test-double-libmvec-alias-avx: \ + $(objpfx)test-double-libmvec-alias-avx-mod.so +$(objpfx)test-double-libmvec-alias-avx-mod.so: \ + $(objpfx)../mathvec/libmvec_nonshared.a $(libmvec) + +$(objpfx)test-double-libmvec-alias-avx2: \ + $(objpfx)test-double-libmvec-alias-avx2-mod.so +$(objpfx)test-double-libmvec-alias-avx2-mod.so: \ + $(objpfx)../mathvec/libmvec_nonshared.a $(libmvec) + +$(objpfx)test-double-libmvec-alias-main: \ + $(objpfx)test-double-libmvec-alias-mod.os \ + $(objpfx)../mathvec/libmvec_nonshared.a $(libmvec) + +$(objpfx)test-double-libmvec-alias-avx-main: \ + $(objpfx)test-double-libmvec-alias-avx-mod.os \ + $(objpfx)../mathvec/libmvec_nonshared.a $(libmvec) + +$(objpfx)test-double-libmvec-alias-avx2-main: \ + $(objpfx)test-double-libmvec-alias-avx2-mod.os \ + $(objpfx)../mathvec/libmvec_nonshared.a $(libmvec) + +$(objpfx)test-float-libmvec-alias: \ + $(objpfx)test-float-libmvec-alias-mod.so +$(objpfx)test-float-libmvec-alias-mod.so: \ + $(objpfx)../mathvec/libmvec_nonshared.a $(libmvec) + +$(objpfx)test-float-libmvec-alias-avx: \ + $(objpfx)test-float-libmvec-alias-avx-mod.so +$(objpfx)test-float-libmvec-alias-avx-mod.so: \ + $(objpfx)../mathvec/libmvec_nonshared.a $(libmvec) + +$(objpfx)test-float-libmvec-alias-avx2: \ + $(objpfx)test-float-libmvec-alias-avx2-mod.so +$(objpfx)test-float-libmvec-alias-avx2-mod.so: \ + $(objpfx)../mathvec/libmvec_nonshared.a $(libmvec) + +$(objpfx)test-float-libmvec-alias-main: \ + $(objpfx)test-float-libmvec-alias-mod.os \ + $(objpfx)../mathvec/libmvec_nonshared.a $(libmvec) + +$(objpfx)test-float-libmvec-alias-avx-main: \ + $(objpfx)test-float-libmvec-alias-avx-mod.os \ + $(objpfx)../mathvec/libmvec_nonshared.a $(libmvec) + +$(objpfx)test-float-libmvec-alias-avx2-main: \ + $(objpfx)test-float-libmvec-alias-avx2-mod.os \ + $(objpfx)../mathvec/libmvec_nonshared.a $(libmvec) + +$(objpfx)test-double-libmvec-sincos: \ + $(objpfx)test-double-libmvec-sincos.o \ + $(objpfx)test-double-libmvec-sincos-main.o $(libmvec) + +$(objpfx)test-double-libmvec-sincos-avx: \ + $(objpfx)test-double-libmvec-sincos-avx.o \ + $(objpfx)test-double-libmvec-sincos-avx-main.o $(libmvec) + +$(objpfx)test-double-libmvec-sincos-avx2: \ + $(objpfx)test-double-libmvec-sincos-avx2.o \ + $(objpfx)test-double-libmvec-sincos-avx2-main.o $(libmvec) + +$(objpfx)test-float-libmvec-sincosf: \ + $(objpfx)test-float-libmvec-sincosf.o \ + $(objpfx)test-float-libmvec-sincosf-main.o $(libmvec) + +$(objpfx)test-float-libmvec-sincosf-avx: \ + $(objpfx)test-float-libmvec-sincosf-avx.o \ + $(objpfx)test-float-libmvec-sincosf-avx-main.o $(libmvec) + +$(objpfx)test-float-libmvec-sincosf-avx2: \ + $(objpfx)test-float-libmvec-sincosf-avx2.o \ + $(objpfx)test-float-libmvec-sincosf-avx2-main.o $(libmvec) ifeq (yes,$(config-cflags-avx512)) libmvec-tests += double-vlen8 float-vlen16 +tests += test-double-libmvec-alias-avx512 \ + test-float-libmvec-alias-avx512 \ + test-double-libmvec-alias-avx512-main \ + test-float-libmvec-alias-avx512-main \ + test-double-libmvec-sincos-avx512 \ + test-float-libmvec-sincosf-avx512 +modules-names += test-double-libmvec-alias-avx512-mod \ + test-float-libmvec-alias-avx512-mod +modules-names-tests += test-double-libmvec-alias-avx512-mod \ + test-float-libmvec-alias-avx512-mod +extra-test-objs += test-double-libmvec-sincos-avx512-main.o \ + test-float-libmvec-sincosf-avx512-main.o +test-double-libmvec-alias-avx512-mod.so-no-z-defs = yes +test-float-libmvec-alias-avx512-mod.so-no-z-defs = yes + +$(objpfx)test-double-libmvec-alias-avx512: \ + $(objpfx)test-double-libmvec-alias-avx512-mod.so +$(objpfx)test-double-libmvec-alias-avx512-mod.so: \ + $(objpfx)../mathvec/libmvec_nonshared.a $(libmvec) + +$(objpfx)test-double-libmvec-alias-avx512-main: \ + $(objpfx)test-double-libmvec-alias-avx512-mod.os \ + $(objpfx)../mathvec/libmvec_nonshared.a $(libmvec) + +$(objpfx)test-float-libmvec-alias-avx512: \ + $(objpfx)test-float-libmvec-alias-avx512-mod.so +$(objpfx)test-float-libmvec-alias-avx512-mod.so: \ + $(objpfx)../mathvec/libmvec_nonshared.a $(libmvec) + +$(objpfx)test-float-libmvec-alias-avx512-main: \ + $(objpfx)test-float-libmvec-alias-avx512-mod.os \ + $(objpfx)../mathvec/libmvec_nonshared.a $(libmvec) + +$(objpfx)test-double-libmvec-sincos-avx512: \ + $(objpfx)test-double-libmvec-sincos-avx512.o \ + $(objpfx)test-double-libmvec-sincos-avx512-main.o $(libmvec) + +$(objpfx)test-float-libmvec-sincosf-avx512: \ + $(objpfx)test-float-libmvec-sincosf-avx512.o \ + $(objpfx)test-float-libmvec-sincosf-avx512-main.o $(libmvec) endif +double-vlen2-funcs = cos exp log pow sin sincos +double-vlen4-funcs = cos exp log pow sin sincos +double-vlen4-avx2-funcs = cos exp log pow sin sincos +double-vlen8-funcs = cos exp log pow sin sincos +float-vlen4-funcs = cos exp log pow sin sincos +float-vlen8-funcs = cos exp log pow sin sincos +float-vlen8-avx2-funcs = cos exp log pow sin sincos +float-vlen16-funcs = cos exp log pow sin sincos + double-vlen4-arch-ext-cflags = -mavx double-vlen4-arch-ext2-cflags = -mavx2 double-vlen8-arch-ext-cflags = -mavx512f @@ -43,11 +203,37 @@ float-vlen8-arch-ext-cflags = -mavx float-vlen8-arch-ext2-cflags = -mavx2 float-vlen16-arch-ext-cflags = -mavx512f -CFLAGS-test-double-vlen4-avx2.c = $(libm-test-vec-cflags) +libmvec-sincos-cflags = $(libm-test-fast-math-cflags) -fno-inline -fopenmp -Wno-unknown-pragmas +libmvec-alias-cflags = $(libmvec-sincos-cflags) -ffloat-store -ffinite-math-only + +CFLAGS-test-double-libmvec-alias-mod.c = $(libmvec-alias-cflags) +CFLAGS-test-double-libmvec-alias-avx-mod.c = $(double-vlen4-arch-ext-cflags) $(libmvec-alias-cflags) -DREQUIRE_AVX +CFLAGS-test-double-libmvec-alias-avx2-mod.c = $(double-vlen4-arch-ext2-cflags) $(libmvec-alias-cflags) -DREQUIRE_AVX2 +CFLAGS-test-double-libmvec-alias-avx512-mod.c = $(double-vlen8-arch-ext-cflags) $(libmvec-alias-cflags) -DREQUIRE_AVX512F + +CFLAGS-test-float-libmvec-alias-mod.c = $(libmvec-alias-cflags) +CFLAGS-test-float-libmvec-alias-avx-mod.c = $(double-vlen4-arch-ext-cflags) $(libmvec-alias-cflags) -DREQUIRE_AVX +CFLAGS-test-float-libmvec-alias-avx2-mod.c = $(double-vlen4-arch-ext2-cflags) $(libmvec-alias-cflags) -DREQUIRE_AVX2 +CFLAGS-test-float-libmvec-alias-avx512-mod.c = $(double-vlen8-arch-ext-cflags) $(libmvec-alias-cflags) -DREQUIRE_AVX512F + CFLAGS-test-double-vlen4-avx2-wrappers.c = $(double-vlen4-arch-ext2-cflags) -CFLAGS-test-float-vlen8-avx2.c = $(libm-test-vec-cflags) CFLAGS-test-float-vlen8-avx2-wrappers.c = $(float-vlen8-arch-ext2-cflags) +CFLAGS-test-double-libmvec-sincos-main.c = $(libmvec-sincos-cflags) +CFLAGS-test-double-libmvec-sincos-avx.c = -DREQUIRE_AVX +CFLAGS-test-double-libmvec-sincos-avx-main.c = $(libmvec-sincos-cflags) $(double-vlen4-arch-ext-cflags) +CFLAGS-test-double-libmvec-sincos-avx2.c = -DREQUIRE_AVX2 +CFLAGS-test-double-libmvec-sincos-avx2-main.c = $(libmvec-sincos-cflags) $(double-vlen4-arch-ext2-cflags) +CFLAGS-test-double-libmvec-sincos-avx512.c = -DREQUIRE_AVX512F +CFLAGS-test-double-libmvec-sincos-avx512-main.c = $(libmvec-sincos-cflags) $(double-vlen8-arch-ext-cflags) + +CFLAGS-test-float-libmvec-sincosf-main.c = $(libmvec-sincos-cflags) +CFLAGS-test-float-libmvec-sincosf-avx.c = -DREQUIRE_AVX +CFLAGS-test-float-libmvec-sincosf-avx-main.c = $(libmvec-sincos-cflags) $(float-vlen8-arch-ext-cflags) +CFLAGS-test-float-libmvec-sincosf-avx2.c = -DREQUIRE_AVX2 +CFLAGS-test-float-libmvec-sincosf-avx2-main.c = $(libmvec-sincos-cflags) $(float-vlen8-arch-ext2-cflags) +CFLAGS-test-float-libmvec-sincosf-avx512.c = -DREQUIRE_AVX512F +CFLAGS-test-float-libmvec-sincosf-avx512-main.c = $(libmvec-sincos-cflags) $(float-vlen16-arch-ext-cflags) endif endif diff --git a/sysdeps/x86_64/fpu/dla.h b/sysdeps/x86_64/fpu/dla.h deleted file mode 100644 index 688efa0f5b..0000000000 --- a/sysdeps/x86_64/fpu/dla.h +++ /dev/null @@ -1,8 +0,0 @@ -#include <features.h> - -#ifdef __FMA4__ -# define DLA_FMS(x,y,z) \ - __builtin_fma (x, y, -(z)) -#endif - -#include "sysdeps/ieee754/dbl-64/dla.h" diff --git a/sysdeps/x86_64/fpu/e_expf.S b/sysdeps/x86_64/fpu/e_expf.S deleted file mode 100644 index d4b63a8d8e..0000000000 --- a/sysdeps/x86_64/fpu/e_expf.S +++ /dev/null @@ -1,339 +0,0 @@ -/* Optimized __ieee754_expf function. - Copyright (C) 2012-2016 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> - -/* Short algorithm description: - * - * Let K = 64 (table size). - * e^x = 2^(x/log(2)) = 2^n * T[j] * (1 + P(y)) - * where - * x = m*log(2)/K + y, y in [0.0..log(2)/K] - * m = n*K + j, m,n,j - signed integer, j in [0..K-1] - * values of 2^(j/K) are tabulated as T[j]. - * - * P(y) is a minimax polynomial approximation of expf(x)-1 - * on small interval [0.0..log(2)/K]. - * - * P(y) = P3*y*y*y*y + P2*y*y*y + P1*y*y + P0*y, calculated as - * z = y*y; P(y) = (P3*z + P1)*z + (P2*z + P0)*y - * - * Special cases: - * expf(NaN) = NaN - * expf(+INF) = +INF - * expf(-INF) = 0 - * expf(x) = 1 for subnormals - * for finite argument, only expf(0)=1 is exact - * expf(x) overflows if x>88.7228317260742190 - * expf(x) underflows if x<-103.972076416015620 - */ - - .text -ENTRY(__ieee754_expf) - /* Input: single precision x in %xmm0 */ - cvtss2sd %xmm0, %xmm1 /* Convert x to double precision */ - movd %xmm0, %ecx /* Copy x */ - movsd L(DP_KLN2)(%rip), %xmm2 /* DP K/log(2) */ - movsd L(DP_P2)(%rip), %xmm3 /* DP P2 */ - movl %ecx, %eax /* x */ - mulsd %xmm1, %xmm2 /* DP x*K/log(2) */ - andl $0x7fffffff, %ecx /* |x| */ - lea L(DP_T)(%rip), %rsi /* address of table T[j] */ - cmpl $0x42ad496b, %ecx /* |x|<125*log(2) ? */ - movsd L(DP_P3)(%rip), %xmm4 /* DP P3 */ - addsd L(DP_RS)(%rip), %xmm2 /* DP x*K/log(2)+RS */ - jae L(special_paths) - - /* Here if |x|<125*log(2) */ - cmpl $0x31800000, %ecx /* |x|<2^(-28) ? */ - jb L(small_arg) - - /* Main path: here if 2^(-28)<=|x|<125*log(2) */ - cvtsd2ss %xmm2, %xmm2 /* SP x*K/log(2)+RS */ - movd %xmm2, %eax /* bits of n*K+j with trash */ - subss L(SP_RS)(%rip), %xmm2 /* SP t=round(x*K/log(2)) */ - movl %eax, %edx /* n*K+j with trash */ - cvtss2sd %xmm2, %xmm2 /* DP t */ - andl $0x3f, %eax /* bits of j */ - mulsd L(DP_NLN2K)(%rip), %xmm2/* DP -t*log(2)/K */ - andl $0xffffffc0, %edx /* bits of n */ -#ifdef __AVX__ - vaddsd %xmm1, %xmm2, %xmm0 /* DP y=x-t*log(2)/K */ - vmulsd %xmm0, %xmm0, %xmm2 /* DP z=y*y */ -#else - addsd %xmm1, %xmm2 /* DP y=x-t*log(2)/K */ - movaps %xmm2, %xmm0 /* DP y */ - mulsd %xmm2, %xmm2 /* DP z=y*y */ -#endif - mulsd %xmm2, %xmm4 /* DP P3*z */ - addl $0x1fc0, %edx /* bits of n + SP exponent bias */ - mulsd %xmm2, %xmm3 /* DP P2*z */ - shll $17, %edx /* SP 2^n */ - addsd L(DP_P1)(%rip), %xmm4 /* DP P3*z+P1 */ - addsd L(DP_P0)(%rip), %xmm3 /* DP P2*z+P0 */ - movd %edx, %xmm1 /* SP 2^n */ - mulsd %xmm2, %xmm4 /* DP (P3*z+P1)*z */ - mulsd %xmm3, %xmm0 /* DP (P2*z+P0)*y */ - addsd %xmm4, %xmm0 /* DP P(y) */ - mulsd (%rsi,%rax,8), %xmm0 /* DP P(y)*T[j] */ - addsd (%rsi,%rax,8), %xmm0 /* DP T[j]*(P(y)+1) */ - cvtsd2ss %xmm0, %xmm0 /* SP T[j]*(P(y)+1) */ - mulss %xmm1, %xmm0 /* SP result=2^n*(T[j]*(P(y)+1)) */ - ret - - .p2align 4 -L(small_arg): - /* Here if 0<=|x|<2^(-28) */ - addss L(SP_ONE)(%rip), %xmm0 /* 1.0 + x */ - /* Return 1.0 with inexact raised, except for x==0 */ - ret - - .p2align 4 -L(special_paths): - /* Here if 125*log(2)<=|x| */ - shrl $31, %eax /* Get sign bit of x, and depending on it: */ - lea L(SP_RANGE)(%rip), %rdx /* load over/underflow bound */ - cmpl (%rdx,%rax,4), %ecx /* |x|<under/overflow bound ? */ - jbe L(near_under_or_overflow) - - /* Here if |x|>under/overflow bound */ - cmpl $0x7f800000, %ecx /* |x| is finite ? */ - jae L(arg_inf_or_nan) - - /* Here if |x|>under/overflow bound, and x is finite */ - testq %rax, %rax /* sign of x nonzero ? */ - je L(res_overflow) - - /* Here if -inf<x<underflow bound (x<0) */ - movss L(SP_SMALL)(%rip), %xmm0/* load small value 2^(-100) */ - mulss %xmm0, %xmm0 /* Return underflowed result (zero or subnormal) */ - ret - - .p2align 4 -L(res_overflow): - /* Here if overflow bound<x<inf (x>0) */ - movss L(SP_LARGE)(%rip), %xmm0/* load large value 2^100 */ - mulss %xmm0, %xmm0 /* Return overflowed result (Inf or max normal) */ - ret - - .p2align 4 -L(arg_inf_or_nan): - /* Here if |x| is Inf or NAN */ - jne L(arg_nan) /* |x| is Inf ? */ - - /* Here if |x| is Inf */ - lea L(SP_INF_0)(%rip), %rdx /* depending on sign of x: */ - movss (%rdx,%rax,4), %xmm0 /* return zero or Inf */ - ret - - .p2align 4 -L(arg_nan): - /* Here if |x| is NaN */ - addss %xmm0, %xmm0 /* Return x+x (raise invalid) */ - ret - - .p2align 4 -L(near_under_or_overflow): - /* Here if 125*log(2)<=|x|<under/overflow bound */ - cvtsd2ss %xmm2, %xmm2 /* SP x*K/log(2)+RS */ - movd %xmm2, %eax /* bits of n*K+j with trash */ - subss L(SP_RS)(%rip), %xmm2 /* SP t=round(x*K/log(2)) */ - movl %eax, %edx /* n*K+j with trash */ - cvtss2sd %xmm2, %xmm2 /* DP t */ - andl $0x3f, %eax /* bits of j */ - mulsd L(DP_NLN2K)(%rip), %xmm2/* DP -t*log(2)/K */ - andl $0xffffffc0, %edx /* bits of n */ -#ifdef __AVX__ - vaddsd %xmm1, %xmm2, %xmm0 /* DP y=x-t*log(2)/K */ - vmulsd %xmm0, %xmm0, %xmm2 /* DP z=y*y */ -#else - addsd %xmm1, %xmm2 /* DP y=x-t*log(2)/K */ - movaps %xmm2, %xmm0 /* DP y */ - mulsd %xmm2, %xmm2 /* DP z=y*y */ -#endif - mulsd %xmm2, %xmm4 /* DP P3*z */ - addl $0xffc0, %edx /* bits of n + DP exponent bias */ - mulsd %xmm2, %xmm3 /* DP P2*z */ - shlq $46, %rdx /* DP 2^n */ - addsd L(DP_P1)(%rip), %xmm4 /* DP P3*z+P1 */ - addsd L(DP_P0)(%rip), %xmm3 /* DP P2*z+P0 */ - movd %rdx, %xmm1 /* DP 2^n */ - mulsd %xmm2, %xmm4 /* DP (P3*z+P1)*z */ - mulsd %xmm3, %xmm0 /* DP (P2*z+P0)*y */ - addsd %xmm4, %xmm0 /* DP P(y) */ - mulsd (%rsi,%rax,8), %xmm0 /* DP P(y)*T[j] */ - addsd (%rsi,%rax,8), %xmm0 /* DP T[j]*(P(y)+1) */ - mulsd %xmm1, %xmm0 /* DP result=2^n*(T[j]*(P(y)+1)) */ - cvtsd2ss %xmm0, %xmm0 /* convert result to single precision */ - ret -END(__ieee754_expf) - - .section .rodata, "a" - .p2align 3 -L(DP_T): /* table of double precision values 2^(j/K) for j=[0..K-1] */ - .long 0x00000000, 0x3ff00000 - .long 0x3e778061, 0x3ff02c9a - .long 0xd3158574, 0x3ff059b0 - .long 0x18759bc8, 0x3ff08745 - .long 0x6cf9890f, 0x3ff0b558 - .long 0x32d3d1a2, 0x3ff0e3ec - .long 0xd0125b51, 0x3ff11301 - .long 0xaea92de0, 0x3ff1429a - .long 0x3c7d517b, 0x3ff172b8 - .long 0xeb6fcb75, 0x3ff1a35b - .long 0x3168b9aa, 0x3ff1d487 - .long 0x88628cd6, 0x3ff2063b - .long 0x6e756238, 0x3ff2387a - .long 0x65e27cdd, 0x3ff26b45 - .long 0xf51fdee1, 0x3ff29e9d - .long 0xa6e4030b, 0x3ff2d285 - .long 0x0a31b715, 0x3ff306fe - .long 0xb26416ff, 0x3ff33c08 - .long 0x373aa9cb, 0x3ff371a7 - .long 0x34e59ff7, 0x3ff3a7db - .long 0x4c123422, 0x3ff3dea6 - .long 0x21f72e2a, 0x3ff4160a - .long 0x6061892d, 0x3ff44e08 - .long 0xb5c13cd0, 0x3ff486a2 - .long 0xd5362a27, 0x3ff4bfda - .long 0x769d2ca7, 0x3ff4f9b2 - .long 0x569d4f82, 0x3ff5342b - .long 0x36b527da, 0x3ff56f47 - .long 0xdd485429, 0x3ff5ab07 - .long 0x15ad2148, 0x3ff5e76f - .long 0xb03a5585, 0x3ff6247e - .long 0x82552225, 0x3ff66238 - .long 0x667f3bcd, 0x3ff6a09e - .long 0x3c651a2f, 0x3ff6dfb2 - .long 0xe8ec5f74, 0x3ff71f75 - .long 0x564267c9, 0x3ff75feb - .long 0x73eb0187, 0x3ff7a114 - .long 0x36cf4e62, 0x3ff7e2f3 - .long 0x994cce13, 0x3ff82589 - .long 0x9b4492ed, 0x3ff868d9 - .long 0x422aa0db, 0x3ff8ace5 - .long 0x99157736, 0x3ff8f1ae - .long 0xb0cdc5e5, 0x3ff93737 - .long 0x9fde4e50, 0x3ff97d82 - .long 0x82a3f090, 0x3ff9c491 - .long 0x7b5de565, 0x3ffa0c66 - .long 0xb23e255d, 0x3ffa5503 - .long 0x5579fdbf, 0x3ffa9e6b - .long 0x995ad3ad, 0x3ffae89f - .long 0xb84f15fb, 0x3ffb33a2 - .long 0xf2fb5e47, 0x3ffb7f76 - .long 0x904bc1d2, 0x3ffbcc1e - .long 0xdd85529c, 0x3ffc199b - .long 0x2e57d14b, 0x3ffc67f1 - .long 0xdcef9069, 0x3ffcb720 - .long 0x4a07897c, 0x3ffd072d - .long 0xdcfba487, 0x3ffd5818 - .long 0x03db3285, 0x3ffda9e6 - .long 0x337b9b5f, 0x3ffdfc97 - .long 0xe78b3ff6, 0x3ffe502e - .long 0xa2a490da, 0x3ffea4af - .long 0xee615a27, 0x3ffefa1b - .long 0x5b6e4540, 0x3fff5076 - .long 0x819e90d8, 0x3fffa7c1 - .type L(DP_T), @object - ASM_SIZE_DIRECTIVE(L(DP_T)) - - .section .rodata.cst8,"aM",@progbits,8 - .p2align 3 -L(DP_KLN2): /* double precision K/log(2) */ - .long 0x652b82fe, 0x40571547 - .type L(DP_KLN2), @object - ASM_SIZE_DIRECTIVE(L(DP_KLN2)) - - .p2align 3 -L(DP_NLN2K): /* double precision -log(2)/K */ - .long 0xfefa39ef, 0xbf862e42 - .type L(DP_NLN2K), @object - ASM_SIZE_DIRECTIVE(L(DP_NLN2K)) - - .p2align 3 -L(DP_RS): /* double precision 2^23+2^22 */ - .long 0x00000000, 0x41680000 - .type L(DP_RS), @object - ASM_SIZE_DIRECTIVE(L(DP_RS)) - - .p2align 3 -L(DP_P3): /* double precision polynomial coefficient P3 */ - .long 0xeb78fa85, 0x3fa56420 - .type L(DP_P3), @object - ASM_SIZE_DIRECTIVE(L(DP_P3)) - - .p2align 3 -L(DP_P1): /* double precision polynomial coefficient P1 */ - .long 0x008d6118, 0x3fe00000 - .type L(DP_P1), @object - ASM_SIZE_DIRECTIVE(L(DP_P1)) - - .p2align 3 -L(DP_P2): /* double precision polynomial coefficient P2 */ - .long 0xda752d4f, 0x3fc55550 - .type L(DP_P2), @object - ASM_SIZE_DIRECTIVE(L(DP_P2)) - - .p2align 3 -L(DP_P0): /* double precision polynomial coefficient P0 */ - .long 0xffffe7c6, 0x3fefffff - .type L(DP_P0), @object - ASM_SIZE_DIRECTIVE(L(DP_P0)) - - .p2align 2 -L(SP_RANGE): /* single precision overflow/underflow bounds */ - .long 0x42b17217 /* if x>this bound, then result overflows */ - .long 0x42cff1b4 /* if x<this bound, then result underflows */ - .type L(SP_RANGE), @object - ASM_SIZE_DIRECTIVE(L(SP_RANGE)) - - .p2align 2 -L(SP_INF_0): - .long 0x7f800000 /* single precision Inf */ - .long 0 /* single precision zero */ - .type L(SP_INF_0), @object - ASM_SIZE_DIRECTIVE(L(SP_INF_0)) - - .section .rodata.cst4,"aM",@progbits,4 - .p2align 2 -L(SP_RS): /* single precision 2^23+2^22 */ - .long 0x4b400000 - .type L(SP_RS), @object - ASM_SIZE_DIRECTIVE(L(SP_RS)) - - .p2align 2 -L(SP_SMALL): /* single precision small value 2^(-100) */ - .long 0x0d800000 - .type L(SP_SMALL), @object - ASM_SIZE_DIRECTIVE(L(SP_SMALL)) - - .p2align 2 -L(SP_LARGE): /* single precision large value 2^100 */ - .long 0x71800000 - .type L(SP_LARGE), @object - ASM_SIZE_DIRECTIVE(L(SP_LARGE)) - - .p2align 2 -L(SP_ONE): /* single precision 1.0 */ - .long 0x3f800000 - .type L(SP_ONE), @object - ASM_SIZE_DIRECTIVE(L(SP_ONE)) - -strong_alias (__ieee754_expf, __expf_finite) diff --git a/sysdeps/x86_64/fpu/e_expl.S b/sysdeps/x86_64/fpu/e_expl.S index 8b3ddaec59..b75a103803 100644 --- a/sysdeps/x86_64/fpu/e_expl.S +++ b/sysdeps/x86_64/fpu/e_expl.S @@ -22,6 +22,7 @@ * -- moshier@na-net.ornl.gov */ +#include <libm-alias-ldouble.h> #include <machine/asm.h> #include <x86_64-math-asm.h> @@ -99,7 +100,7 @@ ENTRY(IEEE754_EXPL) /* Below -64.0 (may be -NaN or -Inf). */ andb %ah, %dh cmpb $0x01, %dh - je 2f /* Is +-NaN, jump. */ + je 6f /* Is +-NaN, jump. */ jmp 1f /* -large, possibly -Inf. */ 4: /* In range -64.0 to 64.0 (may be +-0 but not NaN or +-Inf). */ @@ -141,7 +142,7 @@ ENTRY(IEEE754_EXPL) cmpb $0x05, %dh je 1f /* Is +-Inf, jump. */ cmpb $0x01, %dh - je 2f /* Is +-NaN, jump. */ + je 6f /* Is +-NaN, jump. */ /* Overflow or underflow; saturate. */ fstp %st fldt MO(csat) @@ -207,10 +208,13 @@ ENTRY(IEEE754_EXPL) fldz /* Set result to 0. */ #endif 2: ret +6: /* NaN argument. */ + fadd %st + ret END(IEEE754_EXPL) #ifdef USE_AS_EXPM1L libm_hidden_def (__expm1l) -weak_alias (__expm1l, expm1l) +libm_alias_ldouble (__expm1, expm1) #else strong_alias (IEEE754_EXPL, EXPL_FINITE) #endif diff --git a/sysdeps/x86_64/fpu/e_log10l.S b/sysdeps/x86_64/fpu/e_log10l.S index 8fa61644c1..e0cb88e32e 100644 --- a/sysdeps/x86_64/fpu/e_log10l.S +++ b/sysdeps/x86_64/fpu/e_log10l.S @@ -64,6 +64,7 @@ ENTRY(__ieee754_log10l) jnz 4b // in case x is ±Inf fstp %st(1) fstp %st(1) + fadd %st(0) ret END(__ieee754_log10l) diff --git a/sysdeps/x86_64/fpu/e_log2l.S b/sysdeps/x86_64/fpu/e_log2l.S index a063255ddd..023ec29164 100644 --- a/sysdeps/x86_64/fpu/e_log2l.S +++ b/sysdeps/x86_64/fpu/e_log2l.S @@ -63,6 +63,7 @@ ENTRY(__ieee754_log2l) jnz 4b // in case x is ±Inf fstp %st(1) fstp %st(1) + fadd %st(0) ret END (__ieee754_log2l) diff --git a/sysdeps/x86_64/fpu/e_logl.S b/sysdeps/x86_64/fpu/e_logl.S index dbe6fd59dc..0d3576f48b 100644 --- a/sysdeps/x86_64/fpu/e_logl.S +++ b/sysdeps/x86_64/fpu/e_logl.S @@ -66,6 +66,7 @@ ENTRY(__ieee754_logl) jnz 4b // in case x is +-Inf fstp %st(1) fstp %st(1) + fadd %st(0) ret END (__ieee754_logl) diff --git a/sysdeps/x86_64/fpu/e_powl.S b/sysdeps/x86_64/fpu/e_powl.S index 1f68cf0102..f32228104e 100644 --- a/sysdeps/x86_64/fpu/e_powl.S +++ b/sysdeps/x86_64/fpu/e_powl.S @@ -1,5 +1,5 @@ /* ix87 specific implementation of pow function. - Copyright (C) 1996-2016 Free Software Foundation, Inc. + Copyright (C) 1996-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996. @@ -26,9 +26,9 @@ .type one,@object one: .double 1.0 ASM_SIZE_DIRECTIVE(one) - .type p3,@object -p3: .byte 0, 0, 0, 0, 0, 0, 0x20, 0x40 - ASM_SIZE_DIRECTIVE(p3) + .type p2,@object +p2: .byte 0, 0, 0, 0, 0, 0, 0x10, 0x40 + ASM_SIZE_DIRECTIVE(p2) .type p63,@object p63: .byte 0, 0, 0, 0, 0, 0, 0xe0, 0x43 ASM_SIZE_DIRECTIVE(p63) @@ -136,12 +136,12 @@ ENTRY(__ieee754_powl) jmp 3f 9: /* OK, we have an integer value for y. Unless very small - (we use < 8), use the algorithm for real exponent to avoid + (we use < 4), use the algorithm for real exponent to avoid accumulation of errors. */ - fldl MO(p3) // 8 : y : x - fld %st(1) // y : 8 : y : x - fabs // |y| : 8 : y : x - fcomip %st(1), %st // 8 : y : x + fldl MO(p2) // 4 : y : x + fld %st(1) // y : 4 : y : x + fabs // |y| : 4 : y : x + fcomip %st(1), %st // 4 : y : x fstp %st(0) // y : x jnc 3f mov -8(%rsp),%eax @@ -184,9 +184,15 @@ ENTRY(__ieee754_powl) 30: fldt 8(%rsp) // x : y fldl MO(one) // 1.0 : x : y fucomip %st(1),%st // x : y - je 31f - fxch // y : x -31: fstp %st(1) + je 32f +31: /* At least one argument NaN, and result should be NaN. */ + faddp + ret +32: jc 31b + /* pow (1, NaN); check if the NaN signaling. */ + testb $0x40, 31(%rsp) + jz 31b + fstp %st(1) ret .align ALIGNARG(4) @@ -217,12 +223,24 @@ ENTRY(__ieee754_powl) cfi_adjust_cfa_offset (-40) ret - // pow(x,±0) = 1 + // pow(x,±0) = 1, unless x is sNaN .align ALIGNARG(4) 11: fstp %st(0) // pop y + fldt 8(%rsp) // x + fxam + fnstsw + andb $0x45, %ah + cmpb $0x01, %ah + je 112f // x is NaN +111: fstp %st(0) fldl MO(one) ret +112: testb $0x40, 15(%rsp) + jnz 111b + fadd %st(0) + ret + // y == ±inf .align ALIGNARG(4) 12: fstp %st(0) // pop y @@ -255,6 +273,7 @@ ENTRY(__ieee754_powl) .align ALIGNARG(4) 13: fldt 8(%rsp) // load x == NaN + fadd %st(0) ret .align ALIGNARG(4) diff --git a/sysdeps/x86_64/fpu/e_scalbl.S b/sysdeps/x86_64/fpu/e_scalbl.S index 331bee580c..2982dc3b9e 100644 --- a/sysdeps/x86_64/fpu/e_scalbl.S +++ b/sysdeps/x86_64/fpu/e_scalbl.S @@ -44,7 +44,7 @@ ENTRY(__ieee754_scalbl) fnstsw andl $0x4500, %eax cmpl $0x0100, %eax - je 3f + je 2f fld %st(1) frndint fcomip %st(2), %st @@ -75,15 +75,8 @@ ENTRY(__ieee754_scalbl) #endif ret - /* The result is NaN, but we must not raise an exception. - So use a variable. */ -2: fstp %st - fstp %st - fldl MO(nan) - ret - - /* The first parameter is a NaN. Return it. */ -3: fstp %st(1) + /* The result is NaN; raise an exception for sNaN arguments. */ +2: faddp ret /* Return NaN and raise the invalid exception. */ diff --git a/sysdeps/x86_64/fpu/e_sqrt.c b/sysdeps/x86_64/fpu/e_sqrt.c index 4b86434913..f4c2e5fd7c 100644 --- a/sysdeps/x86_64/fpu/e_sqrt.c +++ b/sysdeps/x86_64/fpu/e_sqrt.c @@ -1,5 +1,5 @@ /* Square root of floating point number. - Copyright (C) 2002-2016 Free Software Foundation, Inc. + Copyright (C) 2002-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/e_sqrtf.c b/sysdeps/x86_64/fpu/e_sqrtf.c index 639137b735..8f76ccb530 100644 --- a/sysdeps/x86_64/fpu/e_sqrtf.c +++ b/sysdeps/x86_64/fpu/e_sqrtf.c @@ -1,5 +1,5 @@ /* Square root of floating point number. - Copyright (C) 2002-2016 Free Software Foundation, Inc. + Copyright (C) 2002-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/fclrexcpt.c b/sysdeps/x86_64/fpu/fclrexcpt.c index a8e00c0141..e7f6aa341f 100644 --- a/sysdeps/x86_64/fpu/fclrexcpt.c +++ b/sysdeps/x86_64/fpu/fclrexcpt.c @@ -1,5 +1,5 @@ /* Clear given exceptions in current floating-point environment. - Copyright (C) 2001-2016 Free Software Foundation, Inc. + Copyright (C) 2001-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/fedisblxcpt.c b/sysdeps/x86_64/fpu/fedisblxcpt.c index f1ea6cfa97..9153f997ed 100644 --- a/sysdeps/x86_64/fpu/fedisblxcpt.c +++ b/sysdeps/x86_64/fpu/fedisblxcpt.c @@ -1,5 +1,5 @@ /* Disable floating-point exceptions. - Copyright (C) 2001-2016 Free Software Foundation, Inc. + Copyright (C) 2001-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Andreas Jaeger <aj@suse.de>, 2001. diff --git a/sysdeps/x86_64/fpu/feenablxcpt.c b/sysdeps/x86_64/fpu/feenablxcpt.c index df4c628b8d..7a3e26b3f9 100644 --- a/sysdeps/x86_64/fpu/feenablxcpt.c +++ b/sysdeps/x86_64/fpu/feenablxcpt.c @@ -1,5 +1,5 @@ /* Enable floating-point exceptions. - Copyright (C) 2001-2016 Free Software Foundation, Inc. + Copyright (C) 2001-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Andreas Jaeger <aj@suse.de>, 2001. diff --git a/sysdeps/x86_64/fpu/fegetenv.c b/sysdeps/x86_64/fpu/fegetenv.c index a28efb36f3..9461af7575 100644 --- a/sysdeps/x86_64/fpu/fegetenv.c +++ b/sysdeps/x86_64/fpu/fegetenv.c @@ -1,5 +1,5 @@ /* Store current floating-point environment. - Copyright (C) 2001-2016 Free Software Foundation, Inc. + Copyright (C) 2001-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/fegetexcept.c b/sysdeps/x86_64/fpu/fegetexcept.c index 8acd0382a0..ce54c251ba 100644 --- a/sysdeps/x86_64/fpu/fegetexcept.c +++ b/sysdeps/x86_64/fpu/fegetexcept.c @@ -1,5 +1,5 @@ /* Get enabled floating-point exceptions. - Copyright (C) 2001-2016 Free Software Foundation, Inc. + Copyright (C) 2001-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Andreas Jaeger <aj@suse.de>, 2001. diff --git a/sysdeps/x86_64/fpu/fegetmode.c b/sysdeps/x86_64/fpu/fegetmode.c new file mode 100644 index 0000000000..cc4f12649b --- /dev/null +++ b/sysdeps/x86_64/fpu/fegetmode.c @@ -0,0 +1,28 @@ +/* Store current floating-point control modes. x86_64 version. + Copyright (C) 2016-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <fenv.h> +#include <fpu_control.h> + +int +fegetmode (femode_t *modep) +{ + _FPU_GETCW (modep->__control_word); + __asm__ ("stmxcsr %0" : "=m" (modep->__mxcsr)); + return 0; +} diff --git a/sysdeps/x86_64/fpu/fegetround.c b/sysdeps/x86_64/fpu/fegetround.c index 296d366560..0f31cafedd 100644 --- a/sysdeps/x86_64/fpu/fegetround.c +++ b/sysdeps/x86_64/fpu/fegetround.c @@ -1,5 +1,5 @@ /* Return current rounding direction. - Copyright (C) 1997-2016 Free Software Foundation, Inc. + Copyright (C) 1997-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. diff --git a/sysdeps/x86_64/fpu/feholdexcpt.c b/sysdeps/x86_64/fpu/feholdexcpt.c index a040c3dea5..dec689beb2 100644 --- a/sysdeps/x86_64/fpu/feholdexcpt.c +++ b/sysdeps/x86_64/fpu/feholdexcpt.c @@ -1,5 +1,5 @@ /* Store current floating-point environment and clear exceptions. - Copyright (C) 2001-2016 Free Software Foundation, Inc. + Copyright (C) 2001-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/fesetenv.c b/sysdeps/x86_64/fpu/fesetenv.c index 355d02aaa6..c12dba5101 100644 --- a/sysdeps/x86_64/fpu/fesetenv.c +++ b/sysdeps/x86_64/fpu/fesetenv.c @@ -1,5 +1,5 @@ /* Install given floating-point environment. - Copyright (C) 2001-2016 Free Software Foundation, Inc. + Copyright (C) 2001-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/fesetexcept.c b/sysdeps/x86_64/fpu/fesetexcept.c new file mode 100644 index 0000000000..122a7629dc --- /dev/null +++ b/sysdeps/x86_64/fpu/fesetexcept.c @@ -0,0 +1,31 @@ +/* Set given exception flags. x86_64 version. + Copyright (C) 2016-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <fenv.h> + +int +fesetexcept (int excepts) +{ + unsigned int mxcsr; + + __asm__ ("stmxcsr %0" : "=m" (*&mxcsr)); + mxcsr |= excepts & FE_ALL_EXCEPT; + __asm__ ("ldmxcsr %0" : : "m" (*&mxcsr)); + + return 0; +} diff --git a/sysdeps/x86_64/fpu/fesetmode.c b/sysdeps/x86_64/fpu/fesetmode.c new file mode 100644 index 0000000000..0771e4c10a --- /dev/null +++ b/sysdeps/x86_64/fpu/fesetmode.c @@ -0,0 +1,50 @@ +/* Install given floating-point control modes. x86_64 version. + Copyright (C) 2016-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <fenv.h> +#include <fpu_control.h> + +/* All exceptions, including the x86-specific "denormal operand" + exception. */ +#define FE_ALL_EXCEPT_X86 (FE_ALL_EXCEPT | __FE_DENORM) + +int +fesetmode (const femode_t *modep) +{ + fpu_control_t cw; + unsigned int mxcsr; + __asm__ ("stmxcsr %0" : "=m" (mxcsr)); + /* Preserve SSE exception flags but restore other state in + MXCSR. */ + mxcsr &= FE_ALL_EXCEPT_X86; + if (modep == FE_DFL_MODE) + { + cw = _FPU_DEFAULT; + /* Default MXCSR state has all bits zero except for those + masking exceptions. */ + mxcsr |= FE_ALL_EXCEPT_X86 << 7; + } + else + { + cw = modep->__control_word; + mxcsr |= modep->__mxcsr & ~FE_ALL_EXCEPT_X86; + } + _FPU_SETCW (cw); + __asm__ ("ldmxcsr %0" : : "m" (mxcsr)); + return 0; +} diff --git a/sysdeps/x86_64/fpu/fesetround.c b/sysdeps/x86_64/fpu/fesetround.c index 475d63f4db..e5afc1d57a 100644 --- a/sysdeps/x86_64/fpu/fesetround.c +++ b/sysdeps/x86_64/fpu/fesetround.c @@ -1,5 +1,5 @@ /* Set current rounding direction. - Copyright (C) 2001-2016 Free Software Foundation, Inc. + Copyright (C) 2001-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/feupdateenv.c b/sysdeps/x86_64/fpu/feupdateenv.c index f035d57ca8..00da535e64 100644 --- a/sysdeps/x86_64/fpu/feupdateenv.c +++ b/sysdeps/x86_64/fpu/feupdateenv.c @@ -1,5 +1,5 @@ /* Install given floating-point environment and raise exceptions. - Copyright (C) 1997-2016 Free Software Foundation, Inc. + Copyright (C) 1997-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. diff --git a/sysdeps/x86_64/fpu/fgetexcptflg.c b/sysdeps/x86_64/fpu/fgetexcptflg.c index 938cf3e62b..16719ceb5e 100644 --- a/sysdeps/x86_64/fpu/fgetexcptflg.c +++ b/sysdeps/x86_64/fpu/fgetexcptflg.c @@ -1,5 +1,5 @@ /* Store current representation for exceptions. - Copyright (C) 2001-2016 Free Software Foundation, Inc. + Copyright (C) 2001-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/fraiseexcpt.c b/sysdeps/x86_64/fpu/fraiseexcpt.c index e2abbbec33..ca1c223053 100644 --- a/sysdeps/x86_64/fpu/fraiseexcpt.c +++ b/sysdeps/x86_64/fpu/fraiseexcpt.c @@ -1,5 +1,5 @@ /* Raise given exceptions. - Copyright (C) 2001-2016 Free Software Foundation, Inc. + Copyright (C) 2001-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/fsetexcptflg.c b/sysdeps/x86_64/fpu/fsetexcptflg.c index 76f7bad9a8..821dd9d786 100644 --- a/sysdeps/x86_64/fpu/fsetexcptflg.c +++ b/sysdeps/x86_64/fpu/fsetexcptflg.c @@ -1,5 +1,5 @@ /* Set floating-point environment exception handling. - Copyright (C) 2001-2016 Free Software Foundation, Inc. + Copyright (C) 2001-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/ftestexcept.c b/sysdeps/x86_64/fpu/ftestexcept.c index c8f2c01c67..63167c68df 100644 --- a/sysdeps/x86_64/fpu/ftestexcept.c +++ b/sysdeps/x86_64/fpu/ftestexcept.c @@ -1,5 +1,5 @@ /* Test exception in current environment. - Copyright (C) 2001-2016 Free Software Foundation, Inc. + Copyright (C) 2001-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/k_rem_pio2l.c b/sysdeps/x86_64/fpu/k_rem_pio2l.c deleted file mode 100644 index eea55a98d2..0000000000 --- a/sysdeps/x86_64/fpu/k_rem_pio2l.c +++ /dev/null @@ -1 +0,0 @@ -/* Not needed. */ diff --git a/sysdeps/x86_64/fpu/libm-test-ulps b/sysdeps/x86_64/fpu/libm-test-ulps index 445b47527d..912db318b6 100644 --- a/sysdeps/x86_64/fpu/libm-test-ulps +++ b/sysdeps/x86_64/fpu/libm-test-ulps @@ -3,1015 +3,1293 @@ # Maximal error of functions: Function: "acos": float: 1 +float128: 1 ifloat: 1 +ifloat128: 1 ildouble: 1 ldouble: 1 Function: "acos_downward": double: 1 float: 1 +float128: 1 idouble: 1 ifloat: 1 +ifloat128: 1 ildouble: 2 ldouble: 2 Function: "acos_towardzero": double: 1 float: 1 +float128: 1 idouble: 1 ifloat: 1 +ifloat128: 1 ildouble: 2 ldouble: 2 Function: "acos_upward": double: 1 float: 1 +float128: 1 idouble: 1 ifloat: 1 +ifloat128: 1 ildouble: 2 ldouble: 2 Function: "acosh": double: 2 float: 2 +float128: 2 idouble: 2 ifloat: 2 +ifloat128: 2 ildouble: 2 ldouble: 2 Function: "acosh_downward": double: 2 float: 2 +float128: 3 idouble: 2 ifloat: 2 +ifloat128: 3 ildouble: 4 ldouble: 4 Function: "acosh_towardzero": double: 2 float: 2 +float128: 2 idouble: 2 ifloat: 2 +ifloat128: 2 ildouble: 4 ldouble: 4 Function: "acosh_upward": double: 2 float: 2 +float128: 2 idouble: 2 ifloat: 2 +ifloat128: 2 ildouble: 3 ldouble: 3 Function: "asin": float: 1 +float128: 1 ifloat: 1 +ifloat128: 1 ildouble: 1 ldouble: 1 Function: "asin_downward": double: 1 float: 1 +float128: 2 idouble: 1 ifloat: 1 +ifloat128: 2 ildouble: 2 ldouble: 2 Function: "asin_towardzero": double: 1 float: 1 +float128: 1 idouble: 1 ifloat: 1 +ifloat128: 1 ildouble: 1 ldouble: 1 Function: "asin_upward": double: 1 float: 1 +float128: 2 idouble: 1 ifloat: 1 +ifloat128: 2 ildouble: 1 ldouble: 1 Function: "asinh": double: 1 float: 1 +float128: 3 idouble: 1 ifloat: 1 +ifloat128: 3 ildouble: 3 ldouble: 3 Function: "asinh_downward": double: 3 float: 3 +float128: 4 idouble: 3 ifloat: 3 +ifloat128: 4 ildouble: 5 ldouble: 5 Function: "asinh_towardzero": double: 2 float: 2 +float128: 2 idouble: 2 ifloat: 2 +ifloat128: 2 ildouble: 4 ldouble: 4 Function: "asinh_upward": double: 3 float: 3 +float128: 4 idouble: 3 ifloat: 3 +ifloat128: 4 ildouble: 5 ldouble: 5 Function: "atan": float: 1 +float128: 1 ifloat: 1 +ifloat128: 1 ildouble: 1 ldouble: 1 Function: "atan2": float: 1 +float128: 1 ifloat: 1 +ifloat128: 1 ildouble: 1 ldouble: 1 Function: "atan2_downward": double: 1 float: 2 +float128: 2 idouble: 1 ifloat: 2 +ifloat128: 2 ildouble: 1 ldouble: 1 Function: "atan2_towardzero": double: 1 float: 2 +float128: 3 idouble: 1 ifloat: 2 +ifloat128: 3 ildouble: 1 ldouble: 1 Function: "atan2_upward": double: 1 float: 2 +float128: 2 idouble: 1 ifloat: 2 +ifloat128: 2 ildouble: 1 ldouble: 1 Function: "atan_downward": double: 1 float: 2 +float128: 2 idouble: 1 ifloat: 2 +ifloat128: 2 ildouble: 1 ldouble: 1 Function: "atan_towardzero": double: 1 float: 1 +float128: 1 idouble: 1 ifloat: 1 +ifloat128: 1 ildouble: 1 ldouble: 1 Function: "atan_upward": double: 1 float: 2 +float128: 2 idouble: 1 ifloat: 2 +ifloat128: 2 ildouble: 1 ldouble: 1 Function: "atanh": double: 2 float: 2 +float128: 3 idouble: 2 ifloat: 2 +ifloat128: 3 ildouble: 3 ldouble: 3 Function: "atanh_downward": double: 3 float: 3 +float128: 4 idouble: 3 ifloat: 3 +ifloat128: 4 ildouble: 5 ldouble: 5 Function: "atanh_towardzero": double: 2 float: 2 +float128: 2 idouble: 2 ifloat: 2 +ifloat128: 2 ildouble: 4 ldouble: 4 Function: "atanh_upward": double: 3 float: 3 +float128: 4 idouble: 3 ifloat: 3 +ifloat128: 4 ildouble: 5 ldouble: 5 Function: "cabs": double: 1 +float128: 1 idouble: 1 +ifloat128: 1 ildouble: 1 ldouble: 1 Function: "cabs_downward": double: 1 +float128: 1 idouble: 1 +ifloat128: 1 ildouble: 1 ldouble: 1 Function: "cabs_towardzero": double: 1 +float128: 1 idouble: 1 +ifloat128: 1 ildouble: 1 ldouble: 1 Function: "cabs_upward": double: 1 +float128: 1 idouble: 1 +ifloat128: 1 ildouble: 1 ldouble: 1 Function: Real part of "cacos": double: 1 float: 2 +float128: 2 idouble: 1 ifloat: 2 +ifloat128: 2 ildouble: 1 ldouble: 1 Function: Imaginary part of "cacos": -double: 1 +double: 2 float: 2 -idouble: 1 +float128: 2 +idouble: 2 ifloat: 2 +ifloat128: 2 ildouble: 2 ldouble: 2 Function: Real part of "cacos_downward": -double: 2 +double: 3 float: 2 -idouble: 2 +float128: 3 +idouble: 3 ifloat: 2 +ifloat128: 3 ildouble: 2 ldouble: 2 Function: Imaginary part of "cacos_downward": double: 5 float: 3 +float128: 6 idouble: 5 ifloat: 3 -ildouble: 5 -ldouble: 5 +ifloat128: 6 +ildouble: 6 +ldouble: 6 Function: Real part of "cacos_towardzero": -double: 2 +double: 3 float: 2 -idouble: 2 +float128: 3 +idouble: 3 ifloat: 2 +ifloat128: 3 ildouble: 2 ldouble: 2 Function: Imaginary part of "cacos_towardzero": double: 5 float: 3 +float128: 5 idouble: 5 ifloat: 3 +ifloat128: 5 ildouble: 5 ldouble: 5 Function: Real part of "cacos_upward": double: 2 float: 2 +float128: 3 idouble: 2 ifloat: 2 +ifloat128: 3 ildouble: 2 ldouble: 2 Function: Imaginary part of "cacos_upward": -double: 4 -float: 4 -idouble: 4 -ifloat: 4 -ildouble: 5 -ldouble: 5 +double: 5 +float: 7 +float128: 7 +idouble: 5 +ifloat: 7 +ifloat128: 7 +ildouble: 7 +ldouble: 7 Function: Real part of "cacosh": -double: 1 +double: 2 float: 2 -idouble: 1 +float128: 2 +idouble: 2 ifloat: 2 +ifloat128: 2 ildouble: 2 ldouble: 2 Function: Imaginary part of "cacosh": double: 1 float: 2 +float128: 2 idouble: 1 ifloat: 2 +ifloat128: 2 ildouble: 1 ldouble: 1 Function: Real part of "cacosh_downward": double: 5 float: 3 +float128: 5 idouble: 5 ifloat: 3 +ifloat128: 5 ildouble: 5 ldouble: 5 Function: Imaginary part of "cacosh_downward": -double: 2 -float: 2 -idouble: 2 -ifloat: 2 -ildouble: 2 -ldouble: 2 +double: 3 +float: 3 +float128: 4 +idouble: 3 +ifloat: 3 +ifloat128: 4 +ildouble: 3 +ldouble: 3 Function: Real part of "cacosh_towardzero": double: 5 float: 3 +float128: 5 idouble: 5 ifloat: 3 +ifloat128: 5 ildouble: 5 ldouble: 5 Function: Imaginary part of "cacosh_towardzero": -double: 2 +double: 3 float: 2 -idouble: 2 +float128: 3 +idouble: 3 ifloat: 2 +ifloat128: 3 ildouble: 2 ldouble: 2 Function: Real part of "cacosh_upward": double: 4 float: 4 +float128: 6 idouble: 4 ifloat: 4 +ifloat128: 6 ildouble: 5 ldouble: 5 Function: Imaginary part of "cacosh_upward": -double: 2 +double: 3 float: 2 -idouble: 2 +float128: 4 +idouble: 3 ifloat: 2 -ildouble: 2 -ldouble: 2 +ifloat128: 4 +ildouble: 3 +ldouble: 3 Function: "carg": float: 1 +float128: 2 ifloat: 1 +ifloat128: 2 ildouble: 1 ldouble: 1 Function: "carg_downward": double: 1 float: 2 +float128: 2 idouble: 1 ifloat: 2 +ifloat128: 2 ildouble: 1 ldouble: 1 Function: "carg_towardzero": double: 1 float: 2 +float128: 3 idouble: 1 ifloat: 2 +ifloat128: 3 ildouble: 1 ldouble: 1 Function: "carg_upward": double: 1 float: 2 +float128: 2 idouble: 1 ifloat: 2 +ifloat128: 2 ildouble: 1 ldouble: 1 Function: Real part of "casin": double: 1 float: 1 +float128: 2 idouble: 1 ifloat: 1 +ifloat128: 2 ildouble: 1 ldouble: 1 Function: Imaginary part of "casin": -double: 1 +double: 2 float: 2 -idouble: 1 +float128: 2 +idouble: 2 ifloat: 2 +ifloat128: 2 ildouble: 2 ldouble: 2 Function: Real part of "casin_downward": double: 3 -float: 1 +float: 2 +float128: 3 idouble: 3 -ifloat: 1 -ildouble: 2 -ldouble: 2 +ifloat: 2 +ifloat128: 3 +ildouble: 3 +ldouble: 3 Function: Imaginary part of "casin_downward": double: 5 float: 3 +float128: 6 idouble: 5 ifloat: 3 -ildouble: 5 -ldouble: 5 +ifloat128: 6 +ildouble: 6 +ldouble: 6 Function: Real part of "casin_towardzero": double: 3 float: 1 +float128: 3 idouble: 3 ifloat: 1 -ildouble: 2 -ldouble: 2 +ifloat128: 3 +ildouble: 3 +ldouble: 3 Function: Imaginary part of "casin_towardzero": double: 5 float: 3 +float128: 5 idouble: 5 ifloat: 3 +ifloat128: 5 ildouble: 5 ldouble: 5 Function: Real part of "casin_upward": -double: 2 -float: 1 -idouble: 2 -ifloat: 1 +double: 3 +float: 2 +float128: 3 +idouble: 3 +ifloat: 2 +ifloat128: 3 ildouble: 2 ldouble: 2 Function: Imaginary part of "casin_upward": -double: 4 -float: 4 -idouble: 4 -ifloat: 4 -ildouble: 5 -ldouble: 5 +double: 5 +float: 7 +float128: 7 +idouble: 5 +ifloat: 7 +ifloat128: 7 +ildouble: 7 +ldouble: 7 Function: Real part of "casinh": -double: 1 +double: 2 float: 2 -idouble: 1 +float128: 2 +idouble: 2 ifloat: 2 +ifloat128: 2 ildouble: 2 ldouble: 2 Function: Imaginary part of "casinh": double: 1 float: 1 +float128: 2 idouble: 1 ifloat: 1 +ifloat128: 2 ildouble: 1 ldouble: 1 Function: Real part of "casinh_downward": double: 5 float: 3 +float128: 6 idouble: 5 ifloat: 3 -ildouble: 5 -ldouble: 5 +ifloat128: 6 +ildouble: 6 +ldouble: 6 Function: Imaginary part of "casinh_downward": double: 3 -float: 1 +float: 2 +float128: 3 idouble: 3 -ifloat: 1 -ildouble: 2 -ldouble: 2 +ifloat: 2 +ifloat128: 3 +ildouble: 3 +ldouble: 3 Function: Real part of "casinh_towardzero": double: 5 float: 3 +float128: 5 idouble: 5 ifloat: 3 +ifloat128: 5 ildouble: 5 ldouble: 5 Function: Imaginary part of "casinh_towardzero": double: 3 float: 1 +float128: 3 idouble: 3 ifloat: 1 -ildouble: 2 -ldouble: 2 +ifloat128: 3 +ildouble: 3 +ldouble: 3 Function: Real part of "casinh_upward": -double: 4 -float: 4 -idouble: 4 -ifloat: 4 -ildouble: 5 -ldouble: 5 +double: 5 +float: 7 +float128: 7 +idouble: 5 +ifloat: 7 +ifloat128: 7 +ildouble: 7 +ldouble: 7 Function: Imaginary part of "casinh_upward": -double: 2 +double: 3 float: 2 -idouble: 2 +float128: 3 +idouble: 3 ifloat: 2 +ifloat128: 3 ildouble: 2 ldouble: 2 Function: Real part of "catan": +double: 1 float: 1 +float128: 1 +idouble: 1 ifloat: 1 +ifloat128: 1 +ildouble: 1 +ldouble: 1 Function: Imaginary part of "catan": double: 1 float: 1 +float128: 1 idouble: 1 ifloat: 1 +ifloat128: 1 ildouble: 1 ldouble: 1 Function: Real part of "catan_downward": double: 1 -float: 1 +float: 2 +float128: 2 idouble: 1 -ifloat: 1 +ifloat: 2 +ifloat128: 2 ildouble: 1 ldouble: 1 Function: Imaginary part of "catan_downward": double: 2 float: 2 +float128: 2 idouble: 2 ifloat: 2 +ifloat128: 2 ildouble: 4 ldouble: 4 Function: Real part of "catan_towardzero": double: 1 -float: 1 +float: 2 +float128: 2 idouble: 1 -ifloat: 1 +ifloat: 2 +ifloat128: 2 ildouble: 1 ldouble: 1 Function: Imaginary part of "catan_towardzero": double: 2 -float: 1 +float: 2 +float128: 2 idouble: 2 -ifloat: 1 +ifloat: 2 +ifloat128: 2 ildouble: 4 ldouble: 4 Function: Real part of "catan_upward": +double: 1 float: 1 +float128: 2 +idouble: 1 ifloat: 1 +ifloat128: 2 ildouble: 1 ldouble: 1 Function: Imaginary part of "catan_upward": double: 3 float: 3 +float128: 3 idouble: 3 ifloat: 3 +ifloat128: 3 ildouble: 3 ldouble: 3 Function: Real part of "catanh": double: 1 float: 1 +float128: 1 idouble: 1 ifloat: 1 +ifloat128: 1 ildouble: 1 ldouble: 1 Function: Imaginary part of "catanh": +double: 1 float: 1 +float128: 1 +idouble: 1 ifloat: 1 +ifloat128: 1 +ildouble: 1 +ldouble: 1 Function: Real part of "catanh_downward": double: 2 float: 2 +float128: 2 idouble: 2 ifloat: 2 +ifloat128: 2 ildouble: 4 ldouble: 4 Function: Imaginary part of "catanh_downward": double: 1 float: 2 +float128: 2 idouble: 1 ifloat: 2 +ifloat128: 2 ildouble: 1 ldouble: 1 Function: Real part of "catanh_towardzero": double: 2 -float: 1 +float: 2 +float128: 2 idouble: 2 -ifloat: 1 +ifloat: 2 +ifloat128: 2 ildouble: 4 ldouble: 4 Function: Imaginary part of "catanh_towardzero": double: 1 float: 2 +float128: 2 idouble: 1 ifloat: 2 +ifloat128: 2 ildouble: 1 ldouble: 1 Function: Real part of "catanh_upward": double: 4 -float: 3 +float: 4 +float128: 4 idouble: 4 -ifloat: 3 +ifloat: 4 +ifloat128: 4 ildouble: 4 ldouble: 4 Function: Imaginary part of "catanh_upward": +double: 1 float: 1 +float128: 2 +idouble: 1 ifloat: 1 +ifloat128: 2 ildouble: 1 ldouble: 1 Function: "cbrt": double: 3 float: 1 +float128: 1 idouble: 3 ifloat: 1 +ifloat128: 1 ildouble: 1 ldouble: 1 Function: "cbrt_downward": double: 4 float: 1 +float128: 1 idouble: 4 ifloat: 1 +ifloat128: 1 ildouble: 1 ldouble: 1 Function: "cbrt_towardzero": double: 3 float: 1 +float128: 1 idouble: 3 ifloat: 1 +ifloat128: 1 ildouble: 1 ldouble: 1 Function: "cbrt_upward": double: 5 float: 1 +float128: 1 idouble: 5 ifloat: 1 +ifloat128: 1 ildouble: 1 ldouble: 1 Function: Real part of "ccos": double: 1 float: 1 +float128: 1 idouble: 1 ifloat: 1 +ifloat128: 1 ildouble: 1 ldouble: 1 Function: Imaginary part of "ccos": double: 1 float: 1 +float128: 1 idouble: 1 ifloat: 1 +ifloat128: 1 ildouble: 1 ldouble: 1 Function: Real part of "ccos_downward": double: 1 float: 1 +float128: 2 idouble: 1 ifloat: 1 +ifloat128: 2 ildouble: 3 ldouble: 3 Function: Imaginary part of "ccos_downward": double: 2 float: 3 +float128: 2 idouble: 2 ifloat: 3 +ifloat128: 2 ildouble: 3 ldouble: 3 Function: Real part of "ccos_towardzero": double: 1 float: 2 +float128: 2 idouble: 1 ifloat: 2 +ifloat128: 2 ildouble: 3 ldouble: 3 Function: Imaginary part of "ccos_towardzero": double: 2 float: 3 +float128: 2 idouble: 2 ifloat: 3 +ifloat128: 2 ildouble: 3 ldouble: 3 Function: Real part of "ccos_upward": double: 1 float: 2 +float128: 3 idouble: 1 ifloat: 2 +ifloat128: 3 ildouble: 2 ldouble: 2 Function: Imaginary part of "ccos_upward": double: 2 float: 2 +float128: 2 idouble: 2 ifloat: 2 +ifloat128: 2 ildouble: 2 ldouble: 2 Function: Real part of "ccosh": double: 1 float: 1 +float128: 1 idouble: 1 ifloat: 1 +ifloat128: 1 ildouble: 1 ldouble: 1 Function: Imaginary part of "ccosh": double: 1 float: 1 +float128: 1 idouble: 1 ifloat: 1 +ifloat128: 1 ildouble: 1 ldouble: 1 Function: Real part of "ccosh_downward": double: 1 float: 2 +float128: 2 idouble: 1 ifloat: 2 +ifloat128: 2 ildouble: 3 ldouble: 3 Function: Imaginary part of "ccosh_downward": double: 2 float: 3 +float128: 2 idouble: 2 ifloat: 3 +ifloat128: 2 ildouble: 3 ldouble: 3 Function: Real part of "ccosh_towardzero": double: 1 float: 3 +float128: 2 idouble: 1 ifloat: 3 +ifloat128: 2 ildouble: 3 ldouble: 3 Function: Imaginary part of "ccosh_towardzero": double: 2 float: 3 +float128: 2 idouble: 2 ifloat: 3 +ifloat128: 2 ildouble: 3 ldouble: 3 Function: Real part of "ccosh_upward": double: 1 float: 2 +float128: 3 idouble: 1 ifloat: 2 +ifloat128: 3 ildouble: 2 ldouble: 2 Function: Imaginary part of "ccosh_upward": double: 2 float: 2 +float128: 2 idouble: 2 ifloat: 2 +ifloat128: 2 ildouble: 2 ldouble: 2 Function: Real part of "cexp": double: 2 float: 1 +float128: 1 idouble: 2 ifloat: 1 +ifloat128: 1 ildouble: 1 ldouble: 1 Function: Imaginary part of "cexp": double: 1 float: 2 +float128: 1 idouble: 1 ifloat: 2 +ifloat128: 1 ildouble: 1 ldouble: 1 Function: Real part of "cexp_downward": double: 1 float: 2 +float128: 2 idouble: 1 ifloat: 2 +ifloat128: 2 ildouble: 3 ldouble: 3 Function: Imaginary part of "cexp_downward": double: 1 float: 3 +float128: 2 idouble: 1 ifloat: 3 +ifloat128: 2 ildouble: 3 ldouble: 3 Function: Real part of "cexp_towardzero": double: 1 float: 2 +float128: 2 idouble: 1 ifloat: 2 +ifloat128: 2 ildouble: 3 ldouble: 3 Function: Imaginary part of "cexp_towardzero": double: 1 float: 3 +float128: 2 idouble: 1 ifloat: 3 +ifloat128: 2 ildouble: 3 ldouble: 3 Function: Real part of "cexp_upward": double: 1 float: 2 +float128: 3 idouble: 1 ifloat: 2 +ifloat128: 3 ildouble: 2 ldouble: 2 Function: Imaginary part of "cexp_upward": double: 1 float: 2 +float128: 3 idouble: 1 ifloat: 2 +ifloat128: 3 ildouble: 3 ldouble: 3 Function: Real part of "clog": double: 3 float: 3 +float128: 2 idouble: 3 ifloat: 3 +ifloat128: 2 ildouble: 3 ldouble: 3 Function: Imaginary part of "clog": float: 1 +float128: 1 ifloat: 1 +ifloat128: 1 ildouble: 1 ldouble: 1 Function: Real part of "clog10": double: 3 float: 4 +float128: 2 idouble: 3 ifloat: 4 +ifloat128: 2 ildouble: 4 ldouble: 4 Function: Imaginary part of "clog10": double: 2 float: 2 +float128: 2 idouble: 2 ifloat: 2 +ifloat128: 2 ildouble: 2 ldouble: 2 Function: Real part of "clog10_downward": double: 5 -float: 4 +float: 5 +float128: 3 idouble: 5 -ifloat: 4 +ifloat: 5 +ifloat128: 3 ildouble: 8 ldouble: 8 Function: Imaginary part of "clog10_downward": double: 2 float: 4 +float128: 3 idouble: 2 ifloat: 4 +ifloat128: 3 ildouble: 3 ldouble: 3 Function: Real part of "clog10_towardzero": double: 5 float: 5 +float128: 4 idouble: 5 ifloat: 5 +ifloat128: 4 ildouble: 8 ldouble: 8 Function: Imaginary part of "clog10_towardzero": double: 2 float: 4 +float128: 3 idouble: 2 ifloat: 4 +ifloat128: 3 ildouble: 3 ldouble: 3 Function: Real part of "clog10_upward": double: 6 float: 5 +float128: 4 idouble: 6 ifloat: 5 +ifloat128: 4 ildouble: 8 ldouble: 8 Function: Imaginary part of "clog10_upward": double: 2 float: 4 +float128: 3 idouble: 2 ifloat: 4 +ifloat128: 3 ildouble: 3 ldouble: 3 Function: Real part of "clog_downward": double: 4 float: 3 +float128: 3 idouble: 4 ifloat: 3 +ifloat128: 3 ildouble: 5 ldouble: 5 Function: Imaginary part of "clog_downward": double: 1 float: 2 +float128: 2 idouble: 1 ifloat: 2 +ifloat128: 2 ildouble: 1 ldouble: 1 Function: Real part of "clog_towardzero": double: 4 float: 4 +float128: 3 idouble: 4 ifloat: 4 +ifloat128: 3 ildouble: 5 ldouble: 5 Function: Imaginary part of "clog_towardzero": double: 1 float: 3 +float128: 2 idouble: 1 ifloat: 3 +ifloat128: 2 ildouble: 1 ldouble: 1 Function: Real part of "clog_upward": double: 4 float: 3 +float128: 4 idouble: 4 ifloat: 3 +ifloat128: 4 ildouble: 4 ldouble: 4 Function: Imaginary part of "clog_upward": double: 1 float: 2 +float128: 2 idouble: 1 ifloat: 2 +ifloat128: 2 ildouble: 1 ldouble: 1 Function: "cos": +double: 1 +float128: 1 +idouble: 1 +ifloat128: 1 ildouble: 1 ldouble: 1 Function: "cos_downward": double: 1 +float128: 3 idouble: 1 +ifloat128: 3 ildouble: 3 ldouble: 3 Function: "cos_towardzero": double: 1 +float128: 1 idouble: 1 +ifloat128: 1 ildouble: 2 ldouble: 2 Function: "cos_upward": double: 1 +float128: 2 idouble: 1 +ifloat128: 2 ildouble: 2 ldouble: 2 @@ -1029,7 +1307,7 @@ Function: "cos_vlen4_avx2": double: 2 Function: "cos_vlen8": -double: 1 +double: 2 float: 1 Function: "cos_vlen8_avx2": @@ -1038,546 +1316,690 @@ float: 1 Function: "cosh": double: 1 float: 1 +float128: 1 idouble: 1 ifloat: 1 +ifloat128: 1 ildouble: 2 ldouble: 2 Function: "cosh_downward": double: 1 float: 1 +float128: 2 idouble: 1 ifloat: 1 +ifloat128: 1 ildouble: 2 ldouble: 3 Function: "cosh_towardzero": double: 1 float: 1 +float128: 2 idouble: 1 ifloat: 1 +ifloat128: 1 ildouble: 2 ldouble: 2 Function: "cosh_upward": double: 1 float: 2 +float128: 3 idouble: 1 ifloat: 2 +ifloat128: 1 ildouble: 2 ldouble: 3 Function: Real part of "cpow": double: 2 float: 5 +float128: 4 idouble: 2 ifloat: 5 +ifloat128: 4 ildouble: 3 ldouble: 3 Function: Imaginary part of "cpow": float: 2 +float128: 1 ifloat: 2 +ifloat128: 1 ildouble: 4 ldouble: 4 Function: Real part of "cpow_downward": double: 4 float: 8 +float128: 6 idouble: 4 ifloat: 8 +ifloat128: 6 ildouble: 7 ldouble: 7 Function: Imaginary part of "cpow_downward": double: 1 float: 2 +float128: 2 idouble: 1 ifloat: 2 +ifloat128: 2 ildouble: 2 ldouble: 2 Function: Real part of "cpow_towardzero": double: 4 float: 8 +float128: 6 idouble: 4 ifloat: 8 +ifloat128: 6 ildouble: 7 ldouble: 7 Function: Imaginary part of "cpow_towardzero": double: 1 float: 2 +float128: 2 idouble: 1 ifloat: 2 +ifloat128: 2 ildouble: 1 ldouble: 1 Function: Real part of "cpow_upward": double: 4 float: 1 +float128: 3 idouble: 4 ifloat: 1 +ifloat128: 3 ildouble: 2 ldouble: 2 Function: Imaginary part of "cpow_upward": double: 1 float: 2 +float128: 2 idouble: 1 ifloat: 2 +ifloat128: 2 ildouble: 2 ldouble: 2 Function: Real part of "csin": double: 1 float: 1 +float128: 1 idouble: 1 ifloat: 1 +ifloat128: 1 ildouble: 1 ldouble: 1 +Function: Imaginary part of "csin": +float128: 1 +ifloat128: 1 + Function: Real part of "csin_downward": double: 2 float: 3 +float128: 2 idouble: 2 ifloat: 3 +ifloat128: 2 ildouble: 3 ldouble: 3 Function: Imaginary part of "csin_downward": double: 1 float: 2 +float128: 2 idouble: 1 ifloat: 2 +ifloat128: 2 ildouble: 3 ldouble: 3 Function: Real part of "csin_towardzero": double: 2 float: 3 +float128: 2 idouble: 2 ifloat: 3 +ifloat128: 2 ildouble: 3 ldouble: 3 Function: Imaginary part of "csin_towardzero": double: 2 float: 2 +float128: 2 idouble: 2 ifloat: 2 +ifloat128: 2 ildouble: 3 ldouble: 3 Function: Real part of "csin_upward": double: 2 float: 3 +float128: 2 idouble: 2 ifloat: 3 +ifloat128: 2 ildouble: 3 ldouble: 3 Function: Imaginary part of "csin_upward": double: 1 float: 3 +float128: 3 idouble: 1 ifloat: 3 +ifloat128: 3 ildouble: 3 ldouble: 3 Function: Real part of "csinh": float: 1 +float128: 1 ifloat: 1 +ifloat128: 1 ildouble: 1 ldouble: 1 Function: Imaginary part of "csinh": double: 1 float: 1 +float128: 1 idouble: 1 ifloat: 1 +ifloat128: 1 ildouble: 1 ldouble: 1 Function: Real part of "csinh_downward": double: 2 float: 2 +float128: 2 idouble: 2 ifloat: 2 +ifloat128: 2 ildouble: 3 ldouble: 3 Function: Imaginary part of "csinh_downward": double: 2 float: 3 +float128: 2 idouble: 2 ifloat: 3 +ifloat128: 2 ildouble: 3 ldouble: 3 Function: Real part of "csinh_towardzero": double: 2 float: 2 +float128: 2 idouble: 2 ifloat: 2 +ifloat128: 2 ildouble: 3 ldouble: 3 Function: Imaginary part of "csinh_towardzero": double: 2 float: 3 +float128: 2 idouble: 2 ifloat: 3 +ifloat128: 2 ildouble: 3 ldouble: 3 Function: Real part of "csinh_upward": double: 1 float: 3 +float128: 3 idouble: 1 ifloat: 3 +ifloat128: 3 ildouble: 3 ldouble: 3 Function: Imaginary part of "csinh_upward": double: 2 float: 3 +float128: 2 idouble: 2 ifloat: 3 +ifloat128: 2 ildouble: 3 ldouble: 3 Function: Real part of "csqrt": double: 2 float: 2 +float128: 2 idouble: 2 ifloat: 2 +ifloat128: 2 ildouble: 2 ldouble: 2 Function: Imaginary part of "csqrt": double: 2 float: 2 +float128: 2 idouble: 2 ifloat: 2 +ifloat128: 2 ildouble: 2 ldouble: 2 Function: Real part of "csqrt_downward": double: 5 float: 4 +float128: 4 idouble: 5 ifloat: 4 +ifloat128: 4 ildouble: 5 ldouble: 5 Function: Imaginary part of "csqrt_downward": double: 4 float: 3 +float128: 3 idouble: 4 ifloat: 3 +ifloat128: 3 ildouble: 4 ldouble: 4 Function: Real part of "csqrt_towardzero": double: 4 float: 3 +float128: 3 idouble: 4 ifloat: 3 +ifloat128: 3 ildouble: 4 ldouble: 4 Function: Imaginary part of "csqrt_towardzero": double: 4 float: 3 +float128: 3 idouble: 4 ifloat: 3 +ifloat128: 3 ildouble: 4 ldouble: 4 Function: Real part of "csqrt_upward": double: 5 float: 4 +float128: 4 idouble: 5 ifloat: 4 +ifloat128: 4 ildouble: 5 ldouble: 5 Function: Imaginary part of "csqrt_upward": double: 3 float: 3 +float128: 3 idouble: 3 ifloat: 3 +ifloat128: 3 ildouble: 4 ldouble: 4 Function: Real part of "ctan": double: 1 float: 1 +float128: 3 idouble: 1 ifloat: 1 +ifloat128: 3 ildouble: 2 ldouble: 2 Function: Imaginary part of "ctan": double: 2 float: 2 +float128: 3 idouble: 2 ifloat: 2 +ifloat128: 3 ildouble: 1 ldouble: 1 Function: Real part of "ctan_downward": double: 6 float: 5 +float128: 4 idouble: 6 ifloat: 5 +ifloat128: 4 ildouble: 5 ldouble: 5 Function: Imaginary part of "ctan_downward": double: 2 float: 2 +float128: 5 idouble: 2 ifloat: 2 +ifloat128: 5 ildouble: 4 ldouble: 4 Function: Real part of "ctan_towardzero": double: 5 float: 3 +float128: 4 idouble: 5 ifloat: 3 +ifloat128: 4 ildouble: 5 ldouble: 5 Function: Imaginary part of "ctan_towardzero": double: 2 float: 2 +float128: 5 idouble: 2 ifloat: 2 +ifloat128: 5 ildouble: 4 ldouble: 4 Function: Real part of "ctan_upward": double: 2 float: 4 +float128: 5 idouble: 2 ifloat: 4 +ifloat128: 5 ildouble: 3 ldouble: 3 Function: Imaginary part of "ctan_upward": double: 2 -float: 1 +float: 2 +float128: 5 idouble: 2 -ifloat: 1 +ifloat: 2 +ifloat128: 5 ildouble: 3 ldouble: 3 Function: Real part of "ctanh": double: 2 float: 2 +float128: 3 idouble: 2 ifloat: 2 +ifloat128: 3 ildouble: 1 ldouble: 1 Function: Imaginary part of "ctanh": double: 2 float: 2 +float128: 3 idouble: 2 ifloat: 2 +ifloat128: 3 ildouble: 2 ldouble: 2 Function: Real part of "ctanh_downward": double: 4 float: 2 +float128: 5 idouble: 4 ifloat: 2 +ifloat128: 5 ildouble: 4 ldouble: 4 Function: Imaginary part of "ctanh_downward": double: 6 float: 5 +float128: 4 idouble: 6 ifloat: 5 +ifloat128: 4 ildouble: 4 ldouble: 4 Function: Real part of "ctanh_towardzero": double: 2 float: 2 +float128: 5 idouble: 2 ifloat: 2 +ifloat128: 5 ildouble: 4 ldouble: 4 Function: Imaginary part of "ctanh_towardzero": double: 5 float: 3 +float128: 3 idouble: 5 ifloat: 3 +ifloat128: 3 ildouble: 3 ldouble: 3 Function: Real part of "ctanh_upward": double: 2 float: 2 +float128: 5 idouble: 2 ifloat: 2 +ifloat128: 5 ildouble: 3 ldouble: 3 Function: Imaginary part of "ctanh_upward": double: 2 float: 3 +float128: 5 idouble: 2 ifloat: 3 +ifloat128: 5 ildouble: 3 ldouble: 3 Function: "erf": double: 1 float: 1 +float128: 1 idouble: 1 ifloat: 1 +ifloat128: 1 ildouble: 1 ldouble: 1 Function: "erf_downward": double: 1 float: 1 +float128: 2 idouble: 1 ifloat: 1 +ifloat128: 2 ildouble: 1 ldouble: 1 Function: "erf_towardzero": double: 1 float: 1 +float128: 1 idouble: 1 ifloat: 1 +ifloat128: 1 ildouble: 1 ldouble: 1 Function: "erf_upward": double: 1 float: 1 +float128: 2 idouble: 1 ifloat: 1 +ifloat128: 2 ildouble: 1 ldouble: 1 Function: "erfc": double: 3 float: 2 +float128: 2 idouble: 3 ifloat: 2 +ifloat128: 2 ildouble: 3 ldouble: 3 Function: "erfc_downward": double: 5 float: 6 +float128: 5 idouble: 5 ifloat: 6 +ifloat128: 5 ildouble: 4 ldouble: 4 Function: "erfc_towardzero": double: 3 float: 4 +float128: 4 idouble: 3 ifloat: 4 +ifloat128: 4 ildouble: 4 ldouble: 4 Function: "erfc_upward": double: 5 float: 6 +float128: 5 idouble: 5 ifloat: 6 +ifloat128: 5 ildouble: 5 ldouble: 5 Function: "exp": +float128: 1 +ifloat128: 1 ildouble: 1 ldouble: 1 Function: "exp10": double: 2 +float128: 2 idouble: 2 +ifloat128: 2 ildouble: 1 ldouble: 1 Function: "exp10_downward": double: 2 float: 1 +float128: 3 idouble: 2 ifloat: 1 +ifloat128: 3 ildouble: 2 ldouble: 2 Function: "exp10_towardzero": double: 2 float: 1 +float128: 3 idouble: 2 ifloat: 1 +ifloat128: 3 ildouble: 2 ldouble: 2 Function: "exp10_upward": double: 2 float: 1 +float128: 3 idouble: 2 ifloat: 1 +ifloat128: 3 ildouble: 2 ldouble: 2 Function: "exp2": double: 1 float: 1 +float128: 1 idouble: 1 ifloat: 1 +ifloat128: 1 ildouble: 1 ldouble: 1 Function: "exp2_downward": double: 1 float: 1 +float128: 1 idouble: 1 ifloat: 1 +ifloat128: 1 ildouble: 1 ldouble: 1 Function: "exp2_towardzero": double: 1 float: 1 +float128: 1 idouble: 1 ifloat: 1 +ifloat128: 1 ildouble: 1 ldouble: 1 Function: "exp2_upward": double: 1 float: 1 +float128: 2 idouble: 1 ifloat: 1 +ifloat128: 2 ildouble: 1 ldouble: 1 Function: "exp_downward": double: 1 +float: 1 idouble: 1 +ifloat: 1 ildouble: 1 ldouble: 1 Function: "exp_towardzero": double: 1 +float: 1 idouble: 1 +ifloat: 1 ildouble: 2 ldouble: 2 @@ -1612,32 +2034,40 @@ float: 1 Function: "expm1": double: 1 float: 1 +float128: 1 idouble: 1 ifloat: 1 +ifloat128: 1 ildouble: 2 ldouble: 2 Function: "expm1_downward": double: 1 float: 1 +float128: 2 idouble: 1 ifloat: 1 +ifloat128: 2 ildouble: 4 ldouble: 4 Function: "expm1_towardzero": double: 1 float: 2 +float128: 4 idouble: 1 ifloat: 2 +ifloat128: 4 ildouble: 4 ldouble: 4 Function: "expm1_upward": double: 1 float: 1 +float128: 3 idouble: 1 ifloat: 1 +ifloat128: 3 ildouble: 4 ldouble: 4 @@ -1675,275 +2105,347 @@ ldouble: 6 Function: "hypot": double: 1 +float128: 1 idouble: 1 +ifloat128: 1 ildouble: 1 ldouble: 1 Function: "hypot_downward": double: 1 +float128: 1 idouble: 1 +ifloat128: 1 ildouble: 1 ldouble: 1 Function: "hypot_towardzero": double: 1 +float128: 1 idouble: 1 +ifloat128: 1 ildouble: 1 ldouble: 1 Function: "hypot_upward": double: 1 +float128: 1 idouble: 1 +ifloat128: 1 ildouble: 1 ldouble: 1 Function: "j0": double: 2 float: 2 +float128: 2 idouble: 2 ifloat: 2 +ifloat128: 2 ildouble: 2 ldouble: 2 Function: "j0_downward": double: 2 float: 4 +float128: 4 idouble: 2 ifloat: 4 +ifloat128: 4 ildouble: 4 ldouble: 4 Function: "j0_towardzero": double: 3 float: 2 +float128: 2 idouble: 3 ifloat: 2 +ifloat128: 2 ildouble: 5 ldouble: 5 Function: "j0_upward": double: 3 float: 2 +float128: 5 idouble: 3 ifloat: 2 +ifloat128: 5 ildouble: 4 ldouble: 4 Function: "j1": double: 1 float: 2 +float128: 4 idouble: 1 ifloat: 2 +ifloat128: 4 ildouble: 1 ldouble: 1 Function: "j1_downward": double: 3 float: 3 +float128: 4 idouble: 3 ifloat: 3 +ifloat128: 4 ildouble: 4 ldouble: 4 Function: "j1_towardzero": double: 3 float: 2 +float128: 4 idouble: 3 ifloat: 2 +ifloat128: 4 ildouble: 4 ldouble: 4 Function: "j1_upward": double: 3 float: 5 +float128: 3 idouble: 3 ifloat: 5 +ifloat128: 3 ildouble: 3 ldouble: 3 Function: "jn": double: 4 float: 4 +float128: 7 idouble: 4 ifloat: 4 +ifloat128: 7 ildouble: 4 ldouble: 4 Function: "jn_downward": double: 5 float: 5 +float128: 8 idouble: 5 ifloat: 5 +ifloat128: 8 ildouble: 4 ldouble: 4 Function: "jn_towardzero": double: 5 float: 5 +float128: 8 idouble: 5 ifloat: 5 +ifloat128: 8 ildouble: 5 ldouble: 5 Function: "jn_upward": double: 5 float: 5 +float128: 7 idouble: 5 ifloat: 5 +ifloat128: 7 ildouble: 5 ldouble: 5 Function: "lgamma": double: 4 float: 4 +float128: 5 idouble: 4 ifloat: 4 +ifloat128: 5 ildouble: 4 ldouble: 4 Function: "lgamma_downward": double: 5 float: 4 +float128: 8 idouble: 5 ifloat: 4 +ifloat128: 8 ildouble: 7 ldouble: 7 Function: "lgamma_towardzero": double: 5 float: 4 +float128: 5 idouble: 5 ifloat: 4 +ifloat128: 5 ildouble: 7 ldouble: 7 Function: "lgamma_upward": double: 5 float: 5 +float128: 8 idouble: 5 ifloat: 5 +ifloat128: 8 ildouble: 6 ldouble: 6 Function: "log": float: 1 +float128: 1 ifloat: 1 +ifloat128: 1 ildouble: 1 ldouble: 1 Function: "log10": double: 2 float: 2 +float128: 1 idouble: 2 ifloat: 2 +ifloat128: 1 ildouble: 1 ldouble: 1 Function: "log10_downward": double: 2 float: 3 +float128: 1 idouble: 2 ifloat: 3 +ifloat128: 1 ildouble: 2 ldouble: 2 Function: "log10_towardzero": double: 2 float: 2 +float128: 1 idouble: 2 ifloat: 2 +ifloat128: 1 ildouble: 2 ldouble: 2 Function: "log10_upward": double: 2 float: 2 +float128: 1 idouble: 2 ifloat: 2 +ifloat128: 1 ildouble: 1 ldouble: 1 Function: "log1p": double: 1 float: 1 +float128: 2 idouble: 1 ifloat: 1 +ifloat128: 2 ildouble: 2 ldouble: 2 Function: "log1p_downward": double: 2 float: 2 +float128: 3 idouble: 2 ifloat: 2 +ifloat128: 3 ildouble: 4 ldouble: 4 Function: "log1p_towardzero": double: 2 float: 2 +float128: 3 idouble: 2 ifloat: 2 +ifloat128: 3 ildouble: 4 ldouble: 4 Function: "log1p_upward": double: 2 float: 2 +float128: 2 idouble: 2 ifloat: 2 +ifloat128: 2 ildouble: 3 ldouble: 3 Function: "log2": double: 2 float: 1 +float128: 2 idouble: 2 ifloat: 1 +ifloat128: 2 ildouble: 1 ldouble: 1 Function: "log2_downward": double: 3 float: 3 +float128: 3 idouble: 3 ifloat: 3 +ifloat128: 3 ildouble: 1 ldouble: 1 Function: "log2_towardzero": double: 2 float: 2 +float128: 1 idouble: 2 ifloat: 2 +ifloat128: 1 ildouble: 1 ldouble: 1 Function: "log2_upward": double: 3 float: 3 +float128: 1 idouble: 3 ifloat: 3 +ifloat128: 1 ildouble: 1 ldouble: 1 Function: "log_downward": float: 2 +float128: 1 ifloat: 2 +ifloat128: 1 ildouble: 2 ldouble: 2 Function: "log_towardzero": float: 2 +float128: 2 ifloat: 2 +ifloat128: 2 ildouble: 2 ldouble: 2 Function: "log_upward": double: 1 float: 2 +float128: 1 idouble: 1 ifloat: 2 +ifloat128: 1 ildouble: 1 ldouble: 1 @@ -1965,67 +2467,47 @@ double: 1 float: 3 Function: "log_vlen8_avx2": -float: 2 +float: 3 Function: "pow": +double: 1 float: 1 +float128: 2 +idouble: 1 ifloat: 1 +ifloat128: 2 ildouble: 1 ldouble: 1 -Function: "pow10": -double: 2 -idouble: 2 -ildouble: 1 -ldouble: 1 - -Function: "pow10_downward": -double: 2 -float: 1 -idouble: 2 -ifloat: 1 -ildouble: 2 -ldouble: 2 - -Function: "pow10_towardzero": -double: 2 -float: 1 -idouble: 2 -ifloat: 1 -ildouble: 2 -ldouble: 2 - -Function: "pow10_upward": -double: 2 -float: 1 -idouble: 2 -ifloat: 1 -ildouble: 2 -ldouble: 2 - Function: "pow_downward": double: 1 float: 1 +float128: 2 idouble: 1 ifloat: 1 +ifloat128: 2 ildouble: 4 ldouble: 4 Function: "pow_towardzero": double: 1 float: 1 +float128: 2 idouble: 1 ifloat: 1 -ildouble: 1 -ldouble: 1 +ifloat128: 2 +ildouble: 4 +ldouble: 4 Function: "pow_upward": double: 1 float: 1 +float128: 2 idouble: 1 ifloat: 1 -ildouble: 2 -ldouble: 2 +ifloat128: 2 +ildouble: 4 +ldouble: 4 Function: "pow_vlen16": float: 3 @@ -2048,24 +2530,34 @@ Function: "pow_vlen8_avx2": float: 3 Function: "sin": +double: 1 +float128: 1 +idouble: 1 +ifloat128: 1 ildouble: 1 ldouble: 1 Function: "sin_downward": double: 1 +float128: 3 idouble: 1 +ifloat128: 3 ildouble: 3 ldouble: 3 Function: "sin_towardzero": double: 1 +float128: 2 idouble: 1 +ifloat128: 2 ildouble: 2 ldouble: 2 Function: "sin_upward": double: 1 +float128: 3 idouble: 1 +ifloat128: 3 ildouble: 3 ldouble: 3 @@ -2090,24 +2582,34 @@ Function: "sin_vlen8_avx2": float: 1 Function: "sincos": +double: 1 +float128: 1 +idouble: 1 +ifloat128: 1 ildouble: 1 ldouble: 1 Function: "sincos_downward": double: 1 +float128: 3 idouble: 1 +ifloat128: 3 ildouble: 3 ldouble: 3 Function: "sincos_towardzero": double: 1 +float128: 2 idouble: 1 +ifloat128: 2 ildouble: 2 ldouble: 2 Function: "sincos_upward": double: 1 +float128: 3 idouble: 1 +ifloat128: 3 ildouble: 3 ldouble: 3 @@ -2125,7 +2627,7 @@ Function: "sincos_vlen4_avx2": double: 2 Function: "sincos_vlen8": -double: 1 +double: 2 float: 1 Function: "sincos_vlen8_avx2": @@ -2134,222 +2636,278 @@ float: 1 Function: "sinh": double: 2 float: 2 +float128: 2 idouble: 2 ifloat: 2 +ifloat128: 2 ildouble: 2 ldouble: 2 Function: "sinh_downward": double: 3 float: 3 +float128: 3 idouble: 3 ifloat: 3 +ifloat128: 3 ildouble: 5 ldouble: 5 Function: "sinh_towardzero": double: 2 float: 2 +float128: 3 idouble: 2 ifloat: 2 +ifloat128: 3 ildouble: 4 ldouble: 4 Function: "sinh_upward": double: 3 float: 3 +float128: 4 idouble: 3 ifloat: 3 +ifloat128: 4 ildouble: 5 ldouble: 5 Function: "tan": float: 1 +float128: 1 ifloat: 1 +ifloat128: 1 ildouble: 2 ldouble: 2 Function: "tan_downward": double: 1 float: 2 +float128: 1 idouble: 1 ifloat: 2 +ifloat128: 1 ildouble: 3 ldouble: 3 Function: "tan_towardzero": double: 1 float: 1 +float128: 1 idouble: 1 ifloat: 1 +ifloat128: 1 ildouble: 3 ldouble: 3 Function: "tan_upward": double: 1 float: 1 +float128: 1 idouble: 1 ifloat: 1 +ifloat128: 1 ildouble: 2 ldouble: 2 Function: "tanh": double: 2 float: 2 +float128: 2 idouble: 2 ifloat: 2 +ifloat128: 2 ildouble: 3 ldouble: 3 Function: "tanh_downward": double: 3 float: 3 +float128: 4 idouble: 3 ifloat: 3 +ifloat128: 4 ildouble: 4 ldouble: 4 Function: "tanh_towardzero": double: 2 float: 2 +float128: 3 idouble: 2 ifloat: 2 +ifloat128: 3 ildouble: 3 ldouble: 3 Function: "tanh_upward": double: 3 float: 3 +float128: 3 idouble: 3 ifloat: 3 +ifloat128: 3 ildouble: 4 ldouble: 4 Function: "tgamma": double: 5 float: 5 +float128: 4 idouble: 5 ifloat: 5 +ifloat128: 4 ildouble: 5 ldouble: 5 Function: "tgamma_downward": double: 5 float: 5 +float128: 5 idouble: 5 ifloat: 5 +ifloat128: 5 ildouble: 5 ldouble: 5 Function: "tgamma_towardzero": double: 5 float: 5 +float128: 5 idouble: 5 ifloat: 5 +ifloat128: 5 ildouble: 5 ldouble: 5 Function: "tgamma_upward": double: 5 float: 5 +float128: 4 idouble: 5 ifloat: 5 +ifloat128: 4 ildouble: 5 ldouble: 5 Function: "y0": double: 2 float: 1 +float128: 3 idouble: 2 ifloat: 1 +ifloat128: 3 ildouble: 1 ldouble: 1 Function: "y0_downward": double: 3 float: 4 +float128: 4 idouble: 3 ifloat: 4 +ifloat128: 4 ildouble: 5 ldouble: 5 Function: "y0_towardzero": double: 3 float: 3 +float128: 3 idouble: 3 ifloat: 3 +ifloat128: 3 ildouble: 5 ldouble: 5 Function: "y0_upward": double: 3 float: 5 +float128: 3 idouble: 3 ifloat: 5 +ifloat128: 3 ildouble: 3 ldouble: 3 Function: "y1": double: 3 float: 2 +float128: 2 idouble: 3 ifloat: 2 +ifloat128: 2 ildouble: 2 ldouble: 2 Function: "y1_downward": double: 3 float: 2 +float128: 4 idouble: 3 ifloat: 2 +ifloat128: 4 ildouble: 7 ldouble: 7 Function: "y1_towardzero": double: 3 float: 2 +float128: 2 idouble: 3 ifloat: 2 +ifloat128: 2 ildouble: 5 ldouble: 5 Function: "y1_upward": double: 7 float: 2 +float128: 5 idouble: 7 ifloat: 2 +ifloat128: 5 ildouble: 7 ldouble: 7 Function: "yn": double: 3 float: 3 +float128: 5 idouble: 3 ifloat: 3 +ifloat128: 5 ildouble: 4 ldouble: 4 Function: "yn_downward": double: 3 float: 4 +float128: 5 idouble: 3 ifloat: 4 +ifloat128: 5 ildouble: 5 ldouble: 5 Function: "yn_towardzero": double: 3 float: 3 +float128: 5 idouble: 3 ifloat: 3 +ifloat128: 5 ildouble: 5 ldouble: 5 Function: "yn_upward": double: 4 float: 5 +float128: 5 idouble: 4 ifloat: 5 +ifloat128: 5 ildouble: 4 ldouble: 4 diff --git a/sysdeps/x86_64/fpu/libm-test-ulps-name b/sysdeps/x86_64/fpu/libm-test-ulps-name new file mode 100644 index 0000000000..1c09346681 --- /dev/null +++ b/sysdeps/x86_64/fpu/libm-test-ulps-name @@ -0,0 +1 @@ +x86_64 diff --git a/sysdeps/x86_64/fpu/math-tests-arch.h b/sysdeps/x86_64/fpu/math-tests-arch.h index 867152046e..a5df133292 100644 --- a/sysdeps/x86_64/fpu/math-tests-arch.h +++ b/sysdeps/x86_64/fpu/math-tests-arch.h @@ -1,5 +1,5 @@ /* Runtime architecture check for math tests. x86_64 version. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -16,11 +16,11 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ +#include <cpu-features.h> + #if defined REQUIRE_AVX -# include <init-arch.h> # define INIT_ARCH_EXT - # define CHECK_ARCH_EXT \ do \ { \ @@ -29,10 +29,8 @@ while (0) #elif defined REQUIRE_AVX2 -# include <init-arch.h> # define INIT_ARCH_EXT - # define CHECK_ARCH_EXT \ do \ { \ @@ -41,10 +39,8 @@ while (0) #elif defined REQUIRE_AVX512F -# include <init-arch.h> # define INIT_ARCH_EXT - # define CHECK_ARCH_EXT \ do \ { \ diff --git a/sysdeps/x86_64/fpu/math_ldbl.h b/sysdeps/x86_64/fpu/math_ldbl.h index b9ff8dadaf..27f8fce904 100644 --- a/sysdeps/x86_64/fpu/math_ldbl.h +++ b/sysdeps/x86_64/fpu/math_ldbl.h @@ -1,6 +1,25 @@ -#ifndef _MATH_PRIVATE_H_ -#error "Never use <math_ldbl.h> directly; include <math_private.h> instead." -#endif +/* Manipulation of the bit representation of 'long double' quantities. + Copyright (C) 2001-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _MATH_LDBL_H_ +#define _MATH_LDBL_H_ 1 + +#include <stdint.h> /* A union which permits us to convert between a long double and three 32 bit ints. */ @@ -10,8 +29,8 @@ typedef union long double value; struct { - u_int32_t lsw; - u_int32_t msw; + uint32_t lsw; + uint32_t msw; int sign_exponent:16; unsigned int empty1:16; unsigned int empty0:32; @@ -77,3 +96,5 @@ do { \ se_u.parts.sign_exponent = (exp); \ (d) = se_u.value; \ } while (0) + +#endif /* math_ldbl.h */ diff --git a/sysdeps/x86_64/fpu/math_private.h b/sysdeps/x86_64/fpu/math_private.h index 027a6a3a4d..13052893ef 100644 --- a/sysdeps/x86_64/fpu/math_private.h +++ b/sysdeps/x86_64/fpu/math_private.h @@ -48,38 +48,6 @@ #include <sysdeps/i386/fpu/fenv_private.h> #include_next <math_private.h> -extern __always_inline double -__ieee754_sqrt (double d) -{ - double res; -#if defined __AVX__ || defined SSE2AVX - asm ("vsqrtsd %1, %0, %0" : "=x" (res) : "xm" (d)); -#else - asm ("sqrtsd %1, %0" : "=x" (res) : "xm" (d)); -#endif - return res; -} - -extern __always_inline float -__ieee754_sqrtf (float d) -{ - float res; -#if defined __AVX__ || defined SSE2AVX - asm ("vsqrtss %1, %0, %0" : "=x" (res) : "xm" (d)); -#else - asm ("sqrtss %1, %0" : "=x" (res) : "xm" (d)); -#endif - return res; -} - -extern __always_inline long double -__ieee754_sqrtl (long double d) -{ - long double res; - asm ("fsqrt" : "=t" (res) : "0" (d)); - return res; -} - #ifdef __SSE4_1__ extern __always_inline double __rint (double d) diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile index 34542155aa..9f387248aa 100644 --- a/sysdeps/x86_64/fpu/multiarch/Makefile +++ b/sysdeps/x86_64/fpu/multiarch/Makefile @@ -1,12 +1,54 @@ ifeq ($(subdir),math) libm-sysdep_routines += s_floor-c s_ceil-c s_floorf-c s_ceilf-c \ - s_rint-c s_rintf-c s_nearbyint-c s_nearbyintf-c + s_rint-c s_rintf-c s_nearbyint-c s_nearbyintf-c \ + s_trunc-c s_truncf-c + +libm-sysdep_routines += s_ceil-sse4_1 s_ceilf-sse4_1 s_floor-sse4_1 \ + s_floorf-sse4_1 s_nearbyint-sse4_1 \ + s_nearbyintf-sse4_1 s_rint-sse4_1 s_rintf-sse4_1 \ + s_trunc-sse4_1 s_truncf-sse4_1 + +libm-sysdep_routines += e_exp-fma e_log-fma e_pow-fma s_atan-fma \ + e_asin-fma e_atan2-fma s_sin-fma s_tan-fma \ + mpa-fma \ + sincos32-fma doasin-fma dosincos-fma \ + mpatan2-fma mpatan-fma mpsqrt-fma mptan-fma + +CFLAGS-doasin-fma.c = -mfma -mavx2 +CFLAGS-dosincos-fma.c = -mfma -mavx2 +CFLAGS-e_asin-fma.c = -mfma -mavx2 +CFLAGS-e_atan2-fma.c = -mfma -mavx2 +CFLAGS-e_exp-fma.c = -mfma -mavx2 +CFLAGS-e_log-fma.c = -mfma -mavx2 +CFLAGS-e_pow-fma.c = -mfma -mavx2 $(config-cflags-nofma) +CFLAGS-mpa-fma.c = -mfma -mavx2 +CFLAGS-mpatan-fma.c = -mfma -mavx2 +CFLAGS-mpatan2-fma.c = -mfma -mavx2 +CFLAGS-mpsqrt-fma.c = -mfma -mavx2 +CFLAGS-mptan-fma.c = -mfma -mavx2 +CFLAGS-s_atan-fma.c = -mfma -mavx2 +CFLAGS-sincos32-fma.c = -mfma -mavx2 +CFLAGS-s_sin-fma.c = -mfma -mavx2 +CFLAGS-s_tan-fma.c = -mfma -mavx2 + +libm-sysdep_routines += s_sinf-sse2 s_cosf-sse2 s_sincosf-sse2 + +libm-sysdep_routines += e_exp2f-fma e_expf-fma e_log2f-fma e_logf-fma \ + e_powf-fma s_sinf-fma s_cosf-fma s_sincosf-fma + +CFLAGS-e_exp2f-fma.c = -mfma -mavx2 +CFLAGS-e_expf-fma.c = -mfma -mavx2 +CFLAGS-e_log2f-fma.c = -mfma -mavx2 +CFLAGS-e_logf-fma.c = -mfma -mavx2 +CFLAGS-e_powf-fma.c = -mfma -mavx2 +CFLAGS-s_sinf-fma.c = -mfma -mavx2 +CFLAGS-s_cosf-fma.c = -mfma -mavx2 +CFLAGS-s_sincosf-fma.c = -mfma -mavx2 libm-sysdep_routines += e_exp-fma4 e_log-fma4 e_pow-fma4 s_atan-fma4 \ e_asin-fma4 e_atan2-fma4 s_sin-fma4 s_tan-fma4 \ - mplog-fma4 mpa-fma4 slowexp-fma4 slowpow-fma4 \ + mpa-fma4 \ sincos32-fma4 doasin-fma4 dosincos-fma4 \ - halfulp-fma4 mpexp-fma4 \ mpatan2-fma4 mpatan-fma4 mpsqrt-fma4 mptan-fma4 CFLAGS-doasin-fma4.c = -mfma4 @@ -16,35 +58,26 @@ CFLAGS-e_atan2-fma4.c = -mfma4 CFLAGS-e_exp-fma4.c = -mfma4 CFLAGS-e_log-fma4.c = -mfma4 CFLAGS-e_pow-fma4.c = -mfma4 $(config-cflags-nofma) -CFLAGS-halfulp-fma4.c = -mfma4 CFLAGS-mpa-fma4.c = -mfma4 CFLAGS-mpatan-fma4.c = -mfma4 CFLAGS-mpatan2-fma4.c = -mfma4 -CFLAGS-mpexp-fma4.c = -mfma4 -CFLAGS-mplog-fma4.c = -mfma4 CFLAGS-mpsqrt-fma4.c = -mfma4 CFLAGS-mptan-fma4.c = -mfma4 CFLAGS-s_atan-fma4.c = -mfma4 CFLAGS-sincos32-fma4.c = -mfma4 -CFLAGS-slowexp-fma4.c = -mfma4 -CFLAGS-slowpow-fma4.c = -mfma4 CFLAGS-s_sin-fma4.c = -mfma4 CFLAGS-s_tan-fma4.c = -mfma4 libm-sysdep_routines += e_exp-avx e_log-avx s_atan-avx \ e_atan2-avx s_sin-avx s_tan-avx \ - mplog-avx mpa-avx slowexp-avx \ - mpexp-avx + mpa-avx CFLAGS-e_atan2-avx.c = -msse2avx -DSSE2AVX CFLAGS-e_exp-avx.c = -msse2avx -DSSE2AVX CFLAGS-e_log-avx.c = -msse2avx -DSSE2AVX CFLAGS-mpa-avx.c = -msse2avx -DSSE2AVX -CFLAGS-mpexp-avx.c = -msse2avx -DSSE2AVX -CFLAGS-mplog-avx.c = -msse2avx -DSSE2AVX CFLAGS-s_atan-avx.c = -msse2avx -DSSE2AVX CFLAGS-s_sin-avx.c = -msse2avx -DSSE2AVX -CFLAGS-slowexp-avx.c = -msse2avx -DSSE2AVX CFLAGS-s_tan-avx.c = -msse2avx -DSSE2AVX endif @@ -66,5 +99,35 @@ libmvec-sysdep_routines += svml_d_cos2_core_sse4 svml_d_cos4_core_avx2 \ svml_d_pow4_core_avx2 svml_d_pow8_core_avx512 \ svml_s_powf4_core_sse4 svml_s_powf8_core_avx2 \ svml_s_powf16_core_avx512 svml_s_sincosf4_core_sse4 \ - svml_s_sincosf8_core_avx2 svml_s_sincosf16_core_avx512 + svml_s_sincosf8_core_avx2 \ + svml_s_sincosf16_core_avx512 \ + svml_d_cos2_core-sse2 svml_d_cos4_core-sse \ + svml_d_cos8_core-avx2 svml_d_exp2_core-sse2 \ + svml_d_exp4_core-sse svml_d_exp8_core-avx2 \ + svml_d_log2_core-sse2 svml_d_log4_core-sse \ + svml_d_log8_core-avx2 svml_d_pow2_core-sse2 \ + svml_d_pow4_core-sse svml_d_pow8_core-avx2 \ + svml_d_sin2_core-sse2 svml_d_sin4_core-sse \ + svml_d_sin8_core-avx2 \ + svml_d_sincos2_core-sse2 \ + svml_d_sincos4_core-sse \ + svml_d_sincos8_core-avx2 \ + svml_s_cosf16_core-avx2 \ + svml_s_cosf4_core-sse2 \ + svml_s_cosf8_core-sse \ + svml_s_expf16_core-avx2 \ + svml_s_expf4_core-sse2 \ + svml_s_expf8_core-sse \ + svml_s_logf16_core-avx2 \ + svml_s_logf4_core-sse2 \ + svml_s_logf8_core-sse \ + svml_s_powf16_core-avx2 \ + svml_s_powf4_core-sse2 \ + svml_s_powf8_core-sse \ + svml_s_sincosf16_core-avx2 \ + svml_s_sincosf4_core-sse2 \ + svml_s_sincosf8_core-sse \ + svml_s_sinf16_core-avx2 \ + svml_s_sinf4_core-sse2 \ + svml_s_sinf8_core-sse endif diff --git a/sysdeps/x86_64/fpu/multiarch/doasin-fma.c b/sysdeps/x86_64/fpu/multiarch/doasin-fma.c new file mode 100644 index 0000000000..7a09865fca --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/doasin-fma.c @@ -0,0 +1,4 @@ +#define __doasin __doasin_fma +#define SECTION __attribute__ ((section (".text.fma"))) + +#include <sysdeps/ieee754/dbl-64/doasin.c> diff --git a/sysdeps/x86_64/fpu/multiarch/dosincos-fma.c b/sysdeps/x86_64/fpu/multiarch/dosincos-fma.c new file mode 100644 index 0000000000..5744586bdb --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/dosincos-fma.c @@ -0,0 +1,6 @@ +#define __docos __docos_fma +#define __dubcos __dubcos_fma +#define __dubsin __dubsin_fma +#define SECTION __attribute__ ((section (".text.fma"))) + +#include <sysdeps/ieee754/dbl-64/dosincos.c> diff --git a/sysdeps/x86_64/fpu/multiarch/e_asin-fma.c b/sysdeps/x86_64/fpu/multiarch/e_asin-fma.c new file mode 100644 index 0000000000..50e9c64247 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/e_asin-fma.c @@ -0,0 +1,11 @@ +#define __ieee754_acos __ieee754_acos_fma +#define __ieee754_asin __ieee754_asin_fma +#define __cos32 __cos32_fma +#define __doasin __doasin_fma +#define __docos __docos_fma +#define __dubcos __dubcos_fma +#define __dubsin __dubsin_fma +#define __sin32 __sin32_fma +#define SECTION __attribute__ ((section (".text.fma"))) + +#include <sysdeps/ieee754/dbl-64/e_asin.c> diff --git a/sysdeps/x86_64/fpu/multiarch/e_asin.c b/sysdeps/x86_64/fpu/multiarch/e_asin.c index 111a5b99bd..8d47004e4f 100644 --- a/sysdeps/x86_64/fpu/multiarch/e_asin.c +++ b/sysdeps/x86_64/fpu/multiarch/e_asin.c @@ -1,26 +1,40 @@ -#include <init-arch.h> -#include <math.h> -#include <math_private.h> - -extern double __ieee754_acos_sse2 (double); -extern double __ieee754_asin_sse2 (double); -extern double __ieee754_acos_fma4 (double); -extern double __ieee754_asin_fma4 (double); - -libm_ifunc (__ieee754_acos, - HAS_ARCH_FEATURE (FMA4_Usable) - ? __ieee754_acos_fma4 - : __ieee754_acos_sse2); -strong_alias (__ieee754_acos, __acos_finite) +/* Multiple versions of IEEE 754 asin and acos. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +extern double __redirect_ieee754_asin (double); +extern double __redirect_ieee754_acos (double); + +#define SYMBOL_NAME ieee754_asin +#include "ifunc-fma4.h" -libm_ifunc (__ieee754_asin, - HAS_ARCH_FEATURE (FMA4_Usable) - ? __ieee754_asin_fma4 - : __ieee754_asin_sse2); +libc_ifunc_redirected (__redirect_ieee754_asin, __ieee754_asin, + IFUNC_SELECTOR ()); strong_alias (__ieee754_asin, __asin_finite) -#define __ieee754_acos __ieee754_acos_sse2 -#define __ieee754_asin __ieee754_asin_sse2 +#undef SYMBOL_NAME +#define SYMBOL_NAME ieee754_acos +#include "ifunc-fma4.h" + +libc_ifunc_redirected (__redirect_ieee754_acos, __ieee754_acos, + IFUNC_SELECTOR ()); +strong_alias (__ieee754_acos, __acos_finite) +#define __ieee754_acos __ieee754_acos_sse2 +#define __ieee754_asin __ieee754_asin_sse2 #include <sysdeps/ieee754/dbl-64/e_asin.c> diff --git a/sysdeps/x86_64/fpu/multiarch/e_atan2-fma.c b/sysdeps/x86_64/fpu/multiarch/e_atan2-fma.c new file mode 100644 index 0000000000..caba686496 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/e_atan2-fma.c @@ -0,0 +1,10 @@ +#define __ieee754_atan2 __ieee754_atan2_fma +#define __add __add_fma +#define __dbl_mp __dbl_mp_fma +#define __dvd __dvd_fma +#define __mpatan2 __mpatan2_fma +#define __mul __mul_fma +#define __sub __sub_fma +#define SECTION __attribute__ ((section (".text.fma"))) + +#include <sysdeps/ieee754/dbl-64/e_atan2.c> diff --git a/sysdeps/x86_64/fpu/multiarch/e_atan2.c b/sysdeps/x86_64/fpu/multiarch/e_atan2.c index 9ca3c02a44..6c2dd5af37 100644 --- a/sysdeps/x86_64/fpu/multiarch/e_atan2.c +++ b/sysdeps/x86_64/fpu/multiarch/e_atan2.c @@ -1,18 +1,29 @@ -#include <init-arch.h> -#include <math.h> -#include <math_private.h> +/* Multiple versions of IEEE 754 atan. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. -extern double __ieee754_atan2_sse2 (double, double); -extern double __ieee754_atan2_avx (double, double); -extern double __ieee754_atan2_fma4 (double, double); + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. -libm_ifunc (__ieee754_atan2, - HAS_ARCH_FEATURE (FMA4_Usable) ? __ieee754_atan2_fma4 - : (HAS_ARCH_FEATURE (AVX_Usable) - ? __ieee754_atan2_avx : __ieee754_atan2_sse2)); -strong_alias (__ieee754_atan2, __atan2_finite) + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. -#define __ieee754_atan2 __ieee754_atan2_sse2 + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +extern double __redirect_ieee754_atan2 (double, double); +#define SYMBOL_NAME ieee754_atan2 +#include "ifunc-avx-fma4.h" +libc_ifunc_redirected (__redirect_ieee754_atan2, + __ieee754_atan2, IFUNC_SELECTOR ()); +strong_alias (__ieee754_atan2, __atan2_finite) + +#define __ieee754_atan2 __ieee754_atan2_sse2 #include <sysdeps/ieee754/dbl-64/e_atan2.c> diff --git a/sysdeps/x86_64/fpu/multiarch/e_exp-avx.c b/sysdeps/x86_64/fpu/multiarch/e_exp-avx.c index ee5dd6d2dc..afd917442a 100644 --- a/sysdeps/x86_64/fpu/multiarch/e_exp-avx.c +++ b/sysdeps/x86_64/fpu/multiarch/e_exp-avx.c @@ -1,6 +1,5 @@ #define __ieee754_exp __ieee754_exp_avx #define __exp1 __exp1_avx -#define __slowexp __slowexp_avx #define SECTION __attribute__ ((section (".text.avx"))) #include <sysdeps/ieee754/dbl-64/e_exp.c> diff --git a/sysdeps/x86_64/fpu/multiarch/e_exp-fma.c b/sysdeps/x86_64/fpu/multiarch/e_exp-fma.c new file mode 100644 index 0000000000..765b1b9dd3 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/e_exp-fma.c @@ -0,0 +1,5 @@ +#define __ieee754_exp __ieee754_exp_fma +#define __exp1 __exp1_fma +#define SECTION __attribute__ ((section (".text.fma"))) + +#include <sysdeps/ieee754/dbl-64/e_exp.c> diff --git a/sysdeps/x86_64/fpu/multiarch/e_exp-fma4.c b/sysdeps/x86_64/fpu/multiarch/e_exp-fma4.c index ae6eb67603..9ac7acad28 100644 --- a/sysdeps/x86_64/fpu/multiarch/e_exp-fma4.c +++ b/sysdeps/x86_64/fpu/multiarch/e_exp-fma4.c @@ -1,6 +1,5 @@ #define __ieee754_exp __ieee754_exp_fma4 #define __exp1 __exp1_fma4 -#define __slowexp __slowexp_fma4 #define SECTION __attribute__ ((section (".text.fma4"))) #include <sysdeps/ieee754/dbl-64/e_exp.c> diff --git a/sysdeps/x86_64/fpu/multiarch/e_exp.c b/sysdeps/x86_64/fpu/multiarch/e_exp.c index b7d7b5ff27..7cd7d1729c 100644 --- a/sysdeps/x86_64/fpu/multiarch/e_exp.c +++ b/sysdeps/x86_64/fpu/multiarch/e_exp.c @@ -1,18 +1,29 @@ -#include <init-arch.h> -#include <math.h> -#include <math_private.h> +/* Multiple versions of IEEE 754 exp. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. -extern double __ieee754_exp_sse2 (double); -extern double __ieee754_exp_avx (double); -extern double __ieee754_exp_fma4 (double); + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. -libm_ifunc (__ieee754_exp, - HAS_ARCH_FEATURE (FMA4_Usable) ? __ieee754_exp_fma4 - : (HAS_ARCH_FEATURE (AVX_Usable) - ? __ieee754_exp_avx : __ieee754_exp_sse2)); -strong_alias (__ieee754_exp, __exp_finite) + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. -#define __ieee754_exp __ieee754_exp_sse2 + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +extern double __redirect_ieee754_exp (double); +#define SYMBOL_NAME ieee754_exp +#include "ifunc-avx-fma4.h" +libc_ifunc_redirected (__redirect_ieee754_exp, __ieee754_exp, + IFUNC_SELECTOR ()); +strong_alias (__ieee754_exp, __exp_finite) + +#define __ieee754_exp __ieee754_exp_sse2 #include <sysdeps/ieee754/dbl-64/e_exp.c> diff --git a/sysdeps/x86_64/fpu/multiarch/e_exp2f-fma.c b/sysdeps/x86_64/fpu/multiarch/e_exp2f-fma.c new file mode 100644 index 0000000000..c915a50794 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/e_exp2f-fma.c @@ -0,0 +1,3 @@ +#define __exp2f __exp2f_fma + +#include <sysdeps/ieee754/flt-32/e_exp2f.c> diff --git a/sysdeps/x86_64/fpu/multiarch/e_exp2f.c b/sysdeps/x86_64/fpu/multiarch/e_exp2f.c new file mode 100644 index 0000000000..e3a0706839 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/e_exp2f.c @@ -0,0 +1,40 @@ +/* Multiple versions of exp2f. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <libm-alias-float.h> + +extern float __redirect_exp2f (float); + +#define SYMBOL_NAME exp2f +#include "ifunc-fma.h" + +libc_ifunc_redirected (__redirect_exp2f, __exp2f, IFUNC_SELECTOR ()); + +#ifdef SHARED +# include <shlib-compat.h> +versioned_symbol (libm, __exp2f, exp2f, GLIBC_2_27); +libm_alias_float_other (__exp2, exp2) +#else +libm_alias_float (__exp2, exp2) +#endif + +strong_alias (__exp2f, __ieee754_exp2f) +strong_alias (__exp2f, __exp2f_finite) + +#define __exp2f __exp2f_sse2 +#include <sysdeps/ieee754/flt-32/e_exp2f.c> diff --git a/sysdeps/x86_64/fpu/multiarch/e_expf-fma.c b/sysdeps/x86_64/fpu/multiarch/e_expf-fma.c new file mode 100644 index 0000000000..4e01cd6a82 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/e_expf-fma.c @@ -0,0 +1,3 @@ +#define __expf __expf_fma + +#include <sysdeps/ieee754/flt-32/e_expf.c> diff --git a/sysdeps/x86_64/fpu/multiarch/e_expf.c b/sysdeps/x86_64/fpu/multiarch/e_expf.c new file mode 100644 index 0000000000..2b7c7ccbd0 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/e_expf.c @@ -0,0 +1,43 @@ +/* Multiple versions of expf. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <libm-alias-float.h> + +extern float __redirect_expf (float); + +#define SYMBOL_NAME expf +#include "ifunc-fma.h" + +libc_ifunc_redirected (__redirect_expf, __expf, IFUNC_SELECTOR ()); + +#ifdef SHARED +__hidden_ver1 (__expf, __GI___expf, __redirect_expf) + __attribute__ ((visibility ("hidden"))); + +# include <shlib-compat.h> +versioned_symbol (libm, __expf, expf, GLIBC_2_27); +libm_alias_float_other (__exp, exp) +#else +libm_alias_float (__exp, exp) +#endif + +strong_alias (__expf, __ieee754_expf) +strong_alias (__expf, __expf_finite) + +#define __expf __expf_sse2 +#include <sysdeps/ieee754/flt-32/e_expf.c> diff --git a/sysdeps/x86_64/fpu/multiarch/e_log-avx.c b/sysdeps/x86_64/fpu/multiarch/e_log-avx.c index c669019bc2..b22a5767be 100644 --- a/sysdeps/x86_64/fpu/multiarch/e_log-avx.c +++ b/sysdeps/x86_64/fpu/multiarch/e_log-avx.c @@ -1,8 +1,4 @@ #define __ieee754_log __ieee754_log_avx -#define __mplog __mplog_avx -#define __add __add_avx -#define __dbl_mp __dbl_mp_avx -#define __sub __sub_avx #define SECTION __attribute__ ((section (".text.avx"))) #include <sysdeps/ieee754/dbl-64/e_log.c> diff --git a/sysdeps/x86_64/fpu/multiarch/e_log-fma.c b/sysdeps/x86_64/fpu/multiarch/e_log-fma.c new file mode 100644 index 0000000000..bce0ee03c2 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/e_log-fma.c @@ -0,0 +1,4 @@ +#define __ieee754_log __ieee754_log_fma +#define SECTION __attribute__ ((section (".text.fma"))) + +#include <sysdeps/ieee754/dbl-64/e_log.c> diff --git a/sysdeps/x86_64/fpu/multiarch/e_log-fma4.c b/sysdeps/x86_64/fpu/multiarch/e_log-fma4.c index a2346cc618..f458f9c23c 100644 --- a/sysdeps/x86_64/fpu/multiarch/e_log-fma4.c +++ b/sysdeps/x86_64/fpu/multiarch/e_log-fma4.c @@ -1,8 +1,4 @@ #define __ieee754_log __ieee754_log_fma4 -#define __mplog __mplog_fma4 -#define __add __add_fma4 -#define __dbl_mp __dbl_mp_fma4 -#define __sub __sub_fma4 #define SECTION __attribute__ ((section (".text.fma4"))) #include <sysdeps/ieee754/dbl-64/e_log.c> diff --git a/sysdeps/x86_64/fpu/multiarch/e_log.c b/sysdeps/x86_64/fpu/multiarch/e_log.c index cf9533d6c0..e0a1b02fae 100644 --- a/sysdeps/x86_64/fpu/multiarch/e_log.c +++ b/sysdeps/x86_64/fpu/multiarch/e_log.c @@ -1,18 +1,29 @@ -#include <init-arch.h> -#include <math.h> -#include <math_private.h> +/* Multiple versions of IEEE 754 log. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. -extern double __ieee754_log_sse2 (double); -extern double __ieee754_log_avx (double); -extern double __ieee754_log_fma4 (double); + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. -libm_ifunc (__ieee754_log, - HAS_ARCH_FEATURE (FMA4_Usable) ? __ieee754_log_fma4 - : (HAS_ARCH_FEATURE (AVX_Usable) - ? __ieee754_log_avx : __ieee754_log_sse2)); -strong_alias (__ieee754_log, __log_finite) + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. -#define __ieee754_log __ieee754_log_sse2 + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +extern double __redirect_ieee754_log (double); +#define SYMBOL_NAME ieee754_log +#include "ifunc-avx-fma4.h" +libc_ifunc_redirected (__redirect_ieee754_log, __ieee754_log, + IFUNC_SELECTOR ()); +strong_alias (__ieee754_log, __log_finite) + +#define __ieee754_log __ieee754_log_sse2 #include <sysdeps/ieee754/dbl-64/e_log.c> diff --git a/sysdeps/x86_64/fpu/multiarch/e_log2f-fma.c b/sysdeps/x86_64/fpu/multiarch/e_log2f-fma.c new file mode 100644 index 0000000000..8a76b836fb --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/e_log2f-fma.c @@ -0,0 +1,3 @@ +#define __log2f __log2f_fma + +#include <sysdeps/ieee754/flt-32/e_log2f.c> diff --git a/sysdeps/x86_64/fpu/multiarch/e_log2f.c b/sysdeps/x86_64/fpu/multiarch/e_log2f.c new file mode 100644 index 0000000000..12d0c30dd3 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/e_log2f.c @@ -0,0 +1,43 @@ +/* Multiple versions of log2f. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <libm-alias-float.h> + +extern float __redirect_log2f (float); + +#define SYMBOL_NAME log2f +#include "ifunc-fma.h" + +libc_ifunc_redirected (__redirect_log2f, __log2f, IFUNC_SELECTOR ()); + +#ifdef SHARED +__hidden_ver1 (__log2f, __GI___log2f, __redirect_log2f) + __attribute__ ((visibility ("hidden"))); + +# include <shlib-compat.h> +versioned_symbol (libm, __log2f, log2f, GLIBC_2_27); +libm_alias_float_other (__log2, log2) +#else +libm_alias_float (__log2, log2) +#endif + +strong_alias (__log2f, __ieee754_log2f) +strong_alias (__log2f, __log2f_finite) + +#define __log2f __log2f_sse2 +#include <sysdeps/ieee754/flt-32/e_log2f.c> diff --git a/sysdeps/x86_64/fpu/multiarch/e_logf-fma.c b/sysdeps/x86_64/fpu/multiarch/e_logf-fma.c new file mode 100644 index 0000000000..a47fd8195f --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/e_logf-fma.c @@ -0,0 +1,3 @@ +#define __logf __logf_fma + +#include <sysdeps/ieee754/flt-32/e_logf.c> diff --git a/sysdeps/x86_64/fpu/multiarch/e_logf.c b/sysdeps/x86_64/fpu/multiarch/e_logf.c new file mode 100644 index 0000000000..224d40a1e4 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/e_logf.c @@ -0,0 +1,43 @@ +/* Multiple versions of logf. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <libm-alias-float.h> + +extern float __redirect_logf (float); + +#define SYMBOL_NAME logf +#include "ifunc-fma.h" + +libc_ifunc_redirected (__redirect_logf, __logf, IFUNC_SELECTOR ()); + +#ifdef SHARED +__hidden_ver1 (__logf, __GI___logf, __redirect_logf) + __attribute__ ((visibility ("hidden"))); + +# include <shlib-compat.h> +versioned_symbol (libm, __logf, logf, GLIBC_2_27); +libm_alias_float_other (__log, log) +#else +libm_alias_float (__log, log) +#endif + +strong_alias (__logf, __ieee754_logf) +strong_alias (__logf, __logf_finite) + +#define __logf __logf_sse2 +#include <sysdeps/ieee754/flt-32/e_logf.c> diff --git a/sysdeps/x86_64/fpu/multiarch/e_pow-fma.c b/sysdeps/x86_64/fpu/multiarch/e_pow-fma.c new file mode 100644 index 0000000000..73c1e7fb89 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/e_pow-fma.c @@ -0,0 +1,5 @@ +#define __ieee754_pow __ieee754_pow_fma +#define __exp1 __exp1_fma +#define SECTION __attribute__ ((section (".text.fma"))) + +#include <sysdeps/ieee754/dbl-64/e_pow.c> diff --git a/sysdeps/x86_64/fpu/multiarch/e_pow-fma4.c b/sysdeps/x86_64/fpu/multiarch/e_pow-fma4.c index 5b3ea8e103..8971b655ca 100644 --- a/sysdeps/x86_64/fpu/multiarch/e_pow-fma4.c +++ b/sysdeps/x86_64/fpu/multiarch/e_pow-fma4.c @@ -1,6 +1,5 @@ #define __ieee754_pow __ieee754_pow_fma4 #define __exp1 __exp1_fma4 -#define __slowpow __slowpow_fma4 #define SECTION __attribute__ ((section (".text.fma4"))) #include <sysdeps/ieee754/dbl-64/e_pow.c> diff --git a/sysdeps/x86_64/fpu/multiarch/e_pow.c b/sysdeps/x86_64/fpu/multiarch/e_pow.c index a5c5d89c3e..084073c936 100644 --- a/sysdeps/x86_64/fpu/multiarch/e_pow.c +++ b/sysdeps/x86_64/fpu/multiarch/e_pow.c @@ -1,17 +1,29 @@ -#include <init-arch.h> -#include <math.h> -#include <math_private.h> +/* Multiple versions of IEEE 754 pow. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. -extern double __ieee754_pow_sse2 (double, double); -extern double __ieee754_pow_fma4 (double, double); + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. -libm_ifunc (__ieee754_pow, - HAS_ARCH_FEATURE (FMA4_Usable) - ? __ieee754_pow_fma4 - : __ieee754_pow_sse2); -strong_alias (__ieee754_pow, __pow_finite) + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. -#define __ieee754_pow __ieee754_pow_sse2 + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +extern double __redirect_ieee754_pow (double, double); +#define SYMBOL_NAME ieee754_pow +#include "ifunc-fma4.h" +libc_ifunc_redirected (__redirect_ieee754_pow, + __ieee754_pow, IFUNC_SELECTOR ()); +strong_alias (__ieee754_pow, __pow_finite) + +#define __ieee754_pow __ieee754_pow_sse2 #include <sysdeps/ieee754/dbl-64/e_pow.c> diff --git a/sysdeps/x86_64/fpu/multiarch/e_powf-fma.c b/sysdeps/x86_64/fpu/multiarch/e_powf-fma.c new file mode 100644 index 0000000000..fdf5dcc56a --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/e_powf-fma.c @@ -0,0 +1,3 @@ +#define __powf __powf_fma + +#include <sysdeps/ieee754/flt-32/e_powf.c> diff --git a/sysdeps/x86_64/fpu/multiarch/e_powf.c b/sysdeps/x86_64/fpu/multiarch/e_powf.c new file mode 100644 index 0000000000..a185006f40 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/e_powf.c @@ -0,0 +1,46 @@ +/* Multiple versions of powf. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <libm-alias-float.h> + +#define powf __redirect_powf +#define __DECL_SIMD___redirect_powf +#include <math.h> +#undef powf + +#define SYMBOL_NAME powf +#include "ifunc-fma.h" + +libc_ifunc_redirected (__redirect_powf, __powf, IFUNC_SELECTOR ()); + +#ifdef SHARED +__hidden_ver1 (__powf, __GI___powf, __redirect_powf) + __attribute__ ((visibility ("hidden"))); + +# include <shlib-compat.h> +versioned_symbol (libm, __powf, powf, GLIBC_2_27); +libm_alias_float_other (__pow, pow) +#else +libm_alias_float (__pow, pow) +#endif + +strong_alias (__powf, __ieee754_powf) +strong_alias (__powf, __powf_finite) + +#define __powf __powf_sse2 +#include <sysdeps/ieee754/flt-32/e_powf.c> diff --git a/sysdeps/x86_64/fpu/multiarch/halfulp-fma4.c b/sysdeps/x86_64/fpu/multiarch/halfulp-fma4.c deleted file mode 100644 index a00c17c016..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/halfulp-fma4.c +++ /dev/null @@ -1,4 +0,0 @@ -#define __halfulp __halfulp_fma4 -#define SECTION __attribute__ ((section (".text.fma4"))) - -#include <sysdeps/ieee754/dbl-64/halfulp.c> diff --git a/sysdeps/x86_64/fpu/multiarch/ifunc-avx-fma4.h b/sysdeps/x86_64/fpu/multiarch/ifunc-avx-fma4.h new file mode 100644 index 0000000000..a5f9375afc --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/ifunc-avx-fma4.h @@ -0,0 +1,43 @@ +/* Common definition for ifunc selections optimized with AVX, AVX2/FMA + and FMA4. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <init-arch.h> + +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (fma) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (fma4) attribute_hidden; + +static inline void * +IFUNC_SELECTOR (void) +{ + const struct cpu_features* cpu_features = __get_cpu_features (); + + if (CPU_FEATURES_ARCH_P (cpu_features, FMA_Usable) + && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)) + return OPTIMIZE (fma); + + if (CPU_FEATURES_ARCH_P (cpu_features, FMA4_Usable)) + return OPTIMIZE (fma4); + + if (CPU_FEATURES_ARCH_P (cpu_features, AVX_Usable)) + return OPTIMIZE (avx); + + return OPTIMIZE (sse2); +} diff --git a/sysdeps/x86_64/fpu/multiarch/s_ceilf.S b/sysdeps/x86_64/fpu/multiarch/ifunc-fma.h index 9a06a5c174..63a8cd221f 100644 --- a/sysdeps/x86_64/fpu/multiarch/s_ceilf.S +++ b/sysdeps/x86_64/fpu/multiarch/ifunc-fma.h @@ -1,6 +1,6 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. +/* Common definition for ifunc selections optimized with AVX2/FMA. + Copyright (C) 2017-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. - Contributed by Ulrich Drepper <drepper@gmail.come>, 2011. The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public @@ -16,23 +16,19 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#include <machine/asm.h> #include <init-arch.h> +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (fma) attribute_hidden; -ENTRY(__ceilf) - .type __ceilf, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq __ceilf_sse41(%rip), %rax - HAS_CPU_FEATURE (SSE4_1) - jnz 2f - leaq __ceilf_c(%rip), %rax -2: ret -END(__ceilf) -weak_alias (__ceilf, ceilf) +static inline void * +IFUNC_SELECTOR (void) +{ + const struct cpu_features* cpu_features = __get_cpu_features (); + if (CPU_FEATURES_ARCH_P (cpu_features, FMA_Usable) + && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)) + return OPTIMIZE (fma); -ENTRY(__ceilf_sse41) - roundss $2, %xmm0, %xmm0 - ret -END(__ceilf_sse41) + return OPTIMIZE (sse2); +} diff --git a/sysdeps/x86_64/fpu/multiarch/ifunc-fma4.h b/sysdeps/x86_64/fpu/multiarch/ifunc-fma4.h new file mode 100644 index 0000000000..a2526a2ee0 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/ifunc-fma4.h @@ -0,0 +1,39 @@ +/* Common definition for ifunc selections optimized with AVX2/FMA and + FMA4. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <init-arch.h> + +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (fma) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (fma4) attribute_hidden; + +static inline void * +IFUNC_SELECTOR (void) +{ + const struct cpu_features* cpu_features = __get_cpu_features (); + + if (CPU_FEATURES_ARCH_P (cpu_features, FMA_Usable) + && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)) + return OPTIMIZE (fma); + + if (CPU_FEATURES_ARCH_P (cpu_features, FMA4_Usable)) + return OPTIMIZE (fma4); + + return OPTIMIZE (sse2); +} diff --git a/sysdeps/x86_64/fpu/multiarch/ifunc-mathvec-avx2.h b/sysdeps/x86_64/fpu/multiarch/ifunc-mathvec-avx2.h new file mode 100644 index 0000000000..bd2d32e418 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/ifunc-mathvec-avx2.h @@ -0,0 +1,39 @@ +/* Common definition for libmathvec ifunc selections optimized with + AVX2. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <init-arch.h> + +#undef PASTER2 +#define PASTER2(x,y) x##_##y + +extern void REDIRECT_NAME (void); +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse_wrapper) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; + +static inline void * +IFUNC_SELECTOR (void) +{ + const struct cpu_features* cpu_features = __get_cpu_features (); + + if (CPU_FEATURES_ARCH_P (cpu_features, FMA_Usable) + && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)) + return OPTIMIZE (avx2); + + return OPTIMIZE (sse_wrapper); +} diff --git a/sysdeps/x86_64/fpu/multiarch/ifunc-mathvec-avx512.h b/sysdeps/x86_64/fpu/multiarch/ifunc-mathvec-avx512.h new file mode 100644 index 0000000000..174e462cfb --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/ifunc-mathvec-avx512.h @@ -0,0 +1,45 @@ +/* Common definition for libmathvec ifunc selections optimized with + AVX512. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <init-arch.h> + +#undef PASTER2 +#define PASTER2(x,y) x##_##y + +extern void REDIRECT_NAME (void); +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_wrapper) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (knl) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (skx) attribute_hidden; + +static inline void * +IFUNC_SELECTOR (void) +{ + const struct cpu_features* cpu_features = __get_cpu_features (); + + if (!CPU_FEATURES_ARCH_P (cpu_features, MathVec_Prefer_No_AVX512)) + { + if (CPU_FEATURES_ARCH_P (cpu_features, AVX512DQ_Usable)) + return OPTIMIZE (skx); + + if (CPU_FEATURES_ARCH_P (cpu_features, AVX512F_Usable)) + return OPTIMIZE (knl); + } + + return OPTIMIZE (avx2_wrapper); +} diff --git a/sysdeps/x86_64/fpu/multiarch/ifunc-mathvec-sse4_1.h b/sysdeps/x86_64/fpu/multiarch/ifunc-mathvec-sse4_1.h new file mode 100644 index 0000000000..c1e70ebfc1 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/ifunc-mathvec-sse4_1.h @@ -0,0 +1,38 @@ +/* Common definition for libmathvec ifunc selections optimized with + SSE4.1. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <init-arch.h> + +#undef PASTER2 +#define PASTER2(x,y) x##_##y + +extern void REDIRECT_NAME (void); +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4) attribute_hidden; + +static inline void * +IFUNC_SELECTOR (void) +{ + const struct cpu_features* cpu_features = __get_cpu_features (); + + if (CPU_FEATURES_CPU_P (cpu_features, SSE4_1)) + return OPTIMIZE (sse4); + + return OPTIMIZE (sse2); +} diff --git a/sysdeps/x86_64/fpu/multiarch/s_ceil.S b/sysdeps/x86_64/fpu/multiarch/ifunc-sse4_1.h index 40fa729955..a8710ba802 100644 --- a/sysdeps/x86_64/fpu/multiarch/s_ceil.S +++ b/sysdeps/x86_64/fpu/multiarch/ifunc-sse4_1.h @@ -1,6 +1,6 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. +/* Common definition for ifunc selections optimized with SSE4.1. + Copyright (C) 2017-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. - Contributed by Ulrich Drepper <drepper@gmail.come>, 2011. The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public @@ -16,23 +16,18 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#include <machine/asm.h> #include <init-arch.h> +extern __typeof (REDIRECT_NAME) OPTIMIZE (c) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse41) attribute_hidden; -ENTRY(__ceil) - .type __ceil, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq __ceil_sse41(%rip), %rax - HAS_CPU_FEATURE (SSE4_1) - jnz 2f - leaq __ceil_c(%rip), %rax -2: ret -END(__ceil) -weak_alias (__ceil, ceil) +static inline void * +IFUNC_SELECTOR (void) +{ + const struct cpu_features* cpu_features = __get_cpu_features (); + if (CPU_FEATURES_CPU_P (cpu_features, SSE4_1)) + return OPTIMIZE (sse41); -ENTRY(__ceil_sse41) - roundsd $2, %xmm0, %xmm0 - ret -END(__ceil_sse41) + return OPTIMIZE (c); +} diff --git a/sysdeps/x86_64/fpu/multiarch/mpa-fma.c b/sysdeps/x86_64/fpu/multiarch/mpa-fma.c new file mode 100644 index 0000000000..177cc2517f --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/mpa-fma.c @@ -0,0 +1,14 @@ +#define __add __add_fma +#define __mul __mul_fma +#define __sqr __sqr_fma +#define __sub __sub_fma +#define __dbl_mp __dbl_mp_fma +#define __dvd __dvd_fma + +#define NO___CPY 1 +#define NO___MP_DBL 1 +#define NO___ACR 1 +#define NO__CONST 1 +#define SECTION __attribute__ ((section (".text.fma"))) + +#include <sysdeps/ieee754/dbl-64/mpa.c> diff --git a/sysdeps/x86_64/fpu/multiarch/mpatan-fma.c b/sysdeps/x86_64/fpu/multiarch/mpatan-fma.c new file mode 100644 index 0000000000..d216f9142d --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/mpatan-fma.c @@ -0,0 +1,10 @@ +#define __mpatan __mpatan_fma +#define __add __add_fma +#define __dvd __dvd_fma +#define __mpsqrt __mpsqrt_fma +#define __mul __mul_fma +#define __sub __sub_fma +#define AVOID_MPATAN_H 1 +#define SECTION __attribute__ ((section (".text.fma"))) + +#include <sysdeps/ieee754/dbl-64/mpatan.c> diff --git a/sysdeps/x86_64/fpu/multiarch/mpatan2-fma.c b/sysdeps/x86_64/fpu/multiarch/mpatan2-fma.c new file mode 100644 index 0000000000..98df336f79 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/mpatan2-fma.c @@ -0,0 +1,9 @@ +#define __mpatan2 __mpatan2_fma +#define __add __add_fma +#define __dvd __dvd_fma +#define __mpatan __mpatan_fma +#define __mpsqrt __mpsqrt_fma +#define __mul __mul_fma +#define SECTION __attribute__ ((section (".text.fma"))) + +#include <sysdeps/ieee754/dbl-64/mpatan2.c> diff --git a/sysdeps/x86_64/fpu/multiarch/mpexp-avx.c b/sysdeps/x86_64/fpu/multiarch/mpexp-avx.c deleted file mode 100644 index 87f29c96c9..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/mpexp-avx.c +++ /dev/null @@ -1,9 +0,0 @@ -#define __mpexp __mpexp_avx -#define __add __add_avx -#define __dbl_mp __dbl_mp_avx -#define __dvd __dvd_avx -#define __mul __mul_avx -#define AVOID_MPEXP_H 1 -#define SECTION __attribute__ ((section (".text.avx"))) - -#include <sysdeps/ieee754/dbl-64/mpexp.c> diff --git a/sysdeps/x86_64/fpu/multiarch/mpexp-fma4.c b/sysdeps/x86_64/fpu/multiarch/mpexp-fma4.c deleted file mode 100644 index 07ca6e9ad0..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/mpexp-fma4.c +++ /dev/null @@ -1,9 +0,0 @@ -#define __mpexp __mpexp_fma4 -#define __add __add_fma4 -#define __dbl_mp __dbl_mp_fma4 -#define __dvd __dvd_fma4 -#define __mul __mul_fma4 -#define AVOID_MPEXP_H 1 -#define SECTION __attribute__ ((section (".text.fma4"))) - -#include <sysdeps/ieee754/dbl-64/mpexp.c> diff --git a/sysdeps/x86_64/fpu/multiarch/mplog-avx.c b/sysdeps/x86_64/fpu/multiarch/mplog-avx.c deleted file mode 100644 index fd783d9a67..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/mplog-avx.c +++ /dev/null @@ -1,8 +0,0 @@ -#define __mplog __mplog_avx -#define __add __add_avx -#define __mpexp __mpexp_avx -#define __mul __mul_avx -#define __sub __sub_avx -#define SECTION __attribute__ ((section (".text.avx"))) - -#include <sysdeps/ieee754/dbl-64/mplog.c> diff --git a/sysdeps/x86_64/fpu/multiarch/mplog-fma4.c b/sysdeps/x86_64/fpu/multiarch/mplog-fma4.c deleted file mode 100644 index b4733118d7..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/mplog-fma4.c +++ /dev/null @@ -1,8 +0,0 @@ -#define __mplog __mplog_fma4 -#define __add __add_fma4 -#define __mpexp __mpexp_fma4 -#define __mul __mul_fma4 -#define __sub __sub_fma4 -#define SECTION __attribute__ ((section (".text.fma4"))) - -#include <sysdeps/ieee754/dbl-64/mplog.c> diff --git a/sysdeps/x86_64/fpu/multiarch/mpsqrt-fma.c b/sysdeps/x86_64/fpu/multiarch/mpsqrt-fma.c new file mode 100644 index 0000000000..44d7a23ae3 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/mpsqrt-fma.c @@ -0,0 +1,8 @@ +#define __mpsqrt __mpsqrt_fma +#define __dbl_mp __dbl_mp_fma +#define __mul __mul_fma +#define __sub __sub_fma +#define AVOID_MPSQRT_H 1 +#define SECTION __attribute__ ((section (".text.fma"))) + +#include <sysdeps/ieee754/dbl-64/mpsqrt.c> diff --git a/sysdeps/x86_64/fpu/multiarch/mptan-fma.c b/sysdeps/x86_64/fpu/multiarch/mptan-fma.c new file mode 100644 index 0000000000..d1a691413c --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/mptan-fma.c @@ -0,0 +1,7 @@ +#define __mptan __mptan_fma +#define __c32 __c32_fma +#define __dvd __dvd_fma +#define __mpranred __mpranred_fma +#define SECTION __attribute__ ((section (".text.fma"))) + +#include <sysdeps/ieee754/dbl-64/mptan.c> diff --git a/sysdeps/x86_64/fpu/multiarch/s_atan-avx.c b/sysdeps/x86_64/fpu/multiarch/s_atan-avx.c index b5cb9c3a75..41816bfe6c 100644 --- a/sysdeps/x86_64/fpu/multiarch/s_atan-avx.c +++ b/sysdeps/x86_64/fpu/multiarch/s_atan-avx.c @@ -1,4 +1,4 @@ -#define atan __atan_avx +#define __atan __atan_avx #define __add __add_avx #define __dbl_mp __dbl_mp_avx #define __mul __mul_avx diff --git a/sysdeps/x86_64/fpu/multiarch/s_atan-fma.c b/sysdeps/x86_64/fpu/multiarch/s_atan-fma.c new file mode 100644 index 0000000000..363e32bcbd --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/s_atan-fma.c @@ -0,0 +1,9 @@ +#define __atan __atan_fma +#define __add __add_fma +#define __dbl_mp __dbl_mp_fma +#define __mpatan __mpatan_fma +#define __mul __mul_fma +#define __sub __sub_fma +#define SECTION __attribute__ ((section (".text.fma"))) + +#include <sysdeps/ieee754/dbl-64/s_atan.c> diff --git a/sysdeps/x86_64/fpu/multiarch/s_atan-fma4.c b/sysdeps/x86_64/fpu/multiarch/s_atan-fma4.c index 9e83e6cdab..ad8d3af579 100644 --- a/sysdeps/x86_64/fpu/multiarch/s_atan-fma4.c +++ b/sysdeps/x86_64/fpu/multiarch/s_atan-fma4.c @@ -1,4 +1,4 @@ -#define atan __atan_fma4 +#define __atan __atan_fma4 #define __add __add_fma4 #define __dbl_mp __dbl_mp_fma4 #define __mpatan __mpatan_fma4 diff --git a/sysdeps/x86_64/fpu/multiarch/s_atan.c b/sysdeps/x86_64/fpu/multiarch/s_atan.c index 742e95cb96..f9ce8549ab 100644 --- a/sysdeps/x86_64/fpu/multiarch/s_atan.c +++ b/sysdeps/x86_64/fpu/multiarch/s_atan.c @@ -1,15 +1,30 @@ -#include <init-arch.h> -#include <math.h> +/* Multiple versions of atan. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. -extern double __atan_sse2 (double); -extern double __atan_avx (double); -extern double __atan_fma4 (double); + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. -libm_ifunc (atan, (HAS_ARCH_FEATURE (FMA4_Usable) ? __atan_fma4 : - HAS_ARCH_FEATURE (AVX_Usable) - ? __atan_avx : __atan_sse2)); + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. -#define atan __atan_sse2 + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ +#include <libm-alias-double.h> +extern double __redirect_atan (double); + +#define SYMBOL_NAME atan +#include "ifunc-avx-fma4.h" + +libc_ifunc_redirected (__redirect_atan, __atan, IFUNC_SELECTOR ()); +libm_alias_double (__atan, atan) + +#define __atan __atan_sse2 #include <sysdeps/ieee754/dbl-64/s_atan.c> diff --git a/sysdeps/x86_64/multiarch/mempcpy-avx512-no-vzeroupper.S b/sysdeps/x86_64/fpu/multiarch/s_ceil-sse4_1.S index fcc0945ea7..e90f05b42f 100644 --- a/sysdeps/x86_64/multiarch/mempcpy-avx512-no-vzeroupper.S +++ b/sysdeps/x86_64/fpu/multiarch/s_ceil-sse4_1.S @@ -1,6 +1,6 @@ -/* mempcpy optimized with AVX512 for KNL hardware. - Copyright (C) 2016 Free Software Foundation, Inc. +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@gmail.come>, 2011. The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public @@ -16,7 +16,10 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#define USE_AS_MEMPCPY -#define MEMCPY __mempcpy_avx512_no_vzeroupper -#define MEMCPY_CHK __mempcpy_chk_avx512_no_vzeroupper -#include "memcpy-avx512-no-vzeroupper.S" +#include <sysdep.h> + + .section .text.sse4.1,"ax",@progbits +ENTRY(__ceil_sse41) + roundsd $10, %xmm0, %xmm0 + ret +END(__ceil_sse41) diff --git a/sysdeps/x86_64/fpu/multiarch/s_ceil.c b/sysdeps/x86_64/fpu/multiarch/s_ceil.c new file mode 100644 index 0000000000..070fcdddea --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/s_ceil.c @@ -0,0 +1,31 @@ +/* Multiple versions of __ceil. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <libm-alias-double.h> + +#define ceil __redirect_ceil +#define __ceil __redirect___ceil +#include <math.h> +#undef ceil +#undef __ceil + +#define SYMBOL_NAME ceil +#include "ifunc-sse4_1.h" + +libc_ifunc_redirected (__redirect_ceil, __ceil, IFUNC_SELECTOR ()); +libm_alias_double (__ceil, ceil) diff --git a/sysdeps/x86_64/fpu/multiarch/s_ceilf-sse4_1.S b/sysdeps/x86_64/fpu/multiarch/s_ceilf-sse4_1.S new file mode 100644 index 0000000000..c3bd24c5ae --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/s_ceilf-sse4_1.S @@ -0,0 +1,25 @@ +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@gmail.come>, 2011. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + .section .text.sse4.1,"ax",@progbits +ENTRY(__ceilf_sse41) + roundss $10, %xmm0, %xmm0 + ret +END(__ceilf_sse41) diff --git a/sysdeps/x86_64/fpu/multiarch/s_ceilf.c b/sysdeps/x86_64/fpu/multiarch/s_ceilf.c new file mode 100644 index 0000000000..db0c6c4bc3 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/s_ceilf.c @@ -0,0 +1,31 @@ +/* Multiple versions of __ceilf. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <libm-alias-float.h> + +#define ceilf __redirect_ceilf +#define __ceilf __redirect___ceilf +#include <math.h> +#undef ceilf +#undef __ceilf + +#define SYMBOL_NAME ceilf +#include "ifunc-sse4_1.h" + +libc_ifunc_redirected (__redirect_ceilf, __ceilf, IFUNC_SELECTOR ()); +libm_alias_float (__ceil, ceil) diff --git a/sysdeps/x86_64/fpu/multiarch/s_cosf-fma.c b/sysdeps/x86_64/fpu/multiarch/s_cosf-fma.c new file mode 100644 index 0000000000..5f9191aef9 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/s_cosf-fma.c @@ -0,0 +1,2 @@ +#define COSF __cosf_fma +#include <sysdeps/ieee754/flt-32/s_cosf.c> diff --git a/sysdeps/x86_64/fpu/multiarch/s_cosf-sse2.c b/sysdeps/x86_64/fpu/multiarch/s_cosf-sse2.c new file mode 100644 index 0000000000..87cf42a82a --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/s_cosf-sse2.c @@ -0,0 +1,2 @@ +#define COSF __cosf_sse2 +#include <sysdeps/ieee754/flt-32/s_cosf.c> diff --git a/sysdeps/x86_64/fpu/multiarch/s_cosf.c b/sysdeps/x86_64/fpu/multiarch/s_cosf.c new file mode 100644 index 0000000000..33959d3d01 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/s_cosf.c @@ -0,0 +1,28 @@ +/* Multiple versions of cosf. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <libm-alias-float.h> + +extern float __redirect_cosf (float); + +#define SYMBOL_NAME cosf +#include "ifunc-fma.h" + +libc_ifunc_redirected (__redirect_cosf, __cosf, IFUNC_SELECTOR ()); + +libm_alias_float (__cos, cos) diff --git a/sysdeps/x86_64/fpu/multiarch/s_floor-sse4_1.S b/sysdeps/x86_64/fpu/multiarch/s_floor-sse4_1.S new file mode 100644 index 0000000000..b3c7aa29ff --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/s_floor-sse4_1.S @@ -0,0 +1,25 @@ +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@gmail.come>, 2011. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + .section .text.sse4.1,"ax",@progbits +ENTRY(__floor_sse41) + roundsd $9, %xmm0, %xmm0 + ret +END(__floor_sse41) diff --git a/sysdeps/x86_64/fpu/multiarch/s_floor.c b/sysdeps/x86_64/fpu/multiarch/s_floor.c new file mode 100644 index 0000000000..58f8ed8eaf --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/s_floor.c @@ -0,0 +1,31 @@ +/* Multiple versions of __floor. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <libm-alias-double.h> + +#define floor __redirect_floor +#define __floor __redirect___floor +#include <math.h> +#undef floor +#undef __floor + +#define SYMBOL_NAME floor +#include "ifunc-sse4_1.h" + +libc_ifunc_redirected (__redirect_floor, __floor, IFUNC_SELECTOR ()); +libm_alias_double (__floor, floor) diff --git a/sysdeps/x86_64/fpu/multiarch/s_floorf-sse4_1.S b/sysdeps/x86_64/fpu/multiarch/s_floorf-sse4_1.S new file mode 100644 index 0000000000..43461d3e6b --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/s_floorf-sse4_1.S @@ -0,0 +1,25 @@ +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@gmail.come>, 2011. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + .section .text.sse4.1,"ax",@progbits +ENTRY(__floorf_sse41) + roundss $9, %xmm0, %xmm0 + ret +END(__floorf_sse41) diff --git a/sysdeps/x86_64/fpu/multiarch/s_floorf.c b/sysdeps/x86_64/fpu/multiarch/s_floorf.c new file mode 100644 index 0000000000..5ef2fec2e3 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/s_floorf.c @@ -0,0 +1,31 @@ +/* Multiple versions of __floorf. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <libm-alias-float.h> + +#define floorf __redirect_floorf +#define __floorf __redirect___floorf +#include <math.h> +#undef floorf +#undef __floorf + +#define SYMBOL_NAME floorf +#include "ifunc-sse4_1.h" + +libc_ifunc_redirected (__redirect_floorf, __floorf, IFUNC_SELECTOR ()); +libm_alias_float (__floor, floor) diff --git a/sysdeps/x86_64/fpu/multiarch/s_fma.c b/sysdeps/x86_64/fpu/multiarch/s_fma.c index 1de1a84cbe..875c76d372 100644 --- a/sysdeps/x86_64/fpu/multiarch/s_fma.c +++ b/sysdeps/x86_64/fpu/multiarch/s_fma.c @@ -1,5 +1,5 @@ /* FMA version of fma. - Copyright (C) 2009-2016 Free Software Foundation, Inc. + Copyright (C) 2009-2018 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. @@ -20,6 +20,7 @@ #include <config.h> #include <math.h> #include <init-arch.h> +#include <libm-alias-double.h> extern double __fma_sse2 (double x, double y, double z) attribute_hidden; @@ -43,7 +44,7 @@ __fma_fma4 (double x, double y, double z) libm_ifunc (__fma, HAS_ARCH_FEATURE (FMA_Usable) ? __fma_fma3 : (HAS_ARCH_FEATURE (FMA4_Usable) ? __fma_fma4 : __fma_sse2)); -weak_alias (__fma, fma) +libm_alias_double (__fma, fma) #define __fma __fma_sse2 diff --git a/sysdeps/x86_64/fpu/multiarch/s_fmaf.c b/sysdeps/x86_64/fpu/multiarch/s_fmaf.c index 8905e4b54f..5f4c2ec0be 100644 --- a/sysdeps/x86_64/fpu/multiarch/s_fmaf.c +++ b/sysdeps/x86_64/fpu/multiarch/s_fmaf.c @@ -1,5 +1,5 @@ /* FMA version of fmaf. - Copyright (C) 2009-2016 Free Software Foundation, Inc. + Copyright (C) 2009-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -19,6 +19,7 @@ #include <config.h> #include <math.h> #include <init-arch.h> +#include <libm-alias-float.h> extern float __fmaf_sse2 (float x, float y, float z) attribute_hidden; @@ -42,7 +43,7 @@ __fmaf_fma4 (float x, float y, float z) libm_ifunc (__fmaf, HAS_ARCH_FEATURE (FMA_Usable) ? __fmaf_fma3 : (HAS_ARCH_FEATURE (FMA4_Usable) ? __fmaf_fma4 : __fmaf_sse2)); -weak_alias (__fmaf, fmaf) +libm_alias_float (__fma, fma) #define __fmaf __fmaf_sse2 diff --git a/sysdeps/x86_64/fpu/multiarch/s_nearbyint-sse4_1.S b/sysdeps/x86_64/fpu/multiarch/s_nearbyint-sse4_1.S new file mode 100644 index 0000000000..f9ac36e4f0 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/s_nearbyint-sse4_1.S @@ -0,0 +1,25 @@ +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@gmail.come>, 2011. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + .section .text.sse4.1,"ax",@progbits +ENTRY(__nearbyint_sse41) + roundsd $0xc, %xmm0, %xmm0 + ret +END(__nearbyint_sse41) diff --git a/sysdeps/x86_64/fpu/multiarch/s_nearbyint.S b/sysdeps/x86_64/fpu/multiarch/s_nearbyint.S deleted file mode 100644 index 5091cf5813..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/s_nearbyint.S +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - Contributed by Ulrich Drepper <drepper@gmail.come>, 2011. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <machine/asm.h> -#include <init-arch.h> - - -ENTRY(__nearbyint) - .type __nearbyint, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq __nearbyint_sse41(%rip), %rax - HAS_CPU_FEATURE (SSE4_1) - jnz 2f - leaq __nearbyint_c(%rip), %rax -2: ret -END(__nearbyint) -weak_alias (__nearbyint, nearbyint) - - -ENTRY(__nearbyint_sse41) - roundsd $0xc, %xmm0, %xmm0 - ret -END(__nearbyint_sse41) diff --git a/sysdeps/x86_64/fpu/multiarch/s_nearbyint.c b/sysdeps/x86_64/fpu/multiarch/s_nearbyint.c new file mode 100644 index 0000000000..d92945fd14 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/s_nearbyint.c @@ -0,0 +1,32 @@ +/* Multiple versions of __nearbyint. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <libm-alias-double.h> + +#define nearbyint __redirect_nearbyint +#define __nearbyint __redirect___nearbyint +#include <math.h> +#undef nearbyint +#undef __nearbyint + +#define SYMBOL_NAME nearbyint +#include "ifunc-sse4_1.h" + +libc_ifunc_redirected (__redirect_nearbyint, __nearbyint, + IFUNC_SELECTOR ()); +libm_alias_double (__nearbyint, nearbyint) diff --git a/sysdeps/x86_64/fpu/multiarch/s_nearbyintf-sse4_1.S b/sysdeps/x86_64/fpu/multiarch/s_nearbyintf-sse4_1.S new file mode 100644 index 0000000000..2f427da778 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/s_nearbyintf-sse4_1.S @@ -0,0 +1,25 @@ +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@gmail.come>, 2011. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + .section .text.sse4.1,"ax",@progbits +ENTRY(__nearbyintf_sse41) + roundss $0xc, %xmm0, %xmm0 + ret +END(__nearbyintf_sse41) diff --git a/sysdeps/x86_64/fpu/multiarch/s_nearbyintf.S b/sysdeps/x86_64/fpu/multiarch/s_nearbyintf.S deleted file mode 100644 index 4a13700001..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/s_nearbyintf.S +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - Contributed by Ulrich Drepper <drepper@gmail.come>, 2011. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <machine/asm.h> -#include <init-arch.h> - - -ENTRY(__nearbyintf) - .type __nearbyintf, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq __nearbyintf_sse41(%rip), %rax - HAS_CPU_FEATURE (SSE4_1) - jnz 2f - leaq __nearbyintf_c(%rip), %rax -2: ret -END(__nearbyintf) -weak_alias (__nearbyintf, nearbyintf) - - -ENTRY(__nearbyintf_sse41) - roundss $0xc, %xmm0, %xmm0 - ret -END(__nearbyintf_sse41) diff --git a/sysdeps/x86_64/fpu/multiarch/s_rint.S b/sysdeps/x86_64/fpu/multiarch/s_nearbyintf.c index 1c0d1e14b7..ba7be27956 100644 --- a/sysdeps/x86_64/fpu/multiarch/s_rint.S +++ b/sysdeps/x86_64/fpu/multiarch/s_nearbyintf.c @@ -1,6 +1,6 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. +/* Multiple versions of __nearbyintf. + Copyright (C) 2017-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. - Contributed by Ulrich Drepper <drepper@gmail.come>, 2011. The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public @@ -16,23 +16,17 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#include <machine/asm.h> -#include <init-arch.h> +#include <libm-alias-float.h> +#define nearbyintf __redirect_nearbyintf +#define __nearbyintf __redirect___nearbyintf +#include <math.h> +#undef nearbyintf +#undef __nearbyintf -ENTRY(__rint) - .type __rint, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq __rint_sse41(%rip), %rax - HAS_CPU_FEATURE (SSE4_1) - jnz 2f - leaq __rint_c(%rip), %rax -2: ret -END(__rint) -weak_alias (__rint, rint) +#define SYMBOL_NAME nearbyintf +#include "ifunc-sse4_1.h" - -ENTRY(__rint_sse41) - roundsd $4, %xmm0, %xmm0 - ret -END(__rint_sse41) +libc_ifunc_redirected (__redirect_nearbyintf, __nearbyintf, + IFUNC_SELECTOR ()); +libm_alias_float (__nearbyint, nearbyint) diff --git a/sysdeps/x86_64/fpu/multiarch/s_rint-sse4_1.S b/sysdeps/x86_64/fpu/multiarch/s_rint-sse4_1.S new file mode 100644 index 0000000000..7d7568a1a0 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/s_rint-sse4_1.S @@ -0,0 +1,25 @@ +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@gmail.come>, 2011. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + .section .text.sse4.1,"ax",@progbits +ENTRY(__rint_sse41) + roundsd $4, %xmm0, %xmm0 + ret +END(__rint_sse41) diff --git a/sysdeps/x86_64/fpu/multiarch/s_rint.c b/sysdeps/x86_64/fpu/multiarch/s_rint.c new file mode 100644 index 0000000000..f1cb2fed0c --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/s_rint.c @@ -0,0 +1,31 @@ +/* Multiple versions of __rint. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <libm-alias-double.h> + +#define rint __redirect_rint +#define __rint __redirect___rint +#include <math.h> +#undef rint +#undef __rint + +#define SYMBOL_NAME rint +#include "ifunc-sse4_1.h" + +libc_ifunc_redirected (__redirect_rint, __rint, IFUNC_SELECTOR ()); +libm_alias_double (__rint, rint) diff --git a/sysdeps/x86_64/fpu/multiarch/s_rintf-sse4_1.S b/sysdeps/x86_64/fpu/multiarch/s_rintf-sse4_1.S new file mode 100644 index 0000000000..ef5d896f55 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/s_rintf-sse4_1.S @@ -0,0 +1,25 @@ +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@gmail.come>, 2011. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + .section .text.sse4.1,"ax",@progbits +ENTRY(__rintf_sse41) + roundss $4, %xmm0, %xmm0 + ret +END(__rintf_sse41) diff --git a/sysdeps/x86_64/fpu/multiarch/s_rintf.S b/sysdeps/x86_64/fpu/multiarch/s_rintf.S deleted file mode 100644 index 8e42fa561f..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/s_rintf.S +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - Contributed by Ulrich Drepper <drepper@gmail.come>, 2011. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <machine/asm.h> -#include <init-arch.h> - - -ENTRY(__rintf) - .type __rintf, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq __rintf_sse41(%rip), %rax - HAS_CPU_FEATURE (SSE4_1) - jnz 2f - leaq __rintf_c(%rip), %rax -2: ret -END(__rintf) -weak_alias (__rintf, rintf) - - -ENTRY(__rintf_sse41) - roundss $4, %xmm0, %xmm0 - ret -END(__rintf_sse41) diff --git a/sysdeps/x86_64/fpu/multiarch/s_rintf.c b/sysdeps/x86_64/fpu/multiarch/s_rintf.c new file mode 100644 index 0000000000..41323b3b5b --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/s_rintf.c @@ -0,0 +1,31 @@ +/* Multiple versions of __rintf. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <libm-alias-float.h> + +#define rintf __redirect_rintf +#define __rintf __redirect___rintf +#include <math.h> +#undef rintf +#undef __rintf + +#define SYMBOL_NAME rintf +#include "ifunc-sse4_1.h" + +libc_ifunc_redirected (__redirect_rintf, __rintf, IFUNC_SELECTOR ()); +libm_alias_float (__rint, rint) diff --git a/sysdeps/x86_64/fpu/multiarch/s_sin-fma.c b/sysdeps/x86_64/fpu/multiarch/s_sin-fma.c new file mode 100644 index 0000000000..15f3c394d5 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/s_sin-fma.c @@ -0,0 +1,11 @@ +#define __cos __cos_fma +#define __sin __sin_fma +#define __docos __docos_fma +#define __dubsin __dubsin_fma +#define __mpcos __mpcos_fma +#define __mpcos1 __mpcos1_fma +#define __mpsin __mpsin_fma +#define __mpsin1 __mpsin1_fma +#define SECTION __attribute__ ((section (".text.fma"))) + +#include <sysdeps/ieee754/dbl-64/s_sin.c> diff --git a/sysdeps/x86_64/fpu/multiarch/s_sin.c b/sysdeps/x86_64/fpu/multiarch/s_sin.c index 8ffd3e7125..b289269240 100644 --- a/sysdeps/x86_64/fpu/multiarch/s_sin.c +++ b/sysdeps/x86_64/fpu/multiarch/s_sin.c @@ -1,26 +1,39 @@ -#include <init-arch.h> -#include <math.h> -#undef NAN - -extern double __cos_sse2 (double); -extern double __sin_sse2 (double); -extern double __cos_avx (double); -extern double __sin_avx (double); -extern double __cos_fma4 (double); -extern double __sin_fma4 (double); - -libm_ifunc (__cos, (HAS_ARCH_FEATURE (FMA4_Usable) ? __cos_fma4 : - HAS_ARCH_FEATURE (AVX_Usable) - ? __cos_avx : __cos_sse2)); -weak_alias (__cos, cos) - -libm_ifunc (__sin, (HAS_ARCH_FEATURE (FMA4_Usable) ? __sin_fma4 : - HAS_ARCH_FEATURE (AVX_Usable) - ? __sin_avx : __sin_sse2)); -weak_alias (__sin, sin) +/* Multiple versions of sin and cos. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. -#define __cos __cos_sse2 -#define __sin __sin_sse2 + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <libm-alias-double.h> + +extern double __redirect_sin (double); +extern double __redirect_cos (double); +#define SYMBOL_NAME sin +#include "ifunc-avx-fma4.h" +libc_ifunc_redirected (__redirect_sin, __sin, IFUNC_SELECTOR ()); +libm_alias_double (__sin, sin) + +#undef SYMBOL_NAME +#define SYMBOL_NAME cos +#include "ifunc-avx-fma4.h" + +libc_ifunc_redirected (__redirect_cos, __cos, IFUNC_SELECTOR ()); +libm_alias_double (__cos, cos) + +#define __cos __cos_sse2 +#define __sin __sin_sse2 #include <sysdeps/ieee754/dbl-64/s_sin.c> diff --git a/sysdeps/x86_64/fpu/multiarch/s_sincosf-fma.c b/sysdeps/x86_64/fpu/multiarch/s_sincosf-fma.c new file mode 100644 index 0000000000..64abe7abca --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/s_sincosf-fma.c @@ -0,0 +1,240 @@ +/* Compute sine and cosine of argument optimized with vector. + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <errno.h> +#include <math.h> +#include <math_private.h> +#include <x86intrin.h> +#include <libm-alias-float.h> +#include "s_sincosf.h" + +#define SINCOSF __sincosf_fma + +#ifndef SINCOSF +# define SINCOSF_FUNC __sincosf +#else +# define SINCOSF_FUNC SINCOSF +#endif + +/* Chebyshev constants for sin and cos, range -PI/4 - PI/4. */ +static const __v2df V0 = { -0x1.5555555551cd9p-3, -0x1.ffffffffe98aep-2}; +static const __v2df V1 = { 0x1.1111110c2688bp-7, 0x1.55555545c50c7p-5 }; +static const __v2df V2 = { -0x1.a019f8b4bd1f9p-13, -0x1.6c16b348b6874p-10 }; +static const __v2df V3 = { 0x1.71d7264e6b5b4p-19, 0x1.a00eb9ac43ccp-16 }; +static const __v2df V4 = { -0x1.a947e1674b58ap-26, -0x1.23c97dd8844d7p-22 }; + +/* Chebyshev constants for sin and cos, range 2^-27 - 2^-5. */ +static const __v2df VC0 = { -0x1.555555543d49dp-3, -0x1.fffffff5cc6fdp-2 }; +static const __v2df VC1 = { 0x1.110f475cec8c5p-7, 0x1.55514b178dac5p-5 }; + +static const __v2df v2ones = { 1.0, 1.0 }; + +/* Compute the sine and cosine values using Chebyshev polynomials where + THETA is the range reduced absolute value of the input + and it is less than Pi/4, + N is calculated as trunc(|x|/(Pi/4)) + 1 and it is used to decide + whether a sine or cosine approximation is more accurate and + SIGNBIT is used to add the correct sign after the Chebyshev + polynomial is computed. */ +static void +reduced_sincos (const double theta, const unsigned int n, + const unsigned int signbit, float *sinx, float *cosx) +{ + __v2df v2x, v2sx, v2cx; + const __v2df v2theta = { theta, theta }; + const __v2df v2theta2 = v2theta * v2theta; + /* Here sinf() and cosf() are calculated using sin Chebyshev polynomial: + x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))). */ + v2x = V3 + v2theta2 * V4; /* S3+x^2*S4. */ + v2x = V2 + v2theta2 * v2x; /* S2+x^2*(S3+x^2*S4). */ + v2x = V1 + v2theta2 * v2x; /* S1+x^2*(S2+x^2*(S3+x^2*S4)). */ + v2x = V0 + v2theta2 * v2x; /* S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4))). */ + v2x = v2theta2 * v2x; + v2cx = v2ones + v2x; + v2sx = v2theta + v2theta * v2x; + /* We are operating on |x|, so we need to add back the original + signbit for sinf. */ + /* Determine positive or negative primary interval. */ + /* Are we in the primary interval of sin or cos? */ + if ((n & 2) == 0) + { + const __v2df v2sign = + { + ones[((n >> 2) & 1) ^ signbit], + ones[((n + 2) >> 2) & 1] + }; + v2cx[0] = v2sx[0]; + v2cx *= v2sign; + __v4sf v4sx = _mm_cvtpd_ps (v2cx); + *sinx = v4sx[0]; + *cosx = v4sx[1]; + } + else + { + const __v2df v2sign = + { + ones[((n + 2) >> 2) & 1], + ones[((n >> 2) & 1) ^ signbit] + }; + v2cx[0] = v2sx[0]; + v2cx *= v2sign; + __v4sf v4sx = _mm_cvtpd_ps (v2cx); + *sinx = v4sx[1]; + *cosx = v4sx[0]; + } +} + +void +SINCOSF_FUNC (float x, float *sinx, float *cosx) +{ + double theta = x; + double abstheta = fabs (theta); + uint32_t ix, xi; + GET_FLOAT_WORD (xi, x); + /* |x| */ + ix = xi & 0x7fffffff; + /* If |x|< Pi/4. */ + if (ix < 0x3f490fdb) + { + if (ix >= 0x3d000000) /* |x| >= 2^-5. */ + { + __v2df v2x, v2sx, v2cx; + const __v2df v2theta = { theta, theta }; + const __v2df v2theta2 = v2theta * v2theta; + /* Chebyshev polynomial of the form for sin and cos. */ + v2x = V3 + v2theta2 * V4; + v2x = V2 + v2theta2 * v2x; + v2x = V1 + v2theta2 * v2x; + v2x = V0 + v2theta2 * v2x; + v2x = v2theta2 * v2x; + v2cx = v2ones + v2x; + v2sx = v2theta + v2theta * v2x; + v2cx[0] = v2sx[0]; + __v4sf v4sx = _mm_cvtpd_ps (v2cx); + *sinx = v4sx[0]; + *cosx = v4sx[1]; + } + else if (ix >= 0x32000000) /* |x| >= 2^-27. */ + { + /* A simpler Chebyshev approximation is close enough for this range: + for sin: x+x^3*(SS0+x^2*SS1) + for cos: 1.0+x^2*(CC0+x^3*CC1). */ + __v2df v2x, v2sx, v2cx; + const __v2df v2theta = { theta, theta }; + const __v2df v2theta2 = v2theta * v2theta; + v2x = VC0 + v2theta * v2theta2 * VC1; + v2x = v2theta2 * v2x; + v2cx = v2ones + v2x; + v2sx = v2theta + v2theta * v2x; + v2cx[0] = v2sx[0]; + __v4sf v4sx = _mm_cvtpd_ps (v2cx); + *sinx = v4sx[0]; + *cosx = v4sx[1]; + } + else + { + /* Handle some special cases. */ + if (ix) + *sinx = theta - (theta * SMALL); + else + *sinx = theta; + *cosx = 1.0 - abstheta; + } + } + else /* |x| >= Pi/4. */ + { + unsigned int signbit = xi >> 31; + if (ix < 0x40e231d6) /* |x| < 9*Pi/4. */ + { + /* There are cases where FE_UPWARD rounding mode can + produce a result of abstheta * inv_PI_4 == 9, + where abstheta < 9pi/4, so the domain for + pio2_table must go to 5 (9 / 2 + 1). */ + unsigned int n = (abstheta * inv_PI_4) + 1; + theta = abstheta - pio2_table[n / 2]; + reduced_sincos (theta, n, signbit, sinx, cosx); + } + else if (ix < 0x7f800000) + { + if (ix < 0x4b000000) /* |x| < 2^23. */ + { + unsigned int n = ((unsigned int) (abstheta * inv_PI_4)) + 1; + double x = n / 2; + theta = (abstheta - x * PI_2_hi) - x * PI_2_lo; + /* Argument reduction needed. */ + reduced_sincos (theta, n, signbit, sinx, cosx); + } + else /* |x| >= 2^23. */ + { + x = fabsf (x); + int exponent + = (ix >> FLOAT_EXPONENT_SHIFT) - FLOAT_EXPONENT_BIAS; + exponent += 3; + exponent /= 28; + double a = invpio4_table[exponent] * x; + double b = invpio4_table[exponent + 1] * x; + double c = invpio4_table[exponent + 2] * x; + double d = invpio4_table[exponent + 3] * x; + uint64_t l = a; + l &= ~0x7; + a -= l; + double e = a + b; + l = e; + e = a - l; + if (l & 1) + { + e -= 1.0; + e += b; + e += c; + e += d; + e *= M_PI_4; + reduced_sincos (e, l + 1, signbit, sinx, cosx); + } + else + { + e += b; + e += c; + e += d; + if (e <= 1.0) + { + e *= M_PI_4; + reduced_sincos (e, l + 1, signbit, sinx, cosx); + } + else + { + l++; + e -= 2.0; + e *= M_PI_4; + reduced_sincos (e, l + 1, signbit, sinx, cosx); + } + } + } + } + else + { + if (ix == 0x7f800000) + __set_errno (EDOM); + /* sin/cos(Inf or NaN) is NaN. */ + *sinx = *cosx = x - x; + } + } +} + +#ifndef SINCOSF +libm_alias_float (__sincos, sincos) +#endif diff --git a/sysdeps/x86_64/fpu/multiarch/s_sincosf-sse2.S b/sysdeps/x86_64/fpu/multiarch/s_sincosf-sse2.S new file mode 100644 index 0000000000..51d012bb12 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/s_sincosf-sse2.S @@ -0,0 +1,2 @@ +#define __sincosf __sincosf_sse2 +#include <sysdeps/x86_64/fpu/s_sincosf.S> diff --git a/sysdeps/x86_64/fpu/test-float-vlen4.c b/sysdeps/x86_64/fpu/multiarch/s_sincosf.c index f6a4cf5c1e..6cb4295558 100644 --- a/sysdeps/x86_64/fpu/test-float-vlen4.c +++ b/sysdeps/x86_64/fpu/multiarch/s_sincosf.c @@ -1,5 +1,5 @@ -/* Tests for SSE ISA versions of vector math functions. - Copyright (C) 2014-2016 Free Software Foundation, Inc. +/* Multiple versions of sincosf. + Copyright (C) 2017 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -16,13 +16,13 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#include "test-float-vlen4.h" +#include <libm-alias-float.h> -#define TEST_VECTOR_cosf 1 -#define TEST_VECTOR_sinf 1 -#define TEST_VECTOR_sincosf 1 -#define TEST_VECTOR_logf 1 -#define TEST_VECTOR_expf 1 -#define TEST_VECTOR_powf 1 +extern void __redirect_sincosf (float, float *, float *); -#include "libm-test.c" +#define SYMBOL_NAME sincosf +#include "ifunc-fma.h" + +libc_ifunc_redirected (__redirect_sincosf, __sincosf, IFUNC_SELECTOR ()); + +libm_alias_float (__sincos, sincos) diff --git a/sysdeps/x86_64/fpu/multiarch/s_sinf-fma.c b/sysdeps/x86_64/fpu/multiarch/s_sinf-fma.c new file mode 100644 index 0000000000..34440ebf4a --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/s_sinf-fma.c @@ -0,0 +1,2 @@ +#define SINF __sinf_fma +#include <sysdeps/ieee754/flt-32/s_sinf.c> diff --git a/sysdeps/x86_64/fpu/multiarch/s_sinf-sse2.c b/sysdeps/x86_64/fpu/multiarch/s_sinf-sse2.c new file mode 100644 index 0000000000..74e32c98db --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/s_sinf-sse2.c @@ -0,0 +1,2 @@ +#define SINF __sinf_sse2 +#include <sysdeps/ieee754/flt-32/s_sinf.c> diff --git a/sysdeps/x86_64/fpu/multiarch/s_sinf.c b/sysdeps/x86_64/fpu/multiarch/s_sinf.c new file mode 100644 index 0000000000..4fdfbd8d3e --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/s_sinf.c @@ -0,0 +1,28 @@ +/* Multiple versions of sinf. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <libm-alias-float.h> + +extern float __redirect_sinf (float); + +#define SYMBOL_NAME sinf +#include "ifunc-fma.h" + +libc_ifunc_redirected (__redirect_sinf, __sinf, IFUNC_SELECTOR ()); + +libm_alias_float (__sin, sin) diff --git a/sysdeps/x86_64/fpu/multiarch/s_tan-avx.c b/sysdeps/x86_64/fpu/multiarch/s_tan-avx.c index 53de5d3c98..5ee29a9a06 100644 --- a/sysdeps/x86_64/fpu/multiarch/s_tan-avx.c +++ b/sysdeps/x86_64/fpu/multiarch/s_tan-avx.c @@ -1,4 +1,4 @@ -#define tan __tan_avx +#define __tan __tan_avx #define __dbl_mp __dbl_mp_avx #define __sub __sub_avx #define SECTION __attribute__ ((section (".text.avx"))) diff --git a/sysdeps/x86_64/fpu/multiarch/s_tan-fma.c b/sysdeps/x86_64/fpu/multiarch/s_tan-fma.c new file mode 100644 index 0000000000..1a1b9d2490 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/s_tan-fma.c @@ -0,0 +1,8 @@ +#define __tan __tan_fma +#define __dbl_mp __dbl_mp_fma +#define __mpranred __mpranred_fma +#define __mptan __mptan_fma +#define __sub __sub_fma +#define SECTION __attribute__ ((section (".text.fma"))) + +#include <sysdeps/ieee754/dbl-64/s_tan.c> diff --git a/sysdeps/x86_64/fpu/multiarch/s_tan-fma4.c b/sysdeps/x86_64/fpu/multiarch/s_tan-fma4.c index a805440b46..e4e9f6cb85 100644 --- a/sysdeps/x86_64/fpu/multiarch/s_tan-fma4.c +++ b/sysdeps/x86_64/fpu/multiarch/s_tan-fma4.c @@ -1,4 +1,4 @@ -#define tan __tan_fma4 +#define __tan __tan_fma4 #define __dbl_mp __dbl_mp_fma4 #define __mpranred __mpranred_fma4 #define __mptan __mptan_fma4 diff --git a/sysdeps/x86_64/fpu/multiarch/s_tan.c b/sysdeps/x86_64/fpu/multiarch/s_tan.c index 25f3bca07e..bb75d8d0bc 100644 --- a/sysdeps/x86_64/fpu/multiarch/s_tan.c +++ b/sysdeps/x86_64/fpu/multiarch/s_tan.c @@ -1,15 +1,30 @@ -#include <init-arch.h> -#include <math.h> +/* Multiple versions of tan. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. -extern double __tan_sse2 (double); -extern double __tan_avx (double); -extern double __tan_fma4 (double); + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. -libm_ifunc (tan, (HAS_ARCH_FEATURE (FMA4_Usable) ? __tan_fma4 : - HAS_ARCH_FEATURE (AVX_Usable) - ? __tan_avx : __tan_sse2)); + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. -#define tan __tan_sse2 + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ +#include <libm-alias-double.h> +extern double __redirect_tan (double); + +#define SYMBOL_NAME tan +#include "ifunc-avx-fma4.h" + +libc_ifunc_redirected (__redirect_tan, __tan, IFUNC_SELECTOR ()); +libm_alias_double (__tan, tan) + +#define __tan __tan_sse2 #include <sysdeps/ieee754/dbl-64/s_tan.c> diff --git a/sysdeps/x86_64/fpu/multiarch/s_trunc-c.c b/sysdeps/x86_64/fpu/multiarch/s_trunc-c.c new file mode 100644 index 0000000000..6204ae3c77 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/s_trunc-c.c @@ -0,0 +1,2 @@ +#define __trunc __trunc_c +#include <sysdeps/ieee754/dbl-64/wordsize-64/s_trunc.c> diff --git a/sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S b/sysdeps/x86_64/fpu/multiarch/s_trunc-sse4_1.S index 241378e770..b8046bfa0c 100644 --- a/sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S +++ b/sysdeps/x86_64/fpu/multiarch/s_trunc-sse4_1.S @@ -1,5 +1,5 @@ -/* mempcpy with AVX - Copyright (C) 2014-2016 Free Software Foundation, Inc. +/* trunc for SSE4.1. + Copyright (C) 2017-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -16,7 +16,10 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#define USE_AS_MEMPCPY -#define MEMCPY __mempcpy_avx_unaligned -#define MEMCPY_CHK __mempcpy_chk_avx_unaligned -#include "memcpy-avx-unaligned.S" +#include <sysdep.h> + + .section .text.sse4.1,"ax",@progbits +ENTRY(__trunc_sse41) + roundsd $11, %xmm0, %xmm0 + ret +END(__trunc_sse41) diff --git a/sysdeps/x86_64/fpu/multiarch/s_trunc.c b/sysdeps/x86_64/fpu/multiarch/s_trunc.c new file mode 100644 index 0000000000..a1b0c60630 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/s_trunc.c @@ -0,0 +1,31 @@ +/* Multiple versions of __trunc. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <libm-alias-double.h> + +#define trunc __redirect_trunc +#define __trunc __redirect___trunc +#include <math.h> +#undef trunc +#undef __trunc + +#define SYMBOL_NAME trunc +#include "ifunc-sse4_1.h" + +libc_ifunc_redirected (__redirect_trunc, __trunc, IFUNC_SELECTOR ()); +libm_alias_double (__trunc, trunc) diff --git a/sysdeps/x86_64/fpu/multiarch/s_truncf-c.c b/sysdeps/x86_64/fpu/multiarch/s_truncf-c.c new file mode 100644 index 0000000000..7a5ac7da1f --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/s_truncf-c.c @@ -0,0 +1,2 @@ +#define __truncf __truncf_c +#include <sysdeps/ieee754/flt-32/s_truncf.c> diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned.S b/sysdeps/x86_64/fpu/multiarch/s_truncf-sse4_1.S index 75e35f2957..2dabc0be57 100644 --- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned.S +++ b/sysdeps/x86_64/fpu/multiarch/s_truncf-sse4_1.S @@ -1,5 +1,5 @@ -/* memmove with AVX - Copyright (C) 2014-2016 Free Software Foundation, Inc. +/* truncf for SSE4.1. + Copyright (C) 2017-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -16,7 +16,10 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#define USE_AS_MEMMOVE -#define MEMCPY __memmove_avx_unaligned -#define MEMCPY_CHK __memmove_chk_avx_unaligned -#include "memcpy-avx-unaligned.S" +#include <sysdep.h> + + .section .text.sse4.1,"ax",@progbits +ENTRY(__truncf_sse41) + roundss $11, %xmm0, %xmm0 + ret +END(__truncf_sse41) diff --git a/sysdeps/x86_64/fpu/multiarch/s_truncf.c b/sysdeps/x86_64/fpu/multiarch/s_truncf.c new file mode 100644 index 0000000000..a7e220bd0c --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/s_truncf.c @@ -0,0 +1,31 @@ +/* Multiple versions of __truncf. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <libm-alias-float.h> + +#define truncf __redirect_truncf +#define __truncf __redirect___truncf +#include <math.h> +#undef truncf +#undef __truncf + +#define SYMBOL_NAME truncf +#include "ifunc-sse4_1.h" + +libc_ifunc_redirected (__redirect_truncf, __truncf, IFUNC_SELECTOR ()); +libm_alias_float (__trunc, trunc) diff --git a/sysdeps/x86_64/fpu/multiarch/sincos32-fma.c b/sysdeps/x86_64/fpu/multiarch/sincos32-fma.c new file mode 100644 index 0000000000..dcd44bc5e8 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/sincos32-fma.c @@ -0,0 +1,15 @@ +#define __cos32 __cos32_fma +#define __sin32 __sin32_fma +#define __c32 __c32_fma +#define __mpsin __mpsin_fma +#define __mpsin1 __mpsin1_fma +#define __mpcos __mpcos_fma +#define __mpcos1 __mpcos1_fma +#define __mpranred __mpranred_fma +#define __add __add_fma +#define __dbl_mp __dbl_mp_fma +#define __mul __mul_fma +#define __sub __sub_fma +#define SECTION __attribute__ ((section (".text.fma"))) + +#include <sysdeps/ieee754/dbl-64/sincos32.c> diff --git a/sysdeps/x86_64/fpu/multiarch/slowexp-avx.c b/sysdeps/x86_64/fpu/multiarch/slowexp-avx.c deleted file mode 100644 index d01c6d71a4..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/slowexp-avx.c +++ /dev/null @@ -1,9 +0,0 @@ -#define __slowexp __slowexp_avx -#define __add __add_avx -#define __dbl_mp __dbl_mp_avx -#define __mpexp __mpexp_avx -#define __mul __mul_avx -#define __sub __sub_avx -#define SECTION __attribute__ ((section (".text.avx"))) - -#include <sysdeps/ieee754/dbl-64/slowexp.c> diff --git a/sysdeps/x86_64/fpu/multiarch/slowexp-fma4.c b/sysdeps/x86_64/fpu/multiarch/slowexp-fma4.c deleted file mode 100644 index 3bcde84233..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/slowexp-fma4.c +++ /dev/null @@ -1,9 +0,0 @@ -#define __slowexp __slowexp_fma4 -#define __add __add_fma4 -#define __dbl_mp __dbl_mp_fma4 -#define __mpexp __mpexp_fma4 -#define __mul __mul_fma4 -#define __sub __sub_fma4 -#define SECTION __attribute__ ((section (".text.fma4"))) - -#include <sysdeps/ieee754/dbl-64/slowexp.c> diff --git a/sysdeps/x86_64/fpu/multiarch/slowpow-fma4.c b/sysdeps/x86_64/fpu/multiarch/slowpow-fma4.c deleted file mode 100644 index 69d69823bb..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/slowpow-fma4.c +++ /dev/null @@ -1,11 +0,0 @@ -#define __slowpow __slowpow_fma4 -#define __add __add_fma4 -#define __dbl_mp __dbl_mp_fma4 -#define __mpexp __mpexp_fma4 -#define __mplog __mplog_fma4 -#define __mul __mul_fma4 -#define __sub __sub_fma4 -#define __halfulp __halfulp_fma4 -#define SECTION __attribute__ ((section (".text.fma4"))) - -#include <sysdeps/ieee754/dbl-64/slowpow.c> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core-sse2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core-sse2.S new file mode 100644 index 0000000000..a85729807f --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core-sse2.S @@ -0,0 +1,20 @@ +/* SSE2 version of vectorized cos, vector length is 2. + Copyright (C) 2014-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define _ZGVbN2v_cos _ZGVbN2v_cos_sse2 +#include "../svml_d_cos2_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core.S deleted file mode 100644 index 7d720e2fcb..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core.S +++ /dev/null @@ -1,36 +0,0 @@ -/* Multiple versions of vectorized cos, vector length is 2. - Copyright (C) 2014-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVbN2v_cos) - .type _ZGVbN2v_cos, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVbN2v_cos_sse4(%rip), %rax - HAS_CPU_FEATURE (SSE4_1) - jz 2f - ret -2: leaq _ZGVbN2v_cos_sse2(%rip), %rax - ret -END (_ZGVbN2v_cos) -libmvec_hidden_def (_ZGVbN2v_cos) - -#define _ZGVbN2v_cos _ZGVbN2v_cos_sse2 -#include "../svml_d_cos2_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core.c b/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core.c new file mode 100644 index 0000000000..3ff39eecd7 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core.c @@ -0,0 +1,27 @@ +/* Multiple versions of vectorized cos, vector length is 2. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define SYMBOL_NAME _ZGVbN2v_cos +#include "ifunc-mathvec-sse4_1.h" + +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ()); + +#ifdef SHARED +__hidden_ver1 (_ZGVbN2v_cos, __GI__ZGVbN2v_cos, __redirect__ZGVbN2v_cos) + __attribute__ ((visibility ("hidden"))); +#endif diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core_sse4.S index 088fcae067..10be76e207 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core_sse4.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core_sse4.S @@ -1,5 +1,5 @@ /* Function cos vectorized with SSE4. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -205,7 +205,7 @@ ENTRY (_ZGVbN2v_cos_sse4) shlq $4, %r15 movsd 200(%rsp,%r15), %xmm0 - call cos@PLT + call JUMPTARGET(cos) movsd %xmm0, 264(%rsp,%r15) jmp .LBL_1_8 @@ -215,7 +215,7 @@ ENTRY (_ZGVbN2v_cos_sse4) shlq $4, %r15 movsd 192(%rsp,%r15), %xmm0 - call cos@PLT + call JUMPTARGET(cos) movsd %xmm0, 256(%rsp,%r15) jmp .LBL_1_7 diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core-sse.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core-sse.S new file mode 100644 index 0000000000..9f406ea7c9 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core-sse.S @@ -0,0 +1,20 @@ +/* SSE version of vectorized cos, vector length is 4. + Copyright (C) 2014-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define _ZGVdN4v_cos _ZGVdN4v_cos_sse_wrapper +#include "../svml_d_cos4_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core.S deleted file mode 100644 index 65a3570d2e..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core.S +++ /dev/null @@ -1,36 +0,0 @@ -/* Multiple versions of vectorized cos, vector length is 4. - Copyright (C) 2014-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVdN4v_cos) - .type _ZGVdN4v_cos, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVdN4v_cos_avx2(%rip), %rax - HAS_ARCH_FEATURE (AVX2_Usable) - jz 2f - ret -2: leaq _ZGVdN4v_cos_sse_wrapper(%rip), %rax - ret -END (_ZGVdN4v_cos) -libmvec_hidden_def (_ZGVdN4v_cos) - -#define _ZGVdN4v_cos _ZGVdN4v_cos_sse_wrapper -#include "../svml_d_cos4_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core.c b/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core.c new file mode 100644 index 0000000000..cb8405201a --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core.c @@ -0,0 +1,27 @@ +/* Multiple versions of vectorized cos, vector length is 4. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define SYMBOL_NAME _ZGVdN4v_cos +#include "ifunc-mathvec-avx2.h" + +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ()); + +#ifdef SHARED +__hidden_ver1 (_ZGVdN4v_cos, __GI__ZGVdN4v_cos, __redirect__ZGVdN4v_cos) + __attribute__ ((visibility ("hidden"))); +#endif diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core_avx2.S index 4e653216d9..38cdc6bb03 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core_avx2.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core_avx2.S @@ -1,5 +1,5 @@ /* Function cos vectorized with AVX2. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -188,7 +188,7 @@ ENTRY (_ZGVdN4v_cos_avx2) vmovsd 328(%rsp,%r15), %xmm0 vzeroupper - call cos@PLT + call JUMPTARGET(cos) vmovsd %xmm0, 392(%rsp,%r15) jmp .LBL_1_8 @@ -199,7 +199,7 @@ ENTRY (_ZGVdN4v_cos_avx2) vmovsd 320(%rsp,%r15), %xmm0 vzeroupper - call cos@PLT + call JUMPTARGET(cos) vmovsd %xmm0, 384(%rsp,%r15) jmp .LBL_1_7 diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core-avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core-avx2.S new file mode 100644 index 0000000000..081baeeff5 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core-avx2.S @@ -0,0 +1,20 @@ +/* AVX2 version of vectorized cos, vector length is 8. + Copyright (C) 2014-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define _ZGVeN8v_cos _ZGVeN8v_cos_avx2_wrapper +#include "../svml_d_cos8_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S deleted file mode 100644 index 3e7f16d44e..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S +++ /dev/null @@ -1,37 +0,0 @@ -/* Multiple versions of vectorized cos, vector length is 8. - Copyright (C) 2014-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVeN8v_cos) - .type _ZGVeN8v_cos, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX -1: leaq _ZGVeN8v_cos_skx(%rip), %rax - HAS_ARCH_FEATURE (AVX512DQ_Usable) - jnz 2f - leaq _ZGVeN8v_cos_knl(%rip), %rax - HAS_ARCH_FEATURE (AVX512F_Usable) - jnz 2f - leaq _ZGVeN8v_cos_avx2_wrapper(%rip), %rax -2: ret -END (_ZGVeN8v_cos) - -#define _ZGVeN8v_cos _ZGVeN8v_cos_avx2_wrapper -#include "../svml_d_cos8_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.c b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.c new file mode 100644 index 0000000000..4aa12595bc --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.c @@ -0,0 +1,27 @@ +/* Multiple versions of vectorized cos, vector length is 8. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define SYMBOL_NAME _ZGVeN8v_cos +#include "ifunc-mathvec-avx512.h" + +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ()); + +#ifdef SHARED +__hidden_ver1 (_ZGVeN8v_cos, __GI__ZGVeN8v_cos, __redirect__ZGVeN8v_cos) + __attribute__ ((visibility ("hidden"))); +#endif diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S index 1cac1d827a..24e3b36357 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S @@ -1,5 +1,5 @@ /* Function cos vectorized with AVX-512, KNL and SKX versions. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -22,7 +22,7 @@ .text ENTRY (_ZGVeN8v_cos_knl) -#ifndef HAVE_AVX512_ASM_SUPPORT +#ifndef HAVE_AVX512DQ_ASM_SUPPORT WRAPPER_IMPL_AVX512 _ZGVdN4v_cos #else /* @@ -221,7 +221,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_cos movzbl %r12b, %r15d shlq $4, %r15 vmovsd 1160(%rsp,%r15), %xmm0 - call cos@PLT + call JUMPTARGET(cos) vmovsd %xmm0, 1224(%rsp,%r15) jmp .LBL_1_8 @@ -229,14 +229,14 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_cos movzbl %r12b, %r15d shlq $4, %r15 vmovsd 1152(%rsp,%r15), %xmm0 - call cos@PLT + call JUMPTARGET(cos) vmovsd %xmm0, 1216(%rsp,%r15) jmp .LBL_1_7 #endif END (_ZGVeN8v_cos_knl) ENTRY (_ZGVeN8v_cos_skx) -#ifndef HAVE_AVX512_ASM_SUPPORT +#ifndef HAVE_AVX512DQ_ASM_SUPPORT WRAPPER_IMPL_AVX512 _ZGVdN4v_cos #else /* @@ -438,7 +438,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_cos vzeroupper vmovsd 1160(%rsp,%r15), %xmm0 - call cos@PLT + call JUMPTARGET(cos) vmovsd %xmm0, 1224(%rsp,%r15) jmp .LBL_2_8 @@ -450,7 +450,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_cos vzeroupper vmovsd 1152(%rsp,%r15), %xmm0 - call cos@PLT + call JUMPTARGET(cos) vmovsd %xmm0, 1216(%rsp,%r15) jmp .LBL_2_7 diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core-sse2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core-sse2.S new file mode 100644 index 0000000000..3591eb1f19 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core-sse2.S @@ -0,0 +1,20 @@ +/* SSE2 version of vectorized exp. + Copyright (C) 2014-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define _ZGVbN2v_exp _ZGVbN2v_exp_sse2 +#include "../svml_d_exp2_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core.S deleted file mode 100644 index 136c67a550..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core.S +++ /dev/null @@ -1,36 +0,0 @@ -/* Multiple versions of vectorized exp. - Copyright (C) 2014-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVbN2v_exp) - .type _ZGVbN2v_exp, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVbN2v_exp_sse4(%rip), %rax - HAS_CPU_FEATURE (SSE4_1) - jz 2f - ret -2: leaq _ZGVbN2v_exp_sse2(%rip), %rax - ret -END (_ZGVbN2v_exp) -libmvec_hidden_def (_ZGVbN2v_exp) - -#define _ZGVbN2v_exp _ZGVbN2v_exp_sse2 -#include "../svml_d_exp2_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core.c b/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core.c new file mode 100644 index 0000000000..2cfe8937c9 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core.c @@ -0,0 +1,27 @@ +/* Multiple versions of vectorized exp, vector length is 2. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define SYMBOL_NAME _ZGVbN2v_exp +#include "ifunc-mathvec-sse4_1.h" + +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ()); + +#ifdef SHARED +__hidden_ver1 (_ZGVbN2v_exp, __GI__ZGVbN2v_exp, __redirect__ZGVbN2v_exp) + __attribute__ ((visibility ("hidden"))); +#endif diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core_sse4.S index 445b230152..e98d11b311 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core_sse4.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core_sse4.S @@ -1,5 +1,5 @@ /* Function exp vectorized with SSE4. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -207,7 +207,7 @@ ENTRY (_ZGVbN2v_exp_sse4) shlq $4, %r15 movsd 200(%rsp,%r15), %xmm0 - call exp@PLT + call JUMPTARGET(__exp_finite) movsd %xmm0, 264(%rsp,%r15) jmp .LBL_1_8 @@ -217,7 +217,7 @@ ENTRY (_ZGVbN2v_exp_sse4) shlq $4, %r15 movsd 192(%rsp,%r15), %xmm0 - call exp@PLT + call JUMPTARGET(__exp_finite) movsd %xmm0, 256(%rsp,%r15) jmp .LBL_1_7 diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core-sse.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core-sse.S new file mode 100644 index 0000000000..f8e0b5517a --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core-sse.S @@ -0,0 +1,20 @@ +/* SSE version of vectorized exp. + Copyright (C) 2014-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define _ZGVdN4v_exp _ZGVdN4v_exp_sse_wrapper +#include "../svml_d_exp4_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core.S deleted file mode 100644 index 9d6a47be0a..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core.S +++ /dev/null @@ -1,36 +0,0 @@ -/* Multiple versions of vectorized exp. - Copyright (C) 2014-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVdN4v_exp) - .type _ZGVdN4v_exp, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVdN4v_exp_avx2(%rip), %rax - HAS_ARCH_FEATURE (AVX2_Usable) - jz 2f - ret -2: leaq _ZGVdN4v_exp_sse_wrapper(%rip), %rax - ret -END (_ZGVdN4v_exp) -libmvec_hidden_def (_ZGVdN4v_exp) - -#define _ZGVdN4v_exp _ZGVdN4v_exp_sse_wrapper -#include "../svml_d_exp4_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core.c b/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core.c new file mode 100644 index 0000000000..59bb36984a --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core.c @@ -0,0 +1,27 @@ +/* Multiple versions of vectorized exp, vector length is 4. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define SYMBOL_NAME _ZGVdN4v_exp +#include "ifunc-mathvec-avx2.h" + +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ()); + +#ifdef SHARED +__hidden_ver1 (_ZGVdN4v_exp, __GI__ZGVdN4v_exp, __redirect__ZGVdN4v_exp) + __attribute__ ((visibility ("hidden"))); +#endif diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core_avx2.S index 25f9e28941..87990f8ad7 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core_avx2.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core_avx2.S @@ -1,5 +1,5 @@ /* Function exp vectorized with AVX2. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -193,7 +193,7 @@ ENTRY (_ZGVdN4v_exp_avx2) vmovsd 328(%rsp,%r15), %xmm0 vzeroupper - call exp@PLT + call JUMPTARGET(__exp_finite) vmovsd %xmm0, 392(%rsp,%r15) jmp .LBL_1_8 @@ -204,7 +204,7 @@ ENTRY (_ZGVdN4v_exp_avx2) vmovsd 320(%rsp,%r15), %xmm0 vzeroupper - call exp@PLT + call JUMPTARGET(__exp_finite) vmovsd %xmm0, 384(%rsp,%r15) jmp .LBL_1_7 diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core-avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core-avx2.S new file mode 100644 index 0000000000..b1d3cad0e1 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core-avx2.S @@ -0,0 +1,20 @@ +/* AVX2 version of vectorized exp. + Copyright (C) 2014-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define _ZGVeN8v_exp _ZGVeN8v_exp_avx2_wrapper +#include "../svml_d_exp8_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S deleted file mode 100644 index 317ee36e61..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S +++ /dev/null @@ -1,37 +0,0 @@ -/* Multiple versions of vectorized exp. - Copyright (C) 2014-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVeN8v_exp) - .type _ZGVeN8v_exp, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVeN8v_exp_skx(%rip), %rax - HAS_ARCH_FEATURE (AVX512DQ_Usable) - jnz 2f - leaq _ZGVeN8v_exp_knl(%rip), %rax - HAS_ARCH_FEATURE (AVX512F_Usable) - jnz 2f - leaq _ZGVeN8v_exp_avx2_wrapper(%rip), %rax -2: ret -END (_ZGVeN8v_exp) - -#define _ZGVeN8v_exp _ZGVeN8v_exp_avx2_wrapper -#include "../svml_d_exp8_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.c b/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.c new file mode 100644 index 0000000000..cfdc96ec86 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.c @@ -0,0 +1,27 @@ +/* Multiple versions of vectorized exp, vector length is 8. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define SYMBOL_NAME _ZGVeN8v_exp +#include "ifunc-mathvec-avx512.h" + +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ()); + +#ifdef SHARED +__hidden_ver1 (_ZGVeN8v_exp, __GI__ZGVeN8v_exp, __redirect__ZGVeN8v_exp) + __attribute__ ((visibility ("hidden"))); +#endif diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core_avx512.S index 74f1d2ce7b..8dd8a03e4b 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core_avx512.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core_avx512.S @@ -1,5 +1,5 @@ /* Function exp vectorized with AVX-512. KNL and SKX versions. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -22,7 +22,7 @@ .text ENTRY (_ZGVeN8v_exp_knl) -#ifndef HAVE_AVX512_ASM_SUPPORT +#ifndef HAVE_AVX512DQ_ASM_SUPPORT WRAPPER_IMPL_AVX512 _ZGVdN4v_exp #else /* @@ -223,7 +223,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_exp movzbl %r12b, %r15d shlq $4, %r15 vmovsd 1160(%rsp,%r15), %xmm0 - call exp@PLT + call JUMPTARGET(__exp_finite) vmovsd %xmm0, 1224(%rsp,%r15) jmp .LBL_1_8 @@ -231,14 +231,14 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_exp movzbl %r12b, %r15d shlq $4, %r15 vmovsd 1152(%rsp,%r15), %xmm0 - call exp@PLT + call JUMPTARGET(__exp_finite) vmovsd %xmm0, 1216(%rsp,%r15) jmp .LBL_1_7 #endif END (_ZGVeN8v_exp_knl) ENTRY (_ZGVeN8v_exp_skx) -#ifndef HAVE_AVX512_ASM_SUPPORT +#ifndef HAVE_AVX512DQ_ASM_SUPPORT WRAPPER_IMPL_AVX512 _ZGVdN4v_exp #else /* @@ -438,7 +438,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_exp vmovsd 1160(%rsp,%r15), %xmm0 vzeroupper vmovsd 1160(%rsp,%r15), %xmm0 - call exp@PLT + call JUMPTARGET(__exp_finite) vmovsd %xmm0, 1224(%rsp,%r15) jmp .LBL_2_8 @@ -448,7 +448,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_exp vmovsd 1152(%rsp,%r15), %xmm0 vzeroupper vmovsd 1152(%rsp,%r15), %xmm0 - call exp@PLT + call JUMPTARGET(__exp_finite) vmovsd %xmm0, 1216(%rsp,%r15) jmp .LBL_2_7 diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core-sse2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core-sse2.S new file mode 100644 index 0000000000..761a1a537d --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core-sse2.S @@ -0,0 +1,20 @@ +/* SSE2 version of vectorized log. + Copyright (C) 2014-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define _ZGVbN2v_log _ZGVbN2v_log_sse2 +#include "../svml_d_log2_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core.S deleted file mode 100644 index 03d86a3e63..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core.S +++ /dev/null @@ -1,36 +0,0 @@ -/* Multiple versions of vectorized log. - Copyright (C) 2014-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVbN2v_log) - .type _ZGVbN2v_log, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVbN2v_log_sse4(%rip), %rax - HAS_CPU_FEATURE (SSE4_1) - jz 2f - ret -2: leaq _ZGVbN2v_log_sse2(%rip), %rax - ret -END (_ZGVbN2v_log) -libmvec_hidden_def (_ZGVbN2v_log) - -#define _ZGVbN2v_log _ZGVbN2v_log_sse2 -#include "../svml_d_log2_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core.c b/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core.c new file mode 100644 index 0000000000..c24437a3be --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core.c @@ -0,0 +1,27 @@ +/* Multiple versions of vectorized log, vector length is 2. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define SYMBOL_NAME _ZGVbN2v_log +#include "ifunc-mathvec-sse4_1.h" + +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ()); + +#ifdef SHARED +__hidden_ver1 (_ZGVbN2v_log, __GI__ZGVbN2v_log, __redirect__ZGVbN2v_log) + __attribute__ ((visibility ("hidden"))); +#endif diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core_sse4.S index 5d254288f6..eb854c68d6 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core_sse4.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core_sse4.S @@ -1,5 +1,5 @@ /* Function log vectorized with SSE4. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -211,7 +211,7 @@ ENTRY (_ZGVbN2v_log_sse4) shlq $4, %r15 movsd 200(%rsp,%r15), %xmm0 - call log@PLT + call JUMPTARGET(__log_finite) movsd %xmm0, 264(%rsp,%r15) jmp .LBL_1_8 @@ -221,7 +221,7 @@ ENTRY (_ZGVbN2v_log_sse4) shlq $4, %r15 movsd 192(%rsp,%r15), %xmm0 - call log@PLT + call JUMPTARGET(__log_finite) movsd %xmm0, 256(%rsp,%r15) jmp .LBL_1_7 diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core-sse.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core-sse.S new file mode 100644 index 0000000000..2460512f78 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core-sse.S @@ -0,0 +1,20 @@ +/* SSE version of vectorized log. + Copyright (C) 2014-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define _ZGVdN4v_log _ZGVdN4v_log_sse_wrapper +#include "../svml_d_log4_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core.S deleted file mode 100644 index 9f6ddbef15..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core.S +++ /dev/null @@ -1,36 +0,0 @@ -/* Multiple versions of vectorized log. - Copyright (C) 2014-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVdN4v_log) - .type _ZGVdN4v_log, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVdN4v_log_avx2(%rip), %rax - HAS_ARCH_FEATURE (AVX2_Usable) - jz 2f - ret -2: leaq _ZGVdN4v_log_sse_wrapper(%rip), %rax - ret -END (_ZGVdN4v_log) -libmvec_hidden_def (_ZGVdN4v_log) - -#define _ZGVdN4v_log _ZGVdN4v_log_sse_wrapper -#include "../svml_d_log4_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core.c b/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core.c new file mode 100644 index 0000000000..5751370d65 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core.c @@ -0,0 +1,27 @@ +/* Multiple versions of vectorized log, vector length is 4. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define SYMBOL_NAME _ZGVdN4v_log +#include "ifunc-mathvec-avx2.h" + +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ()); + +#ifdef SHARED +__hidden_ver1 (_ZGVdN4v_log, __GI__ZGVdN4v_log, __redirect__ZGVdN4v_log) + __attribute__ ((visibility ("hidden"))); +#endif diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core_avx2.S index 5da298747d..81515850e1 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core_avx2.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core_avx2.S @@ -1,5 +1,5 @@ /* Function log vectorized with AVX2. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -191,7 +191,7 @@ ENTRY (_ZGVdN4v_log_avx2) vmovsd 328(%rsp,%r15), %xmm0 vzeroupper - call log@PLT + call JUMPTARGET(__log_finite) vmovsd %xmm0, 392(%rsp,%r15) jmp .LBL_1_8 @@ -202,7 +202,7 @@ ENTRY (_ZGVdN4v_log_avx2) vmovsd 320(%rsp,%r15), %xmm0 vzeroupper - call log@PLT + call JUMPTARGET(__log_finite) vmovsd %xmm0, 384(%rsp,%r15) jmp .LBL_1_7 diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core-avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core-avx2.S new file mode 100644 index 0000000000..ecfbeafb23 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core-avx2.S @@ -0,0 +1,20 @@ +/* AVX2 version of vectorized log. + Copyright (C) 2014-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define _ZGVeN8v_log _ZGVeN8v_log_avx2_wrapper +#include "../svml_d_log8_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S deleted file mode 100644 index 2e1a1da1a5..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S +++ /dev/null @@ -1,37 +0,0 @@ -/* Multiple versions of vectorized log. - Copyright (C) 2014-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVeN8v_log) - .type _ZGVeN8v_log, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVeN8v_log_skx(%rip), %rax - HAS_ARCH_FEATURE (AVX512DQ_Usable) - jnz 2f - leaq _ZGVeN8v_log_knl(%rip), %rax - HAS_ARCH_FEATURE (AVX512F_Usable) - jnz 2f - leaq _ZGVeN8v_log_avx2_wrapper(%rip), %rax -2: ret -END (_ZGVeN8v_log) - -#define _ZGVeN8v_log _ZGVeN8v_log_avx2_wrapper -#include "../svml_d_log8_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.c b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.c new file mode 100644 index 0000000000..1e796dcfdd --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.c @@ -0,0 +1,27 @@ +/* Multiple versions of vectorized log, vector length is 8. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define SYMBOL_NAME _ZGVeN8v_log +#include "ifunc-mathvec-avx512.h" + +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ()); + +#ifdef SHARED +__hidden_ver1 (_ZGVeN8v_log, __GI__ZGVeN8v_log, __redirect__ZGVeN8v_log) + __attribute__ ((visibility ("hidden"))); +#endif diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S index dca8e61f34..ae8af8d861 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S @@ -1,5 +1,5 @@ /* Function log vectorized with AVX-512. KNL and SKX versions. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -22,7 +22,7 @@ .text ENTRY (_ZGVeN8v_log_knl) -#ifndef HAVE_AVX512_ASM_SUPPORT +#ifndef HAVE_AVX512DQ_ASM_SUPPORT WRAPPER_IMPL_AVX512 _ZGVdN4v_log #else /* @@ -222,7 +222,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_log movzbl %r12b, %r15d shlq $4, %r15 vmovsd 1160(%rsp,%r15), %xmm0 - call log@PLT + call JUMPTARGET(__log_finite) vmovsd %xmm0, 1224(%rsp,%r15) jmp .LBL_1_8 @@ -230,14 +230,14 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_log movzbl %r12b, %r15d shlq $4, %r15 vmovsd 1152(%rsp,%r15), %xmm0 - call log@PLT + call JUMPTARGET(__log_finite) vmovsd %xmm0, 1216(%rsp,%r15) jmp .LBL_1_7 #endif END (_ZGVeN8v_log_knl) ENTRY (_ZGVeN8v_log_skx) -#ifndef HAVE_AVX512_ASM_SUPPORT +#ifndef HAVE_AVX512DQ_ASM_SUPPORT WRAPPER_IMPL_AVX512 _ZGVdN4v_log #else /* @@ -443,7 +443,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_log vzeroupper vmovsd 1160(%rsp,%r15), %xmm0 - call log@PLT + call JUMPTARGET(__log_finite) vmovsd %xmm0, 1224(%rsp,%r15) jmp .LBL_2_8 @@ -455,7 +455,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_log vzeroupper vmovsd 1152(%rsp,%r15), %xmm0 - call log@PLT + call JUMPTARGET(__log_finite) vmovsd %xmm0, 1216(%rsp,%r15) jmp .LBL_2_7 diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core-sse2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core-sse2.S new file mode 100644 index 0000000000..2d8ad50681 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core-sse2.S @@ -0,0 +1,20 @@ +/* SSE2 version of vectorized pow. + Copyright (C) 2014-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define _ZGVbN2vv_pow _ZGVbN2vv_pow_sse2 +#include "../svml_d_pow2_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core.S deleted file mode 100644 index 4a50246889..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core.S +++ /dev/null @@ -1,36 +0,0 @@ -/* Multiple versions of vectorized pow. - Copyright (C) 2014-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVbN2vv_pow) - .type _ZGVbN2vv_pow, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVbN2vv_pow_sse4(%rip), %rax - HAS_CPU_FEATURE (SSE4_1) - jz 2f - ret -2: leaq _ZGVbN2vv_pow_sse2(%rip), %rax - ret -END (_ZGVbN2vv_pow) -libmvec_hidden_def (_ZGVbN2vv_pow) - -#define _ZGVbN2vv_pow _ZGVbN2vv_pow_sse2 -#include "../svml_d_pow2_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core.c b/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core.c new file mode 100644 index 0000000000..3424c0e326 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core.c @@ -0,0 +1,28 @@ +/* Multiple versions of vectorized pow, vector length is 2. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define SYMBOL_NAME _ZGVbN2vv_pow +#include "ifunc-mathvec-sse4_1.h" + +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ()); + +#ifdef SHARED +__hidden_ver1 (_ZGVbN2vv_pow, __GI__ZGVbN2vv_pow, + __redirect__ZGVbN2vv_pow) + __attribute__ ((visibility ("hidden"))); +#endif diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core_sse4.S index 064d170878..77828b44d5 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core_sse4.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core_sse4.S @@ -1,5 +1,5 @@ /* Function pow vectorized with SSE4. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -413,7 +413,7 @@ ENTRY (_ZGVbN2vv_pow_sse4) movsd 72(%rsp,%r15), %xmm0 movsd 136(%rsp,%r15), %xmm1 - call pow@PLT + call JUMPTARGET(__pow_finite) movsd %xmm0, 200(%rsp,%r15) jmp .LBL_1_8 @@ -424,7 +424,7 @@ ENTRY (_ZGVbN2vv_pow_sse4) movsd 64(%rsp,%r15), %xmm0 movsd 128(%rsp,%r15), %xmm1 - call pow@PLT + call JUMPTARGET(__pow_finite) movsd %xmm0, 192(%rsp,%r15) jmp .LBL_1_7 diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core-sse.S b/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core-sse.S new file mode 100644 index 0000000000..4dcd14ff20 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core-sse.S @@ -0,0 +1,20 @@ +/* SSE version of vectorized pow. + Copyright (C) 2014-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define _ZGVdN4vv_pow _ZGVdN4vv_pow_sse_wrapper +#include "../svml_d_pow4_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core.c b/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core.c new file mode 100644 index 0000000000..447be39401 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core.c @@ -0,0 +1,28 @@ +/* Multiple versions of vectorized pow, vector length is 4. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define SYMBOL_NAME _ZGVdN4vv_pow +#include "ifunc-mathvec-avx2.h" + +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ()); + +#ifdef SHARED +__hidden_ver1 (_ZGVdN4vv_pow, __GI__ZGVdN4vv_pow, + __redirect__ZGVdN4vv_pow) + __attribute__ ((visibility ("hidden"))); +#endif diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core_avx2.S index f2a73ffe1e..c43d62f202 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core_avx2.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core_avx2.S @@ -1,5 +1,5 @@ /* Function pow vectorized with AVX2. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -367,7 +367,7 @@ ENTRY (_ZGVdN4vv_pow_avx2) vmovsd 264(%rsp,%r15), %xmm1 vzeroupper - call pow@PLT + call JUMPTARGET(__pow_finite) vmovsd %xmm0, 328(%rsp,%r15) jmp .LBL_1_8 @@ -379,7 +379,7 @@ ENTRY (_ZGVdN4vv_pow_avx2) vmovsd 256(%rsp,%r15), %xmm1 vzeroupper - call pow@PLT + call JUMPTARGET(__pow_finite) vmovsd %xmm0, 320(%rsp,%r15) jmp .LBL_1_7 diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core-avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core-avx2.S new file mode 100644 index 0000000000..8acf700e76 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core-avx2.S @@ -0,0 +1,20 @@ +/* AVX2 version of vectorized pow. + Copyright (C) 2014-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define _ZGVeN8vv_pow _ZGVeN8vv_pow_avx2_wrapper +#include "../svml_d_pow8_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S deleted file mode 100644 index 30bc53f2f7..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S +++ /dev/null @@ -1,37 +0,0 @@ -/* Multiple versions of vectorized pow. - Copyright (C) 2014-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVeN8vv_pow) - .type _ZGVeN8vv_pow, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVeN8vv_pow_skx(%rip), %rax - HAS_ARCH_FEATURE (AVX512DQ_Usable) - jnz 2f - leaq _ZGVeN8vv_pow_knl(%rip), %rax - HAS_ARCH_FEATURE (AVX512F_Usable) - jnz 2f - leaq _ZGVeN8vv_pow_avx2_wrapper(%rip), %rax -2: ret -END (_ZGVeN8vv_pow) - -#define _ZGVeN8vv_pow _ZGVeN8vv_pow_avx2_wrapper -#include "../svml_d_pow8_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.c b/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.c new file mode 100644 index 0000000000..62f96965bb --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.c @@ -0,0 +1,28 @@ +/* Multiple versions of vectorized pow, vector length is 8. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define SYMBOL_NAME _ZGVeN8vv_pow +#include "ifunc-mathvec-avx512.h" + +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ()); + +#ifdef SHARED +__hidden_ver1 (_ZGVeN8vv_pow, __GI__ZGVeN8vv_pow, + __redirect__ZGVeN8vv_pow) + __attribute__ ((visibility ("hidden"))); +#endif diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core_avx512.S index 4a515233fc..a28c39b73d 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core_avx512.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core_avx512.S @@ -1,5 +1,5 @@ /* Function pow vectorized with AVX-512. KNL and SKX versions. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -82,7 +82,7 @@ .text ENTRY (_ZGVeN8vv_pow_knl) -#ifndef HAVE_AVX512_ASM_SUPPORT +#ifndef HAVE_AVX512DQ_ASM_SUPPORT WRAPPER_IMPL_AVX512_ff _ZGVdN4vv_pow #else pushq %rbp @@ -392,7 +392,7 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN4vv_pow shlq $4, %r15 vmovsd 1160(%rsp,%r15), %xmm0 vmovsd 1224(%rsp,%r15), %xmm1 - call pow@PLT + call JUMPTARGET(__pow_finite) vmovsd %xmm0, 1288(%rsp,%r15) jmp .LBL_1_8 @@ -401,7 +401,7 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN4vv_pow shlq $4, %r15 vmovsd 1152(%rsp,%r15), %xmm0 vmovsd 1216(%rsp,%r15), %xmm1 - call pow@PLT + call JUMPTARGET(__pow_finite) vmovsd %xmm0, 1280(%rsp,%r15) jmp .LBL_1_7 @@ -409,7 +409,7 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN4vv_pow END (_ZGVeN8vv_pow_knl) ENTRY (_ZGVeN8vv_pow_skx) -#ifndef HAVE_AVX512_ASM_SUPPORT +#ifndef HAVE_AVX512DQ_ASM_SUPPORT WRAPPER_IMPL_AVX512_ff _ZGVdN4vv_pow #else pushq %rbp @@ -720,7 +720,7 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN4vv_pow vzeroupper vmovsd 1160(%rsp,%r15), %xmm0 - call pow@PLT + call JUMPTARGET(__pow_finite) vmovsd %xmm0, 1288(%rsp,%r15) jmp .LBL_2_8 @@ -732,7 +732,7 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN4vv_pow vzeroupper vmovsd 1152(%rsp,%r15), %xmm0 - call pow@PLT + call JUMPTARGET(__pow_finite) vmovsd %xmm0, 1280(%rsp,%r15) jmp .LBL_2_7 diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core-sse2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core-sse2.S new file mode 100644 index 0000000000..cb7b31aa1c --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core-sse2.S @@ -0,0 +1,20 @@ +/* SSE2 version of vectorized sin. + Copyright (C) 2014-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define _ZGVbN2v_sin _ZGVbN2v_sin_sse2 +#include "../svml_d_sin2_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core.S deleted file mode 100644 index 112bec2224..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core.S +++ /dev/null @@ -1,36 +0,0 @@ -/* Multiple versions of vectorized sin. - Copyright (C) 2014-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVbN2v_sin) - .type _ZGVbN2v_sin, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVbN2v_sin_sse4(%rip), %rax - HAS_CPU_FEATURE (SSE4_1) - jz 2f - ret -2: leaq _ZGVbN2v_sin_sse2(%rip), %rax - ret -END (_ZGVbN2v_sin) -libmvec_hidden_def (_ZGVbN2v_sin) - -#define _ZGVbN2v_sin _ZGVbN2v_sin_sse2 -#include "../svml_d_sin2_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core.c b/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core.c new file mode 100644 index 0000000000..1c5788f205 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core.c @@ -0,0 +1,27 @@ +/* Multiple versions of vectorized sin, vector length is 2. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define SYMBOL_NAME _ZGVbN2v_sin +#include "ifunc-mathvec-sse4_1.h" + +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ()); + +#ifdef SHARED +__hidden_ver1 (_ZGVbN2v_sin, __GI__ZGVbN2v_sin, __redirect__ZGVbN2v_sin) + __attribute__ ((visibility ("hidden"))); +#endif diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core_sse4.S index 5755ce6f74..15980e9eeb 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core_sse4.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core_sse4.S @@ -1,5 +1,5 @@ /* Function sin vectorized with SSE4. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -211,7 +211,7 @@ ENTRY (_ZGVbN2v_sin_sse4) shlq $4, %r15 movsd 200(%rsp,%r15), %xmm0 - call sin@PLT + call JUMPTARGET(sin) movsd %xmm0, 264(%rsp,%r15) jmp .LBL_1_8 @@ -221,7 +221,7 @@ ENTRY (_ZGVbN2v_sin_sse4) shlq $4, %r15 movsd 192(%rsp,%r15), %xmm0 - call sin@PLT + call JUMPTARGET(sin) movsd %xmm0, 256(%rsp,%r15) jmp .LBL_1_7 diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core-sse.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core-sse.S new file mode 100644 index 0000000000..07fae6f3b4 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core-sse.S @@ -0,0 +1,20 @@ +/* SSE version of vectorized sin, vector length is 4. + Copyright (C) 2014-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define _ZGVdN4v_sin _ZGVdN4v_sin_sse_wrapper +#include "../svml_d_sin4_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core.S deleted file mode 100644 index 700a1c629d..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core.S +++ /dev/null @@ -1,36 +0,0 @@ -/* Multiple versions of vectorized sin, vector length is 4. - Copyright (C) 2014-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVdN4v_sin) - .type _ZGVdN4v_sin, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVdN4v_sin_avx2(%rip), %rax - HAS_ARCH_FEATURE (AVX2_Usable) - jz 2f - ret -2: leaq _ZGVdN4v_sin_sse_wrapper(%rip), %rax - ret -END (_ZGVdN4v_sin) -libmvec_hidden_def (_ZGVdN4v_sin) - -#define _ZGVdN4v_sin _ZGVdN4v_sin_sse_wrapper -#include "../svml_d_sin4_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core.c b/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core.c new file mode 100644 index 0000000000..b5933914aa --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core.c @@ -0,0 +1,27 @@ +/* Multiple versions of vectorized sin, vector length is 4. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define SYMBOL_NAME _ZGVdN4v_sin +#include "ifunc-mathvec-avx2.h" + +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ()); + +#ifdef SHARED +__hidden_ver1 (_ZGVdN4v_sin, __GI__ZGVdN4v_sin, __redirect__ZGVdN4v_sin) + __attribute__ ((visibility ("hidden"))); +#endif diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core_avx2.S index 46b557158a..4f0917c56d 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core_avx2.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core_avx2.S @@ -1,5 +1,5 @@ /* Function sin vectorized with AVX2. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -191,7 +191,7 @@ ENTRY (_ZGVdN4v_sin_avx2) vmovsd 328(%rsp,%r15), %xmm0 vzeroupper - call sin@PLT + call JUMPTARGET(sin) vmovsd %xmm0, 392(%rsp,%r15) jmp .LBL_1_8 @@ -202,7 +202,7 @@ ENTRY (_ZGVdN4v_sin_avx2) vmovsd 320(%rsp,%r15), %xmm0 vzeroupper - call sin@PLT + call JUMPTARGET(sin) vmovsd %xmm0, 384(%rsp,%r15) jmp .LBL_1_7 diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core-avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core-avx2.S new file mode 100644 index 0000000000..b64c3390d6 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core-avx2.S @@ -0,0 +1,23 @@ +/* AVX2 version of vectorized sin. + Copyright (C) 2014-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +#define _ZGVeN8v_sin _ZGVeN8v_sin_avx2_wrapper +#include "../svml_d_sin8_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S deleted file mode 100644 index 5afce0ed88..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S +++ /dev/null @@ -1,37 +0,0 @@ -/* Multiple versions of vectorized sin. - Copyright (C) 2014-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVeN8v_sin) - .type _ZGVeN8v_sin, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVeN8v_sin_skx(%rip), %rax - HAS_ARCH_FEATURE (AVX512DQ_Usable) - jnz 2f - leaq _ZGVeN8v_sin_knl(%rip), %rax - HAS_ARCH_FEATURE (AVX512F_Usable) - jnz 2f - leaq _ZGVeN8v_sin_avx2_wrapper(%rip), %rax -2: ret -END (_ZGVeN8v_sin) - -#define _ZGVeN8v_sin _ZGVeN8v_sin_avx2_wrapper -#include "../svml_d_sin8_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.c b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.c new file mode 100644 index 0000000000..57023d8494 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.c @@ -0,0 +1,27 @@ +/* Multiple versions of vectorized sin, vector length is 8. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define SYMBOL_NAME _ZGVeN8v_sin +#include "ifunc-mathvec-avx512.h" + +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ()); + +#ifdef SHARED +__hidden_ver1 (_ZGVeN8v_sin, __GI__ZGVeN8v_sin, __redirect__ZGVeN8v_sin) + __attribute__ ((visibility ("hidden"))); +#endif diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S index 6c565f3861..2d4b14fd1b 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S @@ -1,5 +1,5 @@ /* Function sin vectorized with AVX-512, KNL and SKX versions. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -22,7 +22,7 @@ .text ENTRY (_ZGVeN8v_sin_knl) -#ifndef HAVE_AVX512_ASM_SUPPORT +#ifndef HAVE_AVX512DQ_ASM_SUPPORT WRAPPER_IMPL_AVX512 _ZGVdN4v_sin #else /* @@ -222,7 +222,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_sin movzbl %r12b, %r15d shlq $4, %r15 vmovsd 1160(%rsp,%r15), %xmm0 - call sin@PLT + call JUMPTARGET(sin) vmovsd %xmm0, 1224(%rsp,%r15) jmp .LBL_1_8 @@ -230,14 +230,14 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_sin movzbl %r12b, %r15d shlq $4, %r15 vmovsd 1152(%rsp,%r15), %xmm0 - call sin@PLT + call JUMPTARGET(sin) vmovsd %xmm0, 1216(%rsp,%r15) jmp .LBL_1_7 #endif END (_ZGVeN8v_sin_knl) ENTRY (_ZGVeN8v_sin_skx) -#ifndef HAVE_AVX512_ASM_SUPPORT +#ifndef HAVE_AVX512DQ_ASM_SUPPORT WRAPPER_IMPL_AVX512 _ZGVdN4v_sin #else /* @@ -440,7 +440,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_sin vzeroupper vmovsd 1160(%rsp,%r15), %xmm0 - call sin@PLT + call JUMPTARGET(sin) vmovsd %xmm0, 1224(%rsp,%r15) jmp .LBL_2_8 @@ -452,7 +452,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_sin vzeroupper vmovsd 1152(%rsp,%r15), %xmm0 - call sin@PLT + call JUMPTARGET(sin) vmovsd %xmm0, 1216(%rsp,%r15) jmp .LBL_2_7 diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core-sse2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core-sse2.S new file mode 100644 index 0000000000..ab7f9c500d --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core-sse2.S @@ -0,0 +1,20 @@ +/* SSE2 version of vectorized sincos. + Copyright (C) 2014-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define _ZGVbN2vvv_sincos _ZGVbN2vvv_sincos_sse2 +#include "../svml_d_sincos2_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core.S deleted file mode 100644 index 883d7d33a4..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core.S +++ /dev/null @@ -1,36 +0,0 @@ -/* Multiple versions of vectorized sincos. - Copyright (C) 2014-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVbN2vvv_sincos) - .type _ZGVbN2vvv_sincos, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVbN2vvv_sincos_sse4(%rip), %rax - HAS_CPU_FEATURE (SSE4_1) - jz 2f - ret -2: leaq _ZGVbN2vvv_sincos_sse2(%rip), %rax - ret -END (_ZGVbN2vvv_sincos) -libmvec_hidden_def (_ZGVbN2vvv_sincos) - -#define _ZGVbN2vvv_sincos _ZGVbN2vvv_sincos_sse2 -#include "../svml_d_sincos2_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core.c b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core.c new file mode 100644 index 0000000000..f373bb40a3 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core.c @@ -0,0 +1,28 @@ +/* Multiple versions of vectorized sincos, vector length is 2. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define SYMBOL_NAME _ZGVbN2vvv_sincos +#include "ifunc-mathvec-sse4_1.h" + +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ()); + +#ifdef SHARED +__hidden_ver1 (_ZGVbN2vvv_sincos, __GI__ZGVbN2vvv_sincos, + __redirect__ZGVbN2vvv_sincos) + __attribute__ ((visibility ("hidden"))); +#endif diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core_sse4.S index 65ad540122..b4dfa37898 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core_sse4.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core_sse4.S @@ -1,5 +1,5 @@ /* Function sincos vectorized with SSE4. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -20,7 +20,7 @@ #include "svml_d_trig_data.h" .text -ENTRY (_ZGVbN2vvv_sincos_sse4) +ENTRY (_ZGVbN2vl8l8_sincos_sse4) /* ALGORITHM DESCRIPTION: @@ -287,12 +287,12 @@ ENTRY (_ZGVbN2vvv_sincos_sse4) shlq $4, %r15 movsd 136(%rsp,%r15), %xmm0 - call sin@PLT + call JUMPTARGET(sin) movsd %xmm0, 200(%rsp,%r15) movsd 136(%rsp,%r15), %xmm0 - call cos@PLT + call JUMPTARGET(cos) movsd %xmm0, 264(%rsp,%r15) jmp .LBL_1_8 @@ -302,13 +302,67 @@ ENTRY (_ZGVbN2vvv_sincos_sse4) shlq $4, %r15 movsd 128(%rsp,%r15), %xmm0 - call sin@PLT + call JUMPTARGET(sin) movsd %xmm0, 192(%rsp,%r15) movsd 128(%rsp,%r15), %xmm0 - call cos@PLT + call JUMPTARGET(cos) movsd %xmm0, 256(%rsp,%r15) jmp .LBL_1_7 +END (_ZGVbN2vl8l8_sincos_sse4) +libmvec_hidden_def(_ZGVbN2vl8l8_sincos_sse4) + +/* vvv version implemented with wrapper to vl8l8 variant. */ +ENTRY (_ZGVbN2vvv_sincos_sse4) +#ifndef __ILP32__ + subq $72, %rsp + .cfi_def_cfa_offset 80 + movdqu %xmm1, 32(%rsp) + lea (%rsp), %rdi + movdqu %xmm2, 48(%rdi) + lea 16(%rsp), %rsi + call HIDDEN_JUMPTARGET(_ZGVbN2vl8l8_sincos_sse4) + movq 32(%rsp), %rdx + movq 48(%rsp), %rsi + movq 40(%rsp), %r8 + movq 56(%rsp), %r10 + movq (%rsp), %rax + movq 16(%rsp), %rcx + movq 8(%rsp), %rdi + movq 24(%rsp), %r9 + movq %rax, (%rdx) + movq %rcx, (%rsi) + movq %rdi, (%r8) + movq %r9, (%r10) + addq $72, %rsp + .cfi_def_cfa_offset 8 + ret +#else + subl $72, %esp + .cfi_def_cfa_offset 80 + leal 48(%rsp), %esi + movaps %xmm1, 16(%esp) + leal 32(%rsp), %edi + movaps %xmm2, (%esp) + call HIDDEN_JUMPTARGET(_ZGVbN2vl8l8_sincos_sse4) + movdqa 16(%esp), %xmm1 + movsd 32(%esp), %xmm0 + movq %xmm1, %rax + movdqa (%esp), %xmm2 + movsd %xmm0, (%eax) + movsd 40(%esp), %xmm0 + pextrd $1, %xmm1, %eax + movsd %xmm0, (%eax) + movsd 48(%esp), %xmm0 + movq %xmm2, %rax + movsd %xmm0, (%eax) + movsd 56(%esp), %xmm0 + pextrd $1, %xmm2, %eax + movsd %xmm0, (%eax) + addl $72, %esp + .cfi_def_cfa_offset 8 + ret +#endif END (_ZGVbN2vvv_sincos_sse4) diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core-sse.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core-sse.S new file mode 100644 index 0000000000..10b4a2cf16 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core-sse.S @@ -0,0 +1,20 @@ +/* SSE version of vectorized sincos. + Copyright (C) 2014-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define _ZGVdN4vvv_sincos _ZGVdN4vvv_sincos_sse_wrapper +#include "../svml_d_sincos4_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core.S deleted file mode 100644 index 69a3f74650..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core.S +++ /dev/null @@ -1,36 +0,0 @@ -/* Multiple versions of vectorized sincos. - Copyright (C) 2014-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVdN4vvv_sincos) - .type _ZGVdN4vvv_sincos, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVdN4vvv_sincos_avx2(%rip), %rax - HAS_ARCH_FEATURE (AVX2_Usable) - jz 2f - ret -2: leaq _ZGVdN4vvv_sincos_sse_wrapper(%rip), %rax - ret -END (_ZGVdN4vvv_sincos) -libmvec_hidden_def (_ZGVdN4vvv_sincos) - -#define _ZGVdN4vvv_sincos _ZGVdN4vvv_sincos_sse_wrapper -#include "../svml_d_sincos4_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core.c b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core.c new file mode 100644 index 0000000000..1fabd7b471 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core.c @@ -0,0 +1,28 @@ +/* Multiple versions of vectorized sincos, vector length is 4. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define SYMBOL_NAME _ZGVdN4vvv_sincos +#include "ifunc-mathvec-avx2.h" + +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ()); + +#ifdef SHARED +__hidden_ver1 (_ZGVdN4vvv_sincos, __GI__ZGVdN4vvv_sincos, + __redirect__ZGVdN4vvv_sincos) + __attribute__ ((visibility ("hidden"))); +#endif diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core_avx2.S index 60d03e9f8b..d56aa96ac9 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core_avx2.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core_avx2.S @@ -1,5 +1,5 @@ /* Function sincos vectorized with AVX2. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -20,7 +20,7 @@ #include "svml_d_trig_data.h" .text -ENTRY (_ZGVdN4vvv_sincos_avx2) +ENTRY (_ZGVdN4vl8l8_sincos_avx2) /* ALGORITHM DESCRIPTION: @@ -248,12 +248,12 @@ ENTRY (_ZGVdN4vvv_sincos_avx2) vmovsd 264(%rsp,%r15), %xmm0 vzeroupper - call sin@PLT + call JUMPTARGET(sin) vmovsd %xmm0, 328(%rsp,%r15) vmovsd 264(%rsp,%r15), %xmm0 - call cos@PLT + call JUMPTARGET(cos) vmovsd %xmm0, 392(%rsp,%r15) jmp .LBL_1_8 @@ -264,14 +264,110 @@ ENTRY (_ZGVdN4vvv_sincos_avx2) vmovsd 256(%rsp,%r15), %xmm0 vzeroupper - call sin@PLT + call JUMPTARGET(sin) vmovsd %xmm0, 320(%rsp,%r15) vmovsd 256(%rsp,%r15), %xmm0 - call cos@PLT + call JUMPTARGET(cos) vmovsd %xmm0, 384(%rsp,%r15) jmp .LBL_1_7 +END (_ZGVdN4vl8l8_sincos_avx2) +libmvec_hidden_def(_ZGVdN4vl8l8_sincos_avx2) + +/* vvv version implemented with wrapper to vl8l8 variant. */ +ENTRY (_ZGVdN4vvv_sincos_avx2) +#ifndef __ILP32__ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-32, %rsp + subq $128, %rsp + vmovdqu %ymm1, 64(%rsp) + lea (%rsp), %rdi + vmovdqu %ymm2, 96(%rdi) + lea 32(%rsp), %rsi + call HIDDEN_JUMPTARGET(_ZGVdN4vl8l8_sincos_avx2) + movq 64(%rsp), %rdx + movq 96(%rsp), %rsi + movq 72(%rsp), %r8 + movq 104(%rsp), %r10 + movq (%rsp), %rax + movq 32(%rsp), %rcx + movq 8(%rsp), %rdi + movq 40(%rsp), %r9 + movq %rax, (%rdx) + movq %rcx, (%rsi) + movq 80(%rsp), %rax + movq 112(%rsp), %rcx + movq %rdi, (%r8) + movq %r9, (%r10) + movq 88(%rsp), %rdi + movq 120(%rsp), %r9 + movq 16(%rsp), %r11 + movq 48(%rsp), %rdx + movq 24(%rsp), %rsi + movq 56(%rsp), %r8 + movq %r11, (%rax) + movq %rdx, (%rcx) + movq %rsi, (%rdi) + movq %r8, (%r9) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +#else + leal 8(%rsp), %r10d + .cfi_def_cfa 10, 0 + andl $-32, %esp + pushq -8(%r10d) + pushq %rbp + .cfi_escape 0x10,0x6,0x2,0x76,0 + movl %esp, %ebp + pushq %r10 + .cfi_escape 0xf,0x3,0x76,0x78,0x6 + leal -48(%rbp), %esi + leal -80(%rbp), %edi + subl $104, %esp + vmovaps %xmm1, -96(%ebp) + vmovaps %xmm2, -112(%ebp) + call HIDDEN_JUMPTARGET(_ZGVdN4vl8l8_sincos_avx2) + movl -96(%ebp), %eax + vmovsd -80(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -92(%ebp), %eax + vmovsd -72(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -88(%ebp), %eax + vmovsd -64(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -84(%ebp), %eax + vmovsd -56(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -112(%ebp), %eax + vmovsd -48(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -108(%ebp), %eax + vmovsd -40(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -104(%ebp), %eax + vmovsd -32(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -100(%ebp), %eax + vmovsd -24(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + addl $104, %esp + popq %r10 + .cfi_def_cfa 10, 0 + popq %rbp + leal -8(%r10), %esp + .cfi_def_cfa 7, 8 + ret +#endif END (_ZGVdN4vvv_sincos_avx2) diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core-avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core-avx2.S new file mode 100644 index 0000000000..8cf88f6461 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core-avx2.S @@ -0,0 +1,20 @@ +/* AVX2 version of vectorized sincos. + Copyright (C) 2014-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define _ZGVeN8vvv_sincos _ZGVeN8vvv_sincos_avx2_wrapper +#include "../svml_d_sincos8_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S deleted file mode 100644 index 64cb08c5d1..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S +++ /dev/null @@ -1,37 +0,0 @@ -/* Multiple versions of vectorized sincos. - Copyright (C) 2014-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVeN8vvv_sincos) - .type _ZGVeN8vvv_sincos, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVeN8vvv_sincos_skx(%rip), %rax - HAS_ARCH_FEATURE (AVX512DQ_Usable) - jnz 2f - leaq _ZGVeN8vvv_sincos_knl(%rip), %rax - HAS_ARCH_FEATURE (AVX512F_Usable) - jnz 2f - leaq _ZGVeN8vvv_sincos_avx2_wrapper(%rip), %rax -2: ret -END (_ZGVeN8vvv_sincos) - -#define _ZGVeN8vvv_sincos _ZGVeN8vvv_sincos_avx2_wrapper -#include "../svml_d_sincos8_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.c b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.c new file mode 100644 index 0000000000..1409872ed2 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.c @@ -0,0 +1,28 @@ +/* Multiple versions of vectorized sincos, vector length is 8. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define SYMBOL_NAME _ZGVeN8vvv_sincos +#include "ifunc-mathvec-avx512.h" + +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ()); + +#ifdef SHARED +__hidden_ver1 (_ZGVeN8vvv_sincos, __GI__ZGVeN8vvv_sincos, + __redirect__ZGVeN8vvv_sincos) + __attribute__ ((visibility ("hidden"))); +#endif diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S index 44700f90b8..2df626c0c1 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S @@ -1,5 +1,5 @@ /* Function sincos vectorized with AVX-512. KNL and SKX versions. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -36,9 +36,9 @@ sin(R), sin(R') are approximated by corresponding polynomial. */ .text -ENTRY (_ZGVeN8vvv_sincos_knl) -#ifndef HAVE_AVX512_ASM_SUPPORT -WRAPPER_IMPL_AVX512_fFF _ZGVdN4vvv_sincos +ENTRY (_ZGVeN8vl8l8_sincos_knl) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512_fFF _ZGVdN4vl8l8_sincos #else pushq %rbp cfi_adjust_cfa_offset (8) @@ -278,12 +278,12 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN4vvv_sincos shlq $4, %r15 vmovsd 1160(%rsp,%r15), %xmm0 - call sin@PLT + call JUMPTARGET(sin) vmovsd %xmm0, 1224(%rsp,%r15) vmovsd 1160(%rsp,%r15), %xmm0 - call cos@PLT + call JUMPTARGET(cos) vmovsd %xmm0, 1288(%rsp,%r15) jmp .LBL_1_8 @@ -293,22 +293,23 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN4vvv_sincos shlq $4, %r15 vmovsd 1152(%rsp,%r15), %xmm0 - call sin@PLT + call JUMPTARGET(sin) vmovsd %xmm0, 1216(%rsp,%r15) vmovsd 1152(%rsp,%r15), %xmm0 - call cos@PLT + call JUMPTARGET(cos) vmovsd %xmm0, 1280(%rsp,%r15) jmp .LBL_1_7 #endif -END (_ZGVeN8vvv_sincos_knl) +END (_ZGVeN8vl8l8_sincos_knl) +libmvec_hidden_def(_ZGVeN8vl8l8_sincos_knl) -ENTRY (_ZGVeN8vvv_sincos_skx) -#ifndef HAVE_AVX512_ASM_SUPPORT -WRAPPER_IMPL_AVX512_fFF _ZGVdN4vvv_sincos +ENTRY (_ZGVeN8vl8l8_sincos_skx) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512_fFF _ZGVdN4vl8l8_sincos #else pushq %rbp cfi_adjust_cfa_offset (8) @@ -557,12 +558,12 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN4vvv_sincos vzeroupper vmovsd 1160(%rsp,%r15), %xmm0 - call sin@PLT + call JUMPTARGET(sin) vmovsd %xmm0, 1224(%rsp,%r15) vmovsd 1160(%rsp,%r15), %xmm0 - call cos@PLT + call JUMPTARGET(cos) vmovsd %xmm0, 1288(%rsp,%r15) jmp .LBL_2_8 @@ -574,17 +575,171 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN4vvv_sincos vzeroupper vmovsd 1152(%rsp,%r15), %xmm0 - call sin@PLT + call JUMPTARGET(sin) vmovsd %xmm0, 1216(%rsp,%r15) vmovsd 1152(%rsp,%r15), %xmm0 - call cos@PLT + call JUMPTARGET(cos) vmovsd %xmm0, 1280(%rsp,%r15) jmp .LBL_2_7 #endif +END (_ZGVeN8vl8l8_sincos_skx) +libmvec_hidden_def(_ZGVeN8vl8l8_sincos_skx) + +/* Wrapper between vvv and vl8l8 vector variants. */ +.macro WRAPPER_AVX512_vvv_vl8l8 callee +#ifndef __ILP32__ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $256, %rsp + vmovups %zmm1, 128(%rsp) + lea (%rsp), %rdi + vmovups %zmm2, 192(%rdi) + lea 64(%rsp), %rsi + call HIDDEN_JUMPTARGET(\callee) + movq 128(%rsp), %rdx + movq 136(%rsp), %rsi + movq 144(%rsp), %r8 + movq 152(%rsp), %r10 + movq (%rsp), %rax + movq 8(%rsp), %rcx + movq 16(%rsp), %rdi + movq 24(%rsp), %r9 + movq %rax, (%rdx) + movq %rcx, (%rsi) + movq 160(%rsp), %rax + movq 168(%rsp), %rcx + movq %rdi, (%r8) + movq %r9, (%r10) + movq 176(%rsp), %rdi + movq 184(%rsp), %r9 + movq 32(%rsp), %r11 + movq 40(%rsp), %rdx + movq 48(%rsp), %rsi + movq 56(%rsp), %r8 + movq %r11, (%rax) + movq %rdx, (%rcx) + movq 192(%rsp), %r11 + movq 200(%rsp), %rdx + movq %rsi, (%rdi) + movq %r8, (%r9) + movq 208(%rsp), %rsi + movq 216(%rsp), %r8 + movq 64(%rsp), %r10 + movq 72(%rsp), %rax + movq 80(%rsp), %rcx + movq 88(%rsp), %rdi + movq %r10, (%r11) + movq %rax, (%rdx) + movq 224(%rsp), %r10 + movq 232(%rsp), %rax + movq %rcx, (%rsi) + movq %rdi, (%r8) + movq 240(%rsp), %rcx + movq 248(%rsp), %rdi + movq 96(%rsp), %r9 + movq 104(%rsp), %r11 + movq 112(%rsp), %rdx + movq 120(%rsp), %rsi + movq %r9, (%r10) + movq %r11, (%rax) + movq %rdx, (%rcx) + movq %rsi, (%rdi) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +#else + leal 8(%rsp), %r10d + .cfi_def_cfa 10, 0 + andl $-64, %esp + pushq -8(%r10d) + pushq %rbp + .cfi_escape 0x10,0x6,0x2,0x76,0 + movl %esp, %ebp + pushq %r10 + .cfi_escape 0xf,0x3,0x76,0x78,0x6 + leal -112(%rbp), %esi + leal -176(%rbp), %edi + subl $232, %esp + vmovdqa %ymm1, -208(%ebp) + vmovdqa %ymm2, -240(%ebp) + call HIDDEN_JUMPTARGET(\callee) + vmovdqa -208(%ebp), %xmm0 + vmovq %xmm0, %rax + vmovsd -176(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + shrq $32, %rax + vmovsd -168(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movq -200(%ebp), %rax + vmovsd -160(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + shrq $32, %rax + vmovsd -152(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movq -192(%ebp), %rax + vmovsd -144(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + shrq $32, %rax + vmovsd -136(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movq -184(%ebp), %rax + vmovsd -128(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + shrq $32, %rax + vmovsd -120(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + vmovdqa -240(%ebp), %xmm0 + vmovq %xmm0, %rax + vmovsd -112(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + shrq $32, %rax + vmovsd -104(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movq -232(%ebp), %rax + vmovsd -96(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + shrq $32, %rax + vmovsd -88(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movq -224(%ebp), %rax + vmovsd -80(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + shrq $32, %rax + vmovsd -72(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movq -216(%ebp), %rax + vmovsd -64(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + shrq $32, %rax + vmovsd -56(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + addl $232, %esp + popq %r10 + .cfi_def_cfa 10, 0 + popq %rbp + leal -8(%r10), %esp + .cfi_def_cfa 7, 8 + ret +#endif +.endm + +ENTRY (_ZGVeN8vvv_sincos_knl) +WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_knl +END (_ZGVeN8vvv_sincos_knl) + +ENTRY (_ZGVeN8vvv_sincos_skx) +WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_skx END (_ZGVeN8vvv_sincos_skx) .section .rodata, "a" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core-avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core-avx2.S new file mode 100644 index 0000000000..f01f89f294 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core-avx2.S @@ -0,0 +1,20 @@ +/* AVX2 version of vectorized cosf. + Copyright (C) 2014-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define _ZGVeN16v_cosf _ZGVeN16v_cosf_avx2_wrapper +#include "../svml_s_cosf16_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S deleted file mode 100644 index 755254a280..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S +++ /dev/null @@ -1,37 +0,0 @@ -/* Multiple versions of vectorized cosf. - Copyright (C) 2014-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVeN16v_cosf) - .type _ZGVeN16v_cosf, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVeN16v_cosf_skx(%rip), %rax - HAS_ARCH_FEATURE (AVX512DQ_Usable) - jnz 2f - leaq _ZGVeN16v_cosf_knl(%rip), %rax - HAS_ARCH_FEATURE (AVX512F_Usable) - jnz 2f - leaq _ZGVeN16v_cosf_avx2_wrapper(%rip), %rax -2: ret -END (_ZGVeN16v_cosf) - -#define _ZGVeN16v_cosf _ZGVeN16v_cosf_avx2_wrapper -#include "../svml_s_cosf16_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.c b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.c new file mode 100644 index 0000000000..5bd0441b16 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.c @@ -0,0 +1,28 @@ +/* Multiple versions of vectorized cosf, vector length is 16. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define SYMBOL_NAME _ZGVeN16v_cosf +#include "ifunc-mathvec-avx512.h" + +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ()); + +#ifdef SHARED +__hidden_ver1 (_ZGVeN16v_cosf, __GI__ZGVeN16v_cosf, + __redirect__ZGVeN16v_cosf) + __attribute__ ((visibility ("hidden"))); +#endif diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S index 5004cd4758..6ea1137b42 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S @@ -1,5 +1,5 @@ /* Function cosf vectorized with AVX-512. KNL and SKX versions. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -22,7 +22,7 @@ .text ENTRY (_ZGVeN16v_cosf_knl) -#ifndef HAVE_AVX512_ASM_SUPPORT +#ifndef HAVE_AVX512DQ_ASM_SUPPORT WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf #else /* @@ -225,21 +225,21 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf cfi_restore_state movzbl %r12b, %r15d vmovss 1156(%rsp,%r15,8), %xmm0 - call cosf@PLT + call JUMPTARGET(cosf) vmovss %xmm0, 1220(%rsp,%r15,8) jmp .LBL_1_8 .LBL_1_12: movzbl %r12b, %r15d vmovss 1152(%rsp,%r15,8), %xmm0 - call cosf@PLT + call JUMPTARGET(cosf) vmovss %xmm0, 1216(%rsp,%r15,8) jmp .LBL_1_7 #endif END (_ZGVeN16v_cosf_knl) ENTRY (_ZGVeN16v_cosf_skx) -#ifndef HAVE_AVX512_ASM_SUPPORT +#ifndef HAVE_AVX512DQ_ASM_SUPPORT WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf #else /* @@ -440,7 +440,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf vmovss 1156(%rsp,%r15,8), %xmm0 vzeroupper vmovss 1156(%rsp,%r15,8), %xmm0 - call cosf@PLT + call JUMPTARGET(cosf) vmovss %xmm0, 1220(%rsp,%r15,8) jmp .LBL_2_8 .LBL_2_12: @@ -448,7 +448,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf vmovss 1152(%rsp,%r15,8), %xmm0 vzeroupper vmovss 1152(%rsp,%r15,8), %xmm0 - call cosf@PLT + call JUMPTARGET(cosf) vmovss %xmm0, 1216(%rsp,%r15,8) jmp .LBL_2_7 #endif diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core-sse2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core-sse2.S new file mode 100644 index 0000000000..727189f8e6 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core-sse2.S @@ -0,0 +1,20 @@ +/* SSE2 version of vectorized cosf, vector length is 4. + Copyright (C) 2014-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define _ZGVbN4v_cosf _ZGVbN4v_cosf_sse2 +#include "../svml_s_cosf4_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core.S deleted file mode 100644 index ad7de18851..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core.S +++ /dev/null @@ -1,36 +0,0 @@ -/* Multiple versions of vectorized cosf, vector length is 4. - Copyright (C) 2014-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVbN4v_cosf) - .type _ZGVbN4v_cosf, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVbN4v_cosf_sse4(%rip), %rax - HAS_CPU_FEATURE (SSE4_1) - jz 2f - ret -2: leaq _ZGVbN4v_cosf_sse2(%rip), %rax - ret -END (_ZGVbN4v_cosf) -libmvec_hidden_def (_ZGVbN4v_cosf) - -#define _ZGVbN4v_cosf _ZGVbN4v_cosf_sse2 -#include "../svml_s_cosf4_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core.c b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core.c new file mode 100644 index 0000000000..dde470af5d --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core.c @@ -0,0 +1,28 @@ +/* Multiple versions of vectorized cosf, vector length is 4. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define SYMBOL_NAME _ZGVbN4v_cosf +#include "ifunc-mathvec-sse4_1.h" + +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ()); + +#ifdef SHARED +__hidden_ver1 (_ZGVbN4v_cosf, __GI__ZGVbN4v_cosf, + __redirect__ZGVbN4v_cosf) + __attribute__ ((visibility ("hidden"))); +#endif diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core_sse4.S index d23ff72a30..f4e0553bb3 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core_sse4.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core_sse4.S @@ -1,5 +1,5 @@ /* Function cosf vectorized with SSE4. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -211,7 +211,7 @@ ENTRY (_ZGVbN4v_cosf_sse4) movzbl %r12b, %r15d movss 196(%rsp,%r15,8), %xmm0 - call cosf@PLT + call JUMPTARGET(cosf) movss %xmm0, 260(%rsp,%r15,8) jmp .LBL_1_8 @@ -220,7 +220,7 @@ ENTRY (_ZGVbN4v_cosf_sse4) movzbl %r12b, %r15d movss 192(%rsp,%r15,8), %xmm0 - call cosf@PLT + call JUMPTARGET(cosf) movss %xmm0, 256(%rsp,%r15,8) jmp .LBL_1_7 diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core-sse.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core-sse.S new file mode 100644 index 0000000000..1e1a5540c3 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core-sse.S @@ -0,0 +1,20 @@ +/* SSE version of vectorized cosf, vector length is 8. + Copyright (C) 2014-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define _ZGVdN8v_cosf _ZGVdN8v_cosf_sse_wrapper +#include "../svml_s_cosf8_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core.S deleted file mode 100644 index 602c70e324..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core.S +++ /dev/null @@ -1,36 +0,0 @@ -/* Multiple versions of vectorized cosf, vector length is 8. - Copyright (C) 2014-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVdN8v_cosf) - .type _ZGVdN8v_cosf, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVdN8v_cosf_avx2(%rip), %rax - HAS_ARCH_FEATURE (AVX2_Usable) - jz 2f - ret -2: leaq _ZGVdN8v_cosf_sse_wrapper(%rip), %rax - ret -END (_ZGVdN8v_cosf) -libmvec_hidden_def (_ZGVdN8v_cosf) - -#define _ZGVdN8v_cosf _ZGVdN8v_cosf_sse_wrapper -#include "../svml_s_cosf8_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core.c b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core.c new file mode 100644 index 0000000000..56531b215a --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core.c @@ -0,0 +1,28 @@ +/* Multiple versions of vectorized cosf, vector length is 8. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define SYMBOL_NAME _ZGVdN8v_cosf +#include "ifunc-mathvec-avx2.h" + +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ()); + +#ifdef SHARED +__hidden_ver1 (_ZGVdN8v_cosf, __GI__ZGVdN8v_cosf, + __redirect__ZGVdN8v_cosf) + __attribute__ ((visibility ("hidden"))); +#endif diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core_avx2.S index 513f3c0a29..dbff4a7b7e 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core_avx2.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core_avx2.S @@ -1,5 +1,5 @@ /* Function cosf vectorized with AVX2. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -197,7 +197,7 @@ ENTRY (_ZGVdN8v_cosf_avx2) vmovss 324(%rsp,%r15,8), %xmm0 vzeroupper - call cosf@PLT + call JUMPTARGET(cosf) vmovss %xmm0, 388(%rsp,%r15,8) jmp .LBL_1_8 @@ -207,7 +207,7 @@ ENTRY (_ZGVdN8v_cosf_avx2) vmovss 320(%rsp,%r15,8), %xmm0 vzeroupper - call cosf@PLT + call JUMPTARGET(cosf) vmovss %xmm0, 384(%rsp,%r15,8) jmp .LBL_1_7 diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core-avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core-avx2.S new file mode 100644 index 0000000000..e0b7fd787f --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core-avx2.S @@ -0,0 +1,23 @@ +/* AVX2 version of vectorized expf. + Copyright (C) 2014-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +#define _ZGVeN16v_expf _ZGVeN16v_expf_avx2_wrapper +#include "../svml_s_expf16_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S deleted file mode 100644 index f990d36483..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S +++ /dev/null @@ -1,37 +0,0 @@ -/* Multiple versions of vectorized expf. - Copyright (C) 2014-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVeN16v_expf) - .type _ZGVeN16v_expf, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVeN16v_expf_skx(%rip), %rax - HAS_ARCH_FEATURE (AVX512DQ_Usable) - jnz 2f - leaq _ZGVeN16v_expf_knl(%rip), %rax - HAS_ARCH_FEATURE (AVX512F_Usable) - jnz 2f - leaq _ZGVeN16v_expf_avx2_wrapper(%rip), %rax -2: ret -END (_ZGVeN16v_expf) - -#define _ZGVeN16v_expf _ZGVeN16v_expf_avx2_wrapper -#include "../svml_s_expf16_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.c b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.c new file mode 100644 index 0000000000..d358d93546 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.c @@ -0,0 +1,28 @@ +/* Multiple versions of vectorized expf, vector length is 16. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define SYMBOL_NAME _ZGVeN16v_expf +#include "ifunc-mathvec-avx512.h" + +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ()); + +#ifdef SHARED +__hidden_ver1 (_ZGVeN16v_expf, __GI__ZGVeN16v_expf, + __redirect__ZGVeN16v_expf) + __attribute__ ((visibility ("hidden"))); +#endif diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S index 7eb7a1b775..89ba0df28f 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S @@ -1,5 +1,5 @@ /* Function expf vectorized with AVX-512. KNL and SKX versions. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -22,7 +22,7 @@ .text ENTRY (_ZGVeN16v_expf_knl) -#ifndef HAVE_AVX512_ASM_SUPPORT +#ifndef HAVE_AVX512DQ_ASM_SUPPORT WRAPPER_IMPL_AVX512 _ZGVdN8v_expf #else /* @@ -212,14 +212,14 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf cfi_restore_state movzbl %r12b, %r15d vmovss 1156(%rsp,%r15,8), %xmm0 - call expf@PLT + call JUMPTARGET(__expf_finite) vmovss %xmm0, 1220(%rsp,%r15,8) jmp .LBL_1_8 .LBL_1_12: movzbl %r12b, %r15d vmovss 1152(%rsp,%r15,8), %xmm0 - call expf@PLT + call JUMPTARGET(__expf_finite) vmovss %xmm0, 1216(%rsp,%r15,8) jmp .LBL_1_7 @@ -227,7 +227,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf END (_ZGVeN16v_expf_knl) ENTRY (_ZGVeN16v_expf_skx) -#ifndef HAVE_AVX512_ASM_SUPPORT +#ifndef HAVE_AVX512DQ_ASM_SUPPORT WRAPPER_IMPL_AVX512 _ZGVdN8v_expf #else /* @@ -422,7 +422,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf vzeroupper vmovss 1156(%rsp,%r15,8), %xmm0 - call expf@PLT + call JUMPTARGET(__expf_finite) vmovss %xmm0, 1220(%rsp,%r15,8) jmp .LBL_2_8 @@ -433,7 +433,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf vzeroupper vmovss 1152(%rsp,%r15,8), %xmm0 - call expf@PLT + call JUMPTARGET(__expf_finite) vmovss %xmm0, 1216(%rsp,%r15,8) jmp .LBL_2_7 diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core-sse2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core-sse2.S new file mode 100644 index 0000000000..8f57e4bbd9 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core-sse2.S @@ -0,0 +1,20 @@ +/* SSE2 version of vectorized expf. + Copyright (C) 2014-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define _ZGVbN4v_expf _ZGVbN4v_expf_sse2 +#include "../svml_s_expf4_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core.S deleted file mode 100644 index 2fbe6d475e..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core.S +++ /dev/null @@ -1,36 +0,0 @@ -/* Multiple versions of vectorized expf. - Copyright (C) 2014-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVbN4v_expf) - .type _ZGVbN4v_expf, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVbN4v_expf_sse4(%rip), %rax - HAS_CPU_FEATURE (SSE4_1) - jz 2f - ret -2: leaq _ZGVbN4v_expf_sse2(%rip), %rax - ret -END (_ZGVbN4v_expf) -libmvec_hidden_def (_ZGVbN4v_expf) - -#define _ZGVbN4v_expf _ZGVbN4v_expf_sse2 -#include "../svml_s_expf4_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core.c b/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core.c new file mode 100644 index 0000000000..82befe0b5d --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core.c @@ -0,0 +1,28 @@ +/* Multiple versions of vectorized expf, vector length is 4. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define SYMBOL_NAME _ZGVbN4v_expf +#include "ifunc-mathvec-sse4_1.h" + +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ()); + +#ifdef SHARED +__hidden_ver1 (_ZGVbN4v_expf, __GI__ZGVbN4v_expf, + __redirect__ZGVbN4v_expf) + __attribute__ ((visibility ("hidden"))); +#endif diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core_sse4.S index c6f91e8dc1..254ec94096 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core_sse4.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core_sse4.S @@ -1,5 +1,5 @@ /* Function expf vectorized with SSE4. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -195,7 +195,7 @@ ENTRY (_ZGVbN4v_expf_sse4) movzbl %r12b, %r15d movss 196(%rsp,%r15,8), %xmm0 - call expf@PLT + call JUMPTARGET(__expf_finite) movss %xmm0, 260(%rsp,%r15,8) jmp .LBL_1_8 @@ -204,7 +204,7 @@ ENTRY (_ZGVbN4v_expf_sse4) movzbl %r12b, %r15d movss 192(%rsp,%r15,8), %xmm0 - call expf@PLT + call JUMPTARGET(__expf_finite) movss %xmm0, 256(%rsp,%r15,8) jmp .LBL_1_7 diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core-sse.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core-sse.S new file mode 100644 index 0000000000..459699c80c --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core-sse.S @@ -0,0 +1,20 @@ +/* SSE version of vectorized expf. + Copyright (C) 2014-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define _ZGVdN8v_expf _ZGVdN8v_expf_sse_wrapper +#include "../svml_s_expf8_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core.S deleted file mode 100644 index 7d19bb423d..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core.S +++ /dev/null @@ -1,36 +0,0 @@ -/* Multiple versions of vectorized expf. - Copyright (C) 2014-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVdN8v_expf) - .type _ZGVdN8v_expf, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVdN8v_expf_avx2(%rip), %rax - HAS_ARCH_FEATURE (AVX2_Usable) - jz 2f - ret -2: leaq _ZGVdN8v_expf_sse_wrapper(%rip), %rax - ret -END (_ZGVdN8v_expf) -libmvec_hidden_def (_ZGVdN8v_expf) - -#define _ZGVdN8v_expf _ZGVdN8v_expf_sse_wrapper -#include "../svml_s_expf8_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core.c b/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core.c new file mode 100644 index 0000000000..0b8a47ede0 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core.c @@ -0,0 +1,28 @@ +/* Multiple versions of vectorized expf, vector length is 8. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define SYMBOL_NAME _ZGVdN8v_expf +#include "ifunc-mathvec-avx2.h" + +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ()); + +#ifdef SHARED +__hidden_ver1 (_ZGVdN8v_expf, __GI__ZGVdN8v_expf, + __redirect__ZGVdN8v_expf) + __attribute__ ((visibility ("hidden"))); +#endif diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core_avx2.S index c6be6954f7..ae1d5317e4 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core_avx2.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core_avx2.S @@ -1,5 +1,5 @@ /* Function expf vectorized with AVX2. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -184,7 +184,7 @@ ENTRY(_ZGVdN8v_expf_avx2) vmovss 324(%rsp,%r15,8), %xmm0 vzeroupper - call expf@PLT + call JUMPTARGET(__expf_finite) vmovss %xmm0, 388(%rsp,%r15,8) jmp .LBL_1_8 @@ -194,7 +194,7 @@ ENTRY(_ZGVdN8v_expf_avx2) vmovss 320(%rsp,%r15,8), %xmm0 vzeroupper - call expf@PLT + call JUMPTARGET(__expf_finite) vmovss %xmm0, 384(%rsp,%r15,8) jmp .LBL_1_7 diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core-avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core-avx2.S new file mode 100644 index 0000000000..b23bd12fa0 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core-avx2.S @@ -0,0 +1,20 @@ +/* AVX2 version of vectorized logf. + Copyright (C) 2014-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define _ZGVeN16v_logf _ZGVeN16v_logf_avx2_wrapper +#include "../svml_s_logf16_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S deleted file mode 100644 index 9efb2fb7df..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S +++ /dev/null @@ -1,37 +0,0 @@ -/* Multiple versions of vectorized logf. - Copyright (C) 2014-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVeN16v_logf) - .type _ZGVeN16v_logf, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVeN16v_logf_skx(%rip), %rax - HAS_ARCH_FEATURE (AVX512DQ_Usable) - jnz 2f - leaq _ZGVeN16v_logf_knl(%rip), %rax - HAS_ARCH_FEATURE (AVX512F_Usable) - jnz 2f - leaq _ZGVeN16v_logf_avx2_wrapper(%rip), %rax -2: ret -END (_ZGVeN16v_logf) - -#define _ZGVeN16v_logf _ZGVeN16v_logf_avx2_wrapper -#include "../svml_s_logf16_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.c b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.c new file mode 100644 index 0000000000..fec61883b4 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.c @@ -0,0 +1,28 @@ +/* Multiple versions of vectorized logf, vector length is 16. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define SYMBOL_NAME _ZGVeN16v_logf +#include "ifunc-mathvec-avx512.h" + +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ()); + +#ifdef SHARED +__hidden_ver1 (_ZGVeN16v_logf, __GI__ZGVeN16v_logf, + __redirect__ZGVeN16v_logf) + __attribute__ ((visibility ("hidden"))); +#endif diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S index 6209058381..4cf0a96fe4 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S @@ -1,5 +1,5 @@ /* Function logf vectorized with AVX-512. KNL and SKX versions. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -22,7 +22,7 @@ .text ENTRY (_ZGVeN16v_logf_knl) -#ifndef HAVE_AVX512_ASM_SUPPORT +#ifndef HAVE_AVX512DQ_ASM_SUPPORT WRAPPER_IMPL_AVX512 _ZGVdN8v_logf #else /* @@ -197,21 +197,21 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_logf cfi_restore_state movzbl %r12b, %r15d vmovss 1156(%rsp,%r15,8), %xmm0 - call logf@PLT + call JUMPTARGET(__logf_finite) vmovss %xmm0, 1220(%rsp,%r15,8) jmp .LBL_1_8 .LBL_1_12: movzbl %r12b, %r15d vmovss 1152(%rsp,%r15,8), %xmm0 - call logf@PLT + call JUMPTARGET(__logf_finite) vmovss %xmm0, 1216(%rsp,%r15,8) jmp .LBL_1_7 #endif END (_ZGVeN16v_logf_knl) ENTRY (_ZGVeN16v_logf_skx) -#ifndef HAVE_AVX512_ASM_SUPPORT +#ifndef HAVE_AVX512DQ_ASM_SUPPORT WRAPPER_IMPL_AVX512 _ZGVdN8v_logf #else /* @@ -391,7 +391,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_logf vzeroupper vmovss 1156(%rsp,%r15,8), %xmm0 - call logf@PLT + call JUMPTARGET(__logf_finite) vmovss %xmm0, 1220(%rsp,%r15,8) jmp .LBL_2_8 @@ -402,7 +402,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_logf vzeroupper vmovss 1152(%rsp,%r15,8), %xmm0 - call logf@PLT + call JUMPTARGET(__logf_finite) vmovss %xmm0, 1216(%rsp,%r15,8) jmp .LBL_2_7 diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core-sse2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core-sse2.S new file mode 100644 index 0000000000..2c2331e1d8 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core-sse2.S @@ -0,0 +1,20 @@ +/* SSE2 version of vectorized logf. + Copyright (C) 2014-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define _ZGVbN4v_logf _ZGVbN4v_logf_sse2 +#include "../svml_s_logf4_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core.S deleted file mode 100644 index c85615ac25..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core.S +++ /dev/null @@ -1,36 +0,0 @@ -/* Multiple versions of vectorized logf. - Copyright (C) 2014-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVbN4v_logf) - .type _ZGVbN4v_logf, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVbN4v_logf_sse4(%rip), %rax - HAS_CPU_FEATURE (SSE4_1) - jz 2f - ret -2: leaq _ZGVbN4v_logf_sse2(%rip), %rax - ret -END (_ZGVbN4v_logf) -libmvec_hidden_def (_ZGVbN4v_logf) - -#define _ZGVbN4v_logf _ZGVbN4v_logf_sse2 -#include "../svml_s_logf4_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core.c b/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core.c new file mode 100644 index 0000000000..f249c351bd --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core.c @@ -0,0 +1,28 @@ +/* Multiple versions of vectorized logf, vector length is 4. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define SYMBOL_NAME _ZGVbN4v_logf +#include "ifunc-mathvec-sse4_1.h" + +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ()); + +#ifdef SHARED +__hidden_ver1 (_ZGVbN4v_logf, __GI__ZGVbN4v_logf, + __redirect__ZGVbN4v_logf) + __attribute__ ((visibility ("hidden"))); +#endif diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core_sse4.S index 1ce9838513..651eb5eb1a 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core_sse4.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core_sse4.S @@ -1,5 +1,5 @@ /* Function logf vectorized with SSE4. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -177,7 +177,7 @@ ENTRY (_ZGVbN4v_logf_sse4) movzbl %r12b, %r15d movss 196(%rsp,%r15,8), %xmm0 - call logf@PLT + call JUMPTARGET(__logf_finite) movss %xmm0, 260(%rsp,%r15,8) jmp .LBL_1_8 @@ -186,7 +186,7 @@ ENTRY (_ZGVbN4v_logf_sse4) movzbl %r12b, %r15d movss 192(%rsp,%r15,8), %xmm0 - call logf@PLT + call JUMPTARGET(__logf_finite) movss %xmm0, 256(%rsp,%r15,8) jmp .LBL_1_7 diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core-sse.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core-sse.S new file mode 100644 index 0000000000..862379277b --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core-sse.S @@ -0,0 +1,20 @@ +/* SSE version of vectorized logf. + Copyright (C) 2014-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define _ZGVdN8v_logf _ZGVdN8v_logf_sse_wrapper +#include "../svml_s_logf8_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core.S deleted file mode 100644 index 8f6d83dd56..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core.S +++ /dev/null @@ -1,36 +0,0 @@ -/* Multiple versions of vectorized logf. - Copyright (C) 2014-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVdN8v_logf) - .type _ZGVdN8v_logf, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVdN8v_logf_avx2(%rip), %rax - HAS_ARCH_FEATURE (AVX2_Usable) - jz 2f - ret -2: leaq _ZGVdN8v_logf_sse_wrapper(%rip), %rax - ret -END (_ZGVdN8v_logf) -libmvec_hidden_def (_ZGVdN8v_logf) - -#define _ZGVdN8v_logf _ZGVdN8v_logf_sse_wrapper -#include "../svml_s_logf8_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core.c b/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core.c new file mode 100644 index 0000000000..dbd29657ca --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core.c @@ -0,0 +1,28 @@ +/* Multiple versions of vectorized logf, vector length is 8. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define SYMBOL_NAME _ZGVdN8v_logf +#include "ifunc-mathvec-avx2.h" + +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ()); + +#ifdef SHARED +__hidden_ver1 (_ZGVdN8v_logf, __GI__ZGVdN8v_logf, + __redirect__ZGVdN8v_logf) + __attribute__ ((visibility ("hidden"))); +#endif diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core_avx2.S index 91fb549ce6..c7f5448fcb 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core_avx2.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core_avx2.S @@ -1,5 +1,5 @@ /* Function logf vectorized with AVX2. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -166,7 +166,7 @@ ENTRY(_ZGVdN8v_logf_avx2) vmovss 324(%rsp,%r15,8), %xmm0 vzeroupper - call logf@PLT + call JUMPTARGET(__logf_finite) vmovss %xmm0, 388(%rsp,%r15,8) jmp .LBL_1_8 @@ -176,7 +176,7 @@ ENTRY(_ZGVdN8v_logf_avx2) vmovss 320(%rsp,%r15,8), %xmm0 vzeroupper - call logf@PLT + call JUMPTARGET(__logf_finite) vmovss %xmm0, 384(%rsp,%r15,8) jmp .LBL_1_7 diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core-avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core-avx2.S new file mode 100644 index 0000000000..de705c8632 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core-avx2.S @@ -0,0 +1,20 @@ +/* AVX2 version of vectorized powf. + Copyright (C) 2014-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define _ZGVeN16vv_powf _ZGVeN16vv_powf_avx2_wrapper +#include "../svml_s_powf16_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S deleted file mode 100644 index 80048ce977..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S +++ /dev/null @@ -1,37 +0,0 @@ -/* Multiple versions of vectorized powf. - Copyright (C) 2014-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVeN16vv_powf) - .type _ZGVeN16vv_powf, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVeN16vv_powf_skx(%rip), %rax - HAS_ARCH_FEATURE (AVX512DQ_Usable) - jnz 2f - leaq _ZGVeN16vv_powf_knl(%rip), %rax - HAS_ARCH_FEATURE (AVX512F_Usable) - jnz 2f - leaq _ZGVeN16vv_powf_avx2_wrapper(%rip), %rax -2: ret -END (_ZGVeN16vv_powf) - -#define _ZGVeN16vv_powf _ZGVeN16vv_powf_avx2_wrapper -#include "../svml_s_powf16_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.c b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.c new file mode 100644 index 0000000000..91ea810441 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.c @@ -0,0 +1,28 @@ +/* Multiple versions of vectorized powf, vector length is 16. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define SYMBOL_NAME _ZGVeN16vv_powf +#include "ifunc-mathvec-avx512.h" + +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ()); + +#ifdef SHARED +__hidden_ver1 (_ZGVeN16vv_powf, __GI__ZGVeN16vv_powf, + __redirect__ZGVeN16vv_powf) + __attribute__ ((visibility ("hidden"))); +#endif diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S index 45d48723af..bdcd50afe1 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S @@ -1,5 +1,5 @@ /* Function powf vectorized with AVX-512. KNL and SKX versions. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -82,7 +82,7 @@ .text ENTRY (_ZGVeN16vv_powf_knl) -#ifndef HAVE_AVX512_ASM_SUPPORT +#ifndef HAVE_AVX512DQ_ASM_SUPPORT WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf #else pushq %rbp @@ -344,7 +344,7 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf movzbl %r12b, %r15d vmovss 1156(%rsp,%r15,8), %xmm0 vmovss 1220(%rsp,%r15,8), %xmm1 - call powf@PLT + call JUMPTARGET(__powf_finite) vmovss %xmm0, 1284(%rsp,%r15,8) jmp .LBL_1_8 @@ -352,14 +352,14 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf movzbl %r12b, %r15d vmovss 1152(%rsp,%r15,8), %xmm0 vmovss 1216(%rsp,%r15,8), %xmm1 - call powf@PLT + call JUMPTARGET(__powf_finite) vmovss %xmm0, 1280(%rsp,%r15,8) jmp .LBL_1_7 #endif END (_ZGVeN16vv_powf_knl) ENTRY (_ZGVeN16vv_powf_skx) -#ifndef HAVE_AVX512_ASM_SUPPORT +#ifndef HAVE_AVX512DQ_ASM_SUPPORT WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf #else pushq %rbp @@ -629,7 +629,7 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf vmovss 1156(%rsp,%r15,8), %xmm1 vzeroupper vmovss 1092(%rsp,%r15,8), %xmm0 - call powf@PLT + call JUMPTARGET(__powf_finite) vmovss %xmm0, 1220(%rsp,%r15,8) jmp .LBL_2_8 @@ -638,7 +638,7 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf vmovss 1152(%rsp,%r15,8), %xmm1 vzeroupper vmovss 1088(%rsp,%r15,8), %xmm0 - call powf@PLT + call JUMPTARGET(__powf_finite) vmovss %xmm0, 1216(%rsp,%r15,8) jmp .LBL_2_7 #endif diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core-sse2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core-sse2.S new file mode 100644 index 0000000000..b6789a621d --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core-sse2.S @@ -0,0 +1,20 @@ +/* SSE2 version of vectorized powf. + Copyright (C) 2014-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define _ZGVbN4vv_powf _ZGVbN4vv_powf_sse2 +#include "../svml_s_powf4_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core.S deleted file mode 100644 index b46821189b..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core.S +++ /dev/null @@ -1,36 +0,0 @@ -/* Multiple versions of vectorized powf. - Copyright (C) 2014-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVbN4vv_powf) - .type _ZGVbN4vv_powf, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVbN4vv_powf_sse4(%rip), %rax - HAS_CPU_FEATURE (SSE4_1) - jz 2f - ret -2: leaq _ZGVbN4vv_powf_sse2(%rip), %rax - ret -END (_ZGVbN4vv_powf) -libmvec_hidden_def (_ZGVbN4vv_powf) - -#define _ZGVbN4vv_powf _ZGVbN4vv_powf_sse2 -#include "../svml_s_powf4_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core.c b/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core.c new file mode 100644 index 0000000000..8149d7c991 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core.c @@ -0,0 +1,28 @@ +/* Multiple versions of vectorized powf, vector length is 4. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define SYMBOL_NAME _ZGVbN4vv_powf +#include "ifunc-mathvec-sse4_1.h" + +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ()); + +#ifdef SHARED +__hidden_ver1 (_ZGVbN4vv_powf, __GI__ZGVbN4vv_powf, + __redirect__ZGVbN4vv_powf) + __attribute__ ((visibility ("hidden"))); +#endif diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core_sse4.S index 420f98c6a6..bc59545c98 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core_sse4.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core_sse4.S @@ -1,5 +1,5 @@ /* Function powf vectorized with SSE4. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -356,7 +356,7 @@ ENTRY (_ZGVbN4vv_powf_sse4) movss 68(%rsp,%r15,8), %xmm0 movss 132(%rsp,%r15,8), %xmm1 - call powf@PLT + call JUMPTARGET(__powf_finite) movss %xmm0, 196(%rsp,%r15,8) jmp .LBL_1_8 @@ -366,7 +366,7 @@ ENTRY (_ZGVbN4vv_powf_sse4) movss 64(%rsp,%r15,8), %xmm0 movss 128(%rsp,%r15,8), %xmm1 - call powf@PLT + call JUMPTARGET(__powf_finite) movss %xmm0, 192(%rsp,%r15,8) jmp .LBL_1_7 diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core-sse.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core-sse.S new file mode 100644 index 0000000000..48da6d25c7 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core-sse.S @@ -0,0 +1,20 @@ +/* SSE version of vectorized powf. + Copyright (C) 2014-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define _ZGVdN8vv_powf _ZGVdN8vv_powf_sse_wrapper +#include "../svml_s_powf8_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core.S deleted file mode 100644 index 945908a2ff..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core.S +++ /dev/null @@ -1,36 +0,0 @@ -/* Multiple versions of vectorized powf. - Copyright (C) 2014-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVdN8vv_powf) - .type _ZGVdN8vv_powf, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVdN8vv_powf_avx2(%rip), %rax - HAS_ARCH_FEATURE (AVX2_Usable) - jz 2f - ret -2: leaq _ZGVdN8vv_powf_sse_wrapper(%rip), %rax - ret -END (_ZGVdN8vv_powf) -libmvec_hidden_def (_ZGVdN8vv_powf) - -#define _ZGVdN8vv_powf _ZGVdN8vv_powf_sse_wrapper -#include "../svml_s_powf8_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core.c b/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core.c new file mode 100644 index 0000000000..0da188180e --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core.c @@ -0,0 +1,28 @@ +/* Multiple versions of vectorized sinf, vector length is 8. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define SYMBOL_NAME _ZGVdN8vv_powf +#include "ifunc-mathvec-avx2.h" + +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ()); + +#ifdef SHARED +__hidden_ver1 (_ZGVdN8vv_powf, __GI__ZGVdN8vv_powf, + __redirect__ZGVdN8vv_powf) + __attribute__ ((visibility ("hidden"))); +#endif diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core_avx2.S index 4446859130..53a4b4bc2b 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core_avx2.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core_avx2.S @@ -1,5 +1,5 @@ /* Function powf vectorized with AVX2. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -317,7 +317,7 @@ ENTRY(_ZGVdN8vv_powf_avx2) vmovss 132(%rsp,%r15,8), %xmm1 vzeroupper - call powf@PLT + call JUMPTARGET(__powf_finite) vmovss %xmm0, 196(%rsp,%r15,8) jmp .LBL_1_8 @@ -328,7 +328,7 @@ ENTRY(_ZGVdN8vv_powf_avx2) vmovss 128(%rsp,%r15,8), %xmm1 vzeroupper - call powf@PLT + call JUMPTARGET(__powf_finite) vmovss %xmm0, 192(%rsp,%r15,8) jmp .LBL_1_7 diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core-avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core-avx2.S new file mode 100644 index 0000000000..c677e3f1cf --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core-avx2.S @@ -0,0 +1,20 @@ +/* AVX2 version of vectorized sincosf. + Copyright (C) 2014-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define _ZGVeN16vvv_sincosf _ZGVeN16vvv_sincosf_avx2_wrapper +#include "../svml_s_sincosf16_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S deleted file mode 100644 index 16cee0c676..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S +++ /dev/null @@ -1,37 +0,0 @@ -/* Multiple versions of vectorized sincosf. - Copyright (C) 2014-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVeN16vvv_sincosf) - .type _ZGVeN16vvv_sincosf, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVeN16vvv_sincosf_skx(%rip), %rax - HAS_ARCH_FEATURE (AVX512DQ_Usable) - jnz 2f - leaq _ZGVeN16vvv_sincosf_knl(%rip), %rax - HAS_ARCH_FEATURE (AVX512F_Usable) - jnz 2f - leaq _ZGVeN16vvv_sincosf_avx2_wrapper(%rip), %rax -2: ret -END (_ZGVeN16vvv_sincosf) - -#define _ZGVeN16vvv_sincosf _ZGVeN16vvv_sincosf_avx2_wrapper -#include "../svml_s_sincosf16_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.c b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.c new file mode 100644 index 0000000000..b753be6bbd --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.c @@ -0,0 +1,28 @@ +/* Multiple versions of vectorized sincosf, vector length is 16. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define SYMBOL_NAME _ZGVeN16vvv_sincosf +#include "ifunc-mathvec-avx512.h" + +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ()); + +#ifdef SHARED +__hidden_ver1 (_ZGVeN16vvv_sincosf, __GI__ZGVeN16vvv_sincosf, + __redirect__ZGVeN16vvv_sincosf) + __attribute__ ((visibility ("hidden"))); +#endif diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S index 758aeeaeed..5fa4bc412a 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S @@ -1,5 +1,5 @@ /* Function sincosf vectorized with AVX-512. KNL and SKX versions. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -41,7 +41,7 @@ b) Calculate 2 polynomials for sin and cos: RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3)))); RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4)))); - c) Swap RS & RC if if first bit of obtained value after + c) Swap RS & RC if first bit of obtained value after Right Shifting is set to 1. Using And, Andnot & Or operations. 3) Destination sign setting a) Set shifted destination sign using XOR operation: @@ -49,9 +49,9 @@ R2 = XOR( RC, SC ). */ .text -ENTRY (_ZGVeN16vvv_sincosf_knl) -#ifndef HAVE_AVX512_ASM_SUPPORT -WRAPPER_IMPL_AVX512_fFF _ZGVdN8vvv_sincosf +ENTRY (_ZGVeN16vl4l4_sincosf_knl) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512_fFF _ZGVdN8vl4l4_sincosf #else pushq %rbp cfi_adjust_cfa_offset (8) @@ -243,12 +243,12 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN8vvv_sincosf movzbl %r12b, %r15d vmovss 1156(%rsp,%r15,8), %xmm0 - call sinf@PLT + call JUMPTARGET(sinf) vmovss %xmm0, 1220(%rsp,%r15,8) vmovss 1156(%rsp,%r15,8), %xmm0 - call cosf@PLT + call JUMPTARGET(cosf) vmovss %xmm0, 1284(%rsp,%r15,8) jmp .LBL_1_8 @@ -257,20 +257,21 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN8vvv_sincosf movzbl %r12b, %r15d vmovss 1152(%rsp,%r15,8), %xmm0 - call sinf@PLT + call JUMPTARGET(sinf) vmovss %xmm0, 1216(%rsp,%r15,8) vmovss 1152(%rsp,%r15,8), %xmm0 - call cosf@PLT + call JUMPTARGET(cosf) vmovss %xmm0, 1280(%rsp,%r15,8) jmp .LBL_1_7 #endif -END (_ZGVeN16vvv_sincosf_knl) +END (_ZGVeN16vl4l4_sincosf_knl) +libmvec_hidden_def(_ZGVeN16vl4l4_sincosf_knl) -ENTRY (_ZGVeN16vvv_sincosf_skx) -#ifndef HAVE_AVX512_ASM_SUPPORT +ENTRY (_ZGVeN16vl4l4_sincosf_skx) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT WRAPPER_IMPL_AVX512_fFF _ZGVdN8vvv_sincosf #else pushq %rbp @@ -470,12 +471,12 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN8vvv_sincosf vzeroupper vmovss 1156(%rsp,%r15,8), %xmm0 - call sinf@PLT + call JUMPTARGET(sinf) vmovss %xmm0, 1220(%rsp,%r15,8) vmovss 1156(%rsp,%r15,8), %xmm0 - call cosf@PLT + call JUMPTARGET(cosf) vmovss %xmm0, 1284(%rsp,%r15,8) jmp .LBL_2_8 @@ -486,16 +487,266 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN8vvv_sincosf vzeroupper vmovss 1152(%rsp,%r15,8), %xmm0 - call sinf@PLT + call JUMPTARGET(sinf) vmovss %xmm0, 1216(%rsp,%r15,8) vmovss 1152(%rsp,%r15,8), %xmm0 - call cosf@PLT + call JUMPTARGET(cosf) vmovss %xmm0, 1280(%rsp,%r15,8) jmp .LBL_2_7 #endif +END (_ZGVeN16vl4l4_sincosf_skx) +libmvec_hidden_def(_ZGVeN16vl4l4_sincosf_skx) + +/* Wrapper between vvv and vl4l4 vector variants. */ +.macro WRAPPER_AVX512_vvv_vl4l4 callee +#ifndef __ILP32__ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $384, %rsp + vmovups %zmm1, 128(%rsp) + lea (%rsp), %rdi + vmovups %zmm2, 192(%rdi) + vmovups %zmm3, 256(%rdi) + vmovups %zmm4, 320(%rdi) + lea 64(%rsp), %rsi + call HIDDEN_JUMPTARGET(\callee) + movq 128(%rsp), %rdx + movq 136(%rsp), %rsi + movq 144(%rsp), %r8 + movq 152(%rsp), %r10 + movl (%rsp), %eax + movl 4(%rsp), %ecx + movl 8(%rsp), %edi + movl 12(%rsp), %r9d + movl %eax, (%rdx) + movl %ecx, (%rsi) + movq 160(%rsp), %rax + movq 168(%rsp), %rcx + movl %edi, (%r8) + movl %r9d, (%r10) + movq 176(%rsp), %rdi + movq 184(%rsp), %r9 + movl 16(%rsp), %r11d + movl 20(%rsp), %edx + movl 24(%rsp), %esi + movl 28(%rsp), %r8d + movl %r11d, (%rax) + movl %edx, (%rcx) + movq 192(%rsp), %r11 + movq 200(%rsp), %rdx + movl %esi, (%rdi) + movl %r8d, (%r9) + movq 208(%rsp), %rsi + movq 216(%rsp), %r8 + movl 32(%rsp), %r10d + movl 36(%rsp), %eax + movl 40(%rsp), %ecx + movl 44(%rsp), %edi + movl %r10d, (%r11) + movl %eax, (%rdx) + movq 224(%rsp), %r10 + movq 232(%rsp), %rax + movl %ecx, (%rsi) + movl %edi, (%r8) + movq 240(%rsp), %rcx + movq 248(%rsp), %rdi + movl 48(%rsp), %r9d + movl 52(%rsp), %r11d + movl 56(%rsp), %edx + movl 60(%rsp), %esi + movl %r9d, (%r10) + movl %r11d, (%rax) + movq 256(%rsp), %r9 + movq 264(%rsp), %r11 + movl %edx, (%rcx) + movl %esi, (%rdi) + movq 272(%rsp), %rdx + movq 280(%rsp), %rsi + movl 64(%rsp), %r8d + movl 68(%rsp), %r10d + movl 72(%rsp), %eax + movl 76(%rsp), %ecx + movl %r8d, (%r9) + movl %r10d, (%r11) + movq 288(%rsp), %r8 + movq 296(%rsp), %r10 + movl %eax, (%rdx) + movl %ecx, (%rsi) + movq 304(%rsp), %rax + movq 312(%rsp), %rcx + movl 80(%rsp), %edi + movl 84(%rsp), %r9d + movl 88(%rsp), %r11d + movl 92(%rsp), %edx + movl %edi, (%r8) + movl %r9d, (%r10) + movq 320(%rsp), %rdi + movq 328(%rsp), %r9 + movl %r11d, (%rax) + movl %edx, (%rcx) + movq 336(%rsp), %r11 + movq 344(%rsp), %rdx + movl 96(%rsp), %esi + movl 100(%rsp), %r8d + movl 104(%rsp), %r10d + movl 108(%rsp), %eax + movl %esi, (%rdi) + movl %r8d, (%r9) + movq 352(%rsp), %rsi + movq 360(%rsp), %r8 + movl %r10d, (%r11) + movl %eax, (%rdx) + movq 368(%rsp), %r10 + movq 376(%rsp), %rax + movl 112(%rsp), %ecx + movl 116(%rsp), %edi + movl 120(%rsp), %r9d + movl 124(%rsp), %r11d + movl %ecx, (%rsi) + movl %edi, (%r8) + movl %r9d, (%r10) + movl %r11d, (%rax) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +#else + leal 8(%rsp), %r10d + .cfi_def_cfa 10, 0 + andl $-64, %esp + pushq -8(%r10d) + pushq %rbp + .cfi_escape 0x10,0x6,0x2,0x76,0 + movl %esp, %ebp + pushq %r10 + .cfi_escape 0xf,0x3,0x76,0x78,0x6 + leal -112(%rbp), %esi + leal -176(%rbp), %edi + subl $296, %esp + vmovdqa64 %zmm1, -240(%ebp) + vmovdqa64 %zmm2, -304(%ebp) + call HIDDEN_JUMPTARGET(\callee) + movl -240(%ebp), %eax + vmovss -176(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -236(%ebp), %eax + vmovss -172(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -232(%ebp), %eax + vmovss -168(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -228(%ebp), %eax + vmovss -164(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -224(%ebp), %eax + vmovss -160(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -220(%ebp), %eax + vmovss -156(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -216(%ebp), %eax + vmovss -152(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -212(%ebp), %eax + vmovss -148(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -208(%ebp), %eax + vmovss -144(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -204(%ebp), %eax + vmovss -140(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -200(%ebp), %eax + vmovss -136(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -196(%ebp), %eax + vmovss -132(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -192(%ebp), %eax + vmovss -128(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -188(%ebp), %eax + vmovss -124(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -184(%ebp), %eax + vmovss -120(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -180(%ebp), %eax + vmovss -116(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -304(%ebp), %eax + vmovss -112(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -300(%ebp), %eax + vmovss -108(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -296(%ebp), %eax + vmovss -104(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -292(%ebp), %eax + vmovss -100(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -288(%ebp), %eax + vmovss -96(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -284(%ebp), %eax + vmovss -92(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -280(%ebp), %eax + vmovss -88(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -276(%ebp), %eax + vmovss -84(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -272(%ebp), %eax + vmovss -80(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -268(%ebp), %eax + vmovss -76(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -264(%ebp), %eax + vmovss -72(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -260(%ebp), %eax + vmovss -68(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -256(%ebp), %eax + vmovss -64(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -252(%ebp), %eax + vmovss -60(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -248(%ebp), %eax + vmovss -56(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -244(%ebp), %eax + vmovss -52(%ebp), %xmm0 + vmovss %xmm0, (%eax) + addl $296, %esp + popq %r10 + .cfi_def_cfa 10, 0 + popq %rbp + leal -8(%r10), %esp + .cfi_def_cfa 7, 8 + ret +#endif +.endm + +ENTRY (_ZGVeN16vvv_sincosf_knl) +WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_knl +END (_ZGVeN16vvv_sincosf_knl) + +ENTRY (_ZGVeN16vvv_sincosf_skx) +WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_skx END (_ZGVeN16vvv_sincosf_skx) .section .rodata, "a" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core-sse2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core-sse2.S new file mode 100644 index 0000000000..cc718b3a2e --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core-sse2.S @@ -0,0 +1,20 @@ +/* SSE2 version of vectorized sincosf. + Copyright (C) 2014-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define _ZGVbN4vvv_sincosf _ZGVbN4vvv_sincosf_sse2 +#include "../svml_s_sincosf4_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core.S deleted file mode 100644 index d72b4049e2..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core.S +++ /dev/null @@ -1,36 +0,0 @@ -/* Multiple versions of vectorized sincosf. - Copyright (C) 2014-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVbN4vvv_sincosf) - .type _ZGVbN4vvv_sincosf, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVbN4vvv_sincosf_sse4(%rip), %rax - HAS_CPU_FEATURE (SSE4_1) - jz 2f - ret -2: leaq _ZGVbN4vvv_sincosf_sse2(%rip), %rax - ret -END (_ZGVbN4vvv_sincosf) -libmvec_hidden_def (_ZGVbN4vvv_sincosf) - -#define _ZGVbN4vvv_sincosf _ZGVbN4vvv_sincosf_sse2 -#include "../svml_s_sincosf4_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core.c b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core.c new file mode 100644 index 0000000000..705d96a8fb --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core.c @@ -0,0 +1,28 @@ +/* Multiple versions of vectorized sincosf, vector length is 4. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define SYMBOL_NAME _ZGVbN4vvv_sincosf +#include "ifunc-mathvec-sse4_1.h" + +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ()); + +#ifdef SHARED +__hidden_ver1 (_ZGVbN4vvv_sincosf, __GI__ZGVbN4vvv_sincosf, + __redirect__ZGVbN4vvv_sincosf) + __attribute__ ((visibility ("hidden"))); +#endif diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core_sse4.S index 643fc0ca3b..d758ceeb30 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core_sse4.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core_sse4.S @@ -1,5 +1,5 @@ /* Function sincosf vectorized with SSE4. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -20,7 +20,7 @@ #include "svml_s_trig_data.h" .text -ENTRY (_ZGVbN4vvv_sincosf_sse4) +ENTRY (_ZGVbN4vl4l4_sincosf_sse4) /* ALGORITHM DESCRIPTION: @@ -42,7 +42,7 @@ ENTRY (_ZGVbN4vvv_sincosf_sse4) b) Calculate 2 polynomials for sin and cos: RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3)))); RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4)))); - c) Swap RS & RC if if first bit of obtained value after + c) Swap RS & RC if first bit of obtained value after Right Shifting is set to 1. Using And, Andnot & Or operations. 3) Destination sign setting a) Set shifted destination sign using XOR operation: @@ -241,12 +241,12 @@ ENTRY (_ZGVbN4vvv_sincosf_sse4) movzbl %r12b, %r15d movss 132(%rsp,%r15,8), %xmm0 - call sinf@PLT + call JUMPTARGET(sinf) movss %xmm0, 196(%rsp,%r15,8) movss 132(%rsp,%r15,8), %xmm0 - call cosf@PLT + call JUMPTARGET(cosf) movss %xmm0, 260(%rsp,%r15,8) jmp .LBL_1_8 @@ -255,14 +255,92 @@ ENTRY (_ZGVbN4vvv_sincosf_sse4) movzbl %r12b, %r15d movss 128(%rsp,%r15,8), %xmm0 - call sinf@PLT + call JUMPTARGET(sinf) movss %xmm0, 192(%rsp,%r15,8) movss 128(%rsp,%r15,8), %xmm0 - call cosf@PLT + call JUMPTARGET(cosf) movss %xmm0, 256(%rsp,%r15,8) jmp .LBL_1_7 +END (_ZGVbN4vl4l4_sincosf_sse4) +libmvec_hidden_def(_ZGVbN4vl4l4_sincosf_sse4) + +/* vvv version implemented with wrapper to vl4l4 variant. */ +ENTRY (_ZGVbN4vvv_sincosf_sse4) +#ifndef __ILP32__ + subq $104, %rsp + .cfi_def_cfa_offset 112 + movdqu %xmm1, 32(%rsp) + lea (%rsp), %rdi + movdqu %xmm2, 48(%rdi) + lea 16(%rsp), %rsi + movdqu %xmm3, 48(%rsi) + movdqu %xmm4, 64(%rsi) + call HIDDEN_JUMPTARGET(_ZGVbN4vl4l4_sincosf_sse4) + movq 32(%rsp), %rdx + movq 40(%rsp), %rsi + movq 48(%rsp), %r8 + movq 56(%rsp), %r10 + movl (%rsp), %eax + movl 4(%rsp), %ecx + movl 8(%rsp), %edi + movl 12(%rsp), %r9d + movl %eax, (%rdx) + movl %ecx, (%rsi) + movq 64(%rsp), %rax + movq 72(%rsp), %rcx + movl %edi, (%r8) + movl %r9d, (%r10) + movq 80(%rsp), %rdi + movq 88(%rsp), %r9 + movl 16(%rsp), %r11d + movl 20(%rsp), %edx + movl 24(%rsp), %esi + movl 28(%rsp), %r8d + movl %r11d, (%rax) + movl %edx, (%rcx) + movl %esi, (%rdi) + movl %r8d, (%r9) + addq $104, %rsp + .cfi_def_cfa_offset 8 + ret +#else + subl $72, %esp + .cfi_def_cfa_offset 80 + leal 48(%rsp), %esi + movaps %xmm1, 16(%esp) + leal 32(%rsp), %edi + movaps %xmm2, (%esp) + call HIDDEN_JUMPTARGET(_ZGVbN4vl4l4_sincosf_sse4) + movl 16(%esp), %eax + movss 32(%esp), %xmm0 + movss %xmm0, (%eax) + movl 20(%esp), %eax + movss 36(%esp), %xmm0 + movss %xmm0, (%eax) + movl 24(%esp), %eax + movss 40(%esp), %xmm0 + movss %xmm0, (%eax) + movl 28(%esp), %eax + movss 44(%esp), %xmm0 + movss %xmm0, (%eax) + movl (%esp), %eax + movss 48(%esp), %xmm0 + movss %xmm0, (%eax) + movl 4(%esp), %eax + movss 52(%esp), %xmm0 + movss %xmm0, (%eax) + movl 8(%esp), %eax + movss 56(%esp), %xmm0 + movss %xmm0, (%eax) + movl 12(%esp), %eax + movss 60(%esp), %xmm0 + movss %xmm0, (%eax) + addl $72, %esp + .cfi_def_cfa_offset 8 + ret +#endif END (_ZGVbN4vvv_sincosf_sse4) diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core-sse.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core-sse.S new file mode 100644 index 0000000000..348d1e6619 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core-sse.S @@ -0,0 +1,20 @@ +/* SSE version of vectorized sincosf. + Copyright (C) 2014-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define _ZGVdN8vvv_sincosf _ZGVdN8vvv_sincosf_sse_wrapper +#include "../svml_s_sincosf8_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core.S deleted file mode 100644 index 0123b8024e..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core.S +++ /dev/null @@ -1,36 +0,0 @@ -/* Multiple versions of vectorized sincosf. - Copyright (C) 2014-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVdN8vvv_sincosf) - .type _ZGVdN8vvv_sincosf, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVdN8vvv_sincosf_avx2(%rip), %rax - HAS_ARCH_FEATURE (AVX2_Usable) - jz 2f - ret -2: leaq _ZGVdN8vvv_sincosf_sse_wrapper(%rip), %rax - ret -END (_ZGVdN8vvv_sincosf) -libmvec_hidden_def (_ZGVdN8vvv_sincosf) - -#define _ZGVdN8vvv_sincosf _ZGVdN8vvv_sincosf_sse_wrapper -#include "../svml_s_sincosf8_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core.c b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core.c new file mode 100644 index 0000000000..74f3d3f041 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core.c @@ -0,0 +1,28 @@ +/* Multiple versions of vectorized sincosf, vector length is 8. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define SYMBOL_NAME _ZGVdN8vvv_sincosf +#include "ifunc-mathvec-avx2.h" + +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ()); + +#ifdef SHARED +__hidden_ver1 (_ZGVdN8vvv_sincosf, __GI__ZGVdN8vvv_sincosf, + __redirect__ZGVdN8vvv_sincosf) + __attribute__ ((visibility ("hidden"))); +#endif diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core_avx2.S index f2a0ba7116..8b4b92dd94 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core_avx2.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core_avx2.S @@ -1,5 +1,5 @@ /* Function sincosf vectorized with AVX2. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -20,7 +20,7 @@ #include "svml_s_trig_data.h" .text -ENTRY(_ZGVdN8vvv_sincosf_avx2) +ENTRY (_ZGVdN8vl4l4_sincosf_avx2) /* ALGORITHM DESCRIPTION: @@ -42,7 +42,7 @@ ENTRY(_ZGVdN8vvv_sincosf_avx2) b) Calculate 2 polynomials for sin and cos: RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3)))); RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4)))); - c) Swap RS & RC if if first bit of obtained value after + c) Swap RS & RC if first bit of obtained value after Right Shifting is set to 1. Using And, Andnot & Or operations. 3) Destination sign setting a) Set shifted destination sign using XOR operation: @@ -213,12 +213,12 @@ ENTRY(_ZGVdN8vvv_sincosf_avx2) vmovss 260(%rsp,%r15,8), %xmm0 vzeroupper - call sinf@PLT + call JUMPTARGET(sinf) vmovss %xmm0, 324(%rsp,%r15,8) vmovss 260(%rsp,%r15,8), %xmm0 - call cosf@PLT + call JUMPTARGET(cosf) vmovss %xmm0, 388(%rsp,%r15,8) jmp .LBL_1_8 @@ -228,14 +228,162 @@ ENTRY(_ZGVdN8vvv_sincosf_avx2) vmovss 256(%rsp,%r15,8), %xmm0 vzeroupper - call sinf@PLT + call JUMPTARGET(sinf) vmovss %xmm0, 320(%rsp,%r15,8) vmovss 256(%rsp,%r15,8), %xmm0 - call cosf@PLT + call JUMPTARGET(cosf) vmovss %xmm0, 384(%rsp,%r15,8) jmp .LBL_1_7 -END(_ZGVdN8vvv_sincosf_avx2) +END (_ZGVdN8vl4l4_sincosf_avx2) +libmvec_hidden_def(_ZGVdN8vl4l4_sincosf_avx2) + +/* vvv version implemented with wrapper to vl4l4 variant. */ +ENTRY (_ZGVdN8vvv_sincosf_avx2) +#ifndef __ILP32__ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-32, %rsp + subq $192, %rsp + vmovdqu %ymm1, 64(%rsp) + lea (%rsp), %rdi + vmovdqu %ymm2, 96(%rdi) + vmovdqu %ymm3, 128(%rdi) + vmovdqu %ymm4, 160(%rdi) + lea 32(%rsp), %rsi + call HIDDEN_JUMPTARGET(_ZGVdN8vl4l4_sincosf_avx2) + movq 64(%rsp), %rdx + movq 72(%rsp), %rsi + movq 80(%rsp), %r8 + movq 88(%rsp), %r10 + movl (%rsp), %eax + movl 4(%rsp), %ecx + movl 8(%rsp), %edi + movl 12(%rsp), %r9d + movl %eax, (%rdx) + movl %ecx, (%rsi) + movq 96(%rsp), %rax + movq 104(%rsp), %rcx + movl %edi, (%r8) + movl %r9d, (%r10) + movq 112(%rsp), %rdi + movq 120(%rsp), %r9 + movl 16(%rsp), %r11d + movl 20(%rsp), %edx + movl 24(%rsp), %esi + movl 28(%rsp), %r8d + movl %r11d, (%rax) + movl %edx, (%rcx) + movq 128(%rsp), %r11 + movq 136(%rsp), %rdx + movl %esi, (%rdi) + movl %r8d, (%r9) + movq 144(%rsp), %rsi + movq 152(%rsp), %r8 + movl 32(%rsp), %r10d + movl 36(%rsp), %eax + movl 40(%rsp), %ecx + movl 44(%rsp), %edi + movl %r10d, (%r11) + movl %eax, (%rdx) + movq 160(%rsp), %r10 + movq 168(%rsp), %rax + movl %ecx, (%rsi) + movl %edi, (%r8) + movq 176(%rsp), %rcx + movq 184(%rsp), %rdi + movl 48(%rsp), %r9d + movl 52(%rsp), %r11d + movl 56(%rsp), %edx + movl 60(%rsp), %esi + movl %r9d, (%r10) + movl %r11d, (%rax) + movl %edx, (%rcx) + movl %esi, (%rdi) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +#else + leal 8(%rsp), %r10d + .cfi_def_cfa 10, 0 + andl $-32, %esp + pushq -8(%r10d) + pushq %rbp + .cfi_escape 0x10,0x6,0x2,0x76,0 + movl %esp, %ebp + pushq %r10 + .cfi_escape 0xf,0x3,0x76,0x78,0x6 + leal -48(%rbp), %esi + leal -80(%rbp), %edi + subl $136, %esp + vmovdqa %ymm1, -112(%ebp) + vmovdqa %ymm2, -144(%ebp) + call HIDDEN_JUMPTARGET(_ZGVdN8vl4l4_sincosf_avx2) + vmovdqa -112(%ebp), %xmm0 + vmovq %xmm0, %rax + vmovss -80(%ebp), %xmm0 + vmovss %xmm0, (%eax) + vmovss -76(%ebp), %xmm0 + shrq $32, %rax + vmovss %xmm0, (%eax) + movq -104(%ebp), %rax + vmovss -72(%ebp), %xmm0 + vmovss %xmm0, (%eax) + vmovss -68(%ebp), %xmm0 + shrq $32, %rax + vmovss %xmm0, (%eax) + movq -96(%ebp), %rax + vmovss -64(%ebp), %xmm0 + vmovss %xmm0, (%eax) + vmovss -60(%ebp), %xmm0 + shrq $32, %rax + vmovss %xmm0, (%eax) + movq -88(%ebp), %rax + vmovss -56(%ebp), %xmm0 + vmovss %xmm0, (%eax) + vmovss -52(%ebp), %xmm0 + shrq $32, %rax + vmovss %xmm0, (%eax) + vmovdqa -144(%ebp), %xmm0 + vmovq %xmm0, %rax + vmovss -48(%ebp), %xmm0 + vmovss %xmm0, (%eax) + vmovss -44(%ebp), %xmm0 + shrq $32, %rax + vmovss %xmm0, (%eax) + movq -136(%ebp), %rax + vmovss -40(%ebp), %xmm0 + vmovss %xmm0, (%eax) + vmovss -36(%ebp), %xmm0 + shrq $32, %rax + vmovss %xmm0, (%eax) + movq -128(%ebp), %rax + vmovss -32(%ebp), %xmm0 + vmovss %xmm0, (%eax) + vmovss -28(%ebp), %xmm0 + shrq $32, %rax + vmovss %xmm0, (%eax) + movq -120(%ebp), %rax + vmovss -24(%ebp), %xmm0 + vmovss %xmm0, (%eax) + vmovss -20(%ebp), %xmm0 + shrq $32, %rax + vmovss %xmm0, (%eax) + addl $136, %esp + popq %r10 + .cfi_def_cfa 10, 0 + popq %rbp + leal -8(%r10), %esp + .cfi_def_cfa 7, 8 + ret +#endif +END (_ZGVdN8vvv_sincosf_avx2) diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core-avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core-avx2.S new file mode 100644 index 0000000000..fa521b9dac --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core-avx2.S @@ -0,0 +1,20 @@ +/* AVX2 version of vectorized sinf. + Copyright (C) 2014-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define _ZGVeN16v_sinf _ZGVeN16v_sinf_avx2_wrapper +#include "../svml_s_sinf16_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S deleted file mode 100644 index 2212cdd94d..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S +++ /dev/null @@ -1,37 +0,0 @@ -/* Multiple versions of vectorized sinf. - Copyright (C) 2014-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVeN16v_sinf) - .type _ZGVeN16v_sinf, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVeN16v_sinf_skx(%rip), %rax - HAS_ARCH_FEATURE (AVX512DQ_Usable) - jnz 2f - leaq _ZGVeN16v_sinf_knl(%rip), %rax - HAS_ARCH_FEATURE (AVX512F_Usable) - jnz 2f - leaq _ZGVeN16v_sinf_avx2_wrapper(%rip), %rax -2: ret -END (_ZGVeN16v_sinf) - -#define _ZGVeN16v_sinf _ZGVeN16v_sinf_avx2_wrapper -#include "../svml_s_sinf16_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.c b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.c new file mode 100644 index 0000000000..97e5b58284 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.c @@ -0,0 +1,28 @@ +/* Multiple versions of vectorized sinf, vector length is 16. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define SYMBOL_NAME _ZGVeN16v_sinf +#include "ifunc-mathvec-avx512.h" + +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ()); + +#ifdef SHARED +__hidden_ver1 (_ZGVeN16v_sinf, __GI__ZGVeN16v_sinf, + __redirect__ZGVeN16v_sinf) + __attribute__ ((visibility ("hidden"))); +#endif diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S index 61d8d3793a..141f747eb5 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S @@ -1,5 +1,5 @@ /* Function sinf vectorized with AVX-512. KNL and SKX versions. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -22,7 +22,7 @@ .text ENTRY(_ZGVeN16v_sinf_knl) -#ifndef HAVE_AVX512_ASM_SUPPORT +#ifndef HAVE_AVX512DQ_ASM_SUPPORT WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf #else /* @@ -229,21 +229,21 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf cfi_restore_state movzbl %r12b, %r15d vmovss 1156(%rsp,%r15,8), %xmm0 - call sinf@PLT + call JUMPTARGET(sinf) vmovss %xmm0, 1220(%rsp,%r15,8) jmp .LBL_1_8 .LBL_1_12: movzbl %r12b, %r15d vmovss 1152(%rsp,%r15,8), %xmm0 - call sinf@PLT + call JUMPTARGET(sinf) vmovss %xmm0, 1216(%rsp,%r15,8) jmp .LBL_1_7 #endif END(_ZGVeN16v_sinf_knl) ENTRY (_ZGVeN16v_sinf_skx) -#ifndef HAVE_AVX512_ASM_SUPPORT +#ifndef HAVE_AVX512DQ_ASM_SUPPORT WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf #else /* @@ -455,7 +455,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf vzeroupper vmovss 1156(%rsp,%r15,8), %xmm0 - call sinf@PLT + call JUMPTARGET(sinf) vmovss %xmm0, 1220(%rsp,%r15,8) jmp .LBL_2_8 @@ -466,7 +466,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf vzeroupper vmovss 1152(%rsp,%r15,8), %xmm0 - call sinf@PLT + call JUMPTARGET(sinf) vmovss %xmm0, 1216(%rsp,%r15,8) jmp .LBL_2_7 diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core-sse2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core-sse2.S new file mode 100644 index 0000000000..1d2e65c39d --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core-sse2.S @@ -0,0 +1,20 @@ +/* SSE2 version of vectorized sinf. + Copyright (C) 2014-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define _ZGVbN4v_sinf _ZGVbN4v_sinf_sse2 +#include "../svml_s_sinf4_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core.S deleted file mode 100644 index b31554730d..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core.S +++ /dev/null @@ -1,36 +0,0 @@ -/* Multiple versions of vectorized sinf. - Copyright (C) 2014-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVbN4v_sinf) - .type _ZGVbN4v_sinf, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVbN4v_sinf_sse4(%rip), %rax - HAS_CPU_FEATURE (SSE4_1) - jz 2f - ret -2: leaq _ZGVbN4v_sinf_sse2(%rip), %rax - ret -END (_ZGVbN4v_sinf) -libmvec_hidden_def (_ZGVbN4v_sinf) - -#define _ZGVbN4v_sinf _ZGVbN4v_sinf_sse2 -#include "../svml_s_sinf4_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core.c b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core.c new file mode 100644 index 0000000000..93b8bfebbf --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core.c @@ -0,0 +1,28 @@ +/* Multiple versions of vectorized sinf, vector length is 4. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define SYMBOL_NAME _ZGVbN4v_sinf +#include "ifunc-mathvec-sse4_1.h" + +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ()); + +#ifdef SHARED +__hidden_ver1 (_ZGVbN4v_sinf, __GI__ZGVbN4v_sinf, + __redirect__ZGVbN4v_sinf) + __attribute__ ((visibility ("hidden"))); +#endif diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core_sse4.S index 5268ab1f09..39a4c92235 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core_sse4.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core_sse4.S @@ -1,5 +1,5 @@ /* Function sinf vectorized with SSE4. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -207,7 +207,7 @@ ENTRY(_ZGVbN4v_sinf_sse4) movzbl %r12b, %r15d movss 196(%rsp,%r15,8), %xmm0 - call sinf@PLT + call JUMPTARGET(sinf) movss %xmm0, 260(%rsp,%r15,8) jmp .LBL_1_8 @@ -216,7 +216,7 @@ ENTRY(_ZGVbN4v_sinf_sse4) movzbl %r12b, %r15d movss 192(%rsp,%r15,8), %xmm0 - call sinf@PLT + call JUMPTARGET(sinf) movss %xmm0, 256(%rsp,%r15,8) jmp .LBL_1_7 diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core-sse.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core-sse.S new file mode 100644 index 0000000000..f2af3a0b4b --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core-sse.S @@ -0,0 +1,20 @@ +/* SSE version of vectorized sinf, vector length is 8. + Copyright (C) 2014-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define _ZGVdN8v_sinf _ZGVdN8v_sinf_sse_wrapper +#include "../svml_s_sinf8_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core.S deleted file mode 100644 index 47fe0a4adc..0000000000 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core.S +++ /dev/null @@ -1,36 +0,0 @@ -/* Multiple versions of vectorized sinf, vector length is 8. - Copyright (C) 2014-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVdN8v_sinf) - .type _ZGVdN8v_sinf, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX -1: leaq _ZGVdN8v_sinf_avx2(%rip), %rax - HAS_ARCH_FEATURE (AVX2_Usable) - jz 2f - ret -2: leaq _ZGVdN8v_sinf_sse_wrapper(%rip), %rax - ret -END (_ZGVdN8v_sinf) -libmvec_hidden_def (_ZGVdN8v_sinf) - -#define _ZGVdN8v_sinf _ZGVdN8v_sinf_sse_wrapper -#include "../svml_s_sinf8_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core.c b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core.c new file mode 100644 index 0000000000..cf13b6647c --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core.c @@ -0,0 +1,28 @@ +/* Multiple versions of vectorized sinf, vector length is 8. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define SYMBOL_NAME _ZGVdN8v_sinf +#include "ifunc-mathvec-avx2.h" + +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ()); + +#ifdef SHARED +__hidden_ver1 (_ZGVdN8v_sinf, __GI__ZGVdN8v_sinf, + __redirect__ZGVdN8v_sinf) + __attribute__ ((visibility ("hidden"))); +#endif diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core_avx2.S index 9fdaadb2e8..5f7a95e9ad 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core_avx2.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core_avx2.S @@ -1,5 +1,5 @@ /* Function sinf vectorized with AVX2. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -201,7 +201,7 @@ ENTRY(_ZGVdN8v_sinf_avx2) vmovss 324(%rsp,%r15,8), %xmm0 vzeroupper - call sinf@PLT + call JUMPTARGET(sinf) vmovss %xmm0, 388(%rsp,%r15,8) jmp .LBL_1_8 @@ -211,7 +211,7 @@ ENTRY(_ZGVdN8v_sinf_avx2) vmovss 320(%rsp,%r15,8), %xmm0 vzeroupper - call sinf@PLT + call JUMPTARGET(sinf) vmovss %xmm0, 384(%rsp,%r15,8) jmp .LBL_1_7 diff --git a/sysdeps/x86_64/fpu/printf_fphex.c b/sysdeps/x86_64/fpu/printf_fphex.c index 0fbaa3748e..62efed10da 100644 --- a/sysdeps/x86_64/fpu/printf_fphex.c +++ b/sysdeps/x86_64/fpu/printf_fphex.c @@ -1,5 +1,5 @@ /* Print floating point number in hexadecimal notation according to ISO C99. - Copyright (C) 1997-2016 Free Software Foundation, Inc. + Copyright (C) 1997-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/s_ceill.S b/sysdeps/x86_64/fpu/s_ceill.S index 910c371d58..8f2bd351f6 100644 --- a/sysdeps/x86_64/fpu/s_ceill.S +++ b/sysdeps/x86_64/fpu/s_ceill.S @@ -5,27 +5,33 @@ * Public domain. */ +#include <libm-alias-ldouble.h> #include <machine/asm.h> ENTRY(__ceill) fldt 8(%rsp) - fstcw -4(%rsp) /* store fpu control word */ + fnstenv -28(%rsp) /* store fpu environment */ /* We use here %edx although only the low 1 bits are defined. But none of the operations should care and they are faster than the 16 bit operations. */ movl $0x0800,%edx /* round towards +oo */ - orl -4(%rsp),%edx + orl -28(%rsp),%edx andl $0xfbff,%edx - movl %edx,-8(%rsp) - fldcw -8(%rsp) /* load modified control word */ + movl %edx,-32(%rsp) + fldcw -32(%rsp) /* load modified control word */ frndint /* round */ - fldcw -4(%rsp) /* restore original control word */ + /* Preserve "invalid" exceptions from sNaN input. */ + fnstsw + andl $0x1, %eax + orl %eax, -24(%rsp) + + fldenv -28(%rsp) /* restore original environment */ ret END (__ceill) -weak_alias (__ceill, ceill) +libm_alias_ldouble (__ceil, ceil) diff --git a/sysdeps/x86_64/fpu/s_copysign.S b/sysdeps/x86_64/fpu/s_copysign.S index 18f568f46f..e2921ce770 100644 --- a/sysdeps/x86_64/fpu/s_copysign.S +++ b/sysdeps/x86_64/fpu/s_copysign.S @@ -1,5 +1,5 @@ /* copy sign, double version. - Copyright (C) 2002-2016 Free Software Foundation, Inc. + Copyright (C) 2002-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Andreas Jaeger <aj@suse.de>, 2002. @@ -18,6 +18,7 @@ <http://www.gnu.org/licenses/>. */ #include <machine/asm.h> +#include <libm-alias-double.h> .section .rodata.cst16,"aM",@progbits,16 @@ -47,4 +48,4 @@ ENTRY(__copysign) ret END (__copysign) -weak_alias (__copysign, copysign) +libm_alias_double (__copysign, copysign) diff --git a/sysdeps/x86_64/fpu/s_copysignf.S b/sysdeps/x86_64/fpu/s_copysignf.S index 00a1fabaee..4093e781fe 100644 --- a/sysdeps/x86_64/fpu/s_copysignf.S +++ b/sysdeps/x86_64/fpu/s_copysignf.S @@ -1,5 +1,5 @@ /* copy sign, double version. - Copyright (C) 2002-2016 Free Software Foundation, Inc. + Copyright (C) 2002-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Andreas Jaeger <aj@suse.de>, 2002. @@ -18,6 +18,7 @@ <http://www.gnu.org/licenses/>. */ #include <machine/asm.h> +#include <libm-alias-float.h> .section .rodata @@ -42,4 +43,4 @@ ENTRY(__copysignf) retq END (__copysignf) -weak_alias (__copysignf, copysignf) +libm_alias_float (__copysign, copysign) diff --git a/sysdeps/x86_64/fpu/s_copysignl.S b/sysdeps/x86_64/fpu/s_copysignl.S index 2ffd612d65..8616205d38 100644 --- a/sysdeps/x86_64/fpu/s_copysignl.S +++ b/sysdeps/x86_64/fpu/s_copysignl.S @@ -5,6 +5,7 @@ * Public domain. */ +#include <libm-alias-ldouble.h> #include <machine/asm.h> RCSID("$NetBSD: $") @@ -19,4 +20,4 @@ ENTRY(__copysignl) fldt 8(%rsp) ret END (__copysignl) -weak_alias (__copysignl, copysignl) +libm_alias_ldouble (__copysign, copysign) diff --git a/sysdeps/x86_64/fpu/s_cosf.S b/sysdeps/x86_64/fpu/s_cosf.S deleted file mode 100644 index 31968e498f..0000000000 --- a/sysdeps/x86_64/fpu/s_cosf.S +++ /dev/null @@ -1,533 +0,0 @@ -/* Optimized cosf function. - Copyright (C) 2012-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#define __need_Emath -#include <bits/errno.h> - -/* Short algorithm description: - * - * 1) if |x| == 0: return 1.0-|x|. - * 2) if |x| < 2^-27: return 1.0-|x|. - * 3) if |x| < 2^-5 : return 1.0+x^2*DP_COS2_0+x^5*DP_COS2_1. - * 4) if |x| < Pi/4: return 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))). - * 5) if |x| < 9*Pi/4: - * 5.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0x0e, n=k+3, - * t=|x|-j*Pi/4. - * 5.2) Reconstruction: - * s = (-1.0)^((n>>2)&1) - * if(n&2 != 0) { - * using cos(t) polynomial for |t|<Pi/4, result is - * s * (1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4))))). - * } else { - * using sin(t) polynomial for |t|<Pi/4, result is - * s * t * (1.0+t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4))))). - * } - * 6) if |x| < 2^23, large args: - * 6.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+3, - * t=|x|-j*Pi/4. - * 6.2) Reconstruction same as (5.2). - * 7) if |x| >= 2^23, very large args: - * 7.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+3, - * t=|x|-j*Pi/4. - * 7.2) Reconstruction same as (5.2). - * 8) if x is Inf, return x-x, and set errno=EDOM. - * 9) if x is NaN, return x-x. - * - * Special cases: - * cos(+-0) = 1 not raising inexact, - * cos(subnormal) raises inexact, - * cos(min_normalized) raises inexact, - * cos(normalized) raises inexact, - * cos(Inf) = NaN, raises invalid, sets errno to EDOM, - * cos(NaN) = NaN. - */ - - .text -ENTRY(__cosf) - /* Input: single precision x in %xmm0 */ - - movd %xmm0, %eax /* Bits of x */ - movaps %xmm0, %xmm7 /* Copy of x */ - cvtss2sd %xmm0, %xmm0 /* DP x */ - movss L(SP_ABS_MASK)(%rip), %xmm3 - andl $0x7fffffff, %eax /* |x| */ - - cmpl $0x3f490fdb, %eax /* |x|<Pi/4? */ - jb L(arg_less_pio4) - - /* Here if |x|>=Pi/4 */ - andps %xmm7, %xmm3 /* SP |x| */ - andpd L(DP_ABS_MASK)(%rip), %xmm0 /* DP |x| */ - movss L(SP_INVPIO4)(%rip), %xmm2 /* SP 1/(Pi/4) */ - - cmpl $0x40e231d6, %eax /* |x|<9*Pi/4? */ - jae L(large_args) - - /* Here if Pi/4<=|x|<9*Pi/4 */ - mulss %xmm3, %xmm2 /* SP |x|/(Pi/4) */ - cvttss2si %xmm2, %eax /* k, number of Pi/4 in x */ - lea L(PIO4J)(%rip), %rsi - addl $1, %eax /* k+1 */ - movl $0x0e, %edx - andl %eax, %edx /* j = (k+1)&0x0e */ - addl $2, %eax /* n */ - subsd (%rsi,%rdx,8), %xmm0 /* t = |x| - j * Pi/4 */ - -L(reconstruction): - /* Input: %eax=n, %xmm0=t */ - testl $2, %eax /* n&2 != 0? */ - jz L(sin_poly) - -/*L(cos_poly):*/ - /* Here if cos(x) calculated using cos(t) polynomial for |t|<Pi/4: - * y = t*t; z = y*y; - * s = sign(x) * (-1.0)^((n>>2)&1) - * result = s * (1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4))))) - */ - shrl $2, %eax /* n>>2 */ - mulsd %xmm0, %xmm0 /* y=t^2 */ - andl $1, %eax /* (n>>2)&1 */ - movaps %xmm0, %xmm1 /* y */ - mulsd %xmm0, %xmm0 /* z=t^4 */ - - movsd L(DP_C4)(%rip), %xmm4 /* C4 */ - mulsd %xmm0, %xmm4 /* z*C4 */ - movsd L(DP_C3)(%rip), %xmm3 /* C3 */ - mulsd %xmm0, %xmm3 /* z*C3 */ - lea L(DP_ONES)(%rip), %rsi - addsd L(DP_C2)(%rip), %xmm4 /* C2+z*C4 */ - mulsd %xmm0, %xmm4 /* z*(C2+z*C4) */ - addsd L(DP_C1)(%rip), %xmm3 /* C1+z*C3 */ - mulsd %xmm0, %xmm3 /* z*(C1+z*C3) */ - addsd L(DP_C0)(%rip), %xmm4 /* C0+z*(C2+z*C4) */ - mulsd %xmm1, %xmm4 /* y*(C0+z*(C2+z*C4)) */ - - addsd %xmm4, %xmm3 /* y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */ - /* 1.0+y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */ - addsd L(DP_ONES)(%rip), %xmm3 - - mulsd (%rsi,%rax,8), %xmm3 /* DP result */ - cvtsd2ss %xmm3, %xmm0 /* SP result */ - ret - - .p2align 4 -L(sin_poly): - /* Here if cos(x) calculated using sin(t) polynomial for |t|<Pi/4: - * y = t*t; z = y*y; - * s = sign(x) * (-1.0)^((n>>2)&1) - * result = s * t * (1.0+t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4))))) - */ - - movaps %xmm0, %xmm4 /* t */ - shrl $2, %eax /* n>>2 */ - mulsd %xmm0, %xmm0 /* y=t^2 */ - andl $1, %eax /* (n>>2)&1 */ - movaps %xmm0, %xmm1 /* y */ - mulsd %xmm0, %xmm0 /* z=t^4 */ - - movsd L(DP_S4)(%rip), %xmm2 /* S4 */ - mulsd %xmm0, %xmm2 /* z*S4 */ - movsd L(DP_S3)(%rip), %xmm3 /* S3 */ - mulsd %xmm0, %xmm3 /* z*S3 */ - lea L(DP_ONES)(%rip), %rsi - addsd L(DP_S2)(%rip), %xmm2 /* S2+z*S4 */ - mulsd %xmm0, %xmm2 /* z*(S2+z*S4) */ - addsd L(DP_S1)(%rip), %xmm3 /* S1+z*S3 */ - mulsd %xmm0, %xmm3 /* z*(S1+z*S3) */ - addsd L(DP_S0)(%rip), %xmm2 /* S0+z*(S2+z*S4) */ - mulsd %xmm1, %xmm2 /* y*(S0+z*(S2+z*S4)) */ - /* t*s, where s = sign(x) * (-1.0)^((n>>2)&1) */ - mulsd (%rsi,%rax,8), %xmm4 - /* y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */ - addsd %xmm2, %xmm3 - /* t*s*y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */ - mulsd %xmm4, %xmm3 - /* t*s*(1.0+y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */ - addsd %xmm4, %xmm3 - cvtsd2ss %xmm3, %xmm0 /* SP result */ - ret - - .p2align 4 -L(large_args): - /* Here if |x|>=9*Pi/4 */ - cmpl $0x7f800000, %eax /* x is Inf or NaN? */ - jae L(arg_inf_or_nan) - - /* Here if finite |x|>=9*Pi/4 */ - cmpl $0x4b000000, %eax /* |x|<2^23? */ - jae L(very_large_args) - - /* Here if 9*Pi/4<=|x|<2^23 */ - movsd L(DP_INVPIO4)(%rip), %xmm1 /* 1/(Pi/4) */ - mulsd %xmm0, %xmm1 /* |x|/(Pi/4) */ - cvttsd2si %xmm1, %eax /* k=trunc(|x|/(Pi/4)) */ - addl $1, %eax /* k+1 */ - movl %eax, %edx - andl $0xfffffffe, %edx /* j=(k+1)&0xfffffffe */ - cvtsi2sdl %edx, %xmm4 /* DP j */ - movsd L(DP_PIO4HI)(%rip), %xmm2 /* -PIO4HI = high part of -Pi/4 */ - mulsd %xmm4, %xmm2 /* -j*PIO4HI */ - movsd L(DP_PIO4LO)(%rip), %xmm3 /* -PIO4LO = low part of -Pi/4 */ - addsd %xmm2, %xmm0 /* |x| - j*PIO4HI */ - addl $2, %eax /* n */ - mulsd %xmm3, %xmm4 /* j*PIO4LO */ - addsd %xmm4, %xmm0 /* t = |x| - j*PIO4HI - j*PIO4LO */ - jmp L(reconstruction) - - .p2align 4 -L(very_large_args): - /* Here if finite |x|>=2^23 */ - - /* bitpos = (ix>>23) - BIAS_32 + 59; */ - shrl $23, %eax /* eb = biased exponent of x */ - /* bitpos = eb - 0x7f + 59, where 0x7f is exponent bias */ - subl $68, %eax - movl $28, %ecx /* %cl=28 */ - movl %eax, %edx /* bitpos copy */ - - /* j = bitpos/28; */ - div %cl /* j in register %al=%ax/%cl */ - movapd %xmm0, %xmm3 /* |x| */ - /* clear unneeded remainder from %ah */ - andl $0xff, %eax - - imull $28, %eax, %ecx /* j*28 */ - lea L(_FPI)(%rip), %rsi - movsd L(DP_HI_MASK)(%rip), %xmm4 /* DP_HI_MASK */ - movapd %xmm0, %xmm5 /* |x| */ - mulsd -16(%rsi,%rax,8), %xmm3 /* tmp3 = FPI[j-2]*|x| */ - movapd %xmm0, %xmm1 /* |x| */ - mulsd -8(%rsi,%rax,8), %xmm5 /* tmp2 = FPI[j-1]*|x| */ - mulsd (%rsi,%rax,8), %xmm0 /* tmp0 = FPI[j]*|x| */ - addl $19, %ecx /* j*28+19 */ - mulsd 8(%rsi,%rax,8), %xmm1 /* tmp1 = FPI[j+1]*|x| */ - cmpl %ecx, %edx /* bitpos>=j*28+19? */ - jl L(very_large_skip1) - - /* Here if bitpos>=j*28+19 */ - andpd %xmm3, %xmm4 /* HI(tmp3) */ - subsd %xmm4, %xmm3 /* tmp3 = tmp3 - HI(tmp3) */ -L(very_large_skip1): - - movsd L(DP_2POW52)(%rip), %xmm6 - movapd %xmm5, %xmm2 /* tmp2 copy */ - addsd %xmm3, %xmm5 /* tmp5 = tmp3 + tmp2 */ - movl $1, %edx - addsd %xmm5, %xmm6 /* tmp6 = tmp5 + 2^52 */ - movsd 8+L(DP_2POW52)(%rip), %xmm4 - movd %xmm6, %eax /* k = I64_LO(tmp6); */ - addsd %xmm6, %xmm4 /* tmp4 = tmp6 - 2^52 */ - comisd %xmm5, %xmm4 /* tmp4 > tmp5? */ - jbe L(very_large_skip2) - - /* Here if tmp4 > tmp5 */ - subl $1, %eax /* k-- */ - addsd 8+L(DP_ONES)(%rip), %xmm4 /* tmp4 -= 1.0 */ -L(very_large_skip2): - - andl %eax, %edx /* k&1 */ - lea L(DP_ZERONE)(%rip), %rsi - subsd %xmm4, %xmm3 /* tmp3 -= tmp4 */ - addsd (%rsi,%rdx,8), %xmm3 /* t = DP_ZERONE[k&1] + tmp3 */ - addsd %xmm2, %xmm3 /* t += tmp2 */ - addsd %xmm3, %xmm0 /* t += tmp0 */ - addl $3, %eax /* n=k+3 */ - addsd %xmm1, %xmm0 /* t += tmp1 */ - mulsd L(DP_PIO4)(%rip), %xmm0 /* t *= PI04 */ - - jmp L(reconstruction) /* end of very_large_args peth */ - - .p2align 4 -L(arg_less_pio4): - /* Here if |x|<Pi/4 */ - cmpl $0x3d000000, %eax /* |x|<2^-5? */ - jl L(arg_less_2pn5) - - /* Here if 2^-5<=|x|<Pi/4 */ - mulsd %xmm0, %xmm0 /* y=x^2 */ - movaps %xmm0, %xmm1 /* y */ - mulsd %xmm0, %xmm0 /* z=x^4 */ - movsd L(DP_C4)(%rip), %xmm3 /* C4 */ - mulsd %xmm0, %xmm3 /* z*C4 */ - movsd L(DP_C3)(%rip), %xmm5 /* C3 */ - mulsd %xmm0, %xmm5 /* z*C3 */ - addsd L(DP_C2)(%rip), %xmm3 /* C2+z*C4 */ - mulsd %xmm0, %xmm3 /* z*(C2+z*C4) */ - addsd L(DP_C1)(%rip), %xmm5 /* C1+z*C3 */ - mulsd %xmm0, %xmm5 /* z*(C1+z*C3) */ - addsd L(DP_C0)(%rip), %xmm3 /* C0+z*(C2+z*C4) */ - mulsd %xmm1, %xmm3 /* y*(C0+z*(C2+z*C4)) */ - /* y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */ - addsd %xmm5, %xmm3 - /* 1.0 + y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */ - addsd L(DP_ONES)(%rip), %xmm3 - cvtsd2ss %xmm3, %xmm0 /* SP result */ - ret - - .p2align 4 -L(arg_less_2pn5): - /* Here if |x|<2^-5 */ - cmpl $0x32000000, %eax /* |x|<2^-27? */ - jl L(arg_less_2pn27) - - /* Here if 2^-27<=|x|<2^-5 */ - mulsd %xmm0, %xmm0 /* DP x^2 */ - movsd L(DP_COS2_1)(%rip), %xmm3 /* DP DP_COS2_1 */ - mulsd %xmm0, %xmm3 /* DP x^2*DP_COS2_1 */ - addsd L(DP_COS2_0)(%rip), %xmm3 /* DP DP_COS2_0+x^2*DP_COS2_1 */ - mulsd %xmm0, %xmm3 /* DP x^2*DP_COS2_0+x^4*DP_COS2_1 */ - /* DP 1.0+x^2*DP_COS2_0+x^4*DP_COS2_1 */ - addsd L(DP_ONES)(%rip), %xmm3 - cvtsd2ss %xmm3, %xmm0 /* SP result */ - ret - - .p2align 4 -L(arg_less_2pn27): - /* Here if |x|<2^-27 */ - andps L(SP_ABS_MASK)(%rip),%xmm7 /* |x| */ - movss L(SP_ONE)(%rip), %xmm0 /* 1.0 */ - subss %xmm7, %xmm0 /* result is 1.0-|x| */ - ret - - .p2align 4 -L(arg_inf_or_nan): - /* Here if |x| is Inf or NAN */ - jne L(skip_errno_setting) /* in case of x is NaN */ - - /* Align stack to 16 bytes. */ - subq $8, %rsp - cfi_adjust_cfa_offset (8) - /* Here if x is Inf. Set errno to EDOM. */ - call JUMPTARGET(__errno_location) - addq $8, %rsp - cfi_adjust_cfa_offset (-8) - - movl $EDOM, (%rax) - - .p2align 4 -L(skip_errno_setting): - /* Here if |x| is Inf or NAN. Continued. */ - movaps %xmm7, %xmm0 /* load x */ - subss %xmm0, %xmm0 /* Result is NaN */ - ret -END(__cosf) - - .section .rodata, "a" - .p2align 3 -L(PIO4J): /* Table of j*Pi/4, for j=0,1,..,10 */ - .long 0x00000000,0x00000000 - .long 0x54442d18,0x3fe921fb - .long 0x54442d18,0x3ff921fb - .long 0x7f3321d2,0x4002d97c - .long 0x54442d18,0x400921fb - .long 0x2955385e,0x400f6a7a - .long 0x7f3321d2,0x4012d97c - .long 0xe9bba775,0x4015fdbb - .long 0x54442d18,0x401921fb - .long 0xbeccb2bb,0x401c463a - .long 0x2955385e,0x401f6a7a - .type L(PIO4J), @object - ASM_SIZE_DIRECTIVE(L(PIO4J)) - - .p2align 3 -L(_FPI): /* 4/Pi broken into sum of positive DP values */ - .long 0x00000000,0x00000000 - .long 0x6c000000,0x3ff45f30 - .long 0x2a000000,0x3e3c9c88 - .long 0xa8000000,0x3c54fe13 - .long 0xd0000000,0x3aaf47d4 - .long 0x6c000000,0x38fbb81b - .long 0xe0000000,0x3714acc9 - .long 0x7c000000,0x3560e410 - .long 0x56000000,0x33bca2c7 - .long 0xac000000,0x31fbd778 - .long 0xe0000000,0x300b7246 - .long 0xe8000000,0x2e5d2126 - .long 0x48000000,0x2c970032 - .long 0xe8000000,0x2ad77504 - .long 0xe0000000,0x290921cf - .long 0xb0000000,0x274deb1c - .long 0xe0000000,0x25829a73 - .long 0xbe000000,0x23fd1046 - .long 0x10000000,0x2224baed - .long 0x8e000000,0x20709d33 - .long 0x80000000,0x1e535a2f - .long 0x64000000,0x1cef904e - .long 0x30000000,0x1b0d6398 - .long 0x24000000,0x1964ce7d - .long 0x16000000,0x17b908bf - .type L(_FPI), @object - ASM_SIZE_DIRECTIVE(L(_FPI)) - -/* Coefficients of polynomial - for cos(x)~=1.0+x^2*DP_COS2_0+x^4*DP_COS2_1, |x|<2^-5. */ - .p2align 3 -L(DP_COS2_0): - .long 0xff5cc6fd,0xbfdfffff - .type L(DP_COS2_0), @object - ASM_SIZE_DIRECTIVE(L(DP_COS2_0)) - - .p2align 3 -L(DP_COS2_1): - .long 0xb178dac5,0x3fa55514 - .type L(DP_COS2_1), @object - ASM_SIZE_DIRECTIVE(L(DP_COS2_1)) - - .p2align 3 -L(DP_ZERONE): - .long 0x00000000,0x00000000 /* 0.0 */ - .long 0x00000000,0xbff00000 /* 1.0 */ - .type L(DP_ZERONE), @object - ASM_SIZE_DIRECTIVE(L(DP_ZERONE)) - - .p2align 3 -L(DP_ONES): - .long 0x00000000,0x3ff00000 /* +1.0 */ - .long 0x00000000,0xbff00000 /* -1.0 */ - .type L(DP_ONES), @object - ASM_SIZE_DIRECTIVE(L(DP_ONES)) - -/* Coefficients of polynomial - for sin(t)~=t+t^3*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4)))), |t|<Pi/4. */ - .p2align 3 -L(DP_S3): - .long 0x64e6b5b4,0x3ec71d72 - .type L(DP_S3), @object - ASM_SIZE_DIRECTIVE(L(DP_S3)) - - .p2align 3 -L(DP_S1): - .long 0x10c2688b,0x3f811111 - .type L(DP_S1), @object - ASM_SIZE_DIRECTIVE(L(DP_S1)) - - .p2align 3 -L(DP_S4): - .long 0x1674b58a,0xbe5a947e - .type L(DP_S4), @object - ASM_SIZE_DIRECTIVE(L(DP_S4)) - - .p2align 3 -L(DP_S2): - .long 0x8b4bd1f9,0xbf2a019f - .type L(DP_S2),@object - ASM_SIZE_DIRECTIVE(L(DP_S2)) - - .p2align 3 -L(DP_S0): - .long 0x55551cd9,0xbfc55555 - .type L(DP_S0), @object - ASM_SIZE_DIRECTIVE(L(DP_S0)) - -/* Coefficients of polynomial - for cos(t)~=1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4)))), |t|<Pi/4. */ - .p2align 3 -L(DP_C3): - .long 0x9ac43cc0,0x3efa00eb - .type L(DP_C3), @object - ASM_SIZE_DIRECTIVE(L(DP_C3)) - - .p2align 3 -L(DP_C1): - .long 0x545c50c7,0x3fa55555 - .type L(DP_C1), @object - ASM_SIZE_DIRECTIVE(L(DP_C1)) - - .p2align 3 -L(DP_C4): - .long 0xdd8844d7,0xbe923c97 - .type L(DP_C4), @object - ASM_SIZE_DIRECTIVE(L(DP_C4)) - - .p2align 3 -L(DP_C2): - .long 0x348b6874,0xbf56c16b - .type L(DP_C2), @object - ASM_SIZE_DIRECTIVE(L(DP_C2)) - - .p2align 3 -L(DP_C0): - .long 0xfffe98ae,0xbfdfffff - .type L(DP_C0), @object - ASM_SIZE_DIRECTIVE(L(DP_C0)) - - .p2align 3 -L(DP_PIO4): - .long 0x54442d18,0x3fe921fb /* Pi/4 */ - .type L(DP_PIO4), @object - ASM_SIZE_DIRECTIVE(L(DP_PIO4)) - - .p2align 3 -L(DP_2POW52): - .long 0x00000000,0x43300000 /* +2^52 */ - .long 0x00000000,0xc3300000 /* -2^52 */ - .type L(DP_2POW52), @object - ASM_SIZE_DIRECTIVE(L(DP_2POW52)) - - .p2align 3 -L(DP_INVPIO4): - .long 0x6dc9c883,0x3ff45f30 /* 4/Pi */ - .type L(DP_INVPIO4), @object - ASM_SIZE_DIRECTIVE(L(DP_INVPIO4)) - - .p2align 3 -L(DP_PIO4HI): - .long 0x54000000,0xbfe921fb /* High part of Pi/4 */ - .type L(DP_PIO4HI), @object - ASM_SIZE_DIRECTIVE(L(DP_PIO4HI)) - - .p2align 3 -L(DP_PIO4LO): - .long 0x11A62633,0xbe010b46 /* Low part of Pi/4 */ - .type L(DP_PIO4LO), @object - ASM_SIZE_DIRECTIVE(L(DP_PIO4LO)) - - .p2align 2 -L(SP_INVPIO4): - .long 0x3fa2f983 /* 4/Pi */ - .type L(SP_INVPIO4), @object - ASM_SIZE_DIRECTIVE(L(SP_INVPIO4)) - - .p2align 4 -L(DP_ABS_MASK): /* Mask for getting DP absolute value */ - .long 0xffffffff,0x7fffffff - .long 0xffffffff,0x7fffffff - .type L(DP_ABS_MASK), @object - ASM_SIZE_DIRECTIVE(L(DP_ABS_MASK)) - - .p2align 3 -L(DP_HI_MASK): /* Mask for getting high 21 bits of DP value */ - .long 0x00000000,0xffffffff - .type L(DP_HI_MASK), @object - ASM_SIZE_DIRECTIVE(L(DP_HI_MASK)) - - .p2align 4 -L(SP_ABS_MASK): /* Mask for getting SP absolute value */ - .long 0x7fffffff,0x7fffffff - .long 0x7fffffff,0x7fffffff - .type L(SP_ABS_MASK), @object - ASM_SIZE_DIRECTIVE(L(SP_ABS_MASK)) - - .p2align 2 -L(SP_ONE): - .long 0x3f800000 /* 1.0 */ - .type L(SP_ONE), @object - ASM_SIZE_DIRECTIVE(L(SP_ONE)) - -weak_alias(__cosf, cosf) diff --git a/sysdeps/x86_64/fpu/s_fabs.c b/sysdeps/x86_64/fpu/s_fabs.c index d3a313fdf5..d1e17878d4 100644 --- a/sysdeps/x86_64/fpu/s_fabs.c +++ b/sysdeps/x86_64/fpu/s_fabs.c @@ -1,5 +1,5 @@ /* Absolute value of floating point number. - Copyright (C) 2002-2016 Free Software Foundation, Inc. + Copyright (C) 2002-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -17,10 +17,11 @@ <http://www.gnu.org/licenses/>. */ #include <math.h> +#include <libm-alias-double.h> double __fabs (double x) { return __builtin_fabs (x); } -weak_alias (__fabs, fabs) +libm_alias_double (__fabs, fabs) diff --git a/sysdeps/x86_64/fpu/s_fabsf.c b/sysdeps/x86_64/fpu/s_fabsf.c index e6dcda9433..2f39228560 100644 --- a/sysdeps/x86_64/fpu/s_fabsf.c +++ b/sysdeps/x86_64/fpu/s_fabsf.c @@ -1,5 +1,5 @@ /* Absolute value of floating point number. - Copyright (C) 2002-2016 Free Software Foundation, Inc. + Copyright (C) 2002-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -17,10 +17,11 @@ <http://www.gnu.org/licenses/>. */ #include <math.h> +#include <libm-alias-float.h> float __fabsf (float x) { return __builtin_fabsf (x); } -weak_alias (__fabsf, fabsf) +libm_alias_float (__fabs, fabs) diff --git a/sysdeps/x86_64/fpu/s_fabsl.S b/sysdeps/x86_64/fpu/s_fabsl.S index 6881ff11c7..7f03ecdccb 100644 --- a/sysdeps/x86_64/fpu/s_fabsl.S +++ b/sysdeps/x86_64/fpu/s_fabsl.S @@ -1,5 +1,5 @@ /* Absolute value of floating point number. - Copyright (C) 2002-2016 Free Software Foundation, Inc. + Copyright (C) 2002-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -17,6 +17,7 @@ <http://www.gnu.org/licenses/>. */ #include <sysdep.h> +#include <libm-alias-ldouble.h> .text ENTRY(__fabsl) @@ -24,4 +25,4 @@ ENTRY(__fabsl) fabs ret END(__fabsl) -weak_alias (__fabsl, fabsl) +libm_alias_ldouble (__fabs, fabs) diff --git a/sysdeps/x86_64/fpu/s_floorl.S b/sysdeps/x86_64/fpu/s_floorl.S index f9ecc388df..75f8255648 100644 --- a/sysdeps/x86_64/fpu/s_floorl.S +++ b/sysdeps/x86_64/fpu/s_floorl.S @@ -5,26 +5,32 @@ * Public domain. */ +#include <libm-alias-ldouble.h> #include <machine/asm.h> ENTRY(__floorl) fldt 8(%rsp) - fstcw -4(%rsp) /* store fpu control word */ + fnstenv -28(%rsp) /* store fpu environment */ /* We use here %edx although only the low 1 bits are defined. But none of the operations should care and they are faster than the 16 bit operations. */ movl $0x400,%edx /* round towards -oo */ - orl -4(%rsp),%edx + orl -28(%rsp),%edx andl $0xf7ff,%edx - movl %edx,-8(%rsp) - fldcw -8(%rsp) /* load modified control word */ + movl %edx,-32(%rsp) + fldcw -32(%rsp) /* load modified control word */ frndint /* round */ - fldcw -4(%rsp) /* restore original control word */ + /* Preserve "invalid" exceptions from sNaN input. */ + fnstsw + andl $0x1, %eax + orl %eax, -24(%rsp) + + fldenv -28(%rsp) /* restore original environment */ ret END (__floorl) -weak_alias (__floorl, floorl) +libm_alias_ldouble (__floor, floor) diff --git a/sysdeps/x86_64/fpu/s_fmax.S b/sysdeps/x86_64/fpu/s_fmax.S index 02096c0aea..7cd8f1ed10 100644 --- a/sysdeps/x86_64/fpu/s_fmax.S +++ b/sysdeps/x86_64/fpu/s_fmax.S @@ -1,5 +1,5 @@ /* Compute maximum of two numbers, regarding NaN as missing argument. - Copyright (C) 2002-2016 Free Software Foundation, Inc. + Copyright (C) 2002-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Andreas Jaeger <aj@suse.de>, 2002. @@ -18,6 +18,7 @@ <http://www.gnu.org/licenses/>. */ #include <sysdep.h> +#include <libm-alias-double.h> .text ENTRY(__fmax) @@ -27,9 +28,26 @@ ENTRY(__fmax) jmp 2f 1: ucomisd %xmm1, %xmm1 // Is xmm1 a NaN? - jp 2f // then return xmm0 + jp 3f + // xmm0 is a NaN; xmm1 is not. Test if xmm0 is signaling. + movsd %xmm0, -8(%rsp) + testb $0x8, -2(%rsp) + jz 4f movsd %xmm1, %xmm0 // otherwise return xmm1 + ret + +3: // xmm1 is a NaN; xmm0 may or may not be. + ucomisd %xmm0, %xmm0 + jp 4f + // xmm1 is a NaN; xmm0 is not. Test if xmm1 is signaling. + movsd %xmm1, -8(%rsp) + testb $0x8, -2(%rsp) + jz 4f + ret + +4: // Both arguments are NaNs, or one is a signaling NaN. + addsd %xmm1, %xmm0 2: ret END(__fmax) -weak_alias (__fmax, fmax) +libm_alias_double (__fmax, fmax) diff --git a/sysdeps/x86_64/fpu/s_fmaxf.S b/sysdeps/x86_64/fpu/s_fmaxf.S index 28e129701e..9b932fddc2 100644 --- a/sysdeps/x86_64/fpu/s_fmaxf.S +++ b/sysdeps/x86_64/fpu/s_fmaxf.S @@ -1,5 +1,5 @@ /* Compute maximum of two numbers, regarding NaN as missing argument. - Copyright (C) 2002-2016 Free Software Foundation, Inc. + Copyright (C) 2002-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Andreas Jaeger <aj@suse.de>, 2002. @@ -18,6 +18,7 @@ <http://www.gnu.org/licenses/>. */ #include <sysdep.h> +#include <libm-alias-float.h> .text ENTRY(__fmaxf) @@ -27,9 +28,26 @@ ENTRY(__fmaxf) jmp 2f 1: ucomiss %xmm1, %xmm1 // Is xmm1 a NaN? - jp 2f // then return xmm0 + jp 3f + // xmm0 is a NaN; xmm1 is not. Test if xmm0 is signaling. + movss %xmm0, -4(%rsp) + testb $0x40, -2(%rsp) + jz 4f movss %xmm1, %xmm0 // otherwise return xmm1 + ret + +3: // xmm1 is a NaN; xmm0 may or may not be. + ucomiss %xmm0, %xmm0 + jp 4f + // xmm1 is a NaN; xmm0 is not. Test if xmm1 is signaling. + movss %xmm1, -4(%rsp) + testb $0x40, -2(%rsp) + jz 4f + ret + +4: // Both arguments are NaNs, or one is a signaling NaN. + addss %xmm1, %xmm0 2: ret END(__fmaxf) -weak_alias (__fmaxf, fmaxf) +libm_alias_float (__fmax, fmax) diff --git a/sysdeps/x86_64/fpu/s_fmaxl.S b/sysdeps/x86_64/fpu/s_fmaxl.S index f0c2bc0d56..3463a07083 100644 --- a/sysdeps/x86_64/fpu/s_fmaxl.S +++ b/sysdeps/x86_64/fpu/s_fmaxl.S @@ -1,5 +1,5 @@ /* Compute maximum of two numbers, regarding NaN as missing argument. - Copyright (C) 1997-2016 Free Software Foundation, Inc. + Copyright (C) 1997-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. @@ -18,22 +18,42 @@ <http://www.gnu.org/licenses/>. */ #include <sysdep.h> +#include <libm-alias-ldouble.h> .text ENTRY(__fmaxl) fldt 8(%rsp) // x fldt 24(%rsp) // x : y - fucomi %st(0), %st - fcmovu %st(1), %st // now %st contains y if not NaN, x otherwise - - fxch - fucomi %st(1), %st + jp 2f fcmovb %st(1), %st fstp %st(1) ret + +2: // Unordered. + fucomi %st(0), %st + jp 3f + // st(1) is a NaN; st(0) is not. Test if st(1) is signaling. + testb $0x40, 15(%rsp) + jz 4f + fstp %st(1) + ret + +3: // st(0) is a NaN; st(1) may or may not be. + fxch + fucomi %st(0), %st + jp 4f + // st(1) is a NaN; st(0) is not. Test if st(1) is signaling. + testb $0x40, 31(%rsp) + jz 4f + fstp %st(1) + ret + +4: // Both arguments are NaNs, or one is a signaling NaN. + faddp + ret END(__fmaxl) -weak_alias (__fmaxl, fmaxl) +libm_alias_ldouble (__fmax, fmax) diff --git a/sysdeps/x86_64/fpu/s_fmin.S b/sysdeps/x86_64/fpu/s_fmin.S index fb14e2f3ed..15b6eaed90 100644 --- a/sysdeps/x86_64/fpu/s_fmin.S +++ b/sysdeps/x86_64/fpu/s_fmin.S @@ -1,5 +1,5 @@ /* Compute minimum of two numbers, regarding NaN as missing argument. - Copyright (C) 2002-2016 Free Software Foundation, Inc. + Copyright (C) 2002-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Andreas Jaeger <aj@suse.de>, 2002. @@ -18,6 +18,7 @@ <http://www.gnu.org/licenses/>. */ #include <sysdep.h> +#include <libm-alias-double.h> .text ENTRY(__fmin) @@ -27,9 +28,26 @@ ENTRY(__fmin) jmp 2f 1: ucomisd %xmm1, %xmm1 // Is xmm1 a NaN? - jp 2f // then return xmm0 + jp 3f + // xmm0 is a NaN; xmm1 is not. Test if xmm0 is signaling. + movsd %xmm0, -8(%rsp) + testb $0x8, -2(%rsp) + jz 4f movsd %xmm1, %xmm0 // otherwise return xmm1 + ret + +3: // xmm1 is a NaN; xmm0 may or may not be. + ucomisd %xmm0, %xmm0 + jp 4f + // xmm1 is a NaN; xmm0 is not. Test if xmm1 is signaling. + movsd %xmm1, -8(%rsp) + testb $0x8, -2(%rsp) + jz 4f + ret + +4: // Both arguments are NaNs, or one is a signaling NaN. + addsd %xmm1, %xmm0 2: ret END(__fmin) -weak_alias (__fmin, fmin) +libm_alias_double (__fmin, fmin) diff --git a/sysdeps/x86_64/fpu/s_fminf.S b/sysdeps/x86_64/fpu/s_fminf.S index c8d6d0fd33..28e26aead5 100644 --- a/sysdeps/x86_64/fpu/s_fminf.S +++ b/sysdeps/x86_64/fpu/s_fminf.S @@ -1,5 +1,5 @@ /* Compute minimum of two numbers, regarding NaN as missing argument. - Copyright (C) 2002-2016 Free Software Foundation, Inc. + Copyright (C) 2002-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Andreas Jaeger <aj@suse.de>, 2002. @@ -18,6 +18,7 @@ <http://www.gnu.org/licenses/>. */ #include <sysdep.h> +#include <libm-alias-float.h> .text ENTRY(__fminf) @@ -27,9 +28,26 @@ ENTRY(__fminf) jmp 2f 1: ucomiss %xmm1, %xmm1 // Is xmm1 a NaN? - jp 2f // then return xmm0 + jp 3f + // xmm0 is a NaN; xmm1 is not. Test if xmm0 is signaling. + movss %xmm0, -4(%rsp) + testb $0x40, -2(%rsp) + jz 4f movss %xmm1, %xmm0 // otherwise return xmm1 + ret + +3: // xmm1 is a NaN; xmm0 may or may not be. + ucomiss %xmm0, %xmm0 + jp 4f + // xmm1 is a NaN; xmm0 is not. Test if xmm1 is signaling. + movss %xmm1, -4(%rsp) + testb $0x40, -2(%rsp) + jz 4f + ret + +4: // Both arguments are NaNs, or one is a signaling NaN. + addss %xmm1, %xmm0 2: ret END(__fminf) -weak_alias (__fminf, fminf) +libm_alias_float (__fmin, fmin) diff --git a/sysdeps/x86_64/fpu/s_fminl.S b/sysdeps/x86_64/fpu/s_fminl.S index f1a06d29d7..df81762449 100644 --- a/sysdeps/x86_64/fpu/s_fminl.S +++ b/sysdeps/x86_64/fpu/s_fminl.S @@ -1,5 +1,5 @@ /* Compute minimum of two numbers, regarding NaN as missing argument. - Copyright (C) 1997-2016 Free Software Foundation, Inc. + Copyright (C) 1997-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. @@ -18,20 +18,42 @@ <http://www.gnu.org/licenses/>. */ #include <sysdep.h> +#include <libm-alias-ldouble.h> .text ENTRY(__fminl) fldt 8(%rsp) // x fldt 24(%rsp) // x : y - fucomi %st(0), %st - fcmovu %st(1), %st // now %st contains y if not NaN, x otherwise - fucomi %st(1), %st + jp 2f fcmovnb %st(1), %st fstp %st(1) ret + +2: // Unordered. + fucomi %st(0), %st + jp 3f + // st(1) is a NaN; st(0) is not. Test if st(1) is signaling. + testb $0x40, 15(%rsp) + jz 4f + fstp %st(1) + ret + +3: // st(0) is a NaN; st(1) may or may not be. + fxch + fucomi %st(0), %st + jp 4f + // st(1) is a NaN; st(0) is not. Test if st(1) is signaling. + testb $0x40, 31(%rsp) + jz 4f + fstp %st(1) + ret + +4: // Both arguments are NaNs, or one is a signaling NaN. + faddp + ret END(__fminl) -weak_alias (__fminl, fminl) +libm_alias_ldouble (__fmin, fmin) diff --git a/sysdeps/x86_64/fpu/s_llrint.S b/sysdeps/x86_64/fpu/s_llrint.S index 6634c653ea..7b93724e46 100644 --- a/sysdeps/x86_64/fpu/s_llrint.S +++ b/sysdeps/x86_64/fpu/s_llrint.S @@ -1,6 +1,6 @@ /* Round argument to nearest integral value according to current rounding direction. - Copyright (C) 2002-2016 Free Software Foundation, Inc. + Copyright (C) 2002-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Andreas Jaeger <aj@suse.d>, 2002. @@ -19,14 +19,15 @@ <http://www.gnu.org/licenses/>. */ #include <sysdep.h> +#include <libm-alias-double.h> .text ENTRY(__llrint) cvtsd2si %xmm0,%rax ret END(__llrint) -weak_alias (__llrint, llrint) +libm_alias_double (__llrint, llrint) #ifndef __ILP32__ strong_alias (__llrint, __lrint) -weak_alias (__llrint, lrint) +libm_alias_double (__llrint, lrint) #endif diff --git a/sysdeps/x86_64/fpu/s_llrintf.S b/sysdeps/x86_64/fpu/s_llrintf.S index 5ac03dffd9..b6088de1ff 100644 --- a/sysdeps/x86_64/fpu/s_llrintf.S +++ b/sysdeps/x86_64/fpu/s_llrintf.S @@ -1,6 +1,6 @@ /* Round argument to nearest integral value according to current rounding direction. - Copyright (C) 2002-2016 Free Software Foundation, Inc. + Copyright (C) 2002-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Andreas Jaeger <aj@suse.d>, 2002. @@ -19,14 +19,15 @@ <http://www.gnu.org/licenses/>. */ #include <sysdep.h> +#include <libm-alias-float.h> .text ENTRY(__llrintf) cvtss2si %xmm0,%rax ret END(__llrintf) -weak_alias (__llrintf, llrintf) +libm_alias_float (__llrint, llrint) #ifndef __ILP32__ strong_alias (__llrintf, __lrintf) -weak_alias (__llrintf, lrintf) +libm_alias_float (__llrint, lrint) #endif diff --git a/sysdeps/x86_64/fpu/s_llrintl.S b/sysdeps/x86_64/fpu/s_llrintl.S index 5f4d827dff..49f6ff1961 100644 --- a/sysdeps/x86_64/fpu/s_llrintl.S +++ b/sysdeps/x86_64/fpu/s_llrintl.S @@ -1,6 +1,6 @@ /* Round argument to nearest integral value according to current rounding direction. - Copyright (C) 1997-2016 Free Software Foundation, Inc. + Copyright (C) 1997-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -18,6 +18,7 @@ <http://www.gnu.org/licenses/>. */ #include <sysdep.h> +#include <libm-alias-ldouble.h> .text ENTRY(__llrintl) @@ -27,8 +28,8 @@ ENTRY(__llrintl) movq -8(%rsp),%rax ret END(__llrintl) -weak_alias (__llrintl, llrintl) +libm_alias_ldouble (__llrint, llrint) #ifndef __ILP32__ strong_alias (__llrintl, __lrintl) -weak_alias (__llrintl, lrintl) +libm_alias_ldouble (__llrint, lrint) #endif diff --git a/sysdeps/x86_64/fpu/s_log1pl.S b/sysdeps/x86_64/fpu/s_log1pl.S index e83f64d3c0..947e5e4552 100644 --- a/sysdeps/x86_64/fpu/s_log1pl.S +++ b/sysdeps/x86_64/fpu/s_log1pl.S @@ -68,6 +68,7 @@ ENTRY(__log1pl) jnz 4b // in case x is ±Inf fstp %st(1) fstp %st(1) + fadd %st(0) ret END (__log1pl) diff --git a/sysdeps/x86_64/fpu/s_nearbyintl.S b/sysdeps/x86_64/fpu/s_nearbyintl.S index 76d41bdd52..80508bdbee 100644 --- a/sysdeps/x86_64/fpu/s_nearbyintl.S +++ b/sysdeps/x86_64/fpu/s_nearbyintl.S @@ -4,15 +4,12 @@ */ /* Adapted for use as nearbyint by Ulrich Drepper <drepper@cygnus.com>. */ +#include <libm-alias-ldouble.h> #include <machine/asm.h> ENTRY(__nearbyintl) fldt 8(%rsp) fnstenv -28(%rsp) - movl -28(%rsp), %eax - orl $0x20, %eax - movl %eax, -32(%rsp) - fldcw -32(%rsp) frndint fnstsw andl $0x1, %eax @@ -20,4 +17,4 @@ ENTRY(__nearbyintl) fldenv -28(%rsp) ret END (__nearbyintl) -weak_alias (__nearbyintl, nearbyintl) +libm_alias_ldouble (__nearbyint, nearbyint) diff --git a/sysdeps/x86_64/fpu/s_signbit.S b/sysdeps/x86_64/fpu/s_signbit.S index 92a79d3123..becfc646cb 100644 --- a/sysdeps/x86_64/fpu/s_signbit.S +++ b/sysdeps/x86_64/fpu/s_signbit.S @@ -1,5 +1,5 @@ /* Return nonzero value if number is negative. - Copyright (C) 2009-2016 Free Software Foundation, Inc. + Copyright (C) 2009-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper <drepper@redha.com>, 2009. diff --git a/sysdeps/x86_64/fpu/s_signbitf.S b/sysdeps/x86_64/fpu/s_signbitf.S index 885645372e..c7be6a6329 100644 --- a/sysdeps/x86_64/fpu/s_signbitf.S +++ b/sysdeps/x86_64/fpu/s_signbitf.S @@ -1,5 +1,5 @@ /* Return nonzero value if number is negative. - Copyright (C) 2009-2016 Free Software Foundation, Inc. + Copyright (C) 2009-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper <drepper@redha.com>, 2009. diff --git a/sysdeps/x86_64/fpu/s_sincosf.S b/sysdeps/x86_64/fpu/s_sincosf.S index 5e7cbe57e3..2086e8ca5c 100644 --- a/sysdeps/x86_64/fpu/s_sincosf.S +++ b/sysdeps/x86_64/fpu/s_sincosf.S @@ -1,5 +1,5 @@ /* Optimized sincosf function. - Copyright (C) 2012-2016 Free Software Foundation, Inc. + Copyright (C) 2012-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -17,8 +17,8 @@ <http://www.gnu.org/licenses/>. */ #include <sysdep.h> -#define __need_Emath -#include <bits/errno.h> +#include <errno.h> +#include <libm-alias-float.h> /* Short algorithm description: * @@ -561,4 +561,6 @@ L(SP_ONE): .type L(SP_ONE), @object ASM_SIZE_DIRECTIVE(L(SP_ONE)) -weak_alias(__sincosf, sincosf) +#ifndef __sincosf +libm_alias_float (__sincos, sincos) +#endif diff --git a/sysdeps/x86_64/fpu/s_sinf.S b/sysdeps/x86_64/fpu/s_sinf.S deleted file mode 100644 index c980c6e207..0000000000 --- a/sysdeps/x86_64/fpu/s_sinf.S +++ /dev/null @@ -1,559 +0,0 @@ -/* Optimized sinf function. - Copyright (C) 2012-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#define __need_Emath -#include <bits/errno.h> - -/* Short algorithm description: - * - * 1) if |x| == 0: return x. - * 2) if |x| < 2^-27: return x-x*DP_SMALL, raise underflow only when needed. - * 3) if |x| < 2^-5 : return x+x^3*DP_SIN2_0+x^5*DP_SIN2_1. - * 4) if |x| < Pi/4: return x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))). - * 5) if |x| < 9*Pi/4: - * 5.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0x0e, n=k+1, - * t=|x|-j*Pi/4. - * 5.2) Reconstruction: - * s = sign(x) * (-1.0)^((n>>2)&1) - * if(n&2 != 0) { - * using cos(t) polynomial for |t|<Pi/4, result is - * s * (1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4))))). - * } else { - * using sin(t) polynomial for |t|<Pi/4, result is - * s * t * (1.0+t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4))))). - * } - * 6) if |x| < 2^23, large args: - * 6.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+1, - * t=|x|-j*Pi/4. - * 6.2) Reconstruction same as (5.2). - * 7) if |x| >= 2^23, very large args: - * 7.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+1, - * t=|x|-j*Pi/4. - * 7.2) Reconstruction same as (5.2). - * 8) if x is Inf, return x-x, and set errno=EDOM. - * 9) if x is NaN, return x-x. - * - * Special cases: - * sin(+-0) = +-0 not raising inexact/underflow, - * sin(subnormal) raises inexact/underflow, - * sin(min_normalized) raises inexact/underflow, - * sin(normalized) raises inexact, - * sin(Inf) = NaN, raises invalid, sets errno to EDOM, - * sin(NaN) = NaN. - */ - - .text -ENTRY(__sinf) - /* Input: single precision x in %xmm0 */ - - movd %xmm0, %eax /* Bits of x */ - movaps %xmm0, %xmm7 /* Copy of x */ - cvtss2sd %xmm0, %xmm0 /* DP x */ - movss L(SP_ABS_MASK)(%rip), %xmm3 - movl %eax, %edi /* Copy of x bits */ - andl $0x7fffffff, %eax /* |x| */ - - cmpl $0x3f490fdb, %eax /* |x|<Pi/4? */ - jb L(arg_less_pio4) - - /* Here if |x|>=Pi/4 */ - andps %xmm7, %xmm3 /* SP |x| */ - andpd L(DP_ABS_MASK)(%rip),%xmm0 /* DP |x| */ - movss L(SP_INVPIO4)(%rip), %xmm2 /* SP 1/(Pi/4) */ - - cmpl $0x40e231d6, %eax /* |x|<9*Pi/4? */ - jae L(large_args) - - /* Here if Pi/4<=|x|<9*Pi/4 */ - mulss %xmm3, %xmm2 /* SP |x|/(Pi/4) */ - movl %edi, %ecx /* Load x */ - cvttss2si %xmm2, %eax /* k, number of Pi/4 in x */ - lea L(PIO4J)(%rip), %rsi - shrl $31, %ecx /* sign of x */ - addl $1, %eax /* k+1 */ - movl $0x0e, %edx - andl %eax, %edx /* j = (k+1)&0x0e */ - subsd (%rsi,%rdx,8), %xmm0 /* t = |x| - j * Pi/4 */ - -L(reconstruction): - /* Input: %eax=n, %xmm0=t, %ecx=sign(x) */ - testl $2, %eax /* n&2 != 0? */ - jz L(sin_poly) - -/*L(cos_poly):*/ - /* Here if sin(x) calculated using cos(t) polynomial for |t|<Pi/4: - * y = t*t; z = y*y; - * s = sign(x) * (-1.0)^((n>>2)&1) - * result = s * (1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4))))) - */ - shrl $2, %eax /* n>>2 */ - mulsd %xmm0, %xmm0 /* y=t^2 */ - andl $1, %eax /* (n>>2)&1 */ - movaps %xmm0, %xmm1 /* y */ - mulsd %xmm0, %xmm0 /* z=t^4 */ - - movsd L(DP_C4)(%rip), %xmm4 /* C4 */ - mulsd %xmm0, %xmm4 /* z*C4 */ - xorl %eax, %ecx /* (-1.0)^((n>>2)&1) XOR sign(x) */ - movsd L(DP_C3)(%rip), %xmm3 /* C3 */ - mulsd %xmm0, %xmm3 /* z*C3 */ - lea L(DP_ONES)(%rip), %rsi - addsd L(DP_C2)(%rip), %xmm4 /* C2+z*C4 */ - mulsd %xmm0, %xmm4 /* z*(C2+z*C4) */ - addsd L(DP_C1)(%rip), %xmm3 /* C1+z*C3 */ - mulsd %xmm0, %xmm3 /* z*(C1+z*C3) */ - addsd L(DP_C0)(%rip), %xmm4 /* C0+z*(C2+z*C4) */ - mulsd %xmm1, %xmm4 /* y*(C0+z*(C2+z*C4)) */ - - /* y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */ - addsd %xmm4, %xmm3 - /* 1.0+y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */ - addsd L(DP_ONES)(%rip), %xmm3 - - mulsd (%rsi,%rcx,8), %xmm3 /* DP result */ - cvtsd2ss %xmm3, %xmm0 /* SP result */ - ret - - .p2align 4 -L(sin_poly): - /* Here if sin(x) calculated using sin(t) polynomial for |t|<Pi/4: - * y = t*t; z = y*y; - * s = sign(x) * (-1.0)^((n>>2)&1) - * result = s * t * (1.0+t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4))))) - */ - - movaps %xmm0, %xmm4 /* t */ - shrl $2, %eax /* n>>2 */ - mulsd %xmm0, %xmm0 /* y=t^2 */ - andl $1, %eax /* (n>>2)&1 */ - movaps %xmm0, %xmm1 /* y */ - xorl %eax, %ecx /* (-1.0)^((n>>2)&1) XOR sign(x) */ - mulsd %xmm0, %xmm0 /* z=t^4 */ - - movsd L(DP_S4)(%rip), %xmm2 /* S4 */ - mulsd %xmm0, %xmm2 /* z*S4 */ - movsd L(DP_S3)(%rip), %xmm3 /* S3 */ - mulsd %xmm0, %xmm3 /* z*S3 */ - lea L(DP_ONES)(%rip), %rsi - addsd L(DP_S2)(%rip), %xmm2 /* S2+z*S4 */ - mulsd %xmm0, %xmm2 /* z*(S2+z*S4) */ - addsd L(DP_S1)(%rip), %xmm3 /* S1+z*S3 */ - mulsd %xmm0, %xmm3 /* z*(S1+z*S3) */ - addsd L(DP_S0)(%rip), %xmm2 /* S0+z*(S2+z*S4) */ - mulsd %xmm1, %xmm2 /* y*(S0+z*(S2+z*S4)) */ - /* t*s, where s = sign(x) * (-1.0)^((n>>2)&1) */ - mulsd (%rsi,%rcx,8), %xmm4 - /* y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */ - addsd %xmm2, %xmm3 - /* t*s*y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */ - mulsd %xmm4, %xmm3 - /* t*s*(1.0+y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */ - addsd %xmm4, %xmm3 - cvtsd2ss %xmm3, %xmm0 /* SP result */ - ret - - .p2align 4 -L(large_args): - /* Here if |x|>=9*Pi/4 */ - cmpl $0x7f800000, %eax /* x is Inf or NaN? */ - jae L(arg_inf_or_nan) - - /* Here if finite |x|>=9*Pi/4 */ - cmpl $0x4b000000, %eax /* |x|<2^23? */ - jae L(very_large_args) - - /* Here if 9*Pi/4<=|x|<2^23 */ - movsd L(DP_INVPIO4)(%rip), %xmm1 /* 1/(Pi/4) */ - mulsd %xmm0, %xmm1 /* |x|/(Pi/4) */ - cvttsd2si %xmm1, %eax /* k=trunc(|x|/(Pi/4)) */ - addl $1, %eax /* k+1 */ - movl %eax, %edx - andl $0xfffffffe, %edx /* j=(k+1)&0xfffffffe */ - cvtsi2sdl %edx, %xmm4 /* DP j */ - movl %edi, %ecx /* Load x */ - movsd L(DP_PIO4HI)(%rip), %xmm2 /* -PIO4HI = high part of -Pi/4 */ - shrl $31, %ecx /* sign bit of x */ - mulsd %xmm4, %xmm2 /* -j*PIO4HI */ - movsd L(DP_PIO4LO)(%rip), %xmm3 /* -PIO4LO = low part of -Pi/4 */ - addsd %xmm2, %xmm0 /* |x| - j*PIO4HI */ - mulsd %xmm3, %xmm4 /* j*PIO4LO */ - addsd %xmm4, %xmm0 /* t = |x| - j*PIO4HI - j*PIO4LO */ - jmp L(reconstruction) - - .p2align 4 -L(very_large_args): - /* Here if finite |x|>=2^23 */ - - /* bitpos = (ix>>23) - BIAS_32 + 59; */ - shrl $23, %eax /* eb = biased exponent of x */ - /* bitpos = eb - 0x7f + 59, where 0x7f is exponent bias */ - subl $68, %eax - movl $28, %ecx /* %cl=28 */ - movl %eax, %edx /* bitpos copy */ - - /* j = bitpos/28; */ - div %cl /* j in register %al=%ax/%cl */ - movapd %xmm0, %xmm3 /* |x| */ - /* clear unneeded remainder from %ah */ - andl $0xff, %eax - - imull $28, %eax, %ecx /* j*28 */ - lea L(_FPI)(%rip), %rsi - movsd L(DP_HI_MASK)(%rip), %xmm4 /* DP_HI_MASK */ - movapd %xmm0, %xmm5 /* |x| */ - mulsd -16(%rsi,%rax,8), %xmm3 /* tmp3 = FPI[j-2]*|x| */ - movapd %xmm0, %xmm1 /* |x| */ - mulsd -8(%rsi,%rax,8), %xmm5 /* tmp2 = FPI[j-1]*|x| */ - mulsd (%rsi,%rax,8), %xmm0 /* tmp0 = FPI[j]*|x| */ - addl $19, %ecx /* j*28+19 */ - mulsd 8(%rsi,%rax,8), %xmm1 /* tmp1 = FPI[j+1]*|x| */ - cmpl %ecx, %edx /* bitpos>=j*28+19? */ - jl L(very_large_skip1) - - /* Here if bitpos>=j*28+19 */ - andpd %xmm3, %xmm4 /* HI(tmp3) */ - subsd %xmm4, %xmm3 /* tmp3 = tmp3 - HI(tmp3) */ -L(very_large_skip1): - - movsd L(DP_2POW52)(%rip), %xmm6 - movapd %xmm5, %xmm2 /* tmp2 copy */ - addsd %xmm3, %xmm5 /* tmp5 = tmp3 + tmp2 */ - movl $1, %edx - addsd %xmm5, %xmm6 /* tmp6 = tmp5 + 2^52 */ - movsd 8+L(DP_2POW52)(%rip), %xmm4 - movd %xmm6, %eax /* k = I64_LO(tmp6); */ - addsd %xmm6, %xmm4 /* tmp4 = tmp6 - 2^52 */ - movl %edi, %ecx /* Load x */ - comisd %xmm5, %xmm4 /* tmp4 > tmp5? */ - jbe L(very_large_skip2) - - /* Here if tmp4 > tmp5 */ - subl $1, %eax /* k-- */ - addsd 8+L(DP_ONES)(%rip), %xmm4 /* tmp4 -= 1.0 */ -L(very_large_skip2): - - andl %eax, %edx /* k&1 */ - lea L(DP_ZERONE)(%rip), %rsi - subsd %xmm4, %xmm3 /* tmp3 -= tmp4 */ - addsd (%rsi,%rdx,8), %xmm3 /* t = DP_ZERONE[k&1] + tmp3 */ - addsd %xmm2, %xmm3 /* t += tmp2 */ - shrl $31, %ecx /* sign of x */ - addsd %xmm3, %xmm0 /* t += tmp0 */ - addl $1, %eax /* n=k+1 */ - addsd %xmm1, %xmm0 /* t += tmp1 */ - mulsd L(DP_PIO4)(%rip), %xmm0 /* t *= PI04 */ - - jmp L(reconstruction) /* end of very_large_args peth */ - - .p2align 4 -L(arg_less_pio4): - /* Here if |x|<Pi/4 */ - cmpl $0x3d000000, %eax /* |x|<2^-5? */ - jl L(arg_less_2pn5) - - /* Here if 2^-5<=|x|<Pi/4 */ - movaps %xmm0, %xmm3 /* x */ - mulsd %xmm0, %xmm0 /* y=x^2 */ - movaps %xmm0, %xmm1 /* y */ - mulsd %xmm0, %xmm0 /* z=x^4 */ - movsd L(DP_S4)(%rip), %xmm4 /* S4 */ - mulsd %xmm0, %xmm4 /* z*S4 */ - movsd L(DP_S3)(%rip), %xmm5 /* S3 */ - mulsd %xmm0, %xmm5 /* z*S3 */ - addsd L(DP_S2)(%rip), %xmm4 /* S2+z*S4 */ - mulsd %xmm0, %xmm4 /* z*(S2+z*S4) */ - addsd L(DP_S1)(%rip), %xmm5 /* S1+z*S3 */ - mulsd %xmm0, %xmm5 /* z*(S1+z*S3) */ - addsd L(DP_S0)(%rip), %xmm4 /* S0+z*(S2+z*S4) */ - mulsd %xmm1, %xmm4 /* y*(S0+z*(S2+z*S4)) */ - mulsd %xmm3, %xmm5 /* x*z*(S1+z*S3) */ - mulsd %xmm3, %xmm4 /* x*y*(S0+z*(S2+z*S4)) */ - /* x*y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */ - addsd %xmm5, %xmm4 - /* x + x*y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */ - addsd %xmm4, %xmm3 - cvtsd2ss %xmm3, %xmm0 /* SP result */ - ret - - .p2align 4 -L(arg_less_2pn5): - /* Here if |x|<2^-5 */ - cmpl $0x32000000, %eax /* |x|<2^-27? */ - jl L(arg_less_2pn27) - - /* Here if 2^-27<=|x|<2^-5 */ - movaps %xmm0, %xmm1 /* DP x */ - mulsd %xmm0, %xmm0 /* DP x^2 */ - movsd L(DP_SIN2_1)(%rip), %xmm3 /* DP DP_SIN2_1 */ - mulsd %xmm0, %xmm3 /* DP x^2*DP_SIN2_1 */ - addsd L(DP_SIN2_0)(%rip), %xmm3 /* DP DP_SIN2_0+x^2*DP_SIN2_1 */ - mulsd %xmm0, %xmm3 /* DP x^2*DP_SIN2_0+x^4*DP_SIN2_1 */ - mulsd %xmm1, %xmm3 /* DP x^3*DP_SIN2_0+x^5*DP_SIN2_1 */ - addsd %xmm1, %xmm3 /* DP x+x^3*DP_SIN2_0+x^5*DP_SIN2_1 */ - cvtsd2ss %xmm3, %xmm0 /* SP result */ - ret - - .p2align 4 -L(arg_less_2pn27): - cmpl $0, %eax /* x=0? */ - je L(arg_zero) /* in case x=0 return sin(+-0)==+-0 */ - /* Here if |x|<2^-27 */ - /* - * Special cases here: - * sin(subnormal) raises inexact/underflow - * sin(min_normalized) raises inexact/underflow - * sin(normalized) raises inexact - */ - movaps %xmm0, %xmm3 /* Copy of DP x */ - mulsd L(DP_SMALL)(%rip), %xmm0 /* x*DP_SMALL */ - subsd %xmm0, %xmm3 /* Result is x-x*DP_SMALL */ - cvtsd2ss %xmm3, %xmm0 /* Result converted to SP */ - ret - - .p2align 4 -L(arg_zero): - movaps %xmm7, %xmm0 /* SP x */ - ret - - .p2align 4 -L(arg_inf_or_nan): - /* Here if |x| is Inf or NAN */ - jne L(skip_errno_setting) /* in case of x is NaN */ - - /* Align stack to 16 bytes. */ - subq $8, %rsp - cfi_adjust_cfa_offset (8) - /* Here if x is Inf. Set errno to EDOM. */ - call JUMPTARGET(__errno_location) - addq $8, %rsp - cfi_adjust_cfa_offset (-8) - - movl $EDOM, (%rax) - - .p2align 4 -L(skip_errno_setting): - /* Here if |x| is Inf or NAN. Continued. */ - movaps %xmm7, %xmm0 /* load x */ - subss %xmm0, %xmm0 /* Result is NaN */ - ret -END(__sinf) - - .section .rodata, "a" - .p2align 3 -L(PIO4J): /* Table of j*Pi/4, for j=0,1,..,10 */ - .long 0x00000000,0x00000000 - .long 0x54442d18,0x3fe921fb - .long 0x54442d18,0x3ff921fb - .long 0x7f3321d2,0x4002d97c - .long 0x54442d18,0x400921fb - .long 0x2955385e,0x400f6a7a - .long 0x7f3321d2,0x4012d97c - .long 0xe9bba775,0x4015fdbb - .long 0x54442d18,0x401921fb - .long 0xbeccb2bb,0x401c463a - .long 0x2955385e,0x401f6a7a - .type L(PIO4J), @object - ASM_SIZE_DIRECTIVE(L(PIO4J)) - - .p2align 3 -L(_FPI): /* 4/Pi broken into sum of positive DP values */ - .long 0x00000000,0x00000000 - .long 0x6c000000,0x3ff45f30 - .long 0x2a000000,0x3e3c9c88 - .long 0xa8000000,0x3c54fe13 - .long 0xd0000000,0x3aaf47d4 - .long 0x6c000000,0x38fbb81b - .long 0xe0000000,0x3714acc9 - .long 0x7c000000,0x3560e410 - .long 0x56000000,0x33bca2c7 - .long 0xac000000,0x31fbd778 - .long 0xe0000000,0x300b7246 - .long 0xe8000000,0x2e5d2126 - .long 0x48000000,0x2c970032 - .long 0xe8000000,0x2ad77504 - .long 0xe0000000,0x290921cf - .long 0xb0000000,0x274deb1c - .long 0xe0000000,0x25829a73 - .long 0xbe000000,0x23fd1046 - .long 0x10000000,0x2224baed - .long 0x8e000000,0x20709d33 - .long 0x80000000,0x1e535a2f - .long 0x64000000,0x1cef904e - .long 0x30000000,0x1b0d6398 - .long 0x24000000,0x1964ce7d - .long 0x16000000,0x17b908bf - .type L(_FPI), @object - ASM_SIZE_DIRECTIVE(L(_FPI)) - -/* Coefficients of polynomial - for sin(x)~=x+x^3*DP_SIN2_0+x^5*DP_SIN2_1, |x|<2^-5. */ - .p2align 3 -L(DP_SIN2_0): - .long 0x5543d49d,0xbfc55555 - .type L(DP_SIN2_0), @object - ASM_SIZE_DIRECTIVE(L(DP_SIN2_0)) - - .p2align 3 -L(DP_SIN2_1): - .long 0x75cec8c5,0x3f8110f4 - .type L(DP_SIN2_1), @object - ASM_SIZE_DIRECTIVE(L(DP_SIN2_1)) - - .p2align 3 -L(DP_ZERONE): - .long 0x00000000,0x00000000 /* 0.0 */ - .long 0x00000000,0xbff00000 /* 1.0 */ - .type L(DP_ZERONE), @object - ASM_SIZE_DIRECTIVE(L(DP_ZERONE)) - - .p2align 3 -L(DP_ONES): - .long 0x00000000,0x3ff00000 /* +1.0 */ - .long 0x00000000,0xbff00000 /* -1.0 */ - .type L(DP_ONES), @object - ASM_SIZE_DIRECTIVE(L(DP_ONES)) - -/* Coefficients of polynomial - for sin(t)~=t+t^3*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4)))), |t|<Pi/4. */ - .p2align 3 -L(DP_S3): - .long 0x64e6b5b4,0x3ec71d72 - .type L(DP_S3), @object - ASM_SIZE_DIRECTIVE(L(DP_S3)) - - .p2align 3 -L(DP_S1): - .long 0x10c2688b,0x3f811111 - .type L(DP_S1), @object - ASM_SIZE_DIRECTIVE(L(DP_S1)) - - .p2align 3 -L(DP_S4): - .long 0x1674b58a,0xbe5a947e - .type L(DP_S4), @object - ASM_SIZE_DIRECTIVE(L(DP_S4)) - - .p2align 3 -L(DP_S2): - .long 0x8b4bd1f9,0xbf2a019f - .type L(DP_S2), @object - ASM_SIZE_DIRECTIVE(L(DP_S2)) - - .p2align 3 -L(DP_S0): - .long 0x55551cd9,0xbfc55555 - .type L(DP_S0), @object - ASM_SIZE_DIRECTIVE(L(DP_S0)) - - .p2align 3 -L(DP_SMALL): - .long 0x00000000,0x3cd00000 /* 2^(-50) */ - .type L(DP_SMALL), @object - ASM_SIZE_DIRECTIVE(L(DP_SMALL)) - -/* Coefficients of polynomial - for cos(t)~=1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4)))), |t|<Pi/4. */ - .p2align 3 -L(DP_C3): - .long 0x9ac43cc0,0x3efa00eb - .type L(DP_C3), @object - ASM_SIZE_DIRECTIVE(L(DP_C3)) - - .p2align 3 -L(DP_C1): - .long 0x545c50c7,0x3fa55555 - .type L(DP_C1), @object - ASM_SIZE_DIRECTIVE(L(DP_C1)) - - .p2align 3 -L(DP_C4): - .long 0xdd8844d7,0xbe923c97 - .type L(DP_C4), @object - ASM_SIZE_DIRECTIVE(L(DP_C4)) - - .p2align 3 -L(DP_C2): - .long 0x348b6874,0xbf56c16b - .type L(DP_C2), @object - ASM_SIZE_DIRECTIVE(L(DP_C2)) - - .p2align 3 -L(DP_C0): - .long 0xfffe98ae,0xbfdfffff - .type L(DP_C0), @object - ASM_SIZE_DIRECTIVE(L(DP_C0)) - - .p2align 3 -L(DP_PIO4): - .long 0x54442d18,0x3fe921fb /* Pi/4 */ - .type L(DP_PIO4), @object - ASM_SIZE_DIRECTIVE(L(DP_PIO4)) - - .p2align 3 -L(DP_2POW52): - .long 0x00000000,0x43300000 /* +2^52 */ - .long 0x00000000,0xc3300000 /* -2^52 */ - .type L(DP_2POW52), @object - ASM_SIZE_DIRECTIVE(L(DP_2POW52)) - - .p2align 3 -L(DP_INVPIO4): - .long 0x6dc9c883,0x3ff45f30 /* 4/Pi */ - .type L(DP_INVPIO4), @object - ASM_SIZE_DIRECTIVE(L(DP_INVPIO4)) - - .p2align 3 -L(DP_PIO4HI): - .long 0x54000000,0xbfe921fb /* High part of Pi/4 */ - .type L(DP_PIO4HI), @object - ASM_SIZE_DIRECTIVE(L(DP_PIO4HI)) - - .p2align 3 -L(DP_PIO4LO): - .long 0x11A62633,0xbe010b46 /* Low part of Pi/4 */ - .type L(DP_PIO4LO), @object - ASM_SIZE_DIRECTIVE(L(DP_PIO4LO)) - - .p2align 2 -L(SP_INVPIO4): - .long 0x3fa2f983 /* 4/Pi */ - .type L(SP_INVPIO4), @object - ASM_SIZE_DIRECTIVE(L(SP_INVPIO4)) - - .p2align 4 -L(DP_ABS_MASK): /* Mask for getting DP absolute value */ - .long 0xffffffff,0x7fffffff - .long 0xffffffff,0x7fffffff - .type L(DP_ABS_MASK), @object - ASM_SIZE_DIRECTIVE(L(DP_ABS_MASK)) - - .p2align 3 -L(DP_HI_MASK): /* Mask for getting high 21 bits of DP value */ - .long 0x00000000,0xffffffff - .type L(DP_HI_MASK),@object - ASM_SIZE_DIRECTIVE(L(DP_HI_MASK)) - - .p2align 4 -L(SP_ABS_MASK): /* Mask for getting SP absolute value */ - .long 0x7fffffff,0x7fffffff - .long 0x7fffffff,0x7fffffff - .type L(SP_ABS_MASK), @object - ASM_SIZE_DIRECTIVE(L(SP_ABS_MASK)) - -weak_alias(__sinf, sinf) diff --git a/sysdeps/x86_64/fpu/s_truncl.S b/sysdeps/x86_64/fpu/s_truncl.S index c37cf00241..22427ece00 100644 --- a/sysdeps/x86_64/fpu/s_truncl.S +++ b/sysdeps/x86_64/fpu/s_truncl.S @@ -1,5 +1,5 @@ /* Truncate long double value. - Copyright (C) 1997-2016 Free Software Foundation, Inc. + Copyright (C) 1997-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. @@ -17,17 +17,21 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ +#include <libm-alias-ldouble.h> #include <machine/asm.h> ENTRY(__truncl) fldt 8(%rsp) - fstcw -4(%rsp) + fnstenv -28(%rsp) movl $0xc00, %edx - orl -4(%rsp), %edx - movl %edx, -8(%rsp) - fldcw -8(%rsp) + orl -28(%rsp), %edx + movl %edx, -32(%rsp) + fldcw -32(%rsp) frndint - fldcw -4(%rsp) + fnstsw + andl $0x1, %eax + orl %eax, -24(%rsp) + fldenv -28(%rsp) ret END(__truncl) -weak_alias (__truncl, truncl) +libm_alias_ldouble (__trunc, trunc) diff --git a/sysdeps/x86_64/fpu/svml_d_cos2_core.S b/sysdeps/x86_64/fpu/svml_d_cos2_core.S index 7f62d29917..111548367b 100644 --- a/sysdeps/x86_64/fpu/svml_d_cos2_core.S +++ b/sysdeps/x86_64/fpu/svml_d_cos2_core.S @@ -1,5 +1,5 @@ /* Function cos vectorized with SSE2. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/svml_d_cos4_core.S b/sysdeps/x86_64/fpu/svml_d_cos4_core.S index b92ff13b86..28b31d510c 100644 --- a/sysdeps/x86_64/fpu/svml_d_cos4_core.S +++ b/sysdeps/x86_64/fpu/svml_d_cos4_core.S @@ -1,5 +1,5 @@ /* Function cos vectorized with AVX2, wrapper version. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/svml_d_cos4_core_avx.S b/sysdeps/x86_64/fpu/svml_d_cos4_core_avx.S index a3da721e35..988d0650ca 100644 --- a/sysdeps/x86_64/fpu/svml_d_cos4_core_avx.S +++ b/sysdeps/x86_64/fpu/svml_d_cos4_core_avx.S @@ -1,5 +1,5 @@ /* Function cos vectorized in AVX ISA as wrapper to SSE4 ISA version. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/svml_d_cos8_core.S b/sysdeps/x86_64/fpu/svml_d_cos8_core.S index e5d986d11a..830776b5d2 100644 --- a/sysdeps/x86_64/fpu/svml_d_cos8_core.S +++ b/sysdeps/x86_64/fpu/svml_d_cos8_core.S @@ -1,5 +1,5 @@ /* Function cos vectorized with AVX-512, wrapper to AVX2. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/svml_d_exp2_core.S b/sysdeps/x86_64/fpu/svml_d_exp2_core.S index 9e511037a1..e19ddb7f3b 100644 --- a/sysdeps/x86_64/fpu/svml_d_exp2_core.S +++ b/sysdeps/x86_64/fpu/svml_d_exp2_core.S @@ -1,5 +1,5 @@ /* Function exp vectorized with SSE2. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -21,7 +21,7 @@ .text ENTRY (_ZGVbN2v_exp) -WRAPPER_IMPL_SSE2 exp +WRAPPER_IMPL_SSE2 __exp_finite END (_ZGVbN2v_exp) #ifndef USE_MULTIARCH diff --git a/sysdeps/x86_64/fpu/svml_d_exp4_core.S b/sysdeps/x86_64/fpu/svml_d_exp4_core.S index 8cac8adbc7..341fea8f30 100644 --- a/sysdeps/x86_64/fpu/svml_d_exp4_core.S +++ b/sysdeps/x86_64/fpu/svml_d_exp4_core.S @@ -1,5 +1,5 @@ /* Function exp vectorized with AVX2, wrapper version. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/svml_d_exp4_core_avx.S b/sysdeps/x86_64/fpu/svml_d_exp4_core_avx.S index 1a0fbf574a..39e6fcf228 100644 --- a/sysdeps/x86_64/fpu/svml_d_exp4_core_avx.S +++ b/sysdeps/x86_64/fpu/svml_d_exp4_core_avx.S @@ -1,5 +1,5 @@ /* Function exp vectorized in AVX ISA as wrapper to SSE4 ISA version. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/svml_d_exp8_core.S b/sysdeps/x86_64/fpu/svml_d_exp8_core.S index 2486e888a4..94edc01fcb 100644 --- a/sysdeps/x86_64/fpu/svml_d_exp8_core.S +++ b/sysdeps/x86_64/fpu/svml_d_exp8_core.S @@ -1,5 +1,5 @@ /* Function exp vectorized with AVX-512. Wrapper to AVX2 version. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/svml_d_exp_data.S b/sysdeps/x86_64/fpu/svml_d_exp_data.S index 6d1acbdd21..5e229c9bcc 100644 --- a/sysdeps/x86_64/fpu/svml_d_exp_data.S +++ b/sysdeps/x86_64/fpu/svml_d_exp_data.S @@ -1,5 +1,5 @@ /* Data for vector function exp. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/svml_d_exp_data.h b/sysdeps/x86_64/fpu/svml_d_exp_data.h index f993403d47..a3721ce137 100644 --- a/sysdeps/x86_64/fpu/svml_d_exp_data.h +++ b/sysdeps/x86_64/fpu/svml_d_exp_data.h @@ -1,5 +1,5 @@ /* Offsets for data table for function exp. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/svml_d_log2_core.S b/sysdeps/x86_64/fpu/svml_d_log2_core.S index 8ea40fee56..41522f2069 100644 --- a/sysdeps/x86_64/fpu/svml_d_log2_core.S +++ b/sysdeps/x86_64/fpu/svml_d_log2_core.S @@ -1,5 +1,5 @@ /* Function log vectorized with SSE2. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -21,7 +21,7 @@ .text ENTRY (_ZGVbN2v_log) -WRAPPER_IMPL_SSE2 log +WRAPPER_IMPL_SSE2 __log_finite END (_ZGVbN2v_log) #ifndef USE_MULTIARCH diff --git a/sysdeps/x86_64/fpu/svml_d_log4_core.S b/sysdeps/x86_64/fpu/svml_d_log4_core.S index 72813d8921..5857b45aa0 100644 --- a/sysdeps/x86_64/fpu/svml_d_log4_core.S +++ b/sysdeps/x86_64/fpu/svml_d_log4_core.S @@ -1,5 +1,5 @@ /* Function log vectorized with AVX2, wrapper version. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/svml_d_log4_core_avx.S b/sysdeps/x86_64/fpu/svml_d_log4_core_avx.S index 6ca1139931..bab3ba9877 100644 --- a/sysdeps/x86_64/fpu/svml_d_log4_core_avx.S +++ b/sysdeps/x86_64/fpu/svml_d_log4_core_avx.S @@ -1,5 +1,5 @@ /* Function log vectorized in AVX ISA as wrapper to SSE4 ISA version. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/svml_d_log8_core.S b/sysdeps/x86_64/fpu/svml_d_log8_core.S index 6850fd9a44..bb3523ee0d 100644 --- a/sysdeps/x86_64/fpu/svml_d_log8_core.S +++ b/sysdeps/x86_64/fpu/svml_d_log8_core.S @@ -1,5 +1,5 @@ /* Function log vectorized with AVX-512. Wrapper to AVX2 version. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/svml_d_log_data.S b/sysdeps/x86_64/fpu/svml_d_log_data.S index 9ab541b23f..0514551ccf 100644 --- a/sysdeps/x86_64/fpu/svml_d_log_data.S +++ b/sysdeps/x86_64/fpu/svml_d_log_data.S @@ -1,5 +1,5 @@ /* Data for function log. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/svml_d_log_data.h b/sysdeps/x86_64/fpu/svml_d_log_data.h index 30c2b54a4b..a317c7b845 100644 --- a/sysdeps/x86_64/fpu/svml_d_log_data.h +++ b/sysdeps/x86_64/fpu/svml_d_log_data.h @@ -1,5 +1,5 @@ /* Offsets for data table for function log. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/svml_d_pow2_core.S b/sysdeps/x86_64/fpu/svml_d_pow2_core.S index b25515c825..b2451b2ed5 100644 --- a/sysdeps/x86_64/fpu/svml_d_pow2_core.S +++ b/sysdeps/x86_64/fpu/svml_d_pow2_core.S @@ -1,5 +1,5 @@ /* Function pow vectorized with SSE2. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -21,7 +21,7 @@ .text ENTRY (_ZGVbN2vv_pow) -WRAPPER_IMPL_SSE2_ff pow +WRAPPER_IMPL_SSE2_ff __pow_finite END (_ZGVbN2vv_pow) #ifndef USE_MULTIARCH diff --git a/sysdeps/x86_64/fpu/svml_d_pow4_core.S b/sysdeps/x86_64/fpu/svml_d_pow4_core.S index 547993799e..1520ba1d45 100644 --- a/sysdeps/x86_64/fpu/svml_d_pow4_core.S +++ b/sysdeps/x86_64/fpu/svml_d_pow4_core.S @@ -1,5 +1,5 @@ /* Function pow vectorized with AVX2, wrapper version. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/svml_d_pow4_core_avx.S b/sysdeps/x86_64/fpu/svml_d_pow4_core_avx.S index 4e4e9867b4..d4b265c91a 100644 --- a/sysdeps/x86_64/fpu/svml_d_pow4_core_avx.S +++ b/sysdeps/x86_64/fpu/svml_d_pow4_core_avx.S @@ -1,5 +1,5 @@ /* Function pow vectorized in AVX ISA as wrapper to SSE4 ISA version. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/svml_d_pow8_core.S b/sysdeps/x86_64/fpu/svml_d_pow8_core.S index 372e5a9c83..15292ccebd 100644 --- a/sysdeps/x86_64/fpu/svml_d_pow8_core.S +++ b/sysdeps/x86_64/fpu/svml_d_pow8_core.S @@ -1,5 +1,5 @@ /* Function pow vectorized with AVX-512. Wrapper to AVX2 version. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/svml_d_pow_data.S b/sysdeps/x86_64/fpu/svml_d_pow_data.S index 8481f95455..9e5f99c25e 100644 --- a/sysdeps/x86_64/fpu/svml_d_pow_data.S +++ b/sysdeps/x86_64/fpu/svml_d_pow_data.S @@ -1,5 +1,5 @@ /* Data for function pow. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/svml_d_pow_data.h b/sysdeps/x86_64/fpu/svml_d_pow_data.h index 239ba96984..55b573b2a7 100644 --- a/sysdeps/x86_64/fpu/svml_d_pow_data.h +++ b/sysdeps/x86_64/fpu/svml_d_pow_data.h @@ -1,5 +1,5 @@ /* Offsets for data table for function pow. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/svml_d_sin2_core.S b/sysdeps/x86_64/fpu/svml_d_sin2_core.S index f6ec13104b..6485e0819f 100644 --- a/sysdeps/x86_64/fpu/svml_d_sin2_core.S +++ b/sysdeps/x86_64/fpu/svml_d_sin2_core.S @@ -1,5 +1,5 @@ /* Function sin vectorized with SSE2. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/svml_d_sin4_core.S b/sysdeps/x86_64/fpu/svml_d_sin4_core.S index 95a1dec6f6..7c7c426451 100644 --- a/sysdeps/x86_64/fpu/svml_d_sin4_core.S +++ b/sysdeps/x86_64/fpu/svml_d_sin4_core.S @@ -1,5 +1,5 @@ /* Function sin vectorized with AVX2, wrapper version. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/svml_d_sin4_core_avx.S b/sysdeps/x86_64/fpu/svml_d_sin4_core_avx.S index 29d1526a12..a8200dfc58 100644 --- a/sysdeps/x86_64/fpu/svml_d_sin4_core_avx.S +++ b/sysdeps/x86_64/fpu/svml_d_sin4_core_avx.S @@ -1,5 +1,5 @@ /* Function sin vectorized in AVX ISA as wrapper to SSE4 ISA version. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/svml_d_sin8_core.S b/sysdeps/x86_64/fpu/svml_d_sin8_core.S index abd86b3d98..7f07a41ba1 100644 --- a/sysdeps/x86_64/fpu/svml_d_sin8_core.S +++ b/sysdeps/x86_64/fpu/svml_d_sin8_core.S @@ -1,5 +1,5 @@ /* Function sin vectorized with AVX-512, wrapper to AVX2 version. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/svml_d_sincos2_core.S b/sysdeps/x86_64/fpu/svml_d_sincos2_core.S index 74afa0a677..ebf9e25aca 100644 --- a/sysdeps/x86_64/fpu/svml_d_sincos2_core.S +++ b/sysdeps/x86_64/fpu/svml_d_sincos2_core.S @@ -1,5 +1,5 @@ /* Function sincos vectorized with SSE2. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -20,8 +20,89 @@ #include "svml_d_wrapper_impl.h" .text -ENTRY (_ZGVbN2vvv_sincos) +ENTRY (_ZGVbN2vl8l8_sincos) WRAPPER_IMPL_SSE2_fFF sincos +END (_ZGVbN2vl8l8_sincos) +libmvec_hidden_def (_ZGVbN2vl8l8_sincos) + +/* SSE2 ISA version as wrapper to scalar (for vector + function declared with #pragma omp declare simd notinbranch). */ +.macro WRAPPER_IMPL_SSE2_fFF_vvv callee +#ifndef __ILP32__ + subq $88, %rsp + cfi_adjust_cfa_offset(88) + movaps %xmm0, 64(%rsp) + lea (%rsp), %rdi + movdqa %xmm1, 32(%rdi) + lea 16(%rsp), %rsi + movdqa %xmm2, 32(%rsi) + call JUMPTARGET(\callee) + movsd 72(%rsp), %xmm0 + lea 8(%rsp), %rdi + lea 24(%rsp), %rsi + call JUMPTARGET(\callee) + movq 32(%rsp), %rdx + movq 48(%rsp), %rsi + movq 40(%rsp), %r8 + movq 56(%rsp), %r10 + movq (%rsp), %rax + movq 16(%rsp), %rcx + movq 8(%rsp), %rdi + movq 24(%rsp), %r9 + movq %rax, (%rdx) + movq %rcx, (%rsi) + movq %rdi, (%r8) + movq %r9, (%r10) + addq $88, %rsp + cfi_adjust_cfa_offset(-88) + ret +#else + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + pushq %rbx + .cfi_def_cfa_offset 24 + .cfi_offset 3, -24 + subl $88, %esp + .cfi_def_cfa_offset 112 + leal 64(%rsp), %esi + movaps %xmm1, 32(%esp) + leal 48(%rsp), %edi + movaps %xmm2, 16(%esp) + movq %rsi, %rbp + movq %rdi, %rbx + movaps %xmm0, (%esp) + call JUMPTARGET(\callee) + movupd 8(%esp), %xmm0 + leal 8(%rbp), %esi + leal 8(%rbx), %edi + call JUMPTARGET(\callee) + movdqa 32(%esp), %xmm1 + movsd 48(%esp), %xmm0 + movq %xmm1, %rax + movdqa 16(%esp), %xmm2 + movsd %xmm0, (%eax) + movsd 56(%esp), %xmm0 + pextrd $1, %xmm1, %eax + movsd %xmm0, (%eax) + movsd 64(%esp), %xmm0 + movq %xmm2, %rax + movsd %xmm0, (%eax) + movsd 72(%esp), %xmm0 + pextrd $1, %xmm2, %eax + movsd %xmm0, (%eax) + addl $88, %esp + .cfi_def_cfa_offset 24 + popq %rbx + .cfi_def_cfa_offset 16 + popq %rbp + .cfi_def_cfa_offset 8 + ret +#endif +.endm + +ENTRY (_ZGVbN2vvv_sincos) +WRAPPER_IMPL_SSE2_fFF_vvv sincos END (_ZGVbN2vvv_sincos) #ifndef USE_MULTIARCH diff --git a/sysdeps/x86_64/fpu/svml_d_sincos4_core.S b/sysdeps/x86_64/fpu/svml_d_sincos4_core.S index 2c0b011fb3..626a2b3a7b 100644 --- a/sysdeps/x86_64/fpu/svml_d_sincos4_core.S +++ b/sysdeps/x86_64/fpu/svml_d_sincos4_core.S @@ -1,5 +1,5 @@ /* Function sincos vectorized with AVX2, wrapper version. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -20,8 +20,131 @@ #include "svml_d_wrapper_impl.h" .text +ENTRY (_ZGVdN4vl8l8_sincos) +WRAPPER_IMPL_AVX_fFF _ZGVbN2vl8l8_sincos +END (_ZGVdN4vl8l8_sincos) +libmvec_hidden_def (_ZGVdN4vl8l8_sincos) + +/* AVX2 ISA version as wrapper to SSE ISA version (for vector + function declared with #pragma omp declare simd notinbranch). */ +.macro WRAPPER_IMPL_AVX2_fFF_vvv callee +#ifndef __ILP32__ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-32, %rsp + subq $160, %rsp + vmovupd %ymm0, 128(%rsp) + lea (%rsp), %rdi + vmovdqu %ymm1, 64(%rdi) + vmovdqu %ymm2, 96(%rdi) + lea 32(%rsp), %rsi + vzeroupper + call HIDDEN_JUMPTARGET(\callee) + vmovupd 144(%rsp), %xmm0 + lea 16(%rsp), %rdi + lea 48(%rsp), %rsi + call HIDDEN_JUMPTARGET(\callee) + movq 64(%rsp), %rdx + movq 96(%rsp), %rsi + movq 72(%rsp), %r8 + movq 104(%rsp), %r10 + movq (%rsp), %rax + movq 32(%rsp), %rcx + movq 8(%rsp), %rdi + movq 40(%rsp), %r9 + movq %rax, (%rdx) + movq %rcx, (%rsi) + movq 80(%rsp), %rax + movq 112(%rsp), %rcx + movq %rdi, (%r8) + movq %r9, (%r10) + movq 88(%rsp), %rdi + movq 120(%rsp), %r9 + movq 16(%rsp), %r11 + movq 48(%rsp), %rdx + movq 24(%rsp), %rsi + movq 56(%rsp), %r8 + movq %r11, (%rax) + movq %rdx, (%rcx) + movq %rsi, (%rdi) + movq %r8, (%r9) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +#else + leal 8(%rsp), %r10d + .cfi_def_cfa 10, 0 + andl $-32, %esp + pushq -8(%r10d) + pushq %rbp + .cfi_escape 0x10,0x6,0x2,0x76,0 + movl %esp, %ebp + pushq %r12 + leal -80(%rbp), %esi + pushq %r10 + .cfi_escape 0xf,0x3,0x76,0x70,0x6 + .cfi_escape 0x10,0xc,0x2,0x76,0x78 + leal -112(%rbp), %edi + movq %rsi, %r12 + pushq %rbx + .cfi_escape 0x10,0x3,0x2,0x76,0x68 + movq %rdi, %rbx + subl $152, %esp + vmovaps %xmm1, -128(%ebp) + vmovaps %xmm2, -144(%ebp) + vmovapd %ymm0, -176(%ebp) + vzeroupper + call HIDDEN_JUMPTARGET(\callee) + leal 16(%r12), %esi + vmovapd -160(%ebp), %xmm0 + leal 16(%rbx), %edi + call HIDDEN_JUMPTARGET(\callee) + movq -128(%ebp), %rax + vmovsd -112(%ebp), %xmm0 + vmovdqa -128(%ebp), %xmm5 + vmovdqa -144(%ebp), %xmm1 + vmovsd %xmm0, (%eax) + vmovsd -104(%ebp), %xmm0 + vpextrd $1, %xmm5, %eax + vmovsd %xmm0, (%eax) + movq -120(%ebp), %rax + vmovsd -96(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + vmovsd -88(%ebp), %xmm0 + vpextrd $3, %xmm5, %eax + vmovsd %xmm0, (%eax) + movq -144(%ebp), %rax + vmovsd -80(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + vmovsd -72(%ebp), %xmm0 + vpextrd $1, %xmm1, %eax + vmovsd %xmm0, (%eax) + movq -136(%ebp), %rax + vmovsd -64(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + vmovsd -56(%ebp), %xmm0 + vpextrd $3, %xmm1, %eax + vmovsd %xmm0, (%eax) + addl $152, %esp + popq %rbx + popq %r10 + .cfi_def_cfa 10, 0 + popq %r12 + popq %rbp + leal -8(%r10), %esp + .cfi_def_cfa 7, 8 + ret +#endif +.endm + ENTRY (_ZGVdN4vvv_sincos) -WRAPPER_IMPL_AVX_fFF _ZGVbN2vvv_sincos +WRAPPER_IMPL_AVX2_fFF_vvv _ZGVbN2vl8l8_sincos END (_ZGVdN4vvv_sincos) #ifndef USE_MULTIARCH diff --git a/sysdeps/x86_64/fpu/svml_d_sincos4_core_avx.S b/sysdeps/x86_64/fpu/svml_d_sincos4_core_avx.S index e4320a97c7..4a5d4f637a 100644 --- a/sysdeps/x86_64/fpu/svml_d_sincos4_core_avx.S +++ b/sysdeps/x86_64/fpu/svml_d_sincos4_core_avx.S @@ -1,5 +1,5 @@ /* Function sincos vectorized in AVX ISA as wrapper to SSE4 ISA version. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -20,6 +20,124 @@ #include "svml_d_wrapper_impl.h" .text +ENTRY (_ZGVcN4vl8l8_sincos) +WRAPPER_IMPL_AVX_fFF _ZGVbN2vl8l8_sincos +END (_ZGVcN4vl8l8_sincos) + +/* AVX ISA version as wrapper to SSE ISA version (for vector + function declared with #pragma omp declare simd notinbranch). */ +.macro WRAPPER_IMPL_AVX_fFF_vvv callee +#ifndef __ILP32__ + pushq %rbp + movq %rsp, %rbp + andq $-32, %rsp + subq $160, %rsp + vmovupd %ymm0, 64(%rsp) + lea (%rsp), %rdi + vmovdqu %xmm1, 96(%rdi) + vmovdqu %xmm2, 112(%rdi) + vmovdqu %xmm3, 128(%rdi) + vmovdqu %xmm4, 144(%rdi) + lea 32(%rsp), %rsi + vzeroupper + call HIDDEN_JUMPTARGET(\callee) + vmovdqu 80(%rsp), %xmm0 + lea 16(%rsp), %rdi + lea 48(%rsp), %rsi + call HIDDEN_JUMPTARGET(\callee) + movq 96(%rsp), %rdx + movq 104(%rsp), %rsi + movq 112(%rsp), %r8 + movq 120(%rsp), %r10 + movq (%rsp), %rax + movq 8(%rsp), %rcx + movq 16(%rsp), %rdi + movq 24(%rsp), %r9 + movq %rax, (%rdx) + movq %rcx, (%rsi) + movq 128(%rsp), %rax + movq 136(%rsp), %rcx + movq %rdi, (%r8) + movq %r9, (%r10) + movq 144(%rsp), %rdi + movq 152(%rsp), %r9 + movq 32(%rsp), %r11 + movq 40(%rsp), %rdx + movq 48(%rsp), %rsi + movq 56(%rsp), %r8 + movq %r11, (%rax) + movq %rdx, (%rcx) + movq %rsi, (%rdi) + movq %r8, (%r9) + movq %rbp, %rsp + popq %rbp + ret +#else + leal 8(%rsp), %r10d + .cfi_def_cfa 10, 0 + andl $-32, %esp + pushq -8(%r10d) + pushq %rbp + .cfi_escape 0x10,0x6,0x2,0x76,0 + movl %esp, %ebp + pushq %r12 + leal -80(%rbp), %esi + pushq %r10 + .cfi_escape 0xf,0x3,0x76,0x70,0x6 + .cfi_escape 0x10,0xc,0x2,0x76,0x78 + leal -112(%rbp), %edi + movq %rsi, %r12 + pushq %rbx + .cfi_escape 0x10,0x3,0x2,0x76,0x68 + movq %rdi, %rbx + subl $152, %esp + vmovaps %xmm1, -128(%ebp) + vmovaps %xmm2, -144(%ebp) + vmovapd %ymm0, -176(%ebp) + vzeroupper + call HIDDEN_JUMPTARGET(\callee) + leal 16(%r12), %esi + vmovupd -160(%ebp), %xmm0 + leal 16(%rbx), %edi + call HIDDEN_JUMPTARGET(\callee) + movq -128(%ebp), %rax + vmovsd -112(%ebp), %xmm0 + vmovdqa -128(%ebp), %xmm5 + vmovdqa -144(%ebp), %xmm1 + vmovsd %xmm0, (%eax) + vmovsd -104(%ebp), %xmm0 + vpextrd $1, %xmm5, %eax + vmovsd %xmm0, (%eax) + movq -120(%ebp), %rax + vmovsd -96(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + vmovsd -88(%ebp), %xmm0 + vpextrd $3, %xmm5, %eax + vmovsd %xmm0, (%eax) + movq -144(%ebp), %rax + vmovsd -80(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + vmovsd -72(%ebp), %xmm0 + vpextrd $1, %xmm1, %eax + vmovsd %xmm0, (%eax) + movq -136(%ebp), %rax + vmovsd -64(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + vmovsd -56(%ebp), %xmm0 + vpextrd $3, %xmm1, %eax + vmovsd %xmm0, (%eax) + addl $152, %esp + popq %rbx + popq %r10 + .cfi_def_cfa 10, 0 + popq %r12 + popq %rbp + leal -8(%r10), %esp + .cfi_def_cfa 7, 8 + ret +#endif +.endm + ENTRY (_ZGVcN4vvv_sincos) -WRAPPER_IMPL_AVX_fFF _ZGVbN2vvv_sincos +WRAPPER_IMPL_AVX_fFF_vvv _ZGVbN2vl8l8_sincos END (_ZGVcN4vvv_sincos) diff --git a/sysdeps/x86_64/fpu/svml_d_sincos8_core.S b/sysdeps/x86_64/fpu/svml_d_sincos8_core.S index 68d490e5bc..7cf453872b 100644 --- a/sysdeps/x86_64/fpu/svml_d_sincos8_core.S +++ b/sysdeps/x86_64/fpu/svml_d_sincos8_core.S @@ -1,5 +1,5 @@ /* Function sincos vectorized with AVX-512. Wrapper to AVX2 version. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -20,6 +20,172 @@ #include "svml_d_wrapper_impl.h" .text +ENTRY (_ZGVeN8vl8l8_sincos) +WRAPPER_IMPL_AVX512_fFF _ZGVdN4vl8l8_sincos +END (_ZGVeN8vl8l8_sincos) + +/* AVX512 ISA version as wrapper to AVX2 ISA version (for vector + function declared with #pragma omp declare simd notinbranch). */ +.macro WRAPPER_IMPL_AVX512_fFF_vvv callee +#ifndef __ILP32__ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $320, %rsp + vmovups %zmm0, 256(%rsp) + lea (%rsp), %rdi + vmovups %zmm1, 128(%rdi) + vmovups %zmm2, 192(%rdi) + lea 64(%rsp), %rsi + call HIDDEN_JUMPTARGET(\callee) + vmovdqu 288(%rsp), %ymm0 + lea 32(%rsp), %rdi + lea 96(%rsp), %rsi + call HIDDEN_JUMPTARGET(\callee) + movq 128(%rsp), %rdx + movq 192(%rsp), %rsi + movq 136(%rsp), %r8 + movq 200(%rsp), %r10 + movq (%rsp), %rax + movq 64(%rsp), %rcx + movq 8(%rsp), %rdi + movq 72(%rsp), %r9 + movq %rax, (%rdx) + movq %rcx, (%rsi) + movq 144(%rsp), %rax + movq 208(%rsp), %rcx + movq %rdi, (%r8) + movq %r9, (%r10) + movq 152(%rsp), %rdi + movq 216(%rsp), %r9 + movq 16(%rsp), %r11 + movq 80(%rsp), %rdx + movq 24(%rsp), %rsi + movq 88(%rsp), %r8 + movq %r11, (%rax) + movq %rdx, (%rcx) + movq 160(%rsp), %r11 + movq 224(%rsp), %rdx + movq %rsi, (%rdi) + movq %r8, (%r9) + movq 168(%rsp), %rsi + movq 232(%rsp), %r8 + movq 32(%rsp), %r10 + movq 96(%rsp), %rax + movq 40(%rsp), %rcx + movq 104(%rsp), %rdi + movq %r10, (%r11) + movq %rax, (%rdx) + movq 176(%rsp), %r10 + movq 240(%rsp), %rax + movq %rcx, (%rsi) + movq %rdi, (%r8) + movq 184(%rsp), %rcx + movq 248(%rsp), %rdi + movq 48(%rsp), %r9 + movq 112(%rsp), %r11 + movq 56(%rsp), %rdx + movq 120(%rsp), %rsi + movq %r9, (%r10) + movq %r11, (%rax) + movq %rdx, (%rcx) + movq %rsi, (%rdi) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +#else + leal 8(%rsp), %r10d + .cfi_def_cfa 10, 0 + andl $-64, %esp + pushq -8(%r10d) + pushq %rbp + .cfi_escape 0x10,0x6,0x2,0x76,0 + movl %esp, %ebp + pushq %r12 + leal -112(%rbp), %esi + pushq %r10 + .cfi_escape 0xf,0x3,0x76,0x70,0x6 + .cfi_escape 0x10,0xc,0x2,0x76,0x78 + leal -176(%rbp), %edi + movq %rsi, %r12 + pushq %rbx + .cfi_escape 0x10,0x3,0x2,0x76,0x68 + movq %rdi, %rbx + subl $280, %esp + vmovdqa %ymm1, -208(%ebp) + vmovdqa %ymm2, -240(%ebp) + vmovapd %zmm0, -304(%ebp) + call HIDDEN_JUMPTARGET(\callee) + leal 32(%r12), %esi + vmovupd -272(%ebp), %ymm0 + leal 32(%rbx), %edi + call HIDDEN_JUMPTARGET(\callee) + movl -208(%ebp), %eax + vmovsd -176(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -204(%ebp), %eax + vmovsd -168(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -200(%ebp), %eax + vmovsd -160(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -196(%ebp), %eax + vmovsd -152(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -192(%ebp), %eax + vmovsd -144(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -188(%ebp), %eax + vmovsd -136(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -184(%ebp), %eax + vmovsd -128(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -180(%ebp), %eax + vmovsd -120(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -240(%ebp), %eax + vmovsd -112(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -236(%ebp), %eax + vmovsd -104(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -232(%ebp), %eax + vmovsd -96(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -228(%ebp), %eax + vmovsd -88(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -224(%ebp), %eax + vmovsd -80(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -220(%ebp), %eax + vmovsd -72(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -216(%ebp), %eax + vmovsd -64(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movl -212(%ebp), %eax + vmovsd -56(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + addl $280, %esp + popq %rbx + popq %r10 + .cfi_def_cfa 10, 0 + popq %r12 + popq %rbp + leal -8(%r10), %esp + .cfi_def_cfa 7, 8 + ret +#endif +.endm + ENTRY (_ZGVeN8vvv_sincos) -WRAPPER_IMPL_AVX512_fFF _ZGVdN4vvv_sincos +WRAPPER_IMPL_AVX512_fFF_vvv _ZGVdN4vl8l8_sincos END (_ZGVeN8vvv_sincos) diff --git a/sysdeps/x86_64/fpu/svml_d_trig_data.S b/sysdeps/x86_64/fpu/svml_d_trig_data.S index 887dacee91..2b148325fc 100644 --- a/sysdeps/x86_64/fpu/svml_d_trig_data.S +++ b/sysdeps/x86_64/fpu/svml_d_trig_data.S @@ -1,5 +1,5 @@ /* Data for vectorized sin, cos, sincos. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/svml_d_trig_data.h b/sysdeps/x86_64/fpu/svml_d_trig_data.h index 4617b5e0c3..b9bb5dc6af 100644 --- a/sysdeps/x86_64/fpu/svml_d_trig_data.h +++ b/sysdeps/x86_64/fpu/svml_d_trig_data.h @@ -1,5 +1,5 @@ /* Offsets for data table for vectorized sin, cos, sincos. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h b/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h index 54f4f58371..d8452e0c2b 100644 --- a/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h +++ b/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h @@ -1,5 +1,5 @@ /* Wrapper implementations of vector math functions. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -21,10 +21,10 @@ subq $40, %rsp cfi_adjust_cfa_offset(40) movaps %xmm0, (%rsp) - call \callee@PLT + call JUMPTARGET(\callee) movsd %xmm0, 16(%rsp) movsd 8(%rsp), %xmm0 - call \callee@PLT + call JUMPTARGET(\callee) movsd 16(%rsp), %xmm1 movsd %xmm0, 24(%rsp) unpcklpd %xmm0, %xmm1 @@ -40,11 +40,11 @@ cfi_adjust_cfa_offset(56) movaps %xmm0, (%rsp) movaps %xmm1, 16(%rsp) - call \callee@PLT + call JUMPTARGET(\callee) movsd %xmm0, 32(%rsp) movsd 8(%rsp), %xmm0 movsd 24(%rsp), %xmm1 - call \callee@PLT + call JUMPTARGET(\callee) movsd 32(%rsp), %xmm1 movsd %xmm0, 40(%rsp) unpcklpd %xmm0, %xmm1 @@ -69,7 +69,7 @@ leaq 16(%rsp), %rsi leaq 24(%rsp), %rdi movaps %xmm0, (%rsp) - call \callee@PLT + call JUMPTARGET(\callee) leaq 16(%rsp), %rsi leaq 24(%rsp), %rdi movsd 24(%rsp), %xmm0 @@ -79,7 +79,7 @@ movsd 16(%rsp), %xmm0 movsd %xmm0, (%rbx) movapd %xmm1, %xmm0 - call \callee@PLT + call JUMPTARGET(\callee) movsd 24(%rsp), %xmm0 movsd %xmm0, 8(%rbp) movsd 16(%rsp), %xmm0 @@ -201,29 +201,14 @@ cfi_def_cfa_register (%rbp) andq $-64, %rsp subq $128, %rsp -/* Below is encoding for vmovups %zmm0, (%rsp). */ - .byte 0x62 - .byte 0xf1 - .byte 0x7c - .byte 0x48 - .byte 0x11 - .byte 0x04 - .byte 0x24 + vmovups %zmm0, (%rsp) vmovupd (%rsp), %ymm0 call HIDDEN_JUMPTARGET(\callee) vmovupd %ymm0, 64(%rsp) vmovupd 32(%rsp), %ymm0 call HIDDEN_JUMPTARGET(\callee) vmovupd %ymm0, 96(%rsp) -/* Below is encoding for vmovups 64(%rsp), %zmm0. */ - .byte 0x62 - .byte 0xf1 - .byte 0x7c - .byte 0x48 - .byte 0x10 - .byte 0x44 - .byte 0x24 - .byte 0x01 + vmovups 64(%rsp), %zmm0 movq %rbp, %rsp cfi_def_cfa_register (%rsp) popq %rbp @@ -241,23 +226,8 @@ cfi_def_cfa_register (%rbp) andq $-64, %rsp subq $192, %rsp -/* Below is encoding for vmovups %zmm0, (%rsp). */ - .byte 0x62 - .byte 0xf1 - .byte 0x7c - .byte 0x48 - .byte 0x11 - .byte 0x04 - .byte 0x24 -/* Below is encoding for vmovups %zmm1, 64(%rsp). */ - .byte 0x62 - .byte 0xf1 - .byte 0x7c - .byte 0x48 - .byte 0x11 - .byte 0x4c - .byte 0x24 - .byte 0x01 + vmovups %zmm0, (%rsp) + vmovups %zmm1, 64(%rsp) vmovupd (%rsp), %ymm0 vmovupd 64(%rsp), %ymm1 call HIDDEN_JUMPTARGET(\callee) @@ -266,15 +236,7 @@ vmovupd 96(%rsp), %ymm1 call HIDDEN_JUMPTARGET(\callee) vmovupd %ymm0, 160(%rsp) -/* Below is encoding for vmovups 128(%rsp), %zmm0. */ - .byte 0x62 - .byte 0xf1 - .byte 0x7c - .byte 0x48 - .byte 0x10 - .byte 0x44 - .byte 0x24 - .byte 0x02 + vmovups 128(%rsp), %zmm0 movq %rbp, %rsp cfi_def_cfa_register (%rsp) popq %rbp @@ -299,14 +261,7 @@ cfi_rel_offset (%r13, 0) subq $176, %rsp movq %rsi, %r13 -/* Below is encoding for vmovups %zmm0, (%rsp). */ - .byte 0x62 - .byte 0xf1 - .byte 0x7c - .byte 0x48 - .byte 0x11 - .byte 0x04 - .byte 0x24 + vmovups %zmm0, (%rsp) movq %rdi, %r12 vmovupd (%rsp), %ymm0 call HIDDEN_JUMPTARGET(\callee) diff --git a/sysdeps/x86_64/fpu/svml_finite_alias.S b/sysdeps/x86_64/fpu/svml_finite_alias.S index 2dcfc37590..21a9d6d2ee 100644 --- a/sysdeps/x86_64/fpu/svml_finite_alias.S +++ b/sysdeps/x86_64/fpu/svml_finite_alias.S @@ -2,7 +2,7 @@ aliases in libmvec.so while compiler creates the vector names based on scalar asm name. Corresponding discussion is at <https://gcc.gnu.org/ml/gcc/2015-06/msg00173.html>. - Copyright (C) 2015-2016 Free Software Foundation, Inc. + Copyright (C) 2015-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/svml_s_cosf16_core.S b/sysdeps/x86_64/fpu/svml_s_cosf16_core.S index 9ca4fbfaa8..d1a4647082 100644 --- a/sysdeps/x86_64/fpu/svml_s_cosf16_core.S +++ b/sysdeps/x86_64/fpu/svml_s_cosf16_core.S @@ -1,5 +1,5 @@ /* Function cosf vectorized with AVX-512. Wrapper to AVX2 version. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/svml_s_cosf4_core.S b/sysdeps/x86_64/fpu/svml_s_cosf4_core.S index 363090c54a..d58ccecc09 100644 --- a/sysdeps/x86_64/fpu/svml_s_cosf4_core.S +++ b/sysdeps/x86_64/fpu/svml_s_cosf4_core.S @@ -1,5 +1,5 @@ /* Function cosf vectorized with SSE2, wrapper version. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/svml_s_cosf8_core.S b/sysdeps/x86_64/fpu/svml_s_cosf8_core.S index 26a6a4e4d6..f9dc74fc49 100644 --- a/sysdeps/x86_64/fpu/svml_s_cosf8_core.S +++ b/sysdeps/x86_64/fpu/svml_s_cosf8_core.S @@ -1,5 +1,5 @@ /* Function cosf vectorized with AVX2, wrapper version. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/svml_s_cosf8_core_avx.S b/sysdeps/x86_64/fpu/svml_s_cosf8_core_avx.S index 6c210d98ce..45f14e23df 100644 --- a/sysdeps/x86_64/fpu/svml_s_cosf8_core_avx.S +++ b/sysdeps/x86_64/fpu/svml_s_cosf8_core_avx.S @@ -1,5 +1,5 @@ /* Function cosf vectorized in AVX ISA as wrapper to SSE4 ISA version. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/svml_s_expf16_core.S b/sysdeps/x86_64/fpu/svml_s_expf16_core.S index d8eecac674..4e18b6f544 100644 --- a/sysdeps/x86_64/fpu/svml_s_expf16_core.S +++ b/sysdeps/x86_64/fpu/svml_s_expf16_core.S @@ -1,5 +1,5 @@ /* Function expf vectorized with AVX-512. Wrapper to AVX2 version. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/svml_s_expf4_core.S b/sysdeps/x86_64/fpu/svml_s_expf4_core.S index 65b5d1a3ce..a2a6209621 100644 --- a/sysdeps/x86_64/fpu/svml_s_expf4_core.S +++ b/sysdeps/x86_64/fpu/svml_s_expf4_core.S @@ -1,5 +1,5 @@ /* Function expf vectorized with SSE2. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -22,7 +22,7 @@ .text ENTRY (_ZGVbN4v_expf) -WRAPPER_IMPL_SSE2 expf +WRAPPER_IMPL_SSE2 __expf_finite END (_ZGVbN4v_expf) #ifndef USE_MULTIARCH diff --git a/sysdeps/x86_64/fpu/svml_s_expf8_core.S b/sysdeps/x86_64/fpu/svml_s_expf8_core.S index e3cf975bf6..46297208cd 100644 --- a/sysdeps/x86_64/fpu/svml_s_expf8_core.S +++ b/sysdeps/x86_64/fpu/svml_s_expf8_core.S @@ -1,5 +1,5 @@ /* Function expf vectorized with AVX2, wrapper version. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/svml_s_expf8_core_avx.S b/sysdeps/x86_64/fpu/svml_s_expf8_core_avx.S index 90469d7dcf..1210dcf885 100644 --- a/sysdeps/x86_64/fpu/svml_s_expf8_core_avx.S +++ b/sysdeps/x86_64/fpu/svml_s_expf8_core_avx.S @@ -1,5 +1,5 @@ /* Function expf vectorized in AVX ISA as wrapper to SSE4 ISA version. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/svml_s_expf_data.S b/sysdeps/x86_64/fpu/svml_s_expf_data.S index 4b644082b6..a1cb6e7591 100644 --- a/sysdeps/x86_64/fpu/svml_s_expf_data.S +++ b/sysdeps/x86_64/fpu/svml_s_expf_data.S @@ -1,5 +1,5 @@ /* Data for function expf. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/svml_s_expf_data.h b/sysdeps/x86_64/fpu/svml_s_expf_data.h index 3610633c96..56a1d8bdf6 100644 --- a/sysdeps/x86_64/fpu/svml_s_expf_data.h +++ b/sysdeps/x86_64/fpu/svml_s_expf_data.h @@ -1,5 +1,5 @@ /* Offsets for data table for vector function expf. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/svml_s_logf16_core.S b/sysdeps/x86_64/fpu/svml_s_logf16_core.S index cc2e97df78..e1f4b0cf0c 100644 --- a/sysdeps/x86_64/fpu/svml_s_logf16_core.S +++ b/sysdeps/x86_64/fpu/svml_s_logf16_core.S @@ -1,5 +1,5 @@ /* Function logf vectorized with AVX-512. Wrapper to AVX2 version. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/svml_s_logf4_core.S b/sysdeps/x86_64/fpu/svml_s_logf4_core.S index 195f328d92..496b93ffa6 100644 --- a/sysdeps/x86_64/fpu/svml_s_logf4_core.S +++ b/sysdeps/x86_64/fpu/svml_s_logf4_core.S @@ -1,5 +1,5 @@ /* Function logf vectorized with SSE2. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -22,7 +22,7 @@ .text ENTRY (_ZGVbN4v_logf) -WRAPPER_IMPL_SSE2 logf +WRAPPER_IMPL_SSE2 __logf_finite END (_ZGVbN4v_logf) #ifndef USE_MULTIARCH diff --git a/sysdeps/x86_64/fpu/svml_s_logf8_core.S b/sysdeps/x86_64/fpu/svml_s_logf8_core.S index 8bb6926667..f0ccee7205 100644 --- a/sysdeps/x86_64/fpu/svml_s_logf8_core.S +++ b/sysdeps/x86_64/fpu/svml_s_logf8_core.S @@ -1,5 +1,5 @@ /* Function logf vectorized with AVX2, wrapper version. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/svml_s_logf8_core_avx.S b/sysdeps/x86_64/fpu/svml_s_logf8_core_avx.S index c2efba23f2..1ddd0381cd 100644 --- a/sysdeps/x86_64/fpu/svml_s_logf8_core_avx.S +++ b/sysdeps/x86_64/fpu/svml_s_logf8_core_avx.S @@ -1,5 +1,5 @@ /* Function logf vectorized in AVX ISA as wrapper to SSE4 ISA version. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/svml_s_logf_data.S b/sysdeps/x86_64/fpu/svml_s_logf_data.S index a5675f5c7a..154f98c2e0 100644 --- a/sysdeps/x86_64/fpu/svml_s_logf_data.S +++ b/sysdeps/x86_64/fpu/svml_s_logf_data.S @@ -1,5 +1,5 @@ /* Data for vector function logf. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/svml_s_logf_data.h b/sysdeps/x86_64/fpu/svml_s_logf_data.h index 619d5c4bd1..82a9903b10 100644 --- a/sysdeps/x86_64/fpu/svml_s_logf_data.h +++ b/sysdeps/x86_64/fpu/svml_s_logf_data.h @@ -1,5 +1,5 @@ /* Offsets for data table for vectorized function logf. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/svml_s_powf16_core.S b/sysdeps/x86_64/fpu/svml_s_powf16_core.S index cb52af0c6b..0859996d0a 100644 --- a/sysdeps/x86_64/fpu/svml_s_powf16_core.S +++ b/sysdeps/x86_64/fpu/svml_s_powf16_core.S @@ -1,5 +1,5 @@ /* Function powf vectorized with AVX-512. Wrapper to AVX2 version. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/svml_s_powf4_core.S b/sysdeps/x86_64/fpu/svml_s_powf4_core.S index 88fae60892..4276e6ea28 100644 --- a/sysdeps/x86_64/fpu/svml_s_powf4_core.S +++ b/sysdeps/x86_64/fpu/svml_s_powf4_core.S @@ -1,5 +1,5 @@ /* Function powf vectorized with SSE2. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -21,7 +21,7 @@ .text ENTRY (_ZGVbN4vv_powf) -WRAPPER_IMPL_SSE2_ff powf +WRAPPER_IMPL_SSE2_ff __powf_finite END (_ZGVbN4vv_powf) #ifndef USE_MULTIARCH diff --git a/sysdeps/x86_64/fpu/svml_s_powf8_core.S b/sysdeps/x86_64/fpu/svml_s_powf8_core.S index 8ea44897c1..764dc99ee7 100644 --- a/sysdeps/x86_64/fpu/svml_s_powf8_core.S +++ b/sysdeps/x86_64/fpu/svml_s_powf8_core.S @@ -1,5 +1,5 @@ /* Function powf vectorized with AVX2, wrapper version. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/svml_s_powf8_core_avx.S b/sysdeps/x86_64/fpu/svml_s_powf8_core_avx.S index b5e4e5e6ef..8bb1ef22fd 100644 --- a/sysdeps/x86_64/fpu/svml_s_powf8_core_avx.S +++ b/sysdeps/x86_64/fpu/svml_s_powf8_core_avx.S @@ -1,5 +1,5 @@ /* Function powf vectorized in AVX ISA as wrapper to SSE4 ISA version. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/svml_s_powf_data.S b/sysdeps/x86_64/fpu/svml_s_powf_data.S index fc1a3d9390..74a31abd1e 100644 --- a/sysdeps/x86_64/fpu/svml_s_powf_data.S +++ b/sysdeps/x86_64/fpu/svml_s_powf_data.S @@ -1,5 +1,5 @@ /* Data for function powf. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/svml_s_powf_data.h b/sysdeps/x86_64/fpu/svml_s_powf_data.h index 514004238a..5d3270cf27 100644 --- a/sysdeps/x86_64/fpu/svml_s_powf_data.h +++ b/sysdeps/x86_64/fpu/svml_s_powf_data.h @@ -1,5 +1,5 @@ /* Offsets for data table for function powf. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/svml_s_sincosf16_core.S b/sysdeps/x86_64/fpu/svml_s_sincosf16_core.S index 5cbf10b8da..40eb974a74 100644 --- a/sysdeps/x86_64/fpu/svml_s_sincosf16_core.S +++ b/sysdeps/x86_64/fpu/svml_s_sincosf16_core.S @@ -1,5 +1,5 @@ /* Function sincosf vectorized with AVX-512. Wrapper to AVX2 version. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -20,6 +20,270 @@ #include "svml_s_wrapper_impl.h" .text +ENTRY (_ZGVeN16vl4l4_sincosf) +WRAPPER_IMPL_AVX512_fFF _ZGVdN8vl4l4_sincosf +END (_ZGVeN16vl4l4_sincosf) + +/* AVX512 ISA version as wrapper to AVX2 ISA version (for vector + function declared with #pragma omp declare simd notinbranch). */ +.macro WRAPPER_IMPL_AVX512_fFF_vvv callee +#ifndef __ILP32__ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $448, %rsp + vmovups %zmm0, 384(%rsp) + lea (%rsp), %rdi + vmovups %zmm1, 128(%rdi) + vmovups %zmm2, 192(%rdi) + vmovups %zmm3, 256(%rdi) + vmovups %zmm4, 320(%rdi) + lea 64(%rsp), %rsi + call HIDDEN_JUMPTARGET(\callee) + vmovdqu 416(%rsp), %ymm0 + lea 32(%rsp), %rdi + lea 96(%rsp), %rsi + call HIDDEN_JUMPTARGET(\callee) + movq 128(%rsp), %rdx + movq 136(%rsp), %rsi + movq 144(%rsp), %r8 + movq 152(%rsp), %r10 + movl (%rsp), %eax + movl 4(%rsp), %ecx + movl 8(%rsp), %edi + movl 12(%rsp), %r9d + movl %eax, (%rdx) + movl %ecx, (%rsi) + movq 160(%rsp), %rax + movq 168(%rsp), %rcx + movl %edi, (%r8) + movl %r9d, (%r10) + movq 176(%rsp), %rdi + movq 184(%rsp), %r9 + movl 16(%rsp), %r11d + movl 20(%rsp), %edx + movl 24(%rsp), %esi + movl 28(%rsp), %r8d + movl %r11d, (%rax) + movl %edx, (%rcx) + movq 192(%rsp), %r11 + movq 200(%rsp), %rdx + movl %esi, (%rdi) + movl %r8d, (%r9) + movq 208(%rsp), %rsi + movq 216(%rsp), %r8 + movl 32(%rsp), %r10d + movl 36(%rsp), %eax + movl 40(%rsp), %ecx + movl 44(%rsp), %edi + movl %r10d, (%r11) + movl %eax, (%rdx) + movq 224(%rsp), %r10 + movq 232(%rsp), %rax + movl %ecx, (%rsi) + movl %edi, (%r8) + movq 240(%rsp), %rcx + movq 248(%rsp), %rdi + movl 48(%rsp), %r9d + movl 52(%rsp), %r11d + movl 56(%rsp), %edx + movl 60(%rsp), %esi + movl %r9d, (%r10) + movl %r11d, (%rax) + movq 256(%rsp), %r9 + movq 264(%rsp), %r11 + movl %edx, (%rcx) + movl %esi, (%rdi) + movq 272(%rsp), %rdx + movq 280(%rsp), %rsi + movl 64(%rsp), %r8d + movl 68(%rsp), %r10d + movl 72(%rsp), %eax + movl 76(%rsp), %ecx + movl %r8d, (%r9) + movl %r10d, (%r11) + movq 288(%rsp), %r8 + movq 296(%rsp), %r10 + movl %eax, (%rdx) + movl %ecx, (%rsi) + movq 304(%rsp), %rax + movq 312(%rsp), %rcx + movl 80(%rsp), %edi + movl 84(%rsp), %r9d + movl 88(%rsp), %r11d + movl 92(%rsp), %edx + movl %edi, (%r8) + movl %r9d, (%r10) + movq 320(%rsp), %rdi + movq 328(%rsp), %r9 + movl %r11d, (%rax) + movl %edx, (%rcx) + movq 336(%rsp), %r11 + movq 344(%rsp), %rdx + movl 96(%rsp), %esi + movl 100(%rsp), %r8d + movl 104(%rsp), %r10d + movl 108(%rsp), %eax + movl %esi, (%rdi) + movl %r8d, (%r9) + movq 352(%rsp), %rsi + movq 360(%rsp), %r8 + movl %r10d, (%r11) + movl %eax, (%rdx) + movq 368(%rsp), %r10 + movq 376(%rsp), %rax + movl 112(%rsp), %ecx + movl 116(%rsp), %edi + movl 120(%rsp), %r9d + movl 124(%rsp), %r11d + movl %ecx, (%rsi) + movl %edi, (%r8) + movl %r9d, (%r10) + movl %r11d, (%rax) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +#else + leal 8(%rsp), %r10d + .cfi_def_cfa 10, 0 + andl $-64, %esp + pushq -8(%r10d) + pushq %rbp + .cfi_escape 0x10,0x6,0x2,0x76,0 + movl %esp, %ebp + pushq %r12 + leal -112(%rbp), %esi + pushq %r10 + .cfi_escape 0xf,0x3,0x76,0x70,0x6 + .cfi_escape 0x10,0xc,0x2,0x76,0x78 + leal -176(%rbp), %edi + movq %rsi, %r12 + pushq %rbx + .cfi_escape 0x10,0x3,0x2,0x76,0x68 + movq %rdi, %rbx + subl $344, %esp + vmovdqa64 %zmm1, -240(%ebp) + vmovdqa64 %zmm2, -304(%ebp) + vmovaps %zmm0, -368(%ebp) + call HIDDEN_JUMPTARGET(\callee) + leal 32(%r12), %esi + vmovups -336(%ebp), %ymm0 + leal 32(%rbx), %edi + call HIDDEN_JUMPTARGET(\callee) + movl -240(%ebp), %eax + vmovss -176(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -236(%ebp), %eax + vmovss -172(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -232(%ebp), %eax + vmovss -168(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -228(%ebp), %eax + vmovss -164(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -224(%ebp), %eax + vmovss -160(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -220(%ebp), %eax + vmovss -156(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -216(%ebp), %eax + vmovss -152(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -212(%ebp), %eax + vmovss -148(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -208(%ebp), %eax + vmovss -144(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -204(%ebp), %eax + vmovss -140(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -200(%ebp), %eax + vmovss -136(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -196(%ebp), %eax + vmovss -132(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -192(%ebp), %eax + vmovss -128(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -188(%ebp), %eax + vmovss -124(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -184(%ebp), %eax + vmovss -120(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -180(%ebp), %eax + vmovss -116(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -304(%ebp), %eax + vmovss -112(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -300(%ebp), %eax + vmovss -108(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -296(%ebp), %eax + vmovss -104(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -292(%ebp), %eax + vmovss -100(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -288(%ebp), %eax + vmovss -96(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -284(%ebp), %eax + vmovss -92(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -280(%ebp), %eax + vmovss -88(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -276(%ebp), %eax + vmovss -84(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -272(%ebp), %eax + vmovss -80(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -268(%ebp), %eax + vmovss -76(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -264(%ebp), %eax + vmovss -72(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -260(%ebp), %eax + vmovss -68(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -256(%ebp), %eax + vmovss -64(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -252(%ebp), %eax + vmovss -60(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -248(%ebp), %eax + vmovss -56(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -244(%ebp), %eax + vmovss -52(%ebp), %xmm0 + vmovss %xmm0, (%eax) + addl $344, %esp + popq %rbx + popq %r10 + .cfi_def_cfa 10, 0 + popq %r12 + popq %rbp + leal -8(%r10), %esp + .cfi_def_cfa 7, 8 + ret +#endif +.endm + ENTRY (_ZGVeN16vvv_sincosf) -WRAPPER_IMPL_AVX512_fFF _ZGVdN8vvv_sincosf +WRAPPER_IMPL_AVX512_fFF_vvv _ZGVdN8vl4l4_sincosf END (_ZGVeN16vvv_sincosf) diff --git a/sysdeps/x86_64/fpu/svml_s_sincosf4_core.S b/sysdeps/x86_64/fpu/svml_s_sincosf4_core.S index 1a7d2733af..5daa5118d6 100644 --- a/sysdeps/x86_64/fpu/svml_s_sincosf4_core.S +++ b/sysdeps/x86_64/fpu/svml_s_sincosf4_core.S @@ -1,5 +1,5 @@ /* Function sincosf vectorized with SSE2. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -16,13 +16,135 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ - #include <sysdep.h> #include "svml_s_wrapper_impl.h" .text -ENTRY (_ZGVbN4vvv_sincosf) +ENTRY (_ZGVbN4vl4l4_sincosf) WRAPPER_IMPL_SSE2_fFF sincosf +END (_ZGVbN4vl4l4_sincosf) +libmvec_hidden_def (_ZGVbN4vl4l4_sincosf) + +/* SSE2 ISA version as wrapper to scalar (for vector + function declared with #pragma omp declare simd notinbranch). */ +.macro WRAPPER_IMPL_SSE2_fFF_vvv callee +#ifndef __ILP32__ + subq $120, %rsp + cfi_adjust_cfa_offset(120) + movaps %xmm0, 96(%rsp) + lea (%rsp), %rdi + movdqa %xmm1, 32(%rdi) + lea 16(%rsp), %rsi + movdqa %xmm2, 32(%rsi) + movdqa %xmm3, 48(%rsi) + movdqa %xmm4, 64(%rsi) + call JUMPTARGET(\callee) + movss 100(%rsp), %xmm0 + lea 4(%rsp), %rdi + lea 20(%rsp), %rsi + call JUMPTARGET(\callee) + movss 104(%rsp), %xmm0 + lea 8(%rsp), %rdi + lea 24(%rsp), %rsi + call JUMPTARGET(\callee) + movss 108(%rsp), %xmm0 + lea 12(%rsp), %rdi + lea 28(%rsp), %rsi + call JUMPTARGET(\callee) + movq 32(%rsp), %rdx + movq 40(%rsp), %rsi + movq 48(%rsp), %r8 + movq 56(%rsp), %r10 + movl (%rsp), %eax + movl 4(%rsp), %ecx + movl 8(%rsp), %edi + movl 12(%rsp), %r9d + movl %eax, (%rdx) + movl %ecx, (%rsi) + movq 64(%rsp), %rax + movq 72(%rsp), %rcx + movl %edi, (%r8) + movl %r9d, (%r10) + movq 80(%rsp), %rdi + movq 88(%rsp), %r9 + movl 16(%rsp), %r11d + movl 20(%rsp), %edx + movl 24(%rsp), %esi + movl 28(%rsp), %r8d + movl %r11d, (%rax) + movl %edx, (%rcx) + movl %esi, (%rdi) + movl %r8d, (%r9) + addq $120, %rsp + cfi_adjust_cfa_offset(-120) + ret +#else + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + pushq %rbx + .cfi_def_cfa_offset 24 + .cfi_offset 3, -24 + subl $88, %esp + .cfi_def_cfa_offset 112 + leal 64(%rsp), %esi + movaps %xmm1, (%esp) + leal 48(%rsp), %edi + movaps %xmm2, 16(%esp) + movq %rsi, %rbp + movq %rdi, %rbx + movaps %xmm0, 32(%esp) + call JUMPTARGET(\callee) + movups 36(%esp), %xmm0 + leal 4(%rbp), %esi + leal 4(%rbx), %edi + call JUMPTARGET(\callee) + movups 40(%esp), %xmm0 + leal 8(%rbp), %esi + leal 8(%rbx), %edi + call JUMPTARGET(\callee) + movups 44(%esp), %xmm0 + leal 12(%rbp), %esi + leal 12(%rbx), %edi + call JUMPTARGET(\callee) + movq (%esp), %rax + movss 48(%esp), %xmm0 + movdqa (%esp), %xmm4 + movdqa 16(%esp), %xmm7 + movss %xmm0, (%eax) + movss 52(%esp), %xmm0 + pextrd $1, %xmm4, %eax + movss %xmm0, (%eax) + movq 8(%esp), %rax + movss 56(%esp), %xmm0 + movss %xmm0, (%eax) + movss 60(%esp), %xmm0 + pextrd $3, %xmm4, %eax + movss %xmm0, (%eax) + movq 16(%esp), %rax + movss 64(%esp), %xmm0 + movss %xmm0, (%eax) + movss 68(%esp), %xmm0 + pextrd $1, %xmm7, %eax + movss %xmm0, (%eax) + movq 24(%esp), %rax + movss 72(%esp), %xmm0 + movss %xmm0, (%eax) + movss 76(%esp), %xmm0 + pextrd $3, %xmm7, %eax + movss %xmm0, (%eax) + addl $88, %esp + .cfi_def_cfa_offset 24 + popq %rbx + .cfi_def_cfa_offset 16 + popq %rbp + .cfi_def_cfa_offset 8 + ret +#endif +.endm + +ENTRY (_ZGVbN4vvv_sincosf) +WRAPPER_IMPL_SSE2_fFF_vvv sincosf END (_ZGVbN4vvv_sincosf) #ifndef USE_MULTIARCH diff --git a/sysdeps/x86_64/fpu/svml_s_sincosf8_core.S b/sysdeps/x86_64/fpu/svml_s_sincosf8_core.S index 74d1dfd1a8..d6d4600d10 100644 --- a/sysdeps/x86_64/fpu/svml_s_sincosf8_core.S +++ b/sysdeps/x86_64/fpu/svml_s_sincosf8_core.S @@ -1,5 +1,5 @@ /* Function sincosf vectorized with AVX2, wrapper version. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -20,8 +20,179 @@ #include "svml_s_wrapper_impl.h" .text +ENTRY (_ZGVdN8vl4l4_sincosf) +WRAPPER_IMPL_AVX_fFF _ZGVbN4vl4l4_sincosf +END (_ZGVdN8vl4l4_sincosf) +libmvec_hidden_def (_ZGVdN8vl4l4_sincosf) + +/* AVX2 ISA version as wrapper to SSE ISA version (for vector + function declared with #pragma omp declare simd notinbranch). */ +.macro WRAPPER_IMPL_AVX2_fFF_vvv callee +#ifndef __ILP32__ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-32, %rsp + subq $224, %rsp + vmovups %ymm0, 192(%rsp) + lea (%rsp), %rdi + vmovdqu %ymm1, 64(%rdi) + vmovdqu %ymm2, 96(%rdi) + vmovdqu %ymm3, 128(%rdi) + vmovdqu %ymm4, 160(%rdi) + lea 32(%rsp), %rsi + vzeroupper + call HIDDEN_JUMPTARGET(\callee) + vmovups 208(%rsp), %xmm0 + lea 16(%rsp), %rdi + lea 48(%rsp), %rsi + call HIDDEN_JUMPTARGET(\callee) + movq 64(%rsp), %rdx + movq 72(%rsp), %rsi + movq 80(%rsp), %r8 + movq 88(%rsp), %r10 + movl (%rsp), %eax + movl 4(%rsp), %ecx + movl 8(%rsp), %edi + movl 12(%rsp), %r9d + movl %eax, (%rdx) + movl %ecx, (%rsi) + movq 96(%rsp), %rax + movq 104(%rsp), %rcx + movl %edi, (%r8) + movl %r9d, (%r10) + movq 112(%rsp), %rdi + movq 120(%rsp), %r9 + movl 16(%rsp), %r11d + movl 20(%rsp), %edx + movl 24(%rsp), %esi + movl 28(%rsp), %r8d + movl %r11d, (%rax) + movl %edx, (%rcx) + movq 128(%rsp), %r11 + movq 136(%rsp), %rdx + movl %esi, (%rdi) + movl %r8d, (%r9) + movq 144(%rsp), %rsi + movq 152(%rsp), %r8 + movl 32(%rsp), %r10d + movl 36(%rsp), %eax + movl 40(%rsp), %ecx + movl 44(%rsp), %edi + movl %r10d, (%r11) + movl %eax, (%rdx) + movq 160(%rsp), %r10 + movq 168(%rsp), %rax + movl %ecx, (%rsi) + movl %edi, (%r8) + movq 176(%rsp), %rcx + movq 184(%rsp), %rdi + movl 48(%rsp), %r9d + movl 52(%rsp), %r11d + movl 56(%rsp), %edx + movl 60(%rsp), %esi + movl %r9d, (%r10) + movl %r11d, (%rax) + movl %edx, (%rcx) + movl %esi, (%rdi) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +#else + leal 8(%rsp), %r10d + .cfi_def_cfa 10, 0 + andl $-32, %esp + pushq -8(%r10d) + pushq %rbp + .cfi_escape 0x10,0x6,0x2,0x76,0 + movl %esp, %ebp + pushq %r12 + leal -80(%rbp), %esi + pushq %r10 + .cfi_escape 0xf,0x3,0x76,0x70,0x6 + .cfi_escape 0x10,0xc,0x2,0x76,0x78 + leal -112(%rbp), %edi + movq %rsi, %r12 + pushq %rbx + .cfi_escape 0x10,0x3,0x2,0x76,0x68 + movq %rdi, %rbx + subl $184, %esp + vmovdqa %ymm1, -144(%ebp) + vmovdqa %ymm2, -176(%ebp) + vmovaps %ymm0, -208(%ebp) + vzeroupper + call HIDDEN_JUMPTARGET(\callee) + leal 16(%r12), %esi + vmovups -192(%ebp), %xmm0 + leal 16(%rbx), %edi + call HIDDEN_JUMPTARGET(\callee) + movl -144(%ebp), %eax + vmovss -112(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -140(%ebp), %eax + vmovss -108(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -136(%ebp), %eax + vmovss -104(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -132(%ebp), %eax + vmovss -100(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -128(%ebp), %eax + vmovss -96(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -124(%ebp), %eax + vmovss -92(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -120(%ebp), %eax + vmovss -88(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -116(%ebp), %eax + vmovss -84(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -176(%ebp), %eax + vmovss -80(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -172(%ebp), %eax + vmovss -76(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -168(%ebp), %eax + vmovss -72(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -164(%ebp), %eax + vmovss -68(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -160(%ebp), %eax + vmovss -64(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -156(%ebp), %eax + vmovss -60(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -152(%ebp), %eax + vmovss -56(%ebp), %xmm0 + vmovss %xmm0, (%eax) + movl -148(%ebp), %eax + vmovss -52(%ebp), %xmm0 + vmovss %xmm0, (%eax) + addl $184, %esp + popq %rbx + popq %r10 + .cfi_def_cfa 10, 0 + popq %r12 + popq %rbp + leal -8(%r10), %esp + .cfi_def_cfa 7, 8 + ret +#endif +.endm + ENTRY (_ZGVdN8vvv_sincosf) -WRAPPER_IMPL_AVX_fFF _ZGVbN4vvv_sincosf +WRAPPER_IMPL_AVX2_fFF_vvv _ZGVbN4vl4l4_sincosf END (_ZGVdN8vvv_sincosf) #ifndef USE_MULTIARCH diff --git a/sysdeps/x86_64/fpu/svml_s_sincosf8_core_avx.S b/sysdeps/x86_64/fpu/svml_s_sincosf8_core_avx.S index 55b8b2d768..585e6d87c4 100644 --- a/sysdeps/x86_64/fpu/svml_s_sincosf8_core_avx.S +++ b/sysdeps/x86_64/fpu/svml_s_sincosf8_core_avx.S @@ -1,5 +1,5 @@ /* Function sincosf vectorized in AVX ISA as wrapper to SSE4 ISA version. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -20,6 +20,179 @@ #include "svml_s_wrapper_impl.h" .text -ENTRY(_ZGVcN8vvv_sincosf) -WRAPPER_IMPL_AVX_fFF _ZGVbN4vvv_sincosf -END(_ZGVcN8vvv_sincosf) +ENTRY (_ZGVcN8vl4l4_sincosf) +WRAPPER_IMPL_AVX_fFF _ZGVbN4vl4l4_sincosf +END (_ZGVcN8vl4l4_sincosf) + +/* AVX ISA version as wrapper to SSE ISA version (for vector + function declared with #pragma omp declare simd notinbranch). */ +.macro WRAPPER_IMPL_AVX_fFF_vvv callee +#ifndef __ILP32__ + pushq %rbp + movq %rsp, %rbp + andq $-32, %rsp + subq $224, %rsp + vmovups %ymm0, 64(%rsp) + lea (%rsp), %rdi + vmovdqu %xmm1, 96(%rdi) + vmovdqu %xmm2, 112(%rdi) + vmovdqu %xmm3, 128(%rdi) + vmovdqu %xmm4, 144(%rdi) + vmovdqu %xmm5, 160(%rdi) + lea 32(%rsp), %rsi + vmovdqu %xmm6, 144(%rsi) + vmovdqu %xmm7, 160(%rsi) + vzeroupper + call HIDDEN_JUMPTARGET(\callee) + vmovdqu 80(%rsp), %xmm0 + lea 16(%rsp), %rdi + lea 48(%rsp), %rsi + call HIDDEN_JUMPTARGET(\callee) + movq 96(%rsp), %rdx + movq 104(%rsp), %rsi + movq 112(%rsp), %r8 + movq 120(%rsp), %r10 + movl (%rsp), %eax + movl 4(%rsp), %ecx + movl 8(%rsp), %edi + movl 12(%rsp), %r9d + movl %eax, (%rdx) + movl %ecx, (%rsi) + movq 128(%rsp), %rax + movq 136(%rsp), %rcx + movl %edi, (%r8) + movl %r9d, (%r10) + movq 144(%rsp), %rdi + movq 152(%rsp), %r9 + movl 16(%rsp), %r11d + movl 20(%rsp), %edx + movl 24(%rsp), %esi + movl 28(%rsp), %r8d + movl %r11d, (%rax) + movl %edx, (%rcx) + movq 160(%rsp), %r11 + movq 168(%rsp), %rdx + movl %esi, (%rdi) + movl %r8d, (%r9) + movq 176(%rsp), %rsi + movq 184(%rsp), %r8 + movl 32(%rsp), %r10d + movl 36(%rsp), %eax + movl 40(%rsp), %ecx + movl 44(%rsp), %edi + movl %r10d, (%r11) + movl %eax, (%rdx) + movq 192(%rsp), %r10 + movq 200(%rsp), %rax + movl %ecx, (%rsi) + movl %edi, (%r8) + movq 16(%rbp), %rcx + movq 24(%rbp), %rdi + movl 48(%rsp), %r9d + movl 52(%rsp), %r11d + movl 56(%rsp), %edx + movl 60(%rsp), %esi + movl %r9d, (%r10) + movl %r11d, (%rax) + movl %edx, (%rcx) + movl %esi, (%rdi) + movq %rbp, %rsp + popq %rbp + ret +#else + leal 8(%rsp), %r10d + .cfi_def_cfa 10, 0 + andl $-32, %esp + pushq -8(%r10d) + pushq %rbp + .cfi_escape 0x10,0x6,0x2,0x76,0 + movl %esp, %ebp + pushq %r12 + leal -80(%rbp), %esi + pushq %r10 + .cfi_escape 0xf,0x3,0x76,0x70,0x6 + .cfi_escape 0x10,0xc,0x2,0x76,0x78 + leal -112(%rbp), %edi + movq %rsi, %r12 + pushq %rbx + .cfi_escape 0x10,0x3,0x2,0x76,0x68 + movq %rdi, %rbx + subl $184, %esp + vmovaps %xmm1, -128(%ebp) + vmovaps %xmm2, -144(%ebp) + vmovaps %xmm3, -160(%ebp) + vmovaps %xmm4, -176(%ebp) + vmovaps %ymm0, -208(%ebp) + vzeroupper + call HIDDEN_JUMPTARGET(\callee) + leal 16(%r12), %esi + vmovups -192(%ebp), %xmm0 + leal 16(%rbx), %edi + call HIDDEN_JUMPTARGET(\callee) + movq -128(%ebp), %rax + vmovss -112(%ebp), %xmm0 + vmovdqa -128(%ebp), %xmm7 + vmovdqa -144(%ebp), %xmm3 + vmovss %xmm0, (%eax) + vmovss -108(%ebp), %xmm0 + vpextrd $1, %xmm7, %eax + vmovss %xmm0, (%eax) + movq -120(%ebp), %rax + vmovss -104(%ebp), %xmm0 + vmovss %xmm0, (%eax) + vmovss -100(%ebp), %xmm0 + vpextrd $3, %xmm7, %eax + vmovdqa -160(%ebp), %xmm7 + vmovss %xmm0, (%eax) + movq -144(%ebp), %rax + vmovss -96(%ebp), %xmm0 + vmovss %xmm0, (%eax) + vmovss -92(%ebp), %xmm0 + vpextrd $1, %xmm3, %eax + vmovss %xmm0, (%eax) + movq -136(%ebp), %rax + vmovss -88(%ebp), %xmm0 + vmovss %xmm0, (%eax) + vmovss -84(%ebp), %xmm0 + vpextrd $3, %xmm3, %eax + vmovss %xmm0, (%eax) + movq -160(%ebp), %rax + vmovss -80(%ebp), %xmm0 + vmovss %xmm0, (%eax) + vmovss -76(%ebp), %xmm0 + vpextrd $1, %xmm7, %eax + vmovss %xmm0, (%eax) + movq -152(%ebp), %rax + vmovss -72(%ebp), %xmm0 + vmovss %xmm0, (%eax) + vmovss -68(%ebp), %xmm0 + vpextrd $3, %xmm7, %eax + vmovss %xmm0, (%eax) + movq -176(%ebp), %rax + vmovss -64(%ebp), %xmm0 + vmovdqa -176(%ebp), %xmm3 + vmovss %xmm0, (%eax) + vmovss -60(%ebp), %xmm0 + vpextrd $1, %xmm3, %eax + vmovss %xmm0, (%eax) + movq -168(%ebp), %rax + vmovss -56(%ebp), %xmm0 + vmovss %xmm0, (%eax) + vmovss -52(%ebp), %xmm0 + vpextrd $3, %xmm3, %eax + vmovss %xmm0, (%eax) + addl $184, %esp + popq %rbx + popq %r10 + .cfi_def_cfa 10, 0 + popq %r12 + popq %rbp + leal -8(%r10), %esp + .cfi_def_cfa 7, 8 + ret +#endif +.endm + +ENTRY (_ZGVcN8vvv_sincosf) +WRAPPER_IMPL_AVX_fFF_vvv _ZGVbN4vl4l4_sincosf +END (_ZGVcN8vvv_sincosf) diff --git a/sysdeps/x86_64/fpu/svml_s_sinf16_core.S b/sysdeps/x86_64/fpu/svml_s_sinf16_core.S index d7a31e1ea6..8c5547e26f 100644 --- a/sysdeps/x86_64/fpu/svml_s_sinf16_core.S +++ b/sysdeps/x86_64/fpu/svml_s_sinf16_core.S @@ -1,5 +1,5 @@ /* Function sinf vectorized with AVX-512. Wrapper to AVX2 version. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/svml_s_sinf4_core.S b/sysdeps/x86_64/fpu/svml_s_sinf4_core.S index 6f10137134..d56137b32a 100644 --- a/sysdeps/x86_64/fpu/svml_s_sinf4_core.S +++ b/sysdeps/x86_64/fpu/svml_s_sinf4_core.S @@ -1,5 +1,5 @@ /* Function sinf vectorized with SSE2. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/svml_s_sinf8_core.S b/sysdeps/x86_64/fpu/svml_s_sinf8_core.S index c459658688..e39392243e 100644 --- a/sysdeps/x86_64/fpu/svml_s_sinf8_core.S +++ b/sysdeps/x86_64/fpu/svml_s_sinf8_core.S @@ -1,5 +1,5 @@ /* Function sinf vectorized with AVX2, wrapper version. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/svml_s_sinf8_core_avx.S b/sysdeps/x86_64/fpu/svml_s_sinf8_core_avx.S index 5e95aa2e02..9984e6f9f7 100644 --- a/sysdeps/x86_64/fpu/svml_s_sinf8_core_avx.S +++ b/sysdeps/x86_64/fpu/svml_s_sinf8_core_avx.S @@ -1,5 +1,5 @@ /* Function sinf vectorized in AVX ISA as wrapper to SSE4 ISA version. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/svml_s_trig_data.S b/sysdeps/x86_64/fpu/svml_s_trig_data.S index b61aa6abb9..8f1e1f60b8 100644 --- a/sysdeps/x86_64/fpu/svml_s_trig_data.S +++ b/sysdeps/x86_64/fpu/svml_s_trig_data.S @@ -1,5 +1,5 @@ /* Data for function cosf. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/svml_s_trig_data.h b/sysdeps/x86_64/fpu/svml_s_trig_data.h index 2e469a918a..0faf161c08 100644 --- a/sysdeps/x86_64/fpu/svml_s_trig_data.h +++ b/sysdeps/x86_64/fpu/svml_s_trig_data.h @@ -1,5 +1,5 @@ /* Offsets for data table for vectorized sinf, cosf, sincosf. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h b/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h index b1a03be3d9..937afb5cbc 100644 --- a/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h +++ b/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h @@ -1,5 +1,5 @@ /* Wrapper implementations of vector math functions. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -21,16 +21,16 @@ subq $40, %rsp cfi_adjust_cfa_offset(40) movaps %xmm0, (%rsp) - call \callee@PLT + call JUMPTARGET(\callee) movss %xmm0, 16(%rsp) movss 4(%rsp), %xmm0 - call \callee@PLT + call JUMPTARGET(\callee) movss %xmm0, 20(%rsp) movss 8(%rsp), %xmm0 - call \callee@PLT + call JUMPTARGET(\callee) movss %xmm0, 24(%rsp) movss 12(%rsp), %xmm0 - call \callee@PLT + call JUMPTARGET(\callee) movss 16(%rsp), %xmm3 movss 20(%rsp), %xmm2 movss 24(%rsp), %xmm1 @@ -50,19 +50,19 @@ cfi_adjust_cfa_offset(56) movaps %xmm0, (%rsp) movaps %xmm1, 16(%rsp) - call \callee@PLT + call JUMPTARGET(\callee) movss %xmm0, 32(%rsp) movss 4(%rsp), %xmm0 movss 20(%rsp), %xmm1 - call \callee@PLT + call JUMPTARGET(\callee) movss %xmm0, 36(%rsp) movss 8(%rsp), %xmm0 movss 24(%rsp), %xmm1 - call \callee@PLT + call JUMPTARGET(\callee) movss %xmm0, 40(%rsp) movss 12(%rsp), %xmm0 movss 28(%rsp), %xmm1 - call \callee@PLT + call JUMPTARGET(\callee) movss 32(%rsp), %xmm3 movss 36(%rsp), %xmm2 movss 40(%rsp), %xmm1 @@ -91,7 +91,7 @@ leaq 24(%rsp), %rsi leaq 28(%rsp), %rdi movaps %xmm0, (%rsp) - call \callee@PLT + call JUMPTARGET(\callee) leaq 24(%rsp), %rsi leaq 28(%rsp), %rdi movss 28(%rsp), %xmm0 @@ -101,7 +101,7 @@ movss %xmm0, (%rbx) movaps %xmm1, %xmm0 shufps $85, %xmm1, %xmm0 - call \callee@PLT + call JUMPTARGET(\callee) movss 28(%rsp), %xmm0 leaq 24(%rsp), %rsi movss %xmm0, 4(%rbp) @@ -111,7 +111,7 @@ movss %xmm0, 4(%rbx) movaps %xmm1, %xmm0 unpckhps %xmm1, %xmm0 - call \callee@PLT + call JUMPTARGET(\callee) movaps (%rsp), %xmm1 leaq 24(%rsp), %rsi leaq 28(%rsp), %rdi @@ -121,7 +121,7 @@ movss 24(%rsp), %xmm0 movss %xmm0, 8(%rbx) movaps %xmm1, %xmm0 - call \callee@PLT + call JUMPTARGET(\callee) movss 28(%rsp), %xmm0 movss %xmm0, 12(%rbp) movss 24(%rsp), %xmm0 @@ -246,29 +246,14 @@ cfi_def_cfa_register (%rbp) andq $-64, %rsp subq $128, %rsp -/* Below is encoding for vmovups %zmm0, (%rsp). */ - .byte 0x62 - .byte 0xf1 - .byte 0x7c - .byte 0x48 - .byte 0x11 - .byte 0x04 - .byte 0x24 + vmovups %zmm0, (%rsp) vmovupd (%rsp), %ymm0 call HIDDEN_JUMPTARGET(\callee) vmovupd %ymm0, 64(%rsp) vmovupd 32(%rsp), %ymm0 call HIDDEN_JUMPTARGET(\callee) vmovupd %ymm0, 96(%rsp) -/* Below is encoding for vmovups 64(%rsp), %zmm0. */ - .byte 0x62 - .byte 0xf1 - .byte 0x7c - .byte 0x48 - .byte 0x10 - .byte 0x44 - .byte 0x24 - .byte 0x01 + vmovups 64(%rsp), %zmm0 movq %rbp, %rsp cfi_def_cfa_register (%rsp) popq %rbp @@ -286,23 +271,8 @@ cfi_def_cfa_register (%rbp) andq $-64, %rsp subq $192, %rsp -/* Below is encoding for vmovups %zmm0, (%rsp). */ - .byte 0x62 - .byte 0xf1 - .byte 0x7c - .byte 0x48 - .byte 0x11 - .byte 0x04 - .byte 0x24 -/* Below is encoding for vmovups %zmm1, 64(%rsp). */ - .byte 0x62 - .byte 0xf1 - .byte 0x7c - .byte 0x48 - .byte 0x11 - .byte 0x4c - .byte 0x24 - .byte 0x01 + vmovups %zmm0, (%rsp) + vmovups %zmm1, 64(%rsp) vmovups (%rsp), %ymm0 vmovups 64(%rsp), %ymm1 call HIDDEN_JUMPTARGET(\callee) @@ -311,15 +281,7 @@ vmovups 96(%rsp), %ymm1 call HIDDEN_JUMPTARGET(\callee) vmovups %ymm0, 160(%rsp) -/* Below is encoding for vmovups 128(%rsp), %zmm0. */ - .byte 0x62 - .byte 0xf1 - .byte 0x7c - .byte 0x48 - .byte 0x10 - .byte 0x44 - .byte 0x24 - .byte 0x02 + vmovups 128(%rsp), %zmm0 movq %rbp, %rsp cfi_def_cfa_register (%rsp) popq %rbp @@ -340,14 +302,7 @@ pushq %r13 subq $176, %rsp movq %rsi, %r13 -/* Below is encoding for vmovaps %zmm0, (%rsp). */ - .byte 0x62 - .byte 0xf1 - .byte 0x7c - .byte 0x48 - .byte 0x29 - .byte 0x04 - .byte 0x24 + vmovaps %zmm0, (%rsp) movq %rdi, %r12 vmovaps (%rsp), %ymm0 call HIDDEN_JUMPTARGET(\callee) diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx-main.c b/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx-main.c new file mode 100644 index 0000000000..43914ef0e7 --- /dev/null +++ b/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx-main.c @@ -0,0 +1 @@ +#include "test-double-libmvec-alias.c" diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx-mod.c b/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx-mod.c new file mode 100644 index 0000000000..514883dcf9 --- /dev/null +++ b/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx-mod.c @@ -0,0 +1 @@ +#include "test-double-libmvec-alias-mod.c" diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx.c b/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx.c new file mode 100644 index 0000000000..43914ef0e7 --- /dev/null +++ b/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx.c @@ -0,0 +1 @@ +#include "test-double-libmvec-alias.c" diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx2-main.c b/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx2-main.c new file mode 100644 index 0000000000..43914ef0e7 --- /dev/null +++ b/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx2-main.c @@ -0,0 +1 @@ +#include "test-double-libmvec-alias.c" diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx2-mod.c b/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx2-mod.c new file mode 100644 index 0000000000..514883dcf9 --- /dev/null +++ b/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx2-mod.c @@ -0,0 +1 @@ +#include "test-double-libmvec-alias-mod.c" diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx2.c b/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx2.c new file mode 100644 index 0000000000..43914ef0e7 --- /dev/null +++ b/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx2.c @@ -0,0 +1 @@ +#include "test-double-libmvec-alias.c" diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx512-main.c b/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx512-main.c new file mode 100644 index 0000000000..43914ef0e7 --- /dev/null +++ b/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx512-main.c @@ -0,0 +1 @@ +#include "test-double-libmvec-alias.c" diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx512-mod.c b/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx512-mod.c new file mode 100644 index 0000000000..514883dcf9 --- /dev/null +++ b/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx512-mod.c @@ -0,0 +1 @@ +#include "test-double-libmvec-alias-mod.c" diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx512.c b/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx512.c new file mode 100644 index 0000000000..43914ef0e7 --- /dev/null +++ b/sysdeps/x86_64/fpu/test-double-libmvec-alias-avx512.c @@ -0,0 +1 @@ +#include "test-double-libmvec-alias.c" diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-alias-main.c b/sysdeps/x86_64/fpu/test-double-libmvec-alias-main.c new file mode 100644 index 0000000000..43914ef0e7 --- /dev/null +++ b/sysdeps/x86_64/fpu/test-double-libmvec-alias-main.c @@ -0,0 +1 @@ +#include "test-double-libmvec-alias.c" diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-alias-mod.c b/sysdeps/x86_64/fpu/test-double-libmvec-alias-mod.c new file mode 100644 index 0000000000..6f2e588021 --- /dev/null +++ b/sysdeps/x86_64/fpu/test-double-libmvec-alias-mod.c @@ -0,0 +1,25 @@ +/* Part of test to build shared library to ensure link against + *_finite aliases from libmvec. + Copyright (C) 2016-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <math.h> +#include <stdlib.h> +#include <math-tests-arch.h> + +#include "test-double.h" +#include "test-libmvec-alias-mod.c" diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-alias.c b/sysdeps/x86_64/fpu/test-double-libmvec-alias.c new file mode 100644 index 0000000000..d38b49d6c8 --- /dev/null +++ b/sysdeps/x86_64/fpu/test-double-libmvec-alias.c @@ -0,0 +1,29 @@ +/* Part of test to ensure link against *_finite aliases from libmvec. + Copyright (C) 2016-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +extern int +test_finite_alias (void); + +static int +do_test (void) +{ + return test_finite_alias (); +} + +#define TEST_FUNCTION do_test () +#include "../../../test-skeleton.c" diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx-main.c b/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx-main.c new file mode 100644 index 0000000000..fc2ffea314 --- /dev/null +++ b/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx-main.c @@ -0,0 +1 @@ +#include "test-double-libmvec-sincos-main.c" diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx.c b/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx.c new file mode 100644 index 0000000000..896f1bcbaf --- /dev/null +++ b/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx.c @@ -0,0 +1 @@ +#include "test-double-libmvec-sincos.c" diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx2-main.c b/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx2-main.c new file mode 100644 index 0000000000..fc2ffea314 --- /dev/null +++ b/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx2-main.c @@ -0,0 +1 @@ +#include "test-double-libmvec-sincos-main.c" diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx2.c b/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx2.c new file mode 100644 index 0000000000..896f1bcbaf --- /dev/null +++ b/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx2.c @@ -0,0 +1 @@ +#include "test-double-libmvec-sincos.c" diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx512-main.c b/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx512-main.c new file mode 100644 index 0000000000..fc2ffea314 --- /dev/null +++ b/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx512-main.c @@ -0,0 +1 @@ +#include "test-double-libmvec-sincos-main.c" diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx512.c b/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx512.c new file mode 100644 index 0000000000..896f1bcbaf --- /dev/null +++ b/sysdeps/x86_64/fpu/test-double-libmvec-sincos-avx512.c @@ -0,0 +1 @@ +#include "test-double-libmvec-sincos.c" diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-sincos-main.c b/sysdeps/x86_64/fpu/test-double-libmvec-sincos-main.c new file mode 100644 index 0000000000..2e52fddf5d --- /dev/null +++ b/sysdeps/x86_64/fpu/test-double-libmvec-sincos-main.c @@ -0,0 +1,43 @@ +/* Test for vector sincos ABI. + Copyright (C) 2016-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <math.h> + +#define N 1000 +double x[N], s[N], c[N]; +double* s_ptrs[N]; +double* c_ptrs[N]; + +int +test_sincos_abi (void) +{ + int i; + + for(i = 0; i < N; i++) + { + x[i] = i / 3; + s_ptrs[i] = &s[i]; + c_ptrs[i] = &c[i]; + } + +#pragma omp simd + for(i = 0; i < N; i++) + sincos (x[i], s_ptrs[i], c_ptrs[i]); + + return 0; +} diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-sincos.c b/sysdeps/x86_64/fpu/test-double-libmvec-sincos.c new file mode 100644 index 0000000000..cffaa73135 --- /dev/null +++ b/sysdeps/x86_64/fpu/test-double-libmvec-sincos.c @@ -0,0 +1,44 @@ +/* Test for vector sincos ABI. + Copyright (C) 2016-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <math-tests-arch.h> + +extern int test_sincos_abi (void); + +int arch_check = 1; + +static void +check_arch (void) +{ + CHECK_ARCH_EXT; + arch_check = 0; +} + +static int +do_test (void) +{ + check_arch (); + + if (arch_check) + return 77; + + return test_sincos_abi (); +} + +#define TEST_FUNCTION do_test () +#include "../../../test-skeleton.c" diff --git a/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c index a9d15979aa..4ff1439f9c 100644 --- a/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c +++ b/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c @@ -1,5 +1,5 @@ /* Wrapper part of tests for SSE ISA versions of vector math functions. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -17,13 +17,17 @@ <http://www.gnu.org/licenses/>. */ #include "test-double-vlen2.h" +#include "test-math-vector-sincos.h" #include <immintrin.h> #define VEC_TYPE __m128d VECTOR_WRAPPER (WRAPPER_NAME (cos), _ZGVbN2v_cos) VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVbN2v_sin) -VECTOR_WRAPPER_fFF (WRAPPER_NAME (sincos), _ZGVbN2vvv_sincos) VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVbN2v_log) VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVbN2v_exp) VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVbN2vv_pow) + +#define VEC_INT_TYPE __m128i + +VECTOR_WRAPPER_fFF_2 (WRAPPER_NAME (sincos), _ZGVbN2vvv_sincos) diff --git a/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c index eb6a531502..c7bdad517b 100644 --- a/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c +++ b/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c @@ -1,5 +1,5 @@ /* Wrapper part of tests for AVX2 ISA versions of vector math functions. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -17,6 +17,7 @@ <http://www.gnu.org/licenses/>. */ #include "test-double-vlen4.h" +#include "test-math-vector-sincos.h" #include <immintrin.h> #undef VEC_SUFF @@ -26,7 +27,14 @@ VECTOR_WRAPPER (WRAPPER_NAME (cos), _ZGVdN4v_cos) VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVdN4v_sin) -VECTOR_WRAPPER_fFF (WRAPPER_NAME (sincos), _ZGVdN4vvv_sincos) VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVdN4v_log) VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVdN4v_exp) VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVdN4vv_pow) + +#ifndef __ILP32__ +# define VEC_INT_TYPE __m256i +#else +# define VEC_INT_TYPE __m128i +#endif + +VECTOR_WRAPPER_fFF_2 (WRAPPER_NAME (sincos), _ZGVdN4vvv_sincos) diff --git a/sysdeps/x86_64/fpu/test-double-vlen4-avx2.c b/sysdeps/x86_64/fpu/test-double-vlen4-avx2.h index 0cadef03d6..4b196e66fc 100644 --- a/sysdeps/x86_64/fpu/test-double-vlen4-avx2.c +++ b/sysdeps/x86_64/fpu/test-double-vlen4-avx2.h @@ -1,5 +1,5 @@ /* Tests for AVX2 ISA versions of vector math functions. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -16,18 +16,10 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#include "test-double-vlen4.h" +#include <test-double-vlen4.h> #undef VEC_SUFF #define VEC_SUFF _vlen4_avx2 -#define TEST_VECTOR_cos 1 -#define TEST_VECTOR_sin 1 -#define TEST_VECTOR_sincos 1 -#define TEST_VECTOR_log 1 -#define TEST_VECTOR_exp 1 -#define TEST_VECTOR_pow 1 - +#undef REQUIRE_AVX #define REQUIRE_AVX2 - -#include "libm-test.c" diff --git a/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c index 52b81da3ee..2bb0085700 100644 --- a/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c +++ b/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c @@ -1,5 +1,5 @@ /* Wrapper part of tests for AVX ISA versions of vector math functions. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -17,13 +17,21 @@ <http://www.gnu.org/licenses/>. */ #include "test-double-vlen4.h" +#include "test-math-vector-sincos.h" #include <immintrin.h> #define VEC_TYPE __m256d VECTOR_WRAPPER (WRAPPER_NAME (cos), _ZGVcN4v_cos) VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVcN4v_sin) -VECTOR_WRAPPER_fFF (WRAPPER_NAME (sincos), _ZGVcN4vvv_sincos) VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVcN4v_log) VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVcN4v_exp) VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVcN4vv_pow) + +#define VEC_INT_TYPE __m128i + +#ifndef __ILP32__ +VECTOR_WRAPPER_fFF_3 (WRAPPER_NAME (sincos), _ZGVcN4vvv_sincos) +#else +VECTOR_WRAPPER_fFF_2 (WRAPPER_NAME (sincos), _ZGVcN4vvv_sincos) +#endif diff --git a/sysdeps/x86_64/fpu/test-double-vlen4.c b/sysdeps/x86_64/fpu/test-double-vlen4.h index 9ae97f1388..316340cb59 100644 --- a/sysdeps/x86_64/fpu/test-double-vlen4.c +++ b/sysdeps/x86_64/fpu/test-double-vlen4.h @@ -1,5 +1,5 @@ /* Tests for AVX ISA versions of vector math functions. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -16,15 +16,6 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#include "test-double-vlen4.h" - -#define TEST_VECTOR_cos 1 -#define TEST_VECTOR_sin 1 -#define TEST_VECTOR_sincos 1 -#define TEST_VECTOR_log 1 -#define TEST_VECTOR_exp 1 -#define TEST_VECTOR_pow 1 +#include_next <test-double-vlen4.h> #define REQUIRE_AVX - -#include "libm-test.c" diff --git a/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c index c10bb9cb4a..ea179284ed 100644 --- a/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c +++ b/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c @@ -1,5 +1,5 @@ /* Wrapper part of tests for AVX-512 versions of vector math functions. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -17,13 +17,21 @@ <http://www.gnu.org/licenses/>. */ #include "test-double-vlen8.h" +#include "test-math-vector-sincos.h" #include <immintrin.h> #define VEC_TYPE __m512d VECTOR_WRAPPER (WRAPPER_NAME (cos), _ZGVeN8v_cos) VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVeN8v_sin) -VECTOR_WRAPPER_fFF (WRAPPER_NAME (sincos), _ZGVeN8vvv_sincos) VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVeN8v_log) VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVeN8v_exp) VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVeN8vv_pow) + +#ifndef __ILP32__ +# define VEC_INT_TYPE __m512i +#else +# define VEC_INT_TYPE __m256i +#endif + +VECTOR_WRAPPER_fFF_2 (WRAPPER_NAME (sincos), _ZGVeN8vvv_sincos) diff --git a/sysdeps/x86_64/fpu/test-double-vlen8.c b/sysdeps/x86_64/fpu/test-double-vlen8.h index 4fb6c8d196..41d188081e 100644 --- a/sysdeps/x86_64/fpu/test-double-vlen8.c +++ b/sysdeps/x86_64/fpu/test-double-vlen8.h @@ -1,5 +1,5 @@ /* Tests for AVX-512 versions of vector math functions. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -16,15 +16,6 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#include "test-double-vlen8.h" - -#define TEST_VECTOR_cos 1 -#define TEST_VECTOR_sin 1 -#define TEST_VECTOR_sincos 1 -#define TEST_VECTOR_log 1 -#define TEST_VECTOR_exp 1 -#define TEST_VECTOR_pow 1 +#include_next <test-double-vlen8.h> #define REQUIRE_AVX512F - -#include "libm-test.c" diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx-main.c b/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx-main.c new file mode 100644 index 0000000000..f3691cc8e6 --- /dev/null +++ b/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx-main.c @@ -0,0 +1 @@ +#include "test-float-libmvec-alias.c" diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx-mod.c b/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx-mod.c new file mode 100644 index 0000000000..7fc3d8aedd --- /dev/null +++ b/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx-mod.c @@ -0,0 +1 @@ +#include "test-float-libmvec-alias-mod.c" diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx.c b/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx.c new file mode 100644 index 0000000000..f3691cc8e6 --- /dev/null +++ b/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx.c @@ -0,0 +1 @@ +#include "test-float-libmvec-alias.c" diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx2-main.c b/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx2-main.c new file mode 100644 index 0000000000..f3691cc8e6 --- /dev/null +++ b/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx2-main.c @@ -0,0 +1 @@ +#include "test-float-libmvec-alias.c" diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx2-mod.c b/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx2-mod.c new file mode 100644 index 0000000000..7fc3d8aedd --- /dev/null +++ b/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx2-mod.c @@ -0,0 +1 @@ +#include "test-float-libmvec-alias-mod.c" diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx2.c b/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx2.c new file mode 100644 index 0000000000..f3691cc8e6 --- /dev/null +++ b/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx2.c @@ -0,0 +1 @@ +#include "test-float-libmvec-alias.c" diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx512-main.c b/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx512-main.c new file mode 100644 index 0000000000..f3691cc8e6 --- /dev/null +++ b/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx512-main.c @@ -0,0 +1 @@ +#include "test-float-libmvec-alias.c" diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx512-mod.c b/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx512-mod.c new file mode 100644 index 0000000000..7fc3d8aedd --- /dev/null +++ b/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx512-mod.c @@ -0,0 +1 @@ +#include "test-float-libmvec-alias-mod.c" diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx512.c b/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx512.c new file mode 100644 index 0000000000..f3691cc8e6 --- /dev/null +++ b/sysdeps/x86_64/fpu/test-float-libmvec-alias-avx512.c @@ -0,0 +1 @@ +#include "test-float-libmvec-alias.c" diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-alias-main.c b/sysdeps/x86_64/fpu/test-float-libmvec-alias-main.c new file mode 100644 index 0000000000..f3691cc8e6 --- /dev/null +++ b/sysdeps/x86_64/fpu/test-float-libmvec-alias-main.c @@ -0,0 +1 @@ +#include "test-float-libmvec-alias.c" diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-alias-mod.c b/sysdeps/x86_64/fpu/test-float-libmvec-alias-mod.c new file mode 100644 index 0000000000..5e6a587a94 --- /dev/null +++ b/sysdeps/x86_64/fpu/test-float-libmvec-alias-mod.c @@ -0,0 +1,25 @@ +/* Part of test to build shared library to ensure link against + *_finite aliases from libmvec. + Copyright (C) 2016-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <math.h> +#include <stdlib.h> +#include <math-tests-arch.h> + +#include "test-float.h" +#include "test-libmvec-alias-mod.c" diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-alias.c b/sysdeps/x86_64/fpu/test-float-libmvec-alias.c new file mode 100644 index 0000000000..d38b49d6c8 --- /dev/null +++ b/sysdeps/x86_64/fpu/test-float-libmvec-alias.c @@ -0,0 +1,29 @@ +/* Part of test to ensure link against *_finite aliases from libmvec. + Copyright (C) 2016-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +extern int +test_finite_alias (void); + +static int +do_test (void) +{ + return test_finite_alias (); +} + +#define TEST_FUNCTION do_test () +#include "../../../test-skeleton.c" diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx-main.c b/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx-main.c new file mode 100644 index 0000000000..558e2ac649 --- /dev/null +++ b/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx-main.c @@ -0,0 +1 @@ +#include "test-float-libmvec-sincosf-main.c" diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx.c b/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx.c new file mode 100644 index 0000000000..5b45f0a055 --- /dev/null +++ b/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx.c @@ -0,0 +1 @@ +#include "test-float-libmvec-sincosf.c" diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx2-main.c b/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx2-main.c new file mode 100644 index 0000000000..558e2ac649 --- /dev/null +++ b/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx2-main.c @@ -0,0 +1 @@ +#include "test-float-libmvec-sincosf-main.c" diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx2.c b/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx2.c new file mode 100644 index 0000000000..5b45f0a055 --- /dev/null +++ b/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx2.c @@ -0,0 +1 @@ +#include "test-float-libmvec-sincosf.c" diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx512-main.c b/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx512-main.c new file mode 100644 index 0000000000..558e2ac649 --- /dev/null +++ b/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx512-main.c @@ -0,0 +1 @@ +#include "test-float-libmvec-sincosf-main.c" diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx512.c b/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx512.c new file mode 100644 index 0000000000..5b45f0a055 --- /dev/null +++ b/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-avx512.c @@ -0,0 +1 @@ +#include "test-float-libmvec-sincosf.c" diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-main.c b/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-main.c new file mode 100644 index 0000000000..ce1dd1a8a4 --- /dev/null +++ b/sysdeps/x86_64/fpu/test-float-libmvec-sincosf-main.c @@ -0,0 +1,42 @@ +/* Test for vector sincosf ABI. + Copyright (C) 2016-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <math.h> + +#define N 1000 +float x[N], s[N], c[N]; +float *s_ptrs[N]; +float *c_ptrs[N]; + +int +test_sincosf_abi (void) +{ + int i; + for(i = 0; i < N; i++) + { + x[i] = i / 3; + s_ptrs[i] = &s[i]; + c_ptrs[i] = &c[i]; + } + +#pragma omp simd + for(i = 0; i < N; i++) + sincosf (x[i], s_ptrs[i], c_ptrs[i]); + + return 0; +} diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-sincosf.c b/sysdeps/x86_64/fpu/test-float-libmvec-sincosf.c new file mode 100644 index 0000000000..a56d9680a0 --- /dev/null +++ b/sysdeps/x86_64/fpu/test-float-libmvec-sincosf.c @@ -0,0 +1,44 @@ +/* Test for vector sincosf ABI. + Copyright (C) 2016-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <math-tests-arch.h> + +extern int test_sincosf_abi (void); + +int arch_check = 1; + +static void +check_arch (void) +{ + CHECK_ARCH_EXT; + arch_check = 0; +} + +static int +do_test (void) +{ + check_arch (); + + if (arch_check) + return 77; + + return test_sincosf_abi (); +} + +#define TEST_FUNCTION do_test () +#include "../../../test-skeleton.c" diff --git a/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c index dc09e4a338..d2a81ecf53 100644 --- a/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c +++ b/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c @@ -1,5 +1,5 @@ /* Wrapper part of tests for AVX-512 ISA versions of vector math functions. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -17,13 +17,21 @@ <http://www.gnu.org/licenses/>. */ #include "test-float-vlen16.h" +#include "test-math-vector-sincos.h" #include <immintrin.h> #define VEC_TYPE __m512 VECTOR_WRAPPER (WRAPPER_NAME (cosf), _ZGVeN16v_cosf) VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVeN16v_sinf) -VECTOR_WRAPPER_fFF (WRAPPER_NAME (sincosf), _ZGVeN16vvv_sincosf) VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVeN16v_logf) VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVeN16v_expf) VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVeN16vv_powf) + +#define VEC_INT_TYPE __m512i + +#ifndef __ILP32__ +VECTOR_WRAPPER_fFF_3 (WRAPPER_NAME (sincosf), _ZGVeN16vvv_sincosf) +#else +VECTOR_WRAPPER_fFF_2 (WRAPPER_NAME (sincosf), _ZGVeN16vvv_sincosf) +#endif diff --git a/sysdeps/x86_64/fpu/test-float-vlen16.c b/sysdeps/x86_64/fpu/test-float-vlen16.h index 882bfc840d..ffe27866b5 100644 --- a/sysdeps/x86_64/fpu/test-float-vlen16.c +++ b/sysdeps/x86_64/fpu/test-float-vlen16.h @@ -1,5 +1,5 @@ /* Tests for AVX-512 ISA versions of vector math functions. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -16,15 +16,6 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#include "test-float-vlen16.h" - -#define TEST_VECTOR_cosf 1 -#define TEST_VECTOR_sinf 1 -#define TEST_VECTOR_sincosf 1 -#define TEST_VECTOR_logf 1 -#define TEST_VECTOR_expf 1 -#define TEST_VECTOR_powf 1 +#include_next <test-float-vlen16.h> #define REQUIRE_AVX512F - -#include "libm-test.c" diff --git a/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c index 0bb9818146..afa7da26f6 100644 --- a/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c +++ b/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c @@ -1,5 +1,5 @@ /* Wrapper part of tests for SSE ISA versions of vector math functions. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -17,13 +17,21 @@ <http://www.gnu.org/licenses/>. */ #include "test-float-vlen4.h" +#include "test-math-vector-sincos.h" #include <immintrin.h> #define VEC_TYPE __m128 VECTOR_WRAPPER (WRAPPER_NAME (cosf), _ZGVbN4v_cosf) VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVbN4v_sinf) -VECTOR_WRAPPER_fFF (WRAPPER_NAME (sincosf), _ZGVbN4vvv_sincosf) VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVbN4v_logf) VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVbN4v_expf) VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVbN4vv_powf) + +#define VEC_INT_TYPE __m128i + +#ifndef __ILP32__ +VECTOR_WRAPPER_fFF_3 (WRAPPER_NAME (sincosf), _ZGVbN4vvv_sincosf) +#else +VECTOR_WRAPPER_fFF_2 (WRAPPER_NAME (sincosf), _ZGVbN4vvv_sincosf) +#endif diff --git a/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c index 4985ac2379..d7e79a3f37 100644 --- a/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c +++ b/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c @@ -1,5 +1,5 @@ /* Wrapper part of tests for AVX2 ISA versions of vector math functions. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -17,6 +17,7 @@ <http://www.gnu.org/licenses/>. */ #include "test-float-vlen8.h" +#include "test-math-vector-sincos.h" #include <immintrin.h> #undef VEC_SUFF @@ -26,7 +27,17 @@ VECTOR_WRAPPER (WRAPPER_NAME (cosf), _ZGVdN8v_cosf) VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVdN8v_sinf) -VECTOR_WRAPPER_fFF (WRAPPER_NAME (sincosf), _ZGVdN8vvv_sincosf) VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVdN8v_logf) VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVdN8v_expf) VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVdN8vv_powf) + +/* Redefinition of wrapper to be compatible with _ZGVdN8vvv_sincosf. */ +#undef VECTOR_WRAPPER_fFF + +#define VEC_INT_TYPE __m256i + +#ifndef __ILP32__ +VECTOR_WRAPPER_fFF_3 (WRAPPER_NAME (sincosf), _ZGVdN8vvv_sincosf) +#else +VECTOR_WRAPPER_fFF_2 (WRAPPER_NAME (sincosf), _ZGVdN8vvv_sincosf) +#endif diff --git a/sysdeps/x86_64/fpu/test-float-vlen8-avx2.c b/sysdeps/x86_64/fpu/test-float-vlen8-avx2.h index 7a416385b6..c468dd6e69 100644 --- a/sysdeps/x86_64/fpu/test-float-vlen8-avx2.c +++ b/sysdeps/x86_64/fpu/test-float-vlen8-avx2.h @@ -1,5 +1,5 @@ /* Tests for AVX2 ISA versions of vector math functions. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -16,18 +16,10 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#include "test-float-vlen8.h" +#include <test-float-vlen8.h> #undef VEC_SUFF #define VEC_SUFF _vlen8_avx2 -#define TEST_VECTOR_cosf 1 -#define TEST_VECTOR_sinf 1 -#define TEST_VECTOR_sincosf 1 -#define TEST_VECTOR_logf 1 -#define TEST_VECTOR_expf 1 -#define TEST_VECTOR_powf 1 - +#undef REQUIRE_AVX #define REQUIRE_AVX2 - -#include "libm-test.c" diff --git a/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c index 9cc2883399..6f7869ba3d 100644 --- a/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c +++ b/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c @@ -1,5 +1,5 @@ /* Wrapper part of tests for AVX ISA versions of vector math functions. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -17,13 +17,21 @@ <http://www.gnu.org/licenses/>. */ #include "test-float-vlen8.h" +#include "test-math-vector-sincos.h" #include <immintrin.h> #define VEC_TYPE __m256 VECTOR_WRAPPER (WRAPPER_NAME (cosf), _ZGVcN8v_cosf) VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVcN8v_sinf) -VECTOR_WRAPPER_fFF (WRAPPER_NAME (sincosf), _ZGVcN8vvv_sincosf) VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVcN8v_logf) VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVcN8v_expf) VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVcN8vv_powf) + +#define VEC_INT_TYPE __m128i + +#ifndef __ILP32__ +VECTOR_WRAPPER_fFF_4 (WRAPPER_NAME (sincosf), _ZGVcN8vvv_sincosf) +#else +VECTOR_WRAPPER_fFF_3 (WRAPPER_NAME (sincosf), _ZGVcN8vvv_sincosf) +#endif diff --git a/sysdeps/x86_64/fpu/test-float-vlen8.c b/sysdeps/x86_64/fpu/test-float-vlen8.h index c92a50ae7e..153820ecc2 100644 --- a/sysdeps/x86_64/fpu/test-float-vlen8.c +++ b/sysdeps/x86_64/fpu/test-float-vlen8.h @@ -1,5 +1,5 @@ /* Tests for AVX ISA versions of vector math functions. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -16,15 +16,6 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#include "test-float-vlen8.h" - -#define TEST_VECTOR_cosf 1 -#define TEST_VECTOR_sinf 1 -#define TEST_VECTOR_sincosf 1 -#define TEST_VECTOR_logf 1 -#define TEST_VECTOR_expf 1 -#define TEST_VECTOR_powf 1 +#include_next <test-float-vlen8.h> #define REQUIRE_AVX - -#include "libm-test.c" diff --git a/sysdeps/x86_64/fpu/test-libmvec-alias-mod.c b/sysdeps/x86_64/fpu/test-libmvec-alias-mod.c new file mode 100644 index 0000000000..6d70844147 --- /dev/null +++ b/sysdeps/x86_64/fpu/test-libmvec-alias-mod.c @@ -0,0 +1,66 @@ +/* Part of test to build shared library to ensure link against + *_finite aliases from libmvec. + Copyright (C) 2016-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define N 4000 +FLOAT log_arg[N]; +FLOAT exp_arg[N]; +FLOAT log_res[N]; +FLOAT exp_res[N]; +FLOAT pow_res[N]; +int arch_check = 1; + +static void +init_arg (void) +{ + int i; + + CHECK_ARCH_EXT; + + arch_check = 0; + + for (i = 0; i < N; i += 1) + { + log_arg[i] = 1.0; + exp_arg[i] = 0.0; + } +} + +int +test_finite_alias (void) +{ + int i; + + init_arg (); + + if (arch_check) return 77; + +#pragma omp simd + for (i = 0; i < N; i += 1) + { + log_res[i] = FUNC (log) (log_arg[i]); + exp_res[i] = FUNC (exp) (exp_arg[i]); + pow_res[i] = FUNC (pow) (log_arg[i], log_arg[i]); + } + + if (log_res[0] != 0.0) return 1; + if (exp_res[0] != 1.0) return 1; + if (pow_res[0] != 1.0) return 1; + + return 0; +} diff --git a/sysdeps/x86_64/fpu/x86_64-math-asm.h b/sysdeps/x86_64/fpu/x86_64-math-asm.h index db3f9f78b0..597b967b7b 100644 --- a/sysdeps/x86_64/fpu/x86_64-math-asm.h +++ b/sysdeps/x86_64/fpu/x86_64-math-asm.h @@ -1,5 +1,5 @@ /* Helper macros for x86_64 libm functions. - Copyright (C) 2015-2016 Free Software Foundation, Inc. + Copyright (C) 2015-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/hp-timing.h b/sysdeps/x86_64/hp-timing.h index 65381b314d..ec543bef03 100644 --- a/sysdeps/x86_64/hp-timing.h +++ b/sysdeps/x86_64/hp-timing.h @@ -1,5 +1,5 @@ /* High precision, low overhead timing functions. x86-64 version. - Copyright (C) 2002-2016 Free Software Foundation, Inc. + Copyright (C) 2002-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/htonl.S b/sysdeps/x86_64/htonl.S index c92fae8791..23e2046caa 100644 --- a/sysdeps/x86_64/htonl.S +++ b/sysdeps/x86_64/htonl.S @@ -1,5 +1,5 @@ /* Change byte order in word. For AMD x86-64. - Copyright (C) 1997-2016 Free Software Foundation, Inc. + Copyright (C) 1997-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/ifuncmain8.c b/sysdeps/x86_64/ifuncmain8.c index 448ab96bfa..449998df50 100644 --- a/sysdeps/x86_64/ifuncmain8.c +++ b/sysdeps/x86_64/ifuncmain8.c @@ -1,5 +1,5 @@ /* Test IFUNC selector with floating-point parameters. - Copyright (C) 2015-2016 Free Software Foundation, Inc. + Copyright (C) 2015-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/ifuncmod8.c b/sysdeps/x86_64/ifuncmod8.c index c00436799c..8225c4da12 100644 --- a/sysdeps/x86_64/ifuncmod8.c +++ b/sysdeps/x86_64/ifuncmod8.c @@ -1,5 +1,5 @@ /* Test IFUNC selector with floating-point parameters. - Copyright (C) 2015-2016 Free Software Foundation, Inc. + Copyright (C) 2015-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -28,6 +28,7 @@ foo_impl (float x) } void * +inhibit_stack_protector foo_ifunc (void) { __m128i xmm = _mm_set1_epi32 (-1); diff --git a/sysdeps/x86_64/jmpbuf-offsets.h b/sysdeps/x86_64/jmpbuf-offsets.h index da71e555f7..6d1ee5e812 100644 --- a/sysdeps/x86_64/jmpbuf-offsets.h +++ b/sysdeps/x86_64/jmpbuf-offsets.h @@ -1,5 +1,5 @@ /* Private macros for accessing __jmp_buf contents. x86-64 version. - Copyright (C) 2006-2016 Free Software Foundation, Inc. + Copyright (C) 2006-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/jmpbuf-unwind.h b/sysdeps/x86_64/jmpbuf-unwind.h index aa0642b54a..49208bdd9e 100644 --- a/sysdeps/x86_64/jmpbuf-unwind.h +++ b/sysdeps/x86_64/jmpbuf-unwind.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2003-2016 Free Software Foundation, Inc. +/* Copyright (C) 2003-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Jakub Jelinek <jakub@redhat.com>, 2003. diff --git a/sysdeps/x86_64/ldsodefs.h b/sysdeps/x86_64/ldsodefs.h deleted file mode 100644 index 6a96c53721..0000000000 --- a/sysdeps/x86_64/ldsodefs.h +++ /dev/null @@ -1,56 +0,0 @@ -/* Run-time dynamic linker data structures for loaded ELF shared objects. - Copyright (C) 1995-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#ifndef _X86_64_LDSODEFS_H -#define _X86_64_LDSODEFS_H 1 - -#include <elf.h> -#include <cpu-features.h> - -struct La_x86_64_regs; -struct La_x86_64_retval; -struct La_x32_regs; -struct La_x32_retval; - -#define ARCH_PLTENTER_MEMBERS \ - Elf64_Addr (*x86_64_gnu_pltenter) (Elf64_Sym *, unsigned int, \ - uintptr_t *, \ - uintptr_t *, struct La_x86_64_regs *, \ - unsigned int *, const char *name, \ - long int *framesizep); \ - Elf32_Addr (*x32_gnu_pltenter) (Elf32_Sym *, unsigned int, uintptr_t *, \ - uintptr_t *, struct La_x32_regs *, \ - unsigned int *, const char *name, \ - long int *framesizep) - -#define ARCH_PLTEXIT_MEMBERS \ - unsigned int (*x86_64_gnu_pltexit) (Elf64_Sym *, unsigned int, \ - uintptr_t *, \ - uintptr_t *, \ - const struct La_x86_64_regs *, \ - struct La_x86_64_retval *, \ - const char *); \ - unsigned int (*x32_gnu_pltexit) (Elf32_Sym *, unsigned int, uintptr_t *, \ - uintptr_t *, \ - const struct La_x32_regs *, \ - struct La_x86_64_retval *, \ - const char *) - -#include_next <ldsodefs.h> - -#endif diff --git a/sysdeps/x86_64/localplt.data b/sysdeps/x86_64/localplt.data index f168b143ff..c27a02b66a 100644 --- a/sysdeps/x86_64/localplt.data +++ b/sysdeps/x86_64/localplt.data @@ -8,12 +8,15 @@ libc.so: free + RELA R_X86_64_GLOB_DAT libc.so: malloc + RELA R_X86_64_GLOB_DAT libc.so: memalign + RELA R_X86_64_GLOB_DAT libc.so: realloc + RELA R_X86_64_GLOB_DAT -libm.so: matherr -# The dynamic loader uses __libc_memalign internally to allocate aligned -# TLS storage. The other malloc family of functions are expected to allow -# user symbol interposition. -ld.so: __libc_memalign + RELA R_X86_64_GLOB_DAT +libm.so: matherr + RELA R_X86_64_GLOB_DAT +# The main malloc is interposed into the dynamic linker, for +# allocations after the initial link (when dlopen is used). ld.so: malloc + RELA R_X86_64_GLOB_DAT ld.so: calloc + RELA R_X86_64_GLOB_DAT ld.so: realloc + RELA R_X86_64_GLOB_DAT ld.so: free + RELA R_X86_64_GLOB_DAT +# The TLS-enabled version of these functions is interposed from libc.so. +ld.so: _dl_signal_error + RELA R_X86_64_GLOB_DAT +ld.so: _dl_catch_error + RELA R_X86_64_GLOB_DAT +ld.so: _dl_signal_exception + RELA R_X86_64_GLOB_DAT +ld.so: _dl_catch_exception + RELA R_X86_64_GLOB_DAT diff --git a/sysdeps/x86_64/lshift.S b/sysdeps/x86_64/lshift.S index 49cbfbaf3d..af568768d0 100644 --- a/sysdeps/x86_64/lshift.S +++ b/sysdeps/x86_64/lshift.S @@ -1,5 +1,5 @@ /* x86-64 __mpn_lshift -- - Copyright (C) 2007-2016 Free Software Foundation, Inc. + Copyright (C) 2007-2018 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify diff --git a/sysdeps/x86_64/machine-gmon.h b/sysdeps/x86_64/machine-gmon.h index 3d9ce5c44e..8bc111612c 100644 --- a/sysdeps/x86_64/machine-gmon.h +++ b/sysdeps/x86_64/machine-gmon.h @@ -1,5 +1,5 @@ /* x86-64-specific implementation of profiling support. - Copyright (C) 2002-2016 Free Software Foundation, Inc. + Copyright (C) 2002-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Andreas Jaeger <aj@suse.de>, 2002. diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S index 132eacba8f..feef5d4f24 100644 --- a/sysdeps/x86_64/memchr.S +++ b/sysdeps/x86_64/memchr.S @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. @@ -18,26 +18,40 @@ #include <sysdep.h> +#ifdef USE_AS_WMEMCHR +# define MEMCHR wmemchr +# define PCMPEQ pcmpeqd +#else +# define MEMCHR memchr +# define PCMPEQ pcmpeqb +#endif + /* fast SSE2 version with using pmaxub and 64 byte loop */ .text -ENTRY(memchr) - movd %rsi, %xmm1 - mov %rdi, %rcx +ENTRY(MEMCHR) + movd %esi, %xmm1 + mov %edi, %ecx +#ifdef USE_AS_WMEMCHR + test %rdx, %rdx + jz L(return_null) + shl $2, %rdx +#else punpcklbw %xmm1, %xmm1 test %rdx, %rdx jz L(return_null) punpcklbw %xmm1, %xmm1 +#endif - and $63, %rcx + and $63, %ecx pshufd $0, %xmm1, %xmm1 - cmp $48, %rcx + cmp $48, %ecx ja L(crosscache) movdqu (%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 + PCMPEQ %xmm1, %xmm0 pmovmskb %xmm0, %eax test %eax, %eax @@ -45,7 +59,7 @@ ENTRY(memchr) sub $16, %rdx jbe L(return_null) add $16, %rdi - and $15, %rcx + and $15, %ecx and $-16, %rdi add %rcx, %rdx sub $64, %rdx @@ -54,11 +68,11 @@ ENTRY(memchr) .p2align 4 L(crosscache): - and $15, %rcx + and $15, %ecx and $-16, %rdi movdqa (%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 + PCMPEQ %xmm1, %xmm0 /* Check if there is a match. */ pmovmskb %xmm0, %eax /* Remove the leading bytes. */ @@ -76,8 +90,12 @@ L(crosscache): .p2align 4 L(unaligned_no_match): - add %rcx, %rdx - sub $16, %rdx + /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using + "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void + possible addition overflow. */ + neg %rcx + add $16, %rcx + sub %rcx, %rdx jbe L(return_null) add $16, %rdi sub $64, %rdx @@ -86,25 +104,25 @@ L(unaligned_no_match): .p2align 4 L(loop_prolog): movdqa (%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 + PCMPEQ %xmm1, %xmm0 pmovmskb %xmm0, %eax test %eax, %eax jnz L(matches) movdqa 16(%rdi), %xmm2 - pcmpeqb %xmm1, %xmm2 + PCMPEQ %xmm1, %xmm2 pmovmskb %xmm2, %eax test %eax, %eax jnz L(matches16) movdqa 32(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 + PCMPEQ %xmm1, %xmm3 pmovmskb %xmm3, %eax test %eax, %eax jnz L(matches32) movdqa 48(%rdi), %xmm4 - pcmpeqb %xmm1, %xmm4 + PCMPEQ %xmm1, %xmm4 add $64, %rdi pmovmskb %xmm4, %eax test %eax, %eax @@ -117,25 +135,25 @@ L(loop_prolog): jbe L(exit_loop) movdqa (%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 + PCMPEQ %xmm1, %xmm0 pmovmskb %xmm0, %eax test %eax, %eax jnz L(matches) movdqa 16(%rdi), %xmm2 - pcmpeqb %xmm1, %xmm2 + PCMPEQ %xmm1, %xmm2 pmovmskb %xmm2, %eax test %eax, %eax jnz L(matches16) movdqa 32(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 + PCMPEQ %xmm1, %xmm3 pmovmskb %xmm3, %eax test %eax, %eax jnz L(matches32) movdqa 48(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 + PCMPEQ %xmm1, %xmm3 pmovmskb %xmm3, %eax add $64, %rdi @@ -144,7 +162,7 @@ L(loop_prolog): mov %rdi, %rcx and $-64, %rdi - and $63, %rcx + and $63, %ecx add %rcx, %rdx .p2align 4 @@ -156,10 +174,10 @@ L(align64_loop): movdqa 32(%rdi), %xmm3 movdqa 48(%rdi), %xmm4 - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm1, %xmm2 - pcmpeqb %xmm1, %xmm3 - pcmpeqb %xmm1, %xmm4 + PCMPEQ %xmm1, %xmm0 + PCMPEQ %xmm1, %xmm2 + PCMPEQ %xmm1, %xmm3 + PCMPEQ %xmm1, %xmm4 pmaxub %xmm0, %xmm3 pmaxub %xmm2, %xmm4 @@ -182,9 +200,9 @@ L(align64_loop): jnz L(matches16) movdqa 32(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 + PCMPEQ %xmm1, %xmm3 - pcmpeqb 48(%rdi), %xmm1 + PCMPEQ 48(%rdi), %xmm1 pmovmskb %xmm3, %eax test %eax, %eax jnz L(matches32) @@ -196,52 +214,52 @@ L(align64_loop): .p2align 4 L(exit_loop): - add $32, %rdx + add $32, %edx jle L(exit_loop_32) movdqa (%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 + PCMPEQ %xmm1, %xmm0 pmovmskb %xmm0, %eax test %eax, %eax jnz L(matches) movdqa 16(%rdi), %xmm2 - pcmpeqb %xmm1, %xmm2 + PCMPEQ %xmm1, %xmm2 pmovmskb %xmm2, %eax test %eax, %eax jnz L(matches16) movdqa 32(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 + PCMPEQ %xmm1, %xmm3 pmovmskb %xmm3, %eax test %eax, %eax jnz L(matches32_1) - sub $16, %rdx + sub $16, %edx jle L(return_null) - pcmpeqb 48(%rdi), %xmm1 + PCMPEQ 48(%rdi), %xmm1 pmovmskb %xmm1, %eax test %eax, %eax jnz L(matches48_1) - xor %rax, %rax + xor %eax, %eax ret .p2align 4 L(exit_loop_32): - add $32, %rdx + add $32, %edx movdqa (%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 + PCMPEQ %xmm1, %xmm0 pmovmskb %xmm0, %eax test %eax, %eax jnz L(matches_1) - sub $16, %rdx + sub $16, %edx jbe L(return_null) - pcmpeqb 16(%rdi), %xmm1 + PCMPEQ 16(%rdi), %xmm1 pmovmskb %xmm1, %eax test %eax, %eax jnz L(matches16_1) - xor %rax, %rax + xor %eax, %eax ret .p2align 4 @@ -302,10 +320,11 @@ L(matches48_1): .p2align 4 L(return_null): - xor %rax, %rax + xor %eax, %eax ret -END(memchr) +END(MEMCHR) +#ifndef USE_AS_WMEMCHR strong_alias (memchr, __memchr) - libc_hidden_builtin_def(memchr) +#endif diff --git a/sysdeps/x86_64/memcmp.S b/sysdeps/x86_64/memcmp.S index 3fb018a772..bcb4a2e88d 100644 --- a/sysdeps/x86_64/memcmp.S +++ b/sysdeps/x86_64/memcmp.S @@ -1,5 +1,5 @@ /* memcmp with SSE2 - Copyright (C) 2009-2016 Free Software Foundation, Inc. + Copyright (C) 2009-2018 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. diff --git a/sysdeps/x86_64/memcopy.h b/sysdeps/x86_64/memcopy.h new file mode 100644 index 0000000000..590b6cb16b --- /dev/null +++ b/sysdeps/x86_64/memcopy.h @@ -0,0 +1 @@ +/* X86-64 doesn't use memory copy functions. */ diff --git a/sysdeps/x86_64/memcpy.S b/sysdeps/x86_64/memcpy.S index f6e3d9396c..d98500a78a 100644 --- a/sysdeps/x86_64/memcpy.S +++ b/sysdeps/x86_64/memcpy.S @@ -1,584 +1 @@ -/* - Optimized memcpy for x86-64. - - Copyright (C) 2007-2016 Free Software Foundation, Inc. - Contributed by Evandro Menezes <evandro.menezes@amd.com>, 2007. - - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. -*/ - -#include <sysdep.h> -#include "asm-syntax.h" - -/* Stack slots in the red-zone. */ - -#ifdef USE_AS_MEMPCPY -# define RETVAL (0) -#else -# define RETVAL (-8) -# if defined SHARED && !defined USE_MULTIARCH && IS_IN (libc) -# define memcpy __memcpy -# undef libc_hidden_builtin_def -# define libc_hidden_builtin_def(name) \ - .globl __GI_memcpy; __GI_memcpy = __memcpy -# endif -#endif -#define SAVE0 (RETVAL - 8) -#define SAVE1 (SAVE0 - 8) -#define SAVE2 (SAVE1 - 8) -#define SAVE3 (SAVE2 - 8) - - .text - -#if defined PIC && IS_IN (libc) -ENTRY_CHK (__memcpy_chk) - - cmpq %rdx, %rcx - jb HIDDEN_JUMPTARGET (__chk_fail) - -END_CHK (__memcpy_chk) -#endif - -ENTRY(memcpy) /* (void *, const void*, size_t) */ - -/* Handle tiny blocks. */ - -L(1try): /* up to 32B */ - cmpq $32, %rdx -#ifndef USE_AS_MEMPCPY - movq %rdi, %rax /* save return value */ -#endif - jae L(1after) - -L(1): /* 1-byte once */ - testb $1, %dl - jz L(1a) - - movzbl (%rsi), %ecx - movb %cl, (%rdi) - - incq %rsi - incq %rdi - - .p2align 4,, 4 - -L(1a): /* 2-byte once */ - testb $2, %dl - jz L(1b) - - movzwl (%rsi), %ecx - movw %cx, (%rdi) - - addq $2, %rsi - addq $2, %rdi - - .p2align 4,, 4 - -L(1b): /* 4-byte once */ - testb $4, %dl - jz L(1c) - - movl (%rsi), %ecx - movl %ecx, (%rdi) - - addq $4, %rsi - addq $4, %rdi - - .p2align 4,, 4 - -L(1c): /* 8-byte once */ - testb $8, %dl - jz L(1d) - - movq (%rsi), %rcx - movq %rcx, (%rdi) - - addq $8, %rsi - addq $8, %rdi - - .p2align 4,, 4 - -L(1d): /* 16-byte loop */ - andl $0xf0, %edx - jz L(exit) - - .p2align 4 - -L(1loop): - movq (%rsi), %rcx - movq 8(%rsi), %r8 - movq %rcx, (%rdi) - movq %r8, 8(%rdi) - - subl $16, %edx - - leaq 16(%rsi), %rsi - leaq 16(%rdi), %rdi - - jnz L(1loop) - - .p2align 4,, 4 - -L(exit): /* exit */ -#ifdef USE_AS_MEMPCPY - movq %rdi, %rax /* return value */ -#else - rep -#endif - retq - - .p2align 4 - -L(1after): -#ifndef USE_AS_MEMPCPY - movq %rax, RETVAL(%rsp) /* save return value */ -#endif - -/* Align to the natural word size. */ - -L(aligntry): - movl %esi, %ecx /* align by source */ - - andl $7, %ecx - jz L(alignafter) /* already aligned */ - -L(align): /* align */ - leaq -8(%rcx, %rdx), %rdx /* calculate remaining bytes */ - subl $8, %ecx - - .p2align 4 - -L(alignloop): /* 1-byte alignment loop */ - movzbl (%rsi), %eax - movb %al, (%rdi) - - incl %ecx - - leaq 1(%rsi), %rsi - leaq 1(%rdi), %rdi - - jnz L(alignloop) - - .p2align 4 - -L(alignafter): - -/* Handle mid-sized blocks. */ - -L(32try): /* up to 1KB */ - cmpq $1024, %rdx - ja L(32after) - -L(32): /* 32-byte loop */ - movl %edx, %ecx - shrl $5, %ecx - jz L(32skip) - - .p2align 4 - -L(32loop): - decl %ecx - - movq (%rsi), %rax - movq 8(%rsi), %r8 - movq 16(%rsi), %r9 - movq 24(%rsi), %r10 - - movq %rax, (%rdi) - movq %r8, 8(%rdi) - movq %r9, 16(%rdi) - movq %r10, 24(%rdi) - - leaq 32(%rsi), %rsi - leaq 32(%rdi), %rdi - - jz L(32skip) /* help out smaller blocks */ - - decl %ecx - - movq (%rsi), %rax - movq 8(%rsi), %r8 - movq 16(%rsi), %r9 - movq 24(%rsi), %r10 - - movq %rax, (%rdi) - movq %r8, 8(%rdi) - movq %r9, 16(%rdi) - movq %r10, 24(%rdi) - - leaq 32(%rsi), %rsi - leaq 32(%rdi), %rdi - - jnz L(32loop) - - .p2align 4 - -L(32skip): - andl $31, %edx /* check for left overs */ -#ifdef USE_AS_MEMPCPY - jnz L(1) - - movq %rdi, %rax -#else - movq RETVAL(%rsp), %rax - jnz L(1) - - rep -#endif - retq /* exit */ - - .p2align 4 - -L(32after): - -/* - In order to minimize code-size in RTLD, algorithms specific for - larger blocks are excluded when building for RTLD. -*/ - -/* Handle blocks smaller than 1/2 L1. */ - -L(fasttry): /* first 1/2 L1 */ -#if IS_IN (libc) /* only up to this algorithm outside of libc.so */ - mov __x86_data_cache_size_half(%rip), %R11_LP - cmpq %rdx, %r11 /* calculate the smaller of */ - cmovaq %rdx, %r11 /* remaining bytes and 1/2 L1 */ -#endif - -L(fast): /* good ol' MOVS */ -#if IS_IN (libc) - movq %r11, %rcx - andq $-8, %r11 -#else - movq %rdx, %rcx -#endif - shrq $3, %rcx - jz L(fastskip) - - rep - movsq - - .p2align 4,, 4 - -L(fastskip): -#if IS_IN (libc) - subq %r11, %rdx /* check for more */ - testq $-8, %rdx - jnz L(fastafter) -#endif - - andl $7, %edx /* check for left overs */ -#ifdef USE_AS_MEMPCPY - jnz L(1) - - movq %rdi, %rax -#else - movq RETVAL(%rsp), %rax - jnz L(1) - - rep -#endif - retq /* exit */ - -#if IS_IN (libc) /* none of the algorithms below for RTLD */ - - .p2align 4 - -L(fastafter): - -/* Handle large blocks smaller than 1/2 L2. */ - -L(pretry): /* first 1/2 L2 */ - mov __x86_shared_cache_size_half (%rip), %R8_LP - cmpq %rdx, %r8 /* calculate the lesser of */ - cmovaq %rdx, %r8 /* remaining bytes and 1/2 L2 */ - -L(pre): /* 64-byte with prefetching */ - movq %r8, %rcx - andq $-64, %r8 - shrq $6, %rcx - jz L(preskip) - - movq %r14, SAVE0(%rsp) - cfi_rel_offset (%r14, SAVE0) - movq %r13, SAVE1(%rsp) - cfi_rel_offset (%r13, SAVE1) - movq %r12, SAVE2(%rsp) - cfi_rel_offset (%r12, SAVE2) - movq %rbx, SAVE3(%rsp) - cfi_rel_offset (%rbx, SAVE3) - - cmpl $0, __x86_prefetchw(%rip) - jz L(preloop) /* check if PREFETCHW OK */ - - .p2align 4 - -/* ... when PREFETCHW is available (less cache-probe traffic in MP systems). */ - -L(prewloop): /* cache-line in state M */ - decq %rcx - - movq (%rsi), %rax - movq 8 (%rsi), %rbx - movq 16 (%rsi), %r9 - movq 24 (%rsi), %r10 - movq 32 (%rsi), %r11 - movq 40 (%rsi), %r12 - movq 48 (%rsi), %r13 - movq 56 (%rsi), %r14 - - prefetcht0 0 + 896 (%rsi) - prefetcht0 64 + 896 (%rsi) - - movq %rax, (%rdi) - movq %rbx, 8(%rdi) - movq %r9, 16(%rdi) - movq %r10, 24(%rdi) - movq %r11, 32(%rdi) - movq %r12, 40(%rdi) - movq %r13, 48(%rdi) - movq %r14, 56(%rdi) - - leaq 64(%rsi), %rsi - leaq 64(%rdi), %rdi - - jz L(prebail) - - decq %rcx - - movq (%rsi), %rax - movq 8(%rsi), %rbx - movq 16(%rsi), %r9 - movq 24(%rsi), %r10 - movq 32(%rsi), %r11 - movq 40(%rsi), %r12 - movq 48(%rsi), %r13 - movq 56(%rsi), %r14 - - movq %rax, (%rdi) - movq %rbx, 8(%rdi) - movq %r9, 16(%rdi) - movq %r10, 24(%rdi) - movq %r11, 32(%rdi) - movq %r12, 40(%rdi) - movq %r13, 48(%rdi) - movq %r14, 56(%rdi) - - prefetchw 896 - 64(%rdi) - prefetchw 896 - 0(%rdi) - - leaq 64(%rsi), %rsi - leaq 64(%rdi), %rdi - - jnz L(prewloop) - jmp L(prebail) - - .p2align 4 - -/* ... when PREFETCHW is not available. */ - -L(preloop): /* cache-line in state E */ - decq %rcx - - movq (%rsi), %rax - movq 8(%rsi), %rbx - movq 16(%rsi), %r9 - movq 24(%rsi), %r10 - movq 32(%rsi), %r11 - movq 40(%rsi), %r12 - movq 48(%rsi), %r13 - movq 56(%rsi), %r14 - - prefetcht0 896 + 0(%rsi) - prefetcht0 896 + 64(%rsi) - - movq %rax, (%rdi) - movq %rbx, 8(%rdi) - movq %r9, 16(%rdi) - movq %r10, 24(%rdi) - movq %r11, 32(%rdi) - movq %r12, 40(%rdi) - movq %r13, 48(%rdi) - movq %r14, 56(%rdi) - - leaq 64 (%rsi), %rsi - leaq 64 (%rdi), %rdi - - jz L(prebail) - - decq %rcx - - movq (%rsi), %rax - movq 8(%rsi), %rbx - movq 16(%rsi), %r9 - movq 24(%rsi), %r10 - movq 32(%rsi), %r11 - movq 40(%rsi), %r12 - movq 48(%rsi), %r13 - movq 56(%rsi), %r14 - - prefetcht0 896 - 64(%rdi) - prefetcht0 896 - 0(%rdi) - - movq %rax, (%rdi) - movq %rbx, 8(%rdi) - movq %r9, 16(%rdi) - movq %r10, 24(%rdi) - movq %r11, 32(%rdi) - movq %r12, 40(%rdi) - movq %r13, 48(%rdi) - movq %r14, 56(%rdi) - - leaq 64(%rsi), %rsi - leaq 64(%rdi), %rdi - - jnz L(preloop) - -L(prebail): - movq SAVE3(%rsp), %rbx - cfi_restore (%rbx) - movq SAVE2(%rsp), %r12 - cfi_restore (%r12) - movq SAVE1(%rsp), %r13 - cfi_restore (%r13) - movq SAVE0(%rsp), %r14 - cfi_restore (%r14) - -/* .p2align 4 */ - -L(preskip): - subq %r8, %rdx /* check for more */ - testq $-64, %rdx - jnz L(preafter) - - andl $63, %edx /* check for left overs */ -#ifdef USE_AS_MEMPCPY - jnz L(1) - - movq %rdi, %rax -#else - movq RETVAL(%rsp), %rax - jnz L(1) - - rep -#endif - retq /* exit */ - - .p2align 4 - -L(preafter): - -/* Handle huge blocks. */ - -L(NTtry): - -L(NT): /* non-temporal 128-byte */ - movq %rdx, %rcx - shrq $7, %rcx - jz L(NTskip) - - movq %r14, SAVE0(%rsp) - cfi_rel_offset (%r14, SAVE0) - movq %r13, SAVE1(%rsp) - cfi_rel_offset (%r13, SAVE1) - movq %r12, SAVE2(%rsp) - cfi_rel_offset (%r12, SAVE2) - - .p2align 4 - -L(NTloop): - prefetchnta 768(%rsi) - prefetchnta 832(%rsi) - - decq %rcx - - movq (%rsi), %rax - movq 8(%rsi), %r8 - movq 16(%rsi), %r9 - movq 24(%rsi), %r10 - movq 32(%rsi), %r11 - movq 40(%rsi), %r12 - movq 48(%rsi), %r13 - movq 56(%rsi), %r14 - - movntiq %rax, (%rdi) - movntiq %r8, 8(%rdi) - movntiq %r9, 16(%rdi) - movntiq %r10, 24(%rdi) - movntiq %r11, 32(%rdi) - movntiq %r12, 40(%rdi) - movntiq %r13, 48(%rdi) - movntiq %r14, 56(%rdi) - - movq 64(%rsi), %rax - movq 72(%rsi), %r8 - movq 80(%rsi), %r9 - movq 88(%rsi), %r10 - movq 96(%rsi), %r11 - movq 104(%rsi), %r12 - movq 112(%rsi), %r13 - movq 120(%rsi), %r14 - - movntiq %rax, 64(%rdi) - movntiq %r8, 72(%rdi) - movntiq %r9, 80(%rdi) - movntiq %r10, 88(%rdi) - movntiq %r11, 96(%rdi) - movntiq %r12, 104(%rdi) - movntiq %r13, 112(%rdi) - movntiq %r14, 120(%rdi) - - leaq 128(%rsi), %rsi - leaq 128(%rdi), %rdi - - jnz L(NTloop) - - sfence /* serialize memory stores */ - - movq SAVE2(%rsp), %r12 - cfi_restore (%r12) - movq SAVE1(%rsp), %r13 - cfi_restore (%r13) - movq SAVE0(%rsp), %r14 - cfi_restore (%r14) - -L(NTskip): - andl $127, %edx /* check for left overs */ -#ifdef USE_AS_MEMPCPY - jnz L(1) - - movq %rdi, %rax -#else - movq RETVAL(%rsp), %rax - jnz L(1) - - rep -#endif - retq /* exit */ - -#endif /* IS_IN (libc) */ - -END(memcpy) - -#ifndef USE_AS_MEMPCPY -libc_hidden_builtin_def (memcpy) -# if defined SHARED && !defined USE_MULTIARCH && IS_IN (libc) -# undef memcpy -# include <shlib-compat.h> -versioned_symbol (libc, __memcpy, memcpy, GLIBC_2_14); -# endif -#endif +/* Implemented in memcpy.S. */ diff --git a/sysdeps/x86_64/memcpy_chk.S b/sysdeps/x86_64/memcpy_chk.S index 2296b55119..aa33cd5fc1 100644 --- a/sysdeps/x86_64/memcpy_chk.S +++ b/sysdeps/x86_64/memcpy_chk.S @@ -1,5 +1,5 @@ /* Checking memcpy for x86-64. - Copyright (C) 2004-2016 Free Software Foundation, Inc. + Copyright (C) 2004-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -19,7 +19,7 @@ #include <sysdep.h> #include "asm-syntax.h" -#ifndef PIC +#ifndef SHARED /* For libc.so this is defined in memcpy.S. For libc.a, this is a separate source to avoid memcpy bringing in __chk_fail and all routines diff --git a/sysdeps/x86_64/memmove.S b/sysdeps/x86_64/memmove.S new file mode 100644 index 0000000000..9cc92ff9a9 --- /dev/null +++ b/sysdeps/x86_64/memmove.S @@ -0,0 +1,71 @@ +/* Optimized memmove for x86-64. + Copyright (C) 2016-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#define VEC_SIZE 16 +#define VEC(i) xmm##i +#define PREFETCHNT prefetchnta +#define VMOVNT movntdq +/* Use movups and movaps for smaller code sizes. */ +#define VMOVU movups +#define VMOVA movaps + +#define SECTION(p) p + +#ifdef USE_MULTIARCH +# if !IS_IN (libc) +# define MEMCPY_SYMBOL(p,s) memcpy +# endif +#else +# if defined SHARED && IS_IN (libc) +# define MEMCPY_SYMBOL(p,s) __memcpy +# else +# define MEMCPY_SYMBOL(p,s) memcpy +# endif +#endif +#if !defined USE_MULTIARCH || !IS_IN (libc) +# define MEMPCPY_SYMBOL(p,s) __mempcpy +#endif +#ifndef MEMMOVE_SYMBOL +# define MEMMOVE_CHK_SYMBOL(p,s) p +# define MEMMOVE_SYMBOL(p,s) memmove +#endif + +#include "multiarch/memmove-vec-unaligned-erms.S" + +#ifndef USE_MULTIARCH +libc_hidden_builtin_def (memmove) +# if defined SHARED && IS_IN (libc) +strong_alias (memmove, __memcpy) +libc_hidden_ver (memmove, memcpy) +# endif +libc_hidden_def (__mempcpy) +weak_alias (__mempcpy, mempcpy) +libc_hidden_builtin_def (mempcpy) + +# if defined SHARED && IS_IN (libc) +# undef memcpy +# include <shlib-compat.h> +versioned_symbol (libc, __memcpy, memcpy, GLIBC_2_14); + +# if SHLIB_COMPAT (libc, GLIBC_2_2_5, GLIBC_2_14) +compat_symbol (libc, memmove, memcpy, GLIBC_2_2_5); +# endif +# endif +#endif diff --git a/sysdeps/x86_64/fpu/s_fdiml.S b/sysdeps/x86_64/memmove_chk.S index f9f1e20259..39b56dde65 100644 --- a/sysdeps/x86_64/fpu/s_fdiml.S +++ b/sysdeps/x86_64/memmove_chk.S @@ -1,7 +1,6 @@ -/* Compute positive difference. - Copyright (C) 1997-2016 Free Software Foundation, Inc. +/* Checking memmove for x86-64. + Copyright (C) 2016-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. - Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public @@ -18,26 +17,17 @@ <http://www.gnu.org/licenses/>. */ #include <sysdep.h> - - .text -ENTRY(__fdiml) - fldt 8(%rsp) // x - fldt 24(%rsp) // x : y - - fucomi %st(1), %st - jp 1f - - jc 3f - fstp %st(1) - fldz - jmp 2f - -3: fsubrp %st, %st(1) - ret - -1: fucomi %st(0), %st - fcmovnu %st(1), %st -2: fstp %st(1) - ret -END(__fdiml) -weak_alias (__fdiml, fdiml) +#include "asm-syntax.h" + +#ifndef SHARED + /* For libc.so this is defined in memmove.S. + For libc.a, this is a separate source to avoid + memmove bringing in __chk_fail and all routines + it calls. */ + .text +ENTRY (__memmove_chk) + cmpq %rdx, %rcx + jb __chk_fail + jmp memmove +END (__memmove_chk) +#endif diff --git a/sysdeps/x86_64/mempcpy.S b/sysdeps/x86_64/mempcpy.S index acee5e56b1..d98500a78a 100644 --- a/sysdeps/x86_64/mempcpy.S +++ b/sysdeps/x86_64/mempcpy.S @@ -1,8 +1 @@ -#define USE_AS_MEMPCPY -#define memcpy __mempcpy -#define __memcpy_chk __mempcpy_chk -#include <sysdeps/x86_64/memcpy.S> - -libc_hidden_def (__mempcpy) -weak_alias (__mempcpy, mempcpy) -libc_hidden_builtin_def (mempcpy) +/* Implemented in memcpy.S. */ diff --git a/sysdeps/x86_64/mempcpy_chk.S b/sysdeps/x86_64/mempcpy_chk.S index 390abc68dd..0e9e24db00 100644 --- a/sysdeps/x86_64/mempcpy_chk.S +++ b/sysdeps/x86_64/mempcpy_chk.S @@ -1,5 +1,5 @@ /* Checking mempcpy for x86-64. - Copyright (C) 2004-2016 Free Software Foundation, Inc. + Copyright (C) 2004-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -19,7 +19,7 @@ #include <sysdep.h> #include "asm-syntax.h" -#ifndef PIC +#ifndef SHARED /* For libc.so this is defined in memcpy.S. For libc.a, this is a separate source to avoid mempcpy bringing in __chk_fail and all routines diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S index 840de30cd7..b8e3fa1d87 100644 --- a/sysdeps/x86_64/memrchr.S +++ b/sysdeps/x86_64/memrchr.S @@ -1,6 +1,6 @@ /* fast SSE2 memrchr with 64 byte loop and pmaxub instruction using - Copyright (C) 2011-2016 Free Software Foundation, Inc. + Copyright (C) 2011-2018 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. @@ -22,7 +22,7 @@ .text ENTRY (__memrchr) - movd %rsi, %xmm1 + movd %esi, %xmm1 sub $16, %rdx jbe L(length_less16) @@ -42,8 +42,8 @@ ENTRY (__memrchr) jnz L(matches0) sub $64, %rdi - mov %rdi, %rcx - and $15, %rcx + mov %edi, %ecx + and $15, %ecx jz L(loop_prolog) add $16, %rdi @@ -108,8 +108,8 @@ L(loop_prolog): test %eax, %eax jnz L(matches0) - mov %rdi, %rcx - and $63, %rcx + mov %edi, %ecx + and $63, %ecx jz L(align64_loop) add $64, %rdi @@ -166,8 +166,8 @@ L(align64_loop): .p2align 4 L(exit_loop): - add $64, %rdx - cmp $32, %rdx + add $64, %edx + cmp $32, %edx jbe L(exit_loop_32) movdqa 48(%rdi), %xmm0 @@ -187,7 +187,7 @@ L(exit_loop): pmovmskb %xmm3, %eax test %eax, %eax jnz L(matches16_1) - cmp $48, %rdx + cmp $48, %edx jbe L(return_null) pcmpeqb (%rdi), %xmm1 @@ -204,7 +204,7 @@ L(exit_loop_32): pmovmskb %xmm0, %eax test %eax, %eax jnz L(matches48_1) - cmp $16, %rdx + cmp $16, %edx jbe L(return_null) pcmpeqb 32(%rdi), %xmm1 @@ -276,7 +276,7 @@ L(matches48_1): .p2align 4 L(return_null): - xor %rax, %rax + xor %eax, %eax ret .p2align 4 @@ -306,18 +306,16 @@ L(length_less16): punpcklbw %xmm1, %xmm1 punpcklbw %xmm1, %xmm1 - add $16, %rdx + add $16, %edx pshufd $0, %xmm1, %xmm1 - mov %rdi, %rcx - and $15, %rcx + mov %edi, %ecx + and $15, %ecx jz L(length_less16_offset0) - mov %rdi, %rcx - and $15, %rcx mov %cl, %dh - mov %rcx, %r8 + mov %ecx, %esi add %dl, %dh and $-16, %rdi @@ -340,7 +338,7 @@ L(length_less16): bsr %eax, %eax add %rdi, %rax - add %r8, %rax + add %rsi, %rax ret .p2align 4 @@ -362,14 +360,14 @@ L(length_less16_part2): pcmpeqb (%rdi), %xmm1 pmovmskb %xmm1, %eax - mov %r8, %rcx + mov %esi, %ecx sar %cl, %eax test %eax, %eax jz L(return_null) bsr %eax, %eax add %rdi, %rax - add %r8, %rax + add %rsi, %rax ret .p2align 4 diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S index 4cf0da0fb8..b342679576 100644 --- a/sysdeps/x86_64/memset.S +++ b/sysdeps/x86_64/memset.S @@ -1,6 +1,6 @@ /* memset/bzero -- set memory area to CH/0 Optimized version for x86-64. - Copyright (C) 2002-2016 Free Software Foundation, Inc. + Copyright (C) 2002-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -19,114 +19,43 @@ #include <sysdep.h> - .text -#if IS_IN (libc) -ENTRY(__bzero) - movq %rdi, %rax /* Set return value. */ - movq %rsi, %rdx /* Set n. */ - pxor %xmm0, %xmm0 - jmp L(entry_from_bzero) -END(__bzero) -weak_alias (__bzero, bzero) - -/* Like memset but takes additional parameter with return value. */ -ENTRY(__memset_tail) - movq %rcx, %rax /* Set return value. */ - - movd %esi, %xmm0 - punpcklbw %xmm0, %xmm0 - punpcklwd %xmm0, %xmm0 - pshufd $0, %xmm0, %xmm0 - - jmp L(entry_from_bzero) -END(__memset_tail) +#define VEC_SIZE 16 +#define VEC(i) xmm##i +/* Don't use movups and movaps since it will get larger nop paddings for + alignment. */ +#define VMOVU movdqu +#define VMOVA movdqa + +#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ + movd d, %xmm0; \ + movq r, %rax; \ + punpcklbw %xmm0, %xmm0; \ + punpcklwd %xmm0, %xmm0; \ + pshufd $0, %xmm0, %xmm0 + +#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ + movd d, %xmm0; \ + movq r, %rax; \ + pshufd $0, %xmm0, %xmm0 + +#define SECTION(p) p + +#ifndef MEMSET_SYMBOL +# define MEMSET_CHK_SYMBOL(p,s) p +# define MEMSET_SYMBOL(p,s) memset #endif -#if defined PIC && IS_IN (libc) -ENTRY_CHK (__memset_chk) - cmpq %rdx, %rcx - jb HIDDEN_JUMPTARGET (__chk_fail) -END_CHK (__memset_chk) +#ifndef WMEMSET_SYMBOL +# define WMEMSET_CHK_SYMBOL(p,s) p +# define WMEMSET_SYMBOL(p,s) __wmemset #endif -ENTRY (memset) - movd %esi, %xmm0 - movq %rdi, %rax - punpcklbw %xmm0, %xmm0 - punpcklwd %xmm0, %xmm0 - pshufd $0, %xmm0, %xmm0 -L(entry_from_bzero): - cmpq $64, %rdx - ja L(loop_start) - cmpq $16, %rdx - jbe L(less_16_bytes) - cmpq $32, %rdx - movdqu %xmm0, (%rdi) - movdqu %xmm0, -16(%rdi,%rdx) - ja L(between_32_64_bytes) -L(return): - rep - ret - .p2align 4 -L(between_32_64_bytes): - movdqu %xmm0, 16(%rdi) - movdqu %xmm0, -32(%rdi,%rdx) - ret - .p2align 4 -L(loop_start): - leaq 64(%rdi), %rcx - movdqu %xmm0, (%rdi) - andq $-64, %rcx - movdqu %xmm0, -16(%rdi,%rdx) - movdqu %xmm0, 16(%rdi) - movdqu %xmm0, -32(%rdi,%rdx) - movdqu %xmm0, 32(%rdi) - movdqu %xmm0, -48(%rdi,%rdx) - movdqu %xmm0, 48(%rdi) - movdqu %xmm0, -64(%rdi,%rdx) - addq %rdi, %rdx - andq $-64, %rdx - cmpq %rdx, %rcx - je L(return) - .p2align 4 -L(loop): - movdqa %xmm0, (%rcx) - movdqa %xmm0, 16(%rcx) - movdqa %xmm0, 32(%rcx) - movdqa %xmm0, 48(%rcx) - addq $64, %rcx - cmpq %rcx, %rdx - jne L(loop) - rep - ret -L(less_16_bytes): - movq %xmm0, %rcx - testb $24, %dl - jne L(between8_16bytes) - testb $4, %dl - jne L(between4_7bytes) - testb $1, %dl - je L(odd_byte) - movb %cl, (%rdi) -L(odd_byte): - testb $2, %dl - je L(return) - movw %cx, -2(%rax,%rdx) - ret -L(between4_7bytes): - movl %ecx, (%rdi) - movl %ecx, -4(%rdi,%rdx) - ret -L(between8_16bytes): - movq %rcx, (%rdi) - movq %rcx, -8(%rdi,%rdx) - ret +#include "multiarch/memset-vec-unaligned-erms.S" -END (memset) libc_hidden_builtin_def (memset) -#if defined PIC && IS_IN (libc) && !defined USE_MULTIARCH -strong_alias (__memset_chk, __memset_zero_constant_len_parameter) - .section .gnu.warning.__memset_zero_constant_len_parameter - .string "memset used with constant zero length parameter; this could be due to transposed parameters" +#if IS_IN (libc) +libc_hidden_def (__wmemset) +weak_alias (__wmemset, wmemset) +libc_hidden_weak (wmemset) #endif diff --git a/sysdeps/x86_64/memset_chk.S b/sysdeps/x86_64/memset_chk.S index 95bb5d0e94..4ecf914fbe 100644 --- a/sysdeps/x86_64/memset_chk.S +++ b/sysdeps/x86_64/memset_chk.S @@ -1,5 +1,5 @@ /* Checking memset for x86-64. - Copyright (C) 2004-2016 Free Software Foundation, Inc. + Copyright (C) 2004-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/memusage.h b/sysdeps/x86_64/memusage.h index fc102c4252..45fd920b52 100644 --- a/sysdeps/x86_64/memusage.h +++ b/sysdeps/x86_64/memusage.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2001-2016 Free Software Foundation, Inc. +/* Copyright (C) 2001-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/mul_1.S b/sysdeps/x86_64/mul_1.S index 88b8f920a1..c38927b5a0 100644 --- a/sysdeps/x86_64/mul_1.S +++ b/sysdeps/x86_64/mul_1.S @@ -1,6 +1,6 @@ /* AMD64 __mpn_mul_1 -- Multiply a limb vector with a limb and store the result in a second limb vector. - Copyright (C) 2003-2016 Free Software Foundation, Inc. + Copyright (C) 2003-2018 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index d234f4ab66..bb5e970735 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -1,26 +1,46 @@ ifeq ($(subdir),csu) tests += test-multiarch -gen-as-const-headers += ifunc-defines.sym endif ifeq ($(subdir),string) -sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \ - strcmp-sse2-unaligned strncmp-ssse3 \ - memcmp-sse4 memcpy-ssse3 memcpy-sse2-unaligned \ - memcpy-avx512-no-vzeroupper mempcpy-ssse3 memmove-ssse3 \ - memcpy-ssse3-back mempcpy-ssse3-back memmove-avx-unaligned \ - memcpy-avx-unaligned mempcpy-avx-unaligned \ - mempcpy-avx512-no-vzeroupper memmove-ssse3-back \ - memmove-avx512-no-vzeroupper strcasecmp_l-ssse3 \ - strncase_l-ssse3 strcat-ssse3 strncat-ssse3\ +sysdep_routines += strncat-c stpncpy-c strncpy-c \ + strcmp-sse2 strcmp-sse2-unaligned strcmp-ssse3 \ + strcmp-sse4_2 strcmp-avx2 \ + strncmp-sse2 strncmp-ssse3 strncmp-sse4_2 strncmp-avx2 \ + memchr-sse2 rawmemchr-sse2 memchr-avx2 rawmemchr-avx2 \ + memrchr-sse2 memrchr-avx2 \ + memcmp-sse2 \ + memcmp-avx2-movbe \ + memcmp-sse4 memcpy-ssse3 \ + memmove-ssse3 \ + memcpy-ssse3-back \ + memmove-ssse3-back \ + memmove-avx512-no-vzeroupper \ + strcasecmp_l-sse2 strcasecmp_l-ssse3 \ + strcasecmp_l-sse4_2 strcasecmp_l-avx \ + strncase_l-sse2 strncase_l-ssse3 \ + strncase_l-sse4_2 strncase_l-avx \ + strchr-sse2 strchrnul-sse2 strchr-avx2 strchrnul-avx2 \ + strrchr-sse2 strrchr-avx2 \ + strlen-sse2 strnlen-sse2 strlen-avx2 strnlen-avx2 \ + strcat-ssse3 strncat-ssse3\ + strcpy-sse2 stpcpy-sse2 \ strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \ strcpy-sse2-unaligned strncpy-sse2-unaligned \ stpcpy-sse2-unaligned stpncpy-sse2-unaligned \ + strcat-sse2 \ strcat-sse2-unaligned strncat-sse2-unaligned \ strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \ - strcspn-c strpbrk-c strspn-c varshift memset-avx2 \ - memset-avx512-no-vzeroupper + strcspn-sse2 strpbrk-sse2 strspn-sse2 \ + strcspn-c strpbrk-c strspn-c varshift \ + memset-avx512-no-vzeroupper \ + memmove-sse2-unaligned-erms \ + memmove-avx-unaligned-erms \ + memmove-avx512-unaligned-erms \ + memset-sse2-unaligned-erms \ + memset-avx2-unaligned-erms \ + memset-avx512-unaligned-erms CFLAGS-varshift.c += -msse4 CFLAGS-strcspn-c.c += -msse4 CFLAGS-strpbrk-c.c += -msse4 @@ -28,5 +48,20 @@ CFLAGS-strspn-c.c += -msse4 endif ifeq ($(subdir),wcsmbs) -sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c wcscpy-ssse3 wcscpy-c +sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \ + wmemcmp-avx2-movbe \ + wmemchr-sse2 wmemchr-avx2 \ + wcscmp-sse2 wcscmp-avx2 \ + wcsncmp-sse2 wcsncmp-avx2 \ + wcscpy-ssse3 wcscpy-c \ + wcschr-sse2 wcschr-avx2 \ + wcsrchr-sse2 wcsrchr-avx2 \ + wcsnlen-sse4_1 wcsnlen-c \ + wcslen-sse2 wcslen-avx2 wcsnlen-avx2 +endif + +ifeq ($(subdir),debug) +sysdep_routines += memcpy_chk-nonshared mempcpy_chk-nonshared \ + memmove_chk-nonshared memset_chk-nonshared \ + wmemset_chk-nonshared endif diff --git a/sysdeps/x86_64/multiarch/ifunc-avx2.h b/sysdeps/x86_64/multiarch/ifunc-avx2.h new file mode 100644 index 0000000000..9cab837642 --- /dev/null +++ b/sysdeps/x86_64/multiarch/ifunc-avx2.h @@ -0,0 +1,36 @@ +/* Common definition for ifunc selections optimized with SSE2 and AVX2. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <init-arch.h> + +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; + +static inline void * +IFUNC_SELECTOR (void) +{ + const struct cpu_features* cpu_features = __get_cpu_features (); + + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) + && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) + return OPTIMIZE (avx2); + + return OPTIMIZE (sse2); +} diff --git a/sysdeps/x86_64/multiarch/ifunc-defines.sym b/sysdeps/x86_64/multiarch/ifunc-defines.sym deleted file mode 100644 index 3df946f343..0000000000 --- a/sysdeps/x86_64/multiarch/ifunc-defines.sym +++ /dev/null @@ -1,20 +0,0 @@ -#include "init-arch.h" -#include <stddef.h> - --- - -CPU_FEATURES_SIZE sizeof (struct cpu_features) -CPUID_OFFSET offsetof (struct cpu_features, cpuid) -CPUID_SIZE sizeof (struct cpuid_registers) -CPUID_EAX_OFFSET offsetof (struct cpuid_registers, eax) -CPUID_EBX_OFFSET offsetof (struct cpuid_registers, ebx) -CPUID_ECX_OFFSET offsetof (struct cpuid_registers, ecx) -CPUID_EDX_OFFSET offsetof (struct cpuid_registers, edx) -FAMILY_OFFSET offsetof (struct cpu_features, family) -MODEL_OFFSET offsetof (struct cpu_features, model) -FEATURE_OFFSET offsetof (struct cpu_features, feature) -FEATURE_SIZE sizeof (unsigned int) - -COMMON_CPUID_INDEX_1 -COMMON_CPUID_INDEX_7 -FEATURE_INDEX_1 diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 188b6d36c6..9aaaef7251 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -1,5 +1,5 @@ /* Enumerate available IFUNC implementations of a function. x86-64 version. - Copyright (C) 2012-2016 Free Software Foundation, Inc. + Copyright (C) 2012-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -38,77 +38,164 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, size_t i = 0; - /* Support sysdeps/x86_64/multiarch/memcmp.S. */ + /* Support sysdeps/x86_64/multiarch/memchr.c. */ + IFUNC_IMPL (i, name, memchr, + IFUNC_IMPL_ADD (array, i, memchr, + HAS_ARCH_FEATURE (AVX2_Usable), + __memchr_avx2) + IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_sse2)) + + /* Support sysdeps/x86_64/multiarch/memcmp.c. */ IFUNC_IMPL (i, name, memcmp, + IFUNC_IMPL_ADD (array, i, memcmp, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (MOVBE)), + __memcmp_avx2_movbe) IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSE4_1), __memcmp_sse4_1) IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSSE3), __memcmp_ssse3) IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2)) +#ifdef SHARED /* Support sysdeps/x86_64/multiarch/memmove_chk.c. */ IFUNC_IMPL (i, name, __memmove_chk, -#ifdef HAVE_AVX512_ASM_SUPPORT IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_ARCH_FEATURE (AVX512F_Usable), __memmove_chk_avx512_no_vzeroupper) -#endif + IFUNC_IMPL_ADD (array, i, __memmove_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memmove_chk_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memmove_chk_avx512_unaligned_erms) IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_ARCH_FEATURE (AVX_Usable), __memmove_chk_avx_unaligned) IFUNC_IMPL_ADD (array, i, __memmove_chk, + HAS_ARCH_FEATURE (AVX_Usable), + __memmove_chk_avx_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_CPU_FEATURE (SSSE3), __memmove_chk_ssse3_back) IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_CPU_FEATURE (SSSE3), __memmove_chk_ssse3) IFUNC_IMPL_ADD (array, i, __memmove_chk, 1, - __memmove_chk_sse2)) + __memmove_chk_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, __memmove_chk, 1, + __memmove_chk_sse2_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memmove_chk, 1, + __memmove_chk_erms)) +#endif - /* Support sysdeps/x86_64/multiarch/memmove.S. */ + /* Support sysdeps/x86_64/multiarch/memmove.c. */ IFUNC_IMPL (i, name, memmove, IFUNC_IMPL_ADD (array, i, memmove, HAS_ARCH_FEATURE (AVX_Usable), __memmove_avx_unaligned) -#ifdef HAVE_AVX512_ASM_SUPPORT + IFUNC_IMPL_ADD (array, i, memmove, + HAS_ARCH_FEATURE (AVX_Usable), + __memmove_avx_unaligned_erms) IFUNC_IMPL_ADD (array, i, memmove, HAS_ARCH_FEATURE (AVX512F_Usable), __memmove_avx512_no_vzeroupper) -#endif + IFUNC_IMPL_ADD (array, i, memmove, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memmove_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, memmove, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memmove_avx512_unaligned_erms) IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3), __memmove_ssse3_back) IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3), __memmove_ssse3) - IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_sse2)) + IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_erms) + IFUNC_IMPL_ADD (array, i, memmove, 1, + __memmove_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, memmove, 1, + __memmove_sse2_unaligned_erms)) + + /* Support sysdeps/x86_64/multiarch/memrchr.c. */ + IFUNC_IMPL (i, name, memrchr, + IFUNC_IMPL_ADD (array, i, memrchr, + HAS_ARCH_FEATURE (AVX2_Usable), + __memrchr_avx2) + IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_sse2)) - /* Support sysdeps/x86_64/multiarch/memset_chk.S. */ +#ifdef SHARED + /* Support sysdeps/x86_64/multiarch/memset_chk.c. */ IFUNC_IMPL (i, name, __memset_chk, IFUNC_IMPL_ADD (array, i, __memset_chk, 1, - __memset_chk_sse2) + __memset_chk_erms) + IFUNC_IMPL_ADD (array, i, __memset_chk, 1, + __memset_chk_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, __memset_chk, 1, + __memset_chk_sse2_unaligned_erms) IFUNC_IMPL_ADD (array, i, __memset_chk, HAS_ARCH_FEATURE (AVX2_Usable), - __memset_chk_avx2) -#ifdef HAVE_AVX512_ASM_SUPPORT + __memset_chk_avx2_unaligned) + IFUNC_IMPL_ADD (array, i, __memset_chk, + HAS_ARCH_FEATURE (AVX2_Usable), + __memset_chk_avx2_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memset_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memset_chk_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memset_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memset_chk_avx512_unaligned) IFUNC_IMPL_ADD (array, i, __memset_chk, HAS_ARCH_FEATURE (AVX512F_Usable), __memset_chk_avx512_no_vzeroupper) -#endif ) +#endif - /* Support sysdeps/x86_64/multiarch/memset.S. */ + /* Support sysdeps/x86_64/multiarch/memset.c. */ IFUNC_IMPL (i, name, memset, - IFUNC_IMPL_ADD (array, i, memset, 1, __memset_sse2) + IFUNC_IMPL_ADD (array, i, memset, 1, + __memset_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, memset, 1, + __memset_sse2_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memset, 1, __memset_erms) + IFUNC_IMPL_ADD (array, i, memset, + HAS_ARCH_FEATURE (AVX2_Usable), + __memset_avx2_unaligned) IFUNC_IMPL_ADD (array, i, memset, HAS_ARCH_FEATURE (AVX2_Usable), - __memset_avx2) -#ifdef HAVE_AVX512_ASM_SUPPORT + __memset_avx2_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memset, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memset_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memset, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memset_avx512_unaligned) IFUNC_IMPL_ADD (array, i, memset, HAS_ARCH_FEATURE (AVX512F_Usable), __memset_avx512_no_vzeroupper) -#endif ) - /* Support sysdeps/x86_64/multiarch/stpncpy.S. */ + /* Support sysdeps/x86_64/multiarch/rawmemchr.c. */ + IFUNC_IMPL (i, name, rawmemchr, + IFUNC_IMPL_ADD (array, i, rawmemchr, + HAS_ARCH_FEATURE (AVX2_Usable), + __rawmemchr_avx2) + IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_sse2)) + + /* Support sysdeps/x86_64/multiarch/strlen.c. */ + IFUNC_IMPL (i, name, strlen, + IFUNC_IMPL_ADD (array, i, strlen, + HAS_ARCH_FEATURE (AVX2_Usable), + __strlen_avx2) + IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2)) + + /* Support sysdeps/x86_64/multiarch/strnlen.c. */ + IFUNC_IMPL (i, name, strnlen, + IFUNC_IMPL_ADD (array, i, strnlen, + HAS_ARCH_FEATURE (AVX2_Usable), + __strnlen_avx2) + IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2)) + + /* Support sysdeps/x86_64/multiarch/stpncpy.c. */ IFUNC_IMPL (i, name, stpncpy, IFUNC_IMPL_ADD (array, i, stpncpy, HAS_CPU_FEATURE (SSSE3), __stpncpy_ssse3) @@ -116,14 +203,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __stpncpy_sse2_unaligned) IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_sse2)) - /* Support sysdeps/x86_64/multiarch/stpcpy.S. */ + /* Support sysdeps/x86_64/multiarch/stpcpy.c. */ IFUNC_IMPL (i, name, stpcpy, IFUNC_IMPL_ADD (array, i, stpcpy, HAS_CPU_FEATURE (SSSE3), __stpcpy_ssse3) IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2_unaligned) IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2)) - /* Support sysdeps/x86_64/multiarch/strcasecmp_l.S. */ + /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */ IFUNC_IMPL (i, name, strcasecmp, IFUNC_IMPL_ADD (array, i, strcasecmp, HAS_ARCH_FEATURE (AVX_Usable), @@ -136,7 +223,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __strcasecmp_ssse3) IFUNC_IMPL_ADD (array, i, strcasecmp, 1, __strcasecmp_sse2)) - /* Support sysdeps/x86_64/multiarch/strcasecmp_l.S. */ + /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */ IFUNC_IMPL (i, name, strcasecmp_l, IFUNC_IMPL_ADD (array, i, strcasecmp_l, HAS_ARCH_FEATURE (AVX_Usable), @@ -150,20 +237,40 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strcasecmp_l, 1, __strcasecmp_l_sse2)) - /* Support sysdeps/x86_64/multiarch/strcat.S. */ + /* Support sysdeps/x86_64/multiarch/strcat.c. */ IFUNC_IMPL (i, name, strcat, IFUNC_IMPL_ADD (array, i, strcat, HAS_CPU_FEATURE (SSSE3), __strcat_ssse3) IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2_unaligned) IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2)) - /* Support sysdeps/x86_64/multiarch/strchr.S. */ + /* Support sysdeps/x86_64/multiarch/strchr.c. */ IFUNC_IMPL (i, name, strchr, + IFUNC_IMPL_ADD (array, i, strchr, + HAS_ARCH_FEATURE (AVX2_Usable), + __strchr_avx2) IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2_no_bsf) IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2)) - /* Support sysdeps/x86_64/multiarch/strcmp.S. */ + /* Support sysdeps/x86_64/multiarch/strchrnul.c. */ + IFUNC_IMPL (i, name, strchrnul, + IFUNC_IMPL_ADD (array, i, strchrnul, + HAS_ARCH_FEATURE (AVX2_Usable), + __strchrnul_avx2) + IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_sse2)) + + /* Support sysdeps/x86_64/multiarch/strrchr.c. */ + IFUNC_IMPL (i, name, strrchr, + IFUNC_IMPL_ADD (array, i, strrchr, + HAS_ARCH_FEATURE (AVX2_Usable), + __strrchr_avx2) + IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_sse2)) + + /* Support sysdeps/x86_64/multiarch/strcmp.c. */ IFUNC_IMPL (i, name, strcmp, + IFUNC_IMPL_ADD (array, i, strcmp, + HAS_ARCH_FEATURE (AVX2_Usable), + __strcmp_avx2) IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSE4_2), __strcmp_sse42) IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSSE3), @@ -171,20 +278,20 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2_unaligned) IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2)) - /* Support sysdeps/x86_64/multiarch/strcpy.S. */ + /* Support sysdeps/x86_64/multiarch/strcpy.c. */ IFUNC_IMPL (i, name, strcpy, IFUNC_IMPL_ADD (array, i, strcpy, HAS_CPU_FEATURE (SSSE3), __strcpy_ssse3) IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned) IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2)) - /* Support sysdeps/x86_64/multiarch/strcspn.S. */ + /* Support sysdeps/x86_64/multiarch/strcspn.c. */ IFUNC_IMPL (i, name, strcspn, IFUNC_IMPL_ADD (array, i, strcspn, HAS_CPU_FEATURE (SSE4_2), __strcspn_sse42) IFUNC_IMPL_ADD (array, i, strcspn, 1, __strcspn_sse2)) - /* Support sysdeps/x86_64/multiarch/strncase_l.S. */ + /* Support sysdeps/x86_64/multiarch/strncase_l.c. */ IFUNC_IMPL (i, name, strncasecmp, IFUNC_IMPL_ADD (array, i, strncasecmp, HAS_ARCH_FEATURE (AVX_Usable), @@ -198,7 +305,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strncasecmp, 1, __strncasecmp_sse2)) - /* Support sysdeps/x86_64/multiarch/strncase_l.S. */ + /* Support sysdeps/x86_64/multiarch/strncase_l.c. */ IFUNC_IMPL (i, name, strncasecmp_l, IFUNC_IMPL_ADD (array, i, strncasecmp_l, HAS_ARCH_FEATURE (AVX_Usable), @@ -212,7 +319,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strncasecmp_l, 1, __strncasecmp_l_sse2)) - /* Support sysdeps/x86_64/multiarch/strncat.S. */ + /* Support sysdeps/x86_64/multiarch/strncat.c. */ IFUNC_IMPL (i, name, strncat, IFUNC_IMPL_ADD (array, i, strncat, HAS_CPU_FEATURE (SSSE3), __strncat_ssse3) @@ -220,7 +327,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __strncat_sse2_unaligned) IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_sse2)) - /* Support sysdeps/x86_64/multiarch/strncpy.S. */ + /* Support sysdeps/x86_64/multiarch/strncpy.c. */ IFUNC_IMPL (i, name, strncpy, IFUNC_IMPL_ADD (array, i, strncpy, HAS_CPU_FEATURE (SSSE3), __strncpy_ssse3) @@ -228,14 +335,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __strncpy_sse2_unaligned) IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2)) - /* Support sysdeps/x86_64/multiarch/strpbrk.S. */ + /* Support sysdeps/x86_64/multiarch/strpbrk.c. */ IFUNC_IMPL (i, name, strpbrk, IFUNC_IMPL_ADD (array, i, strpbrk, HAS_CPU_FEATURE (SSE4_2), __strpbrk_sse42) IFUNC_IMPL_ADD (array, i, strpbrk, 1, __strpbrk_sse2)) - /* Support sysdeps/x86_64/multiarch/strspn.S. */ + /* Support sysdeps/x86_64/multiarch/strspn.c. */ IFUNC_IMPL (i, name, strspn, IFUNC_IMPL_ADD (array, i, strspn, HAS_CPU_FEATURE (SSE4_2), __strspn_sse42) @@ -246,99 +353,226 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned) IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2)) - /* Support sysdeps/x86_64/multiarch/wcscpy.S. */ + /* Support sysdeps/x86_64/multiarch/wcschr.c. */ + IFUNC_IMPL (i, name, wcschr, + IFUNC_IMPL_ADD (array, i, wcschr, + HAS_ARCH_FEATURE (AVX2_Usable), + __wcschr_avx2) + IFUNC_IMPL_ADD (array, i, wcschr, 1, __wcschr_sse2)) + + /* Support sysdeps/x86_64/multiarch/wcsrchr.c. */ + IFUNC_IMPL (i, name, wcsrchr, + IFUNC_IMPL_ADD (array, i, wcsrchr, + HAS_ARCH_FEATURE (AVX2_Usable), + __wcsrchr_avx2) + IFUNC_IMPL_ADD (array, i, wcsrchr, 1, __wcsrchr_sse2)) + + /* Support sysdeps/x86_64/multiarch/wcscmp.c. */ + IFUNC_IMPL (i, name, wcscmp, + IFUNC_IMPL_ADD (array, i, wcscmp, + HAS_ARCH_FEATURE (AVX2_Usable), + __wcscmp_avx2) + IFUNC_IMPL_ADD (array, i, wcscmp, 1, __wcscmp_sse2)) + + /* Support sysdeps/x86_64/multiarch/wcsncmp.c. */ + IFUNC_IMPL (i, name, wcsncmp, + IFUNC_IMPL_ADD (array, i, wcsncmp, + HAS_ARCH_FEATURE (AVX2_Usable), + __wcsncmp_avx2) + IFUNC_IMPL_ADD (array, i, wcsncmp, 1, __wcsncmp_sse2)) + + /* Support sysdeps/x86_64/multiarch/wcscpy.c. */ IFUNC_IMPL (i, name, wcscpy, IFUNC_IMPL_ADD (array, i, wcscpy, HAS_CPU_FEATURE (SSSE3), __wcscpy_ssse3) IFUNC_IMPL_ADD (array, i, wcscpy, 1, __wcscpy_sse2)) - /* Support sysdeps/x86_64/multiarch/wmemcmp.S. */ + /* Support sysdeps/x86_64/multiarch/wcslen.c. */ + IFUNC_IMPL (i, name, wcslen, + IFUNC_IMPL_ADD (array, i, wcslen, + HAS_ARCH_FEATURE (AVX2_Usable), + __wcslen_avx2) + IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_sse2)) + + /* Support sysdeps/x86_64/multiarch/wcsnlen.c. */ + IFUNC_IMPL (i, name, wcsnlen, + IFUNC_IMPL_ADD (array, i, wcsnlen, + HAS_ARCH_FEATURE (AVX2_Usable), + __wcsnlen_avx2) + IFUNC_IMPL_ADD (array, i, wcsnlen, + HAS_CPU_FEATURE (SSE4_1), + __wcsnlen_sse4_1) + IFUNC_IMPL_ADD (array, i, wcsnlen, 1, __wcsnlen_sse2)) + + /* Support sysdeps/x86_64/multiarch/wmemchr.c. */ + IFUNC_IMPL (i, name, wmemchr, + IFUNC_IMPL_ADD (array, i, wmemchr, + HAS_ARCH_FEATURE (AVX2_Usable), + __wmemchr_avx2) + IFUNC_IMPL_ADD (array, i, wmemchr, 1, __wmemchr_sse2)) + + /* Support sysdeps/x86_64/multiarch/wmemcmp.c. */ IFUNC_IMPL (i, name, wmemcmp, + IFUNC_IMPL_ADD (array, i, wmemcmp, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (MOVBE)), + __wmemcmp_avx2_movbe) IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSE4_1), __wmemcmp_sse4_1) IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSSE3), __wmemcmp_ssse3) IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2)) + /* Support sysdeps/x86_64/multiarch/wmemset.c. */ + IFUNC_IMPL (i, name, wmemset, + IFUNC_IMPL_ADD (array, i, wmemset, 1, + __wmemset_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, wmemset, + HAS_ARCH_FEATURE (AVX2_Usable), + __wmemset_avx2_unaligned) + IFUNC_IMPL_ADD (array, i, wmemset, + HAS_ARCH_FEATURE (AVX512F_Usable), + __wmemset_avx512_unaligned)) + #ifdef SHARED - /* Support sysdeps/x86_64/multiarch/memcpy_chk.S. */ + /* Support sysdeps/x86_64/multiarch/memcpy_chk.c. */ IFUNC_IMPL (i, name, __memcpy_chk, -#ifdef HAVE_AVX512_ASM_SUPPORT IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_ARCH_FEATURE (AVX512F_Usable), __memcpy_chk_avx512_no_vzeroupper) -#endif + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memcpy_chk_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memcpy_chk_avx512_unaligned_erms) IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_ARCH_FEATURE (AVX_Usable), __memcpy_chk_avx_unaligned) IFUNC_IMPL_ADD (array, i, __memcpy_chk, + HAS_ARCH_FEATURE (AVX_Usable), + __memcpy_chk_avx_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_CPU_FEATURE (SSSE3), __memcpy_chk_ssse3_back) IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_CPU_FEATURE (SSSE3), __memcpy_chk_ssse3) IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1, - __memcpy_chk_sse2)) + __memcpy_chk_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1, + __memcpy_chk_sse2_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1, + __memcpy_chk_erms)) +#endif - /* Support sysdeps/x86_64/multiarch/memcpy.S. */ + /* Support sysdeps/x86_64/multiarch/memcpy.c. */ IFUNC_IMPL (i, name, memcpy, IFUNC_IMPL_ADD (array, i, memcpy, HAS_ARCH_FEATURE (AVX_Usable), __memcpy_avx_unaligned) + IFUNC_IMPL_ADD (array, i, memcpy, + HAS_ARCH_FEATURE (AVX_Usable), + __memcpy_avx_unaligned_erms) IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3), __memcpy_ssse3_back) IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3), __memcpy_ssse3) -#ifdef HAVE_AVX512_ASM_SUPPORT IFUNC_IMPL_ADD (array, i, memcpy, HAS_ARCH_FEATURE (AVX512F_Usable), __memcpy_avx512_no_vzeroupper) -#endif + IFUNC_IMPL_ADD (array, i, memcpy, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memcpy_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, memcpy, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memcpy_avx512_unaligned_erms) IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned) - IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2)) + IFUNC_IMPL_ADD (array, i, memcpy, 1, + __memcpy_sse2_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_erms)) - /* Support sysdeps/x86_64/multiarch/mempcpy_chk.S. */ +#ifdef SHARED + /* Support sysdeps/x86_64/multiarch/mempcpy_chk.c. */ IFUNC_IMPL (i, name, __mempcpy_chk, -#ifdef HAVE_AVX512_ASM_SUPPORT IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_ARCH_FEATURE (AVX512F_Usable), __mempcpy_chk_avx512_no_vzeroupper) -#endif + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __mempcpy_chk_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __mempcpy_chk_avx512_unaligned_erms) IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_ARCH_FEATURE (AVX_Usable), __mempcpy_chk_avx_unaligned) IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + HAS_ARCH_FEATURE (AVX_Usable), + __mempcpy_chk_avx_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_CPU_FEATURE (SSSE3), __mempcpy_chk_ssse3_back) IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_CPU_FEATURE (SSSE3), __mempcpy_chk_ssse3) IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1, - __mempcpy_chk_sse2)) + __mempcpy_chk_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1, + __mempcpy_chk_sse2_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1, + __mempcpy_chk_erms)) +#endif - /* Support sysdeps/x86_64/multiarch/mempcpy.S. */ + /* Support sysdeps/x86_64/multiarch/mempcpy.c. */ IFUNC_IMPL (i, name, mempcpy, -#ifdef HAVE_AVX512_ASM_SUPPORT IFUNC_IMPL_ADD (array, i, mempcpy, HAS_ARCH_FEATURE (AVX512F_Usable), __mempcpy_avx512_no_vzeroupper) -#endif + IFUNC_IMPL_ADD (array, i, mempcpy, + HAS_ARCH_FEATURE (AVX512F_Usable), + __mempcpy_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, mempcpy, + HAS_ARCH_FEATURE (AVX512F_Usable), + __mempcpy_avx512_unaligned_erms) IFUNC_IMPL_ADD (array, i, mempcpy, HAS_ARCH_FEATURE (AVX_Usable), __mempcpy_avx_unaligned) + IFUNC_IMPL_ADD (array, i, mempcpy, + HAS_ARCH_FEATURE (AVX_Usable), + __mempcpy_avx_unaligned_erms) IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3), __mempcpy_ssse3_back) IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3), __mempcpy_ssse3) - IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_sse2)) + IFUNC_IMPL_ADD (array, i, mempcpy, 1, + __mempcpy_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, mempcpy, 1, + __mempcpy_sse2_unaligned_erms) + IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_erms)) - /* Support sysdeps/x86_64/multiarch/strncmp.S. */ + /* Support sysdeps/x86_64/multiarch/strncmp.c. */ IFUNC_IMPL (i, name, strncmp, + IFUNC_IMPL_ADD (array, i, strncmp, + HAS_ARCH_FEATURE (AVX2_Usable), + __strncmp_avx2) IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSE4_2), __strncmp_sse42) IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSSE3), __strncmp_ssse3) IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_sse2)) + +#ifdef SHARED + /* Support sysdeps/x86_64/multiarch/wmemset_chk.c. */ + IFUNC_IMPL (i, name, __wmemset_chk, + IFUNC_IMPL_ADD (array, i, __wmemset_chk, 1, + __wmemset_chk_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, __wmemset_chk, + HAS_ARCH_FEATURE (AVX2_Usable), + __wmemset_chk_avx2_unaligned) + IFUNC_IMPL_ADD (array, i, __wmemset_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __wmemset_chk_avx512_unaligned)) #endif return i; diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h new file mode 100644 index 0000000000..bf5ab8eb7f --- /dev/null +++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h @@ -0,0 +1,45 @@ +/* Common definition for memcmp/wmemcmp ifunc selections. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +# include <init-arch.h> + +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden; + +static inline void * +IFUNC_SELECTOR (void) +{ + const struct cpu_features* cpu_features = __get_cpu_features (); + + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) + && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable) + && CPU_FEATURES_CPU_P (cpu_features, MOVBE) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) + return OPTIMIZE (avx2_movbe); + + if (CPU_FEATURES_CPU_P (cpu_features, SSE4_1)) + return OPTIMIZE (sse4_1); + + if (CPU_FEATURES_CPU_P (cpu_features, SSSE3)) + return OPTIMIZE (ssse3); + + return OPTIMIZE (sse2); +} diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h new file mode 100644 index 0000000000..5b1eb1c92c --- /dev/null +++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h @@ -0,0 +1,81 @@ +/* Common definition for memcpy, mempcpy and memmove implementation. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <init-arch.h> + +extern __typeof (REDIRECT_NAME) OPTIMIZE (erms) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) + attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms) + attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms) + attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) + attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms) + attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_no_vzeroupper) + attribute_hidden; + +static inline void * +IFUNC_SELECTOR (void) +{ + const struct cpu_features* cpu_features = __get_cpu_features (); + + if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_ERMS) + || CPU_FEATURES_ARCH_P (cpu_features, Prefer_FSRM)) + return OPTIMIZE (erms); + + if (CPU_FEATURES_ARCH_P (cpu_features, AVX512F_Usable) + && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)) + { + if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx512_no_vzeroupper); + + if (CPU_FEATURES_CPU_P (cpu_features, ERMS)) + return OPTIMIZE (avx512_unaligned_erms); + + return OPTIMIZE (avx512_unaligned); + } + + if (CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) + { + if (CPU_FEATURES_CPU_P (cpu_features, ERMS)) + return OPTIMIZE (avx_unaligned_erms); + + return OPTIMIZE (avx_unaligned); + } + + if (!CPU_FEATURES_CPU_P (cpu_features, SSSE3) + || CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy)) + { + if (CPU_FEATURES_CPU_P (cpu_features, ERMS)) + return OPTIMIZE (sse2_unaligned_erms); + + return OPTIMIZE (sse2_unaligned); + } + + if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Copy_Backward)) + return OPTIMIZE (ssse3_back); + + return OPTIMIZE (ssse3); +} diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h new file mode 100644 index 0000000000..19b5ae676c --- /dev/null +++ b/sysdeps/x86_64/multiarch/ifunc-memset.h @@ -0,0 +1,69 @@ +/* Common definition for memset/memset_chk ifunc selections. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <init-arch.h> + +extern __typeof (REDIRECT_NAME) OPTIMIZE (erms) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) + attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms) + attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms) + attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) + attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms) + attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_no_vzeroupper) + attribute_hidden; + +static inline void * +IFUNC_SELECTOR (void) +{ + const struct cpu_features* cpu_features = __get_cpu_features (); + + if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_ERMS)) + return OPTIMIZE (erms); + + if (CPU_FEATURES_ARCH_P (cpu_features, AVX512F_Usable) + && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)) + { + if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx512_no_vzeroupper); + + if (CPU_FEATURES_CPU_P (cpu_features, ERMS)) + return OPTIMIZE (avx512_unaligned_erms); + + return OPTIMIZE (avx512_unaligned); + } + + if (CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)) + { + if (CPU_FEATURES_CPU_P (cpu_features, ERMS)) + return OPTIMIZE (avx2_unaligned_erms); + else + return OPTIMIZE (avx2_unaligned); + } + + if (CPU_FEATURES_CPU_P (cpu_features, ERMS)) + return OPTIMIZE (sse2_unaligned_erms); + + return OPTIMIZE (sse2_unaligned); +} diff --git a/sysdeps/x86_64/multiarch/ifunc-sse4_2.h b/sysdeps/x86_64/multiarch/ifunc-sse4_2.h new file mode 100644 index 0000000000..f2b791cccf --- /dev/null +++ b/sysdeps/x86_64/multiarch/ifunc-sse4_2.h @@ -0,0 +1,34 @@ +/* Common definition for ifunc selections optimized with SSE2 and SSE4.2. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <init-arch.h> + +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden; + +static inline void * +IFUNC_SELECTOR (void) +{ + const struct cpu_features* cpu_features = __get_cpu_features (); + + if (CPU_FEATURES_CPU_P (cpu_features, SSE4_2)) + return OPTIMIZE (sse42); + + return OPTIMIZE (sse2); +} diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h new file mode 100644 index 0000000000..1ca170b663 --- /dev/null +++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h @@ -0,0 +1,43 @@ +/* Common definition for strcasecmp famly ifunc selections. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <init-arch.h> + +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden; + +static inline void * +IFUNC_SELECTOR (void) +{ + const struct cpu_features* cpu_features = __get_cpu_features (); + + if (CPU_FEATURES_ARCH_P (cpu_features, AVX_Usable)) + return OPTIMIZE (avx); + + if (CPU_FEATURES_CPU_P (cpu_features, SSE4_2) + && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2)) + return OPTIMIZE (sse42); + + if (CPU_FEATURES_CPU_P (cpu_features, SSSE3)) + return OPTIMIZE (ssse3); + + return OPTIMIZE (sse2); +} diff --git a/sysdeps/x86_64/multiarch/ifunc-unaligned-ssse3.h b/sysdeps/x86_64/multiarch/ifunc-unaligned-ssse3.h new file mode 100644 index 0000000000..81805f9832 --- /dev/null +++ b/sysdeps/x86_64/multiarch/ifunc-unaligned-ssse3.h @@ -0,0 +1,40 @@ +/* Common definition for ifunc selections optimized with SSE2, unaligned + SSE2 and SSSE3. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <init-arch.h> + +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) + attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; + +static inline void * +IFUNC_SELECTOR (void) +{ + const struct cpu_features* cpu_features = __get_cpu_features (); + + if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load)) + return OPTIMIZE (sse2_unaligned); + + if (CPU_FEATURES_CPU_P (cpu_features, SSSE3)) + return OPTIMIZE (ssse3); + + return OPTIMIZE (sse2); +} diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h new file mode 100644 index 0000000000..2f1085f5fc --- /dev/null +++ b/sysdeps/x86_64/multiarch/ifunc-wmemset.h @@ -0,0 +1,42 @@ +/* Common definition for wmemset/wmemset_chk ifunc selections. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <init-arch.h> + +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden; + +static inline void * +IFUNC_SELECTOR (void) +{ + const struct cpu_features* cpu_features = __get_cpu_features (); + + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) + && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) + { + if (CPU_FEATURES_ARCH_P (cpu_features, AVX512F_Usable) + && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)) + return OPTIMIZE (avx512_unaligned); + else + return OPTIMIZE (avx2_unaligned); + } + + return OPTIMIZE (sse2_unaligned); +} diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S new file mode 100644 index 0000000000..5f5e772554 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S @@ -0,0 +1,340 @@ +/* memchr/wmemchr optimized with AVX2. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# ifndef MEMCHR +# define MEMCHR __memchr_avx2 +# endif + +# ifdef USE_AS_WMEMCHR +# define VPCMPEQ vpcmpeqd +# else +# define VPCMPEQ vpcmpeqb +# endif + +# ifndef VZEROUPPER +# define VZEROUPPER vzeroupper +# endif + +# define VEC_SIZE 32 + + .section .text.avx,"ax",@progbits +ENTRY (MEMCHR) +# ifndef USE_AS_RAWMEMCHR + /* Check for zero length. */ + testq %rdx, %rdx + jz L(null) +# endif + movl %edi, %ecx + /* Broadcast CHAR to YMM0. */ + vmovd %esi, %xmm0 +# ifdef USE_AS_WMEMCHR + shl $2, %rdx + vpbroadcastd %xmm0, %ymm0 +# else + vpbroadcastb %xmm0, %ymm0 +# endif + /* Check if we may cross page boundary with one vector load. */ + andl $(2 * VEC_SIZE - 1), %ecx + cmpl $VEC_SIZE, %ecx + ja L(cros_page_boundary) + + /* Check the first VEC_SIZE bytes. */ + VPCMPEQ (%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + +# ifndef USE_AS_RAWMEMCHR + jnz L(first_vec_x0_check) + /* Adjust length and check the end of data. */ + subq $VEC_SIZE, %rdx + jbe L(zero) +# else + jnz L(first_vec_x0) +# endif + + /* Align data for aligned loads in the loop. */ + addq $VEC_SIZE, %rdi + andl $(VEC_SIZE - 1), %ecx + andq $-VEC_SIZE, %rdi + +# ifndef USE_AS_RAWMEMCHR + /* Adjust length. */ + addq %rcx, %rdx + + subq $(VEC_SIZE * 4), %rdx + jbe L(last_4x_vec_or_less) +# endif + jmp L(more_4x_vec) + + .p2align 4 +L(cros_page_boundary): + andl $(VEC_SIZE - 1), %ecx + andq $-VEC_SIZE, %rdi + VPCMPEQ (%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + /* Remove the leading bytes. */ + sarl %cl, %eax + testl %eax, %eax + jz L(aligned_more) + tzcntl %eax, %eax +# ifndef USE_AS_RAWMEMCHR + /* Check the end of data. */ + cmpq %rax, %rdx + jbe L(zero) +# endif + addq %rdi, %rax + addq %rcx, %rax + VZEROUPPER + ret + + .p2align 4 +L(aligned_more): +# ifndef USE_AS_RAWMEMCHR + /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)" + instead of "(rdx + rcx) - VEC_SIZE" to void possible addition + overflow. */ + negq %rcx + addq $VEC_SIZE, %rcx + + /* Check the end of data. */ + subq %rcx, %rdx + jbe L(zero) +# endif + + addq $VEC_SIZE, %rdi + +# ifndef USE_AS_RAWMEMCHR + subq $(VEC_SIZE * 4), %rdx + jbe L(last_4x_vec_or_less) +# endif + +L(more_4x_vec): + /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time + since data is only aligned to VEC_SIZE. */ + VPCMPEQ (%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x0) + + VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x1) + + VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x2) + + VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x3) + + addq $(VEC_SIZE * 4), %rdi + +# ifndef USE_AS_RAWMEMCHR + subq $(VEC_SIZE * 4), %rdx + jbe L(last_4x_vec_or_less) +# endif + + /* Align data to 4 * VEC_SIZE. */ + movq %rdi, %rcx + andl $(4 * VEC_SIZE - 1), %ecx + andq $-(4 * VEC_SIZE), %rdi + +# ifndef USE_AS_RAWMEMCHR + /* Adjust length. */ + addq %rcx, %rdx +# endif + + .p2align 4 +L(loop_4x_vec): + /* Compare 4 * VEC at a time forward. */ + VPCMPEQ (%rdi), %ymm0, %ymm1 + VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2 + VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3 + VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4 + + vpor %ymm1, %ymm2, %ymm5 + vpor %ymm3, %ymm4, %ymm6 + vpor %ymm5, %ymm6, %ymm5 + + vpmovmskb %ymm5, %eax + testl %eax, %eax + jnz L(4x_vec_end) + + addq $(VEC_SIZE * 4), %rdi + +# ifdef USE_AS_RAWMEMCHR + jmp L(loop_4x_vec) +# else + subq $(VEC_SIZE * 4), %rdx + ja L(loop_4x_vec) + +L(last_4x_vec_or_less): + /* Less than 4 * VEC and aligned to VEC_SIZE. */ + addl $(VEC_SIZE * 2), %edx + jle L(last_2x_vec) + + VPCMPEQ (%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x0) + + VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x1) + + VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + + jnz L(first_vec_x2_check) + subl $VEC_SIZE, %edx + jle L(zero) + + VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + + jnz L(first_vec_x3_check) + xorl %eax, %eax + VZEROUPPER + ret + + .p2align 4 +L(last_2x_vec): + addl $(VEC_SIZE * 2), %edx + VPCMPEQ (%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + + jnz L(first_vec_x0_check) + subl $VEC_SIZE, %edx + jle L(zero) + + VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x1_check) + xorl %eax, %eax + VZEROUPPER + ret + + .p2align 4 +L(first_vec_x0_check): + tzcntl %eax, %eax + /* Check the end of data. */ + cmpq %rax, %rdx + jbe L(zero) + addq %rdi, %rax + VZEROUPPER + ret + + .p2align 4 +L(first_vec_x1_check): + tzcntl %eax, %eax + /* Check the end of data. */ + cmpq %rax, %rdx + jbe L(zero) + addq $VEC_SIZE, %rax + addq %rdi, %rax + VZEROUPPER + ret + + .p2align 4 +L(first_vec_x2_check): + tzcntl %eax, %eax + /* Check the end of data. */ + cmpq %rax, %rdx + jbe L(zero) + addq $(VEC_SIZE * 2), %rax + addq %rdi, %rax + VZEROUPPER + ret + + .p2align 4 +L(first_vec_x3_check): + tzcntl %eax, %eax + /* Check the end of data. */ + cmpq %rax, %rdx + jbe L(zero) + addq $(VEC_SIZE * 3), %rax + addq %rdi, %rax + VZEROUPPER + ret + + .p2align 4 +L(zero): + VZEROUPPER +L(null): + xorl %eax, %eax + ret +# endif + + .p2align 4 +L(first_vec_x0): + tzcntl %eax, %eax + addq %rdi, %rax + VZEROUPPER + ret + + .p2align 4 +L(first_vec_x1): + tzcntl %eax, %eax + addq $VEC_SIZE, %rax + addq %rdi, %rax + VZEROUPPER + ret + + .p2align 4 +L(first_vec_x2): + tzcntl %eax, %eax + addq $(VEC_SIZE * 2), %rax + addq %rdi, %rax + VZEROUPPER + ret + + .p2align 4 +L(4x_vec_end): + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x0) + vpmovmskb %ymm2, %eax + testl %eax, %eax + jnz L(first_vec_x1) + vpmovmskb %ymm3, %eax + testl %eax, %eax + jnz L(first_vec_x2) + vpmovmskb %ymm4, %eax + testl %eax, %eax +L(first_vec_x3): + tzcntl %eax, %eax + addq $(VEC_SIZE * 3), %rax + addq %rdi, %rax + VZEROUPPER + ret + +END (MEMCHR) +#endif diff --git a/sysdeps/x86_64/multiarch/memchr-sse2.S b/sysdeps/x86_64/multiarch/memchr-sse2.S new file mode 100644 index 0000000000..8a5e7fd1c5 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memchr-sse2.S @@ -0,0 +1,28 @@ +/* memchr optimized with SSE2. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) +# define memchr __memchr_sse2 + +# undef strong_alias +# define strong_alias(memchr, __memchr) +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(memchr) +#endif + +#include "../memchr.S" diff --git a/sysdeps/x86_64/multiarch/wcscpy.S b/sysdeps/x86_64/multiarch/memchr.c index 8e7270b9c7..016f57846a 100644 --- a/sysdeps/x86_64/multiarch/wcscpy.S +++ b/sysdeps/x86_64/multiarch/memchr.c @@ -1,7 +1,6 @@ -/* Multiple versions of wcscpy +/* Multiple versions of memchr All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2011-2016 Free Software Foundation, Inc. - Contributed by Intel Corporation. + Copyright (C) 2017-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -18,23 +17,19 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#include <sysdep.h> -#include <init-arch.h> - /* Define multiple versions only for the definition in libc. */ #if IS_IN (libc) - - .text -ENTRY(wcscpy) - .type wcscpy, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - HAS_CPU_FEATURE (SSSE3) - jnz 2f - leaq __wcscpy_sse2(%rip), %rax - ret - -2: leaq __wcscpy_ssse3(%rip), %rax - ret - -END(wcscpy) +# define memchr __redirect_memchr +# include <string.h> +# undef memchr + +# define SYMBOL_NAME memchr +# include "ifunc-avx2.h" + +libc_ifunc_redirected (__redirect_memchr, memchr, IFUNC_SELECTOR ()); +strong_alias (memchr, __memchr) +# ifdef SHARED +__hidden_ver1 (memchr, __GI_memchr, __redirect_memchr) + __attribute__((visibility ("hidden"))); +# endif #endif diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S new file mode 100644 index 0000000000..30f764c393 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S @@ -0,0 +1,429 @@ +/* memcmp/wmemcmp optimized with AVX2. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +/* memcmp/wmemcmp is implemented as: + 1. For size from 2 to 7 bytes, load as big endian with movbe and bswap + to avoid branches. + 2. Use overlapping compare to avoid branch. + 3. Use vector compare when size >= 4 bytes for memcmp or size >= 8 + bytes for wmemcmp. + 4. If size is 8 * VEC_SIZE or less, unroll the loop. + 5. Compare 4 * VEC_SIZE at a time with the aligned first memory + area. + 6. Use 2 vector compares when size is 2 * VEC_SIZE or less. + 7. Use 4 vector compares when size is 4 * VEC_SIZE or less. + 8. Use 8 vector compares when size is 8 * VEC_SIZE or less. */ + +# include <sysdep.h> + +# ifndef MEMCMP +# define MEMCMP __memcmp_avx2_movbe +# endif + +# ifdef USE_AS_WMEMCMP +# define VPCMPEQ vpcmpeqd +# else +# define VPCMPEQ vpcmpeqb +# endif + +# ifndef VZEROUPPER +# define VZEROUPPER vzeroupper +# endif + +# define VEC_SIZE 32 +# define VEC_MASK ((1 << VEC_SIZE) - 1) + +/* Warning! + wmemcmp has to use SIGNED comparison for elements. + memcmp has to use UNSIGNED comparison for elemnts. +*/ + + .section .text.avx,"ax",@progbits +ENTRY (MEMCMP) +# ifdef USE_AS_WMEMCMP + shl $2, %rdx +# endif + cmpq $VEC_SIZE, %rdx + jb L(less_vec) + + /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ + vmovdqu (%rsi), %ymm2 + VPCMPEQ (%rdi), %ymm2, %ymm2 + vpmovmskb %ymm2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + + cmpq $(VEC_SIZE * 2), %rdx + jbe L(last_vec) + + VPCMPEQ %ymm0, %ymm0, %ymm0 + /* More than 2 * VEC. */ + cmpq $(VEC_SIZE * 8), %rdx + ja L(more_8x_vec) + cmpq $(VEC_SIZE * 4), %rdx + jb L(last_4x_vec) + + /* From 4 * VEC to 8 * VEC, inclusively. */ + vmovdqu (%rsi), %ymm1 + VPCMPEQ (%rdi), %ymm1, %ymm1 + + vmovdqu VEC_SIZE(%rsi), %ymm2 + VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 + + vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3 + VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 + + vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4 + VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 + + vpand %ymm1, %ymm2, %ymm5 + vpand %ymm3, %ymm4, %ymm6 + vpand %ymm5, %ymm6, %ymm5 + + vptest %ymm0, %ymm5 + jnc L(4x_vec_end) + + leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi + leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi + vmovdqu (%rsi), %ymm1 + VPCMPEQ (%rdi), %ymm1, %ymm1 + + vmovdqu VEC_SIZE(%rsi), %ymm2 + VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 + vpand %ymm2, %ymm1, %ymm5 + + vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3 + VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 + vpand %ymm3, %ymm5, %ymm5 + + vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4 + VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 + vpand %ymm4, %ymm5, %ymm5 + + vptest %ymm0, %ymm5 + jnc L(4x_vec_end) + xorl %eax, %eax + VZEROUPPER + ret + + .p2align 4 +L(last_2x_vec): + /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ + vmovdqu (%rsi), %ymm2 + VPCMPEQ (%rdi), %ymm2, %ymm2 + vpmovmskb %ymm2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + +L(last_vec): + /* Use overlapping loads to avoid branches. */ + leaq -VEC_SIZE(%rdi, %rdx), %rdi + leaq -VEC_SIZE(%rsi, %rdx), %rsi + vmovdqu (%rsi), %ymm2 + VPCMPEQ (%rdi), %ymm2, %ymm2 + vpmovmskb %ymm2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + VZEROUPPER + ret + + .p2align 4 +L(first_vec): + /* A byte or int32 is different within 16 or 32 bytes. */ + tzcntl %eax, %ecx +# ifdef USE_AS_WMEMCMP + xorl %eax, %eax + movl (%rdi, %rcx), %edx + cmpl (%rsi, %rcx), %edx +L(wmemcmp_return): + setl %al + negl %eax + orl $1, %eax +# else + movzbl (%rdi, %rcx), %eax + movzbl (%rsi, %rcx), %edx + sub %edx, %eax +# endif + VZEROUPPER + ret + +# ifdef USE_AS_WMEMCMP + .p2align 4 +L(4): + xorl %eax, %eax + movl (%rdi), %edx + cmpl (%rsi), %edx + jne L(wmemcmp_return) + ret +# else + .p2align 4 +L(between_4_7): + /* Load as big endian with overlapping movbe to avoid branches. */ + movbe (%rdi), %eax + movbe (%rsi), %ecx + shlq $32, %rax + shlq $32, %rcx + movbe -4(%rdi, %rdx), %edi + movbe -4(%rsi, %rdx), %esi + orq %rdi, %rax + orq %rsi, %rcx + subq %rcx, %rax + je L(exit) + sbbl %eax, %eax + orl $1, %eax + ret + + .p2align 4 +L(exit): + ret + + .p2align 4 +L(between_2_3): + /* Load as big endian to avoid branches. */ + movzwl (%rdi), %eax + movzwl (%rsi), %ecx + shll $8, %eax + shll $8, %ecx + bswap %eax + bswap %ecx + movb -1(%rdi, %rdx), %al + movb -1(%rsi, %rdx), %cl + /* Subtraction is okay because the upper 8 bits are zero. */ + subl %ecx, %eax + ret + + .p2align 4 +L(1): + movzbl (%rdi), %eax + movzbl (%rsi), %ecx + subl %ecx, %eax + ret +# endif + + .p2align 4 +L(zero): + xorl %eax, %eax + ret + + .p2align 4 +L(less_vec): +# ifdef USE_AS_WMEMCMP + /* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes. */ + cmpb $4, %dl + je L(4) + jb L(zero) +# else + cmpb $1, %dl + je L(1) + jb L(zero) + cmpb $4, %dl + jb L(between_2_3) + cmpb $8, %dl + jb L(between_4_7) +# endif + cmpb $16, %dl + jae L(between_16_31) + /* It is between 8 and 15 bytes. */ + vmovq (%rdi), %xmm1 + vmovq (%rsi), %xmm2 + VPCMPEQ %xmm1, %xmm2, %xmm2 + vpmovmskb %xmm2, %eax + subl $0xffff, %eax + jnz L(first_vec) + /* Use overlapping loads to avoid branches. */ + leaq -8(%rdi, %rdx), %rdi + leaq -8(%rsi, %rdx), %rsi + vmovq (%rdi), %xmm1 + vmovq (%rsi), %xmm2 + VPCMPEQ %xmm1, %xmm2, %xmm2 + vpmovmskb %xmm2, %eax + subl $0xffff, %eax + jnz L(first_vec) + ret + + .p2align 4 +L(between_16_31): + /* From 16 to 31 bytes. No branch when size == 16. */ + vmovdqu (%rsi), %xmm2 + VPCMPEQ (%rdi), %xmm2, %xmm2 + vpmovmskb %xmm2, %eax + subl $0xffff, %eax + jnz L(first_vec) + + /* Use overlapping loads to avoid branches. */ + leaq -16(%rdi, %rdx), %rdi + leaq -16(%rsi, %rdx), %rsi + vmovdqu (%rsi), %xmm2 + VPCMPEQ (%rdi), %xmm2, %xmm2 + vpmovmskb %xmm2, %eax + subl $0xffff, %eax + jnz L(first_vec) + ret + + .p2align 4 +L(more_8x_vec): + /* More than 8 * VEC. Check the first VEC. */ + vmovdqu (%rsi), %ymm2 + VPCMPEQ (%rdi), %ymm2, %ymm2 + vpmovmskb %ymm2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + + /* Align the first memory area for aligned loads in the loop. + Compute how much the first memory area is misaligned. */ + movq %rdi, %rcx + andl $(VEC_SIZE - 1), %ecx + /* Get the negative of offset for alignment. */ + subq $VEC_SIZE, %rcx + /* Adjust the second memory area. */ + subq %rcx, %rsi + /* Adjust the first memory area which should be aligned now. */ + subq %rcx, %rdi + /* Adjust length. */ + addq %rcx, %rdx + +L(loop_4x_vec): + /* Compare 4 * VEC at a time forward. */ + vmovdqu (%rsi), %ymm1 + VPCMPEQ (%rdi), %ymm1, %ymm1 + + vmovdqu VEC_SIZE(%rsi), %ymm2 + VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 + vpand %ymm2, %ymm1, %ymm5 + + vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3 + VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 + vpand %ymm3, %ymm5, %ymm5 + + vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4 + VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 + vpand %ymm4, %ymm5, %ymm5 + + vptest %ymm0, %ymm5 + jnc L(4x_vec_end) + + addq $(VEC_SIZE * 4), %rdi + addq $(VEC_SIZE * 4), %rsi + + subq $(VEC_SIZE * 4), %rdx + cmpq $(VEC_SIZE * 4), %rdx + jae L(loop_4x_vec) + + /* Less than 4 * VEC. */ + cmpq $VEC_SIZE, %rdx + jbe L(last_vec) + cmpq $(VEC_SIZE * 2), %rdx + jbe L(last_2x_vec) + +L(last_4x_vec): + /* From 2 * VEC to 4 * VEC. */ + vmovdqu (%rsi), %ymm2 + VPCMPEQ (%rdi), %ymm2, %ymm2 + vpmovmskb %ymm2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + + addq $VEC_SIZE, %rdi + addq $VEC_SIZE, %rsi + vmovdqu (%rsi), %ymm2 + VPCMPEQ (%rdi), %ymm2, %ymm2 + vpmovmskb %ymm2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + + /* Use overlapping loads to avoid branches. */ + leaq -(3 * VEC_SIZE)(%rdi, %rdx), %rdi + leaq -(3 * VEC_SIZE)(%rsi, %rdx), %rsi + vmovdqu (%rsi), %ymm2 + VPCMPEQ (%rdi), %ymm2, %ymm2 + vpmovmskb %ymm2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + + addq $VEC_SIZE, %rdi + addq $VEC_SIZE, %rsi + vmovdqu (%rsi), %ymm2 + VPCMPEQ (%rdi), %ymm2, %ymm2 + vpmovmskb %ymm2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + VZEROUPPER + ret + + .p2align 4 +L(4x_vec_end): + vpmovmskb %ymm1, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + vpmovmskb %ymm2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec_x1) + vpmovmskb %ymm3, %eax + subl $VEC_MASK, %eax + jnz L(first_vec_x2) + vpmovmskb %ymm4, %eax + subl $VEC_MASK, %eax + tzcntl %eax, %ecx +# ifdef USE_AS_WMEMCMP + xorl %eax, %eax + movl (VEC_SIZE * 3)(%rdi, %rcx), %edx + cmpl (VEC_SIZE * 3)(%rsi, %rcx), %edx + jmp L(wmemcmp_return) +# else + movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax + movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx + sub %edx, %eax +# endif + VZEROUPPER + ret + + .p2align 4 +L(first_vec_x1): + tzcntl %eax, %ecx +# ifdef USE_AS_WMEMCMP + xorl %eax, %eax + movl VEC_SIZE(%rdi, %rcx), %edx + cmpl VEC_SIZE(%rsi, %rcx), %edx + jmp L(wmemcmp_return) +# else + movzbl VEC_SIZE(%rdi, %rcx), %eax + movzbl VEC_SIZE(%rsi, %rcx), %edx + sub %edx, %eax +# endif + VZEROUPPER + ret + + .p2align 4 +L(first_vec_x2): + tzcntl %eax, %ecx +# ifdef USE_AS_WMEMCMP + xorl %eax, %eax + movl (VEC_SIZE * 2)(%rdi, %rcx), %edx + cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx + jmp L(wmemcmp_return) +# else + movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax + movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx + sub %edx, %eax +# endif + VZEROUPPER + ret +END (MEMCMP) +#endif diff --git a/sysdeps/x86_64/multiarch/memcmp-sse2.S b/sysdeps/x86_64/multiarch/memcmp-sse2.S new file mode 100644 index 0000000000..6058aa751e --- /dev/null +++ b/sysdeps/x86_64/multiarch/memcmp-sse2.S @@ -0,0 +1,31 @@ +/* memcmp with SSE2. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) +# define memcmp __memcmp_sse2 + +# ifdef SHARED +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(name) +# endif + +# undef weak_alias +# define weak_alias(ignored1, ignored2) +#endif + +#include <sysdeps/x86_64/memcmp.S> diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S index 786f87282c..8e164f2cb6 100644 --- a/sysdeps/x86_64/multiarch/memcmp-sse4.S +++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S @@ -1,5 +1,5 @@ /* memcmp with SSE4.1, wmemcmp with SSE4.1 - Copyright (C) 2010-2016 Free Software Foundation, Inc. + Copyright (C) 2010-2018 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. @@ -31,7 +31,7 @@ lea TABLE(%rip), %r11; \ movslq (%r11, INDEX, SCALE), %rcx; \ add %r11, %rcx; \ - jmp *%rcx; \ + _CET_NOTRACK jmp *%rcx; \ ud2 /* Warning! diff --git a/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/sysdeps/x86_64/multiarch/memcmp-ssse3.S index a22f399e02..6f76c64123 100644 --- a/sysdeps/x86_64/multiarch/memcmp-ssse3.S +++ b/sysdeps/x86_64/multiarch/memcmp-ssse3.S @@ -1,5 +1,5 @@ /* memcmp with SSSE3, wmemcmp with SSSE3 - Copyright (C) 2011-2016 Free Software Foundation, Inc. + Copyright (C) 2011-2018 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. diff --git a/sysdeps/x86_64/multiarch/memcmp.S b/sysdeps/x86_64/multiarch/memcmp.S deleted file mode 100644 index b5a1cc202e..0000000000 --- a/sysdeps/x86_64/multiarch/memcmp.S +++ /dev/null @@ -1,67 +0,0 @@ -/* Multiple versions of memcmp - All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2010-2016 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - -/* Define multiple versions only for the definition in libc. */ -#if IS_IN (libc) - .text -ENTRY(memcmp) - .type memcmp, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - HAS_CPU_FEATURE (SSSE3) - jnz 2f - leaq __memcmp_sse2(%rip), %rax - ret - -2: HAS_CPU_FEATURE (SSE4_1) - jz 3f - leaq __memcmp_sse4_1(%rip), %rax - ret - -3: leaq __memcmp_ssse3(%rip), %rax - ret - -END(memcmp) - -# undef ENTRY -# define ENTRY(name) \ - .type __memcmp_sse2, @function; \ - .p2align 4; \ - .globl __memcmp_sse2; \ - .hidden __memcmp_sse2; \ - __memcmp_sse2: cfi_startproc; \ - CALL_MCOUNT -# undef END -# define END(name) \ - cfi_endproc; .size __memcmp_sse2, .-__memcmp_sse2 - -# ifdef SHARED -# undef libc_hidden_builtin_def -/* IFUNC doesn't work with the hidden functions in shared library since - they will be called without setting up EBX needed for PLT which is - used by IFUNC. */ -# define libc_hidden_builtin_def(name) \ - .globl __GI_memcmp; __GI_memcmp = __memcmp_sse2 -# endif -#endif - -#include "../memcmp.S" diff --git a/sysdeps/x86_64/multiarch/strspn.S b/sysdeps/x86_64/multiarch/memcmp.c index 4942826b24..6f3ca43128 100644 --- a/sysdeps/x86_64/multiarch/strspn.S +++ b/sysdeps/x86_64/multiarch/memcmp.c @@ -1,7 +1,6 @@ -/* Multiple versions of strspn +/* Multiple versions of memcmp. All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2009-2016 Free Software Foundation, Inc. - Contributed by Intel Corporation. + Copyright (C) 2017-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -18,33 +17,21 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#include <config.h> -#include <sysdep.h> -#include <init-arch.h> - /* Define multiple versions only for the definition in libc. */ #if IS_IN (libc) - .text -ENTRY(strspn) - .type strspn, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq __strspn_sse2(%rip), %rax - HAS_CPU_FEATURE (SSE4_2) - jz 2f - leaq __strspn_sse42(%rip), %rax -2: ret -END(strspn) +# define memcmp __redirect_memcmp +# include <string.h> +# undef memcmp -# undef ENTRY -# define ENTRY(name) \ - .type __strspn_sse2, @function; \ - .globl __strspn_sse2; \ - .align 16; \ - __strspn_sse2: cfi_startproc; \ - CALL_MCOUNT -# undef END -# define END(name) \ - cfi_endproc; .size __strspn_sse2, .-__strspn_sse2 -#endif +# define SYMBOL_NAME memcmp +# include "ifunc-memcmp.h" -#include "../strspn.S" +libc_ifunc_redirected (__redirect_memcmp, memcmp, IFUNC_SELECTOR ()); +# undef bcmp +weak_alias (memcmp, bcmp) + +# ifdef SHARED +__hidden_ver1 (memcmp, __GI_memcmp, __redirect_memcmp) + __attribute__ ((visibility ("hidden"))); +# endif +#endif diff --git a/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S deleted file mode 100644 index 74fed186e9..0000000000 --- a/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S +++ /dev/null @@ -1,376 +0,0 @@ -/* memcpy with AVX - Copyright (C) 2014-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> - -#if IS_IN (libc) \ - && (defined SHARED \ - || defined USE_AS_MEMMOVE \ - || !defined USE_MULTIARCH) - -#include "asm-syntax.h" -#ifndef MEMCPY -# define MEMCPY __memcpy_avx_unaligned -# define MEMCPY_CHK __memcpy_chk_avx_unaligned -#endif - - .section .text.avx,"ax",@progbits -#if !defined USE_AS_BCOPY -ENTRY (MEMCPY_CHK) - cmpq %rdx, %rcx - jb HIDDEN_JUMPTARGET (__chk_fail) -END (MEMCPY_CHK) -#endif - -ENTRY (MEMCPY) - mov %rdi, %rax -#ifdef USE_AS_MEMPCPY - add %rdx, %rax -#endif - cmp $256, %rdx - jae L(256bytesormore) - cmp $16, %dl - jb L(less_16bytes) - cmp $128, %dl - jb L(less_128bytes) - vmovdqu (%rsi), %xmm0 - lea (%rsi, %rdx), %rcx - vmovdqu 0x10(%rsi), %xmm1 - vmovdqu 0x20(%rsi), %xmm2 - vmovdqu 0x30(%rsi), %xmm3 - vmovdqu 0x40(%rsi), %xmm4 - vmovdqu 0x50(%rsi), %xmm5 - vmovdqu 0x60(%rsi), %xmm6 - vmovdqu 0x70(%rsi), %xmm7 - vmovdqu -0x80(%rcx), %xmm8 - vmovdqu -0x70(%rcx), %xmm9 - vmovdqu -0x60(%rcx), %xmm10 - vmovdqu -0x50(%rcx), %xmm11 - vmovdqu -0x40(%rcx), %xmm12 - vmovdqu -0x30(%rcx), %xmm13 - vmovdqu -0x20(%rcx), %xmm14 - vmovdqu -0x10(%rcx), %xmm15 - lea (%rdi, %rdx), %rdx - vmovdqu %xmm0, (%rdi) - vmovdqu %xmm1, 0x10(%rdi) - vmovdqu %xmm2, 0x20(%rdi) - vmovdqu %xmm3, 0x30(%rdi) - vmovdqu %xmm4, 0x40(%rdi) - vmovdqu %xmm5, 0x50(%rdi) - vmovdqu %xmm6, 0x60(%rdi) - vmovdqu %xmm7, 0x70(%rdi) - vmovdqu %xmm8, -0x80(%rdx) - vmovdqu %xmm9, -0x70(%rdx) - vmovdqu %xmm10, -0x60(%rdx) - vmovdqu %xmm11, -0x50(%rdx) - vmovdqu %xmm12, -0x40(%rdx) - vmovdqu %xmm13, -0x30(%rdx) - vmovdqu %xmm14, -0x20(%rdx) - vmovdqu %xmm15, -0x10(%rdx) - ret - .p2align 4 -L(less_128bytes): - cmp $64, %dl - jb L(less_64bytes) - vmovdqu (%rsi), %xmm0 - lea (%rsi, %rdx), %rcx - vmovdqu 0x10(%rsi), %xmm1 - vmovdqu 0x20(%rsi), %xmm2 - lea (%rdi, %rdx), %rdx - vmovdqu 0x30(%rsi), %xmm3 - vmovdqu -0x40(%rcx), %xmm4 - vmovdqu -0x30(%rcx), %xmm5 - vmovdqu -0x20(%rcx), %xmm6 - vmovdqu -0x10(%rcx), %xmm7 - vmovdqu %xmm0, (%rdi) - vmovdqu %xmm1, 0x10(%rdi) - vmovdqu %xmm2, 0x20(%rdi) - vmovdqu %xmm3, 0x30(%rdi) - vmovdqu %xmm4, -0x40(%rdx) - vmovdqu %xmm5, -0x30(%rdx) - vmovdqu %xmm6, -0x20(%rdx) - vmovdqu %xmm7, -0x10(%rdx) - ret - - .p2align 4 -L(less_64bytes): - cmp $32, %dl - jb L(less_32bytes) - vmovdqu (%rsi), %xmm0 - vmovdqu 0x10(%rsi), %xmm1 - vmovdqu -0x20(%rsi, %rdx), %xmm6 - vmovdqu -0x10(%rsi, %rdx), %xmm7 - vmovdqu %xmm0, (%rdi) - vmovdqu %xmm1, 0x10(%rdi) - vmovdqu %xmm6, -0x20(%rdi, %rdx) - vmovdqu %xmm7, -0x10(%rdi, %rdx) - ret - - .p2align 4 -L(less_32bytes): - vmovdqu (%rsi), %xmm0 - vmovdqu -0x10(%rsi, %rdx), %xmm7 - vmovdqu %xmm0, (%rdi) - vmovdqu %xmm7, -0x10(%rdi, %rdx) - ret - - .p2align 4 -L(less_16bytes): - cmp $8, %dl - jb L(less_8bytes) - movq -0x08(%rsi, %rdx), %rcx - movq (%rsi), %rsi - movq %rsi, (%rdi) - movq %rcx, -0x08(%rdi, %rdx) - ret - - .p2align 4 -L(less_8bytes): - cmp $4, %dl - jb L(less_4bytes) - mov -0x04(%rsi, %rdx), %ecx - mov (%rsi), %esi - mov %esi, (%rdi) - mov %ecx, -0x04(%rdi, %rdx) - ret - -L(less_4bytes): - cmp $1, %dl - jbe L(less_2bytes) - mov -0x02(%rsi, %rdx), %cx - mov (%rsi), %si - mov %si, (%rdi) - mov %cx, -0x02(%rdi, %rdx) - ret - -L(less_2bytes): - jb L(less_0bytes) - mov (%rsi), %cl - mov %cl, (%rdi) -L(less_0bytes): - ret - - .p2align 4 -L(256bytesormore): -#ifdef USE_AS_MEMMOVE - mov %rdi, %rcx - sub %rsi, %rcx - cmp %rdx, %rcx - jc L(copy_backward) -#endif - cmp $2048, %rdx - jae L(gobble_data_movsb) - mov %rax, %r8 - lea (%rsi, %rdx), %rcx - mov %rdi, %r10 - vmovdqu -0x80(%rcx), %xmm5 - vmovdqu -0x70(%rcx), %xmm6 - mov $0x80, %rax - and $-32, %rdi - add $32, %rdi - vmovdqu -0x60(%rcx), %xmm7 - vmovdqu -0x50(%rcx), %xmm8 - mov %rdi, %r11 - sub %r10, %r11 - vmovdqu -0x40(%rcx), %xmm9 - vmovdqu -0x30(%rcx), %xmm10 - sub %r11, %rdx - vmovdqu -0x20(%rcx), %xmm11 - vmovdqu -0x10(%rcx), %xmm12 - vmovdqu (%rsi), %ymm4 - add %r11, %rsi - sub %eax, %edx -L(goble_128_loop): - vmovdqu (%rsi), %ymm0 - vmovdqu 0x20(%rsi), %ymm1 - vmovdqu 0x40(%rsi), %ymm2 - vmovdqu 0x60(%rsi), %ymm3 - add %rax, %rsi - vmovdqa %ymm0, (%rdi) - vmovdqa %ymm1, 0x20(%rdi) - vmovdqa %ymm2, 0x40(%rdi) - vmovdqa %ymm3, 0x60(%rdi) - add %rax, %rdi - sub %eax, %edx - jae L(goble_128_loop) - add %eax, %edx - add %rdi, %rdx - vmovdqu %ymm4, (%r10) - vzeroupper - vmovdqu %xmm5, -0x80(%rdx) - vmovdqu %xmm6, -0x70(%rdx) - vmovdqu %xmm7, -0x60(%rdx) - vmovdqu %xmm8, -0x50(%rdx) - vmovdqu %xmm9, -0x40(%rdx) - vmovdqu %xmm10, -0x30(%rdx) - vmovdqu %xmm11, -0x20(%rdx) - vmovdqu %xmm12, -0x10(%rdx) - mov %r8, %rax - ret - - .p2align 4 -L(gobble_data_movsb): -#ifdef SHARED_CACHE_SIZE_HALF - mov $SHARED_CACHE_SIZE_HALF, %rcx -#else - mov __x86_shared_cache_size_half(%rip), %rcx -#endif - shl $3, %rcx - cmp %rcx, %rdx - jae L(gobble_big_data_fwd) - mov %rdx, %rcx - mov %rdx, %rcx - rep movsb - ret - - .p2align 4 -L(gobble_big_data_fwd): - lea (%rsi, %rdx), %rcx - vmovdqu (%rsi), %ymm4 - vmovdqu -0x80(%rsi,%rdx), %xmm5 - vmovdqu -0x70(%rcx), %xmm6 - vmovdqu -0x60(%rcx), %xmm7 - vmovdqu -0x50(%rcx), %xmm8 - vmovdqu -0x40(%rcx), %xmm9 - vmovdqu -0x30(%rcx), %xmm10 - vmovdqu -0x20(%rcx), %xmm11 - vmovdqu -0x10(%rcx), %xmm12 - mov %rdi, %r8 - and $-32, %rdi - add $32, %rdi - mov %rdi, %r10 - sub %r8, %r10 - sub %r10, %rdx - add %r10, %rsi - lea (%rdi, %rdx), %rcx - add $-0x80, %rdx -L(gobble_mem_fwd_loop): - prefetchnta 0x1c0(%rsi) - prefetchnta 0x280(%rsi) - vmovdqu (%rsi), %ymm0 - vmovdqu 0x20(%rsi), %ymm1 - vmovdqu 0x40(%rsi), %ymm2 - vmovdqu 0x60(%rsi), %ymm3 - sub $-0x80, %rsi - vmovntdq %ymm0, (%rdi) - vmovntdq %ymm1, 0x20(%rdi) - vmovntdq %ymm2, 0x40(%rdi) - vmovntdq %ymm3, 0x60(%rdi) - sub $-0x80, %rdi - add $-0x80, %rdx - jb L(gobble_mem_fwd_loop) - sfence - vmovdqu %ymm4, (%r8) - vzeroupper - vmovdqu %xmm5, -0x80(%rcx) - vmovdqu %xmm6, -0x70(%rcx) - vmovdqu %xmm7, -0x60(%rcx) - vmovdqu %xmm8, -0x50(%rcx) - vmovdqu %xmm9, -0x40(%rcx) - vmovdqu %xmm10, -0x30(%rcx) - vmovdqu %xmm11, -0x20(%rcx) - vmovdqu %xmm12, -0x10(%rcx) - ret - -#ifdef USE_AS_MEMMOVE - .p2align 4 -L(copy_backward): -#ifdef SHARED_CACHE_SIZE_HALF - mov $SHARED_CACHE_SIZE_HALF, %rcx -#else - mov __x86_shared_cache_size_half(%rip), %rcx -#endif - shl $3, %rcx - vmovdqu (%rsi), %xmm5 - vmovdqu 0x10(%rsi), %xmm6 - add %rdx, %rdi - vmovdqu 0x20(%rsi), %xmm7 - vmovdqu 0x30(%rsi), %xmm8 - lea -0x20(%rdi), %r10 - mov %rdi, %r11 - vmovdqu 0x40(%rsi), %xmm9 - vmovdqu 0x50(%rsi), %xmm10 - and $0x1f, %r11 - vmovdqu 0x60(%rsi), %xmm11 - vmovdqu 0x70(%rsi), %xmm12 - xor %r11, %rdi - add %rdx, %rsi - vmovdqu -0x20(%rsi), %ymm4 - sub %r11, %rsi - sub %r11, %rdx - cmp %rcx, %rdx - ja L(gobble_big_data_bwd) - add $-0x80, %rdx -L(gobble_mem_bwd_llc): - vmovdqu -0x20(%rsi), %ymm0 - vmovdqu -0x40(%rsi), %ymm1 - vmovdqu -0x60(%rsi), %ymm2 - vmovdqu -0x80(%rsi), %ymm3 - lea -0x80(%rsi), %rsi - vmovdqa %ymm0, -0x20(%rdi) - vmovdqa %ymm1, -0x40(%rdi) - vmovdqa %ymm2, -0x60(%rdi) - vmovdqa %ymm3, -0x80(%rdi) - lea -0x80(%rdi), %rdi - add $-0x80, %rdx - jb L(gobble_mem_bwd_llc) - vmovdqu %ymm4, (%r10) - vzeroupper - vmovdqu %xmm5, (%rax) - vmovdqu %xmm6, 0x10(%rax) - vmovdqu %xmm7, 0x20(%rax) - vmovdqu %xmm8, 0x30(%rax) - vmovdqu %xmm9, 0x40(%rax) - vmovdqu %xmm10, 0x50(%rax) - vmovdqu %xmm11, 0x60(%rax) - vmovdqu %xmm12, 0x70(%rax) - ret - - .p2align 4 -L(gobble_big_data_bwd): - add $-0x80, %rdx -L(gobble_mem_bwd_loop): - prefetchnta -0x1c0(%rsi) - prefetchnta -0x280(%rsi) - vmovdqu -0x20(%rsi), %ymm0 - vmovdqu -0x40(%rsi), %ymm1 - vmovdqu -0x60(%rsi), %ymm2 - vmovdqu -0x80(%rsi), %ymm3 - lea -0x80(%rsi), %rsi - vmovntdq %ymm0, -0x20(%rdi) - vmovntdq %ymm1, -0x40(%rdi) - vmovntdq %ymm2, -0x60(%rdi) - vmovntdq %ymm3, -0x80(%rdi) - lea -0x80(%rdi), %rdi - add $-0x80, %rdx - jb L(gobble_mem_bwd_loop) - sfence - vmovdqu %ymm4, (%r10) - vzeroupper - vmovdqu %xmm5, (%rax) - vmovdqu %xmm6, 0x10(%rax) - vmovdqu %xmm7, 0x20(%rax) - vmovdqu %xmm8, 0x30(%rax) - vmovdqu %xmm9, 0x40(%rax) - vmovdqu %xmm10, 0x50(%rax) - vmovdqu %xmm11, 0x60(%rax) - vmovdqu %xmm12, 0x70(%rax) - ret -#endif -END (MEMCPY) -#endif diff --git a/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S deleted file mode 100644 index 1bb12e81b0..0000000000 --- a/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S +++ /dev/null @@ -1,408 +0,0 @@ -/* memcpy optimized with AVX512 for KNL hardware. - Copyright (C) 2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> - -#if defined HAVE_AVX512_ASM_SUPPORT && IS_IN (libc) \ - && (defined SHARED \ - || defined USE_AS_MEMMOVE \ - || !defined USE_MULTIARCH) - -#include "asm-syntax.h" -#ifndef MEMCPY -# define MEMCPY __memcpy_avx512_no_vzeroupper -# define MEMCPY_CHK __memcpy_chk_avx512_no_vzeroupper -#endif - - .section .text,"ax",@progbits -#if !defined USE_AS_BCOPY -ENTRY (MEMCPY_CHK) - cmpq %rdx, %rcx - jb HIDDEN_JUMPTARGET (__chk_fail) -END (MEMCPY_CHK) -#endif - -ENTRY (MEMCPY) - mov %rdi, %rax -#ifdef USE_AS_MEMPCPY - add %rdx, %rax -#endif - lea (%rsi, %rdx), %rcx - lea (%rdi, %rdx), %r9 - cmp $512, %rdx - ja L(512bytesormore) - -L(check): - cmp $16, %rdx - jbe L(less_16bytes) - cmp $256, %rdx - jb L(less_256bytes) - vmovups (%rsi), %zmm0 - vmovups 0x40(%rsi), %zmm1 - vmovups 0x80(%rsi), %zmm2 - vmovups 0xC0(%rsi), %zmm3 - vmovups -0x100(%rcx), %zmm4 - vmovups -0xC0(%rcx), %zmm5 - vmovups -0x80(%rcx), %zmm6 - vmovups -0x40(%rcx), %zmm7 - vmovups %zmm0, (%rdi) - vmovups %zmm1, 0x40(%rdi) - vmovups %zmm2, 0x80(%rdi) - vmovups %zmm3, 0xC0(%rdi) - vmovups %zmm4, -0x100(%r9) - vmovups %zmm5, -0xC0(%r9) - vmovups %zmm6, -0x80(%r9) - vmovups %zmm7, -0x40(%r9) - ret - -L(less_256bytes): - cmp $128, %dl - jb L(less_128bytes) - vmovups (%rsi), %zmm0 - vmovups 0x40(%rsi), %zmm1 - vmovups -0x80(%rcx), %zmm2 - vmovups -0x40(%rcx), %zmm3 - vmovups %zmm0, (%rdi) - vmovups %zmm1, 0x40(%rdi) - vmovups %zmm2, -0x80(%r9) - vmovups %zmm3, -0x40(%r9) - ret - -L(less_128bytes): - cmp $64, %dl - jb L(less_64bytes) - vmovdqu (%rsi), %ymm0 - vmovdqu 0x20(%rsi), %ymm1 - vmovdqu -0x40(%rcx), %ymm2 - vmovdqu -0x20(%rcx), %ymm3 - vmovdqu %ymm0, (%rdi) - vmovdqu %ymm1, 0x20(%rdi) - vmovdqu %ymm2, -0x40(%r9) - vmovdqu %ymm3, -0x20(%r9) - ret - -L(less_64bytes): - cmp $32, %dl - jb L(less_32bytes) - vmovdqu (%rsi), %ymm0 - vmovdqu -0x20(%rcx), %ymm1 - vmovdqu %ymm0, (%rdi) - vmovdqu %ymm1, -0x20(%r9) - ret - -L(less_32bytes): - vmovdqu (%rsi), %xmm0 - vmovdqu -0x10(%rcx), %xmm1 - vmovdqu %xmm0, (%rdi) - vmovdqu %xmm1, -0x10(%r9) - ret - -L(less_16bytes): - cmp $8, %dl - jb L(less_8bytes) - movq (%rsi), %rsi - movq -0x8(%rcx), %rcx - movq %rsi, (%rdi) - movq %rcx, -0x8(%r9) - ret - -L(less_8bytes): - cmp $4, %dl - jb L(less_4bytes) - mov (%rsi), %esi - mov -0x4(%rcx), %ecx - mov %esi, (%rdi) - mov %ecx, -0x4(%r9) - ret - -L(less_4bytes): - cmp $2, %dl - jb L(less_2bytes) - mov (%rsi), %si - mov -0x2(%rcx), %cx - mov %si, (%rdi) - mov %cx, -0x2(%r9) - ret - -L(less_2bytes): - cmp $1, %dl - jb L(less_1bytes) - mov (%rsi), %cl - mov %cl, (%rdi) -L(less_1bytes): - ret - -L(512bytesormore): -#ifdef SHARED_CACHE_SIZE_HALF - mov $SHARED_CACHE_SIZE_HALF, %r8 -#else - mov __x86_shared_cache_size_half(%rip), %r8 -#endif - cmp %r8, %rdx - jae L(preloop_large) - cmp $1024, %rdx - ja L(1024bytesormore) - prefetcht1 (%rsi) - prefetcht1 0x40(%rsi) - prefetcht1 0x80(%rsi) - prefetcht1 0xC0(%rsi) - prefetcht1 0x100(%rsi) - prefetcht1 0x140(%rsi) - prefetcht1 0x180(%rsi) - prefetcht1 0x1C0(%rsi) - prefetcht1 -0x200(%rcx) - prefetcht1 -0x1C0(%rcx) - prefetcht1 -0x180(%rcx) - prefetcht1 -0x140(%rcx) - prefetcht1 -0x100(%rcx) - prefetcht1 -0xC0(%rcx) - prefetcht1 -0x80(%rcx) - prefetcht1 -0x40(%rcx) - vmovups (%rsi), %zmm0 - vmovups 0x40(%rsi), %zmm1 - vmovups 0x80(%rsi), %zmm2 - vmovups 0xC0(%rsi), %zmm3 - vmovups 0x100(%rsi), %zmm4 - vmovups 0x140(%rsi), %zmm5 - vmovups 0x180(%rsi), %zmm6 - vmovups 0x1C0(%rsi), %zmm7 - vmovups -0x200(%rcx), %zmm8 - vmovups -0x1C0(%rcx), %zmm9 - vmovups -0x180(%rcx), %zmm10 - vmovups -0x140(%rcx), %zmm11 - vmovups -0x100(%rcx), %zmm12 - vmovups -0xC0(%rcx), %zmm13 - vmovups -0x80(%rcx), %zmm14 - vmovups -0x40(%rcx), %zmm15 - vmovups %zmm0, (%rdi) - vmovups %zmm1, 0x40(%rdi) - vmovups %zmm2, 0x80(%rdi) - vmovups %zmm3, 0xC0(%rdi) - vmovups %zmm4, 0x100(%rdi) - vmovups %zmm5, 0x140(%rdi) - vmovups %zmm6, 0x180(%rdi) - vmovups %zmm7, 0x1C0(%rdi) - vmovups %zmm8, -0x200(%r9) - vmovups %zmm9, -0x1C0(%r9) - vmovups %zmm10, -0x180(%r9) - vmovups %zmm11, -0x140(%r9) - vmovups %zmm12, -0x100(%r9) - vmovups %zmm13, -0xC0(%r9) - vmovups %zmm14, -0x80(%r9) - vmovups %zmm15, -0x40(%r9) - ret - -L(1024bytesormore): - cmp %rsi, %rdi - ja L(1024bytesormore_bkw) - sub $512, %r9 - vmovups -0x200(%rcx), %zmm8 - vmovups -0x1C0(%rcx), %zmm9 - vmovups -0x180(%rcx), %zmm10 - vmovups -0x140(%rcx), %zmm11 - vmovups -0x100(%rcx), %zmm12 - vmovups -0xC0(%rcx), %zmm13 - vmovups -0x80(%rcx), %zmm14 - vmovups -0x40(%rcx), %zmm15 - prefetcht1 (%rsi) - prefetcht1 0x40(%rsi) - prefetcht1 0x80(%rsi) - prefetcht1 0xC0(%rsi) - prefetcht1 0x100(%rsi) - prefetcht1 0x140(%rsi) - prefetcht1 0x180(%rsi) - prefetcht1 0x1C0(%rsi) - -/* Loop with unaligned memory access. */ -L(gobble_512bytes_loop): - vmovups (%rsi), %zmm0 - vmovups 0x40(%rsi), %zmm1 - vmovups 0x80(%rsi), %zmm2 - vmovups 0xC0(%rsi), %zmm3 - vmovups 0x100(%rsi), %zmm4 - vmovups 0x140(%rsi), %zmm5 - vmovups 0x180(%rsi), %zmm6 - vmovups 0x1C0(%rsi), %zmm7 - add $512, %rsi - prefetcht1 (%rsi) - prefetcht1 0x40(%rsi) - prefetcht1 0x80(%rsi) - prefetcht1 0xC0(%rsi) - prefetcht1 0x100(%rsi) - prefetcht1 0x140(%rsi) - prefetcht1 0x180(%rsi) - prefetcht1 0x1C0(%rsi) - vmovups %zmm0, (%rdi) - vmovups %zmm1, 0x40(%rdi) - vmovups %zmm2, 0x80(%rdi) - vmovups %zmm3, 0xC0(%rdi) - vmovups %zmm4, 0x100(%rdi) - vmovups %zmm5, 0x140(%rdi) - vmovups %zmm6, 0x180(%rdi) - vmovups %zmm7, 0x1C0(%rdi) - add $512, %rdi - cmp %r9, %rdi - jb L(gobble_512bytes_loop) - vmovups %zmm8, (%r9) - vmovups %zmm9, 0x40(%r9) - vmovups %zmm10, 0x80(%r9) - vmovups %zmm11, 0xC0(%r9) - vmovups %zmm12, 0x100(%r9) - vmovups %zmm13, 0x140(%r9) - vmovups %zmm14, 0x180(%r9) - vmovups %zmm15, 0x1C0(%r9) - ret - -L(1024bytesormore_bkw): - add $512, %rdi - vmovups 0x1C0(%rsi), %zmm8 - vmovups 0x180(%rsi), %zmm9 - vmovups 0x140(%rsi), %zmm10 - vmovups 0x100(%rsi), %zmm11 - vmovups 0xC0(%rsi), %zmm12 - vmovups 0x80(%rsi), %zmm13 - vmovups 0x40(%rsi), %zmm14 - vmovups (%rsi), %zmm15 - prefetcht1 -0x40(%rcx) - prefetcht1 -0x80(%rcx) - prefetcht1 -0xC0(%rcx) - prefetcht1 -0x100(%rcx) - prefetcht1 -0x140(%rcx) - prefetcht1 -0x180(%rcx) - prefetcht1 -0x1C0(%rcx) - prefetcht1 -0x200(%rcx) - -/* Backward loop with unaligned memory access. */ -L(gobble_512bytes_loop_bkw): - vmovups -0x40(%rcx), %zmm0 - vmovups -0x80(%rcx), %zmm1 - vmovups -0xC0(%rcx), %zmm2 - vmovups -0x100(%rcx), %zmm3 - vmovups -0x140(%rcx), %zmm4 - vmovups -0x180(%rcx), %zmm5 - vmovups -0x1C0(%rcx), %zmm6 - vmovups -0x200(%rcx), %zmm7 - sub $512, %rcx - prefetcht1 -0x40(%rcx) - prefetcht1 -0x80(%rcx) - prefetcht1 -0xC0(%rcx) - prefetcht1 -0x100(%rcx) - prefetcht1 -0x140(%rcx) - prefetcht1 -0x180(%rcx) - prefetcht1 -0x1C0(%rcx) - prefetcht1 -0x200(%rcx) - vmovups %zmm0, -0x40(%r9) - vmovups %zmm1, -0x80(%r9) - vmovups %zmm2, -0xC0(%r9) - vmovups %zmm3, -0x100(%r9) - vmovups %zmm4, -0x140(%r9) - vmovups %zmm5, -0x180(%r9) - vmovups %zmm6, -0x1C0(%r9) - vmovups %zmm7, -0x200(%r9) - sub $512, %r9 - cmp %rdi, %r9 - ja L(gobble_512bytes_loop_bkw) - vmovups %zmm8, -0x40(%rdi) - vmovups %zmm9, -0x80(%rdi) - vmovups %zmm10, -0xC0(%rdi) - vmovups %zmm11, -0x100(%rdi) - vmovups %zmm12, -0x140(%rdi) - vmovups %zmm13, -0x180(%rdi) - vmovups %zmm14, -0x1C0(%rdi) - vmovups %zmm15, -0x200(%rdi) - ret - -L(preloop_large): - cmp %rsi, %rdi - ja L(preloop_large_bkw) - vmovups (%rsi), %zmm4 - vmovups 0x40(%rsi), %zmm5 - -/* Align destination for access with non-temporal stores in the loop. */ - mov %rdi, %r8 - and $-0x80, %rdi - add $0x80, %rdi - sub %rdi, %r8 - sub %r8, %rsi - add %r8, %rdx -L(gobble_256bytes_nt_loop): - prefetcht1 0x200(%rsi) - prefetcht1 0x240(%rsi) - prefetcht1 0x280(%rsi) - prefetcht1 0x2C0(%rsi) - prefetcht1 0x300(%rsi) - prefetcht1 0x340(%rsi) - prefetcht1 0x380(%rsi) - prefetcht1 0x3C0(%rsi) - vmovdqu64 (%rsi), %zmm0 - vmovdqu64 0x40(%rsi), %zmm1 - vmovdqu64 0x80(%rsi), %zmm2 - vmovdqu64 0xC0(%rsi), %zmm3 - vmovntdq %zmm0, (%rdi) - vmovntdq %zmm1, 0x40(%rdi) - vmovntdq %zmm2, 0x80(%rdi) - vmovntdq %zmm3, 0xC0(%rdi) - sub $256, %rdx - add $256, %rsi - add $256, %rdi - cmp $256, %rdx - ja L(gobble_256bytes_nt_loop) - sfence - vmovups %zmm4, (%rax) - vmovups %zmm5, 0x40(%rax) - jmp L(check) - -L(preloop_large_bkw): - vmovups -0x80(%rcx), %zmm4 - vmovups -0x40(%rcx), %zmm5 - -/* Align end of destination for access with non-temporal stores. */ - mov %r9, %r8 - and $-0x80, %r9 - sub %r9, %r8 - sub %r8, %rcx - sub %r8, %rdx - add %r9, %r8 -L(gobble_256bytes_nt_loop_bkw): - prefetcht1 -0x400(%rcx) - prefetcht1 -0x3C0(%rcx) - prefetcht1 -0x380(%rcx) - prefetcht1 -0x340(%rcx) - prefetcht1 -0x300(%rcx) - prefetcht1 -0x2C0(%rcx) - prefetcht1 -0x280(%rcx) - prefetcht1 -0x240(%rcx) - vmovdqu64 -0x100(%rcx), %zmm0 - vmovdqu64 -0xC0(%rcx), %zmm1 - vmovdqu64 -0x80(%rcx), %zmm2 - vmovdqu64 -0x40(%rcx), %zmm3 - vmovntdq %zmm0, -0x100(%r9) - vmovntdq %zmm1, -0xC0(%r9) - vmovntdq %zmm2, -0x80(%r9) - vmovntdq %zmm3, -0x40(%r9) - sub $256, %rdx - sub $256, %rcx - sub $256, %r9 - cmp $256, %rdx - ja L(gobble_256bytes_nt_loop_bkw) - sfence - vmovups %zmm4, -0x80(%r8) - vmovups %zmm5, -0x40(%r8) - jmp L(check) -END (MEMCPY) -#endif diff --git a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S deleted file mode 100644 index c4509831fa..0000000000 --- a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S +++ /dev/null @@ -1,175 +0,0 @@ -/* memcpy with unaliged loads - Copyright (C) 2013-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#if IS_IN (libc) - -#include <sysdep.h> - -#include "asm-syntax.h" - - -ENTRY(__memcpy_sse2_unaligned) - movq %rsi, %rax - leaq (%rdx,%rdx), %rcx - subq %rdi, %rax - subq %rdx, %rax - cmpq %rcx, %rax - jb L(overlapping) - cmpq $16, %rdx - jbe L(less_16) - movdqu (%rsi), %xmm8 - cmpq $32, %rdx - movdqu %xmm8, (%rdi) - movdqu -16(%rsi,%rdx), %xmm8 - movdqu %xmm8, -16(%rdi,%rdx) - ja .L31 -L(return): - movq %rdi, %rax - ret - .p2align 4,,10 - .p2align 4 -.L31: - movdqu 16(%rsi), %xmm8 - cmpq $64, %rdx - movdqu %xmm8, 16(%rdi) - movdqu -32(%rsi,%rdx), %xmm8 - movdqu %xmm8, -32(%rdi,%rdx) - jbe L(return) - movdqu 32(%rsi), %xmm8 - cmpq $128, %rdx - movdqu %xmm8, 32(%rdi) - movdqu -48(%rsi,%rdx), %xmm8 - movdqu %xmm8, -48(%rdi,%rdx) - movdqu 48(%rsi), %xmm8 - movdqu %xmm8, 48(%rdi) - movdqu -64(%rsi,%rdx), %xmm8 - movdqu %xmm8, -64(%rdi,%rdx) - jbe L(return) - leaq 64(%rdi), %rcx - addq %rdi, %rdx - andq $-64, %rdx - andq $-64, %rcx - movq %rcx, %rax - subq %rdi, %rax - addq %rax, %rsi - cmpq %rdx, %rcx - je L(return) - movq %rsi, %r10 - subq %rcx, %r10 - leaq 16(%r10), %r9 - leaq 32(%r10), %r8 - leaq 48(%r10), %rax - .p2align 4,,10 - .p2align 4 -L(loop): - movdqu (%rcx,%r10), %xmm8 - movdqa %xmm8, (%rcx) - movdqu (%rcx,%r9), %xmm8 - movdqa %xmm8, 16(%rcx) - movdqu (%rcx,%r8), %xmm8 - movdqa %xmm8, 32(%rcx) - movdqu (%rcx,%rax), %xmm8 - movdqa %xmm8, 48(%rcx) - addq $64, %rcx - cmpq %rcx, %rdx - jne L(loop) - jmp L(return) -L(overlapping): - cmpq %rsi, %rdi - jae .L3 - testq %rdx, %rdx - .p2align 4,,5 - je L(return) - movq %rdx, %r9 - leaq 16(%rsi), %rcx - leaq 16(%rdi), %r8 - shrq $4, %r9 - movq %r9, %rax - salq $4, %rax - cmpq %rcx, %rdi - setae %cl - cmpq %r8, %rsi - setae %r8b - orl %r8d, %ecx - cmpq $15, %rdx - seta %r8b - testb %r8b, %cl - je .L16 - testq %rax, %rax - je .L16 - xorl %ecx, %ecx - xorl %r8d, %r8d -.L7: - movdqu (%rsi,%rcx), %xmm8 - addq $1, %r8 - movdqu %xmm8, (%rdi,%rcx) - addq $16, %rcx - cmpq %r8, %r9 - ja .L7 - cmpq %rax, %rdx - je L(return) -.L21: - movzbl (%rsi,%rax), %ecx - movb %cl, (%rdi,%rax) - addq $1, %rax - cmpq %rax, %rdx - ja .L21 - jmp L(return) -L(less_16): - testb $24, %dl - jne L(between_9_16) - testb $4, %dl - .p2align 4,,5 - jne L(between_5_8) - testq %rdx, %rdx - .p2align 4,,2 - je L(return) - movzbl (%rsi), %eax - testb $2, %dl - movb %al, (%rdi) - je L(return) - movzwl -2(%rsi,%rdx), %eax - movw %ax, -2(%rdi,%rdx) - jmp L(return) -.L3: - leaq -1(%rdx), %rax - .p2align 4,,10 - .p2align 4 -.L11: - movzbl (%rsi,%rax), %edx - movb %dl, (%rdi,%rax) - subq $1, %rax - jmp .L11 -L(between_9_16): - movq (%rsi), %rax - movq %rax, (%rdi) - movq -8(%rsi,%rdx), %rax - movq %rax, -8(%rdi,%rdx) - jmp L(return) -.L16: - xorl %eax, %eax - jmp .L21 -L(between_5_8): - movl (%rsi), %eax - movl %eax, (%rdi) - movl -4(%rsi,%rdx), %eax - movl %eax, -4(%rdi,%rdx) - jmp L(return) -END(__memcpy_sse2_unaligned) - -#endif diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S index 08b41e9e5a..3cd1123326 100644 --- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S +++ b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S @@ -1,5 +1,5 @@ /* memcpy with SSSE3 and REP string - Copyright (C) 2010-2016 Free Software Foundation, Inc. + Copyright (C) 2010-2018 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. @@ -19,16 +19,15 @@ #include <sysdep.h> -#if IS_IN (libc) \ - && (defined SHARED \ - || defined USE_AS_MEMMOVE \ - || !defined USE_MULTIARCH) +#if IS_IN (libc) #include "asm-syntax.h" #ifndef MEMCPY # define MEMCPY __memcpy_ssse3_back # define MEMCPY_CHK __memcpy_chk_ssse3_back +# define MEMPCPY __mempcpy_ssse3_back +# define MEMPCPY_CHK __mempcpy_chk_ssse3_back #endif #define JMPTBL(I, B) I - B @@ -40,10 +39,23 @@ lea TABLE(%rip), %r11; \ movslq (%r11, INDEX, SCALE), INDEX; \ lea (%r11, INDEX), INDEX; \ - jmp *INDEX; \ + _CET_NOTRACK jmp *INDEX; \ ud2 .section .text.ssse3,"ax",@progbits +#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE +ENTRY (MEMPCPY_CHK) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMPCPY_CHK) + +ENTRY (MEMPCPY) + movq %rdi, %rax + addq %rdx, %rax + jmp L(start) +END (MEMPCPY) +#endif + #if !defined USE_AS_BCOPY ENTRY (MEMCPY_CHK) cmpq %rdx, %rcx @@ -66,6 +78,7 @@ ENTRY (MEMCPY) BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) L(copy_forward): #endif +L(start): cmp $144, %rdx jae L(144bytesormore) @@ -112,7 +125,7 @@ L(144bytesormore): sub $0x80, %rdx movslq (%r11, %r9, 4), %r9 add %r11, %r9 - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 .p2align 4 @@ -142,7 +155,7 @@ L(copy_backward): sub $0x80, %rdx movslq (%r11, %r9, 4), %r9 add %r11, %r9 - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 .p2align 4 diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S index 95de9695f9..0240bfa309 100644 --- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S +++ b/sysdeps/x86_64/multiarch/memcpy-ssse3.S @@ -1,5 +1,5 @@ /* memcpy with SSSE3 - Copyright (C) 2010-2016 Free Software Foundation, Inc. + Copyright (C) 2010-2018 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. @@ -19,16 +19,15 @@ #include <sysdep.h> -#if IS_IN (libc) \ - && (defined SHARED \ - || defined USE_AS_MEMMOVE \ - || !defined USE_MULTIARCH) +#if IS_IN (libc) #include "asm-syntax.h" #ifndef MEMCPY # define MEMCPY __memcpy_ssse3 # define MEMCPY_CHK __memcpy_chk_ssse3 +# define MEMPCPY __mempcpy_ssse3 +# define MEMPCPY_CHK __mempcpy_chk_ssse3 #endif #define JMPTBL(I, B) I - B @@ -40,10 +39,23 @@ lea TABLE(%rip), %r11; \ movslq (%r11, INDEX, SCALE), INDEX; \ lea (%r11, INDEX), INDEX; \ - jmp *INDEX; \ + _CET_NOTRACK jmp *INDEX; \ ud2 .section .text.ssse3,"ax",@progbits +#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE +ENTRY (MEMPCPY_CHK) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMPCPY_CHK) + +ENTRY (MEMPCPY) + movq %rdi, %rax + addq %rdx, %rax + jmp L(start) +END (MEMPCPY) +#endif + #if !defined USE_AS_BCOPY ENTRY (MEMCPY_CHK) cmpq %rdx, %rcx @@ -66,6 +78,7 @@ ENTRY (MEMCPY) jmp L(copy_backward) L(copy_forward): #endif +L(start): cmp $79, %rdx lea L(table_less_80bytes)(%rip), %r11 ja L(80bytesormore) @@ -73,7 +86,7 @@ L(copy_forward): add %rdx, %rsi add %rdx, %rdi add %r11, %r9 - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 .p2align 4 @@ -428,7 +441,7 @@ L(shl_1): lea (L(shl_1_loop_L2)-L(shl_1_loop_L1))(%r9), %r9 L(L1_fwd): lea -64(%rdx), %rdx - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_1_loop_L2): prefetchnta 0x1c0(%rsi) @@ -451,7 +464,7 @@ L(shl_1_loop_L1): jb L(shl_1_end) movaps %xmm4, -0x20(%rdi) movaps %xmm5, -0x10(%rdi) - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_1_end): movaps %xmm4, -0x20(%rdi) @@ -471,7 +484,7 @@ L(shl_1_bwd): lea (L(shl_1_bwd_loop_L2)-L(shl_1_bwd_loop_L1))(%r9), %r9 L(L1_bwd): lea -64(%rdx), %rdx - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_1_bwd_loop_L2): prefetchnta -0x1c0(%rsi) @@ -496,7 +509,7 @@ L(shl_1_bwd_loop_L1): movaps %xmm3, 0x10(%rdi) jb L(shl_1_bwd_end) movaps %xmm4, (%rdi) - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_1_bwd_end): movaps %xmm4, (%rdi) @@ -513,7 +526,7 @@ L(shl_2): lea (L(shl_2_loop_L2)-L(shl_2_loop_L1))(%r9), %r9 L(L2_fwd): lea -64(%rdx), %rdx - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_2_loop_L2): prefetchnta 0x1c0(%rsi) @@ -536,7 +549,7 @@ L(shl_2_loop_L1): jb L(shl_2_end) movaps %xmm4, -0x20(%rdi) movaps %xmm5, -0x10(%rdi) - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_2_end): movaps %xmm4, -0x20(%rdi) @@ -556,7 +569,7 @@ L(shl_2_bwd): lea (L(shl_2_bwd_loop_L2)-L(shl_2_bwd_loop_L1))(%r9), %r9 L(L2_bwd): lea -64(%rdx), %rdx - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_2_bwd_loop_L2): prefetchnta -0x1c0(%rsi) @@ -581,7 +594,7 @@ L(shl_2_bwd_loop_L1): movaps %xmm3, 0x10(%rdi) jb L(shl_2_bwd_end) movaps %xmm4, (%rdi) - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_2_bwd_end): movaps %xmm4, (%rdi) @@ -598,7 +611,7 @@ L(shl_3): lea (L(shl_3_loop_L2)-L(shl_3_loop_L1))(%r9), %r9 L(L3_fwd): lea -64(%rdx), %rdx - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_3_loop_L2): prefetchnta 0x1c0(%rsi) @@ -621,7 +634,7 @@ L(shl_3_loop_L1): jb L(shl_3_end) movaps %xmm4, -0x20(%rdi) movaps %xmm5, -0x10(%rdi) - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_3_end): movaps %xmm4, -0x20(%rdi) @@ -641,7 +654,7 @@ L(shl_3_bwd): lea (L(shl_3_bwd_loop_L2)-L(shl_3_bwd_loop_L1))(%r9), %r9 L(L3_bwd): lea -64(%rdx), %rdx - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_3_bwd_loop_L2): prefetchnta -0x1c0(%rsi) @@ -666,7 +679,7 @@ L(shl_3_bwd_loop_L1): movaps %xmm3, 0x10(%rdi) jb L(shl_3_bwd_end) movaps %xmm4, (%rdi) - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_3_bwd_end): movaps %xmm4, (%rdi) @@ -683,7 +696,7 @@ L(shl_4): lea (L(shl_4_loop_L2)-L(shl_4_loop_L1))(%r9), %r9 L(L4_fwd): lea -64(%rdx), %rdx - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_4_loop_L2): prefetchnta 0x1c0(%rsi) @@ -706,7 +719,7 @@ L(shl_4_loop_L1): jb L(shl_4_end) movaps %xmm4, -0x20(%rdi) movaps %xmm5, -0x10(%rdi) - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_4_end): movaps %xmm4, -0x20(%rdi) @@ -726,7 +739,7 @@ L(shl_4_bwd): lea (L(shl_4_bwd_loop_L2)-L(shl_4_bwd_loop_L1))(%r9), %r9 L(L4_bwd): lea -64(%rdx), %rdx - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_4_bwd_loop_L2): prefetchnta -0x1c0(%rsi) @@ -751,7 +764,7 @@ L(shl_4_bwd_loop_L1): movaps %xmm3, 0x10(%rdi) jb L(shl_4_bwd_end) movaps %xmm4, (%rdi) - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_4_bwd_end): movaps %xmm4, (%rdi) @@ -768,7 +781,7 @@ L(shl_5): lea (L(shl_5_loop_L2)-L(shl_5_loop_L1))(%r9), %r9 L(L5_fwd): lea -64(%rdx), %rdx - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_5_loop_L2): prefetchnta 0x1c0(%rsi) @@ -791,7 +804,7 @@ L(shl_5_loop_L1): jb L(shl_5_end) movaps %xmm4, -0x20(%rdi) movaps %xmm5, -0x10(%rdi) - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_5_end): movaps %xmm4, -0x20(%rdi) @@ -811,7 +824,7 @@ L(shl_5_bwd): lea (L(shl_5_bwd_loop_L2)-L(shl_5_bwd_loop_L1))(%r9), %r9 L(L5_bwd): lea -64(%rdx), %rdx - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_5_bwd_loop_L2): prefetchnta -0x1c0(%rsi) @@ -836,7 +849,7 @@ L(shl_5_bwd_loop_L1): movaps %xmm3, 0x10(%rdi) jb L(shl_5_bwd_end) movaps %xmm4, (%rdi) - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_5_bwd_end): movaps %xmm4, (%rdi) @@ -853,7 +866,7 @@ L(shl_6): lea (L(shl_6_loop_L2)-L(shl_6_loop_L1))(%r9), %r9 L(L6_fwd): lea -64(%rdx), %rdx - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_6_loop_L2): prefetchnta 0x1c0(%rsi) @@ -876,7 +889,7 @@ L(shl_6_loop_L1): jb L(shl_6_end) movaps %xmm4, -0x20(%rdi) movaps %xmm5, -0x10(%rdi) - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_6_end): movaps %xmm4, -0x20(%rdi) @@ -896,7 +909,7 @@ L(shl_6_bwd): lea (L(shl_6_bwd_loop_L2)-L(shl_6_bwd_loop_L1))(%r9), %r9 L(L6_bwd): lea -64(%rdx), %rdx - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_6_bwd_loop_L2): prefetchnta -0x1c0(%rsi) @@ -921,7 +934,7 @@ L(shl_6_bwd_loop_L1): movaps %xmm3, 0x10(%rdi) jb L(shl_6_bwd_end) movaps %xmm4, (%rdi) - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_6_bwd_end): movaps %xmm4, (%rdi) @@ -938,7 +951,7 @@ L(shl_7): lea (L(shl_7_loop_L2)-L(shl_7_loop_L1))(%r9), %r9 L(L7_fwd): lea -64(%rdx), %rdx - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_7_loop_L2): prefetchnta 0x1c0(%rsi) @@ -961,7 +974,7 @@ L(shl_7_loop_L1): jb L(shl_7_end) movaps %xmm4, -0x20(%rdi) movaps %xmm5, -0x10(%rdi) - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_7_end): movaps %xmm4, -0x20(%rdi) @@ -981,7 +994,7 @@ L(shl_7_bwd): lea (L(shl_7_bwd_loop_L2)-L(shl_7_bwd_loop_L1))(%r9), %r9 L(L7_bwd): lea -64(%rdx), %rdx - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_7_bwd_loop_L2): prefetchnta -0x1c0(%rsi) @@ -1006,7 +1019,7 @@ L(shl_7_bwd_loop_L1): movaps %xmm3, 0x10(%rdi) jb L(shl_7_bwd_end) movaps %xmm4, (%rdi) - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_7_bwd_end): movaps %xmm4, (%rdi) @@ -1023,7 +1036,7 @@ L(shl_8): lea (L(shl_8_loop_L2)-L(shl_8_loop_L1))(%r9), %r9 L(L8_fwd): lea -64(%rdx), %rdx - jmp *%r9 + _CET_NOTRACK jmp *%r9 L(shl_8_loop_L2): prefetchnta 0x1c0(%rsi) L(shl_8_loop_L1): @@ -1045,7 +1058,7 @@ L(shl_8_loop_L1): jb L(shl_8_end) movaps %xmm4, -0x20(%rdi) movaps %xmm5, -0x10(%rdi) - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 .p2align 4 L(shl_8_end): @@ -1066,7 +1079,7 @@ L(shl_8_bwd): lea (L(shl_8_bwd_loop_L2)-L(shl_8_bwd_loop_L1))(%r9), %r9 L(L8_bwd): lea -64(%rdx), %rdx - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_8_bwd_loop_L2): prefetchnta -0x1c0(%rsi) @@ -1091,7 +1104,7 @@ L(shl_8_bwd_loop_L1): movaps %xmm3, 0x10(%rdi) jb L(shl_8_bwd_end) movaps %xmm4, (%rdi) - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_8_bwd_end): movaps %xmm4, (%rdi) @@ -1108,7 +1121,7 @@ L(shl_9): lea (L(shl_9_loop_L2)-L(shl_9_loop_L1))(%r9), %r9 L(L9_fwd): lea -64(%rdx), %rdx - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_9_loop_L2): prefetchnta 0x1c0(%rsi) @@ -1131,7 +1144,7 @@ L(shl_9_loop_L1): jb L(shl_9_end) movaps %xmm4, -0x20(%rdi) movaps %xmm5, -0x10(%rdi) - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_9_end): movaps %xmm4, -0x20(%rdi) @@ -1151,7 +1164,7 @@ L(shl_9_bwd): lea (L(shl_9_bwd_loop_L2)-L(shl_9_bwd_loop_L1))(%r9), %r9 L(L9_bwd): lea -64(%rdx), %rdx - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_9_bwd_loop_L2): prefetchnta -0x1c0(%rsi) @@ -1176,7 +1189,7 @@ L(shl_9_bwd_loop_L1): movaps %xmm3, 0x10(%rdi) jb L(shl_9_bwd_end) movaps %xmm4, (%rdi) - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_9_bwd_end): movaps %xmm4, (%rdi) @@ -1193,7 +1206,7 @@ L(shl_10): lea (L(shl_10_loop_L2)-L(shl_10_loop_L1))(%r9), %r9 L(L10_fwd): lea -64(%rdx), %rdx - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_10_loop_L2): prefetchnta 0x1c0(%rsi) @@ -1216,7 +1229,7 @@ L(shl_10_loop_L1): jb L(shl_10_end) movaps %xmm4, -0x20(%rdi) movaps %xmm5, -0x10(%rdi) - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_10_end): movaps %xmm4, -0x20(%rdi) @@ -1236,7 +1249,7 @@ L(shl_10_bwd): lea (L(shl_10_bwd_loop_L2)-L(shl_10_bwd_loop_L1))(%r9), %r9 L(L10_bwd): lea -64(%rdx), %rdx - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_10_bwd_loop_L2): prefetchnta -0x1c0(%rsi) @@ -1261,7 +1274,7 @@ L(shl_10_bwd_loop_L1): movaps %xmm3, 0x10(%rdi) jb L(shl_10_bwd_end) movaps %xmm4, (%rdi) - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_10_bwd_end): movaps %xmm4, (%rdi) @@ -1278,7 +1291,7 @@ L(shl_11): lea (L(shl_11_loop_L2)-L(shl_11_loop_L1))(%r9), %r9 L(L11_fwd): lea -64(%rdx), %rdx - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_11_loop_L2): prefetchnta 0x1c0(%rsi) @@ -1301,7 +1314,7 @@ L(shl_11_loop_L1): jb L(shl_11_end) movaps %xmm4, -0x20(%rdi) movaps %xmm5, -0x10(%rdi) - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_11_end): movaps %xmm4, -0x20(%rdi) @@ -1321,7 +1334,7 @@ L(shl_11_bwd): lea (L(shl_11_bwd_loop_L2)-L(shl_11_bwd_loop_L1))(%r9), %r9 L(L11_bwd): lea -64(%rdx), %rdx - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_11_bwd_loop_L2): prefetchnta -0x1c0(%rsi) @@ -1346,7 +1359,7 @@ L(shl_11_bwd_loop_L1): movaps %xmm3, 0x10(%rdi) jb L(shl_11_bwd_end) movaps %xmm4, (%rdi) - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_11_bwd_end): movaps %xmm4, (%rdi) @@ -1363,7 +1376,7 @@ L(shl_12): lea (L(shl_12_loop_L2)-L(shl_12_loop_L1))(%r9), %r9 L(L12_fwd): lea -64(%rdx), %rdx - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_12_loop_L2): prefetchnta 0x1c0(%rsi) @@ -1386,7 +1399,7 @@ L(shl_12_loop_L1): jb L(shl_12_end) movaps %xmm4, -0x20(%rdi) movaps %xmm5, -0x10(%rdi) - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_12_end): movaps %xmm4, -0x20(%rdi) @@ -1406,7 +1419,7 @@ L(shl_12_bwd): lea (L(shl_12_bwd_loop_L2)-L(shl_12_bwd_loop_L1))(%r9), %r9 L(L12_bwd): lea -64(%rdx), %rdx - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_12_bwd_loop_L2): prefetchnta -0x1c0(%rsi) @@ -1431,7 +1444,7 @@ L(shl_12_bwd_loop_L1): movaps %xmm3, 0x10(%rdi) jb L(shl_12_bwd_end) movaps %xmm4, (%rdi) - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_12_bwd_end): movaps %xmm4, (%rdi) @@ -1448,7 +1461,7 @@ L(shl_13): lea (L(shl_13_loop_L2)-L(shl_13_loop_L1))(%r9), %r9 L(L13_fwd): lea -64(%rdx), %rdx - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_13_loop_L2): prefetchnta 0x1c0(%rsi) @@ -1471,7 +1484,7 @@ L(shl_13_loop_L1): jb L(shl_13_end) movaps %xmm4, -0x20(%rdi) movaps %xmm5, -0x10(%rdi) - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_13_end): movaps %xmm4, -0x20(%rdi) @@ -1491,7 +1504,7 @@ L(shl_13_bwd): lea (L(shl_13_bwd_loop_L2)-L(shl_13_bwd_loop_L1))(%r9), %r9 L(L13_bwd): lea -64(%rdx), %rdx - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_13_bwd_loop_L2): prefetchnta -0x1c0(%rsi) @@ -1516,7 +1529,7 @@ L(shl_13_bwd_loop_L1): movaps %xmm3, 0x10(%rdi) jb L(shl_13_bwd_end) movaps %xmm4, (%rdi) - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_13_bwd_end): movaps %xmm4, (%rdi) @@ -1533,7 +1546,7 @@ L(shl_14): lea (L(shl_14_loop_L2)-L(shl_14_loop_L1))(%r9), %r9 L(L14_fwd): lea -64(%rdx), %rdx - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_14_loop_L2): prefetchnta 0x1c0(%rsi) @@ -1556,7 +1569,7 @@ L(shl_14_loop_L1): jb L(shl_14_end) movaps %xmm4, -0x20(%rdi) movaps %xmm5, -0x10(%rdi) - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_14_end): movaps %xmm4, -0x20(%rdi) @@ -1576,7 +1589,7 @@ L(shl_14_bwd): lea (L(shl_14_bwd_loop_L2)-L(shl_14_bwd_loop_L1))(%r9), %r9 L(L14_bwd): lea -64(%rdx), %rdx - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_14_bwd_loop_L2): prefetchnta -0x1c0(%rsi) @@ -1601,7 +1614,7 @@ L(shl_14_bwd_loop_L1): movaps %xmm3, 0x10(%rdi) jb L(shl_14_bwd_end) movaps %xmm4, (%rdi) - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_14_bwd_end): movaps %xmm4, (%rdi) @@ -1618,7 +1631,7 @@ L(shl_15): lea (L(shl_15_loop_L2)-L(shl_15_loop_L1))(%r9), %r9 L(L15_fwd): lea -64(%rdx), %rdx - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_15_loop_L2): prefetchnta 0x1c0(%rsi) @@ -1641,7 +1654,7 @@ L(shl_15_loop_L1): jb L(shl_15_end) movaps %xmm4, -0x20(%rdi) movaps %xmm5, -0x10(%rdi) - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_15_end): movaps %xmm4, -0x20(%rdi) @@ -1661,7 +1674,7 @@ L(shl_15_bwd): lea (L(shl_15_bwd_loop_L2)-L(shl_15_bwd_loop_L1))(%r9), %r9 L(L15_bwd): lea -64(%rdx), %rdx - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_15_bwd_loop_L2): prefetchnta -0x1c0(%rsi) @@ -1686,7 +1699,7 @@ L(shl_15_bwd_loop_L1): movaps %xmm3, 0x10(%rdi) jb L(shl_15_bwd_end) movaps %xmm4, (%rdi) - jmp *%r9 + _CET_NOTRACK jmp *%r9 ud2 L(shl_15_bwd_end): movaps %xmm4, (%rdi) diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S deleted file mode 100644 index 64a1bcd137..0000000000 --- a/sysdeps/x86_64/multiarch/memcpy.S +++ /dev/null @@ -1,89 +0,0 @@ -/* Multiple versions of memcpy - All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2010-2016 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <shlib-compat.h> -#include <init-arch.h> - -/* Define multiple versions only for the definition in lib and for - DSO. In static binaries we need memcpy before the initialization - happened. */ -#if defined SHARED && IS_IN (libc) - .text -ENTRY(__new_memcpy) - .type __new_memcpy, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX -#ifdef HAVE_AVX512_ASM_SUPPORT - HAS_ARCH_FEATURE (AVX512F_Usable) - jz 1f - HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) - jz 1f - leaq __memcpy_avx512_no_vzeroupper(%rip), %rax - ret -#endif -1: leaq __memcpy_avx_unaligned(%rip), %rax - HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) - jz 2f - ret -2: leaq __memcpy_sse2(%rip), %rax - HAS_ARCH_FEATURE (Slow_BSF) - jnz 3f - leaq __memcpy_sse2_unaligned(%rip), %rax - ret -3: HAS_CPU_FEATURE (SSSE3) - jz 4f - leaq __memcpy_ssse3(%rip), %rax -4: ret -END(__new_memcpy) - -# undef ENTRY -# define ENTRY(name) \ - .type __memcpy_sse2, @function; \ - .globl __memcpy_sse2; \ - .hidden __memcpy_sse2; \ - .p2align 4; \ - __memcpy_sse2: cfi_startproc; \ - CALL_MCOUNT -# undef END -# define END(name) \ - cfi_endproc; .size __memcpy_sse2, .-__memcpy_sse2 - -# undef ENTRY_CHK -# define ENTRY_CHK(name) \ - .type __memcpy_chk_sse2, @function; \ - .globl __memcpy_chk_sse2; \ - .p2align 4; \ - __memcpy_chk_sse2: cfi_startproc; \ - CALL_MCOUNT -# undef END_CHK -# define END_CHK(name) \ - cfi_endproc; .size __memcpy_chk_sse2, .-__memcpy_chk_sse2 - -# undef libc_hidden_builtin_def -/* It doesn't make sense to send libc-internal memcpy calls through a PLT. - The speedup we get from using SSSE3 instruction is likely eaten away - by the indirect call in the PLT. */ -# define libc_hidden_builtin_def(name) \ - .globl __GI_memcpy; __GI_memcpy = __memcpy_sse2 - -versioned_symbol (libc, __new_memcpy, memcpy, GLIBC_2_14); -#endif - -#include "../memcpy.S" diff --git a/sysdeps/x86_64/multiarch/memcpy.c b/sysdeps/x86_64/multiarch/memcpy.c new file mode 100644 index 0000000000..419f76aefc --- /dev/null +++ b/sysdeps/x86_64/multiarch/memcpy.c @@ -0,0 +1,39 @@ +/* Multiple versions of memcpy. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) +# define memcpy __redirect_memcpy +# include <string.h> +# undef memcpy + +# define SYMBOL_NAME memcpy +# include "ifunc-memmove.h" + +libc_ifunc_redirected (__redirect_memcpy, __new_memcpy, + IFUNC_SELECTOR ()); + +# ifdef SHARED +__hidden_ver1 (__new_memcpy, __GI_memcpy, __redirect_memcpy) + __attribute__ ((visibility ("hidden"))); +# endif + +# include <shlib-compat.h> +versioned_symbol (libc, __new_memcpy, memcpy, GLIBC_2_14); +#endif diff --git a/sysdeps/x86_64/multiarch/memcpy_chk-nonshared.S b/sysdeps/x86_64/multiarch/memcpy_chk-nonshared.S new file mode 100644 index 0000000000..84c8842ce7 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memcpy_chk-nonshared.S @@ -0,0 +1,21 @@ +/* Non-shared version of memcpy_chk for x86-64. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) && !defined SHARED +# include <sysdeps/x86_64/memcpy_chk.S> +#endif diff --git a/sysdeps/x86_64/multiarch/memcpy_chk.S b/sysdeps/x86_64/multiarch/memcpy_chk.S deleted file mode 100644 index 648217e971..0000000000 --- a/sysdeps/x86_64/multiarch/memcpy_chk.S +++ /dev/null @@ -1,56 +0,0 @@ -/* Multiple versions of __memcpy_chk - All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2010-2016 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - -/* Define multiple versions only for the definition in lib and for - DSO. There are no multiarch memcpy functions for static binaries. - */ -#if IS_IN (libc) -# ifdef SHARED - .text -ENTRY(__memcpy_chk) - .type __memcpy_chk, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX -#ifdef HAVE_AVX512_ASM_SUPPORT - HAS_ARCH_FEATURE (AVX512F_Usable) - jz 1f - HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) - jz 1f - leaq __memcpy_chk_avx512_no_vzeroupper(%rip), %rax - ret -#endif -1: leaq __memcpy_chk_sse2(%rip), %rax - HAS_CPU_FEATURE (SSSE3) - jz 2f - leaq __memcpy_chk_ssse3(%rip), %rax - HAS_ARCH_FEATURE (Fast_Copy_Backward) - jz 2f - leaq __memcpy_chk_ssse3_back(%rip), %rax - HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) - jz 2f - leaq __memcpy_chk_avx_unaligned(%rip), %rax -2: ret -END(__memcpy_chk) -# else -# include "../memcpy_chk.S" -# endif -#endif diff --git a/sysdeps/x86_64/multiarch/memcpy_chk.c b/sysdeps/x86_64/multiarch/memcpy_chk.c new file mode 100644 index 0000000000..c9b901a6dd --- /dev/null +++ b/sysdeps/x86_64/multiarch/memcpy_chk.c @@ -0,0 +1,31 @@ +/* Multiple versions of __memcpy_chk + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc.so. */ +#if IS_IN (libc) && defined SHARED +# define __memcpy_chk __redirect_memcpy_chk +# include <string.h> +# undef __memcpy_chk + +# define SYMBOL_NAME memcpy_chk +# include "ifunc-memmove.h" + +libc_ifunc_redirected (__redirect_memcpy_chk, __memcpy_chk, + IFUNC_SELECTOR ()); +#endif diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S new file mode 100644 index 0000000000..e195e93f15 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S @@ -0,0 +1,12 @@ +#if IS_IN (libc) +# define VEC_SIZE 32 +# define VEC(i) ymm##i +# define VMOVNT vmovntdq +# define VMOVU vmovdqu +# define VMOVA vmovdqa + +# define SECTION(p) p##.avx +# define MEMMOVE_SYMBOL(p,s) p##_avx_##s + +# include "memmove-vec-unaligned-erms.S" +#endif diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S index 518d1fec35..effc3ac2de 100644 --- a/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S +++ b/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S @@ -1,5 +1,5 @@ -/* memmove optimized with AVX512 for KNL hardware. - Copyright (C) 2016 Free Software Foundation, Inc. +/* memmove/memcpy/mempcpy optimized with AVX512 for KNL hardware. + Copyright (C) 2016-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -16,7 +16,400 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#define USE_AS_MEMMOVE -#define MEMCPY __memmove_avx512_no_vzeroupper -#define MEMCPY_CHK __memmove_chk_avx512_no_vzeroupper -#include "memcpy-avx512-no-vzeroupper.S" +#include <sysdep.h> + +#if IS_IN (libc) + +# include "asm-syntax.h" + + .section .text.avx512,"ax",@progbits +ENTRY (__mempcpy_chk_avx512_no_vzeroupper) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END (__mempcpy_chk_avx512_no_vzeroupper) + +ENTRY (__mempcpy_avx512_no_vzeroupper) + movq %rdi, %rax + addq %rdx, %rax + jmp L(start) +END (__mempcpy_avx512_no_vzeroupper) + +ENTRY (__memmove_chk_avx512_no_vzeroupper) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END (__memmove_chk_avx512_no_vzeroupper) + +ENTRY (__memmove_avx512_no_vzeroupper) + mov %rdi, %rax +# ifdef USE_AS_MEMPCPY + add %rdx, %rax +# endif +L(start): + lea (%rsi, %rdx), %rcx + lea (%rdi, %rdx), %r9 + cmp $512, %rdx + ja L(512bytesormore) + +L(check): + cmp $16, %rdx + jbe L(less_16bytes) + cmp $256, %rdx + jb L(less_256bytes) + vmovups (%rsi), %zmm0 + vmovups 0x40(%rsi), %zmm1 + vmovups 0x80(%rsi), %zmm2 + vmovups 0xC0(%rsi), %zmm3 + vmovups -0x100(%rcx), %zmm4 + vmovups -0xC0(%rcx), %zmm5 + vmovups -0x80(%rcx), %zmm6 + vmovups -0x40(%rcx), %zmm7 + vmovups %zmm0, (%rdi) + vmovups %zmm1, 0x40(%rdi) + vmovups %zmm2, 0x80(%rdi) + vmovups %zmm3, 0xC0(%rdi) + vmovups %zmm4, -0x100(%r9) + vmovups %zmm5, -0xC0(%r9) + vmovups %zmm6, -0x80(%r9) + vmovups %zmm7, -0x40(%r9) + ret + +L(less_256bytes): + cmp $128, %dl + jb L(less_128bytes) + vmovups (%rsi), %zmm0 + vmovups 0x40(%rsi), %zmm1 + vmovups -0x80(%rcx), %zmm2 + vmovups -0x40(%rcx), %zmm3 + vmovups %zmm0, (%rdi) + vmovups %zmm1, 0x40(%rdi) + vmovups %zmm2, -0x80(%r9) + vmovups %zmm3, -0x40(%r9) + ret + +L(less_128bytes): + cmp $64, %dl + jb L(less_64bytes) + vmovdqu (%rsi), %ymm0 + vmovdqu 0x20(%rsi), %ymm1 + vmovdqu -0x40(%rcx), %ymm2 + vmovdqu -0x20(%rcx), %ymm3 + vmovdqu %ymm0, (%rdi) + vmovdqu %ymm1, 0x20(%rdi) + vmovdqu %ymm2, -0x40(%r9) + vmovdqu %ymm3, -0x20(%r9) + ret + +L(less_64bytes): + cmp $32, %dl + jb L(less_32bytes) + vmovdqu (%rsi), %ymm0 + vmovdqu -0x20(%rcx), %ymm1 + vmovdqu %ymm0, (%rdi) + vmovdqu %ymm1, -0x20(%r9) + ret + +L(less_32bytes): + vmovdqu (%rsi), %xmm0 + vmovdqu -0x10(%rcx), %xmm1 + vmovdqu %xmm0, (%rdi) + vmovdqu %xmm1, -0x10(%r9) + ret + +L(less_16bytes): + cmp $8, %dl + jb L(less_8bytes) + movq (%rsi), %rsi + movq -0x8(%rcx), %rcx + movq %rsi, (%rdi) + movq %rcx, -0x8(%r9) + ret + +L(less_8bytes): + cmp $4, %dl + jb L(less_4bytes) + mov (%rsi), %esi + mov -0x4(%rcx), %ecx + mov %esi, (%rdi) + mov %ecx, -0x4(%r9) + ret + +L(less_4bytes): + cmp $2, %dl + jb L(less_2bytes) + mov (%rsi), %si + mov -0x2(%rcx), %cx + mov %si, (%rdi) + mov %cx, -0x2(%r9) + ret + +L(less_2bytes): + cmp $1, %dl + jb L(less_1bytes) + mov (%rsi), %cl + mov %cl, (%rdi) +L(less_1bytes): + ret + +L(512bytesormore): +# ifdef SHARED_CACHE_SIZE_HALF + mov $SHARED_CACHE_SIZE_HALF, %r8 +# else + mov __x86_shared_cache_size_half(%rip), %r8 +# endif + cmp %r8, %rdx + jae L(preloop_large) + cmp $1024, %rdx + ja L(1024bytesormore) + prefetcht1 (%rsi) + prefetcht1 0x40(%rsi) + prefetcht1 0x80(%rsi) + prefetcht1 0xC0(%rsi) + prefetcht1 0x100(%rsi) + prefetcht1 0x140(%rsi) + prefetcht1 0x180(%rsi) + prefetcht1 0x1C0(%rsi) + prefetcht1 -0x200(%rcx) + prefetcht1 -0x1C0(%rcx) + prefetcht1 -0x180(%rcx) + prefetcht1 -0x140(%rcx) + prefetcht1 -0x100(%rcx) + prefetcht1 -0xC0(%rcx) + prefetcht1 -0x80(%rcx) + prefetcht1 -0x40(%rcx) + vmovups (%rsi), %zmm0 + vmovups 0x40(%rsi), %zmm1 + vmovups 0x80(%rsi), %zmm2 + vmovups 0xC0(%rsi), %zmm3 + vmovups 0x100(%rsi), %zmm4 + vmovups 0x140(%rsi), %zmm5 + vmovups 0x180(%rsi), %zmm6 + vmovups 0x1C0(%rsi), %zmm7 + vmovups -0x200(%rcx), %zmm8 + vmovups -0x1C0(%rcx), %zmm9 + vmovups -0x180(%rcx), %zmm10 + vmovups -0x140(%rcx), %zmm11 + vmovups -0x100(%rcx), %zmm12 + vmovups -0xC0(%rcx), %zmm13 + vmovups -0x80(%rcx), %zmm14 + vmovups -0x40(%rcx), %zmm15 + vmovups %zmm0, (%rdi) + vmovups %zmm1, 0x40(%rdi) + vmovups %zmm2, 0x80(%rdi) + vmovups %zmm3, 0xC0(%rdi) + vmovups %zmm4, 0x100(%rdi) + vmovups %zmm5, 0x140(%rdi) + vmovups %zmm6, 0x180(%rdi) + vmovups %zmm7, 0x1C0(%rdi) + vmovups %zmm8, -0x200(%r9) + vmovups %zmm9, -0x1C0(%r9) + vmovups %zmm10, -0x180(%r9) + vmovups %zmm11, -0x140(%r9) + vmovups %zmm12, -0x100(%r9) + vmovups %zmm13, -0xC0(%r9) + vmovups %zmm14, -0x80(%r9) + vmovups %zmm15, -0x40(%r9) + ret + +L(1024bytesormore): + cmp %rsi, %rdi + ja L(1024bytesormore_bkw) + sub $512, %r9 + vmovups -0x200(%rcx), %zmm8 + vmovups -0x1C0(%rcx), %zmm9 + vmovups -0x180(%rcx), %zmm10 + vmovups -0x140(%rcx), %zmm11 + vmovups -0x100(%rcx), %zmm12 + vmovups -0xC0(%rcx), %zmm13 + vmovups -0x80(%rcx), %zmm14 + vmovups -0x40(%rcx), %zmm15 + prefetcht1 (%rsi) + prefetcht1 0x40(%rsi) + prefetcht1 0x80(%rsi) + prefetcht1 0xC0(%rsi) + prefetcht1 0x100(%rsi) + prefetcht1 0x140(%rsi) + prefetcht1 0x180(%rsi) + prefetcht1 0x1C0(%rsi) + +/* Loop with unaligned memory access. */ +L(gobble_512bytes_loop): + vmovups (%rsi), %zmm0 + vmovups 0x40(%rsi), %zmm1 + vmovups 0x80(%rsi), %zmm2 + vmovups 0xC0(%rsi), %zmm3 + vmovups 0x100(%rsi), %zmm4 + vmovups 0x140(%rsi), %zmm5 + vmovups 0x180(%rsi), %zmm6 + vmovups 0x1C0(%rsi), %zmm7 + add $512, %rsi + prefetcht1 (%rsi) + prefetcht1 0x40(%rsi) + prefetcht1 0x80(%rsi) + prefetcht1 0xC0(%rsi) + prefetcht1 0x100(%rsi) + prefetcht1 0x140(%rsi) + prefetcht1 0x180(%rsi) + prefetcht1 0x1C0(%rsi) + vmovups %zmm0, (%rdi) + vmovups %zmm1, 0x40(%rdi) + vmovups %zmm2, 0x80(%rdi) + vmovups %zmm3, 0xC0(%rdi) + vmovups %zmm4, 0x100(%rdi) + vmovups %zmm5, 0x140(%rdi) + vmovups %zmm6, 0x180(%rdi) + vmovups %zmm7, 0x1C0(%rdi) + add $512, %rdi + cmp %r9, %rdi + jb L(gobble_512bytes_loop) + vmovups %zmm8, (%r9) + vmovups %zmm9, 0x40(%r9) + vmovups %zmm10, 0x80(%r9) + vmovups %zmm11, 0xC0(%r9) + vmovups %zmm12, 0x100(%r9) + vmovups %zmm13, 0x140(%r9) + vmovups %zmm14, 0x180(%r9) + vmovups %zmm15, 0x1C0(%r9) + ret + +L(1024bytesormore_bkw): + add $512, %rdi + vmovups 0x1C0(%rsi), %zmm8 + vmovups 0x180(%rsi), %zmm9 + vmovups 0x140(%rsi), %zmm10 + vmovups 0x100(%rsi), %zmm11 + vmovups 0xC0(%rsi), %zmm12 + vmovups 0x80(%rsi), %zmm13 + vmovups 0x40(%rsi), %zmm14 + vmovups (%rsi), %zmm15 + prefetcht1 -0x40(%rcx) + prefetcht1 -0x80(%rcx) + prefetcht1 -0xC0(%rcx) + prefetcht1 -0x100(%rcx) + prefetcht1 -0x140(%rcx) + prefetcht1 -0x180(%rcx) + prefetcht1 -0x1C0(%rcx) + prefetcht1 -0x200(%rcx) + +/* Backward loop with unaligned memory access. */ +L(gobble_512bytes_loop_bkw): + vmovups -0x40(%rcx), %zmm0 + vmovups -0x80(%rcx), %zmm1 + vmovups -0xC0(%rcx), %zmm2 + vmovups -0x100(%rcx), %zmm3 + vmovups -0x140(%rcx), %zmm4 + vmovups -0x180(%rcx), %zmm5 + vmovups -0x1C0(%rcx), %zmm6 + vmovups -0x200(%rcx), %zmm7 + sub $512, %rcx + prefetcht1 -0x40(%rcx) + prefetcht1 -0x80(%rcx) + prefetcht1 -0xC0(%rcx) + prefetcht1 -0x100(%rcx) + prefetcht1 -0x140(%rcx) + prefetcht1 -0x180(%rcx) + prefetcht1 -0x1C0(%rcx) + prefetcht1 -0x200(%rcx) + vmovups %zmm0, -0x40(%r9) + vmovups %zmm1, -0x80(%r9) + vmovups %zmm2, -0xC0(%r9) + vmovups %zmm3, -0x100(%r9) + vmovups %zmm4, -0x140(%r9) + vmovups %zmm5, -0x180(%r9) + vmovups %zmm6, -0x1C0(%r9) + vmovups %zmm7, -0x200(%r9) + sub $512, %r9 + cmp %rdi, %r9 + ja L(gobble_512bytes_loop_bkw) + vmovups %zmm8, -0x40(%rdi) + vmovups %zmm9, -0x80(%rdi) + vmovups %zmm10, -0xC0(%rdi) + vmovups %zmm11, -0x100(%rdi) + vmovups %zmm12, -0x140(%rdi) + vmovups %zmm13, -0x180(%rdi) + vmovups %zmm14, -0x1C0(%rdi) + vmovups %zmm15, -0x200(%rdi) + ret + +L(preloop_large): + cmp %rsi, %rdi + ja L(preloop_large_bkw) + vmovups (%rsi), %zmm4 + vmovups 0x40(%rsi), %zmm5 + + mov %rdi, %r11 +/* Align destination for access with non-temporal stores in the loop. */ + mov %rdi, %r8 + and $-0x80, %rdi + add $0x80, %rdi + sub %rdi, %r8 + sub %r8, %rsi + add %r8, %rdx +L(gobble_256bytes_nt_loop): + prefetcht1 0x200(%rsi) + prefetcht1 0x240(%rsi) + prefetcht1 0x280(%rsi) + prefetcht1 0x2C0(%rsi) + prefetcht1 0x300(%rsi) + prefetcht1 0x340(%rsi) + prefetcht1 0x380(%rsi) + prefetcht1 0x3C0(%rsi) + vmovdqu64 (%rsi), %zmm0 + vmovdqu64 0x40(%rsi), %zmm1 + vmovdqu64 0x80(%rsi), %zmm2 + vmovdqu64 0xC0(%rsi), %zmm3 + vmovntdq %zmm0, (%rdi) + vmovntdq %zmm1, 0x40(%rdi) + vmovntdq %zmm2, 0x80(%rdi) + vmovntdq %zmm3, 0xC0(%rdi) + sub $256, %rdx + add $256, %rsi + add $256, %rdi + cmp $256, %rdx + ja L(gobble_256bytes_nt_loop) + sfence + vmovups %zmm4, (%r11) + vmovups %zmm5, 0x40(%r11) + jmp L(check) + +L(preloop_large_bkw): + vmovups -0x80(%rcx), %zmm4 + vmovups -0x40(%rcx), %zmm5 + +/* Align end of destination for access with non-temporal stores. */ + mov %r9, %r8 + and $-0x80, %r9 + sub %r9, %r8 + sub %r8, %rcx + sub %r8, %rdx + add %r9, %r8 +L(gobble_256bytes_nt_loop_bkw): + prefetcht1 -0x400(%rcx) + prefetcht1 -0x3C0(%rcx) + prefetcht1 -0x380(%rcx) + prefetcht1 -0x340(%rcx) + prefetcht1 -0x300(%rcx) + prefetcht1 -0x2C0(%rcx) + prefetcht1 -0x280(%rcx) + prefetcht1 -0x240(%rcx) + vmovdqu64 -0x100(%rcx), %zmm0 + vmovdqu64 -0xC0(%rcx), %zmm1 + vmovdqu64 -0x80(%rcx), %zmm2 + vmovdqu64 -0x40(%rcx), %zmm3 + vmovntdq %zmm0, -0x100(%r9) + vmovntdq %zmm1, -0xC0(%r9) + vmovntdq %zmm2, -0x80(%r9) + vmovntdq %zmm3, -0x40(%r9) + sub $256, %rdx + sub $256, %rcx + sub $256, %r9 + cmp $256, %rdx + ja L(gobble_256bytes_nt_loop_bkw) + sfence + vmovups %zmm4, -0x80(%r8) + vmovups %zmm5, -0x40(%r8) + jmp L(check) +END (__memmove_avx512_no_vzeroupper) + +strong_alias (__memmove_avx512_no_vzeroupper, __memcpy_avx512_no_vzeroupper) +strong_alias (__memmove_chk_avx512_no_vzeroupper, __memcpy_chk_avx512_no_vzeroupper) +#endif diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S new file mode 100644 index 0000000000..aac1515cf6 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S @@ -0,0 +1,12 @@ +#if IS_IN (libc) +# define VEC_SIZE 64 +# define VEC(i) zmm##i +# define VMOVNT vmovntdq +# define VMOVU vmovdqu64 +# define VMOVA vmovdqa64 + +# define SECTION(p) p##.avx512 +# define MEMMOVE_SYMBOL(p,s) p##_avx512_##s + +# include "memmove-vec-unaligned-erms.S" +#endif diff --git a/sysdeps/x86_64/fpu/multiarch/s_floorf.S b/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S index 74a149a950..7c6163ddcb 100644 --- a/sysdeps/x86_64/fpu/multiarch/s_floorf.S +++ b/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S @@ -1,6 +1,6 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. +/* memmove with SSE2. + Copyright (C) 2017-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. - Contributed by Ulrich Drepper <drepper@gmail.come>, 2011. The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public @@ -16,23 +16,18 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#include <machine/asm.h> -#include <init-arch.h> - - -ENTRY(__floorf) - .type __floorf, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq __floorf_sse41(%rip), %rax - HAS_CPU_FEATURE (SSE4_1) - jnz 2f - leaq __floorf_c(%rip), %rax -2: ret -END(__floorf) -weak_alias (__floorf, floorf) - - -ENTRY(__floorf_sse41) - roundss $1, %xmm0, %xmm0 - ret -END(__floorf_sse41) +#if IS_IN (libc) +# define MEMMOVE_SYMBOL(p,s) p##_sse2_##s +#else +weak_alias (__mempcpy, mempcpy) +#endif + +#include <sysdeps/x86_64/memmove.S> + +#if defined SHARED && IS_IN (libc) +# include <shlib-compat.h> +# if SHLIB_COMPAT (libc, GLIBC_2_2_5, GLIBC_2_14) +/* Use __memmove_sse2_unaligned to support overlapping addresses. */ +compat_symbol (libc, __memmove_sse2_unaligned, memcpy, GLIBC_2_2_5); +# endif +#endif diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S new file mode 100644 index 0000000000..e2ede45e9f --- /dev/null +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S @@ -0,0 +1,565 @@ +/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb + Copyright (C) 2016-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* memmove/memcpy/mempcpy is implemented as: + 1. Use overlapping load and store to avoid branch. + 2. Load all sources into registers and store them together to avoid + possible address overlap between source and destination. + 3. If size is 8 * VEC_SIZE or less, load all sources into registers + and store them together. + 4. If address of destination > address of source, backward copy + 4 * VEC_SIZE at a time with unaligned load and aligned store. + Load the first 4 * VEC and last VEC before the loop and store + them after the loop to support overlapping addresses. + 5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned + load and aligned store. Load the last 4 * VEC and first VEC + before the loop and store them after the loop to support + overlapping addresses. + 6. If size >= __x86_shared_non_temporal_threshold and there is no + overlap between destination and source, use non-temporal store + instead of aligned store. */ + +#include <sysdep.h> + +#ifndef MEMCPY_SYMBOL +# define MEMCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s) +#endif + +#ifndef MEMPCPY_SYMBOL +# define MEMPCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s) +#endif + +#ifndef MEMMOVE_CHK_SYMBOL +# define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s) +#endif + +#ifndef VZEROUPPER +# if VEC_SIZE > 16 +# define VZEROUPPER vzeroupper +# else +# define VZEROUPPER +# endif +#endif + +/* Threshold to use Enhanced REP MOVSB. Since there is overhead to set + up REP MOVSB operation, REP MOVSB isn't faster on short data. The + memcpy micro benchmark in glibc shows that 2KB is the approximate + value above which REP MOVSB becomes faster than SSE2 optimization + on processors with Enhanced REP MOVSB. Since larger register size + can move more data with a single load and store, the threshold is + higher with larger register size. */ +#ifndef REP_MOVSB_THRESHOLD +# define REP_MOVSB_THRESHOLD (2048 * (VEC_SIZE / 16)) +#endif + +#ifndef PREFETCH +# define PREFETCH(addr) prefetcht0 addr +#endif + +/* Assume 64-byte prefetch size. */ +#ifndef PREFETCH_SIZE +# define PREFETCH_SIZE 64 +#endif + +#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4) + +#if PREFETCH_SIZE == 64 +# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE +# define PREFETCH_ONE_SET(dir, base, offset) \ + PREFETCH ((offset)base) +# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE +# define PREFETCH_ONE_SET(dir, base, offset) \ + PREFETCH ((offset)base); \ + PREFETCH ((offset + dir * PREFETCH_SIZE)base) +# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE +# define PREFETCH_ONE_SET(dir, base, offset) \ + PREFETCH ((offset)base); \ + PREFETCH ((offset + dir * PREFETCH_SIZE)base); \ + PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \ + PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base) +# else +# error Unsupported PREFETCHED_LOAD_SIZE! +# endif +#else +# error Unsupported PREFETCH_SIZE! +#endif + +#ifndef SECTION +# error SECTION is not defined! +#endif + + .section SECTION(.text),"ax",@progbits +#if defined SHARED && IS_IN (libc) +ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned)) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned)) +#endif + +ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned)) + movq %rdi, %rax + addq %rdx, %rax + jmp L(start) +END (MEMPCPY_SYMBOL (__mempcpy, unaligned)) + +#if defined SHARED && IS_IN (libc) +ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned)) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned)) +#endif + +ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned)) + movq %rdi, %rax +L(start): + cmpq $VEC_SIZE, %rdx + jb L(less_vec) + cmpq $(VEC_SIZE * 2), %rdx + ja L(more_2x_vec) +#if !defined USE_MULTIARCH || !IS_IN (libc) +L(last_2x_vec): +#endif + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ + VMOVU (%rsi), %VEC(0) + VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) + VZEROUPPER +#if !defined USE_MULTIARCH || !IS_IN (libc) +L(nop): +#endif + ret +#if defined USE_MULTIARCH && IS_IN (libc) +END (MEMMOVE_SYMBOL (__memmove, unaligned)) + +# if VEC_SIZE == 16 +ENTRY (__mempcpy_chk_erms) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END (__mempcpy_chk_erms) + +/* Only used to measure performance of REP MOVSB. */ +ENTRY (__mempcpy_erms) + movq %rdi, %rax + /* Skip zero length. */ + testq %rdx, %rdx + jz 2f + addq %rdx, %rax + jmp L(start_movsb) +END (__mempcpy_erms) + +ENTRY (__memmove_chk_erms) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END (__memmove_chk_erms) + +ENTRY (__memmove_erms) + movq %rdi, %rax + /* Skip zero length. */ + testq %rdx, %rdx + jz 2f +L(start_movsb): + movq %rdx, %rcx + cmpq %rsi, %rdi + jb 1f + /* Source == destination is less common. */ + je 2f + leaq (%rsi,%rcx), %rdx + cmpq %rdx, %rdi + jb L(movsb_backward) +1: + rep movsb +2: + ret +L(movsb_backward): + leaq -1(%rdi,%rcx), %rdi + leaq -1(%rsi,%rcx), %rsi + std + rep movsb + cld + ret +END (__memmove_erms) +strong_alias (__memmove_erms, __memcpy_erms) +strong_alias (__memmove_chk_erms, __memcpy_chk_erms) +# endif + +# ifdef SHARED +ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms)) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms)) +# endif + +ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms)) + movq %rdi, %rax + addq %rdx, %rax + jmp L(start_erms) +END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms)) + +# ifdef SHARED +ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) +# endif + +ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) + movq %rdi, %rax +L(start_erms): + cmpq $VEC_SIZE, %rdx + jb L(less_vec) + cmpq $(VEC_SIZE * 2), %rdx + ja L(movsb_more_2x_vec) +L(last_2x_vec): + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ + VMOVU (%rsi), %VEC(0) + VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) +L(return): + VZEROUPPER + ret + +L(movsb): + cmpq __x86_shared_non_temporal_threshold(%rip), %rdx + jae L(more_8x_vec) + cmpq %rsi, %rdi + jb 1f + /* Source == destination is less common. */ + je L(nop) + leaq (%rsi,%rdx), %r9 + cmpq %r9, %rdi + /* Avoid slow backward REP MOVSB. */ +# if REP_MOVSB_THRESHOLD <= (VEC_SIZE * 8) +# error Unsupported REP_MOVSB_THRESHOLD and VEC_SIZE! +# endif + jb L(more_8x_vec_backward) +1: + movq %rdx, %rcx + rep movsb +L(nop): + ret +#endif + +L(less_vec): + /* Less than 1 VEC. */ +#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 +# error Unsupported VEC_SIZE! +#endif +#if VEC_SIZE > 32 + cmpb $32, %dl + jae L(between_32_63) +#endif +#if VEC_SIZE > 16 + cmpb $16, %dl + jae L(between_16_31) +#endif + cmpb $8, %dl + jae L(between_8_15) + cmpb $4, %dl + jae L(between_4_7) + cmpb $1, %dl + ja L(between_2_3) + jb 1f + movzbl (%rsi), %ecx + movb %cl, (%rdi) +1: + ret +#if VEC_SIZE > 32 +L(between_32_63): + /* From 32 to 63. No branch when size == 32. */ + vmovdqu (%rsi), %ymm0 + vmovdqu -32(%rsi,%rdx), %ymm1 + vmovdqu %ymm0, (%rdi) + vmovdqu %ymm1, -32(%rdi,%rdx) + VZEROUPPER + ret +#endif +#if VEC_SIZE > 16 + /* From 16 to 31. No branch when size == 16. */ +L(between_16_31): + vmovdqu (%rsi), %xmm0 + vmovdqu -16(%rsi,%rdx), %xmm1 + vmovdqu %xmm0, (%rdi) + vmovdqu %xmm1, -16(%rdi,%rdx) + ret +#endif +L(between_8_15): + /* From 8 to 15. No branch when size == 8. */ + movq -8(%rsi,%rdx), %rcx + movq (%rsi), %rsi + movq %rcx, -8(%rdi,%rdx) + movq %rsi, (%rdi) + ret +L(between_4_7): + /* From 4 to 7. No branch when size == 4. */ + movl -4(%rsi,%rdx), %ecx + movl (%rsi), %esi + movl %ecx, -4(%rdi,%rdx) + movl %esi, (%rdi) + ret +L(between_2_3): + /* From 2 to 3. No branch when size == 2. */ + movzwl -2(%rsi,%rdx), %ecx + movzwl (%rsi), %esi + movw %cx, -2(%rdi,%rdx) + movw %si, (%rdi) + ret + +#if defined USE_MULTIARCH && IS_IN (libc) +L(movsb_more_2x_vec): + cmpq $REP_MOVSB_THRESHOLD, %rdx + ja L(movsb) +#endif +L(more_2x_vec): + /* More than 2 * VEC and there may be overlap between destination + and source. */ + cmpq $(VEC_SIZE * 8), %rdx + ja L(more_8x_vec) + cmpq $(VEC_SIZE * 4), %rdx + jb L(last_4x_vec) + /* Copy from 4 * VEC to 8 * VEC, inclusively. */ + VMOVU (%rsi), %VEC(0) + VMOVU VEC_SIZE(%rsi), %VEC(1) + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) + VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(4) + VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5) + VMOVU -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6) + VMOVU -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7) + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(1), VEC_SIZE(%rdi) + VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi) + VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi) + VMOVU %VEC(4), -VEC_SIZE(%rdi,%rdx) + VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx) + VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx) + VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx) + VZEROUPPER + ret +L(last_4x_vec): + /* Copy from 2 * VEC to 4 * VEC. */ + VMOVU (%rsi), %VEC(0) + VMOVU VEC_SIZE(%rsi), %VEC(1) + VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2) + VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3) + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(1), VEC_SIZE(%rdi) + VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx) + VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx) + VZEROUPPER + ret + +L(more_8x_vec): + cmpq %rsi, %rdi + ja L(more_8x_vec_backward) + /* Source == destination is less common. */ + je L(nop) + /* Load the first VEC and last 4 * VEC to support overlapping + addresses. */ + VMOVU (%rsi), %VEC(4) + VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(5) + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6) + VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7) + VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8) + /* Save start and stop of the destination buffer. */ + movq %rdi, %r11 + leaq -VEC_SIZE(%rdi, %rdx), %rcx + /* Align destination for aligned stores in the loop. Compute + how much destination is misaligned. */ + movq %rdi, %r8 + andq $(VEC_SIZE - 1), %r8 + /* Get the negative of offset for alignment. */ + subq $VEC_SIZE, %r8 + /* Adjust source. */ + subq %r8, %rsi + /* Adjust destination which should be aligned now. */ + subq %r8, %rdi + /* Adjust length. */ + addq %r8, %rdx +#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) + /* Check non-temporal store threshold. */ + cmpq __x86_shared_non_temporal_threshold(%rip), %rdx + ja L(large_forward) +#endif +L(loop_4x_vec_forward): + /* Copy 4 * VEC a time forward. */ + VMOVU (%rsi), %VEC(0) + VMOVU VEC_SIZE(%rsi), %VEC(1) + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) + addq $(VEC_SIZE * 4), %rsi + subq $(VEC_SIZE * 4), %rdx + VMOVA %VEC(0), (%rdi) + VMOVA %VEC(1), VEC_SIZE(%rdi) + VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) + VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) + addq $(VEC_SIZE * 4), %rdi + cmpq $(VEC_SIZE * 4), %rdx + ja L(loop_4x_vec_forward) + /* Store the last 4 * VEC. */ + VMOVU %VEC(5), (%rcx) + VMOVU %VEC(6), -VEC_SIZE(%rcx) + VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx) + VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) + /* Store the first VEC. */ + VMOVU %VEC(4), (%r11) + VZEROUPPER + ret + +L(more_8x_vec_backward): + /* Load the first 4 * VEC and last VEC to support overlapping + addresses. */ + VMOVU (%rsi), %VEC(4) + VMOVU VEC_SIZE(%rsi), %VEC(5) + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(6) + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(7) + VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(8) + /* Save stop of the destination buffer. */ + leaq -VEC_SIZE(%rdi, %rdx), %r11 + /* Align destination end for aligned stores in the loop. Compute + how much destination end is misaligned. */ + leaq -VEC_SIZE(%rsi, %rdx), %rcx + movq %r11, %r9 + movq %r11, %r8 + andq $(VEC_SIZE - 1), %r8 + /* Adjust source. */ + subq %r8, %rcx + /* Adjust the end of destination which should be aligned now. */ + subq %r8, %r9 + /* Adjust length. */ + subq %r8, %rdx +#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) + /* Check non-temporal store threshold. */ + cmpq __x86_shared_non_temporal_threshold(%rip), %rdx + ja L(large_backward) +#endif +L(loop_4x_vec_backward): + /* Copy 4 * VEC a time backward. */ + VMOVU (%rcx), %VEC(0) + VMOVU -VEC_SIZE(%rcx), %VEC(1) + VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2) + VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3) + subq $(VEC_SIZE * 4), %rcx + subq $(VEC_SIZE * 4), %rdx + VMOVA %VEC(0), (%r9) + VMOVA %VEC(1), -VEC_SIZE(%r9) + VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9) + VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9) + subq $(VEC_SIZE * 4), %r9 + cmpq $(VEC_SIZE * 4), %rdx + ja L(loop_4x_vec_backward) + /* Store the first 4 * VEC. */ + VMOVU %VEC(4), (%rdi) + VMOVU %VEC(5), VEC_SIZE(%rdi) + VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi) + VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) + /* Store the last VEC. */ + VMOVU %VEC(8), (%r11) + VZEROUPPER + ret + +#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) +L(large_forward): + /* Don't use non-temporal store if there is overlap between + destination and source since destination may be in cache + when source is loaded. */ + leaq (%rdi, %rdx), %r10 + cmpq %r10, %rsi + jb L(loop_4x_vec_forward) +L(loop_large_forward): + /* Copy 4 * VEC a time forward with non-temporal stores. */ + PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2) + PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3) + VMOVU (%rsi), %VEC(0) + VMOVU VEC_SIZE(%rsi), %VEC(1) + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) + addq $PREFETCHED_LOAD_SIZE, %rsi + subq $PREFETCHED_LOAD_SIZE, %rdx + VMOVNT %VEC(0), (%rdi) + VMOVNT %VEC(1), VEC_SIZE(%rdi) + VMOVNT %VEC(2), (VEC_SIZE * 2)(%rdi) + VMOVNT %VEC(3), (VEC_SIZE * 3)(%rdi) + addq $PREFETCHED_LOAD_SIZE, %rdi + cmpq $PREFETCHED_LOAD_SIZE, %rdx + ja L(loop_large_forward) + sfence + /* Store the last 4 * VEC. */ + VMOVU %VEC(5), (%rcx) + VMOVU %VEC(6), -VEC_SIZE(%rcx) + VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx) + VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) + /* Store the first VEC. */ + VMOVU %VEC(4), (%r11) + VZEROUPPER + ret + +L(large_backward): + /* Don't use non-temporal store if there is overlap between + destination and source since destination may be in cache + when source is loaded. */ + leaq (%rcx, %rdx), %r10 + cmpq %r10, %r9 + jb L(loop_4x_vec_backward) +L(loop_large_backward): + /* Copy 4 * VEC a time backward with non-temporal stores. */ + PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2) + PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3) + VMOVU (%rcx), %VEC(0) + VMOVU -VEC_SIZE(%rcx), %VEC(1) + VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2) + VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3) + subq $PREFETCHED_LOAD_SIZE, %rcx + subq $PREFETCHED_LOAD_SIZE, %rdx + VMOVNT %VEC(0), (%r9) + VMOVNT %VEC(1), -VEC_SIZE(%r9) + VMOVNT %VEC(2), -(VEC_SIZE * 2)(%r9) + VMOVNT %VEC(3), -(VEC_SIZE * 3)(%r9) + subq $PREFETCHED_LOAD_SIZE, %r9 + cmpq $PREFETCHED_LOAD_SIZE, %rdx + ja L(loop_large_backward) + sfence + /* Store the first 4 * VEC. */ + VMOVU %VEC(4), (%rdi) + VMOVU %VEC(5), VEC_SIZE(%rdi) + VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi) + VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) + /* Store the last VEC. */ + VMOVU %VEC(8), (%r11) + VZEROUPPER + ret +#endif +END (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) + +#if IS_IN (libc) +# ifdef USE_MULTIARCH +strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms), + MEMMOVE_SYMBOL (__memcpy, unaligned_erms)) +# ifdef SHARED +strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms), + MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms)) +# endif +# endif +# ifdef SHARED +strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned), + MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned)) +# endif +#endif +strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned), + MEMCPY_SYMBOL (__memcpy, unaligned)) diff --git a/sysdeps/x86_64/multiarch/memmove.c b/sysdeps/x86_64/multiarch/memmove.c index 8da5640bb0..d512228eae 100644 --- a/sysdeps/x86_64/multiarch/memmove.c +++ b/sysdeps/x86_64/multiarch/memmove.c @@ -1,6 +1,6 @@ -/* Multiple versions of memmove. +/* Multiple versions of memmmove. All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2010-2016 Free Software Foundation, Inc. + Copyright (C) 2016-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -17,57 +17,21 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ +/* Define multiple versions only for the definition in libc. */ #if IS_IN (libc) -# define MEMMOVE __memmove_sse2 -# ifdef SHARED -# undef libc_hidden_builtin_def -# define libc_hidden_builtin_def(name) \ - __hidden_ver1 (__memmove_sse2, __GI_memmove, __memmove_sse2); -# endif - -/* Redefine memmove so that the compiler won't complain about the type - mismatch with the IFUNC selector in strong_alias, below. */ -# undef memmove # define memmove __redirect_memmove # include <string.h> # undef memmove -extern __typeof (__redirect_memmove) __memmove_sse2 attribute_hidden; -extern __typeof (__redirect_memmove) __memmove_ssse3 attribute_hidden; -extern __typeof (__redirect_memmove) __memmove_ssse3_back attribute_hidden; -extern __typeof (__redirect_memmove) __memmove_avx_unaligned attribute_hidden; -# ifdef HAVE_AVX512_ASM_SUPPORT - extern __typeof (__redirect_memmove) __memmove_avx512_no_vzeroupper attribute_hidden; -# endif - -#endif +# define SYMBOL_NAME memmove +# include "ifunc-memmove.h" -#include "string/memmove.c" +libc_ifunc_redirected (__redirect_memmove, __libc_memmove, + IFUNC_SELECTOR ()); -#if IS_IN (libc) -# include <shlib-compat.h> -# include "init-arch.h" - -/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle - ifunc symbol properly. */ -extern __typeof (__redirect_memmove) __libc_memmove; -libc_ifunc (__libc_memmove, -#ifdef HAVE_AVX512_ASM_SUPPORT - HAS_ARCH_FEATURE (AVX512F_Usable) - && HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) - ? __memmove_avx512_no_vzeroupper - : -#endif - (HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) - ? __memmove_avx_unaligned - : (HAS_CPU_FEATURE (SSSE3) - ? (HAS_ARCH_FEATURE (Fast_Copy_Backward) - ? __memmove_ssse3_back : __memmove_ssse3) - : __memmove_sse2))); - -strong_alias (__libc_memmove, memmove) - -# if SHLIB_COMPAT (libc, GLIBC_2_2_5, GLIBC_2_14) -compat_symbol (libc, memmove, memcpy, GLIBC_2_2_5); +strong_alias (__libc_memmove, memmove); +# ifdef SHARED +__hidden_ver1 (__libc_memmove, __GI_memmove, __redirect_memmove) + __attribute__ ((visibility ("hidden"))); # endif #endif diff --git a/sysdeps/x86_64/multiarch/memmove_chk-nonshared.S b/sysdeps/x86_64/multiarch/memmove_chk-nonshared.S new file mode 100644 index 0000000000..c362a3324d --- /dev/null +++ b/sysdeps/x86_64/multiarch/memmove_chk-nonshared.S @@ -0,0 +1,21 @@ +/* Non-shared version of memmove_chk for x86-64. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) && !defined SHARED +# include <sysdeps/x86_64/memmove_chk.S> +#endif diff --git a/sysdeps/x86_64/multiarch/memmove_chk.c b/sysdeps/x86_64/multiarch/memmove_chk.c index f64da63180..0e9dc7e07f 100644 --- a/sysdeps/x86_64/multiarch/memmove_chk.c +++ b/sysdeps/x86_64/multiarch/memmove_chk.c @@ -1,6 +1,6 @@ -/* Multiple versions of __memmove_chk. +/* Multiple versions of __memmove_chk All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2010-2016 Free Software Foundation, Inc. + Copyright (C) 2017-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -17,30 +17,15 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#include <string.h> -#include "init-arch.h" +/* Define multiple versions only for the definition in libc.so. */ +#if IS_IN (libc) && defined SHARED +# define __memmove_chk __redirect_memmove_chk +# include <string.h> +# undef __memmove_chk -#define MEMMOVE_CHK __memmove_chk_sse2 +# define SYMBOL_NAME memmove_chk +# include "ifunc-memmove.h" -extern __typeof (__memmove_chk) __memmove_chk_sse2 attribute_hidden; -extern __typeof (__memmove_chk) __memmove_chk_ssse3 attribute_hidden; -extern __typeof (__memmove_chk) __memmove_chk_ssse3_back attribute_hidden; -extern __typeof (__memmove_chk) __memmove_chk_avx_unaligned attribute_hidden; -# ifdef HAVE_AVX512_ASM_SUPPORT - extern __typeof (__memmove_chk) __memmove_chk_avx512_no_vzeroupper attribute_hidden; -# endif - -#include "debug/memmove_chk.c" - -libc_ifunc (__memmove_chk, -#ifdef HAVE_AVX512_ASM_SUPPORT - HAS_ARCH_FEATURE (AVX512F_Usable) - && HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) - ? __memmove_chk_avx512_no_vzeroupper - : +libc_ifunc_redirected (__redirect_memmove_chk, __memmove_chk, + IFUNC_SELECTOR ()); #endif - HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) ? __memmove_chk_avx_unaligned : - (HAS_CPU_FEATURE (SSSE3) - ? (HAS_ARCH_FEATURE (Fast_Copy_Backward) - ? __memmove_chk_ssse3_back : __memmove_chk_ssse3) - : __memmove_chk_sse2)); diff --git a/sysdeps/x86_64/multiarch/mempcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/mempcpy-ssse3-back.S deleted file mode 100644 index 82ffacb8fb..0000000000 --- a/sysdeps/x86_64/multiarch/mempcpy-ssse3-back.S +++ /dev/null @@ -1,4 +0,0 @@ -#define USE_AS_MEMPCPY -#define MEMCPY __mempcpy_ssse3_back -#define MEMCPY_CHK __mempcpy_chk_ssse3_back -#include "memcpy-ssse3-back.S" diff --git a/sysdeps/x86_64/multiarch/mempcpy-ssse3.S b/sysdeps/x86_64/multiarch/mempcpy-ssse3.S deleted file mode 100644 index 822d98e954..0000000000 --- a/sysdeps/x86_64/multiarch/mempcpy-ssse3.S +++ /dev/null @@ -1,4 +0,0 @@ -#define USE_AS_MEMPCPY -#define MEMCPY __mempcpy_ssse3 -#define MEMCPY_CHK __mempcpy_chk_ssse3 -#include "memcpy-ssse3.S" diff --git a/sysdeps/x86_64/multiarch/mempcpy.S b/sysdeps/x86_64/multiarch/mempcpy.S deleted file mode 100644 index ed78623565..0000000000 --- a/sysdeps/x86_64/multiarch/mempcpy.S +++ /dev/null @@ -1,86 +0,0 @@ -/* Multiple versions of mempcpy - All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2010-2016 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - -/* Define multiple versions only for the definition in lib and for - DSO. In static binaries we need mempcpy before the initialization - happened. */ -#if defined SHARED && IS_IN (libc) -ENTRY(__mempcpy) - .type __mempcpy, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX -#ifdef HAVE_AVX512_ASM_SUPPORT - HAS_ARCH_FEATURE (AVX512F_Usable) - jz 1f - HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) - jz 1f - leaq __mempcpy_avx512_no_vzeroupper(%rip), %rax - ret -#endif -1: leaq __mempcpy_sse2(%rip), %rax - HAS_CPU_FEATURE (SSSE3) - jz 2f - leaq __mempcpy_ssse3(%rip), %rax - HAS_ARCH_FEATURE (Fast_Copy_Backward) - jz 2f - leaq __mempcpy_ssse3_back(%rip), %rax - HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) - jz 2f - leaq __mempcpy_avx_unaligned(%rip), %rax -2: ret -END(__mempcpy) - -# undef ENTRY -# define ENTRY(name) \ - .type __mempcpy_sse2, @function; \ - .p2align 4; \ - .globl __mempcpy_sse2; \ - .hidden __mempcpy_sse2; \ - __mempcpy_sse2: cfi_startproc; \ - CALL_MCOUNT -# undef END -# define END(name) \ - cfi_endproc; .size __mempcpy_sse2, .-__mempcpy_sse2 - -# undef ENTRY_CHK -# define ENTRY_CHK(name) \ - .type __mempcpy_chk_sse2, @function; \ - .globl __mempcpy_chk_sse2; \ - .p2align 4; \ - __mempcpy_chk_sse2: cfi_startproc; \ - CALL_MCOUNT -# undef END_CHK -# define END_CHK(name) \ - cfi_endproc; .size __mempcpy_chk_sse2, .-__mempcpy_chk_sse2 - -# undef libc_hidden_def -# undef libc_hidden_builtin_def -/* It doesn't make sense to send libc-internal mempcpy calls through a PLT. - The speedup we get from using SSSE3 instruction is likely eaten away - by the indirect call in the PLT. */ -# define libc_hidden_def(name) \ - .globl __GI_mempcpy; __GI_mempcpy = __mempcpy_sse2 -# define libc_hidden_builtin_def(name) \ - .globl __GI___mempcpy; __GI___mempcpy = __mempcpy_sse2 -#endif - -#include "../mempcpy.S" diff --git a/sysdeps/x86_64/multiarch/mempcpy.c b/sysdeps/x86_64/multiarch/mempcpy.c new file mode 100644 index 0000000000..9fe41dda82 --- /dev/null +++ b/sysdeps/x86_64/multiarch/mempcpy.c @@ -0,0 +1,42 @@ +/* Multiple versions of mempcpy. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) +# define mempcpy __redirect_mempcpy +# define __mempcpy __redirect___mempcpy +# define NO_MEMPCPY_STPCPY_REDIRECT +# define __NO_STRING_INLINES +# include <string.h> +# undef mempcpy +# undef __mempcpy + +# define SYMBOL_NAME mempcpy +# include "ifunc-memmove.h" + +libc_ifunc_redirected (__redirect_mempcpy, __mempcpy, IFUNC_SELECTOR ()); + +weak_alias (__mempcpy, mempcpy) +# ifdef SHARED +__hidden_ver1 (__mempcpy, __GI___mempcpy, __redirect___mempcpy) + __attribute__ ((visibility ("hidden"))); +__hidden_ver1 (mempcpy, __GI_mempcpy, __redirect_mempcpy) + __attribute__ ((visibility ("hidden"))); +# endif +#endif diff --git a/sysdeps/x86_64/multiarch/mempcpy_chk-nonshared.S b/sysdeps/x86_64/multiarch/mempcpy_chk-nonshared.S new file mode 100644 index 0000000000..7133246a1d --- /dev/null +++ b/sysdeps/x86_64/multiarch/mempcpy_chk-nonshared.S @@ -0,0 +1,21 @@ +/* Non-shared version of mempcpy_chk for x86-64. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) && !defined SHARED +# include <sysdeps/x86_64/mempcpy_chk.S> +#endif diff --git a/sysdeps/x86_64/multiarch/mempcpy_chk.S b/sysdeps/x86_64/multiarch/mempcpy_chk.S deleted file mode 100644 index 6e8a89d38c..0000000000 --- a/sysdeps/x86_64/multiarch/mempcpy_chk.S +++ /dev/null @@ -1,56 +0,0 @@ -/* Multiple versions of __mempcpy_chk - All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2010-2016 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - -/* Define multiple versions only for the definition in lib and for - DSO. There are no multiarch mempcpy functions for static binaries. - */ -#if IS_IN (libc) -# ifdef SHARED - .text -ENTRY(__mempcpy_chk) - .type __mempcpy_chk, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX -#ifdef HAVE_AVX512_ASM_SUPPORT - HAS_ARCH_FEATURE (AVX512F_Usable) - jz 1f - HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) - jz 1f - leaq __mempcpy_chk_avx512_no_vzeroupper(%rip), %rax - ret -#endif -1: leaq __mempcpy_chk_sse2(%rip), %rax - HAS_CPU_FEATURE (SSSE3) - jz 2f - leaq __mempcpy_chk_ssse3(%rip), %rax - HAS_ARCH_FEATURE (Fast_Copy_Backward) - jz 2f - leaq __mempcpy_chk_ssse3_back(%rip), %rax - HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) - jz 2f - leaq __mempcpy_chk_avx_unaligned(%rip), %rax -2: ret -END(__mempcpy_chk) -# else -# include "../mempcpy_chk.S" -# endif -#endif diff --git a/sysdeps/x86_64/multiarch/mempcpy_chk.c b/sysdeps/x86_64/multiarch/mempcpy_chk.c new file mode 100644 index 0000000000..956918b3a1 --- /dev/null +++ b/sysdeps/x86_64/multiarch/mempcpy_chk.c @@ -0,0 +1,31 @@ +/* Multiple versions of __mempcpy_chk + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc.so. */ +#if IS_IN (libc) && defined SHARED +# define __mempcpy_chk __redirect_mempcpy_chk +# include <string.h> +# undef __mempcpy_chk + +# define SYMBOL_NAME mempcpy_chk +# include "ifunc-memmove.h" + +libc_ifunc_redirected (__redirect_mempcpy_chk, __mempcpy_chk, + IFUNC_SELECTOR ()); +#endif diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S new file mode 100644 index 0000000000..b41a58bcba --- /dev/null +++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S @@ -0,0 +1,359 @@ +/* memrchr optimized with AVX2. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# ifndef VZEROUPPER +# define VZEROUPPER vzeroupper +# endif + +# define VEC_SIZE 32 + + .section .text.avx,"ax",@progbits +ENTRY (__memrchr_avx2) + /* Broadcast CHAR to YMM0. */ + vmovd %esi, %xmm0 + vpbroadcastb %xmm0, %ymm0 + + subq $VEC_SIZE, %rdx + jbe L(last_vec_or_less) + + addq %rdx, %rdi + + /* Check the last VEC_SIZE bytes. */ + vpcmpeqb (%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(last_vec_x0) + + subq $(VEC_SIZE * 4), %rdi + movl %edi, %ecx + andl $(VEC_SIZE - 1), %ecx + jz L(aligned_more) + + /* Align data for aligned loads in the loop. */ + addq $VEC_SIZE, %rdi + addq $VEC_SIZE, %rdx + andq $-VEC_SIZE, %rdi + subq %rcx, %rdx + + .p2align 4 +L(aligned_more): + subq $(VEC_SIZE * 4), %rdx + jbe L(last_4x_vec_or_less) + + /* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time + since data is only aligned to VEC_SIZE. */ + vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(last_vec_x3) + + vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2 + vpmovmskb %ymm2, %eax + testl %eax, %eax + jnz L(last_vec_x2) + + vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3 + vpmovmskb %ymm3, %eax + testl %eax, %eax + jnz L(last_vec_x1) + + vpcmpeqb (%rdi), %ymm0, %ymm4 + vpmovmskb %ymm4, %eax + testl %eax, %eax + jnz L(last_vec_x0) + + /* Align data to 4 * VEC_SIZE for loop with fewer branches. + There are some overlaps with above if data isn't aligned + to 4 * VEC_SIZE. */ + movl %edi, %ecx + andl $(VEC_SIZE * 4 - 1), %ecx + jz L(loop_4x_vec) + + addq $(VEC_SIZE * 4), %rdi + addq $(VEC_SIZE * 4), %rdx + andq $-(VEC_SIZE * 4), %rdi + subq %rcx, %rdx + + .p2align 4 +L(loop_4x_vec): + /* Compare 4 * VEC at a time forward. */ + subq $(VEC_SIZE * 4), %rdi + subq $(VEC_SIZE * 4), %rdx + jbe L(last_4x_vec_or_less) + + vmovdqa (%rdi), %ymm1 + vmovdqa VEC_SIZE(%rdi), %ymm2 + vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3 + vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4 + + vpcmpeqb %ymm1, %ymm0, %ymm1 + vpcmpeqb %ymm2, %ymm0, %ymm2 + vpcmpeqb %ymm3, %ymm0, %ymm3 + vpcmpeqb %ymm4, %ymm0, %ymm4 + + vpor %ymm1, %ymm2, %ymm5 + vpor %ymm3, %ymm4, %ymm6 + vpor %ymm5, %ymm6, %ymm5 + + vpmovmskb %ymm5, %eax + testl %eax, %eax + jz L(loop_4x_vec) + + /* There is a match. */ + vpmovmskb %ymm4, %eax + testl %eax, %eax + jnz L(last_vec_x3) + + vpmovmskb %ymm3, %eax + testl %eax, %eax + jnz L(last_vec_x2) + + vpmovmskb %ymm2, %eax + testl %eax, %eax + jnz L(last_vec_x1) + + vpmovmskb %ymm1, %eax + bsrl %eax, %eax + addq %rdi, %rax + VZEROUPPER + ret + + .p2align 4 +L(last_4x_vec_or_less): + addl $(VEC_SIZE * 4), %edx + cmpl $(VEC_SIZE * 2), %edx + jbe L(last_2x_vec) + + vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(last_vec_x3) + + vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2 + vpmovmskb %ymm2, %eax + testl %eax, %eax + jnz L(last_vec_x2) + + vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3 + vpmovmskb %ymm3, %eax + testl %eax, %eax + jnz L(last_vec_x1_check) + cmpl $(VEC_SIZE * 3), %edx + jbe L(zero) + + vpcmpeqb (%rdi), %ymm0, %ymm4 + vpmovmskb %ymm4, %eax + testl %eax, %eax + jz L(zero) + bsrl %eax, %eax + subq $(VEC_SIZE * 4), %rdx + addq %rax, %rdx + jl L(zero) + addq %rdi, %rax + VZEROUPPER + ret + + .p2align 4 +L(last_2x_vec): + vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(last_vec_x3_check) + cmpl $VEC_SIZE, %edx + jbe L(zero) + + vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jz L(zero) + bsrl %eax, %eax + subq $(VEC_SIZE * 2), %rdx + addq %rax, %rdx + jl L(zero) + addl $(VEC_SIZE * 2), %eax + addq %rdi, %rax + VZEROUPPER + ret + + .p2align 4 +L(last_vec_x0): + bsrl %eax, %eax + addq %rdi, %rax + VZEROUPPER + ret + + .p2align 4 +L(last_vec_x1): + bsrl %eax, %eax + addl $VEC_SIZE, %eax + addq %rdi, %rax + VZEROUPPER + ret + + .p2align 4 +L(last_vec_x2): + bsrl %eax, %eax + addl $(VEC_SIZE * 2), %eax + addq %rdi, %rax + VZEROUPPER + ret + + .p2align 4 +L(last_vec_x3): + bsrl %eax, %eax + addl $(VEC_SIZE * 3), %eax + addq %rdi, %rax + ret + + .p2align 4 +L(last_vec_x1_check): + bsrl %eax, %eax + subq $(VEC_SIZE * 3), %rdx + addq %rax, %rdx + jl L(zero) + addl $VEC_SIZE, %eax + addq %rdi, %rax + VZEROUPPER + ret + + .p2align 4 +L(last_vec_x3_check): + bsrl %eax, %eax + subq $VEC_SIZE, %rdx + addq %rax, %rdx + jl L(zero) + addl $(VEC_SIZE * 3), %eax + addq %rdi, %rax + VZEROUPPER + ret + + .p2align 4 +L(zero): + VZEROUPPER +L(null): + xorl %eax, %eax + ret + + .p2align 4 +L(last_vec_or_less_aligned): + movl %edx, %ecx + + vpcmpeqb (%rdi), %ymm0, %ymm1 + + movl $1, %edx + /* Support rdx << 32. */ + salq %cl, %rdx + subq $1, %rdx + + vpmovmskb %ymm1, %eax + + /* Remove the trailing bytes. */ + andl %edx, %eax + testl %eax, %eax + jz L(zero) + + bsrl %eax, %eax + addq %rdi, %rax + VZEROUPPER + ret + + .p2align 4 +L(last_vec_or_less): + addl $VEC_SIZE, %edx + + /* Check for zero length. */ + testl %edx, %edx + jz L(null) + + movl %edi, %ecx + andl $(VEC_SIZE - 1), %ecx + jz L(last_vec_or_less_aligned) + + movl %ecx, %esi + movl %ecx, %r8d + addl %edx, %esi + andq $-VEC_SIZE, %rdi + + subl $VEC_SIZE, %esi + ja L(last_vec_2x_aligned) + + /* Check the last VEC. */ + vpcmpeqb (%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + + /* Remove the leading and trailing bytes. */ + sarl %cl, %eax + movl %edx, %ecx + + movl $1, %edx + sall %cl, %edx + subl $1, %edx + + andl %edx, %eax + testl %eax, %eax + jz L(zero) + + bsrl %eax, %eax + addq %rdi, %rax + addq %r8, %rax + VZEROUPPER + ret + + .p2align 4 +L(last_vec_2x_aligned): + movl %esi, %ecx + + /* Check the last VEC. */ + vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm1 + + movl $1, %edx + sall %cl, %edx + subl $1, %edx + + vpmovmskb %ymm1, %eax + + /* Remove the trailing bytes. */ + andl %edx, %eax + + testl %eax, %eax + jnz L(last_vec_x1) + + /* Check the second last VEC. */ + vpcmpeqb (%rdi), %ymm0, %ymm1 + + movl %r8d, %ecx + + vpmovmskb %ymm1, %eax + + /* Remove the leading bytes. Must use unsigned right shift for + bsrl below. */ + shrl %cl, %eax + testl %eax, %eax + jz L(zero) + + bsrl %eax, %eax + addq %rdi, %rax + addq %r8, %rax + VZEROUPPER + ret +END (__memrchr_avx2) +#endif diff --git a/sysdeps/x86_64/memmove.c b/sysdeps/x86_64/multiarch/memrchr-sse2.S index 07f81852d6..12281663ec 100644 --- a/sysdeps/x86_64/memmove.c +++ b/sysdeps/x86_64/multiarch/memrchr-sse2.S @@ -1,4 +1,5 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. +/* memrchr optimized with SSE2. + Copyright (C) 2017-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -15,12 +16,11 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#include "string/memmove.c" +#if IS_IN (libc) +# define __memrchr __memrchr_sse2 -#if !defined memmove && IS_IN (libc) -#include <shlib-compat.h> - -#if SHLIB_COMPAT (libc, GLIBC_2_2_5, GLIBC_2_14) -compat_symbol (libc, memmove, memcpy, GLIBC_2_2_5); -#endif +# undef weak_alias +# define weak_alias(__memrchr, memrchr) #endif + +#include "../memrchr.S" diff --git a/sysdeps/x86_64/multiarch/memrchr.c b/sysdeps/x86_64/multiarch/memrchr.c new file mode 100644 index 0000000000..d227fe7819 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memrchr.c @@ -0,0 +1,31 @@ +/* Multiple versions of memrchr + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) +# define memrchr __redirect_memrchr +# include <string.h> +# undef memrchr + +# define SYMBOL_NAME memrchr +# include "ifunc-avx2.h" + +libc_ifunc_redirected (__redirect_memrchr, __memrchr, IFUNC_SELECTOR ()); +weak_alias (__memrchr, memrchr) +#endif diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S new file mode 100644 index 0000000000..7ab3d89849 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S @@ -0,0 +1,22 @@ +#if IS_IN (libc) +# define VEC_SIZE 32 +# define VEC(i) ymm##i +# define VMOVU vmovdqu +# define VMOVA vmovdqa + +# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ + vmovd d, %xmm0; \ + movq r, %rax; \ + vpbroadcastb %xmm0, %ymm0 + +# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ + vmovd d, %xmm0; \ + movq r, %rax; \ + vpbroadcastd %xmm0, %ymm0 + +# define SECTION(p) p##.avx +# define MEMSET_SYMBOL(p,s) p##_avx2_##s +# define WMEMSET_SYMBOL(p,s) p##_avx2_##s + +# include "memset-vec-unaligned-erms.S" +#endif diff --git a/sysdeps/x86_64/multiarch/memset-avx2.S b/sysdeps/x86_64/multiarch/memset-avx2.S deleted file mode 100644 index df634728d4..0000000000 --- a/sysdeps/x86_64/multiarch/memset-avx2.S +++ /dev/null @@ -1,168 +0,0 @@ -/* memset with AVX2 - Copyright (C) 2014-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> - -#if IS_IN (libc) - -#include "asm-syntax.h" -#ifndef MEMSET -# define MEMSET __memset_avx2 -# define MEMSET_CHK __memset_chk_avx2 -#endif - - .section .text.avx2,"ax",@progbits -#if defined PIC -ENTRY (MEMSET_CHK) - cmpq %rdx, %rcx - jb HIDDEN_JUMPTARGET (__chk_fail) -END (MEMSET_CHK) -#endif - -ENTRY (MEMSET) - vpxor %xmm0, %xmm0, %xmm0 - vmovd %esi, %xmm1 - lea (%rdi, %rdx), %rsi - mov %rdi, %rax - vpshufb %xmm0, %xmm1, %xmm0 - cmp $16, %rdx - jb L(less_16bytes) - cmp $256, %rdx - jae L(256bytesormore) - cmp $128, %dl - jb L(less_128bytes) - vmovdqu %xmm0, (%rdi) - vmovdqu %xmm0, 0x10(%rdi) - vmovdqu %xmm0, 0x20(%rdi) - vmovdqu %xmm0, 0x30(%rdi) - vmovdqu %xmm0, 0x40(%rdi) - vmovdqu %xmm0, 0x50(%rdi) - vmovdqu %xmm0, 0x60(%rdi) - vmovdqu %xmm0, 0x70(%rdi) - vmovdqu %xmm0, -0x80(%rsi) - vmovdqu %xmm0, -0x70(%rsi) - vmovdqu %xmm0, -0x60(%rsi) - vmovdqu %xmm0, -0x50(%rsi) - vmovdqu %xmm0, -0x40(%rsi) - vmovdqu %xmm0, -0x30(%rsi) - vmovdqu %xmm0, -0x20(%rsi) - vmovdqu %xmm0, -0x10(%rsi) - ret - - .p2align 4 -L(less_128bytes): - cmp $64, %dl - jb L(less_64bytes) - vmovdqu %xmm0, (%rdi) - vmovdqu %xmm0, 0x10(%rdi) - vmovdqu %xmm0, 0x20(%rdi) - vmovdqu %xmm0, 0x30(%rdi) - vmovdqu %xmm0, -0x40(%rsi) - vmovdqu %xmm0, -0x30(%rsi) - vmovdqu %xmm0, -0x20(%rsi) - vmovdqu %xmm0, -0x10(%rsi) - ret - - .p2align 4 -L(less_64bytes): - cmp $32, %dl - jb L(less_32bytes) - vmovdqu %xmm0, (%rdi) - vmovdqu %xmm0, 0x10(%rdi) - vmovdqu %xmm0, -0x20(%rsi) - vmovdqu %xmm0, -0x10(%rsi) - ret - - .p2align 4 -L(less_32bytes): - vmovdqu %xmm0, (%rdi) - vmovdqu %xmm0, -0x10(%rsi) - ret - - .p2align 4 -L(less_16bytes): - cmp $8, %dl - jb L(less_8bytes) - vmovq %xmm0, (%rdi) - vmovq %xmm0, -0x08(%rsi) - ret - - .p2align 4 -L(less_8bytes): - vmovd %xmm0, %ecx - cmp $4, %dl - jb L(less_4bytes) - mov %ecx, (%rdi) - mov %ecx, -0x04(%rsi) - ret - - .p2align 4 -L(less_4bytes): - cmp $2, %dl - jb L(less_2bytes) - mov %cx, (%rdi) - mov %cx, -0x02(%rsi) - ret - - .p2align 4 -L(less_2bytes): - cmp $1, %dl - jb L(less_1bytes) - mov %cl, (%rdi) -L(less_1bytes): - ret - - .p2align 4 -L(256bytesormore): - vinserti128 $1, %xmm0, %ymm0, %ymm0 - and $-0x20, %rdi - add $0x20, %rdi - vmovdqu %ymm0, (%rax) - sub %rdi, %rax - lea -0x80(%rax, %rdx), %rcx - cmp $4096, %rcx - ja L(gobble_data) -L(gobble_128_loop): - vmovdqa %ymm0, (%rdi) - vmovdqa %ymm0, 0x20(%rdi) - vmovdqa %ymm0, 0x40(%rdi) - vmovdqa %ymm0, 0x60(%rdi) - sub $-0x80, %rdi - add $-0x80, %ecx - jb L(gobble_128_loop) - mov %rsi, %rax - vmovdqu %ymm0, -0x80(%rsi) - vmovdqu %ymm0, -0x60(%rsi) - vmovdqu %ymm0, -0x40(%rsi) - vmovdqu %ymm0, -0x20(%rsi) - sub %rdx, %rax - vzeroupper - ret - - .p2align 4 -L(gobble_data): - sub $-0x80, %rcx - vmovd %xmm0, %eax - rep stosb - mov %rsi, %rax - sub %rdx, %rax - vzeroupper - ret - -END (MEMSET) -#endif diff --git a/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S index 1e638d7ac2..689cc1199c 100644 --- a/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S +++ b/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S @@ -1,5 +1,5 @@ /* memset optimized with AVX512 for KNL hardware. - Copyright (C) 2015-2016 Free Software Foundation, Inc. + Copyright (C) 2015-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -18,7 +18,7 @@ #include <sysdep.h> -#if defined HAVE_AVX512_ASM_SUPPORT && IS_IN (libc) +#if IS_IN (libc) #include "asm-syntax.h" #ifndef MEMSET @@ -26,7 +26,7 @@ # define MEMSET_CHK __memset_chk_avx512_no_vzeroupper #endif - .section .text,"ax",@progbits + .section .text.avx512,"ax",@progbits #if defined PIC ENTRY (MEMSET_CHK) cmpq %rdx, %rcx diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S new file mode 100644 index 0000000000..0783979ca5 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S @@ -0,0 +1,24 @@ +#if IS_IN (libc) +# define VEC_SIZE 64 +# define VEC(i) zmm##i +# define VMOVU vmovdqu64 +# define VMOVA vmovdqa64 + +# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ + vmovd d, %xmm0; \ + movq r, %rax; \ + vpbroadcastb %xmm0, %xmm0; \ + vpbroadcastq %xmm0, %zmm0 + +# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ + vmovd d, %xmm0; \ + movq r, %rax; \ + vpbroadcastd %xmm0, %xmm0; \ + vpbroadcastq %xmm0, %zmm0 + +# define SECTION(p) p##.avx512 +# define MEMSET_SYMBOL(p,s) p##_avx512_##s +# define WMEMSET_SYMBOL(p,s) p##_avx512_##s + +# include "memset-vec-unaligned-erms.S" +#endif diff --git a/sysdeps/x86_64/multiarch/wmemcmp.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S index b510f756e2..be6671759b 100644 --- a/sysdeps/x86_64/multiarch/wmemcmp.S +++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S @@ -1,7 +1,6 @@ -/* Multiple versions of wmemcmp +/* memset with SSE2. All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2011-2016 Free Software Foundation, Inc. - Contributed by Intel Corporation. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -19,26 +18,24 @@ <http://www.gnu.org/licenses/>. */ #include <sysdep.h> +#include <shlib-compat.h> #include <init-arch.h> -/* Define multiple versions only for the definition in libc. */ #if IS_IN (libc) - .text -ENTRY(wmemcmp) - .type wmemcmp, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - HAS_CPU_FEATURE (SSSE3) - jnz 2f - leaq __wmemcmp_sse2(%rip), %rax - ret - -2: HAS_CPU_FEATURE (SSE4_1) - jz 3f - leaq __wmemcmp_sse4_1(%rip), %rax - ret - -3: leaq __wmemcmp_ssse3(%rip), %rax - ret - -END(wmemcmp) +# define MEMSET_SYMBOL(p,s) p##_sse2_##s +# define WMEMSET_SYMBOL(p,s) p##_sse2_##s + +# ifdef SHARED +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(name) +# endif + +# undef weak_alias +# define weak_alias(original, alias) \ + .weak bzero; bzero = __bzero + +# undef strong_alias +# define strong_alias(ignored1, ignored2) #endif + +#include <sysdeps/x86_64/memset.S> diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S new file mode 100644 index 0000000000..dc9cb88b37 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S @@ -0,0 +1,274 @@ +/* memset/bzero with unaligned store and rep stosb + Copyright (C) 2016-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* memset is implemented as: + 1. Use overlapping store to avoid branch. + 2. If size is less than VEC, use integer register stores. + 3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores. + 4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores. + 5. If size is more to 4 * VEC_SIZE, align to 4 * VEC_SIZE with + 4 VEC stores and store 4 * VEC at a time until done. */ + +#include <sysdep.h> + +#ifndef MEMSET_CHK_SYMBOL +# define MEMSET_CHK_SYMBOL(p,s) MEMSET_SYMBOL(p, s) +#endif + +#ifndef WMEMSET_CHK_SYMBOL +# define WMEMSET_CHK_SYMBOL(p,s) WMEMSET_SYMBOL(p, s) +#endif + +#ifndef VZEROUPPER +# if VEC_SIZE > 16 +# define VZEROUPPER vzeroupper +# else +# define VZEROUPPER +# endif +#endif + +#ifndef VZEROUPPER_SHORT_RETURN +# if VEC_SIZE > 16 +# define VZEROUPPER_SHORT_RETURN vzeroupper +# else +# define VZEROUPPER_SHORT_RETURN rep +# endif +#endif + +#ifndef MOVQ +# if VEC_SIZE > 16 +# define MOVQ vmovq +# else +# define MOVQ movq +# endif +#endif + +/* Threshold to use Enhanced REP STOSB. Since there is overhead to set + up REP STOSB operation, REP STOSB isn't faster on short data. The + memset micro benchmark in glibc shows that 2KB is the approximate + value above which REP STOSB becomes faster on processors with + Enhanced REP STOSB. Since the stored value is fixed, larger register + size has minimal impact on threshold. */ +#ifndef REP_STOSB_THRESHOLD +# define REP_STOSB_THRESHOLD 2048 +#endif + +#ifndef SECTION +# error SECTION is not defined! +#endif + + .section SECTION(.text),"ax",@progbits +#if VEC_SIZE == 16 && IS_IN (libc) +ENTRY (__bzero) + movq %rdi, %rax /* Set return value. */ + movq %rsi, %rdx /* Set n. */ + pxor %xmm0, %xmm0 + jmp L(entry_from_bzero) +END (__bzero) +weak_alias (__bzero, bzero) +#endif + +#if IS_IN (libc) +# if defined SHARED +ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)) +# endif + +ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned)) + shlq $2, %rdx + WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) + jmp L(entry_from_bzero) +END (WMEMSET_SYMBOL (__wmemset, unaligned)) +#endif + +#if defined SHARED && IS_IN (libc) +ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned)) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned)) +#endif + +ENTRY (MEMSET_SYMBOL (__memset, unaligned)) + MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) +L(entry_from_bzero): + cmpq $VEC_SIZE, %rdx + jb L(less_vec) + cmpq $(VEC_SIZE * 2), %rdx + ja L(more_2x_vec) + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ + VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) + VMOVU %VEC(0), (%rdi) + VZEROUPPER + ret +#if defined USE_MULTIARCH && IS_IN (libc) +END (MEMSET_SYMBOL (__memset, unaligned)) + +# if VEC_SIZE == 16 +ENTRY (__memset_chk_erms) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END (__memset_chk_erms) + +/* Only used to measure performance of REP STOSB. */ +ENTRY (__memset_erms) + /* Skip zero length. */ + testq %rdx, %rdx + jnz L(stosb) + movq %rdi, %rax + ret +# else +/* Provide a hidden symbol to debugger. */ + .hidden MEMSET_SYMBOL (__memset, erms) +ENTRY (MEMSET_SYMBOL (__memset, erms)) +# endif +L(stosb): + /* Issue vzeroupper before rep stosb. */ + VZEROUPPER + movq %rdx, %rcx + movzbl %sil, %eax + movq %rdi, %rdx + rep stosb + movq %rdx, %rax + ret +# if VEC_SIZE == 16 +END (__memset_erms) +# else +END (MEMSET_SYMBOL (__memset, erms)) +# endif + +# if defined SHARED && IS_IN (libc) +ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) +# endif + +ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms)) + MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) + cmpq $VEC_SIZE, %rdx + jb L(less_vec) + cmpq $(VEC_SIZE * 2), %rdx + ja L(stosb_more_2x_vec) + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ + VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) + VMOVU %VEC(0), (%rdi) + VZEROUPPER + ret + +L(stosb_more_2x_vec): + cmpq $REP_STOSB_THRESHOLD, %rdx + ja L(stosb) +#endif +L(more_2x_vec): + cmpq $(VEC_SIZE * 4), %rdx + ja L(loop_start) + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(0), VEC_SIZE(%rdi) + VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) + VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx) +L(return): + VZEROUPPER + ret + +L(loop_start): + leaq (VEC_SIZE * 4)(%rdi), %rcx + VMOVU %VEC(0), (%rdi) + andq $-(VEC_SIZE * 4), %rcx + VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) + VMOVU %VEC(0), VEC_SIZE(%rdi) + VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx) + VMOVU %VEC(0), (VEC_SIZE * 2)(%rdi) + VMOVU %VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx) + VMOVU %VEC(0), (VEC_SIZE * 3)(%rdi) + VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx) + addq %rdi, %rdx + andq $-(VEC_SIZE * 4), %rdx + cmpq %rdx, %rcx + je L(return) +L(loop): + VMOVA %VEC(0), (%rcx) + VMOVA %VEC(0), VEC_SIZE(%rcx) + VMOVA %VEC(0), (VEC_SIZE * 2)(%rcx) + VMOVA %VEC(0), (VEC_SIZE * 3)(%rcx) + addq $(VEC_SIZE * 4), %rcx + cmpq %rcx, %rdx + jne L(loop) + VZEROUPPER_SHORT_RETURN + ret +L(less_vec): + /* Less than 1 VEC. */ +# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 +# error Unsupported VEC_SIZE! +# endif +# if VEC_SIZE > 32 + cmpb $32, %dl + jae L(between_32_63) +# endif +# if VEC_SIZE > 16 + cmpb $16, %dl + jae L(between_16_31) +# endif + MOVQ %xmm0, %rcx + cmpb $8, %dl + jae L(between_8_15) + cmpb $4, %dl + jae L(between_4_7) + cmpb $1, %dl + ja L(between_2_3) + jb 1f + movb %cl, (%rdi) +1: + VZEROUPPER + ret +# if VEC_SIZE > 32 + /* From 32 to 63. No branch when size == 32. */ +L(between_32_63): + vmovdqu %ymm0, -32(%rdi,%rdx) + vmovdqu %ymm0, (%rdi) + VZEROUPPER + ret +# endif +# if VEC_SIZE > 16 + /* From 16 to 31. No branch when size == 16. */ +L(between_16_31): + vmovdqu %xmm0, -16(%rdi,%rdx) + vmovdqu %xmm0, (%rdi) + VZEROUPPER + ret +# endif + /* From 8 to 15. No branch when size == 8. */ +L(between_8_15): + movq %rcx, -8(%rdi,%rdx) + movq %rcx, (%rdi) + VZEROUPPER + ret +L(between_4_7): + /* From 4 to 7. No branch when size == 4. */ + movl %ecx, -4(%rdi,%rdx) + movl %ecx, (%rdi) + VZEROUPPER + ret +L(between_2_3): + /* From 2 to 3. No branch when size == 2. */ + movw %cx, -2(%rdi,%rdx) + movw %cx, (%rdi) + VZEROUPPER + ret +END (MEMSET_SYMBOL (__memset, unaligned_erms)) diff --git a/sysdeps/x86_64/multiarch/memset.S b/sysdeps/x86_64/multiarch/memset.S deleted file mode 100644 index 8e3b9b9764..0000000000 --- a/sysdeps/x86_64/multiarch/memset.S +++ /dev/null @@ -1,64 +0,0 @@ -/* Multiple versions of memset - All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2014-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <shlib-compat.h> -#include <init-arch.h> - -/* Define multiple versions only for the definition in lib. */ -#if IS_IN (libc) -ENTRY(memset) - .type memset, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq __memset_sse2(%rip), %rax - HAS_ARCH_FEATURE (AVX2_Usable) - jz 2f - leaq __memset_avx2(%rip), %rax -#ifdef HAVE_AVX512_ASM_SUPPORT - HAS_ARCH_FEATURE (AVX512F_Usable) - jz 2f - HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) - jz 2f - leaq __memset_avx512_no_vzeroupper(%rip), %rax -#endif -2: ret -END(memset) -#endif - -#if IS_IN (libc) -# undef memset -# define memset __memset_sse2 - -# undef __memset_chk -# define __memset_chk __memset_chk_sse2 - -# ifdef SHARED -# undef libc_hidden_builtin_def -/* It doesn't make sense to send libc-internal memset calls through a PLT. - The speedup we get from using GPR instruction is likely eaten away - by the indirect call in the PLT. */ -# define libc_hidden_builtin_def(name) \ - .globl __GI_memset; __GI_memset = __memset_sse2 -# endif - -# undef strong_alias -# define strong_alias(original, alias) -#endif - -#include "../memset.S" diff --git a/sysdeps/x86_64/multiarch/memset.c b/sysdeps/x86_64/multiarch/memset.c new file mode 100644 index 0000000000..064841d5fc --- /dev/null +++ b/sysdeps/x86_64/multiarch/memset.c @@ -0,0 +1,35 @@ +/* Multiple versions of memset. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) +# define memset __redirect_memset +# include <string.h> +# undef memset + +# define SYMBOL_NAME memset +# include "ifunc-memset.h" + +libc_ifunc_redirected (__redirect_memset, memset, IFUNC_SELECTOR ()); + +# ifdef SHARED +__hidden_ver1 (memset, __GI_memset, __redirect_memset) + __attribute__ ((visibility ("hidden"))); +# endif +#endif diff --git a/sysdeps/x86_64/multiarch/memset_chk-nonshared.S b/sysdeps/x86_64/multiarch/memset_chk-nonshared.S new file mode 100644 index 0000000000..dcc2384a27 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memset_chk-nonshared.S @@ -0,0 +1,21 @@ +/* Non-shared version of memcpy_chk for x86-64. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) && !defined SHARED +# include <sysdeps/x86_64/memset_chk.S> +#endif diff --git a/sysdeps/x86_64/multiarch/memset_chk.S b/sysdeps/x86_64/multiarch/memset_chk.S deleted file mode 100644 index 9a7b270274..0000000000 --- a/sysdeps/x86_64/multiarch/memset_chk.S +++ /dev/null @@ -1,49 +0,0 @@ -/* Multiple versions of memset_chk - All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2014-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - -/* Define multiple versions only for the definition in lib. */ -#if IS_IN (libc) -# ifdef SHARED -ENTRY(__memset_chk) - .type __memset_chk, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq __memset_chk_sse2(%rip), %rax - HAS_ARCH_FEATURE (AVX2_Usable) - jz 2f - leaq __memset_chk_avx2(%rip), %rax -#ifdef HAVE_AVX512_ASM_SUPPORT - HAS_ARCH_FEATURE (AVX512F_Usable) - jz 2f - HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) - jz 2f - leaq __memset_chk_avx512_no_vzeroupper(%rip), %rax -#endif -2: ret -END(__memset_chk) - -strong_alias (__memset_chk, __memset_zero_constant_len_parameter) - .section .gnu.warning.__memset_zero_constant_len_parameter - .string "memset used with constant zero length parameter; this could be due to transposed parameters" -# else -# include "../memset_chk.S" -# endif -#endif diff --git a/sysdeps/x86_64/multiarch/memset_chk.c b/sysdeps/x86_64/multiarch/memset_chk.c new file mode 100644 index 0000000000..f9c05b364e --- /dev/null +++ b/sysdeps/x86_64/multiarch/memset_chk.c @@ -0,0 +1,31 @@ +/* Multiple versions of __memset_chk + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc.so. */ +#if IS_IN (libc) && defined SHARED +# define __memset_chk __redirect_memset_chk +# include <string.h> +# undef __memset_chk + +# define SYMBOL_NAME memset_chk +# include "ifunc-memset.h" + +libc_ifunc_redirected (__redirect_memset_chk, __memset_chk, + IFUNC_SELECTOR ()); +#endif diff --git a/sysdeps/x86_64/multiarch/rawmemchr-avx2.S b/sysdeps/x86_64/multiarch/rawmemchr-avx2.S new file mode 100644 index 0000000000..128f9ea637 --- /dev/null +++ b/sysdeps/x86_64/multiarch/rawmemchr-avx2.S @@ -0,0 +1,4 @@ +#define MEMCHR __rawmemchr_avx2 +#define USE_AS_RAWMEMCHR 1 + +#include "memchr-avx2.S" diff --git a/sysdeps/x86_64/multiarch/rawmemchr-sse2.S b/sysdeps/x86_64/multiarch/rawmemchr-sse2.S new file mode 100644 index 0000000000..c681d84037 --- /dev/null +++ b/sysdeps/x86_64/multiarch/rawmemchr-sse2.S @@ -0,0 +1,29 @@ +/* rawmemchr optimized with SSE2. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) +# define __rawmemchr __rawmemchr_sse2 + +# undef weak_alias +# define weak_alias(__rawmemchr, rawmemchr) +# undef libc_hidden_def +# define libc_hidden_def(__rawmemchr) +#endif + +#include "../rawmemchr.S" diff --git a/sysdeps/x86_64/multiarch/rawmemchr.c b/sysdeps/x86_64/multiarch/rawmemchr.c new file mode 100644 index 0000000000..8a0bc3137e --- /dev/null +++ b/sysdeps/x86_64/multiarch/rawmemchr.c @@ -0,0 +1,38 @@ +/* Multiple versions of rawmemchr + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) +# define rawmemchr __redirect_rawmemchr +# define __rawmemchr __redirect___rawmemchr +# include <string.h> +# undef rawmemchr +# undef __rawmemchr + +# define SYMBOL_NAME rawmemchr +# include "ifunc-avx2.h" + +libc_ifunc_redirected (__redirect_rawmemchr, __rawmemchr, + IFUNC_SELECTOR ()); +weak_alias (__rawmemchr, rawmemchr) +# ifdef SHARED +__hidden_ver1 (__rawmemchr, __GI___rawmemchr, __redirect___rawmemchr) + __attribute__((visibility ("hidden"))); +# endif +#endif diff --git a/sysdeps/x86_64/multiarch/sched_cpucount.c b/sysdeps/x86_64/multiarch/sched_cpucount.c index b75aeb79b2..d10d74ae21 100644 --- a/sysdeps/x86_64/multiarch/sched_cpucount.c +++ b/sysdeps/x86_64/multiarch/sched_cpucount.c @@ -1,6 +1,6 @@ /* Count bits in CPU set. x86-64 multi-arch version. This file is part of the GNU C Library. - Copyright (C) 2008-2016 Free Software Foundation, Inc. + Copyright (C) 2008-2018 Free Software Foundation, Inc. Contributed by Ulrich Drepper <drepper@redhat.com>. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/multiarch/stpcpy-sse2.S b/sysdeps/x86_64/multiarch/stpcpy-sse2.S new file mode 100644 index 0000000000..b91a988399 --- /dev/null +++ b/sysdeps/x86_64/multiarch/stpcpy-sse2.S @@ -0,0 +1,33 @@ +/* stpcpy optimized with SSE2. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> +# define __stpcpy __stpcpy_sse2 + +# undef weak_alias +# define weak_alias(ignored1, ignored2) +# undef libc_hidden_def +# define libc_hidden_def(__stpcpy) +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(stpcpy) +#endif + +#define USE_AS_STPCPY +#include <sysdeps/x86_64/stpcpy.S> diff --git a/sysdeps/x86_64/multiarch/stpcpy.S b/sysdeps/x86_64/multiarch/stpcpy.S deleted file mode 100644 index ee81ab6ae3..0000000000 --- a/sysdeps/x86_64/multiarch/stpcpy.S +++ /dev/null @@ -1,9 +0,0 @@ -/* Multiple versions of stpcpy - All versions must be listed in ifunc-impl-list.c. */ -#define USE_AS_STPCPY -#define STRCPY __stpcpy -#include "strcpy.S" - -weak_alias (__stpcpy, stpcpy) -libc_hidden_def (__stpcpy) -libc_hidden_builtin_def (stpcpy) diff --git a/sysdeps/x86_64/multiarch/stpcpy.c b/sysdeps/x86_64/multiarch/stpcpy.c new file mode 100644 index 0000000000..1e340fca99 --- /dev/null +++ b/sysdeps/x86_64/multiarch/stpcpy.c @@ -0,0 +1,42 @@ +/* Multiple versions of stpcpy. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) +# define stpcpy __redirect_stpcpy +# define __stpcpy __redirect___stpcpy +# define NO_MEMPCPY_STPCPY_REDIRECT +# define __NO_STRING_INLINES +# include <string.h> +# undef stpcpy +# undef __stpcpy + +# define SYMBOL_NAME stpcpy +# include "ifunc-unaligned-ssse3.h" + +libc_ifunc_redirected (__redirect_stpcpy, __stpcpy, IFUNC_SELECTOR ()); + +weak_alias (__stpcpy, stpcpy) +# ifdef SHARED +__hidden_ver1 (__stpcpy, __GI___stpcpy, __redirect___stpcpy) + __attribute__ ((visibility ("hidden"))); +__hidden_ver1 (stpcpy, __GI_stpcpy, __redirect_stpcpy) + __attribute__ ((visibility ("hidden"))); +# endif +#endif diff --git a/sysdeps/x86_64/multiarch/stpncpy-c.c b/sysdeps/x86_64/multiarch/stpncpy-c.c index 2fde77dcab..b016e487e1 100644 --- a/sysdeps/x86_64/multiarch/stpncpy-c.c +++ b/sysdeps/x86_64/multiarch/stpncpy-c.c @@ -1,8 +1,7 @@ #define STPNCPY __stpncpy_sse2 -#ifdef SHARED +#undef weak_alias +#define weak_alias(ignored1, ignored2) #undef libc_hidden_def -#define libc_hidden_def(name) \ - __hidden_ver1 (__stpncpy_sse2, __GI___stpncpy, __stpncpy_sse2); -#endif +#define libc_hidden_def(stpncpy) -#include "stpncpy.c" +#include <string/stpncpy.c> diff --git a/sysdeps/x86_64/multiarch/stpncpy.S b/sysdeps/x86_64/multiarch/stpncpy.S deleted file mode 100644 index 2698ca6a8c..0000000000 --- a/sysdeps/x86_64/multiarch/stpncpy.S +++ /dev/null @@ -1,8 +0,0 @@ -/* Multiple versions of stpncpy - All versions must be listed in ifunc-impl-list.c. */ -#define STRCPY __stpncpy -#define USE_AS_STPCPY -#define USE_AS_STRNCPY -#include "strcpy.S" - -weak_alias (__stpncpy, stpncpy) diff --git a/sysdeps/x86_64/multiarch/stpncpy.c b/sysdeps/x86_64/multiarch/stpncpy.c new file mode 100644 index 0000000000..28842ece2b --- /dev/null +++ b/sysdeps/x86_64/multiarch/stpncpy.c @@ -0,0 +1,38 @@ +/* Multiple versions of stpncpy. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) +# define stpncpy __redirect_stpncpy +# define __stpncpy __redirect___stpncpy +# include <string.h> +# undef stpncpy +# undef __stpncpy + +# define SYMBOL_NAME stpncpy +# include "ifunc-unaligned-ssse3.h" + +libc_ifunc_redirected (__redirect_stpncpy, __stpncpy, IFUNC_SELECTOR ()); + +weak_alias (__stpncpy, stpncpy) +# ifdef SHARED +__hidden_ver1 (__stpncpy, __GI___stpncpy, __redirect___stpncpy) + __attribute__ ((visibility ("hidden"))); +# endif +#endif diff --git a/sysdeps/x86_64/multiarch/strcasecmp.c b/sysdeps/x86_64/multiarch/strcasecmp.c new file mode 100644 index 0000000000..8676a621c6 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcasecmp.c @@ -0,0 +1,39 @@ +/* Multiple versions of strcasecmp. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) +# define strcasecmp __redirect_strcasecmp +# define __strcasecmp __redirect___strcasecmp +# include <string.h> +# undef strcasecmp +# undef __strcasecmp + +# define SYMBOL_NAME strcasecmp +# include "ifunc-strcasecmp.h" + +libc_ifunc_redirected (__redirect_strcasecmp, __strcasecmp, + IFUNC_SELECTOR ()); + +weak_alias (__strcasecmp, strcasecmp) +# ifdef SHARED +__hidden_ver1 (__strcasecmp, __GI___strcasecmp, __redirect___strcasecmp) + __attribute__ ((visibility ("hidden"))); +# endif +#endif diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx.S new file mode 100644 index 0000000000..56a03547eb --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx.S @@ -0,0 +1,22 @@ +/* strcasecmp_l optimized with AVX. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define STRCMP_SSE42 __strcasecmp_l_avx +#define USE_AVX 1 +#define USE_AS_STRCASECMP_L +#include "strcmp-sse42.S" diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-sse2.S b/sysdeps/x86_64/multiarch/strcasecmp_l-sse2.S new file mode 100644 index 0000000000..2984640405 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcasecmp_l-sse2.S @@ -0,0 +1,23 @@ +/* strcasecmp_l optimized with SSE2. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define STRCMP __strcasecmp_l_sse2 +#define USE_AS_STRCASECMP_L +#define NO_NOLOCALE_ALIAS +#define __strcasecmp __strcasecmp_sse2 +#include <sysdeps/x86_64/strcmp.S> diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-sse4_2.S b/sysdeps/x86_64/multiarch/strcasecmp_l-sse4_2.S new file mode 100644 index 0000000000..31e2f9075d --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcasecmp_l-sse4_2.S @@ -0,0 +1,21 @@ +/* strcasecmp_l optimized with SSE4.2. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define STRCMP_SSE42 __strcasecmp_l_sse42 +#define USE_AS_STRCASECMP_L +#include "strcmp-sse42.S" diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l.S b/sysdeps/x86_64/multiarch/strcasecmp_l.S deleted file mode 100644 index 49f5b9fd95..0000000000 --- a/sysdeps/x86_64/multiarch/strcasecmp_l.S +++ /dev/null @@ -1,8 +0,0 @@ -/* Multiple versions of strcasecmp and strcasecmp_l - All versions must be listed in ifunc-impl-list.c. */ -#define STRCMP __strcasecmp_l -#define USE_AS_STRCASECMP_L -#include "strcmp.S" - -weak_alias (__strcasecmp_l, strcasecmp_l) -libc_hidden_def (strcasecmp_l) diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l.c b/sysdeps/x86_64/multiarch/strcasecmp_l.c new file mode 100644 index 0000000000..dc674510df --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcasecmp_l.c @@ -0,0 +1,40 @@ +/* Multiple versions of strcasecmp_l. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) +# define strcasecmp_l __redirect_strcasecmp_l +# define __strcasecmp_l __redirect___strcasecmp_l +# include <string.h> +# undef strcasecmp_l +# undef __strcasecmp_l + +# define SYMBOL_NAME strcasecmp_l +# include "ifunc-strcasecmp.h" + +libc_ifunc_redirected (__redirect_strcasecmp_l, __strcasecmp_l, + IFUNC_SELECTOR ()); + +weak_alias (__strcasecmp_l, strcasecmp_l) +# ifdef SHARED +__hidden_ver1 (__strcasecmp_l, __GI___strcasecmp_l, + __redirect___strcasecmp_l) + __attribute__ ((visibility ("hidden"))); +# endif +#endif diff --git a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S index 3a694d45c2..852f179bf4 100644 --- a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S +++ b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S @@ -1,5 +1,5 @@ /* strcat with SSE2 - Copyright (C) 2011-2016 Free Software Foundation, Inc. + Copyright (C) 2011-2018 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. diff --git a/sysdeps/x86_64/multiarch/strcat-sse2.S b/sysdeps/x86_64/multiarch/strcat-sse2.S new file mode 100644 index 0000000000..8eb64e104c --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcat-sse2.S @@ -0,0 +1,28 @@ +/* strcat optimized with SSE2. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> +# define strcat __strcat_sse2 + +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(strcat) +#endif + +#include <sysdeps/x86_64/strcat.S> diff --git a/sysdeps/x86_64/multiarch/strcat-ssse3.S b/sysdeps/x86_64/multiarch/strcat-ssse3.S index 96184d0f0f..2d4fd78f99 100644 --- a/sysdeps/x86_64/multiarch/strcat-ssse3.S +++ b/sysdeps/x86_64/multiarch/strcat-ssse3.S @@ -1,5 +1,5 @@ /* strcat with SSSE3 - Copyright (C) 2011-2016 Free Software Foundation, Inc. + Copyright (C) 2011-2018 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. diff --git a/sysdeps/x86_64/multiarch/strcat.S b/sysdeps/x86_64/multiarch/strcat.S deleted file mode 100644 index 7bb38e68ad..0000000000 --- a/sysdeps/x86_64/multiarch/strcat.S +++ /dev/null @@ -1,85 +0,0 @@ -/* Multiple versions of strcat - All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2009-2016 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - -#ifndef USE_AS_STRNCAT -# ifndef STRCAT -# define STRCAT strcat -# endif -#endif - -#ifdef USE_AS_STRNCAT -# define STRCAT_SSSE3 __strncat_ssse3 -# define STRCAT_SSE2 __strncat_sse2 -# define STRCAT_SSE2_UNALIGNED __strncat_sse2_unaligned -# define __GI_STRCAT __GI_strncat -# define __GI___STRCAT __GI___strncat -#else -# define STRCAT_SSSE3 __strcat_ssse3 -# define STRCAT_SSE2 __strcat_sse2 -# define STRCAT_SSE2_UNALIGNED __strcat_sse2_unaligned -# define __GI_STRCAT __GI_strcat -# define __GI___STRCAT __GI___strcat -#endif - - -/* Define multiple versions only for the definition in libc. */ -#if IS_IN (libc) - .text -ENTRY(STRCAT) - .type STRCAT, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq STRCAT_SSE2_UNALIGNED(%rip), %rax - HAS_ARCH_FEATURE (Fast_Unaligned_Load) - jnz 2f - leaq STRCAT_SSE2(%rip), %rax - HAS_CPU_FEATURE (SSSE3) - jz 2f - leaq STRCAT_SSSE3(%rip), %rax -2: ret -END(STRCAT) - -# undef ENTRY -# define ENTRY(name) \ - .type STRCAT_SSE2, @function; \ - .align 16; \ - .globl STRCAT_SSE2; \ - .hidden STRCAT_SSE2; \ - STRCAT_SSE2: cfi_startproc; \ - CALL_MCOUNT -# undef END -# define END(name) \ - cfi_endproc; .size STRCAT_SSE2, .-STRCAT_SSE2 -# undef libc_hidden_builtin_def -/* It doesn't make sense to send libc-internal strcat calls through a PLT. - The speedup we get from using SSSE3 instruction is likely eaten away - by the indirect call in the PLT. */ -# define libc_hidden_builtin_def(name) \ - .globl __GI_STRCAT; __GI_STRCAT = STRCAT_SSE2 -# undef libc_hidden_def -# define libc_hidden_def(name) \ - .globl __GI___STRCAT; __GI___STRCAT = STRCAT_SSE2 -#endif - -#ifndef USE_AS_STRNCAT -# include "../strcat.S" -#endif diff --git a/sysdeps/x86_64/multiarch/strcat.c b/sysdeps/x86_64/multiarch/strcat.c new file mode 100644 index 0000000000..1f7f6263f3 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcat.c @@ -0,0 +1,35 @@ +/* Multiple versions of strcat. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) +# define strcat __redirect_strcat +# include <string.h> +# undef strcat + +# define SYMBOL_NAME strcat +# include "ifunc-unaligned-ssse3.h" + +libc_ifunc_redirected (__redirect_strcat, strcat, IFUNC_SELECTOR ()); + +# ifdef SHARED +__hidden_ver1 (strcat, __GI_strcat, __redirect_strcat) + __attribute__ ((visibility ("hidden"))); +# endif +#endif diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S new file mode 100644 index 0000000000..47bc3c9949 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strchr-avx2.S @@ -0,0 +1,254 @@ +/* strchr/strchrnul optimized with AVX2. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# ifndef STRCHR +# define STRCHR __strchr_avx2 +# endif + +# ifdef USE_AS_WCSCHR +# define VPBROADCAST vpbroadcastd +# define VPCMPEQ vpcmpeqd +# define CHAR_REG esi +# else +# define VPBROADCAST vpbroadcastb +# define VPCMPEQ vpcmpeqb +# define CHAR_REG sil +# endif + +# ifndef VZEROUPPER +# define VZEROUPPER vzeroupper +# endif + +# define VEC_SIZE 32 + + .section .text.avx,"ax",@progbits +ENTRY (STRCHR) + movl %edi, %ecx + /* Broadcast CHAR to YMM0. */ + vmovd %esi, %xmm0 + vpxor %xmm9, %xmm9, %xmm9 + VPBROADCAST %xmm0, %ymm0 + /* Check if we may cross page boundary with one vector load. */ + andl $(2 * VEC_SIZE - 1), %ecx + cmpl $VEC_SIZE, %ecx + ja L(cros_page_boundary) + + /* Check the first VEC_SIZE bytes. Search for both CHAR and the + null byte. */ + vmovdqu (%rdi), %ymm8 + VPCMPEQ %ymm8, %ymm0, %ymm1 + VPCMPEQ %ymm8, %ymm9, %ymm2 + vpor %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x0) + + /* Align data for aligned loads in the loop. */ + addq $VEC_SIZE, %rdi + andl $(VEC_SIZE - 1), %ecx + andq $-VEC_SIZE, %rdi + + jmp L(more_4x_vec) + + .p2align 4 +L(cros_page_boundary): + andl $(VEC_SIZE - 1), %ecx + andq $-VEC_SIZE, %rdi + vmovdqu (%rdi), %ymm8 + VPCMPEQ %ymm8, %ymm0, %ymm1 + VPCMPEQ %ymm8, %ymm9, %ymm2 + vpor %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %eax + /* Remove the leading bytes. */ + sarl %cl, %eax + testl %eax, %eax + jz L(aligned_more) + /* Found CHAR or the null byte. */ + tzcntl %eax, %eax + addq %rcx, %rax +# ifdef USE_AS_STRCHRNUL + addq %rdi, %rax +# else + xorl %edx, %edx + leaq (%rdi, %rax), %rax + cmp (%rax), %CHAR_REG + cmovne %rdx, %rax +# endif + VZEROUPPER + ret + + .p2align 4 +L(aligned_more): + addq $VEC_SIZE, %rdi + +L(more_4x_vec): + /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time + since data is only aligned to VEC_SIZE. */ + vmovdqa (%rdi), %ymm8 + VPCMPEQ %ymm8, %ymm0, %ymm1 + VPCMPEQ %ymm8, %ymm9, %ymm2 + vpor %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x0) + + vmovdqa VEC_SIZE(%rdi), %ymm8 + VPCMPEQ %ymm8, %ymm0, %ymm1 + VPCMPEQ %ymm8, %ymm9, %ymm2 + vpor %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x1) + + vmovdqa (VEC_SIZE * 2)(%rdi), %ymm8 + VPCMPEQ %ymm8, %ymm0, %ymm1 + VPCMPEQ %ymm8, %ymm9, %ymm2 + vpor %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x2) + + vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8 + VPCMPEQ %ymm8, %ymm0, %ymm1 + VPCMPEQ %ymm8, %ymm9, %ymm2 + vpor %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x3) + + addq $(VEC_SIZE * 4), %rdi + + /* Align data to 4 * VEC_SIZE. */ + movq %rdi, %rcx + andl $(4 * VEC_SIZE - 1), %ecx + andq $-(4 * VEC_SIZE), %rdi + + .p2align 4 +L(loop_4x_vec): + /* Compare 4 * VEC at a time forward. */ + vmovdqa (%rdi), %ymm5 + vmovdqa VEC_SIZE(%rdi), %ymm6 + vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7 + vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8 + + VPCMPEQ %ymm5, %ymm0, %ymm1 + VPCMPEQ %ymm6, %ymm0, %ymm2 + VPCMPEQ %ymm7, %ymm0, %ymm3 + VPCMPEQ %ymm8, %ymm0, %ymm4 + + VPCMPEQ %ymm5, %ymm9, %ymm5 + VPCMPEQ %ymm6, %ymm9, %ymm6 + VPCMPEQ %ymm7, %ymm9, %ymm7 + VPCMPEQ %ymm8, %ymm9, %ymm8 + + vpor %ymm1, %ymm5, %ymm1 + vpor %ymm2, %ymm6, %ymm2 + vpor %ymm3, %ymm7, %ymm3 + vpor %ymm4, %ymm8, %ymm4 + + vpor %ymm1, %ymm2, %ymm5 + vpor %ymm3, %ymm4, %ymm6 + + vpor %ymm5, %ymm6, %ymm5 + + vpmovmskb %ymm5, %eax + testl %eax, %eax + jnz L(4x_vec_end) + + addq $(VEC_SIZE * 4), %rdi + + jmp L(loop_4x_vec) + + .p2align 4 +L(first_vec_x0): + /* Found CHAR or the null byte. */ + tzcntl %eax, %eax +# ifdef USE_AS_STRCHRNUL + addq %rdi, %rax +# else + xorl %edx, %edx + leaq (%rdi, %rax), %rax + cmp (%rax), %CHAR_REG + cmovne %rdx, %rax +# endif + VZEROUPPER + ret + + .p2align 4 +L(first_vec_x1): + tzcntl %eax, %eax +# ifdef USE_AS_STRCHRNUL + addq $VEC_SIZE, %rax + addq %rdi, %rax +# else + xorl %edx, %edx + leaq VEC_SIZE(%rdi, %rax), %rax + cmp (%rax), %CHAR_REG + cmovne %rdx, %rax +# endif + VZEROUPPER + ret + + .p2align 4 +L(first_vec_x2): + tzcntl %eax, %eax +# ifdef USE_AS_STRCHRNUL + addq $(VEC_SIZE * 2), %rax + addq %rdi, %rax +# else + xorl %edx, %edx + leaq (VEC_SIZE * 2)(%rdi, %rax), %rax + cmp (%rax), %CHAR_REG + cmovne %rdx, %rax +# endif + VZEROUPPER + ret + + .p2align 4 +L(4x_vec_end): + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x0) + vpmovmskb %ymm2, %eax + testl %eax, %eax + jnz L(first_vec_x1) + vpmovmskb %ymm3, %eax + testl %eax, %eax + jnz L(first_vec_x2) + vpmovmskb %ymm4, %eax + testl %eax, %eax +L(first_vec_x3): + tzcntl %eax, %eax +# ifdef USE_AS_STRCHRNUL + addq $(VEC_SIZE * 3), %rax + addq %rdi, %rax +# else + xorl %edx, %edx + leaq (VEC_SIZE * 3)(%rdi, %rax), %rax + cmp (%rax), %CHAR_REG + cmovne %rdx, %rax +# endif + VZEROUPPER + ret + +END (STRCHR) +#endif diff --git a/sysdeps/x86_64/multiarch/strchr-sse2-no-bsf.S b/sysdeps/x86_64/multiarch/strchr-sse2-no-bsf.S index 979d112b28..93fb661da2 100644 --- a/sysdeps/x86_64/multiarch/strchr-sse2-no-bsf.S +++ b/sysdeps/x86_64/multiarch/strchr-sse2-no-bsf.S @@ -1,5 +1,5 @@ /* strchr with SSE2 without bsf - Copyright (C) 2011-2016 Free Software Foundation, Inc. + Copyright (C) 2011-2018 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. diff --git a/sysdeps/x86_64/multiarch/strchr-sse2.S b/sysdeps/x86_64/multiarch/strchr-sse2.S new file mode 100644 index 0000000000..8a6e77195c --- /dev/null +++ b/sysdeps/x86_64/multiarch/strchr-sse2.S @@ -0,0 +1,28 @@ +/* strchr optimized with SSE2. + Copyright (C) 2009-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) +# define strchr __strchr_sse2 + +# undef weak_alias +# define weak_alias(strchr, index) +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(strchr) +#endif + +#include "../strchr.S" diff --git a/sysdeps/x86_64/multiarch/strchr.S b/sysdeps/x86_64/multiarch/strchr.S deleted file mode 100644 index 40683ad32b..0000000000 --- a/sysdeps/x86_64/multiarch/strchr.S +++ /dev/null @@ -1,57 +0,0 @@ -/* Multiple versions of strchr - Copyright (C) 2009-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - -/* Define multiple versions only for the definition in libc. */ -#if IS_IN (libc) - .text -ENTRY(strchr) - .type strchr, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq __strchr_sse2(%rip), %rax -2: HAS_ARCH_FEATURE (Slow_BSF) - jz 3f - leaq __strchr_sse2_no_bsf(%rip), %rax -3: ret -END(strchr) - - - -# undef ENTRY -# define ENTRY(name) \ - .type __strchr_sse2, @function; \ - .align 16; \ - .globl __strchr_sse2; \ - .hidden __strchr_sse2; \ - __strchr_sse2: cfi_startproc; \ - CALL_MCOUNT -# undef END -# define END(name) \ - cfi_endproc; .size __strchr_sse2, .-__strchr_sse2 -# undef libc_hidden_builtin_def -/* It doesn't make sense to send libc-internal strchr calls through a PLT. - The speedup we get from using SSE4.2 instruction is likely eaten away - by the indirect call in the PLT. */ -# define libc_hidden_builtin_def(name) \ - .globl __GI_strchr; __GI_strchr = __strchr_sse2 -#endif - -#include "../strchr.S" diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c new file mode 100644 index 0000000000..76d64fb378 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strchr.c @@ -0,0 +1,55 @@ +/* Multiple versions of strchr. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2009-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) +# define strchr __redirect_strchr +# include <string.h> +# undef strchr + +# define SYMBOL_NAME strchr +# include <init-arch.h> + +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_no_bsf) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; + +static inline void * +IFUNC_SELECTOR (void) +{ + const struct cpu_features* cpu_features = __get_cpu_features (); + + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) + && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) + return OPTIMIZE (avx2); + + if (CPU_FEATURES_ARCH_P (cpu_features, Slow_BSF)) + return OPTIMIZE (sse2_no_bsf); + + return OPTIMIZE (sse2); +} + +libc_ifunc_redirected (__redirect_strchr, strchr, IFUNC_SELECTOR ()); +weak_alias (strchr, index) +# ifdef SHARED +__hidden_ver1 (strchr, __GI_strchr, __redirect_strchr) + __attribute__((visibility ("hidden"))); +# endif +#endif diff --git a/sysdeps/x86_64/multiarch/strchrnul-avx2.S b/sysdeps/x86_64/multiarch/strchrnul-avx2.S new file mode 100644 index 0000000000..fa0cc09760 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strchrnul-avx2.S @@ -0,0 +1,3 @@ +#define STRCHR __strchrnul_avx2 +#define USE_AS_STRCHRNUL 1 +#include "strchr-avx2.S" diff --git a/sysdeps/x86_64/multiarch/strchrnul-sse2.S b/sysdeps/x86_64/multiarch/strchrnul-sse2.S new file mode 100644 index 0000000000..d4a2be118e --- /dev/null +++ b/sysdeps/x86_64/multiarch/strchrnul-sse2.S @@ -0,0 +1,26 @@ +/* strchrnul optimized with SSE2. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) +# define __strchrnul __strchrnul_sse2 + +# undef weak_alias +# define weak_alias(__strchrnul, strchrnul) +#endif + +#include "../strchrnul.S" diff --git a/sysdeps/x86_64/multiarch/strchrnul.c b/sysdeps/x86_64/multiarch/strchrnul.c new file mode 100644 index 0000000000..7514999341 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strchrnul.c @@ -0,0 +1,34 @@ +/* Multiple versions of strchrnul. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) +# define strchrnul __redirect_strchrnul +# define __strchrnul __redirect___strchrnul +# include <string.h> +# undef __strchrnul +# undef strchrnul + +# define SYMBOL_NAME strchrnul +# include "ifunc-avx2.h" + +libc_ifunc_redirected (__redirect_strchrnul, __strchrnul, + IFUNC_SELECTOR ()); +weak_alias (__strchrnul, strchrnul) +#endif diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S new file mode 100644 index 0000000000..e8397f3b05 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S @@ -0,0 +1,847 @@ +/* strcmp/wcscmp/strncmp/wcsncmp optimized with AVX2. + Copyright (C) 2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# ifndef STRCMP +# define STRCMP __strcmp_avx2 +# endif + +# define PAGE_SIZE 4096 + +/* VEC_SIZE = Number of bytes in a ymm register */ +# define VEC_SIZE 32 + +/* Shift for dividing by (VEC_SIZE * 4). */ +# define DIVIDE_BY_VEC_4_SHIFT 7 +# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) +# error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) +# endif + +# ifdef USE_AS_WCSCMP +/* Compare packed dwords. */ +# define VPCMPEQ vpcmpeqd +/* Compare packed dwords and store minimum. */ +# define VPMINU vpminud +/* 1 dword char == 4 bytes. */ +# define SIZE_OF_CHAR 4 +# else +/* Compare packed bytes. */ +# define VPCMPEQ vpcmpeqb +/* Compare packed bytes and store minimum. */ +# define VPMINU vpminub +/* 1 byte char == 1 byte. */ +# define SIZE_OF_CHAR 1 +# endif + +# ifndef VZEROUPPER +# define VZEROUPPER vzeroupper +# endif + +/* Warning! + wcscmp/wcsncmp have to use SIGNED comparison for elements. + strcmp/strncmp have to use UNSIGNED comparison for elements. +*/ + +/* The main idea of the string comparison (byte or dword) using AVX2 + consists of comparing (VPCMPEQ) two ymm vectors. The latter can be on + either packed bytes or dwords depending on USE_AS_WCSCMP. In order + to check the null char, algorithm keeps the matched bytes/dwords, + requiring two more AVX2 instructions (VPMINU and VPCMPEQ). In general, + the costs of comparing VEC_SIZE bytes (32-bytes) are two VPCMPEQ and + one VPMINU instructions, together with movdqu and testl instructions. + Main loop (away from from page boundary) compares 4 vectors are a time, + effectively comparing 4 x VEC_SIZE bytes (128 bytes) on each loop. + + The routine strncmp/wcsncmp (enabled by defining USE_AS_STRNCMP) logic + is the same as strcmp, except that an a maximum offset is tracked. If + the maximum offset is reached before a difference is found, zero is + returned. */ + + .section .text.avx,"ax",@progbits +ENTRY (STRCMP) +# ifdef USE_AS_STRNCMP + /* Check for simple cases (0 or 1) in offset. */ + cmp $1, %rdx + je L(char0) + jb L(zero) +# ifdef USE_AS_WCSCMP + /* Convert units: from wide to byte char. */ + shl $2, %rdx +# endif + /* Register %r11 tracks the maximum offset. */ + movq %rdx, %r11 +# endif + movl %edi, %eax + xorl %edx, %edx + /* Make %ymm7 all zeros in this function. */ + vpxor %ymm7, %ymm7, %ymm7 + orl %esi, %eax + andl $(PAGE_SIZE - 1), %eax + cmpl $(PAGE_SIZE - (VEC_SIZE * 4)), %eax + jg L(cross_page) + /* Start comparing 4 vectors. */ + vmovdqu (%rdi), %ymm1 + VPCMPEQ (%rsi), %ymm1, %ymm0 + VPMINU %ymm1, %ymm0, %ymm0 + VPCMPEQ %ymm7, %ymm0, %ymm0 + vpmovmskb %ymm0, %ecx + testl %ecx, %ecx + je L(next_3_vectors) + tzcntl %ecx, %edx +# ifdef USE_AS_STRNCMP + /* Return 0 if the mismatched index (%rdx) is after the maximum + offset (%r11). */ + cmpq %r11, %rdx + jae L(zero) +# endif +# ifdef USE_AS_WCSCMP + xorl %eax, %eax + movl (%rdi, %rdx), %ecx + cmpl (%rsi, %rdx), %ecx + je L(return) +L(wcscmp_return): + setl %al + negl %eax + orl $1, %eax +L(return): +# else + movzbl (%rdi, %rdx), %eax + movzbl (%rsi, %rdx), %edx + subl %edx, %eax +# endif + VZEROUPPER + ret + + .p2align 4 +L(return_vec_size): + tzcntl %ecx, %edx +# ifdef USE_AS_STRNCMP + /* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after + the maximum offset (%r11). */ + addq $VEC_SIZE, %rdx + cmpq %r11, %rdx + jae L(zero) +# ifdef USE_AS_WCSCMP + xorl %eax, %eax + movl (%rdi, %rdx), %ecx + cmpl (%rsi, %rdx), %ecx + jne L(wcscmp_return) +# else + movzbl (%rdi, %rdx), %eax + movzbl (%rsi, %rdx), %edx + subl %edx, %eax +# endif +# else +# ifdef USE_AS_WCSCMP + xorl %eax, %eax + movl VEC_SIZE(%rdi, %rdx), %ecx + cmpl VEC_SIZE(%rsi, %rdx), %ecx + jne L(wcscmp_return) +# else + movzbl VEC_SIZE(%rdi, %rdx), %eax + movzbl VEC_SIZE(%rsi, %rdx), %edx + subl %edx, %eax +# endif +# endif + VZEROUPPER + ret + + .p2align 4 +L(return_2_vec_size): + tzcntl %ecx, %edx +# ifdef USE_AS_STRNCMP + /* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is + after the maximum offset (%r11). */ + addq $(VEC_SIZE * 2), %rdx + cmpq %r11, %rdx + jae L(zero) +# ifdef USE_AS_WCSCMP + xorl %eax, %eax + movl (%rdi, %rdx), %ecx + cmpl (%rsi, %rdx), %ecx + jne L(wcscmp_return) +# else + movzbl (%rdi, %rdx), %eax + movzbl (%rsi, %rdx), %edx + subl %edx, %eax +# endif +# else +# ifdef USE_AS_WCSCMP + xorl %eax, %eax + movl (VEC_SIZE * 2)(%rdi, %rdx), %ecx + cmpl (VEC_SIZE * 2)(%rsi, %rdx), %ecx + jne L(wcscmp_return) +# else + movzbl (VEC_SIZE * 2)(%rdi, %rdx), %eax + movzbl (VEC_SIZE * 2)(%rsi, %rdx), %edx + subl %edx, %eax +# endif +# endif + VZEROUPPER + ret + + .p2align 4 +L(return_3_vec_size): + tzcntl %ecx, %edx +# ifdef USE_AS_STRNCMP + /* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is + after the maximum offset (%r11). */ + addq $(VEC_SIZE * 3), %rdx + cmpq %r11, %rdx + jae L(zero) +# ifdef USE_AS_WCSCMP + xorl %eax, %eax + movl (%rdi, %rdx), %ecx + cmpl (%rsi, %rdx), %ecx + jne L(wcscmp_return) +# else + movzbl (%rdi, %rdx), %eax + movzbl (%rsi, %rdx), %edx + subl %edx, %eax +# endif +# else +# ifdef USE_AS_WCSCMP + xorl %eax, %eax + movl (VEC_SIZE * 3)(%rdi, %rdx), %ecx + cmpl (VEC_SIZE * 3)(%rsi, %rdx), %ecx + jne L(wcscmp_return) +# else + movzbl (VEC_SIZE * 3)(%rdi, %rdx), %eax + movzbl (VEC_SIZE * 3)(%rsi, %rdx), %edx + subl %edx, %eax +# endif +# endif + VZEROUPPER + ret + + .p2align 4 +L(next_3_vectors): + vmovdqu VEC_SIZE(%rdi), %ymm6 + VPCMPEQ VEC_SIZE(%rsi), %ymm6, %ymm3 + VPMINU %ymm6, %ymm3, %ymm3 + VPCMPEQ %ymm7, %ymm3, %ymm3 + vpmovmskb %ymm3, %ecx + testl %ecx, %ecx + jne L(return_vec_size) + vmovdqu (VEC_SIZE * 2)(%rdi), %ymm5 + vmovdqu (VEC_SIZE * 3)(%rdi), %ymm4 + vmovdqu (VEC_SIZE * 3)(%rsi), %ymm0 + VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm5, %ymm2 + VPMINU %ymm5, %ymm2, %ymm2 + VPCMPEQ %ymm4, %ymm0, %ymm0 + VPCMPEQ %ymm7, %ymm2, %ymm2 + vpmovmskb %ymm2, %ecx + testl %ecx, %ecx + jne L(return_2_vec_size) + VPMINU %ymm4, %ymm0, %ymm0 + VPCMPEQ %ymm7, %ymm0, %ymm0 + vpmovmskb %ymm0, %ecx + testl %ecx, %ecx + jne L(return_3_vec_size) +L(main_loop_header): + leaq (VEC_SIZE * 4)(%rdi), %rdx + movl $PAGE_SIZE, %ecx + /* Align load via RAX. */ + andq $-(VEC_SIZE * 4), %rdx + subq %rdi, %rdx + leaq (%rdi, %rdx), %rax +# ifdef USE_AS_STRNCMP + /* Starting from this point, the maximum offset, or simply the + 'offset', DECREASES by the same amount when base pointers are + moved forward. Return 0 when: + 1) On match: offset <= the matched vector index. + 2) On mistmach, offset is before the mistmatched index. + */ + subq %rdx, %r11 + jbe L(zero) +# endif + addq %rsi, %rdx + movq %rdx, %rsi + andl $(PAGE_SIZE - 1), %esi + /* Number of bytes before page crossing. */ + subq %rsi, %rcx + /* Number of VEC_SIZE * 4 blocks before page crossing. */ + shrq $DIVIDE_BY_VEC_4_SHIFT, %rcx + /* ESI: Number of VEC_SIZE * 4 blocks before page crossing. */ + movl %ecx, %esi + jmp L(loop_start) + + .p2align 4 +L(loop): +# ifdef USE_AS_STRNCMP + /* Base pointers are moved forward by 4 * VEC_SIZE. Decrease + the maximum offset (%r11) by the same amount. */ + subq $(VEC_SIZE * 4), %r11 + jbe L(zero) +# endif + addq $(VEC_SIZE * 4), %rax + addq $(VEC_SIZE * 4), %rdx +L(loop_start): + testl %esi, %esi + leal -1(%esi), %esi + je L(loop_cross_page) +L(back_to_loop): + /* Main loop, comparing 4 vectors are a time. */ + vmovdqa (%rax), %ymm0 + vmovdqa VEC_SIZE(%rax), %ymm3 + VPCMPEQ (%rdx), %ymm0, %ymm4 + VPCMPEQ VEC_SIZE(%rdx), %ymm3, %ymm1 + VPMINU %ymm0, %ymm4, %ymm4 + VPMINU %ymm3, %ymm1, %ymm1 + vmovdqa (VEC_SIZE * 2)(%rax), %ymm2 + VPMINU %ymm1, %ymm4, %ymm0 + vmovdqa (VEC_SIZE * 3)(%rax), %ymm3 + VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm2, %ymm5 + VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm3, %ymm6 + VPMINU %ymm2, %ymm5, %ymm5 + VPMINU %ymm3, %ymm6, %ymm6 + VPMINU %ymm5, %ymm0, %ymm0 + VPMINU %ymm6, %ymm0, %ymm0 + VPCMPEQ %ymm7, %ymm0, %ymm0 + + /* Test each mask (32 bits) individually because for VEC_SIZE + == 32 is not possible to OR the four masks and keep all bits + in a 64-bit integer register, differing from SSE2 strcmp + where ORing is possible. */ + vpmovmskb %ymm0, %ecx + testl %ecx, %ecx + je L(loop) + VPCMPEQ %ymm7, %ymm4, %ymm0 + vpmovmskb %ymm0, %edi + testl %edi, %edi + je L(test_vec) + tzcntl %edi, %ecx +# ifdef USE_AS_STRNCMP + cmpq %rcx, %r11 + jbe L(zero) +# ifdef USE_AS_WCSCMP + movq %rax, %rsi + xorl %eax, %eax + movl (%rsi, %rcx), %edi + cmpl (%rdx, %rcx), %edi + jne L(wcscmp_return) +# else + movzbl (%rax, %rcx), %eax + movzbl (%rdx, %rcx), %edx + subl %edx, %eax +# endif +# else +# ifdef USE_AS_WCSCMP + movq %rax, %rsi + xorl %eax, %eax + movl (%rsi, %rcx), %edi + cmpl (%rdx, %rcx), %edi + jne L(wcscmp_return) +# else + movzbl (%rax, %rcx), %eax + movzbl (%rdx, %rcx), %edx + subl %edx, %eax +# endif +# endif + VZEROUPPER + ret + + .p2align 4 +L(test_vec): +# ifdef USE_AS_STRNCMP + /* The first vector matched. Return 0 if the maximum offset + (%r11) <= VEC_SIZE. */ + cmpq $VEC_SIZE, %r11 + jbe L(zero) +# endif + VPCMPEQ %ymm7, %ymm1, %ymm1 + vpmovmskb %ymm1, %ecx + testl %ecx, %ecx + je L(test_2_vec) + tzcntl %ecx, %edi +# ifdef USE_AS_STRNCMP + addq $VEC_SIZE, %rdi + cmpq %rdi, %r11 + jbe L(zero) +# ifdef USE_AS_WCSCMP + movq %rax, %rsi + xorl %eax, %eax + movl (%rsi, %rdi), %ecx + cmpl (%rdx, %rdi), %ecx + jne L(wcscmp_return) +# else + movzbl (%rax, %rdi), %eax + movzbl (%rdx, %rdi), %edx + subl %edx, %eax +# endif +# else +# ifdef USE_AS_WCSCMP + movq %rax, %rsi + xorl %eax, %eax + movl VEC_SIZE(%rsi, %rdi), %ecx + cmpl VEC_SIZE(%rdx, %rdi), %ecx + jne L(wcscmp_return) +# else + movzbl VEC_SIZE(%rax, %rdi), %eax + movzbl VEC_SIZE(%rdx, %rdi), %edx + subl %edx, %eax +# endif +# endif + VZEROUPPER + ret + + .p2align 4 +L(test_2_vec): +# ifdef USE_AS_STRNCMP + /* The first 2 vectors matched. Return 0 if the maximum offset + (%r11) <= 2 * VEC_SIZE. */ + cmpq $(VEC_SIZE * 2), %r11 + jbe L(zero) +# endif + VPCMPEQ %ymm7, %ymm5, %ymm5 + vpmovmskb %ymm5, %ecx + testl %ecx, %ecx + je L(test_3_vec) + tzcntl %ecx, %edi +# ifdef USE_AS_STRNCMP + addq $(VEC_SIZE * 2), %rdi + cmpq %rdi, %r11 + jbe L(zero) +# ifdef USE_AS_WCSCMP + movq %rax, %rsi + xorl %eax, %eax + movl (%rsi, %rdi), %ecx + cmpl (%rdx, %rdi), %ecx + jne L(wcscmp_return) +# else + movzbl (%rax, %rdi), %eax + movzbl (%rdx, %rdi), %edx + subl %edx, %eax +# endif +# else +# ifdef USE_AS_WCSCMP + movq %rax, %rsi + xorl %eax, %eax + movl (VEC_SIZE * 2)(%rsi, %rdi), %ecx + cmpl (VEC_SIZE * 2)(%rdx, %rdi), %ecx + jne L(wcscmp_return) +# else + movzbl (VEC_SIZE * 2)(%rax, %rdi), %eax + movzbl (VEC_SIZE * 2)(%rdx, %rdi), %edx + subl %edx, %eax +# endif +# endif + VZEROUPPER + ret + + .p2align 4 +L(test_3_vec): +# ifdef USE_AS_STRNCMP + /* The first 3 vectors matched. Return 0 if the maximum offset + (%r11) <= 3 * VEC_SIZE. */ + cmpq $(VEC_SIZE * 3), %r11 + jbe L(zero) +# endif + VPCMPEQ %ymm7, %ymm6, %ymm6 + vpmovmskb %ymm6, %esi + tzcntl %esi, %ecx +# ifdef USE_AS_STRNCMP + addq $(VEC_SIZE * 3), %rcx + cmpq %rcx, %r11 + jbe L(zero) +# ifdef USE_AS_WCSCMP + movq %rax, %rsi + xorl %eax, %eax + movl (%rsi, %rcx), %esi + cmpl (%rdx, %rcx), %esi + jne L(wcscmp_return) +# else + movzbl (%rax, %rcx), %eax + movzbl (%rdx, %rcx), %edx + subl %edx, %eax +# endif +# else +# ifdef USE_AS_WCSCMP + movq %rax, %rsi + xorl %eax, %eax + movl (VEC_SIZE * 3)(%rsi, %rcx), %esi + cmpl (VEC_SIZE * 3)(%rdx, %rcx), %esi + jne L(wcscmp_return) +# else + movzbl (VEC_SIZE * 3)(%rax, %rcx), %eax + movzbl (VEC_SIZE * 3)(%rdx, %rcx), %edx + subl %edx, %eax +# endif +# endif + VZEROUPPER + ret + + .p2align 4 +L(loop_cross_page): + xorl %r10d, %r10d + movq %rdx, %rcx + /* Align load via RDX. We load the extra ECX bytes which should + be ignored. */ + andl $((VEC_SIZE * 4) - 1), %ecx + /* R10 is -RCX. */ + subq %rcx, %r10 + + /* This works only if VEC_SIZE * 2 == 64. */ +# if (VEC_SIZE * 2) != 64 +# error (VEC_SIZE * 2) != 64 +# endif + + /* Check if the first VEC_SIZE * 2 bytes should be ignored. */ + cmpl $(VEC_SIZE * 2), %ecx + jge L(loop_cross_page_2_vec) + + vmovdqu (%rax, %r10), %ymm2 + vmovdqu VEC_SIZE(%rax, %r10), %ymm3 + VPCMPEQ (%rdx, %r10), %ymm2, %ymm0 + VPCMPEQ VEC_SIZE(%rdx, %r10), %ymm3, %ymm1 + VPMINU %ymm2, %ymm0, %ymm0 + VPMINU %ymm3, %ymm1, %ymm1 + VPCMPEQ %ymm7, %ymm0, %ymm0 + VPCMPEQ %ymm7, %ymm1, %ymm1 + + vpmovmskb %ymm0, %edi + vpmovmskb %ymm1, %esi + + salq $32, %rsi + xorq %rsi, %rdi + + /* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes. */ + shrq %cl, %rdi + + testq %rdi, %rdi + je L(loop_cross_page_2_vec) + tzcntq %rdi, %rcx +# ifdef USE_AS_STRNCMP + cmpq %rcx, %r11 + jbe L(zero) +# ifdef USE_AS_WCSCMP + movq %rax, %rsi + xorl %eax, %eax + movl (%rsi, %rcx), %edi + cmpl (%rdx, %rcx), %edi + jne L(wcscmp_return) +# else + movzbl (%rax, %rcx), %eax + movzbl (%rdx, %rcx), %edx + subl %edx, %eax +# endif +# else +# ifdef USE_AS_WCSCMP + movq %rax, %rsi + xorl %eax, %eax + movl (%rsi, %rcx), %edi + cmpl (%rdx, %rcx), %edi + jne L(wcscmp_return) +# else + movzbl (%rax, %rcx), %eax + movzbl (%rdx, %rcx), %edx + subl %edx, %eax +# endif +# endif + VZEROUPPER + ret + + .p2align 4 +L(loop_cross_page_2_vec): + /* The first VEC_SIZE * 2 bytes match or are ignored. */ + vmovdqu (VEC_SIZE * 2)(%rax, %r10), %ymm2 + vmovdqu (VEC_SIZE * 3)(%rax, %r10), %ymm3 + VPCMPEQ (VEC_SIZE * 2)(%rdx, %r10), %ymm2, %ymm5 + VPMINU %ymm2, %ymm5, %ymm5 + VPCMPEQ (VEC_SIZE * 3)(%rdx, %r10), %ymm3, %ymm6 + VPCMPEQ %ymm7, %ymm5, %ymm5 + VPMINU %ymm3, %ymm6, %ymm6 + VPCMPEQ %ymm7, %ymm6, %ymm6 + + vpmovmskb %ymm5, %edi + vpmovmskb %ymm6, %esi + + salq $32, %rsi + xorq %rsi, %rdi + + xorl %r8d, %r8d + /* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes. */ + subl $(VEC_SIZE * 2), %ecx + jle 1f + /* Skip ECX bytes. */ + shrq %cl, %rdi + /* R8 has number of bytes skipped. */ + movl %ecx, %r8d +1: + /* Before jumping back to the loop, set ESI to the number of + VEC_SIZE * 4 blocks before page crossing. */ + movl $(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi + + testq %rdi, %rdi + je L(back_to_loop) + tzcntq %rdi, %rcx + addq %r10, %rcx + /* Adjust for number of bytes skipped. */ + addq %r8, %rcx +# ifdef USE_AS_STRNCMP + addq $(VEC_SIZE * 2), %rcx + subq %rcx, %r11 + jbe L(zero) +# ifdef USE_AS_WCSCMP + movq %rax, %rsi + xorl %eax, %eax + movl (%rsi, %rcx), %edi + cmpl (%rdx, %rcx), %edi + jne L(wcscmp_return) +# else + movzbl (%rax, %rcx), %eax + movzbl (%rdx, %rcx), %edx + subl %edx, %eax +# endif +# else +# ifdef USE_AS_WCSCMP + movq %rax, %rsi + xorl %eax, %eax + movl (VEC_SIZE * 2)(%rsi, %rcx), %edi + cmpl (VEC_SIZE * 2)(%rdx, %rcx), %edi + jne L(wcscmp_return) +# else + movzbl (VEC_SIZE * 2)(%rax, %rcx), %eax + movzbl (VEC_SIZE * 2)(%rdx, %rcx), %edx + subl %edx, %eax +# endif +# endif + VZEROUPPER + ret + + .p2align 4 +L(cross_page_loop): + /* Check one byte/dword at a time. */ +# ifdef USE_AS_WCSCMP + cmpl %ecx, %eax +# else + subl %ecx, %eax +# endif + jne L(different) + addl $SIZE_OF_CHAR, %edx + cmpl $(VEC_SIZE * 4), %edx + je L(main_loop_header) +# ifdef USE_AS_STRNCMP + cmpq %r11, %rdx + jae L(zero) +# endif +# ifdef USE_AS_WCSCMP + movl (%rdi, %rdx), %eax + movl (%rsi, %rdx), %ecx +# else + movzbl (%rdi, %rdx), %eax + movzbl (%rsi, %rdx), %ecx +# endif + /* Check null char. */ + testl %eax, %eax + jne L(cross_page_loop) + /* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED + comparisons. */ + subl %ecx, %eax +# ifndef USE_AS_WCSCMP +L(different): +# endif + VZEROUPPER + ret + +# ifdef USE_AS_WCSCMP + .p2align 4 +L(different): + /* Use movl to avoid modifying EFLAGS. */ + movl $0, %eax + setl %al + negl %eax + orl $1, %eax + VZEROUPPER + ret +# endif + +# ifdef USE_AS_STRNCMP + .p2align 4 +L(zero): + xorl %eax, %eax + VZEROUPPER + ret + + .p2align 4 +L(char0): +# ifdef USE_AS_WCSCMP + xorl %eax, %eax + movl (%rdi), %ecx + cmpl (%rsi), %ecx + jne L(wcscmp_return) +# else + movzbl (%rsi), %ecx + movzbl (%rdi), %eax + subl %ecx, %eax +# endif + VZEROUPPER + ret +# endif + + .p2align 4 +L(last_vector): + addq %rdx, %rdi + addq %rdx, %rsi +# ifdef USE_AS_STRNCMP + subq %rdx, %r11 +# endif + tzcntl %ecx, %edx +# ifdef USE_AS_STRNCMP + cmpq %r11, %rdx + jae L(zero) +# endif +# ifdef USE_AS_WCSCMP + xorl %eax, %eax + movl (%rdi, %rdx), %ecx + cmpl (%rsi, %rdx), %ecx + jne L(wcscmp_return) +# else + movzbl (%rdi, %rdx), %eax + movzbl (%rsi, %rdx), %edx + subl %edx, %eax +# endif + VZEROUPPER + ret + + /* Comparing on page boundary region requires special treatment: + It must done one vector at the time, starting with the wider + ymm vector if possible, if not, with xmm. If fetching 16 bytes + (xmm) still passes the boundary, byte comparison must be done. + */ + .p2align 4 +L(cross_page): + /* Try one ymm vector at a time. */ + cmpl $(PAGE_SIZE - VEC_SIZE), %eax + jg L(cross_page_1_vector) +L(loop_1_vector): + vmovdqu (%rdi, %rdx), %ymm1 + VPCMPEQ (%rsi, %rdx), %ymm1, %ymm0 + VPMINU %ymm1, %ymm0, %ymm0 + VPCMPEQ %ymm7, %ymm0, %ymm0 + vpmovmskb %ymm0, %ecx + testl %ecx, %ecx + jne L(last_vector) + + addl $VEC_SIZE, %edx + + addl $VEC_SIZE, %eax +# ifdef USE_AS_STRNCMP + /* Return 0 if the current offset (%rdx) >= the maximum offset + (%r11). */ + cmpq %r11, %rdx + jae L(zero) +# endif + cmpl $(PAGE_SIZE - VEC_SIZE), %eax + jle L(loop_1_vector) +L(cross_page_1_vector): + /* Less than 32 bytes to check, try one xmm vector. */ + cmpl $(PAGE_SIZE - 16), %eax + jg L(cross_page_1_xmm) + vmovdqu (%rdi, %rdx), %xmm1 + VPCMPEQ (%rsi, %rdx), %xmm1, %xmm0 + VPMINU %xmm1, %xmm0, %xmm0 + VPCMPEQ %xmm7, %xmm0, %xmm0 + vpmovmskb %xmm0, %ecx + testl %ecx, %ecx + jne L(last_vector) + + addl $16, %edx +# ifndef USE_AS_WCSCMP + addl $16, %eax +# endif +# ifdef USE_AS_STRNCMP + /* Return 0 if the current offset (%rdx) >= the maximum offset + (%r11). */ + cmpq %r11, %rdx + jae L(zero) +# endif + +L(cross_page_1_xmm): +# ifndef USE_AS_WCSCMP + /* Less than 16 bytes to check, try 8 byte vector. NB: No need + for wcscmp nor wcsncmp since wide char is 4 bytes. */ + cmpl $(PAGE_SIZE - 8), %eax + jg L(cross_page_8bytes) + vmovq (%rdi, %rdx), %xmm1 + vmovq (%rsi, %rdx), %xmm0 + VPCMPEQ %xmm0, %xmm1, %xmm0 + VPMINU %xmm1, %xmm0, %xmm0 + VPCMPEQ %xmm7, %xmm0, %xmm0 + vpmovmskb %xmm0, %ecx + /* Only last 8 bits are valid. */ + andl $0xff, %ecx + testl %ecx, %ecx + jne L(last_vector) + + addl $8, %edx + addl $8, %eax +# ifdef USE_AS_STRNCMP + /* Return 0 if the current offset (%rdx) >= the maximum offset + (%r11). */ + cmpq %r11, %rdx + jae L(zero) +# endif + +L(cross_page_8bytes): + /* Less than 8 bytes to check, try 4 byte vector. */ + cmpl $(PAGE_SIZE - 4), %eax + jg L(cross_page_4bytes) + vmovd (%rdi, %rdx), %xmm1 + vmovd (%rsi, %rdx), %xmm0 + VPCMPEQ %xmm0, %xmm1, %xmm0 + VPMINU %xmm1, %xmm0, %xmm0 + VPCMPEQ %xmm7, %xmm0, %xmm0 + vpmovmskb %xmm0, %ecx + /* Only last 4 bits are valid. */ + andl $0xf, %ecx + testl %ecx, %ecx + jne L(last_vector) + + addl $4, %edx +# ifdef USE_AS_STRNCMP + /* Return 0 if the current offset (%rdx) >= the maximum offset + (%r11). */ + cmpq %r11, %rdx + jae L(zero) +# endif + +L(cross_page_4bytes): +# endif + /* Less than 4 bytes to check, try one byte/dword at a time. */ +# ifdef USE_AS_STRNCMP + cmpq %r11, %rdx + jae L(zero) +# endif +# ifdef USE_AS_WCSCMP + movl (%rdi, %rdx), %eax + movl (%rsi, %rdx), %ecx +# else + movzbl (%rdi, %rdx), %eax + movzbl (%rsi, %rdx), %ecx +# endif + testl %eax, %eax + jne L(cross_page_loop) + subl %ecx, %eax + VZEROUPPER + ret +END (STRCMP) +#endif diff --git a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S index bf555b4066..a9b6267d15 100644 --- a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S +++ b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S @@ -1,5 +1,5 @@ /* strcmp with unaligned loads - Copyright (C) 2013-2016 Free Software Foundation, Inc. + Copyright (C) 2013-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/multiarch/strcmp-sse2.S b/sysdeps/x86_64/multiarch/strcmp-sse2.S new file mode 100644 index 0000000000..d173ded8c0 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcmp-sse2.S @@ -0,0 +1,28 @@ +/* strcmp optimized with SSE2. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) +# include <sysdep.h> + +# define STRCMP __strcmp_sse2 + +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(strcmp) +#endif + +#include <sysdeps/x86_64/strcmp.S> diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S index 70df84ae32..d3c07bd292 100644 --- a/sysdeps/x86_64/multiarch/strcmp-sse42.S +++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S @@ -1,5 +1,5 @@ /* strcmp with SSE4.2 - Copyright (C) 2009-2016 Free Software Foundation, Inc. + Copyright (C) 2009-2018 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. @@ -17,6 +17,40 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ +#include <sysdep.h> + +#ifndef STRCMP_SSE42 +# define STRCMP_SSE42 __strcmp_sse42 +#endif + +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# include "locale-defines.h" +#endif + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L +/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz + if the new counter > the old one or is 0. */ +# define UPDATE_STRNCMP_COUNTER \ + /* calculate left number to compare */ \ + lea -16(%rcx, %r11), %r9; \ + cmp %r9, %r11; \ + jb LABEL(strcmp_exitz); \ + test %r9, %r9; \ + je LABEL(strcmp_exitz); \ + mov %r9, %r11 +#else +# define UPDATE_STRNCMP_COUNTER +#endif + +#ifdef USE_AVX +# define SECTION avx +# define GLABEL(l) l##_avx +#else +# define SECTION sse4.2 +# define GLABEL(l) l##_sse42 +#endif + +#define LABEL(l) .L##l /* We use 0x1a: _SIDD_SBYTE_OPS @@ -92,6 +126,7 @@ END (GLABEL(__strncasecmp)) STRCMP_SSE42: cfi_startproc + _CET_ENDBR CALL_MCOUNT /* @@ -240,7 +275,7 @@ LABEL(bigger): movslq (%r10, %r9,4), %r9 pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */ lea (%r10, %r9), %r10 - jmp *%r10 /* jump to corresponding case */ + _CET_NOTRACK jmp *%r10 /* jump to corresponding case */ /* * The following cases will be handled by ashr_0 diff --git a/sysdeps/x86_64/multiarch/strcmp-sse4_2.S b/sysdeps/x86_64/multiarch/strcmp-sse4_2.S new file mode 100644 index 0000000000..776e5e060f --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcmp-sse4_2.S @@ -0,0 +1,21 @@ +/* strcmp optimized with SSE4.2. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) +# include "strcmp-sse42.S" +#endif diff --git a/sysdeps/x86_64/multiarch/strcmp.S b/sysdeps/x86_64/multiarch/strcmp.S deleted file mode 100644 index 0e4a113f61..0000000000 --- a/sysdeps/x86_64/multiarch/strcmp.S +++ /dev/null @@ -1,209 +0,0 @@ -/* Multiple versions of strcmp - Copyright (C) 2009-2016 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - -#ifdef USE_AS_STRNCMP -/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz - if the new counter > the old one or is 0. */ -# define UPDATE_STRNCMP_COUNTER \ - /* calculate left number to compare */ \ - lea -16(%rcx, %r11), %r9; \ - cmp %r9, %r11; \ - jb LABEL(strcmp_exitz); \ - test %r9, %r9; \ - je LABEL(strcmp_exitz); \ - mov %r9, %r11 - -# define STRCMP_SSE42 __strncmp_sse42 -# define STRCMP_SSSE3 __strncmp_ssse3 -# define STRCMP_SSE2 __strncmp_sse2 -# define __GI_STRCMP __GI_strncmp -#elif defined USE_AS_STRCASECMP_L -# include "locale-defines.h" - -# define UPDATE_STRNCMP_COUNTER - -# define STRCMP_AVX __strcasecmp_l_avx -# define STRCMP_SSE42 __strcasecmp_l_sse42 -# define STRCMP_SSSE3 __strcasecmp_l_ssse3 -# define STRCMP_SSE2 __strcasecmp_l_sse2 -# define __GI_STRCMP __GI___strcasecmp_l -#elif defined USE_AS_STRNCASECMP_L -# include "locale-defines.h" - -/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz - if the new counter > the old one or is 0. */ -# define UPDATE_STRNCMP_COUNTER \ - /* calculate left number to compare */ \ - lea -16(%rcx, %r11), %r9; \ - cmp %r9, %r11; \ - jb LABEL(strcmp_exitz); \ - test %r9, %r9; \ - je LABEL(strcmp_exitz); \ - mov %r9, %r11 - -# define STRCMP_AVX __strncasecmp_l_avx -# define STRCMP_SSE42 __strncasecmp_l_sse42 -# define STRCMP_SSSE3 __strncasecmp_l_ssse3 -# define STRCMP_SSE2 __strncasecmp_l_sse2 -# define __GI_STRCMP __GI___strncasecmp_l -#else -# define USE_AS_STRCMP -# define UPDATE_STRNCMP_COUNTER -# ifndef STRCMP -# define STRCMP strcmp -# define STRCMP_SSE42 __strcmp_sse42 -# define STRCMP_SSSE3 __strcmp_ssse3 -# define STRCMP_SSE2 __strcmp_sse2 -# define __GI_STRCMP __GI_strcmp -# endif -#endif - -/* Define multiple versions only for the definition in libc. Don't - define multiple versions for strncmp in static library since we - need strncmp before the initialization happened. */ -#if (defined SHARED || !defined USE_AS_STRNCMP) && IS_IN (libc) - .text -ENTRY(STRCMP) - .type STRCMP, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX -#ifdef USE_AS_STRCMP - leaq __strcmp_sse2_unaligned(%rip), %rax - HAS_ARCH_FEATURE (Fast_Unaligned_Load) - jnz 3f -#else - HAS_ARCH_FEATURE (Slow_SSE4_2) - jnz 2f - leaq STRCMP_SSE42(%rip), %rax - HAS_CPU_FEATURE (SSE4_2) - jnz 3f -#endif -2: leaq STRCMP_SSSE3(%rip), %rax - HAS_CPU_FEATURE (SSSE3) - jnz 3f - leaq STRCMP_SSE2(%rip), %rax -3: ret -END(STRCMP) - -# ifdef USE_AS_STRCASECMP_L -ENTRY(__strcasecmp) - .type __strcasecmp, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq __strcasecmp_avx(%rip), %rax - HAS_ARCH_FEATURE (AVX_Usable) - jnz 3f - HAS_ARCH_FEATURE (Slow_SSE4_2) - jnz 2f - leaq __strcasecmp_sse42(%rip), %rax - HAS_CPU_FEATURE (SSE4_2) - jnz 3f -2: leaq __strcasecmp_ssse3(%rip), %rax - HAS_CPU_FEATURE (SSSE3) - jnz 3f - leaq __strcasecmp_sse2(%rip), %rax -3: ret -END(__strcasecmp) -weak_alias (__strcasecmp, strcasecmp) -# endif -# ifdef USE_AS_STRNCASECMP_L -ENTRY(__strncasecmp) - .type __strncasecmp, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq __strncasecmp_avx(%rip), %rax - HAS_ARCH_FEATURE (AVX_Usable) - jnz 3f - HAS_ARCH_FEATURE (Slow_SSE4_2) - jnz 2f - leaq __strncasecmp_sse42(%rip), %rax - HAS_CPU_FEATURE (SSE4_2) - jnz 3f -2: leaq __strncasecmp_ssse3(%rip), %rax - HAS_CPU_FEATURE (SSSE3) - jnz 3f - leaq __strncasecmp_sse2(%rip), %rax -3: ret -END(__strncasecmp) -weak_alias (__strncasecmp, strncasecmp) -# endif - -# undef LABEL -# define LABEL(l) .L##l##_sse42 -# define GLABEL(l) l##_sse42 -# define SECTION sse4.2 -# include "strcmp-sse42.S" - - -# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L -# define LABEL(l) .L##l##_avx -# define GLABEL(l) l##_avx -# define USE_AVX 1 -# undef STRCMP_SSE42 -# define STRCMP_SSE42 STRCMP_AVX -# define SECTION avx -# include "strcmp-sse42.S" -# endif - - -# undef ENTRY -# define ENTRY(name) \ - .type STRCMP_SSE2, @function; \ - .align 16; \ - .globl STRCMP_SSE2; \ - .hidden STRCMP_SSE2; \ - STRCMP_SSE2: cfi_startproc; \ - CALL_MCOUNT -# undef END -# define END(name) \ - cfi_endproc; .size STRCMP_SSE2, .-STRCMP_SSE2 - -# ifdef USE_AS_STRCASECMP_L -# define ENTRY2(name) \ - .type __strcasecmp_sse2, @function; \ - .align 16; \ - .globl __strcasecmp_sse2; \ - .hidden __strcasecmp_sse2; \ - __strcasecmp_sse2: cfi_startproc; \ - CALL_MCOUNT -# define END2(name) \ - cfi_endproc; .size __strcasecmp_sse2, .-__strcasecmp_sse2 -# endif - -# ifdef USE_AS_STRNCASECMP_L -# define ENTRY2(name) \ - .type __strncasecmp_sse2, @function; \ - .align 16; \ - .globl __strncasecmp_sse2; \ - .hidden __strncasecmp_sse2; \ - __strncasecmp_sse2: cfi_startproc; \ - CALL_MCOUNT -# define END2(name) \ - cfi_endproc; .size __strncasecmp_sse2, .-__strncasecmp_sse2 -# endif - -# undef libc_hidden_builtin_def -/* It doesn't make sense to send libc-internal strcmp calls through a PLT. - The speedup we get from using SSE4.2 instruction is likely eaten away - by the indirect call in the PLT. */ -# define libc_hidden_builtin_def(name) \ - .globl __GI_STRCMP; __GI_STRCMP = STRCMP_SSE2 -#endif - -#include "../strcmp.S" diff --git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c new file mode 100644 index 0000000000..b903e418df --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcmp.c @@ -0,0 +1,59 @@ +/* Multiple versions of strcmp. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) +# define strcmp __redirect_strcmp +# include <string.h> +# undef strcmp + +# define SYMBOL_NAME strcmp +# include <init-arch.h> + +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; + +static inline void * +IFUNC_SELECTOR (void) +{ + const struct cpu_features* cpu_features = __get_cpu_features (); + + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) + && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) + return OPTIMIZE (avx2); + + if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load)) + return OPTIMIZE (sse2_unaligned); + + if (CPU_FEATURES_CPU_P (cpu_features, SSSE3)) + return OPTIMIZE (ssse3); + + return OPTIMIZE (sse2); +} + +libc_ifunc_redirected (__redirect_strcmp, strcmp, IFUNC_SELECTOR ()); + +# ifdef SHARED +__hidden_ver1 (strcmp, __GI_strcmp, __redirect_strcmp) + __attribute__ ((visibility ("hidden"))); +# endif +#endif diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S index caa74be2c2..72bf7e8586 100644 --- a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S +++ b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S @@ -1,5 +1,5 @@ /* strcpy with SSE2 and unaligned load - Copyright (C) 2011-2016 Free Software Foundation, Inc. + Copyright (C) 2011-2018 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. @@ -33,7 +33,7 @@ lea TABLE(%rip), %r11; \ movslq (%r11, INDEX, SCALE), %rcx; \ lea (%r11, %rcx), %rcx; \ - jmp *%rcx + _CET_NOTRACK jmp *%rcx # ifndef USE_AS_STRCAT @@ -99,6 +99,8 @@ L(Unalign16Both): sub %rcx, %rdi # ifdef USE_AS_STRNCPY add %rcx, %r8 + sbb %rcx, %rcx + or %rcx, %r8 # endif mov $16, %rcx movdqa (%rsi, %rcx), %xmm1 diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2.S b/sysdeps/x86_64/multiarch/strcpy-sse2.S new file mode 100644 index 0000000000..70136017fa --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcpy-sse2.S @@ -0,0 +1,28 @@ +/* strcpy optimized with SSE2. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> +# define strcpy __strcpy_sse2 + +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(strcpy) +#endif + +#include <sysdeps/x86_64/strcpy.S> diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S index 5bdb7671cf..9858d0c4d5 100644 --- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S +++ b/sysdeps/x86_64/multiarch/strcpy-ssse3.S @@ -1,5 +1,5 @@ /* strcpy with SSSE3 - Copyright (C) 2011-2016 Free Software Foundation, Inc. + Copyright (C) 2011-2018 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. diff --git a/sysdeps/x86_64/multiarch/strcpy.S b/sysdeps/x86_64/multiarch/strcpy.S deleted file mode 100644 index 024f6ef899..0000000000 --- a/sysdeps/x86_64/multiarch/strcpy.S +++ /dev/null @@ -1,99 +0,0 @@ -/* Multiple versions of strcpy - All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2009-2016 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - -#if !defined (USE_AS_STPCPY) && !defined (USE_AS_STRNCPY) -# ifndef STRCPY -# define STRCPY strcpy -# endif -#endif - -#ifdef USE_AS_STPCPY -# ifdef USE_AS_STRNCPY -# define STRCPY_SSSE3 __stpncpy_ssse3 -# define STRCPY_SSE2 __stpncpy_sse2 -# define STRCPY_SSE2_UNALIGNED __stpncpy_sse2_unaligned -# define __GI_STRCPY __GI_stpncpy -# define __GI___STRCPY __GI___stpncpy -# else -# define STRCPY_SSSE3 __stpcpy_ssse3 -# define STRCPY_SSE2 __stpcpy_sse2 -# define STRCPY_SSE2_UNALIGNED __stpcpy_sse2_unaligned -# define __GI_STRCPY __GI_stpcpy -# define __GI___STRCPY __GI___stpcpy -# endif -#else -# ifdef USE_AS_STRNCPY -# define STRCPY_SSSE3 __strncpy_ssse3 -# define STRCPY_SSE2 __strncpy_sse2 -# define STRCPY_SSE2_UNALIGNED __strncpy_sse2_unaligned -# define __GI_STRCPY __GI_strncpy -# else -# define STRCPY_SSSE3 __strcpy_ssse3 -# define STRCPY_SSE2 __strcpy_sse2 -# define STRCPY_SSE2_UNALIGNED __strcpy_sse2_unaligned -# define __GI_STRCPY __GI_strcpy -# endif -#endif - - -/* Define multiple versions only for the definition in libc. */ -#if IS_IN (libc) - .text -ENTRY(STRCPY) - .type STRCPY, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq STRCPY_SSE2_UNALIGNED(%rip), %rax - HAS_ARCH_FEATURE (Fast_Unaligned_Load) - jnz 2f - leaq STRCPY_SSE2(%rip), %rax - HAS_CPU_FEATURE (SSSE3) - jz 2f - leaq STRCPY_SSSE3(%rip), %rax -2: ret -END(STRCPY) - -# undef ENTRY -# define ENTRY(name) \ - .type STRCPY_SSE2, @function; \ - .align 16; \ - .globl STRCPY_SSE2; \ - .hidden STRCPY_SSE2; \ - STRCPY_SSE2: cfi_startproc; \ - CALL_MCOUNT -# undef END -# define END(name) \ - cfi_endproc; .size STRCPY_SSE2, .-STRCPY_SSE2 -# undef libc_hidden_builtin_def -/* It doesn't make sense to send libc-internal strcpy calls through a PLT. - The speedup we get from using SSSE3 instruction is likely eaten away - by the indirect call in the PLT. */ -# define libc_hidden_builtin_def(name) \ - .globl __GI_STRCPY; __GI_STRCPY = STRCPY_SSE2 -# undef libc_hidden_def -# define libc_hidden_def(name) \ - .globl __GI___STRCPY; __GI___STRCPY = STRCPY_SSE2 -#endif - -#ifndef USE_AS_STRNCPY -#include "../strcpy.S" -#endif diff --git a/sysdeps/x86_64/multiarch/strcpy.c b/sysdeps/x86_64/multiarch/strcpy.c new file mode 100644 index 0000000000..12e0e3ffe2 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcpy.c @@ -0,0 +1,35 @@ +/* Multiple versions of strcpy. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) +# define strcpy __redirect_strcpy +# include <string.h> +# undef strcpy + +# define SYMBOL_NAME strcpy +# include "ifunc-unaligned-ssse3.h" + +libc_ifunc_redirected (__redirect_strcpy, strcpy, IFUNC_SELECTOR ()); + +# ifdef SHARED +__hidden_ver1 (strcpy, __GI_strcpy, __redirect_strcpy) + __attribute__ ((visibility ("hidden"))); +# endif +#endif diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c index 91b804ddd6..857af10486 100644 --- a/sysdeps/x86_64/multiarch/strcspn-c.c +++ b/sysdeps/x86_64/multiarch/strcspn-c.c @@ -1,5 +1,5 @@ /* strcspn with SSE4.2 intrinsics - Copyright (C) 2009-2016 Free Software Foundation, Inc. + Copyright (C) 2009-2018 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. @@ -70,7 +70,7 @@ char * #else size_t #endif -STRCSPN_SSE2 (const char *, const char *); +STRCSPN_SSE2 (const char *, const char *) attribute_hidden; #ifdef USE_AS_STRPBRK diff --git a/sysdeps/x86_64/multiarch/strcspn-sse2.S b/sysdeps/x86_64/multiarch/strcspn-sse2.S new file mode 100644 index 0000000000..8a0c69d7f5 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcspn-sse2.S @@ -0,0 +1,28 @@ +/* strcspn optimized with SSE2. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> +# define strcspn __strcspn_sse2 + +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(strcspn) +#endif + +#include <sysdeps/x86_64/strcspn.S> diff --git a/sysdeps/x86_64/multiarch/strcspn.S b/sysdeps/x86_64/multiarch/strcspn.S deleted file mode 100644 index 8e7ff1c663..0000000000 --- a/sysdeps/x86_64/multiarch/strcspn.S +++ /dev/null @@ -1,69 +0,0 @@ -/* Multiple versions of strcspn - All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2009-2016 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <config.h> -#include <sysdep.h> -#include <init-arch.h> - -#ifdef USE_AS_STRPBRK -#define STRCSPN_SSE42 __strpbrk_sse42 -#define STRCSPN_SSE2 __strpbrk_sse2 -#define __GI_STRCSPN __GI_strpbrk -#else -#ifndef STRCSPN -#define STRCSPN strcspn -#define STRCSPN_SSE42 __strcspn_sse42 -#define STRCSPN_SSE2 __strcspn_sse2 -#define __GI_STRCSPN __GI_strcspn -#endif -#endif - -/* Define multiple versions only for the definition in libc. Don't - define multiple versions for strpbrk in static library since we - need strpbrk before the initialization happened. */ -#if (defined SHARED || !defined USE_AS_STRPBRK) && IS_IN (libc) - .text -ENTRY(STRCSPN) - .type STRCSPN, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq STRCSPN_SSE2(%rip), %rax - HAS_CPU_FEATURE (SSE4_2) - jz 2f - leaq STRCSPN_SSE42(%rip), %rax -2: ret -END(STRCSPN) - -# undef ENTRY -# define ENTRY(name) \ - .type STRCSPN_SSE2, @function; \ - .globl STRCSPN_SSE2; \ - .align 16; \ - STRCSPN_SSE2: cfi_startproc; \ - CALL_MCOUNT -# undef END -# define END(name) \ - cfi_endproc; .size STRCSPN_SSE2, .-STRCSPN_SSE2 -#endif - -#ifdef USE_AS_STRPBRK -#include "../strpbrk.S" -#else -#include "../strcspn.S" -#endif diff --git a/sysdeps/x86_64/multiarch/strcspn.c b/sysdeps/x86_64/multiarch/strcspn.c new file mode 100644 index 0000000000..9712e8410c --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcspn.c @@ -0,0 +1,35 @@ +/* Multiple versions of strcspn. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) +# define strcspn __redirect_strcspn +# include <string.h> +# undef strcspn + +# define SYMBOL_NAME strcspn +# include "ifunc-sse4_2.h" + +libc_ifunc_redirected (__redirect_strcspn, strcspn, IFUNC_SELECTOR ()); + +# ifdef SHARED +__hidden_ver1 (strcspn, __GI_strcspn, __redirect_strcspn) + __attribute__ ((visibility ("hidden"))); +# endif +#endif diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S new file mode 100644 index 0000000000..fb2418cddc --- /dev/null +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S @@ -0,0 +1,393 @@ +/* strlen/strnlen/wcslen/wcsnlen optimized with AVX2. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# ifndef STRLEN +# define STRLEN __strlen_avx2 +# endif + +# ifdef USE_AS_WCSLEN +# define VPCMPEQ vpcmpeqd +# define VPMINU vpminud +# else +# define VPCMPEQ vpcmpeqb +# define VPMINU vpminub +# endif + +# ifndef VZEROUPPER +# define VZEROUPPER vzeroupper +# endif + +# define VEC_SIZE 32 + + .section .text.avx,"ax",@progbits +ENTRY (STRLEN) +# ifdef USE_AS_STRNLEN + /* Check for zero length. */ + testq %rsi, %rsi + jz L(zero) +# ifdef USE_AS_WCSLEN + shl $2, %rsi +# endif + movq %rsi, %r8 +# endif + movl %edi, %ecx + movq %rdi, %rdx + vpxor %xmm0, %xmm0, %xmm0 + + /* Check if we may cross page boundary with one vector load. */ + andl $(2 * VEC_SIZE - 1), %ecx + cmpl $VEC_SIZE, %ecx + ja L(cros_page_boundary) + + /* Check the first VEC_SIZE bytes. */ + VPCMPEQ (%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + +# ifdef USE_AS_STRNLEN + jnz L(first_vec_x0_check) + /* Adjust length and check the end of data. */ + subq $VEC_SIZE, %rsi + jbe L(max) +# else + jnz L(first_vec_x0) +# endif + + /* Align data for aligned loads in the loop. */ + addq $VEC_SIZE, %rdi + andl $(VEC_SIZE - 1), %ecx + andq $-VEC_SIZE, %rdi + +# ifdef USE_AS_STRNLEN + /* Adjust length. */ + addq %rcx, %rsi + + subq $(VEC_SIZE * 4), %rsi + jbe L(last_4x_vec_or_less) +# endif + jmp L(more_4x_vec) + + .p2align 4 +L(cros_page_boundary): + andl $(VEC_SIZE - 1), %ecx + andq $-VEC_SIZE, %rdi + VPCMPEQ (%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + /* Remove the leading bytes. */ + sarl %cl, %eax + testl %eax, %eax + jz L(aligned_more) + tzcntl %eax, %eax +# ifdef USE_AS_STRNLEN + /* Check the end of data. */ + cmpq %rax, %rsi + jbe L(max) +# endif + addq %rdi, %rax + addq %rcx, %rax + subq %rdx, %rax +# ifdef USE_AS_WCSLEN + shrq $2, %rax +# endif + VZEROUPPER + ret + + .p2align 4 +L(aligned_more): +# ifdef USE_AS_STRNLEN + /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE" + with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE" + to void possible addition overflow. */ + negq %rcx + addq $VEC_SIZE, %rcx + + /* Check the end of data. */ + subq %rcx, %rsi + jbe L(max) +# endif + + addq $VEC_SIZE, %rdi + +# ifdef USE_AS_STRNLEN + subq $(VEC_SIZE * 4), %rsi + jbe L(last_4x_vec_or_less) +# endif + +L(more_4x_vec): + /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time + since data is only aligned to VEC_SIZE. */ + VPCMPEQ (%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x0) + + VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x1) + + VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x2) + + VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x3) + + addq $(VEC_SIZE * 4), %rdi + +# ifdef USE_AS_STRNLEN + subq $(VEC_SIZE * 4), %rsi + jbe L(last_4x_vec_or_less) +# endif + + /* Align data to 4 * VEC_SIZE. */ + movq %rdi, %rcx + andl $(4 * VEC_SIZE - 1), %ecx + andq $-(4 * VEC_SIZE), %rdi + +# ifdef USE_AS_STRNLEN + /* Adjust length. */ + addq %rcx, %rsi +# endif + + .p2align 4 +L(loop_4x_vec): + /* Compare 4 * VEC at a time forward. */ + vmovdqa (%rdi), %ymm1 + vmovdqa VEC_SIZE(%rdi), %ymm2 + vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3 + vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4 + VPMINU %ymm1, %ymm2, %ymm5 + VPMINU %ymm3, %ymm4, %ymm6 + VPMINU %ymm5, %ymm6, %ymm5 + + VPCMPEQ %ymm5, %ymm0, %ymm5 + vpmovmskb %ymm5, %eax + testl %eax, %eax + jnz L(4x_vec_end) + + addq $(VEC_SIZE * 4), %rdi + +# ifndef USE_AS_STRNLEN + jmp L(loop_4x_vec) +# else + subq $(VEC_SIZE * 4), %rsi + ja L(loop_4x_vec) + +L(last_4x_vec_or_less): + /* Less than 4 * VEC and aligned to VEC_SIZE. */ + addl $(VEC_SIZE * 2), %esi + jle L(last_2x_vec) + + VPCMPEQ (%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x0) + + VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x1) + + VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + + jnz L(first_vec_x2_check) + subl $VEC_SIZE, %esi + jle L(max) + + VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + + jnz L(first_vec_x3_check) + movq %r8, %rax +# ifdef USE_AS_WCSLEN + shrq $2, %rax +# endif + VZEROUPPER + ret + + .p2align 4 +L(last_2x_vec): + addl $(VEC_SIZE * 2), %esi + VPCMPEQ (%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + + jnz L(first_vec_x0_check) + subl $VEC_SIZE, %esi + jle L(max) + + VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x1_check) + movq %r8, %rax +# ifdef USE_AS_WCSLEN + shrq $2, %rax +# endif + VZEROUPPER + ret + + .p2align 4 +L(first_vec_x0_check): + tzcntl %eax, %eax + /* Check the end of data. */ + cmpq %rax, %rsi + jbe L(max) + addq %rdi, %rax + subq %rdx, %rax +# ifdef USE_AS_WCSLEN + shrq $2, %rax +# endif + VZEROUPPER + ret + + .p2align 4 +L(first_vec_x1_check): + tzcntl %eax, %eax + /* Check the end of data. */ + cmpq %rax, %rsi + jbe L(max) + addq $VEC_SIZE, %rax + addq %rdi, %rax + subq %rdx, %rax +# ifdef USE_AS_WCSLEN + shrq $2, %rax +# endif + VZEROUPPER + ret + + .p2align 4 +L(first_vec_x2_check): + tzcntl %eax, %eax + /* Check the end of data. */ + cmpq %rax, %rsi + jbe L(max) + addq $(VEC_SIZE * 2), %rax + addq %rdi, %rax + subq %rdx, %rax +# ifdef USE_AS_WCSLEN + shrq $2, %rax +# endif + VZEROUPPER + ret + + .p2align 4 +L(first_vec_x3_check): + tzcntl %eax, %eax + /* Check the end of data. */ + cmpq %rax, %rsi + jbe L(max) + addq $(VEC_SIZE * 3), %rax + addq %rdi, %rax + subq %rdx, %rax +# ifdef USE_AS_WCSLEN + shrq $2, %rax +# endif + VZEROUPPER + ret + + .p2align 4 +L(max): + movq %r8, %rax +# ifdef USE_AS_WCSLEN + shrq $2, %rax +# endif + VZEROUPPER + ret + + .p2align 4 +L(zero): + xorl %eax, %eax + ret +# endif + + .p2align 4 +L(first_vec_x0): + tzcntl %eax, %eax + addq %rdi, %rax + subq %rdx, %rax +# ifdef USE_AS_WCSLEN + shrq $2, %rax +# endif + VZEROUPPER + ret + + .p2align 4 +L(first_vec_x1): + tzcntl %eax, %eax + addq $VEC_SIZE, %rax + addq %rdi, %rax + subq %rdx, %rax +# ifdef USE_AS_WCSLEN + shrq $2, %rax +# endif + VZEROUPPER + ret + + .p2align 4 +L(first_vec_x2): + tzcntl %eax, %eax + addq $(VEC_SIZE * 2), %rax + addq %rdi, %rax + subq %rdx, %rax +# ifdef USE_AS_WCSLEN + shrq $2, %rax +# endif + VZEROUPPER + ret + + .p2align 4 +L(4x_vec_end): + VPCMPEQ %ymm1, %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x0) + VPCMPEQ %ymm2, %ymm0, %ymm2 + vpmovmskb %ymm2, %eax + testl %eax, %eax + jnz L(first_vec_x1) + VPCMPEQ %ymm3, %ymm0, %ymm3 + vpmovmskb %ymm3, %eax + testl %eax, %eax + jnz L(first_vec_x2) + VPCMPEQ %ymm4, %ymm0, %ymm4 + vpmovmskb %ymm4, %eax +L(first_vec_x3): + tzcntl %eax, %eax + addq $(VEC_SIZE * 3), %rax + addq %rdi, %rax + subq %rdx, %rax +# ifdef USE_AS_WCSLEN + shrq $2, %rax +# endif + VZEROUPPER + ret + +END (STRLEN) +#endif diff --git a/sysdeps/x86_64/multiarch/strlen-sse2.S b/sysdeps/x86_64/multiarch/strlen-sse2.S new file mode 100644 index 0000000000..7bc57b8d0f --- /dev/null +++ b/sysdeps/x86_64/multiarch/strlen-sse2.S @@ -0,0 +1,23 @@ +/* strlen optimized with SSE2. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) +# define strlen __strlen_sse2 +#endif + +#include "../strlen.S" diff --git a/sysdeps/x86_64/multiarch/strlen.c b/sysdeps/x86_64/multiarch/strlen.c new file mode 100644 index 0000000000..1758d22b8f --- /dev/null +++ b/sysdeps/x86_64/multiarch/strlen.c @@ -0,0 +1,34 @@ +/* Multiple versions of strlen. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) +# define strlen __redirect_strlen +# include <string.h> +# undef strlen + +# define SYMBOL_NAME strlen +# include "ifunc-avx2.h" + +libc_ifunc_redirected (__redirect_strlen, strlen, IFUNC_SELECTOR ()); +# ifdef SHARED +__hidden_ver1 (strlen, __GI_strlen, __redirect_strlen) + __attribute__((visibility ("hidden"))); +# endif +#endif diff --git a/sysdeps/x86_64/multiarch/strncase.c b/sysdeps/x86_64/multiarch/strncase.c new file mode 100644 index 0000000000..798966cf3e --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncase.c @@ -0,0 +1,35 @@ +/* Multiple versions of strncasecmp. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) +# define strncasecmp __redirect_strncasecmp +# define __strncasecmp __redirect___strncasecmp +# include <string.h> +# undef strncasecmp +# undef __strncasecmp + +# define SYMBOL_NAME strncasecmp +# include "ifunc-strcasecmp.h" + +libc_ifunc_redirected (__redirect_strncasecmp, __strncasecmp, + IFUNC_SELECTOR ()); + +weak_alias (__strncasecmp, strncasecmp) +#endif diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx.S b/sysdeps/x86_64/multiarch/strncase_l-avx.S new file mode 100644 index 0000000000..0c4e525bd4 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncase_l-avx.S @@ -0,0 +1,22 @@ +/* strncasecmp_l optimized with AVX. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define STRCMP_SSE42 __strncasecmp_l_avx +#define USE_AVX 1 +#define USE_AS_STRNCASECMP_L +#include "strcmp-sse42.S" diff --git a/sysdeps/x86_64/multiarch/strncase_l-sse2.S b/sysdeps/x86_64/multiarch/strncase_l-sse2.S new file mode 100644 index 0000000000..e7841334b7 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncase_l-sse2.S @@ -0,0 +1,23 @@ +/* strncasecmp_l optimized with SSE2. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define STRCMP __strncasecmp_l_sse2 +#define NO_NOLOCALE_ALIAS +#define USE_AS_STRNCASECMP_L +#define __strncasecmp __strncasecmp_sse2 +#include <sysdeps/x86_64/strcmp.S> diff --git a/sysdeps/x86_64/multiarch/strncase_l-sse4_2.S b/sysdeps/x86_64/multiarch/strncase_l-sse4_2.S new file mode 100644 index 0000000000..d2ea88c4ce --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncase_l-sse4_2.S @@ -0,0 +1,21 @@ +/* strncasecmp_l optimized with SSE4.2. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define STRCMP_SSE42 __strncasecmp_l_sse42 +#define USE_AS_STRNCASECMP_L +#include "strcmp-sse42.S" diff --git a/sysdeps/x86_64/multiarch/strncase_l.S b/sysdeps/x86_64/multiarch/strncase_l.S deleted file mode 100644 index 9c0149788e..0000000000 --- a/sysdeps/x86_64/multiarch/strncase_l.S +++ /dev/null @@ -1,8 +0,0 @@ -/* Multiple versions of strncasecmp and strncasecmp_l - All versions must be listed in ifunc-impl-list.c. */ -#define STRCMP __strncasecmp_l -#define USE_AS_STRNCASECMP_L -#include "strcmp.S" - -weak_alias (__strncasecmp_l, strncasecmp_l) -libc_hidden_def (strncasecmp_l) diff --git a/sysdeps/x86_64/multiarch/strncase_l.c b/sysdeps/x86_64/multiarch/strncase_l.c new file mode 100644 index 0000000000..97631cf401 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncase_l.c @@ -0,0 +1,40 @@ +/* Multiple versions of strncasecmp_l. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) +# define strncasecmp_l __redirect_strncasecmp_l +# define __strncasecmp_l __redirect___strncasecmp_l +# include <string.h> +# undef strncasecmp_l +# undef __strncasecmp_l + +# define SYMBOL_NAME strncasecmp_l +# include "ifunc-strcasecmp.h" + +libc_ifunc_redirected (__redirect_strncasecmp_l, __strncasecmp_l, + IFUNC_SELECTOR ()); + +weak_alias (__strncasecmp_l, strncasecmp_l) +# ifdef SHARED +__hidden_ver1 (__strncasecmp_l, __GI___strncasecmp_l, + __redirect___strncasecmp_l) + __attribute__ ((visibility ("hidden"))); +# endif +#endif diff --git a/sysdeps/x86_64/multiarch/strncat-c.c b/sysdeps/x86_64/multiarch/strncat-c.c index a3cdbff689..93a7fab7ea 100644 --- a/sysdeps/x86_64/multiarch/strncat-c.c +++ b/sysdeps/x86_64/multiarch/strncat-c.c @@ -1,8 +1,2 @@ #define STRNCAT __strncat_sse2 -#ifdef SHARED -#undef libc_hidden_def -#define libc_hidden_def(name) \ - __hidden_ver1 (__strncat_sse2, __GI___strncat, __strncat_sse2); -#endif - -#include "string/strncat.c" +#include <string/strncat.c> diff --git a/sysdeps/x86_64/multiarch/strncat.S b/sysdeps/x86_64/multiarch/strncat.S deleted file mode 100644 index 5c1bf41453..0000000000 --- a/sysdeps/x86_64/multiarch/strncat.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Multiple versions of strncat - All versions must be listed in ifunc-impl-list.c. */ -#define STRCAT strncat -#define USE_AS_STRNCAT -#include "strcat.S" diff --git a/sysdeps/x86_64/multiarch/strncat.c b/sysdeps/x86_64/multiarch/strncat.c new file mode 100644 index 0000000000..841c165565 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncat.c @@ -0,0 +1,35 @@ +/* Multiple versions of strncat. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) +# define strncat __redirect_strncat +# include <string.h> +# undef strncat + +# define SYMBOL_NAME strncat +# include "ifunc-unaligned-ssse3.h" + +libc_ifunc_redirected (__redirect_strncat, strncat, IFUNC_SELECTOR ()); +strong_alias (strncat, __strncat); +# ifdef SHARED +__hidden_ver1 (strncat, __GI___strncat, __redirect_strncat) + __attribute__((visibility ("hidden"))); +# endif +#endif diff --git a/sysdeps/x86_64/multiarch/strncmp-avx2.S b/sysdeps/x86_64/multiarch/strncmp-avx2.S new file mode 100644 index 0000000000..1678bcc235 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncmp-avx2.S @@ -0,0 +1,3 @@ +#define STRCMP __strncmp_avx2 +#define USE_AS_STRNCMP 1 +#include "strcmp-avx2.S" diff --git a/sysdeps/x86_64/fpu/test-double-vlen2.c b/sysdeps/x86_64/multiarch/strncmp-sse2.S index c7a3dff747..a5ecb82b13 100644 --- a/sysdeps/x86_64/fpu/test-double-vlen2.c +++ b/sysdeps/x86_64/multiarch/strncmp-sse2.S @@ -1,5 +1,5 @@ -/* Tests for SSE ISA versions of vector math functions. - Copyright (C) 2014-2016 Free Software Foundation, Inc. +/* strcmp optimized with SSE2. + Copyright (C) 2017-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -16,13 +16,15 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#include "test-double-vlen2.h" +#include <sysdep.h> -#define TEST_VECTOR_cos 1 -#define TEST_VECTOR_sin 1 -#define TEST_VECTOR_sincos 1 -#define TEST_VECTOR_log 1 -#define TEST_VECTOR_exp 1 -#define TEST_VECTOR_pow 1 +#if IS_IN (libc) +# define STRCMP __strncmp_sse2 +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(strcmp) +#else +# define STRCMP strncmp +#endif -#include "libm-test.c" +#define USE_AS_STRNCMP +#include <sysdeps/x86_64/strcmp.S> diff --git a/sysdeps/x86_64/multiarch/strncmp-sse4_2.S b/sysdeps/x86_64/multiarch/strncmp-sse4_2.S new file mode 100644 index 0000000000..b859c1eb74 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncmp-sse4_2.S @@ -0,0 +1,21 @@ +/* strncmp optimized with SSE4.2. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define STRCMP_SSE42 __strncmp_sse42 +#define USE_AS_STRNCMP +#include "strcmp-sse42.S" diff --git a/sysdeps/x86_64/multiarch/strncmp-ssse3.S b/sysdeps/x86_64/multiarch/strncmp-ssse3.S index 96380a46be..fa43484b54 100644 --- a/sysdeps/x86_64/multiarch/strncmp-ssse3.S +++ b/sysdeps/x86_64/multiarch/strncmp-ssse3.S @@ -1,6 +1,28 @@ -#ifdef SHARED -# define USE_SSSE3 1 -# define STRCMP __strncmp_ssse3 -# define USE_AS_STRNCMP -# include "../strcmp.S" -#endif +/* strcmp optimized with SSSE3. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#define STRCMP __strncmp_ssse3 + +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(strcmp) + +#define USE_SSSE3 1 +#define USE_AS_STRNCMP +#include <sysdeps/x86_64/strcmp.S> diff --git a/sysdeps/x86_64/multiarch/strncmp.S b/sysdeps/x86_64/multiarch/strncmp.S deleted file mode 100644 index fd5eb1397c..0000000000 --- a/sysdeps/x86_64/multiarch/strncmp.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Multiple versions of strncmp - All versions must be listed in ifunc-impl-list.c. */ -#define STRCMP strncmp -#define USE_AS_STRNCMP -#include "strcmp.S" diff --git a/sysdeps/x86_64/multiarch/strncmp.c b/sysdeps/x86_64/multiarch/strncmp.c new file mode 100644 index 0000000000..02b6d0b6f5 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncmp.c @@ -0,0 +1,60 @@ +/* Multiple versions of strncmp. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) +# define strncmp __redirect_strncmp +# include <string.h> +# undef strncmp + +# define SYMBOL_NAME strncmp +# include <init-arch.h> + +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; + +static inline void * +IFUNC_SELECTOR (void) +{ + const struct cpu_features* cpu_features = __get_cpu_features (); + + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) + && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) + return OPTIMIZE (avx2); + + if (CPU_FEATURES_CPU_P (cpu_features, SSE4_2) + && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2)) + return OPTIMIZE (sse42); + + if (CPU_FEATURES_CPU_P (cpu_features, SSSE3)) + return OPTIMIZE (ssse3); + + return OPTIMIZE (sse2); +} + +libc_ifunc_redirected (__redirect_strncmp, strncmp, IFUNC_SELECTOR ()); + +# ifdef SHARED +__hidden_ver1 (strncmp, __GI_strncmp, __redirect_strncmp) + __attribute__ ((visibility ("hidden"))); +# endif +#endif diff --git a/sysdeps/x86_64/multiarch/strncpy-c.c b/sysdeps/x86_64/multiarch/strncpy-c.c index 296c32cb5d..57c45ac7ab 100644 --- a/sysdeps/x86_64/multiarch/strncpy-c.c +++ b/sysdeps/x86_64/multiarch/strncpy-c.c @@ -1,8 +1,5 @@ #define STRNCPY __strncpy_sse2 -#ifdef SHARED #undef libc_hidden_builtin_def -#define libc_hidden_builtin_def(name) \ - __hidden_ver1 (__strncpy_sse2, __GI_strncpy, __strncpy_sse2); -#endif +#define libc_hidden_builtin_def(strncpy) -#include "strncpy.c" +#include <string/strncpy.c> diff --git a/sysdeps/x86_64/multiarch/strncpy.S b/sysdeps/x86_64/multiarch/strncpy.S deleted file mode 100644 index 6d87a0ba35..0000000000 --- a/sysdeps/x86_64/multiarch/strncpy.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Multiple versions of strncpy - All versions must be listed in ifunc-impl-list.c. */ -#define STRCPY strncpy -#define USE_AS_STRNCPY -#include "strcpy.S" diff --git a/sysdeps/x86_64/multiarch/strncpy.c b/sysdeps/x86_64/multiarch/strncpy.c new file mode 100644 index 0000000000..3c3de8b18e --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncpy.c @@ -0,0 +1,35 @@ +/* Multiple versions of strncpy. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) +# define strncpy __redirect_strncpy +# include <string.h> +# undef strncpy + +# define SYMBOL_NAME strncpy +# include "ifunc-unaligned-ssse3.h" + +libc_ifunc_redirected (__redirect_strncpy, strncpy, IFUNC_SELECTOR ()); + +# ifdef SHARED +__hidden_ver1 (strncpy, __GI_strncpy, __redirect_strncpy) + __attribute__ ((visibility ("hidden"))); +# endif +#endif diff --git a/sysdeps/x86_64/multiarch/strnlen-avx2.S b/sysdeps/x86_64/multiarch/strnlen-avx2.S new file mode 100644 index 0000000000..c4062b22f7 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strnlen-avx2.S @@ -0,0 +1,4 @@ +#define STRLEN __strnlen_avx2 +#define USE_AS_STRNLEN 1 + +#include "strlen-avx2.S" diff --git a/sysdeps/x86_64/multiarch/strnlen-sse2.S b/sysdeps/x86_64/multiarch/strnlen-sse2.S new file mode 100644 index 0000000000..41f33f6f6f --- /dev/null +++ b/sysdeps/x86_64/multiarch/strnlen-sse2.S @@ -0,0 +1,28 @@ +/* strnlen optimized with SSE2. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) +# define __strnlen __strnlen_sse2 + +# undef weak_alias +# define weak_alias(__strnlen, strnlen) +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(strnlen) +#endif + +#include "../strnlen.S" diff --git a/sysdeps/x86_64/multiarch/strnlen.c b/sysdeps/x86_64/multiarch/strnlen.c new file mode 100644 index 0000000000..3ab94ce230 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strnlen.c @@ -0,0 +1,39 @@ +/* Multiple versions of strnlen. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) +# define strnlen __redirect_strnlen +# define __strnlen __redirect___strnlen +# include <string.h> +# undef __strnlen +# undef strnlen + +# define SYMBOL_NAME strnlen +# include "ifunc-avx2.h" + +libc_ifunc_redirected (__redirect_strnlen, __strnlen, IFUNC_SELECTOR ()); +weak_alias (__strnlen, strnlen); +# ifdef SHARED +__hidden_ver1 (__strnlen, __GI___strnlen, __redirect___strnlen) + __attribute__((visibility ("hidden"))); +__hidden_ver1 (strnlen, __GI_strnlen, __redirect_strnlen) + __attribute__((weak, visibility ("hidden"))); +# endif +#endif diff --git a/sysdeps/x86_64/multiarch/strpbrk-c.c b/sysdeps/x86_64/multiarch/strpbrk-c.c index bbf5c49d89..c58dcb5605 100644 --- a/sysdeps/x86_64/multiarch/strpbrk-c.c +++ b/sysdeps/x86_64/multiarch/strpbrk-c.c @@ -1,8 +1,4 @@ -/* Don't define multiple versions for strpbrk in static library since we - need strpbrk before the initialization happened. */ -#ifdef SHARED -# define USE_AS_STRPBRK -# define STRCSPN_SSE2 __strpbrk_sse2 -# define STRCSPN_SSE42 __strpbrk_sse42 -# include "strcspn-c.c" -#endif +#define USE_AS_STRPBRK +#define STRCSPN_SSE2 __strpbrk_sse2 +#define STRCSPN_SSE42 __strpbrk_sse42 +#include "strcspn-c.c" diff --git a/sysdeps/x86_64/multiarch/strpbrk-sse2.S b/sysdeps/x86_64/multiarch/strpbrk-sse2.S new file mode 100644 index 0000000000..3c6a74db29 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strpbrk-sse2.S @@ -0,0 +1,29 @@ +/* strpbrk optimized with SSE2. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> +# define strcspn __strpbrk_sse2 + +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(strpbrk) +#endif + +#define USE_AS_STRPBRK +#include <sysdeps/x86_64/strcspn.S> diff --git a/sysdeps/x86_64/multiarch/strpbrk.S b/sysdeps/x86_64/multiarch/strpbrk.S deleted file mode 100644 index 7201d6376f..0000000000 --- a/sysdeps/x86_64/multiarch/strpbrk.S +++ /dev/null @@ -1,5 +0,0 @@ -/* Multiple versions of strpbrk - All versions must be listed in ifunc-impl-list.c. */ -#define STRCSPN strpbrk -#define USE_AS_STRPBRK -#include "strcspn.S" diff --git a/sysdeps/x86_64/multiarch/strpbrk.c b/sysdeps/x86_64/multiarch/strpbrk.c new file mode 100644 index 0000000000..a0d435a504 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strpbrk.c @@ -0,0 +1,35 @@ +/* Multiple versions of strpbrk. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) +# define strpbrk __redirect_strpbrk +# include <string.h> +# undef strpbrk + +# define SYMBOL_NAME strpbrk +# include "ifunc-sse4_2.h" + +libc_ifunc_redirected (__redirect_strpbrk, strpbrk, IFUNC_SELECTOR ()); + +# ifdef SHARED +__hidden_ver1 (strpbrk, __GI_strpbrk, __redirect_strpbrk) + __attribute__ ((visibility ("hidden"))); +# endif +#endif diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S new file mode 100644 index 0000000000..4381e6ab3e --- /dev/null +++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S @@ -0,0 +1,235 @@ +/* strrchr/wcsrchr optimized with AVX2. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# ifndef STRRCHR +# define STRRCHR __strrchr_avx2 +# endif + +# ifdef USE_AS_WCSRCHR +# define VPBROADCAST vpbroadcastd +# define VPCMPEQ vpcmpeqd +# else +# define VPBROADCAST vpbroadcastb +# define VPCMPEQ vpcmpeqb +# endif + +# ifndef VZEROUPPER +# define VZEROUPPER vzeroupper +# endif + +# define VEC_SIZE 32 + + .section .text.avx,"ax",@progbits +ENTRY (STRRCHR) + movd %esi, %xmm4 + movl %edi, %ecx + /* Broadcast CHAR to YMM4. */ + VPBROADCAST %xmm4, %ymm4 + vpxor %ymm0, %ymm0, %ymm0 + + /* Check if we may cross page boundary with one vector load. */ + andl $(2 * VEC_SIZE - 1), %ecx + cmpl $VEC_SIZE, %ecx + ja L(cros_page_boundary) + + vmovdqu (%rdi), %ymm1 + VPCMPEQ %ymm1, %ymm0, %ymm2 + VPCMPEQ %ymm1, %ymm4, %ymm3 + vpmovmskb %ymm2, %ecx + vpmovmskb %ymm3, %eax + addq $VEC_SIZE, %rdi + + testl %eax, %eax + jnz L(first_vec) + + testl %ecx, %ecx + jnz L(return_null) + + andq $-VEC_SIZE, %rdi + xorl %edx, %edx + jmp L(aligned_loop) + + .p2align 4 +L(first_vec): + /* Check if there is a nul CHAR. */ + testl %ecx, %ecx + jnz L(char_and_nul_in_first_vec) + + /* Remember the match and keep searching. */ + movl %eax, %edx + movq %rdi, %rsi + andq $-VEC_SIZE, %rdi + jmp L(aligned_loop) + + .p2align 4 +L(cros_page_boundary): + andl $(VEC_SIZE - 1), %ecx + andq $-VEC_SIZE, %rdi + vmovdqa (%rdi), %ymm1 + VPCMPEQ %ymm1, %ymm0, %ymm2 + VPCMPEQ %ymm1, %ymm4, %ymm3 + vpmovmskb %ymm2, %edx + vpmovmskb %ymm3, %eax + shrl %cl, %edx + shrl %cl, %eax + addq $VEC_SIZE, %rdi + + /* Check if there is a CHAR. */ + testl %eax, %eax + jnz L(found_char) + + testl %edx, %edx + jnz L(return_null) + + jmp L(aligned_loop) + + .p2align 4 +L(found_char): + testl %edx, %edx + jnz L(char_and_nul) + + /* Remember the match and keep searching. */ + movl %eax, %edx + leaq (%rdi, %rcx), %rsi + + .p2align 4 +L(aligned_loop): + vmovdqa (%rdi), %ymm1 + VPCMPEQ %ymm1, %ymm0, %ymm2 + addq $VEC_SIZE, %rdi + VPCMPEQ %ymm1, %ymm4, %ymm3 + vpmovmskb %ymm2, %ecx + vpmovmskb %ymm3, %eax + orl %eax, %ecx + jnz L(char_nor_null) + + vmovdqa (%rdi), %ymm1 + VPCMPEQ %ymm1, %ymm0, %ymm2 + add $VEC_SIZE, %rdi + VPCMPEQ %ymm1, %ymm4, %ymm3 + vpmovmskb %ymm2, %ecx + vpmovmskb %ymm3, %eax + orl %eax, %ecx + jnz L(char_nor_null) + + vmovdqa (%rdi), %ymm1 + VPCMPEQ %ymm1, %ymm0, %ymm2 + addq $VEC_SIZE, %rdi + VPCMPEQ %ymm1, %ymm4, %ymm3 + vpmovmskb %ymm2, %ecx + vpmovmskb %ymm3, %eax + orl %eax, %ecx + jnz L(char_nor_null) + + vmovdqa (%rdi), %ymm1 + VPCMPEQ %ymm1, %ymm0, %ymm2 + addq $VEC_SIZE, %rdi + VPCMPEQ %ymm1, %ymm4, %ymm3 + vpmovmskb %ymm2, %ecx + vpmovmskb %ymm3, %eax + orl %eax, %ecx + jz L(aligned_loop) + + .p2align 4 +L(char_nor_null): + /* Find a CHAR or a nul CHAR in a loop. */ + testl %eax, %eax + jnz L(match) +L(return_value): + testl %edx, %edx + jz L(return_null) + movl %edx, %eax + movq %rsi, %rdi + +# ifdef USE_AS_WCSRCHR + /* Keep the first bit for each matching CHAR for bsr. */ + andl $0x11111111, %eax +# endif + bsrl %eax, %eax + leaq -VEC_SIZE(%rdi, %rax), %rax + VZEROUPPER + ret + + .p2align 4 +L(match): + /* Find a CHAR. Check if there is a nul CHAR. */ + vpmovmskb %ymm2, %ecx + testl %ecx, %ecx + jnz L(find_nul) + + /* Remember the match and keep searching. */ + movl %eax, %edx + movq %rdi, %rsi + jmp L(aligned_loop) + + .p2align 4 +L(find_nul): +# ifdef USE_AS_WCSRCHR + /* Keep the first bit for each matching CHAR for bsr. */ + andl $0x11111111, %ecx + andl $0x11111111, %eax +# endif + /* Mask out any matching bits after the nul CHAR. */ + movl %ecx, %r8d + subl $1, %r8d + xorl %ecx, %r8d + andl %r8d, %eax + testl %eax, %eax + /* If there is no CHAR here, return the remembered one. */ + jz L(return_value) + bsrl %eax, %eax + leaq -VEC_SIZE(%rdi, %rax), %rax + VZEROUPPER + ret + + .p2align 4 +L(char_and_nul): + /* Find both a CHAR and a nul CHAR. */ + addq %rcx, %rdi + movl %edx, %ecx +L(char_and_nul_in_first_vec): +# ifdef USE_AS_WCSRCHR + /* Keep the first bit for each matching CHAR for bsr. */ + andl $0x11111111, %ecx + andl $0x11111111, %eax +# endif + /* Mask out any matching bits after the nul CHAR. */ + movl %ecx, %r8d + subl $1, %r8d + xorl %ecx, %r8d + andl %r8d, %eax + testl %eax, %eax + /* Return null pointer if the nul CHAR comes first. */ + jz L(return_null) + bsrl %eax, %eax + leaq -VEC_SIZE(%rdi, %rax), %rax + VZEROUPPER + ret + + .p2align 4 +L(return_null): + xorl %eax, %eax + VZEROUPPER + ret + +END (STRRCHR) +#endif diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S new file mode 100644 index 0000000000..0ec76fe9cc --- /dev/null +++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S @@ -0,0 +1,28 @@ +/* strrchr optimized with SSE2. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) +# define strrchr __strrchr_sse2 + +# undef weak_alias +# define weak_alias(strrchr, rindex) +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(strrchr) +#endif + +#include "../strrchr.S" diff --git a/sysdeps/x86_64/multiarch/strrchr.c b/sysdeps/x86_64/multiarch/strrchr.c new file mode 100644 index 0000000000..a719edde10 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strrchr.c @@ -0,0 +1,34 @@ +/* Multiple versions of strrchr. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) +# define strrchr __redirect_strrchr +# include <string.h> +# undef strrchr + +# define SYMBOL_NAME strrchr +# include "ifunc-avx2.h" + +libc_ifunc_redirected (__redirect_strrchr, strrchr, IFUNC_SELECTOR ()); +weak_alias (strrchr, rindex); +# ifdef SHARED +__hidden_ver1 (strrchr, __GI_strrchr, __redirect_strrchr) + __attribute__((visibility ("hidden"))); +# endif +#endif diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c index 9675f9360e..4554cff0c2 100644 --- a/sysdeps/x86_64/multiarch/strspn-c.c +++ b/sysdeps/x86_64/multiarch/strspn-c.c @@ -1,5 +1,5 @@ /* strspn with SSE4.2 intrinsics - Copyright (C) 2009-2016 Free Software Foundation, Inc. + Copyright (C) 2009-2018 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. @@ -52,7 +52,7 @@ We exit from the loop for case 1. */ -extern size_t __strspn_sse2 (const char *, const char *); +extern size_t __strspn_sse2 (const char *, const char *) attribute_hidden; size_t diff --git a/sysdeps/x86_64/multiarch/strspn-sse2.S b/sysdeps/x86_64/multiarch/strspn-sse2.S new file mode 100644 index 0000000000..4686cdd55d --- /dev/null +++ b/sysdeps/x86_64/multiarch/strspn-sse2.S @@ -0,0 +1,28 @@ +/* strspn optimized with SSE2. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> +# define strspn __strspn_sse2 + +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(strspn) +#endif + +#include <sysdeps/x86_64/strspn.S> diff --git a/sysdeps/x86_64/multiarch/strspn.c b/sysdeps/x86_64/multiarch/strspn.c new file mode 100644 index 0000000000..56ab4d9558 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strspn.c @@ -0,0 +1,35 @@ +/* Multiple versions of strspn. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) +# define strspn __redirect_strspn +# include <string.h> +# undef strspn + +# define SYMBOL_NAME strspn +# include "ifunc-sse4_2.h" + +libc_ifunc_redirected (__redirect_strspn, strspn, IFUNC_SELECTOR ()); + +# ifdef SHARED +__hidden_ver1 (strspn, __GI_strspn, __redirect_strspn) + __attribute__ ((visibility ("hidden"))); +# endif +#endif diff --git a/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S index 4ead1dfaf5..8188b8f643 100644 --- a/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S +++ b/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S @@ -1,5 +1,5 @@ /* strstr with unaligned loads - Copyright (C) 2009-2016 Free Software Foundation, Inc. + Copyright (C) 2009-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c index eecba2243e..30ce597a16 100644 --- a/sysdeps/x86_64/multiarch/strstr.c +++ b/sysdeps/x86_64/multiarch/strstr.c @@ -1,6 +1,6 @@ /* Multiple versions of strstr. All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2012-2016 Free Software Foundation, Inc. + Copyright (C) 2012-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/multiarch/test-multiarch.c b/sysdeps/x86_64/multiarch/test-multiarch.c index 4eb0c16cd8..aa872f27db 100644 --- a/sysdeps/x86_64/multiarch/test-multiarch.c +++ b/sysdeps/x86_64/multiarch/test-multiarch.c @@ -1,6 +1,6 @@ /* Test CPU feature data. This file is part of the GNU C Library. - Copyright (C) 2012-2016 Free Software Foundation, Inc. + Copyright (C) 2012-2018 Free Software Foundation, Inc. The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public @@ -16,7 +16,7 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#include <init-arch.h> +#include <cpu-features.h> #include <stdio.h> #include <stdlib.h> #include <string.h> diff --git a/sysdeps/x86_64/multiarch/varshift.c b/sysdeps/x86_64/multiarch/varshift.c index 7921be5b57..2838736544 100644 --- a/sysdeps/x86_64/multiarch/varshift.c +++ b/sysdeps/x86_64/multiarch/varshift.c @@ -1,5 +1,5 @@ /* Helper for variable shifts of SSE registers. - Copyright (C) 2010-2016 Free Software Foundation, Inc. + Copyright (C) 2010-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/multiarch/varshift.h b/sysdeps/x86_64/multiarch/varshift.h index 7b27d0e9dd..76f2759874 100644 --- a/sysdeps/x86_64/multiarch/varshift.h +++ b/sysdeps/x86_64/multiarch/varshift.h @@ -1,5 +1,5 @@ /* Helper for variable shifts of SSE registers. - Copyright (C) 2010-2016 Free Software Foundation, Inc. + Copyright (C) 2010-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/multiarch/wcschr-avx2.S b/sysdeps/x86_64/multiarch/wcschr-avx2.S new file mode 100644 index 0000000000..67726b6837 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcschr-avx2.S @@ -0,0 +1,3 @@ +#define STRCHR __wcschr_avx2 +#define USE_AS_WCSCHR 1 +#include "strchr-avx2.S" diff --git a/sysdeps/x86_64/multiarch/wcschr-sse2.S b/sysdeps/x86_64/multiarch/wcschr-sse2.S new file mode 100644 index 0000000000..67e4742ef1 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcschr-sse2.S @@ -0,0 +1,30 @@ +/* wcschr optimized with SSE2. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) +# define __wcschr __wcschr_sse2 + +# undef weak_alias +# define weak_alias(__wcschr, wcschr) +# undef libc_hidden_def +# define libc_hidden_def(__wcschr) +# undef libc_hidden_weak +# define libc_hidden_weak(wcschr) +#endif + +#include "../wcschr.S" diff --git a/sysdeps/x86_64/multiarch/wcschr.c b/sysdeps/x86_64/multiarch/wcschr.c new file mode 100644 index 0000000000..20a03833b9 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcschr.c @@ -0,0 +1,39 @@ +/* Multiple versions of wcschr. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) +# define wcschr __redirect_wcschr +# define __wcschr __redirect___wcschr +# include <wchar.h> +# undef wcschr +# undef __wcschr + +# define SYMBOL_NAME wcschr +# include "ifunc-avx2.h" + +libc_ifunc_redirected (__redirect_wcschr, __wcschr, IFUNC_SELECTOR ()); +weak_alias (__wcschr, wcschr); +# ifdef SHARED +__hidden_ver1 (__wcschr, __GI___wcschr, __redirect___wcschr) + __attribute__((visibility ("hidden"))); +__hidden_ver1 (wcschr, __GI_wcschr, __redirect_wcschr) + __attribute__((weak, visibility ("hidden"))); +# endif +#endif diff --git a/sysdeps/x86_64/multiarch/wcscmp-avx2.S b/sysdeps/x86_64/multiarch/wcscmp-avx2.S new file mode 100644 index 0000000000..e5da4da689 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcscmp-avx2.S @@ -0,0 +1,4 @@ +#define STRCMP __wcscmp_avx2 +#define USE_AS_WCSCMP 1 + +#include "strcmp-avx2.S" diff --git a/sysdeps/x86_64/multiarch/wcscmp-sse2.S b/sysdeps/x86_64/multiarch/wcscmp-sse2.S new file mode 100644 index 0000000000..b129d1c073 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcscmp-sse2.S @@ -0,0 +1,23 @@ +/* wcscmp optimized with SSE2. + Copyright (C) 2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) +# define __wcscmp __wcscmp_sse2 +#endif + +#include "../wcscmp.S" diff --git a/sysdeps/x86_64/multiarch/wcscmp.c b/sysdeps/x86_64/multiarch/wcscmp.c new file mode 100644 index 0000000000..74d92cf0f9 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcscmp.c @@ -0,0 +1,37 @@ +/* Multiple versions of wcscmp. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) +# define wcscmp __redirect_wcscmp +# define __wcscmp __redirect___wcscmp +# include <wchar.h> +# undef wcscmp +# undef __wcscmp + +# define SYMBOL_NAME wcscmp +# include "ifunc-avx2.h" + +libc_ifunc_redirected (__redirect_wcscmp, __wcscmp, IFUNC_SELECTOR ()); +weak_alias (__wcscmp, wcscmp) + +# ifdef SHARED +__hidden_ver1 (__wcscmp, __GI___wcscmp, __redirect_wcscmp) + __attribute__ ((visibility ("hidden"))); +# endif +#endif diff --git a/sysdeps/x86_64/multiarch/wcscpy-ssse3.S b/sysdeps/x86_64/multiarch/wcscpy-ssse3.S index 341e57a5ca..ea1589052b 100644 --- a/sysdeps/x86_64/multiarch/wcscpy-ssse3.S +++ b/sysdeps/x86_64/multiarch/wcscpy-ssse3.S @@ -1,5 +1,5 @@ /* wcscpy with SSSE3 - Copyright (C) 2011-2016 Free Software Foundation, Inc. + Copyright (C) 2011-2018 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. diff --git a/sysdeps/x86_64/multiarch/wcscpy.c b/sysdeps/x86_64/multiarch/wcscpy.c new file mode 100644 index 0000000000..f23b1fd853 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcscpy.c @@ -0,0 +1,44 @@ +/* Multiple versions of wcscpy. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) +# define wcscpy __redirect_wcscpy +# include <wchar.h> +# undef wcscpy + +# define SYMBOL_NAME wcscpy +# include <init-arch.h> + +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; + +static inline void * +IFUNC_SELECTOR (void) +{ + const struct cpu_features* cpu_features = __get_cpu_features (); + + if (CPU_FEATURES_CPU_P (cpu_features, SSSE3)) + return OPTIMIZE (ssse3); + + return OPTIMIZE (sse2); +} + +libc_ifunc_redirected (__redirect_wcscpy, wcscpy, IFUNC_SELECTOR ()); +#endif diff --git a/sysdeps/x86_64/multiarch/wcslen-avx2.S b/sysdeps/x86_64/multiarch/wcslen-avx2.S new file mode 100644 index 0000000000..c9224f1bc5 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcslen-avx2.S @@ -0,0 +1,4 @@ +#define STRLEN __wcslen_avx2 +#define USE_AS_WCSLEN 1 + +#include "strlen-avx2.S" diff --git a/sysdeps/x86_64/multiarch/wcslen-sse2.S b/sysdeps/x86_64/multiarch/wcslen-sse2.S new file mode 100644 index 0000000000..6031978363 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcslen-sse2.S @@ -0,0 +1,26 @@ +/* wcslen optimized with SSE2. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) +# define __wcslen __wcslen_sse2 + +# undef weak_alias +# define weak_alias(__wcslen, wcslen) +#endif + +#include "../wcslen.S" diff --git a/sysdeps/x86_64/fpu/multiarch/s_floor.S b/sysdeps/x86_64/multiarch/wcslen.c index 57a0eee5ba..6d06e47cbd 100644 --- a/sysdeps/x86_64/fpu/multiarch/s_floor.S +++ b/sysdeps/x86_64/multiarch/wcslen.c @@ -1,6 +1,7 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. +/* Multiple versions of wcslen. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. - Contributed by Ulrich Drepper <drepper@gmail.come>, 2011. The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public @@ -16,23 +17,15 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#include <machine/asm.h> -#include <init-arch.h> +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) +# define __wcslen __redirect_wcslen +# include <wchar.h> +# undef __wcslen +# define SYMBOL_NAME wcslen +# include "ifunc-avx2.h" -ENTRY(__floor) - .type __floor, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq __floor_sse41(%rip), %rax - HAS_CPU_FEATURE (SSE4_1) - jnz 2f - leaq __floor_c(%rip), %rax -2: ret -END(__floor) -weak_alias (__floor, floor) - - -ENTRY(__floor_sse41) - roundsd $1, %xmm0, %xmm0 - ret -END(__floor_sse41) +libc_ifunc_redirected (__redirect_wcslen, __wcslen, IFUNC_SELECTOR ()); +weak_alias (__wcslen, wcslen); +#endif diff --git a/sysdeps/x86_64/multiarch/wcsncmp-avx2.S b/sysdeps/x86_64/multiarch/wcsncmp-avx2.S new file mode 100644 index 0000000000..4fa1de4d3f --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcsncmp-avx2.S @@ -0,0 +1,5 @@ +#define STRCMP __wcsncmp_avx2 +#define USE_AS_STRNCMP 1 +#define USE_AS_WCSCMP 1 + +#include "strcmp-avx2.S" diff --git a/sysdeps/x86_64/multiarch/wcsncmp-sse2.c b/sysdeps/x86_64/multiarch/wcsncmp-sse2.c new file mode 100644 index 0000000000..2bc7b4f693 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcsncmp-sse2.c @@ -0,0 +1,20 @@ +/* wcsncmp optimized with SSE2. + Copyright (C) 2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define WCSNCMP __wcsncmp_sse2 +#include <wcsmbs/wcsncmp.c> diff --git a/sysdeps/x86_64/multiarch/wcsncmp.c b/sysdeps/x86_64/multiarch/wcsncmp.c new file mode 100644 index 0000000000..90e9a352d9 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcsncmp.c @@ -0,0 +1,31 @@ +/* Multiple versions of wcsncmp. + Copyright (C) 2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) +# define wcsncmp __redirect_wcsncmp +# define __wcsncmp __redirect___wcsncmp +# include <wchar.h> +# undef wcsncmp +# undef __wcsncmp + +# define SYMBOL_NAME wcsncmp +# include "ifunc-avx2.h" + +libc_ifunc_redirected (__redirect_wcsncmp, wcsncmp, IFUNC_SELECTOR ()); +#endif diff --git a/sysdeps/x86_64/multiarch/wcsnlen-avx2.S b/sysdeps/x86_64/multiarch/wcsnlen-avx2.S new file mode 100644 index 0000000000..fac83546b5 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcsnlen-avx2.S @@ -0,0 +1,5 @@ +#define STRLEN __wcsnlen_avx2 +#define USE_AS_WCSLEN 1 +#define USE_AS_STRNLEN 1 + +#include "strlen-avx2.S" diff --git a/sysdeps/x86_64/multiarch/wcsnlen-c.c b/sysdeps/x86_64/multiarch/wcsnlen-c.c new file mode 100644 index 0000000000..e1ec7cfbb5 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcsnlen-c.c @@ -0,0 +1,9 @@ +#if IS_IN (libc) +# include <wchar.h> + +# define WCSNLEN __wcsnlen_sse2 + +extern __typeof (wcsnlen) __wcsnlen_sse2; +#endif + +#include "wcsmbs/wcsnlen.c" diff --git a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S new file mode 100644 index 0000000000..a8cab0cb00 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S @@ -0,0 +1,5 @@ +#define AS_WCSLEN +#define AS_STRNLEN +#define strlen __wcsnlen_sse4_1 + +#include "../strlen.S" diff --git a/sysdeps/x86_64/multiarch/wcsnlen.c b/sysdeps/x86_64/multiarch/wcsnlen.c new file mode 100644 index 0000000000..bd376057e3 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcsnlen.c @@ -0,0 +1,51 @@ +/* Multiple versions of wcsnlen. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) +# define __wcsnlen __redirect_wcsnlen +# include <wchar.h> +# undef __wcsnlen + +# define SYMBOL_NAME wcsnlen +# include <init-arch.h> + +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; + +static inline void * +IFUNC_SELECTOR (void) +{ + const struct cpu_features* cpu_features = __get_cpu_features (); + + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) + && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) + return OPTIMIZE (avx2); + + if (CPU_FEATURES_CPU_P (cpu_features, SSE4_1)) + return OPTIMIZE (sse4_1); + + return OPTIMIZE (sse2); +} + +libc_ifunc_redirected (__redirect_wcsnlen, __wcsnlen, IFUNC_SELECTOR ()); +weak_alias (__wcsnlen, wcsnlen); +#endif diff --git a/sysdeps/x86_64/multiarch/wcsrchr-avx2.S b/sysdeps/x86_64/multiarch/wcsrchr-avx2.S new file mode 100644 index 0000000000..cf8a239ab2 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcsrchr-avx2.S @@ -0,0 +1,3 @@ +#define STRRCHR __wcsrchr_avx2 +#define USE_AS_WCSRCHR 1 +#include "strrchr-avx2.S" diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S new file mode 100644 index 0000000000..d015e95317 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S @@ -0,0 +1,23 @@ +/* wcsrchr optimized with SSE2. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) +# define wcsrchr __wcsrchr_sse2 +#endif + +#include "../wcsrchr.S" diff --git a/sysdeps/x86_64/multiarch/wcsrchr.c b/sysdeps/x86_64/multiarch/wcsrchr.c new file mode 100644 index 0000000000..219fc828a6 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcsrchr.c @@ -0,0 +1,29 @@ +/* Multiple versions of wcsrchr. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) +# define wcsrchr __redirect_wcsrchr +# include <wchar.h> +# undef wcsrchr + +# define SYMBOL_NAME wcsrchr +# include "ifunc-avx2.h" + +libc_ifunc_redirected (__redirect_wcsrchr, wcsrchr, IFUNC_SELECTOR ()); +#endif diff --git a/sysdeps/x86_64/multiarch/wmemchr-avx2.S b/sysdeps/x86_64/multiarch/wmemchr-avx2.S new file mode 100644 index 0000000000..282854f1a1 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemchr-avx2.S @@ -0,0 +1,4 @@ +#define MEMCHR __wmemchr_avx2 +#define USE_AS_WMEMCHR 1 + +#include "memchr-avx2.S" diff --git a/sysdeps/x86_64/multiarch/wmemchr-sse2.S b/sysdeps/x86_64/multiarch/wmemchr-sse2.S new file mode 100644 index 0000000000..70a965d552 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemchr-sse2.S @@ -0,0 +1,4 @@ +#define USE_AS_WMEMCHR 1 +#define wmemchr __wmemchr_sse2 + +#include "../memchr.S" diff --git a/sysdeps/x86_64/multiarch/wmemchr.c b/sysdeps/x86_64/multiarch/wmemchr.c new file mode 100644 index 0000000000..6d833702c6 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemchr.c @@ -0,0 +1,39 @@ +/* Multiple versions of wmemchr + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) +# define wmemchr __redirect_wmemchr +# define __wmemchr __redirect___wmemchr +# include <wchar.h> +# undef wmemchr +# undef __wmemchr + +# define SYMBOL_NAME wmemchr +# include "ifunc-avx2.h" + +libc_ifunc_redirected (__redirect_wmemchr, __wmemchr, IFUNC_SELECTOR ()); +weak_alias (__wmemchr, wmemchr) +# ifdef SHARED +__hidden_ver1 (__wmemchr, __GI___wmemchr, __redirect___wmemchr) + __attribute__((visibility ("hidden"))); +__hidden_ver1 (wmemchr, __GI_wmemchr, __redirect_wmemchr) + __attribute__((weak, visibility ("hidden"))); +# endif +#endif diff --git a/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe.S new file mode 100644 index 0000000000..bfa1a16a35 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe.S @@ -0,0 +1,4 @@ +#define MEMCMP __wmemcmp_avx2_movbe +#define USE_AS_WMEMCMP 1 + +#include "memcmp-avx2-movbe.S" diff --git a/sysdeps/x86_64/multiarch/wmemcmp.c b/sysdeps/x86_64/multiarch/wmemcmp.c new file mode 100644 index 0000000000..3f4a7422f3 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemcmp.c @@ -0,0 +1,30 @@ +/* Multiple versions of wmemcmp + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) +# define wmemcmp __redirect_wmemcmp +# include <wchar.h> +# undef wmemcmp + +# define SYMBOL_NAME wmemcmp +# include "ifunc-memcmp.h" + +libc_ifunc_redirected (__redirect_wmemcmp, wmemcmp, IFUNC_SELECTOR ()); +#endif diff --git a/sysdeps/x86_64/multiarch/wmemset.c b/sysdeps/x86_64/multiarch/wmemset.c new file mode 100644 index 0000000000..9fee77ea81 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemset.c @@ -0,0 +1,40 @@ +/* Multiple versions of wmemset. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) +# define wmemset __redirect_wmemset +# define __wmemset __redirect___wmemset +# include <wchar.h> +# undef wmemset +# undef __wmemset + +# define SYMBOL_NAME wmemset +# include "ifunc-wmemset.h" + +libc_ifunc_redirected (__redirect_wmemset, __wmemset, IFUNC_SELECTOR ()); +weak_alias (__wmemset, wmemset) + +# ifdef SHARED +__hidden_ver1 (__wmemset, __GI___wmemset, __redirect___wmemset) + __attribute__ ((visibility ("hidden"))); +__hidden_ver1 (wmemset, __GI_wmemset, __redirect_wmemset) + __attribute__ ((visibility ("hidden"))); +# endif +#endif diff --git a/sysdeps/x86_64/multiarch/wmemset_chk-nonshared.S b/sysdeps/x86_64/multiarch/wmemset_chk-nonshared.S new file mode 100644 index 0000000000..140c93d6f0 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemset_chk-nonshared.S @@ -0,0 +1,21 @@ +/* Non-shared version of wmemset_chk for x86-64. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) && !defined SHARED +# include <sysdeps/x86_64/wmemset_chk.S> +#endif diff --git a/sysdeps/x86_64/multiarch/wmemset_chk.c b/sysdeps/x86_64/multiarch/wmemset_chk.c new file mode 100644 index 0000000000..88280192c5 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemset_chk.c @@ -0,0 +1,31 @@ +/* Multiple versions of wmemset_chk. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc.so. */ +#if IS_IN (libc) && defined SHARED +# define __wmemset_chk __redirect_wmemset_chk +# include <wchar.h> +# undef __wmemset_chk + +# define SYMBOL_NAME wmemset_chk +# include "ifunc-wmemset.h" + +libc_ifunc_redirected (__redirect_wmemset_chk, __wmemset_chk, + IFUNC_SELECTOR ()); +#endif diff --git a/sysdeps/x86_64/nptl/Makefile b/sysdeps/x86_64/nptl/Makefile index 9b64b533ee..73024033ee 100644 --- a/sysdeps/x86_64/nptl/Makefile +++ b/sysdeps/x86_64/nptl/Makefile @@ -1,4 +1,4 @@ -# Copyright (C) 2002-2016 Free Software Foundation, Inc. +# Copyright (C) 2002-2018 Free Software Foundation, Inc. # This file is part of the GNU C Library. # The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/nptl/pthread-offsets.h b/sysdeps/x86_64/nptl/pthread-offsets.h new file mode 100644 index 0000000000..16c6b0d9fd --- /dev/null +++ b/sysdeps/x86_64/nptl/pthread-offsets.h @@ -0,0 +1,5 @@ +#define __PTHREAD_MUTEX_NUSERS_OFFSET 12 +#define __PTHREAD_MUTEX_KIND_OFFSET 16 +#define __PTHREAD_MUTEX_SPINS_OFFSET 20 +#define __PTHREAD_MUTEX_ELISION_OFFSET 22 +#define __PTHREAD_MUTEX_LIST_OFFSET 24 diff --git a/sysdeps/x86_64/nptl/pthread_spin_lock.S b/sysdeps/x86_64/nptl/pthread_spin_lock.S index b871241617..730fd65034 100644 --- a/sysdeps/x86_64/nptl/pthread_spin_lock.S +++ b/sysdeps/x86_64/nptl/pthread_spin_lock.S @@ -1,4 +1,4 @@ -/* Copyright (C) 2012-2016 Free Software Foundation, Inc. +/* Copyright (C) 2012-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/nptl/pthread_spin_trylock.S b/sysdeps/x86_64/nptl/pthread_spin_trylock.S index c9c53171fe..a8f1b72d60 100644 --- a/sysdeps/x86_64/nptl/pthread_spin_trylock.S +++ b/sysdeps/x86_64/nptl/pthread_spin_trylock.S @@ -1,4 +1,4 @@ -/* Copyright (C) 2002-2016 Free Software Foundation, Inc. +/* Copyright (C) 2002-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper <drepper@redhat.com>, 2002. diff --git a/sysdeps/x86_64/nptl/pthread_spin_unlock.S b/sysdeps/x86_64/nptl/pthread_spin_unlock.S index 188de2e8cb..afd114e855 100644 --- a/sysdeps/x86_64/nptl/pthread_spin_unlock.S +++ b/sysdeps/x86_64/nptl/pthread_spin_unlock.S @@ -1,4 +1,4 @@ -/* Copyright (C) 2002-2016 Free Software Foundation, Inc. +/* Copyright (C) 2002-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper <drepper@redhat.com>, 2002. diff --git a/sysdeps/x86_64/nptl/pthreaddef.h b/sysdeps/x86_64/nptl/pthreaddef.h index 9397efc631..036deb5772 100644 --- a/sysdeps/x86_64/nptl/pthreaddef.h +++ b/sysdeps/x86_64/nptl/pthreaddef.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2002-2016 Free Software Foundation, Inc. +/* Copyright (C) 2002-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper <drepper@redhat.com>, 2002. diff --git a/sysdeps/x86_64/nptl/tcb-offsets.sym b/sysdeps/x86_64/nptl/tcb-offsets.sym index aeb752673a..ae8034743b 100644 --- a/sysdeps/x86_64/nptl/tcb-offsets.sym +++ b/sysdeps/x86_64/nptl/tcb-offsets.sym @@ -4,7 +4,6 @@ RESULT offsetof (struct pthread, result) TID offsetof (struct pthread, tid) -PID offsetof (struct pthread, pid) CANCELHANDLING offsetof (struct pthread, cancelhandling) CLEANUP_JMP_BUF offsetof (struct pthread, cleanup_jmp_buf) CLEANUP offsetof (struct pthread, cleanup) @@ -13,9 +12,8 @@ MUTEX_FUTEX offsetof (pthread_mutex_t, __data.__lock) MULTIPLE_THREADS_OFFSET offsetof (tcbhead_t, multiple_threads) POINTER_GUARD offsetof (tcbhead_t, pointer_guard) VGETCPU_CACHE_OFFSET offsetof (tcbhead_t, vgetcpu_cache) -#ifndef __ASSUME_PRIVATE_FUTEX -PRIVATE_FUTEX offsetof (tcbhead_t, private_futex) -#endif +FEATURE_1_OFFSET offsetof (tcbhead_t, feature_1) +SSP_BASE_OFFSET offsetof (tcbhead_t, ssp_base) -- Not strictly offsets, but these values are also used in the TCB. TCB_CANCELSTATE_BITMASK CANCELSTATE_BITMASK diff --git a/sysdeps/x86_64/nptl/tls.h b/sysdeps/x86_64/nptl/tls.h index 2b061a07c6..e88561c934 100644 --- a/sysdeps/x86_64/nptl/tls.h +++ b/sysdeps/x86_64/nptl/tls.h @@ -1,5 +1,5 @@ /* Definition for thread-local data handling. nptl/x86_64 version. - Copyright (C) 2002-2016 Free Software Foundation, Inc. + Copyright (C) 2002-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -26,8 +26,9 @@ # include <stdint.h> # include <stdlib.h> # include <sysdep.h> -# include <libc-internal.h> +# include <libc-pointer-arith.h> /* For cast_to_integer. */ # include <kernel-features.h> +# include <dl-dtv.h> /* Replacement type for __m128 since this file is included by ld.so, which is compiled with -mno-sse. It must not change the alignment @@ -38,18 +39,6 @@ typedef struct } __128bits; -/* Type for the dtv. */ -typedef union dtv -{ - size_t counter; - struct - { - void *val; - bool is_static; - } pointer; -} dtv_t; - - typedef struct { void *tcb; /* Pointer to the TCB. Not necessarily the @@ -62,17 +51,17 @@ typedef struct uintptr_t stack_guard; uintptr_t pointer_guard; unsigned long int vgetcpu_cache[2]; -# ifndef __ASSUME_PRIVATE_FUTEX - int private_futex; -# else - int __glibc_reserved1; -# endif + /* Bit 0: X86_FEATURE_1_IBT. + Bit 1: X86_FEATURE_1_SHSTK. + */ + unsigned int feature_1; int __glibc_unused1; /* Reservation of some values for the TM ABI. */ void *__private_tm[4]; /* GCC split stack support. */ void *__private_ss; - long int __glibc_reserved2; + /* The lowest address of shadow stack, */ + unsigned long long int ssp_base; /* Must be kept even if it is no longer used by glibc since programs, like AddressSanitizer, depend on the size of tcbhead_t. */ __128bits __glibc_unused2[8][4] __attribute__ ((aligned (32))); @@ -80,6 +69,23 @@ typedef struct void *__padding[8]; } tcbhead_t; +# ifdef __ILP32__ +/* morestack.S in libgcc uses offset 0x40 to access __private_ss, */ +_Static_assert (offsetof (tcbhead_t, __private_ss) == 0x40, + "offset of __private_ss != 0x40"); +/* NB: ssp_base used to be "long int __glibc_reserved2", which was + changed from 32 bits to 64 bits. Make sure that the offset of the + next field, __glibc_unused2, is unchanged. */ +_Static_assert (offsetof (tcbhead_t, __glibc_unused2) == 0x60, + "offset of __glibc_unused2 != 0x60"); +# else +/* morestack.S in libgcc uses offset 0x70 to access __private_ss, */ +_Static_assert (offsetof (tcbhead_t, __private_ss) == 0x70, + "offset of __private_ss != 0x70"); +_Static_assert (offsetof (tcbhead_t, __glibc_unused2) == 0x80, + "offset of __glibc_unused2 != 0x80"); +# endif + #else /* __ASSEMBLER__ */ # include <tcb-offsets.h> #endif @@ -337,18 +343,6 @@ typedef struct abort (); }) -# define CALL_THREAD_FCT(descr) \ - ({ void *__res; \ - asm volatile ("movq %%fs:%P2, %%rdi\n\t" \ - "callq *%%fs:%P1" \ - : "=a" (__res) \ - : "i" (offsetof (struct pthread, start_routine)), \ - "i" (offsetof (struct pthread, arg)) \ - : "di", "si", "cx", "dx", "r8", "r9", "r10", "r11", \ - "memory", "cc"); \ - __res; }) - - /* Set the stack guard field in TCB head. */ # define THREAD_SET_STACK_GUARD(value) \ THREAD_SETMEM (THREAD_SELF, header.stack_guard, value) @@ -366,6 +360,7 @@ typedef struct /* Get and set the global scope generation counter in the TCB head. */ +# define THREAD_GSCOPE_IN_TCB 1 # define THREAD_GSCOPE_FLAG_UNUSED 0 # define THREAD_GSCOPE_FLAG_USED 1 # define THREAD_GSCOPE_FLAG_WAIT 2 diff --git a/sysdeps/x86_64/rawmemchr.S b/sysdeps/x86_64/rawmemchr.S index f90b7921a1..cf972768ec 100644 --- a/sysdeps/x86_64/rawmemchr.S +++ b/sysdeps/x86_64/rawmemchr.S @@ -1,6 +1,6 @@ /* fast SSE2 memchr with 64 byte loop and pmaxub instruction using - Copyright (C) 2011-2016 Free Software Foundation, Inc. + Copyright (C) 2011-2018 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. @@ -196,11 +196,6 @@ L(matches32): lea 32(%rax, %rdi), %rax ret - .p2align 4 -L(return_null): - xor %rax, %rax - ret - END (__rawmemchr) weak_alias (__rawmemchr, rawmemchr) diff --git a/sysdeps/x86_64/rshift.S b/sysdeps/x86_64/rshift.S index c88c6d82bb..1d018b857b 100644 --- a/sysdeps/x86_64/rshift.S +++ b/sysdeps/x86_64/rshift.S @@ -1,5 +1,5 @@ /* x86-64 __mpn_rshift -- - Copyright (C) 2007-2016 Free Software Foundation, Inc. + Copyright (C) 2007-2018 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify diff --git a/sysdeps/x86_64/rtld-offsets.sym b/sysdeps/x86_64/rtld-offsets.sym new file mode 100644 index 0000000000..fd41b51521 --- /dev/null +++ b/sysdeps/x86_64/rtld-offsets.sym @@ -0,0 +1,6 @@ +#define SHARED +#include <ldsodefs.h> + +-- + +GL_TLS_GENERATION_OFFSET offsetof (struct rtld_global, _dl_tls_generation) diff --git a/sysdeps/x86_64/sched_cpucount.c b/sysdeps/x86_64/sched_cpucount.c index 0834e711b3..af5bbcc044 100644 --- a/sysdeps/x86_64/sched_cpucount.c +++ b/sysdeps/x86_64/sched_cpucount.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2016 Free Software Foundation, Inc. +/* Copyright (C) 2007-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/setjmp.S b/sysdeps/x86_64/setjmp.S index 3e93967c2f..78a8bf4644 100644 --- a/sysdeps/x86_64/setjmp.S +++ b/sysdeps/x86_64/setjmp.S @@ -1,5 +1,5 @@ /* setjmp for x86-64. - Copyright (C) 2001-2016 Free Software Foundation, Inc. + Copyright (C) 2001-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -18,9 +18,15 @@ #include <sysdep.h> #include <jmpbuf-offsets.h> +#include <jmp_buf-ssp.h> #include <asm-syntax.h> #include <stap-probe.h> +/* Don't save shadow stack register if shadow stack isn't enabled. */ +#if !SHSTK_ENABLED +# undef SHADOW_STACK_POINTER_OFFSET +#endif + ENTRY (__sigsetjmp) /* Save registers. */ movq %rbx, (JB_RBX*8)(%rdi) @@ -54,17 +60,28 @@ ENTRY (__sigsetjmp) #endif movq %rax, (JB_PC*8)(%rdi) +#ifdef SHADOW_STACK_POINTER_OFFSET +# if IS_IN (libc) && defined SHARED && defined FEATURE_1_OFFSET + /* Check if Shadow Stack is enabled. */ + testl $X86_FEATURE_1_SHSTK, %fs:FEATURE_1_OFFSET + jz L(skip_ssp) +# else + xorl %eax, %eax +# endif + /* Get the current Shadow-Stack-Pointer and save it. */ + rdsspq %rax + movq %rax, SHADOW_STACK_POINTER_OFFSET(%rdi) +# if IS_IN (libc) && defined SHARED && defined FEATURE_1_OFFSET +L(skip_ssp): +# endif +#endif #if IS_IN (rtld) /* In ld.so we never save the signal mask. */ xorl %eax, %eax retq #else /* Make a tail call to __sigjmp_save; it takes the same args. */ -# ifdef PIC - jmp C_SYMBOL_NAME (__sigjmp_save)@PLT -# else jmp __sigjmp_save -# endif #endif END (__sigsetjmp) hidden_def (__sigsetjmp) diff --git a/sysdeps/x86_64/stackinfo.h b/sysdeps/x86_64/stackinfo.h index 848aa7754c..f7a5672f27 100644 --- a/sysdeps/x86_64/stackinfo.h +++ b/sysdeps/x86_64/stackinfo.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2001-2016 Free Software Foundation, Inc. +/* Copyright (C) 2001-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/start.S b/sysdeps/x86_64/start.S index 1374974307..354d2e6ec7 100644 --- a/sysdeps/x86_64/start.S +++ b/sysdeps/x86_64/start.S @@ -1,5 +1,5 @@ /* Startup code compliant to the ELF x86-64 ABI. - Copyright (C) 2001-2016 Free Software Foundation, Inc. + Copyright (C) 2001-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Andreas Jaeger <aj@suse.de>, 2001. @@ -96,27 +96,28 @@ ENTRY (_start) which grow downwards). */ pushq %rsp -#ifdef SHARED +#ifdef PIC /* Pass address of our own entry points to .fini and .init. */ mov __libc_csu_fini@GOTPCREL(%rip), %R8_LP mov __libc_csu_init@GOTPCREL(%rip), %RCX_LP mov main@GOTPCREL(%rip), %RDI_LP - - /* Call the user's main function, and exit with its value. - But let the libc call main. */ - call __libc_start_main@PLT #else /* Pass address of our own entry points to .fini and .init. */ mov $__libc_csu_fini, %R8_LP mov $__libc_csu_init, %RCX_LP mov $main, %RDI_LP +#endif /* Call the user's main function, and exit with its value. - But let the libc call main. */ - call __libc_start_main -#endif + But let the libc call main. Since __libc_start_main in + libc.so is called very early, lazy binding isn't relevant + here. Use indirect branch via GOT to avoid extra branch + to PLT slot. In case of static executable, ld in binutils + 2.26 or above can convert indirect branch into direct + branch. */ + call *__libc_start_main@GOTPCREL(%rip) hlt /* Crash if somehow `exit' does return. */ END (_start) diff --git a/sysdeps/x86_64/strcasecmp_l-nonascii.c b/sysdeps/x86_64/strcasecmp_l-nonascii.c index 30e8969603..9ba9bc808c 100644 --- a/sysdeps/x86_64/strcasecmp_l-nonascii.c +++ b/sysdeps/x86_64/strcasecmp_l-nonascii.c @@ -1,7 +1,7 @@ #include <string.h> extern int __strcasecmp_l_nonascii (const char *__s1, const char *__s2, - __locale_t __loc); + locale_t __loc); #define __strcasecmp_l __strcasecmp_l_nonascii #define USE_IN_EXTENDED_LOCALE_MODEL 1 diff --git a/sysdeps/x86_64/strcat.S b/sysdeps/x86_64/strcat.S index dadf4c76b2..9a4a4e6feb 100644 --- a/sysdeps/x86_64/strcat.S +++ b/sysdeps/x86_64/strcat.S @@ -1,6 +1,6 @@ /* strcat(dest, src) -- Append SRC on the end of DEST. Optimized for x86-64. - Copyright (C) 2002-2016 Free Software Foundation, Inc. + Copyright (C) 2002-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Andreas Jaeger <aj@suse.de>, 2002. diff --git a/sysdeps/x86_64/strchr.S b/sysdeps/x86_64/strchr.S index 4431fee648..1d5112746f 100644 --- a/sysdeps/x86_64/strchr.S +++ b/sysdeps/x86_64/strchr.S @@ -1,6 +1,6 @@ /* strchr (str, ch) -- Return pointer to first occurrence of CH in STR. For AMD x86-64. - Copyright (C) 2009-2016 Free Software Foundation, Inc. + Copyright (C) 2009-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/strchrnul.S b/sysdeps/x86_64/strchrnul.S index 7b52d699ee..149f3a9ced 100644 --- a/sysdeps/x86_64/strchrnul.S +++ b/sysdeps/x86_64/strchrnul.S @@ -1,7 +1,7 @@ /* strchrnul (str, ch) -- Return pointer to first occurrence of CH in STR or terminating NUL byte. For AMD x86-64. - Copyright (C) 2009-2016 Free Software Foundation, Inc. + Copyright (C) 2009-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S index c5c44d4e27..e16945b961 100644 --- a/sysdeps/x86_64/strcmp.S +++ b/sysdeps/x86_64/strcmp.S @@ -1,5 +1,5 @@ /* Highly optimized version for x86-64. - Copyright (C) 1999-2016 Free Software Foundation, Inc. + Copyright (C) 1999-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. Based on i686 version contributed by Ulrich Drepper <drepper@cygnus.com>, 1999. @@ -233,7 +233,7 @@ LABEL(bigger): lea LABEL(unaligned_table)(%rip), %r10 movslq (%r10, %r9,4), %r9 lea (%r10, %r9), %r10 - jmp *%r10 /* jump to corresponding case */ + _CET_NOTRACK jmp *%r10 /* jump to corresponding case */ /* * The following cases will be handled by ashr_0 diff --git a/sysdeps/x86_64/strcpy.S b/sysdeps/x86_64/strcpy.S index 3f90c0020a..66128a7cb5 100644 --- a/sysdeps/x86_64/strcpy.S +++ b/sysdeps/x86_64/strcpy.S @@ -1,5 +1,5 @@ /* strcpy/stpcpy implementation for x86-64. - Copyright (C) 2002-2016 Free Software Foundation, Inc. + Copyright (C) 2002-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Andreas Jaeger <aj@suse.de>, 2002. diff --git a/sysdeps/x86_64/strcspn.S b/sysdeps/x86_64/strcspn.S index de526c8fdd..7f9202d656 100644 --- a/sysdeps/x86_64/strcspn.S +++ b/sysdeps/x86_64/strcspn.S @@ -1,7 +1,7 @@ /* strcspn (str, ss) -- Return the length of the initial segment of STR which contains no characters from SS. For AMD x86-64. - Copyright (C) 1994-2016 Free Software Foundation, Inc. + Copyright (C) 1994-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>. Bug fixes by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>. @@ -24,9 +24,6 @@ #include <sysdep.h> #include "asm-syntax.h" -/* BEWARE: `#ifdef strcspn' means that strcspn is redefined as `strpbrk' */ -#define STRPBRK_P (defined strcspn) - .text ENTRY (strcspn) @@ -111,7 +108,7 @@ L(5): incq %rax L(4): addq $256, %rsp /* remove skipset */ cfi_adjust_cfa_offset(-256) -#if STRPBRK_P +#ifdef USE_AS_STRPBRK xorl %edx,%edx orb %cl, %cl /* was last character NUL? */ cmovzq %rdx, %rax /* Yes: return NULL */ diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S index 12f63ad1bb..01cb5fa846 100644 --- a/sysdeps/x86_64/strlen.S +++ b/sysdeps/x86_64/strlen.S @@ -1,5 +1,5 @@ -/* SSE2 version of strlen. - Copyright (C) 2012-2016 Free Software Foundation, Inc. +/* SSE2 version of strlen/wcslen. + Copyright (C) 2012-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -18,6 +18,16 @@ #include <sysdep.h> +#ifdef AS_WCSLEN +# define PMINU pminud +# define PCMPEQ pcmpeqd +# define SHIFT_RETURN shrq $2, %rax +#else +# define PMINU pminub +# define PCMPEQ pcmpeqb +# define SHIFT_RETURN +#endif + /* Long lived register in strlen(s), strnlen(s, n) are: %xmm3 - zero @@ -32,10 +42,10 @@ ENTRY(strlen) /* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */ #define FIND_ZERO \ - pcmpeqb (%rax), %xmm0; \ - pcmpeqb 16(%rax), %xmm1; \ - pcmpeqb 32(%rax), %xmm2; \ - pcmpeqb 48(%rax), %xmm3; \ + PCMPEQ (%rax), %xmm0; \ + PCMPEQ 16(%rax), %xmm1; \ + PCMPEQ 32(%rax), %xmm2; \ + PCMPEQ 48(%rax), %xmm3; \ pmovmskb %xmm0, %esi; \ pmovmskb %xmm1, %edx; \ pmovmskb %xmm2, %r8d; \ @@ -54,6 +64,9 @@ ENTRY(strlen) xor %rax, %rax ret L(n_nonzero): +# ifdef AS_WCSLEN + shlq $2, %rsi +# endif /* Initialize long lived registers. */ @@ -96,6 +109,7 @@ L(n_nonzero): test %rdx, %rdx; \ je L(lab); \ bsfq %rdx, %rax; \ + SHIFT_RETURN; \ ret #ifdef AS_STRNLEN @@ -104,19 +118,20 @@ L(n_nonzero): #else /* Test first 16 bytes unaligned. */ movdqu (%rax), %xmm4 - pcmpeqb %xmm0, %xmm4 + PCMPEQ %xmm0, %xmm4 pmovmskb %xmm4, %edx test %edx, %edx je L(next48_bytes) bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */ + SHIFT_RETURN ret L(next48_bytes): /* Same as FIND_ZERO except we do not check first 16 bytes. */ andq $-16, %rax - pcmpeqb 16(%rax), %xmm1 - pcmpeqb 32(%rax), %xmm2 - pcmpeqb 48(%rax), %xmm3 + PCMPEQ 16(%rax), %xmm1 + PCMPEQ 32(%rax), %xmm2 + PCMPEQ 48(%rax), %xmm3 pmovmskb %xmm1, %edx pmovmskb %xmm2, %r8d pmovmskb %xmm3, %ecx @@ -145,6 +160,7 @@ L(strnlen_ret): test %rdx, %rdx je L(loop_init) bsfq %rdx, %rax + SHIFT_RETURN ret #endif .p2align 4 @@ -161,10 +177,10 @@ L(loop): je L(exit_end) movdqa (%rax), %xmm0 - pminub 16(%rax), %xmm0 - pminub 32(%rax), %xmm0 - pminub 48(%rax), %xmm0 - pcmpeqb %xmm3, %xmm0 + PMINU 16(%rax), %xmm0 + PMINU 32(%rax), %xmm0 + PMINU 48(%rax), %xmm0 + PCMPEQ %xmm3, %xmm0 pmovmskb %xmm0, %edx testl %edx, %edx jne L(exit) @@ -182,6 +198,7 @@ L(first): bsfq %rdx, %rdx addq %rdx, %rax subq %rdi, %rax + SHIFT_RETURN ret .p2align 4 @@ -192,6 +209,7 @@ L(exit): bsfq %rdx, %rdx addq %rdx, %rax subq %rdi, %rax + SHIFT_RETURN ret #else @@ -201,10 +219,10 @@ L(exit): L(loop): movdqa 64(%rax), %xmm0 - pminub 80(%rax), %xmm0 - pminub 96(%rax), %xmm0 - pminub 112(%rax), %xmm0 - pcmpeqb %xmm3, %xmm0 + PMINU 80(%rax), %xmm0 + PMINU 96(%rax), %xmm0 + PMINU 112(%rax), %xmm0 + PCMPEQ %xmm3, %xmm0 pmovmskb %xmm0, %edx testl %edx, %edx jne L(exit64) @@ -212,10 +230,10 @@ L(loop): subq $-128, %rax movdqa (%rax), %xmm0 - pminub 16(%rax), %xmm0 - pminub 32(%rax), %xmm0 - pminub 48(%rax), %xmm0 - pcmpeqb %xmm3, %xmm0 + PMINU 16(%rax), %xmm0 + PMINU 32(%rax), %xmm0 + PMINU 48(%rax), %xmm0 + PCMPEQ %xmm3, %xmm0 pmovmskb %xmm0, %edx testl %edx, %edx jne L(exit0) @@ -231,6 +249,7 @@ L(exit0): bsfq %rdx, %rdx addq %rdx, %rax subq %rdi, %rax + SHIFT_RETURN ret #endif diff --git a/sysdeps/x86_64/strncase_l-nonascii.c b/sysdeps/x86_64/strncase_l-nonascii.c index 8664863778..e3d83a06cd 100644 --- a/sysdeps/x86_64/strncase_l-nonascii.c +++ b/sysdeps/x86_64/strncase_l-nonascii.c @@ -1,7 +1,7 @@ #include <string.h> extern int __strncasecmp_l_nonascii (const char *__s1, const char *__s2, - size_t __n, __locale_t __loc); + size_t __n, locale_t __loc); #define __strncasecmp_l __strncasecmp_l_nonascii #define USE_IN_EXTENDED_LOCALE_MODEL 1 diff --git a/sysdeps/x86_64/strpbrk.S b/sysdeps/x86_64/strpbrk.S index 9b97ada84e..21888a5b92 100644 --- a/sysdeps/x86_64/strpbrk.S +++ b/sysdeps/x86_64/strpbrk.S @@ -1,2 +1,3 @@ #define strcspn strpbrk +#define USE_AS_STRPBRK #include <sysdeps/x86_64/strcspn.S> diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S index de0be762ed..aca98e7eaa 100644 --- a/sysdeps/x86_64/strrchr.S +++ b/sysdeps/x86_64/strrchr.S @@ -1,5 +1,5 @@ /* strrchr (str, ch) -- Return pointer to last occurrence of CH in STR. - Copyright (C) 2013-2016 Free Software Foundation, Inc. + Copyright (C) 2013-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/strspn.S b/sysdeps/x86_64/strspn.S index 49dd4ba9f5..635f1bc6ce 100644 --- a/sysdeps/x86_64/strspn.S +++ b/sysdeps/x86_64/strspn.S @@ -1,7 +1,7 @@ /* strspn (str, ss) -- Return the length of the initial segment of STR which contains only characters from SS. For AMD x86-64. - Copyright (C) 1994-2016 Free Software Foundation, Inc. + Copyright (C) 1994-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>. Bug fixes by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>. diff --git a/sysdeps/x86_64/strtok.S b/sysdeps/x86_64/strtok.S deleted file mode 100644 index bd5b103d50..0000000000 --- a/sysdeps/x86_64/strtok.S +++ /dev/null @@ -1,208 +0,0 @@ -/* strtok (str, delim) -- Return next DELIM separated token from STR. - For AMD x86-64. - Copyright (C) 1998-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - Based on i686 version contributed by Ulrich Drepper - <drepper@cygnus.com>, 1998. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include "asm-syntax.h" - -/* This file can be used for the strtok and strtok_r functions: - - strtok: - INPUT PARAMETER: - str %rdi - delim %rsi - - strtok_r: - INPUT PARAMETER: - str %rdi - delim %rsi - save_ptr %rdx - - We do a common implementation here. */ - -#ifdef USE_AS_STRTOK_R -# define SAVE_PTR (%r9) -#else - .bss - .local save_ptr - .type save_ptr, @object - .size save_ptr, LP_SIZE -save_ptr: - .space LP_SIZE - -# ifdef PIC -# define SAVE_PTR save_ptr(%rip) -# else -# define SAVE_PTR save_ptr -# endif - -# define FUNCTION strtok -#endif - - .text -ENTRY (FUNCTION) - /* First we create a table with flags for all possible characters. - For the ASCII (7bit/8bit) or ISO-8859-X character sets which are - supported by the C string functions we have 256 characters. - Before inserting marks for the stop characters we clear the whole - table. */ - movq %rdi, %r8 /* Save value. */ - subq $256, %rsp /* Make space for 256 bytes. */ - cfi_adjust_cfa_offset(256) - movl $32, %ecx /* 32*8 bytes = 256 bytes. */ - movq %rsp, %rdi - xorl %eax, %eax /* We store 0s. */ - cld - rep - stosq - - /* Note: %rcx = 0 !!! */ - -#ifdef USE_AS_STRTOK_R - /* The value is stored in the third argument. */ - mov %RDX_LP, %R9_LP /* Save value - see def. of SAVE_PTR. */ - mov (%rdx), %RAX_LP -#else - /* The value is in the local variable defined above. But - we have to take care for PIC code. */ - mov SAVE_PTR, %RAX_LP -#endif - movq %r8, %rdx /* Get start of string. */ - - /* If the pointer is NULL we have to use the stored value of - the last run. */ - cmpq $0, %rdx - cmove %rax, %rdx - testq %rdx, %rdx - jz L(returnNULL) - movq %rsi, %rax /* Get start of delimiter set. */ - -/* For understanding the following code remember that %rcx == 0 now. - Although all the following instruction only modify %cl we always - have a correct zero-extended 64-bit value in %rcx. */ - -L(2): movb (%rax), %cl /* get byte from stopset */ - testb %cl, %cl /* is NUL char? */ - jz L(1) /* yes => start compare loop */ - movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */ - - movb 1(%rax), %cl /* get byte from stopset */ - testb $0xff, %cl /* is NUL char? */ - jz L(1) /* yes => start compare loop */ - movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */ - - movb 2(%rax), %cl /* get byte from stopset */ - testb $0xff, %cl /* is NUL char? */ - jz L(1) /* yes => start compare loop */ - movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */ - - movb 3(%rax), %cl /* get byte from stopset */ - addq $4, %rax /* increment stopset pointer */ - movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */ - testb $0xff, %cl /* is NUL char? */ - jnz L(2) /* no => process next dword from stopset */ - -L(1): - - leaq -4(%rdx), %rax /* prepare loop */ - - /* We use a neat trick for the following loop. Normally we would - have to test for two termination conditions - 1. a character in the stopset was found - and - 2. the end of the string was found - As a sign that the character is in the stopset we store its - value in the table. The value of NUL is NUL so the loop - terminates for NUL in every case. */ - -L(3): addq $4, %rax /* adjust pointer for full loop round */ - - movb (%rax), %cl /* get byte from string */ - testb %cl, (%rsp,%rcx) /* is it contained in stopset? */ - jz L(4) /* no => start of token */ - - movb 1(%rax), %cl /* get byte from string */ - testb %cl, (%rsp,%rcx) /* is it contained in stopset? */ - jz L(5) /* no => start of token */ - - movb 2(%rax), %cl /* get byte from string */ - testb %cl, (%rsp,%rcx) /* is it contained in stopset? */ - jz L(6) /* no => start of token */ - - movb 3(%rax), %cl /* get byte from string */ - testb %cl, (%rsp,%rcx) /* is it contained in stopset? */ - jnz L(3) /* yes => start of loop */ - - incq %rax /* adjust pointer */ -L(6): incq %rax -L(5): incq %rax - - /* Now we have to terminate the string. */ - -L(4): leaq -4(%rax), %rdx /* We use %rDX for the next run. */ - -L(7): addq $4, %rdx /* adjust pointer for full loop round */ - - movb (%rdx), %cl /* get byte from string */ - cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */ - je L(8) /* yes => return */ - - movb 1(%rdx), %cl /* get byte from string */ - cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */ - je L(9) /* yes => return */ - - movb 2(%rdx), %cl /* get byte from string */ - cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */ - je L(10) /* yes => return */ - - movb 3(%rdx), %cl /* get byte from string */ - cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */ - jne L(7) /* no => start loop again */ - - incq %rdx /* adjust pointer */ -L(10): incq %rdx -L(9): incq %rdx - -L(8): cmpq %rax, %rdx - je L(returnNULL) /* There was no token anymore. */ - - movb $0, (%rdx) /* Terminate string. */ - - /* Are we at end of string? */ - cmpb $0, %cl - leaq 1(%rdx), %rcx - cmovne %rcx, %rdx - - /* Store the pointer to the next character. */ - mov %RDX_LP, SAVE_PTR - -L(epilogue): - /* Remove the stopset table. */ - addq $256, %rsp - cfi_adjust_cfa_offset(-256) - retq - -L(returnNULL): - xorl %eax, %eax - /* Store the pointer to the next character. */ - mov %RDX_LP, SAVE_PTR - jmp L(epilogue) - -END (FUNCTION) diff --git a/sysdeps/x86_64/strtok_r.S b/sysdeps/x86_64/strtok_r.S deleted file mode 100644 index f0db78c67a..0000000000 --- a/sysdeps/x86_64/strtok_r.S +++ /dev/null @@ -1,5 +0,0 @@ -#define FUNCTION __strtok_r -#define USE_AS_STRTOK_R 1 -#include <sysdeps/x86_64/strtok.S> -weak_alias (__strtok_r, strtok_r) -strong_alias (__strtok_r, __GI___strtok_r) diff --git a/sysdeps/x86_64/sub_n.S b/sysdeps/x86_64/sub_n.S index cc9bc48b01..e70d48ba47 100644 --- a/sysdeps/x86_64/sub_n.S +++ b/sysdeps/x86_64/sub_n.S @@ -1,6 +1,6 @@ /* x86-64 __mpn_sub_n -- Add two limb vectors of the same length > 0 and store sum in a third limb vector. - Copyright (C) 2006-2016 Free Software Foundation, Inc. + Copyright (C) 2006-2018 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify diff --git a/sysdeps/x86_64/submul_1.S b/sysdeps/x86_64/submul_1.S index 3037cb9c45..ba1bf92bc5 100644 --- a/sysdeps/x86_64/submul_1.S +++ b/sysdeps/x86_64/submul_1.S @@ -1,6 +1,6 @@ /* x86-64 __mpn_submul_1 -- Multiply a limb vector with a limb and subtract the result from a second limb vector. - Copyright (C) 2003-2016 Free Software Foundation, Inc. + Copyright (C) 2003-2018 Free Software Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify diff --git a/sysdeps/x86_64/sysdep.h b/sysdeps/x86_64/sysdep.h index fbe3560588..1738d7f955 100644 --- a/sysdeps/x86_64/sysdep.h +++ b/sysdeps/x86_64/sysdep.h @@ -1,5 +1,5 @@ /* Assembler macros for x86-64. - Copyright (C) 2001-2016 Free Software Foundation, Inc. + Copyright (C) 2001-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -19,7 +19,7 @@ #ifndef _X86_64_SYSDEP_H #define _X86_64_SYSDEP_H 1 -#include <sysdeps/generic/sysdep.h> +#include <sysdeps/x86/sysdep.h> #ifdef __ASSEMBLER__ @@ -32,28 +32,6 @@ #define cfi_offset_rel_rsp(regn, off) .cfi_escape 0x10, regn, 0x4, 0x13, \ 0x77, off & 0x7F | 0x80, off >> 7 -/* ELF uses byte-counts for .align, most others use log2 of count of bytes. */ -#define ALIGNARG(log2) 1<<log2 -#define ASM_SIZE_DIRECTIVE(name) .size name,.-name; - - -/* Define an entry point visible from C. */ -#define ENTRY(name) \ - .globl C_SYMBOL_NAME(name); \ - .type C_SYMBOL_NAME(name),@function; \ - .align ALIGNARG(4); \ - C_LABEL(name) \ - cfi_startproc; \ - CALL_MCOUNT - -#undef END -#define END(name) \ - cfi_endproc; \ - ASM_SIZE_DIRECTIVE(name) - -#define ENTRY_CHK(name) ENTRY (name) -#define END_CHK(name) END (name) - /* If compiled for profiling, call `mcount' at the start of each function. */ #ifdef PROF /* The mcount code relies on a normal frame pointer being on the stack @@ -70,12 +48,6 @@ #define CALL_MCOUNT /* Do nothing. */ #endif -/* Since C identifiers are not normally prefixed with an underscore - on this system, the asm identifier `syscall_error' intrudes on the - C name space. Make sure we use an innocuous name. */ -#define syscall_error __syscall_error -#define mcount _mcount - #define PSEUDO(name, syscall_name, args) \ lose: \ jmp JUMPTARGET(syscall_error) \ @@ -84,25 +56,18 @@ lose: \ DO_CALL (syscall_name, args); \ jb lose -#undef PSEUDO_END -#define PSEUDO_END(name) \ - END (name) - #undef JUMPTARGET -#ifdef PIC -#define JUMPTARGET(name) name##@PLT +#ifdef SHARED +# ifdef BIND_NOW +# define JUMPTARGET(name) *name##@GOTPCREL(%rip) +# else +# define JUMPTARGET(name) name##@PLT +# endif #else -#define JUMPTARGET(name) name +/* For static archives, branch to target directly. */ +# define JUMPTARGET(name) name #endif -/* Local label name for asm code. */ -#ifndef L -/* ELF-like local names start with `.L'. */ -# define L(name) .L##name -#endif - -#define atom_text_section .section ".text.atom", "ax" - /* Long and pointer size in bytes. */ #define LP_SIZE 8 diff --git a/sysdeps/x86_64/tls_get_addr.S b/sysdeps/x86_64/tls_get_addr.S new file mode 100644 index 0000000000..cf8c6d101b --- /dev/null +++ b/sysdeps/x86_64/tls_get_addr.S @@ -0,0 +1,61 @@ +/* Stack-aligning implementation of __tls_get_addr. x86-64 version. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifdef SHARED + +# include <sysdep.h> +# include "tlsdesc.h" +# include "rtld-offsets.h" + +/* See __tls_get_addr and __tls_get_addr_slow in dl-tls.c. This function + call __tls_get_addr_slow on both slow paths. It realigns the stack + before the call to work around GCC PR58066. */ + +ENTRY (__tls_get_addr) + mov %fs:DTV_OFFSET, %RDX_LP + mov GL_TLS_GENERATION_OFFSET+_rtld_local(%rip), %RAX_LP + /* GL(dl_tls_generation) == dtv[0].counter */ + cmp %RAX_LP, (%rdx) + jne 1f + mov TI_MODULE_OFFSET(%rdi), %RAX_LP + /* dtv[ti->ti_module] */ +# ifdef __LP64__ + salq $4, %rax + movq (%rdx,%rax), %rax +# else + movl (%rdx,%rax, 8), %eax +# endif + cmp $-1, %RAX_LP + je 1f + add TI_OFFSET_OFFSET(%rdi), %RAX_LP + ret +1: + /* On the slow path, align the stack. */ + pushq %rbp + cfi_def_cfa_offset (16) + cfi_offset (%rbp, -16) + mov %RSP_LP, %RBP_LP + cfi_def_cfa_register (%rbp) + and $-16, %RSP_LP + call __tls_get_addr_slow + mov %RBP_LP, %RSP_LP + popq %rbp + cfi_def_cfa (%rsp, 8) + ret +END (__tls_get_addr) +#endif /* SHARED */ diff --git a/sysdeps/x86_64/tlsdesc.c b/sysdeps/x86_64/tlsdesc.c index aff8b67941..302d097dbb 100644 --- a/sysdeps/x86_64/tlsdesc.c +++ b/sysdeps/x86_64/tlsdesc.c @@ -1,5 +1,5 @@ /* Manage TLS descriptors. x86_64 version. - Copyright (C) 2005-2016 Free Software Foundation, Inc. + Copyright (C) 2005-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -134,7 +134,6 @@ _dl_tlsdesc_resolve_hold_fixup (struct tlsdesc volatile *td, if there is one. */ void -internal_function _dl_unmap (struct link_map *map) { _dl_unmap_segments (map); diff --git a/sysdeps/x86_64/tlsdesc.sym b/sysdeps/x86_64/tlsdesc.sym index 33854975d0..fc897ab4b5 100644 --- a/sysdeps/x86_64/tlsdesc.sym +++ b/sysdeps/x86_64/tlsdesc.sym @@ -15,3 +15,6 @@ TLSDESC_ARG offsetof(struct tlsdesc, arg) TLSDESC_GEN_COUNT offsetof(struct tlsdesc_dynamic_arg, gen_count) TLSDESC_MODID offsetof(struct tlsdesc_dynamic_arg, tlsinfo.ti_module) TLSDESC_MODOFF offsetof(struct tlsdesc_dynamic_arg, tlsinfo.ti_offset) + +TI_MODULE_OFFSET offsetof(tls_index, ti_module) +TI_OFFSET_OFFSET offsetof(tls_index, ti_offset) diff --git a/sysdeps/x86_64/tst-audit.h b/sysdeps/x86_64/tst-audit.h index 94e9dd5282..623ef8920c 100644 --- a/sysdeps/x86_64/tst-audit.h +++ b/sysdeps/x86_64/tst-audit.h @@ -1,6 +1,6 @@ /* Definitions for testing PLT entry/exit auditing. x86_64 version. - Copyright (C) 2012-2016 Free Software Foundation, Inc. + Copyright (C) 2012-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core.S b/sysdeps/x86_64/tst-audit10-aux.c index fb9f989adc..e1b2d92f75 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core.S +++ b/sysdeps/x86_64/tst-audit10-aux.c @@ -1,5 +1,5 @@ -/* Multiple versions of vectorized pow. - Copyright (C) 2014-2016 Free Software Foundation, Inc. +/* Test case for preserved AVX512 registers in dynamic linker, -mavx512f part. + Copyright (C) 2012-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -16,21 +16,26 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY (_ZGVdN4vv_pow) - .type _ZGVdN4vv_pow, @gnu_indirect_function - LOAD_RTLD_GLOBAL_RO_RDX - leaq _ZGVdN4vv_pow_avx2(%rip), %rax - HAS_ARCH_FEATURE (AVX2_Usable) - jz 2f - ret -2: leaq _ZGVdN4vv_pow_sse_wrapper(%rip), %rax - ret -END (_ZGVdN4vv_pow) -libmvec_hidden_def (_ZGVdN4vv_pow) - -#define _ZGVdN4vv_pow _ZGVdN4vv_pow_sse_wrapper -#include "../svml_d_pow4_core.S" +#include <immintrin.h> +#include <stdlib.h> +#include <string.h> + +int +tst_audit10_aux (void) +{ +#ifdef __AVX512F__ + extern __m512i audit_test (__m512i, __m512i, __m512i, __m512i, + __m512i, __m512i, __m512i, __m512i); + + __m512i zmm = _mm512_setzero_si512 (); + __m512i ret = audit_test (zmm, zmm, zmm, zmm, zmm, zmm, zmm, zmm); + + zmm = _mm512_set1_epi64 (0x12349876); + + if (memcmp (&zmm, &ret, sizeof (ret))) + abort (); + return 0; +#else /* __AVX512F__ */ + return 77; +#endif /* __AVX512F__ */ +} diff --git a/sysdeps/x86_64/tst-audit10.c b/sysdeps/x86_64/tst-audit10.c index d104341be8..568011cb96 100644 --- a/sysdeps/x86_64/tst-audit10.c +++ b/sysdeps/x86_64/tst-audit10.c @@ -1,4 +1,5 @@ -/* Copyright (C) 2012-2016 Free Software Foundation, Inc. +/* Test case for preserved AVX512 registers in dynamic linker. + Copyright (C) 2012-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -15,17 +16,14 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -/* Test case for x86-64 preserved registers in dynamic linker. */ - -#ifdef __AVX512F__ -#include <stdlib.h> -#include <string.h> #include <cpuid.h> -#include <immintrin.h> + +int tst_audit10_aux (void); static int avx512_enabled (void) { +#ifdef bit_AVX512F unsigned int eax, ebx, ecx, edx; if (__get_cpuid (1, &eax, &ebx, &ecx, &edx) == 0 @@ -40,34 +38,20 @@ avx512_enabled (void) /* Verify that ZMM, YMM and XMM states are enabled. */ return (eax & 0xe6) == 0xe6; +#else + return 0; +#endif } - -extern __m512i audit_test (__m512i, __m512i, __m512i, __m512i, - __m512i, __m512i, __m512i, __m512i); static int do_test (void) { /* Run AVX512 test only if AVX512 is supported. */ if (avx512_enabled ()) - { - __m512i zmm = _mm512_setzero_si512 (); - __m512i ret = audit_test (zmm, zmm, zmm, zmm, zmm, zmm, zmm, zmm); - - zmm = _mm512_set1_epi64 (0x12349876); - - if (memcmp (&zmm, &ret, sizeof (ret))) - abort (); - } - return 0; -} -#else -static int -do_test (void) -{ - return 0; + return tst_audit10_aux (); + else + return 77; } -#endif #define TEST_FUNCTION do_test () #include "../../test-skeleton.c" diff --git a/sysdeps/x86_64/tst-audit4-aux.c b/sysdeps/x86_64/tst-audit4-aux.c new file mode 100644 index 0000000000..2770be5b5e --- /dev/null +++ b/sysdeps/x86_64/tst-audit4-aux.c @@ -0,0 +1,39 @@ +/* Test case for preserved AVX registers in dynamic linker, -mavx part. + Copyright (C) 2009-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <immintrin.h> +#include <stdlib.h> +#include <string.h> + +extern __m256i audit_test (__m256i, __m256i, __m256i, __m256i, + __m256i, __m256i, __m256i, __m256i); + +int +tst_audit4_aux (void) +{ +#ifdef __AVX__ + __m256i ymm = _mm256_setzero_si256 (); + __m256i ret = audit_test (ymm, ymm, ymm, ymm, ymm, ymm, ymm, ymm); + ymm = _mm256_set1_epi32 (0x12349876); + if (memcmp (&ymm, &ret, sizeof (ret))) + abort (); + return 0; +#else /* __AVX__ */ + return 77; +#endif /* __AVX__ */ +} diff --git a/sysdeps/x86_64/tst-audit4.c b/sysdeps/x86_64/tst-audit4.c index 44d51231e3..d7ca24ac2d 100644 --- a/sysdeps/x86_64/tst-audit4.c +++ b/sysdeps/x86_64/tst-audit4.c @@ -1,11 +1,24 @@ -/* Test case for x86-64 preserved registers in dynamic linker. */ +/* Test case for preserved AVX registers in dynamic linker. + Copyright (C) 2009-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ -#ifdef __AVX__ -#include <stdlib.h> -#include <string.h> #include <cpuid.h> -#include <immintrin.h> +int tst_audit4_aux (void); static int avx_enabled (void) @@ -22,31 +35,15 @@ avx_enabled (void) return (eax & 6) == 6; } - -extern __m256i audit_test (__m256i, __m256i, __m256i, __m256i, - __m256i, __m256i, __m256i, __m256i); static int do_test (void) { /* Run AVX test only if AVX is supported. */ if (avx_enabled ()) - { - __m256i ymm = _mm256_setzero_si256 (); - __m256i ret = audit_test (ymm, ymm, ymm, ymm, ymm, ymm, ymm, ymm); - - ymm = _mm256_set1_epi32 (0x12349876); - if (memcmp (&ymm, &ret, sizeof (ret))) - abort (); - } - return 0; -} -#else -static int -do_test (void) -{ - return 0; + return tst_audit4_aux (); + else + return 77; } -#endif #define TEST_FUNCTION do_test () #include "../../test-skeleton.c" diff --git a/sysdeps/x86_64/tst-auditmod10a.c b/sysdeps/x86_64/tst-auditmod10a.c index e94dbaf7fe..ff6021a79a 100644 --- a/sysdeps/x86_64/tst-auditmod10a.c +++ b/sysdeps/x86_64/tst-auditmod10a.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2012-2016 Free Software Foundation, Inc. +/* Copyright (C) 2012-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/tst-auditmod10b.c b/sysdeps/x86_64/tst-auditmod10b.c index ad6fcafdda..de1bbbb7fb 100644 --- a/sysdeps/x86_64/tst-auditmod10b.c +++ b/sysdeps/x86_64/tst-auditmod10b.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2012-2016 Free Software Foundation, Inc. +/* Copyright (C) 2012-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -19,6 +19,8 @@ function parameter passing/return. */ #include <dlfcn.h> +#include <link.h> +#include <stddef.h> #include <stdint.h> #include <stdio.h> #include <stdlib.h> diff --git a/sysdeps/x86_64/tst-auditmod3b.c b/sysdeps/x86_64/tst-auditmod3b.c index 1a41ca80c0..7aad92382e 100644 --- a/sysdeps/x86_64/tst-auditmod3b.c +++ b/sysdeps/x86_64/tst-auditmod3b.c @@ -2,6 +2,8 @@ function parameter passing/return. */ #include <dlfcn.h> +#include <link.h> +#include <stddef.h> #include <stdint.h> #include <stdio.h> #include <stdlib.h> diff --git a/sysdeps/x86_64/tst-auditmod4b.c b/sysdeps/x86_64/tst-auditmod4b.c index 2b0d827e88..1153ea442c 100644 --- a/sysdeps/x86_64/tst-auditmod4b.c +++ b/sysdeps/x86_64/tst-auditmod4b.c @@ -2,6 +2,8 @@ function parameter passing/return. */ #include <dlfcn.h> +#include <link.h> +#include <stddef.h> #include <stdint.h> #include <stdio.h> #include <stdlib.h> diff --git a/sysdeps/x86_64/tst-auditmod5b.c b/sysdeps/x86_64/tst-auditmod5b.c index a74d261f03..6a280fd61b 100644 --- a/sysdeps/x86_64/tst-auditmod5b.c +++ b/sysdeps/x86_64/tst-auditmod5b.c @@ -2,6 +2,8 @@ function parameter passing/return. */ #include <dlfcn.h> +#include <link.h> +#include <stddef.h> #include <stdint.h> #include <stdio.h> #include <stdlib.h> diff --git a/sysdeps/x86_64/tst-auditmod6b.c b/sysdeps/x86_64/tst-auditmod6b.c index 886fc33e9b..3533602c07 100644 --- a/sysdeps/x86_64/tst-auditmod6b.c +++ b/sysdeps/x86_64/tst-auditmod6b.c @@ -2,6 +2,8 @@ function parameter passing/return. */ #include <dlfcn.h> +#include <link.h> +#include <stddef.h> #include <stdint.h> #include <stdio.h> #include <stdlib.h> diff --git a/sysdeps/x86_64/tst-auditmod6c.c b/sysdeps/x86_64/tst-auditmod6c.c index b2ee24d8bf..8000e89224 100644 --- a/sysdeps/x86_64/tst-auditmod6c.c +++ b/sysdeps/x86_64/tst-auditmod6c.c @@ -2,6 +2,8 @@ function parameter passing/return. */ #include <dlfcn.h> +#include <link.h> +#include <stddef.h> #include <stdint.h> #include <stdio.h> #include <stdlib.h> diff --git a/sysdeps/x86_64/tst-auditmod7b.c b/sysdeps/x86_64/tst-auditmod7b.c index f27076d3bb..5abe6d1bc9 100644 --- a/sysdeps/x86_64/tst-auditmod7b.c +++ b/sysdeps/x86_64/tst-auditmod7b.c @@ -2,6 +2,8 @@ function parameter passing/return. */ #include <dlfcn.h> +#include <link.h> +#include <stddef.h> #include <stdint.h> #include <stdio.h> #include <stdlib.h> diff --git a/sysdeps/x86_64/tst-avx-aux.c b/sysdeps/x86_64/tst-avx-aux.c new file mode 100644 index 0000000000..e6ae368fd8 --- /dev/null +++ b/sysdeps/x86_64/tst-avx-aux.c @@ -0,0 +1,47 @@ +/* Test case for preserved AVX registers in dynamic linker, -mavx part. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <immintrin.h> +#include <stdlib.h> +#include <string.h> + +int +tst_avx_aux (void) +{ +#ifdef __AVX__ + extern __m256i avx_test (__m256i, __m256i, __m256i, __m256i, + __m256i, __m256i, __m256i, __m256i); + + __m256i ymm0 = _mm256_set1_epi32 (0); + __m256i ymm1 = _mm256_set1_epi32 (1); + __m256i ymm2 = _mm256_set1_epi32 (2); + __m256i ymm3 = _mm256_set1_epi32 (3); + __m256i ymm4 = _mm256_set1_epi32 (4); + __m256i ymm5 = _mm256_set1_epi32 (5); + __m256i ymm6 = _mm256_set1_epi32 (6); + __m256i ymm7 = _mm256_set1_epi32 (7); + __m256i ret = avx_test (ymm0, ymm1, ymm2, ymm3, + ymm4, ymm5, ymm6, ymm7); + ymm0 = _mm256_set1_epi32 (0x12349876); + if (memcmp (&ymm0, &ret, sizeof (ret))) + abort (); + return 0; +#else /* __AVX__ */ + return 77; +#endif /* __AVX__ */ +} diff --git a/sysdeps/x86_64/tst-avx.c b/sysdeps/x86_64/tst-avx.c new file mode 100644 index 0000000000..9c52fc264a --- /dev/null +++ b/sysdeps/x86_64/tst-avx.c @@ -0,0 +1,49 @@ +/* Test case for preserved AVX registers in dynamic linker. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <cpuid.h> + +int tst_avx_aux (void); + +static int +avx_enabled (void) +{ + unsigned int eax, ebx, ecx, edx; + + if (__get_cpuid (1, &eax, &ebx, &ecx, &edx) == 0 + || (ecx & (bit_AVX | bit_OSXSAVE)) != (bit_AVX | bit_OSXSAVE)) + return 0; + + /* Check the OS has AVX and SSE saving enabled. */ + asm ("xgetbv" : "=a" (eax), "=d" (edx) : "c" (0)); + + return (eax & 6) == 6; +} + +static int +do_test (void) +{ + /* Run AVX test only if AVX is supported. */ + if (avx_enabled ()) + return tst_avx_aux (); + else + return 77; +} + +#define TEST_FUNCTION do_test () +#include "../../test-skeleton.c" diff --git a/sysdeps/x86_64/tst-avx512-aux.c b/sysdeps/x86_64/tst-avx512-aux.c new file mode 100644 index 0000000000..87c4124398 --- /dev/null +++ b/sysdeps/x86_64/tst-avx512-aux.c @@ -0,0 +1,48 @@ +/* Test case for preserved AVX512 registers in dynamic linker, + -mavx512 part. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <immintrin.h> +#include <stdlib.h> +#include <string.h> + +int +tst_avx512_aux (void) +{ +#ifdef __AVX512F__ + extern __m512i avx512_test (__m512i, __m512i, __m512i, __m512i, + __m512i, __m512i, __m512i, __m512i); + + __m512i zmm0 = _mm512_set1_epi32 (0); + __m512i zmm1 = _mm512_set1_epi32 (1); + __m512i zmm2 = _mm512_set1_epi32 (2); + __m512i zmm3 = _mm512_set1_epi32 (3); + __m512i zmm4 = _mm512_set1_epi32 (4); + __m512i zmm5 = _mm512_set1_epi32 (5); + __m512i zmm6 = _mm512_set1_epi32 (6); + __m512i zmm7 = _mm512_set1_epi32 (7); + __m512i ret = avx512_test (zmm0, zmm1, zmm2, zmm3, + zmm4, zmm5, zmm6, zmm7); + zmm0 = _mm512_set1_epi32 (0x12349876); + if (memcmp (&zmm0, &ret, sizeof (ret))) + abort (); + return 0; +#else /* __AVX512F__ */ + return 77; +#endif /* __AVX512F__ */ +} diff --git a/sysdeps/x86_64/tst-avx512.c b/sysdeps/x86_64/tst-avx512.c new file mode 100644 index 0000000000..63d8bc9c27 --- /dev/null +++ b/sysdeps/x86_64/tst-avx512.c @@ -0,0 +1,57 @@ +/* Test case for preserved AVX512 registers in dynamic linker. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <cpuid.h> + +int tst_avx512_aux (void); + +static int +avx512_enabled (void) +{ +#ifdef bit_AVX512F + unsigned int eax, ebx, ecx, edx; + + if (__get_cpuid (1, &eax, &ebx, &ecx, &edx) == 0 + || (ecx & (bit_AVX | bit_OSXSAVE)) != (bit_AVX | bit_OSXSAVE)) + return 0; + + __cpuid_count (7, 0, eax, ebx, ecx, edx); + if (!(ebx & bit_AVX512F)) + return 0; + + asm ("xgetbv" : "=a" (eax), "=d" (edx) : "c" (0)); + + /* Verify that ZMM, YMM and XMM states are enabled. */ + return (eax & 0xe6) == 0xe6; +#else + return 0; +#endif +} + +static int +do_test (void) +{ + /* Run AVX512 test only if AVX512 is supported. */ + if (avx512_enabled ()) + return tst_avx512_aux (); + else + return 77; +} + +#define TEST_FUNCTION do_test () +#include "../../test-skeleton.c" diff --git a/sysdeps/x86_64/tst-avx512mod.c b/sysdeps/x86_64/tst-avx512mod.c new file mode 100644 index 0000000000..4cfb3a2c3d --- /dev/null +++ b/sysdeps/x86_64/tst-avx512mod.c @@ -0,0 +1,48 @@ +/* Test case for x86-64 preserved AVX512 registers in dynamic linker. */ + +#ifdef __AVX512F__ +#include <stdlib.h> +#include <string.h> +#include <immintrin.h> + +__m512i +avx512_test (__m512i x0, __m512i x1, __m512i x2, __m512i x3, + __m512i x4, __m512i x5, __m512i x6, __m512i x7) +{ + __m512i zmm; + + zmm = _mm512_set1_epi32 (0); + if (memcmp (&zmm, &x0, sizeof (zmm))) + abort (); + + zmm = _mm512_set1_epi32 (1); + if (memcmp (&zmm, &x1, sizeof (zmm))) + abort (); + + zmm = _mm512_set1_epi32 (2); + if (memcmp (&zmm, &x2, sizeof (zmm))) + abort (); + + zmm = _mm512_set1_epi32 (3); + if (memcmp (&zmm, &x3, sizeof (zmm))) + abort (); + + zmm = _mm512_set1_epi32 (4); + if (memcmp (&zmm, &x4, sizeof (zmm))) + abort (); + + zmm = _mm512_set1_epi32 (5); + if (memcmp (&zmm, &x5, sizeof (zmm))) + abort (); + + zmm = _mm512_set1_epi32 (6); + if (memcmp (&zmm, &x6, sizeof (zmm))) + abort (); + + zmm = _mm512_set1_epi32 (7); + if (memcmp (&zmm, &x7, sizeof (zmm))) + abort (); + + return _mm512_set1_epi32 (0x12349876); +} +#endif diff --git a/sysdeps/x86_64/tst-avxmod.c b/sysdeps/x86_64/tst-avxmod.c new file mode 100644 index 0000000000..6e5b154997 --- /dev/null +++ b/sysdeps/x86_64/tst-avxmod.c @@ -0,0 +1,48 @@ +/* Test case for x86-64 preserved AVX registers in dynamic linker. */ + +#ifdef __AVX__ +#include <stdlib.h> +#include <string.h> +#include <immintrin.h> + +__m256i +avx_test (__m256i x0, __m256i x1, __m256i x2, __m256i x3, + __m256i x4, __m256i x5, __m256i x6, __m256i x7) +{ + __m256i ymm; + + ymm = _mm256_set1_epi32 (0); + if (memcmp (&ymm, &x0, sizeof (ymm))) + abort (); + + ymm = _mm256_set1_epi32 (1); + if (memcmp (&ymm, &x1, sizeof (ymm))) + abort (); + + ymm = _mm256_set1_epi32 (2); + if (memcmp (&ymm, &x2, sizeof (ymm))) + abort (); + + ymm = _mm256_set1_epi32 (3); + if (memcmp (&ymm, &x3, sizeof (ymm))) + abort (); + + ymm = _mm256_set1_epi32 (4); + if (memcmp (&ymm, &x4, sizeof (ymm))) + abort (); + + ymm = _mm256_set1_epi32 (5); + if (memcmp (&ymm, &x5, sizeof (ymm))) + abort (); + + ymm = _mm256_set1_epi32 (6); + if (memcmp (&ymm, &x6, sizeof (ymm))) + abort (); + + ymm = _mm256_set1_epi32 (7); + if (memcmp (&ymm, &x7, sizeof (ymm))) + abort (); + + return _mm256_set1_epi32 (0x12349876); +} +#endif diff --git a/sysdeps/x86_64/tst-mallocalign1.c b/sysdeps/x86_64/tst-mallocalign1.c index 3897af86c1..0f2e725e3b 100644 --- a/sysdeps/x86_64/tst-mallocalign1.c +++ b/sysdeps/x86_64/tst-mallocalign1.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2012-2016 Free Software Foundation, Inc. +/* Copyright (C) 2012-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/tst-platform-1.c b/sysdeps/x86_64/tst-platform-1.c new file mode 100644 index 0000000000..91dbbb93db --- /dev/null +++ b/sysdeps/x86_64/tst-platform-1.c @@ -0,0 +1,29 @@ +/* Test PRELOAD with $PLATFORM. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <stdlib.h> + +extern int preload (void); + +static int +do_test (void) +{ + return preload () == 0x1234 ? EXIT_SUCCESS : EXIT_FAILURE; +} + +#include <support/test-driver.c> diff --git a/sysdeps/x86_64/tst-platformmod-1.c b/sysdeps/x86_64/tst-platformmod-1.c new file mode 100644 index 0000000000..be0e786e76 --- /dev/null +++ b/sysdeps/x86_64/tst-platformmod-1.c @@ -0,0 +1,23 @@ +/* Test PRELOAD with $PLATFORM. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +int +preload (void) +{ + return 0; +} diff --git a/sysdeps/x86_64/tst-platformmod-2.c b/sysdeps/x86_64/tst-platformmod-2.c new file mode 100644 index 0000000000..413d0bd94b --- /dev/null +++ b/sysdeps/x86_64/tst-platformmod-2.c @@ -0,0 +1,23 @@ +/* Test PRELOAD with $PLATFORM. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +int +preload (void) +{ + return 0x1234; +} diff --git a/sysdeps/x86_64/tst-quad1.c b/sysdeps/x86_64/tst-quad1.c index 1cb63a748f..089b25d2df 100644 --- a/sysdeps/x86_64/tst-quad1.c +++ b/sysdeps/x86_64/tst-quad1.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2012-2016 Free Software Foundation, Inc. +/* Copyright (C) 2012-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/tst-quadmod1.S b/sysdeps/x86_64/tst-quadmod1.S index 588c5016b6..c60f9dc89d 100644 --- a/sysdeps/x86_64/tst-quadmod1.S +++ b/sysdeps/x86_64/tst-quadmod1.S @@ -1,4 +1,4 @@ -/* Copyright (C) 2012-2016 Free Software Foundation, Inc. +/* Copyright (C) 2012-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -28,6 +28,9 @@ .type func, @function func: .cfi_startproc +#if defined __CET__ && (__CET__ & 1) != 0 + endbr64 +#endif xorl %edi, %edi jmp exit@PLT .cfi_endproc @@ -37,6 +40,9 @@ func: foo: .cfi_startproc .cfi_def_cfa_register 6 +#if defined __CET__ && (__CET__ & 1) != 0 + endbr64 +#endif movq .Ljmp(%rip), %rax subq $BIAS, %rax jmp *%rax diff --git a/sysdeps/x86_64/tst-quadmod2.S b/sysdeps/x86_64/tst-quadmod2.S index 7409a9eaa3..af03444d4f 100644 --- a/sysdeps/x86_64/tst-quadmod2.S +++ b/sysdeps/x86_64/tst-quadmod2.S @@ -1,4 +1,4 @@ -/* Copyright (C) 2012-2016 Free Software Foundation, Inc. +/* Copyright (C) 2012-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -27,6 +27,9 @@ .type func, @function func: .cfi_startproc +#if defined __CET__ && (__CET__ & 1) != 0 + endbr64 +#endif xorl %edi, %edi jmp exit@PLT .cfi_endproc @@ -36,6 +39,9 @@ func: foo: .cfi_startproc .cfi_def_cfa_register 6 +#if defined __CET__ && (__CET__ & 1) != 0 + endbr64 +#endif movq .Ljmp(%rip), %rax subq $BIAS, %rax jmp *%rax diff --git a/sysdeps/x86_64/tst-sse.c b/sysdeps/x86_64/tst-sse.c new file mode 100644 index 0000000000..d219889d1f --- /dev/null +++ b/sysdeps/x86_64/tst-sse.c @@ -0,0 +1,46 @@ +/* Test case for preserved SSE registers in dynamic linker. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <immintrin.h> +#include <stdlib.h> +#include <string.h> + +extern __m128i sse_test (__m128i, __m128i, __m128i, __m128i, + __m128i, __m128i, __m128i, __m128i); + +static int +do_test (void) +{ + __m128i xmm0 = _mm_set1_epi32 (0); + __m128i xmm1 = _mm_set1_epi32 (1); + __m128i xmm2 = _mm_set1_epi32 (2); + __m128i xmm3 = _mm_set1_epi32 (3); + __m128i xmm4 = _mm_set1_epi32 (4); + __m128i xmm5 = _mm_set1_epi32 (5); + __m128i xmm6 = _mm_set1_epi32 (6); + __m128i xmm7 = _mm_set1_epi32 (7); + __m128i ret = sse_test (xmm0, xmm1, xmm2, xmm3, + xmm4, xmm5, xmm6, xmm7); + xmm0 = _mm_set1_epi32 (0x12349876); + if (memcmp (&xmm0, &ret, sizeof (ret))) + abort (); + return 0; +} + +#define TEST_FUNCTION do_test () +#include "../../test-skeleton.c" diff --git a/sysdeps/x86_64/tst-ssemod.c b/sysdeps/x86_64/tst-ssemod.c new file mode 100644 index 0000000000..907a64c69e --- /dev/null +++ b/sysdeps/x86_64/tst-ssemod.c @@ -0,0 +1,46 @@ +/* Test case for x86-64 preserved SSE registers in dynamic linker. */ + +#include <stdlib.h> +#include <string.h> +#include <immintrin.h> + +__m128i +sse_test (__m128i x0, __m128i x1, __m128i x2, __m128i x3, + __m128i x4, __m128i x5, __m128i x6, __m128i x7) +{ + __m128i xmm; + + xmm = _mm_set1_epi32 (0); + if (memcmp (&xmm, &x0, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (1); + if (memcmp (&xmm, &x1, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (2); + if (memcmp (&xmm, &x2, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (3); + if (memcmp (&xmm, &x3, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (4); + if (memcmp (&xmm, &x4, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (5); + if (memcmp (&xmm, &x5, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (6); + if (memcmp (&xmm, &x6, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (7); + if (memcmp (&xmm, &x7, sizeof (xmm))) + abort (); + + return _mm_set1_epi32 (0x12349876); +} diff --git a/sysdeps/x86_64/tst-stack-align.h b/sysdeps/x86_64/tst-stack-align.h index 24e8e61c35..b2ef77f65d 100644 --- a/sysdeps/x86_64/tst-stack-align.h +++ b/sysdeps/x86_64/tst-stack-align.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2003-2016 Free Software Foundation, Inc. +/* Copyright (C) 2003-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/tst-x86_64-1.c b/sysdeps/x86_64/tst-x86_64-1.c new file mode 100644 index 0000000000..801c866bdd --- /dev/null +++ b/sysdeps/x86_64/tst-x86_64-1.c @@ -0,0 +1,26 @@ +/* Test searching the "x86_64" directory for shared libraries. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +extern void foo (void); + +int +main (void) +{ + foo (); + return 0; +} diff --git a/sysdeps/x86_64/tst-x86_64mod-1.c b/sysdeps/x86_64/tst-x86_64mod-1.c new file mode 100644 index 0000000000..57e955d5d9 --- /dev/null +++ b/sysdeps/x86_64/tst-x86_64mod-1.c @@ -0,0 +1,22 @@ +/* Test searching the "x86_64" directory for shared libraries. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +void +foo (void) +{ +} diff --git a/sysdeps/x86_64/wcschr.S b/sysdeps/x86_64/wcschr.S index 8604289e46..29284662a1 100644 --- a/sysdeps/x86_64/wcschr.S +++ b/sysdeps/x86_64/wcschr.S @@ -1,5 +1,5 @@ /* wcschr with SSSE3 - Copyright (C) 2011-2016 Free Software Foundation, Inc. + Copyright (C) 2011-2018 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. diff --git a/sysdeps/x86_64/wcscmp.S b/sysdeps/x86_64/wcscmp.S index 705a73b10e..0d506c8b5c 100644 --- a/sysdeps/x86_64/wcscmp.S +++ b/sysdeps/x86_64/wcscmp.S @@ -1,5 +1,5 @@ /* Optimized wcscmp for x86-64 with SSE2. - Copyright (C) 2011-2016 Free Software Foundation, Inc. + Copyright (C) 2011-2018 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. @@ -946,5 +946,7 @@ L(equal): ret END (__wcscmp) +#ifndef __wcscmp libc_hidden_def (__wcscmp) weak_alias (__wcscmp, wcscmp) +#endif diff --git a/sysdeps/x86_64/wcslen.S b/sysdeps/x86_64/wcslen.S index 7a9175eefe..9f5f723227 100644 --- a/sysdeps/x86_64/wcslen.S +++ b/sysdeps/x86_64/wcslen.S @@ -1,5 +1,5 @@ /* Optimized wcslen for x86-64 with SSE2. - Copyright (C) 2011-2016 Free Software Foundation, Inc. + Copyright (C) 2011-2018 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S index fb192f3ecf..2f38853727 100644 --- a/sysdeps/x86_64/wcsrchr.S +++ b/sysdeps/x86_64/wcsrchr.S @@ -1,5 +1,5 @@ /* wcsrchr with SSSE3 - Copyright (C) 2011-2016 Free Software Foundation, Inc. + Copyright (C) 2011-2018 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. diff --git a/sysdeps/x86_64/wmemset.S b/sysdeps/x86_64/wmemset.S new file mode 100644 index 0000000000..f96d567fd8 --- /dev/null +++ b/sysdeps/x86_64/wmemset.S @@ -0,0 +1 @@ +/* Implemented in memset.S. */ diff --git a/sysdeps/x86_64/wmemset_chk.S b/sysdeps/x86_64/wmemset_chk.S new file mode 100644 index 0000000000..9275ebb40d --- /dev/null +++ b/sysdeps/x86_64/wmemset_chk.S @@ -0,0 +1,33 @@ +/* Checking wmemset for x86-64. + Copyright (C) 2004-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "asm-syntax.h" + +#ifndef SHARED + /* For libc.so this is defined in wmemset.S. + For libc.a, this is a separate source to avoid + wmemset bringing in __chk_fail and all routines + it calls. */ + .text +ENTRY (__wmemset_chk) + cmpq %rdx, %rcx + jb __chk_fail + jmp wmemset +END (__wmemset_chk) +#endif diff --git a/sysdeps/x86_64/wordcopy.c b/sysdeps/x86_64/wordcopy.c new file mode 100644 index 0000000000..590b6cb16b --- /dev/null +++ b/sysdeps/x86_64/wordcopy.c @@ -0,0 +1 @@ +/* X86-64 doesn't use memory copy functions. */ diff --git a/sysdeps/x86_64/x32/dl-machine.h b/sysdeps/x86_64/x32/dl-machine.h index 47132fcd96..2a612913ff 100644 --- a/sysdeps/x86_64/x32/dl-machine.h +++ b/sysdeps/x86_64/x32/dl-machine.h @@ -1,5 +1,5 @@ /* Machine-dependent ELF dynamic relocation inline functions. x32 version. - Copyright (C) 2012-2016 Free Software Foundation, Inc. + Copyright (C) 2012-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/x86_64/x32/fpu/s_lrint.S b/sysdeps/x86_64/x32/fpu/s_lrint.S index aa68863553..381684583e 100644 --- a/sysdeps/x86_64/x32/fpu/s_lrint.S +++ b/sysdeps/x86_64/x32/fpu/s_lrint.S @@ -1,6 +1,6 @@ /* Round argument to nearest integral value according to current rounding direction. - Copyright (C) 2015-2016 Free Software Foundation, Inc. + Copyright (C) 2015-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -18,10 +18,11 @@ <http://www.gnu.org/licenses/>. */ #include <sysdep.h> +#include <libm-alias-double.h> .text ENTRY(__lrint) cvtsd2si %xmm0,%eax ret END(__lrint) -weak_alias (__lrint, lrint) +libm_alias_double (__lrint, lrint) diff --git a/sysdeps/x86_64/x32/fpu/s_lrintf.S b/sysdeps/x86_64/x32/fpu/s_lrintf.S index bb5b1665bd..361d34a989 100644 --- a/sysdeps/x86_64/x32/fpu/s_lrintf.S +++ b/sysdeps/x86_64/x32/fpu/s_lrintf.S @@ -1,6 +1,6 @@ /* Round argument to nearest integral value according to current rounding direction. - Copyright (C) 2015-2016 Free Software Foundation, Inc. + Copyright (C) 2015-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -18,10 +18,11 @@ <http://www.gnu.org/licenses/>. */ #include <sysdep.h> +#include <libm-alias-float.h> .text ENTRY(__lrintf) cvtss2si %xmm0,%eax ret END(__lrintf) -weak_alias (__lrintf, lrintf) +libm_alias_float (__lrint, lrint) diff --git a/sysdeps/x86_64/x32/fpu/s_lrintl.S b/sysdeps/x86_64/x32/fpu/s_lrintl.S index 6bc8f6fdb9..b68313e916 100644 --- a/sysdeps/x86_64/x32/fpu/s_lrintl.S +++ b/sysdeps/x86_64/x32/fpu/s_lrintl.S @@ -1,6 +1,6 @@ /* Round argument to nearest integral value according to current rounding direction. - Copyright (C) 1997-2016 Free Software Foundation, Inc. + Copyright (C) 1997-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -18,6 +18,7 @@ <http://www.gnu.org/licenses/>. */ #include <sysdep.h> +#include <libm-alias-ldouble.h> .text ENTRY(__lrintl) @@ -27,4 +28,4 @@ ENTRY(__lrintl) movl -4(%rsp),%eax ret END(__lrintl) -weak_alias (__lrintl, lrintl) +libm_alias_ldouble (__lrint, lrint) diff --git a/sysdeps/x86_64/x32/gmp-mparam.h b/sysdeps/x86_64/x32/gmp-mparam.h index df37442bfb..331c26d587 100644 --- a/sysdeps/x86_64/x32/gmp-mparam.h +++ b/sysdeps/x86_64/x32/gmp-mparam.h @@ -1,6 +1,6 @@ /* gmp-mparam.h -- Compiler/machine parameter header file. -Copyright (C) 2012-2016 Free Software Foundation, Inc. +Copyright (C) 2012-2018 Free Software Foundation, Inc. This file is part of the GNU MP Library. diff --git a/sysdeps/x86_64/x32/nptl/tls.h b/sysdeps/x86_64/x32/nptl/tls.h deleted file mode 100644 index 245623494b..0000000000 --- a/sysdeps/x86_64/x32/nptl/tls.h +++ /dev/null @@ -1,46 +0,0 @@ -/* Definition for thread-local data handling. nptl/x32 version. - Copyright (C) 2012-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#ifndef _X32_TLS_H -#define _X32_TLS_H 1 - -#include_next <tls.h> - -#ifndef __ASSEMBLER__ - -/* X32 doesn't support 32-bit indirect calls via memory. Instead, we - load the 32-bit address from memory into the lower 32 bits of the - return-value register, which will automatically zero-extend the upper - 32 bits of the return-value register. We then do the indirect call - via the 64-bit return-value register. */ -# undef CALL_THREAD_FCT -# define CALL_THREAD_FCT(descr) \ - ({ void *__res; \ - asm volatile ("movl %%fs:%P2, %%edi\n\t" \ - "movl %%fs:%P1, %k0\n\t" \ - "callq *%q0" \ - : "=a" (__res) \ - : "i" (offsetof (struct pthread, start_routine)), \ - "i" (offsetof (struct pthread, arg)) \ - : "di", "si", "cx", "dx", "r8", "r9", "r10", "r11", \ - "memory", "cc"); \ - __res; }) - -#endif /* __ASSEMBLER__ */ - -#endif /* x32/tls.h */ diff --git a/sysdeps/x86_64/x32/sysdep.h b/sysdeps/x86_64/x32/sysdep.h index 17a1446796..b40a438771 100644 --- a/sysdeps/x86_64/x32/sysdep.h +++ b/sysdeps/x86_64/x32/sysdep.h @@ -1,5 +1,5 @@ /* Assembler macros for x32. - Copyright (C) 2012-2016 Free Software Foundation, Inc. + Copyright (C) 2012-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or |