author	Samuel Thibault <samuel.thibault@ens-lyon.org>	2018-12-27 19:16:25 +0000
committer	Samuel Thibault <samuel.thibault@ens-lyon.org>	2018-12-27 19:16:25 +0000
commit	8d59503b977070aaa4e504e8d6dcb7da3711893e
tree	8272c9c2cce43afa4fe4d8d92c269a6435242661 /sysdeps/aarch64
parent	76a7dc16fab8853ef9230447fa98c70a3619dc6d
parent	bcea9593527d90b9f9ff3817e3fbf0fbc3d01fa7
Merge commit 'refs/top-bases/t/gsync-libc-merge' into t/gsync-libc-merge
Diffstat (limited to 'sysdeps/aarch64')
134 files changed, 3188 insertions, 2008 deletions
diff --git a/sysdeps/aarch64/Implies b/sysdeps/aarch64/Implies
index e5adf4d63c..a1d5e2e742 100644
--- a/sysdeps/aarch64/Implies
+++ b/sysdeps/aarch64/Implies
@@ -3,4 +3,3 @@ ieee754/ldbl-128
 ieee754/dbl-64/wordsize-64
 ieee754/dbl-64
 ieee754/flt-32
-aarch64/soft-fp
diff --git a/sysdeps/aarch64/Makefile b/sysdeps/aarch64/Makefile
index 06323550e9..94baaf52dd 100644
--- a/sysdeps/aarch64/Makefile
+++ b/sysdeps/aarch64/Makefile
@@ -1,9 +1,5 @@
 long-double-fcts = yes
-ifeq ($(subdir),debug)
-CFLAGS-backtrace.c += -funwind-tables
-endif
-
 ifeq ($(subdir),elf)
 sysdep-dl-routines += tlsdesc dl-tlsdesc
 gen-as-const-headers += dl-link.sym
@@ -12,3 +8,11 @@ endif
 ifeq ($(subdir),csu)
 gen-as-const-headers += tlsdesc.sym
 endif
+
+ifeq ($(subdir),gmon)
+CFLAGS-mcount.c += -mgeneral-regs-only
+endif
+
+ifeq ($(subdir),math)
+CPPFLAGS += -I../soft-fp
+endif
diff --git a/sysdeps/aarch64/__longjmp.S b/sysdeps/aarch64/__longjmp.S
index 65116be557..3365bd70f9 100644
--- a/sysdeps/aarch64/__longjmp.S
+++ b/sysdeps/aarch64/__longjmp.S
@@ -1,4 +1,4 @@
-/* Copyright (C) 1997-2016 Free Software Foundation, Inc.
+/* Copyright (C) 1997-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
@@ -46,6 +46,8 @@ ENTRY (__longjmp)
 	cfi_offset(d14, JB_D14<<3)
 	cfi_offset(d15, JB_D15<<3)
+	DELOUSE (0)
+
 	ldp	x19, x20, [x0, #JB_X19<<3]
 	ldp	x21, x22, [x0, #JB_X21<<3]
 	ldp	x23, x24, [x0, #JB_X23<<3]
@@ -53,7 +55,7 @@ ENTRY (__longjmp)
 	ldp	x27, x28, [x0, #JB_X27<<3]
 #ifdef PTR_DEMANGLE
 	ldp	x29, x4, [x0, #JB_X29<<3]
-	PTR_DEMANGLE (x30, x4, x3, x2)
+	PTR_DEMANGLE (30, 4, 3, 2)
 #else
 	ldp	x29, x30, [x0, #JB_X29<<3]
 #endif
@@ -98,7 +100,7 @@ ENTRY (__longjmp)
 	cfi_same_value(d15)
 #ifdef PTR_DEMANGLE
 	ldr	x4, [x0, #JB_SP<<3]
-	PTR_DEMANGLE (x5, x4, x3, x2)
+	PTR_DEMANGLE (5, 4, 3, 2)
 #else
 	ldr	x5, [x0, #JB_SP<<3]
 #endif
diff --git a/sysdeps/aarch64/atomic-machine.h b/sysdeps/aarch64/atomic-machine.h
index 28c80dc42d..63b24e625c 100644
--- a/sysdeps/aarch64/atomic-machine.h
+++ b/sysdeps/aarch64/atomic-machine.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2003-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2003-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
@@ -38,6 +38,7 @@ typedef uintmax_t uatomic_max_t;
 #define __HAVE_64B_ATOMICS 1
 #define USE_ATOMIC_COMPILER_BUILTINS 1
+#define ATOMIC_EXCHANGE_USES_CAS 0
 
 /* Compare and exchange.
    For all "bool" routines, we return FALSE if exchange succesful.  */
@@ -115,10 +116,6 @@ typedef uintmax_t uatomic_max_t;
 
 /* Compare and exchange with "release" semantics, ie barrier before.  */
 
-# define atomic_compare_and_exchange_bool_rel(mem, new, old) \
-  __atomic_bool_bysize (__arch_compare_and_exchange_bool, int, \
-			mem, new, old, __ATOMIC_RELEASE)
-
 # define atomic_compare_and_exchange_val_rel(mem, new, old) \
   __atomic_val_bysize (__arch_compare_and_exchange_val, int, \
		       mem, new, old, __ATOMIC_RELEASE)
diff --git a/sysdeps/aarch64/backtrace.c b/sysdeps/aarch64/backtrace.c
deleted file mode 100644
index 27ce597b39..0000000000
--- a/sysdeps/aarch64/backtrace.c
+++ /dev/null
@@ -1 +0,0 @@
-#include <sysdeps/x86_64/backtrace.c>
diff --git a/sysdeps/aarch64/bits/endian.h b/sysdeps/aarch64/bits/endian.h
index a32ebc0d80..1f18733422 100644
--- a/sysdeps/aarch64/bits/endian.h
+++ b/sysdeps/aarch64/bits/endian.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 1997-2016 Free Software Foundation, Inc.
+/* Copyright (C) 1997-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
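Before the diff continues, the atomic-machine.h hunk above is worth a gloss: the header now leans entirely on the GCC `__atomic` builtins (`USE_ATOMIC_COMPILER_BUILTINS`) and declares `ATOMIC_EXCHANGE_USES_CAS 0`, meaning atomic exchange has its own instruction path on AArch64 rather than being synthesized from compare-and-swap. A minimal sketch of the builtin calls involved — the `demo_*` names are ours, not glibc's internals:

```c
#include <stdio.h>

/* Roughly what glibc's __atomic_*_bysize wrappers boil down to once
   the compiler builtins are in use.  */
static inline int
demo_cas_acquire (int *mem, int *expected, int desired)
{
  /* Returns nonzero on success; note that glibc's "bool" variants
     invert this, returning FALSE when the exchange succeeded.  */
  return __atomic_compare_exchange_n (mem, expected, desired,
                                      0 /* strong */,
                                      __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
}

static inline int
demo_exchange_acquire (int *mem, int newval)
{
  /* With ATOMIC_EXCHANGE_USES_CAS == 0 this can lower to a native
     swap (or an LDAXR/STXR pair) rather than a CAS retry loop.  */
  return __atomic_exchange_n (mem, newval, __ATOMIC_ACQUIRE);
}

int
main (void)
{
  int v = 1, expected = 1;
  demo_cas_acquire (&v, &expected, 2);
  printf ("after cas: %d, swapped-out value: %d\n",
          v, demo_exchange_acquire (&v, 3));
  return 0;
}
```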
diff --git a/sysdeps/aarch64/bits/fenv.h b/sysdeps/aarch64/bits/fenv.h
index aced0d33b2..8a0f481f79 100644
--- a/sysdeps/aarch64/bits/fenv.h
+++ b/sysdeps/aarch64/bits/fenv.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2004-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2004-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
@@ -72,3 +72,11 @@ fenv_t;
 /* Floating-point environment where none of the exceptions are masked.  */
 # define FE_NOMASK_ENV	((const fenv_t *) -2)
 #endif
+
+#if __GLIBC_USE (IEC_60559_BFP_EXT)
+/* Type representing floating-point control modes.  */
+typedef unsigned int femode_t;
+
+/* Default floating-point control modes.  */
+# define FE_DFL_MODE	((const femode_t *) -1L)
+#endif
diff --git a/sysdeps/aarch64/fpu/s_frint.c b/sysdeps/aarch64/bits/fp-fast.h
index 321c8ef50d..3ce12210b7 100644
--- a/sysdeps/aarch64/fpu/s_frint.c
+++ b/sysdeps/aarch64/bits/fp-fast.h
@@ -1,5 +1,5 @@
-/* Copyright (C) 1996-2016 Free Software Foundation, Inc.
-
+/* Define FP_FAST_* macros.  AArch64 version.
+   Copyright (C) 2016-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
 
   The GNU C Library is free software; you can redistribute it and/or
@@ -16,34 +16,19 @@
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
 
-#include <math.h>
-
-#ifndef FUNC
-# error FUNC not defined
-#endif
-
-#ifndef TYPE
-# define TYPE double
-# define REGS "d"
-#else
-# ifndef REGS
-#  error REGS not defined
-# endif
+#ifndef _MATH_H
+# error "Never use <bits/fp-fast.h> directly; include <math.h> instead."
 #endif
 
-#ifndef INSN
-# error INSN not defined
-#endif
+#ifdef __USE_ISOC99
 
-#define __CONCATX(a,b) __CONCAT(a,b)
+/* The GCC 4.6 compiler will define __FP_FAST_FMA{,F,L} if the fma{,f,l}
+   builtins are supported.  */
+# define FP_FAST_FMA 1
+# define FP_FAST_FMAF 1
 
-TYPE
-__CONCATX(__,FUNC) (TYPE x)
-{
-  TYPE result;
-  asm ( INSN "\t%" REGS "0, %" REGS "1" :
-        "=w" (result) : "w" (x) );
-  return result;
-}
+# ifdef __FP_FAST_FMAL
+#  define FP_FAST_FMAL 1
+# endif
 
-weak_alias (__CONCATX(__,FUNC), FUNC)
+#endif
diff --git a/sysdeps/aarch64/bits/link.h b/sysdeps/aarch64/bits/link.h
index 0bf961519e..5a7fc1ccd4 100644
--- a/sysdeps/aarch64/bits/link.h
+++ b/sysdeps/aarch64/bits/link.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2005-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2005-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
diff --git a/sysdeps/aarch64/bits/setjmp.h b/sysdeps/aarch64/bits/setjmp.h
index fff8616c60..1685f1076a 100644
--- a/sysdeps/aarch64/bits/setjmp.h
+++ b/sysdeps/aarch64/bits/setjmp.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 1997-2016 Free Software Foundation, Inc.
+/* Copyright (C) 1997-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
diff --git a/sysdeps/aarch64/bits/string.h b/sysdeps/aarch64/bits/wordsize.h
index 0a508f72c0..ea7eecdf92 100644
--- a/sysdeps/aarch64/bits/string.h
+++ b/sysdeps/aarch64/bits/wordsize.h
@@ -1,5 +1,6 @@
-/* Optimized, inlined string functions.  AArch64 version.
-   Copyright (C) 2015-2016 Free Software Foundation, Inc.
+/* Determine the wordsize from the preprocessor defines.
+
+   Copyright (C) 2016-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
 
   The GNU C Library is free software; you can redistribute it and/or
@@ -16,9 +17,12 @@
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
 
-#ifndef _STRING_H
-# error "Never use <bits/string.h> directly; include <string.h> instead."
+#ifdef __LP64__
+# define __WORDSIZE 64
+#else
+# define __WORDSIZE 32
+# define __WORDSIZE32_SIZE_ULONG 1
+# define __WORDSIZE32_PTRDIFF_LONG 1
 #endif
 
-/* AArch64 uses the aligned string inline ABI.  */
-#define _STRING_INLINE_unaligned 0
+#define __WORDSIZE_TIME64_COMPAT32 0
diff --git a/sysdeps/aarch64/crti.S b/sysdeps/aarch64/crti.S
index 53ccb42587..2b213758b2 100644
--- a/sysdeps/aarch64/crti.S
+++ b/sysdeps/aarch64/crti.S
@@ -1,5 +1,5 @@
 /* Special .init and .fini section support for AArch64.
-   Copyright (C) 1995-2016 Free Software Foundation, Inc.
+   Copyright (C) 1995-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
@@ -39,6 +39,7 @@
   they can be called as functions.  The symbols _init and _fini are
   magic and cause the linker to emit DT_INIT and DT_FINI.  */
 
+#include <sysdep.h>
 #include <libc-symbols.h>
 
 #ifndef PREINIT_FUNCTION
@@ -60,7 +61,7 @@
 	.type call_weak_fn, %function
 call_weak_fn:
 	adrp	x0, :got:PREINIT_FUNCTION
-	ldr	x0, [x0, #:got_lo12:PREINIT_FUNCTION]
+	ldr	PTR_REG (0), [x0, #:got_lo12:PREINIT_FUNCTION]
 	cbz	x0, 1f
 	b	PREINIT_FUNCTION
 1:
@@ -71,6 +72,7 @@ call_weak_fn:
 	.section .init,"ax",%progbits
 	.align 2
 	.global _init
+	.hidden _init
 	.type _init, %function
 _init:
 	stp	x29, x30, [sp, -16]!
@@ -84,6 +86,7 @@ _init:
 	.section .fini,"ax",%progbits
 	.align 2
 	.global _fini
+	.hidden _fini
 	.type _fini, %function
 _fini:
 	stp	x29, x30, [sp, -16]!
diff --git a/sysdeps/aarch64/crtn.S b/sysdeps/aarch64/crtn.S
index 33d5f3d822..d72300af80 100644
--- a/sysdeps/aarch64/crtn.S
+++ b/sysdeps/aarch64/crtn.S
@@ -1,5 +1,5 @@
 /* Special .init and .fini section support for AArch64.
-   Copyright (C) 1995-2016 Free Software Foundation, Inc.
+   Copyright (C) 1995-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
diff --git a/sysdeps/aarch64/dl-irel.h b/sysdeps/aarch64/dl-irel.h
index 63a8e506eb..5889ee187b 100644
--- a/sysdeps/aarch64/dl-irel.h
+++ b/sysdeps/aarch64/dl-irel.h
@@ -1,6 +1,6 @@
 /* Machine-dependent ELF indirect relocation inline functions.
    AArch64 version.
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
+   Copyright (C) 2012-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
 
   The GNU C Library is free software; you can redistribute it and/or
@@ -23,6 +23,7 @@
 #include <stdio.h>
 #include <unistd.h>
 #include <ldsodefs.h>
+#include <sysdep.h>
 
 #define ELF_MACHINE_IRELA 1
 
@@ -30,7 +31,7 @@ static inline ElfW(Addr)
 __attribute ((always_inline))
 elf_ifunc_invoke (ElfW(Addr) addr)
 {
-  return ((ElfW(Addr) (*) (unsigned long int)) (addr)) (GLRO(dl_hwcap));
+  return ((ElfW(Addr) (*) (uint64_t)) (addr)) (GLRO(dl_hwcap));
 }
 
 static inline void
@@ -40,7 +41,7 @@ elf_irela (const ElfW(Rela) *reloc)
   ElfW(Addr) *const reloc_addr = (void *) reloc->r_offset;
   const unsigned long int r_type = ELFW(R_TYPE) (reloc->r_info);
 
-  if (__glibc_likely (r_type == R_AARCH64_IRELATIVE))
+  if (__glibc_likely (r_type == AARCH64_R(IRELATIVE)))
     {
       ElfW(Addr) value = elf_ifunc_invoke (reloc->r_addend);
       *reloc_addr = value;
diff --git a/sysdeps/aarch64/dl-machine.h b/sysdeps/aarch64/dl-machine.h
index 282805e396..4935aa7c54 100644
--- a/sysdeps/aarch64/dl-machine.h
+++ b/sysdeps/aarch64/dl-machine.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 1995-2016 Free Software Foundation, Inc.
+/* Copyright (C) 1995-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
@@ -21,9 +21,11 @@ #define ELF_MACHINE_NAME "aarch64" +#include <sysdep.h> #include <tls.h> #include <dl-tlsdesc.h> #include <dl-irel.h> +#include <cpu-features.c> /* Return nonzero iff ELF header is compatible with the running host. */ static inline int __attribute__ ((unused)) @@ -49,26 +51,11 @@ elf_machine_load_address (void) /* To figure out the load address we use the definition that for any symbol: dynamic_addr(symbol) = static_addr(symbol) + load_addr - The choice of symbol is arbitrary. The static address we obtain - by constructing a non GOT reference to the symbol, the dynamic - address of the symbol we compute using adrp/add to compute the - symbol's address relative to the PC. - This depends on 32bit relocations being resolved at link time - and that the static address fits in the 32bits. */ - - ElfW(Addr) static_addr; - ElfW(Addr) dynamic_addr; - - asm (" \n" -" adrp %1, _dl_start; \n" -" add %1, %1, #:lo12:_dl_start \n" -" ldr %w0, 1f \n" -" b 2f \n" -"1: \n" -" .word _dl_start \n" -"2: \n" - : "=r" (static_addr), "=r" (dynamic_addr)); - return dynamic_addr - static_addr; + _DYNAMIC sysmbol is used here as its link-time address stored in + the special unrelocated first GOT entry. */ + + extern ElfW(Dyn) _DYNAMIC[] attribute_hidden; + return (ElfW(Addr)) &_DYNAMIC - elf_machine_dynamic (); } /* Set up the loaded object described by L so its unrelocated PLT @@ -115,97 +102,117 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile) } } - if (l->l_info[ADDRIDX (DT_TLSDESC_GOT)] && lazy) - *(ElfW(Addr)*)(D_PTR (l, l_info[ADDRIDX (DT_TLSDESC_GOT)]) + l->l_addr) - = (ElfW(Addr)) &_dl_tlsdesc_resolve_rela; - return lazy; } /* Initial entry point for the dynamic linker. The C function _dl_start is the real entry point, its return value is the user program's entry point */ +#ifdef __LP64__ +# define RTLD_START RTLD_START_1 ("x", "3", "sp") +#else +# define RTLD_START RTLD_START_1 ("w", "2", "wsp") +#endif -#define RTLD_START asm ("\ -.text \n\ -.globl _start \n\ -.type _start, %function \n\ -.globl _dl_start_user \n\ -.type _dl_start_user, %function \n\ -_start: \n\ - mov x0, sp \n\ - bl _dl_start \n\ - // returns user entry point in x0 \n\ - mov x21, x0 \n\ -_dl_start_user: \n\ - // get the original arg count \n\ - ldr x1, [sp] \n\ - // get the argv address \n\ - add x2, sp, #8 \n\ - // get _dl_skip_args to see if we were \n\ - // invoked as an executable \n\ - adrp x4, _dl_skip_args \n\ - ldr w4, [x4, #:lo12:_dl_skip_args] \n\ - // do we need to adjust argc/argv \n\ - cmp w4, 0 \n\ - beq .L_done_stack_adjust \n\ - // subtract _dl_skip_args from original arg count \n\ - sub x1, x1, x4 \n\ - // store adjusted argc back to stack \n\ - str x1, [sp] \n\ - // find the first unskipped argument \n\ - mov x3, x2 \n\ - add x4, x2, x4, lsl #3 \n\ - // shuffle argv down \n\ -1: ldr x5, [x4], #8 \n\ - str x5, [x3], #8 \n\ - cmp x5, #0 \n\ - bne 1b \n\ - // shuffle envp down \n\ -1: ldr x5, [x4], #8 \n\ - str x5, [x3], #8 \n\ - cmp x5, #0 \n\ - bne 1b \n\ - // shuffle auxv down \n\ -1: ldp x0, x5, [x4, #16]! 
\n\ - stp x0, x5, [x3], #16 \n\ - cmp x0, #0 \n\ - bne 1b \n\ - // Update _dl_argv \n\ - adrp x3, _dl_argv \n\ - str x2, [x3, #:lo12:_dl_argv] \n\ -.L_done_stack_adjust: \n\ - // compute envp \n\ - add x3, x2, x1, lsl #3 \n\ - add x3, x3, #8 \n\ - adrp x16, _rtld_local \n\ - add x16, x16, #:lo12:_rtld_local \n\ - ldr x0, [x16] \n\ - bl _dl_init \n\ - // load the finalizer function \n\ - adrp x0, _dl_fini \n\ - add x0, x0, #:lo12:_dl_fini \n\ - // jump to the user_s entry point \n\ - br x21 \n\ + +#define RTLD_START_1(PTR, PTR_SIZE_LOG, PTR_SP) asm ("\ +.text \n\ +.globl _start \n\ +.type _start, %function \n\ +.globl _dl_start_user \n\ +.type _dl_start_user, %function \n\ +_start: \n\ + mov " PTR "0, " PTR_SP " \n\ + bl _dl_start \n\ + // returns user entry point in x0 \n\ + mov x21, x0 \n\ +_dl_start_user: \n\ + // get the original arg count \n\ + ldr " PTR "1, [sp] \n\ + // get the argv address \n\ + add " PTR "2, " PTR_SP ", #(1<<" PTR_SIZE_LOG ") \n\ + // get _dl_skip_args to see if we were \n\ + // invoked as an executable \n\ + adrp x4, _dl_skip_args \n\ + ldr w4, [x4, #:lo12:_dl_skip_args] \n\ + // do we need to adjust argc/argv \n\ + cmp w4, 0 \n\ + beq .L_done_stack_adjust \n\ + // subtract _dl_skip_args from original arg count \n\ + sub " PTR "1, " PTR "1, " PTR "4 \n\ + // store adjusted argc back to stack \n\ + str " PTR "1, [sp] \n\ + // find the first unskipped argument \n\ + mov " PTR "3, " PTR "2 \n\ + add " PTR "4, " PTR "2, " PTR "4, lsl #" PTR_SIZE_LOG " \n\ + // shuffle argv down \n\ +1: ldr " PTR "5, [x4], #(1<<" PTR_SIZE_LOG ") \n\ + str " PTR "5, [x3], #(1<<" PTR_SIZE_LOG ") \n\ + cmp " PTR "5, #0 \n\ + bne 1b \n\ + // shuffle envp down \n\ +1: ldr " PTR "5, [x4], #(1<<" PTR_SIZE_LOG ") \n\ + str " PTR "5, [x3], #(1<<" PTR_SIZE_LOG ") \n\ + cmp " PTR "5, #0 \n\ + bne 1b \n\ + // shuffle auxv down \n\ +1: ldp " PTR "0, " PTR "5, [x4, #(2<<" PTR_SIZE_LOG ")]! 
\n\ + stp " PTR "0, " PTR "5, [x3], #(2<<" PTR_SIZE_LOG ") \n\ + cmp " PTR "0, #0 \n\ + bne 1b \n\ + // Update _dl_argv \n\ + adrp x3, __GI__dl_argv \n\ + str " PTR "2, [x3, #:lo12:__GI__dl_argv] \n\ +.L_done_stack_adjust: \n\ + // compute envp \n\ + add " PTR "3, " PTR "2, " PTR "1, lsl #" PTR_SIZE_LOG " \n\ + add " PTR "3, " PTR "3, #(1<<" PTR_SIZE_LOG ") \n\ + adrp x16, _rtld_local \n\ + add " PTR "16, " PTR "16, #:lo12:_rtld_local \n\ + ldr " PTR "0, [x16] \n\ + bl _dl_init \n\ + // load the finalizer function \n\ + adrp x0, _dl_fini \n\ + add " PTR "0, " PTR "0, #:lo12:_dl_fini \n\ + // jump to the user_s entry point \n\ + br x21 \n\ "); #define elf_machine_type_class(type) \ - ((((type) == R_AARCH64_JUMP_SLOT || \ - (type) == R_AARCH64_TLS_DTPMOD || \ - (type) == R_AARCH64_TLS_DTPREL || \ - (type) == R_AARCH64_TLS_TPREL || \ - (type) == R_AARCH64_TLSDESC) * ELF_RTYPE_CLASS_PLT) \ - | (((type) == R_AARCH64_COPY) * ELF_RTYPE_CLASS_COPY) \ - | (((type) == R_AARCH64_GLOB_DAT) * ELF_RTYPE_CLASS_EXTERN_PROTECTED_DATA)) + ((((type) == AARCH64_R(JUMP_SLOT) \ + || (type) == AARCH64_R(TLS_DTPMOD) \ + || (type) == AARCH64_R(TLS_DTPREL) \ + || (type) == AARCH64_R(TLS_TPREL) \ + || (type) == AARCH64_R(TLSDESC)) * ELF_RTYPE_CLASS_PLT) \ + | (((type) == AARCH64_R(COPY)) * ELF_RTYPE_CLASS_COPY) \ + | (((type) == AARCH64_R(GLOB_DAT)) * ELF_RTYPE_CLASS_EXTERN_PROTECTED_DATA)) -#define ELF_MACHINE_JMP_SLOT R_AARCH64_JUMP_SLOT +#define ELF_MACHINE_JMP_SLOT AARCH64_R(JUMP_SLOT) /* AArch64 uses RELA not REL */ #define ELF_MACHINE_NO_REL 1 #define ELF_MACHINE_NO_RELA 0 +#define DL_PLATFORM_INIT dl_platform_init () + +static inline void __attribute__ ((unused)) +dl_platform_init (void) +{ + if (GLRO(dl_platform) != NULL && *GLRO(dl_platform) == '\0') + /* Avoid an empty string which would disturb us. */ + GLRO(dl_platform) = NULL; + +#ifdef SHARED + /* init_cpu_features has been called early from __libc_start_main in + static executable. */ + init_cpu_features (&GLRO(dl_aarch64_cpu_features)); +#endif +} + + static inline ElfW(Addr) elf_machine_fixup_plt (struct link_map *map, lookup_t t, + const ElfW(Sym) *refsym, const ElfW(Sym) *sym, const ElfW(Rela) *reloc, ElfW(Addr) *reloc_addr, ElfW(Addr) value) @@ -237,9 +244,9 @@ elf_machine_rela (struct link_map *map, const ElfW(Rela) *reloc, void *const reloc_addr_arg, int skip_ifunc) { ElfW(Addr) *const reloc_addr = reloc_addr_arg; - const unsigned int r_type = ELF64_R_TYPE (reloc->r_info); + const unsigned int r_type = ELFW (R_TYPE) (reloc->r_info); - if (__builtin_expect (r_type == R_AARCH64_RELATIVE, 0)) + if (__builtin_expect (r_type == AARCH64_R(RELATIVE), 0)) *reloc_addr = map->l_addr + reloc->r_addend; else if (__builtin_expect (r_type == R_AARCH64_NONE, 0)) return; @@ -247,7 +254,7 @@ elf_machine_rela (struct link_map *map, const ElfW(Rela) *reloc, { const ElfW(Sym) *const refsym = sym; struct link_map *sym_map = RESOLVE_MAP (&sym, version, r_type); - ElfW(Addr) value = sym_map == NULL ? 
0 : sym_map->l_addr + sym->st_value; + ElfW(Addr) value = SYMBOL_ADDRESS (sym_map, sym, true); if (sym != NULL && __glibc_unlikely (ELFW(ST_TYPE) (sym->st_info) == STT_GNU_IFUNC) @@ -257,7 +264,7 @@ elf_machine_rela (struct link_map *map, const ElfW(Rela) *reloc, switch (r_type) { - case R_AARCH64_COPY: + case AARCH64_R(COPY): if (sym == NULL) break; @@ -272,18 +279,21 @@ elf_machine_rela (struct link_map *map, const ElfW(Rela) *reloc, RTLD_PROGNAME, strtab + refsym->st_name); } memcpy (reloc_addr_arg, (void *) value, - MIN (sym->st_size, refsym->st_size)); + sym->st_size < refsym->st_size + ? sym->st_size : refsym->st_size); break; - case R_AARCH64_RELATIVE: - case R_AARCH64_GLOB_DAT: - case R_AARCH64_JUMP_SLOT: - case R_AARCH64_ABS32: - case R_AARCH64_ABS64: + case AARCH64_R(RELATIVE): + case AARCH64_R(GLOB_DAT): + case AARCH64_R(JUMP_SLOT): + case AARCH64_R(ABS32): +#ifdef __LP64__ + case AARCH64_R(ABS64): +#endif *reloc_addr = value + reloc->r_addend; break; - case R_AARCH64_TLSDESC: + case AARCH64_R(TLSDESC): { struct tlsdesc volatile *td = (struct tlsdesc volatile *)reloc_addr; @@ -318,7 +328,7 @@ elf_machine_rela (struct link_map *map, const ElfW(Rela) *reloc, break; } - case R_AARCH64_TLS_DTPMOD: + case AARCH64_R(TLS_DTPMOD): #ifdef RTLD_BOOTSTRAP *reloc_addr = 1; #else @@ -329,12 +339,12 @@ elf_machine_rela (struct link_map *map, const ElfW(Rela) *reloc, #endif break; - case R_AARCH64_TLS_DTPREL: + case AARCH64_R(TLS_DTPREL): if (sym) *reloc_addr = sym->st_value + reloc->r_addend; break; - case R_AARCH64_TLS_TPREL: + case AARCH64_R(TLS_TPREL): if (sym) { CHECK_STATIC_TLS (map, sym_map); @@ -343,7 +353,7 @@ elf_machine_rela (struct link_map *map, const ElfW(Rela) *reloc, } break; - case R_AARCH64_IRELATIVE: + case AARCH64_R(IRELATIVE): value = map->l_addr + reloc->r_addend; value = elf_ifunc_invoke (value); *reloc_addr = value; @@ -374,25 +384,34 @@ elf_machine_lazy_rel (struct link_map *map, int skip_ifunc) { ElfW(Addr) *const reloc_addr = (void *) (l_addr + reloc->r_offset); - const unsigned int r_type = ELF64_R_TYPE (reloc->r_info); + const unsigned int r_type = ELFW (R_TYPE) (reloc->r_info); /* Check for unexpected PLT reloc type. */ - if (__builtin_expect (r_type == R_AARCH64_JUMP_SLOT, 1)) + if (__builtin_expect (r_type == AARCH64_R(JUMP_SLOT), 1)) { if (__builtin_expect (map->l_mach.plt, 0) == 0) *reloc_addr += l_addr; else *reloc_addr = map->l_mach.plt; } - else if (__builtin_expect (r_type == R_AARCH64_TLSDESC, 1)) + else if (__builtin_expect (r_type == AARCH64_R(TLSDESC), 1)) { - struct tlsdesc volatile *td = - (struct tlsdesc volatile *)reloc_addr; + const Elf_Symndx symndx = ELFW (R_SYM) (reloc->r_info); + const ElfW (Sym) *symtab = (const void *)D_PTR (map, l_info[DT_SYMTAB]); + const ElfW (Sym) *sym = &symtab[symndx]; + const struct r_found_version *version = NULL; + + if (map->l_info[VERSYMIDX (DT_VERSYM)] != NULL) + { + const ElfW (Half) *vernum = + (const void *)D_PTR (map, l_info[VERSYMIDX (DT_VERSYM)]); + version = &map->l_versions[vernum[symndx] & 0x7fff]; + } - td->arg = (void*)reloc; - td->entry = (void*)(D_PTR (map, l_info[ADDRIDX (DT_TLSDESC_PLT)]) - + map->l_addr); + /* Always initialize TLS descriptors completely, because lazy + initialization requires synchronization at every TLS access. 
*/ + elf_machine_rela (map, reloc, sym, version, reloc_addr, skip_ifunc); } - else if (__glibc_unlikely (r_type == R_AARCH64_IRELATIVE)) + else if (__glibc_unlikely (r_type == AARCH64_R(IRELATIVE))) { ElfW(Addr) value = map->l_addr + reloc->r_addend; if (__glibc_likely (!skip_ifunc)) diff --git a/sysdeps/aarch64/dl-sysdep.h b/sysdeps/aarch64/dl-sysdep.h index 9c05d4b0dc..a0e6a2eed1 100644 --- a/sysdeps/aarch64/dl-sysdep.h +++ b/sysdeps/aarch64/dl-sysdep.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2002-2016 Free Software Foundation, Inc. +/* Copyright (C) 2002-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/dl-tls.h b/sysdeps/aarch64/dl-tls.h index 71265c5341..153fded939 100644 --- a/sysdeps/aarch64/dl-tls.h +++ b/sysdeps/aarch64/dl-tls.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2005-2016 Free Software Foundation, Inc. +/* Copyright (C) 2005-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -25,6 +25,3 @@ typedef struct extern void *__tls_get_addr (tls_index *ti); - -/* Value used for dtv entries for which the allocation is delayed. */ -#define TLS_DTV_UNALLOCATED ((void *) -1l) diff --git a/sysdeps/aarch64/dl-tlsdesc.S b/sysdeps/aarch64/dl-tlsdesc.S index 05be370abb..43a62ef307 100644 --- a/sysdeps/aarch64/dl-tlsdesc.S +++ b/sysdeps/aarch64/dl-tlsdesc.S @@ -1,6 +1,6 @@ /* Thread-local storage handling in the ELF dynamic linker. AArch64 version. - Copyright (C) 2011-2016 Free Software Foundation, Inc. + Copyright (C) 2011-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -74,34 +74,12 @@ cfi_startproc .align 2 _dl_tlsdesc_return: - ldr x0, [x0, #8] + DELOUSE (0) + ldr PTR_REG (0), [x0, #PTR_SIZE] RET cfi_endproc .size _dl_tlsdesc_return, .-_dl_tlsdesc_return - /* Same as _dl_tlsdesc_return but with synchronization for - lazy relocation. - Prototype: - _dl_tlsdesc_return_lazy (tlsdesc *) ; - */ - .hidden _dl_tlsdesc_return_lazy - .global _dl_tlsdesc_return_lazy - .type _dl_tlsdesc_return_lazy,%function - cfi_startproc - .align 2 -_dl_tlsdesc_return_lazy: - /* The ldar here happens after the load from [x0] at the call site - (that is generated by the compiler as part of the TLS access ABI), - so it reads the same value (this function is the final value of - td->entry) and thus it synchronizes with the release store to - td->entry in _dl_tlsdesc_resolve_rela_fixup ensuring that the load - from [x0,#8] here happens after the initialization of td->arg. */ - ldar xzr, [x0] - ldr x0, [x0, #8] - RET - cfi_endproc - .size _dl_tlsdesc_return_lazy, .-_dl_tlsdesc_return_lazy - /* Handler for undefined weak TLS symbols. Prototype: _dl_tlsdesc_undefweak (tlsdesc *); @@ -119,16 +97,10 @@ _dl_tlsdesc_return_lazy: _dl_tlsdesc_undefweak: str x1, [sp, #-16]! cfi_adjust_cfa_offset (16) - /* The ldar here happens after the load from [x0] at the call site - (that is generated by the compiler as part of the TLS access ABI), - so it reads the same value (this function is the final value of - td->entry) and thus it synchronizes with the release store to - td->entry in _dl_tlsdesc_resolve_rela_fixup ensuring that the load - from [x0,#8] here happens after the initialization of td->arg. 
*/ - ldar xzr, [x0] - ldr x0, [x0, #8] + DELOUSE (0) + ldr PTR_REG (0), [x0, #PTR_SIZE] mrs x1, tpidr_el0 - sub x0, x0, x1 + sub PTR_REG (0), PTR_REG (0), PTR_REG (1) ldr x1, [sp], #16 cfi_adjust_cfa_offset (-16) RET @@ -151,7 +123,7 @@ _dl_tlsdesc_undefweak: _dl_tlsdesc_dynamic (struct tlsdesc *tdp) { struct tlsdesc_dynamic_arg *td = tdp->arg; - dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET); + dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + TCBHEAD_DTV); if (__builtin_expect (td->gen_count <= dtv[0].counter && (dtv[td->tlsinfo.ti_module].pointer.val != TLS_DTV_UNALLOCATED), @@ -170,46 +142,37 @@ _dl_tlsdesc_undefweak: cfi_startproc .align 2 _dl_tlsdesc_dynamic: -# define NSAVEXREGPAIRS 2 - stp x29, x30, [sp,#-(32+16*NSAVEXREGPAIRS)]! - cfi_adjust_cfa_offset (32+16*NSAVEXREGPAIRS) - mov x29, sp + DELOUSE (0) /* Save just enough registers to support fast path, if we fall into slow path we will save additional registers. */ - - stp x1, x2, [sp, #32+16*0] - stp x3, x4, [sp, #32+16*1] + stp x1, x2, [sp, #-32]! + stp x3, x4, [sp, #16] + cfi_adjust_cfa_offset (32) + cfi_rel_offset (x1, 0) + cfi_rel_offset (x2, 8) + cfi_rel_offset (x3, 16) + cfi_rel_offset (x4, 24) mrs x4, tpidr_el0 - /* The ldar here happens after the load from [x0] at the call site - (that is generated by the compiler as part of the TLS access ABI), - so it reads the same value (this function is the final value of - td->entry) and thus it synchronizes with the release store to - td->entry in _dl_tlsdesc_resolve_rela_fixup ensuring that the load - from [x0,#8] here happens after the initialization of td->arg. */ - ldar xzr, [x0] - ldr x1, [x0,#8] - ldr x0, [x4] - ldr x3, [x1,#16] - ldr x2, [x0] - cmp x3, x2 + ldr PTR_REG (1), [x0,#TLSDESC_ARG] + ldr PTR_REG (0), [x4,#TCBHEAD_DTV] + ldr PTR_REG (3), [x1,#TLSDESC_GEN_COUNT] + ldr PTR_REG (2), [x0,#DTV_COUNTER] + cmp PTR_REG (3), PTR_REG (2) b.hi 2f - ldr x2, [x1] - add x0, x0, x2, lsl #4 - ldr x0, [x0] - cmn x0, #0x1 + /* Load r2 = td->tlsinfo.ti_module and r3 = td->tlsinfo.ti_offset. */ + ldp PTR_REG (2), PTR_REG (3), [x1,#TLSDESC_MODID] + add PTR_REG (0), PTR_REG (0), PTR_REG (2), lsl #(PTR_LOG_SIZE + 1) + ldr PTR_REG (0), [x0] /* Load val member of DTV entry. */ + cmp PTR_REG (0), #TLS_DTV_UNALLOCATED b.eq 2f - ldr x1, [x1,#8] - add x0, x0, x1 - sub x0, x0, x4 + sub PTR_REG (3), PTR_REG (3), PTR_REG (4) + add PTR_REG (0), PTR_REG (0), PTR_REG (3) 1: - ldp x1, x2, [sp, #32+16*0] - ldp x3, x4, [sp, #32+16*1] - - ldp x29, x30, [sp], #(32+16*NSAVEXREGPAIRS) - cfi_adjust_cfa_offset (-32-16*NSAVEXREGPAIRS) -# undef NSAVEXREGPAIRS + ldp x3, x4, [sp, #16] + ldp x1, x2, [sp], #32 + cfi_adjust_cfa_offset (-32) RET 2: /* This is the slow path. We need to call __tls_get_addr() which @@ -217,15 +180,33 @@ _dl_tlsdesc_dynamic: callee will trash. */ /* Save the remaining registers that we must treat as caller save. */ -# define NSAVEXREGPAIRS 7 - stp x5, x6, [sp, #-16*NSAVEXREGPAIRS]! +# define NSAVEXREGPAIRS 8 + stp x29, x30, [sp,#-16*NSAVEXREGPAIRS]! 
cfi_adjust_cfa_offset (16*NSAVEXREGPAIRS) - stp x7, x8, [sp, #16*1] - stp x9, x10, [sp, #16*2] - stp x11, x12, [sp, #16*3] - stp x13, x14, [sp, #16*4] - stp x15, x16, [sp, #16*5] - stp x17, x18, [sp, #16*6] + cfi_rel_offset (x29, 0) + cfi_rel_offset (x30, 8) + mov x29, sp + stp x5, x6, [sp, #16*1] + stp x7, x8, [sp, #16*2] + stp x9, x10, [sp, #16*3] + stp x11, x12, [sp, #16*4] + stp x13, x14, [sp, #16*5] + stp x15, x16, [sp, #16*6] + stp x17, x18, [sp, #16*7] + cfi_rel_offset (x5, 16*1) + cfi_rel_offset (x6, 16*1+8) + cfi_rel_offset (x7, 16*2) + cfi_rel_offset (x8, 16*2+8) + cfi_rel_offset (x9, 16*3) + cfi_rel_offset (x10, 16*3+8) + cfi_rel_offset (x11, 16*4) + cfi_rel_offset (x12, 16*4+8) + cfi_rel_offset (x13, 16*5) + cfi_rel_offset (x14, 16*5+8) + cfi_rel_offset (x15, 16*6) + cfi_rel_offset (x16, 16*6+8) + cfi_rel_offset (x17, 16*7) + cfi_rel_offset (x18, 16*7+8) SAVE_Q_REGISTERS @@ -233,134 +214,24 @@ _dl_tlsdesc_dynamic: bl __tls_get_addr mrs x1, tpidr_el0 - sub x0, x0, x1 + sub PTR_REG (0), PTR_REG (0), PTR_REG (1) RESTORE_Q_REGISTERS - ldp x7, x8, [sp, #16*1] - ldp x9, x10, [sp, #16*2] - ldp x11, x12, [sp, #16*3] - ldp x13, x14, [sp, #16*4] - ldp x15, x16, [sp, #16*5] - ldp x17, x18, [sp, #16*6] - ldp x5, x6, [sp], #16*NSAVEXREGPAIRS + ldp x5, x6, [sp, #16*1] + ldp x7, x8, [sp, #16*2] + ldp x9, x10, [sp, #16*3] + ldp x11, x12, [sp, #16*4] + ldp x13, x14, [sp, #16*5] + ldp x15, x16, [sp, #16*6] + ldp x17, x18, [sp, #16*7] + + ldp x29, x30, [sp], #16*NSAVEXREGPAIRS cfi_adjust_cfa_offset (-16*NSAVEXREGPAIRS) + cfi_restore (x29) + cfi_restore (x30) b 1b cfi_endproc .size _dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic # undef NSAVEXREGPAIRS #endif - - /* This function is a wrapper for a lazy resolver for TLS_DESC - RELA relocations. - When the actual resolver returns, it will have adjusted the - TLS descriptor such that we can tail-call it for it to return - the TP offset of the symbol. */ - - .hidden _dl_tlsdesc_resolve_rela - .global _dl_tlsdesc_resolve_rela - .type _dl_tlsdesc_resolve_rela,%function - cfi_startproc - .align 2 -_dl_tlsdesc_resolve_rela: -#define NSAVEXREGPAIRS 9 - stp x29, x30, [sp, #-(32+16*NSAVEXREGPAIRS)]! - cfi_adjust_cfa_offset (32+16*NSAVEXREGPAIRS) - mov x29, sp - stp x1, x4, [sp, #32+16*0] - stp x5, x6, [sp, #32+16*1] - stp x7, x8, [sp, #32+16*2] - stp x9, x10, [sp, #32+16*3] - stp x11, x12, [sp, #32+16*4] - stp x13, x14, [sp, #32+16*5] - stp x15, x16, [sp, #32+16*6] - stp x17, x18, [sp, #32+16*7] - str x0, [sp, #32+16*8] - - SAVE_Q_REGISTERS - - ldr x1, [x3, #8] - bl _dl_tlsdesc_resolve_rela_fixup - - RESTORE_Q_REGISTERS - - ldr x0, [sp, #32+16*8] - ldr x1, [x0] - blr x1 - - ldp x1, x4, [sp, #32+16*0] - ldp x5, x6, [sp, #32+16*1] - ldp x7, x8, [sp, #32+16*2] - ldp x9, x10, [sp, #32+16*3] - ldp x11, x12, [sp, #32+16*4] - ldp x13, x14, [sp, #32+16*5] - ldp x15, x16, [sp, #32+16*6] - ldp x17, x18, [sp, #32+16*7] - ldp x29, x30, [sp], #(32+16*NSAVEXREGPAIRS) - cfi_adjust_cfa_offset (-32-16*NSAVEXREGPAIRS) - ldp x2, x3, [sp], #16 - cfi_adjust_cfa_offset (-16) - RET -#undef NSAVEXREGPAIRS - cfi_endproc - .size _dl_tlsdesc_resolve_rela, .-_dl_tlsdesc_resolve_rela - - /* This function is a placeholder for lazy resolving of TLS - relocations. Once some thread starts resolving a TLS - relocation, it sets up the TLS descriptor to use this - resolver, such that other threads that would attempt to - resolve it concurrently may skip the call to the original lazy - resolver and go straight to a condition wait. 
- - When the actual resolver returns, it will have adjusted the - TLS descriptor such that we can tail-call it for it to return - the TP offset of the symbol. */ - - .hidden _dl_tlsdesc_resolve_hold - .global _dl_tlsdesc_resolve_hold - .type _dl_tlsdesc_resolve_hold,%function - cfi_startproc - .align 2 -_dl_tlsdesc_resolve_hold: -#define NSAVEXREGPAIRS 10 -1: - stp x29, x30, [sp, #-(32+16*NSAVEXREGPAIRS)]! - cfi_adjust_cfa_offset (32+16*NSAVEXREGPAIRS) - mov x29, sp - stp x1, x2, [sp, #32+16*0] - stp x3, x4, [sp, #32+16*1] - stp x5, x6, [sp, #32+16*2] - stp x7, x8, [sp, #32+16*3] - stp x9, x10, [sp, #32+16*4] - stp x11, x12, [sp, #32+16*5] - stp x13, x14, [sp, #32+16*6] - stp x15, x16, [sp, #32+16*7] - stp x17, x18, [sp, #32+16*8] - str x0, [sp, #32+16*9] - - SAVE_Q_REGISTERS - - adr x1, 1b - bl _dl_tlsdesc_resolve_hold_fixup - - RESTORE_Q_REGISTERS - - ldr x0, [sp, #32+16*9] - ldr x1, [x0] - blr x1 - - ldp x1, x2, [sp, #32+16*0] - ldp x3, x4, [sp, #32+16*1] - ldp x5, x6, [sp, #32+16*2] - ldp x7, x8, [sp, #32+16*3] - ldp x9, x10, [sp, #32+16*4] - ldp x11, x12, [sp, #32+16*5] - ldp x13, x14, [sp, #32+16*6] - ldp x15, x16, [sp, #32+16*7] - ldp x17, x18, [sp, #32+16*8] - ldp x29, x30, [sp], #(32+16*NSAVEXREGPAIRS) - cfi_adjust_cfa_offset (-32-16*NSAVEXREGPAIRS) - RET - cfi_endproc - .size _dl_tlsdesc_resolve_hold, .-_dl_tlsdesc_resolve_hold -#undef NSAVEXREGPAIRS diff --git a/sysdeps/aarch64/dl-tlsdesc.h b/sysdeps/aarch64/dl-tlsdesc.h index 8532469fd9..6dc80ad66c 100644 --- a/sysdeps/aarch64/dl-tlsdesc.h +++ b/sysdeps/aarch64/dl-tlsdesc.h @@ -1,6 +1,6 @@ /* Thread-local storage descriptor handling in the ELF dynamic linker. AArch64 version. - Copyright (C) 2011-2016 Free Software Foundation, Inc. + Copyright (C) 2011-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -46,20 +46,10 @@ extern ptrdiff_t attribute_hidden _dl_tlsdesc_return (struct tlsdesc *); extern ptrdiff_t attribute_hidden -_dl_tlsdesc_return_lazy (struct tlsdesc *); - -extern ptrdiff_t attribute_hidden _dl_tlsdesc_undefweak (struct tlsdesc *); -extern ptrdiff_t attribute_hidden -_dl_tlsdesc_resolve_rela (struct tlsdesc *); - -extern ptrdiff_t attribute_hidden -_dl_tlsdesc_resolve_hold (struct tlsdesc *); - # ifdef SHARED -extern void *internal_function _dl_make_tlsdesc_dynamic (struct link_map *, - size_t); +extern void *_dl_make_tlsdesc_dynamic (struct link_map *, size_t); extern ptrdiff_t attribute_hidden _dl_tlsdesc_dynamic (struct tlsdesc *); diff --git a/sysdeps/aarch64/dl-trampoline.S b/sysdeps/aarch64/dl-trampoline.S index 947a51579f..a86d0722d4 100644 --- a/sysdeps/aarch64/dl-trampoline.S +++ b/sysdeps/aarch64/dl-trampoline.S @@ -1,4 +1,4 @@ -/* Copyright (C) 2005-2016 Free Software Foundation, Inc. +/* Copyright (C) 2005-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -22,9 +22,13 @@ #include "dl-link.h" #define ip0 x16 +#define ip0l PTR_REG (16) #define ip1 x17 #define lr x30 +/* RELA relocatons are 3 pointers */ +#define RELA_SIZE (PTR_SIZE * 3) + .text .globl _dl_runtime_resolve .type _dl_runtime_resolve, #function @@ -79,7 +83,7 @@ _dl_runtime_resolve: cfi_rel_offset (q1, 80+7*16) /* Get pointer to linker struct. */ - ldr x0, [ip0, #-8] + ldr PTR_REG (0), [ip0, #-PTR_SIZE] /* Prepare to call _dl_fixup(). */ ldr x1, [sp, 80+8*16] /* Recover &PLTGOT[n] */ @@ -87,7 +91,7 @@ _dl_runtime_resolve: sub x1, x1, ip0 add x1, x1, x1, lsl #1 lsl x1, x1, #3 - sub x1, x1, #192 + sub x1, x1, #(RELA_SIZE<<3) lsr x1, x1, #3 /* Call fixup routine. 
*/ @@ -191,7 +195,7 @@ _dl_runtime_profile: stp x0, x1, [x29, #OFFSET_RG + DL_OFFSET_RG_SP] /* Get pointer to linker struct. */ - ldr x0, [ip0, #-8] + ldr PTR_REG (0), [ip0, #-PTR_SIZE] /* Prepare to call _dl_profile_fixup(). */ ldr x1, [x29, OFFSET_PLTGOTN] /* Recover &PLTGOT[n] */ @@ -199,7 +203,7 @@ _dl_runtime_profile: sub x1, x1, ip0 add x1, x1, x1, lsl #1 lsl x1, x1, #3 - sub x1, x1, #192 + sub x1, x1, #(RELA_SIZE<<3) lsr x1, x1, #3 stp x0, x1, [x29, #OFFSET_SAVED_CALL_X0] @@ -210,8 +214,8 @@ _dl_runtime_profile: add x4, x29, #OFFSET_FS /* address of framesize */ bl _dl_profile_fixup - ldr ip0, [x29, #OFFSET_FS] /* framesize == 0 */ - cmp ip0, #0 + ldr ip0l, [x29, #OFFSET_FS] /* framesize == 0 */ + cmp ip0l, #0 bge 1f cfi_remember_state @@ -243,7 +247,7 @@ _dl_runtime_profile: 1: /* The new frame size is in ip0. */ - sub x1, x29, ip0 + sub PTR_REG (1), PTR_REG (29), ip0l and sp, x1, #0xfffffffffffffff0 str x0, [x29, #OFFSET_T1] diff --git a/sysdeps/aarch64/dl-tunables.list b/sysdeps/aarch64/dl-tunables.list new file mode 100644 index 0000000000..f6a88168cc --- /dev/null +++ b/sysdeps/aarch64/dl-tunables.list @@ -0,0 +1,25 @@ +# aarch64 specific tunables. +# Copyright (C) 2017-2018 Free Software Foundation, Inc. +# This file is part of the GNU C Library. + +# The GNU C Library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. + +# The GNU C Library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. + +# You should have received a copy of the GNU Lesser General Public +# License along with the GNU C Library; if not, see +# <http://www.gnu.org/licenses/>. + +glibc { + tune { + cpu { + type: STRING + } + } +} diff --git a/sysdeps/aarch64/soft-fp/e_sqrtl.c b/sysdeps/aarch64/e_sqrtl.c index 5d3919b6a2..1dc1cbf003 100644 --- a/sysdeps/aarch64/soft-fp/e_sqrtl.c +++ b/sysdeps/aarch64/e_sqrtl.c @@ -1,5 +1,5 @@ /* long double square root in software floating-point emulation. - Copyright (C) 1997-2016 Free Software Foundation, Inc. + Copyright (C) 1997-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Richard Henderson (rth@cygnus.com) and Jakub Jelinek (jj@ultra.linux.cz). diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile new file mode 100644 index 0000000000..4a182bd6d6 --- /dev/null +++ b/sysdeps/aarch64/fpu/Makefile @@ -0,0 +1,14 @@ +ifeq ($(subdir),math) +CFLAGS-e_sqrtf.c += -fno-math-errno +CFLAGS-e_sqrt.c += -fno-math-errno +CFLAGS-s_lroundf.c += -fno-math-errno +CFLAGS-s_lround.c += -fno-math-errno +CFLAGS-s_llroundf.c += -fno-math-errno +CFLAGS-s_llround.c += -fno-math-errno +# GCC 4.9 and 5 requires the flag to correct emits a f{max,min}nm +# for a __builtin_{fmax,fmin}{f}. +CFLAGS-s_fmax.c += -ffinite-math-only +CFLAGS-s_fmaxf.c += -ffinite-math-only +CFLAGS-s_fmin.c += -ffinite-math-only +CFLAGS-s_fminf.c += -ffinite-math-only +endif diff --git a/sysdeps/aarch64/fpu/e_sqrt.c b/sysdeps/aarch64/fpu/e_sqrt.c index 2759fb186d..32f1e26803 100644 --- a/sysdeps/aarch64/fpu/e_sqrt.c +++ b/sysdeps/aarch64/fpu/e_sqrt.c @@ -1,5 +1,5 @@ /* Square root of floating point number. - Copyright (C) 2015-2016 Free Software Foundation, Inc. 
+ Copyright (C) 2015-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -21,8 +21,6 @@ double __ieee754_sqrt (double d) { - double res; - asm ("fsqrt %d0, %d1" : "=w" (res) : "w" (d)); - return res; + return __builtin_sqrt (d); } strong_alias (__ieee754_sqrt, __sqrt_finite) diff --git a/sysdeps/aarch64/fpu/e_sqrtf.c b/sysdeps/aarch64/fpu/e_sqrtf.c index e7ce19a38c..87f557cb10 100644 --- a/sysdeps/aarch64/fpu/e_sqrtf.c +++ b/sysdeps/aarch64/fpu/e_sqrtf.c @@ -1,5 +1,5 @@ /* Single-precision floating point square root. - Copyright (C) 2015-2016 Free Software Foundation, Inc. + Copyright (C) 2015-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -21,8 +21,6 @@ float __ieee754_sqrtf (float s) { - float res; - asm ("fsqrt %s0, %s1" : "=w" (res) : "w" (s)); - return res; + return __builtin_sqrtf (s); } strong_alias (__ieee754_sqrtf, __sqrtf_finite) diff --git a/sysdeps/aarch64/fpu/fclrexcpt.c b/sysdeps/aarch64/fpu/fclrexcpt.c index 8c0f09a82d..9328518f1e 100644 --- a/sysdeps/aarch64/fpu/fclrexcpt.c +++ b/sysdeps/aarch64/fpu/fclrexcpt.c @@ -1,4 +1,4 @@ -/* Copyright (C) 1997-2016 Free Software Foundation, Inc. +/* Copyright (C) 1997-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/fpu/fedisblxcpt.c b/sysdeps/aarch64/fpu/fedisblxcpt.c index bf9592847f..125751f8b4 100644 --- a/sysdeps/aarch64/fpu/fedisblxcpt.c +++ b/sysdeps/aarch64/fpu/fedisblxcpt.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2001-2016 Free Software Foundation, Inc. +/* Copyright (C) 2001-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/fpu/feenablxcpt.c b/sysdeps/aarch64/fpu/feenablxcpt.c index dff1050e66..6c72b42fc9 100644 --- a/sysdeps/aarch64/fpu/feenablxcpt.c +++ b/sysdeps/aarch64/fpu/feenablxcpt.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2001-2016 Free Software Foundation, Inc. +/* Copyright (C) 2001-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/fpu/fegetenv.c b/sysdeps/aarch64/fpu/fegetenv.c index c10975f6a2..1559e90977 100644 --- a/sysdeps/aarch64/fpu/fegetenv.c +++ b/sysdeps/aarch64/fpu/fegetenv.c @@ -1,4 +1,4 @@ -/* Copyright (C) 1997-2016 Free Software Foundation, Inc. +/* Copyright (C) 1997-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/fpu/fegetexcept.c b/sysdeps/aarch64/fpu/fegetexcept.c index 4a4cccd5ff..1f994ead0d 100644 --- a/sysdeps/aarch64/fpu/fegetexcept.c +++ b/sysdeps/aarch64/fpu/fegetexcept.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2001-2016 Free Software Foundation, Inc. +/* Copyright (C) 2001-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/fpu/s_frintf.c b/sysdeps/aarch64/fpu/fegetmode.c index a284eafa4c..65dcb22d89 100644 --- a/sysdeps/aarch64/fpu/s_frintf.c +++ b/sysdeps/aarch64/fpu/fegetmode.c @@ -1,11 +1,11 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. - +/* Store current floating-point control modes. AArch64 version. + Copyright (C) 2016-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. 
The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation; either version 2.1 of the - License, or (at your option) any later version. + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. The GNU C Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -16,9 +16,12 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#ifndef FUNC -# error FUNC not defined -#endif -#define TYPE float -#define REGS "s" -#include <s_frint.c> +#include <fenv.h> +#include <fpu_control.h> + +int +fegetmode (femode_t *modep) +{ + _FPU_GETCW (*modep); + return 0; +} diff --git a/sysdeps/aarch64/fpu/fegetround.c b/sysdeps/aarch64/fpu/fegetround.c index 94ad920543..10082eed90 100644 --- a/sysdeps/aarch64/fpu/fegetround.c +++ b/sysdeps/aarch64/fpu/fegetround.c @@ -1,4 +1,4 @@ -/* Copyright (C) 1997-2016 Free Software Foundation, Inc. +/* Copyright (C) 1997-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/fpu/feholdexcpt.c b/sysdeps/aarch64/fpu/feholdexcpt.c index b9cf0ebd6e..4f7a58e379 100644 --- a/sysdeps/aarch64/fpu/feholdexcpt.c +++ b/sysdeps/aarch64/fpu/feholdexcpt.c @@ -1,4 +1,4 @@ -/* Copyright (C) 1997-2016 Free Software Foundation, Inc. +/* Copyright (C) 1997-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/fpu/fesetenv.c b/sysdeps/aarch64/fpu/fesetenv.c index ca87bb60d6..2a63cf0c96 100644 --- a/sysdeps/aarch64/fpu/fesetenv.c +++ b/sysdeps/aarch64/fpu/fesetenv.c @@ -1,4 +1,4 @@ -/* Copyright (C) 1997-2016 Free Software Foundation, Inc. +/* Copyright (C) 1997-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/fpu/fesetexcept.c b/sysdeps/aarch64/fpu/fesetexcept.c new file mode 100644 index 0000000000..c97a38b483 --- /dev/null +++ b/sysdeps/aarch64/fpu/fesetexcept.c @@ -0,0 +1,34 @@ +/* Set given exception flags. AArch64 version. + Copyright (C) 2016-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <fenv.h> +#include <fpu_control.h> + +int +fesetexcept (int excepts) +{ + fpu_fpsr_t fpsr; + fpu_fpsr_t fpsr_new; + + _FPU_GETFPSR (fpsr); + fpsr_new = fpsr | (excepts & FE_ALL_EXCEPT); + if (fpsr != fpsr_new) + _FPU_SETFPSR (fpsr_new); + + return 0; +} diff --git a/sysdeps/aarch64/fpu/fesetmode.c b/sysdeps/aarch64/fpu/fesetmode.c new file mode 100644 index 0000000000..6f6b8c197d --- /dev/null +++ b/sysdeps/aarch64/fpu/fesetmode.c @@ -0,0 +1,34 @@ +/* Install given floating-point control modes. AArch64 version. + Copyright (C) 2016-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <fenv.h> +#include <fpu_control.h> + +int +fesetmode (const femode_t *modep) +{ + fpu_control_t fpcr, fpcr_new; + _FPU_GETCW (fpcr); + if (modep == FE_DFL_MODE) + fpcr_new = (fpcr & _FPU_RESERVED) | _FPU_DEFAULT; + else + fpcr_new = *modep; + if (fpcr != fpcr_new) + _FPU_SETCW (fpcr_new); + return 0; +} diff --git a/sysdeps/aarch64/fpu/fesetround.c b/sysdeps/aarch64/fpu/fesetround.c index 645efe7804..4c2ed3d7ac 100644 --- a/sysdeps/aarch64/fpu/fesetround.c +++ b/sysdeps/aarch64/fpu/fesetround.c @@ -1,4 +1,4 @@ -/* Copyright (C) 1997-2016 Free Software Foundation, Inc. +/* Copyright (C) 1997-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/fpu/feupdateenv.c b/sysdeps/aarch64/fpu/feupdateenv.c index 53d2b36cfa..2aaf5d13cd 100644 --- a/sysdeps/aarch64/fpu/feupdateenv.c +++ b/sysdeps/aarch64/fpu/feupdateenv.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2016 Free Software Foundation, Inc. +/* Copyright (C) 2009-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/fpu/fgetexcptflg.c b/sysdeps/aarch64/fpu/fgetexcptflg.c index 8089a2c9fb..cfc7113e04 100644 --- a/sysdeps/aarch64/fpu/fgetexcptflg.c +++ b/sysdeps/aarch64/fpu/fgetexcptflg.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2001-2016 Free Software Foundation, Inc. +/* Copyright (C) 2001-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/fpu/fpu_control.h b/sysdeps/aarch64/fpu/fpu_control.h index 7e74445872..2d36170c05 100644 --- a/sysdeps/aarch64/fpu/fpu_control.h +++ b/sysdeps/aarch64/fpu/fpu_control.h @@ -1,4 +1,4 @@ -/* Copyright (C) 1996-2016 Free Software Foundation, Inc. +/* Copyright (C) 1996-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -19,19 +19,28 @@ #ifndef _AARCH64_FPU_CONTROL_H #define _AARCH64_FPU_CONTROL_H +#include <features.h> + /* Macros for accessing the FPCR and FPSR. 
*/ -#define _FPU_GETCW(fpcr) \ +#if __GNUC_PREREQ (6,0) +# define _FPU_GETCW(fpcr) (fpcr = __builtin_aarch64_get_fpcr ()) +# define _FPU_SETCW(fpcr) __builtin_aarch64_set_fpcr (fpcr) +# define _FPU_GETFPSR(fpsr) (fpsr = __builtin_aarch64_get_fpsr ()) +# define _FPU_SETFPSR(fpsr) __builtin_aarch64_set_fpsr (fpsr) +#else +# define _FPU_GETCW(fpcr) \ __asm__ __volatile__ ("mrs %0, fpcr" : "=r" (fpcr)) -#define _FPU_SETCW(fpcr) \ +# define _FPU_SETCW(fpcr) \ __asm__ __volatile__ ("msr fpcr, %0" : : "r" (fpcr)) -#define _FPU_GETFPSR(fpsr) \ +# define _FPU_GETFPSR(fpsr) \ __asm__ __volatile__ ("mrs %0, fpsr" : "=r" (fpsr)) -#define _FPU_SETFPSR(fpsr) \ +# define _FPU_SETFPSR(fpsr) \ __asm__ __volatile__ ("msr fpsr, %0" : : "r" (fpsr)) +#endif /* Reserved bits should be preserved when modifying register contents. These two masks indicate which bits in each of FPCR and diff --git a/sysdeps/aarch64/fpu/fraiseexcpt.c b/sysdeps/aarch64/fpu/fraiseexcpt.c index 4e00fbba42..bc26156323 100644 --- a/sysdeps/aarch64/fpu/fraiseexcpt.c +++ b/sysdeps/aarch64/fpu/fraiseexcpt.c @@ -1,4 +1,4 @@ -/* Copyright (C) 1997-2016 Free Software Foundation, Inc. +/* Copyright (C) 1997-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/fpu/fsetexcptflg.c b/sysdeps/aarch64/fpu/fsetexcptflg.c index 890f1d655c..45d96af7e9 100644 --- a/sysdeps/aarch64/fpu/fsetexcptflg.c +++ b/sysdeps/aarch64/fpu/fsetexcptflg.c @@ -1,4 +1,4 @@ -/* Copyright (C) 1997-2016 Free Software Foundation, Inc. +/* Copyright (C) 1997-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/fpu/ftestexcept.c b/sysdeps/aarch64/fpu/ftestexcept.c index cedbce456f..84223259c8 100644 --- a/sysdeps/aarch64/fpu/ftestexcept.c +++ b/sysdeps/aarch64/fpu/ftestexcept.c @@ -1,4 +1,4 @@ -/* Copyright (C) 1997-2016 Free Software Foundation, Inc. +/* Copyright (C) 1997-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/fpu/get-rounding-mode.h b/sysdeps/aarch64/fpu/get-rounding-mode.h index 0b23363025..ab29c5b908 100644 --- a/sysdeps/aarch64/fpu/get-rounding-mode.h +++ b/sysdeps/aarch64/fpu/get-rounding-mode.h @@ -1,6 +1,6 @@ /* Determine floating-point rounding mode within libc. AArch64 version. - Copyright (C) 2012-2016 Free Software Foundation, Inc. + Copyright (C) 2012-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/bits/mathdef.h b/sysdeps/aarch64/fpu/math-barriers.h index db992fccc1..7db937bbf4 100644 --- a/sysdeps/aarch64/bits/mathdef.h +++ b/sysdeps/aarch64/fpu/math-barriers.h @@ -1,5 +1,5 @@ -/* Copyright (C) 1999-2016 Free Software Foundation, Inc. - +/* Control when floating-point expressions are evaluated. AArch64 version. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -16,24 +16,12 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#if !defined _MATH_H && !defined _COMPLEX_H -# error "Never use <bits/mathdef.h> directly; include <math.h> instead" -#endif - -#if defined __USE_ISOC99 && defined _MATH_H && !defined _MATH_H_MATHDEF -# define _MATH_H_MATHDEF 1 +#ifndef AARCH64_MATH_BARRIERS_H +#define AARCH64_MATH_BARRIERS_H 1 -/* GCC does not promote `float' values to `double'. */ -typedef float float_t; /* `float' expressions are evaluated as - `float'. 
*/ -typedef double double_t; /* `double' expressions are evaluated as - `double'. */ +#define math_opt_barrier(x) \ + ({ __typeof (x) __x = (x); __asm ("" : "+w" (__x)); __x; }) +#define math_force_eval(x) \ + ({ __typeof (x) __x = (x); __asm __volatile__ ("" : : "w" (__x)); }) -/* The values returned by `ilogb' for 0 and NaN respectively. */ -# define FP_ILOGB0 (-2147483647) -# define FP_ILOGBNAN (2147483647) - -# define FP_FAST_FMA 1 -# define FP_FAST_FMAF 1 - -#endif /* ISO C99 */ +#endif diff --git a/sysdeps/aarch64/fpu/math_private.h b/sysdeps/aarch64/fpu/math_private.h index fd8eb1fb56..fcd02c0654 100644 --- a/sysdeps/aarch64/fpu/math_private.h +++ b/sysdeps/aarch64/fpu/math_private.h @@ -1,5 +1,5 @@ /* Private floating point rounding and exceptions handling. AArch64 version. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -22,27 +22,6 @@ #include <fenv.h> #include <fpu_control.h> -#define math_opt_barrier(x) \ -({ __typeof (x) __x = (x); __asm ("" : "+w" (__x)); __x; }) -#define math_force_eval(x) \ -({ __typeof (x) __x = (x); __asm __volatile__ ("" : : "w" (__x)); }) - -extern __always_inline double -__ieee754_sqrt (double d) -{ - double res; - asm __volatile__ ("fsqrt %d0, %d1" : "=w" (res) : "w" (d)); - return res; -} - -extern __always_inline float -__ieee754_sqrtf (float s) -{ - float res; - asm __volatile__ ("fsqrt %s0, %s1" : "=w" (res) : "w" (s)); - return res; -} - static __always_inline void libc_feholdexcept_aarch64 (fenv_t *envp) { @@ -319,6 +298,26 @@ libc_feresetround_noex_aarch64_ctx (struct rm_ctx *ctx) #define libc_feresetround_noexf_ctx libc_feresetround_noex_aarch64_ctx #define libc_feresetround_noexl_ctx libc_feresetround_noex_aarch64_ctx +/* Hack: only include the large arm_neon.h when needed. */ +#ifdef _MATH_CONFIG_H +# include <arm_neon.h> + +/* ACLE intrinsics for frintn and fcvtns instructions. */ +# define TOINT_INTRINSICS 1 + +static inline double_t +roundtoint (double_t x) +{ + return vget_lane_f64 (vrndn_f64 (vld1_f64 (&x)), 0); +} + +static inline uint64_t +converttoint (double_t x) +{ + return vcvtnd_s64_f64 (x); +} +#endif + #include_next <math_private.h> #endif diff --git a/sysdeps/aarch64/fpu/s_ceil.c b/sysdeps/aarch64/fpu/s_ceil.c index 60c444d70a..4e3b5fdc4d 100644 --- a/sysdeps/aarch64/fpu/s_ceil.c +++ b/sysdeps/aarch64/fpu/s_ceil.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,6 +16,13 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#define FUNC ceil -#define INSN "frintp" -#include <s_frint.c> +#include <math.h> +#include <libm-alias-double.h> + +double +__ceil (double x) +{ + return __builtin_ceil (x); +} + +libm_alias_double (__ceil, ceil) diff --git a/sysdeps/aarch64/fpu/s_ceilf.c b/sysdeps/aarch64/fpu/s_ceilf.c index 1f1fb0677a..d01dadc246 100644 --- a/sysdeps/aarch64/fpu/s_ceilf.c +++ b/sysdeps/aarch64/fpu/s_ceilf.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,6 +16,13 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. 
*/ -#define FUNC ceilf -#define INSN "frintp" -#include <s_frintf.c> +#include <math.h> +#include <libm-alias-float.h> + +float +__ceilf (float x) +{ + return __builtin_ceilf (x); +} + +libm_alias_float (__ceil, ceil) diff --git a/sysdeps/aarch64/fpu/s_floor.c b/sysdeps/aarch64/fpu/s_floor.c index c490a27b66..7258246896 100644 --- a/sysdeps/aarch64/fpu/s_floor.c +++ b/sysdeps/aarch64/fpu/s_floor.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,6 +16,13 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#define FUNC floor -#define INSN "frintm" -#include <s_frint.c> +#include <math.h> +#include <libm-alias-double.h> + +double +__floor (double x) +{ + return __builtin_floor (x); +} + +libm_alias_double (__floor, floor) diff --git a/sysdeps/aarch64/fpu/s_floorf.c b/sysdeps/aarch64/fpu/s_floorf.c index ec81683058..bd197013ad 100644 --- a/sysdeps/aarch64/fpu/s_floorf.c +++ b/sysdeps/aarch64/fpu/s_floorf.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,6 +16,13 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#define FUNC floorf -#define INSN "frintm" -#include <s_frintf.c> +#include <math.h> +#include <libm-alias-float.h> + +float +__floorf (float x) +{ + return __builtin_floorf (x); +} + +libm_alias_float (__floor, floor) diff --git a/sysdeps/aarch64/fpu/s_fma.c b/sysdeps/aarch64/fpu/s_fma.c index 3320ba6989..4f3d6c4a51 100644 --- a/sysdeps/aarch64/fpu/s_fma.c +++ b/sysdeps/aarch64/fpu/s_fma.c @@ -1,4 +1,4 @@ -/* Copyright (C) 1996-2016 Free Software Foundation, Inc. +/* Copyright (C) 1996-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -17,29 +17,12 @@ <http://www.gnu.org/licenses/>. */ #include <math.h> +#include <libm-alias-double.h> -#ifndef FUNC -# define FUNC fma -#endif - -#ifndef TYPE -# define TYPE double -# define REGS "d" -#else -# ifndef REGS -# error REGS not defined -# endif -#endif - -#define __CONCATX(a,b) __CONCAT(a,b) - -TYPE -__CONCATX(__,FUNC) (TYPE x, TYPE y, TYPE z) +double +__fma (double x, double y, double z) { - TYPE result; - asm ( "fmadd" "\t%" REGS "0, %" REGS "1, %" REGS "2, %" REGS "3" - : "=w" (result) : "w" (x), "w" (y), "w" (z) ); - return result; + return __builtin_fma (x, y, z); } -weak_alias (__CONCATX(__,FUNC), FUNC) +libm_alias_double (__fma, fma) diff --git a/sysdeps/aarch64/fpu/s_fmaf.c b/sysdeps/aarch64/fpu/s_fmaf.c index e35512c31b..197cf7bd06 100644 --- a/sysdeps/aarch64/fpu/s_fmaf.c +++ b/sysdeps/aarch64/fpu/s_fmaf.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,7 +16,13 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#define FUNC fmaf -#define TYPE float -#define REGS "s" -#include <s_fma.c> +#include <math.h> +#include <libm-alias-float.h> + +float +__fmaf (float x, float y, float z) +{ + return __builtin_fmaf (x, y, z); +} + +libm_alias_float (__fma, fma) diff --git a/sysdeps/aarch64/fpu/s_fmax.c b/sysdeps/aarch64/fpu/s_fmax.c index bc41ec44f6..65de8ceab3 100644 --- a/sysdeps/aarch64/fpu/s_fmax.c +++ b/sysdeps/aarch64/fpu/s_fmax.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. 
+/* Copyright (C) 2011-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,6 +16,13 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#define FUNC fmax -#define INSN "fmaxnm" -#include <s_fmin.c> +#include <math.h> +#include <libm-alias-double.h> + +double +__fmax (double x, double y) +{ + return __builtin_fmax (x, y); +} + +libm_alias_double (__fmax, fmax) diff --git a/sysdeps/aarch64/fpu/s_fmaxf.c b/sysdeps/aarch64/fpu/s_fmaxf.c index 6a234bbeaf..2056700426 100644 --- a/sysdeps/aarch64/fpu/s_fmaxf.c +++ b/sysdeps/aarch64/fpu/s_fmaxf.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,8 +16,13 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#define FUNC fmaxf -#define INSN "fmaxnm" -#define TYPE float -#define REGS "s" -#include <s_fmin.c> +#include <math.h> +#include <libm-alias-float.h> + +float +__fmaxf (float x, float y) +{ + return __builtin_fmaxf (x, y); +} + +libm_alias_float (__fmax, fmax) diff --git a/sysdeps/aarch64/fpu/s_fmin.c b/sysdeps/aarch64/fpu/s_fmin.c index 695abafb9b..a42b565498 100644 --- a/sysdeps/aarch64/fpu/s_fmin.c +++ b/sysdeps/aarch64/fpu/s_fmin.c @@ -1,4 +1,4 @@ -/* Copyright (C) 1996-2016 Free Software Foundation, Inc. +/* Copyright (C) 1996-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -17,33 +17,12 @@ <http://www.gnu.org/licenses/>. */ #include <math.h> +#include <libm-alias-double.h> -#ifndef FUNC -# define FUNC fmin -#endif - -#ifndef INSN -# define INSN "fminnm" -#endif - -#ifndef TYPE -# define TYPE double -# define REGS "d" -#else -# ifndef REGS -# error REGS not defined -# endif -#endif - -#define __CONCATX(a,b) __CONCAT(a,b) - -TYPE -__CONCATX(__,FUNC) (TYPE x, TYPE y) +double +__fmin (double x, double y) { - TYPE result; - asm ( INSN "\t%" REGS "0, %" REGS "1, %" REGS "2" - : "=w" (result) : "w" (x), "w" (y) ); - return result; + return __builtin_fmin (x, y); } -weak_alias (__CONCATX(__,FUNC), FUNC) +libm_alias_double (__fmin, fmin) diff --git a/sysdeps/aarch64/fpu/s_fminf.c b/sysdeps/aarch64/fpu/s_fminf.c index 78609575bb..028f88543f 100644 --- a/sysdeps/aarch64/fpu/s_fminf.c +++ b/sysdeps/aarch64/fpu/s_fminf.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,7 +16,13 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#define FUNC fminf -#define TYPE float -#define REGS "s" -#include <s_fmin.c> +#include <math.h> +#include <libm-alias-float.h> + +float +__fminf (float x, float y) +{ + return __builtin_fminf (x, y); +} + +libm_alias_float (__fmin, fmin) diff --git a/sysdeps/aarch64/fpu/s_llrint.c b/sysdeps/aarch64/fpu/s_llrint.c index 9769311b1b..287dc8974c 100644 --- a/sysdeps/aarch64/fpu/s_llrint.c +++ b/sysdeps/aarch64/fpu/s_llrint.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,6 +16,21 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. 
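Throughout these fpu/ files the pattern of the change is the same: a hand-written inline-asm template (frintp, frintm, fmadd, fminnm and friends) is replaced by the corresponding GCC builtin, which compiles to the identical instruction while letting the compiler see through the operation. A minimal sketch of the fmin/fmax case; illustrative only, the my_ names are hypothetical and not from the patch:

    #include <math.h>

    /* On AArch64, GCC expands these builtins to fminnm/fmaxnm, the
       IEEE 754-2008 minNum/maxNum instructions, which already implement
       the C99 fmin/fmax rule that a quiet NaN operand is ignored.  */
    double my_fmin (double x, double y) { return __builtin_fmin (x, y); }
    double my_fmax (double x, double y) { return __builtin_fmax (x, y); }

Unlike the old asm templates, the builtin form can also be constant-folded and scheduled freely by the compiler.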
*/ -#define FUNC llrint -#define OTYPE long long int -#include <s_lrint.c> +#include <math.h> +#include <math-barriers.h> +#include <math_private.h> +#include <libm-alias-double.h> + +long long int +__llrint (double x) +{ + double r = __builtin_rint (x); + + /* Prevent gcc from calling llrint directly when compiled with + -fno-math-errno by inserting a barrier. */ + + math_opt_barrier (r); + return r; +} + +libm_alias_double (__llrint, llrint) diff --git a/sysdeps/aarch64/fpu/s_llrintf.c b/sysdeps/aarch64/fpu/s_llrintf.c index 51024a986c..70d6c0ca84 100644 --- a/sysdeps/aarch64/fpu/s_llrintf.c +++ b/sysdeps/aarch64/fpu/s_llrintf.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,8 +16,22 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#define FUNC llrintf -#define ITYPE float -#define IREGS "s" -#define OTYPE long long int -#include <s_lrint.c> +#include <math.h> +#include <math-barriers.h> +#include <math_private.h> +#include <libm-alias-float.h> + +long long int +__llrintf (float x) +{ + float r = __builtin_rintf (x); + + /* Prevent gcc from calling llrintf directly when compiled with + -fno-math-errno by inserting a barrier. */ + + + math_opt_barrier (r); + return r; +} + +libm_alias_float (__llrint, llrint) diff --git a/sysdeps/aarch64/fpu/s_llround.c b/sysdeps/aarch64/fpu/s_llround.c index dd2b3a2e16..d6ff708f41 100644 --- a/sysdeps/aarch64/fpu/s_llround.c +++ b/sysdeps/aarch64/fpu/s_llround.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,6 +16,13 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#define FUNC llround -#define OTYPE long long int -#include <s_lround.c> +#include <math.h> +#include <libm-alias-double.h> + +long long int +__llround (double x) +{ + return __builtin_llround (x); +} + +libm_alias_double (__llround, llround) diff --git a/sysdeps/aarch64/fpu/s_llroundf.c b/sysdeps/aarch64/fpu/s_llroundf.c index 8b5d263d91..4cb2caf8b5 100644 --- a/sysdeps/aarch64/fpu/s_llroundf.c +++ b/sysdeps/aarch64/fpu/s_llroundf.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,8 +16,13 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#define FUNC llroundf -#define ITYPE float -#define IREGS "s" -#define OTYPE long long int -#include <s_lround.c> +#include <math.h> +#include <libm-alias-float.h> + +long long int +__llroundf (float x) +{ + return __builtin_llroundf (x); +} + +libm_alias_float (__llround, llround) diff --git a/sysdeps/aarch64/fpu/s_lrint.c b/sysdeps/aarch64/fpu/s_lrint.c index e51de0fc2d..d6ef066e45 100644 --- a/sysdeps/aarch64/fpu/s_lrint.c +++ b/sysdeps/aarch64/fpu/s_lrint.c @@ -1,4 +1,4 @@ -/* Copyright (C) 1996-2016 Free Software Foundation, Inc. +/* Copyright (C) 1996-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -17,37 +17,73 @@ <http://www.gnu.org/licenses/>. 
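The barrier in the new __llrint/__llrintf above is load-bearing: with -fno-math-errno, GCC recognises a rint followed by an integer conversion and contracts it back into a call to llrint, the very function being defined, so the library would recurse. Hiding the rounded value behind an empty asm blocks that pattern match. A sketch of the idea, with a hypothetical opt_barrier standing in for glibc's math_opt_barrier:

    /* Stand-in for glibc's math_opt_barrier: an empty asm that makes the
       value opaque to the optimiser.  "+w" pins it to an FP/SIMD register
       and is AArch64-specific.  */
    #define opt_barrier(x) __asm__ ("" : "+w" (x))

    long long int
    my_llrint (double x)
    {
      double r = __builtin_rint (x);
      opt_barrier (r);              /* stop rint+cast -> llrint folding */
      return (long long int) r;
    }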
*/ #include <math.h> +#include <get-rounding-mode.h> +#include <stdint.h> +#include <math-barriers.h> +#include <math_private.h> +#include <libm-alias-double.h> -#ifndef FUNC -# define FUNC lrint -#endif +# define IREG_SIZE 64 + +# ifdef __ILP32__ +# define OREG_SIZE 32 +# else +# define OREG_SIZE 64 +# endif -#ifndef ITYPE -# define ITYPE double # define IREGS "d" + +#if OREG_SIZE == 32 +# define OREGS "w" #else -# ifndef IREGS -# error IREGS not defined -# endif +# define OREGS "x" #endif -#ifndef OTYPE -# define OTYPE long int + +long int +__lrint (double x) +{ + +#if IREG_SIZE == 64 && OREG_SIZE == 32 + long int result; + + if (__builtin_fabs (x) > INT32_MAX) + { + /* Converting large values to a 32 bit int may cause the frintx/fcvtza + sequence to set both FE_INVALID and FE_INEXACT. To avoid this + check the rounding mode and do a single instruction with the + appropriate rounding mode. */ + + switch (get_rounding_mode ()) + { + case FE_TONEAREST: + asm volatile ("fcvtns" "\t%" OREGS "0, %" IREGS "1" + : "=r" (result) : "w" (x)); + break; + case FE_UPWARD: + asm volatile ("fcvtps" "\t%" OREGS "0, %" IREGS "1" + : "=r" (result) : "w" (x)); + break; + case FE_DOWNWARD: + asm volatile ("fcvtms" "\t%" OREGS "0, %" IREGS "1" + : "=r" (result) : "w" (x)); + break; + case FE_TOWARDZERO: + default: + asm volatile ("fcvtzs" "\t%" OREGS "0, %" IREGS "1" + : "=r" (result) : "w" (x)); + } + return result; + } #endif -#define OREGS "x" + double r = __builtin_rint (x); -#define __CONCATX(a,b) __CONCAT(a,b) + /* Prevent gcc from calling lrint directly when compiled with + -fno-math-errno by inserting a barrier. */ -OTYPE -__CONCATX(__,FUNC) (ITYPE x) -{ - OTYPE result; - ITYPE temp; - asm ( "frintx" "\t%" IREGS "1, %" IREGS "2\n\t" - "fcvtzs" "\t%" OREGS "0, %" IREGS "1" - : "=r" (result), "=w" (temp) : "w" (x) ); - return result; + math_opt_barrier (r); + return r; } -weak_alias (__CONCATX(__,FUNC), FUNC) +libm_alias_double (__lrint, lrint) diff --git a/sysdeps/aarch64/fpu/s_lrintf.c b/sysdeps/aarch64/fpu/s_lrintf.c index b603510375..fa42d68dd1 100644 --- a/sysdeps/aarch64/fpu/s_lrintf.c +++ b/sysdeps/aarch64/fpu/s_lrintf.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,7 +16,21 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#define FUNC lrintf -#define ITYPE float -#define IREGS "s" -#include <s_lrint.c> +#include <math.h> +#include <math-barriers.h> +#include <math_private.h> +#include <libm-alias-float.h> + +long int +__lrintf (float x) +{ + float r = __builtin_rintf (x); + + /* Prevent gcc from calling lrintf directly when compiled with + -fno-math-errno by inserting a barrier. */ + + math_opt_barrier (r); + return r; +} + +libm_alias_float (__lrint, lrint) diff --git a/sysdeps/aarch64/fpu/s_lround.c b/sysdeps/aarch64/fpu/s_lround.c index a14d347142..d8e9fc4664 100644 --- a/sysdeps/aarch64/fpu/s_lround.c +++ b/sysdeps/aarch64/fpu/s_lround.c @@ -1,4 +1,4 @@ -/* Copyright (C) 1996-2016 Free Software Foundation, Inc. +/* Copyright (C) 1996-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -17,35 +17,12 @@ <http://www.gnu.org/licenses/>. 
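The special-cased path in __lrint above only matters when the result register is 32-bit (ILP32): for |x| > INT32_MAX, the generic frintx-then-fcvtzs sequence can raise FE_INVALID from the narrowing conversion on top of FE_INEXACT from the rounding step. Doing the whole job in one convert instruction that honours the current rounding mode sidesteps the double exception. One arm of the switch written out as a standalone sketch, assuming AArch64:

    #include <stdint.h>

    /* fcvtns: double -> int32 in a single instruction, rounding to
       nearest with ties to even, so at most one exception is raised.  */
    static int32_t
    to_int32_nearest (double x)
    {
      int32_t result;
      asm volatile ("fcvtns %w0, %d1" : "=r" (result) : "w" (x));
      return result;
    }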
*/ #include <math.h> +#include <libm-alias-double.h> -#ifndef FUNC -# define FUNC lround -#endif +long int +__lround (double x) + { + return __builtin_lround (x); + } -#ifndef ITYPE -# define ITYPE double -# define IREGS "d" -#else -# ifndef IREGS -# error IREGS not defined -# endif -#endif - -#ifndef OTYPE -# define OTYPE long int -#endif - -#define OREGS "x" - -#define __CONCATX(a,b) __CONCAT(a,b) - -OTYPE -__CONCATX(__,FUNC) (ITYPE x) -{ - OTYPE result; - asm ( "fcvtas" "\t%" OREGS "0, %" IREGS "1" - : "=r" (result) : "w" (x) ); - return result; -} - -weak_alias (__CONCATX(__,FUNC), FUNC) +libm_alias_double (__lround, lround) diff --git a/sysdeps/aarch64/fpu/s_lroundf.c b/sysdeps/aarch64/fpu/s_lroundf.c index e3a3fba937..9a76dbf24a 100644 --- a/sysdeps/aarch64/fpu/s_lroundf.c +++ b/sysdeps/aarch64/fpu/s_lroundf.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,7 +16,13 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#define FUNC lroundf -#define ITYPE float -#define IREGS "s" -#include <s_lround.c> +#include <math.h> +#include <libm-alias-float.h> + +long int +__lroundf (float x) +{ + return __builtin_lroundf (x); +} + +libm_alias_float (__lround, lround) diff --git a/sysdeps/aarch64/fpu/s_nearbyint.c b/sysdeps/aarch64/fpu/s_nearbyint.c index 3be8059c1b..477e1a8cfe 100644 --- a/sysdeps/aarch64/fpu/s_nearbyint.c +++ b/sysdeps/aarch64/fpu/s_nearbyint.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,6 +16,13 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#define FUNC nearbyint -#define INSN "frinti" -#include <s_frint.c> +#include <math.h> +#include <libm-alias-double.h> + +double +__nearbyint (double x) +{ + return __builtin_nearbyint (x); +} + +libm_alias_double (__nearbyint, nearbyint) diff --git a/sysdeps/aarch64/fpu/s_nearbyintf.c b/sysdeps/aarch64/fpu/s_nearbyintf.c index e544e5597e..4edab204a9 100644 --- a/sysdeps/aarch64/fpu/s_nearbyintf.c +++ b/sysdeps/aarch64/fpu/s_nearbyintf.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,6 +16,13 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#define FUNC nearbyintf -#define INSN "frinti" -#include <s_frintf.c> +#include <math.h> +#include <libm-alias-float.h> + +float +__nearbyintf (float x) +{ + return __builtin_nearbyintf (x); +} + +libm_alias_float (__nearbyint, nearbyint) diff --git a/sysdeps/aarch64/fpu/s_rint.c b/sysdeps/aarch64/fpu/s_rint.c index d12c1a7c46..eb4232af00 100644 --- a/sysdeps/aarch64/fpu/s_rint.c +++ b/sysdeps/aarch64/fpu/s_rint.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,6 +16,13 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. 
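The removed lround template spelled out fcvtas, which converts with rounding to nearest, ties away from zero, which is exactly the C99 lround rule, so no separate rounding step is needed. The builtin form normally compiles back down to that same single instruction; a sketch, illustrative only:

    /* GCC typically emits "fcvtas x0, d0" for this builtin on AArch64
       (it may fall back to a libcall under some flag combinations),
       matching the removed inline-asm template.  */
    static long int
    my_lround (double x)
    {
      return __builtin_lround (x);
    }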
*/ -#define FUNC rint -#define INSN "frintx" -#include <s_frint.c> +#include <math.h> +#include <libm-alias-double.h> + +double +__rint (double x) +{ + return __builtin_rint (x); +} + +libm_alias_double (__rint, rint) diff --git a/sysdeps/aarch64/fpu/s_rintf.c b/sysdeps/aarch64/fpu/s_rintf.c index 375be92734..9ebfcb45b4 100644 --- a/sysdeps/aarch64/fpu/s_rintf.c +++ b/sysdeps/aarch64/fpu/s_rintf.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,6 +16,13 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#define FUNC rintf -#define INSN "frintx" -#include <s_frintf.c> +#include <math.h> +#include <libm-alias-float.h> + +float +__rintf (float x) +{ + return __builtin_rintf (x); +} + +libm_alias_float (__rint, rint) diff --git a/sysdeps/aarch64/fpu/s_round.c b/sysdeps/aarch64/fpu/s_round.c index 239ca1b434..275b00f318 100644 --- a/sysdeps/aarch64/fpu/s_round.c +++ b/sysdeps/aarch64/fpu/s_round.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,6 +16,13 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#define FUNC round -#define INSN "frinta" -#include <s_frint.c> +#include <math.h> +#include <libm-alias-double.h> + +double +__round (double x) +{ + return __builtin_round (x); +} + +libm_alias_double (__round, round) diff --git a/sysdeps/aarch64/fpu/s_roundf.c b/sysdeps/aarch64/fpu/s_roundf.c index eb13586dc5..201ffc54db 100644 --- a/sysdeps/aarch64/fpu/s_roundf.c +++ b/sysdeps/aarch64/fpu/s_roundf.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,6 +16,13 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#define FUNC roundf -#define INSN "frinta" -#include <s_frintf.c> +#include <math.h> +#include <libm-alias-float.h> + +float +__roundf (float x) +{ + return __builtin_roundf (x); +} + +libm_alias_float (__round, round) diff --git a/sysdeps/aarch64/fpu/s_trunc.c b/sysdeps/aarch64/fpu/s_trunc.c index 7107a1b18b..22f44f1e04 100644 --- a/sysdeps/aarch64/fpu/s_trunc.c +++ b/sysdeps/aarch64/fpu/s_trunc.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,6 +16,13 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#define FUNC trunc -#define INSN "frintz" -#include <s_frint.c> +#include <math.h> +#include <libm-alias-double.h> + +double +__trunc (double x) +{ + return __builtin_trunc (x); +} + +libm_alias_double (__trunc, trunc) diff --git a/sysdeps/aarch64/fpu/s_truncf.c b/sysdeps/aarch64/fpu/s_truncf.c index b08c1cb304..3771b6d6bf 100644 --- a/sysdeps/aarch64/fpu/s_truncf.c +++ b/sysdeps/aarch64/fpu/s_truncf.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,6 +16,13 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. 
*/ -#define FUNC truncf -#define INSN "frintz" -#include <s_frintf.c> +#include <math.h> +#include <libm-alias-float.h> + +float +__truncf (float x) +{ + return __builtin_truncf (x); +} + +libm_alias_float (__trunc, trunc) diff --git a/sysdeps/aarch64/jmpbuf-offsets.h b/sysdeps/aarch64/jmpbuf-offsets.h index 37f551a56b..a7096f2f69 100644 --- a/sysdeps/aarch64/jmpbuf-offsets.h +++ b/sysdeps/aarch64/jmpbuf-offsets.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2006-2016 Free Software Foundation, Inc. +/* Copyright (C) 2006-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/jmpbuf-unwind.h b/sysdeps/aarch64/jmpbuf-unwind.h index 3e0a37db4e..9309e7b0ea 100644 --- a/sysdeps/aarch64/jmpbuf-unwind.h +++ b/sysdeps/aarch64/jmpbuf-unwind.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2005-2016 Free Software Foundation, Inc. +/* Copyright (C) 2005-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -27,7 +27,8 @@ ((void *) (address) < (void *) demangle (jmpbuf[JB_SP])) #define _JMPBUF_CFA_UNWINDS_ADJ(jmpbuf, context, adj) \ - _JMPBUF_UNWINDS_ADJ (jmpbuf, (void *) _Unwind_GetCFA (context), adj) + _JMPBUF_UNWINDS_ADJ (jmpbuf, (void *) (uintptr_t) _Unwind_GetCFA (context), \ + adj) #define _JMPBUF_UNWINDS_ADJ(_jmpbuf, _address, _adj) \ ((uintptr_t) (_address) - (_adj) < _jmpbuf_sp (_jmpbuf) - (_adj)) diff --git a/sysdeps/aarch64/ldsodefs.h b/sysdeps/aarch64/ldsodefs.h index 7c02e89371..600b721234 100644 --- a/sysdeps/aarch64/ldsodefs.h +++ b/sysdeps/aarch64/ldsodefs.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2005-2016 Free Software Foundation, Inc. +/* Copyright (C) 2005-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -20,6 +20,7 @@ #define _AARCH64_LDSODEFS_H 1 #include <elf.h> +#include <cpu-features.h> struct La_aarch64_regs; struct La_aarch64_retval; diff --git a/sysdeps/aarch64/libc-tls.c b/sysdeps/aarch64/libc-tls.c index f43c75f9b5..f2a2e1c0d3 100644 --- a/sysdeps/aarch64/libc-tls.c +++ b/sysdeps/aarch64/libc-tls.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2005-2016 Free Software Foundation, Inc. +/* Copyright (C) 2005-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. 
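The jmpbuf-unwind.h change above is an ILP32 fix: _Unwind_GetCFA returns _Unwind_Word, which stays a 64-bit machine word on AArch64 even under the 32-bit-pointer ABI, so casting it straight to void * would narrow implicitly and trip -Wint-to-pointer-cast. Going through uintptr_t makes the narrowing explicit. Reduced to an illustrative fragment:

    #include <stdint.h>
    #include <unwind.h>

    /* _Unwind_Word is 64-bit on AArch64, including ILP32; uintptr_t
       matches the actual pointer width of the ABI being built.  */
    static void *
    cfa_as_pointer (struct _Unwind_Context *context)
    {
      return (void *) (uintptr_t) _Unwind_GetCFA (context);
    }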
diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps index bc5795b361..be06085154 100644 --- a/sysdeps/aarch64/libm-test-ulps +++ b/sysdeps/aarch64/libm-test-ulps @@ -40,9 +40,9 @@ ildouble: 2 ldouble: 2 Function: "acosh_downward": -double: 1 +double: 2 float: 2 -idouble: 1 +idouble: 2 ifloat: 2 ildouble: 3 ldouble: 3 @@ -252,36 +252,36 @@ ildouble: 2 ldouble: 2 Function: Imaginary part of "cacos": -double: 1 +double: 2 float: 2 -idouble: 1 +idouble: 2 ifloat: 2 ildouble: 2 ldouble: 2 Function: Real part of "cacos_downward": -double: 2 +double: 3 float: 2 -idouble: 2 +idouble: 3 ifloat: 2 -ildouble: 2 -ldouble: 2 +ildouble: 3 +ldouble: 3 Function: Imaginary part of "cacos_downward": double: 5 float: 3 idouble: 5 ifloat: 3 -ildouble: 5 -ldouble: 5 +ildouble: 6 +ldouble: 6 Function: Real part of "cacos_towardzero": -double: 2 +double: 3 float: 2 -idouble: 2 +idouble: 3 ifloat: 2 -ildouble: 2 -ldouble: 2 +ildouble: 3 +ldouble: 3 Function: Imaginary part of "cacos_towardzero": double: 5 @@ -300,17 +300,17 @@ ildouble: 3 ldouble: 3 Function: Imaginary part of "cacos_upward": -double: 4 -float: 4 -idouble: 4 -ifloat: 4 -ildouble: 5 -ldouble: 5 +double: 5 +float: 5 +idouble: 5 +ifloat: 5 +ildouble: 7 +ldouble: 7 Function: Real part of "cacosh": -double: 1 +double: 2 float: 2 -idouble: 1 +idouble: 2 ifloat: 2 ildouble: 2 ldouble: 2 @@ -332,12 +332,12 @@ ildouble: 5 ldouble: 5 Function: Imaginary part of "cacosh_downward": -double: 2 -float: 2 -idouble: 2 -ifloat: 2 -ildouble: 2 -ldouble: 2 +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 4 +ldouble: 4 Function: Real part of "cacosh_towardzero": double: 5 @@ -348,28 +348,28 @@ ildouble: 5 ldouble: 5 Function: Imaginary part of "cacosh_towardzero": -double: 2 +double: 3 float: 2 -idouble: 2 +idouble: 3 ifloat: 2 -ildouble: 2 -ldouble: 2 +ildouble: 3 +ldouble: 3 Function: Real part of "cacosh_upward": double: 4 float: 4 idouble: 4 ifloat: 4 -ildouble: 5 -ldouble: 5 +ildouble: 6 +ldouble: 6 Function: Imaginary part of "cacosh_upward": -double: 2 +double: 3 float: 2 -idouble: 2 +idouble: 3 ifloat: 2 -ildouble: 3 -ldouble: 3 +ildouble: 4 +ldouble: 4 Function: "carg": double: 1 @@ -412,18 +412,18 @@ ildouble: 2 ldouble: 2 Function: Imaginary part of "casin": -double: 1 +double: 2 float: 2 -idouble: 1 +idouble: 2 ifloat: 2 ildouble: 2 ldouble: 2 Function: Real part of "casin_downward": double: 3 -float: 1 +float: 2 idouble: 3 -ifloat: 1 +ifloat: 2 ildouble: 3 ldouble: 3 @@ -432,8 +432,8 @@ double: 5 float: 3 idouble: 5 ifloat: 3 -ildouble: 5 -ldouble: 5 +ildouble: 6 +ldouble: 6 Function: Real part of "casin_towardzero": double: 3 @@ -452,25 +452,25 @@ ildouble: 5 ldouble: 5 Function: Real part of "casin_upward": -double: 2 -float: 1 -idouble: 2 -ifloat: 1 +double: 3 +float: 2 +idouble: 3 +ifloat: 2 ildouble: 3 ldouble: 3 Function: Imaginary part of "casin_upward": -double: 4 -float: 4 -idouble: 4 -ifloat: 4 -ildouble: 5 -ldouble: 5 +double: 5 +float: 5 +idouble: 5 +ifloat: 5 +ildouble: 7 +ldouble: 7 Function: Real part of "casinh": -double: 1 +double: 2 float: 2 -idouble: 1 +idouble: 2 ifloat: 2 ildouble: 2 ldouble: 2 @@ -488,14 +488,14 @@ double: 5 float: 3 idouble: 5 ifloat: 3 -ildouble: 5 -ldouble: 5 +ildouble: 6 +ldouble: 6 Function: Imaginary part of "casinh_downward": double: 3 -float: 1 +float: 2 idouble: 3 -ifloat: 1 +ifloat: 2 ildouble: 3 ldouble: 3 @@ -516,23 +516,25 @@ ildouble: 3 ldouble: 3 Function: Real part of "casinh_upward": -double: 4 -float: 4 -idouble: 4 -ifloat: 4 -ildouble: 5 -ldouble: 5 +double: 5 
+float: 5 +idouble: 5 +ifloat: 5 +ildouble: 7 +ldouble: 7 Function: Imaginary part of "casinh_upward": -double: 2 +double: 3 float: 2 -idouble: 2 +idouble: 3 ifloat: 2 ildouble: 3 ldouble: 3 Function: Real part of "catan": +double: 1 float: 1 +idouble: 1 ifloat: 1 ildouble: 1 ldouble: 1 @@ -547,9 +549,9 @@ ldouble: 1 Function: Real part of "catan_downward": double: 1 -float: 1 +float: 2 idouble: 1 -ifloat: 1 +ifloat: 2 ildouble: 2 ldouble: 2 @@ -563,25 +565,27 @@ ldouble: 3 Function: Real part of "catan_towardzero": double: 1 -float: 1 +float: 2 idouble: 1 -ifloat: 1 +ifloat: 2 ildouble: 2 ldouble: 2 Function: Imaginary part of "catan_towardzero": double: 2 -float: 1 +float: 2 idouble: 2 -ifloat: 1 +ifloat: 2 ildouble: 3 ldouble: 3 Function: Real part of "catan_upward": +double: 1 float: 1 +idouble: 1 ifloat: 1 -ildouble: 1 -ldouble: 1 +ildouble: 2 +ldouble: 2 Function: Imaginary part of "catan_upward": double: 3 @@ -600,7 +604,9 @@ ildouble: 1 ldouble: 1 Function: Imaginary part of "catanh": +double: 1 float: 1 +idouble: 1 ifloat: 1 ildouble: 1 ldouble: 1 @@ -623,9 +629,9 @@ ldouble: 2 Function: Real part of "catanh_towardzero": double: 2 -float: 1 +float: 2 idouble: 2 -ifloat: 1 +ifloat: 2 ildouble: 3 ldouble: 3 @@ -639,17 +645,19 @@ ldouble: 2 Function: Real part of "catanh_upward": double: 4 -float: 3 +float: 4 idouble: 4 -ifloat: 3 +ifloat: 4 ildouble: 4 ldouble: 4 Function: Imaginary part of "catanh_upward": +double: 1 float: 1 +idouble: 1 ifloat: 1 -ildouble: 1 -ldouble: 1 +ildouble: 2 +ldouble: 2 Function: "cbrt": double: 3 @@ -900,18 +908,18 @@ ildouble: 2 ldouble: 2 Function: Imaginary part of "clog10": -double: 1 +double: 2 float: 2 -idouble: 1 +idouble: 2 ifloat: 2 ildouble: 2 ldouble: 2 Function: Real part of "clog10_downward": double: 5 -float: 4 +float: 5 idouble: 5 -ifloat: 4 +ifloat: 5 ildouble: 3 ldouble: 3 @@ -1004,7 +1012,9 @@ ildouble: 2 ldouble: 2 Function: "cos": +double: 1 float: 1 +idouble: 1 ifloat: 1 ildouble: 1 ldouble: 1 @@ -1323,9 +1333,9 @@ ldouble: 3 Function: Imaginary part of "ctan": double: 2 -float: 1 +float: 2 idouble: 2 -ifloat: 1 +ifloat: 2 ildouble: 3 ldouble: 3 @@ -1339,9 +1349,9 @@ ldouble: 4 Function: Imaginary part of "ctan_downward": double: 2 -float: 1 +float: 2 idouble: 2 -ifloat: 1 +ifloat: 2 ildouble: 5 ldouble: 5 @@ -1363,9 +1373,9 @@ ldouble: 5 Function: Real part of "ctan_upward": double: 2 -float: 3 +float: 4 idouble: 2 -ifloat: 3 +ifloat: 4 ildouble: 5 ldouble: 5 @@ -1395,9 +1405,9 @@ ldouble: 3 Function: Real part of "ctanh_downward": double: 4 -float: 1 +float: 2 idouble: 4 -ifloat: 1 +ifloat: 2 ildouble: 5 ldouble: 5 @@ -1575,15 +1585,21 @@ ldouble: 2 Function: "exp_downward": double: 1 +float: 1 idouble: 1 +ifloat: 1 Function: "exp_towardzero": double: 1 +float: 1 idouble: 1 +ifloat: 1 Function: "exp_upward": double: 1 +float: 1 idouble: 1 +ifloat: 1 Function: "expm1": double: 1 @@ -1683,9 +1699,9 @@ ldouble: 2 Function: "j0_downward": double: 2 -float: 3 +float: 4 idouble: 2 -ifloat: 3 +ifloat: 4 ildouble: 4 ldouble: 4 @@ -1910,55 +1926,27 @@ ildouble: 1 ldouble: 1 Function: "log_towardzero": -float: 1 -ifloat: 1 +float: 2 +ifloat: 2 ildouble: 2 ldouble: 2 Function: "log_upward": double: 1 -float: 1 +float: 2 idouble: 1 -ifloat: 1 +ifloat: 2 ildouble: 1 ldouble: 1 Function: "pow": +double: 1 float: 1 +idouble: 1 ifloat: 1 ildouble: 2 ldouble: 2 -Function: "pow10": -double: 2 -idouble: 2 -ildouble: 2 -ldouble: 2 - -Function: "pow10_downward": -double: 2 -float: 1 -idouble: 2 -ifloat: 1 -ildouble: 3 -ldouble: 3 - -Function: 
"pow10_towardzero": -double: 2 -float: 1 -idouble: 2 -ifloat: 1 -ildouble: 3 -ldouble: 3 - -Function: "pow10_upward": -double: 2 -float: 1 -idouble: 2 -ifloat: 1 -ildouble: 3 -ldouble: 3 - Function: "pow_downward": double: 1 float: 1 @@ -1984,7 +1972,9 @@ ildouble: 2 ldouble: 2 Function: "sin": +double: 1 float: 1 +idouble: 1 ifloat: 1 ildouble: 1 ldouble: 1 @@ -2014,7 +2004,9 @@ ildouble: 3 ldouble: 3 Function: "sincos": +double: 1 float: 1 +idouble: 1 ifloat: 1 ildouble: 1 ldouble: 1 @@ -2179,9 +2171,9 @@ ldouble: 3 Function: "y0_downward": double: 3 -float: 2 +float: 4 idouble: 3 -ifloat: 2 +ifloat: 4 ildouble: 4 ldouble: 4 @@ -2195,9 +2187,9 @@ ldouble: 3 Function: "y0_upward": double: 2 -float: 3 +float: 5 idouble: 2 -ifloat: 3 +ifloat: 5 ildouble: 3 ldouble: 3 @@ -2235,17 +2227,17 @@ ldouble: 5 Function: "yn": double: 3 -float: 2 +float: 3 idouble: 3 -ifloat: 2 +ifloat: 3 ildouble: 5 ldouble: 5 Function: "yn_downward": double: 3 -float: 2 +float: 4 idouble: 3 -ifloat: 2 +ifloat: 4 ildouble: 5 ldouble: 5 @@ -2259,9 +2251,9 @@ ldouble: 5 Function: "yn_upward": double: 4 -float: 3 +float: 5 idouble: 4 -ifloat: 3 +ifloat: 5 ildouble: 5 ldouble: 5 diff --git a/sysdeps/aarch64/libm-test-ulps-name b/sysdeps/aarch64/libm-test-ulps-name new file mode 100644 index 0000000000..1f66c5cda0 --- /dev/null +++ b/sysdeps/aarch64/libm-test-ulps-name @@ -0,0 +1 @@ +AArch64 diff --git a/sysdeps/aarch64/linkmap.h b/sysdeps/aarch64/linkmap.h index 5aefb4cfeb..6852f343a1 100644 --- a/sysdeps/aarch64/linkmap.h +++ b/sysdeps/aarch64/linkmap.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2016 Free Software Foundation, Inc. +/* Copyright (C) 2009-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/machine-gmon.h b/sysdeps/aarch64/machine-gmon.h index 3d923f91ce..dd12a25416 100644 --- a/sysdeps/aarch64/machine-gmon.h +++ b/sysdeps/aarch64/machine-gmon.h @@ -1,5 +1,5 @@ /* AArch64 definitions for profiling support. - Copyright (C) 1996-2016 Free Software Foundation, Inc. + Copyright (C) 1996-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/math-tests.h b/sysdeps/aarch64/math-tests.h index a3e1efae97..46cc33741f 100644 --- a/sysdeps/aarch64/math-tests.h +++ b/sysdeps/aarch64/math-tests.h @@ -1,5 +1,5 @@ /* Configuration for math tests. AArch64 version. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/mcount.c b/sysdeps/aarch64/mcount.c index ca596ee986..515c6bde25 100644 --- a/sysdeps/aarch64/mcount.c +++ b/sysdeps/aarch64/mcount.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2013-2016 Free Software Foundation, Inc. +/* Copyright (C) 2013-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/memchr.S b/sysdeps/aarch64/memchr.S new file mode 100644 index 0000000000..e422aef090 --- /dev/null +++ b/sysdeps/aarch64/memchr.S @@ -0,0 +1,157 @@ +/* memchr - find a character in a memory zone + + Copyright (C) 2015-2018 Free Software Foundation, Inc. + + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +/* Assumptions: + * + * ARMv8-a, AArch64 + * Neon Available. + */ + +/* Arguments and results. */ +#define srcin x0 +#define chrin w1 +#define cntin x2 + +#define result x0 + +#define src x3 +#define tmp x4 +#define wtmp2 w5 +#define synd x6 +#define soff x9 +#define cntrem x10 + +#define vrepchr v0 +#define vdata1 v1 +#define vdata2 v2 +#define vhas_chr1 v3 +#define vhas_chr2 v4 +#define vrepmask v5 +#define vend v6 + +/* + * Core algorithm: + * + * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits + * per byte. For each tuple, bit 0 is set if the relevant byte matched the + * requested character and bit 1 is not used (faster than using a 32bit + * syndrome). Since the bits in the syndrome reflect exactly the order in which + * things occur in the original string, counting trailing zeros allows to + * identify exactly which byte has matched. + */ + +ENTRY (__memchr) + /* Do not dereference srcin if no bytes to compare. */ + cbz cntin, L(zero_length) + /* + * Magic constant 0x40100401 allows us to identify which lane matches + * the requested byte. + */ + mov wtmp2, #0x0401 + movk wtmp2, #0x4010, lsl #16 + dup vrepchr.16b, chrin + /* Work with aligned 32-byte chunks */ + bic src, srcin, #31 + dup vrepmask.4s, wtmp2 + ands soff, srcin, #31 + and cntrem, cntin, #31 + b.eq L(loop) + + /* + * Input string is not 32-byte aligned. We calculate the syndrome + * value for the aligned 32 bytes block containing the first bytes + * and mask the irrelevant part. + */ + + ld1 {vdata1.16b, vdata2.16b}, [src], #32 + sub tmp, soff, #32 + adds cntin, cntin, tmp + cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b + cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b + and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b + and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b + addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */ + addp vend.16b, vend.16b, vend.16b /* 128->64 */ + mov synd, vend.2d[0] + /* Clear the soff*2 lower bits */ + lsl tmp, soff, #1 + lsr synd, synd, tmp + lsl synd, synd, tmp + /* The first block can also be the last */ + b.ls L(masklast) + /* Have we found something already? 
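The syndrome comment above is the heart of the new memchr and is worth restating in scalar form: every byte of a 32-byte chunk contributes two bits to a 64-bit word, with the low bit of each pair set on a match, so counting trailing zeros and halving recovers the index of the first match. A C model of one chunk; illustrative only, the real code builds the syndrome with cmeq/and/addp and the repeating 0x40100401 mask:

    #include <stdint.h>
    #include <stddef.h>

    static const void *
    chunk_memchr32 (const unsigned char *p, unsigned char c)
    {
      uint64_t synd = 0;
      for (int i = 0; i < 32; i++)
        if (p[i] == c)
          synd |= 1ULL << (2 * i);   /* two bits per byte, low bit = hit */
      if (synd == 0)
        return NULL;                 /* no match in this chunk */
      return p + (__builtin_ctzll (synd) >> 1);
    }

The prologue's clearing of the soff*2 low bits is the same idea in reverse: bytes before the true start of the string simply have their positions masked out of the syndrome.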
*/ + cbnz synd, L(tail) + +L(loop): + ld1 {vdata1.16b, vdata2.16b}, [src], #32 + subs cntin, cntin, #32 + cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b + cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b + /* If we're out of data we finish regardless of the result */ + b.ls L(end) + /* Use a fast check for the termination condition */ + orr vend.16b, vhas_chr1.16b, vhas_chr2.16b + addp vend.2d, vend.2d, vend.2d + mov synd, vend.2d[0] + /* We're not out of data, loop if we haven't found the character */ + cbz synd, L(loop) + +L(end): + /* Termination condition found, let's calculate the syndrome value */ + and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b + and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b + addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */ + addp vend.16b, vend.16b, vend.16b /* 128->64 */ + mov synd, vend.2d[0] + /* Only do the clear for the last possible block */ + b.hi L(tail) + +L(masklast): + /* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */ + add tmp, cntrem, soff + and tmp, tmp, #31 + sub tmp, tmp, #32 + neg tmp, tmp, lsl #1 + lsl synd, synd, tmp + lsr synd, synd, tmp + +L(tail): + /* Count the trailing zeros using bit reversing */ + rbit synd, synd + /* Compensate the last post-increment */ + sub src, src, #32 + /* Check that we have found a character */ + cmp synd, #0 + /* And count the leading zeros */ + clz synd, synd + /* Compute the potential result */ + add result, src, synd, lsr #1 + /* Select result or NULL */ + csel result, xzr, result, eq + ret + +L(zero_length): + mov result, #0 + ret +END (__memchr) +weak_alias (__memchr, memchr) +libc_hidden_builtin_def (memchr) diff --git a/sysdeps/aarch64/memcmp.S b/sysdeps/aarch64/memcmp.S index ae2d997973..743bc078bb 100644 --- a/sysdeps/aarch64/memcmp.S +++ b/sysdeps/aarch64/memcmp.S @@ -1,6 +1,6 @@ /* memcmp - compare memory - Copyright (C) 2013-2016 Free Software Foundation, Inc. + Copyright (C) 2013-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -22,129 +22,132 @@ /* Assumptions: * - * ARMv8-a, AArch64 + * ARMv8-a, AArch64, unaligned accesses. */ /* Parameters and result. */ #define src1 x0 #define src2 x1 #define limit x2 -#define result x0 +#define result w0 /* Internal variables. */ #define data1 x3 #define data1w w3 -#define data2 x4 -#define data2w w4 -#define has_nul x5 -#define diff x6 -#define endloop x7 -#define tmp1 x8 -#define tmp2 x9 -#define tmp3 x10 -#define pos x11 -#define limit_wd x12 -#define mask x13 +#define data1h x4 +#define data2 x5 +#define data2w w5 +#define data2h x6 +#define tmp1 x7 +#define tmp2 x8 ENTRY_ALIGN (memcmp, 6) - cbz limit, L(ret0) - eor tmp1, src1, src2 - tst tmp1, #7 - b.ne L(misaligned8) - ands tmp1, src1, #7 - b.ne L(mutual_align) - add limit_wd, limit, #7 - lsr limit_wd, limit_wd, #3 - /* Start of performance-critical section -- one 64B cache line. */ -L(loop_aligned): - ldr data1, [src1], #8 - ldr data2, [src2], #8 -L(start_realigned): - subs limit_wd, limit_wd, #1 - eor diff, data1, data2 /* Non-zero if differences found. */ - csinv endloop, diff, xzr, ne /* Last Dword or differences. */ - cbz endloop, L(loop_aligned) - /* End of performance-critical section -- one 64B cache line. */ - - /* Not reached the limit, must have found a diff. */ - cbnz limit_wd, L(not_limit) - - /* Limit % 8 == 0 => all bytes significant. */ - ands limit, limit, #7 - b.eq L(not_limit) - - lsl limit, limit, #3 /* Bits -> bytes. 
*/ - mov mask, #~0 -#ifdef __AARCH64EB__ - lsr mask, mask, limit -#else - lsl mask, mask, limit -#endif - bic data1, data1, mask - bic data2, data2, mask - - orr diff, diff, mask -L(not_limit): - -#ifndef __AARCH64EB__ - rev diff, diff + DELOUSE (0) + DELOUSE (1) + DELOUSE (2) + + subs limit, limit, 8 + b.lo L(less8) + + ldr data1, [src1], 8 + ldr data2, [src2], 8 + cmp data1, data2 + b.ne L(return) + + subs limit, limit, 8 + b.gt L(more16) + + ldr data1, [src1, limit] + ldr data2, [src2, limit] + b L(return) + +L(more16): + ldr data1, [src1], 8 + ldr data2, [src2], 8 + cmp data1, data2 + bne L(return) + + /* Jump directly to comparing the last 16 bytes for 32 byte (or less) + strings. */ + subs limit, limit, 16 + b.ls L(last_bytes) + + /* We overlap loads between 0-32 bytes at either side of SRC1 when we + try to align, so limit it only to strings larger than 128 bytes. */ + cmp limit, 96 + b.ls L(loop16) + + /* Align src1 and adjust src2 with bytes not yet done. */ + and tmp1, src1, 15 + add limit, limit, tmp1 + sub src1, src1, tmp1 + sub src2, src2, tmp1 + + /* Loop performing 16 bytes per iteration using aligned src1. + Limit is pre-decremented by 16 and must be larger than zero. + Exit if <= 16 bytes left to do or if the data is not equal. */ + .p2align 4 +L(loop16): + ldp data1, data1h, [src1], 16 + ldp data2, data2h, [src2], 16 + subs limit, limit, 16 + ccmp data1, data2, 0, hi + ccmp data1h, data2h, 0, eq + b.eq L(loop16) + + cmp data1, data2 + bne L(return) + mov data1, data1h + mov data2, data2h + cmp data1, data2 + bne L(return) + + /* Compare last 1-16 bytes using unaligned access. */ +L(last_bytes): + add src1, src1, limit + add src2, src2, limit + ldp data1, data1h, [src1] + ldp data2, data2h, [src2] + cmp data1, data2 + bne L(return) + mov data1, data1h + mov data2, data2h + cmp data1, data2 + + /* Compare data bytes and set return value to 0, -1 or 1. */ +L(return): +#ifndef __AARCH64EB__ rev data1, data1 rev data2, data2 #endif - /* The MS-non-zero bit of DIFF marks either the first bit - that is different, or the end of the significant data. - Shifting left now will bring the critical information into the - top bits. */ - clz pos, diff - lsl data1, data1, pos - lsl data2, data2, pos - /* But we need to zero-extend (char is unsigned) the value and then - perform a signed 32-bit subtraction. */ - lsr data1, data1, #56 - sub result, data1, data2, lsr #56 - RET - -L(mutual_align): - /* Sources are mutually aligned, but are not currently at an - alignment boundary. Round down the addresses and then mask off - the bytes that precede the start point. */ - bic src1, src1, #7 - bic src2, src2, #7 - add limit, limit, tmp1 /* Adjust the limit for the extra. */ - lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ - ldr data1, [src1], #8 - neg tmp1, tmp1 /* Bits to alignment -64. */ - ldr data2, [src2], #8 - mov tmp2, #~0 -#ifdef __AARCH64EB__ - /* Big-endian. Early bytes are at MSB. */ - lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ -#else - /* Little-endian. Early bytes are at LSB. */ - lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ -#endif - add limit_wd, limit, #7 - orr data1, data1, tmp2 - orr data2, data2, tmp2 - lsr limit_wd, limit_wd, #3 - b L(start_realigned) - -L(ret0): - mov result, #0 - RET - - .p2align 6 -L(misaligned8): - sub limit, limit, #1 -1: - /* Perhaps we can do better than this. */ - ldrb data1w, [src1], #1 - ldrb data2w, [src2], #1 - subs limit, limit, #1 - ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. 
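The new return path replaces the old clz-and-shift dance with a simpler observation: after a rev (byte reverse) on little-endian, a plain unsigned comparison of two 8-byte words orders them exactly as a byte-wise memcmp would. Modelled in C, with the cset/cneg pair just below standing in for the final branchless select:

    #include <stdint.h>

    static int
    word_cmp_result (uint64_t data1, uint64_t data2)
    {
      data1 = __builtin_bswap64 (data1);   /* rev data1 */
      data2 = __builtin_bswap64 (data2);   /* rev data2 */
      if (data1 == data2)
        return 0;
      return data1 < data2 ? -1 : 1;       /* cset/cneg in the asm */
    }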
*/ - b.eq 1b - sub result, data1, data2 - RET + cmp data1, data2 +L(ret_eq): + cset result, ne + cneg result, result, lo + ret + + .p2align 4 + /* Compare up to 8 bytes. Limit is [-8..-1]. */ +L(less8): + adds limit, limit, 4 + b.lo L(less4) + ldr data1w, [src1], 4 + ldr data2w, [src2], 4 + cmp data1w, data2w + b.ne L(return) + sub limit, limit, 4 +L(less4): + adds limit, limit, 4 + beq L(ret_eq) +L(byte_loop): + ldrb data1w, [src1], 1 + ldrb data2w, [src2], 1 + subs limit, limit, 1 + ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */ + b.eq L(byte_loop) + sub result, data1w, data2w + ret + END (memcmp) #undef bcmp weak_alias (memcmp, bcmp) diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S index 442f390426..7e1163e6a0 100644 --- a/sysdeps/aarch64/memcpy.S +++ b/sysdeps/aarch64/memcpy.S @@ -1,4 +1,4 @@ -/* Copyright (C) 2012-2016 Free Software Foundation, Inc. +/* Copyright (C) 2012-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,161 +16,252 @@ License along with the GNU C Library. If not, see <http://www.gnu.org/licenses/>. */ +#include <sysdep.h> + /* Assumptions: * - * ARMv8-a, AArch64 - * Unaligned accesses + * ARMv8-a, AArch64, unaligned accesses. * */ #define dstin x0 #define src x1 #define count x2 -#define tmp1 x3 -#define tmp1w w3 -#define tmp2 x4 -#define tmp2w w4 -#define tmp3 x5 -#define tmp3w w5 -#define dst x6 - -#define A_l x7 -#define A_h x8 -#define B_l x9 -#define B_h x10 -#define C_l x11 -#define C_h x12 -#define D_l x13 -#define D_h x14 +#define dst x3 +#define srcend x4 +#define dstend x5 +#define A_l x6 +#define A_lw w6 +#define A_h x7 +#define A_hw w7 +#define B_l x8 +#define B_lw w8 +#define B_h x9 +#define C_l x10 +#define C_h x11 +#define D_l x12 +#define D_h x13 +#define E_l src +#define E_h count +#define F_l srcend +#define F_h dst +#define G_l count +#define G_h dst +#define tmp1 x14 -#include <sysdep.h> +/* Copies are split into 3 main cases: small copies of up to 16 bytes, + medium copies of 17..96 bytes which are fully unrolled. Large copies + of more than 96 bytes align the destination and use an unrolled loop + processing 64 bytes per iteration. + In order to share code with memmove, small and medium copies read all + data before writing, allowing any kind of overlap. So small, medium + and large backwards memmoves are handled by falling through into memcpy. + Overlapping large forward memmoves use a loop that copies backwards. +*/ -ENTRY_ALIGN (memcpy, 6) - - mov dst, dstin - cmp count, #64 - b.ge L(cpy_not_short) - cmp count, #15 - b.le L(tail15tiny) - - /* Deal with small copies quickly by dropping straight into the - * exit block. */ -L(tail63): - /* Copy up to 48 bytes of data. At this point we only need the - * bottom 6 bits of count to be accurate. */ - ands tmp1, count, #0x30 - b.eq L(tail15) - add dst, dst, tmp1 - add src, src, tmp1 - cmp tmp1w, #0x20 - b.eq 1f - b.lt 2f - ldp A_l, A_h, [src, #-48] - stp A_l, A_h, [dst, #-48] -1: - ldp A_l, A_h, [src, #-32] - stp A_l, A_h, [dst, #-32] -2: - ldp A_l, A_h, [src, #-16] - stp A_l, A_h, [dst, #-16] - -L(tail15): - ands count, count, #15 - beq 1f - add src, src, count - ldp A_l, A_h, [src, #-16] - add dst, dst, count - stp A_l, A_h, [dst, #-16] -1: - RET - -L(tail15tiny): - /* Copy up to 15 bytes of data. Does not assume additional data - being copied. 
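The strategy comment at the top of the new memcpy, that small and medium copies read all data before writing any, is what lets memmove fall through into memcpy for most sizes. The 8..16-byte case shows the idiom: one load from each end, then two stores, correct even when source and destination overlap. A C model, illustrative only:

    #include <stdint.h>
    #include <string.h>

    static void
    copy_8_to_16 (void *dst, const void *src, size_t n)  /* 8 <= n <= 16 */
    {
      uint64_t a, b;
      memcpy (&a, src, 8);                         /* first 8 bytes */
      memcpy (&b, (const char *) src + n - 8, 8);  /* last 8, may overlap a */
      memcpy (dst, &a, 8);
      memcpy ((char *) dst + n - 8, &b, 8);
    }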
*/ - tbz count, #3, 1f - ldr tmp1, [src], #8 - str tmp1, [dst], #8 +#ifndef MEMMOVE +# define MEMMOVE memmove +#endif +#ifndef MEMCPY +# define MEMCPY memcpy +#endif + +ENTRY_ALIGN (MEMMOVE, 6) + + DELOUSE (0) + DELOUSE (1) + DELOUSE (2) + + sub tmp1, dstin, src + cmp count, 96 + ccmp tmp1, count, 2, hi + b.lo L(move_long) + + /* Common case falls through into memcpy. */ +END (MEMMOVE) +libc_hidden_builtin_def (MEMMOVE) +ENTRY (MEMCPY) + + DELOUSE (0) + DELOUSE (1) + DELOUSE (2) + + prfm PLDL1KEEP, [src] + add srcend, src, count + add dstend, dstin, count + cmp count, 16 + b.ls L(copy16) + cmp count, 96 + b.hi L(copy_long) + + /* Medium copies: 17..96 bytes. */ + sub tmp1, count, 1 + ldp A_l, A_h, [src] + tbnz tmp1, 6, L(copy96) + ldp D_l, D_h, [srcend, -16] + tbz tmp1, 5, 1f + ldp B_l, B_h, [src, 16] + ldp C_l, C_h, [srcend, -32] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstend, -32] 1: - tbz count, #2, 1f - ldr tmp1w, [src], #4 - str tmp1w, [dst], #4 + stp A_l, A_h, [dstin] + stp D_l, D_h, [dstend, -16] + ret + + .p2align 4 + /* Small copies: 0..16 bytes. */ +L(copy16): + cmp count, 8 + b.lo 1f + ldr A_l, [src] + ldr A_h, [srcend, -8] + str A_l, [dstin] + str A_h, [dstend, -8] + ret + .p2align 4 1: - tbz count, #1, 1f - ldrh tmp1w, [src], #2 - strh tmp1w, [dst], #2 + tbz count, 2, 1f + ldr A_lw, [src] + ldr A_hw, [srcend, -4] + str A_lw, [dstin] + str A_hw, [dstend, -4] + ret + + /* Copy 0..3 bytes. Use a branchless sequence that copies the same + byte 3 times if count==1, or the 2nd byte twice if count==2. */ 1: - tbz count, #0, 1f - ldrb tmp1w, [src] - strb tmp1w, [dst] + cbz count, 2f + lsr tmp1, count, 1 + ldrb A_lw, [src] + ldrb A_hw, [srcend, -1] + ldrb B_lw, [src, tmp1] + strb A_lw, [dstin] + strb B_lw, [dstin, tmp1] + strb A_hw, [dstend, -1] +2: ret + + .p2align 4 + /* Copy 64..96 bytes. Copy 64 bytes from the start and + 32 bytes from the end. */ +L(copy96): + ldp B_l, B_h, [src, 16] + ldp C_l, C_h, [src, 32] + ldp D_l, D_h, [src, 48] + ldp E_l, E_h, [srcend, -32] + ldp F_l, F_h, [srcend, -16] + stp A_l, A_h, [dstin] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstin, 32] + stp D_l, D_h, [dstin, 48] + stp E_l, E_h, [dstend, -32] + stp F_l, F_h, [dstend, -16] + ret + + /* Align DST to 16 byte alignment so that we don't cross cache line + boundaries on both loads and stores. There are at least 96 bytes + to copy, so copy 16 bytes unaligned and then align. The loop + copies 64 bytes per iteration and prefetches one iteration ahead. */ + + .p2align 4 +L(copy_long): + and tmp1, dstin, 15 + bic dst, dstin, 15 + ldp D_l, D_h, [src] + sub src, src, tmp1 + add count, count, tmp1 /* Count is now 16 too large. */ + ldp A_l, A_h, [src, 16] + stp D_l, D_h, [dstin] + ldp B_l, B_h, [src, 32] + ldp C_l, C_h, [src, 48] + ldp D_l, D_h, [src, 64]! + subs count, count, 128 + 16 /* Test and readjust count. */ + b.ls L(last64) +L(loop64): + stp A_l, A_h, [dst, 16] + ldp A_l, A_h, [src, 16] + stp B_l, B_h, [dst, 32] + ldp B_l, B_h, [src, 32] + stp C_l, C_h, [dst, 48] + ldp C_l, C_h, [src, 48] + stp D_l, D_h, [dst, 64]! + ldp D_l, D_h, [src, 64]! + subs count, count, 64 + b.hi L(loop64) + + /* Write the last full set of 64 bytes. The remainder is at most 64 + bytes, so it is safe to always copy 64 bytes from the end even if + there is just 1 byte left. 
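The branchless 0..3-byte tail above is a neat trick: indexing with count/2 makes one code path serve all three lengths, storing the same byte three times when count==1 and the second byte twice when count==2. In C, illustrative only:

    #include <stddef.h>

    static void
    copy_0_to_3 (char *dst, const char *src, size_t n)  /* n <= 3 */
    {
      if (n == 0)
        return;
      size_t mid = n >> 1;
      char a = src[0], b = src[n - 1], m = src[mid];  /* all loads first */
      dst[0] = a;
      dst[mid] = m;
      dst[n - 1] = b;                                 /* stores last */
    }

Loading everything before storing keeps even this path safe for overlapping memmove calls.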
*/ +L(last64): + ldp E_l, E_h, [srcend, -64] + stp A_l, A_h, [dst, 16] + ldp A_l, A_h, [srcend, -48] + stp B_l, B_h, [dst, 32] + ldp B_l, B_h, [srcend, -32] + stp C_l, C_h, [dst, 48] + ldp C_l, C_h, [srcend, -16] + stp D_l, D_h, [dst, 64] + stp E_l, E_h, [dstend, -64] + stp A_l, A_h, [dstend, -48] + stp B_l, B_h, [dstend, -32] + stp C_l, C_h, [dstend, -16] + ret + + .p2align 4 +L(move_long): + cbz tmp1, 3f + + add srcend, src, count + add dstend, dstin, count + + /* Align dstend to 16 byte alignment so that we don't cross cache line + boundaries on both loads and stores. There are at least 96 bytes + to copy, so copy 16 bytes unaligned and then align. The loop + copies 64 bytes per iteration and prefetches one iteration ahead. */ + + and tmp1, dstend, 15 + ldp D_l, D_h, [srcend, -16] + sub srcend, srcend, tmp1 + sub count, count, tmp1 + ldp A_l, A_h, [srcend, -16] + stp D_l, D_h, [dstend, -16] + ldp B_l, B_h, [srcend, -32] + ldp C_l, C_h, [srcend, -48] + ldp D_l, D_h, [srcend, -64]! + sub dstend, dstend, tmp1 + subs count, count, 128 + b.ls 2f + + nop 1: - RET - -L(cpy_not_short): - /* We don't much care about the alignment of DST, but we want SRC - * to be 128-bit (16 byte) aligned so that we don't cross cache line - * boundaries on both loads and stores. */ - neg tmp2, src - ands tmp2, tmp2, #15 /* Bytes to reach alignment. */ - b.eq 2f - sub count, count, tmp2 - /* Copy more data than needed; it's faster than jumping - * around copying sub-Quadword quantities. We know that - * it can't overrun. */ - ldp A_l, A_h, [src] - add src, src, tmp2 - stp A_l, A_h, [dst] - add dst, dst, tmp2 - /* There may be less than 63 bytes to go now. */ - cmp count, #63 - b.le L(tail63) + stp A_l, A_h, [dstend, -16] + ldp A_l, A_h, [srcend, -16] + stp B_l, B_h, [dstend, -32] + ldp B_l, B_h, [srcend, -32] + stp C_l, C_h, [dstend, -48] + ldp C_l, C_h, [srcend, -48] + stp D_l, D_h, [dstend, -64]! + ldp D_l, D_h, [srcend, -64]! + subs count, count, 64 + b.hi 1b + + /* Write the last full set of 64 bytes. The remainder is at most 64 + bytes, so it is safe to always copy 64 bytes from the start even if + there is just 1 byte left. */ 2: - subs count, count, #128 - b.ge L(cpy_body_large) - /* Less than 128 bytes to copy, so handle 64 here and then jump - * to the tail. */ - ldp A_l, A_h, [src] - ldp B_l, B_h, [src, #16] - ldp C_l, C_h, [src, #32] - ldp D_l, D_h, [src, #48] - stp A_l, A_h, [dst] - stp B_l, B_h, [dst, #16] - stp C_l, C_h, [dst, #32] - stp D_l, D_h, [dst, #48] - tst count, #0x3f - add src, src, #64 - add dst, dst, #64 - b.ne L(tail63) - RET - - /* Critical loop. Start at a new cache line boundary. Assuming - * 64 bytes per line this ensures the entire loop is in one line. */ - .p2align 6 -L(cpy_body_large): - /* There are at least 128 bytes to copy. */ - ldp A_l, A_h, [src, #0] - sub dst, dst, #16 /* Pre-bias. */ - ldp B_l, B_h, [src, #16] - ldp C_l, C_h, [src, #32] - ldp D_l, D_h, [src, #48]! /* src += 64 - Pre-bias. */ -1: - stp A_l, A_h, [dst, #16] - ldp A_l, A_h, [src, #16] - stp B_l, B_h, [dst, #32] - ldp B_l, B_h, [src, #32] - stp C_l, C_h, [dst, #48] - ldp C_l, C_h, [src, #48] - stp D_l, D_h, [dst, #64]! - ldp D_l, D_h, [src, #64]! 
- subs count, count, #64 - b.ge 1b - stp A_l, A_h, [dst, #16] - stp B_l, B_h, [dst, #32] - stp C_l, C_h, [dst, #48] - stp D_l, D_h, [dst, #64] - add src, src, #16 - add dst, dst, #64 + 16 - tst count, #0x3f - b.ne L(tail63) - RET -END (memcpy) -libc_hidden_builtin_def (memcpy) + ldp G_l, G_h, [src, 48] + stp A_l, A_h, [dstend, -16] + ldp A_l, A_h, [src, 32] + stp B_l, B_h, [dstend, -32] + ldp B_l, B_h, [src, 16] + stp C_l, C_h, [dstend, -48] + ldp C_l, C_h, [src] + stp D_l, D_h, [dstend, -64] + stp G_l, G_h, [dstin, 48] + stp A_l, A_h, [dstin, 32] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstin] +3: ret + +END (MEMCPY) +libc_hidden_builtin_def (MEMCPY) diff --git a/sysdeps/aarch64/memmove.S b/sysdeps/aarch64/memmove.S index dd91db0ff6..0feeac8414 100644 --- a/sysdeps/aarch64/memmove.S +++ b/sysdeps/aarch64/memmove.S @@ -1,312 +1 @@ -/* Copyright (C) 2012-2016 Free Software Foundation, Inc. - - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library. If not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> - -/* Assumptions: - * - * ARMv8-a, AArch64 - * Unaligned accesses - */ - -/* Parameters and result. */ -#define dstin x0 -#define src x1 -#define count x2 -#define tmp1 x3 -#define tmp1w w3 -#define tmp2 x4 -#define tmp2w w4 -#define tmp3 x5 -#define tmp3w w5 -#define dst x6 - -#define A_l x7 -#define A_h x8 -#define B_l x9 -#define B_h x10 -#define C_l x11 -#define C_h x12 -#define D_l x13 -#define D_h x14 - -ENTRY_ALIGN (memmove, 6) - - cmp dstin, src - b.lo L(downwards) - add tmp1, src, count - cmp dstin, tmp1 - b.hs memcpy /* No overlap. */ - - /* Upwards move with potential overlap. - * Need to move from the tail backwards. SRC and DST point one - * byte beyond the remaining data to move. */ - add dst, dstin, count - add src, src, count - cmp count, #64 - b.ge L(mov_not_short_up) - - /* Deal with small moves quickly by dropping straight into the - * exit block. */ -L(tail63up): - /* Move up to 48 bytes of data. At this point we only need the - * bottom 6 bits of count to be accurate. */ - ands tmp1, count, #0x30 - b.eq L(tail15up) - sub dst, dst, tmp1 - sub src, src, tmp1 - cmp tmp1w, #0x20 - b.eq 1f - b.lt 2f - ldp A_l, A_h, [src, #32] - stp A_l, A_h, [dst, #32] -1: - ldp A_l, A_h, [src, #16] - stp A_l, A_h, [dst, #16] -2: - ldp A_l, A_h, [src] - stp A_l, A_h, [dst] -L(tail15up): - /* Move up to 15 bytes of data. Does not assume additional data - * being moved. */ - tbz count, #3, 1f - ldr tmp1, [src, #-8]! - str tmp1, [dst, #-8]! -1: - tbz count, #2, 1f - ldr tmp1w, [src, #-4]! - str tmp1w, [dst, #-4]! -1: - tbz count, #1, 1f - ldrh tmp1w, [src, #-2]! - strh tmp1w, [dst, #-2]! -1: - tbz count, #0, 1f - ldrb tmp1w, [src, #-1] - strb tmp1w, [dst, #-1] -1: - RET - -L(mov_not_short_up): - /* We don't much care about the alignment of DST, but we want SRC - * to be 128-bit (16 byte) aligned so that we don't cross cache line - * boundaries on both loads and stores. 
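All of this standalone memmove code is being deleted; the replacement is the three-instruction dispatch at the MEMMOVE entry earlier in memcpy.S. Because the new small and medium paths read everything before writing, only large moves whose destination lies inside the source range need the dedicated backward loop. The range test exploits unsigned wraparound, modelled in C as an illustration:

    #include <stddef.h>
    #include <stdint.h>

    static int
    needs_backward_copy (const void *dst, const void *src, size_t n)
    {
      /* Unsigned wraparound makes one compare do a range check:
         (dst - src) < n  <=>  src <= dst && dst < src + n, i.e. a
         forward copy would clobber bytes it has not yet read.  */
      return (uintptr_t) dst - (uintptr_t) src < n;
    }

In the assembly, ccmp chains this with the count > 96 test so that small and medium overlapping moves still fall straight through into memcpy.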
*/ - ands tmp2, src, #15 /* Bytes to reach alignment. */ - b.eq 2f - sub count, count, tmp2 - /* Move enough data to reach alignment; unlike memcpy, we have to - * be aware of the overlap, which means we can't move data twice. */ - tbz tmp2, #3, 1f - ldr tmp1, [src, #-8]! - str tmp1, [dst, #-8]! -1: - tbz tmp2, #2, 1f - ldr tmp1w, [src, #-4]! - str tmp1w, [dst, #-4]! -1: - tbz tmp2, #1, 1f - ldrh tmp1w, [src, #-2]! - strh tmp1w, [dst, #-2]! -1: - tbz tmp2, #0, 1f - ldrb tmp1w, [src, #-1]! - strb tmp1w, [dst, #-1]! -1: - - /* There may be less than 63 bytes to go now. */ - cmp count, #63 - b.le L(tail63up) -2: - subs count, count, #128 - b.ge L(mov_body_large_up) - /* Less than 128 bytes to move, so handle 64 here and then jump - * to the tail. */ - ldp A_l, A_h, [src, #-64]! - ldp B_l, B_h, [src, #16] - ldp C_l, C_h, [src, #32] - ldp D_l, D_h, [src, #48] - stp A_l, A_h, [dst, #-64]! - stp B_l, B_h, [dst, #16] - stp C_l, C_h, [dst, #32] - stp D_l, D_h, [dst, #48] - tst count, #0x3f - b.ne L(tail63up) - RET - - /* Critical loop. Start at a new Icache line boundary. Assuming - * 64 bytes per line this ensures the entire loop is in one line. */ - .p2align 6 -L(mov_body_large_up): - /* There are at least 128 bytes to move. */ - ldp A_l, A_h, [src, #-16] - ldp B_l, B_h, [src, #-32] - ldp C_l, C_h, [src, #-48] - ldp D_l, D_h, [src, #-64]! -1: - stp A_l, A_h, [dst, #-16] - ldp A_l, A_h, [src, #-16] - stp B_l, B_h, [dst, #-32] - ldp B_l, B_h, [src, #-32] - stp C_l, C_h, [dst, #-48] - ldp C_l, C_h, [src, #-48] - stp D_l, D_h, [dst, #-64]! - ldp D_l, D_h, [src, #-64]! - subs count, count, #64 - b.ge 1b - stp A_l, A_h, [dst, #-16] - stp B_l, B_h, [dst, #-32] - stp C_l, C_h, [dst, #-48] - stp D_l, D_h, [dst, #-64]! - tst count, #0x3f - b.ne L(tail63up) - RET - -L(downwards): - /* For a downwards move we can safely use memcpy provided that - * DST is more than 16 bytes away from SRC. */ - sub tmp1, src, #16 - cmp dstin, tmp1 - b.ls memcpy /* May overlap, but not critically. */ - - mov dst, dstin /* Preserve DSTIN for return value. */ - cmp count, #64 - b.ge L(mov_not_short_down) - - /* Deal with small moves quickly by dropping straight into the - * exit block. */ -L(tail63down): - /* Move up to 48 bytes of data. At this point we only need the - * bottom 6 bits of count to be accurate. */ - ands tmp1, count, #0x30 - b.eq L(tail15down) - add dst, dst, tmp1 - add src, src, tmp1 - cmp tmp1w, #0x20 - b.eq 1f - b.lt 2f - ldp A_l, A_h, [src, #-48] - stp A_l, A_h, [dst, #-48] -1: - ldp A_l, A_h, [src, #-32] - stp A_l, A_h, [dst, #-32] -2: - ldp A_l, A_h, [src, #-16] - stp A_l, A_h, [dst, #-16] -L(tail15down): - /* Move up to 15 bytes of data. Does not assume additional data - being moved. */ - tbz count, #3, 1f - ldr tmp1, [src], #8 - str tmp1, [dst], #8 -1: - tbz count, #2, 1f - ldr tmp1w, [src], #4 - str tmp1w, [dst], #4 -1: - tbz count, #1, 1f - ldrh tmp1w, [src], #2 - strh tmp1w, [dst], #2 -1: - tbz count, #0, 1f - ldrb tmp1w, [src] - strb tmp1w, [dst] -1: - RET - -L(mov_not_short_down): - /* We don't much care about the alignment of DST, but we want SRC - * to be 128-bit (16 byte) aligned so that we don't cross cache line - * boundaries on both loads and stores. */ - neg tmp2, src - ands tmp2, tmp2, #15 /* Bytes to reach alignment. */ - b.eq 2f - sub count, count, tmp2 - /* Move enough data to reach alignment; unlike memcpy, we have to - * be aware of the overlap, which means we can't move data twice. 
*/ - tbz tmp2, #3, 1f - ldr tmp1, [src], #8 - str tmp1, [dst], #8 -1: - tbz tmp2, #2, 1f - ldr tmp1w, [src], #4 - str tmp1w, [dst], #4 -1: - tbz tmp2, #1, 1f - ldrh tmp1w, [src], #2 - strh tmp1w, [dst], #2 -1: - tbz tmp2, #0, 1f - ldrb tmp1w, [src], #1 - strb tmp1w, [dst], #1 -1: - - /* There may be less than 63 bytes to go now. */ - cmp count, #63 - b.le L(tail63down) -2: - subs count, count, #128 - b.ge L(mov_body_large_down) - /* Less than 128 bytes to move, so handle 64 here and then jump - * to the tail. */ - ldp A_l, A_h, [src] - ldp B_l, B_h, [src, #16] - ldp C_l, C_h, [src, #32] - ldp D_l, D_h, [src, #48] - stp A_l, A_h, [dst] - stp B_l, B_h, [dst, #16] - stp C_l, C_h, [dst, #32] - stp D_l, D_h, [dst, #48] - tst count, #0x3f - add src, src, #64 - add dst, dst, #64 - b.ne L(tail63down) - RET - - /* Critical loop. Start at a new cache line boundary. Assuming - * 64 bytes per line this ensures the entire loop is in one line. */ - .p2align 6 -L(mov_body_large_down): - /* There are at least 128 bytes to move. */ - ldp A_l, A_h, [src, #0] - sub dst, dst, #16 /* Pre-bias. */ - ldp B_l, B_h, [src, #16] - ldp C_l, C_h, [src, #32] - ldp D_l, D_h, [src, #48]! /* src += 64 - Pre-bias. */ -1: - stp A_l, A_h, [dst, #16] - ldp A_l, A_h, [src, #16] - stp B_l, B_h, [dst, #32] - ldp B_l, B_h, [src, #32] - stp C_l, C_h, [dst, #48] - ldp C_l, C_h, [src, #48] - stp D_l, D_h, [dst, #64]! - ldp D_l, D_h, [src, #64]! - subs count, count, #64 - b.ge 1b - stp A_l, A_h, [dst, #16] - stp B_l, B_h, [dst, #32] - stp C_l, C_h, [dst, #48] - stp D_l, D_h, [dst, #64] - add src, src, #16 - add dst, dst, #64 + 16 - tst count, #0x3f - b.ne L(tail63down) - RET -END (memmove) - -libc_hidden_builtin_def (memmove) +/* memmove is part of memcpy.S. */ diff --git a/sysdeps/aarch64/memset-reg.h b/sysdeps/aarch64/memset-reg.h new file mode 100644 index 0000000000..ac7c083867 --- /dev/null +++ b/sysdeps/aarch64/memset-reg.h @@ -0,0 +1,30 @@ +/* Register aliases for memset to be used across implementations. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define dstin x0 +#define val x1 +#define valw w1 +#define count x2 +#define dst x3 +#define dstend x4 +#define tmp1 x5 +#define tmp1w w5 +#define tmp2 x6 +#define tmp2w w6 +#define zva_len x7 +#define zva_lenw w7 diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S index e49f4d6faf..4a45459361 100644 --- a/sysdeps/aarch64/memset.S +++ b/sysdeps/aarch64/memset.S @@ -1,4 +1,4 @@ -/* Copyright (C) 2012-2016 Free Software Foundation, Inc. +/* Copyright (C) 2012-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,214 +16,175 @@ License along with the GNU C Library. If not, see <http://www.gnu.org/licenses/>. 
*/ +#include <sysdep.h> +#include "memset-reg.h" + +#ifndef MEMSET +# define MEMSET memset +#endif + /* Assumptions: * - * ARMv8-a, AArch64 - * Unaligned accesses + * ARMv8-a, AArch64, unaligned accesses * */ -#include <sysdep.h> - -/* By default we assume that the DC instruction can be used to zero - data blocks more efficiently. In some circumstances this might be - unsafe, for example in an asymmetric multiprocessor environment with - different DC clear lengths (neither the upper nor lower lengths are - safe to use). The feature can be disabled by defining DONT_USE_DC. - - If code may be run in a virtualized environment, then define - MAYBE_VIRT. This will cause the code to cache the system register - values rather than re-reading them each call. */ - -#define dstin x0 -#define val w1 -#define count x2 -#define tmp1 x3 -#define tmp1w w3 -#define tmp2 x4 -#define tmp2w w4 -#define zva_len_x x5 -#define zva_len w5 -#define zva_bits_x x6 - -#define A_l x7 -#define A_lw w7 -#define dst x8 -#define tmp3w w9 - -ENTRY_ALIGN (__memset, 6) - - mov dst, dstin /* Preserve return value. */ - ands A_lw, val, #255 -#ifndef DONT_USE_DC - b.eq L(zero_mem) -#endif - orr A_lw, A_lw, A_lw, lsl #8 - orr A_lw, A_lw, A_lw, lsl #16 - orr A_l, A_l, A_l, lsl #32 -L(tail_maybe_long): - cmp count, #64 - b.ge L(not_short) -L(tail_maybe_tiny): - cmp count, #15 - b.le L(tail15tiny) -L(tail63): - ands tmp1, count, #0x30 - b.eq L(tail15) - add dst, dst, tmp1 - cmp tmp1w, #0x20 - b.eq 1f - b.lt 2f - stp A_l, A_l, [dst, #-48] -1: - stp A_l, A_l, [dst, #-32] -2: - stp A_l, A_l, [dst, #-16] - -L(tail15): - and count, count, #15 - add dst, dst, count - stp A_l, A_l, [dst, #-16] /* Repeat some/all of last store. */ - RET - -L(tail15tiny): - /* Set up to 15 bytes. Does not assume earlier memory - being set. */ - tbz count, #3, 1f - str A_l, [dst], #8 -1: - tbz count, #2, 1f - str A_lw, [dst], #4 -1: - tbz count, #1, 1f - strh A_lw, [dst], #2 -1: - tbz count, #0, 1f - strb A_lw, [dst] -1: - RET - - /* Critical loop. Start at a new cache line boundary. Assuming - * 64 bytes per line, this ensures the entire loop is in one line. */ - .p2align 6 -L(not_short): - neg tmp2, dst - ands tmp2, tmp2, #15 - b.eq 2f - /* Bring DST to 128-bit (16-byte) alignment. We know that there's - * more than that to set, so we simply store 16 bytes and advance by - * the amount required to reach alignment. */ - sub count, count, tmp2 - stp A_l, A_l, [dst] - add dst, dst, tmp2 - /* There may be less than 63 bytes to go now. */ - cmp count, #63 - b.le L(tail63) -2: - sub dst, dst, #16 /* Pre-bias. */ - sub count, count, #64 -1: - stp A_l, A_l, [dst, #16] - stp A_l, A_l, [dst, #32] - stp A_l, A_l, [dst, #48] - stp A_l, A_l, [dst, #64]! - subs count, count, #64 - b.ge 1b - tst count, #0x3f - add dst, dst, #16 - b.ne L(tail63) - RET - -#ifndef DONT_USE_DC - /* For zeroing memory, check to see if we can use the ZVA feature to - * zero entire 'cache' lines. */ -L(zero_mem): - mov A_l, #0 - cmp count, #63 - b.le L(tail_maybe_tiny) - neg tmp2, dst - ands tmp2, tmp2, #15 - b.eq 1f - sub count, count, tmp2 - stp A_l, A_l, [dst] - add dst, dst, tmp2 - cmp count, #63 - b.le L(tail63) -1: - /* For zeroing small amounts of memory, it's not worth setting up - * the line-clear code. */ - cmp count, #128 - b.lt L(not_short) -#ifdef MAYBE_VIRT - /* For efficiency when virtualized, we cache the ZVA capability. 
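The cached value is computed from the DCZID_EL0 system register: bit 4 (DZP) set means DC ZVA is prohibited, and bits 3:0 give the block size as log2 of the number of words, i.e. 4 << BS bytes. A hedged C equivalent of the read-and-cache sequence below (names invented; AArch64 with GCC-style inline assembly):

  #include <stdint.h>

  static int32_t cache_clear;        /* 0 unknown, -1 no ZVA, else bytes.  */

  static int32_t
  zva_block_bytes (void)
  {
    uint64_t dczid;
    if (cache_clear != 0)
      return cache_clear;            /* Reuse the cached answer.  */
    __asm__ ("mrs %0, dczid_el0" : "=r" (dczid));
    if (dczid & 16)                  /* DZP set: DC ZVA not usable.  */
      return cache_clear = -1;
    return cache_clear = 4 << (dczid & 15);  /* Word size times 2^BS.  */
  }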
*/ - adrp tmp2, L(cache_clear) - ldr zva_len, [tmp2, #:lo12:L(cache_clear)] - tbnz zva_len, #31, L(not_short) - cbnz zva_len, L(zero_by_line) - mrs tmp1, dczid_el0 - tbz tmp1, #4, 1f - /* ZVA not available. Remember this for next time. */ - mov zva_len, #~0 - str zva_len, [tmp2, #:lo12:L(cache_clear)] - b L(not_short) -1: - mov tmp3w, #4 - and zva_len, tmp1w, #15 /* Safety: other bits reserved. */ - lsl zva_len, tmp3w, zva_len - str zva_len, [tmp2, #:lo12:L(cache_clear)] +ENTRY_ALIGN (MEMSET, 6) + + DELOUSE (0) + DELOUSE (2) + + dup v0.16B, valw + add dstend, dstin, count + + cmp count, 96 + b.hi L(set_long) + cmp count, 16 + b.hs L(set_medium) + mov val, v0.D[0] + + /* Set 0..15 bytes. */ + tbz count, 3, 1f + str val, [dstin] + str val, [dstend, -8] + ret + nop +1: tbz count, 2, 2f + str valw, [dstin] + str valw, [dstend, -4] + ret +2: cbz count, 3f + strb valw, [dstin] + tbz count, 1, 3f + strh valw, [dstend, -2] +3: ret + + /* Set 17..96 bytes. */ +L(set_medium): + str q0, [dstin] + tbnz count, 6, L(set96) + str q0, [dstend, -16] + tbz count, 5, 1f + str q0, [dstin, 16] + str q0, [dstend, -32] +1: ret + + .p2align 4 + /* Set 64..96 bytes. Write 64 bytes from the start and + 32 bytes from the end. */ +L(set96): + str q0, [dstin, 16] + stp q0, q0, [dstin, 32] + stp q0, q0, [dstend, -32] + ret + + .p2align 3 + nop +L(set_long): + and valw, valw, 255 + bic dst, dstin, 15 + str q0, [dstin] + cmp count, 256 + ccmp valw, 0, 0, cs + b.eq L(try_zva) +L(no_zva): + sub count, dstend, dst /* Count is 16 too large. */ + add dst, dst, 16 + sub count, count, 64 + 16 /* Adjust count and bias for loop. */ +1: stp q0, q0, [dst], 64 + stp q0, q0, [dst, -32] +L(tail64): + subs count, count, 64 + b.hi 1b +2: stp q0, q0, [dstend, -64] + stp q0, q0, [dstend, -32] + ret + +L(try_zva): +#ifdef ZVA_MACRO + zva_macro #else + .p2align 3 mrs tmp1, dczid_el0 - tbnz tmp1, #4, L(not_short) - mov tmp3w, #4 - and zva_len, tmp1w, #15 /* Safety: other bits reserved. */ - lsl zva_len, tmp3w, zva_len -#endif - -L(zero_by_line): - /* Compute how far we need to go to become suitably aligned. We're - * already at quad-word alignment. */ - cmp count, zva_len_x - b.lt L(not_short) /* Not enough to reach alignment. */ - sub zva_bits_x, zva_len_x, #1 - neg tmp2, dst - ands tmp2, tmp2, zva_bits_x - b.eq 1f /* Already aligned. */ - /* Not aligned, check that there's enough to copy after alignment. */ - sub tmp1, count, tmp2 - cmp tmp1, #64 - ccmp tmp1, zva_len_x, #8, ge /* NZCV=0b1000 */ - b.lt L(not_short) - /* We know that there's at least 64 bytes to zero and that it's safe - * to overrun by 64 bytes. */ - mov count, tmp1 -2: - stp A_l, A_l, [dst] - stp A_l, A_l, [dst, #16] - stp A_l, A_l, [dst, #32] - subs tmp2, tmp2, #64 - stp A_l, A_l, [dst, #48] - add dst, dst, #64 - b.ge 2b - /* We've overrun a bit, so adjust dst downwards. */ - add dst, dst, tmp2 -1: - sub count, count, zva_len_x -3: - dc zva, dst - add dst, dst, zva_len_x - subs count, count, zva_len_x - b.ge 3b - ands count, count, zva_bits_x - b.ne L(tail_maybe_long) - RET -#ifdef MAYBE_VIRT - .bss - .p2align 2 -L(cache_clear): - .space 4 + tbnz tmp1w, 4, L(no_zva) + and tmp1w, tmp1w, 15 + cmp tmp1w, 4 /* ZVA size is 64 bytes. */ + b.ne L(zva_128) + + /* Write the first and last 64 byte aligned block using stp rather + than using DC ZVA. This is faster on some cores. + */ +L(zva_64): + str q0, [dst, 16] + stp q0, q0, [dst, 32] + bic dst, dst, 63 + stp q0, q0, [dst, 64] + stp q0, q0, [dst, 96] + sub count, dstend, dst /* Count is now 128 too large. 
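(128 because the first 128 bytes past the aligned DST were already written with stp.) The DC ZVA loop that follows reduces, in illustrative C, to zeroing one 64-byte line per iteration, with the 0..63-byte tail finished by the overlapping stp pairs:

  /* Zero [dst, end) in whole 64-byte ZVA lines; DST must be 64-byte
     aligned and the caller handles the tail.  AArch64 + GCC only.  */
  static void
  zero_lines_64 (char *dst, char *end)
  {
    for (; end - dst >= 64; dst += 64)
      __asm__ __volatile__ ("dc zva, %0" :: "r" (dst) : "memory");
  }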
*/ + sub count, count, 128+64+64 /* Adjust count and bias for loop. */ + add dst, dst, 128 + nop +1: dc zva, dst + add dst, dst, 64 + subs count, count, 64 + b.hi 1b + stp q0, q0, [dst, 0] + stp q0, q0, [dst, 32] + stp q0, q0, [dstend, -64] + stp q0, q0, [dstend, -32] + ret + + .p2align 3 +L(zva_128): + cmp tmp1w, 5 /* ZVA size is 128 bytes. */ + b.ne L(zva_other) + + str q0, [dst, 16] + stp q0, q0, [dst, 32] + stp q0, q0, [dst, 64] + stp q0, q0, [dst, 96] + bic dst, dst, 127 + sub count, dstend, dst /* Count is now 128 too large. */ + sub count, count, 128+128 /* Adjust count and bias for loop. */ + add dst, dst, 128 +1: dc zva, dst + add dst, dst, 128 + subs count, count, 128 + b.hi 1b + stp q0, q0, [dstend, -128] + stp q0, q0, [dstend, -96] + stp q0, q0, [dstend, -64] + stp q0, q0, [dstend, -32] + ret + +L(zva_other): + mov tmp2w, 4 + lsl zva_lenw, tmp2w, tmp1w + add tmp1, zva_len, 64 /* Max alignment bytes written. */ + cmp count, tmp1 + blo L(no_zva) + + sub tmp2, zva_len, 1 + add tmp1, dst, zva_len + add dst, dst, 16 + subs count, tmp1, dst /* Actual alignment bytes to write. */ + bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */ + beq 2f +1: stp q0, q0, [dst], 64 + stp q0, q0, [dst, -32] + subs count, count, 64 + b.hi 1b +2: mov dst, tmp1 + sub count, dstend, tmp1 /* Remaining bytes to write. */ + subs count, count, zva_len + b.lo 4f +3: dc zva, dst + add dst, dst, zva_len + subs count, count, zva_len + b.hs 3b +4: add count, count, zva_len + b L(tail64) #endif -#endif /* DONT_USE_DC */ -END (__memset) -weak_alias (__memset, memset) -libc_hidden_builtin_def (memset) +END (MEMSET) +libc_hidden_builtin_def (MEMSET) diff --git a/sysdeps/aarch64/memusage.h b/sysdeps/aarch64/memusage.h index 70bf2ce859..874f3b5b3d 100644 --- a/sysdeps/aarch64/memusage.h +++ b/sysdeps/aarch64/memusage.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2000-2016 Free Software Foundation, Inc. +/* Copyright (C) 2000-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile new file mode 100644 index 0000000000..57ffdf7238 --- /dev/null +++ b/sysdeps/aarch64/multiarch/Makefile @@ -0,0 +1,4 @@ +ifeq ($(subdir),string) +sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \ + memcpy_falkor memmove_falkor memset_generic memset_falkor +endif diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c new file mode 100644 index 0000000000..e55be80103 --- /dev/null +++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c @@ -0,0 +1,57 @@ +/* Enumerate available IFUNC implementations of a function. AARCH64 version. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
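A note on the mechanism this file and the selectors below rely on: each IFUNC symbol has a resolver that ld.so calls once at relocation time, and the resolver picks an implementation from CPU identification state (here, the MIDR and ZVA size cached in GLRO(dl_aarch64_cpu_features)). A schematic example using GCC's ifunc attribute instead of glibc's libc_ifunc macro; every name in it is invented:

  #include <stddef.h>
  #include <string.h>

  /* Two stand-in implementations.  */
  static void *
  memcpy_small_cores (void *d, const void *s, size_t n)
  { return memcpy (d, s, n); }

  static void *
  memcpy_big_cores (void *d, const void *s, size_t n)
  { return memcpy (d, s, n); }

  static int
  cpu_is_big (void)
  { return 0; }                      /* Stand-in for a MIDR test.  */

  /* The resolver runs once, before my_memcpy is first called.  */
  static void *(*resolve_memcpy (void)) (void *, const void *, size_t)
  {
    return cpu_is_big () ? memcpy_big_cores : memcpy_small_cores;
  }

  void *my_memcpy (void *, const void *, size_t)
       __attribute__ ((ifunc ("resolve_memcpy")));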
*/ + +#include <assert.h> +#include <string.h> +#include <wchar.h> +#include <ldsodefs.h> +#include <ifunc-impl-list.h> +#include <init-arch.h> +#include <stdio.h> + +/* Maximum number of IFUNC implementations. */ +#define MAX_IFUNC 4 + +size_t +__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + size_t max) +{ + assert (max >= MAX_IFUNC); + + size_t i = 0; + + INIT_ARCH (); + + /* Support sysdeps/aarch64/multiarch/memcpy.c and memmove.c. */ + IFUNC_IMPL (i, name, memcpy, + IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx) + IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx2) + IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_falkor) + IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic)) + IFUNC_IMPL (i, name, memmove, + IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx) + IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_falkor) + IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic)) + IFUNC_IMPL (i, name, memset, + /* Enable this on non-falkor processors too so that other cores + can do a comparative analysis with __memset_generic. */ + IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_falkor) + IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic)) + + return i; +} diff --git a/sysdeps/aarch64/multiarch/init-arch.h b/sysdeps/aarch64/multiarch/init-arch.h new file mode 100644 index 0000000000..d1e5703cb2 --- /dev/null +++ b/sysdeps/aarch64/multiarch/init-arch.h @@ -0,0 +1,25 @@ +/* Define INIT_ARCH so that midr is initialized before use by IFUNCs. + This file is part of the GNU C Library. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <ldsodefs.h> + +#define INIT_ARCH() \ + uint64_t __attribute__((unused)) midr = \ + GLRO(dl_aarch64_cpu_features).midr_el1; \ + unsigned __attribute__((unused)) zva_size = \ + GLRO(dl_aarch64_cpu_features).zva_size; diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c new file mode 100644 index 0000000000..4a04a63b0f --- /dev/null +++ b/sysdeps/aarch64/multiarch/memcpy.c @@ -0,0 +1,47 @@ +/* Multiple versions of memcpy. AARCH64 version. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ + +#if IS_IN (libc) +/* Redefine memcpy so that the compiler won't complain about the type + mismatch with the IFUNC selector in strong_alias, below. */ +# undef memcpy +# define memcpy __redirect_memcpy +# include <string.h> +# include <init-arch.h> + +extern __typeof (__redirect_memcpy) __libc_memcpy; + +extern __typeof (__redirect_memcpy) __memcpy_generic attribute_hidden; +extern __typeof (__redirect_memcpy) __memcpy_thunderx attribute_hidden; +extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden; +extern __typeof (__redirect_memcpy) __memcpy_falkor attribute_hidden; + +libc_ifunc (__libc_memcpy, + (IS_THUNDERX (midr) + ? __memcpy_thunderx + : (IS_FALKOR (midr) || IS_PHECDA (midr) + ? __memcpy_falkor + : (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr) + ? __memcpy_thunderx2 + : __memcpy_generic)))); + +# undef memcpy +strong_alias (__libc_memcpy, memcpy); +#endif diff --git a/sysdeps/aarch64/multiarch/memcpy_falkor.S b/sysdeps/aarch64/multiarch/memcpy_falkor.S new file mode 100644 index 0000000000..cdc2de4750 --- /dev/null +++ b/sysdeps/aarch64/multiarch/memcpy_falkor.S @@ -0,0 +1,191 @@ +/* Optimized memcpy for Qualcomm Falkor processor. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +/* Assumptions: + + ARMv8-a, AArch64, falkor, unaligned accesses. */ + +#define dstin x0 +#define src x1 +#define count x2 +#define dst x3 +#define srcend x4 +#define dstend x5 +#define tmp1 x14 +#define A_x x6 +#define B_x x7 +#define A_w w6 +#define B_w w7 + +#define A_q q0 +#define B_q q1 +#define C_q q2 +#define D_q q3 +#define E_q q4 +#define F_q q5 +#define G_q q6 + +/* Copies are split into 3 main cases: + + 1. Small copies of up to 32 bytes + 2. Medium copies of 33..128 bytes which are fully unrolled + 3. Large copies of more than 128 bytes. + + Large copies align the sourceto a quad word and use an unrolled loop + processing 64 bytes per iteration. + + FALKOR-SPECIFIC DESIGN: + + The smallest copies (32 bytes or less) focus on optimal pipeline usage, + which is why the redundant copies of 0-3 bytes have been replaced with + conditionals, since the former would unnecessarily break across multiple + issue groups. The medium copy group has been enlarged to 128 bytes since + bumping up the small copies up to 32 bytes allows us to do that without + cost and also allows us to reduce the size of the prep code before loop64. + + The copy loop uses only one register q0. This is to ensure that all loads + hit a single hardware prefetcher which can get correctly trained to prefetch + a single stream. 
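(An aside on how this implementation gets selected: the libc_ifunc chain in memcpy.c above tests architecturally defined MIDR_EL1 fields, with the implementer in bits 31:24 and the part number in bits 15:4. A minimal sketch of such a predicate follows; the concrete values 0x51 and 0xc00 are stated from memory as the Qualcomm implementer and Falkor part codes, so treat them as assumptions:)

  #include <stdint.h>

  #define MIDR_IMPLEMENTOR(midr) (((midr) >> 24) & 0xff)
  #define MIDR_PARTNUM(midr)     (((midr) >> 4) & 0xfff)

  /* Illustrative IS_FALKOR-style check.  */
  static inline int
  is_falkor (uint64_t midr)
  {
    return MIDR_IMPLEMENTOR (midr) == 0x51 && MIDR_PARTNUM (midr) == 0xc00;
  }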
+ + The non-temporal stores help optimize cache utilization. */ + +#if IS_IN (libc) +ENTRY_ALIGN (__memcpy_falkor, 6) + + cmp count, 32 + add srcend, src, count + add dstend, dstin, count + b.ls L(copy32) + ldr A_q, [src] + cmp count, 128 + str A_q, [dstin] + b.hi L(copy_long) + + /* Medium copies: 33..128 bytes. */ + sub tmp1, count, 1 + ldr A_q, [src, 16] + ldr B_q, [srcend, -32] + ldr C_q, [srcend, -16] + tbz tmp1, 6, 1f + ldr D_q, [src, 32] + ldr E_q, [src, 48] + str D_q, [dstin, 32] + str E_q, [dstin, 48] + ldr F_q, [srcend, -64] + ldr G_q, [srcend, -48] + str F_q, [dstend, -64] + str G_q, [dstend, -48] +1: + str A_q, [dstin, 16] + str B_q, [dstend, -32] + str C_q, [dstend, -16] + ret + + .p2align 4 + /* Small copies: 0..32 bytes. */ +L(copy32): + /* 16-32 */ + cmp count, 16 + b.lo 1f + ldr A_q, [src] + ldr B_q, [srcend, -16] + str A_q, [dstin] + str B_q, [dstend, -16] + ret + .p2align 4 +1: + /* 8-15 */ + tbz count, 3, 1f + ldr A_x, [src] + ldr B_x, [srcend, -8] + str A_x, [dstin] + str B_x, [dstend, -8] + ret + .p2align 4 +1: + /* 4-7 */ + tbz count, 2, 1f + ldr A_w, [src] + ldr B_w, [srcend, -4] + str A_w, [dstin] + str B_w, [dstend, -4] + ret + .p2align 4 +1: + /* 2-3 */ + tbz count, 1, 1f + ldrh A_w, [src] + ldrh B_w, [srcend, -2] + strh A_w, [dstin] + strh B_w, [dstend, -2] + ret + .p2align 4 +1: + /* 0-1 */ + tbz count, 0, 1f + ldrb A_w, [src] + strb A_w, [dstin] +1: + ret + + /* Align SRC to 16 bytes and copy; that way at least one of the + accesses is aligned throughout the copy sequence. + + The count is off by 0 to 15 bytes, but this is OK because we trim + off the last 64 bytes to copy off from the end. Due to this the + loop never runs out of bounds. */ + .p2align 6 +L(copy_long): + sub count, count, 64 + 16 + and tmp1, src, 15 + bic src, src, 15 + sub dst, dstin, tmp1 + add count, count, tmp1 + +L(loop64): + ldr A_q, [src, 16]! + str A_q, [dst, 16] + ldr A_q, [src, 16]! + subs count, count, 64 + str A_q, [dst, 32] + ldr A_q, [src, 16]! + str A_q, [dst, 48] + ldr A_q, [src, 16]! + str A_q, [dst, 64]! + b.hi L(loop64) + + /* Write the last full set of 64 bytes. The remainder is at most 64 + bytes, so it is safe to always copy 64 bytes from the end even if + there is just 1 byte left. */ +L(last64): + ldr E_q, [srcend, -64] + str E_q, [dstend, -64] + ldr D_q, [srcend, -48] + str D_q, [dstend, -48] + ldr C_q, [srcend, -32] + str C_q, [dstend, -32] + ldr B_q, [srcend, -16] + str B_q, [dstend, -16] + ret + +END (__memcpy_falkor) +libc_hidden_builtin_def (__memcpy_falkor) +#endif diff --git a/sysdeps/aarch64/multiarch/memcpy_generic.S b/sysdeps/aarch64/multiarch/memcpy_generic.S new file mode 100644 index 0000000000..4bc81f4ce5 --- /dev/null +++ b/sysdeps/aarch64/multiarch/memcpy_generic.S @@ -0,0 +1,44 @@ +/* A Generic Optimized memcpy implementation for AARCH64. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* The actual memcpy and memmove code is in ../memcpy.S. If we are + building libc this file defines __memcpy_generic and __memmove_generic. + Otherwise the include of ../memcpy.S will define the normal __memcpy + and__memmove entry points. */ + +#include <sysdep.h> + +#if IS_IN (libc) + +# define MEMCPY __memcpy_generic +# define MEMMOVE __memmove_generic + +/* Do not hide the generic versions of memcpy and memmove, we use them + internally. */ +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(name) + +# ifdef SHARED +/* It doesn't make sense to send libc-internal memcpy calls through a PLT. */ + .globl __GI_memcpy; __GI_memcpy = __memcpy_generic + .globl __GI_memmove; __GI_memmove = __memmove_generic +# endif + +#endif + +#include "../memcpy.S" diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx.S b/sysdeps/aarch64/multiarch/memcpy_thunderx.S new file mode 100644 index 0000000000..de494d933d --- /dev/null +++ b/sysdeps/aarch64/multiarch/memcpy_thunderx.S @@ -0,0 +1,336 @@ +/* A Thunderx Optimized memcpy implementation for AARCH64. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* The actual code in this memcpy and memmove should be identical to the + generic version except for the code under '#ifdef THUNDERX'. This is + to make is easier to keep this version and the generic version in sync + for changes that are not specific to thunderx. */ + +#include <sysdep.h> + +/* Assumptions: + * + * ARMv8-a, AArch64, unaligned accesses. + * + */ + +#define dstin x0 +#define src x1 +#define count x2 +#define dst x3 +#define srcend x4 +#define dstend x5 +#define A_l x6 +#define A_lw w6 +#define A_h x7 +#define A_hw w7 +#define B_l x8 +#define B_lw w8 +#define B_h x9 +#define C_l x10 +#define C_h x11 +#define D_l x12 +#define D_h x13 +#define E_l src +#define E_h count +#define F_l srcend +#define F_h dst +#define G_l count +#define G_h dst +#define tmp1 x14 + +/* Copies are split into 3 main cases: small copies of up to 16 bytes, + medium copies of 17..96 bytes which are fully unrolled. Large copies + of more than 96 bytes align the destination and use an unrolled loop + processing 64 bytes per iteration. + In order to share code with memmove, small and medium copies read all + data before writing, allowing any kind of overlap. So small, medium + and large backwards memmoves are handled by falling through into memcpy. + Overlapping large forward memmoves use a loop that copies backwards. 
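In C, the dispatch just described is a single unsigned comparison, relying on wrap-around so that one test covers both the size threshold and the overlap direction (a sketch with invented names, mirroring the sub/ccmp/b.lo triple at the top of MEMMOVE below):

  #include <stddef.h>
  #include <stdint.h>

  extern void *copy_backward (void *, const void *, size_t); /* hypothetical */
  extern void *plain_memcpy (void *, const void *, size_t);  /* hypothetical */

  void *
  my_memmove (void *dst, const void *src, size_t n)
  {
    /* If DST is below SRC, dst - src wraps to a huge value and fails
       the test, so backward memmoves fall through into memcpy; only a
       large forward move with real overlap takes the backward loop.  */
    if (n > 96 && (uintptr_t) dst - (uintptr_t) src < n)
      return copy_backward (dst, src, n);
    return plain_memcpy (dst, src, n);
  }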
+*/ + +#ifndef MEMMOVE +# define MEMMOVE memmove +#endif +#ifndef MEMCPY +# define MEMCPY memcpy +#endif + +#if IS_IN (libc) + +# ifndef USE_THUNDERX2 +# undef MEMCPY +# define MEMCPY __memcpy_thunderx +# undef MEMMOVE +# define MEMMOVE __memmove_thunderx +# define USE_THUNDERX +# endif + +ENTRY_ALIGN (MEMMOVE, 6) + + DELOUSE (0) + DELOUSE (1) + DELOUSE (2) + + sub tmp1, dstin, src + cmp count, 96 + ccmp tmp1, count, 2, hi + b.lo L(move_long) + + /* Common case falls through into memcpy. */ +END (MEMMOVE) +libc_hidden_builtin_def (MEMMOVE) +ENTRY (MEMCPY) + + DELOUSE (0) + DELOUSE (1) + DELOUSE (2) + + prfm PLDL1KEEP, [src] + add srcend, src, count + add dstend, dstin, count + cmp count, 16 + b.ls L(copy16) + cmp count, 96 + b.hi L(copy_long) + + /* Medium copies: 17..96 bytes. */ + sub tmp1, count, 1 + ldp A_l, A_h, [src] + tbnz tmp1, 6, L(copy96) + ldp D_l, D_h, [srcend, -16] + tbz tmp1, 5, 1f + ldp B_l, B_h, [src, 16] + ldp C_l, C_h, [srcend, -32] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstend, -32] +1: + stp A_l, A_h, [dstin] + stp D_l, D_h, [dstend, -16] + ret + + .p2align 4 + /* Small copies: 0..16 bytes. */ +L(copy16): + cmp count, 8 + b.lo 1f + ldr A_l, [src] + ldr A_h, [srcend, -8] + str A_l, [dstin] + str A_h, [dstend, -8] + ret + .p2align 4 +1: + tbz count, 2, 1f + ldr A_lw, [src] + ldr A_hw, [srcend, -4] + str A_lw, [dstin] + str A_hw, [dstend, -4] + ret + + /* Copy 0..3 bytes. Use a branchless sequence that copies the same + byte 3 times if count==1, or the 2nd byte twice if count==2. */ +1: + cbz count, 2f + lsr tmp1, count, 1 + ldrb A_lw, [src] + ldrb A_hw, [srcend, -1] + ldrb B_lw, [src, tmp1] + strb A_lw, [dstin] + strb B_lw, [dstin, tmp1] + strb A_hw, [dstend, -1] +2: ret + + .p2align 4 + /* Copy 64..96 bytes. Copy 64 bytes from the start and + 32 bytes from the end. */ +L(copy96): + ldp B_l, B_h, [src, 16] + ldp C_l, C_h, [src, 32] + ldp D_l, D_h, [src, 48] + ldp E_l, E_h, [srcend, -32] + ldp F_l, F_h, [srcend, -16] + stp A_l, A_h, [dstin] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstin, 32] + stp D_l, D_h, [dstin, 48] + stp E_l, E_h, [dstend, -32] + stp F_l, F_h, [dstend, -16] + ret + + /* Align DST to 16 byte alignment so that we don't cross cache line + boundaries on both loads and stores. There are at least 96 bytes + to copy, so copy 16 bytes unaligned and then align. The loop + copies 64 bytes per iteration and prefetches one iteration ahead. */ + + .p2align 4 +L(copy_long): + +# if defined(USE_THUNDERX) || defined (USE_THUNDERX2) + + /* On thunderx, large memcpy's are helped by software prefetching. + This loop is identical to the one below it but with prefetching + instructions included. For loops that are less than 32768 bytes, + the prefetching does not help and slow the code down so we only + use the prefetching loop for the largest memcpys. */ + + cmp count, #32768 + b.lo L(copy_long_without_prefetch) + and tmp1, dstin, 15 + bic dst, dstin, 15 + ldp D_l, D_h, [src] + sub src, src, tmp1 +# if defined(USE_THUNDERX) + prfm pldl1strm, [src, 384] +# elif defined(USE_THUNDERX2) + prfm pldl1strm, [src, 256] +# endif + add count, count, tmp1 /* Count is now 16 too large. */ + ldp A_l, A_h, [src, 16] + stp D_l, D_h, [dstin] + ldp B_l, B_h, [src, 32] + ldp C_l, C_h, [src, 48] + ldp D_l, D_h, [src, 64]! + subs count, count, 128 + 16 /* Test and readjust count. 
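(The bias folds the loop-exit test into the subs.) The prefetch-ahead pattern of L(prefetch_loop64) below can be sketched with __builtin_prefetch; the distances come from the prfm offsets above, 384 bytes for ThunderX and 256 for ThunderX2:

  #include <stddef.h>
  #include <string.h>

  /* Illustrative only: copy 64 bytes per iteration while issuing a
     streaming-read prefetch several cache lines ahead of SRC.  */
  static void
  copy_with_prefetch (char *dst, const char *src, size_t blocks)
  {
    for (size_t i = 0; i < blocks; i++)
      {
        __builtin_prefetch (src + 384, 0, 0); /* read, no temporal reuse */
        memcpy (dst, src, 64);
        src += 64;
        dst += 64;
      }
  }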
*/ + +L(prefetch_loop64): +# if defined(USE_THUNDERX) + tbz src, #6, 1f + prfm pldl1strm, [src, 512] +1: +# elif defined(USE_THUNDERX2) + prfm pldl1strm, [src, 256] +# endif + stp A_l, A_h, [dst, 16] + ldp A_l, A_h, [src, 16] + stp B_l, B_h, [dst, 32] + ldp B_l, B_h, [src, 32] + stp C_l, C_h, [dst, 48] + ldp C_l, C_h, [src, 48] + stp D_l, D_h, [dst, 64]! + ldp D_l, D_h, [src, 64]! + subs count, count, 64 + b.hi L(prefetch_loop64) + b L(last64) + +L(copy_long_without_prefetch): +# endif + + and tmp1, dstin, 15 + bic dst, dstin, 15 + ldp D_l, D_h, [src] + sub src, src, tmp1 + add count, count, tmp1 /* Count is now 16 too large. */ + ldp A_l, A_h, [src, 16] + stp D_l, D_h, [dstin] + ldp B_l, B_h, [src, 32] + ldp C_l, C_h, [src, 48] + ldp D_l, D_h, [src, 64]! + subs count, count, 128 + 16 /* Test and readjust count. */ + b.ls L(last64) +L(loop64): + stp A_l, A_h, [dst, 16] + ldp A_l, A_h, [src, 16] + stp B_l, B_h, [dst, 32] + ldp B_l, B_h, [src, 32] + stp C_l, C_h, [dst, 48] + ldp C_l, C_h, [src, 48] + stp D_l, D_h, [dst, 64]! + ldp D_l, D_h, [src, 64]! + subs count, count, 64 + b.hi L(loop64) + + /* Write the last full set of 64 bytes. The remainder is at most 64 + bytes, so it is safe to always copy 64 bytes from the end even if + there is just 1 byte left. */ +L(last64): + ldp E_l, E_h, [srcend, -64] + stp A_l, A_h, [dst, 16] + ldp A_l, A_h, [srcend, -48] + stp B_l, B_h, [dst, 32] + ldp B_l, B_h, [srcend, -32] + stp C_l, C_h, [dst, 48] + ldp C_l, C_h, [srcend, -16] + stp D_l, D_h, [dst, 64] + stp E_l, E_h, [dstend, -64] + stp A_l, A_h, [dstend, -48] + stp B_l, B_h, [dstend, -32] + stp C_l, C_h, [dstend, -16] + ret + + .p2align 4 +L(move_long): + cbz tmp1, 3f + + add srcend, src, count + add dstend, dstin, count + + /* Align dstend to 16 byte alignment so that we don't cross cache line + boundaries on both loads and stores. There are at least 96 bytes + to copy, so copy 16 bytes unaligned and then align. The loop + copies 64 bytes per iteration and prefetches one iteration ahead. */ + + and tmp1, dstend, 15 + ldp D_l, D_h, [srcend, -16] + sub srcend, srcend, tmp1 + sub count, count, tmp1 + ldp A_l, A_h, [srcend, -16] + stp D_l, D_h, [dstend, -16] + ldp B_l, B_h, [srcend, -32] + ldp C_l, C_h, [srcend, -48] + ldp D_l, D_h, [srcend, -64]! + sub dstend, dstend, tmp1 + subs count, count, 128 + b.ls 2f + + nop +1: + stp A_l, A_h, [dstend, -16] + ldp A_l, A_h, [srcend, -16] + stp B_l, B_h, [dstend, -32] + ldp B_l, B_h, [srcend, -32] + stp C_l, C_h, [dstend, -48] + ldp C_l, C_h, [srcend, -48] + stp D_l, D_h, [dstend, -64]! + ldp D_l, D_h, [srcend, -64]! + subs count, count, 64 + b.hi 1b + + /* Write the last full set of 64 bytes. The remainder is at most 64 + bytes, so it is safe to always copy 64 bytes from the start even if + there is just 1 byte left. */ +2: + ldp G_l, G_h, [src, 48] + stp A_l, A_h, [dstend, -16] + ldp A_l, A_h, [src, 32] + stp B_l, B_h, [dstend, -32] + ldp B_l, B_h, [src, 16] + stp C_l, C_h, [dstend, -48] + ldp C_l, C_h, [src] + stp D_l, D_h, [dstend, -64] + stp G_l, G_h, [dstin, 48] + stp A_l, A_h, [dstin, 32] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstin] +3: ret + +END (MEMCPY) +libc_hidden_builtin_def (MEMCPY) + +#endif diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx2.S b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S new file mode 100644 index 0000000000..8501abf725 --- /dev/null +++ b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S @@ -0,0 +1,27 @@ +/* A Thunderx2 Optimized memcpy implementation for AARCH64. + Copyright (C) 2018 Free Software Foundation, Inc. 
+ + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* The actual code in this memcpy and memmove is in memcpy_thunderx.S. + The only real differences are with the prefetching instructions. */ + +#define MEMCPY __memcpy_thunderx2 +#define MEMMOVE __memmove_thunderx2 +#define USE_THUNDERX2 + +#include "memcpy_thunderx.S" diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c new file mode 100644 index 0000000000..e69d816291 --- /dev/null +++ b/sysdeps/aarch64/multiarch/memmove.c @@ -0,0 +1,44 @@ +/* Multiple versions of memmove. AARCH64 version. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ + +#if IS_IN (libc) +/* Redefine memmove so that the compiler won't complain about the type + mismatch with the IFUNC selector in strong_alias, below. */ +# undef memmove +# define memmove __redirect_memmove +# include <string.h> +# include <init-arch.h> + +extern __typeof (__redirect_memmove) __libc_memmove; + +extern __typeof (__redirect_memmove) __memmove_generic attribute_hidden; +extern __typeof (__redirect_memmove) __memmove_thunderx attribute_hidden; +extern __typeof (__redirect_memmove) __memmove_falkor attribute_hidden; + +libc_ifunc (__libc_memmove, + (IS_THUNDERX (midr) + ? __memmove_thunderx + : (IS_FALKOR (midr) || IS_PHECDA (midr) + ? __memmove_falkor + : __memmove_generic))); + +# undef memmove +strong_alias (__libc_memmove, memmove); +#endif diff --git a/sysdeps/aarch64/multiarch/memmove_falkor.S b/sysdeps/aarch64/multiarch/memmove_falkor.S new file mode 100644 index 0000000000..5163f68e53 --- /dev/null +++ b/sysdeps/aarch64/multiarch/memmove_falkor.S @@ -0,0 +1,225 @@ +/* Copyright (C) 2017-2018 Free Software Foundation, Inc. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +/* Assumptions: ARMv8-a, AArch64, falkor, unaligned accesses. */ + +#define dstin x0 +#define src x1 +#define count x2 +#define dst x3 +#define srcend x4 +#define dstend x5 +#define A_x x6 +#define B_x x7 +#define A_w w6 +#define B_w w7 +#define tmp1 x14 + +#define Q_q q6 +#define A_q q22 +#define B_q q18 +#define C_q q19 +#define D_q q20 +#define E_q q21 +#define F_q q17 +#define G_q q23 + +/* RATIONALE: + + The move has 4 distinct parts: + * Small moves of 16 bytes and under + * Medium sized moves of 17-96 bytes + * Large moves where the source address is higher than the destination + (forward copies) + * Large moves where the destination address is higher than the source + (copy backward, or move). + + We use only two registers q6 and q22 for the moves and move 32 bytes at a + time to correctly train the hardware prefetcher for better throughput. */ +ENTRY_ALIGN (__memmove_falkor, 6) + + sub tmp1, dstin, src + add srcend, src, count + add dstend, dstin, count + cmp count, 96 + ccmp tmp1, count, 2, hi + b.lo L(move_long) + + cmp count, 16 + b.ls L(copy16) + cmp count, 96 + b.hi L(copy_long) + + /* Medium copies: 17..96 bytes. */ + sub tmp1, count, 1 + ldr A_q, [src] + tbnz tmp1, 6, L(copy96) + ldr D_q, [srcend, -16] + tbz tmp1, 5, 1f + ldr B_q, [src, 16] + ldr C_q, [srcend, -32] + str B_q, [dstin, 16] + str C_q, [dstend, -32] +1: + str A_q, [dstin] + str D_q, [dstend, -16] + ret + + .p2align 4 + /* Small copies: 0..16 bytes. */ +L(copy16): + cmp count, 8 + b.lo 1f + ldr A_x, [src] + ldr B_x, [srcend, -8] + str A_x, [dstin] + str B_x, [dstend, -8] + ret + .p2align 4 +1: + /* 4-7 */ + tbz count, 2, 1f + ldr A_w, [src] + ldr B_w, [srcend, -4] + str A_w, [dstin] + str B_w, [dstend, -4] + ret + .p2align 4 +1: + /* 2-3 */ + tbz count, 1, 1f + ldrh A_w, [src] + ldrh B_w, [srcend, -2] + strh A_w, [dstin] + strh B_w, [dstend, -2] + ret + .p2align 4 +1: + /* 0-1 */ + tbz count, 0, 1f + ldrb A_w, [src] + strb A_w, [dstin] +1: ret + + .p2align 4 + /* Copy 64..96 bytes. Copy 64 bytes from the start and + 32 bytes from the end. */ +L(copy96): + ldr B_q, [src, 16] + ldr C_q, [src, 32] + ldr D_q, [src, 48] + ldr E_q, [srcend, -32] + ldr F_q, [srcend, -16] + str A_q, [dstin] + str B_q, [dstin, 16] + str C_q, [dstin, 32] + str D_q, [dstin, 48] + str E_q, [dstend, -32] + str F_q, [dstend, -16] + ret + + /* Align SRC to 16 byte alignment so that we don't cross cache line + boundaries on both loads and stores. There are at least 96 bytes + to copy, so copy 16 bytes unaligned and then align. The loop + copies 32 bytes per iteration and prefetches one iteration ahead. */ + + .p2align 4 +L(copy_long): + ldr A_q, [src] + and tmp1, src, 15 + bic src, src, 15 + sub dst, dstin, tmp1 + add count, count, tmp1 /* Count is now 16 too large. */ + ldr Q_q, [src, 16]! + str A_q, [dstin] + ldr A_q, [src, 16]! + subs count, count, 32 + 64 + 16 /* Test and readjust count. */ + b.ls L(last64) + +L(loop64): + subs count, count, 32 + str Q_q, [dst, 16] + ldr Q_q, [src, 16]! + str A_q, [dst, 32]! + ldr A_q, [src, 16]! + b.hi L(loop64) + + /* Write the last full set of 64 bytes. 
The remainder is at most 64 + bytes and at least 33 bytes, so it is safe to always copy 64 bytes + from the end. */ +L(last64): + ldr C_q, [srcend, -64] + str Q_q, [dst, 16] + ldr B_q, [srcend, -48] + str A_q, [dst, 32] + ldr A_q, [srcend, -32] + ldr D_q, [srcend, -16] + str C_q, [dstend, -64] + str B_q, [dstend, -48] + str A_q, [dstend, -32] + str D_q, [dstend, -16] + ret + + .p2align 4 +L(move_long): + cbz tmp1, 3f + + /* Align SRCEND to 16 byte alignment so that we don't cross cache line + boundaries on both loads and stores. There are at least 96 bytes + to copy, so copy 16 bytes unaligned and then align. The loop + copies 32 bytes per iteration and prefetches one iteration ahead. */ + + ldr A_q, [srcend, -16] + and tmp1, srcend, 15 + sub srcend, srcend, tmp1 + ldr Q_q, [srcend, -16]! + str A_q, [dstend, -16] + sub count, count, tmp1 + ldr A_q, [srcend, -16]! + sub dstend, dstend, tmp1 + subs count, count, 32 + 64 + b.ls 2f + +1: + subs count, count, 32 + str Q_q, [dstend, -16] + ldr Q_q, [srcend, -16]! + str A_q, [dstend, -32]! + ldr A_q, [srcend, -16]! + b.hi 1b + + /* Write the last full set of 64 bytes. The remainder is at most 64 + bytes and at least 33 bytes, so it is safe to always copy 64 bytes + from the start. */ +2: + ldr C_q, [src, 48] + str Q_q, [dstend, -16] + ldr B_q, [src, 32] + str A_q, [dstend, -32] + ldr A_q, [src, 16] + ldr D_q, [src] + str C_q, [dstin, 48] + str B_q, [dstin, 32] + str A_q, [dstin, 16] + str D_q, [dstin] +3: ret + +END (__memmove_falkor) +libc_hidden_builtin_def (__memmove_falkor) diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c new file mode 100644 index 0000000000..d74ed3a549 --- /dev/null +++ b/sysdeps/aarch64/multiarch/memset.c @@ -0,0 +1,41 @@ +/* Multiple versions of memset. AARCH64 version. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ + +#if IS_IN (libc) +/* Redefine memset so that the compiler won't complain about the type + mismatch with the IFUNC selector in strong_alias, below. */ +# undef memset +# define memset __redirect_memset +# include <string.h> +# include <init-arch.h> + +extern __typeof (__redirect_memset) __libc_memset; + +extern __typeof (__redirect_memset) __memset_falkor attribute_hidden; +extern __typeof (__redirect_memset) __memset_generic attribute_hidden; + +libc_ifunc (__libc_memset, + ((IS_FALKOR (midr) || IS_PHECDA (midr)) && zva_size == 64 + ? __memset_falkor + : __memset_generic)); + +# undef memset +strong_alias (__libc_memset, memset); +#endif diff --git a/sysdeps/aarch64/multiarch/memset_falkor.S b/sysdeps/aarch64/multiarch/memset_falkor.S new file mode 100644 index 0000000000..16849a5afb --- /dev/null +++ b/sysdeps/aarch64/multiarch/memset_falkor.S @@ -0,0 +1,53 @@ +/* Memset for falkor. 
+ Copyright (C) 2017-2018 Free Software Foundation, Inc. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <http://www.gnu.org/licenses/>. */ + +#include <memset-reg.h> + +/* Reading dczid_el0 is expensive on falkor so move it into the ifunc + resolver and assume ZVA size of 64 bytes. The IFUNC resolver takes care to + use this function only when ZVA is enabled. */ + +#if IS_IN (libc) +.macro zva_macro + .p2align 4 + /* Write the first and last 64 byte aligned block using stp rather + than using DC ZVA. This is faster on some cores. */ + str q0, [dst, 16] + stp q0, q0, [dst, 32] + bic dst, dst, 63 + stp q0, q0, [dst, 64] + stp q0, q0, [dst, 96] + sub count, dstend, dst /* Count is now 128 too large. */ + sub count, count, 128+64+64 /* Adjust count and bias for loop. */ + add dst, dst, 128 +1: dc zva, dst + add dst, dst, 64 + subs count, count, 64 + b.hi 1b + stp q0, q0, [dst, 0] + stp q0, q0, [dst, 32] + stp q0, q0, [dstend, -64] + stp q0, q0, [dstend, -32] + ret +.endm + +# define ZVA_MACRO zva_macro +# define MEMSET __memset_falkor +# include <sysdeps/aarch64/memset.S> +#endif diff --git a/sysdeps/aarch64/multiarch/memset_generic.S b/sysdeps/aarch64/multiarch/memset_generic.S new file mode 100644 index 0000000000..345a88b94f --- /dev/null +++ b/sysdeps/aarch64/multiarch/memset_generic.S @@ -0,0 +1,27 @@ +/* Memset for aarch64, default version for internal use. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) +# define MEMSET __memset_generic +/* Add a hidden definition for use within libc.so. */ +# ifdef SHARED + .globl __GI_memset; __GI_memset = __memset_generic +# endif +# include <sysdeps/aarch64/memset.S> +#endif diff --git a/sysdeps/aarch64/multiarch/rtld-memset.S b/sysdeps/aarch64/multiarch/rtld-memset.S new file mode 100644 index 0000000000..8cdc6e1f50 --- /dev/null +++ b/sysdeps/aarch64/multiarch/rtld-memset.S @@ -0,0 +1,23 @@ +/* Memset for aarch64, for the dynamic linker. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (rtld) +# define MEMSET memset +# include <sysdeps/aarch64/memset.S> +#endif diff --git a/sysdeps/aarch64/nptl/Makefile b/sysdeps/aarch64/nptl/Makefile index 486aeca81e..bebbdef871 100644 --- a/sysdeps/aarch64/nptl/Makefile +++ b/sysdeps/aarch64/nptl/Makefile @@ -1,4 +1,4 @@ -# Copyright (C) 2005-2016 Free Software Foundation, Inc. +# Copyright (C) 2005-2018 Free Software Foundation, Inc. # # This file is part of the GNU C Library. # diff --git a/sysdeps/aarch64/nptl/bits/pthreadtypes-arch.h b/sysdeps/aarch64/nptl/bits/pthreadtypes-arch.h new file mode 100644 index 0000000000..008aa7e0c7 --- /dev/null +++ b/sysdeps/aarch64/nptl/bits/pthreadtypes-arch.h @@ -0,0 +1,71 @@ +/* Copyright (C) 2002-2018 Free Software Foundation, Inc. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _BITS_PTHREADTYPES_ARCH_H +#define _BITS_PTHREADTYPES_ARCH_H 1 + +#include <endian.h> + +#ifdef __ILP32__ +# define __SIZEOF_PTHREAD_ATTR_T 32 +# define __SIZEOF_PTHREAD_MUTEX_T 32 +# define __SIZEOF_PTHREAD_MUTEXATTR_T 4 +# define __SIZEOF_PTHREAD_CONDATTR_T 4 +# define __SIZEOF_PTHREAD_RWLOCK_T 48 +# define __SIZEOF_PTHREAD_BARRIER_T 20 +# define __SIZEOF_PTHREAD_BARRIERATTR_T 4 +#else +# define __SIZEOF_PTHREAD_ATTR_T 64 +# define __SIZEOF_PTHREAD_MUTEX_T 48 +# define __SIZEOF_PTHREAD_MUTEXATTR_T 8 +# define __SIZEOF_PTHREAD_CONDATTR_T 8 +# define __SIZEOF_PTHREAD_RWLOCK_T 56 +# define __SIZEOF_PTHREAD_BARRIER_T 32 +# define __SIZEOF_PTHREAD_BARRIERATTR_T 8 +#endif +#define __SIZEOF_PTHREAD_COND_T 48 +#define __SIZEOF_PTHREAD_RWLOCKATTR_T 8 + +/* Definitions for internal mutex struct. 
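The __SIZEOF_* constants above are ABI promises. A quick compile-time check of what they pin down, as a sketch outside glibc (C11 or later, against the installed headers):

  #include <pthread.h>
  #include <semaphore.h>
  #include <assert.h>

  static_assert (sizeof (pthread_mutex_t) == __SIZEOF_PTHREAD_MUTEX_T,
                 "pthread_mutex_t must match its advertised ABI size");
  static_assert (sizeof (pthread_rwlock_t) == __SIZEOF_PTHREAD_RWLOCK_T,
                 "pthread_rwlock_t must match its advertised ABI size");
  static_assert (sizeof (sem_t) == __SIZEOF_SEM_T,
                 "sem_t must match its advertised ABI size");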
*/ +#define __PTHREAD_COMPAT_PADDING_MID +#define __PTHREAD_COMPAT_PADDING_END +#define __PTHREAD_MUTEX_LOCK_ELISION 0 +#define __PTHREAD_MUTEX_NUSERS_AFTER_KIND 0 +#define __PTHREAD_MUTEX_USE_UNION 0 + +#define __LOCK_ALIGNMENT +#define __ONCE_ALIGNMENT + +struct __pthread_rwlock_arch_t +{ + unsigned int __readers; + unsigned int __writers; + unsigned int __wrphase_futex; + unsigned int __writers_futex; + unsigned int __pad3; + unsigned int __pad4; + int __cur_writer; + int __shared; + unsigned long int __pad1; + unsigned long int __pad2; + unsigned int __flags; +}; + +#define __PTHREAD_RWLOCK_ELISION_EXTRA 0 + +#endif /* bits/pthreadtypes.h */ diff --git a/sysdeps/aarch64/nptl/bits/pthreadtypes.h b/sysdeps/aarch64/nptl/bits/pthreadtypes.h deleted file mode 100644 index 13984a718d..0000000000 --- a/sysdeps/aarch64/nptl/bits/pthreadtypes.h +++ /dev/null @@ -1,174 +0,0 @@ -/* Copyright (C) 2002-2016 Free Software Foundation, Inc. - - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#ifndef _BITS_PTHREADTYPES_H -#define _BITS_PTHREADTYPES_H 1 - -#include <endian.h> - -#define __SIZEOF_PTHREAD_ATTR_T 64 -#define __SIZEOF_PTHREAD_MUTEX_T 48 -#define __SIZEOF_PTHREAD_MUTEXATTR_T 8 -#define __SIZEOF_PTHREAD_COND_T 48 -#define __SIZEOF_PTHREAD_COND_COMPAT_T 48 -#define __SIZEOF_PTHREAD_CONDATTR_T 8 -#define __SIZEOF_PTHREAD_RWLOCK_T 56 -#define __SIZEOF_PTHREAD_RWLOCKATTR_T 8 -#define __SIZEOF_PTHREAD_BARRIER_T 32 -#define __SIZEOF_PTHREAD_BARRIERATTR_T 8 - - -/* Thread identifiers. The structure of the attribute type is not - exposed on purpose. */ -typedef unsigned long int pthread_t; - - -union pthread_attr_t -{ - char __size[__SIZEOF_PTHREAD_ATTR_T]; - long int __align; -}; -#ifndef __have_pthread_attr_t -typedef union pthread_attr_t pthread_attr_t; -# define __have_pthread_attr_t1 -#endif - -typedef struct __pthread_internal_list -{ - struct __pthread_internal_list *__prev; - struct __pthread_internal_list *__next; -} __pthread_list_t; - - -/* Data structures for mutex handling. The structure of the attribute - type is not exposed on purpose. */ -typedef union -{ - struct __pthread_mutex_s - { - int __lock; - unsigned int __count; - int __owner; - unsigned int __nusers; - int __kind; - int __spins; - __pthread_list_t __list; -#define __PTHREAD_MUTEX_HAVE_PREV 1 - } __data; - char __size[__SIZEOF_PTHREAD_MUTEX_T]; - long int __align; -} pthread_mutex_t; - -/* Mutex __spins initializer used by PTHREAD_MUTEX_INITIALIZER. */ -#define __PTHREAD_SPINS 0 - -typedef union -{ - char __size[__SIZEOF_PTHREAD_MUTEXATTR_T]; - long int __align; -} pthread_mutexattr_t; - - -/* Data structure for conditional variable handling. The structure of - the attribute type is not exposed on purpose. 
*/ -typedef union -{ - struct - { - int __lock; - unsigned int __futex; - __extension__ unsigned long long int __total_seq; - __extension__ unsigned long long int __wakeup_seq; - __extension__ unsigned long long int __woken_seq; - void *__mutex; - unsigned int __nwaiters; - unsigned int __broadcast_seq; - } __data; - char __size[__SIZEOF_PTHREAD_COND_T]; - long int __align; -} pthread_cond_t; - -typedef union -{ - char __size[__SIZEOF_PTHREAD_CONDATTR_T]; - int __align; -} pthread_condattr_t; - - -/* Keys for thread-specific data */ -typedef unsigned int pthread_key_t; - - -/* Once-only execution */ -typedef int pthread_once_t; - - -#if defined __USE_UNIX98 || defined __USE_XOPEN2K -/* Data structure for read-write lock variable handling. The - structure of the attribute type is not exposed on purpose. */ -typedef union -{ - struct - { - int __lock; - unsigned int __nr_readers; - unsigned int __readers_wakeup; - unsigned int __writer_wakeup; - unsigned int __nr_readers_queued; - unsigned int __nr_writers_queued; - int __writer; - int __shared; - unsigned long int __pad1; - unsigned long int __pad2; - unsigned int __flags; - } __data; - char __size[__SIZEOF_PTHREAD_RWLOCK_T]; - long int __align; -} pthread_rwlock_t; - -#define __PTHREAD_RWLOCK_ELISION_EXTRA 0 - -typedef union -{ - char __size[__SIZEOF_PTHREAD_RWLOCKATTR_T]; - long int __align; -} pthread_rwlockattr_t; -#endif - - -#ifdef __USE_XOPEN2K -/* POSIX spinlock data type. */ -typedef volatile int pthread_spinlock_t; - - -/* POSIX barriers data type. The structure of the type is - deliberately not exposed. */ -typedef union -{ - char __size[__SIZEOF_PTHREAD_BARRIER_T]; - long int __align; -} pthread_barrier_t; - -typedef union -{ - char __size[__SIZEOF_PTHREAD_BARRIERATTR_T]; - int __align; -} pthread_barrierattr_t; -#endif - -#endif /* bits/pthreadtypes.h */ diff --git a/sysdeps/aarch64/nptl/bits/semaphore.h b/sysdeps/aarch64/nptl/bits/semaphore.h index 3cc5b37847..18f8aae532 100644 --- a/sysdeps/aarch64/nptl/bits/semaphore.h +++ b/sysdeps/aarch64/nptl/bits/semaphore.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2002-2016 Free Software Foundation, Inc. +/* Copyright (C) 2002-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -21,7 +21,11 @@ #endif -#define __SIZEOF_SEM_T 32 +#ifdef __ILP32__ +# define __SIZEOF_SEM_T 16 +#else +# define __SIZEOF_SEM_T 32 +#endif /* Value returned if `sem_open' failed. */ @@ -31,5 +35,5 @@ typedef union { char __size[__SIZEOF_SEM_T]; - long int __align; + long long int __align; } sem_t; diff --git a/sysdeps/aarch64/nptl/pthread-offsets.h b/sysdeps/aarch64/nptl/pthread-offsets.h new file mode 100644 index 0000000000..16c6b0d9fd --- /dev/null +++ b/sysdeps/aarch64/nptl/pthread-offsets.h @@ -0,0 +1,5 @@ +#define __PTHREAD_MUTEX_NUSERS_OFFSET 12 +#define __PTHREAD_MUTEX_KIND_OFFSET 16 +#define __PTHREAD_MUTEX_SPINS_OFFSET 20 +#define __PTHREAD_MUTEX_ELISION_OFFSET 22 +#define __PTHREAD_MUTEX_LIST_OFFSET 24 diff --git a/sysdeps/aarch64/nptl/pthread_spin_lock.c b/sysdeps/aarch64/nptl/pthread_spin_lock.c deleted file mode 100644 index 21a54b8215..0000000000 --- a/sysdeps/aarch64/nptl/pthread_spin_lock.c +++ /dev/null @@ -1,24 +0,0 @@ -/* Copyright (C) 2008-2016 Free Software Foundation, Inc. - - This file is part of the GNU C Library. 
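(Back to pthread-offsets.h above: those constants are the byte offsets of fields inside the public pthread_mutex_t on LP64 AArch64, and can be cross-checked against the installed headers with offsetof. A sketch, not glibc code:)

  #include <stddef.h>
  #include <pthread.h>
  #include <assert.h>

  static_assert (offsetof (pthread_mutex_t, __data.__nusers) == 12,
                 "__PTHREAD_MUTEX_NUSERS_OFFSET");
  static_assert (offsetof (pthread_mutex_t, __data.__kind) == 16,
                 "__PTHREAD_MUTEX_KIND_OFFSET");
  static_assert (offsetof (pthread_mutex_t, __data.__spins) == 20,
                 "__PTHREAD_MUTEX_SPINS_OFFSET");
  static_assert (offsetof (pthread_mutex_t, __data.__list) == 24,
                 "__PTHREAD_MUTEX_LIST_OFFSET");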
- - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation; either version 2.1 of the - License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#define SPIN_LOCK_READS_BETWEEN_CMPXCHG 1000 - -/* We can't use the normal "#include <nptl/pthread_spin_lock.c>" because - it will resolve to this very file. Using "sysdeps/.." as reference to the - top level directory does the job. */ -#include <sysdeps/../nptl/pthread_spin_lock.c> diff --git a/sysdeps/aarch64/nptl/pthreaddef.h b/sysdeps/aarch64/nptl/pthreaddef.h index 5aa8d561e7..32f729ada1 100644 --- a/sysdeps/aarch64/nptl/pthreaddef.h +++ b/sysdeps/aarch64/nptl/pthreaddef.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2002-2016 Free Software Foundation, Inc. +/* Copyright (C) 2002-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/nptl/tcb-offsets.sym b/sysdeps/aarch64/nptl/tcb-offsets.sym index 0677aeabff..238647dd47 100644 --- a/sysdeps/aarch64/nptl/tcb-offsets.sym +++ b/sysdeps/aarch64/nptl/tcb-offsets.sym @@ -2,6 +2,5 @@ #include <tls.h> PTHREAD_MULTIPLE_THREADS_OFFSET offsetof (struct pthread, header.multiple_threads) -PTHREAD_PID_OFFSET offsetof (struct pthread, pid) PTHREAD_TID_OFFSET offsetof (struct pthread, tid) PTHREAD_SIZEOF sizeof (struct pthread) diff --git a/sysdeps/aarch64/nptl/tls.h b/sysdeps/aarch64/nptl/tls.h index 95ea3f9a1a..200334f84a 100644 --- a/sysdeps/aarch64/nptl/tls.h +++ b/sysdeps/aarch64/nptl/tls.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2005-2016 Free Software Foundation, Inc. +/* Copyright (C) 2005-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -25,17 +25,7 @@ # include <stdbool.h> # include <stddef.h> # include <stdint.h> - -/* Type for the dtv. */ -typedef union dtv -{ - size_t counter; - struct - { - void *val; - bool is_static; - } pointer; -} dtv_t; +# include <dl-dtv.h> #else /* __ASSEMBLER__ */ # include <tcb-offsets.h> @@ -119,6 +109,7 @@ typedef struct descr->member[idx] = (value) /* Get and set the global scope generation counter in struct pthread. */ +# define THREAD_GSCOPE_IN_TCB 1 # define THREAD_GSCOPE_FLAG_UNUSED 0 # define THREAD_GSCOPE_FLAG_USED 1 # define THREAD_GSCOPE_FLAG_WAIT 2 diff --git a/sysdeps/aarch64/rawmemchr.S b/sysdeps/aarch64/rawmemchr.S new file mode 100644 index 0000000000..de3c05f337 --- /dev/null +++ b/sysdeps/aarch64/rawmemchr.S @@ -0,0 +1,42 @@ +/* rawmemchr - find a character in a memory zone + + Copyright (C) 2015-2018 Free Software Foundation, Inc. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +/* Special case rawmemchr (s, 0) as strlen, otherwise tailcall memchr. + Call strlen without setting up a full frame - it preserves x14/x15. +*/ + +ENTRY (__rawmemchr) + cbz w1, L(do_strlen) + mov x2, -1 + b __memchr + +L(do_strlen): + mov x15, x30 + cfi_return_column (x15) + mov x14, x0 + bl __strlen + add x0, x14, x0 + ret x15 + +END (__rawmemchr) +weak_alias (__rawmemchr, rawmemchr) +libc_hidden_builtin_def (__rawmemchr) diff --git a/sysdeps/aarch64/setjmp.S b/sysdeps/aarch64/setjmp.S index 22f4368ee6..d8838e7b54 100644 --- a/sysdeps/aarch64/setjmp.S +++ b/sysdeps/aarch64/setjmp.S @@ -1,4 +1,4 @@ -/* Copyright (C) 1997-2016 Free Software Foundation, Inc. +/* Copyright (C) 1997-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -33,6 +33,7 @@ END (_setjmp) libc_hidden_def (_setjmp) ENTRY (__sigsetjmp) + DELOUSE (0) 1: stp x19, x20, [x0, #JB_X19<<3] @@ -42,7 +43,7 @@ ENTRY (__sigsetjmp) stp x27, x28, [x0, #JB_X27<<3] #ifdef PTR_MANGLE - PTR_MANGLE (x4, x30, x3, x2) + PTR_MANGLE (4, 30, 3, 2) stp x29, x4, [x0, #JB_X29<<3] #else stp x29, x30, [x0, #JB_X29<<3] @@ -57,7 +58,7 @@ ENTRY (__sigsetjmp) stp d14, d15, [x0, #JB_D14<<3] #ifdef PTR_MANGLE mov x4, sp - PTR_MANGLE (x5, x4, x3, x2) + PTR_MANGLE (5, 4, 3, 2) str x5, [x0, #JB_SP<<3] #else mov x2, sp diff --git a/sysdeps/aarch64/soft-fp/sfp-machine.h b/sysdeps/aarch64/sfp-machine.h index 3e969952fa..3e969952fa 100644 --- a/sysdeps/aarch64/soft-fp/sfp-machine.h +++ b/sysdeps/aarch64/sfp-machine.h diff --git a/sysdeps/aarch64/soft-fp/Makefile b/sysdeps/aarch64/soft-fp/Makefile deleted file mode 100644 index ada13e8b70..0000000000 --- a/sysdeps/aarch64/soft-fp/Makefile +++ /dev/null @@ -1,3 +0,0 @@ -ifeq ($(subdir),math) -CPPFLAGS += -I../soft-fp -endif diff --git a/sysdeps/aarch64/sotruss-lib.c b/sysdeps/aarch64/sotruss-lib.c index 65174402b8..5dafacc21f 100644 --- a/sysdeps/aarch64/sotruss-lib.c +++ b/sysdeps/aarch64/sotruss-lib.c @@ -1,5 +1,5 @@ /* Override generic sotruss-lib.c to define actual functions for AArch64. - Copyright (C) 2012-2016 Free Software Foundation, Inc. + Copyright (C) 2012-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/stackinfo.h b/sysdeps/aarch64/stackinfo.h index 78f3262033..7e666fbe18 100644 --- a/sysdeps/aarch64/stackinfo.h +++ b/sysdeps/aarch64/stackinfo.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2001-2016 Free Software Foundation, Inc. +/* Copyright (C) 2001-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/start.S b/sysdeps/aarch64/start.S index efe24749f7..bad000f555 100644 --- a/sysdeps/aarch64/start.S +++ b/sysdeps/aarch64/start.S @@ -1,4 +1,4 @@ -/* Copyright (C) 1995-2016 Free Software Foundation, Inc. +/* Copyright (C) 1995-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,6 +16,8 @@ License along with the GNU C Library. If not, see <http://www.gnu.org/licenses/>. */ +#include <sysdep.h> + /* This is the canonical entry point, usually the first thing in the text segment. @@ -25,7 +27,7 @@ At this entry point, most registers' values are unspecified, except: - x0 Contains a function pointer to be registered with `atexit'. + x0/w0 Contains a function pointer to be registered with `atexit'. 
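In C terms, the __rawmemchr entry added above behaves like the sketch below (a plausible reference model, not glibc's code): searching for NUL reduces to strlen — the L(do_strlen) path — while any other character tail-calls memchr with an effectively unbounded length, mirroring the mov x2, -1:

    #include <string.h>

    /* Reference model for the assembly above.  */
    static void *
    rawmemchr_ref (const void *s, int c)
    {
      if ((unsigned char) c == '\0')
        return (char *) s + strlen ((const char *) s);  /* cbz w1, L(do_strlen) */
      return memchr (s, c, (size_t) -1);                /* mov x2, -1; b __memchr */
    }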
This is how the dynamic linker arranges to have DT_FINI functions called for shared libraries that have been loaded before this code runs. @@ -52,26 +54,35 @@ _start: mov x5, x0 /* Load argc and a pointer to argv */ - ldr x1, [sp, #0] - add x2, sp, #8 + ldr PTR_REG (1), [sp, #0] + add x2, sp, #PTR_SIZE /* Setup stack limit in argument register */ mov x6, sp -#ifdef SHARED +#ifdef PIC +# ifdef SHARED adrp x0, :got:main - ldr x0, [x0, #:got_lo12:main] + ldr PTR_REG (0), [x0, #:got_lo12:main] adrp x3, :got:__libc_csu_init - ldr x3, [x3, #:got_lo12:__libc_csu_init] + ldr PTR_REG (3), [x3, #:got_lo12:__libc_csu_init] adrp x4, :got:__libc_csu_fini - ldr x4, [x4, #:got_lo12:__libc_csu_fini] + ldr PTR_REG (4), [x4, #:got_lo12:__libc_csu_fini] +# else + adrp x0, __wrap_main + add x0, x0, :lo12:__wrap_main + adrp x3, __libc_csu_init + add x3, x3, :lo12:__libc_csu_init + adrp x4, __libc_csu_fini + add x4, x4, :lo12:__libc_csu_fini +# endif #else /* Set up the other arguments in registers */ - ldr x0, =main - ldr x3, =__libc_csu_init - ldr x4, =__libc_csu_fini + MOVL (0, main) + MOVL (3, __libc_csu_init) + MOVL (4, __libc_csu_fini) #endif /* __libc_start_main (main, argc, argv, init, fini, rtld_fini, @@ -83,6 +94,15 @@ _start: /* should never get here....*/ bl abort +#if defined PIC && !defined SHARED + /* When main is not defined in the executable but in a shared library + then a wrapper is needed in crt1.o of the static-pie enabled libc, + because crt1.o and rcrt1.o share code and the latter must avoid the + use of GOT relocations before __libc_start_main is called. */ +__wrap_main: + b main +#endif + /* Define a symbol for the first piece of initialized data. */ .data .globl __data_start diff --git a/sysdeps/aarch64/stpcpy.S b/sysdeps/aarch64/stpcpy.S index cbbf26c499..dcf1356bda 100644 --- a/sysdeps/aarch64/stpcpy.S +++ b/sysdeps/aarch64/stpcpy.S @@ -1,5 +1,5 @@ /* stpcpy - copy a string returning pointer to end. - Copyright (C) 2015-2016 Free Software Foundation, Inc. + Copyright (C) 2015-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/strchr.S b/sysdeps/aarch64/strchr.S index 5e3aecf4c2..c27465220b 100644 --- a/sysdeps/aarch64/strchr.S +++ b/sysdeps/aarch64/strchr.S @@ -1,6 +1,6 @@ /* strchr - find a character in a string - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -62,6 +62,7 @@ /* Locals and temporaries. */ ENTRY (strchr) + DELOUSE (0) mov wtmp2, #0x0401 movk wtmp2, #0x4010, lsl #16 dup vrepchr.16b, chrin diff --git a/sysdeps/aarch64/strchrnul.S b/sysdeps/aarch64/strchrnul.S index a624c8d2ef..e13ace5b7e 100644 --- a/sysdeps/aarch64/strchrnul.S +++ b/sysdeps/aarch64/strchrnul.S @@ -1,6 +1,6 @@ /* strchrnul - find a character or nul in a string - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -60,6 +60,7 @@ identify exactly which byte is causing the termination. */ ENTRY (__strchrnul) + DELOUSE (0) /* Magic constant 0x40100401 to allow us to identify which lane matches the termination condition. */ mov wtmp2, #0x0401 diff --git a/sysdeps/aarch64/strcmp.S b/sysdeps/aarch64/strcmp.S index ba0ccb409b..267aa4b551 100644 --- a/sysdeps/aarch64/strcmp.S +++ b/sysdeps/aarch64/strcmp.S @@ -1,4 +1,4 @@ -/* Copyright (C) 2012-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2012-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -49,6 +49,8 @@ /* Start of performance-critical section -- one 64B cache line. */ ENTRY_ALIGN(strcmp, 6) + DELOUSE (0) + DELOUSE (1) eor tmp1, src1, src2 mov zeroones, #REP8_01 tst tmp1, #7 @@ -70,6 +72,7 @@ L(start_realigned): cbz syndrome, L(loop_aligned) /* End of performance-critical section -- one 64B cache line. */ +L(end): #ifndef __AARCH64EB__ rev syndrome, syndrome rev data1, data1 @@ -143,12 +146,38 @@ L(mutual_align): b L(start_realigned) L(misaligned8): - /* We can do better than this. */ + /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always + checking to make sure that we don't access beyond a page boundary in + SRC2. */ + tst src1, #7 + b.eq L(loop_misaligned) +L(do_misaligned): ldrb data1w, [src1], #1 ldrb data2w, [src2], #1 cmp data1w, #1 ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ - b.eq L(misaligned8) + b.ne L(done) + tst src1, #7 + b.ne L(do_misaligned) + +L(loop_misaligned): + /* Test if we are within the last dword of the end of a 4K page. If + yes then jump back to the misaligned loop to compare a byte at a time. */ + and tmp1, src2, #0xff8 + eor tmp1, tmp1, #0xff8 + cbz tmp1, L(do_misaligned) + ldr data1, [src1], #8 + ldr data2, [src2], #8 + + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + eor diff, data1, data2 /* Non-zero if differences found. */ + bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ + orr syndrome, diff, has_nul + cbz syndrome, L(loop_misaligned) + b L(end) + +L(done): sub result, data1, data2 RET END(strcmp) diff --git a/sysdeps/aarch64/strcpy.S b/sysdeps/aarch64/strcpy.S index 0694199372..0449fcb04e 100644 --- a/sysdeps/aarch64/strcpy.S +++ b/sysdeps/aarch64/strcpy.S @@ -1,5 +1,5 @@ /* strcpy/stpcpy - copy a string returning pointer to start/end. - Copyright (C) 2013-2016 Free Software Foundation, Inc. + Copyright (C) 2013-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -91,6 +91,8 @@ #define MIN_PAGE_SIZE (1 << MIN_PAGE_P2) ENTRY_ALIGN (STRCPY, 6) + DELOUSE (0) + DELOUSE (1) /* For moderately short strings, the fastest way to do the copy is to calculate the length of the string in the same way as strlen, then essentially do a memcpy of the result. This avoids the need for diff --git a/sysdeps/aarch64/string_private.h b/sysdeps/aarch64/string_private.h index 8b194a41a6..ba6297417a 100644 --- a/sysdeps/aarch64/string_private.h +++ b/sysdeps/aarch64/string_private.h @@ -1,5 +1,5 @@ /* Define _STRING_ARCH_unaligned. AArch64 version. - Copyright (C) 2016 Free Software Foundation, Inc. + Copyright (C) 2016-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S index 9b4d1da60c..32292e3b6e 100644 --- a/sysdeps/aarch64/strlen.S +++ b/sysdeps/aarch64/strlen.S @@ -1,4 +1,4 @@ -/* Copyright (C) 2012-2016 Free Software Foundation, Inc. +/* Copyright (C) 2012-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -84,7 +84,9 @@ whether the first fetch, which may be misaligned, crosses a page boundary.
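The page test in strcmp's new L(loop_misaligned) above — and tmp1, src2, #0xff8 followed by an eor with #0xff8 — asks whether the next 8-byte load from src2 might run off the current 4 KiB page. A C rendering of the same predicate, as a sketch:

    #include <stdbool.h>
    #include <stdint.h>

    /* True when P sits in the last dword of a 4 KiB page, i.e. when an
       8-byte load at P could touch the following page.  Equivalent to
       the and/eor/cbz sequence in the assembly above.  */
    static bool
    near_page_end (const void *p)
    {
      return ((uintptr_t) p & 0xff8) == 0xff8;
    }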
*/ -ENTRY_ALIGN (strlen, 6) +ENTRY_ALIGN (__strlen, 6) + DELOUSE (0) + DELOUSE (1) and tmp1, srcin, MIN_PAGE_SIZE - 1 mov zeroones, REP8_01 cmp tmp1, MIN_PAGE_SIZE - 16 @@ -213,5 +215,6 @@ L(page_cross): csel data1, data1, tmp4, eq csel data2, data2, tmp2, eq b L(page_cross_entry) -END (strlen) +END (__strlen) +weak_alias (__strlen, strlen) libc_hidden_builtin_def (strlen) diff --git a/sysdeps/aarch64/strncmp.S b/sysdeps/aarch64/strncmp.S index f6a17fdba2..759c752fc2 100644 --- a/sysdeps/aarch64/strncmp.S +++ b/sysdeps/aarch64/strncmp.S @@ -1,4 +1,4 @@ -/* Copyright (C) 2013-2016 Free Software Foundation, Inc. +/* Copyright (C) 2013-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -49,15 +49,19 @@ #define limit_wd x13 #define mask x14 #define endloop x15 +#define count mask ENTRY_ALIGN_AND_PAD (strncmp, 6, 7) + DELOUSE (0) + DELOUSE (1) + DELOUSE (2) cbz limit, L(ret0) eor tmp1, src1, src2 mov zeroones, #REP8_01 tst tmp1, #7 + and count, src1, #7 b.ne L(misaligned8) - ands tmp1, src1, #7 - b.ne L(mutual_align) + cbnz count, L(mutual_align) /* Calculate the number of full and partial words -1. */ sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */ @@ -162,43 +166,107 @@ L(mutual_align): bic src1, src1, #7 bic src2, src2, #7 ldr data1, [src1], #8 - neg tmp3, tmp1, lsl #3 /* 64 - bits(bytes beyond align). */ + neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */ ldr data2, [src2], #8 mov tmp2, #~0 sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ #ifdef __AARCH64EB__ /* Big-endian. Early bytes are at MSB. */ - lsl tmp2, tmp2, tmp3 /* Shift (tmp1 & 63). */ + lsl tmp2, tmp2, tmp3 /* Shift (count & 63). */ #else /* Little-endian. Early bytes are at LSB. */ - lsr tmp2, tmp2, tmp3 /* Shift (tmp1 & 63). */ + lsr tmp2, tmp2, tmp3 /* Shift (count & 63). */ #endif and tmp3, limit_wd, #7 lsr limit_wd, limit_wd, #3 /* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */ - add limit, limit, tmp1 - add tmp3, tmp3, tmp1 + add limit, limit, count + add tmp3, tmp3, count orr data1, data1, tmp2 orr data2, data2, tmp2 add limit_wd, limit_wd, tmp3, lsr #3 b L(start_realigned) -L(ret0): - mov result, #0 - RET - .p2align 6 + /* Don't bother with dwords for up to 16 bytes. */ L(misaligned8): - sub limit, limit, #1 -1: + cmp limit, #16 + b.hs L(try_misaligned_words) + +L(byte_loop): /* Perhaps we can do better than this. */ ldrb data1w, [src1], #1 ldrb data2w, [src2], #1 subs limit, limit, #1 - ccmp data1w, #1, #0, cs /* NZCV = 0b0000. */ + ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */ ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ - b.eq 1b + b.eq L(byte_loop) +L(done): sub result, data1, data2 RET + + /* Align the SRC1 to a dword by doing a bytewise compare and then do + the dword loop. */ +L(try_misaligned_words): + lsr limit_wd, limit, #3 + cbz count, L(do_misaligned) + + neg count, count + and count, count, #7 + sub limit, limit, count + lsr limit_wd, limit, #3 + +L(page_end_loop): + ldrb data1w, [src1], #1 + ldrb data2w, [src2], #1 + cmp data1w, #1 + ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ + b.ne L(done) + subs count, count, #1 + b.hi L(page_end_loop) + +L(do_misaligned): + /* Prepare ourselves for the next page crossing. Unlike the aligned + loop, we fetch 1 less dword because we risk crossing bounds on + SRC2. 
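The zeroones / REP8_7f arithmetic that strcmp and strncmp use above — sub tmp1, data1, zeroones; orr tmp2, data1, #REP8_7f; bic(s) has_nul, tmp1, tmp2 — is the classic SWAR test for a zero byte inside a 64-bit word. Sketched in C:

    #include <stdint.h>

    #define REP8_01 0x0101010101010101ULL
    #define REP8_7f 0x7f7f7f7f7f7f7f7fULL

    /* Non-zero iff some byte of V is 0x00: the subtraction borrows
       through zero bytes, and ~(v | REP8_7f) keeps only the high bit of
       each byte whose own high bit was clear, filtering false hits.  */
    static uint64_t
    has_nul_byte (uint64_t v)
    {
      return (v - REP8_01) & ~(v | REP8_7f);
    }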
*/ + mov count, #8 + subs limit_wd, limit_wd, #1 + b.lo L(done_loop) +L(loop_misaligned): + and tmp2, src2, #0xff8 + eor tmp2, tmp2, #0xff8 + cbz tmp2, L(page_end_loop) + + ldr data1, [src1], #8 + ldr data2, [src2], #8 + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + eor diff, data1, data2 /* Non-zero if differences found. */ + bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ + ccmp diff, #0, #0, eq + b.ne L(not_limit) + subs limit_wd, limit_wd, #1 + b.pl L(loop_misaligned) + +L(done_loop): + /* We found a difference or a NUL before the limit was reached. */ + and limit, limit, #7 + cbz limit, L(not_limit) + /* Read the last word. */ + sub src1, src1, 8 + sub src2, src2, 8 + ldr data1, [src1, limit] + ldr data2, [src2, limit] + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + eor diff, data1, data2 /* Non-zero if differences found. */ + bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ + ccmp diff, #0, #0, eq + b.ne L(not_limit) + +L(ret0): + mov result, #0 + RET + END (strncmp) libc_hidden_builtin_def (strncmp) diff --git a/sysdeps/aarch64/strnlen.S b/sysdeps/aarch64/strnlen.S index 4cce45f905..0f190e83e2 100644 --- a/sysdeps/aarch64/strnlen.S +++ b/sysdeps/aarch64/strnlen.S @@ -1,6 +1,6 @@ /* strnlen - calculate the length of a string with limit. - Copyright (C) 2013-2016 Free Software Foundation, Inc. + Copyright (C) 2013-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -50,6 +50,9 @@ #define REP8_80 0x8080808080808080 ENTRY_ALIGN_AND_PAD (__strnlen, 6, 9) + DELOUSE (0) + DELOUSE (1) + DELOUSE (2) cbz limit, L(hit_limit) mov zeroones, #REP8_01 bic src, srcin, #15 diff --git a/sysdeps/aarch64/strrchr.S b/sysdeps/aarch64/strrchr.S index 44c1917a20..aa334ede53 100644 --- a/sysdeps/aarch64/strrchr.S +++ b/sysdeps/aarch64/strrchr.S @@ -1,6 +1,6 @@ /* strrchr: find the last instance of a character in a string. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -68,6 +68,7 @@ identify exactly which byte is causing the termination, and why. */ ENTRY(strrchr) + DELOUSE (0) cbz x1, L(null_search) /* Magic constant 0x40100401 to allow us to identify which lane matches the requested byte. Magic constant 0x80200802 used diff --git a/sysdeps/aarch64/sysdep.h b/sysdeps/aarch64/sysdep.h index 6b728ec651..5b30709436 100644 --- a/sysdeps/aarch64/sysdep.h +++ b/sysdeps/aarch64/sysdep.h @@ -1,4 +1,4 @@ -/* Copyright (C) 1997-2016 Free Software Foundation, Inc. +/* Copyright (C) 1997-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,8 +16,25 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ +#ifndef _AARCH64_SYSDEP_H +#define _AARCH64_SYSDEP_H + #include <sysdeps/generic/sysdep.h> +#ifdef __LP64__ +# define AARCH64_R(NAME) R_AARCH64_ ## NAME +# define PTR_REG(n) x##n +# define PTR_LOG_SIZE 3 +# define DELOUSE(n) +#else +# define AARCH64_R(NAME) R_AARCH64_P32_ ## NAME +# define PTR_REG(n) w##n +# define PTR_LOG_SIZE 2 +# define DELOUSE(n) mov w##n, w##n +#endif + +#define PTR_SIZE (1<<PTR_LOG_SIZE) + #ifdef __ASSEMBLER__ /* Syntactic details of assembler. */ @@ -66,9 +83,38 @@ /* If compiled for profiling, call `mcount' at the start of each function.
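DELOUSE, introduced in the sysdep.h hunk above, is empty on LP64 and expands to mov wN, wN on ILP32, exploiting the fact that a write to a W register zero-extends into the whole X register. A loose C analogue of the intent (illustration only, not the real mechanism):

    #include <stdint.h>

    /* Under ILP32 a 64-bit register holding a 32-bit pointer may carry
       junk in its upper half; keep only the low 32 bits before using it
       as an address, just as "mov wN, wN" zero-extends wN into xN.  */
    static uint64_t
    delouse (uint64_t reg)
    {
      return (uint32_t) reg;
    }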
*/ #ifdef PROF # define CALL_MCOUNT \ - str x30, [sp, #-16]!; \ + str x30, [sp, #-80]!; \ + cfi_adjust_cfa_offset (80); \ + cfi_rel_offset (x30, 0); \ + stp x0, x1, [sp, #16]; \ + cfi_rel_offset (x0, 16); \ + cfi_rel_offset (x1, 24); \ + stp x2, x3, [sp, #32]; \ + cfi_rel_offset (x2, 32); \ + cfi_rel_offset (x3, 40); \ + stp x4, x5, [sp, #48]; \ + cfi_rel_offset (x4, 48); \ + cfi_rel_offset (x5, 56); \ + stp x6, x7, [sp, #64]; \ + cfi_rel_offset (x6, 64); \ + cfi_rel_offset (x7, 72); \ + mov x0, x30; \ bl mcount; \ - ldr x30, [sp], #16 ; + ldp x0, x1, [sp, #16]; \ + cfi_restore (x0); \ + cfi_restore (x1); \ + ldp x2, x3, [sp, #32]; \ + cfi_restore (x2); \ + cfi_restore (x3); \ + ldp x4, x5, [sp, #48]; \ + cfi_restore (x4); \ + cfi_restore (x5); \ + ldp x6, x7, [sp, #64]; \ + cfi_restore (x6); \ + cfi_restore (x7); \ + ldr x30, [sp], #80; \ + cfi_adjust_cfa_offset (-80); \ + cfi_restore (x30); #else # define CALL_MCOUNT /* Do nothing. */ #endif @@ -78,16 +124,32 @@ # define L(name) .L##name #endif -/* Load or store to/from a pc-relative EXPR into/from R, using T. */ -#define LDST_PCREL(OP, R, T, EXPR) \ - adrp T, EXPR; \ - OP R, [T, #:lo12:EXPR];\ - -/* Load or store to/from a got-relative EXPR into/from R, using T. */ -#define LDST_GLOBAL(OP, R, T, EXPR) \ - adrp T, :got:EXPR; \ - ldr T, [T, #:got_lo12:EXPR];\ - OP R, [T]; +/* Load or store to/from a pc-relative EXPR into/from R, using T. + Note R and T are register numbers and not register names. */ +#define LDST_PCREL(OP, R, T, EXPR) \ + adrp x##T, EXPR; \ + OP PTR_REG (R), [x##T, #:lo12:EXPR]; \ + +/* Load or store to/from a got-relative EXPR into/from R, using T. + Note R and T are register numbers and not register names. */ +#define LDST_GLOBAL(OP, R, T, EXPR) \ + adrp x##T, :got:EXPR; \ + ldr PTR_REG (T), [x##T, #:got_lo12:EXPR]; \ + OP PTR_REG (R), [x##T]; + +/* Load an immediate into R. + Note R is a register number and not a register name. */ +#ifdef __LP64__ +# define MOVL(R, NAME) \ + movz PTR_REG (R), #:abs_g3:NAME; \ + movk PTR_REG (R), #:abs_g2_nc:NAME; \ + movk PTR_REG (R), #:abs_g1_nc:NAME; \ + movk PTR_REG (R), #:abs_g0_nc:NAME; +#else +# define MOVL(R, NAME) \ + movz PTR_REG (R), #:abs_g1:NAME; \ + movk PTR_REG (R), #:abs_g0_nc:NAME; +#endif /* Since C identifiers are not normally prefixed with an underscore on this system, the asm identifier `syscall_error' intrudes on the @@ -96,3 +158,5 @@ #define mcount _mcount #endif /* __ASSEMBLER__ */ + +#endif /* _AARCH64_SYSDEP_H */ diff --git a/sysdeps/aarch64/tls-macros.h b/sysdeps/aarch64/tls-macros.h index 2080a4dfa4..cc58a7d6bf 100644 --- a/sysdeps/aarch64/tls-macros.h +++ b/sysdeps/aarch64/tls-macros.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2016 Free Software Foundation, Inc. +/* Copyright (C) 2009-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/tlsdesc.c b/sysdeps/aarch64/tlsdesc.c index ac3a9f4591..357465f23d 100644 --- a/sysdeps/aarch64/tlsdesc.c +++ b/sysdeps/aarch64/tlsdesc.c @@ -1,6 +1,6 @@ /* Manage TLS descriptors. AArch64 version. - Copyright (C) 2011-2016 Free Software Foundation, Inc. + Copyright (C) 2011-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -18,143 +18,17 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. 
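The new MOVL macro above materialises a 64-bit absolute value 16 bits at a time: movz seeds one group and each following movk patches another, with the :abs_gN: / :abs_gN_nc: operators selecting the Nth 16-bit group of the symbol's address. An arithmetic sketch of the composition:

    #include <stdint.h>

    /* Compose a value the way movz/movk would under MOVL on LP64:
       g3..g0 are the four 16-bit groups of the target address.  */
    static uint64_t
    movl (uint16_t g3, uint16_t g2, uint16_t g1, uint16_t g0)
    {
      uint64_t v = (uint64_t) g3 << 48;  /* movz Rd, #:abs_g3:sym */
      v |= (uint64_t) g2 << 32;          /* movk Rd, #:abs_g2_nc:sym */
      v |= (uint64_t) g1 << 16;          /* movk Rd, #:abs_g1_nc:sym */
      v |= (uint64_t) g0;                /* movk Rd, #:abs_g0_nc:sym */
      return v;
    }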
*/ -#include <link.h> #include <ldsodefs.h> -#include <elf/dynamic-link.h> #include <tls.h> #include <dl-tlsdesc.h> #include <dl-unmap-segments.h> +#define _dl_tlsdesc_resolve_hold 0 #include <tlsdeschtab.h> -#include <atomic.h> - -/* The following functions take an entry_check_offset argument. It's - computed by the caller as an offset between its entry point and the - call site, such that by adding the built-in return address that is - implicitly passed to the function with this offset, we can easily - obtain the caller's entry point to compare with the entry point - given in the TLS descriptor. If it's changed, we want to return - immediately. */ - -/* This function is used to lazily resolve TLS_DESC RELA relocations. - The argument location is used to hold a pointer to the relocation. */ - -void -attribute_hidden -_dl_tlsdesc_resolve_rela_fixup (struct tlsdesc *td, struct link_map *l) -{ - const ElfW(Rela) *reloc = atomic_load_relaxed (&td->arg); - - /* After GL(dl_load_lock) is grabbed only one caller can see td->entry in - initial state in _dl_tlsdesc_resolve_early_return_p, other concurrent - callers will return and retry calling td->entry. The updated td->entry - synchronizes with the single writer so all read accesses here can use - relaxed order. */ - if (_dl_tlsdesc_resolve_early_return_p - (td, (void*)(D_PTR (l, l_info[ADDRIDX (DT_TLSDESC_PLT)]) + l->l_addr))) - return; - - /* The code below was borrowed from _dl_fixup(), - except for checking for STB_LOCAL. */ - const ElfW(Sym) *const symtab - = (const void *) D_PTR (l, l_info[DT_SYMTAB]); - const char *strtab = (const void *) D_PTR (l, l_info[DT_STRTAB]); - const ElfW(Sym) *sym = &symtab[ELFW(R_SYM) (reloc->r_info)]; - lookup_t result; - - /* Look up the target symbol. If the normal lookup rules are not - used don't look in the global scope. */ - if (ELFW(ST_BIND) (sym->st_info) != STB_LOCAL - && __builtin_expect (ELFW(ST_VISIBILITY) (sym->st_other), 0) == 0) - { - const struct r_found_version *version = NULL; - - if (l->l_info[VERSYMIDX (DT_VERSYM)] != NULL) - { - const ElfW(Half) *vernum = - (const void *) D_PTR (l, l_info[VERSYMIDX (DT_VERSYM)]); - ElfW(Half) ndx = vernum[ELFW(R_SYM) (reloc->r_info)] & 0x7fff; - version = &l->l_versions[ndx]; - if (version->hash == 0) - version = NULL; - } - - result = _dl_lookup_symbol_x (strtab + sym->st_name, l, &sym, - l->l_scope, version, ELF_RTYPE_CLASS_PLT, - DL_LOOKUP_ADD_DEPENDENCY, NULL); - } - else - { - /* We already found the symbol. The module (and therefore its load - address) is also known. */ - result = l; - } - - if (!sym) - { - atomic_store_relaxed (&td->arg, (void *) reloc->r_addend); - /* This release store synchronizes with the ldar acquire load - instruction in _dl_tlsdesc_undefweak. */ - atomic_store_release (&td->entry, _dl_tlsdesc_undefweak); - } - else - { -# ifndef SHARED - CHECK_STATIC_TLS (l, result); -# else - if (!TRY_STATIC_TLS (l, result)) - { - void *p = _dl_make_tlsdesc_dynamic (result, sym->st_value - + reloc->r_addend); - atomic_store_relaxed (&td->arg, p); - /* This release store synchronizes with the ldar acquire load - instruction in _dl_tlsdesc_dynamic. */ - atomic_store_release (&td->entry, _dl_tlsdesc_dynamic); - } - else -# endif - { - void *p = (void*) (sym->st_value + result->l_tls_offset - + reloc->r_addend); - atomic_store_relaxed (&td->arg, p); - /* This release store synchronizes with the ldar acquire load - instruction in _dl_tlsdesc_return_lazy. 
*/ - atomic_store_release (&td->entry, _dl_tlsdesc_return_lazy); - } - } - - _dl_tlsdesc_wake_up_held_fixups (); -} - -/* This function is used to avoid busy waiting for other threads to - complete the lazy relocation. Once another thread wins the race to - relocate a TLS descriptor, it sets the descriptor up such that this - function is called to wait until the resolver releases the - lock. */ - -void -attribute_hidden -_dl_tlsdesc_resolve_hold_fixup (struct tlsdesc *td, void *caller) -{ - /* Maybe we're lucky and can return early. */ - if (caller != atomic_load_relaxed (&td->entry)) - return; - - /* Locking here will stop execution until the running resolver runs - _dl_tlsdesc_wake_up_held_fixups(), releasing the lock. - - FIXME: We'd be better off waiting on a condition variable, such - that we didn't have to hold the lock throughout the relocation - processing. */ - __rtld_lock_lock_recursive (GL(dl_load_lock)); - __rtld_lock_unlock_recursive (GL(dl_load_lock)); -} - /* Unmap the dynamic object, but also release its TLS descriptor table if there is one. */ void -internal_function _dl_unmap (struct link_map *map) { _dl_unmap_segments (map); diff --git a/sysdeps/aarch64/tlsdesc.sym b/sysdeps/aarch64/tlsdesc.sym index 63766af612..a06a719779 100644 --- a/sysdeps/aarch64/tlsdesc.sym +++ b/sysdeps/aarch64/tlsdesc.sym @@ -13,3 +13,6 @@ TLSDESC_ARG offsetof(struct tlsdesc, arg) TLSDESC_GEN_COUNT offsetof(struct tlsdesc_dynamic_arg, gen_count) TLSDESC_MODID offsetof(struct tlsdesc_dynamic_arg, tlsinfo.ti_module) TLSDESC_MODOFF offsetof(struct tlsdesc_dynamic_arg, tlsinfo.ti_offset) +TCBHEAD_DTV offsetof(tcbhead_t, dtv) +DTV_COUNTER offsetof(dtv_t, counter) +TLS_DTV_UNALLOCATED TLS_DTV_UNALLOCATED diff --git a/sysdeps/aarch64/tst-audit.h b/sysdeps/aarch64/tst-audit.h index 8ed01484e3..fd4f38f613 100644 --- a/sysdeps/aarch64/tst-audit.h +++ b/sysdeps/aarch64/tst-audit.h @@ -1,6 +1,6 @@ /* Definitions for testing PLT entry/exit auditing. AArch64 version. - Copyright (C) 2005-2016 Free Software Foundation, Inc. + Copyright (C) 2005-2018 Free Software Foundation, Inc. This file is part of the GNU C Library.