author    Samuel Thibault <samuel.thibault@ens-lyon.org>  2018-12-27 19:16:25 +0000
committer Samuel Thibault <samuel.thibault@ens-lyon.org>  2018-12-27 19:16:25 +0000
commit    8d59503b977070aaa4e504e8d6dcb7da3711893e (patch)
tree      8272c9c2cce43afa4fe4d8d92c269a6435242661 /sysdeps/aarch64
parent    76a7dc16fab8853ef9230447fa98c70a3619dc6d (diff)
parent    bcea9593527d90b9f9ff3817e3fbf0fbc3d01fa7 (diff)
Merge commit 'refs/top-bases/t/gsync-libc-merge' into t/gsync-libc-merge
Diffstat (limited to 'sysdeps/aarch64')
-rw-r--r--  sysdeps/aarch64/Implies | 1
-rw-r--r--  sysdeps/aarch64/Makefile | 12
-rw-r--r--  sysdeps/aarch64/__longjmp.S | 8
-rw-r--r--  sysdeps/aarch64/atomic-machine.h | 7
-rw-r--r--  sysdeps/aarch64/backtrace.c | 1
-rw-r--r--  sysdeps/aarch64/bits/endian.h | 2
-rw-r--r--  sysdeps/aarch64/bits/fenv.h | 10
-rw-r--r--  sysdeps/aarch64/bits/fp-fast.h (renamed from sysdeps/aarch64/fpu/s_frint.c) | 41
-rw-r--r--  sysdeps/aarch64/bits/link.h | 2
-rw-r--r--  sysdeps/aarch64/bits/setjmp.h | 2
-rw-r--r--  sysdeps/aarch64/bits/wordsize.h (renamed from sysdeps/aarch64/bits/string.h) | 16
-rw-r--r--  sysdeps/aarch64/crti.S | 7
-rw-r--r--  sysdeps/aarch64/crtn.S | 2
-rw-r--r--  sysdeps/aarch64/dl-irel.h | 7
-rw-r--r--  sysdeps/aarch64/dl-machine.h | 255
-rw-r--r--  sysdeps/aarch64/dl-sysdep.h | 2
-rw-r--r--  sysdeps/aarch64/dl-tls.h | 5
-rw-r--r--  sysdeps/aarch64/dl-tlsdesc.S | 265
-rw-r--r--  sysdeps/aarch64/dl-tlsdesc.h | 14
-rw-r--r--  sysdeps/aarch64/dl-trampoline.S | 20
-rw-r--r--  sysdeps/aarch64/dl-tunables.list | 25
-rw-r--r--  sysdeps/aarch64/e_sqrtl.c (renamed from sysdeps/aarch64/soft-fp/e_sqrtl.c) | 2
-rw-r--r--  sysdeps/aarch64/fpu/Makefile | 14
-rw-r--r--  sysdeps/aarch64/fpu/e_sqrt.c | 6
-rw-r--r--  sysdeps/aarch64/fpu/e_sqrtf.c | 6
-rw-r--r--  sysdeps/aarch64/fpu/fclrexcpt.c | 2
-rw-r--r--  sysdeps/aarch64/fpu/fedisblxcpt.c | 2
-rw-r--r--  sysdeps/aarch64/fpu/feenablxcpt.c | 2
-rw-r--r--  sysdeps/aarch64/fpu/fegetenv.c | 2
-rw-r--r--  sysdeps/aarch64/fpu/fegetexcept.c | 2
-rw-r--r--  sysdeps/aarch64/fpu/fegetmode.c (renamed from sysdeps/aarch64/fpu/s_frintf.c) | 25
-rw-r--r--  sysdeps/aarch64/fpu/fegetround.c | 2
-rw-r--r--  sysdeps/aarch64/fpu/feholdexcpt.c | 2
-rw-r--r--  sysdeps/aarch64/fpu/fesetenv.c | 2
-rw-r--r--  sysdeps/aarch64/fpu/fesetexcept.c | 34
-rw-r--r--  sysdeps/aarch64/fpu/fesetmode.c | 34
-rw-r--r--  sysdeps/aarch64/fpu/fesetround.c | 2
-rw-r--r--  sysdeps/aarch64/fpu/feupdateenv.c | 2
-rw-r--r--  sysdeps/aarch64/fpu/fgetexcptflg.c | 2
-rw-r--r--  sysdeps/aarch64/fpu/fpu_control.h | 19
-rw-r--r--  sysdeps/aarch64/fpu/fraiseexcpt.c | 2
-rw-r--r--  sysdeps/aarch64/fpu/fsetexcptflg.c | 2
-rw-r--r--  sysdeps/aarch64/fpu/ftestexcept.c | 2
-rw-r--r--  sysdeps/aarch64/fpu/get-rounding-mode.h | 2
-rw-r--r--  sysdeps/aarch64/fpu/math-barriers.h (renamed from sysdeps/aarch64/bits/mathdef.h) | 30
-rw-r--r--  sysdeps/aarch64/fpu/math_private.h | 43
-rw-r--r--  sysdeps/aarch64/fpu/s_ceil.c | 15
-rw-r--r--  sysdeps/aarch64/fpu/s_ceilf.c | 15
-rw-r--r--  sysdeps/aarch64/fpu/s_floor.c | 15
-rw-r--r--  sysdeps/aarch64/fpu/s_floorf.c | 15
-rw-r--r--  sysdeps/aarch64/fpu/s_fma.c | 29
-rw-r--r--  sysdeps/aarch64/fpu/s_fmaf.c | 16
-rw-r--r--  sysdeps/aarch64/fpu/s_fmax.c | 15
-rw-r--r--  sysdeps/aarch64/fpu/s_fmaxf.c | 17
-rw-r--r--  sysdeps/aarch64/fpu/s_fmin.c | 33
-rw-r--r--  sysdeps/aarch64/fpu/s_fminf.c | 16
-rw-r--r--  sysdeps/aarch64/fpu/s_llrint.c | 23
-rw-r--r--  sysdeps/aarch64/fpu/s_llrintf.c | 26
-rw-r--r--  sysdeps/aarch64/fpu/s_llround.c | 15
-rw-r--r--  sysdeps/aarch64/fpu/s_llroundf.c | 17
-rw-r--r--  sysdeps/aarch64/fpu/s_lrint.c | 82
-rw-r--r--  sysdeps/aarch64/fpu/s_lrintf.c | 24
-rw-r--r--  sysdeps/aarch64/fpu/s_lround.c | 39
-rw-r--r--  sysdeps/aarch64/fpu/s_lroundf.c | 16
-rw-r--r--  sysdeps/aarch64/fpu/s_nearbyint.c | 15
-rw-r--r--  sysdeps/aarch64/fpu/s_nearbyintf.c | 15
-rw-r--r--  sysdeps/aarch64/fpu/s_rint.c | 15
-rw-r--r--  sysdeps/aarch64/fpu/s_rintf.c | 15
-rw-r--r--  sysdeps/aarch64/fpu/s_round.c | 15
-rw-r--r--  sysdeps/aarch64/fpu/s_roundf.c | 15
-rw-r--r--  sysdeps/aarch64/fpu/s_trunc.c | 15
-rw-r--r--  sysdeps/aarch64/fpu/s_truncf.c | 15
-rw-r--r--  sysdeps/aarch64/jmpbuf-offsets.h | 2
-rw-r--r--  sysdeps/aarch64/jmpbuf-unwind.h | 5
-rw-r--r--  sysdeps/aarch64/ldsodefs.h | 3
-rw-r--r--  sysdeps/aarch64/libc-tls.c | 2
-rw-r--r--  sysdeps/aarch64/libm-test-ulps | 272
-rw-r--r--  sysdeps/aarch64/libm-test-ulps-name | 1
-rw-r--r--  sysdeps/aarch64/linkmap.h | 2
-rw-r--r--  sysdeps/aarch64/machine-gmon.h | 2
-rw-r--r--  sysdeps/aarch64/math-tests.h | 2
-rw-r--r--  sysdeps/aarch64/mcount.c | 2
-rw-r--r--  sysdeps/aarch64/memchr.S | 157
-rw-r--r--  sysdeps/aarch64/memcmp.S | 221
-rw-r--r--  sysdeps/aarch64/memcpy.S | 379
-rw-r--r--  sysdeps/aarch64/memmove.S | 313
-rw-r--r--  sysdeps/aarch64/memset-reg.h | 30
-rw-r--r--  sysdeps/aarch64/memset.S | 367
-rw-r--r--  sysdeps/aarch64/memusage.h | 2
-rw-r--r--  sysdeps/aarch64/multiarch/Makefile | 4
-rw-r--r--  sysdeps/aarch64/multiarch/ifunc-impl-list.c | 57
-rw-r--r--  sysdeps/aarch64/multiarch/init-arch.h | 25
-rw-r--r--  sysdeps/aarch64/multiarch/memcpy.c | 47
-rw-r--r--  sysdeps/aarch64/multiarch/memcpy_falkor.S | 191
-rw-r--r--  sysdeps/aarch64/multiarch/memcpy_generic.S | 44
-rw-r--r--  sysdeps/aarch64/multiarch/memcpy_thunderx.S | 336
-rw-r--r--  sysdeps/aarch64/multiarch/memcpy_thunderx2.S | 27
-rw-r--r--  sysdeps/aarch64/multiarch/memmove.c | 44
-rw-r--r--  sysdeps/aarch64/multiarch/memmove_falkor.S | 225
-rw-r--r--  sysdeps/aarch64/multiarch/memset.c | 41
-rw-r--r--  sysdeps/aarch64/multiarch/memset_falkor.S | 53
-rw-r--r--  sysdeps/aarch64/multiarch/memset_generic.S | 27
-rw-r--r--  sysdeps/aarch64/multiarch/rtld-memset.S | 23
-rw-r--r--  sysdeps/aarch64/nptl/Makefile | 2
-rw-r--r--  sysdeps/aarch64/nptl/bits/pthreadtypes-arch.h | 71
-rw-r--r--  sysdeps/aarch64/nptl/bits/pthreadtypes.h | 174
-rw-r--r--  sysdeps/aarch64/nptl/bits/semaphore.h | 10
-rw-r--r--  sysdeps/aarch64/nptl/pthread-offsets.h | 5
-rw-r--r--  sysdeps/aarch64/nptl/pthread_spin_lock.c | 24
-rw-r--r--  sysdeps/aarch64/nptl/pthreaddef.h | 2
-rw-r--r--  sysdeps/aarch64/nptl/tcb-offsets.sym | 1
-rw-r--r--  sysdeps/aarch64/nptl/tls.h | 15
-rw-r--r--  sysdeps/aarch64/rawmemchr.S | 42
-rw-r--r--  sysdeps/aarch64/setjmp.S | 7
-rw-r--r--  sysdeps/aarch64/sfp-machine.h (renamed from sysdeps/aarch64/soft-fp/sfp-machine.h) | 0
-rw-r--r--  sysdeps/aarch64/soft-fp/Makefile | 3
-rw-r--r--  sysdeps/aarch64/sotruss-lib.c | 2
-rw-r--r--  sysdeps/aarch64/stackinfo.h | 2
-rw-r--r--  sysdeps/aarch64/start.S | 42
-rw-r--r--  sysdeps/aarch64/stpcpy.S | 2
-rw-r--r--  sysdeps/aarch64/strchr.S | 3
-rw-r--r--  sysdeps/aarch64/strchrnul.S | 3
-rw-r--r--  sysdeps/aarch64/strcmp.S | 35
-rw-r--r--  sysdeps/aarch64/strcpy.S | 4
-rw-r--r--  sysdeps/aarch64/string_private.h | 2
-rw-r--r--  sysdeps/aarch64/strlen.S | 9
-rw-r--r--  sysdeps/aarch64/strncmp.S | 100
-rw-r--r--  sysdeps/aarch64/strnlen.S | 5
-rw-r--r--  sysdeps/aarch64/strrchr.S | 3
-rw-r--r--  sysdeps/aarch64/sysdep.h | 90
-rw-r--r--  sysdeps/aarch64/tls-macros.h | 2
-rw-r--r--  sysdeps/aarch64/tlsdesc.c | 130
-rw-r--r--  sysdeps/aarch64/tlsdesc.sym | 3
-rw-r--r--  sysdeps/aarch64/tst-audit.h | 2
134 files changed, 3188 insertions, 2008 deletions
diff --git a/sysdeps/aarch64/Implies b/sysdeps/aarch64/Implies
index e5adf4d63c..a1d5e2e742 100644
--- a/sysdeps/aarch64/Implies
+++ b/sysdeps/aarch64/Implies
@@ -3,4 +3,3 @@ ieee754/ldbl-128
ieee754/dbl-64/wordsize-64
ieee754/dbl-64
ieee754/flt-32
-aarch64/soft-fp
diff --git a/sysdeps/aarch64/Makefile b/sysdeps/aarch64/Makefile
index 06323550e9..94baaf52dd 100644
--- a/sysdeps/aarch64/Makefile
+++ b/sysdeps/aarch64/Makefile
@@ -1,9 +1,5 @@
long-double-fcts = yes
-ifeq ($(subdir),debug)
-CFLAGS-backtrace.c += -funwind-tables
-endif
-
ifeq ($(subdir),elf)
sysdep-dl-routines += tlsdesc dl-tlsdesc
gen-as-const-headers += dl-link.sym
@@ -12,3 +8,11 @@ endif
ifeq ($(subdir),csu)
gen-as-const-headers += tlsdesc.sym
endif
+
+ifeq ($(subdir),gmon)
+CFLAGS-mcount.c += -mgeneral-regs-only
+endif
+
+ifeq ($(subdir),math)
+CPPFLAGS += -I../soft-fp
+endif
diff --git a/sysdeps/aarch64/__longjmp.S b/sysdeps/aarch64/__longjmp.S
index 65116be557..3365bd70f9 100644
--- a/sysdeps/aarch64/__longjmp.S
+++ b/sysdeps/aarch64/__longjmp.S
@@ -1,4 +1,4 @@
-/* Copyright (C) 1997-2016 Free Software Foundation, Inc.
+/* Copyright (C) 1997-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -46,6 +46,8 @@ ENTRY (__longjmp)
cfi_offset(d14, JB_D14<<3)
cfi_offset(d15, JB_D15<<3)
+ DELOUSE (0)
+
ldp x19, x20, [x0, #JB_X19<<3]
ldp x21, x22, [x0, #JB_X21<<3]
ldp x23, x24, [x0, #JB_X23<<3]
@@ -53,7 +55,7 @@ ENTRY (__longjmp)
ldp x27, x28, [x0, #JB_X27<<3]
#ifdef PTR_DEMANGLE
ldp x29, x4, [x0, #JB_X29<<3]
- PTR_DEMANGLE (x30, x4, x3, x2)
+ PTR_DEMANGLE (30, 4, 3, 2)
#else
ldp x29, x30, [x0, #JB_X29<<3]
#endif
@@ -98,7 +100,7 @@ ENTRY (__longjmp)
cfi_same_value(d15)
#ifdef PTR_DEMANGLE
ldr x4, [x0, #JB_SP<<3]
- PTR_DEMANGLE (x5, x4, x3, x2)
+ PTR_DEMANGLE (5, 4, 3, 2)
#else
ldr x5, [x0, #JB_SP<<3]
#endif
diff --git a/sysdeps/aarch64/atomic-machine.h b/sysdeps/aarch64/atomic-machine.h
index 28c80dc42d..63b24e625c 100644
--- a/sysdeps/aarch64/atomic-machine.h
+++ b/sysdeps/aarch64/atomic-machine.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2003-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2003-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -38,6 +38,7 @@ typedef uintmax_t uatomic_max_t;
#define __HAVE_64B_ATOMICS 1
#define USE_ATOMIC_COMPILER_BUILTINS 1
+#define ATOMIC_EXCHANGE_USES_CAS 0
/* Compare and exchange.
For all "bool" routines, we return FALSE if exchange succesful. */
@@ -115,10 +116,6 @@ typedef uintmax_t uatomic_max_t;
/* Compare and exchange with "release" semantics, ie barrier before. */
-# define atomic_compare_and_exchange_bool_rel(mem, new, old) \
- __atomic_bool_bysize (__arch_compare_and_exchange_bool, int, \
- mem, new, old, __ATOMIC_RELEASE)
-
# define atomic_compare_and_exchange_val_rel(mem, new, old) \
__atomic_val_bysize (__arch_compare_and_exchange_val, int, \
mem, new, old, __ATOMIC_RELEASE)
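
Note: the macros above sit directly on the GCC __atomic builtins (USE_ATOMIC_COMPILER_BUILTINS is 1). As a minimal standalone sketch of what the retained *_val_rel variant boils down to (names invented for the example; this is not glibc's internal macro machinery):

    #include <stdint.h>

    /* Release-ordered compare-and-exchange built on the same GCC
       __atomic builtins; returns the value observed at *mem, as the
       *_val_* variants do.  */
    static inline uint32_t
    cas_val_release (uint32_t *mem, uint32_t newval, uint32_t oldval)
    {
      uint32_t expected = oldval;
      __atomic_compare_exchange_n (mem, &expected, newval, 0 /* strong */,
                                   __ATOMIC_RELEASE, __ATOMIC_RELAXED);
      return expected;   /* equals OLDVAL iff the exchange succeeded */
    }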
diff --git a/sysdeps/aarch64/backtrace.c b/sysdeps/aarch64/backtrace.c
deleted file mode 100644
index 27ce597b39..0000000000
--- a/sysdeps/aarch64/backtrace.c
+++ /dev/null
@@ -1 +0,0 @@
-#include <sysdeps/x86_64/backtrace.c>
diff --git a/sysdeps/aarch64/bits/endian.h b/sysdeps/aarch64/bits/endian.h
index a32ebc0d80..1f18733422 100644
--- a/sysdeps/aarch64/bits/endian.h
+++ b/sysdeps/aarch64/bits/endian.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 1997-2016 Free Software Foundation, Inc.
+/* Copyright (C) 1997-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
diff --git a/sysdeps/aarch64/bits/fenv.h b/sysdeps/aarch64/bits/fenv.h
index aced0d33b2..8a0f481f79 100644
--- a/sysdeps/aarch64/bits/fenv.h
+++ b/sysdeps/aarch64/bits/fenv.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2004-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2004-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -72,3 +72,11 @@ fenv_t;
/* Floating-point environment where none of the exceptions are masked. */
# define FE_NOMASK_ENV ((const fenv_t *) -2)
#endif
+
+#if __GLIBC_USE (IEC_60559_BFP_EXT)
+/* Type representing floating-point control modes. */
+typedef unsigned int femode_t;
+
+/* Default floating-point control modes. */
+# define FE_DFL_MODE ((const femode_t *) -1L)
+#endif
diff --git a/sysdeps/aarch64/fpu/s_frint.c b/sysdeps/aarch64/bits/fp-fast.h
index 321c8ef50d..3ce12210b7 100644
--- a/sysdeps/aarch64/fpu/s_frint.c
+++ b/sysdeps/aarch64/bits/fp-fast.h
@@ -1,5 +1,5 @@
-/* Copyright (C) 1996-2016 Free Software Foundation, Inc.
-
+/* Define FP_FAST_* macros. AArch64 version.
+ Copyright (C) 2016-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -16,34 +16,19 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#include <math.h>
-
-#ifndef FUNC
-# error FUNC not defined
-#endif
-
-#ifndef TYPE
-# define TYPE double
-# define REGS "d"
-#else
-# ifndef REGS
-# error REGS not defined
-# endif
+#ifndef _MATH_H
+# error "Never use <bits/fp-fast.h> directly; include <math.h> instead."
#endif
-#ifndef INSN
-# error INSN not defined
-#endif
+#ifdef __USE_ISOC99
-#define __CONCATX(a,b) __CONCAT(a,b)
+/* The GCC 4.6 compiler will define __FP_FAST_FMA{,F,L} if the fma{,f,l}
+ builtins are supported. */
+# define FP_FAST_FMA 1
+# define FP_FAST_FMAF 1
-TYPE
-__CONCATX(__,FUNC) (TYPE x)
-{
- TYPE result;
- asm ( INSN "\t%" REGS "0, %" REGS "1" :
- "=w" (result) : "w" (x) );
- return result;
-}
+# ifdef __FP_FAST_FMAL
+# define FP_FAST_FMAL 1
+# endif
-weak_alias (__CONCATX(__,FUNC), FUNC)
+#endif
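
With the new <bits/fp-fast.h>, user code can test FP_FAST_FMA to decide whether fma () is worth calling; on AArch64 it always is, since the fused operation is a single fmadd instruction. A minimal usage sketch (hypothetical helper, assuming only <math.h>):

    #include <math.h>

    /* Accumulate one product, fusing when the platform says it is cheap.  */
    static double
    dot_step (double a, double b, double acc)
    {
    #ifdef FP_FAST_FMA
      return fma (a, b, acc);   /* single fmadd on AArch64 */
    #else
      return a * b + acc;       /* separate multiply and add */
    #endif
    }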
diff --git a/sysdeps/aarch64/bits/link.h b/sysdeps/aarch64/bits/link.h
index 0bf961519e..5a7fc1ccd4 100644
--- a/sysdeps/aarch64/bits/link.h
+++ b/sysdeps/aarch64/bits/link.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2005-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2005-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
diff --git a/sysdeps/aarch64/bits/setjmp.h b/sysdeps/aarch64/bits/setjmp.h
index fff8616c60..1685f1076a 100644
--- a/sysdeps/aarch64/bits/setjmp.h
+++ b/sysdeps/aarch64/bits/setjmp.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 1997-2016 Free Software Foundation, Inc.
+/* Copyright (C) 1997-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
diff --git a/sysdeps/aarch64/bits/string.h b/sysdeps/aarch64/bits/wordsize.h
index 0a508f72c0..ea7eecdf92 100644
--- a/sysdeps/aarch64/bits/string.h
+++ b/sysdeps/aarch64/bits/wordsize.h
@@ -1,5 +1,6 @@
-/* Optimized, inlined string functions. AArch64 version.
- Copyright (C) 2015-2016 Free Software Foundation, Inc.
+/* Determine the wordsize from the preprocessor defines.
+
+ Copyright (C) 2016-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -16,9 +17,12 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#ifndef _STRING_H
-# error "Never use <bits/string.h> directly; include <string.h> instead."
+#ifdef __LP64__
+# define __WORDSIZE 64
+#else
+# define __WORDSIZE 32
+# define __WORDSIZE32_SIZE_ULONG 1
+# define __WORDSIZE32_PTRDIFF_LONG 1
#endif
-/* AArch64 uses the aligned string inline ABI. */
-#define _STRING_INLINE_unaligned 0
+#define __WORDSIZE_TIME64_COMPAT32 0
diff --git a/sysdeps/aarch64/crti.S b/sysdeps/aarch64/crti.S
index 53ccb42587..2b213758b2 100644
--- a/sysdeps/aarch64/crti.S
+++ b/sysdeps/aarch64/crti.S
@@ -1,5 +1,5 @@
/* Special .init and .fini section support for AArch64.
- Copyright (C) 1995-2016 Free Software Foundation, Inc.
+ Copyright (C) 1995-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -39,6 +39,7 @@
they can be called as functions. The symbols _init and _fini are
magic and cause the linker to emit DT_INIT and DT_FINI. */
+#include <sysdep.h>
#include <libc-symbols.h>
#ifndef PREINIT_FUNCTION
@@ -60,7 +61,7 @@
.type call_weak_fn, %function
call_weak_fn:
adrp x0, :got:PREINIT_FUNCTION
- ldr x0, [x0, #:got_lo12:PREINIT_FUNCTION]
+ ldr PTR_REG (0), [x0, #:got_lo12:PREINIT_FUNCTION]
cbz x0, 1f
b PREINIT_FUNCTION
1:
@@ -71,6 +72,7 @@ call_weak_fn:
.section .init,"ax",%progbits
.align 2
.global _init
+ .hidden _init
.type _init, %function
_init:
stp x29, x30, [sp, -16]!
@@ -84,6 +86,7 @@ _init:
.section .fini,"ax",%progbits
.align 2
.global _fini
+ .hidden _fini
.type _fini, %function
_fini:
stp x29, x30, [sp, -16]!
diff --git a/sysdeps/aarch64/crtn.S b/sysdeps/aarch64/crtn.S
index 33d5f3d822..d72300af80 100644
--- a/sysdeps/aarch64/crtn.S
+++ b/sysdeps/aarch64/crtn.S
@@ -1,5 +1,5 @@
/* Special .init and .fini section support for AArch64.
- Copyright (C) 1995-2016 Free Software Foundation, Inc.
+ Copyright (C) 1995-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
diff --git a/sysdeps/aarch64/dl-irel.h b/sysdeps/aarch64/dl-irel.h
index 63a8e506eb..5889ee187b 100644
--- a/sysdeps/aarch64/dl-irel.h
+++ b/sysdeps/aarch64/dl-irel.h
@@ -1,6 +1,6 @@
/* Machine-dependent ELF indirect relocation inline functions.
AArch64 version.
- Copyright (C) 2012-2016 Free Software Foundation, Inc.
+ Copyright (C) 2012-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -23,6 +23,7 @@
#include <stdio.h>
#include <unistd.h>
#include <ldsodefs.h>
+#include <sysdep.h>
#define ELF_MACHINE_IRELA 1
@@ -30,7 +31,7 @@ static inline ElfW(Addr)
__attribute ((always_inline))
elf_ifunc_invoke (ElfW(Addr) addr)
{
- return ((ElfW(Addr) (*) (unsigned long int)) (addr)) (GLRO(dl_hwcap));
+ return ((ElfW(Addr) (*) (uint64_t)) (addr)) (GLRO(dl_hwcap));
}
static inline void
@@ -40,7 +41,7 @@ elf_irela (const ElfW(Rela) *reloc)
ElfW(Addr) *const reloc_addr = (void *) reloc->r_offset;
const unsigned long int r_type = ELFW(R_TYPE) (reloc->r_info);
- if (__glibc_likely (r_type == R_AARCH64_IRELATIVE))
+ if (__glibc_likely (r_type == AARCH64_R(IRELATIVE)))
{
ElfW(Addr) value = elf_ifunc_invoke (reloc->r_addend);
*reloc_addr = value;
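
elf_irela above is the dynamic-linker side of IRELATIVE handling: the resolver address stored in r_addend is invoked with GLRO(dl_hwcap), and its return value becomes the final relocation value. For context, a hypothetical user-level IFUNC whose resolver consumes that same hwcap argument might look like this (all names invented for illustration; HWCAP_ATOMICS is the Linux hwcap bit from <asm/hwcap.h>):

    #include <stdint.h>
    #include <asm/hwcap.h>   /* HWCAP_ATOMICS (Linux) */

    void impl_plain (void);
    void impl_lse (void);

    /* The AArch64 IFUNC ABI passes AT_HWCAP as the first argument.  */
    static void *
    my_func_resolver (uint64_t hwcap)
    {
      return (hwcap & HWCAP_ATOMICS) ? (void *) impl_lse
                                     : (void *) impl_plain;
    }

    void my_func (void) __attribute__ ((ifunc ("my_func_resolver")));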
diff --git a/sysdeps/aarch64/dl-machine.h b/sysdeps/aarch64/dl-machine.h
index 282805e396..4935aa7c54 100644
--- a/sysdeps/aarch64/dl-machine.h
+++ b/sysdeps/aarch64/dl-machine.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 1995-2016 Free Software Foundation, Inc.
+/* Copyright (C) 1995-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -21,9 +21,11 @@
#define ELF_MACHINE_NAME "aarch64"
+#include <sysdep.h>
#include <tls.h>
#include <dl-tlsdesc.h>
#include <dl-irel.h>
+#include <cpu-features.c>
/* Return nonzero iff ELF header is compatible with the running host. */
static inline int __attribute__ ((unused))
@@ -49,26 +51,11 @@ elf_machine_load_address (void)
/* To figure out the load address we use the definition that for any symbol:
dynamic_addr(symbol) = static_addr(symbol) + load_addr
- The choice of symbol is arbitrary. The static address we obtain
- by constructing a non GOT reference to the symbol, the dynamic
- address of the symbol we compute using adrp/add to compute the
- symbol's address relative to the PC.
- This depends on 32bit relocations being resolved at link time
- and that the static address fits in the 32bits. */
-
- ElfW(Addr) static_addr;
- ElfW(Addr) dynamic_addr;
-
- asm (" \n"
-" adrp %1, _dl_start; \n"
-" add %1, %1, #:lo12:_dl_start \n"
-" ldr %w0, 1f \n"
-" b 2f \n"
-"1: \n"
-" .word _dl_start \n"
-"2: \n"
- : "=r" (static_addr), "=r" (dynamic_addr));
- return dynamic_addr - static_addr;
+ The _DYNAMIC symbol is used here, as its link-time address is stored
+ in the special unrelocated first GOT entry. */
+
+ extern ElfW(Dyn) _DYNAMIC[] attribute_hidden;
+ return (ElfW(Addr)) &_DYNAMIC - elf_machine_dynamic ();
}
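
Spelled out, the replacement body applies the identity quoted in the comment with sym = _DYNAMIC (a restating sketch; nothing beyond the code above is assumed):

    /* load_addr = dynamic_addr (sym) - static_addr (sym): the run-time
       address comes from &_DYNAMIC, the link-time address from GOT[0]
       via elf_machine_dynamic ().  */
    extern ElfW(Dyn) _DYNAMIC[] attribute_hidden;

    static inline ElfW(Addr)
    load_address_sketch (void)
    {
      return (ElfW(Addr)) &_DYNAMIC - elf_machine_dynamic ();
    }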
/* Set up the loaded object described by L so its unrelocated PLT
@@ -115,97 +102,117 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
}
}
- if (l->l_info[ADDRIDX (DT_TLSDESC_GOT)] && lazy)
- *(ElfW(Addr)*)(D_PTR (l, l_info[ADDRIDX (DT_TLSDESC_GOT)]) + l->l_addr)
- = (ElfW(Addr)) &_dl_tlsdesc_resolve_rela;
-
return lazy;
}
/* Initial entry point for the dynamic linker. The C function
_dl_start is the real entry point, its return value is the user
program's entry point */
+#ifdef __LP64__
+# define RTLD_START RTLD_START_1 ("x", "3", "sp")
+#else
+# define RTLD_START RTLD_START_1 ("w", "2", "wsp")
+#endif
-#define RTLD_START asm ("\
-.text \n\
-.globl _start \n\
-.type _start, %function \n\
-.globl _dl_start_user \n\
-.type _dl_start_user, %function \n\
-_start: \n\
- mov x0, sp \n\
- bl _dl_start \n\
- // returns user entry point in x0 \n\
- mov x21, x0 \n\
-_dl_start_user: \n\
- // get the original arg count \n\
- ldr x1, [sp] \n\
- // get the argv address \n\
- add x2, sp, #8 \n\
- // get _dl_skip_args to see if we were \n\
- // invoked as an executable \n\
- adrp x4, _dl_skip_args \n\
- ldr w4, [x4, #:lo12:_dl_skip_args] \n\
- // do we need to adjust argc/argv \n\
- cmp w4, 0 \n\
- beq .L_done_stack_adjust \n\
- // subtract _dl_skip_args from original arg count \n\
- sub x1, x1, x4 \n\
- // store adjusted argc back to stack \n\
- str x1, [sp] \n\
- // find the first unskipped argument \n\
- mov x3, x2 \n\
- add x4, x2, x4, lsl #3 \n\
- // shuffle argv down \n\
-1: ldr x5, [x4], #8 \n\
- str x5, [x3], #8 \n\
- cmp x5, #0 \n\
- bne 1b \n\
- // shuffle envp down \n\
-1: ldr x5, [x4], #8 \n\
- str x5, [x3], #8 \n\
- cmp x5, #0 \n\
- bne 1b \n\
- // shuffle auxv down \n\
-1: ldp x0, x5, [x4, #16]! \n\
- stp x0, x5, [x3], #16 \n\
- cmp x0, #0 \n\
- bne 1b \n\
- // Update _dl_argv \n\
- adrp x3, _dl_argv \n\
- str x2, [x3, #:lo12:_dl_argv] \n\
-.L_done_stack_adjust: \n\
- // compute envp \n\
- add x3, x2, x1, lsl #3 \n\
- add x3, x3, #8 \n\
- adrp x16, _rtld_local \n\
- add x16, x16, #:lo12:_rtld_local \n\
- ldr x0, [x16] \n\
- bl _dl_init \n\
- // load the finalizer function \n\
- adrp x0, _dl_fini \n\
- add x0, x0, #:lo12:_dl_fini \n\
- // jump to the user_s entry point \n\
- br x21 \n\
+
+#define RTLD_START_1(PTR, PTR_SIZE_LOG, PTR_SP) asm ("\
+.text \n\
+.globl _start \n\
+.type _start, %function \n\
+.globl _dl_start_user \n\
+.type _dl_start_user, %function \n\
+_start: \n\
+ mov " PTR "0, " PTR_SP " \n\
+ bl _dl_start \n\
+ // returns user entry point in x0 \n\
+ mov x21, x0 \n\
+_dl_start_user: \n\
+ // get the original arg count \n\
+ ldr " PTR "1, [sp] \n\
+ // get the argv address \n\
+ add " PTR "2, " PTR_SP ", #(1<<" PTR_SIZE_LOG ") \n\
+ // get _dl_skip_args to see if we were \n\
+ // invoked as an executable \n\
+ adrp x4, _dl_skip_args \n\
+ ldr w4, [x4, #:lo12:_dl_skip_args] \n\
+ // do we need to adjust argc/argv \n\
+ cmp w4, 0 \n\
+ beq .L_done_stack_adjust \n\
+ // subtract _dl_skip_args from original arg count \n\
+ sub " PTR "1, " PTR "1, " PTR "4 \n\
+ // store adjusted argc back to stack \n\
+ str " PTR "1, [sp] \n\
+ // find the first unskipped argument \n\
+ mov " PTR "3, " PTR "2 \n\
+ add " PTR "4, " PTR "2, " PTR "4, lsl #" PTR_SIZE_LOG " \n\
+ // shuffle argv down \n\
+1: ldr " PTR "5, [x4], #(1<<" PTR_SIZE_LOG ") \n\
+ str " PTR "5, [x3], #(1<<" PTR_SIZE_LOG ") \n\
+ cmp " PTR "5, #0 \n\
+ bne 1b \n\
+ // shuffle envp down \n\
+1: ldr " PTR "5, [x4], #(1<<" PTR_SIZE_LOG ") \n\
+ str " PTR "5, [x3], #(1<<" PTR_SIZE_LOG ") \n\
+ cmp " PTR "5, #0 \n\
+ bne 1b \n\
+ // shuffle auxv down \n\
+1: ldp " PTR "0, " PTR "5, [x4, #(2<<" PTR_SIZE_LOG ")]! \n\
+ stp " PTR "0, " PTR "5, [x3], #(2<<" PTR_SIZE_LOG ") \n\
+ cmp " PTR "0, #0 \n\
+ bne 1b \n\
+ // Update _dl_argv \n\
+ adrp x3, __GI__dl_argv \n\
+ str " PTR "2, [x3, #:lo12:__GI__dl_argv] \n\
+.L_done_stack_adjust: \n\
+ // compute envp \n\
+ add " PTR "3, " PTR "2, " PTR "1, lsl #" PTR_SIZE_LOG " \n\
+ add " PTR "3, " PTR "3, #(1<<" PTR_SIZE_LOG ") \n\
+ adrp x16, _rtld_local \n\
+ add " PTR "16, " PTR "16, #:lo12:_rtld_local \n\
+ ldr " PTR "0, [x16] \n\
+ bl _dl_init \n\
+ // load the finalizer function \n\
+ adrp x0, _dl_fini \n\
+ add " PTR "0, " PTR "0, #:lo12:_dl_fini \n\
+ // jump to the user_s entry point \n\
+ br x21 \n\
");
#define elf_machine_type_class(type) \
- ((((type) == R_AARCH64_JUMP_SLOT || \
- (type) == R_AARCH64_TLS_DTPMOD || \
- (type) == R_AARCH64_TLS_DTPREL || \
- (type) == R_AARCH64_TLS_TPREL || \
- (type) == R_AARCH64_TLSDESC) * ELF_RTYPE_CLASS_PLT) \
- | (((type) == R_AARCH64_COPY) * ELF_RTYPE_CLASS_COPY) \
- | (((type) == R_AARCH64_GLOB_DAT) * ELF_RTYPE_CLASS_EXTERN_PROTECTED_DATA))
+ ((((type) == AARCH64_R(JUMP_SLOT) \
+ || (type) == AARCH64_R(TLS_DTPMOD) \
+ || (type) == AARCH64_R(TLS_DTPREL) \
+ || (type) == AARCH64_R(TLS_TPREL) \
+ || (type) == AARCH64_R(TLSDESC)) * ELF_RTYPE_CLASS_PLT) \
+ | (((type) == AARCH64_R(COPY)) * ELF_RTYPE_CLASS_COPY) \
+ | (((type) == AARCH64_R(GLOB_DAT)) * ELF_RTYPE_CLASS_EXTERN_PROTECTED_DATA))
-#define ELF_MACHINE_JMP_SLOT R_AARCH64_JUMP_SLOT
+#define ELF_MACHINE_JMP_SLOT AARCH64_R(JUMP_SLOT)
/* AArch64 uses RELA not REL */
#define ELF_MACHINE_NO_REL 1
#define ELF_MACHINE_NO_RELA 0
+#define DL_PLATFORM_INIT dl_platform_init ()
+
+static inline void __attribute__ ((unused))
+dl_platform_init (void)
+{
+ if (GLRO(dl_platform) != NULL && *GLRO(dl_platform) == '\0')
+ /* Avoid an empty string which would disturb us. */
+ GLRO(dl_platform) = NULL;
+
+#ifdef SHARED
+ /* init_cpu_features has already been called early from
+ __libc_start_main in a static executable. */
+ init_cpu_features (&GLRO(dl_aarch64_cpu_features));
+#endif
+}
+
+
static inline ElfW(Addr)
elf_machine_fixup_plt (struct link_map *map, lookup_t t,
+ const ElfW(Sym) *refsym, const ElfW(Sym) *sym,
const ElfW(Rela) *reloc,
ElfW(Addr) *reloc_addr,
ElfW(Addr) value)
@@ -237,9 +244,9 @@ elf_machine_rela (struct link_map *map, const ElfW(Rela) *reloc,
void *const reloc_addr_arg, int skip_ifunc)
{
ElfW(Addr) *const reloc_addr = reloc_addr_arg;
- const unsigned int r_type = ELF64_R_TYPE (reloc->r_info);
+ const unsigned int r_type = ELFW (R_TYPE) (reloc->r_info);
- if (__builtin_expect (r_type == R_AARCH64_RELATIVE, 0))
+ if (__builtin_expect (r_type == AARCH64_R(RELATIVE), 0))
*reloc_addr = map->l_addr + reloc->r_addend;
else if (__builtin_expect (r_type == R_AARCH64_NONE, 0))
return;
@@ -247,7 +254,7 @@ elf_machine_rela (struct link_map *map, const ElfW(Rela) *reloc,
{
const ElfW(Sym) *const refsym = sym;
struct link_map *sym_map = RESOLVE_MAP (&sym, version, r_type);
- ElfW(Addr) value = sym_map == NULL ? 0 : sym_map->l_addr + sym->st_value;
+ ElfW(Addr) value = SYMBOL_ADDRESS (sym_map, sym, true);
if (sym != NULL
&& __glibc_unlikely (ELFW(ST_TYPE) (sym->st_info) == STT_GNU_IFUNC)
@@ -257,7 +264,7 @@ elf_machine_rela (struct link_map *map, const ElfW(Rela) *reloc,
switch (r_type)
{
- case R_AARCH64_COPY:
+ case AARCH64_R(COPY):
if (sym == NULL)
break;
@@ -272,18 +279,21 @@ elf_machine_rela (struct link_map *map, const ElfW(Rela) *reloc,
RTLD_PROGNAME, strtab + refsym->st_name);
}
memcpy (reloc_addr_arg, (void *) value,
- MIN (sym->st_size, refsym->st_size));
+ sym->st_size < refsym->st_size
+ ? sym->st_size : refsym->st_size);
break;
- case R_AARCH64_RELATIVE:
- case R_AARCH64_GLOB_DAT:
- case R_AARCH64_JUMP_SLOT:
- case R_AARCH64_ABS32:
- case R_AARCH64_ABS64:
+ case AARCH64_R(RELATIVE):
+ case AARCH64_R(GLOB_DAT):
+ case AARCH64_R(JUMP_SLOT):
+ case AARCH64_R(ABS32):
+#ifdef __LP64__
+ case AARCH64_R(ABS64):
+#endif
*reloc_addr = value + reloc->r_addend;
break;
- case R_AARCH64_TLSDESC:
+ case AARCH64_R(TLSDESC):
{
struct tlsdesc volatile *td =
(struct tlsdesc volatile *)reloc_addr;
@@ -318,7 +328,7 @@ elf_machine_rela (struct link_map *map, const ElfW(Rela) *reloc,
break;
}
- case R_AARCH64_TLS_DTPMOD:
+ case AARCH64_R(TLS_DTPMOD):
#ifdef RTLD_BOOTSTRAP
*reloc_addr = 1;
#else
@@ -329,12 +339,12 @@ elf_machine_rela (struct link_map *map, const ElfW(Rela) *reloc,
#endif
break;
- case R_AARCH64_TLS_DTPREL:
+ case AARCH64_R(TLS_DTPREL):
if (sym)
*reloc_addr = sym->st_value + reloc->r_addend;
break;
- case R_AARCH64_TLS_TPREL:
+ case AARCH64_R(TLS_TPREL):
if (sym)
{
CHECK_STATIC_TLS (map, sym_map);
@@ -343,7 +353,7 @@ elf_machine_rela (struct link_map *map, const ElfW(Rela) *reloc,
}
break;
- case R_AARCH64_IRELATIVE:
+ case AARCH64_R(IRELATIVE):
value = map->l_addr + reloc->r_addend;
value = elf_ifunc_invoke (value);
*reloc_addr = value;
@@ -374,25 +384,34 @@ elf_machine_lazy_rel (struct link_map *map,
int skip_ifunc)
{
ElfW(Addr) *const reloc_addr = (void *) (l_addr + reloc->r_offset);
- const unsigned int r_type = ELF64_R_TYPE (reloc->r_info);
+ const unsigned int r_type = ELFW (R_TYPE) (reloc->r_info);
/* Check for unexpected PLT reloc type. */
- if (__builtin_expect (r_type == R_AARCH64_JUMP_SLOT, 1))
+ if (__builtin_expect (r_type == AARCH64_R(JUMP_SLOT), 1))
{
if (__builtin_expect (map->l_mach.plt, 0) == 0)
*reloc_addr += l_addr;
else
*reloc_addr = map->l_mach.plt;
}
- else if (__builtin_expect (r_type == R_AARCH64_TLSDESC, 1))
+ else if (__builtin_expect (r_type == AARCH64_R(TLSDESC), 1))
{
- struct tlsdesc volatile *td =
- (struct tlsdesc volatile *)reloc_addr;
+ const Elf_Symndx symndx = ELFW (R_SYM) (reloc->r_info);
+ const ElfW (Sym) *symtab = (const void *)D_PTR (map, l_info[DT_SYMTAB]);
+ const ElfW (Sym) *sym = &symtab[symndx];
+ const struct r_found_version *version = NULL;
+
+ if (map->l_info[VERSYMIDX (DT_VERSYM)] != NULL)
+ {
+ const ElfW (Half) *vernum =
+ (const void *)D_PTR (map, l_info[VERSYMIDX (DT_VERSYM)]);
+ version = &map->l_versions[vernum[symndx] & 0x7fff];
+ }
- td->arg = (void*)reloc;
- td->entry = (void*)(D_PTR (map, l_info[ADDRIDX (DT_TLSDESC_PLT)])
- + map->l_addr);
+ /* Always initialize TLS descriptors completely, because lazy
+ initialization requires synchronization at every TLS access. */
+ elf_machine_rela (map, reloc, sym, version, reloc_addr, skip_ifunc);
}
- else if (__glibc_unlikely (r_type == R_AARCH64_IRELATIVE))
+ else if (__glibc_unlikely (r_type == AARCH64_R(IRELATIVE)))
{
ElfW(Addr) value = map->l_addr + reloc->r_addend;
if (__glibc_likely (!skip_ifunc))
diff --git a/sysdeps/aarch64/dl-sysdep.h b/sysdeps/aarch64/dl-sysdep.h
index 9c05d4b0dc..a0e6a2eed1 100644
--- a/sysdeps/aarch64/dl-sysdep.h
+++ b/sysdeps/aarch64/dl-sysdep.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2002-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2002-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
diff --git a/sysdeps/aarch64/dl-tls.h b/sysdeps/aarch64/dl-tls.h
index 71265c5341..153fded939 100644
--- a/sysdeps/aarch64/dl-tls.h
+++ b/sysdeps/aarch64/dl-tls.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2005-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2005-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -25,6 +25,3 @@ typedef struct
extern void *__tls_get_addr (tls_index *ti);
-
-/* Value used for dtv entries for which the allocation is delayed. */
-#define TLS_DTV_UNALLOCATED ((void *) -1l)
diff --git a/sysdeps/aarch64/dl-tlsdesc.S b/sysdeps/aarch64/dl-tlsdesc.S
index 05be370abb..43a62ef307 100644
--- a/sysdeps/aarch64/dl-tlsdesc.S
+++ b/sysdeps/aarch64/dl-tlsdesc.S
@@ -1,6 +1,6 @@
/* Thread-local storage handling in the ELF dynamic linker.
AArch64 version.
- Copyright (C) 2011-2016 Free Software Foundation, Inc.
+ Copyright (C) 2011-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -74,34 +74,12 @@
cfi_startproc
.align 2
_dl_tlsdesc_return:
- ldr x0, [x0, #8]
+ DELOUSE (0)
+ ldr PTR_REG (0), [x0, #PTR_SIZE]
RET
cfi_endproc
.size _dl_tlsdesc_return, .-_dl_tlsdesc_return
- /* Same as _dl_tlsdesc_return but with synchronization for
- lazy relocation.
- Prototype:
- _dl_tlsdesc_return_lazy (tlsdesc *) ;
- */
- .hidden _dl_tlsdesc_return_lazy
- .global _dl_tlsdesc_return_lazy
- .type _dl_tlsdesc_return_lazy,%function
- cfi_startproc
- .align 2
-_dl_tlsdesc_return_lazy:
- /* The ldar here happens after the load from [x0] at the call site
- (that is generated by the compiler as part of the TLS access ABI),
- so it reads the same value (this function is the final value of
- td->entry) and thus it synchronizes with the release store to
- td->entry in _dl_tlsdesc_resolve_rela_fixup ensuring that the load
- from [x0,#8] here happens after the initialization of td->arg. */
- ldar xzr, [x0]
- ldr x0, [x0, #8]
- RET
- cfi_endproc
- .size _dl_tlsdesc_return_lazy, .-_dl_tlsdesc_return_lazy
-
/* Handler for undefined weak TLS symbols.
Prototype:
_dl_tlsdesc_undefweak (tlsdesc *);
@@ -119,16 +97,10 @@ _dl_tlsdesc_return_lazy:
_dl_tlsdesc_undefweak:
str x1, [sp, #-16]!
cfi_adjust_cfa_offset (16)
- /* The ldar here happens after the load from [x0] at the call site
- (that is generated by the compiler as part of the TLS access ABI),
- so it reads the same value (this function is the final value of
- td->entry) and thus it synchronizes with the release store to
- td->entry in _dl_tlsdesc_resolve_rela_fixup ensuring that the load
- from [x0,#8] here happens after the initialization of td->arg. */
- ldar xzr, [x0]
- ldr x0, [x0, #8]
+ DELOUSE (0)
+ ldr PTR_REG (0), [x0, #PTR_SIZE]
mrs x1, tpidr_el0
- sub x0, x0, x1
+ sub PTR_REG (0), PTR_REG (0), PTR_REG (1)
ldr x1, [sp], #16
cfi_adjust_cfa_offset (-16)
RET
@@ -151,7 +123,7 @@ _dl_tlsdesc_undefweak:
_dl_tlsdesc_dynamic (struct tlsdesc *tdp)
{
struct tlsdesc_dynamic_arg *td = tdp->arg;
- dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET);
+ dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + TCBHEAD_DTV);
if (__builtin_expect (td->gen_count <= dtv[0].counter
&& (dtv[td->tlsinfo.ti_module].pointer.val
!= TLS_DTV_UNALLOCATED),
@@ -170,46 +142,37 @@ _dl_tlsdesc_undefweak:
cfi_startproc
.align 2
_dl_tlsdesc_dynamic:
-# define NSAVEXREGPAIRS 2
- stp x29, x30, [sp,#-(32+16*NSAVEXREGPAIRS)]!
- cfi_adjust_cfa_offset (32+16*NSAVEXREGPAIRS)
- mov x29, sp
+ DELOUSE (0)
/* Save just enough registers to support fast path, if we fall
into slow path we will save additional registers. */
-
- stp x1, x2, [sp, #32+16*0]
- stp x3, x4, [sp, #32+16*1]
+ stp x1, x2, [sp, #-32]!
+ stp x3, x4, [sp, #16]
+ cfi_adjust_cfa_offset (32)
+ cfi_rel_offset (x1, 0)
+ cfi_rel_offset (x2, 8)
+ cfi_rel_offset (x3, 16)
+ cfi_rel_offset (x4, 24)
mrs x4, tpidr_el0
- /* The ldar here happens after the load from [x0] at the call site
- (that is generated by the compiler as part of the TLS access ABI),
- so it reads the same value (this function is the final value of
- td->entry) and thus it synchronizes with the release store to
- td->entry in _dl_tlsdesc_resolve_rela_fixup ensuring that the load
- from [x0,#8] here happens after the initialization of td->arg. */
- ldar xzr, [x0]
- ldr x1, [x0,#8]
- ldr x0, [x4]
- ldr x3, [x1,#16]
- ldr x2, [x0]
- cmp x3, x2
+ ldr PTR_REG (1), [x0,#TLSDESC_ARG]
+ ldr PTR_REG (0), [x4,#TCBHEAD_DTV]
+ ldr PTR_REG (3), [x1,#TLSDESC_GEN_COUNT]
+ ldr PTR_REG (2), [x0,#DTV_COUNTER]
+ cmp PTR_REG (3), PTR_REG (2)
b.hi 2f
- ldr x2, [x1]
- add x0, x0, x2, lsl #4
- ldr x0, [x0]
- cmn x0, #0x1
+ /* Load r2 = td->tlsinfo.ti_module and r3 = td->tlsinfo.ti_offset. */
+ ldp PTR_REG (2), PTR_REG (3), [x1,#TLSDESC_MODID]
+ add PTR_REG (0), PTR_REG (0), PTR_REG (2), lsl #(PTR_LOG_SIZE + 1)
+ ldr PTR_REG (0), [x0] /* Load val member of DTV entry. */
+ cmp PTR_REG (0), #TLS_DTV_UNALLOCATED
b.eq 2f
- ldr x1, [x1,#8]
- add x0, x0, x1
- sub x0, x0, x4
+ sub PTR_REG (3), PTR_REG (3), PTR_REG (4)
+ add PTR_REG (0), PTR_REG (0), PTR_REG (3)
1:
- ldp x1, x2, [sp, #32+16*0]
- ldp x3, x4, [sp, #32+16*1]
-
- ldp x29, x30, [sp], #(32+16*NSAVEXREGPAIRS)
- cfi_adjust_cfa_offset (-32-16*NSAVEXREGPAIRS)
-# undef NSAVEXREGPAIRS
+ ldp x3, x4, [sp, #16]
+ ldp x1, x2, [sp], #32
+ cfi_adjust_cfa_offset (-32)
RET
2:
/* This is the slow path. We need to call __tls_get_addr() which
@@ -217,15 +180,33 @@ _dl_tlsdesc_dynamic:
callee will trash. */
/* Save the remaining registers that we must treat as caller save. */
-# define NSAVEXREGPAIRS 7
- stp x5, x6, [sp, #-16*NSAVEXREGPAIRS]!
+# define NSAVEXREGPAIRS 8
+ stp x29, x30, [sp,#-16*NSAVEXREGPAIRS]!
cfi_adjust_cfa_offset (16*NSAVEXREGPAIRS)
- stp x7, x8, [sp, #16*1]
- stp x9, x10, [sp, #16*2]
- stp x11, x12, [sp, #16*3]
- stp x13, x14, [sp, #16*4]
- stp x15, x16, [sp, #16*5]
- stp x17, x18, [sp, #16*6]
+ cfi_rel_offset (x29, 0)
+ cfi_rel_offset (x30, 8)
+ mov x29, sp
+ stp x5, x6, [sp, #16*1]
+ stp x7, x8, [sp, #16*2]
+ stp x9, x10, [sp, #16*3]
+ stp x11, x12, [sp, #16*4]
+ stp x13, x14, [sp, #16*5]
+ stp x15, x16, [sp, #16*6]
+ stp x17, x18, [sp, #16*7]
+ cfi_rel_offset (x5, 16*1)
+ cfi_rel_offset (x6, 16*1+8)
+ cfi_rel_offset (x7, 16*2)
+ cfi_rel_offset (x8, 16*2+8)
+ cfi_rel_offset (x9, 16*3)
+ cfi_rel_offset (x10, 16*3+8)
+ cfi_rel_offset (x11, 16*4)
+ cfi_rel_offset (x12, 16*4+8)
+ cfi_rel_offset (x13, 16*5)
+ cfi_rel_offset (x14, 16*5+8)
+ cfi_rel_offset (x15, 16*6)
+ cfi_rel_offset (x16, 16*6+8)
+ cfi_rel_offset (x17, 16*7)
+ cfi_rel_offset (x18, 16*7+8)
SAVE_Q_REGISTERS
@@ -233,134 +214,24 @@ _dl_tlsdesc_dynamic:
bl __tls_get_addr
mrs x1, tpidr_el0
- sub x0, x0, x1
+ sub PTR_REG (0), PTR_REG (0), PTR_REG (1)
RESTORE_Q_REGISTERS
- ldp x7, x8, [sp, #16*1]
- ldp x9, x10, [sp, #16*2]
- ldp x11, x12, [sp, #16*3]
- ldp x13, x14, [sp, #16*4]
- ldp x15, x16, [sp, #16*5]
- ldp x17, x18, [sp, #16*6]
- ldp x5, x6, [sp], #16*NSAVEXREGPAIRS
+ ldp x5, x6, [sp, #16*1]
+ ldp x7, x8, [sp, #16*2]
+ ldp x9, x10, [sp, #16*3]
+ ldp x11, x12, [sp, #16*4]
+ ldp x13, x14, [sp, #16*5]
+ ldp x15, x16, [sp, #16*6]
+ ldp x17, x18, [sp, #16*7]
+
+ ldp x29, x30, [sp], #16*NSAVEXREGPAIRS
cfi_adjust_cfa_offset (-16*NSAVEXREGPAIRS)
+ cfi_restore (x29)
+ cfi_restore (x30)
b 1b
cfi_endproc
.size _dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
# undef NSAVEXREGPAIRS
#endif
-
- /* This function is a wrapper for a lazy resolver for TLS_DESC
- RELA relocations.
- When the actual resolver returns, it will have adjusted the
- TLS descriptor such that we can tail-call it for it to return
- the TP offset of the symbol. */
-
- .hidden _dl_tlsdesc_resolve_rela
- .global _dl_tlsdesc_resolve_rela
- .type _dl_tlsdesc_resolve_rela,%function
- cfi_startproc
- .align 2
-_dl_tlsdesc_resolve_rela:
-#define NSAVEXREGPAIRS 9
- stp x29, x30, [sp, #-(32+16*NSAVEXREGPAIRS)]!
- cfi_adjust_cfa_offset (32+16*NSAVEXREGPAIRS)
- mov x29, sp
- stp x1, x4, [sp, #32+16*0]
- stp x5, x6, [sp, #32+16*1]
- stp x7, x8, [sp, #32+16*2]
- stp x9, x10, [sp, #32+16*3]
- stp x11, x12, [sp, #32+16*4]
- stp x13, x14, [sp, #32+16*5]
- stp x15, x16, [sp, #32+16*6]
- stp x17, x18, [sp, #32+16*7]
- str x0, [sp, #32+16*8]
-
- SAVE_Q_REGISTERS
-
- ldr x1, [x3, #8]
- bl _dl_tlsdesc_resolve_rela_fixup
-
- RESTORE_Q_REGISTERS
-
- ldr x0, [sp, #32+16*8]
- ldr x1, [x0]
- blr x1
-
- ldp x1, x4, [sp, #32+16*0]
- ldp x5, x6, [sp, #32+16*1]
- ldp x7, x8, [sp, #32+16*2]
- ldp x9, x10, [sp, #32+16*3]
- ldp x11, x12, [sp, #32+16*4]
- ldp x13, x14, [sp, #32+16*5]
- ldp x15, x16, [sp, #32+16*6]
- ldp x17, x18, [sp, #32+16*7]
- ldp x29, x30, [sp], #(32+16*NSAVEXREGPAIRS)
- cfi_adjust_cfa_offset (-32-16*NSAVEXREGPAIRS)
- ldp x2, x3, [sp], #16
- cfi_adjust_cfa_offset (-16)
- RET
-#undef NSAVEXREGPAIRS
- cfi_endproc
- .size _dl_tlsdesc_resolve_rela, .-_dl_tlsdesc_resolve_rela
-
- /* This function is a placeholder for lazy resolving of TLS
- relocations. Once some thread starts resolving a TLS
- relocation, it sets up the TLS descriptor to use this
- resolver, such that other threads that would attempt to
- resolve it concurrently may skip the call to the original lazy
- resolver and go straight to a condition wait.
-
- When the actual resolver returns, it will have adjusted the
- TLS descriptor such that we can tail-call it for it to return
- the TP offset of the symbol. */
-
- .hidden _dl_tlsdesc_resolve_hold
- .global _dl_tlsdesc_resolve_hold
- .type _dl_tlsdesc_resolve_hold,%function
- cfi_startproc
- .align 2
-_dl_tlsdesc_resolve_hold:
-#define NSAVEXREGPAIRS 10
-1:
- stp x29, x30, [sp, #-(32+16*NSAVEXREGPAIRS)]!
- cfi_adjust_cfa_offset (32+16*NSAVEXREGPAIRS)
- mov x29, sp
- stp x1, x2, [sp, #32+16*0]
- stp x3, x4, [sp, #32+16*1]
- stp x5, x6, [sp, #32+16*2]
- stp x7, x8, [sp, #32+16*3]
- stp x9, x10, [sp, #32+16*4]
- stp x11, x12, [sp, #32+16*5]
- stp x13, x14, [sp, #32+16*6]
- stp x15, x16, [sp, #32+16*7]
- stp x17, x18, [sp, #32+16*8]
- str x0, [sp, #32+16*9]
-
- SAVE_Q_REGISTERS
-
- adr x1, 1b
- bl _dl_tlsdesc_resolve_hold_fixup
-
- RESTORE_Q_REGISTERS
-
- ldr x0, [sp, #32+16*9]
- ldr x1, [x0]
- blr x1
-
- ldp x1, x2, [sp, #32+16*0]
- ldp x3, x4, [sp, #32+16*1]
- ldp x5, x6, [sp, #32+16*2]
- ldp x7, x8, [sp, #32+16*3]
- ldp x9, x10, [sp, #32+16*4]
- ldp x11, x12, [sp, #32+16*5]
- ldp x13, x14, [sp, #32+16*6]
- ldp x15, x16, [sp, #32+16*7]
- ldp x17, x18, [sp, #32+16*8]
- ldp x29, x30, [sp], #(32+16*NSAVEXREGPAIRS)
- cfi_adjust_cfa_offset (-32-16*NSAVEXREGPAIRS)
- RET
- cfi_endproc
- .size _dl_tlsdesc_resolve_hold, .-_dl_tlsdesc_resolve_hold
-#undef NSAVEXREGPAIRS
diff --git a/sysdeps/aarch64/dl-tlsdesc.h b/sysdeps/aarch64/dl-tlsdesc.h
index 8532469fd9..6dc80ad66c 100644
--- a/sysdeps/aarch64/dl-tlsdesc.h
+++ b/sysdeps/aarch64/dl-tlsdesc.h
@@ -1,6 +1,6 @@
/* Thread-local storage descriptor handling in the ELF dynamic linker.
AArch64 version.
- Copyright (C) 2011-2016 Free Software Foundation, Inc.
+ Copyright (C) 2011-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -46,20 +46,10 @@ extern ptrdiff_t attribute_hidden
_dl_tlsdesc_return (struct tlsdesc *);
extern ptrdiff_t attribute_hidden
-_dl_tlsdesc_return_lazy (struct tlsdesc *);
-
-extern ptrdiff_t attribute_hidden
_dl_tlsdesc_undefweak (struct tlsdesc *);
-extern ptrdiff_t attribute_hidden
-_dl_tlsdesc_resolve_rela (struct tlsdesc *);
-
-extern ptrdiff_t attribute_hidden
-_dl_tlsdesc_resolve_hold (struct tlsdesc *);
-
# ifdef SHARED
-extern void *internal_function _dl_make_tlsdesc_dynamic (struct link_map *,
- size_t);
+extern void *_dl_make_tlsdesc_dynamic (struct link_map *, size_t);
extern ptrdiff_t attribute_hidden
_dl_tlsdesc_dynamic (struct tlsdesc *);
diff --git a/sysdeps/aarch64/dl-trampoline.S b/sysdeps/aarch64/dl-trampoline.S
index 947a51579f..a86d0722d4 100644
--- a/sysdeps/aarch64/dl-trampoline.S
+++ b/sysdeps/aarch64/dl-trampoline.S
@@ -1,4 +1,4 @@
-/* Copyright (C) 2005-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2005-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -22,9 +22,13 @@
#include "dl-link.h"
#define ip0 x16
+#define ip0l PTR_REG (16)
#define ip1 x17
#define lr x30
+/* RELA relocations are 3 pointers. */
+#define RELA_SIZE (PTR_SIZE * 3)
+
.text
.globl _dl_runtime_resolve
.type _dl_runtime_resolve, #function
@@ -79,7 +83,7 @@ _dl_runtime_resolve:
cfi_rel_offset (q1, 80+7*16)
/* Get pointer to linker struct. */
- ldr x0, [ip0, #-8]
+ ldr PTR_REG (0), [ip0, #-PTR_SIZE]
/* Prepare to call _dl_fixup(). */
ldr x1, [sp, 80+8*16] /* Recover &PLTGOT[n] */
@@ -87,7 +91,7 @@ _dl_runtime_resolve:
sub x1, x1, ip0
add x1, x1, x1, lsl #1
lsl x1, x1, #3
- sub x1, x1, #192
+ sub x1, x1, #(RELA_SIZE<<3)
lsr x1, x1, #3
/* Call fixup routine. */
@@ -191,7 +195,7 @@ _dl_runtime_profile:
stp x0, x1, [x29, #OFFSET_RG + DL_OFFSET_RG_SP]
/* Get pointer to linker struct. */
- ldr x0, [ip0, #-8]
+ ldr PTR_REG (0), [ip0, #-PTR_SIZE]
/* Prepare to call _dl_profile_fixup(). */
ldr x1, [x29, OFFSET_PLTGOTN] /* Recover &PLTGOT[n] */
@@ -199,7 +203,7 @@ _dl_runtime_profile:
sub x1, x1, ip0
add x1, x1, x1, lsl #1
lsl x1, x1, #3
- sub x1, x1, #192
+ sub x1, x1, #(RELA_SIZE<<3)
lsr x1, x1, #3
stp x0, x1, [x29, #OFFSET_SAVED_CALL_X0]
@@ -210,8 +214,8 @@ _dl_runtime_profile:
add x4, x29, #OFFSET_FS /* address of framesize */
bl _dl_profile_fixup
- ldr ip0, [x29, #OFFSET_FS] /* framesize == 0 */
- cmp ip0, #0
+ ldr ip0l, [x29, #OFFSET_FS] /* framesize == 0 */
+ cmp ip0l, #0
bge 1f
cfi_remember_state
@@ -243,7 +247,7 @@ _dl_runtime_profile:
1:
/* The new frame size is in ip0. */
- sub x1, x29, ip0
+ sub PTR_REG (1), PTR_REG (29), ip0l
and sp, x1, #0xfffffffffffffff0
str x0, [x29, #OFFSET_T1]
diff --git a/sysdeps/aarch64/dl-tunables.list b/sysdeps/aarch64/dl-tunables.list
new file mode 100644
index 0000000000..f6a88168cc
--- /dev/null
+++ b/sysdeps/aarch64/dl-tunables.list
@@ -0,0 +1,25 @@
+# aarch64 specific tunables.
+# Copyright (C) 2017-2018 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <http://www.gnu.org/licenses/>.
+
+glibc {
+ tune {
+ cpu {
+ type: STRING
+ }
+ }
+}
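
This registers a string tunable named glibc.tune.cpu. At run time it is supplied through the standard GLIBC_TUNABLES environment variable, e.g. GLIBC_TUNABLES=glibc.tune.cpu=generic ./app; the accepted CPU names are whatever the aarch64 cpu_list in init_cpu_features recognizes ("generic" is an assumption here, shown only to illustrate the syntax).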
diff --git a/sysdeps/aarch64/soft-fp/e_sqrtl.c b/sysdeps/aarch64/e_sqrtl.c
index 5d3919b6a2..1dc1cbf003 100644
--- a/sysdeps/aarch64/soft-fp/e_sqrtl.c
+++ b/sysdeps/aarch64/e_sqrtl.c
@@ -1,5 +1,5 @@
/* long double square root in software floating-point emulation.
- Copyright (C) 1997-2016 Free Software Foundation, Inc.
+ Copyright (C) 1997-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Richard Henderson (rth@cygnus.com) and
Jakub Jelinek (jj@ultra.linux.cz).
diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile
new file mode 100644
index 0000000000..4a182bd6d6
--- /dev/null
+++ b/sysdeps/aarch64/fpu/Makefile
@@ -0,0 +1,14 @@
+ifeq ($(subdir),math)
+CFLAGS-e_sqrtf.c += -fno-math-errno
+CFLAGS-e_sqrt.c += -fno-math-errno
+CFLAGS-s_lroundf.c += -fno-math-errno
+CFLAGS-s_lround.c += -fno-math-errno
+CFLAGS-s_llroundf.c += -fno-math-errno
+CFLAGS-s_llround.c += -fno-math-errno
+# GCC 4.9 and 5 require this flag so that they correctly emit f{max,min}nm
+# for __builtin_{fmax,fmin}{f}.
+CFLAGS-s_fmax.c += -ffinite-math-only
+CFLAGS-s_fmaxf.c += -ffinite-math-only
+CFLAGS-s_fmin.c += -ffinite-math-only
+CFLAGS-s_fminf.c += -ffinite-math-only
+endif
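
The -fno-math-errno flags pave the way for the e_sqrt.c/e_sqrtf.c changes below: once the compiler may ignore errno, __builtin_sqrt can expand inline instead of calling the library. A minimal sketch of the effect (hypothetical function, assuming GCC targeting AArch64):

    /* Compiled with -fno-math-errno this becomes a single fsqrt
       instruction; without the flag GCC must keep a call so that a
       negative argument can set errno.  */
    double
    fast_sqrt (double x)
    {
      return __builtin_sqrt (x);
    }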
diff --git a/sysdeps/aarch64/fpu/e_sqrt.c b/sysdeps/aarch64/fpu/e_sqrt.c
index 2759fb186d..32f1e26803 100644
--- a/sysdeps/aarch64/fpu/e_sqrt.c
+++ b/sysdeps/aarch64/fpu/e_sqrt.c
@@ -1,5 +1,5 @@
/* Square root of floating point number.
- Copyright (C) 2015-2016 Free Software Foundation, Inc.
+ Copyright (C) 2015-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -21,8 +21,6 @@
double
__ieee754_sqrt (double d)
{
- double res;
- asm ("fsqrt %d0, %d1" : "=w" (res) : "w" (d));
- return res;
+ return __builtin_sqrt (d);
}
strong_alias (__ieee754_sqrt, __sqrt_finite)
diff --git a/sysdeps/aarch64/fpu/e_sqrtf.c b/sysdeps/aarch64/fpu/e_sqrtf.c
index e7ce19a38c..87f557cb10 100644
--- a/sysdeps/aarch64/fpu/e_sqrtf.c
+++ b/sysdeps/aarch64/fpu/e_sqrtf.c
@@ -1,5 +1,5 @@
/* Single-precision floating point square root.
- Copyright (C) 2015-2016 Free Software Foundation, Inc.
+ Copyright (C) 2015-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -21,8 +21,6 @@
float
__ieee754_sqrtf (float s)
{
- float res;
- asm ("fsqrt %s0, %s1" : "=w" (res) : "w" (s));
- return res;
+ return __builtin_sqrtf (s);
}
strong_alias (__ieee754_sqrtf, __sqrtf_finite)
diff --git a/sysdeps/aarch64/fpu/fclrexcpt.c b/sysdeps/aarch64/fpu/fclrexcpt.c
index 8c0f09a82d..9328518f1e 100644
--- a/sysdeps/aarch64/fpu/fclrexcpt.c
+++ b/sysdeps/aarch64/fpu/fclrexcpt.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 1997-2016 Free Software Foundation, Inc.
+/* Copyright (C) 1997-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
diff --git a/sysdeps/aarch64/fpu/fedisblxcpt.c b/sysdeps/aarch64/fpu/fedisblxcpt.c
index bf9592847f..125751f8b4 100644
--- a/sysdeps/aarch64/fpu/fedisblxcpt.c
+++ b/sysdeps/aarch64/fpu/fedisblxcpt.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2001-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2001-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
diff --git a/sysdeps/aarch64/fpu/feenablxcpt.c b/sysdeps/aarch64/fpu/feenablxcpt.c
index dff1050e66..6c72b42fc9 100644
--- a/sysdeps/aarch64/fpu/feenablxcpt.c
+++ b/sysdeps/aarch64/fpu/feenablxcpt.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2001-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2001-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
diff --git a/sysdeps/aarch64/fpu/fegetenv.c b/sysdeps/aarch64/fpu/fegetenv.c
index c10975f6a2..1559e90977 100644
--- a/sysdeps/aarch64/fpu/fegetenv.c
+++ b/sysdeps/aarch64/fpu/fegetenv.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 1997-2016 Free Software Foundation, Inc.
+/* Copyright (C) 1997-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
diff --git a/sysdeps/aarch64/fpu/fegetexcept.c b/sysdeps/aarch64/fpu/fegetexcept.c
index 4a4cccd5ff..1f994ead0d 100644
--- a/sysdeps/aarch64/fpu/fegetexcept.c
+++ b/sysdeps/aarch64/fpu/fegetexcept.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2001-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2001-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
diff --git a/sysdeps/aarch64/fpu/s_frintf.c b/sysdeps/aarch64/fpu/fegetmode.c
index a284eafa4c..65dcb22d89 100644
--- a/sysdeps/aarch64/fpu/s_frintf.c
+++ b/sysdeps/aarch64/fpu/fegetmode.c
@@ -1,11 +1,11 @@
-/* Copyright (C) 2011-2016 Free Software Foundation, Inc.
-
+/* Store current floating-point control modes. AArch64 version.
+ Copyright (C) 2016-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public License as
- published by the Free Software Foundation; either version 2.1 of the
- License, or (at your option) any later version.
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -16,9 +16,12 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#ifndef FUNC
-# error FUNC not defined
-#endif
-#define TYPE float
-#define REGS "s"
-#include <s_frint.c>
+#include <fenv.h>
+#include <fpu_control.h>
+
+int
+fegetmode (femode_t *modep)
+{
+ _FPU_GETCW (*modep);
+ return 0;
+}
diff --git a/sysdeps/aarch64/fpu/fegetround.c b/sysdeps/aarch64/fpu/fegetround.c
index 94ad920543..10082eed90 100644
--- a/sysdeps/aarch64/fpu/fegetround.c
+++ b/sysdeps/aarch64/fpu/fegetround.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 1997-2016 Free Software Foundation, Inc.
+/* Copyright (C) 1997-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
diff --git a/sysdeps/aarch64/fpu/feholdexcpt.c b/sysdeps/aarch64/fpu/feholdexcpt.c
index b9cf0ebd6e..4f7a58e379 100644
--- a/sysdeps/aarch64/fpu/feholdexcpt.c
+++ b/sysdeps/aarch64/fpu/feholdexcpt.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 1997-2016 Free Software Foundation, Inc.
+/* Copyright (C) 1997-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
diff --git a/sysdeps/aarch64/fpu/fesetenv.c b/sysdeps/aarch64/fpu/fesetenv.c
index ca87bb60d6..2a63cf0c96 100644
--- a/sysdeps/aarch64/fpu/fesetenv.c
+++ b/sysdeps/aarch64/fpu/fesetenv.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 1997-2016 Free Software Foundation, Inc.
+/* Copyright (C) 1997-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
diff --git a/sysdeps/aarch64/fpu/fesetexcept.c b/sysdeps/aarch64/fpu/fesetexcept.c
new file mode 100644
index 0000000000..c97a38b483
--- /dev/null
+++ b/sysdeps/aarch64/fpu/fesetexcept.c
@@ -0,0 +1,34 @@
+/* Set given exception flags. AArch64 version.
+ Copyright (C) 2016-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv.h>
+#include <fpu_control.h>
+
+int
+fesetexcept (int excepts)
+{
+ fpu_fpsr_t fpsr;
+ fpu_fpsr_t fpsr_new;
+
+ _FPU_GETFPSR (fpsr);
+ fpsr_new = fpsr | (excepts & FE_ALL_EXCEPT);
+ if (fpsr != fpsr_new)
+ _FPU_SETFPSR (fpsr_new);
+
+ return 0;
+}
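
Unlike feraiseexcept, the new fesetexcept only sets the FPSR flag bits, so no trap fires as a side effect. A minimal caller sketch (hypothetical function; assumes a glibc where the TS 18661-1 declaration in <fenv.h> is visible):

    #include <fenv.h>

    static int
    mark_overflow (void)
    {
      if (fesetexcept (FE_OVERFLOW) != 0)
        return -1;
      /* The flag is now observable without a trap having fired.  */
      return fetestexcept (FE_OVERFLOW) ? 0 : -1;
    }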
diff --git a/sysdeps/aarch64/fpu/fesetmode.c b/sysdeps/aarch64/fpu/fesetmode.c
new file mode 100644
index 0000000000..6f6b8c197d
--- /dev/null
+++ b/sysdeps/aarch64/fpu/fesetmode.c
@@ -0,0 +1,34 @@
+/* Install given floating-point control modes. AArch64 version.
+ Copyright (C) 2016-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv.h>
+#include <fpu_control.h>
+
+int
+fesetmode (const femode_t *modep)
+{
+ fpu_control_t fpcr, fpcr_new;
+ _FPU_GETCW (fpcr);
+ if (modep == FE_DFL_MODE)
+ fpcr_new = (fpcr & _FPU_RESERVED) | _FPU_DEFAULT;
+ else
+ fpcr_new = *modep;
+ if (fpcr != fpcr_new)
+ _FPU_SETCW (fpcr_new);
+ return 0;
+}
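
fesetmode restores the control modes (rounding mode, trap enables) without touching the accumulated exception flags, which is the piece fesetenv cannot do on its own. A usage sketch (hypothetical function, using the femode_t type from the <bits/fenv.h> hunk earlier in this patch):

    #include <fenv.h>

    static int
    with_upward_rounding (void)
    {
      femode_t saved;
      if (fegetmode (&saved) != 0)
        return -1;
      fesetround (FE_UPWARD);
      /* ... rounding-sensitive computation ... */
      return fesetmode (&saved);   /* flags raised above survive */
    }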
diff --git a/sysdeps/aarch64/fpu/fesetround.c b/sysdeps/aarch64/fpu/fesetround.c
index 645efe7804..4c2ed3d7ac 100644
--- a/sysdeps/aarch64/fpu/fesetround.c
+++ b/sysdeps/aarch64/fpu/fesetround.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 1997-2016 Free Software Foundation, Inc.
+/* Copyright (C) 1997-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
diff --git a/sysdeps/aarch64/fpu/feupdateenv.c b/sysdeps/aarch64/fpu/feupdateenv.c
index 53d2b36cfa..2aaf5d13cd 100644
--- a/sysdeps/aarch64/fpu/feupdateenv.c
+++ b/sysdeps/aarch64/fpu/feupdateenv.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2009-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2009-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
diff --git a/sysdeps/aarch64/fpu/fgetexcptflg.c b/sysdeps/aarch64/fpu/fgetexcptflg.c
index 8089a2c9fb..cfc7113e04 100644
--- a/sysdeps/aarch64/fpu/fgetexcptflg.c
+++ b/sysdeps/aarch64/fpu/fgetexcptflg.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2001-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2001-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
diff --git a/sysdeps/aarch64/fpu/fpu_control.h b/sysdeps/aarch64/fpu/fpu_control.h
index 7e74445872..2d36170c05 100644
--- a/sysdeps/aarch64/fpu/fpu_control.h
+++ b/sysdeps/aarch64/fpu/fpu_control.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 1996-2016 Free Software Foundation, Inc.
+/* Copyright (C) 1996-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -19,19 +19,28 @@
#ifndef _AARCH64_FPU_CONTROL_H
#define _AARCH64_FPU_CONTROL_H
+#include <features.h>
+
/* Macros for accessing the FPCR and FPSR. */
-#define _FPU_GETCW(fpcr) \
+#if __GNUC_PREREQ (6,0)
+# define _FPU_GETCW(fpcr) (fpcr = __builtin_aarch64_get_fpcr ())
+# define _FPU_SETCW(fpcr) __builtin_aarch64_set_fpcr (fpcr)
+# define _FPU_GETFPSR(fpsr) (fpsr = __builtin_aarch64_get_fpsr ())
+# define _FPU_SETFPSR(fpsr) __builtin_aarch64_set_fpsr (fpsr)
+#else
+# define _FPU_GETCW(fpcr) \
__asm__ __volatile__ ("mrs %0, fpcr" : "=r" (fpcr))
-#define _FPU_SETCW(fpcr) \
+# define _FPU_SETCW(fpcr) \
__asm__ __volatile__ ("msr fpcr, %0" : : "r" (fpcr))
-#define _FPU_GETFPSR(fpsr) \
+# define _FPU_GETFPSR(fpsr) \
__asm__ __volatile__ ("mrs %0, fpsr" : "=r" (fpsr))
-#define _FPU_SETFPSR(fpsr) \
+# define _FPU_SETFPSR(fpsr) \
__asm__ __volatile__ ("msr fpsr, %0" : : "r" (fpsr))
+#endif
/* Reserved bits should be preserved when modifying register
contents. These two masks indicate which bits in each of FPCR and
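
On GCC >= 6 the accessors use the dedicated FPCR/FPSR builtins; older compilers keep the mrs/msr asm. Either way the intended usage is a read-modify-write that preserves reserved bits, as in this sketch (hypothetical helper, reusing the _FPU_RESERVED/_FPU_DEFAULT masks this header defines):

    #include <fpu_control.h>

    /* Reset FPCR to its default control modes, preserving reserved bits.  */
    static void
    reset_fpcr (void)
    {
      fpu_control_t fpcr;
      _FPU_GETCW (fpcr);
      _FPU_SETCW ((fpcr & _FPU_RESERVED) | _FPU_DEFAULT);
    }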
diff --git a/sysdeps/aarch64/fpu/fraiseexcpt.c b/sysdeps/aarch64/fpu/fraiseexcpt.c
index 4e00fbba42..bc26156323 100644
--- a/sysdeps/aarch64/fpu/fraiseexcpt.c
+++ b/sysdeps/aarch64/fpu/fraiseexcpt.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 1997-2016 Free Software Foundation, Inc.
+/* Copyright (C) 1997-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
diff --git a/sysdeps/aarch64/fpu/fsetexcptflg.c b/sysdeps/aarch64/fpu/fsetexcptflg.c
index 890f1d655c..45d96af7e9 100644
--- a/sysdeps/aarch64/fpu/fsetexcptflg.c
+++ b/sysdeps/aarch64/fpu/fsetexcptflg.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 1997-2016 Free Software Foundation, Inc.
+/* Copyright (C) 1997-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
diff --git a/sysdeps/aarch64/fpu/ftestexcept.c b/sysdeps/aarch64/fpu/ftestexcept.c
index cedbce456f..84223259c8 100644
--- a/sysdeps/aarch64/fpu/ftestexcept.c
+++ b/sysdeps/aarch64/fpu/ftestexcept.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 1997-2016 Free Software Foundation, Inc.
+/* Copyright (C) 1997-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
diff --git a/sysdeps/aarch64/fpu/get-rounding-mode.h b/sysdeps/aarch64/fpu/get-rounding-mode.h
index 0b23363025..ab29c5b908 100644
--- a/sysdeps/aarch64/fpu/get-rounding-mode.h
+++ b/sysdeps/aarch64/fpu/get-rounding-mode.h
@@ -1,6 +1,6 @@
/* Determine floating-point rounding mode within libc. AArch64 version.
- Copyright (C) 2012-2016 Free Software Foundation, Inc.
+ Copyright (C) 2012-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
diff --git a/sysdeps/aarch64/bits/mathdef.h b/sysdeps/aarch64/fpu/math-barriers.h
index db992fccc1..7db937bbf4 100644
--- a/sysdeps/aarch64/bits/mathdef.h
+++ b/sysdeps/aarch64/fpu/math-barriers.h
@@ -1,5 +1,5 @@
-/* Copyright (C) 1999-2016 Free Software Foundation, Inc.
-
+/* Control when floating-point expressions are evaluated. AArch64 version.
+ Copyright (C) 2014-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -16,24 +16,12 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#if !defined _MATH_H && !defined _COMPLEX_H
-# error "Never use <bits/mathdef.h> directly; include <math.h> instead"
-#endif
-
-#if defined __USE_ISOC99 && defined _MATH_H && !defined _MATH_H_MATHDEF
-# define _MATH_H_MATHDEF 1
+#ifndef AARCH64_MATH_BARRIERS_H
+#define AARCH64_MATH_BARRIERS_H 1
-/* GCC does not promote `float' values to `double'. */
-typedef float float_t; /* `float' expressions are evaluated as
- `float'. */
-typedef double double_t; /* `double' expressions are evaluated as
- `double'. */
+#define math_opt_barrier(x) \
+ ({ __typeof (x) __x = (x); __asm ("" : "+w" (__x)); __x; })
+#define math_force_eval(x) \
+ ({ __typeof (x) __x = (x); __asm __volatile__ ("" : : "w" (__x)); })
-/* The values returned by `ilogb' for 0 and NaN respectively. */
-# define FP_ILOGB0 (-2147483647)
-# define FP_ILOGBNAN (2147483647)
-
-# define FP_FAST_FMA 1
-# define FP_FAST_FMAF 1
-
-#endif /* ISO C99 */
+#endif
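The two barriers moved here from math_private.h keep a value live in an FP/SIMD register (the "w" constraint) while hiding it from the optimizer. A sketch of why that matters (illustration only, assuming typical GCC pattern-matching): without the barrier, rint followed by an integer conversion can be folded back into a single llrint call, which would recurse once glibc itself implements llrint this way.

    /* Illustrative sketch, not from the patch.  */
    long long
    llrint_sketch (double x)
    {
      double r = __builtin_rint (x);
      __asm ("" : "+w" (r));        /* math_opt_barrier (r) */
      return (long long) r;         /* no longer foldable into llrint */
    }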
diff --git a/sysdeps/aarch64/fpu/math_private.h b/sysdeps/aarch64/fpu/math_private.h
index fd8eb1fb56..fcd02c0654 100644
--- a/sysdeps/aarch64/fpu/math_private.h
+++ b/sysdeps/aarch64/fpu/math_private.h
@@ -1,5 +1,5 @@
/* Private floating point rounding and exceptions handling. AArch64 version.
- Copyright (C) 2014-2016 Free Software Foundation, Inc.
+ Copyright (C) 2014-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -22,27 +22,6 @@
#include <fenv.h>
#include <fpu_control.h>
-#define math_opt_barrier(x) \
-({ __typeof (x) __x = (x); __asm ("" : "+w" (__x)); __x; })
-#define math_force_eval(x) \
-({ __typeof (x) __x = (x); __asm __volatile__ ("" : : "w" (__x)); })
-
-extern __always_inline double
-__ieee754_sqrt (double d)
-{
- double res;
- asm __volatile__ ("fsqrt %d0, %d1" : "=w" (res) : "w" (d));
- return res;
-}
-
-extern __always_inline float
-__ieee754_sqrtf (float s)
-{
- float res;
- asm __volatile__ ("fsqrt %s0, %s1" : "=w" (res) : "w" (s));
- return res;
-}
-
static __always_inline void
libc_feholdexcept_aarch64 (fenv_t *envp)
{
@@ -319,6 +298,26 @@ libc_feresetround_noex_aarch64_ctx (struct rm_ctx *ctx)
#define libc_feresetround_noexf_ctx libc_feresetround_noex_aarch64_ctx
#define libc_feresetround_noexl_ctx libc_feresetround_noex_aarch64_ctx
+/* Hack: only include the large arm_neon.h when needed. */
+#ifdef _MATH_CONFIG_H
+# include <arm_neon.h>
+
+/* ACLE intrinsics for frintn and fcvtns instructions. */
+# define TOINT_INTRINSICS 1
+
+static inline double_t
+roundtoint (double_t x)
+{
+ return vget_lane_f64 (vrndn_f64 (vld1_f64 (&x)), 0);
+}
+
+static inline uint64_t
+converttoint (double_t x)
+{
+ return vcvtnd_s64_f64 (x);
+}
+#endif
+
#include_next <math_private.h>
#endif
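The TOINT_INTRINSICS pair gives the generic math code a rounding and a conversion that agree without a trip through the current rounding mode. A standalone sketch exercising the same ACLE intrinsics (hypothetical test program, AArch64 only):

    #include <arm_neon.h>
    #include <stdio.h>

    int
    main (void)
    {
      double x = 2.5;
      /* frintn: round to nearest, ties to even.  */
      double r = vget_lane_f64 (vrndn_f64 (vld1_f64 (&x)), 0);
      /* fcvtns: convert with the same rounding.  */
      long long n = vcvtnd_s64_f64 (x);
      printf ("%g -> %g, %lld\n", x, r, n);   /* 2.5 -> 2, 2 */
      return 0;
    }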
diff --git a/sysdeps/aarch64/fpu/s_ceil.c b/sysdeps/aarch64/fpu/s_ceil.c
index 60c444d70a..4e3b5fdc4d 100644
--- a/sysdeps/aarch64/fpu/s_ceil.c
+++ b/sysdeps/aarch64/fpu/s_ceil.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2011-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -16,6 +16,13 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#define FUNC ceil
-#define INSN "frintp"
-#include <s_frint.c>
+#include <math.h>
+#include <libm-alias-double.h>
+
+double
+__ceil (double x)
+{
+ return __builtin_ceil (x);
+}
+
+libm_alias_double (__ceil, ceil)
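With the ARMv8 rounding instructions always available, GCC expands __builtin_ceil to the same frintp the deleted s_frint.c template emitted, so the inline asm is no longer needed. A sketch of the equivalence (illustration only):

    /* __builtin_ceil (x) compiles to a single frintp on AArch64.  */
    double
    ceil_via_frintp (double x)
    {
      double r;
      __asm ("frintp %d0, %d1" : "=w" (r) : "w" (x));
      return r;
    }

The same reasoning applies to the floor, round, trunc, rint and nearbyint rewrites below (frintm, frinta, frintz, frintx and frinti respectively).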
diff --git a/sysdeps/aarch64/fpu/s_ceilf.c b/sysdeps/aarch64/fpu/s_ceilf.c
index 1f1fb0677a..d01dadc246 100644
--- a/sysdeps/aarch64/fpu/s_ceilf.c
+++ b/sysdeps/aarch64/fpu/s_ceilf.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2011-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -16,6 +16,13 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#define FUNC ceilf
-#define INSN "frintp"
-#include <s_frintf.c>
+#include <math.h>
+#include <libm-alias-float.h>
+
+float
+__ceilf (float x)
+{
+ return __builtin_ceilf (x);
+}
+
+libm_alias_float (__ceil, ceil)
diff --git a/sysdeps/aarch64/fpu/s_floor.c b/sysdeps/aarch64/fpu/s_floor.c
index c490a27b66..7258246896 100644
--- a/sysdeps/aarch64/fpu/s_floor.c
+++ b/sysdeps/aarch64/fpu/s_floor.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2011-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -16,6 +16,13 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#define FUNC floor
-#define INSN "frintm"
-#include <s_frint.c>
+#include <math.h>
+#include <libm-alias-double.h>
+
+double
+__floor (double x)
+{
+ return __builtin_floor (x);
+}
+
+libm_alias_double (__floor, floor)
diff --git a/sysdeps/aarch64/fpu/s_floorf.c b/sysdeps/aarch64/fpu/s_floorf.c
index ec81683058..bd197013ad 100644
--- a/sysdeps/aarch64/fpu/s_floorf.c
+++ b/sysdeps/aarch64/fpu/s_floorf.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2011-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -16,6 +16,13 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#define FUNC floorf
-#define INSN "frintm"
-#include <s_frintf.c>
+#include <math.h>
+#include <libm-alias-float.h>
+
+float
+__floorf (float x)
+{
+ return __builtin_floorf (x);
+}
+
+libm_alias_float (__floor, floor)
diff --git a/sysdeps/aarch64/fpu/s_fma.c b/sysdeps/aarch64/fpu/s_fma.c
index 3320ba6989..4f3d6c4a51 100644
--- a/sysdeps/aarch64/fpu/s_fma.c
+++ b/sysdeps/aarch64/fpu/s_fma.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 1996-2016 Free Software Foundation, Inc.
+/* Copyright (C) 1996-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -17,29 +17,12 @@
<http://www.gnu.org/licenses/>. */
#include <math.h>
+#include <libm-alias-double.h>
-#ifndef FUNC
-# define FUNC fma
-#endif
-
-#ifndef TYPE
-# define TYPE double
-# define REGS "d"
-#else
-# ifndef REGS
-# error REGS not defined
-# endif
-#endif
-
-#define __CONCATX(a,b) __CONCAT(a,b)
-
-TYPE
-__CONCATX(__,FUNC) (TYPE x, TYPE y, TYPE z)
+double
+__fma (double x, double y, double z)
{
- TYPE result;
- asm ( "fmadd" "\t%" REGS "0, %" REGS "1, %" REGS "2, %" REGS "3"
- : "=w" (result) : "w" (x), "w" (y), "w" (z) );
- return result;
+ return __builtin_fma (x, y, z);
}
-weak_alias (__CONCATX(__,FUNC), FUNC)
+libm_alias_double (__fma, fma)
diff --git a/sysdeps/aarch64/fpu/s_fmaf.c b/sysdeps/aarch64/fpu/s_fmaf.c
index e35512c31b..197cf7bd06 100644
--- a/sysdeps/aarch64/fpu/s_fmaf.c
+++ b/sysdeps/aarch64/fpu/s_fmaf.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2011-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -16,7 +16,13 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#define FUNC fmaf
-#define TYPE float
-#define REGS "s"
-#include <s_fma.c>
+#include <math.h>
+#include <libm-alias-float.h>
+
+float
+__fmaf (float x, float y, float z)
+{
+ return __builtin_fmaf (x, y, z);
+}
+
+libm_alias_float (__fma, fma)
diff --git a/sysdeps/aarch64/fpu/s_fmax.c b/sysdeps/aarch64/fpu/s_fmax.c
index bc41ec44f6..65de8ceab3 100644
--- a/sysdeps/aarch64/fpu/s_fmax.c
+++ b/sysdeps/aarch64/fpu/s_fmax.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2011-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -16,6 +16,13 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#define FUNC fmax
-#define INSN "fmaxnm"
-#include <s_fmin.c>
+#include <math.h>
+#include <libm-alias-double.h>
+
+double
+__fmax (double x, double y)
+{
+ return __builtin_fmax (x, y);
+}
+
+libm_alias_double (__fmax, fmax)
diff --git a/sysdeps/aarch64/fpu/s_fmaxf.c b/sysdeps/aarch64/fpu/s_fmaxf.c
index 6a234bbeaf..2056700426 100644
--- a/sysdeps/aarch64/fpu/s_fmaxf.c
+++ b/sysdeps/aarch64/fpu/s_fmaxf.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2011-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -16,8 +16,13 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#define FUNC fmaxf
-#define INSN "fmaxnm"
-#define TYPE float
-#define REGS "s"
-#include <s_fmin.c>
+#include <math.h>
+#include <libm-alias-float.h>
+
+float
+__fmaxf (float x, float y)
+{
+ return __builtin_fmaxf (x, y);
+}
+
+libm_alias_float (__fmax, fmax)
diff --git a/sysdeps/aarch64/fpu/s_fmin.c b/sysdeps/aarch64/fpu/s_fmin.c
index 695abafb9b..a42b565498 100644
--- a/sysdeps/aarch64/fpu/s_fmin.c
+++ b/sysdeps/aarch64/fpu/s_fmin.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 1996-2016 Free Software Foundation, Inc.
+/* Copyright (C) 1996-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -17,33 +17,12 @@
<http://www.gnu.org/licenses/>. */
#include <math.h>
+#include <libm-alias-double.h>
-#ifndef FUNC
-# define FUNC fmin
-#endif
-
-#ifndef INSN
-# define INSN "fminnm"
-#endif
-
-#ifndef TYPE
-# define TYPE double
-# define REGS "d"
-#else
-# ifndef REGS
-# error REGS not defined
-# endif
-#endif
-
-#define __CONCATX(a,b) __CONCAT(a,b)
-
-TYPE
-__CONCATX(__,FUNC) (TYPE x, TYPE y)
+double
+__fmin (double x, double y)
{
- TYPE result;
- asm ( INSN "\t%" REGS "0, %" REGS "1, %" REGS "2"
- : "=w" (result) : "w" (x), "w" (y) );
- return result;
+ return __builtin_fmin (x, y);
}
-weak_alias (__CONCATX(__,FUNC), FUNC)
+libm_alias_double (__fmin, fmin)
diff --git a/sysdeps/aarch64/fpu/s_fminf.c b/sysdeps/aarch64/fpu/s_fminf.c
index 78609575bb..028f88543f 100644
--- a/sysdeps/aarch64/fpu/s_fminf.c
+++ b/sysdeps/aarch64/fpu/s_fminf.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2011-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -16,7 +16,13 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#define FUNC fminf
-#define TYPE float
-#define REGS "s"
-#include <s_fmin.c>
+#include <math.h>
+#include <libm-alias-float.h>
+
+float
+__fminf (float x, float y)
+{
+ return __builtin_fminf (x, y);
+}
+
+libm_alias_float (__fmin, fmin)
diff --git a/sysdeps/aarch64/fpu/s_llrint.c b/sysdeps/aarch64/fpu/s_llrint.c
index 9769311b1b..287dc8974c 100644
--- a/sysdeps/aarch64/fpu/s_llrint.c
+++ b/sysdeps/aarch64/fpu/s_llrint.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2011-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -16,6 +16,21 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#define FUNC llrint
-#define OTYPE long long int
-#include <s_lrint.c>
+#include <math.h>
+#include <math-barriers.h>
+#include <math_private.h>
+#include <libm-alias-double.h>
+
+long long int
+__llrint (double x)
+{
+ double r = __builtin_rint (x);
+
+ /* Prevent gcc from calling llrint directly when compiled with
+ -fno-math-errno by inserting a barrier. */
+
+ math_opt_barrier (r);
+ return r;
+}
+
+libm_alias_double (__llrint, llrint)
diff --git a/sysdeps/aarch64/fpu/s_llrintf.c b/sysdeps/aarch64/fpu/s_llrintf.c
index 51024a986c..70d6c0ca84 100644
--- a/sysdeps/aarch64/fpu/s_llrintf.c
+++ b/sysdeps/aarch64/fpu/s_llrintf.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2011-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -16,8 +16,22 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#define FUNC llrintf
-#define ITYPE float
-#define IREGS "s"
-#define OTYPE long long int
-#include <s_lrint.c>
+#include <math.h>
+#include <math-barriers.h>
+#include <math_private.h>
+#include <libm-alias-float.h>
+
+long long int
+__llrintf (float x)
+{
+ float r = __builtin_rintf (x);
+
+ /* Prevent gcc from calling llrintf directly when compiled with
+ -fno-math-errno by inserting a barrier. */
+
+
+ math_opt_barrier (r);
+ return r;
+}
+
+libm_alias_float (__llrint, llrint)
diff --git a/sysdeps/aarch64/fpu/s_llround.c b/sysdeps/aarch64/fpu/s_llround.c
index dd2b3a2e16..d6ff708f41 100644
--- a/sysdeps/aarch64/fpu/s_llround.c
+++ b/sysdeps/aarch64/fpu/s_llround.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2011-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -16,6 +16,13 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#define FUNC llround
-#define OTYPE long long int
-#include <s_lround.c>
+#include <math.h>
+#include <libm-alias-double.h>
+
+long long int
+__llround (double x)
+{
+ return __builtin_llround (x);
+}
+
+libm_alias_double (__llround, llround)
diff --git a/sysdeps/aarch64/fpu/s_llroundf.c b/sysdeps/aarch64/fpu/s_llroundf.c
index 8b5d263d91..4cb2caf8b5 100644
--- a/sysdeps/aarch64/fpu/s_llroundf.c
+++ b/sysdeps/aarch64/fpu/s_llroundf.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2011-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -16,8 +16,13 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#define FUNC llroundf
-#define ITYPE float
-#define IREGS "s"
-#define OTYPE long long int
-#include <s_lround.c>
+#include <math.h>
+#include <libm-alias-float.h>
+
+long long int
+__llroundf (float x)
+{
+ return __builtin_llroundf (x);
+}
+
+libm_alias_float (__llround, llround)
diff --git a/sysdeps/aarch64/fpu/s_lrint.c b/sysdeps/aarch64/fpu/s_lrint.c
index e51de0fc2d..d6ef066e45 100644
--- a/sysdeps/aarch64/fpu/s_lrint.c
+++ b/sysdeps/aarch64/fpu/s_lrint.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 1996-2016 Free Software Foundation, Inc.
+/* Copyright (C) 1996-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -17,37 +17,73 @@
<http://www.gnu.org/licenses/>. */
#include <math.h>
+#include <get-rounding-mode.h>
+#include <stdint.h>
+#include <math-barriers.h>
+#include <math_private.h>
+#include <libm-alias-double.h>
-#ifndef FUNC
-# define FUNC lrint
-#endif
+# define IREG_SIZE 64
+
+# ifdef __ILP32__
+# define OREG_SIZE 32
+# else
+# define OREG_SIZE 64
+# endif
-#ifndef ITYPE
-# define ITYPE double
# define IREGS "d"
+
+#if OREG_SIZE == 32
+# define OREGS "w"
#else
-# ifndef IREGS
-# error IREGS not defined
-# endif
+# define OREGS "x"
#endif
-#ifndef OTYPE
-# define OTYPE long int
+
+long int
+__lrint (double x)
+{
+
+#if IREG_SIZE == 64 && OREG_SIZE == 32
+ long int result;
+
+ if (__builtin_fabs (x) > INT32_MAX)
+ {
+ /* Converting large values to a 32 bit int may cause the frintx/fcvtza
+ sequence to set both FE_INVALID and FE_INEXACT. To avoid this
+ check the rounding mode and do a single instruction with the
+ appropriate rounding mode. */
+
+ switch (get_rounding_mode ())
+ {
+ case FE_TONEAREST:
+ asm volatile ("fcvtns" "\t%" OREGS "0, %" IREGS "1"
+ : "=r" (result) : "w" (x));
+ break;
+ case FE_UPWARD:
+ asm volatile ("fcvtps" "\t%" OREGS "0, %" IREGS "1"
+ : "=r" (result) : "w" (x));
+ break;
+ case FE_DOWNWARD:
+ asm volatile ("fcvtms" "\t%" OREGS "0, %" IREGS "1"
+ : "=r" (result) : "w" (x));
+ break;
+ case FE_TOWARDZERO:
+ default:
+ asm volatile ("fcvtzs" "\t%" OREGS "0, %" IREGS "1"
+ : "=r" (result) : "w" (x));
+ }
+ return result;
+ }
#endif
-#define OREGS "x"
+ double r = __builtin_rint (x);
-#define __CONCATX(a,b) __CONCAT(a,b)
+ /* Prevent gcc from calling lrint directly when compiled with
+ -fno-math-errno by inserting a barrier. */
-OTYPE
-__CONCATX(__,FUNC) (ITYPE x)
-{
- OTYPE result;
- ITYPE temp;
- asm ( "frintx" "\t%" IREGS "1, %" IREGS "2\n\t"
- "fcvtzs" "\t%" OREGS "0, %" IREGS "1"
- : "=r" (result), "=w" (temp) : "w" (x) );
- return result;
+ math_opt_barrier (r);
+ return r;
}
-weak_alias (__CONCATX(__,FUNC), FUNC)
+libm_alias_double (__lrint, lrint)
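A hypothetical demonstration of the exception behavior the special case above preserves, assuming an ILP32 configuration where long int is 32-bit: an out-of-range lrint must raise FE_INVALID but not FE_INEXACT, which only the single fcvt* with static rounding guarantees.

    #include <fenv.h>
    #include <math.h>
    #include <stdio.h>

    int
    main (void)
    {
      feclearexcept (FE_ALL_EXCEPT);
      long int r = lrint (0x1p60);   /* far outside a 32-bit long */
      printf ("r=%ld invalid=%d inexact=%d\n", r,
              fetestexcept (FE_INVALID) != 0,
              fetestexcept (FE_INEXACT) != 0);
      return 0;
    }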
diff --git a/sysdeps/aarch64/fpu/s_lrintf.c b/sysdeps/aarch64/fpu/s_lrintf.c
index b603510375..fa42d68dd1 100644
--- a/sysdeps/aarch64/fpu/s_lrintf.c
+++ b/sysdeps/aarch64/fpu/s_lrintf.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2011-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -16,7 +16,21 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#define FUNC lrintf
-#define ITYPE float
-#define IREGS "s"
-#include <s_lrint.c>
+#include <math.h>
+#include <math-barriers.h>
+#include <math_private.h>
+#include <libm-alias-float.h>
+
+long int
+__lrintf (float x)
+{
+ float r = __builtin_rintf (x);
+
+ /* Prevent gcc from calling lrintf directly when compiled with
+ -fno-math-errno by inserting a barrier. */
+
+ math_opt_barrier (r);
+ return r;
+}
+
+libm_alias_float (__lrint, lrint)
diff --git a/sysdeps/aarch64/fpu/s_lround.c b/sysdeps/aarch64/fpu/s_lround.c
index a14d347142..d8e9fc4664 100644
--- a/sysdeps/aarch64/fpu/s_lround.c
+++ b/sysdeps/aarch64/fpu/s_lround.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 1996-2016 Free Software Foundation, Inc.
+/* Copyright (C) 1996-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -17,35 +17,12 @@
<http://www.gnu.org/licenses/>. */
#include <math.h>
+#include <libm-alias-double.h>
-#ifndef FUNC
-# define FUNC lround
-#endif
+long int
+__lround (double x)
+{
+ return __builtin_lround (x);
+}
-#ifndef ITYPE
-# define ITYPE double
-# define IREGS "d"
-#else
-# ifndef IREGS
-# error IREGS not defined
-# endif
-#endif
-
-#ifndef OTYPE
-# define OTYPE long int
-#endif
-
-#define OREGS "x"
-
-#define __CONCATX(a,b) __CONCAT(a,b)
-
-OTYPE
-__CONCATX(__,FUNC) (ITYPE x)
-{
- OTYPE result;
- asm ( "fcvtas" "\t%" OREGS "0, %" IREGS "1"
- : "=r" (result) : "w" (x) );
- return result;
-}
-
-weak_alias (__CONCATX(__,FUNC), FUNC)
+libm_alias_double (__lround, lround)
diff --git a/sysdeps/aarch64/fpu/s_lroundf.c b/sysdeps/aarch64/fpu/s_lroundf.c
index e3a3fba937..9a76dbf24a 100644
--- a/sysdeps/aarch64/fpu/s_lroundf.c
+++ b/sysdeps/aarch64/fpu/s_lroundf.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2011-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -16,7 +16,13 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#define FUNC lroundf
-#define ITYPE float
-#define IREGS "s"
-#include <s_lround.c>
+#include <math.h>
+#include <libm-alias-float.h>
+
+long int
+__lroundf (float x)
+{
+ return __builtin_lroundf (x);
+}
+
+libm_alias_float (__lround, lround)
diff --git a/sysdeps/aarch64/fpu/s_nearbyint.c b/sysdeps/aarch64/fpu/s_nearbyint.c
index 3be8059c1b..477e1a8cfe 100644
--- a/sysdeps/aarch64/fpu/s_nearbyint.c
+++ b/sysdeps/aarch64/fpu/s_nearbyint.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2011-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -16,6 +16,13 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#define FUNC nearbyint
-#define INSN "frinti"
-#include <s_frint.c>
+#include <math.h>
+#include <libm-alias-double.h>
+
+double
+__nearbyint (double x)
+{
+ return __builtin_nearbyint (x);
+}
+
+libm_alias_double (__nearbyint, nearbyint)
diff --git a/sysdeps/aarch64/fpu/s_nearbyintf.c b/sysdeps/aarch64/fpu/s_nearbyintf.c
index e544e5597e..4edab204a9 100644
--- a/sysdeps/aarch64/fpu/s_nearbyintf.c
+++ b/sysdeps/aarch64/fpu/s_nearbyintf.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2011-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -16,6 +16,13 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#define FUNC nearbyintf
-#define INSN "frinti"
-#include <s_frintf.c>
+#include <math.h>
+#include <libm-alias-float.h>
+
+float
+__nearbyintf (float x)
+{
+ return __builtin_nearbyintf (x);
+}
+
+libm_alias_float (__nearbyint, nearbyint)
diff --git a/sysdeps/aarch64/fpu/s_rint.c b/sysdeps/aarch64/fpu/s_rint.c
index d12c1a7c46..eb4232af00 100644
--- a/sysdeps/aarch64/fpu/s_rint.c
+++ b/sysdeps/aarch64/fpu/s_rint.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2011-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -16,6 +16,13 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#define FUNC rint
-#define INSN "frintx"
-#include <s_frint.c>
+#include <math.h>
+#include <libm-alias-double.h>
+
+double
+__rint (double x)
+{
+ return __builtin_rint (x);
+}
+
+libm_alias_double (__rint, rint)
diff --git a/sysdeps/aarch64/fpu/s_rintf.c b/sysdeps/aarch64/fpu/s_rintf.c
index 375be92734..9ebfcb45b4 100644
--- a/sysdeps/aarch64/fpu/s_rintf.c
+++ b/sysdeps/aarch64/fpu/s_rintf.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2011-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -16,6 +16,13 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#define FUNC rintf
-#define INSN "frintx"
-#include <s_frintf.c>
+#include <math.h>
+#include <libm-alias-float.h>
+
+float
+__rintf (float x)
+{
+ return __builtin_rintf (x);
+}
+
+libm_alias_float (__rint, rint)
diff --git a/sysdeps/aarch64/fpu/s_round.c b/sysdeps/aarch64/fpu/s_round.c
index 239ca1b434..275b00f318 100644
--- a/sysdeps/aarch64/fpu/s_round.c
+++ b/sysdeps/aarch64/fpu/s_round.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2011-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -16,6 +16,13 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#define FUNC round
-#define INSN "frinta"
-#include <s_frint.c>
+#include <math.h>
+#include <libm-alias-double.h>
+
+double
+__round (double x)
+{
+ return __builtin_round (x);
+}
+
+libm_alias_double (__round, round)
diff --git a/sysdeps/aarch64/fpu/s_roundf.c b/sysdeps/aarch64/fpu/s_roundf.c
index eb13586dc5..201ffc54db 100644
--- a/sysdeps/aarch64/fpu/s_roundf.c
+++ b/sysdeps/aarch64/fpu/s_roundf.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2011-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -16,6 +16,13 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#define FUNC roundf
-#define INSN "frinta"
-#include <s_frintf.c>
+#include <math.h>
+#include <libm-alias-float.h>
+
+float
+__roundf (float x)
+{
+ return __builtin_roundf (x);
+}
+
+libm_alias_float (__round, round)
diff --git a/sysdeps/aarch64/fpu/s_trunc.c b/sysdeps/aarch64/fpu/s_trunc.c
index 7107a1b18b..22f44f1e04 100644
--- a/sysdeps/aarch64/fpu/s_trunc.c
+++ b/sysdeps/aarch64/fpu/s_trunc.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2011-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -16,6 +16,13 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#define FUNC trunc
-#define INSN "frintz"
-#include <s_frint.c>
+#include <math.h>
+#include <libm-alias-double.h>
+
+double
+__trunc (double x)
+{
+ return __builtin_trunc (x);
+}
+
+libm_alias_double (__trunc, trunc)
diff --git a/sysdeps/aarch64/fpu/s_truncf.c b/sysdeps/aarch64/fpu/s_truncf.c
index b08c1cb304..3771b6d6bf 100644
--- a/sysdeps/aarch64/fpu/s_truncf.c
+++ b/sysdeps/aarch64/fpu/s_truncf.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2011-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -16,6 +16,13 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#define FUNC truncf
-#define INSN "frintz"
-#include <s_frintf.c>
+#include <math.h>
+#include <libm-alias-float.h>
+
+float
+__truncf (float x)
+{
+ return __builtin_truncf (x);
+}
+
+libm_alias_float (__trunc, trunc)
diff --git a/sysdeps/aarch64/jmpbuf-offsets.h b/sysdeps/aarch64/jmpbuf-offsets.h
index 37f551a56b..a7096f2f69 100644
--- a/sysdeps/aarch64/jmpbuf-offsets.h
+++ b/sysdeps/aarch64/jmpbuf-offsets.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2006-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
diff --git a/sysdeps/aarch64/jmpbuf-unwind.h b/sysdeps/aarch64/jmpbuf-unwind.h
index 3e0a37db4e..9309e7b0ea 100644
--- a/sysdeps/aarch64/jmpbuf-unwind.h
+++ b/sysdeps/aarch64/jmpbuf-unwind.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2005-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2005-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -27,7 +27,8 @@
((void *) (address) < (void *) demangle (jmpbuf[JB_SP]))
#define _JMPBUF_CFA_UNWINDS_ADJ(jmpbuf, context, adj) \
- _JMPBUF_UNWINDS_ADJ (jmpbuf, (void *) _Unwind_GetCFA (context), adj)
+ _JMPBUF_UNWINDS_ADJ (jmpbuf, (void *) (uintptr_t) _Unwind_GetCFA (context), \
+ adj)
#define _JMPBUF_UNWINDS_ADJ(_jmpbuf, _address, _adj) \
((uintptr_t) (_address) - (_adj) < _jmpbuf_sp (_jmpbuf) - (_adj))
diff --git a/sysdeps/aarch64/ldsodefs.h b/sysdeps/aarch64/ldsodefs.h
index 7c02e89371..600b721234 100644
--- a/sysdeps/aarch64/ldsodefs.h
+++ b/sysdeps/aarch64/ldsodefs.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2005-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2005-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -20,6 +20,7 @@
#define _AARCH64_LDSODEFS_H 1
#include <elf.h>
+#include <cpu-features.h>
struct La_aarch64_regs;
struct La_aarch64_retval;
diff --git a/sysdeps/aarch64/libc-tls.c b/sysdeps/aarch64/libc-tls.c
index f43c75f9b5..f2a2e1c0d3 100644
--- a/sysdeps/aarch64/libc-tls.c
+++ b/sysdeps/aarch64/libc-tls.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2005-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2005-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps
index bc5795b361..be06085154 100644
--- a/sysdeps/aarch64/libm-test-ulps
+++ b/sysdeps/aarch64/libm-test-ulps
@@ -40,9 +40,9 @@ ildouble: 2
ldouble: 2
Function: "acosh_downward":
-double: 1
+double: 2
float: 2
-idouble: 1
+idouble: 2
ifloat: 2
ildouble: 3
ldouble: 3
@@ -252,36 +252,36 @@ ildouble: 2
ldouble: 2
Function: Imaginary part of "cacos":
-double: 1
+double: 2
float: 2
-idouble: 1
+idouble: 2
ifloat: 2
ildouble: 2
ldouble: 2
Function: Real part of "cacos_downward":
-double: 2
+double: 3
float: 2
-idouble: 2
+idouble: 3
ifloat: 2
-ildouble: 2
-ldouble: 2
+ildouble: 3
+ldouble: 3
Function: Imaginary part of "cacos_downward":
double: 5
float: 3
idouble: 5
ifloat: 3
-ildouble: 5
-ldouble: 5
+ildouble: 6
+ldouble: 6
Function: Real part of "cacos_towardzero":
-double: 2
+double: 3
float: 2
-idouble: 2
+idouble: 3
ifloat: 2
-ildouble: 2
-ldouble: 2
+ildouble: 3
+ldouble: 3
Function: Imaginary part of "cacos_towardzero":
double: 5
@@ -300,17 +300,17 @@ ildouble: 3
ldouble: 3
Function: Imaginary part of "cacos_upward":
-double: 4
-float: 4
-idouble: 4
-ifloat: 4
-ildouble: 5
-ldouble: 5
+double: 5
+float: 5
+idouble: 5
+ifloat: 5
+ildouble: 7
+ldouble: 7
Function: Real part of "cacosh":
-double: 1
+double: 2
float: 2
-idouble: 1
+idouble: 2
ifloat: 2
ildouble: 2
ldouble: 2
@@ -332,12 +332,12 @@ ildouble: 5
ldouble: 5
Function: Imaginary part of "cacosh_downward":
-double: 2
-float: 2
-idouble: 2
-ifloat: 2
-ildouble: 2
-ldouble: 2
+double: 3
+float: 3
+idouble: 3
+ifloat: 3
+ildouble: 4
+ldouble: 4
Function: Real part of "cacosh_towardzero":
double: 5
@@ -348,28 +348,28 @@ ildouble: 5
ldouble: 5
Function: Imaginary part of "cacosh_towardzero":
-double: 2
+double: 3
float: 2
-idouble: 2
+idouble: 3
ifloat: 2
-ildouble: 2
-ldouble: 2
+ildouble: 3
+ldouble: 3
Function: Real part of "cacosh_upward":
double: 4
float: 4
idouble: 4
ifloat: 4
-ildouble: 5
-ldouble: 5
+ildouble: 6
+ldouble: 6
Function: Imaginary part of "cacosh_upward":
-double: 2
+double: 3
float: 2
-idouble: 2
+idouble: 3
ifloat: 2
-ildouble: 3
-ldouble: 3
+ildouble: 4
+ldouble: 4
Function: "carg":
double: 1
@@ -412,18 +412,18 @@ ildouble: 2
ldouble: 2
Function: Imaginary part of "casin":
-double: 1
+double: 2
float: 2
-idouble: 1
+idouble: 2
ifloat: 2
ildouble: 2
ldouble: 2
Function: Real part of "casin_downward":
double: 3
-float: 1
+float: 2
idouble: 3
-ifloat: 1
+ifloat: 2
ildouble: 3
ldouble: 3
@@ -432,8 +432,8 @@ double: 5
float: 3
idouble: 5
ifloat: 3
-ildouble: 5
-ldouble: 5
+ildouble: 6
+ldouble: 6
Function: Real part of "casin_towardzero":
double: 3
@@ -452,25 +452,25 @@ ildouble: 5
ldouble: 5
Function: Real part of "casin_upward":
-double: 2
-float: 1
-idouble: 2
-ifloat: 1
+double: 3
+float: 2
+idouble: 3
+ifloat: 2
ildouble: 3
ldouble: 3
Function: Imaginary part of "casin_upward":
-double: 4
-float: 4
-idouble: 4
-ifloat: 4
-ildouble: 5
-ldouble: 5
+double: 5
+float: 5
+idouble: 5
+ifloat: 5
+ildouble: 7
+ldouble: 7
Function: Real part of "casinh":
-double: 1
+double: 2
float: 2
-idouble: 1
+idouble: 2
ifloat: 2
ildouble: 2
ldouble: 2
@@ -488,14 +488,14 @@ double: 5
float: 3
idouble: 5
ifloat: 3
-ildouble: 5
-ldouble: 5
+ildouble: 6
+ldouble: 6
Function: Imaginary part of "casinh_downward":
double: 3
-float: 1
+float: 2
idouble: 3
-ifloat: 1
+ifloat: 2
ildouble: 3
ldouble: 3
@@ -516,23 +516,25 @@ ildouble: 3
ldouble: 3
Function: Real part of "casinh_upward":
-double: 4
-float: 4
-idouble: 4
-ifloat: 4
-ildouble: 5
-ldouble: 5
+double: 5
+float: 5
+idouble: 5
+ifloat: 5
+ildouble: 7
+ldouble: 7
Function: Imaginary part of "casinh_upward":
-double: 2
+double: 3
float: 2
-idouble: 2
+idouble: 3
ifloat: 2
ildouble: 3
ldouble: 3
Function: Real part of "catan":
+double: 1
float: 1
+idouble: 1
ifloat: 1
ildouble: 1
ldouble: 1
@@ -547,9 +549,9 @@ ldouble: 1
Function: Real part of "catan_downward":
double: 1
-float: 1
+float: 2
idouble: 1
-ifloat: 1
+ifloat: 2
ildouble: 2
ldouble: 2
@@ -563,25 +565,27 @@ ldouble: 3
Function: Real part of "catan_towardzero":
double: 1
-float: 1
+float: 2
idouble: 1
-ifloat: 1
+ifloat: 2
ildouble: 2
ldouble: 2
Function: Imaginary part of "catan_towardzero":
double: 2
-float: 1
+float: 2
idouble: 2
-ifloat: 1
+ifloat: 2
ildouble: 3
ldouble: 3
Function: Real part of "catan_upward":
+double: 1
float: 1
+idouble: 1
ifloat: 1
-ildouble: 1
-ldouble: 1
+ildouble: 2
+ldouble: 2
Function: Imaginary part of "catan_upward":
double: 3
@@ -600,7 +604,9 @@ ildouble: 1
ldouble: 1
Function: Imaginary part of "catanh":
+double: 1
float: 1
+idouble: 1
ifloat: 1
ildouble: 1
ldouble: 1
@@ -623,9 +629,9 @@ ldouble: 2
Function: Real part of "catanh_towardzero":
double: 2
-float: 1
+float: 2
idouble: 2
-ifloat: 1
+ifloat: 2
ildouble: 3
ldouble: 3
@@ -639,17 +645,19 @@ ldouble: 2
Function: Real part of "catanh_upward":
double: 4
-float: 3
+float: 4
idouble: 4
-ifloat: 3
+ifloat: 4
ildouble: 4
ldouble: 4
Function: Imaginary part of "catanh_upward":
+double: 1
float: 1
+idouble: 1
ifloat: 1
-ildouble: 1
-ldouble: 1
+ildouble: 2
+ldouble: 2
Function: "cbrt":
double: 3
@@ -900,18 +908,18 @@ ildouble: 2
ldouble: 2
Function: Imaginary part of "clog10":
-double: 1
+double: 2
float: 2
-idouble: 1
+idouble: 2
ifloat: 2
ildouble: 2
ldouble: 2
Function: Real part of "clog10_downward":
double: 5
-float: 4
+float: 5
idouble: 5
-ifloat: 4
+ifloat: 5
ildouble: 3
ldouble: 3
@@ -1004,7 +1012,9 @@ ildouble: 2
ldouble: 2
Function: "cos":
+double: 1
float: 1
+idouble: 1
ifloat: 1
ildouble: 1
ldouble: 1
@@ -1323,9 +1333,9 @@ ldouble: 3
Function: Imaginary part of "ctan":
double: 2
-float: 1
+float: 2
idouble: 2
-ifloat: 1
+ifloat: 2
ildouble: 3
ldouble: 3
@@ -1339,9 +1349,9 @@ ldouble: 4
Function: Imaginary part of "ctan_downward":
double: 2
-float: 1
+float: 2
idouble: 2
-ifloat: 1
+ifloat: 2
ildouble: 5
ldouble: 5
@@ -1363,9 +1373,9 @@ ldouble: 5
Function: Real part of "ctan_upward":
double: 2
-float: 3
+float: 4
idouble: 2
-ifloat: 3
+ifloat: 4
ildouble: 5
ldouble: 5
@@ -1395,9 +1405,9 @@ ldouble: 3
Function: Real part of "ctanh_downward":
double: 4
-float: 1
+float: 2
idouble: 4
-ifloat: 1
+ifloat: 2
ildouble: 5
ldouble: 5
@@ -1575,15 +1585,21 @@ ldouble: 2
Function: "exp_downward":
double: 1
+float: 1
idouble: 1
+ifloat: 1
Function: "exp_towardzero":
double: 1
+float: 1
idouble: 1
+ifloat: 1
Function: "exp_upward":
double: 1
+float: 1
idouble: 1
+ifloat: 1
Function: "expm1":
double: 1
@@ -1683,9 +1699,9 @@ ldouble: 2
Function: "j0_downward":
double: 2
-float: 3
+float: 4
idouble: 2
-ifloat: 3
+ifloat: 4
ildouble: 4
ldouble: 4
@@ -1910,55 +1926,27 @@ ildouble: 1
ldouble: 1
Function: "log_towardzero":
-float: 1
-ifloat: 1
+float: 2
+ifloat: 2
ildouble: 2
ldouble: 2
Function: "log_upward":
double: 1
-float: 1
+float: 2
idouble: 1
-ifloat: 1
+ifloat: 2
ildouble: 1
ldouble: 1
Function: "pow":
+double: 1
float: 1
+idouble: 1
ifloat: 1
ildouble: 2
ldouble: 2
-Function: "pow10":
-double: 2
-idouble: 2
-ildouble: 2
-ldouble: 2
-
-Function: "pow10_downward":
-double: 2
-float: 1
-idouble: 2
-ifloat: 1
-ildouble: 3
-ldouble: 3
-
-Function: "pow10_towardzero":
-double: 2
-float: 1
-idouble: 2
-ifloat: 1
-ildouble: 3
-ldouble: 3
-
-Function: "pow10_upward":
-double: 2
-float: 1
-idouble: 2
-ifloat: 1
-ildouble: 3
-ldouble: 3
-
Function: "pow_downward":
double: 1
float: 1
@@ -1984,7 +1972,9 @@ ildouble: 2
ldouble: 2
Function: "sin":
+double: 1
float: 1
+idouble: 1
ifloat: 1
ildouble: 1
ldouble: 1
@@ -2014,7 +2004,9 @@ ildouble: 3
ldouble: 3
Function: "sincos":
+double: 1
float: 1
+idouble: 1
ifloat: 1
ildouble: 1
ldouble: 1
@@ -2179,9 +2171,9 @@ ldouble: 3
Function: "y0_downward":
double: 3
-float: 2
+float: 4
idouble: 3
-ifloat: 2
+ifloat: 4
ildouble: 4
ldouble: 4
@@ -2195,9 +2187,9 @@ ldouble: 3
Function: "y0_upward":
double: 2
-float: 3
+float: 5
idouble: 2
-ifloat: 3
+ifloat: 5
ildouble: 3
ldouble: 3
@@ -2235,17 +2227,17 @@ ldouble: 5
Function: "yn":
double: 3
-float: 2
+float: 3
idouble: 3
-ifloat: 2
+ifloat: 3
ildouble: 5
ldouble: 5
Function: "yn_downward":
double: 3
-float: 2
+float: 4
idouble: 3
-ifloat: 2
+ifloat: 4
ildouble: 5
ldouble: 5
@@ -2259,9 +2251,9 @@ ldouble: 5
Function: "yn_upward":
double: 4
-float: 3
+float: 5
idouble: 4
-ifloat: 3
+ifloat: 5
ildouble: 5
ldouble: 5
diff --git a/sysdeps/aarch64/libm-test-ulps-name b/sysdeps/aarch64/libm-test-ulps-name
new file mode 100644
index 0000000000..1f66c5cda0
--- /dev/null
+++ b/sysdeps/aarch64/libm-test-ulps-name
@@ -0,0 +1 @@
+AArch64
diff --git a/sysdeps/aarch64/linkmap.h b/sysdeps/aarch64/linkmap.h
index 5aefb4cfeb..6852f343a1 100644
--- a/sysdeps/aarch64/linkmap.h
+++ b/sysdeps/aarch64/linkmap.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2009-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2009-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
diff --git a/sysdeps/aarch64/machine-gmon.h b/sysdeps/aarch64/machine-gmon.h
index 3d923f91ce..dd12a25416 100644
--- a/sysdeps/aarch64/machine-gmon.h
+++ b/sysdeps/aarch64/machine-gmon.h
@@ -1,5 +1,5 @@
/* AArch64 definitions for profiling support.
- Copyright (C) 1996-2016 Free Software Foundation, Inc.
+ Copyright (C) 1996-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/aarch64/math-tests.h b/sysdeps/aarch64/math-tests.h
index a3e1efae97..46cc33741f 100644
--- a/sysdeps/aarch64/math-tests.h
+++ b/sysdeps/aarch64/math-tests.h
@@ -1,5 +1,5 @@
/* Configuration for math tests. AArch64 version.
- Copyright (C) 2014-2016 Free Software Foundation, Inc.
+ Copyright (C) 2014-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/aarch64/mcount.c b/sysdeps/aarch64/mcount.c
index ca596ee986..515c6bde25 100644
--- a/sysdeps/aarch64/mcount.c
+++ b/sysdeps/aarch64/mcount.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2013-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2013-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
diff --git a/sysdeps/aarch64/memchr.S b/sysdeps/aarch64/memchr.S
new file mode 100644
index 0000000000..e422aef090
--- /dev/null
+++ b/sysdeps/aarch64/memchr.S
@@ -0,0 +1,157 @@
+/* memchr - find a character in a memory zone
+
+ Copyright (C) 2015-2018 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * NEON available.
+ */
+
+/* Arguments and results. */
+#define srcin x0
+#define chrin w1
+#define cntin x2
+
+#define result x0
+
+#define src x3
+#define tmp x4
+#define wtmp2 w5
+#define synd x6
+#define soff x9
+#define cntrem x10
+
+#define vrepchr v0
+#define vdata1 v1
+#define vdata2 v2
+#define vhas_chr1 v3
+#define vhas_chr2 v4
+#define vrepmask v5
+#define vend v6
+
+/*
+ * Core algorithm:
+ *
+ * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
+ * per byte. For each tuple, bit 0 is set if the relevant byte matched the
+ * requested character and bit 1 is not used (faster than using a 32-bit
+ * syndrome). Since the bits in the syndrome reflect exactly the order in
+ * which things occur in the original string, counting trailing zeros
+ * identifies exactly which byte has matched.
+ */
+
+ENTRY (__memchr)
+ /* Do not dereference srcin if no bytes to compare. */
+ cbz cntin, L(zero_length)
+ /*
+ * Magic constant 0x40100401 allows us to identify which lane matches
+ * the requested byte.
+ */
+ mov wtmp2, #0x0401
+ movk wtmp2, #0x4010, lsl #16
+ dup vrepchr.16b, chrin
+ /* Work with aligned 32-byte chunks */
+ bic src, srcin, #31
+ dup vrepmask.4s, wtmp2
+ ands soff, srcin, #31
+ and cntrem, cntin, #31
+ b.eq L(loop)
+
+ /*
+ * Input string is not 32-byte aligned. We calculate the syndrome
+ * value for the aligned 32-byte block containing the first bytes
+ * and mask off the irrelevant part.
+ */
+
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ sub tmp, soff, #32
+ adds cntin, cntin, tmp
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+ addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */
+ addp vend.16b, vend.16b, vend.16b /* 128->64 */
+ mov synd, vend.2d[0]
+ /* Clear the soff*2 lower bits */
+ lsl tmp, soff, #1
+ lsr synd, synd, tmp
+ lsl synd, synd, tmp
+ /* The first block can also be the last */
+ b.ls L(masklast)
+ /* Have we found something already? */
+ cbnz synd, L(tail)
+
+L(loop):
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ subs cntin, cntin, #32
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ /* If we're out of data we finish regardless of the result */
+ b.ls L(end)
+ /* Use a fast check for the termination condition */
+ orr vend.16b, vhas_chr1.16b, vhas_chr2.16b
+ addp vend.2d, vend.2d, vend.2d
+ mov synd, vend.2d[0]
+ /* We're not out of data, loop if we haven't found the character */
+ cbz synd, L(loop)
+
+L(end):
+ /* Termination condition found, let's calculate the syndrome value */
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+ addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */
+ addp vend.16b, vend.16b, vend.16b /* 128->64 */
+ mov synd, vend.2d[0]
+ /* Only do the clear for the last possible block */
+ b.hi L(tail)
+
+L(masklast):
+ /* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
+ add tmp, cntrem, soff
+ and tmp, tmp, #31
+ sub tmp, tmp, #32
+ neg tmp, tmp, lsl #1
+ lsl synd, synd, tmp
+ lsr synd, synd, tmp
+
+L(tail):
+ /* Count the trailing zeros using bit reversing */
+ rbit synd, synd
+ /* Compensate the last post-increment */
+ sub src, src, #32
+ /* Check that we have found a character */
+ cmp synd, #0
+ /* And count the leading zeros */
+ clz synd, synd
+ /* Compute the potential result */
+ add result, src, synd, lsr #1
+ /* Select result or NULL */
+ csel result, xzr, result, eq
+ ret
+
+L(zero_length):
+ mov result, #0
+ ret
+END (__memchr)
+weak_alias (__memchr, memchr)
+libc_hidden_builtin_def (memchr)
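A scalar model of the syndrome computation described in the header comment (illustration only, not part of the patch): two syndrome bits per byte, a match flagged in bit 0, so counting trailing zeros and halving yields the index of the first match.

    #include <stdint.h>

    static int
    first_match (const unsigned char *p, unsigned char c, int n)
    {
      uint64_t synd = 0;
      for (int i = 0; i < n && i < 32; i++)     /* one 32-byte chunk */
        if (p[i] == c)
          synd |= 1ull << (2 * i);              /* bit 0 of the tuple */
      return synd ? __builtin_ctzll (synd) / 2 : -1;
    }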
diff --git a/sysdeps/aarch64/memcmp.S b/sysdeps/aarch64/memcmp.S
index ae2d997973..743bc078bb 100644
--- a/sysdeps/aarch64/memcmp.S
+++ b/sysdeps/aarch64/memcmp.S
@@ -1,6 +1,6 @@
/* memcmp - compare memory
- Copyright (C) 2013-2016 Free Software Foundation, Inc.
+ Copyright (C) 2013-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -22,129 +22,132 @@
/* Assumptions:
*
- * ARMv8-a, AArch64
+ * ARMv8-a, AArch64, unaligned accesses.
*/
/* Parameters and result. */
#define src1 x0
#define src2 x1
#define limit x2
-#define result x0
+#define result w0
/* Internal variables. */
#define data1 x3
#define data1w w3
-#define data2 x4
-#define data2w w4
-#define has_nul x5
-#define diff x6
-#define endloop x7
-#define tmp1 x8
-#define tmp2 x9
-#define tmp3 x10
-#define pos x11
-#define limit_wd x12
-#define mask x13
+#define data1h x4
+#define data2 x5
+#define data2w w5
+#define data2h x6
+#define tmp1 x7
+#define tmp2 x8
ENTRY_ALIGN (memcmp, 6)
- cbz limit, L(ret0)
- eor tmp1, src1, src2
- tst tmp1, #7
- b.ne L(misaligned8)
- ands tmp1, src1, #7
- b.ne L(mutual_align)
- add limit_wd, limit, #7
- lsr limit_wd, limit_wd, #3
- /* Start of performance-critical section -- one 64B cache line. */
-L(loop_aligned):
- ldr data1, [src1], #8
- ldr data2, [src2], #8
-L(start_realigned):
- subs limit_wd, limit_wd, #1
- eor diff, data1, data2 /* Non-zero if differences found. */
- csinv endloop, diff, xzr, ne /* Last Dword or differences. */
- cbz endloop, L(loop_aligned)
- /* End of performance-critical section -- one 64B cache line. */
-
- /* Not reached the limit, must have found a diff. */
- cbnz limit_wd, L(not_limit)
-
- /* Limit % 8 == 0 => all bytes significant. */
- ands limit, limit, #7
- b.eq L(not_limit)
-
- lsl limit, limit, #3 /* Bits -> bytes. */
- mov mask, #~0
-#ifdef __AARCH64EB__
- lsr mask, mask, limit
-#else
- lsl mask, mask, limit
-#endif
- bic data1, data1, mask
- bic data2, data2, mask
-
- orr diff, diff, mask
-L(not_limit):
-
-#ifndef __AARCH64EB__
- rev diff, diff
+ DELOUSE (0)
+ DELOUSE (1)
+ DELOUSE (2)
+
+ subs limit, limit, 8
+ b.lo L(less8)
+
+ ldr data1, [src1], 8
+ ldr data2, [src2], 8
+ cmp data1, data2
+ b.ne L(return)
+
+ subs limit, limit, 8
+ b.gt L(more16)
+
+ ldr data1, [src1, limit]
+ ldr data2, [src2, limit]
+ b L(return)
+
+L(more16):
+ ldr data1, [src1], 8
+ ldr data2, [src2], 8
+ cmp data1, data2
+ bne L(return)
+
+ /* Jump directly to comparing the last 16 bytes for strings of
+ 32 bytes or less. */
+ subs limit, limit, 16
+ b.ls L(last_bytes)
+
+ /* We overlap loads between 0-32 bytes at either side of SRC1 when we
+ try to align, so limit it only to strings larger than 128 bytes. */
+ cmp limit, 96
+ b.ls L(loop16)
+
+ /* Align src1 and adjust src2 with bytes not yet done. */
+ and tmp1, src1, 15
+ add limit, limit, tmp1
+ sub src1, src1, tmp1
+ sub src2, src2, tmp1
+
+ /* Loop performing 16 bytes per iteration using aligned src1.
+ Limit is pre-decremented by 16 and must be larger than zero.
+ Exit if <= 16 bytes left to do or if the data is not equal. */
+ .p2align 4
+L(loop16):
+ ldp data1, data1h, [src1], 16
+ ldp data2, data2h, [src2], 16
+ subs limit, limit, 16
+ ccmp data1, data2, 0, hi
+ ccmp data1h, data2h, 0, eq
+ b.eq L(loop16)
+
+ cmp data1, data2
+ bne L(return)
+ mov data1, data1h
+ mov data2, data2h
+ cmp data1, data2
+ bne L(return)
+
+ /* Compare last 1-16 bytes using unaligned access. */
+L(last_bytes):
+ add src1, src1, limit
+ add src2, src2, limit
+ ldp data1, data1h, [src1]
+ ldp data2, data2h, [src2]
+ cmp data1, data2
+ bne L(return)
+ mov data1, data1h
+ mov data2, data2h
+ cmp data1, data2
+
+ /* Compare data bytes and set return value to 0, -1 or 1. */
+L(return):
+#ifndef __AARCH64EB__
rev data1, data1
rev data2, data2
#endif
- /* The MS-non-zero bit of DIFF marks either the first bit
- that is different, or the end of the significant data.
- Shifting left now will bring the critical information into the
- top bits. */
- clz pos, diff
- lsl data1, data1, pos
- lsl data2, data2, pos
- /* But we need to zero-extend (char is unsigned) the value and then
- perform a signed 32-bit subtraction. */
- lsr data1, data1, #56
- sub result, data1, data2, lsr #56
- RET
-
-L(mutual_align):
- /* Sources are mutually aligned, but are not currently at an
- alignment boundary. Round down the addresses and then mask off
- the bytes that precede the start point. */
- bic src1, src1, #7
- bic src2, src2, #7
- add limit, limit, tmp1 /* Adjust the limit for the extra. */
- lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
- ldr data1, [src1], #8
- neg tmp1, tmp1 /* Bits to alignment -64. */
- ldr data2, [src2], #8
- mov tmp2, #~0
-#ifdef __AARCH64EB__
- /* Big-endian. Early bytes are at MSB. */
- lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
-#else
- /* Little-endian. Early bytes are at LSB. */
- lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
-#endif
- add limit_wd, limit, #7
- orr data1, data1, tmp2
- orr data2, data2, tmp2
- lsr limit_wd, limit_wd, #3
- b L(start_realigned)
-
-L(ret0):
- mov result, #0
- RET
-
- .p2align 6
-L(misaligned8):
- sub limit, limit, #1
-1:
- /* Perhaps we can do better than this. */
- ldrb data1w, [src1], #1
- ldrb data2w, [src2], #1
- subs limit, limit, #1
- ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
- b.eq 1b
- sub result, data1, data2
- RET
+ cmp data1, data2
+L(ret_eq):
+ cset result, ne
+ cneg result, result, lo
+ ret
+
+ .p2align 4
+ /* Compare up to 8 bytes. Limit is [-8..-1]. */
+L(less8):
+ adds limit, limit, 4
+ b.lo L(less4)
+ ldr data1w, [src1], 4
+ ldr data2w, [src2], 4
+ cmp data1w, data2w
+ b.ne L(return)
+ sub limit, limit, 4
+L(less4):
+ adds limit, limit, 4
+ beq L(ret_eq)
+L(byte_loop):
+ ldrb data1w, [src1], 1
+ ldrb data2w, [src2], 1
+ subs limit, limit, 1
+ ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */
+ b.eq L(byte_loop)
+ sub result, data1w, data2w
+ ret
+
END (memcmp)
#undef bcmp
weak_alias (memcmp, bcmp)
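A C model of the new return path (sketch, little-endian case): byte-reversing both words makes the first differing byte the most significant, so a single unsigned comparison produces memcmp ordering, matching the rev/cmp/cset/cneg sequence at L(return).

    #include <stdint.h>

    static int
    return_path (uint64_t data1, uint64_t data2)
    {
      data1 = __builtin_bswap64 (data1);   /* rev data1 */
      data2 = __builtin_bswap64 (data2);   /* rev data2 */
      if (data1 == data2)
        return 0;                          /* cset/cneg yield 0 here too */
      return data1 < data2 ? -1 : 1;
    }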
diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S
index 442f390426..7e1163e6a0 100644
--- a/sysdeps/aarch64/memcpy.S
+++ b/sysdeps/aarch64/memcpy.S
@@ -1,4 +1,4 @@
-/* Copyright (C) 2012-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2012-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -16,161 +16,252 @@
License along with the GNU C Library. If not, see
<http://www.gnu.org/licenses/>. */
+#include <sysdep.h>
+
/* Assumptions:
*
- * ARMv8-a, AArch64
- * Unaligned accesses
+ * ARMv8-a, AArch64, unaligned accesses.
*
*/
#define dstin x0
#define src x1
#define count x2
-#define tmp1 x3
-#define tmp1w w3
-#define tmp2 x4
-#define tmp2w w4
-#define tmp3 x5
-#define tmp3w w5
-#define dst x6
-
-#define A_l x7
-#define A_h x8
-#define B_l x9
-#define B_h x10
-#define C_l x11
-#define C_h x12
-#define D_l x13
-#define D_h x14
+#define dst x3
+#define srcend x4
+#define dstend x5
+#define A_l x6
+#define A_lw w6
+#define A_h x7
+#define A_hw w7
+#define B_l x8
+#define B_lw w8
+#define B_h x9
+#define C_l x10
+#define C_h x11
+#define D_l x12
+#define D_h x13
+#define E_l src
+#define E_h count
+#define F_l srcend
+#define F_h dst
+#define G_l count
+#define G_h dst
+#define tmp1 x14
-#include <sysdep.h>
+/* Copies are split into 3 main cases: small copies of up to 16 bytes,
+ medium copies of 17..96 bytes, which are fully unrolled, and large
+ copies of more than 96 bytes, which align the destination and use an
+ unrolled loop processing 64 bytes per iteration.
+ In order to share code with memmove, small and medium copies read all
+ data before writing, allowing any kind of overlap. So small, medium
+ and large backwards memmoves are handled by falling through into memcpy.
+ Overlapping large forward memmoves use a loop that copies backwards.
+*/
-ENTRY_ALIGN (memcpy, 6)
-
- mov dst, dstin
- cmp count, #64
- b.ge L(cpy_not_short)
- cmp count, #15
- b.le L(tail15tiny)
-
- /* Deal with small copies quickly by dropping straight into the
- * exit block. */
-L(tail63):
- /* Copy up to 48 bytes of data. At this point we only need the
- * bottom 6 bits of count to be accurate. */
- ands tmp1, count, #0x30
- b.eq L(tail15)
- add dst, dst, tmp1
- add src, src, tmp1
- cmp tmp1w, #0x20
- b.eq 1f
- b.lt 2f
- ldp A_l, A_h, [src, #-48]
- stp A_l, A_h, [dst, #-48]
-1:
- ldp A_l, A_h, [src, #-32]
- stp A_l, A_h, [dst, #-32]
-2:
- ldp A_l, A_h, [src, #-16]
- stp A_l, A_h, [dst, #-16]
-
-L(tail15):
- ands count, count, #15
- beq 1f
- add src, src, count
- ldp A_l, A_h, [src, #-16]
- add dst, dst, count
- stp A_l, A_h, [dst, #-16]
-1:
- RET
-
-L(tail15tiny):
- /* Copy up to 15 bytes of data. Does not assume additional data
- being copied. */
- tbz count, #3, 1f
- ldr tmp1, [src], #8
- str tmp1, [dst], #8
+#ifndef MEMMOVE
+# define MEMMOVE memmove
+#endif
+#ifndef MEMCPY
+# define MEMCPY memcpy
+#endif
+
+ENTRY_ALIGN (MEMMOVE, 6)
+
+ DELOUSE (0)
+ DELOUSE (1)
+ DELOUSE (2)
+
+ sub tmp1, dstin, src
+ cmp count, 96
+ ccmp tmp1, count, 2, hi
+ b.lo L(move_long)
+
+ /* Common case falls through into memcpy. */
+END (MEMMOVE)
+libc_hidden_builtin_def (MEMMOVE)
+ENTRY (MEMCPY)
+
+ DELOUSE (0)
+ DELOUSE (1)
+ DELOUSE (2)
+
+ prfm PLDL1KEEP, [src]
+ add srcend, src, count
+ add dstend, dstin, count
+ cmp count, 16
+ b.ls L(copy16)
+ cmp count, 96
+ b.hi L(copy_long)
+
+ /* Medium copies: 17..96 bytes. */
+ sub tmp1, count, 1
+ ldp A_l, A_h, [src]
+ tbnz tmp1, 6, L(copy96)
+ ldp D_l, D_h, [srcend, -16]
+ tbz tmp1, 5, 1f
+ ldp B_l, B_h, [src, 16]
+ ldp C_l, C_h, [srcend, -32]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstend, -32]
1:
- tbz count, #2, 1f
- ldr tmp1w, [src], #4
- str tmp1w, [dst], #4
+ stp A_l, A_h, [dstin]
+ stp D_l, D_h, [dstend, -16]
+ ret
+
+ .p2align 4
+ /* Small copies: 0..16 bytes. */
+L(copy16):
+ cmp count, 8
+ b.lo 1f
+ ldr A_l, [src]
+ ldr A_h, [srcend, -8]
+ str A_l, [dstin]
+ str A_h, [dstend, -8]
+ ret
+ .p2align 4
1:
- tbz count, #1, 1f
- ldrh tmp1w, [src], #2
- strh tmp1w, [dst], #2
+ tbz count, 2, 1f
+ ldr A_lw, [src]
+ ldr A_hw, [srcend, -4]
+ str A_lw, [dstin]
+ str A_hw, [dstend, -4]
+ ret
+
+ /* Copy 0..3 bytes. Use a branchless sequence that copies the same
+ byte 3 times if count==1, or the 2nd byte twice if count==2. */
1:
- tbz count, #0, 1f
- ldrb tmp1w, [src]
- strb tmp1w, [dst]
+ cbz count, 2f
+ lsr tmp1, count, 1
+ ldrb A_lw, [src]
+ ldrb A_hw, [srcend, -1]
+ ldrb B_lw, [src, tmp1]
+ strb A_lw, [dstin]
+ strb B_lw, [dstin, tmp1]
+ strb A_hw, [dstend, -1]
+2: ret
+
+ .p2align 4
+ /* Copy 64..96 bytes. Copy 64 bytes from the start and
+ 32 bytes from the end. */
+L(copy96):
+ ldp B_l, B_h, [src, 16]
+ ldp C_l, C_h, [src, 32]
+ ldp D_l, D_h, [src, 48]
+ ldp E_l, E_h, [srcend, -32]
+ ldp F_l, F_h, [srcend, -16]
+ stp A_l, A_h, [dstin]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstin, 32]
+ stp D_l, D_h, [dstin, 48]
+ stp E_l, E_h, [dstend, -32]
+ stp F_l, F_h, [dstend, -16]
+ ret
+
+ /* Align DST to 16 byte alignment so that we don't cross cache line
+ boundaries on both loads and stores. There are at least 96 bytes
+ to copy, so copy 16 bytes unaligned and then align. The loop
+ copies 64 bytes per iteration and prefetches one iteration ahead. */
+
+ .p2align 4
+L(copy_long):
+ and tmp1, dstin, 15
+ bic dst, dstin, 15
+ ldp D_l, D_h, [src]
+ sub src, src, tmp1
+ add count, count, tmp1 /* Count is now 16 too large. */
+ ldp A_l, A_h, [src, 16]
+ stp D_l, D_h, [dstin]
+ ldp B_l, B_h, [src, 32]
+ ldp C_l, C_h, [src, 48]
+ ldp D_l, D_h, [src, 64]!
+ subs count, count, 128 + 16 /* Test and readjust count. */
+ b.ls L(last64)
+L(loop64):
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [src, 16]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [src, 32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [src, 48]
+ stp D_l, D_h, [dst, 64]!
+ ldp D_l, D_h, [src, 64]!
+ subs count, count, 64
+ b.hi L(loop64)
+
+ /* Write the last full set of 64 bytes. The remainder is at most 64
+ bytes, so it is safe to always copy 64 bytes from the end even if
+ there is just 1 byte left. */
+L(last64):
+ ldp E_l, E_h, [srcend, -64]
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [srcend, -48]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [srcend, -32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [srcend, -16]
+ stp D_l, D_h, [dst, 64]
+ stp E_l, E_h, [dstend, -64]
+ stp A_l, A_h, [dstend, -48]
+ stp B_l, B_h, [dstend, -32]
+ stp C_l, C_h, [dstend, -16]
+ ret
+
+ .p2align 4
+L(move_long):
+ cbz tmp1, 3f
+
+ add srcend, src, count
+ add dstend, dstin, count
+
+ /* Align dstend to 16 byte alignment so that we don't cross cache line
+ boundaries on both loads and stores. There are at least 96 bytes
+ to copy, so copy 16 bytes unaligned and then align. The loop
+ copies 64 bytes per iteration and prefetches one iteration ahead. */
+
+ and tmp1, dstend, 15
+ ldp D_l, D_h, [srcend, -16]
+ sub srcend, srcend, tmp1
+ sub count, count, tmp1
+ ldp A_l, A_h, [srcend, -16]
+ stp D_l, D_h, [dstend, -16]
+ ldp B_l, B_h, [srcend, -32]
+ ldp C_l, C_h, [srcend, -48]
+ ldp D_l, D_h, [srcend, -64]!
+ sub dstend, dstend, tmp1
+ subs count, count, 128
+ b.ls 2f
+
+ nop
1:
- RET
-
-L(cpy_not_short):
- /* We don't much care about the alignment of DST, but we want SRC
- * to be 128-bit (16 byte) aligned so that we don't cross cache line
- * boundaries on both loads and stores. */
- neg tmp2, src
- ands tmp2, tmp2, #15 /* Bytes to reach alignment. */
- b.eq 2f
- sub count, count, tmp2
- /* Copy more data than needed; it's faster than jumping
- * around copying sub-Quadword quantities. We know that
- * it can't overrun. */
- ldp A_l, A_h, [src]
- add src, src, tmp2
- stp A_l, A_h, [dst]
- add dst, dst, tmp2
- /* There may be less than 63 bytes to go now. */
- cmp count, #63
- b.le L(tail63)
+ stp A_l, A_h, [dstend, -16]
+ ldp A_l, A_h, [srcend, -16]
+ stp B_l, B_h, [dstend, -32]
+ ldp B_l, B_h, [srcend, -32]
+ stp C_l, C_h, [dstend, -48]
+ ldp C_l, C_h, [srcend, -48]
+ stp D_l, D_h, [dstend, -64]!
+ ldp D_l, D_h, [srcend, -64]!
+ subs count, count, 64
+ b.hi 1b
+
+ /* Write the last full set of 64 bytes. The remainder is at most 64
+ bytes, so it is safe to always copy 64 bytes from the start even if
+ there is just 1 byte left. */
2:
- subs count, count, #128
- b.ge L(cpy_body_large)
- /* Less than 128 bytes to copy, so handle 64 here and then jump
- * to the tail. */
- ldp A_l, A_h, [src]
- ldp B_l, B_h, [src, #16]
- ldp C_l, C_h, [src, #32]
- ldp D_l, D_h, [src, #48]
- stp A_l, A_h, [dst]
- stp B_l, B_h, [dst, #16]
- stp C_l, C_h, [dst, #32]
- stp D_l, D_h, [dst, #48]
- tst count, #0x3f
- add src, src, #64
- add dst, dst, #64
- b.ne L(tail63)
- RET
-
- /* Critical loop. Start at a new cache line boundary. Assuming
- * 64 bytes per line this ensures the entire loop is in one line. */
- .p2align 6
-L(cpy_body_large):
- /* There are at least 128 bytes to copy. */
- ldp A_l, A_h, [src, #0]
- sub dst, dst, #16 /* Pre-bias. */
- ldp B_l, B_h, [src, #16]
- ldp C_l, C_h, [src, #32]
- ldp D_l, D_h, [src, #48]! /* src += 64 - Pre-bias. */
-1:
- stp A_l, A_h, [dst, #16]
- ldp A_l, A_h, [src, #16]
- stp B_l, B_h, [dst, #32]
- ldp B_l, B_h, [src, #32]
- stp C_l, C_h, [dst, #48]
- ldp C_l, C_h, [src, #48]
- stp D_l, D_h, [dst, #64]!
- ldp D_l, D_h, [src, #64]!
- subs count, count, #64
- b.ge 1b
- stp A_l, A_h, [dst, #16]
- stp B_l, B_h, [dst, #32]
- stp C_l, C_h, [dst, #48]
- stp D_l, D_h, [dst, #64]
- add src, src, #16
- add dst, dst, #64 + 16
- tst count, #0x3f
- b.ne L(tail63)
- RET
-END (memcpy)
-libc_hidden_builtin_def (memcpy)
+ ldp G_l, G_h, [src, 48]
+ stp A_l, A_h, [dstend, -16]
+ ldp A_l, A_h, [src, 32]
+ stp B_l, B_h, [dstend, -32]
+ ldp B_l, B_h, [src, 16]
+ stp C_l, C_h, [dstend, -48]
+ ldp C_l, C_h, [src]
+ stp D_l, D_h, [dstend, -64]
+ stp G_l, G_h, [dstin, 48]
+ stp A_l, A_h, [dstin, 32]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstin]
+3: ret
+
+END (MEMCPY)
+libc_hidden_builtin_def (MEMCPY)
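
The L(last64) trick above is easier to follow in C. Below is a minimal sketch (not the glibc code; plain memcpy chunks stand in for the ldp/stp pairs) of the loop-plus-anchored-tail structure, assuming count > 96 as on the large-copy path:

    #include <stddef.h>
    #include <string.h>

    /* Sketch of L(loop64)/L(last64): copy whole 64-byte blocks, then
       finish by copying the last 64 bytes anchored at the end.  The
       tail may rewrite bytes the loop already stored, which is
       harmless for a non-overlapping copy and avoids a branchy
       sub-64-byte tail.  Requires count > 96 so the tail load stays
       in bounds.  */
    void *copy_long_sketch (void *dstv, const void *srcv, size_t count)
    {
      unsigned char *dst = dstv;
      const unsigned char *src = srcv;
      unsigned char *dstend = dst + count;
      const unsigned char *srcend = src + count;

      while (count > 64)
        {
          memcpy (dst, src, 64);      /* One L(loop64) iteration.  */
          dst += 64;
          src += 64;
          count -= 64;
        }
      memcpy (dstend - 64, srcend - 64, 64);   /* L(last64).  */
      return dstv;
    }
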
diff --git a/sysdeps/aarch64/memmove.S b/sysdeps/aarch64/memmove.S
index dd91db0ff6..0feeac8414 100644
--- a/sysdeps/aarch64/memmove.S
+++ b/sysdeps/aarch64/memmove.S
@@ -1,312 +1 @@
-/* Copyright (C) 2012-2016 Free Software Foundation, Inc.
-
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library. If not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64
- * Unaligned accesses
- */
-
-/* Parameters and result. */
-#define dstin x0
-#define src x1
-#define count x2
-#define tmp1 x3
-#define tmp1w w3
-#define tmp2 x4
-#define tmp2w w4
-#define tmp3 x5
-#define tmp3w w5
-#define dst x6
-
-#define A_l x7
-#define A_h x8
-#define B_l x9
-#define B_h x10
-#define C_l x11
-#define C_h x12
-#define D_l x13
-#define D_h x14
-
-ENTRY_ALIGN (memmove, 6)
-
- cmp dstin, src
- b.lo L(downwards)
- add tmp1, src, count
- cmp dstin, tmp1
- b.hs memcpy /* No overlap. */
-
- /* Upwards move with potential overlap.
- * Need to move from the tail backwards. SRC and DST point one
- * byte beyond the remaining data to move. */
- add dst, dstin, count
- add src, src, count
- cmp count, #64
- b.ge L(mov_not_short_up)
-
- /* Deal with small moves quickly by dropping straight into the
- * exit block. */
-L(tail63up):
- /* Move up to 48 bytes of data. At this point we only need the
- * bottom 6 bits of count to be accurate. */
- ands tmp1, count, #0x30
- b.eq L(tail15up)
- sub dst, dst, tmp1
- sub src, src, tmp1
- cmp tmp1w, #0x20
- b.eq 1f
- b.lt 2f
- ldp A_l, A_h, [src, #32]
- stp A_l, A_h, [dst, #32]
-1:
- ldp A_l, A_h, [src, #16]
- stp A_l, A_h, [dst, #16]
-2:
- ldp A_l, A_h, [src]
- stp A_l, A_h, [dst]
-L(tail15up):
- /* Move up to 15 bytes of data. Does not assume additional data
- * being moved. */
- tbz count, #3, 1f
- ldr tmp1, [src, #-8]!
- str tmp1, [dst, #-8]!
-1:
- tbz count, #2, 1f
- ldr tmp1w, [src, #-4]!
- str tmp1w, [dst, #-4]!
-1:
- tbz count, #1, 1f
- ldrh tmp1w, [src, #-2]!
- strh tmp1w, [dst, #-2]!
-1:
- tbz count, #0, 1f
- ldrb tmp1w, [src, #-1]
- strb tmp1w, [dst, #-1]
-1:
- RET
-
-L(mov_not_short_up):
- /* We don't much care about the alignment of DST, but we want SRC
- * to be 128-bit (16 byte) aligned so that we don't cross cache line
- * boundaries on both loads and stores. */
- ands tmp2, src, #15 /* Bytes to reach alignment. */
- b.eq 2f
- sub count, count, tmp2
- /* Move enough data to reach alignment; unlike memcpy, we have to
- * be aware of the overlap, which means we can't move data twice. */
- tbz tmp2, #3, 1f
- ldr tmp1, [src, #-8]!
- str tmp1, [dst, #-8]!
-1:
- tbz tmp2, #2, 1f
- ldr tmp1w, [src, #-4]!
- str tmp1w, [dst, #-4]!
-1:
- tbz tmp2, #1, 1f
- ldrh tmp1w, [src, #-2]!
- strh tmp1w, [dst, #-2]!
-1:
- tbz tmp2, #0, 1f
- ldrb tmp1w, [src, #-1]!
- strb tmp1w, [dst, #-1]!
-1:
-
- /* There may be less than 63 bytes to go now. */
- cmp count, #63
- b.le L(tail63up)
-2:
- subs count, count, #128
- b.ge L(mov_body_large_up)
- /* Less than 128 bytes to move, so handle 64 here and then jump
- * to the tail. */
- ldp A_l, A_h, [src, #-64]!
- ldp B_l, B_h, [src, #16]
- ldp C_l, C_h, [src, #32]
- ldp D_l, D_h, [src, #48]
- stp A_l, A_h, [dst, #-64]!
- stp B_l, B_h, [dst, #16]
- stp C_l, C_h, [dst, #32]
- stp D_l, D_h, [dst, #48]
- tst count, #0x3f
- b.ne L(tail63up)
- RET
-
- /* Critical loop. Start at a new Icache line boundary. Assuming
- * 64 bytes per line this ensures the entire loop is in one line. */
- .p2align 6
-L(mov_body_large_up):
- /* There are at least 128 bytes to move. */
- ldp A_l, A_h, [src, #-16]
- ldp B_l, B_h, [src, #-32]
- ldp C_l, C_h, [src, #-48]
- ldp D_l, D_h, [src, #-64]!
-1:
- stp A_l, A_h, [dst, #-16]
- ldp A_l, A_h, [src, #-16]
- stp B_l, B_h, [dst, #-32]
- ldp B_l, B_h, [src, #-32]
- stp C_l, C_h, [dst, #-48]
- ldp C_l, C_h, [src, #-48]
- stp D_l, D_h, [dst, #-64]!
- ldp D_l, D_h, [src, #-64]!
- subs count, count, #64
- b.ge 1b
- stp A_l, A_h, [dst, #-16]
- stp B_l, B_h, [dst, #-32]
- stp C_l, C_h, [dst, #-48]
- stp D_l, D_h, [dst, #-64]!
- tst count, #0x3f
- b.ne L(tail63up)
- RET
-
-L(downwards):
- /* For a downwards move we can safely use memcpy provided that
- * DST is more than 16 bytes away from SRC. */
- sub tmp1, src, #16
- cmp dstin, tmp1
- b.ls memcpy /* May overlap, but not critically. */
-
- mov dst, dstin /* Preserve DSTIN for return value. */
- cmp count, #64
- b.ge L(mov_not_short_down)
-
- /* Deal with small moves quickly by dropping straight into the
- * exit block. */
-L(tail63down):
- /* Move up to 48 bytes of data. At this point we only need the
- * bottom 6 bits of count to be accurate. */
- ands tmp1, count, #0x30
- b.eq L(tail15down)
- add dst, dst, tmp1
- add src, src, tmp1
- cmp tmp1w, #0x20
- b.eq 1f
- b.lt 2f
- ldp A_l, A_h, [src, #-48]
- stp A_l, A_h, [dst, #-48]
-1:
- ldp A_l, A_h, [src, #-32]
- stp A_l, A_h, [dst, #-32]
-2:
- ldp A_l, A_h, [src, #-16]
- stp A_l, A_h, [dst, #-16]
-L(tail15down):
- /* Move up to 15 bytes of data. Does not assume additional data
- being moved. */
- tbz count, #3, 1f
- ldr tmp1, [src], #8
- str tmp1, [dst], #8
-1:
- tbz count, #2, 1f
- ldr tmp1w, [src], #4
- str tmp1w, [dst], #4
-1:
- tbz count, #1, 1f
- ldrh tmp1w, [src], #2
- strh tmp1w, [dst], #2
-1:
- tbz count, #0, 1f
- ldrb tmp1w, [src]
- strb tmp1w, [dst]
-1:
- RET
-
-L(mov_not_short_down):
- /* We don't much care about the alignment of DST, but we want SRC
- * to be 128-bit (16 byte) aligned so that we don't cross cache line
- * boundaries on both loads and stores. */
- neg tmp2, src
- ands tmp2, tmp2, #15 /* Bytes to reach alignment. */
- b.eq 2f
- sub count, count, tmp2
- /* Move enough data to reach alignment; unlike memcpy, we have to
- * be aware of the overlap, which means we can't move data twice. */
- tbz tmp2, #3, 1f
- ldr tmp1, [src], #8
- str tmp1, [dst], #8
-1:
- tbz tmp2, #2, 1f
- ldr tmp1w, [src], #4
- str tmp1w, [dst], #4
-1:
- tbz tmp2, #1, 1f
- ldrh tmp1w, [src], #2
- strh tmp1w, [dst], #2
-1:
- tbz tmp2, #0, 1f
- ldrb tmp1w, [src], #1
- strb tmp1w, [dst], #1
-1:
-
- /* There may be less than 63 bytes to go now. */
- cmp count, #63
- b.le L(tail63down)
-2:
- subs count, count, #128
- b.ge L(mov_body_large_down)
- /* Less than 128 bytes to move, so handle 64 here and then jump
- * to the tail. */
- ldp A_l, A_h, [src]
- ldp B_l, B_h, [src, #16]
- ldp C_l, C_h, [src, #32]
- ldp D_l, D_h, [src, #48]
- stp A_l, A_h, [dst]
- stp B_l, B_h, [dst, #16]
- stp C_l, C_h, [dst, #32]
- stp D_l, D_h, [dst, #48]
- tst count, #0x3f
- add src, src, #64
- add dst, dst, #64
- b.ne L(tail63down)
- RET
-
- /* Critical loop. Start at a new cache line boundary. Assuming
- * 64 bytes per line this ensures the entire loop is in one line. */
- .p2align 6
-L(mov_body_large_down):
- /* There are at least 128 bytes to move. */
- ldp A_l, A_h, [src, #0]
- sub dst, dst, #16 /* Pre-bias. */
- ldp B_l, B_h, [src, #16]
- ldp C_l, C_h, [src, #32]
- ldp D_l, D_h, [src, #48]! /* src += 64 - Pre-bias. */
-1:
- stp A_l, A_h, [dst, #16]
- ldp A_l, A_h, [src, #16]
- stp B_l, B_h, [dst, #32]
- ldp B_l, B_h, [src, #32]
- stp C_l, C_h, [dst, #48]
- ldp C_l, C_h, [src, #48]
- stp D_l, D_h, [dst, #64]!
- ldp D_l, D_h, [src, #64]!
- subs count, count, #64
- b.ge 1b
- stp A_l, A_h, [dst, #16]
- stp B_l, B_h, [dst, #32]
- stp C_l, C_h, [dst, #48]
- stp D_l, D_h, [dst, #64]
- add src, src, #16
- add dst, dst, #64 + 16
- tst count, #0x3f
- b.ne L(tail63down)
- RET
-END (memmove)
-
-libc_hidden_builtin_def (memmove)
+/* memmove is part of memcpy.S. */
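
With memmove folded into memcpy.S, the shared entry point needs only one unsigned comparison to pick a direction. A hedged C sketch of that dispatch (byte loops and a bounce buffer stand in for the unrolled assembly):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* dst - src is computed with unsigned wraparound, so a single
       compare covers both "dst below src" and "dst at least count
       bytes above src".  */
    void *memmove_sketch (void *dstv, const void *srcv, size_t count)
    {
      unsigned char *dst = dstv;
      const unsigned char *src = srcv;

      if (count <= 96)
        {
          /* Small/medium paths read all data before writing any, so
             they tolerate overlap in either direction; a bounce
             buffer models that here.  */
          unsigned char tmp[96];
          memcpy (tmp, src, count);
          memcpy (dst, tmp, count);
        }
      else if ((uintptr_t) dst - (uintptr_t) src >= count)
        for (size_t i = 0; i < count; i++)   /* Forward copy is safe.  */
          dst[i] = src[i];
      else
        for (size_t i = count; i-- > 0; )    /* Overlapping forward
                                                move: copy backwards.  */
          dst[i] = src[i];
      return dstv;
    }
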
diff --git a/sysdeps/aarch64/memset-reg.h b/sysdeps/aarch64/memset-reg.h
new file mode 100644
index 0000000000..ac7c083867
--- /dev/null
+++ b/sysdeps/aarch64/memset-reg.h
@@ -0,0 +1,30 @@
+/* Register aliases for memset to be used across implementations.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define dstin x0
+#define val x1
+#define valw w1
+#define count x2
+#define dst x3
+#define dstend x4
+#define tmp1 x5
+#define tmp1w w5
+#define tmp2 x6
+#define tmp2w w6
+#define zva_len x7
+#define zva_lenw w7
diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
index e49f4d6faf..4a45459361 100644
--- a/sysdeps/aarch64/memset.S
+++ b/sysdeps/aarch64/memset.S
@@ -1,4 +1,4 @@
-/* Copyright (C) 2012-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2012-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -16,214 +16,175 @@
License along with the GNU C Library. If not, see
<http://www.gnu.org/licenses/>. */
+#include <sysdep.h>
+#include "memset-reg.h"
+
+#ifndef MEMSET
+# define MEMSET memset
+#endif
+
/* Assumptions:
*
- * ARMv8-a, AArch64
- * Unaligned accesses
+ * ARMv8-a, AArch64, unaligned accesses
*
*/
-#include <sysdep.h>
-
-/* By default we assume that the DC instruction can be used to zero
- data blocks more efficiently. In some circumstances this might be
- unsafe, for example in an asymmetric multiprocessor environment with
- different DC clear lengths (neither the upper nor lower lengths are
- safe to use). The feature can be disabled by defining DONT_USE_DC.
-
- If code may be run in a virtualized environment, then define
- MAYBE_VIRT. This will cause the code to cache the system register
- values rather than re-reading them each call. */
-
-#define dstin x0
-#define val w1
-#define count x2
-#define tmp1 x3
-#define tmp1w w3
-#define tmp2 x4
-#define tmp2w w4
-#define zva_len_x x5
-#define zva_len w5
-#define zva_bits_x x6
-
-#define A_l x7
-#define A_lw w7
-#define dst x8
-#define tmp3w w9
-
-ENTRY_ALIGN (__memset, 6)
-
- mov dst, dstin /* Preserve return value. */
- ands A_lw, val, #255
-#ifndef DONT_USE_DC
- b.eq L(zero_mem)
-#endif
- orr A_lw, A_lw, A_lw, lsl #8
- orr A_lw, A_lw, A_lw, lsl #16
- orr A_l, A_l, A_l, lsl #32
-L(tail_maybe_long):
- cmp count, #64
- b.ge L(not_short)
-L(tail_maybe_tiny):
- cmp count, #15
- b.le L(tail15tiny)
-L(tail63):
- ands tmp1, count, #0x30
- b.eq L(tail15)
- add dst, dst, tmp1
- cmp tmp1w, #0x20
- b.eq 1f
- b.lt 2f
- stp A_l, A_l, [dst, #-48]
-1:
- stp A_l, A_l, [dst, #-32]
-2:
- stp A_l, A_l, [dst, #-16]
-
-L(tail15):
- and count, count, #15
- add dst, dst, count
- stp A_l, A_l, [dst, #-16] /* Repeat some/all of last store. */
- RET
-
-L(tail15tiny):
- /* Set up to 15 bytes. Does not assume earlier memory
- being set. */
- tbz count, #3, 1f
- str A_l, [dst], #8
-1:
- tbz count, #2, 1f
- str A_lw, [dst], #4
-1:
- tbz count, #1, 1f
- strh A_lw, [dst], #2
-1:
- tbz count, #0, 1f
- strb A_lw, [dst]
-1:
- RET
-
- /* Critical loop. Start at a new cache line boundary. Assuming
- * 64 bytes per line, this ensures the entire loop is in one line. */
- .p2align 6
-L(not_short):
- neg tmp2, dst
- ands tmp2, tmp2, #15
- b.eq 2f
- /* Bring DST to 128-bit (16-byte) alignment. We know that there's
- * more than that to set, so we simply store 16 bytes and advance by
- * the amount required to reach alignment. */
- sub count, count, tmp2
- stp A_l, A_l, [dst]
- add dst, dst, tmp2
- /* There may be less than 63 bytes to go now. */
- cmp count, #63
- b.le L(tail63)
-2:
- sub dst, dst, #16 /* Pre-bias. */
- sub count, count, #64
-1:
- stp A_l, A_l, [dst, #16]
- stp A_l, A_l, [dst, #32]
- stp A_l, A_l, [dst, #48]
- stp A_l, A_l, [dst, #64]!
- subs count, count, #64
- b.ge 1b
- tst count, #0x3f
- add dst, dst, #16
- b.ne L(tail63)
- RET
-
-#ifndef DONT_USE_DC
- /* For zeroing memory, check to see if we can use the ZVA feature to
- * zero entire 'cache' lines. */
-L(zero_mem):
- mov A_l, #0
- cmp count, #63
- b.le L(tail_maybe_tiny)
- neg tmp2, dst
- ands tmp2, tmp2, #15
- b.eq 1f
- sub count, count, tmp2
- stp A_l, A_l, [dst]
- add dst, dst, tmp2
- cmp count, #63
- b.le L(tail63)
-1:
- /* For zeroing small amounts of memory, it's not worth setting up
- * the line-clear code. */
- cmp count, #128
- b.lt L(not_short)
-#ifdef MAYBE_VIRT
- /* For efficiency when virtualized, we cache the ZVA capability. */
- adrp tmp2, L(cache_clear)
- ldr zva_len, [tmp2, #:lo12:L(cache_clear)]
- tbnz zva_len, #31, L(not_short)
- cbnz zva_len, L(zero_by_line)
- mrs tmp1, dczid_el0
- tbz tmp1, #4, 1f
- /* ZVA not available. Remember this for next time. */
- mov zva_len, #~0
- str zva_len, [tmp2, #:lo12:L(cache_clear)]
- b L(not_short)
-1:
- mov tmp3w, #4
- and zva_len, tmp1w, #15 /* Safety: other bits reserved. */
- lsl zva_len, tmp3w, zva_len
- str zva_len, [tmp2, #:lo12:L(cache_clear)]
+ENTRY_ALIGN (MEMSET, 6)
+
+ DELOUSE (0)
+ DELOUSE (2)
+
+ dup v0.16B, valw
+ add dstend, dstin, count
+
+ cmp count, 96
+ b.hi L(set_long)
+ cmp count, 16
+ b.hs L(set_medium)
+ mov val, v0.D[0]
+
+ /* Set 0..15 bytes. */
+ tbz count, 3, 1f
+ str val, [dstin]
+ str val, [dstend, -8]
+ ret
+ nop
+1: tbz count, 2, 2f
+ str valw, [dstin]
+ str valw, [dstend, -4]
+ ret
+2: cbz count, 3f
+ strb valw, [dstin]
+ tbz count, 1, 3f
+ strh valw, [dstend, -2]
+3: ret
+
+	/* Set 16..96 bytes.  */
+L(set_medium):
+ str q0, [dstin]
+ tbnz count, 6, L(set96)
+ str q0, [dstend, -16]
+ tbz count, 5, 1f
+ str q0, [dstin, 16]
+ str q0, [dstend, -32]
+1: ret
+
+ .p2align 4
+ /* Set 64..96 bytes. Write 64 bytes from the start and
+ 32 bytes from the end. */
+L(set96):
+ str q0, [dstin, 16]
+ stp q0, q0, [dstin, 32]
+ stp q0, q0, [dstend, -32]
+ ret
+
+ .p2align 3
+ nop
+L(set_long):
+ and valw, valw, 255
+ bic dst, dstin, 15
+ str q0, [dstin]
+ cmp count, 256
+ ccmp valw, 0, 0, cs
+ b.eq L(try_zva)
+L(no_zva):
+ sub count, dstend, dst /* Count is 16 too large. */
+ add dst, dst, 16
+ sub count, count, 64 + 16 /* Adjust count and bias for loop. */
+1: stp q0, q0, [dst], 64
+ stp q0, q0, [dst, -32]
+L(tail64):
+ subs count, count, 64
+ b.hi 1b
+2: stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
+ ret
+
+L(try_zva):
+#ifdef ZVA_MACRO
+ zva_macro
#else
+ .p2align 3
mrs tmp1, dczid_el0
- tbnz tmp1, #4, L(not_short)
- mov tmp3w, #4
- and zva_len, tmp1w, #15 /* Safety: other bits reserved. */
- lsl zva_len, tmp3w, zva_len
-#endif
-
-L(zero_by_line):
- /* Compute how far we need to go to become suitably aligned. We're
- * already at quad-word alignment. */
- cmp count, zva_len_x
- b.lt L(not_short) /* Not enough to reach alignment. */
- sub zva_bits_x, zva_len_x, #1
- neg tmp2, dst
- ands tmp2, tmp2, zva_bits_x
- b.eq 1f /* Already aligned. */
- /* Not aligned, check that there's enough to copy after alignment. */
- sub tmp1, count, tmp2
- cmp tmp1, #64
- ccmp tmp1, zva_len_x, #8, ge /* NZCV=0b1000 */
- b.lt L(not_short)
- /* We know that there's at least 64 bytes to zero and that it's safe
- * to overrun by 64 bytes. */
- mov count, tmp1
-2:
- stp A_l, A_l, [dst]
- stp A_l, A_l, [dst, #16]
- stp A_l, A_l, [dst, #32]
- subs tmp2, tmp2, #64
- stp A_l, A_l, [dst, #48]
- add dst, dst, #64
- b.ge 2b
- /* We've overrun a bit, so adjust dst downwards. */
- add dst, dst, tmp2
-1:
- sub count, count, zva_len_x
-3:
- dc zva, dst
- add dst, dst, zva_len_x
- subs count, count, zva_len_x
- b.ge 3b
- ands count, count, zva_bits_x
- b.ne L(tail_maybe_long)
- RET
-#ifdef MAYBE_VIRT
- .bss
- .p2align 2
-L(cache_clear):
- .space 4
+ tbnz tmp1w, 4, L(no_zva)
+ and tmp1w, tmp1w, 15
+ cmp tmp1w, 4 /* ZVA size is 64 bytes. */
+ b.ne L(zva_128)
+
+	/* Write the first and last 64 byte aligned block using stp rather
+	   than using DC ZVA.  This is faster on some cores.  */
+
+L(zva_64):
+ str q0, [dst, 16]
+ stp q0, q0, [dst, 32]
+ bic dst, dst, 63
+ stp q0, q0, [dst, 64]
+ stp q0, q0, [dst, 96]
+ sub count, dstend, dst /* Count is now 128 too large. */
+ sub count, count, 128+64+64 /* Adjust count and bias for loop. */
+ add dst, dst, 128
+ nop
+1: dc zva, dst
+ add dst, dst, 64
+ subs count, count, 64
+ b.hi 1b
+ stp q0, q0, [dst, 0]
+ stp q0, q0, [dst, 32]
+ stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
+ ret
+
+ .p2align 3
+L(zva_128):
+ cmp tmp1w, 5 /* ZVA size is 128 bytes. */
+ b.ne L(zva_other)
+
+ str q0, [dst, 16]
+ stp q0, q0, [dst, 32]
+ stp q0, q0, [dst, 64]
+ stp q0, q0, [dst, 96]
+ bic dst, dst, 127
+ sub count, dstend, dst /* Count is now 128 too large. */
+ sub count, count, 128+128 /* Adjust count and bias for loop. */
+ add dst, dst, 128
+1: dc zva, dst
+ add dst, dst, 128
+ subs count, count, 128
+ b.hi 1b
+ stp q0, q0, [dstend, -128]
+ stp q0, q0, [dstend, -96]
+ stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
+ ret
+
+L(zva_other):
+ mov tmp2w, 4
+ lsl zva_lenw, tmp2w, tmp1w
+ add tmp1, zva_len, 64 /* Max alignment bytes written. */
+ cmp count, tmp1
+ blo L(no_zva)
+
+ sub tmp2, zva_len, 1
+ add tmp1, dst, zva_len
+ add dst, dst, 16
+ subs count, tmp1, dst /* Actual alignment bytes to write. */
+ bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */
+ beq 2f
+1: stp q0, q0, [dst], 64
+ stp q0, q0, [dst, -32]
+ subs count, count, 64
+ b.hi 1b
+2: mov dst, tmp1
+ sub count, dstend, tmp1 /* Remaining bytes to write. */
+ subs count, count, zva_len
+ b.lo 4f
+3: dc zva, dst
+ add dst, dst, zva_len
+ subs count, count, zva_len
+ b.hs 3b
+4: add count, count, zva_len
+ b L(tail64)
#endif
-#endif /* DONT_USE_DC */
-END (__memset)
-weak_alias (__memset, memset)
-libc_hidden_builtin_def (memset)
+END (MEMSET)
+libc_hidden_builtin_def (MEMSET)
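
The L(try_zva) classification hinges on DCZID_EL0. As a hedged sketch (AArch64-only inline asm, not a glibc interface), the decode the assembly performs is:

    #include <stddef.h>
    #include <stdint.h>

    /* Bit 4 of DCZID_EL0 set means DC ZVA is prohibited (L(no_zva));
       bits 3:0 give log2 of the block size in 4-byte words, so
       bytes = 4 << bits.  64 selects L(zva_64), 128 selects
       L(zva_128), anything else goes to L(zva_other).  */
    static size_t zva_block_size (void)
    {
      uint64_t dczid;
      __asm__ ("mrs %0, dczid_el0" : "=r" (dczid));
      if (dczid & 16)
        return 0;                      /* DC ZVA disabled.  */
      return (size_t) 4 << (dczid & 15);
    }
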
diff --git a/sysdeps/aarch64/memusage.h b/sysdeps/aarch64/memusage.h
index 70bf2ce859..874f3b5b3d 100644
--- a/sysdeps/aarch64/memusage.h
+++ b/sysdeps/aarch64/memusage.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2000-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2000-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
new file mode 100644
index 0000000000..57ffdf7238
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -0,0 +1,4 @@
+ifeq ($(subdir),string)
+sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \
+ memcpy_falkor memmove_falkor memset_generic memset_falkor
+endif
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
new file mode 100644
index 0000000000..e55be80103
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -0,0 +1,57 @@
+/* Enumerate available IFUNC implementations of a function. AARCH64 version.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <assert.h>
+#include <string.h>
+#include <wchar.h>
+#include <ldsodefs.h>
+#include <ifunc-impl-list.h>
+#include <init-arch.h>
+#include <stdio.h>
+
+/* Maximum number of IFUNC implementations. */
+#define MAX_IFUNC 4
+
+size_t
+__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ size_t max)
+{
+ assert (max >= MAX_IFUNC);
+
+ size_t i = 0;
+
+ INIT_ARCH ();
+
+ /* Support sysdeps/aarch64/multiarch/memcpy.c and memmove.c. */
+ IFUNC_IMPL (i, name, memcpy,
+ IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx)
+ IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx2)
+ IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_falkor)
+ IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
+ IFUNC_IMPL (i, name, memmove,
+ IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
+ IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_falkor)
+ IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
+ IFUNC_IMPL (i, name, memset,
+ /* Enable this on non-falkor processors too so that other cores
+ can do a comparative analysis with __memset_generic. */
+ IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_falkor)
+ IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
+
+ return i;
+}
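
A hedged usage sketch for the list above: within the glibc tree, a test can enumerate the registered candidates roughly as follows (the name/usable field names are assumptions about ifunc-impl-list.h, not confirmed by this diff):

    #include <stdio.h>
    #include <string.h>
    #include <ifunc-impl-list.h>   /* glibc-internal header.  */

    static void
    list_memcpy_impls (void)
    {
      struct libc_ifunc_impl impls[4];
      size_t n = __libc_ifunc_impl_list ("memcpy", impls,
                                         sizeof impls / sizeof impls[0]);
      for (size_t i = 0; i < n; i++)
        printf ("%s (usable: %d)\n", impls[i].name, (int) impls[i].usable);
    }
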
diff --git a/sysdeps/aarch64/multiarch/init-arch.h b/sysdeps/aarch64/multiarch/init-arch.h
new file mode 100644
index 0000000000..d1e5703cb2
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/init-arch.h
@@ -0,0 +1,25 @@
+/* Define INIT_ARCH so that midr is initialized before use by IFUNCs.
+ This file is part of the GNU C Library.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <ldsodefs.h>
+
+#define INIT_ARCH() \
+ uint64_t __attribute__((unused)) midr = \
+ GLRO(dl_aarch64_cpu_features).midr_el1; \
+ unsigned __attribute__((unused)) zva_size = \
+ GLRO(dl_aarch64_cpu_features).zva_size;
diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c
new file mode 100644
index 0000000000..4a04a63b0f
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memcpy.c
@@ -0,0 +1,47 @@
+/* Multiple versions of memcpy. AARCH64 version.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for the definition in libc. */
+
+#if IS_IN (libc)
+/* Redefine memcpy so that the compiler won't complain about the type
+ mismatch with the IFUNC selector in strong_alias, below. */
+# undef memcpy
+# define memcpy __redirect_memcpy
+# include <string.h>
+# include <init-arch.h>
+
+extern __typeof (__redirect_memcpy) __libc_memcpy;
+
+extern __typeof (__redirect_memcpy) __memcpy_generic attribute_hidden;
+extern __typeof (__redirect_memcpy) __memcpy_thunderx attribute_hidden;
+extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden;
+extern __typeof (__redirect_memcpy) __memcpy_falkor attribute_hidden;
+
+libc_ifunc (__libc_memcpy,
+ (IS_THUNDERX (midr)
+ ? __memcpy_thunderx
+ : (IS_FALKOR (midr) || IS_PHECDA (midr)
+ ? __memcpy_falkor
+ : (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr)
+ ? __memcpy_thunderx2
+ : __memcpy_generic))));
+
+# undef memcpy
+strong_alias (__libc_memcpy, memcpy);
+#endif
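
The IS_THUNDERX/IS_FALKOR predicates used above test fields of MIDR_EL1. A sketch of the decode (the field layout follows the architectural MIDR format; the specific part number below is an illustrative assumption, not taken from this diff):

    #include <stdint.h>

    /* MIDR_EL1: implementer in bits 31:24, part number in bits 15:4.  */
    #define MIDR_IMPLEMENTOR(midr) (((midr) >> 24) & 0xffU)
    #define MIDR_PARTNUM(midr)     (((midr) >> 4) & 0xfffU)

    /* Illustrative check in the style of IS_THUNDERX: Cavium's
       implementer code is 'C' (0x43); 0x0a1 is assumed here to be
       the ThunderX part number.  */
    static int
    is_thunderx_sketch (uint64_t midr)
    {
      return MIDR_IMPLEMENTOR (midr) == 'C'
             && MIDR_PARTNUM (midr) == 0x0a1;
    }
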
diff --git a/sysdeps/aarch64/multiarch/memcpy_falkor.S b/sysdeps/aarch64/multiarch/memcpy_falkor.S
new file mode 100644
index 0000000000..cdc2de4750
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memcpy_falkor.S
@@ -0,0 +1,191 @@
+/* Optimized memcpy for Qualcomm Falkor processor.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* Assumptions:
+
+ ARMv8-a, AArch64, falkor, unaligned accesses. */
+
+#define dstin x0
+#define src x1
+#define count x2
+#define dst x3
+#define srcend x4
+#define dstend x5
+#define tmp1 x14
+#define A_x x6
+#define B_x x7
+#define A_w w6
+#define B_w w7
+
+#define A_q q0
+#define B_q q1
+#define C_q q2
+#define D_q q3
+#define E_q q4
+#define F_q q5
+#define G_q q6
+
+/* Copies are split into 3 main cases:
+
+ 1. Small copies of up to 32 bytes
+ 2. Medium copies of 33..128 bytes which are fully unrolled
+ 3. Large copies of more than 128 bytes.
+
+   Large copies align the source to a quad word and use an unrolled loop
+ processing 64 bytes per iteration.
+
+ FALKOR-SPECIFIC DESIGN:
+
+ The smallest copies (32 bytes or less) focus on optimal pipeline usage,
+ which is why the redundant copies of 0-3 bytes have been replaced with
+ conditionals, since the former would unnecessarily break across multiple
+ issue groups. The medium copy group has been enlarged to 128 bytes since
+   raising the small-copy limit to 32 bytes allows us to do that without
+   cost and also allows us to reduce the size of the prep code before loop64.
+
+ The copy loop uses only one register q0. This is to ensure that all loads
+ hit a single hardware prefetcher which can get correctly trained to prefetch
+ a single stream.
+
+ The non-temporal stores help optimize cache utilization. */
+
+#if IS_IN (libc)
+ENTRY_ALIGN (__memcpy_falkor, 6)
+
+ cmp count, 32
+ add srcend, src, count
+ add dstend, dstin, count
+ b.ls L(copy32)
+ ldr A_q, [src]
+ cmp count, 128
+ str A_q, [dstin]
+ b.hi L(copy_long)
+
+ /* Medium copies: 33..128 bytes. */
+ sub tmp1, count, 1
+ ldr A_q, [src, 16]
+ ldr B_q, [srcend, -32]
+ ldr C_q, [srcend, -16]
+ tbz tmp1, 6, 1f
+ ldr D_q, [src, 32]
+ ldr E_q, [src, 48]
+ str D_q, [dstin, 32]
+ str E_q, [dstin, 48]
+ ldr F_q, [srcend, -64]
+ ldr G_q, [srcend, -48]
+ str F_q, [dstend, -64]
+ str G_q, [dstend, -48]
+1:
+ str A_q, [dstin, 16]
+ str B_q, [dstend, -32]
+ str C_q, [dstend, -16]
+ ret
+
+ .p2align 4
+ /* Small copies: 0..32 bytes. */
+L(copy32):
+ /* 16-32 */
+ cmp count, 16
+ b.lo 1f
+ ldr A_q, [src]
+ ldr B_q, [srcend, -16]
+ str A_q, [dstin]
+ str B_q, [dstend, -16]
+ ret
+ .p2align 4
+1:
+ /* 8-15 */
+ tbz count, 3, 1f
+ ldr A_x, [src]
+ ldr B_x, [srcend, -8]
+ str A_x, [dstin]
+ str B_x, [dstend, -8]
+ ret
+ .p2align 4
+1:
+ /* 4-7 */
+ tbz count, 2, 1f
+ ldr A_w, [src]
+ ldr B_w, [srcend, -4]
+ str A_w, [dstin]
+ str B_w, [dstend, -4]
+ ret
+ .p2align 4
+1:
+ /* 2-3 */
+ tbz count, 1, 1f
+ ldrh A_w, [src]
+ ldrh B_w, [srcend, -2]
+ strh A_w, [dstin]
+ strh B_w, [dstend, -2]
+ ret
+ .p2align 4
+1:
+ /* 0-1 */
+ tbz count, 0, 1f
+ ldrb A_w, [src]
+ strb A_w, [dstin]
+1:
+ ret
+
+ /* Align SRC to 16 bytes and copy; that way at least one of the
+ accesses is aligned throughout the copy sequence.
+
+	   The count is off by 0 to 15 bytes, but this is OK because the
+	   last 64 bytes are always copied from the end.  Due to this the
+	   loop never runs out of bounds.  */
+ .p2align 6
+L(copy_long):
+ sub count, count, 64 + 16
+ and tmp1, src, 15
+ bic src, src, 15
+ sub dst, dstin, tmp1
+ add count, count, tmp1
+
+L(loop64):
+ ldr A_q, [src, 16]!
+ str A_q, [dst, 16]
+ ldr A_q, [src, 16]!
+ subs count, count, 64
+ str A_q, [dst, 32]
+ ldr A_q, [src, 16]!
+ str A_q, [dst, 48]
+ ldr A_q, [src, 16]!
+ str A_q, [dst, 64]!
+ b.hi L(loop64)
+
+ /* Write the last full set of 64 bytes. The remainder is at most 64
+ bytes, so it is safe to always copy 64 bytes from the end even if
+ there is just 1 byte left. */
+L(last64):
+ ldr E_q, [srcend, -64]
+ str E_q, [dstend, -64]
+ ldr D_q, [srcend, -48]
+ str D_q, [dstend, -48]
+ ldr C_q, [srcend, -32]
+ str C_q, [dstend, -32]
+ ldr B_q, [srcend, -16]
+ str B_q, [dstend, -16]
+ ret
+
+END (__memcpy_falkor)
+libc_hidden_builtin_def (__memcpy_falkor)
+#endif
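
The bookkeeping in L(copy_long) above is subtle enough to merit a C rendering. A sketch (16/64-byte memcpy chunks stand in for the q0 loads and stores; forming src - tmp1 is fine in assembly but technically out-of-bounds pointer arithmetic in strict C), assuming count > 128 as on this path:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Store 16 unaligned head bytes, align src down to 16 and bias dst
       by the same amount, fold the misalignment into count, then loop
       and finish with a 64-byte tail anchored at the end.  */
    void *falkor_copy_long_sketch (void *dstv, const void *srcv,
                                   size_t count)
    {
      unsigned char *dstin = dstv;
      const unsigned char *src = srcv;
      unsigned char *dstend = dstin + count;
      const unsigned char *srcend = src + count;

      memcpy (dstin, src, 16);               /* Unaligned head.  */

      size_t tmp1 = (uintptr_t) src & 15;
      const unsigned char *s = src - tmp1;   /* bic src, src, 15  */
      unsigned char *d = dstin - tmp1;       /* sub dst, dstin, tmp1  */
      ptrdiff_t n = (ptrdiff_t) (count + tmp1) - (64 + 16);

      do                                     /* L(loop64)  */
        {
          /* May rewrite up to 15 head bytes; that is harmless.  */
          memcpy (d + 16, s + 16, 64);
          s += 64;
          d += 64;
          n -= 64;
        }
      while (n > 0);

      memcpy (dstend - 64, srcend - 64, 64); /* L(last64).  */
      return dstv;
    }
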
diff --git a/sysdeps/aarch64/multiarch/memcpy_generic.S b/sysdeps/aarch64/multiarch/memcpy_generic.S
new file mode 100644
index 0000000000..4bc81f4ce5
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memcpy_generic.S
@@ -0,0 +1,44 @@
+/* A Generic Optimized memcpy implementation for AARCH64.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* The actual memcpy and memmove code is in ../memcpy.S.  If we are
+   building libc, this file defines __memcpy_generic and __memmove_generic.
+   Otherwise the include of ../memcpy.S will define the normal __memcpy
+   and __memmove entry points.  */
+
+#include <sysdep.h>
+
+#if IS_IN (libc)
+
+# define MEMCPY __memcpy_generic
+# define MEMMOVE __memmove_generic
+
+/* Do not hide the generic versions of memcpy and memmove, we use them
+ internally. */
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(name)
+
+# ifdef SHARED
+/* It doesn't make sense to send libc-internal memcpy calls through a PLT. */
+ .globl __GI_memcpy; __GI_memcpy = __memcpy_generic
+ .globl __GI_memmove; __GI_memmove = __memmove_generic
+# endif
+
+#endif
+
+#include "../memcpy.S"
diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx.S b/sysdeps/aarch64/multiarch/memcpy_thunderx.S
new file mode 100644
index 0000000000..de494d933d
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memcpy_thunderx.S
@@ -0,0 +1,336 @@
+/* A Thunderx Optimized memcpy implementation for AARCH64.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* The actual code in this memcpy and memmove should be identical to the
+   generic version except for the code under '#ifdef USE_THUNDERX'.  This
+   is to make it easier to keep this version and the generic version in
+   sync for changes that are not specific to thunderx.  */
+
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
+ *
+ */
+
+#define dstin x0
+#define src x1
+#define count x2
+#define dst x3
+#define srcend x4
+#define dstend x5
+#define A_l x6
+#define A_lw w6
+#define A_h x7
+#define A_hw w7
+#define B_l x8
+#define B_lw w8
+#define B_h x9
+#define C_l x10
+#define C_h x11
+#define D_l x12
+#define D_h x13
+#define E_l src
+#define E_h count
+#define F_l srcend
+#define F_h dst
+#define G_l count
+#define G_h dst
+#define tmp1 x14
+
+/* Copies are split into 3 main cases: small copies of up to 16 bytes,
+   medium copies of 17..96 bytes which are fully unrolled, and large
+   copies of more than 96 bytes, which align the destination and use an
+   unrolled loop processing 64 bytes per iteration.
+ In order to share code with memmove, small and medium copies read all
+ data before writing, allowing any kind of overlap. So small, medium
+ and large backwards memmoves are handled by falling through into memcpy.
+ Overlapping large forward memmoves use a loop that copies backwards.
+*/
+
+#ifndef MEMMOVE
+# define MEMMOVE memmove
+#endif
+#ifndef MEMCPY
+# define MEMCPY memcpy
+#endif
+
+#if IS_IN (libc)
+
+# ifndef USE_THUNDERX2
+# undef MEMCPY
+# define MEMCPY __memcpy_thunderx
+# undef MEMMOVE
+# define MEMMOVE __memmove_thunderx
+# define USE_THUNDERX
+# endif
+
+ENTRY_ALIGN (MEMMOVE, 6)
+
+ DELOUSE (0)
+ DELOUSE (1)
+ DELOUSE (2)
+
+ sub tmp1, dstin, src
+ cmp count, 96
+ ccmp tmp1, count, 2, hi
+ b.lo L(move_long)
+
+ /* Common case falls through into memcpy. */
+END (MEMMOVE)
+libc_hidden_builtin_def (MEMMOVE)
+ENTRY (MEMCPY)
+
+ DELOUSE (0)
+ DELOUSE (1)
+ DELOUSE (2)
+
+ prfm PLDL1KEEP, [src]
+ add srcend, src, count
+ add dstend, dstin, count
+ cmp count, 16
+ b.ls L(copy16)
+ cmp count, 96
+ b.hi L(copy_long)
+
+ /* Medium copies: 17..96 bytes. */
+ sub tmp1, count, 1
+ ldp A_l, A_h, [src]
+ tbnz tmp1, 6, L(copy96)
+ ldp D_l, D_h, [srcend, -16]
+ tbz tmp1, 5, 1f
+ ldp B_l, B_h, [src, 16]
+ ldp C_l, C_h, [srcend, -32]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstend, -32]
+1:
+ stp A_l, A_h, [dstin]
+ stp D_l, D_h, [dstend, -16]
+ ret
+
+ .p2align 4
+ /* Small copies: 0..16 bytes. */
+L(copy16):
+ cmp count, 8
+ b.lo 1f
+ ldr A_l, [src]
+ ldr A_h, [srcend, -8]
+ str A_l, [dstin]
+ str A_h, [dstend, -8]
+ ret
+ .p2align 4
+1:
+ tbz count, 2, 1f
+ ldr A_lw, [src]
+ ldr A_hw, [srcend, -4]
+ str A_lw, [dstin]
+ str A_hw, [dstend, -4]
+ ret
+
+ /* Copy 0..3 bytes. Use a branchless sequence that copies the same
+ byte 3 times if count==1, or the 2nd byte twice if count==2. */
+1:
+ cbz count, 2f
+ lsr tmp1, count, 1
+ ldrb A_lw, [src]
+ ldrb A_hw, [srcend, -1]
+ ldrb B_lw, [src, tmp1]
+ strb A_lw, [dstin]
+ strb B_lw, [dstin, tmp1]
+ strb A_hw, [dstend, -1]
+2: ret
+
+ .p2align 4
+ /* Copy 64..96 bytes. Copy 64 bytes from the start and
+ 32 bytes from the end. */
+L(copy96):
+ ldp B_l, B_h, [src, 16]
+ ldp C_l, C_h, [src, 32]
+ ldp D_l, D_h, [src, 48]
+ ldp E_l, E_h, [srcend, -32]
+ ldp F_l, F_h, [srcend, -16]
+ stp A_l, A_h, [dstin]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstin, 32]
+ stp D_l, D_h, [dstin, 48]
+ stp E_l, E_h, [dstend, -32]
+ stp F_l, F_h, [dstend, -16]
+ ret
+
+ /* Align DST to 16 byte alignment so that we don't cross cache line
+ boundaries on both loads and stores. There are at least 96 bytes
+ to copy, so copy 16 bytes unaligned and then align. The loop
+ copies 64 bytes per iteration and prefetches one iteration ahead. */
+
+ .p2align 4
+L(copy_long):
+
+# if defined(USE_THUNDERX) || defined (USE_THUNDERX2)
+
+	/* On thunderx, large memcpys are helped by software prefetching.
+	   This loop is identical to the one below it but with prefetching
+	   instructions included.  For copies of less than 32768 bytes,
+	   the prefetching does not help and slows the code down, so we
+	   only use the prefetching loop for the largest memcpys.  */
+
+ cmp count, #32768
+ b.lo L(copy_long_without_prefetch)
+ and tmp1, dstin, 15
+ bic dst, dstin, 15
+ ldp D_l, D_h, [src]
+ sub src, src, tmp1
+# if defined(USE_THUNDERX)
+ prfm pldl1strm, [src, 384]
+# elif defined(USE_THUNDERX2)
+ prfm pldl1strm, [src, 256]
+# endif
+ add count, count, tmp1 /* Count is now 16 too large. */
+ ldp A_l, A_h, [src, 16]
+ stp D_l, D_h, [dstin]
+ ldp B_l, B_h, [src, 32]
+ ldp C_l, C_h, [src, 48]
+ ldp D_l, D_h, [src, 64]!
+ subs count, count, 128 + 16 /* Test and readjust count. */
+
+L(prefetch_loop64):
+# if defined(USE_THUNDERX)
+ tbz src, #6, 1f
+ prfm pldl1strm, [src, 512]
+1:
+# elif defined(USE_THUNDERX2)
+ prfm pldl1strm, [src, 256]
+# endif
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [src, 16]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [src, 32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [src, 48]
+ stp D_l, D_h, [dst, 64]!
+ ldp D_l, D_h, [src, 64]!
+ subs count, count, 64
+ b.hi L(prefetch_loop64)
+ b L(last64)
+
+L(copy_long_without_prefetch):
+# endif
+
+ and tmp1, dstin, 15
+ bic dst, dstin, 15
+ ldp D_l, D_h, [src]
+ sub src, src, tmp1
+ add count, count, tmp1 /* Count is now 16 too large. */
+ ldp A_l, A_h, [src, 16]
+ stp D_l, D_h, [dstin]
+ ldp B_l, B_h, [src, 32]
+ ldp C_l, C_h, [src, 48]
+ ldp D_l, D_h, [src, 64]!
+ subs count, count, 128 + 16 /* Test and readjust count. */
+ b.ls L(last64)
+L(loop64):
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [src, 16]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [src, 32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [src, 48]
+ stp D_l, D_h, [dst, 64]!
+ ldp D_l, D_h, [src, 64]!
+ subs count, count, 64
+ b.hi L(loop64)
+
+ /* Write the last full set of 64 bytes. The remainder is at most 64
+ bytes, so it is safe to always copy 64 bytes from the end even if
+ there is just 1 byte left. */
+L(last64):
+ ldp E_l, E_h, [srcend, -64]
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [srcend, -48]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [srcend, -32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [srcend, -16]
+ stp D_l, D_h, [dst, 64]
+ stp E_l, E_h, [dstend, -64]
+ stp A_l, A_h, [dstend, -48]
+ stp B_l, B_h, [dstend, -32]
+ stp C_l, C_h, [dstend, -16]
+ ret
+
+ .p2align 4
+L(move_long):
+ cbz tmp1, 3f
+
+ add srcend, src, count
+ add dstend, dstin, count
+
+ /* Align dstend to 16 byte alignment so that we don't cross cache line
+ boundaries on both loads and stores. There are at least 96 bytes
+ to copy, so copy 16 bytes unaligned and then align. The loop
+ copies 64 bytes per iteration and prefetches one iteration ahead. */
+
+ and tmp1, dstend, 15
+ ldp D_l, D_h, [srcend, -16]
+ sub srcend, srcend, tmp1
+ sub count, count, tmp1
+ ldp A_l, A_h, [srcend, -16]
+ stp D_l, D_h, [dstend, -16]
+ ldp B_l, B_h, [srcend, -32]
+ ldp C_l, C_h, [srcend, -48]
+ ldp D_l, D_h, [srcend, -64]!
+ sub dstend, dstend, tmp1
+ subs count, count, 128
+ b.ls 2f
+
+ nop
+1:
+ stp A_l, A_h, [dstend, -16]
+ ldp A_l, A_h, [srcend, -16]
+ stp B_l, B_h, [dstend, -32]
+ ldp B_l, B_h, [srcend, -32]
+ stp C_l, C_h, [dstend, -48]
+ ldp C_l, C_h, [srcend, -48]
+ stp D_l, D_h, [dstend, -64]!
+ ldp D_l, D_h, [srcend, -64]!
+ subs count, count, 64
+ b.hi 1b
+
+ /* Write the last full set of 64 bytes. The remainder is at most 64
+ bytes, so it is safe to always copy 64 bytes from the start even if
+ there is just 1 byte left. */
+2:
+ ldp G_l, G_h, [src, 48]
+ stp A_l, A_h, [dstend, -16]
+ ldp A_l, A_h, [src, 32]
+ stp B_l, B_h, [dstend, -32]
+ ldp B_l, B_h, [src, 16]
+ stp C_l, C_h, [dstend, -48]
+ ldp C_l, C_h, [src]
+ stp D_l, D_h, [dstend, -64]
+ stp G_l, G_h, [dstin, 48]
+ stp A_l, A_h, [dstin, 32]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstin]
+3: ret
+
+END (MEMCPY)
+libc_hidden_builtin_def (MEMCPY)
+
+#endif
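
The 32 KiB cutoff and the 256/384/512-byte prefetch distances are taken from the assembly above; as a hedged C sketch of the same idea, GCC's __builtin_prefetch with locality 0 approximates the streaming pldl1strm hint (prefetching slightly past the buffer end is a harmless hint on AArch64):

    #include <stddef.h>
    #include <string.h>

    /* Only copies of 32 KiB or more get software prefetch, issued a
       few hundred bytes ahead of the load stream.  */
    void *prefetch_copy_sketch (void *dstv, const void *srcv, size_t count)
    {
      unsigned char *dst = dstv;
      const unsigned char *src = srcv;
      int prefetch = count >= 32768;

      while (count >= 64)
        {
          if (prefetch)
            __builtin_prefetch (src + 512, 0, 0);  /* read, streaming  */
          memcpy (dst, src, 64);
          dst += 64;
          src += 64;
          count -= 64;
        }
      if (count > 0)
        memcpy (dst, src, count);
      return dst - (src - (const unsigned char *) srcv);
    }
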
diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx2.S b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
new file mode 100644
index 0000000000..8501abf725
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
@@ -0,0 +1,27 @@
+/* A Thunderx2 Optimized memcpy implementation for AARCH64.
+ Copyright (C) 2018 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* The actual code in this memcpy and memmove is in memcpy_thunderx.S.
+ The only real differences are with the prefetching instructions. */
+
+#define MEMCPY __memcpy_thunderx2
+#define MEMMOVE __memmove_thunderx2
+#define USE_THUNDERX2
+
+#include "memcpy_thunderx.S"
diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c
new file mode 100644
index 0000000000..e69d816291
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memmove.c
@@ -0,0 +1,44 @@
+/* Multiple versions of memmove. AARCH64 version.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for the definition in libc. */
+
+#if IS_IN (libc)
+/* Redefine memmove so that the compiler won't complain about the type
+ mismatch with the IFUNC selector in strong_alias, below. */
+# undef memmove
+# define memmove __redirect_memmove
+# include <string.h>
+# include <init-arch.h>
+
+extern __typeof (__redirect_memmove) __libc_memmove;
+
+extern __typeof (__redirect_memmove) __memmove_generic attribute_hidden;
+extern __typeof (__redirect_memmove) __memmove_thunderx attribute_hidden;
+extern __typeof (__redirect_memmove) __memmove_falkor attribute_hidden;
+
+libc_ifunc (__libc_memmove,
+ (IS_THUNDERX (midr)
+ ? __memmove_thunderx
+ : (IS_FALKOR (midr) || IS_PHECDA (midr)
+ ? __memmove_falkor
+ : __memmove_generic)));
+
+# undef memmove
+strong_alias (__libc_memmove, memmove);
+#endif
diff --git a/sysdeps/aarch64/multiarch/memmove_falkor.S b/sysdeps/aarch64/multiarch/memmove_falkor.S
new file mode 100644
index 0000000000..5163f68e53
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memmove_falkor.S
@@ -0,0 +1,225 @@
+/* Copyright (C) 2017-2018 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* Assumptions: ARMv8-a, AArch64, falkor, unaligned accesses. */
+
+#define dstin x0
+#define src x1
+#define count x2
+#define dst x3
+#define srcend x4
+#define dstend x5
+#define A_x x6
+#define B_x x7
+#define A_w w6
+#define B_w w7
+#define tmp1 x14
+
+#define Q_q q6
+#define A_q q22
+#define B_q q18
+#define C_q q19
+#define D_q q20
+#define E_q q21
+#define F_q q17
+#define G_q q23
+
+/* RATIONALE:
+
+ The move has 4 distinct parts:
+ * Small moves of 16 bytes and under
+ * Medium sized moves of 17-96 bytes
+ * Large moves where the source address is higher than the destination
+ (forward copies)
+ * Large moves where the destination address is higher than the source
+ (copy backward, or move).
+
+   We use only two registers, q6 and q22, for the moves, moving 32 bytes at
+   a time to correctly train the hardware prefetcher for better throughput.  */
+ENTRY_ALIGN (__memmove_falkor, 6)
+
+ sub tmp1, dstin, src
+ add srcend, src, count
+ add dstend, dstin, count
+ cmp count, 96
+ ccmp tmp1, count, 2, hi
+ b.lo L(move_long)
+
+ cmp count, 16
+ b.ls L(copy16)
+ cmp count, 96
+ b.hi L(copy_long)
+
+ /* Medium copies: 17..96 bytes. */
+ sub tmp1, count, 1
+ ldr A_q, [src]
+ tbnz tmp1, 6, L(copy96)
+ ldr D_q, [srcend, -16]
+ tbz tmp1, 5, 1f
+ ldr B_q, [src, 16]
+ ldr C_q, [srcend, -32]
+ str B_q, [dstin, 16]
+ str C_q, [dstend, -32]
+1:
+ str A_q, [dstin]
+ str D_q, [dstend, -16]
+ ret
+
+ .p2align 4
+ /* Small copies: 0..16 bytes. */
+L(copy16):
+ cmp count, 8
+ b.lo 1f
+ ldr A_x, [src]
+ ldr B_x, [srcend, -8]
+ str A_x, [dstin]
+ str B_x, [dstend, -8]
+ ret
+ .p2align 4
+1:
+ /* 4-7 */
+ tbz count, 2, 1f
+ ldr A_w, [src]
+ ldr B_w, [srcend, -4]
+ str A_w, [dstin]
+ str B_w, [dstend, -4]
+ ret
+ .p2align 4
+1:
+ /* 2-3 */
+ tbz count, 1, 1f
+ ldrh A_w, [src]
+ ldrh B_w, [srcend, -2]
+ strh A_w, [dstin]
+ strh B_w, [dstend, -2]
+ ret
+ .p2align 4
+1:
+ /* 0-1 */
+ tbz count, 0, 1f
+ ldrb A_w, [src]
+ strb A_w, [dstin]
+1: ret
+
+ .p2align 4
+ /* Copy 64..96 bytes. Copy 64 bytes from the start and
+ 32 bytes from the end. */
+L(copy96):
+ ldr B_q, [src, 16]
+ ldr C_q, [src, 32]
+ ldr D_q, [src, 48]
+ ldr E_q, [srcend, -32]
+ ldr F_q, [srcend, -16]
+ str A_q, [dstin]
+ str B_q, [dstin, 16]
+ str C_q, [dstin, 32]
+ str D_q, [dstin, 48]
+ str E_q, [dstend, -32]
+ str F_q, [dstend, -16]
+ ret
+
+ /* Align SRC to 16 byte alignment so that we don't cross cache line
+ boundaries on both loads and stores. There are at least 96 bytes
+ to copy, so copy 16 bytes unaligned and then align. The loop
+ copies 32 bytes per iteration and prefetches one iteration ahead. */
+
+ .p2align 4
+L(copy_long):
+ ldr A_q, [src]
+ and tmp1, src, 15
+ bic src, src, 15
+ sub dst, dstin, tmp1
+ add count, count, tmp1 /* Count is now 16 too large. */
+ ldr Q_q, [src, 16]!
+ str A_q, [dstin]
+ ldr A_q, [src, 16]!
+ subs count, count, 32 + 64 + 16 /* Test and readjust count. */
+ b.ls L(last64)
+
+L(loop64):
+ subs count, count, 32
+ str Q_q, [dst, 16]
+ ldr Q_q, [src, 16]!
+ str A_q, [dst, 32]!
+ ldr A_q, [src, 16]!
+ b.hi L(loop64)
+
+ /* Write the last full set of 64 bytes. The remainder is at most 64
+ bytes and at least 33 bytes, so it is safe to always copy 64 bytes
+ from the end. */
+L(last64):
+ ldr C_q, [srcend, -64]
+ str Q_q, [dst, 16]
+ ldr B_q, [srcend, -48]
+ str A_q, [dst, 32]
+ ldr A_q, [srcend, -32]
+ ldr D_q, [srcend, -16]
+ str C_q, [dstend, -64]
+ str B_q, [dstend, -48]
+ str A_q, [dstend, -32]
+ str D_q, [dstend, -16]
+ ret
+
+ .p2align 4
+L(move_long):
+ cbz tmp1, 3f
+
+ /* Align SRCEND to 16 byte alignment so that we don't cross cache line
+ boundaries on both loads and stores. There are at least 96 bytes
+ to copy, so copy 16 bytes unaligned and then align. The loop
+ copies 32 bytes per iteration and prefetches one iteration ahead. */
+
+ ldr A_q, [srcend, -16]
+ and tmp1, srcend, 15
+ sub srcend, srcend, tmp1
+ ldr Q_q, [srcend, -16]!
+ str A_q, [dstend, -16]
+ sub count, count, tmp1
+ ldr A_q, [srcend, -16]!
+ sub dstend, dstend, tmp1
+ subs count, count, 32 + 64
+ b.ls 2f
+
+1:
+ subs count, count, 32
+ str Q_q, [dstend, -16]
+ ldr Q_q, [srcend, -16]!
+ str A_q, [dstend, -32]!
+ ldr A_q, [srcend, -16]!
+ b.hi 1b
+
+ /* Write the last full set of 64 bytes. The remainder is at most 64
+ bytes and at least 33 bytes, so it is safe to always copy 64 bytes
+ from the start. */
+2:
+ ldr C_q, [src, 48]
+ str Q_q, [dstend, -16]
+ ldr B_q, [src, 32]
+ str A_q, [dstend, -32]
+ ldr A_q, [src, 16]
+ ldr D_q, [src]
+ str C_q, [dstin, 48]
+ str B_q, [dstin, 32]
+ str A_q, [dstin, 16]
+ str D_q, [dstin]
+3: ret
+
+END (__memmove_falkor)
+libc_hidden_builtin_def (__memmove_falkor)
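
The two-register pipeline described in the RATIONALE comment can be rendered in C. A sketch for non-overlapping buffers with count >= 96 (16-byte bounce buffers stand in for q6/q22; each store is issued one slot behind its load so the loads form a single stream):

    #include <stddef.h>
    #include <string.h>

    void *pipelined_copy_sketch (void *dstv, const void *srcv, size_t count)
    {
      unsigned char *dst = dstv;
      const unsigned char *src = srcv;
      unsigned char q[16], a[16];

      memcpy (q, src, 16);                /* Prime the pipeline.  */
      memcpy (a, src + 16, 16);
      src += 32;
      while (count >= 32 + 64)            /* Keep a tail in hand.  */
        {
          memcpy (dst, q, 16);            /* Store Q...  */
          memcpy (q, src, 16);            /* ...then reload it.  */
          memcpy (dst + 16, a, 16);       /* Store A...  */
          memcpy (a, src + 16, 16);       /* ...then reload it.  */
          src += 32;
          dst += 32;
          count -= 32;
        }
      memcpy (dst, q, 16);                /* Drain the pipeline.  */
      memcpy (dst + 16, a, 16);
      dst += 32;
      count -= 32;
      memcpy (dst, src, count);           /* Simplified tail.  */
      return dstv;
    }
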
diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
new file mode 100644
index 0000000000..d74ed3a549
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memset.c
@@ -0,0 +1,41 @@
+/* Multiple versions of memset. AARCH64 version.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for the definition in libc. */
+
+#if IS_IN (libc)
+/* Redefine memset so that the compiler won't complain about the type
+ mismatch with the IFUNC selector in strong_alias, below. */
+# undef memset
+# define memset __redirect_memset
+# include <string.h>
+# include <init-arch.h>
+
+extern __typeof (__redirect_memset) __libc_memset;
+
+extern __typeof (__redirect_memset) __memset_falkor attribute_hidden;
+extern __typeof (__redirect_memset) __memset_generic attribute_hidden;
+
+libc_ifunc (__libc_memset,
+ ((IS_FALKOR (midr) || IS_PHECDA (midr)) && zva_size == 64
+ ? __memset_falkor
+ : __memset_generic));
+
+# undef memset
+strong_alias (__libc_memset, memset);
+#endif
diff --git a/sysdeps/aarch64/multiarch/memset_falkor.S b/sysdeps/aarch64/multiarch/memset_falkor.S
new file mode 100644
index 0000000000..16849a5afb
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memset_falkor.S
@@ -0,0 +1,53 @@
+/* Memset for falkor.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <memset-reg.h>
+
+/* Reading dczid_el0 is expensive on falkor, so we read it in the ifunc
+   resolver instead and assume a ZVA size of 64 bytes.  The IFUNC resolver
+   takes care to use this function only when ZVA is enabled.  */
+
+#if IS_IN (libc)
+.macro zva_macro
+ .p2align 4
+ /* Write the first and last 64 byte aligned block using stp rather
+ than using DC ZVA. This is faster on some cores. */
+ str q0, [dst, 16]
+ stp q0, q0, [dst, 32]
+ bic dst, dst, 63
+ stp q0, q0, [dst, 64]
+ stp q0, q0, [dst, 96]
+ sub count, dstend, dst /* Count is now 128 too large. */
+ sub count, count, 128+64+64 /* Adjust count and bias for loop. */
+ add dst, dst, 128
+1: dc zva, dst
+ add dst, dst, 64
+ subs count, count, 64
+ b.hi 1b
+ stp q0, q0, [dst, 0]
+ stp q0, q0, [dst, 32]
+ stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
+ ret
+.endm
+
+# define ZVA_MACRO zva_macro
+# define MEMSET __memset_falkor
+# include <sysdeps/aarch64/memset.S>
+#endif
diff --git a/sysdeps/aarch64/multiarch/memset_generic.S b/sysdeps/aarch64/multiarch/memset_generic.S
new file mode 100644
index 0000000000..345a88b94f
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memset_generic.S
@@ -0,0 +1,27 @@
+/* Memset for aarch64, default version for internal use.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+# define MEMSET __memset_generic
+/* Add a hidden definition for use within libc.so. */
+# ifdef SHARED
+ .globl __GI_memset; __GI_memset = __memset_generic
+# endif
+# include <sysdeps/aarch64/memset.S>
+#endif
diff --git a/sysdeps/aarch64/multiarch/rtld-memset.S b/sysdeps/aarch64/multiarch/rtld-memset.S
new file mode 100644
index 0000000000..8cdc6e1f50
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/rtld-memset.S
@@ -0,0 +1,23 @@
+/* Memset for aarch64, for the dynamic linker.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (rtld)
+# define MEMSET memset
+# include <sysdeps/aarch64/memset.S>
+#endif
diff --git a/sysdeps/aarch64/nptl/Makefile b/sysdeps/aarch64/nptl/Makefile
index 486aeca81e..bebbdef871 100644
--- a/sysdeps/aarch64/nptl/Makefile
+++ b/sysdeps/aarch64/nptl/Makefile
@@ -1,4 +1,4 @@
-# Copyright (C) 2005-2016 Free Software Foundation, Inc.
+# Copyright (C) 2005-2018 Free Software Foundation, Inc.
#
# This file is part of the GNU C Library.
#
diff --git a/sysdeps/aarch64/nptl/bits/pthreadtypes-arch.h b/sysdeps/aarch64/nptl/bits/pthreadtypes-arch.h
new file mode 100644
index 0000000000..008aa7e0c7
--- /dev/null
+++ b/sysdeps/aarch64/nptl/bits/pthreadtypes-arch.h
@@ -0,0 +1,71 @@
+/* Copyright (C) 2002-2018 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _BITS_PTHREADTYPES_ARCH_H
+#define _BITS_PTHREADTYPES_ARCH_H 1
+
+#include <endian.h>
+
+#ifdef __ILP32__
+# define __SIZEOF_PTHREAD_ATTR_T 32
+# define __SIZEOF_PTHREAD_MUTEX_T 32
+# define __SIZEOF_PTHREAD_MUTEXATTR_T 4
+# define __SIZEOF_PTHREAD_CONDATTR_T 4
+# define __SIZEOF_PTHREAD_RWLOCK_T 48
+# define __SIZEOF_PTHREAD_BARRIER_T 20
+# define __SIZEOF_PTHREAD_BARRIERATTR_T 4
+#else
+# define __SIZEOF_PTHREAD_ATTR_T 64
+# define __SIZEOF_PTHREAD_MUTEX_T 48
+# define __SIZEOF_PTHREAD_MUTEXATTR_T 8
+# define __SIZEOF_PTHREAD_CONDATTR_T 8
+# define __SIZEOF_PTHREAD_RWLOCK_T 56
+# define __SIZEOF_PTHREAD_BARRIER_T 32
+# define __SIZEOF_PTHREAD_BARRIERATTR_T 8
+#endif
+#define __SIZEOF_PTHREAD_COND_T 48
+#define __SIZEOF_PTHREAD_RWLOCKATTR_T 8
+
+/* Definitions for internal mutex struct. */
+#define __PTHREAD_COMPAT_PADDING_MID
+#define __PTHREAD_COMPAT_PADDING_END
+#define __PTHREAD_MUTEX_LOCK_ELISION 0
+#define __PTHREAD_MUTEX_NUSERS_AFTER_KIND 0
+#define __PTHREAD_MUTEX_USE_UNION 0
+
+#define __LOCK_ALIGNMENT
+#define __ONCE_ALIGNMENT
+
+struct __pthread_rwlock_arch_t
+{
+ unsigned int __readers;
+ unsigned int __writers;
+ unsigned int __wrphase_futex;
+ unsigned int __writers_futex;
+ unsigned int __pad3;
+ unsigned int __pad4;
+ int __cur_writer;
+ int __shared;
+ unsigned long int __pad1;
+ unsigned long int __pad2;
+ unsigned int __flags;
+};
+
+#define __PTHREAD_RWLOCK_ELISION_EXTRA 0
+
+#endif /* bits/pthreadtypes-arch.h */
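
These __SIZEOF_* constants fix the ABI: the public pthread types are opaque
unions sized by them, so any change to the internal structs must keep the
sizes intact. A compile-time check of the kind glibc's own tests perform
(LP64 values; under __ILP32__ the constants above differ):

    #include <pthread.h>
    #include <assert.h>          /* C11 static_assert */

    static_assert (sizeof (pthread_attr_t) == 64, "pthread_attr_t");
    static_assert (sizeof (pthread_mutex_t) == 48, "pthread_mutex_t");
    static_assert (sizeof (pthread_rwlock_t) == 56, "pthread_rwlock_t");
    static_assert (sizeof (pthread_cond_t) == 48, "pthread_cond_t");
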
diff --git a/sysdeps/aarch64/nptl/bits/pthreadtypes.h b/sysdeps/aarch64/nptl/bits/pthreadtypes.h
deleted file mode 100644
index 13984a718d..0000000000
--- a/sysdeps/aarch64/nptl/bits/pthreadtypes.h
+++ /dev/null
@@ -1,174 +0,0 @@
-/* Copyright (C) 2002-2016 Free Software Foundation, Inc.
-
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#ifndef _BITS_PTHREADTYPES_H
-#define _BITS_PTHREADTYPES_H 1
-
-#include <endian.h>
-
-#define __SIZEOF_PTHREAD_ATTR_T 64
-#define __SIZEOF_PTHREAD_MUTEX_T 48
-#define __SIZEOF_PTHREAD_MUTEXATTR_T 8
-#define __SIZEOF_PTHREAD_COND_T 48
-#define __SIZEOF_PTHREAD_COND_COMPAT_T 48
-#define __SIZEOF_PTHREAD_CONDATTR_T 8
-#define __SIZEOF_PTHREAD_RWLOCK_T 56
-#define __SIZEOF_PTHREAD_RWLOCKATTR_T 8
-#define __SIZEOF_PTHREAD_BARRIER_T 32
-#define __SIZEOF_PTHREAD_BARRIERATTR_T 8
-
-
-/* Thread identifiers. The structure of the attribute type is not
- exposed on purpose. */
-typedef unsigned long int pthread_t;
-
-
-union pthread_attr_t
-{
- char __size[__SIZEOF_PTHREAD_ATTR_T];
- long int __align;
-};
-#ifndef __have_pthread_attr_t
-typedef union pthread_attr_t pthread_attr_t;
-# define __have_pthread_attr_t 1
-#endif
-
-typedef struct __pthread_internal_list
-{
- struct __pthread_internal_list *__prev;
- struct __pthread_internal_list *__next;
-} __pthread_list_t;
-
-
-/* Data structures for mutex handling. The structure of the attribute
- type is not exposed on purpose. */
-typedef union
-{
- struct __pthread_mutex_s
- {
- int __lock;
- unsigned int __count;
- int __owner;
- unsigned int __nusers;
- int __kind;
- int __spins;
- __pthread_list_t __list;
-#define __PTHREAD_MUTEX_HAVE_PREV 1
- } __data;
- char __size[__SIZEOF_PTHREAD_MUTEX_T];
- long int __align;
-} pthread_mutex_t;
-
-/* Mutex __spins initializer used by PTHREAD_MUTEX_INITIALIZER. */
-#define __PTHREAD_SPINS 0
-
-typedef union
-{
- char __size[__SIZEOF_PTHREAD_MUTEXATTR_T];
- long int __align;
-} pthread_mutexattr_t;
-
-
-/* Data structure for conditional variable handling. The structure of
- the attribute type is not exposed on purpose. */
-typedef union
-{
- struct
- {
- int __lock;
- unsigned int __futex;
- __extension__ unsigned long long int __total_seq;
- __extension__ unsigned long long int __wakeup_seq;
- __extension__ unsigned long long int __woken_seq;
- void *__mutex;
- unsigned int __nwaiters;
- unsigned int __broadcast_seq;
- } __data;
- char __size[__SIZEOF_PTHREAD_COND_T];
- long int __align;
-} pthread_cond_t;
-
-typedef union
-{
- char __size[__SIZEOF_PTHREAD_CONDATTR_T];
- int __align;
-} pthread_condattr_t;
-
-
-/* Keys for thread-specific data */
-typedef unsigned int pthread_key_t;
-
-
-/* Once-only execution */
-typedef int pthread_once_t;
-
-
-#if defined __USE_UNIX98 || defined __USE_XOPEN2K
-/* Data structure for read-write lock variable handling. The
- structure of the attribute type is not exposed on purpose. */
-typedef union
-{
- struct
- {
- int __lock;
- unsigned int __nr_readers;
- unsigned int __readers_wakeup;
- unsigned int __writer_wakeup;
- unsigned int __nr_readers_queued;
- unsigned int __nr_writers_queued;
- int __writer;
- int __shared;
- unsigned long int __pad1;
- unsigned long int __pad2;
- unsigned int __flags;
- } __data;
- char __size[__SIZEOF_PTHREAD_RWLOCK_T];
- long int __align;
-} pthread_rwlock_t;
-
-#define __PTHREAD_RWLOCK_ELISION_EXTRA 0
-
-typedef union
-{
- char __size[__SIZEOF_PTHREAD_RWLOCKATTR_T];
- long int __align;
-} pthread_rwlockattr_t;
-#endif
-
-
-#ifdef __USE_XOPEN2K
-/* POSIX spinlock data type. */
-typedef volatile int pthread_spinlock_t;
-
-
-/* POSIX barriers data type. The structure of the type is
- deliberately not exposed. */
-typedef union
-{
- char __size[__SIZEOF_PTHREAD_BARRIER_T];
- long int __align;
-} pthread_barrier_t;
-
-typedef union
-{
- char __size[__SIZEOF_PTHREAD_BARRIERATTR_T];
- int __align;
-} pthread_barrierattr_t;
-#endif
-
-#endif /* bits/pthreadtypes.h */
diff --git a/sysdeps/aarch64/nptl/bits/semaphore.h b/sysdeps/aarch64/nptl/bits/semaphore.h
index 3cc5b37847..18f8aae532 100644
--- a/sysdeps/aarch64/nptl/bits/semaphore.h
+++ b/sysdeps/aarch64/nptl/bits/semaphore.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2002-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2002-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -21,7 +21,11 @@
#endif
-#define __SIZEOF_SEM_T 32
+#ifdef __ILP32__
+# define __SIZEOF_SEM_T 16
+#else
+# define __SIZEOF_SEM_T 32
+#endif
/* Value returned if `sem_open' failed. */
@@ -31,5 +35,5 @@
typedef union
{
char __size[__SIZEOF_SEM_T];
- long int __align;
+ long long int __align;
} sem_t;
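
Two ILP32 fixes in one hunk: sem_t shrinks to 16 bytes, and the __align
member becomes long long so the union keeps 8-byte alignment even where
long is only 4 bytes. Illustrative checks using the aarch64 values above:

    #include <semaphore.h>
    #include <stdalign.h>
    #include <assert.h>

    static_assert (alignof (sem_t) == alignof (long long), "sem_t align");
    #ifdef __ILP32__
    static_assert (sizeof (sem_t) == 16, "sem_t size, ILP32");
    #else
    static_assert (sizeof (sem_t) == 32, "sem_t size, LP64");
    #endif
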
diff --git a/sysdeps/aarch64/nptl/pthread-offsets.h b/sysdeps/aarch64/nptl/pthread-offsets.h
new file mode 100644
index 0000000000..16c6b0d9fd
--- /dev/null
+++ b/sysdeps/aarch64/nptl/pthread-offsets.h
@@ -0,0 +1,5 @@
+#define __PTHREAD_MUTEX_NUSERS_OFFSET 12
+#define __PTHREAD_MUTEX_KIND_OFFSET 16
+#define __PTHREAD_MUTEX_SPINS_OFFSET 20
+#define __PTHREAD_MUTEX_ELISION_OFFSET 22
+#define __PTHREAD_MUTEX_LIST_OFFSET 24
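
These offsets mirror the layout of the internal mutex structure so assembly
and C agree on field positions. A sketch of the correspondence, using a
simplified stand-in rather than glibc's real struct __pthread_mutex_s:

    #include <stddef.h>

    struct mutex_layout              /* illustrative stand-in, LP64 */
    {
      int lock;                      /* offset 0 */
      unsigned int count;            /* offset 4 */
      int owner;                     /* offset 8 */
      unsigned int nusers;           /* offset 12 */
      int kind;                      /* offset 16 */
      short spins;                   /* offset 20 */
      short elision;                 /* offset 22 */
      void *list_prev, *list_next;   /* offset 24 */
    };

    _Static_assert (offsetof (struct mutex_layout, nusers) == 12, "nusers");
    _Static_assert (offsetof (struct mutex_layout, kind) == 16, "kind");
    _Static_assert (offsetof (struct mutex_layout, spins) == 20, "spins");
    _Static_assert (offsetof (struct mutex_layout, elision) == 22, "elision");
    _Static_assert (offsetof (struct mutex_layout, list_prev) == 24, "list");
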
diff --git a/sysdeps/aarch64/nptl/pthread_spin_lock.c b/sysdeps/aarch64/nptl/pthread_spin_lock.c
deleted file mode 100644
index 21a54b8215..0000000000
--- a/sysdeps/aarch64/nptl/pthread_spin_lock.c
+++ /dev/null
@@ -1,24 +0,0 @@
-/* Copyright (C) 2008-2016 Free Software Foundation, Inc.
-
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public License as
- published by the Free Software Foundation; either version 2.1 of the
- License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#define SPIN_LOCK_READS_BETWEEN_CMPXCHG 1000
-
-/* We can't use the normal "#include <nptl/pthread_spin_lock.c>" because
- it will resolve to this very file. Using "sysdeps/.." as reference to the
- top level directory does the job. */
-#include <sysdeps/../nptl/pthread_spin_lock.c>
diff --git a/sysdeps/aarch64/nptl/pthreaddef.h b/sysdeps/aarch64/nptl/pthreaddef.h
index 5aa8d561e7..32f729ada1 100644
--- a/sysdeps/aarch64/nptl/pthreaddef.h
+++ b/sysdeps/aarch64/nptl/pthreaddef.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2002-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2002-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
diff --git a/sysdeps/aarch64/nptl/tcb-offsets.sym b/sysdeps/aarch64/nptl/tcb-offsets.sym
index 0677aeabff..238647dd47 100644
--- a/sysdeps/aarch64/nptl/tcb-offsets.sym
+++ b/sysdeps/aarch64/nptl/tcb-offsets.sym
@@ -2,6 +2,5 @@
#include <tls.h>
PTHREAD_MULTIPLE_THREADS_OFFSET offsetof (struct pthread, header.multiple_threads)
-PTHREAD_PID_OFFSET offsetof (struct pthread, pid)
PTHREAD_TID_OFFSET offsetof (struct pthread, tid)
PTHREAD_SIZEOF sizeof (struct pthread)
diff --git a/sysdeps/aarch64/nptl/tls.h b/sysdeps/aarch64/nptl/tls.h
index 95ea3f9a1a..200334f84a 100644
--- a/sysdeps/aarch64/nptl/tls.h
+++ b/sysdeps/aarch64/nptl/tls.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2005-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2005-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -25,17 +25,7 @@
# include <stdbool.h>
# include <stddef.h>
# include <stdint.h>
-
-/* Type for the dtv. */
-typedef union dtv
-{
- size_t counter;
- struct
- {
- void *val;
- bool is_static;
- } pointer;
-} dtv_t;
+# include <dl-dtv.h>
#else /* __ASSEMBLER__ */
# include <tcb-offsets.h>
@@ -119,6 +109,7 @@ typedef struct
descr->member[idx] = (value)
/* Get and set the global scope generation counter in struct pthread. */
+# define THREAD_GSCOPE_IN_TCB 1
# define THREAD_GSCOPE_FLAG_UNUSED 0
# define THREAD_GSCOPE_FLAG_USED 1
# define THREAD_GSCOPE_FLAG_WAIT 2
diff --git a/sysdeps/aarch64/rawmemchr.S b/sysdeps/aarch64/rawmemchr.S
new file mode 100644
index 0000000000..de3c05f337
--- /dev/null
+++ b/sysdeps/aarch64/rawmemchr.S
@@ -0,0 +1,42 @@
+/* rawmemchr - find a character in a memory zone
+
+ Copyright (C) 2015-2018 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* Special case rawmemchr (s, 0) as strlen, otherwise tailcall memchr.
+ Call strlen without setting up a full frame - it preserves x14/x15.
+*/
+
+ENTRY (__rawmemchr)
+ cbz w1, L(do_strlen)
+ mov x2, -1
+ b __memchr
+
+L(do_strlen):
+ mov x15, x30
+ cfi_return_column (x15)
+ mov x14, x0
+ bl __strlen
+ add x0, x14, x0
+ ret x15
+
+END (__rawmemchr)
+weak_alias (__rawmemchr, rawmemchr)
+libc_hidden_builtin_def (__rawmemchr)
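
In C terms the new entry point does the following; the assembly avoids a
full frame because __strlen preserves x14/x15, and it calls the hidden
__strlen and __memchr entries rather than the public names:

    #include <string.h>

    /* C-level equivalent of __rawmemchr above (sketch).  */
    void *
    rawmemchr_sketch (const void *s, int c)
    {
      if (c == 0)
        return (char *) s + strlen (s);
      /* Tail-call memchr with an effectively unbounded length.  */
      return memchr (s, c, (size_t) -1);
    }
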
diff --git a/sysdeps/aarch64/setjmp.S b/sysdeps/aarch64/setjmp.S
index 22f4368ee6..d8838e7b54 100644
--- a/sysdeps/aarch64/setjmp.S
+++ b/sysdeps/aarch64/setjmp.S
@@ -1,4 +1,4 @@
-/* Copyright (C) 1997-2016 Free Software Foundation, Inc.
+/* Copyright (C) 1997-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -33,6 +33,7 @@ END (_setjmp)
libc_hidden_def (_setjmp)
ENTRY (__sigsetjmp)
+ DELOUSE (0)
1:
stp x19, x20, [x0, #JB_X19<<3]
@@ -42,7 +43,7 @@ ENTRY (__sigsetjmp)
stp x27, x28, [x0, #JB_X27<<3]
#ifdef PTR_MANGLE
- PTR_MANGLE (x4, x30, x3, x2)
+ PTR_MANGLE (4, 30, 3, 2)
stp x29, x4, [x0, #JB_X29<<3]
#else
stp x29, x30, [x0, #JB_X29<<3]
@@ -57,7 +58,7 @@ ENTRY (__sigsetjmp)
stp d14, d15, [x0, #JB_D14<<3]
#ifdef PTR_MANGLE
mov x4, sp
- PTR_MANGLE (x5, x4, x3, x2)
+ PTR_MANGLE (5, 4, 3, 2)
str x5, [x0, #JB_SP<<3]
#else
mov x2, sp
diff --git a/sysdeps/aarch64/soft-fp/sfp-machine.h b/sysdeps/aarch64/sfp-machine.h
index 3e969952fa..3e969952fa 100644
--- a/sysdeps/aarch64/soft-fp/sfp-machine.h
+++ b/sysdeps/aarch64/sfp-machine.h
diff --git a/sysdeps/aarch64/soft-fp/Makefile b/sysdeps/aarch64/soft-fp/Makefile
deleted file mode 100644
index ada13e8b70..0000000000
--- a/sysdeps/aarch64/soft-fp/Makefile
+++ /dev/null
@@ -1,3 +0,0 @@
-ifeq ($(subdir),math)
-CPPFLAGS += -I../soft-fp
-endif
diff --git a/sysdeps/aarch64/sotruss-lib.c b/sysdeps/aarch64/sotruss-lib.c
index 65174402b8..5dafacc21f 100644
--- a/sysdeps/aarch64/sotruss-lib.c
+++ b/sysdeps/aarch64/sotruss-lib.c
@@ -1,5 +1,5 @@
/* Override generic sotruss-lib.c to define actual functions for AArch64.
- Copyright (C) 2012-2016 Free Software Foundation, Inc.
+ Copyright (C) 2012-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
diff --git a/sysdeps/aarch64/stackinfo.h b/sysdeps/aarch64/stackinfo.h
index 78f3262033..7e666fbe18 100644
--- a/sysdeps/aarch64/stackinfo.h
+++ b/sysdeps/aarch64/stackinfo.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2001-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2001-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
diff --git a/sysdeps/aarch64/start.S b/sysdeps/aarch64/start.S
index efe24749f7..bad000f555 100644
--- a/sysdeps/aarch64/start.S
+++ b/sysdeps/aarch64/start.S
@@ -1,4 +1,4 @@
-/* Copyright (C) 1995-2016 Free Software Foundation, Inc.
+/* Copyright (C) 1995-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -16,6 +16,8 @@
License along with the GNU C Library. If not, see
<http://www.gnu.org/licenses/>. */
+#include <sysdep.h>
+
/* This is the canonical entry point, usually the first thing in the text
segment.
@@ -25,7 +27,7 @@
At this entry point, most registers' values are unspecified, except:
- x0 Contains a function pointer to be registered with `atexit'.
+ x0/w0 Contains a function pointer to be registered with `atexit'.
This is how the dynamic linker arranges to have DT_FINI
functions called for shared libraries that have been loaded
before this code runs.
@@ -52,26 +54,35 @@ _start:
mov x5, x0
/* Load argc and a pointer to argv */
- ldr x1, [sp, #0]
- add x2, sp, #8
+ ldr PTR_REG (1), [sp, #0]
+ add x2, sp, #PTR_SIZE
/* Setup stack limit in argument register */
mov x6, sp
-#ifdef SHARED
+#ifdef PIC
+# ifdef SHARED
adrp x0, :got:main
- ldr x0, [x0, #:got_lo12:main]
+ ldr PTR_REG (0), [x0, #:got_lo12:main]
adrp x3, :got:__libc_csu_init
- ldr x3, [x3, #:got_lo12:__libc_csu_init]
+ ldr PTR_REG (3), [x3, #:got_lo12:__libc_csu_init]
adrp x4, :got:__libc_csu_fini
- ldr x4, [x4, #:got_lo12:__libc_csu_fini]
+ ldr PTR_REG (4), [x4, #:got_lo12:__libc_csu_fini]
+# else
+ adrp x0, __wrap_main
+ add x0, x0, :lo12:__wrap_main
+ adrp x3, __libc_csu_init
+ add x3, x3, :lo12:__libc_csu_init
+ adrp x4, __libc_csu_fini
+ add x4, x4, :lo12:__libc_csu_fini
+# endif
#else
/* Set up the other arguments in registers */
- ldr x0, =main
- ldr x3, =__libc_csu_init
- ldr x4, =__libc_csu_fini
+ MOVL (0, main)
+ MOVL (3, __libc_csu_init)
+ MOVL (4, __libc_csu_fini)
#endif
/* __libc_start_main (main, argc, argv, init, fini, rtld_fini,
@@ -83,6 +94,15 @@ _start:
/* should never get here....*/
bl abort
+#if defined PIC && !defined SHARED
+ /* When main is not defined in the executable but in a shared library,
+ a wrapper is needed in crt1.o of the static-PIE enabled libc,
+ because crt1.o and rcrt1.o share code and the latter must avoid the
+ use of GOT relocations before __libc_start_main is called. */
+__wrap_main:
+ b main
+#endif
+
/* Define a symbol for the first piece of initialized data. */
.data
.globl __data_start
diff --git a/sysdeps/aarch64/stpcpy.S b/sysdeps/aarch64/stpcpy.S
index cbbf26c499..dcf1356bda 100644
--- a/sysdeps/aarch64/stpcpy.S
+++ b/sysdeps/aarch64/stpcpy.S
@@ -1,5 +1,5 @@
/* stpcpy - copy a string returning pointer to end.
- Copyright (C) 2015-2016 Free Software Foundation, Inc.
+ Copyright (C) 2015-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/aarch64/strchr.S b/sysdeps/aarch64/strchr.S
index 5e3aecf4c2..c27465220b 100644
--- a/sysdeps/aarch64/strchr.S
+++ b/sysdeps/aarch64/strchr.S
@@ -1,6 +1,6 @@
/* strchr - find a character in a string
- Copyright (C) 2014-2016 Free Software Foundation, Inc.
+ Copyright (C) 2014-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -62,6 +62,7 @@
/* Locals and temporaries. */
ENTRY (strchr)
+ DELOUSE (0)
mov wtmp2, #0x0401
movk wtmp2, #0x4010, lsl #16
dup vrepchr.16b, chrin
diff --git a/sysdeps/aarch64/strchrnul.S b/sysdeps/aarch64/strchrnul.S
index a624c8d2ef..e13ace5b7e 100644
--- a/sysdeps/aarch64/strchrnul.S
+++ b/sysdeps/aarch64/strchrnul.S
@@ -1,6 +1,6 @@
/* strchrnul - find a character or nul in a string
- Copyright (C) 2014-2016 Free Software Foundation, Inc.
+ Copyright (C) 2014-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -60,6 +60,7 @@
identify exactly which byte is causing the termination. */
ENTRY (__strchrnul)
+ DELOUSE (0)
/* Magic constant 0x40100401 to allow us to identify which lane
matches the termination condition. */
mov wtmp2, #0x0401
diff --git a/sysdeps/aarch64/strcmp.S b/sysdeps/aarch64/strcmp.S
index ba0ccb409b..267aa4b551 100644
--- a/sysdeps/aarch64/strcmp.S
+++ b/sysdeps/aarch64/strcmp.S
@@ -1,4 +1,4 @@
-/* Copyright (C) 2012-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2012-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -49,6 +49,8 @@
/* Start of performance-critical section -- one 64B cache line. */
ENTRY_ALIGN(strcmp, 6)
+ DELOUSE (0)
+ DELOUSE (1)
eor tmp1, src1, src2
mov zeroones, #REP8_01
tst tmp1, #7
@@ -70,6 +72,7 @@ L(start_realigned):
cbz syndrome, L(loop_aligned)
/* End of performance-critical section -- one 64B cache line. */
+L(end):
#ifndef __AARCH64EB__
rev syndrome, syndrome
rev data1, data1
@@ -143,12 +146,38 @@ L(mutual_align):
b L(start_realigned)
L(misaligned8):
- /* We can do better than this. */
+ /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
+ checking that we do not read beyond the page boundary in
+ SRC2. */
+ tst src1, #7
+ b.eq L(loop_misaligned)
+L(do_misaligned):
ldrb data1w, [src1], #1
ldrb data2w, [src2], #1
cmp data1w, #1
ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
- b.eq L(misaligned8)
+ b.ne L(done)
+ tst src1, #7
+ b.ne L(do_misaligned)
+
+L(loop_misaligned):
+ /* Test if we are within the last dword of a 4K page. If so, jump
+ back to the misaligned loop to compare a byte at a time. */
+ and tmp1, src2, #0xff8
+ eor tmp1, tmp1, #0xff8
+ cbz tmp1, L(do_misaligned)
+ ldr data1, [src1], #8
+ ldr data2, [src2], #8
+
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+ orr syndrome, diff, has_nul
+ cbz syndrome, L(loop_misaligned)
+ b L(end)
+
+L(done):
sub result, data1, data2
RET
END(strcmp)
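
The new misaligned path guards every 8-byte load of SRC2 with a page-end
test: "and tmp1, src2, #0xff8; eor tmp1, tmp1, #0xff8; cbz ..." is the
predicate below, and when it fires the code falls back to the byte loop so
the dword load cannot fault on the next page. A C rendering (sketch):

    #include <stdint.h>

    /* True when P lies in the last dword of a 4 KiB page, i.e. an
       8-byte load at P might touch the following page.  */
    static inline int
    near_page_end (const void *p)
    {
      return ((uintptr_t) p & 0xff8) == 0xff8;
    }
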
diff --git a/sysdeps/aarch64/strcpy.S b/sysdeps/aarch64/strcpy.S
index 0694199372..0449fcb04e 100644
--- a/sysdeps/aarch64/strcpy.S
+++ b/sysdeps/aarch64/strcpy.S
@@ -1,5 +1,5 @@
/* strcpy/stpcpy - copy a string returning pointer to start/end.
- Copyright (C) 2013-2016 Free Software Foundation, Inc.
+ Copyright (C) 2013-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -91,6 +91,8 @@
#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2)
ENTRY_ALIGN (STRCPY, 6)
+ DELOUSE (0)
+ DELOUSE (1)
/* For moderately short strings, the fastest way to do the copy is to
calculate the length of the string in the same way as strlen, then
essentially do a memcpy of the result. This avoids the need for
diff --git a/sysdeps/aarch64/string_private.h b/sysdeps/aarch64/string_private.h
index 8b194a41a6..ba6297417a 100644
--- a/sysdeps/aarch64/string_private.h
+++ b/sysdeps/aarch64/string_private.h
@@ -1,5 +1,5 @@
/* Define _STRING_ARCH_unaligned. AArch64 version.
- Copyright (C) 2016 Free Software Foundation, Inc.
+ Copyright (C) 2016-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
index 9b4d1da60c..32292e3b6e 100644
--- a/sysdeps/aarch64/strlen.S
+++ b/sysdeps/aarch64/strlen.S
@@ -1,4 +1,4 @@
-/* Copyright (C) 2012-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2012-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -84,7 +84,9 @@
whether the first fetch, which may be misaligned, crosses a page
boundary. */
-ENTRY_ALIGN (strlen, 6)
+ENTRY_ALIGN (__strlen, 6)
+ DELOUSE (0)
+ DELOUSE (1)
and tmp1, srcin, MIN_PAGE_SIZE - 1
mov zeroones, REP8_01
cmp tmp1, MIN_PAGE_SIZE - 16
@@ -213,5 +215,6 @@ L(page_cross):
csel data1, data1, tmp4, eq
csel data2, data2, tmp2, eq
b L(page_cross_entry)
-END (strlen)
+END (__strlen)
+weak_alias (__strlen, strlen)
libc_hidden_builtin_def (strlen)
diff --git a/sysdeps/aarch64/strncmp.S b/sysdeps/aarch64/strncmp.S
index f6a17fdba2..759c752fc2 100644
--- a/sysdeps/aarch64/strncmp.S
+++ b/sysdeps/aarch64/strncmp.S
@@ -1,4 +1,4 @@
-/* Copyright (C) 2013-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2013-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -49,15 +49,19 @@
#define limit_wd x13
#define mask x14
#define endloop x15
+#define count mask
ENTRY_ALIGN_AND_PAD (strncmp, 6, 7)
+ DELOUSE (0)
+ DELOUSE (1)
+ DELOUSE (2)
cbz limit, L(ret0)
eor tmp1, src1, src2
mov zeroones, #REP8_01
tst tmp1, #7
+ and count, src1, #7
b.ne L(misaligned8)
- ands tmp1, src1, #7
- b.ne L(mutual_align)
+ cbnz count, L(mutual_align)
/* Calculate the number of full and partial words -1. */
sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */
@@ -162,43 +166,107 @@ L(mutual_align):
bic src1, src1, #7
bic src2, src2, #7
ldr data1, [src1], #8
- neg tmp3, tmp1, lsl #3 /* 64 - bits(bytes beyond align). */
+ neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */
ldr data2, [src2], #8
mov tmp2, #~0
sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
#ifdef __AARCH64EB__
/* Big-endian. Early bytes are at MSB. */
- lsl tmp2, tmp2, tmp3 /* Shift (tmp1 & 63). */
+ lsl tmp2, tmp2, tmp3 /* Shift (count & 63). */
#else
/* Little-endian. Early bytes are at LSB. */
- lsr tmp2, tmp2, tmp3 /* Shift (tmp1 & 63). */
+ lsr tmp2, tmp2, tmp3 /* Shift (count & 63). */
#endif
and tmp3, limit_wd, #7
lsr limit_wd, limit_wd, #3
/* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */
- add limit, limit, tmp1
- add tmp3, tmp3, tmp1
+ add limit, limit, count
+ add tmp3, tmp3, count
orr data1, data1, tmp2
orr data2, data2, tmp2
add limit_wd, limit_wd, tmp3, lsr #3
b L(start_realigned)
-L(ret0):
- mov result, #0
- RET
-
.p2align 6
+ /* Don't bother with dwords for up to 16 bytes. */
L(misaligned8):
- sub limit, limit, #1
-1:
+ cmp limit, #16
+ b.hs L(try_misaligned_words)
+
+L(byte_loop):
/* Perhaps we can do better than this. */
ldrb data1w, [src1], #1
ldrb data2w, [src2], #1
subs limit, limit, #1
- ccmp data1w, #1, #0, cs /* NZCV = 0b0000. */
+ ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */
ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
- b.eq 1b
+ b.eq L(byte_loop)
+L(done):
sub result, data1, data2
RET
+
+ /* Align SRC1 to a dword boundary with a bytewise compare, then do
+ the dword loop. */
+L(try_misaligned_words):
+ lsr limit_wd, limit, #3
+ cbz count, L(do_misaligned)
+
+ neg count, count
+ and count, count, #7
+ sub limit, limit, count
+ lsr limit_wd, limit, #3
+
+L(page_end_loop):
+ ldrb data1w, [src1], #1
+ ldrb data2w, [src2], #1
+ cmp data1w, #1
+ ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
+ b.ne L(done)
+ subs count, count, #1
+ b.hi L(page_end_loop)
+
+L(do_misaligned):
+ /* Prepare ourselves for the next page crossing. Unlike the aligned
+ loop, we fetch one dword fewer because we risk crossing a page
+ boundary on SRC2. */
+ mov count, #8
+ subs limit_wd, limit_wd, #1
+ b.lo L(done_loop)
+L(loop_misaligned):
+ and tmp2, src2, #0xff8
+ eor tmp2, tmp2, #0xff8
+ cbz tmp2, L(page_end_loop)
+
+ ldr data1, [src1], #8
+ ldr data2, [src2], #8
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+ ccmp diff, #0, #0, eq
+ b.ne L(not_limit)
+ subs limit_wd, limit_wd, #1
+ b.pl L(loop_misaligned)
+
+L(done_loop):
+ /* We found a difference or a NUL before the limit was reached. */
+ and limit, limit, #7
+ cbz limit, L(not_limit)
+ /* Read the last word. */
+ sub src1, src1, 8
+ sub src2, src2, 8
+ ldr data1, [src1, limit]
+ ldr data2, [src2, limit]
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+ ccmp diff, #0, #0, eq
+ b.ne L(not_limit)
+
+L(ret0):
+ mov result, #0
+ RET
+
END (strncmp)
libc_hidden_builtin_def (strncmp)
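
Both the strcmp and strncmp dword loops detect a NUL terminator with the
same bit trick: "sub tmp1, data1, zeroones", "orr tmp2, data1, #REP8_7f",
"bic(s) has_nul, tmp1, tmp2". In C, a sketch of the idiom rather than
glibc code:

    #include <stdint.h>

    #define REP8_01 0x0101010101010101ULL
    #define REP8_7f 0x7f7f7f7f7f7f7f7fULL

    /* Nonzero iff some byte of X is zero: subtracting 1 from a zero
       byte borrows into bit 7, and the mask keeps only those borrows
       that did not come from a byte with its own high bit set.  */
    static inline uint64_t
    has_zero_byte (uint64_t x)
    {
      return (x - REP8_01) & ~(x | REP8_7f);
    }
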
diff --git a/sysdeps/aarch64/strnlen.S b/sysdeps/aarch64/strnlen.S
index 4cce45f905..0f190e83e2 100644
--- a/sysdeps/aarch64/strnlen.S
+++ b/sysdeps/aarch64/strnlen.S
@@ -1,6 +1,6 @@
/* strnlen - calculate the length of a string with limit.
- Copyright (C) 2013-2016 Free Software Foundation, Inc.
+ Copyright (C) 2013-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -50,6 +50,9 @@
#define REP8_80 0x8080808080808080
ENTRY_ALIGN_AND_PAD (__strnlen, 6, 9)
+ DELOUSE (0)
+ DELOUSE (1)
+ DELOUSE (2)
cbz limit, L(hit_limit)
mov zeroones, #REP8_01
bic src, srcin, #15
diff --git a/sysdeps/aarch64/strrchr.S b/sysdeps/aarch64/strrchr.S
index 44c1917a20..aa334ede53 100644
--- a/sysdeps/aarch64/strrchr.S
+++ b/sysdeps/aarch64/strrchr.S
@@ -1,6 +1,6 @@
/* strrchr: find the last instance of a character in a string.
- Copyright (C) 2014-2016 Free Software Foundation, Inc.
+ Copyright (C) 2014-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -68,6 +68,7 @@
identify exactly which byte is causing the termination, and why. */
ENTRY(strrchr)
+ DELOUSE (0)
cbz x1, L(null_search)
/* Magic constant 0x40100401 to allow us to identify which lane
matches the requested byte. Magic constant 0x80200802 used
diff --git a/sysdeps/aarch64/sysdep.h b/sysdeps/aarch64/sysdep.h
index 6b728ec651..5b30709436 100644
--- a/sysdeps/aarch64/sysdep.h
+++ b/sysdeps/aarch64/sysdep.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 1997-2016 Free Software Foundation, Inc.
+/* Copyright (C) 1997-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -16,8 +16,25 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
+#ifndef _AARCH64_SYSDEP_H
+#define _AARCH64_SYSDEP_H
+
#include <sysdeps/generic/sysdep.h>
+#ifdef __LP64__
+# define AARCH64_R(NAME) R_AARCH64_ ## NAME
+# define PTR_REG(n) x##n
+# define PTR_LOG_SIZE 3
+# define DELOUSE(n)
+#else
+# define AARCH64_R(NAME) R_AARCH64_P32_ ## NAME
+# define PTR_REG(n) w##n
+# define PTR_LOG_SIZE 2
+# define DELOUSE(n) mov w##n, w##n
+#endif
+
+#define PTR_SIZE (1<<PTR_LOG_SIZE)
+
#ifdef __ASSEMBLER__
/* Syntactic details of assembler. */
@@ -66,9 +83,38 @@
/* If compiled for profiling, call `mcount' at the start of each function. */
#ifdef PROF
# define CALL_MCOUNT \
- str x30, [sp, #-16]!; \
+ str x30, [sp, #-80]!; \
+ cfi_adjust_cfa_offset (80); \
+ cfi_rel_offset (x30, 0); \
+ stp x0, x1, [sp, #16]; \
+ cfi_rel_offset (x0, 16); \
+ cfi_rel_offset (x1, 24); \
+ stp x2, x3, [sp, #32]; \
+ cfi_rel_offset (x2, 32); \
+ cfi_rel_offset (x3, 40); \
+ stp x4, x5, [sp, #48]; \
+ cfi_rel_offset (x4, 48); \
+ cfi_rel_offset (x5, 56); \
+ stp x6, x7, [sp, #64]; \
+ cfi_rel_offset (x6, 64); \
+ cfi_rel_offset (x7, 72); \
+ mov x0, x30; \
bl mcount; \
- ldr x30, [sp], #16 ;
+ ldp x0, x1, [sp, #16]; \
+ cfi_restore (x0); \
+ cfi_restore (x1); \
+ ldp x2, x3, [sp, #32]; \
+ cfi_restore (x2); \
+ cfi_restore (x3); \
+ ldp x4, x5, [sp, #48]; \
+ cfi_restore (x4); \
+ cfi_restore (x5); \
+ ldp x6, x7, [sp, #64]; \
+ cfi_restore (x6); \
+ cfi_restore (x7); \
+ ldr x30, [sp], #80; \
+ cfi_adjust_cfa_offset (-80); \
+ cfi_restore (x30);
#else
# define CALL_MCOUNT /* Do nothing. */
#endif
@@ -78,16 +124,32 @@
# define L(name) .L##name
#endif
-/* Load or store to/from a pc-relative EXPR into/from R, using T. */
-#define LDST_PCREL(OP, R, T, EXPR) \
- adrp T, EXPR; \
- OP R, [T, #:lo12:EXPR];\
-
-/* Load or store to/from a got-relative EXPR into/from R, using T. */
-#define LDST_GLOBAL(OP, R, T, EXPR) \
- adrp T, :got:EXPR; \
- ldr T, [T, #:got_lo12:EXPR];\
- OP R, [T];
+/* Load or store to/from a pc-relative EXPR into/from R, using T.
+ Note R and T are register numbers and not register names. */
+#define LDST_PCREL(OP, R, T, EXPR) \
+ adrp x##T, EXPR; \
+ OP PTR_REG (R), [x##T, #:lo12:EXPR]; \
+
+/* Load or store to/from a got-relative EXPR into/from R, using T.
+ Note R and T are register numbers and not register names. */
+#define LDST_GLOBAL(OP, R, T, EXPR) \
+ adrp x##T, :got:EXPR; \
+ ldr PTR_REG (T), [x##T, #:got_lo12:EXPR]; \
+ OP PTR_REG (R), [x##T];
+
+/* Load an immediate into R.
+ Note R is a register number and not a register name. */
+#ifdef __LP64__
+# define MOVL(R, NAME) \
+ movz PTR_REG (R), #:abs_g3:NAME; \
+ movk PTR_REG (R), #:abs_g2_nc:NAME; \
+ movk PTR_REG (R), #:abs_g1_nc:NAME; \
+ movk PTR_REG (R), #:abs_g0_nc:NAME;
+#else
+# define MOVL(R, NAME) \
+ movz PTR_REG (R), #:abs_g1:NAME; \
+ movk PTR_REG (R), #:abs_g0_nc:NAME;
+#endif
/* Since C identifiers are not normally prefixed with an underscore
on this system, the asm identifier `syscall_error' intrudes on the
@@ -96,3 +158,5 @@
#define mcount _mcount
#endif /* __ASSEMBLER__ */
+
+#endif /* _AARCH64_SYSDEP_H */
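
The new PTR_* family parameterizes pointer width for ILP32: PTR_REG picks
the w or x view of a register, DELOUSE zero-extends a 32-bit pointer
argument ("mov wN, wN"), and PTR_SIZE scales pointer-sized loads such as
the argc fetch in start.S. A minimal compile-time sketch of the PTR_SIZE
relation, mirroring the definitions in the hunk above:

    #include <assert.h>

    #ifdef __LP64__
    # define PTR_LOG_SIZE 3      /* pointers are 8 bytes; PTR_REG(n) is xN */
    #else
    # define PTR_LOG_SIZE 2      /* ILP32: 4 bytes; PTR_REG(n) is wN */
    #endif
    #define PTR_SIZE (1 << PTR_LOG_SIZE)

    static_assert (PTR_SIZE == sizeof (void *), "PTR_SIZE matches the ABI");
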
diff --git a/sysdeps/aarch64/tls-macros.h b/sysdeps/aarch64/tls-macros.h
index 2080a4dfa4..cc58a7d6bf 100644
--- a/sysdeps/aarch64/tls-macros.h
+++ b/sysdeps/aarch64/tls-macros.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2009-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2009-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
diff --git a/sysdeps/aarch64/tlsdesc.c b/sysdeps/aarch64/tlsdesc.c
index ac3a9f4591..357465f23d 100644
--- a/sysdeps/aarch64/tlsdesc.c
+++ b/sysdeps/aarch64/tlsdesc.c
@@ -1,6 +1,6 @@
/* Manage TLS descriptors. AArch64 version.
- Copyright (C) 2011-2016 Free Software Foundation, Inc.
+ Copyright (C) 2011-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -18,143 +18,17 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#include <link.h>
#include <ldsodefs.h>
-#include <elf/dynamic-link.h>
#include <tls.h>
#include <dl-tlsdesc.h>
#include <dl-unmap-segments.h>
+#define _dl_tlsdesc_resolve_hold 0
#include <tlsdeschtab.h>
-#include <atomic.h>
-
-/* The following functions take an entry_check_offset argument. It's
- computed by the caller as an offset between its entry point and the
- call site, such that by adding the built-in return address that is
- implicitly passed to the function with this offset, we can easily
- obtain the caller's entry point to compare with the entry point
- given in the TLS descriptor. If it's changed, we want to return
- immediately. */
-
-/* This function is used to lazily resolve TLS_DESC RELA relocations.
- The argument location is used to hold a pointer to the relocation. */
-
-void
-attribute_hidden
-_dl_tlsdesc_resolve_rela_fixup (struct tlsdesc *td, struct link_map *l)
-{
- const ElfW(Rela) *reloc = atomic_load_relaxed (&td->arg);
-
- /* After GL(dl_load_lock) is grabbed only one caller can see td->entry in
- initial state in _dl_tlsdesc_resolve_early_return_p, other concurrent
- callers will return and retry calling td->entry. The updated td->entry
- synchronizes with the single writer so all read accesses here can use
- relaxed order. */
- if (_dl_tlsdesc_resolve_early_return_p
- (td, (void*)(D_PTR (l, l_info[ADDRIDX (DT_TLSDESC_PLT)]) + l->l_addr)))
- return;
-
- /* The code below was borrowed from _dl_fixup(),
- except for checking for STB_LOCAL. */
- const ElfW(Sym) *const symtab
- = (const void *) D_PTR (l, l_info[DT_SYMTAB]);
- const char *strtab = (const void *) D_PTR (l, l_info[DT_STRTAB]);
- const ElfW(Sym) *sym = &symtab[ELFW(R_SYM) (reloc->r_info)];
- lookup_t result;
-
- /* Look up the target symbol. If the normal lookup rules are not
- used don't look in the global scope. */
- if (ELFW(ST_BIND) (sym->st_info) != STB_LOCAL
- && __builtin_expect (ELFW(ST_VISIBILITY) (sym->st_other), 0) == 0)
- {
- const struct r_found_version *version = NULL;
-
- if (l->l_info[VERSYMIDX (DT_VERSYM)] != NULL)
- {
- const ElfW(Half) *vernum =
- (const void *) D_PTR (l, l_info[VERSYMIDX (DT_VERSYM)]);
- ElfW(Half) ndx = vernum[ELFW(R_SYM) (reloc->r_info)] & 0x7fff;
- version = &l->l_versions[ndx];
- if (version->hash == 0)
- version = NULL;
- }
-
- result = _dl_lookup_symbol_x (strtab + sym->st_name, l, &sym,
- l->l_scope, version, ELF_RTYPE_CLASS_PLT,
- DL_LOOKUP_ADD_DEPENDENCY, NULL);
- }
- else
- {
- /* We already found the symbol. The module (and therefore its load
- address) is also known. */
- result = l;
- }
-
- if (!sym)
- {
- atomic_store_relaxed (&td->arg, (void *) reloc->r_addend);
- /* This release store synchronizes with the ldar acquire load
- instruction in _dl_tlsdesc_undefweak. */
- atomic_store_release (&td->entry, _dl_tlsdesc_undefweak);
- }
- else
- {
-# ifndef SHARED
- CHECK_STATIC_TLS (l, result);
-# else
- if (!TRY_STATIC_TLS (l, result))
- {
- void *p = _dl_make_tlsdesc_dynamic (result, sym->st_value
- + reloc->r_addend);
- atomic_store_relaxed (&td->arg, p);
- /* This release store synchronizes with the ldar acquire load
- instruction in _dl_tlsdesc_dynamic. */
- atomic_store_release (&td->entry, _dl_tlsdesc_dynamic);
- }
- else
-# endif
- {
- void *p = (void*) (sym->st_value + result->l_tls_offset
- + reloc->r_addend);
- atomic_store_relaxed (&td->arg, p);
- /* This release store synchronizes with the ldar acquire load
- instruction in _dl_tlsdesc_return_lazy. */
- atomic_store_release (&td->entry, _dl_tlsdesc_return_lazy);
- }
- }
-
- _dl_tlsdesc_wake_up_held_fixups ();
-}
-
-/* This function is used to avoid busy waiting for other threads to
- complete the lazy relocation. Once another thread wins the race to
- relocate a TLS descriptor, it sets the descriptor up such that this
- function is called to wait until the resolver releases the
- lock. */
-
-void
-attribute_hidden
-_dl_tlsdesc_resolve_hold_fixup (struct tlsdesc *td, void *caller)
-{
- /* Maybe we're lucky and can return early. */
- if (caller != atomic_load_relaxed (&td->entry))
- return;
-
- /* Locking here will stop execution until the running resolver runs
- _dl_tlsdesc_wake_up_held_fixups(), releasing the lock.
-
- FIXME: We'd be better off waiting on a condition variable, such
- that we didn't have to hold the lock throughout the relocation
- processing. */
- __rtld_lock_lock_recursive (GL(dl_load_lock));
- __rtld_lock_unlock_recursive (GL(dl_load_lock));
-}
-
/* Unmap the dynamic object, but also release its TLS descriptor table
if there is one. */
void
-internal_function
_dl_unmap (struct link_map *map)
{
_dl_unmap_segments (map);
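
This hunk removes the lazy TLSDESC resolvers; defining
_dl_tlsdesc_resolve_hold to 0 compiles the hold path out of the shared
tlsdeschtab.h code, and TLSDESC relocations are now resolved eagerly at
load time. For orientation, the descriptor a consumer sees is an
(entry, arg) pair, roughly as follows - a sketch modeled on struct tlsdesc
in dl-tlsdesc.h:

    #include <stddef.h>

    struct tlsdesc_sketch
    {
      ptrdiff_t (*entry) (struct tlsdesc_sketch *);
      void *arg;
    };

    /* Compiled code performs an indirect call through entry, which
       returns the variable's offset from the thread pointer.  */
    static inline void *
    tls_addr (struct tlsdesc_sketch *td, char *thread_pointer)
    {
      return thread_pointer + td->entry (td);
    }
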
diff --git a/sysdeps/aarch64/tlsdesc.sym b/sysdeps/aarch64/tlsdesc.sym
index 63766af612..a06a719779 100644
--- a/sysdeps/aarch64/tlsdesc.sym
+++ b/sysdeps/aarch64/tlsdesc.sym
@@ -13,3 +13,6 @@ TLSDESC_ARG offsetof(struct tlsdesc, arg)
TLSDESC_GEN_COUNT offsetof(struct tlsdesc_dynamic_arg, gen_count)
TLSDESC_MODID offsetof(struct tlsdesc_dynamic_arg, tlsinfo.ti_module)
TLSDESC_MODOFF offsetof(struct tlsdesc_dynamic_arg, tlsinfo.ti_offset)
+TCBHEAD_DTV offsetof(tcbhead_t, dtv)
+DTV_COUNTER offsetof(dtv_t, counter)
+TLS_DTV_UNALLOCATED TLS_DTV_UNALLOCATED
diff --git a/sysdeps/aarch64/tst-audit.h b/sysdeps/aarch64/tst-audit.h
index 8ed01484e3..fd4f38f613 100644
--- a/sysdeps/aarch64/tst-audit.h
+++ b/sysdeps/aarch64/tst-audit.h
@@ -1,6 +1,6 @@
/* Definitions for testing PLT entry/exit auditing. AArch64 version.
- Copyright (C) 2005-2016 Free Software Foundation, Inc.
+ Copyright (C) 2005-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.