author	Samuel Thibault <samuel.thibault@ens-lyon.org>	2018-12-27 19:16:25 +0000
committer	Samuel Thibault <samuel.thibault@ens-lyon.org>	2018-12-27 19:16:25 +0000
commit	8d59503b977070aaa4e504e8d6dcb7da3711893e
tree	8272c9c2cce43afa4fe4d8d92c269a6435242661 /sysdeps/aarch64
parent	76a7dc16fab8853ef9230447fa98c70a3619dc6d
parent	bcea9593527d90b9f9ff3817e3fbf0fbc3d01fa7
Merge commit 'refs/top-bases/t/gsync-libc-merge' into t/gsync-libc-merge
Diffstat (limited to 'sysdeps/aarch64')
134 files changed, 3188 insertions, 2008 deletions
diff --git a/sysdeps/aarch64/Implies b/sysdeps/aarch64/Implies
index e5adf4d63c..a1d5e2e742 100644
--- a/sysdeps/aarch64/Implies
+++ b/sysdeps/aarch64/Implies
@@ -3,4 +3,3 @@ ieee754/ldbl-128
 ieee754/dbl-64/wordsize-64
 ieee754/dbl-64
 ieee754/flt-32
-aarch64/soft-fp
diff --git a/sysdeps/aarch64/Makefile b/sysdeps/aarch64/Makefile
index 06323550e9..94baaf52dd 100644
--- a/sysdeps/aarch64/Makefile
+++ b/sysdeps/aarch64/Makefile
@@ -1,9 +1,5 @@
 long-double-fcts = yes
-ifeq ($(subdir),debug)
-CFLAGS-backtrace.c += -funwind-tables
-endif
-
 ifeq ($(subdir),elf)
 sysdep-dl-routines += tlsdesc dl-tlsdesc
 gen-as-const-headers += dl-link.sym
@@ -12,3 +8,11 @@ endif
 ifeq ($(subdir),csu)
 gen-as-const-headers += tlsdesc.sym
 endif
+
+ifeq ($(subdir),gmon)
+CFLAGS-mcount.c += -mgeneral-regs-only
+endif
+
+ifeq ($(subdir),math)
+CPPFLAGS += -I../soft-fp
+endif
diff --git a/sysdeps/aarch64/__longjmp.S b/sysdeps/aarch64/__longjmp.S
index 65116be557..3365bd70f9 100644
--- a/sysdeps/aarch64/__longjmp.S
+++ b/sysdeps/aarch64/__longjmp.S
@@ -1,4 +1,4 @@
-/* Copyright (C) 1997-2016 Free Software Foundation, Inc.
+/* Copyright (C) 1997-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
@@ -46,6 +46,8 @@ ENTRY (__longjmp)
 	cfi_offset(d14, JB_D14<<3)
 	cfi_offset(d15, JB_D15<<3)
+	DELOUSE (0)
+
 	ldp	x19, x20, [x0, #JB_X19<<3]
 	ldp	x21, x22, [x0, #JB_X21<<3]
 	ldp	x23, x24, [x0, #JB_X23<<3]
@@ -53,7 +55,7 @@ ENTRY (__longjmp)
 	ldp	x27, x28, [x0, #JB_X27<<3]
 #ifdef PTR_DEMANGLE
 	ldp	x29, x4, [x0, #JB_X29<<3]
-	PTR_DEMANGLE (x30, x4, x3, x2)
+	PTR_DEMANGLE (30, 4, 3, 2)
 #else
 	ldp	x29, x30, [x0, #JB_X29<<3]
 #endif
@@ -98,7 +100,7 @@ ENTRY (__longjmp)
 	cfi_same_value(d15)
 #ifdef PTR_DEMANGLE
 	ldr	x4, [x0, #JB_SP<<3]
-	PTR_DEMANGLE (x5, x4, x3, x2)
+	PTR_DEMANGLE (5, 4, 3, 2)
 #else
 	ldr	x5, [x0, #JB_SP<<3]
 #endif
diff --git a/sysdeps/aarch64/atomic-machine.h b/sysdeps/aarch64/atomic-machine.h
index 28c80dc42d..63b24e625c 100644
--- a/sysdeps/aarch64/atomic-machine.h
+++ b/sysdeps/aarch64/atomic-machine.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2003-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2003-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
@@ -38,6 +38,7 @@ typedef uintmax_t uatomic_max_t;
 #define __HAVE_64B_ATOMICS 1
 #define USE_ATOMIC_COMPILER_BUILTINS 1
+#define ATOMIC_EXCHANGE_USES_CAS 0
 
 /* Compare and exchange.
    For all "bool" routines, we return FALSE if exchange succesful.  */
@@ -115,10 +116,6 @@ typedef uintmax_t uatomic_max_t;
 
 /* Compare and exchange with "release" semantics, ie barrier before.  */
 
-# define atomic_compare_and_exchange_bool_rel(mem, new, old) \
-  __atomic_bool_bysize (__arch_compare_and_exchange_bool, int, \
-			mem, new, old, __ATOMIC_RELEASE)
-
 # define atomic_compare_and_exchange_val_rel(mem, new, old) \
   __atomic_val_bysize (__arch_compare_and_exchange_val, int, \
		       mem, new, old, __ATOMIC_RELEASE)
diff --git a/sysdeps/aarch64/backtrace.c b/sysdeps/aarch64/backtrace.c
deleted file mode 100644
index 27ce597b39..0000000000
--- a/sysdeps/aarch64/backtrace.c
+++ /dev/null
@@ -1 +0,0 @@
-#include <sysdeps/x86_64/backtrace.c>
diff --git a/sysdeps/aarch64/bits/endian.h b/sysdeps/aarch64/bits/endian.h
index a32ebc0d80..1f18733422 100644
--- a/sysdeps/aarch64/bits/endian.h
+++ b/sysdeps/aarch64/bits/endian.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 1997-2016 Free Software Foundation, Inc.
+/* Copyright (C) 1997-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
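Before the diff continues, the atomic-machine.h hunk above is worth a gloss: the header now leans entirely on the GCC `__atomic` builtins (`USE_ATOMIC_COMPILER_BUILTINS`) and declares `ATOMIC_EXCHANGE_USES_CAS 0`, meaning atomic exchange has its own instruction path on AArch64 rather than being synthesized from compare-and-swap. A minimal sketch of the builtin calls involved — the `demo_*` names are ours, not glibc's internals:

```c
#include <stdio.h>

/* Roughly what glibc's __atomic_*_bysize wrappers boil down to once
   the compiler builtins are in use.  */
static inline int
demo_cas_acquire (int *mem, int *expected, int desired)
{
  /* Returns nonzero on success; note that glibc's "bool" variants
     invert this, returning FALSE when the exchange succeeded.  */
  return __atomic_compare_exchange_n (mem, expected, desired,
                                      0 /* strong */,
                                      __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
}

static inline int
demo_exchange_acquire (int *mem, int newval)
{
  /* With ATOMIC_EXCHANGE_USES_CAS == 0 this can lower to a native
     swap (or an LDAXR/STXR pair) rather than a CAS retry loop.  */
  return __atomic_exchange_n (mem, newval, __ATOMIC_ACQUIRE);
}

int
main (void)
{
  int v = 1, expected = 1;
  demo_cas_acquire (&v, &expected, 2);
  printf ("after cas: %d, swapped-out value: %d\n",
          v, demo_exchange_acquire (&v, 3));
  return 0;
}
```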
diff --git a/sysdeps/aarch64/bits/fenv.h b/sysdeps/aarch64/bits/fenv.h
index aced0d33b2..8a0f481f79 100644
--- a/sysdeps/aarch64/bits/fenv.h
+++ b/sysdeps/aarch64/bits/fenv.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2004-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2004-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
@@ -72,3 +72,11 @@ fenv_t;
 /* Floating-point environment where none of the exceptions are masked.  */
 # define FE_NOMASK_ENV	((const fenv_t *) -2)
 #endif
+
+#if __GLIBC_USE (IEC_60559_BFP_EXT)
+/* Type representing floating-point control modes.  */
+typedef unsigned int femode_t;
+
+/* Default floating-point control modes.  */
+# define FE_DFL_MODE	((const femode_t *) -1L)
+#endif
diff --git a/sysdeps/aarch64/fpu/s_frint.c b/sysdeps/aarch64/bits/fp-fast.h
index 321c8ef50d..3ce12210b7 100644
--- a/sysdeps/aarch64/fpu/s_frint.c
+++ b/sysdeps/aarch64/bits/fp-fast.h
@@ -1,5 +1,5 @@
-/* Copyright (C) 1996-2016 Free Software Foundation, Inc.
-
+/* Define FP_FAST_* macros.  AArch64 version.
+   Copyright (C) 2016-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
 
   The GNU C Library is free software; you can redistribute it and/or
@@ -16,34 +16,19 @@
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
 
-#include <math.h>
-
-#ifndef FUNC
-# error FUNC not defined
-#endif
-
-#ifndef TYPE
-# define TYPE double
-# define REGS "d"
-#else
-# ifndef REGS
-#  error REGS not defined
-# endif
+#ifndef _MATH_H
+# error "Never use <bits/fp-fast.h> directly; include <math.h> instead."
 #endif
 
-#ifndef INSN
-# error INSN not defined
-#endif
+#ifdef __USE_ISOC99
 
-#define __CONCATX(a,b) __CONCAT(a,b)
+/* The GCC 4.6 compiler will define __FP_FAST_FMA{,F,L} if the fma{,f,l}
+   builtins are supported.  */
+# define FP_FAST_FMA 1
+# define FP_FAST_FMAF 1
 
-TYPE
-__CONCATX(__,FUNC) (TYPE x)
-{
-  TYPE result;
-  asm ( INSN "\t%" REGS "0, %" REGS "1" :
-        "=w" (result) : "w" (x) );
-  return result;
-}
+# ifdef __FP_FAST_FMAL
+#  define FP_FAST_FMAL 1
+# endif
 
-weak_alias (__CONCATX(__,FUNC), FUNC)
+#endif
diff --git a/sysdeps/aarch64/bits/link.h b/sysdeps/aarch64/bits/link.h
index 0bf961519e..5a7fc1ccd4 100644
--- a/sysdeps/aarch64/bits/link.h
+++ b/sysdeps/aarch64/bits/link.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2005-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2005-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
diff --git a/sysdeps/aarch64/bits/setjmp.h b/sysdeps/aarch64/bits/setjmp.h
index fff8616c60..1685f1076a 100644
--- a/sysdeps/aarch64/bits/setjmp.h
+++ b/sysdeps/aarch64/bits/setjmp.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 1997-2016 Free Software Foundation, Inc.
+/* Copyright (C) 1997-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
diff --git a/sysdeps/aarch64/bits/string.h b/sysdeps/aarch64/bits/wordsize.h
index 0a508f72c0..ea7eecdf92 100644
--- a/sysdeps/aarch64/bits/string.h
+++ b/sysdeps/aarch64/bits/wordsize.h
@@ -1,5 +1,6 @@
-/* Optimized, inlined string functions.  AArch64 version.
-   Copyright (C) 2015-2016 Free Software Foundation, Inc.
+/* Determine the wordsize from the preprocessor defines.
+
+   Copyright (C) 2016-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
 
   The GNU C Library is free software; you can redistribute it and/or
@@ -16,9 +17,12 @@
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
 
-#ifndef _STRING_H
-# error "Never use <bits/string.h> directly; include <string.h> instead."
+#ifdef __LP64__
+# define __WORDSIZE 64
+#else
+# define __WORDSIZE 32
+# define __WORDSIZE32_SIZE_ULONG 1
+# define __WORDSIZE32_PTRDIFF_LONG 1
 #endif
 
-/* AArch64 uses the aligned string inline ABI.  */
-#define _STRING_INLINE_unaligned 0
+#define __WORDSIZE_TIME64_COMPAT32 0
diff --git a/sysdeps/aarch64/crti.S b/sysdeps/aarch64/crti.S
index 53ccb42587..2b213758b2 100644
--- a/sysdeps/aarch64/crti.S
+++ b/sysdeps/aarch64/crti.S
@@ -1,5 +1,5 @@
 /* Special .init and .fini section support for AArch64.
-   Copyright (C) 1995-2016 Free Software Foundation, Inc.
+   Copyright (C) 1995-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
@@ -39,6 +39,7 @@
   they can be called as functions.  The symbols _init and _fini are
   magic and cause the linker to emit DT_INIT and DT_FINI.  */
 
+#include <sysdep.h>
 #include <libc-symbols.h>
 
 #ifndef PREINIT_FUNCTION
@@ -60,7 +61,7 @@
 	.type call_weak_fn, %function
 call_weak_fn:
 	adrp	x0, :got:PREINIT_FUNCTION
-	ldr	x0, [x0, #:got_lo12:PREINIT_FUNCTION]
+	ldr	PTR_REG (0), [x0, #:got_lo12:PREINIT_FUNCTION]
 	cbz	x0, 1f
 	b	PREINIT_FUNCTION
 1:
@@ -71,6 +72,7 @@ call_weak_fn:
 	.section .init,"ax",%progbits
 	.align 2
 	.global _init
+	.hidden _init
 	.type _init, %function
 _init:
 	stp	x29, x30, [sp, -16]!
@@ -84,6 +86,7 @@ _init:
 	.section .fini,"ax",%progbits
 	.align 2
 	.global _fini
+	.hidden _fini
 	.type _fini, %function
 _fini:
 	stp	x29, x30, [sp, -16]!
diff --git a/sysdeps/aarch64/crtn.S b/sysdeps/aarch64/crtn.S
index 33d5f3d822..d72300af80 100644
--- a/sysdeps/aarch64/crtn.S
+++ b/sysdeps/aarch64/crtn.S
@@ -1,5 +1,5 @@
 /* Special .init and .fini section support for AArch64.
-   Copyright (C) 1995-2016 Free Software Foundation, Inc.
+   Copyright (C) 1995-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
diff --git a/sysdeps/aarch64/dl-irel.h b/sysdeps/aarch64/dl-irel.h
index 63a8e506eb..5889ee187b 100644
--- a/sysdeps/aarch64/dl-irel.h
+++ b/sysdeps/aarch64/dl-irel.h
@@ -1,6 +1,6 @@
 /* Machine-dependent ELF indirect relocation inline functions.
    AArch64 version.
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
+   Copyright (C) 2012-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
 
   The GNU C Library is free software; you can redistribute it and/or
@@ -23,6 +23,7 @@
 #include <stdio.h>
 #include <unistd.h>
 #include <ldsodefs.h>
+#include <sysdep.h>
 
 #define ELF_MACHINE_IRELA 1
 
@@ -30,7 +31,7 @@ static inline ElfW(Addr)
 __attribute ((always_inline))
 elf_ifunc_invoke (ElfW(Addr) addr)
 {
-  return ((ElfW(Addr) (*) (unsigned long int)) (addr)) (GLRO(dl_hwcap));
+  return ((ElfW(Addr) (*) (uint64_t)) (addr)) (GLRO(dl_hwcap));
 }
 
 static inline void
@@ -40,7 +41,7 @@ elf_irela (const ElfW(Rela) *reloc)
   ElfW(Addr) *const reloc_addr = (void *) reloc->r_offset;
   const unsigned long int r_type = ELFW(R_TYPE) (reloc->r_info);
 
-  if (__glibc_likely (r_type == R_AARCH64_IRELATIVE))
+  if (__glibc_likely (r_type == AARCH64_R(IRELATIVE)))
     {
       ElfW(Addr) value = elf_ifunc_invoke (reloc->r_addend);
       *reloc_addr = value;
diff --git a/sysdeps/aarch64/dl-machine.h b/sysdeps/aarch64/dl-machine.h
index 282805e396..4935aa7c54 100644
--- a/sysdeps/aarch64/dl-machine.h
+++ b/sysdeps/aarch64/dl-machine.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 1995-2016 Free Software Foundation, Inc.
+/* Copyright (C) 1995-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
@@ -21,9 +21,11 @@ #define ELF_MACHINE_NAME "aarch64" +#include <sysdep.h> #include <tls.h> #include <dl-tlsdesc.h> #include <dl-irel.h> +#include <cpu-features.c> /* Return nonzero iff ELF header is compatible with the running host. */ static inline int __attribute__ ((unused)) @@ -49,26 +51,11 @@ elf_machine_load_address (void) /* To figure out the load address we use the definition that for any symbol: dynamic_addr(symbol) = static_addr(symbol) + load_addr - The choice of symbol is arbitrary. The static address we obtain - by constructing a non GOT reference to the symbol, the dynamic - address of the symbol we compute using adrp/add to compute the - symbol's address relative to the PC. - This depends on 32bit relocations being resolved at link time - and that the static address fits in the 32bits. */ - - ElfW(Addr) static_addr; - ElfW(Addr) dynamic_addr; - - asm (" \n" -" adrp %1, _dl_start; \n" -" add %1, %1, #:lo12:_dl_start \n" -" ldr %w0, 1f \n" -" b 2f \n" -"1: \n" -" .word _dl_start \n" -"2: \n" - : "=r" (static_addr), "=r" (dynamic_addr)); - return dynamic_addr - static_addr; + _DYNAMIC sysmbol is used here as its link-time address stored in + the special unrelocated first GOT entry. */ + + extern ElfW(Dyn) _DYNAMIC[] attribute_hidden; + return (ElfW(Addr)) &_DYNAMIC - elf_machine_dynamic (); } /* Set up the loaded object described by L so its unrelocated PLT @@ -115,97 +102,117 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile) } } - if (l->l_info[ADDRIDX (DT_TLSDESC_GOT)] && lazy) - *(ElfW(Addr)*)(D_PTR (l, l_info[ADDRIDX (DT_TLSDESC_GOT)]) + l->l_addr) - = (ElfW(Addr)) &_dl_tlsdesc_resolve_rela; - return lazy; } /* Initial entry point for the dynamic linker. The C function _dl_start is the real entry point, its return value is the user program's entry point */ +#ifdef __LP64__ +# define RTLD_START RTLD_START_1 ("x", "3", "sp") +#else +# define RTLD_START RTLD_START_1 ("w", "2", "wsp") +#endif -#define RTLD_START asm ("\ -.text \n\ -.globl _start \n\ -.type _start, %function \n\ -.globl _dl_start_user \n\ -.type _dl_start_user, %function \n\ -_start: \n\ - mov x0, sp \n\ - bl _dl_start \n\ - // returns user entry point in x0 \n\ - mov x21, x0 \n\ -_dl_start_user: \n\ - // get the original arg count \n\ - ldr x1, [sp] \n\ - // get the argv address \n\ - add x2, sp, #8 \n\ - // get _dl_skip_args to see if we were \n\ - // invoked as an executable \n\ - adrp x4, _dl_skip_args \n\ - ldr w4, [x4, #:lo12:_dl_skip_args] \n\ - // do we need to adjust argc/argv \n\ - cmp w4, 0 \n\ - beq .L_done_stack_adjust \n\ - // subtract _dl_skip_args from original arg count \n\ - sub x1, x1, x4 \n\ - // store adjusted argc back to stack \n\ - str x1, [sp] \n\ - // find the first unskipped argument \n\ - mov x3, x2 \n\ - add x4, x2, x4, lsl #3 \n\ - // shuffle argv down \n\ -1: ldr x5, [x4], #8 \n\ - str x5, [x3], #8 \n\ - cmp x5, #0 \n\ - bne 1b \n\ - // shuffle envp down \n\ -1: ldr x5, [x4], #8 \n\ - str x5, [x3], #8 \n\ - cmp x5, #0 \n\ - bne 1b \n\ - // shuffle auxv down \n\ -1: ldp x0, x5, [x4, #16]! 
\n\ - stp x0, x5, [x3], #16 \n\ - cmp x0, #0 \n\ - bne 1b \n\ - // Update _dl_argv \n\ - adrp x3, _dl_argv \n\ - str x2, [x3, #:lo12:_dl_argv] \n\ -.L_done_stack_adjust: \n\ - // compute envp \n\ - add x3, x2, x1, lsl #3 \n\ - add x3, x3, #8 \n\ - adrp x16, _rtld_local \n\ - add x16, x16, #:lo12:_rtld_local \n\ - ldr x0, [x16] \n\ - bl _dl_init \n\ - // load the finalizer function \n\ - adrp x0, _dl_fini \n\ - add x0, x0, #:lo12:_dl_fini \n\ - // jump to the user_s entry point \n\ - br x21 \n\ + +#define RTLD_START_1(PTR, PTR_SIZE_LOG, PTR_SP) asm ("\ +.text \n\ +.globl _start \n\ +.type _start, %function \n\ +.globl _dl_start_user \n\ +.type _dl_start_user, %function \n\ +_start: \n\ + mov " PTR "0, " PTR_SP " \n\ + bl _dl_start \n\ + // returns user entry point in x0 \n\ + mov x21, x0 \n\ +_dl_start_user: \n\ + // get the original arg count \n\ + ldr " PTR "1, [sp] \n\ + // get the argv address \n\ + add " PTR "2, " PTR_SP ", #(1<<" PTR_SIZE_LOG ") \n\ + // get _dl_skip_args to see if we were \n\ + // invoked as an executable \n\ + adrp x4, _dl_skip_args \n\ + ldr w4, [x4, #:lo12:_dl_skip_args] \n\ + // do we need to adjust argc/argv \n\ + cmp w4, 0 \n\ + beq .L_done_stack_adjust \n\ + // subtract _dl_skip_args from original arg count \n\ + sub " PTR "1, " PTR "1, " PTR "4 \n\ + // store adjusted argc back to stack \n\ + str " PTR "1, [sp] \n\ + // find the first unskipped argument \n\ + mov " PTR "3, " PTR "2 \n\ + add " PTR "4, " PTR "2, " PTR "4, lsl #" PTR_SIZE_LOG " \n\ + // shuffle argv down \n\ +1: ldr " PTR "5, [x4], #(1<<" PTR_SIZE_LOG ") \n\ + str " PTR "5, [x3], #(1<<" PTR_SIZE_LOG ") \n\ + cmp " PTR "5, #0 \n\ + bne 1b \n\ + // shuffle envp down \n\ +1: ldr " PTR "5, [x4], #(1<<" PTR_SIZE_LOG ") \n\ + str " PTR "5, [x3], #(1<<" PTR_SIZE_LOG ") \n\ + cmp " PTR "5, #0 \n\ + bne 1b \n\ + // shuffle auxv down \n\ +1: ldp " PTR "0, " PTR "5, [x4, #(2<<" PTR_SIZE_LOG ")]! 
\n\ + stp " PTR "0, " PTR "5, [x3], #(2<<" PTR_SIZE_LOG ") \n\ + cmp " PTR "0, #0 \n\ + bne 1b \n\ + // Update _dl_argv \n\ + adrp x3, __GI__dl_argv \n\ + str " PTR "2, [x3, #:lo12:__GI__dl_argv] \n\ +.L_done_stack_adjust: \n\ + // compute envp \n\ + add " PTR "3, " PTR "2, " PTR "1, lsl #" PTR_SIZE_LOG " \n\ + add " PTR "3, " PTR "3, #(1<<" PTR_SIZE_LOG ") \n\ + adrp x16, _rtld_local \n\ + add " PTR "16, " PTR "16, #:lo12:_rtld_local \n\ + ldr " PTR "0, [x16] \n\ + bl _dl_init \n\ + // load the finalizer function \n\ + adrp x0, _dl_fini \n\ + add " PTR "0, " PTR "0, #:lo12:_dl_fini \n\ + // jump to the user_s entry point \n\ + br x21 \n\ "); #define elf_machine_type_class(type) \ - ((((type) == R_AARCH64_JUMP_SLOT || \ - (type) == R_AARCH64_TLS_DTPMOD || \ - (type) == R_AARCH64_TLS_DTPREL || \ - (type) == R_AARCH64_TLS_TPREL || \ - (type) == R_AARCH64_TLSDESC) * ELF_RTYPE_CLASS_PLT) \ - | (((type) == R_AARCH64_COPY) * ELF_RTYPE_CLASS_COPY) \ - | (((type) == R_AARCH64_GLOB_DAT) * ELF_RTYPE_CLASS_EXTERN_PROTECTED_DATA)) + ((((type) == AARCH64_R(JUMP_SLOT) \ + || (type) == AARCH64_R(TLS_DTPMOD) \ + || (type) == AARCH64_R(TLS_DTPREL) \ + || (type) == AARCH64_R(TLS_TPREL) \ + || (type) == AARCH64_R(TLSDESC)) * ELF_RTYPE_CLASS_PLT) \ + | (((type) == AARCH64_R(COPY)) * ELF_RTYPE_CLASS_COPY) \ + | (((type) == AARCH64_R(GLOB_DAT)) * ELF_RTYPE_CLASS_EXTERN_PROTECTED_DATA)) -#define ELF_MACHINE_JMP_SLOT R_AARCH64_JUMP_SLOT +#define ELF_MACHINE_JMP_SLOT AARCH64_R(JUMP_SLOT) /* AArch64 uses RELA not REL */ #define ELF_MACHINE_NO_REL 1 #define ELF_MACHINE_NO_RELA 0 +#define DL_PLATFORM_INIT dl_platform_init () + +static inline void __attribute__ ((unused)) +dl_platform_init (void) +{ + if (GLRO(dl_platform) != NULL && *GLRO(dl_platform) == '\0') + /* Avoid an empty string which would disturb us. */ + GLRO(dl_platform) = NULL; + +#ifdef SHARED + /* init_cpu_features has been called early from __libc_start_main in + static executable. */ + init_cpu_features (&GLRO(dl_aarch64_cpu_features)); +#endif +} + + static inline ElfW(Addr) elf_machine_fixup_plt (struct link_map *map, lookup_t t, + const ElfW(Sym) *refsym, const ElfW(Sym) *sym, const ElfW(Rela) *reloc, ElfW(Addr) *reloc_addr, ElfW(Addr) value) @@ -237,9 +244,9 @@ elf_machine_rela (struct link_map *map, const ElfW(Rela) *reloc, void *const reloc_addr_arg, int skip_ifunc) { ElfW(Addr) *const reloc_addr = reloc_addr_arg; - const unsigned int r_type = ELF64_R_TYPE (reloc->r_info); + const unsigned int r_type = ELFW (R_TYPE) (reloc->r_info); - if (__builtin_expect (r_type == R_AARCH64_RELATIVE, 0)) + if (__builtin_expect (r_type == AARCH64_R(RELATIVE), 0)) *reloc_addr = map->l_addr + reloc->r_addend; else if (__builtin_expect (r_type == R_AARCH64_NONE, 0)) return; @@ -247,7 +254,7 @@ elf_machine_rela (struct link_map *map, const ElfW(Rela) *reloc, { const ElfW(Sym) *const refsym = sym; struct link_map *sym_map = RESOLVE_MAP (&sym, version, r_type); - ElfW(Addr) value = sym_map == NULL ? 
0 : sym_map->l_addr + sym->st_value; + ElfW(Addr) value = SYMBOL_ADDRESS (sym_map, sym, true); if (sym != NULL && __glibc_unlikely (ELFW(ST_TYPE) (sym->st_info) == STT_GNU_IFUNC) @@ -257,7 +264,7 @@ elf_machine_rela (struct link_map *map, const ElfW(Rela) *reloc, switch (r_type) { - case R_AARCH64_COPY: + case AARCH64_R(COPY): if (sym == NULL) break; @@ -272,18 +279,21 @@ elf_machine_rela (struct link_map *map, const ElfW(Rela) *reloc, RTLD_PROGNAME, strtab + refsym->st_name); } memcpy (reloc_addr_arg, (void *) value, - MIN (sym->st_size, refsym->st_size)); + sym->st_size < refsym->st_size + ? sym->st_size : refsym->st_size); break; - case R_AARCH64_RELATIVE: - case R_AARCH64_GLOB_DAT: - case R_AARCH64_JUMP_SLOT: - case R_AARCH64_ABS32: - case R_AARCH64_ABS64: + case AARCH64_R(RELATIVE): + case AARCH64_R(GLOB_DAT): + case AARCH64_R(JUMP_SLOT): + case AARCH64_R(ABS32): +#ifdef __LP64__ + case AARCH64_R(ABS64): +#endif *reloc_addr = value + reloc->r_addend; break; - case R_AARCH64_TLSDESC: + case AARCH64_R(TLSDESC): { struct tlsdesc volatile *td = (struct tlsdesc volatile *)reloc_addr; @@ -318,7 +328,7 @@ elf_machine_rela (struct link_map *map, const ElfW(Rela) *reloc, break; } - case R_AARCH64_TLS_DTPMOD: + case AARCH64_R(TLS_DTPMOD): #ifdef RTLD_BOOTSTRAP *reloc_addr = 1; #else @@ -329,12 +339,12 @@ elf_machine_rela (struct link_map *map, const ElfW(Rela) *reloc, #endif break; - case R_AARCH64_TLS_DTPREL: + case AARCH64_R(TLS_DTPREL): if (sym) *reloc_addr = sym->st_value + reloc->r_addend; break; - case R_AARCH64_TLS_TPREL: + case AARCH64_R(TLS_TPREL): if (sym) { CHECK_STATIC_TLS (map, sym_map); @@ -343,7 +353,7 @@ elf_machine_rela (struct link_map *map, const ElfW(Rela) *reloc, } break; - case R_AARCH64_IRELATIVE: + case AARCH64_R(IRELATIVE): value = map->l_addr + reloc->r_addend; value = elf_ifunc_invoke (value); *reloc_addr = value; @@ -374,25 +384,34 @@ elf_machine_lazy_rel (struct link_map *map, int skip_ifunc) { ElfW(Addr) *const reloc_addr = (void *) (l_addr + reloc->r_offset); - const unsigned int r_type = ELF64_R_TYPE (reloc->r_info); + const unsigned int r_type = ELFW (R_TYPE) (reloc->r_info); /* Check for unexpected PLT reloc type. */ - if (__builtin_expect (r_type == R_AARCH64_JUMP_SLOT, 1)) + if (__builtin_expect (r_type == AARCH64_R(JUMP_SLOT), 1)) { if (__builtin_expect (map->l_mach.plt, 0) == 0) *reloc_addr += l_addr; else *reloc_addr = map->l_mach.plt; } - else if (__builtin_expect (r_type == R_AARCH64_TLSDESC, 1)) + else if (__builtin_expect (r_type == AARCH64_R(TLSDESC), 1)) { - struct tlsdesc volatile *td = - (struct tlsdesc volatile *)reloc_addr; + const Elf_Symndx symndx = ELFW (R_SYM) (reloc->r_info); + const ElfW (Sym) *symtab = (const void *)D_PTR (map, l_info[DT_SYMTAB]); + const ElfW (Sym) *sym = &symtab[symndx]; + const struct r_found_version *version = NULL; + + if (map->l_info[VERSYMIDX (DT_VERSYM)] != NULL) + { + const ElfW (Half) *vernum = + (const void *)D_PTR (map, l_info[VERSYMIDX (DT_VERSYM)]); + version = &map->l_versions[vernum[symndx] & 0x7fff]; + } - td->arg = (void*)reloc; - td->entry = (void*)(D_PTR (map, l_info[ADDRIDX (DT_TLSDESC_PLT)]) - + map->l_addr); + /* Always initialize TLS descriptors completely, because lazy + initialization requires synchronization at every TLS access. 
*/ + elf_machine_rela (map, reloc, sym, version, reloc_addr, skip_ifunc); } - else if (__glibc_unlikely (r_type == R_AARCH64_IRELATIVE)) + else if (__glibc_unlikely (r_type == AARCH64_R(IRELATIVE))) { ElfW(Addr) value = map->l_addr + reloc->r_addend; if (__glibc_likely (!skip_ifunc)) diff --git a/sysdeps/aarch64/dl-sysdep.h b/sysdeps/aarch64/dl-sysdep.h index 9c05d4b0dc..a0e6a2eed1 100644 --- a/sysdeps/aarch64/dl-sysdep.h +++ b/sysdeps/aarch64/dl-sysdep.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2002-2016 Free Software Foundation, Inc. +/* Copyright (C) 2002-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/dl-tls.h b/sysdeps/aarch64/dl-tls.h index 71265c5341..153fded939 100644 --- a/sysdeps/aarch64/dl-tls.h +++ b/sysdeps/aarch64/dl-tls.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2005-2016 Free Software Foundation, Inc. +/* Copyright (C) 2005-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -25,6 +25,3 @@ typedef struct extern void *__tls_get_addr (tls_index *ti); - -/* Value used for dtv entries for which the allocation is delayed. */ -#define TLS_DTV_UNALLOCATED ((void *) -1l) diff --git a/sysdeps/aarch64/dl-tlsdesc.S b/sysdeps/aarch64/dl-tlsdesc.S index 05be370abb..43a62ef307 100644 --- a/sysdeps/aarch64/dl-tlsdesc.S +++ b/sysdeps/aarch64/dl-tlsdesc.S @@ -1,6 +1,6 @@ /* Thread-local storage handling in the ELF dynamic linker. AArch64 version. - Copyright (C) 2011-2016 Free Software Foundation, Inc. + Copyright (C) 2011-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -74,34 +74,12 @@ cfi_startproc .align 2 _dl_tlsdesc_return: - ldr x0, [x0, #8] + DELOUSE (0) + ldr PTR_REG (0), [x0, #PTR_SIZE] RET cfi_endproc .size _dl_tlsdesc_return, .-_dl_tlsdesc_return - /* Same as _dl_tlsdesc_return but with synchronization for - lazy relocation. - Prototype: - _dl_tlsdesc_return_lazy (tlsdesc *) ; - */ - .hidden _dl_tlsdesc_return_lazy - .global _dl_tlsdesc_return_lazy - .type _dl_tlsdesc_return_lazy,%function - cfi_startproc - .align 2 -_dl_tlsdesc_return_lazy: - /* The ldar here happens after the load from [x0] at the call site - (that is generated by the compiler as part of the TLS access ABI), - so it reads the same value (this function is the final value of - td->entry) and thus it synchronizes with the release store to - td->entry in _dl_tlsdesc_resolve_rela_fixup ensuring that the load - from [x0,#8] here happens after the initialization of td->arg. */ - ldar xzr, [x0] - ldr x0, [x0, #8] - RET - cfi_endproc - .size _dl_tlsdesc_return_lazy, .-_dl_tlsdesc_return_lazy - /* Handler for undefined weak TLS symbols. Prototype: _dl_tlsdesc_undefweak (tlsdesc *); @@ -119,16 +97,10 @@ _dl_tlsdesc_return_lazy: _dl_tlsdesc_undefweak: str x1, [sp, #-16]! cfi_adjust_cfa_offset (16) - /* The ldar here happens after the load from [x0] at the call site - (that is generated by the compiler as part of the TLS access ABI), - so it reads the same value (this function is the final value of - td->entry) and thus it synchronizes with the release store to - td->entry in _dl_tlsdesc_resolve_rela_fixup ensuring that the load - from [x0,#8] here happens after the initialization of td->arg. 
*/ - ldar xzr, [x0] - ldr x0, [x0, #8] + DELOUSE (0) + ldr PTR_REG (0), [x0, #PTR_SIZE] mrs x1, tpidr_el0 - sub x0, x0, x1 + sub PTR_REG (0), PTR_REG (0), PTR_REG (1) ldr x1, [sp], #16 cfi_adjust_cfa_offset (-16) RET @@ -151,7 +123,7 @@ _dl_tlsdesc_undefweak: _dl_tlsdesc_dynamic (struct tlsdesc *tdp) { struct tlsdesc_dynamic_arg *td = tdp->arg; - dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET); + dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + TCBHEAD_DTV); if (__builtin_expect (td->gen_count <= dtv[0].counter && (dtv[td->tlsinfo.ti_module].pointer.val != TLS_DTV_UNALLOCATED), @@ -170,46 +142,37 @@ _dl_tlsdesc_undefweak: cfi_startproc .align 2 _dl_tlsdesc_dynamic: -# define NSAVEXREGPAIRS 2 - stp x29, x30, [sp,#-(32+16*NSAVEXREGPAIRS)]! - cfi_adjust_cfa_offset (32+16*NSAVEXREGPAIRS) - mov x29, sp + DELOUSE (0) /* Save just enough registers to support fast path, if we fall into slow path we will save additional registers. */ - - stp x1, x2, [sp, #32+16*0] - stp x3, x4, [sp, #32+16*1] + stp x1, x2, [sp, #-32]! + stp x3, x4, [sp, #16] + cfi_adjust_cfa_offset (32) + cfi_rel_offset (x1, 0) + cfi_rel_offset (x2, 8) + cfi_rel_offset (x3, 16) + cfi_rel_offset (x4, 24) mrs x4, tpidr_el0 - /* The ldar here happens after the load from [x0] at the call site - (that is generated by the compiler as part of the TLS access ABI), - so it reads the same value (this function is the final value of - td->entry) and thus it synchronizes with the release store to - td->entry in _dl_tlsdesc_resolve_rela_fixup ensuring that the load - from [x0,#8] here happens after the initialization of td->arg. */ - ldar xzr, [x0] - ldr x1, [x0,#8] - ldr x0, [x4] - ldr x3, [x1,#16] - ldr x2, [x0] - cmp x3, x2 + ldr PTR_REG (1), [x0,#TLSDESC_ARG] + ldr PTR_REG (0), [x4,#TCBHEAD_DTV] + ldr PTR_REG (3), [x1,#TLSDESC_GEN_COUNT] + ldr PTR_REG (2), [x0,#DTV_COUNTER] + cmp PTR_REG (3), PTR_REG (2) b.hi 2f - ldr x2, [x1] - add x0, x0, x2, lsl #4 - ldr x0, [x0] - cmn x0, #0x1 + /* Load r2 = td->tlsinfo.ti_module and r3 = td->tlsinfo.ti_offset. */ + ldp PTR_REG (2), PTR_REG (3), [x1,#TLSDESC_MODID] + add PTR_REG (0), PTR_REG (0), PTR_REG (2), lsl #(PTR_LOG_SIZE + 1) + ldr PTR_REG (0), [x0] /* Load val member of DTV entry. */ + cmp PTR_REG (0), #TLS_DTV_UNALLOCATED b.eq 2f - ldr x1, [x1,#8] - add x0, x0, x1 - sub x0, x0, x4 + sub PTR_REG (3), PTR_REG (3), PTR_REG (4) + add PTR_REG (0), PTR_REG (0), PTR_REG (3) 1: - ldp x1, x2, [sp, #32+16*0] - ldp x3, x4, [sp, #32+16*1] - - ldp x29, x30, [sp], #(32+16*NSAVEXREGPAIRS) - cfi_adjust_cfa_offset (-32-16*NSAVEXREGPAIRS) -# undef NSAVEXREGPAIRS + ldp x3, x4, [sp, #16] + ldp x1, x2, [sp], #32 + cfi_adjust_cfa_offset (-32) RET 2: /* This is the slow path. We need to call __tls_get_addr() which @@ -217,15 +180,33 @@ _dl_tlsdesc_dynamic: callee will trash. */ /* Save the remaining registers that we must treat as caller save. */ -# define NSAVEXREGPAIRS 7 - stp x5, x6, [sp, #-16*NSAVEXREGPAIRS]! +# define NSAVEXREGPAIRS 8 + stp x29, x30, [sp,#-16*NSAVEXREGPAIRS]! 
cfi_adjust_cfa_offset (16*NSAVEXREGPAIRS) - stp x7, x8, [sp, #16*1] - stp x9, x10, [sp, #16*2] - stp x11, x12, [sp, #16*3] - stp x13, x14, [sp, #16*4] - stp x15, x16, [sp, #16*5] - stp x17, x18, [sp, #16*6] + cfi_rel_offset (x29, 0) + cfi_rel_offset (x30, 8) + mov x29, sp + stp x5, x6, [sp, #16*1] + stp x7, x8, [sp, #16*2] + stp x9, x10, [sp, #16*3] + stp x11, x12, [sp, #16*4] + stp x13, x14, [sp, #16*5] + stp x15, x16, [sp, #16*6] + stp x17, x18, [sp, #16*7] + cfi_rel_offset (x5, 16*1) + cfi_rel_offset (x6, 16*1+8) + cfi_rel_offset (x7, 16*2) + cfi_rel_offset (x8, 16*2+8) + cfi_rel_offset (x9, 16*3) + cfi_rel_offset (x10, 16*3+8) + cfi_rel_offset (x11, 16*4) + cfi_rel_offset (x12, 16*4+8) + cfi_rel_offset (x13, 16*5) + cfi_rel_offset (x14, 16*5+8) + cfi_rel_offset (x15, 16*6) + cfi_rel_offset (x16, 16*6+8) + cfi_rel_offset (x17, 16*7) + cfi_rel_offset (x18, 16*7+8) SAVE_Q_REGISTERS @@ -233,134 +214,24 @@ _dl_tlsdesc_dynamic: bl __tls_get_addr mrs x1, tpidr_el0 - sub x0, x0, x1 + sub PTR_REG (0), PTR_REG (0), PTR_REG (1) RESTORE_Q_REGISTERS - ldp x7, x8, [sp, #16*1] - ldp x9, x10, [sp, #16*2] - ldp x11, x12, [sp, #16*3] - ldp x13, x14, [sp, #16*4] - ldp x15, x16, [sp, #16*5] - ldp x17, x18, [sp, #16*6] - ldp x5, x6, [sp], #16*NSAVEXREGPAIRS + ldp x5, x6, [sp, #16*1] + ldp x7, x8, [sp, #16*2] + ldp x9, x10, [sp, #16*3] + ldp x11, x12, [sp, #16*4] + ldp x13, x14, [sp, #16*5] + ldp x15, x16, [sp, #16*6] + ldp x17, x18, [sp, #16*7] + + ldp x29, x30, [sp], #16*NSAVEXREGPAIRS cfi_adjust_cfa_offset (-16*NSAVEXREGPAIRS) + cfi_restore (x29) + cfi_restore (x30) b 1b cfi_endproc .size _dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic # undef NSAVEXREGPAIRS #endif - - /* This function is a wrapper for a lazy resolver for TLS_DESC - RELA relocations. - When the actual resolver returns, it will have adjusted the - TLS descriptor such that we can tail-call it for it to return - the TP offset of the symbol. */ - - .hidden _dl_tlsdesc_resolve_rela - .global _dl_tlsdesc_resolve_rela - .type _dl_tlsdesc_resolve_rela,%function - cfi_startproc - .align 2 -_dl_tlsdesc_resolve_rela: -#define NSAVEXREGPAIRS 9 - stp x29, x30, [sp, #-(32+16*NSAVEXREGPAIRS)]! - cfi_adjust_cfa_offset (32+16*NSAVEXREGPAIRS) - mov x29, sp - stp x1, x4, [sp, #32+16*0] - stp x5, x6, [sp, #32+16*1] - stp x7, x8, [sp, #32+16*2] - stp x9, x10, [sp, #32+16*3] - stp x11, x12, [sp, #32+16*4] - stp x13, x14, [sp, #32+16*5] - stp x15, x16, [sp, #32+16*6] - stp x17, x18, [sp, #32+16*7] - str x0, [sp, #32+16*8] - - SAVE_Q_REGISTERS - - ldr x1, [x3, #8] - bl _dl_tlsdesc_resolve_rela_fixup - - RESTORE_Q_REGISTERS - - ldr x0, [sp, #32+16*8] - ldr x1, [x0] - blr x1 - - ldp x1, x4, [sp, #32+16*0] - ldp x5, x6, [sp, #32+16*1] - ldp x7, x8, [sp, #32+16*2] - ldp x9, x10, [sp, #32+16*3] - ldp x11, x12, [sp, #32+16*4] - ldp x13, x14, [sp, #32+16*5] - ldp x15, x16, [sp, #32+16*6] - ldp x17, x18, [sp, #32+16*7] - ldp x29, x30, [sp], #(32+16*NSAVEXREGPAIRS) - cfi_adjust_cfa_offset (-32-16*NSAVEXREGPAIRS) - ldp x2, x3, [sp], #16 - cfi_adjust_cfa_offset (-16) - RET -#undef NSAVEXREGPAIRS - cfi_endproc - .size _dl_tlsdesc_resolve_rela, .-_dl_tlsdesc_resolve_rela - - /* This function is a placeholder for lazy resolving of TLS - relocations. Once some thread starts resolving a TLS - relocation, it sets up the TLS descriptor to use this - resolver, such that other threads that would attempt to - resolve it concurrently may skip the call to the original lazy - resolver and go straight to a condition wait. 
- - When the actual resolver returns, it will have adjusted the - TLS descriptor such that we can tail-call it for it to return - the TP offset of the symbol. */ - - .hidden _dl_tlsdesc_resolve_hold - .global _dl_tlsdesc_resolve_hold - .type _dl_tlsdesc_resolve_hold,%function - cfi_startproc - .align 2 -_dl_tlsdesc_resolve_hold: -#define NSAVEXREGPAIRS 10 -1: - stp x29, x30, [sp, #-(32+16*NSAVEXREGPAIRS)]! - cfi_adjust_cfa_offset (32+16*NSAVEXREGPAIRS) - mov x29, sp - stp x1, x2, [sp, #32+16*0] - stp x3, x4, [sp, #32+16*1] - stp x5, x6, [sp, #32+16*2] - stp x7, x8, [sp, #32+16*3] - stp x9, x10, [sp, #32+16*4] - stp x11, x12, [sp, #32+16*5] - stp x13, x14, [sp, #32+16*6] - stp x15, x16, [sp, #32+16*7] - stp x17, x18, [sp, #32+16*8] - str x0, [sp, #32+16*9] - - SAVE_Q_REGISTERS - - adr x1, 1b - bl _dl_tlsdesc_resolve_hold_fixup - - RESTORE_Q_REGISTERS - - ldr x0, [sp, #32+16*9] - ldr x1, [x0] - blr x1 - - ldp x1, x2, [sp, #32+16*0] - ldp x3, x4, [sp, #32+16*1] - ldp x5, x6, [sp, #32+16*2] - ldp x7, x8, [sp, #32+16*3] - ldp x9, x10, [sp, #32+16*4] - ldp x11, x12, [sp, #32+16*5] - ldp x13, x14, [sp, #32+16*6] - ldp x15, x16, [sp, #32+16*7] - ldp x17, x18, [sp, #32+16*8] - ldp x29, x30, [sp], #(32+16*NSAVEXREGPAIRS) - cfi_adjust_cfa_offset (-32-16*NSAVEXREGPAIRS) - RET - cfi_endproc - .size _dl_tlsdesc_resolve_hold, .-_dl_tlsdesc_resolve_hold -#undef NSAVEXREGPAIRS diff --git a/sysdeps/aarch64/dl-tlsdesc.h b/sysdeps/aarch64/dl-tlsdesc.h index 8532469fd9..6dc80ad66c 100644 --- a/sysdeps/aarch64/dl-tlsdesc.h +++ b/sysdeps/aarch64/dl-tlsdesc.h @@ -1,6 +1,6 @@ /* Thread-local storage descriptor handling in the ELF dynamic linker. AArch64 version. - Copyright (C) 2011-2016 Free Software Foundation, Inc. + Copyright (C) 2011-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -46,20 +46,10 @@ extern ptrdiff_t attribute_hidden _dl_tlsdesc_return (struct tlsdesc *); extern ptrdiff_t attribute_hidden -_dl_tlsdesc_return_lazy (struct tlsdesc *); - -extern ptrdiff_t attribute_hidden _dl_tlsdesc_undefweak (struct tlsdesc *); -extern ptrdiff_t attribute_hidden -_dl_tlsdesc_resolve_rela (struct tlsdesc *); - -extern ptrdiff_t attribute_hidden -_dl_tlsdesc_resolve_hold (struct tlsdesc *); - # ifdef SHARED -extern void *internal_function _dl_make_tlsdesc_dynamic (struct link_map *, - size_t); +extern void *_dl_make_tlsdesc_dynamic (struct link_map *, size_t); extern ptrdiff_t attribute_hidden _dl_tlsdesc_dynamic (struct tlsdesc *); diff --git a/sysdeps/aarch64/dl-trampoline.S b/sysdeps/aarch64/dl-trampoline.S index 947a51579f..a86d0722d4 100644 --- a/sysdeps/aarch64/dl-trampoline.S +++ b/sysdeps/aarch64/dl-trampoline.S @@ -1,4 +1,4 @@ -/* Copyright (C) 2005-2016 Free Software Foundation, Inc. +/* Copyright (C) 2005-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -22,9 +22,13 @@ #include "dl-link.h" #define ip0 x16 +#define ip0l PTR_REG (16) #define ip1 x17 #define lr x30 +/* RELA relocatons are 3 pointers */ +#define RELA_SIZE (PTR_SIZE * 3) + .text .globl _dl_runtime_resolve .type _dl_runtime_resolve, #function @@ -79,7 +83,7 @@ _dl_runtime_resolve: cfi_rel_offset (q1, 80+7*16) /* Get pointer to linker struct. */ - ldr x0, [ip0, #-8] + ldr PTR_REG (0), [ip0, #-PTR_SIZE] /* Prepare to call _dl_fixup(). */ ldr x1, [sp, 80+8*16] /* Recover &PLTGOT[n] */ @@ -87,7 +91,7 @@ _dl_runtime_resolve: sub x1, x1, ip0 add x1, x1, x1, lsl #1 lsl x1, x1, #3 - sub x1, x1, #192 + sub x1, x1, #(RELA_SIZE<<3) lsr x1, x1, #3 /* Call fixup routine. 
*/ @@ -191,7 +195,7 @@ _dl_runtime_profile: stp x0, x1, [x29, #OFFSET_RG + DL_OFFSET_RG_SP] /* Get pointer to linker struct. */ - ldr x0, [ip0, #-8] + ldr PTR_REG (0), [ip0, #-PTR_SIZE] /* Prepare to call _dl_profile_fixup(). */ ldr x1, [x29, OFFSET_PLTGOTN] /* Recover &PLTGOT[n] */ @@ -199,7 +203,7 @@ _dl_runtime_profile: sub x1, x1, ip0 add x1, x1, x1, lsl #1 lsl x1, x1, #3 - sub x1, x1, #192 + sub x1, x1, #(RELA_SIZE<<3) lsr x1, x1, #3 stp x0, x1, [x29, #OFFSET_SAVED_CALL_X0] @@ -210,8 +214,8 @@ _dl_runtime_profile: add x4, x29, #OFFSET_FS /* address of framesize */ bl _dl_profile_fixup - ldr ip0, [x29, #OFFSET_FS] /* framesize == 0 */ - cmp ip0, #0 + ldr ip0l, [x29, #OFFSET_FS] /* framesize == 0 */ + cmp ip0l, #0 bge 1f cfi_remember_state @@ -243,7 +247,7 @@ _dl_runtime_profile: 1: /* The new frame size is in ip0. */ - sub x1, x29, ip0 + sub PTR_REG (1), PTR_REG (29), ip0l and sp, x1, #0xfffffffffffffff0 str x0, [x29, #OFFSET_T1] diff --git a/sysdeps/aarch64/dl-tunables.list b/sysdeps/aarch64/dl-tunables.list new file mode 100644 index 0000000000..f6a88168cc --- /dev/null +++ b/sysdeps/aarch64/dl-tunables.list @@ -0,0 +1,25 @@ +# aarch64 specific tunables. +# Copyright (C) 2017-2018 Free Software Foundation, Inc. +# This file is part of the GNU C Library. + +# The GNU C Library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. + +# The GNU C Library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. + +# You should have received a copy of the GNU Lesser General Public +# License along with the GNU C Library; if not, see +# <http://www.gnu.org/licenses/>. + +glibc { + tune { + cpu { + type: STRING + } + } +} diff --git a/sysdeps/aarch64/soft-fp/e_sqrtl.c b/sysdeps/aarch64/e_sqrtl.c index 5d3919b6a2..1dc1cbf003 100644 --- a/sysdeps/aarch64/soft-fp/e_sqrtl.c +++ b/sysdeps/aarch64/e_sqrtl.c @@ -1,5 +1,5 @@ /* long double square root in software floating-point emulation. - Copyright (C) 1997-2016 Free Software Foundation, Inc. + Copyright (C) 1997-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Richard Henderson (rth@cygnus.com) and Jakub Jelinek (jj@ultra.linux.cz). diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile new file mode 100644 index 0000000000..4a182bd6d6 --- /dev/null +++ b/sysdeps/aarch64/fpu/Makefile @@ -0,0 +1,14 @@ +ifeq ($(subdir),math) +CFLAGS-e_sqrtf.c += -fno-math-errno +CFLAGS-e_sqrt.c += -fno-math-errno +CFLAGS-s_lroundf.c += -fno-math-errno +CFLAGS-s_lround.c += -fno-math-errno +CFLAGS-s_llroundf.c += -fno-math-errno +CFLAGS-s_llround.c += -fno-math-errno +# GCC 4.9 and 5 requires the flag to correct emits a f{max,min}nm +# for a __builtin_{fmax,fmin}{f}. +CFLAGS-s_fmax.c += -ffinite-math-only +CFLAGS-s_fmaxf.c += -ffinite-math-only +CFLAGS-s_fmin.c += -ffinite-math-only +CFLAGS-s_fminf.c += -ffinite-math-only +endif diff --git a/sysdeps/aarch64/fpu/e_sqrt.c b/sysdeps/aarch64/fpu/e_sqrt.c index 2759fb186d..32f1e26803 100644 --- a/sysdeps/aarch64/fpu/e_sqrt.c +++ b/sysdeps/aarch64/fpu/e_sqrt.c @@ -1,5 +1,5 @@ /* Square root of floating point number. - Copyright (C) 2015-2016 Free Software Foundation, Inc. 
+ Copyright (C) 2015-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -21,8 +21,6 @@ double __ieee754_sqrt (double d) { - double res; - asm ("fsqrt %d0, %d1" : "=w" (res) : "w" (d)); - return res; + return __builtin_sqrt (d); } strong_alias (__ieee754_sqrt, __sqrt_finite) diff --git a/sysdeps/aarch64/fpu/e_sqrtf.c b/sysdeps/aarch64/fpu/e_sqrtf.c index e7ce19a38c..87f557cb10 100644 --- a/sysdeps/aarch64/fpu/e_sqrtf.c +++ b/sysdeps/aarch64/fpu/e_sqrtf.c @@ -1,5 +1,5 @@ /* Single-precision floating point square root. - Copyright (C) 2015-2016 Free Software Foundation, Inc. + Copyright (C) 2015-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -21,8 +21,6 @@ float __ieee754_sqrtf (float s) { - float res; - asm ("fsqrt %s0, %s1" : "=w" (res) : "w" (s)); - return res; + return __builtin_sqrtf (s); } strong_alias (__ieee754_sqrtf, __sqrtf_finite) diff --git a/sysdeps/aarch64/fpu/fclrexcpt.c b/sysdeps/aarch64/fpu/fclrexcpt.c index 8c0f09a82d..9328518f1e 100644 --- a/sysdeps/aarch64/fpu/fclrexcpt.c +++ b/sysdeps/aarch64/fpu/fclrexcpt.c @@ -1,4 +1,4 @@ -/* Copyright (C) 1997-2016 Free Software Foundation, Inc. +/* Copyright (C) 1997-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/fpu/fedisblxcpt.c b/sysdeps/aarch64/fpu/fedisblxcpt.c index bf9592847f..125751f8b4 100644 --- a/sysdeps/aarch64/fpu/fedisblxcpt.c +++ b/sysdeps/aarch64/fpu/fedisblxcpt.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2001-2016 Free Software Foundation, Inc. +/* Copyright (C) 2001-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/fpu/feenablxcpt.c b/sysdeps/aarch64/fpu/feenablxcpt.c index dff1050e66..6c72b42fc9 100644 --- a/sysdeps/aarch64/fpu/feenablxcpt.c +++ b/sysdeps/aarch64/fpu/feenablxcpt.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2001-2016 Free Software Foundation, Inc. +/* Copyright (C) 2001-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/fpu/fegetenv.c b/sysdeps/aarch64/fpu/fegetenv.c index c10975f6a2..1559e90977 100644 --- a/sysdeps/aarch64/fpu/fegetenv.c +++ b/sysdeps/aarch64/fpu/fegetenv.c @@ -1,4 +1,4 @@ -/* Copyright (C) 1997-2016 Free Software Foundation, Inc. +/* Copyright (C) 1997-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/fpu/fegetexcept.c b/sysdeps/aarch64/fpu/fegetexcept.c index 4a4cccd5ff..1f994ead0d 100644 --- a/sysdeps/aarch64/fpu/fegetexcept.c +++ b/sysdeps/aarch64/fpu/fegetexcept.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2001-2016 Free Software Foundation, Inc. +/* Copyright (C) 2001-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/fpu/s_frintf.c b/sysdeps/aarch64/fpu/fegetmode.c index a284eafa4c..65dcb22d89 100644 --- a/sysdeps/aarch64/fpu/s_frintf.c +++ b/sysdeps/aarch64/fpu/fegetmode.c @@ -1,11 +1,11 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. - +/* Store current floating-point control modes. AArch64 version. + Copyright (C) 2016-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. 
The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation; either version 2.1 of the - License, or (at your option) any later version. + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. The GNU C Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -16,9 +16,12 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#ifndef FUNC -# error FUNC not defined -#endif -#define TYPE float -#define REGS "s" -#include <s_frint.c> +#include <fenv.h> +#include <fpu_control.h> + +int +fegetmode (femode_t *modep) +{ + _FPU_GETCW (*modep); + return 0; +} diff --git a/sysdeps/aarch64/fpu/fegetround.c b/sysdeps/aarch64/fpu/fegetround.c index 94ad920543..10082eed90 100644 --- a/sysdeps/aarch64/fpu/fegetround.c +++ b/sysdeps/aarch64/fpu/fegetround.c @@ -1,4 +1,4 @@ -/* Copyright (C) 1997-2016 Free Software Foundation, Inc. +/* Copyright (C) 1997-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/fpu/feholdexcpt.c b/sysdeps/aarch64/fpu/feholdexcpt.c index b9cf0ebd6e..4f7a58e379 100644 --- a/sysdeps/aarch64/fpu/feholdexcpt.c +++ b/sysdeps/aarch64/fpu/feholdexcpt.c @@ -1,4 +1,4 @@ -/* Copyright (C) 1997-2016 Free Software Foundation, Inc. +/* Copyright (C) 1997-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/fpu/fesetenv.c b/sysdeps/aarch64/fpu/fesetenv.c index ca87bb60d6..2a63cf0c96 100644 --- a/sysdeps/aarch64/fpu/fesetenv.c +++ b/sysdeps/aarch64/fpu/fesetenv.c @@ -1,4 +1,4 @@ -/* Copyright (C) 1997-2016 Free Software Foundation, Inc. +/* Copyright (C) 1997-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/fpu/fesetexcept.c b/sysdeps/aarch64/fpu/fesetexcept.c new file mode 100644 index 0000000000..c97a38b483 --- /dev/null +++ b/sysdeps/aarch64/fpu/fesetexcept.c @@ -0,0 +1,34 @@ +/* Set given exception flags. AArch64 version. + Copyright (C) 2016-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <fenv.h> +#include <fpu_control.h> + +int +fesetexcept (int excepts) +{ + fpu_fpsr_t fpsr; + fpu_fpsr_t fpsr_new; + + _FPU_GETFPSR (fpsr); + fpsr_new = fpsr | (excepts & FE_ALL_EXCEPT); + if (fpsr != fpsr_new) + _FPU_SETFPSR (fpsr_new); + + return 0; +} diff --git a/sysdeps/aarch64/fpu/fesetmode.c b/sysdeps/aarch64/fpu/fesetmode.c new file mode 100644 index 0000000000..6f6b8c197d --- /dev/null +++ b/sysdeps/aarch64/fpu/fesetmode.c @@ -0,0 +1,34 @@ +/* Install given floating-point control modes. AArch64 version. + Copyright (C) 2016-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <fenv.h> +#include <fpu_control.h> + +int +fesetmode (const femode_t *modep) +{ + fpu_control_t fpcr, fpcr_new; + _FPU_GETCW (fpcr); + if (modep == FE_DFL_MODE) + fpcr_new = (fpcr & _FPU_RESERVED) | _FPU_DEFAULT; + else + fpcr_new = *modep; + if (fpcr != fpcr_new) + _FPU_SETCW (fpcr_new); + return 0; +} diff --git a/sysdeps/aarch64/fpu/fesetround.c b/sysdeps/aarch64/fpu/fesetround.c index 645efe7804..4c2ed3d7ac 100644 --- a/sysdeps/aarch64/fpu/fesetround.c +++ b/sysdeps/aarch64/fpu/fesetround.c @@ -1,4 +1,4 @@ -/* Copyright (C) 1997-2016 Free Software Foundation, Inc. +/* Copyright (C) 1997-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/fpu/feupdateenv.c b/sysdeps/aarch64/fpu/feupdateenv.c index 53d2b36cfa..2aaf5d13cd 100644 --- a/sysdeps/aarch64/fpu/feupdateenv.c +++ b/sysdeps/aarch64/fpu/feupdateenv.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2016 Free Software Foundation, Inc. +/* Copyright (C) 2009-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/fpu/fgetexcptflg.c b/sysdeps/aarch64/fpu/fgetexcptflg.c index 8089a2c9fb..cfc7113e04 100644 --- a/sysdeps/aarch64/fpu/fgetexcptflg.c +++ b/sysdeps/aarch64/fpu/fgetexcptflg.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2001-2016 Free Software Foundation, Inc. +/* Copyright (C) 2001-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/fpu/fpu_control.h b/sysdeps/aarch64/fpu/fpu_control.h index 7e74445872..2d36170c05 100644 --- a/sysdeps/aarch64/fpu/fpu_control.h +++ b/sysdeps/aarch64/fpu/fpu_control.h @@ -1,4 +1,4 @@ -/* Copyright (C) 1996-2016 Free Software Foundation, Inc. +/* Copyright (C) 1996-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -19,19 +19,28 @@ #ifndef _AARCH64_FPU_CONTROL_H #define _AARCH64_FPU_CONTROL_H +#include <features.h> + /* Macros for accessing the FPCR and FPSR. 
*/ -#define _FPU_GETCW(fpcr) \ +#if __GNUC_PREREQ (6,0) +# define _FPU_GETCW(fpcr) (fpcr = __builtin_aarch64_get_fpcr ()) +# define _FPU_SETCW(fpcr) __builtin_aarch64_set_fpcr (fpcr) +# define _FPU_GETFPSR(fpsr) (fpsr = __builtin_aarch64_get_fpsr ()) +# define _FPU_SETFPSR(fpsr) __builtin_aarch64_set_fpsr (fpsr) +#else +# define _FPU_GETCW(fpcr) \ __asm__ __volatile__ ("mrs %0, fpcr" : "=r" (fpcr)) -#define _FPU_SETCW(fpcr) \ +# define _FPU_SETCW(fpcr) \ __asm__ __volatile__ ("msr fpcr, %0" : : "r" (fpcr)) -#define _FPU_GETFPSR(fpsr) \ +# define _FPU_GETFPSR(fpsr) \ __asm__ __volatile__ ("mrs %0, fpsr" : "=r" (fpsr)) -#define _FPU_SETFPSR(fpsr) \ +# define _FPU_SETFPSR(fpsr) \ __asm__ __volatile__ ("msr fpsr, %0" : : "r" (fpsr)) +#endif /* Reserved bits should be preserved when modifying register contents. These two masks indicate which bits in each of FPCR and diff --git a/sysdeps/aarch64/fpu/fraiseexcpt.c b/sysdeps/aarch64/fpu/fraiseexcpt.c index 4e00fbba42..bc26156323 100644 --- a/sysdeps/aarch64/fpu/fraiseexcpt.c +++ b/sysdeps/aarch64/fpu/fraiseexcpt.c @@ -1,4 +1,4 @@ -/* Copyright (C) 1997-2016 Free Software Foundation, Inc. +/* Copyright (C) 1997-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/fpu/fsetexcptflg.c b/sysdeps/aarch64/fpu/fsetexcptflg.c index 890f1d655c..45d96af7e9 100644 --- a/sysdeps/aarch64/fpu/fsetexcptflg.c +++ b/sysdeps/aarch64/fpu/fsetexcptflg.c @@ -1,4 +1,4 @@ -/* Copyright (C) 1997-2016 Free Software Foundation, Inc. +/* Copyright (C) 1997-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/fpu/ftestexcept.c b/sysdeps/aarch64/fpu/ftestexcept.c index cedbce456f..84223259c8 100644 --- a/sysdeps/aarch64/fpu/ftestexcept.c +++ b/sysdeps/aarch64/fpu/ftestexcept.c @@ -1,4 +1,4 @@ -/* Copyright (C) 1997-2016 Free Software Foundation, Inc. +/* Copyright (C) 1997-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/fpu/get-rounding-mode.h b/sysdeps/aarch64/fpu/get-rounding-mode.h index 0b23363025..ab29c5b908 100644 --- a/sysdeps/aarch64/fpu/get-rounding-mode.h +++ b/sysdeps/aarch64/fpu/get-rounding-mode.h @@ -1,6 +1,6 @@ /* Determine floating-point rounding mode within libc. AArch64 version. - Copyright (C) 2012-2016 Free Software Foundation, Inc. + Copyright (C) 2012-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/bits/mathdef.h b/sysdeps/aarch64/fpu/math-barriers.h index db992fccc1..7db937bbf4 100644 --- a/sysdeps/aarch64/bits/mathdef.h +++ b/sysdeps/aarch64/fpu/math-barriers.h @@ -1,5 +1,5 @@ -/* Copyright (C) 1999-2016 Free Software Foundation, Inc. - +/* Control when floating-point expressions are evaluated. AArch64 version. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -16,24 +16,12 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#if !defined _MATH_H && !defined _COMPLEX_H -# error "Never use <bits/mathdef.h> directly; include <math.h> instead" -#endif - -#if defined __USE_ISOC99 && defined _MATH_H && !defined _MATH_H_MATHDEF -# define _MATH_H_MATHDEF 1 +#ifndef AARCH64_MATH_BARRIERS_H +#define AARCH64_MATH_BARRIERS_H 1 -/* GCC does not promote `float' values to `double'. */ -typedef float float_t; /* `float' expressions are evaluated as - `float'. 
*/ -typedef double double_t; /* `double' expressions are evaluated as - `double'. */ +#define math_opt_barrier(x) \ + ({ __typeof (x) __x = (x); __asm ("" : "+w" (__x)); __x; }) +#define math_force_eval(x) \ + ({ __typeof (x) __x = (x); __asm __volatile__ ("" : : "w" (__x)); }) -/* The values returned by `ilogb' for 0 and NaN respectively. */ -# define FP_ILOGB0 (-2147483647) -# define FP_ILOGBNAN (2147483647) - -# define FP_FAST_FMA 1 -# define FP_FAST_FMAF 1 - -#endif /* ISO C99 */ +#endif diff --git a/sysdeps/aarch64/fpu/math_private.h b/sysdeps/aarch64/fpu/math_private.h index fd8eb1fb56..fcd02c0654 100644 --- a/sysdeps/aarch64/fpu/math_private.h +++ b/sysdeps/aarch64/fpu/math_private.h @@ -1,5 +1,5 @@ /* Private floating point rounding and exceptions handling. AArch64 version. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -22,27 +22,6 @@ #include <fenv.h> #include <fpu_control.h> -#define math_opt_barrier(x) \ -({ __typeof (x) __x = (x); __asm ("" : "+w" (__x)); __x; }) -#define math_force_eval(x) \ -({ __typeof (x) __x = (x); __asm __volatile__ ("" : : "w" (__x)); }) - -extern __always_inline double -__ieee754_sqrt (double d) -{ - double res; - asm __volatile__ ("fsqrt %d0, %d1" : "=w" (res) : "w" (d)); - return res; -} - -extern __always_inline float -__ieee754_sqrtf (float s) -{ - float res; - asm __volatile__ ("fsqrt %s0, %s1" : "=w" (res) : "w" (s)); - return res; -} - static __always_inline void libc_feholdexcept_aarch64 (fenv_t *envp) { @@ -319,6 +298,26 @@ libc_feresetround_noex_aarch64_ctx (struct rm_ctx *ctx) #define libc_feresetround_noexf_ctx libc_feresetround_noex_aarch64_ctx #define libc_feresetround_noexl_ctx libc_feresetround_noex_aarch64_ctx +/* Hack: only include the large arm_neon.h when needed. */ +#ifdef _MATH_CONFIG_H +# include <arm_neon.h> + +/* ACLE intrinsics for frintn and fcvtns instructions. */ +# define TOINT_INTRINSICS 1 + +static inline double_t +roundtoint (double_t x) +{ + return vget_lane_f64 (vrndn_f64 (vld1_f64 (&x)), 0); +} + +static inline uint64_t +converttoint (double_t x) +{ + return vcvtnd_s64_f64 (x); +} +#endif + #include_next <math_private.h> #endif diff --git a/sysdeps/aarch64/fpu/s_ceil.c b/sysdeps/aarch64/fpu/s_ceil.c index 60c444d70a..4e3b5fdc4d 100644 --- a/sysdeps/aarch64/fpu/s_ceil.c +++ b/sysdeps/aarch64/fpu/s_ceil.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,6 +16,13 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#define FUNC ceil -#define INSN "frintp" -#include <s_frint.c> +#include <math.h> +#include <libm-alias-double.h> + +double +__ceil (double x) +{ + return __builtin_ceil (x); +} + +libm_alias_double (__ceil, ceil) diff --git a/sysdeps/aarch64/fpu/s_ceilf.c b/sysdeps/aarch64/fpu/s_ceilf.c index 1f1fb0677a..d01dadc246 100644 --- a/sysdeps/aarch64/fpu/s_ceilf.c +++ b/sysdeps/aarch64/fpu/s_ceilf.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,6 +16,13 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. 
*/ -#define FUNC ceilf -#define INSN "frintp" -#include <s_frintf.c> +#include <math.h> +#include <libm-alias-float.h> + +float +__ceilf (float x) +{ + return __builtin_ceilf (x); +} + +libm_alias_float (__ceil, ceil) diff --git a/sysdeps/aarch64/fpu/s_floor.c b/sysdeps/aarch64/fpu/s_floor.c index c490a27b66..7258246896 100644 --- a/sysdeps/aarch64/fpu/s_floor.c +++ b/sysdeps/aarch64/fpu/s_floor.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,6 +16,13 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#define FUNC floor -#define INSN "frintm" -#include <s_frint.c> +#include <math.h> +#include <libm-alias-double.h> + +double +__floor (double x) +{ + return __builtin_floor (x); +} + +libm_alias_double (__floor, floor) diff --git a/sysdeps/aarch64/fpu/s_floorf.c b/sysdeps/aarch64/fpu/s_floorf.c index ec81683058..bd197013ad 100644 --- a/sysdeps/aarch64/fpu/s_floorf.c +++ b/sysdeps/aarch64/fpu/s_floorf.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,6 +16,13 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#define FUNC floorf -#define INSN "frintm" -#include <s_frintf.c> +#include <math.h> +#include <libm-alias-float.h> + +float +__floorf (float x) +{ + return __builtin_floorf (x); +} + +libm_alias_float (__floor, floor) diff --git a/sysdeps/aarch64/fpu/s_fma.c b/sysdeps/aarch64/fpu/s_fma.c index 3320ba6989..4f3d6c4a51 100644 --- a/sysdeps/aarch64/fpu/s_fma.c +++ b/sysdeps/aarch64/fpu/s_fma.c @@ -1,4 +1,4 @@ -/* Copyright (C) 1996-2016 Free Software Foundation, Inc. +/* Copyright (C) 1996-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -17,29 +17,12 @@ <http://www.gnu.org/licenses/>. */ #include <math.h> +#include <libm-alias-double.h> -#ifndef FUNC -# define FUNC fma -#endif - -#ifndef TYPE -# define TYPE double -# define REGS "d" -#else -# ifndef REGS -# error REGS not defined -# endif -#endif - -#define __CONCATX(a,b) __CONCAT(a,b) - -TYPE -__CONCATX(__,FUNC) (TYPE x, TYPE y, TYPE z) +double +__fma (double x, double y, double z) { - TYPE result; - asm ( "fmadd" "\t%" REGS "0, %" REGS "1, %" REGS "2, %" REGS "3" - : "=w" (result) : "w" (x), "w" (y), "w" (z) ); - return result; + return __builtin_fma (x, y, z); } -weak_alias (__CONCATX(__,FUNC), FUNC) +libm_alias_double (__fma, fma) diff --git a/sysdeps/aarch64/fpu/s_fmaf.c b/sysdeps/aarch64/fpu/s_fmaf.c index e35512c31b..197cf7bd06 100644 --- a/sysdeps/aarch64/fpu/s_fmaf.c +++ b/sysdeps/aarch64/fpu/s_fmaf.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,7 +16,13 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#define FUNC fmaf -#define TYPE float -#define REGS "s" -#include <s_fma.c> +#include <math.h> +#include <libm-alias-float.h> + +float +__fmaf (float x, float y, float z) +{ + return __builtin_fmaf (x, y, z); +} + +libm_alias_float (__fma, fma) diff --git a/sysdeps/aarch64/fpu/s_fmax.c b/sysdeps/aarch64/fpu/s_fmax.c index bc41ec44f6..65de8ceab3 100644 --- a/sysdeps/aarch64/fpu/s_fmax.c +++ b/sysdeps/aarch64/fpu/s_fmax.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. 
+/* Copyright (C) 2011-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,6 +16,13 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#define FUNC fmax -#define INSN "fmaxnm" -#include <s_fmin.c> +#include <math.h> +#include <libm-alias-double.h> + +double +__fmax (double x, double y) +{ + return __builtin_fmax (x, y); +} + +libm_alias_double (__fmax, fmax) diff --git a/sysdeps/aarch64/fpu/s_fmaxf.c b/sysdeps/aarch64/fpu/s_fmaxf.c index 6a234bbeaf..2056700426 100644 --- a/sysdeps/aarch64/fpu/s_fmaxf.c +++ b/sysdeps/aarch64/fpu/s_fmaxf.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,8 +16,13 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#define FUNC fmaxf -#define INSN "fmaxnm" -#define TYPE float -#define REGS "s" -#include <s_fmin.c> +#include <math.h> +#include <libm-alias-float.h> + +float +__fmaxf (float x, float y) +{ + return __builtin_fmaxf (x, y); +} + +libm_alias_float (__fmax, fmax) diff --git a/sysdeps/aarch64/fpu/s_fmin.c b/sysdeps/aarch64/fpu/s_fmin.c index 695abafb9b..a42b565498 100644 --- a/sysdeps/aarch64/fpu/s_fmin.c +++ b/sysdeps/aarch64/fpu/s_fmin.c @@ -1,4 +1,4 @@ -/* Copyright (C) 1996-2016 Free Software Foundation, Inc. +/* Copyright (C) 1996-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -17,33 +17,12 @@ <http://www.gnu.org/licenses/>. */ #include <math.h> +#include <libm-alias-double.h> -#ifndef FUNC -# define FUNC fmin -#endif - -#ifndef INSN -# define INSN "fminnm" -#endif - -#ifndef TYPE -# define TYPE double -# define REGS "d" -#else -# ifndef REGS -# error REGS not defined -# endif -#endif - -#define __CONCATX(a,b) __CONCAT(a,b) - -TYPE -__CONCATX(__,FUNC) (TYPE x, TYPE y) +double +__fmin (double x, double y) { - TYPE result; - asm ( INSN "\t%" REGS "0, %" REGS "1, %" REGS "2" - : "=w" (result) : "w" (x), "w" (y) ); - return result; + return __builtin_fmin (x, y); } -weak_alias (__CONCATX(__,FUNC), FUNC) +libm_alias_double (__fmin, fmin) diff --git a/sysdeps/aarch64/fpu/s_fminf.c b/sysdeps/aarch64/fpu/s_fminf.c index 78609575bb..028f88543f 100644 --- a/sysdeps/aarch64/fpu/s_fminf.c +++ b/sysdeps/aarch64/fpu/s_fminf.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,7 +16,13 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#define FUNC fminf -#define TYPE float -#define REGS "s" -#include <s_fmin.c> +#include <math.h> +#include <libm-alias-float.h> + +float +__fminf (float x, float y) +{ + return __builtin_fminf (x, y); +} + +libm_alias_float (__fmin, fmin) diff --git a/sysdeps/aarch64/fpu/s_llrint.c b/sysdeps/aarch64/fpu/s_llrint.c index 9769311b1b..287dc8974c 100644 --- a/sysdeps/aarch64/fpu/s_llrint.c +++ b/sysdeps/aarch64/fpu/s_llrint.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,6 +16,21 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. 
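Throughout these fpu/ files the pattern of the change is the same: a hand-written inline-asm template (frintp, frintm, fmadd, fminnm and friends) is replaced by the corresponding GCC builtin, which compiles to the identical instruction while letting the compiler see through the operation. A minimal sketch of the fmin/fmax case; illustrative only, the my_ names are hypothetical and not from the patch:

    #include <math.h>

    /* On AArch64, GCC expands these builtins to fminnm/fmaxnm, the
       IEEE 754-2008 minNum/maxNum instructions, which already implement
       the C99 fmin/fmax rule that a quiet NaN operand is ignored.  */
    double my_fmin (double x, double y) { return __builtin_fmin (x, y); }
    double my_fmax (double x, double y) { return __builtin_fmax (x, y); }

Unlike the old asm templates, the builtin form can also be constant-folded and scheduled freely by the compiler.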
*/ -#define FUNC llrint -#define OTYPE long long int -#include <s_lrint.c> +#include <math.h> +#include <math-barriers.h> +#include <math_private.h> +#include <libm-alias-double.h> + +long long int +__llrint (double x) +{ + double r = __builtin_rint (x); + + /* Prevent gcc from calling llrint directly when compiled with + -fno-math-errno by inserting a barrier. */ + + math_opt_barrier (r); + return r; +} + +libm_alias_double (__llrint, llrint) diff --git a/sysdeps/aarch64/fpu/s_llrintf.c b/sysdeps/aarch64/fpu/s_llrintf.c index 51024a986c..70d6c0ca84 100644 --- a/sysdeps/aarch64/fpu/s_llrintf.c +++ b/sysdeps/aarch64/fpu/s_llrintf.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,8 +16,22 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#define FUNC llrintf -#define ITYPE float -#define IREGS "s" -#define OTYPE long long int -#include <s_lrint.c> +#include <math.h> +#include <math-barriers.h> +#include <math_private.h> +#include <libm-alias-float.h> + +long long int +__llrintf (float x) +{ + float r = __builtin_rintf (x); + + /* Prevent gcc from calling llrintf directly when compiled with + -fno-math-errno by inserting a barrier. */ + + + math_opt_barrier (r); + return r; +} + +libm_alias_float (__llrint, llrint) diff --git a/sysdeps/aarch64/fpu/s_llround.c b/sysdeps/aarch64/fpu/s_llround.c index dd2b3a2e16..d6ff708f41 100644 --- a/sysdeps/aarch64/fpu/s_llround.c +++ b/sysdeps/aarch64/fpu/s_llround.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,6 +16,13 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#define FUNC llround -#define OTYPE long long int -#include <s_lround.c> +#include <math.h> +#include <libm-alias-double.h> + +long long int +__llround (double x) +{ + return __builtin_llround (x); +} + +libm_alias_double (__llround, llround) diff --git a/sysdeps/aarch64/fpu/s_llroundf.c b/sysdeps/aarch64/fpu/s_llroundf.c index 8b5d263d91..4cb2caf8b5 100644 --- a/sysdeps/aarch64/fpu/s_llroundf.c +++ b/sysdeps/aarch64/fpu/s_llroundf.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,8 +16,13 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#define FUNC llroundf -#define ITYPE float -#define IREGS "s" -#define OTYPE long long int -#include <s_lround.c> +#include <math.h> +#include <libm-alias-float.h> + +long long int +__llroundf (float x) +{ + return __builtin_llroundf (x); +} + +libm_alias_float (__llround, llround) diff --git a/sysdeps/aarch64/fpu/s_lrint.c b/sysdeps/aarch64/fpu/s_lrint.c index e51de0fc2d..d6ef066e45 100644 --- a/sysdeps/aarch64/fpu/s_lrint.c +++ b/sysdeps/aarch64/fpu/s_lrint.c @@ -1,4 +1,4 @@ -/* Copyright (C) 1996-2016 Free Software Foundation, Inc. +/* Copyright (C) 1996-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -17,37 +17,73 @@ <http://www.gnu.org/licenses/>. 
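The barrier in the new __llrint/__llrintf above is load-bearing: with -fno-math-errno, GCC recognises a rint followed by an integer conversion and contracts it back into a call to llrint, the very function being defined, so the library would recurse. Hiding the rounded value behind an empty asm blocks that pattern match. A sketch of the idea, with a hypothetical opt_barrier standing in for glibc's math_opt_barrier:

    /* Stand-in for glibc's math_opt_barrier: an empty asm that makes the
       value opaque to the optimiser.  "+w" pins it to an FP/SIMD register
       and is AArch64-specific.  */
    #define opt_barrier(x) __asm__ ("" : "+w" (x))

    long long int
    my_llrint (double x)
    {
      double r = __builtin_rint (x);
      opt_barrier (r);              /* stop rint+cast -> llrint folding */
      return (long long int) r;
    }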
*/ #include <math.h> +#include <get-rounding-mode.h> +#include <stdint.h> +#include <math-barriers.h> +#include <math_private.h> +#include <libm-alias-double.h> -#ifndef FUNC -# define FUNC lrint -#endif +# define IREG_SIZE 64 + +# ifdef __ILP32__ +# define OREG_SIZE 32 +# else +# define OREG_SIZE 64 +# endif -#ifndef ITYPE -# define ITYPE double # define IREGS "d" + +#if OREG_SIZE == 32 +# define OREGS "w" #else -# ifndef IREGS -# error IREGS not defined -# endif +# define OREGS "x" #endif -#ifndef OTYPE -# define OTYPE long int + +long int +__lrint (double x) +{ + +#if IREG_SIZE == 64 && OREG_SIZE == 32 + long int result; + + if (__builtin_fabs (x) > INT32_MAX) + { + /* Converting large values to a 32 bit int may cause the frintx/fcvtza + sequence to set both FE_INVALID and FE_INEXACT. To avoid this + check the rounding mode and do a single instruction with the + appropriate rounding mode. */ + + switch (get_rounding_mode ()) + { + case FE_TONEAREST: + asm volatile ("fcvtns" "\t%" OREGS "0, %" IREGS "1" + : "=r" (result) : "w" (x)); + break; + case FE_UPWARD: + asm volatile ("fcvtps" "\t%" OREGS "0, %" IREGS "1" + : "=r" (result) : "w" (x)); + break; + case FE_DOWNWARD: + asm volatile ("fcvtms" "\t%" OREGS "0, %" IREGS "1" + : "=r" (result) : "w" (x)); + break; + case FE_TOWARDZERO: + default: + asm volatile ("fcvtzs" "\t%" OREGS "0, %" IREGS "1" + : "=r" (result) : "w" (x)); + } + return result; + } #endif -#define OREGS "x" + double r = __builtin_rint (x); -#define __CONCATX(a,b) __CONCAT(a,b) + /* Prevent gcc from calling lrint directly when compiled with + -fno-math-errno by inserting a barrier. */ -OTYPE -__CONCATX(__,FUNC) (ITYPE x) -{ - OTYPE result; - ITYPE temp; - asm ( "frintx" "\t%" IREGS "1, %" IREGS "2\n\t" - "fcvtzs" "\t%" OREGS "0, %" IREGS "1" - : "=r" (result), "=w" (temp) : "w" (x) ); - return result; + math_opt_barrier (r); + return r; } -weak_alias (__CONCATX(__,FUNC), FUNC) +libm_alias_double (__lrint, lrint) diff --git a/sysdeps/aarch64/fpu/s_lrintf.c b/sysdeps/aarch64/fpu/s_lrintf.c index b603510375..fa42d68dd1 100644 --- a/sysdeps/aarch64/fpu/s_lrintf.c +++ b/sysdeps/aarch64/fpu/s_lrintf.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,7 +16,21 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#define FUNC lrintf -#define ITYPE float -#define IREGS "s" -#include <s_lrint.c> +#include <math.h> +#include <math-barriers.h> +#include <math_private.h> +#include <libm-alias-float.h> + +long int +__lrintf (float x) +{ + float r = __builtin_rintf (x); + + /* Prevent gcc from calling lrintf directly when compiled with + -fno-math-errno by inserting a barrier. */ + + math_opt_barrier (r); + return r; +} + +libm_alias_float (__lrint, lrint) diff --git a/sysdeps/aarch64/fpu/s_lround.c b/sysdeps/aarch64/fpu/s_lround.c index a14d347142..d8e9fc4664 100644 --- a/sysdeps/aarch64/fpu/s_lround.c +++ b/sysdeps/aarch64/fpu/s_lround.c @@ -1,4 +1,4 @@ -/* Copyright (C) 1996-2016 Free Software Foundation, Inc. +/* Copyright (C) 1996-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -17,35 +17,12 @@ <http://www.gnu.org/licenses/>. 
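The special-cased path in __lrint above only matters when the result register is 32-bit (ILP32): for |x| > INT32_MAX, the generic frintx-then-fcvtzs sequence can raise FE_INVALID from the narrowing conversion on top of FE_INEXACT from the rounding step. Doing the whole job in one convert instruction that honours the current rounding mode sidesteps the double exception. One arm of the switch written out as a standalone sketch, assuming AArch64:

    #include <stdint.h>

    /* fcvtns: double -> int32 in a single instruction, rounding to
       nearest with ties to even, so at most one exception is raised.  */
    static int32_t
    to_int32_nearest (double x)
    {
      int32_t result;
      asm volatile ("fcvtns %w0, %d1" : "=r" (result) : "w" (x));
      return result;
    }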
*/ #include <math.h> +#include <libm-alias-double.h> -#ifndef FUNC -# define FUNC lround -#endif +long int +__lround (double x) + { + return __builtin_lround (x); + } -#ifndef ITYPE -# define ITYPE double -# define IREGS "d" -#else -# ifndef IREGS -# error IREGS not defined -# endif -#endif - -#ifndef OTYPE -# define OTYPE long int -#endif - -#define OREGS "x" - -#define __CONCATX(a,b) __CONCAT(a,b) - -OTYPE -__CONCATX(__,FUNC) (ITYPE x) -{ - OTYPE result; - asm ( "fcvtas" "\t%" OREGS "0, %" IREGS "1" - : "=r" (result) : "w" (x) ); - return result; -} - -weak_alias (__CONCATX(__,FUNC), FUNC) +libm_alias_double (__lround, lround) diff --git a/sysdeps/aarch64/fpu/s_lroundf.c b/sysdeps/aarch64/fpu/s_lroundf.c index e3a3fba937..9a76dbf24a 100644 --- a/sysdeps/aarch64/fpu/s_lroundf.c +++ b/sysdeps/aarch64/fpu/s_lroundf.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,7 +16,13 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#define FUNC lroundf -#define ITYPE float -#define IREGS "s" -#include <s_lround.c> +#include <math.h> +#include <libm-alias-float.h> + +long int +__lroundf (float x) +{ + return __builtin_lroundf (x); +} + +libm_alias_float (__lround, lround) diff --git a/sysdeps/aarch64/fpu/s_nearbyint.c b/sysdeps/aarch64/fpu/s_nearbyint.c index 3be8059c1b..477e1a8cfe 100644 --- a/sysdeps/aarch64/fpu/s_nearbyint.c +++ b/sysdeps/aarch64/fpu/s_nearbyint.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,6 +16,13 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#define FUNC nearbyint -#define INSN "frinti" -#include <s_frint.c> +#include <math.h> +#include <libm-alias-double.h> + +double +__nearbyint (double x) +{ + return __builtin_nearbyint (x); +} + +libm_alias_double (__nearbyint, nearbyint) diff --git a/sysdeps/aarch64/fpu/s_nearbyintf.c b/sysdeps/aarch64/fpu/s_nearbyintf.c index e544e5597e..4edab204a9 100644 --- a/sysdeps/aarch64/fpu/s_nearbyintf.c +++ b/sysdeps/aarch64/fpu/s_nearbyintf.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,6 +16,13 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#define FUNC nearbyintf -#define INSN "frinti" -#include <s_frintf.c> +#include <math.h> +#include <libm-alias-float.h> + +float +__nearbyintf (float x) +{ + return __builtin_nearbyintf (x); +} + +libm_alias_float (__nearbyint, nearbyint) diff --git a/sysdeps/aarch64/fpu/s_rint.c b/sysdeps/aarch64/fpu/s_rint.c index d12c1a7c46..eb4232af00 100644 --- a/sysdeps/aarch64/fpu/s_rint.c +++ b/sysdeps/aarch64/fpu/s_rint.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,6 +16,13 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. 
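The removed lround template spelled out fcvtas, which converts with rounding to nearest, ties away from zero, which is exactly the C99 lround rule, so no separate rounding step is needed. The builtin form normally compiles back down to that same single instruction; a sketch, illustrative only:

    /* GCC typically emits "fcvtas x0, d0" for this builtin on AArch64
       (it may fall back to a libcall under some flag combinations),
       matching the removed inline-asm template.  */
    static long int
    my_lround (double x)
    {
      return __builtin_lround (x);
    }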
*/ -#define FUNC rint -#define INSN "frintx" -#include <s_frint.c> +#include <math.h> +#include <libm-alias-double.h> + +double +__rint (double x) +{ + return __builtin_rint (x); +} + +libm_alias_double (__rint, rint) diff --git a/sysdeps/aarch64/fpu/s_rintf.c b/sysdeps/aarch64/fpu/s_rintf.c index 375be92734..9ebfcb45b4 100644 --- a/sysdeps/aarch64/fpu/s_rintf.c +++ b/sysdeps/aarch64/fpu/s_rintf.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,6 +16,13 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#define FUNC rintf -#define INSN "frintx" -#include <s_frintf.c> +#include <math.h> +#include <libm-alias-float.h> + +float +__rintf (float x) +{ + return __builtin_rintf (x); +} + +libm_alias_float (__rint, rint) diff --git a/sysdeps/aarch64/fpu/s_round.c b/sysdeps/aarch64/fpu/s_round.c index 239ca1b434..275b00f318 100644 --- a/sysdeps/aarch64/fpu/s_round.c +++ b/sysdeps/aarch64/fpu/s_round.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,6 +16,13 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#define FUNC round -#define INSN "frinta" -#include <s_frint.c> +#include <math.h> +#include <libm-alias-double.h> + +double +__round (double x) +{ + return __builtin_round (x); +} + +libm_alias_double (__round, round) diff --git a/sysdeps/aarch64/fpu/s_roundf.c b/sysdeps/aarch64/fpu/s_roundf.c index eb13586dc5..201ffc54db 100644 --- a/sysdeps/aarch64/fpu/s_roundf.c +++ b/sysdeps/aarch64/fpu/s_roundf.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,6 +16,13 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#define FUNC roundf -#define INSN "frinta" -#include <s_frintf.c> +#include <math.h> +#include <libm-alias-float.h> + +float +__roundf (float x) +{ + return __builtin_roundf (x); +} + +libm_alias_float (__round, round) diff --git a/sysdeps/aarch64/fpu/s_trunc.c b/sysdeps/aarch64/fpu/s_trunc.c index 7107a1b18b..22f44f1e04 100644 --- a/sysdeps/aarch64/fpu/s_trunc.c +++ b/sysdeps/aarch64/fpu/s_trunc.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,6 +16,13 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#define FUNC trunc -#define INSN "frintz" -#include <s_frint.c> +#include <math.h> +#include <libm-alias-double.h> + +double +__trunc (double x) +{ + return __builtin_trunc (x); +} + +libm_alias_double (__trunc, trunc) diff --git a/sysdeps/aarch64/fpu/s_truncf.c b/sysdeps/aarch64/fpu/s_truncf.c index b08c1cb304..3771b6d6bf 100644 --- a/sysdeps/aarch64/fpu/s_truncf.c +++ b/sysdeps/aarch64/fpu/s_truncf.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2016 Free Software Foundation, Inc. +/* Copyright (C) 2011-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,6 +16,13 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. 
*/ -#define FUNC truncf -#define INSN "frintz" -#include <s_frintf.c> +#include <math.h> +#include <libm-alias-float.h> + +float +__truncf (float x) +{ + return __builtin_truncf (x); +} + +libm_alias_float (__trunc, trunc) diff --git a/sysdeps/aarch64/jmpbuf-offsets.h b/sysdeps/aarch64/jmpbuf-offsets.h index 37f551a56b..a7096f2f69 100644 --- a/sysdeps/aarch64/jmpbuf-offsets.h +++ b/sysdeps/aarch64/jmpbuf-offsets.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2006-2016 Free Software Foundation, Inc. +/* Copyright (C) 2006-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/jmpbuf-unwind.h b/sysdeps/aarch64/jmpbuf-unwind.h index 3e0a37db4e..9309e7b0ea 100644 --- a/sysdeps/aarch64/jmpbuf-unwind.h +++ b/sysdeps/aarch64/jmpbuf-unwind.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2005-2016 Free Software Foundation, Inc. +/* Copyright (C) 2005-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -27,7 +27,8 @@ ((void *) (address) < (void *) demangle (jmpbuf[JB_SP])) #define _JMPBUF_CFA_UNWINDS_ADJ(jmpbuf, context, adj) \ - _JMPBUF_UNWINDS_ADJ (jmpbuf, (void *) _Unwind_GetCFA (context), adj) + _JMPBUF_UNWINDS_ADJ (jmpbuf, (void *) (uintptr_t) _Unwind_GetCFA (context), \ + adj) #define _JMPBUF_UNWINDS_ADJ(_jmpbuf, _address, _adj) \ ((uintptr_t) (_address) - (_adj) < _jmpbuf_sp (_jmpbuf) - (_adj)) diff --git a/sysdeps/aarch64/ldsodefs.h b/sysdeps/aarch64/ldsodefs.h index 7c02e89371..600b721234 100644 --- a/sysdeps/aarch64/ldsodefs.h +++ b/sysdeps/aarch64/ldsodefs.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2005-2016 Free Software Foundation, Inc. +/* Copyright (C) 2005-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -20,6 +20,7 @@ #define _AARCH64_LDSODEFS_H 1 #include <elf.h> +#include <cpu-features.h> struct La_aarch64_regs; struct La_aarch64_retval; diff --git a/sysdeps/aarch64/libc-tls.c b/sysdeps/aarch64/libc-tls.c index f43c75f9b5..f2a2e1c0d3 100644 --- a/sysdeps/aarch64/libc-tls.c +++ b/sysdeps/aarch64/libc-tls.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2005-2016 Free Software Foundation, Inc. +/* Copyright (C) 2005-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. 
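The jmpbuf-unwind.h change above is an ILP32 fix: _Unwind_GetCFA returns _Unwind_Word, which stays a 64-bit machine word on AArch64 even under the 32-bit-pointer ABI, so casting it straight to void * would narrow implicitly and trip -Wint-to-pointer-cast. Going through uintptr_t makes the narrowing explicit. Reduced to an illustrative fragment:

    #include <stdint.h>
    #include <unwind.h>

    /* _Unwind_Word is 64-bit on AArch64, including ILP32; uintptr_t
       matches the actual pointer width of the ABI being built.  */
    static void *
    cfa_as_pointer (struct _Unwind_Context *context)
    {
      return (void *) (uintptr_t) _Unwind_GetCFA (context);
    }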
diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps index bc5795b361..be06085154 100644 --- a/sysdeps/aarch64/libm-test-ulps +++ b/sysdeps/aarch64/libm-test-ulps @@ -40,9 +40,9 @@ ildouble: 2 ldouble: 2 Function: "acosh_downward": -double: 1 +double: 2 float: 2 -idouble: 1 +idouble: 2 ifloat: 2 ildouble: 3 ldouble: 3 @@ -252,36 +252,36 @@ ildouble: 2 ldouble: 2 Function: Imaginary part of "cacos": -double: 1 +double: 2 float: 2 -idouble: 1 +idouble: 2 ifloat: 2 ildouble: 2 ldouble: 2 Function: Real part of "cacos_downward": -double: 2 +double: 3 float: 2 -idouble: 2 +idouble: 3 ifloat: 2 -ildouble: 2 -ldouble: 2 +ildouble: 3 +ldouble: 3 Function: Imaginary part of "cacos_downward": double: 5 float: 3 idouble: 5 ifloat: 3 -ildouble: 5 -ldouble: 5 +ildouble: 6 +ldouble: 6 Function: Real part of "cacos_towardzero": -double: 2 +double: 3 float: 2 -idouble: 2 +idouble: 3 ifloat: 2 -ildouble: 2 -ldouble: 2 +ildouble: 3 +ldouble: 3 Function: Imaginary part of "cacos_towardzero": double: 5 @@ -300,17 +300,17 @@ ildouble: 3 ldouble: 3 Function: Imaginary part of "cacos_upward": -double: 4 -float: 4 -idouble: 4 -ifloat: 4 -ildouble: 5 -ldouble: 5 +double: 5 +float: 5 +idouble: 5 +ifloat: 5 +ildouble: 7 +ldouble: 7 Function: Real part of "cacosh": -double: 1 +double: 2 float: 2 -idouble: 1 +idouble: 2 ifloat: 2 ildouble: 2 ldouble: 2 @@ -332,12 +332,12 @@ ildouble: 5 ldouble: 5 Function: Imaginary part of "cacosh_downward": -double: 2 -float: 2 -idouble: 2 -ifloat: 2 -ildouble: 2 -ldouble: 2 +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 4 +ldouble: 4 Function: Real part of "cacosh_towardzero": double: 5 @@ -348,28 +348,28 @@ ildouble: 5 ldouble: 5 Function: Imaginary part of "cacosh_towardzero": -double: 2 +double: 3 float: 2 -idouble: 2 +idouble: 3 ifloat: 2 -ildouble: 2 -ldouble: 2 +ildouble: 3 +ldouble: 3 Function: Real part of "cacosh_upward": double: 4 float: 4 idouble: 4 ifloat: 4 -ildouble: 5 -ldouble: 5 +ildouble: 6 +ldouble: 6 Function: Imaginary part of "cacosh_upward": -double: 2 +double: 3 float: 2 -idouble: 2 +idouble: 3 ifloat: 2 -ildouble: 3 -ldouble: 3 +ildouble: 4 +ldouble: 4 Function: "carg": double: 1 @@ -412,18 +412,18 @@ ildouble: 2 ldouble: 2 Function: Imaginary part of "casin": -double: 1 +double: 2 float: 2 -idouble: 1 +idouble: 2 ifloat: 2 ildouble: 2 ldouble: 2 Function: Real part of "casin_downward": double: 3 -float: 1 +float: 2 idouble: 3 -ifloat: 1 +ifloat: 2 ildouble: 3 ldouble: 3 @@ -432,8 +432,8 @@ double: 5 float: 3 idouble: 5 ifloat: 3 -ildouble: 5 -ldouble: 5 +ildouble: 6 +ldouble: 6 Function: Real part of "casin_towardzero": double: 3 @@ -452,25 +452,25 @@ ildouble: 5 ldouble: 5 Function: Real part of "casin_upward": -double: 2 -float: 1 -idouble: 2 -ifloat: 1 +double: 3 +float: 2 +idouble: 3 +ifloat: 2 ildouble: 3 ldouble: 3 Function: Imaginary part of "casin_upward": -double: 4 -float: 4 -idouble: 4 -ifloat: 4 -ildouble: 5 -ldouble: 5 +double: 5 +float: 5 +idouble: 5 +ifloat: 5 +ildouble: 7 +ldouble: 7 Function: Real part of "casinh": -double: 1 +double: 2 float: 2 -idouble: 1 +idouble: 2 ifloat: 2 ildouble: 2 ldouble: 2 @@ -488,14 +488,14 @@ double: 5 float: 3 idouble: 5 ifloat: 3 -ildouble: 5 -ldouble: 5 +ildouble: 6 +ldouble: 6 Function: Imaginary part of "casinh_downward": double: 3 -float: 1 +float: 2 idouble: 3 -ifloat: 1 +ifloat: 2 ildouble: 3 ldouble: 3 @@ -516,23 +516,25 @@ ildouble: 3 ldouble: 3 Function: Real part of "casinh_upward": -double: 4 -float: 4 -idouble: 4 -ifloat: 4 -ildouble: 5 -ldouble: 5 +double: 5 
+float: 5 +idouble: 5 +ifloat: 5 +ildouble: 7 +ldouble: 7 Function: Imaginary part of "casinh_upward": -double: 2 +double: 3 float: 2 -idouble: 2 +idouble: 3 ifloat: 2 ildouble: 3 ldouble: 3 Function: Real part of "catan": +double: 1 float: 1 +idouble: 1 ifloat: 1 ildouble: 1 ldouble: 1 @@ -547,9 +549,9 @@ ldouble: 1 Function: Real part of "catan_downward": double: 1 -float: 1 +float: 2 idouble: 1 -ifloat: 1 +ifloat: 2 ildouble: 2 ldouble: 2 @@ -563,25 +565,27 @@ ldouble: 3 Function: Real part of "catan_towardzero": double: 1 -float: 1 +float: 2 idouble: 1 -ifloat: 1 +ifloat: 2 ildouble: 2 ldouble: 2 Function: Imaginary part of "catan_towardzero": double: 2 -float: 1 +float: 2 idouble: 2 -ifloat: 1 +ifloat: 2 ildouble: 3 ldouble: 3 Function: Real part of "catan_upward": +double: 1 float: 1 +idouble: 1 ifloat: 1 -ildouble: 1 -ldouble: 1 +ildouble: 2 +ldouble: 2 Function: Imaginary part of "catan_upward": double: 3 @@ -600,7 +604,9 @@ ildouble: 1 ldouble: 1 Function: Imaginary part of "catanh": +double: 1 float: 1 +idouble: 1 ifloat: 1 ildouble: 1 ldouble: 1 @@ -623,9 +629,9 @@ ldouble: 2 Function: Real part of "catanh_towardzero": double: 2 -float: 1 +float: 2 idouble: 2 -ifloat: 1 +ifloat: 2 ildouble: 3 ldouble: 3 @@ -639,17 +645,19 @@ ldouble: 2 Function: Real part of "catanh_upward": double: 4 -float: 3 +float: 4 idouble: 4 -ifloat: 3 +ifloat: 4 ildouble: 4 ldouble: 4 Function: Imaginary part of "catanh_upward": +double: 1 float: 1 +idouble: 1 ifloat: 1 -ildouble: 1 -ldouble: 1 +ildouble: 2 +ldouble: 2 Function: "cbrt": double: 3 @@ -900,18 +908,18 @@ ildouble: 2 ldouble: 2 Function: Imaginary part of "clog10": -double: 1 +double: 2 float: 2 -idouble: 1 +idouble: 2 ifloat: 2 ildouble: 2 ldouble: 2 Function: Real part of "clog10_downward": double: 5 -float: 4 +float: 5 idouble: 5 -ifloat: 4 +ifloat: 5 ildouble: 3 ldouble: 3 @@ -1004,7 +1012,9 @@ ildouble: 2 ldouble: 2 Function: "cos": +double: 1 float: 1 +idouble: 1 ifloat: 1 ildouble: 1 ldouble: 1 @@ -1323,9 +1333,9 @@ ldouble: 3 Function: Imaginary part of "ctan": double: 2 -float: 1 +float: 2 idouble: 2 -ifloat: 1 +ifloat: 2 ildouble: 3 ldouble: 3 @@ -1339,9 +1349,9 @@ ldouble: 4 Function: Imaginary part of "ctan_downward": double: 2 -float: 1 +float: 2 idouble: 2 -ifloat: 1 +ifloat: 2 ildouble: 5 ldouble: 5 @@ -1363,9 +1373,9 @@ ldouble: 5 Function: Real part of "ctan_upward": double: 2 -float: 3 +float: 4 idouble: 2 -ifloat: 3 +ifloat: 4 ildouble: 5 ldouble: 5 @@ -1395,9 +1405,9 @@ ldouble: 3 Function: Real part of "ctanh_downward": double: 4 -float: 1 +float: 2 idouble: 4 -ifloat: 1 +ifloat: 2 ildouble: 5 ldouble: 5 @@ -1575,15 +1585,21 @@ ldouble: 2 Function: "exp_downward": double: 1 +float: 1 idouble: 1 +ifloat: 1 Function: "exp_towardzero": double: 1 +float: 1 idouble: 1 +ifloat: 1 Function: "exp_upward": double: 1 +float: 1 idouble: 1 +ifloat: 1 Function: "expm1": double: 1 @@ -1683,9 +1699,9 @@ ldouble: 2 Function: "j0_downward": double: 2 -float: 3 +float: 4 idouble: 2 -ifloat: 3 +ifloat: 4 ildouble: 4 ldouble: 4 @@ -1910,55 +1926,27 @@ ildouble: 1 ldouble: 1 Function: "log_towardzero": -float: 1 -ifloat: 1 +float: 2 +ifloat: 2 ildouble: 2 ldouble: 2 Function: "log_upward": double: 1 -float: 1 +float: 2 idouble: 1 -ifloat: 1 +ifloat: 2 ildouble: 1 ldouble: 1 Function: "pow": +double: 1 float: 1 +idouble: 1 ifloat: 1 ildouble: 2 ldouble: 2 -Function: "pow10": -double: 2 -idouble: 2 -ildouble: 2 -ldouble: 2 - -Function: "pow10_downward": -double: 2 -float: 1 -idouble: 2 -ifloat: 1 -ildouble: 3 -ldouble: 3 - -Function: 
"pow10_towardzero": -double: 2 -float: 1 -idouble: 2 -ifloat: 1 -ildouble: 3 -ldouble: 3 - -Function: "pow10_upward": -double: 2 -float: 1 -idouble: 2 -ifloat: 1 -ildouble: 3 -ldouble: 3 - Function: "pow_downward": double: 1 float: 1 @@ -1984,7 +1972,9 @@ ildouble: 2 ldouble: 2 Function: "sin": +double: 1 float: 1 +idouble: 1 ifloat: 1 ildouble: 1 ldouble: 1 @@ -2014,7 +2004,9 @@ ildouble: 3 ldouble: 3 Function: "sincos": +double: 1 float: 1 +idouble: 1 ifloat: 1 ildouble: 1 ldouble: 1 @@ -2179,9 +2171,9 @@ ldouble: 3 Function: "y0_downward": double: 3 -float: 2 +float: 4 idouble: 3 -ifloat: 2 +ifloat: 4 ildouble: 4 ldouble: 4 @@ -2195,9 +2187,9 @@ ldouble: 3 Function: "y0_upward": double: 2 -float: 3 +float: 5 idouble: 2 -ifloat: 3 +ifloat: 5 ildouble: 3 ldouble: 3 @@ -2235,17 +2227,17 @@ ldouble: 5 Function: "yn": double: 3 -float: 2 +float: 3 idouble: 3 -ifloat: 2 +ifloat: 3 ildouble: 5 ldouble: 5 Function: "yn_downward": double: 3 -float: 2 +float: 4 idouble: 3 -ifloat: 2 +ifloat: 4 ildouble: 5 ldouble: 5 @@ -2259,9 +2251,9 @@ ldouble: 5 Function: "yn_upward": double: 4 -float: 3 +float: 5 idouble: 4 -ifloat: 3 +ifloat: 5 ildouble: 5 ldouble: 5 diff --git a/sysdeps/aarch64/libm-test-ulps-name b/sysdeps/aarch64/libm-test-ulps-name new file mode 100644 index 0000000000..1f66c5cda0 --- /dev/null +++ b/sysdeps/aarch64/libm-test-ulps-name @@ -0,0 +1 @@ +AArch64 diff --git a/sysdeps/aarch64/linkmap.h b/sysdeps/aarch64/linkmap.h index 5aefb4cfeb..6852f343a1 100644 --- a/sysdeps/aarch64/linkmap.h +++ b/sysdeps/aarch64/linkmap.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2016 Free Software Foundation, Inc. +/* Copyright (C) 2009-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/machine-gmon.h b/sysdeps/aarch64/machine-gmon.h index 3d923f91ce..dd12a25416 100644 --- a/sysdeps/aarch64/machine-gmon.h +++ b/sysdeps/aarch64/machine-gmon.h @@ -1,5 +1,5 @@ /* AArch64 definitions for profiling support. - Copyright (C) 1996-2016 Free Software Foundation, Inc. + Copyright (C) 1996-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/math-tests.h b/sysdeps/aarch64/math-tests.h index a3e1efae97..46cc33741f 100644 --- a/sysdeps/aarch64/math-tests.h +++ b/sysdeps/aarch64/math-tests.h @@ -1,5 +1,5 @@ /* Configuration for math tests. AArch64 version. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/mcount.c b/sysdeps/aarch64/mcount.c index ca596ee986..515c6bde25 100644 --- a/sysdeps/aarch64/mcount.c +++ b/sysdeps/aarch64/mcount.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2013-2016 Free Software Foundation, Inc. +/* Copyright (C) 2013-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/memchr.S b/sysdeps/aarch64/memchr.S new file mode 100644 index 0000000000..e422aef090 --- /dev/null +++ b/sysdeps/aarch64/memchr.S @@ -0,0 +1,157 @@ +/* memchr - find a character in a memory zone + + Copyright (C) 2015-2018 Free Software Foundation, Inc. + + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +/* Assumptions: + * + * ARMv8-a, AArch64 + * Neon Available. + */ + +/* Arguments and results. */ +#define srcin x0 +#define chrin w1 +#define cntin x2 + +#define result x0 + +#define src x3 +#define tmp x4 +#define wtmp2 w5 +#define synd x6 +#define soff x9 +#define cntrem x10 + +#define vrepchr v0 +#define vdata1 v1 +#define vdata2 v2 +#define vhas_chr1 v3 +#define vhas_chr2 v4 +#define vrepmask v5 +#define vend v6 + +/* + * Core algorithm: + * + * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits + * per byte. For each tuple, bit 0 is set if the relevant byte matched the + * requested character and bit 1 is not used (faster than using a 32bit + * syndrome). Since the bits in the syndrome reflect exactly the order in which + * things occur in the original string, counting trailing zeros allows to + * identify exactly which byte has matched. + */ + +ENTRY (__memchr) + /* Do not dereference srcin if no bytes to compare. */ + cbz cntin, L(zero_length) + /* + * Magic constant 0x40100401 allows us to identify which lane matches + * the requested byte. + */ + mov wtmp2, #0x0401 + movk wtmp2, #0x4010, lsl #16 + dup vrepchr.16b, chrin + /* Work with aligned 32-byte chunks */ + bic src, srcin, #31 + dup vrepmask.4s, wtmp2 + ands soff, srcin, #31 + and cntrem, cntin, #31 + b.eq L(loop) + + /* + * Input string is not 32-byte aligned. We calculate the syndrome + * value for the aligned 32 bytes block containing the first bytes + * and mask the irrelevant part. + */ + + ld1 {vdata1.16b, vdata2.16b}, [src], #32 + sub tmp, soff, #32 + adds cntin, cntin, tmp + cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b + cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b + and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b + and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b + addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */ + addp vend.16b, vend.16b, vend.16b /* 128->64 */ + mov synd, vend.2d[0] + /* Clear the soff*2 lower bits */ + lsl tmp, soff, #1 + lsr synd, synd, tmp + lsl synd, synd, tmp + /* The first block can also be the last */ + b.ls L(masklast) + /* Have we found something already? 
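The syndrome comment above is the heart of the new memchr and is worth restating in scalar form: every byte of a 32-byte chunk contributes two bits to a 64-bit word, with the low bit of each pair set on a match, so counting trailing zeros and halving recovers the index of the first match. A C model of one chunk; illustrative only, the real code builds the syndrome with cmeq/and/addp and the repeating 0x40100401 mask:

    #include <stdint.h>
    #include <stddef.h>

    static const void *
    chunk_memchr32 (const unsigned char *p, unsigned char c)
    {
      uint64_t synd = 0;
      for (int i = 0; i < 32; i++)
        if (p[i] == c)
          synd |= 1ULL << (2 * i);   /* two bits per byte, low bit = hit */
      if (synd == 0)
        return NULL;                 /* no match in this chunk */
      return p + (__builtin_ctzll (synd) >> 1);
    }

The prologue's clearing of the soff*2 low bits is the same idea in reverse: bytes before the true start of the string simply have their positions masked out of the syndrome.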
*/ + cbnz synd, L(tail) + +L(loop): + ld1 {vdata1.16b, vdata2.16b}, [src], #32 + subs cntin, cntin, #32 + cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b + cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b + /* If we're out of data we finish regardless of the result */ + b.ls L(end) + /* Use a fast check for the termination condition */ + orr vend.16b, vhas_chr1.16b, vhas_chr2.16b + addp vend.2d, vend.2d, vend.2d + mov synd, vend.2d[0] + /* We're not out of data, loop if we haven't found the character */ + cbz synd, L(loop) + +L(end): + /* Termination condition found, let's calculate the syndrome value */ + and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b + and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b + addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */ + addp vend.16b, vend.16b, vend.16b /* 128->64 */ + mov synd, vend.2d[0] + /* Only do the clear for the last possible block */ + b.hi L(tail) + +L(masklast): + /* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */ + add tmp, cntrem, soff + and tmp, tmp, #31 + sub tmp, tmp, #32 + neg tmp, tmp, lsl #1 + lsl synd, synd, tmp + lsr synd, synd, tmp + +L(tail): + /* Count the trailing zeros using bit reversing */ + rbit synd, synd + /* Compensate the last post-increment */ + sub src, src, #32 + /* Check that we have found a character */ + cmp synd, #0 + /* And count the leading zeros */ + clz synd, synd + /* Compute the potential result */ + add result, src, synd, lsr #1 + /* Select result or NULL */ + csel result, xzr, result, eq + ret + +L(zero_length): + mov result, #0 + ret +END (__memchr) +weak_alias (__memchr, memchr) +libc_hidden_builtin_def (memchr) diff --git a/sysdeps/aarch64/memcmp.S b/sysdeps/aarch64/memcmp.S index ae2d997973..743bc078bb 100644 --- a/sysdeps/aarch64/memcmp.S +++ b/sysdeps/aarch64/memcmp.S @@ -1,6 +1,6 @@ /* memcmp - compare memory - Copyright (C) 2013-2016 Free Software Foundation, Inc. + Copyright (C) 2013-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -22,129 +22,132 @@ /* Assumptions: * - * ARMv8-a, AArch64 + * ARMv8-a, AArch64, unaligned accesses. */ /* Parameters and result. */ #define src1 x0 #define src2 x1 #define limit x2 -#define result x0 +#define result w0 /* Internal variables. */ #define data1 x3 #define data1w w3 -#define data2 x4 -#define data2w w4 -#define has_nul x5 -#define diff x6 -#define endloop x7 -#define tmp1 x8 -#define tmp2 x9 -#define tmp3 x10 -#define pos x11 -#define limit_wd x12 -#define mask x13 +#define data1h x4 +#define data2 x5 +#define data2w w5 +#define data2h x6 +#define tmp1 x7 +#define tmp2 x8 ENTRY_ALIGN (memcmp, 6) - cbz limit, L(ret0) - eor tmp1, src1, src2 - tst tmp1, #7 - b.ne L(misaligned8) - ands tmp1, src1, #7 - b.ne L(mutual_align) - add limit_wd, limit, #7 - lsr limit_wd, limit_wd, #3 - /* Start of performance-critical section -- one 64B cache line. */ -L(loop_aligned): - ldr data1, [src1], #8 - ldr data2, [src2], #8 -L(start_realigned): - subs limit_wd, limit_wd, #1 - eor diff, data1, data2 /* Non-zero if differences found. */ - csinv endloop, diff, xzr, ne /* Last Dword or differences. */ - cbz endloop, L(loop_aligned) - /* End of performance-critical section -- one 64B cache line. */ - - /* Not reached the limit, must have found a diff. */ - cbnz limit_wd, L(not_limit) - - /* Limit % 8 == 0 => all bytes significant. */ - ands limit, limit, #7 - b.eq L(not_limit) - - lsl limit, limit, #3 /* Bits -> bytes. 
*/ - mov mask, #~0 -#ifdef __AARCH64EB__ - lsr mask, mask, limit -#else - lsl mask, mask, limit -#endif - bic data1, data1, mask - bic data2, data2, mask - - orr diff, diff, mask -L(not_limit): - -#ifndef __AARCH64EB__ - rev diff, diff + DELOUSE (0) + DELOUSE (1) + DELOUSE (2) + + subs limit, limit, 8 + b.lo L(less8) + + ldr data1, [src1], 8 + ldr data2, [src2], 8 + cmp data1, data2 + b.ne L(return) + + subs limit, limit, 8 + b.gt L(more16) + + ldr data1, [src1, limit] + ldr data2, [src2, limit] + b L(return) + +L(more16): + ldr data1, [src1], 8 + ldr data2, [src2], 8 + cmp data1, data2 + bne L(return) + + /* Jump directly to comparing the last 16 bytes for 32 byte (or less) + strings. */ + subs limit, limit, 16 + b.ls L(last_bytes) + + /* We overlap loads between 0-32 bytes at either side of SRC1 when we + try to align, so limit it only to strings larger than 128 bytes. */ + cmp limit, 96 + b.ls L(loop16) + + /* Align src1 and adjust src2 with bytes not yet done. */ + and tmp1, src1, 15 + add limit, limit, tmp1 + sub src1, src1, tmp1 + sub src2, src2, tmp1 + + /* Loop performing 16 bytes per iteration using aligned src1. + Limit is pre-decremented by 16 and must be larger than zero. + Exit if <= 16 bytes left to do or if the data is not equal. */ + .p2align 4 +L(loop16): + ldp data1, data1h, [src1], 16 + ldp data2, data2h, [src2], 16 + subs limit, limit, 16 + ccmp data1, data2, 0, hi + ccmp data1h, data2h, 0, eq + b.eq L(loop16) + + cmp data1, data2 + bne L(return) + mov data1, data1h + mov data2, data2h + cmp data1, data2 + bne L(return) + + /* Compare last 1-16 bytes using unaligned access. */ +L(last_bytes): + add src1, src1, limit + add src2, src2, limit + ldp data1, data1h, [src1] + ldp data2, data2h, [src2] + cmp data1, data2 + bne L(return) + mov data1, data1h + mov data2, data2h + cmp data1, data2 + + /* Compare data bytes and set return value to 0, -1 or 1. */ +L(return): +#ifndef __AARCH64EB__ rev data1, data1 rev data2, data2 #endif - /* The MS-non-zero bit of DIFF marks either the first bit - that is different, or the end of the significant data. - Shifting left now will bring the critical information into the - top bits. */ - clz pos, diff - lsl data1, data1, pos - lsl data2, data2, pos - /* But we need to zero-extend (char is unsigned) the value and then - perform a signed 32-bit subtraction. */ - lsr data1, data1, #56 - sub result, data1, data2, lsr #56 - RET - -L(mutual_align): - /* Sources are mutually aligned, but are not currently at an - alignment boundary. Round down the addresses and then mask off - the bytes that precede the start point. */ - bic src1, src1, #7 - bic src2, src2, #7 - add limit, limit, tmp1 /* Adjust the limit for the extra. */ - lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ - ldr data1, [src1], #8 - neg tmp1, tmp1 /* Bits to alignment -64. */ - ldr data2, [src2], #8 - mov tmp2, #~0 -#ifdef __AARCH64EB__ - /* Big-endian. Early bytes are at MSB. */ - lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ -#else - /* Little-endian. Early bytes are at LSB. */ - lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ -#endif - add limit_wd, limit, #7 - orr data1, data1, tmp2 - orr data2, data2, tmp2 - lsr limit_wd, limit_wd, #3 - b L(start_realigned) - -L(ret0): - mov result, #0 - RET - - .p2align 6 -L(misaligned8): - sub limit, limit, #1 -1: - /* Perhaps we can do better than this. */ - ldrb data1w, [src1], #1 - ldrb data2w, [src2], #1 - subs limit, limit, #1 - ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. 
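The new return path replaces the old clz-and-shift dance with a simpler observation: after a rev (byte reverse) on little-endian, a plain unsigned comparison of two 8-byte words orders them exactly as a byte-wise memcmp would. Modelled in C, with the cset/cneg pair just below standing in for the final branchless select:

    #include <stdint.h>

    static int
    word_cmp_result (uint64_t data1, uint64_t data2)
    {
      data1 = __builtin_bswap64 (data1);   /* rev data1 */
      data2 = __builtin_bswap64 (data2);   /* rev data2 */
      if (data1 == data2)
        return 0;
      return data1 < data2 ? -1 : 1;       /* cset/cneg in the asm */
    }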
*/ - b.eq 1b - sub result, data1, data2 - RET + cmp data1, data2 +L(ret_eq): + cset result, ne + cneg result, result, lo + ret + + .p2align 4 + /* Compare up to 8 bytes. Limit is [-8..-1]. */ +L(less8): + adds limit, limit, 4 + b.lo L(less4) + ldr data1w, [src1], 4 + ldr data2w, [src2], 4 + cmp data1w, data2w + b.ne L(return) + sub limit, limit, 4 +L(less4): + adds limit, limit, 4 + beq L(ret_eq) +L(byte_loop): + ldrb data1w, [src1], 1 + ldrb data2w, [src2], 1 + subs limit, limit, 1 + ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */ + b.eq L(byte_loop) + sub result, data1w, data2w + ret + END (memcmp) #undef bcmp weak_alias (memcmp, bcmp) diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S index 442f390426..7e1163e6a0 100644 --- a/sysdeps/aarch64/memcpy.S +++ b/sysdeps/aarch64/memcpy.S @@ -1,4 +1,4 @@ -/* Copyright (C) 2012-2016 Free Software Foundation, Inc. +/* Copyright (C) 2012-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,161 +16,252 @@ License along with the GNU C Library. If not, see <http://www.gnu.org/licenses/>. */ +#include <sysdep.h> + /* Assumptions: * - * ARMv8-a, AArch64 - * Unaligned accesses + * ARMv8-a, AArch64, unaligned accesses. * */ #define dstin x0 #define src x1 #define count x2 -#define tmp1 x3 -#define tmp1w w3 -#define tmp2 x4 -#define tmp2w w4 -#define tmp3 x5 -#define tmp3w w5 -#define dst x6 - -#define A_l x7 -#define A_h x8 -#define B_l x9 -#define B_h x10 -#define C_l x11 -#define C_h x12 -#define D_l x13 -#define D_h x14 +#define dst x3 +#define srcend x4 +#define dstend x5 +#define A_l x6 +#define A_lw w6 +#define A_h x7 +#define A_hw w7 +#define B_l x8 +#define B_lw w8 +#define B_h x9 +#define C_l x10 +#define C_h x11 +#define D_l x12 +#define D_h x13 +#define E_l src +#define E_h count +#define F_l srcend +#define F_h dst +#define G_l count +#define G_h dst +#define tmp1 x14 -#include <sysdep.h> +/* Copies are split into 3 main cases: small copies of up to 16 bytes, + medium copies of 17..96 bytes which are fully unrolled. Large copies + of more than 96 bytes align the destination and use an unrolled loop + processing 64 bytes per iteration. + In order to share code with memmove, small and medium copies read all + data before writing, allowing any kind of overlap. So small, medium + and large backwards memmoves are handled by falling through into memcpy. + Overlapping large forward memmoves use a loop that copies backwards. +*/ -ENTRY_ALIGN (memcpy, 6) - - mov dst, dstin - cmp count, #64 - b.ge L(cpy_not_short) - cmp count, #15 - b.le L(tail15tiny) - - /* Deal with small copies quickly by dropping straight into the - * exit block. */ -L(tail63): - /* Copy up to 48 bytes of data. At this point we only need the - * bottom 6 bits of count to be accurate. */ - ands tmp1, count, #0x30 - b.eq L(tail15) - add dst, dst, tmp1 - add src, src, tmp1 - cmp tmp1w, #0x20 - b.eq 1f - b.lt 2f - ldp A_l, A_h, [src, #-48] - stp A_l, A_h, [dst, #-48] -1: - ldp A_l, A_h, [src, #-32] - stp A_l, A_h, [dst, #-32] -2: - ldp A_l, A_h, [src, #-16] - stp A_l, A_h, [dst, #-16] - -L(tail15): - ands count, count, #15 - beq 1f - add src, src, count - ldp A_l, A_h, [src, #-16] - add dst, dst, count - stp A_l, A_h, [dst, #-16] -1: - RET - -L(tail15tiny): - /* Copy up to 15 bytes of data. Does not assume additional data - being copied. 
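The strategy comment at the top of the new memcpy, that small and medium copies read all data before writing any, is what lets memmove fall through into memcpy for most sizes. The 8..16-byte case shows the idiom: one load from each end, then two stores, correct even when source and destination overlap. A C model, illustrative only:

    #include <stdint.h>
    #include <string.h>

    static void
    copy_8_to_16 (void *dst, const void *src, size_t n)  /* 8 <= n <= 16 */
    {
      uint64_t a, b;
      memcpy (&a, src, 8);                         /* first 8 bytes */
      memcpy (&b, (const char *) src + n - 8, 8);  /* last 8, may overlap a */
      memcpy (dst, &a, 8);
      memcpy ((char *) dst + n - 8, &b, 8);
    }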
*/ - tbz count, #3, 1f - ldr tmp1, [src], #8 - str tmp1, [dst], #8 +#ifndef MEMMOVE +# define MEMMOVE memmove +#endif +#ifndef MEMCPY +# define MEMCPY memcpy +#endif + +ENTRY_ALIGN (MEMMOVE, 6) + + DELOUSE (0) + DELOUSE (1) + DELOUSE (2) + + sub tmp1, dstin, src + cmp count, 96 + ccmp tmp1, count, 2, hi + b.lo L(move_long) + + /* Common case falls through into memcpy. */ +END (MEMMOVE) +libc_hidden_builtin_def (MEMMOVE) +ENTRY (MEMCPY) + + DELOUSE (0) + DELOUSE (1) + DELOUSE (2) + + prfm PLDL1KEEP, [src] + add srcend, src, count + add dstend, dstin, count + cmp count, 16 + b.ls L(copy16) + cmp count, 96 + b.hi L(copy_long) + + /* Medium copies: 17..96 bytes. */ + sub tmp1, count, 1 + ldp A_l, A_h, [src] + tbnz tmp1, 6, L(copy96) + ldp D_l, D_h, [srcend, -16] + tbz tmp1, 5, 1f + ldp B_l, B_h, [src, 16] + ldp C_l, C_h, [srcend, -32] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstend, -32] 1: - tbz count, #2, 1f - ldr tmp1w, [src], #4 - str tmp1w, [dst], #4 + stp A_l, A_h, [dstin] + stp D_l, D_h, [dstend, -16] + ret + + .p2align 4 + /* Small copies: 0..16 bytes. */ +L(copy16): + cmp count, 8 + b.lo 1f + ldr A_l, [src] + ldr A_h, [srcend, -8] + str A_l, [dstin] + str A_h, [dstend, -8] + ret + .p2align 4 1: - tbz count, #1, 1f - ldrh tmp1w, [src], #2 - strh tmp1w, [dst], #2 + tbz count, 2, 1f + ldr A_lw, [src] + ldr A_hw, [srcend, -4] + str A_lw, [dstin] + str A_hw, [dstend, -4] + ret + + /* Copy 0..3 bytes. Use a branchless sequence that copies the same + byte 3 times if count==1, or the 2nd byte twice if count==2. */ 1: - tbz count, #0, 1f - ldrb tmp1w, [src] - strb tmp1w, [dst] + cbz count, 2f + lsr tmp1, count, 1 + ldrb A_lw, [src] + ldrb A_hw, [srcend, -1] + ldrb B_lw, [src, tmp1] + strb A_lw, [dstin] + strb B_lw, [dstin, tmp1] + strb A_hw, [dstend, -1] +2: ret + + .p2align 4 + /* Copy 64..96 bytes. Copy 64 bytes from the start and + 32 bytes from the end. */ +L(copy96): + ldp B_l, B_h, [src, 16] + ldp C_l, C_h, [src, 32] + ldp D_l, D_h, [src, 48] + ldp E_l, E_h, [srcend, -32] + ldp F_l, F_h, [srcend, -16] + stp A_l, A_h, [dstin] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstin, 32] + stp D_l, D_h, [dstin, 48] + stp E_l, E_h, [dstend, -32] + stp F_l, F_h, [dstend, -16] + ret + + /* Align DST to 16 byte alignment so that we don't cross cache line + boundaries on both loads and stores. There are at least 96 bytes + to copy, so copy 16 bytes unaligned and then align. The loop + copies 64 bytes per iteration and prefetches one iteration ahead. */ + + .p2align 4 +L(copy_long): + and tmp1, dstin, 15 + bic dst, dstin, 15 + ldp D_l, D_h, [src] + sub src, src, tmp1 + add count, count, tmp1 /* Count is now 16 too large. */ + ldp A_l, A_h, [src, 16] + stp D_l, D_h, [dstin] + ldp B_l, B_h, [src, 32] + ldp C_l, C_h, [src, 48] + ldp D_l, D_h, [src, 64]! + subs count, count, 128 + 16 /* Test and readjust count. */ + b.ls L(last64) +L(loop64): + stp A_l, A_h, [dst, 16] + ldp A_l, A_h, [src, 16] + stp B_l, B_h, [dst, 32] + ldp B_l, B_h, [src, 32] + stp C_l, C_h, [dst, 48] + ldp C_l, C_h, [src, 48] + stp D_l, D_h, [dst, 64]! + ldp D_l, D_h, [src, 64]! + subs count, count, 64 + b.hi L(loop64) + + /* Write the last full set of 64 bytes. The remainder is at most 64 + bytes, so it is safe to always copy 64 bytes from the end even if + there is just 1 byte left. 
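The branchless 0..3-byte tail above is a neat trick: indexing with count/2 makes one code path serve all three lengths, storing the same byte three times when count==1 and the second byte twice when count==2. In C, illustrative only:

    #include <stddef.h>

    static void
    copy_0_to_3 (char *dst, const char *src, size_t n)  /* n <= 3 */
    {
      if (n == 0)
        return;
      size_t mid = n >> 1;
      char a = src[0], b = src[n - 1], m = src[mid];  /* all loads first */
      dst[0] = a;
      dst[mid] = m;
      dst[n - 1] = b;                                 /* stores last */
    }

Loading everything before storing keeps even this path safe for overlapping memmove calls.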
*/ +L(last64): + ldp E_l, E_h, [srcend, -64] + stp A_l, A_h, [dst, 16] + ldp A_l, A_h, [srcend, -48] + stp B_l, B_h, [dst, 32] + ldp B_l, B_h, [srcend, -32] + stp C_l, C_h, [dst, 48] + ldp C_l, C_h, [srcend, -16] + stp D_l, D_h, [dst, 64] + stp E_l, E_h, [dstend, -64] + stp A_l, A_h, [dstend, -48] + stp B_l, B_h, [dstend, -32] + stp C_l, C_h, [dstend, -16] + ret + + .p2align 4 +L(move_long): + cbz tmp1, 3f + + add srcend, src, count + add dstend, dstin, count + + /* Align dstend to 16 byte alignment so that we don't cross cache line + boundaries on both loads and stores. There are at least 96 bytes + to copy, so copy 16 bytes unaligned and then align. The loop + copies 64 bytes per iteration and prefetches one iteration ahead. */ + + and tmp1, dstend, 15 + ldp D_l, D_h, [srcend, -16] + sub srcend, srcend, tmp1 + sub count, count, tmp1 + ldp A_l, A_h, [srcend, -16] + stp D_l, D_h, [dstend, -16] + ldp B_l, B_h, [srcend, -32] + ldp C_l, C_h, [srcend, -48] + ldp D_l, D_h, [srcend, -64]! + sub dstend, dstend, tmp1 + subs count, count, 128 + b.ls 2f + + nop 1: - RET - -L(cpy_not_short): - /* We don't much care about the alignment of DST, but we want SRC - * to be 128-bit (16 byte) aligned so that we don't cross cache line - * boundaries on both loads and stores. */ - neg tmp2, src - ands tmp2, tmp2, #15 /* Bytes to reach alignment. */ - b.eq 2f - sub count, count, tmp2 - /* Copy more data than needed; it's faster than jumping - * around copying sub-Quadword quantities. We know that - * it can't overrun. */ - ldp A_l, A_h, [src] - add src, src, tmp2 - stp A_l, A_h, [dst] - add dst, dst, tmp2 - /* There may be less than 63 bytes to go now. */ - cmp count, #63 - b.le L(tail63) + stp A_l, A_h, [dstend, -16] + ldp A_l, A_h, [srcend, -16] + stp B_l, B_h, [dstend, -32] + ldp B_l, B_h, [srcend, -32] + stp C_l, C_h, [dstend, -48] + ldp C_l, C_h, [srcend, -48] + stp D_l, D_h, [dstend, -64]! + ldp D_l, D_h, [srcend, -64]! + subs count, count, 64 + b.hi 1b + + /* Write the last full set of 64 bytes. The remainder is at most 64 + bytes, so it is safe to always copy 64 bytes from the start even if + there is just 1 byte left. */ 2: - subs count, count, #128 - b.ge L(cpy_body_large) - /* Less than 128 bytes to copy, so handle 64 here and then jump - * to the tail. */ - ldp A_l, A_h, [src] - ldp B_l, B_h, [src, #16] - ldp C_l, C_h, [src, #32] - ldp D_l, D_h, [src, #48] - stp A_l, A_h, [dst] - stp B_l, B_h, [dst, #16] - stp C_l, C_h, [dst, #32] - stp D_l, D_h, [dst, #48] - tst count, #0x3f - add src, src, #64 - add dst, dst, #64 - b.ne L(tail63) - RET - - /* Critical loop. Start at a new cache line boundary. Assuming - * 64 bytes per line this ensures the entire loop is in one line. */ - .p2align 6 -L(cpy_body_large): - /* There are at least 128 bytes to copy. */ - ldp A_l, A_h, [src, #0] - sub dst, dst, #16 /* Pre-bias. */ - ldp B_l, B_h, [src, #16] - ldp C_l, C_h, [src, #32] - ldp D_l, D_h, [src, #48]! /* src += 64 - Pre-bias. */ -1: - stp A_l, A_h, [dst, #16] - ldp A_l, A_h, [src, #16] - stp B_l, B_h, [dst, #32] - ldp B_l, B_h, [src, #32] - stp C_l, C_h, [dst, #48] - ldp C_l, C_h, [src, #48] - stp D_l, D_h, [dst, #64]! - ldp D_l, D_h, [src, #64]! 
- subs count, count, #64 - b.ge 1b - stp A_l, A_h, [dst, #16] - stp B_l, B_h, [dst, #32] - stp C_l, C_h, [dst, #48] - stp D_l, D_h, [dst, #64] - add src, src, #16 - add dst, dst, #64 + 16 - tst count, #0x3f - b.ne L(tail63) - RET -END (memcpy) -libc_hidden_builtin_def (memcpy) + ldp G_l, G_h, [src, 48] + stp A_l, A_h, [dstend, -16] + ldp A_l, A_h, [src, 32] + stp B_l, B_h, [dstend, -32] + ldp B_l, B_h, [src, 16] + stp C_l, C_h, [dstend, -48] + ldp C_l, C_h, [src] + stp D_l, D_h, [dstend, -64] + stp G_l, G_h, [dstin, 48] + stp A_l, A_h, [dstin, 32] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstin] +3: ret + +END (MEMCPY) +libc_hidden_builtin_def (MEMCPY) diff --git a/sysdeps/aarch64/memmove.S b/sysdeps/aarch64/memmove.S index dd91db0ff6..0feeac8414 100644 --- a/sysdeps/aarch64/memmove.S +++ b/sysdeps/aarch64/memmove.S @@ -1,312 +1 @@ -/* Copyright (C) 2012-2016 Free Software Foundation, Inc. - - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library. If not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> - -/* Assumptions: - * - * ARMv8-a, AArch64 - * Unaligned accesses - */ - -/* Parameters and result. */ -#define dstin x0 -#define src x1 -#define count x2 -#define tmp1 x3 -#define tmp1w w3 -#define tmp2 x4 -#define tmp2w w4 -#define tmp3 x5 -#define tmp3w w5 -#define dst x6 - -#define A_l x7 -#define A_h x8 -#define B_l x9 -#define B_h x10 -#define C_l x11 -#define C_h x12 -#define D_l x13 -#define D_h x14 - -ENTRY_ALIGN (memmove, 6) - - cmp dstin, src - b.lo L(downwards) - add tmp1, src, count - cmp dstin, tmp1 - b.hs memcpy /* No overlap. */ - - /* Upwards move with potential overlap. - * Need to move from the tail backwards. SRC and DST point one - * byte beyond the remaining data to move. */ - add dst, dstin, count - add src, src, count - cmp count, #64 - b.ge L(mov_not_short_up) - - /* Deal with small moves quickly by dropping straight into the - * exit block. */ -L(tail63up): - /* Move up to 48 bytes of data. At this point we only need the - * bottom 6 bits of count to be accurate. */ - ands tmp1, count, #0x30 - b.eq L(tail15up) - sub dst, dst, tmp1 - sub src, src, tmp1 - cmp tmp1w, #0x20 - b.eq 1f - b.lt 2f - ldp A_l, A_h, [src, #32] - stp A_l, A_h, [dst, #32] -1: - ldp A_l, A_h, [src, #16] - stp A_l, A_h, [dst, #16] -2: - ldp A_l, A_h, [src] - stp A_l, A_h, [dst] -L(tail15up): - /* Move up to 15 bytes of data. Does not assume additional data - * being moved. */ - tbz count, #3, 1f - ldr tmp1, [src, #-8]! - str tmp1, [dst, #-8]! -1: - tbz count, #2, 1f - ldr tmp1w, [src, #-4]! - str tmp1w, [dst, #-4]! -1: - tbz count, #1, 1f - ldrh tmp1w, [src, #-2]! - strh tmp1w, [dst, #-2]! -1: - tbz count, #0, 1f - ldrb tmp1w, [src, #-1] - strb tmp1w, [dst, #-1] -1: - RET - -L(mov_not_short_up): - /* We don't much care about the alignment of DST, but we want SRC - * to be 128-bit (16 byte) aligned so that we don't cross cache line - * boundaries on both loads and stores. 
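All of this standalone memmove code is being deleted; the replacement is the three-instruction dispatch at the MEMMOVE entry earlier in memcpy.S. Because the new small and medium paths read everything before writing, only large moves whose destination lies inside the source range need the dedicated backward loop. The range test exploits unsigned wraparound, modelled in C as an illustration:

    #include <stddef.h>
    #include <stdint.h>

    static int
    needs_backward_copy (const void *dst, const void *src, size_t n)
    {
      /* Unsigned wraparound makes one compare do a range check:
         (dst - src) < n  <=>  src <= dst && dst < src + n, i.e. a
         forward copy would clobber bytes it has not yet read.  */
      return (uintptr_t) dst - (uintptr_t) src < n;
    }

In the assembly, ccmp chains this with the count > 96 test so that small and medium overlapping moves still fall straight through into memcpy.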
*/ - ands tmp2, src, #15 /* Bytes to reach alignment. */ - b.eq 2f - sub count, count, tmp2 - /* Move enough data to reach alignment; unlike memcpy, we have to - * be aware of the overlap, which means we can't move data twice. */ - tbz tmp2, #3, 1f - ldr tmp1, [src, #-8]! - str tmp1, [dst, #-8]! -1: - tbz tmp2, #2, 1f - ldr tmp1w, [src, #-4]! - str tmp1w, [dst, #-4]! -1: - tbz tmp2, #1, 1f - ldrh tmp1w, [src, #-2]! - strh tmp1w, [dst, #-2]! -1: - tbz tmp2, #0, 1f - ldrb tmp1w, [src, #-1]! - strb tmp1w, [dst, #-1]! -1: - - /* There may be less than 63 bytes to go now. */ - cmp count, #63 - b.le L(tail63up) -2: - subs count, count, #128 - b.ge L(mov_body_large_up) - /* Less than 128 bytes to move, so handle 64 here and then jump - * to the tail. */ - ldp A_l, A_h, [src, #-64]! - ldp B_l, B_h, [src, #16] - ldp C_l, C_h, [src, #32] - ldp D_l, D_h, [src, #48] - stp A_l, A_h, [dst, #-64]! - stp B_l, B_h, [dst, #16] - stp C_l, C_h, [dst, #32] - stp D_l, D_h, [dst, #48] - tst count, #0x3f - b.ne L(tail63up) - RET - - /* Critical loop. Start at a new Icache line boundary. Assuming - * 64 bytes per line this ensures the entire loop is in one line. */ - .p2align 6 -L(mov_body_large_up): - /* There are at least 128 bytes to move. */ - ldp A_l, A_h, [src, #-16] - ldp B_l, B_h, [src, #-32] - ldp C_l, C_h, [src, #-48] - ldp D_l, D_h, [src, #-64]! -1: - stp A_l, A_h, [dst, #-16] - ldp A_l, A_h, [src, #-16] - stp B_l, B_h, [dst, #-32] - ldp B_l, B_h, [src, #-32] - stp C_l, C_h, [dst, #-48] - ldp C_l, C_h, [src, #-48] - stp D_l, D_h, [dst, #-64]! - ldp D_l, D_h, [src, #-64]! - subs count, count, #64 - b.ge 1b - stp A_l, A_h, [dst, #-16] - stp B_l, B_h, [dst, #-32] - stp C_l, C_h, [dst, #-48] - stp D_l, D_h, [dst, #-64]! - tst count, #0x3f - b.ne L(tail63up) - RET - -L(downwards): - /* For a downwards move we can safely use memcpy provided that - * DST is more than 16 bytes away from SRC. */ - sub tmp1, src, #16 - cmp dstin, tmp1 - b.ls memcpy /* May overlap, but not critically. */ - - mov dst, dstin /* Preserve DSTIN for return value. */ - cmp count, #64 - b.ge L(mov_not_short_down) - - /* Deal with small moves quickly by dropping straight into the - * exit block. */ -L(tail63down): - /* Move up to 48 bytes of data. At this point we only need the - * bottom 6 bits of count to be accurate. */ - ands tmp1, count, #0x30 - b.eq L(tail15down) - add dst, dst, tmp1 - add src, src, tmp1 - cmp tmp1w, #0x20 - b.eq 1f - b.lt 2f - ldp A_l, A_h, [src, #-48] - stp A_l, A_h, [dst, #-48] -1: - ldp A_l, A_h, [src, #-32] - stp A_l, A_h, [dst, #-32] -2: - ldp A_l, A_h, [src, #-16] - stp A_l, A_h, [dst, #-16] -L(tail15down): - /* Move up to 15 bytes of data. Does not assume additional data - being moved. */ - tbz count, #3, 1f - ldr tmp1, [src], #8 - str tmp1, [dst], #8 -1: - tbz count, #2, 1f - ldr tmp1w, [src], #4 - str tmp1w, [dst], #4 -1: - tbz count, #1, 1f - ldrh tmp1w, [src], #2 - strh tmp1w, [dst], #2 -1: - tbz count, #0, 1f - ldrb tmp1w, [src] - strb tmp1w, [dst] -1: - RET - -L(mov_not_short_down): - /* We don't much care about the alignment of DST, but we want SRC - * to be 128-bit (16 byte) aligned so that we don't cross cache line - * boundaries on both loads and stores. */ - neg tmp2, src - ands tmp2, tmp2, #15 /* Bytes to reach alignment. */ - b.eq 2f - sub count, count, tmp2 - /* Move enough data to reach alignment; unlike memcpy, we have to - * be aware of the overlap, which means we can't move data twice. 
*/ - tbz tmp2, #3, 1f - ldr tmp1, [src], #8 - str tmp1, [dst], #8 -1: - tbz tmp2, #2, 1f - ldr tmp1w, [src], #4 - str tmp1w, [dst], #4 -1: - tbz tmp2, #1, 1f - ldrh tmp1w, [src], #2 - strh tmp1w, [dst], #2 -1: - tbz tmp2, #0, 1f - ldrb tmp1w, [src], #1 - strb tmp1w, [dst], #1 -1: - - /* There may be less than 63 bytes to go now. */ - cmp count, #63 - b.le L(tail63down) -2: - subs count, count, #128 - b.ge L(mov_body_large_down) - /* Less than 128 bytes to move, so handle 64 here and then jump - * to the tail. */ - ldp A_l, A_h, [src] - ldp B_l, B_h, [src, #16] - ldp C_l, C_h, [src, #32] - ldp D_l, D_h, [src, #48] - stp A_l, A_h, [dst] - stp B_l, B_h, [dst, #16] - stp C_l, C_h, [dst, #32] - stp D_l, D_h, [dst, #48] - tst count, #0x3f - add src, src, #64 - add dst, dst, #64 - b.ne L(tail63down) - RET - - /* Critical loop. Start at a new cache line boundary. Assuming - * 64 bytes per line this ensures the entire loop is in one line. */ - .p2align 6 -L(mov_body_large_down): - /* There are at least 128 bytes to move. */ - ldp A_l, A_h, [src, #0] - sub dst, dst, #16 /* Pre-bias. */ - ldp B_l, B_h, [src, #16] - ldp C_l, C_h, [src, #32] - ldp D_l, D_h, [src, #48]! /* src += 64 - Pre-bias. */ -1: - stp A_l, A_h, [dst, #16] - ldp A_l, A_h, [src, #16] - stp B_l, B_h, [dst, #32] - ldp B_l, B_h, [src, #32] - stp C_l, C_h, [dst, #48] - ldp C_l, C_h, [src, #48] - stp D_l, D_h, [dst, #64]! - ldp D_l, D_h, [src, #64]! - subs count, count, #64 - b.ge 1b - stp A_l, A_h, [dst, #16] - stp B_l, B_h, [dst, #32] - stp C_l, C_h, [dst, #48] - stp D_l, D_h, [dst, #64] - add src, src, #16 - add dst, dst, #64 + 16 - tst count, #0x3f - b.ne L(tail63down) - RET -END (memmove) - -libc_hidden_builtin_def (memmove) +/* memmove is part of memcpy.S. */ diff --git a/sysdeps/aarch64/memset-reg.h b/sysdeps/aarch64/memset-reg.h new file mode 100644 index 0000000000..ac7c083867 --- /dev/null +++ b/sysdeps/aarch64/memset-reg.h @@ -0,0 +1,30 @@ +/* Register aliases for memset to be used across implementations. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define dstin x0 +#define val x1 +#define valw w1 +#define count x2 +#define dst x3 +#define dstend x4 +#define tmp1 x5 +#define tmp1w w5 +#define tmp2 x6 +#define tmp2w w6 +#define zva_len x7 +#define zva_lenw w7 diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S index e49f4d6faf..4a45459361 100644 --- a/sysdeps/aarch64/memset.S +++ b/sysdeps/aarch64/memset.S @@ -1,4 +1,4 @@ -/* Copyright (C) 2012-2016 Free Software Foundation, Inc. +/* Copyright (C) 2012-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,214 +16,175 @@ License along with the GNU C Library. If not, see <http://www.gnu.org/licenses/>. 
*/ +#include <sysdep.h> +#include "memset-reg.h" + +#ifndef MEMSET +# define MEMSET memset +#endif + /* Assumptions: * - * ARMv8-a, AArch64 - * Unaligned accesses + * ARMv8-a, AArch64, unaligned accesses * */ -#include <sysdep.h> - -/* By default we assume that the DC instruction can be used to zero - data blocks more efficiently. In some circumstances this might be - unsafe, for example in an asymmetric multiprocessor environment with - different DC clear lengths (neither the upper nor lower lengths are - safe to use). The feature can be disabled by defining DONT_USE_DC. - - If code may be run in a virtualized environment, then define - MAYBE_VIRT. This will cause the code to cache the system register - values rather than re-reading them each call. */ - -#define dstin x0 -#define val w1 -#define count x2 -#define tmp1 x3 -#define tmp1w w3 -#define tmp2 x4 -#define tmp2w w4 -#define zva_len_x x5 -#define zva_len w5 -#define zva_bits_x x6 - -#define A_l x7 -#define A_lw w7 -#define dst x8 -#define tmp3w w9 - -ENTRY_ALIGN (__memset, 6) - - mov dst, dstin /* Preserve return value. */ - ands A_lw, val, #255 -#ifndef DONT_USE_DC - b.eq L(zero_mem) -#endif - orr A_lw, A_lw, A_lw, lsl #8 - orr A_lw, A_lw, A_lw, lsl #16 - orr A_l, A_l, A_l, lsl #32 -L(tail_maybe_long): - cmp count, #64 - b.ge L(not_short) -L(tail_maybe_tiny): - cmp count, #15 - b.le L(tail15tiny) -L(tail63): - ands tmp1, count, #0x30 - b.eq L(tail15) - add dst, dst, tmp1 - cmp tmp1w, #0x20 - b.eq 1f - b.lt 2f - stp A_l, A_l, [dst, #-48] -1: - stp A_l, A_l, [dst, #-32] -2: - stp A_l, A_l, [dst, #-16] - -L(tail15): - and count, count, #15 - add dst, dst, count - stp A_l, A_l, [dst, #-16] /* Repeat some/all of last store. */ - RET - -L(tail15tiny): - /* Set up to 15 bytes. Does not assume earlier memory - being set. */ - tbz count, #3, 1f - str A_l, [dst], #8 -1: - tbz count, #2, 1f - str A_lw, [dst], #4 -1: - tbz count, #1, 1f - strh A_lw, [dst], #2 -1: - tbz count, #0, 1f - strb A_lw, [dst] -1: - RET - - /* Critical loop. Start at a new cache line boundary. Assuming - * 64 bytes per line, this ensures the entire loop is in one line. */ - .p2align 6 -L(not_short): - neg tmp2, dst - ands tmp2, tmp2, #15 - b.eq 2f - /* Bring DST to 128-bit (16-byte) alignment. We know that there's - * more than that to set, so we simply store 16 bytes and advance by - * the amount required to reach alignment. */ - sub count, count, tmp2 - stp A_l, A_l, [dst] - add dst, dst, tmp2 - /* There may be less than 63 bytes to go now. */ - cmp count, #63 - b.le L(tail63) -2: - sub dst, dst, #16 /* Pre-bias. */ - sub count, count, #64 -1: - stp A_l, A_l, [dst, #16] - stp A_l, A_l, [dst, #32] - stp A_l, A_l, [dst, #48] - stp A_l, A_l, [dst, #64]! - subs count, count, #64 - b.ge 1b - tst count, #0x3f - add dst, dst, #16 - b.ne L(tail63) - RET - -#ifndef DONT_USE_DC - /* For zeroing memory, check to see if we can use the ZVA feature to - * zero entire 'cache' lines. */ -L(zero_mem): - mov A_l, #0 - cmp count, #63 - b.le L(tail_maybe_tiny) - neg tmp2, dst - ands tmp2, tmp2, #15 - b.eq 1f - sub count, count, tmp2 - stp A_l, A_l, [dst] - add dst, dst, tmp2 - cmp count, #63 - b.le L(tail63) -1: - /* For zeroing small amounts of memory, it's not worth setting up - * the line-clear code. */ - cmp count, #128 - b.lt L(not_short) -#ifdef MAYBE_VIRT - /* For efficiency when virtualized, we cache the ZVA capability. 
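The cached value is computed from the DCZID_EL0 system register: bit 4 (DZP) set means DC ZVA is prohibited, and bits 3:0 give the block size as log2 of the number of words, i.e. 4 << BS bytes. A hedged C equivalent of the read-and-cache sequence below (names invented; AArch64 with GCC-style inline assembly):

  #include <stdint.h>

  static int32_t cache_clear;        /* 0 unknown, -1 no ZVA, else bytes.  */

  static int32_t
  zva_block_bytes (void)
  {
    uint64_t dczid;
    if (cache_clear != 0)
      return cache_clear;            /* Reuse the cached answer.  */
    __asm__ ("mrs %0, dczid_el0" : "=r" (dczid));
    if (dczid & 16)                  /* DZP set: DC ZVA not usable.  */
      return cache_clear = -1;
    return cache_clear = 4 << (dczid & 15);  /* Word size times 2^BS.  */
  }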
*/ - adrp tmp2, L(cache_clear) - ldr zva_len, [tmp2, #:lo12:L(cache_clear)] - tbnz zva_len, #31, L(not_short) - cbnz zva_len, L(zero_by_line) - mrs tmp1, dczid_el0 - tbz tmp1, #4, 1f - /* ZVA not available. Remember this for next time. */ - mov zva_len, #~0 - str zva_len, [tmp2, #:lo12:L(cache_clear)] - b L(not_short) -1: - mov tmp3w, #4 - and zva_len, tmp1w, #15 /* Safety: other bits reserved. */ - lsl zva_len, tmp3w, zva_len - str zva_len, [tmp2, #:lo12:L(cache_clear)] +ENTRY_ALIGN (MEMSET, 6) + + DELOUSE (0) + DELOUSE (2) + + dup v0.16B, valw + add dstend, dstin, count + + cmp count, 96 + b.hi L(set_long) + cmp count, 16 + b.hs L(set_medium) + mov val, v0.D[0] + + /* Set 0..15 bytes. */ + tbz count, 3, 1f + str val, [dstin] + str val, [dstend, -8] + ret + nop +1: tbz count, 2, 2f + str valw, [dstin] + str valw, [dstend, -4] + ret +2: cbz count, 3f + strb valw, [dstin] + tbz count, 1, 3f + strh valw, [dstend, -2] +3: ret + + /* Set 17..96 bytes. */ +L(set_medium): + str q0, [dstin] + tbnz count, 6, L(set96) + str q0, [dstend, -16] + tbz count, 5, 1f + str q0, [dstin, 16] + str q0, [dstend, -32] +1: ret + + .p2align 4 + /* Set 64..96 bytes. Write 64 bytes from the start and + 32 bytes from the end. */ +L(set96): + str q0, [dstin, 16] + stp q0, q0, [dstin, 32] + stp q0, q0, [dstend, -32] + ret + + .p2align 3 + nop +L(set_long): + and valw, valw, 255 + bic dst, dstin, 15 + str q0, [dstin] + cmp count, 256 + ccmp valw, 0, 0, cs + b.eq L(try_zva) +L(no_zva): + sub count, dstend, dst /* Count is 16 too large. */ + add dst, dst, 16 + sub count, count, 64 + 16 /* Adjust count and bias for loop. */ +1: stp q0, q0, [dst], 64 + stp q0, q0, [dst, -32] +L(tail64): + subs count, count, 64 + b.hi 1b +2: stp q0, q0, [dstend, -64] + stp q0, q0, [dstend, -32] + ret + +L(try_zva): +#ifdef ZVA_MACRO + zva_macro #else + .p2align 3 mrs tmp1, dczid_el0 - tbnz tmp1, #4, L(not_short) - mov tmp3w, #4 - and zva_len, tmp1w, #15 /* Safety: other bits reserved. */ - lsl zva_len, tmp3w, zva_len -#endif - -L(zero_by_line): - /* Compute how far we need to go to become suitably aligned. We're - * already at quad-word alignment. */ - cmp count, zva_len_x - b.lt L(not_short) /* Not enough to reach alignment. */ - sub zva_bits_x, zva_len_x, #1 - neg tmp2, dst - ands tmp2, tmp2, zva_bits_x - b.eq 1f /* Already aligned. */ - /* Not aligned, check that there's enough to copy after alignment. */ - sub tmp1, count, tmp2 - cmp tmp1, #64 - ccmp tmp1, zva_len_x, #8, ge /* NZCV=0b1000 */ - b.lt L(not_short) - /* We know that there's at least 64 bytes to zero and that it's safe - * to overrun by 64 bytes. */ - mov count, tmp1 -2: - stp A_l, A_l, [dst] - stp A_l, A_l, [dst, #16] - stp A_l, A_l, [dst, #32] - subs tmp2, tmp2, #64 - stp A_l, A_l, [dst, #48] - add dst, dst, #64 - b.ge 2b - /* We've overrun a bit, so adjust dst downwards. */ - add dst, dst, tmp2 -1: - sub count, count, zva_len_x -3: - dc zva, dst - add dst, dst, zva_len_x - subs count, count, zva_len_x - b.ge 3b - ands count, count, zva_bits_x - b.ne L(tail_maybe_long) - RET -#ifdef MAYBE_VIRT - .bss - .p2align 2 -L(cache_clear): - .space 4 + tbnz tmp1w, 4, L(no_zva) + and tmp1w, tmp1w, 15 + cmp tmp1w, 4 /* ZVA size is 64 bytes. */ + b.ne L(zva_128) + + /* Write the first and last 64 byte aligned block using stp rather + than using DC ZVA. This is faster on some cores. + */ +L(zva_64): + str q0, [dst, 16] + stp q0, q0, [dst, 32] + bic dst, dst, 63 + stp q0, q0, [dst, 64] + stp q0, q0, [dst, 96] + sub count, dstend, dst /* Count is now 128 too large. 
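(128 because the first 128 bytes past the aligned DST were already written with stp.) The DC ZVA loop that follows reduces, in illustrative C, to zeroing one 64-byte line per iteration, with the 0..63-byte tail finished by the overlapping stp pairs:

  /* Zero [dst, end) in whole 64-byte ZVA lines; DST must be 64-byte
     aligned and the caller handles the tail.  AArch64 + GCC only.  */
  static void
  zero_lines_64 (char *dst, char *end)
  {
    for (; end - dst >= 64; dst += 64)
      __asm__ __volatile__ ("dc zva, %0" :: "r" (dst) : "memory");
  }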
*/ + sub count, count, 128+64+64 /* Adjust count and bias for loop. */ + add dst, dst, 128 + nop +1: dc zva, dst + add dst, dst, 64 + subs count, count, 64 + b.hi 1b + stp q0, q0, [dst, 0] + stp q0, q0, [dst, 32] + stp q0, q0, [dstend, -64] + stp q0, q0, [dstend, -32] + ret + + .p2align 3 +L(zva_128): + cmp tmp1w, 5 /* ZVA size is 128 bytes. */ + b.ne L(zva_other) + + str q0, [dst, 16] + stp q0, q0, [dst, 32] + stp q0, q0, [dst, 64] + stp q0, q0, [dst, 96] + bic dst, dst, 127 + sub count, dstend, dst /* Count is now 128 too large. */ + sub count, count, 128+128 /* Adjust count and bias for loop. */ + add dst, dst, 128 +1: dc zva, dst + add dst, dst, 128 + subs count, count, 128 + b.hi 1b + stp q0, q0, [dstend, -128] + stp q0, q0, [dstend, -96] + stp q0, q0, [dstend, -64] + stp q0, q0, [dstend, -32] + ret + +L(zva_other): + mov tmp2w, 4 + lsl zva_lenw, tmp2w, tmp1w + add tmp1, zva_len, 64 /* Max alignment bytes written. */ + cmp count, tmp1 + blo L(no_zva) + + sub tmp2, zva_len, 1 + add tmp1, dst, zva_len + add dst, dst, 16 + subs count, tmp1, dst /* Actual alignment bytes to write. */ + bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */ + beq 2f +1: stp q0, q0, [dst], 64 + stp q0, q0, [dst, -32] + subs count, count, 64 + b.hi 1b +2: mov dst, tmp1 + sub count, dstend, tmp1 /* Remaining bytes to write. */ + subs count, count, zva_len + b.lo 4f +3: dc zva, dst + add dst, dst, zva_len + subs count, count, zva_len + b.hs 3b +4: add count, count, zva_len + b L(tail64) #endif -#endif /* DONT_USE_DC */ -END (__memset) -weak_alias (__memset, memset) -libc_hidden_builtin_def (memset) +END (MEMSET) +libc_hidden_builtin_def (MEMSET) diff --git a/sysdeps/aarch64/memusage.h b/sysdeps/aarch64/memusage.h index 70bf2ce859..874f3b5b3d 100644 --- a/sysdeps/aarch64/memusage.h +++ b/sysdeps/aarch64/memusage.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2000-2016 Free Software Foundation, Inc. +/* Copyright (C) 2000-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile new file mode 100644 index 0000000000..57ffdf7238 --- /dev/null +++ b/sysdeps/aarch64/multiarch/Makefile @@ -0,0 +1,4 @@ +ifeq ($(subdir),string) +sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \ + memcpy_falkor memmove_falkor memset_generic memset_falkor +endif diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c new file mode 100644 index 0000000000..e55be80103 --- /dev/null +++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c @@ -0,0 +1,57 @@ +/* Enumerate available IFUNC implementations of a function. AARCH64 version. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
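A note on the mechanism this file and the selectors below rely on: each IFUNC symbol has a resolver that ld.so calls once at relocation time, and the resolver picks an implementation from CPU identification state (here, the MIDR and ZVA size cached in GLRO(dl_aarch64_cpu_features)). A schematic example using GCC's ifunc attribute instead of glibc's libc_ifunc macro; every name in it is invented:

  #include <stddef.h>
  #include <string.h>

  /* Two stand-in implementations.  */
  static void *
  memcpy_small_cores (void *d, const void *s, size_t n)
  { return memcpy (d, s, n); }

  static void *
  memcpy_big_cores (void *d, const void *s, size_t n)
  { return memcpy (d, s, n); }

  static int
  cpu_is_big (void)
  { return 0; }                      /* Stand-in for a MIDR test.  */

  /* The resolver runs once, before my_memcpy is first called.  */
  static void *(*resolve_memcpy (void)) (void *, const void *, size_t)
  {
    return cpu_is_big () ? memcpy_big_cores : memcpy_small_cores;
  }

  void *my_memcpy (void *, const void *, size_t)
       __attribute__ ((ifunc ("resolve_memcpy")));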
*/ + +#include <assert.h> +#include <string.h> +#include <wchar.h> +#include <ldsodefs.h> +#include <ifunc-impl-list.h> +#include <init-arch.h> +#include <stdio.h> + +/* Maximum number of IFUNC implementations. */ +#define MAX_IFUNC 4 + +size_t +__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + size_t max) +{ + assert (max >= MAX_IFUNC); + + size_t i = 0; + + INIT_ARCH (); + + /* Support sysdeps/aarch64/multiarch/memcpy.c and memmove.c. */ + IFUNC_IMPL (i, name, memcpy, + IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx) + IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx2) + IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_falkor) + IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic)) + IFUNC_IMPL (i, name, memmove, + IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx) + IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_falkor) + IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic)) + IFUNC_IMPL (i, name, memset, + /* Enable this on non-falkor processors too so that other cores + can do a comparative analysis with __memset_generic. */ + IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_falkor) + IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic)) + + return i; +} diff --git a/sysdeps/aarch64/multiarch/init-arch.h b/sysdeps/aarch64/multiarch/init-arch.h new file mode 100644 index 0000000000..d1e5703cb2 --- /dev/null +++ b/sysdeps/aarch64/multiarch/init-arch.h @@ -0,0 +1,25 @@ +/* Define INIT_ARCH so that midr is initialized before use by IFUNCs. + This file is part of the GNU C Library. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <ldsodefs.h> + +#define INIT_ARCH() \ + uint64_t __attribute__((unused)) midr = \ + GLRO(dl_aarch64_cpu_features).midr_el1; \ + unsigned __attribute__((unused)) zva_size = \ + GLRO(dl_aarch64_cpu_features).zva_size; diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c new file mode 100644 index 0000000000..4a04a63b0f --- /dev/null +++ b/sysdeps/aarch64/multiarch/memcpy.c @@ -0,0 +1,47 @@ +/* Multiple versions of memcpy. AARCH64 version. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ + +#if IS_IN (libc) +/* Redefine memcpy so that the compiler won't complain about the type + mismatch with the IFUNC selector in strong_alias, below. */ +# undef memcpy +# define memcpy __redirect_memcpy +# include <string.h> +# include <init-arch.h> + +extern __typeof (__redirect_memcpy) __libc_memcpy; + +extern __typeof (__redirect_memcpy) __memcpy_generic attribute_hidden; +extern __typeof (__redirect_memcpy) __memcpy_thunderx attribute_hidden; +extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden; +extern __typeof (__redirect_memcpy) __memcpy_falkor attribute_hidden; + +libc_ifunc (__libc_memcpy, + (IS_THUNDERX (midr) + ? __memcpy_thunderx + : (IS_FALKOR (midr) || IS_PHECDA (midr) + ? __memcpy_falkor + : (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr) + ? __memcpy_thunderx2 + : __memcpy_generic)))); + +# undef memcpy +strong_alias (__libc_memcpy, memcpy); +#endif diff --git a/sysdeps/aarch64/multiarch/memcpy_falkor.S b/sysdeps/aarch64/multiarch/memcpy_falkor.S new file mode 100644 index 0000000000..cdc2de4750 --- /dev/null +++ b/sysdeps/aarch64/multiarch/memcpy_falkor.S @@ -0,0 +1,191 @@ +/* Optimized memcpy for Qualcomm Falkor processor. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +/* Assumptions: + + ARMv8-a, AArch64, falkor, unaligned accesses. */ + +#define dstin x0 +#define src x1 +#define count x2 +#define dst x3 +#define srcend x4 +#define dstend x5 +#define tmp1 x14 +#define A_x x6 +#define B_x x7 +#define A_w w6 +#define B_w w7 + +#define A_q q0 +#define B_q q1 +#define C_q q2 +#define D_q q3 +#define E_q q4 +#define F_q q5 +#define G_q q6 + +/* Copies are split into 3 main cases: + + 1. Small copies of up to 32 bytes + 2. Medium copies of 33..128 bytes which are fully unrolled + 3. Large copies of more than 128 bytes. + + Large copies align the sourceto a quad word and use an unrolled loop + processing 64 bytes per iteration. + + FALKOR-SPECIFIC DESIGN: + + The smallest copies (32 bytes or less) focus on optimal pipeline usage, + which is why the redundant copies of 0-3 bytes have been replaced with + conditionals, since the former would unnecessarily break across multiple + issue groups. The medium copy group has been enlarged to 128 bytes since + bumping up the small copies up to 32 bytes allows us to do that without + cost and also allows us to reduce the size of the prep code before loop64. + + The copy loop uses only one register q0. This is to ensure that all loads + hit a single hardware prefetcher which can get correctly trained to prefetch + a single stream. 
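(An aside on how this implementation gets selected: the libc_ifunc chain in memcpy.c above tests architecturally defined MIDR_EL1 fields, with the implementer in bits 31:24 and the part number in bits 15:4. A minimal sketch of such a predicate follows; the concrete values 0x51 and 0xc00 are stated from memory as the Qualcomm implementer and Falkor part codes, so treat them as assumptions:)

  #include <stdint.h>

  #define MIDR_IMPLEMENTOR(midr) (((midr) >> 24) & 0xff)
  #define MIDR_PARTNUM(midr)     (((midr) >> 4) & 0xfff)

  /* Illustrative IS_FALKOR-style check.  */
  static inline int
  is_falkor (uint64_t midr)
  {
    return MIDR_IMPLEMENTOR (midr) == 0x51 && MIDR_PARTNUM (midr) == 0xc00;
  }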
+ + The non-temporal stores help optimize cache utilization. */ + +#if IS_IN (libc) +ENTRY_ALIGN (__memcpy_falkor, 6) + + cmp count, 32 + add srcend, src, count + add dstend, dstin, count + b.ls L(copy32) + ldr A_q, [src] + cmp count, 128 + str A_q, [dstin] + b.hi L(copy_long) + + /* Medium copies: 33..128 bytes. */ + sub tmp1, count, 1 + ldr A_q, [src, 16] + ldr B_q, [srcend, -32] + ldr C_q, [srcend, -16] + tbz tmp1, 6, 1f + ldr D_q, [src, 32] + ldr E_q, [src, 48] + str D_q, [dstin, 32] + str E_q, [dstin, 48] + ldr F_q, [srcend, -64] + ldr G_q, [srcend, -48] + str F_q, [dstend, -64] + str G_q, [dstend, -48] +1: + str A_q, [dstin, 16] + str B_q, [dstend, -32] + str C_q, [dstend, -16] + ret + + .p2align 4 + /* Small copies: 0..32 bytes. */ +L(copy32): + /* 16-32 */ + cmp count, 16 + b.lo 1f + ldr A_q, [src] + ldr B_q, [srcend, -16] + str A_q, [dstin] + str B_q, [dstend, -16] + ret + .p2align 4 +1: + /* 8-15 */ + tbz count, 3, 1f + ldr A_x, [src] + ldr B_x, [srcend, -8] + str A_x, [dstin] + str B_x, [dstend, -8] + ret + .p2align 4 +1: + /* 4-7 */ + tbz count, 2, 1f + ldr A_w, [src] + ldr B_w, [srcend, -4] + str A_w, [dstin] + str B_w, [dstend, -4] + ret + .p2align 4 +1: + /* 2-3 */ + tbz count, 1, 1f + ldrh A_w, [src] + ldrh B_w, [srcend, -2] + strh A_w, [dstin] + strh B_w, [dstend, -2] + ret + .p2align 4 +1: + /* 0-1 */ + tbz count, 0, 1f + ldrb A_w, [src] + strb A_w, [dstin] +1: + ret + + /* Align SRC to 16 bytes and copy; that way at least one of the + accesses is aligned throughout the copy sequence. + + The count is off by 0 to 15 bytes, but this is OK because we trim + off the last 64 bytes to copy off from the end. Due to this the + loop never runs out of bounds. */ + .p2align 6 +L(copy_long): + sub count, count, 64 + 16 + and tmp1, src, 15 + bic src, src, 15 + sub dst, dstin, tmp1 + add count, count, tmp1 + +L(loop64): + ldr A_q, [src, 16]! + str A_q, [dst, 16] + ldr A_q, [src, 16]! + subs count, count, 64 + str A_q, [dst, 32] + ldr A_q, [src, 16]! + str A_q, [dst, 48] + ldr A_q, [src, 16]! + str A_q, [dst, 64]! + b.hi L(loop64) + + /* Write the last full set of 64 bytes. The remainder is at most 64 + bytes, so it is safe to always copy 64 bytes from the end even if + there is just 1 byte left. */ +L(last64): + ldr E_q, [srcend, -64] + str E_q, [dstend, -64] + ldr D_q, [srcend, -48] + str D_q, [dstend, -48] + ldr C_q, [srcend, -32] + str C_q, [dstend, -32] + ldr B_q, [srcend, -16] + str B_q, [dstend, -16] + ret + +END (__memcpy_falkor) +libc_hidden_builtin_def (__memcpy_falkor) +#endif diff --git a/sysdeps/aarch64/multiarch/memcpy_generic.S b/sysdeps/aarch64/multiarch/memcpy_generic.S new file mode 100644 index 0000000000..4bc81f4ce5 --- /dev/null +++ b/sysdeps/aarch64/multiarch/memcpy_generic.S @@ -0,0 +1,44 @@ +/* A Generic Optimized memcpy implementation for AARCH64. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* The actual memcpy and memmove code is in ../memcpy.S. If we are + building libc this file defines __memcpy_generic and __memmove_generic. + Otherwise the include of ../memcpy.S will define the normal __memcpy + and__memmove entry points. */ + +#include <sysdep.h> + +#if IS_IN (libc) + +# define MEMCPY __memcpy_generic +# define MEMMOVE __memmove_generic + +/* Do not hide the generic versions of memcpy and memmove, we use them + internally. */ +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(name) + +# ifdef SHARED +/* It doesn't make sense to send libc-internal memcpy calls through a PLT. */ + .globl __GI_memcpy; __GI_memcpy = __memcpy_generic + .globl __GI_memmove; __GI_memmove = __memmove_generic +# endif + +#endif + +#include "../memcpy.S" diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx.S b/sysdeps/aarch64/multiarch/memcpy_thunderx.S new file mode 100644 index 0000000000..de494d933d --- /dev/null +++ b/sysdeps/aarch64/multiarch/memcpy_thunderx.S @@ -0,0 +1,336 @@ +/* A Thunderx Optimized memcpy implementation for AARCH64. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* The actual code in this memcpy and memmove should be identical to the + generic version except for the code under '#ifdef THUNDERX'. This is + to make is easier to keep this version and the generic version in sync + for changes that are not specific to thunderx. */ + +#include <sysdep.h> + +/* Assumptions: + * + * ARMv8-a, AArch64, unaligned accesses. + * + */ + +#define dstin x0 +#define src x1 +#define count x2 +#define dst x3 +#define srcend x4 +#define dstend x5 +#define A_l x6 +#define A_lw w6 +#define A_h x7 +#define A_hw w7 +#define B_l x8 +#define B_lw w8 +#define B_h x9 +#define C_l x10 +#define C_h x11 +#define D_l x12 +#define D_h x13 +#define E_l src +#define E_h count +#define F_l srcend +#define F_h dst +#define G_l count +#define G_h dst +#define tmp1 x14 + +/* Copies are split into 3 main cases: small copies of up to 16 bytes, + medium copies of 17..96 bytes which are fully unrolled. Large copies + of more than 96 bytes align the destination and use an unrolled loop + processing 64 bytes per iteration. + In order to share code with memmove, small and medium copies read all + data before writing, allowing any kind of overlap. So small, medium + and large backwards memmoves are handled by falling through into memcpy. + Overlapping large forward memmoves use a loop that copies backwards. 
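In C, the dispatch just described is a single unsigned comparison, relying on wrap-around so that one test covers both the size threshold and the overlap direction (a sketch with invented names, mirroring the sub/ccmp/b.lo triple at the top of MEMMOVE below):

  #include <stddef.h>
  #include <stdint.h>

  extern void *copy_backward (void *, const void *, size_t); /* hypothetical */
  extern void *plain_memcpy (void *, const void *, size_t);  /* hypothetical */

  void *
  my_memmove (void *dst, const void *src, size_t n)
  {
    /* If DST is below SRC, dst - src wraps to a huge value and fails
       the test, so backward memmoves fall through into memcpy; only a
       large forward move with real overlap takes the backward loop.  */
    if (n > 96 && (uintptr_t) dst - (uintptr_t) src < n)
      return copy_backward (dst, src, n);
    return plain_memcpy (dst, src, n);
  }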
+*/ + +#ifndef MEMMOVE +# define MEMMOVE memmove +#endif +#ifndef MEMCPY +# define MEMCPY memcpy +#endif + +#if IS_IN (libc) + +# ifndef USE_THUNDERX2 +# undef MEMCPY +# define MEMCPY __memcpy_thunderx +# undef MEMMOVE +# define MEMMOVE __memmove_thunderx +# define USE_THUNDERX +# endif + +ENTRY_ALIGN (MEMMOVE, 6) + + DELOUSE (0) + DELOUSE (1) + DELOUSE (2) + + sub tmp1, dstin, src + cmp count, 96 + ccmp tmp1, count, 2, hi + b.lo L(move_long) + + /* Common case falls through into memcpy. */ +END (MEMMOVE) +libc_hidden_builtin_def (MEMMOVE) +ENTRY (MEMCPY) + + DELOUSE (0) + DELOUSE (1) + DELOUSE (2) + + prfm PLDL1KEEP, [src] + add srcend, src, count + add dstend, dstin, count + cmp count, 16 + b.ls L(copy16) + cmp count, 96 + b.hi L(copy_long) + + /* Medium copies: 17..96 bytes. */ + sub tmp1, count, 1 + ldp A_l, A_h, [src] + tbnz tmp1, 6, L(copy96) + ldp D_l, D_h, [srcend, -16] + tbz tmp1, 5, 1f + ldp B_l, B_h, [src, 16] + ldp C_l, C_h, [srcend, -32] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstend, -32] +1: + stp A_l, A_h, [dstin] + stp D_l, D_h, [dstend, -16] + ret + + .p2align 4 + /* Small copies: 0..16 bytes. */ +L(copy16): + cmp count, 8 + b.lo 1f + ldr A_l, [src] + ldr A_h, [srcend, -8] + str A_l, [dstin] + str A_h, [dstend, -8] + ret + .p2align 4 +1: + tbz count, 2, 1f + ldr A_lw, [src] + ldr A_hw, [srcend, -4] + str A_lw, [dstin] + str A_hw, [dstend, -4] + ret + + /* Copy 0..3 bytes. Use a branchless sequence that copies the same + byte 3 times if count==1, or the 2nd byte twice if count==2. */ +1: + cbz count, 2f + lsr tmp1, count, 1 + ldrb A_lw, [src] + ldrb A_hw, [srcend, -1] + ldrb B_lw, [src, tmp1] + strb A_lw, [dstin] + strb B_lw, [dstin, tmp1] + strb A_hw, [dstend, -1] +2: ret + + .p2align 4 + /* Copy 64..96 bytes. Copy 64 bytes from the start and + 32 bytes from the end. */ +L(copy96): + ldp B_l, B_h, [src, 16] + ldp C_l, C_h, [src, 32] + ldp D_l, D_h, [src, 48] + ldp E_l, E_h, [srcend, -32] + ldp F_l, F_h, [srcend, -16] + stp A_l, A_h, [dstin] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstin, 32] + stp D_l, D_h, [dstin, 48] + stp E_l, E_h, [dstend, -32] + stp F_l, F_h, [dstend, -16] + ret + + /* Align DST to 16 byte alignment so that we don't cross cache line + boundaries on both loads and stores. There are at least 96 bytes + to copy, so copy 16 bytes unaligned and then align. The loop + copies 64 bytes per iteration and prefetches one iteration ahead. */ + + .p2align 4 +L(copy_long): + +# if defined(USE_THUNDERX) || defined (USE_THUNDERX2) + + /* On thunderx, large memcpy's are helped by software prefetching. + This loop is identical to the one below it but with prefetching + instructions included. For loops that are less than 32768 bytes, + the prefetching does not help and slow the code down so we only + use the prefetching loop for the largest memcpys. */ + + cmp count, #32768 + b.lo L(copy_long_without_prefetch) + and tmp1, dstin, 15 + bic dst, dstin, 15 + ldp D_l, D_h, [src] + sub src, src, tmp1 +# if defined(USE_THUNDERX) + prfm pldl1strm, [src, 384] +# elif defined(USE_THUNDERX2) + prfm pldl1strm, [src, 256] +# endif + add count, count, tmp1 /* Count is now 16 too large. */ + ldp A_l, A_h, [src, 16] + stp D_l, D_h, [dstin] + ldp B_l, B_h, [src, 32] + ldp C_l, C_h, [src, 48] + ldp D_l, D_h, [src, 64]! + subs count, count, 128 + 16 /* Test and readjust count. 
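(The bias folds the loop-exit test into the subs.) The prefetch-ahead pattern of L(prefetch_loop64) below can be sketched with __builtin_prefetch; the distances come from the prfm offsets above, 384 bytes for ThunderX and 256 for ThunderX2:

  #include <stddef.h>
  #include <string.h>

  /* Illustrative only: copy 64 bytes per iteration while issuing a
     streaming-read prefetch several cache lines ahead of SRC.  */
  static void
  copy_with_prefetch (char *dst, const char *src, size_t blocks)
  {
    for (size_t i = 0; i < blocks; i++)
      {
        __builtin_prefetch (src + 384, 0, 0); /* read, no temporal reuse */
        memcpy (dst, src, 64);
        src += 64;
        dst += 64;
      }
  }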
*/ + +L(prefetch_loop64): +# if defined(USE_THUNDERX) + tbz src, #6, 1f + prfm pldl1strm, [src, 512] +1: +# elif defined(USE_THUNDERX2) + prfm pldl1strm, [src, 256] +# endif + stp A_l, A_h, [dst, 16] + ldp A_l, A_h, [src, 16] + stp B_l, B_h, [dst, 32] + ldp B_l, B_h, [src, 32] + stp C_l, C_h, [dst, 48] + ldp C_l, C_h, [src, 48] + stp D_l, D_h, [dst, 64]! + ldp D_l, D_h, [src, 64]! + subs count, count, 64 + b.hi L(prefetch_loop64) + b L(last64) + +L(copy_long_without_prefetch): +# endif + + and tmp1, dstin, 15 + bic dst, dstin, 15 + ldp D_l, D_h, [src] + sub src, src, tmp1 + add count, count, tmp1 /* Count is now 16 too large. */ + ldp A_l, A_h, [src, 16] + stp D_l, D_h, [dstin] + ldp B_l, B_h, [src, 32] + ldp C_l, C_h, [src, 48] + ldp D_l, D_h, [src, 64]! + subs count, count, 128 + 16 /* Test and readjust count. */ + b.ls L(last64) +L(loop64): + stp A_l, A_h, [dst, 16] + ldp A_l, A_h, [src, 16] + stp B_l, B_h, [dst, 32] + ldp B_l, B_h, [src, 32] + stp C_l, C_h, [dst, 48] + ldp C_l, C_h, [src, 48] + stp D_l, D_h, [dst, 64]! + ldp D_l, D_h, [src, 64]! + subs count, count, 64 + b.hi L(loop64) + + /* Write the last full set of 64 bytes. The remainder is at most 64 + bytes, so it is safe to always copy 64 bytes from the end even if + there is just 1 byte left. */ +L(last64): + ldp E_l, E_h, [srcend, -64] + stp A_l, A_h, [dst, 16] + ldp A_l, A_h, [srcend, -48] + stp B_l, B_h, [dst, 32] + ldp B_l, B_h, [srcend, -32] + stp C_l, C_h, [dst, 48] + ldp C_l, C_h, [srcend, -16] + stp D_l, D_h, [dst, 64] + stp E_l, E_h, [dstend, -64] + stp A_l, A_h, [dstend, -48] + stp B_l, B_h, [dstend, -32] + stp C_l, C_h, [dstend, -16] + ret + + .p2align 4 +L(move_long): + cbz tmp1, 3f + + add srcend, src, count + add dstend, dstin, count + + /* Align dstend to 16 byte alignment so that we don't cross cache line + boundaries on both loads and stores. There are at least 96 bytes + to copy, so copy 16 bytes unaligned and then align. The loop + copies 64 bytes per iteration and prefetches one iteration ahead. */ + + and tmp1, dstend, 15 + ldp D_l, D_h, [srcend, -16] + sub srcend, srcend, tmp1 + sub count, count, tmp1 + ldp A_l, A_h, [srcend, -16] + stp D_l, D_h, [dstend, -16] + ldp B_l, B_h, [srcend, -32] + ldp C_l, C_h, [srcend, -48] + ldp D_l, D_h, [srcend, -64]! + sub dstend, dstend, tmp1 + subs count, count, 128 + b.ls 2f + + nop +1: + stp A_l, A_h, [dstend, -16] + ldp A_l, A_h, [srcend, -16] + stp B_l, B_h, [dstend, -32] + ldp B_l, B_h, [srcend, -32] + stp C_l, C_h, [dstend, -48] + ldp C_l, C_h, [srcend, -48] + stp D_l, D_h, [dstend, -64]! + ldp D_l, D_h, [srcend, -64]! + subs count, count, 64 + b.hi 1b + + /* Write the last full set of 64 bytes. The remainder is at most 64 + bytes, so it is safe to always copy 64 bytes from the start even if + there is just 1 byte left. */ +2: + ldp G_l, G_h, [src, 48] + stp A_l, A_h, [dstend, -16] + ldp A_l, A_h, [src, 32] + stp B_l, B_h, [dstend, -32] + ldp B_l, B_h, [src, 16] + stp C_l, C_h, [dstend, -48] + ldp C_l, C_h, [src] + stp D_l, D_h, [dstend, -64] + stp G_l, G_h, [dstin, 48] + stp A_l, A_h, [dstin, 32] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstin] +3: ret + +END (MEMCPY) +libc_hidden_builtin_def (MEMCPY) + +#endif diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx2.S b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S new file mode 100644 index 0000000000..8501abf725 --- /dev/null +++ b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S @@ -0,0 +1,27 @@ +/* A Thunderx2 Optimized memcpy implementation for AARCH64. + Copyright (C) 2018 Free Software Foundation, Inc. 
+ + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* The actual code in this memcpy and memmove is in memcpy_thunderx.S. + The only real differences are with the prefetching instructions. */ + +#define MEMCPY __memcpy_thunderx2 +#define MEMMOVE __memmove_thunderx2 +#define USE_THUNDERX2 + +#include "memcpy_thunderx.S" diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c new file mode 100644 index 0000000000..e69d816291 --- /dev/null +++ b/sysdeps/aarch64/multiarch/memmove.c @@ -0,0 +1,44 @@ +/* Multiple versions of memmove. AARCH64 version. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ + +#if IS_IN (libc) +/* Redefine memmove so that the compiler won't complain about the type + mismatch with the IFUNC selector in strong_alias, below. */ +# undef memmove +# define memmove __redirect_memmove +# include <string.h> +# include <init-arch.h> + +extern __typeof (__redirect_memmove) __libc_memmove; + +extern __typeof (__redirect_memmove) __memmove_generic attribute_hidden; +extern __typeof (__redirect_memmove) __memmove_thunderx attribute_hidden; +extern __typeof (__redirect_memmove) __memmove_falkor attribute_hidden; + +libc_ifunc (__libc_memmove, + (IS_THUNDERX (midr) + ? __memmove_thunderx + : (IS_FALKOR (midr) || IS_PHECDA (midr) + ? __memmove_falkor + : __memmove_generic))); + +# undef memmove +strong_alias (__libc_memmove, memmove); +#endif diff --git a/sysdeps/aarch64/multiarch/memmove_falkor.S b/sysdeps/aarch64/multiarch/memmove_falkor.S new file mode 100644 index 0000000000..5163f68e53 --- /dev/null +++ b/sysdeps/aarch64/multiarch/memmove_falkor.S @@ -0,0 +1,225 @@ +/* Copyright (C) 2017-2018 Free Software Foundation, Inc. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +/* Assumptions: ARMv8-a, AArch64, falkor, unaligned accesses. */ + +#define dstin x0 +#define src x1 +#define count x2 +#define dst x3 +#define srcend x4 +#define dstend x5 +#define A_x x6 +#define B_x x7 +#define A_w w6 +#define B_w w7 +#define tmp1 x14 + +#define Q_q q6 +#define A_q q22 +#define B_q q18 +#define C_q q19 +#define D_q q20 +#define E_q q21 +#define F_q q17 +#define G_q q23 + +/* RATIONALE: + + The move has 4 distinct parts: + * Small moves of 16 bytes and under + * Medium sized moves of 17-96 bytes + * Large moves where the source address is higher than the destination + (forward copies) + * Large moves where the destination address is higher than the source + (copy backward, or move). + + We use only two registers q6 and q22 for the moves and move 32 bytes at a + time to correctly train the hardware prefetcher for better throughput. */ +ENTRY_ALIGN (__memmove_falkor, 6) + + sub tmp1, dstin, src + add srcend, src, count + add dstend, dstin, count + cmp count, 96 + ccmp tmp1, count, 2, hi + b.lo L(move_long) + + cmp count, 16 + b.ls L(copy16) + cmp count, 96 + b.hi L(copy_long) + + /* Medium copies: 17..96 bytes. */ + sub tmp1, count, 1 + ldr A_q, [src] + tbnz tmp1, 6, L(copy96) + ldr D_q, [srcend, -16] + tbz tmp1, 5, 1f + ldr B_q, [src, 16] + ldr C_q, [srcend, -32] + str B_q, [dstin, 16] + str C_q, [dstend, -32] +1: + str A_q, [dstin] + str D_q, [dstend, -16] + ret + + .p2align 4 + /* Small copies: 0..16 bytes. */ +L(copy16): + cmp count, 8 + b.lo 1f + ldr A_x, [src] + ldr B_x, [srcend, -8] + str A_x, [dstin] + str B_x, [dstend, -8] + ret + .p2align 4 +1: + /* 4-7 */ + tbz count, 2, 1f + ldr A_w, [src] + ldr B_w, [srcend, -4] + str A_w, [dstin] + str B_w, [dstend, -4] + ret + .p2align 4 +1: + /* 2-3 */ + tbz count, 1, 1f + ldrh A_w, [src] + ldrh B_w, [srcend, -2] + strh A_w, [dstin] + strh B_w, [dstend, -2] + ret + .p2align 4 +1: + /* 0-1 */ + tbz count, 0, 1f + ldrb A_w, [src] + strb A_w, [dstin] +1: ret + + .p2align 4 + /* Copy 64..96 bytes. Copy 64 bytes from the start and + 32 bytes from the end. */ +L(copy96): + ldr B_q, [src, 16] + ldr C_q, [src, 32] + ldr D_q, [src, 48] + ldr E_q, [srcend, -32] + ldr F_q, [srcend, -16] + str A_q, [dstin] + str B_q, [dstin, 16] + str C_q, [dstin, 32] + str D_q, [dstin, 48] + str E_q, [dstend, -32] + str F_q, [dstend, -16] + ret + + /* Align SRC to 16 byte alignment so that we don't cross cache line + boundaries on both loads and stores. There are at least 96 bytes + to copy, so copy 16 bytes unaligned and then align. The loop + copies 32 bytes per iteration and prefetches one iteration ahead. */ + + .p2align 4 +L(copy_long): + ldr A_q, [src] + and tmp1, src, 15 + bic src, src, 15 + sub dst, dstin, tmp1 + add count, count, tmp1 /* Count is now 16 too large. */ + ldr Q_q, [src, 16]! + str A_q, [dstin] + ldr A_q, [src, 16]! + subs count, count, 32 + 64 + 16 /* Test and readjust count. */ + b.ls L(last64) + +L(loop64): + subs count, count, 32 + str Q_q, [dst, 16] + ldr Q_q, [src, 16]! + str A_q, [dst, 32]! + ldr A_q, [src, 16]! + b.hi L(loop64) + + /* Write the last full set of 64 bytes. 
The remainder is at most 64 + bytes and at least 33 bytes, so it is safe to always copy 64 bytes + from the end. */ +L(last64): + ldr C_q, [srcend, -64] + str Q_q, [dst, 16] + ldr B_q, [srcend, -48] + str A_q, [dst, 32] + ldr A_q, [srcend, -32] + ldr D_q, [srcend, -16] + str C_q, [dstend, -64] + str B_q, [dstend, -48] + str A_q, [dstend, -32] + str D_q, [dstend, -16] + ret + + .p2align 4 +L(move_long): + cbz tmp1, 3f + + /* Align SRCEND to 16 byte alignment so that we don't cross cache line + boundaries on both loads and stores. There are at least 96 bytes + to copy, so copy 16 bytes unaligned and then align. The loop + copies 32 bytes per iteration and prefetches one iteration ahead. */ + + ldr A_q, [srcend, -16] + and tmp1, srcend, 15 + sub srcend, srcend, tmp1 + ldr Q_q, [srcend, -16]! + str A_q, [dstend, -16] + sub count, count, tmp1 + ldr A_q, [srcend, -16]! + sub dstend, dstend, tmp1 + subs count, count, 32 + 64 + b.ls 2f + +1: + subs count, count, 32 + str Q_q, [dstend, -16] + ldr Q_q, [srcend, -16]! + str A_q, [dstend, -32]! + ldr A_q, [srcend, -16]! + b.hi 1b + + /* Write the last full set of 64 bytes. The remainder is at most 64 + bytes and at least 33 bytes, so it is safe to always copy 64 bytes + from the start. */ +2: + ldr C_q, [src, 48] + str Q_q, [dstend, -16] + ldr B_q, [src, 32] + str A_q, [dstend, -32] + ldr A_q, [src, 16] + ldr D_q, [src] + str C_q, [dstin, 48] + str B_q, [dstin, 32] + str A_q, [dstin, 16] + str D_q, [dstin] +3: ret + +END (__memmove_falkor) +libc_hidden_builtin_def (__memmove_falkor) diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c new file mode 100644 index 0000000000..d74ed3a549 --- /dev/null +++ b/sysdeps/aarch64/multiarch/memset.c @@ -0,0 +1,41 @@ +/* Multiple versions of memset. AARCH64 version. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ + +#if IS_IN (libc) +/* Redefine memset so that the compiler won't complain about the type + mismatch with the IFUNC selector in strong_alias, below. */ +# undef memset +# define memset __redirect_memset +# include <string.h> +# include <init-arch.h> + +extern __typeof (__redirect_memset) __libc_memset; + +extern __typeof (__redirect_memset) __memset_falkor attribute_hidden; +extern __typeof (__redirect_memset) __memset_generic attribute_hidden; + +libc_ifunc (__libc_memset, + ((IS_FALKOR (midr) || IS_PHECDA (midr)) && zva_size == 64 + ? __memset_falkor + : __memset_generic)); + +# undef memset +strong_alias (__libc_memset, memset); +#endif diff --git a/sysdeps/aarch64/multiarch/memset_falkor.S b/sysdeps/aarch64/multiarch/memset_falkor.S new file mode 100644 index 0000000000..16849a5afb --- /dev/null +++ b/sysdeps/aarch64/multiarch/memset_falkor.S @@ -0,0 +1,53 @@ +/* Memset for falkor. 
+ Copyright (C) 2017-2018 Free Software Foundation, Inc. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <http://www.gnu.org/licenses/>. */ + +#include <memset-reg.h> + +/* Reading dczid_el0 is expensive on falkor so move it into the ifunc + resolver and assume ZVA size of 64 bytes. The IFUNC resolver takes care to + use this function only when ZVA is enabled. */ + +#if IS_IN (libc) +.macro zva_macro + .p2align 4 + /* Write the first and last 64 byte aligned block using stp rather + than using DC ZVA. This is faster on some cores. */ + str q0, [dst, 16] + stp q0, q0, [dst, 32] + bic dst, dst, 63 + stp q0, q0, [dst, 64] + stp q0, q0, [dst, 96] + sub count, dstend, dst /* Count is now 128 too large. */ + sub count, count, 128+64+64 /* Adjust count and bias for loop. */ + add dst, dst, 128 +1: dc zva, dst + add dst, dst, 64 + subs count, count, 64 + b.hi 1b + stp q0, q0, [dst, 0] + stp q0, q0, [dst, 32] + stp q0, q0, [dstend, -64] + stp q0, q0, [dstend, -32] + ret +.endm + +# define ZVA_MACRO zva_macro +# define MEMSET __memset_falkor +# include <sysdeps/aarch64/memset.S> +#endif diff --git a/sysdeps/aarch64/multiarch/memset_generic.S b/sysdeps/aarch64/multiarch/memset_generic.S new file mode 100644 index 0000000000..345a88b94f --- /dev/null +++ b/sysdeps/aarch64/multiarch/memset_generic.S @@ -0,0 +1,27 @@ +/* Memset for aarch64, default version for internal use. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) +# define MEMSET __memset_generic +/* Add a hidden definition for use within libc.so. */ +# ifdef SHARED + .globl __GI_memset; __GI_memset = __memset_generic +# endif +# include <sysdeps/aarch64/memset.S> +#endif diff --git a/sysdeps/aarch64/multiarch/rtld-memset.S b/sysdeps/aarch64/multiarch/rtld-memset.S new file mode 100644 index 0000000000..8cdc6e1f50 --- /dev/null +++ b/sysdeps/aarch64/multiarch/rtld-memset.S @@ -0,0 +1,23 @@ +/* Memset for aarch64, for the dynamic linker. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (rtld) +# define MEMSET memset +# include <sysdeps/aarch64/memset.S> +#endif diff --git a/sysdeps/aarch64/nptl/Makefile b/sysdeps/aarch64/nptl/Makefile index 486aeca81e..bebbdef871 100644 --- a/sysdeps/aarch64/nptl/Makefile +++ b/sysdeps/aarch64/nptl/Makefile @@ -1,4 +1,4 @@ -# Copyright (C) 2005-2016 Free Software Foundation, Inc. +# Copyright (C) 2005-2018 Free Software Foundation, Inc. # # This file is part of the GNU C Library. # diff --git a/sysdeps/aarch64/nptl/bits/pthreadtypes-arch.h b/sysdeps/aarch64/nptl/bits/pthreadtypes-arch.h new file mode 100644 index 0000000000..008aa7e0c7 --- /dev/null +++ b/sysdeps/aarch64/nptl/bits/pthreadtypes-arch.h @@ -0,0 +1,71 @@ +/* Copyright (C) 2002-2018 Free Software Foundation, Inc. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _BITS_PTHREADTYPES_ARCH_H +#define _BITS_PTHREADTYPES_ARCH_H 1 + +#include <endian.h> + +#ifdef __ILP32__ +# define __SIZEOF_PTHREAD_ATTR_T 32 +# define __SIZEOF_PTHREAD_MUTEX_T 32 +# define __SIZEOF_PTHREAD_MUTEXATTR_T 4 +# define __SIZEOF_PTHREAD_CONDATTR_T 4 +# define __SIZEOF_PTHREAD_RWLOCK_T 48 +# define __SIZEOF_PTHREAD_BARRIER_T 20 +# define __SIZEOF_PTHREAD_BARRIERATTR_T 4 +#else +# define __SIZEOF_PTHREAD_ATTR_T 64 +# define __SIZEOF_PTHREAD_MUTEX_T 48 +# define __SIZEOF_PTHREAD_MUTEXATTR_T 8 +# define __SIZEOF_PTHREAD_CONDATTR_T 8 +# define __SIZEOF_PTHREAD_RWLOCK_T 56 +# define __SIZEOF_PTHREAD_BARRIER_T 32 +# define __SIZEOF_PTHREAD_BARRIERATTR_T 8 +#endif +#define __SIZEOF_PTHREAD_COND_T 48 +#define __SIZEOF_PTHREAD_RWLOCKATTR_T 8 + +/* Definitions for internal mutex struct. 
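The __SIZEOF_* constants above are ABI promises. A quick compile-time check of what they pin down, as a sketch outside glibc (C11 or later, against the installed headers):

  #include <pthread.h>
  #include <semaphore.h>
  #include <assert.h>

  static_assert (sizeof (pthread_mutex_t) == __SIZEOF_PTHREAD_MUTEX_T,
                 "pthread_mutex_t must match its advertised ABI size");
  static_assert (sizeof (pthread_rwlock_t) == __SIZEOF_PTHREAD_RWLOCK_T,
                 "pthread_rwlock_t must match its advertised ABI size");
  static_assert (sizeof (sem_t) == __SIZEOF_SEM_T,
                 "sem_t must match its advertised ABI size");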
*/ +#define __PTHREAD_COMPAT_PADDING_MID +#define __PTHREAD_COMPAT_PADDING_END +#define __PTHREAD_MUTEX_LOCK_ELISION 0 +#define __PTHREAD_MUTEX_NUSERS_AFTER_KIND 0 +#define __PTHREAD_MUTEX_USE_UNION 0 + +#define __LOCK_ALIGNMENT +#define __ONCE_ALIGNMENT + +struct __pthread_rwlock_arch_t +{ + unsigned int __readers; + unsigned int __writers; + unsigned int __wrphase_futex; + unsigned int __writers_futex; + unsigned int __pad3; + unsigned int __pad4; + int __cur_writer; + int __shared; + unsigned long int __pad1; + unsigned long int __pad2; + unsigned int __flags; +}; + +#define __PTHREAD_RWLOCK_ELISION_EXTRA 0 + +#endif /* bits/pthreadtypes.h */ diff --git a/sysdeps/aarch64/nptl/bits/pthreadtypes.h b/sysdeps/aarch64/nptl/bits/pthreadtypes.h deleted file mode 100644 index 13984a718d..0000000000 --- a/sysdeps/aarch64/nptl/bits/pthreadtypes.h +++ /dev/null @@ -1,174 +0,0 @@ -/* Copyright (C) 2002-2016 Free Software Foundation, Inc. - - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#ifndef _BITS_PTHREADTYPES_H -#define _BITS_PTHREADTYPES_H 1 - -#include <endian.h> - -#define __SIZEOF_PTHREAD_ATTR_T 64 -#define __SIZEOF_PTHREAD_MUTEX_T 48 -#define __SIZEOF_PTHREAD_MUTEXATTR_T 8 -#define __SIZEOF_PTHREAD_COND_T 48 -#define __SIZEOF_PTHREAD_COND_COMPAT_T 48 -#define __SIZEOF_PTHREAD_CONDATTR_T 8 -#define __SIZEOF_PTHREAD_RWLOCK_T 56 -#define __SIZEOF_PTHREAD_RWLOCKATTR_T 8 -#define __SIZEOF_PTHREAD_BARRIER_T 32 -#define __SIZEOF_PTHREAD_BARRIERATTR_T 8 - - -/* Thread identifiers. The structure of the attribute type is not - exposed on purpose. */ -typedef unsigned long int pthread_t; - - -union pthread_attr_t -{ - char __size[__SIZEOF_PTHREAD_ATTR_T]; - long int __align; -}; -#ifndef __have_pthread_attr_t -typedef union pthread_attr_t pthread_attr_t; -# define __have_pthread_attr_t1 -#endif - -typedef struct __pthread_internal_list -{ - struct __pthread_internal_list *__prev; - struct __pthread_internal_list *__next; -} __pthread_list_t; - - -/* Data structures for mutex handling. The structure of the attribute - type is not exposed on purpose. */ -typedef union -{ - struct __pthread_mutex_s - { - int __lock; - unsigned int __count; - int __owner; - unsigned int __nusers; - int __kind; - int __spins; - __pthread_list_t __list; -#define __PTHREAD_MUTEX_HAVE_PREV 1 - } __data; - char __size[__SIZEOF_PTHREAD_MUTEX_T]; - long int __align; -} pthread_mutex_t; - -/* Mutex __spins initializer used by PTHREAD_MUTEX_INITIALIZER. */ -#define __PTHREAD_SPINS 0 - -typedef union -{ - char __size[__SIZEOF_PTHREAD_MUTEXATTR_T]; - long int __align; -} pthread_mutexattr_t; - - -/* Data structure for conditional variable handling. The structure of - the attribute type is not exposed on purpose. 
*/ -typedef union -{ - struct - { - int __lock; - unsigned int __futex; - __extension__ unsigned long long int __total_seq; - __extension__ unsigned long long int __wakeup_seq; - __extension__ unsigned long long int __woken_seq; - void *__mutex; - unsigned int __nwaiters; - unsigned int __broadcast_seq; - } __data; - char __size[__SIZEOF_PTHREAD_COND_T]; - long int __align; -} pthread_cond_t; - -typedef union -{ - char __size[__SIZEOF_PTHREAD_CONDATTR_T]; - int __align; -} pthread_condattr_t; - - -/* Keys for thread-specific data */ -typedef unsigned int pthread_key_t; - - -/* Once-only execution */ -typedef int pthread_once_t; - - -#if defined __USE_UNIX98 || defined __USE_XOPEN2K -/* Data structure for read-write lock variable handling. The - structure of the attribute type is not exposed on purpose. */ -typedef union -{ - struct - { - int __lock; - unsigned int __nr_readers; - unsigned int __readers_wakeup; - unsigned int __writer_wakeup; - unsigned int __nr_readers_queued; - unsigned int __nr_writers_queued; - int __writer; - int __shared; - unsigned long int __pad1; - unsigned long int __pad2; - unsigned int __flags; - } __data; - char __size[__SIZEOF_PTHREAD_RWLOCK_T]; - long int __align; -} pthread_rwlock_t; - -#define __PTHREAD_RWLOCK_ELISION_EXTRA 0 - -typedef union -{ - char __size[__SIZEOF_PTHREAD_RWLOCKATTR_T]; - long int __align; -} pthread_rwlockattr_t; -#endif - - -#ifdef __USE_XOPEN2K -/* POSIX spinlock data type. */ -typedef volatile int pthread_spinlock_t; - - -/* POSIX barriers data type. The structure of the type is - deliberately not exposed. */ -typedef union -{ - char __size[__SIZEOF_PTHREAD_BARRIER_T]; - long int __align; -} pthread_barrier_t; - -typedef union -{ - char __size[__SIZEOF_PTHREAD_BARRIERATTR_T]; - int __align; -} pthread_barrierattr_t; -#endif - -#endif /* bits/pthreadtypes.h */ diff --git a/sysdeps/aarch64/nptl/bits/semaphore.h b/sysdeps/aarch64/nptl/bits/semaphore.h index 3cc5b37847..18f8aae532 100644 --- a/sysdeps/aarch64/nptl/bits/semaphore.h +++ b/sysdeps/aarch64/nptl/bits/semaphore.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2002-2016 Free Software Foundation, Inc. +/* Copyright (C) 2002-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -21,7 +21,11 @@ #endif -#define __SIZEOF_SEM_T 32 +#ifdef __ILP32__ +# define __SIZEOF_SEM_T 16 +#else +# define __SIZEOF_SEM_T 32 +#endif /* Value returned if `sem_open' failed. */ @@ -31,5 +35,5 @@ typedef union { char __size[__SIZEOF_SEM_T]; - long int __align; + long long int __align; } sem_t; diff --git a/sysdeps/aarch64/nptl/pthread-offsets.h b/sysdeps/aarch64/nptl/pthread-offsets.h new file mode 100644 index 0000000000..16c6b0d9fd --- /dev/null +++ b/sysdeps/aarch64/nptl/pthread-offsets.h @@ -0,0 +1,5 @@ +#define __PTHREAD_MUTEX_NUSERS_OFFSET 12 +#define __PTHREAD_MUTEX_KIND_OFFSET 16 +#define __PTHREAD_MUTEX_SPINS_OFFSET 20 +#define __PTHREAD_MUTEX_ELISION_OFFSET 22 +#define __PTHREAD_MUTEX_LIST_OFFSET 24 diff --git a/sysdeps/aarch64/nptl/pthread_spin_lock.c b/sysdeps/aarch64/nptl/pthread_spin_lock.c deleted file mode 100644 index 21a54b8215..0000000000 --- a/sysdeps/aarch64/nptl/pthread_spin_lock.c +++ /dev/null @@ -1,24 +0,0 @@ -/* Copyright (C) 2008-2016 Free Software Foundation, Inc. - - This file is part of the GNU C Library. 
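(Back to pthread-offsets.h above: those constants are the byte offsets of fields inside the public pthread_mutex_t on LP64 AArch64, and can be cross-checked against the installed headers with offsetof. A sketch, not glibc code:)

  #include <stddef.h>
  #include <pthread.h>
  #include <assert.h>

  static_assert (offsetof (pthread_mutex_t, __data.__nusers) == 12,
                 "__PTHREAD_MUTEX_NUSERS_OFFSET");
  static_assert (offsetof (pthread_mutex_t, __data.__kind) == 16,
                 "__PTHREAD_MUTEX_KIND_OFFSET");
  static_assert (offsetof (pthread_mutex_t, __data.__spins) == 20,
                 "__PTHREAD_MUTEX_SPINS_OFFSET");
  static_assert (offsetof (pthread_mutex_t, __data.__list) == 24,
                 "__PTHREAD_MUTEX_LIST_OFFSET");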
- - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation; either version 2.1 of the - License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#define SPIN_LOCK_READS_BETWEEN_CMPXCHG 1000 - -/* We can't use the normal "#include <nptl/pthread_spin_lock.c>" because - it will resolve to this very file. Using "sysdeps/.." as reference to the - top level directory does the job. */ -#include <sysdeps/../nptl/pthread_spin_lock.c> diff --git a/sysdeps/aarch64/nptl/pthreaddef.h b/sysdeps/aarch64/nptl/pthreaddef.h index 5aa8d561e7..32f729ada1 100644 --- a/sysdeps/aarch64/nptl/pthreaddef.h +++ b/sysdeps/aarch64/nptl/pthreaddef.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2002-2016 Free Software Foundation, Inc. +/* Copyright (C) 2002-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/nptl/tcb-offsets.sym b/sysdeps/aarch64/nptl/tcb-offsets.sym index 0677aeabff..238647dd47 100644 --- a/sysdeps/aarch64/nptl/tcb-offsets.sym +++ b/sysdeps/aarch64/nptl/tcb-offsets.sym @@ -2,6 +2,5 @@ #include <tls.h> PTHREAD_MULTIPLE_THREADS_OFFSET offsetof (struct pthread, header.multiple_threads) -PTHREAD_PID_OFFSET offsetof (struct pthread, pid) PTHREAD_TID_OFFSET offsetof (struct pthread, tid) PTHREAD_SIZEOF sizeof (struct pthread) diff --git a/sysdeps/aarch64/nptl/tls.h b/sysdeps/aarch64/nptl/tls.h index 95ea3f9a1a..200334f84a 100644 --- a/sysdeps/aarch64/nptl/tls.h +++ b/sysdeps/aarch64/nptl/tls.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2005-2016 Free Software Foundation, Inc. +/* Copyright (C) 2005-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -25,17 +25,7 @@ # include <stdbool.h> # include <stddef.h> # include <stdint.h> - -/* Type for the dtv. */ -typedef union dtv -{ - size_t counter; - struct - { - void *val; - bool is_static; - } pointer; -} dtv_t; +# include <dl-dtv.h> #else /* __ASSEMBLER__ */ # include <tcb-offsets.h> @@ -119,6 +109,7 @@ typedef struct descr->member[idx] = (value) /* Get and set the global scope generation counter in struct pthread. */ +# define THREAD_GSCOPE_IN_TCB 1 # define THREAD_GSCOPE_FLAG_UNUSED 0 # define THREAD_GSCOPE_FLAG_USED 1 # define THREAD_GSCOPE_FLAG_WAIT 2 diff --git a/sysdeps/aarch64/rawmemchr.S b/sysdeps/aarch64/rawmemchr.S new file mode 100644 index 0000000000..de3c05f337 --- /dev/null +++ b/sysdeps/aarch64/rawmemchr.S @@ -0,0 +1,42 @@ +/* rawmemchr - find a character in a memory zone + + Copyright (C) 2015-2018 Free Software Foundation, Inc. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +/* Special case rawmemchr (s, 0) as strlen, otherwise tailcall memchr. + Call strlen without setting up a full frame - it preserves x14/x15. +*/ + +ENTRY (__rawmemchr) + cbz w1, L(do_strlen) + mov x2, -1 + b __memchr + +L(do_strlen): + mov x15, x30 + cfi_return_column (x15) + mov x14, x0 + bl __strlen + add x0, x14, x0 + ret x15 + +END (__rawmemchr) +weak_alias (__rawmemchr, rawmemchr) +libc_hidden_builtin_def (__rawmemchr) diff --git a/sysdeps/aarch64/setjmp.S b/sysdeps/aarch64/setjmp.S index 22f4368ee6..d8838e7b54 100644 --- a/sysdeps/aarch64/setjmp.S +++ b/sysdeps/aarch64/setjmp.S @@ -1,4 +1,4 @@ -/* Copyright (C) 1997-2016 Free Software Foundation, Inc. +/* Copyright (C) 1997-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -33,6 +33,7 @@ END (_setjmp) libc_hidden_def (_setjmp) ENTRY (__sigsetjmp) + DELOUSE (0) 1: stp x19, x20, [x0, #JB_X19<<3] @@ -42,7 +43,7 @@ ENTRY (__sigsetjmp) stp x27, x28, [x0, #JB_X27<<3] #ifdef PTR_MANGLE - PTR_MANGLE (x4, x30, x3, x2) + PTR_MANGLE (4, 30, 3, 2) stp x29, x4, [x0, #JB_X29<<3] #else stp x29, x30, [x0, #JB_X29<<3] @@ -57,7 +58,7 @@ ENTRY (__sigsetjmp) stp d14, d15, [x0, #JB_D14<<3] #ifdef PTR_MANGLE mov x4, sp - PTR_MANGLE (x5, x4, x3, x2) + PTR_MANGLE (5, 4, 3, 2) str x5, [x0, #JB_SP<<3] #else mov x2, sp diff --git a/sysdeps/aarch64/soft-fp/sfp-machine.h b/sysdeps/aarch64/sfp-machine.h index 3e969952fa..3e969952fa 100644 --- a/sysdeps/aarch64/soft-fp/sfp-machine.h +++ b/sysdeps/aarch64/sfp-machine.h diff --git a/sysdeps/aarch64/soft-fp/Makefile b/sysdeps/aarch64/soft-fp/Makefile deleted file mode 100644 index ada13e8b70..0000000000 --- a/sysdeps/aarch64/soft-fp/Makefile +++ /dev/null @@ -1,3 +0,0 @@ -ifeq ($(subdir),math) -CPPFLAGS += -I../soft-fp -endif diff --git a/sysdeps/aarch64/sotruss-lib.c b/sysdeps/aarch64/sotruss-lib.c index 65174402b8..5dafacc21f 100644 --- a/sysdeps/aarch64/sotruss-lib.c +++ b/sysdeps/aarch64/sotruss-lib.c @@ -1,5 +1,5 @@ /* Override generic sotruss-lib.c to define actual functions for AArch64. - Copyright (C) 2012-2016 Free Software Foundation, Inc. + Copyright (C) 2012-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/stackinfo.h b/sysdeps/aarch64/stackinfo.h index 78f3262033..7e666fbe18 100644 --- a/sysdeps/aarch64/stackinfo.h +++ b/sysdeps/aarch64/stackinfo.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2001-2016 Free Software Foundation, Inc. +/* Copyright (C) 2001-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/start.S b/sysdeps/aarch64/start.S index efe24749f7..bad000f555 100644 --- a/sysdeps/aarch64/start.S +++ b/sysdeps/aarch64/start.S @@ -1,4 +1,4 @@ -/* Copyright (C) 1995-2016 Free Software Foundation, Inc. +/* Copyright (C) 1995-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,6 +16,8 @@ License along with the GNU C Library. If not, see <http://www.gnu.org/licenses/>. */ +#include <sysdep.h> + /* This is the canonical entry point, usually the first thing in the text segment. @@ -25,7 +27,7 @@ At this entry point, most registers' values are unspecified, except: - x0 Contains a function pointer to be registered with `atexit'. + x0/w0 Contains a function pointer to be registered with `atexit'. 
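In C terms, the __rawmemchr entry added above behaves like the sketch below (a plausible reference model, not glibc's code): searching for NUL reduces to strlen — the L(do_strlen) path — while any other character tail-calls memchr with an effectively unbounded length, mirroring the mov x2, -1:

    #include <string.h>

    /* Reference model for the assembly above.  */
    static void *
    rawmemchr_ref (const void *s, int c)
    {
      if ((unsigned char) c == '\0')
        return (char *) s + strlen ((const char *) s);  /* cbz w1, L(do_strlen) */
      return memchr (s, c, (size_t) -1);                /* mov x2, -1; b __memchr */
    }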
This is how the dynamic linker arranges to have DT_FINI functions called for shared libraries that have been loaded before this code runs. @@ -52,26 +54,35 @@ _start: mov x5, x0 /* Load argc and a pointer to argv */ - ldr x1, [sp, #0] - add x2, sp, #8 + ldr PTR_REG (1), [sp, #0] + add x2, sp, #PTR_SIZE /* Setup stack limit in argument register */ mov x6, sp -#ifdef SHARED +#ifdef PIC +# ifdef SHARED adrp x0, :got:main - ldr x0, [x0, #:got_lo12:main] + ldr PTR_REG (0), [x0, #:got_lo12:main] adrp x3, :got:__libc_csu_init - ldr x3, [x3, #:got_lo12:__libc_csu_init] + ldr PTR_REG (3), [x3, #:got_lo12:__libc_csu_init] adrp x4, :got:__libc_csu_fini - ldr x4, [x4, #:got_lo12:__libc_csu_fini] + ldr PTR_REG (4), [x4, #:got_lo12:__libc_csu_fini] +# else + adrp x0, __wrap_main + add x0, x0, :lo12:__wrap_main + adrp x3, __libc_csu_init + add x3, x3, :lo12:__libc_csu_init + adrp x4, __libc_csu_fini + add x4, x4, :lo12:__libc_csu_fini +# endif #else /* Set up the other arguments in registers */ - ldr x0, =main - ldr x3, =__libc_csu_init - ldr x4, =__libc_csu_fini + MOVL (0, main) + MOVL (3, __libc_csu_init) + MOVL (4, __libc_csu_fini) #endif /* __libc_start_main (main, argc, argv, init, fini, rtld_fini, @@ -83,6 +94,15 @@ _start: /* should never get here....*/ bl abort +#if defined PIC && !defined SHARED + /* When main is not defined in the executable but in a shared library + then a wrapper is needed in crt1.o of the static-pie enabled libc, + because crt1.o and rcrt1.o share code and the latter must avoid the + use of GOT relocations before __libc_start_main is called. */ +__wrap_main: + b main +#endif + /* Define a symbol for the first piece of initialized data. */ .data .globl __data_start diff --git a/sysdeps/aarch64/stpcpy.S b/sysdeps/aarch64/stpcpy.S index cbbf26c499..dcf1356bda 100644 --- a/sysdeps/aarch64/stpcpy.S +++ b/sysdeps/aarch64/stpcpy.S @@ -1,5 +1,5 @@ /* stpcpy - copy a string returning pointer to end. - Copyright (C) 2015-2016 Free Software Foundation, Inc. + Copyright (C) 2015-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/strchr.S b/sysdeps/aarch64/strchr.S index 5e3aecf4c2..c27465220b 100644 --- a/sysdeps/aarch64/strchr.S +++ b/sysdeps/aarch64/strchr.S @@ -1,6 +1,6 @@ /* strchr - find a character in a string - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -62,6 +62,7 @@ /* Locals and temporaries. */ ENTRY (strchr) + DELOUSE (0) mov wtmp2, #0x0401 movk wtmp2, #0x4010, lsl #16 dup vrepchr.16b, chrin diff --git a/sysdeps/aarch64/strchrnul.S b/sysdeps/aarch64/strchrnul.S index a624c8d2ef..e13ace5b7e 100644 --- a/sysdeps/aarch64/strchrnul.S +++ b/sysdeps/aarch64/strchrnul.S @@ -1,6 +1,6 @@ /* strchrnul - find a character or nul in a string - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -60,6 +60,7 @@ identify exactly which byte is causing the termination. */ ENTRY (__strchrnul) + DELOUSE (0) /* Magic constant 0x40100401 to allow us to identify which lane matches the termination condition. */ mov wtmp2, #0x0401 diff --git a/sysdeps/aarch64/strcmp.S b/sysdeps/aarch64/strcmp.S index ba0ccb409b..267aa4b551 100644 --- a/sysdeps/aarch64/strcmp.S +++ b/sysdeps/aarch64/strcmp.S @@ -1,4 +1,4 @@ -/* Copyright (C) 2012-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2012-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -49,6 +49,8 @@ /* Start of performance-critical section -- one 64B cache line. */ ENTRY_ALIGN(strcmp, 6) + DELOUSE (0) + DELOUSE (1) eor tmp1, src1, src2 mov zeroones, #REP8_01 tst tmp1, #7 @@ -70,6 +72,7 @@ L(start_realigned): cbz syndrome, L(loop_aligned) /* End of performance-critical section -- one 64B cache line. */ +L(end): #ifndef __AARCH64EB__ rev syndrome, syndrome rev data1, data1 @@ -143,12 +146,38 @@ L(mutual_align): b L(start_realigned) L(misaligned8): - /* We can do better than this. */ + /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always + checking to make sure that we don't access beyond a page boundary in + SRC2. */ + tst src1, #7 + b.eq L(loop_misaligned) +L(do_misaligned): ldrb data1w, [src1], #1 ldrb data2w, [src2], #1 cmp data1w, #1 ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ - b.eq L(misaligned8) + b.ne L(done) + tst src1, #7 + b.ne L(do_misaligned) + +L(loop_misaligned): + /* Test if we are within the last dword of the end of a 4K page. If + yes then jump back to the misaligned loop to compare a byte at a time. */ + and tmp1, src2, #0xff8 + eor tmp1, tmp1, #0xff8 + cbz tmp1, L(do_misaligned) + ldr data1, [src1], #8 + ldr data2, [src2], #8 + + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + eor diff, data1, data2 /* Non-zero if differences found. */ + bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ + orr syndrome, diff, has_nul + cbz syndrome, L(loop_misaligned) + b L(end) + +L(done): sub result, data1, data2 RET END(strcmp) diff --git a/sysdeps/aarch64/strcpy.S b/sysdeps/aarch64/strcpy.S index 0694199372..0449fcb04e 100644 --- a/sysdeps/aarch64/strcpy.S +++ b/sysdeps/aarch64/strcpy.S @@ -1,5 +1,5 @@ /* strcpy/stpcpy - copy a string returning pointer to start/end. - Copyright (C) 2013-2016 Free Software Foundation, Inc. + Copyright (C) 2013-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -91,6 +91,8 @@ #define MIN_PAGE_SIZE (1 << MIN_PAGE_P2) ENTRY_ALIGN (STRCPY, 6) + DELOUSE (0) + DELOUSE (1) /* For moderately short strings, the fastest way to do the copy is to calculate the length of the string in the same way as strlen, then essentially do a memcpy of the result. This avoids the need for diff --git a/sysdeps/aarch64/string_private.h b/sysdeps/aarch64/string_private.h index 8b194a41a6..ba6297417a 100644 --- a/sysdeps/aarch64/string_private.h +++ b/sysdeps/aarch64/string_private.h @@ -1,5 +1,5 @@ /* Define _STRING_ARCH_unaligned. AArch64 version. - Copyright (C) 2016 Free Software Foundation, Inc. + Copyright (C) 2016-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S index 9b4d1da60c..32292e3b6e 100644 --- a/sysdeps/aarch64/strlen.S +++ b/sysdeps/aarch64/strlen.S @@ -1,4 +1,4 @@ -/* Copyright (C) 2012-2016 Free Software Foundation, Inc. +/* Copyright (C) 2012-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -84,7 +84,9 @@ whether the first fetch, which may be misaligned, crosses a page boundary.
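The page test in strcmp's new L(loop_misaligned) above — and tmp1, src2, #0xff8 followed by an eor with #0xff8 — asks whether the next 8-byte load from src2 might run off the current 4 KiB page. A C rendering of the same predicate, as a sketch:

    #include <stdbool.h>
    #include <stdint.h>

    /* True when P sits in the last dword of a 4 KiB page, i.e. when an
       8-byte load at P could touch the following page.  Equivalent to
       the and/eor/cbz sequence in the assembly above.  */
    static bool
    near_page_end (const void *p)
    {
      return ((uintptr_t) p & 0xff8) == 0xff8;
    }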
*/ -ENTRY_ALIGN (strlen, 6) +ENTRY_ALIGN (__strlen, 6) + DELOUSE (0) + DELOUSE (1) and tmp1, srcin, MIN_PAGE_SIZE - 1 mov zeroones, REP8_01 cmp tmp1, MIN_PAGE_SIZE - 16 @@ -213,5 +215,6 @@ L(page_cross): csel data1, data1, tmp4, eq csel data2, data2, tmp2, eq b L(page_cross_entry) -END (strlen) +END (__strlen) +weak_alias (__strlen, strlen) libc_hidden_builtin_def (strlen) diff --git a/sysdeps/aarch64/strncmp.S b/sysdeps/aarch64/strncmp.S index f6a17fdba2..759c752fc2 100644 --- a/sysdeps/aarch64/strncmp.S +++ b/sysdeps/aarch64/strncmp.S @@ -1,4 +1,4 @@ -/* Copyright (C) 2013-2016 Free Software Foundation, Inc. +/* Copyright (C) 2013-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -49,15 +49,19 @@ #define limit_wd x13 #define mask x14 #define endloop x15 +#define count mask ENTRY_ALIGN_AND_PAD (strncmp, 6, 7) + DELOUSE (0) + DELOUSE (1) + DELOUSE (2) cbz limit, L(ret0) eor tmp1, src1, src2 mov zeroones, #REP8_01 tst tmp1, #7 + and count, src1, #7 b.ne L(misaligned8) - ands tmp1, src1, #7 - b.ne L(mutual_align) + cbnz count, L(mutual_align) /* Calculate the number of full and partial words -1. */ sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */ @@ -162,43 +166,107 @@ L(mutual_align): bic src1, src1, #7 bic src2, src2, #7 ldr data1, [src1], #8 - neg tmp3, tmp1, lsl #3 /* 64 - bits(bytes beyond align). */ + neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */ ldr data2, [src2], #8 mov tmp2, #~0 sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ #ifdef __AARCH64EB__ /* Big-endian. Early bytes are at MSB. */ - lsl tmp2, tmp2, tmp3 /* Shift (tmp1 & 63). */ + lsl tmp2, tmp2, tmp3 /* Shift (count & 63). */ #else /* Little-endian. Early bytes are at LSB. */ - lsr tmp2, tmp2, tmp3 /* Shift (tmp1 & 63). */ + lsr tmp2, tmp2, tmp3 /* Shift (count & 63). */ #endif and tmp3, limit_wd, #7 lsr limit_wd, limit_wd, #3 /* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */ - add limit, limit, tmp1 - add tmp3, tmp3, tmp1 + add limit, limit, count + add tmp3, tmp3, count orr data1, data1, tmp2 orr data2, data2, tmp2 add limit_wd, limit_wd, tmp3, lsr #3 b L(start_realigned) -L(ret0): - mov result, #0 - RET - .p2align 6 + /* Don't bother with dwords for up to 16 bytes. */ L(misaligned8): - sub limit, limit, #1 -1: + cmp limit, #16 + b.hs L(try_misaligned_words) + +L(byte_loop): /* Perhaps we can do better than this. */ ldrb data1w, [src1], #1 ldrb data2w, [src2], #1 subs limit, limit, #1 - ccmp data1w, #1, #0, cs /* NZCV = 0b0000. */ + ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */ ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ - b.eq 1b + b.eq L(byte_loop) +L(done): sub result, data1, data2 RET + + /* Align the SRC1 to a dword by doing a bytewise compare and then do + the dword loop. */ +L(try_misaligned_words): + lsr limit_wd, limit, #3 + cbz count, L(do_misaligned) + + neg count, count + and count, count, #7 + sub limit, limit, count + lsr limit_wd, limit, #3 + +L(page_end_loop): + ldrb data1w, [src1], #1 + ldrb data2w, [src2], #1 + cmp data1w, #1 + ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ + b.ne L(done) + subs count, count, #1 + b.hi L(page_end_loop) + +L(do_misaligned): + /* Prepare ourselves for the next page crossing. Unlike the aligned + loop, we fetch 1 less dword because we risk crossing bounds on + SRC2. 
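The zeroones / REP8_7f arithmetic that strcmp and strncmp use above — sub tmp1, data1, zeroones; orr tmp2, data1, #REP8_7f; bic(s) has_nul, tmp1, tmp2 — is the classic SWAR test for a zero byte inside a 64-bit word. Sketched in C:

    #include <stdint.h>

    #define REP8_01 0x0101010101010101ULL
    #define REP8_7f 0x7f7f7f7f7f7f7f7fULL

    /* Non-zero iff some byte of V is 0x00: the subtraction borrows
       through zero bytes, and ~(v | REP8_7f) keeps only the high bit of
       each byte whose own high bit was clear, filtering false hits.  */
    static uint64_t
    has_nul_byte (uint64_t v)
    {
      return (v - REP8_01) & ~(v | REP8_7f);
    }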
*/ + mov count, #8 + subs limit_wd, limit_wd, #1 + b.lo L(done_loop) +L(loop_misaligned): + and tmp2, src2, #0xff8 + eor tmp2, tmp2, #0xff8 + cbz tmp2, L(page_end_loop) + + ldr data1, [src1], #8 + ldr data2, [src2], #8 + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + eor diff, data1, data2 /* Non-zero if differences found. */ + bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ + ccmp diff, #0, #0, eq + b.ne L(not_limit) + subs limit_wd, limit_wd, #1 + b.pl L(loop_misaligned) + +L(done_loop): + /* We found a difference or a NUL before the limit was reached. */ + and limit, limit, #7 + cbz limit, L(not_limit) + /* Read the last word. */ + sub src1, src1, 8 + sub src2, src2, 8 + ldr data1, [src1, limit] + ldr data2, [src2, limit] + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + eor diff, data1, data2 /* Non-zero if differences found. */ + bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ + ccmp diff, #0, #0, eq + b.ne L(not_limit) + +L(ret0): + mov result, #0 + RET + END (strncmp) libc_hidden_builtin_def (strncmp) diff --git a/sysdeps/aarch64/strnlen.S b/sysdeps/aarch64/strnlen.S index 4cce45f905..0f190e83e2 100644 --- a/sysdeps/aarch64/strnlen.S +++ b/sysdeps/aarch64/strnlen.S @@ -1,6 +1,6 @@ /* strnlen - calculate the length of a string with limit. - Copyright (C) 2013-2016 Free Software Foundation, Inc. + Copyright (C) 2013-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -50,6 +50,9 @@ #define REP8_80 0x8080808080808080 ENTRY_ALIGN_AND_PAD (__strnlen, 6, 9) + DELOUSE (0) + DELOUSE (1) + DELOUSE (2) cbz limit, L(hit_limit) mov zeroones, #REP8_01 bic src, srcin, #15 diff --git a/sysdeps/aarch64/strrchr.S b/sysdeps/aarch64/strrchr.S index 44c1917a20..aa334ede53 100644 --- a/sysdeps/aarch64/strrchr.S +++ b/sysdeps/aarch64/strrchr.S @@ -1,6 +1,6 @@ /* strrchr: find the last instance of a character in a string. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -68,6 +68,7 @@ identify exactly which byte is causing the termination, and why. */ ENTRY(strrchr) + DELOUSE (0) cbz x1, L(null_search) /* Magic constant 0x40100401 to allow us to identify which lane matches the requested byte. Magic constant 0x80200802 used diff --git a/sysdeps/aarch64/sysdep.h b/sysdeps/aarch64/sysdep.h index 6b728ec651..5b30709436 100644 --- a/sysdeps/aarch64/sysdep.h +++ b/sysdeps/aarch64/sysdep.h @@ -1,4 +1,4 @@ -/* Copyright (C) 1997-2016 Free Software Foundation, Inc. +/* Copyright (C) 1997-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,8 +16,25 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ +#ifndef _AARCH64_SYSDEP_H +#define _AARCH64_SYSDEP_H + #include <sysdeps/generic/sysdep.h> +#ifdef __LP64__ +# define AARCH64_R(NAME) R_AARCH64_ ## NAME +# define PTR_REG(n) x##n +# define PTR_LOG_SIZE 3 +# define DELOUSE(n) +#else +# define AARCH64_R(NAME) R_AARCH64_P32_ ## NAME +# define PTR_REG(n) w##n +# define PTR_LOG_SIZE 2 +# define DELOUSE(n) mov w##n, w##n +#endif + +#define PTR_SIZE (1<<PTR_LOG_SIZE) + #ifdef __ASSEMBLER__ /* Syntactic details of assembler. */ @@ -66,9 +83,38 @@ /* If compiled for profiling, call `mcount' at the start of each function.
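DELOUSE, introduced in the sysdep.h hunk above, is empty on LP64 and expands to mov wN, wN on ILP32, exploiting the fact that a write to a W register zero-extends into the whole X register. A loose C analogue of the intent (illustration only, not the real mechanism):

    #include <stdint.h>

    /* Under ILP32 a 64-bit register holding a 32-bit pointer may carry
       junk in its upper half; keep only the low 32 bits before using it
       as an address, just as "mov wN, wN" zero-extends wN into xN.  */
    static uint64_t
    delouse (uint64_t reg)
    {
      return (uint32_t) reg;
    }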
*/ #ifdef PROF # define CALL_MCOUNT \ - str x30, [sp, #-16]!; \ + str x30, [sp, #-80]!; \ + cfi_adjust_cfa_offset (80); \ + cfi_rel_offset (x30, 0); \ + stp x0, x1, [sp, #16]; \ + cfi_rel_offset (x0, 16); \ + cfi_rel_offset (x1, 24); \ + stp x2, x3, [sp, #32]; \ + cfi_rel_offset (x2, 32); \ + cfi_rel_offset (x3, 40); \ + stp x4, x5, [sp, #48]; \ + cfi_rel_offset (x4, 48); \ + cfi_rel_offset (x5, 56); \ + stp x6, x7, [sp, #64]; \ + cfi_rel_offset (x6, 64); \ + cfi_rel_offset (x7, 72); \ + mov x0, x30; \ bl mcount; \ - ldr x30, [sp], #16 ; + ldp x0, x1, [sp, #16]; \ + cfi_restore (x0); \ + cfi_restore (x1); \ + ldp x2, x3, [sp, #32]; \ + cfi_restore (x2); \ + cfi_restore (x3); \ + ldp x4, x5, [sp, #48]; \ + cfi_restore (x4); \ + cfi_restore (x5); \ + ldp x6, x7, [sp, #64]; \ + cfi_restore (x6); \ + cfi_restore (x7); \ + ldr x30, [sp], #80; \ + cfi_adjust_cfa_offset (-80); \ + cfi_restore (x30); #else # define CALL_MCOUNT /* Do nothing. */ #endif @@ -78,16 +124,32 @@ # define L(name) .L##name #endif -/* Load or store to/from a pc-relative EXPR into/from R, using T. */ -#define LDST_PCREL(OP, R, T, EXPR) \ - adrp T, EXPR; \ - OP R, [T, #:lo12:EXPR];\ - -/* Load or store to/from a got-relative EXPR into/from R, using T. */ -#define LDST_GLOBAL(OP, R, T, EXPR) \ - adrp T, :got:EXPR; \ - ldr T, [T, #:got_lo12:EXPR];\ - OP R, [T]; +/* Load or store to/from a pc-relative EXPR into/from R, using T. + Note R and T are register numbers and not register names. */ +#define LDST_PCREL(OP, R, T, EXPR) \ + adrp x##T, EXPR; \ + OP PTR_REG (R), [x##T, #:lo12:EXPR]; \ + +/* Load or store to/from a got-relative EXPR into/from R, using T. + Note R and T are register numbers and not register names. */ +#define LDST_GLOBAL(OP, R, T, EXPR) \ + adrp x##T, :got:EXPR; \ + ldr PTR_REG (T), [x##T, #:got_lo12:EXPR]; \ + OP PTR_REG (R), [x##T]; + +/* Load an immediate into R. + Note R is a register number and not a register name. */ +#ifdef __LP64__ +# define MOVL(R, NAME) \ + movz PTR_REG (R), #:abs_g3:NAME; \ + movk PTR_REG (R), #:abs_g2_nc:NAME; \ + movk PTR_REG (R), #:abs_g1_nc:NAME; \ + movk PTR_REG (R), #:abs_g0_nc:NAME; +#else +# define MOVL(R, NAME) \ + movz PTR_REG (R), #:abs_g1:NAME; \ + movk PTR_REG (R), #:abs_g0_nc:NAME; +#endif /* Since C identifiers are not normally prefixed with an underscore on this system, the asm identifier `syscall_error' intrudes on the @@ -96,3 +158,5 @@ #define mcount _mcount #endif /* __ASSEMBLER__ */ + +#endif /* _AARCH64_SYSDEP_H */ diff --git a/sysdeps/aarch64/tls-macros.h b/sysdeps/aarch64/tls-macros.h index 2080a4dfa4..cc58a7d6bf 100644 --- a/sysdeps/aarch64/tls-macros.h +++ b/sysdeps/aarch64/tls-macros.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2016 Free Software Foundation, Inc. +/* Copyright (C) 2009-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/tlsdesc.c b/sysdeps/aarch64/tlsdesc.c index ac3a9f4591..357465f23d 100644 --- a/sysdeps/aarch64/tlsdesc.c +++ b/sysdeps/aarch64/tlsdesc.c @@ -1,6 +1,6 @@ /* Manage TLS descriptors. AArch64 version. - Copyright (C) 2011-2016 Free Software Foundation, Inc. + Copyright (C) 2011-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -18,143 +18,17 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. 
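The new MOVL macro above materialises a 64-bit absolute value 16 bits at a time: movz seeds one group and each following movk patches another, with the :abs_gN: / :abs_gN_nc: operators selecting the Nth 16-bit group of the symbol's address. An arithmetic sketch of the composition:

    #include <stdint.h>

    /* Compose a value the way movz/movk would under MOVL on LP64:
       g3..g0 are the four 16-bit groups of the target address.  */
    static uint64_t
    movl (uint16_t g3, uint16_t g2, uint16_t g1, uint16_t g0)
    {
      uint64_t v = (uint64_t) g3 << 48;  /* movz Rd, #:abs_g3:sym */
      v |= (uint64_t) g2 << 32;          /* movk Rd, #:abs_g2_nc:sym */
      v |= (uint64_t) g1 << 16;          /* movk Rd, #:abs_g1_nc:sym */
      v |= (uint64_t) g0;                /* movk Rd, #:abs_g0_nc:sym */
      return v;
    }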
*/ -#include <link.h> #include <ldsodefs.h> -#include <elf/dynamic-link.h> #include <tls.h> #include <dl-tlsdesc.h> #include <dl-unmap-segments.h> +#define _dl_tlsdesc_resolve_hold 0 #include <tlsdeschtab.h> -#include <atomic.h> - -/* The following functions take an entry_check_offset argument. It's - computed by the caller as an offset between its entry point and the - call site, such that by adding the built-in return address that is - implicitly passed to the function with this offset, we can easily - obtain the caller's entry point to compare with the entry point - given in the TLS descriptor. If it's changed, we want to return - immediately. */ - -/* This function is used to lazily resolve TLS_DESC RELA relocations. - The argument location is used to hold a pointer to the relocation. */ - -void -attribute_hidden -_dl_tlsdesc_resolve_rela_fixup (struct tlsdesc *td, struct link_map *l) -{ - const ElfW(Rela) *reloc = atomic_load_relaxed (&td->arg); - - /* After GL(dl_load_lock) is grabbed only one caller can see td->entry in - initial state in _dl_tlsdesc_resolve_early_return_p, other concurrent - callers will return and retry calling td->entry. The updated td->entry - synchronizes with the single writer so all read accesses here can use - relaxed order. */ - if (_dl_tlsdesc_resolve_early_return_p - (td, (void*)(D_PTR (l, l_info[ADDRIDX (DT_TLSDESC_PLT)]) + l->l_addr))) - return; - - /* The code below was borrowed from _dl_fixup(), - except for checking for STB_LOCAL. */ - const ElfW(Sym) *const symtab - = (const void *) D_PTR (l, l_info[DT_SYMTAB]); - const char *strtab = (const void *) D_PTR (l, l_info[DT_STRTAB]); - const ElfW(Sym) *sym = &symtab[ELFW(R_SYM) (reloc->r_info)]; - lookup_t result; - - /* Look up the target symbol. If the normal lookup rules are not - used don't look in the global scope. */ - if (ELFW(ST_BIND) (sym->st_info) != STB_LOCAL - && __builtin_expect (ELFW(ST_VISIBILITY) (sym->st_other), 0) == 0) - { - const struct r_found_version *version = NULL; - - if (l->l_info[VERSYMIDX (DT_VERSYM)] != NULL) - { - const ElfW(Half) *vernum = - (const void *) D_PTR (l, l_info[VERSYMIDX (DT_VERSYM)]); - ElfW(Half) ndx = vernum[ELFW(R_SYM) (reloc->r_info)] & 0x7fff; - version = &l->l_versions[ndx]; - if (version->hash == 0) - version = NULL; - } - - result = _dl_lookup_symbol_x (strtab + sym->st_name, l, &sym, - l->l_scope, version, ELF_RTYPE_CLASS_PLT, - DL_LOOKUP_ADD_DEPENDENCY, NULL); - } - else - { - /* We already found the symbol. The module (and therefore its load - address) is also known. */ - result = l; - } - - if (!sym) - { - atomic_store_relaxed (&td->arg, (void *) reloc->r_addend); - /* This release store synchronizes with the ldar acquire load - instruction in _dl_tlsdesc_undefweak. */ - atomic_store_release (&td->entry, _dl_tlsdesc_undefweak); - } - else - { -# ifndef SHARED - CHECK_STATIC_TLS (l, result); -# else - if (!TRY_STATIC_TLS (l, result)) - { - void *p = _dl_make_tlsdesc_dynamic (result, sym->st_value - + reloc->r_addend); - atomic_store_relaxed (&td->arg, p); - /* This release store synchronizes with the ldar acquire load - instruction in _dl_tlsdesc_dynamic. */ - atomic_store_release (&td->entry, _dl_tlsdesc_dynamic); - } - else -# endif - { - void *p = (void*) (sym->st_value + result->l_tls_offset - + reloc->r_addend); - atomic_store_relaxed (&td->arg, p); - /* This release store synchronizes with the ldar acquire load - instruction in _dl_tlsdesc_return_lazy. 
*/ - atomic_store_release (&td->entry, _dl_tlsdesc_return_lazy); - } - } - - _dl_tlsdesc_wake_up_held_fixups (); -} - -/* This function is used to avoid busy waiting for other threads to - complete the lazy relocation. Once another thread wins the race to - relocate a TLS descriptor, it sets the descriptor up such that this - function is called to wait until the resolver releases the - lock. */ - -void -attribute_hidden -_dl_tlsdesc_resolve_hold_fixup (struct tlsdesc *td, void *caller) -{ - /* Maybe we're lucky and can return early. */ - if (caller != atomic_load_relaxed (&td->entry)) - return; - - /* Locking here will stop execution until the running resolver runs - _dl_tlsdesc_wake_up_held_fixups(), releasing the lock. - - FIXME: We'd be better off waiting on a condition variable, such - that we didn't have to hold the lock throughout the relocation - processing. */ - __rtld_lock_lock_recursive (GL(dl_load_lock)); - __rtld_lock_unlock_recursive (GL(dl_load_lock)); -} - /* Unmap the dynamic object, but also release its TLS descriptor table if there is one. */ void -internal_function _dl_unmap (struct link_map *map) { _dl_unmap_segments (map); diff --git a/sysdeps/aarch64/tlsdesc.sym b/sysdeps/aarch64/tlsdesc.sym index 63766af612..a06a719779 100644 --- a/sysdeps/aarch64/tlsdesc.sym +++ b/sysdeps/aarch64/tlsdesc.sym @@ -13,3 +13,6 @@ TLSDESC_ARG offsetof(struct tlsdesc, arg) TLSDESC_GEN_COUNT offsetof(struct tlsdesc_dynamic_arg, gen_count) TLSDESC_MODID offsetof(struct tlsdesc_dynamic_arg, tlsinfo.ti_module) TLSDESC_MODOFF offsetof(struct tlsdesc_dynamic_arg, tlsinfo.ti_offset) +TCBHEAD_DTV offsetof(tcbhead_t, dtv) +DTV_COUNTER offsetof(dtv_t, counter) +TLS_DTV_UNALLOCATED TLS_DTV_UNALLOCATED diff --git a/sysdeps/aarch64/tst-audit.h b/sysdeps/aarch64/tst-audit.h index 8ed01484e3..fd4f38f613 100644 --- a/sysdeps/aarch64/tst-audit.h +++ b/sysdeps/aarch64/tst-audit.h @@ -1,6 +1,6 @@ /* Definitions for testing PLT entry/exit auditing. AArch64 version. - Copyright (C) 2005-2016 Free Software Foundation, Inc. + Copyright (C) 2005-2018 Free Software Foundation, Inc. This file is part of the GNU C Library.