path: root/sysdeps/aarch64/multiarch
author    Samuel Thibault <samuel.thibault@ens-lyon.org>  2018-12-27 19:01:57 +0000
committer Samuel Thibault <samuel.thibault@ens-lyon.org>  2018-12-27 19:01:57 +0000
commit    cab56836b146bc129f1ad43f0393d95a9deca63a (patch)
tree      4f4e655319bbac78fca170da05275c127429b460 /sysdeps/aarch64/multiarch
parent    04ac1241a4cd004872282c2c82ec37fa33925292 (diff)
parent    82dd75a7f436a19047325d62182590c9f9e23a78 (diff)
Merge branch 't/tls' into refs/top-bases/t/tls-threadvar
Diffstat (limited to 'sysdeps/aarch64/multiarch')
-rw-r--r--  sysdeps/aarch64/multiarch/Makefile             |   4
-rw-r--r--  sysdeps/aarch64/multiarch/ifunc-impl-list.c    |  57
-rw-r--r--  sysdeps/aarch64/multiarch/init-arch.h          |  25
-rw-r--r--  sysdeps/aarch64/multiarch/memcpy.c             |  47
-rw-r--r--  sysdeps/aarch64/multiarch/memcpy_falkor.S      | 191
-rw-r--r--  sysdeps/aarch64/multiarch/memcpy_generic.S     |  44
-rw-r--r--  sysdeps/aarch64/multiarch/memcpy_thunderx.S    | 336
-rw-r--r--  sysdeps/aarch64/multiarch/memcpy_thunderx2.S   |  27
-rw-r--r--  sysdeps/aarch64/multiarch/memmove.c            |  44
-rw-r--r--  sysdeps/aarch64/multiarch/memmove_falkor.S     | 225
-rw-r--r--  sysdeps/aarch64/multiarch/memset.c             |  41
-rw-r--r--  sysdeps/aarch64/multiarch/memset_falkor.S      |  53
-rw-r--r--  sysdeps/aarch64/multiarch/memset_generic.S     |  27
-rw-r--r--  sysdeps/aarch64/multiarch/rtld-memset.S        |  23
14 files changed, 1144 insertions, 0 deletions
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
new file mode 100644
index 0000000000..57ffdf7238
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -0,0 +1,4 @@
+ifeq ($(subdir),string)
+sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \
+ memcpy_falkor memmove_falkor memset_generic memset_falkor
+endif
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
new file mode 100644
index 0000000000..e55be80103
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -0,0 +1,57 @@
+/* Enumerate available IFUNC implementations of a function. AARCH64 version.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <assert.h>
+#include <string.h>
+#include <wchar.h>
+#include <ldsodefs.h>
+#include <ifunc-impl-list.h>
+#include <init-arch.h>
+#include <stdio.h>
+
+/* Maximum number of IFUNC implementations. */
+#define MAX_IFUNC 4
+
+size_t
+__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ size_t max)
+{
+ assert (max >= MAX_IFUNC);
+
+ size_t i = 0;
+
+ INIT_ARCH ();
+
+ /* Support sysdeps/aarch64/multiarch/memcpy.c and memmove.c. */
+ IFUNC_IMPL (i, name, memcpy,
+ IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx)
+ IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx2)
+ IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_falkor)
+ IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
+ IFUNC_IMPL (i, name, memmove,
+ IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
+ IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_falkor)
+ IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
+ IFUNC_IMPL (i, name, memset,
+ /* Enable this on non-falkor processors too so that other cores
+ can do a comparative analysis with __memset_generic. */
+ IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_falkor)
+ IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
+
+ return i;
+}
diff --git a/sysdeps/aarch64/multiarch/init-arch.h b/sysdeps/aarch64/multiarch/init-arch.h
new file mode 100644
index 0000000000..d1e5703cb2
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/init-arch.h
@@ -0,0 +1,25 @@
+/* Define INIT_ARCH so that midr is initialized before use by IFUNCs.
+ This file is part of the GNU C Library.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <ldsodefs.h>
+
+#define INIT_ARCH() \
+ uint64_t __attribute__((unused)) midr = \
+ GLRO(dl_aarch64_cpu_features).midr_el1; \
+ unsigned __attribute__((unused)) zva_size = \
+ GLRO(dl_aarch64_cpu_features).zva_size;
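For readers unfamiliar with the midr value exposed by INIT_ARCH: the IS_THUNDERX/IS_FALKOR-style checks used by the selectors below key on fields of the MIDR_EL1 register. The following standalone sketch (not part of this commit) shows the field layout as defined by the ARM architecture manual; the sample value and the program itself are purely illustrative, and glibc's real decoding macros live in cpu-features.h.

/* Illustrative decoding of MIDR_EL1 into the fields that CPU checks such as
   IS_FALKOR (midr) rely on.  Field positions follow the ARM architecture
   manual; the sample value below is hypothetical. */
#include <stdint.h>
#include <stdio.h>

#define MIDR_IMPLEMENTOR(midr) (((midr) >> 24) & 0xff)   /* bits [31:24] */
#define MIDR_VARIANT(midr)     (((midr) >> 20) & 0xf)    /* bits [23:20] */
#define MIDR_PARTNUM(midr)     (((midr) >> 4) & 0xfff)   /* bits [15:4]  */
#define MIDR_REVISION(midr)    ((midr) & 0xf)            /* bits [3:0]   */

int
main (void)
{
  uint64_t midr = 0x510fc000;   /* hypothetical sample MIDR_EL1 value */
  printf ("implementer=0x%02llx part=0x%03llx variant=%llu rev=%llu\n",
          (unsigned long long) MIDR_IMPLEMENTOR (midr),
          (unsigned long long) MIDR_PARTNUM (midr),
          (unsigned long long) MIDR_VARIANT (midr),
          (unsigned long long) MIDR_REVISION (midr));
  return 0;
}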
diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c
new file mode 100644
index 0000000000..4a04a63b0f
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memcpy.c
@@ -0,0 +1,47 @@
+/* Multiple versions of memcpy. AARCH64 version.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for the definition in libc. */
+
+#if IS_IN (libc)
+/* Redefine memcpy so that the compiler won't complain about the type
+ mismatch with the IFUNC selector in strong_alias, below. */
+# undef memcpy
+# define memcpy __redirect_memcpy
+# include <string.h>
+# include <init-arch.h>
+
+extern __typeof (__redirect_memcpy) __libc_memcpy;
+
+extern __typeof (__redirect_memcpy) __memcpy_generic attribute_hidden;
+extern __typeof (__redirect_memcpy) __memcpy_thunderx attribute_hidden;
+extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden;
+extern __typeof (__redirect_memcpy) __memcpy_falkor attribute_hidden;
+
+libc_ifunc (__libc_memcpy,
+ (IS_THUNDERX (midr)
+ ? __memcpy_thunderx
+ : (IS_FALKOR (midr) || IS_PHECDA (midr)
+ ? __memcpy_falkor
+ : (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr)
+ ? __memcpy_thunderx2
+ : __memcpy_generic))));
+
+# undef memcpy
+strong_alias (__libc_memcpy, memcpy);
+#endif
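The libc_ifunc macro above expands to a GNU indirect function: the resolver runs once during relocation and its result is patched in, so later memcpy calls jump straight to the chosen variant. As a standalone sketch (not part of this commit), the same dispatch pattern can be written with GCC's ifunc attribute on an ELF/GNU target; every name below, including cpu_is_falkor_like, is a hypothetical stand-in for the MIDR-based checks used here.

/* Minimal sketch of IFUNC dispatch using the GCC ifunc attribute.  The
   selection logic is a placeholder, not glibc's. */
#include <stddef.h>

typedef void *(*memcpy_fn) (void *, const void *, size_t);

static void *
my_memcpy_generic (void *dst, const void *src, size_t n)
{
  char *d = dst;
  const char *s = src;
  while (n--)
    *d++ = *s++;
  return dst;
}

static void *
my_memcpy_tuned (void *dst, const void *src, size_t n)
{
  /* A CPU-specific variant would go here; reuse the generic one.  */
  return my_memcpy_generic (dst, src, n);
}

static int
cpu_is_falkor_like (void)
{
  return 0;   /* Stand-in for an IS_FALKOR (midr)-style check.  */
}

/* Runs once at relocation time; the returned pointer is what all later
   calls to my_memcpy actually invoke.  */
static memcpy_fn
resolve_my_memcpy (void)
{
  return cpu_is_falkor_like () ? my_memcpy_tuned : my_memcpy_generic;
}

void *my_memcpy (void *, const void *, size_t)
     __attribute__ ((ifunc ("resolve_my_memcpy")));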
diff --git a/sysdeps/aarch64/multiarch/memcpy_falkor.S b/sysdeps/aarch64/multiarch/memcpy_falkor.S
new file mode 100644
index 0000000000..cdc2de4750
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memcpy_falkor.S
@@ -0,0 +1,191 @@
+/* Optimized memcpy for Qualcomm Falkor processor.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* Assumptions:
+
+ ARMv8-a, AArch64, falkor, unaligned accesses. */
+
+#define dstin x0
+#define src x1
+#define count x2
+#define dst x3
+#define srcend x4
+#define dstend x5
+#define tmp1 x14
+#define A_x x6
+#define B_x x7
+#define A_w w6
+#define B_w w7
+
+#define A_q q0
+#define B_q q1
+#define C_q q2
+#define D_q q3
+#define E_q q4
+#define F_q q5
+#define G_q q6
+
+/* Copies are split into 3 main cases:
+
+ 1. Small copies of up to 32 bytes
+ 2. Medium copies of 33..128 bytes which are fully unrolled
+ 3. Large copies of more than 128 bytes.
+
+ Large copies align the source to a quad word and use an unrolled loop
+ processing 64 bytes per iteration.
+
+ FALKOR-SPECIFIC DESIGN:
+
+ The smallest copies (32 bytes or less) focus on optimal pipeline usage,
+ which is why the redundant copies of 0-3 bytes have been replaced with
+ conditionals, since the former would unnecessarily break across multiple
+ issue groups. The medium copy group has been enlarged to 128 bytes since
+ bumping the small copies up to 32 bytes allows us to do that without
+ cost and also allows us to reduce the size of the prep code before loop64.
+
+ The copy loop uses only one register q0. This is to ensure that all loads
+ hit a single hardware prefetcher which can get correctly trained to prefetch
+ a single stream.
+
+ The non-temporal stores help optimize cache utilization. */
+
+#if IS_IN (libc)
+ENTRY_ALIGN (__memcpy_falkor, 6)
+
+ cmp count, 32
+ add srcend, src, count
+ add dstend, dstin, count
+ b.ls L(copy32)
+ ldr A_q, [src]
+ cmp count, 128
+ str A_q, [dstin]
+ b.hi L(copy_long)
+
+ /* Medium copies: 33..128 bytes. */
+ sub tmp1, count, 1
+ ldr A_q, [src, 16]
+ ldr B_q, [srcend, -32]
+ ldr C_q, [srcend, -16]
+ tbz tmp1, 6, 1f
+ ldr D_q, [src, 32]
+ ldr E_q, [src, 48]
+ str D_q, [dstin, 32]
+ str E_q, [dstin, 48]
+ ldr F_q, [srcend, -64]
+ ldr G_q, [srcend, -48]
+ str F_q, [dstend, -64]
+ str G_q, [dstend, -48]
+1:
+ str A_q, [dstin, 16]
+ str B_q, [dstend, -32]
+ str C_q, [dstend, -16]
+ ret
+
+ .p2align 4
+ /* Small copies: 0..32 bytes. */
+L(copy32):
+ /* 16-32 */
+ cmp count, 16
+ b.lo 1f
+ ldr A_q, [src]
+ ldr B_q, [srcend, -16]
+ str A_q, [dstin]
+ str B_q, [dstend, -16]
+ ret
+ .p2align 4
+1:
+ /* 8-15 */
+ tbz count, 3, 1f
+ ldr A_x, [src]
+ ldr B_x, [srcend, -8]
+ str A_x, [dstin]
+ str B_x, [dstend, -8]
+ ret
+ .p2align 4
+1:
+ /* 4-7 */
+ tbz count, 2, 1f
+ ldr A_w, [src]
+ ldr B_w, [srcend, -4]
+ str A_w, [dstin]
+ str B_w, [dstend, -4]
+ ret
+ .p2align 4
+1:
+ /* 2-3 */
+ tbz count, 1, 1f
+ ldrh A_w, [src]
+ ldrh B_w, [srcend, -2]
+ strh A_w, [dstin]
+ strh B_w, [dstend, -2]
+ ret
+ .p2align 4
+1:
+ /* 0-1 */
+ tbz count, 0, 1f
+ ldrb A_w, [src]
+ strb A_w, [dstin]
+1:
+ ret
+
+ /* Align SRC to 16 bytes and copy; that way at least one of the
+ accesses is aligned throughout the copy sequence.
+
+ The count is off by 0 to 15 bytes, but this is OK because we trim
+ off the last 64 bytes to copy off from the end. Due to this the
+ loop never runs out of bounds. */
+ .p2align 6
+L(copy_long):
+ sub count, count, 64 + 16
+ and tmp1, src, 15
+ bic src, src, 15
+ sub dst, dstin, tmp1
+ add count, count, tmp1
+
+L(loop64):
+ ldr A_q, [src, 16]!
+ str A_q, [dst, 16]
+ ldr A_q, [src, 16]!
+ subs count, count, 64
+ str A_q, [dst, 32]
+ ldr A_q, [src, 16]!
+ str A_q, [dst, 48]
+ ldr A_q, [src, 16]!
+ str A_q, [dst, 64]!
+ b.hi L(loop64)
+
+ /* Write the last full set of 64 bytes. The remainder is at most 64
+ bytes, so it is safe to always copy 64 bytes from the end even if
+ there is just 1 byte left. */
+L(last64):
+ ldr E_q, [srcend, -64]
+ str E_q, [dstend, -64]
+ ldr D_q, [srcend, -48]
+ str D_q, [dstend, -48]
+ ldr C_q, [srcend, -32]
+ str C_q, [dstend, -32]
+ ldr B_q, [srcend, -16]
+ str B_q, [dstend, -16]
+ ret
+
+END (__memcpy_falkor)
+libc_hidden_builtin_def (__memcpy_falkor)
+#endif
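The structure of the large-copy path above (copy 16 bytes unaligned, align the source, run a 64-byte loop, then always copy the final 64 bytes from the end) is easier to see in C. The sketch below is not part of this commit and uses plain memcpy for the chunk moves instead of q-register loads; it shows why no tail loop is needed, since the head, loop and tail stores are allowed to overlap within the destination.

/* Sketch of the falkor large-copy shape for count > 128.  Overlapping
   destination stores are harmless because source and destination do not
   overlap for memcpy. */
#include <stdint.h>
#include <string.h>

void
copy_large (char *dst, const char *src, size_t count)
{
  char *dstend = dst + count;
  const char *srcend = src + count;

  /* Head: first 16 bytes, possibly unaligned.  */
  memcpy (dst, src, 16);

  /* Align the source down to 16 bytes; the destination moves by the same
     offset, so the loop's source accesses are all aligned.  */
  size_t skew = (uintptr_t) src & 15;
  const char *s = src + 16 - skew;
  char *d = dst + 16 - skew;

  /* Bulk: 64 bytes per iteration, stopping once at most 64 source bytes
     remain so the loop never reads past srcend.  */
  while ((size_t) (srcend - s) > 64)
    {
      memcpy (d, s, 64);
      s += 64;
      d += 64;
    }

  /* Tail: always copy the last 64 bytes from the end, even if only one
     byte is actually left uncovered.  */
  memcpy (dstend - 64, srcend - 64, 64);
}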
diff --git a/sysdeps/aarch64/multiarch/memcpy_generic.S b/sysdeps/aarch64/multiarch/memcpy_generic.S
new file mode 100644
index 0000000000..4bc81f4ce5
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memcpy_generic.S
@@ -0,0 +1,44 @@
+/* A Generic Optimized memcpy implementation for AARCH64.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* The actual memcpy and memmove code is in ../memcpy.S. If we are
+ building libc this file defines __memcpy_generic and __memmove_generic.
+ Otherwise the include of ../memcpy.S will define the normal __memcpy
+ and __memmove entry points. */
+
+#include <sysdep.h>
+
+#if IS_IN (libc)
+
+# define MEMCPY __memcpy_generic
+# define MEMMOVE __memmove_generic
+
+/* Do not hide the generic versions of memcpy and memmove; we use them
+ internally. */
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(name)
+
+# ifdef SHARED
+/* It doesn't make sense to send libc-internal memcpy calls through a PLT. */
+ .globl __GI_memcpy; __GI_memcpy = __memcpy_generic
+ .globl __GI_memmove; __GI_memmove = __memmove_generic
+# endif
+
+#endif
+
+#include "../memcpy.S"
diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx.S b/sysdeps/aarch64/multiarch/memcpy_thunderx.S
new file mode 100644
index 0000000000..de494d933d
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memcpy_thunderx.S
@@ -0,0 +1,336 @@
+/* A Thunderx Optimized memcpy implementation for AARCH64.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* The actual code in this memcpy and memmove should be identical to the
+ generic version except for the code under '#ifdef USE_THUNDERX'. This is
+ to make it easier to keep this version and the generic version in sync
+ for changes that are not specific to thunderx. */
+
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
+ *
+ */
+
+#define dstin x0
+#define src x1
+#define count x2
+#define dst x3
+#define srcend x4
+#define dstend x5
+#define A_l x6
+#define A_lw w6
+#define A_h x7
+#define A_hw w7
+#define B_l x8
+#define B_lw w8
+#define B_h x9
+#define C_l x10
+#define C_h x11
+#define D_l x12
+#define D_h x13
+#define E_l src
+#define E_h count
+#define F_l srcend
+#define F_h dst
+#define G_l count
+#define G_h dst
+#define tmp1 x14
+
+/* Copies are split into 3 main cases: small copies of up to 16 bytes,
+ medium copies of 17..96 bytes which are fully unrolled, and large
+ copies of more than 96 bytes. Large copies align the destination and
+ use an unrolled loop processing 64 bytes per iteration.
+ In order to share code with memmove, small and medium copies read all
+ data before writing, allowing any kind of overlap. So small, medium
+ and large backwards memmoves are handled by falling through into memcpy.
+ Overlapping large forward memmoves use a loop that copies backwards.
+*/
+
+#ifndef MEMMOVE
+# define MEMMOVE memmove
+#endif
+#ifndef MEMCPY
+# define MEMCPY memcpy
+#endif
+
+#if IS_IN (libc)
+
+# ifndef USE_THUNDERX2
+# undef MEMCPY
+# define MEMCPY __memcpy_thunderx
+# undef MEMMOVE
+# define MEMMOVE __memmove_thunderx
+# define USE_THUNDERX
+# endif
+
+ENTRY_ALIGN (MEMMOVE, 6)
+
+ DELOUSE (0)
+ DELOUSE (1)
+ DELOUSE (2)
+
+ sub tmp1, dstin, src
+ cmp count, 96
+ ccmp tmp1, count, 2, hi
+ b.lo L(move_long)
+
+ /* Common case falls through into memcpy. */
+END (MEMMOVE)
+libc_hidden_builtin_def (MEMMOVE)
+ENTRY (MEMCPY)
+
+ DELOUSE (0)
+ DELOUSE (1)
+ DELOUSE (2)
+
+ prfm PLDL1KEEP, [src]
+ add srcend, src, count
+ add dstend, dstin, count
+ cmp count, 16
+ b.ls L(copy16)
+ cmp count, 96
+ b.hi L(copy_long)
+
+ /* Medium copies: 17..96 bytes. */
+ sub tmp1, count, 1
+ ldp A_l, A_h, [src]
+ tbnz tmp1, 6, L(copy96)
+ ldp D_l, D_h, [srcend, -16]
+ tbz tmp1, 5, 1f
+ ldp B_l, B_h, [src, 16]
+ ldp C_l, C_h, [srcend, -32]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstend, -32]
+1:
+ stp A_l, A_h, [dstin]
+ stp D_l, D_h, [dstend, -16]
+ ret
+
+ .p2align 4
+ /* Small copies: 0..16 bytes. */
+L(copy16):
+ cmp count, 8
+ b.lo 1f
+ ldr A_l, [src]
+ ldr A_h, [srcend, -8]
+ str A_l, [dstin]
+ str A_h, [dstend, -8]
+ ret
+ .p2align 4
+1:
+ tbz count, 2, 1f
+ ldr A_lw, [src]
+ ldr A_hw, [srcend, -4]
+ str A_lw, [dstin]
+ str A_hw, [dstend, -4]
+ ret
+
+ /* Copy 0..3 bytes. Use a branchless sequence that copies the same
+ byte 3 times if count==1, or the 2nd byte twice if count==2. */
+1:
+ cbz count, 2f
+ lsr tmp1, count, 1
+ ldrb A_lw, [src]
+ ldrb A_hw, [srcend, -1]
+ ldrb B_lw, [src, tmp1]
+ strb A_lw, [dstin]
+ strb B_lw, [dstin, tmp1]
+ strb A_hw, [dstend, -1]
+2: ret
+
+ .p2align 4
+ /* Copy 64..96 bytes. Copy 64 bytes from the start and
+ 32 bytes from the end. */
+L(copy96):
+ ldp B_l, B_h, [src, 16]
+ ldp C_l, C_h, [src, 32]
+ ldp D_l, D_h, [src, 48]
+ ldp E_l, E_h, [srcend, -32]
+ ldp F_l, F_h, [srcend, -16]
+ stp A_l, A_h, [dstin]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstin, 32]
+ stp D_l, D_h, [dstin, 48]
+ stp E_l, E_h, [dstend, -32]
+ stp F_l, F_h, [dstend, -16]
+ ret
+
+ /* Align DST to 16 byte alignment so that we don't cross cache line
+ boundaries on both loads and stores. There are at least 96 bytes
+ to copy, so copy 16 bytes unaligned and then align. The loop
+ copies 64 bytes per iteration and prefetches one iteration ahead. */
+
+ .p2align 4
+L(copy_long):
+
+# if defined(USE_THUNDERX) || defined (USE_THUNDERX2)
+
+ /* On thunderx, large memcpys are helped by software prefetching.
+ This loop is identical to the one below it but with prefetching
+ instructions included. For copies of less than 32768 bytes the
+ prefetching does not help and slows the code down, so we only
+ use the prefetching loop for the largest memcpys. */
+
+ cmp count, #32768
+ b.lo L(copy_long_without_prefetch)
+ and tmp1, dstin, 15
+ bic dst, dstin, 15
+ ldp D_l, D_h, [src]
+ sub src, src, tmp1
+# if defined(USE_THUNDERX)
+ prfm pldl1strm, [src, 384]
+# elif defined(USE_THUNDERX2)
+ prfm pldl1strm, [src, 256]
+# endif
+ add count, count, tmp1 /* Count is now 16 too large. */
+ ldp A_l, A_h, [src, 16]
+ stp D_l, D_h, [dstin]
+ ldp B_l, B_h, [src, 32]
+ ldp C_l, C_h, [src, 48]
+ ldp D_l, D_h, [src, 64]!
+ subs count, count, 128 + 16 /* Test and readjust count. */
+
+L(prefetch_loop64):
+# if defined(USE_THUNDERX)
+ tbz src, #6, 1f
+ prfm pldl1strm, [src, 512]
+1:
+# elif defined(USE_THUNDERX2)
+ prfm pldl1strm, [src, 256]
+# endif
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [src, 16]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [src, 32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [src, 48]
+ stp D_l, D_h, [dst, 64]!
+ ldp D_l, D_h, [src, 64]!
+ subs count, count, 64
+ b.hi L(prefetch_loop64)
+ b L(last64)
+
+L(copy_long_without_prefetch):
+# endif
+
+ and tmp1, dstin, 15
+ bic dst, dstin, 15
+ ldp D_l, D_h, [src]
+ sub src, src, tmp1
+ add count, count, tmp1 /* Count is now 16 too large. */
+ ldp A_l, A_h, [src, 16]
+ stp D_l, D_h, [dstin]
+ ldp B_l, B_h, [src, 32]
+ ldp C_l, C_h, [src, 48]
+ ldp D_l, D_h, [src, 64]!
+ subs count, count, 128 + 16 /* Test and readjust count. */
+ b.ls L(last64)
+L(loop64):
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [src, 16]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [src, 32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [src, 48]
+ stp D_l, D_h, [dst, 64]!
+ ldp D_l, D_h, [src, 64]!
+ subs count, count, 64
+ b.hi L(loop64)
+
+ /* Write the last full set of 64 bytes. The remainder is at most 64
+ bytes, so it is safe to always copy 64 bytes from the end even if
+ there is just 1 byte left. */
+L(last64):
+ ldp E_l, E_h, [srcend, -64]
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [srcend, -48]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [srcend, -32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [srcend, -16]
+ stp D_l, D_h, [dst, 64]
+ stp E_l, E_h, [dstend, -64]
+ stp A_l, A_h, [dstend, -48]
+ stp B_l, B_h, [dstend, -32]
+ stp C_l, C_h, [dstend, -16]
+ ret
+
+ .p2align 4
+L(move_long):
+ cbz tmp1, 3f
+
+ add srcend, src, count
+ add dstend, dstin, count
+
+ /* Align dstend to 16 byte alignment so that we don't cross cache line
+ boundaries on both loads and stores. There are at least 96 bytes
+ to copy, so copy 16 bytes unaligned and then align. The loop
+ copies 64 bytes per iteration and prefetches one iteration ahead. */
+
+ and tmp1, dstend, 15
+ ldp D_l, D_h, [srcend, -16]
+ sub srcend, srcend, tmp1
+ sub count, count, tmp1
+ ldp A_l, A_h, [srcend, -16]
+ stp D_l, D_h, [dstend, -16]
+ ldp B_l, B_h, [srcend, -32]
+ ldp C_l, C_h, [srcend, -48]
+ ldp D_l, D_h, [srcend, -64]!
+ sub dstend, dstend, tmp1
+ subs count, count, 128
+ b.ls 2f
+
+ nop
+1:
+ stp A_l, A_h, [dstend, -16]
+ ldp A_l, A_h, [srcend, -16]
+ stp B_l, B_h, [dstend, -32]
+ ldp B_l, B_h, [srcend, -32]
+ stp C_l, C_h, [dstend, -48]
+ ldp C_l, C_h, [srcend, -48]
+ stp D_l, D_h, [dstend, -64]!
+ ldp D_l, D_h, [srcend, -64]!
+ subs count, count, 64
+ b.hi 1b
+
+ /* Write the last full set of 64 bytes. The remainder is at most 64
+ bytes, so it is safe to always copy 64 bytes from the start even if
+ there is just 1 byte left. */
+2:
+ ldp G_l, G_h, [src, 48]
+ stp A_l, A_h, [dstend, -16]
+ ldp A_l, A_h, [src, 32]
+ stp B_l, B_h, [dstend, -32]
+ ldp B_l, B_h, [src, 16]
+ stp C_l, C_h, [dstend, -48]
+ ldp C_l, C_h, [src]
+ stp D_l, D_h, [dstend, -64]
+ stp G_l, G_h, [dstin, 48]
+ stp A_l, A_h, [dstin, 32]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstin]
+3: ret
+
+END (MEMCPY)
+libc_hidden_builtin_def (MEMCPY)
+
+#endif
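The memmove entry above decides between falling through into memcpy and taking the backward path with a single unsigned comparison (the sub/ccmp/b.lo sequence). A small C sketch of that test, not part of the commit:

/* Sketch of the overlap test at the memmove entry: tmp1 = dst - src is
   computed modulo 2^64, and an unsigned tmp1 < count means the destination
   starts inside the source range, so a forward copy would clobber source
   bytes before they are read. */
#include <stdint.h>
#include <stddef.h>

int
needs_backward_copy (const void *dst, const void *src, size_t count)
{
  uint64_t diff = (uint64_t) ((uintptr_t) dst - (uintptr_t) src);

  /* Copies of 96 bytes or less load everything before storing anything,
     so they tolerate any overlap and always go forward.  */
  return count > 96 && diff < count;
}

When the test is true the code branches to L(move_long), which copies from the end toward the start.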
diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx2.S b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
new file mode 100644
index 0000000000..8501abf725
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
@@ -0,0 +1,27 @@
+/* A Thunderx2 Optimized memcpy implementation for AARCH64.
+ Copyright (C) 2018 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* The actual code in this memcpy and memmove is in memcpy_thunderx.S.
+ The only real differences are with the prefetching instructions. */
+
+#define MEMCPY __memcpy_thunderx2
+#define MEMMOVE __memmove_thunderx2
+#define USE_THUNDERX2
+
+#include "memcpy_thunderx.S"
diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c
new file mode 100644
index 0000000000..e69d816291
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memmove.c
@@ -0,0 +1,44 @@
+/* Multiple versions of memmove. AARCH64 version.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for the definition in libc. */
+
+#if IS_IN (libc)
+/* Redefine memmove so that the compiler won't complain about the type
+ mismatch with the IFUNC selector in strong_alias, below. */
+# undef memmove
+# define memmove __redirect_memmove
+# include <string.h>
+# include <init-arch.h>
+
+extern __typeof (__redirect_memmove) __libc_memmove;
+
+extern __typeof (__redirect_memmove) __memmove_generic attribute_hidden;
+extern __typeof (__redirect_memmove) __memmove_thunderx attribute_hidden;
+extern __typeof (__redirect_memmove) __memmove_falkor attribute_hidden;
+
+libc_ifunc (__libc_memmove,
+ (IS_THUNDERX (midr)
+ ? __memmove_thunderx
+ : (IS_FALKOR (midr) || IS_PHECDA (midr)
+ ? __memmove_falkor
+ : __memmove_generic)));
+
+# undef memmove
+strong_alias (__libc_memmove, memmove);
+#endif
diff --git a/sysdeps/aarch64/multiarch/memmove_falkor.S b/sysdeps/aarch64/multiarch/memmove_falkor.S
new file mode 100644
index 0000000000..5163f68e53
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memmove_falkor.S
@@ -0,0 +1,225 @@
+/* Copyright (C) 2017-2018 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* Assumptions: ARMv8-a, AArch64, falkor, unaligned accesses. */
+
+#define dstin x0
+#define src x1
+#define count x2
+#define dst x3
+#define srcend x4
+#define dstend x5
+#define A_x x6
+#define B_x x7
+#define A_w w6
+#define B_w w7
+#define tmp1 x14
+
+#define Q_q q6
+#define A_q q22
+#define B_q q18
+#define C_q q19
+#define D_q q20
+#define E_q q21
+#define F_q q17
+#define G_q q23
+
+/* RATIONALE:
+
+ The move has 4 distinct parts:
+ * Small moves of 16 bytes and under
+ * Medium sized moves of 17-96 bytes
+ * Large moves where the source address is higher than the destination
+ (forward copies)
+ * Large moves where the destination address is higher than the source
+ (copy backward, or move).
+
+ We use only two registers q6 and q22 for the moves and move 32 bytes at a
+ time to correctly train the hardware prefetcher for better throughput. */
+ENTRY_ALIGN (__memmove_falkor, 6)
+
+ sub tmp1, dstin, src
+ add srcend, src, count
+ add dstend, dstin, count
+ cmp count, 96
+ ccmp tmp1, count, 2, hi
+ b.lo L(move_long)
+
+ cmp count, 16
+ b.ls L(copy16)
+ cmp count, 96
+ b.hi L(copy_long)
+
+ /* Medium copies: 17..96 bytes. */
+ sub tmp1, count, 1
+ ldr A_q, [src]
+ tbnz tmp1, 6, L(copy96)
+ ldr D_q, [srcend, -16]
+ tbz tmp1, 5, 1f
+ ldr B_q, [src, 16]
+ ldr C_q, [srcend, -32]
+ str B_q, [dstin, 16]
+ str C_q, [dstend, -32]
+1:
+ str A_q, [dstin]
+ str D_q, [dstend, -16]
+ ret
+
+ .p2align 4
+ /* Small copies: 0..16 bytes. */
+L(copy16):
+ cmp count, 8
+ b.lo 1f
+ ldr A_x, [src]
+ ldr B_x, [srcend, -8]
+ str A_x, [dstin]
+ str B_x, [dstend, -8]
+ ret
+ .p2align 4
+1:
+ /* 4-7 */
+ tbz count, 2, 1f
+ ldr A_w, [src]
+ ldr B_w, [srcend, -4]
+ str A_w, [dstin]
+ str B_w, [dstend, -4]
+ ret
+ .p2align 4
+1:
+ /* 2-3 */
+ tbz count, 1, 1f
+ ldrh A_w, [src]
+ ldrh B_w, [srcend, -2]
+ strh A_w, [dstin]
+ strh B_w, [dstend, -2]
+ ret
+ .p2align 4
+1:
+ /* 0-1 */
+ tbz count, 0, 1f
+ ldrb A_w, [src]
+ strb A_w, [dstin]
+1: ret
+
+ .p2align 4
+ /* Copy 64..96 bytes. Copy 64 bytes from the start and
+ 32 bytes from the end. */
+L(copy96):
+ ldr B_q, [src, 16]
+ ldr C_q, [src, 32]
+ ldr D_q, [src, 48]
+ ldr E_q, [srcend, -32]
+ ldr F_q, [srcend, -16]
+ str A_q, [dstin]
+ str B_q, [dstin, 16]
+ str C_q, [dstin, 32]
+ str D_q, [dstin, 48]
+ str E_q, [dstend, -32]
+ str F_q, [dstend, -16]
+ ret
+
+ /* Align SRC to 16 byte alignment so that we don't cross cache line
+ boundaries on both loads and stores. There are at least 96 bytes
+ to copy, so copy 16 bytes unaligned and then align. The loop
+ copies 32 bytes per iteration and prefetches one iteration ahead. */
+
+ .p2align 4
+L(copy_long):
+ ldr A_q, [src]
+ and tmp1, src, 15
+ bic src, src, 15
+ sub dst, dstin, tmp1
+ add count, count, tmp1 /* Count is now 16 too large. */
+ ldr Q_q, [src, 16]!
+ str A_q, [dstin]
+ ldr A_q, [src, 16]!
+ subs count, count, 32 + 64 + 16 /* Test and readjust count. */
+ b.ls L(last64)
+
+L(loop64):
+ subs count, count, 32
+ str Q_q, [dst, 16]
+ ldr Q_q, [src, 16]!
+ str A_q, [dst, 32]!
+ ldr A_q, [src, 16]!
+ b.hi L(loop64)
+
+ /* Write the last full set of 64 bytes. The remainder is at most 64
+ bytes and at least 33 bytes, so it is safe to always copy 64 bytes
+ from the end. */
+L(last64):
+ ldr C_q, [srcend, -64]
+ str Q_q, [dst, 16]
+ ldr B_q, [srcend, -48]
+ str A_q, [dst, 32]
+ ldr A_q, [srcend, -32]
+ ldr D_q, [srcend, -16]
+ str C_q, [dstend, -64]
+ str B_q, [dstend, -48]
+ str A_q, [dstend, -32]
+ str D_q, [dstend, -16]
+ ret
+
+ .p2align 4
+L(move_long):
+ cbz tmp1, 3f
+
+ /* Align SRCEND to 16 byte alignment so that we don't cross cache line
+ boundaries on both loads and stores. There are at least 96 bytes
+ to copy, so copy 16 bytes unaligned and then align. The loop
+ copies 32 bytes per iteration and prefetches one iteration ahead. */
+
+ ldr A_q, [srcend, -16]
+ and tmp1, srcend, 15
+ sub srcend, srcend, tmp1
+ ldr Q_q, [srcend, -16]!
+ str A_q, [dstend, -16]
+ sub count, count, tmp1
+ ldr A_q, [srcend, -16]!
+ sub dstend, dstend, tmp1
+ subs count, count, 32 + 64
+ b.ls 2f
+
+1:
+ subs count, count, 32
+ str Q_q, [dstend, -16]
+ ldr Q_q, [srcend, -16]!
+ str A_q, [dstend, -32]!
+ ldr A_q, [srcend, -16]!
+ b.hi 1b
+
+ /* Write the last full set of 64 bytes. The remainder is at most 64
+ bytes and at least 33 bytes, so it is safe to always copy 64 bytes
+ from the start. */
+2:
+ ldr C_q, [src, 48]
+ str Q_q, [dstend, -16]
+ ldr B_q, [src, 32]
+ str A_q, [dstend, -32]
+ ldr A_q, [src, 16]
+ ldr D_q, [src]
+ str C_q, [dstin, 48]
+ str B_q, [dstin, 32]
+ str A_q, [dstin, 16]
+ str D_q, [dstin]
+3: ret
+
+END (__memmove_falkor)
+libc_hidden_builtin_def (__memmove_falkor)
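The L(move_long) path above implements the last case in the RATIONALE comment: when the destination is above the source and the ranges overlap, the copy runs from the end toward the start so every source byte is read before it can be overwritten. A byte-wise C sketch of that direction choice (purely illustrative; the real code moves 32 bytes per iteration through q6/q22 after aligning srcend):

/* Backward copy sketch: safe when dst > src even if the ranges overlap. */
#include <stddef.h>

void
move_backward (char *dst, const char *src, size_t count)
{
  while (count > 0)
    {
      count--;
      dst[count] = src[count];
    }
}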
diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
new file mode 100644
index 0000000000..d74ed3a549
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memset.c
@@ -0,0 +1,41 @@
+/* Multiple versions of memset. AARCH64 version.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for the definition in libc. */
+
+#if IS_IN (libc)
+/* Redefine memset so that the compiler won't complain about the type
+ mismatch with the IFUNC selector in strong_alias, below. */
+# undef memset
+# define memset __redirect_memset
+# include <string.h>
+# include <init-arch.h>
+
+extern __typeof (__redirect_memset) __libc_memset;
+
+extern __typeof (__redirect_memset) __memset_falkor attribute_hidden;
+extern __typeof (__redirect_memset) __memset_generic attribute_hidden;
+
+libc_ifunc (__libc_memset,
+ ((IS_FALKOR (midr) || IS_PHECDA (midr)) && zva_size == 64
+ ? __memset_falkor
+ : __memset_generic));
+
+# undef memset
+strong_alias (__libc_memset, memset);
+#endif
diff --git a/sysdeps/aarch64/multiarch/memset_falkor.S b/sysdeps/aarch64/multiarch/memset_falkor.S
new file mode 100644
index 0000000000..16849a5afb
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memset_falkor.S
@@ -0,0 +1,53 @@
+/* Memset for falkor.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <memset-reg.h>
+
+/* Reading dczid_el0 is expensive on falkor so move it into the ifunc
+ resolver and assume ZVA size of 64 bytes. The IFUNC resolver takes care to
+ use this function only when ZVA is enabled. */
+
+#if IS_IN (libc)
+.macro zva_macro
+ .p2align 4
+ /* Write the first and last 64 byte aligned block using stp rather
+ than using DC ZVA. This is faster on some cores. */
+ str q0, [dst, 16]
+ stp q0, q0, [dst, 32]
+ bic dst, dst, 63
+ stp q0, q0, [dst, 64]
+ stp q0, q0, [dst, 96]
+ sub count, dstend, dst /* Count is now 128 too large. */
+ sub count, count, 128+64+64 /* Adjust count and bias for loop. */
+ add dst, dst, 128
+1: dc zva, dst
+ add dst, dst, 64
+ subs count, count, 64
+ b.hi 1b
+ stp q0, q0, [dst, 0]
+ stp q0, q0, [dst, 32]
+ stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
+ ret
+.endm
+
+# define ZVA_MACRO zva_macro
+# define MEMSET __memset_falkor
+# include <sysdeps/aarch64/memset.S>
+#endif
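The zva_size that the resolver checks ultimately comes from DCZID_EL0, which this file deliberately avoids reading at run time. As a standalone sketch (not part of this commit), the block size is conventionally derived as shown below; the field layout is taken from the ARM architecture manual and the snippet only compiles for an AArch64 target.

/* Illustrative derivation of the DC ZVA block size from DCZID_EL0:
   bits [3:0] hold log2 of the block size in 4-byte words, and bit 4 (DZP)
   set means DC ZVA must not be used. */
#include <stdint.h>

unsigned int
get_zva_size (void)
{
  uint64_t dczid;
  asm ("mrs %0, dczid_el0" : "=r" (dczid));

  if (dczid & (1 << 4))          /* DZP: zeroing by DC ZVA prohibited.  */
    return 0;
  return 4U << (dczid & 0xf);    /* Block size in bytes; the falkor path
                                    assumes this is 64.  */
}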
diff --git a/sysdeps/aarch64/multiarch/memset_generic.S b/sysdeps/aarch64/multiarch/memset_generic.S
new file mode 100644
index 0000000000..345a88b94f
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memset_generic.S
@@ -0,0 +1,27 @@
+/* Memset for aarch64, default version for internal use.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+# define MEMSET __memset_generic
+/* Add a hidden definition for use within libc.so. */
+# ifdef SHARED
+ .globl __GI_memset; __GI_memset = __memset_generic
+# endif
+# include <sysdeps/aarch64/memset.S>
+#endif
diff --git a/sysdeps/aarch64/multiarch/rtld-memset.S b/sysdeps/aarch64/multiarch/rtld-memset.S
new file mode 100644
index 0000000000..8cdc6e1f50
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/rtld-memset.S
@@ -0,0 +1,23 @@
+/* Memset for aarch64, for the dynamic linker.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (rtld)
+# define MEMSET memset
+# include <sysdeps/aarch64/memset.S>
+#endif