path: root/sysdeps/aarch64/multiarch
author    Samuel Thibault <samuel.thibault@ens-lyon.org>  2018-12-27 19:01:57 +0000
committer Samuel Thibault <samuel.thibault@ens-lyon.org>  2018-12-27 19:01:57 +0000
commit    cab56836b146bc129f1ad43f0393d95a9deca63a (patch)
tree      4f4e655319bbac78fca170da05275c127429b460 /sysdeps/aarch64/multiarch
parent    04ac1241a4cd004872282c2c82ec37fa33925292 (diff)
parent    82dd75a7f436a19047325d62182590c9f9e23a78 (diff)
Merge branch 't/tls' into refs/top-bases/t/tls-threadvar
Diffstat (limited to 'sysdeps/aarch64/multiarch')
-rw-r--r--  sysdeps/aarch64/multiarch/Makefile             |   4
-rw-r--r--  sysdeps/aarch64/multiarch/ifunc-impl-list.c    |  57
-rw-r--r--  sysdeps/aarch64/multiarch/init-arch.h          |  25
-rw-r--r--  sysdeps/aarch64/multiarch/memcpy.c             |  47
-rw-r--r--  sysdeps/aarch64/multiarch/memcpy_falkor.S      | 191
-rw-r--r--  sysdeps/aarch64/multiarch/memcpy_generic.S     |  44
-rw-r--r--  sysdeps/aarch64/multiarch/memcpy_thunderx.S    | 336
-rw-r--r--  sysdeps/aarch64/multiarch/memcpy_thunderx2.S   |  27
-rw-r--r--  sysdeps/aarch64/multiarch/memmove.c            |  44
-rw-r--r--  sysdeps/aarch64/multiarch/memmove_falkor.S     | 225
-rw-r--r--  sysdeps/aarch64/multiarch/memset.c             |  41
-rw-r--r--  sysdeps/aarch64/multiarch/memset_falkor.S      |  53
-rw-r--r--  sysdeps/aarch64/multiarch/memset_generic.S     |  27
-rw-r--r--  sysdeps/aarch64/multiarch/rtld-memset.S        |  23
14 files changed, 1144 insertions, 0 deletions
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
new file mode 100644
index 0000000000..57ffdf7238
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -0,0 +1,4 @@
+ifeq ($(subdir),string)
+sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \
+ memcpy_falkor memmove_falkor memset_generic memset_falkor
+endif
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
new file mode 100644
index 0000000000..e55be80103
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -0,0 +1,57 @@
+/* Enumerate available IFUNC implementations of a function. AARCH64 version.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <assert.h>
+#include <string.h>
+#include <wchar.h>
+#include <ldsodefs.h>
+#include <ifunc-impl-list.h>
+#include <init-arch.h>
+#include <stdio.h>
+
+/* Maximum number of IFUNC implementations. */
+#define MAX_IFUNC 4
+
+size_t
+__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ size_t max)
+{
+ assert (max >= MAX_IFUNC);
+
+ size_t i = 0;
+
+ INIT_ARCH ();
+
+ /* Support sysdeps/aarch64/multiarch/memcpy.c and memmove.c. */
+ IFUNC_IMPL (i, name, memcpy,
+ IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx)
+ IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx2)
+ IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_falkor)
+ IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
+ IFUNC_IMPL (i, name, memmove,
+ IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
+ IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_falkor)
+ IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
+ IFUNC_IMPL (i, name, memset,
+ /* Enable this on non-falkor processors too so that other cores
+ can do a comparative analysis with __memset_generic. */
+ IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_falkor)
+ IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
+
+ return i;
+}
diff --git a/sysdeps/aarch64/multiarch/init-arch.h b/sysdeps/aarch64/multiarch/init-arch.h
new file mode 100644
index 0000000000..d1e5703cb2
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/init-arch.h
@@ -0,0 +1,25 @@
+/* Define INIT_ARCH so that midr is initialized before use by IFUNCs.
+ This file is part of the GNU C Library.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <ldsodefs.h>
+
+#define INIT_ARCH() \
+ uint64_t __attribute__((unused)) midr = \
+ GLRO(dl_aarch64_cpu_features).midr_el1; \
+ unsigned __attribute__((unused)) zva_size = \
+ GLRO(dl_aarch64_cpu_features).zva_size;
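For readers unfamiliar with the midr value exposed by INIT_ARCH: the IS_THUNDERX/IS_FALKOR-style checks used by the selectors below key on fields of the MIDR_EL1 register. The following standalone sketch (not part of this commit) shows the field layout as defined by the ARM architecture manual; the sample value and the program itself are purely illustrative, and glibc's real decoding macros live in cpu-features.h.

/* Illustrative decoding of MIDR_EL1 into the fields that CPU checks such as
   IS_FALKOR (midr) rely on.  Field positions follow the ARM architecture
   manual; the sample value below is hypothetical. */
#include <stdint.h>
#include <stdio.h>

#define MIDR_IMPLEMENTOR(midr) (((midr) >> 24) & 0xff)   /* bits [31:24] */
#define MIDR_VARIANT(midr)     (((midr) >> 20) & 0xf)    /* bits [23:20] */
#define MIDR_PARTNUM(midr)     (((midr) >> 4) & 0xfff)   /* bits [15:4]  */
#define MIDR_REVISION(midr)    ((midr) & 0xf)            /* bits [3:0]   */

int
main (void)
{
  uint64_t midr = 0x510fc000;   /* hypothetical sample MIDR_EL1 value */
  printf ("implementer=0x%02llx part=0x%03llx variant=%llu rev=%llu\n",
          (unsigned long long) MIDR_IMPLEMENTOR (midr),
          (unsigned long long) MIDR_PARTNUM (midr),
          (unsigned long long) MIDR_VARIANT (midr),
          (unsigned long long) MIDR_REVISION (midr));
  return 0;
}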
diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c
new file mode 100644
index 0000000000..4a04a63b0f
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memcpy.c
@@ -0,0 +1,47 @@
+/* Multiple versions of memcpy. AARCH64 version.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for the definition in libc. */
+
+#if IS_IN (libc)
+/* Redefine memcpy so that the compiler won't complain about the type
+ mismatch with the IFUNC selector in strong_alias, below. */
+# undef memcpy
+# define memcpy __redirect_memcpy
+# include <string.h>
+# include <init-arch.h>
+
+extern __typeof (__redirect_memcpy) __libc_memcpy;
+
+extern __typeof (__redirect_memcpy) __memcpy_generic attribute_hidden;
+extern __typeof (__redirect_memcpy) __memcpy_thunderx attribute_hidden;
+extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden;
+extern __typeof (__redirect_memcpy) __memcpy_falkor attribute_hidden;
+
+libc_ifunc (__libc_memcpy,
+ (IS_THUNDERX (midr)
+ ? __memcpy_thunderx
+ : (IS_FALKOR (midr) || IS_PHECDA (midr)
+ ? __memcpy_falkor
+ : (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr)
+ ? __memcpy_thunderx2
+ : __memcpy_generic))));
+
+# undef memcpy
+strong_alias (__libc_memcpy, memcpy);
+#endif
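The libc_ifunc macro above expands to a GNU indirect function: the resolver runs once during relocation and its result is patched in, so later memcpy calls jump straight to the chosen variant. As a standalone sketch (not part of this commit), the same dispatch pattern can be written with GCC's ifunc attribute on an ELF/GNU target; every name below, including cpu_is_falkor_like, is a hypothetical stand-in for the MIDR-based checks used here.

/* Minimal sketch of IFUNC dispatch using the GCC ifunc attribute.  The
   selection logic is a placeholder, not glibc's. */
#include <stddef.h>

typedef void *(*memcpy_fn) (void *, const void *, size_t);

static void *
my_memcpy_generic (void *dst, const void *src, size_t n)
{
  char *d = dst;
  const char *s = src;
  while (n--)
    *d++ = *s++;
  return dst;
}

static void *
my_memcpy_tuned (void *dst, const void *src, size_t n)
{
  /* A CPU-specific variant would go here; reuse the generic one.  */
  return my_memcpy_generic (dst, src, n);
}

static int
cpu_is_falkor_like (void)
{
  return 0;   /* Stand-in for an IS_FALKOR (midr)-style check.  */
}

/* Runs once at relocation time; the returned pointer is what all later
   calls to my_memcpy actually invoke.  */
static memcpy_fn
resolve_my_memcpy (void)
{
  return cpu_is_falkor_like () ? my_memcpy_tuned : my_memcpy_generic;
}

void *my_memcpy (void *, const void *, size_t)
     __attribute__ ((ifunc ("resolve_my_memcpy")));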
diff --git a/sysdeps/aarch64/multiarch/memcpy_falkor.S b/sysdeps/aarch64/multiarch/memcpy_falkor.S
new file mode 100644
index 0000000000..cdc2de4750
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memcpy_falkor.S
@@ -0,0 +1,191 @@
+/* Optimized memcpy for Qualcomm Falkor processor.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* Assumptions:
+
+ ARMv8-a, AArch64, falkor, unaligned accesses. */
+
+#define dstin x0
+#define src x1
+#define count x2
+#define dst x3
+#define srcend x4
+#define dstend x5
+#define tmp1 x14
+#define A_x x6
+#define B_x x7
+#define A_w w6
+#define B_w w7
+
+#define A_q q0
+#define B_q q1
+#define C_q q2
+#define D_q q3
+#define E_q q4
+#define F_q q5
+#define G_q q6
+
+/* Copies are split into 3 main cases:
+
+ 1. Small copies of up to 32 bytes
+ 2. Medium copies of 33..128 bytes which are fully unrolled
+ 3. Large copies of more than 128 bytes.
+
+ Large copies align the source to a quad word and use an unrolled loop
+ processing 64 bytes per iteration.
+
+ FALKOR-SPECIFIC DESIGN:
+
+ The smallest copies (32 bytes or less) focus on optimal pipeline usage,
+ which is why the redundant copies of 0-3 bytes have been replaced with
+ conditionals, since the former would unnecessarily break across multiple
+ issue groups. The medium copy group has been enlarged to 128 bytes since
+ bumping the small copies up to 32 bytes allows us to do that without
+ cost and also allows us to reduce the size of the prep code before loop64.
+
+ The copy loop uses only one register q0. This is to ensure that all loads
+ hit a single hardware prefetcher which can get correctly trained to prefetch
+ a single stream.
+
+ The non-temporal stores help optimize cache utilization. */
+
+#if IS_IN (libc)
+ENTRY_ALIGN (__memcpy_falkor, 6)
+
+ cmp count, 32
+ add srcend, src, count
+ add dstend, dstin, count
+ b.ls L(copy32)
+ ldr A_q, [src]
+ cmp count, 128
+ str A_q, [dstin]
+ b.hi L(copy_long)
+
+ /* Medium copies: 33..128 bytes. */
+ sub tmp1, count, 1
+ ldr A_q, [src, 16]
+ ldr B_q, [srcend, -32]
+ ldr C_q, [srcend, -16]
+ tbz tmp1, 6, 1f
+ ldr D_q, [src, 32]
+ ldr E_q, [src, 48]
+ str D_q, [dstin, 32]
+ str E_q, [dstin, 48]
+ ldr F_q, [srcend, -64]
+ ldr G_q, [srcend, -48]
+ str F_q, [dstend, -64]
+ str G_q, [dstend, -48]
+1:
+ str A_q, [dstin, 16]
+ str B_q, [dstend, -32]
+ str C_q, [dstend, -16]
+ ret
+
+ .p2align 4
+ /* Small copies: 0..32 bytes. */
+L(copy32):
+ /* 16-32 */
+ cmp count, 16
+ b.lo 1f
+ ldr A_q, [src]
+ ldr B_q, [srcend, -16]
+ str A_q, [dstin]
+ str B_q, [dstend, -16]
+ ret
+ .p2align 4
+1:
+ /* 8-15 */
+ tbz count, 3, 1f
+ ldr A_x, [src]
+ ldr B_x, [srcend, -8]
+ str A_x, [dstin]
+ str B_x, [dstend, -8]
+ ret
+ .p2align 4
+1:
+ /* 4-7 */
+ tbz count, 2, 1f
+ ldr A_w, [src]
+ ldr B_w, [srcend, -4]
+ str A_w, [dstin]
+ str B_w, [dstend, -4]
+ ret
+ .p2align 4
+1:
+ /* 2-3 */
+ tbz count, 1, 1f
+ ldrh A_w, [src]
+ ldrh B_w, [srcend, -2]
+ strh A_w, [dstin]
+ strh B_w, [dstend, -2]
+ ret
+ .p2align 4
+1:
+ /* 0-1 */
+ tbz count, 0, 1f
+ ldrb A_w, [src]
+ strb A_w, [dstin]
+1:
+ ret
+
+ /* Align SRC to 16 bytes and copy; that way at least one of the
+ accesses is aligned throughout the copy sequence.
+
+ The count is off by 0 to 15 bytes, but this is OK because we trim
+ off the last 64 bytes to copy off from the end. Due to this the
+ loop never runs out of bounds. */
+ .p2align 6
+L(copy_long):
+ sub count, count, 64 + 16
+ and tmp1, src, 15
+ bic src, src, 15
+ sub dst, dstin, tmp1
+ add count, count, tmp1
+
+L(loop64):
+ ldr A_q, [src, 16]!
+ str A_q, [dst, 16]
+ ldr A_q, [src, 16]!
+ subs count, count, 64
+ str A_q, [dst, 32]
+ ldr A_q, [src, 16]!
+ str A_q, [dst, 48]
+ ldr A_q, [src, 16]!
+ str A_q, [dst, 64]!
+ b.hi L(loop64)
+
+ /* Write the last full set of 64 bytes. The remainder is at most 64
+ bytes, so it is safe to always copy 64 bytes from the end even if
+ there is just 1 byte left. */
+L(last64):
+ ldr E_q, [srcend, -64]
+ str E_q, [dstend, -64]
+ ldr D_q, [srcend, -48]
+ str D_q, [dstend, -48]
+ ldr C_q, [srcend, -32]
+ str C_q, [dstend, -32]
+ ldr B_q, [srcend, -16]
+ str B_q, [dstend, -16]
+ ret
+
+END (__memcpy_falkor)
+libc_hidden_builtin_def (__memcpy_falkor)
+#endif
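The structure of the large-copy path above (copy 16 bytes unaligned, align the source, run a 64-byte loop, then always copy the final 64 bytes from the end) is easier to see in C. The sketch below is not part of this commit and uses plain memcpy for the chunk moves instead of q-register loads; it shows why no tail loop is needed, since the head, loop and tail stores are allowed to overlap within the destination.

/* Sketch of the falkor large-copy shape for count > 128.  Overlapping
   destination stores are harmless because source and destination do not
   overlap for memcpy. */
#include <stdint.h>
#include <string.h>

void
copy_large (char *dst, const char *src, size_t count)
{
  char *dstend = dst + count;
  const char *srcend = src + count;

  /* Head: first 16 bytes, possibly unaligned.  */
  memcpy (dst, src, 16);

  /* Align the source down to 16 bytes; the destination moves by the same
     offset, so the loop's source accesses are all aligned.  */
  size_t skew = (uintptr_t) src & 15;
  const char *s = src + 16 - skew;
  char *d = dst + 16 - skew;

  /* Bulk: 64 bytes per iteration, stopping once at most 64 source bytes
     remain so the loop never reads past srcend.  */
  while ((size_t) (srcend - s) > 64)
    {
      memcpy (d, s, 64);
      s += 64;
      d += 64;
    }

  /* Tail: always copy the last 64 bytes from the end, even if only one
     byte is actually left uncovered.  */
  memcpy (dstend - 64, srcend - 64, 64);
}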
diff --git a/sysdeps/aarch64/multiarch/memcpy_generic.S b/sysdeps/aarch64/multiarch/memcpy_generic.S
new file mode 100644
index 0000000000..4bc81f4ce5
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memcpy_generic.S
@@ -0,0 +1,44 @@
+/* A Generic Optimized memcpy implementation for AARCH64.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* The actual memcpy and memmove code is in ../memcpy.S. If we are
+ building libc this file defines __memcpy_generic and __memmove_generic.
+ Otherwise the include of ../memcpy.S will define the normal __memcpy
+ and __memmove entry points. */
+
+#include <sysdep.h>
+
+#if IS_IN (libc)
+
+# define MEMCPY __memcpy_generic
+# define MEMMOVE __memmove_generic
+
+/* Do not hide the generic versions of memcpy and memmove; we use them
+ internally. */
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(name)
+
+# ifdef SHARED
+/* It doesn't make sense to send libc-internal memcpy calls through a PLT. */
+ .globl __GI_memcpy; __GI_memcpy = __memcpy_generic
+ .globl __GI_memmove; __GI_memmove = __memmove_generic
+# endif
+
+#endif
+
+#include "../memcpy.S"
diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx.S b/sysdeps/aarch64/multiarch/memcpy_thunderx.S
new file mode 100644
index 0000000000..de494d933d
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memcpy_thunderx.S
@@ -0,0 +1,336 @@
+/* A Thunderx Optimized memcpy implementation for AARCH64.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* The actual code in this memcpy and memmove should be identical to the
+ generic version except for the code under '#ifdef USE_THUNDERX'. This is
+ to make it easier to keep this version and the generic version in sync
+ for changes that are not specific to thunderx. */
+
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
+ *
+ */
+
+#define dstin x0
+#define src x1
+#define count x2
+#define dst x3
+#define srcend x4
+#define dstend x5
+#define A_l x6
+#define A_lw w6
+#define A_h x7
+#define A_hw w7
+#define B_l x8
+#define B_lw w8
+#define B_h x9
+#define C_l x10
+#define C_h x11
+#define D_l x12
+#define D_h x13
+#define E_l src
+#define E_h count
+#define F_l srcend
+#define F_h dst
+#define G_l count
+#define G_h dst
+#define tmp1 x14
+
+/* Copies are split into 3 main cases: small copies of up to 16 bytes,
+ medium copies of 17..96 bytes which are fully unrolled, and large
+ copies of more than 96 bytes. Large copies align the destination and
+ use an unrolled loop processing 64 bytes per iteration.
+ In order to share code with memmove, small and medium copies read all
+ data before writing, allowing any kind of overlap. So small, medium
+ and large backwards memmoves are handled by falling through into memcpy.
+ Overlapping large forward memmoves use a loop that copies backwards.
+*/
+
+#ifndef MEMMOVE
+# define MEMMOVE memmove
+#endif
+#ifndef MEMCPY
+# define MEMCPY memcpy
+#endif
+
+#if IS_IN (libc)
+
+# ifndef USE_THUNDERX2
+# undef MEMCPY
+# define MEMCPY __memcpy_thunderx
+# undef MEMMOVE
+# define MEMMOVE __memmove_thunderx
+# define USE_THUNDERX
+# endif
+
+ENTRY_ALIGN (MEMMOVE, 6)
+
+ DELOUSE (0)
+ DELOUSE (1)
+ DELOUSE (2)
+
+ sub tmp1, dstin, src
+ cmp count, 96
+ ccmp tmp1, count, 2, hi
+ b.lo L(move_long)
+
+ /* Common case falls through into memcpy. */
+END (MEMMOVE)
+libc_hidden_builtin_def (MEMMOVE)
+ENTRY (MEMCPY)
+
+ DELOUSE (0)
+ DELOUSE (1)
+ DELOUSE (2)
+
+ prfm PLDL1KEEP, [src]
+ add srcend, src, count
+ add dstend, dstin, count
+ cmp count, 16
+ b.ls L(copy16)
+ cmp count, 96
+ b.hi L(copy_long)
+
+ /* Medium copies: 17..96 bytes. */
+ sub tmp1, count, 1
+ ldp A_l, A_h, [src]
+ tbnz tmp1, 6, L(copy96)
+ ldp D_l, D_h, [srcend, -16]
+ tbz tmp1, 5, 1f
+ ldp B_l, B_h, [src, 16]
+ ldp C_l, C_h, [srcend, -32]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstend, -32]
+1:
+ stp A_l, A_h, [dstin]
+ stp D_l, D_h, [dstend, -16]
+ ret
+
+ .p2align 4
+ /* Small copies: 0..16 bytes. */
+L(copy16):
+ cmp count, 8
+ b.lo 1f
+ ldr A_l, [src]
+ ldr A_h, [srcend, -8]
+ str A_l, [dstin]
+ str A_h, [dstend, -8]
+ ret
+ .p2align 4
+1:
+ tbz count, 2, 1f
+ ldr A_lw, [src]
+ ldr A_hw, [srcend, -4]
+ str A_lw, [dstin]
+ str A_hw, [dstend, -4]
+ ret
+
+ /* Copy 0..3 bytes. Use a branchless sequence that copies the same
+ byte 3 times if count==1, or the 2nd byte twice if count==2. */
+1:
+ cbz count, 2f
+ lsr tmp1, count, 1
+ ldrb A_lw, [src]
+ ldrb A_hw, [srcend, -1]
+ ldrb B_lw, [src, tmp1]
+ strb A_lw, [dstin]
+ strb B_lw, [dstin, tmp1]
+ strb A_hw, [dstend, -1]
+2: ret
+
+ .p2align 4
+ /* Copy 64..96 bytes. Copy 64 bytes from the start and
+ 32 bytes from the end. */
+L(copy96):
+ ldp B_l, B_h, [src, 16]
+ ldp C_l, C_h, [src, 32]
+ ldp D_l, D_h, [src, 48]
+ ldp E_l, E_h, [srcend, -32]
+ ldp F_l, F_h, [srcend, -16]
+ stp A_l, A_h, [dstin]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstin, 32]
+ stp D_l, D_h, [dstin, 48]
+ stp E_l, E_h, [dstend, -32]
+ stp F_l, F_h, [dstend, -16]
+ ret
+
+ /* Align DST to 16 byte alignment so that we don't cross cache line
+ boundaries on both loads and stores. There are at least 96 bytes
+ to copy, so copy 16 bytes unaligned and then align. The loop
+ copies 64 bytes per iteration and prefetches one iteration ahead. */
+
+ .p2align 4
+L(copy_long):
+
+# if defined(USE_THUNDERX) || defined (USE_THUNDERX2)
+
+ /* On thunderx, large memcpys are helped by software prefetching.
+ This loop is identical to the one below it but with prefetching
+ instructions included. For copies of less than 32768 bytes the
+ prefetching does not help and slows the code down, so we only
+ use the prefetching loop for the largest memcpys. */
+
+ cmp count, #32768
+ b.lo L(copy_long_without_prefetch)
+ and tmp1, dstin, 15
+ bic dst, dstin, 15
+ ldp D_l, D_h, [src]
+ sub src, src, tmp1
+# if defined(USE_THUNDERX)
+ prfm pldl1strm, [src, 384]
+# elif defined(USE_THUNDERX2)
+ prfm pldl1strm, [src, 256]
+# endif
+ add count, count, tmp1 /* Count is now 16 too large. */
+ ldp A_l, A_h, [src, 16]
+ stp D_l, D_h, [dstin]
+ ldp B_l, B_h, [src, 32]
+ ldp C_l, C_h, [src, 48]
+ ldp D_l, D_h, [src, 64]!
+ subs count, count, 128 + 16 /* Test and readjust count. */
+
+L(prefetch_loop64):
+# if defined(USE_THUNDERX)
+ tbz src, #6, 1f
+ prfm pldl1strm, [src, 512]
+1:
+# elif defined(USE_THUNDERX2)
+ prfm pldl1strm, [src, 256]
+# endif
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [src, 16]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [src, 32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [src, 48]
+ stp D_l, D_h, [dst, 64]!
+ ldp D_l, D_h, [src, 64]!
+ subs count, count, 64
+ b.hi L(prefetch_loop64)
+ b L(last64)
+
+L(copy_long_without_prefetch):
+# endif
+
+ and tmp1, dstin, 15
+ bic dst, dstin, 15
+ ldp D_l, D_h, [src]
+ sub src, src, tmp1
+ add count, count, tmp1 /* Count is now 16 too large. */
+ ldp A_l, A_h, [src, 16]
+ stp D_l, D_h, [dstin]
+ ldp B_l, B_h, [src, 32]
+ ldp C_l, C_h, [src, 48]
+ ldp D_l, D_h, [src, 64]!
+ subs count, count, 128 + 16 /* Test and readjust count. */
+ b.ls L(last64)
+L(loop64):
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [src, 16]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [src, 32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [src, 48]
+ stp D_l, D_h, [dst, 64]!
+ ldp D_l, D_h, [src, 64]!
+ subs count, count, 64
+ b.hi L(loop64)
+
+ /* Write the last full set of 64 bytes. The remainder is at most 64
+ bytes, so it is safe to always copy 64 bytes from the end even if
+ there is just 1 byte left. */
+L(last64):
+ ldp E_l, E_h, [srcend, -64]
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [srcend, -48]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [srcend, -32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [srcend, -16]
+ stp D_l, D_h, [dst, 64]
+ stp E_l, E_h, [dstend, -64]
+ stp A_l, A_h, [dstend, -48]
+ stp B_l, B_h, [dstend, -32]
+ stp C_l, C_h, [dstend, -16]
+ ret
+
+ .p2align 4
+L(move_long):
+ cbz tmp1, 3f
+
+ add srcend, src, count
+ add dstend, dstin, count
+
+ /* Align dstend to 16 byte alignment so that we don't cross cache line
+ boundaries on both loads and stores. There are at least 96 bytes
+ to copy, so copy 16 bytes unaligned and then align. The loop
+ copies 64 bytes per iteration and prefetches one iteration ahead. */
+
+ and tmp1, dstend, 15
+ ldp D_l, D_h, [srcend, -16]
+ sub srcend, srcend, tmp1
+ sub count, count, tmp1
+ ldp A_l, A_h, [srcend, -16]
+ stp D_l, D_h, [dstend, -16]
+ ldp B_l, B_h, [srcend, -32]
+ ldp C_l, C_h, [srcend, -48]
+ ldp D_l, D_h, [srcend, -64]!
+ sub dstend, dstend, tmp1
+ subs count, count, 128
+ b.ls 2f
+
+ nop
+1:
+ stp A_l, A_h, [dstend, -16]
+ ldp A_l, A_h, [srcend, -16]
+ stp B_l, B_h, [dstend, -32]
+ ldp B_l, B_h, [srcend, -32]
+ stp C_l, C_h, [dstend, -48]
+ ldp C_l, C_h, [srcend, -48]
+ stp D_l, D_h, [dstend, -64]!
+ ldp D_l, D_h, [srcend, -64]!
+ subs count, count, 64
+ b.hi 1b
+
+ /* Write the last full set of 64 bytes. The remainder is at most 64
+ bytes, so it is safe to always copy 64 bytes from the start even if
+ there is just 1 byte left. */
+2:
+ ldp G_l, G_h, [src, 48]
+ stp A_l, A_h, [dstend, -16]
+ ldp A_l, A_h, [src, 32]
+ stp B_l, B_h, [dstend, -32]
+ ldp B_l, B_h, [src, 16]
+ stp C_l, C_h, [dstend, -48]
+ ldp C_l, C_h, [src]
+ stp D_l, D_h, [dstend, -64]
+ stp G_l, G_h, [dstin, 48]
+ stp A_l, A_h, [dstin, 32]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstin]
+3: ret
+
+END (MEMCPY)
+libc_hidden_builtin_def (MEMCPY)
+
+#endif
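The memmove entry above decides between falling through into memcpy and taking the backward path with a single unsigned comparison (the sub/ccmp/b.lo sequence). A small C sketch of that test, not part of the commit:

/* Sketch of the overlap test at the memmove entry: tmp1 = dst - src is
   computed modulo 2^64, and an unsigned tmp1 < count means the destination
   starts inside the source range, so a forward copy would clobber source
   bytes before they are read. */
#include <stdint.h>
#include <stddef.h>

int
needs_backward_copy (const void *dst, const void *src, size_t count)
{
  uint64_t diff = (uint64_t) ((uintptr_t) dst - (uintptr_t) src);

  /* Copies of 96 bytes or less load everything before storing anything,
     so they tolerate any overlap and always go forward.  */
  return count > 96 && diff < count;
}

When the test is true the code branches to L(move_long), which copies from the end toward the start.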
diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx2.S b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
new file mode 100644
index 0000000000..8501abf725
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
@@ -0,0 +1,27 @@
+/* A Thunderx2 Optimized memcpy implementation for AARCH64.
+ Copyright (C) 2018 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* The actual code in this memcpy and memmove is in memcpy_thunderx.S.
+ The only real differences are with the prefetching instructions. */
+
+#define MEMCPY __memcpy_thunderx2
+#define MEMMOVE __memmove_thunderx2
+#define USE_THUNDERX2
+
+#include "memcpy_thunderx.S"
diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c
new file mode 100644
index 0000000000..e69d816291
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memmove.c
@@ -0,0 +1,44 @@
+/* Multiple versions of memmove. AARCH64 version.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for the definition in libc. */
+
+#if IS_IN (libc)
+/* Redefine memmove so that the compiler won't complain about the type
+ mismatch with the IFUNC selector in strong_alias, below. */
+# undef memmove
+# define memmove __redirect_memmove
+# include <string.h>
+# include <init-arch.h>
+
+extern __typeof (__redirect_memmove) __libc_memmove;
+
+extern __typeof (__redirect_memmove) __memmove_generic attribute_hidden;
+extern __typeof (__redirect_memmove) __memmove_thunderx attribute_hidden;
+extern __typeof (__redirect_memmove) __memmove_falkor attribute_hidden;
+
+libc_ifunc (__libc_memmove,
+ (IS_THUNDERX (midr)
+ ? __memmove_thunderx
+ : (IS_FALKOR (midr) || IS_PHECDA (midr)
+ ? __memmove_falkor
+ : __memmove_generic)));
+
+# undef memmove
+strong_alias (__libc_memmove, memmove);
+#endif
diff --git a/sysdeps/aarch64/multiarch/memmove_falkor.S b/sysdeps/aarch64/multiarch/memmove_falkor.S
new file mode 100644
index 0000000000..5163f68e53
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memmove_falkor.S
@@ -0,0 +1,225 @@
+/* Copyright (C) 2017-2018 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* Assumptions: ARMv8-a, AArch64, falkor, unaligned accesses. */
+
+#define dstin x0
+#define src x1
+#define count x2
+#define dst x3
+#define srcend x4
+#define dstend x5
+#define A_x x6
+#define B_x x7
+#define A_w w6
+#define B_w w7
+#define tmp1 x14
+
+#define Q_q q6
+#define A_q q22
+#define B_q q18
+#define C_q q19
+#define D_q q20
+#define E_q q21
+#define F_q q17
+#define G_q q23
+
+/* RATIONALE:
+
+ The move has 4 distinct parts:
+ * Small moves of 16 bytes and under
+ * Medium sized moves of 17-96 bytes
+ * Large moves where the source address is higher than the destination
+ (forward copies)
+ * Large moves where the destination address is higher than the source
+ (copy backward, or move).
+
+ We use only two registers q6 and q22 for the moves and move 32 bytes at a
+ time to correctly train the hardware prefetcher for better throughput. */
+ENTRY_ALIGN (__memmove_falkor, 6)
+
+ sub tmp1, dstin, src
+ add srcend, src, count
+ add dstend, dstin, count
+ cmp count, 96
+ ccmp tmp1, count, 2, hi
+ b.lo L(move_long)
+
+ cmp count, 16
+ b.ls L(copy16)
+ cmp count, 96
+ b.hi L(copy_long)
+
+ /* Medium copies: 17..96 bytes. */
+ sub tmp1, count, 1
+ ldr A_q, [src]
+ tbnz tmp1, 6, L(copy96)
+ ldr D_q, [srcend, -16]
+ tbz tmp1, 5, 1f
+ ldr B_q, [src, 16]
+ ldr C_q, [srcend, -32]
+ str B_q, [dstin, 16]
+ str C_q, [dstend, -32]
+1:
+ str A_q, [dstin]
+ str D_q, [dstend, -16]
+ ret
+
+ .p2align 4
+ /* Small copies: 0..16 bytes. */
+L(copy16):
+ cmp count, 8
+ b.lo 1f
+ ldr A_x, [src]
+ ldr B_x, [srcend, -8]
+ str A_x, [dstin]
+ str B_x, [dstend, -8]
+ ret
+ .p2align 4
+1:
+ /* 4-7 */
+ tbz count, 2, 1f
+ ldr A_w, [src]
+ ldr B_w, [srcend, -4]
+ str A_w, [dstin]
+ str B_w, [dstend, -4]
+ ret
+ .p2align 4
+1:
+ /* 2-3 */
+ tbz count, 1, 1f
+ ldrh A_w, [src]
+ ldrh B_w, [srcend, -2]
+ strh A_w, [dstin]
+ strh B_w, [dstend, -2]
+ ret
+ .p2align 4
+1:
+ /* 0-1 */
+ tbz count, 0, 1f
+ ldrb A_w, [src]
+ strb A_w, [dstin]
+1: ret
+
+ .p2align 4
+ /* Copy 64..96 bytes. Copy 64 bytes from the start and
+ 32 bytes from the end. */
+L(copy96):
+ ldr B_q, [src, 16]
+ ldr C_q, [src, 32]
+ ldr D_q, [src, 48]
+ ldr E_q, [srcend, -32]
+ ldr F_q, [srcend, -16]
+ str A_q, [dstin]
+ str B_q, [dstin, 16]
+ str C_q, [dstin, 32]
+ str D_q, [dstin, 48]
+ str E_q, [dstend, -32]
+ str F_q, [dstend, -16]
+ ret
+
+ /* Align SRC to 16 byte alignment so that we don't cross cache line
+ boundaries on both loads and stores. There are at least 96 bytes
+ to copy, so copy 16 bytes unaligned and then align. The loop
+ copies 32 bytes per iteration and prefetches one iteration ahead. */
+
+ .p2align 4
+L(copy_long):
+ ldr A_q, [src]
+ and tmp1, src, 15
+ bic src, src, 15
+ sub dst, dstin, tmp1
+ add count, count, tmp1 /* Count is now 16 too large. */
+ ldr Q_q, [src, 16]!
+ str A_q, [dstin]
+ ldr A_q, [src, 16]!
+ subs count, count, 32 + 64 + 16 /* Test and readjust count. */
+ b.ls L(last64)
+
+L(loop64):
+ subs count, count, 32
+ str Q_q, [dst, 16]
+ ldr Q_q, [src, 16]!
+ str A_q, [dst, 32]!
+ ldr A_q, [src, 16]!
+ b.hi L(loop64)
+
+ /* Write the last full set of 64 bytes. The remainder is at most 64
+ bytes and at least 33 bytes, so it is safe to always copy 64 bytes
+ from the end. */
+L(last64):
+ ldr C_q, [srcend, -64]
+ str Q_q, [dst, 16]
+ ldr B_q, [srcend, -48]
+ str A_q, [dst, 32]
+ ldr A_q, [srcend, -32]
+ ldr D_q, [srcend, -16]
+ str C_q, [dstend, -64]
+ str B_q, [dstend, -48]
+ str A_q, [dstend, -32]
+ str D_q, [dstend, -16]
+ ret
+
+ .p2align 4
+L(move_long):
+ cbz tmp1, 3f
+
+ /* Align SRCEND to 16 byte alignment so that we don't cross cache line
+ boundaries on both loads and stores. There are at least 96 bytes
+ to copy, so copy 16 bytes unaligned and then align. The loop
+ copies 32 bytes per iteration and prefetches one iteration ahead. */
+
+ ldr A_q, [srcend, -16]
+ and tmp1, srcend, 15
+ sub srcend, srcend, tmp1
+ ldr Q_q, [srcend, -16]!
+ str A_q, [dstend, -16]
+ sub count, count, tmp1
+ ldr A_q, [srcend, -16]!
+ sub dstend, dstend, tmp1
+ subs count, count, 32 + 64
+ b.ls 2f
+
+1:
+ subs count, count, 32
+ str Q_q, [dstend, -16]
+ ldr Q_q, [srcend, -16]!
+ str A_q, [dstend, -32]!
+ ldr A_q, [srcend, -16]!
+ b.hi 1b
+
+ /* Write the last full set of 64 bytes. The remainder is at most 64
+ bytes and at least 33 bytes, so it is safe to always copy 64 bytes
+ from the start. */
+2:
+ ldr C_q, [src, 48]
+ str Q_q, [dstend, -16]
+ ldr B_q, [src, 32]
+ str A_q, [dstend, -32]
+ ldr A_q, [src, 16]
+ ldr D_q, [src]
+ str C_q, [dstin, 48]
+ str B_q, [dstin, 32]
+ str A_q, [dstin, 16]
+ str D_q, [dstin]
+3: ret
+
+END (__memmove_falkor)
+libc_hidden_builtin_def (__memmove_falkor)
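The L(move_long) path above implements the last case in the RATIONALE comment: when the destination is above the source and the ranges overlap, the copy runs from the end toward the start so every source byte is read before it can be overwritten. A byte-wise C sketch of that direction choice (purely illustrative; the real code moves 32 bytes per iteration through q6/q22 after aligning srcend):

/* Backward copy sketch: safe when dst > src even if the ranges overlap. */
#include <stddef.h>

void
move_backward (char *dst, const char *src, size_t count)
{
  while (count > 0)
    {
      count--;
      dst[count] = src[count];
    }
}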
diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
new file mode 100644
index 0000000000..d74ed3a549
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memset.c
@@ -0,0 +1,41 @@
+/* Multiple versions of memset. AARCH64 version.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for the definition in libc. */
+
+#if IS_IN (libc)
+/* Redefine memset so that the compiler won't complain about the type
+ mismatch with the IFUNC selector in strong_alias, below. */
+# undef memset
+# define memset __redirect_memset
+# include <string.h>
+# include <init-arch.h>
+
+extern __typeof (__redirect_memset) __libc_memset;
+
+extern __typeof (__redirect_memset) __memset_falkor attribute_hidden;
+extern __typeof (__redirect_memset) __memset_generic attribute_hidden;
+
+libc_ifunc (__libc_memset,
+ ((IS_FALKOR (midr) || IS_PHECDA (midr)) && zva_size == 64
+ ? __memset_falkor
+ : __memset_generic));
+
+# undef memset
+strong_alias (__libc_memset, memset);
+#endif
diff --git a/sysdeps/aarch64/multiarch/memset_falkor.S b/sysdeps/aarch64/multiarch/memset_falkor.S
new file mode 100644
index 0000000000..16849a5afb
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memset_falkor.S
@@ -0,0 +1,53 @@
+/* Memset for falkor.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <memset-reg.h>
+
+/* Reading dczid_el0 is expensive on falkor so move it into the ifunc
+ resolver and assume ZVA size of 64 bytes. The IFUNC resolver takes care to
+ use this function only when ZVA is enabled. */
+
+#if IS_IN (libc)
+.macro zva_macro
+ .p2align 4
+ /* Write the first and last 64 byte aligned block using stp rather
+ than using DC ZVA. This is faster on some cores. */
+ str q0, [dst, 16]
+ stp q0, q0, [dst, 32]
+ bic dst, dst, 63
+ stp q0, q0, [dst, 64]
+ stp q0, q0, [dst, 96]
+ sub count, dstend, dst /* Count is now 128 too large. */
+ sub count, count, 128+64+64 /* Adjust count and bias for loop. */
+ add dst, dst, 128
+1: dc zva, dst
+ add dst, dst, 64
+ subs count, count, 64
+ b.hi 1b
+ stp q0, q0, [dst, 0]
+ stp q0, q0, [dst, 32]
+ stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
+ ret
+.endm
+
+# define ZVA_MACRO zva_macro
+# define MEMSET __memset_falkor
+# include <sysdeps/aarch64/memset.S>
+#endif
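The zva_size that the resolver checks ultimately comes from DCZID_EL0, which this file deliberately avoids reading at run time. As a standalone sketch (not part of this commit), the block size is conventionally derived as shown below; the field layout is taken from the ARM architecture manual and the snippet only compiles for an AArch64 target.

/* Illustrative derivation of the DC ZVA block size from DCZID_EL0:
   bits [3:0] hold log2 of the block size in 4-byte words, and bit 4 (DZP)
   set means DC ZVA must not be used. */
#include <stdint.h>

unsigned int
get_zva_size (void)
{
  uint64_t dczid;
  asm ("mrs %0, dczid_el0" : "=r" (dczid));

  if (dczid & (1 << 4))          /* DZP: zeroing by DC ZVA prohibited.  */
    return 0;
  return 4U << (dczid & 0xf);    /* Block size in bytes; the falkor path
                                    assumes this is 64.  */
}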
diff --git a/sysdeps/aarch64/multiarch/memset_generic.S b/sysdeps/aarch64/multiarch/memset_generic.S
new file mode 100644
index 0000000000..345a88b94f
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memset_generic.S
@@ -0,0 +1,27 @@
+/* Memset for aarch64, default version for internal use.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+# define MEMSET __memset_generic
+/* Add a hidden definition for use within libc.so. */
+# ifdef SHARED
+ .globl __GI_memset; __GI_memset = __memset_generic
+# endif
+# include <sysdeps/aarch64/memset.S>
+#endif
diff --git a/sysdeps/aarch64/multiarch/rtld-memset.S b/sysdeps/aarch64/multiarch/rtld-memset.S
new file mode 100644
index 0000000000..8cdc6e1f50
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/rtld-memset.S
@@ -0,0 +1,23 @@
+/* Memset for aarch64, for the dynamic linker.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (rtld)
+# define MEMSET memset
+# include <sysdeps/aarch64/memset.S>
+#endif