author     Samuel Thibault <samuel.thibault@ens-lyon.org>	2018-12-27 19:01:57 +0000
committer  Samuel Thibault <samuel.thibault@ens-lyon.org>	2018-12-27 19:01:57 +0000
commit     cab56836b146bc129f1ad43f0393d95a9deca63a (patch)
tree       4f4e655319bbac78fca170da05275c127429b460 /sysdeps/aarch64/multiarch
parent     04ac1241a4cd004872282c2c82ec37fa33925292 (diff)
parent     82dd75a7f436a19047325d62182590c9f9e23a78 (diff)
Merge branch 't/tls' into refs/top-bases/t/tls-threadvar
Diffstat (limited to 'sysdeps/aarch64/multiarch')
-rw-r--r--  sysdeps/aarch64/multiarch/Makefile           |   4
-rw-r--r--  sysdeps/aarch64/multiarch/ifunc-impl-list.c  |  57
-rw-r--r--  sysdeps/aarch64/multiarch/init-arch.h        |  25
-rw-r--r--  sysdeps/aarch64/multiarch/memcpy.c           |  47
-rw-r--r--  sysdeps/aarch64/multiarch/memcpy_falkor.S    | 191
-rw-r--r--  sysdeps/aarch64/multiarch/memcpy_generic.S   |  44
-rw-r--r--  sysdeps/aarch64/multiarch/memcpy_thunderx.S  | 336
-rw-r--r--  sysdeps/aarch64/multiarch/memcpy_thunderx2.S |  27
-rw-r--r--  sysdeps/aarch64/multiarch/memmove.c          |  44
-rw-r--r--  sysdeps/aarch64/multiarch/memmove_falkor.S   | 225
-rw-r--r--  sysdeps/aarch64/multiarch/memset.c           |  41
-rw-r--r--  sysdeps/aarch64/multiarch/memset_falkor.S    |  53
-rw-r--r--  sysdeps/aarch64/multiarch/memset_generic.S   |  27
-rw-r--r--  sysdeps/aarch64/multiarch/rtld-memset.S      |  23
14 files changed, 1144 insertions(+), 0 deletions(-)
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
new file mode 100644
index 0000000000..57ffdf7238
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -0,0 +1,4 @@
+ifeq ($(subdir),string)
+sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \
+		   memcpy_falkor memmove_falkor memset_generic memset_falkor
+endif
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
new file mode 100644
index 0000000000..e55be80103
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -0,0 +1,57 @@
+/* Enumerate available IFUNC implementations of a function.  AARCH64 version.
+   Copyright (C) 2017-2018 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <assert.h>
+#include <string.h>
+#include <wchar.h>
+#include <ldsodefs.h>
+#include <ifunc-impl-list.h>
+#include <init-arch.h>
+#include <stdio.h>
+
+/* Maximum number of IFUNC implementations.  */
+#define MAX_IFUNC	4
+
+size_t
+__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+			size_t max)
+{
+  assert (max >= MAX_IFUNC);
+
+  size_t i = 0;
+
+  INIT_ARCH ();
+
+  /* Support sysdeps/aarch64/multiarch/memcpy.c and memmove.c.  */
+  IFUNC_IMPL (i, name, memcpy,
+	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx)
+	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx2)
+	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_falkor)
+	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
+  IFUNC_IMPL (i, name, memmove,
+	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
+	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_falkor)
+	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
+  IFUNC_IMPL (i, name, memset,
+	      /* Enable this on non-falkor processors too so that other cores
+		 can do a comparative analysis with __memset_generic.  */
+	      IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_falkor)
+	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
+
+  return i;
+}
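The list filled in above is consumed by glibc's own string tests through the GLIBC_PRIVATE symbol __libc_ifunc_impl_list, so every usable variant gets exercised rather than just the one the resolver picks. A minimal sketch of such a consumer follows; the struct layout is an assumption taken from glibc's internal ifunc-impl-list.h and this is not a public, stable API:

    /* Sketch only: __libc_ifunc_impl_list is glibc-internal, and the
       struct layout below is an assumption, not a stable interface.  */
    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    struct libc_ifunc_impl
    {
      const char *name;		/* E.g. "__memcpy_falkor".  */
      void (*fn) (void);	/* The implementation itself.  */
      bool usable;		/* False if this CPU cannot use it.  */
    };

    extern size_t __libc_ifunc_impl_list (const char *name,
					  struct libc_ifunc_impl *array,
					  size_t max);

    static void
    list_impls (const char *func)
    {
      struct libc_ifunc_impl impls[4];	/* MAX_IFUNC in this patch.  */
      size_t n = __libc_ifunc_impl_list (func, impls, 4);
      for (size_t i = 0; i < n; i++)
	printf ("%s: %s%s\n", func, impls[i].name,
		impls[i].usable ? "" : " (not usable on this cpu)");
    }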
diff --git a/sysdeps/aarch64/multiarch/init-arch.h b/sysdeps/aarch64/multiarch/init-arch.h
new file mode 100644
index 0000000000..d1e5703cb2
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/init-arch.h
@@ -0,0 +1,25 @@
+/* Define INIT_ARCH so that midr is initialized before use by IFUNCs.
+   This file is part of the GNU C Library.
+   Copyright (C) 2017-2018 Free Software Foundation, Inc.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <ldsodefs.h>
+
+#define INIT_ARCH()				\
+  uint64_t __attribute__((unused)) midr =	\
+    GLRO(dl_aarch64_cpu_features).midr_el1;	\
+  unsigned __attribute__((unused)) zva_size =	\
+    GLRO(dl_aarch64_cpu_features).zva_size;
diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c
new file mode 100644
index 0000000000..4a04a63b0f
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memcpy.c
@@ -0,0 +1,47 @@
+/* Multiple versions of memcpy. AARCH64 version.
+   Copyright (C) 2017-2018 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+
+#if IS_IN (libc)
+/* Redefine memcpy so that the compiler won't complain about the type
+   mismatch with the IFUNC selector in strong_alias, below.  */
+# undef memcpy
+# define memcpy __redirect_memcpy
+# include <string.h>
+# include <init-arch.h>
+
+extern __typeof (__redirect_memcpy) __libc_memcpy;
+
+extern __typeof (__redirect_memcpy) __memcpy_generic attribute_hidden;
+extern __typeof (__redirect_memcpy) __memcpy_thunderx attribute_hidden;
+extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden;
+extern __typeof (__redirect_memcpy) __memcpy_falkor attribute_hidden;
+
+libc_ifunc (__libc_memcpy,
+	    (IS_THUNDERX (midr)
+	     ? __memcpy_thunderx
+	     : (IS_FALKOR (midr) || IS_PHECDA (midr)
+		? __memcpy_falkor
+		: (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr)
+		   ? __memcpy_thunderx2
+		   : __memcpy_generic))));
+
+# undef memcpy
+strong_alias (__libc_memcpy, memcpy);
+#endif
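glibc's libc_ifunc macro above boils down to GCC's ifunc attribute: the resolver runs once at relocation time and the symbol is bound to whatever function pointer it returns. A standalone sketch of the same dispatch shape, with hypothetical variant names; the MIDR field positions and the ThunderX (implementer 0x43, part 0x0a1) and Falkor (implementer 0x51, part 0xc00) values are assumptions based on glibc's cpu-features.h:

    /* Hypothetical sketch of MIDR-based IFUNC dispatch; not the actual
       glibc code.  */
    #include <stddef.h>
    #include <stdint.h>

    extern void *__memcpy_generic (void *, const void *, size_t);
    extern void *__memcpy_thunderx (void *, const void *, size_t);
    extern void *__memcpy_falkor (void *, const void *, size_t);

    /* Assume CPU identification was cached earlier, the way glibc caches
       GLRO(dl_aarch64_cpu_features).midr_el1.  */
    extern uint64_t cached_midr;

    #define MIDR_IMPLEMENTER(m) (((m) >> 24) & 0xff)
    #define MIDR_PARTNUM(m)     (((m) >> 4) & 0xfff)

    static void *
    memcpy_resolver (void)
    {
      uint64_t midr = cached_midr;
      if (MIDR_IMPLEMENTER (midr) == 0x43	/* Cavium */
	  && MIDR_PARTNUM (midr) == 0x0a1)	/* ThunderX T88 */
	return __memcpy_thunderx;
      if (MIDR_IMPLEMENTER (midr) == 0x51	/* Qualcomm */
	  && MIDR_PARTNUM (midr) == 0xc00)	/* Falkor */
	return __memcpy_falkor;
      return __memcpy_generic;
    }

    /* The dynamic linker invokes the resolver once and patches the
       relocated entry with the returned pointer.  */
    void *memcpy_variant (void *, const void *, size_t)
	 __attribute__ ((ifunc ("memcpy_resolver")));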
diff --git a/sysdeps/aarch64/multiarch/memcpy_falkor.S b/sysdeps/aarch64/multiarch/memcpy_falkor.S
new file mode 100644
index 0000000000..cdc2de4750
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memcpy_falkor.S
@@ -0,0 +1,191 @@
+/* Optimized memcpy for Qualcomm Falkor processor.
+   Copyright (C) 2017-2018 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* Assumptions:
+
+   ARMv8-a, AArch64, falkor, unaligned accesses.  */
+
+#define dstin	x0
+#define src	x1
+#define count	x2
+#define dst	x3
+#define srcend	x4
+#define dstend	x5
+#define tmp1	x14
+#define A_x	x6
+#define B_x	x7
+#define A_w	w6
+#define B_w	w7
+
+#define A_q	q0
+#define B_q	q1
+#define C_q	q2
+#define D_q	q3
+#define E_q	q4
+#define F_q	q5
+#define G_q	q6
+
+/* Copies are split into 3 main cases:
+
+   1. Small copies of up to 32 bytes
+   2. Medium copies of 33..128 bytes which are fully unrolled
+   3. Large copies of more than 128 bytes.
+
+   Large copies align the source to a quad word and use an unrolled loop
+   processing 64 bytes per iteration.
+
+   FALKOR-SPECIFIC DESIGN:
+
+   The smallest copies (32 bytes or less) focus on optimal pipeline usage,
+   which is why the redundant copies of 0-3 bytes have been replaced with
+   conditionals, since the former would unnecessarily break across multiple
+   issue groups.  The medium copy group has been enlarged to 128 bytes since
+   bumping the small copies up to 32 bytes allows us to do that without
+   cost and also allows us to reduce the size of the prep code before loop64.
+
+   The copy loop uses only one register q0.  This is to ensure that all loads
+   hit a single hardware prefetcher which can get correctly trained to prefetch
+   a single stream.
+
+   The non-temporal stores help optimize cache utilization.  */
+
+#if IS_IN (libc)
+ENTRY_ALIGN (__memcpy_falkor, 6)
+
+	cmp	count, 32
+	add	srcend, src, count
+	add	dstend, dstin, count
+	b.ls	L(copy32)
+	ldr	A_q, [src]
+	cmp	count, 128
+	str	A_q, [dstin]
+	b.hi	L(copy_long)
+
+	/* Medium copies: 33..128 bytes.  */
+	sub	tmp1, count, 1
+	ldr	A_q, [src, 16]
+	ldr	B_q, [srcend, -32]
+	ldr	C_q, [srcend, -16]
+	tbz	tmp1, 6, 1f
+	ldr	D_q, [src, 32]
+	ldr	E_q, [src, 48]
+	str	D_q, [dstin, 32]
+	str	E_q, [dstin, 48]
+	ldr	F_q, [srcend, -64]
+	ldr	G_q, [srcend, -48]
+	str	F_q, [dstend, -64]
+	str	G_q, [dstend, -48]
+1:
+	str	A_q, [dstin, 16]
+	str	B_q, [dstend, -32]
+	str	C_q, [dstend, -16]
+	ret
+
+	.p2align 4
+	/* Small copies: 0..32 bytes.  */
+L(copy32):
+	/* 16-32 */
+	cmp	count, 16
+	b.lo	1f
+	ldr	A_q, [src]
+	ldr	B_q, [srcend, -16]
+	str	A_q, [dstin]
+	str	B_q, [dstend, -16]
+	ret
+	.p2align 4
+1:
+	/* 8-15 */
+	tbz	count, 3, 1f
+	ldr	A_x, [src]
+	ldr	B_x, [srcend, -8]
+	str	A_x, [dstin]
+	str	B_x, [dstend, -8]
+	ret
+	.p2align 4
+1:
+	/* 4-7 */
+	tbz	count, 2, 1f
+	ldr	A_w, [src]
+	ldr	B_w, [srcend, -4]
+	str	A_w, [dstin]
+	str	B_w, [dstend, -4]
+	ret
+	.p2align 4
+1:
+	/* 2-3 */
+	tbz	count, 1, 1f
+	ldrh	A_w, [src]
+	ldrh	B_w, [srcend, -2]
+	strh	A_w, [dstin]
+	strh	B_w, [dstend, -2]
+	ret
+	.p2align 4
+1:
+	/* 0-1 */
+	tbz	count, 0, 1f
+	ldrb	A_w, [src]
+	strb	A_w, [dstin]
+1:
+	ret
+
+	/* Align SRC to 16 bytes and copy; that way at least one of the
+	   accesses is aligned throughout the copy sequence.
+
+	   The count is off by 0 to 15 bytes, but this is OK because we trim
+	   off the last 64 bytes to copy off from the end.  Due to this the
+	   loop never runs out of bounds.  */
+	.p2align 6
+L(copy_long):
+	sub	count, count, 64 + 16
+	and	tmp1, src, 15
+	bic	src, src, 15
+	sub	dst, dstin, tmp1
+	add	count, count, tmp1
+
+L(loop64):
+	ldr	A_q, [src, 16]!
+	str	A_q, [dst, 16]
+	ldr	A_q, [src, 16]!
+	subs	count, count, 64
+	str	A_q, [dst, 32]
+	ldr	A_q, [src, 16]!
+	str	A_q, [dst, 48]
+	ldr	A_q, [src, 16]!
+	str	A_q, [dst, 64]!
+	b.hi	L(loop64)
+
+	/* Write the last full set of 64 bytes.  The remainder is at most 64
+	   bytes, so it is safe to always copy 64 bytes from the end even if
+	   there is just 1 byte left.  */
+L(last64):
+	ldr	E_q, [srcend, -64]
+	str	E_q, [dstend, -64]
+	ldr	D_q, [srcend, -48]
+	str	D_q, [dstend, -48]
+	ldr	C_q, [srcend, -32]
+	str	C_q, [dstend, -32]
+	ldr	B_q, [srcend, -16]
+	str	B_q, [dstend, -16]
+	ret
+
+END (__memcpy_falkor)
+libc_hidden_builtin_def (__memcpy_falkor)
+#endif
diff --git a/sysdeps/aarch64/multiarch/memcpy_generic.S b/sysdeps/aarch64/multiarch/memcpy_generic.S
new file mode 100644
index 0000000000..4bc81f4ce5
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memcpy_generic.S
@@ -0,0 +1,44 @@
+/* A Generic Optimized memcpy implementation for AARCH64.
+   Copyright (C) 2017-2018 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* The actual memcpy and memmove code is in ../memcpy.S.  If we are
+   building libc this file defines __memcpy_generic and __memmove_generic.
+   Otherwise the include of ../memcpy.S will define the normal __memcpy
+   and __memmove entry points.  */
+
+#include <sysdep.h>
+
+#if IS_IN (libc)
+
+# define MEMCPY __memcpy_generic
+# define MEMMOVE __memmove_generic
+
+/* Do not hide the generic versions of memcpy and memmove, we use them
+   internally.  */
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(name)
+
+# ifdef SHARED
+/* It doesn't make sense to send libc-internal memcpy calls through a PLT.  */
+	.globl __GI_memcpy; __GI_memcpy = __memcpy_generic
+	.globl __GI_memmove; __GI_memmove = __memmove_generic
+# endif
+
+#endif
+
+#include "../memcpy.S"
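Both the Falkor routine above and the ThunderX routine below handle small copies by loading from the start and the end of the buffer before storing anything: a 16..32-byte copy is two loads plus two stores, with no byte loop, and because every load completes before the first store the same sequence is also safe for overlapping buffers. A hypothetical C rendering of the 16..32-byte case (helper name is illustrative only):

    /* Sketch of the "copy both ends" trick for 16 <= n <= 32; the two
       16-byte chunks may overlap in the middle, which is harmless.  */
    #include <stdint.h>
    #include <string.h>

    static void
    copy_16_to_32 (void *dst, const void *src, size_t n)
    {
      uint8_t head[16], tail[16];
      memcpy (head, src, 16);				 /* First 16 bytes.  */
      memcpy (tail, (const uint8_t *) src + n - 16, 16); /* Last 16 bytes.  */
      memcpy (dst, head, 16);
      memcpy ((uint8_t *) dst + n - 16, tail, 16);
    }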
diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx.S b/sysdeps/aarch64/multiarch/memcpy_thunderx.S
new file mode 100644
index 0000000000..de494d933d
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memcpy_thunderx.S
@@ -0,0 +1,336 @@
+/* A Thunderx Optimized memcpy implementation for AARCH64.
+   Copyright (C) 2017-2018 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* The actual code in this memcpy and memmove should be identical to the
+   generic version except for the code under '#ifdef THUNDERX'.  This is
+   to make it easier to keep this version and the generic version in sync
+   for changes that are not specific to thunderx.  */
+
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
+ *
+ */
+
+#define dstin	x0
+#define src	x1
+#define count	x2
+#define dst	x3
+#define srcend	x4
+#define dstend	x5
+#define A_l	x6
+#define A_lw	w6
+#define A_h	x7
+#define A_hw	w7
+#define B_l	x8
+#define B_lw	w8
+#define B_h	x9
+#define C_l	x10
+#define C_h	x11
+#define D_l	x12
+#define D_h	x13
+#define E_l	src
+#define E_h	count
+#define F_l	srcend
+#define F_h	dst
+#define G_l	count
+#define G_h	dst
+#define tmp1	x14
+
+/* Copies are split into 3 main cases: small copies of up to 16 bytes,
+   medium copies of 17..96 bytes which are fully unrolled, and large
+   copies of more than 96 bytes, which align the destination and use an
+   unrolled loop processing 64 bytes per iteration.
+   In order to share code with memmove, small and medium copies read all
+   data before writing, allowing any kind of overlap.  So small, medium
+   and large backwards memmoves are handled by falling through into memcpy.
+   Overlapping large forward memmoves use a loop that copies backwards.
+*/
+
+#ifndef MEMMOVE
+# define MEMMOVE memmove
+#endif
+#ifndef MEMCPY
+# define MEMCPY memcpy
+#endif
+
+#if IS_IN (libc)
+
+# ifndef USE_THUNDERX2
+#  undef MEMCPY
+#  define MEMCPY __memcpy_thunderx
+#  undef MEMMOVE
+#  define MEMMOVE __memmove_thunderx
+#  define USE_THUNDERX
+# endif
+
+ENTRY_ALIGN (MEMMOVE, 6)
+
+	DELOUSE (0)
+	DELOUSE (1)
+	DELOUSE (2)
+
+	sub	tmp1, dstin, src
+	cmp	count, 96
+	ccmp	tmp1, count, 2, hi
+	b.lo	L(move_long)
+
+	/* Common case falls through into memcpy.  */
+END (MEMMOVE)
+libc_hidden_builtin_def (MEMMOVE)
+ENTRY (MEMCPY)
+
+	DELOUSE (0)
+	DELOUSE (1)
+	DELOUSE (2)
+
+	prfm	PLDL1KEEP, [src]
+	add	srcend, src, count
+	add	dstend, dstin, count
+	cmp	count, 16
+	b.ls	L(copy16)
+	cmp	count, 96
+	b.hi	L(copy_long)
+
+	/* Medium copies: 17..96 bytes.  */
+	sub	tmp1, count, 1
+	ldp	A_l, A_h, [src]
+	tbnz	tmp1, 6, L(copy96)
+	ldp	D_l, D_h, [srcend, -16]
+	tbz	tmp1, 5, 1f
+	ldp	B_l, B_h, [src, 16]
+	ldp	C_l, C_h, [srcend, -32]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstend, -32]
+1:
+	stp	A_l, A_h, [dstin]
+	stp	D_l, D_h, [dstend, -16]
+	ret
+
+	.p2align 4
+	/* Small copies: 0..16 bytes.  */
+L(copy16):
+	cmp	count, 8
+	b.lo	1f
+	ldr	A_l, [src]
+	ldr	A_h, [srcend, -8]
+	str	A_l, [dstin]
+	str	A_h, [dstend, -8]
+	ret
+	.p2align 4
+1:
+	tbz	count, 2, 1f
+	ldr	A_lw, [src]
+	ldr	A_hw, [srcend, -4]
+	str	A_lw, [dstin]
+	str	A_hw, [dstend, -4]
+	ret
+
+	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
+	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
+1:
+	cbz	count, 2f
+	lsr	tmp1, count, 1
+	ldrb	A_lw, [src]
+	ldrb	A_hw, [srcend, -1]
+	ldrb	B_lw, [src, tmp1]
+	strb	A_lw, [dstin]
+	strb	B_lw, [dstin, tmp1]
+	strb	A_hw, [dstend, -1]
+2:	ret
+
+	.p2align 4
+	/* Copy 64..96 bytes.  Copy 64 bytes from the start and
+	   32 bytes from the end.  */
+L(copy96):
+	ldp	B_l, B_h, [src, 16]
+	ldp	C_l, C_h, [src, 32]
+	ldp	D_l, D_h, [src, 48]
+	ldp	E_l, E_h, [srcend, -32]
+	ldp	F_l, F_h, [srcend, -16]
+	stp	A_l, A_h, [dstin]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstin, 32]
+	stp	D_l, D_h, [dstin, 48]
+	stp	E_l, E_h, [dstend, -32]
+	stp	F_l, F_h, [dstend, -16]
+	ret
+	/* Align DST to 16 byte alignment so that we don't cross cache line
+	   boundaries on both loads and stores.  There are at least 96 bytes
+	   to copy, so copy 16 bytes unaligned and then align.  The loop
+	   copies 64 bytes per iteration and prefetches one iteration ahead.  */
+
+	.p2align 4
+L(copy_long):
+
+# if defined(USE_THUNDERX) || defined (USE_THUNDERX2)
+
+	/* On thunderx, large memcpys are helped by software prefetching.
+	   This loop is identical to the one below it but with prefetching
+	   instructions included.  For copies that are less than 32768 bytes,
+	   the prefetching does not help and slows the code down, so we only
+	   use the prefetching loop for the largest memcpys.  */
+
+	cmp	count, #32768
+	b.lo	L(copy_long_without_prefetch)
+	and	tmp1, dstin, 15
+	bic	dst, dstin, 15
+	ldp	D_l, D_h, [src]
+	sub	src, src, tmp1
+#  if defined(USE_THUNDERX)
+	prfm	pldl1strm, [src, 384]
+#  elif defined(USE_THUNDERX2)
+	prfm	pldl1strm, [src, 256]
+#  endif
+	add	count, count, tmp1	/* Count is now 16 too large.  */
+	ldp	A_l, A_h, [src, 16]
+	stp	D_l, D_h, [dstin]
+	ldp	B_l, B_h, [src, 32]
+	ldp	C_l, C_h, [src, 48]
+	ldp	D_l, D_h, [src, 64]!
+	subs	count, count, 128 + 16	/* Test and readjust count.  */
+
+L(prefetch_loop64):
+#  if defined(USE_THUNDERX)
+	tbz	src, #6, 1f
+	prfm	pldl1strm, [src, 512]
+1:
+#  elif defined(USE_THUNDERX2)
+	prfm	pldl1strm, [src, 256]
+#  endif
+	stp	A_l, A_h, [dst, 16]
+	ldp	A_l, A_h, [src, 16]
+	stp	B_l, B_h, [dst, 32]
+	ldp	B_l, B_h, [src, 32]
+	stp	C_l, C_h, [dst, 48]
+	ldp	C_l, C_h, [src, 48]
+	stp	D_l, D_h, [dst, 64]!
+	ldp	D_l, D_h, [src, 64]!
+	subs	count, count, 64
+	b.hi	L(prefetch_loop64)
+	b	L(last64)
+
+L(copy_long_without_prefetch):
+# endif
+
+	and	tmp1, dstin, 15
+	bic	dst, dstin, 15
+	ldp	D_l, D_h, [src]
+	sub	src, src, tmp1
+	add	count, count, tmp1	/* Count is now 16 too large.  */
+	ldp	A_l, A_h, [src, 16]
+	stp	D_l, D_h, [dstin]
+	ldp	B_l, B_h, [src, 32]
+	ldp	C_l, C_h, [src, 48]
+	ldp	D_l, D_h, [src, 64]!
+	subs	count, count, 128 + 16	/* Test and readjust count.  */
+	b.ls	L(last64)
+L(loop64):
+	stp	A_l, A_h, [dst, 16]
+	ldp	A_l, A_h, [src, 16]
+	stp	B_l, B_h, [dst, 32]
+	ldp	B_l, B_h, [src, 32]
+	stp	C_l, C_h, [dst, 48]
+	ldp	C_l, C_h, [src, 48]
+	stp	D_l, D_h, [dst, 64]!
+	ldp	D_l, D_h, [src, 64]!
+	subs	count, count, 64
+	b.hi	L(loop64)
+
+	/* Write the last full set of 64 bytes.  The remainder is at most 64
+	   bytes, so it is safe to always copy 64 bytes from the end even if
+	   there is just 1 byte left.  */
+L(last64):
+	ldp	E_l, E_h, [srcend, -64]
+	stp	A_l, A_h, [dst, 16]
+	ldp	A_l, A_h, [srcend, -48]
+	stp	B_l, B_h, [dst, 32]
+	ldp	B_l, B_h, [srcend, -32]
+	stp	C_l, C_h, [dst, 48]
+	ldp	C_l, C_h, [srcend, -16]
+	stp	D_l, D_h, [dst, 64]
+	stp	E_l, E_h, [dstend, -64]
+	stp	A_l, A_h, [dstend, -48]
+	stp	B_l, B_h, [dstend, -32]
+	stp	C_l, C_h, [dstend, -16]
+	ret
+
+	.p2align 4
+L(move_long):
+	cbz	tmp1, 3f
+
+	add	srcend, src, count
+	add	dstend, dstin, count
+
+	/* Align dstend to 16 byte alignment so that we don't cross cache line
+	   boundaries on both loads and stores.  There are at least 96 bytes
+	   to copy, so copy 16 bytes unaligned and then align.  The loop
+	   copies 64 bytes per iteration and prefetches one iteration ahead.  */
+
+	and	tmp1, dstend, 15
+	ldp	D_l, D_h, [srcend, -16]
+	sub	srcend, srcend, tmp1
+	sub	count, count, tmp1
+	ldp	A_l, A_h, [srcend, -16]
+	stp	D_l, D_h, [dstend, -16]
+	ldp	B_l, B_h, [srcend, -32]
+	ldp	C_l, C_h, [srcend, -48]
+	ldp	D_l, D_h, [srcend, -64]!
+	sub	dstend, dstend, tmp1
+	subs	count, count, 128
+	b.ls	2f
+
+	nop
+1:
+	stp	A_l, A_h, [dstend, -16]
+	ldp	A_l, A_h, [srcend, -16]
+	stp	B_l, B_h, [dstend, -32]
+	ldp	B_l, B_h, [srcend, -32]
+	stp	C_l, C_h, [dstend, -48]
+	ldp	C_l, C_h, [srcend, -48]
+	stp	D_l, D_h, [dstend, -64]!
+	ldp	D_l, D_h, [srcend, -64]!
+	subs	count, count, 64
+	b.hi	1b
+
+	/* Write the last full set of 64 bytes.  The remainder is at most 64
+	   bytes, so it is safe to always copy 64 bytes from the start even if
+	   there is just 1 byte left.  */
+2:
+	ldp	G_l, G_h, [src, 48]
+	stp	A_l, A_h, [dstend, -16]
+	ldp	A_l, A_h, [src, 32]
+	stp	B_l, B_h, [dstend, -32]
+	ldp	B_l, B_h, [src, 16]
+	stp	C_l, C_h, [dstend, -48]
+	ldp	C_l, C_h, [src]
+	stp	D_l, D_h, [dstend, -64]
+	stp	G_l, G_h, [dstin, 48]
+	stp	A_l, A_h, [dstin, 32]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstin]
+3:	ret
+
+END (MEMCPY)
+libc_hidden_builtin_def (MEMCPY)
+
+#endif
diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx2.S b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
new file mode 100644
index 0000000000..8501abf725
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
@@ -0,0 +1,27 @@
+/* A Thunderx2 Optimized memcpy implementation for AARCH64.
+   Copyright (C) 2018 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* The actual code in this memcpy and memmove is in memcpy_thunderx.S.
+   The only real differences are with the prefetching instructions.  */
+
+#define MEMCPY __memcpy_thunderx2
+#define MEMMOVE __memmove_thunderx2
+#define USE_THUNDERX2
+
+#include "memcpy_thunderx.S"
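The MEMMOVE entry in memcpy_thunderx.S above decides between the forward and backward loops in three instructions: sub computes dst - src, and ccmp/b.lo branches to L(move_long) only when the copy is large (count > 96) and that difference, taken as an unsigned value, is smaller than count, i.e. the buffers overlap with the destination above the source. A C rendering of the overlap test (helper name is hypothetical):

    #include <stddef.h>
    #include <stdint.h>

    /* True when a forward copy would clobber not-yet-read source bytes.
       Unsigned wraparound makes one comparison cover both the "dst below
       src" and the "no overlap" cases, mirroring the sub/ccmp/b.lo
       sequence in the assembly.  */
    static int
    needs_backward_copy (const void *dst, const void *src, size_t count)
    {
      return (uintptr_t) dst - (uintptr_t) src < count;
    }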
diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c
new file mode 100644
index 0000000000..e69d816291
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memmove.c
@@ -0,0 +1,44 @@
+/* Multiple versions of memmove. AARCH64 version.
+   Copyright (C) 2017-2018 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+
+#if IS_IN (libc)
+/* Redefine memmove so that the compiler won't complain about the type
+   mismatch with the IFUNC selector in strong_alias, below.  */
+# undef memmove
+# define memmove __redirect_memmove
+# include <string.h>
+# include <init-arch.h>
+
+extern __typeof (__redirect_memmove) __libc_memmove;
+
+extern __typeof (__redirect_memmove) __memmove_generic attribute_hidden;
+extern __typeof (__redirect_memmove) __memmove_thunderx attribute_hidden;
+extern __typeof (__redirect_memmove) __memmove_falkor attribute_hidden;
+
+libc_ifunc (__libc_memmove,
+	    (IS_THUNDERX (midr)
+	     ? __memmove_thunderx
+	     : (IS_FALKOR (midr) || IS_PHECDA (midr)
+		? __memmove_falkor
+		: __memmove_generic)));
+
+# undef memmove
+strong_alias (__libc_memmove, memmove);
+#endif
diff --git a/sysdeps/aarch64/multiarch/memmove_falkor.S b/sysdeps/aarch64/multiarch/memmove_falkor.S
new file mode 100644
index 0000000000..5163f68e53
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memmove_falkor.S
@@ -0,0 +1,225 @@
+/* Copyright (C) 2017-2018 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* Assumptions: ARMv8-a, AArch64, falkor, unaligned accesses.  */
+
+#define dstin	x0
+#define src	x1
+#define count	x2
+#define dst	x3
+#define srcend	x4
+#define dstend	x5
+#define A_x	x6
+#define B_x	x7
+#define A_w	w6
+#define B_w	w7
+#define tmp1	x14
+
+#define Q_q	q6
+#define A_q	q22
+#define B_q	q18
+#define C_q	q19
+#define D_q	q20
+#define E_q	q21
+#define F_q	q17
+#define G_q	q23
+
+/* RATIONALE:
+
+   The move has 4 distinct parts:
+   * Small moves of 16 bytes and under
+   * Medium sized moves of 17-96 bytes
+   * Large moves where the source address is higher than the destination
+     (forward copies)
+   * Large moves where the destination address is higher than the source
+     (copy backward, or move).
+
+   We use only two registers q6 and q22 for the moves and move 32 bytes at a
+   time to correctly train the hardware prefetcher for better throughput.  */
+ENTRY_ALIGN (__memmove_falkor, 6)
+
+	sub	tmp1, dstin, src
+	add	srcend, src, count
+	add	dstend, dstin, count
+	cmp	count, 96
+	ccmp	tmp1, count, 2, hi
+	b.lo	L(move_long)
+
+	cmp	count, 16
+	b.ls	L(copy16)
+	cmp	count, 96
+	b.hi	L(copy_long)
+
+	/* Medium copies: 17..96 bytes.  */
+	sub	tmp1, count, 1
+	ldr	A_q, [src]
+	tbnz	tmp1, 6, L(copy96)
+	ldr	D_q, [srcend, -16]
+	tbz	tmp1, 5, 1f
+	ldr	B_q, [src, 16]
+	ldr	C_q, [srcend, -32]
+	str	B_q, [dstin, 16]
+	str	C_q, [dstend, -32]
+1:
+	str	A_q, [dstin]
+	str	D_q, [dstend, -16]
+	ret
+
+	.p2align 4
+	/* Small copies: 0..16 bytes.  */
+L(copy16):
+	cmp	count, 8
+	b.lo	1f
+	ldr	A_x, [src]
+	ldr	B_x, [srcend, -8]
+	str	A_x, [dstin]
+	str	B_x, [dstend, -8]
+	ret
+	.p2align 4
+1:
+	/* 4-7 */
+	tbz	count, 2, 1f
+	ldr	A_w, [src]
+	ldr	B_w, [srcend, -4]
+	str	A_w, [dstin]
+	str	B_w, [dstend, -4]
+	ret
+	.p2align 4
+1:
+	/* 2-3 */
+	tbz	count, 1, 1f
+	ldrh	A_w, [src]
+	ldrh	B_w, [srcend, -2]
+	strh	A_w, [dstin]
+	strh	B_w, [dstend, -2]
+	ret
+	.p2align 4
+1:
+	/* 0-1 */
+	tbz	count, 0, 1f
+	ldrb	A_w, [src]
+	strb	A_w, [dstin]
+1:	ret
+
+	.p2align 4
+	/* Copy 64..96 bytes.  Copy 64 bytes from the start and
+	   32 bytes from the end.  */
+L(copy96):
+	ldr	B_q, [src, 16]
+	ldr	C_q, [src, 32]
+	ldr	D_q, [src, 48]
+	ldr	E_q, [srcend, -32]
+	ldr	F_q, [srcend, -16]
+	str	A_q, [dstin]
+	str	B_q, [dstin, 16]
+	str	C_q, [dstin, 32]
+	str	D_q, [dstin, 48]
+	str	E_q, [dstend, -32]
+	str	F_q, [dstend, -16]
+	ret
+
+	/* Align SRC to 16 byte alignment so that we don't cross cache line
+	   boundaries on both loads and stores.  There are at least 96 bytes
+	   to copy, so copy 16 bytes unaligned and then align.  The loop
+	   copies 32 bytes per iteration and prefetches one iteration ahead.  */
+
+	.p2align 4
+L(copy_long):
+	ldr	A_q, [src]
+	and	tmp1, src, 15
+	bic	src, src, 15
+	sub	dst, dstin, tmp1
+	add	count, count, tmp1	/* Count is now 16 too large.  */
+	ldr	Q_q, [src, 16]!
+	str	A_q, [dstin]
+	ldr	A_q, [src, 16]!
+	subs	count, count, 32 + 64 + 16	/* Test and readjust count.  */
+	b.ls	L(last64)
+
+L(loop64):
+	subs	count, count, 32
+	str	Q_q, [dst, 16]
+	ldr	Q_q, [src, 16]!
+	str	A_q, [dst, 32]!
+	ldr	A_q, [src, 16]!
+	b.hi	L(loop64)
+
+	/* Write the last full set of 64 bytes.  The remainder is at most 64
+	   bytes and at least 33 bytes, so it is safe to always copy 64 bytes
+	   from the end.  */
+L(last64):
+	ldr	C_q, [srcend, -64]
+	str	Q_q, [dst, 16]
+	ldr	B_q, [srcend, -48]
+	str	A_q, [dst, 32]
+	ldr	A_q, [srcend, -32]
+	ldr	D_q, [srcend, -16]
+	str	C_q, [dstend, -64]
+	str	B_q, [dstend, -48]
+	str	A_q, [dstend, -32]
+	str	D_q, [dstend, -16]
+	ret
+
+	.p2align 4
+L(move_long):
+	cbz	tmp1, 3f
+
+	/* Align SRCEND to 16 byte alignment so that we don't cross cache line
+	   boundaries on both loads and stores.  There are at least 96 bytes
+	   to copy, so copy 16 bytes unaligned and then align.  The loop
+	   copies 32 bytes per iteration and prefetches one iteration ahead.  */
+
+	ldr	A_q, [srcend, -16]
+	and	tmp1, srcend, 15
+	sub	srcend, srcend, tmp1
+	ldr	Q_q, [srcend, -16]!
+	str	A_q, [dstend, -16]
+	sub	count, count, tmp1
+	ldr	A_q, [srcend, -16]!
+	sub	dstend, dstend, tmp1
+	subs	count, count, 32 + 64
+	b.ls	2f
+
+1:
+	subs	count, count, 32
+	str	Q_q, [dstend, -16]
+	ldr	Q_q, [srcend, -16]!
+	str	A_q, [dstend, -32]!
+	ldr	A_q, [srcend, -16]!
+	b.hi	1b
+
+	/* Write the last full set of 64 bytes.  The remainder is at most 64
+	   bytes and at least 33 bytes, so it is safe to always copy 64 bytes
+	   from the start.  */
+2:
+	ldr	C_q, [src, 48]
+	str	Q_q, [dstend, -16]
+	ldr	B_q, [src, 32]
+	str	A_q, [dstend, -32]
+	ldr	A_q, [src, 16]
+	ldr	D_q, [src]
+	str	C_q, [dstin, 48]
+	str	B_q, [dstin, 32]
+	str	A_q, [dstin, 16]
+	str	D_q, [dstin]
+3:	ret
+
+END (__memmove_falkor)
+libc_hidden_builtin_def (__memmove_falkor)
diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
new file mode 100644
index 0000000000..d74ed3a549
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memset.c
@@ -0,0 +1,41 @@
+/* Multiple versions of memset. AARCH64 version.
+   Copyright (C) 2017-2018 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+
+#if IS_IN (libc)
+/* Redefine memset so that the compiler won't complain about the type
+   mismatch with the IFUNC selector in strong_alias, below.  */
+# undef memset
+# define memset __redirect_memset
+# include <string.h>
+# include <init-arch.h>
+
+extern __typeof (__redirect_memset) __libc_memset;
+
+extern __typeof (__redirect_memset) __memset_falkor attribute_hidden;
+extern __typeof (__redirect_memset) __memset_generic attribute_hidden;
+
+libc_ifunc (__libc_memset,
+	    ((IS_FALKOR (midr) || IS_PHECDA (midr)) && zva_size == 64
+	     ? __memset_falkor
+	     : __memset_generic));
+
+# undef memset
+strong_alias (__libc_memset, memset);
+#endif
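The selector above picks __memset_falkor only when zva_size == 64; the file below explains that reading dczid_el0 is expensive on Falkor, which is why the read happens once in the resolver path rather than in the memset fast path. A sketch of how a resolver can derive the DC ZVA block size; the DCZID_EL0 encoding (DZP in bit 4, BS in bits 3:0 as log2 of the block size in words) is taken from the ARMv8 architecture manual:

    #include <stddef.h>
    #include <stdint.h>

    /* Read DCZID_EL0 once (it is readable at EL0) and decode the DC ZVA
       block size in bytes; 0 means DC ZVA must not be used.  */
    static size_t
    zva_block_size (void)
    {
      uint64_t dczid;
      __asm__ ("mrs %0, dczid_el0" : "=r" (dczid));
      if (dczid & 0x10)			/* DZP: DC ZVA prohibited.  */
	return 0;
      return 4 << (dczid & 0xf);	/* 2^BS words of 4 bytes each.  */
    }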
diff --git a/sysdeps/aarch64/multiarch/memset_falkor.S b/sysdeps/aarch64/multiarch/memset_falkor.S
new file mode 100644
index 0000000000..16849a5afb
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memset_falkor.S
@@ -0,0 +1,53 @@
+/* Memset for falkor.
+   Copyright (C) 2017-2018 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <memset-reg.h>
+
+/* Reading dczid_el0 is expensive on falkor so move it into the ifunc
+   resolver and assume ZVA size of 64 bytes.  The IFUNC resolver takes care to
+   use this function only when ZVA is enabled.  */
+
+#if IS_IN (libc)
+.macro zva_macro
+	.p2align 4
+	/* Write the first and last 64 byte aligned block using stp rather
+	   than using DC ZVA.  This is faster on some cores.  */
+	str	q0, [dst, 16]
+	stp	q0, q0, [dst, 32]
+	bic	dst, dst, 63
+	stp	q0, q0, [dst, 64]
+	stp	q0, q0, [dst, 96]
+	sub	count, dstend, dst	/* Count is now 128 too large.  */
+	sub	count, count, 128+64+64	/* Adjust count and bias for loop.  */
+	add	dst, dst, 128
+1:	dc	zva, dst
+	add	dst, dst, 64
+	subs	count, count, 64
+	b.hi	1b
+	stp	q0, q0, [dst, 0]
+	stp	q0, q0, [dst, 32]
+	stp	q0, q0, [dstend, -64]
+	stp	q0, q0, [dstend, -32]
+	ret
+.endm
+
+# define ZVA_MACRO zva_macro
+# define MEMSET __memset_falkor
+# include <sysdeps/aarch64/memset.S>
+#endif
diff --git a/sysdeps/aarch64/multiarch/memset_generic.S b/sysdeps/aarch64/multiarch/memset_generic.S
new file mode 100644
index 0000000000..345a88b94f
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memset_generic.S
@@ -0,0 +1,27 @@
+/* Memset for aarch64, default version for internal use.
+   Copyright (C) 2017-2018 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+# define MEMSET __memset_generic
+/* Add a hidden definition for use within libc.so.  */
+# ifdef SHARED
+	.globl __GI_memset; __GI_memset = __memset_generic
+# endif
+# include <sysdeps/aarch64/memset.S>
+#endif
diff --git a/sysdeps/aarch64/multiarch/rtld-memset.S b/sysdeps/aarch64/multiarch/rtld-memset.S
new file mode 100644
index 0000000000..8cdc6e1f50
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/rtld-memset.S
@@ -0,0 +1,23 @@
+/* Memset for aarch64, for the dynamic linker.
+   Copyright (C) 2017-2018 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (rtld)
+# define MEMSET memset
+# include <sysdeps/aarch64/memset.S>
+#endif