diff options
Diffstat (limited to 'sysdeps/arm/armv7')
-rw-r--r-- | sysdeps/arm/armv7/multiarch/Makefile | 3 | ||||
-rw-r--r-- | sysdeps/arm/armv7/multiarch/ifunc-impl-list.c | 7 | ||||
-rw-r--r-- | sysdeps/arm/armv7/multiarch/ifunc-memchr.h | 28 | ||||
-rw-r--r-- | sysdeps/arm/armv7/multiarch/ifunc-memcpy.h | 37 | ||||
-rw-r--r-- | sysdeps/arm/armv7/multiarch/memchr.c | 35 | ||||
-rw-r--r-- | sysdeps/arm/armv7/multiarch/memchr_neon.S | 202 | ||||
-rw-r--r-- | sysdeps/arm/armv7/multiarch/memchr_noneon.S | 5 | ||||
-rw-r--r-- | sysdeps/arm/armv7/multiarch/memcpy.S | 76 | ||||
-rw-r--r-- | sysdeps/arm/armv7/multiarch/memcpy.c | 35 | ||||
-rw-r--r-- | sysdeps/arm/armv7/multiarch/memcpy_arm.S | 10 | ||||
-rw-r--r-- | sysdeps/arm/armv7/multiarch/memcpy_impl.S | 585 | ||||
-rw-r--r-- | sysdeps/arm/armv7/multiarch/memcpy_neon.S | 8 | ||||
-rw-r--r-- | sysdeps/arm/armv7/multiarch/memcpy_vfp.S | 4 | ||||
-rw-r--r-- | sysdeps/arm/armv7/multiarch/rtld-memchr.S | 1 | ||||
-rw-r--r-- | sysdeps/arm/armv7/multiarch/rtld-memcpy.S | 1 | ||||
-rw-r--r-- | sysdeps/arm/armv7/strcmp.S | 118 |
16 files changed, 599 insertions, 556 deletions
diff --git a/sysdeps/arm/armv7/multiarch/Makefile b/sysdeps/arm/armv7/multiarch/Makefile index e834cc937f..6e5851f897 100644 --- a/sysdeps/arm/armv7/multiarch/Makefile +++ b/sysdeps/arm/armv7/multiarch/Makefile @@ -1,3 +1,4 @@ ifeq ($(subdir),string) -sysdep_routines += memcpy_neon memcpy_vfp +sysdep_routines += memcpy_neon memcpy_vfp memchr_neon memcpy_arm \ + memchr_noneon endif diff --git a/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c b/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c index d7088f2a22..48e43da66e 100644 --- a/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c +++ b/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c @@ -1,5 +1,5 @@ /* Enumerate available IFUNC implementations of a function. ARM version. - Copyright (C) 2013-2016 Free Software Foundation, Inc. + Copyright (C) 2013-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -34,6 +34,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, bool use_neon = true; #ifdef __ARM_NEON__ # define __memcpy_neon memcpy +# define __memchr_neon memchr #else use_neon = (GLRO(dl_hwcap) & HWCAP_ARM_NEON) != 0; #endif @@ -52,5 +53,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, #endif IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_arm)); + IFUNC_IMPL (i, name, memchr, + IFUNC_IMPL_ADD (array, i, memchr, use_neon, __memchr_neon) + IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_noneon)); + return i; } diff --git a/sysdeps/arm/armv7/multiarch/ifunc-memchr.h b/sysdeps/arm/armv7/multiarch/ifunc-memchr.h new file mode 100644 index 0000000000..75495824f4 --- /dev/null +++ b/sysdeps/arm/armv7/multiarch/ifunc-memchr.h @@ -0,0 +1,28 @@ +/* Common definition for memchr resolver. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +__typeof (REDIRECT_NAME) OPTIMIZE (neon) attribute_hidden; +__typeof (REDIRECT_NAME) OPTIMIZE (noneon) attribute_hidden; + +static inline void * +IFUNC_SELECTOR (int hwcap) +{ + if (hwcap & HWCAP_ARM_NEON) + return OPTIMIZE (neon); + return OPTIMIZE (noneon); +} diff --git a/sysdeps/arm/armv7/multiarch/ifunc-memcpy.h b/sysdeps/arm/armv7/multiarch/ifunc-memcpy.h new file mode 100644 index 0000000000..7e6f73ff4d --- /dev/null +++ b/sysdeps/arm/armv7/multiarch/ifunc-memcpy.h @@ -0,0 +1,37 @@ +/* Common definition for memcpy resolver. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#ifdef __SOFTFP__ +__typeof (REDIRECT_NAME) OPTIMIZE (arm) attribute_hidden; +#endif +__typeof (REDIRECT_NAME) OPTIMIZE (vfp) attribute_hidden; +__typeof (REDIRECT_NAME) OPTIMIZE (neon) attribute_hidden; + +static inline void * +IFUNC_SELECTOR (int hwcap) +{ + if (hwcap & HWCAP_ARM_NEON) + return OPTIMIZE (neon); +#ifdef __SOFTFP__ + if (hwcap & HWCAP_ARM_VFP) + return OPTIMIZE (vfp); + return OPTIMIZE (arm); +#else + return OPTIMIZE (vfp); +#endif +} diff --git a/sysdeps/arm/armv7/multiarch/memchr.c b/sysdeps/arm/armv7/multiarch/memchr.c new file mode 100644 index 0000000000..ff1cc5d203 --- /dev/null +++ b/sysdeps/arm/armv7/multiarch/memchr.c @@ -0,0 +1,35 @@ +/* Multiple versions of memchr. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* For __ARM_NEON__ memchr_neon.S defines memchr directly and ifunc + is not used. 
*/ +#if IS_IN (libc) && !defined (__ARM_NEON__) +# define memchr __redirect_memchr +# include <string.h> +# undef memchr + +# include <arm-ifunc.h> + +# define SYMBOL_NAME memchr +# include "ifunc-memchr.h" + +arm_libc_ifunc_redirected (__redirect_memchr, memchr, IFUNC_SELECTOR); + +arm_libc_ifunc_hidden_def (__redirect_memchr, memchr); +#endif diff --git a/sysdeps/arm/armv7/multiarch/memchr_neon.S b/sysdeps/arm/armv7/multiarch/memchr_neon.S new file mode 100644 index 0000000000..6fbf9b8898 --- /dev/null +++ b/sysdeps/arm/armv7/multiarch/memchr_neon.S @@ -0,0 +1,202 @@ +/* memchr implemented using NEON. + Copyright (C) 2011-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +/* For __ARM_NEON__ this file defines memchr. 
*/ +#ifndef __ARM_NEON__ +# define memchr __memchr_neon +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(a) +#endif + + .arch armv7-a + .fpu neon + + +/* Arguments */ +#define srcin r0 +#define chrin r1 +#define cntin r2 + +/* Retval */ +#define result r0 /* Live range does not overlap with srcin */ + +/* Working registers */ +#define src r1 /* Live range does not overlap with chrin */ +#define tmp r3 +#define synd r0 /* No overlap with srcin or result */ +#define soff r12 + +/* Working NEON registers */ +#define vrepchr q0 +#define vdata0 q1 +#define vdata0_0 d2 /* Lower half of vdata0 */ +#define vdata0_1 d3 /* Upper half of vdata0 */ +#define vdata1 q2 +#define vdata1_0 d4 /* Lower half of vdata1 */ +#define vdata1_1 d5 /* Upper half of vdata1 */ +#define vrepmask q3 +#define vrepmask0 d6 +#define vrepmask1 d7 +#define vend q4 +#define vend0 d8 +#define vend1 d9 + +/* + * Core algorithm: + * + * For each 32-byte chunk we calculate a 32-bit syndrome value, with one bit per + * byte. Each bit is set if the relevant byte matched the requested character + * and cleared otherwise. Since the bits in the syndrome reflect exactly the + * order in which things occur in the original string, counting trailing zeros + * allows to identify exactly which byte has matched. + */ + + .thumb_func + .p2align 4,,15 + +ENTRY(memchr) + /* Use a simple loop if there are less than 8 bytes to search. */ + cmp cntin, #7 + bhi .Llargestr + and chrin, chrin, #0xff + +.Lsmallstr: + subs cntin, cntin, #1 + blo .Lnotfound /* Return not found if reached end. */ + ldrb tmp, [srcin], #1 + cmp tmp, chrin + bne .Lsmallstr /* Loop again if not found. */ + /* Otherwise fixup address and return. */ + sub result, srcin, #1 + bx lr + + +.Llargestr: + vdup.8 vrepchr, chrin /* Duplicate char across all lanes. */ + /* + * Magic constant 0x8040201008040201 allows us to identify which lane + * matches the requested byte.
+ */ + movw tmp, #0x0201 + movt tmp, #0x0804 + lsl soff, tmp, #4 + vmov vrepmask0, tmp, soff + vmov vrepmask1, tmp, soff + /* Work with aligned 32-byte chunks */ + bic src, srcin, #31 + ands soff, srcin, #31 + beq .Lloopintro /* Go straight to main loop if it's aligned. */ + + /* + * Input string is not 32-byte aligned. We calculate the syndrome + * value for the aligned 32 bytes block containing the first bytes + * and mask the irrelevant part. + */ + vld1.8 {vdata0, vdata1}, [src:256]! + sub tmp, soff, #32 + adds cntin, cntin, tmp + vceq.i8 vdata0, vdata0, vrepchr + vceq.i8 vdata1, vdata1, vrepchr + vand vdata0, vdata0, vrepmask + vand vdata1, vdata1, vrepmask + vpadd.i8 vdata0_0, vdata0_0, vdata0_1 + vpadd.i8 vdata1_0, vdata1_0, vdata1_1 + vpadd.i8 vdata0_0, vdata0_0, vdata1_0 + vpadd.i8 vdata0_0, vdata0_0, vdata0_0 + vmov synd, vdata0_0[0] + + /* Clear the soff lower bits */ + lsr synd, synd, soff + lsl synd, synd, soff + /* The first block can also be the last */ + bls .Lmasklast + /* Have we found something already? */ + cbnz synd, .Ltail + + +.Lloopintro: + vpush {vend} + /* 264/265 correspond to d8/d9 for q4 */ + cfi_adjust_cfa_offset (16) + cfi_rel_offset (264, 0) + cfi_rel_offset (265, 8) + .p2align 3,,7 +.Lloop: + vld1.8 {vdata0, vdata1}, [src:256]! + subs cntin, cntin, #32 + vceq.i8 vdata0, vdata0, vrepchr + vceq.i8 vdata1, vdata1, vrepchr + /* If we're out of data we finish regardless of the result. */ + bls .Lend + /* Use a fast check for the termination condition. */ + vorr vend, vdata0, vdata1 + vorr vend0, vend0, vend1 + vmov synd, tmp, vend0 + orrs synd, synd, tmp + /* We're not out of data, loop if we haven't found the character. */ + beq .Lloop + +.Lend: + vpop {vend} + cfi_adjust_cfa_offset (-16) + cfi_restore (264) + cfi_restore (265) + + /* Termination condition found, let's calculate the syndrome value. 
*/ + vand vdata0, vdata0, vrepmask + vand vdata1, vdata1, vrepmask + vpadd.i8 vdata0_0, vdata0_0, vdata0_1 + vpadd.i8 vdata1_0, vdata1_0, vdata1_1 + vpadd.i8 vdata0_0, vdata0_0, vdata1_0 + vpadd.i8 vdata0_0, vdata0_0, vdata0_0 + vmov synd, vdata0_0[0] + cbz synd, .Lnotfound + bhi .Ltail /* Uses the condition code from + subs cntin, cntin, #32 above. */ + + +.Lmasklast: + /* Clear the (-cntin) upper bits to avoid out-of-bounds matches. */ + neg cntin, cntin + lsl synd, synd, cntin + lsrs synd, synd, cntin + it eq + moveq src, #0 /* If no match, set src to 0 so the retval is 0. */ + + +.Ltail: + /* Count the trailing zeros using bit reversing */ + rbit synd, synd + /* Compensate the last post-increment */ + sub src, src, #32 + /* Count the leading zeros */ + clz synd, synd + /* Compute the potential result and return */ + add result, src, synd + bx lr + + +.Lnotfound: + /* Set result to NULL if not found and return */ + mov result, #0 + bx lr + +END(memchr) +libc_hidden_builtin_def (memchr) diff --git a/sysdeps/arm/armv7/multiarch/memchr_noneon.S b/sysdeps/arm/armv7/multiarch/memchr_noneon.S new file mode 100644 index 0000000000..b1fb54018d --- /dev/null +++ b/sysdeps/arm/armv7/multiarch/memchr_noneon.S @@ -0,0 +1,5 @@ +#define memchr __memchr_noneon +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(name) + +#include <sysdeps/arm/armv6t2/memchr.S> diff --git a/sysdeps/arm/armv7/multiarch/memcpy.S b/sysdeps/arm/armv7/multiarch/memcpy.S deleted file mode 100644 index 01ba9e5733..0000000000 --- a/sysdeps/arm/armv7/multiarch/memcpy.S +++ /dev/null @@ -1,76 +0,0 @@ -/* Multiple versions of memcpy - All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2013-2016 Free Software Foundation, Inc. - This file is part of the GNU C Library. 
- - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -/* Thumb requires excess IT instructions here. */ -#define NO_THUMB -#include <sysdep.h> -#include <rtld-global-offsets.h> - -#if IS_IN (libc) -/* Under __ARM_NEON__, memcpy_neon.S defines the name memcpy. */ -# ifndef __ARM_NEON__ - .text -ENTRY(memcpy) - .type memcpy, %gnu_indirect_function -# ifdef __SOFTFP__ - ldr r1, .Lmemcpy_arm - tst r0, #HWCAP_ARM_VFP - ldrne r1, .Lmemcpy_vfp -# else - ldr r1, .Lmemcpy_vfp -# endif - tst r0, #HWCAP_ARM_NEON - ldrne r1, .Lmemcpy_neon -1: - add r0, r1, pc - DO_RET(lr) - -# ifdef __SOFTFP__ -.Lmemcpy_arm: - .long C_SYMBOL_NAME(__memcpy_arm) - 1b - PC_OFS -# endif -.Lmemcpy_neon: - .long C_SYMBOL_NAME(__memcpy_neon) - 1b - PC_OFS -.Lmemcpy_vfp: - .long C_SYMBOL_NAME(__memcpy_vfp) - 1b - PC_OFS - -END(memcpy) - -libc_hidden_builtin_def (memcpy) -#endif /* Not __ARM_NEON__. */ - -/* These versions of memcpy are defined not to clobber any VFP or NEON - registers so they must always call the ARM variant of the memcpy code. 
*/ -strong_alias (__memcpy_arm, __aeabi_memcpy) -strong_alias (__memcpy_arm, __aeabi_memcpy4) -strong_alias (__memcpy_arm, __aeabi_memcpy8) -libc_hidden_def (__memcpy_arm) - -#undef libc_hidden_builtin_def -#define libc_hidden_builtin_def(name) -#undef weak_alias -#define weak_alias(x, y) -#undef libc_hidden_def -#define libc_hidden_def(name) - -#define memcpy __memcpy_arm - -#endif - -#include "memcpy_impl.S" diff --git a/sysdeps/arm/armv7/multiarch/memcpy.c b/sysdeps/arm/armv7/multiarch/memcpy.c new file mode 100644 index 0000000000..02776b6fe6 --- /dev/null +++ b/sysdeps/arm/armv7/multiarch/memcpy.c @@ -0,0 +1,35 @@ +/* Multiple versions of memcpy. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* For __ARM_NEON__ memcpy_neon.S defines memcpy directly and ifunc + is not used.
*/ +#if IS_IN (libc) && !defined (__ARM_NEON__) +# define memcpy __redirect_memcpy +# include <string.h> +# undef memcpy + +# include <arm-ifunc.h> + +# define SYMBOL_NAME memcpy +# include "ifunc-memcpy.h" + +arm_libc_ifunc_redirected (__redirect_memcpy, memcpy, IFUNC_SELECTOR); + +arm_libc_ifunc_hidden_def (__redirect_memcpy, memcpy); +#endif diff --git a/sysdeps/arm/armv7/multiarch/memcpy_arm.S b/sysdeps/arm/armv7/multiarch/memcpy_arm.S new file mode 100644 index 0000000000..e4a9a68c42 --- /dev/null +++ b/sysdeps/arm/armv7/multiarch/memcpy_arm.S @@ -0,0 +1,10 @@ +#define memcpy __memcpy_arm +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(a) +#include "memcpy_impl.S" + +/* These versions of memcpy are defined not to clobber any VFP or NEON + registers so they must always call the ARM variant of the memcpy code. */ +strong_alias (__memcpy_arm, __aeabi_memcpy) +strong_alias (__memcpy_arm, __aeabi_memcpy4) +strong_alias (__memcpy_arm, __aeabi_memcpy8) diff --git a/sysdeps/arm/armv7/multiarch/memcpy_impl.S b/sysdeps/arm/armv7/multiarch/memcpy_impl.S index a1f6266c88..2de172635c 100644 --- a/sysdeps/arm/armv7/multiarch/memcpy_impl.S +++ b/sysdeps/arm/armv7/multiarch/memcpy_impl.S @@ -1,5 +1,5 @@ /* NEON/VFP/ARM version of memcpy optimized for Cortex-A15. - Copyright (C) 2013-2016 Free Software Foundation, Inc. + Copyright (C) 2013-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. 
The GNU C Library is free software; you can redistribute it and/or @@ -226,71 +226,40 @@ #ifdef USE_VFP .macro cpy_line_vfp vreg, base - sfi_breg dst, \ - vstr \vreg, [\B, #\base] - sfi_breg src, \ - vldr \vreg, [\B, #\base] - sfi_breg dst, \ - vstr d0, [\B, #\base + 8] - sfi_breg src, \ - vldr d0, [\B, #\base + 8] - sfi_breg dst, \ - vstr d1, [\B, #\base + 16] - sfi_breg src, \ - vldr d1, [\B, #\base + 16] - sfi_breg dst, \ - vstr d2, [\B, #\base + 24] - sfi_breg src, \ - vldr d2, [\B, #\base + 24] - sfi_breg dst, \ - vstr \vreg, [\B, #\base + 32] - sfi_breg src, \ - vldr \vreg, [\B, #\base + prefetch_lines * 64 - 32] - sfi_breg dst, \ - vstr d0, [\B, #\base + 40] - sfi_breg src, \ - vldr d0, [\B, #\base + 40] - sfi_breg dst, \ - vstr d1, [\B, #\base + 48] - sfi_breg src, \ - vldr d1, [\B, #\base + 48] - sfi_breg dst, \ - vstr d2, [\B, #\base + 56] - sfi_breg src, \ - vldr d2, [\B, #\base + 56] + vstr \vreg, [dst, #\base] + vldr \vreg, [src, #\base] + vstr d0, [dst, #\base + 8] + vldr d0, [src, #\base + 8] + vstr d1, [dst, #\base + 16] + vldr d1, [src, #\base + 16] + vstr d2, [dst, #\base + 24] + vldr d2, [src, #\base + 24] + vstr \vreg, [dst, #\base + 32] + vldr \vreg, [src, #\base + prefetch_lines * 64 - 32] + vstr d0, [dst, #\base + 40] + vldr d0, [src, #\base + 40] + vstr d1, [dst, #\base + 48] + vldr d1, [src, #\base + 48] + vstr d2, [dst, #\base + 56] + vldr d2, [src, #\base + 56] .endm .macro cpy_tail_vfp vreg, base - sfi_breg dst, \ - vstr \vreg, [\B, #\base] - sfi_breg src, \ - vldr \vreg, [\B, #\base] - sfi_breg dst, \ - vstr d0, [\B, #\base + 8] - sfi_breg src, \ - vldr d0, [\B, #\base + 8] - sfi_breg dst, \ - vstr d1, [\B, #\base + 16] - sfi_breg src, \ - vldr d1, [\B, #\base + 16] - sfi_breg dst, \ - vstr d2, [\B, #\base + 24] - sfi_breg src, \ - vldr d2, [\B, #\base + 24] - sfi_breg dst, \ - vstr \vreg, [\B, #\base + 32] - sfi_breg dst, \ - vstr d0, [\B, #\base + 40] - sfi_breg src, \ - vldr d0, [\B, #\base + 40] - sfi_breg dst, \ - vstr d1, [\B, 
#\base + 48] - sfi_breg src, \ - vldr d1, [\B, #\base + 48] - sfi_breg dst, \ - vstr d2, [\B, #\base + 56] - sfi_breg src, \ - vldr d2, [\B, #\base + 56] + vstr \vreg, [dst, #\base] + vldr \vreg, [src, #\base] + vstr d0, [dst, #\base + 8] + vldr d0, [src, #\base + 8] + vstr d1, [dst, #\base + 16] + vldr d1, [src, #\base + 16] + vstr d2, [dst, #\base + 24] + vldr d2, [src, #\base + 24] + vstr \vreg, [dst, #\base + 32] + vstr d0, [dst, #\base + 40] + vldr d0, [src, #\base + 40] + vstr d1, [dst, #\base + 48] + vldr d1, [src, #\base + 48] + vstr d2, [dst, #\base + 56] + vldr d2, [src, #\base + 56] .endm #endif @@ -307,7 +276,7 @@ ENTRY(memcpy) #ifdef USE_NEON /* These need an extra layer of macro just to work around a bug in the assembler's parser when an operand starts with - a {...}. http://sourceware.org/bugzilla/show_bug.cgi?id=15647 + a {...}. https://sourceware.org/bugzilla/show_bug.cgi?id=15647 tracks that bug; it was not fixed as of binutils-2.23.2. */ .macro neon_load_d0 reg vld1.8 {d0}, [\reg]! @@ -316,26 +285,16 @@ ENTRY(memcpy) vst1.8 {d0}, [\reg]! .endm - /* These are used by the NaCl sfi_breg macro. */ - .macro _sfi_breg_dmask_neon_load_d0 reg - _sfi_dmask \reg - .endm - .macro _sfi_breg_dmask_neon_store_d0 reg - _sfi_dmask \reg - .endm - and tmp1, count, #0x38 .macro dispatch_step i - sfi_breg src, neon_load_d0 \B - sfi_breg dst, neon_store_d0 \B + neon_load_d0 src + neon_store_d0 dst .endm dispatch_7_dword tst count, #4 - sfi_breg src, \ - ldrne tmp1, [\B], #4 - sfi_breg dst, \ - strne tmp1, [\B], #4 + ldrne tmp1, [src], #4 + strne tmp1, [dst], #4 #else /* Copy up to 15 full words of data. May not be aligned. */ /* Cannot use VFP for unaligned data. */ @@ -344,23 +303,17 @@ ENTRY(memcpy) add src, src, tmp1 /* Jump directly into the sequence below at the correct offset. 
*/ .macro dispatch_step i - sfi_breg src, \ - ldr tmp1, [\B, #-(\i * 4)] - sfi_breg dst, \ - str tmp1, [\B, #-(\i * 4)] + ldr tmp1, [src, #-(\i * 4)] + str tmp1, [dst, #-(\i * 4)] .endm dispatch_15_word #endif lsls count, count, #31 - sfi_breg src, \ - ldrhcs tmp1, [\B], #2 - sfi_breg src, \ - ldrbne src, [\B] /* Src is dead, use as a scratch. */ - sfi_breg dst, \ - strhcs tmp1, [\B], #2 - sfi_breg dst, \ - strbne src, [\B] + ldrhcs tmp1, [src], #2 + ldrbne src, [src] /* Src is dead, use as a scratch. */ + strhcs tmp1, [dst], #2 + strbne src, [dst] bx lr .Lcpy_not_short: @@ -388,19 +341,13 @@ ENTRY(memcpy) beq 1f rsbs tmp2, tmp2, #0 sub count, count, tmp2, lsr #29 - sfi_breg src, \ - ldrmi tmp1, [\B], #4 - sfi_breg dst, \ - strmi tmp1, [\B], #4 + ldrmi tmp1, [src], #4 + strmi tmp1, [dst], #4 lsls tmp2, tmp2, #2 - sfi_breg src, \ - ldrhcs tmp1, [\B], #2 - sfi_breg src, \ - ldrbne tmp2, [\B], #1 - sfi_breg dst, \ - strhcs tmp1, [\B], #2 - sfi_breg dst, \ - strbne tmp2, [\B], #1 + ldrhcs tmp1, [src], #2 + ldrbne tmp2, [src], #1 + strhcs tmp1, [dst], #2 + strbne tmp2, [dst], #1 1: subs tmp2, count, #64 /* Use tmp2 for count. */ @@ -412,40 +359,24 @@ ENTRY(memcpy) .Lcpy_body_medium: /* Count in tmp2. 
*/ #ifdef USE_VFP 1: - sfi_breg src, \ - vldr d0, [\B, #0] + vldr d0, [src, #0] subs tmp2, tmp2, #64 - sfi_breg src, \ - vldr d1, [\B, #8] - sfi_breg dst, \ - vstr d0, [\B, #0] - sfi_breg src, \ - vldr d0, [\B, #16] - sfi_breg dst, \ - vstr d1, [\B, #8] - sfi_breg src, \ - vldr d1, [\B, #24] - sfi_breg dst, \ - vstr d0, [\B, #16] - sfi_breg src, \ - vldr d0, [\B, #32] - sfi_breg dst, \ - vstr d1, [\B, #24] - sfi_breg src, \ - vldr d1, [\B, #40] - sfi_breg dst, \ - vstr d0, [\B, #32] - sfi_breg src, \ - vldr d0, [\B, #48] - sfi_breg dst, \ - vstr d1, [\B, #40] - sfi_breg src, \ - vldr d1, [\B, #56] - sfi_breg dst, \ - vstr d0, [\B, #48] + vldr d1, [src, #8] + vstr d0, [dst, #0] + vldr d0, [src, #16] + vstr d1, [dst, #8] + vldr d1, [src, #24] + vstr d0, [dst, #16] + vldr d0, [src, #32] + vstr d1, [dst, #24] + vldr d1, [src, #40] + vstr d0, [dst, #32] + vldr d0, [src, #48] + vstr d1, [dst, #40] + vldr d1, [src, #56] + vstr d0, [dst, #48] add src, src, #64 - sfi_breg dst, \ - vstr d1, [\B, #56] + vstr d1, [dst, #56] add dst, dst, #64 bge 1b tst tmp2, #0x3f @@ -456,48 +387,30 @@ ENTRY(memcpy) add dst, dst, tmp1 add src, src, tmp1 .macro dispatch_step i - sfi_breg src, \ - vldr d0, [\B, #-(\i * 8)] - sfi_breg dst, \ - vstr d0, [\B, #-(\i * 8)] + vldr d0, [src, #-(\i * 8)] + vstr d0, [dst, #-(\i * 8)] .endm dispatch_7_dword #else sub src, src, #8 sub dst, dst, #8 1: - sfi_breg src, \ - ldrd A_l, A_h, [\B, #8] - sfi_breg dst, \ - strd A_l, A_h, [\B, #8] - sfi_breg src, \ - ldrd A_l, A_h, [\B, #16] - sfi_breg dst, \ - strd A_l, A_h, [\B, #16] - sfi_breg src, \ - ldrd A_l, A_h, [\B, #24] - sfi_breg dst, \ - strd A_l, A_h, [\B, #24] - sfi_breg src, \ - ldrd A_l, A_h, [\B, #32] - sfi_breg dst, \ - strd A_l, A_h, [\B, #32] - sfi_breg src, \ - ldrd A_l, A_h, [\B, #40] - sfi_breg dst, \ - strd A_l, A_h, [\B, #40] - sfi_breg src, \ - ldrd A_l, A_h, [\B, #48] - sfi_breg dst, \ - strd A_l, A_h, [\B, #48] - sfi_breg src, \ - ldrd A_l, A_h, [\B, #56] - sfi_breg dst, \ - strd A_l, A_h, 
[\B, #56] - sfi_breg src, \ - ldrd A_l, A_h, [\B, #64]! - sfi_breg dst, \ - strd A_l, A_h, [\B, #64]! + ldrd A_l, A_h, [src, #8] + strd A_l, A_h, [dst, #8] + ldrd A_l, A_h, [src, #16] + strd A_l, A_h, [dst, #16] + ldrd A_l, A_h, [src, #24] + strd A_l, A_h, [dst, #24] + ldrd A_l, A_h, [src, #32] + strd A_l, A_h, [dst, #32] + ldrd A_l, A_h, [src, #40] + strd A_l, A_h, [dst, #40] + ldrd A_l, A_h, [src, #48] + strd A_l, A_h, [dst, #48] + ldrd A_l, A_h, [src, #56] + strd A_l, A_h, [dst, #56] + ldrd A_l, A_h, [src, #64]! + strd A_l, A_h, [dst, #64]! subs tmp2, tmp2, #64 bge 1b tst tmp2, #0x3f @@ -524,28 +437,20 @@ ENTRY(memcpy) add dst, dst, tmp1 add src, src, tmp1 .macro dispatch_step i - sfi_breg src, \ - ldrd A_l, A_h, [\B, #-(\i * 8)] - sfi_breg dst, \ - strd A_l, A_h, [\B, #-(\i * 8)] + ldrd A_l, A_h, [src, #-(\i * 8)] + strd A_l, A_h, [dst, #-(\i * 8)] .endm dispatch_7_dword #endif tst tmp2, #4 - sfi_breg src, \ - ldrne tmp1, [\B], #4 - sfi_breg dst, \ - strne tmp1, [\B], #4 + ldrne tmp1, [src], #4 + strne tmp1, [dst], #4 lsls tmp2, tmp2, #31 /* Count (tmp2) now dead. */ - sfi_breg src, \ - ldrhcs tmp1, [\B], #2 - sfi_breg src, \ - ldrbne tmp2, [\B] - sfi_breg dst, \ - strhcs tmp1, [\B], #2 - sfi_breg dst, \ - strbne tmp2, [\B] + ldrhcs tmp1, [src], #2 + ldrbne tmp2, [src] + strhcs tmp1, [dst], #2 + strbne tmp2, [dst] .Ldone: ldr tmp2, [sp], #FRAME_SIZE @@ -565,23 +470,15 @@ ENTRY(memcpy) copy position into a register. This should act like a PLD operation but we won't have to repeat the transfer. 
*/ - sfi_breg src, \ - vldr d3, [\B, #0] - sfi_breg src, \ - vldr d4, [\B, #64] - sfi_breg src, \ - vldr d5, [\B, #128] - sfi_breg src, \ - vldr d6, [\B, #192] - sfi_breg src, \ - vldr d7, [\B, #256] - - sfi_breg src, \ - vldr d0, [\B, #8] - sfi_breg src, \ - vldr d1, [\B, #16] - sfi_breg src, \ - vldr d2, [\B, #24] + vldr d3, [src, #0] + vldr d4, [src, #64] + vldr d5, [src, #128] + vldr d6, [src, #192] + vldr d7, [src, #256] + + vldr d0, [src, #8] + vldr d1, [src, #16] + vldr d2, [src, #24] add src, src, #32 subs tmp2, tmp2, #prefetch_lines * 64 * 2 @@ -606,31 +503,19 @@ ENTRY(memcpy) add src, src, #3 * 64 add dst, dst, #3 * 64 cpy_tail_vfp d6, 0 - sfi_breg dst, \ - vstr d7, [\B, #64] - sfi_breg src, \ - vldr d7, [\B, #64] - sfi_breg dst, \ - vstr d0, [\B, #64 + 8] - sfi_breg src, \ - vldr d0, [\B, #64 + 8] - sfi_breg dst, \ - vstr d1, [\B, #64 + 16] - sfi_breg src, \ - vldr d1, [\B, #64 + 16] - sfi_breg dst, \ - vstr d2, [\B, #64 + 24] - sfi_breg src, \ - vldr d2, [\B, #64 + 24] - sfi_breg dst, \ - vstr d7, [\B, #64 + 32] + vstr d7, [dst, #64] + vldr d7, [src, #64] + vstr d0, [dst, #64 + 8] + vldr d0, [src, #64 + 8] + vstr d1, [dst, #64 + 16] + vldr d1, [src, #64 + 16] + vstr d2, [dst, #64 + 24] + vldr d2, [src, #64 + 24] + vstr d7, [dst, #64 + 32] add src, src, #96 - sfi_breg dst, \ - vstr d0, [\B, #64 + 40] - sfi_breg dst, \ - vstr d1, [\B, #64 + 48] - sfi_breg dst, \ - vstr d2, [\B, #64 + 56] + vstr d0, [dst, #64 + 40] + vstr d1, [dst, #64 + 48] + vstr d2, [dst, #64 + 56] add dst, dst, #128 add tmp2, tmp2, #prefetch_lines * 64 b .Lcpy_body_medium @@ -641,83 +526,59 @@ ENTRY(memcpy) /* Pre-bias src and dst. 
*/ sub src, src, #8 sub dst, dst, #8 - sfi_pld src, #8 - sfi_pld src, #72 + pld [src, #8] + pld [src, #72] subs tmp2, tmp2, #64 - sfi_pld src, #136 - sfi_breg src, \ - ldrd A_l, A_h, [\B, #8] + pld [src, #136] + ldrd A_l, A_h, [src, #8] strd B_l, B_h, [sp, #8] cfi_rel_offset (B_l, 8) cfi_rel_offset (B_h, 12) - sfi_breg src, \ - ldrd B_l, B_h, [\B, #16] + ldrd B_l, B_h, [src, #16] strd C_l, C_h, [sp, #16] cfi_rel_offset (C_l, 16) cfi_rel_offset (C_h, 20) - sfi_breg src, \ - ldrd C_l, C_h, [\B, #24] + ldrd C_l, C_h, [src, #24] strd D_l, D_h, [sp, #24] cfi_rel_offset (D_l, 24) cfi_rel_offset (D_h, 28) - sfi_pld src, #200 - sfi_breg src, \ - ldrd D_l, D_h, [\B, #32]! + pld [src, #200] + ldrd D_l, D_h, [src, #32]! b 1f .p2align 6 2: - sfi_pld src, #232 - sfi_breg dst, \ - strd A_l, A_h, [\B, #40] - sfi_breg src, \ - ldrd A_l, A_h, [\B, #40] - sfi_breg dst, \ - strd B_l, B_h, [\B, #48] - sfi_breg src, \ - ldrd B_l, B_h, [\B, #48] - sfi_breg dst, \ - strd C_l, C_h, [\B, #56] - sfi_breg src, \ - ldrd C_l, C_h, [\B, #56] - sfi_breg dst, \ - strd D_l, D_h, [\B, #64]! - sfi_breg src, \ - ldrd D_l, D_h, [\B, #64]! + pld [src, #232] + strd A_l, A_h, [dst, #40] + ldrd A_l, A_h, [src, #40] + strd B_l, B_h, [dst, #48] + ldrd B_l, B_h, [src, #48] + strd C_l, C_h, [dst, #56] + ldrd C_l, C_h, [src, #56] + strd D_l, D_h, [dst, #64]! + ldrd D_l, D_h, [src, #64]! 
subs tmp2, tmp2, #64 1: - sfi_breg dst, \ - strd A_l, A_h, [\B, #8] - sfi_breg src, \ - ldrd A_l, A_h, [\B, #8] - sfi_breg dst, \ - strd B_l, B_h, [\B, #16] - sfi_breg src, \ - ldrd B_l, B_h, [\B, #16] - sfi_breg dst, \ - strd C_l, C_h, [\B, #24] - sfi_breg src, \ - ldrd C_l, C_h, [\B, #24] - sfi_breg dst, \ - strd D_l, D_h, [\B, #32] - sfi_breg src, \ - ldrd D_l, D_h, [\B, #32] + strd A_l, A_h, [dst, #8] + ldrd A_l, A_h, [src, #8] + strd B_l, B_h, [dst, #16] + ldrd B_l, B_h, [src, #16] + strd C_l, C_h, [dst, #24] + ldrd C_l, C_h, [src, #24] + strd D_l, D_h, [dst, #32] + ldrd D_l, D_h, [src, #32] bcs 2b /* Save the remaining bytes and restore the callee-saved regs. */ - sfi_breg dst, \ - strd A_l, A_h, [\B, #40] + strd A_l, A_h, [dst, #40] add src, src, #40 - sfi_breg dst, \ - strd B_l, B_h, [\B, #48] + strd B_l, B_h, [dst, #48] ldrd B_l, B_h, [sp, #8] cfi_restore (B_l) cfi_restore (B_h) - sfi_breg dst, \ - strd C_l, C_h, [\B, #56] + strd C_l, C_h, [dst, #56] ldrd C_l, C_h, [sp, #16] cfi_restore (C_l) cfi_restore (C_h) - sfi_breg dst, \ - strd D_l, D_h, [\B, #64] + strd D_l, D_h, [dst, #64] ldrd D_l, D_h, [sp, #24] cfi_restore (D_l) cfi_restore (D_h) @@ -734,35 +595,29 @@ ENTRY(memcpy) cfi_remember_state .Lcpy_notaligned: - sfi_pld src - sfi_pld src, #64 + pld [src, #0] + pld [src, #64] /* There's at least 64 bytes to copy, but there is no mutual alignment. */ /* Bring DST to 64-bit alignment. 
*/ lsls tmp2, dst, #29 - sfi_pld src, #(2 * 64) + pld [src, #(2 * 64)] beq 1f rsbs tmp2, tmp2, #0 sub count, count, tmp2, lsr #29 - sfi_breg src, \ - ldrmi tmp1, [\B], #4 - sfi_breg dst, \ - strmi tmp1, [\B], #4 + ldrmi tmp1, [src], #4 + strmi tmp1, [dst], #4 lsls tmp2, tmp2, #2 - sfi_breg src, \ - ldrbne tmp1, [\B], #1 - sfi_breg src, \ - ldrhcs tmp2, [\B], #2 - sfi_breg dst, \ - strbne tmp1, [\B], #1 - sfi_breg dst, \ - strhcs tmp2, [\B], #2 + ldrbne tmp1, [src], #1 + ldrhcs tmp2, [src], #2 + strbne tmp1, [dst], #1 + strhcs tmp2, [dst], #2 1: - sfi_pld src, #(3 * 64) + pld [src, #(3 * 64)] subs count, count, #64 ldrmi tmp2, [sp], #FRAME_SIZE bmi .Ltail63unaligned - sfi_pld src, #(4 * 64) + pld [src, #(4 * 64)] #ifdef USE_NEON /* These need an extra layer of macro just to work around a @@ -775,132 +630,88 @@ ENTRY(memcpy) vst1.8 {\reglist}, [ALIGN (\basereg, 64)]! .endm - /* These are used by the NaCl sfi_breg macro. */ - .macro _sfi_breg_dmask_neon_load_multi reg - _sfi_dmask \reg - .endm - .macro _sfi_breg_dmask_neon_store_multi reg - _sfi_dmask \reg - .endm - - sfi_breg src, neon_load_multi d0-d3, \B - sfi_breg src, neon_load_multi d4-d7, \B + neon_load_multi d0-d3, src + neon_load_multi d4-d7, src subs count, count, #64 bmi 2f 1: - sfi_pld src, #(4 * 64) - sfi_breg dst, neon_store_multi d0-d3, \B - sfi_breg src, neon_load_multi d0-d3, \B - sfi_breg dst, neon_store_multi d4-d7, \B - sfi_breg src, neon_load_multi d4-d7, \B + pld [src, #(4 * 64)] + neon_store_multi d0-d3, dst + neon_load_multi d0-d3, src + neon_store_multi d4-d7, dst + neon_load_multi d4-d7, src subs count, count, #64 bpl 1b 2: - sfi_breg dst, neon_store_multi d0-d3, \B - sfi_breg dst, neon_store_multi d4-d7, \B + neon_store_multi d0-d3, dst + neon_store_multi d4-d7, dst ands count, count, #0x3f #else /* Use an SMS style loop to maximize the I/O bandwidth. */ sub src, src, #4 sub dst, dst, #8 subs tmp2, count, #64 /* Use tmp2 for count. 
*/ - sfi_breg src, \ - ldr A_l, [\B, #4] - sfi_breg src, \ - ldr A_h, [\B, #8] + ldr A_l, [src, #4] + ldr A_h, [src, #8] strd B_l, B_h, [sp, #8] cfi_rel_offset (B_l, 8) cfi_rel_offset (B_h, 12) - sfi_breg src, \ - ldr B_l, [\B, #12] - sfi_breg src, \ - ldr B_h, [\B, #16] + ldr B_l, [src, #12] + ldr B_h, [src, #16] strd C_l, C_h, [sp, #16] cfi_rel_offset (C_l, 16) cfi_rel_offset (C_h, 20) - sfi_breg src, \ - ldr C_l, [\B, #20] - sfi_breg src, \ - ldr C_h, [\B, #24] + ldr C_l, [src, #20] + ldr C_h, [src, #24] strd D_l, D_h, [sp, #24] cfi_rel_offset (D_l, 24) cfi_rel_offset (D_h, 28) - sfi_breg src, \ - ldr D_l, [\B, #28] - sfi_breg src, \ - ldr D_h, [\B, #32]! + ldr D_l, [src, #28] + ldr D_h, [src, #32]! b 1f .p2align 6 2: - sfi_pld src, #(5 * 64) - (32 - 4) - sfi_breg dst, \ - strd A_l, A_h, [\B, #40] - sfi_breg src, \ - ldr A_l, [\B, #36] - sfi_breg src, \ - ldr A_h, [\B, #40] - sfi_breg dst, \ - strd B_l, B_h, [\B, #48] - sfi_breg src, \ - ldr B_l, [\B, #44] - sfi_breg src, \ - ldr B_h, [\B, #48] - sfi_breg dst, \ - strd C_l, C_h, [\B, #56] - sfi_breg src, \ - ldr C_l, [\B, #52] - sfi_breg src, \ - ldr C_h, [\B, #56] - sfi_breg dst, \ - strd D_l, D_h, [\B, #64]! - sfi_breg src, \ - ldr D_l, [\B, #60] - sfi_breg src, \ - ldr D_h, [\B, #64]! + pld [src, #(5 * 64) - (32 - 4)] + strd A_l, A_h, [dst, #40] + ldr A_l, [src, #36] + ldr A_h, [src, #40] + strd B_l, B_h, [dst, #48] + ldr B_l, [src, #44] + ldr B_h, [src, #48] + strd C_l, C_h, [dst, #56] + ldr C_l, [src, #52] + ldr C_h, [src, #56] + strd D_l, D_h, [dst, #64]! + ldr D_l, [src, #60] + ldr D_h, [src, #64]! 
subs tmp2, tmp2, #64 1: - sfi_breg dst, \ - strd A_l, A_h, [\B, #8] - sfi_breg src, \ - ldr A_l, [\B, #4] - sfi_breg src, \ - ldr A_h, [\B, #8] - sfi_breg dst, \ - strd B_l, B_h, [\B, #16] - sfi_breg src, \ - ldr B_l, [\B, #12] - sfi_breg src, \ - ldr B_h, [\B, #16] - sfi_breg dst, \ - strd C_l, C_h, [\B, #24] - sfi_breg src, \ - ldr C_l, [\B, #20] - sfi_breg src, \ - ldr C_h, [\B, #24] - sfi_breg dst, \ - strd D_l, D_h, [\B, #32] - sfi_breg src, \ - ldr D_l, [\B, #28] - sfi_breg src, \ - ldr D_h, [\B, #32] + strd A_l, A_h, [dst, #8] + ldr A_l, [src, #4] + ldr A_h, [src, #8] + strd B_l, B_h, [dst, #16] + ldr B_l, [src, #12] + ldr B_h, [src, #16] + strd C_l, C_h, [dst, #24] + ldr C_l, [src, #20] + ldr C_h, [src, #24] + strd D_l, D_h, [dst, #32] + ldr D_l, [src, #28] + ldr D_h, [src, #32] bcs 2b /* Save the remaining bytes and restore the callee-saved regs. */ - sfi_breg dst, \ - strd A_l, A_h, [\B, #40] + strd A_l, A_h, [dst, #40] add src, src, #36 - sfi_breg dst, \ - strd B_l, B_h, [\B, #48] + strd B_l, B_h, [dst, #48] ldrd B_l, B_h, [sp, #8] cfi_restore (B_l) cfi_restore (B_h) - sfi_breg dst, \ - strd C_l, C_h, [\B, #56] + strd C_l, C_h, [dst, #56] ldrd C_l, C_h, [sp, #16] cfi_restore (C_l) cfi_restore (C_h) - sfi_breg dst, \ - strd D_l, D_h, [\B, #64] + strd D_l, D_h, [dst, #64] ldrd D_l, D_h, [sp, #24] cfi_restore (D_l) cfi_restore (D_h) diff --git a/sysdeps/arm/armv7/multiarch/memcpy_neon.S b/sysdeps/arm/armv7/multiarch/memcpy_neon.S index e60d1cc0e1..1a8d8bbe9e 100644 --- a/sysdeps/arm/armv7/multiarch/memcpy_neon.S +++ b/sysdeps/arm/armv7/multiarch/memcpy_neon.S @@ -1,8 +1,8 @@ -#ifdef __ARM_NEON__ -/* Under __ARM_NEON__, this file defines memcpy directly. */ -libc_hidden_builtin_def (memcpy) -#else +/* For __ARM_NEON__ this file defines memcpy. 
*/ +#ifndef __ARM_NEON__ # define memcpy __memcpy_neon +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(a) #endif #define MEMCPY_NEON diff --git a/sysdeps/arm/armv7/multiarch/memcpy_vfp.S b/sysdeps/arm/armv7/multiarch/memcpy_vfp.S index e008c041ed..d1e9ede439 100644 --- a/sysdeps/arm/armv7/multiarch/memcpy_vfp.S +++ b/sysdeps/arm/armv7/multiarch/memcpy_vfp.S @@ -1,7 +1,9 @@ -/* Under __ARM_NEON__, memcpy_neon.S defines memcpy directly +/* Under __ARM_NEON__ memcpy_neon.S defines memcpy directly and the __memcpy_vfp code will never be used. */ #ifndef __ARM_NEON__ # define MEMCPY_VFP # define memcpy __memcpy_vfp +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(a) # include "memcpy_impl.S" #endif diff --git a/sysdeps/arm/armv7/multiarch/rtld-memchr.S b/sysdeps/arm/armv7/multiarch/rtld-memchr.S new file mode 100644 index 0000000000..ae8e5f04c4 --- /dev/null +++ b/sysdeps/arm/armv7/multiarch/rtld-memchr.S @@ -0,0 +1 @@ +#include <sysdeps/arm/armv6t2/memchr.S> diff --git a/sysdeps/arm/armv7/multiarch/rtld-memcpy.S b/sysdeps/arm/armv7/multiarch/rtld-memcpy.S new file mode 100644 index 0000000000..ca2387531b --- /dev/null +++ b/sysdeps/arm/armv7/multiarch/rtld-memcpy.S @@ -0,0 +1 @@ +#include <sysdeps/arm/armv7/multiarch/memcpy_impl.S> diff --git a/sysdeps/arm/armv7/strcmp.S b/sysdeps/arm/armv7/strcmp.S index 5bcaf21ee2..2626fdf72e 100644 --- a/sysdeps/arm/armv7/strcmp.S +++ b/sysdeps/arm/armv7/strcmp.S @@ -1,5 +1,5 @@ /* strcmp implementation for ARMv7-A, optimized for Cortex-A15. - Copyright (C) 2012-2016 Free Software Foundation, Inc. + Copyright (C) 2012-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -83,8 +83,6 @@ #define syndrome tmp2 -#ifndef NO_THUMB -/* This code is best on Thumb. */ .thumb /* In Thumb code we can't use MVN with a register shift, but we do have ORN. 
*/ @@ -94,27 +92,6 @@ .macro apply_mask data_reg, mask_reg orn \data_reg, \data_reg, \mask_reg .endm -#else -/* In ARM code we don't have ORN, but we can use MVN with a register shift. */ -.macro prepare_mask mask_reg, nbits_reg - mvn \mask_reg, const_m1, S2HI \nbits_reg -.endm -.macro apply_mask data_reg, mask_reg - orr \data_reg, \data_reg, \mask_reg -.endm - -/* These clobber the condition codes, which the real Thumb cbz/cbnz - instructions do not. But it doesn't matter for any of the uses here. */ -.macro cbz reg, label - cmp \reg, #0 - beq \label -.endm -.macro cbnz reg, label - cmp \reg, #0 - bne \label -.endm -#endif - /* Macro to compute and return the result value for word-aligned cases. */ @@ -178,10 +155,8 @@ #endif ENTRY (strcmp) #if STRCMP_PRECHECK == 1 - sfi_breg src1, \ - ldrb r2, [\B] - sfi_breg src2, \ - ldrb r3, [\B] + ldrb r2, [src1] + ldrb r3, [src2] cmp r2, #1 it cs cmpcs r2, r3 @@ -211,11 +186,9 @@ ENTRY (strcmp) and tmp2, tmp1, #3 bic src2, src2, #7 lsl tmp2, tmp2, #3 /* Bytes -> bits. */ - sfi_breg src1, \ - ldrd data1a, data1b, [\B], #16 + ldrd data1a, data1b, [src1], #16 tst tmp1, #4 - sfi_breg src2, \ - ldrd data2a, data2b, [\B], #16 + ldrd data2a, data2b, [src2], #16 prepare_mask tmp1, tmp2 apply_mask data1a, tmp1 apply_mask data2a, tmp1 @@ -231,10 +204,8 @@ ENTRY (strcmp) .p2align 5,,12 /* Don't start in the tail bytes of a cache line. */ .p2align 2 /* Always word aligned. 
*/ .Lloop_aligned8: - sfi_breg src1, \ - ldrd data1a, data1b, [\B], #16 - sfi_breg src2, \ - ldrd data2a, data2b, [\B], #16 + ldrd data1a, data1b, [src1], #16 + ldrd data2a, data2b, [src2], #16 .Lstart_realigned8: uadd8 syndrome_b, data1a, const_m1 /* Only want GE bits, */ eor syndrome_a, data1a, data2a @@ -245,10 +216,8 @@ ENTRY (strcmp) sel syndrome_b, syndrome_b, const_m1 cbnz syndrome_b, .Ldiff_in_b - sfi_breg src1, \ - ldrd data1a, data1b, [\B, #-8] - sfi_breg src2, \ - ldrd data2a, data2b, [\B, #-8] + ldrd data1a, data1b, [src1, #-8] + ldrd data2a, data2b, [src2, #-8] uadd8 syndrome_b, data1a, const_m1 /* Only want GE bits, */ eor syndrome_a, data1a, data2a sel syndrome_a, syndrome_a, const_m1 @@ -279,19 +248,15 @@ ENTRY (strcmp) /* Unrolled by a factor of 2, to reduce the number of post-increment operations. */ .Lloop_aligned4: - sfi_breg src1, \ - ldr data1, [\B], #8 - sfi_breg src2, \ - ldr data2, [\B], #8 + ldr data1, [src1], #8 + ldr data2, [src2], #8 .Lstart_realigned4: uadd8 syndrome, data1, const_m1 /* Only need GE bits. */ eor syndrome, data1, data2 sel syndrome, syndrome, const_m1 cbnz syndrome, .Laligned4_done - sfi_breg src1, \ - ldr data1, [\B, #-4] - sfi_breg src2, \ - ldr data2, [\B, #-4] + ldr data1, [src1, #-4] + ldr data2, [src2, #-4] uadd8 syndrome, data1, const_m1 eor syndrome, data1, data2 sel syndrome, syndrome, const_m1 @@ -307,11 +272,9 @@ ENTRY (strcmp) masking off the unwanted loaded data to prevent a difference. */ lsl tmp1, tmp1, #3 /* Bytes -> bits. 
*/ bic src1, src1, #3 - sfi_breg src1, \ - ldr data1, [\B], #8 + ldr data1, [src1], #8 bic src2, src2, #3 - sfi_breg src2, \ - ldr data2, [\B], #8 + ldr data2, [src2], #8 prepare_mask tmp1, tmp1 apply_mask data1, tmp1 @@ -324,30 +287,26 @@ ENTRY (strcmp) sub src2, src2, tmp1 bic src1, src1, #3 lsls tmp1, tmp1, #31 - sfi_breg src1, \ - ldr data1, [\B], #4 + ldr data1, [src1], #4 beq .Laligned_m2 bcs .Laligned_m1 #if STRCMP_PRECHECK == 0 - sfi_breg src2, \ - ldrb data2, [\B, #1] + ldrb data2, [src2, #1] uxtb tmp1, data1, ror #BYTE1_OFFSET subs tmp1, tmp1, data2 bne .Lmisaligned_exit cbz data2, .Lmisaligned_exit .Laligned_m2: - sfi_breg src2, \ - ldrb data2, [\B, #2] + ldrb data2, [src2, #2] uxtb tmp1, data1, ror #BYTE2_OFFSET subs tmp1, tmp1, data2 bne .Lmisaligned_exit cbz data2, .Lmisaligned_exit .Laligned_m1: - sfi_breg src2, \ - ldrb data2, [\B, #3] + ldrb data2, [src2, #3] uxtb tmp1, data1, ror #BYTE3_OFFSET subs tmp1, tmp1, data2 bne .Lmisaligned_exit @@ -356,16 +315,14 @@ ENTRY (strcmp) #else /* STRCMP_PRECHECK */ /* If we've done the pre-check, then we don't need to check the first byte again here. */ - sfi_breg src2, \ - ldrb data2, [\B, #2] + ldrb data2, [src2, #2] uxtb tmp1, data1, ror #BYTE2_OFFSET subs tmp1, tmp1, data2 bne .Lmisaligned_exit cbz data2, .Lmisaligned_exit .Laligned_m2: - sfi_breg src2, \ - ldrb data2, [\B, #3] + ldrb data2, [src2, #3] uxtb tmp1, data1, ror #BYTE3_OFFSET subs tmp1, tmp1, data2 bne .Lmisaligned_exit @@ -391,13 +348,11 @@ ENTRY (strcmp) cfi_restore_state /* src1 is word aligned, but src2 has no common alignment with it. */ - sfi_breg src1, \ - ldr data1, [\B], #4 + ldr data1, [src1], #4 lsls tmp1, src2, #31 /* C=src2[1], Z=src2[0]. */ bic src2, src2, #3 - sfi_breg src2, \ - ldr data2, [\B], #4 + ldr data2, [src2], #4 bhi .Loverlap1 /* C=1, Z=0 => src2[1:0] = 0b11. */ bcs .Loverlap2 /* C=1, Z=1 => src2[1:0] = 0b10. 
*/ @@ -409,13 +364,11 @@ ENTRY (strcmp) sel syndrome, syndrome, const_m1 bne 4f cbnz syndrome, 5f - sfi_breg src2, \ - ldr data2, [\B], #4 + ldr data2, [src2], #4 eor tmp1, tmp1, data1 cmp tmp1, data2, S2HI #24 bne 6f - sfi_breg src1, \ - ldr data1, [\B], #4 + ldr data1, [src1], #4 b .Loverlap3 4: S2LO data2, data2, #8 @@ -427,8 +380,7 @@ ENTRY (strcmp) /* We can only get here if the MSB of data1 contains 0, so fast-path the exit. */ - sfi_breg src2, \ - ldrb result, [\B] + ldrb result, [src2] ldrd r4, r5, [sp], #16 cfi_remember_state cfi_def_cfa_offset (0) @@ -454,13 +406,11 @@ ENTRY (strcmp) sel syndrome, syndrome, const_m1 bne 4f cbnz syndrome, 5f - sfi_breg src2, \ - ldr data2, [\B], #4 + ldr data2, [src2], #4 eor tmp1, tmp1, data1 cmp tmp1, data2, S2HI #16 bne 6f - sfi_breg src1, \ - ldr data1, [\B], #4 + ldr data1, [src1], #4 b .Loverlap2 4: S2LO data2, data2, #16 @@ -469,8 +419,7 @@ ENTRY (strcmp) ands syndrome, syndrome, const_m1, S2LO #16 bne .Lstrcmp_done_equal - sfi_breg src2, \ - ldrh data2, [\B] + ldrh data2, [src2] S2LO data1, data1, #16 #ifdef __ARM_BIG_ENDIAN lsl data2, data2, #16 @@ -490,13 +439,11 @@ ENTRY (strcmp) sel syndrome, syndrome, const_m1 bne 4f cbnz syndrome, 5f - sfi_breg src2, \ - ldr data2, [\B], #4 + ldr data2, [src2], #4 eor tmp1, tmp1, data1 cmp tmp1, data2, S2HI #8 bne 6f - sfi_breg src1, \ - ldr data1, [\B], #4 + ldr data1, [src1], #4 b .Loverlap1 4: S2LO data2, data2, #24 @@ -504,8 +451,7 @@ ENTRY (strcmp) 5: tst syndrome, #LSB bne .Lstrcmp_done_equal - sfi_breg src2, \ - ldr data2, [\B] + ldr data2, [src2] 6: S2LO data1, data1, #8 bic data2, data2, #MSB |