Diffstat (limited to 'sysdeps/arm/armv7')
-rw-r--r--  sysdeps/arm/armv7/multiarch/Makefile           |   3
-rw-r--r--  sysdeps/arm/armv7/multiarch/ifunc-impl-list.c  |   7
-rw-r--r--  sysdeps/arm/armv7/multiarch/ifunc-memchr.h     |  28
-rw-r--r--  sysdeps/arm/armv7/multiarch/ifunc-memcpy.h     |  37
-rw-r--r--  sysdeps/arm/armv7/multiarch/memchr.c           |  35
-rw-r--r--  sysdeps/arm/armv7/multiarch/memchr_neon.S      | 202
-rw-r--r--  sysdeps/arm/armv7/multiarch/memchr_noneon.S    |   5
-rw-r--r--  sysdeps/arm/armv7/multiarch/memcpy.S           |  76
-rw-r--r--  sysdeps/arm/armv7/multiarch/memcpy.c           |  35
-rw-r--r--  sysdeps/arm/armv7/multiarch/memcpy_arm.S       |  10
-rw-r--r--  sysdeps/arm/armv7/multiarch/memcpy_impl.S      | 585
-rw-r--r--  sysdeps/arm/armv7/multiarch/memcpy_neon.S      |   8
-rw-r--r--  sysdeps/arm/armv7/multiarch/memcpy_vfp.S       |   4
-rw-r--r--  sysdeps/arm/armv7/multiarch/rtld-memchr.S      |   1
-rw-r--r--  sysdeps/arm/armv7/multiarch/rtld-memcpy.S      |   1
-rw-r--r--  sysdeps/arm/armv7/strcmp.S                     | 118
16 files changed, 599 insertions(+), 556 deletions(-)
diff --git a/sysdeps/arm/armv7/multiarch/Makefile b/sysdeps/arm/armv7/multiarch/Makefile
index e834cc937f..6e5851f897 100644
--- a/sysdeps/arm/armv7/multiarch/Makefile
+++ b/sysdeps/arm/armv7/multiarch/Makefile
@@ -1,3 +1,4 @@
ifeq ($(subdir),string)
-sysdep_routines += memcpy_neon memcpy_vfp
+sysdep_routines += memcpy_neon memcpy_vfp memchr_neon memcpy_arm \
+ memchr_noneon
endif
diff --git a/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c b/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c
index d7088f2a22..48e43da66e 100644
--- a/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c
+++ b/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c
@@ -1,5 +1,5 @@
/* Enumerate available IFUNC implementations of a function. ARM version.
- Copyright (C) 2013-2016 Free Software Foundation, Inc.
+ Copyright (C) 2013-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -34,6 +34,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
bool use_neon = true;
#ifdef __ARM_NEON__
# define __memcpy_neon memcpy
+# define __memchr_neon memchr
#else
use_neon = (GLRO(dl_hwcap) & HWCAP_ARM_NEON) != 0;
#endif
@@ -52,5 +53,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
#endif
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_arm));
+ IFUNC_IMPL (i, name, memchr,
+ IFUNC_IMPL_ADD (array, i, memchr, use_neon, __memchr_neon)
+ IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_noneon));
+
return i;
}
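
The entries registered here follow the shape below, a simplified model of
struct libc_ifunc_impl (the real definition lives in include/ifunc-impl-list.h).
The string tests and benchtests walk this array and exercise every entry whose
usable flag is set, which is how use_neon keeps __memchr_neon from being
tested on cores without NEON:

#include <stdbool.h>

/* Simplified model of the entry that IFUNC_IMPL_ADD appends.  */
struct libc_ifunc_impl
{
  const char *name;	/* Implementation name, e.g. "__memchr_neon".  */
  void (*fn) (void);	/* Address of the implementation.  */
  bool usable;		/* False when the hwcap check fails.  */
};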
diff --git a/sysdeps/arm/armv7/multiarch/ifunc-memchr.h b/sysdeps/arm/armv7/multiarch/ifunc-memchr.h
new file mode 100644
index 0000000000..75495824f4
--- /dev/null
+++ b/sysdeps/arm/armv7/multiarch/ifunc-memchr.h
@@ -0,0 +1,28 @@
+/* Common definition for memchr resolver.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+__typeof (REDIRECT_NAME) OPTIMIZE (neon) attribute_hidden;
+__typeof (REDIRECT_NAME) OPTIMIZE (noneon) attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (int hwcap)
+{
+ if (hwcap & HWCAP_ARM_NEON)
+ return OPTIMIZE (neon);
+ return OPTIMIZE (noneon);
+}
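
The selector is deliberately just a function returning a function pointer;
arm-ifunc.h wraps it into an actual IFUNC resolver.  For readers unfamiliar
with the mechanism, here is a minimal free-standing sketch of the same
dispatch using GCC's ifunc attribute (the my_memchr_* names are hypothetical;
the HWCAP_ARM_NEON value matches <asm/hwcap.h>).  On ARM the dynamic linker
hands the HWCAP bits to the resolver as its first argument, which is why
IFUNC_SELECTOR takes hwcap as a parameter instead of calling getauxval:

#include <stddef.h>

#define HWCAP_ARM_NEON (1 << 12)	/* From <asm/hwcap.h>.  */

/* Two hypothetical variants sharing memchr's signature.  */
void *my_memchr_neon (const void *s, int c, size_t n);
void *my_memchr_noneon (const void *s, int c, size_t n);

/* Resolver: run once by the dynamic linker during relocation.  */
static void *(*my_memchr_resolver (unsigned long hwcap))
  (const void *, int, size_t)
{
  if (hwcap & HWCAP_ARM_NEON)
    return my_memchr_neon;
  return my_memchr_noneon;
}

/* Public symbol; every call lands on the variant chosen above.  */
void *my_memchr (const void *s, int c, size_t n)
  __attribute__ ((ifunc ("my_memchr_resolver")));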
diff --git a/sysdeps/arm/armv7/multiarch/ifunc-memcpy.h b/sysdeps/arm/armv7/multiarch/ifunc-memcpy.h
new file mode 100644
index 0000000000..7e6f73ff4d
--- /dev/null
+++ b/sysdeps/arm/armv7/multiarch/ifunc-memcpy.h
@@ -0,0 +1,37 @@
+/* Common definition for memcpy resolver.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifdef __SOFTFP__
+__typeof (REDIRECT_NAME) OPTIMIZE (arm) attribute_hidden;
+#endif
+__typeof (REDIRECT_NAME) OPTIMIZE (vfp) attribute_hidden;
+__typeof (REDIRECT_NAME) OPTIMIZE (neon) attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (int hwcap)
+{
+ if (hwcap & HWCAP_ARM_NEON)
+ return OPTIMIZE (neon);
+#ifdef __SOFTFP__
+ if (hwcap & HWCAP_ARM_VFP)
+ return OPTIMIZE (vfp);
+ return OPTIMIZE (arm);
+#else
+ return OPTIMIZE (vfp);
+#endif
+}
diff --git a/sysdeps/arm/armv7/multiarch/memchr.c b/sysdeps/arm/armv7/multiarch/memchr.c
new file mode 100644
index 0000000000..ff1cc5d203
--- /dev/null
+++ b/sysdeps/arm/armv7/multiarch/memchr.c
@@ -0,0 +1,35 @@
+/* Multiple versions of memchr.
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* For __ARM_NEON__ memchr_neon.S defines memchr directly and ifunc
+ is not used. */
+#if IS_IN (libc) && !defined (__ARM_NEON__)
+# define memchr __redirect_memchr
+# include <string.h>
+# undef memchr
+
+# include <arm-ifunc.h>
+
+# define SYMBOL_NAME memchr
+# include "ifunc-memchr.h"
+
+arm_libc_ifunc_redirected (__redirect_memchr, memchr, IFUNC_SELECTOR);
+
+arm_libc_ifunc_hidden_def (__redirect_memchr, memchr);
+#endif
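
A note on the __redirect_memchr dance above: <string.h> must be included for
memchr's prototype, but its declaration would collide with the IFUNC
definition that arm_libc_ifunc_redirected emits under the same name.
Temporarily renaming memchr while the header is read avoids the clash, and
taking __typeof of the redirected name keeps every variant's prototype in
sync with the header.  The trick in isolation (a sketch, with the resolver
machinery omitted):

#define memchr __redirect_memchr
#include <string.h>
#undef memchr

/* Each variant inherits the header's exact prototype.  */
extern __typeof (__redirect_memchr) __memchr_neon;
extern __typeof (__redirect_memchr) __memchr_noneon;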
diff --git a/sysdeps/arm/armv7/multiarch/memchr_neon.S b/sysdeps/arm/armv7/multiarch/memchr_neon.S
new file mode 100644
index 0000000000..6fbf9b8898
--- /dev/null
+++ b/sysdeps/arm/armv7/multiarch/memchr_neon.S
@@ -0,0 +1,202 @@
+/* memchr implemented using NEON.
+ Copyright (C) 2011-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* For __ARM_NEON__ this file defines memchr. */
+#ifndef __ARM_NEON__
+# define memchr __memchr_neon
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(a)
+#endif
+
+ .arch armv7-a
+ .fpu neon
+
+
+/* Arguments */
+#define srcin r0
+#define chrin r1
+#define cntin r2
+
+/* Retval */
+#define result r0 /* Live range does not overlap with srcin */
+
+/* Working registers */
+#define src r1 /* Live range does not overlap with chrin */
+#define tmp r3
+#define synd r0 /* No overlap with srcin or result */
+#define soff r12
+
+/* Working NEON registers */
+#define vrepchr q0
+#define vdata0 q1
+#define vdata0_0 d2 /* Lower half of vdata0 */
+#define vdata0_1 d3 /* Upper half of vdata0 */
+#define vdata1 q2
+#define vdata1_0 d4 /* Lower half of vdata1 */
+#define vdata1_1 d5 /* Upper half of vdata1 */
+#define vrepmask q3
+#define vrepmask0 d6
+#define vrepmask1 d7
+#define vend q4
+#define vend0 d8
+#define vend1 d9
+
+/*
+ * Core algorithm:
+ *
+ * For each 32-byte chunk we calculate a 32-bit syndrome value, with one bit per
+ * byte. Each bit is set if the relevant byte matched the requested character
+ * and cleared otherwise. Since the bits in the syndrome reflect exactly the
+ * order in which bytes occur in the original string, counting trailing zeros
+ * identifies exactly which byte has matched.
+ */
+
+ .thumb_func
+ .p2align 4,,15
+
+ENTRY(memchr)
+ /* Use a simple loop if there are fewer than 8 bytes to search. */
+ cmp cntin, #7
+ bhi .Llargestr
+ and chrin, chrin, #0xff
+
+.Lsmallstr:
+ subs cntin, cntin, #1
+ blo .Lnotfound /* Return not found if reached end. */
+ ldrb tmp, [srcin], #1
+ cmp tmp, chrin
+ bne .Lsmallstr /* Loop again if not found. */
+ /* Otherwise fixup address and return. */
+ sub result, srcin, #1
+ bx lr
+
+
+.Llargestr:
+ vdup.8 vrepchr, chrin /* Duplicate char across all lanes. */
+ /*
+ * Magic constant 0x8040201008040201 allows us to identify which lane
+ * matches the requested byte.
+ */
+ movw tmp, #0x0201
+ movt tmp, #0x0804
+ lsl soff, tmp, #4
+ vmov vrepmask0, tmp, soff
+ vmov vrepmask1, tmp, soff
+ /* Work with aligned 32-byte chunks */
+ bic src, srcin, #31
+ ands soff, srcin, #31
+ beq .Lloopintro /* Go straight to main loop if it's aligned. */
+
+ /*
+ * Input string is not 32-byte aligned. We calculate the syndrome
+ * value for the aligned 32 bytes block containing the first bytes
+ * and mask the irrelevant part.
+ */
+ vld1.8 {vdata0, vdata1}, [src:256]!
+ sub tmp, soff, #32
+ adds cntin, cntin, tmp
+ vceq.i8 vdata0, vdata0, vrepchr
+ vceq.i8 vdata1, vdata1, vrepchr
+ vand vdata0, vdata0, vrepmask
+ vand vdata1, vdata1, vrepmask
+ vpadd.i8 vdata0_0, vdata0_0, vdata0_1
+ vpadd.i8 vdata1_0, vdata1_0, vdata1_1
+ vpadd.i8 vdata0_0, vdata0_0, vdata1_0
+ vpadd.i8 vdata0_0, vdata0_0, vdata0_0
+ vmov synd, vdata0_0[0]
+
+ /* Clear the soff lower bits */
+ lsr synd, synd, soff
+ lsl synd, synd, soff
+ /* The first block can also be the last */
+ bls .Lmasklast
+ /* Have we found something already? */
+ cbnz synd, .Ltail
+
+
+.Lloopintro:
+ vpush {vend}
+ /* 264/265 correspond to d8/d9 for q4 */
+ cfi_adjust_cfa_offset (16)
+ cfi_rel_offset (264, 0)
+ cfi_rel_offset (265, 8)
+ .p2align 3,,7
+.Lloop:
+ vld1.8 {vdata0, vdata1}, [src:256]!
+ subs cntin, cntin, #32
+ vceq.i8 vdata0, vdata0, vrepchr
+ vceq.i8 vdata1, vdata1, vrepchr
+ /* If we're out of data we finish regardless of the result. */
+ bls .Lend
+ /* Use a fast check for the termination condition. */
+ vorr vend, vdata0, vdata1
+ vorr vend0, vend0, vend1
+ vmov synd, tmp, vend0
+ orrs synd, synd, tmp
+ /* We're not out of data, loop if we haven't found the character. */
+ beq .Lloop
+
+.Lend:
+ vpop {vend}
+ cfi_adjust_cfa_offset (-16)
+ cfi_restore (264)
+ cfi_restore (265)
+
+ /* Termination condition found, let's calculate the syndrome value. */
+ vand vdata0, vdata0, vrepmask
+ vand vdata1, vdata1, vrepmask
+ vpadd.i8 vdata0_0, vdata0_0, vdata0_1
+ vpadd.i8 vdata1_0, vdata1_0, vdata1_1
+ vpadd.i8 vdata0_0, vdata0_0, vdata1_0
+ vpadd.i8 vdata0_0, vdata0_0, vdata0_0
+ vmov synd, vdata0_0[0]
+ cbz synd, .Lnotfound
+ bhi .Ltail /* Uses the condition code from
+ subs cntin, cntin, #32 above. */
+
+
+.Lmasklast:
+ /* Clear the (-cntin) upper bits to avoid out-of-bounds matches. */
+ neg cntin, cntin
+ lsl synd, synd, cntin
+ lsrs synd, synd, cntin
+ it eq
+ moveq src, #0 /* If no match, set src to 0 so the retval is 0. */
+
+
+.Ltail:
+ /* Count the trailing zeros using bit reversing */
+ rbit synd, synd
+ /* Compensate the last post-increment */
+ sub src, src, #32
+ /* Count the leading zeros */
+ clz synd, synd
+ /* Compute the potential result and return */
+ add result, src, synd
+ bx lr
+
+
+.Lnotfound:
+ /* Set result to NULL if not found and return */
+ mov result, #0
+ bx lr
+
+END(memchr)
+libc_hidden_builtin_def (memchr)
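
A portable C model of the syndrome computation may help when reading the NEON
sequence above.  It is illustrative only: the assembly gets the same 32-bit
syndrome by comparing 32 bytes at once with vceq, masking with the repeating
0x8040201008040201 constant so each byte's match becomes a distinct bit
within its 8-byte group, and folding with three vpadd steps; rbit plus clz
then stands in for the trailing-zero count, since Thumb has no ctz
instruction.

#include <stddef.h>
#include <stdint.h>

/* Return the first byte equal to C in the 32-byte CHUNK, or NULL.
   Bit i of the syndrome is set iff chunk[i] == c, so the index of
   the first match is the number of trailing zeros.  */
static const unsigned char *
chunk32_first_match (const unsigned char *chunk, unsigned char c)
{
  uint32_t synd = 0;
  for (int i = 0; i < 32; i++)
    synd |= (uint32_t) (chunk[i] == c) << i;
  if (synd == 0)
    return NULL;
  return chunk + __builtin_ctz (synd);
}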
diff --git a/sysdeps/arm/armv7/multiarch/memchr_noneon.S b/sysdeps/arm/armv7/multiarch/memchr_noneon.S
new file mode 100644
index 0000000000..b1fb54018d
--- /dev/null
+++ b/sysdeps/arm/armv7/multiarch/memchr_noneon.S
@@ -0,0 +1,5 @@
+#define memchr __memchr_noneon
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/arm/armv6t2/memchr.S>
diff --git a/sysdeps/arm/armv7/multiarch/memcpy.S b/sysdeps/arm/armv7/multiarch/memcpy.S
deleted file mode 100644
index 01ba9e5733..0000000000
--- a/sysdeps/arm/armv7/multiarch/memcpy.S
+++ /dev/null
@@ -1,76 +0,0 @@
-/* Multiple versions of memcpy
- All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2013-2016 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-/* Thumb requires excess IT instructions here. */
-#define NO_THUMB
-#include <sysdep.h>
-#include <rtld-global-offsets.h>
-
-#if IS_IN (libc)
-/* Under __ARM_NEON__, memcpy_neon.S defines the name memcpy. */
-# ifndef __ARM_NEON__
- .text
-ENTRY(memcpy)
- .type memcpy, %gnu_indirect_function
-# ifdef __SOFTFP__
- ldr r1, .Lmemcpy_arm
- tst r0, #HWCAP_ARM_VFP
- ldrne r1, .Lmemcpy_vfp
-# else
- ldr r1, .Lmemcpy_vfp
-# endif
- tst r0, #HWCAP_ARM_NEON
- ldrne r1, .Lmemcpy_neon
-1:
- add r0, r1, pc
- DO_RET(lr)
-
-# ifdef __SOFTFP__
-.Lmemcpy_arm:
- .long C_SYMBOL_NAME(__memcpy_arm) - 1b - PC_OFS
-# endif
-.Lmemcpy_neon:
- .long C_SYMBOL_NAME(__memcpy_neon) - 1b - PC_OFS
-.Lmemcpy_vfp:
- .long C_SYMBOL_NAME(__memcpy_vfp) - 1b - PC_OFS
-
-END(memcpy)
-
-libc_hidden_builtin_def (memcpy)
-#endif /* Not __ARM_NEON__. */
-
-/* These versions of memcpy are defined not to clobber any VFP or NEON
- registers so they must always call the ARM variant of the memcpy code. */
-strong_alias (__memcpy_arm, __aeabi_memcpy)
-strong_alias (__memcpy_arm, __aeabi_memcpy4)
-strong_alias (__memcpy_arm, __aeabi_memcpy8)
-libc_hidden_def (__memcpy_arm)
-
-#undef libc_hidden_builtin_def
-#define libc_hidden_builtin_def(name)
-#undef weak_alias
-#define weak_alias(x, y)
-#undef libc_hidden_def
-#define libc_hidden_def(name)
-
-#define memcpy __memcpy_arm
-
-#endif
-
-#include "memcpy_impl.S"
diff --git a/sysdeps/arm/armv7/multiarch/memcpy.c b/sysdeps/arm/armv7/multiarch/memcpy.c
new file mode 100644
index 0000000000..02776b6fe6
--- /dev/null
+++ b/sysdeps/arm/armv7/multiarch/memcpy.c
@@ -0,0 +1,35 @@
+/* Multiple versions of memcpy.
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* For __ARM_NEON__ memcpy_neon.S defines memcpy directly and ifunc
+ is not used. */
+#if IS_IN (libc) && !defined (__ARM_NEON__)
+# define memcpy __redirect_memcpy
+# include <string.h>
+# undef memcpy
+
+# include <arm-ifunc.h>
+
+# define SYMBOL_NAME memcpy
+# include "ifunc-memcpy.h"
+
+arm_libc_ifunc_redirected (__redirect_memcpy, memcpy, IFUNC_SELECTOR);
+
+arm_libc_ifunc_hidden_def (__redirect_memcpy, memcpy);
+#endif
diff --git a/sysdeps/arm/armv7/multiarch/memcpy_arm.S b/sysdeps/arm/armv7/multiarch/memcpy_arm.S
new file mode 100644
index 0000000000..e4a9a68c42
--- /dev/null
+++ b/sysdeps/arm/armv7/multiarch/memcpy_arm.S
@@ -0,0 +1,10 @@
+#define memcpy __memcpy_arm
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(a)
+#include "memcpy_impl.S"
+
+/* These versions of memcpy are defined not to clobber any VFP or NEON
+ registers so they must always call the ARM variant of the memcpy code. */
+strong_alias (__memcpy_arm, __aeabi_memcpy)
+strong_alias (__memcpy_arm, __aeabi_memcpy4)
+strong_alias (__memcpy_arm, __aeabi_memcpy8)
diff --git a/sysdeps/arm/armv7/multiarch/memcpy_impl.S b/sysdeps/arm/armv7/multiarch/memcpy_impl.S
index a1f6266c88..2de172635c 100644
--- a/sysdeps/arm/armv7/multiarch/memcpy_impl.S
+++ b/sysdeps/arm/armv7/multiarch/memcpy_impl.S
@@ -1,5 +1,5 @@
/* NEON/VFP/ARM version of memcpy optimized for Cortex-A15.
- Copyright (C) 2013-2016 Free Software Foundation, Inc.
+ Copyright (C) 2013-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -226,71 +226,40 @@
#ifdef USE_VFP
.macro cpy_line_vfp vreg, base
- sfi_breg dst, \
- vstr \vreg, [\B, #\base]
- sfi_breg src, \
- vldr \vreg, [\B, #\base]
- sfi_breg dst, \
- vstr d0, [\B, #\base + 8]
- sfi_breg src, \
- vldr d0, [\B, #\base + 8]
- sfi_breg dst, \
- vstr d1, [\B, #\base + 16]
- sfi_breg src, \
- vldr d1, [\B, #\base + 16]
- sfi_breg dst, \
- vstr d2, [\B, #\base + 24]
- sfi_breg src, \
- vldr d2, [\B, #\base + 24]
- sfi_breg dst, \
- vstr \vreg, [\B, #\base + 32]
- sfi_breg src, \
- vldr \vreg, [\B, #\base + prefetch_lines * 64 - 32]
- sfi_breg dst, \
- vstr d0, [\B, #\base + 40]
- sfi_breg src, \
- vldr d0, [\B, #\base + 40]
- sfi_breg dst, \
- vstr d1, [\B, #\base + 48]
- sfi_breg src, \
- vldr d1, [\B, #\base + 48]
- sfi_breg dst, \
- vstr d2, [\B, #\base + 56]
- sfi_breg src, \
- vldr d2, [\B, #\base + 56]
+ vstr \vreg, [dst, #\base]
+ vldr \vreg, [src, #\base]
+ vstr d0, [dst, #\base + 8]
+ vldr d0, [src, #\base + 8]
+ vstr d1, [dst, #\base + 16]
+ vldr d1, [src, #\base + 16]
+ vstr d2, [dst, #\base + 24]
+ vldr d2, [src, #\base + 24]
+ vstr \vreg, [dst, #\base + 32]
+ vldr \vreg, [src, #\base + prefetch_lines * 64 - 32]
+ vstr d0, [dst, #\base + 40]
+ vldr d0, [src, #\base + 40]
+ vstr d1, [dst, #\base + 48]
+ vldr d1, [src, #\base + 48]
+ vstr d2, [dst, #\base + 56]
+ vldr d2, [src, #\base + 56]
.endm
.macro cpy_tail_vfp vreg, base
- sfi_breg dst, \
- vstr \vreg, [\B, #\base]
- sfi_breg src, \
- vldr \vreg, [\B, #\base]
- sfi_breg dst, \
- vstr d0, [\B, #\base + 8]
- sfi_breg src, \
- vldr d0, [\B, #\base + 8]
- sfi_breg dst, \
- vstr d1, [\B, #\base + 16]
- sfi_breg src, \
- vldr d1, [\B, #\base + 16]
- sfi_breg dst, \
- vstr d2, [\B, #\base + 24]
- sfi_breg src, \
- vldr d2, [\B, #\base + 24]
- sfi_breg dst, \
- vstr \vreg, [\B, #\base + 32]
- sfi_breg dst, \
- vstr d0, [\B, #\base + 40]
- sfi_breg src, \
- vldr d0, [\B, #\base + 40]
- sfi_breg dst, \
- vstr d1, [\B, #\base + 48]
- sfi_breg src, \
- vldr d1, [\B, #\base + 48]
- sfi_breg dst, \
- vstr d2, [\B, #\base + 56]
- sfi_breg src, \
- vldr d2, [\B, #\base + 56]
+ vstr \vreg, [dst, #\base]
+ vldr \vreg, [src, #\base]
+ vstr d0, [dst, #\base + 8]
+ vldr d0, [src, #\base + 8]
+ vstr d1, [dst, #\base + 16]
+ vldr d1, [src, #\base + 16]
+ vstr d2, [dst, #\base + 24]
+ vldr d2, [src, #\base + 24]
+ vstr \vreg, [dst, #\base + 32]
+ vstr d0, [dst, #\base + 40]
+ vldr d0, [src, #\base + 40]
+ vstr d1, [dst, #\base + 48]
+ vldr d1, [src, #\base + 48]
+ vstr d2, [dst, #\base + 56]
+ vldr d2, [src, #\base + 56]
.endm
#endif
@@ -307,7 +276,7 @@ ENTRY(memcpy)
#ifdef USE_NEON
/* These need an extra layer of macro just to work around a
bug in the assembler's parser when an operand starts with
- a {...}. http://sourceware.org/bugzilla/show_bug.cgi?id=15647
+ a {...}. https://sourceware.org/bugzilla/show_bug.cgi?id=15647
tracks that bug; it was not fixed as of binutils-2.23.2. */
.macro neon_load_d0 reg
vld1.8 {d0}, [\reg]!
@@ -316,26 +285,16 @@ ENTRY(memcpy)
vst1.8 {d0}, [\reg]!
.endm
- /* These are used by the NaCl sfi_breg macro. */
- .macro _sfi_breg_dmask_neon_load_d0 reg
- _sfi_dmask \reg
- .endm
- .macro _sfi_breg_dmask_neon_store_d0 reg
- _sfi_dmask \reg
- .endm
-
and tmp1, count, #0x38
.macro dispatch_step i
- sfi_breg src, neon_load_d0 \B
- sfi_breg dst, neon_store_d0 \B
+ neon_load_d0 src
+ neon_store_d0 dst
.endm
dispatch_7_dword
tst count, #4
- sfi_breg src, \
- ldrne tmp1, [\B], #4
- sfi_breg dst, \
- strne tmp1, [\B], #4
+ ldrne tmp1, [src], #4
+ strne tmp1, [dst], #4
#else
/* Copy up to 15 full words of data. May not be aligned. */
/* Cannot use VFP for unaligned data. */
@@ -344,23 +303,17 @@ ENTRY(memcpy)
add src, src, tmp1
/* Jump directly into the sequence below at the correct offset. */
.macro dispatch_step i
- sfi_breg src, \
- ldr tmp1, [\B, #-(\i * 4)]
- sfi_breg dst, \
- str tmp1, [\B, #-(\i * 4)]
+ ldr tmp1, [src, #-(\i * 4)]
+ str tmp1, [dst, #-(\i * 4)]
.endm
dispatch_15_word
#endif
lsls count, count, #31
- sfi_breg src, \
- ldrhcs tmp1, [\B], #2
- sfi_breg src, \
- ldrbne src, [\B] /* Src is dead, use as a scratch. */
- sfi_breg dst, \
- strhcs tmp1, [\B], #2
- sfi_breg dst, \
- strbne src, [\B]
+ ldrhcs tmp1, [src], #2
+ ldrbne src, [src] /* Src is dead, use as a scratch. */
+ strhcs tmp1, [dst], #2
+ strbne src, [dst]
bx lr
.Lcpy_not_short:
@@ -388,19 +341,13 @@ ENTRY(memcpy)
beq 1f
rsbs tmp2, tmp2, #0
sub count, count, tmp2, lsr #29
- sfi_breg src, \
- ldrmi tmp1, [\B], #4
- sfi_breg dst, \
- strmi tmp1, [\B], #4
+ ldrmi tmp1, [src], #4
+ strmi tmp1, [dst], #4
lsls tmp2, tmp2, #2
- sfi_breg src, \
- ldrhcs tmp1, [\B], #2
- sfi_breg src, \
- ldrbne tmp2, [\B], #1
- sfi_breg dst, \
- strhcs tmp1, [\B], #2
- sfi_breg dst, \
- strbne tmp2, [\B], #1
+ ldrhcs tmp1, [src], #2
+ ldrbne tmp2, [src], #1
+ strhcs tmp1, [dst], #2
+ strbne tmp2, [dst], #1
1:
subs tmp2, count, #64 /* Use tmp2 for count. */
@@ -412,40 +359,24 @@ ENTRY(memcpy)
.Lcpy_body_medium: /* Count in tmp2. */
#ifdef USE_VFP
1:
- sfi_breg src, \
- vldr d0, [\B, #0]
+ vldr d0, [src, #0]
subs tmp2, tmp2, #64
- sfi_breg src, \
- vldr d1, [\B, #8]
- sfi_breg dst, \
- vstr d0, [\B, #0]
- sfi_breg src, \
- vldr d0, [\B, #16]
- sfi_breg dst, \
- vstr d1, [\B, #8]
- sfi_breg src, \
- vldr d1, [\B, #24]
- sfi_breg dst, \
- vstr d0, [\B, #16]
- sfi_breg src, \
- vldr d0, [\B, #32]
- sfi_breg dst, \
- vstr d1, [\B, #24]
- sfi_breg src, \
- vldr d1, [\B, #40]
- sfi_breg dst, \
- vstr d0, [\B, #32]
- sfi_breg src, \
- vldr d0, [\B, #48]
- sfi_breg dst, \
- vstr d1, [\B, #40]
- sfi_breg src, \
- vldr d1, [\B, #56]
- sfi_breg dst, \
- vstr d0, [\B, #48]
+ vldr d1, [src, #8]
+ vstr d0, [dst, #0]
+ vldr d0, [src, #16]
+ vstr d1, [dst, #8]
+ vldr d1, [src, #24]
+ vstr d0, [dst, #16]
+ vldr d0, [src, #32]
+ vstr d1, [dst, #24]
+ vldr d1, [src, #40]
+ vstr d0, [dst, #32]
+ vldr d0, [src, #48]
+ vstr d1, [dst, #40]
+ vldr d1, [src, #56]
+ vstr d0, [dst, #48]
add src, src, #64
- sfi_breg dst, \
- vstr d1, [\B, #56]
+ vstr d1, [dst, #56]
add dst, dst, #64
bge 1b
tst tmp2, #0x3f
@@ -456,48 +387,30 @@ ENTRY(memcpy)
add dst, dst, tmp1
add src, src, tmp1
.macro dispatch_step i
- sfi_breg src, \
- vldr d0, [\B, #-(\i * 8)]
- sfi_breg dst, \
- vstr d0, [\B, #-(\i * 8)]
+ vldr d0, [src, #-(\i * 8)]
+ vstr d0, [dst, #-(\i * 8)]
.endm
dispatch_7_dword
#else
sub src, src, #8
sub dst, dst, #8
1:
- sfi_breg src, \
- ldrd A_l, A_h, [\B, #8]
- sfi_breg dst, \
- strd A_l, A_h, [\B, #8]
- sfi_breg src, \
- ldrd A_l, A_h, [\B, #16]
- sfi_breg dst, \
- strd A_l, A_h, [\B, #16]
- sfi_breg src, \
- ldrd A_l, A_h, [\B, #24]
- sfi_breg dst, \
- strd A_l, A_h, [\B, #24]
- sfi_breg src, \
- ldrd A_l, A_h, [\B, #32]
- sfi_breg dst, \
- strd A_l, A_h, [\B, #32]
- sfi_breg src, \
- ldrd A_l, A_h, [\B, #40]
- sfi_breg dst, \
- strd A_l, A_h, [\B, #40]
- sfi_breg src, \
- ldrd A_l, A_h, [\B, #48]
- sfi_breg dst, \
- strd A_l, A_h, [\B, #48]
- sfi_breg src, \
- ldrd A_l, A_h, [\B, #56]
- sfi_breg dst, \
- strd A_l, A_h, [\B, #56]
- sfi_breg src, \
- ldrd A_l, A_h, [\B, #64]!
- sfi_breg dst, \
- strd A_l, A_h, [\B, #64]!
+ ldrd A_l, A_h, [src, #8]
+ strd A_l, A_h, [dst, #8]
+ ldrd A_l, A_h, [src, #16]
+ strd A_l, A_h, [dst, #16]
+ ldrd A_l, A_h, [src, #24]
+ strd A_l, A_h, [dst, #24]
+ ldrd A_l, A_h, [src, #32]
+ strd A_l, A_h, [dst, #32]
+ ldrd A_l, A_h, [src, #40]
+ strd A_l, A_h, [dst, #40]
+ ldrd A_l, A_h, [src, #48]
+ strd A_l, A_h, [dst, #48]
+ ldrd A_l, A_h, [src, #56]
+ strd A_l, A_h, [dst, #56]
+ ldrd A_l, A_h, [src, #64]!
+ strd A_l, A_h, [dst, #64]!
subs tmp2, tmp2, #64
bge 1b
tst tmp2, #0x3f
@@ -524,28 +437,20 @@ ENTRY(memcpy)
add dst, dst, tmp1
add src, src, tmp1
.macro dispatch_step i
- sfi_breg src, \
- ldrd A_l, A_h, [\B, #-(\i * 8)]
- sfi_breg dst, \
- strd A_l, A_h, [\B, #-(\i * 8)]
+ ldrd A_l, A_h, [src, #-(\i * 8)]
+ strd A_l, A_h, [dst, #-(\i * 8)]
.endm
dispatch_7_dword
#endif
tst tmp2, #4
- sfi_breg src, \
- ldrne tmp1, [\B], #4
- sfi_breg dst, \
- strne tmp1, [\B], #4
+ ldrne tmp1, [src], #4
+ strne tmp1, [dst], #4
lsls tmp2, tmp2, #31 /* Count (tmp2) now dead. */
- sfi_breg src, \
- ldrhcs tmp1, [\B], #2
- sfi_breg src, \
- ldrbne tmp2, [\B]
- sfi_breg dst, \
- strhcs tmp1, [\B], #2
- sfi_breg dst, \
- strbne tmp2, [\B]
+ ldrhcs tmp1, [src], #2
+ ldrbne tmp2, [src]
+ strhcs tmp1, [dst], #2
+ strbne tmp2, [dst]
.Ldone:
ldr tmp2, [sp], #FRAME_SIZE
@@ -565,23 +470,15 @@ ENTRY(memcpy)
copy position into a register. This should act like a PLD
operation but we won't have to repeat the transfer. */
- sfi_breg src, \
- vldr d3, [\B, #0]
- sfi_breg src, \
- vldr d4, [\B, #64]
- sfi_breg src, \
- vldr d5, [\B, #128]
- sfi_breg src, \
- vldr d6, [\B, #192]
- sfi_breg src, \
- vldr d7, [\B, #256]
-
- sfi_breg src, \
- vldr d0, [\B, #8]
- sfi_breg src, \
- vldr d1, [\B, #16]
- sfi_breg src, \
- vldr d2, [\B, #24]
+ vldr d3, [src, #0]
+ vldr d4, [src, #64]
+ vldr d5, [src, #128]
+ vldr d6, [src, #192]
+ vldr d7, [src, #256]
+
+ vldr d0, [src, #8]
+ vldr d1, [src, #16]
+ vldr d2, [src, #24]
add src, src, #32
subs tmp2, tmp2, #prefetch_lines * 64 * 2
@@ -606,31 +503,19 @@ ENTRY(memcpy)
add src, src, #3 * 64
add dst, dst, #3 * 64
cpy_tail_vfp d6, 0
- sfi_breg dst, \
- vstr d7, [\B, #64]
- sfi_breg src, \
- vldr d7, [\B, #64]
- sfi_breg dst, \
- vstr d0, [\B, #64 + 8]
- sfi_breg src, \
- vldr d0, [\B, #64 + 8]
- sfi_breg dst, \
- vstr d1, [\B, #64 + 16]
- sfi_breg src, \
- vldr d1, [\B, #64 + 16]
- sfi_breg dst, \
- vstr d2, [\B, #64 + 24]
- sfi_breg src, \
- vldr d2, [\B, #64 + 24]
- sfi_breg dst, \
- vstr d7, [\B, #64 + 32]
+ vstr d7, [dst, #64]
+ vldr d7, [src, #64]
+ vstr d0, [dst, #64 + 8]
+ vldr d0, [src, #64 + 8]
+ vstr d1, [dst, #64 + 16]
+ vldr d1, [src, #64 + 16]
+ vstr d2, [dst, #64 + 24]
+ vldr d2, [src, #64 + 24]
+ vstr d7, [dst, #64 + 32]
add src, src, #96
- sfi_breg dst, \
- vstr d0, [\B, #64 + 40]
- sfi_breg dst, \
- vstr d1, [\B, #64 + 48]
- sfi_breg dst, \
- vstr d2, [\B, #64 + 56]
+ vstr d0, [dst, #64 + 40]
+ vstr d1, [dst, #64 + 48]
+ vstr d2, [dst, #64 + 56]
add dst, dst, #128
add tmp2, tmp2, #prefetch_lines * 64
b .Lcpy_body_medium
@@ -641,83 +526,59 @@ ENTRY(memcpy)
/* Pre-bias src and dst. */
sub src, src, #8
sub dst, dst, #8
- sfi_pld src, #8
- sfi_pld src, #72
+ pld [src, #8]
+ pld [src, #72]
subs tmp2, tmp2, #64
- sfi_pld src, #136
- sfi_breg src, \
- ldrd A_l, A_h, [\B, #8]
+ pld [src, #136]
+ ldrd A_l, A_h, [src, #8]
strd B_l, B_h, [sp, #8]
cfi_rel_offset (B_l, 8)
cfi_rel_offset (B_h, 12)
- sfi_breg src, \
- ldrd B_l, B_h, [\B, #16]
+ ldrd B_l, B_h, [src, #16]
strd C_l, C_h, [sp, #16]
cfi_rel_offset (C_l, 16)
cfi_rel_offset (C_h, 20)
- sfi_breg src, \
- ldrd C_l, C_h, [\B, #24]
+ ldrd C_l, C_h, [src, #24]
strd D_l, D_h, [sp, #24]
cfi_rel_offset (D_l, 24)
cfi_rel_offset (D_h, 28)
- sfi_pld src, #200
- sfi_breg src, \
- ldrd D_l, D_h, [\B, #32]!
+ pld [src, #200]
+ ldrd D_l, D_h, [src, #32]!
b 1f
.p2align 6
2:
- sfi_pld src, #232
- sfi_breg dst, \
- strd A_l, A_h, [\B, #40]
- sfi_breg src, \
- ldrd A_l, A_h, [\B, #40]
- sfi_breg dst, \
- strd B_l, B_h, [\B, #48]
- sfi_breg src, \
- ldrd B_l, B_h, [\B, #48]
- sfi_breg dst, \
- strd C_l, C_h, [\B, #56]
- sfi_breg src, \
- ldrd C_l, C_h, [\B, #56]
- sfi_breg dst, \
- strd D_l, D_h, [\B, #64]!
- sfi_breg src, \
- ldrd D_l, D_h, [\B, #64]!
+ pld [src, #232]
+ strd A_l, A_h, [dst, #40]
+ ldrd A_l, A_h, [src, #40]
+ strd B_l, B_h, [dst, #48]
+ ldrd B_l, B_h, [src, #48]
+ strd C_l, C_h, [dst, #56]
+ ldrd C_l, C_h, [src, #56]
+ strd D_l, D_h, [dst, #64]!
+ ldrd D_l, D_h, [src, #64]!
subs tmp2, tmp2, #64
1:
- sfi_breg dst, \
- strd A_l, A_h, [\B, #8]
- sfi_breg src, \
- ldrd A_l, A_h, [\B, #8]
- sfi_breg dst, \
- strd B_l, B_h, [\B, #16]
- sfi_breg src, \
- ldrd B_l, B_h, [\B, #16]
- sfi_breg dst, \
- strd C_l, C_h, [\B, #24]
- sfi_breg src, \
- ldrd C_l, C_h, [\B, #24]
- sfi_breg dst, \
- strd D_l, D_h, [\B, #32]
- sfi_breg src, \
- ldrd D_l, D_h, [\B, #32]
+ strd A_l, A_h, [dst, #8]
+ ldrd A_l, A_h, [src, #8]
+ strd B_l, B_h, [dst, #16]
+ ldrd B_l, B_h, [src, #16]
+ strd C_l, C_h, [dst, #24]
+ ldrd C_l, C_h, [src, #24]
+ strd D_l, D_h, [dst, #32]
+ ldrd D_l, D_h, [src, #32]
bcs 2b
/* Save the remaining bytes and restore the callee-saved regs. */
- sfi_breg dst, \
- strd A_l, A_h, [\B, #40]
+ strd A_l, A_h, [dst, #40]
add src, src, #40
- sfi_breg dst, \
- strd B_l, B_h, [\B, #48]
+ strd B_l, B_h, [dst, #48]
ldrd B_l, B_h, [sp, #8]
cfi_restore (B_l)
cfi_restore (B_h)
- sfi_breg dst, \
- strd C_l, C_h, [\B, #56]
+ strd C_l, C_h, [dst, #56]
ldrd C_l, C_h, [sp, #16]
cfi_restore (C_l)
cfi_restore (C_h)
- sfi_breg dst, \
- strd D_l, D_h, [\B, #64]
+ strd D_l, D_h, [dst, #64]
ldrd D_l, D_h, [sp, #24]
cfi_restore (D_l)
cfi_restore (D_h)
@@ -734,35 +595,29 @@ ENTRY(memcpy)
cfi_remember_state
.Lcpy_notaligned:
- sfi_pld src
- sfi_pld src, #64
+ pld [src, #0]
+ pld [src, #64]
/* There's at least 64 bytes to copy, but there is no mutual
alignment. */
/* Bring DST to 64-bit alignment. */
lsls tmp2, dst, #29
- sfi_pld src, #(2 * 64)
+ pld [src, #(2 * 64)]
beq 1f
rsbs tmp2, tmp2, #0
sub count, count, tmp2, lsr #29
- sfi_breg src, \
- ldrmi tmp1, [\B], #4
- sfi_breg dst, \
- strmi tmp1, [\B], #4
+ ldrmi tmp1, [src], #4
+ strmi tmp1, [dst], #4
lsls tmp2, tmp2, #2
- sfi_breg src, \
- ldrbne tmp1, [\B], #1
- sfi_breg src, \
- ldrhcs tmp2, [\B], #2
- sfi_breg dst, \
- strbne tmp1, [\B], #1
- sfi_breg dst, \
- strhcs tmp2, [\B], #2
+ ldrbne tmp1, [src], #1
+ ldrhcs tmp2, [src], #2
+ strbne tmp1, [dst], #1
+ strhcs tmp2, [dst], #2
1:
- sfi_pld src, #(3 * 64)
+ pld [src, #(3 * 64)]
subs count, count, #64
ldrmi tmp2, [sp], #FRAME_SIZE
bmi .Ltail63unaligned
- sfi_pld src, #(4 * 64)
+ pld [src, #(4 * 64)]
#ifdef USE_NEON
/* These need an extra layer of macro just to work around a
@@ -775,132 +630,88 @@ ENTRY(memcpy)
vst1.8 {\reglist}, [ALIGN (\basereg, 64)]!
.endm
- /* These are used by the NaCl sfi_breg macro. */
- .macro _sfi_breg_dmask_neon_load_multi reg
- _sfi_dmask \reg
- .endm
- .macro _sfi_breg_dmask_neon_store_multi reg
- _sfi_dmask \reg
- .endm
-
- sfi_breg src, neon_load_multi d0-d3, \B
- sfi_breg src, neon_load_multi d4-d7, \B
+ neon_load_multi d0-d3, src
+ neon_load_multi d4-d7, src
subs count, count, #64
bmi 2f
1:
- sfi_pld src, #(4 * 64)
- sfi_breg dst, neon_store_multi d0-d3, \B
- sfi_breg src, neon_load_multi d0-d3, \B
- sfi_breg dst, neon_store_multi d4-d7, \B
- sfi_breg src, neon_load_multi d4-d7, \B
+ pld [src, #(4 * 64)]
+ neon_store_multi d0-d3, dst
+ neon_load_multi d0-d3, src
+ neon_store_multi d4-d7, dst
+ neon_load_multi d4-d7, src
subs count, count, #64
bpl 1b
2:
- sfi_breg dst, neon_store_multi d0-d3, \B
- sfi_breg dst, neon_store_multi d4-d7, \B
+ neon_store_multi d0-d3, dst
+ neon_store_multi d4-d7, dst
ands count, count, #0x3f
#else
/* Use an SMS style loop to maximize the I/O bandwidth. */
sub src, src, #4
sub dst, dst, #8
subs tmp2, count, #64 /* Use tmp2 for count. */
- sfi_breg src, \
- ldr A_l, [\B, #4]
- sfi_breg src, \
- ldr A_h, [\B, #8]
+ ldr A_l, [src, #4]
+ ldr A_h, [src, #8]
strd B_l, B_h, [sp, #8]
cfi_rel_offset (B_l, 8)
cfi_rel_offset (B_h, 12)
- sfi_breg src, \
- ldr B_l, [\B, #12]
- sfi_breg src, \
- ldr B_h, [\B, #16]
+ ldr B_l, [src, #12]
+ ldr B_h, [src, #16]
strd C_l, C_h, [sp, #16]
cfi_rel_offset (C_l, 16)
cfi_rel_offset (C_h, 20)
- sfi_breg src, \
- ldr C_l, [\B, #20]
- sfi_breg src, \
- ldr C_h, [\B, #24]
+ ldr C_l, [src, #20]
+ ldr C_h, [src, #24]
strd D_l, D_h, [sp, #24]
cfi_rel_offset (D_l, 24)
cfi_rel_offset (D_h, 28)
- sfi_breg src, \
- ldr D_l, [\B, #28]
- sfi_breg src, \
- ldr D_h, [\B, #32]!
+ ldr D_l, [src, #28]
+ ldr D_h, [src, #32]!
b 1f
.p2align 6
2:
- sfi_pld src, #(5 * 64) - (32 - 4)
- sfi_breg dst, \
- strd A_l, A_h, [\B, #40]
- sfi_breg src, \
- ldr A_l, [\B, #36]
- sfi_breg src, \
- ldr A_h, [\B, #40]
- sfi_breg dst, \
- strd B_l, B_h, [\B, #48]
- sfi_breg src, \
- ldr B_l, [\B, #44]
- sfi_breg src, \
- ldr B_h, [\B, #48]
- sfi_breg dst, \
- strd C_l, C_h, [\B, #56]
- sfi_breg src, \
- ldr C_l, [\B, #52]
- sfi_breg src, \
- ldr C_h, [\B, #56]
- sfi_breg dst, \
- strd D_l, D_h, [\B, #64]!
- sfi_breg src, \
- ldr D_l, [\B, #60]
- sfi_breg src, \
- ldr D_h, [\B, #64]!
+ pld [src, #(5 * 64) - (32 - 4)]
+ strd A_l, A_h, [dst, #40]
+ ldr A_l, [src, #36]
+ ldr A_h, [src, #40]
+ strd B_l, B_h, [dst, #48]
+ ldr B_l, [src, #44]
+ ldr B_h, [src, #48]
+ strd C_l, C_h, [dst, #56]
+ ldr C_l, [src, #52]
+ ldr C_h, [src, #56]
+ strd D_l, D_h, [dst, #64]!
+ ldr D_l, [src, #60]
+ ldr D_h, [src, #64]!
subs tmp2, tmp2, #64
1:
- sfi_breg dst, \
- strd A_l, A_h, [\B, #8]
- sfi_breg src, \
- ldr A_l, [\B, #4]
- sfi_breg src, \
- ldr A_h, [\B, #8]
- sfi_breg dst, \
- strd B_l, B_h, [\B, #16]
- sfi_breg src, \
- ldr B_l, [\B, #12]
- sfi_breg src, \
- ldr B_h, [\B, #16]
- sfi_breg dst, \
- strd C_l, C_h, [\B, #24]
- sfi_breg src, \
- ldr C_l, [\B, #20]
- sfi_breg src, \
- ldr C_h, [\B, #24]
- sfi_breg dst, \
- strd D_l, D_h, [\B, #32]
- sfi_breg src, \
- ldr D_l, [\B, #28]
- sfi_breg src, \
- ldr D_h, [\B, #32]
+ strd A_l, A_h, [dst, #8]
+ ldr A_l, [src, #4]
+ ldr A_h, [src, #8]
+ strd B_l, B_h, [dst, #16]
+ ldr B_l, [src, #12]
+ ldr B_h, [src, #16]
+ strd C_l, C_h, [dst, #24]
+ ldr C_l, [src, #20]
+ ldr C_h, [src, #24]
+ strd D_l, D_h, [dst, #32]
+ ldr D_l, [src, #28]
+ ldr D_h, [src, #32]
bcs 2b
/* Save the remaining bytes and restore the callee-saved regs. */
- sfi_breg dst, \
- strd A_l, A_h, [\B, #40]
+ strd A_l, A_h, [dst, #40]
add src, src, #36
- sfi_breg dst, \
- strd B_l, B_h, [\B, #48]
+ strd B_l, B_h, [dst, #48]
ldrd B_l, B_h, [sp, #8]
cfi_restore (B_l)
cfi_restore (B_h)
- sfi_breg dst, \
- strd C_l, C_h, [\B, #56]
+ strd C_l, C_h, [dst, #56]
ldrd C_l, C_h, [sp, #16]
cfi_restore (C_l)
cfi_restore (C_h)
- sfi_breg dst, \
- strd D_l, D_h, [\B, #64]
+ strd D_l, D_h, [dst, #64]
ldrd D_l, D_h, [sp, #24]
cfi_restore (D_l)
cfi_restore (D_h)
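
The structure of the copy loops is untouched by the sfi_breg removal: they
remain software pipelined ("SMS style", as the comment in the unaligned path
calls it), issuing the loads for the next 64-byte block between the stores of
the current one and keeping PLD several cache lines ahead.  A rough C model
of that schedule, assuming at least one full block and with plain 64-bit
words standing in for the ldrd/strd and VFP register pairs:

#include <stddef.h>
#include <stdint.h>

/* Copy BLOCKS 64-byte blocks, software pipelined: the stores of
   block N are interleaved with the loads of block N+1 so the memory
   system always has reads in flight.  Assumes blocks >= 1.  */
static void
copy64_pipelined (uint64_t *dst, const uint64_t *src, size_t blocks)
{
  uint64_t a = src[0], b = src[1], c = src[2], d = src[3];
  uint64_t e = src[4], f = src[5], g = src[6], h = src[7];
  src += 8;
  while (--blocks)
    {
      __builtin_prefetch (src + 24);	/* Roughly 3 blocks ahead.  */
      dst[0] = a; a = src[0];
      dst[1] = b; b = src[1];
      dst[2] = c; c = src[2];
      dst[3] = d; d = src[3];
      dst[4] = e; e = src[4];
      dst[5] = f; f = src[5];
      dst[6] = g; g = src[6];
      dst[7] = h; h = src[7];
      src += 8;
      dst += 8;
    }
  dst[0] = a; dst[1] = b; dst[2] = c; dst[3] = d;
  dst[4] = e; dst[5] = f; dst[6] = g; dst[7] = h;
}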
diff --git a/sysdeps/arm/armv7/multiarch/memcpy_neon.S b/sysdeps/arm/armv7/multiarch/memcpy_neon.S
index e60d1cc0e1..1a8d8bbe9e 100644
--- a/sysdeps/arm/armv7/multiarch/memcpy_neon.S
+++ b/sysdeps/arm/armv7/multiarch/memcpy_neon.S
@@ -1,8 +1,8 @@
-#ifdef __ARM_NEON__
-/* Under __ARM_NEON__, this file defines memcpy directly. */
-libc_hidden_builtin_def (memcpy)
-#else
+/* For __ARM_NEON__ this file defines memcpy. */
+#ifndef __ARM_NEON__
# define memcpy __memcpy_neon
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(a)
#endif
#define MEMCPY_NEON
diff --git a/sysdeps/arm/armv7/multiarch/memcpy_vfp.S b/sysdeps/arm/armv7/multiarch/memcpy_vfp.S
index e008c041ed..d1e9ede439 100644
--- a/sysdeps/arm/armv7/multiarch/memcpy_vfp.S
+++ b/sysdeps/arm/armv7/multiarch/memcpy_vfp.S
@@ -1,7 +1,9 @@
-/* Under __ARM_NEON__, memcpy_neon.S defines memcpy directly
+/* Under __ARM_NEON__ memcpy_neon.S defines memcpy directly
and the __memcpy_vfp code will never be used. */
#ifndef __ARM_NEON__
# define MEMCPY_VFP
# define memcpy __memcpy_vfp
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(a)
# include "memcpy_impl.S"
#endif
diff --git a/sysdeps/arm/armv7/multiarch/rtld-memchr.S b/sysdeps/arm/armv7/multiarch/rtld-memchr.S
new file mode 100644
index 0000000000..ae8e5f04c4
--- /dev/null
+++ b/sysdeps/arm/armv7/multiarch/rtld-memchr.S
@@ -0,0 +1 @@
+#include <sysdeps/arm/armv6t2/memchr.S>
diff --git a/sysdeps/arm/armv7/multiarch/rtld-memcpy.S b/sysdeps/arm/armv7/multiarch/rtld-memcpy.S
new file mode 100644
index 0000000000..ca2387531b
--- /dev/null
+++ b/sysdeps/arm/armv7/multiarch/rtld-memcpy.S
@@ -0,0 +1 @@
+#include <sysdeps/arm/armv7/multiarch/memcpy_impl.S>
diff --git a/sysdeps/arm/armv7/strcmp.S b/sysdeps/arm/armv7/strcmp.S
index 5bcaf21ee2..2626fdf72e 100644
--- a/sysdeps/arm/armv7/strcmp.S
+++ b/sysdeps/arm/armv7/strcmp.S
@@ -1,5 +1,5 @@
/* strcmp implementation for ARMv7-A, optimized for Cortex-A15.
- Copyright (C) 2012-2016 Free Software Foundation, Inc.
+ Copyright (C) 2012-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -83,8 +83,6 @@
#define syndrome tmp2
-#ifndef NO_THUMB
-/* This code is best on Thumb. */
.thumb
/* In Thumb code we can't use MVN with a register shift, but we do have ORN. */
@@ -94,27 +92,6 @@
.macro apply_mask data_reg, mask_reg
orn \data_reg, \data_reg, \mask_reg
.endm
-#else
-/* In ARM code we don't have ORN, but we can use MVN with a register shift. */
-.macro prepare_mask mask_reg, nbits_reg
- mvn \mask_reg, const_m1, S2HI \nbits_reg
-.endm
-.macro apply_mask data_reg, mask_reg
- orr \data_reg, \data_reg, \mask_reg
-.endm
-
-/* These clobber the condition codes, which the real Thumb cbz/cbnz
- instructions do not. But it doesn't matter for any of the uses here. */
-.macro cbz reg, label
- cmp \reg, #0
- beq \label
-.endm
-.macro cbnz reg, label
- cmp \reg, #0
- bne \label
-.endm
-#endif
-
/* Macro to compute and return the result value for word-aligned
cases. */
@@ -178,10 +155,8 @@
#endif
ENTRY (strcmp)
#if STRCMP_PRECHECK == 1
- sfi_breg src1, \
- ldrb r2, [\B]
- sfi_breg src2, \
- ldrb r3, [\B]
+ ldrb r2, [src1]
+ ldrb r3, [src2]
cmp r2, #1
it cs
cmpcs r2, r3
@@ -211,11 +186,9 @@ ENTRY (strcmp)
and tmp2, tmp1, #3
bic src2, src2, #7
lsl tmp2, tmp2, #3 /* Bytes -> bits. */
- sfi_breg src1, \
- ldrd data1a, data1b, [\B], #16
+ ldrd data1a, data1b, [src1], #16
tst tmp1, #4
- sfi_breg src2, \
- ldrd data2a, data2b, [\B], #16
+ ldrd data2a, data2b, [src2], #16
prepare_mask tmp1, tmp2
apply_mask data1a, tmp1
apply_mask data2a, tmp1
@@ -231,10 +204,8 @@ ENTRY (strcmp)
.p2align 5,,12 /* Don't start in the tail bytes of a cache line. */
.p2align 2 /* Always word aligned. */
.Lloop_aligned8:
- sfi_breg src1, \
- ldrd data1a, data1b, [\B], #16
- sfi_breg src2, \
- ldrd data2a, data2b, [\B], #16
+ ldrd data1a, data1b, [src1], #16
+ ldrd data2a, data2b, [src2], #16
.Lstart_realigned8:
uadd8 syndrome_b, data1a, const_m1 /* Only want GE bits, */
eor syndrome_a, data1a, data2a
@@ -245,10 +216,8 @@ ENTRY (strcmp)
sel syndrome_b, syndrome_b, const_m1
cbnz syndrome_b, .Ldiff_in_b
- sfi_breg src1, \
- ldrd data1a, data1b, [\B, #-8]
- sfi_breg src2, \
- ldrd data2a, data2b, [\B, #-8]
+ ldrd data1a, data1b, [src1, #-8]
+ ldrd data2a, data2b, [src2, #-8]
uadd8 syndrome_b, data1a, const_m1 /* Only want GE bits, */
eor syndrome_a, data1a, data2a
sel syndrome_a, syndrome_a, const_m1
@@ -279,19 +248,15 @@ ENTRY (strcmp)
/* Unrolled by a factor of 2, to reduce the number of post-increment
operations. */
.Lloop_aligned4:
- sfi_breg src1, \
- ldr data1, [\B], #8
- sfi_breg src2, \
- ldr data2, [\B], #8
+ ldr data1, [src1], #8
+ ldr data2, [src2], #8
.Lstart_realigned4:
uadd8 syndrome, data1, const_m1 /* Only need GE bits. */
eor syndrome, data1, data2
sel syndrome, syndrome, const_m1
cbnz syndrome, .Laligned4_done
- sfi_breg src1, \
- ldr data1, [\B, #-4]
- sfi_breg src2, \
- ldr data2, [\B, #-4]
+ ldr data1, [src1, #-4]
+ ldr data2, [src2, #-4]
uadd8 syndrome, data1, const_m1
eor syndrome, data1, data2
sel syndrome, syndrome, const_m1
@@ -307,11 +272,9 @@ ENTRY (strcmp)
masking off the unwanted loaded data to prevent a difference. */
lsl tmp1, tmp1, #3 /* Bytes -> bits. */
bic src1, src1, #3
- sfi_breg src1, \
- ldr data1, [\B], #8
+ ldr data1, [src1], #8
bic src2, src2, #3
- sfi_breg src2, \
- ldr data2, [\B], #8
+ ldr data2, [src2], #8
prepare_mask tmp1, tmp1
apply_mask data1, tmp1
@@ -324,30 +287,26 @@ ENTRY (strcmp)
sub src2, src2, tmp1
bic src1, src1, #3
lsls tmp1, tmp1, #31
- sfi_breg src1, \
- ldr data1, [\B], #4
+ ldr data1, [src1], #4
beq .Laligned_m2
bcs .Laligned_m1
#if STRCMP_PRECHECK == 0
- sfi_breg src2, \
- ldrb data2, [\B, #1]
+ ldrb data2, [src2, #1]
uxtb tmp1, data1, ror #BYTE1_OFFSET
subs tmp1, tmp1, data2
bne .Lmisaligned_exit
cbz data2, .Lmisaligned_exit
.Laligned_m2:
- sfi_breg src2, \
- ldrb data2, [\B, #2]
+ ldrb data2, [src2, #2]
uxtb tmp1, data1, ror #BYTE2_OFFSET
subs tmp1, tmp1, data2
bne .Lmisaligned_exit
cbz data2, .Lmisaligned_exit
.Laligned_m1:
- sfi_breg src2, \
- ldrb data2, [\B, #3]
+ ldrb data2, [src2, #3]
uxtb tmp1, data1, ror #BYTE3_OFFSET
subs tmp1, tmp1, data2
bne .Lmisaligned_exit
@@ -356,16 +315,14 @@ ENTRY (strcmp)
#else /* STRCMP_PRECHECK */
/* If we've done the pre-check, then we don't need to check the
first byte again here. */
- sfi_breg src2, \
- ldrb data2, [\B, #2]
+ ldrb data2, [src2, #2]
uxtb tmp1, data1, ror #BYTE2_OFFSET
subs tmp1, tmp1, data2
bne .Lmisaligned_exit
cbz data2, .Lmisaligned_exit
.Laligned_m2:
- sfi_breg src2, \
- ldrb data2, [\B, #3]
+ ldrb data2, [src2, #3]
uxtb tmp1, data1, ror #BYTE3_OFFSET
subs tmp1, tmp1, data2
bne .Lmisaligned_exit
@@ -391,13 +348,11 @@ ENTRY (strcmp)
cfi_restore_state
/* src1 is word aligned, but src2 has no common alignment
with it. */
- sfi_breg src1, \
- ldr data1, [\B], #4
+ ldr data1, [src1], #4
lsls tmp1, src2, #31 /* C=src2[1], Z=src2[0]. */
bic src2, src2, #3
- sfi_breg src2, \
- ldr data2, [\B], #4
+ ldr data2, [src2], #4
bhi .Loverlap1 /* C=1, Z=0 => src2[1:0] = 0b11. */
bcs .Loverlap2 /* C=1, Z=1 => src2[1:0] = 0b10. */
@@ -409,13 +364,11 @@ ENTRY (strcmp)
sel syndrome, syndrome, const_m1
bne 4f
cbnz syndrome, 5f
- sfi_breg src2, \
- ldr data2, [\B], #4
+ ldr data2, [src2], #4
eor tmp1, tmp1, data1
cmp tmp1, data2, S2HI #24
bne 6f
- sfi_breg src1, \
- ldr data1, [\B], #4
+ ldr data1, [src1], #4
b .Loverlap3
4:
S2LO data2, data2, #8
@@ -427,8 +380,7 @@ ENTRY (strcmp)
/* We can only get here if the MSB of data1 contains 0, so
fast-path the exit. */
- sfi_breg src2, \
- ldrb result, [\B]
+ ldrb result, [src2]
ldrd r4, r5, [sp], #16
cfi_remember_state
cfi_def_cfa_offset (0)
@@ -454,13 +406,11 @@ ENTRY (strcmp)
sel syndrome, syndrome, const_m1
bne 4f
cbnz syndrome, 5f
- sfi_breg src2, \
- ldr data2, [\B], #4
+ ldr data2, [src2], #4
eor tmp1, tmp1, data1
cmp tmp1, data2, S2HI #16
bne 6f
- sfi_breg src1, \
- ldr data1, [\B], #4
+ ldr data1, [src1], #4
b .Loverlap2
4:
S2LO data2, data2, #16
@@ -469,8 +419,7 @@ ENTRY (strcmp)
ands syndrome, syndrome, const_m1, S2LO #16
bne .Lstrcmp_done_equal
- sfi_breg src2, \
- ldrh data2, [\B]
+ ldrh data2, [src2]
S2LO data1, data1, #16
#ifdef __ARM_BIG_ENDIAN
lsl data2, data2, #16
@@ -490,13 +439,11 @@ ENTRY (strcmp)
sel syndrome, syndrome, const_m1
bne 4f
cbnz syndrome, 5f
- sfi_breg src2, \
- ldr data2, [\B], #4
+ ldr data2, [src2], #4
eor tmp1, tmp1, data1
cmp tmp1, data2, S2HI #8
bne 6f
- sfi_breg src1, \
- ldr data1, [\B], #4
+ ldr data1, [src1], #4
b .Loverlap1
4:
S2LO data2, data2, #24
@@ -504,8 +451,7 @@ ENTRY (strcmp)
5:
tst syndrome, #LSB
bne .Lstrcmp_done_equal
- sfi_breg src2, \
- ldr data2, [\B]
+ ldr data2, [src2]
6:
S2LO data1, data1, #8
bic data2, data2, #MSB
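
For reference, the uadd8/sel pairs used throughout this file compute, per
32-bit word, "does this word contain a NUL byte or a byte difference": uadd8
of data1 with 0xffffffff (const_m1) sets the per-byte GE flag exactly where
the data1 byte is nonzero, and sel then forces 0xff into every byte position
whose source byte was zero, so the syndrome is nonzero iff the comparison
must stop.  A portable analogue using the classic zero-byte bit trick (a
sketch, not the code the library uses):

#include <stdint.h>

/* Nonzero iff the two words differ or W1 contains a NUL byte, the
   same stop condition the uadd8/sel syndrome encodes.  */
static int
word_has_nul_or_diff (uint32_t w1, uint32_t w2)
{
  uint32_t diff = w1 ^ w2;
  uint32_t nul = (w1 - 0x01010101u) & ~w1 & 0x80808080u;
  return (diff | nul) != 0;
}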