16 files changed, 599 insertions, 556 deletions
diff --git a/sysdeps/arm/armv7/multiarch/Makefile b/sysdeps/arm/armv7/multiarch/Makefile
index e834cc937f..6e5851f897 100644
--- a/sysdeps/arm/armv7/multiarch/Makefile
+++ b/sysdeps/arm/armv7/multiarch/Makefile
@@ -1,3 +1,4 @@
 ifeq ($(subdir),string)
-sysdep_routines += memcpy_neon memcpy_vfp
+sysdep_routines += memcpy_neon memcpy_vfp memchr_neon memcpy_arm \
+		   memchr_noneon
 endif
diff --git a/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c b/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c
index d7088f2a22..48e43da66e 100644
--- a/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c
+++ b/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c
@@ -1,5 +1,5 @@
 /* Enumerate available IFUNC implementations of a function.  ARM version.
-   Copyright (C) 2013-2016 Free Software Foundation, Inc.
+   Copyright (C) 2013-2018 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -34,6 +34,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   bool use_neon = true;
 #ifdef __ARM_NEON__
 # define __memcpy_neon	memcpy
+# define __memchr_neon	memchr
 #else
   use_neon = (GLRO(dl_hwcap) & HWCAP_ARM_NEON) != 0;
 #endif
@@ -52,5 +53,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 #endif
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_arm));
 
+  IFUNC_IMPL (i, name, memchr,
+	      IFUNC_IMPL_ADD (array, i, memchr, use_neon, __memchr_neon)
+	      IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_noneon));
+
   return i;
 }
diff --git a/sysdeps/arm/armv7/multiarch/ifunc-memchr.h b/sysdeps/arm/armv7/multiarch/ifunc-memchr.h
new file mode 100644
index 0000000000..75495824f4
--- /dev/null
+++ b/sysdeps/arm/armv7/multiarch/ifunc-memchr.h
@@ -0,0 +1,28 @@
+/* Common definition for memchr resolver.
+   Copyright (C) 2017-2018 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+__typeof (REDIRECT_NAME) OPTIMIZE (neon) attribute_hidden;
+__typeof (REDIRECT_NAME) OPTIMIZE (noneon) attribute_hidden;
+
+/* Return the NEON variant when HWCAP advertises NEON, else the fallback.  */
+static inline void *
+IFUNC_SELECTOR (int hwcap)
+{
+  const int has_neon = (hwcap & HWCAP_ARM_NEON) != 0;
+  return has_neon ? OPTIMIZE (neon) : OPTIMIZE (noneon);
+}
diff --git a/sysdeps/arm/armv7/multiarch/ifunc-memcpy.h b/sysdeps/arm/armv7/multiarch/ifunc-memcpy.h
new file mode 100644
index 0000000000..7e6f73ff4d
--- /dev/null
+++ b/sysdeps/arm/armv7/multiarch/ifunc-memcpy.h
@@ -0,0 +1,37 @@
+/* Common definition for memcpy resolver.
+   Copyright (C) 2017-2018 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+__typeof (REDIRECT_NAME) OPTIMIZE (neon) attribute_hidden;
+__typeof (REDIRECT_NAME) OPTIMIZE (vfp) attribute_hidden;
+#ifdef __SOFTFP__
+__typeof (REDIRECT_NAME) OPTIMIZE (arm) attribute_hidden;
+#endif
+
+/* Choose by HWCAP: NEON first, then VFP, then (soft-float only) ARM.  */
+static inline void *
+IFUNC_SELECTOR (int hwcap)
+{
+  if ((hwcap & HWCAP_ARM_NEON) != 0)
+    return OPTIMIZE (neon);
+#ifdef __SOFTFP__
+  /* Only soft-float builds check for VFP at run time.  */
+  if ((hwcap & HWCAP_ARM_VFP) == 0)
+    return OPTIMIZE (arm);
+#endif
+  return OPTIMIZE (vfp);
+}
diff --git a/sysdeps/arm/armv7/multiarch/memchr.c b/sysdeps/arm/armv7/multiarch/memchr.c
new file mode 100644
index 0000000000..ff1cc5d203
--- /dev/null
+++ b/sysdeps/arm/armv7/multiarch/memchr.c
@@ -0,0 +1,35 @@
+/* Multiple versions of memchr.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2017-2018 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* For __ARM_NEON__ memchr_neon.S defines memchr directly and ifunc
+   is not used.  */
+#if IS_IN (libc) && !defined (__ARM_NEON__)
+# define memchr __redirect_memchr	/* <string.h> then declares this name.  */
+# include <string.h>
+# undef memchr
+
+# include <arm-ifunc.h>	/* Presumably supplies arm_libc_ifunc_*; confirm.  */
+
+# define SYMBOL_NAME memchr	/* Presumably keys OPTIMIZE/REDIRECT_NAME.  */
+# include "ifunc-memchr.h"	/* Provides the IFUNC_SELECTOR body.  */
+
+arm_libc_ifunc_redirected (__redirect_memchr, memchr, IFUNC_SELECTOR);
+
+arm_libc_ifunc_hidden_def (__redirect_memchr, memchr);
+#endif /* IS_IN (libc) && !__ARM_NEON__  */
diff --git a/sysdeps/arm/armv7/multiarch/memchr_neon.S b/sysdeps/arm/armv7/multiarch/memchr_neon.S
new file mode 100644
index 0000000000..6fbf9b8898
--- /dev/null
+++ b/sysdeps/arm/armv7/multiarch/memchr_neon.S
@@ -0,0 +1,202 @@
+/* memchr implemented using NEON.
+   Copyright (C) 2011-2018 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* For __ARM_NEON__ this file defines memchr.  */
+#ifndef __ARM_NEON__
+# define memchr __memchr_neon
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(a)
+#endif
+
+	.arch	armv7-a
+	.fpu	neon
+
+
+/* Arguments */
+#define srcin		r0
+#define chrin		r1
+#define cntin		r2
+
+/* Retval */
+#define result		r0	/* Live range does not overlap with srcin */
+
+/* Working registers */
+#define src		r1	/* Live range does not overlap with chrin */
+#define tmp		r3
+#define synd		r0	/* No overlap with srcin or result */
+#define soff		r12
+
+/* Working NEON registers */
+#define vrepchr		q0
+#define vdata0		q1
+#define vdata0_0	d2	/* Lower half of vdata0 */
+#define vdata0_1	d3	/* Upper half of vdata0 */
+#define vdata1		q2
+#define vdata1_0	d4	/* Lower half of vdata1 */
+#define vdata1_1	d5	/* Upper half of vdata1 */
+#define vrepmask	q3
+#define vrepmask0	d6
+#define vrepmask1	d7
+#define vend		q4
+#define vend0		d8
+#define vend1		d9
+
+/*
+ * Core algorithm:
+ *
+ * For each 32-byte chunk we calculate a 32-bit syndrome value, with one bit per
+ * byte. Each bit is set if the relevant byte matched the requested character
+ * and cleared otherwise. Since the bits in the syndrome reflect exactly the
+ * order in which things occur in the original string, counting trailing zeros
+ * identifies exactly which byte has matched.
+ */
+
+	.thumb_func	/* NOTE(review): cbz/cbnz below are Thumb-only; confirm ARM-mode builds.  */
+	.p2align 4,,15
+
+ENTRY(memchr)
+	/* Use a simple loop if there are fewer than 8 bytes to search.  */
+	cmp	cntin, #7
+	bhi	.Llargestr
+	and	chrin, chrin, #0xff	/* Byte loop compares the low byte only.  */
+
+.Lsmallstr:
+	subs	cntin, cntin, #1
+	blo	.Lnotfound	/* Return not found if reached end.  */
+	ldrb	tmp, [srcin], #1
+	cmp	tmp, chrin
+	bne	.Lsmallstr	/* Loop again if not found.  */
+	/* Otherwise fixup address and return.  */
+	sub	result, srcin, #1
+	bx	lr
+
+
+.Llargestr:
+	vdup.8	vrepchr, chrin	/* Duplicate char across all lanes. */
+	/*
+	 * Magic constant 0x8040201008040201 allows us to identify which lane
+	 * matches the requested byte.
+	 */
+	movw	tmp, #0x0201
+	movt	tmp, #0x0804	/* tmp = 0x08040201 (low word).  */
+	lsl	soff, tmp, #4	/* soff = 0x80402010 (high word).  */
+	vmov	vrepmask0, tmp, soff
+	vmov	vrepmask1, tmp, soff
+	/* Work with aligned 32-byte chunks */
+	bic	src, srcin, #31	/* src (r1) replaces chrin, dead after vdup.  */
+	ands	soff, srcin, #31	/* soff = offset of srcin in its chunk.  */
+	beq	.Lloopintro	/* Go straight to main loop if it's aligned. */
+
+	/*
+	 * Input string is not 32-byte aligned. We calculate the syndrome
+	 * value for the aligned 32 bytes block containing the first bytes
+	 * and mask the irrelevant part.
+	 */
+	vld1.8		{vdata0, vdata1}, [src:256]!
+	sub		tmp, soff, #32
+	adds		cntin, cntin, tmp	/* cntin -= 32 - soff; flags feed bls below.  */
+	vceq.i8		vdata0, vdata0, vrepchr
+	vceq.i8		vdata1, vdata1, vrepchr
+	vand		vdata0, vdata0, vrepmask
+	vand		vdata1, vdata1, vrepmask
+	vpadd.i8	vdata0_0, vdata0_0, vdata0_1
+	vpadd.i8	vdata1_0, vdata1_0, vdata1_1
+	vpadd.i8	vdata0_0, vdata0_0, vdata1_0
+	vpadd.i8	vdata0_0, vdata0_0, vdata0_0
+	vmov		synd, vdata0_0[0]	/* synd (r0) overwrites srcin.  */
+
+	/* Clear the soff lower bits */
+	lsr		synd, synd, soff
+	lsl		synd, synd, soff
+	/* The first block can also be the last */
+	bls		.Lmasklast
+	/* Have we found something already? */
+	cbnz		synd, .Ltail
+
+
+.Lloopintro:
+	vpush	{vend}	/* Saved here, restored at .Lend.  */
+	/* 264/265 correspond to d8/d9 for q4 */
+	cfi_adjust_cfa_offset (16)
+	cfi_rel_offset (264, 0)
+	cfi_rel_offset (265, 8)
+	.p2align 3,,7
+.Lloop:
+	vld1.8		{vdata0, vdata1}, [src:256]!
+	subs		cntin, cntin, #32
+	vceq.i8		vdata0, vdata0, vrepchr
+	vceq.i8		vdata1, vdata1, vrepchr
+	/* If we're out of data we finish regardless of the result. */
+	bls		.Lend
+	/* Use a fast check for the termination condition. */
+	vorr		vend, vdata0, vdata1
+	vorr		vend0, vend0, vend1
+	vmov		synd, tmp, vend0	/* tmp is scratch for the high word.  */
+	orrs		synd, synd, tmp
+	/* We're not out of data, loop if we haven't found the character. */
+	beq		.Lloop
+
+.Lend:
+	vpop		{vend}
+	cfi_adjust_cfa_offset (-16)
+	cfi_restore (264)
+	cfi_restore (265)
+
+	/* Termination condition found, let's calculate the syndrome value. */
+	vand		vdata0, vdata0, vrepmask
+	vand		vdata1, vdata1, vrepmask
+	vpadd.i8	vdata0_0, vdata0_0, vdata0_1
+	vpadd.i8	vdata1_0, vdata1_0, vdata1_1
+	vpadd.i8	vdata0_0, vdata0_0, vdata1_0
+	vpadd.i8	vdata0_0, vdata0_0, vdata0_0
+	vmov		synd, vdata0_0[0]
+	cbz		synd, .Lnotfound
+	bhi		.Ltail	/* Uses the condition code from
+				   subs cntin, cntin, #32 above.  */
+
+
+.Lmasklast:
+	/* Clear the (-cntin) upper bits to avoid out-of-bounds matches. */
+	neg	cntin, cntin
+	lsl	synd, synd, cntin
+	lsrs	synd, synd, cntin
+	it	eq
+	moveq	src, #0	/* If no match, set src to 0 so the retval is 0. */
+
+
+.Ltail:
+	/* Count the trailing zeros using bit reversing */
+	rbit	synd, synd
+	/* Compensate the last post-increment */
+	sub	src, src, #32
+	/* Count the leading zeros */
+	clz	synd, synd
+	/* Compute the potential result and return */
+	add	result, src, synd
+	bx	lr
+
+
+.Lnotfound:
+	/* Set result to NULL if not found and return */
+	mov	result, #0
+	bx	lr
+
+END(memchr)
+libc_hidden_builtin_def (memchr)
diff --git a/sysdeps/arm/armv7/multiarch/memchr_noneon.S b/sysdeps/arm/armv7/multiarch/memchr_noneon.S
new file mode 100644
index 0000000000..b1fb54018d
--- /dev/null
+++ b/sysdeps/arm/armv7/multiarch/memchr_noneon.S
@@ -0,0 +1,5 @@
+#define memchr __memchr_noneon	/* Name given to the included memchr.  */
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)	/* Expands to nothing here.  */
+/* Reuse the armv6t2 implementation as the non-NEON variant.  */
+#include <sysdeps/arm/armv6t2/memchr.S>
diff --git a/sysdeps/arm/armv7/multiarch/memcpy.S b/sysdeps/arm/armv7/multiarch/memcpy.S
deleted file mode 100644
index 01ba9e5733..0000000000
--- a/sysdeps/arm/armv7/multiarch/memcpy.S
+++ /dev/null
@@ -1,76 +0,0 @@
-/* Multiple versions of memcpy
-   All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2013-2016 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-/* Thumb requires excess IT instructions here.  */
-#define NO_THUMB
-#include <sysdep.h>
-#include <rtld-global-offsets.h>
-
-#if IS_IN (libc)
-/* Under __ARM_NEON__, memcpy_neon.S defines the name memcpy.  */
-# ifndef __ARM_NEON__
-	.text
-ENTRY(memcpy)
-	.type	memcpy, %gnu_indirect_function
-# ifdef __SOFTFP__
-	ldr	r1, .Lmemcpy_arm
-	tst	r0, #HWCAP_ARM_VFP
-	ldrne	r1, .Lmemcpy_vfp
-# else
-	ldr	r1, .Lmemcpy_vfp
-# endif
-	tst	r0, #HWCAP_ARM_NEON
-	ldrne	r1, .Lmemcpy_neon
-1:
-	add	r0, r1, pc
-	DO_RET(lr)
-
-# ifdef __SOFTFP__
-.Lmemcpy_arm:
-	.long	C_SYMBOL_NAME(__memcpy_arm) - 1b - PC_OFS
-# endif
-.Lmemcpy_neon:
-	.long	C_SYMBOL_NAME(__memcpy_neon) - 1b - PC_OFS
-.Lmemcpy_vfp:
-	.long	C_SYMBOL_NAME(__memcpy_vfp) - 1b - PC_OFS
-
-END(memcpy)
-
-libc_hidden_builtin_def (memcpy)
-#endif  /* Not __ARM_NEON__.  */
-
-/* These versions of memcpy are defined not to clobber any VFP or NEON
-   registers so they must always call the ARM variant of the memcpy code.  */
-strong_alias (__memcpy_arm, __aeabi_memcpy)
-strong_alias (__memcpy_arm, __aeabi_memcpy4)
-strong_alias (__memcpy_arm, __aeabi_memcpy8)
-libc_hidden_def (__memcpy_arm)
-
-#undef libc_hidden_builtin_def
-#define libc_hidden_builtin_def(name)
-#undef weak_alias
-#define weak_alias(x, y)
-#undef libc_hidden_def
-#define libc_hidden_def(name)
-
-#define memcpy __memcpy_arm
-
-#endif
-
-#include "memcpy_impl.S"
diff --git a/sysdeps/arm/armv7/multiarch/memcpy.c b/sysdeps/arm/armv7/multiarch/memcpy.c
new file mode 100644
index 0000000000..02776b6fe6
--- /dev/null
+++ b/sysdeps/arm/armv7/multiarch/memcpy.c
@@ -0,0 +1,35 @@
+/* Multiple versions of memcpy.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2017-2018 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* For __ARM_NEON__ memcpy_neon.S defines memcpy directly and ifunc
+   is not used.  */
+#if IS_IN (libc) && !defined (__ARM_NEON__)
+# define memcpy __redirect_memcpy	/* <string.h> then declares this name.  */
+# include <string.h>
+# undef memcpy
+
+# include <arm-ifunc.h>	/* Presumably supplies arm_libc_ifunc_*; confirm.  */
+
+# define SYMBOL_NAME memcpy	/* Presumably keys OPTIMIZE/REDIRECT_NAME.  */
+# include "ifunc-memcpy.h"	/* Provides the IFUNC_SELECTOR body.  */
+
+arm_libc_ifunc_redirected (__redirect_memcpy, memcpy, IFUNC_SELECTOR);
+
+arm_libc_ifunc_hidden_def (__redirect_memcpy, memcpy);
+#endif /* IS_IN (libc) && !__ARM_NEON__  */
diff --git a/sysdeps/arm/armv7/multiarch/memcpy_arm.S b/sysdeps/arm/armv7/multiarch/memcpy_arm.S
new file mode 100644
index 0000000000..e4a9a68c42
--- /dev/null
+++ b/sysdeps/arm/armv7/multiarch/memcpy_arm.S
@@ -0,0 +1,10 @@
+#define memcpy __memcpy_arm	/* Name given to the memcpy_impl.S entry.  */
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(a)	/* Expands to nothing here.  */
+#include "memcpy_impl.S"
+
+/* These versions of memcpy are defined not to clobber any VFP or NEON
+   registers so they must always call the ARM variant of the memcpy code.  */
+strong_alias (__memcpy_arm, __aeabi_memcpy)
+strong_alias (__memcpy_arm, __aeabi_memcpy4)
+strong_alias (__memcpy_arm, __aeabi_memcpy8)
diff --git a/sysdeps/arm/armv7/multiarch/memcpy_impl.S b/sysdeps/arm/armv7/multiarch/memcpy_impl.S
index a1f6266c88..2de172635c 100644
--- a/sysdeps/arm/armv7/multiarch/memcpy_impl.S
+++ b/sysdeps/arm/armv7/multiarch/memcpy_impl.S
@@ -1,5 +1,5 @@
 /* NEON/VFP/ARM version of memcpy optimized for Cortex-A15.
-   Copyright (C) 2013-2016 Free Software Foundation, Inc.
+   Copyright (C) 2013-2018 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -226,71 +226,40 @@
 
 #ifdef USE_VFP
 	.macro	cpy_line_vfp vreg, base
-	sfi_breg dst, \
-	vstr	\vreg, [\B, #\base]
-	sfi_breg src, \
-	vldr	\vreg, [\B, #\base]
-	sfi_breg dst, \
-	vstr	d0, [\B, #\base + 8]
-	sfi_breg src, \
-	vldr	d0, [\B, #\base + 8]
-	sfi_breg dst, \
-	vstr	d1, [\B, #\base + 16]
-	sfi_breg src, \
-	vldr	d1, [\B, #\base + 16]
-	sfi_breg dst, \
-	vstr	d2, [\B, #\base + 24]
-	sfi_breg src, \
-	vldr	d2, [\B, #\base + 24]
-	sfi_breg dst, \
-	vstr	\vreg, [\B, #\base + 32]
-	sfi_breg src, \
-	vldr	\vreg, [\B, #\base + prefetch_lines * 64 - 32]
-	sfi_breg dst, \
-	vstr	d0, [\B, #\base + 40]
-	sfi_breg src, \
-	vldr	d0, [\B, #\base + 40]
-	sfi_breg dst, \
-	vstr	d1, [\B, #\base + 48]
-	sfi_breg src, \
-	vldr	d1, [\B, #\base + 48]
-	sfi_breg dst, \
-	vstr	d2, [\B, #\base + 56]
-	sfi_breg src, \
-	vldr	d2, [\B, #\base + 56]
+	vstr	\vreg, [dst, #\base]
+	vldr	\vreg, [src, #\base]
+	vstr	d0, [dst, #\base + 8]
+	vldr	d0, [src, #\base + 8]
+	vstr	d1, [dst, #\base + 16]
+	vldr	d1, [src, #\base + 16]
+	vstr	d2, [dst, #\base + 24]
+	vldr	d2, [src, #\base + 24]
+	vstr	\vreg, [dst, #\base + 32]
+	vldr	\vreg, [src, #\base + prefetch_lines * 64 - 32]
+	vstr	d0, [dst, #\base + 40]
+	vldr	d0, [src, #\base + 40]
+	vstr	d1, [dst, #\base + 48]
+	vldr	d1, [src, #\base + 48]
+	vstr	d2, [dst, #\base + 56]
+	vldr	d2, [src, #\base + 56]
 	.endm
 
 	.macro	cpy_tail_vfp vreg, base
-	sfi_breg dst, \
-	vstr	\vreg, [\B, #\base]
-	sfi_breg src, \
-	vldr	\vreg, [\B, #\base]
-	sfi_breg dst, \
-	vstr	d0, [\B, #\base + 8]
-	sfi_breg src, \
-	vldr	d0, [\B, #\base + 8]
-	sfi_breg dst, \
-	vstr	d1, [\B, #\base + 16]
-	sfi_breg src, \
-	vldr	d1, [\B, #\base + 16]
-	sfi_breg dst, \
-	vstr	d2, [\B, #\base + 24]
-	sfi_breg src, \
-	vldr	d2, [\B, #\base + 24]
-	sfi_breg dst, \
-	vstr	\vreg, [\B, #\base + 32]
-	sfi_breg dst, \
-	vstr	d0, [\B, #\base + 40]
-	sfi_breg src, \
-	vldr	d0, [\B, #\base + 40]
-	sfi_breg dst, \
-	vstr	d1, [\B, #\base + 48]
-	sfi_breg src, \
-	vldr	d1, [\B, #\base + 48]
-	sfi_breg dst, \
-	vstr	d2, [\B, #\base + 56]
-	sfi_breg src, \
-	vldr	d2, [\B, #\base + 56]
+	vstr	\vreg, [dst, #\base]
+	vldr	\vreg, [src, #\base]
+	vstr	d0, [dst, #\base + 8]
+	vldr	d0, [src, #\base + 8]
+	vstr	d1, [dst, #\base + 16]
+	vldr	d1, [src, #\base + 16]
+	vstr	d2, [dst, #\base + 24]
+	vldr	d2, [src, #\base + 24]
+	vstr	\vreg, [dst, #\base + 32]
+	vstr	d0, [dst, #\base + 40]
+	vldr	d0, [src, #\base + 40]
+	vstr	d1, [dst, #\base + 48]
+	vldr	d1, [src, #\base + 48]
+	vstr	d2, [dst, #\base + 56]
+	vldr	d2, [src, #\base + 56]
 	.endm
 #endif
 
@@ -307,7 +276,7 @@ ENTRY(memcpy)
 #ifdef USE_NEON
 	/* These need an extra layer of macro just to work around a
 	   bug in the assembler's parser when an operand starts with
-	   a {...}.  http://sourceware.org/bugzilla/show_bug.cgi?id=15647
+	   a {...}.  https://sourceware.org/bugzilla/show_bug.cgi?id=15647
 	   tracks that bug; it was not fixed as of binutils-2.23.2.  */
 	.macro neon_load_d0 reg
 	vld1.8	{d0}, [\reg]!
@@ -316,26 +285,16 @@ ENTRY(memcpy)
 	vst1.8	{d0}, [\reg]!
 	.endm
 
-	/* These are used by the NaCl sfi_breg macro.  */
-	.macro _sfi_breg_dmask_neon_load_d0 reg
-	_sfi_dmask \reg
-	.endm
-	.macro _sfi_breg_dmask_neon_store_d0 reg
-	_sfi_dmask \reg
-	.endm
-
 	and	tmp1, count, #0x38
 	.macro dispatch_step i
-	sfi_breg src, neon_load_d0 \B
-	sfi_breg dst, neon_store_d0 \B
+	neon_load_d0 src
+	neon_store_d0 dst
 	.endm
 	dispatch_7_dword
 
 	tst	count, #4
-	sfi_breg src, \
-	ldrne	tmp1, [\B], #4
-	sfi_breg dst, \
-	strne	tmp1, [\B], #4
+	ldrne	tmp1, [src], #4
+	strne	tmp1, [dst], #4
 #else
 	/* Copy up to 15 full words of data.  May not be aligned.  */
 	/* Cannot use VFP for unaligned data.  */
@@ -344,23 +303,17 @@ ENTRY(memcpy)
 	add	src, src, tmp1
 	/* Jump directly into the sequence below at the correct offset.  */
 	.macro dispatch_step i
-	sfi_breg src, \
-	ldr	tmp1, [\B, #-(\i * 4)]
-	sfi_breg dst, \
-	str	tmp1, [\B, #-(\i * 4)]
+	ldr	tmp1, [src, #-(\i * 4)]
+	str	tmp1, [dst, #-(\i * 4)]
 	.endm
 	dispatch_15_word
 #endif
 
 	lsls	count, count, #31
-	sfi_breg src, \
-	ldrhcs	tmp1, [\B], #2
-	sfi_breg src, \
-	ldrbne	src, [\B]		/* Src is dead, use as a scratch.  */
-	sfi_breg dst, \
-	strhcs	tmp1, [\B], #2
-	sfi_breg dst, \
-	strbne	src, [\B]
+	ldrhcs	tmp1, [src], #2
+	ldrbne	src, [src]		/* Src is dead, use as a scratch.  */
+	strhcs	tmp1, [dst], #2
+	strbne	src, [dst]
 	bx	lr
 
 .Lcpy_not_short:
@@ -388,19 +341,13 @@ ENTRY(memcpy)
 	beq	1f
 	rsbs	tmp2, tmp2, #0
 	sub	count, count, tmp2, lsr #29
-	sfi_breg src, \
-	ldrmi	tmp1, [\B], #4
-	sfi_breg dst, \
-	strmi	tmp1, [\B], #4
+	ldrmi	tmp1, [src], #4
+	strmi	tmp1, [dst], #4
 	lsls	tmp2, tmp2, #2
-	sfi_breg src, \
-	ldrhcs	tmp1, [\B], #2
-	sfi_breg src, \
-	ldrbne	tmp2, [\B], #1
-	sfi_breg dst, \
-	strhcs	tmp1, [\B], #2
-	sfi_breg dst, \
-	strbne	tmp2, [\B], #1
+	ldrhcs	tmp1, [src], #2
+	ldrbne	tmp2, [src], #1
+	strhcs	tmp1, [dst], #2
+	strbne	tmp2, [dst], #1
 
 1:
 	subs	tmp2, count, #64	/* Use tmp2 for count.  */
@@ -412,40 +359,24 @@ ENTRY(memcpy)
 .Lcpy_body_medium:			/* Count in tmp2.  */
 #ifdef USE_VFP
 1:
-	sfi_breg src, \
-	vldr	d0, [\B, #0]
+	vldr	d0, [src, #0]
 	subs	tmp2, tmp2, #64
-	sfi_breg src, \
-	vldr	d1, [\B, #8]
-	sfi_breg dst, \
-	vstr	d0, [\B, #0]
-	sfi_breg src, \
-	vldr	d0, [\B, #16]
-	sfi_breg dst, \
-	vstr	d1, [\B, #8]
-	sfi_breg src, \
-	vldr	d1, [\B, #24]
-	sfi_breg dst, \
-	vstr	d0, [\B, #16]
-	sfi_breg src, \
-	vldr	d0, [\B, #32]
-	sfi_breg dst, \
-	vstr	d1, [\B, #24]
-	sfi_breg src, \
-	vldr	d1, [\B, #40]
-	sfi_breg dst, \
-	vstr	d0, [\B, #32]
-	sfi_breg src, \
-	vldr	d0, [\B, #48]
-	sfi_breg dst, \
-	vstr	d1, [\B, #40]
-	sfi_breg src, \
-	vldr	d1, [\B, #56]
-	sfi_breg dst, \
-	vstr	d0, [\B, #48]
+	vldr	d1, [src, #8]
+	vstr	d0, [dst, #0]
+	vldr	d0, [src, #16]
+	vstr	d1, [dst, #8]
+	vldr	d1, [src, #24]
+	vstr	d0, [dst, #16]
+	vldr	d0, [src, #32]
+	vstr	d1, [dst, #24]
+	vldr	d1, [src, #40]
+	vstr	d0, [dst, #32]
+	vldr	d0, [src, #48]
+	vstr	d1, [dst, #40]
+	vldr	d1, [src, #56]
+	vstr	d0, [dst, #48]
 	add	src, src, #64
-	sfi_breg dst, \
-	vstr	d1, [\B, #56]
+	vstr	d1, [dst, #56]
 	add	dst, dst, #64
 	bge	1b
 	tst	tmp2, #0x3f
@@ -456,48 +387,30 @@ ENTRY(memcpy)
 	add	dst, dst, tmp1
 	add	src, src, tmp1
 	.macro dispatch_step i
-	sfi_breg src, \
-	vldr	d0, [\B, #-(\i * 8)]
-	sfi_breg dst, \
-	vstr	d0, [\B, #-(\i * 8)]
+	vldr	d0, [src, #-(\i * 8)]
+	vstr	d0, [dst, #-(\i * 8)]
 	.endm
 	dispatch_7_dword
 #else
 	sub	src, src, #8
 	sub	dst, dst, #8
 1:
-	sfi_breg src, \
-	ldrd	A_l, A_h, [\B, #8]
-	sfi_breg dst, \
-	strd	A_l, A_h, [\B, #8]
-	sfi_breg src, \
-	ldrd	A_l, A_h, [\B, #16]
-	sfi_breg dst, \
-	strd	A_l, A_h, [\B, #16]
-	sfi_breg src, \
-	ldrd	A_l, A_h, [\B, #24]
-	sfi_breg dst, \
-	strd	A_l, A_h, [\B, #24]
-	sfi_breg src, \
-	ldrd	A_l, A_h, [\B, #32]
-	sfi_breg dst, \
-	strd	A_l, A_h, [\B, #32]
-	sfi_breg src, \
-	ldrd	A_l, A_h, [\B, #40]
-	sfi_breg dst, \
-	strd	A_l, A_h, [\B, #40]
-	sfi_breg src, \
-	ldrd	A_l, A_h, [\B, #48]
-	sfi_breg dst, \
-	strd	A_l, A_h, [\B, #48]
-	sfi_breg src, \
-	ldrd	A_l, A_h, [\B, #56]
-	sfi_breg dst, \
-	strd	A_l, A_h, [\B, #56]
-	sfi_breg src, \
-	ldrd	A_l, A_h, [\B, #64]!
-	sfi_breg dst, \
-	strd	A_l, A_h, [\B, #64]!
+	ldrd	A_l, A_h, [src, #8]
+	strd	A_l, A_h, [dst, #8]
+	ldrd	A_l, A_h, [src, #16]
+	strd	A_l, A_h, [dst, #16]
+	ldrd	A_l, A_h, [src, #24]
+	strd	A_l, A_h, [dst, #24]
+	ldrd	A_l, A_h, [src, #32]
+	strd	A_l, A_h, [dst, #32]
+	ldrd	A_l, A_h, [src, #40]
+	strd	A_l, A_h, [dst, #40]
+	ldrd	A_l, A_h, [src, #48]
+	strd	A_l, A_h, [dst, #48]
+	ldrd	A_l, A_h, [src, #56]
+	strd	A_l, A_h, [dst, #56]
+	ldrd	A_l, A_h, [src, #64]!
+	strd	A_l, A_h, [dst, #64]!
 	subs	tmp2, tmp2, #64
 	bge	1b
 	tst	tmp2, #0x3f
@@ -524,28 +437,20 @@ ENTRY(memcpy)
 	add	dst, dst, tmp1
 	add	src, src, tmp1
 	.macro dispatch_step i
-	sfi_breg src, \
-	ldrd	A_l, A_h, [\B, #-(\i * 8)]
-	sfi_breg dst, \
-	strd	A_l, A_h, [\B, #-(\i * 8)]
+	ldrd	A_l, A_h, [src, #-(\i * 8)]
+	strd	A_l, A_h, [dst, #-(\i * 8)]
 	.endm
 	dispatch_7_dword
 #endif
 
 	tst	tmp2, #4
-	sfi_breg src, \
-	ldrne	tmp1, [\B], #4
-	sfi_breg dst, \
-	strne	tmp1, [\B], #4
+	ldrne	tmp1, [src], #4
+	strne	tmp1, [dst], #4
 	lsls	tmp2, tmp2, #31		/* Count (tmp2) now dead. */
-	sfi_breg src, \
-	ldrhcs	tmp1, [\B], #2
-	sfi_breg src, \
-	ldrbne	tmp2, [\B]
-	sfi_breg dst, \
-	strhcs	tmp1, [\B], #2
-	sfi_breg dst, \
-	strbne	tmp2, [\B]
+	ldrhcs	tmp1, [src], #2
+	ldrbne	tmp2, [src]
+	strhcs	tmp1, [dst], #2
+	strbne	tmp2, [dst]
 
 .Ldone:
 	ldr	tmp2, [sp], #FRAME_SIZE
@@ -565,23 +470,15 @@ ENTRY(memcpy)
 	   copy position into a register.  This should act like a PLD
 	   operation but we won't have to repeat the transfer.  */
 
-	sfi_breg src, \
-	vldr	d3, [\B, #0]
-	sfi_breg src, \
-	vldr	d4, [\B, #64]
-	sfi_breg src, \
-	vldr	d5, [\B, #128]
-	sfi_breg src, \
-	vldr	d6, [\B, #192]
-	sfi_breg src, \
-	vldr	d7, [\B, #256]
-
-	sfi_breg src, \
-	vldr	d0, [\B, #8]
-	sfi_breg src, \
-	vldr	d1, [\B, #16]
-	sfi_breg src, \
-	vldr	d2, [\B, #24]
+	vldr	d3, [src, #0]
+	vldr	d4, [src, #64]
+	vldr	d5, [src, #128]
+	vldr	d6, [src, #192]
+	vldr	d7, [src, #256]
+
+	vldr	d0, [src, #8]
+	vldr	d1, [src, #16]
+	vldr	d2, [src, #24]
 	add	src, src, #32
 
 	subs	tmp2, tmp2, #prefetch_lines * 64 * 2
@@ -606,31 +503,19 @@ ENTRY(memcpy)
 	add	src, src, #3 * 64
 	add	dst, dst, #3 * 64
 	cpy_tail_vfp	d6, 0
-	sfi_breg dst, \
-	vstr	d7, [\B, #64]
-	sfi_breg src, \
-	vldr	d7, [\B, #64]
-	sfi_breg dst, \
-	vstr	d0, [\B, #64 + 8]
-	sfi_breg src, \
-	vldr	d0, [\B, #64 + 8]
-	sfi_breg dst, \
-	vstr	d1, [\B, #64 + 16]
-	sfi_breg src, \
-	vldr	d1, [\B, #64 + 16]
-	sfi_breg dst, \
-	vstr	d2, [\B, #64 + 24]
-	sfi_breg src, \
-	vldr	d2, [\B, #64 + 24]
-	sfi_breg dst, \
-	vstr	d7, [\B, #64 + 32]
+	vstr	d7, [dst, #64]
+	vldr	d7, [src, #64]
+	vstr	d0, [dst, #64 + 8]
+	vldr	d0, [src, #64 + 8]
+	vstr	d1, [dst, #64 + 16]
+	vldr	d1, [src, #64 + 16]
+	vstr	d2, [dst, #64 + 24]
+	vldr	d2, [src, #64 + 24]
+	vstr	d7, [dst, #64 + 32]
 	add	src, src, #96
-	sfi_breg dst, \
-	vstr	d0, [\B, #64 + 40]
-	sfi_breg dst, \
-	vstr	d1, [\B, #64 + 48]
-	sfi_breg dst, \
-	vstr	d2, [\B, #64 + 56]
+	vstr	d0, [dst, #64 + 40]
+	vstr	d1, [dst, #64 + 48]
+	vstr	d2, [dst, #64 + 56]
 	add	dst, dst, #128
 	add	tmp2, tmp2, #prefetch_lines * 64
 	b	.Lcpy_body_medium
@@ -641,83 +526,59 @@ ENTRY(memcpy)
 	/* Pre-bias src and dst.  */
 	sub	src, src, #8
 	sub	dst, dst, #8
-	sfi_pld	src, #8
-	sfi_pld	src, #72
+	pld	[src, #8]
+	pld	[src, #72]
 	subs	tmp2, tmp2, #64
-	sfi_pld	src, #136
-	sfi_breg src, \
-	ldrd	A_l, A_h, [\B, #8]
+	pld	[src, #136]
+	ldrd	A_l, A_h, [src, #8]
 	strd	B_l, B_h, [sp, #8]
 	cfi_rel_offset (B_l, 8)
 	cfi_rel_offset (B_h, 12)
-	sfi_breg src, \
-	ldrd	B_l, B_h, [\B, #16]
+	ldrd	B_l, B_h, [src, #16]
 	strd	C_l, C_h, [sp, #16]
 	cfi_rel_offset (C_l, 16)
 	cfi_rel_offset (C_h, 20)
-	sfi_breg src, \
-	ldrd	C_l, C_h, [\B, #24]
+	ldrd	C_l, C_h, [src, #24]
 	strd	D_l, D_h, [sp, #24]
 	cfi_rel_offset (D_l, 24)
 	cfi_rel_offset (D_h, 28)
-	sfi_pld	src, #200
-	sfi_breg src, \
-	ldrd	D_l, D_h, [\B, #32]!
+	pld	[src, #200]
+	ldrd	D_l, D_h, [src, #32]!
 	b	1f
 	.p2align	6
 2:
-	sfi_pld	src, #232
-	sfi_breg dst, \
-	strd	A_l, A_h, [\B, #40]
-	sfi_breg src, \
-	ldrd	A_l, A_h, [\B, #40]
-	sfi_breg dst, \
-	strd	B_l, B_h, [\B, #48]
-	sfi_breg src, \
-	ldrd	B_l, B_h, [\B, #48]
-	sfi_breg dst, \
-	strd	C_l, C_h, [\B, #56]
-	sfi_breg src, \
-	ldrd	C_l, C_h, [\B, #56]
-	sfi_breg dst, \
-	strd	D_l, D_h, [\B, #64]!
-	sfi_breg src, \
-	ldrd	D_l, D_h, [\B, #64]!
+	pld	[src, #232]
+	strd	A_l, A_h, [dst, #40]
+	ldrd	A_l, A_h, [src, #40]
+	strd	B_l, B_h, [dst, #48]
+	ldrd	B_l, B_h, [src, #48]
+	strd	C_l, C_h, [dst, #56]
+	ldrd	C_l, C_h, [src, #56]
+	strd	D_l, D_h, [dst, #64]!
+	ldrd	D_l, D_h, [src, #64]!
 	subs	tmp2, tmp2, #64
 1:
-	sfi_breg dst, \
-	strd	A_l, A_h, [\B, #8]
-	sfi_breg src, \
-	ldrd	A_l, A_h, [\B, #8]
-	sfi_breg dst, \
-	strd	B_l, B_h, [\B, #16]
-	sfi_breg src, \
-	ldrd	B_l, B_h, [\B, #16]
-	sfi_breg dst, \
-	strd	C_l, C_h, [\B, #24]
-	sfi_breg src, \
-	ldrd	C_l, C_h, [\B, #24]
-	sfi_breg dst, \
-	strd	D_l, D_h, [\B, #32]
-	sfi_breg src, \
-	ldrd	D_l, D_h, [\B, #32]
+	strd	A_l, A_h, [dst, #8]
+	ldrd	A_l, A_h, [src, #8]
+	strd	B_l, B_h, [dst, #16]
+	ldrd	B_l, B_h, [src, #16]
+	strd	C_l, C_h, [dst, #24]
+	ldrd	C_l, C_h, [src, #24]
+	strd	D_l, D_h, [dst, #32]
+	ldrd	D_l, D_h, [src, #32]
 	bcs	2b
 	/* Save the remaining bytes and restore the callee-saved regs.  */
-	sfi_breg dst, \
-	strd	A_l, A_h, [\B, #40]
+	strd	A_l, A_h, [dst, #40]
 	add	src, src, #40
-	sfi_breg dst, \
-	strd	B_l, B_h, [\B, #48]
+	strd	B_l, B_h, [dst, #48]
 	ldrd	B_l, B_h, [sp, #8]
 	cfi_restore (B_l)
 	cfi_restore (B_h)
-	sfi_breg dst, \
-	strd	C_l, C_h, [\B, #56]
+	strd	C_l, C_h, [dst, #56]
 	ldrd	C_l, C_h, [sp, #16]
 	cfi_restore (C_l)
 	cfi_restore (C_h)
-	sfi_breg dst, \
-	strd	D_l, D_h, [\B, #64]
+	strd	D_l, D_h, [dst, #64]
 	ldrd	D_l, D_h, [sp, #24]
 	cfi_restore (D_l)
 	cfi_restore (D_h)
@@ -734,35 +595,29 @@ ENTRY(memcpy)
 	cfi_remember_state
 
 .Lcpy_notaligned:
-	sfi_pld	src
-	sfi_pld	src, #64
+	pld	[src, #0]
+	pld	[src, #64]
 	/* There's at least 64 bytes to copy, but there is no mutual
 	   alignment.  */
 	/* Bring DST to 64-bit alignment.  */
 	lsls	tmp2, dst, #29
-	sfi_pld	src, #(2 * 64)
+	pld	[src, #(2 * 64)]
 	beq	1f
 	rsbs	tmp2, tmp2, #0
 	sub	count, count, tmp2, lsr #29
-	sfi_breg src, \
-	ldrmi	tmp1, [\B], #4
-	sfi_breg dst, \
-	strmi	tmp1, [\B], #4
+	ldrmi	tmp1, [src], #4
+	strmi	tmp1, [dst], #4
 	lsls	tmp2, tmp2, #2
-	sfi_breg src, \
-	ldrbne	tmp1, [\B], #1
-	sfi_breg src, \
-	ldrhcs	tmp2, [\B], #2
-	sfi_breg dst, \
-	strbne	tmp1, [\B], #1
-	sfi_breg dst, \
-	strhcs	tmp2, [\B], #2
+	ldrbne	tmp1, [src], #1
+	ldrhcs	tmp2, [src], #2
+	strbne	tmp1, [dst], #1
+	strhcs	tmp2, [dst], #2
 1:
-	sfi_pld	src, #(3 * 64)
+	pld	[src, #(3 * 64)]
 	subs	count, count, #64
 	ldrmi	tmp2, [sp], #FRAME_SIZE
 	bmi	.Ltail63unaligned
-	sfi_pld	src, #(4 * 64)
+	pld	[src, #(4 * 64)]
 
 #ifdef USE_NEON
 	/* These need an extra layer of macro just to work around a
@@ -775,132 +630,88 @@ ENTRY(memcpy)
 	vst1.8	{\reglist}, [ALIGN (\basereg, 64)]!
 	.endm
 
-	/* These are used by the NaCl sfi_breg macro.  */
-	.macro _sfi_breg_dmask_neon_load_multi reg
-	_sfi_dmask \reg
-	.endm
-	.macro _sfi_breg_dmask_neon_store_multi reg
-	_sfi_dmask \reg
-	.endm
-
-	sfi_breg src, neon_load_multi d0-d3, \B
-	sfi_breg src, neon_load_multi d4-d7, \B
+	neon_load_multi d0-d3, src
+	neon_load_multi d4-d7, src
 	subs	count, count, #64
 	bmi	2f
 1:
-	sfi_pld	src, #(4 * 64)
-	sfi_breg dst, neon_store_multi d0-d3, \B
-	sfi_breg src, neon_load_multi d0-d3, \B
-	sfi_breg dst, neon_store_multi d4-d7, \B
-	sfi_breg src, neon_load_multi d4-d7, \B
+	pld	[src, #(4 * 64)]
+	neon_store_multi d0-d3, dst
+	neon_load_multi d0-d3, src
+	neon_store_multi d4-d7, dst
+	neon_load_multi d4-d7, src
 	subs	count, count, #64
 	bpl	1b
 2:
-	sfi_breg dst, neon_store_multi d0-d3, \B
-	sfi_breg dst, neon_store_multi d4-d7, \B
+	neon_store_multi d0-d3, dst
+	neon_store_multi d4-d7, dst
 	ands	count, count, #0x3f
 #else
 	/* Use an SMS style loop to maximize the I/O bandwidth.  */
 	sub	src, src, #4
 	sub	dst, dst, #8
 	subs	tmp2, count, #64	/* Use tmp2 for count.  */
-	sfi_breg src, \
-	ldr	A_l, [\B, #4]
-	sfi_breg src, \
-	ldr	A_h, [\B, #8]
+	ldr	A_l, [src, #4]
+	ldr	A_h, [src, #8]
 	strd	B_l, B_h, [sp, #8]
 	cfi_rel_offset (B_l, 8)
 	cfi_rel_offset (B_h, 12)
-	sfi_breg src, \
-	ldr	B_l, [\B, #12]
-	sfi_breg src, \
-	ldr	B_h, [\B, #16]
+	ldr	B_l, [src, #12]
+	ldr	B_h, [src, #16]
 	strd	C_l, C_h, [sp, #16]
 	cfi_rel_offset (C_l, 16)
 	cfi_rel_offset (C_h, 20)
-	sfi_breg src, \
-	ldr	C_l, [\B, #20]
-	sfi_breg src, \
-	ldr	C_h, [\B, #24]
+	ldr	C_l, [src, #20]
+	ldr	C_h, [src, #24]
 	strd	D_l, D_h, [sp, #24]
 	cfi_rel_offset (D_l, 24)
 	cfi_rel_offset (D_h, 28)
-	sfi_breg src, \
-	ldr	D_l, [\B, #28]
-	sfi_breg src, \
-	ldr	D_h, [\B, #32]!
+	ldr	D_l, [src, #28]
+	ldr	D_h, [src, #32]!
 	b	1f
 	.p2align	6
 2:
-	sfi_pld	src, #(5 * 64) - (32 - 4)
-	sfi_breg dst, \
-	strd	A_l, A_h, [\B, #40]
-	sfi_breg src, \
-	ldr	A_l, [\B, #36]
-	sfi_breg src, \
-	ldr	A_h, [\B, #40]
-	sfi_breg dst, \
-	strd	B_l, B_h, [\B, #48]
-	sfi_breg src, \
-	ldr	B_l, [\B, #44]
-	sfi_breg src, \
-	ldr	B_h, [\B, #48]
-	sfi_breg dst, \
-	strd	C_l, C_h, [\B, #56]
-	sfi_breg src, \
-	ldr	C_l, [\B, #52]
-	sfi_breg src, \
-	ldr	C_h, [\B, #56]
-	sfi_breg dst, \
-	strd	D_l, D_h, [\B, #64]!
-	sfi_breg src, \
-	ldr	D_l, [\B, #60]
-	sfi_breg src, \
-	ldr	D_h, [\B, #64]!
+	pld	[src, #(5 * 64) - (32 - 4)]
+	strd	A_l, A_h, [dst, #40]
+	ldr	A_l, [src, #36]
+	ldr	A_h, [src, #40]
+	strd	B_l, B_h, [dst, #48]
+	ldr	B_l, [src, #44]
+	ldr	B_h, [src, #48]
+	strd	C_l, C_h, [dst, #56]
+	ldr	C_l, [src, #52]
+	ldr	C_h, [src, #56]
+	strd	D_l, D_h, [dst, #64]!
+	ldr	D_l, [src, #60]
+	ldr	D_h, [src, #64]!
 	subs	tmp2, tmp2, #64
 1:
-	sfi_breg dst, \
-	strd	A_l, A_h, [\B, #8]
-	sfi_breg src, \
-	ldr	A_l, [\B, #4]
-	sfi_breg src, \
-	ldr	A_h, [\B, #8]
-	sfi_breg dst, \
-	strd	B_l, B_h, [\B, #16]
-	sfi_breg src, \
-	ldr	B_l, [\B, #12]
-	sfi_breg src, \
-	ldr	B_h, [\B, #16]
-	sfi_breg dst, \
-	strd	C_l, C_h, [\B, #24]
-	sfi_breg src, \
-	ldr	C_l, [\B, #20]
-	sfi_breg src, \
-	ldr	C_h, [\B, #24]
-	sfi_breg dst, \
-	strd	D_l, D_h, [\B, #32]
-	sfi_breg src, \
-	ldr	D_l, [\B, #28]
-	sfi_breg src, \
-	ldr	D_h, [\B, #32]
+	strd	A_l, A_h, [dst, #8]
+	ldr	A_l, [src, #4]
+	ldr	A_h, [src, #8]
+	strd	B_l, B_h, [dst, #16]
+	ldr	B_l, [src, #12]
+	ldr	B_h, [src, #16]
+	strd	C_l, C_h, [dst, #24]
+	ldr	C_l, [src, #20]
+	ldr	C_h, [src, #24]
+	strd	D_l, D_h, [dst, #32]
+	ldr	D_l, [src, #28]
+	ldr	D_h, [src, #32]
 	bcs	2b
 
 	/* Save the remaining bytes and restore the callee-saved regs.  */
-	sfi_breg dst, \
-	strd	A_l, A_h, [\B, #40]
+	strd	A_l, A_h, [dst, #40]
 	add	src, src, #36
-	sfi_breg dst, \
-	strd	B_l, B_h, [\B, #48]
+	strd	B_l, B_h, [dst, #48]
 	ldrd	B_l, B_h, [sp, #8]
 	cfi_restore (B_l)
 	cfi_restore (B_h)
-	sfi_breg dst, \
-	strd	C_l, C_h, [\B, #56]
+	strd	C_l, C_h, [dst, #56]
 	ldrd	C_l, C_h, [sp, #16]
 	cfi_restore (C_l)
 	cfi_restore (C_h)
-	sfi_breg dst, \
-	strd	D_l, D_h, [\B, #64]
+	strd	D_l, D_h, [dst, #64]
 	ldrd	D_l, D_h, [sp, #24]
 	cfi_restore (D_l)
 	cfi_restore (D_h)
diff --git a/sysdeps/arm/armv7/multiarch/memcpy_neon.S b/sysdeps/arm/armv7/multiarch/memcpy_neon.S
index e60d1cc0e1..1a8d8bbe9e 100644
--- a/sysdeps/arm/armv7/multiarch/memcpy_neon.S
+++ b/sysdeps/arm/armv7/multiarch/memcpy_neon.S
@@ -1,8 +1,8 @@
-#ifdef __ARM_NEON__
-/* Under __ARM_NEON__, this file defines memcpy directly.  */
-libc_hidden_builtin_def (memcpy)
-#else
+/* When __ARM_NEON__ is defined, this file defines memcpy itself.  */
+#ifndef __ARM_NEON__
 # define memcpy __memcpy_neon
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(a)
 #endif
 
 #define MEMCPY_NEON
diff --git a/sysdeps/arm/armv7/multiarch/memcpy_vfp.S b/sysdeps/arm/armv7/multiarch/memcpy_vfp.S
index e008c041ed..d1e9ede439 100644
--- a/sysdeps/arm/armv7/multiarch/memcpy_vfp.S
+++ b/sysdeps/arm/armv7/multiarch/memcpy_vfp.S
@@ -1,7 +1,9 @@
-/* Under __ARM_NEON__, memcpy_neon.S defines memcpy directly
+/* Under __ARM_NEON__ memcpy_neon.S defines memcpy directly
    and the __memcpy_vfp code will never be used.  */
 #ifndef __ARM_NEON__
 # define MEMCPY_VFP
 # define memcpy __memcpy_vfp
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(a)
 # include "memcpy_impl.S"
 #endif
diff --git a/sysdeps/arm/armv7/multiarch/rtld-memchr.S b/sysdeps/arm/armv7/multiarch/rtld-memchr.S
new file mode 100644
index 0000000000..ae8e5f04c4
--- /dev/null
+++ b/sysdeps/arm/armv7/multiarch/rtld-memchr.S
@@ -0,0 +1 @@
+#include <sysdeps/arm/armv6t2/memchr.S>
diff --git a/sysdeps/arm/armv7/multiarch/rtld-memcpy.S b/sysdeps/arm/armv7/multiarch/rtld-memcpy.S
new file mode 100644
index 0000000000..ca2387531b
--- /dev/null
+++ b/sysdeps/arm/armv7/multiarch/rtld-memcpy.S
@@ -0,0 +1 @@
+#include <sysdeps/arm/armv7/multiarch/memcpy_impl.S>
diff --git a/sysdeps/arm/armv7/strcmp.S b/sysdeps/arm/armv7/strcmp.S
index 5bcaf21ee2..2626fdf72e 100644
--- a/sysdeps/arm/armv7/strcmp.S
+++ b/sysdeps/arm/armv7/strcmp.S
@@ -1,5 +1,5 @@
 /* strcmp implementation for ARMv7-A, optimized for Cortex-A15.
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
+   Copyright (C) 2012-2018 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -83,8 +83,6 @@
 #define syndrome	tmp2
 
 
-#ifndef NO_THUMB
-/* This code is best on Thumb.  */
 	.thumb
 
 /* In Thumb code we can't use MVN with a register shift, but we do have ORN.  */
@@ -94,27 +92,6 @@
 .macro apply_mask data_reg, mask_reg
 	orn \data_reg, \data_reg, \mask_reg
 .endm
-#else
-/* In ARM code we don't have ORN, but we can use MVN with a register shift.  */
-.macro prepare_mask mask_reg, nbits_reg
-	mvn \mask_reg, const_m1, S2HI \nbits_reg
-.endm
-.macro apply_mask data_reg, mask_reg
-	orr \data_reg, \data_reg, \mask_reg
-.endm
-
-/* These clobber the condition codes, which the real Thumb cbz/cbnz
-   instructions do not.  But it doesn't matter for any of the uses here.  */
-.macro cbz reg, label
-	cmp \reg, #0
-	beq \label
-.endm
-.macro cbnz reg, label
-	cmp \reg, #0
-	bne \label
-.endm
-#endif
-
 
 	/* Macro to compute and return the result value for word-aligned
 	   cases.  */
@@ -178,10 +155,8 @@
 #endif
 ENTRY (strcmp)
 #if STRCMP_PRECHECK == 1
-	sfi_breg src1, \
-	ldrb	r2, [\B]
-	sfi_breg src2, \
-	ldrb	r3, [\B]
+	ldrb	r2, [src1]
+	ldrb	r3, [src2]
 	cmp	r2, #1
 	it	cs
 	cmpcs	r2, r3
@@ -211,11 +186,9 @@ ENTRY (strcmp)
 	and	tmp2, tmp1, #3
 	bic	src2, src2, #7
 	lsl	tmp2, tmp2, #3	/* Bytes -> bits.  */
-	sfi_breg src1, \
-	ldrd	data1a, data1b, [\B], #16
+	ldrd	data1a, data1b, [src1], #16
 	tst	tmp1, #4
-	sfi_breg src2, \
-	ldrd	data2a, data2b, [\B], #16
+	ldrd	data2a, data2b, [src2], #16
 	prepare_mask tmp1, tmp2
 	apply_mask data1a, tmp1
 	apply_mask data2a, tmp1
@@ -231,10 +204,8 @@ ENTRY (strcmp)
 	.p2align 5,,12  /* Don't start in the tail bytes of a cache line.  */
 	.p2align 2	/* Always word aligned.  */
 .Lloop_aligned8:
-	sfi_breg src1, \
-	ldrd	data1a, data1b, [\B], #16
-	sfi_breg src2, \
-	ldrd	data2a, data2b, [\B], #16
+	ldrd	data1a, data1b, [src1], #16
+	ldrd	data2a, data2b, [src2], #16
 .Lstart_realigned8:
 	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits,  */
 	eor	syndrome_a, data1a, data2a
@@ -245,10 +216,8 @@ ENTRY (strcmp)
 	sel	syndrome_b, syndrome_b, const_m1
 	cbnz	syndrome_b, .Ldiff_in_b
 
-	sfi_breg src1, \
-	ldrd	data1a, data1b, [\B, #-8]
-	sfi_breg src2, \
-	ldrd	data2a, data2b, [\B, #-8]
+	ldrd	data1a, data1b, [src1, #-8]
+	ldrd	data2a, data2b, [src2, #-8]
 	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits,  */
 	eor	syndrome_a, data1a, data2a
 	sel	syndrome_a, syndrome_a, const_m1
@@ -279,19 +248,15 @@ ENTRY (strcmp)
 	/* Unrolled by a factor of 2, to reduce the number of post-increment
 	   operations.  */
 .Lloop_aligned4:
-	sfi_breg src1, \
-	ldr	data1, [\B], #8
-	sfi_breg src2, \
-	ldr	data2, [\B], #8
+	ldr	data1, [src1], #8
+	ldr	data2, [src2], #8
 .Lstart_realigned4:
 	uadd8	syndrome, data1, const_m1	/* Only need GE bits.  */
 	eor	syndrome, data1, data2
 	sel	syndrome, syndrome, const_m1
 	cbnz	syndrome, .Laligned4_done
-	sfi_breg src1, \
-	ldr	data1, [\B, #-4]
-	sfi_breg src2, \
-	ldr	data2, [\B, #-4]
+	ldr	data1, [src1, #-4]
+	ldr	data2, [src2, #-4]
 	uadd8	syndrome, data1, const_m1
 	eor	syndrome, data1, data2
 	sel	syndrome, syndrome, const_m1
@@ -307,11 +272,9 @@ ENTRY (strcmp)
 	   masking off the unwanted loaded data to prevent a difference.  */
 	lsl	tmp1, tmp1, #3	/* Bytes -> bits.  */
 	bic	src1, src1, #3
-	sfi_breg src1, \
-	ldr	data1, [\B], #8
+	ldr	data1, [src1], #8
 	bic	src2, src2, #3
-	sfi_breg src2, \
-	ldr	data2, [\B], #8
+	ldr	data2, [src2], #8
 
 	prepare_mask tmp1, tmp1
 	apply_mask data1, tmp1
@@ -324,30 +287,26 @@ ENTRY (strcmp)
 	sub	src2, src2, tmp1
 	bic	src1, src1, #3
 	lsls	tmp1, tmp1, #31
-	sfi_breg src1, \
-	ldr	data1, [\B], #4
+	ldr	data1, [src1], #4
 	beq	.Laligned_m2
 	bcs	.Laligned_m1
 
 #if STRCMP_PRECHECK == 0
-	sfi_breg src2, \
-	ldrb	data2, [\B, #1]
+	ldrb	data2, [src2, #1]
 	uxtb	tmp1, data1, ror #BYTE1_OFFSET
 	subs	tmp1, tmp1, data2
 	bne	.Lmisaligned_exit
 	cbz	data2, .Lmisaligned_exit
 
 .Laligned_m2:
-	sfi_breg src2, \
-	ldrb	data2, [\B, #2]
+	ldrb	data2, [src2, #2]
 	uxtb	tmp1, data1, ror #BYTE2_OFFSET
 	subs	tmp1, tmp1, data2
 	bne	.Lmisaligned_exit
 	cbz	data2, .Lmisaligned_exit
 
 .Laligned_m1:
-	sfi_breg src2, \
-	ldrb	data2, [\B, #3]
+	ldrb	data2, [src2, #3]
 	uxtb	tmp1, data1, ror #BYTE3_OFFSET
 	subs	tmp1, tmp1, data2
 	bne	.Lmisaligned_exit
@@ -356,16 +315,14 @@ ENTRY (strcmp)
 #else  /* STRCMP_PRECHECK */
 	/* If we've done the pre-check, then we don't need to check the
 	   first byte again here.  */
-	sfi_breg src2, \
-	ldrb	data2, [\B, #2]
+	ldrb	data2, [src2, #2]
 	uxtb	tmp1, data1, ror #BYTE2_OFFSET
 	subs	tmp1, tmp1, data2
 	bne	.Lmisaligned_exit
 	cbz	data2, .Lmisaligned_exit
 
 .Laligned_m2:
-	sfi_breg src2, \
-	ldrb	data2, [\B, #3]
+	ldrb	data2, [src2, #3]
 	uxtb	tmp1, data1, ror #BYTE3_OFFSET
 	subs	tmp1, tmp1, data2
 	bne	.Lmisaligned_exit
@@ -391,13 +348,11 @@ ENTRY (strcmp)
 	cfi_restore_state
 	/* src1 is word aligned, but src2 has no common alignment
 	   with it.  */
-	sfi_breg src1, \
-	ldr	data1, [\B], #4
+	ldr	data1, [src1], #4
 	lsls	tmp1, src2, #31		/* C=src2[1], Z=src2[0].  */
 
 	bic	src2, src2, #3
-	sfi_breg src2, \
-	ldr	data2, [\B], #4
+	ldr	data2, [src2], #4
 	bhi	.Loverlap1		/* C=1, Z=0 => src2[1:0] = 0b11.  */
 	bcs	.Loverlap2		/* C=1, Z=1 => src2[1:0] = 0b10.  */
 
@@ -409,13 +364,11 @@ ENTRY (strcmp)
 	sel	syndrome, syndrome, const_m1
 	bne	4f
 	cbnz	syndrome, 5f
-	sfi_breg src2, \
-	ldr	data2, [\B], #4
+	ldr	data2, [src2], #4
 	eor	tmp1, tmp1, data1
 	cmp	tmp1, data2, S2HI #24
 	bne	6f
-	sfi_breg src1, \
-	ldr	data1, [\B], #4
+	ldr	data1, [src1], #4
 	b	.Loverlap3
 4:
 	S2LO	data2, data2, #8
@@ -427,8 +380,7 @@ ENTRY (strcmp)
 
 	/* We can only get here if the MSB of data1 contains 0, so
 	   fast-path the exit.  */
-	sfi_breg src2, \
-	ldrb	result, [\B]
+	ldrb	result, [src2]
 	ldrd	r4, r5, [sp], #16
 	cfi_remember_state
 	cfi_def_cfa_offset (0)
@@ -454,13 +406,11 @@ ENTRY (strcmp)
 	sel	syndrome, syndrome, const_m1
 	bne	4f
 	cbnz	syndrome, 5f
-	sfi_breg src2, \
-	ldr	data2, [\B], #4
+	ldr	data2, [src2], #4
 	eor	tmp1, tmp1, data1
 	cmp	tmp1, data2, S2HI #16
 	bne	6f
-	sfi_breg src1, \
-	ldr	data1, [\B], #4
+	ldr	data1, [src1], #4
 	b	.Loverlap2
 4:
 	S2LO	data2, data2, #16
@@ -469,8 +419,7 @@ ENTRY (strcmp)
 	ands	syndrome, syndrome, const_m1, S2LO #16
 	bne	.Lstrcmp_done_equal
 
-	sfi_breg src2, \
-	ldrh	data2, [\B]
+	ldrh	data2, [src2]
 	S2LO	data1, data1, #16
 #ifdef __ARM_BIG_ENDIAN
 	lsl	data2, data2, #16
@@ -490,13 +439,11 @@ ENTRY (strcmp)
 	sel	syndrome, syndrome, const_m1
 	bne	4f
 	cbnz	syndrome, 5f
-	sfi_breg src2, \
-	ldr	data2, [\B], #4
+	ldr	data2, [src2], #4
 	eor	tmp1, tmp1, data1
 	cmp	tmp1, data2, S2HI #8
 	bne	6f
-	sfi_breg src1, \
-	ldr	data1, [\B], #4
+	ldr	data1, [src1], #4
 	b	.Loverlap1
 4:
 	S2LO	data2, data2, #24
@@ -504,8 +451,7 @@ ENTRY (strcmp)
 5:
 	tst	syndrome, #LSB
 	bne	.Lstrcmp_done_equal
-	sfi_breg src2, \
-	ldr	data2, [\B]
+	ldr	data2, [src2]
 6:
 	S2LO	data1, data1, #8
 	bic	data2, data2, #MSB