Diffstat (limited to 'sysdeps/powerpc/powerpc64/power7')
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/add_n.S        |   2
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/bcopy.c        |   1
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/fpu/Implies    |   2
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S |   4
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S  |   4
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S  |   4
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/memchr.S       |   2
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/memcmp.S       | 872
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/memcpy.S       |  24
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/memmove.S      | 831
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/mempcpy.S      |   2
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/memrchr.S      |   4
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/memset.S       |   7
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/rawmemchr.S    |   2
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/stpcpy.S       |   2
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/stpncpy.S      |  24
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/strcasecmp.S   |   2
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/strchr.S       |   2
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/strchrnul.S    |   2
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/strcmp.S       | 164
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/strcpy.S       | 329
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/strlen.S       |   2
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/strncmp.S      |   2
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/strncpy.S      | 714
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/strnlen.S      |   2
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/strrchr.S      | 255
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/strstr.S       | 509
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/sub_n.S        |   2
28 files changed, 3056 insertions, 716 deletions
diff --git a/sysdeps/powerpc/powerpc64/power7/add_n.S b/sysdeps/powerpc/powerpc64/power7/add_n.S
index 6df442ccec..0661cbdbaf 100644
--- a/sysdeps/powerpc/powerpc64/power7/add_n.S
+++ b/sysdeps/powerpc/powerpc64/power7/add_n.S
@@ -1,6 +1,6 @@
/* PowerPC64 mpn_lshift -- mpn_add_n/mpn_sub_n -- mpn addition and
subtraction.
- Copyright (C) 2003-2014 Free Software Foundation, Inc.
+ Copyright (C) 2003-2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/powerpc/powerpc64/power7/bcopy.c b/sysdeps/powerpc/powerpc64/power7/bcopy.c
new file mode 100644
index 0000000000..4a6a400e7a
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/bcopy.c
@@ -0,0 +1 @@
+/* Implemented in memmove.S.  */
diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/Implies b/sysdeps/powerpc/powerpc64/power7/fpu/Implies
index 410d289a6d..30fa17646e 100644
--- a/sysdeps/powerpc/powerpc64/power7/fpu/Implies
+++ b/sysdeps/powerpc/powerpc64/power7/fpu/Implies
@@ -1 +1 @@
-powerpc/powerpc64/power6/fpu/multiarch
+powerpc/powerpc64/power6/fpu
diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S
index 765d68914a..0d37e54491 100644
--- a/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S
+++ b/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S
@@ -1,5 +1,5 @@
/* finite(). PowerPC64/POWER7 version.
- Copyright (C) 2010-2014 Free Software Foundation, Inc.
+ Copyright (C) 2010-2015 Free Software Foundation, Inc.
Contributed by Luis Machado <luisgpm@br.ibm.com>.
This file is part of the GNU C Library.
@@ -57,7 +57,7 @@ strong_alias (__finite, __finitef)
hidden_def (__finitef)
weak_alias (__finitef, finitef)
-#ifdef IS_IN_libm
+#if IS_IN (libm)
# if LONG_DOUBLE_COMPAT (libm, GLIBC_2_0)
compat_symbol (libm, __finite, __finitel, GLIBC_2_0)
compat_symbol (libm, finite, finitel, GLIBC_2_0)
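
[Note: the change from "#ifdef IS_IN_libm" to "#if IS_IN (libm)" (here and in s_isinf.S/s_isnan.S below) follows glibc's switch to a single function-like IS_IN macro for module checks; the upstream motivation is that the old per-module #ifdef flags fail silently when misspelled, while IS_IN is always usable. A minimal sketch of the pattern, assuming glibc's internal module machinery and illustrative only, not part of the patch:]

    /* Old style: a flag only defined when building libm, so a typo
       such as IS_IN_libmm silently compiles the code out.  */
    #ifdef IS_IN_libm
    /* libm-only compat symbols */
    #endif

    /* New style: IS_IN (m) expands to a comparison against the module
       currently being built, so the preprocessor checks every use.  */
    #if IS_IN (libm)
    /* libm-only compat symbols */
    #endif
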
diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S
index e102d4b448..b24760a953 100644
--- a/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S
+++ b/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S
@@ -1,5 +1,5 @@
/* isinf(). PowerPC64/POWER7 version.
- Copyright (C) 2010-2014 Free Software Foundation, Inc.
+ Copyright (C) 2010-2015 Free Software Foundation, Inc.
Contributed by Luis Machado <luisgpm@br.ibm.com>.
This file is part of the GNU C Library.
@@ -61,7 +61,7 @@ strong_alias (__isinf, __isinfl)
weak_alias (__isinf, isinfl)
#endif
-#ifndef IS_IN_libm
+#if !IS_IN (libm)
# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0)
compat_symbol (libc, __isinf, __isinfl, GLIBC_2_0);
compat_symbol (libc, isinf, isinfl, GLIBC_2_0);
diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S
index eabee712ea..e53779b877 100644
--- a/sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S
+++ b/sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S
@@ -1,5 +1,5 @@
/* isnan(). PowerPC64/POWER7 version.
- Copyright (C) 2010-2014 Free Software Foundation, Inc.
+ Copyright (C) 2010-2015 Free Software Foundation, Inc.
Contributed by Luis Machado <luisgpm@br.ibm.com>.
This file is part of the GNU C Library.
@@ -60,7 +60,7 @@ strong_alias (__isnan, __isnanl)
weak_alias (__isnan, isnanl)
#endif
-#ifndef IS_IN_libm
+#if !IS_IN (libm)
# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0)
compat_symbol (libc, __isnan, __isnanl, GLIBC_2_0);
compat_symbol (libc, isnan, isnanl, GLIBC_2_0);
diff --git a/sysdeps/powerpc/powerpc64/power7/memchr.S b/sysdeps/powerpc/powerpc64/power7/memchr.S
index f502ad022b..0e70921a08 100644
--- a/sysdeps/powerpc/powerpc64/power7/memchr.S
+++ b/sysdeps/powerpc/powerpc64/power7/memchr.S
@@ -1,5 +1,5 @@
/* Optimized memchr implementation for PowerPC64/POWER7 using cmpb insn.
- Copyright (C) 2010-2014 Free Software Foundation, Inc.
+ Copyright (C) 2010-2015 Free Software Foundation, Inc.
Contributed by Luis Machado <luisgpm@br.ibm.com>.
This file is part of the GNU C Library.
diff --git a/sysdeps/powerpc/powerpc64/power7/memcmp.S b/sysdeps/powerpc/powerpc64/power7/memcmp.S
index 09bff696ff..d60dfdaa18 100644
--- a/sysdeps/powerpc/powerpc64/power7/memcmp.S
+++ b/sysdeps/powerpc/powerpc64/power7/memcmp.S
@@ -1,5 +1,5 @@
/* Optimized memcmp implementation for POWER7/PowerPC64.
- Copyright (C) 2010-2014 Free Software Foundation, Inc.
+ Copyright (C) 2010-2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -26,18 +26,48 @@
EALIGN (memcmp, 4, 0)
CALL_MCOUNT 3
-#define rRTN r3
-#define rSTR1 r3 /* first string arg */
-#define rSTR2 r4 /* second string arg */
-#define rN r5 /* max string length */
-#define rWORD1 r6 /* current word in s1 */
-#define rWORD2 r7 /* current word in s2 */
-#define rWORD3 r8 /* next word in s1 */
-#define rWORD4 r9 /* next word in s2 */
-#define rWORD5 r10 /* next word in s1 */
-#define rWORD6 r11 /* next word in s2 */
-#define rWORD7 r30 /* next word in s1 */
-#define rWORD8 r31 /* next word in s2 */
+#define rRTN r3
+#define rSTR1 r3 /* first string arg */
+#define rSTR2 r4 /* second string arg */
+#define rN r5 /* max string length */
+#define rWORD1 r6 /* current word in s1 */
+#define rWORD2 r7 /* current word in s2 */
+#define rWORD3 r8 /* next word in s1 */
+#define rWORD4 r9 /* next word in s2 */
+#define rWORD5 r10 /* next word in s1 */
+#define rWORD6 r11 /* next word in s2 */
+
+#define rOFF8 r20 /* 8 bytes offset. */
+#define rOFF16 r21 /* 16 bytes offset. */
+#define rOFF24 r22 /* 24 bytes offset. */
+#define rOFF32	r23	/* 32 bytes offset.  */
+#define rWORD6_SHIFT r24 /* Left rotation temp for rWORD8. */
+#define rWORD4_SHIFT r25 /* Left rotation temp for rWORD6. */
+#define rWORD2_SHIFT r26 /* Left rotation temp for rWORD4. */
+#define rWORD8_SHIFT r27 /* Left rotation temp for rWORD2. */
+#define rSHR r28 /* Unaligned shift right count. */
+#define rSHL r29 /* Unaligned shift left count. */
+#define rWORD7 r30 /* next word in s1 */
+#define rWORD8 r31 /* next word in s2 */
+
+#define rWORD8SAVE (-8)
+#define rWORD7SAVE (-16)
+#define rOFF8SAVE (-24)
+#define rOFF16SAVE (-32)
+#define rOFF24SAVE (-40)
+#define rOFF32SAVE (-48)
+#define rSHRSAVE (-56)
+#define rSHLSAVE (-64)
+#define rWORD8SHIFTSAVE (-72)
+#define rWORD2SHIFTSAVE (-80)
+#define rWORD4SHIFTSAVE (-88)
+#define rWORD6SHIFTSAVE (-96)
+
+#ifdef __LITTLE_ENDIAN__
+# define LD ldbrx
+#else
+# define LD ldx
+#endif
xor r0, rSTR2, rSTR1
cmpldi cr6, rN, 0
@@ -51,10 +81,24 @@ EALIGN (memcmp, 4, 0)
/* If less than 8 bytes or not aligned, use the unaligned
byte loop. */
blt cr1, L(bytealigned)
- std rWORD8, -8(r1)
- cfi_offset(rWORD8, -8)
- std rWORD7, -16(r1)
- cfi_offset(rWORD7, -16)
+ std rWORD8, rWORD8SAVE(r1)
+ cfi_offset(rWORD8, rWORD8SAVE)
+ std rWORD7, rWORD7SAVE(r1)
+ cfi_offset(rWORD7, rWORD7SAVE)
+	std	rOFF8, rOFF8SAVE(r1)
+	cfi_offset(rOFF8, rOFF8SAVE)
+	std	rOFF16, rOFF16SAVE(r1)
+	cfi_offset(rOFF16, rOFF16SAVE)
+	std	rOFF24, rOFF24SAVE(r1)
+	cfi_offset(rOFF24, rOFF24SAVE)
+	std	rOFF32, rOFF32SAVE(r1)
+	cfi_offset(rOFF32, rOFF32SAVE)
+
+ li rOFF8,8
+ li rOFF16,16
+ li rOFF24,24
+ li rOFF32,32
+
bne L(unaligned)
/* At this point we know both strings have the same alignment and the
compare length is at least 8 bytes. r12 contains the low order
@@ -79,15 +123,8 @@ L(samealignment):
sldi rWORD6, r12, 3
srdi r0, rN, 5 /* Divide by 32 */
andi. r12, rN, 24 /* Get the DW remainder */
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 0(rSTR1)
- ld rWORD2, 0(rSTR2)
-#endif
+ LD rWORD1, 0, rSTR1
+ LD rWORD2, 0, rSTR2
cmpldi cr1, r12, 16
cmpldi cr7, rN, 32
clrldi rN, rN, 61
@@ -104,15 +141,8 @@ L(dsP1):
cmpld cr5, rWORD5, rWORD6
blt cr7, L(dP1x)
/* Do something useful in this cycle since we have to branch anyway. */
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 8(rSTR1)
- ld rWORD2, 8(rSTR2)
-#endif
+ LD rWORD1, rOFF8, rSTR1
+ LD rWORD2, rOFF8, rSTR2
cmpld cr7, rWORD1, rWORD2
b L(dP1e)
/* Remainder is 16 */
@@ -123,15 +153,8 @@ L(dPs2):
cmpld cr6, rWORD5, rWORD6
blt cr7, L(dP2x)
/* Do something useful in this cycle since we have to branch anyway. */
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD7, 0, rSTR1
- ldbrx rWORD8, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD7, 8(rSTR1)
- ld rWORD8, 8(rSTR2)
-#endif
+ LD rWORD7, rOFF8, rSTR1
+ LD rWORD8, rOFF8, rSTR2
cmpld cr5, rWORD7, rWORD8
b L(dP2e)
/* Remainder is 24 */
@@ -173,72 +196,43 @@ L(dP1):
change any on the early exit path. The key here is the non-early
exit path only cares about the condition code (cr5), not about which
register pair was used. */
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD5, 0, rSTR1
- ldbrx rWORD6, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD5, 0(rSTR1)
- ld rWORD6, 0(rSTR2)
-#endif
+ LD rWORD5, 0, rSTR1
+ LD rWORD6, 0, rSTR2
cmpld cr5, rWORD5, rWORD6
blt cr7, L(dP1x)
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 8(rSTR1)
- ld rWORD2, 8(rSTR2)
-#endif
+ LD rWORD1, rOFF8, rSTR1
+ LD rWORD2, rOFF8, rSTR2
cmpld cr7, rWORD1, rWORD2
L(dP1e):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD3, 0, rSTR1
- ldbrx rWORD4, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD3, 16(rSTR1)
- ld rWORD4, 16(rSTR2)
-#endif
+ LD rWORD3, rOFF16, rSTR1
+ LD rWORD4, rOFF16, rSTR2
cmpld cr1, rWORD3, rWORD4
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD5, 0, rSTR1
- ldbrx rWORD6, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD5, 24(rSTR1)
- ld rWORD6, 24(rSTR2)
-#endif
+ LD rWORD5, rOFF24, rSTR1
+ LD rWORD6, rOFF24, rSTR2
cmpld cr6, rWORD5, rWORD6
bne cr5, L(dLcr5x)
bne cr7, L(dLcr7x)
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD7, 0, rSTR1
- ldbrx rWORD8, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ldu rWORD7, 32(rSTR1)
- ldu rWORD8, 32(rSTR2)
-#endif
+ LD rWORD7, rOFF32, rSTR1
+ LD rWORD8, rOFF32, rSTR2
+ addi rSTR1, rSTR1, 32
+ addi rSTR2, rSTR2, 32
bne cr1, L(dLcr1)
cmpld cr5, rWORD7, rWORD8
bdnz L(dLoop)
bne cr6, L(dLcr6)
- ld rWORD8, -8(r1)
- ld rWORD7, -16(r1)
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
.align 3
L(dP1x):
sldi. r12, rN, 3
bne cr5, L(dLcr5x)
subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
bne L(d00)
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
li rRTN, 0
blr
@@ -246,79 +240,41 @@ L(dP1x):
.align 4
L(dP2):
mtctr r0
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD5, 0, rSTR1
- ldbrx rWORD6, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD5, 0(rSTR1)
- ld rWORD6, 0(rSTR2)
-#endif
+ LD rWORD5, 0, rSTR1
+ LD rWORD6, 0, rSTR2
cmpld cr6, rWORD5, rWORD6
blt cr7, L(dP2x)
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD7, 0, rSTR1
- ldbrx rWORD8, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD7, 8(rSTR1)
- ld rWORD8, 8(rSTR2)
-#endif
+ LD rWORD7, rOFF8, rSTR1
+ LD rWORD8, rOFF8, rSTR2
cmpld cr5, rWORD7, rWORD8
L(dP2e):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 16(rSTR1)
- ld rWORD2, 16(rSTR2)
-#endif
+ LD rWORD1, rOFF16, rSTR1
+ LD rWORD2, rOFF16, rSTR2
cmpld cr7, rWORD1, rWORD2
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD3, 0, rSTR1
- ldbrx rWORD4, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD3, 24(rSTR1)
- ld rWORD4, 24(rSTR2)
-#endif
+ LD rWORD3, rOFF24, rSTR1
+ LD rWORD4, rOFF24, rSTR2
cmpld cr1, rWORD3, rWORD4
-#ifndef __LITTLE_ENDIAN__
addi rSTR1, rSTR1, 8
addi rSTR2, rSTR2, 8
-#endif
bne cr6, L(dLcr6)
bne cr5, L(dLcr5)
b L(dLoop2)
-/* Again we are on a early exit path (16-23 byte compare), we want to
- only use volatile registers and avoid restoring non-volatile
- registers. */
.align 4
L(dP2x):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD3, 0, rSTR1
- ldbrx rWORD4, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD3, 8(rSTR1)
- ld rWORD4, 8(rSTR2)
-#endif
+ LD rWORD3, rOFF8, rSTR1
+ LD rWORD4, rOFF8, rSTR2
cmpld cr1, rWORD3, rWORD4
sldi. r12, rN, 3
bne cr6, L(dLcr6x)
-#ifndef __LITTLE_ENDIAN__
addi rSTR1, rSTR1, 8
addi rSTR2, rSTR2, 8
-#endif
bne cr1, L(dLcr1x)
subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
bne L(d00)
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
li rRTN, 0
blr
@@ -326,52 +282,22 @@ L(dP2x):
.align 4
L(dP3):
mtctr r0
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD3, 0, rSTR1
- ldbrx rWORD4, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD3, 0(rSTR1)
- ld rWORD4, 0(rSTR2)
-#endif
+ LD rWORD3, 0, rSTR1
+ LD rWORD4, 0, rSTR2
cmpld cr1, rWORD3, rWORD4
L(dP3e):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD5, 0, rSTR1
- ldbrx rWORD6, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD5, 8(rSTR1)
- ld rWORD6, 8(rSTR2)
-#endif
+ LD rWORD5, rOFF8, rSTR1
+ LD rWORD6, rOFF8, rSTR2
cmpld cr6, rWORD5, rWORD6
blt cr7, L(dP3x)
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD7, 0, rSTR1
- ldbrx rWORD8, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD7, 16(rSTR1)
- ld rWORD8, 16(rSTR2)
-#endif
+ LD rWORD7, rOFF16, rSTR1
+ LD rWORD8, rOFF16, rSTR2
cmpld cr5, rWORD7, rWORD8
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 24(rSTR1)
- ld rWORD2, 24(rSTR2)
-#endif
+ LD rWORD1, rOFF24, rSTR1
+ LD rWORD2, rOFF24, rSTR2
cmpld cr7, rWORD1, rWORD2
-#ifndef __LITTLE_ENDIAN__
addi rSTR1, rSTR1, 16
addi rSTR2, rSTR2, 16
-#endif
bne cr1, L(dLcr1)
bne cr6, L(dLcr6)
b L(dLoop1)
@@ -380,26 +306,21 @@ L(dP3e):
registers. */
.align 4
L(dP3x):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 16(rSTR1)
- ld rWORD2, 16(rSTR2)
-#endif
+ LD rWORD1, rOFF16, rSTR1
+ LD rWORD2, rOFF16, rSTR2
cmpld cr7, rWORD1, rWORD2
sldi. r12, rN, 3
bne cr1, L(dLcr1x)
-#ifndef __LITTLE_ENDIAN__
addi rSTR1, rSTR1, 16
addi rSTR2, rSTR2, 16
-#endif
bne cr6, L(dLcr6x)
subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
bne cr7, L(dLcr7x)
bne L(d00)
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
li rRTN, 0
blr
@@ -407,46 +328,20 @@ L(dP3x):
.align 4
L(dP4):
mtctr r0
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 0(rSTR1)
- ld rWORD2, 0(rSTR2)
-#endif
+ LD rWORD1, 0, rSTR1
+ LD rWORD2, 0, rSTR2
cmpld cr7, rWORD1, rWORD2
L(dP4e):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD3, 0, rSTR1
- ldbrx rWORD4, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD3, 8(rSTR1)
- ld rWORD4, 8(rSTR2)
-#endif
+ LD rWORD3, rOFF8, rSTR1
+ LD rWORD4, rOFF8, rSTR2
cmpld cr1, rWORD3, rWORD4
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD5, 0, rSTR1
- ldbrx rWORD6, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD5, 16(rSTR1)
- ld rWORD6, 16(rSTR2)
-#endif
+ LD rWORD5, rOFF16, rSTR1
+ LD rWORD6, rOFF16, rSTR2
cmpld cr6, rWORD5, rWORD6
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD7, 0, rSTR1
- ldbrx rWORD8, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ldu rWORD7, 24(rSTR1)
- ldu rWORD8, 24(rSTR2)
-#endif
+ LD rWORD7, rOFF24, rSTR1
+ LD rWORD8, rOFF24, rSTR2
+ addi rSTR1, rSTR1, 24
+ addi rSTR2, rSTR2, 24
cmpld cr5, rWORD7, rWORD8
bne cr7, L(dLcr7)
bne cr1, L(dLcr1)
@@ -454,51 +349,25 @@ L(dP4e):
/* This is the primary loop */
.align 4
L(dLoop):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 8(rSTR1)
- ld rWORD2, 8(rSTR2)
-#endif
+ LD rWORD1, rOFF8, rSTR1
+ LD rWORD2, rOFF8, rSTR2
cmpld cr1, rWORD3, rWORD4
bne cr6, L(dLcr6)
L(dLoop1):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD3, 0, rSTR1
- ldbrx rWORD4, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD3, 16(rSTR1)
- ld rWORD4, 16(rSTR2)
-#endif
+ LD rWORD3, rOFF16, rSTR1
+ LD rWORD4, rOFF16, rSTR2
cmpld cr6, rWORD5, rWORD6
bne cr5, L(dLcr5)
L(dLoop2):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD5, 0, rSTR1
- ldbrx rWORD6, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD5, 24(rSTR1)
- ld rWORD6, 24(rSTR2)
-#endif
+ LD rWORD5, rOFF24, rSTR1
+ LD rWORD6, rOFF24, rSTR2
cmpld cr5, rWORD7, rWORD8
bne cr7, L(dLcr7)
L(dLoop3):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD7, 0, rSTR1
- ldbrx rWORD8, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ldu rWORD7, 32(rSTR1)
- ldu rWORD8, 32(rSTR2)
-#endif
+ LD rWORD7, rOFF32, rSTR1
+ LD rWORD8, rOFF32, rSTR2
+ addi rSTR1, rSTR1, 32
+ addi rSTR2, rSTR2, 32
bne cr1, L(dLcr1)
cmpld cr7, rWORD1, rWORD2
bdnz L(dLoop)
@@ -519,62 +388,75 @@ L(d14):
sldi. r12, rN, 3
bne cr5, L(dLcr5)
L(d04):
- ld rWORD8, -8(r1)
- ld rWORD7, -16(r1)
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
- beq L(zeroLength)
+ beq L(duzeroLength)
/* At this point we have a remainder of 1 to 7 bytes to compare. Since
we are aligned it is safe to load the whole double word, and use
shift right double to eliminate bits beyond the compare length. */
L(d00):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 8(rSTR1)
- ld rWORD2, 8(rSTR2)
-#endif
+ LD rWORD1, rOFF8, rSTR1
+ LD rWORD2, rOFF8, rSTR2
srd rWORD1, rWORD1, rN
srd rWORD2, rWORD2, rN
cmpld cr7, rWORD1, rWORD2
bne cr7, L(dLcr7x)
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
li rRTN, 0
blr
.align 4
L(dLcr7):
- ld rWORD8, -8(r1)
- ld rWORD7, -16(r1)
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
L(dLcr7x):
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
li rRTN, 1
bgtlr cr7
li rRTN, -1
blr
.align 4
L(dLcr1):
- ld rWORD8, -8(r1)
- ld rWORD7, -16(r1)
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
L(dLcr1x):
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
li rRTN, 1
bgtlr cr1
li rRTN, -1
blr
.align 4
L(dLcr6):
- ld rWORD8, -8(r1)
- ld rWORD7, -16(r1)
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
L(dLcr6x):
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
li rRTN, 1
bgtlr cr6
li rRTN, -1
blr
.align 4
L(dLcr5):
- ld rWORD8, -8(r1)
- ld rWORD7, -16(r1)
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
L(dLcr5x):
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
li rRTN, 1
bgtlr cr5
li rRTN, -1
@@ -583,10 +465,6 @@ L(dLcr5x):
.align 4
L(bytealigned):
mtctr rN
-#if 0
-/* Huh? We've already branched on cr6! */
- beq cr6, L(zeroLength)
-#endif
/* We need to prime this loop. This loop is swing modulo scheduled
to avoid pipe delays. The dependent instruction latencies (load to
@@ -685,6 +563,7 @@ L(b11):
L(bx12):
sub rRTN, rWORD1, rWORD2
blr
+
.align 4
L(zeroLength):
li rRTN, 0
@@ -705,42 +584,36 @@ L(zeroLength):
we need to adjust the length (rN) and special case the loop
versioning for the first DW. This ensures that the loop count is
correct and the first DW (shifted) is in the expected register pair. */
-#define rSHL r29 /* Unaligned shift left count. */
-#define rSHR r28 /* Unaligned shift right count. */
-#define rWORD8_SHIFT r27 /* Left rotation temp for rWORD2. */
-#define rWORD2_SHIFT r26 /* Left rotation temp for rWORD4. */
-#define rWORD4_SHIFT r25 /* Left rotation temp for rWORD6. */
-#define rWORD6_SHIFT r24 /* Left rotation temp for rWORD8. */
L(unaligned):
- std rSHL, -24(r1)
- cfi_offset(rSHL, -24)
+ std rSHL, rSHLSAVE(r1)
+ cfi_offset(rSHL, rSHLSAVE)
clrldi rSHL, rSTR2, 61
beq cr6, L(duzeroLength)
- std rSHR, -32(r1)
- cfi_offset(rSHR, -32)
+ std rSHR, rSHRSAVE(r1)
+ cfi_offset(rSHR, rSHRSAVE)
beq cr5, L(DWunaligned)
- std rWORD8_SHIFT, -40(r1)
- cfi_offset(rWORD8_SHIFT, -40)
+ std rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
+ cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
/* Adjust the logical start of rSTR2 to compensate for the extra bits
in the 1st rSTR1 DW. */
sub rWORD8_SHIFT, rSTR2, r12
/* But do not attempt to address the DW before that DW that contains
the actual start of rSTR2. */
clrrdi rSTR2, rSTR2, 3
- std rWORD2_SHIFT, -48(r1)
- cfi_offset(rWORD2_SHIFT, -48)
+ std rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
+ cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
/* Compute the left/right shift counts for the unaligned rSTR2,
compensating for the logical (DW aligned) start of rSTR1. */
clrldi rSHL, rWORD8_SHIFT, 61
clrrdi rSTR1, rSTR1, 3
- std rWORD4_SHIFT, -56(r1)
- cfi_offset(rWORD4_SHIFT, -56)
+ std rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
+ cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
sldi rSHL, rSHL, 3
cmpld cr5, rWORD8_SHIFT, rSTR2
add rN, rN, r12
sldi rWORD6, r12, 3
- std rWORD6_SHIFT, -64(r1)
- cfi_offset(rWORD6_SHIFT, -64)
+ std rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
+ cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
subfic rSHR, rSHL, 64
srdi r0, rN, 5 /* Divide by 32 */
andi. r12, rN, 24 /* Get the DW remainder */
@@ -750,25 +623,13 @@ L(unaligned):
this may cross a page boundary and cause a page fault. */
li rWORD8, 0
blt cr5, L(dus0)
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD8, 0, rSTR2
+ LD rWORD8, 0, rSTR2
addi rSTR2, rSTR2, 8
-#else
- ld rWORD8, 0(rSTR2)
- addi rSTR2, rSTR2, 8
-#endif
sld rWORD8, rWORD8, rSHL
L(dus0):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 0(rSTR1)
- ld rWORD2, 0(rSTR2)
-#endif
+ LD rWORD1, 0, rSTR1
+ LD rWORD2, 0, rSTR2
cmpldi cr1, r12, 16
cmpldi cr7, rN, 32
srd r12, rWORD2, rSHR
@@ -796,12 +657,7 @@ L(dusP1):
beq L(duZeroReturn)
li r0, 0
ble cr7, L(dutrim)
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD2, 0, rSTR2
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD2, 8(rSTR2)
-#endif
+ LD rWORD2, rOFF8, rSTR2
srd r0, rWORD2, rSHR
b L(dutrim)
/* Remainder is 16 */
@@ -832,27 +688,21 @@ L(duPs4):
compare length is at least 8 bytes. */
.align 4
L(DWunaligned):
- std rWORD8_SHIFT, -40(r1)
- cfi_offset(rWORD8_SHIFT, -40)
+ std rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
+ cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
clrrdi rSTR2, rSTR2, 3
- std rWORD2_SHIFT, -48(r1)
- cfi_offset(rWORD2_SHIFT, -48)
+ std rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
+ cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
srdi r0, rN, 5 /* Divide by 32 */
- std rWORD4_SHIFT, -56(r1)
- cfi_offset(rWORD4_SHIFT, -56)
+ std rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
+ cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
andi. r12, rN, 24 /* Get the DW remainder */
- std rWORD6_SHIFT, -64(r1)
- cfi_offset(rWORD6_SHIFT, -64)
+ std rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
+ cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
sldi rSHL, rSHL, 3
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD6, 0, rSTR2
+ LD rWORD6, 0, rSTR2
+ LD rWORD8, rOFF8, rSTR2
addi rSTR2, rSTR2, 8
- ldbrx rWORD8, 0, rSTR2
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD6, 0(rSTR2)
- ldu rWORD8, 8(rSTR2)
-#endif
cmpldi cr1, r12, 16
cmpldi cr7, rN, 32
clrldi rN, rN, 61
@@ -867,52 +717,26 @@ L(DWunaligned):
.align 4
L(duP1):
srd r12, rWORD8, rSHR
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD7, 0, rSTR1
- addi rSTR1, rSTR1, 8
-#else
- ld rWORD7, 0(rSTR1)
-#endif
+ LD rWORD7, 0, rSTR1
sld rWORD8_SHIFT, rWORD8, rSHL
or rWORD8, r12, rWORD6_SHIFT
blt cr7, L(duP1x)
L(duP1e):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 8(rSTR1)
- ld rWORD2, 8(rSTR2)
-#endif
+ LD rWORD1, rOFF8, rSTR1
+ LD rWORD2, rOFF8, rSTR2
cmpld cr5, rWORD7, rWORD8
srd r0, rWORD2, rSHR
sld rWORD2_SHIFT, rWORD2, rSHL
or rWORD2, r0, rWORD8_SHIFT
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD3, 0, rSTR1
- ldbrx rWORD4, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD3, 16(rSTR1)
- ld rWORD4, 16(rSTR2)
-#endif
+ LD rWORD3, rOFF16, rSTR1
+ LD rWORD4, rOFF16, rSTR2
cmpld cr7, rWORD1, rWORD2
srd r12, rWORD4, rSHR
sld rWORD4_SHIFT, rWORD4, rSHL
bne cr5, L(duLcr5)
or rWORD4, r12, rWORD2_SHIFT
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD5, 0, rSTR1
- ldbrx rWORD6, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD5, 24(rSTR1)
- ld rWORD6, 24(rSTR2)
-#endif
+ LD rWORD5, rOFF24, rSTR1
+ LD rWORD6, rOFF24, rSTR2
cmpld cr1, rWORD3, rWORD4
srd r0, rWORD6, rSHR
sld rWORD6_SHIFT, rWORD6, rSHL
@@ -932,82 +756,47 @@ L(duP1x):
beq L(duZeroReturn)
li r0, 0
ble cr7, L(dutrim)
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD2, 0, rSTR2
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD2, 8(rSTR2)
-#endif
+ LD rWORD2, rOFF8, rSTR2
srd r0, rWORD2, rSHR
b L(dutrim)
/* Remainder is 16 */
.align 4
L(duP2):
srd r0, rWORD8, rSHR
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD5, 0, rSTR1
- addi rSTR1, rSTR1, 8
-#else
- ld rWORD5, 0(rSTR1)
-#endif
+ LD rWORD5, 0, rSTR1
or rWORD6, r0, rWORD6_SHIFT
sld rWORD6_SHIFT, rWORD8, rSHL
L(duP2e):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD7, 0, rSTR1
- ldbrx rWORD8, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD7, 8(rSTR1)
- ld rWORD8, 8(rSTR2)
-#endif
+ LD rWORD7, rOFF8, rSTR1
+ LD rWORD8, rOFF8, rSTR2
cmpld cr6, rWORD5, rWORD6
srd r12, rWORD8, rSHR
sld rWORD8_SHIFT, rWORD8, rSHL
or rWORD8, r12, rWORD6_SHIFT
blt cr7, L(duP2x)
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 16(rSTR1)
- ld rWORD2, 16(rSTR2)
-#endif
+ LD rWORD1, rOFF16, rSTR1
+ LD rWORD2, rOFF16, rSTR2
cmpld cr5, rWORD7, rWORD8
bne cr6, L(duLcr6)
srd r0, rWORD2, rSHR
sld rWORD2_SHIFT, rWORD2, rSHL
or rWORD2, r0, rWORD8_SHIFT
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD3, 0, rSTR1
- ldbrx rWORD4, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD3, 24(rSTR1)
- ld rWORD4, 24(rSTR2)
-#endif
+ LD rWORD3, rOFF24, rSTR1
+ LD rWORD4, rOFF24, rSTR2
cmpld cr7, rWORD1, rWORD2
bne cr5, L(duLcr5)
srd r12, rWORD4, rSHR
sld rWORD4_SHIFT, rWORD4, rSHL
or rWORD4, r12, rWORD2_SHIFT
-#ifndef __LITTLE_ENDIAN__
addi rSTR1, rSTR1, 8
addi rSTR2, rSTR2, 8
-#endif
cmpld cr1, rWORD3, rWORD4
b L(duLoop2)
.align 4
L(duP2x):
cmpld cr5, rWORD7, rWORD8
-#ifndef __LITTLE_ENDIAN__
addi rSTR1, rSTR1, 8
addi rSTR2, rSTR2, 8
-#endif
bne cr6, L(duLcr6)
sldi. rN, rN, 3
bne cr5, L(duLcr5)
@@ -1015,12 +804,7 @@ L(duP2x):
beq L(duZeroReturn)
li r0, 0
ble cr7, L(dutrim)
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD2, 0, rSTR2
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD2, 8(rSTR2)
-#endif
+ LD rWORD2, rOFF8, rSTR2
srd r0, rWORD2, rSHR
b L(dutrim)
@@ -1028,73 +812,39 @@ L(duP2x):
.align 4
L(duP3):
srd r12, rWORD8, rSHR
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD3, 0, rSTR1
- addi rSTR1, rSTR1, 8
-#else
- ld rWORD3, 0(rSTR1)
-#endif
+ LD rWORD3, 0, rSTR1
sld rWORD4_SHIFT, rWORD8, rSHL
or rWORD4, r12, rWORD6_SHIFT
L(duP3e):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD5, 0, rSTR1
- ldbrx rWORD6, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD5, 8(rSTR1)
- ld rWORD6, 8(rSTR2)
-#endif
+ LD rWORD5, rOFF8, rSTR1
+ LD rWORD6, rOFF8, rSTR2
cmpld cr1, rWORD3, rWORD4
srd r0, rWORD6, rSHR
sld rWORD6_SHIFT, rWORD6, rSHL
or rWORD6, r0, rWORD4_SHIFT
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD7, 0, rSTR1
- ldbrx rWORD8, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD7, 16(rSTR1)
- ld rWORD8, 16(rSTR2)
-#endif
+ LD rWORD7, rOFF16, rSTR1
+ LD rWORD8, rOFF16, rSTR2
cmpld cr6, rWORD5, rWORD6
bne cr1, L(duLcr1)
srd r12, rWORD8, rSHR
sld rWORD8_SHIFT, rWORD8, rSHL
or rWORD8, r12, rWORD6_SHIFT
blt cr7, L(duP3x)
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 24(rSTR1)
- ld rWORD2, 24(rSTR2)
-#endif
+ LD rWORD1, rOFF24, rSTR1
+ LD rWORD2, rOFF24, rSTR2
cmpld cr5, rWORD7, rWORD8
bne cr6, L(duLcr6)
srd r0, rWORD2, rSHR
sld rWORD2_SHIFT, rWORD2, rSHL
or rWORD2, r0, rWORD8_SHIFT
-#ifndef __LITTLE_ENDIAN__
addi rSTR1, rSTR1, 16
addi rSTR2, rSTR2, 16
-#endif
cmpld cr7, rWORD1, rWORD2
b L(duLoop1)
.align 4
L(duP3x):
-#ifndef __LITTLE_ENDIAN__
addi rSTR1, rSTR1, 16
addi rSTR2, rSTR2, 16
-#endif
-#if 0
-/* Huh? We've already branched on cr1! */
- bne cr1, L(duLcr1)
-#endif
cmpld cr5, rWORD7, rWORD8
bne cr6, L(duLcr6)
sldi. rN, rN, 3
@@ -1103,12 +853,7 @@ L(duP3x):
beq L(duZeroReturn)
li r0, 0
ble cr7, L(dutrim)
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD2, 0, rSTR2
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD2, 8(rSTR2)
-#endif
+ LD rWORD2, rOFF8, rSTR2
srd r0, rWORD2, rSHR
b L(dutrim)
@@ -1117,51 +862,27 @@ L(duP3x):
L(duP4):
mtctr r0
srd r0, rWORD8, rSHR
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- addi rSTR1, rSTR1, 8
-#else
- ld rWORD1, 0(rSTR1)
-#endif
+ LD rWORD1, 0, rSTR1
sld rWORD2_SHIFT, rWORD8, rSHL
or rWORD2, r0, rWORD6_SHIFT
L(duP4e):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD3, 0, rSTR1
- ldbrx rWORD4, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD3, 8(rSTR1)
- ld rWORD4, 8(rSTR2)
-#endif
+ LD rWORD3, rOFF8, rSTR1
+ LD rWORD4, rOFF8, rSTR2
cmpld cr7, rWORD1, rWORD2
srd r12, rWORD4, rSHR
sld rWORD4_SHIFT, rWORD4, rSHL
or rWORD4, r12, rWORD2_SHIFT
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD5, 0, rSTR1
- ldbrx rWORD6, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD5, 16(rSTR1)
- ld rWORD6, 16(rSTR2)
-#endif
+ LD rWORD5, rOFF16, rSTR1
+ LD rWORD6, rOFF16, rSTR2
cmpld cr1, rWORD3, rWORD4
bne cr7, L(duLcr7)
srd r0, rWORD6, rSHR
sld rWORD6_SHIFT, rWORD6, rSHL
or rWORD6, r0, rWORD4_SHIFT
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD7, 0, rSTR1
- ldbrx rWORD8, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ldu rWORD7, 24(rSTR1)
- ldu rWORD8, 24(rSTR2)
-#endif
+ LD rWORD7, rOFF24, rSTR1
+ LD rWORD8, rOFF24, rSTR2
+ addi rSTR1, rSTR1, 24
+ addi rSTR2, rSTR2, 24
cmpld cr6, rWORD5, rWORD6
bne cr1, L(duLcr1)
srd r12, rWORD8, rSHR
@@ -1172,60 +893,34 @@ L(duP4e):
/* This is the primary loop */
.align 4
L(duLoop):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 8(rSTR1)
- ld rWORD2, 8(rSTR2)
-#endif
+ LD rWORD1, rOFF8, rSTR1
+ LD rWORD2, rOFF8, rSTR2
cmpld cr1, rWORD3, rWORD4
bne cr6, L(duLcr6)
srd r0, rWORD2, rSHR
sld rWORD2_SHIFT, rWORD2, rSHL
or rWORD2, r0, rWORD8_SHIFT
L(duLoop1):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD3, 0, rSTR1
- ldbrx rWORD4, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD3, 16(rSTR1)
- ld rWORD4, 16(rSTR2)
-#endif
+ LD rWORD3, rOFF16, rSTR1
+ LD rWORD4, rOFF16, rSTR2
cmpld cr6, rWORD5, rWORD6
bne cr5, L(duLcr5)
srd r12, rWORD4, rSHR
sld rWORD4_SHIFT, rWORD4, rSHL
or rWORD4, r12, rWORD2_SHIFT
L(duLoop2):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD5, 0, rSTR1
- ldbrx rWORD6, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD5, 24(rSTR1)
- ld rWORD6, 24(rSTR2)
-#endif
+ LD rWORD5, rOFF24, rSTR1
+ LD rWORD6, rOFF24, rSTR2
cmpld cr5, rWORD7, rWORD8
bne cr7, L(duLcr7)
srd r0, rWORD6, rSHR
sld rWORD6_SHIFT, rWORD6, rSHL
or rWORD6, r0, rWORD4_SHIFT
L(duLoop3):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD7, 0, rSTR1
- ldbrx rWORD8, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ldu rWORD7, 32(rSTR1)
- ldu rWORD8, 32(rSTR2)
-#endif
+ LD rWORD7, rOFF32, rSTR1
+ LD rWORD8, rOFF32, rSTR2
+ addi rSTR1, rSTR1, 32
+ addi rSTR2, rSTR2, 32
cmpld cr7, rWORD1, rWORD2
bne cr1, L(duLcr1)
srd r12, rWORD8, rSHR
@@ -1234,10 +929,6 @@ L(duLoop3):
bdnz L(duLoop)
L(duL4):
-#if 0
-/* Huh? We've already branched on cr1! */
- bne cr1, L(duLcr1)
-#endif
cmpld cr1, rWORD3, rWORD4
bne cr6, L(duLcr6)
cmpld cr6, rWORD5, rWORD6
@@ -1264,99 +955,102 @@ L(du14):
beq L(duZeroReturn)
li r0, 0
ble cr7, L(dutrim)
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD2, 0, rSTR2
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD2, 8(rSTR2)
-#endif
+ LD rWORD2, rOFF8, rSTR2
srd r0, rWORD2, rSHR
.align 4
L(dutrim):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
-#else
- ld rWORD1, 8(rSTR1)
-#endif
+ LD rWORD1, rOFF8, rSTR1
ld rWORD8, -8(r1)
subfic rN, rN, 64 /* Shift count is 64 - (rN * 8). */
or rWORD2, r0, rWORD8_SHIFT
- ld rWORD7, -16(r1)
- ld rSHL, -24(r1)
+ ld rWORD7, rWORD7SAVE(r1)
+ ld rSHL, rSHLSAVE(r1)
srd rWORD1, rWORD1, rN
srd rWORD2, rWORD2, rN
- ld rSHR, -32(r1)
- ld rWORD8_SHIFT, -40(r1)
+ ld rSHR, rSHRSAVE(r1)
+ ld rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
li rRTN, 0
cmpld cr7, rWORD1, rWORD2
- ld rWORD2_SHIFT, -48(r1)
- ld rWORD4_SHIFT, -56(r1)
+ ld rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
+ ld rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
beq cr7, L(dureturn24)
li rRTN, 1
- ld rWORD6_SHIFT, -64(r1)
+ ld rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
bgtlr cr7
li rRTN, -1
blr
.align 4
L(duLcr7):
- ld rWORD8, -8(r1)
- ld rWORD7, -16(r1)
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
li rRTN, 1
bgt cr7, L(dureturn29)
- ld rSHL, -24(r1)
- ld rSHR, -32(r1)
+ ld rSHL, rSHLSAVE(r1)
+ ld rSHR, rSHRSAVE(r1)
li rRTN, -1
b L(dureturn27)
.align 4
L(duLcr1):
- ld rWORD8, -8(r1)
- ld rWORD7, -16(r1)
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
li rRTN, 1
bgt cr1, L(dureturn29)
- ld rSHL, -24(r1)
- ld rSHR, -32(r1)
+ ld rSHL, rSHLSAVE(r1)
+ ld rSHR, rSHRSAVE(r1)
li rRTN, -1
b L(dureturn27)
.align 4
L(duLcr6):
- ld rWORD8, -8(r1)
- ld rWORD7, -16(r1)
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
li rRTN, 1
bgt cr6, L(dureturn29)
- ld rSHL, -24(r1)
- ld rSHR, -32(r1)
+ ld rSHL, rSHLSAVE(r1)
+ ld rSHR, rSHRSAVE(r1)
li rRTN, -1
b L(dureturn27)
.align 4
L(duLcr5):
- ld rWORD8, -8(r1)
- ld rWORD7, -16(r1)
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
li rRTN, 1
bgt cr5, L(dureturn29)
- ld rSHL, -24(r1)
- ld rSHR, -32(r1)
+ ld rSHL, rSHLSAVE(r1)
+ ld rSHR, rSHRSAVE(r1)
li rRTN, -1
b L(dureturn27)
+
.align 3
L(duZeroReturn):
li rRTN, 0
.align 4
L(dureturn):
- ld rWORD8, -8(r1)
- ld rWORD7, -16(r1)
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
L(dureturn29):
- ld rSHL, -24(r1)
- ld rSHR, -32(r1)
+ ld rSHL, rSHLSAVE(r1)
+ ld rSHR, rSHRSAVE(r1)
L(dureturn27):
- ld rWORD8_SHIFT, -40(r1)
-L(dureturn26):
- ld rWORD2_SHIFT, -48(r1)
-L(dureturn25):
- ld rWORD4_SHIFT, -56(r1)
+ ld rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
+ ld rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
+ ld rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
L(dureturn24):
- ld rWORD6_SHIFT, -64(r1)
+ ld rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
blr
+
L(duzeroLength):
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
li rRTN, 0
blr
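
[Note: the LD macro introduced above selects ldbrx on little-endian and ldx on big-endian, so every 8-byte chunk is loaded with its lowest-addressed byte in the most significant position; one unsigned doubleword compare then orders the buffers exactly as a byte-by-byte memcmp would. A hedged C sketch of the same idea, with __builtin_bswap64 standing in for ldbrx; illustrative, not glibc code:]

    #include <stdint.h>
    #include <string.h>

    /* Compare one 8-byte chunk the way the assembly does: put the
       lowest-addressed byte into the most significant bits, then
       compare as unsigned 64-bit integers.  */
    static int
    cmp_chunk (const unsigned char *a, const unsigned char *b)
    {
      uint64_t wa, wb;
      memcpy (&wa, a, 8);
      memcpy (&wb, b, 8);
    #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
      wa = __builtin_bswap64 (wa);   /* What ldbrx does in hardware.  */
      wb = __builtin_bswap64 (wb);
    #endif
      return wa < wb ? -1 : wa > wb ? 1 : 0;
    }
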
diff --git a/sysdeps/powerpc/powerpc64/power7/memcpy.S b/sysdeps/powerpc/powerpc64/power7/memcpy.S
index bbfd381b1b..8c8834e2a2 100644
--- a/sysdeps/powerpc/powerpc64/power7/memcpy.S
+++ b/sysdeps/powerpc/powerpc64/power7/memcpy.S
@@ -1,5 +1,5 @@
/* Optimized memcpy implementation for PowerPC64/POWER7.
- Copyright (C) 2010-2014 Free Software Foundation, Inc.
+ Copyright (C) 2010-2015 Free Software Foundation, Inc.
Contributed by Luis Machado <luisgpm@br.ibm.com>.
This file is part of the GNU C Library.
@@ -36,16 +36,11 @@ EALIGN (memcpy, 5, 0)
ble cr1, L(copy_LT_32) /* If move < 32 bytes use short move
code. */
-#ifdef __LITTLE_ENDIAN__
-/* In little-endian mode, power7 takes an alignment trap on any lxvd2x
- or stxvd2x crossing a 32-byte boundary, so ensure the aligned_copy
- loop is only used for quadword aligned copies. */
+/* Align copies that use VSX instructions to quadwords.  This avoids
+   alignment traps when memcpy is used on non-cacheable memory (for
+   instance, memory-mapped I/O).  */
andi. 10,3,15
clrldi 11,4,60
-#else
- andi. 10,3,7 /* Check alignment of DST. */
- clrldi 11,4,61 /* Check alignment of SRC. */
-#endif
cmpld cr6,10,11 /* SRC and DST alignments match? */
mr dst,3
@@ -53,13 +48,9 @@ EALIGN (memcpy, 5, 0)
beq L(aligned_copy)
mtocrf 0x01,0
-#ifdef __LITTLE_ENDIAN__
clrldi 0,0,60
-#else
- clrldi 0,0,61
-#endif
-/* Get the DST and SRC aligned to 8 bytes (16 for little-endian). */
+/* Get the DST and SRC aligned to 16 bytes. */
1:
bf 31,2f
lbz 6,0(src)
@@ -79,14 +70,12 @@ EALIGN (memcpy, 5, 0)
stw 6,0(dst)
addi dst,dst,4
8:
-#ifdef __LITTLE_ENDIAN__
bf 28,16f
ld 6,0(src)
addi src,src,8
std 6,0(dst)
addi dst,dst,8
16:
-#endif
subf cnt,0,cnt
/* Main aligned copy loop. Copies 128 bytes at a time. */
@@ -298,9 +287,6 @@ L(copy_LE_8):
.align 4
L(copy_GE_32_unaligned):
clrldi 0,0,60 /* Number of bytes until the 1st dst quadword. */
-#ifndef __LITTLE_ENDIAN__
- andi. 10,3,15 /* Check alignment of DST (against quadwords). */
-#endif
srdi 9,cnt,4 /* Number of full quadwords remaining. */
beq L(copy_GE_32_unaligned_cont)
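
[Note: the memcpy change above drops the endianness conditionals and always aligns both pointers to 16 bytes before the lxvd2x/stxvd2x loop, so the VSX accesses never trap on non-cacheable mappings. A sketch of the alignment test the prologue performs, mirroring "andi. 10,3,15" and "clrldi 11,4,60"; illustrative C under that reading:]

    #include <stdint.h>

    /* The aligned path is taken only when DST and SRC have the same
       address residue modulo 16; after copying 0-15 head bytes, both
       pointers are then quadword aligned.  */
    static inline int
    same_quadword_phase (const void *dst, const void *src)
    {
      return ((uintptr_t) dst & 15) == ((uintptr_t) src & 15);
    }
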
diff --git a/sysdeps/powerpc/powerpc64/power7/memmove.S b/sysdeps/powerpc/powerpc64/power7/memmove.S
new file mode 100644
index 0000000000..3bd4b4bb3f
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/memmove.S
@@ -0,0 +1,831 @@
+/* Optimized memmove implementation for PowerPC64/POWER7.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+
+/* void* [r3] memmove (void *dest [r3], const void *src [r4], size_t len [r5])
+
+ This optimization checks whether 'dest' overlaps with 'src'. If it does
+ not, it performs an optimized forward copy (the POWER7 memcpy, embedded
+ here to save some cycles).
+ If source and destination overlap, an optimized backwards copy is used
+ instead. */
+
+ .machine power7
+EALIGN (memmove, 5, 0)
+ CALL_MCOUNT 3
+
+L(_memmove):
+ subf r9,r4,r3
+ cmpld cr7,r9,r5
+ blt cr7,L(memmove_bwd)
+
+ cmpldi cr1,r5,31
+ neg 0,3
+ ble cr1, L(copy_LT_32) /* If move < 32 bytes use short move
+ code. */
+
+ andi. 10,3,15
+ clrldi 11,4,60
+ cmpld cr6,10,11 /* SRC and DST alignments match? */
+
+ mr r11,3
+ bne cr6,L(copy_GE_32_unaligned)
+ beq L(aligned_copy)
+
+ mtocrf 0x01,0
+ clrldi 0,0,60
+
+/* Get the DST and SRC aligned to 16 bytes. */
+1:
+ bf 31,2f
+ lbz 6,0(r4)
+ addi r4,r4,1
+ stb 6,0(r11)
+ addi r11,r11,1
+2:
+ bf 30,4f
+ lhz 6,0(r4)
+ addi r4,r4,2
+ sth 6,0(r11)
+ addi r11,r11,2
+4:
+ bf 29,8f
+ lwz 6,0(r4)
+ addi r4,r4,4
+ stw 6,0(r11)
+ addi r11,r11,4
+8:
+ bf 28,16f
+ ld 6,0(r4)
+ addi r4,r4,8
+ std 6,0(r11)
+ addi r11,r11,8
+16:
+ subf r5,0,r5
+
+/* Main aligned copy loop. Copies 128 bytes at a time. */
+L(aligned_copy):
+ li 6,16
+ li 7,32
+ li 8,48
+ mtocrf 0x02,r5
+ srdi 12,r5,7
+ cmpdi 12,0
+ beq L(aligned_tail)
+ lxvd2x 6,0,r4
+ lxvd2x 7,r4,6
+ mtctr 12
+ b L(aligned_128loop)
+
+ .align 4
+L(aligned_128head):
+ /* for the 2nd + iteration of this loop. */
+ lxvd2x 6,0,r4
+ lxvd2x 7,r4,6
+L(aligned_128loop):
+ lxvd2x 8,r4,7
+ lxvd2x 9,r4,8
+ stxvd2x 6,0,r11
+ addi r4,r4,64
+ stxvd2x 7,r11,6
+ stxvd2x 8,r11,7
+ stxvd2x 9,r11,8
+ lxvd2x 6,0,r4
+ lxvd2x 7,r4,6
+ addi r11,r11,64
+ lxvd2x 8,r4,7
+ lxvd2x 9,r4,8
+ addi r4,r4,64
+ stxvd2x 6,0,r11
+ stxvd2x 7,r11,6
+ stxvd2x 8,r11,7
+ stxvd2x 9,r11,8
+ addi r11,r11,64
+ bdnz L(aligned_128head)
+
+L(aligned_tail):
+ mtocrf 0x01,r5
+ bf 25,32f
+ lxvd2x 6,0,r4
+ lxvd2x 7,r4,6
+ lxvd2x 8,r4,7
+ lxvd2x 9,r4,8
+ addi r4,r4,64
+ stxvd2x 6,0,r11
+ stxvd2x 7,r11,6
+ stxvd2x 8,r11,7
+ stxvd2x 9,r11,8
+ addi r11,r11,64
+32:
+ bf 26,16f
+ lxvd2x 6,0,r4
+ lxvd2x 7,r4,6
+ addi r4,r4,32
+ stxvd2x 6,0,r11
+ stxvd2x 7,r11,6
+ addi r11,r11,32
+16:
+ bf 27,8f
+ lxvd2x 6,0,r4
+ addi r4,r4,16
+ stxvd2x 6,0,r11
+ addi r11,r11,16
+8:
+ bf 28,4f
+ ld 6,0(r4)
+ addi r4,r4,8
+ std 6,0(r11)
+ addi r11,r11,8
+4: /* Copies 4~7 bytes. */
+ bf 29,L(tail2)
+ lwz 6,0(r4)
+ stw 6,0(r11)
+ bf 30,L(tail5)
+ lhz 7,4(r4)
+ sth 7,4(r11)
+ bflr 31
+ lbz 8,6(r4)
+ stb 8,6(r11)
+ /* Return original DST pointer. */
+ blr
+
+/* Handle copies of 0~31 bytes. */
+ .align 4
+L(copy_LT_32):
+ mr r11,3
+ cmpldi cr6,r5,8
+ mtocrf 0x01,r5
+ ble cr6,L(copy_LE_8)
+
+ /* At least 9 bytes to go. */
+ neg 8,4
+ andi. 0,8,3
+ cmpldi cr1,r5,16
+ beq L(copy_LT_32_aligned)
+
+ /* Force 4-byte alignment for SRC. */
+ mtocrf 0x01,0
+ subf r5,0,r5
+2:
+ bf 30,1f
+ lhz 6,0(r4)
+ addi r4,r4,2
+ sth 6,0(r11)
+ addi r11,r11,2
+1:
+ bf 31,L(end_4bytes_alignment)
+ lbz 6,0(r4)
+ addi r4,r4,1
+ stb 6,0(r11)
+ addi r11,r11,1
+
+ .align 4
+L(end_4bytes_alignment):
+ cmpldi cr1,r5,16
+ mtocrf 0x01,r5
+
+L(copy_LT_32_aligned):
+ /* At least 6 bytes to go, and SRC is word-aligned. */
+ blt cr1,8f
+
+ /* Copy 16 bytes. */
+ lwz 6,0(r4)
+ lwz 7,4(r4)
+ stw 6,0(r11)
+ lwz 8,8(r4)
+ stw 7,4(r11)
+ lwz 6,12(r4)
+ addi r4,r4,16
+ stw 8,8(r11)
+ stw 6,12(r11)
+ addi r11,r11,16
+8: /* Copy 8 bytes. */
+ bf 28,L(tail4)
+ lwz 6,0(r4)
+ lwz 7,4(r4)
+ addi r4,r4,8
+ stw 6,0(r11)
+ stw 7,4(r11)
+ addi r11,r11,8
+
+ .align 4
+/* Copies 4~7 bytes. */
+L(tail4):
+ bf 29,L(tail2)
+ lwz 6,0(r4)
+ stw 6,0(r11)
+ bf 30,L(tail5)
+ lhz 7,4(r4)
+ sth 7,4(r11)
+ bflr 31
+ lbz 8,6(r4)
+ stb 8,6(r11)
+ /* Return original DST pointer. */
+ blr
+
+ .align 4
+/* Copies 2~3 bytes. */
+L(tail2):
+ bf 30,1f
+ lhz 6,0(r4)
+ sth 6,0(r11)
+ bflr 31
+ lbz 7,2(r4)
+ stb 7,2(r11)
+ blr
+
+ .align 4
+L(tail5):
+ bflr 31
+ lbz 6,4(r4)
+ stb 6,4(r11)
+ blr
+
+ .align 4
+1:
+ bflr 31
+ lbz 6,0(r4)
+ stb 6,0(r11)
+ /* Return original DST pointer. */
+ blr
+
+/* Handles copies of 0~8 bytes. */
+ .align 4
+L(copy_LE_8):
+ bne cr6,L(tail4)
+
+ /* Though we could've used ld/std here, they are still
+ slow for unaligned cases. */
+
+ lwz 6,0(r4)
+ lwz 7,4(r4)
+ stw 6,0(r11)
+ stw 7,4(r11)
+ blr
+
+
+/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
+ SRC is not. Use aligned quadword loads from SRC, shifted to realign
+ the data, allowing for aligned DST stores. */
+ .align 4
+L(copy_GE_32_unaligned):
+ clrldi 0,0,60 /* Number of bytes until the 1st r11 quadword. */
+ srdi 9,r5,4 /* Number of full quadwords remaining. */
+
+ beq L(copy_GE_32_unaligned_cont)
+
+ /* DST is not quadword aligned, get it aligned. */
+
+ mtocrf 0x01,0
+ subf r5,0,r5
+
+ /* Vector instructions work best when proper alignment (16-bytes)
+ is present. Move 0~15 bytes as needed to get DST quadword-aligned. */
+1:
+ bf 31,2f
+ lbz 6,0(r4)
+ addi r4,r4,1
+ stb 6,0(r11)
+ addi r11,r11,1
+2:
+ bf 30,4f
+ lhz 6,0(r4)
+ addi r4,r4,2
+ sth 6,0(r11)
+ addi r11,r11,2
+4:
+ bf 29,8f
+ lwz 6,0(r4)
+ addi r4,r4,4
+ stw 6,0(r11)
+ addi r11,r11,4
+8:
+ bf 28,0f
+ ld 6,0(r4)
+ addi r4,r4,8
+ std 6,0(r11)
+ addi r11,r11,8
+0:
+ srdi 9,r5,4 /* Number of full quadwords remaining. */
+
+ /* The proper alignment is present; it is OK to copy the bytes now. */
+L(copy_GE_32_unaligned_cont):
+
+ /* Setup two indexes to speed up the indexed vector operations. */
+ clrldi 10,r5,60
+ li 6,16 /* Index for 16-bytes offsets. */
+ li 7,32 /* Index for 32-bytes offsets. */
+ cmpldi cr1,10,0
+ srdi 8,r5,5 /* Setup the loop counter. */
+ mtocrf 0x01,9
+ cmpldi cr6,9,1
+#ifdef __LITTLE_ENDIAN__
+ lvsr 5,0,r4
+#else
+ lvsl 5,0,r4
+#endif
+ lvx 3,0,r4
+ li 0,0
+ bf 31,L(setup_unaligned_loop)
+
+ /* Copy another 16 bytes to align to 32-bytes due to the loop. */
+ lvx 4,r4,6
+#ifdef __LITTLE_ENDIAN__
+ vperm 6,4,3,5
+#else
+ vperm 6,3,4,5
+#endif
+ addi r4,r4,16
+ stvx 6,0,r11
+ addi r11,r11,16
+ vor 3,4,4
+ clrrdi 0,r4,60
+
+L(setup_unaligned_loop):
+ mtctr 8
+ ble cr6,L(end_unaligned_loop)
+
+ /* Copy 32 bytes at a time using vector instructions. */
+ .align 4
+L(unaligned_loop):
+
+ /* Note: vr6/vr10 may contain data that was already copied,
+ but in order to get proper alignment, we may have to copy
+ some portions again. This is faster than having unaligned
+ vector instructions though. */
+
+ lvx 4,r4,6
+#ifdef __LITTLE_ENDIAN__
+ vperm 6,4,3,5
+#else
+ vperm 6,3,4,5
+#endif
+ lvx 3,r4,7
+#ifdef __LITTLE_ENDIAN__
+ vperm 10,3,4,5
+#else
+ vperm 10,4,3,5
+#endif
+ addi r4,r4,32
+ stvx 6,0,r11
+ stvx 10,r11,6
+ addi r11,r11,32
+ bdnz L(unaligned_loop)
+
+ clrrdi 0,r4,60
+
+ .align 4
+L(end_unaligned_loop):
+
+ /* Check for tail bytes. */
+ mtocrf 0x01,r5
+ beqlr cr1
+
+ add r4,r4,0
+
+ /* We have 1~15 tail bytes to copy, and DST is quadword aligned. */
+ /* Copy 8 bytes. */
+ bf 28,4f
+ lwz 6,0(r4)
+ lwz 7,4(r4)
+ addi r4,r4,8
+ stw 6,0(r11)
+ stw 7,4(r11)
+ addi r11,r11,8
+4: /* Copy 4~7 bytes. */
+ bf 29,L(tail2)
+ lwz 6,0(r4)
+ stw 6,0(r11)
+ bf 30,L(tail5)
+ lhz 7,4(r4)
+ sth 7,4(r11)
+ bflr 31
+ lbz 8,6(r4)
+ stb 8,6(r11)
+ /* Return original DST pointer. */
+ blr
+
+ /* Start of the backward memcpy implementation: the algorithm first
+ checks whether src and dest share the same alignment; if they do, it
+ aligns both to 16 bytes and copies using VSX instructions.
+ If they do not, it aligns dest to 16 bytes and uses VMX (Altivec)
+ instructions to read two 16-byte chunks at a time, shift/permute the
+ bytes read, and write them aligned to dest. */
+L(memmove_bwd):
+ cmpldi cr1,r5,31
+ /* Copy is done backwards: update the pointers and check alignment. */
+ add r11,r3,r5
+ add r4,r4,r5
+ mr r0,r11
+ ble cr1, L(copy_LT_32_bwd) /* If move < 32 bytes use short move
+ code. */
+
+ andi. r10,r11,15 /* Check if r11 is aligned to 16 bytes */
+ clrldi r9,r4,60 /* Check if r4 is aligned to 16 bytes */
+ cmpld cr6,r10,r9 /* SRC and DST alignments match? */
+
+ bne cr6,L(copy_GE_32_unaligned_bwd)
+ beq L(aligned_copy_bwd)
+
+ mtocrf 0x01,r0
+ clrldi r0,r0,60
+
+/* Get the DST and SRC aligned to 16 bytes. */
+1:
+ bf 31,2f
+ lbz r6,-1(r4)
+ subi r4,r4,1
+ stb r6,-1(r11)
+ subi r11,r11,1
+2:
+ bf 30,4f
+ lhz r6,-2(r4)
+ subi r4,r4,2
+ sth r6,-2(r11)
+ subi r11,r11,2
+4:
+ bf 29,8f
+ lwz r6,-4(r4)
+ subi r4,r4,4
+ stw r6,-4(r11)
+ subi r11,r11,4
+8:
+ bf 28,16f
+ ld r6,-8(r4)
+ subi r4,r4,8
+ std r6,-8(r11)
+ subi r11,r11,8
+16:
+ subf r5,0,r5
+
+/* Main aligned copy loop. Copies 128 bytes at a time. */
+L(aligned_copy_bwd):
+ li r6,-16
+ li r7,-32
+ li r8,-48
+ li r9,-64
+ mtocrf 0x02,r5
+ srdi r12,r5,7
+ cmpdi r12,0
+ beq L(aligned_tail_bwd)
+ lxvd2x v6,r4,r6
+ lxvd2x v7,r4,r7
+ mtctr 12
+ b L(aligned_128loop_bwd)
+
+ .align 4
+L(aligned_128head_bwd):
+ /* for the 2nd + iteration of this loop. */
+ lxvd2x v6,r4,r6
+ lxvd2x v7,r4,r7
+L(aligned_128loop_bwd):
+ lxvd2x v8,r4,r8
+ lxvd2x v9,r4,r9
+ stxvd2x v6,r11,r6
+ subi r4,r4,64
+ stxvd2x v7,r11,r7
+ stxvd2x v8,r11,r8
+ stxvd2x v9,r11,r9
+ lxvd2x v6,r4,r6
+ lxvd2x v7,r4,r7
+ subi r11,r11,64
+ lxvd2x v8,r4,r8
+ lxvd2x v9,r4,r9
+ subi r4,r4,64
+ stxvd2x v6,r11,r6
+ stxvd2x v7,r11,r7
+ stxvd2x v8,r11,r8
+ stxvd2x v9,r11,r9
+ subi r11,r11,64
+ bdnz L(aligned_128head_bwd)
+
+L(aligned_tail_bwd):
+ mtocrf 0x01,r5
+ bf 25,32f
+ lxvd2x v6,r4,r6
+ lxvd2x v7,r4,r7
+ lxvd2x v8,r4,r8
+ lxvd2x v9,r4,r9
+ subi r4,r4,64
+ stxvd2x v6,r11,r6
+ stxvd2x v7,r11,r7
+ stxvd2x v8,r11,r8
+ stxvd2x v9,r11,r9
+ subi r11,r11,64
+32:
+ bf 26,16f
+ lxvd2x v6,r4,r6
+ lxvd2x v7,r4,r7
+ subi r4,r4,32
+ stxvd2x v6,r11,r6
+ stxvd2x v7,r11,r7
+ subi r11,r11,32
+16:
+ bf 27,8f
+ lxvd2x v6,r4,r6
+ subi r4,r4,16
+ stxvd2x v6,r11,r6
+ subi r11,r11,16
+8:
+ bf 28,4f
+ ld r6,-8(r4)
+ subi r4,r4,8
+ std r6,-8(r11)
+ subi r11,r11,8
+4: /* Copies 4~7 bytes. */
+ bf 29,L(tail2_bwd)
+ lwz r6,-4(r4)
+ stw r6,-4(r11)
+ bf 30,L(tail5_bwd)
+ lhz r7,-6(r4)
+ sth r7,-6(r11)
+ bflr 31
+ lbz r8,-7(r4)
+ stb r8,-7(r11)
+ /* Return original DST pointer. */
+ blr
+
+/* Handle copies of 0~31 bytes. */
+ .align 4
+L(copy_LT_32_bwd):
+ cmpldi cr6,r5,8
+ mtocrf 0x01,r5
+ ble cr6,L(copy_LE_8_bwd)
+
+ /* At least 9 bytes to go. */
+ neg r8,r4
+ andi. r0,r8,3
+ cmpldi cr1,r5,16
+ beq L(copy_LT_32_aligned_bwd)
+
+ /* Force 4-byte alignment for SRC. */
+ mtocrf 0x01,0
+ subf r5,0,r5
+2:
+ bf 30,1f
+ lhz r6,-2(r4)
+ subi r4,r4,2
+ sth r6,-2(r11)
+ subi r11,r11,2
+1:
+ bf 31,L(end_4bytes_alignment_bwd)
+ lbz 6,-1(r4)
+ subi r4,r4,1
+ stb 6,-1(r11)
+ subi r11,r11,1
+
+ .align 4
+L(end_4bytes_alignment_bwd):
+ cmpldi cr1,r5,16
+ mtocrf 0x01,r5
+
+L(copy_LT_32_aligned_bwd):
+ /* At least 6 bytes to go, and SRC is word-aligned. */
+ blt cr1,8f
+
+ /* Copy 16 bytes. */
+ lwz r6,-4(r4)
+ lwz r7,-8(r4)
+ stw r6,-4(r11)
+ lwz r8,-12(r4)
+ stw r7,-8(r11)
+ lwz r6,-16(r4)
+ subi r4,r4,16
+ stw r8,-12(r11)
+ stw r6,-16(r11)
+ subi r11,r11,16
+8: /* Copy 8 bytes. */
+ bf 28,L(tail4_bwd)
+ lwz r6,-4(r4)
+ lwz r7,-8(r4)
+ subi r4,r4,8
+ stw r6,-4(r11)
+ stw r7,-8(r11)
+ subi r11,r11,8
+
+ .align 4
+/* Copies 4~7 bytes. */
+L(tail4_bwd):
+ bf 29,L(tail2_bwd)
+ lwz 6,-4(r4)
+ stw 6,-4(r11)
+ bf 30,L(tail5_bwd)
+ lhz 7,-6(r4)
+ sth 7,-6(r11)
+ bflr 31
+ lbz 8,-7(r4)
+ stb 8,-7(r11)
+ /* Return original DST pointer. */
+ blr
+
+ .align 4
+/* Copies 2~3 bytes. */
+L(tail2_bwd):
+ bf 30,1f
+ lhz 6,-2(r4)
+ sth 6,-2(r11)
+ bflr 31
+ lbz 7,-3(r4)
+ stb 7,-3(r11)
+ blr
+
+ .align 4
+L(tail5_bwd):
+ bflr 31
+ lbz 6,-5(r4)
+ stb 6,-5(r11)
+ blr
+
+ .align 4
+1:
+ bflr 31
+ lbz 6,-1(r4)
+ stb 6,-1(r11)
+ /* Return original DST pointer. */
+ blr
+
+
+/* Handles copies of 0~8 bytes. */
+ .align 4
+L(copy_LE_8_bwd):
+ bne cr6,L(tail4_bwd)
+
+ /* Though we could've used ld/std here, they are still
+ slow for unaligned cases. */
+ lwz 6,-8(r4)
+ lwz 7,-4(r4)
+ stw 6,-8(r11)
+ stw 7,-4(r11)
+ blr
+
+
+/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
+ SRC is not. Use aligned quadword loads from SRC, shifted to realign
+ the data, allowing for aligned DST stores. */
+ .align 4
+L(copy_GE_32_unaligned_bwd):
+ andi. r10,r11,15 /* Check whether DST is aligned to 16 bytes. */
+ srdi r9,r5,4 /* Number of full quadwords remaining. */
+
+ beq L(copy_GE_32_unaligned_cont_bwd)
+
+ /* DST is not quadword aligned and r10 holds the address masked to
+ compare alignments. */
+ mtocrf 0x01,r10
+ subf r5,r10,r5
+
+ /* Vector instructions work best when proper alignment (16-bytes)
+ is present. Move 0~15 bytes as needed to get DST quadword-aligned. */
+1:
+ bf 31,2f
+ lbz r6,-1(r4)
+ subi r4,r4,1
+ stb r6,-1(r11)
+ subi r11,r11,1
+2:
+ bf 30,4f
+ lhz r6,-2(r4)
+ subi r4,r4,2
+ sth r6,-2(r11)
+ subi r11,r11,2
+4:
+ bf 29,8f
+ lwz r6,-4(r4)
+ subi r4,r4,4
+ stw r6,-4(r11)
+ subi r11,r11,4
+8:
+ bf 28,0f
+ ld r6,-8(r4)
+ subi r4,r4,8
+ std r6,-8(r11)
+ subi r11,r11,8
+0:
+ srdi r9,r5,4 /* Number of full quadwords remaining. */
+
+ /* The proper alignment is present; it is OK to copy the bytes now. */
+L(copy_GE_32_unaligned_cont_bwd):
+
+ /* Setup two indexes to speed up the indexed vector operations. */
+ clrldi r10,r5,60
+ li r6,-16 /* Index for 16-bytes offsets. */
+ li r7,-32 /* Index for 32-bytes offsets. */
+ cmpldi cr1,r10,0
+ srdi r8,r5,5 /* Setup the loop counter. */
+ mtocrf 0x01,r9
+ cmpldi cr6,r9,1
+#ifdef __LITTLE_ENDIAN__
+ lvsr v5,r0,r4
+#else
+ lvsl v5,r0,r4
+#endif
+ lvx v3,0,r4
+ li r0,0
+ bf 31,L(setup_unaligned_loop_bwd)
+
+ /* Copy another 16 bytes to align to 32-bytes due to the loop. */
+ lvx v4,r4,r6
+#ifdef __LITTLE_ENDIAN__
+ vperm v6,v3,v4,v5
+#else
+ vperm v6,v4,v3,v5
+#endif
+ subi r4,r4,16
+ stvx v6,r11,r6
+ subi r11,r11,16
+ vor v3,v4,v4
+ clrrdi r0,r4,60
+
+L(setup_unaligned_loop_bwd):
+ mtctr r8
+ ble cr6,L(end_unaligned_loop_bwd)
+
+ /* Copy 32 bytes at a time using vector instructions. */
+ .align 4
+L(unaligned_loop_bwd):
+
+ /* Note: vr6/vr10 may contain data that was already copied,
+ but in order to get proper alignment, we may have to copy
+ some portions again. This is faster than having unaligned
+ vector instructions though. */
+
+ lvx v4,r4,r6
+#ifdef __LITTLE_ENDIAN__
+ vperm v6,v3,v4,v5
+#else
+ vperm v6,v4,v3,v5
+#endif
+ lvx v3,r4,r7
+#ifdef __LITTLE_ENDIAN__
+ vperm v10,v4,v3,v5
+#else
+ vperm v10,v3,v4,v5
+#endif
+ subi r4,r4,32
+ stvx v6,r11,r6
+ stvx v10,r11,r7
+ subi r11,r11,32
+ bdnz L(unaligned_loop_bwd)
+
+ clrrdi r0,r4,60
+
+ .align 4
+L(end_unaligned_loop_bwd):
+
+ /* Check for tail bytes. */
+ mtocrf 0x01,r5
+ beqlr cr1
+
+ add r4,r4,0
+
+ /* We have 1~15 tail bytes to copy, and DST is quadword aligned. */
+ /* Copy 8 bytes. */
+ bf 28,4f
+ lwz r6,-4(r4)
+ lwz r7,-8(r4)
+ subi r4,r4,8
+ stw r6,-4(r11)
+ stw r7,-8(r11)
+ subi r11,r11,8
+4: /* Copy 4~7 bytes. */
+ bf 29,L(tail2_bwd)
+ lwz r6,-4(r4)
+ stw r6,-4(r11)
+ bf 30,L(tail5_bwd)
+ lhz r7,-6(r4)
+ sth r7,-6(r11)
+ bflr 31
+ lbz r8,-7(r4)
+ stb r8,-7(r11)
+ /* Return original DST pointer. */
+ blr
+END_GEN_TB (memmove, TB_TOCLESS)
+libc_hidden_builtin_def (memmove)
+
+
+/* void bcopy(const void *src [r3], void *dest [r4], size_t n [r5])
+ Implemented in this file to avoid the linker creating a stub function
+ call in the branch to '_memmove'. */
+ENTRY (bcopy)
+ mr r6,r3
+ mr r3,r4
+ mr r4,r6
+ b L(_memmove)
+END (bcopy)
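
[Note: the dispatch at the top of memmove ("subf r9,r4,r3; cmpld cr7,r9,r5; blt cr7,L(memmove_bwd)") relies on unsigned wraparound: dest minus src, computed modulo 2^64, is below len exactly when a forward copy would overwrite source bytes not yet read. A hedged C rendering of that test; the function name is illustrative:]

    #include <stdint.h>
    #include <stddef.h>

    /* Forward copy is unsafe only when dest lands inside [src, src+len).
       If dest < src, the unsigned difference wraps to a huge value that
       compares >= len, so the forward path is chosen for free.  */
    static int
    must_copy_backward (const void *dest, const void *src, size_t len)
    {
      return (uintptr_t) dest - (uintptr_t) src < (uintptr_t) len;
    }
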
diff --git a/sysdeps/powerpc/powerpc64/power7/mempcpy.S b/sysdeps/powerpc/powerpc64/power7/mempcpy.S
index a7239eeac1..1cd69df137 100644
--- a/sysdeps/powerpc/powerpc64/power7/mempcpy.S
+++ b/sysdeps/powerpc/powerpc64/power7/mempcpy.S
@@ -1,5 +1,5 @@
/* Optimized mempcpy implementation for POWER7.
- Copyright (C) 2010-2014 Free Software Foundation, Inc.
+ Copyright (C) 2010-2015 Free Software Foundation, Inc.
Contributed by Luis Machado <luisgpm@br.ibm.com>.
This file is part of the GNU C Library.
diff --git a/sysdeps/powerpc/powerpc64/power7/memrchr.S b/sysdeps/powerpc/powerpc64/power7/memrchr.S
index 40e436f853..bd3f085872 100644
--- a/sysdeps/powerpc/powerpc64/power7/memrchr.S
+++ b/sysdeps/powerpc/powerpc64/power7/memrchr.S
@@ -1,5 +1,5 @@
/* Optimized memrchr implementation for PowerPC64/POWER7 using cmpb insn.
- Copyright (C) 2010-2014 Free Software Foundation, Inc.
+ Copyright (C) 2010-2015 Free Software Foundation, Inc.
Contributed by Luis Machado <luisgpm@br.ibm.com>.
This file is part of the GNU C Library.
@@ -29,7 +29,7 @@ ENTRY (__memrchr)
mr r10,r3
clrrdi r6,r7,7
li r9,3<<5
- dcbt r9,r6,16 /* Stream hint, decreasing addresses. */
+ dcbt r9,r6,8 /* Stream hint, decreasing addresses. */
/* Replicate BYTE to doubleword. */
insrdi r4,r4,8,48
diff --git a/sysdeps/powerpc/powerpc64/power7/memset.S b/sysdeps/powerpc/powerpc64/power7/memset.S
index 6b8999dc1f..4c8c06fec9 100644
--- a/sysdeps/powerpc/powerpc64/power7/memset.S
+++ b/sysdeps/powerpc/powerpc64/power7/memset.S
@@ -1,5 +1,5 @@
/* Optimized memset implementation for PowerPC64/POWER7.
- Copyright (C) 2010-2014 Free Software Foundation, Inc.
+ Copyright (C) 2010-2015 Free Software Foundation, Inc.
Contributed by Luis Machado <luisgpm@br.ibm.com>.
This file is part of the GNU C Library.
@@ -383,7 +383,6 @@ L(small):
END_GEN_TB (memset,TB_TOCLESS)
libc_hidden_builtin_def (memset)
-#ifndef NO_BZERO_IMPL
/* Copied from bzero.S to prevent the linker from inserting a stub
between bzero and memset. */
ENTRY (__bzero)
@@ -391,7 +390,7 @@ ENTRY (__bzero)
mr r5,r4
li r4,0
b L(_memset)
-END_GEN_TB (__bzero,TB_TOCLESS)
-
+END (__bzero)
+#ifndef __bzero
weak_alias (__bzero, bzero)
#endif
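
As with bcopy above, __bzero is implemented by rearranging arguments and branching straight into the memset body; a C sketch of the equivalent behavior:

    #include <string.h>

    /* Sketch only: n moves from the second to the third argument and
       the fill byte is fixed at zero, matching the mr/li sequence in
       the assembly above.  */
    void
    __bzero (void *s, size_t n)
    {
      memset (s, 0, n);
    }
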
diff --git a/sysdeps/powerpc/powerpc64/power7/rawmemchr.S b/sysdeps/powerpc/powerpc64/power7/rawmemchr.S
index 56a19bd885..cccac6e7fb 100644
--- a/sysdeps/powerpc/powerpc64/power7/rawmemchr.S
+++ b/sysdeps/powerpc/powerpc64/power7/rawmemchr.S
@@ -1,5 +1,5 @@
/* Optimized rawmemchr implementation for PowerPC64/POWER7 using cmpb insn.
- Copyright (C) 2010-2014 Free Software Foundation, Inc.
+ Copyright (C) 2010-2015 Free Software Foundation, Inc.
Contributed by Luis Machado <luisgpm@br.ibm.com>.
This file is part of the GNU C Library.
diff --git a/sysdeps/powerpc/powerpc64/power7/stpcpy.S b/sysdeps/powerpc/powerpc64/power7/stpcpy.S
index baf6e98826..ef90142932 100644
--- a/sysdeps/powerpc/powerpc64/power7/stpcpy.S
+++ b/sysdeps/powerpc/powerpc64/power7/stpcpy.S
@@ -1,5 +1,5 @@
/* Optimized stpcpy implementation for PowerPC64/POWER7.
- Copyright (C) 2013-2014 Free Software Foundation, Inc.
+ Copyright (C) 2013-2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/powerpc/powerpc64/power7/stpncpy.S b/sysdeps/powerpc/powerpc64/power7/stpncpy.S
new file mode 100644
index 0000000000..c60453a55f
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/stpncpy.S
@@ -0,0 +1,24 @@
+/* Optimized stpncpy implementation for PowerPC64/POWER7.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define USE_AS_STPNCPY
+#include <sysdeps/powerpc/powerpc64/power7/strncpy.S>
+
+weak_alias (__stpncpy, stpncpy)
+libc_hidden_def (__stpncpy)
+libc_hidden_builtin_def (stpncpy)
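
The USE_AS_STPNCPY knob changes only the return value of the shared strncpy body: strncpy returns dst, while stpncpy returns a pointer to the first null byte it wrote (or dst + n if none). A sketch of that contract, with a hypothetical helper name:

    #include <string.h>

    /* Hypothetical helper illustrating the stpncpy return value; not
       part of the implementation.  */
    char *
    stpncpy_return (char *dst, const char *src, size_t n)
    {
      return dst + strnlen (src, n);
    }
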
diff --git a/sysdeps/powerpc/powerpc64/power7/strcasecmp.S b/sysdeps/powerpc/powerpc64/power7/strcasecmp.S
index 417c7e56af..2dcb2bc7dc 100644
--- a/sysdeps/powerpc/powerpc64/power7/strcasecmp.S
+++ b/sysdeps/powerpc/powerpc64/power7/strcasecmp.S
@@ -1,5 +1,5 @@
/* Optimized strcasecmp implementation for PowerPC64.
- Copyright (C) 2011-2014 Free Software Foundation, Inc.
+ Copyright (C) 2011-2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/powerpc/powerpc64/power7/strchr.S b/sysdeps/powerpc/powerpc64/power7/strchr.S
index 1c0a556c04..1ba388c791 100644
--- a/sysdeps/powerpc/powerpc64/power7/strchr.S
+++ b/sysdeps/powerpc/powerpc64/power7/strchr.S
@@ -1,5 +1,5 @@
/* Optimized strchr implementation for PowerPC64/POWER7 using cmpb insn.
- Copyright (C) 2010-2014 Free Software Foundation, Inc.
+ Copyright (C) 2010-2015 Free Software Foundation, Inc.
Contributed by Luis Machado <luisgpm@br.ibm.com>.
This file is part of the GNU C Library.
diff --git a/sysdeps/powerpc/powerpc64/power7/strchrnul.S b/sysdeps/powerpc/powerpc64/power7/strchrnul.S
index 586c76950a..180b72bf5c 100644
--- a/sysdeps/powerpc/powerpc64/power7/strchrnul.S
+++ b/sysdeps/powerpc/powerpc64/power7/strchrnul.S
@@ -1,5 +1,5 @@
/* Optimized strchrnul implementation for PowerPC64/POWER7 using cmpb insn.
- Copyright (C) 2010-2014 Free Software Foundation, Inc.
+ Copyright (C) 2010-2015 Free Software Foundation, Inc.
Contributed by Luis Machado <luisgpm@br.ibm.com>.
This file is part of the GNU C Library.
diff --git a/sysdeps/powerpc/powerpc64/power7/strcmp.S b/sysdeps/powerpc/powerpc64/power7/strcmp.S
new file mode 100644
index 0000000000..6af0e7dad6
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/strcmp.S
@@ -0,0 +1,164 @@
+/* Optimized strcmp implementation for Power7 using 'cmpb' instruction
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* The optimization is achieved here through the cmpb instruction.
+   8-byte-aligned strings are processed with doubleword comparisons,
+   and unaligned strings are handled with a loop-unrolling
+   technique.  */
+
+#include <sysdep.h>
+
+/* int [r3] strcmp (const char *s1 [r3], const char *s2 [r4]) */
+
+ .machine power7
+EALIGN (strcmp, 4, 0)
+ CALL_MCOUNT 2
+
+ or r9, r3, r4
+ rldicl. r10, r9, 0, 61 /* are s1 and s2 8-byte aligned? */
+ bne cr0, L(process_unaligned_bytes)
+ li r5, 0
+
+ .align 4
+/* process input parameters on double word aligned boundary */
+L(unrollDword):
+ ld r8,0(r3)
+ ld r10,0(r4)
+ cmpb r7,r8,r5
+ cmpdi cr7,r7,0
+ mr r9,r7
+ bne cr7,L(null_found)
+ cmpld cr7,r8,r10
+ bne cr7,L(different)
+
+ ld r8,8(r3)
+ ld r10,8(r4)
+ cmpb r7,r8,r5
+ cmpdi cr7,r7,0
+ mr r9,r7
+ bne cr7,L(null_found)
+ cmpld cr7,r8,r10
+ bne cr7,L(different)
+
+ ld r8,16(r3)
+ ld r10,16(r4)
+ cmpb r7,r8,r5
+ cmpdi cr7,r7,0
+ mr r9,r7
+ bne cr7,L(null_found)
+ cmpld cr7,r8,r10
+ bne cr7,L(different)
+
+ ld r8,24(r3)
+ ld r10,24(r4)
+ cmpb r7,r8,r5
+ cmpdi cr7,r7,0
+ mr r9,r7
+ bne cr7,L(null_found)
+ cmpld cr7,r8,r10
+ bne cr7,L(different)
+
+ addi r3, r3, 32
+ addi r4, r4, 32
+ beq cr7, L(unrollDword)
+
+ .align 4
+L(null_found):
+#ifdef __LITTLE_ENDIAN__
+ neg r7,r9
+ and r9,r9,r7
+ li r7,-1
+ cntlzd r9,r9
+ subfic r9,r9,71
+ sld r9,r7,r9
+#else
+ cntlzd r9,r9
+ li r7,-1
+ addi r9,r9,8
+ srd r9,r7,r9
+#endif
+ or r8,r8,r9
+ or r10,r10,r9
+
+L(different):
+ cmpb r9,r8,r10
+#ifdef __LITTLE_ENDIAN__
+ addi r7,r9,1
+ andc r9,r7,r9
+ cntlzd r9,r9
+ subfic r9,r9,63
+#else
+ not r9,r9
+ cntlzd r9,r9
+ subfic r9,r9,56
+#endif
+ srd r3,r8,r9
+ srd r10,r10,r9
+ rldicl r10,r10,0,56
+ rldicl r3,r3,0,56
+ subf r3,r10,r3
+ blr
+
+ .align 4
+L(process_unaligned_bytes):
+ lbz r9, 0(r3) /* load byte from s1 */
+ lbz r10, 0(r4) /* load byte from s2 */
+ cmpdi cr7, r9, 0 /* compare *s1 with NULL */
+ beq cr7, L(diffOfNULL) /* if *s1 is NULL, return *s1 - *s2 */
+ cmplw cr7, r9, r10 /* compare *s1 and *s2 */
+ bne cr7, L(ComputeDiff) /* branch to compute difference and return */
+
+ lbz r9, 1(r3) /* load next byte from s1 */
+ lbz r10, 1(r4) /* load next byte from s2 */
+ cmpdi cr7, r9, 0 /* compare *s1 with NULL */
+ beq cr7, L(diffOfNULL) /* if *s1 is NULL, return *s1 - *s2 */
+ cmplw cr7, r9, r10 /* compare *s1 and *s2 */
+ bne cr7, L(ComputeDiff) /* branch to compute difference and return */
+
+ lbz r9, 2(r3) /* unroll 3rd byte here */
+ lbz r10, 2(r4)
+ cmpdi cr7, r9, 0
+ beq cr7, L(diffOfNULL)
+ cmplw cr7, r9, r10
+ bne cr7, L(ComputeDiff)
+
+ lbz r9, 3(r3) /* unroll 4th byte now */
+ lbz r10, 3(r4)
+ addi r3, r3, 4 /* increment s1 by unroll factor */
+ cmpdi cr7, r9, 0
+ cmplw cr6, r9, r10
+ beq cr7, L(diffOfNULL)
+ addi r4, r4, 4 /* increment s2 by unroll factor */
+ beq cr6, L(process_unaligned_bytes) /* unroll byte processing */
+
+ .align 4
+L(ComputeDiff):
+ extsw r9, r9
+ subf r10, r10, r9 /* compute s1 - s2 */
+ extsw r3, r10
+ blr /* return */
+
+ .align 4
+L(diffOfNULL):
+ li r9, 0
+ subf r10, r10, r9 /* compute s1 - s2 */
+ extsw r3, r10 /* sign extend result */
+ blr /* return */
+
+END (strcmp)
+libc_hidden_builtin_def (strcmp)
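
The cmpb instruction at the heart of this routine compares two doublewords bytewise, producing 0xff in each byte position where the operands match. A C model of its semantics, for illustration only:

    #include <stdint.h>

    /* Model of cmpb: byte i of the result is 0xff where the operand
       bytes are equal, 0x00 otherwise.  cmpb against a register of
       zeros therefore yields a nonzero result exactly when the loaded
       doubleword contains a null byte, which is how the unrolled loop
       above detects the end of the string.  */
    static uint64_t
    cmpb_model (uint64_t a, uint64_t b)
    {
      uint64_t r = 0;
      for (int i = 0; i < 8; i++)
        if (((a >> (8 * i)) & 0xff) == ((b >> (8 * i)) & 0xff))
          r |= (uint64_t) 0xff << (8 * i);
      return r;
    }
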
diff --git a/sysdeps/powerpc/powerpc64/power7/strcpy.S b/sysdeps/powerpc/powerpc64/power7/strcpy.S
index ce71982eaf..70f2987181 100644
--- a/sysdeps/powerpc/powerpc64/power7/strcpy.S
+++ b/sysdeps/powerpc/powerpc64/power7/strcpy.S
@@ -1,5 +1,5 @@
/* Optimized strcpy/stpcpy implementation for PowerPC64/POWER7.
- Copyright (C) 2013-2014 Free Software Foundation, Inc.
+ Copyright (C) 2013-2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -31,8 +31,6 @@
if (((((uintptr_t)dst & 0x7UL) == 0) && ((uintptr_t)src & 0x7UL) == 0))
goto aligned_doubleword_copy;
- if (((((uintptr_t)dst & 0x3UL) == 0) && ((uintptr_t)src & 0x3UL) == 0))
- goto aligned_word_copy;
if (((uintptr_t)dst & 0x7UL) == ((uintptr_t)src & 0x7UL))
goto same_alignment;
goto unaligned;
@@ -70,9 +68,18 @@ EALIGN (FUNC_NAME, 4, 0)
#endif
or rTMP, rSRC, rRTN
clrldi. rTMP, rTMP, 61
- bne L(check_word_alignment)
+ bne L(check_alignment)
b L(aligned_doubleword_copy)
+ .align 4
+L(check_alignment):
+ rldicl rRTNAL, rRTN, 0, 61
+ rldicl rSRCAL, rSRC, 0, 61
+ cmpld cr7, rSRCAL, rRTNAL
+ beq cr7, L(same_alignment)
+ b L(unaligned)
+
+ .align 4
L(same_alignment):
/* Src and dst with same alignment: align both to doubleword. */
mr rALCNT, rRTN
@@ -180,93 +187,249 @@ L(g1):
#endif
blr
-L(check_word_alignment):
- clrldi. rTMP, rTMP, 62
- beq L(aligned_word_copy)
- rldicl rRTNAL, rRTN, 0, 61
- rldicl rSRCAL, rSRC, 0, 61
- cmpld cr7, rSRCAL, rRTNAL
- beq cr7, L(same_alignment)
- b L(unaligned)
-
-/* For word aligned memory, operate using word load and stores. */
.align 4
-L(aligned_word_copy):
- li rMASK, 0
- addi rRTN, rRTN, -4
- lwz rWORD, 0(rSRC)
- b L(g5)
+L(unaligned):
+ cmpdi rSRCAL, 0 /* Check src alignment */
+ beq L(srcaligndstunalign)
+ /* src is unaligned */
+ rlwinm r10, rSRC, 3,26,28 /* Calculate padding. */
+ clrrdi rSRC, rSRC, 3 /* Align the addr to dw boundary */
+ ld rWORD, 0(rSRC) /* Load doubleword from memory. */
+ li rTMP, 0
+ /* Discard bits not part of the string */
+#ifdef __LITTLE_ENDIAN__
+ srd rALT, rWORD, r10
+#else
+ sld rALT, rWORD, r10
+#endif
+ cmpb rTMP, rALT, rTMP /* Compare each byte against null */
+ /* Discard bits not part of the string */
+#ifdef __LITTLE_ENDIAN__
+ sld rTMP, rTMP, r10
+#else
+ srd rTMP, rTMP, r10
+#endif
+ cmpdi rTMP, 0
+ bne L(bytebybyte) /* if it has null, copy byte by byte */
+ subfic r8, r9, 8
+ rlwinm r5, rRTN, 3,26,28 /* Calculate padding in bits. */
+ rldicl r9, rRTN, 0, 61 /* Calculate padding in bytes. */
+ addi rRTN, rRTN, -1
- .align 4
-L(g3): lwzu rALT, 4(rSRC)
- stwu rWORD, 4(rRTN)
- cmpb rTMP, rALT, rMASK
- cmpwi rTMP, 0
- bne L(g4)
- lwzu rWORD, 4(rSRC)
- stwu rALT, 4(rRTN)
-L(g5): cmpb rTMP, rWORD, rMASK
- cmpwi rTMP, 0 /* If rTMP is 0, no null in word. */
- beq L(g3)
-
- mr rALT, rWORD
-/* We've hit the end of the string. Do the rest byte-by-byte. */
-L(g4):
+ cmpdi r5, 0 /* check dest alignment */
+ beq L(srcunaligndstalign)
+
+ /* both src and dst unaligned */
#ifdef __LITTLE_ENDIAN__
- rlwinm. rTMP, rALT, 0, 24, 31
- stbu rALT, 4(rRTN)
- beqlr-
- rlwinm. rTMP, rALT, 24, 24, 31
- stbu rTMP, 1(rRTN)
- beqlr-
- rlwinm. rTMP, rALT, 16, 24, 31
- stbu rTMP, 1(rRTN)
- beqlr-
- rlwinm rTMP, rALT, 8, 24, 31
- stbu rTMP, 1(rRTN)
+ sld rWORD, rALT, r10
+ mr r11, r10
+ addi r11, r11, -8 /* Adjust byte pointer on loaded dw */
#else
- rlwinm. rTMP, rALT, 8, 24, 31
- stbu rTMP, 4(rRTN)
- beqlr
- rlwinm. rTMP, rALT, 16, 24, 31
- stbu rTMP, 1(rRTN)
- beqlr
- rlwinm. rTMP, rALT, 24, 24, 31
- stbu rTMP, 1(rRTN)
- beqlr
- stbu rALT, 1(rRTN)
+ srd rWORD, rALT, r10
+ subfic r11, r10, 64
#endif
- blr
+ /* Is dst alignment greater than src alignment? */
+ cmpd cr7, r5, r10
+ blt cr7, L(dst_align_small)
+ /* src alignment is less than dst */
-/* Oh well. In this case, we just do a byte-by-byte copy. */
- .align 4
-L(unaligned):
- lbz rWORD, 0(rSRC)
- addi rRTN, rRTN, -1
- cmpdi rWORD, 0
- beq L(u2)
-
- .align 5
-L(u0): lbzu rALT, 1(rSRC)
- stbu rWORD, 1(rRTN)
- cmpdi rALT, 0
- beq L(u1)
- lbzu rWORD, 1(rSRC)
+ /* Calculate the dst alignment difference */
+ subfic rALT, r9, 8
+ mtctr rALT
+
+ /* Write until dst is aligned */
+ cmpdi rTMP, rALT, 4
+ blt L(storebyte1) /* less than 4, store byte by byte */
+ beq L(equal1) /* if its 4, store word */
+ addi rTMP, rALT, -4 /* greater than 4, so stb and stw */
+ mtctr rTMP
+L(storebyte1):
+#ifdef __LITTLE_ENDIAN__
+ addi r11, r11, 8 /* Adjust byte pointer on loaded dw */
+#else
+ addi r11, r11, -8
+#endif
+ srd rALT, rWORD, r11
+ stbu rALT, 1(rRTN)
+ bdnz L(storebyte1)
+
+ subfic rALT, r9, 8 /* Check the remaining bytes */
+ cmpdi rTMP, rALT, 4
+ blt L(proceed)
+
+ .align 4
+L(equal1):
+#ifdef __LITTLE_ENDIAN__
+ addi r11, r11, 8 /* Adjust byte pointer on loaded dw */
+ srd rALT, rWORD, r11
+#else
+ subfic r11, r11, 64
+ sld rALT, rWORD, r11
+ srdi rALT, rALT, 32
+#endif
+ stw rALT, 1(rRTN)
+ addi rRTN, rRTN, 4
+
+L(proceed):
+ mr rALT, rWORD
+ /* Calculate the leftover bytes to be written */
+ subfic r11, r10, 64
+ subfic r5, r5, 64
+ subf r5, r5, r11 /* remaining bytes on second dw */
+ subfic r10, r5, 64 /* remaining bytes on first dw */
+ subfic r9, r9, 8
+ subf r8, r9, r8 /* recalculate padding */
+L(srcunaligndstalign):
+ addi rRTN, rRTN, 1
+ subfic r5, r10, 64 /* remaining bytes on second dw */
+ addi rSRC, rSRC, 8
+ li rTMP,0
+ b L(storedouble)
+
+ .align 4
+L(dst_align_small):
+ mtctr r8
+ /* Write until src is aligned */
+L(storebyte2):
+#ifdef __LITTLE_ENDIAN__
+ addi r11, r11, 8 /* Adjust byte pointer on dw */
+#else
+ addi r11, r11, -8
+#endif
+ srd rALT, rWORD, r11
stbu rALT, 1(rRTN)
- cmpdi rWORD, 0
- beq L(u2)
- lbzu rALT, 1(rSRC)
- stbu rWORD, 1(rRTN)
- cmpdi rALT, 0
- beq L(u1)
- lbzu rWORD, 1(rSRC)
+ bdnz L(storebyte2)
+
+ addi rSRC, rSRC, 8 /* Increment src pointer */
+ addi rRTN, rRTN, 1 /* Increment dst pointer */
+ rldicl r8, rRTN, 0, 61 /* Recalculate padding */
+
+ /* src is aligned */
+L(srcaligndstunalign):
+ ld rWORD, 0(rSRC)
+ mr rALT, rWORD
+ li rTMP, 0 /* Check null */
+ cmpb rTMP, rWORD, rTMP
+ cmpdi rTMP, 0
+ bne L(bytebybyte) /* Do byte by byte if there is NULL */
+ rlwinm r5, rRTN, 3,26,28 /* Calculate padding */
+ addi rRTN, rRTN, -1
+ subfic r10, r8, 8
+ /* Write byte by byte until aligned */
+#ifdef __LITTLE_ENDIAN__
+ li r11, -8
+#else
+ li r11, 64
+#endif
+ mtctr r10
+ cmpdi rTMP, r10, 4
+ blt L(storebyte)
+ beq L(equal)
+ addi rTMP, r10, -4
+ mtctr rTMP
+L(storebyte):
+#ifdef __LITTLE_ENDIAN__
+ addi r11, r11, 8 /* Adjust byte pointer on dw */
+#else
+ addi r11, r11, -8
+#endif
+ srd rALT, rWORD, r11
stbu rALT, 1(rRTN)
- cmpdi rWORD, 0
- bne L(u0)
-L(u2): stbu rWORD, 1(rRTN)
- blr
-L(u1): stbu rALT, 1(rRTN)
- blr
+ bdnz L(storebyte)
+
+ cmpdi rTMP, r10, 4
+ blt L(align)
+
+ .align 4
+L(equal):
+#ifdef __LITTLE_ENDIAN__
+ addi r11, r11, 8
+ srd rALT, rWORD, r11
+#else
+ subfic r11, r11, 64
+ sld rALT, rWORD, r11
+ srdi rALT, rALT, 32
+#endif
+ stw rALT, 1(rRTN)
+ addi rRTN, rRTN, 4
+L(align):
+ addi rRTN, rRTN, 1
+ addi rSRC, rSRC, 8 /* Increment src pointer */
+ subfic r10, r5, 64
+ li rTMP, 0
+ /* dst addr aligned to 8 */
+L(storedouble):
+ ld rALT, 0(rSRC) /* load next dw */
+ cmpb rTMP, rALT, rTMP
+ cmpdi rTMP, 0 /* check for null on each new dw */
+ bne L(null)
+#ifdef __LITTLE_ENDIAN__
+ srd r9, rWORD, r10 /* bytes from first dw */
+ sld r11, rALT, r5 /* bytes from second dw */
+#else
+ sld r9, rWORD, r10
+ srd r11, rALT, r5
+#endif
+ or r11, r9, r11 /* make as a single dw */
+ std r11, 0(rRTN) /* store as std on aligned addr */
+ mr rWORD, rALT /* still few bytes left to be written */
+ addi rRTN, rRTN, 8 /* increment dst addr */
+ addi rSRC, rSRC, 8 /* increment src addr */
+ b L(storedouble) /* Loop until NULL */
+
+ .align 4
+
+/* We've hit the end of the string. Do the rest byte-by-byte. */
+L(null):
+ addi rRTN, rRTN, -1
+ mr r10, r5
+ mtctr r8
+#ifdef __LITTLE_ENDIAN__
+ subfic r10, r10, 64
+ addi r10, r10, -8
+#endif
+ cmpdi rTMP, r8, 4
+ blt L(loop)
+
+ /* We can still use stw if leftover >= 4 */
+#ifdef __LITTLE_ENDIAN__
+ addi r10, r10, 8
+ srd r11, rWORD, r10
+#else
+ subfic r10, r10, 64
+ sld r11, rWORD, r10
+ srdi r11, r11, 32
+#endif
+ stw r11, 1(rRTN)
+ addi rRTN, rRTN, 4
+
+ beq L(bytebybyte1)
+ addi r10, r10, 32
+#ifdef __LITTLE_ENDIAN__
+ addi r10, r10, -8
+#else
+ subfic r10, r10, 64
+#endif
+ addi rTMP, r8, -4
+ mtctr rTMP
+ /* remaining byte by byte part of first dw */
+L(loop):
+#ifdef __LITTLE_ENDIAN__
+ addi r10, r10, 8
+#else
+ addi r10, r10, -8
+#endif
+ srd rTMP, rWORD, r10
+ stbu rTMP, 1(rRTN)
+ bdnz L(loop)
+
+L(bytebybyte1):
+ addi rRTN, rRTN, 1
+ /* remaining byte by byte part of second dw */
+L(bytebybyte):
+ addi rRTN, rRTN, -8
+ b L(g1)
+
END (FUNC_NAME)
#ifndef USE_AS_STPCPY
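
The core of the unaligned path added above is the L(storedouble) loop, which keeps the stores aligned by fusing bytes from two consecutive aligned source doublewords with the sld/srd/or sequence. A C sketch of that merge, using big-endian shift directions (little-endian swaps sld and srd) and assuming the shift count a satisfies 0 < a < 64:

    #include <stdint.h>

    /* Sketch of the merge in L(storedouble): a bits come from the
       tail of the first doubleword, 64 - a bits from the head of the
       second, forming one aligned doubleword store.  */
    static uint64_t
    merge_dwords (uint64_t first, uint64_t second, unsigned a)
    {
      return (first << a) | (second >> (64 - a));
    }
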
diff --git a/sysdeps/powerpc/powerpc64/power7/strlen.S b/sysdeps/powerpc/powerpc64/power7/strlen.S
index d023e85938..598fe0b5ff 100644
--- a/sysdeps/powerpc/powerpc64/power7/strlen.S
+++ b/sysdeps/powerpc/powerpc64/power7/strlen.S
@@ -1,5 +1,5 @@
/* Optimized strlen implementation for PowerPC64/POWER7 using cmpb insn.
- Copyright (C) 2010-2014 Free Software Foundation, Inc.
+ Copyright (C) 2010-2015 Free Software Foundation, Inc.
Contributed by Luis Machado <luisgpm@br.ibm.com>.
This file is part of the GNU C Library.
diff --git a/sysdeps/powerpc/powerpc64/power7/strncmp.S b/sysdeps/powerpc/powerpc64/power7/strncmp.S
index 35cc244f36..959eb95752 100644
--- a/sysdeps/powerpc/powerpc64/power7/strncmp.S
+++ b/sysdeps/powerpc/powerpc64/power7/strncmp.S
@@ -1,5 +1,5 @@
/* Optimized strcmp implementation for POWER7/PowerPC64.
- Copyright (C) 2010-2014 Free Software Foundation, Inc.
+ Copyright (C) 2010-2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/powerpc/powerpc64/power7/strncpy.S b/sysdeps/powerpc/powerpc64/power7/strncpy.S
new file mode 100644
index 0000000000..a6c9abf7d9
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/strncpy.S
@@ -0,0 +1,714 @@
+/* Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* Implements the functions
+
+ char * [r3] strncpy (char *dst [r3], const char *src [r4], size_t n [r5])
+
+ AND
+
+ char * [r3] stpncpy (char *dst [r3], const char *src [r4], size_t n [r5])
+
+ The algorithm is as follows:
+ > if src and dest are 8-byte aligned, perform doubleword copies
+ else
+ > copy byte by byte on unaligned addresses.
+
+ The aligned comparisons are made using cmpb instructions. */
+
+/* The optimizations for performance improvement are as follows:
+ 1. data alignment [gain from aligned memory access on read/write]
+ 2. POWER7 gains performance with loop unrolling/unwinding
+ [gain by reduction of branch penalty].
+ 3. The final pad with null bytes is done by calling an optimized
+ memset. */
+
+#ifdef USE_AS_STPNCPY
+# define FUNC_NAME __stpncpy
+#else
+# define FUNC_NAME strncpy
+#endif
+
+#define FRAMESIZE (FRAME_MIN_SIZE+32)
+
+#ifndef MEMSET
+/* For builds with no IFUNC support, local calls should be made to internal
+ GLIBC symbol (created by libc_hidden_builtin_def). */
+# ifdef SHARED
+# define MEMSET __GI_memset
+# else
+# define MEMSET memset
+# endif
+#endif
+
+ .machine power7
+EALIGN(FUNC_NAME, 4, 0)
+ CALL_MCOUNT 3
+
+ mflr r0 /* load link register LR to r0 */
+ or r10, r3, r4 /* to verify source and destination */
+ rldicl. r8, r10, 0, 61 /* is double word aligned .. ? */
+
+ std r19, -8(r1) /* save caller's register r19 */
+ std r18, -16(r1) /* save caller's register r18 */
+ std r0, 16(r1) /* store the link register */
+ stdu r1, -FRAMESIZE(r1) /* create the stack frame */
+
+ mr r9, r3 /* save r3 into r9 for use */
+ mr r18, r3 /* save r3 for retCode of strncpy */
+ bne 0, L(unaligned)
+
+L(aligned):
+ srdi r11, r5, 3 /* compute count for CTR ; count = n/8 */
+ cmpldi cr7, r11, 3 /* if count >= 4, perform unrolling 4 times */
+ ble cr7, L(update1)
+
+ ld r10, 0(r4) /* load doubleWord from src */
+ cmpb r8, r10, r8 /* compare the src dword just read with NULL */
+ cmpdi cr7, r8, 0 /* if cmpb found no NULL, we continue */
+ bne cr7, L(update3)
+
+ std r10, 0(r3) /* copy doubleword at offset=0 */
+ ld r10, 8(r4) /* load next doubleword from offset=8 */
+ cmpb r8, r10, r8 /* compare the src dword just read with NULL */
+ cmpdi cr7, r8, 0 /* if cmpb found no NULL, we continue */
+ bne cr7, L(HopBy8)
+
+ addi r8, r11, -4
+ mr r7, r3
+ srdi r8, r8, 2
+ mr r6, r4
+ addi r8, r8, 1
+ li r12, 0
+ mtctr r8
+ b L(dwordCopy)
+
+ .p2align 4
+L(dWordUnroll):
+ std r8, 16(r9)
+ ld r8, 24(r4) /* load dword,perform loop unrolling again */
+ cmpb r10, r8, r10
+ cmpdi cr7, r10, 0
+ bne cr7, L(HopBy24)
+
+ std r8, 24(r7) /* copy dword at offset=24 */
+ addi r9, r9, 32
+ addi r4, r4, 32
+ bdz L(leftDwords) /* continue with loop on counter */
+
+ ld r3, 32(r6)
+ cmpb r8, r3, r10
+ cmpdi cr7, r8, 0
+ bne cr7, L(update2)
+
+ std r3, 32(r7)
+ ld r10, 40(r6)
+ cmpb r8, r10, r8
+ cmpdi cr7, r8, 0
+ bne cr7, L(HopBy40)
+
+ mr r6, r4 /* update values */
+ mr r7, r9
+ mr r11, r0
+ mr r5, r19
+
+L(dwordCopy):
+ std r10, 8(r9) /* copy dword at offset=8 */
+ addi r19, r5, -32
+ addi r0, r11, -4
+ ld r8, 16(r4)
+ cmpb r10, r8, r12
+ cmpdi cr7, r10, 0
+ beq cr7, L(dWordUnroll)
+
+ addi r9, r9, 16 /* increment dst by 16 */
+ addi r4, r4, 16 /* increment src by 16 */
+ addi r5, r5, -16 /* decrement length 'n' by 16 */
+ addi r0, r11, -2 /* decrement loop counter */
+
+L(dWordUnrollOFF):
+ ld r10, 0(r4) /* load first dword */
+ li r8, 0 /* load mask */
+ cmpb r8, r10, r8
+ cmpdi cr7, r8, 0
+ bne cr7, L(byte_by_byte)
+ mtctr r0
+ li r7, 0
+ b L(CopyDword)
+
+ .p2align 4
+L(loadDWordandCompare):
+ ld r10, 0(r4)
+ cmpb r8, r10, r7
+ cmpdi cr7, r8, 0
+ bne cr7, L(byte_by_byte)
+
+L(CopyDword):
+ addi r9, r9, 8
+ std r10, -8(r9)
+ addi r4, r4, 8
+ addi r5, r5, -8
+ bdnz L(loadDWordandCompare)
+
+L(byte_by_byte):
+ cmpldi cr7, r5, 3
+ ble cr7, L(verifyByte)
+ srdi r10, r5, 2
+ mr r19, r9
+ mtctr r10
+ b L(firstByteUnroll)
+
+ .p2align 4
+L(bytes_unroll):
+ lbz r10, 1(r4) /* load byte from src */
+ cmpdi cr7, r10, 0 /* compare for NULL */
+ stb r10, 1(r19) /* store byte to dst */
+ beq cr7, L(updtDestComputeN2ndByte)
+
+ addi r4, r4, 4 /* advance src */
+
+ lbz r10, -2(r4) /* perform loop unrolling for byte r/w */
+ cmpdi cr7, r10, 0
+ stb r10, 2(r19)
+ beq cr7, L(updtDestComputeN3rdByte)
+
+ lbz r10, -1(r4) /* perform loop unrolling for byte r/w */
+ addi r19, r19, 4
+ cmpdi cr7, r10, 0
+ stb r10, -1(r19)
+ beq cr7, L(ComputeNByte)
+
+ bdz L(update0)
+
+L(firstByteUnroll):
+ lbz r10, 0(r4) /* perform loop unrolling for byte r/w */
+ cmpdi cr7, r10, 0
+ stb r10, 0(r19)
+ bne cr7, L(bytes_unroll)
+ addi r19, r19, 1
+
+L(ComputeNByte):
+ subf r9, r19, r9 /* compute 'n' bytes to fill */
+ add r8, r9, r5
+
+L(zeroFill):
+ cmpdi cr7, r8, 0 /* compare if length is zero */
+ beq cr7, L(update3return)
+
+ mr r3, r19 /* fill buffer with */
+ li r4, 0 /* zero fill buffer */
+ mr r5, r8 /* how many bytes to fill buffer with */
+ bl MEMSET /* call optimized memset */
+ nop
+
+L(update3return):
+#ifdef USE_AS_STPNCPY
+ addi r3, r19, -1 /* update return value */
+#endif
+
+L(hop2return):
+#ifndef USE_AS_STPNCPY
+ mr r3, r18 /* set return value */
+#endif
+ addi r1, r1, FRAMESIZE /* restore stack pointer */
+ ld r0, 16(r1) /* read the saved link register */
+ ld r18, -16(r1) /* restore callers save register, r18 */
+ ld r19, -8(r1) /* restore callers save register, r19 */
+ mtlr r0 /* restore the link register */
+ blr /* return */
+
+ .p2align 4
+L(update0):
+ mr r9, r19
+
+ .p2align 4
+L(verifyByte):
+ rldicl. r8, r5, 0, 62
+#ifdef USE_AS_STPNCPY
+ mr r3, r9
+#endif
+ beq cr0, L(hop2return)
+ mtctr r8
+ addi r4, r4, -1
+ mr r19, r9
+ b L(oneBYone)
+
+ .p2align 4
+L(proceed):
+ bdz L(done)
+
+L(oneBYone):
+ lbzu r10, 1(r4) /* copy byte */
+ addi r19, r19, 1
+ addi r8, r8, -1
+ cmpdi cr7, r10, 0
+ stb r10, -1(r19)
+ bne cr7, L(proceed)
+ b L(zeroFill)
+
+ .p2align 4
+L(done):
+ addi r1, r1, FRAMESIZE /* restore stack pointer */
+#ifdef USE_AS_STPNCPY
+ mr r3, r19 /* set the return value */
+#else
+ mr r3, r18 /* set the return value */
+#endif
+ ld r0, 16(r1) /* read the saved link register */
+ ld r18, -16(r1) /* restore callers save register, r18 */
+ ld r19, -8(r1) /* restore callers save register, r19 */
+ mtlr r0 /* restore the link register */
+ blr /* return */
+
+L(update1):
+ mr r0, r11
+ mr r19, r5
+
+ .p2align 4
+L(leftDwords):
+ cmpdi cr7, r0, 0
+ mr r5, r19
+ bne cr7, L(dWordUnrollOFF)
+ b L(byte_by_byte)
+
+ .p2align 4
+L(updtDestComputeN2ndByte):
+ addi r19, r19, 2 /* update dst by 2 */
+ subf r9, r19, r9 /* compute distance covered */
+ add r8, r9, r5
+ b L(zeroFill)
+
+ .p2align 4
+L(updtDestComputeN3rdByte):
+ addi r19, r19, 3 /* update dst by 3 */
+ subf r9, r19, r9 /* compute distance covered */
+ add r8, r9, r5
+ b L(zeroFill)
+
+ .p2align 4
+L(HopBy24):
+ addi r9, r9, 24 /* increment dst by 24 */
+ addi r4, r4, 24 /* increment src by 24 */
+ addi r5, r5, -24 /* decrement length 'n' by 24 */
+ addi r0, r11, -3 /* decrement loop counter */
+ b L(dWordUnrollOFF)
+
+ .p2align 4
+L(update2):
+ mr r5, r19
+ b L(dWordUnrollOFF)
+
+ .p2align 4
+L(HopBy40):
+ addi r9, r7, 40 /* increment dst by 40 */
+ addi r4, r6, 40 /* increment src by 40 */
+ addi r5, r5, -40 /* decrement length 'n' by 40 */
+ addi r0, r11, -5 /* decrement loop counter */
+ b L(dWordUnrollOFF)
+
+L(update3):
+ mr r0, r11
+ b L(dWordUnrollOFF)
+
+L(HopBy8):
+ addi r9, r3, 8 /* increment dst by 8 */
+ addi r4, r4, 8 /* increment src by 8 */
+ addi r5, r5, -8 /* decrement length 'n' by 8 */
+ addi r0, r11, -1 /* decrement loop counter */
+ b L(dWordUnrollOFF)
+
+L(unaligned):
+ cmpdi r5, 16 /* Proceed byte by byte for n <= 16 */
+ ble L(byte_by_byte)
+ rldicl r7, r3, 0, 61
+ rldicl r6, r4, 0, 61
+ cmpdi r6, 0 /* Check src alignment */
+ beq L(srcaligndstunalign)
+ /* src is unaligned */
+ rlwinm r10, r4, 3,26,28 /* Calculate padding. */
+ clrrdi r4, r4, 3 /* Align the addr to dw boundary */
+ ld r8, 0(r4) /* Load doubleword from memory. */
+ li r0, 0
+ /* Discard bits not part of the string */
+#ifdef __LITTLE_ENDIAN__
+ srd r7, r8, r10
+#else
+ sld r7, r8, r10
+#endif
+ cmpb r0, r7, r0 /* Compare each byte against null */
+ /* Discard bits not part of the string */
+#ifdef __LITTLE_ENDIAN__
+ sld r0, r0, r10
+#else
+ srd r0, r0, r10
+#endif
+ cmpdi r0, 0
+ bne L(bytebybyte) /* if it has null, copy byte by byte */
+ subfic r6, r6, 8
+ rlwinm r12, r3, 3,26,28 /* Calculate padding in bits. */
+ rldicl r9, r3, 0, 61 /* Calculate padding in bytes. */
+ addi r3, r3, -1
+
+ cmpdi r12, 0 /* check dest alignment */
+ beq L(srcunaligndstalign)
+
+ /* both src and dst unaligned */
+#ifdef __LITTLE_ENDIAN__
+ sld r8, r7, r10
+ mr r11, r10
+ addi r11, r11, -8 /* Adjust byte pointer on loaded dw */
+#else
+ srd r8, r7, r10
+ subfic r11, r10, 64
+#endif
+ /* Is dst alignment greater than src alignment? */
+ cmpd cr7, r12, r10
+ ble cr7, L(dst_align_small)
+ /* src alignment is less than dst */
+
+ /* Calculate the dst alignment difference */
+ subfic r7, r9, 8
+ mtctr r7
+
+ /* Write until dst is aligned */
+ cmpdi r0, r7, 4
+ blt L(storebyte1) /* less than 4, store byte by byte */
+ beq L(equal1) /* if its 4, store word */
+ addi r0, r7, -4 /* greater than 4, so stb and stw */
+ mtctr r0
+L(storebyte1):
+#ifdef __LITTLE_ENDIAN__
+ addi r11, r11, 8 /* Adjust byte pointer on loaded dw */
+#else
+ addi r11, r11, -8
+#endif
+ srd r7, r8, r11
+ stbu r7, 1(r3)
+ addi r5, r5, -1
+ bdnz L(storebyte1)
+
+ subfic r7, r9, 8 /* Check the remaining bytes */
+ cmpdi r0, r7, 4
+ blt L(proceed1)
+
+ .align 4
+L(equal1):
+#ifdef __LITTLE_ENDIAN__
+ addi r11, r11, 8 /* Adjust byte pointer on loaded dw */
+ srd r7, r8, r11
+#else
+ subfic r11, r11, 64
+ sld r7, r8, r11
+ srdi r7, r7, 32
+#endif
+ stw r7, 1(r3)
+ addi r3, r3, 4
+ addi r5, r5, -4
+
+L(proceed1):
+ mr r7, r8
+ /* Calculate the leftover bytes to be written */
+ subfic r11, r10, 64
+ subfic r12, r12, 64
+ subf r12, r12, r11 /* remaining bytes on second dw */
+ subfic r10, r12, 64 /* remaining bytes on first dw */
+ subfic r9, r9, 8
+ subf r6, r9, r6 /* recalculate padding */
+L(srcunaligndstalign):
+ addi r3, r3, 1
+ subfic r12, r10, 64 /* remaining bytes on second dw */
+ addi r4, r4, 8
+ li r0,0
+ b L(storedouble)
+
+ .align 4
+L(dst_align_small):
+ mtctr r6
+ /* Write until src is aligned */
+L(storebyte2):
+#ifdef __LITTLE_ENDIAN__
+ addi r11, r11, 8 /* Adjust byte pointer on dw */
+#else
+ addi r11, r11, -8
+#endif
+ srd r7, r8, r11
+ stbu r7, 1(r3)
+ addi r5, r5, -1
+ bdnz L(storebyte2)
+
+ addi r4, r4, 8 /* Increment src pointer */
+ addi r3, r3, 1 /* Increment dst pointer */
+ mr r9, r3
+ li r8, 0
+ cmpd cr7, r12, r10
+ beq cr7, L(aligned)
+ rldicl r6, r3, 0, 61 /* Recalculate padding */
+ mr r7, r6
+
+ /* src is aligned */
+L(srcaligndstunalign):
+ mr r9, r3
+ mr r6, r7
+ ld r8, 0(r4)
+ subfic r10, r7, 8
+ mr r7, r8
+ li r0, 0 /* Check null */
+ cmpb r0, r8, r0
+ cmpdi r0, 0
+ bne L(byte_by_byte) /* Do byte by byte if there is NULL */
+ rlwinm r12, r3, 3,26,28 /* Calculate padding */
+ addi r3, r3, -1
+ /* write byte by byte until aligned */
+#ifdef __LITTLE_ENDIAN__
+ li r11, -8
+#else
+ li r11, 64
+#endif
+ mtctr r10
+ cmpdi r0, r10, 4
+ blt L(storebyte)
+ beq L(equal)
+ addi r0, r10, -4
+ mtctr r0
+L(storebyte):
+#ifdef __LITTLE_ENDIAN__
+ addi r11, r11, 8 /* Adjust byte pointer on dw */
+#else
+ addi r11, r11, -8
+#endif
+ srd r7, r8, r11
+ stbu r7, 1(r3)
+ addi r5, r5, -1
+ bdnz L(storebyte)
+
+ cmpdi r0, r10, 4
+ blt L(align)
+
+ .align 4
+L(equal):
+#ifdef __LITTLE_ENDIAN__
+ addi r11, r11, 8
+ srd r7, r8, r11
+#else
+ subfic r11, r11, 64
+ sld r7, r8, r11
+ srdi r7, r7, 32
+#endif
+ stw r7, 1(r3)
+ addi r5, r5, -4
+ addi r3, r3, 4
+L(align):
+ addi r3, r3, 1
+ addi r4, r4, 8 /* Increment src pointer */
+ subfic r10, r12, 64
+ li r0, 0
+ /* dst addr aligned to 8 */
+L(storedouble):
+ cmpdi r5, 8
+ ble L(null1)
+ ld r7, 0(r4) /* load next dw */
+ cmpb r0, r7, r0
+ cmpdi r0, 0 /* check for null on each new dw */
+ bne L(null)
+#ifdef __LITTLE_ENDIAN__
+ srd r9, r8, r10 /* bytes from first dw */
+ sld r11, r7, r12 /* bytes from second dw */
+#else
+ sld r9, r8, r10
+ srd r11, r7, r12
+#endif
+ or r11, r9, r11 /* make as a single dw */
+ std r11, 0(r3) /* store as std on aligned addr */
+ mr r8, r7 /* still few bytes left to be written */
+ addi r3, r3, 8 /* increment dst addr */
+ addi r4, r4, 8 /* increment src addr */
+ addi r5, r5, -8
+ b L(storedouble) /* Loop until NULL */
+
+ .align 4
+
+/* We've hit the end of the string. Do the rest byte-by-byte. */
+L(null):
+ addi r3, r3, -1
+ mr r10, r12
+ mtctr r6
+#ifdef __LITTLE_ENDIAN__
+ subfic r10, r10, 64
+ addi r10, r10, -8
+#endif
+ cmpdi r0, r5, 4
+ blt L(loop)
+ cmpdi r0, r6, 4
+ blt L(loop)
+
+ /* we can still use stw if leftover >= 4 */
+#ifdef __LITTLE_ENDIAN__
+ addi r10, r10, 8
+ srd r11, r8, r10
+#else
+ subfic r10, r10, 64
+ sld r11, r8, r10
+ srdi r11, r11, 32
+#endif
+ stw r11, 1(r3)
+ addi r5, r5, -4
+ addi r3, r3, 4
+ cmpdi r0, r5, 0
+ beq L(g1)
+ cmpdi r0, r6, 4
+ beq L(bytebybyte1)
+ addi r10, r10, 32
+#ifdef __LITTLE_ENDIAN__
+ addi r10, r10, -8
+#else
+ subfic r10, r10, 64
+#endif
+ addi r0, r6, -4
+ mtctr r0
+ /* remaining byte by byte part of first dw */
+L(loop):
+#ifdef __LITTLE_ENDIAN__
+ addi r10, r10, 8
+#else
+ addi r10, r10, -8
+#endif
+ srd r0, r8, r10
+ stbu r0, 1(r3)
+ addi r5, r5, -1
+ cmpdi r0, r5, 0
+ beq L(g1)
+ bdnz L(loop)
+L(bytebybyte1):
+ addi r3, r3, 1
+ /* remaining byte by byte part of second dw */
+L(bytebybyte):
+ addi r3, r3, -8
+ addi r4, r4, -1
+
+#ifdef __LITTLE_ENDIAN__
+ extrdi. r0, r7, 8, 56
+ stbu r7, 8(r3)
+ addi r5, r5, -1
+ beq L(g2)
+ cmpdi r5, 0
+ beq L(g1)
+ extrdi. r0, r7, 8, 48
+ stbu r0, 1(r3)
+ addi r5, r5, -1
+ beq L(g2)
+ cmpdi r5, 0
+ beq L(g1)
+ extrdi. r0, r7, 8, 40
+ stbu r0, 1(r3)
+ addi r5, r5, -1
+ beq L(g2)
+ cmpdi r5, 0
+ beq L(g1)
+ extrdi. r0, r7, 8, 32
+ stbu r0, 1(r3)
+ addi r5, r5, -1
+ beq L(g2)
+ cmpdi r5, 0
+ beq L(g1)
+ extrdi. r0, r7, 8, 24
+ stbu r0, 1(r3)
+ addi r5, r5, -1
+ beq L(g2)
+ cmpdi r5, 0
+ beq L(g1)
+ extrdi. r0, r7, 8, 16
+ stbu r0, 1(r3)
+ addi r5, r5, -1
+ beq L(g2)
+ cmpdi r5, 0
+ beq L(g1)
+ extrdi. r0, r7, 8, 8
+ stbu r0, 1(r3)
+ addi r5, r5, -1
+ beq L(g2)
+ cmpdi r5, 0
+ beq L(g1)
+ extrdi r0, r7, 8, 0
+ stbu r0, 1(r3)
+ addi r5, r5, -1
+ b L(g2)
+#else
+ extrdi. r0, r7, 8, 0
+ stbu r0, 8(r3)
+ addi r5, r5, -1
+ beq L(g2)
+ cmpdi r5, 0
+ beq L(g1)
+ extrdi. r0, r7, 8, 8
+ stbu r0, 1(r3)
+ addi r5, r5, -1
+ beq L(g2)
+ cmpdi r5, 0
+ beq L(g1)
+ extrdi. r0, r7, 8, 16
+ stbu r0, 1(r3)
+ addi r5, r5, -1
+ beq L(g2)
+ cmpdi r5, 0
+ beq L(g1)
+ extrdi. r0, r7, 8, 24
+ stbu r0, 1(r3)
+ addi r5, r5, -1
+ beq L(g2)
+ cmpdi r5, 0
+ beq L(g1)
+ extrdi. r0, r7, 8, 32
+ stbu r0, 1(r3)
+ addi r5, r5, -1
+ beq L(g2)
+ cmpdi r5, 0
+ beq L(g1)
+ extrdi. r0, r7, 8, 40
+ stbu r0, 1(r3)
+ addi r5, r5, -1
+ beq L(g2)
+ cmpdi r5, 0
+ beq L(g1)
+ extrdi. r0, r7, 8, 48
+ stbu r0, 1(r3)
+ addi r5, r5, -1
+ beq L(g2)
+ cmpdi r5, 0
+ beq L(g1)
+ stbu r7, 1(r3)
+ addi r5, r5, -1
+ b L(g2)
+#endif
+L(g1):
+#ifdef USE_AS_STPNCPY
+ addi r3, r3, 1
+#endif
+L(g2):
+ addi r3, r3, 1
+ mr r19, r3
+ mr r8, r5
+ b L(zeroFill)
+L(null1):
+ mr r9, r3
+ subf r4, r6, r4
+ b L(byte_by_byte)
+END(FUNC_NAME)
+#ifndef USE_AS_STPNCPY
+libc_hidden_builtin_def (strncpy)
+#endif
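
Stripping away the alignment machinery, the routine above has this overall C shape (a sketch, not the actual code): copy up to the null byte within the first n bytes, then let memset pad the remainder with zeros, exactly as the L(zeroFill) path does.

    #include <string.h>

    char *
    strncpy_sketch (char *dst, const char *src, size_t n)
    {
      size_t copied = strnlen (src, n);  /* bytes before null, or n */
      memcpy (dst, src, copied);
      memset (dst + copied, '\0', n - copied);  /* the L(zeroFill) pad */
      return dst;  /* USE_AS_STPNCPY would return dst + copied.  */
    }
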
diff --git a/sysdeps/powerpc/powerpc64/power7/strnlen.S b/sysdeps/powerpc/powerpc64/power7/strnlen.S
index 7993dae69e..2de267fd35 100644
--- a/sysdeps/powerpc/powerpc64/power7/strnlen.S
+++ b/sysdeps/powerpc/powerpc64/power7/strnlen.S
@@ -1,5 +1,5 @@
/* Optimized strnlen implementation for PowerPC64/POWER7 using cmpb insn.
- Copyright (C) 2010-2014 Free Software Foundation, Inc.
+ Copyright (C) 2010-2015 Free Software Foundation, Inc.
Contributed by Luis Machado <luisgpm@br.ibm.com>.
This file is part of the GNU C Library.
diff --git a/sysdeps/powerpc/powerpc64/power7/strrchr.S b/sysdeps/powerpc/powerpc64/power7/strrchr.S
new file mode 100644
index 0000000000..68565c68bc
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/strrchr.S
@@ -0,0 +1,255 @@
+/* Optimized strrchr implementation for PowerPC64/POWER7 using cmpb insn.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* char * [r3] strrchr (char *s [r3], int c [r4]) */
+ .machine power7
+ENTRY (strrchr)
+ CALL_MCOUNT 2
+ dcbt 0,r3
+ clrrdi r8,r3,3 /* Align the address to doubleword boundary. */
+ cmpdi cr7,r4,0
+ ld r12,0(r8) /* Load doubleword from memory. */
+ li r9,0 /* used to store last occurrence */
+ li r0,0 /* Doubleword with null chars to use
+ with cmpb. */
+
+ rlwinm r6,r3,3,26,28 /* Calculate padding. */
+
+ beq cr7,L(null_match)
+
+ /* Replicate byte to doubleword. */
+ insrdi r4,r4,8,48
+ insrdi r4,r4,16,32
+ insrdi r4,r4,32,0
+
+ /* r4 is changed now; if it was passed with more chars,
+ check for null again */
+ cmpdi cr7,r4,0
+ beq cr7,L(null_match)
+ /* Now r4 has a doubleword of c bytes and r0 has
+ a doubleword of null bytes. */
+
+ cmpb r10,r12,r4 /* Compare each byte against c byte. */
+ cmpb r11,r12,r0 /* Compare each byte against null byte. */
+
+ /* Move the doublewords left and right to discard the bits that are
+ not part of the string and bring them back as zeros. */
+#ifdef __LITTLE_ENDIAN__
+ srd r10,r10,r6
+ srd r11,r11,r6
+ sld r10,r10,r6
+ sld r11,r11,r6
+#else
+ sld r10,r10,r6
+ sld r11,r11,r6
+ srd r10,r10,r6
+ srd r11,r11,r6
+#endif
+ or r5,r10,r11 /* OR the results to speed things up. */
+ cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes
+ have been found. */
+ bne cr7,L(done)
+
+L(align):
+ mtcrf 0x01,r8
+
+ /* Are we now aligned to a doubleword boundary? If so, skip to
+ the main loop. Otherwise, go through the alignment code. */
+
+ bt 28,L(loop)
+
+ /* Handle WORD2 of pair. */
+ ldu r12,8(r8)
+ cmpb r10,r12,r4
+ cmpb r11,r12,r0
+ or r5,r10,r11
+ cmpdi cr7,r5,0
+ bne cr7,L(done)
+ b L(loop) /* We branch here (rather than falling through)
+ to skip the nops due to heavy alignment
+ of the loop below. */
+ .p2align 5
+L(loop):
+ /* Load two doublewords, compare and merge in a
+ single register for speed. This is an attempt
+ to speed up the null-checking process for bigger strings. */
+ ld r12,8(r8)
+ ldu r7,16(r8)
+ cmpb r10,r12,r4
+ cmpb r11,r12,r0
+ cmpb r6,r7,r4
+ cmpb r7,r7,r0
+ or r12,r10,r11
+ or r5,r6,r7
+ or r5,r12,r5
+ cmpdi cr7,r5,0
+ beq cr7,L(loop)
+
+ /* OK, one (or both) of the doublewords contains a c/null byte. Check
+ the first doubleword and decrement the address in case the first
+ doubleword really contains a c/null byte. */
+ cmpdi cr6,r12,0
+ addi r8,r8,-8
+ bne cr6,L(done)
+
+ /* The c/null byte must be in the second doubleword. Adjust the
+ address again and move the result of cmpb to r10 so we can calculate
+ the pointer. */
+
+ mr r10,r6
+ mr r11,r7
+ addi r8,r8,8
+
+ /* r10/r11 have the output of the cmpb instructions, that is,
+ 0xff in the same position as the c/null byte in the original
+ doubleword from the string. Use that to calculate the pointer. */
+
+L(done):
+ /* If there is more than one 0xff in r11, find the first position
+ of 0xff in r11 and fill r10 with 0 from that position onward */
+ cmpdi cr7,r11,0
+ beq cr7,L(no_null)
+#ifdef __LITTLE_ENDIAN__
+ addi r3,r11,-1
+ andc r3,r3,r11
+ popcntd r0,r3
+#else
+ cntlzd r0,r11
+#endif
+ subfic r0,r0,63
+ li r6,-1
+#ifdef __LITTLE_ENDIAN__
+ srd r0,r6,r0
+#else
+ sld r0,r6,r0
+#endif
+ and r10,r0,r10
+L(no_null):
+#ifdef __LITTLE_ENDIAN__
+ cntlzd r0,r10 /* Count leading zeros before c matches. */
+ addi r3,r10,-1
+ andc r3,r3,r10
+ addi r10,r11,-1
+ andc r10,r10,r11
+ cmpld cr7,r3,r10
+ bgt cr7,L(no_match)
+#else
+ addi r3,r10,-1 /* Count trailing zeros before c matches. */
+ andc r3,r3,r10
+ popcntd r0,r3
+ cmpld cr7,r11,r10
+ bgt cr7,L(no_match)
+#endif
+ srdi r0,r0,3 /* Convert trailing zeros to bytes. */
+ subfic r0,r0,7
+ add r9,r8,r0 /* Return address of the matching c byte
+ or null in case c was not found. */
+ li r0,0
+ cmpdi cr7,r11,0 /* If r11 == 0, no null's have been found. */
+ beq cr7,L(align)
+
+ .align 4
+L(no_match):
+ mr r3,r9
+ blr
+
+/* We are here because strrchr was called with a null byte. */
+ .align 4
+L(null_match):
+ /* r0 has a doubleword of null bytes. */
+
+ cmpb r5,r12,r0 /* Compare each byte against null bytes. */
+
+ /* Move the doublewords left and right to discard the bits that are
+ not part of the string and bring them back as zeros. */
+#ifdef __LITTLE_ENDIAN__
+ srd r5,r5,r6
+ sld r5,r5,r6
+#else
+ sld r5,r5,r6
+ srd r5,r5,r6
+#endif
+ cmpdi cr7,r5,0 /* If r5 == 0, no null bytes
+ have been found. */
+ bne cr7,L(done_null)
+
+ mtcrf 0x01,r8
+
+ /* Are we now aligned to a quadword boundary? If so, skip to
+ the main loop. Otherwise, go through the alignment code. */
+
+ bt 28,L(loop_null)
+
+ /* Handle WORD2 of pair. */
+ ldu r12,8(r8)
+ cmpb r5,r12,r0
+ cmpdi cr7,r5,0
+ bne cr7,L(done_null)
+ b L(loop_null) /* We branch here (rather than falling through)
+ to skip the nops due to heavy alignment
+ of the loop below. */
+
+ /* Main loop to look for the end of the string. Since it's a
+ small loop (< 8 instructions), align it to 32-bytes. */
+ .p2align 5
+L(loop_null):
+ /* Load two doublewords, compare and merge in a
+ single register for speed. This is an attempt
+ to speed up the null-checking process for bigger strings. */
+ ld r12,8(r8)
+ ldu r11,16(r8)
+ cmpb r5,r12,r0
+ cmpb r10,r11,r0
+ or r6,r5,r10
+ cmpdi cr7,r6,0
+ beq cr7,L(loop_null)
+
+ /* OK, one (or both) of the doublewords contains a null byte. Check
+ the first doubleword and decrement the address in case the first
+ doubleword really contains a null byte. */
+
+ cmpdi cr6,r5,0
+ addi r8,r8,-8
+ bne cr6,L(done_null)
+
+ /* The null byte must be in the second doubleword. Adjust the address
+ again and move the result of cmpb to r10 so we can calculate the
+ pointer. */
+
+ mr r5,r10
+ addi r8,r8,8
+
+ /* r5 has the output of the cmpb instruction, that is, it contains
+ 0xff in the same position as the null byte in the original
+ doubleword from the string. Use that to calculate the pointer. */
+L(done_null):
+#ifdef __LITTLE_ENDIAN__
+ addi r0,r5,-1
+ andc r0,r0,r5
+ popcntd r0,r0
+#else
+ cntlzd r0,r5 /* Count leading zeros before the match. */
+#endif
+ srdi r0,r0,3 /* Convert trailing zeros to bytes. */
+ add r3,r8,r0 /* Return address of the matching null byte. */
+ blr
+END (strrchr)
+weak_alias (strrchr, rindex)
+libc_hidden_builtin_def (strrchr)
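
Byte for byte, the scan above is equivalent to the following C sketch; the assembly gets its speed by matching a doubleword at a time with cmpb and keeping only the address of the last c match (in r9) until the null byte is seen.

    #include <stddef.h>

    char *
    strrchr_sketch (const char *s, int c)
    {
      const char *last = NULL;
      do
        if (*s == (char) c)
          last = s;             /* remember the latest match */
      while (*s++ != '\0');
      return (char *) last;     /* NULL if c never appeared */
    }
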
diff --git a/sysdeps/powerpc/powerpc64/power7/strstr.S b/sysdeps/powerpc/powerpc64/power7/strstr.S
new file mode 100644
index 0000000000..8dca31ce35
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/strstr.S
@@ -0,0 +1,509 @@
+/* Optimized strstr implementation for PowerPC64/POWER7.
+ Copyright (C) 2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* char * [r3] strstr (char *s [r3], char *pat [r4]) */
+
+/* The performance gain is obtained by using aligned memory accesses,
+ * doubleword loads, and the cmpb instruction for quicker comparisons. */
+
+#ifndef STRLEN
+/* For builds with no IFUNC support, local calls should be made to internal
+ GLIBC symbol (created by libc_hidden_builtin_def). */
+# ifdef SHARED
+# define STRLEN __GI_strlen
+# else
+# define STRLEN strlen
+# endif
+#endif
+
+#ifndef STRNLEN
+/* For builds with no IFUNC support, local calls should be made to internal
+ GLIBC symbol (created by libc_hidden_builtin_def). */
+# ifdef SHARED
+# define STRNLEN __GI_strnlen
+# else
+# define STRNLEN strnlen
+# endif
+#endif
+
+#ifndef STRCHR
+# ifdef SHARED
+# define STRCHR __GI_strchr
+# else
+# define STRCHR strchr
+# endif
+#endif
+
+#define FRAMESIZE (FRAME_MIN_SIZE+32)
+ .machine power7
+EALIGN (strstr, 4, 0)
+ CALL_MCOUNT 2
+ mflr r0 /* Load link register LR to r0. */
+ std r31, -8(r1) /* Save callers register r31. */
+ cfi_offset(r31, -8)
+ std r30, -16(r1) /* Save callers register r30. */
+ cfi_offset(r30, -16)
+ std r29, -24(r1) /* Save callers register r29. */
+ cfi_offset(r29, -24)
+ std r0, 16(r1) /* Store the link register. */
+ cfi_offset(lr, 16)
+ stdu r1, -FRAMESIZE(r1) /* Create the stack frame. */
+ cfi_adjust_cfa_offset(FRAMESIZE)
+
+ dcbt 0, r3
+ dcbt 0, r4
+
+ cmpdi cr7, r3, 0
+ beq cr7, L(retnull)
+ cmpdi cr7, r4, 0
+ beq cr7, L(retnull)
+
+ mr r29, r3
+ mr r30, r4
+ mr r3, r4
+ bl STRLEN
+ nop
+
+ cmpdi cr7, r3, 0 /* If search str is null. */
+ beq cr7, L(ret_r3)
+
+ /* Call __strstr_ppc if needle len > 2048 */
+ cmpdi cr7, r3, 2048
+ bgt cr7, L(default)
+
+ mr r31, r3
+ mr r4, r3
+ mr r3, r29
+ bl STRNLEN
+ nop
+
+ cmpd cr7, r3, r31 /* If len(r3) < len(r4). */
+ blt cr7, L(retnull)
+ mr r3, r29
+ lbz r4, 0(r30)
+ bl STRCHR
+ nop
+
+ mr r11, r3
+ /* If first char of search str is not present. */
+ cmpdi cr7, r3, 0
+ ble cr7, L(end)
+
+ rldicl r8, r3, 0, 52 /* Page cross check. */
+ cmpldi cr7, r8, 4096-16
+ bgt cr7, L(bytebybyte)
+
+ rldicl r8, r30, 0, 52
+ cmpldi cr7, r8, 4096-16
+ bgt cr7, L(bytebybyte)
+
+ /* If len(r4) < 8 handle in a different way. */
+ /* Shift position based on null and use cmpb. */
+ cmpdi cr7, r31, 8
+ blt cr7, L(lessthan8)
+
+ /* Len(r4) >= 8 reaches here. */
+ mr r8, r3 /* Save r3 for future use. */
+ mr r4, r30 /* Restore r4. */
+ li r0, 0
+ rlwinm r10, r30, 3, 26, 28 /* Calculate padding in bits. */
+ clrrdi r4, r4, 3 /* Make r4 aligned to 8. */
+ ld r6, 0(r4)
+ addi r4, r4, 8
+ cmpdi cr7, r10, 0 /* Check if it is already aligned. */
+ beq cr7, L(begin1)
+#ifdef __LITTLE_ENDIAN__
+ srd r6, r6, r10 /* Discard unwanted bits. */
+#else
+ sld r6, r6, r10
+#endif
+ ld r9, 0(r4)
+ subfic r10, r10, 64
+#ifdef __LITTLE_ENDIAN__
+ sld r9, r9, r10 /* Discard unwanted bits. */
+#else
+ srd r9, r9, r10
+#endif
+ or r6, r6, r9 /* Form complete search str. */
+L(begin1):
+ mr r29, r6
+ rlwinm r10, r3, 3, 26, 28
+ clrrdi r3, r3, 3
+ ld r5, 0(r3)
+ cmpb r9, r0, r6 /* Check if input has null. */
+ cmpdi cr7, r9, 0
+ bne cr7, L(return3)
+ cmpb r9, r0, r5 /* Check if input has null. */
+#ifdef __LITTLE_ENDIAN__
+ srd r9, r9, r10
+#else
+ sld r9, r9, r10
+#endif
+ cmpdi cr7, r9, 0
+ bne cr7, L(retnull)
+
+ li r12, -8 /* Shift values. */
+ li r11, 72 /* Shift values. */
+ cmpdi cr7, r10, 0
+ beq cr7, L(nextbyte1)
+ mr r12, r10
+ addi r12, r12, -8
+ subfic r11, r12, 64
+
+L(nextbyte1):
+ ldu r7, 8(r3) /* Load next dw. */
+ addi r12, r12, 8 /* Shift one byte and compare. */
+ addi r11, r11, -8
+#ifdef __LITTLE_ENDIAN__
+ srd r9, r5, r12 /* Rotate based on mask. */
+ sld r10, r7, r11
+#else
+ sld r9, r5, r12
+ srd r10, r7, r11
+#endif
+ /* Form single dw from few bytes on first load and second load. */
+ or r10, r9, r10
+ /* Check for null in the formed dw. */
+ cmpb r9, r0, r10
+ cmpdi cr7, r9, 0
+ bne cr7, L(retnull)
+ /* Cmpb search str and input str. */
+ cmpb r9, r10, r6
+ cmpdi cr7, r9, -1
+ beq cr7, L(match)
+ addi r8, r8, 1
+ b L(begin)
+
+ .align 4
+L(match):
+ /* There is a match of 8 bytes, check next bytes. */
+ cmpdi cr7, r31, 8
+ beq cr7, L(return)
+ /* Update next starting point r8. */
+ srdi r9, r11, 3
+ subf r9, r9, r3
+ mr r8, r9
+
+L(secondmatch):
+ mr r5, r7
+ rlwinm r10, r30, 3, 26, 28 /* Calculate padding in bits. */
+ ld r6, 0(r4)
+ addi r4, r4, 8
+ cmpdi cr7, r10, 0 /* Check if it is already aligned. */
+ beq cr7, L(proceed3)
+#ifdef __LITTLE_ENDIAN__
+ srd r6, r6, r10 /* Discard unwanted bits. */
+ cmpb r9, r0, r6
+ sld r9, r9, r10
+#else
+ sld r6, r6, r10
+ cmpb r9, r0, r6
+ srd r9, r9, r10
+#endif
+ cmpdi cr7, r9, 0
+ bne cr7, L(proceed3)
+ ld r9, 0(r4)
+ subfic r10, r10, 64
+#ifdef __LITTLE_ENDIAN__
+ sld r9, r9, r10 /* Discard unwanted bits. */
+#else
+ srd r9, r9, r10
+#endif
+ or r6, r6, r9 /* Form complete search str. */
+
+L(proceed3):
+ li r7, 0
+ addi r3, r3, 8
+ cmpb r9, r0, r5
+ cmpdi cr7, r9, 0
+ bne cr7, L(proceed4)
+ ld r7, 0(r3)
+L(proceed4):
+#ifdef __LITTLE_ENDIAN__
+ srd r9, r5, r12
+ sld r10, r7, r11
+#else
+ sld r9, r5, r12
+ srd r10, r7, r11
+#endif
+ /* Form single dw with few bytes from first and second load. */
+ or r10, r9, r10
+ cmpb r9, r0, r6
+ cmpdi cr7, r9, 0
+ bne cr7, L(return4)
+ /* Check for null in the formed dw. */
+ cmpb r9, r0, r10
+ cmpdi cr7, r9, 0
+ bne cr7, L(retnull)
+ /* If the next 8 bytes don't match, start the search again. */
+ cmpb r9, r10, r6
+ cmpdi cr7, r9, -1
+ bne cr7, L(reset)
+ /* If the next 8 bytes match, load and compare next 8. */
+ b L(secondmatch)
+
+ .align 4
+L(reset):
+ /* Start the search again. */
+ addi r8, r8, 1
+ b L(begin)
+
+ .align 4
+L(return3):
+ /* Count leading zeros and compare partial dw. */
+#ifdef __LITTLE_ENDIAN__
+ addi r7, r9, -1
+ andc r7, r7, r9
+ popcntd r7, r7
+ subfic r7, r7, 64
+ sld r10, r5, r7
+ sld r6, r6, r7
+#else
+ cntlzd r7, r9
+ subfic r7, r7, 64
+ srd r10, r5, r7
+ srd r6, r6, r7
+#endif
+ cmpb r9, r10, r6
+ cmpdi cr7, r9, -1
+ addi r8, r8, 1
+ /* Start search again if there is no match. */
+ bne cr7, L(begin)
+ /* If the words match, update return values. */
+ subfic r7, r7, 64
+ srdi r7, r7, 3
+ add r3, r3, r7
+ subf r3, r31, r3
+ b L(end)
+
+ .align 4
+L(return4):
+ /* Count leading zeros and compare partial dw. */
+#ifdef __LITTLE_ENDIAN__
+ addi r7, r9, -1
+ andc r7, r7, r9
+ popcntd r7, r7
+ subfic r7, r7, 64
+ sld r10, r10, r7
+ sld r6, r6, r7
+#else
+ cntlzd r7, r9
+ subfic r7, r7, 64
+ srd r10, r10, r7
+ srd r6, r6, r7
+#endif
+ cmpb r9, r10, r6
+ cmpdi cr7, r9, -1
+ addi r8, r8, 1
+ bne cr7, L(begin)
+ subfic r7, r7, 64
+ srdi r11, r11, 3
+ subf r3, r11, r3
+ srdi r7, r7, 3
+ add r3, r3, r7
+ subf r3, r31, r3
+ b L(end)
+
+ .align 4
+L(begin):
+ mr r3, r8
+ lbz r4, 0(r30)
+ bl STRCHR
+ nop
+ /* If first char of search str is not present. */
+ cmpdi cr7, r3, 0
+ ble cr7, L(end)
+ mr r8, r3
+ mr r4, r30 /* Restore r4. */
+ li r0, 0
+ mr r6, r29
+ clrrdi r4, r4, 3
+ addi r4, r4, 8
+ b L(begin1)
+
+ /* Handle less than 8 search string. */
+ .align 4
+L(lessthan8):
+ mr r4, r3
+ mr r9, r30
+ li r0, 0
+
+ rlwinm r10, r9, 3, 26, 28 /* Calculate padding in bits. */
+ srdi r8, r10, 3 /* Padding in bytes. */
+ clrrdi r9, r9, 3 /* Make r9 aligned to 8. */
+ ld r6, 0(r9)
+ cmpdi cr7, r10, 0 /* Check if it is already aligned. */
+ beq cr7, L(proceed2)
+#ifdef __LITTLE_ENDIAN__
+ srd r6, r6, r10 /* Discard unwanted bits. */
+#else
+ sld r6, r6, r10
+#endif
+ subfic r8, r8, 8
+ cmpd cr7, r8, r31 /* Next load needed? */
+ bge cr7, L(proceed2)
+ ld r7, 8(r9)
+ subfic r10, r10, 64
+#ifdef __LITTLE_ENDIAN__
+ sld r7, r7, r10 /* Discard unwanted bits. */
+#else
+ srd r7, r7, r10
+#endif
+ or r6, r6, r7 /* Form complete search str. */
+L(proceed2):
+ mr r29, r6
+ rlwinm r10, r3, 3, 26, 28
+ clrrdi r7, r3, 3 /* Make r3 aligned. */
+ ld r5, 0(r7)
+ sldi r8, r31, 3
+ subfic r8, r8, 64
+#ifdef __LITTLE_ENDIAN__
+ sld r6, r6, r8
+ cmpb r9, r0, r5
+ srd r9, r9, r10
+#else
+ srd r6, r6, r8
+ cmpb r9, r0, r5
+ sld r9, r9, r10
+#endif
+ cmpdi cr7, r9, 0
+ bne cr7, L(noload)
+ cmpdi cr7, r10, 0
+ beq cr7, L(continue)
+ ld r7, 8(r7)
+L(continue1):
+ mr r12, r10
+ addi r12, r12, -8
+ subfic r11, r12, 64
+ b L(nextbyte)
+
+ .align 4
+L(continue):
+ ld r7, 8(r7)
+ li r12, -8 /* Shift values. */
+ li r11, 72 /* Shift values. */
+L(nextbyte):
+ addi r12, r12, 8 /* Mask for rotation. */
+ addi r11, r11, -8
+#ifdef __LITTLE_ENDIAN__
+ srd r9, r5, r12
+ sld r10, r7, r11
+ or r10, r9, r10
+ sld r10, r10, r8
+ cmpb r9, r0, r10
+ srd r9, r9, r8
+#else
+ sld r9, r5, r12
+ srd r10, r7, r11
+ or r10, r9, r10
+ srd r10, r10, r8
+ cmpb r9, r0, r10
+ sld r9, r9, r8
+#endif
+ cmpdi cr7, r9, 0
+ bne cr7, L(retnull)
+ cmpb r9, r10, r6
+ cmpdi cr7, r9, -1
+ beq cr7, L(end)
+ addi r3, r4, 1
+ lbz r4, 0(r30)
+ bl STRCHR
+ nop
+ /* If first char of search str is not present. */
+ cmpdi cr7, r3, 0
+ ble cr7, L(end)
+ mr r4, r3
+ mr r6, r29
+ li r0, 0
+ b L(proceed2)
+
+ .align 4
+L(noload):
+ /* Reached null in r3, so skip next load. */
+ li r7, 0
+ b L(continue1)
+
+ .align 4
+L(return):
+ /* Update return values. */
+ srdi r9, r11, 3
+ subf r3, r9, r3
+ b L(end)
+
+ /* Handling byte by byte. */
+ .align 4
+L(bytebybyte):
+ mr r8, r3
+ addi r8, r8, -1
+L(loop1):
+ addi r8, r8, 1
+ mr r3, r8
+ mr r4, r30
+ lbz r6, 0(r4)
+ cmpdi cr7, r6, 0
+ beq cr7, L(updater3)
+L(loop):
+ lbz r5, 0(r3)
+ cmpdi cr7, r5, 0
+ beq cr7, L(retnull)
+ cmpld cr7, r6, r5
+ bne cr7, L(loop1)
+ addi r3, r3, 1
+ addi r4, r4, 1
+ lbz r6, 0(r4)
+ cmpdi cr7, r6, 0
+ beq cr7, L(updater3)
+ b L(loop)
+
+ /* Handling return values. */
+ .align 4
+L(updater3):
+ subf r3, r31, r3 /* Subtract the search-string length from r3. */
+ b L(end)
+
+ .align 4
+L(ret_r3):
+ mr r3, r29 /* Return r3. */
+ b L(end)
+
+ .align 4
+L(retnull):
+ li r3, 0 /* Return NULL. */
+ b L(end)
+
+ .align 4
+L(default):
+ mr r3, r29
+ mr r4, r30
+ bl __strstr_ppc
+ nop
+
+ .align 4
+L(end):
+ addi r1, r1, FRAMESIZE /* Restore stack pointer. */
+ cfi_adjust_cfa_offset(-FRAMESIZE)
+ ld r0, 16(r1) /* Restore the saved link register. */
+ ld r29, -24(r1) /* Restore callers save register r29. */
+ ld r30, -16(r1) /* Restore callers save register r30. */
+ ld r31, -8(r1) /* Restore callers save register r31. */
+ mtlr r0 /* Restore the link register. */
+ blr
+END (strstr)
+libc_hidden_builtin_def (strstr)
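
The overall control flow of this routine reduces to the following C sketch: needle lengths above 2048 fall back to the generic __strstr_ppc (the fallback the file itself calls at L(default)), short haystacks bail out early via strnlen, and the main loop hops between strchr hits for the first needle byte. In the real code the strncmp step below is replaced by the aligned doubleword cmpb comparisons shown above.

    #include <string.h>

    extern char *__strstr_ppc (const char *, const char *);

    char *
    strstr_sketch (const char *s, const char *pat)
    {
      size_t patlen = strlen (pat);
      if (patlen == 0)
        return (char *) s;             /* empty needle matches at s */
      if (patlen > 2048)
        return __strstr_ppc (s, pat);  /* generic fallback */
      if (strnlen (s, patlen) < patlen)
        return NULL;                   /* haystack shorter than needle */
      for (const char *p = strchr (s, pat[0]); p != NULL;
           p = strchr (p + 1, pat[0]))
        if (strncmp (p, pat, patlen) == 0)
          return (char *) p;
      return NULL;
    }
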
diff --git a/sysdeps/powerpc/powerpc64/power7/sub_n.S b/sysdeps/powerpc/powerpc64/power7/sub_n.S
index d6539aa067..460bb1907a 100644
--- a/sysdeps/powerpc/powerpc64/power7/sub_n.S
+++ b/sysdeps/powerpc/powerpc64/power7/sub_n.S
@@ -1,6 +1,6 @@
/* PowerPC64 mpn_lshift -- mpn_add_n/mpn_sub_n -- mpn addition and
subtraction.
- Copyright (C) 2013-2014 Free Software Foundation, Inc.
+ Copyright (C) 2013-2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or