Diffstat (limited to 'sysdeps/powerpc/powerpc64/power8')
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/Implies | 2
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/Makefile | 3
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/fpu/Implies | 1
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/fpu/e_expf.S | 303
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/fpu/multiarch/Implies | 1
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/fpu/s_cosf.S | 509
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S | 4
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S | 4
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S | 4
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S | 28
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S | 28
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/fpu/s_llroundf.S | 1
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/fpu/s_sinf.S | 520
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/fpu/w_expf.c | 1
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/memchr.S | 335
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/memcmp.S | 1447
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/memrchr.S | 345
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/memset.S | 84
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/multiarch/Implies | 1
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/stpcpy.S | 2
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/stpncpy.S | 6
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/strcasecmp.S | 457
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/strcasestr-ppc64.c | 29
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/strcasestr.S | 538
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/strchr.S | 377
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/strchrnul.S | 23
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/strcmp.S | 40
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/strcpy.S | 167
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/strcspn.S | 20
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/strlen.S | 290
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/strncase.S | 20
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/strncmp.S | 10
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/strncpy.S | 176
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/strnlen.S | 425
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/strrchr.S | 468
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/strspn.S | 202
36 files changed, 6704 insertions, 167 deletions
diff --git a/sysdeps/powerpc/powerpc64/power8/Implies b/sysdeps/powerpc/powerpc64/power8/Implies
deleted file mode 100644
index 9a5e3c7277..0000000000
--- a/sysdeps/powerpc/powerpc64/power8/Implies
+++ /dev/null
@@ -1,2 +0,0 @@
-powerpc/powerpc64/power7/fpu
-powerpc/powerpc64/power7
diff --git a/sysdeps/powerpc/powerpc64/power8/Makefile b/sysdeps/powerpc/powerpc64/power8/Makefile
new file mode 100644
index 0000000000..71a59529f3
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/Makefile
@@ -0,0 +1,3 @@
+ifeq ($(subdir),string)
+sysdep_routines += strcasestr-ppc64
+endif
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/Implies b/sysdeps/powerpc/powerpc64/power8/fpu/Implies
deleted file mode 100644
index 1187cdfb0a..0000000000
--- a/sysdeps/powerpc/powerpc64/power8/fpu/Implies
+++ /dev/null
@@ -1 +0,0 @@
-powerpc/powerpc64/power7/fpu/
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/e_expf.S b/sysdeps/powerpc/powerpc64/power8/fpu/e_expf.S
new file mode 100644
index 0000000000..32ee8326e1
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/fpu/e_expf.S
@@ -0,0 +1,303 @@
+/* Optimized expf(). PowerPC64/POWER8 version.
+ Copyright (C) 2016-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* Short algorithm description:
+ *
+ * Let K = 64 (table size).
+ * e^x = 2^(x/log(2)) = 2^n * T[j] * (1 + P(y))
+ * where:
+ * x = m*log(2)/K + y, y in [0.0..log(2)/K]
+ * m = n*K + j, m,n,j - signed integer, j in [0..K-1]
+ * values of 2^(j/K) are tabulated as T[j].
+ *
+ * P(y) is a minimax polynomial approximation of expf(y)-1
+ * on small interval [0.0..log(2)/K].
+ *
+ * P(y) = P3*y*y*y*y + P2*y*y*y + P1*y*y + P0*y, calculated as
+ * z = y*y; P(y) = (P3*z + P1)*z + (P2*z + P0)*y
+ *
+ * Special cases:
+ * expf(NaN) = NaN
+ * expf(+INF) = +INF
+ * expf(-INF) = 0
+ * expf(x) = 1 for subnormals
+ * for finite argument, only expf(0)=1 is exact
+ * expf(x) overflows if x>88.7228317260742190
+ * expf(x) underflows if x<-103.972076416015620
+ */
+
+#define C1 0x42ad496b /* Single precision 125*log(2). */
+#define C2 0x31800000 /* Single precision 2^(-28). */
+#define SP_INF 0x7f800000 /* Single precision Inf. */
+#define SP_EXP_BIAS 0x1fc0 /* Single precision exponent bias. */
+
+#define DATA_OFFSET r9
+
+/* Implements the function
+
+ float [fp1] expf (float [fp1] x) */
+
+ .machine power8
+ENTRY (__ieee754_expf, 4)
+ addis DATA_OFFSET,r2,.Lanchor@toc@ha
+ addi DATA_OFFSET,DATA_OFFSET,.Lanchor@toc@l
+
+ xscvdpspn v0,v1
+ mfvsrd r8,v0 /* r8 = x */
+ lfd fp2,(.KLN2-.Lanchor)(DATA_OFFSET)
+ lfd fp3,(.P2-.Lanchor)(DATA_OFFSET)
+ rldicl r3,r8,32,33 /* r3 = |x| */
+ lis r4,C1@ha /* r4 = 125*log(2) */
+ ori r4,r4,C1@l
+ cmpw r3,r4
+ lfd fp5,(.P3-.Lanchor)(DATA_OFFSET)
+ lfd fp4,(.RS-.Lanchor)(DATA_OFFSET)
+ fmadd fp2,fp1,fp2,fp4 /* fp2 = x * K/log(2) + (2^23 + 2^22) */
+ bge L(special_paths) /* |x| >= 125*log(2) ? */
+
+ lis r4,C2@ha
+ ori r4,r4,C2@l
+ cmpw r3,r4
+ blt L(small_args) /* |x| < 2^(-28) ? */
+
+ /* Main path: here if 2^(-28) <= |x| < 125*log(2) */
+ frsp fp6,fp2
+ xscvdpsp v2,v2
+ mfvsrd r8,v2
+ mr r3,r8 /* r3 = m */
+ rldicl r8,r8,32,58 /* r8 = j */
+ lfs fp4,(.SP_RS-.Lanchor)(DATA_OFFSET)
+ fsubs fp2,fp6,fp4 /* fp2 = m = x * K/log(2) */
+ srdi r3,r3,32
+ clrrwi r3,r3,6 /* r3 = n */
+ lfd fp6,(.NLN2K-.Lanchor)(DATA_OFFSET)
+ fmadd fp0,fp2,fp6,fp1 /* fp0 = y = x - m*log(2)/K */
+ fmul fp2,fp0,fp0 /* fp2 = z = y^2 */
+ lfd fp4,(.P1-.Lanchor)(DATA_OFFSET)
+ lfd fp6,(.P0-.Lanchor)(DATA_OFFSET)
+ lis r4,SP_EXP_BIAS@ha
+ ori r4,r4,SP_EXP_BIAS@l
+ add r3,r3,r4
+ rldic r3,r3,49,1 /* r3 = 2^n */
+ fmadd fp4,fp5,fp2,fp4 /* fp4 = P3 * z + P1 */
+ fmadd fp6,fp3,fp2,fp6 /* fp6 = P2 * z + P0 */
+ mtvsrd v1,r3
+ xscvspdp v1,v1
+ fmul fp4,fp4,fp2 /* fp4 = (P3 * z + P1)*z */
+ fmadd fp0,fp0,fp6,fp4 /* fp0 = P(y) */
+ sldi r8,r8,3 /* Access doublewords from T[j]. */
+ addi r6,DATA_OFFSET,(.Ttable-.Lanchor)
+ lfdx fp3,r6,r8
+ fmadd fp0,fp0,fp3,fp3 /* fp0 = T[j] * (1 + P(y)) */
+ fmul fp1,fp1,fp0 /* fp1 = 2^n * T[j] * (1 + P(y)) */
+ frsp fp1,fp1
+ blr
+
+ .align 4
+/* x is either underflow, overflow, infinite or NaN. */
+L(special_paths):
+ srdi r8,r8,32
+ rlwinm r8,r8,3,29,29 /* r8 = 0, if x positive.
+ r8 = 4, otherwise. */
+ addi r6,DATA_OFFSET,(.SPRANGE-.Lanchor)
+ lwzx r4,r6,r8 /* r4 = .SPRANGE[signbit(x)] */
+ cmpw r3,r4
+ /* |x| <= .SPRANGE[signbit(x)] */
+ ble L(near_under_or_overflow)
+
+ lis r4,SP_INF@ha
+ ori r4,r4,SP_INF@l
+ cmpw r3,r4
+ bge L(arg_inf_or_nan) /* |x| > Infinite ? */
+
+ addi r6,DATA_OFFSET,(.SPLARGE_SMALL-.Lanchor)
+ lfsx fp1,r6,r8
+ fmuls fp1,fp1,fp1
+ blr
+
+
+ .align 4
+L(small_args):
+ /* expf(x) = 1.0, where |x| < |2^(-28)| */
+ lfs fp2,(.SPone-.Lanchor)(DATA_OFFSET)
+ fadds fp1,fp1,fp2
+ blr
+
+
+ .align 4
+L(arg_inf_or_nan):
+ bne L(arg_nan)
+
+ /* expf(+INF) = +INF
+ expf(-INF) = 0 */
+ addi r6,DATA_OFFSET,(.INF_ZERO-.Lanchor)
+ lfsx fp1,r6,r8
+ blr
+
+
+ .align 4
+L(arg_nan):
+ /* expf(NaN) = NaN */
+ fadd fp1,fp1,fp1
+ frsp fp1,fp1
+ blr
+
+ .align 4
+L(near_under_or_overflow):
+ frsp fp6,fp2
+ xscvdpsp v2,v2
+ mfvsrd r8,v2
+ mr r3,r8 /* r3 = m */
+ rldicl r8,r8,32,58 /* r8 = j */
+ lfs fp4,(.SP_RS-.Lanchor)(DATA_OFFSET)
+ fsubs fp2,fp6,fp4 /* fp2 = m = x * K/log(2) */
+ srdi r3,r3,32
+ clrrwi r3,r3,6 /* r3 = n */
+ lfd fp6,(.NLN2K-.Lanchor)(DATA_OFFSET)
+ fmadd fp0,fp2,fp6,fp1 /* fp0 = y = x - m*log(2)/K */
+ fmul fp2,fp0,fp0 /* fp2 = z = y^2 */
+ lfd fp4,(.P1-.Lanchor)(DATA_OFFSET)
+ lfd fp6,(.P0-.Lanchor)(DATA_OFFSET)
+ ld r4,(.DP_EXP_BIAS-.Lanchor)(DATA_OFFSET)
+ add r3,r3,r4
+ rldic r3,r3,46,1 /* r3 = 2^n */
+ fmadd fp4,fp5,fp2,fp4 /* fp4 = P3 * z + P1 */
+ fmadd fp6,fp3,fp2,fp6 /* fp6 = P2 * z + P0 */
+ mtvsrd v1,r3
+ fmul fp4,fp4,fp2 /* fp4 = (P3*z + P1)*z */
+ fmadd fp0,fp0,fp6,fp4 /* fp0 = P(y) */
+ sldi r8,r8,3 /* Access doublewords from T[j]. */
+ addi r6,DATA_OFFSET,(.Ttable-.Lanchor)
+ lfdx fp3,r6,r8
+ fmadd fp0,fp0,fp3,fp3 /* fp0 = T[j] * (1 + P(y)) */
+ fmul fp1,fp1,fp0 /* fp1 = 2^n * T[j] * (1 + P(y)) */
+ frsp fp1,fp1
+ blr
+END(__ieee754_expf)
+
+ .section .rodata, "a",@progbits
+.Lanchor:
+ .balign 8
+/* Table T[j] = 2^(j/K). Double precision. */
+.Ttable:
+ .8byte 0x3ff0000000000000
+ .8byte 0x3ff02c9a3e778061
+ .8byte 0x3ff059b0d3158574
+ .8byte 0x3ff0874518759bc8
+ .8byte 0x3ff0b5586cf9890f
+ .8byte 0x3ff0e3ec32d3d1a2
+ .8byte 0x3ff11301d0125b51
+ .8byte 0x3ff1429aaea92de0
+ .8byte 0x3ff172b83c7d517b
+ .8byte 0x3ff1a35beb6fcb75
+ .8byte 0x3ff1d4873168b9aa
+ .8byte 0x3ff2063b88628cd6
+ .8byte 0x3ff2387a6e756238
+ .8byte 0x3ff26b4565e27cdd
+ .8byte 0x3ff29e9df51fdee1
+ .8byte 0x3ff2d285a6e4030b
+ .8byte 0x3ff306fe0a31b715
+ .8byte 0x3ff33c08b26416ff
+ .8byte 0x3ff371a7373aa9cb
+ .8byte 0x3ff3a7db34e59ff7
+ .8byte 0x3ff3dea64c123422
+ .8byte 0x3ff4160a21f72e2a
+ .8byte 0x3ff44e086061892d
+ .8byte 0x3ff486a2b5c13cd0
+ .8byte 0x3ff4bfdad5362a27
+ .8byte 0x3ff4f9b2769d2ca7
+ .8byte 0x3ff5342b569d4f82
+ .8byte 0x3ff56f4736b527da
+ .8byte 0x3ff5ab07dd485429
+ .8byte 0x3ff5e76f15ad2148
+ .8byte 0x3ff6247eb03a5585
+ .8byte 0x3ff6623882552225
+ .8byte 0x3ff6a09e667f3bcd
+ .8byte 0x3ff6dfb23c651a2f
+ .8byte 0x3ff71f75e8ec5f74
+ .8byte 0x3ff75feb564267c9
+ .8byte 0x3ff7a11473eb0187
+ .8byte 0x3ff7e2f336cf4e62
+ .8byte 0x3ff82589994cce13
+ .8byte 0x3ff868d99b4492ed
+ .8byte 0x3ff8ace5422aa0db
+ .8byte 0x3ff8f1ae99157736
+ .8byte 0x3ff93737b0cdc5e5
+ .8byte 0x3ff97d829fde4e50
+ .8byte 0x3ff9c49182a3f090
+ .8byte 0x3ffa0c667b5de565
+ .8byte 0x3ffa5503b23e255d
+ .8byte 0x3ffa9e6b5579fdbf
+ .8byte 0x3ffae89f995ad3ad
+ .8byte 0x3ffb33a2b84f15fb
+ .8byte 0x3ffb7f76f2fb5e47
+ .8byte 0x3ffbcc1e904bc1d2
+ .8byte 0x3ffc199bdd85529c
+ .8byte 0x3ffc67f12e57d14b
+ .8byte 0x3ffcb720dcef9069
+ .8byte 0x3ffd072d4a07897c
+ .8byte 0x3ffd5818dcfba487
+ .8byte 0x3ffda9e603db3285
+ .8byte 0x3ffdfc97337b9b5f
+ .8byte 0x3ffe502ee78b3ff6
+ .8byte 0x3ffea4afa2a490da
+ .8byte 0x3ffefa1bee615a27
+ .8byte 0x3fff50765b6e4540
+ .8byte 0x3fffa7c1819e90d8
+
+.KLN2:
+ .8byte 0x40571547652b82fe /* Double precision K/log(2). */
+
+/* Double precision polynomial coefficients. */
+.P0:
+ .8byte 0x3fefffffffffe7c6
+.P1:
+ .8byte 0x3fe00000008d6118
+.P2:
+ .8byte 0x3fc55550da752d4f
+.P3:
+ .8byte 0x3fa56420eb78fa85
+
+.RS:
+ .8byte 0x4168000000000000 /* Double precision 2^23 + 2^22. */
+.NLN2K:
+ .8byte 0xbf862e42fefa39ef /* Double precision -log(2)/K. */
+.DP_EXP_BIAS:
+ .8byte 0x000000000000ffc0 /* Double precision exponent bias. */
+
+ .balign 4
+.SPone:
+ .4byte 0x3f800000 /* Single precision 1.0. */
+.SP_RS:
+ .4byte 0x4b400000 /* Single precision 2^23 + 2^22. */
+
+.SPRANGE: /* Single precision overflow/underflow bounds. */
+ .4byte 0x42b17217 /* if x>this bound, then result overflows. */
+ .4byte 0x42cff1b4 /* if x<this bound, then result underflows. */
+
+.SPLARGE_SMALL:
+ .4byte 0x71800000 /* 2^100. */
+ .4byte 0x0d800000 /* 2^-100. */
+
+.INF_ZERO:
+ .4byte 0x7f800000 /* Single precision Inf. */
+ .4byte 0 /* Single precision zero. */
+
+strong_alias (__ieee754_expf, __expf_finite)
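
For reference, the table-driven scheme described in the comment at the top of e_expf.S (e^x = 2^n * T[j] * (1 + P(y)), K = 64, T[j] = 2^(j/K)) can be sketched in portable C as below. This is only an illustration, not the glibc code: the table is recomputed with exp2() and the minimax polynomial P(y) is stood in for by expm1(), so rounding and special-case behaviour differ. Compile with -lm.

    /* Illustrative sketch of the expf() scheme above, not the glibc code:
       e^x = 2^n * T[j] * (1 + P(y)), with K = 64 and T[j] = 2^(j/K).  */
    #include <math.h>
    #include <stdio.h>

    #define K 64
    #define LN2 0.69314718055994530942

    static double
    sketch_expf (float x)
    {
      /* m = round (x * K / log(2)); split m = n*K + j with j in [0..K-1].  */
      long m = lrint ((double) x * K / LN2);
      long j = ((m % K) + K) % K;
      long n = (m - j) / K;
      double y = (double) x - (double) m * (LN2 / K);  /* small residual y */
      double T_j = exp2 ((double) j / K);              /* tabulated as .Ttable above */
      double P = expm1 (y);                            /* stands in for the minimax P(y) */
      return ldexp (T_j * (1.0 + P), (int) n);         /* 2^n * T[j] * (1 + P(y)) */
    }

    int
    main (void)
    {
      for (float x = -10.0f; x <= 10.0f; x += 2.5f)
        printf ("x = %6.2f  sketch = %.7g  expf = %.7g\n",
                x, sketch_expf (x), (double) expf (x));
      return 0;
    }

The assembly extracts n and j directly from the bits of the value rounded via the .RS constant (2^23 + 2^22), while the sketch derives them arithmetically; the quadrant of the result is the same either way.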
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/multiarch/Implies b/sysdeps/powerpc/powerpc64/power8/fpu/multiarch/Implies
deleted file mode 100644
index 7fd86fdf87..0000000000
--- a/sysdeps/powerpc/powerpc64/power8/fpu/multiarch/Implies
+++ /dev/null
@@ -1 +0,0 @@
-powerpc/powerpc64/power7/fpu/multiarch
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_cosf.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_cosf.S
new file mode 100644
index 0000000000..af71382fb2
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_cosf.S
@@ -0,0 +1,509 @@
+/* Optimized cosf(). PowerPC64/POWER8 version.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#define _ERRNO_H 1
+#include <bits/errno.h>
+#include <libm-alias-float.h>
+
+#define FRAMESIZE (FRAME_MIN_SIZE+16)
+
+#define FLOAT_EXPONENT_SHIFT 23
+#define FLOAT_EXPONENT_BIAS 127
+#define INTEGER_BITS 3
+
+#define PI_4 0x3f490fdb /* PI/4 */
+#define NINEPI_4 0x40e231d6 /* 9 * PI/4 */
+#define TWO_PN5 0x3d000000 /* 2^-5 */
+#define TWO_PN27 0x32000000 /* 2^-27 */
+#define INFINITY 0x7f800000
+#define TWO_P23 0x4b000000 /* 2^23 */
+#define FX_FRACTION_1_28 0x9249250 /* 0x100000000 / 28 + 1 */
+
+ /* Implements the function
+
+ float [fp1] cosf (float [fp1] x) */
+
+ .machine power8
+ENTRY (__cosf, 4)
+ addis r9,r2,L(anchor)@toc@ha
+ addi r9,r9,L(anchor)@toc@l
+
+ lis r4,PI_4@h
+ ori r4,r4,PI_4@l
+
+ xscvdpspn v0,v1
+ mfvsrd r8,v0
+ rldicl r3,r8,32,33 /* Remove sign bit. */
+
+ cmpw r3,r4
+ bge L(greater_or_equal_pio4)
+
+ lis r4,TWO_PN5@h
+ ori r4,r4,TWO_PN5@l
+
+ cmpw r3,r4
+ blt L(less_2pn5)
+
+ /* Chebyshev polynomial of the form:
+ * 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))). */
+
+ lfd fp9,(L(C0)-L(anchor))(r9)
+ lfd fp10,(L(C1)-L(anchor))(r9)
+ lfd fp11,(L(C2)-L(anchor))(r9)
+ lfd fp12,(L(C3)-L(anchor))(r9)
+ lfd fp13,(L(C4)-L(anchor))(r9)
+
+ fmul fp2,fp1,fp1 /* x^2 */
+ lfd fp3,(L(DPone)-L(anchor))(r9)
+
+ fmadd fp4,fp2,fp13,fp12 /* C3+x^2*C4 */
+ fmadd fp4,fp2,fp4,fp11 /* C2+x^2*(C3+x^2*C4) */
+ fmadd fp4,fp2,fp4,fp10 /* C1+x^2*(C2+x^2*(C3+x^2*C4)) */
+ fmadd fp4,fp2,fp4,fp9 /* C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4))) */
+ fmadd fp1,fp2,fp4,fp3 /* 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))) */
+ frsp fp1,fp1 /* Round to single precision. */
+
+ blr
+
+ .balign 16
+L(greater_or_equal_pio4):
+ lis r4,NINEPI_4@h
+ ori r4,r4,NINEPI_4@l
+ cmpw r3,r4
+ bge L(greater_or_equal_9pio4)
+
+ /* Calculate quotient of |x|/(PI/4). */
+ lfd fp2,(L(invpio4)-L(anchor))(r9)
+ fabs fp1,fp1 /* |x| */
+ fmul fp2,fp1,fp2 /* |x|/(PI/4) */
+ fctiduz fp2,fp2
+ mfvsrd r3,v2 /* n = trunc(|x| / (PI/4)) */
+
+ /* Now use that quotient to find |x| mod (PI/2). */
+ addi r7,r3,1
+ rldicr r5,r7,2,60 /* ((n+1) >> 1) << 3 */
+ addi r6,r9,(L(pio2_table)-L(anchor))
+ lfdx fp4,r5,r6
+ fsub fp1,fp1,fp4
+
+ .balign 16
+L(reduced):
+ /* Now we are in the range -PI/4 to PI/4. */
+
+ /* Work out if we are in a positive or negative primary interval. */
+ addi r7,r7,2
+ rldicl r4,r7,62,63 /* ((n+3) >> 2) & 1 */
+
+ /* Load a 1.0 or -1.0. */
+ addi r5,r9,(L(ones)-L(anchor))
+ sldi r4,r4,3
+ lfdx fp0,r4,r5
+
+ /* Are we in the primary interval of sin or cos? */
+ andi. r4,r7,0x2
+ bne L(cos)
+
+ /* Chebyshev polynomial of the form:
+ x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))). */
+
+ lfd fp9,(L(S0)-L(anchor))(r9)
+ lfd fp10,(L(S1)-L(anchor))(r9)
+ lfd fp11,(L(S2)-L(anchor))(r9)
+ lfd fp12,(L(S3)-L(anchor))(r9)
+ lfd fp13,(L(S4)-L(anchor))(r9)
+
+ fmul fp2,fp1,fp1 /* x^2 */
+ fmul fp3,fp2,fp1 /* x^3 */
+
+ fmadd fp4,fp2,fp13,fp12 /* S3+x^2*S4 */
+ fmadd fp4,fp2,fp4,fp11 /* S2+x^2*(S3+x^2*S4) */
+ fmadd fp4,fp2,fp4,fp10 /* S1+x^2*(S2+x^2*(S3+x^2*S4)) */
+ fmadd fp4,fp2,fp4,fp9 /* S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4))) */
+ fmadd fp4,fp3,fp4,fp1 /* x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))) */
+ fmul fp4,fp4,fp0 /* Add in the sign. */
+ frsp fp1,fp4 /* Round to single precision. */
+
+ blr
+
+ .balign 16
+L(cos):
+ /* Chebyshev polynomial of the form:
+ 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))). */
+
+ lfd fp9,(L(C0)-L(anchor))(r9)
+ lfd fp10,(L(C1)-L(anchor))(r9)
+ lfd fp11,(L(C2)-L(anchor))(r9)
+ lfd fp12,(L(C3)-L(anchor))(r9)
+ lfd fp13,(L(C4)-L(anchor))(r9)
+
+ fmul fp2,fp1,fp1 /* x^2 */
+ lfd fp3,(L(DPone)-L(anchor))(r9)
+
+ fmadd fp4,fp2,fp13,fp12 /* C3+x^2*C4 */
+ fmadd fp4,fp2,fp4,fp11 /* C2+x^2*(C3+x^2*C4) */
+ fmadd fp4,fp2,fp4,fp10 /* C1+x^2*(C2+x^2*(C3+x^2*C4)) */
+ fmadd fp4,fp2,fp4,fp9 /* C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4))) */
+ fmadd fp4,fp2,fp4,fp3 /* 1.0 + x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))) */
+ fmul fp4,fp4,fp0 /* Add in the sign. */
+ frsp fp1,fp4 /* Round to single precision. */
+
+ blr
+
+ .balign 16
+L(greater_or_equal_9pio4):
+ lis r4,INFINITY@h
+ ori r4,r4,INFINITY@l
+ cmpw r3,r4
+ bge L(inf_or_nan)
+
+ lis r4,TWO_P23@h
+ ori r4,r4,TWO_P23@l
+ cmpw r3,r4
+ bge L(greater_or_equal_2p23)
+
+ fabs fp1,fp1 /* |x| */
+
+ /* Calculate quotient of |x|/(PI/4). */
+ lfd fp2,(L(invpio4)-L(anchor))(r9)
+
+ lfd fp3,(L(DPone)-L(anchor))(r9)
+ lfd fp4,(L(DPhalf)-L(anchor))(r9)
+ fmul fp2,fp1,fp2 /* |x|/(PI/4) */
+ friz fp2,fp2 /* n = floor(|x|/(PI/4)) */
+
+ /* Calculate (n + 1) / 2. */
+ fadd fp2,fp2,fp3 /* n + 1 */
+ fmul fp3,fp2,fp4 /* (n + 1) / 2 */
+ friz fp3,fp3
+
+ lfd fp4,(L(pio2hi)-L(anchor))(r9)
+ lfd fp5,(L(pio2lo)-L(anchor))(r9)
+
+ fmul fp6,fp4,fp3
+ fadd fp6,fp6,fp1
+ fmadd fp1,fp5,fp3,fp6
+
+ fctiduz fp2,fp2
+ mfvsrd r7,v2 /* n + 1 */
+
+ b L(reduced)
+
+ .balign 16
+L(inf_or_nan):
+ bne L(skip_errno_setting) /* Is a NAN? */
+
+ /* We delayed the creation of the stack frame, as well as the saving of
+ the link register, because only at this point, we are sure that
+ doing so is actually needed. */
+
+ stfd fp1,-8(r1)
+
+ /* Save the link register. */
+ mflr r0
+ std r0,16(r1)
+ cfi_offset(lr, 16)
+
+ /* Create the stack frame. */
+ stdu r1,-FRAMESIZE(r1)
+ cfi_adjust_cfa_offset(FRAMESIZE)
+
+ bl JUMPTARGET(__errno_location)
+ nop
+
+ /* Restore the stack frame. */
+ addi r1,r1,FRAMESIZE
+ cfi_adjust_cfa_offset(-FRAMESIZE)
+ /* Restore the link register. */
+ ld r0,16(r1)
+ mtlr r0
+
+ lfd fp1,-8(r1)
+
+ /* errno = EDOM */
+ li r4,EDOM
+ stw r4,0(r3)
+
+L(skip_errno_setting):
+ fsub fp1,fp1,fp1 /* x - x */
+ blr
+
+ .balign 16
+L(greater_or_equal_2p23):
+ fabs fp1,fp1
+
+ srwi r4,r3,FLOAT_EXPONENT_SHIFT
+ subi r4,r4,FLOAT_EXPONENT_BIAS
+
+ /* We reduce the input modulo pi/4, so we need 3 bits of integer
+ to determine where in 2*pi we are. Index into our array
+ accordingly. */
+ addi r4,r4,INTEGER_BITS
+
+ /* To avoid an expensive divide, for the range we care about (0 - 127)
+ we can transform x/28 into:
+
+ x/28 = (x * ((0x100000000 / 28) + 1)) >> 32
+
+ mulhwu returns the top 32 bits of the 64 bit result, doing the
+ shift for us in the same instruction. The top 32 bits are undefined,
+ so we have to mask them. */
+
+ lis r6,FX_FRACTION_1_28@h
+ ori r6,r6,FX_FRACTION_1_28@l
+ mulhwu r5,r4,r6
+ clrldi r5,r5,32
+
+ /* Get our pointer into the invpio4_table array. */
+ sldi r4,r5,3
+ addi r6,r9,(L(invpio4_table)-L(anchor))
+ add r4,r4,r6
+
+ lfd fp2,0(r4)
+ lfd fp3,8(r4)
+ lfd fp4,16(r4)
+ lfd fp5,24(r4)
+
+ fmul fp6,fp2,fp1
+ fmul fp7,fp3,fp1
+ fmul fp8,fp4,fp1
+ fmul fp9,fp5,fp1
+
+ /* Mask off larger integer bits in highest double word that we don't
+ care about to avoid losing precision when combining with smaller
+ values. */
+ fctiduz fp10,fp6
+ mfvsrd r7,v10
+ rldicr r7,r7,0,(63-INTEGER_BITS)
+ mtvsrd v10,r7
+ fcfidu fp10,fp10 /* Integer bits. */
+
+ fsub fp6,fp6,fp10 /* highest -= integer bits */
+
+ /* Work out the integer component, rounded down. Use the top two
+ limbs for this. */
+ fadd fp10,fp6,fp7 /* highest + higher */
+
+ fctiduz fp10,fp10
+ mfvsrd r7,v10
+ andi. r0,r7,1
+ fcfidu fp10,fp10
+
+ /* Subtract integer component from highest limb. */
+ fsub fp12,fp6,fp10
+
+ beq L(even_integer)
+
+ /* Our integer component is odd, so we are in the -PI/4 to 0 primary
+ region. We need to shift our result down by PI/4, and to do this
+ in the mod (4/PI) space we simply subtract 1. */
+ lfd fp11,(L(DPone)-L(anchor))(r9)
+ fsub fp12,fp12,fp11
+
+ /* Now add up all the limbs in order. */
+ fadd fp12,fp12,fp7
+ fadd fp12,fp12,fp8
+ fadd fp12,fp12,fp9
+
+ /* And finally multiply by pi/4. */
+ lfd fp13,(L(pio4)-L(anchor))(r9)
+ fmul fp1,fp12,fp13
+
+ addi r7,r7,1
+ b L(reduced)
+
+L(even_integer):
+ lfd fp11,(L(DPone)-L(anchor))(r9)
+
+ /* Now add up all the limbs in order. */
+ fadd fp12,fp12,fp7
+ fadd fp12,fp12,fp8
+ fadd fp12,fp12,fp9
+
+ /* We need to check if the addition of all the limbs resulted in us
+ overflowing 1.0. */
+ fcmpu 0,fp12,fp11
+ bgt L(greater_than_one)
+
+ /* And finally multiply by pi/4. */
+ lfd fp13,(L(pio4)-L(anchor))(r9)
+ fmul fp1,fp12,fp13
+
+ addi r7,r7,1
+ b L(reduced)
+
+L(greater_than_one):
+ /* We did overflow 1.0 when adding up all the limbs. Add 1.0 to our
+ integer, and subtract 1.0 from our result. Since that makes the
+ integer component odd, we need to subtract another 1.0 as
+ explained above. */
+ addi r7,r7,1
+
+ lfd fp11,(L(DPtwo)-L(anchor))(r9)
+ fsub fp12,fp12,fp11
+
+ /* And finally multiply by pi/4. */
+ lfd fp13,(L(pio4)-L(anchor))(r9)
+ fmul fp1,fp12,fp13
+
+ addi r7,r7,1
+ b L(reduced)
+
+ .balign 16
+L(less_2pn5):
+ lis r4,TWO_PN27@h
+ ori r4,r4,TWO_PN27@l
+
+ cmpw r3,r4
+ blt L(less_2pn27)
+
+ /* A simpler Chebyshev approximation is close enough for this range:
+ 1.0+x^2*(CC0+x^3*CC1). */
+
+ lfd fp10,(L(CC0)-L(anchor))(r9)
+ lfd fp11,(L(CC1)-L(anchor))(r9)
+
+ fmul fp2,fp1,fp1 /* x^2 */
+ fmul fp3,fp2,fp1 /* x^3 */
+ lfd fp1,(L(DPone)-L(anchor))(r9)
+
+ fmadd fp4,fp3,fp11,fp10 /* CC0+x^3*CC1 */
+ fmadd fp1,fp2,fp4,fp1 /* 1.0+x^2*(CC0+x^3*CC1) */
+
+ frsp fp1,fp1 /* Round to single precision. */
+
+ blr
+
+ .balign 16
+L(less_2pn27):
+ /* Handle some special cases:
+
+ cosf(subnormal) raises inexact
+ cosf(min_normalized) raises inexact
+ cosf(normalized) raises inexact. */
+
+ lfd fp2,(L(DPone)-L(anchor))(r9)
+
+ fabs fp1,fp1 /* |x| */
+ fsub fp1,fp2,fp1 /* 1.0-|x| */
+
+ frsp fp1,fp1
+
+ blr
+
+END (__cosf)
+
+ .section .rodata, "a"
+
+ .balign 8
+
+L(anchor):
+
+ /* Chebyshev constants for sin, range -PI/4 - PI/4. */
+L(S0): .8byte 0xbfc5555555551cd9
+L(S1): .8byte 0x3f81111110c2688b
+L(S2): .8byte 0xbf2a019f8b4bd1f9
+L(S3): .8byte 0x3ec71d7264e6b5b4
+L(S4): .8byte 0xbe5a947e1674b58a
+
+ /* Chebyshev constants for cos, range 2^-27 - 2^-5. */
+L(CC0): .8byte 0xbfdfffffff5cc6fd
+L(CC1): .8byte 0x3fa55514b178dac5
+
+ /* Chebyshev constants for cos, range -PI/4 - PI/4. */
+L(C0): .8byte 0xbfdffffffffe98ae
+L(C1): .8byte 0x3fa55555545c50c7
+L(C2): .8byte 0xbf56c16b348b6874
+L(C3): .8byte 0x3efa00eb9ac43cc0
+L(C4): .8byte 0xbe923c97dd8844d7
+
+L(invpio2):
+ .8byte 0x3fe45f306dc9c883 /* 2/PI */
+
+L(invpio4):
+ .8byte 0x3ff45f306dc9c883 /* 4/PI */
+
+L(invpio4_table):
+ .8byte 0x0000000000000000
+ .8byte 0x3ff45f306c000000
+ .8byte 0x3e3c9c882a000000
+ .8byte 0x3c54fe13a8000000
+ .8byte 0x3aaf47d4d0000000
+ .8byte 0x38fbb81b6c000000
+ .8byte 0x3714acc9e0000000
+ .8byte 0x3560e4107c000000
+ .8byte 0x33bca2c756000000
+ .8byte 0x31fbd778ac000000
+ .8byte 0x300b7246e0000000
+ .8byte 0x2e5d2126e8000000
+ .8byte 0x2c97003248000000
+ .8byte 0x2ad77504e8000000
+ .8byte 0x290921cfe0000000
+ .8byte 0x274deb1cb0000000
+ .8byte 0x25829a73e0000000
+ .8byte 0x23fd1046be000000
+ .8byte 0x2224baed10000000
+ .8byte 0x20709d338e000000
+ .8byte 0x1e535a2f80000000
+ .8byte 0x1cef904e64000000
+ .8byte 0x1b0d639830000000
+ .8byte 0x1964ce7d24000000
+ .8byte 0x17b908bf16000000
+
+L(pio4):
+ .8byte 0x3fe921fb54442d18 /* PI/4 */
+
+/* PI/2 as a sum of two doubles. We only use 32 bits of the upper limb
+ to avoid losing significant bits when multiplying with up to
+ (2^22)/(pi/2). */
+L(pio2hi):
+ .8byte 0xbff921fb54400000
+
+L(pio2lo):
+ .8byte 0xbdd0b4611a626332
+
+L(pio2_table):
+ .8byte 0
+ .8byte 0x3ff921fb54442d18 /* 1 * PI/2 */
+ .8byte 0x400921fb54442d18 /* 2 * PI/2 */
+ .8byte 0x4012d97c7f3321d2 /* 3 * PI/2 */
+ .8byte 0x401921fb54442d18 /* 4 * PI/2 */
+ .8byte 0x401f6a7a2955385e /* 5 * PI/2 */
+ .8byte 0x4022d97c7f3321d2 /* 6 * PI/2 */
+ .8byte 0x4025fdbbe9bba775 /* 7 * PI/2 */
+ .8byte 0x402921fb54442d18 /* 8 * PI/2 */
+ .8byte 0x402c463abeccb2bb /* 9 * PI/2 */
+ .8byte 0x402f6a7a2955385e /* 10 * PI/2 */
+
+L(small):
+ .8byte 0x3cd0000000000000 /* 2^-50 */
+
+L(ones):
+ .8byte 0x3ff0000000000000 /* +1.0 */
+ .8byte 0xbff0000000000000 /* -1.0 */
+
+L(DPhalf):
+ .8byte 0x3fe0000000000000 /* 0.5 */
+
+L(DPone):
+ .8byte 0x3ff0000000000000 /* 1.0 */
+
+L(DPtwo):
+ .8byte 0x4000000000000000 /* 2.0 */
+
+libm_alias_float (__cos, cos)
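
For readers tracing the L(greater_or_equal_pio4) branch above, the quadrant bookkeeping for PI/4 <= |x| < 9*PI/4 can be mirrored in C roughly as below. It is a sketch only: the Chebyshev kernels are replaced by the C library's sin()/cos() on the reduced argument, and the pio2_table lookup is replaced by a direct multiply, so precision and special cases differ from the assembly. Compile with -lm.

    /* Rough C mirror of the PI/4 <= |x| < 9*PI/4 path of __cosf above:
       n      = trunc (|x| / (PI/4))               (fctiduz)
       r      = |x| - ((n+1) >> 1) * PI/2          (pio2_table lookup)
       sign   = ((n+3) >> 2) & 1 ? -1.0 : +1.0     (L(ones) lookup)
       kernel = (n+3) & 2 ? cos poly : sin poly    (andi. r4,r7,0x2)  */
    #include <math.h>
    #include <stdio.h>

    #define PI 3.14159265358979323846

    static double
    sketch_cosf_reduced (float x)
    {
      double ax = fabs ((double) x);
      unsigned long n = (unsigned long) (ax / (PI / 4.0));
      double r = ax - (double) ((n + 1) >> 1) * (PI / 2.0); /* in [-PI/4, PI/4] */
      double sign = (((n + 3) >> 2) & 1) ? -1.0 : 1.0;
      double k = ((n + 3) & 2) ? cos (r) : sin (r); /* kernels stand in for C0..C4/S0..S4 */
      return sign * k;
    }

    int
    main (void)
    {
      for (float x = 0.9f; x < 7.0f; x += 0.7f)  /* stay inside [PI/4, 9*PI/4) */
        printf ("x = %4.2f  sketch = % .6f  cosf = % .6f\n",
                x, sketch_cosf_reduced (x), (double) cosf (x));
      return 0;
    }

Where the assembly looks up ((n+1) >> 1) * PI/2 from pio2_table so the subtraction stays exact, the sketch simply multiplies, which is enough to see how the sign and sin/cos kernel are selected.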
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S
index 3b0c88e5eb..aac54a9364 100644
--- a/sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S
+++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S
@@ -1,5 +1,5 @@
/* isfinite(). PowerPC64/POWER8 version.
- Copyright (C) 2014-2016 Free Software Foundation, Inc.
+ Copyright (C) 2014-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -23,7 +23,7 @@
/* int [r3] __finite ([fp1] x) */
-EALIGN (__finite, 4, 0)
+ENTRY_TOCLESS (__finite, 4)
CALL_MCOUNT 0
MFVSRD_R3_V1
lis r9,0x8010
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S
index 4708239689..94746ef068 100644
--- a/sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S
+++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S
@@ -1,5 +1,5 @@
/* isinf(). PowerPC64/POWER8 version.
- Copyright (C) 2014-2016 Free Software Foundation, Inc.
+ Copyright (C) 2014-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -23,7 +23,7 @@
/* int [r3] __isinf([fp1] x) */
-EALIGN (__isinf, 4, 0)
+ENTRY_TOCLESS (__isinf, 4)
CALL_MCOUNT 0
MFVSRD_R3_V1
lis r9,0x7ff0 /* r9 = 0x7ff0 */
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S
index 0e5c19333c..8aef354f68 100644
--- a/sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S
+++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S
@@ -1,5 +1,5 @@
/* isnan(). PowerPC64/POWER8 version.
- Copyright (C) 2014-2016 Free Software Foundation, Inc.
+ Copyright (C) 2014-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -23,7 +23,7 @@
/* int [r3] __isnan([f1] x) */
-EALIGN (__isnan, 4, 0)
+ENTRY_TOCLESS (__isnan, 4)
CALL_MCOUNT 0
MFVSRD_R3_V1
lis r9,0x7ff0
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S
index 0af9feee5a..7f18d705a9 100644
--- a/sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S
+++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S
@@ -1,5 +1,5 @@
/* Round double to long int. POWER8 PowerPC64 version.
- Copyright (C) 2014-2016 Free Software Foundation, Inc.
+ Copyright (C) 2014-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -18,11 +18,13 @@
#include <sysdep.h>
#include <math_ldbl_opt.h>
+#include <libm-alias-float.h>
+#include <libm-alias-double.h>
#define MFVSRD_R3_V1 .long 0x7c230066 /* mfvsrd r3,vs1 */
/* long long int[r3] __llrint (double x[fp1]) */
-ENTRY (__llrint)
+ENTRY_TOCLESS (__llrint)
CALL_MCOUNT 0
fctid fp1,fp1
MFVSRD_R3_V1
@@ -30,16 +32,12 @@ ENTRY (__llrint)
END (__llrint)
strong_alias (__llrint, __lrint)
-weak_alias (__llrint, llrint)
-weak_alias (__lrint, lrint)
-
-#ifdef NO_LONG_DOUBLE
-strong_alias (__llrint, __llrintl)
-weak_alias (__llrint, llrintl)
-strong_alias (__lrint, __lrintl)
-weak_alias (__lrint, lrintl)
-#endif
-#if LONG_DOUBLE_COMPAT(libm, GLIBC_2_1)
-compat_symbol (libm, __llrint, llrintl, GLIBC_2_1)
-compat_symbol (libm, __lrint, lrintl, GLIBC_2_1)
-#endif
+libm_alias_double (__llrint, llrint)
+libm_alias_double (__lrint, lrint)
+/* The double version also works for single-precision as both float and
+ double parameters are passed in 64bit FPRs and both versions are expected
+ to return [long] long type. */
+strong_alias (__llrint, __llrintf)
+libm_alias_float (__llrint, llrint)
+strong_alias (__lrint, __lrintf)
+libm_alias_float (__lrint, lrint)
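
The new comment above rests on the fact that every float value converts exactly to double (and the same rounding mode applies to both), so llrintf (x) and llrint ((double) x) agree for every input. A quick host-side check of that claim, not part of the patch, might look like this (compile with -lm):

    /* Spot-check that llrintf (x) == llrint ((double) x); the float-to-double
       conversion is exact, so the wider entry point gives the same result.  */
    #include <math.h>
    #include <stdio.h>

    int
    main (void)
    {
      const float tests[] = { 0.5f, 1.5f, 2.5f, -2.5f, 8388609.0f, 1e9f, -123.75f };
      for (unsigned i = 0; i < sizeof tests / sizeof tests[0]; i++)
        {
          float x = tests[i];
          printf ("x = %-12.9g  llrintf = %-12lld  llrint ((double) x) = %lld\n",
                  (double) x, (long long) llrintf (x), llrint ((double) x));
        }
      return 0;
    }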
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S
index af2409deb8..a22fc63bb3 100644
--- a/sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S
+++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S
@@ -1,5 +1,5 @@
/* llround function. POWER8 PowerPC64 version.
- Copyright (C) 2014-2016 Free Software Foundation, Inc.
+ Copyright (C) 2014-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -19,12 +19,14 @@
#include <sysdep.h>
#include <endian.h>
#include <math_ldbl_opt.h>
+#include <libm-alias-float.h>
+#include <libm-alias-double.h>
#define MFVSRD_R3_V1 .long 0x7c230066 /* mfvsrd r3,vs1 */
/* long long [r3] llround (float x [fp1]) */
-ENTRY (__llround)
+ENTRY_TOCLESS (__llround)
CALL_MCOUNT 0
frin fp1,fp1 /* Round to nearest +-0.5. */
fctidz fp1,fp1 /* Convert To Integer DW round toward 0. */
@@ -33,16 +35,12 @@ ENTRY (__llround)
END (__llround)
strong_alias (__llround, __lround)
-weak_alias (__llround, llround)
-weak_alias (__lround, lround)
-
-#ifdef NO_LONG_DOUBLE
-weak_alias (__llround, llroundl)
-strong_alias (__llround, __llroundl)
-weak_alias (__lround, lroundl)
-strong_alias (__lround, __lroundl)
-#endif
-#if LONG_DOUBLE_COMPAT(libm, GLIBC_2_1)
-compat_symbol (libm, __llround, llroundl, GLIBC_2_1)
-compat_symbol (libm, __lround, lroundl, GLIBC_2_1)
-#endif
+libm_alias_double (__llround, llround)
+libm_alias_double (__lround, lround)
+/* The double version also works for single-precision as both float and
+ double parameters are passed in 64bit FPRs and both versions are expected
+ to return [long] long type. */
+strong_alias (__llround, __llroundf)
+libm_alias_float (__llround, llround)
+strong_alias (__lround, __lroundf)
+libm_alias_float (__lround, lround)
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_llroundf.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_llroundf.S
new file mode 100644
index 0000000000..9ea6bd105b
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_llroundf.S
@@ -0,0 +1 @@
+/* __lroundf is in s_llround.S. */
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_sinf.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_sinf.S
new file mode 100644
index 0000000000..59e613c102
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_sinf.S
@@ -0,0 +1,520 @@
+/* Optimized sinf(). PowerPC64/POWER8 version.
+ Copyright (C) 2016-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#define _ERRNO_H 1
+#include <bits/errno.h>
+#include <libm-alias-float.h>
+
+#define FRAMESIZE (FRAME_MIN_SIZE+16)
+
+#define FLOAT_EXPONENT_SHIFT 23
+#define FLOAT_EXPONENT_BIAS 127
+#define INTEGER_BITS 3
+
+#define PI_4 0x3f490fdb /* PI/4 */
+#define NINEPI_4 0x40e231d6 /* 9 * PI/4 */
+#define TWO_PN5 0x3d000000 /* 2^-5 */
+#define TWO_PN27 0x32000000 /* 2^-27 */
+#define INFINITY 0x7f800000
+#define TWO_P23 0x4b000000 /* 2^23 */
+#define FX_FRACTION_1_28 0x9249250 /* 0x100000000 / 28 + 1 */
+
+ /* Implements the function
+
+ float [fp1] sinf (float [fp1] x) */
+
+ .machine power8
+ENTRY (__sinf, 4)
+ addis r9,r2,L(anchor)@toc@ha
+ addi r9,r9,L(anchor)@toc@l
+
+ lis r4,PI_4@h
+ ori r4,r4,PI_4@l
+
+ xscvdpspn v0,v1
+ mfvsrd r8,v0
+ rldicl r3,r8,32,33 /* Remove sign bit. */
+
+ cmpw r3,r4
+ bge L(greater_or_equal_pio4)
+
+ lis r4,TWO_PN5@h
+ ori r4,r4,TWO_PN5@l
+
+ cmpw r3,r4
+ blt L(less_2pn5)
+
+ /* Chebyshev polynomial of the form:
+ * x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))). */
+
+ lfd fp9,(L(S0)-L(anchor))(r9)
+ lfd fp10,(L(S1)-L(anchor))(r9)
+ lfd fp11,(L(S2)-L(anchor))(r9)
+ lfd fp12,(L(S3)-L(anchor))(r9)
+ lfd fp13,(L(S4)-L(anchor))(r9)
+
+ fmul fp2,fp1,fp1 /* x^2 */
+ fmul fp3,fp2,fp1 /* x^3 */
+
+ fmadd fp4,fp2,fp13,fp12 /* S3+x^2*S4 */
+ fmadd fp4,fp2,fp4,fp11 /* S2+x^2*(S3+x^2*S4) */
+ fmadd fp4,fp2,fp4,fp10 /* S1+x^2*(S2+x^2*(S3+x^2*S4)) */
+ fmadd fp4,fp2,fp4,fp9 /* S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4))) */
+ fmadd fp1,fp3,fp4,fp1 /* x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))) */
+ frsp fp1,fp1 /* Round to single precision. */
+
+ blr
+
+ .balign 16
+L(greater_or_equal_pio4):
+ lis r4,NINEPI_4@h
+ ori r4,r4,NINEPI_4@l
+ cmpw r3,r4
+ bge L(greater_or_equal_9pio4)
+
+ /* Calculate quotient of |x|/(PI/4). */
+ lfd fp2,(L(invpio4)-L(anchor))(r9)
+ fabs fp1,fp1 /* |x| */
+ fmul fp2,fp1,fp2 /* |x|/(PI/4) */
+ fctiduz fp2,fp2
+ mfvsrd r3,v2 /* n = trunc(|x| / (PI/4)) */
+
+ /* Now use that quotient to find |x| mod (PI/2). */
+ addi r7,r3,1
+ rldicr r5,r7,2,60 /* ((n+1) >> 1) << 3 */
+ addi r6,r9,(L(pio2_table)-L(anchor))
+ lfdx fp4,r5,r6
+ fsub fp1,fp1,fp4
+
+ .balign 16
+L(reduced):
+ /* Now we are in the range -PI/4 to PI/4. */
+
+ /* Work out if we are in a positive or negative primary interval. */
+ rldicl r4,r7,62,63 /* ((n+1) >> 2) & 1 */
+
+ /* We are operating on |x|, so we need to add back the original
+ sign. */
+ rldicl r8,r8,33,63 /* (x >> 31) & 1, ie the sign bit. */
+ xor r4,r4,r8 /* 0 if result should be positive,
+ 1 if negative. */
+
+ /* Load a 1.0 or -1.0. */
+ addi r5,r9,(L(ones)-L(anchor))
+ sldi r4,r4,3
+ lfdx fp0,r4,r5
+
+ /* Are we in the primary interval of sin or cos? */
+ andi. r4,r7,0x2
+ bne L(cos)
+
+ /* Chebyshev polynomial of the form:
+ x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))). */
+
+ lfd fp9,(L(S0)-L(anchor))(r9)
+ lfd fp10,(L(S1)-L(anchor))(r9)
+ lfd fp11,(L(S2)-L(anchor))(r9)
+ lfd fp12,(L(S3)-L(anchor))(r9)
+ lfd fp13,(L(S4)-L(anchor))(r9)
+
+ fmul fp2,fp1,fp1 /* x^2 */
+ fmul fp3,fp2,fp1 /* x^3 */
+
+ fmadd fp4,fp2,fp13,fp12 /* S3+x^2*S4 */
+ fmadd fp4,fp2,fp4,fp11 /* S2+x^2*(S3+x^2*S4) */
+ fmadd fp4,fp2,fp4,fp10 /* S1+x^2*(S2+x^2*(S3+x^2*S4)) */
+ fmadd fp4,fp2,fp4,fp9 /* S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4))) */
+ fmadd fp4,fp3,fp4,fp1 /* x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))) */
+ fmul fp4,fp4,fp0 /* Add in the sign. */
+ frsp fp1,fp4 /* Round to single precision. */
+
+ blr
+
+ .balign 16
+L(cos):
+ /* Chebyshev polynomial of the form:
+ 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))). */
+
+ lfd fp9,(L(C0)-L(anchor))(r9)
+ lfd fp10,(L(C1)-L(anchor))(r9)
+ lfd fp11,(L(C2)-L(anchor))(r9)
+ lfd fp12,(L(C3)-L(anchor))(r9)
+ lfd fp13,(L(C4)-L(anchor))(r9)
+
+ fmul fp2,fp1,fp1 /* x^2 */
+ lfd fp3,(L(DPone)-L(anchor))(r9)
+
+ fmadd fp4,fp2,fp13,fp12 /* C3+x^2*C4 */
+ fmadd fp4,fp2,fp4,fp11 /* C2+x^2*(C3+x^2*C4) */
+ fmadd fp4,fp2,fp4,fp10 /* C1+x^2*(C2+x^2*(C3+x^2*C4)) */
+ fmadd fp4,fp2,fp4,fp9 /* C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4))) */
+ fmadd fp4,fp2,fp4,fp3 /* 1.0 + x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))) */
+ fmul fp4,fp4,fp0 /* Add in the sign. */
+ frsp fp1,fp4 /* Round to single precision. */
+
+ blr
+
+ .balign 16
+L(greater_or_equal_9pio4):
+ lis r4,INFINITY@h
+ ori r4,r4,INFINITY@l
+ cmpw r3,r4
+ bge L(inf_or_nan)
+
+ lis r4,TWO_P23@h
+ ori r4,r4,TWO_P23@l
+ cmpw r3,r4
+ bge L(greater_or_equal_2p23)
+
+ fabs fp1,fp1 /* |x| */
+
+ /* Calculate quotient of |x|/(PI/4). */
+ lfd fp2,(L(invpio4)-L(anchor))(r9)
+
+ lfd fp3,(L(DPone)-L(anchor))(r9)
+ lfd fp4,(L(DPhalf)-L(anchor))(r9)
+ fmul fp2,fp1,fp2 /* |x|/(PI/4) */
+ friz fp2,fp2 /* n = floor(|x|/(PI/4)) */
+
+ /* Calculate (n + 1) / 2. */
+ fadd fp2,fp2,fp3 /* n + 1 */
+ fmul fp3,fp2,fp4 /* (n + 1) / 2 */
+ friz fp3,fp3
+
+ lfd fp4,(L(pio2hi)-L(anchor))(r9)
+ lfd fp5,(L(pio2lo)-L(anchor))(r9)
+
+ fmul fp6,fp4,fp3
+ fadd fp6,fp6,fp1
+ fmadd fp1,fp5,fp3,fp6
+
+ fctiduz fp2,fp2
+ mfvsrd r7,v2 /* n + 1 */
+
+ b L(reduced)
+
+ .balign 16
+L(inf_or_nan):
+ bne L(skip_errno_setting) /* Is a NAN? */
+
+ /* We delayed the creation of the stack frame, as well as the saving of
+ the link register, because only at this point, we are sure that
+ doing so is actually needed. */
+
+ stfd fp1,-8(r1)
+
+ /* Save the link register. */
+ mflr r0
+ std r0,16(r1)
+ cfi_offset(lr, 16)
+
+ /* Create the stack frame. */
+ stdu r1,-FRAMESIZE(r1)
+ cfi_adjust_cfa_offset(FRAMESIZE)
+
+ bl JUMPTARGET(__errno_location)
+ nop
+
+ /* Restore the stack frame. */
+ addi r1,r1,FRAMESIZE
+ cfi_adjust_cfa_offset(-FRAMESIZE)
+ /* Restore the link register. */
+ ld r0,16(r1)
+ mtlr r0
+
+ lfd fp1,-8(r1)
+
+ /* errno = EDOM */
+ li r4,EDOM
+ stw r4,0(r3)
+
+L(skip_errno_setting):
+ fsub fp1,fp1,fp1 /* x - x */
+ blr
+
+ .balign 16
+L(greater_or_equal_2p23):
+ fabs fp1,fp1
+
+ srwi r4,r3,FLOAT_EXPONENT_SHIFT
+ subi r4,r4,FLOAT_EXPONENT_BIAS
+
+ /* We reduce the input modulo pi/4, so we need 3 bits of integer
+ to determine where in 2*pi we are. Index into our array
+ accordingly. */
+ addi r4,r4,INTEGER_BITS
+
+ /* To avoid an expensive divide, for the range we care about (0 - 127)
+ we can transform x/28 into:
+
+ x/28 = (x * ((0x100000000 / 28) + 1)) >> 32
+
+ mulhwu returns the top 32 bits of the 64 bit result, doing the
+ shift for us in the same instruction. The top 32 bits are undefined,
+ so we have to mask them. */
+
+ lis r6,FX_FRACTION_1_28@h
+ ori r6,r6,FX_FRACTION_1_28@l
+ mulhwu r5,r4,r6
+ clrldi r5,r5,32
+
+ /* Get our pointer into the invpio4_table array. */
+ sldi r4,r5,3
+ addi r6,r9,(L(invpio4_table)-L(anchor))
+ add r4,r4,r6
+
+ lfd fp2,0(r4)
+ lfd fp3,8(r4)
+ lfd fp4,16(r4)
+ lfd fp5,24(r4)
+
+ fmul fp6,fp2,fp1
+ fmul fp7,fp3,fp1
+ fmul fp8,fp4,fp1
+ fmul fp9,fp5,fp1
+
+ /* Mask off larger integer bits in highest double word that we don't
+ care about to avoid losing precision when combining with smaller
+ values. */
+ fctiduz fp10,fp6
+ mfvsrd r7,v10
+ rldicr r7,r7,0,(63-INTEGER_BITS)
+ mtvsrd v10,r7
+ fcfidu fp10,fp10 /* Integer bits. */
+
+ fsub fp6,fp6,fp10 /* highest -= integer bits */
+
+ /* Work out the integer component, rounded down. Use the top two
+ limbs for this. */
+ fadd fp10,fp6,fp7 /* highest + higher */
+
+ fctiduz fp10,fp10
+ mfvsrd r7,v10
+ andi. r0,r7,1
+ fcfidu fp10,fp10
+
+ /* Subtract integer component from highest limb. */
+ fsub fp12,fp6,fp10
+
+ beq L(even_integer)
+
+ /* Our integer component is odd, so we are in the -PI/4 to 0 primary
+ region. We need to shift our result down by PI/4, and to do this
+ in the mod (4/PI) space we simply subtract 1. */
+ lfd fp11,(L(DPone)-L(anchor))(r9)
+ fsub fp12,fp12,fp11
+
+ /* Now add up all the limbs in order. */
+ fadd fp12,fp12,fp7
+ fadd fp12,fp12,fp8
+ fadd fp12,fp12,fp9
+
+ /* And finally multiply by pi/4. */
+ lfd fp13,(L(pio4)-L(anchor))(r9)
+ fmul fp1,fp12,fp13
+
+ addi r7,r7,1
+ b L(reduced)
+
+L(even_integer):
+ lfd fp11,(L(DPone)-L(anchor))(r9)
+
+ /* Now add up all the limbs in order. */
+ fadd fp12,fp12,fp7
+ fadd fp12,fp12,fp8
+ fadd fp12,fp12,fp9
+
+ /* We need to check if the addition of all the limbs resulted in us
+ overflowing 1.0. */
+ fcmpu 0,fp12,fp11
+ bgt L(greater_than_one)
+
+ /* And finally multiply by pi/4. */
+ lfd fp13,(L(pio4)-L(anchor))(r9)
+ fmul fp1,fp12,fp13
+
+ addi r7,r7,1
+ b L(reduced)
+
+L(greater_than_one):
+ /* We did overflow 1.0 when adding up all the limbs. Add 1.0 to our
+ integer, and subtract 1.0 from our result. Since that makes the
+ integer component odd, we need to subtract another 1.0 as
+ explained above. */
+ addi r7,r7,1
+
+ lfd fp11,(L(DPtwo)-L(anchor))(r9)
+ fsub fp12,fp12,fp11
+
+ /* And finally multiply by pi/4. */
+ lfd fp13,(L(pio4)-L(anchor))(r9)
+ fmul fp1,fp12,fp13
+
+ addi r7,r7,1
+ b L(reduced)
+
+ .balign 16
+L(less_2pn5):
+ lis r4,TWO_PN27@h
+ ori r4,r4,TWO_PN27@l
+
+ cmpw r3,r4
+ blt L(less_2pn27)
+
+ /* A simpler Chebyshev approximation is close enough for this range:
+ x+x^3*(SS0+x^2*SS1). */
+
+ lfd fp10,(L(SS0)-L(anchor))(r9)
+ lfd fp11,(L(SS1)-L(anchor))(r9)
+
+ fmul fp2,fp1,fp1 /* x^2 */
+ fmul fp3,fp2,fp1 /* x^3 */
+
+ fmadd fp4,fp2,fp11,fp10 /* SS0+x^2*SS1 */
+ fmadd fp1,fp3,fp4,fp1 /* x+x^3*(SS0+x^2*SS1) */
+
+ frsp fp1,fp1 /* Round to single precision. */
+
+ blr
+
+ .balign 16
+L(less_2pn27):
+ cmpwi r3,0
+ beq L(zero)
+
+ /* Handle some special cases:
+
+ sinf(subnormal) raises inexact/underflow
+ sinf(min_normalized) raises inexact/underflow
+ sinf(normalized) raises inexact. */
+
+ lfd fp2,(L(small)-L(anchor))(r9)
+
+ fmul fp2,fp1,fp2 /* x * small */
+ fsub fp1,fp1,fp2 /* x - x * small */
+
+ frsp fp1,fp1
+
+ blr
+
+ .balign 16
+L(zero):
+ blr
+
+END (__sinf)
+
+ .section .rodata, "a"
+
+ .balign 8
+
+L(anchor):
+
+ /* Chebyshev constants for sin, range -PI/4 - PI/4. */
+L(S0): .8byte 0xbfc5555555551cd9
+L(S1): .8byte 0x3f81111110c2688b
+L(S2): .8byte 0xbf2a019f8b4bd1f9
+L(S3): .8byte 0x3ec71d7264e6b5b4
+L(S4): .8byte 0xbe5a947e1674b58a
+
+ /* Chebyshev constants for sin, range 2^-27 - 2^-5. */
+L(SS0): .8byte 0xbfc555555543d49d
+L(SS1): .8byte 0x3f8110f475cec8c5
+
+ /* Chebyshev constants for cos, range -PI/4 - PI/4. */
+L(C0): .8byte 0xbfdffffffffe98ae
+L(C1): .8byte 0x3fa55555545c50c7
+L(C2): .8byte 0xbf56c16b348b6874
+L(C3): .8byte 0x3efa00eb9ac43cc0
+L(C4): .8byte 0xbe923c97dd8844d7
+
+L(invpio2):
+ .8byte 0x3fe45f306dc9c883 /* 2/PI */
+
+L(invpio4):
+ .8byte 0x3ff45f306dc9c883 /* 4/PI */
+
+L(invpio4_table):
+ .8byte 0x0000000000000000
+ .8byte 0x3ff45f306c000000
+ .8byte 0x3e3c9c882a000000
+ .8byte 0x3c54fe13a8000000
+ .8byte 0x3aaf47d4d0000000
+ .8byte 0x38fbb81b6c000000
+ .8byte 0x3714acc9e0000000
+ .8byte 0x3560e4107c000000
+ .8byte 0x33bca2c756000000
+ .8byte 0x31fbd778ac000000
+ .8byte 0x300b7246e0000000
+ .8byte 0x2e5d2126e8000000
+ .8byte 0x2c97003248000000
+ .8byte 0x2ad77504e8000000
+ .8byte 0x290921cfe0000000
+ .8byte 0x274deb1cb0000000
+ .8byte 0x25829a73e0000000
+ .8byte 0x23fd1046be000000
+ .8byte 0x2224baed10000000
+ .8byte 0x20709d338e000000
+ .8byte 0x1e535a2f80000000
+ .8byte 0x1cef904e64000000
+ .8byte 0x1b0d639830000000
+ .8byte 0x1964ce7d24000000
+ .8byte 0x17b908bf16000000
+
+L(pio4):
+ .8byte 0x3fe921fb54442d18 /* PI/4 */
+
+/* PI/2 as a sum of two doubles. We only use 32 bits of the upper limb
+ to avoid losing significant bits when multiplying with up to
+ (2^22)/(pi/2). */
+L(pio2hi):
+ .8byte 0xbff921fb54400000
+
+L(pio2lo):
+ .8byte 0xbdd0b4611a626332
+
+L(pio2_table):
+ .8byte 0
+ .8byte 0x3ff921fb54442d18 /* 1 * PI/2 */
+ .8byte 0x400921fb54442d18 /* 2 * PI/2 */
+ .8byte 0x4012d97c7f3321d2 /* 3 * PI/2 */
+ .8byte 0x401921fb54442d18 /* 4 * PI/2 */
+ .8byte 0x401f6a7a2955385e /* 5 * PI/2 */
+ .8byte 0x4022d97c7f3321d2 /* 6 * PI/2 */
+ .8byte 0x4025fdbbe9bba775 /* 7 * PI/2 */
+ .8byte 0x402921fb54442d18 /* 8 * PI/2 */
+ .8byte 0x402c463abeccb2bb /* 9 * PI/2 */
+ .8byte 0x402f6a7a2955385e /* 10 * PI/2 */
+
+L(small):
+ .8byte 0x3cd0000000000000 /* 2^-50 */
+
+L(ones):
+ .8byte 0x3ff0000000000000 /* +1.0 */
+ .8byte 0xbff0000000000000 /* -1.0 */
+
+L(DPhalf):
+ .8byte 0x3fe0000000000000 /* 0.5 */
+
+L(DPone):
+ .8byte 0x3ff0000000000000 /* 1.0 */
+
+L(DPtwo):
+ .8byte 0x4000000000000000 /* 2.0 */
+
+libm_alias_float (__sin, sin)
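
The comment before FX_FRACTION_1_28 (in both s_sinf.S and s_cosf.S above) describes replacing x/28 by a multiply-high for x in 0..127. The identity is easy to verify on the host; the value 0x9249250 used in the files appears to be slightly larger than the exact 0x100000000/28 + 1, and the check below confirms the quotient is still exact over this range. This is a standalone verification sketch, not glibc code.

    /* Verify the reciprocal-multiplication trick described above:
       for x in 0..127, (x * FX_FRACTION_1_28) >> 32 equals x / 28.  */
    #include <stdint.h>
    #include <stdio.h>

    #define FX_FRACTION_1_28 0x9249250u   /* constant used in s_sinf.S/s_cosf.S */

    int
    main (void)
    {
      int bad = 0;
      for (uint32_t x = 0; x < 128; x++)
        {
          uint32_t q = (uint32_t) (((uint64_t) x * FX_FRACTION_1_28) >> 32);
          if (q != x / 28)
            {
              printf ("mismatch at x = %u: got %u, want %u\n", x, q, x / 28);
              bad = 1;
            }
        }
      if (!bad)
        puts ("x/28 == (x * FX_FRACTION_1_28) >> 32 for all x in 0..127");
      return bad;
    }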
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/w_expf.c b/sysdeps/powerpc/powerpc64/power8/fpu/w_expf.c
new file mode 100644
index 0000000000..b5fe164520
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/fpu/w_expf.c
@@ -0,0 +1 @@
+#include <sysdeps/../math/w_expf.c>
diff --git a/sysdeps/powerpc/powerpc64/power8/memchr.S b/sysdeps/powerpc/powerpc64/power8/memchr.S
new file mode 100644
index 0000000000..45ba1b479a
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/memchr.S
@@ -0,0 +1,335 @@
+/* Optimized memchr implementation for POWER8.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* void *[r3] memchr (const void *s [r3], int c [r4], size_t n [r5]) */
+
+/* TODO: change these to the actual instructions when the minimum required
+ binutils allows it. */
+#define MTVRD(v, r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16)))
+#define MFVRD(r, v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
+#define VBPERMQ(t, a, b) .long (0x1000054c \
+ | ((t)<<(32-11)) \
+ | ((a)<<(32-16)) \
+ | ((b)<<(32-21)) )
+
+#ifndef MEMCHR
+# define MEMCHR __memchr
+#endif
+/* TODO: change this to .machine power8 when the minimum required binutils
+ allows it. */
+ .machine power7
+ENTRY_TOCLESS (MEMCHR)
+ CALL_MCOUNT 3
+ dcbt 0, r3
+ clrrdi r8, r3, 3
+ insrdi r4, r4, 8, 48
+
+ /* Calculate the last acceptable address and check for possible
+ addition overflow by using saturated math:
+ r7 = r3 + r5
+ r7 |= -(r7 < r3) */
+ add r7, r3, r5
+ subfc r6, r3, r7
+ subfe r9, r9, r9
+ extsw r6, r9
+ or r7, r7, r6
+
+ insrdi r4, r4, 16, 32
+ cmpldi r5, 32
+ li r9, -1
+ rlwinm r6, r3, 3, 26, 28 /* Calculate padding. */
+ insrdi r4, r4, 32, 0
+ mr r10, r7
+ addi r7, r7, -1
+#ifdef __LITTLE_ENDIAN__
+ sld r9, r9, r6
+#else
+ srd r9, r9, r6
+#endif
+ ble L(small_range)
+ andi. r11, r3, 63
+ beq cr0, L(align_qw)
+ clrldi r11, r3, 61
+ ld r12, 0(r8) /* Load doubleword from memory. */
+ cmpb r3, r12, r4 /* Check for BYTEs in DWORD1. */
+ and r3, r3, r9
+ clrldi r6, r7, 61 /* Byte count - 1 in last dword. */
+ clrrdi r7, r7, 3 /* Address of last doubleword. */
+ cmpldi cr7, r3, 0 /* Does r3 indicate we got a hit? */
+ bne cr7, L(done)
+ addi r8, r8, 8
+ addi r5, r5, -8
+ add r5, r5, r11
+
+ /* Are we now aligned to a quadword boundary? */
+ andi. r11, r8, 15
+ beq cr0, L(align_qw)
+
+ /* Handle DWORD to make it QW aligned. */
+ ld r12, 0(r8)
+ cmpb r3, r12, r4
+ cmpldi cr7, r3, 0
+ bne cr7, L(done)
+ addi r5, r5, -8
+ addi r8, r8, 8
+ /* At this point, r8 is 16B aligned. */
+L(align_qw):
+ vspltisb v0, 0
+ /* Precompute vbpermq constant. */
+ vspltisb v10, 3
+ li r0, 0
+ lvsl v11, r0, r0
+ vslb v10, v11, v10
+ MTVRD(v1, r4)
+ vspltb v1, v1, 7
+ cmpldi r5, 64
+ ble L(tail64)
+ /* Are we 64-byte aligned? If so, jump to the vectorized loop.
+ Note: aligning to 64-byte will necessarily slow down performance for
+ strings around 64 bytes in length due to the extra comparisons
+ required to check alignment for the vectorized loop. This is a
+ necessary tradeoff we are willing to take in order to speed up the
+ calculation for larger strings. */
+ andi. r11, r8, 63
+ beq cr0, L(preloop_64B)
+ /* In order to begin the 64B loop, it needs to be 64
+ bytes aligned. So read until it is 64B aligned. */
+ lvx v4, 0, r8
+ vcmpequb v6, v1, v4
+ vcmpequb. v11, v0, v6
+ bnl cr6, L(found_16B)
+ addi r8, r8, 16
+ addi r5, r5, -16
+
+ andi. r11, r8, 63
+ beq cr0, L(preloop_64B)
+ lvx v4, 0, r8
+ vcmpequb v6, v1, v4
+ vcmpequb. v11, v0, v6
+ bnl cr6, L(found_16B)
+ addi r8, r8, 16
+ addi r5, r5, -16
+
+ andi. r11, r8, 63
+ beq cr0, L(preloop_64B)
+ lvx v4, 0, r8
+ vcmpequb v6, v1, v4
+ vcmpequb. v11, v0, v6
+ bnl cr6, L(found_16B)
+ addi r8, r8, 16
+ addi r5, r5, -16
+ /* At this point it should be 64B aligned.
+ Prepare for the 64B loop. */
+L(preloop_64B):
+ cmpldi r5, 64 /* Check if r5 < 64. */
+ ble L(tail64)
+ sub r6, r10, r8
+ srdi r9, r6, 6 /* Number of loop iterations. */
+ mtctr r9 /* Setup the counter. */
+ li r11, 16 /* Load required offsets. */
+ li r9, 32
+ li r7, 48
+
+ /* Handle r5 > 64. Loop over the bytes in strides of 64B. */
+ .align 4
+L(loop):
+ lvx v2, 0, r8 /* Load 4 quadwords. */
+ lvx v3, r8, r11
+ lvx v4, r8, r9
+ lvx v5, r8, r7
+ vcmpequb v6, v1, v2
+ vcmpequb v7, v1, v3
+ vcmpequb v8, v1, v4
+ vcmpequb v9, v1, v5
+ vor v11, v6, v7
+ vor v12, v8, v9
+ vor v11, v11, v12 /* Compare and merge into one VR for speed. */
+ vcmpequb. v11, v0, v11
+ bnl cr6, L(found)
+ addi r8, r8, 64 /* Adjust address for the next iteration. */
+ bdnz L(loop)
+ clrldi r5, r6, 58
+
+ /* Handle remainder of 64B loop or r5 > 64. */
+ .align 4
+L(tail64):
+ cmpldi r5, 0
+ beq L(null)
+ lvx v4, 0, r8
+ vcmpequb v6, v1, v4
+ vcmpequb. v11, v0, v6
+ bnl cr6, L(found_16B)
+ addi r8, r8, 16
+ cmpldi cr6, r5, 16
+ ble cr6, L(null)
+ addi r5, r5, -16
+
+ lvx v4, 0, r8
+ vcmpequb v6, v1, v4
+ vcmpequb. v11, v0, v6
+ bnl cr6, L(found_16B)
+ addi r8, r8, 16
+ cmpldi cr6, r5, 16
+ ble cr6, L(null)
+ addi r5, r5, -16
+
+ lvx v4, 0, r8
+ vcmpequb v6, v1, v4
+ vcmpequb. v11, v0, v6
+ bnl cr6, L(found_16B)
+ addi r8, r8, 16
+ cmpldi cr6, r5, 16
+ ble cr6, L(null)
+ addi r5, r5, -16
+
+ lvx v4, 0, r8
+ vcmpequb v6, v1, v4
+ vcmpequb. v11, v0, v6
+ bnl cr6, L(found_16B)
+ li r3, 0
+ blr
+
+ /* Found a match in 64B loop. */
+ .align 4
+L(found):
+ /* Permute the first bit of each byte into bits 48-63. */
+ VBPERMQ(v6, v6, v10)
+ VBPERMQ(v7, v7, v10)
+ VBPERMQ(v8, v8, v10)
+ VBPERMQ(v9, v9, v10)
+ /* Shift each component into its correct position for merging. */
+#ifdef __LITTLE_ENDIAN__
+ vsldoi v7, v7, v7, 2
+ vsldoi v8, v8, v8, 4
+ vsldoi v9, v9, v9, 6
+#else
+ vsldoi v6, v6, v6, 6
+ vsldoi v7, v7, v7, 4
+ vsldoi v8, v8, v8, 2
+#endif
+ /* Merge the results and move to a GPR. */
+ vor v11, v6, v7
+ vor v4, v9, v8
+ vor v4, v11, v4
+ MFVRD(r5, v4)
+#ifdef __LITTLE_ENDIAN__
+ addi r6, r5, -1
+ andc r6, r6, r5
+ popcntd r6, r6
+#else
+ cntlzd r6, r5 /* Count leading zeros before the match. */
+#endif
+ add r3, r8, r6 /* Compute the address of the match. */
+ blr
+
+ /* Found a match in last 16 bytes. */
+ .align 4
+L(found_16B):
+ /* Permute the first bit of each byte into bits 48-63. */
+ VBPERMQ(v6, v6, v10)
+ /* Shift each component into its correct position for merging. */
+#ifdef __LITTLE_ENDIAN__
+ MFVRD(r7, v6)
+ addi r6, r7, -1
+ andc r6, r6, r7
+ popcntd r6, r6
+#else
+ vsldoi v6, v6, v6, 6
+ MFVRD(r7, v6)
+ cntlzd r6, r7 /* Count leading zeros before the match. */
+#endif
+ add r3, r8, r6 /* Compute the address of the match. */
+ cmpld r6, r5
+ bltlr
+ li r3, 0
+ blr
+
+ .align 4
+ /* r3 has the output of the cmpb instruction, that is, it contains
+ 0xff in the same position as BYTE in the original
+ doubleword from the string. Use that to calculate the pointer.
+ We need to make sure BYTE is *before* the end of the range. */
+L(done):
+#ifdef __LITTLE_ENDIAN__
+ addi r0, r3, -1
+ andc r0, r0, r3
+ popcntd r0, r0 /* Count trailing zeros. */
+#else
+ cntlzd r0, r3 /* Count leading zeros before the match. */
+#endif
+ cmpld r8, r7 /* Are we on the last dword? */
+ srdi r0, r0, 3 /* Convert leading/trailing zeros to bytes. */
+ add r3, r8, r0
+ cmpld cr7, r0, r6 /* If on the last dword, check byte offset. */
+ bnelr
+ blelr cr7
+ li r3, 0
+ blr
+
+ .align 4
+L(null):
+ li r3, 0
+ blr
+
+/* Deals with size <= 32. */
+ .align 4
+L(small_range):
+ cmpldi r5, 0
+ beq L(null)
+ ld r12, 0(r8) /* Load word from memory. */
+ cmpb r3, r12, r4 /* Check for BYTE in DWORD1. */
+ and r3, r3, r9
+ cmpldi cr7, r3, 0
+ clrldi r6, r7, 61 /* Byte count - 1 in last dword. */
+ clrrdi r7, r7, 3 /* Address of last doubleword. */
+ cmpld r8, r7 /* Are we done already? */
+ bne cr7, L(done)
+ beqlr
+
+ ldu r12, 8(r8)
+ cmpb r3, r12, r4
+ cmpldi cr6, r3, 0
+ cmpld r8, r7
+ bne cr6, L(done) /* Found something. */
+ beqlr /* Hit end of string (length). */
+
+ ldu r12, 8(r8)
+ cmpb r3, r12, r4
+ cmpldi cr6, r3, 0
+ cmpld r8, r7
+ bne cr6, L(done)
+ beqlr
+
+ ldu r12, 8(r8)
+ cmpb r3, r12, r4
+ cmpldi cr6, r3, 0
+ cmpld r8, r7
+ bne cr6, L(done)
+ beqlr
+
+ ldu r12, 8(r8)
+ cmpb r3, r12, r4
+ cmpldi cr6, r3, 0
+ bne cr6, L(done)
+ blr
+
+END (MEMCHR)
+weak_alias (__memchr, memchr)
+libc_hidden_builtin_def (memchr)
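
The "saturated math" comment near the top of memchr.S above computes the last acceptable address so that s + n wrapping past the top of the address space clamps to all-ones rather than producing a bogus small pointer. The same idea in C, as a sketch with a hypothetical helper name (saturated_end is not a glibc function):

    /* C sketch of the saturating end-address computation:
         end = s + n;  end |= -(end < s);
       If the addition wraps, (end < s) is 1 and the mask sets every bit,
       clamping end to the largest representable address.  */
    #include <stdint.h>
    #include <stdio.h>

    static uintptr_t
    saturated_end (uintptr_t s, uintptr_t n)
    {
      uintptr_t end = s + n;            /* may wrap around */
      end |= -(uintptr_t) (end < s);    /* all-ones mask on overflow */
      return end;
    }

    int
    main (void)
    {
      printf ("%#jx\n", (uintmax_t) saturated_end (0x1000, 0x20));          /* 0x1020 */
      printf ("%#jx\n", (uintmax_t) saturated_end (UINTPTR_MAX - 8, 64));   /* clamps */
      return 0;
    }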
diff --git a/sysdeps/powerpc/powerpc64/power8/memcmp.S b/sysdeps/powerpc/powerpc64/power8/memcmp.S
new file mode 100644
index 0000000000..ec4ccf3382
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/memcmp.S
@@ -0,0 +1,1447 @@
+/* Optimized memcmp implementation for POWER7/PowerPC64.
+ Copyright (C) 2010-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* int [r3] memcmp (const char *s1 [r3],
+ const char *s2 [r4],
+ size_t size [r5]) */
+
+/* TODO: change these to the actual instructions when the minimum required
+ binutils allows it. */
+#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
+#ifndef MEMCMP
+# define MEMCMP memcmp
+#endif
+ .machine power7
+ENTRY_TOCLESS (MEMCMP, 4)
+ CALL_MCOUNT 3
+
+#define rRTN r3
+#define rSTR1 r3 /* First string arg. */
+#define rSTR2 r4 /* Second string arg. */
+#define rN r5 /* Max string length. */
+#define rWORD1 r6 /* Current word in s1. */
+#define rWORD2 r7 /* Current word in s2. */
+#define rWORD3 r8 /* Next word in s1. */
+#define rWORD4 r9 /* Next word in s2. */
+#define rWORD5 r10 /* Next word in s1. */
+#define rWORD6 r11 /* Next word in s2. */
+
+#define rOFF8 r20 /* 8 bytes offset. */
+#define rOFF16 r21 /* 16 bytes offset. */
+#define rOFF24 r22 /* 24 bytes offset. */
+#define rOFF32 r23 /* 32 bytes offset. */
+#define rWORD6_SHIFT r24 /* Left rotation temp for rWORD8. */
+#define rWORD4_SHIFT r25 /* Left rotation temp for rWORD6. */
+#define rWORD2_SHIFT r26 /* Left rotation temp for rWORD4. */
+#define rWORD8_SHIFT r27 /* Left rotation temp for rWORD2. */
+#define rSHR r28 /* Unaligned shift right count. */
+#define rSHL r29 /* Unaligned shift left count. */
+#define rWORD7 r30 /* Next word in s1. */
+#define rWORD8 r31 /* Next word in s2. */
+
+#define rWORD8SAVE (-8)
+#define rWORD7SAVE (-16)
+#define rOFF8SAVE (-24)
+#define rOFF16SAVE (-32)
+#define rOFF24SAVE (-40)
+#define rOFF32SAVE (-48)
+#define rSHRSAVE (-56)
+#define rSHLSAVE (-64)
+#define rWORD8SHIFTSAVE (-72)
+#define rWORD2SHIFTSAVE (-80)
+#define rWORD4SHIFTSAVE (-88)
+#define rWORD6SHIFTSAVE (-96)
+
+#ifdef __LITTLE_ENDIAN__
+# define LD ldbrx
+#else
+# define LD ldx
+#endif
+
+ xor r10, rSTR2, rSTR1
+ cmpldi cr6, rN, 0
+ cmpldi cr1, rN, 8
+ clrldi. r0, r10, 61
+ clrldi r12, rSTR1, 61
+ cmpldi cr5, r12, 0
+ beq- cr6, L(zeroLength)
+ dcbt 0, rSTR1
+ dcbt 0, rSTR2
+ /* If less than 8 bytes or not aligned, use the unaligned
+ byte loop. */
+ blt cr1, L(bytealigned)
+ bne L(unalignedqw)
+/* At this point we know both strings have the same alignment and the
+ compare length is at least 8 bytes. r12 contains the low order
+ 3 bits of rSTR1 and cr5 contains the result of the logical compare
+ of r12 to 0. If r12 == 0 then we are already double word
+ aligned and can perform the DW aligned loop. */
+
+ .align 4
+L(samealignment):
+ or r11, rSTR2, rSTR1
+ clrldi. r11, r11, 60
+ beq L(qw_align)
+ /* Try to align to QW else proceed to DW loop. */
+ clrldi. r10, r10, 60
+ bne L(DW)
+	/* The strings share the same 16-byte misalignment, so compare
+	   as DWs until both reach QW alignment.  */
+ clrrdi rSTR1, rSTR1, 3
+ clrrdi rSTR2, rSTR2, 3
+ subfic r10, r12, 8
+ LD rWORD1, 0, rSTR1
+ LD rWORD2, 0, rSTR2
+ sldi r9, r10, 3
+ subfic r9, r9, 64
+ sld rWORD1, rWORD1, r9
+ sld rWORD2, rWORD2, r9
+ cmpld cr6, rWORD1, rWORD2
+ addi rSTR1, rSTR1, 8
+ addi rSTR2, rSTR2, 8
+ bne cr6, L(ret_diff)
+ subf rN, r10, rN
+
+ cmpld cr6, r11, r12
+ bgt cr6, L(qw_align)
+ LD rWORD1, 0, rSTR1
+ LD rWORD2, 0, rSTR2
+ cmpld cr6, rWORD1, rWORD2
+ addi rSTR1, rSTR1, 8
+ addi rSTR2, rSTR2, 8
+ bne cr6, L(different)
+ cmpldi cr6, rN, 8
+ ble cr6, L(zeroLength)
+ addi rN, rN, -8
+ /* Now both rSTR1 and rSTR2 are aligned to QW. */
+ .align 4
+L(qw_align):
+ vspltisb v0, 0
+ srdi. r6, rN, 6
+ li r8, 16
+ li r10, 32
+ li r11, 48
+ ble cr0, L(lessthan64)
+ mtctr r6
+ vspltisb v8, 0
+ vspltisb v6, 0
+ /* Aligned vector loop. */
+ .align 4
+L(aligned_loop):
+ lvx v4, 0, rSTR1
+ lvx v5, 0, rSTR2
+ vcmpequb. v7, v6, v8
+ bnl cr6, L(different3)
+ lvx v6, rSTR1, r8
+ lvx v8, rSTR2, r8
+ vcmpequb. v7, v5, v4
+ bnl cr6, L(different2)
+ lvx v4, rSTR1, r10
+ lvx v5, rSTR2, r10
+ vcmpequb. v7, v6, v8
+ bnl cr6, L(different3)
+ lvx v6, rSTR1, r11
+ lvx v8, rSTR2, r11
+ vcmpequb. v7, v5, v4
+ bnl cr6, L(different2)
+ addi rSTR1, rSTR1, 64
+ addi rSTR2, rSTR2, 64
+ bdnz L(aligned_loop)
+ vcmpequb. v7, v6, v8
+ bnl cr6, L(different3)
+ clrldi rN, rN, 58
+ /* Handle remainder for aligned loop. */
+ .align 4
+L(lessthan64):
+ mr r9, rSTR1
+ cmpdi cr6, rN, 0
+ li rSTR1, 0
+ blelr cr6
+ lvx v4, 0, r9
+ lvx v5, 0, rSTR2
+ vcmpequb. v7, v5, v4
+ bnl cr6, L(different1)
+ addi rN, rN, -16
+
+ cmpdi cr6, rN, 0
+ blelr cr6
+ lvx v4, r9, r8
+ lvx v5, rSTR2, r8
+ vcmpequb. v7, v5, v4
+ bnl cr6, L(different1)
+ addi rN, rN, -16
+
+ cmpdi cr6, rN, 0
+ blelr cr6
+ lvx v4, r9, r10
+ lvx v5, rSTR2, r10
+ vcmpequb. v7, v5, v4
+ bnl cr6, L(different1)
+ addi rN, rN, -16
+
+ cmpdi cr6, rN, 0
+ blelr cr6
+ lvx v4, r9, r11
+ lvx v5, rSTR2, r11
+ vcmpequb. v7, v5, v4
+ bnl cr6, L(different1)
+ blr
+
+ /* Calculate and return the difference. */
+ .align 4
+L(different1):
+ cmpdi cr6, rN, 16
+ bge cr6, L(different2)
+ /* Discard unwanted bytes. */
+#ifdef __LITTLE_ENDIAN__
+ lvsr v1, 0, rN
+ vperm v4, v4, v0, v1
+ vperm v5, v5, v0, v1
+#else
+ lvsl v1, 0, rN
+ vperm v4, v0, v4, v1
+ vperm v5, v0, v5, v1
+#endif
+ vcmpequb. v7, v4, v5
+ li rRTN, 0
+ bltlr cr6
+ .align 4
+L(different2):
+#ifdef __LITTLE_ENDIAN__
+ /* Reverse bytes for direct comparison. */
+ lvsl v10, r0, r0
+ vspltisb v8, 15
+ vsububm v9, v8, v10
+ vperm v4, v4, v0, v9
+ vperm v5, v5, v0, v9
+#endif
+ MFVRD(r7, v4)
+ MFVRD(r9, v5)
+ cmpld cr6, r7, r9
+ bne cr6, L(ret_diff)
+ /* Difference in second DW. */
+ vsldoi v4, v4, v4, 8
+ vsldoi v5, v5, v5, 8
+ MFVRD(r7, v4)
+ MFVRD(r9, v5)
+ cmpld cr6, r7, r9
+L(ret_diff):
+ li rRTN, 1
+ bgtlr cr6
+ li rRTN, -1
+ blr
+ .align 4
+L(different3):
+#ifdef __LITTLE_ENDIAN__
+ /* Reverse bytes for direct comparison. */
+ vspltisb v9, 15
+ lvsl v10, r0, r0
+ vsububm v9, v9, v10
+ vperm v6, v6, v0, v9
+ vperm v8, v8, v0, v9
+#endif
+ MFVRD(r7, v6)
+ MFVRD(r9, v8)
+ cmpld cr6, r7, r9
+ bne cr6, L(ret_diff)
+ /* Difference in second DW. */
+ vsldoi v6, v6, v6, 8
+ vsldoi v8, v8, v8, 8
+ MFVRD(r7, v6)
+ MFVRD(r9, v8)
+ cmpld cr6, r7, r9
+ li rRTN, 1
+ bgtlr cr6
+ li rRTN, -1
+ blr
+
+ .align 4
+L(different):
+ cmpldi cr7, rN, 8
+ bgt cr7, L(end)
+ /* Skip unwanted bytes. */
+ sldi r8, rN, 3
+ subfic r8, r8, 64
+ srd rWORD1, rWORD1, r8
+ srd rWORD2, rWORD2, r8
+ cmpld cr6, rWORD1, rWORD2
+ li rRTN, 0
+ beqlr cr6
+L(end):
+ li rRTN, 1
+ bgtlr cr6
+ li rRTN, -1
+ blr
+
+ .align 4
+L(unalignedqw):
+	/* Proceed to the DW unaligned loop if there is a chance of a
+	   page cross.  */
+ rldicl r9, rSTR1, 0, 52
+ add r9, r9, rN
+ cmpldi cr0, r9, 4096-16
+ bgt cr0, L(unaligned)
+ rldicl r9, rSTR2, 0, 52
+ add r9, r9, rN
+ cmpldi cr0, r9, 4096-16
+ bgt cr0, L(unaligned)
+ li r0, 0
+ li r8, 16
+ vspltisb v0, 0
+ /* Check if rSTR1 is aligned to QW. */
+ andi. r11, rSTR1, 0xF
+ beq L(s1_align)
+
+ /* Compare 16B and align S1 to QW. */
+#ifdef __LITTLE_ENDIAN__
+ lvsr v10, 0, rSTR1 /* Compute mask. */
+ lvsr v6, 0, rSTR2 /* Compute mask. */
+#else
+ lvsl v10, 0, rSTR1 /* Compute mask. */
+ lvsl v6, 0, rSTR2 /* Compute mask. */
+#endif
+ lvx v5, 0, rSTR2
+ lvx v9, rSTR2, r8
+#ifdef __LITTLE_ENDIAN__
+ vperm v5, v9, v5, v6
+#else
+ vperm v5, v5, v9, v6
+#endif
+ lvx v4, 0, rSTR1
+ lvx v9, rSTR1, r8
+#ifdef __LITTLE_ENDIAN__
+ vperm v4, v9, v4, v10
+#else
+ vperm v4, v4, v9, v10
+#endif
+ vcmpequb. v7, v5, v4
+ bnl cr6, L(different1)
+ cmpldi cr6, rN, 16
+ ble cr6, L(zeroLength)
+ subfic r11, r11, 16
+ subf rN, r11, rN
+ add rSTR1, rSTR1, r11
+ add rSTR2, rSTR2, r11
+
+	/* Now that s1 is QW aligned, prepare for the unaligned loop.  */
+ .align 4
+L(s1_align):
+#ifdef __LITTLE_ENDIAN__
+ lvsr v6, 0, rSTR2
+#else
+ lvsl v6, 0, rSTR2
+#endif
+ lvx v5, 0, rSTR2
+ srdi. r6, rN, 6
+ li r10, 32
+ li r11, 48
+ ble cr0, L(lessthan64_unalign)
+ mtctr r6
+ li r9, 64
+ /* Unaligned vector loop. */
+ .align 4
+L(unalign_qwloop):
+ lvx v4, 0, rSTR1
+ lvx v10, rSTR2, r8
+#ifdef __LITTLE_ENDIAN__
+ vperm v5, v10, v5, v6
+#else
+ vperm v5, v5, v10, v6
+#endif
+ vcmpequb. v7, v5, v4
+ bnl cr6, L(different2)
+ vor v5, v10, v10
+ lvx v4, rSTR1, r8
+ lvx v10, rSTR2, r10
+#ifdef __LITTLE_ENDIAN__
+ vperm v5, v10, v5, v6
+#else
+ vperm v5, v5, v10, v6
+#endif
+ vcmpequb. v7, v5, v4
+ bnl cr6, L(different2)
+ vor v5, v10, v10
+ lvx v4, rSTR1, r10
+ lvx v10, rSTR2, r11
+#ifdef __LITTLE_ENDIAN__
+ vperm v5, v10, v5, v6
+#else
+ vperm v5, v5, v10, v6
+#endif
+ vcmpequb. v7, v5, v4
+ bnl cr6, L(different2)
+ vor v5, v10, v10
+ lvx v4, rSTR1, r11
+ lvx v10, rSTR2, r9
+#ifdef __LITTLE_ENDIAN__
+ vperm v5, v10, v5, v6
+#else
+ vperm v5, v5, v10, v6
+#endif
+ vcmpequb. v7, v5, v4
+ bnl cr6, L(different2)
+ vor v5, v10, v10
+ addi rSTR1, rSTR1, 64
+ addi rSTR2, rSTR2, 64
+ bdnz L(unalign_qwloop)
+ clrldi rN, rN, 58
+ /* Handle remainder for unaligned loop. */
+ .align 4
+L(lessthan64_unalign):
+ mr r9, rSTR1
+ cmpdi cr6, rN, 0
+ li rSTR1, 0
+ blelr cr6
+ lvx v4, 0, r9
+ lvx v10, rSTR2, r8
+#ifdef __LITTLE_ENDIAN__
+ vperm v5, v10, v5, v6
+#else
+ vperm v5, v5, v10, v6
+#endif
+ vcmpequb. v7, v5, v4
+ bnl cr6, L(different1)
+ vor v5, v10, v10
+ addi rN, rN, -16
+
+ cmpdi cr6, rN, 0
+ blelr cr6
+ lvx v4, r9, r8
+ lvx v10, rSTR2, r10
+#ifdef __LITTLE_ENDIAN__
+ vperm v5, v10, v5, v6
+#else
+ vperm v5, v5, v10, v6
+#endif
+ vcmpequb. v7, v5, v4
+ bnl cr6, L(different1)
+ vor v5, v10, v10
+ addi rN, rN, -16
+
+ cmpdi cr6, rN, 0
+ blelr cr6
+ lvx v4, r9, r10
+ lvx v10, rSTR2, r11
+#ifdef __LITTLE_ENDIAN__
+ vperm v5, v10, v5, v6
+#else
+ vperm v5, v5, v10, v6
+#endif
+ vcmpequb. v7, v5, v4
+ bnl cr6, L(different1)
+ vor v5, v10, v10
+ addi rN, rN, -16
+
+ cmpdi cr6, rN, 0
+ blelr cr6
+ lvx v4, r9, r11
+ addi r11, r11, 16
+ lvx v10, rSTR2, r11
+#ifdef __LITTLE_ENDIAN__
+ vperm v5, v10, v5, v6
+#else
+ vperm v5, v5, v10, v6
+#endif
+ vcmpequb. v7, v5, v4
+ bnl cr6, L(different1)
+ blr
+
+/* Otherwise we know the two strings have the same alignment (but not
+ yet DW). So we force the string addresses to the next lower DW
+ boundary and special case this first DW using shift left to
+ eliminate bits preceding the first byte. Since we want to join the
+ normal (DW aligned) compare loop, starting at the second double word,
+ we need to adjust the length (rN) and special case the loop
+ versioning for the first DW. This ensures that the loop count is
+ correct and the first DW (shifted) is in the expected register pair. */
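
   In scalar C terms, the first-DW special case described above is roughly
   the following sketch.  Here w1/w2 are the aligned doublewords containing
   the start of s1/s2, already loaded with LD (defined earlier) so that the
   earliest byte sits in the most significant position, and misalign is
   rSTR1 & 7 (identical for both strings on this path):

     #include <stdint.h>

     static int
     first_dw_differs (uint64_t w1, uint64_t w2, unsigned int misalign)
     {
       w1 <<= 8 * misalign;   /* Discard the bytes preceding the strings.  */
       w2 <<= 8 * misalign;
       return w1 != w2;       /* The sld/cmpld pairs below do the same.  */
     }
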
+ .align 4
+L(DW):
+ std rWORD8, rWORD8SAVE(r1)
+ std rWORD7, rWORD7SAVE(r1)
+ std rOFF8, rOFF8SAVE(r1)
+ std rOFF16, rOFF16SAVE(r1)
+ std rOFF24, rOFF24SAVE(r1)
+ std rOFF32, rOFF32SAVE(r1)
+ cfi_offset(rWORD8, rWORD8SAVE)
+ cfi_offset(rWORD7, rWORD7SAVE)
+ cfi_offset(rOFF8, rOFF8SAVE)
+ cfi_offset(rOFF16, rOFF16SAVE)
+ cfi_offset(rOFF24, rOFF24SAVE)
+ cfi_offset(rOFF32, rOFF32SAVE)
+
+ li rOFF8,8
+ li rOFF16,16
+ li rOFF24,24
+ li rOFF32,32
+ clrrdi rSTR1, rSTR1, 3
+ clrrdi rSTR2, rSTR2, 3
+ beq cr5, L(DWaligned)
+ add rN, rN, r12
+ sldi rWORD6, r12, 3
+ srdi r0, rN, 5 /* Divide by 32. */
+ andi. r12, rN, 24 /* Get the DW remainder. */
+ LD rWORD1, 0, rSTR1
+ LD rWORD2, 0, rSTR2
+ cmpldi cr1, r12, 16
+ cmpldi cr7, rN, 32
+ clrldi rN, rN, 61
+ beq L(dPs4)
+ mtctr r0
+ bgt cr1, L(dPs3)
+ beq cr1, L(dPs2)
+
+/* Remainder is 8. */
+ .align 3
+L(dsP1):
+ sld rWORD5, rWORD1, rWORD6
+ sld rWORD6, rWORD2, rWORD6
+ cmpld cr5, rWORD5, rWORD6
+ blt cr7, L(dP1x)
+/* Do something useful in this cycle since we have to branch anyway. */
+ LD rWORD1, rOFF8, rSTR1
+ LD rWORD2, rOFF8, rSTR2
+ cmpld cr7, rWORD1, rWORD2
+ b L(dP1e)
+/* Remainder is 16. */
+ .align 4
+L(dPs2):
+ sld rWORD5, rWORD1, rWORD6
+ sld rWORD6, rWORD2, rWORD6
+ cmpld cr6, rWORD5, rWORD6
+ blt cr7, L(dP2x)
+/* Do something useful in this cycle since we have to branch anyway. */
+ LD rWORD7, rOFF8, rSTR1
+ LD rWORD8, rOFF8, rSTR2
+ cmpld cr5, rWORD7, rWORD8
+ b L(dP2e)
+/* Remainder is 24. */
+ .align 4
+L(dPs3):
+ sld rWORD3, rWORD1, rWORD6
+ sld rWORD4, rWORD2, rWORD6
+ cmpld cr1, rWORD3, rWORD4
+ b L(dP3e)
+/* Count is a multiple of 32, remainder is 0. */
+ .align 4
+L(dPs4):
+ mtctr r0
+ sld rWORD1, rWORD1, rWORD6
+ sld rWORD2, rWORD2, rWORD6
+ cmpld cr7, rWORD1, rWORD2
+ b L(dP4e)
+
+/* At this point we know both strings are double word aligned and the
+ compare length is at least 8 bytes. */
+ .align 4
+L(DWaligned):
+ andi. r12, rN, 24 /* Get the DW remainder. */
+ srdi r0, rN, 5 /* Divide by 32. */
+ cmpldi cr1, r12, 16
+ cmpldi cr7, rN, 32
+ clrldi rN, rN, 61
+ beq L(dP4)
+ bgt cr1, L(dP3)
+ beq cr1, L(dP2)
+
+/* Remainder is 8. */
+ .align 4
+L(dP1):
+ mtctr r0
+/* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
+ (8-15 byte compare), we want to use only volatile registers. This
+ means we can avoid restoring non-volatile registers since we did not
+ change any on the early exit path. The key here is the non-early
+ exit path only cares about the condition code (cr5), not about which
+ register pair was used. */
+ LD rWORD5, 0, rSTR1
+ LD rWORD6, 0, rSTR2
+ cmpld cr5, rWORD5, rWORD6
+ blt cr7, L(dP1x)
+ LD rWORD1, rOFF8, rSTR1
+ LD rWORD2, rOFF8, rSTR2
+ cmpld cr7, rWORD1, rWORD2
+L(dP1e):
+ LD rWORD3, rOFF16, rSTR1
+ LD rWORD4, rOFF16, rSTR2
+ cmpld cr1, rWORD3, rWORD4
+ LD rWORD5, rOFF24, rSTR1
+ LD rWORD6, rOFF24, rSTR2
+ cmpld cr6, rWORD5, rWORD6
+ bne cr5, L(dLcr5x)
+ bne cr7, L(dLcr7x)
+
+ LD rWORD7, rOFF32, rSTR1
+ LD rWORD8, rOFF32, rSTR2
+ addi rSTR1, rSTR1, 32
+ addi rSTR2, rSTR2, 32
+ bne cr1, L(dLcr1)
+ cmpld cr5, rWORD7, rWORD8
+ bdnz L(dLoop)
+ bne cr6, L(dLcr6)
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
+ .align 3
+L(dP1x):
+ sldi. r12, rN, 3
+ bne cr5, L(dLcr5x)
+ subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
+ bne L(d00)
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
+ li rRTN, 0
+ blr
+
+/* Remainder is 16. */
+ .align 4
+L(dP2):
+ mtctr r0
+ LD rWORD5, 0, rSTR1
+ LD rWORD6, 0, rSTR2
+ cmpld cr6, rWORD5, rWORD6
+ blt cr7, L(dP2x)
+ LD rWORD7, rOFF8, rSTR1
+ LD rWORD8, rOFF8, rSTR2
+ cmpld cr5, rWORD7, rWORD8
+L(dP2e):
+ LD rWORD1, rOFF16, rSTR1
+ LD rWORD2, rOFF16, rSTR2
+ cmpld cr7, rWORD1, rWORD2
+ LD rWORD3, rOFF24, rSTR1
+ LD rWORD4, rOFF24, rSTR2
+ cmpld cr1, rWORD3, rWORD4
+ addi rSTR1, rSTR1, 8
+ addi rSTR2, rSTR2, 8
+ bne cr6, L(dLcr6)
+ bne cr5, L(dLcr5)
+ b L(dLoop2)
+ .align 4
+L(dP2x):
+ LD rWORD3, rOFF8, rSTR1
+ LD rWORD4, rOFF8, rSTR2
+ cmpld cr1, rWORD3, rWORD4
+ sldi. r12, rN, 3
+ bne cr6, L(dLcr6x)
+ addi rSTR1, rSTR1, 8
+ addi rSTR2, rSTR2, 8
+ bne cr1, L(dLcr1x)
+ subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
+ bne L(d00)
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
+ li rRTN, 0
+ blr
+
+/* Remainder is 24. */
+ .align 4
+L(dP3):
+ mtctr r0
+ LD rWORD3, 0, rSTR1
+ LD rWORD4, 0, rSTR2
+ cmpld cr1, rWORD3, rWORD4
+L(dP3e):
+ LD rWORD5, rOFF8, rSTR1
+ LD rWORD6, rOFF8, rSTR2
+ cmpld cr6, rWORD5, rWORD6
+ blt cr7, L(dP3x)
+ LD rWORD7, rOFF16, rSTR1
+ LD rWORD8, rOFF16, rSTR2
+ cmpld cr5, rWORD7, rWORD8
+ LD rWORD1, rOFF24, rSTR1
+ LD rWORD2, rOFF24, rSTR2
+ cmpld cr7, rWORD1, rWORD2
+ addi rSTR1, rSTR1, 16
+ addi rSTR2, rSTR2, 16
+ bne cr1, L(dLcr1)
+ bne cr6, L(dLcr6)
+ b L(dLoop1)
+/* Again we are on an early exit path (24-31 byte compare); we want to
+   use only volatile registers and avoid restoring non-volatile
+   registers. */
+ .align 4
+L(dP3x):
+ LD rWORD1, rOFF16, rSTR1
+ LD rWORD2, rOFF16, rSTR2
+ cmpld cr7, rWORD1, rWORD2
+ sldi. r12, rN, 3
+ bne cr1, L(dLcr1x)
+ addi rSTR1, rSTR1, 16
+ addi rSTR2, rSTR2, 16
+ bne cr6, L(dLcr6x)
+ subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
+ bne cr7, L(dLcr7x)
+ bne L(d00)
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
+ li rRTN, 0
+ blr
+
+/* Count is a multiple of 32, remainder is 0. */
+ .align 4
+L(dP4):
+ mtctr r0
+ LD rWORD1, 0, rSTR1
+ LD rWORD2, 0, rSTR2
+ cmpld cr7, rWORD1, rWORD2
+L(dP4e):
+ LD rWORD3, rOFF8, rSTR1
+ LD rWORD4, rOFF8, rSTR2
+ cmpld cr1, rWORD3, rWORD4
+ LD rWORD5, rOFF16, rSTR1
+ LD rWORD6, rOFF16, rSTR2
+ cmpld cr6, rWORD5, rWORD6
+ LD rWORD7, rOFF24, rSTR1
+ LD rWORD8, rOFF24, rSTR2
+ addi rSTR1, rSTR1, 24
+ addi rSTR2, rSTR2, 24
+ cmpld cr5, rWORD7, rWORD8
+ bne cr7, L(dLcr7)
+ bne cr1, L(dLcr1)
+ bdz- L(d24) /* Adjust CTR as we start with +4. */
+/* This is the primary loop. */
+ .align 4
+L(dLoop):
+ LD rWORD1, rOFF8, rSTR1
+ LD rWORD2, rOFF8, rSTR2
+ cmpld cr1, rWORD3, rWORD4
+ bne cr6, L(dLcr6)
+L(dLoop1):
+ LD rWORD3, rOFF16, rSTR1
+ LD rWORD4, rOFF16, rSTR2
+ cmpld cr6, rWORD5, rWORD6
+ bne cr5, L(dLcr5)
+L(dLoop2):
+ LD rWORD5, rOFF24, rSTR1
+ LD rWORD6, rOFF24, rSTR2
+ cmpld cr5, rWORD7, rWORD8
+ bne cr7, L(dLcr7)
+L(dLoop3):
+ LD rWORD7, rOFF32, rSTR1
+ LD rWORD8, rOFF32, rSTR2
+ addi rSTR1, rSTR1, 32
+ addi rSTR2, rSTR2, 32
+ bne cr1, L(dLcr1)
+ cmpld cr7, rWORD1, rWORD2
+ bdnz L(dLoop)
+
+L(dL4):
+ cmpld cr1, rWORD3, rWORD4
+ bne cr6, L(dLcr6)
+ cmpld cr6, rWORD5, rWORD6
+ bne cr5, L(dLcr5)
+ cmpld cr5, rWORD7, rWORD8
+L(d44):
+ bne cr7, L(dLcr7)
+L(d34):
+ bne cr1, L(dLcr1)
+L(d24):
+ bne cr6, L(dLcr6)
+L(d14):
+ sldi. r12, rN, 3
+ bne cr5, L(dLcr5)
+L(d04):
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
+ subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
+ beq L(duzeroLength)
+/* At this point we have a remainder of 1 to 7 bytes to compare. Since
+ we are aligned it is safe to load the whole double word, and use
+ shift right double to eliminate bits beyond the compare length. */
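
   In C terms the trim is simply a right shift: the loads place earlier
   bytes in the more significant end, so shifting right drops the trailing
   bytes past the compare length (a sketch, with rem the 1 to 7 remaining
   bytes):

     #include <stdint.h>

     /* Keep only the first rem bytes of an endian-corrected doubleword.  */
     static uint64_t
     trim_tail (uint64_t w, unsigned int rem)   /* rem is 1..7 here.  */
     {
       return w >> (64 - 8 * rem);   /* Matches the srd by rN below.  */
     }
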
+L(d00):
+ LD rWORD1, rOFF8, rSTR1
+ LD rWORD2, rOFF8, rSTR2
+ srd rWORD1, rWORD1, rN
+ srd rWORD2, rWORD2, rN
+ cmpld cr7, rWORD1, rWORD2
+ bne cr7, L(dLcr7x)
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
+ li rRTN, 0
+ blr
+
+ .align 4
+L(dLcr7):
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
+L(dLcr7x):
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
+ li rRTN, 1
+ bgtlr cr7
+ li rRTN, -1
+ blr
+ .align 4
+L(dLcr1):
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
+L(dLcr1x):
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
+ li rRTN, 1
+ bgtlr cr1
+ li rRTN, -1
+ blr
+ .align 4
+L(dLcr6):
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
+L(dLcr6x):
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
+ li rRTN, 1
+ bgtlr cr6
+ li rRTN, -1
+ blr
+ .align 4
+L(dLcr5):
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
+L(dLcr5x):
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
+ li rRTN, 1
+ bgtlr cr5
+ li rRTN, -1
+ blr
+
+ .align 4
+L(bytealigned):
+ mtctr rN
+
+/* We need to prime this loop. This loop is swing modulo scheduled
+   to avoid pipe delays. The dependent instruction latency (load to
+   compare to conditional branch) is 2 to 3 cycles. In this loop each
+   dispatch group ends in a branch and takes 1 cycle. Effectively
+   the first iteration of the loop only serves to load operands and
+   branches based on compares are delayed until the next iteration.
+
+   So we must precondition some registers and condition codes so that
+   we don't exit the loop early on the first iteration. */
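
   The structure can be illustrated with a two-stage software-pipelined
   byte compare in C (a conceptual sketch only; the loop below is unrolled
   three ways across cr7/cr1/cr6 and assumes n > 0, as the zero-length case
   has already returned):

     #include <stddef.h>

     static int
     bytecmp_pipelined (const unsigned char *s1, const unsigned char *s2,
                        size_t n)
     {
       unsigned char a = *s1++, b = *s2++;        /* Prime the pipeline.  */
       while (--n)
         {
           unsigned char a2 = *s1++, b2 = *s2++;  /* Load the next pair,  */
           if (a != b)                            /* then act on the
                                                     previous compare.  */
             return a < b ? -1 : 1;
           a = a2; b = b2;
         }
       return a == b ? 0 : (a < b ? -1 : 1);      /* Drain the pipeline.  */
     }
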
+
+ lbz rWORD1, 0(rSTR1)
+ lbz rWORD2, 0(rSTR2)
+ bdz L(b11)
+ cmpld cr7, rWORD1, rWORD2
+ lbz rWORD3, 1(rSTR1)
+ lbz rWORD4, 1(rSTR2)
+ bdz L(b12)
+ cmpld cr1, rWORD3, rWORD4
+ lbzu rWORD5, 2(rSTR1)
+ lbzu rWORD6, 2(rSTR2)
+ bdz L(b13)
+ .align 4
+L(bLoop):
+ lbzu rWORD1, 1(rSTR1)
+ lbzu rWORD2, 1(rSTR2)
+ bne cr7, L(bLcr7)
+
+ cmpld cr6, rWORD5, rWORD6
+ bdz L(b3i)
+
+ lbzu rWORD3, 1(rSTR1)
+ lbzu rWORD4, 1(rSTR2)
+ bne cr1, L(bLcr1)
+
+ cmpld cr7, rWORD1, rWORD2
+ bdz L(b2i)
+
+ lbzu rWORD5, 1(rSTR1)
+ lbzu rWORD6, 1(rSTR2)
+ bne cr6, L(bLcr6)
+
+ cmpld cr1, rWORD3, rWORD4
+ bdnz L(bLoop)
+
+/* We speculatively load bytes before we have tested the previous
+   bytes. But we must avoid overrunning the length (in the CTR) to
+   prevent these speculative loads from causing a segfault. In that
+   case the loop exits early, before all the pending bytes have been
+   tested, and we must complete the pending compares before
+   returning. */
+L(b1i):
+ bne cr7, L(bLcr7)
+ bne cr1, L(bLcr1)
+ b L(bx56)
+ .align 4
+L(b2i):
+ bne cr6, L(bLcr6)
+ bne cr7, L(bLcr7)
+ b L(bx34)
+ .align 4
+L(b3i):
+ bne cr1, L(bLcr1)
+ bne cr6, L(bLcr6)
+ b L(bx12)
+ .align 4
+L(bLcr7):
+ li rRTN, 1
+ bgtlr cr7
+ li rRTN, -1
+ blr
+L(bLcr1):
+ li rRTN, 1
+ bgtlr cr1
+ li rRTN, -1
+ blr
+L(bLcr6):
+ li rRTN, 1
+ bgtlr cr6
+ li rRTN, -1
+ blr
+
+L(b13):
+ bne cr7, L(bx12)
+ bne cr1, L(bx34)
+L(bx56):
+ sub rRTN, rWORD5, rWORD6
+ blr
+ nop
+L(b12):
+ bne cr7, L(bx12)
+L(bx34):
+ sub rRTN, rWORD3, rWORD4
+ blr
+L(b11):
+L(bx12):
+ sub rRTN, rWORD1, rWORD2
+ blr
+
+ .align 4
+L(zeroLength):
+ li rRTN, 0
+ blr
+
+ .align 4
+/* At this point we know the strings have different alignment and the
+   compare length is at least 8 bytes. r12 contains the low order
+   3 bits of rSTR1 and cr5 contains the result of the logical compare
+   of r12 to 0. If r12 == 0 then rSTR1 is double word
+   aligned and we can run the DWunaligned loop.
+
+   Otherwise we know that rSTR1 is not yet DW aligned.
+   So we can force the string addresses to the next lower DW
+   boundary and special case this first DW using shift left to
+   eliminate bits preceding the first byte. Since we want to join the
+   normal (DWaligned) compare loop, starting at the second double word,
+   we need to adjust the length (rN) and special case the loop
+   versioning for the first DW. This ensures that the loop count is
+   correct and the first DW (shifted) is in the expected register pair. */
+L(unaligned):
+ std rWORD8, rWORD8SAVE(r1)
+ std rWORD7, rWORD7SAVE(r1)
+ std rOFF8, rOFF8SAVE(r1)
+ std rOFF16, rOFF16SAVE(r1)
+ std rOFF24, rOFF24SAVE(r1)
+ std rOFF32, rOFF32SAVE(r1)
+ cfi_offset(rWORD8, rWORD8SAVE)
+ cfi_offset(rWORD7, rWORD7SAVE)
+ cfi_offset(rOFF8, rOFF8SAVE)
+ cfi_offset(rOFF16, rOFF16SAVE)
+ cfi_offset(rOFF24, rOFF24SAVE)
+ cfi_offset(rOFF32, rOFF32SAVE)
+ li rOFF8,8
+ li rOFF16,16
+ li rOFF24,24
+ li rOFF32,32
+ std rSHL, rSHLSAVE(r1)
+ cfi_offset(rSHL, rSHLSAVE)
+ clrldi rSHL, rSTR2, 61
+ beq cr6, L(duzeroLength)
+ std rSHR, rSHRSAVE(r1)
+ cfi_offset(rSHR, rSHRSAVE)
+ beq cr5, L(DWunaligned)
+ std rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
+ cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
+/* Adjust the logical start of rSTR2 to compensate for the extra bits
+ in the 1st rSTR1 DW. */
+ sub rWORD8_SHIFT, rSTR2, r12
+/* But do not attempt to address the DW before the one that contains
+   the actual start of rSTR2. */
+ clrrdi rSTR2, rSTR2, 3
+ std rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
+/* Compute the left/right shift counts for the unaligned rSTR2,
+ compensating for the logical (DW aligned) start of rSTR1. */
+ clrldi rSHL, rWORD8_SHIFT, 61
+ clrrdi rSTR1, rSTR1, 3
+ std rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
+ sldi rSHL, rSHL, 3
+ cmpld cr5, rWORD8_SHIFT, rSTR2
+ add rN, rN, r12
+ sldi rWORD6, r12, 3
+ std rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
+ cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
+ cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
+ cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
+ subfic rSHR, rSHL, 64
+ srdi r0, rN, 5 /* Divide by 32. */
+ andi. r12, rN, 24 /* Get the DW remainder. */
+/* We normally need to load 2 DWs to start the unaligned rSTR2, but in
+ this special case those bits may be discarded anyway. Also we
+ must avoid loading a DW where none of the bits are part of rSTR2 as
+ this may cross a page boundary and cause a page fault. */
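
   Each 8-byte chunk of the unaligned rSTR2 is then reconstructed from two
   aligned loads.  In C terms this is the following sketch, where prev and
   next are consecutive endian-corrected doublewords of s2 and shl/shr
   correspond to rSHL/rSHR computed above (both nonzero on this path):

     #include <stdint.h>

     static uint64_t
     merge_unaligned_dw (uint64_t prev, uint64_t next,
                         unsigned int shl, unsigned int shr) /* shl+shr==64 */
     {
       return (prev << shl) | (next >> shr);  /* The sld/srd/or pattern.  */
     }
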
+ li rWORD8, 0
+ blt cr5, L(dus0)
+ LD rWORD8, 0, rSTR2
+ addi rSTR2, rSTR2, 8
+ sld rWORD8, rWORD8, rSHL
+
+L(dus0):
+ LD rWORD1, 0, rSTR1
+ LD rWORD2, 0, rSTR2
+ cmpldi cr1, r12, 16
+ cmpldi cr7, rN, 32
+ srd r12, rWORD2, rSHR
+ clrldi rN, rN, 61
+ beq L(duPs4)
+ mtctr r0
+ or rWORD8, r12, rWORD8
+ bgt cr1, L(duPs3)
+ beq cr1, L(duPs2)
+
+/* Remainder is 8. */
+ .align 4
+L(dusP1):
+ sld rWORD8_SHIFT, rWORD2, rSHL
+ sld rWORD7, rWORD1, rWORD6
+ sld rWORD8, rWORD8, rWORD6
+ bge cr7, L(duP1e)
+/* At this point we exit early with the first double word compare
+ complete and remainder of 0 to 7 bytes. See L(du14) for details on
+ how we handle the remaining bytes. */
+ cmpld cr5, rWORD7, rWORD8
+ sldi. rN, rN, 3
+ bne cr5, L(duLcr5)
+ cmpld cr7, rN, rSHR
+ beq L(duZeroReturn)
+ li r0, 0
+ ble cr7, L(dutrim)
+ LD rWORD2, rOFF8, rSTR2
+ srd r0, rWORD2, rSHR
+ b L(dutrim)
+/* Remainder is 16. */
+ .align 4
+L(duPs2):
+ sld rWORD6_SHIFT, rWORD2, rSHL
+ sld rWORD5, rWORD1, rWORD6
+ sld rWORD6, rWORD8, rWORD6
+ b L(duP2e)
+/* Remainder is 24. */
+ .align 4
+L(duPs3):
+ sld rWORD4_SHIFT, rWORD2, rSHL
+ sld rWORD3, rWORD1, rWORD6
+ sld rWORD4, rWORD8, rWORD6
+ b L(duP3e)
+/* Count is a multiple of 32, remainder is 0. */
+ .align 4
+L(duPs4):
+ mtctr r0
+ or rWORD8, r12, rWORD8
+ sld rWORD2_SHIFT, rWORD2, rSHL
+ sld rWORD1, rWORD1, rWORD6
+ sld rWORD2, rWORD8, rWORD6
+ b L(duP4e)
+
+/* At this point we know rSTR1 is double word aligned and the
+ compare length is at least 8 bytes. */
+ .align 4
+L(DWunaligned):
+ std rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
+ clrrdi rSTR2, rSTR2, 3
+ std rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
+ srdi r0, rN, 5 /* Divide by 32. */
+ std rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
+ andi. r12, rN, 24 /* Get the DW remainder. */
+ std rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
+ cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
+ cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
+ cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
+ cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
+ sldi rSHL, rSHL, 3
+ LD rWORD6, 0, rSTR2
+ LD rWORD8, rOFF8, rSTR2
+ addi rSTR2, rSTR2, 8
+ cmpldi cr1, r12, 16
+ cmpldi cr7, rN, 32
+ clrldi rN, rN, 61
+ subfic rSHR, rSHL, 64
+ sld rWORD6_SHIFT, rWORD6, rSHL
+ beq L(duP4)
+ mtctr r0
+ bgt cr1, L(duP3)
+ beq cr1, L(duP2)
+
+/* Remainder is 8. */
+ .align 4
+L(duP1):
+ srd r12, rWORD8, rSHR
+ LD rWORD7, 0, rSTR1
+ sld rWORD8_SHIFT, rWORD8, rSHL
+ or rWORD8, r12, rWORD6_SHIFT
+ blt cr7, L(duP1x)
+L(duP1e):
+ LD rWORD1, rOFF8, rSTR1
+ LD rWORD2, rOFF8, rSTR2
+ cmpld cr5, rWORD7, rWORD8
+ srd r0, rWORD2, rSHR
+ sld rWORD2_SHIFT, rWORD2, rSHL
+ or rWORD2, r0, rWORD8_SHIFT
+ LD rWORD3, rOFF16, rSTR1
+ LD rWORD4, rOFF16, rSTR2
+ cmpld cr7, rWORD1, rWORD2
+ srd r12, rWORD4, rSHR
+ sld rWORD4_SHIFT, rWORD4, rSHL
+ bne cr5, L(duLcr5)
+ or rWORD4, r12, rWORD2_SHIFT
+ LD rWORD5, rOFF24, rSTR1
+ LD rWORD6, rOFF24, rSTR2
+ cmpld cr1, rWORD3, rWORD4
+ srd r0, rWORD6, rSHR
+ sld rWORD6_SHIFT, rWORD6, rSHL
+ bne cr7, L(duLcr7)
+ or rWORD6, r0, rWORD4_SHIFT
+ cmpld cr6, rWORD5, rWORD6
+ b L(duLoop3)
+ .align 4
+/* At this point we exit early with the first double word compare
+ complete and remainder of 0 to 7 bytes. See L(du14) for details on
+ how we handle the remaining bytes. */
+L(duP1x):
+ cmpld cr5, rWORD7, rWORD8
+ sldi. rN, rN, 3
+ bne cr5, L(duLcr5)
+ cmpld cr7, rN, rSHR
+ beq L(duZeroReturn)
+ li r0, 0
+ ble cr7, L(dutrim)
+ LD rWORD2, rOFF8, rSTR2
+ srd r0, rWORD2, rSHR
+ b L(dutrim)
+/* Remainder is 16. */
+ .align 4
+L(duP2):
+ srd r0, rWORD8, rSHR
+ LD rWORD5, 0, rSTR1
+ or rWORD6, r0, rWORD6_SHIFT
+ sld rWORD6_SHIFT, rWORD8, rSHL
+L(duP2e):
+ LD rWORD7, rOFF8, rSTR1
+ LD rWORD8, rOFF8, rSTR2
+ cmpld cr6, rWORD5, rWORD6
+ srd r12, rWORD8, rSHR
+ sld rWORD8_SHIFT, rWORD8, rSHL
+ or rWORD8, r12, rWORD6_SHIFT
+ blt cr7, L(duP2x)
+ LD rWORD1, rOFF16, rSTR1
+ LD rWORD2, rOFF16, rSTR2
+ cmpld cr5, rWORD7, rWORD8
+ bne cr6, L(duLcr6)
+ srd r0, rWORD2, rSHR
+ sld rWORD2_SHIFT, rWORD2, rSHL
+ or rWORD2, r0, rWORD8_SHIFT
+ LD rWORD3, rOFF24, rSTR1
+ LD rWORD4, rOFF24, rSTR2
+ cmpld cr7, rWORD1, rWORD2
+ bne cr5, L(duLcr5)
+ srd r12, rWORD4, rSHR
+ sld rWORD4_SHIFT, rWORD4, rSHL
+ or rWORD4, r12, rWORD2_SHIFT
+ addi rSTR1, rSTR1, 8
+ addi rSTR2, rSTR2, 8
+ cmpld cr1, rWORD3, rWORD4
+ b L(duLoop2)
+ .align 4
+L(duP2x):
+ cmpld cr5, rWORD7, rWORD8
+ addi rSTR1, rSTR1, 8
+ addi rSTR2, rSTR2, 8
+ bne cr6, L(duLcr6)
+ sldi. rN, rN, 3
+ bne cr5, L(duLcr5)
+ cmpld cr7, rN, rSHR
+ beq L(duZeroReturn)
+ li r0, 0
+ ble cr7, L(dutrim)
+ LD rWORD2, rOFF8, rSTR2
+ srd r0, rWORD2, rSHR
+ b L(dutrim)
+
+/* Remainder is 24. */
+ .align 4
+L(duP3):
+ srd r12, rWORD8, rSHR
+ LD rWORD3, 0, rSTR1
+ sld rWORD4_SHIFT, rWORD8, rSHL
+ or rWORD4, r12, rWORD6_SHIFT
+L(duP3e):
+ LD rWORD5, rOFF8, rSTR1
+ LD rWORD6, rOFF8, rSTR2
+ cmpld cr1, rWORD3, rWORD4
+ srd r0, rWORD6, rSHR
+ sld rWORD6_SHIFT, rWORD6, rSHL
+ or rWORD6, r0, rWORD4_SHIFT
+ LD rWORD7, rOFF16, rSTR1
+ LD rWORD8, rOFF16, rSTR2
+ cmpld cr6, rWORD5, rWORD6
+ bne cr1, L(duLcr1)
+ srd r12, rWORD8, rSHR
+ sld rWORD8_SHIFT, rWORD8, rSHL
+ or rWORD8, r12, rWORD6_SHIFT
+ blt cr7, L(duP3x)
+ LD rWORD1, rOFF24, rSTR1
+ LD rWORD2, rOFF24, rSTR2
+ cmpld cr5, rWORD7, rWORD8
+ bne cr6, L(duLcr6)
+ srd r0, rWORD2, rSHR
+ sld rWORD2_SHIFT, rWORD2, rSHL
+ or rWORD2, r0, rWORD8_SHIFT
+ addi rSTR1, rSTR1, 16
+ addi rSTR2, rSTR2, 16
+ cmpld cr7, rWORD1, rWORD2
+ b L(duLoop1)
+ .align 4
+L(duP3x):
+ addi rSTR1, rSTR1, 16
+ addi rSTR2, rSTR2, 16
+ cmpld cr5, rWORD7, rWORD8
+ bne cr6, L(duLcr6)
+ sldi. rN, rN, 3
+ bne cr5, L(duLcr5)
+ cmpld cr7, rN, rSHR
+ beq L(duZeroReturn)
+ li r0, 0
+ ble cr7, L(dutrim)
+ LD rWORD2, rOFF8, rSTR2
+ srd r0, rWORD2, rSHR
+ b L(dutrim)
+
+/* Count is a multiple of 32, remainder is 0. */
+ .align 4
+L(duP4):
+ mtctr r0
+ srd r0, rWORD8, rSHR
+ LD rWORD1, 0, rSTR1
+ sld rWORD2_SHIFT, rWORD8, rSHL
+ or rWORD2, r0, rWORD6_SHIFT
+L(duP4e):
+ LD rWORD3, rOFF8, rSTR1
+ LD rWORD4, rOFF8, rSTR2
+ cmpld cr7, rWORD1, rWORD2
+ srd r12, rWORD4, rSHR
+ sld rWORD4_SHIFT, rWORD4, rSHL
+ or rWORD4, r12, rWORD2_SHIFT
+ LD rWORD5, rOFF16, rSTR1
+ LD rWORD6, rOFF16, rSTR2
+ cmpld cr1, rWORD3, rWORD4
+ bne cr7, L(duLcr7)
+ srd r0, rWORD6, rSHR
+ sld rWORD6_SHIFT, rWORD6, rSHL
+ or rWORD6, r0, rWORD4_SHIFT
+ LD rWORD7, rOFF24, rSTR1
+ LD rWORD8, rOFF24, rSTR2
+ addi rSTR1, rSTR1, 24
+ addi rSTR2, rSTR2, 24
+ cmpld cr6, rWORD5, rWORD6
+ bne cr1, L(duLcr1)
+ srd r12, rWORD8, rSHR
+ sld rWORD8_SHIFT, rWORD8, rSHL
+ or rWORD8, r12, rWORD6_SHIFT
+ cmpld cr5, rWORD7, rWORD8
+ bdz L(du24) /* Adjust CTR as we start with +4. */
+/* This is the primary loop. */
+ .align 4
+L(duLoop):
+ LD rWORD1, rOFF8, rSTR1
+ LD rWORD2, rOFF8, rSTR2
+ cmpld cr1, rWORD3, rWORD4
+ bne cr6, L(duLcr6)
+ srd r0, rWORD2, rSHR
+ sld rWORD2_SHIFT, rWORD2, rSHL
+ or rWORD2, r0, rWORD8_SHIFT
+L(duLoop1):
+ LD rWORD3, rOFF16, rSTR1
+ LD rWORD4, rOFF16, rSTR2
+ cmpld cr6, rWORD5, rWORD6
+ bne cr5, L(duLcr5)
+ srd r12, rWORD4, rSHR
+ sld rWORD4_SHIFT, rWORD4, rSHL
+ or rWORD4, r12, rWORD2_SHIFT
+L(duLoop2):
+ LD rWORD5, rOFF24, rSTR1
+ LD rWORD6, rOFF24, rSTR2
+ cmpld cr5, rWORD7, rWORD8
+ bne cr7, L(duLcr7)
+ srd r0, rWORD6, rSHR
+ sld rWORD6_SHIFT, rWORD6, rSHL
+ or rWORD6, r0, rWORD4_SHIFT
+L(duLoop3):
+ LD rWORD7, rOFF32, rSTR1
+ LD rWORD8, rOFF32, rSTR2
+ addi rSTR1, rSTR1, 32
+ addi rSTR2, rSTR2, 32
+ cmpld cr7, rWORD1, rWORD2
+ bne cr1, L(duLcr1)
+ srd r12, rWORD8, rSHR
+ sld rWORD8_SHIFT, rWORD8, rSHL
+ or rWORD8, r12, rWORD6_SHIFT
+ bdnz L(duLoop)
+
+L(duL4):
+ cmpld cr1, rWORD3, rWORD4
+ bne cr6, L(duLcr6)
+ cmpld cr6, rWORD5, rWORD6
+ bne cr5, L(duLcr5)
+ cmpld cr5, rWORD7, rWORD8
+L(du44):
+ bne cr7, L(duLcr7)
+L(du34):
+ bne cr1, L(duLcr1)
+L(du24):
+ bne cr6, L(duLcr6)
+L(du14):
+ sldi. rN, rN, 3
+ bne cr5, L(duLcr5)
+/* At this point we have a remainder of 1 to 7 bytes to compare. We use
+   shift right double to eliminate bits beyond the compare length.
+
+   However, it may not be safe to load rWORD2, which may be beyond the
+   string length. So we compare the bit length of the remainder to
+   the right shift count (rSHR). If the bit count is less than or
+   equal to the shift count we do not need to load rWORD2 (all
+   significant bits are already in rWORD8_SHIFT). */
+ cmpld cr7, rN, rSHR
+ beq L(duZeroReturn)
+ li r0, 0
+ ble cr7, L(dutrim)
+ LD rWORD2, rOFF8, rSTR2
+ srd r0, rWORD2, rSHR
+ .align 4
+L(dutrim):
+ LD rWORD1, rOFF8, rSTR1
+ ld rWORD8, -8(r1)
+ subfic rN, rN, 64 /* Shift count is 64 - (rN * 8). */
+ or rWORD2, r0, rWORD8_SHIFT
+ ld rWORD7, rWORD7SAVE(r1)
+ ld rSHL, rSHLSAVE(r1)
+ srd rWORD1, rWORD1, rN
+ srd rWORD2, rWORD2, rN
+ ld rSHR, rSHRSAVE(r1)
+ ld rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
+ li rRTN, 0
+ cmpld cr7, rWORD1, rWORD2
+ ld rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
+ ld rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
+ beq cr7, L(dureturn24)
+ li rRTN, 1
+ ld rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
+ bgtlr cr7
+ li rRTN, -1
+ blr
+ .align 4
+L(duLcr7):
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
+ li rRTN, 1
+ bgt cr7, L(dureturn29)
+ ld rSHL, rSHLSAVE(r1)
+ ld rSHR, rSHRSAVE(r1)
+ li rRTN, -1
+ b L(dureturn27)
+ .align 4
+L(duLcr1):
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
+ li rRTN, 1
+ bgt cr1, L(dureturn29)
+ ld rSHL, rSHLSAVE(r1)
+ ld rSHR, rSHRSAVE(r1)
+ li rRTN, -1
+ b L(dureturn27)
+ .align 4
+L(duLcr6):
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
+ li rRTN, 1
+ bgt cr6, L(dureturn29)
+ ld rSHL, rSHLSAVE(r1)
+ ld rSHR, rSHRSAVE(r1)
+ li rRTN, -1
+ b L(dureturn27)
+ .align 4
+L(duLcr5):
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
+ li rRTN, 1
+ bgt cr5, L(dureturn29)
+ ld rSHL, rSHLSAVE(r1)
+ ld rSHR, rSHRSAVE(r1)
+ li rRTN, -1
+ b L(dureturn27)
+
+ .align 3
+L(duZeroReturn):
+ li rRTN, 0
+ .align 4
+L(dureturn):
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
+L(dureturn29):
+ ld rSHL, rSHLSAVE(r1)
+ ld rSHR, rSHRSAVE(r1)
+L(dureturn27):
+ ld rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
+ ld rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
+ ld rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
+L(dureturn24):
+ ld rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
+ blr
+
+L(duzeroLength):
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
+ li rRTN, 0
+ blr
+
+END (MEMCMP)
+libc_hidden_builtin_def (memcmp)
+weak_alias (memcmp, bcmp)
diff --git a/sysdeps/powerpc/powerpc64/power8/memrchr.S b/sysdeps/powerpc/powerpc64/power8/memrchr.S
new file mode 100644
index 0000000000..54de6566bd
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/memrchr.S
@@ -0,0 +1,345 @@
+/* Optimized memrchr implementation for PowerPC64/POWER8.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
+ Contributed by Luis Machado <luisgpm@br.ibm.com>.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* void * [r3] memrchr (const void *s [r3], int byte [r4], size_t size [r5]) */
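
   For reference, the behaviour being implemented is that of the usual
   memrchr; a minimal C sketch (of the semantics only) is:

     #include <stddef.h>

     void *
     memrchr_ref (const void *s, int c, size_t n)
     {
       const unsigned char *p = (const unsigned char *) s + n;
       while (n--)
         if (*--p == (unsigned char) c)
           return (void *) p;   /* Last occurrence, searching backwards.  */
       return NULL;
     }
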
+
+/* TODO: change these to the actual instructions when the minimum required
+ binutils allows it. */
+#define MTVRD(v, r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16)))
+#define MFVRD(r, v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
+#define VBPERMQ(t, a, b) .long (0x1000054c \
+ | ((t)<<(32-11)) \
+ | ((a)<<(32-16)) \
+ | ((b)<<(32-21)) )
+#ifndef MEMRCHR
+# define MEMRCHR __memrchr
+#endif
+ .machine power7
+ENTRY_TOCLESS (MEMRCHR)
+ CALL_MCOUNT 3
+ add r7, r3, r5 /* Calculate the last acceptable address. */
+ neg r0, r7
+ addi r7, r7, -1
+ mr r10, r3
+ clrrdi r6, r7, 7
+ li r9, 3<<5
+ dcbt r9, r6, 8 /* Stream hint, decreasing addresses. */
+
+ /* Replicate BYTE to doubleword. */
+ insrdi r4, r4, 8, 48
+ insrdi r4, r4, 16, 32
+ insrdi r4, r4, 32, 0
+ li r6, -8
+ li r9, -1
+ rlwinm r0, r0, 3, 26, 28 /* Calculate padding. */
+ clrrdi r8, r7, 3
+ srd r9, r9, r0
+ cmpldi r5, 32
+ clrrdi r0, r10, 3
+ ble L(small_range)
+
+#ifdef __LITTLE_ENDIAN__
+ ldx r12, 0, r8
+#else
+ ldbrx r12, 0, r8 /* Load reversed doubleword from memory. */
+#endif
+ cmpb r3, r12, r4 /* Check for BYTE in DWORD1. */
+ and r3, r3, r9
+ cmpldi cr7, r3, 0 /* If r3 == 0, no BYTEs have been found. */
+ bne cr7, L(done)
+
+ /* Are we now aligned to a quadword boundary? If so, skip to
+ the main loop. Otherwise, go through the alignment code. */
+ andi. r12, r8, 15
+ beq cr0, L(align_qw)
+
+ /* Handle DWORD2 of pair. */
+#ifdef __LITTLE_ENDIAN__
+ ldx r12, r8, r6
+#else
+ ldbrx r12, r8, r6
+#endif
+ addi r8, r8, -8
+ cmpb r3, r12, r4
+ cmpldi cr7, r3, 0
+ bne cr7, L(done)
+
+ .align 4
+ /* At this point, r8 is 16B aligned. */
+L(align_qw):
+ sub r5, r8, r0
+ vspltisb v0, 0
+ /* Precompute vbpermq constant. */
+ vspltisb v10, 3
+ li r0, 0
+ lvsl v11, r0, r0
+ vslb v10, v11, v10
+ MTVRD(v1, r4)
+ vspltb v1, v1, 7
+ cmpldi r5, 64
+ ble L(tail64)
+	/* Are we 64-byte aligned? If so, jump to the vectorized loop.
+	   Note: aligning to a 64-byte boundary necessarily slows down
+	   strings around 64 bytes in length due to the extra comparisons
+	   required to check alignment for the vectorized loop. This is a
+	   tradeoff we are willing to take in order to speed up the
+	   calculation for larger strings. */
+ andi. r11, r8, 63
+ beq cr0, L(preloop_64B)
+	/* The address must be 64B aligned before entering the 64B loop,
+	   so compare 16B blocks until it is.  */
+ addi r8, r8, -16
+ lvx v4, 0, r8
+ vcmpequb v6, v1, v4
+ vcmpequb. v11, v0, v6
+ bnl cr6, L(found_16B)
+ addi r5, r5, -16
+
+ andi. r11, r8, 63
+ beq cr0, L(preloop_64B)
+ addi r8, r8, -16
+ lvx v4, 0, r8
+ vcmpequb v6, v1, v4
+ vcmpequb. v11, v0, v6
+ bnl cr6, L(found_16B)
+ addi r5, r5, -16
+
+ andi. r11, r8, 63
+ beq cr0, L(preloop_64B)
+ addi r8, r8, -16
+ lvx v4, 0, r8
+ vcmpequb v6, v1, v4
+ vcmpequb. v11, v0, v6
+ bnl cr6, L(found_16B)
+ addi r5, r5, -16
+ /* At this point it should be 64B aligned.
+ Prepare for the 64B loop. */
+L(preloop_64B):
+ cmpldi r5, 64 /* Check if r5 < 64. */
+ ble L(tail64)
+ srdi r9, r5, 6 /* Number of loop iterations. */
+ mtctr r9 /* Setup the counter. */
+ li r11, 16 /* Load required offsets. */
+ li r9, 32
+ li r7, 48
+
+ /* Handle r5 > 64. Loop over the bytes in strides of 64B. */
+ .align 4
+L(loop):
+ addi r8, r8, -64 /* Adjust address for the next iteration. */
+ lvx v2, 0, r8 /* Load 4 quadwords. */
+ lvx v3, r8, r11
+	lvx v4, r8, r9
+	lvx v5, r8, r7
+ vcmpequb v6, v1, v2
+ vcmpequb v7, v1, v3
+ vcmpequb v8, v1, v4
+ vcmpequb v9, v1, v5
+ vor v11, v6, v7
+ vor v12, v8, v9
+ vor v11, v11, v12 /* Compare and merge into one VR for speed. */
+ vcmpequb. v11, v0, v11
+ bnl cr6, L(found)
+ bdnz L(loop)
+ clrldi r5, r5, 58
+
+	/* Handle the remainder of the 64B loop, or the case r5 <= 64. */
+ .align 4
+L(tail64):
+ cmpldi r5, 0
+ beq L(null)
+ addi r8, r8, -16
+ lvx v4, 0, r8
+ vcmpequb v6, v1, v4
+ vcmpequb. v11, v0, v6
+ bnl cr6, L(found_16B)
+ cmpldi cr6, r5, 16
+ ble cr6, L(null)
+ addi r5, r5, -16
+
+ addi r8, r8, -16
+ lvx v4, 0, r8
+ vcmpequb v6, v1, v4
+ vcmpequb. v11, v0, v6
+ bnl cr6, L(found_16B)
+ cmpldi cr6, r5, 16
+ ble cr6, L(null)
+ addi r5, r5, -16
+
+ addi r8, r8, -16
+ lvx v4, 0, r8
+ vcmpequb v6, v1, v4
+ vcmpequb. v11, v0, v6
+ bnl cr6, L(found_16B)
+ cmpldi cr6, r5, 16
+ ble cr6, L(null)
+ addi r5, r5, -16
+
+ addi r8, r8, -16
+ lvx v4, 0, r8
+ vcmpequb v6, v1, v4
+ vcmpequb. v11, v0, v6
+ bnl cr6, L(found_16B)
+ li r3, 0
+ blr
+
+ /* Found a match in 64B loop. */
+ .align 4
+L(found):
+ /* Permute the first bit of each byte into bits 48-63. */
+ VBPERMQ(v6, v6, v10)
+ VBPERMQ(v7, v7, v10)
+ VBPERMQ(v8, v8, v10)
+ VBPERMQ(v9, v9, v10)
+ /* Shift each component into its correct position for merging. */
+#ifdef __LITTLE_ENDIAN__
+ vsldoi v7, v7, v7, 2
+ vsldoi v8, v8, v8, 4
+ vsldoi v9, v9, v9, 6
+#else
+ vsldoi v6, v6, v6, 6
+ vsldoi v7, v7, v7, 4
+ vsldoi v8, v8, v8, 2
+#endif
+ /* Merge the results and move to a GPR. */
+ vor v11, v6, v7
+ vor v4, v9, v8
+ vor v4, v11, v4
+ MFVRD(r5, v4)
+#ifdef __LITTLE_ENDIAN__
+ cntlzd r6, r5 /* Count leading zeros before the match. */
+#else
+ addi r6, r5, -1
+ andc r6, r6, r5
+ popcntd r6, r6
+#endif
+ addi r8, r8, 63
+ sub r3, r8, r6 /* Compute final address. */
+ cmpld cr7, r3, r10
+ bgelr cr7
+ li r3, 0
+ blr
+
+ /* Found a match in last 16 bytes. */
+ .align 4
+L(found_16B):
+ cmpld r8, r10 /* Are we on the last QW? */
+ bge L(last)
+ /* Now discard bytes before starting address. */
+ sub r9, r10, r8
+ MTVRD(v9, r9)
+ vspltisb v8, 3
+ /* Mask unwanted bytes. */
+#ifdef __LITTLE_ENDIAN__
+ lvsr v7, 0, r10
+ vperm v6, v0, v6, v7
+ vsldoi v9, v0, v9, 8
+ vsl v9, v9, v8
+ vslo v6, v6, v9
+#else
+ lvsl v7, 0, r10
+ vperm v6, v6, v0, v7
+ vsldoi v9, v0, v9, 8
+ vsl v9, v9, v8
+ vsro v6, v6, v9
+#endif
+L(last):
+ /* Permute the first bit of each byte into bits 48-63. */
+ VBPERMQ(v6, v6, v10)
+ /* Shift each component into its correct position for merging. */
+#ifdef __LITTLE_ENDIAN__
+ vsldoi v6, v6, v6, 6
+ MFVRD(r7, v6)
+ cntlzd r6, r7 /* Count leading zeros before the match. */
+#else
+ MFVRD(r7, v6)
+ addi r6, r7, -1
+ andc r6, r6, r7
+ popcntd r6, r6
+#endif
+ addi r8, r8, 15
+ sub r3, r8, r6 /* Compute final address. */
+ cmpld r6, r5
+ bltlr
+ li r3, 0
+ blr
+
+	/* r3 has the output of the cmpb instruction, that is, it contains
+	   0xff in the same position as BYTE in the original
+	   word from the string. Use that to calculate the pointer.
+	   We need to make sure the matching byte is not *before* the
+	   start of the range. */
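
   In C terms the address computation below is roughly the following
   sketch, where mask is the non-zero cmpb result and base the aligned
   doubleword address held in r8 (__builtin_clzll is the GCC/Clang
   count-leading-zeros builtin, standing in for cntlzd):

     #include <stdint.h>

     static const unsigned char *
     last_match_in_dw (const unsigned char *base, uint64_t mask)
     {
       /* Higher addresses sit in more significant bytes after the load
          above, so leading zeros skip the non-matching bytes nearest the
          end of the range.  mask != 0 on this path.  */
       int from_top = __builtin_clzll (mask) / 8;
       return base + 7 - from_top;
     }
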
+L(done):
+ cntlzd r9, r3 /* Count leading zeros before the match. */
+ cmpld r8, r0 /* Are we on the last word? */
+ srdi r6, r9, 3 /* Convert leading zeros to bytes. */
+ addi r0, r6, -7
+ sub r3, r8, r0
+ cmpld cr7, r3, r10
+ bnelr
+ bgelr cr7
+ li r3, 0
+ blr
+
+ .align 4
+L(null):
+ li r3, 0
+ blr
+
+/* Deals with size <= 32. */
+ .align 4
+L(small_range):
+ cmpldi r5, 0
+ beq L(null)
+
+#ifdef __LITTLE_ENDIAN__
+ ldx r12, 0, r8
+#else
+ ldbrx r12, 0, r8 /* Load reversed doubleword from memory. */
+#endif
+ cmpb r3, r12, r4 /* Check for BYTE in DWORD1. */
+ and r3, r3, r9
+ cmpldi cr7, r3, 0
+ bne cr7, L(done)
+
+ /* Are we done already? */
+ cmpld r8, r0
+ addi r8, r8, -8
+ beqlr
+
+ .align 5
+L(loop_small):
+#ifdef __LITTLE_ENDIAN__
+ ldx r12, 0, r8
+#else
+ ldbrx r12, 0, r8
+#endif
+ cmpb r3, r12, r4
+ cmpld r8, r0
+ cmpldi cr7, r3, 0
+ bne cr7, L(done)
+ addi r8, r8, -8
+ bne L(loop_small)
+ blr
+
+END (MEMRCHR)
+weak_alias (__memrchr, memrchr)
+libc_hidden_builtin_def (memrchr)
diff --git a/sysdeps/powerpc/powerpc64/power8/memset.S b/sysdeps/powerpc/powerpc64/power8/memset.S
index 11433d89ad..a42232b42a 100644
--- a/sysdeps/powerpc/powerpc64/power8/memset.S
+++ b/sysdeps/powerpc/powerpc64/power8/memset.S
@@ -1,5 +1,5 @@
/* Optimized memset implementation for PowerPC64/POWER8.
- Copyright (C) 2014-2016 Free Software Foundation, Inc.
+ Copyright (C) 2014-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -20,14 +20,18 @@
#define MTVSRD_V1_R4 .long 0x7c240166 /* mtvsrd v1,r4 */
-/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]));
+/* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]);
Returns 's'. */
+#ifndef MEMSET
+# define MEMSET memset
+#endif
+
/* No need to use .machine power8 since mtvsrd is already
handled by the define. This avoids breakage on binutils
versions that do not support this machine specifier. */
.machine power7
-EALIGN (memset, 5, 0)
+ENTRY_TOCLESS (MEMSET, 5)
CALL_MCOUNT 3
L(_memset):
@@ -373,7 +377,10 @@ L(write_LT_32):
subf r5,r0,r5
2: bf 30,1f
- sth r4,0(r10)
+ /* Use stb instead of sth because it doesn't generate
+ alignment interrupts on cache-inhibited storage. */
+ stb r4,0(r10)
+ stb r4,1(r10)
addi r10,r10,2
1: bf 31,L(end_4bytes_alignment)
@@ -433,17 +440,80 @@ L(tail5):
/* Handles copies of 0~8 bytes. */
.align 4
L(write_LE_8):
- bne cr6,L(tail4)
+ bne cr6,L(LE7_tail4)
+ /* If input is word aligned, use stw, else use stb. */
+ andi. r0,r10,3
+ bne L(8_unalign)
stw r4,0(r10)
stw r4,4(r10)
blr
-END_GEN_TB (memset,TB_TOCLESS)
+
+ /* Unaligned input and size is 8. */
+ .align 4
+L(8_unalign):
+ andi. r0,r10,1
+ beq L(8_hwalign)
+ stb r4,0(r10)
+ sth r4,1(r10)
+ sth r4,3(r10)
+ sth r4,5(r10)
+ stb r4,7(r10)
+ blr
+
+ /* Halfword aligned input and size is 8. */
+ .align 4
+L(8_hwalign):
+ sth r4,0(r10)
+ sth r4,2(r10)
+ sth r4,4(r10)
+ sth r4,6(r10)
+ blr
+
+ .align 4
+ /* Copies 4~7 bytes. */
+L(LE7_tail4):
+ /* Use stb instead of sth because it doesn't generate
+ alignment interrupts on cache-inhibited storage. */
+ bf 29,L(LE7_tail2)
+ stb r4,0(r10)
+ stb r4,1(r10)
+ stb r4,2(r10)
+ stb r4,3(r10)
+ bf 30,L(LE7_tail5)
+ stb r4,4(r10)
+ stb r4,5(r10)
+ bflr 31
+ stb r4,6(r10)
+ blr
+
+ .align 4
+ /* Copies 2~3 bytes. */
+L(LE7_tail2):
+ bf 30,1f
+ stb r4,0(r10)
+ stb r4,1(r10)
+ bflr 31
+ stb r4,2(r10)
+ blr
+
+ .align 4
+L(LE7_tail5):
+ bflr 31
+ stb r4,4(r10)
+ blr
+
+ .align 4
+1: bflr 31
+ stb r4,0(r10)
+ blr
+
+END_GEN_TB (MEMSET,TB_TOCLESS)
libc_hidden_builtin_def (memset)
/* Copied from bzero.S to prevent the linker from inserting a stub
between bzero and memset. */
-ENTRY (__bzero)
+ENTRY_TOCLESS (__bzero)
CALL_MCOUNT 3
mr r5,r4
li r4,0
diff --git a/sysdeps/powerpc/powerpc64/power8/multiarch/Implies b/sysdeps/powerpc/powerpc64/power8/multiarch/Implies
deleted file mode 100644
index 1fc7b7cd39..0000000000
--- a/sysdeps/powerpc/powerpc64/power8/multiarch/Implies
+++ /dev/null
@@ -1 +0,0 @@
-powerpc/powerpc64/power7/multiarch
diff --git a/sysdeps/powerpc/powerpc64/power8/stpcpy.S b/sysdeps/powerpc/powerpc64/power8/stpcpy.S
index fd8f7fa63a..ebdfaab97c 100644
--- a/sysdeps/powerpc/powerpc64/power8/stpcpy.S
+++ b/sysdeps/powerpc/powerpc64/power8/stpcpy.S
@@ -1,5 +1,5 @@
/* Optimized stpcpy implementation for PowerPC64/POWER8.
- Copyright (C) 2015-2016 Free Software Foundation, Inc.
+ Copyright (C) 2015-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/powerpc/powerpc64/power8/stpncpy.S b/sysdeps/powerpc/powerpc64/power8/stpncpy.S
index 067910b373..95c86e9677 100644
--- a/sysdeps/powerpc/powerpc64/power8/stpncpy.S
+++ b/sysdeps/powerpc/powerpc64/power8/stpncpy.S
@@ -1,5 +1,5 @@
/* Optimized stpncpy implementation for PowerPC64/POWER8.
- Copyright (C) 2015-2016 Free Software Foundation, Inc.
+ Copyright (C) 2015-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -18,3 +18,7 @@
#define USE_AS_STPNCPY
#include <sysdeps/powerpc/powerpc64/power8/strncpy.S>
+
+weak_alias (__stpncpy, stpncpy)
+libc_hidden_def (__stpncpy)
+libc_hidden_builtin_def (stpncpy)
diff --git a/sysdeps/powerpc/powerpc64/power8/strcasecmp.S b/sysdeps/powerpc/powerpc64/power8/strcasecmp.S
new file mode 100644
index 0000000000..3a2efe2a64
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strcasecmp.S
@@ -0,0 +1,457 @@
+/* Optimized strcasecmp implementation for PowerPC64.
+ Copyright (C) 2016-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <locale-defines.h>
+
+/* int [r3] strcasecmp (const char *s1 [r3], const char *s2 [r4] ) */
+
+#ifndef USE_AS_STRNCASECMP
+# define __STRCASECMP __strcasecmp
+# define STRCASECMP strcasecmp
+#else
+# define __STRCASECMP __strncasecmp
+# define STRCASECMP strncasecmp
+#endif
+/* Convert 16 bytes to lowercase and compare */
+#define TOLOWER() \
+ vaddubm v8, v4, v1; \
+ vaddubm v7, v4, v3; \
+ vcmpgtub v8, v8, v2; \
+ vsel v4, v7, v4, v8; \
+ vaddubm v8, v5, v1; \
+ vaddubm v7, v5, v3; \
+ vcmpgtub v8, v8, v2; \
+ vsel v5, v7, v5, v8; \
+ vcmpequb. v7, v5, v4;
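
   Per byte, the transform applied by TOLOWER is equivalent to this scalar
   C sketch, using the same constants that are loaded further down in this
   file (v1 = 0xbf, v2 = 0x19, v3 = 0x20):

     static unsigned char
     tolower_ascii (unsigned char c)
     {
       unsigned char t = (unsigned char) (c + 0xbf);  /* Wraps mod 256.  */
       /* Only 'A'..'Z' land in 0x00..0x19; those bytes get 0x20 added.  */
       return t > 0x19 ? c : (unsigned char) (c + 0x20);
     }
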
+
+/*
+ * Get 16 bytes for unaligned case.
+ * reg1: Vector to hold next 16 bytes.
+ * reg2: Address to read from.
+ * reg3: Permute control vector.
+ * v8: Tmp vector used to mask unwanted bytes.
+ * v9: Tmp vector, 0 when a null is found in the first 16 bytes.
+ */
+#ifdef __LITTLE_ENDIAN__
+#define GET16BYTES(reg1, reg2, reg3) \
+ lvx reg1, 0, reg2; \
+ vspltisb v8, -1; \
+ vperm v8, v8, reg1, reg3; \
+ vcmpequb. v8, v0, v8; \
+ beq cr6, 1f; \
+ vspltisb v9, 0; \
+ b 2f; \
+ .align 4; \
+1: \
+ addi r6, reg2, 16; \
+ lvx v9, 0, r6; \
+2: \
+ vperm reg1, v9, reg1, reg3;
+#else
+#define GET16BYTES(reg1, reg2, reg3) \
+ lvx reg1, 0, reg2; \
+ vspltisb v8, -1; \
+ vperm v8, reg1, v8, reg3; \
+ vcmpequb. v8, v0, v8; \
+ beq cr6, 1f; \
+ vspltisb v9, 0; \
+ b 2f; \
+ .align 4; \
+1: \
+ addi r6, reg2, 16; \
+ lvx v9, 0, r6; \
+2: \
+ vperm reg1, reg1, v9, reg3;
+#endif
+
+/* Check null in v4, v5 and convert to lower. */
+#define CHECKNULLANDCONVERT() \
+ vcmpequb. v7, v0, v5; \
+ beq cr6, 3f; \
+ vcmpequb. v7, v0, v4; \
+ beq cr6, 3f; \
+ b L(null_found); \
+ .align 4; \
+3: \
+ TOLOWER()
+
+#ifdef _ARCH_PWR8
+# define VCLZD_V8_v7 vclzd v8, v7;
+# define MFVRD_R3_V1 mfvrd r3, v1;
+# define VSUBUDM_V9_V8 vsubudm v9, v9, v8;
+# define VPOPCNTD_V8_V8 vpopcntd v8, v8;
+# define VADDUQM_V7_V8 vadduqm v9, v7, v8;
+#else
+# define VCLZD_V8_v7 .long 0x11003fc2
+# define MFVRD_R3_V1 .long 0x7c230067
+# define VSUBUDM_V9_V8 .long 0x112944c0
+# define VPOPCNTD_V8_V8 .long 0x110047c3
+# define VADDUQM_V7_V8 .long 0x11274100
+#endif
+
+ .machine power7
+
+ENTRY (__STRCASECMP)
+#ifdef USE_AS_STRNCASECMP
+ CALL_MCOUNT 3
+#else
+ CALL_MCOUNT 2
+#endif
+#define rRTN r3 /* Return value */
+#define rSTR1 r10 /* 1st string */
+#define rSTR2 r4 /* 2nd string */
+#define rCHAR1 r6 /* Byte read from 1st string */
+#define rCHAR2 r7 /* Byte read from 2nd string */
+#define rADDR1 r8 /* Address of tolower(rCHAR1) */
+#define rADDR2 r12 /* Address of tolower(rCHAR2) */
+#define rLWR1 r8 /* Word tolower(rCHAR1) */
+#define rLWR2 r12 /* Word tolower(rCHAR2) */
+#define rTMP r9
+#define rLOC r11 /* Default locale address */
+
+ cmpd cr7, rRTN, rSTR2
+
+ /* Get locale address. */
+ ld rTMP, __libc_tsd_LOCALE@got@tprel(r2)
+ add rLOC, rTMP, __libc_tsd_LOCALE@tls
+ ld rLOC, 0(rLOC)
+
+ mr rSTR1, rRTN
+ li rRTN, 0
+ beqlr cr7
+#ifdef USE_AS_STRNCASECMP
+ cmpdi cr7, r5, 0
+ beq cr7, L(retnull)
+ cmpdi cr7, r5, 16
+ blt cr7, L(bytebybyte)
+#endif
+ vspltisb v0, 0
+ vspltisb v8, -1
+	/* Check for a null in the initial characters; check at most
+	   16 chars depending on the alignment.
+	   If a null is present, proceed byte by byte. */
+ lvx v4, 0, rSTR1
+#ifdef __LITTLE_ENDIAN__
+ lvsr v10, 0, rSTR1 /* Compute mask. */
+ vperm v9, v8, v4, v10 /* Mask bits that are not part of string. */
+#else
+ lvsl v10, 0, rSTR1
+ vperm v9, v4, v8, v10
+#endif
+ vcmpequb. v9, v0, v9 /* Check for null bytes. */
+ bne cr6, L(bytebybyte)
+ lvx v5, 0, rSTR2
+ /* Calculate alignment. */
+#ifdef __LITTLE_ENDIAN__
+ lvsr v6, 0, rSTR2
+ vperm v9, v8, v5, v6 /* Mask bits that are not part of string. */
+#else
+ lvsl v6, 0, rSTR2
+ vperm v9, v5, v8, v6
+#endif
+ vcmpequb. v9, v0, v9 /* Check for null bytes. */
+ bne cr6, L(bytebybyte)
+	/* Check if the locale has non-ASCII characters.  */
+ ld rTMP, 0(rLOC)
+ addi r6, rTMP,LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES
+ lwz rTMP, 0(r6)
+ cmpdi cr7, rTMP, 1
+ beq cr7, L(bytebybyte)
+
+ /* Load vector registers with values used for TOLOWER. */
+	/* Load v1 = 0xbf, v2 = 0x19, v3 = 0x20 in each byte.  */
+ vspltisb v3, 2
+ vspltisb v9, 4
+ vsl v3, v3, v9
+ vaddubm v1, v3, v3
+ vnor v1, v1, v1
+ vspltisb v2, 7
+ vsububm v2, v3, v2
+
+ andi. rADDR1, rSTR1, 0xF
+ beq cr0, L(align)
+ addi r6, rSTR1, 16
+ lvx v9, 0, r6
+ /* Compute 16 bytes from previous two loads. */
+#ifdef __LITTLE_ENDIAN__
+ vperm v4, v9, v4, v10
+#else
+ vperm v4, v4, v9, v10
+#endif
+L(align):
+ andi. rADDR2, rSTR2, 0xF
+ beq cr0, L(align1)
+ addi r6, rSTR2, 16
+ lvx v9, 0, r6
+ /* Compute 16 bytes from previous two loads. */
+#ifdef __LITTLE_ENDIAN__
+ vperm v5, v9, v5, v6
+#else
+ vperm v5, v5, v9, v6
+#endif
+L(align1):
+ CHECKNULLANDCONVERT()
+ blt cr6, L(match)
+ b L(different)
+ .align 4
+L(match):
+ clrldi r6, rSTR1, 60
+ subfic r7, r6, 16
+#ifdef USE_AS_STRNCASECMP
+ sub r5, r5, r7
+#endif
+ add rSTR1, rSTR1, r7
+ add rSTR2, rSTR2, r7
+ andi. rADDR2, rSTR2, 0xF
+ addi rSTR1, rSTR1, -16
+ addi rSTR2, rSTR2, -16
+ beq cr0, L(aligned)
+#ifdef __LITTLE_ENDIAN__
+ lvsr v6, 0, rSTR2
+#else
+ lvsl v6, 0, rSTR2
+#endif
+	/* There are 2 loops depending on the input alignment.
+	   Each loop gets 16 bytes from s1 and s2, checks for null,
+	   converts to lowercase and compares. Loop until a difference
+	   or a null occurs. */
+L(s1_align):
+ addi rSTR1, rSTR1, 16
+ addi rSTR2, rSTR2, 16
+#ifdef USE_AS_STRNCASECMP
+ cmpdi cr7, r5, 16
+ blt cr7, L(bytebybyte)
+ addi r5, r5, -16
+#endif
+ lvx v4, 0, rSTR1
+ GET16BYTES(v5, rSTR2, v6)
+ CHECKNULLANDCONVERT()
+ blt cr6, L(s1_align)
+ b L(different)
+ .align 4
+L(aligned):
+ addi rSTR1, rSTR1, 16
+ addi rSTR2, rSTR2, 16
+#ifdef USE_AS_STRNCASECMP
+ cmpdi cr7, r5, 16
+ blt cr7, L(bytebybyte)
+ addi r5, r5, -16
+#endif
+ lvx v4, 0, rSTR1
+ lvx v5, 0, rSTR2
+ CHECKNULLANDCONVERT()
+ blt cr6, L(aligned)
+
+ /* Calculate and return the difference. */
+L(different):
+ vaddubm v1, v3, v3
+ vcmpequb v7, v0, v7
+#ifdef __LITTLE_ENDIAN__
+ /* Count trailing zero. */
+ vspltisb v8, -1
+ VADDUQM_V7_V8
+ vandc v8, v9, v7
+ VPOPCNTD_V8_V8
+ vspltb v6, v8, 15
+ vcmpequb. v6, v6, v1
+ blt cr6, L(shift8)
+#else
+ /* Count leading zero. */
+ VCLZD_V8_v7
+ vspltb v6, v8, 7
+ vcmpequb. v6, v6, v1
+ blt cr6, L(shift8)
+ vsro v8, v8, v1
+#endif
+ b L(skipsum)
+ .align 4
+L(shift8):
+ vsumsws v8, v8, v0
+L(skipsum):
+#ifdef __LITTLE_ENDIAN__
+ /* Shift registers based on leading zero count. */
+ vsro v6, v5, v8
+ vsro v7, v4, v8
+ /* Merge and move to GPR. */
+ vmrglb v6, v6, v7
+ vslo v1, v6, v1
+ MFVRD_R3_V1
+ /* Place the characters that are different in first position. */
+ sldi rSTR2, rRTN, 56
+ srdi rSTR2, rSTR2, 56
+ sldi rSTR1, rRTN, 48
+ srdi rSTR1, rSTR1, 56
+#else
+ vslo v6, v5, v8
+ vslo v7, v4, v8
+ vmrghb v1, v6, v7
+ MFVRD_R3_V1
+ srdi rSTR2, rRTN, 48
+ sldi rSTR2, rSTR2, 56
+ srdi rSTR2, rSTR2, 56
+ srdi rSTR1, rRTN, 56
+#endif
+ subf rRTN, rSTR1, rSTR2
+ extsw rRTN, rRTN
+ blr
+
+ .align 4
+ /* OK. We've hit the end of the string. We need to be careful that
+ we don't compare two strings as different because of junk beyond
+ the end of the strings... */
+L(null_found):
+ vaddubm v10, v3, v3
+#ifdef __LITTLE_ENDIAN__
+ /* Count trailing zero. */
+ vspltisb v8, -1
+ VADDUQM_V7_V8
+ vandc v8, v9, v7
+ VPOPCNTD_V8_V8
+ vspltb v6, v8, 15
+ vcmpequb. v6, v6, v10
+ blt cr6, L(shift_8)
+#else
+ /* Count leading zero. */
+ VCLZD_V8_v7
+ vspltb v6, v8, 7
+ vcmpequb. v6, v6, v10
+ blt cr6, L(shift_8)
+ vsro v8, v8, v10
+#endif
+ b L(skipsum1)
+ .align 4
+L(shift_8):
+ vsumsws v8, v8, v0
+L(skipsum1):
+ /* Calculate shift count based on count of zero. */
+ vspltisb v10, 7
+ vslb v10, v10, v10
+ vsldoi v9, v0, v10, 1
+ VSUBUDM_V9_V8
+ vspltisb v8, 8
+ vsldoi v8, v0, v8, 1
+ VSUBUDM_V9_V8
+ /* Shift and remove junk after null character. */
+#ifdef __LITTLE_ENDIAN__
+ vslo v5, v5, v9
+ vslo v4, v4, v9
+#else
+ vsro v5, v5, v9
+ vsro v4, v4, v9
+#endif
+ /* Convert and compare 16 bytes. */
+ TOLOWER()
+ blt cr6, L(retnull)
+ b L(different)
+ .align 4
+L(retnull):
+ li rRTN, 0
+ blr
+ .align 4
+L(bytebybyte):
+	/* Unrolled loop for POWER: loads are done with 'lbz' plus
+	   offset, and the string pointers are only updated at the
+	   end of the unrolled body. */
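
   Ignoring the unrolling, each step of this loop is equivalent to the
   following C sketch.  The table pointer corresponds to rLOC after the
   LOCALE_CTYPE_TOLOWER load below; the name ctype_tolower is illustrative
   only, and the strncasecmp variant additionally bounds the loop by r5:

     #include <stdint.h>

     static int
     casecmp_bytewise (const unsigned char *s1, const unsigned char *s2,
                       const int32_t *ctype_tolower)
     {
       int32_t l1, l2;
       do
         {
           l1 = ctype_tolower[*s1++];   /* lwzx rLWR1, rLOC, rCHAR1 << 2  */
           l2 = ctype_tolower[*s2++];
         }
       while (l1 == l2 && s1[-1] != '\0');
       return l1 - l2;                  /* subf/extsw at L(done).  */
     }
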
+ ld rLOC, LOCALE_CTYPE_TOLOWER(rLOC)
+ lbz rCHAR1, 0(rSTR1) /* Load char from s1 */
+ lbz rCHAR2, 0(rSTR2) /* Load char from s2 */
+#ifdef USE_AS_STRNCASECMP
+ rldicl rTMP, r5, 62, 2
+ cmpdi cr7, rTMP, 0
+ beq cr7, L(lessthan4)
+ mtctr rTMP
+#endif
+L(loop):
+ cmpdi rCHAR1, 0 /* *s1 == '\0' ? */
+ sldi rADDR1, rCHAR1, 2 /* Calculate address for tolower(*s1) */
+ sldi rADDR2, rCHAR2, 2 /* Calculate address for tolower(*s2) */
+ lwzx rLWR1, rLOC, rADDR1 /* Load tolower(*s1) */
+ lwzx rLWR2, rLOC, rADDR2 /* Load tolower(*s2) */
+ cmpw cr1, rLWR1, rLWR2 /* r = tolower(*s1) == tolower(*s2) ? */
+	crorc 4*cr1+eq,eq,4*cr1+eq /* (*s1 == '\0') || (tolower(*s1) != tolower(*s2)) */
+ beq cr1, L(done)
+ lbz rCHAR1, 1(rSTR1)
+ lbz rCHAR2, 1(rSTR2)
+ cmpdi rCHAR1, 0
+ sldi rADDR1, rCHAR1, 2
+ sldi rADDR2, rCHAR2, 2
+ lwzx rLWR1, rLOC, rADDR1
+ lwzx rLWR2, rLOC, rADDR2
+ cmpw cr1, rLWR1, rLWR2
+ crorc 4*cr1+eq,eq,4*cr1+eq
+ beq cr1, L(done)
+ lbz rCHAR1, 2(rSTR1)
+ lbz rCHAR2, 2(rSTR2)
+ cmpdi rCHAR1, 0
+ sldi rADDR1, rCHAR1, 2
+ sldi rADDR2, rCHAR2, 2
+ lwzx rLWR1, rLOC, rADDR1
+ lwzx rLWR2, rLOC, rADDR2
+ cmpw cr1, rLWR1, rLWR2
+ crorc 4*cr1+eq,eq,4*cr1+eq
+ beq cr1, L(done)
+ lbz rCHAR1, 3(rSTR1)
+ lbz rCHAR2, 3(rSTR2)
+ cmpdi rCHAR1, 0
+ /* Increment both string descriptors */
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+ sldi rADDR1, rCHAR1, 2
+ sldi rADDR2, rCHAR2, 2
+ lwzx rLWR1, rLOC, rADDR1
+ lwzx rLWR2, rLOC, rADDR2
+ cmpw cr1, rLWR1, rLWR2
+ crorc 4*cr1+eq,eq,4*cr1+eq
+ beq cr1, L(done)
+ lbz rCHAR1, 0(rSTR1) /* Load char from s1 */
+ lbz rCHAR2, 0(rSTR2) /* Load char from s2 */
+#ifdef USE_AS_STRNCASECMP
+ bdnz L(loop)
+#else
+ b L(loop)
+#endif
+#ifdef USE_AS_STRNCASECMP
+L(lessthan4):
+ clrldi r5, r5, 62
+ cmpdi cr7, r5, 0
+ beq cr7, L(retnull)
+ mtctr r5
+L(loop1):
+ cmpdi rCHAR1, 0
+ sldi rADDR1, rCHAR1, 2
+ sldi rADDR2, rCHAR2, 2
+ lwzx rLWR1, rLOC, rADDR1
+ lwzx rLWR2, rLOC, rADDR2
+ cmpw cr1, rLWR1, rLWR2
+ crorc 4*cr1+eq,eq,4*cr1+eq
+ beq cr1, L(done)
+ addi rSTR1, rSTR1, 1
+ addi rSTR2, rSTR2, 1
+ lbz rCHAR1, 0(rSTR1)
+ lbz rCHAR2, 0(rSTR2)
+ bdnz L(loop1)
+#endif
+L(done):
+ subf r0, rLWR2, rLWR1
+ extsw rRTN, r0
+ blr
+END (__STRCASECMP)
+
+weak_alias (__STRCASECMP, STRCASECMP)
+libc_hidden_builtin_def (__STRCASECMP)
diff --git a/sysdeps/powerpc/powerpc64/power8/strcasestr-ppc64.c b/sysdeps/powerpc/powerpc64/power8/strcasestr-ppc64.c
new file mode 100644
index 0000000000..221d4733f4
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strcasestr-ppc64.c
@@ -0,0 +1,29 @@
+/* Optimized strcasestr implementation for PowerPC64/POWER8.
+ Copyright (C) 2016-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <string.h>
+
+#define STRCASESTR __strcasestr_ppc
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(__name)
+
+#undef weak_alias
+#define weak_alias(a,b)
+extern __typeof (strcasestr) __strcasestr_ppc attribute_hidden;
+
+#include <string/strcasestr.c>
diff --git a/sysdeps/powerpc/powerpc64/power8/strcasestr.S b/sysdeps/powerpc/powerpc64/power8/strcasestr.S
new file mode 100644
index 0000000000..9fc24c29f9
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strcasestr.S
@@ -0,0 +1,538 @@
+/* Optimized strcasestr implementation for PowerPC64/POWER8.
+ Copyright (C) 2016-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <locale-defines.h>
+
+/* Char * [r3] strcasestr (char *s [r3], char * pat[r4]) */
+
+/* The performance gain is obtained by comparing 16 bytes. */
+
+/* When the first char of r4 is hit ITERATIONS times in r3,
+   fall back to the default implementation.  */
+#define ITERATIONS 64
+
+#ifndef STRCASESTR
+# define STRCASESTR __strcasestr
+#endif
+
+#ifndef STRLEN
+/* For builds without IFUNC support, local calls should be made to internal
+ GLIBC symbol (created by libc_hidden_builtin_def). */
+# ifdef SHARED
+# define STRLEN __GI_strlen
+# else
+# define STRLEN strlen
+# endif
+#endif
+
+#ifndef STRNLEN
+/* For builds without IFUNC support, local calls should be made to internal
+ GLIBC symbol (created by libc_hidden_builtin_def). */
+# ifdef SHARED
+# define STRNLEN __GI_strnlen
+# else
+# define STRNLEN __strnlen
+# endif
+#endif
+
+#ifndef STRCHR
+# ifdef SHARED
+# define STRCHR __GI_strchr
+# else
+# define STRCHR strchr
+# endif
+#endif
+
+/* Convert 16 bytes of v4 and reg to lowercase and compare. */
+#define TOLOWER(reg) \
+ vcmpgtub v6, v4, v1; \
+ vcmpgtub v7, v2, v4; \
+ vand v8, v7, v6; \
+ vand v8, v8, v3; \
+ vor v4, v8, v4; \
+ vcmpgtub v6, reg, v1; \
+ vcmpgtub v7, v2, reg; \
+ vand v8, v7, v6; \
+ vand v8, v8, v3; \
+ vor reg, v8, reg; \
+ vcmpequb. v6, reg, v4;
+
+/* TODO: change these to the actual instructions when the minimum required
+ binutils allows it. */
+#ifdef _ARCH_PWR8
+#define VCLZD_V8_v7 vclzd v8, v7;
+#else
+#define VCLZD_V8_v7 .long 0x11003fc2
+#endif
+
+#define FRAMESIZE (FRAME_MIN_SIZE+48)
+/* TODO: change this to .machine power8 when the minimum required binutils
+ allows it. */
+ .machine power7
+ENTRY (STRCASESTR, 4)
+ CALL_MCOUNT 2
+ mflr r0 /* Load link register LR to r0. */
+ std r31, -8(r1) /* Save callers register r31. */
+ std r30, -16(r1) /* Save callers register r30. */
+ std r29, -24(r1) /* Save callers register r29. */
+ std r28, -32(r1) /* Save callers register r28. */
+ std r27, -40(r1) /* Save callers register r27. */
+ std r0, 16(r1) /* Store the link register. */
+ cfi_offset(r31, -8)
+ cfi_offset(r30, -16)
+ cfi_offset(r29, -24)
+ cfi_offset(r28, -32)
+ cfi_offset(r27, -40)
+ cfi_offset(lr, 16)
+ stdu r1, -FRAMESIZE(r1) /* Create the stack frame. */
+ cfi_adjust_cfa_offset(FRAMESIZE)
+
+ dcbt 0, r3
+ dcbt 0, r4
+ cmpdi cr7, r3, 0 /* Input validation. */
+ beq cr7, L(retnull)
+ cmpdi cr7, r4, 0
+ beq cr7, L(retnull)
+
+ mr r29, r3
+ mr r30, r4
+	/* Load the first byte from r4 and check if it is null.  */
+ lbz r6, 0(r4)
+ cmpdi cr7, r6, 0
+ beq cr7, L(ret_r3)
+
+ ld r10, __libc_tsd_LOCALE@got@tprel(r2)
+ add r9, r10, __libc_tsd_LOCALE@tls
+ ld r9, 0(r9)
+ ld r9, LOCALE_CTYPE_TOUPPER(r9)
+ sldi r10, r6, 2 /* Convert to upper case. */
+ lwzx r28, r9, r10
+
+ ld r10, __libc_tsd_LOCALE@got@tprel(r2)
+ add r11, r10, __libc_tsd_LOCALE@tls
+ ld r11, 0(r11)
+ ld r11, LOCALE_CTYPE_TOLOWER(r11)
+ sldi r10, r6, 2 /* Convert to lower case. */
+ lwzx r27, r11, r10
+
+ /* Check if the first char is present. */
+ mr r4, r27
+ bl STRCHR
+ nop
+ mr r5, r3
+ mr r3, r29
+ mr r29, r5
+ mr r4, r28
+ bl STRCHR
+ nop
+ cmpdi cr7, r29, 0
+ beq cr7, L(firstpos)
+ cmpdi cr7, r3, 0
+ beq cr7, L(skipcheck)
+ cmpw cr7, r3, r29
+ ble cr7, L(firstpos)
+	/* Move r3 to the first occurrence.  */
+L(skipcheck):
+ mr r3, r29
+L(firstpos):
+ mr r29, r3
+
+ sldi r9, r27, 8
+ or r28, r9, r28
+ /* Reg r27 is used to count the number of iterations. */
+ li r27, 0
+ /* If first char of search str is not present. */
+ cmpdi cr7, r3, 0
+ ble cr7, L(end)
+
+ /* Find the length of pattern. */
+ mr r3, r30
+ bl STRLEN
+ nop
+
+	cmpdi	cr7, r3, 0	/* If the search string is empty.  */
+ beq cr7, L(ret_r3)
+
+ mr r31, r3
+ mr r4, r3
+ mr r3, r29
+ bl STRNLEN
+ nop
+
+ cmpd cr7, r3, r31 /* If len(r3) < len(r4). */
+ blt cr7, L(retnull)
+
+ mr r3, r29
+
+	/* Check whether the locale's single-byte ctype data is non-ASCII.  */
+ ld r10, __libc_tsd_LOCALE@got@tprel(r2)
+ add r9, r10, __libc_tsd_LOCALE@tls
+ ld r9, 0(r9)
+ ld r7, 0(r9)
+ addi r7, r7, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES
+ lwz r8, 0(r7)
+ cmpdi cr7, r8, 1
+ beq cr7, L(bytebybyte)
+
+ /* If len(r4) < 16 handle byte by byte. */
+ /* For shorter strings we will not use vector registers. */
+ cmpdi cr7, r31, 16
+ blt cr7, L(bytebybyte)
+
+ /* Comparison values used for TOLOWER. */
+ /* Load v1 = 64('A' - 1), v2 = 91('Z' + 1), v3 = 32 in each byte. */
+ vspltish v0, 0
+ vspltisb v5, 2
+ vspltisb v4, 4
+ vsl v3, v5, v4
+ vaddubm v1, v3, v3
+ vspltisb v5, 15
+ vaddubm v2, v5, v5
+ vaddubm v2, v1, v2
+ vspltisb v4, -3
+ vaddubm v2, v2, v4
+
+	/*
+	1. Load 16 bytes from r3 and r4.
+	2. Check for a null byte.  If found, take the byte-by-byte path.
+	3. Else, convert both to lowercase and compare.
+	4. If they are the same, proceed to 1.
+	5. If they do not match, find whether the first char of r4 is
+	present in the loaded 16 bytes of r3.
+	6. If yes, move the position, load the next 16 bytes of r3 and
+	proceed to 2.
+	*/
+
+ mr r8, r3 /* Save r3 for future use. */
+ mr r4, r30 /* Restore r4. */
+ clrldi r10, r4, 60
+ lvx v5, 0, r4 /* Load 16 bytes from r4. */
+ cmpdi cr7, r10, 0
+ beq cr7, L(begin2)
+ /* If r4 is unaligned, load another 16 bytes. */
+#ifdef __LITTLE_ENDIAN__
+ lvsr v7, 0, r4
+#else
+ lvsl v7, 0, r4
+#endif
+ addi r5, r4, 16
+ lvx v9, 0, r5
+#ifdef __LITTLE_ENDIAN__
+ vperm v5, v9, v5, v7
+#else
+ vperm v5, v5, v9, v7
+#endif
+L(begin2):
+ lvx v4, 0, r3
+ vcmpequb. v7, v0, v4 /* Check for null. */
+ beq cr6, L(nullchk6)
+ b L(trailcheck)
+
+ .align 4
+L(nullchk6):
+ clrldi r10, r3, 60
+ cmpdi cr7, r10, 0
+ beq cr7, L(next16)
+#ifdef __LITTLE_ENDIAN__
+ lvsr v7, 0, r3
+#else
+ lvsl v7, 0, r3
+#endif
+ addi r5, r3, 16
+ /* If r3 is unaligned, load another 16 bytes. */
+ lvx v10, 0, r5
+#ifdef __LITTLE_ENDIAN__
+ vperm v4, v10, v4, v7
+#else
+ vperm v4, v4, v10, v7
+#endif
+L(next16):
+ vcmpequb. v6, v0, v5 /* Check for null. */
+ beq cr6, L(nullchk)
+ b L(trailcheck)
+
+ .align 4
+L(nullchk):
+ vcmpequb. v6, v0, v4
+ beq cr6, L(nullchk1)
+ b L(retnull)
+
+ .align 4
+L(nullchk1):
+	/* Convert both v4 and v5 to lowercase.  */
+ TOLOWER(v5)
+ /* If both are same, branch to match. */
+ blt cr6, L(match)
+ /* Find if the first char is present in next 15 bytes. */
+#ifdef __LITTLE_ENDIAN__
+ vspltb v6, v5, 15
+ vsldoi v7, v0, v4, 15
+#else
+ vspltb v6, v5, 0
+ vspltisb v7, 8
+ vslo v7, v4, v7
+#endif
+ vcmpequb v7, v6, v7
+ vcmpequb. v6, v0, v7
+ /* Shift r3 by 16 bytes and proceed. */
+ blt cr6, L(shift16)
+ VCLZD_V8_v7
+#ifdef __LITTLE_ENDIAN__
+ vspltb v6, v8, 15
+#else
+ vspltb v6, v8, 7
+#endif
+ vcmpequb. v6, v6, v1
+ /* Shift r3 by 8 bytes and proceed. */
+ blt cr6, L(shift8)
+ b L(begin)
+
+ .align 4
+L(match):
+ /* There is a match of 16 bytes, check next bytes. */
+ cmpdi cr7, r31, 16
+ mr r29, r3
+ beq cr7, L(ret_r3)
+
+L(secondmatch):
+ addi r3, r3, 16
+ addi r4, r4, 16
+ /* Load next 16 bytes of r3 and r4 and compare. */
+ clrldi r10, r4, 60
+ cmpdi cr7, r10, 0
+ beq cr7, L(nextload)
+ /* Handle unaligned case. */
+ vor v6, v9, v9
+ vcmpequb. v7, v0, v6
+ beq cr6, L(nullchk2)
+ b L(trailcheck)
+
+ .align 4
+L(nullchk2):
+#ifdef __LITTLE_ENDIAN__
+ lvsr v7, 0, r4
+#else
+ lvsl v7, 0, r4
+#endif
+ addi r5, r4, 16
+ /* If r4 is unaligned, load another 16 bytes. */
+ lvx v9, 0, r5
+#ifdef __LITTLE_ENDIAN__
+ vperm v11, v9, v6, v7
+#else
+ vperm v11, v6, v9, v7
+#endif
+ b L(compare)
+
+ .align 4
+L(nextload):
+ lvx v11, 0, r4
+L(compare):
+ vcmpequb. v7, v0, v11
+ beq cr6, L(nullchk3)
+ b L(trailcheck)
+
+ .align 4
+L(nullchk3):
+ clrldi r10, r3, 60
+ cmpdi cr7, r10, 0
+ beq cr7, L(nextload1)
+ /* Handle unaligned case. */
+ vor v4, v10, v10
+ vcmpequb. v7, v0, v4
+ beq cr6, L(nullchk4)
+ b L(retnull)
+
+ .align 4
+L(nullchk4):
+#ifdef __LITTLE_ENDIAN__
+ lvsr v7, 0, r3
+#else
+ lvsl v7, 0, r3
+#endif
+ addi r5, r3, 16
+ /* If r3 is unaligned, load another 16 bytes. */
+ lvx v10, 0, r5
+#ifdef __LITTLE_ENDIAN__
+ vperm v4, v10, v4, v7
+#else
+ vperm v4, v4, v10, v7
+#endif
+ b L(compare1)
+
+ .align 4
+L(nextload1):
+ lvx v4, 0, r3
+L(compare1):
+ vcmpequb. v7, v0, v4
+ beq cr6, L(nullchk5)
+ b L(retnull)
+
+ .align 4
+L(nullchk5):
+	/* Convert both v4 and v11 to lowercase.  */
+ TOLOWER(v11)
+ /* If both are same, branch to secondmatch. */
+ blt cr6, L(secondmatch)
+ /* Continue the search. */
+ b L(begin)
+
+ .align 4
+L(trailcheck):
+ ld r10, __libc_tsd_LOCALE@got@tprel(r2)
+ add r11, r10, __libc_tsd_LOCALE@tls
+ ld r11, 0(r11)
+ ld r11, LOCALE_CTYPE_TOLOWER(r11)
+L(loop2):
+ lbz r5, 0(r3) /* Load byte from r3. */
+ lbz r6, 0(r4) /* Load next byte from r4. */
+ cmpdi cr7, r6, 0 /* Is it null? */
+ beq cr7, L(updater3)
+ cmpdi cr7, r5, 0 /* Is it null? */
+ beq cr7, L(retnull) /* If yes, return. */
+ addi r3, r3, 1
+ addi r4, r4, 1 /* Increment r4. */
+ sldi r10, r5, 2 /* Convert to lower case. */
+ lwzx r10, r11, r10
+ sldi r7, r6, 2 /* Convert to lower case. */
+ lwzx r7, r11, r7
+ cmpw cr7, r7, r10 /* Compare with byte from r4. */
+ bne cr7, L(begin)
+ b L(loop2)
+
+ .align 4
+L(shift8):
+ addi r8, r8, 7
+ b L(begin)
+ .align 4
+L(shift16):
+ addi r8, r8, 15
+ .align 4
+L(begin):
+ addi r8, r8, 1
+ mr r3, r8
+	/* When our iterations exceed ITERATIONS, fall back to default.  */
+ addi r27, r27, 1
+ cmpdi cr7, r27, ITERATIONS
+ beq cr7, L(default)
+ mr r4, r30 /* Restore r4. */
+ b L(begin2)
+
+ /* Handling byte by byte. */
+ .align 4
+L(loop1):
+ mr r3, r8
+ addi r27, r27, 1
+ cmpdi cr7, r27, ITERATIONS
+ beq cr7, L(default)
+ mr r29, r8
+ srdi r4, r28, 8
+ /* Check if the first char is present. */
+ bl STRCHR
+ nop
+ mr r5, r3
+ mr r3, r29
+ mr r29, r5
+ sldi r4, r28, 56
+ srdi r4, r4, 56
+ bl STRCHR
+ nop
+ cmpdi cr7, r29, 0
+ beq cr7, L(nextpos)
+ cmpdi cr7, r3, 0
+ beq cr7, L(skipcheck1)
+ cmpw cr7, r3, r29
+ ble cr7, L(nextpos)
+	/* Move r3 to the first occurrence.  */
+L(skipcheck1):
+ mr r3, r29
+L(nextpos):
+ mr r29, r3
+ cmpdi cr7, r3, 0
+ ble cr7, L(retnull)
+L(bytebybyte):
+ ld r10, __libc_tsd_LOCALE@got@tprel(r2)
+ add r11, r10, __libc_tsd_LOCALE@tls
+ ld r11, 0(r11)
+ ld r11, LOCALE_CTYPE_TOLOWER(r11)
+ mr r4, r30 /* Restore r4. */
+ mr r8, r3 /* Save r3. */
+ addi r8, r8, 1
+
+L(loop):
+ addi r3, r3, 1
+ lbz r5, 0(r3) /* Load byte from r3. */
+ addi r4, r4, 1 /* Increment r4. */
+ lbz r6, 0(r4) /* Load next byte from r4. */
+ cmpdi cr7, r6, 0 /* Is it null? */
+ beq cr7, L(updater3)
+ cmpdi cr7, r5, 0 /* Is it null? */
+ beq cr7, L(retnull) /* If yes, return. */
+ sldi r10, r5, 2 /* Convert to lower case. */
+ lwzx r10, r11, r10
+ sldi r7, r6, 2 /* Convert to lower case. */
+ lwzx r7, r11, r7
+ cmpw cr7, r7, r10 /* Compare with byte from r4. */
+ bne cr7, L(loop1)
+ b L(loop)
+
+ /* Handling return values. */
+ .align 4
+L(updater3):
+	subf	r3, r31, r3	/* Subtract r31 (length of r4) from r3.  */
+ b L(end)
+
+ .align 4
+L(ret_r3):
+ mr r3, r29 /* Return point of match. */
+ b L(end)
+
+ .align 4
+L(retnull):
+ li r3, 0 /* Substring was not found. */
+ b L(end)
+
+ .align 4
+L(default):
+ mr r4, r30
+ bl __strcasestr_ppc
+ nop
+
+ .align 4
+L(end):
+ addi r1, r1, FRAMESIZE /* Restore stack pointer. */
+ cfi_adjust_cfa_offset(-FRAMESIZE)
+ ld r0, 16(r1) /* Restore the saved link register. */
+ ld r27, -40(r1)
+ ld r28, -32(r1)
+ ld r29, -24(r1) /* Restore callers save register r29. */
+ ld r30, -16(r1) /* Restore callers save register r30. */
+ ld r31, -8(r1) /* Restore callers save register r31. */
+ cfi_restore(lr)
+ cfi_restore(r27)
+ cfi_restore(r28)
+ cfi_restore(r29)
+ cfi_restore(r30)
+ cfi_restore(r31)
+ mtlr r0 /* Branch to link register. */
+ blr
+END (STRCASESTR)
+
+weak_alias (__strcasestr, strcasestr)
+libc_hidden_def (__strcasestr)
+libc_hidden_builtin_def (strcasestr)
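
The numbered comment inside the function above describes a 16-byte-at-a-time search; stripped of the vector handling, the same overall strategy (find a candidate position where the first pattern byte matches in either case, then verify the rest case-insensitively) looks roughly like the C sketch below. The name and the use of tolower() instead of the locale tables are illustrative only, not part of the patch.

#include <ctype.h>
#include <stddef.h>

/* Sketch of the scalar strategy the assembly follows: locate a candidate
   position for the first needle byte, then verify the remainder of the
   needle case-insensitively.  */
static char *
strcasestr_sketch (const char *haystack, const char *needle)
{
  if (needle[0] == '\0')
    return (char *) haystack;

  int first = tolower ((unsigned char) needle[0]);
  for (; *haystack != '\0'; haystack++)
    {
      if (tolower ((unsigned char) *haystack) != first)
        continue;
      const char *h = haystack, *n = needle;
      while (*n != '\0'
             && tolower ((unsigned char) *h) == tolower ((unsigned char) *n))
        h++, n++;
      if (*n == '\0')
        return (char *) haystack;   /* Whole needle matched here.  */
    }
  return NULL;                      /* Needle not found.  */
}
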
diff --git a/sysdeps/powerpc/powerpc64/power8/strchr.S b/sysdeps/powerpc/powerpc64/power8/strchr.S
new file mode 100644
index 0000000000..c5e28d9c9e
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strchr.S
@@ -0,0 +1,377 @@
+/* Optimized strchr implementation for PowerPC64/POWER8.
+ Copyright (C) 2016-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#ifdef USE_AS_STRCHRNUL
+# ifndef STRCHRNUL
+# define FUNC_NAME __strchrnul
+# else
+# define FUNC_NAME STRCHRNUL
+# endif
+#else
+# ifndef STRCHR
+# define FUNC_NAME strchr
+# else
+# define FUNC_NAME STRCHR
+# endif
+#endif /* !USE_AS_STRCHRNUL */
+
+/* int [r3] strchr (char *s [r3], int c [r4]) */
+/* TODO: change these to the actual instructions when the minimum required
+ binutils allows it. */
+#define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16)))
+#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
+#define VBPERMQ(t,a,b) .long (0x1000054c \
+ | ((t)<<(32-11)) \
+ | ((a)<<(32-16)) \
+ | ((b)<<(32-21)) )
+/* TODO: change this to .machine power8 when the minimum required binutils
+ allows it. */
+ .machine power7
+ENTRY_TOCLESS (FUNC_NAME)
+ CALL_MCOUNT 2
+ dcbt 0,r3
+ clrrdi r8,r3,3 /* Align the address to doubleword boundary. */
+ cmpdi cr7,r4,0
+ ld r12,0(r8) /* Load doubleword from memory. */
+ li r0,0 /* Doubleword with null chars to use
+ with cmpb. */
+
+ rlwinm r6,r3,3,26,28 /* Calculate padding. */
+
+ beq cr7,L(null_match)
+
+ /* Replicate byte to doubleword. */
+ insrdi r4,r4,8,48
+ insrdi r4,r4,16,32
+ insrdi r4,r4,32,0
+
+ /* Now r4 has a doubleword of c bytes and r0 has
+ a doubleword of null bytes. */
+
+ cmpb r10,r12,r4 /* Compare each byte against c byte. */
+ cmpb r11,r12,r0 /* Compare each byte against null byte. */
+
+ /* Move the doublewords left and right to discard the bits that are
+ not part of the string and bring them back as zeros. */
+#ifdef __LITTLE_ENDIAN__
+ srd r10,r10,r6
+ srd r11,r11,r6
+ sld r10,r10,r6
+ sld r11,r11,r6
+#else
+ sld r10,r10,r6
+ sld r11,r11,r6
+ srd r10,r10,r6
+ srd r11,r11,r6
+#endif
+ or r5,r10,r11 /* OR the results to speed things up. */
+ cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes
+ have been found. */
+ bne cr7,L(done)
+
+ mtcrf 0x01,r8
+
+ /* Are we now aligned to a doubleword boundary? If so, skip to
+ the main loop. Otherwise, go through the alignment code. */
+
+ bt 28,L(loop)
+
+ /* Handle WORD2 of pair. */
+ ldu r12,8(r8)
+ cmpb r10,r12,r4
+ cmpb r11,r12,r0
+ or r5,r10,r11
+ cmpdi cr7,r5,0
+ bne cr7,L(done)
+ b L(loop) /* We branch here (rather than falling through)
+ to skip the nops due to heavy alignment
+ of the loop below. */
+
+ .p2align 5
+L(loop):
+ /* Load two doublewords, compare and merge in a
+ single register for speed. This is an attempt
+ to speed up the null-checking process for bigger strings. */
+ ld r12,8(r8)
+ ldu r9,16(r8)
+ cmpb r10,r12,r4
+ cmpb r11,r12,r0
+ cmpb r6,r9,r4
+ cmpb r7,r9,r0
+ or r5,r10,r11
+ or r9,r6,r7
+ or r12,r5,r9
+ cmpdi cr7,r12,0
+ beq cr7,L(vector)
+ /* OK, one (or both) of the doublewords contains a c/null byte. Check
+ the first doubleword and decrement the address in case the first
+ doubleword really contains a c/null byte. */
+
+ cmpdi cr6,r5,0
+ addi r8,r8,-8
+ bne cr6,L(done)
+
+ /* The c/null byte must be in the second doubleword. Adjust the
+ address again and move the result of cmpb to r10 so we can calculate
+ the pointer. */
+
+ mr r10,r6
+ mr r11,r7
+ addi r8,r8,8
+#ifdef USE_AS_STRCHRNUL
+ mr r5, r9
+#endif
+ /* r10/r11 have the output of the cmpb instructions, that is,
+ 0xff in the same position as the c/null byte in the original
+ doubleword from the string. Use that to calculate the pointer. */
+L(done):
+#ifdef USE_AS_STRCHRNUL
+ mr r10, r5
+#endif
+#ifdef __LITTLE_ENDIAN__
+ addi r3,r10,-1
+ andc r3,r3,r10
+ popcntd r0,r3
+# ifndef USE_AS_STRCHRNUL
+ addi r4,r11,-1
+ andc r4,r4,r11
+ cmpld cr7,r3,r4
+ bgt cr7,L(no_match)
+# endif
+#else
+ cntlzd r0,r10 /* Count leading zeros before c matches. */
+# ifndef USE_AS_STRCHRNUL
+ cmpld cr7,r11,r10
+ bgt cr7,L(no_match)
+# endif
+#endif
+ srdi r0,r0,3 /* Convert leading zeros to bytes. */
+ add r3,r8,r0 /* Return address of the matching c byte
+ or null in case c was not found. */
+ blr
+
+ /* Check the first 32B in GPR's and move to vectorized loop. */
+ .p2align 5
+L(vector):
+ addi r3, r8, 8
+ andi. r10, r3, 31
+ bne cr0, L(loop)
+ vspltisb v0, 0
+ /* Precompute vbpermq constant. */
+ vspltisb v10, 3
+ lvsl v11, r0, r0
+ vslb v10, v11, v10
+ MTVRD(v1,r4)
+ li r5, 16
+ vspltb v1, v1, 7
+ /* Compare 32 bytes in each loop. */
+L(continue):
+ lvx v4, 0, r3
+ lvx v5, r3, r5
+ vcmpequb v2, v0, v4
+ vcmpequb v3, v0, v5
+ vcmpequb v6, v1, v4
+ vcmpequb v7, v1, v5
+ vor v8, v2, v3
+ vor v9, v6, v7
+ vor v11, v8, v9
+ vcmpequb. v11, v0, v11
+ addi r3, r3, 32
+ blt cr6, L(continue)
+ /* One (or both) of the quadwords contains a c/null byte. */
+ addi r3, r3, -32
+#ifndef USE_AS_STRCHRNUL
+ vcmpequb. v11, v0, v9
+ blt cr6, L(no_match)
+#endif
+ /* Permute the first bit of each byte into bits 48-63. */
+ VBPERMQ(v2, v2, v10)
+ VBPERMQ(v3, v3, v10)
+ VBPERMQ(v6, v6, v10)
+ VBPERMQ(v7, v7, v10)
+ /* Shift each component into its correct position for merging. */
+#ifdef __LITTLE_ENDIAN__
+ vsldoi v3, v3, v3, 2
+ vsldoi v7, v7, v7, 2
+#else
+ vsldoi v2, v2, v2, 6
+ vsldoi v3, v3, v3, 4
+ vsldoi v6, v6, v6, 6
+ vsldoi v7, v7, v7, 4
+#endif
+
+ /* Merge the results and move to a GPR. */
+ vor v1, v3, v2
+ vor v2, v6, v7
+ vor v4, v1, v2
+ MFVRD(r5, v4)
+#ifdef __LITTLE_ENDIAN__
+ addi r6, r5, -1
+ andc r6, r6, r5
+ popcntd r6, r6
+#else
+ cntlzd r6, r5 /* Count leading zeros before the match. */
+#endif
+ add r3, r3, r6 /* Compute final length. */
+ /* Return NULL if null found before c. */
+#ifndef USE_AS_STRCHRNUL
+ lbz r4, 0(r3)
+ cmpdi cr7, r4, 0
+ beq cr7, L(no_match)
+#endif
+ blr
+
+#ifndef USE_AS_STRCHRNUL
+ .align 4
+L(no_match):
+ li r3,0
+ blr
+#endif
+
+/* We are here because strchr was called with a null byte. */
+ .align 4
+L(null_match):
+ /* r0 has a doubleword of null bytes. */
+
+ cmpb r5,r12,r0 /* Compare each byte against null bytes. */
+
+ /* Move the doublewords left and right to discard the bits that are
+ not part of the string and bring them back as zeros. */
+#ifdef __LITTLE_ENDIAN__
+ srd r5,r5,r6
+ sld r5,r5,r6
+#else
+ sld r5,r5,r6
+ srd r5,r5,r6
+#endif
+	cmpdi	cr7,r5,0	/* If r5 == 0, no null bytes
+				   have been found.  */
+ bne cr7,L(done_null)
+
+ mtcrf 0x01,r8
+
+ /* Are we now aligned to a quadword boundary? If so, skip to
+ the main loop. Otherwise, go through the alignment code. */
+
+ bt 28,L(loop_null)
+
+ /* Handle WORD2 of pair. */
+ ldu r12,8(r8)
+ cmpb r5,r12,r0
+ cmpdi cr7,r5,0
+ bne cr7,L(done_null)
+ b L(loop_null) /* We branch here (rather than falling through)
+ to skip the nops due to heavy alignment
+ of the loop below. */
+
+ /* Main loop to look for the end of the string. Since it's a
+ small loop (< 8 instructions), align it to 32-bytes. */
+ .p2align 5
+L(loop_null):
+ /* Load two doublewords, compare and merge in a
+ single register for speed. This is an attempt
+ to speed up the null-checking process for bigger strings. */
+ ld r12,8(r8)
+ ldu r11,16(r8)
+ cmpb r5,r12,r0
+ cmpb r10,r11,r0
+ or r6,r5,r10
+ cmpdi cr7,r6,0
+ beq cr7,L(vector1)
+
+ /* OK, one (or both) of the doublewords contains a null byte. Check
+ the first doubleword and decrement the address in case the first
+ doubleword really contains a null byte. */
+
+ cmpdi cr6,r5,0
+ addi r8,r8,-8
+ bne cr6,L(done_null)
+
+ /* The null byte must be in the second doubleword. Adjust the address
+ again and move the result of cmpb to r10 so we can calculate the
+ pointer. */
+
+ mr r5,r10
+ addi r8,r8,8
+
+ /* r5 has the output of the cmpb instruction, that is, it contains
+ 0xff in the same position as the null byte in the original
+ doubleword from the string. Use that to calculate the pointer. */
+L(done_null):
+#ifdef __LITTLE_ENDIAN__
+ addi r0,r5,-1
+ andc r0,r0,r5
+ popcntd r0,r0
+#else
+ cntlzd r0,r5 /* Count leading zeros before the match. */
+#endif
+ srdi r0,r0,3 /* Convert leading zeros to bytes. */
+ add r3,r8,r0 /* Return address of the matching null byte. */
+ blr
+ .p2align 5
+L(vector1):
+ addi r3, r8, 8
+ andi. r10, r3, 31
+ bne cr0, L(loop_null)
+ vspltisb v8, -1
+ vspltisb v0, 0
+ vspltisb v10, 3
+ lvsl v11, r0, r0
+ vslb v10, v11, v10
+ li r5, 16
+L(continue1):
+ lvx v4, 0, r3
+ lvx v5, r3, r5
+ vcmpequb v2, v0, v4
+ vcmpequb v3, v0, v5
+ vor v8, v2, v3
+ vcmpequb. v11, v0, v8
+ addi r3, r3, 32
+ blt cr6, L(continue1)
+ addi r3, r3, -32
+L(end1):
+ VBPERMQ(v2, v2, v10)
+ VBPERMQ(v3, v3, v10)
+ /* Shift each component into its correct position for merging. */
+#ifdef __LITTLE_ENDIAN__
+ vsldoi v3, v3, v3, 2
+#else
+ vsldoi v2, v2, v2, 6
+ vsldoi v3, v3, v3, 4
+#endif
+
+ /* Merge the results and move to a GPR. */
+ vor v4, v3, v2
+ MFVRD(r5, v4)
+#ifdef __LITTLE_ENDIAN__
+ addi r6, r5, -1
+ andc r6, r6, r5
+ popcntd r6, r6
+#else
+ cntlzd r6, r5 /* Count leading zeros before the match. */
+#endif
+ add r3, r3, r6 /* Compute final length. */
+ blr
+END (FUNC_NAME)
+
+#ifndef USE_AS_STRCHRNUL
+weak_alias (strchr, index)
+libc_hidden_builtin_def (strchr)
+#endif
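
The doubleword portion of the code above relies on cmpb producing 0xff in every byte position that equals c or zero. A hedged C approximation of that word-at-a-time test, using the usual SWAR zero-byte trick in place of cmpb, is sketched below; the function names are invented, and the handling of the first partial doubleword and of strict object bounds is glossed over.

#include <stdint.h>
#include <string.h>

/* Sketch only: has_zero() is non-zero iff some byte of v is zero, and the
   same test on (v ^ rep) flags bytes equal to c -- the role cmpb plays in
   the assembly.  */
static inline uint64_t
has_zero (uint64_t v)
{
  return (v - 0x0101010101010101ULL) & ~v & 0x8080808080808080ULL;
}

static const char *
strchr_sketch (const char *s, int c)
{
  uint64_t rep = 0x0101010101010101ULL * (unsigned char) c;

  for (;;)                        /* s assumed 8-byte aligned here.  */
    {
      uint64_t w;
      memcpy (&w, s, sizeof w);   /* One doubleword at a time.  */
      if (has_zero (w) | has_zero (w ^ rep))
        break;                    /* A c byte or the terminator is here.  */
      s += 8;
    }
  for (;; s++)                    /* Pinpoint it byte by byte.  */
    if (*s == (char) c || *s == '\0')
      return *s == (char) c ? s : NULL;
}
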
diff --git a/sysdeps/powerpc/powerpc64/power8/strchrnul.S b/sysdeps/powerpc/powerpc64/power8/strchrnul.S
new file mode 100644
index 0000000000..022ad67a6b
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strchrnul.S
@@ -0,0 +1,23 @@
+/* Optimized strchrnul implementation for PowerPC64/POWER8.
+ Copyright (C) 2016-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define USE_AS_STRCHRNUL 1
+#include <sysdeps/powerpc/powerpc64/power8/strchr.S>
+
+weak_alias (__strchrnul,strchrnul)
+libc_hidden_builtin_def (__strchrnul)
diff --git a/sysdeps/powerpc/powerpc64/power8/strcmp.S b/sysdeps/powerpc/powerpc64/power8/strcmp.S
index 4d6c477194..15e7351d1b 100644
--- a/sysdeps/powerpc/powerpc64/power8/strcmp.S
+++ b/sysdeps/powerpc/powerpc64/power8/strcmp.S
@@ -1,5 +1,5 @@
/* Optimized strcmp implementation for PowerPC64/POWER8.
- Copyright (C) 2015-2016 Free Software Foundation, Inc.
+ Copyright (C) 2015-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -18,6 +18,10 @@
#include <sysdep.h>
+#ifndef STRCMP
+# define STRCMP strcmp
+#endif
+
/* Implements the function
size_t [r3] strcmp (const char *s1 [r3], const char *s2 [r4])
@@ -27,24 +31,24 @@
64K as default, the page cross handling assumes minimum page size of
4k. */
-EALIGN (strcmp, 4, 0)
+ENTRY_TOCLESS (STRCMP, 4)
li r0,0
- /* Check if [s1]+32 or [s2]+32 will cross a 4K page boundary using
+ /* Check if [s1]+16 or [s2]+16 will cross a 4K page boundary using
the code:
(((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE))
- with PAGE_SIZE being 4096 and ITER_SIZE begin 32. */
+	   with PAGE_SIZE being 4096 and ITER_SIZE being 16.  */
rldicl r7,r3,0,52
rldicl r9,r4,0,52
- cmpldi cr7,r7,4096-32
+ cmpldi cr7,r7,4096-16
bgt cr7,L(pagecross_check)
- cmpldi cr5,r9,4096-32
+ cmpldi cr5,r9,4096-16
bgt cr5,L(pagecross_check)
- /* For short string up to 32 bytes, load both s1 and s2 using
+	/* For short strings of up to 16 bytes, load both s1 and s2 using
unaligned dwords and compare. */
ld r8,0(r3)
ld r10,0(r4)
@@ -60,25 +64,11 @@ EALIGN (strcmp, 4, 0)
orc. r9,r12,r11
bne cr0,L(different_nocmpb)
- ld r8,16(r3)
- ld r10,16(r4)
- cmpb r12,r8,r0
- cmpb r11,r8,r10
- orc. r9,r12,r11
- bne cr0,L(different_nocmpb)
-
- ld r8,24(r3)
- ld r10,24(r4)
- cmpb r12,r8,r0
- cmpb r11,r8,r10
- orc. r9,r12,r11
- bne cr0,L(different_nocmpb)
-
- addi r7,r3,32
- addi r4,r4,32
+ addi r7,r3,16
+ addi r4,r4,16
L(align_8b):
- /* Now it has checked for first 32 bytes, align source1 to doubleword
+	/* Now that the first 16 bytes have been checked, align source1 to doubleword
and adjust source2 address. */
rldicl r9,r7,0,61 /* source1 alignment to doubleword */
subf r4,r9,r4 /* Adjust source2 address based on source1
@@ -253,5 +243,5 @@ L(pagecross_retdiff):
L(pagecross_nullfound):
li r3,0
b L(pagecross_retdiff)
-END (strcmp)
+END (STRCMP)
libc_hidden_builtin_def (strcmp)
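
The hunk above shrinks the per-iteration unaligned read from 32 to 16 bytes and updates the page-cross guard quoted in the comment accordingly. As a small illustration of that guard (the macro and function names here are illustrative, not from the patch):

#include <stdint.h>

#define PAGE_SIZE 4096
#define ITER_SIZE 16

/* True when an ITER_SIZE-byte unaligned load starting at p might run into
   the following page, i.e. when fewer than ITER_SIZE bytes remain before
   the page boundary.  */
static inline int
may_cross_page (const void *p)
{
  return ((uintptr_t) p % PAGE_SIZE) > (PAGE_SIZE - ITER_SIZE);
}
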
diff --git a/sysdeps/powerpc/powerpc64/power8/strcpy.S b/sysdeps/powerpc/powerpc64/power8/strcpy.S
index 5130831c6a..956faf714f 100644
--- a/sysdeps/powerpc/powerpc64/power8/strcpy.S
+++ b/sysdeps/powerpc/powerpc64/power8/strcpy.S
@@ -1,5 +1,5 @@
/* Optimized strcpy/stpcpy implementation for PowerPC64/POWER8.
- Copyright (C) 2015-2016 Free Software Foundation, Inc.
+ Copyright (C) 2015-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -19,10 +19,18 @@
#include <sysdep.h>
#ifdef USE_AS_STPCPY
-# define FUNC_NAME __stpcpy
+# ifndef STPCPY
+# define FUNC_NAME __stpcpy
+# else
+# define FUNC_NAME STPCPY
+# endif
#else
-# define FUNC_NAME strcpy
-#endif
+# ifndef STRCPY
+# define FUNC_NAME strcpy
+# else
+# define FUNC_NAME STRCPY
+# endif
+#endif /* !USE_AS_STPCPY */
/* Implements the function
@@ -39,8 +47,8 @@
64K as default, the page cross handling assumes minimum page size of
4k. */
- .machine power7
-EALIGN (FUNC_NAME, 4, 0)
+ .machine power8
+ENTRY_TOCLESS (FUNC_NAME, 4)
li r0,0 /* Doubleword with null chars to use
with cmpb. */
@@ -112,7 +120,7 @@ L(pagecross):
ldu r8, 8(r7)
L(loop_before):
- /* Save the two doublewords readed from source and align the source
+ /* Save the two doublewords read from source and align the source
to 16 bytes for the loop. */
mr r11,r3
std r12,0(r11)
@@ -121,7 +129,150 @@ L(loop_before):
rldicl r9,r4,0,60
subf r7,r9,r7
subf r11,r9,r11
- b L(loop_start)
+	/* The source is adjusted to 16B alignment and the destination r11 is
+	   also moved based on that adjustment.  Now check whether r11 is
+	   also 16B aligned before moving to the vectorized loop.  */
+ andi. r6, r11, 0xF
+ bne L(loop_start)
+
+ /* Prepare for the loop. */
+ subf r4, r9, r4 /* Adjust r4 based on alignment. */
+ li r7, 16 /* Load required offsets. */
+ li r8, 32
+ li r9, 48
+ vspltisb v0, 0
+ addi r4, r4, 16
+ /* Are we 64-byte aligned? If so, jump to the vectorized loop.
+ Else copy 16B till r4 is 64B aligned. */
+ andi. r6, r4, 63
+ beq L(qw_loop)
+
+ lvx v6, 0, r4 /* Load 16 bytes from memory. */
+ vcmpequb. v5, v0, v6 /* Check for null. */
+ bne cr6, L(qw_done)
+ stvx v6, 0, r11 /* Store 16 bytes. */
+ addi r4, r4, 16 /* Increment the address. */
+ addi r11, r11, 16
+ andi. r6, r4, 63
+ beq L(qw_loop)
+
+ lvx v6, 0, r4
+ vcmpequb. v5, v0, v6
+ bne cr6, L(qw_done)
+ stvx v6, 0, r11
+ addi r4, r4, 16
+ addi r11, r11, 16
+ andi. r6, r4, 63
+ beq L(qw_loop)
+
+ lvx v6, 0, r4
+ vcmpequb. v5, v0, v6
+ bne cr6, L(qw_done)
+ stvx v6, 0, r11
+ addi r4, r4, 16
+ addi r11, r11, 16
+
+ .align 4
+L(qw_loop):
+ lvx v1, r4, r0 /* Load 4 quadwords. */
+ lvx v2, r4, r7
+ lvx v3, r4, r8
+ lvx v4, r4, r9
+ vminub v5, v1, v2 /* Compare and merge into one VR for speed. */
+ vminub v8, v3, v4
+ vminub v7, v5, v8
+ vcmpequb. v7, v7, v0 /* Check for NULLs. */
+ bne cr6, L(qw_loop_done)
+ stvx v1, r11, r0 /* Store 4 quadwords. */
+ stvx v2, r11, r7
+ stvx v3, r11, r8
+ stvx v4, r11, r9
+ addi r4, r4, 64 /* Adjust address for the next iteration. */
+ addi r11, r11, 64 /* Adjust address for the next iteration. */
+
+ lvx v1, r4, r0 /* Load 4 quadwords. */
+ lvx v2, r4, r7
+ lvx v3, r4, r8
+ lvx v4, r4, r9
+ vminub v5, v1, v2 /* Compare and merge into one VR for speed. */
+ vminub v8, v3, v4
+ vminub v7, v5, v8
+ vcmpequb. v7, v7, v0 /* Check for NULLs. */
+ bne cr6, L(qw_loop_done)
+ stvx v1, r11, r0 /* Store 4 quadwords. */
+ stvx v2, r11, r7
+ stvx v3, r11, r8
+ stvx v4, r11, r9
+ addi r4, r4, 64 /* Adjust address for the next iteration. */
+ addi r11, r11, 64 /* Adjust address for the next iteration. */
+
+ lvx v1, r4, r0 /* Load 4 quadwords. */
+ lvx v2, r4, r7
+ lvx v3, r4, r8
+ lvx v4, r4, r9
+ vminub v5, v1, v2 /* Compare and merge into one VR for speed. */
+ vminub v8, v3, v4
+ vminub v7, v5, v8
+ vcmpequb. v7, v7, v0 /* Check for NULLs. */
+ bne cr6, L(qw_loop_done)
+ stvx v1, r11, r0 /* Store 4 quadwords. */
+ stvx v2, r11, r7
+ stvx v3, r11, r8
+ stvx v4, r11, r9
+ addi r4, r4, 64 /* Adjust address for the next iteration. */
+ addi r11, r11, 64 /* Adjust address for the next iteration. */
+ b L(qw_loop)
+
+ .align 4
+L(qw_loop_done):
+ /* Null found in one of the 4 loads. */
+ vcmpequb. v7, v1, v0
+ vor v6, v1, v1
+ bne cr6, L(qw_done)
+	/* Null not in the first 16B, so store it.  */
+ stvx v1, r11, r0
+ addi r4, r4, 16
+ addi r11, r11, 16
+ vcmpequb. v7, v2, v0
+ vor v6, v2, v2
+ bne cr6, L(qw_done)
+	/* Null not in the second 16B, so store it.  */
+ stvx v2, r11, r0
+ addi r4, r4, 16
+ addi r11, r11, 16
+ vcmpequb. v7, v3, v0
+ vor v6, v3, v3
+ bne cr6, L(qw_done)
+	/* Null not in the third 16B, so store it.  */
+ stvx v6, r11, r0
+ addi r4, r4, 16
+ addi r11, r11, 16
+ vor v6, v4, v4
+
+ .align 4
+L(qw_done):
+ mr r7, r4
+ /* Move the result to GPR. */
+#ifdef __LITTLE_ENDIAN__
+ vsldoi v4, v6, v0, 8
+ mfvrd r12, v4
+#else
+ mfvrd r12, v6
+#endif
+ /* Check for null in the first 8 bytes. */
+ cmpb r10, r12, r0
+ cmpdi cr6, r10, 0
+ bne cr6, L(done2)
+ /* Null found in second doubleword. */
+#ifdef __LITTLE_ENDIAN__
+ mfvrd r6, v6
+#else
+ vsldoi v6, v6, v0, 8
+ mfvrd r6, v6
+#endif
+ cmpb r10, r6, r0
+ addi r7, r7, 8
+ b L(done2)
.align 5
L(loop):
diff --git a/sysdeps/powerpc/powerpc64/power8/strcspn.S b/sysdeps/powerpc/powerpc64/power8/strcspn.S
new file mode 100644
index 0000000000..c2d130e7db
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strcspn.S
@@ -0,0 +1,20 @@
+/* Optimized strcspn implementation for PowerPC64/POWER8.
+ Copyright (C) 2016-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define USE_AS_STRCSPN 1
+#include <sysdeps/powerpc/powerpc64/power8/strspn.S>
diff --git a/sysdeps/powerpc/powerpc64/power8/strlen.S b/sysdeps/powerpc/powerpc64/power8/strlen.S
new file mode 100644
index 0000000000..719b5c604c
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strlen.S
@@ -0,0 +1,290 @@
+/* Optimized strlen implementation for PowerPC64/POWER8 using a vectorized
+ loop.
+ Copyright (C) 2016-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* TODO: change these to the actual instructions when the minimum required
+ binutils allows it. */
+#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
+#define VBPERMQ(t,a,b) .long (0x1000054c \
+ | ((t)<<(32-11)) \
+ | ((a)<<(32-16)) \
+ | ((b)<<(32-21)) )
+
+/* int [r3] strlen (char *s [r3]) */
+
+#ifndef STRLEN
+# define STRLEN strlen
+#endif
+
+/* TODO: change this to .machine power8 when the minimum required binutils
+ allows it. */
+ .machine power7
+ENTRY_TOCLESS (STRLEN, 4)
+ CALL_MCOUNT 1
+ dcbt 0,r3
+ clrrdi r4,r3,3 /* Align the address to doubleword boundary. */
+ rlwinm r6,r3,3,26,28 /* Calculate padding. */
+ li r0,0 /* Doubleword with null chars to use
+ with cmpb. */
+ li r5,-1 /* MASK = 0xffffffffffffffff. */
+ ld r12,0(r4) /* Load doubleword from memory. */
+#ifdef __LITTLE_ENDIAN__
+ sld r5,r5,r6
+#else
+ srd r5,r5,r6 /* MASK = MASK >> padding. */
+#endif
+ orc r9,r12,r5 /* Mask bits that are not part of the string. */
+ cmpb r10,r9,r0 /* Check for null bytes in DWORD1. */
+ cmpdi cr7,r10,0 /* If r10 == 0, no null's have been found. */
+ bne cr7,L(done)
+
+ /* For shorter strings (< 64 bytes), we will not use vector registers,
+ as the overhead isn't worth it. So, let's use GPRs instead. This
+ will be done the same way as we do in the POWER7 implementation.
+ Let's see if we are aligned to a quadword boundary. If so, we can
+ jump to the first (non-vectorized) loop. Otherwise, we have to
+ handle the next DWORD first. */
+ mtcrf 0x01,r4
+ mr r9,r4
+ addi r9,r9,8
+ bt 28,L(align64)
+
+ /* Handle the next 8 bytes so we are aligned to a quadword
+ boundary. */
+ ldu r5,8(r4)
+ cmpb r10,r5,r0
+ cmpdi cr7,r10,0
+ addi r9,r9,8
+ bne cr7,L(done)
+
+L(align64):
+ /* Proceed to the old (POWER7) implementation, checking two doublewords
+ per iteraction. For the first 56 bytes, we will just check for null
+	   per iteration.  For the first 56 bytes, we will just check for null
+ so we can jump to the vectorized implementation. We will unroll
+ these loops to avoid excessive branching. */
+ ld r6,8(r4)
+ ldu r5,16(r4)
+ cmpb r10,r6,r0
+ cmpb r11,r5,r0
+ or r5,r10,r11
+ cmpdi cr7,r5,0
+ addi r9,r9,16
+ bne cr7,L(dword_zero)
+
+ ld r6,8(r4)
+ ldu r5,16(r4)
+ cmpb r10,r6,r0
+ cmpb r11,r5,r0
+ or r5,r10,r11
+ cmpdi cr7,r5,0
+ addi r9,r9,16
+ bne cr7,L(dword_zero)
+
+ ld r6,8(r4)
+ ldu r5,16(r4)
+ cmpb r10,r6,r0
+ cmpb r11,r5,r0
+ or r5,r10,r11
+ cmpdi cr7,r5,0
+ addi r9,r9,16
+ bne cr7,L(dword_zero)
+
+ /* Are we 64-byte aligned? If so, jump to the vectorized loop.
+ Note: aligning to 64-byte will necessarily slow down performance for
+ strings around 64 bytes in length due to the extra comparisons
+ required to check alignment for the vectorized loop. This is a
+ necessary tradeoff we are willing to take in order to speed up the
+ calculation for larger strings. */
+ andi. r10,r9,63
+ beq cr0,L(preloop)
+ ld r6,8(r4)
+ ldu r5,16(r4)
+ cmpb r10,r6,r0
+ cmpb r11,r5,r0
+ or r5,r10,r11
+ cmpdi cr7,r5,0
+ addi r9,r9,16
+ bne cr7,L(dword_zero)
+
+ andi. r10,r9,63
+ beq cr0,L(preloop)
+ ld r6,8(r4)
+ ldu r5,16(r4)
+ cmpb r10,r6,r0
+ cmpb r11,r5,r0
+ or r5,r10,r11
+ cmpdi cr7,r5,0
+ addi r9,r9,16
+ bne cr7,L(dword_zero)
+
+ andi. r10,r9,63
+ beq cr0,L(preloop)
+ ld r6,8(r4)
+ ldu r5,16(r4)
+ cmpb r10,r6,r0
+ cmpb r11,r5,r0
+ or r5,r10,r11
+ cmpdi cr7,r5,0
+ addi r9,r9,16
+
+ /* At this point, we are necessarily 64-byte aligned. If no zeroes were
+ found, jump to the vectorized loop. */
+ beq cr7,L(preloop)
+
+L(dword_zero):
+ /* OK, one (or both) of the doublewords contains a null byte. Check
+ the first doubleword and decrement the address in case the first
+ doubleword really contains a null byte. */
+
+ cmpdi cr6,r10,0
+ addi r4,r4,-8
+ bne cr6,L(done)
+
+ /* The null byte must be in the second doubleword. Adjust the address
+ again and move the result of cmpb to r10 so we can calculate the
+ length. */
+
+ mr r10,r11
+ addi r4,r4,8
+
+ /* If the null byte was found in the non-vectorized code, compute the
+ final length. r10 has the output of the cmpb instruction, that is,
+ it contains 0xff in the same position as the null byte in the
+ original doubleword from the string. Use that to calculate the
+ length. */
+L(done):
+#ifdef __LITTLE_ENDIAN__
+ addi r9, r10,-1 /* Form a mask from trailing zeros. */
+ andc r9, r9,r10
+ popcntd r0, r9 /* Count the bits in the mask. */
+#else
+ cntlzd r0,r10 /* Count leading zeros before the match. */
+#endif
+ subf r5,r3,r4
+ srdi r0,r0,3 /* Convert leading/trailing zeros to bytes. */
+ add r3,r5,r0 /* Compute final length. */
+ blr
+
+ /* Vectorized implementation starts here. */
+ .p2align 4
+L(preloop):
+ /* Set up for the loop. */
+ mr r4,r9
+ li r7, 16 /* Load required offsets. */
+ li r8, 32
+ li r9, 48
+ li r12, 8
+ vxor v0,v0,v0 /* VR with null chars to use with
+ vcmpequb. */
+
+ /* Main loop to look for the end of the string. We will read in
+ 64-byte chunks. Align it to 32 bytes and unroll it 3 times to
+ leverage the icache performance. */
+ .p2align 5
+L(loop):
+ lvx v1,r4,r0 /* Load 4 quadwords. */
+ lvx v2,r4,r7
+ lvx v3,r4,r8
+ lvx v4,r4,r9
+ vminub v5,v1,v2 /* Compare and merge into one VR for speed. */
+ vminub v6,v3,v4
+ vminub v7,v5,v6
+ vcmpequb. v7,v7,v0 /* Check for NULLs. */
+ addi r4,r4,64 /* Adjust address for the next iteration. */
+ bne cr6,L(vmx_zero)
+
+ lvx v1,r4,r0 /* Load 4 quadwords. */
+ lvx v2,r4,r7
+ lvx v3,r4,r8
+ lvx v4,r4,r9
+ vminub v5,v1,v2 /* Compare and merge into one VR for speed. */
+ vminub v6,v3,v4
+ vminub v7,v5,v6
+ vcmpequb. v7,v7,v0 /* Check for NULLs. */
+ addi r4,r4,64 /* Adjust address for the next iteration. */
+ bne cr6,L(vmx_zero)
+
+ lvx v1,r4,r0 /* Load 4 quadwords. */
+ lvx v2,r4,r7
+ lvx v3,r4,r8
+ lvx v4,r4,r9
+ vminub v5,v1,v2 /* Compare and merge into one VR for speed. */
+ vminub v6,v3,v4
+ vminub v7,v5,v6
+ vcmpequb. v7,v7,v0 /* Check for NULLs. */
+ addi r4,r4,64 /* Adjust address for the next iteration. */
+ beq cr6,L(loop)
+
+L(vmx_zero):
+ /* OK, we found a null byte. Let's look for it in the current 64-byte
+ block and mark it in its corresponding VR. */
+ vcmpequb v1,v1,v0
+ vcmpequb v2,v2,v0
+ vcmpequb v3,v3,v0
+ vcmpequb v4,v4,v0
+
+ /* We will now 'compress' the result into a single doubleword, so it
+ can be moved to a GPR for the final calculation. First, we
+ generate an appropriate mask for vbpermq, so we can permute bits into
+ the first halfword. */
+ vspltisb v10,3
+ lvsl v11,r0,r0
+ vslb v10,v11,v10
+
+ /* Permute the first bit of each byte into bits 48-63. */
+ VBPERMQ(v1,v1,v10)
+ VBPERMQ(v2,v2,v10)
+ VBPERMQ(v3,v3,v10)
+ VBPERMQ(v4,v4,v10)
+
+ /* Shift each component into its correct position for merging. */
+#ifdef __LITTLE_ENDIAN__
+ vsldoi v2,v2,v2,2
+ vsldoi v3,v3,v3,4
+ vsldoi v4,v4,v4,6
+#else
+ vsldoi v1,v1,v1,6
+ vsldoi v2,v2,v2,4
+ vsldoi v3,v3,v3,2
+#endif
+
+ /* Merge the results and move to a GPR. */
+ vor v1,v2,v1
+ vor v2,v3,v4
+ vor v4,v1,v2
+ MFVRD(r10,v4)
+
+	/* Adjust address to the beginning of the current 64-byte block.  */
+ addi r4,r4,-64
+
+#ifdef __LITTLE_ENDIAN__
+ addi r9, r10,-1 /* Form a mask from trailing zeros. */
+ andc r9, r9,r10
+ popcntd r0, r9 /* Count the bits in the mask. */
+#else
+ cntlzd r0,r10 /* Count leading zeros before the match. */
+#endif
+ subf r5,r3,r4
+ add r3,r5,r0 /* Compute final length. */
+ blr
+
+END (STRLEN)
+libc_hidden_builtin_def (strlen)
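
Both the GPR and the vector paths above end the same way: a 64-bit value that marks the first null byte is turned into an offset by counting zeros from the string's "low" end (the popcntd-of-mask idiom on little-endian, cntlzd on big-endian). The hedged C equivalent below covers the cmpb case, where the mask holds 0xff per matching byte and the bit count is divided by 8; the vector path produces one bit per byte and therefore skips that division. GCC builtins are used for brevity; the function name is invented.

#include <stdint.h>

/* Sketch only: convert a non-zero per-byte match mask into the byte index
   of the first match.  mask must not be zero, as in the assembly, which
   only reaches this computation after a match was detected.  */
static unsigned int
first_match_offset (uint64_t mask, int little_endian)
{
  if (little_endian)
    return (unsigned int) __builtin_ctzll (mask) / 8;  /* Trailing zeros.  */
  return (unsigned int) __builtin_clzll (mask) / 8;    /* Leading zeros.  */
}
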
diff --git a/sysdeps/powerpc/powerpc64/power8/strncase.S b/sysdeps/powerpc/powerpc64/power8/strncase.S
new file mode 100644
index 0000000000..050b63ab91
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strncase.S
@@ -0,0 +1,20 @@
+/* Optimized strncasecmp implementation for POWER8.
+ Copyright (C) 2016-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define USE_AS_STRNCASECMP 1
+#include <sysdeps/powerpc/powerpc64/power8/strcasecmp.S>
diff --git a/sysdeps/powerpc/powerpc64/power8/strncmp.S b/sysdeps/powerpc/powerpc64/power8/strncmp.S
index 1ce9e3fc65..2eefa4a2ba 100644
--- a/sysdeps/powerpc/powerpc64/power8/strncmp.S
+++ b/sysdeps/powerpc/powerpc64/power8/strncmp.S
@@ -1,5 +1,5 @@
/* Optimized strncmp implementation for PowerPC64/POWER8.
- Copyright (C) 2015-2016 Free Software Foundation, Inc.
+ Copyright (C) 2015-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -18,6 +18,10 @@
#include <sysdep.h>
+#ifndef STRNCMP
+# define STRNCMP strncmp
+#endif
+
/* Implements the function
int [r3] strncmp (const char *s1 [r3], const char *s2 [r4], size_t [r5] n)
@@ -28,7 +32,7 @@
4k. */
.machine power7
-EALIGN (strncmp, 4, 0)
+ENTRY_TOCLESS (STRNCMP, 4)
/* Check if size is 0. */
mr. r10,r5
beq cr0,L(ret0)
@@ -319,5 +323,5 @@ L(byte_ne_4):
extsw r10,r9
mr r9,r8
b L(size_reached_1)
-END(strncmp)
+END(STRNCMP)
libc_hidden_builtin_def(strncmp)
diff --git a/sysdeps/powerpc/powerpc64/power8/strncpy.S b/sysdeps/powerpc/powerpc64/power8/strncpy.S
index 17c3afb5fe..e8c5c71f87 100644
--- a/sysdeps/powerpc/powerpc64/power8/strncpy.S
+++ b/sysdeps/powerpc/powerpc64/power8/strncpy.S
@@ -1,5 +1,5 @@
/* Optimized strncpy/stpncpy implementation for PowerPC64/POWER8.
- Copyright (C) 2015-2016 Free Software Foundation, Inc.
+ Copyright (C) 2015-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -19,11 +19,32 @@
#include <sysdep.h>
#ifdef USE_AS_STPNCPY
-# define FUNC_NAME __stpncpy
+# ifndef STPNCPY
+# define FUNC_NAME __stpncpy
+# else
+# define FUNC_NAME STPNCPY
+# endif
#else
-# define FUNC_NAME strncpy
+# ifndef STRNCPY
+# define FUNC_NAME strncpy
+# else
+# define FUNC_NAME STRNCPY
+# endif
+#endif /* !USE_AS_STPNCPY */
+
+#ifndef MEMSET
+/* For builds without IFUNC support, local calls should be made to internal
+ GLIBC symbol (created by libc_hidden_builtin_def). */
+# ifdef SHARED
+# define MEMSET_is_local
+# define MEMSET __GI_memset
+# else
+# define MEMSET memset
+# endif
#endif
+#define FRAMESIZE (FRAME_MIN_SIZE+48)
+
/* Implements the function
char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
@@ -40,7 +61,12 @@
4k. */
.machine power7
-EALIGN (FUNC_NAME, 4, 0)
+#ifdef MEMSET_is_local
+ENTRY_TOCLESS (FUNC_NAME, 4)
+#else
+ENTRY (FUNC_NAME, 4)
+#endif
+ CALL_MCOUNT 3
/* Check if the [src]+15 will cross a 4K page by checking if the bit
indicating the page size changes. Basically:
@@ -54,8 +80,7 @@ EALIGN (FUNC_NAME, 4, 0)
addi r10,r4,16
rlwinm r9,r4,0,19,19
- /* Since it is a leaf function, save some non-volatile registers on the
- protected/red zone. */
+ /* Save some non-volatile registers on the stack. */
std r26,-48(r1)
std r27,-40(r1)
@@ -69,6 +94,14 @@ EALIGN (FUNC_NAME, 4, 0)
std r30,-16(r1)
std r31,-8(r1)
+ /* Update CFI. */
+ cfi_offset(r26, -48)
+ cfi_offset(r27, -40)
+ cfi_offset(r28, -32)
+ cfi_offset(r29, -24)
+ cfi_offset(r30, -16)
+ cfi_offset(r31, -8)
+
beq cr7,L(unaligned_lt_16)
rldicl r9,r4,0,61
subfic r8,r9,8
@@ -180,79 +213,66 @@ L(short_path_loop_end):
ld r31,-8(r1)
blr
- /* This code pads the remainder dest with NULL bytes. The algorithm
- calculate the remanining size and issues a doubleword unrolled
- loops followed by a byte a byte set. */
+ /* This code pads the remainder of dest with NULL bytes. The algorithm
+ calculates the remaining size and calls memset. */
.align 4
L(zero_pad_start):
mr r5,r10
mr r9,r6
L(zero_pad_start_1):
- srdi. r8,r5,r3
- mr r10,r9
-#ifdef USE_AS_STPNCPY
- mr r3,r9
+ /* At this point:
+ - r5 holds the number of bytes that still have to be written to
+ dest.
+ - r9 points to the position, in dest, where the first null byte
+ will be written.
+ The above statements are true both when control reaches this label
+ from a branch or when falling through the previous lines. */
+#ifndef USE_AS_STPNCPY
+ mr r30,r3 /* Save the return value of strncpy. */
+#endif
+ /* Prepare the call to memset. */
+ mr r3,r9 /* Pointer to the area to be zero-filled. */
+ li r4,0 /* Byte to be written (zero). */
+
+	/* We delayed the creation of the stack frame, as well as the saving of
+	   the link register, because only at this point are we sure that
+	   doing so is actually needed.  */
+
+ /* Save the link register. */
+ mflr r0
+ std r0,16(r1)
+
+ /* Create the stack frame. */
+ stdu r1,-FRAMESIZE(r1)
+ cfi_adjust_cfa_offset(FRAMESIZE)
+ cfi_offset(lr, 16)
+
+ bl MEMSET
+#ifndef MEMSET_is_local
+ nop
#endif
- beq- cr0,L(zero_pad_loop_b_start)
- cmpldi cr7,r8,1
- li cr7,0
- std r7,0(r9)
- beq cr7,L(zero_pad_loop_b_prepare)
- addic. r8,r8,-2
- addi r10,r9,r16
- std r7,8(r9)
- beq cr0,L(zero_pad_loop_dw_2)
- std r7,16(r9)
- li r9,0
- b L(zero_pad_loop_dw_1)
-
- .align 4
-L(zero_pad_loop_dw):
- addi r10,r10,16
- std r9,-8(r10)
- beq cr0,L(zero_pad_loop_dw_2)
- std r9,0(r10)
-L(zero_pad_loop_dw_1):
- cmpldi cr7,r8,1
- std r9,0(r10)
- addic. r8,r8,-2
- bne cr7,L(zero_pad_loop_dw)
- addi r10,r10,8
-L(zero_pad_loop_dw_2):
- rldicl r5,r5,0,61
-L(zero_pad_loop_b_start):
- cmpdi cr7,r5,0
- addi r5,r5,-1
- addi r9,r10,-1
- add r10,r10,5
- subf r10,r9,r10
- li r8,0
- beq- cr7,L(short_path_loop_end)
-
- /* Write remaining 1-8 bytes. */
- .align 4
- addi r9,r9,1
- mtocrf 0x1,r10
- bf 29,4f
- stw r8,0(r9)
- addi r9,r9,4
- .align 4
-4: bf 30,2f
- sth r8,0(r9)
- addi r9,r9,2
+ ld r0,FRAMESIZE+16(r1)
- .align 4
-2: bf 31,1f
- stb r8,0(r9)
+#ifndef USE_AS_STPNCPY
+ mr r3,r30 /* Restore the return value of strncpy, i.e.:
+ dest. For stpncpy, the return value is the
+			   same as the return value of memset.  */
+#endif
- /* Restore non-volatile registers. */
-1: ld r26,-48(r1)
- ld r27,-40(r1)
- ld r28,-32(r1)
- ld r29,-24(r1)
- ld r30,-16(r1)
- ld r31,-8(r1)
+ /* Restore non-volatile registers and return. */
+ ld r26,FRAMESIZE-48(r1)
+ ld r27,FRAMESIZE-40(r1)
+ ld r28,FRAMESIZE-32(r1)
+ ld r29,FRAMESIZE-24(r1)
+ ld r30,FRAMESIZE-16(r1)
+ ld r31,FRAMESIZE-8(r1)
+ /* Restore the stack frame. */
+ addi r1,r1,FRAMESIZE
+ cfi_adjust_cfa_offset(-FRAMESIZE)
+ /* Restore the link register. */
+ mtlr r0
+ cfi_restore(lr)
blr
/* The common case where [src]+16 will not cross a 4K page boundary.
@@ -301,7 +321,7 @@ L(pagecross):
#endif
orc r9,r7,r9 /* Mask bits that are not part of the
string. */
- li cr7,0
+ li r7,0
cmpb r9,r9,r7 /* Check for null bytes in DWORD1. */
cmpdi cr7,r9,0
bne cr7,L(short_path_prepare_2)
@@ -312,14 +332,14 @@ L(pagecross):
/* For next checks we have aligned address, so we check for more
three doublewords to make sure we can read 16 unaligned bytes
to start the bulk copy with 16 aligned addresses. */
- ld cr7,8(r11)
+ ld r7,8(r11)
cmpb r9,r7,r9
cmpdi cr7,r9,0
bne cr7,L(short_path_prepare_2)
- addi cr7,r8,-8
+ addi r7,r8,-8
cmpldi cr7,r7,8
ble cr7,L(short_path_prepare_2)
- ld cr7,16(r11)
+ ld r7,16(r11)
cmpb r9,r7,r9
cmpdi cr7,r9,0
bne cr7,L(short_path_prepare_2)
@@ -443,18 +463,12 @@ L(short_path_prepare_2_3):
mr r4,r28
mr r9,r29
b L(short_path_2)
-L(zero_pad_loop_b_prepare):
- addi r10,r9,8
- rldicl r5,r5,0,61
- b L(zero_pad_loop_b_start)
L(zero_pad_start_prepare_1):
mr r5,r6
mr r9,r8
b L(zero_pad_start_1)
END (FUNC_NAME)
-#ifdef USE_AS_STPNCPY
-libc_hidden_def (__stpncpy)
-#else
+#ifndef USE_AS_STPNCPY
libc_hidden_builtin_def (strncpy)
#endif
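
The zero-padding rewrite in this file replaces an inline store loop with a call to memset. Functionally, the padded copy behaves like the short C sketch below (illustrative only; strnlen, memcpy and memset are the standard functions, and the stpncpy variant's return value is noted in the comment):

#include <string.h>

/* Sketch of the observable strncpy behaviour: copy at most n bytes from
   src and zero-fill whatever remains of the n-byte destination, which the
   patch now delegates to memset.  */
static char *
strncpy_sketch (char *dest, const char *src, size_t n)
{
  size_t len = strnlen (src, n);        /* Bytes actually copied.  */
  memcpy (dest, src, len);
  if (len < n)
    memset (dest + len, '\0', n - len); /* Pad the remainder.  */
  return dest;                          /* stpncpy returns dest + len.  */
}
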
diff --git a/sysdeps/powerpc/powerpc64/power8/strnlen.S b/sysdeps/powerpc/powerpc64/power8/strnlen.S
new file mode 100644
index 0000000000..a98dfba4bd
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strnlen.S
@@ -0,0 +1,425 @@
+/* Optimized strnlen implementation for POWER8 using a vmx loop.
+
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* The following heuristic is implemented:
+   1. Case maxlen <= 32: align the pointer to 8 bytes and loop through
+      reading doublewords, using the POWER7 algorithm.
+   2. Case maxlen > 32: check for null bytes in the first 16 bytes using
+      unaligned accesses.  Return the length if found.  Otherwise:
+      2.1 Case maxlen < 64: deduct the bytes previously read, align
+          the pointer to 16 bytes and loop through reading quadwords
+          until null bytes are found or maxlen is reached.
+      2.2 Case maxlen >= 64: deduct the bytes previously read, align
+          the pointer to 64 bytes and set up a counter to loop through
+          reading in strides of 64 bytes.  If the loop finishes with
+          null bytes not found, process the remaining bytes by
+          switching to the loop in 2.1.  */
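
Reduced to scalar C, the dispatch described in the comment above has roughly the shape sketched here. The sketch is not part of the patch: 8-byte words and a SWAR zero-byte test stand in for the 16B/64B vector strides, and the function name is invented.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Non-zero iff some byte of v is zero.  */
static inline int
word_has_zero (uint64_t v)
{
  return ((v - 0x0101010101010101ULL) & ~v & 0x8080808080808080ULL) != 0;
}

/* Sketch of the heuristic: small maxlen byte by byte, otherwise scan
   aligned doublewords and finish the tail byte by byte.  Never reads
   more than maxlen bytes.  */
static size_t
strnlen_sketch (const char *s, size_t maxlen)
{
  const char *p = s;
  if (maxlen > 32)
    {
      while (((uintptr_t) p & 7) != 0 && maxlen != 0 && *p != '\0')
        p++, maxlen--;                 /* Align to 8 bytes.  */
      while (maxlen >= 8)
        {
          uint64_t w;
          memcpy (&w, p, sizeof w);    /* One aligned doubleword.  */
          if (word_has_zero (w))
            break;                     /* Terminator is in this word.  */
          p += 8;
          maxlen -= 8;
        }
    }
  while (maxlen != 0 && *p != '\0')    /* Byte-by-byte tail.  */
    p++, maxlen--;
  return (size_t) (p - s);
}
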
+
+#include <sysdep.h>
+
+/* Define default page size to 4KB. */
+#define PAGE_SIZE 4096
+
+/* The following macros implement Power ISA v2.07 opcodes
+   that could not be used directly in this code, in order to keep
+   compatibility with older binutils versions.  */
+
+/* Move from vector register doubleword. */
+#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
+
+/* Move to vector register doubleword. */
+#define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16)))
+
+/* Vector Bit Permute Quadword. */
+#define VBPERMQ(t,a,b) .long (0x1000054c \
+ | ((t)<<(32-11)) \
+ | ((a)<<(32-16)) \
+ | ((b)<<(32-21)) )
+
+/* Vector Population Count Halfword. */
+#define VPOPCNTH(t,b) .long (0x10000743 | ((t)<<(32-11)) | ((b)<<(32-21)))
+
+/* Vector Count Leading Zeros Halfword. */
+#define VCLZH(t,b) .long (0x10000742 | ((t)<<(32-11)) | ((b)<<(32-21)))
+
+
+/* int [r3] strnlen (char *s [r3], size_t maxlen [r4]) */
+/* TODO: change to power8 when minimum required binutils allows it. */
+ .machine power7
+ENTRY_TOCLESS (__strnlen)
+ CALL_MCOUNT 2
+ dcbt 0,r3
+
+ cmpldi r4,32 /* Check if maxlen <= 32. */
+ ble L(small_range) /* If maxlen <= 32. */
+
+	/* The upcoming 16-byte unaligned accesses must not cross a page
+	   boundary; otherwise the processor raises a memory access error.
+	   Use the following check to verify there is room for such accesses:
+	   (((size_t) s) % PAGE_SIZE) > (PAGE_SIZE - 16)
+	   If there is not, switch to the code that handles the string
+	   when maxlen <= 32.  */
+ clrldi r10,r3,52
+ cmpldi cr7,r10,PAGE_SIZE-16
+ bgt cr7,L(small_range) /* If less than 16B of page end. */
+
+ /* Compute our permute constant r8. */
+ li r7,0
+ /* Compute a bpermd constant to move bit 0 of each word into
+ a halfword value, and count trailing zeros. */
+#ifdef __LITTLE_ENDIAN__
+ li r8,0x2820
+ oris r8,r8,0x3830
+ sldi r8,r8,32
+ ori r8,r8,0x0800
+ oris r8,r8,0x1810
+#else
+ li r8,0x1018
+ oris r8,r8,0x0008
+ sldi r8,r8,32
+ ori r8,r8,0x3038
+ oris r8,r8,0x2028
+#endif
+
+ /* maxlen > 32. Optimistically check for null bytes in the first
+ 16 bytes of the string using unaligned accesses. */
+ ld r5,0(r3)
+ ld r6,8(r3)
+ cmpb r10,r7,r5 /* Check for null bytes in DWORD1. */
+ cmpb r11,r7,r6 /* Check for null bytes in DWORD2. */
+ or. r7,r10,r11
+ bne cr0, L(early_find) /* If found null bytes. */
+
+ /* At this point maxlen > 32 and null bytes were not found at first
+ 16 bytes. Prepare for loop using VMX. */
+
+ /* r3 == s, r4 == maxlen. All other volatile regs are unused now. */
+
+ addi r5,r3,16 /* Align up, or just add the 16B we
+ already checked. */
+ li r0,15
+ and r7,r5,r0 /* Find offset into 16B alignment. */
+ andc r5,r5,r0 /* Quadword align up s to the next quadword. */
+ li r0,16
+ subf r0,r7,r0
+ subf r4,r0,r4 /* Deduct unaligned bytes from maxlen. */
+
+
+ /* Compute offsets for vmx loads, and precompute the vbpermq
+ constants for both the 64B and 16B loops. */
+ li r6,0
+ vspltisb v0,0
+ vspltisb v10,3
+ lvsl v11,r6,r6
+ vslb v10,v11,v10
+
+ cmpldi r4,64 /* Check maxlen < 64. */
+ blt L(smaller) /* If maxlen < 64 */
+
+	/* In order to begin the 64B loop, the pointer needs to be 64-byte
+	   aligned.  So read quadwords until it is aligned or null bytes are
+	   found.  In the worst case it will be aligned after the fourth
+	   iteration, so unroll the loop to avoid checking a counter.  */
+ andi. r7,r5,63 /* Check if is 64 bytes aligned. */
+ beq cr0,L(preloop_64B) /* If it is already 64B aligned. */
+ lvx v1,r5,r6
+ vcmpequb. v1,v1,v0
+ addi r5,r5,16
+	addi	r4,r4,-16	/* Decrement maxlen by 16 bytes.  */
+ bne cr6,L(found_aligning64B) /* If found null bytes. */
+
+ /* Unroll 2x above code block until aligned or find null bytes. */
+ andi. r7,r5,63
+ beq cr0,L(preloop_64B)
+ lvx v1,r5,r6
+ vcmpequb. v1,v1,v0
+ addi r5,r5,16
+ addi r4,r4,-16
+ bne cr6,L(found_aligning64B)
+
+ andi. r7,r5,63
+ beq cr0,L(preloop_64B)
+ lvx v1,r5,r6
+ vcmpequb. v1,v1,v0
+ addi r5,r5,16
+ addi r4,r4,-16
+ bne cr6,L(found_aligning64B)
+
+ /* At this point it should be 16 bytes aligned.
+ Prepare for the 64B loop. */
+ .p2align 4
+L(preloop_64B):
+	/* Check if maxlen became less than 64, disallowing the 64B loop.
+	   If it did, switch to the 16B loop code.  */
+ cmpldi r4,64 /* Check if maxlen < 64. */
+ blt L(smaller) /* If maxlen < 64. */
+ /* Set some constant values. */
+ li r7,16
+ li r10,32
+ li r9,48
+
+ /* Compute the number of 64 bytes iterations needed. */
+ srdi r11,r4,6 /* Compute loop count (maxlen / 64). */
+ andi. r4,r4,63 /* Set maxlen the remainder (maxlen % 64). */
+ mtctr r11 /* Move loop count to counter register. */
+
+ /* Handle maxlen > 64. Loop over the bytes in strides of 64B. */
+ .p2align 4
+L(loop_64B):
+ lvx v1,r5,r6 /* r5 is the pointer to s. */
+ lvx v2,r5,r7
+ lvx v3,r5,r10
+ lvx v4,r5,r9
+	/* Take the byte-wise minimum of the four 16B vectors.  Any null
+	   byte propagates into v7; then check v7 for null bytes.  */
+ vminub v5,v1,v2
+ vminub v6,v3,v4
+ vminub v7,v5,v6
+ vcmpequb. v7,v7,v0 /* Check for null bytes. */
+	addi	r5,r5,64	/* Advance pointer to the next iteration.  */
+ bne cr6,L(found_64B) /* If found null bytes. */
+ bdnz L(loop_64B) /* Continue the loop if count > 0. */
+
+/* Hit the loop end without a null match, so fall through to handle the
+   remainder.  */
+
+	/* Prepare a 16B loop to handle two cases:
+	   1. 32 < maxlen < 64.
+	   2. maxlen >= 64 and the 64B loop ended without finding a null
+	      byte; handle the remaining bytes here.  */
+ .p2align 4
+L(smaller):
+ cmpldi r4,0 /* Check maxlen is zero. */
+ beq L(done) /* If maxlen is zero. */
+
+ /* Place rounded up number of qw's to check into a vmx
+ register, and use some vector tricks to minimize
+ branching. */
+ MTVRD(v7,r4) /* Copy maxlen from GPR to vector register. */
+ vspltisb v5,1
+ vspltisb v6,15
+ vspltb v2,v7,7
+ vaddubs v3,v5,v6
+
+#ifdef __LITTLE_ENDIAN__
+	vspltish	v5,1	/* Splat 1 into each halfword; used below to
+				   form the trailing-zero mask.  */
+#endif
+
+	/* Loop in 16B aligned increments now.  */
+ .p2align 4
+L(loop_16B):
+ lvx v1,r5,r6 /* Load quadword into vector register. */
+ addi r5,r5,16 /* Increment address to next 16B block. */
+ vor v7,v2,v2 /* Save loop count (v2) into v7. */
+ vsububs v2,v2,v3 /* Subtract 16B from count, saturate at 0. */
+ vminub v4,v1,v2
+ vcmpequb. v4,v4,v0 /* Checking for null bytes. */
+ beq cr6,L(loop_16B) /* If null bytes not found. */
+
+ vcmpequb v1,v1,v0
+ VBPERMQ(v1,v1,v10)
+#ifdef __LITTLE_ENDIAN__
+ vsubuhm v2,v1,v5 /* Form a mask of trailing zeros. */
+ vandc v2,v2,v1
+ VPOPCNTH(v1,v2) /* Count of trailing zeros, 16 if none. */
+#else
+ VCLZH(v1,v1) /* Count the leading zeros, 16 if none. */
+#endif
+ /* Truncate to maximum allowable offset. */
+ vcmpgtub v2,v1,v7 /* Compare and truncate for matches beyond
+ maxlen. */
+ vsel v1,v1,v7,v2 /* 0-16 is now in byte 7. */
+
+ MFVRD(r0,v1)
+ addi r5,r5,-16 /* Undo speculative bump. */
+	extsb	r0,r0		/* Clear the high 56 bits; the value is 0-16.  */
+ add r5,r5,r0 /* Add the offset of whatever was found. */
+L(done):
+ subf r3,r3,r5 /* Length is equal to the offset of null byte
+ matched minus the pointer to s. */
+ blr /* Done. */
+
+	/* Handle the case of maxlen > 64 where null bytes were found in the
+	   last 64B block read.  */
+ .p2align 4
+L(found_64B):
+ /* A zero was found. Reduce the result. */
+ vcmpequb v1,v1,v0
+ vcmpequb v2,v2,v0
+ vcmpequb v3,v3,v0
+ vcmpequb v4,v4,v0
+
+ /* Permute the first bit of each byte into bits 48-63. */
+ VBPERMQ(v1,v1,v10)
+ VBPERMQ(v2,v2,v10)
+ VBPERMQ(v3,v3,v10)
+ VBPERMQ(v4,v4,v10)
+
+ /* Shift each component into its correct position for merging. */
+#ifdef __LITTLE_ENDIAN__
+ vsldoi v2,v2,v2,2
+ vsldoi v3,v3,v3,4
+ vsldoi v4,v4,v4,6
+#else
+ vsldoi v1,v1,v1,6
+ vsldoi v2,v2,v2,4
+ vsldoi v3,v3,v3,2
+#endif
+
+ /* Merge the results and move to a GPR. */
+ vor v1,v2,v1
+ vor v2,v3,v4
+ vor v4,v1,v2
+
+ /* Adjust address to the start of the current 64B block. */
+ addi r5,r5,-64
+
+ MFVRD(r10,v4)
+#ifdef __LITTLE_ENDIAN__
+ addi r9,r10,-1 /* Form a mask from trailing zeros. */
+ andc r9,r9,r10
+ popcntd r0,r9 /* Count the bits in the mask. */
+#else
+ cntlzd r0,r10 /* Count leading zeros before the match. */
+#endif
+ subf r5,r3,r5
+ add r3,r5,r0 /* Compute final length. */
+ blr /* Done. */
+
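
The little-endian paths here (and in several routines below) obtain a trailing-zero
count as popcntd of (mask - 1) & ~mask, since a count-trailing-zeros GPR instruction
only appears in ISA 3.0.  A small C model of the idiom, for reference only and not
part of the patch:

#include <stdint.h>

/* Trailing-zero count of a 64-bit mask via population count.  */
static inline unsigned int
count_trailing_zeros (uint64_t mask)
{
  /* (mask - 1) sets all bits below the lowest set bit; & ~mask then keeps
     only those trailing-zero positions.  For mask == 0 this degenerates
     to 64.  */
  return __builtin_popcountll ((mask - 1) & ~mask);
}

For a nonzero per-byte match mask produced by vbpermq, the result is the bit index,
and hence the byte index, of the first match.
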
+	/* Handle the case where null bytes were found while aligning in
+	   preparation for the 64B loop.  */
+ .p2align 4
+L(found_aligning64B):
+ VBPERMQ(v1,v1,v10)
+#ifdef __LITTLE_ENDIAN__
+ MFVRD(r10,v1)
+ addi r9,r10,-1 /* Form a mask from trailing zeros. */
+ andc r9,r9,r10
+ popcntd r0,r9 /* Count the bits in the mask. */
+#else
+ vsldoi v1,v1,v1,6
+ MFVRD(r10,v1)
+ cntlzd r0,r10 /* Count leading zeros before the match. */
+#endif
+ addi r5,r5,-16 /* Adjust address to offset of last 16 bytes
+ read. */
+	/* The length is the offset of the last 16B read minus the pointer
+	   to s, plus the number of bytes before the match.  */
+ subf r5,r3,r5
+ add r3,r5,r0
+ blr /* Done. */
+
+	/* Handle the case of maxlen > 32 where a null byte was found within
+	   the first 16 bytes of s.  */
+ .p2align 4
+L(early_find):
+ bpermd r5,r8,r10 /* r8 contains the bit permute constants. */
+ bpermd r6,r8,r11
+ sldi r5,r5,8
+	or	r5,r5,r6	/* r5 now holds a 16-bit mask, one bit per
+				   byte of the first 16B, marking where a
+				   null byte was found.  */
+ cntlzd r5,r5 /* Count leading zeros. */
+ addi r3,r5,-48 /* Deduct the 48 leading zeros always
+ present. */
+ blr /* Done. */
+
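
Conceptually, L(early_find) reduces the first 16 bytes to a 16-bit mask, one bit per
byte with byte 0 in the most significant position, and counts the zero bits in front
of the first set bit.  A C sketch of that computation (illustrative only; the
assembly builds the mask with cmpb and bpermd):

#include <stdint.h>
#include <stddef.h>

/* Model of L(early_find): the caller guarantees that at least one of the
   first 16 bytes is zero, so the mask below is never empty.  */
static inline size_t
early_find_len (const unsigned char *s)
{
  uint64_t mask = 0;
  for (int i = 0; i < 16; i++)
    mask |= (uint64_t) (s[i] == 0) << (15 - i);   /* byte 0 -> bit 15 */
  /* cntlzd on the 64-bit value always sees 48 leading zero bits from the
     unused upper part, hence the "- 48" (addi r3,r5,-48 above).  */
  return __builtin_clzll (mask) - 48;
}
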
+ /* Handle case of maxlen <= 32. Use the POWER7 algorithm. */
+ .p2align 4
+L(small_range):
+ clrrdi r8,r3,3 /* Align the pointer to 8B. */
+ li r0,0
+	/* Register contents at this point:
+	   r3 == pointer to s, r4 == maxlen, r8 == pointer to s aligned to 8B,
+	   r7 == last acceptable address (computed below).  */
+ cmpldi r4,0 /* Check if maxlen is zero. */
+ beq L(end_max) /* If maxlen is zero. */
+
+	/* Calculate the last acceptable address and check for possible
+	   addition overflow by using saturated math:
+	   r7 = r3 + r4
+	   r7 |= -(r7 < r3)  */
+ add r7,r3,r4
+ subfc r6,r3,r7
+ subfe r9,r9,r9
+ extsw r6,r9
+ or r7,r7,r6
+ addi r7,r7,-1
+
+ clrrdi r7,r7,3 /* Align to 8B address of last
+ acceptable address. */
+
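
The sequence above computes the last acceptable address with a saturating add; a C
sketch of the same computation (a model, not the patch's code):

#include <stdint.h>
#include <stddef.h>

static inline uintptr_t
last_acceptable_address (uintptr_t s, size_t maxlen)
{
  uintptr_t end = s + maxlen;       /* add    r7,r3,r4 */
  if (end < s)                      /* subfc/subfe detect the wrap-around */
    end = (uintptr_t) -1;           /* or     r7,r7,r6 saturates to -1 */
  end -= 1;                         /* addi   r7,r7,-1 */
  return end & ~(uintptr_t) 7;      /* clrrdi r7,r7,3: align down to 8B */
}
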
+ rlwinm r6,r3,3,26,28 /* Calculate padding. */
+ ld r12,0(r8) /* Load aligned doubleword. */
+ cmpb r10,r12,r0 /* Check for null bytes. */
+#ifdef __LITTLE_ENDIAN__
+ srd r10,r10,r6
+ sld r10,r10,r6
+#else
+ sld r10,r10,r6
+ srd r10,r10,r6
+#endif /* __LITTLE_ENDIAN__ */
+ cmpldi cr7,r10,0
+ bne cr7,L(done_small) /* If found null byte. */
+
+ cmpld r8,r7 /* Check if reached maxlen. */
+ beq L(end_max) /* If reached maxlen. */
+
+	/* Still handling the case of maxlen <= 32.  Read aligned doublewords
+	   until a null byte is found or maxlen is reached.  */
+ .p2align 4
+L(loop_small):
+ ldu r12,8(r8) /* Load next doubleword and update r8. */
+ cmpb r10,r12,r0 /* Check for null bytes. */
+ cmpldi cr6,r10,0
+ bne cr6,L(done_small) /* If found null bytes. */
+ cmpld r8,r7 /* Check if reached maxlen. */
+ bne L(loop_small) /* If it has more bytes to read. */
+ mr r3,r4 /* Reached maxlen with null bytes not found.
+ Length is equal to maxlen. */
+ blr /* Done. */
+
+ /* Still handling case of maxlen <= 32. Found null bytes.
+ Registers: r10 == match bits within doubleword, r8 == address of
+ last doubleword read, r3 == pointer to s, r4 == maxlen. */
+ .p2align 4
+L(done_small):
+#ifdef __LITTLE_ENDIAN__
+ /* Count trailing zeros. */
+ addi r0,r10,-1
+ andc r0,r0,r10
+ popcntd r0,r0
+#else
+ cntlzd r0,r10 /* Count leading zeros before the match. */
+#endif
+ sub r3,r8,r3 /* Calculate total of bytes before the match. */
+ srdi r0,r0,3 /* Convert leading/trailing zeros to bytes. */
+ add r3,r3,r0 /* Length until the match. */
+	cmpld	r3,r4		/* Check if the length exceeds maxlen.  */
+ blelr
+ mr r3,r4 /* If length is greater than maxlen, return
+ maxlen. */
+ blr
+
+	/* Handle the case where maxlen was reached without finding a null
+	   byte.  */
+ .p2align 4
+L(end_max):
+ mr r3,r4 /* Length is equal to maxlen. */
+ blr /* Done. */
+
+
+END (__strnlen)
+libc_hidden_def (__strnlen)
+weak_alias (__strnlen, strnlen)
+libc_hidden_def (strnlen)
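
For orientation, a portable C counterpart of the doubleword scan used by the
small-range path above (illustrative only; the hardware version detects zero bytes
with cmpb, the model below uses the classic bit trick):

#include <stdint.h>
#include <stddef.h>
#include <string.h>

static size_t
strnlen_words (const char *s, size_t maxlen)
{
  size_t i = 0;
  /* Head: byte-wise until 8B aligned (the assembly instead masks off the
     bytes before s in the first aligned doubleword).  */
  while (i < maxlen && ((uintptr_t) (s + i) & 7) != 0)
    {
      if (s[i] == '\0')
        return i;
      i++;
    }
  /* Body: 8 bytes at a time; stop at the first doubleword containing a
     zero byte.  */
  while (i + 8 <= maxlen)
    {
      uint64_t w;
      memcpy (&w, s + i, 8);
      if (((w - 0x0101010101010101ULL) & ~w & 0x8080808080808080ULL) != 0)
        break;
      i += 8;
    }
  /* Tail: locate the zero byte (or hit maxlen).  */
  while (i < maxlen && s[i] != '\0')
    i++;
  return i;
}
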
diff --git a/sysdeps/powerpc/powerpc64/power8/strrchr.S b/sysdeps/powerpc/powerpc64/power8/strrchr.S
new file mode 100644
index 0000000000..6ff8a528b6
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strrchr.S
@@ -0,0 +1,468 @@
+/* Optimized strrchr implementation for PowerPC64/POWER8 using cmpb insn.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* char *[r3] strrchr (char *s [r3], int c [r4]) */
+/* TODO: change these to the actual instructions when the minimum required
+ binutils allows it. */
+#define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16)))
+#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
+#define VBPERMQ(t,a,b) .long (0x1000054c \
+ | ((t)<<(32-11)) \
+ | ((a)<<(32-16)) \
+ | ((b)<<(32-21)) )
+#define VCLZD(r,v) .long (0x100007c2 | ((r)<<(32-11)) | ((v)<<(32-21)))
+#define VPOPCNTD(r,v) .long (0x100007c3 | ((r)<<(32-11)) | ((v)<<(32-21)))
+#define VADDUQM(t,a,b) .long (0x10000100 \
+ | ((t)<<(32-11)) \
+ | ((a)<<(32-16)) \
+ | ((b)<<(32-21)) )
+#ifdef __LITTLE_ENDIAN__
+/* Find the match position from v6 and place result in r6. */
+# define CALCULATE_MATCH() \
+ VBPERMQ(v6, v6, v10); \
+ vsldoi v6, v6, v6, 6; \
+ MFVRD(r7, v6); \
+ cntlzd r6, r7; \
+ subfic r6, r6, 15;
+/*
+ * Find the first null position to mask bytes after null.
+ * (reg): vcmpequb result: v2 for 1st qw v3 for 2nd qw.
+ * Result placed at v2.
+ */
+# define FIND_NULL_POS(reg) \
+ vspltisb v11, -1; \
+ VADDUQM(v11, reg, v11); \
+ vandc v11, v11, reg; \
+ VPOPCNTD(v2, v11); \
+ vspltb v11, v2, 15; \
+ vcmpequb. v11, v11, v9; \
+ blt cr6, 1f; \
+ vsldoi v9, v0, v9, 1; \
+ vslo v2, v2, v9; \
+1: \
+ vsumsws v2, v2, v0;
+#else
+# define CALCULATE_MATCH() \
+ VBPERMQ(v6, v6, v10); \
+ MFVRD(r7, v6); \
+ addi r6, r7, -1; \
+ andc r6, r6, r7; \
+ popcntd r6, r6; \
+ subfic r6, r6, 15;
+# define FIND_NULL_POS(reg) \
+ VCLZD(v2, reg); \
+ vspltb v11, v2, 7; \
+ vcmpequb. v11, v11, v9; \
+ blt cr6, 1f; \
+ vsldoi v9, v0, v9, 1; \
+ vsro v2, v2, v9; \
+1: \
+ vsumsws v2, v2, v0;
+#endif /* !__LITTLE_ENDIAN__ */
+
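
What CALCULATE_MATCH boils down to, in C (schematic; the real macro derives the
16-bit mask with vbpermq, and the exact bit/byte correspondence depends on
endianness): strrchr wants the last match within the quadword, i.e. the highest set
bit of a per-byte match mask.

#include <stdint.h>

/* mask16 is nonzero, with bit i set when byte i of the 16B block matched c;
   returns the index of the last matching byte.  The assembly arrives at the
   same value via cntlzd (or the popcount idiom) plus subfic r6,r6,15.  */
static inline unsigned int
last_match_index (uint32_t mask16)
{
  return 31 - __builtin_clz (mask16);
}
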
+#ifndef STRRCHR
+# define STRRCHR strrchr
+#endif
+ .machine power7
+ENTRY_TOCLESS (STRRCHR)
+ CALL_MCOUNT 2
+ dcbt 0,r3
+ clrrdi r8,r3,3 /* Align the address to doubleword boundary. */
+ cmpdi cr7,r4,0
+ ld r12,0(r8) /* Load doubleword from memory. */
+	li	r9,0	      /* Used to store the last occurrence.  */
+ li r0,0 /* Doubleword with null chars to use
+ with cmpb. */
+
+ rlwinm r6,r3,3,26,28 /* Calculate padding. */
+
+ beq cr7,L(null_match)
+
+ /* Replicate byte to doubleword. */
+ insrdi r4,r4,8,48
+ insrdi r4,r4,16,32
+ insrdi r4,r4,32,0
+
+	/* r4 has changed now.  If the caller passed a value whose low byte
+	   is zero, check for null again.  */
+ cmpdi cr7,r4,0
+ beq cr7,L(null_match)
+ /* Now r4 has a doubleword of c bytes and r0 has
+ a doubleword of null bytes. */
+
+ cmpb r10,r12,r4 /* Compare each byte against c byte. */
+ cmpb r11,r12,r0 /* Compare each byte against null byte. */
+
+ /* Move the doublewords left and right to discard the bits that are
+ not part of the string and bring them back as zeros. */
+#ifdef __LITTLE_ENDIAN__
+ srd r10,r10,r6
+ srd r11,r11,r6
+ sld r10,r10,r6
+ sld r11,r11,r6
+#else
+ sld r10,r10,r6
+ sld r11,r11,r6
+ srd r10,r10,r6
+ srd r11,r11,r6
+#endif
+ or r5,r10,r11 /* OR the results to speed things up. */
+ cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes
+ have been found. */
+ bne cr7,L(done)
+
+L(align):
+ andi. r12, r8, 15
+
+ /* Are we now aligned to a doubleword boundary? If so, skip to
+ the main loop. Otherwise, go through the alignment code. */
+
+ bne cr0, L(loop)
+
+ /* Handle WORD2 of pair. */
+ ldu r12,8(r8)
+ cmpb r10,r12,r4
+ cmpb r11,r12,r0
+ or r5,r10,r11
+ cmpdi cr7,r5,0
+ bne cr7,L(done)
+ b L(loop) /* We branch here (rather than falling through)
+ to skip the nops due to heavy alignment
+ of the loop below. */
+ .p2align 5
+L(loop):
+ /* Load two doublewords, compare and merge in a
+ single register for speed. This is an attempt
+ to speed up the null-checking process for bigger strings. */
+ ld r12,8(r8)
+ ldu r7,16(r8)
+ cmpb r10,r12,r4
+ cmpb r11,r12,r0
+ cmpb r6,r7,r4
+ cmpb r7,r7,r0
+ or r12,r10,r11
+ or r5,r6,r7
+ or r5,r12,r5
+ cmpdi cr7,r5,0
+ beq cr7,L(vector)
+
+ /* OK, one (or both) of the doublewords contains a c/null byte. Check
+ the first doubleword and decrement the address in case the first
+ doubleword really contains a c/null byte. */
+ cmpdi cr6,r12,0
+ addi r8,r8,-8
+ bne cr6,L(done)
+
+ /* The c/null byte must be in the second doubleword. Adjust the
+ address again and move the result of cmpb to r10 so we can calculate
+ the pointer. */
+
+ mr r10,r6
+ mr r11,r7
+ addi r8,r8,8
+
+ /* r10/r11 have the output of the cmpb instructions, that is,
+ 0xff in the same position as the c/null byte in the original
+ doubleword from the string. Use that to calculate the pointer. */
+
+L(done):
+	/* If there is a 0xff byte in r11, find the first one and zero r10
+	   past that position, so c matches after the null byte are
+	   ignored.  */
+ cmpdi cr7,r11,0
+ beq cr7,L(no_null)
+#ifdef __LITTLE_ENDIAN__
+ addi r3,r11,-1
+ andc r3,r3,r11
+ popcntd r0,r3
+#else
+ cntlzd r0,r11
+#endif
+ subfic r0,r0,63
+ li r6,-1
+#ifdef __LITTLE_ENDIAN__
+ srd r0,r6,r0
+#else
+ sld r0,r6,r0
+#endif
+ and r10,r0,r10
+L(no_null):
+#ifdef __LITTLE_ENDIAN__
+ cntlzd r0,r10 /* Count leading zeros before c matches. */
+ addi r3,r10,-1
+ andc r3,r3,r10
+ addi r10,r11,-1
+ andc r10,r10,r11
+ cmpld cr7,r3,r10
+ bgt cr7,L(no_match)
+#else
+ addi r3,r10,-1 /* Count trailing zeros before c matches. */
+ andc r3,r3,r10
+ popcntd r0,r3
+ cmpld cr7,r11,r10
+ bgt cr7,L(no_match)
+#endif
+ srdi r0,r0,3 /* Convert trailing zeros to bytes. */
+ subfic r0,r0,7
+ add r9,r8,r0 /* Return address of the matching c byte
+ or null in case c was not found. */
+ li r0,0
+ cmpdi cr7,r11,0 /* If r11 == 0, no null's have been found. */
+ beq cr7,L(align)
+
+ .align 4
+L(no_match):
+ mr r3,r9
+ blr
+
+/* Check the first 32B in GPRs and move to the vectorized loop.  */
+ .p2align 5
+L(vector):
+ addi r3, r8, 8
+ /* Make sure 32B aligned. */
+ andi. r10, r3, 31
+ bne cr0, L(loop)
+ vspltisb v0, 0
+ /* Precompute vbpermq constant. */
+ vspltisb v10, 3
+ lvsl v11, r0, r0
+ vslb v10, v11, v10
+ MTVRD(v1, r4)
+ li r5, 16
+ vspltb v1, v1, 7
+ /* Compare 32 bytes in each loop. */
+L(continue):
+ lvx v4, 0, r3
+ lvx v5, r3, r5
+ vcmpequb v2, v0, v4
+ vcmpequb v3, v0, v5
+ vcmpequb v6, v1, v4
+ vcmpequb v7, v1, v5
+ vor v8, v2, v3
+ vor v9, v6, v7
+ vor v11, v8, v9
+ vcmpequb. v11, v0, v11
+ addi r3, r3, 32
+ blt cr6, L(continue)
+ vcmpequb. v8, v0, v8
+ blt cr6, L(match)
+
+ /* One (or both) of the quadwords contains c/null. */
+ vspltisb v8, 2
+ vspltisb v9, 5
+ /* Precompute values used for comparison. */
+ vsl v9, v8, v9 /* v9 = 0x4040404040404040. */
+ vaddubm v8, v9, v9
+ vsldoi v8, v0, v8, 1 /* v8 = 0x80. */
+
+ /* Check if null is in second qw. */
+ vcmpequb. v11, v0, v2
+ blt cr6, L(secondqw)
+
+ /* Null found in first qw. */
+ addi r8, r3, -32
+ /* Calculate the null position. */
+ FIND_NULL_POS(v2)
+ /* Check if null is in the first byte. */
+ vcmpequb. v11, v0, v2
+ blt cr6, L(no_match)
+ vsububm v2, v8, v2
+ /* Mask unwanted bytes after null. */
+#ifdef __LITTLE_ENDIAN__
+ vslo v6, v6, v2
+ vsro v6, v6, v2
+#else
+ vsro v6, v6, v2
+ vslo v6, v6, v2
+#endif
+ vcmpequb. v11, v0, v6
+ blt cr6, L(no_match)
+ /* Found a match before null. */
+ CALCULATE_MATCH()
+ add r3, r8, r6
+ blr
+
+L(secondqw):
+ addi r8, r3, -16
+ FIND_NULL_POS(v3)
+ vcmpequb. v11, v0, v2
+ blt cr6, L(no_match1)
+ vsububm v2, v8, v2
+ /* Mask unwanted bytes after null. */
+#ifdef __LITTLE_ENDIAN__
+ vslo v7, v7, v2
+ vsro v7, v7, v2
+#else
+ vsro v7, v7, v2
+ vslo v7, v7, v2
+#endif
+ vcmpequb. v11, v0, v7
+ blt cr6, L(no_match1)
+ addi r8, r8, 16
+ vor v6, v0, v7
+L(no_match1):
+ addi r8, r8, -16
+ vcmpequb. v11, v0, v6
+ blt cr6, L(no_match)
+ /* Found a match before null. */
+ CALCULATE_MATCH()
+ add r3, r8, r6
+ blr
+
+L(match):
+ /* One (or both) of the quadwords contains a match. */
+ mr r8, r3
+ vcmpequb. v8, v0, v7
+ blt cr6, L(firstqw)
+ /* Match found in second qw. */
+ addi r8, r8, 16
+ vor v6, v0, v7
+L(firstqw):
+ addi r8, r8, -32
+ CALCULATE_MATCH()
+	add	r9, r8, r6	/* Address of the last c match found so far.  */
+ b L(continue)
+/* We are here because strrchr was called with a null byte. */
+ .align 4
+L(null_match):
+ /* r0 has a doubleword of null bytes. */
+
+ cmpb r5,r12,r0 /* Compare each byte against null bytes. */
+
+ /* Move the doublewords left and right to discard the bits that are
+ not part of the string and bring them back as zeros. */
+#ifdef __LITTLE_ENDIAN__
+ srd r5,r5,r6
+ sld r5,r5,r6
+#else
+ sld r5,r5,r6
+ srd r5,r5,r6
+#endif
+ cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes
+ have been found. */
+ bne cr7,L(done_null)
+
+ andi. r12, r8, 15
+
+ /* Are we now aligned to a quadword boundary? If so, skip to
+ the main loop. Otherwise, go through the alignment code. */
+
+ bne cr0, L(loop_null)
+
+ /* Handle WORD2 of pair. */
+ ldu r12,8(r8)
+ cmpb r5,r12,r0
+ cmpdi cr7,r5,0
+ bne cr7,L(done_null)
+ b L(loop_null) /* We branch here (rather than falling through)
+ to skip the nops due to heavy alignment
+ of the loop below. */
+
+ /* Main loop to look for the end of the string. Since it's a
+ small loop (< 8 instructions), align it to 32-bytes. */
+ .p2align 5
+L(loop_null):
+ /* Load two doublewords, compare and merge in a
+ single register for speed. This is an attempt
+ to speed up the null-checking process for bigger strings. */
+ ld r12,8(r8)
+ ldu r11,16(r8)
+ cmpb r5,r12,r0
+ cmpb r10,r11,r0
+ or r6,r5,r10
+ cmpdi cr7,r6,0
+ beq cr7,L(vector1)
+
+ /* OK, one (or both) of the doublewords contains a null byte. Check
+ the first doubleword and decrement the address in case the first
+ doubleword really contains a null byte. */
+
+ cmpdi cr6,r5,0
+ addi r8,r8,-8
+ bne cr6,L(done_null)
+
+ /* The null byte must be in the second doubleword. Adjust the address
+ again and move the result of cmpb to r10 so we can calculate the
+ pointer. */
+
+ mr r5,r10
+ addi r8,r8,8
+
+ /* r5 has the output of the cmpb instruction, that is, it contains
+ 0xff in the same position as the null byte in the original
+ doubleword from the string. Use that to calculate the pointer. */
+L(done_null):
+#ifdef __LITTLE_ENDIAN__
+ addi r0,r5,-1
+ andc r0,r0,r5
+ popcntd r0,r0
+#else
+ cntlzd r0,r5 /* Count leading zeros before the match. */
+#endif
+	srdi	r0,r0,3	      /* Convert leading/trailing zeros to bytes.  */
+ add r3,r8,r0 /* Return address of the matching null byte. */
+ blr
+/* Check the first 32B in GPRs and move to the vectorized loop.  */
+ .p2align 5
+L(vector1):
+ addi r3, r8, 8
+ /* Make sure 32B aligned. */
+ andi. r10, r3, 31
+ bne cr0, L(loop_null)
+ vspltisb v0, 0
+ /* Precompute vbpermq constant. */
+ vspltisb v10, 3
+ lvsl v11, r0, r0
+ vslb v10, v11, v10
+ li r5, 16
+ /* Compare 32 bytes in each loop. */
+L(continue1):
+ lvx v4, 0, r3
+ lvx v5, r3, r5
+ vcmpequb v2, v0, v4
+ vcmpequb v3, v0, v5
+ vor v8, v2, v3
+ vcmpequb. v11, v0, v8
+ addi r3, r3, 32
+ blt cr6, L(continue1)
+ addi r3, r3, -32
+ VBPERMQ(v2, v2, v10)
+ VBPERMQ(v3, v3, v10)
+ /* Shift each component into its correct position for merging. */
+#ifdef __LITTLE_ENDIAN__
+ vsldoi v3, v3, v3, 2
+#else
+ vsldoi v2, v2, v2, 6
+ vsldoi v3, v3, v3, 4
+#endif
+ /* Merge the results and move to a GPR. */
+ vor v4, v3, v2
+ MFVRD(r5, v4)
+#ifdef __LITTLE_ENDIAN__
+ addi r6, r5, -1
+ andc r6, r6, r5
+ popcntd r6, r6
+#else
+ cntlzd r6, r5 /* Count leading zeros before the match. */
+#endif
+	add	r3, r3, r6	/* Compute the address of the null byte.  */
+ blr
+END_GEN_TB (STRRCHR, TB_TOCLESS)
+weak_alias (strrchr, rindex)
+libc_hidden_builtin_def (strrchr)
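
As a behavioral reference for the routine above (a plain scalar model, not the
optimized code): scan forward, remember the most recent match, and stop at the
terminating null; searching for '\0' returns a pointer to the terminator.

#include <stddef.h>

static char *
strrchr_ref (const char *s, int c)
{
  const char *last = NULL;
  unsigned char ch = (unsigned char) c;
  for (;; s++)
    {
      if (*(const unsigned char *) s == ch)
        last = s;               /* remember the latest occurrence */
      if (*s == '\0')
        break;                  /* also covers the c == '\0' case */
    }
  return (char *) last;
}
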
diff --git a/sysdeps/powerpc/powerpc64/power8/strspn.S b/sysdeps/powerpc/powerpc64/power8/strspn.S
new file mode 100644
index 0000000000..095f6d6f41
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strspn.S
@@ -0,0 +1,202 @@
+/* Optimized strspn implementation for Power8.
+
+ Copyright (C) 2016-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* size_t [r3] strspn (const char *string [r3],
+ const char *needleAccept [r4]) */
+
+/* This takes a novel approach by computing a 256-bit mask whereby
+   each set bit implies the byte is "accepted".  POWER8 vector hardware
+   (vbpermq) can select bits from such a mask very efficiently.
+
+   One might ask "why not use bpermd for short strings"?  It is so slow
+   that its performance roughly matches the generic PPC64 variant without
+   any fancy masking, with the added expense of building the mask.  That
+   was the first variant of this code.  */
+
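
A scalar model of the 256-bit accept mask described above (for illustration only;
the vector code below tests 16 bytes per iteration with vbpermq instead of one at a
time):

#include <stdint.h>
#include <stddef.h>

/* Bit b of mask[b / 64] is set iff byte value b is in the accept set.  */
static size_t
strspn_bitmap (const char *s, const char *accept)
{
  uint64_t mask[4] = { 0, 0, 0, 0 };
  for (const unsigned char *a = (const unsigned char *) accept; *a; a++)
    mask[*a / 64] |= (uint64_t) 1 << (*a % 64);

  size_t n = 0;
  for (const unsigned char *p = (const unsigned char *) s; *p; p++, n++)
    if (!(mask[*p / 64] & ((uint64_t) 1 << (*p % 64))))
      break;
  return n;
}
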
+
+
+#include "sysdep.h"
+
+#ifndef USE_AS_STRCSPN
+# define USE_AS_STRCSPN 0
+# ifndef STRSPN
+# define STRSPN strspn
+# endif
+# define INITIAL_MASK 0
+# define UPDATE_MASK(RA, RS, RB) or RA, RS, RB
+#else
+# ifndef STRSPN
+# define STRSPN strcspn
+# endif
+# define INITIAL_MASK -1
+# define UPDATE_MASK(RA, RS, RB) andc RA, RS, RB
+#endif
+
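
The INITIAL_MASK/UPDATE_MASK pair lets the same body implement both functions:
strspn builds an accept mask by ORing bits in, while strcspn starts from all ones
and clears (andc) the reject bytes, so in both cases a set bit means "keep
scanning".  A C sketch of the mask construction (illustrative; the names below are
not from the patch):

#include <stdint.h>

static void
build_continue_mask (uint64_t mask[4], const unsigned char *needle, int cspn)
{
  for (int i = 0; i < 4; i++)
    mask[i] = cspn ? ~(uint64_t) 0 : 0;            /* INITIAL_MASK */
  for (; *needle; needle++)
    {
      uint64_t bit = (uint64_t) 1 << (*needle % 64);
      if (cspn)
        mask[*needle / 64] &= ~bit;                /* andc: reject set */
      else
        mask[*needle / 64] |= bit;                 /* or: accept set */
    }
  if (cspn)
    mask[0] &= ~(uint64_t) 1;   /* never "continue" past the null byte */
}
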
+/* Simple macro to use VSX instructions in overlapping VR's. */
+#define XXVR(insn, vrt, vra, vrb) \
+ insn 32+vrt, 32+vra, 32+vrb
+
+/* ISA 2.07B instructions are not all defined for older binutils.
+ Macros are defined below for these newer instructions in order
+ to maintain compatibility. */
+
+/* Note, TX/SX is always set as VMX regs are the high 32 VSX regs. */
+#define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16)))
+#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
+
+#define VBPERMQ(t,a,b) .long (0x1000054c \
+ | ((t)<<(32-11)) \
+ | ((a)<<(32-16)) \
+ | ((b)<<(32-21)) )
+
+ /* This can be updated to power8 once the minimum version of
+ binutils supports power8 and the above instructions. */
+ .machine power7
+ENTRY_TOCLESS (STRSPN, 4)
+ CALL_MCOUNT 2
+
+ /* Generate useful constants for later on. */
+ vspltisb v1, 7
+ vspltisb v2, -1
+ vslb v1, v1, v1 /* 0x80 to swap high bit for vbpermq. */
+ vspltisb v10, 0
+ vsldoi v4, v10, v2, 2 /* 0xFFFF into vr4. */
+ XXVR(xxmrgld, v4, v4, v10) /* Mask for checking matches. */
+
+ /* Prepare to compute 256b mask. */
+ addi r4, r4, -1
+ li r5, INITIAL_MASK
+ li r6, INITIAL_MASK
+ li r7, INITIAL_MASK
+ li r8, INITIAL_MASK
+
+#if USE_AS_STRCSPN
+	/* Ensure the null character never matches by clearing ISA bit 0 in
+	   r5, which is the bit vbpermq will later use to check for it.  */
+ srdi r5, r5, 1
+#endif
+
+ li r11, 1
+ sldi r11, r11, 63
+
+	/* Start the interleaved mask computation.
+	   This will eventually OR 1s into the ignored bits from vbpermq.  */
+ lvsr v11, 0, r3
+ vspltb v11, v11, 0 /* Splat shift constant. */
+
+ /* Build a 256b mask in r5-r8. */
+ .align 4
+L(next_needle):
+ lbzu r9, 1(r4)
+
+ cmpldi cr0, r9, 0
+ cmpldi cr1, r9, 128
+
+ /* This is a little tricky. srd only uses the first 7 bits,
+ and if bit 7 is set, value is always 0. So, we can
+ effectively shift 128b in this case. */
+ xori r12, r9, 0x40 /* Invert bit 6. */
+ srd r10, r11, r9 /* Mask for bits 0-63. */
+ srd r12, r11, r12 /* Mask for bits 64-127. */
+
+ beq cr0, L(start_cmp)
+
+ /* Now, or the value into the correct GPR. */
+ bge cr1,L(needle_gt128)
+ UPDATE_MASK (r5, r5, r10) /* 0 - 63. */
+ UPDATE_MASK (r6, r6, r12) /* 64 - 127. */
+ b L(next_needle)
+
+ .align 4
+L(needle_gt128):
+ UPDATE_MASK (r7, r7, r10) /* 128 - 191. */
+ UPDATE_MASK (r8, r8, r12) /* 192 - 255. */
+ b L(next_needle)
+
+
+ .align 4
+L(start_cmp):
+ /* Move and merge bitmap into 2 VRs. bpermd is slower on P8. */
+ mr r0, r3 /* Save r3 for final length computation. */
+ MTVRD (v5, r5)
+ MTVRD (v6, r6)
+ MTVRD (v7, r7)
+ MTVRD (v8, r8)
+
+ /* Continue interleaved mask generation. */
+#ifdef __LITTLE_ENDIAN__
+ vsrw v11, v2, v11 /* Note, shift ignores higher order bits. */
+	vsplth	v11, v11, 0	/* Only care about the high 16 bits of v11.  */
+#else
+ vslw v11, v2, v11 /* Note, shift ignores higher order bits. */
+	vsplth	v11, v11, 1	/* Only care about the low 16 bits of v11.  */
+#endif
+ lvx v0, 0, r3 /* Note, unaligned load ignores lower bits. */
+
+ /* Do the merging of the bitmask. */
+ XXVR(xxmrghd, v5, v5, v6)
+ XXVR(xxmrghd, v6, v7, v8)
+
+ /* Finish mask generation. */
+	vand	v11, v11, v4	/* Throw away bits not in the mask.  */
+
+ /* Compare the first 1-16B, while masking unwanted bytes. */
+ clrrdi r3, r3, 4 /* Note, counts from qw boundaries. */
+ vxor v9, v0, v1 /* Swap high bit. */
+ VBPERMQ (v8, v5, v0)
+ VBPERMQ (v7, v6, v9)
+ vor v7, v7, v8
+ vor v7, v7, v11 /* Ignore non-participating bytes. */
+ vcmpequh. v8, v7, v4
+ bnl cr6, L(done)
+
+ addi r3, r3, 16
+
+ .align 4
+L(vec):
+ lvx v0, 0, r3
+ addi r3, r3, 16
+ vxor v9, v0, v1 /* Swap high bit. */
+ VBPERMQ (v8, v5, v0)
+ VBPERMQ (v7, v6, v9)
+ vor v7, v7, v8
+ vcmpequh. v8, v7, v4
+ blt cr6, L(vec)
+
+ addi r3, r3, -16
+L(done):
+ subf r3, r0, r3
+ MFVRD (r10, v7)
+
+#ifdef __LITTLE_ENDIAN__
+ addi r0, r10, 1 /* Count the trailing 1's. */
+ andc r10, r10, r0
+ popcntd r10, r10
+#else
+ xori r10, r10, 0xffff /* Count leading 1's by inverting. */
+ addi r3, r3, -48 /* Account for the extra leading zeros. */
+ cntlzd r10, r10
+#endif
+
+ add r3, r3, r10
+ blr
+
+END(STRSPN)
+libc_hidden_builtin_def (STRSPN)