12 files changed, 1366 insertions, 38 deletions
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/Implies b/sysdeps/powerpc/powerpc64/power8/fpu/Implies
deleted file mode 100644
index 1187cdfb0a..0000000000
--- a/sysdeps/powerpc/powerpc64/power8/fpu/Implies
+++ /dev/null
@@ -1 +0,0 @@
-powerpc/powerpc64/power7/fpu/
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/e_expf.S b/sysdeps/powerpc/powerpc64/power8/fpu/e_expf.S
new file mode 100644
index 0000000000..32ee8326e1
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/fpu/e_expf.S
@@ -0,0 +1,303 @@
+/* Optimized expf().  PowerPC64/POWER8 version.
+   Copyright (C) 2016-2018 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* Short algorithm description:
+ *
+ *  Let K = 64 (table size).
+ *       e^x  = 2^(x/log(2)) = 2^n * T[j] * (1 + P(y))
+ *  where:
+ *       x = m*log(2)/K + y,    y in [0.0..log(2)/K]
+ *       m = n*K + j,           m,n,j - signed integer, j in [0..K-1]
+ *       values of 2^(j/K) are tabulated as T[j].
+ *
+ *       P(y) is a minimax polynomial approximation of expf(y)-1
+ *       on small interval [0.0..log(2)/K].
+ *
+ *       P(y) = P3*y*y*y*y + P2*y*y*y + P1*y*y + P0*y, calculated as
+ *       z = y*y;    P(y) = (P3*z + P1)*z + (P2*z + P0)*y
+ *
+ * Special cases:
+ *  expf(NaN) = NaN
+ *  expf(+INF) = +INF
+ *  expf(-INF) = 0
+ *  expf(x) = 1 for subnormals
+ *  for finite argument, only expf(0)=1 is exact
+ *  expf(x) overflows if x>88.7228317260742190
+ *  expf(x) underflows if x<-103.972076416015620
+ */
+
+#define C1 0x42ad496b		/* Single precision 125*log(2).  */
+#define C2 0x31800000		/* Single precision 2^(-28).  */
+#define SP_INF 0x7f800000	/* Single precision Inf.  */
+#define SP_EXP_BIAS 0x1fc0	/* Single precision exponent bias.  */
+
+#define DATA_OFFSET r9
+
+/* Implements the function
+
+   float [fp1] expf (float [fp1] x)  */
+
+	.machine power8
+ENTRY (__ieee754_expf, 4)
+	addis	DATA_OFFSET,r2,.Lanchor@toc@ha
+	addi	DATA_OFFSET,DATA_OFFSET,.Lanchor@toc@l
+
+	xscvdpspn v0,v1
+	mfvsrd	r8,v0		/* r8 = x  */
+	lfd	fp2,(.KLN2-.Lanchor)(DATA_OFFSET)
+	lfd	fp3,(.P2-.Lanchor)(DATA_OFFSET)
+	rldicl	r3,r8,32,33	/* r3 = |x|  */
+	lis	r4,C1@ha	/* r4 = 125*log(2)  */
+	ori	r4,r4,C1@l
+	cmpw	r3,r4
+	lfd	fp5,(.P3-.Lanchor)(DATA_OFFSET)
+	lfd	fp4,(.RS-.Lanchor)(DATA_OFFSET)
+	fmadd	fp2,fp1,fp2,fp4	/* fp2 = x * K/log(2) + (2^23 + 2^22)  */
+	bge	L(special_paths)	/* |x| >= 125*log(2) ?  */
+
+	lis	r4,C2@ha
+	ori	r4,r4,C2@l
+	cmpw	r3,r4
+	blt	L(small_args)	/* |x| < 2^(-28) ?  */
+
+	/* Main path: here if 2^(-28) <= |x| < 125*log(2) */
+	frsp	fp6,fp2
+	xscvdpsp v2,v2
+	mfvsrd	r8,v2
+	mr	r3,r8			/* r3 = m  */
+	rldicl	r8,r8,32,58		/* r8 = j  */
+	lfs	fp4,(.SP_RS-.Lanchor)(DATA_OFFSET)
+	fsubs	fp2,fp6,fp4		/* fp2 = m = x * K/log(2)  */
+	srdi	r3,r3,32
+	clrrwi	r3,r3,6			/* r3 = n  */
+	lfd	fp6,(.NLN2K-.Lanchor)(DATA_OFFSET)
+	fmadd	fp0,fp2,fp6,fp1		/* fp0 = y = x - m*log(2)/K  */
+	fmul	fp2,fp0,fp0		/* fp2 = z = y^2  */
+	lfd	fp4,(.P1-.Lanchor)(DATA_OFFSET)
+	lfd	fp6,(.P0-.Lanchor)(DATA_OFFSET)
+	lis	r4,SP_EXP_BIAS@ha
+	ori	r4,r4,SP_EXP_BIAS@l
+	add	r3,r3,r4
+	rldic	r3,r3,49,1		/* r3 = 2^n  */
+	fmadd	fp4,fp5,fp2,fp4		/* fp4 = P3 * z + P1  */
+	fmadd	fp6,fp3,fp2,fp6		/* fp6 = P2 * z + P0  */
+	mtvsrd	v1,r3
+	xscvspdp v1,v1
+	fmul	fp4,fp4,fp2		/* fp4 = (P3 * z + P1)*z  */
+	fmadd	fp0,fp0,fp6,fp4		/* fp0 = P(y)  */
+	sldi	r8,r8,3			/* Access doublewords from T[j].  */
+	addi	r6,DATA_OFFSET,(.Ttable-.Lanchor)
+	lfdx	fp3,r6,r8
+	fmadd	fp0,fp0,fp3,fp3		/* fp0 = T[j] * (1 + P(y))  */
+	fmul	fp1,fp1,fp0		/* fp1 = 2^n * T[j] * (1 + P(y))  */
+	frsp	fp1,fp1
+	blr
+
+	.align	4
+/* x is either underflow, overflow, infinite or NaN.  */
+L(special_paths):
+	srdi	r8,r8,32
+	rlwinm	r8,r8,3,29,29		/* r8 = 0, if x positive.
+					   r8 = 4, otherwise.  */
+	addi	r6,DATA_OFFSET,(.SPRANGE-.Lanchor)
+	lwzx	r4,r6,r8		/* r4 = .SPRANGE[signbit(x)]  */
+	cmpw	r3,r4
+	/* |x| <= .SPRANGE[signbit(x)]  */
+	ble	L(near_under_or_overflow)
+
+	lis	r4,SP_INF@ha
+	ori	r4,r4,SP_INF@l
+	cmpw	r3,r4
+	bge	L(arg_inf_or_nan)	/* |x| > Infinite ?  */
+
+	addi	r6,DATA_OFFSET,(.SPLARGE_SMALL-.Lanchor)
+	lfsx	fp1,r6,r8
+	fmuls	fp1,fp1,fp1
+	blr
+
+
+	.align	4
+L(small_args):
+	/* expf(x) = 1.0, where |x| < |2^(-28)|  */
+	lfs	fp2,(.SPone-.Lanchor)(DATA_OFFSET)
+	fadds	fp1,fp1,fp2
+	blr
+
+
+	.align	4
+L(arg_inf_or_nan:)
+	bne	L(arg_nan)
+
+	/* expf(+INF) = +INF
+	   expf(-INF) = 0  */
+	addi	r6,DATA_OFFSET,(.INF_ZERO-.Lanchor)
+	lfsx	fp1,r6,r8
+	blr
+
+
+	.align	4
+L(arg_nan):
+	/* expf(NaN) = NaN  */
+	fadd	fp1,fp1,fp1
+	frsp	fp1,fp1
+	blr
+
+	.align	4
+L(near_under_or_overflow):
+	frsp	fp6,fp2
+	xscvdpsp v2,v2
+	mfvsrd	r8,v2
+	mr	r3,r8			/* r3 = m  */
+	rldicl	r8,r8,32,58		/* r8 = j  */
+	lfs	fp4,(.SP_RS-.Lanchor)(DATA_OFFSET)
+	fsubs	fp2,fp6,fp4		/* fp2 = m = x * K/log(2)  */
+	srdi	r3,r3,32
+	clrrwi	r3,r3,6			/* r3 = n  */
+	lfd	fp6,(.NLN2K-.Lanchor)(DATA_OFFSET)
+	fmadd	fp0,fp2,fp6,fp1		/* fp0 = y = x - m*log(2)/K  */
+	fmul	fp2,fp0,fp0		/* fp2 = z = y^2  */
+	lfd	fp4,(.P1-.Lanchor)(DATA_OFFSET)
+	lfd	fp6,(.P0-.Lanchor)(DATA_OFFSET)
+	ld	r4,(.DP_EXP_BIAS-.Lanchor)(DATA_OFFSET)
+	add	r3,r3,r4
+	rldic	r3,r3,46,1		/* r3 = 2  */
+	fmadd	fp4,fp5,fp2,fp4		/* fp4 = P3 * z + P1  */
+	fmadd	fp6,fp3,fp2,fp6		/* fp6 = P2 * z + P0  */
+	mtvsrd	v1,r3
+	fmul	fp4,fp4,fp2		/* fp4 = (P3*z + P1)*z  */
+	fmadd	fp0,fp0,fp6,fp4		/* fp0 = P(y)  */
+	sldi	r8,r8,3			/* Access doublewords from T[j].  */
+	addi	r6,DATA_OFFSET,(.Ttable-.Lanchor)
+	lfdx	fp3,r6,r8
+	fmadd	fp0,fp0,fp3,fp3		/* fp0 = T[j] * (1 + T[j])  */
+	fmul	fp1,fp1,fp0		/* fp1 = 2^n * T[j] * (1 + T[j])  */
+	frsp	fp1,fp1
+	blr
+END(__ieee754_expf)
+
+	.section .rodata, "a",@progbits
+.Lanchor:
+	.balign	8
+/* Table T[j] = 2^(j/K).  Double precision.  */
+.Ttable:
+	.8byte	0x3ff0000000000000
+	.8byte	0x3ff02c9a3e778061
+	.8byte	0x3ff059b0d3158574
+	.8byte	0x3ff0874518759bc8
+	.8byte	0x3ff0b5586cf9890f
+	.8byte	0x3ff0e3ec32d3d1a2
+	.8byte	0x3ff11301d0125b51
+	.8byte	0x3ff1429aaea92de0
+	.8byte	0x3ff172b83c7d517b
+	.8byte	0x3ff1a35beb6fcb75
+	.8byte	0x3ff1d4873168b9aa
+	.8byte	0x3ff2063b88628cd6
+	.8byte	0x3ff2387a6e756238
+	.8byte	0x3ff26b4565e27cdd
+	.8byte	0x3ff29e9df51fdee1
+	.8byte	0x3ff2d285a6e4030b
+	.8byte	0x3ff306fe0a31b715
+	.8byte	0x3ff33c08b26416ff
+	.8byte	0x3ff371a7373aa9cb
+	.8byte	0x3ff3a7db34e59ff7
+	.8byte	0x3ff3dea64c123422
+	.8byte	0x3ff4160a21f72e2a
+	.8byte	0x3ff44e086061892d
+	.8byte	0x3ff486a2b5c13cd0
+	.8byte	0x3ff4bfdad5362a27
+	.8byte	0x3ff4f9b2769d2ca7
+	.8byte	0x3ff5342b569d4f82
+	.8byte	0x3ff56f4736b527da
+	.8byte	0x3ff5ab07dd485429
+	.8byte	0x3ff5e76f15ad2148
+	.8byte	0x3ff6247eb03a5585
+	.8byte	0x3ff6623882552225
+	.8byte	0x3ff6a09e667f3bcd
+	.8byte	0x3ff6dfb23c651a2f
+	.8byte	0x3ff71f75e8ec5f74
+	.8byte	0x3ff75feb564267c9
+	.8byte	0x3ff7a11473eb0187
+	.8byte	0x3ff7e2f336cf4e62
+	.8byte	0x3ff82589994cce13
+	.8byte	0x3ff868d99b4492ed
+	.8byte	0x3ff8ace5422aa0db
+	.8byte	0x3ff8f1ae99157736
+	.8byte	0x3ff93737b0cdc5e5
+	.8byte	0x3ff97d829fde4e50
+	.8byte	0x3ff9c49182a3f090
+	.8byte	0x3ffa0c667b5de565
+	.8byte	0x3ffa5503b23e255d
+	.8byte	0x3ffa9e6b5579fdbf
+	.8byte	0x3ffae89f995ad3ad
+	.8byte	0x3ffb33a2b84f15fb
+	.8byte	0x3ffb7f76f2fb5e47
+	.8byte	0x3ffbcc1e904bc1d2
+	.8byte	0x3ffc199bdd85529c
+	.8byte	0x3ffc67f12e57d14b
+	.8byte	0x3ffcb720dcef9069
+	.8byte	0x3ffd072d4a07897c
+	.8byte	0x3ffd5818dcfba487
+	.8byte	0x3ffda9e603db3285
+	.8byte	0x3ffdfc97337b9b5f
+	.8byte	0x3ffe502ee78b3ff6
+	.8byte	0x3ffea4afa2a490da
+	.8byte	0x3ffefa1bee615a27
+	.8byte	0x3fff50765b6e4540
+	.8byte	0x3fffa7c1819e90d8
+
+.KLN2:
+	.8byte	0x40571547652b82fe	/* Double precision K/log(2).  */
+
+/* Double precision polynomial coefficients.  */
+.P0:
+	.8byte	0x3fefffffffffe7c6
+.P1:
+	.8byte	0x3fe00000008d6118
+.P2:
+	.8byte	0x3fc55550da752d4f
+.P3:
+	.8byte	0x3fa56420eb78fa85
+
+.RS:
+	.8byte	0x4168000000000000	/* Double precision 2^23 + 2^22.  */
+.NLN2K:
+	.8byte	0xbf862e42fefa39ef	/* Double precision -log(2)/K.  */
+.DP_EXP_BIAS:
+	.8byte	0x000000000000ffc0	/* Double precision exponent bias.  */
+
+	.balign	4
+.SPone:
+	.4byte	0x3f800000	/* Single precision 1.0.  */
+.SP_RS:
+	.4byte	0x4b400000	/* Single precision 2^23 + 2^22.  */
+
+.SPRANGE: /* Single precision overflow/underflow bounds.  */
+	.4byte	0x42b17217	/* if x>this bound, then result overflows.  */
+	.4byte	0x42cff1b4	/* if x<this bound, then result underflows.  */
+
+.SPLARGE_SMALL:
+	.4byte	0x71800000	/* 2^100.  */
+	.4byte	0x0d800000	/* 2^-100.  */
+
+.INF_ZERO:
+	.4byte	0x7f800000	/* Single precision Inf.  */
+	.4byte	0		/* Single precision zero.  */
+
+strong_alias (__ieee754_expf, __expf_finite)
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/multiarch/Implies b/sysdeps/powerpc/powerpc64/power8/fpu/multiarch/Implies
deleted file mode 100644
index 7fd86fdf87..0000000000
--- a/sysdeps/powerpc/powerpc64/power8/fpu/multiarch/Implies
+++ /dev/null
@@ -1 +0,0 @@
-powerpc/powerpc64/power7/fpu/multiarch
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_cosf.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_cosf.S
new file mode 100644
index 0000000000..af71382fb2
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_cosf.S
@@ -0,0 +1,509 @@
+/* Optimized cosf().  PowerPC64/POWER8 version.
+   Copyright (C) 2017-2018 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#define _ERRNO_H	1
+#include <bits/errno.h>
+#include <libm-alias-float.h>
+
+#define FRAMESIZE (FRAME_MIN_SIZE+16)
+
+#define FLOAT_EXPONENT_SHIFT	23
+#define FLOAT_EXPONENT_BIAS	127
+#define INTEGER_BITS		3
+
+#define PI_4		0x3f490fdb	/* PI/4 */
+#define NINEPI_4	0x40e231d6	/* 9 * PI/4 */
+#define TWO_PN5		0x3d000000	/* 2^-5 */
+#define TWO_PN27	0x32000000	/* 2^-27 */
+#define INFINITY	0x7f800000
+#define TWO_P23		0x4b000000	/* 2^23 */
+#define FX_FRACTION_1_28 0x9249250	/* 0x100000000 / 28 + 1 */
+
+	/* Implements the function
+
+	   float [fp1] cosf (float [fp1] x)  */
+
+	.machine power8
+ENTRY (__cosf, 4)
+	addis	r9,r2,L(anchor)@toc@ha
+	addi	r9,r9,L(anchor)@toc@l
+
+	lis	r4,PI_4@h
+	ori	r4,r4,PI_4@l
+
+	xscvdpspn v0,v1
+	mfvsrd	r8,v0
+	rldicl	r3,r8,32,33		/* Remove sign bit.  */
+
+	cmpw	r3,r4
+	bge	L(greater_or_equal_pio4)
+
+	lis	r4,TWO_PN5@h
+	ori	r4,r4,TWO_PN5@l
+
+	cmpw	r3,r4
+	blt	L(less_2pn5)
+
+	/* Chebyshev polynomial of the form:
+	 * 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))).  */
+
+	lfd	fp9,(L(C0)-L(anchor))(r9)
+	lfd	fp10,(L(C1)-L(anchor))(r9)
+	lfd	fp11,(L(C2)-L(anchor))(r9)
+	lfd	fp12,(L(C3)-L(anchor))(r9)
+	lfd	fp13,(L(C4)-L(anchor))(r9)
+
+	fmul	fp2,fp1,fp1		/* x^2 */
+	lfd     fp3,(L(DPone)-L(anchor))(r9)
+
+	fmadd	fp4,fp2,fp13,fp12	/* C3+x^2*C4 */
+	fmadd	fp4,fp2,fp4,fp11	/* C2+x^2*(C3+x^2*C4) */
+	fmadd	fp4,fp2,fp4,fp10	/* C1+x^2*(C2+x^2*(C3+x^2*C4)) */
+	fmadd	fp4,fp2,fp4,fp9		/* C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4))) */
+	fmadd	fp1,fp2,fp4,fp3		/* 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))) */
+	frsp	fp1,fp1			/* Round to single precision.  */
+
+	blr
+
+	.balign 16
+L(greater_or_equal_pio4):
+	lis	r4,NINEPI_4@h
+	ori	r4,r4,NINEPI_4@l
+	cmpw	r3,r4
+	bge	L(greater_or_equal_9pio4)
+
+	/* Calculate quotient of |x|/(PI/4).  */
+	lfd	fp2,(L(invpio4)-L(anchor))(r9)
+	fabs	fp1,fp1			/* |x| */
+	fmul	fp2,fp1,fp2		/* |x|/(PI/4) */
+	fctiduz	fp2,fp2
+	mfvsrd	r3,v2			/* n = |x| mod PI/4 */
+
+	/* Now use that quotient to find |x| mod (PI/2).  */
+	addi	r7,r3,1
+	rldicr	r5,r7,2,60		/* ((n+1) >> 1) << 3 */
+	addi	r6,r9,(L(pio2_table)-L(anchor))
+	lfdx	fp4,r5,r6
+	fsub	fp1,fp1,fp4
+
+	.balign 16
+L(reduced):
+	/* Now we are in the range -PI/4 to PI/4.  */
+
+	/* Work out if we are in a positive or negative primary interval.  */
+	addi    r7,r7,2
+	rldicl	r4,r7,62,63		/* ((n+3) >> 2) & 1 */
+
+	/* Load a 1.0 or -1.0.  */
+	addi	r5,r9,(L(ones)-L(anchor))
+	sldi	r4,r4,3
+	lfdx	fp0,r4,r5
+
+	/* Are we in the primary interval of sin or cos?  */
+	andi.	r4,r7,0x2
+	bne	L(cos)
+
+	/* Chebyshev polynomial of the form:
+	   x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))).  */
+
+	lfd	fp9,(L(S0)-L(anchor))(r9)
+	lfd	fp10,(L(S1)-L(anchor))(r9)
+	lfd	fp11,(L(S2)-L(anchor))(r9)
+	lfd	fp12,(L(S3)-L(anchor))(r9)
+	lfd	fp13,(L(S4)-L(anchor))(r9)
+
+	fmul	fp2,fp1,fp1		/* x^2 */
+	fmul	fp3,fp2,fp1		/* x^3 */
+
+	fmadd	fp4,fp2,fp13,fp12	/* S3+x^2*S4 */
+	fmadd	fp4,fp2,fp4,fp11	/* S2+x^2*(S3+x^2*S4) */
+	fmadd	fp4,fp2,fp4,fp10	/* S1+x^2*(S2+x^2*(S3+x^2*S4)) */
+	fmadd	fp4,fp2,fp4,fp9		/* S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4))) */
+	fmadd	fp4,fp3,fp4,fp1		/* x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))) */
+	fmul	fp4,fp4,fp0		/* Add in the sign.  */
+	frsp	fp1,fp4			/* Round to single precision.  */
+
+	blr
+
+	.balign 16
+L(cos):
+	/* Chebyshev polynomial of the form:
+	   1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))).  */
+
+	lfd	fp9,(L(C0)-L(anchor))(r9)
+	lfd	fp10,(L(C1)-L(anchor))(r9)
+	lfd	fp11,(L(C2)-L(anchor))(r9)
+	lfd	fp12,(L(C3)-L(anchor))(r9)
+	lfd	fp13,(L(C4)-L(anchor))(r9)
+
+	fmul	fp2,fp1,fp1		/* x^2 */
+	lfd	fp3,(L(DPone)-L(anchor))(r9)
+
+	fmadd	fp4,fp2,fp13,fp12	/* C3+x^2*C4 */
+	fmadd	fp4,fp2,fp4,fp11	/* C2+x^2*(C3+x^2*C4) */
+	fmadd	fp4,fp2,fp4,fp10	/* C1+x^2*(C2+x^2*(C3+x^2*C4)) */
+	fmadd	fp4,fp2,fp4,fp9		/* C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4))) */
+	fmadd	fp4,fp2,fp4,fp3		/* 1.0 + x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))) */
+	fmul	fp4,fp4,fp0		/* Add in the sign.  */
+	frsp	fp1,fp4			/* Round to single precision.  */
+
+	blr
+
+	.balign 16
+L(greater_or_equal_9pio4):
+	lis	r4,INFINITY@h
+	ori	r4,r4,INFINITY@l
+	cmpw	r3,r4
+	bge	L(inf_or_nan)
+
+	lis	r4,TWO_P23@h
+	ori	r4,r4,TWO_P23@l
+	cmpw	r3,r4
+	bge	L(greater_or_equal_2p23)
+
+	fabs	fp1,fp1			/* |x| */
+
+	/* Calculate quotient of |x|/(PI/4).  */
+	lfd	fp2,(L(invpio4)-L(anchor))(r9)
+
+	lfd     fp3,(L(DPone)-L(anchor))(r9)
+	lfd     fp4,(L(DPhalf)-L(anchor))(r9)
+	fmul    fp2,fp1,fp2             /* |x|/(PI/4) */
+	friz    fp2,fp2                 /* n = floor(|x|/(PI/4)) */
+
+	/* Calculate (n + 1) / 2.  */
+	fadd	fp2,fp2,fp3		/* n + 1 */
+	fmul	fp3,fp2,fp4		/* (n + 1) / 2 */
+	friz	fp3,fp3
+
+	lfd	fp4,(L(pio2hi)-L(anchor))(r9)
+	lfd	fp5,(L(pio2lo)-L(anchor))(r9)
+
+	fmul	fp6,fp4,fp3
+	fadd	fp6,fp6,fp1
+	fmadd	fp1,fp5,fp3,fp6
+
+	fctiduz	fp2,fp2
+	mfvsrd	r7,v2			/* n + 1 */
+
+	b	L(reduced)
+
+	.balign 16
+L(inf_or_nan):
+	bne	L(skip_errno_setting)	/* Is a NAN?  */
+
+	/* We delayed the creation of the stack frame, as well as the saving of
+	   the link register, because only at this point, we are sure that
+	   doing so is actually needed.  */
+
+	stfd	fp1,-8(r1)
+
+	/* Save the link register.  */
+	mflr	r0
+	std	r0,16(r1)
+	cfi_offset(lr, 16)
+
+	/* Create the stack frame.  */
+	stdu	r1,-FRAMESIZE(r1)
+	cfi_adjust_cfa_offset(FRAMESIZE)
+
+	bl	JUMPTARGET(__errno_location)
+	nop
+
+	/* Restore the stack frame.  */
+	addi	r1,r1,FRAMESIZE
+	cfi_adjust_cfa_offset(-FRAMESIZE)
+	/* Restore the link register.  */
+	ld	r0,16(r1)
+	mtlr	r0
+
+	lfd	fp1,-8(r1)
+
+	/* errno = EDOM */
+	li	r4,EDOM
+	stw	r4,0(r3)
+
+L(skip_errno_setting):
+	fsub	fp1,fp1,fp1		/* x - x */
+	blr
+
+	.balign 16
+L(greater_or_equal_2p23):
+	fabs	fp1,fp1
+
+	srwi	r4,r3,FLOAT_EXPONENT_SHIFT
+	subi	r4,r4,FLOAT_EXPONENT_BIAS
+
+	/* We reduce the input modulo pi/4, so we need 3 bits of integer
+	   to determine where in 2*pi we are. Index into our array
+	   accordingly.  */
+	addi r4,r4,INTEGER_BITS
+
+	/* To avoid an expensive divide, for the range we care about (0 - 127)
+	   we can transform x/28 into:
+
+	   x/28 = (x * ((0x100000000 / 28) + 1)) >> 32
+
+	   mulhwu returns the top 32 bits of the 64 bit result, doing the
+	   shift for us in the same instruction. The top 32 bits are undefined,
+	   so we have to mask them.  */
+
+	lis	r6,FX_FRACTION_1_28@h
+	ori	r6,r6,FX_FRACTION_1_28@l
+	mulhwu	r5,r4,r6
+	clrldi	r5,r5,32
+
+	/* Get our pointer into the invpio4_table array.  */
+	sldi	r4,r5,3
+	addi	r6,r9,(L(invpio4_table)-L(anchor))
+	add	r4,r4,r6
+
+	lfd	fp2,0(r4)
+	lfd	fp3,8(r4)
+	lfd	fp4,16(r4)
+	lfd	fp5,24(r4)
+
+	fmul	fp6,fp2,fp1
+	fmul	fp7,fp3,fp1
+	fmul	fp8,fp4,fp1
+	fmul	fp9,fp5,fp1
+
+	/* Mask off larger integer bits in highest double word that we don't
+	   care about to avoid losing precision when combining with smaller
+	   values.  */
+	fctiduz	fp10,fp6
+	mfvsrd	r7,v10
+	rldicr	r7,r7,0,(63-INTEGER_BITS)
+	mtvsrd	v10,r7
+	fcfidu	fp10,fp10		/* Integer bits.  */
+
+	fsub	fp6,fp6,fp10		/* highest -= integer bits */
+
+	/* Work out the integer component, rounded down. Use the top two
+	   limbs for this.  */
+	fadd	fp10,fp6,fp7		/* highest + higher */
+
+	fctiduz	fp10,fp10
+	mfvsrd	r7,v10
+	andi.	r0,r7,1
+	fcfidu	fp10,fp10
+
+	/* Subtract integer component from highest limb.  */
+	fsub	fp12,fp6,fp10
+
+	beq	L(even_integer)
+
+	/* Our integer component is odd, so we are in the -PI/4 to 0 primary
+	   region. We need to shift our result down by PI/4, and to do this
+	   in the mod (4/PI) space we simply subtract 1.  */
+	lfd	fp11,(L(DPone)-L(anchor))(r9)
+	fsub	fp12,fp12,fp11
+
+	/* Now add up all the limbs in order.  */
+	fadd	fp12,fp12,fp7
+	fadd	fp12,fp12,fp8
+	fadd	fp12,fp12,fp9
+
+	/* And finally multiply by pi/4.  */
+	lfd	fp13,(L(pio4)-L(anchor))(r9)
+	fmul	fp1,fp12,fp13
+
+	addi	r7,r7,1
+	b	L(reduced)
+
+L(even_integer):
+	lfd	fp11,(L(DPone)-L(anchor))(r9)
+
+	/* Now add up all the limbs in order.  */
+	fadd	fp12,fp12,fp7
+	fadd	fp12,r12,fp8
+	fadd	fp12,r12,fp9
+
+	/* We need to check if the addition of all the limbs resulted in us
+	   overflowing 1.0.  */
+	fcmpu	0,fp12,fp11
+	bgt	L(greater_than_one)
+
+	/* And finally multiply by pi/4.  */
+	lfd	fp13,(L(pio4)-L(anchor))(r9)
+	fmul	fp1,fp12,fp13
+
+	addi	r7,r7,1
+	b	L(reduced)
+
+L(greater_than_one):
+	/* We did overflow 1.0 when adding up all the limbs. Add 1.0 to our
+	   integer, and subtract 1.0 from our result. Since that makes the
+	   integer component odd, we need to subtract another 1.0 as
+	   explained above.  */
+	addi	r7,r7,1
+
+	lfd	fp11,(L(DPtwo)-L(anchor))(r9)
+	fsub	fp12,fp12,fp11
+
+	/* And finally multiply by pi/4.  */
+	lfd	fp13,(L(pio4)-L(anchor))(r9)
+	fmul	fp1,fp12,fp13
+
+	addi	r7,r7,1
+	b	L(reduced)
+
+	.balign 16
+L(less_2pn5):
+	lis	r4,TWO_PN27@h
+	ori	r4,r4,TWO_PN27@l
+
+	cmpw	r3,r4
+	blt	L(less_2pn27)
+
+	/* A simpler Chebyshev approximation is close enough for this range:
+	   1.0+x^2*(CC0+x^3*CC1).  */
+
+	lfd	fp10,(L(CC0)-L(anchor))(r9)
+	lfd	fp11,(L(CC1)-L(anchor))(r9)
+
+	fmul	fp2,fp1,fp1		/* x^2 */
+	fmul	fp3,fp2,fp1		/* x^3 */
+	lfd     fp1,(L(DPone)-L(anchor))(r9)
+
+	fmadd   fp4,fp3,fp11,fp10       /* CC0+x^3*CC1 */
+	fmadd	fp1,fp2,fp4,fp1		/* 1.0+x^2*(CC0+x^3*CC1) */
+
+	frsp	fp1,fp1			/* Round to single precision.  */
+
+	blr
+
+	.balign 16
+L(less_2pn27):
+	/* Handle some special cases:
+
+	   cosf(subnormal) raises inexact
+	   cosf(min_normalized) raises inexact
+	   cosf(normalized) raises inexact.  */
+
+	lfd     fp2,(L(DPone)-L(anchor))(r9)
+
+	fabs    fp1,fp1                 /* |x| */
+	fsub	fp1,fp2,fp1		/* 1.0-|x| */
+
+	frsp	fp1,fp1
+
+	blr
+
+END (__cosf)
+
+	.section .rodata, "a"
+
+	.balign 8
+
+L(anchor):
+
+	/* Chebyshev constants for sin, range -PI/4 - PI/4.  */
+L(S0):	.8byte	0xbfc5555555551cd9
+L(S1):	.8byte	0x3f81111110c2688b
+L(S2):	.8byte	0xbf2a019f8b4bd1f9
+L(S3):	.8byte	0x3ec71d7264e6b5b4
+L(S4):	.8byte	0xbe5a947e1674b58a
+
+	/* Chebyshev constants for cos, range 2^-27 - 2^-5.  */
+L(CC0):	.8byte	0xbfdfffffff5cc6fd
+L(CC1):	.8byte	0x3fa55514b178dac5
+
+	/* Chebyshev constants for cos, range -PI/4 - PI/4.  */
+L(C0):	.8byte	0xbfdffffffffe98ae
+L(C1):	.8byte	0x3fa55555545c50c7
+L(C2):	.8byte	0xbf56c16b348b6874
+L(C3):	.8byte	0x3efa00eb9ac43cc0
+L(C4):	.8byte	0xbe923c97dd8844d7
+
+L(invpio2):
+	.8byte	0x3fe45f306dc9c883	/* 2/PI */
+
+L(invpio4):
+	.8byte	0x3ff45f306dc9c883	/* 4/PI */
+
+L(invpio4_table):
+	.8byte	0x0000000000000000
+	.8byte	0x3ff45f306c000000
+	.8byte	0x3e3c9c882a000000
+	.8byte	0x3c54fe13a8000000
+	.8byte	0x3aaf47d4d0000000
+	.8byte	0x38fbb81b6c000000
+	.8byte	0x3714acc9e0000000
+	.8byte	0x3560e4107c000000
+	.8byte	0x33bca2c756000000
+	.8byte	0x31fbd778ac000000
+	.8byte	0x300b7246e0000000
+	.8byte	0x2e5d2126e8000000
+	.8byte	0x2c97003248000000
+	.8byte	0x2ad77504e8000000
+	.8byte	0x290921cfe0000000
+	.8byte	0x274deb1cb0000000
+	.8byte	0x25829a73e0000000
+	.8byte	0x23fd1046be000000
+	.8byte	0x2224baed10000000
+	.8byte	0x20709d338e000000
+	.8byte	0x1e535a2f80000000
+	.8byte	0x1cef904e64000000
+	.8byte	0x1b0d639830000000
+	.8byte	0x1964ce7d24000000
+	.8byte	0x17b908bf16000000
+
+L(pio4):
+	.8byte	0x3fe921fb54442d18	/* PI/4 */
+
+/* PI/2 as a sum of two doubles. We only use 32 bits of the upper limb
+   to avoid losing significant bits when multiplying with up to
+   (2^22)/(pi/2).  */
+L(pio2hi):
+	.8byte	0xbff921fb54400000
+
+L(pio2lo):
+	.8byte	0xbdd0b4611a626332
+
+L(pio2_table):
+	.8byte	0
+	.8byte	0x3ff921fb54442d18	/* 1 * PI/2 */
+	.8byte	0x400921fb54442d18	/* 2 * PI/2 */
+	.8byte	0x4012d97c7f3321d2	/* 3 * PI/2 */
+	.8byte	0x401921fb54442d18	/* 4 * PI/2 */
+	.8byte	0x401f6a7a2955385e	/* 5 * PI/2 */
+	.8byte	0x4022d97c7f3321d2	/* 6 * PI/2 */
+	.8byte	0x4025fdbbe9bba775	/* 7 * PI/2 */
+	.8byte	0x402921fb54442d18	/* 8 * PI/2 */
+	.8byte	0x402c463abeccb2bb	/* 9 * PI/2 */
+	.8byte	0x402f6a7a2955385e	/* 10 * PI/2 */
+
+L(small):
+	.8byte	0x3cd0000000000000	/* 2^-50 */
+
+L(ones):
+	.8byte	0x3ff0000000000000	/* +1.0 */
+	.8byte	0xbff0000000000000	/* -1.0 */
+
+L(DPhalf):
+	.8byte	0x3fe0000000000000	/* 0.5 */
+
+L(DPone):
+	.8byte	0x3ff0000000000000	/* 1.0 */
+
+L(DPtwo):
+	.8byte	0x4000000000000000	/* 2.0 */
+
+libm_alias_float (__cos, cos)
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S
index 3b0c88e5eb..aac54a9364 100644
--- a/sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S
+++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S
@@ -1,5 +1,5 @@
 /* isfinite().  PowerPC64/POWER8 version.
-   Copyright (C) 2014-2016 Free Software Foundation, Inc.
+   Copyright (C) 2014-2018 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -23,7 +23,7 @@
 
 /* int [r3] __finite ([fp1] x)  */
 
-EALIGN (__finite, 4, 0)
+ENTRY_TOCLESS (__finite, 4)
 	CALL_MCOUNT 0
 	MFVSRD_R3_V1
 	lis     r9,0x8010
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S
index 4708239689..94746ef068 100644
--- a/sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S
+++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S
@@ -1,5 +1,5 @@
 /* isinf().  PowerPC64/POWER8 version.
-   Copyright (C) 2014-2016 Free Software Foundation, Inc.
+   Copyright (C) 2014-2018 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -23,7 +23,7 @@
 
 /* int [r3] __isinf([fp1] x)  */
 
-EALIGN (__isinf, 4, 0)
+ENTRY_TOCLESS (__isinf, 4)
 	CALL_MCOUNT 0
 	MFVSRD_R3_V1
 	lis     r9,0x7ff0     /* r9 = 0x7ff0  */
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S
index 0e5c19333c..8aef354f68 100644
--- a/sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S
+++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S
@@ -1,5 +1,5 @@
 /* isnan().  PowerPC64/POWER8 version.
-   Copyright (C) 2014-2016 Free Software Foundation, Inc.
+   Copyright (C) 2014-2018 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -23,7 +23,7 @@
 
 /* int [r3] __isnan([f1] x)  */
 
-EALIGN (__isnan, 4, 0)
+ENTRY_TOCLESS (__isnan, 4)
 	CALL_MCOUNT 0
 	MFVSRD_R3_V1
 	lis     r9,0x7ff0
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S
index 0af9feee5a..7f18d705a9 100644
--- a/sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S
+++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S
@@ -1,5 +1,5 @@
 /* Round double to long int.  POWER8 PowerPC64 version.
-   Copyright (C) 2014-2016 Free Software Foundation, Inc.
+   Copyright (C) 2014-2018 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -18,11 +18,13 @@
 
 #include <sysdep.h>
 #include <math_ldbl_opt.h>
+#include <libm-alias-float.h>
+#include <libm-alias-double.h>
 
 #define MFVSRD_R3_V1  .long 0x7c230066     /* mfvsrd  r3,vs1  */
 
 /* long long int[r3] __llrint (double x[fp1])  */
-ENTRY (__llrint)
+ENTRY_TOCLESS (__llrint)
 	CALL_MCOUNT 0
 	fctid	fp1,fp1
 	MFVSRD_R3_V1
@@ -30,16 +32,12 @@ ENTRY (__llrint)
 END (__llrint)
 
 strong_alias (__llrint, __lrint)
-weak_alias (__llrint, llrint)
-weak_alias (__lrint, lrint)
-
-#ifdef NO_LONG_DOUBLE
-strong_alias (__llrint, __llrintl)
-weak_alias (__llrint, llrintl)
-strong_alias (__lrint, __lrintl)
-weak_alias (__lrint, lrintl)
-#endif
-#if LONG_DOUBLE_COMPAT(libm, GLIBC_2_1)
-compat_symbol (libm, __llrint, llrintl, GLIBC_2_1)
-compat_symbol (libm, __lrint, lrintl, GLIBC_2_1)
-#endif
+libm_alias_double (__llrint, llrint)
+libm_alias_double (__lrint, lrint)
+/* The double version also works for single-precision as both float and
+   double parameters are passed in 64bit FPRs and both versions are expected
+   to return [long] long type.  */
+strong_alias (__llrint, __llrintf)
+libm_alias_float (__llrint, llrint)
+strong_alias (__lrint, __lrintf)
+libm_alias_float (__lrint, lrint)
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S
index af2409deb8..a22fc63bb3 100644
--- a/sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S
+++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S
@@ -1,5 +1,5 @@
 /* llround function.  POWER8 PowerPC64 version.
-   Copyright (C) 2014-2016 Free Software Foundation, Inc.
+   Copyright (C) 2014-2018 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -19,12 +19,14 @@
 #include <sysdep.h>
 #include <endian.h>
 #include <math_ldbl_opt.h>
+#include <libm-alias-float.h>
+#include <libm-alias-double.h>
 
 #define MFVSRD_R3_V1  .long 0x7c230066     /* mfvsrd  r3,vs1  */
 
 /* long long [r3] llround (float x [fp1])  */
 
-ENTRY (__llround)
+ENTRY_TOCLESS (__llround)
 	CALL_MCOUNT 0
 	frin	fp1,fp1	/* Round to nearest +-0.5.  */
 	fctidz	fp1,fp1	/* Convert To Integer DW round toward 0.  */
@@ -33,16 +35,12 @@ ENTRY (__llround)
 END (__llround)
 
 strong_alias (__llround, __lround)
-weak_alias (__llround, llround)
-weak_alias (__lround, lround)
-
-#ifdef NO_LONG_DOUBLE
-weak_alias (__llround, llroundl)
-strong_alias (__llround, __llroundl)
-weak_alias (__lround, lroundl)
-strong_alias (__lround, __lroundl)
-#endif
-#if LONG_DOUBLE_COMPAT(libm, GLIBC_2_1)
-compat_symbol (libm, __llround, llroundl, GLIBC_2_1)
-compat_symbol (libm, __lround, lroundl, GLIBC_2_1)
-#endif
+libm_alias_double (__llround, llround)
+libm_alias_double (__lround, lround)
+/* The double version also works for single-precision as both float and
+   double parameters are passed in 64bit FPRs and both versions are expected
+   to return [long] long type.  */
+strong_alias (__llround, __llroundf)
+libm_alias_float (__llround, llround)
+strong_alias (__lround, __lroundf)
+libm_alias_float (__lround, lround)
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_llroundf.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_llroundf.S
new file mode 100644
index 0000000000..9ea6bd105b
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_llroundf.S
@@ -0,0 +1 @@
+/* __lroundf is in s_llround.S.  */
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_sinf.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_sinf.S
new file mode 100644
index 0000000000..59e613c102
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_sinf.S
@@ -0,0 +1,520 @@
+/* Optimized sinf().  PowerPC64/POWER8 version.
+   Copyright (C) 2016-2018 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#define _ERRNO_H	1
+#include <bits/errno.h>
+#include <libm-alias-float.h>
+
+#define FRAMESIZE (FRAME_MIN_SIZE+16)
+
+#define FLOAT_EXPONENT_SHIFT	23
+#define FLOAT_EXPONENT_BIAS	127
+#define INTEGER_BITS		3
+
+#define PI_4		0x3f490fdb	/* PI/4 */
+#define NINEPI_4	0x40e231d6	/* 9 * PI/4 */
+#define TWO_PN5		0x3d000000	/* 2^-5 */
+#define TWO_PN27	0x32000000	/* 2^-27 */
+#define INFINITY	0x7f800000
+#define TWO_P23		0x4b000000	/* 2^27 */
+#define FX_FRACTION_1_28 0x9249250	/* 0x100000000 / 28 + 1 */
+
+	/* Implements the function
+
+	   float [fp1] sinf (float [fp1] x)  */
+
+	.machine power8
+ENTRY (__sinf, 4)
+	addis	r9,r2,L(anchor)@toc@ha
+	addi	r9,r9,L(anchor)@toc@l
+
+	lis	r4,PI_4@h
+	ori	r4,r4,PI_4@l
+
+	xscvdpspn v0,v1
+	mfvsrd	r8,v0
+	rldicl	r3,r8,32,33		/* Remove sign bit.  */
+
+	cmpw	r3,r4
+	bge	L(greater_or_equal_pio4)
+
+	lis	r4,TWO_PN5@h
+	ori	r4,r4,TWO_PN5@l
+
+	cmpw	r3,r4
+	blt	L(less_2pn5)
+
+	/* Chebyshev polynomial of the form:
+	 * x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))).  */
+
+	lfd	fp9,(L(S0)-L(anchor))(r9)
+	lfd	fp10,(L(S1)-L(anchor))(r9)
+	lfd	fp11,(L(S2)-L(anchor))(r9)
+	lfd	fp12,(L(S3)-L(anchor))(r9)
+	lfd	fp13,(L(S4)-L(anchor))(r9)
+
+	fmul	fp2,fp1,fp1		/* x^2 */
+	fmul	fp3,fp2,fp1		/* x^3 */
+
+	fmadd	fp4,fp2,fp13,fp12	/* S3+x^2*S4 */
+	fmadd	fp4,fp2,fp4,fp11	/* S2+x^2*(S3+x^2*S4) */
+	fmadd	fp4,fp2,fp4,fp10	/* S1+x^2*(S2+x^2*(S3+x^2*S4)) */
+	fmadd	fp4,fp2,fp4,fp9		/* S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4))) */
+	fmadd	fp1,fp3,fp4,fp1		/* x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))) */
+	frsp	fp1,fp1			/* Round to single precision.  */
+
+	blr
+
+	.balign 16
+L(greater_or_equal_pio4):
+	lis	r4,NINEPI_4@h
+	ori	r4,r4,NINEPI_4@l
+	cmpw	r3,r4
+	bge	L(greater_or_equal_9pio4)
+
+	/* Calculate quotient of |x|/(PI/4).  */
+	lfd	fp2,(L(invpio4)-L(anchor))(r9)
+	fabs	fp1,fp1			/* |x| */
+	fmul	fp2,fp1,fp2		/* |x|/(PI/4) */
+	fctiduz	fp2,fp2
+	mfvsrd	r3,v2			/* n = |x| mod PI/4 */
+
+	/* Now use that quotient to find |x| mod (PI/2).  */
+	addi	r7,r3,1
+	rldicr	r5,r7,2,60		/* ((n+1) >> 1) << 3 */
+	addi	r6,r9,(L(pio2_table)-L(anchor))
+	lfdx	fp4,r5,r6
+	fsub	fp1,fp1,fp4
+
+	.balign 16
+L(reduced):
+	/* Now we are in the range -PI/4 to PI/4.  */
+
+	/* Work out if we are in a positive or negative primary interval.  */
+	rldicl	r4,r7,62,63		/* ((n+1) >> 2) & 1 */
+
+	/* We are operating on |x|, so we need to add back the original
+	   sign.  */
+	rldicl	r8,r8,33,63		/* (x >> 31) & 1, ie the sign bit.  */
+	xor	r4,r4,r8		/* 0 if result should be positive,
+					   1 if negative.  */
+
+	/* Load a 1.0 or -1.0.  */
+	addi	r5,r9,(L(ones)-L(anchor))
+	sldi	r4,r4,3
+	lfdx	fp0,r4,r5
+
+	/* Are we in the primary interval of sin or cos?  */
+	andi.	r4,r7,0x2
+	bne	L(cos)
+
+	/* Chebyshev polynomial of the form:
+	   x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))).  */
+
+	lfd	fp9,(L(S0)-L(anchor))(r9)
+	lfd	fp10,(L(S1)-L(anchor))(r9)
+	lfd	fp11,(L(S2)-L(anchor))(r9)
+	lfd	fp12,(L(S3)-L(anchor))(r9)
+	lfd	fp13,(L(S4)-L(anchor))(r9)
+
+	fmul	fp2,fp1,fp1		/* x^2 */
+	fmul	fp3,fp2,fp1		/* x^3 */
+
+	fmadd	fp4,fp2,fp13,fp12	/* S3+x^2*S4 */
+	fmadd	fp4,fp2,fp4,fp11	/* S2+x^2*(S3+x^2*S4) */
+	fmadd	fp4,fp2,fp4,fp10	/* S1+x^2*(S2+x^2*(S3+x^2*S4)) */
+	fmadd	fp4,fp2,fp4,fp9		/* S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4))) */
+	fmadd	fp4,fp3,fp4,fp1		/* x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))) */
+	fmul	fp4,fp4,fp0		/* Add in the sign.  */
+	frsp	fp1,fp4			/* Round to single precision.  */
+
+	blr
+
+	.balign 16
+L(cos):
+	/* Chebyshev polynomial of the form:
+	   1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))).  */
+
+	lfd	fp9,(L(C0)-L(anchor))(r9)
+	lfd	fp10,(L(C1)-L(anchor))(r9)
+	lfd	fp11,(L(C2)-L(anchor))(r9)
+	lfd	fp12,(L(C3)-L(anchor))(r9)
+	lfd	fp13,(L(C4)-L(anchor))(r9)
+
+	fmul	fp2,fp1,fp1		/* x^2 */
+	lfd	fp3,(L(DPone)-L(anchor))(r9)
+
+	fmadd	fp4,fp2,fp13,fp12	/* C3+x^2*C4 */
+	fmadd	fp4,fp2,fp4,fp11	/* C2+x^2*(C3+x^2*C4) */
+	fmadd	fp4,fp2,fp4,fp10	/* C1+x^2*(C2+x^2*(C3+x^2*C4)) */
+	fmadd	fp4,fp2,fp4,fp9		/* C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4))) */
+	fmadd	fp4,fp2,fp4,fp3		/* 1.0 + x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))) */
+	fmul	fp4,fp4,fp0		/* Add in the sign.  */
+	frsp	fp1,fp4			/* Round to single precision.  */
+
+	blr
+
+	.balign 16
+L(greater_or_equal_9pio4):
+	lis	r4,INFINITY@h
+	ori	r4,r4,INFINITY@l
+	cmpw	r3,r4
+	bge	L(inf_or_nan)
+
+	lis	r4,TWO_P23@h
+	ori	r4,r4,TWO_P23@l
+	cmpw	r3,r4
+	bge	L(greater_or_equal_2p23)
+
+	fabs	fp1,fp1			/* |x| */
+
+	/* Calculate quotient of |x|/(PI/4).  */
+	lfd	fp2,(L(invpio4)-L(anchor))(r9)
+
+	lfd	fp3,(L(DPone)-L(anchor))(r9)
+	lfd	fp4,(L(DPhalf)-L(anchor))(r9)
+	fmul	fp2,fp1,fp2		/* |x|/(PI/4) */
+	friz	fp2,fp2			/* n = floor(|x|/(PI/4)) */
+
+	/* Calculate (n + 1) / 2.  */
+	fadd	fp2,fp2,fp3		/* n + 1 */
+	fmul	fp3,fp2,fp4		/* (n + 1) / 2 */
+	friz	fp3,fp3
+
+	lfd	fp4,(L(pio2hi)-L(anchor))(r9)
+	lfd	fp5,(L(pio2lo)-L(anchor))(r9)
+
+	fmul	fp6,fp4,fp3
+	fadd	fp6,fp6,fp1
+	fmadd	fp1,fp5,fp3,fp6
+
+	fctiduz	fp2,fp2
+	mfvsrd	r7,v2			/* n + 1 */
+
+	b	L(reduced)
+
+	.balign 16
+L(inf_or_nan):
+	bne	L(skip_errno_setting)	/* Is a NAN?  */
+
+	/* We delayed the creation of the stack frame, as well as the saving of
+	   the link register, because only at this point, we are sure that
+	   doing so is actually needed.  */
+
+	stfd	fp1,-8(r1)
+
+	/* Save the link register.  */
+	mflr	r0
+	std	r0,16(r1)
+	cfi_offset(lr, 16)
+
+	/* Create the stack frame.  */
+	stdu	r1,-FRAMESIZE(r1)
+	cfi_adjust_cfa_offset(FRAMESIZE)
+
+	bl	JUMPTARGET(__errno_location)
+	nop
+
+	/* Restore the stack frame.  */
+	addi	r1,r1,FRAMESIZE
+	cfi_adjust_cfa_offset(-FRAMESIZE)
+	/* Restore the link register.  */
+	ld	r0,16(r1)
+	mtlr	r0
+
+	lfd	fp1,-8(r1)
+
+	/* errno = EDOM */
+	li	r4,EDOM
+	stw	r4,0(r3)
+
+L(skip_errno_setting):
+	fsub	fp1,fp1,fp1		/* x - x */
+	blr
+
+	.balign 16
+L(greater_or_equal_2p23):
+	fabs	fp1,fp1
+
+	srwi	r4,r3,FLOAT_EXPONENT_SHIFT
+	subi	r4,r4,FLOAT_EXPONENT_BIAS
+
+	/* We reduce the input modulo pi/4, so we need 3 bits of integer
+	   to determine where in 2*pi we are. Index into our array
+	   accordingly.  */
+	addi r4,r4,INTEGER_BITS
+
+	/* To avoid an expensive divide, for the range we care about (0 - 127)
+	   we can transform x/28 into:
+
+	   x/28 = (x * ((0x100000000 / 28) + 1)) >> 32
+
+	   mulhwu returns the top 32 bits of the 64 bit result, doing the
+	   shift for us in the same instruction. The top 32 bits are undefined,
+	   so we have to mask them.  */
+
+	lis	r6,FX_FRACTION_1_28@h
+	ori	r6,r6,FX_FRACTION_1_28@l
+	mulhwu	r5,r4,r6
+	clrldi	r5,r5,32
+
+	/* Get our pointer into the invpio4_table array.  */
+	sldi	r4,r5,3
+	addi	r6,r9,(L(invpio4_table)-L(anchor))
+	add	r4,r4,r6
+
+	lfd	fp2,0(r4)
+	lfd	fp3,8(r4)
+	lfd	fp4,16(r4)
+	lfd	fp5,24(r4)
+
+	fmul	fp6,fp2,fp1
+	fmul	fp7,fp3,fp1
+	fmul	fp8,fp4,fp1
+	fmul	fp9,fp5,fp1
+
+	/* Mask off larger integer bits in highest double word that we don't
+	   care about to avoid losing precision when combining with smaller
+	   values.  */
+	fctiduz	fp10,fp6
+	mfvsrd	r7,v10
+	rldicr	r7,r7,0,(63-INTEGER_BITS)
+	mtvsrd	v10,r7
+	fcfidu	fp10,fp10		/* Integer bits.  */
+
+	fsub	fp6,fp6,fp10		/* highest -= integer bits */
+
+	/* Work out the integer component, rounded down. Use the top two
+	   limbs for this.  */
+	fadd	fp10,fp6,fp7		/* highest + higher */
+
+	fctiduz	fp10,fp10
+	mfvsrd	r7,v10
+	andi.	r0,r7,1
+	fcfidu	fp10,fp10
+
+	/* Subtract integer component from highest limb.  */
+	fsub	fp12,fp6,fp10
+
+	beq	L(even_integer)
+
+	/* Our integer component is odd, so we are in the -PI/4 to 0 primary
+	   region. We need to shift our result down by PI/4, and to do this
+	   in the mod (4/PI) space we simply subtract 1.  */
+	lfd	fp11,(L(DPone)-L(anchor))(r9)
+	fsub	fp12,fp12,fp11
+
+	/* Now add up all the limbs in order.  */
+	fadd	fp12,fp12,fp7
+	fadd	fp12,fp12,fp8
+	fadd	fp12,fp12,fp9
+
+	/* And finally multiply by pi/4.  */
+	lfd	fp13,(L(pio4)-L(anchor))(r9)
+	fmul	fp1,fp12,fp13
+
+	addi	r7,r7,1
+	b	L(reduced)
+
+L(even_integer):
+	lfd	fp11,(L(DPone)-L(anchor))(r9)
+
+	/* Now add up all the limbs in order.  */
+	fadd	fp12,fp12,fp7
+	fadd	fp12,r12,fp8
+	fadd	fp12,r12,fp9
+
+	/* We need to check if the addition of all the limbs resulted in us
+	   overflowing 1.0.  */
+	fcmpu	0,fp12,fp11
+	bgt	L(greater_than_one)
+
+	/* And finally multiply by pi/4.  */
+	lfd	fp13,(L(pio4)-L(anchor))(r9)
+	fmul	fp1,fp12,fp13
+
+	addi	r7,r7,1
+	b	L(reduced)
+
+L(greater_than_one):
+	/* We did overflow 1.0 when adding up all the limbs. Add 1.0 to our
+	   integer, and subtract 1.0 from our result. Since that makes the
+	   integer component odd, we need to subtract another 1.0 as
+	   explained above.  */
+	addi	r7,r7,1
+
+	lfd	fp11,(L(DPtwo)-L(anchor))(r9)
+	fsub	fp12,fp12,fp11
+
+	/* And finally multiply by pi/4.  */
+	lfd	fp13,(L(pio4)-L(anchor))(r9)
+	fmul	fp1,fp12,fp13
+
+	addi	r7,r7,1
+	b	L(reduced)
+
+	.balign 16
+L(less_2pn5):
+	lis	r4,TWO_PN27@h
+	ori	r4,r4,TWO_PN27@l
+
+	cmpw	r3,r4
+	blt	L(less_2pn27)
+
+	/* A simpler Chebyshev approximation is close enough for this range:
+	   x+x^3*(SS0+x^2*SS1).  */
+
+	lfd	fp10,(L(SS0)-L(anchor))(r9)
+	lfd	fp11,(L(SS1)-L(anchor))(r9)
+
+	fmul	fp2,fp1,fp1		/* x^2 */
+	fmul	fp3,fp2,fp1		/* x^3 */
+
+	fmadd	fp4,fp2,fp11,fp10	/* SS0+x^2*SS1 */
+	fmadd	fp1,fp3,fp4,fp1		/* x+x^3*(SS0+x^2*SS1) */
+
+	frsp	fp1,fp1			/* Round to single precision.  */
+
+	blr
+
+	.balign 16
+L(less_2pn27):
+	cmpwi	r3,0
+	beq	L(zero)
+
+	/* Handle some special cases:
+
+	   sinf(subnormal) raises inexact/underflow
+	   sinf(min_normalized) raises inexact/underflow
+	   sinf(normalized) raises inexact.  */
+
+	lfd	fp2,(L(small)-L(anchor))(r9)
+
+	fmul	fp2,fp1,fp2		/* x * small */
+	fsub	fp1,fp1,fp2		/* x - x * small */
+
+	frsp	fp1,fp1
+
+	blr
+
+	.balign 16
+L(zero):
+	blr
+
+END (__sinf)
+
+	.section .rodata, "a"
+
+	.balign 8
+
+L(anchor):
+
+	/* Chebyshev constants for sin, range -PI/4 - PI/4.  */
+L(S0):	.8byte	0xbfc5555555551cd9
+L(S1):	.8byte	0x3f81111110c2688b
+L(S2):	.8byte	0xbf2a019f8b4bd1f9
+L(S3):	.8byte	0x3ec71d7264e6b5b4
+L(S4):	.8byte	0xbe5a947e1674b58a
+
+	/* Chebyshev constants for sin, range 2^-27 - 2^-5.  */
+L(SS0):	.8byte	0xbfc555555543d49d
+L(SS1):	.8byte	0x3f8110f475cec8c5
+
+	/* Chebyshev constants for cos, range -PI/4 - PI/4.  */
+L(C0):	.8byte	0xbfdffffffffe98ae
+L(C1):	.8byte	0x3fa55555545c50c7
+L(C2):	.8byte	0xbf56c16b348b6874
+L(C3):	.8byte	0x3efa00eb9ac43cc0
+L(C4):	.8byte	0xbe923c97dd8844d7
+
+L(invpio2):
+	.8byte	0x3fe45f306dc9c883	/* 2/PI */
+
+L(invpio4):
+	.8byte	0x3ff45f306dc9c883	/* 4/PI */
+
+L(invpio4_table):
+	.8byte	0x0000000000000000
+	.8byte	0x3ff45f306c000000
+	.8byte	0x3e3c9c882a000000
+	.8byte	0x3c54fe13a8000000
+	.8byte	0x3aaf47d4d0000000
+	.8byte	0x38fbb81b6c000000
+	.8byte	0x3714acc9e0000000
+	.8byte	0x3560e4107c000000
+	.8byte	0x33bca2c756000000
+	.8byte	0x31fbd778ac000000
+	.8byte	0x300b7246e0000000
+	.8byte	0x2e5d2126e8000000
+	.8byte	0x2c97003248000000
+	.8byte	0x2ad77504e8000000
+	.8byte	0x290921cfe0000000
+	.8byte	0x274deb1cb0000000
+	.8byte	0x25829a73e0000000
+	.8byte	0x23fd1046be000000
+	.8byte	0x2224baed10000000
+	.8byte	0x20709d338e000000
+	.8byte	0x1e535a2f80000000
+	.8byte	0x1cef904e64000000
+	.8byte	0x1b0d639830000000
+	.8byte	0x1964ce7d24000000
+	.8byte	0x17b908bf16000000
+
+L(pio4):
+	.8byte	0x3fe921fb54442d18	/* PI/4 */
+
+/* PI/2 as a sum of two doubles. We only use 32 bits of the upper limb
+   to avoid losing significant bits when multiplying with up to
+   (2^22)/(pi/2).  */
+L(pio2hi):
+	.8byte	0xbff921fb54400000
+
+L(pio2lo):
+	.8byte	0xbdd0b4611a626332
+
+L(pio2_table):
+	.8byte	0
+	.8byte	0x3ff921fb54442d18	/* 1 * PI/2 */
+	.8byte	0x400921fb54442d18	/* 2 * PI/2 */
+	.8byte	0x4012d97c7f3321d2	/* 3 * PI/2 */
+	.8byte	0x401921fb54442d18	/* 4 * PI/2 */
+	.8byte	0x401f6a7a2955385e	/* 5 * PI/2 */
+	.8byte	0x4022d97c7f3321d2	/* 6 * PI/2 */
+	.8byte	0x4025fdbbe9bba775	/* 7 * PI/2 */
+	.8byte	0x402921fb54442d18	/* 8 * PI/2 */
+	.8byte	0x402c463abeccb2bb	/* 9 * PI/2 */
+	.8byte	0x402f6a7a2955385e	/* 10 * PI/2 */
+
+L(small):
+	.8byte	0x3cd0000000000000	/* 2^-50 */
+
+L(ones):
+	.8byte	0x3ff0000000000000	/* +1.0 */
+	.8byte	0xbff0000000000000	/* -1.0 */
+
+L(DPhalf):
+	.8byte	0x3fe0000000000000	/* 0.5 */
+
+L(DPone):
+	.8byte	0x3ff0000000000000	/* 1.0 */
+
+L(DPtwo):
+	.8byte	0x4000000000000000	/* 2.0 */
+
+libm_alias_float (__sin, sin)
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/w_expf.c b/sysdeps/powerpc/powerpc64/power8/fpu/w_expf.c
new file mode 100644
index 0000000000..b5fe164520
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/fpu/w_expf.c
@@ -0,0 +1 @@
+#include <sysdeps/../math/w_expf.c>