diff options
Diffstat (limited to 'sysdeps/powerpc/powerpc64/power8/fpu')
-rw-r--r-- | sysdeps/powerpc/powerpc64/power8/fpu/Implies | 1 | ||||
-rw-r--r-- | sysdeps/powerpc/powerpc64/power8/fpu/e_expf.S | 303 | ||||
-rw-r--r-- | sysdeps/powerpc/powerpc64/power8/fpu/multiarch/Implies | 1 | ||||
-rw-r--r-- | sysdeps/powerpc/powerpc64/power8/fpu/s_cosf.S | 509 | ||||
-rw-r--r-- | sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S | 4 | ||||
-rw-r--r-- | sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S | 4 | ||||
-rw-r--r-- | sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S | 4 | ||||
-rw-r--r-- | sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S | 28 | ||||
-rw-r--r-- | sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S | 28 | ||||
-rw-r--r-- | sysdeps/powerpc/powerpc64/power8/fpu/s_llroundf.S | 1 | ||||
-rw-r--r-- | sysdeps/powerpc/powerpc64/power8/fpu/s_sinf.S | 520 | ||||
-rw-r--r-- | sysdeps/powerpc/powerpc64/power8/fpu/w_expf.c | 1 |
12 files changed, 1366 insertions, 38 deletions
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/Implies b/sysdeps/powerpc/powerpc64/power8/fpu/Implies deleted file mode 100644 index 1187cdfb0a..0000000000 --- a/sysdeps/powerpc/powerpc64/power8/fpu/Implies +++ /dev/null @@ -1 +0,0 @@ -powerpc/powerpc64/power7/fpu/ diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/e_expf.S b/sysdeps/powerpc/powerpc64/power8/fpu/e_expf.S new file mode 100644 index 0000000000..32ee8326e1 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/power8/fpu/e_expf.S @@ -0,0 +1,303 @@ +/* Optimized expf(). PowerPC64/POWER8 version. + Copyright (C) 2016-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +/* Short algorithm description: + * + * Let K = 64 (table size). + * e^x = 2^(x/log(2)) = 2^n * T[j] * (1 + P(y)) + * where: + * x = m*log(2)/K + y, y in [0.0..log(2)/K] + * m = n*K + j, m,n,j - signed integer, j in [0..K-1] + * values of 2^(j/K) are tabulated as T[j]. + * + * P(y) is a minimax polynomial approximation of expf(y)-1 + * on small interval [0.0..log(2)/K]. + * + * P(y) = P3*y*y*y*y + P2*y*y*y + P1*y*y + P0*y, calculated as + * z = y*y; P(y) = (P3*z + P1)*z + (P2*z + P0)*y + * + * Special cases: + * expf(NaN) = NaN + * expf(+INF) = +INF + * expf(-INF) = 0 + * expf(x) = 1 for subnormals + * for finite argument, only expf(0)=1 is exact + * expf(x) overflows if x>88.7228317260742190 + * expf(x) underflows if x<-103.972076416015620 + */ + +#define C1 0x42ad496b /* Single precision 125*log(2). */ +#define C2 0x31800000 /* Single precision 2^(-28). */ +#define SP_INF 0x7f800000 /* Single precision Inf. */ +#define SP_EXP_BIAS 0x1fc0 /* Single precision exponent bias. */ + +#define DATA_OFFSET r9 + +/* Implements the function + + float [fp1] expf (float [fp1] x) */ + + .machine power8 +ENTRY (__ieee754_expf, 4) + addis DATA_OFFSET,r2,.Lanchor@toc@ha + addi DATA_OFFSET,DATA_OFFSET,.Lanchor@toc@l + + xscvdpspn v0,v1 + mfvsrd r8,v0 /* r8 = x */ + lfd fp2,(.KLN2-.Lanchor)(DATA_OFFSET) + lfd fp3,(.P2-.Lanchor)(DATA_OFFSET) + rldicl r3,r8,32,33 /* r3 = |x| */ + lis r4,C1@ha /* r4 = 125*log(2) */ + ori r4,r4,C1@l + cmpw r3,r4 + lfd fp5,(.P3-.Lanchor)(DATA_OFFSET) + lfd fp4,(.RS-.Lanchor)(DATA_OFFSET) + fmadd fp2,fp1,fp2,fp4 /* fp2 = x * K/log(2) + (2^23 + 2^22) */ + bge L(special_paths) /* |x| >= 125*log(2) ? */ + + lis r4,C2@ha + ori r4,r4,C2@l + cmpw r3,r4 + blt L(small_args) /* |x| < 2^(-28) ? */ + + /* Main path: here if 2^(-28) <= |x| < 125*log(2) */ + frsp fp6,fp2 + xscvdpsp v2,v2 + mfvsrd r8,v2 + mr r3,r8 /* r3 = m */ + rldicl r8,r8,32,58 /* r8 = j */ + lfs fp4,(.SP_RS-.Lanchor)(DATA_OFFSET) + fsubs fp2,fp6,fp4 /* fp2 = m = x * K/log(2) */ + srdi r3,r3,32 + clrrwi r3,r3,6 /* r3 = n */ + lfd fp6,(.NLN2K-.Lanchor)(DATA_OFFSET) + fmadd fp0,fp2,fp6,fp1 /* fp0 = y = x - m*log(2)/K */ + fmul fp2,fp0,fp0 /* fp2 = z = y^2 */ + lfd fp4,(.P1-.Lanchor)(DATA_OFFSET) + lfd fp6,(.P0-.Lanchor)(DATA_OFFSET) + lis r4,SP_EXP_BIAS@ha + ori r4,r4,SP_EXP_BIAS@l + add r3,r3,r4 + rldic r3,r3,49,1 /* r3 = 2^n */ + fmadd fp4,fp5,fp2,fp4 /* fp4 = P3 * z + P1 */ + fmadd fp6,fp3,fp2,fp6 /* fp6 = P2 * z + P0 */ + mtvsrd v1,r3 + xscvspdp v1,v1 + fmul fp4,fp4,fp2 /* fp4 = (P3 * z + P1)*z */ + fmadd fp0,fp0,fp6,fp4 /* fp0 = P(y) */ + sldi r8,r8,3 /* Access doublewords from T[j]. */ + addi r6,DATA_OFFSET,(.Ttable-.Lanchor) + lfdx fp3,r6,r8 + fmadd fp0,fp0,fp3,fp3 /* fp0 = T[j] * (1 + P(y)) */ + fmul fp1,fp1,fp0 /* fp1 = 2^n * T[j] * (1 + P(y)) */ + frsp fp1,fp1 + blr + + .align 4 +/* x is either underflow, overflow, infinite or NaN. */ +L(special_paths): + srdi r8,r8,32 + rlwinm r8,r8,3,29,29 /* r8 = 0, if x positive. + r8 = 4, otherwise. */ + addi r6,DATA_OFFSET,(.SPRANGE-.Lanchor) + lwzx r4,r6,r8 /* r4 = .SPRANGE[signbit(x)] */ + cmpw r3,r4 + /* |x| <= .SPRANGE[signbit(x)] */ + ble L(near_under_or_overflow) + + lis r4,SP_INF@ha + ori r4,r4,SP_INF@l + cmpw r3,r4 + bge L(arg_inf_or_nan) /* |x| > Infinite ? */ + + addi r6,DATA_OFFSET,(.SPLARGE_SMALL-.Lanchor) + lfsx fp1,r6,r8 + fmuls fp1,fp1,fp1 + blr + + + .align 4 +L(small_args): + /* expf(x) = 1.0, where |x| < |2^(-28)| */ + lfs fp2,(.SPone-.Lanchor)(DATA_OFFSET) + fadds fp1,fp1,fp2 + blr + + + .align 4 +L(arg_inf_or_nan:) + bne L(arg_nan) + + /* expf(+INF) = +INF + expf(-INF) = 0 */ + addi r6,DATA_OFFSET,(.INF_ZERO-.Lanchor) + lfsx fp1,r6,r8 + blr + + + .align 4 +L(arg_nan): + /* expf(NaN) = NaN */ + fadd fp1,fp1,fp1 + frsp fp1,fp1 + blr + + .align 4 +L(near_under_or_overflow): + frsp fp6,fp2 + xscvdpsp v2,v2 + mfvsrd r8,v2 + mr r3,r8 /* r3 = m */ + rldicl r8,r8,32,58 /* r8 = j */ + lfs fp4,(.SP_RS-.Lanchor)(DATA_OFFSET) + fsubs fp2,fp6,fp4 /* fp2 = m = x * K/log(2) */ + srdi r3,r3,32 + clrrwi r3,r3,6 /* r3 = n */ + lfd fp6,(.NLN2K-.Lanchor)(DATA_OFFSET) + fmadd fp0,fp2,fp6,fp1 /* fp0 = y = x - m*log(2)/K */ + fmul fp2,fp0,fp0 /* fp2 = z = y^2 */ + lfd fp4,(.P1-.Lanchor)(DATA_OFFSET) + lfd fp6,(.P0-.Lanchor)(DATA_OFFSET) + ld r4,(.DP_EXP_BIAS-.Lanchor)(DATA_OFFSET) + add r3,r3,r4 + rldic r3,r3,46,1 /* r3 = 2 */ + fmadd fp4,fp5,fp2,fp4 /* fp4 = P3 * z + P1 */ + fmadd fp6,fp3,fp2,fp6 /* fp6 = P2 * z + P0 */ + mtvsrd v1,r3 + fmul fp4,fp4,fp2 /* fp4 = (P3*z + P1)*z */ + fmadd fp0,fp0,fp6,fp4 /* fp0 = P(y) */ + sldi r8,r8,3 /* Access doublewords from T[j]. */ + addi r6,DATA_OFFSET,(.Ttable-.Lanchor) + lfdx fp3,r6,r8 + fmadd fp0,fp0,fp3,fp3 /* fp0 = T[j] * (1 + T[j]) */ + fmul fp1,fp1,fp0 /* fp1 = 2^n * T[j] * (1 + T[j]) */ + frsp fp1,fp1 + blr +END(__ieee754_expf) + + .section .rodata, "a",@progbits +.Lanchor: + .balign 8 +/* Table T[j] = 2^(j/K). Double precision. */ +.Ttable: + .8byte 0x3ff0000000000000 + .8byte 0x3ff02c9a3e778061 + .8byte 0x3ff059b0d3158574 + .8byte 0x3ff0874518759bc8 + .8byte 0x3ff0b5586cf9890f + .8byte 0x3ff0e3ec32d3d1a2 + .8byte 0x3ff11301d0125b51 + .8byte 0x3ff1429aaea92de0 + .8byte 0x3ff172b83c7d517b + .8byte 0x3ff1a35beb6fcb75 + .8byte 0x3ff1d4873168b9aa + .8byte 0x3ff2063b88628cd6 + .8byte 0x3ff2387a6e756238 + .8byte 0x3ff26b4565e27cdd + .8byte 0x3ff29e9df51fdee1 + .8byte 0x3ff2d285a6e4030b + .8byte 0x3ff306fe0a31b715 + .8byte 0x3ff33c08b26416ff + .8byte 0x3ff371a7373aa9cb + .8byte 0x3ff3a7db34e59ff7 + .8byte 0x3ff3dea64c123422 + .8byte 0x3ff4160a21f72e2a + .8byte 0x3ff44e086061892d + .8byte 0x3ff486a2b5c13cd0 + .8byte 0x3ff4bfdad5362a27 + .8byte 0x3ff4f9b2769d2ca7 + .8byte 0x3ff5342b569d4f82 + .8byte 0x3ff56f4736b527da + .8byte 0x3ff5ab07dd485429 + .8byte 0x3ff5e76f15ad2148 + .8byte 0x3ff6247eb03a5585 + .8byte 0x3ff6623882552225 + .8byte 0x3ff6a09e667f3bcd + .8byte 0x3ff6dfb23c651a2f + .8byte 0x3ff71f75e8ec5f74 + .8byte 0x3ff75feb564267c9 + .8byte 0x3ff7a11473eb0187 + .8byte 0x3ff7e2f336cf4e62 + .8byte 0x3ff82589994cce13 + .8byte 0x3ff868d99b4492ed + .8byte 0x3ff8ace5422aa0db + .8byte 0x3ff8f1ae99157736 + .8byte 0x3ff93737b0cdc5e5 + .8byte 0x3ff97d829fde4e50 + .8byte 0x3ff9c49182a3f090 + .8byte 0x3ffa0c667b5de565 + .8byte 0x3ffa5503b23e255d + .8byte 0x3ffa9e6b5579fdbf + .8byte 0x3ffae89f995ad3ad + .8byte 0x3ffb33a2b84f15fb + .8byte 0x3ffb7f76f2fb5e47 + .8byte 0x3ffbcc1e904bc1d2 + .8byte 0x3ffc199bdd85529c + .8byte 0x3ffc67f12e57d14b + .8byte 0x3ffcb720dcef9069 + .8byte 0x3ffd072d4a07897c + .8byte 0x3ffd5818dcfba487 + .8byte 0x3ffda9e603db3285 + .8byte 0x3ffdfc97337b9b5f + .8byte 0x3ffe502ee78b3ff6 + .8byte 0x3ffea4afa2a490da + .8byte 0x3ffefa1bee615a27 + .8byte 0x3fff50765b6e4540 + .8byte 0x3fffa7c1819e90d8 + +.KLN2: + .8byte 0x40571547652b82fe /* Double precision K/log(2). */ + +/* Double precision polynomial coefficients. */ +.P0: + .8byte 0x3fefffffffffe7c6 +.P1: + .8byte 0x3fe00000008d6118 +.P2: + .8byte 0x3fc55550da752d4f +.P3: + .8byte 0x3fa56420eb78fa85 + +.RS: + .8byte 0x4168000000000000 /* Double precision 2^23 + 2^22. */ +.NLN2K: + .8byte 0xbf862e42fefa39ef /* Double precision -log(2)/K. */ +.DP_EXP_BIAS: + .8byte 0x000000000000ffc0 /* Double precision exponent bias. */ + + .balign 4 +.SPone: + .4byte 0x3f800000 /* Single precision 1.0. */ +.SP_RS: + .4byte 0x4b400000 /* Single precision 2^23 + 2^22. */ + +.SPRANGE: /* Single precision overflow/underflow bounds. */ + .4byte 0x42b17217 /* if x>this bound, then result overflows. */ + .4byte 0x42cff1b4 /* if x<this bound, then result underflows. */ + +.SPLARGE_SMALL: + .4byte 0x71800000 /* 2^100. */ + .4byte 0x0d800000 /* 2^-100. */ + +.INF_ZERO: + .4byte 0x7f800000 /* Single precision Inf. */ + .4byte 0 /* Single precision zero. */ + +strong_alias (__ieee754_expf, __expf_finite) diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/multiarch/Implies b/sysdeps/powerpc/powerpc64/power8/fpu/multiarch/Implies deleted file mode 100644 index 7fd86fdf87..0000000000 --- a/sysdeps/powerpc/powerpc64/power8/fpu/multiarch/Implies +++ /dev/null @@ -1 +0,0 @@ -powerpc/powerpc64/power7/fpu/multiarch diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_cosf.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_cosf.S new file mode 100644 index 0000000000..af71382fb2 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_cosf.S @@ -0,0 +1,509 @@ +/* Optimized cosf(). PowerPC64/POWER8 version. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#define _ERRNO_H 1 +#include <bits/errno.h> +#include <libm-alias-float.h> + +#define FRAMESIZE (FRAME_MIN_SIZE+16) + +#define FLOAT_EXPONENT_SHIFT 23 +#define FLOAT_EXPONENT_BIAS 127 +#define INTEGER_BITS 3 + +#define PI_4 0x3f490fdb /* PI/4 */ +#define NINEPI_4 0x40e231d6 /* 9 * PI/4 */ +#define TWO_PN5 0x3d000000 /* 2^-5 */ +#define TWO_PN27 0x32000000 /* 2^-27 */ +#define INFINITY 0x7f800000 +#define TWO_P23 0x4b000000 /* 2^23 */ +#define FX_FRACTION_1_28 0x9249250 /* 0x100000000 / 28 + 1 */ + + /* Implements the function + + float [fp1] cosf (float [fp1] x) */ + + .machine power8 +ENTRY (__cosf, 4) + addis r9,r2,L(anchor)@toc@ha + addi r9,r9,L(anchor)@toc@l + + lis r4,PI_4@h + ori r4,r4,PI_4@l + + xscvdpspn v0,v1 + mfvsrd r8,v0 + rldicl r3,r8,32,33 /* Remove sign bit. */ + + cmpw r3,r4 + bge L(greater_or_equal_pio4) + + lis r4,TWO_PN5@h + ori r4,r4,TWO_PN5@l + + cmpw r3,r4 + blt L(less_2pn5) + + /* Chebyshev polynomial of the form: + * 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))). */ + + lfd fp9,(L(C0)-L(anchor))(r9) + lfd fp10,(L(C1)-L(anchor))(r9) + lfd fp11,(L(C2)-L(anchor))(r9) + lfd fp12,(L(C3)-L(anchor))(r9) + lfd fp13,(L(C4)-L(anchor))(r9) + + fmul fp2,fp1,fp1 /* x^2 */ + lfd fp3,(L(DPone)-L(anchor))(r9) + + fmadd fp4,fp2,fp13,fp12 /* C3+x^2*C4 */ + fmadd fp4,fp2,fp4,fp11 /* C2+x^2*(C3+x^2*C4) */ + fmadd fp4,fp2,fp4,fp10 /* C1+x^2*(C2+x^2*(C3+x^2*C4)) */ + fmadd fp4,fp2,fp4,fp9 /* C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4))) */ + fmadd fp1,fp2,fp4,fp3 /* 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))) */ + frsp fp1,fp1 /* Round to single precision. */ + + blr + + .balign 16 +L(greater_or_equal_pio4): + lis r4,NINEPI_4@h + ori r4,r4,NINEPI_4@l + cmpw r3,r4 + bge L(greater_or_equal_9pio4) + + /* Calculate quotient of |x|/(PI/4). */ + lfd fp2,(L(invpio4)-L(anchor))(r9) + fabs fp1,fp1 /* |x| */ + fmul fp2,fp1,fp2 /* |x|/(PI/4) */ + fctiduz fp2,fp2 + mfvsrd r3,v2 /* n = |x| mod PI/4 */ + + /* Now use that quotient to find |x| mod (PI/2). */ + addi r7,r3,1 + rldicr r5,r7,2,60 /* ((n+1) >> 1) << 3 */ + addi r6,r9,(L(pio2_table)-L(anchor)) + lfdx fp4,r5,r6 + fsub fp1,fp1,fp4 + + .balign 16 +L(reduced): + /* Now we are in the range -PI/4 to PI/4. */ + + /* Work out if we are in a positive or negative primary interval. */ + addi r7,r7,2 + rldicl r4,r7,62,63 /* ((n+3) >> 2) & 1 */ + + /* Load a 1.0 or -1.0. */ + addi r5,r9,(L(ones)-L(anchor)) + sldi r4,r4,3 + lfdx fp0,r4,r5 + + /* Are we in the primary interval of sin or cos? */ + andi. r4,r7,0x2 + bne L(cos) + + /* Chebyshev polynomial of the form: + x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))). */ + + lfd fp9,(L(S0)-L(anchor))(r9) + lfd fp10,(L(S1)-L(anchor))(r9) + lfd fp11,(L(S2)-L(anchor))(r9) + lfd fp12,(L(S3)-L(anchor))(r9) + lfd fp13,(L(S4)-L(anchor))(r9) + + fmul fp2,fp1,fp1 /* x^2 */ + fmul fp3,fp2,fp1 /* x^3 */ + + fmadd fp4,fp2,fp13,fp12 /* S3+x^2*S4 */ + fmadd fp4,fp2,fp4,fp11 /* S2+x^2*(S3+x^2*S4) */ + fmadd fp4,fp2,fp4,fp10 /* S1+x^2*(S2+x^2*(S3+x^2*S4)) */ + fmadd fp4,fp2,fp4,fp9 /* S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4))) */ + fmadd fp4,fp3,fp4,fp1 /* x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))) */ + fmul fp4,fp4,fp0 /* Add in the sign. */ + frsp fp1,fp4 /* Round to single precision. */ + + blr + + .balign 16 +L(cos): + /* Chebyshev polynomial of the form: + 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))). */ + + lfd fp9,(L(C0)-L(anchor))(r9) + lfd fp10,(L(C1)-L(anchor))(r9) + lfd fp11,(L(C2)-L(anchor))(r9) + lfd fp12,(L(C3)-L(anchor))(r9) + lfd fp13,(L(C4)-L(anchor))(r9) + + fmul fp2,fp1,fp1 /* x^2 */ + lfd fp3,(L(DPone)-L(anchor))(r9) + + fmadd fp4,fp2,fp13,fp12 /* C3+x^2*C4 */ + fmadd fp4,fp2,fp4,fp11 /* C2+x^2*(C3+x^2*C4) */ + fmadd fp4,fp2,fp4,fp10 /* C1+x^2*(C2+x^2*(C3+x^2*C4)) */ + fmadd fp4,fp2,fp4,fp9 /* C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4))) */ + fmadd fp4,fp2,fp4,fp3 /* 1.0 + x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))) */ + fmul fp4,fp4,fp0 /* Add in the sign. */ + frsp fp1,fp4 /* Round to single precision. */ + + blr + + .balign 16 +L(greater_or_equal_9pio4): + lis r4,INFINITY@h + ori r4,r4,INFINITY@l + cmpw r3,r4 + bge L(inf_or_nan) + + lis r4,TWO_P23@h + ori r4,r4,TWO_P23@l + cmpw r3,r4 + bge L(greater_or_equal_2p23) + + fabs fp1,fp1 /* |x| */ + + /* Calculate quotient of |x|/(PI/4). */ + lfd fp2,(L(invpio4)-L(anchor))(r9) + + lfd fp3,(L(DPone)-L(anchor))(r9) + lfd fp4,(L(DPhalf)-L(anchor))(r9) + fmul fp2,fp1,fp2 /* |x|/(PI/4) */ + friz fp2,fp2 /* n = floor(|x|/(PI/4)) */ + + /* Calculate (n + 1) / 2. */ + fadd fp2,fp2,fp3 /* n + 1 */ + fmul fp3,fp2,fp4 /* (n + 1) / 2 */ + friz fp3,fp3 + + lfd fp4,(L(pio2hi)-L(anchor))(r9) + lfd fp5,(L(pio2lo)-L(anchor))(r9) + + fmul fp6,fp4,fp3 + fadd fp6,fp6,fp1 + fmadd fp1,fp5,fp3,fp6 + + fctiduz fp2,fp2 + mfvsrd r7,v2 /* n + 1 */ + + b L(reduced) + + .balign 16 +L(inf_or_nan): + bne L(skip_errno_setting) /* Is a NAN? */ + + /* We delayed the creation of the stack frame, as well as the saving of + the link register, because only at this point, we are sure that + doing so is actually needed. */ + + stfd fp1,-8(r1) + + /* Save the link register. */ + mflr r0 + std r0,16(r1) + cfi_offset(lr, 16) + + /* Create the stack frame. */ + stdu r1,-FRAMESIZE(r1) + cfi_adjust_cfa_offset(FRAMESIZE) + + bl JUMPTARGET(__errno_location) + nop + + /* Restore the stack frame. */ + addi r1,r1,FRAMESIZE + cfi_adjust_cfa_offset(-FRAMESIZE) + /* Restore the link register. */ + ld r0,16(r1) + mtlr r0 + + lfd fp1,-8(r1) + + /* errno = EDOM */ + li r4,EDOM + stw r4,0(r3) + +L(skip_errno_setting): + fsub fp1,fp1,fp1 /* x - x */ + blr + + .balign 16 +L(greater_or_equal_2p23): + fabs fp1,fp1 + + srwi r4,r3,FLOAT_EXPONENT_SHIFT + subi r4,r4,FLOAT_EXPONENT_BIAS + + /* We reduce the input modulo pi/4, so we need 3 bits of integer + to determine where in 2*pi we are. Index into our array + accordingly. */ + addi r4,r4,INTEGER_BITS + + /* To avoid an expensive divide, for the range we care about (0 - 127) + we can transform x/28 into: + + x/28 = (x * ((0x100000000 / 28) + 1)) >> 32 + + mulhwu returns the top 32 bits of the 64 bit result, doing the + shift for us in the same instruction. The top 32 bits are undefined, + so we have to mask them. */ + + lis r6,FX_FRACTION_1_28@h + ori r6,r6,FX_FRACTION_1_28@l + mulhwu r5,r4,r6 + clrldi r5,r5,32 + + /* Get our pointer into the invpio4_table array. */ + sldi r4,r5,3 + addi r6,r9,(L(invpio4_table)-L(anchor)) + add r4,r4,r6 + + lfd fp2,0(r4) + lfd fp3,8(r4) + lfd fp4,16(r4) + lfd fp5,24(r4) + + fmul fp6,fp2,fp1 + fmul fp7,fp3,fp1 + fmul fp8,fp4,fp1 + fmul fp9,fp5,fp1 + + /* Mask off larger integer bits in highest double word that we don't + care about to avoid losing precision when combining with smaller + values. */ + fctiduz fp10,fp6 + mfvsrd r7,v10 + rldicr r7,r7,0,(63-INTEGER_BITS) + mtvsrd v10,r7 + fcfidu fp10,fp10 /* Integer bits. */ + + fsub fp6,fp6,fp10 /* highest -= integer bits */ + + /* Work out the integer component, rounded down. Use the top two + limbs for this. */ + fadd fp10,fp6,fp7 /* highest + higher */ + + fctiduz fp10,fp10 + mfvsrd r7,v10 + andi. r0,r7,1 + fcfidu fp10,fp10 + + /* Subtract integer component from highest limb. */ + fsub fp12,fp6,fp10 + + beq L(even_integer) + + /* Our integer component is odd, so we are in the -PI/4 to 0 primary + region. We need to shift our result down by PI/4, and to do this + in the mod (4/PI) space we simply subtract 1. */ + lfd fp11,(L(DPone)-L(anchor))(r9) + fsub fp12,fp12,fp11 + + /* Now add up all the limbs in order. */ + fadd fp12,fp12,fp7 + fadd fp12,fp12,fp8 + fadd fp12,fp12,fp9 + + /* And finally multiply by pi/4. */ + lfd fp13,(L(pio4)-L(anchor))(r9) + fmul fp1,fp12,fp13 + + addi r7,r7,1 + b L(reduced) + +L(even_integer): + lfd fp11,(L(DPone)-L(anchor))(r9) + + /* Now add up all the limbs in order. */ + fadd fp12,fp12,fp7 + fadd fp12,r12,fp8 + fadd fp12,r12,fp9 + + /* We need to check if the addition of all the limbs resulted in us + overflowing 1.0. */ + fcmpu 0,fp12,fp11 + bgt L(greater_than_one) + + /* And finally multiply by pi/4. */ + lfd fp13,(L(pio4)-L(anchor))(r9) + fmul fp1,fp12,fp13 + + addi r7,r7,1 + b L(reduced) + +L(greater_than_one): + /* We did overflow 1.0 when adding up all the limbs. Add 1.0 to our + integer, and subtract 1.0 from our result. Since that makes the + integer component odd, we need to subtract another 1.0 as + explained above. */ + addi r7,r7,1 + + lfd fp11,(L(DPtwo)-L(anchor))(r9) + fsub fp12,fp12,fp11 + + /* And finally multiply by pi/4. */ + lfd fp13,(L(pio4)-L(anchor))(r9) + fmul fp1,fp12,fp13 + + addi r7,r7,1 + b L(reduced) + + .balign 16 +L(less_2pn5): + lis r4,TWO_PN27@h + ori r4,r4,TWO_PN27@l + + cmpw r3,r4 + blt L(less_2pn27) + + /* A simpler Chebyshev approximation is close enough for this range: + 1.0+x^2*(CC0+x^3*CC1). */ + + lfd fp10,(L(CC0)-L(anchor))(r9) + lfd fp11,(L(CC1)-L(anchor))(r9) + + fmul fp2,fp1,fp1 /* x^2 */ + fmul fp3,fp2,fp1 /* x^3 */ + lfd fp1,(L(DPone)-L(anchor))(r9) + + fmadd fp4,fp3,fp11,fp10 /* CC0+x^3*CC1 */ + fmadd fp1,fp2,fp4,fp1 /* 1.0+x^2*(CC0+x^3*CC1) */ + + frsp fp1,fp1 /* Round to single precision. */ + + blr + + .balign 16 +L(less_2pn27): + /* Handle some special cases: + + cosf(subnormal) raises inexact + cosf(min_normalized) raises inexact + cosf(normalized) raises inexact. */ + + lfd fp2,(L(DPone)-L(anchor))(r9) + + fabs fp1,fp1 /* |x| */ + fsub fp1,fp2,fp1 /* 1.0-|x| */ + + frsp fp1,fp1 + + blr + +END (__cosf) + + .section .rodata, "a" + + .balign 8 + +L(anchor): + + /* Chebyshev constants for sin, range -PI/4 - PI/4. */ +L(S0): .8byte 0xbfc5555555551cd9 +L(S1): .8byte 0x3f81111110c2688b +L(S2): .8byte 0xbf2a019f8b4bd1f9 +L(S3): .8byte 0x3ec71d7264e6b5b4 +L(S4): .8byte 0xbe5a947e1674b58a + + /* Chebyshev constants for cos, range 2^-27 - 2^-5. */ +L(CC0): .8byte 0xbfdfffffff5cc6fd +L(CC1): .8byte 0x3fa55514b178dac5 + + /* Chebyshev constants for cos, range -PI/4 - PI/4. */ +L(C0): .8byte 0xbfdffffffffe98ae +L(C1): .8byte 0x3fa55555545c50c7 +L(C2): .8byte 0xbf56c16b348b6874 +L(C3): .8byte 0x3efa00eb9ac43cc0 +L(C4): .8byte 0xbe923c97dd8844d7 + +L(invpio2): + .8byte 0x3fe45f306dc9c883 /* 2/PI */ + +L(invpio4): + .8byte 0x3ff45f306dc9c883 /* 4/PI */ + +L(invpio4_table): + .8byte 0x0000000000000000 + .8byte 0x3ff45f306c000000 + .8byte 0x3e3c9c882a000000 + .8byte 0x3c54fe13a8000000 + .8byte 0x3aaf47d4d0000000 + .8byte 0x38fbb81b6c000000 + .8byte 0x3714acc9e0000000 + .8byte 0x3560e4107c000000 + .8byte 0x33bca2c756000000 + .8byte 0x31fbd778ac000000 + .8byte 0x300b7246e0000000 + .8byte 0x2e5d2126e8000000 + .8byte 0x2c97003248000000 + .8byte 0x2ad77504e8000000 + .8byte 0x290921cfe0000000 + .8byte 0x274deb1cb0000000 + .8byte 0x25829a73e0000000 + .8byte 0x23fd1046be000000 + .8byte 0x2224baed10000000 + .8byte 0x20709d338e000000 + .8byte 0x1e535a2f80000000 + .8byte 0x1cef904e64000000 + .8byte 0x1b0d639830000000 + .8byte 0x1964ce7d24000000 + .8byte 0x17b908bf16000000 + +L(pio4): + .8byte 0x3fe921fb54442d18 /* PI/4 */ + +/* PI/2 as a sum of two doubles. We only use 32 bits of the upper limb + to avoid losing significant bits when multiplying with up to + (2^22)/(pi/2). */ +L(pio2hi): + .8byte 0xbff921fb54400000 + +L(pio2lo): + .8byte 0xbdd0b4611a626332 + +L(pio2_table): + .8byte 0 + .8byte 0x3ff921fb54442d18 /* 1 * PI/2 */ + .8byte 0x400921fb54442d18 /* 2 * PI/2 */ + .8byte 0x4012d97c7f3321d2 /* 3 * PI/2 */ + .8byte 0x401921fb54442d18 /* 4 * PI/2 */ + .8byte 0x401f6a7a2955385e /* 5 * PI/2 */ + .8byte 0x4022d97c7f3321d2 /* 6 * PI/2 */ + .8byte 0x4025fdbbe9bba775 /* 7 * PI/2 */ + .8byte 0x402921fb54442d18 /* 8 * PI/2 */ + .8byte 0x402c463abeccb2bb /* 9 * PI/2 */ + .8byte 0x402f6a7a2955385e /* 10 * PI/2 */ + +L(small): + .8byte 0x3cd0000000000000 /* 2^-50 */ + +L(ones): + .8byte 0x3ff0000000000000 /* +1.0 */ + .8byte 0xbff0000000000000 /* -1.0 */ + +L(DPhalf): + .8byte 0x3fe0000000000000 /* 0.5 */ + +L(DPone): + .8byte 0x3ff0000000000000 /* 1.0 */ + +L(DPtwo): + .8byte 0x4000000000000000 /* 2.0 */ + +libm_alias_float (__cos, cos) diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S index 3b0c88e5eb..aac54a9364 100644 --- a/sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S +++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S @@ -1,5 +1,5 @@ /* isfinite(). PowerPC64/POWER8 version. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -23,7 +23,7 @@ /* int [r3] __finite ([fp1] x) */ -EALIGN (__finite, 4, 0) +ENTRY_TOCLESS (__finite, 4) CALL_MCOUNT 0 MFVSRD_R3_V1 lis r9,0x8010 diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S index 4708239689..94746ef068 100644 --- a/sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S +++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S @@ -1,5 +1,5 @@ /* isinf(). PowerPC64/POWER8 version. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -23,7 +23,7 @@ /* int [r3] __isinf([fp1] x) */ -EALIGN (__isinf, 4, 0) +ENTRY_TOCLESS (__isinf, 4) CALL_MCOUNT 0 MFVSRD_R3_V1 lis r9,0x7ff0 /* r9 = 0x7ff0 */ diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S index 0e5c19333c..8aef354f68 100644 --- a/sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S +++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S @@ -1,5 +1,5 @@ /* isnan(). PowerPC64/POWER8 version. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -23,7 +23,7 @@ /* int [r3] __isnan([f1] x) */ -EALIGN (__isnan, 4, 0) +ENTRY_TOCLESS (__isnan, 4) CALL_MCOUNT 0 MFVSRD_R3_V1 lis r9,0x7ff0 diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S index 0af9feee5a..7f18d705a9 100644 --- a/sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S +++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S @@ -1,5 +1,5 @@ /* Round double to long int. POWER8 PowerPC64 version. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -18,11 +18,13 @@ #include <sysdep.h> #include <math_ldbl_opt.h> +#include <libm-alias-float.h> +#include <libm-alias-double.h> #define MFVSRD_R3_V1 .long 0x7c230066 /* mfvsrd r3,vs1 */ /* long long int[r3] __llrint (double x[fp1]) */ -ENTRY (__llrint) +ENTRY_TOCLESS (__llrint) CALL_MCOUNT 0 fctid fp1,fp1 MFVSRD_R3_V1 @@ -30,16 +32,12 @@ ENTRY (__llrint) END (__llrint) strong_alias (__llrint, __lrint) -weak_alias (__llrint, llrint) -weak_alias (__lrint, lrint) - -#ifdef NO_LONG_DOUBLE -strong_alias (__llrint, __llrintl) -weak_alias (__llrint, llrintl) -strong_alias (__lrint, __lrintl) -weak_alias (__lrint, lrintl) -#endif -#if LONG_DOUBLE_COMPAT(libm, GLIBC_2_1) -compat_symbol (libm, __llrint, llrintl, GLIBC_2_1) -compat_symbol (libm, __lrint, lrintl, GLIBC_2_1) -#endif +libm_alias_double (__llrint, llrint) +libm_alias_double (__lrint, lrint) +/* The double version also works for single-precision as both float and + double parameters are passed in 64bit FPRs and both versions are expected + to return [long] long type. */ +strong_alias (__llrint, __llrintf) +libm_alias_float (__llrint, llrint) +strong_alias (__lrint, __lrintf) +libm_alias_float (__lrint, lrint) diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S index af2409deb8..a22fc63bb3 100644 --- a/sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S +++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S @@ -1,5 +1,5 @@ /* llround function. POWER8 PowerPC64 version. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -19,12 +19,14 @@ #include <sysdep.h> #include <endian.h> #include <math_ldbl_opt.h> +#include <libm-alias-float.h> +#include <libm-alias-double.h> #define MFVSRD_R3_V1 .long 0x7c230066 /* mfvsrd r3,vs1 */ /* long long [r3] llround (float x [fp1]) */ -ENTRY (__llround) +ENTRY_TOCLESS (__llround) CALL_MCOUNT 0 frin fp1,fp1 /* Round to nearest +-0.5. */ fctidz fp1,fp1 /* Convert To Integer DW round toward 0. */ @@ -33,16 +35,12 @@ ENTRY (__llround) END (__llround) strong_alias (__llround, __lround) -weak_alias (__llround, llround) -weak_alias (__lround, lround) - -#ifdef NO_LONG_DOUBLE -weak_alias (__llround, llroundl) -strong_alias (__llround, __llroundl) -weak_alias (__lround, lroundl) -strong_alias (__lround, __lroundl) -#endif -#if LONG_DOUBLE_COMPAT(libm, GLIBC_2_1) -compat_symbol (libm, __llround, llroundl, GLIBC_2_1) -compat_symbol (libm, __lround, lroundl, GLIBC_2_1) -#endif +libm_alias_double (__llround, llround) +libm_alias_double (__lround, lround) +/* The double version also works for single-precision as both float and + double parameters are passed in 64bit FPRs and both versions are expected + to return [long] long type. */ +strong_alias (__llround, __llroundf) +libm_alias_float (__llround, llround) +strong_alias (__lround, __lroundf) +libm_alias_float (__lround, lround) diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_llroundf.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_llroundf.S new file mode 100644 index 0000000000..9ea6bd105b --- /dev/null +++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_llroundf.S @@ -0,0 +1 @@ +/* __lroundf is in s_llround.S. */ diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_sinf.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_sinf.S new file mode 100644 index 0000000000..59e613c102 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_sinf.S @@ -0,0 +1,520 @@ +/* Optimized sinf(). PowerPC64/POWER8 version. + Copyright (C) 2016-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#define _ERRNO_H 1 +#include <bits/errno.h> +#include <libm-alias-float.h> + +#define FRAMESIZE (FRAME_MIN_SIZE+16) + +#define FLOAT_EXPONENT_SHIFT 23 +#define FLOAT_EXPONENT_BIAS 127 +#define INTEGER_BITS 3 + +#define PI_4 0x3f490fdb /* PI/4 */ +#define NINEPI_4 0x40e231d6 /* 9 * PI/4 */ +#define TWO_PN5 0x3d000000 /* 2^-5 */ +#define TWO_PN27 0x32000000 /* 2^-27 */ +#define INFINITY 0x7f800000 +#define TWO_P23 0x4b000000 /* 2^27 */ +#define FX_FRACTION_1_28 0x9249250 /* 0x100000000 / 28 + 1 */ + + /* Implements the function + + float [fp1] sinf (float [fp1] x) */ + + .machine power8 +ENTRY (__sinf, 4) + addis r9,r2,L(anchor)@toc@ha + addi r9,r9,L(anchor)@toc@l + + lis r4,PI_4@h + ori r4,r4,PI_4@l + + xscvdpspn v0,v1 + mfvsrd r8,v0 + rldicl r3,r8,32,33 /* Remove sign bit. */ + + cmpw r3,r4 + bge L(greater_or_equal_pio4) + + lis r4,TWO_PN5@h + ori r4,r4,TWO_PN5@l + + cmpw r3,r4 + blt L(less_2pn5) + + /* Chebyshev polynomial of the form: + * x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))). */ + + lfd fp9,(L(S0)-L(anchor))(r9) + lfd fp10,(L(S1)-L(anchor))(r9) + lfd fp11,(L(S2)-L(anchor))(r9) + lfd fp12,(L(S3)-L(anchor))(r9) + lfd fp13,(L(S4)-L(anchor))(r9) + + fmul fp2,fp1,fp1 /* x^2 */ + fmul fp3,fp2,fp1 /* x^3 */ + + fmadd fp4,fp2,fp13,fp12 /* S3+x^2*S4 */ + fmadd fp4,fp2,fp4,fp11 /* S2+x^2*(S3+x^2*S4) */ + fmadd fp4,fp2,fp4,fp10 /* S1+x^2*(S2+x^2*(S3+x^2*S4)) */ + fmadd fp4,fp2,fp4,fp9 /* S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4))) */ + fmadd fp1,fp3,fp4,fp1 /* x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))) */ + frsp fp1,fp1 /* Round to single precision. */ + + blr + + .balign 16 +L(greater_or_equal_pio4): + lis r4,NINEPI_4@h + ori r4,r4,NINEPI_4@l + cmpw r3,r4 + bge L(greater_or_equal_9pio4) + + /* Calculate quotient of |x|/(PI/4). */ + lfd fp2,(L(invpio4)-L(anchor))(r9) + fabs fp1,fp1 /* |x| */ + fmul fp2,fp1,fp2 /* |x|/(PI/4) */ + fctiduz fp2,fp2 + mfvsrd r3,v2 /* n = |x| mod PI/4 */ + + /* Now use that quotient to find |x| mod (PI/2). */ + addi r7,r3,1 + rldicr r5,r7,2,60 /* ((n+1) >> 1) << 3 */ + addi r6,r9,(L(pio2_table)-L(anchor)) + lfdx fp4,r5,r6 + fsub fp1,fp1,fp4 + + .balign 16 +L(reduced): + /* Now we are in the range -PI/4 to PI/4. */ + + /* Work out if we are in a positive or negative primary interval. */ + rldicl r4,r7,62,63 /* ((n+1) >> 2) & 1 */ + + /* We are operating on |x|, so we need to add back the original + sign. */ + rldicl r8,r8,33,63 /* (x >> 31) & 1, ie the sign bit. */ + xor r4,r4,r8 /* 0 if result should be positive, + 1 if negative. */ + + /* Load a 1.0 or -1.0. */ + addi r5,r9,(L(ones)-L(anchor)) + sldi r4,r4,3 + lfdx fp0,r4,r5 + + /* Are we in the primary interval of sin or cos? */ + andi. r4,r7,0x2 + bne L(cos) + + /* Chebyshev polynomial of the form: + x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))). */ + + lfd fp9,(L(S0)-L(anchor))(r9) + lfd fp10,(L(S1)-L(anchor))(r9) + lfd fp11,(L(S2)-L(anchor))(r9) + lfd fp12,(L(S3)-L(anchor))(r9) + lfd fp13,(L(S4)-L(anchor))(r9) + + fmul fp2,fp1,fp1 /* x^2 */ + fmul fp3,fp2,fp1 /* x^3 */ + + fmadd fp4,fp2,fp13,fp12 /* S3+x^2*S4 */ + fmadd fp4,fp2,fp4,fp11 /* S2+x^2*(S3+x^2*S4) */ + fmadd fp4,fp2,fp4,fp10 /* S1+x^2*(S2+x^2*(S3+x^2*S4)) */ + fmadd fp4,fp2,fp4,fp9 /* S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4))) */ + fmadd fp4,fp3,fp4,fp1 /* x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))) */ + fmul fp4,fp4,fp0 /* Add in the sign. */ + frsp fp1,fp4 /* Round to single precision. */ + + blr + + .balign 16 +L(cos): + /* Chebyshev polynomial of the form: + 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))). */ + + lfd fp9,(L(C0)-L(anchor))(r9) + lfd fp10,(L(C1)-L(anchor))(r9) + lfd fp11,(L(C2)-L(anchor))(r9) + lfd fp12,(L(C3)-L(anchor))(r9) + lfd fp13,(L(C4)-L(anchor))(r9) + + fmul fp2,fp1,fp1 /* x^2 */ + lfd fp3,(L(DPone)-L(anchor))(r9) + + fmadd fp4,fp2,fp13,fp12 /* C3+x^2*C4 */ + fmadd fp4,fp2,fp4,fp11 /* C2+x^2*(C3+x^2*C4) */ + fmadd fp4,fp2,fp4,fp10 /* C1+x^2*(C2+x^2*(C3+x^2*C4)) */ + fmadd fp4,fp2,fp4,fp9 /* C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4))) */ + fmadd fp4,fp2,fp4,fp3 /* 1.0 + x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))) */ + fmul fp4,fp4,fp0 /* Add in the sign. */ + frsp fp1,fp4 /* Round to single precision. */ + + blr + + .balign 16 +L(greater_or_equal_9pio4): + lis r4,INFINITY@h + ori r4,r4,INFINITY@l + cmpw r3,r4 + bge L(inf_or_nan) + + lis r4,TWO_P23@h + ori r4,r4,TWO_P23@l + cmpw r3,r4 + bge L(greater_or_equal_2p23) + + fabs fp1,fp1 /* |x| */ + + /* Calculate quotient of |x|/(PI/4). */ + lfd fp2,(L(invpio4)-L(anchor))(r9) + + lfd fp3,(L(DPone)-L(anchor))(r9) + lfd fp4,(L(DPhalf)-L(anchor))(r9) + fmul fp2,fp1,fp2 /* |x|/(PI/4) */ + friz fp2,fp2 /* n = floor(|x|/(PI/4)) */ + + /* Calculate (n + 1) / 2. */ + fadd fp2,fp2,fp3 /* n + 1 */ + fmul fp3,fp2,fp4 /* (n + 1) / 2 */ + friz fp3,fp3 + + lfd fp4,(L(pio2hi)-L(anchor))(r9) + lfd fp5,(L(pio2lo)-L(anchor))(r9) + + fmul fp6,fp4,fp3 + fadd fp6,fp6,fp1 + fmadd fp1,fp5,fp3,fp6 + + fctiduz fp2,fp2 + mfvsrd r7,v2 /* n + 1 */ + + b L(reduced) + + .balign 16 +L(inf_or_nan): + bne L(skip_errno_setting) /* Is a NAN? */ + + /* We delayed the creation of the stack frame, as well as the saving of + the link register, because only at this point, we are sure that + doing so is actually needed. */ + + stfd fp1,-8(r1) + + /* Save the link register. */ + mflr r0 + std r0,16(r1) + cfi_offset(lr, 16) + + /* Create the stack frame. */ + stdu r1,-FRAMESIZE(r1) + cfi_adjust_cfa_offset(FRAMESIZE) + + bl JUMPTARGET(__errno_location) + nop + + /* Restore the stack frame. */ + addi r1,r1,FRAMESIZE + cfi_adjust_cfa_offset(-FRAMESIZE) + /* Restore the link register. */ + ld r0,16(r1) + mtlr r0 + + lfd fp1,-8(r1) + + /* errno = EDOM */ + li r4,EDOM + stw r4,0(r3) + +L(skip_errno_setting): + fsub fp1,fp1,fp1 /* x - x */ + blr + + .balign 16 +L(greater_or_equal_2p23): + fabs fp1,fp1 + + srwi r4,r3,FLOAT_EXPONENT_SHIFT + subi r4,r4,FLOAT_EXPONENT_BIAS + + /* We reduce the input modulo pi/4, so we need 3 bits of integer + to determine where in 2*pi we are. Index into our array + accordingly. */ + addi r4,r4,INTEGER_BITS + + /* To avoid an expensive divide, for the range we care about (0 - 127) + we can transform x/28 into: + + x/28 = (x * ((0x100000000 / 28) + 1)) >> 32 + + mulhwu returns the top 32 bits of the 64 bit result, doing the + shift for us in the same instruction. The top 32 bits are undefined, + so we have to mask them. */ + + lis r6,FX_FRACTION_1_28@h + ori r6,r6,FX_FRACTION_1_28@l + mulhwu r5,r4,r6 + clrldi r5,r5,32 + + /* Get our pointer into the invpio4_table array. */ + sldi r4,r5,3 + addi r6,r9,(L(invpio4_table)-L(anchor)) + add r4,r4,r6 + + lfd fp2,0(r4) + lfd fp3,8(r4) + lfd fp4,16(r4) + lfd fp5,24(r4) + + fmul fp6,fp2,fp1 + fmul fp7,fp3,fp1 + fmul fp8,fp4,fp1 + fmul fp9,fp5,fp1 + + /* Mask off larger integer bits in highest double word that we don't + care about to avoid losing precision when combining with smaller + values. */ + fctiduz fp10,fp6 + mfvsrd r7,v10 + rldicr r7,r7,0,(63-INTEGER_BITS) + mtvsrd v10,r7 + fcfidu fp10,fp10 /* Integer bits. */ + + fsub fp6,fp6,fp10 /* highest -= integer bits */ + + /* Work out the integer component, rounded down. Use the top two + limbs for this. */ + fadd fp10,fp6,fp7 /* highest + higher */ + + fctiduz fp10,fp10 + mfvsrd r7,v10 + andi. r0,r7,1 + fcfidu fp10,fp10 + + /* Subtract integer component from highest limb. */ + fsub fp12,fp6,fp10 + + beq L(even_integer) + + /* Our integer component is odd, so we are in the -PI/4 to 0 primary + region. We need to shift our result down by PI/4, and to do this + in the mod (4/PI) space we simply subtract 1. */ + lfd fp11,(L(DPone)-L(anchor))(r9) + fsub fp12,fp12,fp11 + + /* Now add up all the limbs in order. */ + fadd fp12,fp12,fp7 + fadd fp12,fp12,fp8 + fadd fp12,fp12,fp9 + + /* And finally multiply by pi/4. */ + lfd fp13,(L(pio4)-L(anchor))(r9) + fmul fp1,fp12,fp13 + + addi r7,r7,1 + b L(reduced) + +L(even_integer): + lfd fp11,(L(DPone)-L(anchor))(r9) + + /* Now add up all the limbs in order. */ + fadd fp12,fp12,fp7 + fadd fp12,r12,fp8 + fadd fp12,r12,fp9 + + /* We need to check if the addition of all the limbs resulted in us + overflowing 1.0. */ + fcmpu 0,fp12,fp11 + bgt L(greater_than_one) + + /* And finally multiply by pi/4. */ + lfd fp13,(L(pio4)-L(anchor))(r9) + fmul fp1,fp12,fp13 + + addi r7,r7,1 + b L(reduced) + +L(greater_than_one): + /* We did overflow 1.0 when adding up all the limbs. Add 1.0 to our + integer, and subtract 1.0 from our result. Since that makes the + integer component odd, we need to subtract another 1.0 as + explained above. */ + addi r7,r7,1 + + lfd fp11,(L(DPtwo)-L(anchor))(r9) + fsub fp12,fp12,fp11 + + /* And finally multiply by pi/4. */ + lfd fp13,(L(pio4)-L(anchor))(r9) + fmul fp1,fp12,fp13 + + addi r7,r7,1 + b L(reduced) + + .balign 16 +L(less_2pn5): + lis r4,TWO_PN27@h + ori r4,r4,TWO_PN27@l + + cmpw r3,r4 + blt L(less_2pn27) + + /* A simpler Chebyshev approximation is close enough for this range: + x+x^3*(SS0+x^2*SS1). */ + + lfd fp10,(L(SS0)-L(anchor))(r9) + lfd fp11,(L(SS1)-L(anchor))(r9) + + fmul fp2,fp1,fp1 /* x^2 */ + fmul fp3,fp2,fp1 /* x^3 */ + + fmadd fp4,fp2,fp11,fp10 /* SS0+x^2*SS1 */ + fmadd fp1,fp3,fp4,fp1 /* x+x^3*(SS0+x^2*SS1) */ + + frsp fp1,fp1 /* Round to single precision. */ + + blr + + .balign 16 +L(less_2pn27): + cmpwi r3,0 + beq L(zero) + + /* Handle some special cases: + + sinf(subnormal) raises inexact/underflow + sinf(min_normalized) raises inexact/underflow + sinf(normalized) raises inexact. */ + + lfd fp2,(L(small)-L(anchor))(r9) + + fmul fp2,fp1,fp2 /* x * small */ + fsub fp1,fp1,fp2 /* x - x * small */ + + frsp fp1,fp1 + + blr + + .balign 16 +L(zero): + blr + +END (__sinf) + + .section .rodata, "a" + + .balign 8 + +L(anchor): + + /* Chebyshev constants for sin, range -PI/4 - PI/4. */ +L(S0): .8byte 0xbfc5555555551cd9 +L(S1): .8byte 0x3f81111110c2688b +L(S2): .8byte 0xbf2a019f8b4bd1f9 +L(S3): .8byte 0x3ec71d7264e6b5b4 +L(S4): .8byte 0xbe5a947e1674b58a + + /* Chebyshev constants for sin, range 2^-27 - 2^-5. */ +L(SS0): .8byte 0xbfc555555543d49d +L(SS1): .8byte 0x3f8110f475cec8c5 + + /* Chebyshev constants for cos, range -PI/4 - PI/4. */ +L(C0): .8byte 0xbfdffffffffe98ae +L(C1): .8byte 0x3fa55555545c50c7 +L(C2): .8byte 0xbf56c16b348b6874 +L(C3): .8byte 0x3efa00eb9ac43cc0 +L(C4): .8byte 0xbe923c97dd8844d7 + +L(invpio2): + .8byte 0x3fe45f306dc9c883 /* 2/PI */ + +L(invpio4): + .8byte 0x3ff45f306dc9c883 /* 4/PI */ + +L(invpio4_table): + .8byte 0x0000000000000000 + .8byte 0x3ff45f306c000000 + .8byte 0x3e3c9c882a000000 + .8byte 0x3c54fe13a8000000 + .8byte 0x3aaf47d4d0000000 + .8byte 0x38fbb81b6c000000 + .8byte 0x3714acc9e0000000 + .8byte 0x3560e4107c000000 + .8byte 0x33bca2c756000000 + .8byte 0x31fbd778ac000000 + .8byte 0x300b7246e0000000 + .8byte 0x2e5d2126e8000000 + .8byte 0x2c97003248000000 + .8byte 0x2ad77504e8000000 + .8byte 0x290921cfe0000000 + .8byte 0x274deb1cb0000000 + .8byte 0x25829a73e0000000 + .8byte 0x23fd1046be000000 + .8byte 0x2224baed10000000 + .8byte 0x20709d338e000000 + .8byte 0x1e535a2f80000000 + .8byte 0x1cef904e64000000 + .8byte 0x1b0d639830000000 + .8byte 0x1964ce7d24000000 + .8byte 0x17b908bf16000000 + +L(pio4): + .8byte 0x3fe921fb54442d18 /* PI/4 */ + +/* PI/2 as a sum of two doubles. We only use 32 bits of the upper limb + to avoid losing significant bits when multiplying with up to + (2^22)/(pi/2). */ +L(pio2hi): + .8byte 0xbff921fb54400000 + +L(pio2lo): + .8byte 0xbdd0b4611a626332 + +L(pio2_table): + .8byte 0 + .8byte 0x3ff921fb54442d18 /* 1 * PI/2 */ + .8byte 0x400921fb54442d18 /* 2 * PI/2 */ + .8byte 0x4012d97c7f3321d2 /* 3 * PI/2 */ + .8byte 0x401921fb54442d18 /* 4 * PI/2 */ + .8byte 0x401f6a7a2955385e /* 5 * PI/2 */ + .8byte 0x4022d97c7f3321d2 /* 6 * PI/2 */ + .8byte 0x4025fdbbe9bba775 /* 7 * PI/2 */ + .8byte 0x402921fb54442d18 /* 8 * PI/2 */ + .8byte 0x402c463abeccb2bb /* 9 * PI/2 */ + .8byte 0x402f6a7a2955385e /* 10 * PI/2 */ + +L(small): + .8byte 0x3cd0000000000000 /* 2^-50 */ + +L(ones): + .8byte 0x3ff0000000000000 /* +1.0 */ + .8byte 0xbff0000000000000 /* -1.0 */ + +L(DPhalf): + .8byte 0x3fe0000000000000 /* 0.5 */ + +L(DPone): + .8byte 0x3ff0000000000000 /* 1.0 */ + +L(DPtwo): + .8byte 0x4000000000000000 /* 2.0 */ + +libm_alias_float (__sin, sin) diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/w_expf.c b/sysdeps/powerpc/powerpc64/power8/fpu/w_expf.c new file mode 100644 index 0000000000..b5fe164520 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/power8/fpu/w_expf.c @@ -0,0 +1 @@ +#include <sysdeps/../math/w_expf.c> |