/* sysdeps/x86_64/fpu/multiarch/s_sincosf-fma.c  */

/* Compute sine and cosine of argument optimized with vector.
   Copyright (C) 2017 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <errno.h>
#include <math.h>
#include <math_private.h>
#include <x86intrin.h>
#include <libm-alias-float.h>
#include "s_sincosf.h"

/* Name under which this file defines the sincosf routine.  */
#define SINCOSF __sincosf_fma

/* SINCOSF_FUNC is the identifier used for the function definition below:
   __sincosf when no SINCOSF override exists, otherwise the override.
   SINCOSF is defined just above, so the override branch is the one taken
   here.  */
#ifndef SINCOSF
# define SINCOSF_FUNC __sincosf
#else
# define SINCOSF_FUNC SINCOSF
#endif

/* Chebyshev constants for sin and cos, range -PI/4 - PI/4.
   Each vector carries one coefficient per function: element 0 belongs to
   the sine polynomial and element 1 to the cosine polynomial, so a single
   vector Horner step advances both evaluations.  V0 is the lowest-order
   coefficient and V4 the highest-order one.  */
static const __v2df V0 = { -0x1.5555555551cd9p-3, -0x1.ffffffffe98aep-2};
static const __v2df V1 = { 0x1.1111110c2688bp-7, 0x1.55555545c50c7p-5 };
static const __v2df V2 = { -0x1.a019f8b4bd1f9p-13, -0x1.6c16b348b6874p-10 };
static const __v2df V3 = { 0x1.71d7264e6b5b4p-19, 0x1.a00eb9ac43ccp-16 };
static const __v2df V4 = { -0x1.a947e1674b58ap-26, -0x1.23c97dd8844d7p-22 };

/* Chebyshev constants for sin and cos, range 2^-27 - 2^-5.
   Same element layout as above (element 0 sine, element 1 cosine); these
   feed the shorter two-coefficient polynomial.  */
static const __v2df VC0 = { -0x1.555555543d49dp-3, -0x1.fffffff5cc6fdp-2 };
static const __v2df VC1 = { 0x1.110f475cec8c5p-7, 0x1.55514b178dac5p-5 };

/* Vector of ones; added to the polynomial to form the cosine result
   (1 + poly) in element 1.  */
static const __v2df v2ones = { 1.0, 1.0 };

/* Evaluate the sine and the cosine together for an argument that has
   already been range reduced.

   THETA is the range reduced absolute value of the input; callers are
   expected to keep it within Pi/4 in magnitude.
   N is trunc (|x| / (Pi/4)) + 1 as computed by the caller.  Bit 1 of N
   decides whether the sine-form and cosine-form polynomial values have
   to be exchanged between the two outputs, while (N >> 2) & 1 and
   ((N + 2) >> 2) & 1 pick the signs.
   SIGNBIT is the sign bit (0 or 1) of the original input.  Since the
   polynomials work on |x|, it is folded into the sign of the sine
   output only.
   The results are narrowed to float and stored through SINX and COSX.  */
static void
reduced_sincos (const double theta, const unsigned int n,
		const unsigned int signbit, float *sinx, float *cosx)
{
  const __v2df arg = { theta, theta };
  const __v2df arg2 = arg * arg;

  /* Horner evaluation of both polynomials in one pass: element 0 uses
     the sine coefficients, element 1 the cosine coefficients.  */
  __v2df poly = V3 + arg2 * V4;
  poly = V2 + arg2 * poly;
  poly = V1 + arg2 * poly;
  poly = V0 + arg2 * poly;
  poly = arg2 * poly;

  /* Element 1 holds the cosine form 1 + poly.  Element 0 is replaced by
     the sine form theta + theta * poly.  */
  __v2df pair = v2ones + poly;
  const __v2df sine_form = arg + arg * poly;
  pair[0] = sine_form[0];

  /* Sign factors for the two outputs, looked up in ones[] (defined
     outside this function) with a 0/1 index.  */
  const double sin_sign = ones[((n >> 2) & 1) ^ signbit];
  const double cos_sign = ones[((n + 2) >> 2) & 1];

  if (n & 2)
    {
      /* Exchanged case: the cosine-form value (element 1) is the sine
	 result and the sine-form value (element 0) is the cosine
	 result, so the sign factors swap elements as well.  */
      const __v2df sign = { cos_sign, sin_sign };
      pair *= sign;
      const __v4sf narrowed = _mm_cvtpd_ps (pair);
      *sinx = narrowed[1];
      *cosx = narrowed[0];
      return;
    }

  /* Direct case: element 0 is the sine, element 1 the cosine.  */
  const __v2df sign = { sin_sign, cos_sign };
  pair *= sign;
  const __v4sf narrowed = _mm_cvtpd_ps (pair);
  *sinx = narrowed[0];
  *cosx = narrowed[1];
}

/* Store the sine of X in *SINX and the cosine of X in *COSX.

   The magnitude of X, taken from its bit pattern, selects the method:
     |x| < 2^-27            direct small-argument results;
     2^-27 <= |x| < 2^-5    two-coefficient polynomial (VC0, VC1);
     2^-5 <= |x| < Pi/4     five-coefficient polynomial (V0 .. V4);
     Pi/4 <= |x| < 9*Pi/4   reduction by subtracting pio2_table[n / 2];
     9*Pi/4 <= |x| < 2^23   reduction with PI_2_hi and PI_2_lo;
     2^23 <= |x|, finite    reduction with four invpio4_table entries;
     Inf or NaN             both results are x - x; errno is set to EDOM
			    when x is an infinity.  */
void
SINCOSF_FUNC (float x, float *sinx, float *cosx)
{
  const double xd = x;
  const double absxd = fabs (xd);
  uint32_t bits;
  GET_FLOAT_WORD (bits, x);
  /* Bit pattern of |x|.  */
  const uint32_t mag = bits & 0x7fffffff;

  if (mag < 0x3f490fdb)		/* |x| < Pi/4: no reduction needed.  */
    {
      if (mag < 0x32000000)	/* |x| < 2^-27.  */
	{
	  if (mag == 0)
	    *sinx = xd;
	  else
	    *sinx = xd - (xd * SMALL);
	  *cosx = 1.0 - absxd;
	  return;
	}

      const __v2df arg = { xd, xd };
      const __v2df arg2 = arg * arg;
      __v2df poly;

      if (mag >= 0x3d000000)	/* |x| >= 2^-5.  */
	{
	  /* Full Horner evaluation; element 0 uses the sine
	     coefficients and element 1 the cosine coefficients.  */
	  poly = V3 + arg2 * V4;
	  poly = V2 + arg2 * poly;
	  poly = V1 + arg2 * poly;
	  poly = V0 + arg2 * poly;
	}
      else			/* 2^-27 <= |x| < 2^-5.  */
	{
	  /* A shorter approximation is close enough in this range.
	     Both elements evaluate C0 + x^3 * C1 with their own
	     coefficient pair.  */
	  poly = VC0 + arg * arg2 * VC1;
	}

      /* Common tail: cosine form 1 + x^2 * poly in element 1, sine form
	 x + x * (x^2 * poly) placed in element 0.  */
      poly = arg2 * poly;
      __v2df pair = v2ones + poly;
      const __v2df sine_form = arg + arg * poly;
      pair[0] = sine_form[0];
      const __v4sf narrowed = _mm_cvtpd_ps (pair);
      *sinx = narrowed[0];
      *cosx = narrowed[1];
      return;
    }

  /* From here on |x| >= Pi/4.  */
  const unsigned int signbit = bits >> 31;

  if (mag >= 0x7f800000)
    {
      if (mag == 0x7f800000)
	__set_errno (EDOM);
      /* sin/cos(Inf or NaN) is NaN.  */
      *sinx = *cosx = x - x;
      return;
    }

  if (mag < 0x40e231d6)		/* |x| < 9*Pi/4.  */
    {
      /* Under FE_UPWARD rounding absxd * inv_PI_4 can come out as 9
	 even though |x| < 9*Pi/4, so n may reach 10 and pio2_table
	 has to provide an entry at index 5.  */
      const unsigned int n = (absxd * inv_PI_4) + 1;
      const double reduced = absxd - pio2_table[n / 2];
      reduced_sincos (reduced, n, signbit, sinx, cosx);
      return;
    }

  if (mag < 0x4b000000)		/* |x| < 2^23.  */
    {
      const unsigned int n = ((unsigned int) (absxd * inv_PI_4)) + 1;
      const double k = n / 2;
      /* Subtract k * Pi/2 in two pieces (high part, then low part).  */
      const double reduced = (absxd - k * PI_2_hi) - k * PI_2_lo;
      reduced_sincos (reduced, n, signbit, sinx, cosx);
      return;
    }

  /* |x| >= 2^23: multiply |x| by four consecutive invpio4_table entries
     chosen from the exponent and accumulate the partial products.  */
  const float ax = fabsf (x);
  int slot = (mag >> FLOAT_EXPONENT_SHIFT) - FLOAT_EXPONENT_BIAS;
  slot += 3;
  slot /= 28;
  double a = invpio4_table[slot] * ax;
  const double b = invpio4_table[slot + 1] * ax;
  const double c = invpio4_table[slot + 2] * ax;
  const double d = invpio4_table[slot + 3] * ax;

  /* Drop whole multiples of 8 from the leading product.  */
  uint64_t q = a;
  q &= ~0x7;
  a -= q;

  /* Q becomes the integer part of a + b; FRAC restarts from a - q and
     the lower-order products are added back one at a time.  */
  double frac = a + b;
  q = frac;
  frac = a - q;

  if (q & 1)
    {
      frac -= 1.0;
      frac += b;
      frac += c;
      frac += d;
    }
  else
    {
      frac += b;
      frac += c;
      frac += d;
      if (!(frac <= 1.0))
	{
	  /* Move on to the next index and shift FRAC accordingly.  */
	  q++;
	  frac -= 2.0;
	}
    }

  /* Scale by Pi/4 to obtain the reduced argument.  */
  frac *= M_PI_4;
  reduced_sincos (frac, q + 1, signbit, sinx, cosx);
}

/* The alias is emitted only when no SINCOSF override is in effect.  This
   file defines SINCOSF near the top, so the line below is not compiled
   here.  NOTE(review): libm_alias_float presumably creates the public
   sincosf alias for __sincosf -- confirm in libm-alias-float.h.  */
#ifndef SINCOSF
libm_alias_float (__sincos, sincos)
#endif