author     Samuel Thibault <samuel.thibault@ens-lyon.org>  2016-08-20 19:14:56 +0200
committer  Samuel Thibault <samuel.thibault@ens-lyon.org>  2016-08-20 19:14:56 +0200
commit     f76453c31593957fec1a99b986bfa5506618b79c (patch)
tree       da353c882fb9b2261c9871bcb9e3876a3e6ed7f6 /sysdeps/x86_64/fpu/multiarch
parent     58695b88a9deaecbcf7794760cc333177edaa2b4 (diff)
parent     78bd7499af46d739ce94410eaeea006e874ca9e5 (diff)
Merge tag 'glibc-2.22' into baseline
The GNU C Library
=================

The GNU C Library version 2.22 is now available.

The GNU C Library is used as *the* C library in the GNU system and in
GNU/Linux systems, as well as many other systems that use Linux as the kernel.

The GNU C Library is primarily designed to be a portable and high performance
C library.  It follows all relevant standards including ISO C11 and
POSIX.1-2008.  It is also internationalized and has one of the most complete
internationalization interfaces known.

The GNU C Library webpage is at http://www.gnu.org/software/libc/

Packages for the 2.22 release may be downloaded from:
        http://ftpmirror.gnu.org/libc/
        http://ftp.gnu.org/gnu/libc/

The mirror list is at http://www.gnu.org/order/ftp.html

NEWS for version 2.22
=====================

* The following bugs are resolved with this release:

  438, 4719, 6544, 6792, 11216, 12836, 13028, 13064, 13151, 13152, 14094, 14292, 14841, 14906, 14958, 15319, 15467, 15790, 15969, 16159, 16339, 16350, 16351, 16352, 16353, 16361, 16512, 16526, 16538, 16559, 16560, 16704, 16783, 16850, 17053, 17090, 17195, 17269, 17293, 17322, 17403, 17475, 17523, 17542, 17569, 17581, 17588, 17596, 17620, 17621, 17628, 17631, 17692, 17711, 17715, 17776, 17779, 17792, 17833, 17836, 17841, 17912, 17916, 17930, 17932, 17944, 17949, 17964, 17965, 17967, 17969, 17977, 17978, 17987, 17991, 17996, 17998, 17999, 18007, 18019, 18020, 18029, 18030, 18032, 18034, 18036, 18038, 18039, 18042, 18043, 18046, 18047, 18049, 18068, 18080, 18093, 18100, 18104, 18110, 18111, 18116, 18125, 18128, 18134, 18138, 18185, 18196, 18197, 18206, 18210, 18211, 18217, 18219, 18220, 18221, 18234, 18244, 18245, 18247, 18287, 18319, 18324, 18333, 18346, 18371, 18383, 18397, 18400, 18409, 18410, 18412, 18418, 18422, 18434, 18444, 18457, 18468, 18469, 18470, 18479, 18483, 18495, 18496, 18497, 18498, 18502, 18507, 18508, 18512, 18513, 18519, 18520, 18522, 18527, 18528, 18529, 18530, 18532, 18533, 18534, 18536, 18539, 18540, 18542, 18544, 18545, 18546, 18547, 18549, 18553, 18557, 18558, 18569, 18583, 18585, 18586, 18592, 18593, 18594, 18602, 18612, 18613, 18619, 18633, 18641, 18643, 18648, 18657, 18676, 18694, 18696.

* Cache information can be queried via the sysconf() function on s390, e.g.
  with _SC_LEVEL1_ICACHE_SIZE as argument.

* A buffer overflow in gethostbyname_r and related functions performing DNS
  requests has been fixed.  If the NSS functions were called with a misaligned
  buffer, the buffer length change due to pointer alignment was not taken into
  account.  This could result in application crashes or, potentially,
  arbitrary code execution, using crafted but syntactically valid DNS
  responses.  (CVE-2015-1781)

* The time zone file parser has been made more robust against crafted time
  zone files, avoiding heap buffer overflows related to the processing of the
  tzh_ttisstdcnt and tzh_ttisgmtcnt fields, and a stack overflow due to large
  time zone data files.  Overly long time zone specifiers in the TZ variable
  no longer result in stack overflows and crashes.

* A powerpc and powerpc64 optimization for TLS, similar to TLS descriptors for
  LD and GD on x86 and x86-64, has been implemented.  You will need
  binutils-2.24 or later to enable this optimization.

* Character encoding and ctype tables were updated to Unicode 7.0.0, using new
  generator scripts contributed by Pravin Satpute and Mike FABIAN (Red Hat).
  These updates cause user-visible changes, such as the fix for bug 17998.

* CVE-2014-8121: The NSS backends shared internal state between the getXXent
  and getXXbyYY NSS calls for the same database, causing a denial-of-service
  condition in some applications.

* Added a vector math library named libmvec with the following vectorized
  x86_64 implementations: cos, cosf, sin, sinf, sincos, sincosf, log, logf,
  exp, expf, pow, powf.  The library can be disabled with --disable-mathvec.
  Use of the functions is enabled with -fopenmp -ffast-math starting from -O1
  for GCC version >= 4.9.0.  The shared library libmvec.so is linked in as
  needed when using -lm (no need to specify -lmvec explicitly for non-static
  builds).  Visit <https://sourceware.org/glibc/wiki/libmvec> for detailed
  information.  (A short usage sketch follows after the contributor list
  below.)

* A new fmemopen implementation has been added with the goal of POSIX
  compliance.  The new implementation fixes the following long-standing
  issues: BZ#6544, BZ#11216, BZ#12836, BZ#13151, BZ#13152, and BZ#14292.  The
  old implementation is still present for use by existing binaries.

* The 32-bit sparc sigaction ABI was inadvertently broken in the 2.20 and 2.21
  releases.  It has been fixed to match 2.19 and older, but binaries built
  against 2.20 and 2.21 might need to be recompiled.  See BZ#18694.

* Port to Native Client running on ARMv7-A (--host=arm-nacl).  Contributed by
  Roland McGrath (Google).

Contributors
============

This release was made possible by the contributions of many people.  The
maintainers are grateful to everyone who has contributed changes or bug
reports.  These include:

Adhemerval Zanella, Alan Modra, Alexandre Oliva, Andreas Schwab, Andrew
Senkevich, Andriy Rysin, Arjun Shankar, Aurelien Jarno, Benno Schulenberg,
Brad Hubbard, Carlos O'Donell, Chris Metcalf, Christian Schmidt, Chung-Lin
Tang, Cong Wang, Cyril Hrubis, Daniel Marjamäki, David S. Miller, Dmitry V.
Levin, Eric Rannaud, Evangelos Foutras, Feng Gao, Florian Weimer, Gleb
Fotengauer-Malinovskiy, H.J. Lu, Igor Zamyatin, J William Piggott, James
Cowgill, James Lemke, John David Anglin, Joseph Myers, Kevin Easton, Khem Raj,
Leonhard Holz, Mark Wielaard, Marko Myllynen, Martin Galvan, Martin Sebor,
Matthew Fortune, Mel Gorman, Mike Frysinger, Miroslav Lichvar, Nathan Lynch,
Ondřej Bílka, Paul Eggert, Paul Pluzhnikov, Pavel Kopyl, Pravin Satpute,
Rajalakshmi Srinivasaraghavan, Rical Jasan, Richard Henderson, Roland McGrath,
Rüdiger Sonderfeld, Samuel Thibault, Siddhesh Poyarekar, Stefan Liebler, Steve
Ellcey, Szabolcs Nagy, Torvald Riegel, Tulio Magno Quites Machado Filho,
Vincent Bernat, Wilco Dijkstra, Yaakov Selkowitz, Zack Weinberg
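As referenced in the libmvec NEWS entry above, here is a minimal usage sketch
(not part of the release notes).  The file name, array size and output are
illustrative; the build flags are the ones listed in that entry.  With
GCC >= 4.9 on x86_64, the loop below may be compiled to calls to the vector
entry points added by this merge, such as _ZGVdN4v_cos on AVX2 hardware,
instead of scalar cos.

/* vec_cos.c -- build: gcc -O1 -fopenmp -ffast-math vec_cos.c -lm  */
#include <math.h>
#include <stdio.h>

#define N 1024

int
main (void)
{
  static double x[N], y[N];

  for (int i = 0; i < N; i++)
    x[i] = i * 0.001;

  /* The simd pragma asks the compiler to vectorize the loop body,
     including the call to cos, using the libmvec vector ABI.  */
#pragma omp simd
  for (int i = 0; i < N; i++)
    y[i] = cos (x[i]);

  printf ("%f\n", y[N - 1]);
  return 0;
}

Whether a vector variant is actually selected depends on the compiler version
and the target ISA; libmvec.so is pulled in automatically through -lm, so no
explicit -lmvec is needed for a dynamically linked build.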
Diffstat (limited to 'sysdeps/x86_64/fpu/multiarch')
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/Makefile  21
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/e_asin.c  1
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/e_atan2.c  1
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/e_exp.c  1
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/e_log.c  1
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/e_pow.c  1
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/s_ceil.S  2
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/s_ceilf.S  2
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/s_floor.S  2
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/s_floorf.S  2
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/s_fma.c  2
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/s_fmaf.c  2
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/s_nearbyint.S  2
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/s_nearbyintf.S  2
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/s_rint.S  2
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/s_rintf.S  2
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core.S  38
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core_sse4.S  223
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core.S  38
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core_avx2.S  207
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S  39
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S  463
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core.S  38
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core_sse4.S  225
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core.S  38
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core_avx2.S  212
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S  39
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core_avx512.S  456
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_d_log2_core.S  38
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_d_log2_core_sse4.S  229
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_d_log4_core.S  38
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_d_log4_core_avx2.S  210
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S  39
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S  468
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core.S  38
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core_sse4.S  432
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core.S  38
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core_avx2.S  387
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S  39
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core_avx512.S  741
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core.S  38
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core_sse4.S  229
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core.S  38
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core_avx2.S  210
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S  39
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S  465
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core.S  38
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core_sse4.S  314
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core.S  38
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core_avx2.S  277
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S  39
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S  593
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S  39
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S  460
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core.S  38
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core_sse4.S  227
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core.S  38
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core_avx2.S  215
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S  39
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S  447
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core.S  38
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core_sse4.S  212
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core.S  38
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core_avx2.S  202
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S  39
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S  416
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core.S  38
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core_sse4.S  194
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core.S  38
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core_avx2.S  184
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S  39
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S  653
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core.S  38
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core_sse4.S  374
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core.S  38
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core_avx2.S  357
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S  39
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S  504
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core.S  38
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core_sse4.S  268
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core.S  38
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core_avx2.S  241
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S  39
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S  479
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core.S  38
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core_sse4.S  224
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core.S  38
-rw-r--r--  sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core_avx2.S  219
88 files changed, 13633 insertions, 10 deletions
diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
index 12b0526e50..86ea473b4f 100644
--- a/sysdeps/x86_64/fpu/multiarch/Makefile
+++ b/sysdeps/x86_64/fpu/multiarch/Makefile
@@ -51,3 +51,24 @@ CFLAGS-slowexp-avx.c = -msse2avx -DSSE2AVX
CFLAGS-s_tan-avx.c = -msse2avx -DSSE2AVX
endif
endif
+
+ifeq ($(subdir),mathvec)
+libmvec-sysdep_routines += svml_d_cos2_core_sse4 svml_d_cos4_core_avx2 \
+ svml_d_cos8_core_avx512 svml_d_sin2_core_sse4 \
+ svml_d_sin4_core_avx2 svml_d_sin8_core_avx512 \
+ svml_d_log2_core_sse4 svml_d_log4_core_avx2 \
+ svml_d_log8_core_avx512 svml_d_sincos2_core_sse4 \
+ svml_d_sincos4_core_avx2 svml_d_sincos8_core_avx512 \
+ svml_s_cosf4_core_sse4 svml_s_cosf8_core_avx2 \
+ svml_s_cosf16_core_avx512 svml_s_sinf4_core_sse4 \
+ svml_s_sinf8_core_avx2 svml_s_sinf16_core_avx512 \
+ svml_s_logf4_core_sse4 svml_s_logf8_core_avx2 \
+ svml_s_logf16_core_avx512 svml_d_exp2_core_sse4 \
+ svml_d_exp4_core_avx2 svml_d_exp8_core_avx512 \
+ svml_s_expf4_core_sse4 svml_s_expf8_core_avx2 \
+ svml_s_expf16_core_avx512 svml_d_pow2_core_sse4 \
+ svml_d_pow4_core_avx2 svml_d_pow8_core_avx512 \
+ svml_s_powf4_core_sse4 svml_s_powf8_core_avx2 \
+ svml_s_powf16_core_avx512 svml_s_sincosf4_core_sse4 \
+ svml_s_sincosf8_core_avx2 svml_s_sincosf16_core_avx512
+endif
diff --git a/sysdeps/x86_64/fpu/multiarch/e_asin.c b/sysdeps/x86_64/fpu/multiarch/e_asin.c
index e742a9c133..55865c02f3 100644
--- a/sysdeps/x86_64/fpu/multiarch/e_asin.c
+++ b/sysdeps/x86_64/fpu/multiarch/e_asin.c
@@ -1,5 +1,6 @@
#ifdef HAVE_FMA4_SUPPORT
# include <init-arch.h>
+# include <math.h>
# include <math_private.h>
extern double __ieee754_acos_sse2 (double);
diff --git a/sysdeps/x86_64/fpu/multiarch/e_atan2.c b/sysdeps/x86_64/fpu/multiarch/e_atan2.c
index 6867c6e64e..547681cb59 100644
--- a/sysdeps/x86_64/fpu/multiarch/e_atan2.c
+++ b/sysdeps/x86_64/fpu/multiarch/e_atan2.c
@@ -1,5 +1,6 @@
#if defined HAVE_FMA4_SUPPORT || defined HAVE_AVX_SUPPORT
# include <init-arch.h>
+# include <math.h>
# include <math_private.h>
extern double __ieee754_atan2_sse2 (double, double);
diff --git a/sysdeps/x86_64/fpu/multiarch/e_exp.c b/sysdeps/x86_64/fpu/multiarch/e_exp.c
index 3c650287c5..d244954056 100644
--- a/sysdeps/x86_64/fpu/multiarch/e_exp.c
+++ b/sysdeps/x86_64/fpu/multiarch/e_exp.c
@@ -1,5 +1,6 @@
#if defined HAVE_FMA4_SUPPORT || defined HAVE_AVX_SUPPORT
# include <init-arch.h>
+# include <math.h>
# include <math_private.h>
extern double __ieee754_exp_sse2 (double);
diff --git a/sysdeps/x86_64/fpu/multiarch/e_log.c b/sysdeps/x86_64/fpu/multiarch/e_log.c
index 05f36680be..98054737bd 100644
--- a/sysdeps/x86_64/fpu/multiarch/e_log.c
+++ b/sysdeps/x86_64/fpu/multiarch/e_log.c
@@ -1,5 +1,6 @@
#if defined HAVE_FMA4_SUPPORT || defined HAVE_AVX_SUPPORT
# include <init-arch.h>
+# include <math.h>
# include <math_private.h>
extern double __ieee754_log_sse2 (double);
diff --git a/sysdeps/x86_64/fpu/multiarch/e_pow.c b/sysdeps/x86_64/fpu/multiarch/e_pow.c
index a740b6c447..433cce0de6 100644
--- a/sysdeps/x86_64/fpu/multiarch/e_pow.c
+++ b/sysdeps/x86_64/fpu/multiarch/e_pow.c
@@ -1,5 +1,6 @@
#ifdef HAVE_FMA4_SUPPORT
# include <init-arch.h>
+# include <math.h>
# include <math_private.h>
extern double __ieee754_pow_sse2 (double, double);
diff --git a/sysdeps/x86_64/fpu/multiarch/s_ceil.S b/sysdeps/x86_64/fpu/multiarch/s_ceil.S
index 866c79684d..00ecede74d 100644
--- a/sysdeps/x86_64/fpu/multiarch/s_ceil.S
+++ b/sysdeps/x86_64/fpu/multiarch/s_ceil.S
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2014 Free Software Foundation, Inc.
+/* Copyright (C) 2011-2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Ulrich Drepper <drepper@gmail.come>, 2011.
diff --git a/sysdeps/x86_64/fpu/multiarch/s_ceilf.S b/sysdeps/x86_64/fpu/multiarch/s_ceilf.S
index 02d5e2b90c..c8ed70553e 100644
--- a/sysdeps/x86_64/fpu/multiarch/s_ceilf.S
+++ b/sysdeps/x86_64/fpu/multiarch/s_ceilf.S
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2014 Free Software Foundation, Inc.
+/* Copyright (C) 2011-2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Ulrich Drepper <drepper@gmail.come>, 2011.
diff --git a/sysdeps/x86_64/fpu/multiarch/s_floor.S b/sysdeps/x86_64/fpu/multiarch/s_floor.S
index 02ece20bb2..952ffaa314 100644
--- a/sysdeps/x86_64/fpu/multiarch/s_floor.S
+++ b/sysdeps/x86_64/fpu/multiarch/s_floor.S
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2014 Free Software Foundation, Inc.
+/* Copyright (C) 2011-2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Ulrich Drepper <drepper@gmail.come>, 2011.
diff --git a/sysdeps/x86_64/fpu/multiarch/s_floorf.S b/sysdeps/x86_64/fpu/multiarch/s_floorf.S
index d485c5f330..c8231e86b3 100644
--- a/sysdeps/x86_64/fpu/multiarch/s_floorf.S
+++ b/sysdeps/x86_64/fpu/multiarch/s_floorf.S
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2014 Free Software Foundation, Inc.
+/* Copyright (C) 2011-2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Ulrich Drepper <drepper@gmail.come>, 2011.
diff --git a/sysdeps/x86_64/fpu/multiarch/s_fma.c b/sysdeps/x86_64/fpu/multiarch/s_fma.c
index 507edc4aba..0963a0b36a 100644
--- a/sysdeps/x86_64/fpu/multiarch/s_fma.c
+++ b/sysdeps/x86_64/fpu/multiarch/s_fma.c
@@ -1,5 +1,5 @@
/* FMA version of fma.
- Copyright (C) 2009-2014 Free Software Foundation, Inc.
+ Copyright (C) 2009-2015 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
diff --git a/sysdeps/x86_64/fpu/multiarch/s_fmaf.c b/sysdeps/x86_64/fpu/multiarch/s_fmaf.c
index e5fc5609cb..6046961f86 100644
--- a/sysdeps/x86_64/fpu/multiarch/s_fmaf.c
+++ b/sysdeps/x86_64/fpu/multiarch/s_fmaf.c
@@ -1,5 +1,5 @@
/* FMA version of fmaf.
- Copyright (C) 2009-2014 Free Software Foundation, Inc.
+ Copyright (C) 2009-2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/multiarch/s_nearbyint.S b/sysdeps/x86_64/fpu/multiarch/s_nearbyint.S
index da9387151e..b5d32b5873 100644
--- a/sysdeps/x86_64/fpu/multiarch/s_nearbyint.S
+++ b/sysdeps/x86_64/fpu/multiarch/s_nearbyint.S
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2014 Free Software Foundation, Inc.
+/* Copyright (C) 2011-2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Ulrich Drepper <drepper@gmail.come>, 2011.
diff --git a/sysdeps/x86_64/fpu/multiarch/s_nearbyintf.S b/sysdeps/x86_64/fpu/multiarch/s_nearbyintf.S
index 265b89f8ae..cd7e177a55 100644
--- a/sysdeps/x86_64/fpu/multiarch/s_nearbyintf.S
+++ b/sysdeps/x86_64/fpu/multiarch/s_nearbyintf.S
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2014 Free Software Foundation, Inc.
+/* Copyright (C) 2011-2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Ulrich Drepper <drepper@gmail.come>, 2011.
diff --git a/sysdeps/x86_64/fpu/multiarch/s_rint.S b/sysdeps/x86_64/fpu/multiarch/s_rint.S
index 0dc4329d94..f52cef65db 100644
--- a/sysdeps/x86_64/fpu/multiarch/s_rint.S
+++ b/sysdeps/x86_64/fpu/multiarch/s_rint.S
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2014 Free Software Foundation, Inc.
+/* Copyright (C) 2011-2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Ulrich Drepper <drepper@gmail.come>, 2011.
diff --git a/sysdeps/x86_64/fpu/multiarch/s_rintf.S b/sysdeps/x86_64/fpu/multiarch/s_rintf.S
index e058eb2b99..e2608d4c4e 100644
--- a/sysdeps/x86_64/fpu/multiarch/s_rintf.S
+++ b/sysdeps/x86_64/fpu/multiarch/s_rintf.S
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2014 Free Software Foundation, Inc.
+/* Copyright (C) 2011-2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Ulrich Drepper <drepper@gmail.come>, 2011.
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core.S
new file mode 100644
index 0000000000..5f67d83bd4
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core.S
@@ -0,0 +1,38 @@
+/* Multiple versions of vectorized cos, vector length is 2.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+ .text
+ENTRY (_ZGVbN2v_cos)
+ .type _ZGVbN2v_cos, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq _ZGVbN2v_cos_sse4(%rip), %rax
+ testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+ jz 2f
+ ret
+2: leaq _ZGVbN2v_cos_sse2(%rip), %rax
+ ret
+END (_ZGVbN2v_cos)
+libmvec_hidden_def (_ZGVbN2v_cos)
+
+#define _ZGVbN2v_cos _ZGVbN2v_cos_sse2
+#include "../svml_d_cos2_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core_sse4.S
new file mode 100644
index 0000000000..4420edcae0
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core_sse4.S
@@ -0,0 +1,223 @@
+/* Function cos vectorized with SSE4.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "svml_d_trig_data.h"
+
+ .text
+ENTRY (_ZGVbN2v_cos_sse4)
+/* ALGORITHM DESCRIPTION:
+
+ ( low accuracy ( < 4ulp ) or enhanced performance
+ ( half of correct mantissa ) implementation )
+
+ Argument representation:
+ arg + Pi/2 = (N*Pi + R)
+
+ Result calculation:
+ cos(arg) = sin(arg+Pi/2) = sin(N*Pi + R) = (-1)^N * sin(R)
+ sin(R) is approximated by corresponding polynomial
+ */
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $320, %rsp
+ movaps %xmm0, %xmm3
+ movq __svml_d_trig_data@GOTPCREL(%rip), %rax
+ movups __dHalfPI(%rax), %xmm2
+
+/* ARGUMENT RANGE REDUCTION:
+ Add Pi/2 to argument: X' = X+Pi/2
+ */
+ addpd %xmm3, %xmm2
+ movups __dInvPI(%rax), %xmm5
+ movups __dAbsMask(%rax), %xmm4
+
+/* Get absolute argument value: X' = |X'| */
+ andps %xmm2, %xmm4
+
+/* Y = X'*InvPi + RS : right shifter add */
+ mulpd %xmm5, %xmm2
+
+/* Check for large arguments path */
+ cmpnlepd __dRangeVal(%rax), %xmm4
+ movups __dRShifter(%rax), %xmm6
+ addpd %xmm6, %xmm2
+ movmskpd %xmm4, %ecx
+
+/* N = Y - RS : right shifter sub */
+ movaps %xmm2, %xmm1
+
+/* SignRes = Y<<63 : shift LSB to MSB place for result sign */
+ psllq $63, %xmm2
+ subpd %xmm6, %xmm1
+
+/* N = N - 0.5 */
+ subpd __dOneHalf(%rax), %xmm1
+ movups __dPI1(%rax), %xmm7
+
+/* R = X - N*Pi1 */
+ mulpd %xmm1, %xmm7
+ movups __dPI2(%rax), %xmm4
+
+/* R = R - N*Pi2 */
+ mulpd %xmm1, %xmm4
+ subpd %xmm7, %xmm0
+ movups __dPI3(%rax), %xmm5
+
+/* R = R - N*Pi3 */
+ mulpd %xmm1, %xmm5
+ subpd %xmm4, %xmm0
+
+/* R = R - N*Pi4 */
+ movups __dPI4(%rax), %xmm6
+ mulpd %xmm6, %xmm1
+ subpd %xmm5, %xmm0
+ subpd %xmm1, %xmm0
+
+/* POLYNOMIAL APPROXIMATION: R2 = R*R */
+ movaps %xmm0, %xmm4
+ mulpd %xmm0, %xmm4
+ movups __dC7(%rax), %xmm1
+ mulpd %xmm4, %xmm1
+ addpd __dC6(%rax), %xmm1
+ mulpd %xmm4, %xmm1
+ addpd __dC5(%rax), %xmm1
+ mulpd %xmm4, %xmm1
+ addpd __dC4(%rax), %xmm1
+
+/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */
+ mulpd %xmm4, %xmm1
+ addpd __dC3(%rax), %xmm1
+
+/* Poly = R+R*(R2*(C1+R2*(C2+R2*Poly))) */
+ mulpd %xmm4, %xmm1
+ addpd __dC2(%rax), %xmm1
+ mulpd %xmm4, %xmm1
+ addpd __dC1(%rax), %xmm1
+ mulpd %xmm1, %xmm4
+ mulpd %xmm0, %xmm4
+ addpd %xmm4, %xmm0
+
+/* RECONSTRUCTION:
+ Final sign setting: Res = Poly^SignRes */
+ xorps %xmm2, %xmm0
+ testl %ecx, %ecx
+ jne .LBL_1_3
+
+.LBL_1_2:
+ cfi_remember_state
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+
+.LBL_1_3:
+ cfi_restore_state
+ movups %xmm3, 192(%rsp)
+ movups %xmm0, 256(%rsp)
+ je .LBL_1_2
+
+ xorb %dl, %dl
+ xorl %eax, %eax
+ movups %xmm8, 112(%rsp)
+ movups %xmm9, 96(%rsp)
+ movups %xmm10, 80(%rsp)
+ movups %xmm11, 64(%rsp)
+ movups %xmm12, 48(%rsp)
+ movups %xmm13, 32(%rsp)
+ movups %xmm14, 16(%rsp)
+ movups %xmm15, (%rsp)
+ movq %rsi, 136(%rsp)
+ movq %rdi, 128(%rsp)
+ movq %r12, 168(%rsp)
+ cfi_offset_rel_rsp (12, 168)
+ movb %dl, %r12b
+ movq %r13, 160(%rsp)
+ cfi_offset_rel_rsp (13, 160)
+ movl %ecx, %r13d
+ movq %r14, 152(%rsp)
+ cfi_offset_rel_rsp (14, 152)
+ movl %eax, %r14d
+ movq %r15, 144(%rsp)
+ cfi_offset_rel_rsp (15, 144)
+ cfi_remember_state
+
+.LBL_1_6:
+ btl %r14d, %r13d
+ jc .LBL_1_12
+
+.LBL_1_7:
+ lea 1(%r14), %esi
+ btl %esi, %r13d
+ jc .LBL_1_10
+
+.LBL_1_8:
+ incb %r12b
+ addl $2, %r14d
+ cmpb $16, %r12b
+ jb .LBL_1_6
+
+ movups 112(%rsp), %xmm8
+ movups 96(%rsp), %xmm9
+ movups 80(%rsp), %xmm10
+ movups 64(%rsp), %xmm11
+ movups 48(%rsp), %xmm12
+ movups 32(%rsp), %xmm13
+ movups 16(%rsp), %xmm14
+ movups (%rsp), %xmm15
+ movq 136(%rsp), %rsi
+ movq 128(%rsp), %rdi
+ movq 168(%rsp), %r12
+ cfi_restore (%r12)
+ movq 160(%rsp), %r13
+ cfi_restore (%r13)
+ movq 152(%rsp), %r14
+ cfi_restore (%r14)
+ movq 144(%rsp), %r15
+ cfi_restore (%r15)
+ movups 256(%rsp), %xmm0
+ jmp .LBL_1_2
+
+.LBL_1_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ shlq $4, %r15
+ movsd 200(%rsp,%r15), %xmm0
+
+ call cos@PLT
+
+ movsd %xmm0, 264(%rsp,%r15)
+ jmp .LBL_1_8
+
+.LBL_1_12:
+ movzbl %r12b, %r15d
+ shlq $4, %r15
+ movsd 192(%rsp,%r15), %xmm0
+
+ call cos@PLT
+
+ movsd %xmm0, 256(%rsp,%r15)
+ jmp .LBL_1_7
+
+END (_ZGVbN2v_cos_sse4)
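A scalar sketch of the reduction and reconstruction scheme documented in the
comment block of _ZGVbN2v_cos_sse4 above may help when reading the SIMD code.
The Pi splitting and the polynomial degree are deliberately simplified here
and are not the constants stored in __svml_d_trig_data; this is a low-accuracy
illustration, not a drop-in replacement.

/* cos_sketch.c -- build: gcc cos_sketch.c -lm  */
#include <math.h>
#include <stdio.h>

static double
cos_sketch (double x)
{
  /* arg + Pi/2 = N*Pi + R, so cos(x) = sin(x + Pi/2) = (-1)^N * sin(R).  */
  double y = floor ((x + M_PI_2) / M_PI + 0.5);  /* N as an integer value.  */
  double n = y - 0.5;                            /* N - 0.5, as in the code.  */
  double r = x - n * M_PI;                       /* reduced argument R.  */

  /* Truncated odd polynomial for sin(R): R + R*R2*(C1 + R2*C2).  */
  double r2 = r * r;
  double poly = r + r * r2 * (-1.0 / 6.0 + r2 * (1.0 / 120.0));

  /* Result sign comes from the low bit of N (the Y<<63 step above).  */
  return ((long long) y & 1) ? -poly : poly;
}

int
main (void)
{
  for (double x = -2.0; x <= 2.0; x += 0.5)
    printf ("% .1f  % .6f  % .6f\n", x, cos_sketch (x), cos (x));
  return 0;
}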
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core.S
new file mode 100644
index 0000000000..5babb834ad
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core.S
@@ -0,0 +1,38 @@
+/* Multiple versions of vectorized cos, vector length is 4.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+ .text
+ENTRY (_ZGVdN4v_cos)
+ .type _ZGVdN4v_cos, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq _ZGVdN4v_cos_avx2(%rip), %rax
+ testl $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+ jz 2f
+ ret
+2: leaq _ZGVdN4v_cos_sse_wrapper(%rip), %rax
+ ret
+END (_ZGVdN4v_cos)
+libmvec_hidden_def (_ZGVdN4v_cos)
+
+#define _ZGVdN4v_cos _ZGVdN4v_cos_sse_wrapper
+#include "../svml_d_cos4_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core_avx2.S
new file mode 100644
index 0000000000..9a776e7df7
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core_avx2.S
@@ -0,0 +1,207 @@
+/* Function cos vectorized with AVX2.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "svml_d_trig_data.h"
+
+ .text
+ENTRY (_ZGVdN4v_cos_avx2)
+
+/* ALGORITHM DESCRIPTION:
+
+ ( low accuracy ( < 4ulp ) or enhanced performance
+ ( half of correct mantissa ) implementation )
+
+ Argument representation:
+ arg + Pi/2 = (N*Pi + R)
+
+ Result calculation:
+ cos(arg) = sin(arg+Pi/2) = sin(N*Pi + R) = (-1)^N * sin(R)
+ sin(R) is approximated by corresponding polynomial
+ */
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $448, %rsp
+ movq __svml_d_trig_data@GOTPCREL(%rip), %rax
+ vmovapd %ymm0, %ymm1
+ vmovupd __dInvPI(%rax), %ymm4
+ vmovupd __dRShifter(%rax), %ymm5
+
+/*
+ ARGUMENT RANGE REDUCTION:
+ Add Pi/2 to argument: X' = X+Pi/2
+ */
+ vaddpd __dHalfPI(%rax), %ymm1, %ymm7
+
+/* Get absolute argument value: X' = |X'| */
+ vandpd __dAbsMask(%rax), %ymm7, %ymm2
+
+/* Y = X'*InvPi + RS : right shifter add */
+ vfmadd213pd %ymm5, %ymm4, %ymm7
+ vmovupd __dC7(%rax), %ymm4
+
+/* Check for large arguments path */
+ vcmpnle_uqpd __dRangeVal(%rax), %ymm2, %ymm3
+
+/* N = Y - RS : right shifter sub */
+ vsubpd %ymm5, %ymm7, %ymm6
+ vmovupd __dPI1_FMA(%rax), %ymm2
+
+/* SignRes = Y<<63 : shift LSB to MSB place for result sign */
+ vpsllq $63, %ymm7, %ymm7
+
+/* N = N - 0.5 */
+ vsubpd __dOneHalf(%rax), %ymm6, %ymm0
+ vmovmskpd %ymm3, %ecx
+
+/* R = X - N*Pi1 */
+ vmovapd %ymm1, %ymm3
+ vfnmadd231pd %ymm0, %ymm2, %ymm3
+
+/* R = R - N*Pi2 */
+ vfnmadd231pd __dPI2_FMA(%rax), %ymm0, %ymm3
+
+/* R = R - N*Pi3 */
+ vfnmadd132pd __dPI3_FMA(%rax), %ymm3, %ymm0
+
+/* POLYNOMIAL APPROXIMATION: R2 = R*R */
+ vmulpd %ymm0, %ymm0, %ymm5
+ vfmadd213pd __dC6(%rax), %ymm5, %ymm4
+ vfmadd213pd __dC5(%rax), %ymm5, %ymm4
+ vfmadd213pd __dC4(%rax), %ymm5, %ymm4
+
+/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */
+ vfmadd213pd __dC3(%rax), %ymm5, %ymm4
+
+/* Poly = R+R*(R2*(C1+R2*(C2+R2*Poly))) */
+ vfmadd213pd __dC2(%rax), %ymm5, %ymm4
+ vfmadd213pd __dC1(%rax), %ymm5, %ymm4
+ vmulpd %ymm5, %ymm4, %ymm6
+ vfmadd213pd %ymm0, %ymm0, %ymm6
+
+/*
+ RECONSTRUCTION:
+ Final sign setting: Res = Poly^SignRes */
+ vxorpd %ymm7, %ymm6, %ymm0
+ testl %ecx, %ecx
+ jne .LBL_1_3
+
+.LBL_1_2:
+ cfi_remember_state
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+
+.LBL_1_3:
+ cfi_restore_state
+ vmovupd %ymm1, 320(%rsp)
+ vmovupd %ymm0, 384(%rsp)
+ je .LBL_1_2
+
+ xorb %dl, %dl
+ xorl %eax, %eax
+ vmovups %ymm8, 224(%rsp)
+ vmovups %ymm9, 192(%rsp)
+ vmovups %ymm10, 160(%rsp)
+ vmovups %ymm11, 128(%rsp)
+ vmovups %ymm12, 96(%rsp)
+ vmovups %ymm13, 64(%rsp)
+ vmovups %ymm14, 32(%rsp)
+ vmovups %ymm15, (%rsp)
+ movq %rsi, 264(%rsp)
+ movq %rdi, 256(%rsp)
+ movq %r12, 296(%rsp)
+ cfi_offset_rel_rsp (12, 296)
+ movb %dl, %r12b
+ movq %r13, 288(%rsp)
+ cfi_offset_rel_rsp (13, 288)
+ movl %ecx, %r13d
+ movq %r14, 280(%rsp)
+ cfi_offset_rel_rsp (14, 280)
+ movl %eax, %r14d
+ movq %r15, 272(%rsp)
+ cfi_offset_rel_rsp (15, 272)
+ cfi_remember_state
+
+.LBL_1_6:
+ btl %r14d, %r13d
+ jc .LBL_1_12
+
+.LBL_1_7:
+ lea 1(%r14), %esi
+ btl %esi, %r13d
+ jc .LBL_1_10
+
+.LBL_1_8:
+ incb %r12b
+ addl $2, %r14d
+ cmpb $16, %r12b
+ jb .LBL_1_6
+
+ vmovups 224(%rsp), %ymm8
+ vmovups 192(%rsp), %ymm9
+ vmovups 160(%rsp), %ymm10
+ vmovups 128(%rsp), %ymm11
+ vmovups 96(%rsp), %ymm12
+ vmovups 64(%rsp), %ymm13
+ vmovups 32(%rsp), %ymm14
+ vmovups (%rsp), %ymm15
+ vmovupd 384(%rsp), %ymm0
+ movq 264(%rsp), %rsi
+ movq 256(%rsp), %rdi
+ movq 296(%rsp), %r12
+ cfi_restore (%r12)
+ movq 288(%rsp), %r13
+ cfi_restore (%r13)
+ movq 280(%rsp), %r14
+ cfi_restore (%r14)
+ movq 272(%rsp), %r15
+ cfi_restore (%r15)
+ jmp .LBL_1_2
+
+.LBL_1_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ shlq $4, %r15
+ vmovsd 328(%rsp,%r15), %xmm0
+ vzeroupper
+
+ call cos@PLT
+
+ vmovsd %xmm0, 392(%rsp,%r15)
+ jmp .LBL_1_8
+
+.LBL_1_12:
+ movzbl %r12b, %r15d
+ shlq $4, %r15
+ vmovsd 320(%rsp,%r15), %xmm0
+ vzeroupper
+
+ call cos@PLT
+
+ vmovsd %xmm0, 384(%rsp,%r15)
+ jmp .LBL_1_7
+
+END (_ZGVdN4v_cos_avx2)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S
new file mode 100644
index 0000000000..d0f4f27f46
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S
@@ -0,0 +1,39 @@
+/* Multiple versions of vectorized cos, vector length is 8.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+ .text
+ENTRY (_ZGVeN8v_cos)
+ .type _ZGVeN8v_cos, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq _ZGVeN8v_cos_skx(%rip), %rax
+ testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
+ jnz 2f
+ leaq _ZGVeN8v_cos_knl(%rip), %rax
+ testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
+ jnz 2f
+ leaq _ZGVeN8v_cos_avx2_wrapper(%rip), %rax
+2: ret
+END (_ZGVeN8v_cos)
+
+#define _ZGVeN8v_cos _ZGVeN8v_cos_avx2_wrapper
+#include "../svml_d_cos8_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
new file mode 100644
index 0000000000..b376155210
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
@@ -0,0 +1,463 @@
+/* Function cos vectorized with AVX-512, KNL and SKX versions.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "svml_d_trig_data.h"
+#include "svml_d_wrapper_impl.h"
+
+ .text
+ENTRY (_ZGVeN8v_cos_knl)
+#ifndef HAVE_AVX512_ASM_SUPPORT
+WRAPPER_IMPL_AVX512 _ZGVdN4v_cos
+#else
+/*
+ ALGORITHM DESCRIPTION:
+
+ ( low accuracy ( < 4ulp ) or enhanced performance
+ ( half of correct mantissa ) implementation )
+
+ Argument representation:
+ arg + Pi/2 = (N*Pi + R)
+
+ Result calculation:
+ cos(arg) = sin(arg+Pi/2) = sin(N*Pi + R) = (-1)^N * sin(R)
+ sin(R) is approximated by corresponding polynomial
+ */
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $1280, %rsp
+ movq __svml_d_trig_data@GOTPCREL(%rip), %rax
+
+/* R = X - N*Pi1 */
+ vmovaps %zmm0, %zmm7
+
+/* Check for large arguments path */
+ movq $-1, %rcx
+
+/*
+ ARGUMENT RANGE REDUCTION:
+ Add Pi/2 to argument: X' = X+Pi/2
+ */
+ vaddpd __dHalfPI(%rax), %zmm0, %zmm5
+ vmovups __dInvPI(%rax), %zmm3
+
+/* Get absolute argument value: X' = |X'| */
+ vpandq __dAbsMask(%rax), %zmm5, %zmm1
+
+/* Y = X'*InvPi + RS : right shifter add */
+ vfmadd213pd __dRShifter(%rax), %zmm3, %zmm5
+ vmovups __dPI1_FMA(%rax), %zmm6
+
+/* N = Y - RS : right shifter sub */
+ vsubpd __dRShifter(%rax), %zmm5, %zmm4
+
+/* SignRes = Y<<63 : shift LSB to MSB place for result sign */
+ vpsllq $63, %zmm5, %zmm12
+ vmovups __dC7(%rax), %zmm8
+
+/* N = N - 0.5 */
+ vsubpd __dOneHalf(%rax), %zmm4, %zmm10
+ vcmppd $22, __dRangeVal(%rax), %zmm1, %k1
+ vpbroadcastq %rcx, %zmm2{%k1}{z}
+ vfnmadd231pd %zmm10, %zmm6, %zmm7
+ vptestmq %zmm2, %zmm2, %k0
+
+/* R = R - N*Pi2 */
+ vfnmadd231pd __dPI2_FMA(%rax), %zmm10, %zmm7
+ kmovw %k0, %ecx
+ movzbl %cl, %ecx
+
+/* R = R - N*Pi3 */
+ vfnmadd132pd __dPI3_FMA(%rax), %zmm7, %zmm10
+
+/*
+ POLYNOMIAL APPROXIMATION:
+ R2 = R*R
+ */
+ vmulpd %zmm10, %zmm10, %zmm9
+ vfmadd213pd __dC6(%rax), %zmm9, %zmm8
+ vfmadd213pd __dC5(%rax), %zmm9, %zmm8
+ vfmadd213pd __dC4(%rax), %zmm9, %zmm8
+
+/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */
+ vfmadd213pd __dC3(%rax), %zmm9, %zmm8
+
+/* Poly = R+R*(R2*(C1+R2*(C2+R2*Poly))) */
+ vfmadd213pd __dC2(%rax), %zmm9, %zmm8
+ vfmadd213pd __dC1(%rax), %zmm9, %zmm8
+ vmulpd %zmm9, %zmm8, %zmm11
+ vfmadd213pd %zmm10, %zmm10, %zmm11
+
+/*
+ RECONSTRUCTION:
+ Final sign setting: Res = Poly^SignRes
+ */
+ vpxorq %zmm12, %zmm11, %zmm1
+ testl %ecx, %ecx
+ jne .LBL_1_3
+
+.LBL_1_2:
+ cfi_remember_state
+ vmovaps %zmm1, %zmm0
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+
+.LBL_1_3:
+ cfi_restore_state
+ vmovups %zmm0, 1152(%rsp)
+ vmovups %zmm1, 1216(%rsp)
+ je .LBL_1_2
+
+ xorb %dl, %dl
+ kmovw %k4, 1048(%rsp)
+ xorl %eax, %eax
+ kmovw %k5, 1040(%rsp)
+ kmovw %k6, 1032(%rsp)
+ kmovw %k7, 1024(%rsp)
+ vmovups %zmm16, 960(%rsp)
+ vmovups %zmm17, 896(%rsp)
+ vmovups %zmm18, 832(%rsp)
+ vmovups %zmm19, 768(%rsp)
+ vmovups %zmm20, 704(%rsp)
+ vmovups %zmm21, 640(%rsp)
+ vmovups %zmm22, 576(%rsp)
+ vmovups %zmm23, 512(%rsp)
+ vmovups %zmm24, 448(%rsp)
+ vmovups %zmm25, 384(%rsp)
+ vmovups %zmm26, 320(%rsp)
+ vmovups %zmm27, 256(%rsp)
+ vmovups %zmm28, 192(%rsp)
+ vmovups %zmm29, 128(%rsp)
+ vmovups %zmm30, 64(%rsp)
+ vmovups %zmm31, (%rsp)
+ movq %rsi, 1064(%rsp)
+ movq %rdi, 1056(%rsp)
+ movq %r12, 1096(%rsp)
+ cfi_offset_rel_rsp (12, 1096)
+ movb %dl, %r12b
+ movq %r13, 1088(%rsp)
+ cfi_offset_rel_rsp (13, 1088)
+ movl %ecx, %r13d
+ movq %r14, 1080(%rsp)
+ cfi_offset_rel_rsp (14, 1080)
+ movl %eax, %r14d
+ movq %r15, 1072(%rsp)
+ cfi_offset_rel_rsp (15, 1072)
+ cfi_remember_state
+
+.LBL_1_6:
+ btl %r14d, %r13d
+ jc .LBL_1_12
+
+.LBL_1_7:
+ lea 1(%r14), %esi
+ btl %esi, %r13d
+ jc .LBL_1_10
+
+.LBL_1_8:
+ addb $1, %r12b
+ addl $2, %r14d
+ cmpb $16, %r12b
+ jb .LBL_1_6
+
+ kmovw 1048(%rsp), %k4
+ movq 1064(%rsp), %rsi
+ kmovw 1040(%rsp), %k5
+ movq 1056(%rsp), %rdi
+ kmovw 1032(%rsp), %k6
+ movq 1096(%rsp), %r12
+ cfi_restore (%r12)
+ movq 1088(%rsp), %r13
+ cfi_restore (%r13)
+ kmovw 1024(%rsp), %k7
+ vmovups 960(%rsp), %zmm16
+ vmovups 896(%rsp), %zmm17
+ vmovups 832(%rsp), %zmm18
+ vmovups 768(%rsp), %zmm19
+ vmovups 704(%rsp), %zmm20
+ vmovups 640(%rsp), %zmm21
+ vmovups 576(%rsp), %zmm22
+ vmovups 512(%rsp), %zmm23
+ vmovups 448(%rsp), %zmm24
+ vmovups 384(%rsp), %zmm25
+ vmovups 320(%rsp), %zmm26
+ vmovups 256(%rsp), %zmm27
+ vmovups 192(%rsp), %zmm28
+ vmovups 128(%rsp), %zmm29
+ vmovups 64(%rsp), %zmm30
+ vmovups (%rsp), %zmm31
+ movq 1080(%rsp), %r14
+ cfi_restore (%r14)
+ movq 1072(%rsp), %r15
+ cfi_restore (%r15)
+ vmovups 1216(%rsp), %zmm1
+ jmp .LBL_1_2
+
+.LBL_1_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ shlq $4, %r15
+ vmovsd 1160(%rsp,%r15), %xmm0
+ call cos@PLT
+ vmovsd %xmm0, 1224(%rsp,%r15)
+ jmp .LBL_1_8
+
+.LBL_1_12:
+ movzbl %r12b, %r15d
+ shlq $4, %r15
+ vmovsd 1152(%rsp,%r15), %xmm0
+ call cos@PLT
+ vmovsd %xmm0, 1216(%rsp,%r15)
+ jmp .LBL_1_7
+#endif
+END (_ZGVeN8v_cos_knl)
+
+ENTRY (_ZGVeN8v_cos_skx)
+#ifndef HAVE_AVX512_ASM_SUPPORT
+WRAPPER_IMPL_AVX512 _ZGVdN4v_cos
+#else
+/*
+ ALGORITHM DESCRIPTION:
+
+ ( low accuracy ( < 4ulp ) or enhanced performance
+ ( half of correct mantissa ) implementation )
+
+ Argument representation:
+ arg + Pi/2 = (N*Pi + R)
+
+ Result calculation:
+ cos(arg) = sin(arg+Pi/2) = sin(N*Pi + R) = (-1)^N * sin(R)
+ sin(R) is approximated by corresponding polynomial
+ */
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $1280, %rsp
+ movq __svml_d_trig_data@GOTPCREL(%rip), %rax
+
+/* R = X - N*Pi1 */
+ vmovaps %zmm0, %zmm8
+
+/* Check for large arguments path */
+ vpbroadcastq .L_2il0floatpacket.16(%rip), %zmm2
+
+/*
+ ARGUMENT RANGE REDUCTION:
+ Add Pi/2 to argument: X' = X+Pi/2
+ */
+ vaddpd __dHalfPI(%rax), %zmm0, %zmm6
+ vmovups __dInvPI(%rax), %zmm3
+ vmovups __dRShifter(%rax), %zmm4
+ vmovups __dPI1_FMA(%rax), %zmm7
+ vmovups __dC7(%rax), %zmm9
+
+/* Get absolute argument value: X' = |X'| */
+ vandpd __dAbsMask(%rax), %zmm6, %zmm1
+
+/* Y = X'*InvPi + RS : right shifter add */
+ vfmadd213pd %zmm4, %zmm3, %zmm6
+ vcmppd $18, __dRangeVal(%rax), %zmm1, %k1
+
+/* SignRes = Y<<63 : shift LSB to MSB place for result sign */
+ vpsllq $63, %zmm6, %zmm13
+
+/* N = Y - RS : right shifter sub */
+ vsubpd %zmm4, %zmm6, %zmm5
+
+/* N = N - 0.5 */
+ vsubpd __dOneHalf(%rax), %zmm5, %zmm11
+ vfnmadd231pd %zmm11, %zmm7, %zmm8
+
+/* R = R - N*Pi2 */
+ vfnmadd231pd __dPI2_FMA(%rax), %zmm11, %zmm8
+
+/* R = R - N*Pi3 */
+ vfnmadd132pd __dPI3_FMA(%rax), %zmm8, %zmm11
+
+/*
+ POLYNOMIAL APPROXIMATION:
+ R2 = R*R
+ */
+ vmulpd %zmm11, %zmm11, %zmm10
+ vfmadd213pd __dC6(%rax), %zmm10, %zmm9
+ vfmadd213pd __dC5(%rax), %zmm10, %zmm9
+ vfmadd213pd __dC4(%rax), %zmm10, %zmm9
+
+/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */
+ vfmadd213pd __dC3(%rax), %zmm10, %zmm9
+
+/* Poly = R+R*(R2*(C1+R2*(C2+R2*Poly))) */
+ vfmadd213pd __dC2(%rax), %zmm10, %zmm9
+ vfmadd213pd __dC1(%rax), %zmm10, %zmm9
+ vmulpd %zmm10, %zmm9, %zmm12
+ vfmadd213pd %zmm11, %zmm11, %zmm12
+ vpandnq %zmm1, %zmm1, %zmm2{%k1}
+ vcmppd $3, %zmm2, %zmm2, %k0
+
+/*
+ RECONSTRUCTION:
+ Final sign setting: Res = Poly^SignRes
+ */
+ vxorpd %zmm13, %zmm12, %zmm1
+ kmovw %k0, %ecx
+ testl %ecx, %ecx
+ jne .LBL_2_3
+
+.LBL_2_2:
+ cfi_remember_state
+ vmovaps %zmm1, %zmm0
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+
+.LBL_2_3:
+ cfi_restore_state
+ vmovups %zmm0, 1152(%rsp)
+ vmovups %zmm1, 1216(%rsp)
+ je .LBL_2_2
+
+ xorb %dl, %dl
+ xorl %eax, %eax
+ kmovw %k4, 1048(%rsp)
+ kmovw %k5, 1040(%rsp)
+ kmovw %k6, 1032(%rsp)
+ kmovw %k7, 1024(%rsp)
+ vmovups %zmm16, 960(%rsp)
+ vmovups %zmm17, 896(%rsp)
+ vmovups %zmm18, 832(%rsp)
+ vmovups %zmm19, 768(%rsp)
+ vmovups %zmm20, 704(%rsp)
+ vmovups %zmm21, 640(%rsp)
+ vmovups %zmm22, 576(%rsp)
+ vmovups %zmm23, 512(%rsp)
+ vmovups %zmm24, 448(%rsp)
+ vmovups %zmm25, 384(%rsp)
+ vmovups %zmm26, 320(%rsp)
+ vmovups %zmm27, 256(%rsp)
+ vmovups %zmm28, 192(%rsp)
+ vmovups %zmm29, 128(%rsp)
+ vmovups %zmm30, 64(%rsp)
+ vmovups %zmm31, (%rsp)
+ movq %rsi, 1064(%rsp)
+ movq %rdi, 1056(%rsp)
+ movq %r12, 1096(%rsp)
+ cfi_offset_rel_rsp (12, 1096)
+ movb %dl, %r12b
+ movq %r13, 1088(%rsp)
+ cfi_offset_rel_rsp (13, 1088)
+ movl %ecx, %r13d
+ movq %r14, 1080(%rsp)
+ cfi_offset_rel_rsp (14, 1080)
+ movl %eax, %r14d
+ movq %r15, 1072(%rsp)
+ cfi_offset_rel_rsp (15, 1072)
+ cfi_remember_state
+
+.LBL_2_6:
+ btl %r14d, %r13d
+ jc .LBL_2_12
+
+.LBL_2_7:
+ lea 1(%r14), %esi
+ btl %esi, %r13d
+ jc .LBL_2_10
+
+.LBL_2_8:
+ incb %r12b
+ addl $2, %r14d
+ cmpb $16, %r12b
+ jb .LBL_2_6
+
+ kmovw 1048(%rsp), %k4
+ kmovw 1040(%rsp), %k5
+ kmovw 1032(%rsp), %k6
+ kmovw 1024(%rsp), %k7
+ vmovups 960(%rsp), %zmm16
+ vmovups 896(%rsp), %zmm17
+ vmovups 832(%rsp), %zmm18
+ vmovups 768(%rsp), %zmm19
+ vmovups 704(%rsp), %zmm20
+ vmovups 640(%rsp), %zmm21
+ vmovups 576(%rsp), %zmm22
+ vmovups 512(%rsp), %zmm23
+ vmovups 448(%rsp), %zmm24
+ vmovups 384(%rsp), %zmm25
+ vmovups 320(%rsp), %zmm26
+ vmovups 256(%rsp), %zmm27
+ vmovups 192(%rsp), %zmm28
+ vmovups 128(%rsp), %zmm29
+ vmovups 64(%rsp), %zmm30
+ vmovups (%rsp), %zmm31
+ vmovups 1216(%rsp), %zmm1
+ movq 1064(%rsp), %rsi
+ movq 1056(%rsp), %rdi
+ movq 1096(%rsp), %r12
+ cfi_restore (%r12)
+ movq 1088(%rsp), %r13
+ cfi_restore (%r13)
+ movq 1080(%rsp), %r14
+ cfi_restore (%r14)
+ movq 1072(%rsp), %r15
+ cfi_restore (%r15)
+ jmp .LBL_2_2
+
+.LBL_2_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ shlq $4, %r15
+ vmovsd 1160(%rsp,%r15), %xmm0
+ vzeroupper
+ vmovsd 1160(%rsp,%r15), %xmm0
+
+ call cos@PLT
+
+ vmovsd %xmm0, 1224(%rsp,%r15)
+ jmp .LBL_2_8
+
+.LBL_2_12:
+ movzbl %r12b, %r15d
+ shlq $4, %r15
+ vmovsd 1152(%rsp,%r15), %xmm0
+ vzeroupper
+ vmovsd 1152(%rsp,%r15), %xmm0
+
+ call cos@PLT
+
+ vmovsd %xmm0, 1216(%rsp,%r15)
+ jmp .LBL_2_7
+#endif
+END (_ZGVeN8v_cos_skx)
+
+ .section .rodata, "a"
+.L_2il0floatpacket.16:
+ .long 0xffffffff,0xffffffff
+ .type .L_2il0floatpacket.16,@object
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core.S
new file mode 100644
index 0000000000..ef3dc49a1c
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core.S
@@ -0,0 +1,38 @@
+/* Multiple versions of vectorized exp.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+ .text
+ENTRY (_ZGVbN2v_exp)
+ .type _ZGVbN2v_exp, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq _ZGVbN2v_exp_sse4(%rip), %rax
+ testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+ jz 2f
+ ret
+2: leaq _ZGVbN2v_exp_sse2(%rip), %rax
+ ret
+END (_ZGVbN2v_exp)
+libmvec_hidden_def (_ZGVbN2v_exp)
+
+#define _ZGVbN2v_exp _ZGVbN2v_exp_sse2
+#include "../svml_d_exp2_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core_sse4.S
new file mode 100644
index 0000000000..1f5445924a
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core_sse4.S
@@ -0,0 +1,225 @@
+/* Function exp vectorized with SSE4.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "svml_d_exp_data.h"
+
+ .text
+ENTRY (_ZGVbN2v_exp_sse4)
+/*
+ ALGORITHM DESCRIPTION:
+
+ Argument representation:
+ N = rint(X*2^k/ln2) = 2^k*M+j
+ X = N*ln2/2^k + r = M*ln2 + ln2*(j/2^k) + r
+ then -ln2/2^(k+1) < r < ln2/2^(k+1)
+ Alternatively:
+ N = trunc(X*2^k/ln2)
+ then 0 < r < ln2/2^k
+
+ Result calculation:
+ exp(X) = exp(M*ln2 + ln2*(j/2^k) + r)
+ = 2^M * 2^(j/2^k) * exp(r)
+ 2^M is calculated by bit manipulation
+ 2^(j/2^k) is stored in table
+ exp(r) is approximated by polynomial.
+
+ The table lookup is skipped if k = 0. */
+
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $320, %rsp
+ movaps %xmm0, %xmm3
+ movq __svml_dexp_data@GOTPCREL(%rip), %r8
+
+/* iAbsX = (int)(lX>>32), lX = *(longlong*)&X */
+ pshufd $221, %xmm3, %xmm7
+ movups __dbInvLn2(%r8), %xmm0
+
+/* dK = X*dbInvLn2 */
+ mulpd %xmm3, %xmm0
+ movq __iAbsMask(%r8), %xmm5
+ movq __iDomainRange(%r8), %xmm6
+
+/* iAbsX = iAbsX&iAbsMask */
+ pand %xmm5, %xmm7
+
+/* iRangeMask = (iAbsX>iDomainRange) */
+ pcmpgtd %xmm6, %xmm7
+
+/* Mask = iRangeMask?1:0, set mask for overflow/underflow */
+ movmskps %xmm7, %eax
+
+/* dN = rint(X*2^k/Ln2) */
+ xorps %xmm7, %xmm7
+ movups __dbLn2hi(%r8), %xmm5
+ movups __dbLn2lo(%r8), %xmm6
+ roundpd $0, %xmm0, %xmm7
+
+/* dR = X - dN*dbLn2hi, dbLn2hi is 52-8-k hi bits of ln2/2^k */
+ mulpd %xmm7, %xmm5
+
+/* dR = dR - dN*dbLn2lo, dbLn2lo is 40..94 bits of lo part of ln2/2^k */
+ mulpd %xmm6, %xmm7
+ movups __dbShifter(%r8), %xmm4
+
+/* dM = X*dbInvLn2+dbShifter */
+ addpd %xmm0, %xmm4
+ movaps %xmm3, %xmm0
+ subpd %xmm5, %xmm0
+ subpd %xmm7, %xmm0
+ movups __dPC2(%r8), %xmm5
+
+/* exp(r) = b0+r*(b0+r*(b1+r*b2)) */
+ mulpd %xmm0, %xmm5
+ addpd __dPC1(%r8), %xmm5
+ mulpd %xmm0, %xmm5
+ movups __dPC0(%r8), %xmm6
+ addpd %xmm6, %xmm5
+ mulpd %xmm5, %xmm0
+ movdqu __lIndexMask(%r8), %xmm2
+
+/* lIndex = (*(longlong*)&dM)&lIndexMask, lIndex is the lower K bits of lM */
+ movdqa %xmm2, %xmm1
+
+/* lM = (*(longlong*)&dM)&(~lIndexMask) */
+ pandn %xmm4, %xmm2
+ pand %xmm4, %xmm1
+
+/* lM = lM<<(52-K), 2^M */
+ psllq $42, %xmm2
+
+/* table lookup for dT[j] = 2^(j/2^k) */
+ movd %xmm1, %edx
+ pextrw $4, %xmm1, %ecx
+ addpd %xmm0, %xmm6
+ shll $3, %edx
+ shll $3, %ecx
+ movq (%r8,%rdx), %xmm0
+ andl $3, %eax
+ movhpd (%r8,%rcx), %xmm0
+
+/* 2^(j/2^k) * exp(r) */
+ mulpd %xmm6, %xmm0
+
+/* multiply by 2^M through integer add */
+ paddq %xmm2, %xmm0
+ jne .LBL_1_3
+
+.LBL_1_2:
+ cfi_remember_state
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+
+.LBL_1_3:
+ cfi_restore_state
+ movups %xmm3, 192(%rsp)
+ movups %xmm0, 256(%rsp)
+ je .LBL_1_2
+
+ xorb %cl, %cl
+ xorl %edx, %edx
+ movups %xmm8, 112(%rsp)
+ movups %xmm9, 96(%rsp)
+ movups %xmm10, 80(%rsp)
+ movups %xmm11, 64(%rsp)
+ movups %xmm12, 48(%rsp)
+ movups %xmm13, 32(%rsp)
+ movups %xmm14, 16(%rsp)
+ movups %xmm15, (%rsp)
+ movq %rsi, 136(%rsp)
+ movq %rdi, 128(%rsp)
+ movq %r12, 168(%rsp)
+ cfi_offset_rel_rsp (12, 168)
+ movb %cl, %r12b
+ movq %r13, 160(%rsp)
+ cfi_offset_rel_rsp (13, 160)
+ movl %eax, %r13d
+ movq %r14, 152(%rsp)
+ cfi_offset_rel_rsp (14, 152)
+ movl %edx, %r14d
+ movq %r15, 144(%rsp)
+ cfi_offset_rel_rsp (15, 144)
+ cfi_remember_state
+
+.LBL_1_6:
+ btl %r14d, %r13d
+ jc .LBL_1_12
+
+.LBL_1_7:
+ lea 1(%r14), %esi
+ btl %esi, %r13d
+ jc .LBL_1_10
+
+.LBL_1_8:
+ incb %r12b
+ addl $2, %r14d
+ cmpb $16, %r12b
+ jb .LBL_1_6
+
+ movups 112(%rsp), %xmm8
+ movups 96(%rsp), %xmm9
+ movups 80(%rsp), %xmm10
+ movups 64(%rsp), %xmm11
+ movups 48(%rsp), %xmm12
+ movups 32(%rsp), %xmm13
+ movups 16(%rsp), %xmm14
+ movups (%rsp), %xmm15
+ movq 136(%rsp), %rsi
+ movq 128(%rsp), %rdi
+ movq 168(%rsp), %r12
+ cfi_restore (%r12)
+ movq 160(%rsp), %r13
+ cfi_restore (%r13)
+ movq 152(%rsp), %r14
+ cfi_restore (%r14)
+ movq 144(%rsp), %r15
+ cfi_restore (%r15)
+ movups 256(%rsp), %xmm0
+ jmp .LBL_1_2
+
+.LBL_1_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ shlq $4, %r15
+ movsd 200(%rsp,%r15), %xmm0
+
+ call exp@PLT
+
+ movsd %xmm0, 264(%rsp,%r15)
+ jmp .LBL_1_8
+
+.LBL_1_12:
+ movzbl %r12b, %r15d
+ shlq $4, %r15
+ movsd 192(%rsp,%r15), %xmm0
+
+ call exp@PLT
+
+ movsd %xmm0, 256(%rsp,%r15)
+ jmp .LBL_1_7
+
+END (_ZGVbN2v_exp_sse4)
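For reference, a scalar sketch of the exp scheme described in the comment
block of _ZGVbN2v_exp_sse4 above, restricted to the k = 0 case in which the
2^(j/2^k) table lookup degenerates to 1.  The two-part ln2 split and the
low-degree polynomial below are illustrative constants, not the values from
__svml_dexp_data.

/* exp_sketch.c -- build: gcc exp_sketch.c -lm  */
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static double
exp_sketch (double x)
{
  static const double ln2_hi = 6.93147180369123816490e-01;
  static const double ln2_lo = 1.90821492927058770002e-10;

  /* N = rint(X/ln2), then r = X - N*ln2 using the hi/lo split.  */
  double n = rint (x / M_LN2);
  double r = (x - n * ln2_hi) - n * ln2_lo;

  /* exp(r) on the small interval, truncated polynomial.  */
  double p = 1.0 + r * (1.0 + r * (0.5 + r * (1.0 / 6.0 + r * (1.0 / 24.0))));

  /* Multiply by 2^M through integer manipulation of the exponent field,
     mirroring the paddq step in the vector code.  */
  int64_t m = (int64_t) n;
  uint64_t bits;
  memcpy (&bits, &p, sizeof bits);
  bits += (uint64_t) m << 52;
  memcpy (&p, &bits, sizeof bits);
  return p;
}

int
main (void)
{
  printf ("%.6f %.6f\n", exp_sketch (1.0), exp (1.0));
  return 0;
}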
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core.S
new file mode 100644
index 0000000000..7f2ebdef67
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core.S
@@ -0,0 +1,38 @@
+/* Multiple versions of vectorized exp.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+ .text
+ENTRY (_ZGVdN4v_exp)
+ .type _ZGVdN4v_exp, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq _ZGVdN4v_exp_avx2(%rip), %rax
+ testl $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+ jz 2f
+ ret
+2: leaq _ZGVdN4v_exp_sse_wrapper(%rip), %rax
+ ret
+END (_ZGVdN4v_exp)
+libmvec_hidden_def (_ZGVdN4v_exp)
+
+#define _ZGVdN4v_exp _ZGVdN4v_exp_sse_wrapper
+#include "../svml_d_exp4_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core_avx2.S
new file mode 100644
index 0000000000..a34e267433
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core_avx2.S
@@ -0,0 +1,212 @@
+/* Function exp vectorized with AVX2.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "svml_d_exp_data.h"
+
+ .text
+ENTRY (_ZGVdN4v_exp_avx2)
+/*
+ ALGORITHM DESCRIPTION:
+
+ Argument representation:
+ N = rint(X*2^k/ln2) = 2^k*M+j
+ X = N*ln2/2^k + r = M*ln2 + ln2*(j/2^k) + r
+ then -ln2/2^(k+1) < r < ln2/2^(k+1)
+ Alternatively:
+ N = trunc(X*2^k/ln2)
+ then 0 < r < ln2/2^k
+
+ Result calculation:
+ exp(X) = exp(M*ln2 + ln2*(j/2^k) + r)
+ = 2^M * 2^(j/2^k) * exp(r)
+ 2^M is calculated by bit manipulation
+ 2^(j/2^k) is stored in table
+ exp(r) is approximated by polynomial
+
+ The table lookup is skipped if k = 0. */
+
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $448, %rsp
+ movq __svml_dexp_data@GOTPCREL(%rip), %rax
+ vmovdqa %ymm0, %ymm2
+ vmovupd __dbInvLn2(%rax), %ymm3
+ vmovupd __dbShifter(%rax), %ymm1
+ vmovupd __lIndexMask(%rax), %ymm4
+
+/* dM = X*dbInvLn2+dbShifter, dbInvLn2 = 2^k/Ln2 */
+ vfmadd213pd %ymm1, %ymm2, %ymm3
+
+/* iAbsX = (int)(lX>>32), lX = *(longlong*)&X */
+ vextracti128 $1, %ymm2, %xmm5
+ vshufps $221, %xmm5, %xmm2, %xmm6
+
+/* iAbsX = iAbsX&iAbsMask */
+ vandps __iAbsMask(%rax), %xmm6, %xmm7
+
+/* dN = dM-dbShifter, dN = rint(X*2^k/Ln2) */
+ vsubpd %ymm1, %ymm3, %ymm6
+
+/* iRangeMask = (iAbsX>iDomainRange) */
+ vpcmpgtd __iDomainRange(%rax), %xmm7, %xmm0
+ vmovupd __dbLn2hi(%rax), %ymm1
+ vmovupd __dPC0(%rax), %ymm7
+
+/* Mask = iRangeMask?1:0, set mask for overflow/underflow */
+ vmovmskps %xmm0, %ecx
+ vmovupd __dPC2(%rax), %ymm0
+
+/* dR = X - dN*dbLn2hi, dbLn2hi is 52-8-k hi bits of ln2/2^k */
+ vmovdqa %ymm2, %ymm5
+ vfnmadd231pd %ymm6, %ymm1, %ymm5
+
+/* dR = dR - dN*dbLn2lo, dbLn2lo is 40..94 bits of lo part of ln2/2^k */
+ vfnmadd132pd __dbLn2lo(%rax), %ymm5, %ymm6
+
+/* exp(r) = b0+r*(b0+r*(b1+r*b2)) */
+ vfmadd213pd __dPC1(%rax), %ymm6, %ymm0
+ vfmadd213pd %ymm7, %ymm6, %ymm0
+ vfmadd213pd %ymm7, %ymm6, %ymm0
+
+/* lIndex = (*(longlong*)&dM)&lIndexMask, lIndex is the lower K bits of lM */
+ vandps %ymm4, %ymm3, %ymm1
+
+/* table lookup for dT[j] = 2^(j/2^k) */
+ vxorpd %ymm6, %ymm6, %ymm6
+ vpcmpeqd %ymm5, %ymm5, %ymm5
+ vgatherqpd %ymm5, (%rax,%ymm1,8), %ymm6
+
+/* lM = (*(longlong*)&dM)&(~lIndexMask) */
+ vpandn %ymm3, %ymm4, %ymm3
+
+/* 2^(j/2^k) * exp(r) */
+ vmulpd %ymm0, %ymm6, %ymm0
+
+/* lM = lM<<(52-K), 2^M */
+ vpsllq $42, %ymm3, %ymm4
+
+/* multiply by 2^M through integer add */
+ vpaddq %ymm4, %ymm0, %ymm0
+ testl %ecx, %ecx
+ jne .LBL_1_3
+
+.LBL_1_2:
+ cfi_remember_state
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+
+.LBL_1_3:
+ cfi_restore_state
+ vmovupd %ymm2, 320(%rsp)
+ vmovupd %ymm0, 384(%rsp)
+ je .LBL_1_2
+
+ xorb %dl, %dl
+ xorl %eax, %eax
+ vmovups %ymm8, 224(%rsp)
+ vmovups %ymm9, 192(%rsp)
+ vmovups %ymm10, 160(%rsp)
+ vmovups %ymm11, 128(%rsp)
+ vmovups %ymm12, 96(%rsp)
+ vmovups %ymm13, 64(%rsp)
+ vmovups %ymm14, 32(%rsp)
+ vmovups %ymm15, (%rsp)
+ movq %rsi, 264(%rsp)
+ movq %rdi, 256(%rsp)
+ movq %r12, 296(%rsp)
+ cfi_offset_rel_rsp (12, 296)
+ movb %dl, %r12b
+ movq %r13, 288(%rsp)
+ cfi_offset_rel_rsp (13, 288)
+ movl %ecx, %r13d
+ movq %r14, 280(%rsp)
+ cfi_offset_rel_rsp (14, 280)
+ movl %eax, %r14d
+ movq %r15, 272(%rsp)
+ cfi_offset_rel_rsp (15, 272)
+ cfi_remember_state
+
+.LBL_1_6:
+ btl %r14d, %r13d
+ jc .LBL_1_12
+
+.LBL_1_7:
+ lea 1(%r14), %esi
+ btl %esi, %r13d
+ jc .LBL_1_10
+
+.LBL_1_8:
+ incb %r12b
+ addl $2, %r14d
+ cmpb $16, %r12b
+ jb .LBL_1_6
+
+ vmovups 224(%rsp), %ymm8
+ vmovups 192(%rsp), %ymm9
+ vmovups 160(%rsp), %ymm10
+ vmovups 128(%rsp), %ymm11
+ vmovups 96(%rsp), %ymm12
+ vmovups 64(%rsp), %ymm13
+ vmovups 32(%rsp), %ymm14
+ vmovups (%rsp), %ymm15
+ vmovupd 384(%rsp), %ymm0
+ movq 264(%rsp), %rsi
+ movq 256(%rsp), %rdi
+ movq 296(%rsp), %r12
+ cfi_restore (%r12)
+ movq 288(%rsp), %r13
+ cfi_restore (%r13)
+ movq 280(%rsp), %r14
+ cfi_restore (%r14)
+ movq 272(%rsp), %r15
+ cfi_restore (%r15)
+ jmp .LBL_1_2
+
+.LBL_1_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ shlq $4, %r15
+ vmovsd 328(%rsp,%r15), %xmm0
+ vzeroupper
+
+ call exp@PLT
+
+ vmovsd %xmm0, 392(%rsp,%r15)
+ jmp .LBL_1_8
+
+.LBL_1_12:
+ movzbl %r12b, %r15d
+ shlq $4, %r15
+ vmovsd 320(%rsp,%r15), %xmm0
+ vzeroupper
+
+ call exp@PLT
+
+ vmovsd %xmm0, 384(%rsp,%r15)
+ jmp .LBL_1_7
+
+END (_ZGVdN4v_exp_avx2)
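The header comment of _ZGVdN4v_exp_avx2 describes a table-driven reduction, exp(X) = 2^M * 2^(j/2^K) * exp(r). A scalar C sketch of that reduction follows; K, the Taylor polynomial and the direct exp2() call for the table entry are illustrative stand-ins for the __svml_dexp_data constants, not the kernel's actual parameters.

  #include <math.h>
  #include <stdint.h>

  #define K 6                                  /* assumed table index width */
  static const double LN2 = 0.6931471805599453;

  double exp_sketch (double x)
  {
    /* N = rint (x * 2^K / ln2) = 2^K * M + j.  */
    double n = rint (x * (double) (1 << K) / LN2);
    int64_t ni = (int64_t) n;
    int64_t j = ni & ((1 << K) - 1);           /* low K bits: table index */
    int64_t m = ni >> K;                       /* remaining bits: exponent M
                                                  (arithmetic shift assumed) */

    /* r = x - N * ln2 / 2^K; the vector code splits ln2/2^K into hi and lo
       parts (__dbLn2hi/__dbLn2lo) and subtracts them separately.  */
    double r = x - n * (LN2 / (1 << K));

    /* exp(r) by a short polynomial (plain Taylor here, tuned coefficients in
       the data tables); 2^(j/2^K) is computed directly instead of read from
       the table the gather above indexes.  */
    double poly = 1.0 + r + r * r / 2.0 + r * r * r / 6.0;
    double t = exp2 ((double) j / (1 << K));

    /* exp(x) = 2^M * 2^(j/2^K) * exp(r); ldexp plays the role of the
       integer add into the exponent field done by vpaddq above.  */
    return ldexp (t * poly, (int) m);
  }

The vector kernel performs the same steps lane-wise: vfmadd213pd forms N, vgatherqpd fetches the table entry, and vpaddq folds 2^M into the packed exponents.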
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S
new file mode 100644
index 0000000000..7b7c07d926
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S
@@ -0,0 +1,39 @@
+/* Multiple versions of vectorized exp.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+ .text
+ENTRY (_ZGVeN8v_exp)
+ .type _ZGVeN8v_exp, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq _ZGVeN8v_exp_skx(%rip), %rax
+ testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
+ jnz 2f
+ leaq _ZGVeN8v_exp_knl(%rip), %rax
+ testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
+ jnz 2f
+ leaq _ZGVeN8v_exp_avx2_wrapper(%rip), %rax
+2: ret
+END (_ZGVeN8v_exp)
+
+#define _ZGVeN8v_exp _ZGVeN8v_exp_avx2_wrapper
+#include "../svml_d_exp8_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core_avx512.S
new file mode 100644
index 0000000000..049a7e49cd
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core_avx512.S
@@ -0,0 +1,456 @@
+/* Function exp vectorized with AVX-512. KNL and SKX versions.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "svml_d_exp_data.h"
+#include "svml_d_wrapper_impl.h"
+
+ .text
+ENTRY (_ZGVeN8v_exp_knl)
+#ifndef HAVE_AVX512_ASM_SUPPORT
+WRAPPER_IMPL_AVX512 _ZGVdN4v_exp
+#else
+/*
+ ALGORITHM DESCRIPTION:
+
+ Argument representation:
+ N = rint(X*2^k/ln2) = 2^k*M+j
+ X = N*ln2/2^k + r = M*ln2 + ln2*(j/2^k) + r
+ then -ln2/2^(k+1) < r < ln2/2^(k+1)
+ Alternatively:
+ N = trunc(X*2^k/ln2)
+ then 0 < r < ln2/2^k
+
+ Result calculation:
+ exp(X) = exp(M*ln2 + ln2*(j/2^k) + r)
+ = 2^M * 2^(j/2^k) * exp(r)
+ 2^M is calculated by bit manipulation
+ 2^(j/2^k) is stored in table
+ exp(r) is approximated by polynomial
+
+ The table lookup is skipped if k = 0. */
+
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $1280, %rsp
+ movq __svml_dexp_data@GOTPCREL(%rip), %rax
+
+/* dR = X - dN*dbLn2hi, dbLn2hi is 52-8-k hi bits of ln2/2^k */
+ vmovaps %zmm0, %zmm8
+
+/* iAbsX = (int)(lX>>32), lX = *(longlong*)&X */
+ vpsrlq $32, %zmm0, %zmm1
+
+/* iAbsX = iAbsX&iAbsMask */
+ movl $255, %edx
+ vpmovqd %zmm1, %ymm2
+ kmovw %edx, %k2
+
+/* iRangeMask = (iAbsX>iDomainRange) */
+ movl $-1, %ecx
+
+/* table lookup for dT[j] = 2^(j/2^k) */
+ vpxord %zmm11, %zmm11, %zmm11
+ vmovups __dbInvLn2(%rax), %zmm5
+ vmovups __dbLn2hi(%rax), %zmm7
+ kxnorw %k3, %k3, %k3
+
+/* dM = X*dbInvLn2+dbShifter, dbInvLn2 = 2^k/Ln2 */
+ vfmadd213pd __dbShifter(%rax), %zmm0, %zmm5
+ vmovups __dPC2(%rax), %zmm12
+
+/* dN = dM-dbShifter, dN = rint(X*2^k/Ln2) */
+ vsubpd __dbShifter(%rax), %zmm5, %zmm9
+ vmovups __lIndexMask(%rax), %zmm4
+ vfnmadd231pd %zmm9, %zmm7, %zmm8
+ vpandd __iAbsMask(%rax), %zmm2, %zmm2{%k2}
+
+/* lIndex = (*(longlong*)&dM)&lIndexMask, lIndex is the lower K bits of lM */
+ vpandq %zmm4, %zmm5, %zmm10
+ vgatherqpd (%rax,%zmm10,8), %zmm11{%k3}
+ vpcmpgtd __iDomainRange(%rax), %zmm2, %k1{%k2}
+
+/* lM = (*(longlong*)&dM)&(~lIndexMask) */
+ vpandnq %zmm5, %zmm4, %zmm6
+ vpbroadcastd %ecx, %zmm3{%k1}{z}
+
+/* lM = lM<<(52-K), 2^M */
+ vpsllq $42, %zmm6, %zmm14
+
+/* dR = dR - dN*dbLn2lo, dbLn2lo is 40..94 bits of lo part of ln2/2^k */
+ vfnmadd132pd __dbLn2lo(%rax), %zmm8, %zmm9
+
+/* Mask = iRangeMask?1:0, set mask for overflow/underflow */
+ vptestmd %zmm3, %zmm3, %k0{%k2}
+
+/* exp(r) = b0+r*(b0+r*(b1+r*b2)) */
+ vfmadd213pd __dPC1(%rax), %zmm9, %zmm12
+ kmovw %k0, %ecx
+ movzbl %cl, %ecx
+ vfmadd213pd __dPC0(%rax), %zmm9, %zmm12
+ vfmadd213pd __dPC0(%rax), %zmm9, %zmm12
+
+/* 2^(j/2^k) * exp(r) */
+ vmulpd %zmm12, %zmm11, %zmm13
+
+/* multiply by 2^M through integer add */
+ vpaddq %zmm14, %zmm13, %zmm1
+ testl %ecx, %ecx
+ jne .LBL_1_3
+
+.LBL_1_2:
+ cfi_remember_state
+ vmovaps %zmm1, %zmm0
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+
+.LBL_1_3:
+ cfi_restore_state
+ vmovups %zmm0, 1152(%rsp)
+ vmovups %zmm1, 1216(%rsp)
+ je .LBL_1_2
+
+ xorb %dl, %dl
+ kmovw %k4, 1048(%rsp)
+ xorl %eax, %eax
+ kmovw %k5, 1040(%rsp)
+ kmovw %k6, 1032(%rsp)
+ kmovw %k7, 1024(%rsp)
+ vmovups %zmm16, 960(%rsp)
+ vmovups %zmm17, 896(%rsp)
+ vmovups %zmm18, 832(%rsp)
+ vmovups %zmm19, 768(%rsp)
+ vmovups %zmm20, 704(%rsp)
+ vmovups %zmm21, 640(%rsp)
+ vmovups %zmm22, 576(%rsp)
+ vmovups %zmm23, 512(%rsp)
+ vmovups %zmm24, 448(%rsp)
+ vmovups %zmm25, 384(%rsp)
+ vmovups %zmm26, 320(%rsp)
+ vmovups %zmm27, 256(%rsp)
+ vmovups %zmm28, 192(%rsp)
+ vmovups %zmm29, 128(%rsp)
+ vmovups %zmm30, 64(%rsp)
+ vmovups %zmm31, (%rsp)
+ movq %rsi, 1064(%rsp)
+ movq %rdi, 1056(%rsp)
+ movq %r12, 1096(%rsp)
+ cfi_offset_rel_rsp (12, 1096)
+ movb %dl, %r12b
+ movq %r13, 1088(%rsp)
+ cfi_offset_rel_rsp (13, 1088)
+ movl %ecx, %r13d
+ movq %r14, 1080(%rsp)
+ cfi_offset_rel_rsp (14, 1080)
+ movl %eax, %r14d
+ movq %r15, 1072(%rsp)
+ cfi_offset_rel_rsp (15, 1072)
+ cfi_remember_state
+
+.LBL_1_6:
+ btl %r14d, %r13d
+ jc .LBL_1_12
+
+.LBL_1_7:
+ lea 1(%r14), %esi
+ btl %esi, %r13d
+ jc .LBL_1_10
+
+.LBL_1_8:
+ addb $1, %r12b
+ addl $2, %r14d
+ cmpb $16, %r12b
+ jb .LBL_1_6
+
+ kmovw 1048(%rsp), %k4
+ movq 1064(%rsp), %rsi
+ kmovw 1040(%rsp), %k5
+ movq 1056(%rsp), %rdi
+ kmovw 1032(%rsp), %k6
+ movq 1096(%rsp), %r12
+ cfi_restore (%r12)
+ movq 1088(%rsp), %r13
+ cfi_restore (%r13)
+ kmovw 1024(%rsp), %k7
+ vmovups 960(%rsp), %zmm16
+ vmovups 896(%rsp), %zmm17
+ vmovups 832(%rsp), %zmm18
+ vmovups 768(%rsp), %zmm19
+ vmovups 704(%rsp), %zmm20
+ vmovups 640(%rsp), %zmm21
+ vmovups 576(%rsp), %zmm22
+ vmovups 512(%rsp), %zmm23
+ vmovups 448(%rsp), %zmm24
+ vmovups 384(%rsp), %zmm25
+ vmovups 320(%rsp), %zmm26
+ vmovups 256(%rsp), %zmm27
+ vmovups 192(%rsp), %zmm28
+ vmovups 128(%rsp), %zmm29
+ vmovups 64(%rsp), %zmm30
+ vmovups (%rsp), %zmm31
+ movq 1080(%rsp), %r14
+ cfi_restore (%r14)
+ movq 1072(%rsp), %r15
+ cfi_restore (%r15)
+ vmovups 1216(%rsp), %zmm1
+ jmp .LBL_1_2
+
+.LBL_1_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ shlq $4, %r15
+ vmovsd 1160(%rsp,%r15), %xmm0
+ call exp@PLT
+ vmovsd %xmm0, 1224(%rsp,%r15)
+ jmp .LBL_1_8
+
+.LBL_1_12:
+ movzbl %r12b, %r15d
+ shlq $4, %r15
+ vmovsd 1152(%rsp,%r15), %xmm0
+ call exp@PLT
+ vmovsd %xmm0, 1216(%rsp,%r15)
+ jmp .LBL_1_7
+#endif
+END (_ZGVeN8v_exp_knl)
+
+ENTRY (_ZGVeN8v_exp_skx)
+#ifndef HAVE_AVX512_ASM_SUPPORT
+WRAPPER_IMPL_AVX512 _ZGVdN4v_exp
+#else
+/*
+ ALGORITHM DESCRIPTION:
+
+ Argument representation:
+ N = rint(X*2^k/ln2) = 2^k*M+j
+ X = N*ln2/2^k + r = M*ln2 + ln2*(j/2^k) + r
+ then -ln2/2^(k+1) < r < ln2/2^(k+1)
+ Alternatively:
+ N = trunc(X*2^k/ln2)
+ then 0 < r < ln2/2^k
+
+ Result calculation:
+ exp(X) = exp(M*ln2 + ln2*(j/2^k) + r)
+ = 2^M * 2^(j/2^k) * exp(r)
+ 2^M is calculated by bit manipulation
+ 2^(j/2^k) is stored in table
+ exp(r) is approximated by polynomial
+
+ The table lookup is skipped if k = 0. */
+
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $1280, %rsp
+ movq __svml_dexp_data@GOTPCREL(%rip), %rax
+
+/* table lookup for dT[j] = 2^(j/2^k) */
+ kxnorw %k1, %k1, %k1
+
+/* iAbsX = (int)(lX>>32), lX = *(longlong*)&X */
+ vpsrlq $32, %zmm0, %zmm1
+ vmovups __dbInvLn2(%rax), %zmm7
+ vmovups __dbShifter(%rax), %zmm5
+ vmovups __lIndexMask(%rax), %zmm6
+ vmovups __dbLn2hi(%rax), %zmm9
+ vmovups __dPC0(%rax), %zmm12
+
+/* dM = X*dbInvLn2+dbShifter, dbInvLn2 = 2^k/Ln2 */
+ vfmadd213pd %zmm5, %zmm0, %zmm7
+ vpmovqd %zmm1, %ymm2
+
+/* dN = dM-dbShifter, dN = rint(X*2^k/Ln2) */
+ vsubpd %zmm5, %zmm7, %zmm11
+
+/* iAbsX = iAbsX&iAbsMask */
+ vpand __iAbsMask(%rax), %ymm2, %ymm3
+
+/* dR = X - dN*dbLn2hi, dbLn2hi is 52-8-k hi bits of ln2/2^k */
+ vmovaps %zmm0, %zmm10
+ vfnmadd231pd %zmm11, %zmm9, %zmm10
+ vmovups __dPC2(%rax), %zmm9
+
+/* dR = dR - dN*dbLn2lo, dbLn2lo is 40..94 bits of lo part of ln2/2^k */
+ vfnmadd132pd __dbLn2lo(%rax), %zmm10, %zmm11
+
+/* exp(r) = b0+r*(b0+r*(b1+r*b2)) */
+ vfmadd213pd __dPC1(%rax), %zmm11, %zmm9
+ vfmadd213pd %zmm12, %zmm11, %zmm9
+ vfmadd213pd %zmm12, %zmm11, %zmm9
+
+/* iRangeMask = (iAbsX>iDomainRange) */
+ vpcmpgtd __iDomainRange(%rax), %ymm3, %ymm4
+
+/* Mask = iRangeMask?1:0, set mask for overflow/underflow */
+ vmovmskps %ymm4, %ecx
+
+/* lIndex = (*(longlong*)&dM)&lIndexMask, lIndex is the lower K bits of lM */
+ vpandq %zmm6, %zmm7, %zmm13
+ vpmovqd %zmm13, %ymm14
+ vpxord %zmm15, %zmm15, %zmm15
+ vgatherdpd (%rax,%ymm14,8), %zmm15{%k1}
+
+/* 2^(j/2^k) * exp(r) */
+ vmulpd %zmm9, %zmm15, %zmm10
+
+/* lM = (*(longlong*)&dM)&(~lIndexMask) */
+ vpandnq %zmm7, %zmm6, %zmm8
+
+/* lM = lM<<(52-K), 2^M */
+ vpsllq $42, %zmm8, %zmm1
+
+/* multiply by 2^M through integer add */
+ vpaddq %zmm1, %zmm10, %zmm1
+ testl %ecx, %ecx
+ jne .LBL_2_3
+
+.LBL_2_2:
+ cfi_remember_state
+ vmovaps %zmm1, %zmm0
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+
+.LBL_2_3:
+ cfi_restore_state
+ vmovups %zmm0, 1152(%rsp)
+ vmovups %zmm1, 1216(%rsp)
+ je .LBL_2_2
+
+ xorb %dl, %dl
+ xorl %eax, %eax
+ kmovw %k4, 1048(%rsp)
+ kmovw %k5, 1040(%rsp)
+ kmovw %k6, 1032(%rsp)
+ kmovw %k7, 1024(%rsp)
+ vmovups %zmm16, 960(%rsp)
+ vmovups %zmm17, 896(%rsp)
+ vmovups %zmm18, 832(%rsp)
+ vmovups %zmm19, 768(%rsp)
+ vmovups %zmm20, 704(%rsp)
+ vmovups %zmm21, 640(%rsp)
+ vmovups %zmm22, 576(%rsp)
+ vmovups %zmm23, 512(%rsp)
+ vmovups %zmm24, 448(%rsp)
+ vmovups %zmm25, 384(%rsp)
+ vmovups %zmm26, 320(%rsp)
+ vmovups %zmm27, 256(%rsp)
+ vmovups %zmm28, 192(%rsp)
+ vmovups %zmm29, 128(%rsp)
+ vmovups %zmm30, 64(%rsp)
+ vmovups %zmm31, (%rsp)
+ movq %rsi, 1064(%rsp)
+ movq %rdi, 1056(%rsp)
+ movq %r12, 1096(%rsp)
+ cfi_offset_rel_rsp (12, 1096)
+ movb %dl, %r12b
+ movq %r13, 1088(%rsp)
+ cfi_offset_rel_rsp (13, 1088)
+ movl %ecx, %r13d
+ movq %r14, 1080(%rsp)
+ cfi_offset_rel_rsp (14, 1080)
+ movl %eax, %r14d
+ movq %r15, 1072(%rsp)
+ cfi_offset_rel_rsp (15, 1072)
+ cfi_remember_state
+
+.LBL_2_6:
+ btl %r14d, %r13d
+ jc .LBL_2_12
+
+.LBL_2_7:
+ lea 1(%r14), %esi
+ btl %esi, %r13d
+ jc .LBL_2_10
+
+.LBL_2_8:
+ incb %r12b
+ addl $2, %r14d
+ cmpb $16, %r12b
+ jb .LBL_2_6
+
+ kmovw 1048(%rsp), %k4
+ kmovw 1040(%rsp), %k5
+ kmovw 1032(%rsp), %k6
+ kmovw 1024(%rsp), %k7
+ vmovups 960(%rsp), %zmm16
+ vmovups 896(%rsp), %zmm17
+ vmovups 832(%rsp), %zmm18
+ vmovups 768(%rsp), %zmm19
+ vmovups 704(%rsp), %zmm20
+ vmovups 640(%rsp), %zmm21
+ vmovups 576(%rsp), %zmm22
+ vmovups 512(%rsp), %zmm23
+ vmovups 448(%rsp), %zmm24
+ vmovups 384(%rsp), %zmm25
+ vmovups 320(%rsp), %zmm26
+ vmovups 256(%rsp), %zmm27
+ vmovups 192(%rsp), %zmm28
+ vmovups 128(%rsp), %zmm29
+ vmovups 64(%rsp), %zmm30
+ vmovups (%rsp), %zmm31
+ vmovups 1216(%rsp), %zmm1
+ movq 1064(%rsp), %rsi
+ movq 1056(%rsp), %rdi
+ movq 1096(%rsp), %r12
+ cfi_restore (%r12)
+ movq 1088(%rsp), %r13
+ cfi_restore (%r13)
+ movq 1080(%rsp), %r14
+ cfi_restore (%r14)
+ movq 1072(%rsp), %r15
+ cfi_restore (%r15)
+ jmp .LBL_2_2
+
+.LBL_2_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ shlq $4, %r15
+ vmovsd 1160(%rsp,%r15), %xmm0
+ vzeroupper
+ vmovsd 1160(%rsp,%r15), %xmm0
+ call exp@PLT
+ vmovsd %xmm0, 1224(%rsp,%r15)
+ jmp .LBL_2_8
+
+.LBL_2_12:
+ movzbl %r12b, %r15d
+ shlq $4, %r15
+ vmovsd 1152(%rsp,%r15), %xmm0
+ vzeroupper
+ vmovsd 1152(%rsp,%r15), %xmm0
+ call exp@PLT
+ vmovsd %xmm0, 1216(%rsp,%r15)
+ jmp .LBL_2_7
+
+#endif
+END (_ZGVeN8v_exp_skx)
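Both AVX-512 variants finish, like the AVX2 one, by "multiplying by 2^M through integer add": vpaddq adds lM << (52-K) into the packed results. The trick relies on the IEEE-754 layout of a double, where adding M to the 11-bit biased exponent field scales a normal value by 2^M; lanes that could overflow or underflow are exactly the ones the call-exp tails recompute. A minimal scalar illustration, under that no-over/underflow assumption:

  #include <stdint.h>
  #include <string.h>

  /* Scale a normal double by 2^m by adding m directly to its exponent
     field; valid only while the result stays normal.  */
  double scale_by_pow2 (double x, int64_t m)
  {
    uint64_t bits;
    memcpy (&bits, &x, sizeof bits);        /* view the double as raw bits */
    bits += (uint64_t) m << 52;             /* exponent field starts at bit 52 */
    memcpy (&x, &bits, sizeof bits);
    return x;                               /* == x * 2^m under the caveat above */
  }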
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core.S
new file mode 100644
index 0000000000..38d369fc3c
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core.S
@@ -0,0 +1,38 @@
+/* Multiple versions of vectorized log.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+ .text
+ENTRY (_ZGVbN2v_log)
+ .type _ZGVbN2v_log, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq _ZGVbN2v_log_sse4(%rip), %rax
+ testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+ jz 2f
+ ret
+2: leaq _ZGVbN2v_log_sse2(%rip), %rax
+ ret
+END (_ZGVbN2v_log)
+libmvec_hidden_def (_ZGVbN2v_log)
+
+#define _ZGVbN2v_log _ZGVbN2v_log_sse2
+#include "../svml_d_log2_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core_sse4.S
new file mode 100644
index 0000000000..82f3d8215d
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core_sse4.S
@@ -0,0 +1,229 @@
+/* Function log vectorized with SSE4.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "svml_d_log_data.h"
+
+ .text
+ENTRY (_ZGVbN2v_log_sse4)
+/*
+ ALGORITHM DESCRIPTION:
+
+ log(x) = -log(Rcp) + log(Rcp*x),
+ where Rcp ~ 1/x (accuracy ~9 bits, obtained by rounding
+ HW approximation to 1+9 mantissa bits)
+
+ Reduced argument R=Rcp*x-1 is used to approximate log(1+R) as polynomial
+
+ log(Rcp) = exponent_Rcp*log(2) + log(mantissa_Rcp)
+ -log(mantissa_Rcp) is obtained from a lookup table,
+ accessed by a 9-bit index
+ */
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $320, %rsp
+ movaps %xmm0, %xmm6
+ movq __svml_dlog_data@GOTPCREL(%rip), %r8
+ movaps %xmm6, %xmm3
+ movaps %xmm6, %xmm2
+
+/* isolate exponent bits */
+ movaps %xmm6, %xmm1
+ psrlq $20, %xmm1
+ movups _ExpMask(%r8), %xmm5
+
+/* preserve mantissa, set input exponent to 2^(-10) */
+ andps %xmm6, %xmm5
+ orps _Two10(%r8), %xmm5
+
+/* reciprocal approximation good to at least 11 bits */
+ cvtpd2ps %xmm5, %xmm7
+ cmpltpd _MinNorm(%r8), %xmm3
+ cmpnlepd _MaxNorm(%r8), %xmm2
+ movlhps %xmm7, %xmm7
+
+/* combine and get argument value range mask */
+ orps %xmm2, %xmm3
+ rcpps %xmm7, %xmm0
+ movmskpd %xmm3, %eax
+ movups _HalfMask(%r8), %xmm2
+
+/* argument reduction started: R = Mantissa*Rcp - 1 */
+ andps %xmm5, %xmm2
+ cvtps2pd %xmm0, %xmm4
+ subpd %xmm2, %xmm5
+
+/* round reciprocal to nearest integer, will have 1+9 mantissa bits */
+ roundpd $0, %xmm4, %xmm4
+ mulpd %xmm4, %xmm2
+ mulpd %xmm4, %xmm5
+ subpd _One(%r8), %xmm2
+ addpd %xmm2, %xmm5
+ movups _Threshold(%r8), %xmm2
+
+/* calculate index for table lookup */
+ movaps %xmm4, %xmm3
+ cmpltpd %xmm4, %xmm2
+ pshufd $221, %xmm1, %xmm7
+ psrlq $40, %xmm3
+
+/* convert biased exponent to DP format */
+ cvtdq2pd %xmm7, %xmm0
+ movd %xmm3, %edx
+ movups _poly_coeff_1(%r8), %xmm4
+
+/* polynomial computation */
+ mulpd %xmm5, %xmm4
+ andps _Bias(%r8), %xmm2
+ orps _Bias1(%r8), %xmm2
+
+/*
+ Table stores -log(0.5*mantissa) for larger mantissas,
+ adjust exponent accordingly
+ */
+ subpd %xmm2, %xmm0
+ addpd _poly_coeff_2(%r8), %xmm4
+
+/* exponent*log(2.0) */
+ mulpd _L2(%r8), %xmm0
+ movaps %xmm5, %xmm2
+ mulpd %xmm5, %xmm2
+ movups _poly_coeff_3(%r8), %xmm7
+ mulpd %xmm5, %xmm7
+ mulpd %xmm2, %xmm4
+ addpd _poly_coeff_4(%r8), %xmm7
+ addpd %xmm4, %xmm7
+ mulpd %xmm7, %xmm2
+ movslq %edx, %rdx
+ pextrd $2, %xmm3, %ecx
+
+/*
+ reconstruction:
+ (exponent*log(2)) + (LogRcp + (R+poly))
+ */
+ addpd %xmm2, %xmm5
+ movslq %ecx, %rcx
+ movsd _LogRcp_lookup(%r8,%rdx), %xmm1
+ movhpd _LogRcp_lookup(%r8,%rcx), %xmm1
+ addpd %xmm5, %xmm1
+ addpd %xmm1, %xmm0
+ testl %eax, %eax
+ jne .LBL_1_3
+
+.LBL_1_2:
+ cfi_remember_state
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+
+.LBL_1_3:
+ cfi_restore_state
+ movups %xmm6, 192(%rsp)
+ movups %xmm0, 256(%rsp)
+ je .LBL_1_2
+
+ xorb %cl, %cl
+ xorl %edx, %edx
+ movups %xmm8, 112(%rsp)
+ movups %xmm9, 96(%rsp)
+ movups %xmm10, 80(%rsp)
+ movups %xmm11, 64(%rsp)
+ movups %xmm12, 48(%rsp)
+ movups %xmm13, 32(%rsp)
+ movups %xmm14, 16(%rsp)
+ movups %xmm15, (%rsp)
+ movq %rsi, 136(%rsp)
+ movq %rdi, 128(%rsp)
+ movq %r12, 168(%rsp)
+ cfi_offset_rel_rsp (12, 168)
+ movb %cl, %r12b
+ movq %r13, 160(%rsp)
+ cfi_offset_rel_rsp (13, 160)
+ movl %eax, %r13d
+ movq %r14, 152(%rsp)
+ cfi_offset_rel_rsp (14, 152)
+ movl %edx, %r14d
+ movq %r15, 144(%rsp)
+ cfi_offset_rel_rsp (15, 144)
+ cfi_remember_state
+
+.LBL_1_6:
+ btl %r14d, %r13d
+ jc .LBL_1_12
+
+.LBL_1_7:
+ lea 1(%r14), %esi
+ btl %esi, %r13d
+ jc .LBL_1_10
+
+.LBL_1_8:
+ incb %r12b
+ addl $2, %r14d
+ cmpb $16, %r12b
+ jb .LBL_1_6
+
+ movups 112(%rsp), %xmm8
+ movups 96(%rsp), %xmm9
+ movups 80(%rsp), %xmm10
+ movups 64(%rsp), %xmm11
+ movups 48(%rsp), %xmm12
+ movups 32(%rsp), %xmm13
+ movups 16(%rsp), %xmm14
+ movups (%rsp), %xmm15
+ movq 136(%rsp), %rsi
+ movq 128(%rsp), %rdi
+ movq 168(%rsp), %r12
+ cfi_restore (%r12)
+ movq 160(%rsp), %r13
+ cfi_restore (%r13)
+ movq 152(%rsp), %r14
+ cfi_restore (%r14)
+ movq 144(%rsp), %r15
+ cfi_restore (%r15)
+ movups 256(%rsp), %xmm0
+ jmp .LBL_1_2
+
+.LBL_1_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ shlq $4, %r15
+ movsd 200(%rsp,%r15), %xmm0
+
+ call log@PLT
+
+ movsd %xmm0, 264(%rsp,%r15)
+ jmp .LBL_1_8
+
+.LBL_1_12:
+ movzbl %r12b, %r15d
+ shlq $4, %r15
+ movsd 192(%rsp,%r15), %xmm0
+
+ call log@PLT
+
+ movsd %xmm0, 256(%rsp,%r15)
+ jmp .LBL_1_7
+
+END (_ZGVbN2v_log_sse4)
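A scalar rendering of the reduction described in the header comment of _ZGVbN2v_log_sse4 is sketched below. The frexp-based [0.5, 1) normalization, the 2^-7 reciprocal granularity and the direct -log(rcp) call are illustrative stand-ins for the [1, 2) normalization, the 1+9-bit rcpps/roundpd reciprocal and the _LogRcp_lookup table used above; they are not the kernel's parameters.

  #include <math.h>

  static const double LN2 = 0.6931471805599453;

  /* Assumes x is positive and normal; other inputs are exactly the lanes
     the special-case tail hands to scalar log.  */
  double log_sketch (double x)
  {
    int e;
    double m = frexp (x, &e);                /* x = m * 2^e, 0.5 <= m < 1 */

    /* Reciprocal rounded to a coarse grid, mirroring rcpps + roundpd.  */
    double rcp = nearbyint (128.0 / m) / 128.0;

    /* Reduced argument R = m*rcp - 1 is small, so log(1+R) needs only a
       short polynomial (plain alternating series here).  */
    double r = m * rcp - 1.0;
    double poly = r - r * r / 2.0 + r * r * r / 3.0 - r * r * r * r / 4.0;

    /* log(x) = e*log(2) - log(rcp) + log(1+R); -log(rcp) comes from a
       lookup table in the vector code, from libm here.  */
    return e * LN2 - log (rcp) + poly;
  }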
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core.S
new file mode 100644
index 0000000000..ddb6105405
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core.S
@@ -0,0 +1,38 @@
+/* Multiple versions of vectorized log.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+ .text
+ENTRY (_ZGVdN4v_log)
+ .type _ZGVdN4v_log, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq _ZGVdN4v_log_avx2(%rip), %rax
+ testl $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+ jz 2f
+ ret
+2: leaq _ZGVdN4v_log_sse_wrapper(%rip), %rax
+ ret
+END (_ZGVdN4v_log)
+libmvec_hidden_def (_ZGVdN4v_log)
+
+#define _ZGVdN4v_log _ZGVdN4v_log_sse_wrapper
+#include "../svml_d_log4_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core_avx2.S
new file mode 100644
index 0000000000..816aede395
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core_avx2.S
@@ -0,0 +1,210 @@
+/* Function log vectorized with AVX2.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "svml_d_log_data.h"
+
+ .text
+ENTRY (_ZGVdN4v_log_avx2)
+/* ALGORITHM DESCRIPTION:
+
+ log(x) = -log(Rcp) + log(Rcp*x),
+ where Rcp ~ 1/x (accuracy ~9 bits, obtained by rounding
+ HW approximation to 1+9 mantissa bits)
+
+ Reduced argument R=Rcp*x-1 is used to approximate log(1+R) as polynomial
+
+ log(Rcp) = exponent_Rcp*log(2) + log(mantissa_Rcp)
+ -log(mantissa_Rcp) is obtained from a lookup table,
+ accessed by a 9-bit index
+ */
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $448, %rsp
+ movq __svml_dlog_data@GOTPCREL(%rip), %rax
+ vmovdqa %ymm0, %ymm5
+
+/* isolate exponent bits */
+ vpsrlq $20, %ymm5, %ymm0
+
+/* preserve mantissa, set input exponent to 2^(-10) */
+ vandpd _ExpMask(%rax), %ymm5, %ymm6
+ vorpd _Two10(%rax), %ymm6, %ymm4
+
+/* reciprocal approximation good to at least 11 bits */
+ vcvtpd2ps %ymm4, %xmm7
+ vrcpps %xmm7, %xmm1
+ vcmplt_oqpd _MinNorm(%rax), %ymm5, %ymm7
+ vcvtps2pd %xmm1, %ymm3
+ vcmpnle_uqpd _MaxNorm(%rax), %ymm5, %ymm1
+ vextracti128 $1, %ymm0, %xmm2
+ vshufps $221, %xmm2, %xmm0, %xmm6
+
+/* round reciprocal to nearest integer, will have 1+9 mantissa bits */
+ vroundpd $0, %ymm3, %ymm2
+
+/* convert biased exponent to DP format */
+ vcvtdq2pd %xmm6, %ymm0
+
+/* combine and get argument value range mask */
+ vorpd %ymm1, %ymm7, %ymm3
+ vmovupd _One(%rax), %ymm1
+ vmovmskpd %ymm3, %ecx
+
+/* calculate index for table lookup */
+ vpsrlq $40, %ymm2, %ymm3
+
+/* argument reduction started: R = Mantissa*Rcp - 1 */
+ vfmsub213pd %ymm1, %ymm2, %ymm4
+ vcmpgt_oqpd _Threshold(%rax), %ymm2, %ymm2
+ vpcmpeqd %ymm6, %ymm6, %ymm6
+ vxorpd %ymm1, %ymm1, %ymm1
+ vgatherqpd %ymm6, _LogRcp_lookup(%rax,%ymm3), %ymm1
+
+/* exponent*log(2.0) */
+ vmovupd _poly_coeff_1(%rax), %ymm6
+ vmulpd %ymm4, %ymm4, %ymm3
+
+/* polynomial computation */
+ vfmadd213pd _poly_coeff_2(%rax), %ymm4, %ymm6
+ vandpd _Bias(%rax), %ymm2, %ymm7
+ vorpd _Bias1(%rax), %ymm7, %ymm2
+
+/*
+ Table stores -log(0.5*mantissa) for larger mantissas,
+ adjust exponent accordingly
+ */
+ vsubpd %ymm2, %ymm0, %ymm0
+ vmovupd _poly_coeff_3(%rax), %ymm2
+ vfmadd213pd _poly_coeff_4(%rax), %ymm4, %ymm2
+ vfmadd213pd %ymm2, %ymm3, %ymm6
+
+/*
+ reconstruction:
+ (exponent*log(2)) + (LogRcp + (R+poly))
+ */
+ vfmadd213pd %ymm4, %ymm3, %ymm6
+ vaddpd %ymm1, %ymm6, %ymm4
+ vfmadd132pd _L2(%rax), %ymm4, %ymm0
+ testl %ecx, %ecx
+ jne .LBL_1_3
+
+.LBL_1_2:
+ cfi_remember_state
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+
+.LBL_1_3:
+ cfi_restore_state
+ vmovupd %ymm5, 320(%rsp)
+ vmovupd %ymm0, 384(%rsp)
+ je .LBL_1_2
+
+ xorb %dl, %dl
+ xorl %eax, %eax
+ vmovups %ymm8, 224(%rsp)
+ vmovups %ymm9, 192(%rsp)
+ vmovups %ymm10, 160(%rsp)
+ vmovups %ymm11, 128(%rsp)
+ vmovups %ymm12, 96(%rsp)
+ vmovups %ymm13, 64(%rsp)
+ vmovups %ymm14, 32(%rsp)
+ vmovups %ymm15, (%rsp)
+ movq %rsi, 264(%rsp)
+ movq %rdi, 256(%rsp)
+ movq %r12, 296(%rsp)
+ cfi_offset_rel_rsp (12, 296)
+ movb %dl, %r12b
+ movq %r13, 288(%rsp)
+ cfi_offset_rel_rsp (13, 288)
+ movl %ecx, %r13d
+ movq %r14, 280(%rsp)
+ cfi_offset_rel_rsp (14, 280)
+ movl %eax, %r14d
+ movq %r15, 272(%rsp)
+ cfi_offset_rel_rsp (15, 272)
+ cfi_remember_state
+
+.LBL_1_6:
+ btl %r14d, %r13d
+ jc .LBL_1_12
+
+.LBL_1_7:
+ lea 1(%r14), %esi
+ btl %esi, %r13d
+ jc .LBL_1_10
+
+.LBL_1_8:
+ incb %r12b
+ addl $2, %r14d
+ cmpb $16, %r12b
+ jb .LBL_1_6
+
+ vmovups 224(%rsp), %ymm8
+ vmovups 192(%rsp), %ymm9
+ vmovups 160(%rsp), %ymm10
+ vmovups 128(%rsp), %ymm11
+ vmovups 96(%rsp), %ymm12
+ vmovups 64(%rsp), %ymm13
+ vmovups 32(%rsp), %ymm14
+ vmovups (%rsp), %ymm15
+ vmovupd 384(%rsp), %ymm0
+ movq 264(%rsp), %rsi
+ movq 256(%rsp), %rdi
+ movq 296(%rsp), %r12
+ cfi_restore (%r12)
+ movq 288(%rsp), %r13
+ cfi_restore (%r13)
+ movq 280(%rsp), %r14
+ cfi_restore (%r14)
+ movq 272(%rsp), %r15
+ cfi_restore (%r15)
+ jmp .LBL_1_2
+
+.LBL_1_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ shlq $4, %r15
+ vmovsd 328(%rsp,%r15), %xmm0
+ vzeroupper
+
+ call log@PLT
+
+ vmovsd %xmm0, 392(%rsp,%r15)
+ jmp .LBL_1_8
+
+.LBL_1_12:
+ movzbl %r12b, %r15d
+ shlq $4, %r15
+ vmovsd 320(%rsp,%r15), %xmm0
+ vzeroupper
+
+ call log@PLT
+
+ vmovsd %xmm0, 384(%rsp,%r15)
+ jmp .LBL_1_7
+
+END (_ZGVdN4v_log_avx2)
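The long .LBL_1_3 ... .LBL_1_12 tail shared by all of these kernels implements one simple idea: the fast path computes a per-lane range mask (vmovmskpd/kmovw into %ecx), and any flagged lane is recomputed by spilling the input and result vectors to the stack and calling the scalar libm routine for that element, two lanes per loop iteration. In C the loop amounts to the sketch below; the flat mask/array interface and the names are illustrative, not the spill-slot layout used above.

  #include <math.h>

  /* Recompute the lanes flagged in MASK with scalar log; this is what the
     bit-test loop around "call log@PLT" does.  */
  void log_fixup_lanes (const double *in, double *out,
                        unsigned int mask, int nlanes)
  {
    for (int i = 0; i < nlanes; i++)
      if (mask & (1u << i))            /* lane outside [_MinNorm, _MaxNorm] */
        out[i] = log (in[i]);          /* libm handles 0, negatives, inf, NaN */
  }

Keeping the slow path out of line this way is what lets the fast path avoid any per-lane branching.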
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S
new file mode 100644
index 0000000000..76375fdae0
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S
@@ -0,0 +1,39 @@
+/* Multiple versions of vectorized log.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+ .text
+ENTRY (_ZGVeN8v_log)
+ .type _ZGVeN8v_log, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq _ZGVeN8v_log_skx(%rip), %rax
+ testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
+ jnz 2f
+ leaq _ZGVeN8v_log_knl(%rip), %rax
+ testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
+ jnz 2f
+ leaq _ZGVeN8v_log_avx2_wrapper(%rip), %rax
+2: ret
+END (_ZGVeN8v_log)
+
+#define _ZGVeN8v_log _ZGVeN8v_log_avx2_wrapper
+#include "../svml_d_log8_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
new file mode 100644
index 0000000000..b0f3dd580c
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
@@ -0,0 +1,468 @@
+/* Function log vectorized with AVX-512. KNL and SKX versions.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "svml_d_log_data.h"
+#include "svml_d_wrapper_impl.h"
+
+ .text
+ENTRY (_ZGVeN8v_log_knl)
+#ifndef HAVE_AVX512_ASM_SUPPORT
+WRAPPER_IMPL_AVX512 _ZGVdN4v_log
+#else
+/*
+ ALGORITHM DESCRIPTION:
+
+ log(x) = -log(Rcp) + log(Rcp*x),
+ where Rcp ~ 1/x (accuracy ~9 bits, obtained by
+ rounding HW approximation to 1+9 mantissa bits)
+
+ Reduced argument R=Rcp*x-1 is used to approximate log(1+R) as polynomial
+
+ log(Rcp) = exponent_Rcp*log(2) + log(mantissa_Rcp)
+ -log(mantissa_Rcp) is obtained from a lookup table,
+ accessed by a 9-bit index
+ */
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $1280, %rsp
+ movq __svml_dlog_data@GOTPCREL(%rip), %rdx
+ movq $-1, %rax
+
+/* isolate exponent bits */
+ vpsrlq $20, %zmm0, %zmm2
+ vpsrlq $32, %zmm2, %zmm3
+ vpxord %zmm2, %zmm2, %zmm2
+ kxnorw %k3, %k3, %k3
+ vmovups _Two10(%rdx), %zmm1
+ vmovups _One(%rdx), %zmm9
+ vpmovqd %zmm3, %ymm4
+
+/* convert biased exponent to DP format */
+ vcvtdq2pd %ymm4, %zmm13
+
+/* preserve mantissa, set input exponent to 2^(-10) */
+ vpternlogq $248, _ExpMask(%rdx), %zmm0, %zmm1
+ vcmppd $17, _MinNorm(%rdx), %zmm0, %k1
+
+/* reciprocal approximation good to at least 11 bits */
+ vrcp28pd %zmm1, %zmm5
+ vpbroadcastq %rax, %zmm6{%k1}{z}
+ vmovups _poly_coeff_3(%rdx), %zmm15
+ vcmppd $22, _MaxNorm(%rdx), %zmm0, %k2
+ vmovups _Bias1(%rdx), %zmm14
+
+/* round reciprocal to nearest integer, will have 1+9 mantissa bits */
+ vrndscalepd $8, %zmm5, %zmm11
+ vpbroadcastq %rax, %zmm7{%k2}{z}
+
+/* argument reduction started: R = Mantissa*Rcp - 1 */
+ vfmsub213pd %zmm9, %zmm11, %zmm1
+
+/* calculate index for table lookup */
+ vpsrlq $40, %zmm11, %zmm10
+ vgatherqpd _LogRcp_lookup(%rdx,%zmm10), %zmm2{%k3}
+ vcmppd $30, _Threshold(%rdx), %zmm11, %k1
+
+/* combine and get argument value range mask */
+ vporq %zmm7, %zmm6, %zmm8
+
+/* exponent*log(2.0) */
+ vmovups _poly_coeff_1(%rdx), %zmm11
+ vmulpd %zmm1, %zmm1, %zmm10
+ vptestmq %zmm8, %zmm8, %k0
+ vfmadd213pd _poly_coeff_4(%rdx), %zmm1, %zmm15
+ kmovw %k0, %ecx
+
+/* polynomial computation */
+ vfmadd213pd _poly_coeff_2(%rdx), %zmm1, %zmm11
+ movzbl %cl, %ecx
+ vpbroadcastq %rax, %zmm12{%k1}{z}
+ vfmadd213pd %zmm15, %zmm10, %zmm11
+ vpternlogq $248, _Bias(%rdx), %zmm12, %zmm14
+
+/*
+ Table stores -log(0.5*mantissa) for larger mantissas,
+ adjust exponent accordingly
+ */
+ vsubpd %zmm14, %zmm13, %zmm3
+
+/*
+ reconstruction:
+ (exponent*log(2)) + (LogRcp + (R+poly))
+ */
+ vfmadd213pd %zmm1, %zmm10, %zmm11
+ vaddpd %zmm2, %zmm11, %zmm1
+ vfmadd132pd _L2(%rdx), %zmm1, %zmm3
+ testl %ecx, %ecx
+ jne .LBL_1_3
+
+.LBL_1_2:
+ cfi_remember_state
+ vmovaps %zmm3, %zmm0
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+
+.LBL_1_3:
+ cfi_restore_state
+ vmovups %zmm0, 1152(%rsp)
+ vmovups %zmm3, 1216(%rsp)
+ je .LBL_1_2
+
+ xorb %dl, %dl
+ kmovw %k4, 1048(%rsp)
+ xorl %eax, %eax
+ kmovw %k5, 1040(%rsp)
+ kmovw %k6, 1032(%rsp)
+ kmovw %k7, 1024(%rsp)
+ vmovups %zmm16, 960(%rsp)
+ vmovups %zmm17, 896(%rsp)
+ vmovups %zmm18, 832(%rsp)
+ vmovups %zmm19, 768(%rsp)
+ vmovups %zmm20, 704(%rsp)
+ vmovups %zmm21, 640(%rsp)
+ vmovups %zmm22, 576(%rsp)
+ vmovups %zmm23, 512(%rsp)
+ vmovups %zmm24, 448(%rsp)
+ vmovups %zmm25, 384(%rsp)
+ vmovups %zmm26, 320(%rsp)
+ vmovups %zmm27, 256(%rsp)
+ vmovups %zmm28, 192(%rsp)
+ vmovups %zmm29, 128(%rsp)
+ vmovups %zmm30, 64(%rsp)
+ vmovups %zmm31, (%rsp)
+ movq %rsi, 1064(%rsp)
+ movq %rdi, 1056(%rsp)
+ movq %r12, 1096(%rsp)
+ cfi_offset_rel_rsp (12, 1096)
+ movb %dl, %r12b
+ movq %r13, 1088(%rsp)
+ cfi_offset_rel_rsp (13, 1088)
+ movl %ecx, %r13d
+ movq %r14, 1080(%rsp)
+ cfi_offset_rel_rsp (14, 1080)
+ movl %eax, %r14d
+ movq %r15, 1072(%rsp)
+ cfi_offset_rel_rsp (15, 1072)
+ cfi_remember_state
+
+.LBL_1_6:
+ btl %r14d, %r13d
+ jc .LBL_1_12
+
+.LBL_1_7:
+ lea 1(%r14), %esi
+ btl %esi, %r13d
+ jc .LBL_1_10
+
+.LBL_1_8:
+ addb $1, %r12b
+ addl $2, %r14d
+ cmpb $16, %r12b
+ jb .LBL_1_6
+
+ kmovw 1048(%rsp), %k4
+ movq 1064(%rsp), %rsi
+ kmovw 1040(%rsp), %k5
+ movq 1056(%rsp), %rdi
+ kmovw 1032(%rsp), %k6
+ movq 1096(%rsp), %r12
+ cfi_restore (%r12)
+ movq 1088(%rsp), %r13
+ cfi_restore (%r13)
+ kmovw 1024(%rsp), %k7
+ vmovups 960(%rsp), %zmm16
+ vmovups 896(%rsp), %zmm17
+ vmovups 832(%rsp), %zmm18
+ vmovups 768(%rsp), %zmm19
+ vmovups 704(%rsp), %zmm20
+ vmovups 640(%rsp), %zmm21
+ vmovups 576(%rsp), %zmm22
+ vmovups 512(%rsp), %zmm23
+ vmovups 448(%rsp), %zmm24
+ vmovups 384(%rsp), %zmm25
+ vmovups 320(%rsp), %zmm26
+ vmovups 256(%rsp), %zmm27
+ vmovups 192(%rsp), %zmm28
+ vmovups 128(%rsp), %zmm29
+ vmovups 64(%rsp), %zmm30
+ vmovups (%rsp), %zmm31
+ movq 1080(%rsp), %r14
+ cfi_restore (%r14)
+ movq 1072(%rsp), %r15
+ cfi_restore (%r15)
+ vmovups 1216(%rsp), %zmm3
+ jmp .LBL_1_2
+
+.LBL_1_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ shlq $4, %r15
+ vmovsd 1160(%rsp,%r15), %xmm0
+ call log@PLT
+ vmovsd %xmm0, 1224(%rsp,%r15)
+ jmp .LBL_1_8
+
+.LBL_1_12:
+ movzbl %r12b, %r15d
+ shlq $4, %r15
+ vmovsd 1152(%rsp,%r15), %xmm0
+ call log@PLT
+ vmovsd %xmm0, 1216(%rsp,%r15)
+ jmp .LBL_1_7
+#endif
+END (_ZGVeN8v_log_knl)
+
+ENTRY (_ZGVeN8v_log_skx)
+#ifndef HAVE_AVX512_ASM_SUPPORT
+WRAPPER_IMPL_AVX512 _ZGVdN4v_log
+#else
+/*
+ ALGORITHM DESCRIPTION:
+
+ log(x) = -log(Rcp) + log(Rcp*x),
+ where Rcp ~ 1/x (accuracy ~9 bits,
+ obtained by rounding HW approximation to 1+9 mantissa bits)
+
+ Reduced argument R=Rcp*x-1 is used to approximate log(1+R) as polynomial
+
+ log(Rcp) = exponent_Rcp*log(2) + log(mantissa_Rcp)
+ -log(mantissa_Rcp) is obtained from a lookup table,
+ accessed by a 9-bit index
+ */
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $1280, %rsp
+ movq __svml_dlog_data@GOTPCREL(%rip), %rax
+ vmovaps %zmm0, %zmm3
+ kxnorw %k3, %k3, %k3
+ vmovups _Two10(%rax), %zmm2
+ vmovups _Threshold(%rax), %zmm14
+ vmovups _One(%rax), %zmm11
+ vcmppd $21, _MinNorm(%rax), %zmm3, %k1
+ vcmppd $18, _MaxNorm(%rax), %zmm3, %k2
+
+/* isolate exponent bits */
+ vpsrlq $20, %zmm3, %zmm4
+
+/* preserve mantissa, set input exponent to 2^(-10) */
+ vpternlogq $248, _ExpMask(%rax), %zmm3, %zmm2
+ vpbroadcastq .L_2il0floatpacket.12(%rip), %zmm1
+ vpsrlq $32, %zmm4, %zmm6
+
+/* reciprocal approximation good to at least 11 bits */
+ vrcp14pd %zmm2, %zmm5
+
+/* exponent*log(2.0) */
+ vmovups _poly_coeff_1(%rax), %zmm4
+ vpmovqd %zmm6, %ymm7
+
+/* round reciprocal to nearest integer, will have 1+9 mantissa bits */
+ vrndscalepd $8, %zmm5, %zmm0
+
+/* calculate index for table lookup */
+ vpsrlq $40, %zmm0, %zmm12
+
+/* argument reduction started: R = Mantissa*Rcp - 1 */
+ vfmsub213pd %zmm11, %zmm0, %zmm2
+ vpmovqd %zmm12, %ymm13
+
+/* polynomial computation */
+ vfmadd213pd _poly_coeff_2(%rax), %zmm2, %zmm4
+ vmovaps %zmm1, %zmm8
+ vmovaps %zmm1, %zmm9
+ vpxord %zmm5, %zmm5, %zmm5
+ vgatherdpd _LogRcp_lookup(%rax,%ymm13), %zmm5{%k3}
+ vmovups _Bias1(%rax), %zmm13
+ vpandnq %zmm3, %zmm3, %zmm8{%k1}
+ vcmppd $21, %zmm0, %zmm14, %k1
+ vpandnq %zmm14, %zmm14, %zmm1{%k1}
+ vmulpd %zmm2, %zmm2, %zmm14
+ vpternlogq $248, _Bias(%rax), %zmm1, %zmm13
+ vmovups _poly_coeff_3(%rax), %zmm1
+ vfmadd213pd _poly_coeff_4(%rax), %zmm2, %zmm1
+ vfmadd213pd %zmm1, %zmm14, %zmm4
+
+/*
+ reconstruction:
+ (exponent*log(2)) + (LogRcp + (R+poly))
+ */
+ vfmadd213pd %zmm2, %zmm14, %zmm4
+ vaddpd %zmm5, %zmm4, %zmm2
+ vpandnq %zmm3, %zmm3, %zmm9{%k2}
+
+/* combine and get argument value range mask */
+ vorpd %zmm9, %zmm8, %zmm10
+ vcmppd $3, %zmm10, %zmm10, %k0
+ kmovw %k0, %ecx
+
+/* convert biased exponent to DP format */
+ vcvtdq2pd %ymm7, %zmm15
+
+/*
+ Table stores -log(0.5*mantissa) for larger mantissas,
+ adjust exponent accordingly
+ */
+ vsubpd %zmm13, %zmm15, %zmm0
+ vfmadd132pd _L2(%rax), %zmm2, %zmm0
+ testl %ecx, %ecx
+ jne .LBL_2_3
+
+.LBL_2_2:
+ cfi_remember_state
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+
+.LBL_2_3:
+ cfi_restore_state
+ vmovups %zmm3, 1152(%rsp)
+ vmovups %zmm0, 1216(%rsp)
+ je .LBL_2_2
+
+ xorb %dl, %dl
+ xorl %eax, %eax
+ kmovw %k4, 1048(%rsp)
+ kmovw %k5, 1040(%rsp)
+ kmovw %k6, 1032(%rsp)
+ kmovw %k7, 1024(%rsp)
+ vmovups %zmm16, 960(%rsp)
+ vmovups %zmm17, 896(%rsp)
+ vmovups %zmm18, 832(%rsp)
+ vmovups %zmm19, 768(%rsp)
+ vmovups %zmm20, 704(%rsp)
+ vmovups %zmm21, 640(%rsp)
+ vmovups %zmm22, 576(%rsp)
+ vmovups %zmm23, 512(%rsp)
+ vmovups %zmm24, 448(%rsp)
+ vmovups %zmm25, 384(%rsp)
+ vmovups %zmm26, 320(%rsp)
+ vmovups %zmm27, 256(%rsp)
+ vmovups %zmm28, 192(%rsp)
+ vmovups %zmm29, 128(%rsp)
+ vmovups %zmm30, 64(%rsp)
+ vmovups %zmm31, (%rsp)
+ movq %rsi, 1064(%rsp)
+ movq %rdi, 1056(%rsp)
+ movq %r12, 1096(%rsp)
+ cfi_offset_rel_rsp (12, 1096)
+ movb %dl, %r12b
+ movq %r13, 1088(%rsp)
+ cfi_offset_rel_rsp (13, 1088)
+ movl %ecx, %r13d
+ movq %r14, 1080(%rsp)
+ cfi_offset_rel_rsp (14, 1080)
+ movl %eax, %r14d
+ movq %r15, 1072(%rsp)
+ cfi_offset_rel_rsp (15, 1072)
+ cfi_remember_state
+
+.LBL_2_6:
+ btl %r14d, %r13d
+ jc .LBL_2_12
+
+.LBL_2_7:
+ lea 1(%r14), %esi
+ btl %esi, %r13d
+ jc .LBL_2_10
+
+.LBL_2_8:
+ incb %r12b
+ addl $2, %r14d
+ cmpb $16, %r12b
+ jb .LBL_2_6
+
+ kmovw 1048(%rsp), %k4
+ kmovw 1040(%rsp), %k5
+ kmovw 1032(%rsp), %k6
+ kmovw 1024(%rsp), %k7
+ vmovups 960(%rsp), %zmm16
+ vmovups 896(%rsp), %zmm17
+ vmovups 832(%rsp), %zmm18
+ vmovups 768(%rsp), %zmm19
+ vmovups 704(%rsp), %zmm20
+ vmovups 640(%rsp), %zmm21
+ vmovups 576(%rsp), %zmm22
+ vmovups 512(%rsp), %zmm23
+ vmovups 448(%rsp), %zmm24
+ vmovups 384(%rsp), %zmm25
+ vmovups 320(%rsp), %zmm26
+ vmovups 256(%rsp), %zmm27
+ vmovups 192(%rsp), %zmm28
+ vmovups 128(%rsp), %zmm29
+ vmovups 64(%rsp), %zmm30
+ vmovups (%rsp), %zmm31
+ vmovups 1216(%rsp), %zmm0
+ movq 1064(%rsp), %rsi
+ movq 1056(%rsp), %rdi
+ movq 1096(%rsp), %r12
+ cfi_restore (%r12)
+ movq 1088(%rsp), %r13
+ cfi_restore (%r13)
+ movq 1080(%rsp), %r14
+ cfi_restore (%r14)
+ movq 1072(%rsp), %r15
+ cfi_restore (%r15)
+ jmp .LBL_2_2
+
+.LBL_2_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ shlq $4, %r15
+ vmovsd 1160(%rsp,%r15), %xmm0
+ vzeroupper
+ vmovsd 1160(%rsp,%r15), %xmm0
+
+ call log@PLT
+
+ vmovsd %xmm0, 1224(%rsp,%r15)
+ jmp .LBL_2_8
+
+.LBL_2_12:
+ movzbl %r12b, %r15d
+ shlq $4, %r15
+ vmovsd 1152(%rsp,%r15), %xmm0
+ vzeroupper
+ vmovsd 1152(%rsp,%r15), %xmm0
+
+ call log@PLT
+
+ vmovsd %xmm0, 1216(%rsp,%r15)
+ jmp .LBL_2_7
+#endif
+END (_ZGVeN8v_log_skx)
+
+ .section .rodata, "a"
+.L_2il0floatpacket.12:
+ .long 0xffffffff,0xffffffff
+ .type .L_2il0floatpacket.12,@object
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core.S
new file mode 100644
index 0000000000..f111388922
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core.S
@@ -0,0 +1,38 @@
+/* Multiple versions of vectorized pow.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+ .text
+ENTRY (_ZGVbN2vv_pow)
+ .type _ZGVbN2vv_pow, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq _ZGVbN2vv_pow_sse4(%rip), %rax
+ testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+ jz 2f
+ ret
+2: leaq _ZGVbN2vv_pow_sse2(%rip), %rax
+ ret
+END (_ZGVbN2vv_pow)
+libmvec_hidden_def (_ZGVbN2vv_pow)
+
+#define _ZGVbN2vv_pow _ZGVbN2vv_pow_sse2
+#include "../svml_d_pow2_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core_sse4.S
new file mode 100644
index 0000000000..9f6ec29ac5
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core_sse4.S
@@ -0,0 +1,432 @@
+/* Function pow vectorized with SSE4.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "svml_d_pow_data.h"
+
+ .text
+ENTRY (_ZGVbN2vv_pow_sse4)
+/*
+ ALGORITHM DESCRIPTION:
+
+ 1) Calculating log2|x|
+ Here we use the following formula.
+ Let |x|=2^k1*X1, where k1 is integer, 1<=X1<2.
+ Let C ~= 1/ln(2),
+ Rcp1 ~= 1/X1, X2=Rcp1*X1,
+ Rcp2 ~= 1/X2, X3=Rcp2*X2,
+ Rcp3 ~= 1/X3, Rcp3C ~= C/X3.
+ Then
+ log2|x| = k1 + log2(1/Rcp1) + log2(1/Rcp2) + log2(C/Rcp3C) +
+ log2(X1*Rcp1*Rcp2*Rcp3C/C),
+ where X1*Rcp1*Rcp2*Rcp3C = C*(1+q), q is very small.
+
+ The values of Rcp1, log2(1/Rcp1), Rcp2, log2(1/Rcp2),
+ Rcp3C, log2(C/Rcp3C) are taken from tables.
+ Values of Rcp1, Rcp2, Rcp3C are such that RcpC=Rcp1*Rcp2*Rcp3C
+ is exactly represented in target precision.
+
+ log2(X1*Rcp1*Rcp2*Rcp3C/C) = log2(1+q) = ln(1+q)/ln2 =
+ = 1/(ln2)*q - 1/(2ln2)*q^2 + 1/(3ln2)*q^3 - ... =
+ = 1/(C*ln2)*cq - 1/(2*C^2*ln2)*cq^2 + 1/(3*C^3*ln2)*cq^3 - ... =
+ = (1 + a1)*cq + a2*cq^2 + a3*cq^3 + ...,
+ where cq = X1*Rcp1*Rcp2*Rcp3C-C,
+ a1=1/(C*ln(2))-1 is small,
+ a2=1/(2*C^2*ln2),
+ a3=1/(3*C^3*ln2),
+ ...
+ We get 3 parts of log2 result: HH+HL+HLL ~= log2|x|.
+
+ 2) Calculation of y*(HH+HL+HLL).
+ Split y into YHi+YLo.
+ Get high PH and medium PL parts of y*log2|x|.
+ Get low PLL part of y*log2|x|.
+ Now we have PH+PL+PLL ~= y*log2|x|.
+
+ 3) Calculation of 2^(PH+PL+PLL).
+ Mathematical idea of computing 2^(PH+PL+PLL) is the following.
+ Let's represent PH+PL+PLL in the form N + j/2^expK + Z,
+ where expK=7 in this implementation, N and j are integers,
+ 0<=j<=2^expK-1, |Z|<2^(-expK-1).
+ Hence 2^(PH+PL+PLL) ~= 2^N * 2^(j/2^expK) * 2^Z,
+ where 2^(j/2^expK) is stored in a table, and
+ 2^Z ~= 1 + B1*Z + B2*Z^2 ... + B5*Z^5.
+
+ We compute 2^(PH+PL+PLL) as follows.
+ Break PH into PHH + PHL, where PHH = N + j/2^expK.
+ Z = PHL + PL + PLL
+ Exp2Poly = B1*Z + B2*Z^2 ... + B5*Z^5
+ Get 2^(j/2^expK) from table in the form THI+TLO.
+ Now we have 2^(PH+PL+PLL) ~= 2^N * (THI + TLO) * (1 + Exp2Poly).
+
+ Get significand of 2^(PH+PL+PLL) in the form ResHi+ResLo:
+ ResHi := THI
+ ResLo := THI * Exp2Poly + TLO
+
+ Get exponent ERes of the result:
+ Res := ResHi + ResLo:
+ Result := ex(Res) + N. */
+
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $256, %rsp
+ movq __svml_dpow_data@GOTPCREL(%rip), %rdx
+ movups %xmm14, 80(%rsp)
+ movups %xmm9, 176(%rsp)
+ movaps %xmm1, %xmm9
+ pshufd $221, %xmm0, %xmm1
+ movq _iIndexMask(%rdx), %xmm14
+ movq _iIndexAdd(%rdx), %xmm6
+
+/* i = (((Hi(x) & 0x000ffe00) + 0x00000200) >> 10); -> i = (b1..b11 + 1) / 2 */
+ pand %xmm1, %xmm14
+ paddd %xmm6, %xmm14
+ psrld $10, %xmm14
+ movups %xmm13, 96(%rsp)
+
+/* Index for reciprocal table */
+ movdqa %xmm14, %xmm13
+ pslld $3, %xmm13
+
+/* Index for log2 table */
+ pslld $4, %xmm14
+ movd %xmm13, %eax
+ movups %xmm10, 160(%rsp)
+ movups _iMantissaMask(%rdx), %xmm10
+ movslq %eax, %rax
+
+/* x1 = x; Hi(x1) = (Hi(x1)&0x000fffff)|0x3ff00000 */
+ andps %xmm0, %xmm10
+ pextrd $1, %xmm13, %ecx
+ movslq %ecx, %rcx
+ movups %xmm0, (%rsp)
+ movdqa %xmm1, %xmm0
+
+/* k = Hi(x); k = k - 0x3fe7fe00; k = k >> 20 */
+ movq _i3fe7fe00(%rdx), %xmm6
+ psubd %xmm6, %xmm0
+ movups _iHighMask(%rdx), %xmm6
+ psrad $20, %xmm0
+ movups %xmm15, 48(%rsp)
+ movups %xmm12, 112(%rsp)
+ orps _dbOne(%rdx), %xmm10
+ movsd 11712(%rdx,%rax), %xmm12
+ movd %xmm14, %r8d
+ movq _i2p20_2p19(%rdx), %xmm15
+ movhpd 11712(%rdx,%rcx), %xmm12
+ paddd %xmm15, %xmm0
+ pextrd $1, %xmm14, %r9d
+
+/* x1Hi=x1; Lo(x1Hi)&=0xf8000000; x1Lo = x1-x1Hi */
+ movaps %xmm6, %xmm14
+ andps %xmm10, %xmm14
+ movaps %xmm10, %xmm15
+ subpd %xmm14, %xmm15
+
+/* r1 = x1*rcp1 */
+ mulpd %xmm12, %xmm10
+
+/* E = -r1+__fence(x1Hi*rcp1) */
+ mulpd %xmm12, %xmm14
+
+/* E=E+x1Lo*rcp1 */
+ mulpd %xmm15, %xmm12
+ subpd %xmm10, %xmm14
+ pshufd $80, %xmm0, %xmm0
+ movslq %r8d, %r8
+ andps _iffffffff00000000(%rdx), %xmm0
+ subpd _db2p20_2p19(%rdx), %xmm0
+ addpd %xmm12, %xmm14
+ movslq %r9d, %r9
+
+/* T_Rh_Eh = T_Rh + E */
+ movaps %xmm14, %xmm15
+ movups %xmm8, 208(%rsp)
+ movups 19968(%rdx,%r8), %xmm8
+ movups %xmm11, 144(%rsp)
+ movaps %xmm8, %xmm11
+
+/* cq = c+r1 */
+ movups _LHN(%rdx), %xmm13
+ movhpd 19968(%rdx,%r9), %xmm11
+ addpd %xmm10, %xmm13
+
+/* T = k + L1hi */
+ addpd %xmm0, %xmm11
+
+/* T_Rh = T + cq */
+ movaps %xmm13, %xmm12
+ addpd %xmm11, %xmm12
+ addpd %xmm12, %xmm15
+
+/* Rl = T-T_Rh; -> -Rh */
+ subpd %xmm12, %xmm11
+
+/* HLL = T_Rh - T_Rh_Eh; -> -Eh */
+ subpd %xmm15, %xmm12
+
+/* Rl=Rl+cq; */
+ addpd %xmm13, %xmm11
+
+/* cq = cq + E */
+ addpd %xmm14, %xmm13
+
+/* HLL+=E; -> El */
+ addpd %xmm14, %xmm12
+
+/* HLL+=Rl */
+ addpd %xmm12, %xmm11
+ unpckhpd 19968(%rdx,%r9), %xmm8
+
+/* T_Rh_Eh_HLLhi = T_Rh_Eh + HLL */
+ movaps %xmm15, %xmm14
+
+/* HLL+=L1lo; */
+ addpd %xmm11, %xmm8
+ movups _clv_2(%rdx), %xmm11
+
+/* HH = T_Rh_Eh_HLLhi; Lo(HH)&=0xf8000000 */
+ movaps %xmm6, %xmm12
+
+/* HLL = HLL + (((((((a7)*cq+a6)*cq+a5)*cq+a4)*cq+a3)*cq+a2)*cq+a1)*cq */
+ mulpd %xmm13, %xmm11
+ addpd _clv_3(%rdx), %xmm11
+ mulpd %xmm13, %xmm11
+ addpd _clv_4(%rdx), %xmm11
+ mulpd %xmm13, %xmm11
+ addpd _clv_5(%rdx), %xmm11
+ mulpd %xmm13, %xmm11
+ addpd _clv_6(%rdx), %xmm11
+ mulpd %xmm13, %xmm11
+ addpd _clv_7(%rdx), %xmm11
+ mulpd %xmm11, %xmm13
+ addpd %xmm13, %xmm8
+ addpd %xmm8, %xmm14
+
+/*
+ 2^(y*(HH+HL+HLL)) starts here:
+ yH = y; Lo(yH)&=0xf8000000
+ */
+ andps %xmm9, %xmm6
+
+/* yL = y-yH; */
+ movaps %xmm9, %xmm11
+ subpd %xmm6, %xmm11
+ andps %xmm14, %xmm12
+
+/* HLLhi = T_Rh_Eh_HLLhi - T_Rh_Eh */
+ movaps %xmm14, %xmm10
+
+/* HL = T_Rh_Eh_HLLhi-HH; */
+ subpd %xmm12, %xmm14
+ subpd %xmm15, %xmm10
+ movq _HIDELTA(%rdx), %xmm2
+
+/* pH = yH*HH; */
+ movaps %xmm6, %xmm13
+ movq _LORANGE(%rdx), %xmm3
+ paddd %xmm2, %xmm1
+ pcmpgtd %xmm1, %xmm3
+
+/* pL=yL*HL+yH*HL; pL+=yL*HH; */
+ movaps %xmm11, %xmm1
+ mulpd %xmm14, %xmm1
+ mulpd %xmm14, %xmm6
+ mulpd %xmm12, %xmm13
+ mulpd %xmm11, %xmm12
+ addpd %xmm6, %xmm1
+
+/* HLL = HLL - HLLhi */
+ subpd %xmm10, %xmm8
+ addpd %xmm12, %xmm1
+
+/* pLL = y*HLL */
+ mulpd %xmm9, %xmm8
+ movups _db2p45_2p44(%rdx), %xmm11
+
+/* pHH = pH + *(double*)&db2p45_2p44 */
+ movaps %xmm11, %xmm12
+ addpd %xmm13, %xmm12
+
+/* t=pL+pLL; t+=pHL */
+ addpd %xmm8, %xmm1
+ movq _ABSMASK(%rdx), %xmm5
+ pshufd $221, %xmm9, %xmm4
+ pand %xmm5, %xmm4
+ movq _INF(%rdx), %xmm7
+ movdqa %xmm4, %xmm2
+ pcmpgtd %xmm7, %xmm2
+ pcmpeqd %xmm7, %xmm4
+ pshufd $136, %xmm12, %xmm7
+ por %xmm4, %xmm2
+
+/* pHH = pHH - *(double*)&db2p45_2p44 */
+ subpd %xmm11, %xmm12
+ pshufd $221, %xmm13, %xmm10
+ por %xmm2, %xmm3
+
+/* pHL = pH - pHH; */
+ subpd %xmm12, %xmm13
+ pand %xmm5, %xmm10
+ movq _DOMAINRANGE(%rdx), %xmm5
+ movdqa %xmm10, %xmm4
+ addpd %xmm1, %xmm13
+ pcmpgtd %xmm5, %xmm4
+ pcmpeqd %xmm5, %xmm10
+ por %xmm10, %xmm4
+ movq _jIndexMask(%rdx), %xmm6
+ por %xmm4, %xmm3
+ movmskps %xmm3, %eax
+
+/* j = Lo(pHH)&0x0000007f */
+ pand %xmm7, %xmm6
+ movq _iOne(%rdx), %xmm3
+
+/* _n = Lo(pHH);
+ _n = _n & 0xffffff80;
+ _n = _n >> 7;
+ Hi(_2n) = (0x3ff+_n)<<20; Lo(_2n) = 0; -> 2^n
+ */
+ pslld $13, %xmm7
+ paddd %xmm3, %xmm7
+ pslld $4, %xmm6
+ movups _cev_1(%rdx), %xmm3
+ movaps %xmm13, %xmm4
+ mulpd %xmm13, %xmm3
+
+/* T1 = ((double*)exp2_tbl)[ 2*j ] */
+ movd %xmm6, %r10d
+ pshufd $80, %xmm7, %xmm0
+ andps _ifff0000000000000(%rdx), %xmm0
+ addpd _cev_2(%rdx), %xmm3
+ mulpd %xmm13, %xmm3
+ addpd _cev_3(%rdx), %xmm3
+ mulpd %xmm13, %xmm3
+ movslq %r10d, %r10
+ andl $3, %eax
+ pextrd $1, %xmm6, %r11d
+ movslq %r11d, %r11
+ addpd _cev_4(%rdx), %xmm3
+ movsd 36416(%rdx,%r10), %xmm2
+ movhpd 36416(%rdx,%r11), %xmm2
+ mulpd %xmm2, %xmm0
+ mulpd %xmm3, %xmm13
+ mulpd %xmm0, %xmm4
+ addpd _cev_5(%rdx), %xmm13
+ mulpd %xmm4, %xmm13
+ addpd %xmm13, %xmm0
+ jne .LBL_1_3
+
+.LBL_1_2:
+ cfi_remember_state
+ movups 208(%rsp), %xmm8
+ movups 176(%rsp), %xmm9
+ movups 160(%rsp), %xmm10
+ movups 144(%rsp), %xmm11
+ movups 112(%rsp), %xmm12
+ movups 96(%rsp), %xmm13
+ movups 80(%rsp), %xmm14
+ movups 48(%rsp), %xmm15
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+
+.LBL_1_3:
+ cfi_restore_state
+ movups (%rsp), %xmm1
+ movups %xmm1, 64(%rsp)
+ movups %xmm9, 128(%rsp)
+ movups %xmm0, 192(%rsp)
+ je .LBL_1_2
+
+ xorb %cl, %cl
+ xorl %edx, %edx
+ movq %rsi, 8(%rsp)
+ movq %rdi, (%rsp)
+ movq %r12, 40(%rsp)
+ cfi_offset_rel_rsp (12, 40)
+ movb %cl, %r12b
+ movq %r13, 32(%rsp)
+ cfi_offset_rel_rsp (13, 32)
+ movl %eax, %r13d
+ movq %r14, 24(%rsp)
+ cfi_offset_rel_rsp (14, 24)
+ movl %edx, %r14d
+ movq %r15, 16(%rsp)
+ cfi_offset_rel_rsp (15, 16)
+ cfi_remember_state
+
+.LBL_1_6:
+ btl %r14d, %r13d
+ jc .LBL_1_12
+
+.LBL_1_7:
+ lea 1(%r14), %esi
+ btl %esi, %r13d
+ jc .LBL_1_10
+
+.LBL_1_8:
+ incb %r12b
+ addl $2, %r14d
+ cmpb $16, %r12b
+ jb .LBL_1_6
+
+ movq 8(%rsp), %rsi
+ movq (%rsp), %rdi
+ movq 40(%rsp), %r12
+ cfi_restore (%r12)
+ movq 32(%rsp), %r13
+ cfi_restore (%r13)
+ movq 24(%rsp), %r14
+ cfi_restore (%r14)
+ movq 16(%rsp), %r15
+ cfi_restore (%r15)
+ movups 192(%rsp), %xmm0
+ jmp .LBL_1_2
+
+.LBL_1_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ shlq $4, %r15
+ movsd 72(%rsp,%r15), %xmm0
+ movsd 136(%rsp,%r15), %xmm1
+
+ call pow@PLT
+
+ movsd %xmm0, 200(%rsp,%r15)
+ jmp .LBL_1_8
+
+.LBL_1_12:
+ movzbl %r12b, %r15d
+ shlq $4, %r15
+ movsd 64(%rsp,%r15), %xmm0
+ movsd 128(%rsp,%r15), %xmm1
+
+ call pow@PLT
+
+ movsd %xmm0, 192(%rsp,%r15)
+ jmp .LBL_1_7
+
+END (_ZGVbN2vv_pow_sse4)
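The pow kernel leans heavily on splitting doubles into high and low halves by masking off the low 27 mantissa bits (the "Lo(x1Hi) &= 0xf8000000" and "Lo(yH) &= 0xf8000000" steps, done with _iHighMask above), so that products of two high parts fit in a double without rounding and the discarded bits can be carried separately as the low part. A scalar sketch of that split, assuming normal inputs:

  #include <stdint.h>
  #include <string.h>

  /* Split x into hi + lo, where hi keeps the sign, exponent and top 25
     mantissa bits (low 27 bits cleared).  For normal x the subtraction is
     exact, and hi*hi-style products need at most 52 significand bits.  */
  void split_hi_lo (double x, double *hi, double *lo)
  {
    uint64_t bits;
    memcpy (&bits, &x, sizeof bits);
    bits &= 0xfffffffff8000000ULL;     /* same mask as Lo(x) &= 0xf8000000 */
    memcpy (hi, &bits, sizeof bits);
    *lo = x - *hi;                     /* remaining low-order bits */
  }

This is what lets the kernel accumulate y*log2|x| as the three pieces PH + PL + PLL described in the header comment.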
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core.S
new file mode 100644
index 0000000000..21e3070a42
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core.S
@@ -0,0 +1,38 @@
+/* Multiple versions of vectorized pow.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+ .text
+ENTRY (_ZGVdN4vv_pow)
+ .type _ZGVdN4vv_pow, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq _ZGVdN4vv_pow_avx2(%rip), %rax
+ testl $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+ jz 2f
+ ret
+2: leaq _ZGVdN4vv_pow_sse_wrapper(%rip), %rax
+ ret
+END (_ZGVdN4vv_pow)
+libmvec_hidden_def (_ZGVdN4vv_pow)
+
+#define _ZGVdN4vv_pow _ZGVdN4vv_pow_sse_wrapper
+#include "../svml_d_pow4_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core_avx2.S
new file mode 100644
index 0000000000..f1f1f35ca2
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core_avx2.S
@@ -0,0 +1,387 @@
+/* Function pow vectorized with AVX2.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "svml_d_pow_data.h"
+
+ .text
+ENTRY (_ZGVdN4vv_pow_avx2)
+/*
+ ALGORITHM DESCRIPTION:
+
+ 1) Calculating log2|x|
+ Here we use the following formula.
+ Let |x|=2^k1*X1, where k1 is integer, 1<=X1<2.
+ Let C ~= 1/ln(2),
+ Rcp1 ~= 1/X1, X2=Rcp1*X1,
+ Rcp2 ~= 1/X2, X3=Rcp2*X2,
+ Rcp3 ~= 1/X3, Rcp3C ~= C/X3.
+ Then
+ log2|x| = k1 + log2(1/Rcp1) + log2(1/Rcp2) + log2(C/Rcp3C) +
+ log2(X1*Rcp1*Rcp2*Rcp3C/C),
+ where X1*Rcp1*Rcp2*Rcp3C = C*(1+q), q is very small.
+
+ The values of Rcp1, log2(1/Rcp1), Rcp2, log2(1/Rcp2),
+ Rcp3C, log2(C/Rcp3C) are taken from tables.
+ Values of Rcp1, Rcp2, Rcp3C are such that RcpC=Rcp1*Rcp2*Rcp3C
+ is exactly represented in target precision.
+
+ log2(X1*Rcp1*Rcp2*Rcp3C/C) = log2(1+q) = ln(1+q)/ln2 =
+ = 1/(ln2)*q - 1/(2ln2)*q^2 + 1/(3ln2)*q^3 - ... =
+ = 1/(C*ln2)*cq - 1/(2*C^2*ln2)*cq^2 + 1/(3*C^3*ln2)*cq^3 - ... =
+ = (1 + a1)*cq + a2*cq^2 + a3*cq^3 + ...,
+ where cq = X1*Rcp1*Rcp2*Rcp3C-C,
+ a1=1/(C*ln(2))-1 is small,
+ a2=1/(2*C^2*ln2),
+ a3=1/(3*C^3*ln2),
+ ...
+ We get 3 parts of log2 result: HH+HL+HLL ~= log2|x|.
+
+ 2) Calculation of y*(HH+HL+HLL).
+ Split y into YHi+YLo.
+ Get high PH and medium PL parts of y*log2|x|.
+ Get low PLL part of y*log2|x|.
+ Now we have PH+PL+PLL ~= y*log2|x|.
+
+ 3) Calculation of 2^(PH+PL+PLL).
+ Mathematical idea of computing 2^(PH+PL+PLL) is the following.
+ Let's represent PH+PL+PLL in the form N + j/2^expK + Z,
+ where expK=7 in this implementation, N and j are integers,
+ 0<=j<=2^expK-1, |Z|<2^(-expK-1).
+ Hence 2^(PH+PL+PLL) ~= 2^N * 2^(j/2^expK) * 2^Z,
+ where 2^(j/2^expK) is stored in a table, and
+ 2^Z ~= 1 + B1*Z + B2*Z^2 ... + B5*Z^5.
+
+ We compute 2^(PH+PL+PLL) as follows.
+ Break PH into PHH + PHL, where PHH = N + j/2^expK.
+ Z = PHL + PL + PLL
+ Exp2Poly = B1*Z + B2*Z^2 ... + B5*Z^5
+ Get 2^(j/2^expK) from table in the form THI+TLO.
+ Now we have 2^(PH+PL+PLL) ~= 2^N * (THI + TLO) * (1 + Exp2Poly).
+
+ Get significand of 2^(PH+PL+PLL) in the form ResHi+ResLo:
+ ResHi := THI
+ ResLo := THI * Exp2Poly + TLO
+
+ Get exponent ERes of the result:
+   Res := ResHi + ResLo;
+ Result := ex(Res) + N. */
+
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $448, %rsp
+ movq __svml_dpow_data@GOTPCREL(%rip), %rax
+ vmovups %ymm11, 160(%rsp)
+ vmovups %ymm8, 224(%rsp)
+ vmovups %ymm10, 352(%rsp)
+ vmovups %ymm9, 384(%rsp)
+ vmovups %ymm13, 288(%rsp)
+ vmovapd %ymm1, %ymm11
+ vxorpd %ymm1, %ymm1, %ymm1
+ vextracti128 $1, %ymm0, %xmm5
+ vshufps $221, %xmm5, %xmm0, %xmm5
+
+/* i = (((Hi(x) & 0x000ffe00) + 0x00000200) >> 10); -> i = (b1..b11 + 1) / 2 */
+ vandps _iIndexMask(%rax), %xmm5, %xmm3
+ vpaddd _iIndexAdd(%rax), %xmm3, %xmm6
+ vpsrld $10, %xmm6, %xmm8
+
+/* Index for reciprocal table */
+ vpslld $3, %xmm8, %xmm9
+
+/* Index for log2 table */
+ vpslld $4, %xmm8, %xmm6
+
+/* x1 = x; Hi(x1) = (Hi(x1)&0x000fffff)|0x3ff00000 */
+ vandpd _iMantissaMask(%rax), %ymm0, %ymm4
+ vorpd _dbOne(%rax), %ymm4, %ymm13
+ vpcmpeqd %ymm4, %ymm4, %ymm4
+ vpcmpeqd %ymm8, %ymm8, %ymm8
+
+/* k = Hi(x); k = k - 0x3fe7fe00; k = k >> 20 */
+ vpsubd _i3fe7fe00(%rax), %xmm5, %xmm3
+ vpaddd _HIDELTA(%rax), %xmm5, %xmm5
+ vextracti128 $1, %ymm11, %xmm7
+ vshufps $221, %xmm7, %xmm11, %xmm2
+ vpand _ABSMASK(%rax), %xmm2, %xmm10
+ vpcmpeqd %ymm2, %ymm2, %ymm2
+ vgatherdpd %ymm2, 11712(%rax,%xmm9), %ymm1
+ vmovups _LORANGE(%rax), %xmm7
+ vxorpd %ymm2, %ymm2, %ymm2
+ vgatherdpd %ymm4, 19968(%rax,%xmm6), %ymm2
+ vxorpd %ymm4, %ymm4, %ymm4
+ vgatherdpd %ymm8, 19976(%rax,%xmm6), %ymm4
+ vpsrad $20, %xmm3, %xmm6
+ vpaddd _i2p20_2p19(%rax), %xmm6, %xmm9
+ vpshufd $80, %xmm9, %xmm8
+ vpshufd $250, %xmm9, %xmm3
+
+/* x1Hi=x1; Lo(x1Hi)&=0xf8000000; x1Lo = x1-x1Hi */
+ vandpd _iHighMask(%rax), %ymm13, %ymm9
+ vinserti128 $1, %xmm3, %ymm8, %ymm6
+ vandpd _iffffffff00000000(%rax), %ymm6, %ymm8
+
+/* r1 = x1*rcp1 */
+ vmulpd %ymm1, %ymm13, %ymm6
+ vsubpd %ymm9, %ymm13, %ymm3
+ vsubpd _db2p20_2p19(%rax), %ymm8, %ymm8
+
+/* cq = c+r1 */
+ vaddpd _LHN(%rax), %ymm6, %ymm13
+
+/* E = -r1+__fence(x1Hi*rcp1) */
+ vfmsub213pd %ymm6, %ymm1, %ymm9
+
+/* E=E+x1Lo*rcp1 */
+ vfmadd213pd %ymm9, %ymm1, %ymm3
+
+/* T = k + L1hi */
+ vaddpd %ymm2, %ymm8, %ymm1
+
+/* T_Rh = T + cq */
+ vaddpd %ymm13, %ymm1, %ymm8
+
+/* Rl = T-T_Rh; -> -Rh */
+ vsubpd %ymm8, %ymm1, %ymm6
+
+/* Rl=Rl+cq */
+ vaddpd %ymm6, %ymm13, %ymm1
+
+/* T_Rh_Eh = T_Rh + E */
+ vaddpd %ymm3, %ymm8, %ymm6
+
+/* cq = cq + E */
+ vaddpd %ymm3, %ymm13, %ymm13
+
+/* HLL = T_Rh - T_Rh_Eh; -> -Eh */
+ vsubpd %ymm6, %ymm8, %ymm9
+
+/* HLL+=E; -> El */
+ vaddpd %ymm9, %ymm3, %ymm2
+
+/* HLL+=Rl */
+ vaddpd %ymm1, %ymm2, %ymm8
+
+/* HLL+=L1lo */
+ vaddpd %ymm4, %ymm8, %ymm4
+ vmovupd _clv_2(%rax), %ymm8
+
+/* HLL = HLL + (((((((a7)*cq+a6)*cq+a5)*cq+a4)*cq+a3)*cq+a2)*cq+a1)*cq */
+ vfmadd213pd _clv_3(%rax), %ymm13, %ymm8
+ vfmadd213pd _clv_4(%rax), %ymm13, %ymm8
+ vfmadd213pd _clv_5(%rax), %ymm13, %ymm8
+ vfmadd213pd _clv_6(%rax), %ymm13, %ymm8
+ vfmadd213pd _clv_7(%rax), %ymm13, %ymm8
+ vfmadd213pd %ymm4, %ymm13, %ymm8
+
+/* T_Rh_Eh_HLLhi = T_Rh_Eh + HLL */
+ vaddpd %ymm8, %ymm6, %ymm9
+
+/* HH = T_Rh_Eh_HLLhi; Lo(HH)&=0xf8000000 */
+ vandpd _iHighMask(%rax), %ymm9, %ymm2
+
+/*
+ 2^(y*(HH+HL+HLL)) starts here:
+ yH = y; Lo(yH)&=0xf8000000;
+ */
+ vandpd _iHighMask(%rax), %ymm11, %ymm1
+
+/* HLLhi = T_Rh_Eh_HLLhi - T_Rh_Eh */
+ vsubpd %ymm6, %ymm9, %ymm13
+
+/* HL = T_Rh_Eh_HLLhi-HH */
+ vsubpd %ymm2, %ymm9, %ymm4
+
+/* pH = yH*HH */
+ vmulpd %ymm2, %ymm1, %ymm9
+
+/* HLL = HLL - HLLhi */
+ vsubpd %ymm13, %ymm8, %ymm6
+
+/* yL = y-yH */
+ vsubpd %ymm1, %ymm11, %ymm8
+ vextracti128 $1, %ymm9, %xmm3
+ vshufps $221, %xmm3, %xmm9, %xmm13
+ vpand _ABSMASK(%rax), %xmm13, %xmm3
+ vpcmpgtd %xmm5, %xmm7, %xmm13
+ vpcmpgtd _INF(%rax), %xmm10, %xmm7
+ vpcmpeqd _INF(%rax), %xmm10, %xmm10
+ vpor %xmm10, %xmm7, %xmm7
+ vpor %xmm7, %xmm13, %xmm5
+
+/* pL=yL*HL+yH*HL; pL+=yL*HH */
+ vmulpd %ymm4, %ymm8, %ymm7
+ vpcmpgtd _DOMAINRANGE(%rax), %xmm3, %xmm13
+ vpcmpeqd _DOMAINRANGE(%rax), %xmm3, %xmm10
+ vpor %xmm10, %xmm13, %xmm3
+ vpor %xmm3, %xmm5, %xmm13
+ vfmadd213pd %ymm7, %ymm4, %ymm1
+
+/* pLL = y*HLL;
+ pHH = pH + *(double*)&db2p45_2p44
+ */
+ vaddpd _db2p45_2p44(%rax), %ymm9, %ymm7
+ vmovmskps %xmm13, %ecx
+ vfmadd213pd %ymm1, %ymm2, %ymm8
+
+/* t=pL+pLL; t+=pHL */
+ vfmadd231pd %ymm11, %ymm6, %ymm8
+ vextracti128 $1, %ymm7, %xmm1
+ vshufps $136, %xmm1, %xmm7, %xmm10
+
+/* _n = Lo(pHH);
+ _n = _n & 0xffffff80;
+ _n = _n >> 7;
+ Hi(_2n) = (0x3ff+_n)<<20; Lo(_2n) = 0; -> 2^n
+ */
+ vpslld $13, %xmm10, %xmm2
+ vpaddd _iOne(%rax), %xmm2, %xmm13
+ vpshufd $80, %xmm13, %xmm4
+ vpshufd $250, %xmm13, %xmm1
+
+/* j = Lo(pHH)&0x0000007f */
+ vandps _jIndexMask(%rax), %xmm10, %xmm3
+
+/* T1 = ((double*)exp2_tbl)[ 2*j ] */
+ vpcmpeqd %ymm10, %ymm10, %ymm10
+ vpslld $4, %xmm3, %xmm5
+
+/* pHH = pHH - *(double*)&db2p45_2p44 */
+ vsubpd _db2p45_2p44(%rax), %ymm7, %ymm7
+
+/* pHL = pH - pHH */
+ vsubpd %ymm7, %ymm9, %ymm9
+ vaddpd %ymm9, %ymm8, %ymm6
+ vinserti128 $1, %xmm1, %ymm4, %ymm2
+ vxorpd %ymm1, %ymm1, %ymm1
+ vgatherdpd %ymm10, 36416(%rax,%xmm5), %ymm1
+ vandpd _ifff0000000000000(%rax), %ymm2, %ymm13
+ vmovupd _cev_1(%rax), %ymm2
+ vmulpd %ymm1, %ymm13, %ymm1
+ vfmadd213pd _cev_2(%rax), %ymm6, %ymm2
+ vmulpd %ymm6, %ymm1, %ymm8
+ vfmadd213pd _cev_3(%rax), %ymm6, %ymm2
+ vfmadd213pd _cev_4(%rax), %ymm6, %ymm2
+ vfmadd213pd _cev_5(%rax), %ymm6, %ymm2
+ vfmadd213pd %ymm1, %ymm8, %ymm2
+ testl %ecx, %ecx
+ jne .LBL_1_3
+
+.LBL_1_2:
+ cfi_remember_state
+ vmovups 224(%rsp), %ymm8
+ vmovups 384(%rsp), %ymm9
+ vmovups 352(%rsp), %ymm10
+ vmovups 160(%rsp), %ymm11
+ vmovups 288(%rsp), %ymm13
+ vmovdqa %ymm2, %ymm0
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+
+.LBL_1_3:
+ cfi_restore_state
+ vmovupd %ymm0, 192(%rsp)
+ vmovupd %ymm11, 256(%rsp)
+ vmovupd %ymm2, 320(%rsp)
+ je .LBL_1_2
+
+ xorb %dl, %dl
+ xorl %eax, %eax
+ vmovups %ymm12, 64(%rsp)
+ vmovups %ymm14, 32(%rsp)
+ vmovups %ymm15, (%rsp)
+ movq %rsi, 104(%rsp)
+ movq %rdi, 96(%rsp)
+ movq %r12, 136(%rsp)
+ cfi_offset_rel_rsp (12, 136)
+ movb %dl, %r12b
+ movq %r13, 128(%rsp)
+ cfi_offset_rel_rsp (13, 128)
+ movl %ecx, %r13d
+ movq %r14, 120(%rsp)
+ cfi_offset_rel_rsp (14, 120)
+ movl %eax, %r14d
+ movq %r15, 112(%rsp)
+ cfi_offset_rel_rsp (15, 112)
+ cfi_remember_state
+
+.LBL_1_6:
+ btl %r14d, %r13d
+ jc .LBL_1_12
+
+.LBL_1_7:
+ lea 1(%r14), %esi
+ btl %esi, %r13d
+ jc .LBL_1_10
+
+.LBL_1_8:
+ incb %r12b
+ addl $2, %r14d
+ cmpb $16, %r12b
+ jb .LBL_1_6
+
+ vmovups 64(%rsp), %ymm12
+ vmovups 32(%rsp), %ymm14
+ vmovups (%rsp), %ymm15
+ vmovupd 320(%rsp), %ymm2
+ movq 104(%rsp), %rsi
+ movq 96(%rsp), %rdi
+ movq 136(%rsp), %r12
+ cfi_restore (%r12)
+ movq 128(%rsp), %r13
+ cfi_restore (%r13)
+ movq 120(%rsp), %r14
+ cfi_restore (%r14)
+ movq 112(%rsp), %r15
+ cfi_restore (%r15)
+ jmp .LBL_1_2
+
+.LBL_1_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ shlq $4, %r15
+ vmovsd 200(%rsp,%r15), %xmm0
+ vmovsd 264(%rsp,%r15), %xmm1
+ vzeroupper
+
+ call pow@PLT
+
+ vmovsd %xmm0, 328(%rsp,%r15)
+ jmp .LBL_1_8
+
+.LBL_1_12:
+ movzbl %r12b, %r15d
+ shlq $4, %r15
+ vmovsd 192(%rsp,%r15), %xmm0
+ vmovsd 256(%rsp,%r15), %xmm1
+ vzeroupper
+
+ call pow@PLT
+
+ vmovsd %xmm0, 320(%rsp,%r15)
+ jmp .LBL_1_7
+
+END (_ZGVdN4vv_pow_avx2)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S
new file mode 100644
index 0000000000..c1e5e76f92
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S
@@ -0,0 +1,39 @@
+/* Multiple versions of vectorized pow.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+ .text
+ENTRY (_ZGVeN8vv_pow)
+ .type _ZGVeN8vv_pow, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq _ZGVeN8vv_pow_skx(%rip), %rax
+ testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
+ jnz 2f
+ leaq _ZGVeN8vv_pow_knl(%rip), %rax
+ testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
+ jnz 2f
+ leaq _ZGVeN8vv_pow_avx2_wrapper(%rip), %rax
+2: ret
+END (_ZGVeN8vv_pow)
+
+#define _ZGVeN8vv_pow _ZGVeN8vv_pow_avx2_wrapper
+#include "../svml_d_pow8_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core_avx512.S
new file mode 100644
index 0000000000..8dd89c8ebb
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core_avx512.S
@@ -0,0 +1,741 @@
+/* Function pow vectorized with AVX-512. KNL and SKX versions.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "svml_d_pow_data.h"
+#include "svml_d_wrapper_impl.h"
+
+/* ALGORITHM DESCRIPTION:
+
+ 1) Calculating log2|x|
+ Here we use the following formula.
+ Let |x|=2^k1*X1, where k1 is integer, 1<=X1<2.
+ Let C ~= 1/ln(2),
+ Rcp1 ~= 1/X1, X2=Rcp1*X1,
+ Rcp2 ~= 1/X2, X3=Rcp2*X2,
+ Rcp3 ~= 1/X3, Rcp3C ~= C/X3.
+ Then
+ log2|x| = k1 + log2(1/Rcp1) + log2(1/Rcp2) + log2(C/Rcp3C) +
+ log2(X1*Rcp1*Rcp2*Rcp3C/C),
+ where X1*Rcp1*Rcp2*Rcp3C = C*(1+q), q is very small.
+
+ The values of Rcp1, log2(1/Rcp1), Rcp2, log2(1/Rcp2),
+ Rcp3C, log2(C/Rcp3C) are taken from tables.
+ Values of Rcp1, Rcp2, Rcp3C are such that RcpC=Rcp1*Rcp2*Rcp3C
+ is exactly represented in target precision.
+
+ log2(X1*Rcp1*Rcp2*Rcp3C/C) = log2(1+q) = ln(1+q)/ln2 =
+ = 1/(ln2)*q - 1/(2ln2)*q^2 + 1/(3ln2)*q^3 - ... =
+ = 1/(C*ln2)*cq - 1/(2*C^2*ln2)*cq^2 + 1/(3*C^3*ln2)*cq^3 - ... =
+ = (1 + a1)*cq + a2*cq^2 + a3*cq^3 + ...,
+ where cq = X1*Rcp1*Rcp2*Rcp3C-C,
+ a1=1/(C*ln(2))-1 is small,
+ a2=1/(2*C^2*ln2),
+ a3=1/(3*C^3*ln2),
+ ...
+ We get 3 parts of log2 result: HH+HL+HLL ~= log2|x|.
+
+ 2) Calculation of y*(HH+HL+HLL).
+ Split y into YHi+YLo.
+ Get high PH and medium PL parts of y*log2|x|.
+ Get low PLL part of y*log2|x|.
+ Now we have PH+PL+PLL ~= y*log2|x|.
+
+ 3) Calculation of 2^(PH+PL+PLL).
+ Mathematical idea of computing 2^(PH+PL+PLL) is the following.
+ Let's represent PH+PL+PLL in the form N + j/2^expK + Z,
+ where expK=7 in this implementation, N and j are integers,
+ 0<=j<=2^expK-1, |Z|<2^(-expK-1).
+ Hence 2^(PH+PL+PLL) ~= 2^N * 2^(j/2^expK) * 2^Z,
+ where 2^(j/2^expK) is stored in a table, and
+ 2^Z ~= 1 + B1*Z + B2*Z^2 ... + B5*Z^5.
+
+ We compute 2^(PH+PL+PLL) as follows.
+ Break PH into PHH + PHL, where PHH = N + j/2^expK.
+ Z = PHL + PL + PLL
+ Exp2Poly = B1*Z + B2*Z^2 ... + B5*Z^5
+ Get 2^(j/2^expK) from table in the form THI+TLO.
+ Now we have 2^(PH+PL+PLL) ~= 2^N * (THI + TLO) * (1 + Exp2Poly).
+
+ Get significand of 2^(PH+PL+PLL) in the form ResHi+ResLo:
+ ResHi := THI
+ ResLo := THI * Exp2Poly + TLO
+
+ Get exponent ERes of the result:
+   Res := ResHi + ResLo;
+ Result := ex(Res) + N. */
+
+ .text
+ENTRY (_ZGVeN8vv_pow_knl)
+#ifndef HAVE_AVX512_ASM_SUPPORT
+WRAPPER_IMPL_AVX512_ff _ZGVdN4vv_pow
+#else
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $1344, %rsp
+ vpsrlq $32, %zmm0, %zmm13
+ vmovaps %zmm1, %zmm12
+ movq __svml_dpow_data@GOTPCREL(%rip), %rax
+ movl $255, %edx
+ vpmovqd %zmm13, %ymm10
+ vpsrlq $32, %zmm12, %zmm14
+ kmovw %edx, %k1
+ movl $-1, %ecx
+ vpmovqd %zmm14, %ymm15
+
+/* x1 = x; Hi(x1) = (Hi(x1)&0x000fffff)|0x3ff00000 */
+ vmovups _dbOne(%rax), %zmm6
+
+/* i = (((Hi(x) & 0x000ffe00) + 0x00000200) >> 10); -> i = (b1..b11 + 1) / 2 */
+ vmovaps %zmm10, %zmm5
+
+/* k = Hi(x); k = k - 0x3fe7fe00; k = k >> 20 */
+ vpsubd _i3fe7fe00(%rax), %zmm10, %zmm14{%k1}
+ vpandd _iIndexMask(%rax), %zmm10, %zmm5{%k1}
+ vpsrad $20, %zmm14, %zmm14{%k1}
+ vpxord %zmm9, %zmm9, %zmm9
+ vpaddd _HIDELTA(%rax), %zmm10, %zmm3{%k1}
+ vpaddd _iIndexAdd(%rax), %zmm5, %zmm5{%k1}
+ vpxord %zmm7, %zmm7, %zmm7
+ vpaddd _i2p20_2p19(%rax), %zmm14, %zmm14{%k1}
+ vpcmpd $1, _LORANGE(%rax), %zmm3, %k2{%k1}
+ vpsrld $10, %zmm5, %zmm5{%k1}
+ vpandd _ABSMASK(%rax), %zmm15, %zmm2{%k1}
+ vpbroadcastd %ecx, %zmm1{%k2}{z}
+
+/* Index for reciprocal table */
+ vpslld $3, %zmm5, %zmm8{%k1}
+ kxnorw %k2, %k2, %k2
+ vgatherdpd 11712(%rax,%ymm8), %zmm9{%k2}
+ vpmovzxdq %ymm14, %zmm10
+
+/* Index for log2 table */
+ vpslld $4, %zmm5, %zmm13{%k1}
+ kxnorw %k2, %k2, %k2
+ vpsllq $32, %zmm10, %zmm3
+ vpxord %zmm8, %zmm8, %zmm8
+ vpcmpd $5, _INF(%rax), %zmm2, %k3{%k1}
+ vpbroadcastd %ecx, %zmm4{%k3}{z}
+ vpternlogq $248, _iMantissaMask(%rax), %zmm0, %zmm6
+ kxnorw %k3, %k3, %k3
+ vpternlogq $168, _iffffffff00000000(%rax), %zmm10, %zmm3
+
+/* x1Hi=x1; Lo(x1Hi)&=0xf8000000; x1Lo = x1-x1Hi */
+ vpandq _iHighMask(%rax), %zmm6, %zmm2
+ vgatherdpd 19976(%rax,%ymm13), %zmm8{%k2}
+ vpord %zmm4, %zmm1, %zmm11{%k1}
+ vsubpd _db2p20_2p19(%rax), %zmm3, %zmm1
+ vsubpd %zmm2, %zmm6, %zmm5
+
+/* r1 = x1*rcp1 */
+ vmulpd %zmm9, %zmm6, %zmm6
+ vgatherdpd 19968(%rax,%ymm13), %zmm7{%k3}
+
+/* cq = c+r1 */
+ vaddpd _LHN(%rax), %zmm6, %zmm4
+
+/* E = -r1+__fence(x1Hi*rcp1) */
+ vfmsub213pd %zmm6, %zmm9, %zmm2
+
+/* T = k + L1hi */
+ vaddpd %zmm7, %zmm1, %zmm7
+
+/* E=E+x1Lo*rcp1 */
+ vfmadd213pd %zmm2, %zmm9, %zmm5
+
+/* T_Rh = T + cq */
+ vaddpd %zmm4, %zmm7, %zmm3
+
+/* Rl = T-T_Rh; -> -Rh */
+ vsubpd %zmm3, %zmm7, %zmm9
+
+/* Rl=Rl+cq */
+ vaddpd %zmm9, %zmm4, %zmm6
+
+/* T_Rh_Eh = T_Rh + E */
+ vaddpd %zmm5, %zmm3, %zmm9
+
+/* HLL = T_Rh - T_Rh_Eh; -> -Eh */
+ vsubpd %zmm9, %zmm3, %zmm2
+
+/* cq = cq + E; */
+ vaddpd %zmm5, %zmm4, %zmm4
+
+/* HLL+=E; -> El */
+ vaddpd %zmm2, %zmm5, %zmm1
+ vmovups _clv_2(%rax), %zmm5
+
+/* HLL = HLL + (((((((a7)*cq+a6)*cq+a5)*cq+a4)*cq+a3)*cq+a2)*cq+a1)*cq */
+ vfmadd213pd _clv_3(%rax), %zmm4, %zmm5
+
+/* HLL+=Rl */
+ vaddpd %zmm6, %zmm1, %zmm7
+
+/* 2^(y*(HH+HL+HLL)) starts here:
+ yH = y; Lo(yH)&=0xf8000000
+ */
+ vpandq _iHighMask(%rax), %zmm12, %zmm6
+
+/* yL = y-yH */
+ vsubpd %zmm6, %zmm12, %zmm2
+ vfmadd213pd _clv_4(%rax), %zmm4, %zmm5
+
+/* HLL+=L1lo */
+ vaddpd %zmm8, %zmm7, %zmm8
+ vfmadd213pd _clv_5(%rax), %zmm4, %zmm5
+ vfmadd213pd _clv_6(%rax), %zmm4, %zmm5
+ vfmadd213pd _clv_7(%rax), %zmm4, %zmm5
+ vfmadd213pd %zmm8, %zmm4, %zmm5
+
+/* T_Rh_Eh_HLLhi = T_Rh_Eh + HLL */
+ vaddpd %zmm5, %zmm9, %zmm13
+
+/* HLLhi = T_Rh_Eh_HLLhi - T_Rh_Eh */
+ vsubpd %zmm9, %zmm13, %zmm10
+
+/* HLL = HLL - HLLhi */
+ vsubpd %zmm10, %zmm5, %zmm3
+
+/* HH = T_Rh_Eh_HLLhi; Lo(HH)&=0xf8000000 */
+ vpandq _iHighMask(%rax), %zmm13, %zmm5
+
+/* pH = yH*HH */
+ vmulpd %zmm5, %zmm6, %zmm1
+
+/* HL = T_Rh_Eh_HLLhi-HH */
+ vsubpd %zmm5, %zmm13, %zmm4
+ vpsrlq $32, %zmm1, %zmm14
+
+/* pLL = y*HLL;
+ pHH = pH + *(double*)&db2p45_2p44
+ */
+ vaddpd _db2p45_2p44(%rax), %zmm1, %zmm10
+ vpmovqd %zmm14, %ymm15
+ vpandd _ABSMASK(%rax), %zmm15, %zmm14{%k1}
+ vpcmpd $5, _DOMAINRANGE(%rax), %zmm14, %k3{%k1}
+
+/* T1 = ((double*)exp2_tbl)[ 2*j ] */
+ vpxord %zmm14, %zmm14, %zmm14
+ vpbroadcastd %ecx, %zmm13{%k3}{z}
+ vpord %zmm13, %zmm11, %zmm11{%k1}
+ vptestmd %zmm11, %zmm11, %k0{%k1}
+
+/* pL=yL*HL+yH*HL; pL+=yL*HH */
+ vmulpd %zmm4, %zmm2, %zmm11
+ kmovw %k0, %ecx
+ vfmadd213pd %zmm11, %zmm4, %zmm6
+
+/* pHH = pHH - *(double*)&db2p45_2p44 */
+ vsubpd _db2p45_2p44(%rax), %zmm10, %zmm11
+ vpmovqd %zmm10, %ymm4
+ movzbl %cl, %ecx
+
+/* _n = Lo(pHH);
+ _n = _n & 0xffffff80;
+ _n = _n >> 7;
+ Hi(_2n) = (0x3ff+_n)<<20; Lo(_2n) = 0; -> 2^n
+ */
+ vpslld $13, %zmm4, %zmm7{%k1}
+
+/* j = Lo(pHH)&0x0000007f */
+ vpandd _jIndexMask(%rax), %zmm4, %zmm9{%k1}
+ vfmadd213pd %zmm6, %zmm5, %zmm2
+
+/* pHL = pH - pHH */
+ vsubpd %zmm11, %zmm1, %zmm1
+ vpaddd _iOne(%rax), %zmm7, %zmm7{%k1}
+
+/* t=pL+pLL; t+=pHL */
+ vfmadd231pd %zmm12, %zmm3, %zmm2
+ vpslld $4, %zmm9, %zmm9{%k1}
+ kxnorw %k1, %k1, %k1
+ vgatherdpd 36416(%rax,%ymm9), %zmm14{%k1}
+ vpmovzxdq %ymm7, %zmm8
+ vaddpd %zmm1, %zmm2, %zmm2
+ vmovups _cev_1(%rax), %zmm1
+ vpsllq $32, %zmm8, %zmm13
+ vpternlogq $168, _ifff0000000000000(%rax), %zmm8, %zmm13
+ vfmadd213pd _cev_2(%rax), %zmm2, %zmm1
+ vmulpd %zmm14, %zmm13, %zmm15
+ vfmadd213pd _cev_3(%rax), %zmm2, %zmm1
+ vmulpd %zmm2, %zmm15, %zmm3
+ vfmadd213pd _cev_4(%rax), %zmm2, %zmm1
+ vfmadd213pd _cev_5(%rax), %zmm2, %zmm1
+ vfmadd213pd %zmm15, %zmm3, %zmm1
+ testl %ecx, %ecx
+ jne .LBL_1_3
+
+.LBL_1_2:
+ cfi_remember_state
+ vmovaps %zmm1, %zmm0
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+
+.LBL_1_3:
+ cfi_restore_state
+ vmovups %zmm0, 1152(%rsp)
+ vmovups %zmm12, 1216(%rsp)
+ vmovups %zmm1, 1280(%rsp)
+ je .LBL_1_2
+
+ xorb %dl, %dl
+ kmovw %k4, 1048(%rsp)
+ xorl %eax, %eax
+ kmovw %k5, 1040(%rsp)
+ kmovw %k6, 1032(%rsp)
+ kmovw %k7, 1024(%rsp)
+ vmovups %zmm16, 960(%rsp)
+ vmovups %zmm17, 896(%rsp)
+ vmovups %zmm18, 832(%rsp)
+ vmovups %zmm19, 768(%rsp)
+ vmovups %zmm20, 704(%rsp)
+ vmovups %zmm21, 640(%rsp)
+ vmovups %zmm22, 576(%rsp)
+ vmovups %zmm23, 512(%rsp)
+ vmovups %zmm24, 448(%rsp)
+ vmovups %zmm25, 384(%rsp)
+ vmovups %zmm26, 320(%rsp)
+ vmovups %zmm27, 256(%rsp)
+ vmovups %zmm28, 192(%rsp)
+ vmovups %zmm29, 128(%rsp)
+ vmovups %zmm30, 64(%rsp)
+ vmovups %zmm31, (%rsp)
+ movq %rsi, 1064(%rsp)
+ movq %rdi, 1056(%rsp)
+ movq %r12, 1096(%rsp)
+ cfi_offset_rel_rsp (12, 1096)
+ movb %dl, %r12b
+ movq %r13, 1088(%rsp)
+ cfi_offset_rel_rsp (13, 1088)
+ movl %ecx, %r13d
+ movq %r14, 1080(%rsp)
+ cfi_offset_rel_rsp (14, 1080)
+ movl %eax, %r14d
+ movq %r15, 1072(%rsp)
+ cfi_offset_rel_rsp (15, 1072)
+ cfi_remember_state
+
+.LBL_1_6:
+ btl %r14d, %r13d
+ jc .LBL_1_12
+
+.LBL_1_7:
+ lea 1(%r14), %esi
+ btl %esi, %r13d
+ jc .LBL_1_10
+
+.LBL_1_8:
+ addb $1, %r12b
+ addl $2, %r14d
+ cmpb $16, %r12b
+ jb .LBL_1_6
+
+ kmovw 1048(%rsp), %k4
+ movq 1064(%rsp), %rsi
+ kmovw 1040(%rsp), %k5
+ movq 1056(%rsp), %rdi
+ kmovw 1032(%rsp), %k6
+ movq 1096(%rsp), %r12
+ cfi_restore (%r12)
+ movq 1088(%rsp), %r13
+ cfi_restore (%r13)
+ kmovw 1024(%rsp), %k7
+ vmovups 960(%rsp), %zmm16
+ vmovups 896(%rsp), %zmm17
+ vmovups 832(%rsp), %zmm18
+ vmovups 768(%rsp), %zmm19
+ vmovups 704(%rsp), %zmm20
+ vmovups 640(%rsp), %zmm21
+ vmovups 576(%rsp), %zmm22
+ vmovups 512(%rsp), %zmm23
+ vmovups 448(%rsp), %zmm24
+ vmovups 384(%rsp), %zmm25
+ vmovups 320(%rsp), %zmm26
+ vmovups 256(%rsp), %zmm27
+ vmovups 192(%rsp), %zmm28
+ vmovups 128(%rsp), %zmm29
+ vmovups 64(%rsp), %zmm30
+ vmovups (%rsp), %zmm31
+ movq 1080(%rsp), %r14
+ cfi_restore (%r14)
+ movq 1072(%rsp), %r15
+ cfi_restore (%r15)
+ vmovups 1280(%rsp), %zmm1
+ jmp .LBL_1_2
+
+.LBL_1_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ shlq $4, %r15
+ vmovsd 1160(%rsp,%r15), %xmm0
+ vmovsd 1224(%rsp,%r15), %xmm1
+ call pow@PLT
+ vmovsd %xmm0, 1288(%rsp,%r15)
+ jmp .LBL_1_8
+
+.LBL_1_12:
+ movzbl %r12b, %r15d
+ shlq $4, %r15
+ vmovsd 1152(%rsp,%r15), %xmm0
+ vmovsd 1216(%rsp,%r15), %xmm1
+ call pow@PLT
+ vmovsd %xmm0, 1280(%rsp,%r15)
+ jmp .LBL_1_7
+
+#endif
+END (_ZGVeN8vv_pow_knl)
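
The KNL and SKX bodies fetch the reciprocal, log2 and 2^(j/128) table entries with masked vgatherdpd; the AVX2 body does the same with 256-bit gathers. A small intrinsics sketch of that access pattern (requires -mavx2), with a hypothetical table standing in for the entries inside __svml_dpow_data:

#include <immintrin.h>
#include <math.h>

static double exp2_hi_table[128];            /* hypothetical stand-in table */

static void
init_table (void)                            /* fill 2^(j/128), j = 0..127 */
{
  for (int j = 0; j < 128; j++)
    exp2_hi_table[j] = exp2 (j / 128.0);
}

/* Gather four doubles whose element indices (0..127) sit in an XMM register.
   The assembly gathers by byte offset instead: indices are pre-scaled with
   vpslld $3/$4 because the rows of __svml_dpow_data are 8 or 16 bytes wide
   (the exp2 table is indexed as exp2_tbl[2*j], a hi/lo pair per entry).  */
static __m256d
gather_exp2_hi (__m128i j)
{
  return _mm256_i32gather_pd (exp2_hi_table, j, 8 /* sizeof (double) */);
}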
+
+ENTRY (_ZGVeN8vv_pow_skx)
+#ifndef HAVE_AVX512_ASM_SUPPORT
+WRAPPER_IMPL_AVX512_ff _ZGVdN4vv_pow
+#else
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $1344, %rsp
+ vpsrlq $32, %zmm0, %zmm10
+ kxnorw %k1, %k1, %k1
+ kxnorw %k2, %k2, %k2
+ kxnorw %k3, %k3, %k3
+ vpmovqd %zmm10, %ymm7
+ movq __svml_dpow_data@GOTPCREL(%rip), %rax
+ vmovaps %zmm1, %zmm6
+ vpsrlq $32, %zmm6, %zmm13
+
+/* i = (((Hi(x) & 0x000ffe00) + 0x00000200) >> 10); -> i = (b1..b11 + 1) / 2 */
+ vpand _iIndexMask(%rax), %ymm7, %ymm15
+ vpaddd _HIDELTA(%rax), %ymm7, %ymm2
+
+/* k = Hi(x); k = k - 0x3fe7fe00; k = k >> 20 */
+ vpsubd _i3fe7fe00(%rax), %ymm7, %ymm7
+ vmovdqu _ABSMASK(%rax), %ymm4
+ vmovdqu _LORANGE(%rax), %ymm3
+
+/* x1 = x; Hi(x1) = (Hi(x1)&0x000fffff)|0x3ff00000 */
+ vmovups _dbOne(%rax), %zmm11
+ vmovdqu _INF(%rax), %ymm5
+ vpaddd _iIndexAdd(%rax), %ymm15, %ymm12
+ vpmovqd %zmm13, %ymm14
+ vpternlogq $248, _iMantissaMask(%rax), %zmm0, %zmm11
+ vpsrld $10, %ymm12, %ymm10
+ vpsrad $20, %ymm7, %ymm13
+
+/* Index for reciprocal table */
+ vpslld $3, %ymm10, %ymm8
+
+/* Index for log2 table */
+ vpslld $4, %ymm10, %ymm1
+ vpcmpgtd %ymm2, %ymm3, %ymm3
+ vpand %ymm4, %ymm14, %ymm2
+ vpaddd _i2p20_2p19(%rax), %ymm13, %ymm14
+ vpmovzxdq %ymm14, %zmm15
+ vpsllq $32, %zmm15, %zmm7
+ vpternlogq $168, _iffffffff00000000(%rax), %zmm15, %zmm7
+ vsubpd _db2p20_2p19(%rax), %zmm7, %zmm13
+ vpxord %zmm9, %zmm9, %zmm9
+ vgatherdpd 11712(%rax,%ymm8), %zmm9{%k1}
+
+/* T1 = ((double*)exp2_tbl)[ 2*j ] */
+ kxnorw %k1, %k1, %k1
+ vpxord %zmm12, %zmm12, %zmm12
+ vpxord %zmm8, %zmm8, %zmm8
+ vgatherdpd 19968(%rax,%ymm1), %zmm12{%k2}
+ vgatherdpd 19976(%rax,%ymm1), %zmm8{%k3}
+ vmovups _iHighMask(%rax), %zmm1
+
+/* x1Hi=x1; Lo(x1Hi)&=0xf8000000; x1Lo = x1-x1Hi */
+ vandpd %zmm1, %zmm11, %zmm10
+ vsubpd %zmm10, %zmm11, %zmm15
+
+/* r1 = x1*rcp1 */
+ vmulpd %zmm9, %zmm11, %zmm11
+
+/* E = -r1+__fence(x1Hi*rcp1) */
+ vfmsub213pd %zmm11, %zmm9, %zmm10
+
+/* cq = c+r1 */
+ vaddpd _LHN(%rax), %zmm11, %zmm14
+
+/* E=E+x1Lo*rcp1 */
+ vfmadd213pd %zmm10, %zmm9, %zmm15
+
+/* T = k + L1hi */
+ vaddpd %zmm12, %zmm13, %zmm9
+
+/* T_Rh = T + cq */
+ vaddpd %zmm14, %zmm9, %zmm11
+
+/* T_Rh_Eh = T_Rh + E */
+ vaddpd %zmm15, %zmm11, %zmm13
+
+/* Rl = T-T_Rh; -> -Rh */
+ vsubpd %zmm11, %zmm9, %zmm12
+
+/* HLL = T_Rh - T_Rh_Eh; -> -Eh */
+ vsubpd %zmm13, %zmm11, %zmm9
+
+/* Rl=Rl+cq */
+ vaddpd %zmm12, %zmm14, %zmm10
+
+/* HLL+=E; -> El */
+ vaddpd %zmm9, %zmm15, %zmm7
+
+/* HLL+=Rl */
+ vaddpd %zmm10, %zmm7, %zmm12
+
+/* 2^(y*(HH+HL+HLL)) starts here:
+ yH = y; Lo(yH)&=0xf8000000
+ */
+ vandpd %zmm1, %zmm6, %zmm7
+
+/* HLL+=L1lo */
+ vaddpd %zmm8, %zmm12, %zmm12
+
+/* cq = cq + E */
+ vaddpd %zmm15, %zmm14, %zmm8
+ vmovups _clv_2(%rax), %zmm14
+
+/* HLL = HLL + (((((((a7)*cq+a6)*cq+a5)*cq+a4)*cq+a3)*cq+a2)*cq+a1)*cq */
+ vfmadd213pd _clv_3(%rax), %zmm8, %zmm14
+ vfmadd213pd _clv_4(%rax), %zmm8, %zmm14
+ vfmadd213pd _clv_5(%rax), %zmm8, %zmm14
+ vfmadd213pd _clv_6(%rax), %zmm8, %zmm14
+ vfmadd213pd _clv_7(%rax), %zmm8, %zmm14
+ vfmadd213pd %zmm12, %zmm8, %zmm14
+
+/* yL = y-yH */
+ vsubpd %zmm7, %zmm6, %zmm8
+
+/* T_Rh_Eh_HLLhi = T_Rh_Eh + HLL */
+ vaddpd %zmm14, %zmm13, %zmm15
+
+/* HH = T_Rh_Eh_HLLhi; Lo(HH)&=0xf8000000 */
+ vandpd %zmm1, %zmm15, %zmm11
+
+/* HLLhi = T_Rh_Eh_HLLhi - T_Rh_Eh */
+ vsubpd %zmm13, %zmm15, %zmm13
+
+/* pH = yH*HH */
+ vmulpd %zmm11, %zmm7, %zmm9
+
+/* HLL = HLL - HLLhi */
+ vsubpd %zmm13, %zmm14, %zmm12
+
+/* HL = T_Rh_Eh_HLLhi-HH */
+ vsubpd %zmm11, %zmm15, %zmm10
+ vpsrlq $32, %zmm9, %zmm1
+ vmovdqu _DOMAINRANGE(%rax), %ymm13
+ vpmovqd %zmm1, %ymm1
+ vpand %ymm4, %ymm1, %ymm1
+ vpcmpgtd %ymm5, %ymm2, %ymm4
+ vpcmpeqd %ymm5, %ymm2, %ymm5
+ vpternlogd $254, %ymm5, %ymm4, %ymm3
+ vpcmpgtd %ymm13, %ymm1, %ymm2
+ vpcmpeqd %ymm13, %ymm1, %ymm4
+ vpternlogd $254, %ymm4, %ymm2, %ymm3
+
+/* pLL = y*HLL */
+ vmovups _db2p45_2p44(%rax), %zmm2
+
+/* pHH = pH + *(double*)&db2p45_2p44 */
+ vaddpd %zmm2, %zmm9, %zmm1
+ vpmovqd %zmm1, %ymm5
+
+/* j = Lo(pHH)&0x0000007f */
+ vpand _jIndexMask(%rax), %ymm5, %ymm14
+ vpslld $4, %ymm14, %ymm15
+ vmovmskps %ymm3, %ecx
+
+/* pL=yL*HL+yH*HL; pL+=yL*HH */
+ vmulpd %zmm10, %zmm8, %zmm3
+ vfmadd213pd %zmm3, %zmm10, %zmm7
+ vfmadd213pd %zmm7, %zmm11, %zmm8
+
+/* _n = Lo(pHH)
+ _n = _n & 0xffffff80
+ _n = _n >> 7
+ Hi(_2n) = (0x3ff+_n)<<20; Lo(_2n) = 0; -> 2^n
+ */
+ vpslld $13, %ymm5, %ymm7
+
+/* t=pL+pLL; t+=pHL */
+ vfmadd231pd %zmm6, %zmm12, %zmm8
+ vpaddd _iOne(%rax), %ymm7, %ymm10
+ vpmovzxdq %ymm10, %zmm11
+ vpsllq $32, %zmm11, %zmm3
+ vpternlogq $168, _ifff0000000000000(%rax), %zmm11, %zmm3
+
+/* pHH = pHH - *(double*)&db2p45_2p44 */
+ vsubpd %zmm2, %zmm1, %zmm11
+ vmovups _cev_1(%rax), %zmm2
+
+/* pHL = pH - pHH */
+ vsubpd %zmm11, %zmm9, %zmm9
+ vaddpd %zmm9, %zmm8, %zmm8
+ vfmadd213pd _cev_2(%rax), %zmm8, %zmm2
+ vfmadd213pd _cev_3(%rax), %zmm8, %zmm2
+ vfmadd213pd _cev_4(%rax), %zmm8, %zmm2
+ vfmadd213pd _cev_5(%rax), %zmm8, %zmm2
+ vpxord %zmm4, %zmm4, %zmm4
+ vgatherdpd 36416(%rax,%ymm15), %zmm4{%k1}
+ vmulpd %zmm4, %zmm3, %zmm1
+ vmulpd %zmm8, %zmm1, %zmm12
+ vfmadd213pd %zmm1, %zmm12, %zmm2
+ testl %ecx, %ecx
+ jne .LBL_2_3
+
+.LBL_2_2:
+ cfi_remember_state
+ vmovaps %zmm2, %zmm0
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+
+.LBL_2_3:
+ cfi_restore_state
+ vmovups %zmm0, 1152(%rsp)
+ vmovups %zmm6, 1216(%rsp)
+ vmovups %zmm2, 1280(%rsp)
+ je .LBL_2_2
+
+ xorb %dl, %dl
+ xorl %eax, %eax
+ kmovw %k4, 1048(%rsp)
+ kmovw %k5, 1040(%rsp)
+ kmovw %k6, 1032(%rsp)
+ kmovw %k7, 1024(%rsp)
+ vmovups %zmm16, 960(%rsp)
+ vmovups %zmm17, 896(%rsp)
+ vmovups %zmm18, 832(%rsp)
+ vmovups %zmm19, 768(%rsp)
+ vmovups %zmm20, 704(%rsp)
+ vmovups %zmm21, 640(%rsp)
+ vmovups %zmm22, 576(%rsp)
+ vmovups %zmm23, 512(%rsp)
+ vmovups %zmm24, 448(%rsp)
+ vmovups %zmm25, 384(%rsp)
+ vmovups %zmm26, 320(%rsp)
+ vmovups %zmm27, 256(%rsp)
+ vmovups %zmm28, 192(%rsp)
+ vmovups %zmm29, 128(%rsp)
+ vmovups %zmm30, 64(%rsp)
+ vmovups %zmm31, (%rsp)
+ movq %rsi, 1064(%rsp)
+ movq %rdi, 1056(%rsp)
+ movq %r12, 1096(%rsp)
+ cfi_offset_rel_rsp (12, 1096)
+ movb %dl, %r12b
+ movq %r13, 1088(%rsp)
+ cfi_offset_rel_rsp (13, 1088)
+ movl %ecx, %r13d
+ movq %r14, 1080(%rsp)
+ cfi_offset_rel_rsp (14, 1080)
+ movl %eax, %r14d
+ movq %r15, 1072(%rsp)
+ cfi_offset_rel_rsp (15, 1072)
+ cfi_remember_state
+
+.LBL_2_6:
+ btl %r14d, %r13d
+ jc .LBL_2_12
+
+.LBL_2_7:
+ lea 1(%r14), %esi
+ btl %esi, %r13d
+ jc .LBL_2_10
+
+.LBL_2_8:
+ incb %r12b
+ addl $2, %r14d
+ cmpb $16, %r12b
+ jb .LBL_2_6
+
+ kmovw 1048(%rsp), %k4
+ kmovw 1040(%rsp), %k5
+ kmovw 1032(%rsp), %k6
+ kmovw 1024(%rsp), %k7
+ vmovups 960(%rsp), %zmm16
+ vmovups 896(%rsp), %zmm17
+ vmovups 832(%rsp), %zmm18
+ vmovups 768(%rsp), %zmm19
+ vmovups 704(%rsp), %zmm20
+ vmovups 640(%rsp), %zmm21
+ vmovups 576(%rsp), %zmm22
+ vmovups 512(%rsp), %zmm23
+ vmovups 448(%rsp), %zmm24
+ vmovups 384(%rsp), %zmm25
+ vmovups 320(%rsp), %zmm26
+ vmovups 256(%rsp), %zmm27
+ vmovups 192(%rsp), %zmm28
+ vmovups 128(%rsp), %zmm29
+ vmovups 64(%rsp), %zmm30
+ vmovups (%rsp), %zmm31
+ vmovups 1280(%rsp), %zmm2
+ movq 1064(%rsp), %rsi
+ movq 1056(%rsp), %rdi
+ movq 1096(%rsp), %r12
+ cfi_restore (%r12)
+ movq 1088(%rsp), %r13
+ cfi_restore (%r13)
+ movq 1080(%rsp), %r14
+ cfi_restore (%r14)
+ movq 1072(%rsp), %r15
+ cfi_restore (%r15)
+ jmp .LBL_2_2
+
+.LBL_2_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ shlq $4, %r15
+ vmovsd 1224(%rsp,%r15), %xmm1
+ vzeroupper
+ vmovsd 1160(%rsp,%r15), %xmm0
+
+ call pow@PLT
+
+ vmovsd %xmm0, 1288(%rsp,%r15)
+ jmp .LBL_2_8
+
+.LBL_2_12:
+ movzbl %r12b, %r15d
+ shlq $4, %r15
+ vmovsd 1216(%rsp,%r15), %xmm1
+ vzeroupper
+ vmovsd 1152(%rsp,%r15), %xmm0
+
+ call pow@PLT
+
+ vmovsd %xmm0, 1280(%rsp,%r15)
+ jmp .LBL_2_7
+
+#endif
+END (_ZGVeN8vv_pow_skx)
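
The AVX2 and AVX-512 pow bodies repeatedly split a double into a "high" part whose low 27 bits are cleared (Lo(v) &= 0xf8000000, the _iHighMask pattern) and a "low" remainder, so that products of two high parts are exact and the rounding error can be tracked separately (x1Hi/x1Lo, yH/yL, HH/HL above). A scalar sketch of that split:

#include <stdint.h>
#include <string.h>

/* Keep sign, exponent and the top 25 explicit mantissa bits; clearing the
   low 27 bits leaves about 26 significant bits, so the product of two split
   values fits a double exactly.  */
static double
split_high (double v)
{
  uint64_t bits;
  memcpy (&bits, &v, sizeof bits);
  bits &= 0xfffffffff8000000ULL;
  memcpy (&v, &bits, sizeof v);
  return v;
}

/* Usage: vh = split_high (v); vl = v - vh;  then vh * wh is exact.  */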
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core.S
new file mode 100644
index 0000000000..29bd0a7b4d
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core.S
@@ -0,0 +1,38 @@
+/* Multiple versions of vectorized sin.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+ .text
+ENTRY (_ZGVbN2v_sin)
+ .type _ZGVbN2v_sin, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq _ZGVbN2v_sin_sse4(%rip), %rax
+ testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+ jz 2f
+ ret
+2: leaq _ZGVbN2v_sin_sse2(%rip), %rax
+ ret
+END (_ZGVbN2v_sin)
+libmvec_hidden_def (_ZGVbN2v_sin)
+
+#define _ZGVbN2v_sin _ZGVbN2v_sin_sse2
+#include "../svml_d_sin2_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core_sse4.S
new file mode 100644
index 0000000000..3a1ccbf139
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core_sse4.S
@@ -0,0 +1,229 @@
+/* Function sin vectorized with SSE4.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "svml_d_trig_data.h"
+
+ .text
+ENTRY (_ZGVbN2v_sin_sse4)
+/* ALGORITHM DESCRIPTION:
+
+ ( low accuracy ( < 4ulp ) or enhanced performance
+ ( half of correct mantissa ) implementation )
+
+ Argument representation:
+ arg = N*Pi + R
+
+ Result calculation:
+ sin(arg) = sin(N*Pi + R) = (-1)^N * sin(R)
+ sin(R) is approximated by corresponding polynomial
+ */
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $320, %rsp
+ movaps %xmm0, %xmm5
+ movq __svml_d_trig_data@GOTPCREL(%rip), %rax
+ movups __dAbsMask(%rax), %xmm3
+/*
+ ARGUMENT RANGE REDUCTION:
+ X' = |X|
+ */
+ movaps %xmm3, %xmm4
+
+/* SignX - sign bit of X */
+ andnps %xmm5, %xmm3
+ movups __dInvPI(%rax), %xmm2
+ andps %xmm5, %xmm4
+
+/* Y = X'*InvPi + RS : right shifter add */
+ mulpd %xmm4, %xmm2
+ movups __dRShifter(%rax), %xmm6
+
+/* R = X' - N*Pi1 */
+ movaps %xmm4, %xmm0
+ addpd %xmm6, %xmm2
+ cmpnlepd __dRangeVal(%rax), %xmm4
+
+/* N = Y - RS : right shifter sub */
+ movaps %xmm2, %xmm1
+
+/* SignRes = Y<<63 : shift LSB to MSB place for result sign */
+ psllq $63, %xmm2
+ subpd %xmm6, %xmm1
+ movmskpd %xmm4, %ecx
+ movups __dPI1(%rax), %xmm7
+ mulpd %xmm1, %xmm7
+ movups __dPI2(%rax), %xmm6
+
+/* R = R - N*Pi2 */
+ mulpd %xmm1, %xmm6
+ subpd %xmm7, %xmm0
+ movups __dPI3(%rax), %xmm7
+
+/* R = R - N*Pi3 */
+ mulpd %xmm1, %xmm7
+ subpd %xmm6, %xmm0
+ movups __dPI4(%rax), %xmm6
+
+/* R = R - N*Pi4 */
+ mulpd %xmm6, %xmm1
+ subpd %xmm7, %xmm0
+ subpd %xmm1, %xmm0
+
+/*
+ POLYNOMIAL APPROXIMATION:
+ R2 = R*R
+ */
+ movaps %xmm0, %xmm1
+ mulpd %xmm0, %xmm1
+
+/* R = R^SignRes : update sign of reduced argument */
+ xorps %xmm2, %xmm0
+ movups __dC7_sin(%rax), %xmm2
+ mulpd %xmm1, %xmm2
+ addpd __dC6_sin(%rax), %xmm2
+ mulpd %xmm1, %xmm2
+ addpd __dC5_sin(%rax), %xmm2
+ mulpd %xmm1, %xmm2
+ addpd __dC4_sin(%rax), %xmm2
+
+/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */
+ mulpd %xmm1, %xmm2
+ addpd __dC3_sin(%rax), %xmm2
+
+/* Poly = R2*(C1+R2*(C2+R2*Poly)) */
+ mulpd %xmm1, %xmm2
+ addpd __dC2_sin(%rax), %xmm2
+ mulpd %xmm1, %xmm2
+ addpd __dC1_sin(%rax), %xmm2
+ mulpd %xmm2, %xmm1
+
+/* Poly = Poly*R + R */
+ mulpd %xmm0, %xmm1
+ addpd %xmm1, %xmm0
+
+/*
+ RECONSTRUCTION:
+ Final sign setting: Res = Poly^SignX
+ */
+ xorps %xmm3, %xmm0
+ testl %ecx, %ecx
+ jne .LBL_1_3
+
+.LBL_1_2:
+ cfi_remember_state
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+
+.LBL_1_3:
+ cfi_restore_state
+ movups %xmm5, 192(%rsp)
+ movups %xmm0, 256(%rsp)
+ je .LBL_1_2
+
+ xorb %dl, %dl
+ xorl %eax, %eax
+ movups %xmm8, 112(%rsp)
+ movups %xmm9, 96(%rsp)
+ movups %xmm10, 80(%rsp)
+ movups %xmm11, 64(%rsp)
+ movups %xmm12, 48(%rsp)
+ movups %xmm13, 32(%rsp)
+ movups %xmm14, 16(%rsp)
+ movups %xmm15, (%rsp)
+ movq %rsi, 136(%rsp)
+ movq %rdi, 128(%rsp)
+ movq %r12, 168(%rsp)
+ cfi_offset_rel_rsp (12, 168)
+ movb %dl, %r12b
+ movq %r13, 160(%rsp)
+ cfi_offset_rel_rsp (13, 160)
+ movl %ecx, %r13d
+ movq %r14, 152(%rsp)
+ cfi_offset_rel_rsp (14, 152)
+ movl %eax, %r14d
+ movq %r15, 144(%rsp)
+ cfi_offset_rel_rsp (15, 144)
+ cfi_remember_state
+
+.LBL_1_6:
+ btl %r14d, %r13d
+ jc .LBL_1_12
+
+.LBL_1_7:
+ lea 1(%r14), %esi
+ btl %esi, %r13d
+ jc .LBL_1_10
+
+.LBL_1_8:
+ incb %r12b
+ addl $2, %r14d
+ cmpb $16, %r12b
+ jb .LBL_1_6
+
+ movups 112(%rsp), %xmm8
+ movups 96(%rsp), %xmm9
+ movups 80(%rsp), %xmm10
+ movups 64(%rsp), %xmm11
+ movups 48(%rsp), %xmm12
+ movups 32(%rsp), %xmm13
+ movups 16(%rsp), %xmm14
+ movups (%rsp), %xmm15
+ movq 136(%rsp), %rsi
+ movq 128(%rsp), %rdi
+ movq 168(%rsp), %r12
+ cfi_restore (%r12)
+ movq 160(%rsp), %r13
+ cfi_restore (%r13)
+ movq 152(%rsp), %r14
+ cfi_restore (%r14)
+ movq 144(%rsp), %r15
+ cfi_restore (%r15)
+ movups 256(%rsp), %xmm0
+ jmp .LBL_1_2
+
+.LBL_1_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ shlq $4, %r15
+ movsd 200(%rsp,%r15), %xmm0
+
+ call sin@PLT
+
+ movsd %xmm0, 264(%rsp,%r15)
+ jmp .LBL_1_8
+
+.LBL_1_12:
+ movzbl %r12b, %r15d
+ shlq $4, %r15
+ movsd 192(%rsp,%r15), %xmm0
+
+ call sin@PLT
+
+ movsd %xmm0, 256(%rsp,%r15)
+ jmp .LBL_1_7
+
+END (_ZGVbN2v_sin_sse4)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core.S
new file mode 100644
index 0000000000..c3a453a477
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core.S
@@ -0,0 +1,38 @@
+/* Multiple versions of vectorized sin, vector length is 4.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+ .text
+ENTRY (_ZGVdN4v_sin)
+ .type _ZGVdN4v_sin, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq _ZGVdN4v_sin_avx2(%rip), %rax
+ testl $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+ jz 2f
+ ret
+2: leaq _ZGVdN4v_sin_sse_wrapper(%rip), %rax
+ ret
+END (_ZGVdN4v_sin)
+libmvec_hidden_def (_ZGVdN4v_sin)
+
+#define _ZGVdN4v_sin _ZGVdN4v_sin_sse_wrapper
+#include "../svml_d_sin4_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core_avx2.S
new file mode 100644
index 0000000000..6bf8b32b4f
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core_avx2.S
@@ -0,0 +1,210 @@
+/* Function sin vectorized with AVX2.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "svml_d_trig_data.h"
+
+ .text
+ENTRY (_ZGVdN4v_sin_avx2)
+/* ALGORITHM DESCRIPTION:
+
+ ( low accuracy ( < 4ulp ) or enhanced performance
+ ( half of correct mantissa ) implementation )
+
+ Argument representation:
+ arg = N*Pi + R
+
+ Result calculation:
+ sin(arg) = sin(N*Pi + R) = (-1)^N * sin(R)
+ sin(R) is approximated by corresponding polynomial
+ */
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $448, %rsp
+ movq __svml_d_trig_data@GOTPCREL(%rip), %rax
+ vmovdqa %ymm0, %ymm4
+ vmovupd __dAbsMask(%rax), %ymm2
+ vmovupd __dInvPI(%rax), %ymm6
+ vmovupd __dRShifter(%rax), %ymm5
+ vmovupd __dPI1_FMA(%rax), %ymm7
+/*
+ ARGUMENT RANGE REDUCTION:
+ X' = |X|
+ */
+ vandpd %ymm2, %ymm4, %ymm3
+
+/* Y = X'*InvPi + RS : right shifter add */
+ vfmadd213pd %ymm5, %ymm3, %ymm6
+
+/* N = Y - RS : right shifter sub */
+ vsubpd %ymm5, %ymm6, %ymm1
+
+/* SignRes = Y<<63 : shift LSB to MSB place for result sign */
+ vpsllq $63, %ymm6, %ymm5
+
+/* R = X' - N*Pi1 */
+ vmovapd %ymm3, %ymm0
+ vfnmadd231pd %ymm1, %ymm7, %ymm0
+ vcmpnle_uqpd __dRangeVal(%rax), %ymm3, %ymm3
+
+/* R = R - N*Pi2 */
+ vfnmadd231pd __dPI2_FMA(%rax), %ymm1, %ymm0
+
+/* R = R - N*Pi3 */
+ vfnmadd132pd __dPI3_FMA(%rax), %ymm0, %ymm1
+
+/*
+ POLYNOMIAL APPROXIMATION:
+ R2 = R*R
+ */
+ vmulpd %ymm1, %ymm1, %ymm0
+
+/* R = R^SignRes : update sign of reduced argument */
+ vxorpd %ymm5, %ymm1, %ymm6
+ vmovupd __dC7_sin(%rax), %ymm1
+ vfmadd213pd __dC6_sin(%rax), %ymm0, %ymm1
+ vfmadd213pd __dC5_sin(%rax), %ymm0, %ymm1
+ vfmadd213pd __dC4_sin(%rax), %ymm0, %ymm1
+
+/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */
+ vfmadd213pd __dC3_sin(%rax), %ymm0, %ymm1
+
+/* Poly = R2*(C1+R2*(C2+R2*Poly)) */
+ vfmadd213pd __dC2_sin(%rax), %ymm0, %ymm1
+ vfmadd213pd __dC1_sin(%rax), %ymm0, %ymm1
+
+/* SignX - sign bit of X */
+ vandnpd %ymm4, %ymm2, %ymm7
+ vmulpd %ymm0, %ymm1, %ymm2
+
+/* Poly = Poly*R + R */
+ vfmadd213pd %ymm6, %ymm6, %ymm2
+ vmovmskpd %ymm3, %ecx
+
+/*
+ RECONSTRUCTION:
+ Final sign setting: Res = Poly^SignX
+ */
+ vxorpd %ymm7, %ymm2, %ymm0
+ testl %ecx, %ecx
+ jne .LBL_1_3
+
+.LBL_1_2:
+ cfi_remember_state
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+
+.LBL_1_3:
+ cfi_restore_state
+ vmovupd %ymm4, 320(%rsp)
+ vmovupd %ymm0, 384(%rsp)
+ je .LBL_1_2
+
+ xorb %dl, %dl
+ xorl %eax, %eax
+ vmovups %ymm8, 224(%rsp)
+ vmovups %ymm9, 192(%rsp)
+ vmovups %ymm10, 160(%rsp)
+ vmovups %ymm11, 128(%rsp)
+ vmovups %ymm12, 96(%rsp)
+ vmovups %ymm13, 64(%rsp)
+ vmovups %ymm14, 32(%rsp)
+ vmovups %ymm15, (%rsp)
+ movq %rsi, 264(%rsp)
+ movq %rdi, 256(%rsp)
+ movq %r12, 296(%rsp)
+ cfi_offset_rel_rsp (12, 296)
+ movb %dl, %r12b
+ movq %r13, 288(%rsp)
+ cfi_offset_rel_rsp (13, 288)
+ movl %ecx, %r13d
+ movq %r14, 280(%rsp)
+ cfi_offset_rel_rsp (14, 280)
+ movl %eax, %r14d
+ movq %r15, 272(%rsp)
+ cfi_offset_rel_rsp (15, 272)
+ cfi_remember_state
+
+.LBL_1_6:
+ btl %r14d, %r13d
+ jc .LBL_1_12
+
+.LBL_1_7:
+ lea 1(%r14), %esi
+ btl %esi, %r13d
+ jc .LBL_1_10
+
+.LBL_1_8:
+ incb %r12b
+ addl $2, %r14d
+ cmpb $16, %r12b
+ jb .LBL_1_6
+
+ vmovups 224(%rsp), %ymm8
+ vmovups 192(%rsp), %ymm9
+ vmovups 160(%rsp), %ymm10
+ vmovups 128(%rsp), %ymm11
+ vmovups 96(%rsp), %ymm12
+ vmovups 64(%rsp), %ymm13
+ vmovups 32(%rsp), %ymm14
+ vmovups (%rsp), %ymm15
+ vmovupd 384(%rsp), %ymm0
+ movq 264(%rsp), %rsi
+ movq 256(%rsp), %rdi
+ movq 296(%rsp), %r12
+ cfi_restore (%r12)
+ movq 288(%rsp), %r13
+ cfi_restore (%r13)
+ movq 280(%rsp), %r14
+ cfi_restore (%r14)
+ movq 272(%rsp), %r15
+ cfi_restore (%r15)
+ jmp .LBL_1_2
+
+.LBL_1_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ shlq $4, %r15
+ vmovsd 328(%rsp,%r15), %xmm0
+ vzeroupper
+
+ call sin@PLT
+
+ vmovsd %xmm0, 392(%rsp,%r15)
+ jmp .LBL_1_8
+
+.LBL_1_12:
+ movzbl %r12b, %r15d
+ shlq $4, %r15
+ vmovsd 320(%rsp,%r15), %xmm0
+ vzeroupper
+
+ call sin@PLT
+
+ vmovsd %xmm0, 384(%rsp,%r15)
+ jmp .LBL_1_7
+
+END (_ZGVdN4v_sin_avx2)
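
These sin kernels get both N and the sign factor (-1)^N from one add: Y = |x|*InvPi + RS rounds |x|/Pi to an integer held in the low mantissa bits, N = Y - RS recovers it as a double, and Y<<63 moves N's parity into the sign-bit position. A sketch of that trick, assuming __dRShifter has the usual right-shifter value 1.5*2^52 (the actual constant lives in svml_d_trig_data, not in this patch) and |x|/Pi well below 2^51:

#include <stdint.h>
#include <string.h>

static const double RS = 0x1.8p52;   /* assumed __dRShifter = 1.5 * 2^52 */

static void
right_shifter (double x_over_pi, double *n, uint64_t *sign_res)
{
  double y = x_over_pi + RS;         /* forces rounding to an integer */
  uint64_t bits;
  memcpy (&bits, &y, sizeof bits);
  *sign_res = bits << 63;            /* SignRes = Y<<63: parity of N */
  *n = y - RS;                       /* N = Y - RS, back as a double */
}

/* sign_res is then XORed into the reduced argument (R = R^SignRes).  */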
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S
new file mode 100644
index 0000000000..131f2f47c5
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S
@@ -0,0 +1,39 @@
+/* Multiple versions of vectorized sin.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+ .text
+ENTRY (_ZGVeN8v_sin)
+ .type _ZGVeN8v_sin, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq _ZGVeN8v_sin_skx(%rip), %rax
+ testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
+ jnz 2f
+ leaq _ZGVeN8v_sin_knl(%rip), %rax
+ testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
+ jnz 2f
+ leaq _ZGVeN8v_sin_avx2_wrapper(%rip), %rax
+2: ret
+END (_ZGVeN8v_sin)
+
+#define _ZGVeN8v_sin _ZGVeN8v_sin_avx2_wrapper
+#include "../svml_d_sin8_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
new file mode 100644
index 0000000000..422f6e8b0f
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
@@ -0,0 +1,465 @@
+/* Function sin vectorized with AVX-512, KNL and SKX versions.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "svml_d_trig_data.h"
+#include "svml_d_wrapper_impl.h"
+
+ .text
+ENTRY (_ZGVeN8v_sin_knl)
+#ifndef HAVE_AVX512_ASM_SUPPORT
+WRAPPER_IMPL_AVX512 _ZGVdN4v_sin
+#else
+/*
+ ALGORITHM DESCRIPTION:
+
+ ( low accuracy ( < 4ulp ) or enhanced performance
+ ( half of correct mantissa ) implementation )
+
+ Argument representation:
+ arg = N*Pi + R
+
+ Result calculation:
+ sin(arg) = sin(N*Pi + R) = (-1)^N * sin(R)
+ sin(R) is approximated by corresponding polynomial
+ */
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $1280, %rsp
+ movq __svml_d_trig_data@GOTPCREL(%rip), %rax
+ movq $-1, %rdx
+ vmovups __dAbsMask(%rax), %zmm6
+ vmovups __dInvPI(%rax), %zmm1
+
+/*
+ ARGUMENT RANGE REDUCTION:
+ X' = |X|
+ */
+ vpandq %zmm6, %zmm0, %zmm12
+ vmovups __dPI1_FMA(%rax), %zmm2
+ vmovups __dC7_sin(%rax), %zmm7
+
+/* SignX - sign bit of X */
+ vpandnq %zmm0, %zmm6, %zmm11
+
+/* R = X' - N*Pi1 */
+ vmovaps %zmm12, %zmm3
+
+/* Y = X'*InvPi + RS : right shifter add */
+ vfmadd213pd __dRShifter(%rax), %zmm12, %zmm1
+ vcmppd $22, __dRangeVal(%rax), %zmm12, %k1
+ vpbroadcastq %rdx, %zmm13{%k1}{z}
+
+/* N = Y - RS : right shifter sub */
+ vsubpd __dRShifter(%rax), %zmm1, %zmm4
+
+/* SignRes = Y<<63 : shift LSB to MSB place for result sign */
+ vpsllq $63, %zmm1, %zmm5
+ vptestmq %zmm13, %zmm13, %k0
+ vfnmadd231pd %zmm4, %zmm2, %zmm3
+ kmovw %k0, %ecx
+ movzbl %cl, %ecx
+
+/* R = R - N*Pi2 */
+ vfnmadd231pd __dPI2_FMA(%rax), %zmm4, %zmm3
+
+/* R = R - N*Pi3 */
+ vfnmadd132pd __dPI3_FMA(%rax), %zmm3, %zmm4
+
+/*
+ POLYNOMIAL APPROXIMATION:
+ R2 = R*R
+ */
+ vmulpd %zmm4, %zmm4, %zmm8
+
+/* R = R^SignRes : update sign of reduced argument */
+ vpxorq %zmm5, %zmm4, %zmm9
+ vfmadd213pd __dC6_sin(%rax), %zmm8, %zmm7
+ vfmadd213pd __dC5_sin(%rax), %zmm8, %zmm7
+ vfmadd213pd __dC4_sin(%rax), %zmm8, %zmm7
+
+/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */
+ vfmadd213pd __dC3_sin(%rax), %zmm8, %zmm7
+
+/* Poly = R2*(C1+R2*(C2+R2*Poly)) */
+ vfmadd213pd __dC2_sin(%rax), %zmm8, %zmm7
+ vfmadd213pd __dC1_sin(%rax), %zmm8, %zmm7
+ vmulpd %zmm8, %zmm7, %zmm10
+
+/* Poly = Poly*R + R */
+ vfmadd213pd %zmm9, %zmm9, %zmm10
+
+/*
+ RECONSTRUCTION:
+ Final sign setting: Res = Poly^SignX
+ */
+ vpxorq %zmm11, %zmm10, %zmm1
+ testl %ecx, %ecx
+ jne .LBL_1_3
+
+.LBL_1_2:
+ cfi_remember_state
+ vmovaps %zmm1, %zmm0
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+
+.LBL_1_3:
+ cfi_restore_state
+ vmovups %zmm0, 1152(%rsp)
+ vmovups %zmm1, 1216(%rsp)
+ je .LBL_1_2
+
+ xorb %dl, %dl
+ kmovw %k4, 1048(%rsp)
+ xorl %eax, %eax
+ kmovw %k5, 1040(%rsp)
+ kmovw %k6, 1032(%rsp)
+ kmovw %k7, 1024(%rsp)
+ vmovups %zmm16, 960(%rsp)
+ vmovups %zmm17, 896(%rsp)
+ vmovups %zmm18, 832(%rsp)
+ vmovups %zmm19, 768(%rsp)
+ vmovups %zmm20, 704(%rsp)
+ vmovups %zmm21, 640(%rsp)
+ vmovups %zmm22, 576(%rsp)
+ vmovups %zmm23, 512(%rsp)
+ vmovups %zmm24, 448(%rsp)
+ vmovups %zmm25, 384(%rsp)
+ vmovups %zmm26, 320(%rsp)
+ vmovups %zmm27, 256(%rsp)
+ vmovups %zmm28, 192(%rsp)
+ vmovups %zmm29, 128(%rsp)
+ vmovups %zmm30, 64(%rsp)
+ vmovups %zmm31, (%rsp)
+ movq %rsi, 1064(%rsp)
+ movq %rdi, 1056(%rsp)
+ movq %r12, 1096(%rsp)
+ cfi_offset_rel_rsp (12, 1096)
+ movb %dl, %r12b
+ movq %r13, 1088(%rsp)
+ cfi_offset_rel_rsp (13, 1088)
+ movl %ecx, %r13d
+ movq %r14, 1080(%rsp)
+ cfi_offset_rel_rsp (14, 1080)
+ movl %eax, %r14d
+ movq %r15, 1072(%rsp)
+ cfi_offset_rel_rsp (15, 1072)
+ cfi_remember_state
+
+.LBL_1_6:
+ btl %r14d, %r13d
+ jc .LBL_1_12
+
+.LBL_1_7:
+ lea 1(%r14), %esi
+ btl %esi, %r13d
+ jc .LBL_1_10
+
+.LBL_1_8:
+ addb $1, %r12b
+ addl $2, %r14d
+ cmpb $16, %r12b
+ jb .LBL_1_6
+
+ kmovw 1048(%rsp), %k4
+ movq 1064(%rsp), %rsi
+ kmovw 1040(%rsp), %k5
+ movq 1056(%rsp), %rdi
+ kmovw 1032(%rsp), %k6
+ movq 1096(%rsp), %r12
+ cfi_restore (%r12)
+ movq 1088(%rsp), %r13
+ cfi_restore (%r13)
+ kmovw 1024(%rsp), %k7
+ vmovups 960(%rsp), %zmm16
+ vmovups 896(%rsp), %zmm17
+ vmovups 832(%rsp), %zmm18
+ vmovups 768(%rsp), %zmm19
+ vmovups 704(%rsp), %zmm20
+ vmovups 640(%rsp), %zmm21
+ vmovups 576(%rsp), %zmm22
+ vmovups 512(%rsp), %zmm23
+ vmovups 448(%rsp), %zmm24
+ vmovups 384(%rsp), %zmm25
+ vmovups 320(%rsp), %zmm26
+ vmovups 256(%rsp), %zmm27
+ vmovups 192(%rsp), %zmm28
+ vmovups 128(%rsp), %zmm29
+ vmovups 64(%rsp), %zmm30
+ vmovups (%rsp), %zmm31
+ movq 1080(%rsp), %r14
+ cfi_restore (%r14)
+ movq 1072(%rsp), %r15
+ cfi_restore (%r15)
+ vmovups 1216(%rsp), %zmm1
+ jmp .LBL_1_2
+
+.LBL_1_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ shlq $4, %r15
+ vmovsd 1160(%rsp,%r15), %xmm0
+ call sin@PLT
+ vmovsd %xmm0, 1224(%rsp,%r15)
+ jmp .LBL_1_8
+
+.LBL_1_12:
+ movzbl %r12b, %r15d
+ shlq $4, %r15
+ vmovsd 1152(%rsp,%r15), %xmm0
+ call sin@PLT
+ vmovsd %xmm0, 1216(%rsp,%r15)
+ jmp .LBL_1_7
+#endif
+END (_ZGVeN8v_sin_knl)
+
+ENTRY (_ZGVeN8v_sin_skx)
+#ifndef HAVE_AVX512_ASM_SUPPORT
+WRAPPER_IMPL_AVX512 _ZGVdN4v_sin
+#else
+/*
+ ALGORITHM DESCRIPTION:
+
+ ( low accuracy ( < 4ulp ) or enhanced performance
+ ( half of correct mantissa ) implementation )
+
+ Argument representation:
+ arg = N*Pi + R
+
+ Result calculation:
+ sin(arg) = sin(N*Pi + R) = (-1)^N * sin(R)
+ sin(R) is approximated by corresponding polynomial
+ */
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $1280, %rsp
+ movq __svml_d_trig_data@GOTPCREL(%rip), %rax
+ vpbroadcastq .L_2il0floatpacket.14(%rip), %zmm14
+ vmovups __dAbsMask(%rax), %zmm7
+ vmovups __dInvPI(%rax), %zmm2
+ vmovups __dRShifter(%rax), %zmm1
+ vmovups __dPI1_FMA(%rax), %zmm3
+ vmovups __dC7_sin(%rax), %zmm8
+
+/*
+ ARGUMENT RANGE REDUCTION:
+ X' = |X|
+ */
+ vandpd %zmm7, %zmm0, %zmm13
+
+/* SignX - sign bit of X */
+ vandnpd %zmm0, %zmm7, %zmm12
+
+/* Y = X'*InvPi + RS : right shifter add */
+ vfmadd213pd %zmm1, %zmm13, %zmm2
+ vcmppd $18, __dRangeVal(%rax), %zmm13, %k1
+
+/* SignRes = Y<<63 : shift LSB to MSB place for result sign */
+ vpsllq $63, %zmm2, %zmm6
+
+/* N = Y - RS : right shifter sub */
+ vsubpd %zmm1, %zmm2, %zmm5
+
+/* R = X' - N*Pi1 */
+ vmovaps %zmm13, %zmm4
+ vfnmadd231pd %zmm5, %zmm3, %zmm4
+
+/* R = R - N*Pi2 */
+ vfnmadd231pd __dPI2_FMA(%rax), %zmm5, %zmm4
+
+/* R = R - N*Pi3 */
+ vfnmadd132pd __dPI3_FMA(%rax), %zmm4, %zmm5
+
+/*
+ POLYNOMIAL APPROXIMATION:
+ R2 = R*R
+ */
+ vmulpd %zmm5, %zmm5, %zmm9
+
+/* R = R^SignRes : update sign of reduced argument */
+ vxorpd %zmm6, %zmm5, %zmm10
+ vfmadd213pd __dC6_sin(%rax), %zmm9, %zmm8
+ vfmadd213pd __dC5_sin(%rax), %zmm9, %zmm8
+ vfmadd213pd __dC4_sin(%rax), %zmm9, %zmm8
+
+/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */
+ vfmadd213pd __dC3_sin(%rax), %zmm9, %zmm8
+
+/* Poly = R2*(C1+R2*(C2+R2*Poly)) */
+ vfmadd213pd __dC2_sin(%rax), %zmm9, %zmm8
+ vfmadd213pd __dC1_sin(%rax), %zmm9, %zmm8
+ vmulpd %zmm9, %zmm8, %zmm11
+
+/* Poly = Poly*R + R */
+ vfmadd213pd %zmm10, %zmm10, %zmm11
+
+/*
+ RECONSTRUCTION:
+ Final sign setting: Res = Poly^SignX
+ */
+ vxorpd %zmm12, %zmm11, %zmm1
+ vpandnq %zmm13, %zmm13, %zmm14{%k1}
+ vcmppd $3, %zmm14, %zmm14, %k0
+ kmovw %k0, %ecx
+ testl %ecx, %ecx
+ jne .LBL_2_3
+
+.LBL_2_2:
+ cfi_remember_state
+ vmovaps %zmm1, %zmm0
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+
+.LBL_2_3:
+ cfi_restore_state
+ vmovups %zmm0, 1152(%rsp)
+ vmovups %zmm1, 1216(%rsp)
+ je .LBL_2_2
+
+ xorb %dl, %dl
+ xorl %eax, %eax
+ kmovw %k4, 1048(%rsp)
+ kmovw %k5, 1040(%rsp)
+ kmovw %k6, 1032(%rsp)
+ kmovw %k7, 1024(%rsp)
+ vmovups %zmm16, 960(%rsp)
+ vmovups %zmm17, 896(%rsp)
+ vmovups %zmm18, 832(%rsp)
+ vmovups %zmm19, 768(%rsp)
+ vmovups %zmm20, 704(%rsp)
+ vmovups %zmm21, 640(%rsp)
+ vmovups %zmm22, 576(%rsp)
+ vmovups %zmm23, 512(%rsp)
+ vmovups %zmm24, 448(%rsp)
+ vmovups %zmm25, 384(%rsp)
+ vmovups %zmm26, 320(%rsp)
+ vmovups %zmm27, 256(%rsp)
+ vmovups %zmm28, 192(%rsp)
+ vmovups %zmm29, 128(%rsp)
+ vmovups %zmm30, 64(%rsp)
+ vmovups %zmm31, (%rsp)
+ movq %rsi, 1064(%rsp)
+ movq %rdi, 1056(%rsp)
+ movq %r12, 1096(%rsp)
+ cfi_offset_rel_rsp (12, 1096)
+ movb %dl, %r12b
+ movq %r13, 1088(%rsp)
+ cfi_offset_rel_rsp (13, 1088)
+ movl %ecx, %r13d
+ movq %r14, 1080(%rsp)
+ cfi_offset_rel_rsp (14, 1080)
+ movl %eax, %r14d
+ movq %r15, 1072(%rsp)
+ cfi_offset_rel_rsp (15, 1072)
+ cfi_remember_state
+
+.LBL_2_6:
+ btl %r14d, %r13d
+ jc .LBL_2_12
+
+.LBL_2_7:
+ lea 1(%r14), %esi
+ btl %esi, %r13d
+ jc .LBL_2_10
+
+.LBL_2_8:
+ incb %r12b
+ addl $2, %r14d
+ cmpb $16, %r12b
+ jb .LBL_2_6
+
+ kmovw 1048(%rsp), %k4
+ kmovw 1040(%rsp), %k5
+ kmovw 1032(%rsp), %k6
+ kmovw 1024(%rsp), %k7
+ vmovups 960(%rsp), %zmm16
+ vmovups 896(%rsp), %zmm17
+ vmovups 832(%rsp), %zmm18
+ vmovups 768(%rsp), %zmm19
+ vmovups 704(%rsp), %zmm20
+ vmovups 640(%rsp), %zmm21
+ vmovups 576(%rsp), %zmm22
+ vmovups 512(%rsp), %zmm23
+ vmovups 448(%rsp), %zmm24
+ vmovups 384(%rsp), %zmm25
+ vmovups 320(%rsp), %zmm26
+ vmovups 256(%rsp), %zmm27
+ vmovups 192(%rsp), %zmm28
+ vmovups 128(%rsp), %zmm29
+ vmovups 64(%rsp), %zmm30
+ vmovups (%rsp), %zmm31
+ vmovups 1216(%rsp), %zmm1
+ movq 1064(%rsp), %rsi
+ movq 1056(%rsp), %rdi
+ movq 1096(%rsp), %r12
+ cfi_restore (%r12)
+ movq 1088(%rsp), %r13
+ cfi_restore (%r13)
+ movq 1080(%rsp), %r14
+ cfi_restore (%r14)
+ movq 1072(%rsp), %r15
+ cfi_restore (%r15)
+ jmp .LBL_2_2
+
+.LBL_2_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ shlq $4, %r15
+ vmovsd 1160(%rsp,%r15), %xmm0
+ vzeroupper
+ vmovsd 1160(%rsp,%r15), %xmm0
+
+ call sin@PLT
+
+ vmovsd %xmm0, 1224(%rsp,%r15)
+ jmp .LBL_2_8
+
+.LBL_2_12:
+ movzbl %r12b, %r15d
+ shlq $4, %r15
+ vmovsd 1152(%rsp,%r15), %xmm0
+ vzeroupper
+ vmovsd 1152(%rsp,%r15), %xmm0
+
+ call sin@PLT
+
+ vmovsd %xmm0, 1216(%rsp,%r15)
+ jmp .LBL_2_7
+#endif
+END (_ZGVeN8v_sin_skx)
+
+ .section .rodata, "a"
+.L_2il0floatpacket.14:
+ .long 0xffffffff,0xffffffff
+ .type .L_2il0floatpacket.14,@object
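
The reduction performed by the sin kernels above can be read as the following scalar C sketch. It is only a model of the scheme in the ALGORITHM DESCRIPTION comment: the split of Pi and the polynomial coefficients are illustrative stand-ins for the __svml_d_trig_data tables, and the real kernels process 8 doubles per call.

#include <math.h>

/* Rough scalar model of the vector kernel: arg = N*Pi + R,
   sin(arg) = (-1)^N * sin(R), sin(R) by a short odd polynomial.  */
static double
sin_model (double x)
{
  double ax = fabs (x);                        /* X' = |X| */
  double n  = nearbyint (ax / M_PI);           /* N = round(X'/Pi) */

  /* R = X' - N*Pi, with Pi split into a high and a low part so the
     subtraction stays accurate (the kernel uses a 3-4 part split).  */
  double r = ax - n * 0x1.921fb54442d18p+1;    /* high part of Pi */
  r -= n * 0x1.1a62633145c07p-53;              /* low part of Pi (illustrative) */

  /* Odd polynomial R + R^3*(c3 + R2*(c5 + ...)); Taylor coefficients
     here, the tables hold minimax coefficients C1..C7.  */
  double r2   = r * r;
  double poly = r + r * r2 * (-1.0 / 6.0 + r2 * (1.0 / 120.0));

  /* (-1)^N from the parity of N, then restore the sign of x.  */
  double res = ((long long) n & 1) ? -poly : poly;
  return copysign (1.0, x) * res;
}
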
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core.S
new file mode 100644
index 0000000000..e8e5771808
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core.S
@@ -0,0 +1,38 @@
+/* Multiple versions of vectorized sincos.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+ .text
+ENTRY (_ZGVbN2vvv_sincos)
+ .type _ZGVbN2vvv_sincos, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq _ZGVbN2vvv_sincos_sse4(%rip), %rax
+ testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+ jz 2f
+ ret
+2: leaq _ZGVbN2vvv_sincos_sse2(%rip), %rax
+ ret
+END (_ZGVbN2vvv_sincos)
+libmvec_hidden_def (_ZGVbN2vvv_sincos)
+
+#define _ZGVbN2vvv_sincos _ZGVbN2vvv_sincos_sse2
+#include "../svml_d_sincos2_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core_sse4.S
new file mode 100644
index 0000000000..b504d1d732
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core_sse4.S
@@ -0,0 +1,314 @@
+/* Function sincos vectorized with SSE4.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "svml_d_trig_data.h"
+
+ .text
+ENTRY (_ZGVbN2vvv_sincos_sse4)
+/*
+ ALGORITHM DESCRIPTION:
+
+ ( low accuracy ( < 4ulp ) or enhanced performance
+ ( half of correct mantissa ) implementation )
+
+ Argument representation:
+ arg = N*Pi + R
+
+ Result calculation:
+ sin(arg) = sin(N*Pi + R) = (-1)^N * sin(R)
+ arg + Pi/2 = (N'*Pi + R')
+ cos(arg) = sin(arg+Pi/2) = sin(N'*Pi + R') = (-1)^N' * sin(R')
+ sin(R), sin(R') are approximated by corresponding polynomial. */
+
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $320, %rsp
+ movq __svml_d_trig_data@GOTPCREL(%rip), %rax
+ movups %xmm11, 160(%rsp)
+ movups %xmm12, 144(%rsp)
+ movups __dSignMask(%rax), %xmm11
+
+/* ARGUMENT RANGE REDUCTION:
+ Absolute argument: X' = |X| */
+ movaps %xmm11, %xmm4
+
+/* Grab sign bit from argument */
+ movaps %xmm11, %xmm7
+ movups __dInvPI(%rax), %xmm5
+ andnps %xmm0, %xmm4
+
+/* SinY = X'*InvPi + RS : right shifter add */
+ mulpd %xmm4, %xmm5
+ addpd __dRShifter(%rax), %xmm5
+
+/* SinSignRes = Y<<63 : shift LSB to MSB place for result sign */
+ movaps %xmm5, %xmm12
+ andps %xmm0, %xmm7
+
+/* SinN = Y - RS : right shifter sub */
+ subpd __dRShifter(%rax), %xmm5
+ movups %xmm10, 176(%rsp)
+ psllq $63, %xmm12
+ movups __dPI1(%rax), %xmm10
+
+/* SinR = X' - SinN*Pi1 */
+ movaps %xmm10, %xmm1
+ mulpd %xmm5, %xmm1
+ movups __dPI2(%rax), %xmm6
+
+/* SinR = SinR - SinN*Pi2 */
+ movaps %xmm6, %xmm2
+ mulpd %xmm5, %xmm2
+ movups %xmm13, 112(%rsp)
+ movaps %xmm4, %xmm13
+ subpd %xmm1, %xmm13
+ subpd %xmm2, %xmm13
+
+/* Sine result sign: SinRSign = SignMask & SinR */
+ movaps %xmm11, %xmm2
+
+/* CosR = SinX - CosN*Pi1 */
+ movaps %xmm4, %xmm1
+ movups __dOneHalf(%rax), %xmm3
+ andps %xmm13, %xmm2
+
+/* Set SinRSign to 0.5 */
+ orps %xmm2, %xmm3
+
+/* Update CosRSign and CosSignRes signs */
+ xorps %xmm11, %xmm2
+
+/* CosN = SinN +(-)0.5 */
+ addpd %xmm5, %xmm3
+ cmpnlepd __dRangeVal(%rax), %xmm4
+ mulpd %xmm3, %xmm10
+
+/* CosR = CosR - CosN*Pi2 */
+ mulpd %xmm3, %xmm6
+ subpd %xmm10, %xmm1
+ movmskpd %xmm4, %ecx
+ movups __dPI3(%rax), %xmm10
+ xorps %xmm12, %xmm2
+ subpd %xmm6, %xmm1
+
+/* SinR = SinR - SinN*Pi3 */
+ movaps %xmm10, %xmm6
+
+/* Final reconstruction.
+ Combine Sin result's sign */
+ xorps %xmm7, %xmm12
+ mulpd %xmm5, %xmm6
+
+/* CosR = CosR - CosN*Pi3 */
+ mulpd %xmm3, %xmm10
+ subpd %xmm6, %xmm13
+ subpd %xmm10, %xmm1
+ movups __dPI4(%rax), %xmm6
+
+/* SinR = SinR - SinN*Pi4 */
+ mulpd %xmm6, %xmm5
+
+/* CosR = CosR - CosN*Pi4 */
+ mulpd %xmm6, %xmm3
+ subpd %xmm5, %xmm13
+ subpd %xmm3, %xmm1
+
+/* SinR2 = SinR^2 */
+ movaps %xmm13, %xmm6
+
+/* CosR2 = CosR^2 */
+ movaps %xmm1, %xmm10
+ mulpd %xmm13, %xmm6
+ mulpd %xmm1, %xmm10
+
+/* Polynomial approximation */
+ movups __dC7(%rax), %xmm5
+ movaps %xmm5, %xmm3
+ mulpd %xmm6, %xmm3
+ mulpd %xmm10, %xmm5
+ addpd __dC6(%rax), %xmm3
+ addpd __dC6(%rax), %xmm5
+ mulpd %xmm6, %xmm3
+ mulpd %xmm10, %xmm5
+ addpd __dC5(%rax), %xmm3
+ addpd __dC5(%rax), %xmm5
+ mulpd %xmm6, %xmm3
+ mulpd %xmm10, %xmm5
+ addpd __dC4(%rax), %xmm3
+ addpd __dC4(%rax), %xmm5
+
+/* SinPoly = C3 + SinR2*(C4 + SinR2*(C5 + SinR2*(C6 + SinR2*C7))) */
+ mulpd %xmm6, %xmm3
+
+/* CosPoly = C3 + CosR2*(C4 + CosR2*(C5 + CosR2*(C6 + CosR2*C7))) */
+ mulpd %xmm10, %xmm5
+ addpd __dC3(%rax), %xmm3
+ addpd __dC3(%rax), %xmm5
+
+/* SinPoly = C2 + SinR2*SinPoly */
+ mulpd %xmm6, %xmm3
+
+/* CosPoly = C2 + CosR2*CosPoly */
+ mulpd %xmm10, %xmm5
+ addpd __dC2(%rax), %xmm3
+ addpd __dC2(%rax), %xmm5
+
+/* SinPoly = C1 + SinR2*SinPoly */
+ mulpd %xmm6, %xmm3
+
+/* CosPoly = C1 + CosR2*CosPoly */
+ mulpd %xmm10, %xmm5
+ addpd __dC1(%rax), %xmm3
+ addpd __dC1(%rax), %xmm5
+
+/* SinPoly = SinR2*SinPoly */
+ mulpd %xmm3, %xmm6
+
+/* CosPoly = CosR2*CosPoly */
+ mulpd %xmm5, %xmm10
+
+/* SinPoly = SinR*SinPoly */
+ mulpd %xmm13, %xmm6
+
+/* CosPoly = CosR*CosPoly */
+ mulpd %xmm1, %xmm10
+ addpd %xmm6, %xmm13
+ addpd %xmm10, %xmm1
+
+/* Update Sin result's sign */
+ xorps %xmm12, %xmm13
+
+/* Update Cos result's sign */
+ xorps %xmm2, %xmm1
+ testl %ecx, %ecx
+ jne .LBL_1_3
+
+.LBL_1_2:
+ cfi_remember_state
+ movups 176(%rsp), %xmm10
+ movaps %xmm13, (%rdi)
+ movups 160(%rsp), %xmm11
+ movups 144(%rsp), %xmm12
+ movups 112(%rsp), %xmm13
+ movups %xmm1, (%rsi)
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+
+.LBL_1_3:
+ cfi_restore_state
+ movups %xmm0, 128(%rsp)
+ movups %xmm13, 192(%rsp)
+ movups %xmm1, 256(%rsp)
+ je .LBL_1_2
+
+ xorb %dl, %dl
+ xorl %eax, %eax
+ movups %xmm8, 48(%rsp)
+ movups %xmm9, 32(%rsp)
+ movups %xmm14, 16(%rsp)
+ movups %xmm15, (%rsp)
+ movq %rsi, 64(%rsp)
+ movq %r12, 104(%rsp)
+ cfi_offset_rel_rsp (12, 104)
+ movb %dl, %r12b
+ movq %r13, 96(%rsp)
+ cfi_offset_rel_rsp (13, 96)
+ movl %eax, %r13d
+ movq %r14, 88(%rsp)
+ cfi_offset_rel_rsp (14, 88)
+ movl %ecx, %r14d
+ movq %r15, 80(%rsp)
+ cfi_offset_rel_rsp (15, 80)
+ movq %rbx, 72(%rsp)
+ movq %rdi, %rbx
+ cfi_remember_state
+
+.LBL_1_6:
+ btl %r13d, %r14d
+ jc .LBL_1_13
+
+.LBL_1_7:
+ lea 1(%r13), %esi
+ btl %esi, %r14d
+ jc .LBL_1_10
+
+.LBL_1_8:
+ incb %r12b
+ addl $2, %r13d
+ cmpb $16, %r12b
+ jb .LBL_1_6
+
+ movups 48(%rsp), %xmm8
+ movq %rbx, %rdi
+ movups 32(%rsp), %xmm9
+ movups 16(%rsp), %xmm14
+ movups (%rsp), %xmm15
+ movq 64(%rsp), %rsi
+ movq 104(%rsp), %r12
+ cfi_restore (%r12)
+ movq 96(%rsp), %r13
+ cfi_restore (%r13)
+ movq 88(%rsp), %r14
+ cfi_restore (%r14)
+ movq 80(%rsp), %r15
+ cfi_restore (%r15)
+ movq 72(%rsp), %rbx
+ movups 192(%rsp), %xmm13
+ movups 256(%rsp), %xmm1
+ jmp .LBL_1_2
+
+.LBL_1_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ shlq $4, %r15
+ movsd 136(%rsp,%r15), %xmm0
+
+ call sin@PLT
+
+ movsd %xmm0, 200(%rsp,%r15)
+ movsd 136(%rsp,%r15), %xmm0
+
+ call cos@PLT
+
+ movsd %xmm0, 264(%rsp,%r15)
+ jmp .LBL_1_8
+
+.LBL_1_13:
+ movzbl %r12b, %r15d
+ shlq $4, %r15
+ movsd 128(%rsp,%r15), %xmm0
+
+ call sin@PLT
+
+ movsd %xmm0, 192(%rsp,%r15)
+ movsd 128(%rsp,%r15), %xmm0
+
+ call cos@PLT
+
+ movsd %xmm0, 256(%rsp,%r15)
+ jmp .LBL_1_7
+END (_ZGVbN2vvv_sincos_sse4)
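
A scalar sketch of the shared sin/cos scheme used by the kernel above: the cosine result is obtained as sin(arg + Pi/2) with its own N'/R' reduction, as the ALGORITHM DESCRIPTION comment states. Constants and coefficients are illustrative, not the table values.

#include <math.h>

/* Truncated odd polynomial; Taylor terms stand in for the minimax
   coefficients C1..C7 used by the real kernel.  */
static double
poly_sin (double r)
{
  double r2 = r * r;
  return r + r * r2 * (-1.0 / 6.0 + r2 * (1.0 / 120.0 + r2 * (-1.0 / 5040.0)));
}

/* sin(x) = (-1)^N  * sin(R)   with x        = N*Pi  + R
   cos(x) = (-1)^N' * sin(R')  with x + Pi/2 = N'*Pi + R'  */
static void
sincos_model (double x, double *s, double *c)
{
  double n  = nearbyint (x / M_PI);
  double r  = x - n * M_PI;             /* the kernel splits Pi in parts */
  double rs = poly_sin (r);
  *s = ((long long) n & 1) ? -rs : rs;

  double xc = x + M_PI_2;               /* Cos to Sin transformation */
  double nc = nearbyint (xc / M_PI);
  double rc = poly_sin (xc - nc * M_PI);
  *c = ((long long) nc & 1) ? -rc : rc;
}
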
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core.S
new file mode 100644
index 0000000000..64744ffa62
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core.S
@@ -0,0 +1,38 @@
+/* Multiple versions of vectorized sincos.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+ .text
+ENTRY (_ZGVdN4vvv_sincos)
+ .type _ZGVdN4vvv_sincos, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq _ZGVdN4vvv_sincos_avx2(%rip), %rax
+ testl $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+ jz 2f
+ ret
+2: leaq _ZGVdN4vvv_sincos_sse_wrapper(%rip), %rax
+ ret
+END (_ZGVdN4vvv_sincos)
+libmvec_hidden_def (_ZGVdN4vvv_sincos)
+
+#define _ZGVdN4vvv_sincos _ZGVdN4vvv_sincos_sse_wrapper
+#include "../svml_d_sincos4_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core_avx2.S
new file mode 100644
index 0000000000..dca5604111
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core_avx2.S
@@ -0,0 +1,277 @@
+/* Function sincos vectorized with AVX2.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "svml_d_trig_data.h"
+
+ .text
+ENTRY (_ZGVdN4vvv_sincos_avx2)
+/*
+ ALGORITHM DESCRIPTION:
+
+ ( low accuracy ( < 4ulp ) or enhanced performance
+ ( half of correct mantissa ) implementation )
+
+ Argument representation:
+ arg = N*Pi + R
+
+ Result calculation:
+ sin(arg) = sin(N*Pi + R) = (-1)^N * sin(R)
+ arg + Pi/2 = (N'*Pi + R')
+ cos(arg) = sin(arg+Pi/2) = sin(N'*Pi + R') = (-1)^N' * sin(R')
+ sin(R), sin(R') are approximated by corresponding polynomial. */
+
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $448, %rsp
+ movq __svml_d_trig_data@GOTPCREL(%rip), %rax
+ vmovups %ymm14, 288(%rsp)
+ vmovups %ymm8, 352(%rsp)
+ vmovupd __dSignMask(%rax), %ymm6
+ vmovupd __dInvPI(%rax), %ymm2
+ vmovupd __dPI1_FMA(%rax), %ymm5
+ vmovups %ymm9, 224(%rsp)
+
+/* ARGUMENT RANGE REDUCTION:
+ Absolute argument: X' = |X| */
+ vandnpd %ymm0, %ymm6, %ymm1
+
+/* SinY = X'*InvPi + RS : right shifter add */
+ vfmadd213pd __dRShifter(%rax), %ymm1, %ymm2
+
+/* SinSignRes = Y<<63 : shift LSB to MSB place for result sign */
+ vpsllq $63, %ymm2, %ymm4
+
+/* SinN = Y - RS : right shifter sub */
+ vsubpd __dRShifter(%rax), %ymm2, %ymm2
+
+/* SinR = X' - SinN*Pi1 */
+ vmovdqa %ymm1, %ymm14
+ vfnmadd231pd %ymm2, %ymm5, %ymm14
+
+/* SinR = SinR - SinN*Pi2 */
+ vfnmadd231pd __dPI2_FMA(%rax), %ymm2, %ymm14
+
+/* Sine result sign: SinRSign = SignMask & SinR */
+ vandpd %ymm14, %ymm6, %ymm7
+
+/* Set SinRSign to 0.5 */
+ vorpd __dOneHalf(%rax), %ymm7, %ymm3
+
+/* CosN = SinN +(-)0.5 */
+ vaddpd %ymm3, %ymm2, %ymm3
+
+/* CosR = SinX - CosN*Pi1 */
+ vmovdqa %ymm1, %ymm8
+ vfnmadd231pd %ymm3, %ymm5, %ymm8
+ vmovupd __dPI3_FMA(%rax), %ymm5
+ vcmpnle_uqpd __dRangeVal(%rax), %ymm1, %ymm1
+
+/* CosR = CosR - CosN*Pi2 */
+ vfnmadd231pd __dPI2_FMA(%rax), %ymm3, %ymm8
+
+/* SinR = SinR - SinN*Pi3 */
+ vfnmadd213pd %ymm14, %ymm5, %ymm2
+
+/* CosR = CosR - CosN*Pi3 */
+ vfnmadd213pd %ymm8, %ymm5, %ymm3
+ vmovupd __dC6(%rax), %ymm8
+
+/* SinR2 = SinR^2 */
+ vmulpd %ymm2, %ymm2, %ymm14
+
+/* CosR2 = CosR^2 */
+ vmulpd %ymm3, %ymm3, %ymm5
+
+/* Grab SignX */
+ vandpd %ymm0, %ymm6, %ymm9
+
+/* Update CosRSign and CosSignRes signs */
+ vxorpd %ymm6, %ymm7, %ymm6
+ vxorpd %ymm6, %ymm4, %ymm7
+
+/* Update sign SinSignRes */
+ vxorpd %ymm9, %ymm4, %ymm6
+
+/* Polynomial approximation */
+ vmovupd __dC7(%rax), %ymm4
+ vmovdqa %ymm8, %ymm9
+ vfmadd231pd __dC7(%rax), %ymm14, %ymm9
+ vfmadd213pd %ymm8, %ymm5, %ymm4
+ vfmadd213pd __dC5(%rax), %ymm14, %ymm9
+ vfmadd213pd __dC5(%rax), %ymm5, %ymm4
+ vfmadd213pd __dC4(%rax), %ymm14, %ymm9
+ vfmadd213pd __dC4(%rax), %ymm5, %ymm4
+
+/* SinPoly = C3 + SinR2*(C4 + SinR2*(C5 + SinR2*(C6 + SinR2*C7))) */
+ vfmadd213pd __dC3(%rax), %ymm14, %ymm9
+
+/* CosPoly = C3 + CosR2*(C4 + CosR2*(C5 + CosR2*(C6 + CosR2*C7))) */
+ vfmadd213pd __dC3(%rax), %ymm5, %ymm4
+
+/* SinPoly = C2 + SinR2*SinPoly */
+ vfmadd213pd __dC2(%rax), %ymm14, %ymm9
+
+/* CosPoly = C2 + CosR2*CosPoly */
+ vfmadd213pd __dC2(%rax), %ymm5, %ymm4
+
+/* SinPoly = C1 + SinR2*SinPoly */
+ vfmadd213pd __dC1(%rax), %ymm14, %ymm9
+
+/* CosPoly = C1 + CosR2*CosPoly */
+ vfmadd213pd __dC1(%rax), %ymm5, %ymm4
+
+/* SinPoly = SinR2*SinPoly */
+ vmulpd %ymm14, %ymm9, %ymm8
+
+/* CosPoly = CosR2*CosPoly */
+ vmulpd %ymm5, %ymm4, %ymm4
+
+/* SinPoly = SinR*SinPoly */
+ vfmadd213pd %ymm2, %ymm2, %ymm8
+
+/* CosPoly = CosR*CosPoly */
+ vfmadd213pd %ymm3, %ymm3, %ymm4
+ vmovmskpd %ymm1, %ecx
+
+/* Final reconstruction
+ Update Sin result's sign */
+ vxorpd %ymm6, %ymm8, %ymm3
+
+/* Update Cos result's sign */
+ vxorpd %ymm7, %ymm4, %ymm2
+ testl %ecx, %ecx
+ jne .LBL_1_3
+
+.LBL_1_2:
+ cfi_remember_state
+ vmovups 352(%rsp), %ymm8
+ vmovups 224(%rsp), %ymm9
+ vmovups 288(%rsp), %ymm14
+ vmovupd %ymm2, (%rsi)
+ vmovdqa %ymm3, (%rdi)
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+
+.LBL_1_3:
+ cfi_restore_state
+ vmovupd %ymm0, 256(%rsp)
+ vmovupd %ymm3, 320(%rsp)
+ vmovupd %ymm2, 384(%rsp)
+ je .LBL_1_2
+
+ xorb %dl, %dl
+ xorl %eax, %eax
+ vmovups %ymm10, 128(%rsp)
+ vmovups %ymm11, 96(%rsp)
+ vmovups %ymm12, 64(%rsp)
+ vmovups %ymm13, 32(%rsp)
+ vmovups %ymm15, (%rsp)
+ movq %rsi, 160(%rsp)
+ movq %r12, 200(%rsp)
+ cfi_offset_rel_rsp (12, 200)
+ movb %dl, %r12b
+ movq %r13, 192(%rsp)
+ cfi_offset_rel_rsp (13, 192)
+ movl %eax, %r13d
+ movq %r14, 184(%rsp)
+ cfi_offset_rel_rsp (14, 184)
+ movl %ecx, %r14d
+ movq %r15, 176(%rsp)
+ cfi_offset_rel_rsp (15, 176)
+ movq %rbx, 168(%rsp)
+ movq %rdi, %rbx
+ cfi_remember_state
+
+.LBL_1_6:
+ btl %r13d, %r14d
+ jc .LBL_1_13
+
+.LBL_1_7:
+ lea 1(%r13), %esi
+ btl %esi, %r14d
+ jc .LBL_1_10
+
+.LBL_1_8:
+ incb %r12b
+ addl $2, %r13d
+ cmpb $16, %r12b
+ jb .LBL_1_6
+
+ vmovups 128(%rsp), %ymm10
+ movq %rbx, %rdi
+ vmovups 96(%rsp), %ymm11
+ vmovups 64(%rsp), %ymm12
+ vmovups 32(%rsp), %ymm13
+ vmovups (%rsp), %ymm15
+ vmovupd 320(%rsp), %ymm3
+ vmovupd 384(%rsp), %ymm2
+ movq 160(%rsp), %rsi
+ movq 200(%rsp), %r12
+ cfi_restore (%r12)
+ movq 192(%rsp), %r13
+ cfi_restore (%r13)
+ movq 184(%rsp), %r14
+ cfi_restore (%r14)
+ movq 176(%rsp), %r15
+ cfi_restore (%r15)
+ movq 168(%rsp), %rbx
+ jmp .LBL_1_2
+
+.LBL_1_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ shlq $4, %r15
+ vmovsd 264(%rsp,%r15), %xmm0
+ vzeroupper
+
+ call sin@PLT
+
+ vmovsd %xmm0, 328(%rsp,%r15)
+ vmovsd 264(%rsp,%r15), %xmm0
+
+ call cos@PLT
+
+ vmovsd %xmm0, 392(%rsp,%r15)
+ jmp .LBL_1_8
+
+.LBL_1_13:
+ movzbl %r12b, %r15d
+ shlq $4, %r15
+ vmovsd 256(%rsp,%r15), %xmm0
+ vzeroupper
+
+ call sin@PLT
+
+ vmovsd %xmm0, 320(%rsp,%r15)
+ vmovsd 256(%rsp,%r15), %xmm0
+
+ call cos@PLT
+
+ vmovsd %xmm0, 384(%rsp,%r15)
+ jmp .LBL_1_7
+
+END (_ZGVdN4vvv_sincos_avx2)
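
The "right shifter" add/sub and the Y<<63 step that appear in every kernel above can be demonstrated with scalar doubles. The sketch below assumes round-to-nearest and a shifter value of 1.5*2^52, which is what __dRShifter conventionally holds; treat the exact constant as an assumption.

#include <stdint.h>
#include <string.h>

/* y is X'*InvPi.  Adding RS = 1.5*2^52 pushes the fraction bits out, so
   the low mantissa bits of the sum hold round(y); shifting the raw bits
   left by 63 moves the parity of N into the sign position (SinSignRes),
   and subtracting RS back recovers N as a double (SinN).  */
static double
right_shifter (double y, uint64_t *signres)
{
  const double rs = 0x1.8p52;          /* assumed __dRShifter value */
  double t = y + rs;                   /* "right shifter add" */

  uint64_t bits;
  memcpy (&bits, &t, sizeof bits);
  *signres = bits << 63;               /* Y<<63: (-1)^N as a sign bit */

  return t - rs;                       /* "right shifter sub": N = round(y) */
}
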
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S
new file mode 100644
index 0000000000..e33109099e
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S
@@ -0,0 +1,39 @@
+/* Multiple versions of vectorized sincos.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+ .text
+ENTRY (_ZGVeN8vvv_sincos)
+ .type _ZGVeN8vvv_sincos, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq _ZGVeN8vvv_sincos_skx(%rip), %rax
+ testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
+ jnz 2f
+ leaq _ZGVeN8vvv_sincos_knl(%rip), %rax
+ testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
+ jnz 2f
+ leaq _ZGVeN8vvv_sincos_avx2_wrapper(%rip), %rax
+2: ret
+END (_ZGVeN8vvv_sincos)
+
+#define _ZGVeN8vvv_sincos _ZGVeN8vvv_sincos_avx2_wrapper
+#include "../svml_d_sincos8_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
new file mode 100644
index 0000000000..e8388325f7
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
@@ -0,0 +1,593 @@
+/* Function sincos vectorized with AVX-512. KNL and SKX versions.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "svml_d_trig_data.h"
+#include "svml_d_wrapper_impl.h"
+
+/*
+ ALGORITHM DESCRIPTION:
+
+ ( low accuracy ( < 4ulp ) or enhanced performance
+ ( half of correct mantissa ) implementation )
+
+ Argument representation:
+ arg = N*Pi + R
+
+ Result calculation:
+ sin(arg) = sin(N*Pi + R) = (-1)^N * sin(R)
+ arg + Pi/2 = (N'*Pi + R')
+ cos(arg) = sin(arg+Pi/2) = sin(N'*Pi + R') = (-1)^N' * sin(R')
+ sin(R), sin(R') are approximated by corresponding polynomial. */
+
+ .text
+ENTRY (_ZGVeN8vvv_sincos_knl)
+#ifndef HAVE_AVX512_ASM_SUPPORT
+WRAPPER_IMPL_AVX512_fFF _ZGVdN4vvv_sincos
+#else
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $1344, %rsp
+ movq __svml_d_trig_data@GOTPCREL(%rip), %rax
+ vmovaps %zmm0, %zmm4
+ movq $-1, %rdx
+ vmovups __dSignMask(%rax), %zmm12
+ vmovups __dInvPI(%rax), %zmm5
+
+/* ARGUMENT RANGE REDUCTION:
+ Absolute argument: X' = |X| */
+ vpandnq %zmm4, %zmm12, %zmm3
+ vmovups __dPI1_FMA(%rax), %zmm7
+ vmovups __dPI3_FMA(%rax), %zmm9
+
+/* SinR = X' - SinN*Pi1 */
+ vmovaps %zmm3, %zmm8
+
+/* CosR = SinX - CosN*Pi1 */
+ vmovaps %zmm3, %zmm10
+
+/* SinY = X'*InvPi + RS : right shifter add */
+ vfmadd213pd __dRShifter(%rax), %zmm3, %zmm5
+ vmovups __dC6(%rax), %zmm13
+
+/* SinN = Y - RS : right shifter sub */
+ vsubpd __dRShifter(%rax), %zmm5, %zmm1
+ vmovaps %zmm13, %zmm14
+
+/* SinSignRes = Y<<63 : shift LSB to MSB place for result sign */
+ vpsllq $63, %zmm5, %zmm2
+ vcmppd $22, __dRangeVal(%rax), %zmm3, %k1
+
+/* Update CosRSign and CosSignRes signs */
+ vmovaps %zmm12, %zmm5
+ vfnmadd231pd %zmm1, %zmm7, %zmm8
+
+/* SinR = SinR - SinN*Pi2 */
+ vfnmadd231pd __dPI2_FMA(%rax), %zmm1, %zmm8
+
+/* Sine result sign: SinRSign = SignMask & SinR */
+ vpandq %zmm8, %zmm12, %zmm11
+
+/* Set SinRSign to 0.5 */
+ vporq __dOneHalf(%rax), %zmm11, %zmm6
+ vpternlogq $150, %zmm2, %zmm11, %zmm5
+
+/* Update sign SinSignRes */
+ vpternlogq $120, %zmm4, %zmm12, %zmm2
+
+/* Polynomial approximation */
+ vmovups __dC7(%rax), %zmm11
+
+/* CosN = SinN +(-)0.5 */
+ vaddpd %zmm6, %zmm1, %zmm0
+
+/* SinR = SinR - SinN*Pi3 */
+ vfnmadd213pd %zmm8, %zmm9, %zmm1
+ vfnmadd231pd %zmm0, %zmm7, %zmm10
+
+/* SinR2 = SinR^2 */
+ vmulpd %zmm1, %zmm1, %zmm15
+
+/* Grab SignX
+ CosR = CosR - CosN*Pi2 */
+ vfnmadd231pd __dPI2_FMA(%rax), %zmm0, %zmm10
+ vfmadd231pd __dC7(%rax), %zmm15, %zmm14
+
+/* CosR = CosR - CosN*Pi3 */
+ vfnmadd213pd %zmm10, %zmm9, %zmm0
+ vfmadd213pd __dC5(%rax), %zmm15, %zmm14
+
+/* CosR2 = CosR^2 */
+ vmulpd %zmm0, %zmm0, %zmm12
+ vfmadd213pd __dC4(%rax), %zmm15, %zmm14
+ vfmadd213pd %zmm13, %zmm12, %zmm11
+
+/* SinPoly = C3 + SinR2*(C4 + SinR2*(C5 + SinR2*(C6 + SinR2*C7))) */
+ vfmadd213pd __dC3(%rax), %zmm15, %zmm14
+ vfmadd213pd __dC5(%rax), %zmm12, %zmm11
+
+/* SinPoly = C2 + SinR2*SinPoly */
+ vfmadd213pd __dC2(%rax), %zmm15, %zmm14
+ vfmadd213pd __dC4(%rax), %zmm12, %zmm11
+
+/* SinPoly = C1 + SinR2*SinPoly */
+ vfmadd213pd __dC1(%rax), %zmm15, %zmm14
+
+/* CosPoly = C3 + CosR2*(C4 + CosR2*(C5 + CosR2*(C6 + CosR2*C7))) */
+ vfmadd213pd __dC3(%rax), %zmm12, %zmm11
+
+/* SinPoly = SinR2*SinPoly */
+ vmulpd %zmm15, %zmm14, %zmm13
+
+/* CosPoly = C2 + CosR2*CosPoly */
+ vfmadd213pd __dC2(%rax), %zmm12, %zmm11
+
+/* SinPoly = SinR*SinPoly */
+ vfmadd213pd %zmm1, %zmm1, %zmm13
+ vpbroadcastq %rdx, %zmm1{%k1}{z}
+
+/* CosPoly = C1 + CosR2*CosPoly */
+ vfmadd213pd __dC1(%rax), %zmm12, %zmm11
+ vptestmq %zmm1, %zmm1, %k0
+ kmovw %k0, %ecx
+
+/* CosPoly = CosR2*CosPoly */
+ vmulpd %zmm12, %zmm11, %zmm14
+ movzbl %cl, %ecx
+
+/* CosPoly = CosR*CosPoly */
+ vfmadd213pd %zmm0, %zmm0, %zmm14
+
+/* Final reconstruction.
+ Update Sin result's sign */
+ vpxorq %zmm2, %zmm13, %zmm0
+
+/* Update Cos result's sign */
+ vpxorq %zmm5, %zmm14, %zmm2
+ testl %ecx, %ecx
+ jne .LBL_1_3
+
+.LBL_1_2:
+ cfi_remember_state
+ vmovups %zmm0, (%rdi)
+ vmovups %zmm2, (%rsi)
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+
+.LBL_1_3:
+ cfi_restore_state
+ vmovups %zmm4, 1152(%rsp)
+ vmovups %zmm0, 1216(%rsp)
+ vmovups %zmm2, 1280(%rsp)
+ je .LBL_1_2
+
+ xorb %dl, %dl
+ kmovw %k4, 1048(%rsp)
+ xorl %eax, %eax
+ kmovw %k5, 1040(%rsp)
+ kmovw %k6, 1032(%rsp)
+ kmovw %k7, 1024(%rsp)
+ vmovups %zmm16, 960(%rsp)
+ vmovups %zmm17, 896(%rsp)
+ vmovups %zmm18, 832(%rsp)
+ vmovups %zmm19, 768(%rsp)
+ vmovups %zmm20, 704(%rsp)
+ vmovups %zmm21, 640(%rsp)
+ vmovups %zmm22, 576(%rsp)
+ vmovups %zmm23, 512(%rsp)
+ vmovups %zmm24, 448(%rsp)
+ vmovups %zmm25, 384(%rsp)
+ vmovups %zmm26, 320(%rsp)
+ vmovups %zmm27, 256(%rsp)
+ vmovups %zmm28, 192(%rsp)
+ vmovups %zmm29, 128(%rsp)
+ vmovups %zmm30, 64(%rsp)
+ vmovups %zmm31, (%rsp)
+ movq %rsi, 1056(%rsp)
+ movq %r12, 1096(%rsp)
+ cfi_offset_rel_rsp (12, 1096)
+ movb %dl, %r12b
+ movq %r13, 1088(%rsp)
+ cfi_offset_rel_rsp (13, 1088)
+ movl %eax, %r13d
+ movq %r14, 1080(%rsp)
+ cfi_offset_rel_rsp (14, 1080)
+ movl %ecx, %r14d
+ movq %r15, 1072(%rsp)
+ cfi_offset_rel_rsp (15, 1072)
+ movq %rbx, 1064(%rsp)
+ movq %rdi, %rbx
+ cfi_remember_state
+
+.LBL_1_6:
+ btl %r13d, %r14d
+ jc .LBL_1_13
+
+.LBL_1_7:
+ lea 1(%r13), %esi
+ btl %esi, %r14d
+ jc .LBL_1_10
+
+.LBL_1_8:
+ addb $1, %r12b
+ addl $2, %r13d
+ cmpb $16, %r12b
+ jb .LBL_1_6
+
+ movq %rbx, %rdi
+ kmovw 1048(%rsp), %k4
+ movq 1056(%rsp), %rsi
+ kmovw 1040(%rsp), %k5
+ movq 1096(%rsp), %r12
+ cfi_restore (%r12)
+ kmovw 1032(%rsp), %k6
+ movq 1088(%rsp), %r13
+ cfi_restore (%r13)
+ kmovw 1024(%rsp), %k7
+ vmovups 960(%rsp), %zmm16
+ vmovups 896(%rsp), %zmm17
+ vmovups 832(%rsp), %zmm18
+ vmovups 768(%rsp), %zmm19
+ vmovups 704(%rsp), %zmm20
+ vmovups 640(%rsp), %zmm21
+ vmovups 576(%rsp), %zmm22
+ vmovups 512(%rsp), %zmm23
+ vmovups 448(%rsp), %zmm24
+ vmovups 384(%rsp), %zmm25
+ vmovups 320(%rsp), %zmm26
+ vmovups 256(%rsp), %zmm27
+ vmovups 192(%rsp), %zmm28
+ vmovups 128(%rsp), %zmm29
+ vmovups 64(%rsp), %zmm30
+ vmovups (%rsp), %zmm31
+ movq 1080(%rsp), %r14
+ cfi_restore (%r14)
+ movq 1072(%rsp), %r15
+ cfi_restore (%r15)
+ movq 1064(%rsp), %rbx
+ vmovups 1216(%rsp), %zmm0
+ vmovups 1280(%rsp), %zmm2
+ jmp .LBL_1_2
+
+.LBL_1_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ shlq $4, %r15
+ vmovsd 1160(%rsp,%r15), %xmm0
+
+ call sin@PLT
+
+ vmovsd %xmm0, 1224(%rsp,%r15)
+ vmovsd 1160(%rsp,%r15), %xmm0
+
+ call cos@PLT
+
+ vmovsd %xmm0, 1288(%rsp,%r15)
+ jmp .LBL_1_8
+
+.LBL_1_13:
+ movzbl %r12b, %r15d
+ shlq $4, %r15
+ vmovsd 1152(%rsp,%r15), %xmm0
+
+ call sin@PLT
+
+ vmovsd %xmm0, 1216(%rsp,%r15)
+ vmovsd 1152(%rsp,%r15), %xmm0
+
+ call cos@PLT
+
+ vmovsd %xmm0, 1280(%rsp,%r15)
+ jmp .LBL_1_7
+
+#endif
+END (_ZGVeN8vvv_sincos_knl)
+
+ENTRY (_ZGVeN8vvv_sincos_skx)
+#ifndef HAVE_AVX512_ASM_SUPPORT
+WRAPPER_IMPL_AVX512_fFF _ZGVdN4vvv_sincos
+#else
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $1344, %rsp
+ movq __svml_d_trig_data@GOTPCREL(%rip), %rax
+ vmovaps %zmm0, %zmm8
+ vmovups __dSignMask(%rax), %zmm4
+ vmovups __dInvPI(%rax), %zmm9
+ vmovups __dRShifter(%rax), %zmm10
+ vmovups __dPI1_FMA(%rax), %zmm13
+ vmovups __dPI2_FMA(%rax), %zmm14
+ vmovups __dOneHalf(%rax), %zmm11
+ vmovups __dPI3_FMA(%rax), %zmm2
+
+/* ARGUMENT RANGE REDUCTION:
+ Absolute argument: X' = |X| */
+ vandnpd %zmm8, %zmm4, %zmm7
+
+/* SinY = X'*InvPi + RS : right shifter add */
+ vfmadd213pd %zmm10, %zmm7, %zmm9
+ vcmppd $18, __dRangeVal(%rax), %zmm7, %k1
+
+/* SinSignRes = Y<<63 : shift LSB to MSB place for result sign */
+ vpsllq $63, %zmm9, %zmm6
+
+/* SinN = Y - RS : right shifter sub */
+ vsubpd %zmm10, %zmm9, %zmm5
+ vmovups __dC5(%rax), %zmm9
+ vmovups __dC4(%rax), %zmm10
+
+/* SinR = X' - SinN*Pi1 */
+ vmovaps %zmm7, %zmm15
+ vfnmadd231pd %zmm5, %zmm13, %zmm15
+
+/* SinR = SinR - SinN*Pi2 */
+ vfnmadd231pd %zmm5, %zmm14, %zmm15
+
+/* Sine result sign: SinRSign = SignMask & SinR */
+ vandpd %zmm15, %zmm4, %zmm1
+
+/* Set SinRSign to 0.5 */
+ vorpd %zmm1, %zmm11, %zmm12
+ vmovups __dC3(%rax), %zmm11
+
+/* CosN = SinN +(-)0.5 */
+ vaddpd %zmm12, %zmm5, %zmm3
+
+/* SinR = SinR - SinN*Pi3 */
+ vfnmadd213pd %zmm15, %zmm2, %zmm5
+ vmovups __dC2(%rax), %zmm12
+
+/* SinR2 = SinR^2 */
+ vmulpd %zmm5, %zmm5, %zmm15
+
+/* CosR = SinX - CosN*Pi1 */
+ vmovaps %zmm7, %zmm0
+ vfnmadd231pd %zmm3, %zmm13, %zmm0
+ vmovups __dC1(%rax), %zmm13
+
+/* Grab SignX
+ CosR = CosR - CosN*Pi2 */
+ vfnmadd231pd %zmm3, %zmm14, %zmm0
+
+/* CosR = CosR - CosN*Pi3 */
+ vfnmadd213pd %zmm0, %zmm2, %zmm3
+
+/* Polynomial approximation */
+ vmovups __dC7(%rax), %zmm0
+
+/* Update CosRSign and CosSignRes signs */
+ vmovaps %zmm4, %zmm2
+ vpternlogq $150, %zmm6, %zmm1, %zmm2
+
+/* Update sign SinSignRes */
+ vpternlogq $120, %zmm8, %zmm4, %zmm6
+
+/* CosR2 = CosR^2 */
+ vmulpd %zmm3, %zmm3, %zmm1
+ vmovups __dC6(%rax), %zmm4
+ vmovaps %zmm0, %zmm14
+ vfmadd213pd %zmm4, %zmm1, %zmm0
+ vfmadd213pd %zmm4, %zmm15, %zmm14
+ vfmadd213pd %zmm9, %zmm1, %zmm0
+ vfmadd213pd %zmm9, %zmm15, %zmm14
+ vfmadd213pd %zmm10, %zmm1, %zmm0
+ vfmadd213pd %zmm10, %zmm15, %zmm14
+
+/* CosPoly = C3 + CosR2*(C4 + CosR2*(C5 + CosR2*(C6 + CosR2*C7))) */
+ vfmadd213pd %zmm11, %zmm1, %zmm0
+
+/* SinPoly = C3 + SinR2*(C4 + SinR2*(C5 + SinR2*(C6 + SinR2*C7))) */
+ vfmadd213pd %zmm11, %zmm15, %zmm14
+
+/* CosPoly = C2 + CosR2*CosPoly */
+ vfmadd213pd %zmm12, %zmm1, %zmm0
+
+/* SinPoly = C2 + SinR2*SinPoly */
+ vfmadd213pd %zmm12, %zmm15, %zmm14
+
+/* CosPoly = C1 + CosR2*CosPoly */
+ vfmadd213pd %zmm13, %zmm1, %zmm0
+
+/* SinPoly = C1 + SinR2*SinPoly */
+ vfmadd213pd %zmm13, %zmm15, %zmm14
+
+/* CosPoly = CosR2*CosPoly */
+ vmulpd %zmm1, %zmm0, %zmm1
+
+/* SinPoly = SinR2*SinPoly */
+ vmulpd %zmm15, %zmm14, %zmm4
+
+/* CosPoly = CosR*CosPoly */
+ vfmadd213pd %zmm3, %zmm3, %zmm1
+
+/* SinPoly = SinR*SinPoly */
+ vfmadd213pd %zmm5, %zmm5, %zmm4
+ vpbroadcastq .L_2il0floatpacket.15(%rip), %zmm3
+
+/* Update Cos result's sign */
+ vxorpd %zmm2, %zmm1, %zmm1
+
+/* Final reconstruction.
+ Update Sin result's sign */
+ vxorpd %zmm6, %zmm4, %zmm0
+ vpandnq %zmm7, %zmm7, %zmm3{%k1}
+ vcmppd $3, %zmm3, %zmm3, %k0
+ kmovw %k0, %ecx
+ testl %ecx, %ecx
+ jne .LBL_2_3
+
+.LBL_2_2:
+ cfi_remember_state
+ vmovups %zmm0, (%rdi)
+ vmovups %zmm1, (%rsi)
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+
+.LBL_2_3:
+ cfi_restore_state
+ vmovups %zmm8, 1152(%rsp)
+ vmovups %zmm0, 1216(%rsp)
+ vmovups %zmm1, 1280(%rsp)
+ je .LBL_2_2
+
+ xorb %dl, %dl
+ xorl %eax, %eax
+ kmovw %k4, 1048(%rsp)
+ kmovw %k5, 1040(%rsp)
+ kmovw %k6, 1032(%rsp)
+ kmovw %k7, 1024(%rsp)
+ vmovups %zmm16, 960(%rsp)
+ vmovups %zmm17, 896(%rsp)
+ vmovups %zmm18, 832(%rsp)
+ vmovups %zmm19, 768(%rsp)
+ vmovups %zmm20, 704(%rsp)
+ vmovups %zmm21, 640(%rsp)
+ vmovups %zmm22, 576(%rsp)
+ vmovups %zmm23, 512(%rsp)
+ vmovups %zmm24, 448(%rsp)
+ vmovups %zmm25, 384(%rsp)
+ vmovups %zmm26, 320(%rsp)
+ vmovups %zmm27, 256(%rsp)
+ vmovups %zmm28, 192(%rsp)
+ vmovups %zmm29, 128(%rsp)
+ vmovups %zmm30, 64(%rsp)
+ vmovups %zmm31, (%rsp)
+ movq %rsi, 1056(%rsp)
+ movq %r12, 1096(%rsp)
+ cfi_offset_rel_rsp (12, 1096)
+ movb %dl, %r12b
+ movq %r13, 1088(%rsp)
+ cfi_offset_rel_rsp (13, 1088)
+ movl %eax, %r13d
+ movq %r14, 1080(%rsp)
+ cfi_offset_rel_rsp (14, 1080)
+ movl %ecx, %r14d
+ movq %r15, 1072(%rsp)
+ cfi_offset_rel_rsp (15, 1072)
+ movq %rbx, 1064(%rsp)
+ movq %rdi, %rbx
+ cfi_remember_state
+
+.LBL_2_6:
+ btl %r13d, %r14d
+ jc .LBL_2_13
+
+.LBL_2_7:
+ lea 1(%r13), %esi
+ btl %esi, %r14d
+ jc .LBL_2_10
+
+.LBL_2_8:
+ incb %r12b
+ addl $2, %r13d
+ cmpb $16, %r12b
+ jb .LBL_2_6
+
+ kmovw 1048(%rsp), %k4
+ movq %rbx, %rdi
+ kmovw 1040(%rsp), %k5
+ kmovw 1032(%rsp), %k6
+ kmovw 1024(%rsp), %k7
+ vmovups 960(%rsp), %zmm16
+ vmovups 896(%rsp), %zmm17
+ vmovups 832(%rsp), %zmm18
+ vmovups 768(%rsp), %zmm19
+ vmovups 704(%rsp), %zmm20
+ vmovups 640(%rsp), %zmm21
+ vmovups 576(%rsp), %zmm22
+ vmovups 512(%rsp), %zmm23
+ vmovups 448(%rsp), %zmm24
+ vmovups 384(%rsp), %zmm25
+ vmovups 320(%rsp), %zmm26
+ vmovups 256(%rsp), %zmm27
+ vmovups 192(%rsp), %zmm28
+ vmovups 128(%rsp), %zmm29
+ vmovups 64(%rsp), %zmm30
+ vmovups (%rsp), %zmm31
+ vmovups 1216(%rsp), %zmm0
+ vmovups 1280(%rsp), %zmm1
+ movq 1056(%rsp), %rsi
+ movq 1096(%rsp), %r12
+ cfi_restore (%r12)
+ movq 1088(%rsp), %r13
+ cfi_restore (%r13)
+ movq 1080(%rsp), %r14
+ cfi_restore (%r14)
+ movq 1072(%rsp), %r15
+ cfi_restore (%r15)
+ movq 1064(%rsp), %rbx
+ jmp .LBL_2_2
+
+.LBL_2_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ shlq $4, %r15
+ vmovsd 1160(%rsp,%r15), %xmm0
+ vzeroupper
+ vmovsd 1160(%rsp,%r15), %xmm0
+
+ call sin@PLT
+
+ vmovsd %xmm0, 1224(%rsp,%r15)
+ vmovsd 1160(%rsp,%r15), %xmm0
+
+ call cos@PLT
+
+ vmovsd %xmm0, 1288(%rsp,%r15)
+ jmp .LBL_2_8
+
+.LBL_2_13:
+ movzbl %r12b, %r15d
+ shlq $4, %r15
+ vmovsd 1152(%rsp,%r15), %xmm0
+ vzeroupper
+ vmovsd 1152(%rsp,%r15), %xmm0
+
+ call sin@PLT
+
+ vmovsd %xmm0, 1216(%rsp,%r15)
+ vmovsd 1152(%rsp,%r15), %xmm0
+
+ call cos@PLT
+
+ vmovsd %xmm0, 1280(%rsp,%r15)
+ jmp .LBL_2_7
+
+#endif
+END (_ZGVeN8vvv_sincos_skx)
+
+ .section .rodata, "a"
+.L_2il0floatpacket.15:
+ .long 0xffffffff,0xffffffff
+ .type .L_2il0floatpacket.15,@object
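
The .LBL_*_6/.LBL_*_8 loops above implement the special-case fallback: lanes whose input failed the __dRangeVal compare are collected into a bitmask (kmovw into %ecx), the vector state is spilled to the stack, and each flagged lane is recomputed with the scalar sin()/cos() from libm. In C the loop amounts to the sketch below; VLEN and the argument layout are illustrative.

#include <math.h>

#define VLEN 8   /* lanes per vector in the _ZGVeN8 variants */

/* mask has one bit per lane; x, s and c are the spilled input and the
   already-computed vector results.  Flagged lanes are simply redone
   with the scalar routines.  */
static void
fixup_special_lanes (unsigned int mask, const double x[VLEN],
                     double s[VLEN], double c[VLEN])
{
  for (int i = 0; i < VLEN; i++)
    if (mask & (1u << i))
      {
        s[i] = sin (x[i]);
        c[i] = cos (x[i]);
      }
}
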
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S
new file mode 100644
index 0000000000..0654d3c19b
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S
@@ -0,0 +1,39 @@
+/* Multiple versions of vectorized cosf.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+ .text
+ENTRY (_ZGVeN16v_cosf)
+ .type _ZGVeN16v_cosf, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq _ZGVeN16v_cosf_skx(%rip), %rax
+ testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
+ jnz 2f
+ leaq _ZGVeN16v_cosf_knl(%rip), %rax
+ testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
+ jnz 2f
+ leaq _ZGVeN16v_cosf_avx2_wrapper(%rip), %rax
+2: ret
+END (_ZGVeN16v_cosf)
+
+#define _ZGVeN16v_cosf _ZGVeN16v_cosf_avx2_wrapper
+#include "../svml_s_cosf16_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
new file mode 100644
index 0000000000..e777476d73
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
@@ -0,0 +1,460 @@
+/* Function cosf vectorized with AVX-512. KNL and SKX versions.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "svml_s_trig_data.h"
+#include "svml_s_wrapper_impl.h"
+
+ .text
+ENTRY (_ZGVeN16v_cosf_knl)
+#ifndef HAVE_AVX512_ASM_SUPPORT
+WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf
+#else
+/*
+ ALGORITHM DESCRIPTION:
+
+ 1) Range reduction to [-Pi/2; +Pi/2] interval
+ a) We remove sign using AND operation
+ b) Add Pi/2 value to argument X for Cos to Sin transformation
+ c) Getting octant Y by 1/Pi multiplication
+ d) Add "Right Shifter" value
+ e) Treat obtained value as integer for destination sign setting.
+ Shift first bit of this value to the last (sign) position
+ f) Subtract "Right Shifter" value
+ g) Subtract 0.5 from result for octant correction
+ h) Subtract Y*PI from X argument, where PI is divided into 4 parts:
+ X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
+ 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
+ a) Calculate X^2 = X * X
+ b) Calculate polynomial:
+ R = X + X * X^2 * (A3 + x^2 * (A5 + .....
+ 3) Destination sign setting
+ a) Set shifted destination sign using XOR operation:
+ R = XOR( R, S );
+ */
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $1280, %rsp
+ movq __svml_s_trig_data@GOTPCREL(%rip), %rdx
+
+/*
+ h) Subtract Y*PI from X argument, where PI is divided into 4 parts:
+ X = X - Y*PI1 - Y*PI2 - Y*PI3
+ */
+ vmovaps %zmm0, %zmm6
+ movl $-1, %eax
+
+/* b) Add Pi/2 value to argument X for Cos to Sin transformation */
+ vaddps __sHalfPI(%rdx), %zmm0, %zmm2
+ vmovups __sRShifter(%rdx), %zmm3
+
+/*
+ 1) Range reduction to [-Pi/2; +Pi/2] interval
+ c) Getting octant Y by 1/Pi multiplication
+ d) Add "Right Shifter" (0x4B000000) value
+ */
+ vfmadd132ps __sInvPI(%rdx), %zmm3, %zmm2
+ vmovups __sPI1_FMA(%rdx), %zmm5
+
+/* f) Subtract "Right Shifter" (0x4B000000) value */
+ vsubps %zmm3, %zmm2, %zmm4
+ vmovups __sA9_FMA(%rdx), %zmm9
+
+/* Check for large and special arguments */
+ vpandd __sAbsMask(%rdx), %zmm0, %zmm1
+
+/*
+ e) Treat obtained value as integer for destination sign setting.
+ Shift first bit of this value to the last (sign) position (S << 31)
+ */
+ vpslld $31, %zmm2, %zmm8
+ vcmpps $22, __sRangeReductionVal(%rdx), %zmm1, %k1
+ vpbroadcastd %eax, %zmm12{%k1}{z}
+
+/* g) Subtract 0.5 from result for octant correction */
+ vsubps __sOneHalf(%rdx), %zmm4, %zmm7
+ vptestmd %zmm12, %zmm12, %k0
+ vfnmadd231ps %zmm7, %zmm5, %zmm6
+ kmovw %k0, %ecx
+ vfnmadd231ps __sPI2_FMA(%rdx), %zmm7, %zmm6
+ vfnmadd132ps __sPI3_FMA(%rdx), %zmm6, %zmm7
+
+/* a) Calculate X^2 = X * X */
+ vmulps %zmm7, %zmm7, %zmm10
+
+/*
+ 3) Destination sign setting
+ a) Set shifted destination sign using XOR operation:
+ R = XOR( R, S );
+ */
+ vpxord %zmm8, %zmm7, %zmm11
+
+/*
+ b) Calculate polynomial:
+ R = X + X * X^2 * (A3 + x^2 * (A5 + x^2 * (A7 + x^2 * (A9))));
+ */
+ vfmadd213ps __sA7_FMA(%rdx), %zmm10, %zmm9
+ vfmadd213ps __sA5_FMA(%rdx), %zmm10, %zmm9
+ vfmadd213ps __sA3(%rdx), %zmm10, %zmm9
+ vmulps %zmm10, %zmm9, %zmm1
+ vfmadd213ps %zmm11, %zmm11, %zmm1
+ testl %ecx, %ecx
+ jne .LBL_1_3
+
+.LBL_1_2:
+ cfi_remember_state
+ vmovaps %zmm1, %zmm0
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+
+.LBL_1_3:
+ cfi_restore_state
+ vmovups %zmm0, 1152(%rsp)
+ vmovups %zmm1, 1216(%rsp)
+ je .LBL_1_2
+
+ xorb %dl, %dl
+ kmovw %k4, 1048(%rsp)
+ xorl %eax, %eax
+ kmovw %k5, 1040(%rsp)
+ kmovw %k6, 1032(%rsp)
+ kmovw %k7, 1024(%rsp)
+ vmovups %zmm16, 960(%rsp)
+ vmovups %zmm17, 896(%rsp)
+ vmovups %zmm18, 832(%rsp)
+ vmovups %zmm19, 768(%rsp)
+ vmovups %zmm20, 704(%rsp)
+ vmovups %zmm21, 640(%rsp)
+ vmovups %zmm22, 576(%rsp)
+ vmovups %zmm23, 512(%rsp)
+ vmovups %zmm24, 448(%rsp)
+ vmovups %zmm25, 384(%rsp)
+ vmovups %zmm26, 320(%rsp)
+ vmovups %zmm27, 256(%rsp)
+ vmovups %zmm28, 192(%rsp)
+ vmovups %zmm29, 128(%rsp)
+ vmovups %zmm30, 64(%rsp)
+ vmovups %zmm31, (%rsp)
+ movq %rsi, 1064(%rsp)
+ movq %rdi, 1056(%rsp)
+ movq %r12, 1096(%rsp)
+ cfi_offset_rel_rsp (12, 1096)
+ movb %dl, %r12b
+ movq %r13, 1088(%rsp)
+ cfi_offset_rel_rsp (13, 1088)
+ movl %ecx, %r13d
+ movq %r14, 1080(%rsp)
+ cfi_offset_rel_rsp (14, 1080)
+ movl %eax, %r14d
+ movq %r15, 1072(%rsp)
+ cfi_offset_rel_rsp (15, 1072)
+ cfi_remember_state
+
+.LBL_1_6:
+ btl %r14d, %r13d
+ jc .LBL_1_12
+
+.LBL_1_7:
+ lea 1(%r14), %esi
+ btl %esi, %r13d
+ jc .LBL_1_10
+
+.LBL_1_8:
+ addb $1, %r12b
+ addl $2, %r14d
+ cmpb $16, %r12b
+ jb .LBL_1_6
+
+ kmovw 1048(%rsp), %k4
+ movq 1064(%rsp), %rsi
+ kmovw 1040(%rsp), %k5
+ movq 1056(%rsp), %rdi
+ kmovw 1032(%rsp), %k6
+ movq 1096(%rsp), %r12
+ cfi_restore (%r12)
+ movq 1088(%rsp), %r13
+ cfi_restore (%r13)
+ kmovw 1024(%rsp), %k7
+ vmovups 960(%rsp), %zmm16
+ vmovups 896(%rsp), %zmm17
+ vmovups 832(%rsp), %zmm18
+ vmovups 768(%rsp), %zmm19
+ vmovups 704(%rsp), %zmm20
+ vmovups 640(%rsp), %zmm21
+ vmovups 576(%rsp), %zmm22
+ vmovups 512(%rsp), %zmm23
+ vmovups 448(%rsp), %zmm24
+ vmovups 384(%rsp), %zmm25
+ vmovups 320(%rsp), %zmm26
+ vmovups 256(%rsp), %zmm27
+ vmovups 192(%rsp), %zmm28
+ vmovups 128(%rsp), %zmm29
+ vmovups 64(%rsp), %zmm30
+ vmovups (%rsp), %zmm31
+ movq 1080(%rsp), %r14
+ cfi_restore (%r14)
+ movq 1072(%rsp), %r15
+ cfi_restore (%r15)
+ vmovups 1216(%rsp), %zmm1
+ jmp .LBL_1_2
+
+.LBL_1_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ vmovss 1156(%rsp,%r15,8), %xmm0
+ call cosf@PLT
+ vmovss %xmm0, 1220(%rsp,%r15,8)
+ jmp .LBL_1_8
+
+.LBL_1_12:
+ movzbl %r12b, %r15d
+ vmovss 1152(%rsp,%r15,8), %xmm0
+ call cosf@PLT
+ vmovss %xmm0, 1216(%rsp,%r15,8)
+ jmp .LBL_1_7
+#endif
+END (_ZGVeN16v_cosf_knl)
+
+ENTRY (_ZGVeN16v_cosf_skx)
+#ifndef HAVE_AVX512_ASM_SUPPORT
+WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf
+#else
+/*
+ ALGORITHM DESCRIPTION:
+
+ 1) Range reduction to [-Pi/2; +Pi/2] interval
+ a) We remove sign using AND operation
+ b) Add Pi/2 value to argument X for Cos to Sin transformation
+ c) Getting octant Y by 1/Pi multiplication
+ d) Add "Right Shifter" value
+ e) Treat obtained value as integer for destination sign setting.
+ Shift first bit of this value to the last (sign) position
+ f) Subtract "Right Shifter" value
+ g) Subtract 0.5 from result for octant correction
+ h) Subtract Y*PI from X argument, where PI is divided into 4 parts:
+ X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
+ 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
+ a) Calculate X^2 = X * X
+ b) Calculate polynomial:
+ R = X + X * X^2 * (A3 + x^2 * (A5 + .....
+ 3) Destination sign setting
+ a) Set shifted destination sign using XOR operation:
+ R = XOR( R, S );
+ */
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $1280, %rsp
+ movq __svml_s_trig_data@GOTPCREL(%rip), %rax
+
+/*
+ h) Subtract Y*PI from X argument, where PI is divided into 4 parts:
+ X = X - Y*PI1 - Y*PI2 - Y*PI3
+ */
+ vmovaps %zmm0, %zmm6
+ vmovups .L_2il0floatpacket.13(%rip), %zmm12
+ vmovups __sRShifter(%rax), %zmm3
+ vmovups __sPI1_FMA(%rax), %zmm5
+ vmovups __sA9_FMA(%rax), %zmm9
+
+/* b) Add Pi/2 value to argument X for Cos to Sin transformation */
+ vaddps __sHalfPI(%rax), %zmm0, %zmm2
+
+/* Check for large and special arguments */
+ vandps __sAbsMask(%rax), %zmm0, %zmm1
+
+/*
+ 1) Range reduction to [-Pi/2; +Pi/2] interval
+ c) Getting octant Y by 1/Pi multiplication
+ d) Add "Right Shifter" (0x4B000000) value
+ */
+ vfmadd132ps __sInvPI(%rax), %zmm3, %zmm2
+ vcmpps $18, __sRangeReductionVal(%rax), %zmm1, %k1
+
+/*
+ e) Treat obtained value as integer for destination sign setting.
+ Shift first bit of this value to the last (sign) position (S << 31)
+ */
+ vpslld $31, %zmm2, %zmm8
+
+/* f) Subtract "Right Shifter" (0x4B000000) value */
+ vsubps %zmm3, %zmm2, %zmm4
+
+/* g) Subtract 0.5 from result for octant correction */
+ vsubps __sOneHalf(%rax), %zmm4, %zmm7
+ vfnmadd231ps %zmm7, %zmm5, %zmm6
+ vfnmadd231ps __sPI2_FMA(%rax), %zmm7, %zmm6
+ vfnmadd132ps __sPI3_FMA(%rax), %zmm6, %zmm7
+
+/* a) Calculate X^2 = X * X */
+ vmulps %zmm7, %zmm7, %zmm10
+
+/*
+ 3) Destination sign setting
+ a) Set shifted destination sign using XOR operation:
+ R = XOR( R, S );
+ */
+ vxorps %zmm8, %zmm7, %zmm11
+
+/*
+ b) Calculate polynomial:
+ R = X + X * X^2 * (A3 + x^2 * (A5 + x^2 * (A7 + x^2 * (A9))));
+ */
+ vfmadd213ps __sA7_FMA(%rax), %zmm10, %zmm9
+ vfmadd213ps __sA5_FMA(%rax), %zmm10, %zmm9
+ vfmadd213ps __sA3(%rax), %zmm10, %zmm9
+ vpandnd %zmm1, %zmm1, %zmm12{%k1}
+ vmulps %zmm10, %zmm9, %zmm1
+ vptestmd %zmm12, %zmm12, %k0
+ vfmadd213ps %zmm11, %zmm11, %zmm1
+ kmovw %k0, %ecx
+ testl %ecx, %ecx
+ jne .LBL_2_3
+.LBL_2_2:
+ cfi_remember_state
+ vmovaps %zmm1, %zmm0
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+
+.LBL_2_3:
+ cfi_restore_state
+ vmovups %zmm0, 1152(%rsp)
+ vmovups %zmm1, 1216(%rsp)
+ je .LBL_2_2
+
+ xorb %dl, %dl
+ xorl %eax, %eax
+ kmovw %k4, 1048(%rsp)
+ kmovw %k5, 1040(%rsp)
+ kmovw %k6, 1032(%rsp)
+ kmovw %k7, 1024(%rsp)
+ vmovups %zmm16, 960(%rsp)
+ vmovups %zmm17, 896(%rsp)
+ vmovups %zmm18, 832(%rsp)
+ vmovups %zmm19, 768(%rsp)
+ vmovups %zmm20, 704(%rsp)
+ vmovups %zmm21, 640(%rsp)
+ vmovups %zmm22, 576(%rsp)
+ vmovups %zmm23, 512(%rsp)
+ vmovups %zmm24, 448(%rsp)
+ vmovups %zmm25, 384(%rsp)
+ vmovups %zmm26, 320(%rsp)
+ vmovups %zmm27, 256(%rsp)
+ vmovups %zmm28, 192(%rsp)
+ vmovups %zmm29, 128(%rsp)
+ vmovups %zmm30, 64(%rsp)
+ vmovups %zmm31, (%rsp)
+ movq %rsi, 1064(%rsp)
+ movq %rdi, 1056(%rsp)
+ movq %r12, 1096(%rsp)
+ cfi_offset_rel_rsp (12, 1096)
+ movb %dl, %r12b
+ movq %r13, 1088(%rsp)
+ cfi_offset_rel_rsp (13, 1088)
+ movl %ecx, %r13d
+ movq %r14, 1080(%rsp)
+ cfi_offset_rel_rsp (14, 1080)
+ movl %eax, %r14d
+ movq %r15, 1072(%rsp)
+ cfi_offset_rel_rsp (15, 1072)
+ cfi_remember_state
+
+.LBL_2_6:
+ btl %r14d, %r13d
+ jc .LBL_2_12
+.LBL_2_7:
+ lea 1(%r14), %esi
+ btl %esi, %r13d
+ jc .LBL_2_10
+.LBL_2_8:
+ incb %r12b
+ addl $2, %r14d
+ cmpb $16, %r12b
+ jb .LBL_2_6
+ kmovw 1048(%rsp), %k4
+ kmovw 1040(%rsp), %k5
+ kmovw 1032(%rsp), %k6
+ kmovw 1024(%rsp), %k7
+ vmovups 960(%rsp), %zmm16
+ vmovups 896(%rsp), %zmm17
+ vmovups 832(%rsp), %zmm18
+ vmovups 768(%rsp), %zmm19
+ vmovups 704(%rsp), %zmm20
+ vmovups 640(%rsp), %zmm21
+ vmovups 576(%rsp), %zmm22
+ vmovups 512(%rsp), %zmm23
+ vmovups 448(%rsp), %zmm24
+ vmovups 384(%rsp), %zmm25
+ vmovups 320(%rsp), %zmm26
+ vmovups 256(%rsp), %zmm27
+ vmovups 192(%rsp), %zmm28
+ vmovups 128(%rsp), %zmm29
+ vmovups 64(%rsp), %zmm30
+ vmovups (%rsp), %zmm31
+ vmovups 1216(%rsp), %zmm1
+ movq 1064(%rsp), %rsi
+ movq 1056(%rsp), %rdi
+ movq 1096(%rsp), %r12
+ cfi_restore (%r12)
+ movq 1088(%rsp), %r13
+ cfi_restore (%r13)
+ movq 1080(%rsp), %r14
+ cfi_restore (%r14)
+ movq 1072(%rsp), %r15
+ cfi_restore (%r15)
+ jmp .LBL_2_2
+
+.LBL_2_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ vmovss 1156(%rsp,%r15,8), %xmm0
+ vzeroupper
+ vmovss 1156(%rsp,%r15,8), %xmm0
+ call cosf@PLT
+ vmovss %xmm0, 1220(%rsp,%r15,8)
+ jmp .LBL_2_8
+.LBL_2_12:
+ movzbl %r12b, %r15d
+ vmovss 1152(%rsp,%r15,8), %xmm0
+ vzeroupper
+ vmovss 1152(%rsp,%r15,8), %xmm0
+ call cosf@PLT
+ vmovss %xmm0, 1216(%rsp,%r15,8)
+ jmp .LBL_2_7
+#endif
+END (_ZGVeN16v_cosf_skx)
+
+ .section .rodata, "a"
+.L_2il0floatpacket.13:
+ .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
+ .type .L_2il0floatpacket.13,@object
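
The cosf kernels above follow the step list a)-h) literally: remove the sign, add Pi/2, take the octant, subtract 0.5 to correct it, reduce, and evaluate a sine polynomial. A scalar float model, with Taylor coefficients standing in for the A3..A9 table entries:

#include <math.h>

static float
cosf_model (float x)
{
  float ax = fabsf (x);                              /* a) remove sign */
  /* b)-f) octant Y of (x + Pi/2) by 1/Pi multiplication and rounding.  */
  float y = nearbyintf ((ax + (float) M_PI_2) / (float) M_PI);
  float n = y - 0.5f;                                /* g) octant correction */
  float r = ax - n * (float) M_PI;                   /* h) (Pi split omitted) */

  /* 2) sine polynomial; Taylor terms for illustration.  */
  float r2 = r * r;
  float p  = r + r * r2 * (-1.0f / 6.0f + r2 * (1.0f / 120.0f));

  /* 3) destination sign: (-1)^Y.  */
  return ((long) y & 1) ? -p : p;
}
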
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core.S
new file mode 100644
index 0000000000..fa2363bb1f
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core.S
@@ -0,0 +1,38 @@
+/* Multiple versions of vectorized cosf, vector length is 4.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+ .text
+ENTRY (_ZGVbN4v_cosf)
+ .type _ZGVbN4v_cosf, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq _ZGVbN4v_cosf_sse4(%rip), %rax
+ testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+ jz 2f
+ ret
+2: leaq _ZGVbN4v_cosf_sse2(%rip), %rax
+ ret
+END (_ZGVbN4v_cosf)
+libmvec_hidden_def (_ZGVbN4v_cosf)
+
+#define _ZGVbN4v_cosf _ZGVbN4v_cosf_sse2
+#include "../svml_s_cosf4_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core_sse4.S
new file mode 100644
index 0000000000..bdb6591905
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core_sse4.S
@@ -0,0 +1,227 @@
+/* Function cosf vectorized with SSE4.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "svml_s_trig_data.h"
+
+ .text
+ENTRY (_ZGVbN4v_cosf_sse4)
+/*
+ ALGORITHM DESCRIPTION:
+
+ 1) Range reduction to [-Pi/2; +Pi/2] interval
+ a) We remove sign using AND operation
+ b) Add Pi/2 value to argument X for Cos to Sin transformation
+ c) Getting octant Y by 1/Pi multiplication
+ d) Add "Right Shifter" value
+ e) Treat obtained value as integer for destination sign setting.
+ Shift first bit of this value to the last (sign) position
+ f) Subtract "Right Shifter" value
+ g) Subtract 0.5 from result for octant correction
+ h) Subtract Y*PI from X argument, where PI is divided into 4 parts:
+ X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
+ 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
+ a) Calculate X^2 = X * X
+ b) Calculate polynomial:
+ R = X + X * X^2 * (A3 + x^2 * (A5 + .....
+ 3) Destination sign setting
+ a) Set shifted destination sign using XOR operation:
+ R = XOR( R, S );
+ */
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $320, %rsp
+ movaps %xmm0, %xmm4
+ movq __svml_s_trig_data@GOTPCREL(%rip), %rax
+ movups __sHalfPI(%rax), %xmm1
+ movups __sRShifter(%rax), %xmm5
+
+/* b) Add Pi/2 value to argument X for Cos to Sin transformation */
+ addps %xmm4, %xmm1
+
+/*
+ 1) Range reduction to [-Pi/2; +Pi/2] interval
+ c) Getting octant Y by 1/Pi multiplication
+ d) Add "Right Shifter" (0x4B000000) value
+ */
+ mulps __sInvPI(%rax), %xmm1
+ movups __sPI1(%rax), %xmm6
+ addps %xmm5, %xmm1
+
+/*
+ e) Treat obtained value as integer for destination sign setting.
+ Shift first bit of this value to the last (sign) position (S << 31)
+ */
+ movaps %xmm1, %xmm2
+
+/* f) Subtract "Right Shifter" (0x4B000000) value */
+ subps %xmm5, %xmm1
+ movups __sPI2(%rax), %xmm7
+ pslld $31, %xmm2
+ movups __sPI3(%rax), %xmm5
+ movups __sAbsMask(%rax), %xmm3
+
+/* Check for large and special arguments */
+ andps %xmm4, %xmm3
+
+/* g) Subtract 0.5 from result for octant correction */
+ subps __sOneHalf(%rax), %xmm1
+ cmpnleps __sRangeReductionVal(%rax), %xmm3
+
+/*
+ h) Subtract Y*PI from X argument, where PI is divided into 4 parts:
+ X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
+ */
+ mulps %xmm1, %xmm6
+ mulps %xmm1, %xmm7
+ mulps %xmm1, %xmm5
+ subps %xmm6, %xmm0
+ movmskps %xmm3, %ecx
+ movups __sPI4(%rax), %xmm6
+ subps %xmm7, %xmm0
+ mulps %xmm6, %xmm1
+ subps %xmm5, %xmm0
+ subps %xmm1, %xmm0
+
+/* a) Calculate X^2 = X * X */
+ movaps %xmm0, %xmm1
+ mulps %xmm0, %xmm1
+
+/*
+ 3) Destination sign setting
+ a) Set shifted destination sign using XOR operation:
+ R = XOR( R, S );
+ */
+ xorps %xmm2, %xmm0
+ movups __sA9(%rax), %xmm2
+
+/*
+ b) Calculate polynomial:
+ R = X + X * X^2 * (A3 + x^2 * (A5 + x^2 * (A7 + x^2 * (A9))));
+ */
+ mulps %xmm1, %xmm2
+ addps __sA7(%rax), %xmm2
+ mulps %xmm1, %xmm2
+ addps __sA5(%rax), %xmm2
+ mulps %xmm1, %xmm2
+ addps __sA3(%rax), %xmm2
+ mulps %xmm2, %xmm1
+ mulps %xmm0, %xmm1
+ addps %xmm1, %xmm0
+ testl %ecx, %ecx
+ jne .LBL_1_3
+
+.LBL_1_2:
+ cfi_remember_state
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+
+.LBL_1_3:
+ cfi_restore_state
+ movups %xmm4, 192(%rsp)
+ movups %xmm0, 256(%rsp)
+ je .LBL_1_2
+
+ xorb %dl, %dl
+ xorl %eax, %eax
+ movups %xmm8, 112(%rsp)
+ movups %xmm9, 96(%rsp)
+ movups %xmm10, 80(%rsp)
+ movups %xmm11, 64(%rsp)
+ movups %xmm12, 48(%rsp)
+ movups %xmm13, 32(%rsp)
+ movups %xmm14, 16(%rsp)
+ movups %xmm15, (%rsp)
+ movq %rsi, 136(%rsp)
+ movq %rdi, 128(%rsp)
+ movq %r12, 168(%rsp)
+ cfi_offset_rel_rsp (12, 168)
+ movb %dl, %r12b
+ movq %r13, 160(%rsp)
+ cfi_offset_rel_rsp (13, 160)
+ movl %ecx, %r13d
+ movq %r14, 152(%rsp)
+ cfi_offset_rel_rsp (14, 152)
+ movl %eax, %r14d
+ movq %r15, 144(%rsp)
+ cfi_offset_rel_rsp (15, 144)
+ cfi_remember_state
+
+.LBL_1_6:
+ btl %r14d, %r13d
+ jc .LBL_1_12
+
+.LBL_1_7:
+ lea 1(%r14), %esi
+ btl %esi, %r13d
+ jc .LBL_1_10
+
+.LBL_1_8:
+ incb %r12b
+ addl $2, %r14d
+ cmpb $16, %r12b
+ jb .LBL_1_6
+
+ movups 112(%rsp), %xmm8
+ movups 96(%rsp), %xmm9
+ movups 80(%rsp), %xmm10
+ movups 64(%rsp), %xmm11
+ movups 48(%rsp), %xmm12
+ movups 32(%rsp), %xmm13
+ movups 16(%rsp), %xmm14
+ movups (%rsp), %xmm15
+ movq 136(%rsp), %rsi
+ movq 128(%rsp), %rdi
+ movq 168(%rsp), %r12
+ cfi_restore (%r12)
+ movq 160(%rsp), %r13
+ cfi_restore (%r13)
+ movq 152(%rsp), %r14
+ cfi_restore (%r14)
+ movq 144(%rsp), %r15
+ cfi_restore (%r15)
+ movups 256(%rsp), %xmm0
+ jmp .LBL_1_2
+
+.LBL_1_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ movss 196(%rsp,%r15,8), %xmm0
+
+ call cosf@PLT
+
+ movss %xmm0, 260(%rsp,%r15,8)
+ jmp .LBL_1_8
+
+.LBL_1_12:
+ movzbl %r12b, %r15d
+ movss 192(%rsp,%r15,8), %xmm0
+
+ call cosf@PLT
+
+ movss %xmm0, 256(%rsp,%r15,8)
+ jmp .LBL_1_7
+END (_ZGVbN4v_cosf_sse4)
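
The kernel above follows the commented steps a) through h) with packed SSE arithmetic. For reference, here is a scalar C sketch of the same scheme; the constant values are illustrative stand-ins for the __svml_s_trig_data table (pi is split into only two parts here instead of the kernel's four, and the large arguments and special values that the kernel routes to scalar cosf are not handled):

    /* Scalar sketch of the cosf reduction and polynomial; constants are
       illustrative, not the real table entries.  */
    #include <math.h>
    #include <stdint.h>
    #include <string.h>

    static float
    cosf_sketch (float x)
    {
      const float HALF_PI = 0x1.921fb6p+0f;   /* ~pi/2 */
      const float INV_PI  = 0x1.45f306p-2f;   /* ~1/pi */
      const float RSHIFT  = 0x1p+23f;         /* "right shifter", 0x4B000000 */
      const float PI1 = 3.140625f, PI2 = 9.676536e-4f; /* two-part pi split */
      const float A3 = -1.0f / 6, A5 = 1.0f / 120,
                  A7 = -1.0f / 5040, A9 = 1.0f / 362880; /* Taylor stand-ins */

      /* a) drop the sign: cos is even.  */
      x = fabsf (x);
      /* b)-d) cos(x) = sin(x + pi/2); scale by 1/pi and add the right
         shifter so the nearest integer lands in the low mantissa bits
         (valid while x stays below the large-argument threshold).  */
      float m = (x + HALF_PI) * INV_PI + RSHIFT;
      /* e) the lowest octant bit becomes the sign of the result.  */
      uint32_t mb;  memcpy (&mb, &m, sizeof mb);
      uint32_t sign = mb << 31;
      /* f)+g) recover the octant as a float and apply the -0.5 correction.  */
      float y = (m - RSHIFT) - 0.5f;
      /* h) subtract y*pi using the split to keep the reduction accurate.  */
      float r = (x - y * PI1) - y * PI2;
      /* 3a) fold the sign into r, then 2) the odd sin polynomial.  */
      uint32_t rb;  memcpy (&rb, &r, sizeof rb);
      rb ^= sign;
      memcpy (&r, &rb, sizeof r);
      float r2 = r * r;
      float p = A3 + r2 * (A5 + r2 * (A7 + r2 * A9));
      return r + r * r2 * p;
    }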
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core.S
new file mode 100644
index 0000000000..e14bba4a76
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core.S
@@ -0,0 +1,38 @@
+/* Multiple versions of vectorized cosf, vector length is 8.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+ .text
+ENTRY (_ZGVdN8v_cosf)
+ .type _ZGVdN8v_cosf, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq _ZGVdN8v_cosf_avx2(%rip), %rax
+ testl $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+ jz 2f
+ ret
+2: leaq _ZGVdN8v_cosf_sse_wrapper(%rip), %rax
+ ret
+END (_ZGVdN8v_cosf)
+libmvec_hidden_def (_ZGVdN8v_cosf)
+
+#define _ZGVdN8v_cosf _ZGVdN8v_cosf_sse_wrapper
+#include "../svml_s_cosf8_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core_avx2.S
new file mode 100644
index 0000000000..1efc943295
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core_avx2.S
@@ -0,0 +1,215 @@
+/* Function cosf vectorized with AVX2.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+
+#include <sysdep.h>
+#include "svml_s_trig_data.h"
+
+ .text
+ENTRY (_ZGVdN8v_cosf_avx2)
+/*
+ ALGORITHM DESCRIPTION:
+
+ 1) Range reduction to [-Pi/2; +Pi/2] interval
+ a) We remove sign using AND operation
+ b) Add Pi/2 value to argument X for Cos to Sin transformation
+ c) Getting octant Y by 1/Pi multiplication
+ d) Add "Right Shifter" value
+ e) Treat obtained value as integer for destination sign setting.
+ Shift first bit of this value to the last (sign) position
+ f) Subtract "Right Shifter" value
+ g) Subtract 0.5 from result for octant correction
+ h) Subtract Y*PI from X argument, where PI is divided into 4 parts:
+ X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
+ 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
+ a) Calculate X^2 = X * X
+ b) Calculate polynomial:
+ R = X + X * X^2 * (A3 + x^2 * (A5 + .....
+ 3) Destination sign setting
+ a) Set shifted destination sign using XOR operation:
+ R = XOR( R, S );
+ */
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $448, %rsp
+ movq __svml_s_trig_data@GOTPCREL(%rip), %rax
+ vmovaps %ymm0, %ymm2
+ vmovups __sRShifter(%rax), %ymm5
+ vmovups __sPI1_FMA(%rax), %ymm7
+
+/* b) Add Pi/2 value to argument X for Cos to Sin transformation */
+ vaddps __sHalfPI(%rax), %ymm2, %ymm4
+
+/*
+ 1) Range reduction to [-Pi/2; +Pi/2] interval
+ c) Getting octant Y by 1/Pi multiplication
+ d) Add "Right Shifter" (0x4B000000) value
+ */
+ vfmadd132ps __sInvPI(%rax), %ymm5, %ymm4
+
+/* f) Subtract "Right Shifter" (0x4B000000) value */
+ vsubps %ymm5, %ymm4, %ymm6
+
+/*
+ e) Treat obtained value as integer for destination sign setting.
+ Shift first bit of this value to the last (sign) position (S << 31)
+ */
+ vpslld $31, %ymm4, %ymm0
+
+/* g) Subtract 0.5 from result for octant correction */
+ vsubps __sOneHalf(%rax), %ymm6, %ymm4
+
+/* Check for large and special arguments */
+ vandps __sAbsMask(%rax), %ymm2, %ymm3
+ vcmpnle_uqps __sRangeReductionVal(%rax), %ymm3, %ymm1
+
+/*
+ h) Subtract Y*PI from X argument, where PI is divided into 3 parts:
+ X = X - Y*PI1 - Y*PI2 - Y*PI3
+ */
+ vmovaps %ymm2, %ymm3
+ vfnmadd231ps %ymm4, %ymm7, %ymm3
+ vfnmadd231ps __sPI2_FMA(%rax), %ymm4, %ymm3
+ vfnmadd132ps __sPI3_FMA(%rax), %ymm3, %ymm4
+
+/* a) Calculate X^2 = X * X */
+ vmulps %ymm4, %ymm4, %ymm5
+
+/*
+ 3) Destination sign setting
+ a) Set shifted destination sign using XOR operation:
+ R = XOR( R, S );
+ */
+ vxorps %ymm0, %ymm4, %ymm6
+ vmovups __sA9_FMA(%rax), %ymm0
+
+/*
+ b) Calculate polynomial:
+ R = X + X * X^2 * (A3 + x^2 * (A5 + x^2 * (A7 + x^2 * (A9))))
+ */
+ vfmadd213ps __sA7_FMA(%rax), %ymm5, %ymm0
+ vfmadd213ps __sA5_FMA(%rax), %ymm5, %ymm0
+ vfmadd213ps __sA3(%rax), %ymm5, %ymm0
+ vmulps %ymm5, %ymm0, %ymm0
+ vmovmskps %ymm1, %ecx
+ vfmadd213ps %ymm6, %ymm6, %ymm0
+ testl %ecx, %ecx
+ jne .LBL_1_3
+
+.LBL_1_2:
+ cfi_remember_state
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+
+.LBL_1_3:
+ cfi_restore_state
+ vmovups %ymm2, 320(%rsp)
+ vmovups %ymm0, 384(%rsp)
+ je .LBL_1_2
+
+ xorb %dl, %dl
+ xorl %eax, %eax
+ vmovups %ymm8, 224(%rsp)
+ vmovups %ymm9, 192(%rsp)
+ vmovups %ymm10, 160(%rsp)
+ vmovups %ymm11, 128(%rsp)
+ vmovups %ymm12, 96(%rsp)
+ vmovups %ymm13, 64(%rsp)
+ vmovups %ymm14, 32(%rsp)
+ vmovups %ymm15, (%rsp)
+ movq %rsi, 264(%rsp)
+ movq %rdi, 256(%rsp)
+ movq %r12, 296(%rsp)
+ cfi_offset_rel_rsp (12, 296)
+ movb %dl, %r12b
+ movq %r13, 288(%rsp)
+ cfi_offset_rel_rsp (13, 288)
+ movl %ecx, %r13d
+ movq %r14, 280(%rsp)
+ cfi_offset_rel_rsp (14, 280)
+ movl %eax, %r14d
+ movq %r15, 272(%rsp)
+ cfi_offset_rel_rsp (15, 272)
+ cfi_remember_state
+
+.LBL_1_6:
+ btl %r14d, %r13d
+ jc .LBL_1_12
+
+.LBL_1_7:
+ lea 1(%r14), %esi
+ btl %esi, %r13d
+ jc .LBL_1_10
+
+.LBL_1_8:
+ incb %r12b
+ addl $2, %r14d
+ cmpb $16, %r12b
+ jb .LBL_1_6
+
+ vmovups 224(%rsp), %ymm8
+ vmovups 192(%rsp), %ymm9
+ vmovups 160(%rsp), %ymm10
+ vmovups 128(%rsp), %ymm11
+ vmovups 96(%rsp), %ymm12
+ vmovups 64(%rsp), %ymm13
+ vmovups 32(%rsp), %ymm14
+ vmovups (%rsp), %ymm15
+ vmovups 384(%rsp), %ymm0
+ movq 264(%rsp), %rsi
+ movq 256(%rsp), %rdi
+ movq 296(%rsp), %r12
+ cfi_restore (%r12)
+ movq 288(%rsp), %r13
+ cfi_restore (%r13)
+ movq 280(%rsp), %r14
+ cfi_restore (%r14)
+ movq 272(%rsp), %r15
+ cfi_restore (%r15)
+ jmp .LBL_1_2
+
+.LBL_1_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ vmovss 324(%rsp,%r15,8), %xmm0
+ vzeroupper
+
+ call cosf@PLT
+
+ vmovss %xmm0, 388(%rsp,%r15,8)
+ jmp .LBL_1_8
+
+.LBL_1_12:
+ movzbl %r12b, %r15d
+ vmovss 320(%rsp,%r15,8), %xmm0
+ vzeroupper
+
+ call cosf@PLT
+
+ vmovss %xmm0, 384(%rsp,%r15,8)
+ jmp .LBL_1_7
+
+END (_ZGVdN8v_cosf_avx2)
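
All of the vector kernels in this patch share the same tail: the inputs and the fast-path results are spilled to the stack, every lane flagged by the range check is recomputed with the scalar libm routine, and the patched vector is reloaded. A minimal C sketch of that fix-up loop:

    /* Recompute flagged lanes with the scalar routine (cosf, expf or
       logf via the PLT in the assembly above).  */
    static void
    fixup_special_lanes (const float *in, float *out, unsigned int mask,
                         int lanes, float (*scalar) (float))
    {
      for (int i = 0; i < lanes; i++)
        if (mask & (1u << i))
          out[i] = scalar (in[i]);
    }

The assembly walks the mask two bits per iteration with btl/jc, and the AVX variants execute vzeroupper before each PLT call to avoid AVX-to-SSE transition penalties.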
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S
new file mode 100644
index 0000000000..62858eb39e
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S
@@ -0,0 +1,39 @@
+/* Multiple versions of vectorized expf.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+ .text
+ENTRY (_ZGVeN16v_expf)
+ .type _ZGVeN16v_expf, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq _ZGVeN16v_expf_skx(%rip), %rax
+ testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
+ jnz 2f
+ leaq _ZGVeN16v_expf_knl(%rip), %rax
+ testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
+ jnz 2f
+ leaq _ZGVeN16v_expf_avx2_wrapper(%rip), %rax
+2: ret
+END (_ZGVeN16v_expf)
+
+#define _ZGVeN16v_expf _ZGVeN16v_expf_avx2_wrapper
+#include "../svml_s_expf16_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
new file mode 100644
index 0000000000..ec69055351
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
@@ -0,0 +1,447 @@
+/* Function expf vectorized with AVX-512. KNL and SKX versions.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "svml_s_expf_data.h"
+#include "svml_s_wrapper_impl.h"
+
+ .text
+ENTRY (_ZGVeN16v_expf_knl)
+#ifndef HAVE_AVX512_ASM_SUPPORT
+WRAPPER_IMPL_AVX512 _ZGVdN8v_expf
+#else
+/*
+ ALGORITHM DESCRIPTION:
+
+ Argument representation:
+ M = rint(X*2^k/ln2) = 2^k*N+j
+ X = M*ln2/2^k + r = N*ln2 + ln2*(j/2^k) + r
+ then -ln2/2^(k+1) < r < ln2/2^(k+1)
+ Alternatively:
+ M = trunc(X*2^k/ln2)
+ then 0 < r < ln2/2^k
+
+ Result calculation:
+ exp(X) = exp(N*ln2 + ln2*(j/2^k) + r)
+ = 2^N * 2^(j/2^k) * exp(r)
+ 2^N is calculated by bit manipulation
+ 2^(j/2^k) is computed from table lookup
+ exp(r) is approximated by polynomial
+
+ The table lookup is skipped if k = 0.
+ For low accuracy approximation, exp(r) ~ 1 or 1+r. */
+
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $1280, %rsp
+ movq __svml_sexp_data@GOTPCREL(%rip), %rax
+
+/* r = x-n*ln2_hi/2^k */
+ vmovaps %zmm0, %zmm6
+
+/* compare against threshold */
+ movl $-1, %ecx
+ vmovups __sInvLn2(%rax), %zmm3
+ vmovups __sLn2hi(%rax), %zmm5
+
+/* m = x*2^k/ln2 + shifter */
+ vfmadd213ps __sShifter(%rax), %zmm0, %zmm3
+ vmovups __sPC5(%rax), %zmm9
+
+/* n = m - shifter = rint(x*2^k/ln2) */
+ vsubps __sShifter(%rax), %zmm3, %zmm7
+
+/* remove sign of x by "and" operation */
+ vpandd __iAbsMask(%rax), %zmm0, %zmm1
+ vpaddd __iBias(%rax), %zmm3, %zmm4
+ vpcmpgtd __iDomainRange(%rax), %zmm1, %k1
+
+/* compute 2^N with "shift" */
+ vpslld $23, %zmm4, %zmm8
+ vfnmadd231ps %zmm7, %zmm5, %zmm6
+ vpbroadcastd %ecx, %zmm2{%k1}{z}
+
+/* r = r-n*ln2_lo/2^k = x - n*ln2/2^k */
+ vfnmadd132ps __sLn2lo(%rax), %zmm6, %zmm7
+
+/* set mask for overflow/underflow */
+ vptestmd %zmm2, %zmm2, %k0
+ kmovw %k0, %ecx
+
+/* c5*r+c4 */
+ vfmadd213ps __sPC4(%rax), %zmm7, %zmm9
+
+/* (c5*r+c4)*r+c3 */
+ vfmadd213ps __sPC3(%rax), %zmm7, %zmm9
+
+/* ((c5*r+c4)*r+c3)*r+c2 */
+ vfmadd213ps __sPC2(%rax), %zmm7, %zmm9
+
+/* (((c5*r+c4)*r+c3)*r+c2)*r+c1 */
+ vfmadd213ps __sPC1(%rax), %zmm7, %zmm9
+
+/* exp(r) = ((((c5*r+c4)*r+c3)*r+c2)*r+c1)*r+c0 */
+ vfmadd213ps __sPC0(%rax), %zmm7, %zmm9
+
+/* 2^N*exp(r) */
+ vmulps %zmm9, %zmm8, %zmm1
+ testl %ecx, %ecx
+ jne .LBL_1_3
+
+.LBL_1_2:
+ cfi_remember_state
+ vmovaps %zmm1, %zmm0
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+
+.LBL_1_3:
+ cfi_restore_state
+ vmovups %zmm0, 1152(%rsp)
+ vmovups %zmm1, 1216(%rsp)
+ je .LBL_1_2
+
+ xorb %dl, %dl
+ kmovw %k4, 1048(%rsp)
+ xorl %eax, %eax
+ kmovw %k5, 1040(%rsp)
+ kmovw %k6, 1032(%rsp)
+ kmovw %k7, 1024(%rsp)
+ vmovups %zmm16, 960(%rsp)
+ vmovups %zmm17, 896(%rsp)
+ vmovups %zmm18, 832(%rsp)
+ vmovups %zmm19, 768(%rsp)
+ vmovups %zmm20, 704(%rsp)
+ vmovups %zmm21, 640(%rsp)
+ vmovups %zmm22, 576(%rsp)
+ vmovups %zmm23, 512(%rsp)
+ vmovups %zmm24, 448(%rsp)
+ vmovups %zmm25, 384(%rsp)
+ vmovups %zmm26, 320(%rsp)
+ vmovups %zmm27, 256(%rsp)
+ vmovups %zmm28, 192(%rsp)
+ vmovups %zmm29, 128(%rsp)
+ vmovups %zmm30, 64(%rsp)
+ vmovups %zmm31, (%rsp)
+ movq %rsi, 1064(%rsp)
+ movq %rdi, 1056(%rsp)
+ movq %r12, 1096(%rsp)
+ cfi_offset_rel_rsp (12, 1096)
+ movb %dl, %r12b
+ movq %r13, 1088(%rsp)
+ cfi_offset_rel_rsp (13, 1088)
+ movl %ecx, %r13d
+ movq %r14, 1080(%rsp)
+ cfi_offset_rel_rsp (14, 1080)
+ movl %eax, %r14d
+ movq %r15, 1072(%rsp)
+ cfi_offset_rel_rsp (15, 1072)
+ cfi_remember_state
+
+.LBL_1_6:
+ btl %r14d, %r13d
+ jc .LBL_1_12
+
+.LBL_1_7:
+ lea 1(%r14), %esi
+ btl %esi, %r13d
+ jc .LBL_1_10
+
+.LBL_1_8:
+ addb $1, %r12b
+ addl $2, %r14d
+ cmpb $16, %r12b
+ jb .LBL_1_6
+
+ kmovw 1048(%rsp), %k4
+ movq 1064(%rsp), %rsi
+ kmovw 1040(%rsp), %k5
+ movq 1056(%rsp), %rdi
+ kmovw 1032(%rsp), %k6
+ movq 1096(%rsp), %r12
+ cfi_restore (%r12)
+ movq 1088(%rsp), %r13
+ cfi_restore (%r13)
+ kmovw 1024(%rsp), %k7
+ vmovups 960(%rsp), %zmm16
+ vmovups 896(%rsp), %zmm17
+ vmovups 832(%rsp), %zmm18
+ vmovups 768(%rsp), %zmm19
+ vmovups 704(%rsp), %zmm20
+ vmovups 640(%rsp), %zmm21
+ vmovups 576(%rsp), %zmm22
+ vmovups 512(%rsp), %zmm23
+ vmovups 448(%rsp), %zmm24
+ vmovups 384(%rsp), %zmm25
+ vmovups 320(%rsp), %zmm26
+ vmovups 256(%rsp), %zmm27
+ vmovups 192(%rsp), %zmm28
+ vmovups 128(%rsp), %zmm29
+ vmovups 64(%rsp), %zmm30
+ vmovups (%rsp), %zmm31
+ movq 1080(%rsp), %r14
+ cfi_restore (%r14)
+ movq 1072(%rsp), %r15
+ cfi_restore (%r15)
+ vmovups 1216(%rsp), %zmm1
+ jmp .LBL_1_2
+
+.LBL_1_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ vmovss 1156(%rsp,%r15,8), %xmm0
+ call expf@PLT
+ vmovss %xmm0, 1220(%rsp,%r15,8)
+ jmp .LBL_1_8
+
+.LBL_1_12:
+ movzbl %r12b, %r15d
+ vmovss 1152(%rsp,%r15,8), %xmm0
+ call expf@PLT
+ vmovss %xmm0, 1216(%rsp,%r15,8)
+ jmp .LBL_1_7
+
+#endif
+END (_ZGVeN16v_expf_knl)
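
When HAVE_AVX512_ASM_SUPPORT is not defined, WRAPPER_IMPL_AVX512 above replaces the hand-written body with a wrapper that implements the 16-lane entry point on top of the 8-lane kernel. A C sketch of that shape (the vector types and the low/high split are illustrative):

    #include <string.h>

    typedef float v8sf  __attribute__ ((vector_size (32)));
    typedef float v16sf __attribute__ ((vector_size (64)));

    extern v8sf _ZGVdN8v_expf (v8sf);

    static v16sf
    expf16_wrapper_sketch (v16sf x)
    {
      v8sf lo, hi;
      memcpy (&lo, (const char *) &x, sizeof lo);              /* lanes 0..7  */
      memcpy (&hi, (const char *) &x + sizeof lo, sizeof hi);  /* lanes 8..15 */
      lo = _ZGVdN8v_expf (lo);
      hi = _ZGVdN8v_expf (hi);
      v16sf r;
      memcpy ((char *) &r, &lo, sizeof lo);
      memcpy ((char *) &r + sizeof lo, &hi, sizeof hi);
      return r;
    }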
+
+ENTRY (_ZGVeN16v_expf_skx)
+#ifndef HAVE_AVX512_ASM_SUPPORT
+WRAPPER_IMPL_AVX512 _ZGVdN8v_expf
+#else
+/*
+ ALGORITHM DESCRIPTION:
+
+ Argument representation:
+ M = rint(X*2^k/ln2) = 2^k*N+j
+ X = M*ln2/2^k + r = N*ln2 + ln2*(j/2^k) + r
+ then -ln2/2^(k+1) < r < ln2/2^(k+1)
+ Alternatively:
+ M = trunc(X*2^k/ln2)
+ then 0 < r < ln2/2^k
+
+ Result calculation:
+ exp(X) = exp(N*ln2 + ln2*(j/2^k) + r)
+ = 2^N * 2^(j/2^k) * exp(r)
+ 2^N is calculated by bit manipulation
+ 2^(j/2^k) is computed from table lookup
+ exp(r) is approximated by polynomial
+
+ The table lookup is skipped if k = 0.
+ For low accuracy approximation, exp(r) ~ 1 or 1+r. */
+
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $1280, %rsp
+ movq __svml_sexp_data@GOTPCREL(%rip), %rax
+
+/* r = x-n*ln2_hi/2^k */
+ vmovaps %zmm0, %zmm7
+
+/* compare against threshold */
+ vmovups .L_2il0floatpacket.13(%rip), %zmm3
+ vmovups __sInvLn2(%rax), %zmm4
+ vmovups __sShifter(%rax), %zmm1
+ vmovups __sLn2hi(%rax), %zmm6
+ vmovups __sPC5(%rax), %zmm10
+
+/* m = x*2^k/ln2 + shifter */
+ vfmadd213ps %zmm1, %zmm0, %zmm4
+
+/* n = m - shifter = rint(x*2^k/ln2) */
+ vsubps %zmm1, %zmm4, %zmm8
+ vpaddd __iBias(%rax), %zmm4, %zmm5
+ vfnmadd231ps %zmm8, %zmm6, %zmm7
+
+/* compute 2^N with "shift" */
+ vpslld $23, %zmm5, %zmm9
+
+/* r = r-n*ln2_lo/2^k = x - n*ln2/2^k */
+ vfnmadd132ps __sLn2lo(%rax), %zmm7, %zmm8
+
+/* c5*r+c4 */
+ vfmadd213ps __sPC4(%rax), %zmm8, %zmm10
+
+/* (c5*r+c4)*r+c3 */
+ vfmadd213ps __sPC3(%rax), %zmm8, %zmm10
+
+/* ((c5*r+c4)*r+c3)*r+c2 */
+ vfmadd213ps __sPC2(%rax), %zmm8, %zmm10
+
+/* (((c5*r+c4)*r+c3)*r+c2)*r+c1 */
+ vfmadd213ps __sPC1(%rax), %zmm8, %zmm10
+
+/* exp(r) = ((((c5*r+c4)*r+c3)*r+c2)*r+c1)*r+c0 */
+ vfmadd213ps __sPC0(%rax), %zmm8, %zmm10
+
+/* 2^N*exp(r) */
+ vmulps %zmm10, %zmm9, %zmm1
+
+/* remove sign of x by "and" operation */
+ vpandd __iAbsMask(%rax), %zmm0, %zmm2
+ vpcmpd $2, __iDomainRange(%rax), %zmm2, %k1
+ vpandnd %zmm2, %zmm2, %zmm3{%k1}
+
+/* set mask for overflow/underflow */
+ vptestmd %zmm3, %zmm3, %k0
+ kmovw %k0, %ecx
+ testl %ecx, %ecx
+ jne .LBL_2_3
+
+.LBL_2_2:
+ cfi_remember_state
+ vmovaps %zmm1, %zmm0
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+
+.LBL_2_3:
+ cfi_restore_state
+ vmovups %zmm0, 1152(%rsp)
+ vmovups %zmm1, 1216(%rsp)
+ je .LBL_2_2
+
+ xorb %dl, %dl
+ xorl %eax, %eax
+ kmovw %k4, 1048(%rsp)
+ kmovw %k5, 1040(%rsp)
+ kmovw %k6, 1032(%rsp)
+ kmovw %k7, 1024(%rsp)
+ vmovups %zmm16, 960(%rsp)
+ vmovups %zmm17, 896(%rsp)
+ vmovups %zmm18, 832(%rsp)
+ vmovups %zmm19, 768(%rsp)
+ vmovups %zmm20, 704(%rsp)
+ vmovups %zmm21, 640(%rsp)
+ vmovups %zmm22, 576(%rsp)
+ vmovups %zmm23, 512(%rsp)
+ vmovups %zmm24, 448(%rsp)
+ vmovups %zmm25, 384(%rsp)
+ vmovups %zmm26, 320(%rsp)
+ vmovups %zmm27, 256(%rsp)
+ vmovups %zmm28, 192(%rsp)
+ vmovups %zmm29, 128(%rsp)
+ vmovups %zmm30, 64(%rsp)
+ vmovups %zmm31, (%rsp)
+ movq %rsi, 1064(%rsp)
+ movq %rdi, 1056(%rsp)
+ movq %r12, 1096(%rsp)
+ cfi_offset_rel_rsp (12, 1096)
+ movb %dl, %r12b
+ movq %r13, 1088(%rsp)
+ cfi_offset_rel_rsp (13, 1088)
+ movl %ecx, %r13d
+ movq %r14, 1080(%rsp)
+ cfi_offset_rel_rsp (14, 1080)
+ movl %eax, %r14d
+ movq %r15, 1072(%rsp)
+ cfi_offset_rel_rsp (15, 1072)
+ cfi_remember_state
+
+
+.LBL_2_6:
+ btl %r14d, %r13d
+ jc .LBL_2_12
+
+.LBL_2_7:
+ lea 1(%r14), %esi
+ btl %esi, %r13d
+ jc .LBL_2_10
+
+.LBL_2_8:
+ incb %r12b
+ addl $2, %r14d
+ cmpb $16, %r12b
+ jb .LBL_2_6
+
+ kmovw 1048(%rsp), %k4
+ kmovw 1040(%rsp), %k5
+ kmovw 1032(%rsp), %k6
+ kmovw 1024(%rsp), %k7
+ vmovups 960(%rsp), %zmm16
+ vmovups 896(%rsp), %zmm17
+ vmovups 832(%rsp), %zmm18
+ vmovups 768(%rsp), %zmm19
+ vmovups 704(%rsp), %zmm20
+ vmovups 640(%rsp), %zmm21
+ vmovups 576(%rsp), %zmm22
+ vmovups 512(%rsp), %zmm23
+ vmovups 448(%rsp), %zmm24
+ vmovups 384(%rsp), %zmm25
+ vmovups 320(%rsp), %zmm26
+ vmovups 256(%rsp), %zmm27
+ vmovups 192(%rsp), %zmm28
+ vmovups 128(%rsp), %zmm29
+ vmovups 64(%rsp), %zmm30
+ vmovups (%rsp), %zmm31
+ vmovups 1216(%rsp), %zmm1
+ movq 1064(%rsp), %rsi
+ movq 1056(%rsp), %rdi
+ movq 1096(%rsp), %r12
+ cfi_restore (%r12)
+ movq 1088(%rsp), %r13
+ cfi_restore (%r13)
+ movq 1080(%rsp), %r14
+ cfi_restore (%r14)
+ movq 1072(%rsp), %r15
+ cfi_restore (%r15)
+ jmp .LBL_2_2
+
+.LBL_2_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ vmovss 1156(%rsp,%r15,8), %xmm0
+ vzeroupper
+ vmovss 1156(%rsp,%r15,8), %xmm0
+
+ call expf@PLT
+
+ vmovss %xmm0, 1220(%rsp,%r15,8)
+ jmp .LBL_2_8
+
+.LBL_2_12:
+ movzbl %r12b, %r15d
+ vmovss 1152(%rsp,%r15,8), %xmm0
+ vzeroupper
+ vmovss 1152(%rsp,%r15,8), %xmm0
+
+ call expf@PLT
+
+ vmovss %xmm0, 1216(%rsp,%r15,8)
+ jmp .LBL_2_7
+
+#endif
+END (_ZGVeN16v_expf_skx)
+
+ .section .rodata, "a"
+.L_2il0floatpacket.13:
+ .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
+ .type .L_2il0floatpacket.13,@object
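
Putting the commented steps together, the expf kernels reduce x to r = x - n*ln2 with n = rint(x/ln2), evaluate a degree-5 polynomial for exp(r), and build 2^n by adding the bias to n and shifting it into the exponent field. A scalar sketch of that k = 0 path follows; the shifter value, the bias and the coefficients are assumptions standing in for the __svml_sexp_data table:

    #include <stdint.h>
    #include <string.h>

    static float
    expf_sketch (float x)
    {
      const float SHIFTER = 0x1.8p+23f;        /* 1.5*2^23, assumed __sShifter */
      const float INV_LN2 = 0x1.715476p+0f;    /* ~1/ln2 */
      const float LN2_HI  = 6.9314575195e-1f;  /* high part of ln2 */
      const float LN2_LO  = 1.4286067653e-6f;  /* ln2 - LN2_HI */
      const float C5 = 1.0f / 120, C4 = 1.0f / 24, C3 = 1.0f / 6,
                  C2 = 0.5f, C1 = 1.0f, C0 = 1.0f;  /* Taylor stand-ins */

      /* m = x/ln2 + shifter: n = rint(x/ln2) lands in the low mantissa bits.  */
      float m = x * INV_LN2 + SHIFTER;
      float n = m - SHIFTER;
      /* r = x - n*ln2, with ln2 split in two for accuracy.  */
      float r = (x - n * LN2_HI) - n * LN2_LO;
      /* 2^n by bit manipulation, mirroring the vpaddd __iBias / vpslld $23
         pair: add the bias and shift n into the exponent field.  */
      uint32_t mb;  memcpy (&mb, &m, sizeof mb);
      uint32_t sb = (mb + 127u) << 23;         /* 127 = assumed __iBias */
      float scale;  memcpy (&scale, &sb, sizeof scale);
      /* exp(r) = ((((c5*r+c4)*r+c3)*r+c2)*r+c1)*r+c0 */
      float p = ((((C5 * r + C4) * r + C3) * r + C2) * r + C1) * r + C0;
      /* overflow/underflow lanes are excluded here; the kernels send them
         to the scalar expf fallback.  */
      return scale * p;
    }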
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core.S
new file mode 100644
index 0000000000..37d38bc6f8
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core.S
@@ -0,0 +1,38 @@
+/* Multiple versions of vectorized expf.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+ .text
+ENTRY (_ZGVbN4v_expf)
+ .type _ZGVbN4v_expf, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq _ZGVbN4v_expf_sse4(%rip), %rax
+ testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+ jz 2f
+ ret
+2: leaq _ZGVbN4v_expf_sse2(%rip), %rax
+ ret
+END (_ZGVbN4v_expf)
+libmvec_hidden_def (_ZGVbN4v_expf)
+
+#define _ZGVbN4v_expf _ZGVbN4v_expf_sse2
+#include "../svml_s_expf4_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core_sse4.S
new file mode 100644
index 0000000000..fcc1859c3a
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core_sse4.S
@@ -0,0 +1,212 @@
+/* Function expf vectorized with SSE4.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "svml_s_expf_data.h"
+
+ .text
+ENTRY (_ZGVbN4v_expf_sse4)
+/*
+ ALGORITHM DESCRIPTION:
+
+ Argument representation:
+ M = rint(X*2^k/ln2) = 2^k*N+j
+ X = M*ln2/2^k + r = N*ln2 + ln2*(j/2^k) + r
+ then -ln2/2^(k+1) < r < ln2/2^(k+1)
+ Alternatively:
+ M = trunc(X*2^k/ln2)
+ then 0 < r < ln2/2^k
+
+ Result calculation:
+ exp(X) = exp(N*ln2 + ln2*(j/2^k) + r)
+ = 2^N * 2^(j/2^k) * exp(r)
+ 2^N is calculated by bit manipulation
+ 2^(j/2^k) is computed from table lookup
+ exp(r) is approximated by polynomial
+
+ The table lookup is skipped if k = 0.
+ For low accuracy approximation, exp(r) ~ 1 or 1+r. */
+
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $320, %rsp
+ movaps %xmm0, %xmm5
+ movq __svml_sexp_data@GOTPCREL(%rip), %rax
+ movups __sInvLn2(%rax), %xmm0
+
+/* m = x*2^k/ln2 + shifter */
+ mulps %xmm5, %xmm0
+ movups __sShifter(%rax), %xmm6
+ movups __sLn2hi(%rax), %xmm4
+ addps %xmm6, %xmm0
+
+/* n = m - shifter = rint(x*2^k/ln2) */
+ movaps %xmm0, %xmm2
+
+/* remove sign of x by "and" operation */
+ movdqu __iAbsMask(%rax), %xmm7
+ subps %xmm6, %xmm2
+
+/* r = x-n*ln2_hi/2^k */
+ mulps %xmm2, %xmm4
+ pand %xmm5, %xmm7
+
+/* compare against threshold */
+ pcmpgtd __iDomainRange(%rax), %xmm7
+ movups __sLn2lo(%rax), %xmm1
+
+/* set mask for overflow/underflow */
+ movmskps %xmm7, %ecx
+ movaps %xmm5, %xmm7
+ movups __sPC5(%rax), %xmm3
+ subps %xmm4, %xmm7
+
+/* r = r-n*ln2_lo/2^k = x - n*ln2/2^k */
+ mulps %xmm1, %xmm2
+
+/* compute 2^N with "shift" */
+ movdqu __iBias(%rax), %xmm6
+ subps %xmm2, %xmm7
+
+/* c5*r+c4 */
+ mulps %xmm7, %xmm3
+ paddd %xmm6, %xmm0
+ pslld $23, %xmm0
+ addps __sPC4(%rax), %xmm3
+
+/* (c5*r+c4)*r+c3 */
+ mulps %xmm7, %xmm3
+ addps __sPC3(%rax), %xmm3
+
+/* ((c5*r+c4)*r+c3)*r+c2 */
+ mulps %xmm7, %xmm3
+ addps __sPC2(%rax), %xmm3
+
+/* (((c5*r+c4)*r+c3)*r+c2)*r+c1 */
+ mulps %xmm7, %xmm3
+ addps __sPC1(%rax), %xmm3
+
+/* exp(r) = ((((c5*r+c4)*r+c3)*r+c2)*r+c1)*r+c0 */
+ mulps %xmm3, %xmm7
+ addps __sPC0(%rax), %xmm7
+
+/* 2^N*exp(r) */
+ mulps %xmm7, %xmm0
+ testl %ecx, %ecx
+ jne .LBL_1_3
+
+.LBL_1_2:
+ cfi_remember_state
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+
+.LBL_1_3:
+ cfi_restore_state
+ movups %xmm5, 192(%rsp)
+ movups %xmm0, 256(%rsp)
+ je .LBL_1_2
+
+ xorb %dl, %dl
+ xorl %eax, %eax
+ movups %xmm8, 112(%rsp)
+ movups %xmm9, 96(%rsp)
+ movups %xmm10, 80(%rsp)
+ movups %xmm11, 64(%rsp)
+ movups %xmm12, 48(%rsp)
+ movups %xmm13, 32(%rsp)
+ movups %xmm14, 16(%rsp)
+ movups %xmm15, (%rsp)
+ movq %rsi, 136(%rsp)
+ movq %rdi, 128(%rsp)
+ movq %r12, 168(%rsp)
+ cfi_offset_rel_rsp (12, 168)
+ movb %dl, %r12b
+ movq %r13, 160(%rsp)
+ cfi_offset_rel_rsp (13, 160)
+ movl %ecx, %r13d
+ movq %r14, 152(%rsp)
+ cfi_offset_rel_rsp (14, 152)
+ movl %eax, %r14d
+ movq %r15, 144(%rsp)
+ cfi_offset_rel_rsp (15, 144)
+ cfi_remember_state
+
+.LBL_1_6:
+ btl %r14d, %r13d
+ jc .LBL_1_12
+
+.LBL_1_7:
+ lea 1(%r14), %esi
+ btl %esi, %r13d
+ jc .LBL_1_10
+
+.LBL_1_8:
+ incb %r12b
+ addl $2, %r14d
+ cmpb $16, %r12b
+ jb .LBL_1_6
+
+ movups 112(%rsp), %xmm8
+ movups 96(%rsp), %xmm9
+ movups 80(%rsp), %xmm10
+ movups 64(%rsp), %xmm11
+ movups 48(%rsp), %xmm12
+ movups 32(%rsp), %xmm13
+ movups 16(%rsp), %xmm14
+ movups (%rsp), %xmm15
+ movq 136(%rsp), %rsi
+ movq 128(%rsp), %rdi
+ movq 168(%rsp), %r12
+ cfi_restore (%r12)
+ movq 160(%rsp), %r13
+ cfi_restore (%r13)
+ movq 152(%rsp), %r14
+ cfi_restore (%r14)
+ movq 144(%rsp), %r15
+ cfi_restore (%r15)
+ movups 256(%rsp), %xmm0
+ jmp .LBL_1_2
+
+.LBL_1_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ movss 196(%rsp,%r15,8), %xmm0
+
+ call expf@PLT
+
+ movss %xmm0, 260(%rsp,%r15,8)
+ jmp .LBL_1_8
+
+.LBL_1_12:
+ movzbl %r12b, %r15d
+ movss 192(%rsp,%r15,8), %xmm0
+
+ call expf@PLT
+
+ movss %xmm0, 256(%rsp,%r15,8)
+ jmp .LBL_1_7
+
+END (_ZGVbN4v_expf_sse4)
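
For context, these entry points are normally reached through the compiler rather than called directly: with glibc's vector math declarations enabled, a vectorizable loop over expf can be compiled into calls to _ZGVbN4v_expf or _ZGVdN8v_expf. A hedged example of the usual shape (exact flags depend on the compiler and headers; something like gcc -O3 -ffast-math -fopenmp on x86_64, linking with -lmvec):

    #include <math.h>

    void
    vector_expf (float *a, int n)
    {
    #pragma omp simd
      for (int i = 0; i < n; i++)
        a[i] = expf (a[i]);   /* may become a _ZGV*v_expf call */
    }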
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core.S
new file mode 100644
index 0000000000..e3dc1b1038
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core.S
@@ -0,0 +1,38 @@
+/* Multiple versions of vectorized expf.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+ .text
+ENTRY (_ZGVdN8v_expf)
+ .type _ZGVdN8v_expf, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq _ZGVdN8v_expf_avx2(%rip), %rax
+ testl $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+ jz 2f
+ ret
+2: leaq _ZGVdN8v_expf_sse_wrapper(%rip), %rax
+ ret
+END (_ZGVdN8v_expf)
+libmvec_hidden_def (_ZGVdN8v_expf)
+
+#define _ZGVdN8v_expf _ZGVdN8v_expf_sse_wrapper
+#include "../svml_s_expf8_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core_avx2.S
new file mode 100644
index 0000000000..c876ecc03e
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core_avx2.S
@@ -0,0 +1,202 @@
+/* Function expf vectorized with AVX2.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "svml_s_expf_data.h"
+
+ .text
+ENTRY(_ZGVdN8v_expf_avx2)
+/*
+ ALGORITHM DESCRIPTION:
+
+ Argument representation:
+ M = rint(X*2^k/ln2) = 2^k*N+j
+ X = M*ln2/2^k + r = N*ln2 + ln2*(j/2^k) + r
+ then -ln2/2^(k+1) < r < ln2/2^(k+1)
+ Alternatively:
+ M = trunc(X*2^k/ln2)
+ then 0 < r < ln2/2^k
+
+ Result calculation:
+ exp(X) = exp(N*ln2 + ln2*(j/2^k) + r)
+ = 2^N * 2^(j/2^k) * exp(r)
+ 2^N is calculated by bit manipulation
+ 2^(j/2^k) is computed from table lookup
+ exp(r) is approximated by polynomial
+
+ The table lookup is skipped if k = 0.
+ For low accuracy approximation, exp(r) ~ 1 or 1+r. */
+
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $448, %rsp
+ movq __svml_sexp_data@GOTPCREL(%rip), %rax
+ vmovaps %ymm0, %ymm2
+ vmovups __sInvLn2(%rax), %ymm7
+ vmovups __sShifter(%rax), %ymm4
+ vmovups __sLn2hi(%rax), %ymm3
+ vmovups __sPC5(%rax), %ymm1
+
+/* m = x*2^k/ln2 + shifter */
+ vfmadd213ps %ymm4, %ymm2, %ymm7
+
+/* n = m - shifter = rint(x*2^k/ln2) */
+ vsubps %ymm4, %ymm7, %ymm0
+ vpaddd __iBias(%rax), %ymm7, %ymm4
+
+/* remove sign of x by "and" operation */
+ vandps __iAbsMask(%rax), %ymm2, %ymm5
+
+/* compare against threshold */
+ vpcmpgtd __iDomainRange(%rax), %ymm5, %ymm6
+
+/* r = x-n*ln2_hi/2^k */
+ vmovaps %ymm2, %ymm5
+ vfnmadd231ps %ymm0, %ymm3, %ymm5
+
+/* r = r-n*ln2_lo/2^k = x - n*ln2/2^k */
+ vfnmadd132ps __sLn2lo(%rax), %ymm5, %ymm0
+
+/* c5*r+c4 */
+ vfmadd213ps __sPC4(%rax), %ymm0, %ymm1
+
+/* (c5*r+c4)*r+c3 */
+ vfmadd213ps __sPC3(%rax), %ymm0, %ymm1
+
+/* ((c5*r+c4)*r+c3)*r+c2 */
+ vfmadd213ps __sPC2(%rax), %ymm0, %ymm1
+
+/* (((c5*r+c4)*r+c3)*r+c2)*r+c1 */
+ vfmadd213ps __sPC1(%rax), %ymm0, %ymm1
+
+/* exp(r) = ((((c5*r+c4)*r+c3)*r+c2)*r+c1)*r+c0 */
+ vfmadd213ps __sPC0(%rax), %ymm0, %ymm1
+
+/* set mask for overflow/underflow */
+ vmovmskps %ymm6, %ecx
+
+/* compute 2^N with "shift" */
+ vpslld $23, %ymm4, %ymm6
+
+/* 2^N*exp(r) */
+ vmulps %ymm1, %ymm6, %ymm0
+ testl %ecx, %ecx
+ jne .LBL_1_3
+
+.LBL_1_2:
+ cfi_remember_state
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+
+.LBL_1_3:
+ cfi_restore_state
+ vmovups %ymm2, 320(%rsp)
+ vmovups %ymm0, 384(%rsp)
+ je .LBL_1_2
+
+ xorb %dl, %dl
+ xorl %eax, %eax
+ vmovups %ymm8, 224(%rsp)
+ vmovups %ymm9, 192(%rsp)
+ vmovups %ymm10, 160(%rsp)
+ vmovups %ymm11, 128(%rsp)
+ vmovups %ymm12, 96(%rsp)
+ vmovups %ymm13, 64(%rsp)
+ vmovups %ymm14, 32(%rsp)
+ vmovups %ymm15, (%rsp)
+ movq %rsi, 264(%rsp)
+ movq %rdi, 256(%rsp)
+ movq %r12, 296(%rsp)
+ cfi_offset_rel_rsp (12, 296)
+ movb %dl, %r12b
+ movq %r13, 288(%rsp)
+ cfi_offset_rel_rsp (13, 288)
+ movl %ecx, %r13d
+ movq %r14, 280(%rsp)
+ cfi_offset_rel_rsp (14, 280)
+ movl %eax, %r14d
+ movq %r15, 272(%rsp)
+ cfi_offset_rel_rsp (15, 272)
+ cfi_remember_state
+
+.LBL_1_6:
+ btl %r14d, %r13d
+ jc .LBL_1_12
+
+.LBL_1_7:
+ lea 1(%r14), %esi
+ btl %esi, %r13d
+ jc .LBL_1_10
+
+.LBL_1_8:
+ incb %r12b
+ addl $2, %r14d
+ cmpb $16, %r12b
+ jb .LBL_1_6
+
+ vmovups 224(%rsp), %ymm8
+ vmovups 192(%rsp), %ymm9
+ vmovups 160(%rsp), %ymm10
+ vmovups 128(%rsp), %ymm11
+ vmovups 96(%rsp), %ymm12
+ vmovups 64(%rsp), %ymm13
+ vmovups 32(%rsp), %ymm14
+ vmovups (%rsp), %ymm15
+ vmovups 384(%rsp), %ymm0
+ movq 264(%rsp), %rsi
+ movq 256(%rsp), %rdi
+ movq 296(%rsp), %r12
+ cfi_restore (%r12)
+ movq 288(%rsp), %r13
+ cfi_restore (%r13)
+ movq 280(%rsp), %r14
+ cfi_restore (%r14)
+ movq 272(%rsp), %r15
+ cfi_restore (%r15)
+ jmp .LBL_1_2
+
+.LBL_1_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ vmovss 324(%rsp,%r15,8), %xmm0
+ vzeroupper
+
+ call expf@PLT
+
+ vmovss %xmm0, 388(%rsp,%r15,8)
+ jmp .LBL_1_8
+
+.LBL_1_12:
+ movzbl %r12b, %r15d
+ vmovss 320(%rsp,%r15,8), %xmm0
+ vzeroupper
+
+ call expf@PLT
+
+ vmovss %xmm0, 384(%rsp,%r15,8)
+ jmp .LBL_1_7
+
+END(_ZGVdN8v_expf_avx2)
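
The main structural difference between this AVX2 kernel and the SSE4 one above is that each multiply/add pair of the Horner evaluation is fused into a single vfmadd213ps. The scalar counterpart of one such step is fmaf:

    #include <math.h>

    static inline float
    horner_step (float p, float r, float c)
    {
      return fmaf (p, r, c);   /* p*r + c with a single rounding */
    }

Chaining this step through __sPC5 down to __sPC0 reproduces the polynomial evaluation in both kernels; the fused form performs one rounding per step instead of two.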
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S
new file mode 100644
index 0000000000..68c57e4386
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S
@@ -0,0 +1,39 @@
+/* Multiple versions of vectorized logf.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+ .text
+ENTRY (_ZGVeN16v_logf)
+ .type _ZGVeN16v_logf, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq _ZGVeN16v_logf_skx(%rip), %rax
+ testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
+ jnz 2f
+ leaq _ZGVeN16v_logf_knl(%rip), %rax
+ testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
+ jnz 2f
+ leaq _ZGVeN16v_logf_avx2_wrapper(%rip), %rax
+2: ret
+END (_ZGVeN16v_logf)
+
+#define _ZGVeN16v_logf _ZGVeN16v_logf_avx2_wrapper
+#include "../svml_s_logf16_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
new file mode 100644
index 0000000000..86fcab6e63
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
@@ -0,0 +1,416 @@
+/* Function logf vectorized with AVX-512. KNL and SKX versions.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "svml_s_logf_data.h"
+#include "svml_s_wrapper_impl.h"
+
+ .text
+ENTRY (_ZGVeN16v_logf_knl)
+#ifndef HAVE_AVX512_ASM_SUPPORT
+WRAPPER_IMPL_AVX512 _ZGVdN8v_logf
+#else
+/*
+ ALGORITHM DESCRIPTION:
+
+ log(x) = exponent_x*log(2) + log(mantissa_x), if mantissa_x<4/3
+ log(x) = (exponent_x+1)*log(2) + log(0.5*mantissa_x), if mantissa_x>4/3
+
+ R = mantissa_x - 1, if mantissa_x<4/3
+ R = 0.5*mantissa_x - 1, if mantissa_x>4/3
+ |R|< 1/3
+
+ log(1+R) is approximated as a polynomial: degree 9 for 1-ulp,
+ degree 7 for 4-ulp, degree 3 for half-precision. */
+
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $1280, %rsp
+ movq __svml_slog_data@GOTPCREL(%rip), %rax
+ movl $-1, %ecx
+
+/* reduction: compute r,n */
+ vpsubd _iBrkValue(%rax), %zmm0, %zmm2
+ vmovups _sPoly_7(%rax), %zmm7
+ vpandd _iOffExpoMask(%rax), %zmm2, %zmm3
+
+/* exponent_x (mantissa_x<4/3) or exponent_x+1 (mantissa_x>4/3) */
+ vpsrad $23, %zmm2, %zmm4
+
+/* check for working range,
+ set special argument mask (denormals/zero/Inf/NaN)
+ */
+ vpaddd _iHiDelta(%rax), %zmm0, %zmm1
+
+/* mantissa_x (mantissa_x<4/3), or 0.5*mantissa_x (mantissa_x>4/3) */
+ vpaddd _iBrkValue(%rax), %zmm3, %zmm6
+ vpcmpd $1, _iLoRange(%rax), %zmm1, %k1
+ vcvtdq2ps {rn-sae}, %zmm4, %zmm1
+
+/* reduced argument R */
+ vsubps _sOne(%rax), %zmm6, %zmm8
+ vpbroadcastd %ecx, %zmm5{%k1}{z}
+
+/* polynomial evaluation starts here */
+ vfmadd213ps _sPoly_6(%rax), %zmm8, %zmm7
+ vptestmd %zmm5, %zmm5, %k0
+ kmovw %k0, %ecx
+ vfmadd213ps _sPoly_5(%rax), %zmm8, %zmm7
+ vfmadd213ps _sPoly_4(%rax), %zmm8, %zmm7
+ vfmadd213ps _sPoly_3(%rax), %zmm8, %zmm7
+ vfmadd213ps _sPoly_2(%rax), %zmm8, %zmm7
+ vfmadd213ps _sPoly_1(%rax), %zmm8, %zmm7
+ vmulps %zmm8, %zmm7, %zmm9
+
+/* polynomial evaluation end */
+ vfmadd213ps %zmm8, %zmm8, %zmm9
+
+/*
+ final reconstruction:
+ add exponent_value*log2 to polynomial result
+ */
+ vfmadd132ps _sLn2(%rax), %zmm9, %zmm1
+ testl %ecx, %ecx
+ jne .LBL_1_3
+
+.LBL_1_2:
+ cfi_remember_state
+ vmovaps %zmm1, %zmm0
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+
+.LBL_1_3:
+ cfi_restore_state
+ vmovups %zmm0, 1152(%rsp)
+ vmovups %zmm1, 1216(%rsp)
+ je .LBL_1_2
+
+ xorb %dl, %dl
+ kmovw %k4, 1048(%rsp)
+ xorl %eax, %eax
+ kmovw %k5, 1040(%rsp)
+ kmovw %k6, 1032(%rsp)
+ kmovw %k7, 1024(%rsp)
+ vmovups %zmm16, 960(%rsp)
+ vmovups %zmm17, 896(%rsp)
+ vmovups %zmm18, 832(%rsp)
+ vmovups %zmm19, 768(%rsp)
+ vmovups %zmm20, 704(%rsp)
+ vmovups %zmm21, 640(%rsp)
+ vmovups %zmm22, 576(%rsp)
+ vmovups %zmm23, 512(%rsp)
+ vmovups %zmm24, 448(%rsp)
+ vmovups %zmm25, 384(%rsp)
+ vmovups %zmm26, 320(%rsp)
+ vmovups %zmm27, 256(%rsp)
+ vmovups %zmm28, 192(%rsp)
+ vmovups %zmm29, 128(%rsp)
+ vmovups %zmm30, 64(%rsp)
+ vmovups %zmm31, (%rsp)
+ movq %rsi, 1064(%rsp)
+ movq %rdi, 1056(%rsp)
+ movq %r12, 1096(%rsp)
+ cfi_offset_rel_rsp (12, 1096)
+ movb %dl, %r12b
+ movq %r13, 1088(%rsp)
+ cfi_offset_rel_rsp (13, 1088)
+ movl %ecx, %r13d
+ movq %r14, 1080(%rsp)
+ cfi_offset_rel_rsp (14, 1080)
+ movl %eax, %r14d
+ movq %r15, 1072(%rsp)
+ cfi_offset_rel_rsp (15, 1072)
+ cfi_remember_state
+
+.LBL_1_6:
+ btl %r14d, %r13d
+ jc .LBL_1_12
+
+.LBL_1_7:
+ lea 1(%r14), %esi
+ btl %esi, %r13d
+ jc .LBL_1_10
+
+.LBL_1_8:
+ addb $1, %r12b
+ addl $2, %r14d
+ cmpb $16, %r12b
+ jb .LBL_1_6
+
+ kmovw 1048(%rsp), %k4
+ movq 1064(%rsp), %rsi
+ kmovw 1040(%rsp), %k5
+ movq 1056(%rsp), %rdi
+ kmovw 1032(%rsp), %k6
+ movq 1096(%rsp), %r12
+ cfi_restore (%r12)
+ movq 1088(%rsp), %r13
+ cfi_restore (%r13)
+ kmovw 1024(%rsp), %k7
+ vmovups 960(%rsp), %zmm16
+ vmovups 896(%rsp), %zmm17
+ vmovups 832(%rsp), %zmm18
+ vmovups 768(%rsp), %zmm19
+ vmovups 704(%rsp), %zmm20
+ vmovups 640(%rsp), %zmm21
+ vmovups 576(%rsp), %zmm22
+ vmovups 512(%rsp), %zmm23
+ vmovups 448(%rsp), %zmm24
+ vmovups 384(%rsp), %zmm25
+ vmovups 320(%rsp), %zmm26
+ vmovups 256(%rsp), %zmm27
+ vmovups 192(%rsp), %zmm28
+ vmovups 128(%rsp), %zmm29
+ vmovups 64(%rsp), %zmm30
+ vmovups (%rsp), %zmm31
+ movq 1080(%rsp), %r14
+ cfi_restore (%r14)
+ movq 1072(%rsp), %r15
+ cfi_restore (%r15)
+ vmovups 1216(%rsp), %zmm1
+ jmp .LBL_1_2
+
+.LBL_1_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ vmovss 1156(%rsp,%r15,8), %xmm0
+ call logf@PLT
+ vmovss %xmm0, 1220(%rsp,%r15,8)
+ jmp .LBL_1_8
+
+.LBL_1_12:
+ movzbl %r12b, %r15d
+ vmovss 1152(%rsp,%r15,8), %xmm0
+ call logf@PLT
+ vmovss %xmm0, 1216(%rsp,%r15,8)
+ jmp .LBL_1_7
+#endif
+END (_ZGVeN16v_logf_knl)
+
+ENTRY (_ZGVeN16v_logf_skx)
+#ifndef HAVE_AVX512_ASM_SUPPORT
+WRAPPER_IMPL_AVX512 _ZGVdN8v_logf
+#else
+/*
+ ALGORITHM DESCRIPTION:
+
+ log(x) = exponent_x*log(2) + log(mantissa_x), if mantissa_x<4/3
+ log(x) = (exponent_x+1)*log(2) + log(0.5*mantissa_x), if mantissa_x>4/3
+
+ R = mantissa_x - 1, if mantissa_x<4/3
+ R = 0.5*mantissa_x - 1, if mantissa_x>4/3
+ |R|< 1/3
+
+ log(1+R) is approximated as a polynomial: degree 9 for 1-ulp,
+ degree 7 for 4-ulp, degree 3 for half-precision. */
+
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $1280, %rsp
+ movq __svml_slog_data@GOTPCREL(%rip), %rax
+ vmovups .L_2il0floatpacket.7(%rip), %zmm6
+ vmovups _iBrkValue(%rax), %zmm4
+ vmovups _sPoly_7(%rax), %zmm8
+
+/*
+ check for working range,
+ set special argument mask (denormals/zero/Inf/NaN)
+ */
+ vpaddd _iHiDelta(%rax), %zmm0, %zmm1
+
+/* reduction: compute r,n */
+ vpsubd %zmm4, %zmm0, %zmm2
+ vpcmpd $5, _iLoRange(%rax), %zmm1, %k1
+
+/* exponent_x (mantissa_x<4/3) or exponent_x+1 (mantissa_x>4/3) */
+ vpsrad $23, %zmm2, %zmm5
+ vpandd _iOffExpoMask(%rax), %zmm2, %zmm3
+
+/* mantissa_x (mantissa_x<4/3), or 0.5*mantissa_x (mantissa_x>4/3) */
+ vpaddd %zmm4, %zmm3, %zmm7
+
+/* reduced argument R */
+ vsubps _sOne(%rax), %zmm7, %zmm9
+
+/* polynomial evaluation starts here */
+ vfmadd213ps _sPoly_6(%rax), %zmm9, %zmm8
+ vfmadd213ps _sPoly_5(%rax), %zmm9, %zmm8
+ vfmadd213ps _sPoly_4(%rax), %zmm9, %zmm8
+ vfmadd213ps _sPoly_3(%rax), %zmm9, %zmm8
+ vfmadd213ps _sPoly_2(%rax), %zmm9, %zmm8
+ vfmadd213ps _sPoly_1(%rax), %zmm9, %zmm8
+ vmulps %zmm9, %zmm8, %zmm10
+
+/* polynomial evaluation end */
+ vfmadd213ps %zmm9, %zmm9, %zmm10
+ vpandnd %zmm1, %zmm1, %zmm6{%k1}
+ vptestmd %zmm6, %zmm6, %k0
+ vcvtdq2ps {rn-sae}, %zmm5, %zmm1
+ kmovw %k0, %ecx
+
+/*
+ final reconstruction:
+ add exponent_value*log2 to polynomial result
+ */
+ vfmadd132ps _sLn2(%rax), %zmm10, %zmm1
+ testl %ecx, %ecx
+ jne .LBL_2_3
+
+.LBL_2_2:
+ cfi_remember_state
+ vmovaps %zmm1, %zmm0
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+
+.LBL_2_3:
+ cfi_restore_state
+ vmovups %zmm0, 1152(%rsp)
+ vmovups %zmm1, 1216(%rsp)
+ je .LBL_2_2
+
+ xorb %dl, %dl
+ xorl %eax, %eax
+ kmovw %k4, 1048(%rsp)
+ kmovw %k5, 1040(%rsp)
+ kmovw %k6, 1032(%rsp)
+ kmovw %k7, 1024(%rsp)
+ vmovups %zmm16, 960(%rsp)
+ vmovups %zmm17, 896(%rsp)
+ vmovups %zmm18, 832(%rsp)
+ vmovups %zmm19, 768(%rsp)
+ vmovups %zmm20, 704(%rsp)
+ vmovups %zmm21, 640(%rsp)
+ vmovups %zmm22, 576(%rsp)
+ vmovups %zmm23, 512(%rsp)
+ vmovups %zmm24, 448(%rsp)
+ vmovups %zmm25, 384(%rsp)
+ vmovups %zmm26, 320(%rsp)
+ vmovups %zmm27, 256(%rsp)
+ vmovups %zmm28, 192(%rsp)
+ vmovups %zmm29, 128(%rsp)
+ vmovups %zmm30, 64(%rsp)
+ vmovups %zmm31, (%rsp)
+ movq %rsi, 1064(%rsp)
+ movq %rdi, 1056(%rsp)
+ movq %r12, 1096(%rsp)
+ cfi_offset_rel_rsp (12, 1096)
+ movb %dl, %r12b
+ movq %r13, 1088(%rsp)
+ cfi_offset_rel_rsp (13, 1088)
+ movl %ecx, %r13d
+ movq %r14, 1080(%rsp)
+ cfi_offset_rel_rsp (14, 1080)
+ movl %eax, %r14d
+ movq %r15, 1072(%rsp)
+ cfi_offset_rel_rsp (15, 1072)
+ cfi_remember_state
+
+.LBL_2_6:
+ btl %r14d, %r13d
+ jc .LBL_2_12
+
+.LBL_2_7:
+ lea 1(%r14), %esi
+ btl %esi, %r13d
+ jc .LBL_2_10
+
+.LBL_2_8:
+ incb %r12b
+ addl $2, %r14d
+ cmpb $16, %r12b
+ jb .LBL_2_6
+
+ kmovw 1048(%rsp), %k4
+ kmovw 1040(%rsp), %k5
+ kmovw 1032(%rsp), %k6
+ kmovw 1024(%rsp), %k7
+ vmovups 960(%rsp), %zmm16
+ vmovups 896(%rsp), %zmm17
+ vmovups 832(%rsp), %zmm18
+ vmovups 768(%rsp), %zmm19
+ vmovups 704(%rsp), %zmm20
+ vmovups 640(%rsp), %zmm21
+ vmovups 576(%rsp), %zmm22
+ vmovups 512(%rsp), %zmm23
+ vmovups 448(%rsp), %zmm24
+ vmovups 384(%rsp), %zmm25
+ vmovups 320(%rsp), %zmm26
+ vmovups 256(%rsp), %zmm27
+ vmovups 192(%rsp), %zmm28
+ vmovups 128(%rsp), %zmm29
+ vmovups 64(%rsp), %zmm30
+ vmovups (%rsp), %zmm31
+ vmovups 1216(%rsp), %zmm1
+ movq 1064(%rsp), %rsi
+ movq 1056(%rsp), %rdi
+ movq 1096(%rsp), %r12
+ cfi_restore (%r12)
+ movq 1088(%rsp), %r13
+ cfi_restore (%r13)
+ movq 1080(%rsp), %r14
+ cfi_restore (%r14)
+ movq 1072(%rsp), %r15
+ cfi_restore (%r15)
+ jmp .LBL_2_2
+
+.LBL_2_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ vmovss 1156(%rsp,%r15,8), %xmm0
+ vzeroupper
+ vmovss 1156(%rsp,%r15,8), %xmm0
+
+ call logf@PLT
+
+ vmovss %xmm0, 1220(%rsp,%r15,8)
+ jmp .LBL_2_8
+
+.LBL_2_12:
+ movzbl %r12b, %r15d
+ vmovss 1152(%rsp,%r15,8), %xmm0
+ vzeroupper
+ vmovss 1152(%rsp,%r15,8), %xmm0
+
+ call logf@PLT
+
+ vmovss %xmm0, 1216(%rsp,%r15,8)
+ jmp .LBL_2_7
+
+#endif
+END (_ZGVeN16v_logf_skx)
+
+ .section .rodata, "a"
+.L_2il0floatpacket.7:
+ .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
+ .type .L_2il0floatpacket.7,@object
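
The logf kernels split x into m * 2^k entirely in integer arithmetic: subtracting _iBrkValue from the raw bits leaves the exponent (or exponent+1) in the top bits and the offset mantissa in the low 23, and adding _iBrkValue back gives m with R = m - 1 below 1/3 in magnitude, as the comments state. A scalar sketch under the assumption that _iBrkValue is the bit pattern of 2/3, with Taylor coefficients standing in for the _sPoly_* table; zero, negative, denormal, Inf and NaN inputs are not handled (the kernels flag them and call scalar logf):

    #include <stdint.h>
    #include <string.h>

    static float
    logf_sketch (float x)
    {
      uint32_t ix;  memcpy (&ix, &x, sizeof ix);

      /* reduction: write x = m * 2^k with m in [2/3, 4/3).  */
      const uint32_t BRK = 0x3f2aaaab;       /* ~2/3, assumed _iBrkValue */
      int32_t d = (int32_t) (ix - BRK);
      int32_t k = d >> 23;                   /* arithmetic shift, as vpsrad */
      uint32_t mb = (uint32_t) (d & 0x007fffff) + BRK;
      float m;  memcpy (&m, &mb, sizeof m);

      /* reduced argument R = m - 1, so |R| < 1/3.  */
      float R = m - 1.0f;

      /* log(1+R) ~ R + R^2 * poly(R); Taylor coefficients stand in for
         _sPoly_7 .. _sPoly_1.  */
      float p = -0.125f;                     /* -1/8 */
      p = p * R + 1.0f / 7;
      p = p * R - 1.0f / 6;
      p = p * R + 0.2f;
      p = p * R - 0.25f;
      p = p * R + 1.0f / 3;
      p = p * R - 0.5f;
      float res = R + R * R * p;

      /* final reconstruction: add k*log(2).  */
      return res + (float) k * 0x1.62e43p-1f;
    }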
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core.S
new file mode 100644
index 0000000000..153ed8ebc2
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core.S
@@ -0,0 +1,38 @@
+/* Multiple versions of vectorized logf.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+ .text
+ENTRY (_ZGVbN4v_logf)
+ .type _ZGVbN4v_logf, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq _ZGVbN4v_logf_sse4(%rip), %rax
+ testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+ jz 2f
+ ret
+2: leaq _ZGVbN4v_logf_sse2(%rip), %rax
+ ret
+END (_ZGVbN4v_logf)
+libmvec_hidden_def (_ZGVbN4v_logf)
+
+#define _ZGVbN4v_logf _ZGVbN4v_logf_sse2
+#include "../svml_s_logf4_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core_sse4.S
new file mode 100644
index 0000000000..68f11033d9
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core_sse4.S
@@ -0,0 +1,194 @@
+/* Function logf vectorized with SSE4.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "svml_s_logf_data.h"
+
+ .text
+ENTRY (_ZGVbN4v_logf_sse4)
+/*
+ ALGORITHM DESCRIPTION:
+
+ log(x) = exponent_x*log(2) + log(mantissa_x), if mantissa_x<4/3
+ log(x) = (exponent_x+1)*log(2) + log(0.5*mantissa_x), if mantissa_x>4/3
+
+ R = mantissa_x - 1, if mantissa_x<4/3
+ R = 0.5*mantissa_x - 1, if mantissa_x>4/3
+ |R|< 1/3
+
+ log(1+R) is approximated as a polynomial: degree 9 for 1-ulp,
+ degree 7 for 4-ulp, degree 3 for half-precision. */
+
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $320, %rsp
+
+/* reduction: compute r,n */
+ movaps %xmm0, %xmm2
+
+/* check for working range,
+ set special argument mask (denormals/zero/Inf/NaN) */
+ movq __svml_slog_data@GOTPCREL(%rip), %rax
+ movdqu _iHiDelta(%rax), %xmm1
+ movdqu _iLoRange(%rax), %xmm4
+ paddd %xmm0, %xmm1
+ movdqu _iBrkValue(%rax), %xmm3
+ pcmpgtd %xmm1, %xmm4
+ movdqu _iOffExpoMask(%rax), %xmm1
+ psubd %xmm3, %xmm2
+ pand %xmm2, %xmm1
+
+/* exponent_x (mantissa_x<4/3) or exponent_x+1 (mantissa_x>4/3) */
+ psrad $23, %xmm2
+ paddd %xmm3, %xmm1
+ movups _sPoly_7(%rax), %xmm5
+
+/* mantissa_x (mantissa_x<4/3), or 0.5*mantissa_x (mantissa_x>4/3) */
+ cvtdq2ps %xmm2, %xmm6
+
+/* reduced argument R */
+ subps _sOne(%rax), %xmm1
+ movmskps %xmm4, %ecx
+
+/* final reconstruction:
+ add exponent_value*log2 to polynomial result */
+ mulps _sLn2(%rax), %xmm6
+
+/* polynomial evaluation starts here */
+ mulps %xmm1, %xmm5
+ addps _sPoly_6(%rax), %xmm5
+ mulps %xmm1, %xmm5
+ addps _sPoly_5(%rax), %xmm5
+ mulps %xmm1, %xmm5
+ addps _sPoly_4(%rax), %xmm5
+ mulps %xmm1, %xmm5
+ addps _sPoly_3(%rax), %xmm5
+ mulps %xmm1, %xmm5
+ addps _sPoly_2(%rax), %xmm5
+ mulps %xmm1, %xmm5
+ addps _sPoly_1(%rax), %xmm5
+ mulps %xmm1, %xmm5
+
+/* polynomial evaluation end */
+ mulps %xmm1, %xmm5
+ addps %xmm5, %xmm1
+ addps %xmm6, %xmm1
+ testl %ecx, %ecx
+ jne .LBL_1_3
+
+.LBL_1_2:
+ cfi_remember_state
+ movdqa %xmm1, %xmm0
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+
+.LBL_1_3:
+ cfi_restore_state
+ movups %xmm0, 192(%rsp)
+ movups %xmm1, 256(%rsp)
+ je .LBL_1_2
+
+ xorb %dl, %dl
+ xorl %eax, %eax
+ movups %xmm8, 112(%rsp)
+ movups %xmm9, 96(%rsp)
+ movups %xmm10, 80(%rsp)
+ movups %xmm11, 64(%rsp)
+ movups %xmm12, 48(%rsp)
+ movups %xmm13, 32(%rsp)
+ movups %xmm14, 16(%rsp)
+ movups %xmm15, (%rsp)
+ movq %rsi, 136(%rsp)
+ movq %rdi, 128(%rsp)
+ movq %r12, 168(%rsp)
+ cfi_offset_rel_rsp (12, 168)
+ movb %dl, %r12b
+ movq %r13, 160(%rsp)
+ cfi_offset_rel_rsp (13, 160)
+ movl %ecx, %r13d
+ movq %r14, 152(%rsp)
+ cfi_offset_rel_rsp (14, 152)
+ movl %eax, %r14d
+ movq %r15, 144(%rsp)
+ cfi_offset_rel_rsp (15, 144)
+ cfi_remember_state
+
+.LBL_1_6:
+ btl %r14d, %r13d
+ jc .LBL_1_12
+
+.LBL_1_7:
+ lea 1(%r14), %esi
+ btl %esi, %r13d
+ jc .LBL_1_10
+
+.LBL_1_8:
+ incb %r12b
+ addl $2, %r14d
+ cmpb $16, %r12b
+ jb .LBL_1_6
+
+ movups 112(%rsp), %xmm8
+ movups 96(%rsp), %xmm9
+ movups 80(%rsp), %xmm10
+ movups 64(%rsp), %xmm11
+ movups 48(%rsp), %xmm12
+ movups 32(%rsp), %xmm13
+ movups 16(%rsp), %xmm14
+ movups (%rsp), %xmm15
+ movq 136(%rsp), %rsi
+ movq 128(%rsp), %rdi
+ movq 168(%rsp), %r12
+ cfi_restore (%r12)
+ movq 160(%rsp), %r13
+ cfi_restore (%r13)
+ movq 152(%rsp), %r14
+ cfi_restore (%r14)
+ movq 144(%rsp), %r15
+ cfi_restore (%r15)
+ movups 256(%rsp), %xmm1
+ jmp .LBL_1_2
+
+.LBL_1_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ movss 196(%rsp,%r15,8), %xmm0
+
+ call logf@PLT
+
+ movss %xmm0, 260(%rsp,%r15,8)
+ jmp .LBL_1_8
+
+.LBL_1_12:
+ movzbl %r12b, %r15d
+ movss 192(%rsp,%r15,8), %xmm0
+
+ call logf@PLT
+
+ movss %xmm0, 256(%rsp,%r15,8)
+ jmp .LBL_1_7
+
+END (_ZGVbN4v_logf_sse4)
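
The range check in these logf kernels is a single integer compare: adding _iHiDelta to the raw bits pushes every input that needs the scalar fallback (zero, denormals, negatives, Inf, NaN) below _iLoRange when viewed as a signed integer. A sketch of that predicate; both constants are assumptions about the data-table values:

    #include <stdbool.h>
    #include <stdint.h>
    #include <string.h>

    static bool
    logf_needs_fallback (float x)
    {
      const uint32_t HI_DELTA = 0x00800000;  /* assumed _iHiDelta */
      const uint32_t LO_RANGE = 0x01000000;  /* assumed _iLoRange */
      uint32_t ix;  memcpy (&ix, &x, sizeof ix);
      return (int32_t) (ix + HI_DELTA) < (int32_t) LO_RANGE;
    }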
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core.S
new file mode 100644
index 0000000000..6f50bf6bdb
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core.S
@@ -0,0 +1,38 @@
+/* Multiple versions of vectorized logf.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+ .text
+ENTRY (_ZGVdN8v_logf)
+ .type _ZGVdN8v_logf, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq _ZGVdN8v_logf_avx2(%rip), %rax
+ testl $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+ jz 2f
+ ret
+2: leaq _ZGVdN8v_logf_sse_wrapper(%rip), %rax
+ ret
+END (_ZGVdN8v_logf)
+libmvec_hidden_def (_ZGVdN8v_logf)
+
+#define _ZGVdN8v_logf _ZGVdN8v_logf_sse_wrapper
+#include "../svml_s_logf8_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core_avx2.S
new file mode 100644
index 0000000000..1f08b4218a
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core_avx2.S
@@ -0,0 +1,184 @@
+/* Function logf vectorized with AVX2.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "svml_s_logf_data.h"
+
+ .text
+ENTRY(_ZGVdN8v_logf_avx2)
+/*
+ ALGORITHM DESCRIPTION:
+
+ log(x) = exponent_x*log(2) + log(mantissa_x), if mantissa_x<4/3
+ log(x) = (exponent_x+1)*log(2) + log(0.5*mantissa_x), if mantissa_x>4/3
+
+ R = mantissa_x - 1, if mantissa_x<4/3
+ R = 0.5*mantissa_x - 1, if mantissa_x>4/3
+ |R|< 1/3
+
+ log(1+R) is approximated as a polynomial: degree 9 for 1-ulp,
+ degree 7 for 4-ulp, degree 3 for half-precision. */
+
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $448, %rsp
+ movq __svml_slog_data@GOTPCREL(%rip), %rax
+ vmovaps %ymm0, %ymm2
+ vmovups _iBrkValue(%rax), %ymm6
+ vmovups _iLoRange(%rax), %ymm1
+/* check for working range,
+ set special argument mask (denormals/zero/Inf/NaN) */
+ vpaddd _iHiDelta(%rax), %ymm2, %ymm7
+
+/* reduction: compute r,n */
+ vpsubd %ymm6, %ymm2, %ymm4
+
+/* exponent_x (mantissa_x<4/3) or exponent_x+1 (mantissa_x>4/3) */
+ vpsrad $23, %ymm4, %ymm3
+ vpand _iOffExpoMask(%rax), %ymm4, %ymm5
+ vmovups _sPoly_7(%rax), %ymm4
+ vcvtdq2ps %ymm3, %ymm0
+
+/* mantissa_x (mantissa_x<4/3), or 0.5*mantissa_x (mantissa_x>4/3) */
+ vpaddd %ymm6, %ymm5, %ymm3
+
+/* reduced argument R */
+ vsubps _sOne(%rax), %ymm3, %ymm5
+
+/* polynomial evaluation starts here */
+ vfmadd213ps _sPoly_6(%rax), %ymm5, %ymm4
+ vfmadd213ps _sPoly_5(%rax), %ymm5, %ymm4
+ vfmadd213ps _sPoly_4(%rax), %ymm5, %ymm4
+ vfmadd213ps _sPoly_3(%rax), %ymm5, %ymm4
+ vfmadd213ps _sPoly_2(%rax), %ymm5, %ymm4
+ vfmadd213ps _sPoly_1(%rax), %ymm5, %ymm4
+ vmulps %ymm5, %ymm4, %ymm6
+
+/* polynomial evaluation end */
+ vfmadd213ps %ymm5, %ymm5, %ymm6
+ vpcmpgtd %ymm7, %ymm1, %ymm1
+ vmovmskps %ymm1, %ecx
+
+/* final reconstruction:
+ add exponent_value*log2 to polynomial result */
+ vfmadd132ps _sLn2(%rax), %ymm6, %ymm0
+ testl %ecx, %ecx
+ jne .LBL_1_3
+
+.LBL_1_2:
+ cfi_remember_state
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+
+.LBL_1_3:
+ cfi_restore_state
+ vmovups %ymm2, 320(%rsp)
+ vmovups %ymm0, 384(%rsp)
+ je .LBL_1_2
+
+ xorb %dl, %dl
+ xorl %eax, %eax
+ vmovups %ymm8, 224(%rsp)
+ vmovups %ymm9, 192(%rsp)
+ vmovups %ymm10, 160(%rsp)
+ vmovups %ymm11, 128(%rsp)
+ vmovups %ymm12, 96(%rsp)
+ vmovups %ymm13, 64(%rsp)
+ vmovups %ymm14, 32(%rsp)
+ vmovups %ymm15, (%rsp)
+ movq %rsi, 264(%rsp)
+ movq %rdi, 256(%rsp)
+ movq %r12, 296(%rsp)
+ cfi_offset_rel_rsp (12, 296)
+ movb %dl, %r12b
+ movq %r13, 288(%rsp)
+ cfi_offset_rel_rsp (13, 288)
+ movl %ecx, %r13d
+ movq %r14, 280(%rsp)
+ cfi_offset_rel_rsp (14, 280)
+ movl %eax, %r14d
+ movq %r15, 272(%rsp)
+ cfi_offset_rel_rsp (15, 272)
+ cfi_remember_state
+
+.LBL_1_6:
+ btl %r14d, %r13d
+ jc .LBL_1_12
+
+.LBL_1_7:
+ lea 1(%r14), %esi
+ btl %esi, %r13d
+ jc .LBL_1_10
+
+.LBL_1_8:
+ incb %r12b
+ addl $2, %r14d
+ cmpb $16, %r12b
+ jb .LBL_1_6
+
+ vmovups 224(%rsp), %ymm8
+ vmovups 192(%rsp), %ymm9
+ vmovups 160(%rsp), %ymm10
+ vmovups 128(%rsp), %ymm11
+ vmovups 96(%rsp), %ymm12
+ vmovups 64(%rsp), %ymm13
+ vmovups 32(%rsp), %ymm14
+ vmovups (%rsp), %ymm15
+ vmovups 384(%rsp), %ymm0
+ movq 264(%rsp), %rsi
+ movq 256(%rsp), %rdi
+ movq 296(%rsp), %r12
+ cfi_restore (%r12)
+ movq 288(%rsp), %r13
+ cfi_restore (%r13)
+ movq 280(%rsp), %r14
+ cfi_restore (%r14)
+ movq 272(%rsp), %r15
+ cfi_restore (%r15)
+ jmp .LBL_1_2
+
+.LBL_1_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ vmovss 324(%rsp,%r15,8), %xmm0
+ vzeroupper
+
+ call logf@PLT
+
+ vmovss %xmm0, 388(%rsp,%r15,8)
+ jmp .LBL_1_8
+
+.LBL_1_12:
+ movzbl %r12b, %r15d
+ vmovss 320(%rsp,%r15,8), %xmm0
+ vzeroupper
+
+ call logf@PLT
+
+ vmovss %xmm0, 384(%rsp,%r15,8)
+ jmp .LBL_1_7
+
+END(_ZGVdN8v_logf_avx2)
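
For reference, the reduction in the ALGORITHM DESCRIPTION comment above can be
modelled in scalar C along the following lines.  This is a minimal sketch,
assuming a finite positive input: plain Taylor coefficients stand in for the
tuned polynomial held in the data table, and the function and constant names
(logf_sketch, ln2f) are illustrative, not glibc interfaces.

#include <math.h>
#include <stdio.h>

static const float ln2f = 0.69314718f;        /* log(2) */

/* Minimal scalar model of the logf reduction; special cases
   (x <= 0, Inf, NaN, denormals) are not handled.  */
static float
logf_sketch (float x)
{
  int e;
  float m = frexpf (x, &e);      /* x = m * 2^e, m in [0.5, 1) */
  float r;
  if (m < (float) (2.0 / 3.0))
    {
      r = 2.0f * m - 1.0f;       /* mantissa_x < 4/3: R = mantissa_x - 1 */
      e -= 1;
    }
  else
    r = m - 1.0f;                /* mantissa_x > 4/3: R = 0.5*mantissa_x - 1 */

  /* |R| < 1/3.  Plain Taylor terms for log(1+R), good to a few decimal
     digits only; the assembly uses a tuned degree-9 polynomial from its
     data table to reach the stated 1-ulp target.  */
  float p = r * (1.0f + r * (-0.5f + r * (1.0f / 3.0f
            + r * (-0.25f + r * 0.2f))));
  return (float) e * ln2f + p;
}

int
main (void)
{
  printf ("%g vs %g\n", logf_sketch (2.5f), logf (2.5f));
  return 0;
}

Any C99 compiler builds it; link with -lm.
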
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S
new file mode 100644
index 0000000000..3aa9f952ce
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S
@@ -0,0 +1,39 @@
+/* Multiple versions of vectorized powf.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+ .text
+ENTRY (_ZGVeN16vv_powf)
+ .type _ZGVeN16vv_powf, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq _ZGVeN16vv_powf_skx(%rip), %rax
+ testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
+ jnz 2f
+ leaq _ZGVeN16vv_powf_knl(%rip), %rax
+ testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
+ jnz 2f
+ leaq _ZGVeN16vv_powf_avx2_wrapper(%rip), %rax
+2: ret
+END (_ZGVeN16vv_powf)
+
+#define _ZGVeN16vv_powf _ZGVeN16vv_powf_avx2_wrapper
+#include "../svml_s_powf16_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
new file mode 100644
index 0000000000..4b61974cb6
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
@@ -0,0 +1,653 @@
+/* Function powf vectorized with AVX-512. KNL and SKX versions.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "svml_s_powf_data.h"
+#include "svml_s_wrapper_impl.h"
+
+/*
+ ALGORITHM DESCRIPTION:
+
+ We use the following identity: pow(x,y) = 2^(y * log2(x)).
+
+ 1) log2(x) calculation
+ Here we use the following formula.
+ Let |x|=2^k1*X1, where k1 is integer, 1<=X1<2.
+ Let C ~= 1/ln(2),
+ Rcp1 ~= 1/X1, X2=Rcp1*X1,
+ Rcp2 ~= 1/X2, X3=Rcp2*X2,
+ Rcp3 ~= 1/X3, Rcp3C ~= C/X3.
+ Then
+ log2|x| = k1 + log2(1/Rcp1) + log2(1/Rcp2) + log2(C/Rcp3C) +
+ log2(X1*Rcp1*Rcp2*Rcp3C/C),
+ where X1*Rcp1*Rcp2*Rcp3C = C*(1+q), q is very small.
+
+ The values of Rcp1, log2(1/Rcp1), Rcp2, log2(1/Rcp2),
+ Rcp3C, log2(C/Rcp3C) are taken from tables.
+ Values of Rcp1, Rcp2, Rcp3C are such that RcpC=Rcp1*Rcp2*Rcp3C
+ is exactly represented in target precision.
+
+ log2(X1*Rcp1*Rcp2*Rcp3C/C) = log2(1+q) = ln(1+q)/ln2 =
+ = 1/(ln2)*q - 1/(2ln2)*q^2 + 1/(3ln2)*q^3 - ... =
+ = 1/(C*ln2)*cq - 1/(2*C^2*ln2)*cq^2 + 1/(3*C^3*ln2)*cq^3 - ... =
+ = (1 + a1)*cq + a2*cq^2 + a3*cq^3 + ...,
+ where
+ cq=X1*Rcp1*Rcp2*Rcp3C-C,
+ a1=1/(C*ln(2))-1 is small,
+ a2=1/(2*C^2*ln2),
+ a3=1/(3*C^3*ln2),
+ ...
+ The log2 result is split into three parts: HH+HL+HLL
+
+ 2) Calculation of y*log2(x)
+ Split y into YHi+YLo.
+ Get high PH and medium PL parts of y*log2|x|.
+ Get low PLL part of y*log2|x|.
+ Now we have PH+PL+PLL ~= y*log2|x|.
+
+ 3) Calculation of 2^(y*log2(x))
+ Let's represent PH+PL+PLL in the form N + j/2^expK + Z,
+ where expK=7 in this implementation, N and j are integers,
+ 0<=j<=2^expK-1, |Z|<2^(-expK-1). Hence
+ 2^(PH+PL+PLL) ~= 2^N * 2^(j/2^expK) * 2^Z,
+ where 2^(j/2^expK) is stored in a table, and
+ 2^Z ~= 1 + B1*Z + B2*Z^2 ... + B5*Z^5.
+ We compute 2^(PH+PL+PLL) as follows:
+ Break PH into PHH + PHL, where PHH = N + j/2^expK.
+ Z = PHL + PL + PLL
+ Exp2Poly = B1*Z + B2*Z^2 ... + B5*Z^5
+ Get 2^(j/2^expK) from table in the form THI+TLO.
+ Now we have 2^(PH+PL+PLL) ~= 2^N * (THI + TLO) * (1 + Exp2Poly).
+ Get significand of 2^(PH+PL+PLL) in the form ResHi+ResLo:
+ ResHi := THI
+ ResLo := THI * Exp2Poly + TLO
+ Get exponent ERes of the result:
+ Res := ResHi + ResLo:
+ Result := ex(Res) + N.  (A scalar C sketch of step 3 follows this
+ file's diff.)  */
+
+ .text
+ENTRY (_ZGVeN16vv_powf_knl)
+#ifndef HAVE_AVX512_ASM_SUPPORT
+WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf
+#else
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $1344, %rsp
+ movq __svml_spow_data@GOTPCREL(%rip), %rdx
+ vmovaps %zmm1, %zmm9
+ vshuff32x4 $238, %zmm0, %zmm0, %zmm7
+ kxnorw %k3, %k3, %k3
+ vcvtps2pd %ymm0, %zmm14
+ vcvtps2pd %ymm7, %zmm10
+ movl $-1, %eax
+ movq $-1, %rcx
+ vpandd _ABSMASK(%rdx), %zmm9, %zmm4
+ vmovups _ExpMask(%rdx), %zmm6
+
+/* exponent bits selection */
+ vpsrlq $20, %zmm14, %zmm13
+ vshuff32x4 $238, %zmm9, %zmm9, %zmm8
+ vpcmpd $5, _INF(%rdx), %zmm4, %k2
+ vpsrlq $32, %zmm13, %zmm15
+ vcvtps2pd %ymm8, %zmm2
+ vmovups _Two10(%rdx), %zmm4
+ vpmovqd %zmm15, %ymm12
+ vcvtps2pd %ymm9, %zmm1
+ vpsubd _NMINNORM(%rdx), %zmm0, %zmm3
+ vpbroadcastd %eax, %zmm8{%k2}{z}
+ vpcmpd $5, _NMAXVAL(%rdx), %zmm3, %k1
+
+/* preserve mantissa, set input exponent to 2^(-10) */
+ vmovaps %zmm6, %zmm3
+ vpternlogq $248, %zmm6, %zmm10, %zmm4
+ vpsrlq $20, %zmm10, %zmm10
+ vpternlogq $234, _Two10(%rdx), %zmm14, %zmm3
+
+/* reciprocal approximation good to at least 11 bits */
+ vrcp28pd %zmm4, %zmm11
+ vpsrlq $32, %zmm10, %zmm14
+ vpbroadcastd %eax, %zmm7{%k1}{z}
+ kxnorw %k1, %k1, %k1
+ vrcp28pd %zmm3, %zmm5
+ vpmovqd %zmm14, %ymm6
+ vshufi32x4 $68, %zmm6, %zmm12, %zmm13
+ vmovups _One(%rdx), %zmm6
+
+/* round reciprocal to nearest integer, will have 1+9 mantissa bits */
+ vrndscalepd $8, %zmm5, %zmm14
+
+/* biased exponent in DP format */
+ vshuff32x4 $238, %zmm13, %zmm13, %zmm5
+ vrndscalepd $8, %zmm11, %zmm11
+ vcmppd $30, _Threshold(%rdx), %zmm14, %k2
+ vcvtdq2pd %ymm13, %zmm10
+ vcvtdq2pd %ymm5, %zmm15
+
+/* table lookup */
+ vpsrlq $40, %zmm14, %zmm13
+ vpxord %zmm5, %zmm5, %zmm5
+ vgatherqpd _Log2Rcp_lookup(%rdx,%zmm13), %zmm5{%k3}
+ vfmsub213pd %zmm6, %zmm14, %zmm3
+ vfmsub213pd %zmm6, %zmm11, %zmm4
+ vcmppd $30, _Threshold(%rdx), %zmm11, %k3
+ vpbroadcastq %rcx, %zmm14{%k2}{z}
+
+/* dpP= _dbT+lJ*T_ITEM_GRAN */
+ kxnorw %k2, %k2, %k2
+ vpsrlq $40, %zmm11, %zmm12
+ vpxord %zmm6, %zmm6, %zmm6
+ vpbroadcastq %rcx, %zmm11{%k3}{z}
+ kxnorw %k3, %k3, %k3
+ vgatherqpd _Log2Rcp_lookup(%rdx,%zmm12), %zmm6{%k1}
+ vmovups _Bias1(%rdx), %zmm12
+ vpternlogq $236, _Bias(%rdx), %zmm12, %zmm14
+ vpternlogq $248, _Bias(%rdx), %zmm11, %zmm12
+ vsubpd %zmm14, %zmm10, %zmm13
+ vsubpd %zmm12, %zmm15, %zmm10
+ vmovups _poly_coeff_3(%rdx), %zmm11
+ vmovups _poly_coeff_4(%rdx), %zmm15
+ vfmadd213pd %zmm15, %zmm4, %zmm11
+ vmulpd %zmm4, %zmm4, %zmm12
+ vmovaps %zmm15, %zmm14
+ vmulpd %zmm3, %zmm3, %zmm15
+ vfmadd231pd _poly_coeff_3(%rdx), %zmm3, %zmm14
+
+/* reconstruction */
+ vfmadd213pd %zmm4, %zmm12, %zmm11
+ vfmadd213pd %zmm3, %zmm15, %zmm14
+ vaddpd %zmm6, %zmm11, %zmm11
+ vaddpd %zmm5, %zmm14, %zmm3
+ vfmadd231pd _L2(%rdx), %zmm10, %zmm11
+ vfmadd132pd _L2(%rdx), %zmm3, %zmm13
+ vmulpd %zmm2, %zmm11, %zmm12
+ vmulpd %zmm1, %zmm13, %zmm10
+ vmulpd __dbInvLn2(%rdx), %zmm12, %zmm6
+
+/* hi bits */
+ vpsrlq $32, %zmm12, %zmm12
+ vmulpd __dbInvLn2(%rdx), %zmm10, %zmm1
+
+/* to round down; if dR is an integer we will get R = 1, which is ok */
+ vsubpd __dbHALF(%rdx), %zmm6, %zmm4
+ vpsrlq $32, %zmm10, %zmm11
+ vpmovqd %zmm11, %ymm3
+ vsubpd __dbHALF(%rdx), %zmm1, %zmm2
+ vaddpd __dbShifter(%rdx), %zmm4, %zmm14
+ vpmovqd %zmm12, %ymm4
+ vshufi32x4 $68, %zmm4, %zmm3, %zmm5
+ vpxord %zmm4, %zmm4, %zmm4
+ vaddpd __dbShifter(%rdx), %zmm2, %zmm2
+
+/* iAbsX = iAbsX&iAbsMask; */
+ vpandd __iAbsMask(%rdx), %zmm5, %zmm11
+ vpxord %zmm5, %zmm5, %zmm5
+ vsubpd __dbShifter(%rdx), %zmm14, %zmm13
+
+/* iRangeMask = (iAbsX>iDomainRange) */
+ vpcmpgtd __iDomainRange(%rdx), %zmm11, %k1
+ vsubpd __dbShifter(%rdx), %zmm2, %zmm15
+ vpbroadcastd %eax, %zmm10{%k1}{z}
+ vpternlogd $254, %zmm8, %zmm7, %zmm10
+
+/* [0..1) */
+ vsubpd %zmm15, %zmm1, %zmm1
+
+/* low K bits */
+ vpandq __lbLOWKBITS(%rdx), %zmm14, %zmm11
+ vgatherqpd 13952(%rdx,%zmm11,8), %zmm5{%k3}
+ vsubpd %zmm13, %zmm6, %zmm7
+ vptestmd %zmm10, %zmm10, %k0
+ vpandq __lbLOWKBITS(%rdx), %zmm2, %zmm10
+ vmulpd __dbC1(%rdx), %zmm1, %zmm1
+ vmulpd __dbC1(%rdx), %zmm7, %zmm3
+ vpsrlq $11, %zmm2, %zmm8
+ vpsrlq $11, %zmm14, %zmm2
+
+/* NB : including +/- sign for the exponent!! */
+ vpsllq $52, %zmm8, %zmm8
+ kmovw %k0, %ecx
+ vpsllq $52, %zmm2, %zmm6
+ vfmadd213pd %zmm5, %zmm3, %zmm5
+ vgatherqpd 13952(%rdx,%zmm10,8), %zmm4{%k2}
+ vfmadd213pd %zmm4, %zmm1, %zmm4
+ vpaddq %zmm6, %zmm5, %zmm10
+ vcvtpd2ps %zmm10, %ymm12
+ vpaddq %zmm8, %zmm4, %zmm7
+ vcvtpd2ps %zmm7, %ymm11
+ vshuff32x4 $68, %zmm12, %zmm11, %zmm1
+ testl %ecx, %ecx
+ jne .LBL_1_3
+
+.LBL_1_2:
+ cfi_remember_state
+ vmovaps %zmm1, %zmm0
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+
+.LBL_1_3:
+ cfi_restore_state
+ vmovups %zmm0, 1152(%rsp)
+ vmovups %zmm9, 1216(%rsp)
+ vmovups %zmm1, 1280(%rsp)
+ je .LBL_1_2
+
+ xorb %dl, %dl
+ kmovw %k4, 1048(%rsp)
+ xorl %eax, %eax
+ kmovw %k5, 1040(%rsp)
+ kmovw %k6, 1032(%rsp)
+ kmovw %k7, 1024(%rsp)
+ vmovups %zmm16, 960(%rsp)
+ vmovups %zmm17, 896(%rsp)
+ vmovups %zmm18, 832(%rsp)
+ vmovups %zmm19, 768(%rsp)
+ vmovups %zmm20, 704(%rsp)
+ vmovups %zmm21, 640(%rsp)
+ vmovups %zmm22, 576(%rsp)
+ vmovups %zmm23, 512(%rsp)
+ vmovups %zmm24, 448(%rsp)
+ vmovups %zmm25, 384(%rsp)
+ vmovups %zmm26, 320(%rsp)
+ vmovups %zmm27, 256(%rsp)
+ vmovups %zmm28, 192(%rsp)
+ vmovups %zmm29, 128(%rsp)
+ vmovups %zmm30, 64(%rsp)
+ vmovups %zmm31, (%rsp)
+ movq %rsi, 1064(%rsp)
+ movq %rdi, 1056(%rsp)
+ movq %r12, 1096(%rsp)
+ cfi_offset_rel_rsp (12, 1096)
+ movb %dl, %r12b
+ movq %r13, 1088(%rsp)
+ cfi_offset_rel_rsp (13, 1088)
+ movl %ecx, %r13d
+ movq %r14, 1080(%rsp)
+ cfi_offset_rel_rsp (14, 1080)
+ movl %eax, %r14d
+ movq %r15, 1072(%rsp)
+ cfi_offset_rel_rsp (15, 1072)
+ cfi_remember_state
+
+.LBL_1_6:
+ btl %r14d, %r13d
+ jc .LBL_1_12
+
+.LBL_1_7:
+ lea 1(%r14), %esi
+ btl %esi, %r13d
+ jc .LBL_1_10
+
+.LBL_1_8:
+ addb $1, %r12b
+ addl $2, %r14d
+ cmpb $16, %r12b
+ jb .LBL_1_6
+
+ kmovw 1048(%rsp), %k4
+ movq 1064(%rsp), %rsi
+ kmovw 1040(%rsp), %k5
+ movq 1056(%rsp), %rdi
+ kmovw 1032(%rsp), %k6
+ movq 1096(%rsp), %r12
+ cfi_restore (%r12)
+ movq 1088(%rsp), %r13
+ cfi_restore (%r13)
+ kmovw 1024(%rsp), %k7
+ vmovups 960(%rsp), %zmm16
+ vmovups 896(%rsp), %zmm17
+ vmovups 832(%rsp), %zmm18
+ vmovups 768(%rsp), %zmm19
+ vmovups 704(%rsp), %zmm20
+ vmovups 640(%rsp), %zmm21
+ vmovups 576(%rsp), %zmm22
+ vmovups 512(%rsp), %zmm23
+ vmovups 448(%rsp), %zmm24
+ vmovups 384(%rsp), %zmm25
+ vmovups 320(%rsp), %zmm26
+ vmovups 256(%rsp), %zmm27
+ vmovups 192(%rsp), %zmm28
+ vmovups 128(%rsp), %zmm29
+ vmovups 64(%rsp), %zmm30
+ vmovups (%rsp), %zmm31
+ movq 1080(%rsp), %r14
+ cfi_restore (%r14)
+ movq 1072(%rsp), %r15
+ cfi_restore (%r15)
+ vmovups 1280(%rsp), %zmm1
+ jmp .LBL_1_2
+
+.LBL_1_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ vmovss 1156(%rsp,%r15,8), %xmm0
+ vmovss 1220(%rsp,%r15,8), %xmm1
+ call powf@PLT
+ vmovss %xmm0, 1284(%rsp,%r15,8)
+ jmp .LBL_1_8
+
+.LBL_1_12:
+ movzbl %r12b, %r15d
+ vmovss 1152(%rsp,%r15,8), %xmm0
+ vmovss 1216(%rsp,%r15,8), %xmm1
+ call powf@PLT
+ vmovss %xmm0, 1280(%rsp,%r15,8)
+ jmp .LBL_1_7
+#endif
+END (_ZGVeN16vv_powf_knl)
+
+ENTRY (_ZGVeN16vv_powf_skx)
+#ifndef HAVE_AVX512_ASM_SUPPORT
+WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf
+#else
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $1344, %rsp
+ movq __svml_spow_data@GOTPCREL(%rip), %rax
+ vextractf32x8 $1, %zmm1, %ymm14
+ vextractf32x8 $1, %zmm0, %ymm15
+ vpsubd _NMINNORM(%rax), %zmm0, %zmm9
+ vmovups %zmm26, 1280(%rsp)
+ vmovups _ExpMask(%rax), %zmm6
+ vpcmpd $1, _NMAXVAL(%rax), %zmm9, %k1
+ vcvtps2pd %ymm0, %zmm5
+ vcvtps2pd %ymm1, %zmm12
+ kxnorw %k3, %k3, %k3
+
+/* exponent bits selection */
+ vpsrlq $20, %zmm5, %zmm3
+ vpsrlq $32, %zmm3, %zmm2
+ vpmovqd %zmm2, %ymm11
+ vcvtps2pd %ymm14, %zmm13
+ vmovups .L_2il0floatpacket.23(%rip), %zmm14
+ vmovaps %zmm14, %zmm26
+ vpandd _ABSMASK(%rax), %zmm1, %zmm8
+ vpcmpd $1, _INF(%rax), %zmm8, %k2
+ vpandnd %zmm9, %zmm9, %zmm26{%k1}
+ vmovups _Two10(%rax), %zmm9
+ kxnorw %k1, %k1, %k1
+ vcvtps2pd %ymm15, %zmm4
+ vmovaps %zmm14, %zmm15
+
+/* preserve mantissa, set input exponent to 2^(-10) */
+ vpternlogq $248, %zmm6, %zmm4, %zmm9
+ vpsrlq $20, %zmm4, %zmm4
+
+/* reciprocal approximation good to at least 11 bits */
+ vrcp14pd %zmm9, %zmm10
+
+/* round reciprocal to nearest integer, will have 1+9 mantissa bits */
+ vrndscalepd $8, %zmm10, %zmm3
+ vmovups _One(%rax), %zmm10
+ vfmsub213pd %zmm10, %zmm3, %zmm9
+ vpandnd %zmm8, %zmm8, %zmm15{%k2}
+ vmovaps %zmm6, %zmm8
+ vpternlogq $234, _Two10(%rax), %zmm5, %zmm8
+ vpsrlq $32, %zmm4, %zmm5
+ vrcp14pd %zmm8, %zmm7
+ vpmovqd %zmm5, %ymm6
+ vrndscalepd $8, %zmm7, %zmm2
+ vfmsub213pd %zmm10, %zmm2, %zmm8
+
+/* table lookup */
+ vpsrlq $40, %zmm2, %zmm10
+ vinserti32x8 $1, %ymm6, %zmm11, %zmm4
+ vpsrlq $40, %zmm3, %zmm11
+
+/* biased exponent in DP format */
+ vextracti32x8 $1, %zmm4, %ymm7
+ vcvtdq2pd %ymm4, %zmm6
+ vpmovqd %zmm10, %ymm4
+ vpmovqd %zmm11, %ymm5
+ vpxord %zmm10, %zmm10, %zmm10
+ vgatherdpd _Log2Rcp_lookup(%rax,%ymm4), %zmm10{%k3}
+ vpbroadcastq .L_2il0floatpacket.24(%rip), %zmm4
+ vpxord %zmm11, %zmm11, %zmm11
+ vcvtdq2pd %ymm7, %zmm7
+ vgatherdpd _Log2Rcp_lookup(%rax,%ymm5), %zmm11{%k1}
+ vmovups _Threshold(%rax), %zmm5
+ vcmppd $21, %zmm2, %zmm5, %k2
+ vcmppd $21, %zmm3, %zmm5, %k3
+ vmovups _Bias1(%rax), %zmm3
+ vmovaps %zmm4, %zmm2
+ vpandnq %zmm5, %zmm5, %zmm2{%k2}
+ vpternlogq $236, _Bias(%rax), %zmm3, %zmm2
+
+/* dpP= _dbT+lJ*T_ITEM_GRAN */
+ kxnorw %k2, %k2, %k2
+ vpandnq %zmm5, %zmm5, %zmm4{%k3}
+ vpternlogq $248, _Bias(%rax), %zmm4, %zmm3
+ vsubpd %zmm2, %zmm6, %zmm4
+ vmovups _poly_coeff_3(%rax), %zmm6
+ vmovups _poly_coeff_4(%rax), %zmm2
+ vsubpd %zmm3, %zmm7, %zmm5
+ vmulpd %zmm8, %zmm8, %zmm7
+ vfmadd213pd %zmm2, %zmm9, %zmm6
+ kxnorw %k3, %k3, %k3
+ vmovaps %zmm2, %zmm3
+ vmulpd %zmm9, %zmm9, %zmm2
+ vfmadd231pd _poly_coeff_3(%rax), %zmm8, %zmm3
+
+/* reconstruction */
+ vfmadd213pd %zmm9, %zmm2, %zmm6
+ vfmadd213pd %zmm8, %zmm7, %zmm3
+ vaddpd %zmm11, %zmm6, %zmm8
+ vaddpd %zmm10, %zmm3, %zmm9
+ vfmadd231pd _L2(%rax), %zmm5, %zmm8
+ vfmadd132pd _L2(%rax), %zmm9, %zmm4
+ vmulpd %zmm13, %zmm8, %zmm13
+ vmulpd %zmm12, %zmm4, %zmm3
+ vmulpd __dbInvLn2(%rax), %zmm13, %zmm10
+ vmulpd __dbInvLn2(%rax), %zmm3, %zmm8
+
+/* hi bits */
+ vpsrlq $32, %zmm3, %zmm4
+ vpsrlq $32, %zmm13, %zmm13
+
+/* to round down; if dR is an integer we will get R = 1, which is ok */
+ vsubpd __dbHALF(%rax), %zmm8, %zmm12
+ vpmovqd %zmm4, %ymm5
+ vpmovqd %zmm13, %ymm2
+ vsubpd __dbHALF(%rax), %zmm10, %zmm9
+ vaddpd __dbShifter(%rax), %zmm12, %zmm7
+ vaddpd __dbShifter(%rax), %zmm9, %zmm9
+ vsubpd __dbShifter(%rax), %zmm7, %zmm11
+ vsubpd __dbShifter(%rax), %zmm9, %zmm12
+ vinserti32x8 $1, %ymm2, %zmm5, %zmm3
+
+/* iAbsX = iAbsX&iAbsMask */
+ vpandd __iAbsMask(%rax), %zmm3, %zmm4
+
+/* iRangeMask = (iAbsX>iDomainRange) */
+ vpcmpd $2, __iDomainRange(%rax), %zmm4, %k1
+ vpandnd %zmm4, %zmm4, %zmm14{%k1}
+ vpternlogd $254, %zmm15, %zmm26, %zmm14
+
+/* [0..1) */
+ vsubpd %zmm11, %zmm8, %zmm15
+ vsubpd %zmm12, %zmm10, %zmm26
+ vptestmd %zmm14, %zmm14, %k0
+ vpsrlq $11, %zmm7, %zmm8
+ vpsrlq $11, %zmm9, %zmm10
+ vmulpd __dbC1(%rax), %zmm26, %zmm26
+ vmulpd __dbC1(%rax), %zmm15, %zmm15
+
+/* NB : including +/- sign for the exponent!! */
+ vpsllq $52, %zmm10, %zmm13
+ vpsllq $52, %zmm8, %zmm12
+ kmovw %k0, %ecx
+
+/* low K bits */
+ vpandq __lbLOWKBITS(%rax), %zmm9, %zmm14
+ vpandq __lbLOWKBITS(%rax), %zmm7, %zmm6
+ vpmovqd %zmm14, %ymm7
+ vpmovqd %zmm6, %ymm9
+ vpxord %zmm2, %zmm2, %zmm2
+ vgatherdpd 13952(%rax,%ymm7,8), %zmm2{%k3}
+ vfmadd213pd %zmm2, %zmm26, %zmm2
+ vpaddq %zmm13, %zmm2, %zmm2
+ vcvtpd2ps %zmm2, %ymm4
+ vpxord %zmm11, %zmm11, %zmm11
+ vgatherdpd 13952(%rax,%ymm9,8), %zmm11{%k2}
+ vfmadd213pd %zmm11, %zmm15, %zmm11
+ vpaddq %zmm12, %zmm11, %zmm3
+ vcvtpd2ps %zmm3, %ymm5
+ vinsertf32x8 $1, %ymm4, %zmm5, %zmm2
+ testl %ecx, %ecx
+ jne .LBL_2_3
+
+.LBL_2_2:
+ cfi_remember_state
+ vmovups 1280(%rsp), %zmm26
+ vmovaps %zmm2, %zmm0
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+
+.LBL_2_3:
+ cfi_restore_state
+ vmovups %zmm0, 1088(%rsp)
+ vmovups %zmm1, 1152(%rsp)
+ vmovups %zmm2, 1216(%rsp)
+ je .LBL_2_2
+
+ xorb %dl, %dl
+ xorl %eax, %eax
+ kmovw %k4, 984(%rsp)
+ kmovw %k5, 976(%rsp)
+ kmovw %k6, 968(%rsp)
+ kmovw %k7, 960(%rsp)
+ vmovups %zmm16, 896(%rsp)
+ vmovups %zmm17, 832(%rsp)
+ vmovups %zmm18, 768(%rsp)
+ vmovups %zmm19, 704(%rsp)
+ vmovups %zmm20, 640(%rsp)
+ vmovups %zmm21, 576(%rsp)
+ vmovups %zmm22, 512(%rsp)
+ vmovups %zmm23, 448(%rsp)
+ vmovups %zmm24, 384(%rsp)
+ vmovups %zmm25, 320(%rsp)
+ vmovups %zmm27, 256(%rsp)
+ vmovups %zmm28, 192(%rsp)
+ vmovups %zmm29, 128(%rsp)
+ vmovups %zmm30, 64(%rsp)
+ vmovups %zmm31, (%rsp)
+ movq %rsi, 1000(%rsp)
+ movq %rdi, 992(%rsp)
+ movq %r12, 1032(%rsp)
+ cfi_offset_rel_rsp (12, 1032)
+ movb %dl, %r12b
+ movq %r13, 1024(%rsp)
+ cfi_offset_rel_rsp (13, 1024)
+ movl %ecx, %r13d
+ movq %r14, 1016(%rsp)
+ cfi_offset_rel_rsp (14, 1016)
+ movl %eax, %r14d
+ movq %r15, 1008(%rsp)
+ cfi_offset_rel_rsp (15, 1008)
+ cfi_remember_state
+
+.LBL_2_6:
+ btl %r14d, %r13d
+ jc .LBL_2_12
+
+.LBL_2_7:
+ lea 1(%r14), %esi
+ btl %esi, %r13d
+ jc .LBL_2_10
+
+.LBL_2_8:
+ incb %r12b
+ addl $2, %r14d
+ cmpb $16, %r12b
+ jb .LBL_2_6
+
+ kmovw 984(%rsp), %k4
+ kmovw 976(%rsp), %k5
+ kmovw 968(%rsp), %k6
+ kmovw 960(%rsp), %k7
+ vmovups 896(%rsp), %zmm16
+ vmovups 832(%rsp), %zmm17
+ vmovups 768(%rsp), %zmm18
+ vmovups 704(%rsp), %zmm19
+ vmovups 640(%rsp), %zmm20
+ vmovups 576(%rsp), %zmm21
+ vmovups 512(%rsp), %zmm22
+ vmovups 448(%rsp), %zmm23
+ vmovups 384(%rsp), %zmm24
+ vmovups 320(%rsp), %zmm25
+ vmovups 256(%rsp), %zmm27
+ vmovups 192(%rsp), %zmm28
+ vmovups 128(%rsp), %zmm29
+ vmovups 64(%rsp), %zmm30
+ vmovups (%rsp), %zmm31
+ vmovups 1216(%rsp), %zmm2
+ movq 1000(%rsp), %rsi
+ movq 992(%rsp), %rdi
+ movq 1032(%rsp), %r12
+ cfi_restore (%r12)
+ movq 1024(%rsp), %r13
+ cfi_restore (%r13)
+ movq 1016(%rsp), %r14
+ cfi_restore (%r14)
+ movq 1008(%rsp), %r15
+ cfi_restore (%r15)
+ jmp .LBL_2_2
+
+.LBL_2_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ vmovss 1156(%rsp,%r15,8), %xmm1
+ vzeroupper
+ vmovss 1092(%rsp,%r15,8), %xmm0
+ call powf@PLT
+ vmovss %xmm0, 1220(%rsp,%r15,8)
+ jmp .LBL_2_8
+
+.LBL_2_12:
+ movzbl %r12b, %r15d
+ vmovss 1152(%rsp,%r15,8), %xmm1
+ vzeroupper
+ vmovss 1088(%rsp,%r15,8), %xmm0
+ call powf@PLT
+ vmovss %xmm0, 1216(%rsp,%r15,8)
+ jmp .LBL_2_7
+#endif
+END (_ZGVeN16vv_powf_skx)
+
+ .section .rodata, "a"
+.L_2il0floatpacket.23:
+ .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
+ .type .L_2il0floatpacket.23,@object
+.L_2il0floatpacket.24:
+ .long 0xffffffff,0xffffffff
+ .type .L_2il0floatpacket.24,@object
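
Step 3 of the description above (split y*log2(x) into N + j/2^expK + Z, then
rebuild 2^N * 2^(j/2^expK) * 2^Z) can be modelled in scalar C as below.  A
minimal sketch under simplifying assumptions: exp2() recomputes the table
value instead of reading the THI+TLO pair from the data section, a short
Taylor polynomial replaces the tuned 2^Z polynomial, inputs are assumed finite
with x > 0, and the names (exp2_by_table, pow_sketch, EXPK) are illustrative.

#include <math.h>
#include <stdio.h>

#define EXPK 7                                 /* table resolution: 2^EXPK slots */

static const double ln2 = 0.6931471805599453;

/* 2^t via the N + j/2^EXPK + Z split described in step 3.  */
static double
exp2_by_table (double t)
{
  double scaled = t * (1 << EXPK);
  double nj = floor (scaled + 0.5);            /* N*2^EXPK + j */
  double z = (scaled - nj) / (1 << EXPK);      /* |Z| <= 2^(-EXPK-1) */
  long n = (long) floor (nj / (1 << EXPK));    /* integer exponent N */
  long j = (long) nj - n * (1 << EXPK);        /* table index, 0 <= j < 2^EXPK */
  double tj = exp2 ((double) j / (1 << EXPK)); /* THI+TLO table value in the real code */
  double zl = z * ln2;
  double p = 1.0 + zl * (1.0 + zl * (0.5 + zl / 6.0));  /* approx 2^Z */
  return ldexp (tj * p, (int) n);
}

static double
pow_sketch (double x, double y)                /* assumes x > 0, finite inputs */
{
  return exp2_by_table (y * log2 (x));
}

int
main (void)
{
  printf ("%.12g vs %.12g\n", pow_sketch (3.0, 2.5), pow (3.0, 2.5));
  return 0;
}
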
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core.S
new file mode 100644
index 0000000000..f88b9ca6d4
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core.S
@@ -0,0 +1,38 @@
+/* Multiple versions of vectorized powf.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+ .text
+ENTRY (_ZGVbN4vv_powf)
+ .type _ZGVbN4vv_powf, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq _ZGVbN4vv_powf_sse4(%rip), %rax
+ testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+ jz 2f
+ ret
+2: leaq _ZGVbN4vv_powf_sse2(%rip), %rax
+ ret
+END (_ZGVbN4vv_powf)
+libmvec_hidden_def (_ZGVbN4vv_powf)
+
+#define _ZGVbN4vv_powf _ZGVbN4vv_powf_sse2
+#include "../svml_s_powf4_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core_sse4.S
new file mode 100644
index 0000000000..6068f51c46
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core_sse4.S
@@ -0,0 +1,374 @@
+/* Function powf vectorized with SSE4.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "svml_s_powf_data.h"
+
+ .text
+ENTRY (_ZGVbN4vv_powf_sse4)
+/*
+ ALGORITHM DESCRIPTION:
+
+ We use the following identity: pow(x,y) = 2^(y * log2(x)).
+
+ 1) log2(x) calculation
+ Here we use the following formula.
+ Let |x|=2^k1*X1, where k1 is integer, 1<=X1<2.
+ Let C ~= 1/ln(2),
+ Rcp1 ~= 1/X1, X2=Rcp1*X1,
+ Rcp2 ~= 1/X2, X3=Rcp2*X2,
+ Rcp3 ~= 1/X3, Rcp3C ~= C/X3.
+ Then
+ log2|x| = k1 + log2(1/Rcp1) + log2(1/Rcp2) + log2(C/Rcp3C) +
+ log2(X1*Rcp1*Rcp2*Rcp3C/C),
+ where X1*Rcp1*Rcp2*Rcp3C = C*(1+q), q is very small.
+
+ The values of Rcp1, log2(1/Rcp1), Rcp2, log2(1/Rcp2),
+ Rcp3C, log2(C/Rcp3C) are taken from tables.
+ Values of Rcp1, Rcp2, Rcp3C are such that RcpC=Rcp1*Rcp2*Rcp3C
+ is exactly represented in target precision.
+
+ log2(X1*Rcp1*Rcp2*Rcp3C/C) = log2(1+q) = ln(1+q)/ln2 =
+ = 1/(ln2)*q - 1/(2ln2)*q^2 + 1/(3ln2)*q^3 - ... =
+ = 1/(C*ln2)*cq - 1/(2*C^2*ln2)*cq^2 + 1/(3*C^3*ln2)*cq^3 - ... =
+ = (1 + a1)*cq + a2*cq^2 + a3*cq^3 + ...,
+ where
+ cq=X1*Rcp1*Rcp2*Rcp3C-C,
+ a1=1/(C*ln(2))-1 is small,
+ a2=1/(2*C^2*ln2),
+ a3=1/(3*C^3*ln2),
+ ...
+ The log2 result is split into three parts: HH+HL+HLL
+
+ 2) Calculation of y*log2(x)
+ Split y into YHi+YLo.
+ Get high PH and medium PL parts of y*log2|x|.
+ Get low PLL part of y*log2|x|.
+ Now we have PH+PL+PLL ~= y*log2|x|.
+
+ 3) Calculation of 2^(y*log2(x))
+ Let's represent PH+PL+PLL in the form N + j/2^expK + Z,
+ where expK=7 in this implementation, N and j are integers,
+ 0<=j<=2^expK-1, |Z|<2^(-expK-1). Hence
+ 2^(PH+PL+PLL) ~= 2^N * 2^(j/2^expK) * 2^Z,
+ where 2^(j/2^expK) is stored in a table, and
+ 2^Z ~= 1 + B1*Z + B2*Z^2 ... + B5*Z^5.
+ We compute 2^(PH+PL+PLL) as follows:
+ Break PH into PHH + PHL, where PHH = N + j/2^expK.
+ Z = PHL + PL + PLL
+ Exp2Poly = B1*Z + B2*Z^2 ... + B5*Z^5
+ Get 2^(j/2^expK) from table in the form THI+TLO.
+ Now we have 2^(PH+PL+PLL) ~= 2^N * (THI + TLO) * (1 + Exp2Poly).
+ Get significand of 2^(PH+PL+PLL) in the form ResHi+ResLo:
+ ResHi := THI
+ ResLo := THI * Exp2Poly + TLO
+ Get exponent ERes of the result:
+ Res := ResHi + ResLo:
+ Result := ex(Res) + N.  (A scalar C sketch of the log2 step follows
+ this file's diff.)  */
+
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $256, %rsp
+ movaps %xmm0, %xmm3
+ movhlps %xmm0, %xmm3
+ movaps %xmm1, %xmm5
+ movups %xmm8, 112(%rsp)
+ movaps %xmm5, %xmm2
+ cvtps2pd %xmm3, %xmm8
+ cvtps2pd %xmm5, %xmm7
+ movups %xmm9, 96(%rsp)
+ movaps %xmm0, %xmm4
+ cvtps2pd %xmm0, %xmm9
+ movq __svml_spow_data@GOTPCREL(%rip), %rdx
+ movups %xmm10, 176(%rsp)
+ movups %xmm13, 48(%rsp)
+ movups _ExpMask(%rdx), %xmm6
+
+/* preserve mantissa, set input exponent to 2^(-10) */
+ movaps %xmm6, %xmm10
+ andps %xmm8, %xmm6
+ andps %xmm9, %xmm10
+
+/* exponent bits selection */
+ psrlq $20, %xmm9
+ orps _Two10(%rdx), %xmm6
+ psrlq $20, %xmm8
+ orps _Two10(%rdx), %xmm10
+
+/* reciprocal approximation good to at least 11 bits */
+ cvtpd2ps %xmm6, %xmm13
+ cvtpd2ps %xmm10, %xmm1
+ movlhps %xmm13, %xmm13
+ movhlps %xmm5, %xmm2
+ movlhps %xmm1, %xmm1
+ movups %xmm12, 208(%rsp)
+ rcpps %xmm13, %xmm12
+ movups %xmm11, 80(%rsp)
+ cvtps2pd %xmm2, %xmm11
+ rcpps %xmm1, %xmm2
+ movups %xmm14, 144(%rsp)
+ cvtps2pd %xmm12, %xmm14
+ movups %xmm15, 160(%rsp)
+ cvtps2pd %xmm2, %xmm15
+ shufps $221, %xmm8, %xmm9
+
+/* round reciprocal to nearest integer, will have 1+9 mantissa bits */
+ roundpd $0, %xmm14, %xmm14
+
+/* biased exponent in DP format */
+ pshufd $238, %xmm9, %xmm8
+ roundpd $0, %xmm15, %xmm15
+ cvtdq2pd %xmm8, %xmm1
+ mulpd %xmm15, %xmm10
+ mulpd %xmm14, %xmm6
+ cvtdq2pd %xmm9, %xmm2
+ subpd _One(%rdx), %xmm10
+ subpd _One(%rdx), %xmm6
+
+/* table lookup */
+ movaps %xmm14, %xmm8
+ movaps %xmm15, %xmm9
+ psrlq $40, %xmm8
+ psrlq $40, %xmm9
+ movd %xmm8, %r8d
+ movd %xmm9, %eax
+ psubd _NMINNORM(%rdx), %xmm4
+ movdqu _ABSMASK(%rdx), %xmm3
+ pextrd $2, %xmm8, %r9d
+ pand %xmm5, %xmm3
+ movups _Threshold(%rdx), %xmm8
+ pextrd $2, %xmm9, %ecx
+ movaps %xmm8, %xmm9
+ cmpltpd %xmm15, %xmm9
+ cmpltpd %xmm14, %xmm8
+ andps _Bias(%rdx), %xmm9
+ movaps %xmm10, %xmm14
+ andps _Bias(%rdx), %xmm8
+ movaps %xmm6, %xmm15
+ orps _Bias1(%rdx), %xmm9
+ orps _Bias1(%rdx), %xmm8
+ subpd %xmm9, %xmm2
+ subpd %xmm8, %xmm1
+ mulpd %xmm10, %xmm14
+ mulpd %xmm6, %xmm15
+ mulpd _L2(%rdx), %xmm2
+ mulpd _L2(%rdx), %xmm1
+ movups _poly_coeff_3(%rdx), %xmm9
+ movaps %xmm9, %xmm8
+ mulpd %xmm10, %xmm8
+ mulpd %xmm6, %xmm9
+ addpd _poly_coeff_4(%rdx), %xmm8
+ addpd _poly_coeff_4(%rdx), %xmm9
+ mulpd %xmm14, %xmm8
+ mulpd %xmm15, %xmm9
+
+/* reconstruction */
+ addpd %xmm8, %xmm10
+ addpd %xmm9, %xmm6
+ movslq %eax, %rax
+ movslq %r8d, %r8
+ movslq %ecx, %rcx
+ movslq %r9d, %r9
+ movsd _Log2Rcp_lookup(%rdx,%rax), %xmm13
+ movsd _Log2Rcp_lookup(%rdx,%r8), %xmm12
+ movhpd _Log2Rcp_lookup(%rdx,%rcx), %xmm13
+ movhpd _Log2Rcp_lookup(%rdx,%r9), %xmm12
+ addpd %xmm10, %xmm13
+ addpd %xmm6, %xmm12
+ addpd %xmm13, %xmm2
+ addpd %xmm12, %xmm1
+ mulpd %xmm7, %xmm2
+ mulpd %xmm11, %xmm1
+ movups __dbInvLn2(%rdx), %xmm11
+ movdqa %xmm4, %xmm12
+ movaps %xmm11, %xmm10
+ mulpd %xmm2, %xmm10
+ mulpd %xmm1, %xmm11
+
+/* to round down; if dR is an integer we will get R = 1, which is ok */
+ movaps %xmm10, %xmm8
+ movaps %xmm11, %xmm9
+ subpd __dbHALF(%rdx), %xmm8
+ subpd __dbHALF(%rdx), %xmm9
+ addpd __dbShifter(%rdx), %xmm8
+ addpd __dbShifter(%rdx), %xmm9
+ movaps %xmm8, %xmm6
+ movaps %xmm9, %xmm7
+ subpd __dbShifter(%rdx), %xmm6
+ subpd __dbShifter(%rdx), %xmm7
+
+/* [0..1) */
+ subpd %xmm6, %xmm10
+ subpd %xmm7, %xmm11
+ mulpd __dbC1(%rdx), %xmm10
+ mulpd __dbC1(%rdx), %xmm11
+
+/* hi bits */
+ shufps $221, %xmm1, %xmm2
+ movdqu _NMAXVAL(%rdx), %xmm1
+ pcmpgtd %xmm1, %xmm12
+ pcmpeqd %xmm1, %xmm4
+ por %xmm4, %xmm12
+ movdqa %xmm3, %xmm1
+ movdqu _INF(%rdx), %xmm4
+ pcmpgtd %xmm4, %xmm1
+ pcmpeqd %xmm4, %xmm3
+
+/* iAbsX = iAbsX&iAbsMask */
+ pand __iAbsMask(%rdx), %xmm2
+ por %xmm3, %xmm1
+
+/* iRangeMask = (iAbsX>iDomainRange) */
+ pcmpgtd __iDomainRange(%rdx), %xmm2
+ por %xmm1, %xmm12
+ movups __lbLOWKBITS(%rdx), %xmm3
+ por %xmm2, %xmm12
+
+/* low K bits */
+ movaps %xmm3, %xmm2
+ andps %xmm9, %xmm3
+ andps %xmm8, %xmm2
+ psrlq $11, %xmm8
+
+/* dpP= _dbT+lJ*T_ITEM_GRAN */
+ movd %xmm2, %r10d
+ psrlq $11, %xmm9
+ movd %xmm3, %ecx
+
+/* NB : including +/- sign for the exponent!! */
+ psllq $52, %xmm8
+ psllq $52, %xmm9
+ pextrw $4, %xmm2, %r11d
+ pextrw $4, %xmm3, %r8d
+ movmskps %xmm12, %eax
+ shll $3, %r10d
+ shll $3, %ecx
+ shll $3, %r11d
+ shll $3, %r8d
+ movq 13952(%rdx,%r10), %xmm6
+ movq 13952(%rdx,%rcx), %xmm7
+ movhpd 13952(%rdx,%r11), %xmm6
+ movhpd 13952(%rdx,%r8), %xmm7
+ mulpd %xmm6, %xmm10
+ mulpd %xmm7, %xmm11
+ addpd %xmm10, %xmm6
+ addpd %xmm11, %xmm7
+ paddq %xmm8, %xmm6
+ paddq %xmm9, %xmm7
+ cvtpd2ps %xmm6, %xmm1
+ cvtpd2ps %xmm7, %xmm4
+ movlhps %xmm4, %xmm1
+ testl %eax, %eax
+ jne .LBL_1_3
+
+.LBL_1_2:
+ cfi_remember_state
+ movups 112(%rsp), %xmm8
+ movaps %xmm1, %xmm0
+ movups 96(%rsp), %xmm9
+ movups 176(%rsp), %xmm10
+ movups 80(%rsp), %xmm11
+ movups 208(%rsp), %xmm12
+ movups 48(%rsp), %xmm13
+ movups 144(%rsp), %xmm14
+ movups 160(%rsp), %xmm15
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+
+.LBL_1_3:
+ cfi_restore_state
+ movups %xmm0, 64(%rsp)
+ movups %xmm5, 128(%rsp)
+ movups %xmm1, 192(%rsp)
+ je .LBL_1_2
+
+ xorb %cl, %cl
+ xorl %edx, %edx
+ movq %rsi, 8(%rsp)
+ movq %rdi, (%rsp)
+ movq %r12, 40(%rsp)
+ cfi_offset_rel_rsp (12, 40)
+ movb %cl, %r12b
+ movq %r13, 32(%rsp)
+ cfi_offset_rel_rsp (13, 32)
+ movl %eax, %r13d
+ movq %r14, 24(%rsp)
+ cfi_offset_rel_rsp (14, 24)
+ movl %edx, %r14d
+ movq %r15, 16(%rsp)
+ cfi_offset_rel_rsp (15, 16)
+ cfi_remember_state
+
+.LBL_1_6:
+ btl %r14d, %r13d
+ jc .LBL_1_12
+
+.LBL_1_7:
+ lea 1(%r14), %esi
+ btl %esi, %r13d
+ jc .LBL_1_10
+
+.LBL_1_8:
+ incb %r12b
+ addl $2, %r14d
+ cmpb $16, %r12b
+ jb .LBL_1_6
+
+ movq 8(%rsp), %rsi
+ movq (%rsp), %rdi
+ movq 40(%rsp), %r12
+ cfi_restore (%r12)
+ movq 32(%rsp), %r13
+ cfi_restore (%r13)
+ movq 24(%rsp), %r14
+ cfi_restore (%r14)
+ movq 16(%rsp), %r15
+ cfi_restore (%r15)
+ movups 192(%rsp), %xmm1
+ jmp .LBL_1_2
+
+.LBL_1_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ movss 68(%rsp,%r15,8), %xmm0
+ movss 132(%rsp,%r15,8), %xmm1
+
+ call powf@PLT
+
+ movss %xmm0, 196(%rsp,%r15,8)
+ jmp .LBL_1_8
+
+.LBL_1_12:
+ movzbl %r12b, %r15d
+ movss 64(%rsp,%r15,8), %xmm0
+ movss 128(%rsp,%r15,8), %xmm1
+
+ call powf@PLT
+
+ movss %xmm0, 192(%rsp,%r15,8)
+ jmp .LBL_1_7
+
+END (_ZGVbN4vv_powf_sse4)
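
Step 1 of the description above reduces log2|x| to a table value for a
coarsely rounded reciprocal of the mantissa plus a short polynomial in the
small residual q.  A minimal scalar C sketch, assuming x > 0 and finite: it
uses a single reciprocal stage rounded to 9 fractional bits and recomputes
log2 of that rounded value instead of reading _Log2Rcp_lookup; the names
log2_sketch and invln2 are illustrative.

#include <math.h>
#include <stdio.h>

static const double invln2 = 1.4426950408889634;   /* 1/ln(2) */

/* Scalar model of the log2 step, one reciprocal stage only.  */
static double
log2_sketch (double x)                 /* assumes x > 0 and finite */
{
  int k;
  double m = frexp (x, &k);            /* x = m * 2^k, m in [0.5, 1) */

  /* Round 1/m to 9 fractional bits; log2 of this rounded value is what
     the real code fetches from its lookup table.  */
  double rcp = nearbyint ((1.0 / m) * 512.0) / 512.0;
  double q = m * rcp - 1.0;            /* residual, |q| < 2^-10 */

  /* log2(1+q) by a short polynomial in q.  */
  double p = invln2 * (q - 0.5 * q * q + q * q * q / 3.0);
  return (double) k - log2 (rcp) + p;
}

int
main (void)
{
  printf ("%.15g vs %.15g\n", log2_sketch (7.25), log2 (7.25));
  return 0;
}
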
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core.S
new file mode 100644
index 0000000000..4552e573a9
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core.S
@@ -0,0 +1,38 @@
+/* Multiple versions of vectorized powf.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+ .text
+ENTRY (_ZGVdN8vv_powf)
+ .type _ZGVdN8vv_powf, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq _ZGVdN8vv_powf_avx2(%rip), %rax
+ testl $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+ jz 2f
+ ret
+2: leaq _ZGVdN8vv_powf_sse_wrapper(%rip), %rax
+ ret
+END (_ZGVdN8vv_powf)
+libmvec_hidden_def (_ZGVdN8vv_powf)
+
+#define _ZGVdN8vv_powf _ZGVdN8vv_powf_sse_wrapper
+#include "../svml_s_powf8_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core_avx2.S
new file mode 100644
index 0000000000..cfb86c7851
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core_avx2.S
@@ -0,0 +1,357 @@
+/* Function powf vectorized with AVX2.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "svml_s_powf_data.h"
+
+ .text
+ENTRY(_ZGVdN8vv_powf_avx2)
+/*
+ ALGORITHM DESCRIPTION:
+
+ We use the following identity: pow(x,y) = 2^(y * log2(x)).
+
+ 1) log2(x) calculation
+ Here we use the following formula.
+ Let |x|=2^k1*X1, where k1 is integer, 1<=X1<2.
+ Let C ~= 1/ln(2),
+ Rcp1 ~= 1/X1, X2=Rcp1*X1,
+ Rcp2 ~= 1/X2, X3=Rcp2*X2,
+ Rcp3 ~= 1/X3, Rcp3C ~= C/X3.
+ Then
+ log2|x| = k1 + log2(1/Rcp1) + log2(1/Rcp2) + log2(C/Rcp3C) +
+ log2(X1*Rcp1*Rcp2*Rcp3C/C),
+ where X1*Rcp1*Rcp2*Rcp3C = C*(1+q), q is very small.
+
+ The values of Rcp1, log2(1/Rcp1), Rcp2, log2(1/Rcp2),
+ Rcp3C, log2(C/Rcp3C) are taken from tables.
+ Values of Rcp1, Rcp2, Rcp3C are such that RcpC=Rcp1*Rcp2*Rcp3C
+ is exactly represented in target precision.
+
+ log2(X1*Rcp1*Rcp2*Rcp3C/C) = log2(1+q) = ln(1+q)/ln2 =
+ = 1/(ln2)*q - 1/(2ln2)*q^2 + 1/(3ln2)*q^3 - ... =
+ = 1/(C*ln2)*cq - 1/(2*C^2*ln2)*cq^2 + 1/(3*C^3*ln2)*cq^3 - ... =
+ = (1 + a1)*cq + a2*cq^2 + a3*cq^3 + ...,
+ where
+ cq=X1*Rcp1*Rcp2*Rcp3C-C,
+ a1=1/(C*ln(2))-1 is small,
+ a2=1/(2*C^2*ln2),
+ a3=1/(3*C^3*ln2),
+ ...
+ The log2 result is split into three parts: HH+HL+HLL
+
+ 2) Calculation of y*log2(x)
+ Split y into YHi+YLo.
+ Get high PH and medium PL parts of y*log2|x|.
+ Get low PLL part of y*log2|x|.
+ Now we have PH+PL+PLL ~= y*log2|x|.
+
+ 3) Calculation of 2^(y*log2(x))
+ Let's represent PH+PL+PLL in the form N + j/2^expK + Z,
+ where expK=7 in this implementation, N and j are integers,
+ 0<=j<=2^expK-1, |Z|<2^(-expK-1). Hence
+ 2^(PH+PL+PLL) ~= 2^N * 2^(j/2^expK) * 2^Z,
+ where 2^(j/2^expK) is stored in a table, and
+ 2^Z ~= 1 + B1*Z + B2*Z^2 ... + B5*Z^5.
+ We compute 2^(PH+PL+PLL) as follows:
+ Break PH into PHH + PHL, where PHH = N + j/2^expK.
+ Z = PHL + PL + PLL
+ Exp2Poly = B1*Z + B2*Z^2 ... + B5*Z^5
+ Get 2^(j/2^expK) from table in the form THI+TLO.
+ Now we have 2^(PH+PL+PLL) ~= 2^N * (THI + TLO) * (1 + Exp2Poly).
+ Get significand of 2^(PH+PL+PLL) in the form ResHi+ResLo:
+ ResHi := THI
+ ResLo := THI * Exp2Poly + TLO
+ Get exponent ERes of the result:
+ Res := ResHi + ResLo:
+ Result := ex(Res) + N. */
+
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $448, %rsp
+ lea __VPACK_ODD_ind.6357.0.1(%rip), %rcx
+ vmovups %ymm14, 320(%rsp)
+
+/* hi bits */
+ lea __VPACK_ODD_ind.6358.0.1(%rip), %rax
+ vmovups %ymm12, 256(%rsp)
+ vmovups %ymm9, 96(%rsp)
+ vmovups %ymm13, 224(%rsp)
+ vmovups %ymm15, 352(%rsp)
+ vmovups %ymm11, 384(%rsp)
+ vmovups %ymm10, 288(%rsp)
+ vmovups (%rcx), %ymm10
+ vmovups %ymm8, 160(%rsp)
+ vmovdqa %ymm1, %ymm9
+ movq __svml_spow_data@GOTPCREL(%rip), %rdx
+ vextractf128 $1, %ymm0, %xmm7
+ vcvtps2pd %xmm0, %ymm14
+ vcvtps2pd %xmm7, %ymm12
+ vpsubd _NMINNORM(%rdx), %ymm0, %ymm7
+
+/* preserve mantissa, set input exponent to 2^(-10) */
+ vandpd _ExpMask(%rdx), %ymm14, %ymm3
+ vandpd _ExpMask(%rdx), %ymm12, %ymm13
+
+/* exponent bits selection */
+ vpsrlq $20, %ymm12, %ymm12
+ vpsrlq $20, %ymm14, %ymm14
+ vextractf128 $1, %ymm9, %xmm2
+ vcvtps2pd %xmm9, %ymm1
+ vpand _ABSMASK(%rdx), %ymm9, %ymm8
+ vcvtps2pd %xmm2, %ymm6
+ vorpd _Two10(%rdx), %ymm3, %ymm2
+ vorpd _Two10(%rdx), %ymm13, %ymm3
+
+/* reciprocal approximation good to at least 11 bits */
+ vcvtpd2ps %ymm2, %xmm5
+ vcvtpd2ps %ymm3, %xmm15
+ vrcpps %xmm5, %xmm4
+ vrcpps %xmm15, %xmm11
+ vcvtps2pd %xmm4, %ymm13
+ vcvtps2pd %xmm11, %ymm4
+ vpermps %ymm12, %ymm10, %ymm11
+
+/* round reciprocal to nearest integer, will have 1+9 mantissa bits */
+ vroundpd $0, %ymm13, %ymm12
+ vpermps %ymm14, %ymm10, %ymm5
+ vroundpd $0, %ymm4, %ymm14
+ vmovupd _One(%rdx), %ymm4
+
+/* table lookup */
+ vpsrlq $40, %ymm12, %ymm10
+ vfmsub213pd %ymm4, %ymm12, %ymm2
+ vfmsub213pd %ymm4, %ymm14, %ymm3
+ vcmpgt_oqpd _Threshold(%rdx), %ymm12, %ymm12
+ vxorpd %ymm4, %ymm4, %ymm4
+ vandpd _Bias(%rdx), %ymm12, %ymm12
+
+/* biased exponent in DP format */
+ vcvtdq2pd %xmm11, %ymm13
+ vpcmpeqd %ymm11, %ymm11, %ymm11
+ vgatherqpd %ymm11, _Log2Rcp_lookup(%rdx,%ymm10), %ymm4
+ vpsrlq $40, %ymm14, %ymm10
+ vcmpgt_oqpd _Threshold(%rdx), %ymm14, %ymm14
+ vpcmpeqd %ymm11, %ymm11, %ymm11
+ vandpd _Bias(%rdx), %ymm14, %ymm14
+ vcvtdq2pd %xmm5, %ymm15
+ vxorpd %ymm5, %ymm5, %ymm5
+ vgatherqpd %ymm11, _Log2Rcp_lookup(%rdx,%ymm10), %ymm5
+ vorpd _Bias1(%rdx), %ymm12, %ymm11
+ vorpd _Bias1(%rdx), %ymm14, %ymm10
+ vsubpd %ymm11, %ymm15, %ymm11
+ vsubpd %ymm10, %ymm13, %ymm14
+ vmovupd _poly_coeff_4(%rdx), %ymm15
+ vmovupd _poly_coeff_3(%rdx), %ymm13
+ vmulpd %ymm3, %ymm3, %ymm10
+ vfmadd213pd %ymm15, %ymm3, %ymm13
+ vmovdqa %ymm15, %ymm12
+ vfmadd231pd _poly_coeff_3(%rdx), %ymm2, %ymm12
+ vmulpd %ymm2, %ymm2, %ymm15
+
+/* reconstruction */
+ vfmadd213pd %ymm3, %ymm10, %ymm13
+ vfmadd213pd %ymm2, %ymm15, %ymm12
+ vaddpd %ymm5, %ymm13, %ymm13
+ vaddpd %ymm4, %ymm12, %ymm2
+ vfmadd231pd _L2(%rdx), %ymm14, %ymm13
+ vfmadd132pd _L2(%rdx), %ymm2, %ymm11
+ vmulpd %ymm6, %ymm13, %ymm2
+ vmulpd %ymm1, %ymm11, %ymm10
+ vmulpd __dbInvLn2(%rdx), %ymm2, %ymm6
+ vmulpd __dbInvLn2(%rdx), %ymm10, %ymm15
+
+/* to round down; if dR is an integer we will get R = 1, which is ok */
+ vsubpd __dbHALF(%rdx), %ymm6, %ymm3
+ vsubpd __dbHALF(%rdx), %ymm15, %ymm1
+ vaddpd __dbShifter(%rdx), %ymm3, %ymm13
+ vaddpd __dbShifter(%rdx), %ymm1, %ymm14
+ vsubpd __dbShifter(%rdx), %ymm13, %ymm12
+ vmovups (%rax), %ymm1
+ vsubpd __dbShifter(%rdx), %ymm14, %ymm11
+
+/* [0..1) */
+ vsubpd %ymm12, %ymm6, %ymm6
+ vpermps %ymm10, %ymm1, %ymm3
+ vpermps %ymm2, %ymm1, %ymm10
+ vpcmpgtd _NMAXVAL(%rdx), %ymm7, %ymm4
+ vpcmpgtd _INF(%rdx), %ymm8, %ymm1
+ vpcmpeqd _NMAXVAL(%rdx), %ymm7, %ymm7
+ vpcmpeqd _INF(%rdx), %ymm8, %ymm8
+ vpor %ymm7, %ymm4, %ymm2
+ vpor %ymm8, %ymm1, %ymm1
+ vsubpd %ymm11, %ymm15, %ymm7
+ vinsertf128 $1, %xmm10, %ymm3, %ymm10
+ vpor %ymm1, %ymm2, %ymm3
+
+/* iAbsX = iAbsX&iAbsMask */
+ vandps __iAbsMask(%rdx), %ymm10, %ymm10
+
+/* iRangeMask = (iAbsX>iDomainRange) */
+ vpcmpgtd __iDomainRange(%rdx), %ymm10, %ymm4
+ vpor %ymm4, %ymm3, %ymm5
+ vmulpd __dbC1(%rdx), %ymm7, %ymm4
+ vmovmskps %ymm5, %ecx
+ vmulpd __dbC1(%rdx), %ymm6, %ymm5
+
+/* low K bits */
+ vandps __lbLOWKBITS(%rdx), %ymm14, %ymm6
+
+/* dpP= _dbT+lJ*T_ITEM_GRAN */
+ vxorpd %ymm7, %ymm7, %ymm7
+ vpcmpeqd %ymm1, %ymm1, %ymm1
+ vandps __lbLOWKBITS(%rdx), %ymm13, %ymm2
+ vxorpd %ymm10, %ymm10, %ymm10
+ vpcmpeqd %ymm3, %ymm3, %ymm3
+ vgatherqpd %ymm1, 13952(%rdx,%ymm6,8), %ymm7
+ vgatherqpd %ymm3, 13952(%rdx,%ymm2,8), %ymm10
+ vpsrlq $11, %ymm14, %ymm14
+ vpsrlq $11, %ymm13, %ymm13
+ vfmadd213pd %ymm7, %ymm4, %ymm7
+ vfmadd213pd %ymm10, %ymm5, %ymm10
+
+/* NB : including +/- sign for the exponent!! */
+ vpsllq $52, %ymm14, %ymm8
+ vpsllq $52, %ymm13, %ymm11
+ vpaddq %ymm8, %ymm7, %ymm12
+ vpaddq %ymm11, %ymm10, %ymm1
+ vcvtpd2ps %ymm12, %xmm15
+ vcvtpd2ps %ymm1, %xmm2
+ vinsertf128 $1, %xmm2, %ymm15, %ymm1
+ testl %ecx, %ecx
+ jne .LBL_1_3
+
+.LBL_1_2:
+ cfi_remember_state
+ vmovups 160(%rsp), %ymm8
+ vmovups 96(%rsp), %ymm9
+ vmovups 288(%rsp), %ymm10
+ vmovups 384(%rsp), %ymm11
+ vmovups 256(%rsp), %ymm12
+ vmovups 224(%rsp), %ymm13
+ vmovups 320(%rsp), %ymm14
+ vmovups 352(%rsp), %ymm15
+ vmovdqa %ymm1, %ymm0
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+
+.LBL_1_3:
+ cfi_restore_state
+ vmovups %ymm0, 64(%rsp)
+ vmovups %ymm9, 128(%rsp)
+ vmovups %ymm1, 192(%rsp)
+ je .LBL_1_2
+
+ xorb %dl, %dl
+ xorl %eax, %eax
+ movq %rsi, 8(%rsp)
+ movq %rdi, (%rsp)
+ movq %r12, 40(%rsp)
+ cfi_offset_rel_rsp (12, 40)
+ movb %dl, %r12b
+ movq %r13, 32(%rsp)
+ cfi_offset_rel_rsp (13, 32)
+ movl %ecx, %r13d
+ movq %r14, 24(%rsp)
+ cfi_offset_rel_rsp (14, 24)
+ movl %eax, %r14d
+ movq %r15, 16(%rsp)
+ cfi_offset_rel_rsp (15, 16)
+ cfi_remember_state
+
+.LBL_1_6:
+ btl %r14d, %r13d
+ jc .LBL_1_12
+
+.LBL_1_7:
+ lea 1(%r14), %esi
+ btl %esi, %r13d
+ jc .LBL_1_10
+
+.LBL_1_8:
+ incb %r12b
+ addl $2, %r14d
+ cmpb $16, %r12b
+ jb .LBL_1_6
+
+ movq 8(%rsp), %rsi
+ movq (%rsp), %rdi
+ movq 40(%rsp), %r12
+ cfi_restore (%r12)
+ movq 32(%rsp), %r13
+ cfi_restore (%r13)
+ movq 24(%rsp), %r14
+ cfi_restore (%r14)
+ movq 16(%rsp), %r15
+ cfi_restore (%r15)
+ vmovups 192(%rsp), %ymm1
+ jmp .LBL_1_2
+
+.LBL_1_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ vmovss 68(%rsp,%r15,8), %xmm0
+ vmovss 132(%rsp,%r15,8), %xmm1
+ vzeroupper
+
+ call powf@PLT
+
+ vmovss %xmm0, 196(%rsp,%r15,8)
+ jmp .LBL_1_8
+
+.LBL_1_12:
+ movzbl %r12b, %r15d
+ vmovss 64(%rsp,%r15,8), %xmm0
+ vmovss 128(%rsp,%r15,8), %xmm1
+ vzeroupper
+
+ call powf@PLT
+
+ vmovss %xmm0, 192(%rsp,%r15,8)
+ jmp .LBL_1_7
+
+END(_ZGVdN8vv_powf_avx2)
+
+ .section .rodata, "a"
+__VPACK_ODD_ind.6357.0.1:
+ .long 1
+ .long 3
+ .long 5
+ .long 7
+ .long 0
+ .long 0
+ .long 0
+ .long 0
+ .space 32, 0x00
+__VPACK_ODD_ind.6358.0.1:
+ .long 1
+ .long 3
+ .long 5
+ .long 7
+ .long 0
+ .long 0
+ .long 0
+ .long 0
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S
new file mode 100644
index 0000000000..bdcabab6e2
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S
@@ -0,0 +1,39 @@
+/* Multiple versions of vectorized sincosf.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+ .text
+ENTRY (_ZGVeN16vvv_sincosf)
+ .type _ZGVeN16vvv_sincosf, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq _ZGVeN16vvv_sincosf_skx(%rip), %rax
+ testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
+ jnz 2f
+ leaq _ZGVeN16vvv_sincosf_knl(%rip), %rax
+ testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
+ jnz 2f
+ leaq _ZGVeN16vvv_sincosf_avx2_wrapper(%rip), %rax
+2: ret
+END (_ZGVeN16vvv_sincosf)
+
+#define _ZGVeN16vvv_sincosf _ZGVeN16vvv_sincosf_avx2_wrapper
+#include "../svml_s_sincosf16_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
new file mode 100644
index 0000000000..efff91bb0d
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
@@ -0,0 +1,504 @@
+/* Function sincosf vectorized with AVX-512. KNL and SKX versions.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "svml_s_trig_data.h"
+#include "svml_s_wrapper_impl.h"
+
+/*
+ ALGORITHM DESCRIPTION:
+
+ 1) Range reduction to [-Pi/4; +Pi/4] interval
+ a) Grab sign from source argument and save it.
+ b) Remove sign using AND operation
+ c) Getting octant Y by 2/Pi multiplication
+ d) Add "Right Shifter" value
+ e) Treat obtained value as integer S for destination sign setting.
+ SS = ((S-S&1)&2)<<30; For sin part
+ SC = ((S+S&1)&2)<<30; For cos part
+ f) Change destination sign if source sign is negative
+ using XOR operation.
+ g) Subtract "Right Shifter" (0x4B000000) value
+ h) Subtract Y*(PI/2) from X argument, where PI/2 is divided into 4 parts:
+ X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
+ 2) Polynomial (minimax for sin within [-Pi/4; +Pi/4] interval)
+ a) Calculate X^2 = X * X
+ b) Calculate 2 polynomials for sin and cos:
+ RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3))));
+ RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4))));
+ c) Swap RS & RC if if first bit of obtained value after
+ Right Shifting is set to 1. Using And, Andnot & Or operations.
+ 3) Destination sign setting
+ a) Set shifted destination sign using XOR operation:
+ R1 = XOR( RS, SS );
+ R2 = XOR( RC, SC ).  (A scalar C sketch of this scheme follows this
+ file's diff.)  */
+
+ .text
+ENTRY (_ZGVeN16vvv_sincosf_knl)
+#ifndef HAVE_AVX512_ASM_SUPPORT
+WRAPPER_IMPL_AVX512_fFF _ZGVdN8vvv_sincosf
+#else
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $1344, %rsp
+ movq __svml_s_trig_data@GOTPCREL(%rip), %rax
+ vmovaps %zmm0, %zmm2
+ movl $-1, %edx
+ vmovups __sAbsMask(%rax), %zmm0
+ vmovups __sInvPI(%rax), %zmm3
+
+/* Absolute argument computation */
+ vpandd %zmm0, %zmm2, %zmm1
+ vmovups __sPI1_FMA(%rax), %zmm5
+ vmovups __sSignMask(%rax), %zmm9
+ vpandnd %zmm2, %zmm0, %zmm0
+
+/* h) Subtract Y*(PI/2) from X argument, where PI/2 is divided into 3 parts:
+ X = X - Y*PI1 - Y*PI2 - Y*PI3 */
+ vmovaps %zmm1, %zmm6
+ vmovaps %zmm1, %zmm8
+
+/* c) Getting octant Y by 2/Pi multiplication
+ d) Add "Right Shifter" value */
+ vfmadd213ps __sRShifter(%rax), %zmm1, %zmm3
+ vmovups __sPI3_FMA(%rax), %zmm7
+
+/* g) Subtract "Right Shifter" (0x4B000000) value */
+ vsubps __sRShifter(%rax), %zmm3, %zmm12
+
+/* e) Treat obtained value as integer S for destination sign setting */
+ vpslld $31, %zmm3, %zmm13
+ vmovups __sA7_FMA(%rax), %zmm14
+ vfnmadd231ps %zmm12, %zmm5, %zmm6
+
+/* 2) Polynomial (minimax for sin within [-Pi/4; +Pi/4] interval)
+ a) Calculate X^2 = X * X
+ b) Calculate 2 polynomials for sin and cos:
+ RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3))));
+ RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4)))) */
+ vmovaps %zmm14, %zmm15
+ vmovups __sA9_FMA(%rax), %zmm3
+ vcmpps $22, __sRangeReductionVal(%rax), %zmm1, %k1
+ vpbroadcastd %edx, %zmm1{%k1}{z}
+ vfnmadd231ps __sPI2_FMA(%rax), %zmm12, %zmm6
+ vptestmd %zmm1, %zmm1, %k0
+ vpandd %zmm6, %zmm9, %zmm11
+ kmovw %k0, %ecx
+ vpxord __sOneHalf(%rax), %zmm11, %zmm4
+
+/* Result sign calculations */
+ vpternlogd $150, %zmm13, %zmm9, %zmm11
+
+/* Add correction term 0.5 for cos() part */
+ vaddps %zmm4, %zmm12, %zmm10
+ vfnmadd213ps %zmm6, %zmm7, %zmm12
+ vfnmadd231ps %zmm10, %zmm5, %zmm8
+ vpxord %zmm13, %zmm12, %zmm13
+ vmulps %zmm13, %zmm13, %zmm12
+ vfnmadd231ps __sPI2_FMA(%rax), %zmm10, %zmm8
+ vfmadd231ps __sA9_FMA(%rax), %zmm12, %zmm15
+ vfnmadd213ps %zmm8, %zmm7, %zmm10
+ vfmadd213ps __sA5_FMA(%rax), %zmm12, %zmm15
+ vpxord %zmm11, %zmm10, %zmm5
+ vmulps %zmm5, %zmm5, %zmm4
+ vfmadd213ps __sA3(%rax), %zmm12, %zmm15
+ vfmadd213ps %zmm14, %zmm4, %zmm3
+ vmulps %zmm12, %zmm15, %zmm14
+ vfmadd213ps __sA5_FMA(%rax), %zmm4, %zmm3
+ vfmadd213ps %zmm13, %zmm13, %zmm14
+ vfmadd213ps __sA3(%rax), %zmm4, %zmm3
+ vpxord %zmm0, %zmm14, %zmm0
+ vmulps %zmm4, %zmm3, %zmm3
+ vfmadd213ps %zmm5, %zmm5, %zmm3
+ testl %ecx, %ecx
+ jne .LBL_1_3
+
+.LBL_1_2:
+ cfi_remember_state
+ vmovups %zmm0, (%rdi)
+ vmovups %zmm3, (%rsi)
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+
+.LBL_1_3:
+ cfi_restore_state
+ vmovups %zmm2, 1152(%rsp)
+ vmovups %zmm0, 1216(%rsp)
+ vmovups %zmm3, 1280(%rsp)
+ je .LBL_1_2
+
+ xorb %dl, %dl
+ kmovw %k4, 1048(%rsp)
+ xorl %eax, %eax
+ kmovw %k5, 1040(%rsp)
+ kmovw %k6, 1032(%rsp)
+ kmovw %k7, 1024(%rsp)
+ vmovups %zmm16, 960(%rsp)
+ vmovups %zmm17, 896(%rsp)
+ vmovups %zmm18, 832(%rsp)
+ vmovups %zmm19, 768(%rsp)
+ vmovups %zmm20, 704(%rsp)
+ vmovups %zmm21, 640(%rsp)
+ vmovups %zmm22, 576(%rsp)
+ vmovups %zmm23, 512(%rsp)
+ vmovups %zmm24, 448(%rsp)
+ vmovups %zmm25, 384(%rsp)
+ vmovups %zmm26, 320(%rsp)
+ vmovups %zmm27, 256(%rsp)
+ vmovups %zmm28, 192(%rsp)
+ vmovups %zmm29, 128(%rsp)
+ vmovups %zmm30, 64(%rsp)
+ vmovups %zmm31, (%rsp)
+ movq %rsi, 1056(%rsp)
+ movq %r12, 1096(%rsp)
+ cfi_offset_rel_rsp (12, 1096)
+ movb %dl, %r12b
+ movq %r13, 1088(%rsp)
+ cfi_offset_rel_rsp (13, 1088)
+ movl %eax, %r13d
+ movq %r14, 1080(%rsp)
+ cfi_offset_rel_rsp (14, 1080)
+ movl %ecx, %r14d
+ movq %r15, 1072(%rsp)
+ cfi_offset_rel_rsp (15, 1072)
+ movq %rbx, 1064(%rsp)
+ movq %rdi, %rbx
+ cfi_remember_state
+
+.LBL_1_6:
+ btl %r13d, %r14d
+ jc .LBL_1_13
+
+.LBL_1_7:
+ lea 1(%r13), %esi
+ btl %esi, %r14d
+ jc .LBL_1_10
+
+.LBL_1_8:
+ addb $1, %r12b
+ addl $2, %r13d
+ cmpb $16, %r12b
+ jb .LBL_1_6
+
+ movq %rbx, %rdi
+ kmovw 1048(%rsp), %k4
+ movq 1056(%rsp), %rsi
+ kmovw 1040(%rsp), %k5
+ movq 1096(%rsp), %r12
+ cfi_restore (%r12)
+ kmovw 1032(%rsp), %k6
+ movq 1088(%rsp), %r13
+ cfi_restore (%r13)
+ kmovw 1024(%rsp), %k7
+ vmovups 960(%rsp), %zmm16
+ vmovups 896(%rsp), %zmm17
+ vmovups 832(%rsp), %zmm18
+ vmovups 768(%rsp), %zmm19
+ vmovups 704(%rsp), %zmm20
+ vmovups 640(%rsp), %zmm21
+ vmovups 576(%rsp), %zmm22
+ vmovups 512(%rsp), %zmm23
+ vmovups 448(%rsp), %zmm24
+ vmovups 384(%rsp), %zmm25
+ vmovups 320(%rsp), %zmm26
+ vmovups 256(%rsp), %zmm27
+ vmovups 192(%rsp), %zmm28
+ vmovups 128(%rsp), %zmm29
+ vmovups 64(%rsp), %zmm30
+ vmovups (%rsp), %zmm31
+ movq 1080(%rsp), %r14
+ cfi_restore (%r14)
+ movq 1072(%rsp), %r15
+ cfi_restore (%r15)
+ movq 1064(%rsp), %rbx
+ vmovups 1216(%rsp), %zmm0
+ vmovups 1280(%rsp), %zmm3
+ jmp .LBL_1_2
+
+.LBL_1_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ vmovss 1156(%rsp,%r15,8), %xmm0
+
+ call sinf@PLT
+
+ vmovss %xmm0, 1220(%rsp,%r15,8)
+ vmovss 1156(%rsp,%r15,8), %xmm0
+
+ call cosf@PLT
+
+ vmovss %xmm0, 1284(%rsp,%r15,8)
+ jmp .LBL_1_8
+
+.LBL_1_13:
+ movzbl %r12b, %r15d
+ vmovss 1152(%rsp,%r15,8), %xmm0
+
+ call sinf@PLT
+
+ vmovss %xmm0, 1216(%rsp,%r15,8)
+ vmovss 1152(%rsp,%r15,8), %xmm0
+
+ call cosf@PLT
+
+ vmovss %xmm0, 1280(%rsp,%r15,8)
+ jmp .LBL_1_7
+#endif
+END (_ZGVeN16vvv_sincosf_knl)
+
+ENTRY (_ZGVeN16vvv_sincosf_skx)
+#ifndef HAVE_AVX512_ASM_SUPPORT
+WRAPPER_IMPL_AVX512_fFF _ZGVdN8vvv_sincosf
+#else
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $1344, %rsp
+ movq __svml_s_trig_data@GOTPCREL(%rip), %rax
+ vmovaps %zmm0, %zmm4
+ vmovups __sAbsMask(%rax), %zmm3
+ vmovups __sInvPI(%rax), %zmm5
+ vmovups __sRShifter(%rax), %zmm6
+ vmovups __sPI1_FMA(%rax), %zmm9
+ vmovups __sPI2_FMA(%rax), %zmm10
+ vmovups __sSignMask(%rax), %zmm14
+ vmovups __sOneHalf(%rax), %zmm7
+ vmovups __sPI3_FMA(%rax), %zmm12
+
+/* Absolute argument computation */
+ vandps %zmm3, %zmm4, %zmm2
+
+/* c) Getting octant Y by 2/Pi multiplication
+ d) Add "Right Shifter" value */
+ vfmadd213ps %zmm6, %zmm2, %zmm5
+ vcmpps $18, __sRangeReductionVal(%rax), %zmm2, %k1
+
+/* e) Treat obtained value as integer S for destination sign setting */
+ vpslld $31, %zmm5, %zmm0
+
+/* g) Subtract "Right Shifter" (0x4B000000) value */
+ vsubps %zmm6, %zmm5, %zmm5
+ vmovups __sA3(%rax), %zmm6
+
+/* h) Subtract Y*(PI/2) from X argument, where PI/2 is divided into 3 parts:
+ X = X - Y*PI1 - Y*PI2 - Y*PI3 */
+ vmovaps %zmm2, %zmm11
+ vfnmadd231ps %zmm5, %zmm9, %zmm11
+ vfnmadd231ps %zmm5, %zmm10, %zmm11
+ vandps %zmm11, %zmm14, %zmm1
+ vxorps %zmm1, %zmm7, %zmm8
+
+/* Result sign calculations */
+ vpternlogd $150, %zmm0, %zmm14, %zmm1
+ vmovups .L_2il0floatpacket.13(%rip), %zmm14
+
+/* Add correction term 0.5 for cos() part */
+ vaddps %zmm8, %zmm5, %zmm15
+ vfnmadd213ps %zmm11, %zmm12, %zmm5
+ vandnps %zmm4, %zmm3, %zmm11
+ vmovups __sA7_FMA(%rax), %zmm3
+ vmovaps %zmm2, %zmm13
+ vfnmadd231ps %zmm15, %zmm9, %zmm13
+ vxorps %zmm0, %zmm5, %zmm9
+ vmovups __sA5_FMA(%rax), %zmm0
+ vfnmadd231ps %zmm15, %zmm10, %zmm13
+ vmulps %zmm9, %zmm9, %zmm8
+ vfnmadd213ps %zmm13, %zmm12, %zmm15
+ vmovups __sA9_FMA(%rax), %zmm12
+ vxorps %zmm1, %zmm15, %zmm1
+ vmulps %zmm1, %zmm1, %zmm13
+
+/* 2) Polynomial (minimax for sin within [-Pi/4; +Pi/4] interval)
+ a) Calculate X^2 = X * X
+ b) Calculate 2 polynomials for sin and cos:
+ RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3))));
+ RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4)))) */
+ vmovaps %zmm12, %zmm7
+ vfmadd213ps %zmm3, %zmm8, %zmm7
+ vfmadd213ps %zmm3, %zmm13, %zmm12
+ vfmadd213ps %zmm0, %zmm8, %zmm7
+ vfmadd213ps %zmm0, %zmm13, %zmm12
+ vfmadd213ps %zmm6, %zmm8, %zmm7
+ vfmadd213ps %zmm6, %zmm13, %zmm12
+ vmulps %zmm8, %zmm7, %zmm10
+ vmulps %zmm13, %zmm12, %zmm3
+ vfmadd213ps %zmm9, %zmm9, %zmm10
+ vfmadd213ps %zmm1, %zmm1, %zmm3
+ vxorps %zmm11, %zmm10, %zmm0
+ vpandnd %zmm2, %zmm2, %zmm14{%k1}
+ vptestmd %zmm14, %zmm14, %k0
+ kmovw %k0, %ecx
+ testl %ecx, %ecx
+ jne .LBL_2_3
+
+.LBL_2_2:
+ cfi_remember_state
+ vmovups %zmm0, (%rdi)
+ vmovups %zmm3, (%rsi)
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+
+.LBL_2_3:
+ cfi_restore_state
+ vmovups %zmm4, 1152(%rsp)
+ vmovups %zmm0, 1216(%rsp)
+ vmovups %zmm3, 1280(%rsp)
+ je .LBL_2_2
+
+ xorb %dl, %dl
+ xorl %eax, %eax
+ kmovw %k4, 1048(%rsp)
+ kmovw %k5, 1040(%rsp)
+ kmovw %k6, 1032(%rsp)
+ kmovw %k7, 1024(%rsp)
+ vmovups %zmm16, 960(%rsp)
+ vmovups %zmm17, 896(%rsp)
+ vmovups %zmm18, 832(%rsp)
+ vmovups %zmm19, 768(%rsp)
+ vmovups %zmm20, 704(%rsp)
+ vmovups %zmm21, 640(%rsp)
+ vmovups %zmm22, 576(%rsp)
+ vmovups %zmm23, 512(%rsp)
+ vmovups %zmm24, 448(%rsp)
+ vmovups %zmm25, 384(%rsp)
+ vmovups %zmm26, 320(%rsp)
+ vmovups %zmm27, 256(%rsp)
+ vmovups %zmm28, 192(%rsp)
+ vmovups %zmm29, 128(%rsp)
+ vmovups %zmm30, 64(%rsp)
+ vmovups %zmm31, (%rsp)
+ movq %rsi, 1056(%rsp)
+ movq %r12, 1096(%rsp)
+ cfi_offset_rel_rsp (12, 1096)
+ movb %dl, %r12b
+ movq %r13, 1088(%rsp)
+ cfi_offset_rel_rsp (13, 1088)
+ movl %eax, %r13d
+ movq %r14, 1080(%rsp)
+ cfi_offset_rel_rsp (14, 1080)
+ movl %ecx, %r14d
+ movq %r15, 1072(%rsp)
+ cfi_offset_rel_rsp (15, 1072)
+ movq %rbx, 1064(%rsp)
+ movq %rdi, %rbx
+ cfi_remember_state
+
+.LBL_2_6:
+ btl %r13d, %r14d
+ jc .LBL_2_13
+
+.LBL_2_7:
+ lea 1(%r13), %esi
+ btl %esi, %r14d
+ jc .LBL_2_10
+
+.LBL_2_8:
+ incb %r12b
+ addl $2, %r13d
+ cmpb $16, %r12b
+ jb .LBL_2_6
+
+ kmovw 1048(%rsp), %k4
+ movq %rbx, %rdi
+ kmovw 1040(%rsp), %k5
+ kmovw 1032(%rsp), %k6
+ kmovw 1024(%rsp), %k7
+ vmovups 960(%rsp), %zmm16
+ vmovups 896(%rsp), %zmm17
+ vmovups 832(%rsp), %zmm18
+ vmovups 768(%rsp), %zmm19
+ vmovups 704(%rsp), %zmm20
+ vmovups 640(%rsp), %zmm21
+ vmovups 576(%rsp), %zmm22
+ vmovups 512(%rsp), %zmm23
+ vmovups 448(%rsp), %zmm24
+ vmovups 384(%rsp), %zmm25
+ vmovups 320(%rsp), %zmm26
+ vmovups 256(%rsp), %zmm27
+ vmovups 192(%rsp), %zmm28
+ vmovups 128(%rsp), %zmm29
+ vmovups 64(%rsp), %zmm30
+ vmovups (%rsp), %zmm31
+ vmovups 1216(%rsp), %zmm0
+ vmovups 1280(%rsp), %zmm3
+ movq 1056(%rsp), %rsi
+ movq 1096(%rsp), %r12
+ cfi_restore (%r12)
+ movq 1088(%rsp), %r13
+ cfi_restore (%r13)
+ movq 1080(%rsp), %r14
+ cfi_restore (%r14)
+ movq 1072(%rsp), %r15
+ cfi_restore (%r15)
+ movq 1064(%rsp), %rbx
+ jmp .LBL_2_2
+
+.LBL_2_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ vmovss 1156(%rsp,%r15,8), %xmm0
+ vzeroupper
+ vmovss 1156(%rsp,%r15,8), %xmm0
+
+ call sinf@PLT
+
+ vmovss %xmm0, 1220(%rsp,%r15,8)
+ vmovss 1156(%rsp,%r15,8), %xmm0
+
+ call cosf@PLT
+
+ vmovss %xmm0, 1284(%rsp,%r15,8)
+ jmp .LBL_2_8
+
+.LBL_2_13:
+ movzbl %r12b, %r15d
+ vmovss 1152(%rsp,%r15,8), %xmm0
+ vzeroupper
+ vmovss 1152(%rsp,%r15,8), %xmm0
+
+ call sinf@PLT
+
+ vmovss %xmm0, 1216(%rsp,%r15,8)
+ vmovss 1152(%rsp,%r15,8), %xmm0
+
+ call cosf@PLT
+
+ vmovss %xmm0, 1280(%rsp,%r15,8)
+ jmp .LBL_2_7
+#endif
+END (_ZGVeN16vvv_sincosf_skx)
+
+ .section .rodata, "a"
+.L_2il0floatpacket.13:
+ .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
+ .type .L_2il0floatpacket.13,@object
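
The c), d), e), g) and h) annotations in the branch above describe two scalar tricks that are easier to read outside of AVX-512 form: adding the "right shifter" 2^23 (0x4B000000) leaves round-to-nearest(|x| * 2/Pi) in the low mantissa bits, and the reduction then subtracts Y*(Pi/2) in several pieces so each product stays accurate. A minimal standalone C sketch of both follows; the split-Pi constants are derived inline rather than taken from __svml_s_trig_data, so it illustrates the technique, not the library's exact arithmetic.

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int
main (void)
{
  float x = 100.25f;                /* arbitrary sample argument */
  float ax = fabsf (x);

  /* c)+d) Octant by 2/Pi multiplication plus the right shifter 2^23:
     for |x| below the __sRangeReductionVal threshold the low mantissa
     bits of the sum now hold the rounded octant.  */
  float shifter = 0x1.0p23f;        /* 0x4B000000 */
  float y = ax * (float) (2.0 / 3.14159265358979323846) + shifter;

  /* e) Bit 0 of the float carries the octant parity; moving it into
     the sign position is what vpslld $31 does.  */
  uint32_t bits;
  memcpy (&bits, &y, sizeof bits);
  uint32_t sign_flip = bits << 31;  /* 0x80000000 for odd octants */

  /* g) Subtract the right shifter again to recover the octant.  */
  y -= shifter;

  /* h) Subtract y*(Pi/2) split into parts (Cody-Waite); glibc loads
     precomputed __sPI1_FMA..__sPI3_FMA (or __sPI1..__sPI4 in the
     non-FMA kernels) instead of deriving the split here.  */
  double pio2 = 1.5707963267948966;
  float pi1 = (float) pio2;
  float pi2 = (float) (pio2 - pi1);
  float pi3 = (float) (pio2 - pi1 - (double) pi2);
  float r = ((ax - y * pi1) - y * pi2) - y * pi3;

  printf ("octant=%.0f reduced=%a sign_flip=%#x\n", y, r,
          (unsigned int) sign_flip);
  return 0;
}
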
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core.S
new file mode 100644
index 0000000000..610046b587
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core.S
@@ -0,0 +1,38 @@
+/* Multiple versions of vectorized sincosf.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+ .text
+ENTRY (_ZGVbN4vvv_sincosf)
+ .type _ZGVbN4vvv_sincosf, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq _ZGVbN4vvv_sincosf_sse4(%rip), %rax
+ testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+ jz 2f
+ ret
+2: leaq _ZGVbN4vvv_sincosf_sse2(%rip), %rax
+ ret
+END (_ZGVbN4vvv_sincosf)
+libmvec_hidden_def (_ZGVbN4vvv_sincosf)
+
+#define _ZGVbN4vvv_sincosf _ZGVbN4vvv_sincosf_sse2
+#include "../svml_s_sincosf4_core.S"
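
The resolver above is the dispatch pattern used by every libmvec entry point in this patch: probe the CPU once, then return the most capable kernel. A rough, purely illustrative C equivalent is sketched below; GCC's __builtin_cpu_* stands in for glibc's internal __cpu_features table, and the void prototypes do not reflect the real vector ABI.

/* Hypothetical C rendering of the SSE4.1/SSE2 selection done above.  */
extern void _ZGVbN4vvv_sincosf_sse4 (void);
extern void _ZGVbN4vvv_sincosf_sse2 (void);

static void *
sincosf4_resolver (void)
{
  __builtin_cpu_init ();                  /* ~ call __init_cpu_features */
  if (__builtin_cpu_supports ("sse4.1"))  /* ~ testl $bit_SSE4_1, ...   */
    return (void *) _ZGVbN4vvv_sincosf_sse4;
  return (void *) _ZGVbN4vvv_sincosf_sse2;
}

void _ZGVbN4vvv_sincosf (void)
  __attribute__ ((ifunc ("sincosf4_resolver")));
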
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core_sse4.S
new file mode 100644
index 0000000000..4d846b5d7e
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core_sse4.S
@@ -0,0 +1,268 @@
+/* Function sincosf vectorized with SSE4.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "svml_s_trig_data.h"
+
+ .text
+ENTRY (_ZGVbN4vvv_sincosf_sse4)
+/*
+ ALGORITHM DESCRIPTION:
+
+ 1) Range reduction to [-Pi/4; +Pi/4] interval
+ a) Grab sign from source argument and save it.
+ b) Remove sign using AND operation
+ c) Getting octant Y by 2/Pi multiplication
+ d) Add "Right Shifter" value
+ e) Treat obtained value as integer S for destination sign setting.
+ SS = ((S-S&1)&2)<<30; For sin part
+ SC = ((S+S&1)&2)<<30; For cos part
+ f) Change destination sign if source sign is negative
+ using XOR operation.
+ g) Subtract "Right Shifter" (0x4B000000) value
+ h) Subtract Y*(PI/2) from X argument, where PI/2 divided to 4 parts:
+ X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
+ 2) Polynomial (minimax for sin within [-Pi/4; +Pi/4] interval)
+ a) Calculate X^2 = X * X
+ b) Calculate 2 polynomials for sin and cos:
+ RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3))));
+ RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4))));
+      c) Swap RS & RC if the first bit of the obtained value after
+ Right Shifting is set to 1. Using And, Andnot & Or operations.
+ 3) Destination sign setting
+ a) Set shifted destination sign using XOR operation:
+ R1 = XOR( RS, SS );
+ R2 = XOR( RC, SC ). */
+
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $320, %rsp
+ movq __svml_s_trig_data@GOTPCREL(%rip), %rax
+ movups %xmm12, 176(%rsp)
+ movups %xmm9, 160(%rsp)
+ movups __sAbsMask(%rax), %xmm12
+
+/* Absolute argument computation */
+ movaps %xmm12, %xmm5
+ andnps %xmm0, %xmm12
+ movups __sInvPI(%rax), %xmm7
+ andps %xmm0, %xmm5
+
+/* c) Getting octant Y by 2/Pi multiplication
+ d) Add "Right Shifter" value. */
+ mulps %xmm5, %xmm7
+ movups %xmm10, 144(%rsp)
+ movups __sPI1(%rax), %xmm10
+
+/* h) Subtract Y*(PI/2) from X argument, where PI/2 divided to 4 parts:
+   X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4. */
+ movaps %xmm10, %xmm1
+ addps __sRShifter(%rax), %xmm7
+
+/* e) Treat obtained value as integer S for destination sign setting */
+ movaps %xmm7, %xmm9
+
+/* g) Subtract "Right Shifter" (0x4B000000) value */
+ subps __sRShifter(%rax), %xmm7
+ mulps %xmm7, %xmm1
+ pslld $31, %xmm9
+ movups __sPI2(%rax), %xmm6
+ movups %xmm13, 112(%rsp)
+ movaps %xmm5, %xmm13
+ movaps %xmm6, %xmm2
+ subps %xmm1, %xmm13
+ mulps %xmm7, %xmm2
+ movups __sSignMask(%rax), %xmm3
+ movaps %xmm5, %xmm1
+ movups __sOneHalf(%rax), %xmm4
+ subps %xmm2, %xmm13
+ cmpnleps __sRangeReductionVal(%rax), %xmm5
+ movaps %xmm3, %xmm2
+ andps %xmm13, %xmm2
+ xorps %xmm2, %xmm4
+
+/* Result sign calculations */
+ xorps %xmm2, %xmm3
+ xorps %xmm9, %xmm3
+
+/* Add correction term 0.5 for cos() part */
+ addps %xmm7, %xmm4
+ movmskps %xmm5, %ecx
+ mulps %xmm4, %xmm10
+ mulps %xmm4, %xmm6
+ subps %xmm10, %xmm1
+ movups __sPI3(%rax), %xmm10
+ subps %xmm6, %xmm1
+ movaps %xmm10, %xmm6
+ mulps %xmm7, %xmm6
+ mulps %xmm4, %xmm10
+ subps %xmm6, %xmm13
+ subps %xmm10, %xmm1
+ movups __sPI4(%rax), %xmm6
+ mulps %xmm6, %xmm7
+ mulps %xmm6, %xmm4
+ subps %xmm7, %xmm13
+ subps %xmm4, %xmm1
+ xorps %xmm9, %xmm13
+ xorps %xmm3, %xmm1
+ movaps %xmm13, %xmm4
+ movaps %xmm1, %xmm2
+ mulps %xmm13, %xmm4
+ mulps %xmm1, %xmm2
+ movups __sA9(%rax), %xmm7
+
+/* 2) Polynomial (minimax for sin within [-Pi/4; +Pi/4] interval)
+ a) Calculate X^2 = X * X
+ b) Calculate 2 polynomials for sin and cos:
+ RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3))));
+ RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4)))) */
+ movaps %xmm7, %xmm3
+ mulps %xmm4, %xmm3
+ mulps %xmm2, %xmm7
+ addps __sA7(%rax), %xmm3
+ addps __sA7(%rax), %xmm7
+ mulps %xmm4, %xmm3
+ mulps %xmm2, %xmm7
+ addps __sA5(%rax), %xmm3
+ addps __sA5(%rax), %xmm7
+ mulps %xmm4, %xmm3
+ mulps %xmm2, %xmm7
+ addps __sA3(%rax), %xmm3
+ addps __sA3(%rax), %xmm7
+ mulps %xmm3, %xmm4
+ mulps %xmm7, %xmm2
+ mulps %xmm13, %xmm4
+ mulps %xmm1, %xmm2
+ addps %xmm4, %xmm13
+ addps %xmm2, %xmm1
+ xorps %xmm12, %xmm13
+ testl %ecx, %ecx
+ jne .LBL_1_3
+
+.LBL_1_2:
+ cfi_remember_state
+ movups 160(%rsp), %xmm9
+ movaps %xmm13, (%rdi)
+ movups 144(%rsp), %xmm10
+ movups 176(%rsp), %xmm12
+ movups 112(%rsp), %xmm13
+ movups %xmm1, (%rsi)
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+
+.LBL_1_3:
+ cfi_restore_state
+ movups %xmm0, 128(%rsp)
+ movups %xmm13, 192(%rsp)
+ movups %xmm1, 256(%rsp)
+ je .LBL_1_2
+
+ xorb %dl, %dl
+ xorl %eax, %eax
+ movups %xmm8, 48(%rsp)
+ movups %xmm11, 32(%rsp)
+ movups %xmm14, 16(%rsp)
+ movups %xmm15, (%rsp)
+ movq %rsi, 64(%rsp)
+ movq %r12, 104(%rsp)
+ cfi_offset_rel_rsp (12, 104)
+ movb %dl, %r12b
+ movq %r13, 96(%rsp)
+ cfi_offset_rel_rsp (13, 96)
+ movl %eax, %r13d
+ movq %r14, 88(%rsp)
+ cfi_offset_rel_rsp (14, 88)
+ movl %ecx, %r14d
+ movq %r15, 80(%rsp)
+ cfi_offset_rel_rsp (15, 80)
+ movq %rbx, 72(%rsp)
+ movq %rdi, %rbx
+ cfi_remember_state
+
+.LBL_1_6:
+ btl %r13d, %r14d
+ jc .LBL_1_13
+
+.LBL_1_7:
+ lea 1(%r13), %esi
+ btl %esi, %r14d
+ jc .LBL_1_10
+
+.LBL_1_8:
+ incb %r12b
+ addl $2, %r13d
+ cmpb $16, %r12b
+ jb .LBL_1_6
+
+ movups 48(%rsp), %xmm8
+ movq %rbx, %rdi
+ movups 32(%rsp), %xmm11
+ movups 16(%rsp), %xmm14
+ movups (%rsp), %xmm15
+ movq 64(%rsp), %rsi
+ movq 104(%rsp), %r12
+ cfi_restore (%r12)
+ movq 96(%rsp), %r13
+ cfi_restore (%r13)
+ movq 88(%rsp), %r14
+ cfi_restore (%r14)
+ movq 80(%rsp), %r15
+ cfi_restore (%r15)
+ movq 72(%rsp), %rbx
+ movups 192(%rsp), %xmm13
+ movups 256(%rsp), %xmm1
+ jmp .LBL_1_2
+
+.LBL_1_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ movss 132(%rsp,%r15,8), %xmm0
+
+ call sinf@PLT
+
+ movss %xmm0, 196(%rsp,%r15,8)
+ movss 132(%rsp,%r15,8), %xmm0
+
+ call cosf@PLT
+
+ movss %xmm0, 260(%rsp,%r15,8)
+ jmp .LBL_1_8
+
+.LBL_1_13:
+ movzbl %r12b, %r15d
+ movss 128(%rsp,%r15,8), %xmm0
+
+ call sinf@PLT
+
+ movss %xmm0, 192(%rsp,%r15,8)
+ movss 128(%rsp,%r15,8), %xmm0
+
+ call cosf@PLT
+
+ movss %xmm0, 256(%rsp,%r15,8)
+ jmp .LBL_1_7
+
+END (_ZGVbN4vvv_sincosf_sse4)
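
The .LBL_1_6/.LBL_1_7/.LBL_1_8 block above is the slow path shared by all of the kernels in this patch: lanes whose |x| failed the __sRangeReductionVal comparison are recomputed one at a time through the scalar libm entry points. The loop always runs 16 iterations, testing two mask bits per pass; bits beyond the vector width are simply zero, so the net effect is the per-lane loop sketched below (an illustration, not glibc code).

#include <math.h>

/* 'mask' has bit i set when lane i of the input vector needs the
   accurate scalar path; only those lanes of the fast-path results
   already stored in s[]/c[] are overwritten.  The assembly walks the
   lanes two at a time with btl, but the effect is the same.  */
static void
sincosf_fallback (const float *x, float *s, float *c,
                  unsigned int mask, int nlanes)
{
  for (int i = 0; i < nlanes; i++)
    if (mask & (1u << i))
      {
        s[i] = sinf (x[i]);
        c[i] = cosf (x[i]);
      }
}

The sinf-only kernels later in the patch use the same loop with the cosf call dropped.
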
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core.S
new file mode 100644
index 0000000000..9e5be67fc9
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core.S
@@ -0,0 +1,38 @@
+/* Multiple versions of vectorized sincosf.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+ .text
+ENTRY (_ZGVdN8vvv_sincosf)
+ .type _ZGVdN8vvv_sincosf, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq _ZGVdN8vvv_sincosf_avx2(%rip), %rax
+ testl $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+ jz 2f
+ ret
+2: leaq _ZGVdN8vvv_sincosf_sse_wrapper(%rip), %rax
+ ret
+END (_ZGVdN8vvv_sincosf)
+libmvec_hidden_def (_ZGVdN8vvv_sincosf)
+
+#define _ZGVdN8vvv_sincosf _ZGVdN8vvv_sincosf_sse_wrapper
+#include "../svml_s_sincosf8_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core_avx2.S
new file mode 100644
index 0000000000..0108fd5126
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core_avx2.S
@@ -0,0 +1,241 @@
+/* Function sincosf vectorized with AVX2.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "svml_s_trig_data.h"
+
+ .text
+ENTRY(_ZGVdN8vvv_sincosf_avx2)
+/*
+ ALGORITHM DESCRIPTION:
+
+ 1) Range reduction to [-Pi/4; +Pi/4] interval
+ a) Grab sign from source argument and save it.
+ b) Remove sign using AND operation
+ c) Getting octant Y by 2/Pi multiplication
+ d) Add "Right Shifter" value
+ e) Treat obtained value as integer S for destination sign setting.
+ SS = ((S-S&1)&2)<<30; For sin part
+ SC = ((S+S&1)&2)<<30; For cos part
+ f) Change destination sign if source sign is negative
+ using XOR operation.
+ g) Subtract "Right Shifter" (0x4B000000) value
+ h) Subtract Y*(PI/2) from X argument, where PI/2 divided to 4 parts:
+ X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
+ 2) Polynomial (minimax for sin within [-Pi/4; +Pi/4] interval)
+ a) Calculate X^2 = X * X
+ b) Calculate 2 polynomials for sin and cos:
+ RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3))));
+ RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4))));
+      c) Swap RS & RC if the first bit of the obtained value after
+ Right Shifting is set to 1. Using And, Andnot & Or operations.
+ 3) Destination sign setting
+ a) Set shifted destination sign using XOR operation:
+ R1 = XOR( RS, SS );
+ R2 = XOR( RC, SC ). */
+
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $448, %rsp
+ movq __svml_s_trig_data@GOTPCREL(%rip), %rax
+ vmovdqa %ymm0, %ymm5
+ vmovups %ymm13, 352(%rsp)
+ vmovups __sAbsMask(%rax), %ymm2
+ vmovups __sInvPI(%rax), %ymm1
+ vmovups __sPI1_FMA(%rax), %ymm13
+ vmovups %ymm15, 288(%rsp)
+
+/* Absolute argument computation */
+ vandps %ymm2, %ymm5, %ymm4
+
+/* c) Getting octant Y by 2/Pi multiplication
+ d) Add "Right Shifter" value */
+ vfmadd213ps __sRShifter(%rax), %ymm4, %ymm1
+
+/* e) Treat obtained value as integer S for destination sign setting */
+ vpslld $31, %ymm1, %ymm0
+
+/* g) Subtract "Right Shifter" (0x4B000000) value */
+ vsubps __sRShifter(%rax), %ymm1, %ymm1
+
+/* h) Subtract Y*(PI/2) from X argument, where PI/2 divided to 3 parts:
+ X = X - Y*PI1 - Y*PI2 - Y*PI3 */
+ vmovdqa %ymm4, %ymm7
+ vfnmadd231ps %ymm1, %ymm13, %ymm7
+ vfnmadd231ps __sPI2_FMA(%rax), %ymm1, %ymm7
+ vandps __sSignMask(%rax), %ymm7, %ymm15
+ vxorps __sOneHalf(%rax), %ymm15, %ymm6
+
+/* Add correction term 0.5 for cos() part */
+ vaddps %ymm6, %ymm1, %ymm6
+ vmovdqa %ymm4, %ymm3
+ vfnmadd231ps %ymm6, %ymm13, %ymm3
+ vmovups __sPI3_FMA(%rax), %ymm13
+ vcmpnle_uqps __sRangeReductionVal(%rax), %ymm4, %ymm4
+ vfnmadd231ps __sPI2_FMA(%rax), %ymm6, %ymm3
+ vfnmadd213ps %ymm7, %ymm13, %ymm1
+ vfnmadd213ps %ymm3, %ymm13, %ymm6
+
+/* Result sign calculations */
+ vxorps __sSignMask(%rax), %ymm15, %ymm3
+ vxorps %ymm0, %ymm3, %ymm7
+ vxorps %ymm7, %ymm6, %ymm3
+ vxorps %ymm0, %ymm1, %ymm15
+ vandnps %ymm5, %ymm2, %ymm6
+ vmovups __sA7_FMA(%rax), %ymm2
+ vmulps %ymm15, %ymm15, %ymm13
+ vmovups __sA9_FMA(%rax), %ymm7
+ vmulps %ymm3, %ymm3, %ymm1
+
+/* 2) Polynomial (minimax for sin within [-Pi/4; +Pi/4] interval)
+ a) Calculate X^2 = X * X
+ b) Calculate 2 polynomials for sin and cos:
+ RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3))));
+ RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4)))) */
+ vmovdqa %ymm2, %ymm0
+ vfmadd231ps __sA9_FMA(%rax), %ymm13, %ymm0
+ vfmadd213ps %ymm2, %ymm1, %ymm7
+ vfmadd213ps __sA5_FMA(%rax), %ymm13, %ymm0
+ vfmadd213ps __sA5_FMA(%rax), %ymm1, %ymm7
+ vfmadd213ps __sA3(%rax), %ymm13, %ymm0
+ vfmadd213ps __sA3(%rax), %ymm1, %ymm7
+ vmulps %ymm13, %ymm0, %ymm13
+ vmulps %ymm1, %ymm7, %ymm1
+ vfmadd213ps %ymm15, %ymm15, %ymm13
+ vfmadd213ps %ymm3, %ymm3, %ymm1
+ vmovmskps %ymm4, %ecx
+ vxorps %ymm6, %ymm13, %ymm0
+ testl %ecx, %ecx
+ jne .LBL_1_3
+
+.LBL_1_2:
+ cfi_remember_state
+ vmovups 352(%rsp), %ymm13
+ vmovups 288(%rsp), %ymm15
+ vmovups %ymm0, (%rdi)
+ vmovups %ymm1, (%rsi)
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+
+.LBL_1_3:
+ cfi_restore_state
+ vmovups %ymm5, 256(%rsp)
+ vmovups %ymm0, 320(%rsp)
+ vmovups %ymm1, 384(%rsp)
+ je .LBL_1_2
+
+ xorb %dl, %dl
+ xorl %eax, %eax
+ vmovups %ymm8, 160(%rsp)
+ vmovups %ymm9, 128(%rsp)
+ vmovups %ymm10, 96(%rsp)
+ vmovups %ymm11, 64(%rsp)
+ vmovups %ymm12, 32(%rsp)
+ vmovups %ymm14, (%rsp)
+ movq %rsi, 192(%rsp)
+ movq %r12, 232(%rsp)
+ cfi_offset_rel_rsp (12, 232)
+ movb %dl, %r12b
+ movq %r13, 224(%rsp)
+ cfi_offset_rel_rsp (13, 224)
+ movl %eax, %r13d
+ movq %r14, 216(%rsp)
+ cfi_offset_rel_rsp (14, 216)
+ movl %ecx, %r14d
+ movq %r15, 208(%rsp)
+ cfi_offset_rel_rsp (15, 208)
+ movq %rbx, 200(%rsp)
+ movq %rdi, %rbx
+ cfi_remember_state
+
+.LBL_1_6:
+ btl %r13d, %r14d
+ jc .LBL_1_13
+
+.LBL_1_7:
+ lea 1(%r13), %esi
+ btl %esi, %r14d
+ jc .LBL_1_10
+
+.LBL_1_8:
+ incb %r12b
+ addl $2, %r13d
+ cmpb $16, %r12b
+ jb .LBL_1_6
+
+ vmovups 160(%rsp), %ymm8
+ movq %rbx, %rdi
+ vmovups 128(%rsp), %ymm9
+ vmovups 96(%rsp), %ymm10
+ vmovups 64(%rsp), %ymm11
+ vmovups 32(%rsp), %ymm12
+ vmovups (%rsp), %ymm14
+ vmovups 320(%rsp), %ymm0
+ vmovups 384(%rsp), %ymm1
+ movq 192(%rsp), %rsi
+ movq 232(%rsp), %r12
+ cfi_restore (%r12)
+ movq 224(%rsp), %r13
+ cfi_restore (%r13)
+ movq 216(%rsp), %r14
+ cfi_restore (%r14)
+ movq 208(%rsp), %r15
+ cfi_restore (%r15)
+ movq 200(%rsp), %rbx
+ jmp .LBL_1_2
+
+.LBL_1_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ vmovss 260(%rsp,%r15,8), %xmm0
+ vzeroupper
+
+ call sinf@PLT
+
+ vmovss %xmm0, 324(%rsp,%r15,8)
+ vmovss 260(%rsp,%r15,8), %xmm0
+
+ call cosf@PLT
+
+ vmovss %xmm0, 388(%rsp,%r15,8)
+ jmp .LBL_1_8
+
+.LBL_1_13:
+ movzbl %r12b, %r15d
+ vmovss 256(%rsp,%r15,8), %xmm0
+ vzeroupper
+
+ call sinf@PLT
+
+ vmovss %xmm0, 320(%rsp,%r15,8)
+ vmovss 256(%rsp,%r15,8), %xmm0
+
+ call cosf@PLT
+
+ vmovss %xmm0, 384(%rsp,%r15,8)
+ jmp .LBL_1_7
+
+END(_ZGVdN8vvv_sincosf_avx2)
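
Compared with the SSE4 kernel, this AVX2 path (like the AVX-512 ones) subtracts only three pieces of Pi/2, __sPI1_FMA through __sPI3_FMA: each vfnmadd feeds the unrounded product y*PIi into the subtraction, so one split term fewer gives comparable accuracy. A scalar sketch of that step, with pi1..pi3 standing in for the table constants:

#include <math.h>

/* FMA-based reduction X = X - Y*PI1 - Y*PI2 - Y*PI3; pi1..pi3 are
   placeholders for the __sPI*_FMA values in __svml_s_trig_data.  */
static inline float
reduce_fma (float ax, float y, float pi1, float pi2, float pi3)
{
  float r = fmaf (-y, pi1, ax);   /* vfnmadd231ps with __sPI1_FMA */
  r = fmaf (-y, pi2, r);          /* vfnmadd231ps with __sPI2_FMA */
  r = fmaf (-y, pi3, r);          /* vfnmadd213ps/132ps with __sPI3_FMA */
  return r;
}
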
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S
new file mode 100644
index 0000000000..3ec78a0b5e
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S
@@ -0,0 +1,39 @@
+/* Multiple versions of vectorized sinf.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+ .text
+ENTRY (_ZGVeN16v_sinf)
+ .type _ZGVeN16v_sinf, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq _ZGVeN16v_sinf_skx(%rip), %rax
+ testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
+ jnz 2f
+ leaq _ZGVeN16v_sinf_knl(%rip), %rax
+ testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
+ jnz 2f
+ leaq _ZGVeN16v_sinf_avx2_wrapper(%rip), %rax
+2: ret
+END (_ZGVeN16v_sinf)
+
+#define _ZGVeN16v_sinf _ZGVeN16v_sinf_avx2_wrapper
+#include "../svml_s_sinf16_core.S"
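
This selector differs from the two-way SSE and AVX2 resolvers earlier in the patch in that it walks a preference chain: the SKX build when AVX512DQ is usable, otherwise the KNL build when plain AVX512F is usable, otherwise the AVX2 wrapper. (When HAVE_AVX512_ASM_SUPPORT is not defined, the SKX and KNL entries themselves collapse to WRAPPER_IMPL_AVX512 around the 8-wide kernel.) An illustrative C sketch, again with GCC builtins standing in for glibc's feature table:

/* Hypothetical rendering of the three-way selection above.  */
extern void _ZGVeN16v_sinf_skx (void);
extern void _ZGVeN16v_sinf_knl (void);
extern void _ZGVeN16v_sinf_avx2_wrapper (void);

static void *
sinf16_resolver (void)
{
  __builtin_cpu_init ();
  if (__builtin_cpu_supports ("avx512dq"))
    return (void *) _ZGVeN16v_sinf_skx;
  if (__builtin_cpu_supports ("avx512f"))
    return (void *) _ZGVeN16v_sinf_knl;
  return (void *) _ZGVeN16v_sinf_avx2_wrapper;
}
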
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
new file mode 100644
index 0000000000..f13ed96af8
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
@@ -0,0 +1,479 @@
+/* Function sinf vectorized with AVX-512. KNL and SKX versions.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "svml_s_trig_data.h"
+#include "svml_s_wrapper_impl.h"
+
+ .text
+ENTRY(_ZGVeN16v_sinf_knl)
+#ifndef HAVE_AVX512_ASM_SUPPORT
+WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf
+#else
+/*
+ ALGORITHM DESCRIPTION:
+
+ 1) Range reduction to [-Pi/2; +Pi/2] interval
+ a) Grab sign from source argument and save it.
+ b) Remove sign using AND operation
+ c) Getting octant Y by 1/Pi multiplication
+ d) Add "Right Shifter" value
+ e) Treat obtained value as integer for destination sign setting.
+ Shift first bit of this value to the last (sign) position
+ f) Change destination sign if source sign is negative
+ using XOR operation.
+ g) Subtract "Right Shifter" value
+ h) Subtract Y*PI from X argument, where PI divided to 4 parts:
+ X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
+ 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
+ a) Calculate X^2 = X * X
+ b) Calculate polynomial:
+ R = X + X * X^2 * (A3 + x^2 * (A5 + ......
+ 3) Destination sign setting
+ a) Set shifted destination sign using XOR operation:
+ R = XOR( R, S );
+ */
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $1280, %rsp
+ movq __svml_s_trig_data@GOTPCREL(%rip), %rax
+
+/* Check for large and special values */
+ movl $-1, %edx
+ vmovups __sAbsMask(%rax), %zmm4
+ vmovups __sInvPI(%rax), %zmm1
+
+/* b) Remove sign using AND operation */
+ vpandd %zmm4, %zmm0, %zmm12
+ vmovups __sPI1_FMA(%rax), %zmm2
+ vmovups __sA9(%rax), %zmm7
+
+/*
+ f) Change destination sign if source sign is negative
+ using XOR operation.
+ */
+ vpandnd %zmm0, %zmm4, %zmm11
+
+/*
+  h) Subtract Y*PI from X argument, where PI divided to 3 parts:
+ X = X - Y*PI1 - Y*PI2 - Y*PI3;
+ */
+ vmovaps %zmm12, %zmm3
+
+/*
+ c) Getting octant Y by 1/Pi multiplication
+ d) Add "Right Shifter" value
+ */
+ vfmadd213ps __sRShifter(%rax), %zmm12, %zmm1
+ vcmpps $22, __sRangeReductionVal(%rax), %zmm12, %k1
+ vpbroadcastd %edx, %zmm13{%k1}{z}
+
+/* g) Subtract "Right Shifter" value */
+ vsubps __sRShifter(%rax), %zmm1, %zmm5
+
+/*
+ e) Treat obtained value as integer for destination sign setting.
+ Shift first bit of this value to the last (sign) position
+ */
+ vpslld $31, %zmm1, %zmm6
+ vptestmd %zmm13, %zmm13, %k0
+ vfnmadd231ps %zmm5, %zmm2, %zmm3
+ kmovw %k0, %ecx
+ vfnmadd231ps __sPI2_FMA(%rax), %zmm5, %zmm3
+ vfnmadd132ps __sPI3_FMA(%rax), %zmm3, %zmm5
+
+/*
+ 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
+ a) Calculate X^2 = X * X
+ b) Calculate polynomial:
+ R = X + X * X^2 * (A3 + x^2 * (A5 + ......
+ */
+ vmulps %zmm5, %zmm5, %zmm8
+ vpxord %zmm6, %zmm5, %zmm9
+ vfmadd213ps __sA7(%rax), %zmm8, %zmm7
+ vfmadd213ps __sA5(%rax), %zmm8, %zmm7
+ vfmadd213ps __sA3(%rax), %zmm8, %zmm7
+ vmulps %zmm8, %zmm7, %zmm10
+ vfmadd213ps %zmm9, %zmm9, %zmm10
+
+/*
+ 3) Destination sign setting
+ a) Set shifted destination sign using XOR operation:
+ R = XOR( R, S );
+ */
+ vpxord %zmm11, %zmm10, %zmm1
+ testl %ecx, %ecx
+ jne .LBL_1_3
+
+.LBL_1_2:
+ cfi_remember_state
+ vmovaps %zmm1, %zmm0
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+
+.LBL_1_3:
+ cfi_restore_state
+ vmovups %zmm0, 1152(%rsp)
+ vmovups %zmm1, 1216(%rsp)
+ je .LBL_1_2
+
+ xorb %dl, %dl
+ kmovw %k4, 1048(%rsp)
+ xorl %eax, %eax
+ kmovw %k5, 1040(%rsp)
+ kmovw %k6, 1032(%rsp)
+ kmovw %k7, 1024(%rsp)
+ vmovups %zmm16, 960(%rsp)
+ vmovups %zmm17, 896(%rsp)
+ vmovups %zmm18, 832(%rsp)
+ vmovups %zmm19, 768(%rsp)
+ vmovups %zmm20, 704(%rsp)
+ vmovups %zmm21, 640(%rsp)
+ vmovups %zmm22, 576(%rsp)
+ vmovups %zmm23, 512(%rsp)
+ vmovups %zmm24, 448(%rsp)
+ vmovups %zmm25, 384(%rsp)
+ vmovups %zmm26, 320(%rsp)
+ vmovups %zmm27, 256(%rsp)
+ vmovups %zmm28, 192(%rsp)
+ vmovups %zmm29, 128(%rsp)
+ vmovups %zmm30, 64(%rsp)
+ vmovups %zmm31, (%rsp)
+ movq %rsi, 1064(%rsp)
+ movq %rdi, 1056(%rsp)
+ movq %r12, 1096(%rsp)
+ cfi_offset_rel_rsp (12, 1096)
+ movb %dl, %r12b
+ movq %r13, 1088(%rsp)
+ cfi_offset_rel_rsp (13, 1088)
+ movl %ecx, %r13d
+ movq %r14, 1080(%rsp)
+ cfi_offset_rel_rsp (14, 1080)
+ movl %eax, %r14d
+ movq %r15, 1072(%rsp)
+ cfi_offset_rel_rsp (15, 1072)
+ cfi_remember_state
+
+.LBL_1_6:
+ btl %r14d, %r13d
+ jc .LBL_1_12
+
+.LBL_1_7:
+ lea 1(%r14), %esi
+ btl %esi, %r13d
+ jc .LBL_1_10
+
+.LBL_1_8:
+ addb $1, %r12b
+ addl $2, %r14d
+ cmpb $16, %r12b
+ jb .LBL_1_6
+
+ kmovw 1048(%rsp), %k4
+ movq 1064(%rsp), %rsi
+ kmovw 1040(%rsp), %k5
+ movq 1056(%rsp), %rdi
+ kmovw 1032(%rsp), %k6
+ movq 1096(%rsp), %r12
+ cfi_restore (%r12)
+ movq 1088(%rsp), %r13
+ cfi_restore (%r13)
+ kmovw 1024(%rsp), %k7
+ vmovups 960(%rsp), %zmm16
+ vmovups 896(%rsp), %zmm17
+ vmovups 832(%rsp), %zmm18
+ vmovups 768(%rsp), %zmm19
+ vmovups 704(%rsp), %zmm20
+ vmovups 640(%rsp), %zmm21
+ vmovups 576(%rsp), %zmm22
+ vmovups 512(%rsp), %zmm23
+ vmovups 448(%rsp), %zmm24
+ vmovups 384(%rsp), %zmm25
+ vmovups 320(%rsp), %zmm26
+ vmovups 256(%rsp), %zmm27
+ vmovups 192(%rsp), %zmm28
+ vmovups 128(%rsp), %zmm29
+ vmovups 64(%rsp), %zmm30
+ vmovups (%rsp), %zmm31
+ movq 1080(%rsp), %r14
+ cfi_restore (%r14)
+ movq 1072(%rsp), %r15
+ cfi_restore (%r15)
+ vmovups 1216(%rsp), %zmm1
+ jmp .LBL_1_2
+
+.LBL_1_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ vmovss 1156(%rsp,%r15,8), %xmm0
+ call sinf@PLT
+ vmovss %xmm0, 1220(%rsp,%r15,8)
+ jmp .LBL_1_8
+
+.LBL_1_12:
+ movzbl %r12b, %r15d
+ vmovss 1152(%rsp,%r15,8), %xmm0
+ call sinf@PLT
+ vmovss %xmm0, 1216(%rsp,%r15,8)
+ jmp .LBL_1_7
+#endif
+END(_ZGVeN16v_sinf_knl)
+
+ENTRY (_ZGVeN16v_sinf_skx)
+#ifndef HAVE_AVX512_ASM_SUPPORT
+WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf
+#else
+/*
+ ALGORITHM DESCRIPTION:
+
+ 1) Range reduction to [-Pi/2; +Pi/2] interval
+ a) Grab sign from source argument and save it.
+ b) Remove sign using AND operation
+ c) Getting octant Y by 1/Pi multiplication
+ d) Add "Right Shifter" value
+ e) Treat obtained value as integer for destination sign setting.
+ Shift first bit of this value to the last (sign) position
+ f) Change destination sign if source sign is negative
+ using XOR operation.
+ g) Subtract "Right Shifter" value
+ h) Subtract Y*PI from X argument, where PI divided to 4 parts:
+ X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
+ 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
+ a) Calculate X^2 = X * X
+ b) Calculate polynomial:
+ R = X + X * X^2 * (A3 + x^2 * (A5 + ......
+ 3) Destination sign setting
+ a) Set shifted destination sign using XOR operation:
+ R = XOR( R, S );
+ */
+
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $1280, %rsp
+ movq __svml_s_trig_data@GOTPCREL(%rip), %rax
+
+/* Check for large and special values */
+ vmovups .L_2il0floatpacket.11(%rip), %zmm14
+ vmovups __sAbsMask(%rax), %zmm5
+ vmovups __sInvPI(%rax), %zmm1
+ vmovups __sRShifter(%rax), %zmm2
+ vmovups __sPI1_FMA(%rax), %zmm3
+ vmovups __sA9(%rax), %zmm8
+
+/* b) Remove sign using AND operation */
+ vandps %zmm5, %zmm0, %zmm13
+
+/*
+ f) Change destination sign if source sign is negative
+ using XOR operation.
+ */
+ vandnps %zmm0, %zmm5, %zmm12
+
+/*
+ c) Getting octant Y by 1/Pi multiplication
+ d) Add "Right Shifter" value
+ */
+ vfmadd213ps %zmm2, %zmm13, %zmm1
+ vcmpps $18, __sRangeReductionVal(%rax), %zmm13, %k1
+
+/*
+ e) Treat obtained value as integer for destination sign setting.
+ Shift first bit of this value to the last (sign) position
+ */
+ vpslld $31, %zmm1, %zmm7
+
+/* g) Subtract "Right Shifter" value */
+ vsubps %zmm2, %zmm1, %zmm6
+
+/*
+  h) Subtract Y*PI from X argument, where PI divided to 3 parts:
+ X = X - Y*PI1 - Y*PI2 - Y*PI3;
+ */
+ vmovaps %zmm13, %zmm4
+ vfnmadd231ps %zmm6, %zmm3, %zmm4
+ vfnmadd231ps __sPI2_FMA(%rax), %zmm6, %zmm4
+ vfnmadd132ps __sPI3_FMA(%rax), %zmm4, %zmm6
+
+/*
+ 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
+ a) Calculate X^2 = X * X
+ b) Calculate polynomial:
+ R = X + X * X^2 * (A3 + x^2 * (A5 + ......
+ */
+ vmulps %zmm6, %zmm6, %zmm9
+ vxorps %zmm7, %zmm6, %zmm10
+ vfmadd213ps __sA7(%rax), %zmm9, %zmm8
+ vfmadd213ps __sA5(%rax), %zmm9, %zmm8
+ vfmadd213ps __sA3(%rax), %zmm9, %zmm8
+ vmulps %zmm9, %zmm8, %zmm11
+ vfmadd213ps %zmm10, %zmm10, %zmm11
+
+/*
+ 3) Destination sign setting
+ a) Set shifted destination sign using XOR operation:
+ R = XOR( R, S );
+ */
+ vxorps %zmm12, %zmm11, %zmm1
+ vpandnd %zmm13, %zmm13, %zmm14{%k1}
+ vptestmd %zmm14, %zmm14, %k0
+ kmovw %k0, %ecx
+ testl %ecx, %ecx
+ jne .LBL_2_3
+
+.LBL_2_2:
+ cfi_remember_state
+ vmovaps %zmm1, %zmm0
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+
+.LBL_2_3:
+ cfi_restore_state
+ vmovups %zmm0, 1152(%rsp)
+ vmovups %zmm1, 1216(%rsp)
+ je .LBL_2_2
+
+ xorb %dl, %dl
+ xorl %eax, %eax
+ kmovw %k4, 1048(%rsp)
+ kmovw %k5, 1040(%rsp)
+ kmovw %k6, 1032(%rsp)
+ kmovw %k7, 1024(%rsp)
+ vmovups %zmm16, 960(%rsp)
+ vmovups %zmm17, 896(%rsp)
+ vmovups %zmm18, 832(%rsp)
+ vmovups %zmm19, 768(%rsp)
+ vmovups %zmm20, 704(%rsp)
+ vmovups %zmm21, 640(%rsp)
+ vmovups %zmm22, 576(%rsp)
+ vmovups %zmm23, 512(%rsp)
+ vmovups %zmm24, 448(%rsp)
+ vmovups %zmm25, 384(%rsp)
+ vmovups %zmm26, 320(%rsp)
+ vmovups %zmm27, 256(%rsp)
+ vmovups %zmm28, 192(%rsp)
+ vmovups %zmm29, 128(%rsp)
+ vmovups %zmm30, 64(%rsp)
+ vmovups %zmm31, (%rsp)
+ movq %rsi, 1064(%rsp)
+ movq %rdi, 1056(%rsp)
+ movq %r12, 1096(%rsp)
+ cfi_offset_rel_rsp (12, 1096)
+ movb %dl, %r12b
+ movq %r13, 1088(%rsp)
+ cfi_offset_rel_rsp (13, 1088)
+ movl %ecx, %r13d
+ movq %r14, 1080(%rsp)
+ cfi_offset_rel_rsp (14, 1080)
+ movl %eax, %r14d
+ movq %r15, 1072(%rsp)
+ cfi_offset_rel_rsp (15, 1072)
+ cfi_remember_state
+
+.LBL_2_6:
+ btl %r14d, %r13d
+ jc .LBL_2_12
+
+.LBL_2_7:
+ lea 1(%r14), %esi
+ btl %esi, %r13d
+ jc .LBL_2_10
+
+.LBL_2_8:
+ incb %r12b
+ addl $2, %r14d
+ cmpb $16, %r12b
+ jb .LBL_2_6
+
+ kmovw 1048(%rsp), %k4
+ kmovw 1040(%rsp), %k5
+ kmovw 1032(%rsp), %k6
+ kmovw 1024(%rsp), %k7
+ vmovups 960(%rsp), %zmm16
+ vmovups 896(%rsp), %zmm17
+ vmovups 832(%rsp), %zmm18
+ vmovups 768(%rsp), %zmm19
+ vmovups 704(%rsp), %zmm20
+ vmovups 640(%rsp), %zmm21
+ vmovups 576(%rsp), %zmm22
+ vmovups 512(%rsp), %zmm23
+ vmovups 448(%rsp), %zmm24
+ vmovups 384(%rsp), %zmm25
+ vmovups 320(%rsp), %zmm26
+ vmovups 256(%rsp), %zmm27
+ vmovups 192(%rsp), %zmm28
+ vmovups 128(%rsp), %zmm29
+ vmovups 64(%rsp), %zmm30
+ vmovups (%rsp), %zmm31
+ vmovups 1216(%rsp), %zmm1
+ movq 1064(%rsp), %rsi
+ movq 1056(%rsp), %rdi
+ movq 1096(%rsp), %r12
+ cfi_restore (%r12)
+ movq 1088(%rsp), %r13
+ cfi_restore (%r13)
+ movq 1080(%rsp), %r14
+ cfi_restore (%r14)
+ movq 1072(%rsp), %r15
+ cfi_restore (%r15)
+ jmp .LBL_2_2
+
+.LBL_2_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ vmovss 1156(%rsp,%r15,8), %xmm0
+ vzeroupper
+ vmovss 1156(%rsp,%r15,8), %xmm0
+
+ call sinf@PLT
+
+ vmovss %xmm0, 1220(%rsp,%r15,8)
+ jmp .LBL_2_8
+
+.LBL_2_12:
+ movzbl %r12b, %r15d
+ vmovss 1152(%rsp,%r15,8), %xmm0
+ vzeroupper
+ vmovss 1152(%rsp,%r15,8), %xmm0
+
+ call sinf@PLT
+
+ vmovss %xmm0, 1216(%rsp,%r15,8)
+ jmp .LBL_2_7
+#endif
+END (_ZGVeN16v_sinf_skx)
+
+ .section .rodata, "a"
+.L_2il0floatpacket.11:
+ .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
+ .type .L_2il0floatpacket.11,@object
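
Step 2) in both branches above is a plain Horner evaluation of the odd polynomial R = X + X*X^2*(A3 + X^2*(A5 + X^2*(A7 + X^2*A9))) in the reduced argument. The scalar rendering below mirrors the vfmadd213ps chain; the real coefficients are the minimax set in __svml_s_trig_data, so the Taylor values used here are stand-ins that only show the evaluation order.

/* Horner form of the degree-9 odd sin polynomial from step 2); the
   Taylor coefficients stand in for glibc's minimax set.  */
static inline float
sin_poly (float x)      /* x already reduced to [-Pi/2, +Pi/2] */
{
  const float a3 = -1.0f / 6.0f;
  const float a5 =  1.0f / 120.0f;
  const float a7 = -1.0f / 5040.0f;
  const float a9 =  1.0f / 362880.0f;

  float x2 = x * x;          /* vmulps                     */
  float p = a9;
  p = p * x2 + a7;           /* vfmadd213ps __sA7          */
  p = p * x2 + a5;           /* vfmadd213ps __sA5          */
  p = p * x2 + a3;           /* vfmadd213ps __sA3          */
  p = p * x2;                /* vmulps                     */
  return p * x + x;          /* vfmadd213ps: R = X + X*p   */
}
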
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core.S
new file mode 100644
index 0000000000..cf1e4df406
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core.S
@@ -0,0 +1,38 @@
+/* Multiple versions of vectorized sinf.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+ .text
+ENTRY (_ZGVbN4v_sinf)
+ .type _ZGVbN4v_sinf, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq _ZGVbN4v_sinf_sse4(%rip), %rax
+ testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+ jz 2f
+ ret
+2: leaq _ZGVbN4v_sinf_sse2(%rip), %rax
+ ret
+END (_ZGVbN4v_sinf)
+libmvec_hidden_def (_ZGVbN4v_sinf)
+
+#define _ZGVbN4v_sinf _ZGVbN4v_sinf_sse2
+#include "../svml_s_sinf4_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core_sse4.S
new file mode 100644
index 0000000000..b8b852bcae
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core_sse4.S
@@ -0,0 +1,224 @@
+/* Function sinf vectorized with SSE4.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+
+#include <sysdep.h>
+#include "svml_s_trig_data.h"
+
+ .text
+ENTRY(_ZGVbN4v_sinf_sse4)
+/*
+ ALGORITHM DESCRIPTION:
+
+ 1) Range reduction to [-Pi/2; +Pi/2] interval
+ a) Grab sign from source argument and save it.
+ b) Remove sign using AND operation
+ c) Getting octant Y by 1/Pi multiplication
+ d) Add "Right Shifter" value
+ e) Treat obtained value as integer for destination sign setting.
+ Shift first bit of this value to the last (sign) position
+ f) Change destination sign if source sign is negative
+ using XOR operation.
+ g) Subtract "Right Shifter" value
+ h) Subtract Y*PI from X argument, where PI divided to 4 parts:
+ X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
+ 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
+ a) Calculate X^2 = X * X
+ b) Calculate polynomial:
+ R = X + X * X^2 * (A3 + x^2 * (A5 + ......
+ 3) Destination sign setting
+ a) Set shifted destination sign using XOR operation:
+ R = XOR( R, S );
+ */
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $320, %rsp
+ movaps %xmm0, %xmm5
+ movq __svml_s_trig_data@GOTPCREL(%rip), %rax
+ movups __sAbsMask(%rax), %xmm2
+
+/* b) Remove sign using AND operation */
+ movaps %xmm2, %xmm4
+
+/*
+ f) Change destination sign if source sign is negative
+ using XOR operation.
+ */
+ andnps %xmm5, %xmm2
+ movups __sInvPI(%rax), %xmm1
+ andps %xmm5, %xmm4
+
+/* c) Getting octant Y by 1/Pi multiplication
+ d) Add "Right Shifter" value */
+ mulps %xmm4, %xmm1
+
+/* h) Subtract Y*PI from X argument, where PI divided to 4 parts:
+ X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4 */
+ movaps %xmm4, %xmm0
+
+/* Check for large and special values */
+ cmpnleps __sRangeReductionVal(%rax), %xmm4
+ movups __sRShifter(%rax), %xmm6
+ movups __sPI1(%rax), %xmm7
+ addps %xmm6, %xmm1
+ movmskps %xmm4, %ecx
+
+/* e) Treat obtained value as integer for destination sign setting.
+ Shift first bit of this value to the last (sign) position */
+ movaps %xmm1, %xmm3
+
+/* g) Subtract "Right Shifter" value */
+ subps %xmm6, %xmm1
+ mulps %xmm1, %xmm7
+ pslld $31, %xmm3
+ movups __sPI2(%rax), %xmm6
+ subps %xmm7, %xmm0
+ mulps %xmm1, %xmm6
+ movups __sPI3(%rax), %xmm7
+ subps %xmm6, %xmm0
+ mulps %xmm1, %xmm7
+ movups __sPI4(%rax), %xmm6
+ subps %xmm7, %xmm0
+ mulps %xmm6, %xmm1
+ subps %xmm1, %xmm0
+
+/* 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
+ a) Calculate X^2 = X * X
+ b) Calculate polynomial:
+ R = X + X * X^2 * (A3 + x^2 * (A5 + ...... */
+ movaps %xmm0, %xmm1
+ mulps %xmm0, %xmm1
+ xorps %xmm3, %xmm0
+ movups __sA9(%rax), %xmm3
+ mulps %xmm1, %xmm3
+ addps __sA7(%rax), %xmm3
+ mulps %xmm1, %xmm3
+ addps __sA5(%rax), %xmm3
+ mulps %xmm1, %xmm3
+ addps __sA3(%rax), %xmm3
+ mulps %xmm3, %xmm1
+ mulps %xmm0, %xmm1
+ addps %xmm1, %xmm0
+
+/* 3) Destination sign setting
+ a) Set shifted destination sign using XOR operation:
+ R = XOR( R, S ); */
+ xorps %xmm2, %xmm0
+ testl %ecx, %ecx
+ jne .LBL_1_3
+
+.LBL_1_2:
+ cfi_remember_state
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+
+.LBL_1_3:
+ cfi_restore_state
+ movups %xmm5, 192(%rsp)
+ movups %xmm0, 256(%rsp)
+ je .LBL_1_2
+
+ xorb %dl, %dl
+ xorl %eax, %eax
+ movups %xmm8, 112(%rsp)
+ movups %xmm9, 96(%rsp)
+ movups %xmm10, 80(%rsp)
+ movups %xmm11, 64(%rsp)
+ movups %xmm12, 48(%rsp)
+ movups %xmm13, 32(%rsp)
+ movups %xmm14, 16(%rsp)
+ movups %xmm15, (%rsp)
+ movq %rsi, 136(%rsp)
+ movq %rdi, 128(%rsp)
+ movq %r12, 168(%rsp)
+ cfi_offset_rel_rsp (12, 168)
+ movb %dl, %r12b
+ movq %r13, 160(%rsp)
+ cfi_offset_rel_rsp (13, 160)
+ movl %ecx, %r13d
+ movq %r14, 152(%rsp)
+ cfi_offset_rel_rsp (14, 152)
+ movl %eax, %r14d
+ movq %r15, 144(%rsp)
+ cfi_offset_rel_rsp (15, 144)
+ cfi_remember_state
+
+.LBL_1_6:
+ btl %r14d, %r13d
+ jc .LBL_1_12
+
+.LBL_1_7:
+ lea 1(%r14), %esi
+ btl %esi, %r13d
+ jc .LBL_1_10
+
+.LBL_1_8:
+ incb %r12b
+ addl $2, %r14d
+ cmpb $16, %r12b
+ jb .LBL_1_6
+
+ movups 112(%rsp), %xmm8
+ movups 96(%rsp), %xmm9
+ movups 80(%rsp), %xmm10
+ movups 64(%rsp), %xmm11
+ movups 48(%rsp), %xmm12
+ movups 32(%rsp), %xmm13
+ movups 16(%rsp), %xmm14
+ movups (%rsp), %xmm15
+ movq 136(%rsp), %rsi
+ movq 128(%rsp), %rdi
+ movq 168(%rsp), %r12
+ cfi_restore (%r12)
+ movq 160(%rsp), %r13
+ cfi_restore (%r13)
+ movq 152(%rsp), %r14
+ cfi_restore (%r14)
+ movq 144(%rsp), %r15
+ cfi_restore (%r15)
+ movups 256(%rsp), %xmm0
+ jmp .LBL_1_2
+
+.LBL_1_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ movss 196(%rsp,%r15,8), %xmm0
+
+ call sinf@PLT
+
+ movss %xmm0, 260(%rsp,%r15,8)
+ jmp .LBL_1_8
+
+.LBL_1_12:
+ movzbl %r12b, %r15d
+ movss 192(%rsp,%r15,8), %xmm0
+
+ call sinf@PLT
+
+ movss %xmm0, 256(%rsp,%r15,8)
+ jmp .LBL_1_7
+
+END(_ZGVbN4v_sinf_sse4)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core.S
new file mode 100644
index 0000000000..b28bf3cabc
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core.S
@@ -0,0 +1,38 @@
+/* Multiple versions of vectorized sinf, vector length is 8.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+ .text
+ENTRY (_ZGVdN8v_sinf)
+ .type _ZGVdN8v_sinf, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq _ZGVdN8v_sinf_avx2(%rip), %rax
+ testl $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+ jz 2f
+ ret
+2: leaq _ZGVdN8v_sinf_sse_wrapper(%rip), %rax
+ ret
+END (_ZGVdN8v_sinf)
+libmvec_hidden_def (_ZGVdN8v_sinf)
+
+#define _ZGVdN8v_sinf _ZGVdN8v_sinf_sse_wrapper
+#include "../svml_s_sinf8_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core_avx2.S
new file mode 100644
index 0000000000..a130d25fce
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core_avx2.S
@@ -0,0 +1,219 @@
+/* Function sinf vectorized with AVX2.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "svml_s_trig_data.h"
+
+ .text
+ENTRY(_ZGVdN8v_sinf_avx2)
+/*
+ ALGORITHM DESCRIPTION:
+
+ 1) Range reduction to [-Pi/2; +Pi/2] interval
+ a) Grab sign from source argument and save it.
+ b) Remove sign using AND operation
+ c) Getting octant Y by 1/Pi multiplication
+ d) Add "Right Shifter" value
+ e) Treat obtained value as integer for destination sign setting.
+ Shift first bit of this value to the last (sign) position
+ f) Change destination sign if source sign is negative
+ using XOR operation.
+ g) Subtract "Right Shifter" value
+ h) Subtract Y*PI from X argument, where PI divided to 4 parts:
+ X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
+ 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
+ a) Calculate X^2 = X * X
+ b) Calculate polynomial:
+ R = X + X * X^2 * (A3 + x^2 * (A5 + ......
+ 3) Destination sign setting
+ a) Set shifted destination sign using XOR operation:
+ R = XOR( R, S );
+ */
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $448, %rsp
+ movq __svml_s_trig_data@GOTPCREL(%rip), %rax
+ vmovdqa %ymm0, %ymm5
+ vmovups __sAbsMask(%rax), %ymm3
+ vmovups __sInvPI(%rax), %ymm7
+ vmovups __sRShifter(%rax), %ymm0
+ vmovups __sPI1_FMA(%rax), %ymm1
+
+/* b) Remove sign using AND operation */
+ vandps %ymm3, %ymm5, %ymm4
+
+/*
+ c) Getting octant Y by 1/Pi multiplication
+ d) Add "Right Shifter" value
+ */
+ vfmadd213ps %ymm0, %ymm4, %ymm7
+
+/* g) Subtract "Right Shifter" value */
+ vsubps %ymm0, %ymm7, %ymm2
+
+/*
+ e) Treat obtained value as integer for destination sign setting.
+ Shift first bit of this value to the last (sign) position
+ */
+ vpslld $31, %ymm7, %ymm6
+
+/*
+  h) Subtract Y*PI from X argument, where PI divided to 3 parts:
+ X = X - Y*PI1 - Y*PI2 - Y*PI3;
+ */
+ vmovdqa %ymm4, %ymm0
+ vfnmadd231ps %ymm2, %ymm1, %ymm0
+
+/* Check for large and special values */
+ vcmpnle_uqps __sRangeReductionVal(%rax), %ymm4, %ymm4
+ vfnmadd231ps __sPI2_FMA(%rax), %ymm2, %ymm0
+ vfnmadd132ps __sPI3_FMA(%rax), %ymm0, %ymm2
+
+/*
+ 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
+ a) Calculate X^2 = X * X
+ b) Calculate polynomial:
+ R = X + X * X^2 * (A3 + x^2 * (A5 + ......
+ */
+ vmulps %ymm2, %ymm2, %ymm1
+
+/*
+ f) Change destination sign if source sign is negative
+ using XOR operation.
+ */
+ vandnps %ymm5, %ymm3, %ymm0
+ vxorps %ymm6, %ymm2, %ymm3
+ vmovups __sA9(%rax), %ymm2
+ vfmadd213ps __sA7(%rax), %ymm1, %ymm2
+ vfmadd213ps __sA5(%rax), %ymm1, %ymm2
+ vfmadd213ps __sA3(%rax), %ymm1, %ymm2
+ vmulps %ymm1, %ymm2, %ymm6
+ vfmadd213ps %ymm3, %ymm3, %ymm6
+ vmovmskps %ymm4, %ecx
+
+/*
+ 3) Destination sign setting
+ a) Set shifted destination sign using XOR operation:
+ R = XOR( R, S );
+ */
+ vxorps %ymm0, %ymm6, %ymm0
+ testl %ecx, %ecx
+ jne .LBL_1_3
+
+.LBL_1_2:
+ cfi_remember_state
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+
+.LBL_1_3:
+ cfi_restore_state
+ vmovups %ymm5, 320(%rsp)
+ vmovups %ymm0, 384(%rsp)
+ je .LBL_1_2
+
+ xorb %dl, %dl
+ xorl %eax, %eax
+ vmovups %ymm8, 224(%rsp)
+ vmovups %ymm9, 192(%rsp)
+ vmovups %ymm10, 160(%rsp)
+ vmovups %ymm11, 128(%rsp)
+ vmovups %ymm12, 96(%rsp)
+ vmovups %ymm13, 64(%rsp)
+ vmovups %ymm14, 32(%rsp)
+ vmovups %ymm15, (%rsp)
+ movq %rsi, 264(%rsp)
+ movq %rdi, 256(%rsp)
+ movq %r12, 296(%rsp)
+ cfi_offset_rel_rsp (12, 296)
+ movb %dl, %r12b
+ movq %r13, 288(%rsp)
+ cfi_offset_rel_rsp (13, 288)
+ movl %ecx, %r13d
+ movq %r14, 280(%rsp)
+ cfi_offset_rel_rsp (14, 280)
+ movl %eax, %r14d
+ movq %r15, 272(%rsp)
+ cfi_offset_rel_rsp (15, 272)
+ cfi_remember_state
+
+.LBL_1_6:
+ btl %r14d, %r13d
+ jc .LBL_1_12
+
+.LBL_1_7:
+ lea 1(%r14), %esi
+ btl %esi, %r13d
+ jc .LBL_1_10
+
+.LBL_1_8:
+ incb %r12b
+ addl $2, %r14d
+ cmpb $16, %r12b
+ jb .LBL_1_6
+
+ vmovups 224(%rsp), %ymm8
+ vmovups 192(%rsp), %ymm9
+ vmovups 160(%rsp), %ymm10
+ vmovups 128(%rsp), %ymm11
+ vmovups 96(%rsp), %ymm12
+ vmovups 64(%rsp), %ymm13
+ vmovups 32(%rsp), %ymm14
+ vmovups (%rsp), %ymm15
+ vmovups 384(%rsp), %ymm0
+ movq 264(%rsp), %rsi
+ movq 256(%rsp), %rdi
+ movq 296(%rsp), %r12
+ cfi_restore (%r12)
+ movq 288(%rsp), %r13
+ cfi_restore (%r13)
+ movq 280(%rsp), %r14
+ cfi_restore (%r14)
+ movq 272(%rsp), %r15
+ cfi_restore (%r15)
+ jmp .LBL_1_2
+
+.LBL_1_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ vmovss 324(%rsp,%r15,8), %xmm0
+ vzeroupper
+
+ call sinf@PLT
+
+ vmovss %xmm0, 388(%rsp,%r15,8)
+ jmp .LBL_1_8
+
+.LBL_1_12:
+ movzbl %r12b, %r15d
+ vmovss 320(%rsp,%r15,8), %xmm0
+ vzeroupper
+
+ call sinf@PLT
+
+ vmovss %xmm0, 384(%rsp,%r15,8)
+ jmp .LBL_1_7
+
+END(_ZGVdN8v_sinf_avx2)