Fixed several libmvec bugs found during testing on KNL hardware.

AVX512 IFUNC implementations, implementations of wrappers to AVX2 versions and KNL expf implementation fixed. * sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S: Fixed AVX512 IFUNC. * sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S: Likewise. * sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S: Likewise. * sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S: Likewise. * sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S: Likewise. * sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S: Likewise. * sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S: Likewise. * sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S: Likewise. * sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S: Likewise. * sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S: Likewise. * sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S: Likewise. * sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S: Likewise. * sysdeps/x86_64/fpu/svml_d_wrapper_impl.h: Fixed wrappers to AVX2. * sysdeps/x86_64/fpu/svml_s_wrapper_impl.h: Likewise. * sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S: Fixed KNL implementation.
author: Andrew Senkevich <andrew.senkevich@intel.com> 2015-07-24 14:47:23 +0300
committer: Andrew Senkevich <andrew.senkevich@intel.com> 2015-07-24 14:47:23 +0300
commit: 99017161354321845d11dce4fcd3abfebc5dd0d5 (patch)
tree: 50c62fe44aef915a84b1eb5fb0ad787e39f5a210
parent: 3bcea719ddd6ce399d7bccb492c40af77d216e42 (diff)
16 files changed, 220 insertions, 223 deletions
diff --git a/ChangeLog b/ChangeLog
index 6f6016db43..3e22413e8a 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,22 @@
+2015-07-24  Andrew Senkevich  <andrew.senkevich@intel.com>
+
+	* sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S: Fixed AVX512 IFUNC.
+	* sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S: Likewise.
+	* sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S: Likewise.
+	* sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S: Likewise.
+	* sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S: Likewise.
+	* sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S: Likewise.
+	* sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S: Likewise.
+	* sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S: Likewise.
+	* sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S: Likewise.
+	* sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S: Likewise.
+	* sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S: Likewise.
+	* sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S: Likewise.
+	* sysdeps/x86_64/fpu/svml_d_wrapper_impl.h: Fixed wrappers to AVX2.
+	* sysdeps/x86_64/fpu/svml_s_wrapper_impl.h: Likewise.
+	* sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S: Fixed KNL
+	implementation.
+
 2015-07-24  Szabolcs Nagy  <szabolcs.nagy@arm.com>
 
 	[BZ #17711]
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S
index ba3b66f69f..d0f4f27f46 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN8v_cos)
         .type   _ZGVeN8v_cos, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN8v_cos_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN8v_cos_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN8v_cos_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN8v_cos_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN8v_cos)
 
 #define _ZGVeN8v_cos _ZGVeN8v_cos_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S
index 8f837fbfb9..7b7c07d926 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN8v_exp)
         .type   _ZGVeN8v_exp, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN8v_exp_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN8v_exp_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN8v_exp_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN8v_exp_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN8v_exp)
 
 #define _ZGVeN8v_exp _ZGVeN8v_exp_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S
index 2f9e9d8892..76375fdae0 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN8v_log)
         .type   _ZGVeN8v_log, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN8v_log_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN8v_log_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN8v_log_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN8v_log_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN8v_log)
 
 #define _ZGVeN8v_log _ZGVeN8v_log_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S
index 3b11511e51..c1e5e76f92 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN8vv_pow)
         .type   _ZGVeN8vv_pow, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN8vv_pow_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN8vv_pow_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN8vv_pow_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN8vv_pow_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN8vv_pow)
 
 #define _ZGVeN8vv_pow _ZGVeN8vv_pow_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S
index ba631020f3..131f2f47c5 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN8v_sin)
         .type   _ZGVeN8v_sin, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN8v_sin_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN8v_sin_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN8v_sin_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN8v_sin_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN8v_sin)
 
 #define _ZGVeN8v_sin _ZGVeN8v_sin_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S
index 7228ba549a..e33109099e 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN8vvv_sincos)
         .type   _ZGVeN8vvv_sincos, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN8vvv_sincos_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN8vvv_sincos_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN8vvv_sincos_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN8vvv_sincos_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN8vvv_sincos)
 
 #define _ZGVeN8vvv_sincos _ZGVeN8vvv_sincos_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S
index 91564de22a..0654d3c19b 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN16v_cosf)
         .type   _ZGVeN16v_cosf, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN16v_cosf_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN16v_cosf_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN16v_cosf_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN16v_cosf_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN16v_cosf)
 
 #define _ZGVeN16v_cosf _ZGVeN16v_cosf_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S
index 3b3489d05a..62858eb39e 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN16v_expf)
         .type   _ZGVeN16v_expf, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN16v_expf_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN16v_expf_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN16v_expf_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN16v_expf_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN16v_expf)
 
 #define _ZGVeN16v_expf _ZGVeN16v_expf_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
index cb807e0757..ec69055351 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
@@ -46,6 +46,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf
      The table lookup is skipped if k = 0.
      For low accuracy approximation, exp(r) ~ 1 or 1+r.  */
 
+        pushq     %rbp
         cfi_adjust_cfa_offset (8)
         cfi_rel_offset (%rbp, 0)
         movq      %rsp, %rbp
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S
index 8756750c86..68c57e4386 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN16v_logf)
         .type   _ZGVeN16v_logf, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN16v_logf_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN16v_logf_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN16v_logf_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN16v_logf_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN16v_logf)
 
 #define _ZGVeN16v_logf _ZGVeN16v_logf_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S
index a4ba4fbc04..3aa9f952ce 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN16vv_powf)
         .type   _ZGVeN16vv_powf, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN16vv_powf_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN16vv_powf_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN16vv_powf_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN16vv_powf_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN16vv_powf)
 
 #define _ZGVeN16vv_powf _ZGVeN16vv_powf_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S
index 0a1753eab7..bdcabab6e2 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN16vvv_sincosf)
         .type   _ZGVeN16vvv_sincosf, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN16vvv_sincosf_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN16vvv_sincosf_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN16vvv_sincosf_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN16vvv_sincosf_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN16vvv_sincosf)
 
 #define _ZGVeN16vvv_sincosf _ZGVeN16vvv_sincosf_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S
index 7ed637b8e6..3ec78a0b5e 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN16v_sinf)
         .type   _ZGVeN16v_sinf, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN16v_sinf_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN16v_sinf_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN16v_sinf_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN16v_sinf_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN16v_sinf)
 
 #define _ZGVeN16v_sinf _ZGVeN16v_sinf_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h b/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h
index bd93b8edfa..5c0ff897c0 100644
--- a/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h
+++ b/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h
@@ -194,39 +194,39 @@
 
 /* AVX512 ISA version as wrapper to AVX2 ISA version.  */
 .macro WRAPPER_IMPL_AVX512 callee
-        pushq	%rbp
+        pushq     %rbp
         cfi_adjust_cfa_offset (8)
         cfi_rel_offset (%rbp, 0)
-        movq	%rsp, %rbp
+        movq      %rsp, %rbp
         cfi_def_cfa_register (%rbp)
-        andq	$-64, %rsp
-        subq	$64, %rsp
-/* Below is encoding for vmovaps %zmm0, (%rsp).  */
-        .byte	0x62
-        .byte	0xf1
-        .byte	0x7c
-        .byte	0x48
-        .byte	0x29
-        .byte	0x04
-        .byte	0x24
-/* Below is encoding for vmovapd (%rsp), %ymm0.  */
-        .byte	0xc5
-        .byte	0xfd
-        .byte	0x28
-        .byte	0x04
-        .byte	0x24
-        call	HIDDEN_JUMPTARGET(\callee)
-/* Below is encoding for vmovapd 32(%rsp), %ymm0.  */
-        .byte	0xc5
-        .byte	0xfd
-        .byte	0x28
-        .byte	0x44
-        .byte	0x24
-        .byte	0x20
-        call	HIDDEN_JUMPTARGET(\callee)
-        movq	%rbp, %rsp
+        andq      $-64, %rsp
+        subq      $128, %rsp
+/* Below is encoding for vmovups %zmm0, (%rsp).  */
+        .byte   0x62
+        .byte   0xf1
+        .byte   0x7c
+        .byte   0x48
+        .byte   0x11
+        .byte   0x04
+        .byte   0x24
+        vmovupd   (%rsp), %ymm0
+        call      HIDDEN_JUMPTARGET(\callee)
+        vmovupd   %ymm0, 64(%rsp)
+        vmovupd   32(%rsp), %ymm0
+        call      HIDDEN_JUMPTARGET(\callee)
+        vmovupd   %ymm0, 96(%rsp)
+/* Below is encoding for vmovups 64(%rsp), %zmm0.  */
+        .byte   0x62
+        .byte   0xf1
+        .byte   0x7c
+        .byte   0x48
+        .byte   0x10
+        .byte   0x44
+        .byte   0x24
+        .byte   0x01
+        movq      %rbp, %rsp
         cfi_def_cfa_register (%rsp)
-        popq	%rbp
+        popq      %rbp
         cfi_adjust_cfa_offset (-8)
         cfi_restore (%rbp)
         ret
@@ -234,61 +234,50 @@
 
 /* 2 argument AVX512 ISA version as wrapper to AVX2 ISA version.  */
 .macro WRAPPER_IMPL_AVX512_ff callee
-        pushq	%rbp
+        pushq     %rbp
         cfi_adjust_cfa_offset (8)
         cfi_rel_offset (%rbp, 0)
-        movq	%rsp, %rbp
+        movq      %rsp, %rbp
         cfi_def_cfa_register (%rbp)
-        andq	$-64, %rsp
-        subq	$128, %rsp
-/* Below is encoding for vmovaps %zmm0, (%rsp).  */
-        .byte	0x62
-        .byte	0xf1
-        .byte	0x7c
-        .byte	0x48
-        .byte	0x29
-        .byte	0x04
-        .byte	0x24
-/* Below is encoding for vmovaps %zmm1, 64(%rsp).  */
-        .byte	0x62
-        .byte	0xf1
-        .byte	0x7c
-        .byte	0x48
-        .byte	0x29
-        .byte	0x4c
-        .byte	0x24
-/* Below is encoding for vmovapd (%rsp), %ymm0.  */
-        .byte	0xc5
-        .byte	0xfd
-        .byte	0x28
-        .byte	0x04
-        .byte	0x24
-/* Below is encoding for vmovapd 64(%rsp), %ymm1.  */
-        .byte	0xc5
-        .byte	0xfd
-        .byte	0x28
-        .byte	0x4c
-        .byte	0x24
-        .byte	0x40
-        call	HIDDEN_JUMPTARGET(\callee)
-/* Below is encoding for vmovapd 32(%rsp), %ymm0.  */
-        .byte	0xc5
-        .byte	0xfd
-        .byte	0x28
-        .byte	0x44
-        .byte	0x24
-        .byte	0x20
-/* Below is encoding for vmovapd 96(%rsp), %ymm1.  */
-        .byte	0xc5
-        .byte	0xfd
-        .byte	0x28
-        .byte	0x4c
-        .byte	0x24
-        .byte	0x60
-        call	HIDDEN_JUMPTARGET(\callee)
-        movq	%rbp, %rsp
+        andq      $-64, %rsp
+        subq      $192, %rsp
+/* Below is encoding for vmovups %zmm0, (%rsp).  */
+        .byte   0x62
+        .byte   0xf1
+        .byte   0x7c
+        .byte   0x48
+        .byte   0x11
+        .byte   0x04
+        .byte   0x24
+/* Below is encoding for vmovups %zmm1, 64(%rsp).  */
+        .byte   0x62
+        .byte   0xf1
+        .byte   0x7c
+        .byte   0x48
+        .byte   0x11
+        .byte   0x4c
+        .byte   0x24
+        .byte   0x01
+        vmovupd   (%rsp), %ymm0
+        vmovupd   64(%rsp), %ymm1
+        call      HIDDEN_JUMPTARGET(\callee)
+        vmovupd   %ymm0, 128(%rsp)
+        vmovupd   32(%rsp), %ymm0
+        vmovupd   96(%rsp), %ymm1
+        call      HIDDEN_JUMPTARGET(\callee)
+        vmovupd   %ymm0, 160(%rsp)
+/* Below is encoding for vmovups 128(%rsp), %zmm0.  */
+        .byte   0x62
+        .byte   0xf1
+        .byte   0x7c
+        .byte   0x48
+        .byte   0x10
+        .byte   0x44
+        .byte   0x24
+        .byte   0x02
+        movq      %rbp, %rsp
         cfi_def_cfa_register (%rsp)
-        popq	%rbp
+        popq      %rbp
         cfi_adjust_cfa_offset (-8)
         cfi_restore (%rbp)
         ret
@@ -310,61 +299,26 @@
         cfi_rel_offset (%r13, 0)
         subq      $176, %rsp
         movq      %rsi, %r13
-/* Below is encoding for vmovaps %zmm0, (%rsp).  */
+/* Below is encoding for vmovups %zmm0, (%rsp).  */
         .byte	0x62
         .byte	0xf1
         .byte	0x7c
         .byte	0x48
-        .byte	0x29
+        .byte	0x11
         .byte	0x04
         .byte	0x24
         movq    %rdi, %r12
-/* Below is encoding for vmovapd (%rsp), %ymm0.  */
-        .byte	0xc5
-        .byte	0xfd
-        .byte	0x28
-        .byte	0x04
-        .byte	0x24
+        vmovupd (%rsp), %ymm0
         call      HIDDEN_JUMPTARGET(\callee)
-/* Below is encoding for vmovapd 32(%rsp), %ymm0.  */
-        .byte	0xc5
-        .byte	0xfd
-        .byte	0x28
-        .byte	0x44
-        .byte	0x24
-        .byte	0x20
+        vmovupd   32(%rsp), %ymm0
         lea       64(%rsp), %rdi
         lea       96(%rsp), %rsi
         call      HIDDEN_JUMPTARGET(\callee)
-/* Below is encoding for vmovapd 64(%rsp), %ymm0.  */
-        .byte	0xc5
-        .byte	0xfd
-        .byte	0x28
-        .byte	0x44
-        .byte	0x24
-        .byte	0x40
-/* Below is encoding for vmovapd   96(%rsp), %ymm1.  */
-        .byte	0xc5
-        .byte	0xfd
-        .byte	0x28
-        .byte	0x4c
-        .byte	0x24
-        .byte	0x60
-/* Below is encoding for vmovapd   %ymm0, 32(%r12).  */
-        .byte	0xc4
-        .byte	0xc1
-        .byte	0x7d
-        .byte	0x29
-        .byte	0x44
-        .byte	0x24
-        .byte	0x20
-/* Below is encoding for vmovapd   %ymm1, 32(%r13).  */
-        .byte	0xc4
-        .byte	0xc1
-        .byte	0x7d
-        .byte	0x29
-        .byte	0x4d
-        .byte	0x20
+        vmovupd   64(%rsp), %ymm0
+        vmovupd   96(%rsp), %ymm1
+        vmovupd   %ymm0, 32(%r12)
+        vmovupd   %ymm1, 32(%r13)
+        vzeroupper
         addq      $176, %rsp
         popq      %r13
         cfi_adjust_cfa_offset (-8)
diff --git a/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h b/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h
index 66bb081c9d..d255d195ee 100644
--- a/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h
+++ b/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h
@@ -239,28 +239,39 @@
 
 /* AVX512 ISA version as wrapper to AVX2 ISA version.  */
 .macro WRAPPER_IMPL_AVX512 callee
-        pushq	%rbp
+        pushq     %rbp
         cfi_adjust_cfa_offset (8)
         cfi_rel_offset (%rbp, 0)
-        movq	%rsp, %rbp
+        movq      %rsp, %rbp
         cfi_def_cfa_register (%rbp)
-        andq	$-64, %rsp
-        subq	$64, %rsp
-/* Below is encoding for vmovaps %zmm0, (%rsp).  */
-        .byte	0x62
-        .byte	0xf1
-        .byte	0x7c
-        .byte	0x48
-        .byte	0x29
-        .byte	0x04
-        .byte	0x24
-        vmovaps (%rsp), %ymm0
-        call	HIDDEN_JUMPTARGET(\callee)
-        vmovaps 32(%rsp), %ymm0
-        call	HIDDEN_JUMPTARGET(\callee)
-        movq	%rbp, %rsp
+        andq      $-64, %rsp
+        subq      $128, %rsp
+/* Below is encoding for vmovups %zmm0, (%rsp).  */
+        .byte   0x62
+        .byte   0xf1
+        .byte   0x7c
+        .byte   0x48
+        .byte   0x11
+        .byte   0x04
+        .byte   0x24
+        vmovupd   (%rsp), %ymm0
+        call      HIDDEN_JUMPTARGET(\callee)
+        vmovupd   %ymm0, 64(%rsp)
+        vmovupd   32(%rsp), %ymm0
+        call      HIDDEN_JUMPTARGET(\callee)
+        vmovupd   %ymm0, 96(%rsp)
+/* Below is encoding for vmovups 64(%rsp), %zmm0.  */
+        .byte   0x62
+        .byte   0xf1
+        .byte   0x7c
+        .byte   0x48
+        .byte   0x10
+        .byte   0x44
+        .byte   0x24
+        .byte   0x01
+        movq      %rbp, %rsp
         cfi_def_cfa_register (%rsp)
-        popq	%rbp
+        popq      %rbp
         cfi_adjust_cfa_offset (-8)
         cfi_restore (%rbp)
         ret
@@ -274,29 +285,41 @@
         movq      %rsp, %rbp
         cfi_def_cfa_register (%rbp)
         andq      $-64, %rsp
-        subq      $128, %rsp
-/* Below is encoding for vmovaps %zmm0, (%rsp).  */
-        .byte	0x62
-        .byte	0xf1
-        .byte	0x7c
-        .byte	0x48
-        .byte	0x29
-        .byte	0x04
-        .byte	0x24
-/* Below is encoding for vmovaps %zmm1, 64(%rsp).  */
-        .byte	0x62
-        .byte	0xf1
-        .byte	0x7c
-        .byte	0x48
-        .byte	0x29
-        .byte	0x4c
-        .byte	0x24
-        vmovaps (%rsp), %ymm0
-        vmovaps 64(%rsp), %ymm1
+        subq      $192, %rsp
+/* Below is encoding for vmovups %zmm0, (%rsp).  */
+        .byte   0x62
+        .byte   0xf1
+        .byte   0x7c
+        .byte   0x48
+        .byte   0x11
+        .byte   0x04
+        .byte   0x24
+/* Below is encoding for vmovups %zmm1, 64(%rsp).  */
+        .byte   0x62
+        .byte   0xf1
+        .byte   0x7c
+        .byte   0x48
+        .byte   0x11
+        .byte   0x4c
+        .byte   0x24
+        .byte   0x01
+        vmovups   (%rsp), %ymm0
+        vmovups   64(%rsp), %ymm1
         call      HIDDEN_JUMPTARGET(\callee)
-        vmovaps 32(%rsp), %ymm0
-        vmovaps 96(%rsp), %ymm1
+        vmovups   %ymm0, 128(%rsp)
+        vmovups   32(%rsp), %ymm0
+        vmovups   96(%rsp), %ymm1
         call      HIDDEN_JUMPTARGET(\callee)
+        vmovups   %ymm0, 160(%rsp)
+/* Below is encoding for vmovups 128(%rsp), %zmm0.  */
+        .byte   0x62
+        .byte   0xf1
+        .byte   0x7c
+        .byte   0x48
+        .byte   0x10
+        .byte   0x44
+        .byte   0x24
+        .byte   0x02
         movq      %rbp, %rsp
         cfi_def_cfa_register (%rsp)
         popq      %rbp
author	Andrew Senkevich <andrew.senkevich@intel.com>	2015-07-24 14:47:23 +0300
committer	Andrew Senkevich <andrew.senkevich@intel.com>	2015-07-24 14:47:23 +0300
commit	99017161354321845d11dce4fcd3abfebc5dd0d5 (patch)
tree	50c62fe44aef915a84b1eb5fb0ad787e39f5a210
parent	3bcea719ddd6ce399d7bccb492c40af77d216e42 (diff)