diff options
author | Samuel Thibault <samuel.thibault@ens-lyon.org> | 2018-12-27 20:07:48 +0000 |
---|---|---|
committer | Samuel Thibault <samuel.thibault@ens-lyon.org> | 2018-12-27 20:07:48 +0000 |
commit | ad19c620767174a56d7b25615d3a54b7486f1cca (patch) | |
tree | 113406f3d7178f4979abc5a4960bb6f979648e9b /sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S | |
parent | e1777dde9106cbd223b56d6ddcf8464093679c81 (diff) | |
parent | 5fbd1a9aad274b17f2788dd22e5306cba07dab71 (diff) |
Merge commit 'refs/top-bases/tschwinge/Roger_Whittaker' into tschwinge/Roger_Whittaker
Diffstat (limited to 'sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S')
-rw-r--r-- | sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S | 187 |
1 files changed, 171 insertions, 16 deletions
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S index 44700f90b8..2df626c0c1 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S @@ -1,5 +1,5 @@ /* Function sincos vectorized with AVX-512. KNL and SKX versions. - Copyright (C) 2014-2016 Free Software Foundation, Inc. + Copyright (C) 2014-2018 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -36,9 +36,9 @@ sin(R), sin(R') are approximated by corresponding polynomial. */ .text -ENTRY (_ZGVeN8vvv_sincos_knl) -#ifndef HAVE_AVX512_ASM_SUPPORT -WRAPPER_IMPL_AVX512_fFF _ZGVdN4vvv_sincos +ENTRY (_ZGVeN8vl8l8_sincos_knl) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512_fFF _ZGVdN4vl8l8_sincos #else pushq %rbp cfi_adjust_cfa_offset (8) @@ -278,12 +278,12 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN4vvv_sincos shlq $4, %r15 vmovsd 1160(%rsp,%r15), %xmm0 - call sin@PLT + call JUMPTARGET(sin) vmovsd %xmm0, 1224(%rsp,%r15) vmovsd 1160(%rsp,%r15), %xmm0 - call cos@PLT + call JUMPTARGET(cos) vmovsd %xmm0, 1288(%rsp,%r15) jmp .LBL_1_8 @@ -293,22 +293,23 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN4vvv_sincos shlq $4, %r15 vmovsd 1152(%rsp,%r15), %xmm0 - call sin@PLT + call JUMPTARGET(sin) vmovsd %xmm0, 1216(%rsp,%r15) vmovsd 1152(%rsp,%r15), %xmm0 - call cos@PLT + call JUMPTARGET(cos) vmovsd %xmm0, 1280(%rsp,%r15) jmp .LBL_1_7 #endif -END (_ZGVeN8vvv_sincos_knl) +END (_ZGVeN8vl8l8_sincos_knl) +libmvec_hidden_def(_ZGVeN8vl8l8_sincos_knl) -ENTRY (_ZGVeN8vvv_sincos_skx) -#ifndef HAVE_AVX512_ASM_SUPPORT -WRAPPER_IMPL_AVX512_fFF _ZGVdN4vvv_sincos +ENTRY (_ZGVeN8vl8l8_sincos_skx) +#ifndef HAVE_AVX512DQ_ASM_SUPPORT +WRAPPER_IMPL_AVX512_fFF _ZGVdN4vl8l8_sincos #else pushq %rbp cfi_adjust_cfa_offset (8) @@ -557,12 +558,12 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN4vvv_sincos vzeroupper vmovsd 1160(%rsp,%r15), %xmm0 - call sin@PLT + call JUMPTARGET(sin) vmovsd %xmm0, 1224(%rsp,%r15) vmovsd 1160(%rsp,%r15), %xmm0 - call cos@PLT + call JUMPTARGET(cos) vmovsd %xmm0, 1288(%rsp,%r15) jmp .LBL_2_8 @@ -574,17 +575,171 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN4vvv_sincos vzeroupper vmovsd 1152(%rsp,%r15), %xmm0 - call sin@PLT + call JUMPTARGET(sin) vmovsd %xmm0, 1216(%rsp,%r15) vmovsd 1152(%rsp,%r15), %xmm0 - call cos@PLT + call JUMPTARGET(cos) vmovsd %xmm0, 1280(%rsp,%r15) jmp .LBL_2_7 #endif +END (_ZGVeN8vl8l8_sincos_skx) +libmvec_hidden_def(_ZGVeN8vl8l8_sincos_skx) + +/* Wrapper between vvv and vl8l8 vector variants. */ +.macro WRAPPER_AVX512_vvv_vl8l8 callee +#ifndef __ILP32__ + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $256, %rsp + vmovups %zmm1, 128(%rsp) + lea (%rsp), %rdi + vmovups %zmm2, 192(%rdi) + lea 64(%rsp), %rsi + call HIDDEN_JUMPTARGET(\callee) + movq 128(%rsp), %rdx + movq 136(%rsp), %rsi + movq 144(%rsp), %r8 + movq 152(%rsp), %r10 + movq (%rsp), %rax + movq 8(%rsp), %rcx + movq 16(%rsp), %rdi + movq 24(%rsp), %r9 + movq %rax, (%rdx) + movq %rcx, (%rsi) + movq 160(%rsp), %rax + movq 168(%rsp), %rcx + movq %rdi, (%r8) + movq %r9, (%r10) + movq 176(%rsp), %rdi + movq 184(%rsp), %r9 + movq 32(%rsp), %r11 + movq 40(%rsp), %rdx + movq 48(%rsp), %rsi + movq 56(%rsp), %r8 + movq %r11, (%rax) + movq %rdx, (%rcx) + movq 192(%rsp), %r11 + movq 200(%rsp), %rdx + movq %rsi, (%rdi) + movq %r8, (%r9) + movq 208(%rsp), %rsi + movq 216(%rsp), %r8 + movq 64(%rsp), %r10 + movq 72(%rsp), %rax + movq 80(%rsp), %rcx + movq 88(%rsp), %rdi + movq %r10, (%r11) + movq %rax, (%rdx) + movq 224(%rsp), %r10 + movq 232(%rsp), %rax + movq %rcx, (%rsi) + movq %rdi, (%r8) + movq 240(%rsp), %rcx + movq 248(%rsp), %rdi + movq 96(%rsp), %r9 + movq 104(%rsp), %r11 + movq 112(%rsp), %rdx + movq 120(%rsp), %rsi + movq %r9, (%r10) + movq %r11, (%rax) + movq %rdx, (%rcx) + movq %rsi, (%rdi) + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +#else + leal 8(%rsp), %r10d + .cfi_def_cfa 10, 0 + andl $-64, %esp + pushq -8(%r10d) + pushq %rbp + .cfi_escape 0x10,0x6,0x2,0x76,0 + movl %esp, %ebp + pushq %r10 + .cfi_escape 0xf,0x3,0x76,0x78,0x6 + leal -112(%rbp), %esi + leal -176(%rbp), %edi + subl $232, %esp + vmovdqa %ymm1, -208(%ebp) + vmovdqa %ymm2, -240(%ebp) + call HIDDEN_JUMPTARGET(\callee) + vmovdqa -208(%ebp), %xmm0 + vmovq %xmm0, %rax + vmovsd -176(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + shrq $32, %rax + vmovsd -168(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movq -200(%ebp), %rax + vmovsd -160(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + shrq $32, %rax + vmovsd -152(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movq -192(%ebp), %rax + vmovsd -144(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + shrq $32, %rax + vmovsd -136(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movq -184(%ebp), %rax + vmovsd -128(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + shrq $32, %rax + vmovsd -120(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + vmovdqa -240(%ebp), %xmm0 + vmovq %xmm0, %rax + vmovsd -112(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + shrq $32, %rax + vmovsd -104(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movq -232(%ebp), %rax + vmovsd -96(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + shrq $32, %rax + vmovsd -88(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movq -224(%ebp), %rax + vmovsd -80(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + shrq $32, %rax + vmovsd -72(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + movq -216(%ebp), %rax + vmovsd -64(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + shrq $32, %rax + vmovsd -56(%ebp), %xmm0 + vmovsd %xmm0, (%eax) + addl $232, %esp + popq %r10 + .cfi_def_cfa 10, 0 + popq %rbp + leal -8(%r10), %esp + .cfi_def_cfa 7, 8 + ret +#endif +.endm + +ENTRY (_ZGVeN8vvv_sincos_knl) +WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_knl +END (_ZGVeN8vvv_sincos_knl) + +ENTRY (_ZGVeN8vvv_sincos_skx) +WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_skx END (_ZGVeN8vvv_sincos_skx) .section .rodata, "a" |