Diffstat (limited to 'sysdeps/x86_64/fpu/svml_s_sincosf4_core.S')
-rw-r--r--  sysdeps/x86_64/fpu/svml_s_sincosf4_core.S | 128
1 file changed, 125 insertions(+), 3 deletions(-)
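
The hunks below rename the SSE2 entry point to _ZGVbN4vl4l4_sincosf and re-add _ZGVbN4vvv_sincosf as a wrapper (WRAPPER_IMPL_SSE2_fFF_vvv) that calls the scalar callee once per lane and stores each result through the pointer arguments, as needed for a vector function declared with "#pragma omp declare simd notinbranch". In the vector-ABI mangling, "vvv" means the argument and both output pointers are passed as vectors of lanes, while "l4l4" marks the two output pointers as linear with a 4-byte stride. A minimal caller sketch follows; it assumes libmvec is in use, and the build flags and the sincosf_array/x/s/c names are illustrative rather than taken from this patch:

    /* With glibc's libmvec and a compiler that honours the math.h SIMD
       declarations (for example something like "gcc -O2 -ffast-math
       -fopenmp-simd" on x86-64), a scalar loop of this shape may be
       auto-vectorized into calls to _ZGVbN4vvv_sincosf, which this file
       implements by wrapping the scalar sincosf.  */
    #define _GNU_SOURCE
    #include <math.h>

    void
    sincosf_array (const float *x, float *s, float *c, int n)
    {
      for (int i = 0; i < n; i++)
        /* Scalar call; a candidate for replacement by the vector variant.  */
        sincosf (x[i], &s[i], &c[i]);
    }
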
diff --git a/sysdeps/x86_64/fpu/svml_s_sincosf4_core.S b/sysdeps/x86_64/fpu/svml_s_sincosf4_core.S
index 1a7d2733af..5daa5118d6 100644
--- a/sysdeps/x86_64/fpu/svml_s_sincosf4_core.S
+++ b/sysdeps/x86_64/fpu/svml_s_sincosf4_core.S
@@ -1,5 +1,5 @@
/* Function sincosf vectorized with SSE2.
- Copyright (C) 2014-2016 Free Software Foundation, Inc.
+ Copyright (C) 2014-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -16,13 +16,135 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-
#include <sysdep.h>
#include "svml_s_wrapper_impl.h"
.text
-ENTRY (_ZGVbN4vvv_sincosf)
+ENTRY (_ZGVbN4vl4l4_sincosf)
WRAPPER_IMPL_SSE2_fFF sincosf
+END (_ZGVbN4vl4l4_sincosf)
+libmvec_hidden_def (_ZGVbN4vl4l4_sincosf)
+
+/* SSE2 ISA version as wrapper to scalar (for vector
+ function declared with #pragma omp declare simd notinbranch). */
+.macro WRAPPER_IMPL_SSE2_fFF_vvv callee
+#ifndef __ILP32__
+ subq $120, %rsp
+ cfi_adjust_cfa_offset(120)
+ movaps %xmm0, 96(%rsp)
+ lea (%rsp), %rdi
+ movdqa %xmm1, 32(%rdi)
+ lea 16(%rsp), %rsi
+ movdqa %xmm2, 32(%rsi)
+ movdqa %xmm3, 48(%rsi)
+ movdqa %xmm4, 64(%rsi)
+ call JUMPTARGET(\callee)
+ movss 100(%rsp), %xmm0
+ lea 4(%rsp), %rdi
+ lea 20(%rsp), %rsi
+ call JUMPTARGET(\callee)
+ movss 104(%rsp), %xmm0
+ lea 8(%rsp), %rdi
+ lea 24(%rsp), %rsi
+ call JUMPTARGET(\callee)
+ movss 108(%rsp), %xmm0
+ lea 12(%rsp), %rdi
+ lea 28(%rsp), %rsi
+ call JUMPTARGET(\callee)
+ movq 32(%rsp), %rdx
+ movq 40(%rsp), %rsi
+ movq 48(%rsp), %r8
+ movq 56(%rsp), %r10
+ movl (%rsp), %eax
+ movl 4(%rsp), %ecx
+ movl 8(%rsp), %edi
+ movl 12(%rsp), %r9d
+ movl %eax, (%rdx)
+ movl %ecx, (%rsi)
+ movq 64(%rsp), %rax
+ movq 72(%rsp), %rcx
+ movl %edi, (%r8)
+ movl %r9d, (%r10)
+ movq 80(%rsp), %rdi
+ movq 88(%rsp), %r9
+ movl 16(%rsp), %r11d
+ movl 20(%rsp), %edx
+ movl 24(%rsp), %esi
+ movl 28(%rsp), %r8d
+ movl %r11d, (%rax)
+ movl %edx, (%rcx)
+ movl %esi, (%rdi)
+ movl %r8d, (%r9)
+ addq $120, %rsp
+ cfi_adjust_cfa_offset(-120)
+ ret
+#else
+ pushq %rbp
+ .cfi_def_cfa_offset 16
+ .cfi_offset 6, -16
+ pushq %rbx
+ .cfi_def_cfa_offset 24
+ .cfi_offset 3, -24
+ subl $88, %esp
+ .cfi_def_cfa_offset 112
+ leal 64(%rsp), %esi
+ movaps %xmm1, (%esp)
+ leal 48(%rsp), %edi
+ movaps %xmm2, 16(%esp)
+ movq %rsi, %rbp
+ movq %rdi, %rbx
+ movaps %xmm0, 32(%esp)
+ call JUMPTARGET(\callee)
+ movups 36(%esp), %xmm0
+ leal 4(%rbp), %esi
+ leal 4(%rbx), %edi
+ call JUMPTARGET(\callee)
+ movups 40(%esp), %xmm0
+ leal 8(%rbp), %esi
+ leal 8(%rbx), %edi
+ call JUMPTARGET(\callee)
+ movups 44(%esp), %xmm0
+ leal 12(%rbp), %esi
+ leal 12(%rbx), %edi
+ call JUMPTARGET(\callee)
+ movq (%esp), %rax
+ movss 48(%esp), %xmm0
+ movdqa (%esp), %xmm4
+ movdqa 16(%esp), %xmm7
+ movss %xmm0, (%eax)
+ movss 52(%esp), %xmm0
+ pextrd $1, %xmm4, %eax
+ movss %xmm0, (%eax)
+ movq 8(%esp), %rax
+ movss 56(%esp), %xmm0
+ movss %xmm0, (%eax)
+ movss 60(%esp), %xmm0
+ pextrd $3, %xmm4, %eax
+ movss %xmm0, (%eax)
+ movq 16(%esp), %rax
+ movss 64(%esp), %xmm0
+ movss %xmm0, (%eax)
+ movss 68(%esp), %xmm0
+ pextrd $1, %xmm7, %eax
+ movss %xmm0, (%eax)
+ movq 24(%esp), %rax
+ movss 72(%esp), %xmm0
+ movss %xmm0, (%eax)
+ movss 76(%esp), %xmm0
+ pextrd $3, %xmm7, %eax
+ movss %xmm0, (%eax)
+ addl $88, %esp
+ .cfi_def_cfa_offset 24
+ popq %rbx
+ .cfi_def_cfa_offset 16
+ popq %rbp
+ .cfi_def_cfa_offset 8
+ ret
+#endif
+.endm
+
+ENTRY (_ZGVbN4vvv_sincosf)
+WRAPPER_IMPL_SSE2_fFF_vvv sincosf
END (_ZGVbN4vvv_sincosf)
#ifndef USE_MULTIARCH