author     Ulrich Drepper <drepper@redhat.com>   2009-07-16 07:15:15 -0700
committer  Ulrich Drepper <drepper@redhat.com>   2009-07-16 07:15:15 -0700
commit     c8027cced1d3e7803c440cb13d4294754d8791e2 (patch)
tree       33bcc93dc74b635aa5e821e617a98503776beb34
parent     24a12a5a5f7ea63bc349f219b9fbb722c009a719 (diff)
Optimize restoring of ymm registers on x86-64.
The patch mainly reduces the code size but also avoids some jumps.
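
In outline, the change hoists the unconditional movaps restores of xmm0-xmm7 above the HAVE_AVX_SUPPORT check, so the AVX and non-AVX paths share a single restore block. The AVX path then only compares each saved slot against its shadow copy, using %xmm8 as a scratch register instead of clobbering the registers being compared, and reloads the full 256-bit value from the vector save area when the two copies differ; the duplicated movaps block, the per-register vmovdqa loads and the trailing "jmp 1f" all go away. The same restructuring is applied to the xmm0/xmm1 return-value path. Below is a minimal C sketch of the resulting control flow; the names (xmm_slot, xmm_copy, vector_slot, have_avx, restore_vector_regs) are illustrative stand-ins for the LR_*_OFFSET save areas, not the actual layout used by _dl_runtime_profile.

#include <string.h>

#define NREGS       8
#define XMM_SIZE    16
#define VECTOR_SIZE 32

/* Hypothetical stand-ins for the LR_XMM_OFFSET, LR_SIZE and
   LR_VECTOR_OFFSET areas in the trampoline's stack frame.  */
struct save_area
{
  unsigned char xmm_slot[NREGS][XMM_SIZE];       /* values restored into xmm0-xmm7 */
  unsigned char xmm_copy[NREGS][XMM_SIZE];       /* shadow copies used for the comparison */
  unsigned char vector_slot[NREGS][VECTOR_SIZE]; /* full 256-bit saves for ymm0-ymm7 */
};

static void
restore_vector_regs (unsigned char regs[NREGS][VECTOR_SIZE],
                     const struct save_area *sa, int have_avx)
{
  /* New layout: one unconditional 128-bit restore shared by both
     paths (the movaps block now placed before the AVX check).  */
  for (int i = 0; i < NREGS; i++)
    memcpy (regs[i], sa->xmm_slot[i], XMM_SIZE);

  if (!have_avx)
    return;   /* L(no_avx2) simply falls through to the 1: label.  */

  /* AVX path: compare each slot against its copy (the vpcmpeqq/
     vpmovmskb pair, now using %xmm8 as scratch) and reload the full
     ymm value only when the two differ.  */
  for (int i = 0; i < NREGS; i++)
    if (memcmp (sa->xmm_slot[i], sa->xmm_copy[i], XMM_SIZE) != 0)
      memcpy (regs[i], sa->vector_slot[i], VECTOR_SIZE);
}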
-rw-r--r--   ChangeLog                         5
-rw-r--r--   sysdeps/x86_64/dl-trampoline.S   77
2 files changed, 39 insertions, 43 deletions
diff --git a/ChangeLog b/ChangeLog
index 87db19e000..1bfdd7b56d 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2009-07-16 Ulrich Drepper <drepper@redhat.com>
+
+ * sysdeps/x86_64/dl-trampoline.S (_dl_runtime_profile): Optimize
+ restoring of ymm registers a bit.
+
2009-07-15 H.J. Lu <hongjiu.lu@intel.com>
* sysdeps/x86_64/memcmp.S: New file.
diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
index 7f20491130..49d239f075 100644
--- a/sysdeps/x86_64/dl-trampoline.S
+++ b/sysdeps/x86_64/dl-trampoline.S
@@ -185,81 +185,73 @@ L(no_avx1):
movq LR_R8_OFFSET(%rsp), %r8
movq LR_R9_OFFSET(%rsp), %r9
+ movaps (LR_XMM_OFFSET)(%rsp), %xmm0
+ movaps (LR_XMM_OFFSET + XMM_SIZE)(%rsp), %xmm1
+ movaps (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp), %xmm2
+ movaps (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp), %xmm3
+ movaps (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp), %xmm4
+ movaps (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp), %xmm5
+ movaps (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm6
+ movaps (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm7
+
# ifdef HAVE_AVX_SUPPORT
cmpl $0, L(have_avx)(%rip)
js L(no_avx2)
/* Check if any xmm0-xmm7 registers are changed by audit
module. */
- vmovdqa (LR_XMM_OFFSET)(%rsp), %xmm0
- vpcmpeqq (LR_SIZE)(%rsp), %xmm0, %xmm1
- vpmovmskb %xmm1, %esi
+ vpcmpeqq (LR_SIZE)(%rsp), %xmm0, %xmm8
+ vpmovmskb %xmm8, %esi
cmpl $0xffff, %esi
je 1f
vmovdqu (LR_VECTOR_OFFSET)(%rsp), %ymm0
-1: vmovdqa (LR_XMM_OFFSET + XMM_SIZE)(%rsp), %xmm1
- vpcmpeqq (LR_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm2
- vpmovmskb %xmm2, %esi
+1: vpcmpeqq (LR_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm8
+ vpmovmskb %xmm8, %esi
cmpl $0xffff, %esi
je 1f
vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp), %ymm1
-1: vmovdqa (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp), %xmm2
- vpcmpeqq (LR_SIZE + XMM_SIZE*2)(%rsp), %xmm2, %xmm3
- vpmovmskb %xmm3, %esi
+1: vpcmpeqq (LR_SIZE + XMM_SIZE*2)(%rsp), %xmm2, %xmm8
+ vpmovmskb %xmm8, %esi
cmpl $0xffff, %esi
je 1f
vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp), %ymm2
-1: vmovdqa (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp), %xmm3
- vpcmpeqq (LR_SIZE + XMM_SIZE*3)(%rsp), %xmm3, %xmm4
- vpmovmskb %xmm4, %esi
+1: vpcmpeqq (LR_SIZE + XMM_SIZE*3)(%rsp), %xmm3, %xmm8
+ vpmovmskb %xmm8, %esi
cmpl $0xffff, %esi
je 1f
vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp), %ymm3
-1: vmovdqa (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp), %xmm4
- vpcmpeqq (LR_SIZE + XMM_SIZE*4)(%rsp), %xmm4, %xmm5
- vpmovmskb %xmm5, %esi
+1: vpcmpeqq (LR_SIZE + XMM_SIZE*4)(%rsp), %xmm4, %xmm8
+ vpmovmskb %xmm8, %esi
cmpl $0xffff, %esi
je 1f
vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp), %ymm4
-1: vmovdqa (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp), %xmm5
- vpcmpeqq (LR_SIZE + XMM_SIZE*5)(%rsp), %xmm5, %xmm6
- vpmovmskb %xmm6, %esi
+1: vpcmpeqq (LR_SIZE + XMM_SIZE*5)(%rsp), %xmm5, %xmm8
+ vpmovmskb %xmm8, %esi
cmpl $0xffff, %esi
je 1f
vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp), %ymm5
-1: vmovdqa (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm6
- vpcmpeqq (LR_SIZE + XMM_SIZE*6)(%rsp), %xmm6, %xmm7
- vpmovmskb %xmm7, %esi
+1: vpcmpeqq (LR_SIZE + XMM_SIZE*6)(%rsp), %xmm6, %xmm8
+ vpmovmskb %xmm8, %esi
cmpl $0xffff, %esi
je 1f
vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp), %ymm6
-1: vmovdqa (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm7
- vpcmpeqq (LR_SIZE + XMM_SIZE*7)(%rsp), %xmm7, %xmm8
+1: vpcmpeqq (LR_SIZE + XMM_SIZE*7)(%rsp), %xmm7, %xmm8
vpmovmskb %xmm8, %esi
cmpl $0xffff, %esi
je 1f
vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp), %ymm7
- jmp 1f
L(no_avx2):
+1:
# endif
- movaps (LR_XMM_OFFSET)(%rsp), %xmm0
- movaps (LR_XMM_OFFSET + XMM_SIZE)(%rsp), %xmm1
- movaps (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp), %xmm2
- movaps (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp), %xmm3
- movaps (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp), %xmm4
- movaps (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp), %xmm5
- movaps (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm6
- movaps (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm7
-
-1: movq 16(%rbx), %r10 # Anything in framesize?
+ movq 16(%rbx), %r10 # Anything in framesize?
testq %r10, %r10
jns 3f
@@ -358,32 +350,31 @@ L(no_avx3):
movq LRV_RAX_OFFSET(%rsp), %rax
movq LRV_RDX_OFFSET(%rsp), %rdx
+ movaps LRV_XMM0_OFFSET(%rsp), %xmm0
+ movaps LRV_XMM1_OFFSET(%rsp), %xmm1
+
# ifdef HAVE_AVX_SUPPORT
cmpl $0, L(have_avx)(%rip)
js L(no_avx4)
/* Check if xmm0/xmm1 registers are changed by audit module. */
- vmovdqa LRV_XMM0_OFFSET(%rsp), %xmm0
- vpcmpeqq (LRV_SIZE)(%rsp), %xmm0, %xmm1
- vpmovmskb %xmm1, %esi
+ vpcmpeqq (LRV_SIZE)(%rsp), %xmm0, %xmm2
+ vpmovmskb %xmm2, %esi
cmpl $0xffff, %esi
je 1f
vmovdqu LRV_VECTOR0_OFFSET(%rsp), %ymm0
-1: vmovdqa LRV_XMM1_OFFSET(%rsp), %xmm1
- vpcmpeqq (LRV_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm2
+1: vpcmpeqq (LRV_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm2
vpmovmskb %xmm2, %esi
cmpl $0xffff, %esi
je 1f
vmovdqu LRV_VECTOR1_OFFSET(%rsp), %ymm1
- jmp 1f
L(no_avx4):
+1:
# endif
- movaps LRV_XMM0_OFFSET(%rsp), %xmm0
- movaps LRV_XMM1_OFFSET(%rsp), %xmm1
-1: fldt LRV_ST1_OFFSET(%rsp)
+ fldt LRV_ST1_OFFSET(%rsp)
fldt LRV_ST0_OFFSET(%rsp)
movq %rbx, %rsp