summary refs log tree commit diff
path: root/sysdeps/powerpc/powerpc64/multiarch/memcpy-power8-cached.S
diff options
context:
space:
mode:
Diffstat (limited to 'sysdeps/powerpc/powerpc64/multiarch/memcpy-power8-cached.S')
-rw-r--r-- sysdeps/powerpc/powerpc64/multiarch/memcpy-power8-cached.S 176
1 files changed, 176 insertions, 0 deletions
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memcpy-power8-cached.S b/sysdeps/powerpc/powerpc64/multiarch/memcpy-power8-cached.S
new file mode 100644
index 0000000000..6b69e57212
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/memcpy-power8-cached.S
@@ -0,0 +1,176 @@
+/* Optimized memcpy implementation for cached memory on PowerPC64/POWER8.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+
+/* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
+ Returns 'dst'. */
+
+ .machine power8
+ENTRY_TOCLESS (__memcpy_power8_cached, 5)
+ CALL_MCOUNT 3
+ /* r3 = dst, r4 = src, r5 = len. Dispatches on len: <16, 16..32, 33..64, >64. No instruction below writes r3. */
+ cmpldi cr7,r5,15 /* len > 15 (unsigned)? */
+ bgt cr7,L(ge_16)
+ andi. r9,r5,0x1 /* len < 16: copy 1/2/4/8-byte pieces selected by the bits of len. */
+ mr r9,r3 /* r9 = dst cursor; the beq below still consumes cr0 from the andi. */
+ beq cr0,1f
+ lbz r10,0(r4) /* Bit 0 set: copy 1 byte. */
+ addi r9,r3,1
+ addi r4,r4,1
+ stb r10,0(r3)
+1:
+ andi. r10,r5,0x2
+ beq cr0,2f
+ lhz r10,0(r4) /* Bit 1 set: copy 2 bytes. */
+ addi r9,r9,2
+ addi r4,r4,2
+ sth r10,-2(r9) /* r9 was already advanced, hence the -2. */
+2:
+ andi. r10,r5,0x4
+ beq cr0,3f
+ lwz r10,0(r4) /* Bit 2 set: copy 4 bytes. */
+ addi r9,r9,4
+ addi r4,r4,4
+ stw r10,-4(r9) /* r9 was already advanced, hence the -4. */
+3:
+ andi. r10,r5,0x8
+ beqlr cr0 /* Bit 3 clear: nothing left, return. */
+ ld r10,0(r4) /* Bit 3 set: copy the final 8 bytes. */
+ std r10,0(r9)
+ blr
+
+ .align 4
+L(ge_16):
+ cmpldi cr7,r5,32
+ ble cr7,L(ge_16_le_32) /* 16 <= len <= 32. */
+ cmpldi cr7,r5,64
+ ble cr7,L(gt_32_le_64) /* 33 <= len <= 64. */
+ /* len > 64. Align dst to 16 bytes: copy 16 bytes unconditionally, then move
+ both cursors forward by only 16 - (dst & 15); later stores re-cover the overlap. */
+ andi. r9,r3,0xf /* r9 = dst & 15. */
+ mr r12,r3 /* r12 = dst cursor for the rest of the copy. */
+ beq cr0,L(dst_is_align_16)
+ lxvd2x v0,0,r4 /* First 16 src bytes. NOTE(review): vN presumably resolves to VSX register N via <sysdep.h>; confirm. */
+ subfic r12,r9,16 /* r12 = 16 - (dst & 15). */
+ subf r5,r12,r5 /* r5 -= r12. */
+ add r4,r4,r12
+ add r12,r3,r12 /* r12 = dst rounded up to a 16-byte boundary. */
+ stxvd2x v0,0,r3
+L(dst_is_align_16):
+ cmpldi cr7,r5,127
+ ble cr7,L(tail_copy) /* Fewer than 128 bytes left: skip the loop. */
+ mr r9,r12 /* r9 = store cursor used inside the loop. */
+ srdi r10,r5,7 /* r10 = r5 / 128 = iteration count. */
+ li r11,16 /* r11, r6, r7 = index offsets 16, 32, 48. */
+ li r6,32
+ li r7,48
+ mtctr r10
+ clrrdi r0,r5,7 /* r0 = r5 with the low 7 bits cleared = bytes the loop copies. */
+
+ /* Main loop, copy 128 bytes each time. */
+ .align 4
+L(copy_128):
+ lxvd2x v10,0,r4 /* Load bytes 0..63 of this 128-byte chunk. */
+ lxvd2x v11,r4,r11
+ addi r8,r4,64 /* r8 = src of the second 64-byte half. */
+ addi r10,r9,64 /* r10 = dst of the second 64-byte half. */
+ lxvd2x v12,r4,r6
+ lxvd2x v0,r4,r7
+ addi r4,r4,128
+ stxvd2x v10,0,r9 /* Store bytes 0..63. */
+ stxvd2x v11,r9,r11
+ stxvd2x v12,r9,r6
+ stxvd2x v0,r9,r7
+ addi r9,r9,128
+ lxvd2x v10,0,r8 /* Load bytes 64..127. */
+ lxvd2x v11,r8,r11
+ lxvd2x v12,r8,r6
+ lxvd2x v0,r8,r7
+ stxvd2x v10,0,r10 /* Store bytes 64..127. */
+ stxvd2x v11,r10,r11
+ stxvd2x v12,r10,r6
+ stxvd2x v0,r10,r7
+ bdnz L(copy_128)
+
+ add r12,r12,r0 /* Move the dst cursor past the bytes the loop copied. */
+ rldicl r5,r5,0,57 /* r5 &= 127: bytes still to copy. */
+L(tail_copy):
+ cmpldi cr7,r5,63
+ ble cr7,L(tail_le_64) /* Fewer than 64 bytes left. */
+ li r8,16 /* 64..127 bytes left: copy one 64-byte block. */
+ li r10,32
+ lxvd2x v10,0,r4
+ li r9,48
+ addi r5,r5,-64
+ lxvd2x v11,r4,r8
+ lxvd2x v12,r4,r10
+ lxvd2x v0,r4,r9
+ addi r4,r4,64
+ stxvd2x v10,0,r12
+ stxvd2x v11,r12,r8
+ stxvd2x v12,r12,r10
+ stxvd2x v0,r12,r9
+ addi r12,r12,64
+
+L(tail_le_64):
+ cmpldi cr7,r5,32
+ bgt cr7,L(tail_gt_32_le_64) /* 33..63 bytes left. */
+ cmpdi cr7,r5,0
+ beqlr cr7 /* Nothing left: return. */
+ addi r5,r5,-32 /* 1..32 bytes left. r5 = remaining - 32 (<= 0): offset of the last 32 bytes. */
+ li r9,16
+ add r8,r4,r5
+ add r10,r12,r5
+ lxvd2x v12,r4,r5 /* Copy the 32 bytes ending at the end of the buffers; may re-copy earlier bytes. */
+ lxvd2x v0,r8,r9
+ stxvd2x v12,r12,r5
+ stxvd2x v0,r10,r9
+ blr
+
+ .align 4
+L(ge_16_le_32):
+ addi r5,r5,-16 /* r5 = len - 16: offset of the last 16 bytes. */
+ lxvd2x v0,0,r4 /* First 16 bytes. */
+ lxvd2x v1,r4,r5 /* Last 16 bytes (overlaps the first 16 when len < 32). */
+ stxvd2x v0,0,r3
+ stxvd2x v1,r3,r5
+ blr
+
+ .align 4
+L(gt_32_le_64):
+ mr r12,r3 /* r12 = dst; execution falls through into the shared copy below. */
+
+ .align 4
+L(tail_gt_32_le_64):
+ li r9,16 /* In: r4 = src, r12 = dst, r5 = 33..64 bytes to copy. */
+ lxvd2x v0,0,r4 /* First 32 bytes -> v0, v1. */
+ addi r5,r5,-32 /* r5 = offset of the last 32 bytes. */
+ lxvd2x v1,r4,r9
+ add r8,r4,r5
+ lxvd2x v2,r4,r5 /* Last 32 bytes -> v2, v3 (overlaps the first 32 when r5 < 32). */
+ add r10,r12,r5
+ lxvd2x v3,r8,r9
+ stxvd2x v0,0,r12 /* All four loads are issued before any store. */
+ stxvd2x v1,r12,r9
+ stxvd2x v2,r12,r5
+ stxvd2x v3,r10,r9
+ blr
+
+END_GEN_TB (__memcpy_power8_cached,TB_TOCLESS)