summaryrefslogtreecommitdiff
path: root/sysdeps/powerpc/powerpc64/power8/strncpy.S
diff options
context:
space:
mode:
authorSamuel Thibault <samuel.thibault@ens-lyon.org>2016-08-20 19:50:45 +0200
committerSamuel Thibault <samuel.thibault@ens-lyon.org>2016-08-20 19:50:45 +0200
commit4dd9e35bfd35d3138bc44169baba098005bad51e (patch)
treea4939c43a9c3fe00eb27f023e14acc5e1fe8808c /sysdeps/powerpc/powerpc64/power8/strncpy.S
parentbd42a4599d1b6f77bcfe1e4f67b7cbd9e1cb2dfd (diff)
parentf76453c31593957fec1a99b986bfa5506618b79c (diff)
Merge commit 'refs/top-bases/t/bigmem' into t/bigmem
Diffstat (limited to 'sysdeps/powerpc/powerpc64/power8/strncpy.S')
-rw-r--r--sysdeps/powerpc/powerpc64/power8/strncpy.S424
1 files changed, 424 insertions, 0 deletions
diff --git a/sysdeps/powerpc/powerpc64/power8/strncpy.S b/sysdeps/powerpc/powerpc64/power8/strncpy.S
new file mode 100644
index 0000000000..5fda953526
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strncpy.S
@@ -0,0 +1,424 @@
+/* Optimized strncpy/stpncpy implementation for PowerPC64/POWER8.
+ Copyright (C) 2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* FUNC_NAME is the symbol defined by this file: strncpy by default,
+ __stpncpy when the file is built with USE_AS_STPNCPY defined. */
+#ifndef USE_AS_STPNCPY
+# define FUNC_NAME strncpy
+#else
+# define FUNC_NAME __stpncpy
+#endif
+
+/* Implements the function
+
+ char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
+
+ or
+
+ char * [r3] stpncpy (char *dest [r3], const char *src [r4], size_t n [r5])
+
+ if USE_AS_STPNCPY is defined.
+
+ The implementation uses unaligned doubleword accesses to avoid specialized
+ code paths depending on data alignment. Although recent powerpc64 uses
+ 64K pages by default, the page cross handling assumes a minimum page size
+ of 4K. */
+
+ .machine power7
+EALIGN (FUNC_NAME, 4, 0)
+
+ /* Check whether [src] and [src]+16 lie in different 4K pages by
+ comparing the 4096 bit of both addresses. Basically:
+
+ uint64_t srcin = (uint64_t)src;
+ uint64_t ob = srcin & 4096UL;
+ uint64_t nb = (srcin+16UL) & 4096UL;
+ if (ob == nb)
+ goto unaligned_lt_16; */
+
+ addi r10,r4,16 /* r10 = src + 16. */
+ rlwinm r9,r4,0,19,19 /* r9 = src & 4096. */
+
+ /* Since it is a leaf function, save some non-volatile registers on the
+ protected/red zone. */
+ std r26,-48(r1)
+ std r27,-40(r1)
+
+ rlwinm r8,r10,0,19,19 /* r8 = (src + 16) & 4096. */
+
+ std r28,-32(r1)
+ std r29,-24(r1)
+
+ cmpld cr7,r9,r8 /* Result goes to CR field 7. */
+
+ std r30,-16(r1)
+ std r31,-8(r1)
+
+ beq cr7,L(unaligned_lt_16) /* Same 4096 bit: take the fast path. */
+
+ /* The 4096 bit differs. If the whole request fits in what is left of
+ the doubleword holding [src], copy it byte by byte; otherwise use
+ the page cross code. */
+ rldicl r9,r4,0,61 /* r9 = src & 7. */
+ subfic r8,r9,8 /* r8 = 8 - (src & 7). */
+ cmpld cr7,r5,r8
+ bgt cr7,L(pagecross) /* n > r8. */
+
+ /* At this point there are 1 to 15 bytes to check and write. Since they
+ could come either from the first unaligned 16 bytes access or from the
+ bulk copy, the code uses an unrolled byte read/write instead of trying
+ to analyze the cmpb results.
+
+ L(short_path_1) and L(short_path_2) expect r4 = source, r9 = destination
+ and r5 = number of bytes left. */
+L(short_path):
+ mr r9,r3 /* r9 = dest. */
+L(short_path_1):
+ cmpdi cr7,r5,0
+ beq cr7,L(short_path_loop_end_1) /* Nothing left to copy. */
+L(short_path_2):
+ lbz r10,0(r4)
+ cmpdi cr7,r10,0
+ stb r10,0(r9)
+ beq cr7,L(zero_pad_start_1) /* Null byte copied: pad from r9. */
+ cmpdi cr0,r5,1
+ addi r8,r9,1
+ addi r6,r5,-1
+ beq cr0,L(short_path_loop_end_0) /* That was the last byte. */
+ lbz r10,1(r4)
+ cmpdi cr7,r10,0
+ stb r10,1(r9)
+ beq cr7,L(zero_pad_start_prepare_1)
+ addi r10,r5,-3
+ b L(short_path_loop_1)
+
+ /* Two bytes per iteration. When the loop body runs, cr7 still holds
+ the comparison of r10 against zero done in L(short_path_loop_1). */
+ .align 4
+L(short_path_loop):
+ lbz r8,0(r4)
+ addi r7,r10,-2
+ cmpdi cr5,r8,0
+ stb r8,0(r9)
+ beq cr5,L(zero_pad_start_1)
+ beq cr7,L(short_path_loop_end_0) /* r10 was zero: count exhausted. */
+ lbz r8,1(r4)
+ cmpdi cr7,r8,0
+ stb r8,1(r9)
+ beq cr7,L(zero_pad_start) /* Pad from r6 for r10 bytes. */
+ mr r10,r7
+L(short_path_loop_1):
+ addic. r5,r5,-2 /* Sets cr0 for the bne below. */
+ addi r9,r9,2
+ cmpdi cr7,r10,0
+ addi r4,r4,2
+ addi r6,r9,1
+ bne cr0,L(short_path_loop)
+#ifdef USE_AS_STPNCPY
+ mr r3,r9
+ b L(short_path_loop_end)
+#endif
+
+L(short_path_loop_end_0):
+#ifdef USE_AS_STPNCPY
+ addi r3,r9,1 /* Return the address after the byte at 0(r9). */
+ b L(short_path_loop_end)
+#endif
+L(short_path_loop_end_1):
+#ifdef USE_AS_STPNCPY
+ mr r3,r9
+#endif
+L(short_path_loop_end):
+ /* Restore non-volatile registers. */
+ ld r26,-48(r1)
+ ld r27,-40(r1)
+ ld r28,-32(r1)
+ ld r29,-24(r1)
+ ld r30,-16(r1)
+ ld r31,-8(r1)
+ blr
+
+ /* This code pads the remainder of dest with null bytes. The algorithm
+ calculates the remaining size and issues an unrolled doubleword loop
+ followed by word, halfword and byte stores for the tail.
+
+ L(zero_pad_start_1) expects r9 = first address to clear and r5 = number
+ of bytes to clear. */
+ .align 4
+L(zero_pad_start):
+ mr r5,r10
+ mr r9,r6
+L(zero_pad_start_1):
+ srdi. r8,r5,3 /* r8 = number of whole doublewords. */
+ mr r10,r9
+#ifdef USE_AS_STPNCPY
+ mr r3,r9 /* Return the address where padding starts. */
+#endif
+ beq- cr0,L(zero_pad_loop_b_start) /* Fewer than 8 bytes to clear. */
+ cmpldi cr7,r8,1
+ li r7,0
+ std r7,0(r9)
+ beq cr7,L(zero_pad_loop_b_prepare) /* Exactly one doubleword. */
+ addic. r8,r8,-2
+ addi r10,r9,16
+ std r7,8(r9)
+ beq cr0,L(zero_pad_loop_dw_2) /* Exactly two doublewords. */
+ std r7,16(r9)
+ li r9,0 /* From here on r9 is the zero source. */
+ b L(zero_pad_loop_dw_1)
+
+ .align 4
+L(zero_pad_loop_dw):
+ addi r10,r10,16
+ std r9,-8(r10)
+ beq cr0,L(zero_pad_loop_dw_2)
+ std r9,0(r10)
+L(zero_pad_loop_dw_1):
+ cmpldi cr7,r8,1
+ std r9,0(r10)
+ addic. r8,r8,-2
+ bne cr7,L(zero_pad_loop_dw)
+ addi r10,r10,8
+L(zero_pad_loop_dw_2):
+ rldicl r5,r5,0,61 /* r5 = bytes left after the doublewords. */
+L(zero_pad_loop_b_start):
+ cmpdi cr7,r5,0
+ addi r5,r5,-1
+ addi r9,r10,-1
+ add r10,r10,r5
+ subf r10,r9,r10 /* r10 = value r5 had at this label. */
+ li r8,0
+ beq- cr7,L(short_path_loop_end) /* No tail bytes. */
+
+ /* Write the remaining 1-7 bytes. The low bits of the length are moved
+ to cr7 so each bit selects one store size. */
+ .align 4
+ addi r9,r9,1 /* r9 = first tail address. */
+ mtocrf 0x1,r10
+ bf 29,4f /* Length bit with value 4. */
+ stw r8,0(r9)
+ addi r9,r9,4
+
+ .align 4
+4: bf 30,2f /* Length bit with value 2. */
+ sth r8,0(r9)
+ addi r9,r9,2
+
+ .align 4
+2: bf 31,1f /* Length bit with value 1. */
+ stb r8,0(r9)
+
+ /* Restore non-volatile registers. */
+1: ld r26,-48(r1)
+ ld r27,-40(r1)
+ ld r28,-32(r1)
+ ld r29,-24(r1)
+ ld r30,-16(r1)
+ ld r31,-8(r1)
+ blr
+
+ /* The common case where [src]+16 will not cross a 4K page boundary.
+ In this case the code quickly checks the first 16 bytes using
+ doubleword reads/compares and updates the destination if neither the
+ total size is reached nor a null byte is found in the data read. */
+ .align 4
+L(unaligned_lt_16):
+ cmpldi cr7,r5,7
+ ble cr7,L(short_path) /* n <= 7: copy byte by byte. */
+ ld r7,0(r4)
+ li r8,0
+ cmpb r8,r7,r8 /* Compare each byte against zero. */
+ cmpdi cr7,r8,0
+ bne cr7,L(short_path_prepare_2) /* Null byte in bytes 0-7. */
+ addi r6,r5,-8 /* r6 = n - 8. */
+ std r7,0(r3)
+ addi r9,r3,8 /* r9 = dest + 8. */
+ cmpldi cr7,r6,7
+ addi r7,r4,8 /* r7 = src + 8. */
+ ble cr7,L(short_path_prepare_1_1)
+ ld r4,8(r4) /* r4 now holds data, not the source
+ address. */
+ cmpb r8,r4,r8 /* r8 is zero here. */
+ cmpdi cr7,r8,0
+ bne cr7,L(short_path_prepare_2_1) /* Null byte in bytes 8-15. */
+ std r4,8(r3)
+ addi r29,r3,16
+ addi r5,r5,-16
+ /* Neither the null byte was found nor the total length was reached,
+ align to 16 bytes and issue a bulk copy/compare. */
+ b L(align_to_16b)
+
+ /* In the case of a 4K page boundary cross, the algorithm first aligns
+ the address, calculates a mask based on the alignment to ignore the
+ leading bytes and continues using doubleword accesses.
+ On entry r9 = src & 7 and r8 = 8 - (src & 7). */
+ .align 4
+L(pagecross):
+ rldicr r11,r4,0,59 /* Clear the low four address bits.
+ NOTE(review): this was described as
+ aligning to 8 bytes, but a mask end
+ of 59 aligns to 16 -- confirm intent. */
+ li r6,-1 /* MASK = 0xffffffffffffffffUL. */
+ sldi r9,r9,3 /* Calculate padding. */
+ ld r7,0(r11) /* Load doubleword from memory. */
+#ifdef __LITTLE_ENDIAN__
+ sld r9,r6,r9 /* MASK = MASK << padding. */
+#else
+ srd r9,r6,r9 /* MASK = MASK >> padding. */
+#endif
+ orc r9,r7,r9 /* Mask bits that are not part of the
+ string. */
+ li r7,0
+ cmpb r9,r9,r7 /* Check for null bytes in DWORD1. */
+ cmpdi cr7,r9,0
+ bne cr7,L(short_path_prepare_2)
+ subf r8,r8,r5 /* Adjust total length. */
+ cmpldi cr7,r8,8 /* Check if length was reached. */
+ ble cr7,L(short_path_prepare_2)
+
+ /* For the next checks the address is aligned, so check three more
+ doublewords to make sure 16 unaligned bytes can be read to start
+ the bulk copy with 16-byte aligned addresses. */
+ ld r7,8(r11)
+ cmpb r9,r7,r9 /* r9 is zero here. */
+ cmpdi cr7,r9,0
+ bne cr7,L(short_path_prepare_2)
+ addi r7,r8,-8
+ cmpldi cr7,r7,8
+ ble cr7,L(short_path_prepare_2)
+ ld r7,16(r11)
+ cmpb r9,r7,r9
+ cmpdi cr7,r9,0
+ bne cr7,L(short_path_prepare_2)
+ addi r8,r8,-16
+ cmpldi cr7,r8,8
+ ble cr7,L(short_path_prepare_2)
+ ld r8,24(r11)
+ cmpb r9,r8,r9
+ cmpdi cr7,r9,0
+ bne cr7,L(short_path_prepare_2)
+
+ /* No null byte found in the 32 bytes read and length not reached,
+ read the source again using unaligned loads and store them. */
+ ld r9,0(r4)
+ addi r29,r3,16
+ addi r5,r5,-16
+ std r9,0(r3)
+ ld r9,8(r4)
+ std r9,8(r3)
+
+ /* Align source to 16 bytes and adjust destination and size. Both
+ paths that reach this label have already copied 16 bytes and arrive
+ with r10 = src + 16, r29 = dest + 16 and r5 = n - 16. */
+L(align_to_16b):
+ rldicl r9,r10,0,60 /* r9 = (src + 16) & 15. */
+ rldicr r28,r10,0,59 /* r28 = src + 16 rounded down to 16. */
+ add r12,r5,r9 /* r12 = bytes left counted from r28. */
+ subf r29,r9,r29 /* r29 = destination matching r28. */
+
+ /* The bulk read/compare/copy loads two doublewords, compares and merges
+ them in a single register for speed. This is an attempt to speed up
+ the null-checking process for bigger strings. */
+
+ cmpldi cr7,r12,15
+ ble cr7,L(short_path_prepare_1_2)
+
+ /* Main loop for large sizes, unrolled 2 times to get better use of
+ pipeline. */
+ ld r8,0(r28)
+ ld r10,8(r28)
+ li r9,0
+ cmpb r7,r8,r9
+ cmpb r9,r10,r9
+ or. r6,r9,r7 /* Non-zero if any of the 16 bytes is null. */
+ bne cr0,L(short_path_prepare_2_3)
+ addi r5,r12,-16
+ addi r4,r28,16
+ std r8,0(r29)
+ std r10,8(r29)
+ cmpldi cr7,r5,15
+ addi r9,r29,16
+ ble cr7,L(short_path_1)
+ mr r11,r28
+ mr r6,r29
+ li r30,0 /* r30 = zero used by the cmpb checks. */
+ subfic r26,r4,48
+ subfic r27,r9,48
+
+ b L(loop_16b)
+
+ /* First half of the unrolled loop: 16 bytes at 0(r11), stored at
+ 0(r6). r7 is zero here because the previous half found no null
+ byte. */
+ .align 4
+L(loop_start):
+ ld r31,0(r11)
+ ld r10,8(r11)
+ cmpb r0,r31,r7
+ cmpb r8,r10,r7
+ or. r7,r0,r8
+ addi r5,r5,-32
+ cmpldi cr7,r5,15
+ add r4,r4,r26
+ add r9,r9,r27
+ bne cr0,L(short_path_prepare_2_2)
+ add r4,r28,r4
+ std r31,0(r6)
+ add r9,r29,r9
+ std r10,8(r6)
+ ble cr7,L(short_path_1)
+
+ /* Second half of the unrolled loop: 16 bytes at 16(r11), stored at
+ 16(r6). r4, r9 and r5 describe this position for the short path. */
+L(loop_16b):
+ ld r10,16(r11)
+ ld r0,24(r11)
+ cmpb r8,r10,r30
+ cmpb r7,r0,r30
+ or. r7,r8,r7
+ addi r12,r12,-32
+ cmpldi cr7,r12,15
+ addi r11,r11,32
+ bne cr0,L(short_path_2)
+ std r10,16(r6)
+ addi r6,r6,32
+ std r0,-8(r6)
+ bgt cr7,L(loop_start)
+
+ /* 15 or fewer bytes left: finish them in the short path. */
+ mr r5,r12
+ mr r4,r11
+ mr r9,r6
+ b L(short_path_1)
+
+ /* Register fix-up stubs. Each one moves the current source, destination
+ and/or length into the registers the target label reads, then
+ branches to it. */
+ .align 4
+L(short_path_prepare_1_1):
+ mr r4,r7
+ mr r5,r6
+ b L(short_path_1)
+L(short_path_prepare_1_2):
+ mr r9,r29
+ mr r4,r28
+ mr r5,r12
+ b L(short_path_1)
+L(short_path_prepare_2):
+ mr r9,r3 /* Destination is still dest. */
+ b L(short_path_2)
+L(short_path_prepare_2_1):
+ mr r4,r7
+ mr r5,r6
+ b L(short_path_2)
+L(short_path_prepare_2_2):
+ mr r9,r6
+ mr r4,r11
+ mr r5,r12
+ b L(short_path_2)
+L(short_path_prepare_2_3):
+ mr r9,r29
+ mr r4,r28
+ mr r5,r12
+ b L(short_path_2)
+L(zero_pad_loop_b_prepare):
+ rldicl r5,r5,0,61 /* Keep only the low three length bits. */
+ addi r10,r9,8
+ b L(zero_pad_loop_b_start)
+L(zero_pad_start_prepare_1):
+ mr r9,r8
+ mr r5,r6
+ b L(zero_pad_start_1)
+END (FUNC_NAME)
+
+/* Emit the hidden definition that matches the symbol built above. */
+#ifndef USE_AS_STPNCPY
+libc_hidden_builtin_def (strncpy)
+#else
+libc_hidden_def (__stpncpy)
+#endif