diff options
Diffstat (limited to 'sysdeps/powerpc/powerpc64/power8/strncpy.S')
-rw-r--r-- | sysdeps/powerpc/powerpc64/power8/strncpy.S | 424 |
1 files changed, 424 insertions, 0 deletions
diff --git a/sysdeps/powerpc/powerpc64/power8/strncpy.S b/sysdeps/powerpc/powerpc64/power8/strncpy.S new file mode 100644 index 0000000000..5fda953526 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/power8/strncpy.S @@ -0,0 +1,424 @@ +/* Optimized strncpy/stpncpy implementation for PowerPC64/POWER8. + Copyright (C) 2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#ifdef USE_AS_STPNCPY +# define FUNC_NAME __stpncpy +#else +# define FUNC_NAME strncpy +#endif + +/* Implements the function + + char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5]) + + or + + char * [r3] stpncpy (char *dest [r3], const char *src [r4], size_t n [r5]) + + if USE_AS_STPCPY is defined. + + The implementation uses unaligned doubleword access to avoid specialized + code paths depending of data alignment. Although recent powerpc64 uses + 64K as default, the page cross handling assumes minimum page size of + 4k. */ + + .machine power7 +EALIGN (FUNC_NAME, 4, 0) + + /* Check if the [src]+15 will cross a 4K page by checking if the bit + indicating the page size changes. Basically: + + uint64_t srcin = (uint64_t)src; + uint64_t ob = srcin & 4096UL; + uint64_t nb = (srcin+15UL) & 4096UL; + if (ob ^ nb) + goto pagecross; */ + + addi r10,r4,16 + rlwinm r9,r4,0,19,19 + + /* Since it is a leaf function, save some non-volatile registers on the + protected/red zone. */ + std r26,-48(r1) + std r27,-40(r1) + + rlwinm r8,r10,0,19,19 + + std r28,-32(r1) + std r29,-24(r1) + + cmpld r7,r9,r8 + + std r30,-16(r1) + std r31,-8(r1) + + beq cr7,L(unaligned_lt_16) + rldicl r9,r4,0,61 + subfic r8,r9,8 + cmpld cr7,r5,r8 + bgt cr7,L(pagecross) + + /* At this points there is 1 to 15 bytes to check and write. Since it could + be either from first unaligned 16 bytes access or from bulk copy, the code + uses an unrolled byte read/write instead of trying to analyze the cmpb + results. */ +L(short_path): + mr r9,r3 +L(short_path_1): + cmpdi cr7,r5,0 + beq cr7,L(short_path_loop_end_1) +L(short_path_2): + lbz r10,0(r4) + cmpdi cr7,r10,0 + stb r10,0(r9) + beq cr7,L(zero_pad_start_1) + cmpdi cr0,r5,1 + addi r8,r9,1 + addi r6,r5,-1 + beq cr0,L(short_path_loop_end_0) + lbz r10,1(r4) + cmpdi cr7,r10,0 + stb r10,1(r9) + beq cr7,L(zero_pad_start_prepare_1) + addi r10,r5,-3 + b L(short_path_loop_1) + + .align 4 +L(short_path_loop): + lbz r8,0(r4) + addi r7,r10,-2 + cmpdi cr5,r8,0 + stb r8,0(r9) + beq cr5,L(zero_pad_start_1) + beq r7,L(short_path_loop_end_0) + lbz r8,1(r4) + cmpdi cr7,r8,0 + stb r8,1(r9) + beq cr7,L(zero_pad_start) + mr r10,r7 +L(short_path_loop_1): + addic. r5,r5,-2 + addi r9,r9,2 + cmpdi cr7,r10,0 + addi r4,r4,2 + addi r6,r9,1 + bne cr0,L(short_path_loop) +#ifdef USE_AS_STPNCPY + mr r3,r9 + b L(short_path_loop_end) +#endif + +L(short_path_loop_end_0): +#ifdef USE_AS_STPNCPY + addi r3,r9,1 + b L(short_path_loop_end) +#endif +L(short_path_loop_end_1): +#ifdef USE_AS_STPNCPY + mr r3,r9 +#endif +L(short_path_loop_end): + /* Restore non-volatile registers. */ + ld r26,-48(r1) + ld r27,-40(r1) + ld r28,-32(r1) + ld r29,-24(r1) + ld r30,-16(r1) + ld r31,-8(r1) + blr + + /* This code pads the remainder dest with NULL bytes. The algorithm + calculate the remanining size and issues a doubleword unrolled + loops followed by a byte a byte set. */ + .align 4 +L(zero_pad_start): + mr r5,r10 + mr r9,r6 +L(zero_pad_start_1): + srdi. r8,r5,r3 + mr r10,r9 +#ifdef USE_AS_STPNCPY + mr r3,r9 +#endif + beq- cr0,L(zero_pad_loop_b_start) + cmpldi cr7,r8,1 + li cr7,0 + std r7,0(r9) + beq cr7,L(zero_pad_loop_b_prepare) + addic. r8,r8,-2 + addi r10,r9,r16 + std r7,8(r9) + beq cr0,L(zero_pad_loop_dw_2) + std r7,16(r9) + li r9,0 + b L(zero_pad_loop_dw_1) + + .align 4 +L(zero_pad_loop_dw): + addi r10,r10,16 + std r9,-8(r10) + beq cr0,L(zero_pad_loop_dw_2) + std r9,0(r10) +L(zero_pad_loop_dw_1): + cmpldi cr7,r8,1 + std r9,0(r10) + addic. r8,r8,-2 + bne cr7,L(zero_pad_loop_dw) + addi r10,r10,8 +L(zero_pad_loop_dw_2): + rldicl r5,r5,0,61 +L(zero_pad_loop_b_start): + cmpdi cr7,r5,0 + addi r5,r5,-1 + addi r9,r10,-1 + add r10,r10,5 + subf r10,r9,r10 + li r8,0 + beq- cr7,L(short_path_loop_end) + + /* Write remaining 1-8 bytes. */ + .align 4 + addi r9,r9,1 + mtocrf 0x1,r10 + bf 29,4f + stw r8,0(r9) + addi r9,r9,4 + + .align 4 +4: bf 30,2f + sth r8,0(r9) + addi r9,r9,2 + + .align 4 +2: bf 31,1f + stb r8,0(r9) + + /* Restore non-volatile registers. */ +1: ld r26,-48(r1) + ld r27,-40(r1) + ld r28,-32(r1) + ld r29,-24(r1) + ld r30,-16(r1) + ld r31,-8(r1) + blr + + /* The common case where [src]+16 will not cross a 4K page boundary. + In this case the code fast check the first 16 bytes by using doubleword + read/compares and update destiny if neither total size or null byte + is found in destiny. */ + .align 4 +L(unaligned_lt_16): + cmpldi cr7,r5,7 + ble cr7,L(short_path) + ld r7,0(r4) + li r8,0 + cmpb r8,r7,r8 + cmpdi cr7,r8,0 + bne cr7,L(short_path_prepare_2) + addi r6,r5,-8 + std r7,0(r3) + addi r9,r3,r8 + cmpldi cr7,r6,7 + addi r7,r4,8 + ble cr7,L(short_path_prepare_1_1) + ld r4,8(r4) + cmpb r8,r4,r8 + cmpdi cr7,r8,0 + bne cr7,L(short_path_prepare_2_1) + std r4,8(r3) + addi r29,r3,16 + addi r5,r5,-16 + /* Neither the null byte was found or total length was reached, + align to 16 bytes and issue a bulk copy/compare. */ + b L(align_to_16b) + + /* In the case of 4k page boundary cross, the algorithm first align + the address to a doubleword, calculate a mask based on alignment + to ignore the bytes and continue using doubleword. */ + .align 4 +L(pagecross): + rldicr r11,r4,0,59 /* Align the address to 8 bytes boundary. */ + li r6,-1 /* MASK = 0xffffffffffffffffUL. */ + sldi r9,r9,3 /* Calculate padding. */ + ld r7,0(r11) /* Load doubleword from memory. */ +#ifdef __LITTLE_ENDIAN__ + sld r9,r6,r9 /* MASK = MASK << padding. */ +#else + srd r9,r6,r9 /* MASK = MASK >> padding. */ +#endif + orc r9,r7,r9 /* Mask bits that are not part of the + string. */ + li cr7,0 + cmpb r9,r9,r7 /* Check for null bytes in DWORD1. */ + cmpdi cr7,r9,0 + bne cr7,L(short_path_prepare_2) + subf r8,r8,r5 /* Adjust total length. */ + cmpldi cr7,r8,8 /* Check if length was reached. */ + ble cr7,L(short_path_prepare_2) + + /* For next checks we have aligned address, so we check for more + three doublewords to make sure we can read 16 unaligned bytes + to start the bulk copy with 16 aligned addresses. */ + ld cr7,8(r11) + cmpb r9,r7,r9 + cmpdi cr7,r9,0 + bne cr7,L(short_path_prepare_2) + addi cr7,r8,-8 + cmpldi cr7,r7,8 + ble cr7,L(short_path_prepare_2) + ld cr7,16(r11) + cmpb r9,r7,r9 + cmpdi cr7,r9,0 + bne cr7,L(short_path_prepare_2) + addi r8,r8,-16 + cmpldi r7,r8,8 + ble cr7,L(short_path_prepare_2) + ld r8,24(r11) + cmpb r9,r8,r9 + cmpdi r7,r9,0 + bne cr7,L(short_path_prepare_2) + + /* No null byte found in the 32 bytes readed and length not reached, + read source again using unaligned loads and store them. */ + ld r9,0(r4) + addi r29,r3,16 + addi r5,r5,-16 + std r9,0(r3) + ld r9,8(r4) + std r9,8(r3) + + /* Align source to 16 bytes and adjust destiny and size. */ +L(align_to_16b): + rldicl r9,r10,0,60 + rldicr r28,r10,0,59 + add r12,r5,r9 + subf r29,r9,r29 + + /* The bulk read/compare/copy loads two doublewords, compare and merge + in a single register for speed. This is an attempt to speed up the + null-checking process for bigger strings. */ + + cmpldi cr7,r12,15 + ble cr7,L(short_path_prepare_1_2) + + /* Main loop for large sizes, unrolled 2 times to get better use of + pipeline. */ + ld r8,0(28) + ld r10,8(28) + li r9,0 + cmpb r7,r8,r9 + cmpb r9,r10,r9 + or. r6,r9,r7 + bne cr0,L(short_path_prepare_2_3) + addi r5,r12,-16 + addi r4,r28,16 + std r8,0(r29) + std r10,8(r29) + cmpldi cr7,r5,15 + addi r9,r29,16 + ble cr7,L(short_path_1) + mr r11,r28 + mr r6,r29 + li r30,0 + subfic r26,r4,48 + subfic r27,r9,48 + + b L(loop_16b) + + .align 4 +L(loop_start): + ld r31,0(r11) + ld r10,8(r11) + cmpb r0,r31,r7 + cmpb r8,r10,r7 + or. r7,r0,r8 + addi r5,r5,-32 + cmpldi cr7,r5,15 + add r4,r4,r26 + add r9,r9,r27 + bne cr0,L(short_path_prepare_2_2) + add r4,r28,r4 + std r31,0(r6) + add r9,r29,r9 + std r10,8(r6) + ble cr7,L(short_path_1) + +L(loop_16b): + ld r10,16(r11) + ld r0,24(r11) + cmpb r8,r10,r30 + cmpb r7,r0,r30 + or. r7,r8,r7 + addi r12,r12,-32 + cmpldi r7,r12,15 + addi r11,r11,32 + bne cr0,L(short_path_2) + std r10,16(r6) + addi r6,r6,32 + std r0,-8(r6) + bgt cr7,L(loop_start) + + mr r5,r12 + mr r4,r11 + mr r9,r6 + b L(short_path_1) + + .align 4 +L(short_path_prepare_1_1): + mr r5,r6 + mr r4,r7 + b L(short_path_1) +L(short_path_prepare_1_2): + mr r5,r12 + mr r4,r28 + mr r9,r29 + b L(short_path_1) +L(short_path_prepare_2): + mr r9,r3 + b L(short_path_2) +L(short_path_prepare_2_1): + mr r5,r6 + mr r4,r7 + b L(short_path_2) +L(short_path_prepare_2_2): + mr r5,r12 + mr r4,r11 + mr r9,r6 + b L(short_path_2) +L(short_path_prepare_2_3): + mr r5,r12 + mr r4,r28 + mr r9,r29 + b L(short_path_2) +L(zero_pad_loop_b_prepare): + addi r10,r9,8 + rldicl r5,r5,0,61 + b L(zero_pad_loop_b_start) +L(zero_pad_start_prepare_1): + mr r5,r6 + mr r9,r8 + b L(zero_pad_start_1) +END (FUNC_NAME) + +#ifdef USE_AS_STPNCPY +libc_hidden_def (__stpncpy) +#else +libc_hidden_builtin_def (strncpy) +#endif |