summaryrefslogtreecommitdiff
path: root/sysdeps/powerpc/powerpc64/power8/strcasecmp.S
diff options
context:
space:
mode:
Diffstat (limited to 'sysdeps/powerpc/powerpc64/power8/strcasecmp.S')
-rw-r--r--sysdeps/powerpc/powerpc64/power8/strcasecmp.S457
1 files changed, 457 insertions, 0 deletions
diff --git a/sysdeps/powerpc/powerpc64/power8/strcasecmp.S b/sysdeps/powerpc/powerpc64/power8/strcasecmp.S
new file mode 100644
index 0000000000..3a2efe2a64
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strcasecmp.S
@@ -0,0 +1,457 @@
+/* Optimized strcasecmp implementation for PowerPC64.
+ Copyright (C) 2016-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <locale-defines.h>
+
+/* int [r3] strcasecmp (const char *s1 [r3], const char *s2 [r4] ) */
+
+#ifndef USE_AS_STRNCASECMP
+# define __STRCASECMP __strcasecmp
+# define STRCASECMP strcasecmp
+#else
+# define __STRCASECMP __strncasecmp
+# define STRCASECMP strncasecmp
+#endif
+/* Convert 16 bytes to lowercase and compare */
+#define TOLOWER() \
+ vaddubm v8, v4, v1; \
+ vaddubm v7, v4, v3; \
+ vcmpgtub v8, v8, v2; \
+ vsel v4, v7, v4, v8; \
+ vaddubm v8, v5, v1; \
+ vaddubm v7, v5, v3; \
+ vcmpgtub v8, v8, v2; \
+ vsel v5, v7, v5, v8; \
+ vcmpequb. v7, v5, v4;
+
+/*
+ * Get 16 bytes for unaligned case.
+ * reg1: Vector to hold next 16 bytes.
+ * reg2: Address to read from.
+ * reg3: Permute control vector.
+ * v8: Tmp vector used to mask unwanted bytes.
+ * v9: Tmp vector,0 when null is found on first 16 bytes
+ */
+#ifdef __LITTLE_ENDIAN__
+#define GET16BYTES(reg1, reg2, reg3) \
+ lvx reg1, 0, reg2; \
+ vspltisb v8, -1; \
+ vperm v8, v8, reg1, reg3; \
+ vcmpequb. v8, v0, v8; \
+ beq cr6, 1f; \
+ vspltisb v9, 0; \
+ b 2f; \
+ .align 4; \
+1: \
+ addi r6, reg2, 16; \
+ lvx v9, 0, r6; \
+2: \
+ vperm reg1, v9, reg1, reg3;
+#else
+#define GET16BYTES(reg1, reg2, reg3) \
+ lvx reg1, 0, reg2; \
+ vspltisb v8, -1; \
+ vperm v8, reg1, v8, reg3; \
+ vcmpequb. v8, v0, v8; \
+ beq cr6, 1f; \
+ vspltisb v9, 0; \
+ b 2f; \
+ .align 4; \
+1: \
+ addi r6, reg2, 16; \
+ lvx v9, 0, r6; \
+2: \
+ vperm reg1, reg1, v9, reg3;
+#endif
+
+/* Check null in v4, v5 and convert to lower. */
+#define CHECKNULLANDCONVERT() \
+ vcmpequb. v7, v0, v5; \
+ beq cr6, 3f; \
+ vcmpequb. v7, v0, v4; \
+ beq cr6, 3f; \
+ b L(null_found); \
+ .align 4; \
+3: \
+ TOLOWER()
+
+#ifdef _ARCH_PWR8
+# define VCLZD_V8_v7 vclzd v8, v7;
+# define MFVRD_R3_V1 mfvrd r3, v1;
+# define VSUBUDM_V9_V8 vsubudm v9, v9, v8;
+# define VPOPCNTD_V8_V8 vpopcntd v8, v8;
+# define VADDUQM_V7_V8 vadduqm v9, v7, v8;
+#else
+# define VCLZD_V8_v7 .long 0x11003fc2
+# define MFVRD_R3_V1 .long 0x7c230067
+# define VSUBUDM_V9_V8 .long 0x112944c0
+# define VPOPCNTD_V8_V8 .long 0x110047c3
+# define VADDUQM_V7_V8 .long 0x11274100
+#endif
+
+ .machine power7
+
+ENTRY (__STRCASECMP)
+#ifdef USE_AS_STRNCASECMP
+ CALL_MCOUNT 3
+#else
+ CALL_MCOUNT 2
+#endif
+#define rRTN r3 /* Return value */
+#define rSTR1 r10 /* 1st string */
+#define rSTR2 r4 /* 2nd string */
+#define rCHAR1 r6 /* Byte read from 1st string */
+#define rCHAR2 r7 /* Byte read from 2nd string */
+#define rADDR1 r8 /* Address of tolower(rCHAR1) */
+#define rADDR2 r12 /* Address of tolower(rCHAR2) */
+#define rLWR1 r8 /* Word tolower(rCHAR1) */
+#define rLWR2 r12 /* Word tolower(rCHAR2) */
+#define rTMP r9
+#define rLOC r11 /* Default locale address */
+
+ cmpd cr7, rRTN, rSTR2
+
+ /* Get locale address. */
+ ld rTMP, __libc_tsd_LOCALE@got@tprel(r2)
+ add rLOC, rTMP, __libc_tsd_LOCALE@tls
+ ld rLOC, 0(rLOC)
+
+ mr rSTR1, rRTN
+ li rRTN, 0
+ beqlr cr7
+#ifdef USE_AS_STRNCASECMP
+ cmpdi cr7, r5, 0
+ beq cr7, L(retnull)
+ cmpdi cr7, r5, 16
+ blt cr7, L(bytebybyte)
+#endif
+ vspltisb v0, 0
+ vspltisb v8, -1
+ /* Check for null in initial characters.
+ Check max of 16 char depending on the alignment.
+ If null is present, proceed byte by byte. */
+ lvx v4, 0, rSTR1
+#ifdef __LITTLE_ENDIAN__
+ lvsr v10, 0, rSTR1 /* Compute mask. */
+ vperm v9, v8, v4, v10 /* Mask bits that are not part of string. */
+#else
+ lvsl v10, 0, rSTR1
+ vperm v9, v4, v8, v10
+#endif
+ vcmpequb. v9, v0, v9 /* Check for null bytes. */
+ bne cr6, L(bytebybyte)
+ lvx v5, 0, rSTR2
+ /* Calculate alignment. */
+#ifdef __LITTLE_ENDIAN__
+ lvsr v6, 0, rSTR2
+ vperm v9, v8, v5, v6 /* Mask bits that are not part of string. */
+#else
+ lvsl v6, 0, rSTR2
+ vperm v9, v5, v8, v6
+#endif
+ vcmpequb. v9, v0, v9 /* Check for null bytes. */
+ bne cr6, L(bytebybyte)
+ /* Check if locale has non ascii characters. */
+ ld rTMP, 0(rLOC)
+ addi r6, rTMP,LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES
+ lwz rTMP, 0(r6)
+ cmpdi cr7, rTMP, 1
+ beq cr7, L(bytebybyte)
+
+ /* Load vector registers with values used for TOLOWER. */
+ /* Load v1 = 0xbf, v2 = 0x19 v3 = 0x20 in each byte. */
+ vspltisb v3, 2
+ vspltisb v9, 4
+ vsl v3, v3, v9
+ vaddubm v1, v3, v3
+ vnor v1, v1, v1
+ vspltisb v2, 7
+ vsububm v2, v3, v2
+
+ andi. rADDR1, rSTR1, 0xF
+ beq cr0, L(align)
+ addi r6, rSTR1, 16
+ lvx v9, 0, r6
+ /* Compute 16 bytes from previous two loads. */
+#ifdef __LITTLE_ENDIAN__
+ vperm v4, v9, v4, v10
+#else
+ vperm v4, v4, v9, v10
+#endif
+L(align):
+ andi. rADDR2, rSTR2, 0xF
+ beq cr0, L(align1)
+ addi r6, rSTR2, 16
+ lvx v9, 0, r6
+ /* Compute 16 bytes from previous two loads. */
+#ifdef __LITTLE_ENDIAN__
+ vperm v5, v9, v5, v6
+#else
+ vperm v5, v5, v9, v6
+#endif
+L(align1):
+ CHECKNULLANDCONVERT()
+ blt cr6, L(match)
+ b L(different)
+ .align 4
+L(match):
+ clrldi r6, rSTR1, 60
+ subfic r7, r6, 16
+#ifdef USE_AS_STRNCASECMP
+ sub r5, r5, r7
+#endif
+ add rSTR1, rSTR1, r7
+ add rSTR2, rSTR2, r7
+ andi. rADDR2, rSTR2, 0xF
+ addi rSTR1, rSTR1, -16
+ addi rSTR2, rSTR2, -16
+ beq cr0, L(aligned)
+#ifdef __LITTLE_ENDIAN__
+ lvsr v6, 0, rSTR2
+#else
+ lvsl v6, 0, rSTR2
+#endif
+ /* There are 2 loops depending on the input alignment.
+ Each loop gets 16 bytes from s1 and s2, check for null,
+ convert to lowercase and compare. Loop till difference
+ or null occurs. */
+L(s1_align):
+ addi rSTR1, rSTR1, 16
+ addi rSTR2, rSTR2, 16
+#ifdef USE_AS_STRNCASECMP
+ cmpdi cr7, r5, 16
+ blt cr7, L(bytebybyte)
+ addi r5, r5, -16
+#endif
+ lvx v4, 0, rSTR1
+ GET16BYTES(v5, rSTR2, v6)
+ CHECKNULLANDCONVERT()
+ blt cr6, L(s1_align)
+ b L(different)
+ .align 4
+L(aligned):
+ addi rSTR1, rSTR1, 16
+ addi rSTR2, rSTR2, 16
+#ifdef USE_AS_STRNCASECMP
+ cmpdi cr7, r5, 16
+ blt cr7, L(bytebybyte)
+ addi r5, r5, -16
+#endif
+ lvx v4, 0, rSTR1
+ lvx v5, 0, rSTR2
+ CHECKNULLANDCONVERT()
+ blt cr6, L(aligned)
+
+ /* Calculate and return the difference. */
+L(different):
+ vaddubm v1, v3, v3
+ vcmpequb v7, v0, v7
+#ifdef __LITTLE_ENDIAN__
+ /* Count trailing zero. */
+ vspltisb v8, -1
+ VADDUQM_V7_V8
+ vandc v8, v9, v7
+ VPOPCNTD_V8_V8
+ vspltb v6, v8, 15
+ vcmpequb. v6, v6, v1
+ blt cr6, L(shift8)
+#else
+ /* Count leading zero. */
+ VCLZD_V8_v7
+ vspltb v6, v8, 7
+ vcmpequb. v6, v6, v1
+ blt cr6, L(shift8)
+ vsro v8, v8, v1
+#endif
+ b L(skipsum)
+ .align 4
+L(shift8):
+ vsumsws v8, v8, v0
+L(skipsum):
+#ifdef __LITTLE_ENDIAN__
+ /* Shift registers based on leading zero count. */
+ vsro v6, v5, v8
+ vsro v7, v4, v8
+ /* Merge and move to GPR. */
+ vmrglb v6, v6, v7
+ vslo v1, v6, v1
+ MFVRD_R3_V1
+ /* Place the characters that are different in first position. */
+ sldi rSTR2, rRTN, 56
+ srdi rSTR2, rSTR2, 56
+ sldi rSTR1, rRTN, 48
+ srdi rSTR1, rSTR1, 56
+#else
+ vslo v6, v5, v8
+ vslo v7, v4, v8
+ vmrghb v1, v6, v7
+ MFVRD_R3_V1
+ srdi rSTR2, rRTN, 48
+ sldi rSTR2, rSTR2, 56
+ srdi rSTR2, rSTR2, 56
+ srdi rSTR1, rRTN, 56
+#endif
+ subf rRTN, rSTR1, rSTR2
+ extsw rRTN, rRTN
+ blr
+
+ .align 4
+ /* OK. We've hit the end of the string. We need to be careful that
+ we don't compare two strings as different because of junk beyond
+ the end of the strings... */
+L(null_found):
+ vaddubm v10, v3, v3
+#ifdef __LITTLE_ENDIAN__
+ /* Count trailing zero. */
+ vspltisb v8, -1
+ VADDUQM_V7_V8
+ vandc v8, v9, v7
+ VPOPCNTD_V8_V8
+ vspltb v6, v8, 15
+ vcmpequb. v6, v6, v10
+ blt cr6, L(shift_8)
+#else
+ /* Count leading zero. */
+ VCLZD_V8_v7
+ vspltb v6, v8, 7
+ vcmpequb. v6, v6, v10
+ blt cr6, L(shift_8)
+ vsro v8, v8, v10
+#endif
+ b L(skipsum1)
+ .align 4
+L(shift_8):
+ vsumsws v8, v8, v0
+L(skipsum1):
+ /* Calculate shift count based on count of zero. */
+ vspltisb v10, 7
+ vslb v10, v10, v10
+ vsldoi v9, v0, v10, 1
+ VSUBUDM_V9_V8
+ vspltisb v8, 8
+ vsldoi v8, v0, v8, 1
+ VSUBUDM_V9_V8
+ /* Shift and remove junk after null character. */
+#ifdef __LITTLE_ENDIAN__
+ vslo v5, v5, v9
+ vslo v4, v4, v9
+#else
+ vsro v5, v5, v9
+ vsro v4, v4, v9
+#endif
+ /* Convert and compare 16 bytes. */
+ TOLOWER()
+ blt cr6, L(retnull)
+ b L(different)
+ .align 4
+L(retnull):
+ li rRTN, 0
+ blr
+ .align 4
+L(bytebybyte):
+ /* Unrolling loop for POWER: loads are done with 'lbz' plus
+ offset and string descriptors are only updated in the end
+ of loop unrolling. */
+ ld rLOC, LOCALE_CTYPE_TOLOWER(rLOC)
+ lbz rCHAR1, 0(rSTR1) /* Load char from s1 */
+ lbz rCHAR2, 0(rSTR2) /* Load char from s2 */
+#ifdef USE_AS_STRNCASECMP
+ rldicl rTMP, r5, 62, 2
+ cmpdi cr7, rTMP, 0
+ beq cr7, L(lessthan4)
+ mtctr rTMP
+#endif
+L(loop):
+ cmpdi rCHAR1, 0 /* *s1 == '\0' ? */
+ sldi rADDR1, rCHAR1, 2 /* Calculate address for tolower(*s1) */
+ sldi rADDR2, rCHAR2, 2 /* Calculate address for tolower(*s2) */
+ lwzx rLWR1, rLOC, rADDR1 /* Load tolower(*s1) */
+ lwzx rLWR2, rLOC, rADDR2 /* Load tolower(*s2) */
+ cmpw cr1, rLWR1, rLWR2 /* r = tolower(*s1) == tolower(*s2) ? */
+ crorc 4*cr1+eq,eq,4*cr1+eq /* (*s1 != '\0') || (r == 1) */
+ beq cr1, L(done)
+ lbz rCHAR1, 1(rSTR1)
+ lbz rCHAR2, 1(rSTR2)
+ cmpdi rCHAR1, 0
+ sldi rADDR1, rCHAR1, 2
+ sldi rADDR2, rCHAR2, 2
+ lwzx rLWR1, rLOC, rADDR1
+ lwzx rLWR2, rLOC, rADDR2
+ cmpw cr1, rLWR1, rLWR2
+ crorc 4*cr1+eq,eq,4*cr1+eq
+ beq cr1, L(done)
+ lbz rCHAR1, 2(rSTR1)
+ lbz rCHAR2, 2(rSTR2)
+ cmpdi rCHAR1, 0
+ sldi rADDR1, rCHAR1, 2
+ sldi rADDR2, rCHAR2, 2
+ lwzx rLWR1, rLOC, rADDR1
+ lwzx rLWR2, rLOC, rADDR2
+ cmpw cr1, rLWR1, rLWR2
+ crorc 4*cr1+eq,eq,4*cr1+eq
+ beq cr1, L(done)
+ lbz rCHAR1, 3(rSTR1)
+ lbz rCHAR2, 3(rSTR2)
+ cmpdi rCHAR1, 0
+ /* Increment both string descriptors */
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+ sldi rADDR1, rCHAR1, 2
+ sldi rADDR2, rCHAR2, 2
+ lwzx rLWR1, rLOC, rADDR1
+ lwzx rLWR2, rLOC, rADDR2
+ cmpw cr1, rLWR1, rLWR2
+ crorc 4*cr1+eq,eq,4*cr1+eq
+ beq cr1, L(done)
+ lbz rCHAR1, 0(rSTR1) /* Load char from s1 */
+ lbz rCHAR2, 0(rSTR2) /* Load char from s2 */
+#ifdef USE_AS_STRNCASECMP
+ bdnz L(loop)
+#else
+ b L(loop)
+#endif
+#ifdef USE_AS_STRNCASECMP
+L(lessthan4):
+ clrldi r5, r5, 62
+ cmpdi cr7, r5, 0
+ beq cr7, L(retnull)
+ mtctr r5
+L(loop1):
+ cmpdi rCHAR1, 0
+ sldi rADDR1, rCHAR1, 2
+ sldi rADDR2, rCHAR2, 2
+ lwzx rLWR1, rLOC, rADDR1
+ lwzx rLWR2, rLOC, rADDR2
+ cmpw cr1, rLWR1, rLWR2
+ crorc 4*cr1+eq,eq,4*cr1+eq
+ beq cr1, L(done)
+ addi rSTR1, rSTR1, 1
+ addi rSTR2, rSTR2, 1
+ lbz rCHAR1, 0(rSTR1)
+ lbz rCHAR2, 0(rSTR2)
+ bdnz L(loop1)
+#endif
+L(done):
+ subf r0, rLWR2, rLWR1
+ extsw rRTN, r0
+ blr
+END (__STRCASECMP)
+
+weak_alias (__STRCASECMP, STRCASECMP)
+libc_hidden_builtin_def (__STRCASECMP)