summaryrefslogtreecommitdiff
path: root/sysdeps/powerpc/powerpc64/power8
diff options
context:
space:
mode:
authorSamuel Thibault <samuel.thibault@ens-lyon.org>2018-12-27 19:01:57 +0000
committerSamuel Thibault <samuel.thibault@ens-lyon.org>2018-12-27 19:01:57 +0000
commitcab56836b146bc129f1ad43f0393d95a9deca63a (patch)
tree4f4e655319bbac78fca170da05275c127429b460 /sysdeps/powerpc/powerpc64/power8
parent04ac1241a4cd004872282c2c82ec37fa33925292 (diff)
parent82dd75a7f436a19047325d62182590c9f9e23a78 (diff)
Merge branch 't/tls' into refs/top-bases/t/tls-threadvar
Diffstat (limited to 'sysdeps/powerpc/powerpc64/power8')
-rw-r--r--sysdeps/powerpc/powerpc64/power8/Implies2
-rw-r--r--sysdeps/powerpc/powerpc64/power8/Makefile3
-rw-r--r--sysdeps/powerpc/powerpc64/power8/fpu/Implies1
-rw-r--r--sysdeps/powerpc/powerpc64/power8/fpu/e_expf.S303
-rw-r--r--sysdeps/powerpc/powerpc64/power8/fpu/multiarch/Implies1
-rw-r--r--sysdeps/powerpc/powerpc64/power8/fpu/s_cosf.S509
-rw-r--r--sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S4
-rw-r--r--sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S4
-rw-r--r--sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S4
-rw-r--r--sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S28
-rw-r--r--sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S28
-rw-r--r--sysdeps/powerpc/powerpc64/power8/fpu/s_llroundf.S1
-rw-r--r--sysdeps/powerpc/powerpc64/power8/fpu/s_sinf.S520
-rw-r--r--sysdeps/powerpc/powerpc64/power8/fpu/w_expf.c1
-rw-r--r--sysdeps/powerpc/powerpc64/power8/memchr.S335
-rw-r--r--sysdeps/powerpc/powerpc64/power8/memcmp.S1447
-rw-r--r--sysdeps/powerpc/powerpc64/power8/memrchr.S345
-rw-r--r--sysdeps/powerpc/powerpc64/power8/memset.S84
-rw-r--r--sysdeps/powerpc/powerpc64/power8/multiarch/Implies1
-rw-r--r--sysdeps/powerpc/powerpc64/power8/stpcpy.S2
-rw-r--r--sysdeps/powerpc/powerpc64/power8/stpncpy.S6
-rw-r--r--sysdeps/powerpc/powerpc64/power8/strcasecmp.S457
-rw-r--r--sysdeps/powerpc/powerpc64/power8/strcasestr-ppc64.c29
-rw-r--r--sysdeps/powerpc/powerpc64/power8/strcasestr.S538
-rw-r--r--sysdeps/powerpc/powerpc64/power8/strchr.S377
-rw-r--r--sysdeps/powerpc/powerpc64/power8/strchrnul.S23
-rw-r--r--sysdeps/powerpc/powerpc64/power8/strcmp.S40
-rw-r--r--sysdeps/powerpc/powerpc64/power8/strcpy.S167
-rw-r--r--sysdeps/powerpc/powerpc64/power8/strcspn.S20
-rw-r--r--sysdeps/powerpc/powerpc64/power8/strlen.S290
-rw-r--r--sysdeps/powerpc/powerpc64/power8/strncase.S20
-rw-r--r--sysdeps/powerpc/powerpc64/power8/strncmp.S10
-rw-r--r--sysdeps/powerpc/powerpc64/power8/strncpy.S176
-rw-r--r--sysdeps/powerpc/powerpc64/power8/strnlen.S425
-rw-r--r--sysdeps/powerpc/powerpc64/power8/strrchr.S468
-rw-r--r--sysdeps/powerpc/powerpc64/power8/strspn.S202
36 files changed, 6704 insertions, 167 deletions
diff --git a/sysdeps/powerpc/powerpc64/power8/Implies b/sysdeps/powerpc/powerpc64/power8/Implies
deleted file mode 100644
index 9a5e3c7277..0000000000
--- a/sysdeps/powerpc/powerpc64/power8/Implies
+++ /dev/null
@@ -1,2 +0,0 @@
-powerpc/powerpc64/power7/fpu
-powerpc/powerpc64/power7
diff --git a/sysdeps/powerpc/powerpc64/power8/Makefile b/sysdeps/powerpc/powerpc64/power8/Makefile
new file mode 100644
index 0000000000..71a59529f3
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/Makefile
@@ -0,0 +1,3 @@
+ifeq ($(subdir),string)
+sysdep_routines += strcasestr-ppc64
+endif
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/Implies b/sysdeps/powerpc/powerpc64/power8/fpu/Implies
deleted file mode 100644
index 1187cdfb0a..0000000000
--- a/sysdeps/powerpc/powerpc64/power8/fpu/Implies
+++ /dev/null
@@ -1 +0,0 @@
-powerpc/powerpc64/power7/fpu/
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/e_expf.S b/sysdeps/powerpc/powerpc64/power8/fpu/e_expf.S
new file mode 100644
index 0000000000..32ee8326e1
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/fpu/e_expf.S
@@ -0,0 +1,303 @@
+/* Optimized expf(). PowerPC64/POWER8 version.
+ Copyright (C) 2016-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* Short algorithm description:
+ *
+ * Let K = 64 (table size).
+ * e^x = 2^(x/log(2)) = 2^n * T[j] * (1 + P(y))
+ * where:
+ * x = m*log(2)/K + y, y in [0.0..log(2)/K]
+ * m = n*K + j, m,n,j - signed integer, j in [0..K-1]
+ * values of 2^(j/K) are tabulated as T[j].
+ *
+ * P(y) is a minimax polynomial approximation of expf(y)-1
+ * on small interval [0.0..log(2)/K].
+ *
+ * P(y) = P3*y*y*y*y + P2*y*y*y + P1*y*y + P0*y, calculated as
+ * z = y*y; P(y) = (P3*z + P1)*z + (P2*z + P0)*y
+ *
+ * Special cases:
+ * expf(NaN) = NaN
+ * expf(+INF) = +INF
+ * expf(-INF) = 0
+ * expf(x) = 1 for subnormals
+ * for finite argument, only expf(0)=1 is exact
+ * expf(x) overflows if x>88.7228317260742190
+ * expf(x) underflows if x<-103.972076416015620
+ */
+
+#define C1 0x42ad496b /* Single precision 125*log(2); |x| at or above this takes L(special_paths). */
+#define C2 0x31800000 /* Single precision 2^(-28); |x| below this takes L(small_args). */
+#define SP_INF 0x7f800000 /* Single precision Inf bit pattern. */
+#define SP_EXP_BIAS 0x1fc0 /* Single precision exponent bias 127, pre-shifted left by 6 (127 * 64) because n is kept above the 6-bit j field. */
+
+#define DATA_OFFSET r9
+
+/* Implements the function
+
+ float [fp1] expf (float [fp1] x) */
+
+ .machine power8
+ENTRY (__ieee754_expf, 4)
+ addis DATA_OFFSET,r2,.Lanchor@toc@ha
+ addi DATA_OFFSET,DATA_OFFSET,.Lanchor@toc@l /* DATA_OFFSET = address of .Lanchor, base for every constant load. */
+
+ xscvdpspn v0,v1 /* Single-precision bit image of x. */
+ mfvsrd r8,v0 /* r8 = x (float bits, in the upper word) */
+ lfd fp2,(.KLN2-.Lanchor)(DATA_OFFSET) /* fp2 = K/log(2) */
+ lfd fp3,(.P2-.Lanchor)(DATA_OFFSET) /* fp3 = P2 */
+ rldicl r3,r8,32,33 /* r3 = |x| (float bits, sign bit cleared) */
+ lis r4,C1@ha /* r4 = 125*log(2) */
+ ori r4,r4,C1@l
+ cmpw r3,r4
+ lfd fp5,(.P3-.Lanchor)(DATA_OFFSET) /* fp5 = P3 */
+ lfd fp4,(.RS-.Lanchor)(DATA_OFFSET) /* fp4 = 2^23 + 2^22 */
+ fmadd fp2,fp1,fp2,fp4 /* fp2 = x * K/log(2) + (2^23 + 2^22) */
+ bge L(special_paths) /* |x| >= 125*log(2) ? */
+
+ lis r4,C2@ha /* r4 = 2^(-28) */
+ ori r4,r4,C2@l
+ cmpw r3,r4
+ blt L(small_args) /* |x| < 2^(-28) ? */
+
+ /* Main path: here if 2^(-28) <= |x| < 125*log(2) */
+ frsp fp6,fp2 /* Round to single; the added 2^23 + 2^22 leaves integer m in the low mantissa bits. */
+ xscvdpsp v2,v2 /* Same value as a float bit image. */
+ mfvsrd r8,v2 /* Upper word of r8 = float bits carrying m. */
+ mr r3,r8 /* r3 = m */
+ rldicl r8,r8,32,58 /* r8 = j (low 6 bits of m) */
+ lfs fp4,(.SP_RS-.Lanchor)(DATA_OFFSET) /* fp4 = 2^23 + 2^22 (single) */
+ fsubs fp2,fp6,fp4 /* fp2 = m = x * K/log(2), as an integer-valued float */
+ srdi r3,r3,32
+ clrrwi r3,r3,6 /* r3 = n (j bits cleared; n sits above bit 5) */
+ lfd fp6,(.NLN2K-.Lanchor)(DATA_OFFSET) /* fp6 = -log(2)/K */
+ fmadd fp0,fp2,fp6,fp1 /* fp0 = y = x - m*log(2)/K */
+ fmul fp2,fp0,fp0 /* fp2 = z = y^2 */
+ lfd fp4,(.P1-.Lanchor)(DATA_OFFSET) /* fp4 = P1 */
+ lfd fp6,(.P0-.Lanchor)(DATA_OFFSET) /* fp6 = P0 */
+ lis r4,SP_EXP_BIAS@ha
+ ori r4,r4,SP_EXP_BIAS@l /* r4 = 127 << 6 */
+ add r3,r3,r4 /* Biased exponent, still shifted left by 6. */
+ rldic r3,r3,49,1 /* r3 = 2^n (float bits: biased exponent moved into the exponent field) */
+ fmadd fp4,fp5,fp2,fp4 /* fp4 = P3 * z + P1 */
+ fmadd fp6,fp3,fp2,fp6 /* fp6 = P2 * z + P0 */
+ mtvsrd v1,r3 /* x is no longer needed; fp1 becomes the scale factor. */
+ xscvspdp v1,v1 /* fp1 = 2^n converted to double precision */
+ fmul fp4,fp4,fp2 /* fp4 = (P3 * z + P1)*z */
+ fmadd fp0,fp0,fp6,fp4 /* fp0 = P(y) */
+ sldi r8,r8,3 /* Access doublewords from T[j]. */
+ addi r6,DATA_OFFSET,(.Ttable-.Lanchor)
+ lfdx fp3,r6,r8 /* fp3 = T[j] */
+ fmadd fp0,fp0,fp3,fp3 /* fp0 = T[j] * (1 + P(y)) */
+ fmul fp1,fp1,fp0 /* fp1 = 2^n * T[j] * (1 + P(y)) */
+ frsp fp1,fp1 /* Round the result to single precision. */
+ blr
+
+ .align 4
+/* Reached when |x| >= 125*log(2): x is near or past overflow/underflow, infinite or NaN. r3 = |x| bits, upper word of r8 = x bits. */
+L(special_paths):
+ srdi r8,r8,32
+ rlwinm r8,r8,3,29,29 /* r8 = 0, if x positive.
+ r8 = 4, otherwise. */
+ addi r6,DATA_OFFSET,(.SPRANGE-.Lanchor)
+ lwzx r4,r6,r8 /* r4 = .SPRANGE[signbit(x)] */
+ cmpw r3,r4
+ /* |x| <= .SPRANGE[signbit(x)] */
+ ble L(near_under_or_overflow)
+
+ lis r4,SP_INF@ha
+ ori r4,r4,SP_INF@l
+ cmpw r3,r4
+ bge L(arg_inf_or_nan) /* |x| >= Inf ? (x is Inf or NaN; cr0 is reused at the target) */
+
+ addi r6,DATA_OFFSET,(.SPLARGE_SMALL-.Lanchor)
+ lfsx fp1,r6,r8 /* fp1 = .SPLARGE_SMALL[signbit(x)] */
+ fmuls fp1,fp1,fp1 /* Square it in single precision to produce the overflowed or underflowed result. */
+ blr
+
+
+ .align 4
+L(small_args):
+ /* |x| < 2^(-28): expf(x) is returned as x + 1.0 computed in single precision. */
+ lfs fp2,(.SPone-.Lanchor)(DATA_OFFSET) /* fp2 = 1.0 */
+ fadds fp1,fp1,fp2 /* fp1 = x + 1.0 */
+ blr
+
+
+ .align 4
+L(arg_inf_or_nan):
+ bne L(arg_nan) /* cr0 still holds the |x| vs. SP_INF compare: not equal means NaN. */
+
+ /* expf(+INF) = +INF
+ expf(-INF) = 0 */
+ addi r6,DATA_OFFSET,(.INF_ZERO-.Lanchor)
+ lfsx fp1,r6,r8 /* r8 = 0 for +INF, 4 for -INF: fp1 = .INF_ZERO[signbit(x)] */
+ blr
+
+
+ .align 4
+L(arg_nan):
+ /* expf(NaN) = NaN: x + x propagates the NaN operand. */
+ fadd fp1,fp1,fp1 /* fp1 = x + x */
+ frsp fp1,fp1 /* Round to single precision. */
+ blr
+
+ .align 4
+L(near_under_or_overflow): /* Same evaluation as the main path, but 2^n is built directly as a double. */
+ frsp fp6,fp2 /* Round to single; integer m is left in the low mantissa bits. */
+ xscvdpsp v2,v2 /* Same value as a float bit image. */
+ mfvsrd r8,v2 /* Upper word of r8 = float bits carrying m. */
+ mr r3,r8 /* r3 = m */
+ rldicl r8,r8,32,58 /* r8 = j (low 6 bits of m) */
+ lfs fp4,(.SP_RS-.Lanchor)(DATA_OFFSET) /* fp4 = 2^23 + 2^22 (single) */
+ fsubs fp2,fp6,fp4 /* fp2 = m = x * K/log(2), as an integer-valued float */
+ srdi r3,r3,32
+ clrrwi r3,r3,6 /* r3 = n (j bits cleared; n sits above bit 5) */
+ lfd fp6,(.NLN2K-.Lanchor)(DATA_OFFSET) /* fp6 = -log(2)/K */
+ fmadd fp0,fp2,fp6,fp1 /* fp0 = y = x - m*log(2)/K */
+ fmul fp2,fp0,fp0 /* fp2 = z = y^2 */
+ lfd fp4,(.P1-.Lanchor)(DATA_OFFSET) /* fp4 = P1 */
+ lfd fp6,(.P0-.Lanchor)(DATA_OFFSET) /* fp6 = P0 */
+ ld r4,(.DP_EXP_BIAS-.Lanchor)(DATA_OFFSET) /* r4 = .DP_EXP_BIAS doubleword */
+ add r3,r3,r4 /* Biased exponent, still shifted left by 6. */
+ rldic r3,r3,46,1 /* r3 = 2^n (double-precision bits) */
+ fmadd fp4,fp5,fp2,fp4 /* fp4 = P3 * z + P1 */
+ fmadd fp6,fp3,fp2,fp6 /* fp6 = P2 * z + P0 */
+ mtvsrd v1,r3 /* fp1 = 2^n; already a double, so no conversion is needed. */
+ fmul fp4,fp4,fp2 /* fp4 = (P3*z + P1)*z */
+ fmadd fp0,fp0,fp6,fp4 /* fp0 = P(y) */
+ sldi r8,r8,3 /* Access doublewords from T[j]. */
+ addi r6,DATA_OFFSET,(.Ttable-.Lanchor)
+ lfdx fp3,r6,r8 /* fp3 = T[j] */
+ fmadd fp0,fp0,fp3,fp3 /* fp0 = T[j] * (1 + P(y)) */
+ fmul fp1,fp1,fp0 /* fp1 = 2^n * T[j] * (1 + P(y)) */
+ frsp fp1,fp1 /* Round the result to single precision. */
+ blr
+END(__ieee754_expf)
+
+ .section .rodata, "a",@progbits
+.Lanchor:
+ .balign 8
+/* Table T[j] = 2^(j/K). Double precision. */
+.Ttable:
+ .8byte 0x3ff0000000000000
+ .8byte 0x3ff02c9a3e778061
+ .8byte 0x3ff059b0d3158574
+ .8byte 0x3ff0874518759bc8
+ .8byte 0x3ff0b5586cf9890f
+ .8byte 0x3ff0e3ec32d3d1a2
+ .8byte 0x3ff11301d0125b51
+ .8byte 0x3ff1429aaea92de0
+ .8byte 0x3ff172b83c7d517b
+ .8byte 0x3ff1a35beb6fcb75
+ .8byte 0x3ff1d4873168b9aa
+ .8byte 0x3ff2063b88628cd6
+ .8byte 0x3ff2387a6e756238
+ .8byte 0x3ff26b4565e27cdd
+ .8byte 0x3ff29e9df51fdee1
+ .8byte 0x3ff2d285a6e4030b
+ .8byte 0x3ff306fe0a31b715
+ .8byte 0x3ff33c08b26416ff
+ .8byte 0x3ff371a7373aa9cb
+ .8byte 0x3ff3a7db34e59ff7
+ .8byte 0x3ff3dea64c123422
+ .8byte 0x3ff4160a21f72e2a
+ .8byte 0x3ff44e086061892d
+ .8byte 0x3ff486a2b5c13cd0
+ .8byte 0x3ff4bfdad5362a27
+ .8byte 0x3ff4f9b2769d2ca7
+ .8byte 0x3ff5342b569d4f82
+ .8byte 0x3ff56f4736b527da
+ .8byte 0x3ff5ab07dd485429
+ .8byte 0x3ff5e76f15ad2148
+ .8byte 0x3ff6247eb03a5585
+ .8byte 0x3ff6623882552225
+ .8byte 0x3ff6a09e667f3bcd
+ .8byte 0x3ff6dfb23c651a2f
+ .8byte 0x3ff71f75e8ec5f74
+ .8byte 0x3ff75feb564267c9
+ .8byte 0x3ff7a11473eb0187
+ .8byte 0x3ff7e2f336cf4e62
+ .8byte 0x3ff82589994cce13
+ .8byte 0x3ff868d99b4492ed
+ .8byte 0x3ff8ace5422aa0db
+ .8byte 0x3ff8f1ae99157736
+ .8byte 0x3ff93737b0cdc5e5
+ .8byte 0x3ff97d829fde4e50
+ .8byte 0x3ff9c49182a3f090
+ .8byte 0x3ffa0c667b5de565
+ .8byte 0x3ffa5503b23e255d
+ .8byte 0x3ffa9e6b5579fdbf
+ .8byte 0x3ffae89f995ad3ad
+ .8byte 0x3ffb33a2b84f15fb
+ .8byte 0x3ffb7f76f2fb5e47
+ .8byte 0x3ffbcc1e904bc1d2
+ .8byte 0x3ffc199bdd85529c
+ .8byte 0x3ffc67f12e57d14b
+ .8byte 0x3ffcb720dcef9069
+ .8byte 0x3ffd072d4a07897c
+ .8byte 0x3ffd5818dcfba487
+ .8byte 0x3ffda9e603db3285
+ .8byte 0x3ffdfc97337b9b5f
+ .8byte 0x3ffe502ee78b3ff6
+ .8byte 0x3ffea4afa2a490da
+ .8byte 0x3ffefa1bee615a27
+ .8byte 0x3fff50765b6e4540
+ .8byte 0x3fffa7c1819e90d8
+
+.KLN2:
+ .8byte 0x40571547652b82fe /* Double precision K/log(2). */
+
+/* Double precision polynomial coefficients. */
+.P0:
+ .8byte 0x3fefffffffffe7c6
+.P1:
+ .8byte 0x3fe00000008d6118
+.P2:
+ .8byte 0x3fc55550da752d4f
+.P3:
+ .8byte 0x3fa56420eb78fa85
+
+.RS:
+ .8byte 0x4168000000000000 /* Double precision 2^23 + 2^22. */
+.NLN2K:
+ .8byte 0xbf862e42fefa39ef /* Double precision -log(2)/K. */
+.DP_EXP_BIAS:
+ .8byte 0x000000000000ffc0 /* Double precision exponent bias 1023, pre-shifted left by 6 (1023 * 64). */
+
+ .balign 4
+.SPone:
+ .4byte 0x3f800000 /* Single precision 1.0. */
+.SP_RS:
+ .4byte 0x4b400000 /* Single precision 2^23 + 2^22. */
+
+.SPRANGE: /* Single precision bounds on |x|, indexed by the sign of x (byte offset 0 or 4). */
+ .4byte 0x42b17217 /* x positive: if |x| > this bound, then result overflows. */
+ .4byte 0x42cff1b4 /* x negative: if |x| > this bound (x < -bound), then result underflows. */
+
+.SPLARGE_SMALL:
+ .4byte 0x71800000 /* 2^100. */
+ .4byte 0x0d800000 /* 2^-100. */
+
+.INF_ZERO:
+ .4byte 0x7f800000 /* Single precision Inf. */
+ .4byte 0 /* Single precision zero. */
+
+strong_alias (__ieee754_expf, __expf_finite)
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/multiarch/Implies b/sysdeps/powerpc/powerpc64/power8/fpu/multiarch/Implies
deleted file mode 100644
index 7fd86fdf87..0000000000
--- a/sysdeps/powerpc/powerpc64/power8/fpu/multiarch/Implies
+++ /dev/null
@@ -1 +0,0 @@
-powerpc/powerpc64/power7/fpu/multiarch
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_cosf.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_cosf.S
new file mode 100644
index 0000000000..af71382fb2
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_cosf.S
@@ -0,0 +1,509 @@
+/* Optimized cosf(). PowerPC64/POWER8 version.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#define _ERRNO_H 1
+#include <bits/errno.h>
+#include <libm-alias-float.h>
+
+#define FRAMESIZE (FRAME_MIN_SIZE+16)
+
+#define FLOAT_EXPONENT_SHIFT 23 /* Right shift that brings the float exponent field down to bit 0. */
+#define FLOAT_EXPONENT_BIAS 127 /* Single precision exponent bias. */
+#define INTEGER_BITS 3 /* Integer bits kept when reducing modulo PI/4. */
+
+#define PI_4 0x3f490fdb /* PI/4 */
+#define NINEPI_4 0x40e231d6 /* 9 * PI/4 */
+#define TWO_PN5 0x3d000000 /* 2^-5 */
+#define TWO_PN27 0x32000000 /* 2^-27 */
+#define INFINITY 0x7f800000 /* Single precision Inf bit pattern. */
+#define TWO_P23 0x4b000000 /* 2^23 */
+#define FX_FRACTION_1_28 0x9249250 /* Fixed-point 1/28: 0x100000000 / 28, rounded up (slightly above the exact quotient). */
+
+ /* Implements the function
+
+ float [fp1] cosf (float [fp1] x) */
+
+ .machine power8
+ENTRY (__cosf, 4)
+ addis r9,r2,L(anchor)@toc@ha
+ addi r9,r9,L(anchor)@toc@l /* r9 = address of L(anchor), base for every constant load. */
+
+ lis r4,PI_4@h
+ ori r4,r4,PI_4@l /* r4 = PI/4 (float bits) */
+
+ xscvdpspn v0,v1 /* Single-precision bit image of x. */
+ mfvsrd r8,v0 /* Float bits of x, in the upper word of r8. */
+ rldicl r3,r8,32,33 /* r3 = |x| float bits (sign bit removed). */
+
+ cmpw r3,r4
+ bge L(greater_or_equal_pio4) /* |x| >= PI/4 ? */
+
+ lis r4,TWO_PN5@h
+ ori r4,r4,TWO_PN5@l /* r4 = 2^-5 (float bits) */
+
+ cmpw r3,r4
+ blt L(less_2pn5) /* |x| < 2^-5 ? */
+
+ /* Here 2^-5 <= |x| < PI/4. Chebyshev polynomial of the form:
+ * 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))). */
+
+ lfd fp9,(L(C0)-L(anchor))(r9) /* fp9 = C0 */
+ lfd fp10,(L(C1)-L(anchor))(r9) /* fp10 = C1 */
+ lfd fp11,(L(C2)-L(anchor))(r9) /* fp11 = C2 */
+ lfd fp12,(L(C3)-L(anchor))(r9) /* fp12 = C3 */
+ lfd fp13,(L(C4)-L(anchor))(r9) /* fp13 = C4 */
+
+ fmul fp2,fp1,fp1 /* x^2 */
+ lfd fp3,(L(DPone)-L(anchor))(r9) /* fp3 = 1.0 */
+
+ fmadd fp4,fp2,fp13,fp12 /* C3+x^2*C4 */
+ fmadd fp4,fp2,fp4,fp11 /* C2+x^2*(C3+x^2*C4) */
+ fmadd fp4,fp2,fp4,fp10 /* C1+x^2*(C2+x^2*(C3+x^2*C4)) */
+ fmadd fp4,fp2,fp4,fp9 /* C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4))) */
+ fmadd fp1,fp2,fp4,fp3 /* 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))) */
+ frsp fp1,fp1 /* Round to single precision. */
+
+ blr
+
+ .balign 16
+L(greater_or_equal_pio4):
+ lis r4,NINEPI_4@h
+ ori r4,r4,NINEPI_4@l /* r4 = 9 * PI/4 (float bits) */
+ cmpw r3,r4
+ bge L(greater_or_equal_9pio4) /* |x| >= 9 * PI/4 ? */
+
+ /* Here PI/4 <= |x| < 9 * PI/4. Calculate quotient of |x|/(PI/4). */
+ lfd fp2,(L(invpio4)-L(anchor))(r9) /* fp2 = 4/PI */
+ fabs fp1,fp1 /* |x| */
+ fmul fp2,fp1,fp2 /* |x|/(PI/4) */
+ fctiduz fp2,fp2 /* Convert to integer, rounding toward zero. */
+ mfvsrd r3,v2 /* n = integer quotient of |x| / (PI/4) */
+
+ /* Now use that quotient to find |x| mod (PI/2). */
+ addi r7,r3,1 /* r7 = n + 1, also consumed by L(reduced). */
+ rldicr r5,r7,2,60 /* ((n+1) >> 1) << 3 */
+ addi r6,r9,(L(pio2_table)-L(anchor))
+ lfdx fp4,r5,r6 /* fp4 = pio2_table[(n+1) >> 1] */
+ fsub fp1,fp1,fp4 /* Reduced argument; execution falls through into L(reduced). */
+
+ .balign 16
+L(reduced):
+ /* Now we are in the range -PI/4 to PI/4. On entry fp1 = reduced argument and r7 = n + 1. */
+
+ /* Work out if we are in a positive or negative primary interval. */
+ addi r7,r7,2 /* r7 = n + 3 */
+ rldicl r4,r7,62,63 /* ((n+3) >> 2) & 1 */
+
+ /* Load a 1.0 or -1.0. */
+ addi r5,r9,(L(ones)-L(anchor))
+ sldi r4,r4,3 /* Scale the 0/1 selector to a doubleword offset. */
+ lfdx fp0,r4,r5 /* fp0 = sign factor from L(ones) */
+
+ /* Are we in the primary interval of sin or cos? */
+ andi. r4,r7,0x2 /* Test bit 1 of n + 3. */
+ bne L(cos) /* Bit set: use the cos polynomial. */
+
+ /* Chebyshev polynomial of the form:
+ x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))). */
+
+ lfd fp9,(L(S0)-L(anchor))(r9) /* fp9 = S0 */
+ lfd fp10,(L(S1)-L(anchor))(r9) /* fp10 = S1 */
+ lfd fp11,(L(S2)-L(anchor))(r9) /* fp11 = S2 */
+ lfd fp12,(L(S3)-L(anchor))(r9) /* fp12 = S3 */
+ lfd fp13,(L(S4)-L(anchor))(r9) /* fp13 = S4 */
+
+ fmul fp2,fp1,fp1 /* x^2 */
+ fmul fp3,fp2,fp1 /* x^3 */
+
+ fmadd fp4,fp2,fp13,fp12 /* S3+x^2*S4 */
+ fmadd fp4,fp2,fp4,fp11 /* S2+x^2*(S3+x^2*S4) */
+ fmadd fp4,fp2,fp4,fp10 /* S1+x^2*(S2+x^2*(S3+x^2*S4)) */
+ fmadd fp4,fp2,fp4,fp9 /* S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4))) */
+ fmadd fp4,fp3,fp4,fp1 /* x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))) */
+ fmul fp4,fp4,fp0 /* Add in the sign. */
+ frsp fp1,fp4 /* Round to single precision. */
+
+ blr
+
+ .balign 16
+L(cos):
+ /* Branch target of L(reduced); fp0 holds the sign factor. Chebyshev polynomial of the form:
+ 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))). */
+
+ lfd fp9,(L(C0)-L(anchor))(r9) /* fp9 = C0 */
+ lfd fp10,(L(C1)-L(anchor))(r9) /* fp10 = C1 */
+ lfd fp11,(L(C2)-L(anchor))(r9) /* fp11 = C2 */
+ lfd fp12,(L(C3)-L(anchor))(r9) /* fp12 = C3 */
+ lfd fp13,(L(C4)-L(anchor))(r9) /* fp13 = C4 */
+
+ fmul fp2,fp1,fp1 /* x^2 */
+ lfd fp3,(L(DPone)-L(anchor))(r9) /* fp3 = 1.0 */
+
+ fmadd fp4,fp2,fp13,fp12 /* C3+x^2*C4 */
+ fmadd fp4,fp2,fp4,fp11 /* C2+x^2*(C3+x^2*C4) */
+ fmadd fp4,fp2,fp4,fp10 /* C1+x^2*(C2+x^2*(C3+x^2*C4)) */
+ fmadd fp4,fp2,fp4,fp9 /* C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4))) */
+ fmadd fp4,fp2,fp4,fp3 /* 1.0 + x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))) */
+ fmul fp4,fp4,fp0 /* Add in the sign. */
+ frsp fp1,fp4 /* Round to single precision. */
+
+ blr
+
+ .balign 16
+L(greater_or_equal_9pio4):
+ lis r4,INFINITY@h
+ ori r4,r4,INFINITY@l /* r4 = Inf (float bits) */
+ cmpw r3,r4
+ bge L(inf_or_nan) /* |x| >= Inf ? (cr0 is reused at the target) */
+
+ lis r4,TWO_P23@h
+ ori r4,r4,TWO_P23@l /* r4 = 2^23 (float bits) */
+ cmpw r3,r4
+ bge L(greater_or_equal_2p23) /* |x| >= 2^23 ? */
+
+ fabs fp1,fp1 /* |x| */
+
+ /* Here 9 * PI/4 <= |x| < 2^23. Calculate quotient of |x|/(PI/4). */
+ lfd fp2,(L(invpio4)-L(anchor))(r9) /* fp2 = 4/PI */
+
+ lfd fp3,(L(DPone)-L(anchor))(r9) /* fp3 = 1.0 */
+ lfd fp4,(L(DPhalf)-L(anchor))(r9) /* fp4 = 0.5 */
+ fmul fp2,fp1,fp2 /* |x|/(PI/4) */
+ friz fp2,fp2 /* n = floor(|x|/(PI/4)) */
+
+ /* Calculate (n + 1) / 2. */
+ fadd fp2,fp2,fp3 /* n + 1 */
+ fmul fp3,fp2,fp4 /* (n + 1) / 2 */
+ friz fp3,fp3 /* Truncate to an integer value k. */
+
+ lfd fp4,(L(pio2hi)-L(anchor))(r9) /* fp4 = high limb stored at L(pio2hi) */
+ lfd fp5,(L(pio2lo)-L(anchor))(r9) /* fp5 = low limb stored at L(pio2lo) */
+
+ fmul fp6,fp4,fp3 /* k * high limb */
+ fadd fp6,fp6,fp1 /* |x| + k * high limb */
+ fmadd fp1,fp5,fp3,fp6 /* fp1 = |x| + k * high limb + k * low limb */
+
+ fctiduz fp2,fp2 /* Convert n + 1 to an integer. */
+ mfvsrd r7,v2 /* n + 1 */
+
+ b L(reduced)
+
+ .balign 16
+L(inf_or_nan):
+ bne L(skip_errno_setting) /* Is a NAN? (cr0 still holds the |x| vs. INFINITY compare) */
+
+ /* x is +/-Inf: errno must be set to EDOM, which needs a call.
+ We delayed the creation of the stack frame, as well as the saving of
+ the link register, until this point, where it is actually needed. */
+
+ stfd fp1,-8(r1) /* Stash x just below the caller's stack pointer; reloaded after the call. */
+
+ /* Save the link register. */
+ mflr r0
+ std r0,16(r1)
+ cfi_offset(lr, 16)
+
+ /* Create the stack frame. */
+ stdu r1,-FRAMESIZE(r1)
+ cfi_adjust_cfa_offset(FRAMESIZE)
+
+ bl JUMPTARGET(__errno_location) /* Result in r3 is used below as the address of errno. */
+ nop
+
+ /* Restore the stack frame. */
+ addi r1,r1,FRAMESIZE
+ cfi_adjust_cfa_offset(-FRAMESIZE)
+ /* Restore the link register. */
+ ld r0,16(r1)
+ mtlr r0
+
+ lfd fp1,-8(r1) /* Reload x. */
+
+ /* errno = EDOM */
+ li r4,EDOM
+ stw r4,0(r3)
+
+L(skip_errno_setting):
+ fsub fp1,fp1,fp1 /* x - x */
+ blr
+
+ .balign 16
+L(greater_or_equal_2p23):
+ fabs fp1,fp1 /* |x| */
+
+ srwi r4,r3,FLOAT_EXPONENT_SHIFT /* r4 = biased exponent of |x| */
+ subi r4,r4,FLOAT_EXPONENT_BIAS /* r4 = unbiased exponent */
+
+ /* We reduce the input modulo pi/4, so we need 3 bits of integer
+ to determine where in 2*pi we are. Index into our array
+ accordingly. */
+ addi r4,r4,INTEGER_BITS
+
+ /* To avoid an expensive divide, for the range we care about (0 - 127)
+ we can transform x/28 into:
+
+ x/28 = (x * ((0x100000000 / 28) + 1)) >> 32
+
+ mulhwu returns the top 32 bits of the 64 bit result, doing the
+ shift for us in the same instruction. The top 32 bits of the
+ destination register are undefined, so we have to mask them. */
+
+ lis r6,FX_FRACTION_1_28@h
+ ori r6,r6,FX_FRACTION_1_28@l
+ mulhwu r5,r4,r6 /* r5 = (exponent + INTEGER_BITS) / 28 */
+ clrldi r5,r5,32 /* Keep only the low word of r5. */
+
+ /* Get our pointer into the invpio4_table array. */
+ sldi r4,r5,3 /* Doubleword index to byte offset. */
+ addi r6,r9,(L(invpio4_table)-L(anchor))
+ add r4,r4,r6
+
+ lfd fp2,0(r4) /* Four consecutive table entries. */
+ lfd fp3,8(r4)
+ lfd fp4,16(r4)
+ lfd fp5,24(r4)
+
+ fmul fp6,fp2,fp1 /* fp6 = highest limb of |x| * 4/PI */
+ fmul fp7,fp3,fp1 /* fp7 = higher limb */
+ fmul fp8,fp4,fp1 /* fp8 = next lower limb */
+ fmul fp9,fp5,fp1 /* fp9 = lowest limb */
+
+ /* Mask off larger integer bits in highest double word that we don't
+ care about to avoid losing precision when combining with smaller
+ values. */
+ fctiduz fp10,fp6
+ mfvsrd r7,v10
+ rldicr r7,r7,0,(63-INTEGER_BITS) /* Clear the low INTEGER_BITS bits. */
+ mtvsrd v10,r7
+ fcfidu fp10,fp10 /* Integer bits. */
+
+ fsub fp6,fp6,fp10 /* highest -= integer bits */
+
+ /* Work out the integer component, rounded down. Use the top two
+ limbs for this. */
+ fadd fp10,fp6,fp7 /* highest + higher */
+
+ fctiduz fp10,fp10
+ mfvsrd r7,v10 /* r7 = integer component n */
+ andi. r0,r7,1 /* cr0 = parity of n, consumed by the beq below. */
+ fcfidu fp10,fp10
+
+ /* Subtract integer component from highest limb. */
+ fsub fp12,fp6,fp10
+
+ beq L(even_integer)
+
+ /* Our integer component is odd, so we are in the -PI/4 to 0 primary
+ region. We need to shift our result down by PI/4, and to do this
+ in the mod (4/PI) space we simply subtract 1. */
+ lfd fp11,(L(DPone)-L(anchor))(r9) /* fp11 = 1.0 */
+ fsub fp12,fp12,fp11
+
+ /* Now add up all the limbs in order. */
+ fadd fp12,fp12,fp7
+ fadd fp12,fp12,fp8
+ fadd fp12,fp12,fp9
+
+ /* And finally multiply by pi/4. */
+ lfd fp13,(L(pio4)-L(anchor))(r9) /* fp13 = PI/4 */
+ fmul fp1,fp12,fp13
+
+ addi r7,r7,1 /* r7 = n + 1, as L(reduced) expects. */
+ b L(reduced)
+
+L(even_integer): /* Integer component is even: sum the limbs, then check for a carry past 1.0. */
+ lfd fp11,(L(DPone)-L(anchor))(r9) /* fp11 = 1.0 */
+
+ /* Now add up all the limbs in order. */
+ fadd fp12,fp12,fp7
+ fadd fp12,fp12,fp8
+ fadd fp12,fp12,fp9
+
+ /* We need to check if the addition of all the limbs resulted in us
+ overflowing 1.0. */
+ fcmpu 0,fp12,fp11
+ bgt L(greater_than_one)
+
+ /* And finally multiply by pi/4. */
+ lfd fp13,(L(pio4)-L(anchor))(r9) /* fp13 = PI/4 */
+ fmul fp1,fp12,fp13
+
+ addi r7,r7,1 /* r7 = n + 1, as L(reduced) expects. */
+ b L(reduced)
+
+L(greater_than_one):
+ /* We did overflow 1.0 when adding up all the limbs. Add 1.0 to our
+ integer, and subtract 1.0 from our result. Since that makes the
+ integer component odd, we need to subtract another 1.0 as
+ explained above. */
+ addi r7,r7,1 /* Integer component + 1. */
+
+ lfd fp11,(L(DPtwo)-L(anchor))(r9) /* fp11 = 2.0: both subtractions in one step. */
+ fsub fp12,fp12,fp11
+
+ /* And finally multiply by pi/4. */
+ lfd fp13,(L(pio4)-L(anchor))(r9) /* fp13 = PI/4 */
+ fmul fp1,fp12,fp13
+
+ addi r7,r7,1 /* r7 = updated integer + 1, as L(reduced) expects. */
+ b L(reduced)
+
+ .balign 16
+L(less_2pn5):
+ lis r4,TWO_PN27@h
+ ori r4,r4,TWO_PN27@l /* r4 = 2^-27 (float bits) */
+
+ cmpw r3,r4
+ blt L(less_2pn27) /* |x| < 2^-27 ? */
+
+ /* Here 2^-27 <= |x| < 2^-5. A simpler Chebyshev approximation is close enough for this range:
+ 1.0+x^2*(CC0+x^3*CC1). */
+
+ lfd fp10,(L(CC0)-L(anchor))(r9) /* fp10 = CC0 */
+ lfd fp11,(L(CC1)-L(anchor))(r9) /* fp11 = CC1 */
+
+ fmul fp2,fp1,fp1 /* x^2 */
+ fmul fp3,fp2,fp1 /* x^3 */
+ lfd fp1,(L(DPone)-L(anchor))(r9) /* fp1 = 1.0; x itself is no longer needed. */
+
+ fmadd fp4,fp3,fp11,fp10 /* CC0+x^3*CC1 */
+ fmadd fp1,fp2,fp4,fp1 /* 1.0+x^2*(CC0+x^3*CC1) */
+
+ frsp fp1,fp1 /* Round to single precision. */
+
+ blr
+
+ .balign 16
+L(less_2pn27):
+ /* |x| < 2^-27: the result is computed as 1.0 - |x|. Special cases handled this way:
+
+ cosf(subnormal) raises inexact
+ cosf(min_normalized) raises inexact
+ cosf(normalized) raises inexact. */
+
+ lfd fp2,(L(DPone)-L(anchor))(r9) /* fp2 = 1.0 */
+
+ fabs fp1,fp1 /* |x| */
+ fsub fp1,fp2,fp1 /* 1.0-|x| */
+
+ frsp fp1,fp1 /* Round to single precision. */
+
+ blr
+
+END (__cosf)
+
+ .section .rodata, "a"
+
+ .balign 8
+
+L(anchor):
+
+ /* Chebyshev constants for sin, range -PI/4 - PI/4. */
+L(S0): .8byte 0xbfc5555555551cd9
+L(S1): .8byte 0x3f81111110c2688b
+L(S2): .8byte 0xbf2a019f8b4bd1f9
+L(S3): .8byte 0x3ec71d7264e6b5b4
+L(S4): .8byte 0xbe5a947e1674b58a
+
+ /* Chebyshev constants for cos, range 2^-27 - 2^-5. */
+L(CC0): .8byte 0xbfdfffffff5cc6fd
+L(CC1): .8byte 0x3fa55514b178dac5
+
+ /* Chebyshev constants for cos, range -PI/4 - PI/4. */
+L(C0): .8byte 0xbfdffffffffe98ae
+L(C1): .8byte 0x3fa55555545c50c7
+L(C2): .8byte 0xbf56c16b348b6874
+L(C3): .8byte 0x3efa00eb9ac43cc0
+L(C4): .8byte 0xbe923c97dd8844d7
+
+L(invpio2):
+ .8byte 0x3fe45f306dc9c883 /* 2/PI */
+
+L(invpio4):
+ .8byte 0x3ff45f306dc9c883 /* 4/PI */
+
+L(invpio4_table):
+ .8byte 0x0000000000000000
+ .8byte 0x3ff45f306c000000
+ .8byte 0x3e3c9c882a000000
+ .8byte 0x3c54fe13a8000000
+ .8byte 0x3aaf47d4d0000000
+ .8byte 0x38fbb81b6c000000
+ .8byte 0x3714acc9e0000000
+ .8byte 0x3560e4107c000000
+ .8byte 0x33bca2c756000000
+ .8byte 0x31fbd778ac000000
+ .8byte 0x300b7246e0000000
+ .8byte 0x2e5d2126e8000000
+ .8byte 0x2c97003248000000
+ .8byte 0x2ad77504e8000000
+ .8byte 0x290921cfe0000000
+ .8byte 0x274deb1cb0000000
+ .8byte 0x25829a73e0000000
+ .8byte 0x23fd1046be000000
+ .8byte 0x2224baed10000000
+ .8byte 0x20709d338e000000
+ .8byte 0x1e535a2f80000000
+ .8byte 0x1cef904e64000000
+ .8byte 0x1b0d639830000000
+ .8byte 0x1964ce7d24000000
+ .8byte 0x17b908bf16000000
+
+L(pio4):
+ .8byte 0x3fe921fb54442d18 /* PI/4 */
+
+/* -PI/2 as a sum of two doubles (both limbs are stored negated). We only
+ use 32 bits of the upper limb to avoid losing significant bits when
+ multiplying with up to (2^22)/(pi/2). */
+L(pio2hi):
+ .8byte 0xbff921fb54400000 /* High limb of -PI/2. */
+
+L(pio2lo):
+ .8byte 0xbdd0b4611a626332 /* Low limb of -PI/2. */
+
+L(pio2_table):
+ .8byte 0
+ .8byte 0x3ff921fb54442d18 /* 1 * PI/2 */
+ .8byte 0x400921fb54442d18 /* 2 * PI/2 */
+ .8byte 0x4012d97c7f3321d2 /* 3 * PI/2 */
+ .8byte 0x401921fb54442d18 /* 4 * PI/2 */
+ .8byte 0x401f6a7a2955385e /* 5 * PI/2 */
+ .8byte 0x4022d97c7f3321d2 /* 6 * PI/2 */
+ .8byte 0x4025fdbbe9bba775 /* 7 * PI/2 */
+ .8byte 0x402921fb54442d18 /* 8 * PI/2 */
+ .8byte 0x402c463abeccb2bb /* 9 * PI/2 */
+ .8byte 0x402f6a7a2955385e /* 10 * PI/2 */
+
+L(small):
+ .8byte 0x3cd0000000000000 /* 2^-50 */
+
+L(ones):
+ .8byte 0x3ff0000000000000 /* +1.0 */
+ .8byte 0xbff0000000000000 /* -1.0 */
+
+L(DPhalf):
+ .8byte 0x3fe0000000000000 /* 0.5 */
+
+L(DPone):
+ .8byte 0x3ff0000000000000 /* 1.0 */
+
+L(DPtwo):
+ .8byte 0x4000000000000000 /* 2.0 */
+
+libm_alias_float (__cos, cos)
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S
index 3b0c88e5eb..aac54a9364 100644
--- a/sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S
+++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S
@@ -1,5 +1,5 @@
/* isfinite(). PowerPC64/POWER8 version.
- Copyright (C) 2014-2016 Free Software Foundation, Inc.
+ Copyright (C) 2014-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -23,7 +23,7 @@
/* int [r3] __finite ([fp1] x) */
-EALIGN (__finite, 4, 0)
+ENTRY_TOCLESS (__finite, 4)
CALL_MCOUNT 0
MFVSRD_R3_V1
lis r9,0x8010
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S
index 4708239689..94746ef068 100644
--- a/sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S
+++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S
@@ -1,5 +1,5 @@
/* isinf(). PowerPC64/POWER8 version.
- Copyright (C) 2014-2016 Free Software Foundation, Inc.
+ Copyright (C) 2014-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -23,7 +23,7 @@
/* int [r3] __isinf([fp1] x) */
-EALIGN (__isinf, 4, 0)
+ENTRY_TOCLESS (__isinf, 4)
CALL_MCOUNT 0
MFVSRD_R3_V1
lis r9,0x7ff0 /* r9 = 0x7ff0 */
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S
index 0e5c19333c..8aef354f68 100644
--- a/sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S
+++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S
@@ -1,5 +1,5 @@
/* isnan(). PowerPC64/POWER8 version.
- Copyright (C) 2014-2016 Free Software Foundation, Inc.
+ Copyright (C) 2014-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -23,7 +23,7 @@
/* int [r3] __isnan([f1] x) */
-EALIGN (__isnan, 4, 0)
+ENTRY_TOCLESS (__isnan, 4)
CALL_MCOUNT 0
MFVSRD_R3_V1
lis r9,0x7ff0
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S
index 0af9feee5a..7f18d705a9 100644
--- a/sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S
+++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S
@@ -1,5 +1,5 @@
/* Round double to long int. POWER8 PowerPC64 version.
- Copyright (C) 2014-2016 Free Software Foundation, Inc.
+ Copyright (C) 2014-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -18,11 +18,13 @@
#include <sysdep.h>
#include <math_ldbl_opt.h>
+#include <libm-alias-float.h>
+#include <libm-alias-double.h>
#define MFVSRD_R3_V1 .long 0x7c230066 /* mfvsrd r3,vs1 */
/* long long int[r3] __llrint (double x[fp1]) */
-ENTRY (__llrint)
+ENTRY_TOCLESS (__llrint)
CALL_MCOUNT 0
fctid fp1,fp1
MFVSRD_R3_V1
@@ -30,16 +32,12 @@ ENTRY (__llrint)
END (__llrint)
strong_alias (__llrint, __lrint)
-weak_alias (__llrint, llrint)
-weak_alias (__lrint, lrint)
-
-#ifdef NO_LONG_DOUBLE
-strong_alias (__llrint, __llrintl)
-weak_alias (__llrint, llrintl)
-strong_alias (__lrint, __lrintl)
-weak_alias (__lrint, lrintl)
-#endif
-#if LONG_DOUBLE_COMPAT(libm, GLIBC_2_1)
-compat_symbol (libm, __llrint, llrintl, GLIBC_2_1)
-compat_symbol (libm, __lrint, lrintl, GLIBC_2_1)
-#endif
+libm_alias_double (__llrint, llrint)
+libm_alias_double (__lrint, lrint)
+/* The double version also works for single-precision as both float and
+ double parameters are passed in 64bit FPRs and both versions are expected
+ to return [long] long type. */
+strong_alias (__llrint, __llrintf)
+libm_alias_float (__llrint, llrint)
+strong_alias (__lrint, __lrintf)
+libm_alias_float (__lrint, lrint)
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S
index af2409deb8..a22fc63bb3 100644
--- a/sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S
+++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S
@@ -1,5 +1,5 @@
/* llround function. POWER8 PowerPC64 version.
- Copyright (C) 2014-2016 Free Software Foundation, Inc.
+ Copyright (C) 2014-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -19,12 +19,14 @@
#include <sysdep.h>
#include <endian.h>
#include <math_ldbl_opt.h>
+#include <libm-alias-float.h>
+#include <libm-alias-double.h>
#define MFVSRD_R3_V1 .long 0x7c230066 /* mfvsrd r3,vs1 */
/* long long [r3] llround (float x [fp1]) */
-ENTRY (__llround)
+ENTRY_TOCLESS (__llround)
CALL_MCOUNT 0
frin fp1,fp1 /* Round to nearest +-0.5. */
fctidz fp1,fp1 /* Convert To Integer DW round toward 0. */
@@ -33,16 +35,12 @@ ENTRY (__llround)
END (__llround)
strong_alias (__llround, __lround)
-weak_alias (__llround, llround)
-weak_alias (__lround, lround)
-
-#ifdef NO_LONG_DOUBLE
-weak_alias (__llround, llroundl)
-strong_alias (__llround, __llroundl)
-weak_alias (__lround, lroundl)
-strong_alias (__lround, __lroundl)
-#endif
-#if LONG_DOUBLE_COMPAT(libm, GLIBC_2_1)
-compat_symbol (libm, __llround, llroundl, GLIBC_2_1)
-compat_symbol (libm, __lround, lroundl, GLIBC_2_1)
-#endif
+libm_alias_double (__llround, llround)
+libm_alias_double (__lround, lround)
+/* The double version also works for single-precision as both float and
+ double parameters are passed in 64bit FPRs and both versions are expected
+ to return [long] long type. */
+strong_alias (__llround, __llroundf)
+libm_alias_float (__llround, llround)
+strong_alias (__lround, __lroundf)
+libm_alias_float (__lround, lround)
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_llroundf.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_llroundf.S
new file mode 100644
index 0000000000..9ea6bd105b
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_llroundf.S
@@ -0,0 +1 @@
+/* __llroundf is in s_llround.S. */
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_sinf.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_sinf.S
new file mode 100644
index 0000000000..59e613c102
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_sinf.S
@@ -0,0 +1,520 @@
+/* Optimized sinf(). PowerPC64/POWER8 version.
+ Copyright (C) 2016-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#define _ERRNO_H 1
+#include <bits/errno.h>
+#include <libm-alias-float.h>
+
+#define FRAMESIZE (FRAME_MIN_SIZE+16)
+
+#define FLOAT_EXPONENT_SHIFT 23
+#define FLOAT_EXPONENT_BIAS 127
+#define INTEGER_BITS 3
+
+#define PI_4 0x3f490fdb /* PI/4 */
+#define NINEPI_4 0x40e231d6 /* 9 * PI/4 */
+#define TWO_PN5 0x3d000000 /* 2^-5 */
+#define TWO_PN27 0x32000000 /* 2^-27 */
+#define INFINITY 0x7f800000 /* Single-precision +Inf bit pattern. */
+#define TWO_P23 0x4b000000 /* 2^23 */
+#define FX_FRACTION_1_28 0x9249250 /* 0x100000000 / 28 + 1 */
+
+ /* Implements the function
+
+ float [fp1] sinf (float [fp1] x) */
+
+ .machine power8
+ENTRY (__sinf, 4)
+ addis r9,r2,L(anchor)@toc@ha /* r9 = address of L(anchor), TOC-relative. */
+ addi r9,r9,L(anchor)@toc@l
+
+ lis r4,PI_4@h /* r4 = PI_4 bit pattern. */
+ ori r4,r4,PI_4@l
+
+ xscvdpspn v0,v1 /* Convert x to single-precision format. */
+ mfvsrd r8,v0 /* r8 = converted bits, kept for the sign test in L(reduced). */
+ rldicl r3,r8,32,33 /* Remove sign bit. */
+
+ cmpw r3,r4 /* |x| >= PI/4?  Then argument reduction is needed. */
+ bge L(greater_or_equal_pio4)
+
+ lis r4,TWO_PN5@h /* r4 = TWO_PN5 bit pattern. */
+ ori r4,r4,TWO_PN5@l
+
+ cmpw r3,r4 /* |x| < 2^-5?  Then a shorter polynomial is used. */
+ blt L(less_2pn5)
+
+ /* Chebyshev polynomial of the form:
+ * x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))). */
+
+ lfd fp9,(L(S0)-L(anchor))(r9)
+ lfd fp10,(L(S1)-L(anchor))(r9)
+ lfd fp11,(L(S2)-L(anchor))(r9)
+ lfd fp12,(L(S3)-L(anchor))(r9)
+ lfd fp13,(L(S4)-L(anchor))(r9)
+
+ fmul fp2,fp1,fp1 /* x^2 */
+ fmul fp3,fp2,fp1 /* x^3 */
+
+ fmadd fp4,fp2,fp13,fp12 /* S3+x^2*S4 */
+ fmadd fp4,fp2,fp4,fp11 /* S2+x^2*(S3+x^2*S4) */
+ fmadd fp4,fp2,fp4,fp10 /* S1+x^2*(S2+x^2*(S3+x^2*S4)) */
+ fmadd fp4,fp2,fp4,fp9 /* S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4))) */
+ fmadd fp1,fp3,fp4,fp1 /* x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))) */
+ frsp fp1,fp1 /* Round to single precision. */
+
+ blr /* Result is returned in fp1. */
+
+ .balign 16
+L(greater_or_equal_pio4):
+ lis r4,NINEPI_4@h /* r4 = NINEPI_4 bit pattern. */
+ ori r4,r4,NINEPI_4@l
+ cmpw r3,r4 /* |x| >= 9*PI/4?  Then pio2_table is not used. */
+ bge L(greater_or_equal_9pio4)
+
+ /* Calculate quotient of |x|/(PI/4). */
+ lfd fp2,(L(invpio4)-L(anchor))(r9)
+ fabs fp1,fp1 /* |x| */
+ fmul fp2,fp1,fp2 /* |x|/(PI/4) */
+ fctiduz fp2,fp2 /* Truncate the quotient to an unsigned integer. */
+ mfvsrd r3,v2 /* n = trunc(|x| / (PI/4)) */
+
+ /* Now use that quotient to subtract ((n+1) >> 1) * PI/2 from |x|. */
+ addi r7,r3,1 /* r7 = n + 1, consumed by L(reduced). */
+ rldicr r5,r7,2,60 /* ((n+1) >> 1) << 3 */
+ addi r6,r9,(L(pio2_table)-L(anchor))
+ lfdx fp4,r5,r6 /* fp4 = pio2_table[(n+1) >> 1] */
+ fsub fp1,fp1,fp4 /* Falls through into L(reduced). */
+
+ .balign 16
+L(reduced):
+ /* Now we are in the range -PI/4 to PI/4: fp1 = reduced argument, r7 = n + 1, r8 = bits of x. */
+
+ /* Work out if we are in a positive or negative primary interval. */
+ rldicl r4,r7,62,63 /* ((n+1) >> 2) & 1 */
+
+ /* We are operating on |x|, so we need to add back the original
+ sign. */
+ rldicl r8,r8,33,63 /* (x >> 31) & 1, ie the sign bit. */
+ xor r4,r4,r8 /* 0 if result should be positive,
+ 1 if negative. */
+
+ /* Load a 1.0 or -1.0. */
+ addi r5,r9,(L(ones)-L(anchor))
+ sldi r4,r4,3 /* Scale the 0/1 selector to an 8-byte table offset. */
+ lfdx fp0,r4,r5 /* fp0 = +1.0 or -1.0 from L(ones). */
+
+ /* Are we in the primary interval of sin or cos? */
+ andi. r4,r7,0x2 /* Bit 1 of (n+1) set selects the cos polynomial. */
+ bne L(cos)
+
+ /* Chebyshev polynomial of the form:
+ x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))). */
+
+ lfd fp9,(L(S0)-L(anchor))(r9)
+ lfd fp10,(L(S1)-L(anchor))(r9)
+ lfd fp11,(L(S2)-L(anchor))(r9)
+ lfd fp12,(L(S3)-L(anchor))(r9)
+ lfd fp13,(L(S4)-L(anchor))(r9)
+
+ fmul fp2,fp1,fp1 /* x^2 */
+ fmul fp3,fp2,fp1 /* x^3 */
+
+ fmadd fp4,fp2,fp13,fp12 /* S3+x^2*S4 */
+ fmadd fp4,fp2,fp4,fp11 /* S2+x^2*(S3+x^2*S4) */
+ fmadd fp4,fp2,fp4,fp10 /* S1+x^2*(S2+x^2*(S3+x^2*S4)) */
+ fmadd fp4,fp2,fp4,fp9 /* S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4))) */
+ fmadd fp4,fp3,fp4,fp1 /* x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))) */
+ fmul fp4,fp4,fp0 /* Add in the sign. */
+ frsp fp1,fp4 /* Round to single precision. */
+
+ blr /* Result is returned in fp1. */
+
+ .balign 16
+L(cos):
+ /* Chebyshev polynomial of the form:
+ 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))). */
+
+ lfd fp9,(L(C0)-L(anchor))(r9)
+ lfd fp10,(L(C1)-L(anchor))(r9)
+ lfd fp11,(L(C2)-L(anchor))(r9)
+ lfd fp12,(L(C3)-L(anchor))(r9)
+ lfd fp13,(L(C4)-L(anchor))(r9)
+
+ fmul fp2,fp1,fp1 /* x^2 */
+ lfd fp3,(L(DPone)-L(anchor))(r9) /* fp3 = 1.0, the constant term. */
+
+ fmadd fp4,fp2,fp13,fp12 /* C3+x^2*C4 */
+ fmadd fp4,fp2,fp4,fp11 /* C2+x^2*(C3+x^2*C4) */
+ fmadd fp4,fp2,fp4,fp10 /* C1+x^2*(C2+x^2*(C3+x^2*C4)) */
+ fmadd fp4,fp2,fp4,fp9 /* C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4))) */
+ fmadd fp4,fp2,fp4,fp3 /* 1.0 + x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))) */
+ fmul fp4,fp4,fp0 /* Add in the sign. */
+ frsp fp1,fp4 /* Round to single precision. */
+
+ blr /* Result is returned in fp1. */
+
+ .balign 16
+L(greater_or_equal_9pio4):
+ lis r4,INFINITY@h /* r4 = INFINITY bit pattern. */
+ ori r4,r4,INFINITY@l
+ cmpw r3,r4 /* cr0 is reused by L(inf_or_nan) to tell Inf from NaN. */
+ bge L(inf_or_nan)
+
+ lis r4,TWO_P23@h /* r4 = TWO_P23 bit pattern. */
+ ori r4,r4,TWO_P23@l
+ cmpw r3,r4 /* |x| >= 2^23?  Then use the invpio4_table reduction. */
+ bge L(greater_or_equal_2p23)
+
+ fabs fp1,fp1 /* |x| */
+
+ /* Calculate quotient of |x|/(PI/4). */
+ lfd fp2,(L(invpio4)-L(anchor))(r9)
+
+ lfd fp3,(L(DPone)-L(anchor))(r9)
+ lfd fp4,(L(DPhalf)-L(anchor))(r9)
+ fmul fp2,fp1,fp2 /* |x|/(PI/4) */
+ friz fp2,fp2 /* n = floor(|x|/(PI/4)) */
+
+ /* Calculate (n + 1) / 2. */
+ fadd fp2,fp2,fp3 /* n + 1 */
+ fmul fp3,fp2,fp4 /* (n + 1) / 2 */
+ friz fp3,fp3 /* k = (n + 1) / 2, truncated to an integer. */
+
+ lfd fp4,(L(pio2hi)-L(anchor))(r9)
+ lfd fp5,(L(pio2lo)-L(anchor))(r9)
+
+ fmul fp6,fp4,fp3 /* k * pio2hi */
+ fadd fp6,fp6,fp1 /* |x| + k * pio2hi */
+ fmadd fp1,fp5,fp3,fp6 /* |x| + k * pio2hi + k * pio2lo */
+
+ fctiduz fp2,fp2 /* Convert n + 1 to an integer for the GPR. */
+ mfvsrd r7,v2 /* n + 1 */
+
+ b L(reduced)
+
+ .balign 16
+L(inf_or_nan):
+ bne L(skip_errno_setting) /* Is a NAN? */
+
+ /* We delayed the creation of the stack frame, as well as the saving of
+ the link register, because only at this point, we are sure that
+ doing so is actually needed. */
+
+ stfd fp1,-8(r1) /* Save x across the call; the slot lies inside the frame created below. */
+
+ /* Save the link register. */
+ mflr r0
+ std r0,16(r1)
+ cfi_offset(lr, 16)
+
+ /* Create the stack frame. */
+ stdu r1,-FRAMESIZE(r1)
+ cfi_adjust_cfa_offset(FRAMESIZE)
+
+ bl JUMPTARGET(__errno_location) /* r3 = address of errno (used by the stw below). */
+ nop
+
+ /* Restore the stack frame. */
+ addi r1,r1,FRAMESIZE
+ cfi_adjust_cfa_offset(-FRAMESIZE)
+ /* Restore the link register. */
+ ld r0,16(r1)
+ mtlr r0
+
+ lfd fp1,-8(r1) /* Reload x. */
+
+ /* errno = EDOM */
+ li r4,EDOM
+ stw r4,0(r3)
+
+L(skip_errno_setting):
+ fsub fp1,fp1,fp1 /* x - x */
+ blr /* Result is returned in fp1. */
+
+ .balign 16
+L(greater_or_equal_2p23):
+ fabs fp1,fp1 /* |x| */
+
+ srwi r4,r3,FLOAT_EXPONENT_SHIFT /* Biased exponent of |x|. */
+ subi r4,r4,FLOAT_EXPONENT_BIAS /* Unbiased exponent. */
+
+ /* We reduce the input modulo pi/4, so we need 3 bits of integer
+ to determine where in 2*pi we are. Index into our array
+ accordingly. */
+ addi r4,r4,INTEGER_BITS
+
+ /* To avoid an expensive divide, for the range we care about (0 - 127)
+ we can transform x/28 into:
+
+ x/28 = (x * ((0x100000000 / 28) + 1)) >> 32
+
+ mulhwu returns the top 32 bits of the 64 bit result, doing the
+ shift for us in the same instruction. The top 32 bits are undefined,
+ so we have to mask them. */
+
+ lis r6,FX_FRACTION_1_28@h
+ ori r6,r6,FX_FRACTION_1_28@l
+ mulhwu r5,r4,r6 /* r5 = (exponent + INTEGER_BITS) / 28 */
+ clrldi r5,r5,32
+
+ /* Get our pointer into the invpio4_table array. */
+ sldi r4,r5,3 /* Scale the index by 8 bytes per entry. */
+ addi r6,r9,(L(invpio4_table)-L(anchor))
+ add r4,r4,r6
+
+ lfd fp2,0(r4) /* Four consecutive table limbs. */
+ lfd fp3,8(r4)
+ lfd fp4,16(r4)
+ lfd fp5,24(r4)
+
+ fmul fp6,fp2,fp1 /* highest = limb0 * |x| */
+ fmul fp7,fp3,fp1 /* higher = limb1 * |x| */
+ fmul fp8,fp4,fp1 /* limb2 * |x| */
+ fmul fp9,fp5,fp1 /* limb3 * |x| */
+
+ /* Mask off larger integer bits in highest double word that we don't
+ care about to avoid losing precision when combining with smaller
+ values. */
+ fctiduz fp10,fp6
+ mfvsrd r7,v10
+ rldicr r7,r7,0,(63-INTEGER_BITS) /* Clear the low INTEGER_BITS bits. */
+ mtvsrd v10,r7
+ fcfidu fp10,fp10 /* Integer bits. */
+
+ fsub fp6,fp6,fp10 /* highest -= integer bits */
+
+ /* Work out the integer component, rounded down. Use the top two
+ limbs for this. */
+ fadd fp10,fp6,fp7 /* highest + higher */
+
+ fctiduz fp10,fp10
+ mfvsrd r7,v10 /* r7 = integer component. */
+ andi. r0,r7,1 /* cr0.eq set when the integer component is even. */
+ fcfidu fp10,fp10
+
+ /* Subtract integer component from highest limb. */
+ fsub fp12,fp6,fp10
+
+ beq L(even_integer)
+
+ /* Our integer component is odd, so we are in the -PI/4 to 0 primary
+ region. We need to shift our result down by PI/4, and to do this
+ in the mod (4/PI) space we simply subtract 1. */
+ lfd fp11,(L(DPone)-L(anchor))(r9)
+ fsub fp12,fp12,fp11
+
+ /* Now add up all the limbs in order. */
+ fadd fp12,fp12,fp7
+ fadd fp12,fp12,fp8
+ fadd fp12,fp12,fp9
+
+ /* And finally multiply by pi/4. */
+ lfd fp13,(L(pio4)-L(anchor))(r9)
+ fmul fp1,fp12,fp13
+
+ addi r7,r7,1 /* r7 = integer component + 1 for L(reduced). */
+ b L(reduced)
+
+L(even_integer):
+ lfd fp11,(L(DPone)-L(anchor))(r9) /* fp11 = 1.0 for the overflow check below. */
+
+ /* Now add up all the limbs in order. */
+ fadd fp12,fp12,fp7
+ fadd fp12,fp12,fp8 /* FPR operand: fp12, not the GPR name r12. */
+ fadd fp12,fp12,fp9 /* FPR operand: fp12, not the GPR name r12. */
+
+ /* We need to check if the addition of all the limbs resulted in us
+ overflowing 1.0. */
+ fcmpu 0,fp12,fp11
+ bgt L(greater_than_one)
+
+ /* And finally multiply by pi/4. */
+ lfd fp13,(L(pio4)-L(anchor))(r9)
+ fmul fp1,fp12,fp13
+
+ addi r7,r7,1 /* r7 = integer component + 1 for L(reduced). */
+ b L(reduced)
+
+L(greater_than_one):
+ /* We did overflow 1.0 when adding up all the limbs. Add 1.0 to our
+ integer, and subtract 1.0 from our result. Since that makes the
+ integer component odd, we need to subtract another 1.0 as
+ explained above. */
+ addi r7,r7,1 /* Integer component += 1. */
+
+ lfd fp11,(L(DPtwo)-L(anchor))(r9) /* fp11 = 2.0: both subtractions at once. */
+ fsub fp12,fp12,fp11
+
+ /* And finally multiply by pi/4. */
+ lfd fp13,(L(pio4)-L(anchor))(r9)
+ fmul fp1,fp12,fp13
+
+ addi r7,r7,1 /* r7 = integer component + 1 for L(reduced). */
+ b L(reduced)
+
+ .balign 16
+L(less_2pn5):
+ lis r4,TWO_PN27@h /* r4 = TWO_PN27 bit pattern. */
+ ori r4,r4,TWO_PN27@l
+
+ cmpw r3,r4 /* |x| < 2^-27?  Then no polynomial is evaluated. */
+ blt L(less_2pn27)
+
+ /* A simpler Chebyshev approximation is close enough for this range:
+ x+x^3*(SS0+x^2*SS1). */
+
+ lfd fp10,(L(SS0)-L(anchor))(r9)
+ lfd fp11,(L(SS1)-L(anchor))(r9)
+
+ fmul fp2,fp1,fp1 /* x^2 */
+ fmul fp3,fp2,fp1 /* x^3 */
+
+ fmadd fp4,fp2,fp11,fp10 /* SS0+x^2*SS1 */
+ fmadd fp1,fp3,fp4,fp1 /* x+x^3*(SS0+x^2*SS1) */
+
+ frsp fp1,fp1 /* Round to single precision. */
+
+ blr /* Result is returned in fp1. */
+
+ .balign 16
+L(less_2pn27):
+ cmpwi r3,0 /* |x| == 0?  Then x is returned unchanged. */
+ beq L(zero)
+
+ /* Handle some special cases:
+
+ sinf(subnormal) raises inexact/underflow
+ sinf(min_normalized) raises inexact/underflow
+ sinf(normalized) raises inexact. */
+
+ lfd fp2,(L(small)-L(anchor))(r9) /* fp2 = 2^-50 */
+
+ fmul fp2,fp1,fp2 /* x * small */
+ fsub fp1,fp1,fp2 /* x - x * small */
+
+ frsp fp1,fp1 /* Round to single precision. */
+
+ blr
+
+ .balign 16
+L(zero):
+ blr /* fp1 still holds x (+0.0 or -0.0). */
+
+END (__sinf)
+
+ .section .rodata, "a"
+
+ .balign 8
+
+L(anchor):
+
+ /* Chebyshev constants for sin, range -PI/4 - PI/4. */
+L(S0): .8byte 0xbfc5555555551cd9
+L(S1): .8byte 0x3f81111110c2688b
+L(S2): .8byte 0xbf2a019f8b4bd1f9
+L(S3): .8byte 0x3ec71d7264e6b5b4
+L(S4): .8byte 0xbe5a947e1674b58a
+
+ /* Chebyshev constants for sin, range 2^-27 - 2^-5. */
+L(SS0): .8byte 0xbfc555555543d49d
+L(SS1): .8byte 0x3f8110f475cec8c5
+
+ /* Chebyshev constants for cos, range -PI/4 - PI/4. */
+L(C0): .8byte 0xbfdffffffffe98ae
+L(C1): .8byte 0x3fa55555545c50c7
+L(C2): .8byte 0xbf56c16b348b6874
+L(C3): .8byte 0x3efa00eb9ac43cc0
+L(C4): .8byte 0xbe923c97dd8844d7
+
+L(invpio2):
+ .8byte 0x3fe45f306dc9c883 /* 2/PI */
+
+L(invpio4):
+ .8byte 0x3ff45f306dc9c883 /* 4/PI */
+
+L(invpio4_table):
+ .8byte 0x0000000000000000
+ .8byte 0x3ff45f306c000000
+ .8byte 0x3e3c9c882a000000
+ .8byte 0x3c54fe13a8000000
+ .8byte 0x3aaf47d4d0000000
+ .8byte 0x38fbb81b6c000000
+ .8byte 0x3714acc9e0000000
+ .8byte 0x3560e4107c000000
+ .8byte 0x33bca2c756000000
+ .8byte 0x31fbd778ac000000
+ .8byte 0x300b7246e0000000
+ .8byte 0x2e5d2126e8000000
+ .8byte 0x2c97003248000000
+ .8byte 0x2ad77504e8000000
+ .8byte 0x290921cfe0000000
+ .8byte 0x274deb1cb0000000
+ .8byte 0x25829a73e0000000
+ .8byte 0x23fd1046be000000
+ .8byte 0x2224baed10000000
+ .8byte 0x20709d338e000000
+ .8byte 0x1e535a2f80000000
+ .8byte 0x1cef904e64000000
+ .8byte 0x1b0d639830000000
+ .8byte 0x1964ce7d24000000
+ .8byte 0x17b908bf16000000
+
+L(pio4):
+ .8byte 0x3fe921fb54442d18 /* PI/4 */
+
+/* -PI/2 as a sum of two doubles. We only use 32 bits of the upper limb
+ to avoid losing significant bits when multiplying with up to
+ (2^22)/(pi/2). */
+L(pio2hi):
+ .8byte 0xbff921fb54400000
+
+L(pio2lo):
+ .8byte 0xbdd0b4611a626332
+
+L(pio2_table):
+ .8byte 0
+ .8byte 0x3ff921fb54442d18 /* 1 * PI/2 */
+ .8byte 0x400921fb54442d18 /* 2 * PI/2 */
+ .8byte 0x4012d97c7f3321d2 /* 3 * PI/2 */
+ .8byte 0x401921fb54442d18 /* 4 * PI/2 */
+ .8byte 0x401f6a7a2955385e /* 5 * PI/2 */
+ .8byte 0x4022d97c7f3321d2 /* 6 * PI/2 */
+ .8byte 0x4025fdbbe9bba775 /* 7 * PI/2 */
+ .8byte 0x402921fb54442d18 /* 8 * PI/2 */
+ .8byte 0x402c463abeccb2bb /* 9 * PI/2 */
+ .8byte 0x402f6a7a2955385e /* 10 * PI/2 */
+
+L(small):
+ .8byte 0x3cd0000000000000 /* 2^-50 */
+
+L(ones):
+ .8byte 0x3ff0000000000000 /* +1.0 */
+ .8byte 0xbff0000000000000 /* -1.0 */
+
+L(DPhalf):
+ .8byte 0x3fe0000000000000 /* 0.5 */
+
+L(DPone):
+ .8byte 0x3ff0000000000000 /* 1.0 */
+
+L(DPtwo):
+ .8byte 0x4000000000000000 /* 2.0 */
+
+libm_alias_float (__sin, sin)
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/w_expf.c b/sysdeps/powerpc/powerpc64/power8/fpu/w_expf.c
new file mode 100644
index 0000000000..b5fe164520
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/fpu/w_expf.c
@@ -0,0 +1 @@
+#include <sysdeps/../math/w_expf.c>
diff --git a/sysdeps/powerpc/powerpc64/power8/memchr.S b/sysdeps/powerpc/powerpc64/power8/memchr.S
new file mode 100644
index 0000000000..45ba1b479a
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/memchr.S
@@ -0,0 +1,335 @@
+/* Optimized memchr implementation for POWER8.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* void *[r3] memchr (const void *s [r3], int c [r4], size_t n [r5]) */
+
+/* TODO: change these to the actual instructions when the minimum required
+ binutils allows it. */
+#define MTVRD(v, r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16)))
+#define MFVRD(r, v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
+#define VBPERMQ(t, a, b) .long (0x1000054c \
+ | ((t)<<(32-11)) \
+ | ((a)<<(32-16)) \
+ | ((b)<<(32-21)) )
+
+#ifndef MEMCHR
+# define MEMCHR __memchr
+#endif
+/* TODO: change this to .machine power8 when the minimum required binutils
+ allows it. */
+ .machine power7
+ENTRY_TOCLESS (MEMCHR)
+ CALL_MCOUNT 3
+ dcbt 0, r3
+ clrrdi r8, r3, 3 /* r8 = s rounded down to a doubleword boundary. */
+ insrdi r4, r4, 8, 48 /* Replicate c into the low two bytes of r4. */
+
+ /* Calculate the last acceptable address and check for possible
+ addition overflow by using saturated math:
+ r7 = r3 + r5
+ r7 |= -(r7 < r3) */
+ add r7, r3, r5
+ subfc r6, r3, r7 /* Only the carry matters: clear when r7 < r3. */
+ subfe r9, r9, r9 /* r9 = 0 if no overflow, -1 if the add wrapped. */
+ extsw r6, r9
+ or r7, r7, r6
+
+ insrdi r4, r4, 16, 32 /* Replicate c into the low four bytes of r4. */
+ cmpldi r5, 32 /* cr0 is consumed by the ble to L(small_range) below. */
+ li r9, -1 /* r9 = all-ones mask, shifted below. */
+ rlwinm r6, r3, 3, 26, 28 /* Calculate padding. */
+ insrdi r4, r4, 32, 0 /* Replicate c into all eight bytes of r4. */
+ mr r10, r7 /* r10 = end address (saturated). */
+ addi r7, r7, -1 /* r7 = address of the last byte. */
+#ifdef __LITTLE_ENDIAN__
+ sld r9, r9, r6 /* Mask out the bytes that precede s. */
+#else
+ srd r9, r9, r6 /* Mask out the bytes that precede s. */
+#endif
+ ble L(small_range)
+ andi. r11, r3, 63 /* Already 64-byte aligned?  Skip the head handling. */
+ beq cr0, L(align_qw)
+ clrldi r11, r3, 61 /* r11 = s & 7, the offset of s within its doubleword. */
+ ld r12, 0(r8) /* Load doubleword from memory. */
+ cmpb r3, r12, r4 /* Check for BYTEs in DWORD1. */
+ and r3, r3, r9 /* Drop matches in the bytes before s. */
+ clrldi r6, r7, 61 /* Byte count - 1 in last dword. */
+ clrrdi r7, r7, 3 /* Address of last doubleword. */
+ cmpldi cr7, r3, 0 /* Does r3 indicate we got a hit? */
+ bne cr7, L(done)
+ addi r8, r8, 8
+ addi r5, r5, -8
+ add r5, r5, r11 /* r5 = bytes remaining counted from the new r8. */
+
+ /* Are we now aligned to a quadword boundary? */
+ andi. r11, r8, 15
+ beq cr0, L(align_qw)
+
+ /* Handle DWORD to make it QW aligned. */
+ ld r12, 0(r8)
+ cmpb r3, r12, r4
+ cmpldi cr7, r3, 0
+ bne cr7, L(done)
+ addi r5, r5, -8
+ addi r8, r8, 8
+ /* At this point, r8 is 16B aligned. */
+L(align_qw):
+ vspltisb v0, 0 /* v0 = all zeros, used to test compare results. */
+ /* Precompute vbpermq constant. */
+ vspltisb v10, 3
+ li r0, 0
+ lvsl v11, r0, r0 /* v11 = bytes 0, 1, ..., 15. */
+ vslb v10, v11, v10 /* v10 = 0, 8, ..., 120: first-bit index of each byte. */
+ MTVRD(v1, r4)
+ vspltb v1, v1, 7 /* v1 = c replicated into all 16 bytes. */
+ cmpldi r5, 64
+ ble L(tail64)
+ /* Are we 64-byte aligned? If so, jump to the vectorized loop.
+ Note: aligning to 64-byte will necessarily slow down performance for
+ strings around 64 bytes in length due to the extra comparisons
+ required to check alignment for the vectorized loop. This is a
+ necessary tradeoff we are willing to take in order to speed up the
+ calculation for larger strings. */
+ andi. r11, r8, 63
+ beq cr0, L(preloop_64B)
+ /* In order to begin the 64B loop, it needs to be 64
+ bytes aligned. So read until it is 64B aligned. */
+ lvx v4, 0, r8
+ vcmpequb v6, v1, v4 /* v6 = 0xff in every byte equal to c. */
+ vcmpequb. v11, v0, v6 /* cr6: is v6 entirely zero (no match)? */
+ bnl cr6, L(found_16B)
+ addi r8, r8, 16
+ addi r5, r5, -16
+
+ andi. r11, r8, 63
+ beq cr0, L(preloop_64B)
+ lvx v4, 0, r8
+ vcmpequb v6, v1, v4
+ vcmpequb. v11, v0, v6
+ bnl cr6, L(found_16B)
+ addi r8, r8, 16
+ addi r5, r5, -16
+
+ andi. r11, r8, 63
+ beq cr0, L(preloop_64B)
+ lvx v4, 0, r8
+ vcmpequb v6, v1, v4
+ vcmpequb. v11, v0, v6
+ bnl cr6, L(found_16B)
+ addi r8, r8, 16
+ addi r5, r5, -16
+ /* At this point it should be 64B aligned.
+ Prepare for the 64B loop. */
+L(preloop_64B):
+ cmpldi r5, 64 /* Check if r5 <= 64. */
+ ble L(tail64)
+ sub r6, r10, r8 /* r6 = bytes from r8 to the end address. */
+ srdi r9, r6, 6 /* Number of loop iterations. */
+ mtctr r9 /* Setup the counter. */
+ li r11, 16 /* Load required offsets. */
+ li r9, 32
+ li r7, 48
+
+ /* Handle r5 > 64. Loop over the bytes in strides of 64B. */
+ .align 4
+L(loop):
+ lvx v2, 0, r8 /* Load 4 quadwords. */
+ lvx v3, r8, r11
+ lvx v4, r8, r9 /* Base is the GPR r8, not the vector name v8. */
+ lvx v5, r8, r7 /* Base is the GPR r8, not the vector name v8. */
+ vcmpequb v6, v1, v2 /* 0xff in every byte equal to c. */
+ vcmpequb v7, v1, v3
+ vcmpequb v8, v1, v4
+ vcmpequb v9, v1, v5
+ vor v11, v6, v7
+ vor v12, v8, v9
+ vor v11, v11, v12 /* Compare and merge into one VR for speed. */
+ vcmpequb. v11, v0, v11 /* cr6: is the merged result entirely zero? */
+ bnl cr6, L(found)
+ addi r8, r8, 64 /* Adjust address for the next iteration. */
+ bdnz L(loop)
+ clrldi r5, r6, 58 /* r5 = leftover bytes (r6 mod 64) for L(tail64). */
+
+ /* Handle remainder of 64B loop or r5 <= 64. */
+ .align 4
+L(tail64):
+ cmpldi r5, 0
+ beq L(null)
+ lvx v4, 0, r8
+ vcmpequb v6, v1, v4 /* v6 = 0xff in every byte equal to c. */
+ vcmpequb. v11, v0, v6 /* cr6: is v6 entirely zero (no match)? */
+ bnl cr6, L(found_16B)
+ addi r8, r8, 16
+ cmpldi cr6, r5, 16 /* Was that the last quadword of the range? */
+ ble cr6, L(null)
+ addi r5, r5, -16
+
+ lvx v4, 0, r8
+ vcmpequb v6, v1, v4
+ vcmpequb. v11, v0, v6
+ bnl cr6, L(found_16B)
+ addi r8, r8, 16
+ cmpldi cr6, r5, 16
+ ble cr6, L(null)
+ addi r5, r5, -16
+
+ lvx v4, 0, r8
+ vcmpequb v6, v1, v4
+ vcmpequb. v11, v0, v6
+ bnl cr6, L(found_16B)
+ addi r8, r8, 16
+ cmpldi cr6, r5, 16
+ ble cr6, L(null)
+ addi r5, r5, -16
+
+ lvx v4, 0, r8
+ vcmpequb v6, v1, v4
+ vcmpequb. v11, v0, v6
+ bnl cr6, L(found_16B)
+ li r3, 0 /* No match in the last quadword: return NULL. */
+ blr
+
+ /* Found a match in 64B loop. */
+ .align 4
+L(found):
+ /* Permute the first bit of each byte into bits 48-63. */
+ VBPERMQ(v6, v6, v10)
+ VBPERMQ(v7, v7, v10)
+ VBPERMQ(v8, v8, v10)
+ VBPERMQ(v9, v9, v10)
+ /* Shift each component into its correct position for merging. */
+#ifdef __LITTLE_ENDIAN__
+ vsldoi v7, v7, v7, 2
+ vsldoi v8, v8, v8, 4
+ vsldoi v9, v9, v9, 6
+#else
+ vsldoi v6, v6, v6, 6
+ vsldoi v7, v7, v7, 4
+ vsldoi v8, v8, v8, 2
+#endif
+ /* Merge the results and move to a GPR. */
+ vor v11, v6, v7
+ vor v4, v9, v8
+ vor v4, v11, v4
+ MFVRD(r5, v4)
+#ifdef __LITTLE_ENDIAN__
+ addi r6, r5, -1
+ andc r6, r6, r5 /* Ones only below the lowest set bit of r5. */
+ popcntd r6, r6 /* Count trailing zeros before the match. */
+#else
+ cntlzd r6, r5 /* Count leading zeros before the match. */
+#endif
+ add r3, r8, r6 /* Compute the address of the match. */
+ blr
+
+ /* Found a match in last 16 bytes. */
+ .align 4
+L(found_16B):
+ /* Permute the first bit of each byte into bits 48-63. */
+ VBPERMQ(v6, v6, v10)
+ /* Shift each component into its correct position for merging. */
+#ifdef __LITTLE_ENDIAN__
+ MFVRD(r7, v6)
+ addi r6, r7, -1
+ andc r6, r6, r7 /* Ones only below the lowest set bit of r7. */
+ popcntd r6, r6 /* Count trailing zeros before the match. */
+#else
+ vsldoi v6, v6, v6, 6
+ MFVRD(r7, v6)
+ cntlzd r6, r7 /* Count leading zeros before the match. */
+#endif
+ add r3, r8, r6 /* Compute the address of the match. */
+ cmpld r6, r5 /* Is the match offset within the r5 remaining bytes? */
+ bltlr
+ li r3, 0 /* Match lies past the end of the range: return NULL. */
+ blr
+
+ .align 4
+ /* r3 has the output of the cmpb instruction, that is, it contains
+ 0xff in the same position as BYTE in the original
+ doubleword from the string. Use that to calculate the pointer.
+ We need to make sure BYTE is *before* the end of the range. */
+L(done):
+#ifdef __LITTLE_ENDIAN__
+ addi r0, r3, -1
+ andc r0, r0, r3
+ popcntd r0, r0 /* Count trailing zeros. */
+#else
+ cntlzd r0, r3 /* Count leading zeros before the match. */
+#endif
+ cmpld r8, r7 /* Are we on the last dword? */
+ srdi r0, r0, 3 /* Convert leading/trailing zeros to bytes. */
+ add r3, r8, r0 /* r3 = address of the matching byte. */
+ cmpld cr7, r0, r6 /* If on the last dword, check byte offset. */
+ bnelr /* Not the last dword: the match is in range. */
+ blelr cr7 /* Last dword: in range if offset <= last byte index. */
+ li r3, 0 /* Match lies past the end of the range: return NULL. */
+ blr
+
+ .align 4
+L(null):
+ li r3, 0 /* Return NULL: no match within the range. */
+ blr
+
+/* Deals with size <= 32. */
+ .align 4
+L(small_range):
+ cmpldi r5, 0
+ beq L(null)
+ ld r12, 0(r8) /* Load doubleword from memory. */
+ cmpb r3, r12, r4 /* Check for BYTE in DWORD1. */
+ and r3, r3, r9 /* Drop matches in the bytes before s. */
+ cmpldi cr7, r3, 0
+ clrldi r6, r7, 61 /* Byte count - 1 in last dword. */
+ clrrdi r7, r7, 3 /* Address of last doubleword. */
+ cmpld r8, r7 /* Are we done already? */
+ bne cr7, L(done)
+ beqlr /* Last dword and no match: r3 is 0 (NULL). */
+
+ ldu r12, 8(r8)
+ cmpb r3, r12, r4
+ cmpldi cr6, r3, 0
+ cmpld r8, r7
+ bne cr6, L(done) /* Found something. */
+ beqlr /* Hit end of string (length). */
+
+ ldu r12, 8(r8)
+ cmpb r3, r12, r4
+ cmpldi cr6, r3, 0
+ cmpld r8, r7
+ bne cr6, L(done)
+ beqlr
+
+ ldu r12, 8(r8)
+ cmpb r3, r12, r4
+ cmpldi cr6, r3, 0
+ cmpld r8, r7
+ bne cr6, L(done)
+ beqlr
+
+ ldu r12, 8(r8)
+ cmpb r3, r12, r4
+ cmpldi cr6, r3, 0
+ bne cr6, L(done)
+ blr /* r3 is 0 here: no match, return NULL. */
+
+END (MEMCHR)
+weak_alias (__memchr, memchr)
+libc_hidden_builtin_def (memchr)
diff --git a/sysdeps/powerpc/powerpc64/power8/memcmp.S b/sysdeps/powerpc/powerpc64/power8/memcmp.S
new file mode 100644
index 0000000000..ec4ccf3382
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/memcmp.S
@@ -0,0 +1,1447 @@
+/* Optimized memcmp implementation for POWER8/PowerPC64.
+ Copyright (C) 2010-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* int [r3] memcmp (const char *s1 [r3],
+ const char *s2 [r4],
+ size_t size [r5]) */
+
+/* TODO: change these to the actual instructions when the minimum required
+ binutils allows it. */
+#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
+#ifndef MEMCMP
+# define MEMCMP memcmp
+#endif
+ .machine power7
+ENTRY_TOCLESS (MEMCMP, 4)
+ CALL_MCOUNT 3
+
+#define rRTN r3
+#define rSTR1 r3 /* First string arg. */
+#define rSTR2 r4 /* Second string arg. */
+#define rN r5 /* Max string length. */
+#define rWORD1 r6 /* Current word in s1. */
+#define rWORD2 r7 /* Current word in s2. */
+#define rWORD3 r8 /* Next word in s1. */
+#define rWORD4 r9 /* Next word in s2. */
+#define rWORD5 r10 /* Next word in s1. */
+#define rWORD6 r11 /* Next word in s2. */
+
+#define rOFF8 r20 /* 8 bytes offset. */
+#define rOFF16 r21 /* 16 bytes offset. */
+#define rOFF24 r22 /* 24 bytes offset. */
+#define rOFF32 r23 /* 32 bytes offset. */
+#define rWORD6_SHIFT r24 /* Left rotation temp for rWORD8. */
+#define rWORD4_SHIFT r25 /* Left rotation temp for rWORD6. */
+#define rWORD2_SHIFT r26 /* Left rotation temp for rWORD4. */
+#define rWORD8_SHIFT r27 /* Left rotation temp for rWORD2. */
+#define rSHR r28 /* Unaligned shift right count. */
+#define rSHL r29 /* Unaligned shift left count. */
+#define rWORD7 r30 /* Next word in s1. */
+#define rWORD8 r31 /* Next word in s2. */
+
+#define rWORD8SAVE (-8)
+#define rWORD7SAVE (-16)
+#define rOFF8SAVE (-24)
+#define rOFF16SAVE (-32)
+#define rOFF24SAVE (-40)
+#define rOFF32SAVE (-48)
+#define rSHRSAVE (-56)
+#define rSHLSAVE (-64)
+#define rWORD8SHIFTSAVE (-72)
+#define rWORD2SHIFTSAVE (-80)
+#define rWORD4SHIFTSAVE (-88)
+#define rWORD6SHIFTSAVE (-96)
+
+#ifdef __LITTLE_ENDIAN__
+# define LD ldbrx
+#else
+# define LD ldx
+#endif
+
+ xor r10, rSTR2, rSTR1
+ cmpldi cr6, rN, 0
+ cmpldi cr1, rN, 8
+ clrldi. r0, r10, 61
+ clrldi r12, rSTR1, 61
+ cmpldi cr5, r12, 0
+ beq- cr6, L(zeroLength)
+ dcbt 0, rSTR1
+ dcbt 0, rSTR2
+ /* If less than 8 bytes or not aligned, use the unaligned
+ byte loop. */
+ blt cr1, L(bytealigned)
+ bne L(unalignedqw)
+/* At this point we know both strings have the same alignment and the
+ compare length is at least 8 bytes. r12 contains the low order
+ 3 bits of rSTR1 and cr5 contains the result of the logical compare
+ of r12 to 0. If r12 == 0 then we are already double word
+ aligned and can perform the DW aligned loop. */
+
+ .align 4
+L(samealignment):
+ or r11, rSTR2, rSTR1
+ clrldi. r11, r11, 60
+ beq L(qw_align)
+ /* Try to align to QW else proceed to DW loop. */
+ clrldi. r10, r10, 60
+ bne L(DW)
+ /* For the difference to reach QW alignment, load as DW. */
+ clrrdi rSTR1, rSTR1, 3
+ clrrdi rSTR2, rSTR2, 3
+ subfic r10, r12, 8
+ LD rWORD1, 0, rSTR1
+ LD rWORD2, 0, rSTR2
+ sldi r9, r10, 3
+ subfic r9, r9, 64
+ sld rWORD1, rWORD1, r9
+ sld rWORD2, rWORD2, r9
+ cmpld cr6, rWORD1, rWORD2
+ addi rSTR1, rSTR1, 8
+ addi rSTR2, rSTR2, 8
+ bne cr6, L(ret_diff)
+ subf rN, r10, rN
+
+ cmpld cr6, r11, r12
+ bgt cr6, L(qw_align)
+ LD rWORD1, 0, rSTR1
+ LD rWORD2, 0, rSTR2
+ cmpld cr6, rWORD1, rWORD2
+ addi rSTR1, rSTR1, 8
+ addi rSTR2, rSTR2, 8
+ bne cr6, L(different)
+ cmpldi cr6, rN, 8
+ ble cr6, L(zeroLength)
+ addi rN, rN, -8
+ /* Now both rSTR1 and rSTR2 are aligned to QW. */
+ .align 4
+L(qw_align):
+ vspltisb v0, 0
+ srdi. r6, rN, 6
+ li r8, 16
+ li r10, 32
+ li r11, 48
+ ble cr0, L(lessthan64)
+ mtctr r6
+ vspltisb v8, 0
+ vspltisb v6, 0
+ /* Aligned vector loop. */
+ .align 4
+L(aligned_loop):
+ lvx v4, 0, rSTR1
+ lvx v5, 0, rSTR2
+ vcmpequb. v7, v6, v8
+ bnl cr6, L(different3)
+ lvx v6, rSTR1, r8
+ lvx v8, rSTR2, r8
+ vcmpequb. v7, v5, v4
+ bnl cr6, L(different2)
+ lvx v4, rSTR1, r10
+ lvx v5, rSTR2, r10
+ vcmpequb. v7, v6, v8
+ bnl cr6, L(different3)
+ lvx v6, rSTR1, r11
+ lvx v8, rSTR2, r11
+ vcmpequb. v7, v5, v4
+ bnl cr6, L(different2)
+ addi rSTR1, rSTR1, 64
+ addi rSTR2, rSTR2, 64
+ bdnz L(aligned_loop)
+ vcmpequb. v7, v6, v8
+ bnl cr6, L(different3)
+ clrldi rN, rN, 58
+ /* Handle remainder for aligned loop. */
+ .align 4
+L(lessthan64):
+ mr r9, rSTR1
+ cmpdi cr6, rN, 0
+ li rSTR1, 0
+ blelr cr6
+ lvx v4, 0, r9
+ lvx v5, 0, rSTR2
+ vcmpequb. v7, v5, v4
+ bnl cr6, L(different1)
+ addi rN, rN, -16
+
+ cmpdi cr6, rN, 0
+ blelr cr6
+ lvx v4, r9, r8
+ lvx v5, rSTR2, r8
+ vcmpequb. v7, v5, v4
+ bnl cr6, L(different1)
+ addi rN, rN, -16
+
+ cmpdi cr6, rN, 0
+ blelr cr6
+ lvx v4, r9, r10
+ lvx v5, rSTR2, r10
+ vcmpequb. v7, v5, v4
+ bnl cr6, L(different1)
+ addi rN, rN, -16
+
+ cmpdi cr6, rN, 0
+ blelr cr6
+ lvx v4, r9, r11
+ lvx v5, rSTR2, r11
+ vcmpequb. v7, v5, v4
+ bnl cr6, L(different1)
+ blr
+
+ /* Calculate and return the difference. */
+ .align 4
+L(different1):
+ cmpdi cr6, rN, 16
+ bge cr6, L(different2)
+ /* Discard unwanted bytes. */
+#ifdef __LITTLE_ENDIAN__
+ lvsr v1, 0, rN
+ vperm v4, v4, v0, v1
+ vperm v5, v5, v0, v1
+#else
+ lvsl v1, 0, rN
+ vperm v4, v0, v4, v1
+ vperm v5, v0, v5, v1
+#endif
+ vcmpequb. v7, v4, v5
+ li rRTN, 0
+ bltlr cr6
+ .align 4
+L(different2):
+#ifdef __LITTLE_ENDIAN__
+ /* Reverse bytes for direct comparison. */
+ lvsl v10, r0, r0
+ vspltisb v8, 15
+ vsububm v9, v8, v10
+ vperm v4, v4, v0, v9
+ vperm v5, v5, v0, v9
+#endif
+ MFVRD(r7, v4)
+ MFVRD(r9, v5)
+ cmpld cr6, r7, r9
+ bne cr6, L(ret_diff)
+ /* Difference in second DW. */
+ vsldoi v4, v4, v4, 8
+ vsldoi v5, v5, v5, 8
+ MFVRD(r7, v4)
+ MFVRD(r9, v5)
+ cmpld cr6, r7, r9
+L(ret_diff):
+ li rRTN, 1
+ bgtlr cr6
+ li rRTN, -1
+ blr
+ .align 4
+L(different3):
+#ifdef __LITTLE_ENDIAN__
+ /* Reverse bytes for direct comparison. */
+ vspltisb v9, 15
+ lvsl v10, r0, r0
+ vsububm v9, v9, v10
+ vperm v6, v6, v0, v9
+ vperm v8, v8, v0, v9
+#endif
+ MFVRD(r7, v6)
+ MFVRD(r9, v8)
+ cmpld cr6, r7, r9
+ bne cr6, L(ret_diff)
+ /* Difference in second DW. */
+ vsldoi v6, v6, v6, 8
+ vsldoi v8, v8, v8, 8
+ MFVRD(r7, v6)
+ MFVRD(r9, v8)
+ cmpld cr6, r7, r9
+ li rRTN, 1
+ bgtlr cr6
+ li rRTN, -1
+ blr
+
+ .align 4
+L(different):
+ cmpldi cr7, rN, 8
+ bgt cr7, L(end)
+ /* Skip unwanted bytes. */
+ sldi r8, rN, 3
+ subfic r8, r8, 64
+ srd rWORD1, rWORD1, r8
+ srd rWORD2, rWORD2, r8
+ cmpld cr6, rWORD1, rWORD2
+ li rRTN, 0
+ beqlr cr6
+L(end):
+ li rRTN, 1
+ bgtlr cr6
+ li rRTN, -1
+ blr
+
+ .align 4
+L(unalignedqw):
+ /* Proceed to DW unaligned loop,if there is a chance of pagecross. */
+ rldicl r9, rSTR1, 0, 52
+ add r9, r9, rN
+ cmpldi cr0, r9, 4096-16
+ bgt cr0, L(unaligned)
+ rldicl r9, rSTR2, 0, 52
+ add r9, r9, rN
+ cmpldi cr0, r9, 4096-16
+ bgt cr0, L(unaligned)
+ li r0, 0
+ li r8, 16
+ vspltisb v0, 0
+ /* Check if rSTR1 is aligned to QW. */
+ andi. r11, rSTR1, 0xF
+ beq L(s1_align)
+
+ /* Compare 16B and align S1 to QW. */
+#ifdef __LITTLE_ENDIAN__
+ lvsr v10, 0, rSTR1 /* Compute mask. */
+ lvsr v6, 0, rSTR2 /* Compute mask. */
+#else
+ lvsl v10, 0, rSTR1 /* Compute mask. */
+ lvsl v6, 0, rSTR2 /* Compute mask. */
+#endif
+ lvx v5, 0, rSTR2
+ lvx v9, rSTR2, r8
+#ifdef __LITTLE_ENDIAN__
+ vperm v5, v9, v5, v6
+#else
+ vperm v5, v5, v9, v6
+#endif
+ lvx v4, 0, rSTR1
+ lvx v9, rSTR1, r8
+#ifdef __LITTLE_ENDIAN__
+ vperm v4, v9, v4, v10
+#else
+ vperm v4, v4, v9, v10
+#endif
+ vcmpequb. v7, v5, v4
+ bnl cr6, L(different1)
+ cmpldi cr6, rN, 16
+ ble cr6, L(zeroLength)
+ subfic r11, r11, 16
+ subf rN, r11, rN
+ add rSTR1, rSTR1, r11
+ add rSTR2, rSTR2, r11
+
+ /* As s1 is QW aligned prepare for unaligned loop. */
+ .align 4
+L(s1_align):
+#ifdef __LITTLE_ENDIAN__
+ lvsr v6, 0, rSTR2
+#else
+ lvsl v6, 0, rSTR2
+#endif
+ lvx v5, 0, rSTR2
+ srdi. r6, rN, 6
+ li r10, 32
+ li r11, 48
+ ble cr0, L(lessthan64_unalign)
+ mtctr r6
+ li r9, 64
+ /* Unaligned vector loop. */
+ .align 4
+L(unalign_qwloop):
+ lvx v4, 0, rSTR1
+ lvx v10, rSTR2, r8
+#ifdef __LITTLE_ENDIAN__
+ vperm v5, v10, v5, v6
+#else
+ vperm v5, v5, v10, v6
+#endif
+ vcmpequb. v7, v5, v4
+ bnl cr6, L(different2)
+ vor v5, v10, v10
+ lvx v4, rSTR1, r8
+ lvx v10, rSTR2, r10
+#ifdef __LITTLE_ENDIAN__
+ vperm v5, v10, v5, v6
+#else
+ vperm v5, v5, v10, v6
+#endif
+ vcmpequb. v7, v5, v4
+ bnl cr6, L(different2)
+ vor v5, v10, v10
+ lvx v4, rSTR1, r10
+ lvx v10, rSTR2, r11
+#ifdef __LITTLE_ENDIAN__
+ vperm v5, v10, v5, v6
+#else
+ vperm v5, v5, v10, v6
+#endif
+ vcmpequb. v7, v5, v4
+ bnl cr6, L(different2)
+ vor v5, v10, v10
+ lvx v4, rSTR1, r11
+ lvx v10, rSTR2, r9
+#ifdef __LITTLE_ENDIAN__
+ vperm v5, v10, v5, v6
+#else
+ vperm v5, v5, v10, v6
+#endif
+ vcmpequb. v7, v5, v4
+ bnl cr6, L(different2)
+ vor v5, v10, v10
+ addi rSTR1, rSTR1, 64
+ addi rSTR2, rSTR2, 64
+ bdnz L(unalign_qwloop)
+ clrldi rN, rN, 58
+ /* Handle remainder for unaligned loop. */
+ .align 4
+L(lessthan64_unalign):
+ mr r9, rSTR1
+ cmpdi cr6, rN, 0
+ li rSTR1, 0
+ blelr cr6
+ lvx v4, 0, r9
+ lvx v10, rSTR2, r8
+#ifdef __LITTLE_ENDIAN__
+ vperm v5, v10, v5, v6
+#else
+ vperm v5, v5, v10, v6
+#endif
+ vcmpequb. v7, v5, v4
+ bnl cr6, L(different1)
+ vor v5, v10, v10
+ addi rN, rN, -16
+
+ cmpdi cr6, rN, 0
+ blelr cr6
+ lvx v4, r9, r8
+ lvx v10, rSTR2, r10
+#ifdef __LITTLE_ENDIAN__
+ vperm v5, v10, v5, v6
+#else
+ vperm v5, v5, v10, v6
+#endif
+ vcmpequb. v7, v5, v4
+ bnl cr6, L(different1)
+ vor v5, v10, v10
+ addi rN, rN, -16
+
+ cmpdi cr6, rN, 0
+ blelr cr6
+ lvx v4, r9, r10
+ lvx v10, rSTR2, r11
+#ifdef __LITTLE_ENDIAN__
+ vperm v5, v10, v5, v6
+#else
+ vperm v5, v5, v10, v6
+#endif
+ vcmpequb. v7, v5, v4
+ bnl cr6, L(different1)
+ vor v5, v10, v10
+ addi rN, rN, -16
+
+ cmpdi cr6, rN, 0
+ blelr cr6
+ lvx v4, r9, r11
+ addi r11, r11, 16
+ lvx v10, rSTR2, r11
+#ifdef __LITTLE_ENDIAN__
+ vperm v5, v10, v5, v6
+#else
+ vperm v5, v5, v10, v6
+#endif
+ vcmpequb. v7, v5, v4
+ bnl cr6, L(different1)
+ blr
+
+/* Otherwise we know the two strings have the same alignment (but not
+ yet DW). So we force the string addresses to the next lower DW
+ boundary and special case this first DW using shift left to
+ eliminate bits preceding the first byte. Since we want to join the
+ normal (DW aligned) compare loop, starting at the second double word,
+ we need to adjust the length (rN) and special case the loop
+ versioning for the first DW. This ensures that the loop count is
+ correct and the first DW (shifted) is in the expected register pair. */
+ .align 4
+L(DW):
+ std rWORD8, rWORD8SAVE(r1)
+ std rWORD7, rWORD7SAVE(r1)
+ std rOFF8, rOFF8SAVE(r1)
+ std rOFF16, rOFF16SAVE(r1)
+ std rOFF24, rOFF24SAVE(r1)
+ std rOFF32, rOFF32SAVE(r1)
+ cfi_offset(rWORD8, rWORD8SAVE)
+ cfi_offset(rWORD7, rWORD7SAVE)
+ cfi_offset(rOFF8, rOFF8SAVE)
+ cfi_offset(rOFF16, rOFF16SAVE)
+ cfi_offset(rOFF24, rOFF24SAVE)
+ cfi_offset(rOFF32, rOFF32SAVE)
+
+ li rOFF8,8
+ li rOFF16,16
+ li rOFF24,24
+ li rOFF32,32
+ clrrdi rSTR1, rSTR1, 3
+ clrrdi rSTR2, rSTR2, 3
+ beq cr5, L(DWaligned)
+ add rN, rN, r12
+ sldi rWORD6, r12, 3
+ srdi r0, rN, 5 /* Divide by 32. */
+ andi. r12, rN, 24 /* Get the DW remainder. */
+ LD rWORD1, 0, rSTR1
+ LD rWORD2, 0, rSTR2
+ cmpldi cr1, r12, 16
+ cmpldi cr7, rN, 32
+ clrldi rN, rN, 61
+ beq L(dPs4)
+ mtctr r0
+ bgt cr1, L(dPs3)
+ beq cr1, L(dPs2)
+
+/* Remainder is 8. */
+ .align 3
+L(dsP1):
+ sld rWORD5, rWORD1, rWORD6
+ sld rWORD6, rWORD2, rWORD6
+ cmpld cr5, rWORD5, rWORD6
+ blt cr7, L(dP1x)
+/* Do something useful in this cycle since we have to branch anyway. */
+ LD rWORD1, rOFF8, rSTR1
+ LD rWORD2, rOFF8, rSTR2
+ cmpld cr7, rWORD1, rWORD2
+ b L(dP1e)
+/* Remainder is 16. */
+ .align 4
+L(dPs2):
+ sld rWORD5, rWORD1, rWORD6
+ sld rWORD6, rWORD2, rWORD6
+ cmpld cr6, rWORD5, rWORD6
+ blt cr7, L(dP2x)
+/* Do something useful in this cycle since we have to branch anyway. */
+ LD rWORD7, rOFF8, rSTR1
+ LD rWORD8, rOFF8, rSTR2
+ cmpld cr5, rWORD7, rWORD8
+ b L(dP2e)
+/* Remainder is 24. */
+ .align 4
+L(dPs3):
+ sld rWORD3, rWORD1, rWORD6
+ sld rWORD4, rWORD2, rWORD6
+ cmpld cr1, rWORD3, rWORD4
+ b L(dP3e)
+/* Count is a multiple of 32, remainder is 0. */
+ .align 4
+L(dPs4):
+ mtctr r0
+ sld rWORD1, rWORD1, rWORD6
+ sld rWORD2, rWORD2, rWORD6
+ cmpld cr7, rWORD1, rWORD2
+ b L(dP4e)
+
+/* At this point we know both strings are double word aligned and the
+ compare length is at least 8 bytes. */
+ .align 4
+L(DWaligned):
+ andi. r12, rN, 24 /* Get the DW remainder. */
+ srdi r0, rN, 5 /* Divide by 32. */
+ cmpldi cr1, r12, 16
+ cmpldi cr7, rN, 32
+ clrldi rN, rN, 61
+ beq L(dP4)
+ bgt cr1, L(dP3)
+ beq cr1, L(dP2)
+
+/* Remainder is 8. */
+ .align 4
+L(dP1):
+ mtctr r0
+/* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
+ (8-15 byte compare), we want to use only volatile registers. This
+ means we can avoid restoring non-volatile registers since we did not
+ change any on the early exit path. The key here is the non-early
+ exit path only cares about the condition code (cr5), not about which
+ register pair was used. */
+ LD rWORD5, 0, rSTR1
+ LD rWORD6, 0, rSTR2
+ cmpld cr5, rWORD5, rWORD6
+ blt cr7, L(dP1x)
+ LD rWORD1, rOFF8, rSTR1
+ LD rWORD2, rOFF8, rSTR2
+ cmpld cr7, rWORD1, rWORD2
+L(dP1e):
+ LD rWORD3, rOFF16, rSTR1
+ LD rWORD4, rOFF16, rSTR2
+ cmpld cr1, rWORD3, rWORD4
+ LD rWORD5, rOFF24, rSTR1
+ LD rWORD6, rOFF24, rSTR2
+ cmpld cr6, rWORD5, rWORD6
+ bne cr5, L(dLcr5x)
+ bne cr7, L(dLcr7x)
+
+ LD rWORD7, rOFF32, rSTR1
+ LD rWORD8, rOFF32, rSTR2
+ addi rSTR1, rSTR1, 32
+ addi rSTR2, rSTR2, 32
+ bne cr1, L(dLcr1)
+ cmpld cr5, rWORD7, rWORD8
+ bdnz L(dLoop)
+ bne cr6, L(dLcr6)
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
+ .align 3
+L(dP1x):
+ sldi. r12, rN, 3
+ bne cr5, L(dLcr5x)
+ subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
+ bne L(d00)
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
+ li rRTN, 0
+ blr
+
+/* Remainder is 16. */
+ .align 4
+L(dP2):
+ mtctr r0
+ LD rWORD5, 0, rSTR1
+ LD rWORD6, 0, rSTR2
+ cmpld cr6, rWORD5, rWORD6
+ blt cr7, L(dP2x)
+ LD rWORD7, rOFF8, rSTR1
+ LD rWORD8, rOFF8, rSTR2
+ cmpld cr5, rWORD7, rWORD8
+L(dP2e):
+ LD rWORD1, rOFF16, rSTR1
+ LD rWORD2, rOFF16, rSTR2
+ cmpld cr7, rWORD1, rWORD2
+ LD rWORD3, rOFF24, rSTR1
+ LD rWORD4, rOFF24, rSTR2
+ cmpld cr1, rWORD3, rWORD4
+ addi rSTR1, rSTR1, 8
+ addi rSTR2, rSTR2, 8
+ bne cr6, L(dLcr6)
+ bne cr5, L(dLcr5)
+ b L(dLoop2)
+ .align 4
+L(dP2x):
+ LD rWORD3, rOFF8, rSTR1
+ LD rWORD4, rOFF8, rSTR2
+ cmpld cr1, rWORD3, rWORD4
+ sldi. r12, rN, 3
+ bne cr6, L(dLcr6x)
+ addi rSTR1, rSTR1, 8
+ addi rSTR2, rSTR2, 8
+ bne cr1, L(dLcr1x)
+ subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
+ bne L(d00)
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
+ li rRTN, 0
+ blr
+
+/* Remainder is 24. */
+ .align 4
+L(dP3):
+ mtctr r0
+ LD rWORD3, 0, rSTR1
+ LD rWORD4, 0, rSTR2
+ cmpld cr1, rWORD3, rWORD4
+L(dP3e):
+ LD rWORD5, rOFF8, rSTR1
+ LD rWORD6, rOFF8, rSTR2
+ cmpld cr6, rWORD5, rWORD6
+ blt cr7, L(dP3x)
+ LD rWORD7, rOFF16, rSTR1
+ LD rWORD8, rOFF16, rSTR2
+ cmpld cr5, rWORD7, rWORD8
+ LD rWORD1, rOFF24, rSTR1
+ LD rWORD2, rOFF24, rSTR2
+ cmpld cr7, rWORD1, rWORD2
+ addi rSTR1, rSTR1, 16
+ addi rSTR2, rSTR2, 16
+ bne cr1, L(dLcr1)
+ bne cr6, L(dLcr6)
+ b L(dLoop1)
+/* Again we are on a early exit path (24-31 byte compare), we want to
+ only use volatile registers and avoid restoring non-volatile
+ registers. */
+ .align 4
+L(dP3x):
+ LD rWORD1, rOFF16, rSTR1
+ LD rWORD2, rOFF16, rSTR2
+ cmpld cr7, rWORD1, rWORD2
+ sldi. r12, rN, 3
+ bne cr1, L(dLcr1x)
+ addi rSTR1, rSTR1, 16
+ addi rSTR2, rSTR2, 16
+ bne cr6, L(dLcr6x)
+ subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
+ bne cr7, L(dLcr7x)
+ bne L(d00)
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
+ li rRTN, 0
+ blr
+
+/* Count is a multiple of 32, remainder is 0. */
+ .align 4
+L(dP4):
+ mtctr r0
+ LD rWORD1, 0, rSTR1
+ LD rWORD2, 0, rSTR2
+ cmpld cr7, rWORD1, rWORD2
+L(dP4e):
+ LD rWORD3, rOFF8, rSTR1
+ LD rWORD4, rOFF8, rSTR2
+ cmpld cr1, rWORD3, rWORD4
+ LD rWORD5, rOFF16, rSTR1
+ LD rWORD6, rOFF16, rSTR2
+ cmpld cr6, rWORD5, rWORD6
+ LD rWORD7, rOFF24, rSTR1
+ LD rWORD8, rOFF24, rSTR2
+ addi rSTR1, rSTR1, 24
+ addi rSTR2, rSTR2, 24
+ cmpld cr5, rWORD7, rWORD8
+ bne cr7, L(dLcr7)
+ bne cr1, L(dLcr1)
+ bdz- L(d24) /* Adjust CTR as we start with +4. */
+/* This is the primary loop. */
+ .align 4
+L(dLoop):
+ LD rWORD1, rOFF8, rSTR1
+ LD rWORD2, rOFF8, rSTR2
+ cmpld cr1, rWORD3, rWORD4
+ bne cr6, L(dLcr6)
+L(dLoop1):
+ LD rWORD3, rOFF16, rSTR1
+ LD rWORD4, rOFF16, rSTR2
+ cmpld cr6, rWORD5, rWORD6
+ bne cr5, L(dLcr5)
+L(dLoop2):
+ LD rWORD5, rOFF24, rSTR1
+ LD rWORD6, rOFF24, rSTR2
+ cmpld cr5, rWORD7, rWORD8
+ bne cr7, L(dLcr7)
+L(dLoop3):
+ LD rWORD7, rOFF32, rSTR1
+ LD rWORD8, rOFF32, rSTR2
+ addi rSTR1, rSTR1, 32
+ addi rSTR2, rSTR2, 32
+ bne cr1, L(dLcr1)
+ cmpld cr7, rWORD1, rWORD2
+ bdnz L(dLoop)
+
+L(dL4):
+ cmpld cr1, rWORD3, rWORD4
+ bne cr6, L(dLcr6)
+ cmpld cr6, rWORD5, rWORD6
+ bne cr5, L(dLcr5)
+ cmpld cr5, rWORD7, rWORD8
+L(d44):
+ bne cr7, L(dLcr7)
+L(d34):
+ bne cr1, L(dLcr1)
+L(d24):
+ bne cr6, L(dLcr6)
+L(d14):
+ sldi. r12, rN, 3
+ bne cr5, L(dLcr5)
+L(d04):
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
+ subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
+ beq L(duzeroLength)
+/* At this point we have a remainder of 1 to 7 bytes to compare. Since
+ we are aligned it is safe to load the whole double word, and use
+ shift right double to eliminate bits beyond the compare length. */
+L(d00):
+ LD rWORD1, rOFF8, rSTR1
+ LD rWORD2, rOFF8, rSTR2
+ srd rWORD1, rWORD1, rN
+ srd rWORD2, rWORD2, rN
+ cmpld cr7, rWORD1, rWORD2
+ bne cr7, L(dLcr7x)
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
+ li rRTN, 0
+ blr
+
+ .align 4
+L(dLcr7): /* Mismatch recorded in cr7; entry that also reloads rWORD7/rWORD8. */
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
+L(dLcr7x): /* Early-exit entry: rWORD7/rWORD8 are not reloaded here. */
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
+ li rRTN, 1
+ bgtlr cr7 /* Return 1 when cr7 says "greater than". */
+ li rRTN, -1 /* Otherwise return -1. */
+ blr
+ .align 4
+L(dLcr1): /* Same exit sequence, keyed on cr1. */
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
+L(dLcr1x):
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
+ li rRTN, 1
+ bgtlr cr1
+ li rRTN, -1
+ blr
+ .align 4
+L(dLcr6): /* Same exit sequence, keyed on cr6. */
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
+L(dLcr6x):
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
+ li rRTN, 1
+ bgtlr cr6
+ li rRTN, -1
+ blr
+ .align 4
+L(dLcr5): /* Same exit sequence, keyed on cr5. */
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
+L(dLcr5x):
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
+ li rRTN, 1
+ bgtlr cr5
+ li rRTN, -1
+ blr
+
+ .align 4
+L(bytealigned): /* Byte-at-a-time compare; CTR counts the rN bytes to examine. */
+ mtctr rN
+
+/* We need to prime this loop. This loop is swing modulo scheduled
+ to avoid pipe delays. The dependent instruction latencies (load to
+ compare to conditional branch) is 2 to 3 cycles. In this loop each
+ dispatch group ends in a branch and takes 1 cycle. Effectively
+ the first iteration of the loop only serves to load operands and
+ branches based on compares are delayed until the next loop.
+
+ So we must precondition some registers and condition codes so that
+ we don't exit the loop early on the first iteration. */
+
+ lbz rWORD1, 0(rSTR1)
+ lbz rWORD2, 0(rSTR2)
+ bdz L(b11) /* Only one byte to compare. */
+ cmpld cr7, rWORD1, rWORD2
+ lbz rWORD3, 1(rSTR1)
+ lbz rWORD4, 1(rSTR2)
+ bdz L(b12) /* Only two bytes to compare. */
+ cmpld cr1, rWORD3, rWORD4
+ lbzu rWORD5, 2(rSTR1) /* Update form: both pointers advance by 2. */
+ lbzu rWORD6, 2(rSTR2)
+ bdz L(b13) /* Only three bytes to compare. */
+ .align 4
+L(bLoop): /* Each stage loads one pair, tests an older pair, compares the pair between. */
+ lbzu rWORD1, 1(rSTR1)
+ lbzu rWORD2, 1(rSTR2)
+ bne cr7, L(bLcr7)
+
+ cmpld cr6, rWORD5, rWORD6
+ bdz L(b3i)
+
+ lbzu rWORD3, 1(rSTR1)
+ lbzu rWORD4, 1(rSTR2)
+ bne cr1, L(bLcr1)
+
+ cmpld cr7, rWORD1, rWORD2
+ bdz L(b2i)
+
+ lbzu rWORD5, 1(rSTR1)
+ lbzu rWORD6, 1(rSTR2)
+ bne cr6, L(bLcr6)
+
+ cmpld cr1, rWORD3, rWORD4
+ bdnz L(bLoop)
+
+/* We speculatively load bytes before we have tested the previous
+ bytes. But we must avoid overrunning the length (in the ctr) to
+ prevent these speculative loads from causing a segfault. In this
+ case the loop will exit early (before all the pending bytes are
+ tested). In this case we must complete the pending operations
+ before returning. */
+L(b1i): /* Pending: cr7, cr1, then the uncompared rWORD5/rWORD6 pair. */
+ bne cr7, L(bLcr7)
+ bne cr1, L(bLcr1)
+ b L(bx56)
+ .align 4
+L(b2i): /* Pending: cr6, cr7, then the uncompared rWORD3/rWORD4 pair. */
+ bne cr6, L(bLcr6)
+ bne cr7, L(bLcr7)
+ b L(bx34)
+ .align 4
+L(b3i): /* Pending: cr1, cr6, then the uncompared rWORD1/rWORD2 pair. */
+ bne cr1, L(bLcr1)
+ bne cr6, L(bLcr6)
+ b L(bx12)
+ .align 4
+L(bLcr7): /* Mismatch recorded in cr7: return 1 if greater, else -1. */
+ li rRTN, 1
+ bgtlr cr7
+ li rRTN, -1
+ blr
+L(bLcr1): /* Mismatch recorded in cr1. */
+ li rRTN, 1
+ bgtlr cr1
+ li rRTN, -1
+ blr
+L(bLcr6): /* Mismatch recorded in cr6. */
+ li rRTN, 1
+ bgtlr cr6
+ li rRTN, -1
+ blr
+
+L(b13): /* Three-byte case: the first differing pair decides the result. */
+ bne cr7, L(bx12)
+ bne cr1, L(bx34)
+L(bx56):
+ sub rRTN, rWORD5, rWORD6 /* lbz/lbzu zero-extend, so the difference carries the sign. */
+ blr
+ nop
+L(b12): /* Two-byte case. */
+ bne cr7, L(bx12)
+L(bx34):
+ sub rRTN, rWORD3, rWORD4
+ blr
+L(b11): /* One-byte case. */
+L(bx12):
+ sub rRTN, rWORD1, rWORD2
+ blr
+
+ .align 4
+L(zeroLength): /* Nothing (left) to compare: report equality. */
+ li rRTN, 0
+ blr
+
+ .align 4
+/* At this point we know the strings have different alignment and the
+ compare length is at least 8 bytes. r12 contains the low order
+ 3 bits of rSTR1 and cr5 contains the result of the logical compare
+ of r12 to 0. If r12 == 0 then rSTR1 is double word
+ aligned and can perform the DWunaligned loop.
+
+ Otherwise we know that rSTR1 is not already DW aligned yet.
+ So we can force the string addresses to the next lower DW
+ boundary and special case this first DW using shift left to
+ eliminate bits preceding the first byte. Since we want to join the
+ normal (DWaligned) compare loop, starting at the second double word,
+ we need to adjust the length (rN) and special case the loop
+ versioning for the first DW. This ensures that the loop count is
+ correct and the first DW (shifted) is in the expected register pair. */
+L(unaligned):
+ std rWORD8, rWORD8SAVE(r1)
+ std rWORD7, rWORD7SAVE(r1)
+ std rOFF8, rOFF8SAVE(r1)
+ std rOFF16, rOFF16SAVE(r1)
+ std rOFF24, rOFF24SAVE(r1)
+ std rOFF32, rOFF32SAVE(r1)
+ cfi_offset(rWORD8, rWORD8SAVE)
+ cfi_offset(rWORD7, rWORD7SAVE)
+ cfi_offset(rOFF8, rOFF8SAVE)
+ cfi_offset(rOFF16, rOFF16SAVE)
+ cfi_offset(rOFF24, rOFF24SAVE)
+ cfi_offset(rOFF32, rOFF32SAVE)
+ li rOFF8,8
+ li rOFF16,16
+ li rOFF24,24
+ li rOFF32,32
+ std rSHL, rSHLSAVE(r1)
+ cfi_offset(rSHL, rSHLSAVE)
+ clrldi rSHL, rSTR2, 61
+ beq cr6, L(duzeroLength)
+ std rSHR, rSHRSAVE(r1)
+ cfi_offset(rSHR, rSHRSAVE)
+ beq cr5, L(DWunaligned)
+ std rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
+ cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
+/* Adjust the logical start of rSTR2 to compensate for the extra bits
+ in the 1st rSTR1 DW. */
+ sub rWORD8_SHIFT, rSTR2, r12
+/* But do not attempt to address the DW before that DW that contains
+ the actual start of rSTR2. */
+ clrrdi rSTR2, rSTR2, 3
+ std rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
+/* Compute the left/right shift counts for the unaligned rSTR2,
+ compensating for the logical (DW aligned) start of rSTR1. */
+ clrldi rSHL, rWORD8_SHIFT, 61
+ clrrdi rSTR1, rSTR1, 3
+ std rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
+ sldi rSHL, rSHL, 3
+ cmpld cr5, rWORD8_SHIFT, rSTR2
+ add rN, rN, r12
+ sldi rWORD6, r12, 3
+ std rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
+ cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
+ cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
+ cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
+ subfic rSHR, rSHL, 64
+ srdi r0, rN, 5 /* Divide by 32. */
+ andi. r12, rN, 24 /* Get the DW remainder. */
+/* We normally need to load 2 DWs to start the unaligned rSTR2, but in
+ this special case those bits may be discarded anyway. Also we
+ must avoid loading a DW where none of the bits are part of rSTR2 as
+ this may cross a page boundary and cause a page fault. */
+ li rWORD8, 0
+ blt cr5, L(dus0)
+ LD rWORD8, 0, rSTR2
+ addi rSTR2, rSTR2, 8
+ sld rWORD8, rWORD8, rSHL
+
+L(dus0):
+ LD rWORD1, 0, rSTR1
+ LD rWORD2, 0, rSTR2
+ cmpldi cr1, r12, 16
+ cmpldi cr7, rN, 32
+ srd r12, rWORD2, rSHR
+ clrldi rN, rN, 61
+ beq L(duPs4)
+ mtctr r0
+ or rWORD8, r12, rWORD8
+ bgt cr1, L(duPs3)
+ beq cr1, L(duPs2)
+
+/* Remainder is 8. */
+ .align 4
+L(dusP1):
+ sld rWORD8_SHIFT, rWORD2, rSHL
+ sld rWORD7, rWORD1, rWORD6
+ sld rWORD8, rWORD8, rWORD6
+ bge cr7, L(duP1e)
+/* At this point we exit early with the first double word compare
+ complete and remainder of 0 to 7 bytes. See L(du14) for details on
+ how we handle the remaining bytes. */
+ cmpld cr5, rWORD7, rWORD8
+ sldi. rN, rN, 3
+ bne cr5, L(duLcr5)
+ cmpld cr7, rN, rSHR
+ beq L(duZeroReturn)
+ li r0, 0
+ ble cr7, L(dutrim)
+ LD rWORD2, rOFF8, rSTR2
+ srd r0, rWORD2, rSHR
+ b L(dutrim)
+/* Remainder is 16. */
+ .align 4
+L(duPs2):
+ sld rWORD6_SHIFT, rWORD2, rSHL
+ sld rWORD5, rWORD1, rWORD6
+ sld rWORD6, rWORD8, rWORD6
+ b L(duP2e)
+/* Remainder is 24. */
+ .align 4
+L(duPs3):
+ sld rWORD4_SHIFT, rWORD2, rSHL
+ sld rWORD3, rWORD1, rWORD6
+ sld rWORD4, rWORD8, rWORD6
+ b L(duP3e)
+/* Count is a multiple of 32, remainder is 0. */
+ .align 4
+L(duPs4):
+ mtctr r0
+ or rWORD8, r12, rWORD8
+ sld rWORD2_SHIFT, rWORD2, rSHL
+ sld rWORD1, rWORD1, rWORD6
+ sld rWORD2, rWORD8, rWORD6
+ b L(duP4e)
+
+/* At this point we know rSTR1 is double word aligned and the
+ compare length is at least 8 bytes. */
+ .align 4
+L(DWunaligned):
+ std rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
+ clrrdi rSTR2, rSTR2, 3
+ std rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
+ srdi r0, rN, 5 /* Divide by 32. */
+ std rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
+ andi. r12, rN, 24 /* Get the DW remainder. */
+ std rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
+ cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
+ cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
+ cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
+ cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
+ sldi rSHL, rSHL, 3
+ LD rWORD6, 0, rSTR2
+ LD rWORD8, rOFF8, rSTR2
+ addi rSTR2, rSTR2, 8
+ cmpldi cr1, r12, 16
+ cmpldi cr7, rN, 32
+ clrldi rN, rN, 61
+ subfic rSHR, rSHL, 64
+ sld rWORD6_SHIFT, rWORD6, rSHL
+ beq L(duP4)
+ mtctr r0
+ bgt cr1, L(duP3)
+ beq cr1, L(duP2)
+
+/* Remainder is 8. */
+ .align 4
+L(duP1):
+ srd r12, rWORD8, rSHR
+ LD rWORD7, 0, rSTR1
+ sld rWORD8_SHIFT, rWORD8, rSHL
+ or rWORD8, r12, rWORD6_SHIFT
+ blt cr7, L(duP1x)
+L(duP1e):
+ LD rWORD1, rOFF8, rSTR1
+ LD rWORD2, rOFF8, rSTR2
+ cmpld cr5, rWORD7, rWORD8
+ srd r0, rWORD2, rSHR
+ sld rWORD2_SHIFT, rWORD2, rSHL
+ or rWORD2, r0, rWORD8_SHIFT
+ LD rWORD3, rOFF16, rSTR1
+ LD rWORD4, rOFF16, rSTR2
+ cmpld cr7, rWORD1, rWORD2
+ srd r12, rWORD4, rSHR
+ sld rWORD4_SHIFT, rWORD4, rSHL
+ bne cr5, L(duLcr5)
+ or rWORD4, r12, rWORD2_SHIFT
+ LD rWORD5, rOFF24, rSTR1
+ LD rWORD6, rOFF24, rSTR2
+ cmpld cr1, rWORD3, rWORD4
+ srd r0, rWORD6, rSHR
+ sld rWORD6_SHIFT, rWORD6, rSHL
+ bne cr7, L(duLcr7)
+ or rWORD6, r0, rWORD4_SHIFT
+ cmpld cr6, rWORD5, rWORD6
+ b L(duLoop3)
+ .align 4
+/* At this point we exit early with the first double word compare
+ complete and remainder of 0 to 7 bytes. See L(du14) for details on
+ how we handle the remaining bytes. */
+L(duP1x):
+ cmpld cr5, rWORD7, rWORD8
+ sldi. rN, rN, 3
+ bne cr5, L(duLcr5)
+ cmpld cr7, rN, rSHR
+ beq L(duZeroReturn)
+ li r0, 0
+ ble cr7, L(dutrim)
+ LD rWORD2, rOFF8, rSTR2
+ srd r0, rWORD2, rSHR
+ b L(dutrim)
+/* Remainder is 16. */
+ .align 4
+L(duP2):
+ srd r0, rWORD8, rSHR
+ LD rWORD5, 0, rSTR1
+ or rWORD6, r0, rWORD6_SHIFT
+ sld rWORD6_SHIFT, rWORD8, rSHL
+L(duP2e):
+ LD rWORD7, rOFF8, rSTR1
+ LD rWORD8, rOFF8, rSTR2
+ cmpld cr6, rWORD5, rWORD6
+ srd r12, rWORD8, rSHR
+ sld rWORD8_SHIFT, rWORD8, rSHL
+ or rWORD8, r12, rWORD6_SHIFT
+ blt cr7, L(duP2x)
+ LD rWORD1, rOFF16, rSTR1
+ LD rWORD2, rOFF16, rSTR2
+ cmpld cr5, rWORD7, rWORD8
+ bne cr6, L(duLcr6)
+ srd r0, rWORD2, rSHR
+ sld rWORD2_SHIFT, rWORD2, rSHL
+ or rWORD2, r0, rWORD8_SHIFT
+ LD rWORD3, rOFF24, rSTR1
+ LD rWORD4, rOFF24, rSTR2
+ cmpld cr7, rWORD1, rWORD2
+ bne cr5, L(duLcr5)
+ srd r12, rWORD4, rSHR
+ sld rWORD4_SHIFT, rWORD4, rSHL
+ or rWORD4, r12, rWORD2_SHIFT
+ addi rSTR1, rSTR1, 8
+ addi rSTR2, rSTR2, 8
+ cmpld cr1, rWORD3, rWORD4
+ b L(duLoop2)
+ .align 4
+L(duP2x):
+ cmpld cr5, rWORD7, rWORD8
+ addi rSTR1, rSTR1, 8
+ addi rSTR2, rSTR2, 8
+ bne cr6, L(duLcr6)
+ sldi. rN, rN, 3
+ bne cr5, L(duLcr5)
+ cmpld cr7, rN, rSHR
+ beq L(duZeroReturn)
+ li r0, 0
+ ble cr7, L(dutrim)
+ LD rWORD2, rOFF8, rSTR2
+ srd r0, rWORD2, rSHR
+ b L(dutrim)
+
+/* Remainder is 24. */
+ .align 4
+L(duP3):
+ srd r12, rWORD8, rSHR
+ LD rWORD3, 0, rSTR1
+ sld rWORD4_SHIFT, rWORD8, rSHL
+ or rWORD4, r12, rWORD6_SHIFT
+L(duP3e):
+ LD rWORD5, rOFF8, rSTR1
+ LD rWORD6, rOFF8, rSTR2
+ cmpld cr1, rWORD3, rWORD4
+ srd r0, rWORD6, rSHR
+ sld rWORD6_SHIFT, rWORD6, rSHL
+ or rWORD6, r0, rWORD4_SHIFT
+ LD rWORD7, rOFF16, rSTR1
+ LD rWORD8, rOFF16, rSTR2
+ cmpld cr6, rWORD5, rWORD6
+ bne cr1, L(duLcr1)
+ srd r12, rWORD8, rSHR
+ sld rWORD8_SHIFT, rWORD8, rSHL
+ or rWORD8, r12, rWORD6_SHIFT
+ blt cr7, L(duP3x)
+ LD rWORD1, rOFF24, rSTR1
+ LD rWORD2, rOFF24, rSTR2
+ cmpld cr5, rWORD7, rWORD8
+ bne cr6, L(duLcr6)
+ srd r0, rWORD2, rSHR
+ sld rWORD2_SHIFT, rWORD2, rSHL
+ or rWORD2, r0, rWORD8_SHIFT
+ addi rSTR1, rSTR1, 16
+ addi rSTR2, rSTR2, 16
+ cmpld cr7, rWORD1, rWORD2
+ b L(duLoop1)
+ .align 4
+L(duP3x):
+ addi rSTR1, rSTR1, 16
+ addi rSTR2, rSTR2, 16
+ cmpld cr5, rWORD7, rWORD8
+ bne cr6, L(duLcr6)
+ sldi. rN, rN, 3
+ bne cr5, L(duLcr5)
+ cmpld cr7, rN, rSHR
+ beq L(duZeroReturn)
+ li r0, 0
+ ble cr7, L(dutrim)
+ LD rWORD2, rOFF8, rSTR2
+ srd r0, rWORD2, rSHR
+ b L(dutrim)
+
+/* Count is a multiple of 32, remainder is 0. */
+ .align 4
+L(duP4):
+ mtctr r0
+ srd r0, rWORD8, rSHR
+ LD rWORD1, 0, rSTR1
+ sld rWORD2_SHIFT, rWORD8, rSHL
+ or rWORD2, r0, rWORD6_SHIFT
+L(duP4e):
+ LD rWORD3, rOFF8, rSTR1
+ LD rWORD4, rOFF8, rSTR2
+ cmpld cr7, rWORD1, rWORD2
+ srd r12, rWORD4, rSHR
+ sld rWORD4_SHIFT, rWORD4, rSHL
+ or rWORD4, r12, rWORD2_SHIFT
+ LD rWORD5, rOFF16, rSTR1
+ LD rWORD6, rOFF16, rSTR2
+ cmpld cr1, rWORD3, rWORD4
+ bne cr7, L(duLcr7)
+ srd r0, rWORD6, rSHR
+ sld rWORD6_SHIFT, rWORD6, rSHL
+ or rWORD6, r0, rWORD4_SHIFT
+ LD rWORD7, rOFF24, rSTR1
+ LD rWORD8, rOFF24, rSTR2
+ addi rSTR1, rSTR1, 24
+ addi rSTR2, rSTR2, 24
+ cmpld cr6, rWORD5, rWORD6
+ bne cr1, L(duLcr1)
+ srd r12, rWORD8, rSHR
+ sld rWORD8_SHIFT, rWORD8, rSHL
+ or rWORD8, r12, rWORD6_SHIFT
+ cmpld cr5, rWORD7, rWORD8
+ bdz L(du24) /* Adjust CTR as we start with +4. */
+/* This is the primary loop. */
+ .align 4
+L(duLoop):
+ LD rWORD1, rOFF8, rSTR1
+ LD rWORD2, rOFF8, rSTR2
+ cmpld cr1, rWORD3, rWORD4
+ bne cr6, L(duLcr6)
+ srd r0, rWORD2, rSHR
+ sld rWORD2_SHIFT, rWORD2, rSHL
+ or rWORD2, r0, rWORD8_SHIFT
+L(duLoop1):
+ LD rWORD3, rOFF16, rSTR1
+ LD rWORD4, rOFF16, rSTR2
+ cmpld cr6, rWORD5, rWORD6
+ bne cr5, L(duLcr5)
+ srd r12, rWORD4, rSHR
+ sld rWORD4_SHIFT, rWORD4, rSHL
+ or rWORD4, r12, rWORD2_SHIFT
+L(duLoop2):
+ LD rWORD5, rOFF24, rSTR1
+ LD rWORD6, rOFF24, rSTR2
+ cmpld cr5, rWORD7, rWORD8
+ bne cr7, L(duLcr7)
+ srd r0, rWORD6, rSHR
+ sld rWORD6_SHIFT, rWORD6, rSHL
+ or rWORD6, r0, rWORD4_SHIFT
+L(duLoop3):
+ LD rWORD7, rOFF32, rSTR1
+ LD rWORD8, rOFF32, rSTR2
+ addi rSTR1, rSTR1, 32
+ addi rSTR2, rSTR2, 32
+ cmpld cr7, rWORD1, rWORD2
+ bne cr1, L(duLcr1)
+ srd r12, rWORD8, rSHR
+ sld rWORD8_SHIFT, rWORD8, rSHL
+ or rWORD8, r12, rWORD6_SHIFT
+ bdnz L(duLoop)
+
+L(duL4): /* Loop exit: drain the compares still in flight. */
+ cmpld cr1, rWORD3, rWORD4
+ bne cr6, L(duLcr6)
+ cmpld cr6, rWORD5, rWORD6
+ bne cr5, L(duLcr5)
+ cmpld cr5, rWORD7, rWORD8
+L(du44):
+ bne cr7, L(duLcr7)
+L(du34):
+ bne cr1, L(duLcr1)
+L(du24):
+ bne cr6, L(duLcr6)
+L(du14):
+ sldi. rN, rN, 3 /* rN = remaining length in bits; cr0 records whether it is zero. */
+ bne cr5, L(duLcr5)
+/* At this point we have a remainder of 1 to 7 bytes to compare. We use
+ shift right double to eliminate bits beyond the compare length.
+
+ However it may not be safe to load rWORD2 which may be beyond the
+ string length. So we compare the bit length of the remainder to
+ the right shift count (rSHR). If the bit count is less than or equal
+ we do not need to load rWORD2 (all significant bits are already in
+ rWORD8_SHIFT). */
+ cmpld cr7, rN, rSHR
+ beq L(duZeroReturn) /* cr0 from the sldi. above: no remainder, buffers are equal. */
+ li r0, 0 /* Default: no bits taken from a further rSTR2 doubleword. */
+ ble cr7, L(dutrim)
+ LD rWORD2, rOFF8, rSTR2
+ srd r0, rWORD2, rSHR /* r0 = bits the next rSTR2 doubleword contributes. */
+ .align 4
+L(dutrim): /* Compare the trailing partial doubleword; rN = bit length, r0 = extra rSTR2 bits or 0. */
+ LD rWORD1, rOFF8, rSTR1
+ ld rWORD8, rWORD8SAVE(r1) /* Use the named slot rWORD8 was stored to, not a literal offset. */
+ subfic rN, rN, 64 /* Shift count is 64 - (rN * 8). */
+ or rWORD2, r0, rWORD8_SHIFT /* Assemble the matching rSTR2 doubleword. */
+ ld rWORD7, rWORD7SAVE(r1)
+ ld rSHL, rSHLSAVE(r1)
+ srd rWORD1, rWORD1, rN /* Drop the bits beyond the compare length. */
+ srd rWORD2, rWORD2, rN
+ ld rSHR, rSHRSAVE(r1)
+ ld rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
+ li rRTN, 0
+ cmpld cr7, rWORD1, rWORD2
+ ld rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
+ ld rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
+ beq cr7, L(dureturn24) /* Equal: rRTN is 0; the shared tail restores the rest. */
+ li rRTN, 1
+ ld rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
+ bgtlr cr7 /* Return 1 when the rSTR1 side is greater. */
+ li rRTN, -1
+ blr
+ .align 4
+L(duLcr7): /* Mismatch recorded in cr7 on the unaligned path. */
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
+ li rRTN, 1
+ bgt cr7, L(dureturn29) /* Greater: return 1 through the shared restore tail. */
+ ld rSHL, rSHLSAVE(r1)
+ ld rSHR, rSHRSAVE(r1)
+ li rRTN, -1
+ b L(dureturn27) /* rSHL/rSHR are already reloaded, so join the tail after them. */
+ .align 4
+L(duLcr1): /* Same exit sequence, keyed on cr1. */
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
+ li rRTN, 1
+ bgt cr1, L(dureturn29)
+ ld rSHL, rSHLSAVE(r1)
+ ld rSHR, rSHRSAVE(r1)
+ li rRTN, -1
+ b L(dureturn27)
+ .align 4
+L(duLcr6): /* Same exit sequence, keyed on cr6. */
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
+ li rRTN, 1
+ bgt cr6, L(dureturn29)
+ ld rSHL, rSHLSAVE(r1)
+ ld rSHR, rSHRSAVE(r1)
+ li rRTN, -1
+ b L(dureturn27)
+ .align 4
+L(duLcr5): /* Same exit sequence, keyed on cr5. */
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
+ li rRTN, 1
+ bgt cr5, L(dureturn29)
+ ld rSHL, rSHLSAVE(r1)
+ ld rSHR, rSHRSAVE(r1)
+ li rRTN, -1
+ b L(dureturn27)
+
+ .align 3
+L(duZeroReturn): /* Buffers compared equal on the unaligned path. */
+ li rRTN, 0
+ .align 4
+L(dureturn): /* Staggered restore tail; each later label reloads fewer registers. */
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
+L(dureturn29): /* Entry when rWORD7/rWORD8 are already reloaded. */
+ ld rSHL, rSHLSAVE(r1)
+ ld rSHR, rSHRSAVE(r1)
+L(dureturn27): /* Entry when rSHL/rSHR are already reloaded too. */
+ ld rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
+ ld rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
+ ld rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
+L(dureturn24): /* Entry when only rWORD6_SHIFT and the rOFF registers remain. */
+ ld rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
+ blr
+
+L(duzeroLength): /* Return 0, reloading only rOFF8..rOFF32. NOTE(review): L(unaligned) branches here after saving and overwriting rSHL, which is not reloaded - confirm that branch is never taken. */
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
+ li rRTN, 0
+ blr
+
+END (MEMCMP)
+libc_hidden_builtin_def (memcmp)
+weak_alias (memcmp, bcmp)
diff --git a/sysdeps/powerpc/powerpc64/power8/memrchr.S b/sysdeps/powerpc/powerpc64/power8/memrchr.S
new file mode 100644
index 0000000000..54de6566bd
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/memrchr.S
@@ -0,0 +1,345 @@
+/* Optimized memrchr implementation for PowerPC64/POWER8.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
+ Contributed by Luis Machado <luisgpm@br.ibm.com>.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* void * [r3] memrchr (const void *s [r3], int byte [r4], size_t size [r5]) */
+
+/* TODO: change these to the actual instructions when the minimum required
+ binutils allows it. */
+#define MTVRD(v, r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16)))
+#define MFVRD(r, v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
+#define VBPERMQ(t, a, b) .long (0x1000054c \
+ | ((t)<<(32-11)) \
+ | ((a)<<(32-16)) \
+ | ((b)<<(32-21)) )
+#ifndef MEMRCHR
+# define MEMRCHR __memrchr
+#endif
+ .machine power7
+ENTRY_TOCLESS (MEMRCHR)
+ CALL_MCOUNT 3
+ add r7, r3, r5 /* r7 = s + size, one past the last acceptable address. */
+ neg r0, r7 /* r0 = -(s + size); its low 3 bits give the tail padding. */
+ addi r7, r7, -1 /* r7 = address of the last byte in the range. */
+ mr r10, r3 /* r10 = s, kept for the lower-bound checks. */
+ clrrdi r6, r7, 7 /* r6 = r7 rounded down to a 128-byte boundary. */
+ li r9, 3<<5
+ dcbt r9, r6, 8 /* Stream hint, decreasing addresses. */
+
+ /* Replicate BYTE to doubleword. */
+ insrdi r4, r4, 8, 48
+ insrdi r4, r4, 16, 32
+ insrdi r4, r4, 32, 0
+ li r6, -8 /* r6 = offset of the preceding doubleword. */
+ li r9, -1
+ rlwinm r0, r0, 3, 26, 28 /* Calculate padding. */
+ clrrdi r8, r7, 3 /* r8 = doubleword that holds the last byte. */
+ srd r9, r9, r0 /* r9 = mask that clears the padding bytes past the end. */
+ cmpldi r5, 32
+ clrrdi r0, r10, 3 /* r0 = doubleword that holds the first byte. */
+ ble L(small_range) /* size <= 32 is handled one doubleword at a time. */
+
+#ifdef __LITTLE_ENDIAN__
+ ldx r12, 0, r8
+#else
+ ldbrx r12, 0, r8 /* Load reversed doubleword from memory. */
+#endif
+ cmpb r3, r12, r4 /* Check for BYTE in DWORD1. */
+ and r3, r3, r9 /* Ignore matches in the padding past the end. */
+ cmpldi cr7, r3, 0 /* If r3 == 0, no BYTEs have been found. */
+ bne cr7, L(done)
+
+ /* Are we now aligned to a quadword boundary? If so, skip to
+ the main loop. Otherwise, go through the alignment code. */
+ andi. r12, r8, 15
+ beq cr0, L(align_qw)
+
+ /* Handle DWORD2 of pair. */
+#ifdef __LITTLE_ENDIAN__
+ ldx r12, r8, r6
+#else
+ ldbrx r12, r8, r6
+#endif
+ addi r8, r8, -8 /* r8 now names the doubleword just loaded. */
+ cmpb r3, r12, r4
+ cmpldi cr7, r3, 0
+ bne cr7, L(done)
+
+ .align 4
+ /* At this point, r8 is 16B aligned. */
+L(align_qw):
+ sub r5, r8, r0 /* r5 = bytes between the first doubleword (r0) and r8. */
+ vspltisb v0, 0 /* v0 = all zeros. */
+ /* Precompute vbpermq constant. */
+ vspltisb v10, 3
+ li r0, 0
+ lvsl v11, r0, r0 /* Address 0 gives the byte sequence 0x00..0x0f. */
+ vslb v10, v11, v10 /* v10 = each index << 3: bit numbers 0, 8, ..., 120. */
+ MTVRD(v1, r4)
+ vspltb v1, v1, 7 /* v1 = BYTE replicated into every lane. */
+ cmpldi r5, 64
+ ble L(tail64)
+ /* Are we 64-byte aligned? If so, jump to the vectorized loop.
+ Note: aligning to 64-byte will necessarily slow down performance for
+ strings around 64 bytes in length due to the extra comparisons
+ required to check alignment for the vectorized loop. This is a
+ necessary tradeoff we are willing to take in order to speed up the
+ calculation for larger strings. */
+ andi. r11, r8, 63
+ beq cr0, L(preloop_64B)
+ /* In order to begin the 64B loop, it needs to be 64
+ bytes aligned. So read until it is 64B aligned. */
+ addi r8, r8, -16
+ lvx v4, 0, r8
+ vcmpequb v6, v1, v4 /* v6 = 0xff in every lane equal to BYTE. */
+ vcmpequb. v11, v0, v6 /* cr6 records whether v6 is all zero. */
+ bnl cr6, L(found_16B) /* Taken when at least one lane matched. */
+ addi r5, r5, -16
+
+ andi. r11, r8, 63
+ beq cr0, L(preloop_64B)
+ addi r8, r8, -16
+ lvx v4, 0, r8
+ vcmpequb v6, v1, v4
+ vcmpequb. v11, v0, v6
+ bnl cr6, L(found_16B)
+ addi r5, r5, -16
+
+ andi. r11, r8, 63
+ beq cr0, L(preloop_64B)
+ addi r8, r8, -16
+ lvx v4, 0, r8
+ vcmpequb v6, v1, v4
+ vcmpequb. v11, v0, v6
+ bnl cr6, L(found_16B)
+ addi r5, r5, -16
+ /* At this point it should be 64B aligned.
+ Prepare for the 64B loop. */
+L(preloop_64B):
+ cmpldi r5, 64 /* Check if r5 <= 64. */
+ ble L(tail64)
+ srdi r9, r5, 6 /* Number of loop iterations. */
+ mtctr r9 /* Setup the counter. */
+ li r11, 16 /* Load required offsets. */
+ li r9, 32
+ li r7, 48
+
+ /* Handle r5 > 64. Loop over the bytes in strides of 64B. */
+ .align 4
+L(loop):
+ addi r8, r8, -64 /* Adjust address for the next iteration. */
+ lvx v2, 0, r8 /* Load 4 quadwords. */
+ lvx v3, r8, r11
+ lvx v4, r8, r9 /* Base must be GPR r8; v8 is a vector register. */
+ lvx v5, r8, r7
+ vcmpequb v6, v1, v2 /* v6..v9 = 0xff in every lane equal to BYTE. */
+ vcmpequb v7, v1, v3
+ vcmpequb v8, v1, v4
+ vcmpequb v9, v1, v5
+ vor v11, v6, v7
+ vor v12, v8, v9
+ vor v11, v11, v12 /* Compare and merge into one VR for speed. */
+ vcmpequb. v11, v0, v11 /* cr6 records whether the merged result is all zero. */
+ bnl cr6, L(found) /* Taken when any of the 64 bytes matched. */
+ bdnz L(loop)
+ clrldi r5, r5, 58 /* r5 = r5 mod 64, the bytes left for the tail. */
+
+ /* Handle remainder of 64B loop or r5 <= 64. */
+ .align 4
+L(tail64):
+ cmpldi r5, 0
+ beq L(null) /* Nothing left to scan. */
+ addi r8, r8, -16 /* Step down to the previous quadword. */
+ lvx v4, 0, r8
+ vcmpequb v6, v1, v4 /* v6 = 0xff in every lane equal to BYTE. */
+ vcmpequb. v11, v0, v6
+ bnl cr6, L(found_16B) /* Taken when at least one lane matched. */
+ cmpldi cr6, r5, 16
+ ble cr6, L(null) /* That quadword covered all remaining bytes. */
+ addi r5, r5, -16
+
+ addi r8, r8, -16
+ lvx v4, 0, r8
+ vcmpequb v6, v1, v4
+ vcmpequb. v11, v0, v6
+ bnl cr6, L(found_16B)
+ cmpldi cr6, r5, 16
+ ble cr6, L(null)
+ addi r5, r5, -16
+
+ addi r8, r8, -16
+ lvx v4, 0, r8
+ vcmpequb v6, v1, v4
+ vcmpequb. v11, v0, v6
+ bnl cr6, L(found_16B)
+ cmpldi cr6, r5, 16
+ ble cr6, L(null)
+ addi r5, r5, -16
+
+ addi r8, r8, -16
+ lvx v4, 0, r8
+ vcmpequb v6, v1, v4
+ vcmpequb. v11, v0, v6
+ bnl cr6, L(found_16B)
+ li r3, 0 /* Fourth and last quadword had no match: return 0. */
+ blr
+
+ /* Found a match in 64B loop. */
+ .align 4
+L(found):
+ /* Permute the first bit of each byte into bits 48-63. */
+ VBPERMQ(v6, v6, v10)
+ VBPERMQ(v7, v7, v10)
+ VBPERMQ(v8, v8, v10)
+ VBPERMQ(v9, v9, v10)
+ /* Shift each component into its correct position for merging. */
+#ifdef __LITTLE_ENDIAN__
+ vsldoi v7, v7, v7, 2
+ vsldoi v8, v8, v8, 4
+ vsldoi v9, v9, v9, 6
+#else
+ vsldoi v6, v6, v6, 6
+ vsldoi v7, v7, v7, 4
+ vsldoi v8, v8, v8, 2
+#endif
+ /* Merge the results and move to a GPR. */
+ vor v11, v6, v7
+ vor v4, v9, v8
+ vor v4, v11, v4
+ MFVRD(r5, v4) /* r5 = the merged per-byte match bits for the 64-byte block. */
+#ifdef __LITTLE_ENDIAN__
+ cntlzd r6, r5 /* Count leading zeros before the match. */
+#else
+ addi r6, r5, -1 /* Next three instructions count the trailing zeros of r5. */
+ andc r6, r6, r5
+ popcntd r6, r6
+#endif
+ addi r8, r8, 63 /* r8 = address of the last byte of the block. */
+ sub r3, r8, r6 /* Compute final address. */
+ cmpld cr7, r3, r10
+ bgelr cr7 /* Valid only if the match is not below s. */
+ li r3, 0
+ blr
+
+ /* Found a match in last 16 bytes. */
+ .align 4
+L(found_16B):
+ cmpld r8, r10 /* Are we on the last QW? */
+ bge L(last) /* Quadword starts at or above s: nothing to mask. */
+ /* Now discard bytes before starting address. */
+ sub r9, r10, r8 /* r9 = bytes of this quadword that lie below s. */
+ MTVRD(v9, r9)
+ vspltisb v8, 3
+ /* Mask unwanted bytes. */
+#ifdef __LITTLE_ENDIAN__
+ lvsr v7, 0, r10
+ vperm v6, v0, v6, v7
+ vsldoi v9, v0, v9, 8
+ vsl v9, v9, v8
+ vslo v6, v6, v9
+#else
+ lvsl v7, 0, r10
+ vperm v6, v6, v0, v7
+ vsldoi v9, v0, v9, 8
+ vsl v9, v9, v8
+ vsro v6, v6, v9
+#endif
+L(last):
+ /* Permute the first bit of each byte into bits 48-63. */
+ VBPERMQ(v6, v6, v10)
+ /* Shift each component into its correct position for merging. */
+#ifdef __LITTLE_ENDIAN__
+ vsldoi v6, v6, v6, 6
+ MFVRD(r7, v6)
+ cntlzd r6, r7 /* Count leading zeros before the match. */
+#else
+ MFVRD(r7, v6)
+ addi r6, r7, -1 /* Next three instructions count the trailing zeros of r7. */
+ andc r6, r6, r7
+ popcntd r6, r6
+#endif
+ addi r8, r8, 15 /* r8 = address of the last byte of the quadword. */
+ sub r3, r8, r6 /* Compute final address. */
+ cmpld r6, r5 /* r5 is presumably the remaining byte count here - verify. */
+ bltlr /* Return the address only when r6 < r5. */
+ li r3, 0
+ blr
+
+ /* r3 has the output of the cmpb instruction, that is, it contains
+ 0xff in the same position as BYTE in the original
+ word from the string. Use that to calculate the pointer.
+ We need to make sure BYTE is *before* the end of the
+ range. */
+L(done):
+ cntlzd r9, r3 /* Count leading zeros before the match. */
+ cmpld r8, r0 /* Are we on the last word? */
+ srdi r6, r9, 3 /* Convert leading zeros to bytes. */
+ addi r0, r6, -7
+ sub r3, r8, r0 /* r3 = r8 + 7 - (leading zero bytes). */
+ cmpld cr7, r3, r10
+ bnelr /* r8 is not the first doubleword (r0): return the match. */
+ bgelr cr7 /* In the first doubleword: valid only if not below s. */
+ li r3, 0
+ blr
+
+ .align 4
+L(null): /* BYTE was not found: return 0. */
+ li r3, 0
+ blr
+
+/* Deals with size <= 32. */
+ .align 4
+L(small_range):
+ cmpldi r5, 0
+ beq L(null) /* size == 0: nothing to search. */
+
+#ifdef __LITTLE_ENDIAN__
+ ldx r12, 0, r8
+#else
+ ldbrx r12, 0, r8 /* Load reversed doubleword from memory. */
+#endif
+ cmpb r3, r12, r4 /* Check for BYTE in DWORD1. */
+ and r3, r3, r9 /* Ignore matches in the padding past the end. */
+ cmpldi cr7, r3, 0
+ bne cr7, L(done)
+
+ /* Are we done already? */
+ cmpld r8, r0
+ addi r8, r8, -8
+ beqlr /* That was the first doubleword; r3 is 0 here. */
+
+ .align 5
+L(loop_small): /* Walk down one doubleword at a time until r8 reaches r0. */
+#ifdef __LITTLE_ENDIAN__
+ ldx r12, 0, r8
+#else
+ ldbrx r12, 0, r8
+#endif
+ cmpb r3, r12, r4
+ cmpld r8, r0 /* cr0: is this the first doubleword of the range? */
+ cmpldi cr7, r3, 0
+ bne cr7, L(done)
+ addi r8, r8, -8
+ bne L(loop_small)
+ blr /* No match anywhere; r3 is 0 from the last cmpb. */
+
+END (MEMRCHR)
+weak_alias (__memrchr, memrchr)
+libc_hidden_builtin_def (memrchr)
diff --git a/sysdeps/powerpc/powerpc64/power8/memset.S b/sysdeps/powerpc/powerpc64/power8/memset.S
index 11433d89ad..a42232b42a 100644
--- a/sysdeps/powerpc/powerpc64/power8/memset.S
+++ b/sysdeps/powerpc/powerpc64/power8/memset.S
@@ -1,5 +1,5 @@
/* Optimized memset implementation for PowerPC64/POWER8.
- Copyright (C) 2014-2016 Free Software Foundation, Inc.
+ Copyright (C) 2014-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -20,14 +20,18 @@
#define MTVSRD_V1_R4 .long 0x7c240166 /* mtvsrd v1,r4 */
-/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]));
+/* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]));
Returns 's'. */
+#ifndef MEMSET
+# define MEMSET memset
+#endif
+
/* No need to use .machine power8 since mtvsrd is already
handled by the define. It avoid breakage on binutils
that does not support this machine specifier. */
.machine power7
-EALIGN (memset, 5, 0)
+ENTRY_TOCLESS (MEMSET, 5)
CALL_MCOUNT 3
L(_memset):
@@ -373,7 +377,10 @@ L(write_LT_32):
subf r5,r0,r5
2: bf 30,1f
- sth r4,0(r10)
+ /* Use stb instead of sth because it doesn't generate
+ alignment interrupts on cache-inhibited storage. */
+ stb r4,0(r10)
+ stb r4,1(r10)
addi r10,r10,2
1: bf 31,L(end_4bytes_alignment)
@@ -433,17 +440,80 @@ L(tail5):
/* Handles copies of 0~8 bytes. */
.align 4
L(write_LE_8):
- bne cr6,L(tail4)
+ bne cr6,L(LE7_tail4)
+ /* If input is word aligned, use stw, else use stb. */
+ andi. r0,r10,3
+ bne L(8_unalign)
stw r4,0(r10)
stw r4,4(r10)
blr
-END_GEN_TB (memset,TB_TOCLESS)
+
+ /* Unaligned input and size is 8. */
+ .align 4
+L(8_unalign):
+ andi. r0,r10,1
+ beq L(8_hwalign)
+ stb r4,0(r10)
+ sth r4,1(r10)
+ sth r4,3(r10)
+ sth r4,5(r10)
+ stb r4,7(r10)
+ blr
+
+ /* Halfword aligned input and size is 8. */
+ .align 4
+L(8_hwalign):
+ sth r4,0(r10)
+ sth r4,2(r10)
+ sth r4,4(r10)
+ sth r4,6(r10)
+ blr
+
+ .align 4
+ /* Copies 4~7 bytes. */
+L(LE7_tail4):
+ /* Use stb instead of sth because it doesn't generate
+ alignment interrupts on cache-inhibited storage. */
+ bf 29,L(LE7_tail2)
+ stb r4,0(r10)
+ stb r4,1(r10)
+ stb r4,2(r10)
+ stb r4,3(r10)
+ bf 30,L(LE7_tail5)
+ stb r4,4(r10)
+ stb r4,5(r10)
+ bflr 31
+ stb r4,6(r10)
+ blr
+
+ .align 4
+ /* Copies 2~3 bytes. */
+L(LE7_tail2):
+ bf 30,1f
+ stb r4,0(r10)
+ stb r4,1(r10)
+ bflr 31
+ stb r4,2(r10)
+ blr
+
+ .align 4
+L(LE7_tail5):
+ bflr 31
+ stb r4,4(r10)
+ blr
+
+ .align 4
+1: bflr 31
+ stb r4,0(r10)
+ blr
+
+END_GEN_TB (MEMSET,TB_TOCLESS)
libc_hidden_builtin_def (memset)
/* Copied from bzero.S to prevent the linker from inserting a stub
between bzero and memset. */
-ENTRY (__bzero)
+ENTRY_TOCLESS (__bzero)
CALL_MCOUNT 3
mr r5,r4
li r4,0
diff --git a/sysdeps/powerpc/powerpc64/power8/multiarch/Implies b/sysdeps/powerpc/powerpc64/power8/multiarch/Implies
deleted file mode 100644
index 1fc7b7cd39..0000000000
--- a/sysdeps/powerpc/powerpc64/power8/multiarch/Implies
+++ /dev/null
@@ -1 +0,0 @@
-powerpc/powerpc64/power7/multiarch
diff --git a/sysdeps/powerpc/powerpc64/power8/stpcpy.S b/sysdeps/powerpc/powerpc64/power8/stpcpy.S
index fd8f7fa63a..ebdfaab97c 100644
--- a/sysdeps/powerpc/powerpc64/power8/stpcpy.S
+++ b/sysdeps/powerpc/powerpc64/power8/stpcpy.S
@@ -1,5 +1,5 @@
/* Optimized stpcpy implementation for PowerPC64/POWER8.
- Copyright (C) 2015-2016 Free Software Foundation, Inc.
+ Copyright (C) 2015-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/powerpc/powerpc64/power8/stpncpy.S b/sysdeps/powerpc/powerpc64/power8/stpncpy.S
index 067910b373..95c86e9677 100644
--- a/sysdeps/powerpc/powerpc64/power8/stpncpy.S
+++ b/sysdeps/powerpc/powerpc64/power8/stpncpy.S
@@ -1,5 +1,5 @@
/* Optimized stpncpy implementation for PowerPC64/POWER8.
- Copyright (C) 2015-2016 Free Software Foundation, Inc.
+ Copyright (C) 2015-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -18,3 +18,7 @@
#define USE_AS_STPNCPY
#include <sysdeps/powerpc/powerpc64/power8/strncpy.S>
+
+weak_alias (__stpncpy, stpncpy)
+libc_hidden_def (__stpncpy)
+libc_hidden_builtin_def (stpncpy)
diff --git a/sysdeps/powerpc/powerpc64/power8/strcasecmp.S b/sysdeps/powerpc/powerpc64/power8/strcasecmp.S
new file mode 100644
index 0000000000..3a2efe2a64
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strcasecmp.S
@@ -0,0 +1,457 @@
+/* Optimized strcasecmp implementation for PowerPC64.
+ Copyright (C) 2016-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <locale-defines.h>
+
+/* int [r3] strcasecmp (const char *s1 [r3], const char *s2 [r4] ) */
+
+#ifndef USE_AS_STRNCASECMP
+# define __STRCASECMP __strcasecmp
+# define STRCASECMP strcasecmp
+#else
+# define __STRCASECMP __strncasecmp
+# define STRCASECMP strncasecmp
+#endif
+/* Convert 16 bytes of v4 and of v5 to lowercase and compare (v1-v3 hold constants). */
+#define TOLOWER() \
+ vaddubm v8, v4, v1; /* v8 = v4 + 0xbf: 'A'..'Z' wrap to 0x00..0x19. */ \
+ vaddubm v7, v4, v3; /* v7 = v4 + 0x20: lowercase candidate. */ \
+ vcmpgtub v8, v8, v2; /* v8 = mask of bytes outside 'A'..'Z'. */ \
+ vsel v4, v7, v4, v8; /* Keep v4 where masked, else take v7. */ \
+ vaddubm v8, v5, v1; /* Same sequence for v5. */ \
+ vaddubm v7, v5, v3; \
+ vcmpgtub v8, v8, v2; \
+ vsel v5, v7, v5, v8; \
+ vcmpequb. v7, v5, v4; /* v7 and CR6 = bytewise equality of v5, v4. */
+
+/*
+ * Get 16 bytes for unaligned case.
+ * reg1: Vector to hold next 16 bytes.
+ * reg2: Address to read from.
+ * reg3: Permute control vector.
+ * v8: Tmp vector used to mask unwanted bytes.
+ * v9: Tmp vector,0 when null is found on first 16 bytes
+ */
+#ifdef __LITTLE_ENDIAN__
+#define GET16BYTES(reg1, reg2, reg3) \
+ lvx reg1, 0, reg2; \
+ vspltisb v8, -1; \
+ vperm v8, v8, reg1, reg3; \
+ vcmpequb. v8, v0, v8; \
+ beq cr6, 1f; \
+ vspltisb v9, 0; \
+ b 2f; \
+ .align 4; \
+1: \
+ addi r6, reg2, 16; \
+ lvx v9, 0, r6; \
+2: \
+ vperm reg1, v9, reg1, reg3;
+#else
+#define GET16BYTES(reg1, reg2, reg3) \
+ lvx reg1, 0, reg2; \
+ vspltisb v8, -1; \
+ vperm v8, reg1, v8, reg3; \
+ vcmpequb. v8, v0, v8; \
+ beq cr6, 1f; \
+ vspltisb v9, 0; \
+ b 2f; \
+ .align 4; \
+1: \
+ addi r6, reg2, 16; \
+ lvx v9, 0, r6; \
+2: \
+ vperm reg1, reg1, v9, reg3;
+#endif
+
+/* Check null in v4, v5 and convert to lower. */
+#define CHECKNULLANDCONVERT() \
+ vcmpequb. v7, v0, v5; \
+ beq cr6, 3f; \
+ vcmpequb. v7, v0, v4; \
+ beq cr6, 3f; \
+ b L(null_found); \
+ .align 4; \
+3: \
+ TOLOWER()
+
+#ifdef _ARCH_PWR8
+# define VCLZD_V8_v7 vclzd v8, v7;
+# define MFVRD_R3_V1 mfvrd r3, v1;
+# define VSUBUDM_V9_V8 vsubudm v9, v9, v8;
+# define VPOPCNTD_V8_V8 vpopcntd v8, v8;
+# define VADDUQM_V7_V8 vadduqm v9, v7, v8;
+#else
+# define VCLZD_V8_v7 .long 0x11003fc2
+# define MFVRD_R3_V1 .long 0x7c230067
+# define VSUBUDM_V9_V8 .long 0x112944c0
+# define VPOPCNTD_V8_V8 .long 0x110047c3
+# define VADDUQM_V7_V8 .long 0x11274100
+#endif
+
+ .machine power7
+
+ENTRY (__STRCASECMP)
+#ifdef USE_AS_STRNCASECMP
+ CALL_MCOUNT 3
+#else
+ CALL_MCOUNT 2
+#endif
+#define rRTN r3 /* Return value */
+#define rSTR1 r10 /* 1st string */
+#define rSTR2 r4 /* 2nd string */
+#define rCHAR1 r6 /* Byte read from 1st string */
+#define rCHAR2 r7 /* Byte read from 2nd string */
+#define rADDR1 r8 /* Address of tolower(rCHAR1) */
+#define rADDR2 r12 /* Address of tolower(rCHAR2) */
+#define rLWR1 r8 /* Word tolower(rCHAR1) */
+#define rLWR2 r12 /* Word tolower(rCHAR2) */
+#define rTMP r9
+#define rLOC r11 /* Default locale address */
+
+ cmpd cr7, rRTN, rSTR2
+
+ /* Get locale address. */
+ ld rTMP, __libc_tsd_LOCALE@got@tprel(r2)
+ add rLOC, rTMP, __libc_tsd_LOCALE@tls
+ ld rLOC, 0(rLOC)
+
+ mr rSTR1, rRTN
+ li rRTN, 0
+ beqlr cr7
+#ifdef USE_AS_STRNCASECMP
+ cmpdi cr7, r5, 0
+ beq cr7, L(retnull)
+ cmpdi cr7, r5, 16
+ blt cr7, L(bytebybyte)
+#endif
+ vspltisb v0, 0
+ vspltisb v8, -1
+ /* Check for null in initial characters.
+ Check max of 16 char depending on the alignment.
+ If null is present, proceed byte by byte. */
+ lvx v4, 0, rSTR1
+#ifdef __LITTLE_ENDIAN__
+ lvsr v10, 0, rSTR1 /* Compute mask. */
+ vperm v9, v8, v4, v10 /* Mask bits that are not part of string. */
+#else
+ lvsl v10, 0, rSTR1
+ vperm v9, v4, v8, v10
+#endif
+ vcmpequb. v9, v0, v9 /* Check for null bytes. */
+ bne cr6, L(bytebybyte)
+ lvx v5, 0, rSTR2
+ /* Calculate alignment. */
+#ifdef __LITTLE_ENDIAN__
+ lvsr v6, 0, rSTR2
+ vperm v9, v8, v5, v6 /* Mask bits that are not part of string. */
+#else
+ lvsl v6, 0, rSTR2
+ vperm v9, v5, v8, v6
+#endif
+ vcmpequb. v9, v0, v9 /* Check for null bytes. */
+ bne cr6, L(bytebybyte)
+ /* Check if locale has non ascii characters. */
+ ld rTMP, 0(rLOC)
+ addi r6, rTMP,LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES
+ lwz rTMP, 0(r6)
+ cmpdi cr7, rTMP, 1
+ beq cr7, L(bytebybyte)
+
+ /* Load vector registers with values used for TOLOWER. */
+ /* Load v1 = 0xbf, v2 = 0x19 v3 = 0x20 in each byte. */
+ vspltisb v3, 2
+ vspltisb v9, 4
+ vsl v3, v3, v9
+ vaddubm v1, v3, v3
+ vnor v1, v1, v1
+ vspltisb v2, 7
+ vsububm v2, v3, v2
+
+ andi. rADDR1, rSTR1, 0xF
+ beq cr0, L(align)
+ addi r6, rSTR1, 16
+ lvx v9, 0, r6
+ /* Compute 16 bytes from previous two loads. */
+#ifdef __LITTLE_ENDIAN__
+ vperm v4, v9, v4, v10
+#else
+ vperm v4, v4, v9, v10
+#endif
+L(align):
+ andi. rADDR2, rSTR2, 0xF
+ beq cr0, L(align1)
+ addi r6, rSTR2, 16
+ lvx v9, 0, r6
+ /* Compute 16 bytes from previous two loads. */
+#ifdef __LITTLE_ENDIAN__
+ vperm v5, v9, v5, v6
+#else
+ vperm v5, v5, v9, v6
+#endif
+L(align1):
+ CHECKNULLANDCONVERT()
+ blt cr6, L(match)
+ b L(different)
+ .align 4
+L(match):
+ clrldi r6, rSTR1, 60
+ subfic r7, r6, 16
+#ifdef USE_AS_STRNCASECMP
+ sub r5, r5, r7
+#endif
+ add rSTR1, rSTR1, r7
+ add rSTR2, rSTR2, r7
+ andi. rADDR2, rSTR2, 0xF
+ addi rSTR1, rSTR1, -16
+ addi rSTR2, rSTR2, -16
+ beq cr0, L(aligned)
+#ifdef __LITTLE_ENDIAN__
+ lvsr v6, 0, rSTR2
+#else
+ lvsl v6, 0, rSTR2
+#endif
+ /* There are 2 loops depending on the input alignment.
+ Each loop gets 16 bytes from s1 and s2, check for null,
+ convert to lowercase and compare. Loop till difference
+ or null occurs. */
+L(s1_align):
+ addi rSTR1, rSTR1, 16
+ addi rSTR2, rSTR2, 16
+#ifdef USE_AS_STRNCASECMP
+ cmpdi cr7, r5, 16
+ blt cr7, L(bytebybyte)
+ addi r5, r5, -16
+#endif
+ lvx v4, 0, rSTR1
+ GET16BYTES(v5, rSTR2, v6)
+ CHECKNULLANDCONVERT()
+ blt cr6, L(s1_align)
+ b L(different)
+ .align 4
+L(aligned):
+ addi rSTR1, rSTR1, 16
+ addi rSTR2, rSTR2, 16
+#ifdef USE_AS_STRNCASECMP
+ cmpdi cr7, r5, 16
+ blt cr7, L(bytebybyte)
+ addi r5, r5, -16
+#endif
+ lvx v4, 0, rSTR1
+ lvx v5, 0, rSTR2
+ CHECKNULLANDCONVERT()
+ blt cr6, L(aligned)
+
+ /* Calculate and return the difference. */
+L(different):
+ vaddubm v1, v3, v3
+ vcmpequb v7, v0, v7
+#ifdef __LITTLE_ENDIAN__
+ /* Count trailing zero. */
+ vspltisb v8, -1
+ VADDUQM_V7_V8
+ vandc v8, v9, v7
+ VPOPCNTD_V8_V8
+ vspltb v6, v8, 15
+ vcmpequb. v6, v6, v1
+ blt cr6, L(shift8)
+#else
+ /* Count leading zero. */
+ VCLZD_V8_v7
+ vspltb v6, v8, 7
+ vcmpequb. v6, v6, v1
+ blt cr6, L(shift8)
+ vsro v8, v8, v1
+#endif
+ b L(skipsum)
+ .align 4
+L(shift8):
+ vsumsws v8, v8, v0
+L(skipsum):
+#ifdef __LITTLE_ENDIAN__
+ /* Shift registers based on leading zero count. */
+ vsro v6, v5, v8
+ vsro v7, v4, v8
+ /* Merge and move to GPR. */
+ vmrglb v6, v6, v7
+ vslo v1, v6, v1
+ MFVRD_R3_V1
+ /* Place the characters that are different in first position. */
+ sldi rSTR2, rRTN, 56
+ srdi rSTR2, rSTR2, 56
+ sldi rSTR1, rRTN, 48
+ srdi rSTR1, rSTR1, 56
+#else
+ vslo v6, v5, v8
+ vslo v7, v4, v8
+ vmrghb v1, v6, v7
+ MFVRD_R3_V1
+ srdi rSTR2, rRTN, 48
+ sldi rSTR2, rSTR2, 56
+ srdi rSTR2, rSTR2, 56
+ srdi rSTR1, rRTN, 56
+#endif
+ subf rRTN, rSTR1, rSTR2
+ extsw rRTN, rRTN
+ blr
+
+ .align 4
+ /* OK. We've hit the end of the string. We need to be careful that
+ we don't compare two strings as different because of junk beyond
+ the end of the strings... */
+L(null_found):
+ vaddubm v10, v3, v3
+#ifdef __LITTLE_ENDIAN__
+ /* Count trailing zero. */
+ vspltisb v8, -1
+ VADDUQM_V7_V8
+ vandc v8, v9, v7
+ VPOPCNTD_V8_V8
+ vspltb v6, v8, 15
+ vcmpequb. v6, v6, v10
+ blt cr6, L(shift_8)
+#else
+ /* Count leading zero. */
+ VCLZD_V8_v7
+ vspltb v6, v8, 7
+ vcmpequb. v6, v6, v10
+ blt cr6, L(shift_8)
+ vsro v8, v8, v10
+#endif
+ b L(skipsum1)
+ .align 4
+L(shift_8):
+ vsumsws v8, v8, v0
+L(skipsum1):
+ /* Calculate shift count based on count of zero. */
+ vspltisb v10, 7
+ vslb v10, v10, v10
+ vsldoi v9, v0, v10, 1
+ VSUBUDM_V9_V8
+ vspltisb v8, 8
+ vsldoi v8, v0, v8, 1
+ VSUBUDM_V9_V8
+ /* Shift and remove junk after null character. */
+#ifdef __LITTLE_ENDIAN__
+ vslo v5, v5, v9
+ vslo v4, v4, v9
+#else
+ vsro v5, v5, v9
+ vsro v4, v4, v9
+#endif
+ /* Convert and compare 16 bytes. */
+ TOLOWER()
+ blt cr6, L(retnull)
+ b L(different)
+ .align 4
+L(retnull):
+ li rRTN, 0
+ blr
+ .align 4
+L(bytebybyte):
+ /* Unrolling loop for POWER: loads are done with 'lbz' plus
+ offset and string descriptors are only updated in the end
+ of loop unrolling. */
+ ld rLOC, LOCALE_CTYPE_TOLOWER(rLOC)
+ lbz rCHAR1, 0(rSTR1) /* Load char from s1 */
+ lbz rCHAR2, 0(rSTR2) /* Load char from s2 */
+#ifdef USE_AS_STRNCASECMP
+ rldicl rTMP, r5, 62, 2
+ cmpdi cr7, rTMP, 0
+ beq cr7, L(lessthan4)
+ mtctr rTMP
+#endif
+L(loop):
+ cmpdi rCHAR1, 0 /* *s1 == '\0' ? */
+ sldi rADDR1, rCHAR1, 2 /* Calculate address for tolower(*s1) */
+ sldi rADDR2, rCHAR2, 2 /* Calculate address for tolower(*s2) */
+ lwzx rLWR1, rLOC, rADDR1 /* Load tolower(*s1) */
+ lwzx rLWR2, rLOC, rADDR2 /* Load tolower(*s2) */
+ cmpw cr1, rLWR1, rLWR2 /* r = tolower(*s1) == tolower(*s2) ? */
+ crorc 4*cr1+eq,eq,4*cr1+eq /* (*s1 != '\0') || (r == 1) */
+ beq cr1, L(done)
+ lbz rCHAR1, 1(rSTR1)
+ lbz rCHAR2, 1(rSTR2)
+ cmpdi rCHAR1, 0
+ sldi rADDR1, rCHAR1, 2
+ sldi rADDR2, rCHAR2, 2
+ lwzx rLWR1, rLOC, rADDR1
+ lwzx rLWR2, rLOC, rADDR2
+ cmpw cr1, rLWR1, rLWR2
+ crorc 4*cr1+eq,eq,4*cr1+eq
+ beq cr1, L(done)
+ lbz rCHAR1, 2(rSTR1)
+ lbz rCHAR2, 2(rSTR2)
+ cmpdi rCHAR1, 0
+ sldi rADDR1, rCHAR1, 2
+ sldi rADDR2, rCHAR2, 2
+ lwzx rLWR1, rLOC, rADDR1
+ lwzx rLWR2, rLOC, rADDR2
+ cmpw cr1, rLWR1, rLWR2
+ crorc 4*cr1+eq,eq,4*cr1+eq
+ beq cr1, L(done)
+ lbz rCHAR1, 3(rSTR1)
+ lbz rCHAR2, 3(rSTR2)
+ cmpdi rCHAR1, 0
+ /* Increment both string descriptors */
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+ sldi rADDR1, rCHAR1, 2
+ sldi rADDR2, rCHAR2, 2
+ lwzx rLWR1, rLOC, rADDR1
+ lwzx rLWR2, rLOC, rADDR2
+ cmpw cr1, rLWR1, rLWR2
+ crorc 4*cr1+eq,eq,4*cr1+eq
+ beq cr1, L(done)
+ lbz rCHAR1, 0(rSTR1) /* Load char from s1 */
+ lbz rCHAR2, 0(rSTR2) /* Load char from s2 */
+#ifdef USE_AS_STRNCASECMP
+ bdnz L(loop)
+#else
+ b L(loop)
+#endif
+#ifdef USE_AS_STRNCASECMP
+L(lessthan4):
+ clrldi r5, r5, 62
+ cmpdi cr7, r5, 0
+ beq cr7, L(retnull)
+ mtctr r5
+L(loop1):
+ cmpdi rCHAR1, 0
+ sldi rADDR1, rCHAR1, 2
+ sldi rADDR2, rCHAR2, 2
+ lwzx rLWR1, rLOC, rADDR1
+ lwzx rLWR2, rLOC, rADDR2
+ cmpw cr1, rLWR1, rLWR2
+ crorc 4*cr1+eq,eq,4*cr1+eq
+ beq cr1, L(done)
+ addi rSTR1, rSTR1, 1
+ addi rSTR2, rSTR2, 1
+ lbz rCHAR1, 0(rSTR1)
+ lbz rCHAR2, 0(rSTR2)
+ bdnz L(loop1)
+#endif
+L(done):
+ subf r0, rLWR2, rLWR1 /* r0 = tolower(*s1) - tolower(*s2). */
+ extsw rRTN, r0 /* Sign-extend the word difference into r3. */
+ blr
+END (__STRCASECMP)
+
+weak_alias (__STRCASECMP, STRCASECMP)
+libc_hidden_builtin_def (__STRCASECMP)
diff --git a/sysdeps/powerpc/powerpc64/power8/strcasestr-ppc64.c b/sysdeps/powerpc/powerpc64/power8/strcasestr-ppc64.c
new file mode 100644
index 0000000000..221d4733f4
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strcasestr-ppc64.c
@@ -0,0 +1,29 @@
+/* Optimized strcasestr implementation for PowerPC64/POWER8.
+ Copyright (C) 2016-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <string.h>
+
+#define STRCASESTR __strcasestr_ppc /* Symbol called by L(default) in power8/strcasestr.S. */
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(__name) /* Expands to nothing in this file. */
+
+#undef weak_alias
+#define weak_alias(a,b) /* Expands to nothing in this file. */
+extern __typeof (strcasestr) __strcasestr_ppc attribute_hidden;
+
+#include <string/strcasestr.c> /* Presumably names its function via STRCASESTR; verify. */
diff --git a/sysdeps/powerpc/powerpc64/power8/strcasestr.S b/sysdeps/powerpc/powerpc64/power8/strcasestr.S
new file mode 100644
index 0000000000..9fc24c29f9
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strcasestr.S
@@ -0,0 +1,538 @@
+/* Optimized strcasestr implementation for PowerPC64/POWER8.
+ Copyright (C) 2016-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <locale-defines.h>
+
+/* char * [r3] strcasestr (char *s [r3], char *pat [r4]) */
+
+/* The performance gain is obtained by comparing 16 bytes. */
+
+/* When the search has restarted ITERATIONS times from a candidate position
+ in r3 without matching the whole pattern, fall back to the default code. */
+#define ITERATIONS 64
+
+#ifndef STRCASESTR
+# define STRCASESTR __strcasestr
+#endif
+
+#ifndef STRLEN
+/* For builds without IFUNC support, local calls should be made to internal
+ GLIBC symbol (created by libc_hidden_builtin_def). */
+# ifdef SHARED
+# define STRLEN __GI_strlen
+# else
+# define STRLEN strlen
+# endif
+#endif
+
+#ifndef STRNLEN
+/* For builds without IFUNC support, local calls should be made to internal
+ GLIBC symbol (created by libc_hidden_builtin_def). */
+# ifdef SHARED
+# define STRNLEN __GI_strnlen
+# else
+# define STRNLEN __strnlen
+# endif
+#endif
+
+#ifndef STRCHR
+# ifdef SHARED
+# define STRCHR __GI_strchr
+# else
+# define STRCHR strchr
+# endif
+#endif
+
+/* Convert 16 bytes of v4 and reg to lowercase and compare (uses v1, v2, v3). */
+#define TOLOWER(reg) \
+ vcmpgtub v6, v4, v1; /* v6 = bytes of v4 above v1 ('A' - 1). */ \
+ vcmpgtub v7, v2, v4; /* v7 = bytes of v4 below v2 ('Z' + 1). */ \
+ vand v8, v7, v6; /* v8 = mask of uppercase bytes. */ \
+ vand v8, v8, v3; /* Reduce the mask to the 0x20 case bit. */ \
+ vor v4, v8, v4; /* Set it: 'A'..'Z' become 'a'..'z'. */ \
+ vcmpgtub v6, reg, v1; /* Same sequence for reg. */ \
+ vcmpgtub v7, v2, reg; \
+ vand v8, v7, v6; \
+ vand v8, v8, v3; \
+ vor reg, v8, reg; \
+ vcmpequb. v6, reg, v4; /* v6 and CR6 = bytewise equality. */
+
+/* TODO: change these to the actual instructions when the minimum required
+ binutils allows it. */
+#ifdef _ARCH_PWR8
+#define VCLZD_V8_v7 vclzd v8, v7;
+#else
+#define VCLZD_V8_v7 .long 0x11003fc2
+#endif
+
+#define FRAMESIZE (FRAME_MIN_SIZE+48)
+/* TODO: change this to .machine power8 when the minimum required binutils
+ allows it. */
+ .machine power7
+ENTRY (STRCASESTR, 4)
+ CALL_MCOUNT 2
+ mflr r0 /* Load link register LR to r0. */
+ std r31, -8(r1) /* Save callers register r31. */
+ std r30, -16(r1) /* Save callers register r30. */
+ std r29, -24(r1) /* Save callers register r29. */
+ std r28, -32(r1) /* Save callers register r28. */
+ std r27, -40(r1) /* Save callers register r27. */
+ std r0, 16(r1) /* Store the link register. */
+ cfi_offset(r31, -8)
+ cfi_offset(r30, -16)
+ cfi_offset(r29, -24)
+ cfi_offset(r28, -32)
+ cfi_offset(r27, -40)
+ cfi_offset(lr, 16)
+ stdu r1, -FRAMESIZE(r1) /* Create the stack frame. */
+ cfi_adjust_cfa_offset(FRAMESIZE)
+
+ dcbt 0, r3
+ dcbt 0, r4
+ cmpdi cr7, r3, 0 /* Input validation. */
+ beq cr7, L(retnull)
+ cmpdi cr7, r4, 0
+ beq cr7, L(retnull)
+
+ mr r29, r3
+ mr r30, r4
+ /* Load first byte from r4 and check if its null. */
+ lbz r6, 0(r4)
+ cmpdi cr7, r6, 0
+ beq cr7, L(ret_r3)
+
+ ld r10, __libc_tsd_LOCALE@got@tprel(r2)
+ add r9, r10, __libc_tsd_LOCALE@tls
+ ld r9, 0(r9)
+ ld r9, LOCALE_CTYPE_TOUPPER(r9)
+ sldi r10, r6, 2 /* Convert to upper case. */
+ lwzx r28, r9, r10
+
+ ld r10, __libc_tsd_LOCALE@got@tprel(r2)
+ add r11, r10, __libc_tsd_LOCALE@tls
+ ld r11, 0(r11)
+ ld r11, LOCALE_CTYPE_TOLOWER(r11)
+ sldi r10, r6, 2 /* Convert to lower case. */
+ lwzx r27, r11, r10
+
+ /* Find the first char of the pattern in r3, in either case. */
+ mr r4, r27 /* Lowercase form of the first char. */
+ bl STRCHR
+ nop
+ mr r5, r3
+ mr r3, r29
+ mr r29, r5 /* r29 = position of the lowercase form. */
+ mr r4, r28 /* Uppercase form of the first char. */
+ bl STRCHR
+ nop
+ cmpdi cr7, r29, 0
+ beq cr7, L(firstpos)
+ cmpdi cr7, r3, 0
+ beq cr7, L(skipcheck)
+ cmpld cr7, r3, r29 /* Pointers: unsigned 64-bit compare. */
+ ble cr7, L(firstpos)
+ /* Move r3 to the first occurrence. */
+L(skipcheck):
+ mr r3, r29
+L(firstpos):
+ mr r29, r3
+
+ sldi r9, r27, 8
+ or r28, r9, r28 /* r28 = (lowercase << 8) | uppercase. */
+ /* Reg r27 is used to count the number of iterations. */
+ li r27, 0
+ /* If first char of search str is not present. */
+ cmpdi cr7, r3, 0
+ beq cr7, L(end) /* r3 is already NULL here. */
+
+ /* Find the length of pattern. */
+ mr r3, r30
+ bl STRLEN
+ nop
+
+ cmpdi cr7, r3, 0 /* If search str is null. */
+ beq cr7, L(ret_r3)
+
+ mr r31, r3
+ mr r4, r3
+ mr r3, r29
+ bl STRNLEN
+ nop
+
+ cmpd cr7, r3, r31 /* If len(r3) < len(r4). */
+ blt cr7, L(retnull)
+
+ mr r3, r29
+
+ /* Locales not matching ASCII for single bytes. */
+ ld r10, __libc_tsd_LOCALE@got@tprel(r2)
+ add r9, r10, __libc_tsd_LOCALE@tls
+ ld r9, 0(r9)
+ ld r7, 0(r9)
+ addi r7, r7, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES
+ lwz r8, 0(r7)
+ cmpdi cr7, r8, 1
+ beq cr7, L(bytebybyte)
+
+ /* If len(r4) < 16 handle byte by byte. */
+ /* For shorter strings we will not use vector registers. */
+ cmpdi cr7, r31, 16
+ blt cr7, L(bytebybyte)
+
+ /* Comparison values used for TOLOWER. */
+ /* Load v1 = 64('A' - 1), v2 = 91('Z' + 1), v3 = 32 in each byte. */
+ vspltish v0, 0
+ vspltisb v5, 2
+ vspltisb v4, 4
+ vsl v3, v5, v4
+ vaddubm v1, v3, v3
+ vspltisb v5, 15
+ vaddubm v2, v5, v5
+ vaddubm v2, v1, v2
+ vspltisb v4, -3
+ vaddubm v2, v2, v4
+
+ /*
+ 1. Load 16 bytes from r3 and r4
+ 2. Check if there is null, If yes, proceed byte by byte path.
+ 3. Else,Convert both to lowercase and compare.
+ 4. If they are same proceed to 1.
+ 5. If they dont match, find if first char of r4 is present in the
+ loaded 16 byte of r3.
+ 6. If yes, move position, load next 16 bytes of r3 and proceed to 2.
+ */
+
+ mr r8, r3 /* Save r3 for future use. */
+ mr r4, r30 /* Restore r4. */
+ clrldi r10, r4, 60
+ lvx v5, 0, r4 /* Load 16 bytes from r4. */
+ cmpdi cr7, r10, 0
+ beq cr7, L(begin2)
+ /* If r4 is unaligned, load another 16 bytes. */
+#ifdef __LITTLE_ENDIAN__
+ lvsr v7, 0, r4
+#else
+ lvsl v7, 0, r4
+#endif
+ addi r5, r4, 16
+ lvx v9, 0, r5
+#ifdef __LITTLE_ENDIAN__
+ vperm v5, v9, v5, v7
+#else
+ vperm v5, v5, v9, v7
+#endif
+L(begin2):
+ lvx v4, 0, r3
+ vcmpequb. v7, v0, v4 /* Check for null. */
+ beq cr6, L(nullchk6)
+ b L(trailcheck)
+
+ .align 4
+L(nullchk6):
+ clrldi r10, r3, 60
+ cmpdi cr7, r10, 0
+ beq cr7, L(next16)
+#ifdef __LITTLE_ENDIAN__
+ lvsr v7, 0, r3
+#else
+ lvsl v7, 0, r3
+#endif
+ addi r5, r3, 16
+ /* If r3 is unaligned, load another 16 bytes. */
+ lvx v10, 0, r5
+#ifdef __LITTLE_ENDIAN__
+ vperm v4, v10, v4, v7
+#else
+ vperm v4, v4, v10, v7
+#endif
+L(next16):
+ vcmpequb. v6, v0, v5 /* Check for null. */
+ beq cr6, L(nullchk)
+ b L(trailcheck)
+
+ .align 4
+L(nullchk):
+ vcmpequb. v6, v0, v4
+ beq cr6, L(nullchk1)
+ b L(retnull)
+
+ .align 4
+L(nullchk1):
+ /* Convert both v4 and v5 to lower. */
+ TOLOWER(v5)
+ /* If both are same, branch to match. */
+ blt cr6, L(match)
+ /* Find if the first char is present in next 15 bytes. */
+#ifdef __LITTLE_ENDIAN__
+ vspltb v6, v5, 15
+ vsldoi v7, v0, v4, 15
+#else
+ vspltb v6, v5, 0
+ vspltisb v7, 8
+ vslo v7, v4, v7
+#endif
+ vcmpequb v7, v6, v7 /* v7 = bytes equal to the first char. */
+ vcmpequb. v6, v0, v7 /* CR6 "all" is set when v7 is all zero. */
+ /* Shift r3 by 16 bytes and proceed. */
+ blt cr6, L(shift16)
+ VCLZD_V8_v7
+#ifdef __LITTLE_ENDIAN__
+ vspltb v6, v8, 15
+#else
+ vspltb v6, v8, 7
+#endif
+ vcmpequb. v6, v6, v1 /* v1 holds 64 in each byte. */
+ /* Shift r3 by 8 bytes and proceed. */
+ blt cr6, L(shift8)
+ b L(begin)
+
+ .align 4
+L(match):
+ /* There is a match of 16 bytes, check next bytes. */
+ cmpdi cr7, r31, 16
+ mr r29, r3
+ beq cr7, L(ret_r3)
+
+L(secondmatch):
+ addi r3, r3, 16
+ addi r4, r4, 16
+ /* Load next 16 bytes of r3 and r4 and compare. */
+ clrldi r10, r4, 60
+ cmpdi cr7, r10, 0
+ beq cr7, L(nextload)
+ /* Handle unaligned case. */
+ vor v6, v9, v9
+ vcmpequb. v7, v0, v6
+ beq cr6, L(nullchk2)
+ b L(trailcheck)
+
+ .align 4
+L(nullchk2):
+#ifdef __LITTLE_ENDIAN__
+ lvsr v7, 0, r4
+#else
+ lvsl v7, 0, r4
+#endif
+ addi r5, r4, 16
+ /* If r4 is unaligned, load another 16 bytes. */
+ lvx v9, 0, r5
+#ifdef __LITTLE_ENDIAN__
+ vperm v11, v9, v6, v7
+#else
+ vperm v11, v6, v9, v7
+#endif
+ b L(compare)
+
+ .align 4
+L(nextload):
+ lvx v11, 0, r4
+L(compare):
+ vcmpequb. v7, v0, v11
+ beq cr6, L(nullchk3)
+ b L(trailcheck)
+
+ .align 4
+L(nullchk3):
+ clrldi r10, r3, 60
+ cmpdi cr7, r10, 0
+ beq cr7, L(nextload1)
+ /* Handle unaligned case. */
+ vor v4, v10, v10
+ vcmpequb. v7, v0, v4
+ beq cr6, L(nullchk4)
+ b L(retnull)
+
+ .align 4
+L(nullchk4):
+#ifdef __LITTLE_ENDIAN__
+ lvsr v7, 0, r3
+#else
+ lvsl v7, 0, r3
+#endif
+ addi r5, r3, 16
+ /* If r3 is unaligned, load another 16 bytes. */
+ lvx v10, 0, r5
+#ifdef __LITTLE_ENDIAN__
+ vperm v4, v10, v4, v7
+#else
+ vperm v4, v4, v10, v7
+#endif
+ b L(compare1)
+
+ .align 4
+L(nextload1):
+ lvx v4, 0, r3
+L(compare1):
+ vcmpequb. v7, v0, v4
+ beq cr6, L(nullchk5)
+ b L(retnull)
+
+ .align 4
+L(nullchk5):
+ /* Convert both v4 and v11 to lower. */
+ TOLOWER(v11)
+ /* If both are same, branch to secondmatch. */
+ blt cr6, L(secondmatch)
+ /* Continue the search. */
+ b L(begin)
+
+ .align 4
+L(trailcheck):
+ ld r10, __libc_tsd_LOCALE@got@tprel(r2)
+ add r11, r10, __libc_tsd_LOCALE@tls
+ ld r11, 0(r11)
+ ld r11, LOCALE_CTYPE_TOLOWER(r11)
+L(loop2):
+ lbz r5, 0(r3) /* Load byte from r3. */
+ lbz r6, 0(r4) /* Load next byte from r4. */
+ cmpdi cr7, r6, 0 /* Is it null? */
+ beq cr7, L(updater3)
+ cmpdi cr7, r5, 0 /* Is it null? */
+ beq cr7, L(retnull) /* If yes, return. */
+ addi r3, r3, 1
+ addi r4, r4, 1 /* Increment r4. */
+ sldi r10, r5, 2 /* Convert to lower case. */
+ lwzx r10, r11, r10
+ sldi r7, r6, 2 /* Convert to lower case. */
+ lwzx r7, r11, r7
+ cmpw cr7, r7, r10 /* Compare with byte from r4. */
+ bne cr7, L(begin)
+ b L(loop2)
+
+ .align 4
+L(shift8):
+ addi r8, r8, 7 /* L(begin) adds 1 more: 8 in total. */
+ b L(begin)
+ .align 4
+L(shift16):
+ addi r8, r8, 15 /* Falls through; L(begin) adds 1: 16. */
+ .align 4
+L(begin):
+ addi r8, r8, 1
+ mr r3, r8 /* Restart the search at the new r8. */
+ /* When the restart count reaches ITERATIONS, fall back to default. */
+ addi r27, r27, 1
+ cmpdi cr7, r27, ITERATIONS
+ beq cr7, L(default)
+ mr r4, r30 /* Restore r4. */
+ b L(begin2)
+
+ /* Handling byte by byte. */
+ .align 4
+L(loop1):
+ mr r3, r8 /* Resume from the saved position. */
+ addi r27, r27, 1
+ cmpdi cr7, r27, ITERATIONS
+ beq cr7, L(default)
+ mr r29, r8
+ srdi r4, r28, 8 /* Upper byte of r28: first search char. */
+ /* Check if the first char is present. */
+ bl STRCHR
+ nop
+ mr r5, r3
+ mr r3, r29
+ mr r29, r5 /* r29 = position of the first form. */
+ sldi r4, r28, 56
+ srdi r4, r4, 56 /* Low byte of r28: second search char. */
+ bl STRCHR
+ nop
+ cmpdi cr7, r29, 0
+ beq cr7, L(nextpos)
+ cmpdi cr7, r3, 0
+ beq cr7, L(skipcheck1)
+ cmpld cr7, r3, r29 /* Pointers: unsigned 64-bit compare. */
+ ble cr7, L(nextpos)
+ /* Move r3 to first occurrence. */
+L(skipcheck1):
+ mr r3, r29
+L(nextpos):
+ mr r29, r3
+ cmpdi cr7, r3, 0
+ beq cr7, L(retnull) /* Neither form was found. */
+L(bytebybyte):
+ ld r10, __libc_tsd_LOCALE@got@tprel(r2)
+ add r11, r10, __libc_tsd_LOCALE@tls
+ ld r11, 0(r11)
+ ld r11, LOCALE_CTYPE_TOLOWER(r11)
+ mr r4, r30 /* Restore r4. */
+ mr r8, r3 /* Save r3. */
+ addi r8, r8, 1
+
+L(loop):
+ addi r3, r3, 1
+ lbz r5, 0(r3) /* Load byte from r3. */
+ addi r4, r4, 1 /* Increment r4. */
+ lbz r6, 0(r4) /* Load next byte from r4. */
+ cmpdi cr7, r6, 0 /* Is it null? */
+ beq cr7, L(updater3)
+ cmpdi cr7, r5, 0 /* Is it null? */
+ beq cr7, L(retnull) /* If yes, return. */
+ sldi r10, r5, 2 /* Convert to lower case. */
+ lwzx r10, r11, r10
+ sldi r7, r6, 2 /* Convert to lower case. */
+ lwzx r7, r11, r7
+ cmpw cr7, r7, r10 /* Compare with byte from r4. */
+ bne cr7, L(loop1)
+ b L(loop)
+
+ /* Handling return values. */
+ .align 4
+L(updater3):
+ subf r3, r31, r3 /* Reduce r31 (len of r4) from r3. */
+ b L(end)
+
+ .align 4
+L(ret_r3):
+ mr r3, r29 /* Return point of match. */
+ b L(end)
+
+ .align 4
+L(retnull):
+ li r3, 0 /* Substring was not found. */
+ b L(end)
+
+ .align 4
+L(default):
+ mr r4, r30 /* Pattern; r3 was set from r8 by the caller of this label. */
+ bl __strcasestr_ppc
+ nop
+
+ .align 4 /* L(default) falls through with the callee's result in r3. */
+L(end):
+ addi r1, r1, FRAMESIZE /* Restore stack pointer. */
+ cfi_adjust_cfa_offset(-FRAMESIZE)
+ ld r0, 16(r1) /* Restore the saved link register. */
+ ld r27, -40(r1) /* Restore callers save register r27. */
+ ld r28, -32(r1) /* Restore callers save register r28. */
+ ld r29, -24(r1) /* Restore callers save register r29. */
+ ld r30, -16(r1) /* Restore callers save register r30. */
+ ld r31, -8(r1) /* Restore callers save register r31. */
+ cfi_restore(lr)
+ cfi_restore(r27)
+ cfi_restore(r28)
+ cfi_restore(r29)
+ cfi_restore(r30)
+ cfi_restore(r31)
+ mtlr r0 /* Move the return address back into LR. */
+ blr /* Return to the caller. */
+END (STRCASESTR)
+
+weak_alias (__strcasestr, strcasestr)
+libc_hidden_def (__strcasestr)
+libc_hidden_builtin_def (strcasestr)
diff --git a/sysdeps/powerpc/powerpc64/power8/strchr.S b/sysdeps/powerpc/powerpc64/power8/strchr.S
new file mode 100644
index 0000000000..c5e28d9c9e
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strchr.S
@@ -0,0 +1,377 @@
+/* Optimized strchr implementation for PowerPC64/POWER8.
+ Copyright (C) 2016-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#ifdef USE_AS_STRCHRNUL
+# ifndef STRCHRNUL
+# define FUNC_NAME __strchrnul
+# else
+# define FUNC_NAME STRCHRNUL
+# endif
+#else
+# ifndef STRCHR
+# define FUNC_NAME strchr
+# else
+# define FUNC_NAME STRCHR
+# endif
+#endif /* !USE_AS_STRCHRNUL */
+
+/* char * [r3] strchr (const char *s [r3], int c [r4]) */
+/* TODO: change these to the actual instructions when the minimum required
+ binutils allows it. */
+#define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16)))
+#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
+#define VBPERMQ(t,a,b) .long (0x1000054c \
+ | ((t)<<(32-11)) \
+ | ((a)<<(32-16)) \
+ | ((b)<<(32-21)) )
+/* TODO: change this to .machine power8 when the minimum required binutils
+ allows it. */
+ .machine power7
+ENTRY_TOCLESS (FUNC_NAME)
+ CALL_MCOUNT 2
+ dcbt 0,r3
+ clrrdi r8,r3,3 /* Align the address to doubleword boundary. */
+ cmpdi cr7,r4,0 /* Is c the null byte? */
+ ld r12,0(r8) /* Load doubleword from memory. */
+ li r0,0 /* Doubleword with null chars to use
+ with cmpb. */
+
+ rlwinm r6,r3,3,26,28 /* Calculate padding in bits: (r3 & 7) * 8. */
+
+ beq cr7,L(null_match) /* c == 0 is handled separately. */
+
+ /* Replicate byte to doubleword. */
+ insrdi r4,r4,8,48
+ insrdi r4,r4,16,32
+ insrdi r4,r4,32,0
+
+ /* Now r4 has a doubleword of c bytes and r0 has
+ a doubleword of null bytes. */
+
+ cmpb r10,r12,r4 /* Compare each byte against c byte. */
+ cmpb r11,r12,r0 /* Compare each byte against null byte. */
+
+ /* Move the doublewords left and right to discard the bits that are
+ not part of the string and bring them back as zeros. */
+#ifdef __LITTLE_ENDIAN__
+ srd r10,r10,r6
+ srd r11,r11,r6
+ sld r10,r10,r6
+ sld r11,r11,r6
+#else
+ sld r10,r10,r6
+ sld r11,r11,r6
+ srd r10,r10,r6
+ srd r11,r11,r6
+#endif
+ or r5,r10,r11 /* OR the results to speed things up. */
+ cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes
+ have been found. */
+ bne cr7,L(done)
+
+ mtcrf 0x01,r8 /* Copy the low 4 bits of r8 into cr7. */
+
+ /* Is the next doubleword (r8 + 8) on a quadword boundary? If so, skip
+ to the main loop. Otherwise, go through the alignment code. */
+
+ bt 28,L(loop)
+
+ /* Handle the next doubleword so the loop starts quadword-aligned. */
+ ldu r12,8(r8)
+ cmpb r10,r12,r4
+ cmpb r11,r12,r0
+ or r5,r10,r11
+ cmpdi cr7,r5,0
+ bne cr7,L(done)
+ b L(loop) /* We branch here (rather than falling through)
+ to skip the nops due to heavy alignment
+ of the loop below. */
+
+ .p2align 5
+L(loop):
+ /* Load two doublewords, compare and merge in a
+ single register for speed. This is an attempt
+ to speed up the c/null-checking process for bigger strings. */
+ ld r12,8(r8)
+ ldu r9,16(r8)
+ cmpb r10,r12,r4
+ cmpb r11,r12,r0
+ cmpb r6,r9,r4
+ cmpb r7,r9,r0
+ or r5,r10,r11
+ or r9,r6,r7
+ or r12,r5,r9
+ cmpdi cr7,r12,0
+ beq cr7,L(vector) /* Nothing found in these 16 bytes. */
+ /* OK, one (or both) of the doublewords contains a c/null byte. Check
+ the first doubleword and decrement the address in case the first
+ doubleword really contains a c/null byte. */
+
+ cmpdi cr6,r5,0
+ addi r8,r8,-8
+ bne cr6,L(done)
+
+ /* The c/null byte must be in the second doubleword. Adjust the
+ address again and move the results of cmpb to r10/r11 so we can
+ calculate the pointer. */
+
+ mr r10,r6
+ mr r11,r7
+ addi r8,r8,8
+#ifdef USE_AS_STRCHRNUL
+ mr r5, r9 /* r5 = merged c/null mask of the second doubleword. */
+#endif
+ /* r10/r11 have the output of the cmpb instructions, that is,
+ 0xff in the same position as the c/null byte in the original
+ doubleword from the string. Use that to calculate the pointer. */
+L(done):
+#ifdef USE_AS_STRCHRNUL
+ mr r10, r5 /* strchrnul stops at the first c or null byte. */
+#endif
+#ifdef __LITTLE_ENDIAN__
+ addi r3,r10,-1 /* Form a mask from trailing zeros. */
+ andc r3,r3,r10
+ popcntd r0,r3 /* Count the bits in the mask. */
+# ifndef USE_AS_STRCHRNUL
+ addi r4,r11,-1 /* Same mask, built from the null matches. */
+ andc r4,r4,r11
+ cmpld cr7,r3,r4
+ bgt cr7,L(no_match) /* A null byte comes before any c byte. */
+# endif
+#else
+ cntlzd r0,r10 /* Count leading zeros before c matches. */
+# ifndef USE_AS_STRCHRNUL
+ cmpld cr7,r11,r10
+ bgt cr7,L(no_match) /* A null byte comes before any c byte. */
+# endif
+#endif
+ srdi r0,r0,3 /* Convert leading/trailing zeros to bytes. */
+ add r3,r8,r0 /* Return address of the matching c byte
+ (or of the null byte, for strchrnul). */
+ blr
+
+ /* Stay in the GPR loop until the next address is 32B-aligned, then move to the vectorized loop. */
+ .p2align 5
+L(vector):
+ addi r3, r8, 8
+ andi. r10, r3, 31
+ bne cr0, L(loop)
+ vspltisb v0, 0 /* v0 = quadword of null bytes. */
+ /* Precompute vbpermq constant. */
+ vspltisb v10, 3
+ lvsl v11, r0, r0
+ vslb v10, v11, v10
+ MTVRD(v1,r4) /* Move the doubleword of c bytes to v1. */
+ li r5, 16
+ vspltb v1, v1, 7 /* Replicate c into every byte of v1. */
+ /* Compare 32 bytes in each loop. */
+L(continue):
+ lvx v4, 0, r3
+ lvx v5, r3, r5
+ vcmpequb v2, v0, v4 /* v2/v3 = null matches. */
+ vcmpequb v3, v0, v5
+ vcmpequb v6, v1, v4 /* v6/v7 = c matches. */
+ vcmpequb v7, v1, v5
+ vor v8, v2, v3
+ vor v9, v6, v7
+ vor v11, v8, v9
+ vcmpequb. v11, v0, v11 /* Is the merged match mask all zeros? */
+ addi r3, r3, 32
+ blt cr6, L(continue) /* Yes: no c/null byte in these 32 bytes. */
+ /* One (or both) of the quadwords contains a c/null byte. */
+ addi r3, r3, -32
+#ifndef USE_AS_STRCHRNUL
+ vcmpequb. v11, v0, v9
+ blt cr6, L(no_match) /* No c byte here, so a null byte was hit. */
+#endif
+ /* Permute the first bit of each byte into bits 48-63. */
+ VBPERMQ(v2, v2, v10)
+ VBPERMQ(v3, v3, v10)
+ VBPERMQ(v6, v6, v10)
+ VBPERMQ(v7, v7, v10)
+ /* Shift each component into its correct position for merging. */
+#ifdef __LITTLE_ENDIAN__
+ vsldoi v3, v3, v3, 2
+ vsldoi v7, v7, v7, 2
+#else
+ vsldoi v2, v2, v2, 6
+ vsldoi v3, v3, v3, 4
+ vsldoi v6, v6, v6, 6
+ vsldoi v7, v7, v7, 4
+#endif
+
+ /* Merge the results and move to a GPR. */
+ vor v1, v3, v2
+ vor v2, v6, v7
+ vor v4, v1, v2
+ MFVRD(r5, v4)
+#ifdef __LITTLE_ENDIAN__
+ addi r6, r5, -1 /* Form a mask from trailing zeros. */
+ andc r6, r6, r5
+ popcntd r6, r6 /* Count the bits in the mask. */
+#else
+ cntlzd r6, r5 /* Count leading zeros before the match. */
+#endif
+ add r3, r3, r6 /* Compute the address of the first c/null byte. */
+ /* Return NULL if null found before c. */
+#ifndef USE_AS_STRCHRNUL
+ lbz r4, 0(r3)
+ cmpdi cr7, r4, 0
+ beq cr7, L(no_match)
+#endif
+ blr
+
+#ifndef USE_AS_STRCHRNUL
+ .align 4
+L(no_match):
+ li r3,0 /* Return NULL: c was not found. */
+ blr
+#endif
+
+/* We are here because strchr was called with a null byte. */
+ .align 4
+L(null_match):
+ /* r0 has a doubleword of null bytes. */
+
+ cmpb r5,r12,r0 /* Compare each byte against null bytes. */
+
+ /* Move the doubleword left and right to discard the bits that are
+ not part of the string and bring them back as zeros. */
+#ifdef __LITTLE_ENDIAN__
+ srd r5,r5,r6
+ sld r5,r5,r6
+#else
+ sld r5,r5,r6
+ srd r5,r5,r6
+#endif
+ cmpdi cr7,r5,0 /* If r5 == 0, no null bytes
+ have been found. */
+ bne cr7,L(done_null)
+
+ mtcrf 0x01,r8 /* Copy the low 4 bits of r8 into cr7. */
+
+ /* Is the next doubleword (r8 + 8) on a quadword boundary? If so, skip
+ to the main loop. Otherwise, go through the alignment code. */
+
+ bt 28,L(loop_null)
+
+ /* Handle the next doubleword so the loop starts quadword-aligned. */
+ ldu r12,8(r8)
+ cmpb r5,r12,r0
+ cmpdi cr7,r5,0
+ bne cr7,L(done_null)
+ b L(loop_null) /* We branch here (rather than falling through)
+ to skip the nops due to heavy alignment
+ of the loop below. */
+
+ /* Main loop to look for the end of the string. Since it's a
+ small loop (< 8 instructions), align it to 32-bytes. */
+ .p2align 5
+L(loop_null):
+ /* Load two doublewords, compare and merge in a
+ single register for speed. This is an attempt
+ to speed up the null-checking process for bigger strings. */
+ ld r12,8(r8)
+ ldu r11,16(r8)
+ cmpb r5,r12,r0
+ cmpb r10,r11,r0
+ or r6,r5,r10
+ cmpdi cr7,r6,0
+ beq cr7,L(vector1) /* No null byte in these 16 bytes. */
+
+ /* OK, one (or both) of the doublewords contains a null byte. Check
+ the first doubleword and decrement the address in case the first
+ doubleword really contains a null byte. */
+
+ cmpdi cr6,r5,0
+ addi r8,r8,-8
+ bne cr6,L(done_null)
+
+ /* The null byte must be in the second doubleword. Adjust the address
+ again and move the result of cmpb to r5 so we can calculate the
+ pointer. */
+
+ mr r5,r10
+ addi r8,r8,8
+
+ /* r5 has the output of the cmpb instruction, that is, it contains
+ 0xff in the same position as the null byte in the original
+ doubleword from the string. Use that to calculate the pointer. */
+L(done_null):
+#ifdef __LITTLE_ENDIAN__
+ addi r0,r5,-1 /* Form a mask from trailing zeros. */
+ andc r0,r0,r5
+ popcntd r0,r0 /* Count the bits in the mask. */
+#else
+ cntlzd r0,r5 /* Count leading zeros before the match. */
+#endif
+ srdi r0,r0,3 /* Convert leading/trailing zeros to bytes. */
+ add r3,r8,r0 /* Return address of the matching null byte. */
+ blr
+ .p2align 5
+L(vector1):
+ addi r3, r8, 8
+ andi. r10, r3, 31
+ bne cr0, L(loop_null) /* Not 32B-aligned yet: stay in the GPR loop. */
+ vspltisb v8, -1 /* NOTE(review): v8 is overwritten by the vor below before it is read. */
+ vspltisb v0, 0 /* v0 = quadword of null bytes. */
+ vspltisb v10, 3 /* Precompute vbpermq constant. */
+ lvsl v11, r0, r0
+ vslb v10, v11, v10
+ li r5, 16
+L(continue1):
+ lvx v4, 0, r3 /* Compare 32 bytes in each loop. */
+ lvx v5, r3, r5
+ vcmpequb v2, v0, v4
+ vcmpequb v3, v0, v5
+ vor v8, v2, v3
+ vcmpequb. v11, v0, v8 /* Is the merged match mask all zeros? */
+ addi r3, r3, 32
+ blt cr6, L(continue1) /* Yes: no null byte in these 32 bytes. */
+ addi r3, r3, -32
+L(end1):
+ VBPERMQ(v2, v2, v10)
+ VBPERMQ(v3, v3, v10)
+ /* Shift each component into its correct position for merging. */
+#ifdef __LITTLE_ENDIAN__
+ vsldoi v3, v3, v3, 2
+#else
+ vsldoi v2, v2, v2, 6
+ vsldoi v3, v3, v3, 4
+#endif
+
+ /* Merge the results and move to a GPR. */
+ vor v4, v3, v2
+ MFVRD(r5, v4)
+#ifdef __LITTLE_ENDIAN__
+ addi r6, r5, -1 /* Form a mask from trailing zeros. */
+ andc r6, r6, r5
+ popcntd r6, r6 /* Count the bits in the mask. */
+#else
+ cntlzd r6, r5 /* Count leading zeros before the match. */
+#endif
+ add r3, r3, r6 /* Compute the address of the null byte. */
+ blr
+END (FUNC_NAME)
+
+#ifndef USE_AS_STRCHRNUL
+weak_alias (strchr, index)
+libc_hidden_builtin_def (strchr)
+#endif
diff --git a/sysdeps/powerpc/powerpc64/power8/strchrnul.S b/sysdeps/powerpc/powerpc64/power8/strchrnul.S
new file mode 100644
index 0000000000..022ad67a6b
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strchrnul.S
@@ -0,0 +1,23 @@
+/* Optimized strchrnul implementation for PowerPC64/POWER8.
+ Copyright (C) 2016-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define USE_AS_STRCHRNUL 1
+#include <sysdeps/powerpc/powerpc64/power8/strchr.S>
+
+weak_alias (__strchrnul,strchrnul)
+libc_hidden_builtin_def (__strchrnul)
diff --git a/sysdeps/powerpc/powerpc64/power8/strcmp.S b/sysdeps/powerpc/powerpc64/power8/strcmp.S
index 4d6c477194..15e7351d1b 100644
--- a/sysdeps/powerpc/powerpc64/power8/strcmp.S
+++ b/sysdeps/powerpc/powerpc64/power8/strcmp.S
@@ -1,5 +1,5 @@
/* Optimized strcmp implementation for PowerPC64/POWER8.
- Copyright (C) 2015-2016 Free Software Foundation, Inc.
+ Copyright (C) 2015-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -18,6 +18,10 @@
#include <sysdep.h>
+#ifndef STRCMP
+# define STRCMP strcmp
+#endif
+
/* Implements the function
size_t [r3] strcmp (const char *s1 [r3], const char *s2 [r4])
@@ -27,24 +31,24 @@
64K as default, the page cross handling assumes minimum page size of
4k. */
-EALIGN (strcmp, 4, 0)
+ENTRY_TOCLESS (STRCMP, 4)
li r0,0
- /* Check if [s1]+32 or [s2]+32 will cross a 4K page boundary using
+ /* Check if [s1]+16 or [s2]+16 will cross a 4K page boundary using
the code:
(((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE))
- with PAGE_SIZE being 4096 and ITER_SIZE begin 32. */
+ with PAGE_SIZE being 4096 and ITER_SIZE being 16. */
rldicl r7,r3,0,52
rldicl r9,r4,0,52
- cmpldi cr7,r7,4096-32
+ cmpldi cr7,r7,4096-16
bgt cr7,L(pagecross_check)
- cmpldi cr5,r9,4096-32
+ cmpldi cr5,r9,4096-16
bgt cr5,L(pagecross_check)
- /* For short string up to 32 bytes, load both s1 and s2 using
+ /* For short string up to 16 bytes, load both s1 and s2 using
unaligned dwords and compare. */
ld r8,0(r3)
ld r10,0(r4)
@@ -60,25 +64,11 @@ EALIGN (strcmp, 4, 0)
orc. r9,r12,r11
bne cr0,L(different_nocmpb)
- ld r8,16(r3)
- ld r10,16(r4)
- cmpb r12,r8,r0
- cmpb r11,r8,r10
- orc. r9,r12,r11
- bne cr0,L(different_nocmpb)
-
- ld r8,24(r3)
- ld r10,24(r4)
- cmpb r12,r8,r0
- cmpb r11,r8,r10
- orc. r9,r12,r11
- bne cr0,L(different_nocmpb)
-
- addi r7,r3,32
- addi r4,r4,32
+ addi r7,r3,16
+ addi r4,r4,16
L(align_8b):
- /* Now it has checked for first 32 bytes, align source1 to doubleword
+ /* Now it has checked for first 16 bytes, align source1 to doubleword
and adjust source2 address. */
rldicl r9,r7,0,61 /* source1 alignment to doubleword */
subf r4,r9,r4 /* Adjust source2 address based on source1
@@ -253,5 +243,5 @@ L(pagecross_retdiff):
L(pagecross_nullfound):
li r3,0
b L(pagecross_retdiff)
-END (strcmp)
+END (STRCMP)
libc_hidden_builtin_def (strcmp)
diff --git a/sysdeps/powerpc/powerpc64/power8/strcpy.S b/sysdeps/powerpc/powerpc64/power8/strcpy.S
index 5130831c6a..956faf714f 100644
--- a/sysdeps/powerpc/powerpc64/power8/strcpy.S
+++ b/sysdeps/powerpc/powerpc64/power8/strcpy.S
@@ -1,5 +1,5 @@
/* Optimized strcpy/stpcpy implementation for PowerPC64/POWER8.
- Copyright (C) 2015-2016 Free Software Foundation, Inc.
+ Copyright (C) 2015-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -19,10 +19,18 @@
#include <sysdep.h>
#ifdef USE_AS_STPCPY
-# define FUNC_NAME __stpcpy
+# ifndef STPCPY
+# define FUNC_NAME __stpcpy
+# else
+# define FUNC_NAME STPCPY
+# endif
#else
-# define FUNC_NAME strcpy
-#endif
+# ifndef STRCPY
+# define FUNC_NAME strcpy
+# else
+# define FUNC_NAME STRCPY
+# endif
+#endif /* !USE_AS_STPCPY */
/* Implements the function
@@ -39,8 +47,8 @@
64K as default, the page cross handling assumes minimum page size of
4k. */
- .machine power7
-EALIGN (FUNC_NAME, 4, 0)
+ .machine power8
+ENTRY_TOCLESS (FUNC_NAME, 4)
li r0,0 /* Doubleword with null chars to use
with cmpb. */
@@ -112,7 +120,7 @@ L(pagecross):
ldu r8, 8(r7)
L(loop_before):
- /* Save the two doublewords readed from source and align the source
+ /* Save the two doublewords read from source and align the source
to 16 bytes for the loop. */
mr r11,r3
std r12,0(r11)
@@ -121,7 +129,150 @@ L(loop_before):
rldicl r9,r4,0,60
subf r7,r9,r7
subf r11,r9,r11
- b L(loop_start)
+ /* Source is adjusted to 16B alignment and destination r11 is
+ also moved based on that adjustment. Now check if r11 is
+ also 16B aligned to move to vectorized loop. */
+ andi. r6, r11, 0xF
+ bne L(loop_start)
+
+ /* Prepare for the loop. */
+ subf r4, r9, r4 /* Adjust r4 based on alignment. */
+ li r7, 16 /* Load required offsets. */
+ li r8, 32
+ li r9, 48
+ vspltisb v0, 0
+ addi r4, r4, 16
+ /* Are we 64-byte aligned? If so, jump to the vectorized loop.
+ Else copy 16B till r4 is 64B aligned. */
+ andi. r6, r4, 63
+ beq L(qw_loop)
+
+ lvx v6, 0, r4 /* Load 16 bytes from memory. */
+ vcmpequb. v5, v0, v6 /* Check for null. */
+ bne cr6, L(qw_done)
+ stvx v6, 0, r11 /* Store 16 bytes. */
+ addi r4, r4, 16 /* Increment the address. */
+ addi r11, r11, 16
+ andi. r6, r4, 63
+ beq L(qw_loop)
+
+ lvx v6, 0, r4
+ vcmpequb. v5, v0, v6
+ bne cr6, L(qw_done)
+ stvx v6, 0, r11
+ addi r4, r4, 16
+ addi r11, r11, 16
+ andi. r6, r4, 63
+ beq L(qw_loop)
+
+ lvx v6, 0, r4
+ vcmpequb. v5, v0, v6
+ bne cr6, L(qw_done)
+ stvx v6, 0, r11
+ addi r4, r4, 16
+ addi r11, r11, 16
+
+ .align 4
+L(qw_loop):
+ lvx v1, r4, r0 /* Load 4 quadwords. */
+ lvx v2, r4, r7
+ lvx v3, r4, r8
+ lvx v4, r4, r9
+ vminub v5, v1, v2 /* Compare and merge into one VR for speed. */
+ vminub v8, v3, v4
+ vminub v7, v5, v8
+ vcmpequb. v7, v7, v0 /* Check for NULLs. */
+ bne cr6, L(qw_loop_done)
+ stvx v1, r11, r0 /* Store 4 quadwords. */
+ stvx v2, r11, r7
+ stvx v3, r11, r8
+ stvx v4, r11, r9
+ addi r4, r4, 64 /* Adjust address for the next iteration. */
+ addi r11, r11, 64 /* Adjust address for the next iteration. */
+
+ lvx v1, r4, r0 /* Load 4 quadwords. */
+ lvx v2, r4, r7
+ lvx v3, r4, r8
+ lvx v4, r4, r9
+ vminub v5, v1, v2 /* Compare and merge into one VR for speed. */
+ vminub v8, v3, v4
+ vminub v7, v5, v8
+ vcmpequb. v7, v7, v0 /* Check for NULLs. */
+ bne cr6, L(qw_loop_done)
+ stvx v1, r11, r0 /* Store 4 quadwords. */
+ stvx v2, r11, r7
+ stvx v3, r11, r8
+ stvx v4, r11, r9
+ addi r4, r4, 64 /* Adjust address for the next iteration. */
+ addi r11, r11, 64 /* Adjust address for the next iteration. */
+
+ lvx v1, r4, r0 /* Load 4 quadwords. */
+ lvx v2, r4, r7
+ lvx v3, r4, r8
+ lvx v4, r4, r9
+ vminub v5, v1, v2 /* Compare and merge into one VR for speed. */
+ vminub v8, v3, v4
+ vminub v7, v5, v8
+ vcmpequb. v7, v7, v0 /* Check for NULLs. */
+ bne cr6, L(qw_loop_done)
+ stvx v1, r11, r0 /* Store 4 quadwords. */
+ stvx v2, r11, r7
+ stvx v3, r11, r8
+ stvx v4, r11, r9
+ addi r4, r4, 64 /* Adjust address for the next iteration. */
+ addi r11, r11, 64 /* Adjust address for the next iteration. */
+ b L(qw_loop)
+
+ .align 4
+L(qw_loop_done):
+ /* Null found in one of the 4 loads. */
+ vcmpequb. v7, v1, v0
+ vor v6, v1, v1
+ bne cr6, L(qw_done)
+ /* Not on the first 16B, So store it. */
+ stvx v1, r11, r0
+ addi r4, r4, 16
+ addi r11, r11, 16
+ vcmpequb. v7, v2, v0
+ vor v6, v2, v2
+ bne cr6, L(qw_done)
+ /* Not on the second 16B, So store it. */
+ stvx v2, r11, r0
+ addi r4, r4, 16
+ addi r11, r11, 16
+ vcmpequb. v7, v3, v0
+ vor v6, v3, v3
+ bne cr6, L(qw_done)
+ /* Not on the third 16B, So store it. */
+ stvx v6, r11, r0
+ addi r4, r4, 16
+ addi r11, r11, 16
+ vor v6, v4, v4
+
+ .align 4
+L(qw_done):
+ mr r7, r4
+ /* Move the result to GPR. */
+#ifdef __LITTLE_ENDIAN__
+ vsldoi v4, v6, v0, 8
+ mfvrd r12, v4
+#else
+ mfvrd r12, v6
+#endif
+ /* Check for null in the first 8 bytes. */
+ cmpb r10, r12, r0
+ cmpdi cr6, r10, 0
+ bne cr6, L(done2)
+ /* Null found in second doubleword. */
+#ifdef __LITTLE_ENDIAN__
+ mfvrd r6, v6
+#else
+ vsldoi v6, v6, v0, 8
+ mfvrd r6, v6
+#endif
+ cmpb r10, r6, r0
+ addi r7, r7, 8
+ b L(done2)
.align 5
L(loop):
diff --git a/sysdeps/powerpc/powerpc64/power8/strcspn.S b/sysdeps/powerpc/powerpc64/power8/strcspn.S
new file mode 100644
index 0000000000..c2d130e7db
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strcspn.S
@@ -0,0 +1,20 @@
+/* Optimized strcspn implementation for PowerPC64/POWER8.
+ Copyright (C) 2016-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define USE_AS_STRCSPN 1
+#include <sysdeps/powerpc/powerpc64/power8/strspn.S>
diff --git a/sysdeps/powerpc/powerpc64/power8/strlen.S b/sysdeps/powerpc/powerpc64/power8/strlen.S
new file mode 100644
index 0000000000..719b5c604c
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strlen.S
@@ -0,0 +1,290 @@
+/* Optimized strlen implementation for PowerPC64/POWER8 using a vectorized
+ loop.
+ Copyright (C) 2016-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* TODO: change these to the actual instructions when the minimum required
+ binutils allows it. */
+#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
+#define VBPERMQ(t,a,b) .long (0x1000054c \
+ | ((t)<<(32-11)) \
+ | ((a)<<(32-16)) \
+ | ((b)<<(32-21)) )
+
+/* size_t [r3] strlen (const char *s [r3]) */
+
+#ifndef STRLEN
+# define STRLEN strlen
+#endif
+
+/* TODO: change this to .machine power8 when the minimum required binutils
+ allows it. */
+ .machine power7
+ENTRY_TOCLESS (STRLEN, 4)
+ CALL_MCOUNT 1
+ dcbt 0,r3
+ clrrdi r4,r3,3 /* Align the address to doubleword boundary. */
+ rlwinm r6,r3,3,26,28 /* Calculate padding in bits: (r3 & 7) * 8. */
+ li r0,0 /* Doubleword with null chars to use
+ with cmpb. */
+ li r5,-1 /* MASK = 0xffffffffffffffff. */
+ ld r12,0(r4) /* Load doubleword from memory. */
+#ifdef __LITTLE_ENDIAN__
+ sld r5,r5,r6 /* MASK = MASK << padding. */
+#else
+ srd r5,r5,r6 /* MASK = MASK >> padding. */
+#endif
+ orc r9,r12,r5 /* Mask bits that are not part of the string. */
+ cmpb r10,r9,r0 /* Check for null bytes in DWORD1. */
+ cmpdi cr7,r10,0 /* If r10 == 0, no null's have been found. */
+ bne cr7,L(done)
+
+ /* For shorter strings (< 64 bytes), we will not use vector registers,
+ as the overhead isn't worth it. So, let's use GPRs instead. This
+ will be done the same way as we do in the POWER7 implementation.
+ Let's see if we are aligned to a quadword boundary. If so, we can
+ jump to the first (non-vectorized) loop. Otherwise, we have to
+ handle the next DWORD first. */
+ mtcrf 0x01,r4 /* Copy the low 4 bits of r4 into cr7. */
+ mr r9,r4
+ addi r9,r9,8 /* r9 = address of the next doubleword to load. */
+ bt 28,L(align64)
+
+ /* Handle the next 8 bytes so we are aligned to a quadword
+ boundary. */
+ ldu r5,8(r4)
+ cmpb r10,r5,r0
+ cmpdi cr7,r10,0
+ addi r9,r9,8
+ bne cr7,L(done)
+
+L(align64):
+ /* Proceed to the old (POWER7) implementation, checking two doublewords
+ per iteration. For the first 56 bytes, we will just check for null
+ characters. After that, we will also check if we are 64-byte aligned
+ so we can jump to the vectorized implementation. We will unroll
+ these loops to avoid excessive branching. */
+ ld r6,8(r4)
+ ldu r5,16(r4)
+ cmpb r10,r6,r0
+ cmpb r11,r5,r0
+ or r5,r10,r11
+ cmpdi cr7,r5,0
+ addi r9,r9,16
+ bne cr7,L(dword_zero)
+
+ ld r6,8(r4)
+ ldu r5,16(r4)
+ cmpb r10,r6,r0
+ cmpb r11,r5,r0
+ or r5,r10,r11
+ cmpdi cr7,r5,0
+ addi r9,r9,16
+ bne cr7,L(dword_zero)
+
+ ld r6,8(r4)
+ ldu r5,16(r4)
+ cmpb r10,r6,r0
+ cmpb r11,r5,r0
+ or r5,r10,r11
+ cmpdi cr7,r5,0
+ addi r9,r9,16
+ bne cr7,L(dword_zero)
+
+ /* Are we 64-byte aligned? If so, jump to the vectorized loop.
+ Note: aligning to 64-byte will necessarily slow down performance for
+ strings around 64 bytes in length due to the extra comparisons
+ required to check alignment for the vectorized loop. This is a
+ necessary tradeoff we are willing to take in order to speed up the
+ calculation for larger strings. */
+ andi. r10,r9,63
+ beq cr0,L(preloop)
+ ld r6,8(r4)
+ ldu r5,16(r4)
+ cmpb r10,r6,r0
+ cmpb r11,r5,r0
+ or r5,r10,r11
+ cmpdi cr7,r5,0
+ addi r9,r9,16
+ bne cr7,L(dword_zero)
+
+ andi. r10,r9,63
+ beq cr0,L(preloop)
+ ld r6,8(r4)
+ ldu r5,16(r4)
+ cmpb r10,r6,r0
+ cmpb r11,r5,r0
+ or r5,r10,r11
+ cmpdi cr7,r5,0
+ addi r9,r9,16
+ bne cr7,L(dword_zero)
+
+ andi. r10,r9,63
+ beq cr0,L(preloop)
+ ld r6,8(r4)
+ ldu r5,16(r4)
+ cmpb r10,r6,r0
+ cmpb r11,r5,r0
+ or r5,r10,r11
+ cmpdi cr7,r5,0
+ addi r9,r9,16
+
+ /* At this point, we are necessarily 64-byte aligned. If no zeroes were
+ found, jump to the vectorized loop. */
+ beq cr7,L(preloop)
+
+L(dword_zero):
+ /* OK, one (or both) of the doublewords contains a null byte. Check
+ the first doubleword and decrement the address in case the first
+ doubleword really contains a null byte. */
+
+ cmpdi cr6,r10,0 /* r10 = cmpb result of the first doubleword. */
+ addi r4,r4,-8
+ bne cr6,L(done)
+
+ /* The null byte must be in the second doubleword. Adjust the address
+ again and move the result of cmpb to r10 so we can calculate the
+ length. */
+
+ mr r10,r11
+ addi r4,r4,8
+
+ /* If the null byte was found in the non-vectorized code, compute the
+ final length. r10 has the output of the cmpb instruction, that is,
+ it contains 0xff in the same position as the null byte in the
+ original doubleword from the string. Use that to calculate the
+ length. */
+L(done):
+#ifdef __LITTLE_ENDIAN__
+ addi r9, r10,-1 /* Form a mask from trailing zeros. */
+ andc r9, r9,r10
+ popcntd r0, r9 /* Count the bits in the mask. */
+#else
+ cntlzd r0,r10 /* Count leading zeros before the match. */
+#endif
+ subf r5,r3,r4 /* Offset of this doubleword from the start of s. */
+ srdi r0,r0,3 /* Convert leading/trailing zeros to bytes. */
+ add r3,r5,r0 /* Compute final length. */
+ blr
+
+ /* Vectorized implementation starts here. */
+ .p2align 4
+L(preloop):
+ /* Set up for the loop. */
+ mr r4,r9
+ li r7, 16 /* Load required offsets. */
+ li r8, 32
+ li r9, 48
+ li r12, 8 /* NOTE(review): r12 is not read again in this function. */
+ vxor v0,v0,v0 /* VR with null chars to use with
+ vcmpequb. */
+
+ /* Main loop to look for the end of the string. We will read in
+ 64-byte chunks. Align it to 32 bytes and unroll it 3 times to
+ leverage the icache performance. */
+ .p2align 5
+L(loop):
+ lvx v1,r4,r0 /* Load 4 quadwords. */
+ lvx v2,r4,r7
+ lvx v3,r4,r8
+ lvx v4,r4,r9
+ vminub v5,v1,v2 /* Compare and merge into one VR for speed. */
+ vminub v6,v3,v4
+ vminub v7,v5,v6
+ vcmpequb. v7,v7,v0 /* Check for NULLs. */
+ addi r4,r4,64 /* Adjust address for the next iteration. */
+ bne cr6,L(vmx_zero)
+
+ lvx v1,r4,r0 /* Load 4 quadwords. */
+ lvx v2,r4,r7
+ lvx v3,r4,r8
+ lvx v4,r4,r9
+ vminub v5,v1,v2 /* Compare and merge into one VR for speed. */
+ vminub v6,v3,v4
+ vminub v7,v5,v6
+ vcmpequb. v7,v7,v0 /* Check for NULLs. */
+ addi r4,r4,64 /* Adjust address for the next iteration. */
+ bne cr6,L(vmx_zero)
+
+ lvx v1,r4,r0 /* Load 4 quadwords. */
+ lvx v2,r4,r7
+ lvx v3,r4,r8
+ lvx v4,r4,r9
+ vminub v5,v1,v2 /* Compare and merge into one VR for speed. */
+ vminub v6,v3,v4
+ vminub v7,v5,v6
+ vcmpequb. v7,v7,v0 /* Check for NULLs. */
+ addi r4,r4,64 /* Adjust address for the next iteration. */
+ beq cr6,L(loop)
+
+L(vmx_zero):
+ /* OK, we found a null byte. Let's look for it in the current 64-byte
+ block and mark it in its corresponding VR. */
+ vcmpequb v1,v1,v0
+ vcmpequb v2,v2,v0
+ vcmpequb v3,v3,v0
+ vcmpequb v4,v4,v0
+
+ /* We will now 'compress' the result into a single doubleword, so it
+ can be moved to a GPR for the final calculation. First, we
+ generate an appropriate mask for vbpermq, so we can permute bits into
+ the first halfword. */
+ vspltisb v10,3
+ lvsl v11,r0,r0
+ vslb v10,v11,v10
+
+ /* Permute the first bit of each byte into bits 48-63. */
+ VBPERMQ(v1,v1,v10)
+ VBPERMQ(v2,v2,v10)
+ VBPERMQ(v3,v3,v10)
+ VBPERMQ(v4,v4,v10)
+
+ /* Shift each component into its correct position for merging. */
+#ifdef __LITTLE_ENDIAN__
+ vsldoi v2,v2,v2,2
+ vsldoi v3,v3,v3,4
+ vsldoi v4,v4,v4,6
+#else
+ vsldoi v1,v1,v1,6
+ vsldoi v2,v2,v2,4
+ vsldoi v3,v3,v3,2
+#endif
+
+ /* Merge the results and move to a GPR. */
+ vor v1,v2,v1
+ vor v2,v3,v4
+ vor v4,v1,v2
+ MFVRD(r10,v4)
+
+ /* Adjust address to the beginning of the current 64-byte block. */
+ addi r4,r4,-64
+
+#ifdef __LITTLE_ENDIAN__
+ addi r9, r10,-1 /* Form a mask from trailing zeros. */
+ andc r9, r9,r10
+ popcntd r0, r9 /* Count the bits in the mask. */
+#else
+ cntlzd r0,r10 /* Count leading zeros before the match. */
+#endif
+ subf r5,r3,r4 /* Offset of this 64-byte block from the start of s. */
+ add r3,r5,r0 /* Compute final length. */
+ blr
+
+END (STRLEN)
+libc_hidden_builtin_def (strlen)
diff --git a/sysdeps/powerpc/powerpc64/power8/strncase.S b/sysdeps/powerpc/powerpc64/power8/strncase.S
new file mode 100644
index 0000000000..050b63ab91
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strncase.S
@@ -0,0 +1,20 @@
+/* Optimized strncasecmp implementation for POWER8.
+ Copyright (C) 2016-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define USE_AS_STRNCASECMP 1
+#include <sysdeps/powerpc/powerpc64/power8/strcasecmp.S>
diff --git a/sysdeps/powerpc/powerpc64/power8/strncmp.S b/sysdeps/powerpc/powerpc64/power8/strncmp.S
index 1ce9e3fc65..2eefa4a2ba 100644
--- a/sysdeps/powerpc/powerpc64/power8/strncmp.S
+++ b/sysdeps/powerpc/powerpc64/power8/strncmp.S
@@ -1,5 +1,5 @@
/* Optimized strncmp implementation for PowerPC64/POWER8.
- Copyright (C) 2015-2016 Free Software Foundation, Inc.
+ Copyright (C) 2015-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -18,6 +18,10 @@
#include <sysdep.h>
+#ifndef STRNCMP
+# define STRNCMP strncmp
+#endif
+
/* Implements the function
int [r3] strncmp (const char *s1 [r3], const char *s2 [r4], size_t [r5] n)
@@ -28,7 +32,7 @@
4k. */
.machine power7
-EALIGN (strncmp, 4, 0)
+ENTRY_TOCLESS (STRNCMP, 4)
/* Check if size is 0. */
mr. r10,r5
beq cr0,L(ret0)
@@ -319,5 +323,5 @@ L(byte_ne_4):
extsw r10,r9
mr r9,r8
b L(size_reached_1)
-END(strncmp)
+END(STRNCMP)
libc_hidden_builtin_def(strncmp)
diff --git a/sysdeps/powerpc/powerpc64/power8/strncpy.S b/sysdeps/powerpc/powerpc64/power8/strncpy.S
index 17c3afb5fe..e8c5c71f87 100644
--- a/sysdeps/powerpc/powerpc64/power8/strncpy.S
+++ b/sysdeps/powerpc/powerpc64/power8/strncpy.S
@@ -1,5 +1,5 @@
/* Optimized strncpy/stpncpy implementation for PowerPC64/POWER8.
- Copyright (C) 2015-2016 Free Software Foundation, Inc.
+ Copyright (C) 2015-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -19,11 +19,32 @@
#include <sysdep.h>
#ifdef USE_AS_STPNCPY
-# define FUNC_NAME __stpncpy
+# ifndef STPNCPY
+# define FUNC_NAME __stpncpy
+# else
+# define FUNC_NAME STPNCPY
+# endif
#else
-# define FUNC_NAME strncpy
+# ifndef STRNCPY
+# define FUNC_NAME strncpy
+# else
+# define FUNC_NAME STRNCPY
+# endif
+#endif /* !USE_AS_STPNCPY */
+
+#ifndef MEMSET
+/* For builds without IFUNC support, local calls should be made to internal
+ GLIBC symbol (created by libc_hidden_builtin_def). */
+# ifdef SHARED
+# define MEMSET_is_local
+# define MEMSET __GI_memset
+# else
+# define MEMSET memset
+# endif
#endif
+#define FRAMESIZE (FRAME_MIN_SIZE+48)
+
/* Implements the function
char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
@@ -40,7 +61,12 @@
4k. */
.machine power7
-EALIGN (FUNC_NAME, 4, 0)
+#ifdef MEMSET_is_local
+ENTRY_TOCLESS (FUNC_NAME, 4)
+#else
+ENTRY (FUNC_NAME, 4)
+#endif
+ CALL_MCOUNT 3
/* Check if the [src]+15 will cross a 4K page by checking if the bit
indicating the page size changes. Basically:
@@ -54,8 +80,7 @@ EALIGN (FUNC_NAME, 4, 0)
addi r10,r4,16
rlwinm r9,r4,0,19,19
- /* Since it is a leaf function, save some non-volatile registers on the
- protected/red zone. */
+ /* Save some non-volatile registers on the stack. */
std r26,-48(r1)
std r27,-40(r1)
@@ -69,6 +94,14 @@ EALIGN (FUNC_NAME, 4, 0)
std r30,-16(r1)
std r31,-8(r1)
+ /* Update CFI. */
+ cfi_offset(r26, -48)
+ cfi_offset(r27, -40)
+ cfi_offset(r28, -32)
+ cfi_offset(r29, -24)
+ cfi_offset(r30, -16)
+ cfi_offset(r31, -8)
+
beq cr7,L(unaligned_lt_16)
rldicl r9,r4,0,61
subfic r8,r9,8
@@ -180,79 +213,66 @@ L(short_path_loop_end):
ld r31,-8(r1)
blr
- /* This code pads the remainder dest with NULL bytes. The algorithm
- calculate the remanining size and issues a doubleword unrolled
- loops followed by a byte a byte set. */
+ /* This code pads the remainder of dest with NULL bytes. The algorithm
+ calculates the remaining size and calls memset. */
.align 4
L(zero_pad_start):
mr r5,r10
mr r9,r6
L(zero_pad_start_1):
- srdi. r8,r5,r3
- mr r10,r9
-#ifdef USE_AS_STPNCPY
- mr r3,r9
+ /* At this point:
+ - r5 holds the number of bytes that still have to be written to
+ dest.
+ - r9 points to the position, in dest, where the first null byte
+ will be written.
+ The above statements are true both when control reaches this label
+ from a branch or when falling through the previous lines. */
+#ifndef USE_AS_STPNCPY
+ mr r30,r3 /* Save the return value of strncpy. */
+#endif
+ /* Prepare the call to memset. */
+ mr r3,r9 /* Pointer to the area to be zero-filled. */
+ li r4,0 /* Byte to be written (zero). */
+
+ /* We delayed the creation of the stack frame, as well as the saving of
+ the link register, because only at this point, we are sure that
+ doing so is actually needed. */
+
+ /* Save the link register. */
+ mflr r0
+ std r0,16(r1)
+
+ /* Create the stack frame. */
+ stdu r1,-FRAMESIZE(r1)
+ cfi_adjust_cfa_offset(FRAMESIZE)
+ cfi_offset(lr, 16)
+
+ bl MEMSET
+#ifndef MEMSET_is_local
+ nop
#endif
- beq- cr0,L(zero_pad_loop_b_start)
- cmpldi cr7,r8,1
- li cr7,0
- std r7,0(r9)
- beq cr7,L(zero_pad_loop_b_prepare)
- addic. r8,r8,-2
- addi r10,r9,r16
- std r7,8(r9)
- beq cr0,L(zero_pad_loop_dw_2)
- std r7,16(r9)
- li r9,0
- b L(zero_pad_loop_dw_1)
-
- .align 4
-L(zero_pad_loop_dw):
- addi r10,r10,16
- std r9,-8(r10)
- beq cr0,L(zero_pad_loop_dw_2)
- std r9,0(r10)
-L(zero_pad_loop_dw_1):
- cmpldi cr7,r8,1
- std r9,0(r10)
- addic. r8,r8,-2
- bne cr7,L(zero_pad_loop_dw)
- addi r10,r10,8
-L(zero_pad_loop_dw_2):
- rldicl r5,r5,0,61
-L(zero_pad_loop_b_start):
- cmpdi cr7,r5,0
- addi r5,r5,-1
- addi r9,r10,-1
- add r10,r10,5
- subf r10,r9,r10
- li r8,0
- beq- cr7,L(short_path_loop_end)
-
- /* Write remaining 1-8 bytes. */
- .align 4
- addi r9,r9,1
- mtocrf 0x1,r10
- bf 29,4f
- stw r8,0(r9)
- addi r9,r9,4
- .align 4
-4: bf 30,2f
- sth r8,0(r9)
- addi r9,r9,2
+ ld r0,FRAMESIZE+16(r1)
- .align 4
-2: bf 31,1f
- stb r8,0(r9)
+#ifndef USE_AS_STPNCPY
+ mr r3,r30 /* Restore the return value of strncpy, i.e.:
+ dest. For stpncpy, the return value is the
+ same as return value of memset. */
+#endif
- /* Restore non-volatile registers. */
-1: ld r26,-48(r1)
- ld r27,-40(r1)
- ld r28,-32(r1)
- ld r29,-24(r1)
- ld r30,-16(r1)
- ld r31,-8(r1)
+ /* Restore non-volatile registers and return. */
+ ld r26,FRAMESIZE-48(r1)
+ ld r27,FRAMESIZE-40(r1)
+ ld r28,FRAMESIZE-32(r1)
+ ld r29,FRAMESIZE-24(r1)
+ ld r30,FRAMESIZE-16(r1)
+ ld r31,FRAMESIZE-8(r1)
+ /* Restore the stack frame. */
+ addi r1,r1,FRAMESIZE
+ cfi_adjust_cfa_offset(-FRAMESIZE)
+ /* Restore the link register. */
+ mtlr r0
+ cfi_restore(lr)
blr
/* The common case where [src]+16 will not cross a 4K page boundary.
@@ -301,7 +321,7 @@ L(pagecross):
#endif
orc r9,r7,r9 /* Mask bits that are not part of the
string. */
- li cr7,0
+ li r7,0
cmpb r9,r9,r7 /* Check for null bytes in DWORD1. */
cmpdi cr7,r9,0
bne cr7,L(short_path_prepare_2)
@@ -312,14 +332,14 @@ L(pagecross):
/* For next checks we have aligned address, so we check for more
three doublewords to make sure we can read 16 unaligned bytes
to start the bulk copy with 16 aligned addresses. */
- ld cr7,8(r11)
+ ld r7,8(r11)
cmpb r9,r7,r9
cmpdi cr7,r9,0
bne cr7,L(short_path_prepare_2)
- addi cr7,r8,-8
+ addi r7,r8,-8
cmpldi cr7,r7,8
ble cr7,L(short_path_prepare_2)
- ld cr7,16(r11)
+ ld r7,16(r11)
cmpb r9,r7,r9
cmpdi cr7,r9,0
bne cr7,L(short_path_prepare_2)
@@ -443,18 +463,12 @@ L(short_path_prepare_2_3):
mr r4,r28
mr r9,r29
b L(short_path_2)
-L(zero_pad_loop_b_prepare):
- addi r10,r9,8
- rldicl r5,r5,0,61
- b L(zero_pad_loop_b_start)
L(zero_pad_start_prepare_1):
mr r5,r6
mr r9,r8
b L(zero_pad_start_1)
END (FUNC_NAME)
-#ifdef USE_AS_STPNCPY
-libc_hidden_def (__stpncpy)
-#else
+#ifndef USE_AS_STPNCPY
libc_hidden_builtin_def (strncpy)
#endif
diff --git a/sysdeps/powerpc/powerpc64/power8/strnlen.S b/sysdeps/powerpc/powerpc64/power8/strnlen.S
new file mode 100644
index 0000000000..a98dfba4bd
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strnlen.S
@@ -0,0 +1,425 @@
+/* Optimized strnlen implementation for POWER8 using a vmx loop.
+
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* The following heuristic is implemented:
+ 1. Case maxlen <= 32: align the pointer to 8 bytes to loop through
+ reading doublewords. Uses the POWER7 algorithm.
+ 2. Case maxlen > 32: check for null bytes in the first 16 bytes using
+ unaligned accesses. Return length if found. Otherwise:
+ 2.1 Case maxlen < 64: deduct the bytes previously read, align
+ the pointer to 16 bytes and loop through reading quadwords
+ until null bytes are found or maxlen is reached.
+ 2.2 Case maxlen >= 64: deduct the bytes previously read, align
+ the pointer to 64 bytes and set up a counter to loop through
+ reading in strides of 64 bytes. If the loop finishes without
+ finding null bytes, process the remaining bytes by switching
+ to the loop of heuristic 2.1. */
+
+#include <sysdep.h>
+
+/* Define default page size to 4KB. */
+#define PAGE_SIZE 4096
+
+/* The following macros implement Power ISA v2.07 opcodes
+ that cannot be used directly in this code, in order to keep
+ compatibility with older binutils versions. */
+
+/* Move from vector register doubleword. */
+#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
+
+/* Move to vector register doubleword. */
+#define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16)))
+
+/* Vector Bit Permute Quadword. */
+#define VBPERMQ(t,a,b) .long (0x1000054c \
+ | ((t)<<(32-11)) \
+ | ((a)<<(32-16)) \
+ | ((b)<<(32-21)) )
+
+/* Vector Population Count Halfword. */
+#define VPOPCNTH(t,b) .long (0x10000743 | ((t)<<(32-11)) | ((b)<<(32-21)))
+
+/* Vector Count Leading Zeros Halfword. */
+#define VCLZH(t,b) .long (0x10000742 | ((t)<<(32-11)) | ((b)<<(32-21)))
+
+
+/* size_t [r3] strnlen (const char *s [r3], size_t maxlen [r4]) */
+/* TODO: change to power8 when minimum required binutils allows it. */
+ .machine power7
+ENTRY_TOCLESS (__strnlen)
+ CALL_MCOUNT 2
+ dcbt 0,r3
+
+ cmpldi r4,32 /* Check if maxlen <= 32. */
+ ble L(small_range) /* If maxlen <= 32. */
+
+ /* Upcoming 16 bytes unaligned accesses cannot cross the page boundary,
+ otherwise the processor may raise a memory access error.
+ Use the following check to see whether there is room for such accesses:
+ ((size_t) s) % PAGE_SIZE > (PAGE_SIZE - 16)
+ If they are disallowed then switch to the code that handles
+ the string when maxlen <= 32. */
+ clrldi r10,r3,52
+ cmpldi cr7,r10,PAGE_SIZE-16
+ bgt cr7,L(small_range) /* If less than 16B of page end. */
+
+ /* Compute our permute constant r8. */
+ li r7,0
+ /* Compute a bpermd constant to gather bit 0 of each byte of a
+ doubleword into an 8-bit mask (used by L(early_find)). */
+#ifdef __LITTLE_ENDIAN__
+ li r8,0x2820
+ oris r8,r8,0x3830
+ sldi r8,r8,32
+ ori r8,r8,0x0800
+ oris r8,r8,0x1810
+#else
+ li r8,0x1018
+ oris r8,r8,0x0008
+ sldi r8,r8,32
+ ori r8,r8,0x3038
+ oris r8,r8,0x2028
+#endif
+
+ /* maxlen > 32. Optimistically check for null bytes in the first
+ 16 bytes of the string using unaligned accesses. */
+ ld r5,0(r3)
+ ld r6,8(r3)
+ cmpb r10,r7,r5 /* Check for null bytes in DWORD1. */
+ cmpb r11,r7,r6 /* Check for null bytes in DWORD2. */
+ or. r7,r10,r11
+ bne cr0, L(early_find) /* If found null bytes. */
+
+ /* At this point maxlen > 32 and null bytes were not found at first
+ 16 bytes. Prepare for loop using VMX. */
+
+ /* r3 == s, r4 == maxlen. All other volatile regs are unused now. */
+
+ addi r5,r3,16 /* Align up, or just add the 16B we
+ already checked. */
+ li r0,15
+ and r7,r5,r0 /* Find offset into 16B alignment. */
+ andc r5,r5,r0 /* Quadword align up s to the next quadword. */
+ li r0,16
+ subf r0,r7,r0
+ subf r4,r0,r4 /* Deduct unaligned bytes from maxlen. */
+
+
+ /* Compute offsets for vmx loads, and precompute the vbpermq
+ constants for both the 64B and 16B loops. */
+ li r6,0
+ vspltisb v0,0
+ vspltisb v10,3
+ lvsl v11,r6,r6
+ vslb v10,v11,v10
+
+ cmpldi r4,64 /* Check maxlen < 64. */
+ blt L(smaller) /* If maxlen < 64 */
+
+ /* In order to begin the 64B loop, it needs to be 64
+ bytes aligned. So read quadwords until it is aligned or null bytes
+ are found. In the worst case it will be aligned after the third
+ quadword, so unroll the loop to avoid counter checking. */
+ andi. r7,r5,63 /* Check if is 64 bytes aligned. */
+ beq cr0,L(preloop_64B) /* If it is already 64B aligned. */
+ lvx v1,r5,r6
+ vcmpequb. v1,v1,v0
+ addi r5,r5,16
+ addi r4,r4,-16 /* Decrement maxlen in 16 bytes. */
+ bne cr6,L(found_aligning64B) /* If found null bytes. */
+
+ /* Unroll 2x above code block until aligned or find null bytes. */
+ andi. r7,r5,63
+ beq cr0,L(preloop_64B)
+ lvx v1,r5,r6
+ vcmpequb. v1,v1,v0
+ addi r5,r5,16
+ addi r4,r4,-16
+ bne cr6,L(found_aligning64B)
+
+ andi. r7,r5,63
+ beq cr0,L(preloop_64B)
+ lvx v1,r5,r6
+ vcmpequb. v1,v1,v0
+ addi r5,r5,16
+ addi r4,r4,-16
+ bne cr6,L(found_aligning64B)
+
+ /* At this point it should be 64 bytes aligned.
+ Prepare for the 64B loop. */
+ .p2align 4
+L(preloop_64B):
+ /* Check if maxlen became less than 64, therefore disallowing the
+ 64B loop. If that happened, switch to the 16B loop code. */
+ cmpldi r4,64 /* Check if maxlen < 64. */
+ blt L(smaller) /* If maxlen < 64. */
+ /* Set some constant values. */
+ li r7,16
+ li r10,32
+ li r9,48
+
+ /* Compute the number of 64 bytes iterations needed. */
+ srdi r11,r4,6 /* Compute loop count (maxlen / 64). */
+ andi. r4,r4,63 /* Set maxlen the remainder (maxlen % 64). */
+ mtctr r11 /* Move loop count to counter register. */
+
+ /* Handle maxlen >= 64. Loop over the bytes in strides of 64B. */
+ .p2align 4
+L(loop_64B):
+ lvx v1,r5,r6 /* r5 is the current (aligned) position in s. */
+ lvx v2,r5,r7
+ lvx v3,r5,r10
+ lvx v4,r5,r9
+ /* Compare the four 16B vectors to obtain the least 16 values.
+ Null bytes should emerge into v7, then check for null bytes. */
+ vminub v5,v1,v2
+ vminub v6,v3,v4
+ vminub v7,v5,v6
+ vcmpequb. v7,v7,v0 /* Check for null bytes. */
+ addi r5,r5,64 /* Advance pointer to the next iteration. */
+ bne cr6,L(found_64B) /* If found null bytes. */
+ bdnz L(loop_64B) /* Continue the loop if count > 0. */
+
+/* Hit loop end without null match. So branch to handle the remainder. */
+
+ /* Prepare a 16B loop to handle two cases:
+ 1. If 32 < maxlen < 64.
+ 2. If maxlen >= 64, and reached end of the 64B loop with null
+ bytes not found. Thus handle the remainder bytes here. */
+ .p2align 4
+L(smaller):
+ cmpldi r4,0 /* Check maxlen is zero. */
+ beq L(done) /* If maxlen is zero. */
+
+ /* Place rounded up number of qw's to check into a vmx
+ register, and use some vector tricks to minimize
+ branching. */
+ MTVRD(v7,r4) /* Copy maxlen from GPR to vector register. */
+ vspltisb v5,1
+ vspltisb v6,15
+ vspltb v2,v7,7
+ vaddubs v3,v5,v6 /* Compute 16 in each byte. */
+
+#ifdef __LITTLE_ENDIAN__
+ vspltish v5,1 /* 1 in each halfword, used below to form the trailing-zeros mask. */
+#endif
+
+ /* Loop in 16B aligned increments now. */
+ .p2align 4
+L(loop_16B):
+ lvx v1,r5,r6 /* Load quadword into vector register. */
+ addi r5,r5,16 /* Increment address to next 16B block. */
+ vor v7,v2,v2 /* Save loop count (v2) into v7. */
+ vsububs v2,v2,v3 /* Subtract 16B from count, saturate at 0. */
+ vminub v4,v1,v2
+ vcmpequb. v4,v4,v0 /* Checking for null bytes. */
+ beq cr6,L(loop_16B) /* If null bytes not found. */
+
+ vcmpequb v1,v1,v0
+ VBPERMQ(v1,v1,v10)
+#ifdef __LITTLE_ENDIAN__
+ vsubuhm v2,v1,v5 /* Form a mask of trailing zeros. */
+ vandc v2,v2,v1
+ VPOPCNTH(v1,v2) /* Count of trailing zeros, 16 if none. */
+#else
+ VCLZH(v1,v1) /* Count the leading zeros, 16 if none. */
+#endif
+ /* Truncate to maximum allowable offset. */
+ vcmpgtub v2,v1,v7 /* Compare and truncate for matches beyond
+ maxlen. */
+ vsel v1,v1,v7,v2 /* 0-16 is now in byte 7. */
+
+ MFVRD(r0,v1)
+ addi r5,r5,-16 /* Undo speculative bump. */
+ extsb r0,r0 /* Clear whatever gunk is in the high 56b. */
+ add r5,r5,r0 /* Add the offset of whatever was found. */
+L(done):
+ subf r3,r3,r5 /* Length is equal to the offset of null byte
+ matched minus the pointer to s. */
+ blr /* Done. */
+
+ /* Handle case of maxlen > 64 and found null bytes in last block
+ of 64 bytes read. */
+ .p2align 4
+L(found_64B):
+ /* A zero was found. Reduce the result. */
+ vcmpequb v1,v1,v0
+ vcmpequb v2,v2,v0
+ vcmpequb v3,v3,v0
+ vcmpequb v4,v4,v0
+
+ /* Permute the first bit of each byte into bits 48-63. */
+ VBPERMQ(v1,v1,v10)
+ VBPERMQ(v2,v2,v10)
+ VBPERMQ(v3,v3,v10)
+ VBPERMQ(v4,v4,v10)
+
+ /* Shift each component into its correct position for merging. */
+#ifdef __LITTLE_ENDIAN__
+ vsldoi v2,v2,v2,2
+ vsldoi v3,v3,v3,4
+ vsldoi v4,v4,v4,6
+#else
+ vsldoi v1,v1,v1,6
+ vsldoi v2,v2,v2,4
+ vsldoi v3,v3,v3,2
+#endif
+
+ /* Merge the results and move to a GPR. */
+ vor v1,v2,v1
+ vor v2,v3,v4
+ vor v4,v1,v2
+
+ /* Adjust address to the start of the current 64B block. */
+ addi r5,r5,-64
+
+ MFVRD(r10,v4)
+#ifdef __LITTLE_ENDIAN__
+ addi r9,r10,-1 /* Form a mask from trailing zeros. */
+ andc r9,r9,r10
+ popcntd r0,r9 /* Count the bits in the mask. */
+#else
+ cntlzd r0,r10 /* Count leading zeros before the match. */
+#endif
+ subf r5,r3,r5
+ add r3,r5,r0 /* Compute final length. */
+ blr /* Done. */
+
+ /* Handle case where null bytes were found while aligning
+ as a preparation for the 64B loop. */
+ .p2align 4
+L(found_aligning64B):
+ VBPERMQ(v1,v1,v10)
+#ifdef __LITTLE_ENDIAN__
+ MFVRD(r10,v1)
+ addi r9,r10,-1 /* Form a mask from trailing zeros. */
+ andc r9,r9,r10
+ popcntd r0,r9 /* Count the bits in the mask. */
+#else
+ vsldoi v1,v1,v1,6
+ MFVRD(r10,v1)
+ cntlzd r0,r10 /* Count leading zeros before the match. */
+#endif
+ addi r5,r5,-16 /* Adjust address to offset of last 16 bytes
+ read. */
+ /* Length is the offset of the last 16 bytes read from the pointer
+ to s, plus the number of bytes before the match. */
+ subf r5,r3,r5
+ add r3,r5,r0
+ blr /* Done. */
+
+ /* Handle case of maxlen > 32 and null bytes found within the first
+ 16 bytes of s. */
+ .p2align 4
+L(early_find):
+ bpermd r5,r8,r10 /* r8 contains the bit permute constants. */
+ bpermd r6,r8,r11
+ sldi r5,r5,8
+ or r5,r5,r6 /* r5 should hold a 16B mask of
+ a potential 0. */
+ cntlzd r5,r5 /* Count leading zeros. */
+ addi r3,r5,-48 /* Deduct the 48 leading zeros always
+ present. */
+ blr /* Done. */
+
+ /* Handle case of maxlen <= 32. Use the POWER7 algorithm. */
+ .p2align 4
+L(small_range):
+ clrrdi r8,r3,3 /* Align the pointer to 8B. */
+ li r0,0
+ /* Register's content at this point:
+ r3 == pointer to s, r4 == maxlen, r8 == pointer to s aligned to 8B.
+ r7 is set below to the last acceptable address. */
+ cmpldi r4,0 /* Check if maxlen is zero. */
+ beq L(end_max) /* If maxlen is zero. */
+
+ /* Calculate the last acceptable address and check for possible
+ addition overflow by using saturated math:
+ r7 = r3 + r4
+ r7 |= -(r7 < r3) */
+ add r7,r3,r4
+ subfc r6,r3,r7
+ subfe r9,r9,r9
+ extsw r6,r9
+ or r7,r7,r6
+ addi r7,r7,-1
+
+ clrrdi r7,r7,3 /* Align to 8B address of last
+ acceptable address. */
+
+ rlwinm r6,r3,3,26,28 /* Calculate padding. */
+ ld r12,0(r8) /* Load aligned doubleword. */
+ cmpb r10,r12,r0 /* Check for null bytes. */
+#ifdef __LITTLE_ENDIAN__
+ srd r10,r10,r6
+ sld r10,r10,r6
+#else
+ sld r10,r10,r6
+ srd r10,r10,r6
+#endif /* __LITTLE_ENDIAN__ */
+ cmpldi cr7,r10,0
+ bne cr7,L(done_small) /* If found null byte. */
+
+ cmpld r8,r7 /* Check if reached maxlen. */
+ beq L(end_max) /* If reached maxlen. */
+
+ /* Still handling case of maxlen <= 32. Read doubleword aligned until
+ find null bytes or reach maxlen. */
+ .p2align 4
+L(loop_small):
+ ldu r12,8(r8) /* Load next doubleword and update r8. */
+ cmpb r10,r12,r0 /* Check for null bytes. */
+ cmpldi cr6,r10,0
+ bne cr6,L(done_small) /* If found null bytes. */
+ cmpld r8,r7 /* Check if reached maxlen. */
+ bne L(loop_small) /* If it has more bytes to read. */
+ mr r3,r4 /* Reached maxlen with null bytes not found.
+ Length is equal to maxlen. */
+ blr /* Done. */
+
+ /* Still handling case of maxlen <= 32. Found null bytes.
+ Registers: r10 == match bits within doubleword, r8 == address of
+ last doubleword read, r3 == pointer to s, r4 == maxlen. */
+ .p2align 4
+L(done_small):
+#ifdef __LITTLE_ENDIAN__
+ /* Count trailing zeros. */
+ addi r0,r10,-1
+ andc r0,r0,r10
+ popcntd r0,r0
+#else
+ cntlzd r0,r10 /* Count leading zeros before the match. */
+#endif
+ sub r3,r8,r3 /* Calculate total of bytes before the match. */
+ srdi r0,r0,3 /* Convert leading/trailing zeros to bytes. */
+ add r3,r3,r0 /* Length until the match. */
+ cmpld r3,r4 /* Check length is greater than maxlen. */
+ blelr
+ mr r3,r4 /* If length is greater than maxlen, return
+ maxlen. */
+ blr
+
+ /* Handle case of reached maxlen with null bytes not found. */
+ .p2align 4
+L(end_max):
+ mr r3,r4 /* Length is equal to maxlen. */
+ blr /* Done. */
+
+
+END (__strnlen)
+libc_hidden_def (__strnlen)
+weak_alias (__strnlen, strnlen)
+libc_hidden_def (strnlen)
diff --git a/sysdeps/powerpc/powerpc64/power8/strrchr.S b/sysdeps/powerpc/powerpc64/power8/strrchr.S
new file mode 100644
index 0000000000..6ff8a528b6
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strrchr.S
@@ -0,0 +1,468 @@
+/* Optimized strrchr implementation for PowerPC64/POWER8 using cmpb insn.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* char *[r3] strrchr (char *s [r3], int c [r4]) */
+/* TODO: change these to the actual instructions when the minimum required
+ binutils allows it. */
+#define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16)))
+#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
+#define VBPERMQ(t,a,b) .long (0x1000054c \
+ | ((t)<<(32-11)) \
+ | ((a)<<(32-16)) \
+ | ((b)<<(32-21)) )
+#define VCLZD(r,v) .long (0x100007c2 | ((r)<<(32-11)) | ((v)<<(32-21)))
+#define VPOPCNTD(r,v) .long (0x100007c3 | ((r)<<(32-11)) | ((v)<<(32-21)))
+#define VADDUQM(t,a,b) .long (0x10000100 \
+ | ((t)<<(32-11)) \
+ | ((a)<<(32-16)) \
+ | ((b)<<(32-21)) )
+#ifdef __LITTLE_ENDIAN__
+/* Find the match position from v6 and place result in r6. */
+# define CALCULATE_MATCH() \
+ VBPERMQ(v6, v6, v10); \
+ vsldoi v6, v6, v6, 6; \
+ MFVRD(r7, v6); \
+ cntlzd r6, r7; \
+ subfic r6, r6, 15;
+/*
+ * Find the first null position to mask bytes after null.
+ * (reg): vcmpequb result: v2 for 1st qw v3 for 2nd qw.
+ * Result placed at v2.
+ */
+# define FIND_NULL_POS(reg) \
+ vspltisb v11, -1; \
+ VADDUQM(v11, reg, v11); \
+ vandc v11, v11, reg; \
+ VPOPCNTD(v2, v11); \
+ vspltb v11, v2, 15; \
+ vcmpequb. v11, v11, v9; \
+ blt cr6, 1f; \
+ vsldoi v9, v0, v9, 1; \
+ vslo v2, v2, v9; \
+1: \
+ vsumsws v2, v2, v0;
+#else
+# define CALCULATE_MATCH() \
+ VBPERMQ(v6, v6, v10); \
+ MFVRD(r7, v6); \
+ addi r6, r7, -1; \
+ andc r6, r6, r7; \
+ popcntd r6, r6; \
+ subfic r6, r6, 15;
+# define FIND_NULL_POS(reg) \
+ VCLZD(v2, reg); \
+ vspltb v11, v2, 7; \
+ vcmpequb. v11, v11, v9; \
+ blt cr6, 1f; \
+ vsldoi v9, v0, v9, 1; \
+ vsro v2, v2, v9; \
+1: \
+ vsumsws v2, v2, v0;
+#endif /* !__LITTLE_ENDIAN__ */
+
+#ifndef STRRCHR
+# define STRRCHR strrchr
+#endif
+ .machine power7
+ENTRY_TOCLESS (STRRCHR)
+ CALL_MCOUNT 2
+ dcbt 0,r3
+ clrrdi r8,r3,3 /* Align the address to doubleword boundary. */
+ cmpdi cr7,r4,0
+ ld r12,0(r8) /* Load doubleword from memory. */
+ li r9,0 /* Used to store last occurrence. */
+ li r0,0 /* Doubleword with null chars to use
+ with cmpb. */
+
+ rlwinm r6,r3,3,26,28 /* Calculate padding. */
+
+ beq cr7,L(null_match)
+
+ /* Replicate byte to doubleword. */
+ insrdi r4,r4,8,48
+ insrdi r4,r4,16,32
+ insrdi r4,r4,32,0
+
+ /* r4 is changed now. If it's passed more chars, then
+ check for null again. */
+ cmpdi cr7,r4,0
+ beq cr7,L(null_match)
+ /* Now r4 has a doubleword of c bytes and r0 has
+ a doubleword of null bytes. */
+
+ cmpb r10,r12,r4 /* Compare each byte against c byte. */
+ cmpb r11,r12,r0 /* Compare each byte against null byte. */
+
+ /* Move the doublewords left and right to discard the bits that are
+ not part of the string and bring them back as zeros. */
+#ifdef __LITTLE_ENDIAN__
+ srd r10,r10,r6
+ srd r11,r11,r6
+ sld r10,r10,r6
+ sld r11,r11,r6
+#else
+ sld r10,r10,r6
+ sld r11,r11,r6
+ srd r10,r10,r6
+ srd r11,r11,r6
+#endif
+ or r5,r10,r11 /* OR the results to speed things up. */
+ cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes
+ have been found. */
+ bne cr7,L(done)
+
+L(align):
+ andi. r12, r8, 15
+
+ /* Is the next doubleword (r8 + 8) aligned to a quadword boundary? If
+ so, skip to the main loop. Otherwise, go through the alignment code. */
+
+ bne cr0, L(loop)
+
+ /* Handle WORD2 of pair. */
+ ldu r12,8(r8)
+ cmpb r10,r12,r4
+ cmpb r11,r12,r0
+ or r5,r10,r11
+ cmpdi cr7,r5,0
+ bne cr7,L(done)
+ b L(loop) /* We branch here (rather than falling through)
+ to skip the nops due to heavy alignment
+ of the loop below. */
+ .p2align 5
+L(loop):
+ /* Load two doublewords, compare and merge in a
+ single register for speed. This is an attempt
+ to speed up the null-checking process for bigger strings. */
+ ld r12,8(r8)
+ ldu r7,16(r8)
+ cmpb r10,r12,r4
+ cmpb r11,r12,r0
+ cmpb r6,r7,r4
+ cmpb r7,r7,r0
+ or r12,r10,r11
+ or r5,r6,r7
+ or r5,r12,r5
+ cmpdi cr7,r5,0
+ beq cr7,L(vector)
+
+ /* OK, one (or both) of the doublewords contains a c/null byte. Check
+ the first doubleword and decrement the address in case the first
+ doubleword really contains a c/null byte. */
+ cmpdi cr6,r12,0
+ addi r8,r8,-8
+ bne cr6,L(done)
+
+ /* The c/null byte must be in the second doubleword. Adjust the
+ address again and move the result of cmpb to r10 so we can calculate
+ the pointer. */
+
+ mr r10,r6
+ mr r11,r7
+ addi r8,r8,8
+
+ /* r10/r11 have the output of the cmpb instructions, that is,
+ 0xff in the same position as the c/null byte in the original
+ doubleword from the string. Use that to calculate the pointer. */
+
+L(done):
+ /* If there are more than one 0xff in r11, find the first position of
+ 0xff in r11 and fill r10 with 0 from that position. */
+ cmpdi cr7,r11,0
+ beq cr7,L(no_null)
+#ifdef __LITTLE_ENDIAN__
+ addi r3,r11,-1
+ andc r3,r3,r11
+ popcntd r0,r3
+#else
+ cntlzd r0,r11
+#endif
+ subfic r0,r0,63
+ li r6,-1
+#ifdef __LITTLE_ENDIAN__
+ srd r0,r6,r0
+#else
+ sld r0,r6,r0
+#endif
+ and r10,r0,r10
+L(no_null):
+#ifdef __LITTLE_ENDIAN__
+ cntlzd r0,r10 /* Count leading zeros before c matches. */
+ addi r3,r10,-1
+ andc r3,r3,r10
+ addi r10,r11,-1
+ andc r10,r10,r11
+ cmpld cr7,r3,r10
+ bgt cr7,L(no_match)
+#else
+ addi r3,r10,-1 /* Count trailing zeros before c matches. */
+ andc r3,r3,r10
+ popcntd r0,r3
+ cmpld cr7,r11,r10
+ bgt cr7,L(no_match)
+#endif
+ srdi r0,r0,3 /* Convert trailing zeros to bytes. */
+ subfic r0,r0,7
+ add r9,r8,r0 /* Return address of the matching c byte
+ or null in case c was not found. */
+ li r0,0
+ cmpdi cr7,r11,0 /* If r11 == 0, no null's have been found. */
+ beq cr7,L(align)
+
+ .align 4
+L(no_match):
+ mr r3,r9
+ blr
+
+/* Check the first 32B in GPR's and move to vectorized loop. */
+ .p2align 5
+L(vector):
+ addi r3, r8, 8
+ /* Make sure 32B aligned. */
+ andi. r10, r3, 31
+ bne cr0, L(loop)
+ vspltisb v0, 0
+ /* Precompute vbpermq constant. */
+ vspltisb v10, 3
+ lvsl v11, r0, r0
+ vslb v10, v11, v10
+ MTVRD(v1, r4)
+ li r5, 16
+ vspltb v1, v1, 7
+ /* Compare 32 bytes in each loop. */
+L(continue):
+ lvx v4, 0, r3
+ lvx v5, r3, r5
+ vcmpequb v2, v0, v4
+ vcmpequb v3, v0, v5
+ vcmpequb v6, v1, v4
+ vcmpequb v7, v1, v5
+ vor v8, v2, v3
+ vor v9, v6, v7
+ vor v11, v8, v9
+ vcmpequb. v11, v0, v11
+ addi r3, r3, 32
+ blt cr6, L(continue)
+ vcmpequb. v8, v0, v8
+ blt cr6, L(match)
+
+ /* One (or both) of the quadwords contains c/null. */
+ vspltisb v8, 2
+ vspltisb v9, 5
+ /* Precompute values used for comparison. */
+ vsl v9, v8, v9 /* v9 = 0x4040404040404040. */
+ vaddubm v8, v9, v9
+ vsldoi v8, v0, v8, 1 /* v8 = 0x80. */
+
+ /* Check if null is in second qw. */
+ vcmpequb. v11, v0, v2
+ blt cr6, L(secondqw)
+
+ /* Null found in first qw. */
+ addi r8, r3, -32
+ /* Calculate the null position. */
+ FIND_NULL_POS(v2)
+ /* Check if null is in the first byte. */
+ vcmpequb. v11, v0, v2
+ blt cr6, L(no_match)
+ vsububm v2, v8, v2
+ /* Mask unwanted bytes after null. */
+#ifdef __LITTLE_ENDIAN__
+ vslo v6, v6, v2
+ vsro v6, v6, v2
+#else
+ vsro v6, v6, v2
+ vslo v6, v6, v2
+#endif
+ vcmpequb. v11, v0, v6
+ blt cr6, L(no_match)
+ /* Found a match before null. */
+ CALCULATE_MATCH()
+ add r3, r8, r6
+ blr
+
+L(secondqw):
+ addi r8, r3, -16
+ FIND_NULL_POS(v3)
+ vcmpequb. v11, v0, v2
+ blt cr6, L(no_match1)
+ vsububm v2, v8, v2
+ /* Mask unwanted bytes after null. */
+#ifdef __LITTLE_ENDIAN__
+ vslo v7, v7, v2
+ vsro v7, v7, v2
+#else
+ vsro v7, v7, v2
+ vslo v7, v7, v2
+#endif
+ vcmpequb. v11, v0, v7
+ blt cr6, L(no_match1)
+ addi r8, r8, 16
+ vor v6, v0, v7
+L(no_match1):
+ addi r8, r8, -16
+ vcmpequb. v11, v0, v6
+ blt cr6, L(no_match)
+ /* Found a match before null. */
+ CALCULATE_MATCH()
+ add r3, r8, r6
+ blr
+
+L(match):
+ /* One (or both) of the quadwords contains a match. */
+ mr r8, r3
+ vcmpequb. v8, v0, v7
+ blt cr6, L(firstqw)
+ /* Match found in second qw. */
+ addi r8, r8, 16
+ vor v6, v0, v7
+L(firstqw):
+ addi r8, r8, -32
+ CALCULATE_MATCH()
+ add r9, r8, r6 /* Record the address of this match in r9. */
+ b L(continue)
+/* We are here because strrchr was called with a null byte. */
+ .align 4
+L(null_match):
+ /* r0 has a doubleword of null bytes. */
+
+ cmpb r5,r12,r0 /* Compare each byte against null bytes. */
+
+ /* Move the doublewords left and right to discard the bits that are
+ not part of the string and bring them back as zeros. */
+#ifdef __LITTLE_ENDIAN__
+ srd r5,r5,r6
+ sld r5,r5,r6
+#else
+ sld r5,r5,r6
+ srd r5,r5,r6
+#endif
+ cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes
+ have been found. */
+ bne cr7,L(done_null)
+
+ andi. r12, r8, 15
+
+ /* Is the next doubleword (r8 + 8) aligned to a quadword boundary? If
+ so, skip to the main loop. Otherwise, go through the alignment code. */
+
+ bne cr0, L(loop_null)
+
+ /* Handle WORD2 of pair. */
+ ldu r12,8(r8)
+ cmpb r5,r12,r0
+ cmpdi cr7,r5,0
+ bne cr7,L(done_null)
+ b L(loop_null) /* We branch here (rather than falling through)
+ to skip the nops due to heavy alignment
+ of the loop below. */
+
+ /* Main loop to look for the end of the string. Since it's a
+ small loop (< 8 instructions), align it to 32-bytes. */
+ .p2align 5
+L(loop_null):
+ /* Load two doublewords, compare and merge in a
+ single register for speed. This is an attempt
+ to speed up the null-checking process for bigger strings. */
+ ld r12,8(r8)
+ ldu r11,16(r8)
+ cmpb r5,r12,r0
+ cmpb r10,r11,r0
+ or r6,r5,r10
+ cmpdi cr7,r6,0
+ beq cr7,L(vector1)
+
+ /* OK, one (or both) of the doublewords contains a null byte. Check
+ the first doubleword and decrement the address in case the first
+ doubleword really contains a null byte. */
+
+ cmpdi cr6,r5,0
+ addi r8,r8,-8
+ bne cr6,L(done_null)
+
+ /* The null byte must be in the second doubleword. Adjust the address
+ again and move the result of cmpb to r10 so we can calculate the
+ pointer. */
+
+ mr r5,r10
+ addi r8,r8,8
+
+ /* r5 has the output of the cmpb instruction, that is, it contains
+ 0xff in the same position as the null byte in the original
+ doubleword from the string. Use that to calculate the pointer. */
+L(done_null):
+#ifdef __LITTLE_ENDIAN__
+ addi r0,r5,-1
+ andc r0,r0,r5
+ popcntd r0,r0
+#else
+ cntlzd r0,r5 /* Count leading zeros before the match. */
+#endif
+ srdi r0,r0,3 /* Convert trailing zeros to bytes. */
+ add r3,r8,r0 /* Return address of the matching null byte. */
+ blr
+/* Check the first 32B in GPR's and move to vectorized loop. */
+ .p2align 5
+L(vector1):
+ addi r3, r8, 8
+ /* Make sure 32B aligned. */
+ andi. r10, r3, 31
+ bne cr0, L(loop_null)
+ vspltisb v0, 0
+ /* Precompute vbpermq constant. */
+ vspltisb v10, 3
+ lvsl v11, r0, r0
+ vslb v10, v11, v10
+ li r5, 16
+ /* Compare 32 bytes in each loop. */
+L(continue1):
+ lvx v4, 0, r3
+ lvx v5, r3, r5
+ vcmpequb v2, v0, v4
+ vcmpequb v3, v0, v5
+ vor v8, v2, v3
+ vcmpequb. v11, v0, v8
+ addi r3, r3, 32
+ blt cr6, L(continue1)
+ addi r3, r3, -32
+ VBPERMQ(v2, v2, v10)
+ VBPERMQ(v3, v3, v10)
+ /* Shift each component into its correct position for merging. */
+#ifdef __LITTLE_ENDIAN__
+ vsldoi v3, v3, v3, 2
+#else
+ vsldoi v2, v2, v2, 6
+ vsldoi v3, v3, v3, 4
+#endif
+ /* Merge the results and move to a GPR. */
+ vor v4, v3, v2
+ MFVRD(r5, v4)
+#ifdef __LITTLE_ENDIAN__
+ addi r6, r5, -1 /* Form a mask from trailing zeros. */
+ andc r6, r6, r5
+ popcntd r6, r6 /* Count the bits in the mask. */
+#else
+ cntlzd r6, r5 /* Count leading zeros before the match. */
+#endif
+ add r3, r3, r6 /* Compute the address of the null byte. */
+ blr
+END_GEN_TB (STRRCHR, TB_TOCLESS)
+weak_alias (strrchr, rindex)
+libc_hidden_builtin_def (strrchr)
diff --git a/sysdeps/powerpc/powerpc64/power8/strspn.S b/sysdeps/powerpc/powerpc64/power8/strspn.S
new file mode 100644
index 0000000000..095f6d6f41
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strspn.S
@@ -0,0 +1,202 @@
+/* Optimized strspn implementation for Power8.
+
+ Copyright (C) 2016-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* size_t [r3] strspn (const char *string [r3],
+ const char *needleAccept [r4]) */
+
+/* This takes a novel approach by computing a 256 bit mask whereby
+ each set bit implies the byte is "accepted". P8 vector hardware
+ has extremely efficient hardware for selecting bits from a mask.
+
+ One might ask "why not use bpermd for short strings"? It is
+ so slow that its performance about matches the generic PPC64
+ variant without any fancy masking, with the added expense of
+ making the mask. That was the first variant of this. */
+
+
+
+#include "sysdep.h"
+
+/* Select the flavour being assembled. When USE_AS_STRCSPN is already
+ defined (presumably by a wrapper that includes this file) the code
+ below becomes strcspn; otherwise it is strspn and USE_AS_STRCSPN is
+ defined to 0 so that it can be tested with #if. STRSPN is the name
+ of the entry point and is only defaulted here. INITIAL_MASK is the
+ starting value of each 64-bit piece of the accept bitmap, and
+ UPDATE_MASK folds the bit of one needle byte into such a piece. */
+#ifdef USE_AS_STRCSPN
+/* strcspn: start with every byte accepted, clear the needle's bits. */
+# if !defined STRSPN
+#  define STRSPN strcspn
+# endif
+# define INITIAL_MASK -1
+# define UPDATE_MASK(RA, RS, RB) andc RA, RS, RB
+#else
+/* strspn: start with no byte accepted, set the needle's bits. */
+# define USE_AS_STRCSPN 0
+# if !defined STRSPN
+#  define STRSPN strspn
+# endif
+# define INITIAL_MASK 0
+# define UPDATE_MASK(RA, RS, RB) or RA, RS, RB
+#endif
+
+/* Simple macro to use VSX instructions in overlapping VR's. Each
+ operand is a vector register number; adding 32 turns it into the
+ number of the VSX register that overlaps it. */
+#define XXVR(insn, vrt, vra, vrb) \
+ insn 32+vrt, 32+vra, 32+vrb
+
+/* ISA 2.07B instructions are not all defined for older binutils.
+ Macros are defined below for these newer instructions in order
+ to maintain compatibility. Each macro emits the raw 32-bit
+ instruction word with .long: a fixed opcode pattern or'ed with the
+ register numbers shifted into their operand fields. */
+
+/* Note, TX/SX is always set as VMX regs are the high 32 VSX regs.
+ MTVRD (v, r) copies GPR r into vector register v and MFVRD (r, v)
+ copies the other way (see their uses below). In both, the vector
+ register number is shifted left by 21 and the GPR number by 16. */
+#define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16)))
+#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
+/* VBPERMQ (t, a, b): t is the target vector register, a and b the two
+ source vector registers, shifted left by 21, 16 and 11. */
+#define VBPERMQ(t,a,b) .long (0x1000054c \
+ | ((t)<<(32-11)) \
+ | ((a)<<(32-16)) \
+ | ((b)<<(32-21)) )
+
+ /* This can be updated to power8 once the minimum version of
+ binutils supports power8 and the above instructions. Until then
+ the power8-only instructions are emitted through the .long macros
+ above rather than by mnemonic. */
+ .machine power7
+ENTRY_TOCLESS (STRSPN, 4)
+ CALL_MCOUNT 2
+ /* In: r3 = string to scan, r4 = NUL-terminated set of bytes (the
+ accept set for strspn, the reject set when built as strcspn).
+ Out: r3 = number of leading bytes of the string that are accepted.
+ Outline:
+ 1. Build a 256-bit "accepted byte value" bitmap in r5-r8, one bit
+ per byte value, most significant bit of r5 = value 0.
+ 2. Pack it into v5 (values 0-127) and v6 (values 128-255).
+ 3. Test the string one aligned 16-byte block at a time; vbpermq
+ yields one accept bit per byte of the block.
+ 4. Convert the position of the first rejected byte into the
+ return value.
+ The NUL byte is never accepted, so the scan always stops at the
+ string terminator at the latest. */
+ /* Generate useful constants for later on. */
+ vspltisb v1, 7 /* 7 in every byte. */
+ vspltisb v2, -1 /* All ones. */
+ vslb v1, v1, v1 /* 0x80 to swap high bit for vbpermq. */
+ vspltisb v10, 0 /* All zeros. */
+ vsldoi v4, v10, v2, 2 /* 0xFFFF into the low halfword of vr4. */
+ XXVR(xxmrgld, v4, v4, v10) /* Mask for checking matches: 0xFFFF in
+ the low halfword of the first doubleword,
+ which is where vbpermq leaves its 16
+ result bits, and zero everywhere else. */
+
+ /* Prepare to compute 256b mask. */
+ addi r4, r4, -1 /* The lbzu below adds 1 before each load. */
+ li r5, INITIAL_MASK
+ li r6, INITIAL_MASK
+ li r7, INITIAL_MASK
+ li r8, INITIAL_MASK
+
+#if USE_AS_STRCSPN
+ /* Ensure the null character never matches by clearing ISA bit 0
+ in r5 which is the bit which will check for it in the later usage
+ of vbpermq. */
+ srdi r5, r5, 1
+#endif
+
+ li r11, 1
+ sldi r11, r11, 63 /* r11 = only the most significant bit set. */
+
+ /* Start interleaved Mask computation.
+ This will eventually or 1's into ignored bits from vbpermq, i.e.
+ the bits of the bytes that precede the string inside its first
+ aligned 16-byte block. The amount depends on the low 4 bits of
+ r3, which lvsr turns into a shift constant. */
+ lvsr v11, 0, r3
+ vspltb v11, v11, 0 /* Splat shift constant. */
+
+ /* Build a 256b mask in r5-r8. Each pass handles one needle byte;
+ the needle's terminating NUL ends the loop before any update. */
+ .align 4
+L(next_needle):
+ lbzu r9, 1(r4) /* r9 = next needle byte, r4 advanced by 1. */
+
+ cmpldi cr0, r9, 0 /* cr0: end of needle? */
+ cmpldi cr1, r9, 128 /* cr1: byte value >= 128? */
+
+ /* This is a little tricky. srd only uses the low 7 bits of the
+ shift count, and if the 0x40 bit of the count is set, the value is
+ always 0. So exactly one of r10 and r12 receives the bit for this
+ byte, and byte values 128-255 produce the same pair as 0-127. So,
+ we can effectively shift 128b in this case. */
+ xori r12, r9, 0x40 /* Invert bit 6. */
+ srd r10, r11, r9 /* Mask for bits 0-63. */
+ srd r12, r11, r12 /* Mask for bits 64-127. */
+
+ beq cr0, L(start_cmp) /* NUL: the bitmap is complete. */
+
+ /* Now, merge the value into the correct GPR pair (UPDATE_MASK is or
+ for strspn and andc for strcspn). */
+ bge cr1,L(needle_gt128)
+ UPDATE_MASK (r5, r5, r10) /* 0 - 63. */
+ UPDATE_MASK (r6, r6, r12) /* 64 - 127. */
+ b L(next_needle)
+
+ .align 4
+L(needle_gt128):
+ UPDATE_MASK (r7, r7, r10) /* 128 - 191. */
+ UPDATE_MASK (r8, r8, r12) /* 192 - 255. */
+ b L(next_needle)
+
+
+ .align 4
+L(start_cmp):
+ /* Move and merge bitmap into 2 VRs. bpermd is slower on P8. */
+ mr r0, r3 /* Save r3 for final length computation. */
+ MTVRD (v5, r5)
+ MTVRD (v6, r6)
+ MTVRD (v7, r7)
+ MTVRD (v8, r8)
+
+ /* Continue interleaved mask generation. */
+#ifdef __LITTLE_ENDIAN__
+ vsrw v11, v2, v11 /* Note, shift ignores higher order bits. */
+ vsplth v11, v11, 0 /* Only care about the high 16 bits of v11. */
+#else
+ vslw v11, v2, v11 /* Note, shift ignores higher order bits. */
+ vsplth v11, v11, 1 /* Only care about the low 16 bits of v11. */
+#endif
+ lvx v0, 0, r3 /* Note, unaligned load ignores lower bits. */
+
+ /* Do the merging of the bitmask: v5 = r5:r6 (byte values 0-127),
+ v6 = r7:r8 (byte values 128-255). */
+ XXVR(xxmrghd, v5, v5, v6)
+ XXVR(xxmrghd, v6, v7, v8)
+
+ /* Finish mask generation. */
+ vand v11, v11, v4 /* Throwaway bits not in the mask. */
+
+ /* Compare the first 1-16B, while masking unwanted bytes. */
+ clrrdi r3, r3, 4 /* Note, counts from qw boundaries. */
+ vxor v9, v0, v1 /* Swap high bit. */
+ VBPERMQ (v8, v5, v0) /* Accept bits of bytes with value < 128. */
+ VBPERMQ (v7, v6, v9) /* Accept bits of bytes with value >= 128. */
+ vor v7, v7, v8
+ vor v7, v7, v11 /* Ignore non-participating bytes. */
+ vcmpequh. v8, v7, v4
+ bnl cr6, L(done) /* Not all 16 bits set: a byte was rejected. */
+
+ addi r3, r3, 16
+
+ /* Main loop: same test on each following aligned 16-byte block.
+ r3 is advanced before the test, hence the -16 after the loop. */
+ .align 4
+L(vec):
+ lvx v0, 0, r3
+ addi r3, r3, 16
+ vxor v9, v0, v1 /* Swap high bit. */
+ VBPERMQ (v8, v5, v0)
+ VBPERMQ (v7, v6, v9)
+ vor v7, v7, v8
+ vcmpequh. v8, v7, v4
+ blt cr6, L(vec) /* All 16 bytes accepted: next block. */
+
+ addi r3, r3, -16
+L(done):
+ subf r3, r0, r3 /* r3 = address of this block - string start. */
+ MFVRD (r10, v7) /* r10 = the block's 16 accept bits. */
+
+#ifdef __LITTLE_ENDIAN__
+ addi r0, r10, 1 /* Count the trailing 1's. */
+ andc r10, r10, r0 /* Keep only the trailing run of 1's. */
+ popcntd r10, r10
+#else
+ xori r10, r10, 0xffff /* Count leading 1's by inverting. */
+ addi r3, r3, -48 /* Account for the extra leading zeros. */
+ cntlzd r10, r10
+#endif
+
+ add r3, r3, r10 /* Block offset + accepted bytes in the block. In
+ the first block the offset is zero or negative
+ and cancels the forced-on bits of the bytes
+ that precede the string. */
+ blr
+
+END(STRSPN)
+libc_hidden_builtin_def (STRSPN)