Diffstat (limited to 'sysdeps/powerpc/powerpc64/power7')
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/add_n.S        |   2
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/bcopy.c        |   1
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/fpu/Implies    |   2
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S |   4
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S  |   4
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S  |   4
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/memchr.S       |   2
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/memcmp.S       | 872
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/memcpy.S       |  24
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/memmove.S      | 831
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/mempcpy.S      |   2
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/memrchr.S      |   4
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/memset.S       |   7
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/rawmemchr.S    |   2
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/stpcpy.S       |   2
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/stpncpy.S      |  24
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/strcasecmp.S   |   2
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/strchr.S       |   2
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/strchrnul.S    |   2
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/strcmp.S       | 164
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/strcpy.S       | 329
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/strlen.S       |   2
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/strncmp.S      |   2
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/strncpy.S      | 714
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/strnlen.S      |   2
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/strrchr.S      | 255
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/strstr.S       | 509
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/sub_n.S        |   2
28 files changed, 3056 insertions, 716 deletions
diff --git a/sysdeps/powerpc/powerpc64/power7/add_n.S b/sysdeps/powerpc/powerpc64/power7/add_n.S
index 6df442ccec..0661cbdbaf 100644
--- a/sysdeps/powerpc/powerpc64/power7/add_n.S
+++ b/sysdeps/powerpc/powerpc64/power7/add_n.S
@@ -1,6 +1,6 @@
/* PowerPC64 mpn_lshift -- mpn_add_n/mpn_sub_n -- mpn addition and
subtraction.
- Copyright (C) 2003-2014 Free Software Foundation, Inc.
+ Copyright (C) 2003-2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/powerpc/powerpc64/power7/bcopy.c b/sysdeps/powerpc/powerpc64/power7/bcopy.c
new file mode 100644
index 0000000000..4a6a400e7a
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/bcopy.c
@@ -0,0 +1 @@
+/* Implemented in memmove.S.  */
diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/Implies b/sysdeps/powerpc/powerpc64/power7/fpu/Implies
index 410d289a6d..30fa17646e 100644
--- a/sysdeps/powerpc/powerpc64/power7/fpu/Implies
+++ b/sysdeps/powerpc/powerpc64/power7/fpu/Implies
@@ -1 +1 @@
-powerpc/powerpc64/power6/fpu/multiarch
+powerpc/powerpc64/power6/fpu
diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S
index 765d68914a..0d37e54491 100644
--- a/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S
+++ b/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S
@@ -1,5 +1,5 @@
/* finite(). PowerPC64/POWER7 version.
- Copyright (C) 2010-2014 Free Software Foundation, Inc.
+ Copyright (C) 2010-2015 Free Software Foundation, Inc.
Contributed by Luis Machado <luisgpm@br.ibm.com>.
This file is part of the GNU C Library.
@@ -57,7 +57,7 @@ strong_alias (__finite, __finitef)
hidden_def (__finitef)
weak_alias (__finitef, finitef)
-#ifdef IS_IN_libm
+#if IS_IN (libm)
# if LONG_DOUBLE_COMPAT (libm, GLIBC_2_0)
compat_symbol (libm, __finite, __finitel, GLIBC_2_0)
compat_symbol (libm, finite, finitel, GLIBC_2_0)
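
[Note: the change from "#ifdef IS_IN_libm" to "#if IS_IN (libm)" (here and in s_isinf.S/s_isnan.S below) follows glibc's switch to a single function-like IS_IN macro for module checks; the upstream motivation is that the old per-module #ifdef flags fail silently when misspelled, while IS_IN is always usable. A minimal sketch of the pattern, assuming glibc's internal module machinery and illustrative only, not part of the patch:]

    /* Old style: a flag only defined when building libm, so a typo
       such as IS_IN_libmm silently compiles the code out.  */
    #ifdef IS_IN_libm
    /* libm-only compat symbols */
    #endif

    /* New style: IS_IN (m) expands to a comparison against the module
       currently being built, so the preprocessor checks every use.  */
    #if IS_IN (libm)
    /* libm-only compat symbols */
    #endif
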
diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S
index e102d4b448..b24760a953 100644
--- a/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S
+++ b/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S
@@ -1,5 +1,5 @@
/* isinf(). PowerPC64/POWER7 version.
- Copyright (C) 2010-2014 Free Software Foundation, Inc.
+ Copyright (C) 2010-2015 Free Software Foundation, Inc.
Contributed by Luis Machado <luisgpm@br.ibm.com>.
This file is part of the GNU C Library.
@@ -61,7 +61,7 @@ strong_alias (__isinf, __isinfl)
weak_alias (__isinf, isinfl)
#endif
-#ifndef IS_IN_libm
+#if !IS_IN (libm)
# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0)
compat_symbol (libc, __isinf, __isinfl, GLIBC_2_0);
compat_symbol (libc, isinf, isinfl, GLIBC_2_0);
diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S
index eabee712ea..e53779b877 100644
--- a/sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S
+++ b/sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S
@@ -1,5 +1,5 @@
/* isnan(). PowerPC64/POWER7 version.
- Copyright (C) 2010-2014 Free Software Foundation, Inc.
+ Copyright (C) 2010-2015 Free Software Foundation, Inc.
Contributed by Luis Machado <luisgpm@br.ibm.com>.
This file is part of the GNU C Library.
@@ -60,7 +60,7 @@ strong_alias (__isnan, __isnanl)
weak_alias (__isnan, isnanl)
#endif
-#ifndef IS_IN_libm
+#if !IS_IN (libm)
# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0)
compat_symbol (libc, __isnan, __isnanl, GLIBC_2_0);
compat_symbol (libc, isnan, isnanl, GLIBC_2_0);
diff --git a/sysdeps/powerpc/powerpc64/power7/memchr.S b/sysdeps/powerpc/powerpc64/power7/memchr.S
index f502ad022b..0e70921a08 100644
--- a/sysdeps/powerpc/powerpc64/power7/memchr.S
+++ b/sysdeps/powerpc/powerpc64/power7/memchr.S
@@ -1,5 +1,5 @@
/* Optimized memchr implementation for PowerPC64/POWER7 using cmpb insn.
- Copyright (C) 2010-2014 Free Software Foundation, Inc.
+ Copyright (C) 2010-2015 Free Software Foundation, Inc.
Contributed by Luis Machado <luisgpm@br.ibm.com>.
This file is part of the GNU C Library.
diff --git a/sysdeps/powerpc/powerpc64/power7/memcmp.S b/sysdeps/powerpc/powerpc64/power7/memcmp.S
index 09bff696ff..d60dfdaa18 100644
--- a/sysdeps/powerpc/powerpc64/power7/memcmp.S
+++ b/sysdeps/powerpc/powerpc64/power7/memcmp.S
@@ -1,5 +1,5 @@
/* Optimized memcmp implementation for POWER7/PowerPC64.
- Copyright (C) 2010-2014 Free Software Foundation, Inc.
+ Copyright (C) 2010-2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -26,18 +26,48 @@
EALIGN (memcmp, 4, 0)
CALL_MCOUNT 3
-#define rRTN r3
-#define rSTR1 r3 /* first string arg */
-#define rSTR2 r4 /* second string arg */
-#define rN r5 /* max string length */
-#define rWORD1 r6 /* current word in s1 */
-#define rWORD2 r7 /* current word in s2 */
-#define rWORD3 r8 /* next word in s1 */
-#define rWORD4 r9 /* next word in s2 */
-#define rWORD5 r10 /* next word in s1 */
-#define rWORD6 r11 /* next word in s2 */
-#define rWORD7 r30 /* next word in s1 */
-#define rWORD8 r31 /* next word in s2 */
+#define rRTN r3
+#define rSTR1 r3 /* first string arg */
+#define rSTR2 r4 /* second string arg */
+#define rN r5 /* max string length */
+#define rWORD1 r6 /* current word in s1 */
+#define rWORD2 r7 /* current word in s2 */
+#define rWORD3 r8 /* next word in s1 */
+#define rWORD4 r9 /* next word in s2 */
+#define rWORD5 r10 /* next word in s1 */
+#define rWORD6 r11 /* next word in s2 */
+
+#define rOFF8 r20 /* 8 bytes offset. */
+#define rOFF16 r21 /* 16 bytes offset. */
+#define rOFF24 r22 /* 24 bytes offset. */
+#define rOFF32	r23	/* 32 bytes offset.  */
+#define rWORD6_SHIFT r24 /* Left rotation temp for rWORD8. */
+#define rWORD4_SHIFT r25 /* Left rotation temp for rWORD6. */
+#define rWORD2_SHIFT r26 /* Left rotation temp for rWORD4. */
+#define rWORD8_SHIFT r27 /* Left rotation temp for rWORD2. */
+#define rSHR r28 /* Unaligned shift right count. */
+#define rSHL r29 /* Unaligned shift left count. */
+#define rWORD7 r30 /* next word in s1 */
+#define rWORD8 r31 /* next word in s2 */
+
+#define rWORD8SAVE (-8)
+#define rWORD7SAVE (-16)
+#define rOFF8SAVE (-24)
+#define rOFF16SAVE (-32)
+#define rOFF24SAVE (-40)
+#define rOFF32SAVE (-48)
+#define rSHRSAVE (-56)
+#define rSHLSAVE (-64)
+#define rWORD8SHIFTSAVE (-72)
+#define rWORD2SHIFTSAVE (-80)
+#define rWORD4SHIFTSAVE (-88)
+#define rWORD6SHIFTSAVE (-96)
+
+#ifdef __LITTLE_ENDIAN__
+# define LD ldbrx
+#else
+# define LD ldx
+#endif
xor r0, rSTR2, rSTR1
cmpldi cr6, rN, 0
@@ -51,10 +81,24 @@ EALIGN (memcmp, 4, 0)
/* If less than 8 bytes or not aligned, use the unaligned
byte loop. */
blt cr1, L(bytealigned)
- std rWORD8, -8(r1)
- cfi_offset(rWORD8, -8)
- std rWORD7, -16(r1)
- cfi_offset(rWORD7, -16)
+ std rWORD8, rWORD8SAVE(r1)
+ cfi_offset(rWORD8, rWORD8SAVE)
+ std rWORD7, rWORD7SAVE(r1)
+ cfi_offset(rWORD7, rWORD7SAVE)
+	std	rOFF8, rOFF8SAVE(r1)
+	cfi_offset(rOFF8, rOFF8SAVE)
+	std	rOFF16, rOFF16SAVE(r1)
+	cfi_offset(rOFF16, rOFF16SAVE)
+	std	rOFF24, rOFF24SAVE(r1)
+	cfi_offset(rOFF24, rOFF24SAVE)
+	std	rOFF32, rOFF32SAVE(r1)
+	cfi_offset(rOFF32, rOFF32SAVE)
+
+ li rOFF8,8
+ li rOFF16,16
+ li rOFF24,24
+ li rOFF32,32
+
bne L(unaligned)
/* At this point we know both strings have the same alignment and the
compare length is at least 8 bytes. r12 contains the low order
@@ -79,15 +123,8 @@ L(samealignment):
sldi rWORD6, r12, 3
srdi r0, rN, 5 /* Divide by 32 */
andi. r12, rN, 24 /* Get the DW remainder */
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 0(rSTR1)
- ld rWORD2, 0(rSTR2)
-#endif
+ LD rWORD1, 0, rSTR1
+ LD rWORD2, 0, rSTR2
cmpldi cr1, r12, 16
cmpldi cr7, rN, 32
clrldi rN, rN, 61
@@ -104,15 +141,8 @@ L(dsP1):
cmpld cr5, rWORD5, rWORD6
blt cr7, L(dP1x)
/* Do something useful in this cycle since we have to branch anyway. */
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 8(rSTR1)
- ld rWORD2, 8(rSTR2)
-#endif
+ LD rWORD1, rOFF8, rSTR1
+ LD rWORD2, rOFF8, rSTR2
cmpld cr7, rWORD1, rWORD2
b L(dP1e)
/* Remainder is 16 */
@@ -123,15 +153,8 @@ L(dPs2):
cmpld cr6, rWORD5, rWORD6
blt cr7, L(dP2x)
/* Do something useful in this cycle since we have to branch anyway. */
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD7, 0, rSTR1
- ldbrx rWORD8, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD7, 8(rSTR1)
- ld rWORD8, 8(rSTR2)
-#endif
+ LD rWORD7, rOFF8, rSTR1
+ LD rWORD8, rOFF8, rSTR2
cmpld cr5, rWORD7, rWORD8
b L(dP2e)
/* Remainder is 24 */
@@ -173,72 +196,43 @@ L(dP1):
change any on the early exit path. The key here is the non-early
exit path only cares about the condition code (cr5), not about which
register pair was used. */
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD5, 0, rSTR1
- ldbrx rWORD6, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD5, 0(rSTR1)
- ld rWORD6, 0(rSTR2)
-#endif
+ LD rWORD5, 0, rSTR1
+ LD rWORD6, 0, rSTR2
cmpld cr5, rWORD5, rWORD6
blt cr7, L(dP1x)
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 8(rSTR1)
- ld rWORD2, 8(rSTR2)
-#endif
+ LD rWORD1, rOFF8, rSTR1
+ LD rWORD2, rOFF8, rSTR2
cmpld cr7, rWORD1, rWORD2
L(dP1e):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD3, 0, rSTR1
- ldbrx rWORD4, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD3, 16(rSTR1)
- ld rWORD4, 16(rSTR2)
-#endif
+ LD rWORD3, rOFF16, rSTR1
+ LD rWORD4, rOFF16, rSTR2
cmpld cr1, rWORD3, rWORD4
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD5, 0, rSTR1
- ldbrx rWORD6, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD5, 24(rSTR1)
- ld rWORD6, 24(rSTR2)
-#endif
+ LD rWORD5, rOFF24, rSTR1
+ LD rWORD6, rOFF24, rSTR2
cmpld cr6, rWORD5, rWORD6
bne cr5, L(dLcr5x)
bne cr7, L(dLcr7x)
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD7, 0, rSTR1
- ldbrx rWORD8, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ldu rWORD7, 32(rSTR1)
- ldu rWORD8, 32(rSTR2)
-#endif
+ LD rWORD7, rOFF32, rSTR1
+ LD rWORD8, rOFF32, rSTR2
+ addi rSTR1, rSTR1, 32
+ addi rSTR2, rSTR2, 32
bne cr1, L(dLcr1)
cmpld cr5, rWORD7, rWORD8
bdnz L(dLoop)
bne cr6, L(dLcr6)
- ld rWORD8, -8(r1)
- ld rWORD7, -16(r1)
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
.align 3
L(dP1x):
sldi. r12, rN, 3
bne cr5, L(dLcr5x)
subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
bne L(d00)
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
li rRTN, 0
blr
@@ -246,79 +240,41 @@ L(dP1x):
.align 4
L(dP2):
mtctr r0
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD5, 0, rSTR1
- ldbrx rWORD6, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD5, 0(rSTR1)
- ld rWORD6, 0(rSTR2)
-#endif
+ LD rWORD5, 0, rSTR1
+ LD rWORD6, 0, rSTR2
cmpld cr6, rWORD5, rWORD6
blt cr7, L(dP2x)
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD7, 0, rSTR1
- ldbrx rWORD8, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD7, 8(rSTR1)
- ld rWORD8, 8(rSTR2)
-#endif
+ LD rWORD7, rOFF8, rSTR1
+ LD rWORD8, rOFF8, rSTR2
cmpld cr5, rWORD7, rWORD8
L(dP2e):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 16(rSTR1)
- ld rWORD2, 16(rSTR2)
-#endif
+ LD rWORD1, rOFF16, rSTR1
+ LD rWORD2, rOFF16, rSTR2
cmpld cr7, rWORD1, rWORD2
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD3, 0, rSTR1
- ldbrx rWORD4, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD3, 24(rSTR1)
- ld rWORD4, 24(rSTR2)
-#endif
+ LD rWORD3, rOFF24, rSTR1
+ LD rWORD4, rOFF24, rSTR2
cmpld cr1, rWORD3, rWORD4
-#ifndef __LITTLE_ENDIAN__
addi rSTR1, rSTR1, 8
addi rSTR2, rSTR2, 8
-#endif
bne cr6, L(dLcr6)
bne cr5, L(dLcr5)
b L(dLoop2)
-/* Again we are on a early exit path (16-23 byte compare), we want to
- only use volatile registers and avoid restoring non-volatile
- registers. */
.align 4
L(dP2x):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD3, 0, rSTR1
- ldbrx rWORD4, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD3, 8(rSTR1)
- ld rWORD4, 8(rSTR2)
-#endif
+ LD rWORD3, rOFF8, rSTR1
+ LD rWORD4, rOFF8, rSTR2
cmpld cr1, rWORD3, rWORD4
sldi. r12, rN, 3
bne cr6, L(dLcr6x)
-#ifndef __LITTLE_ENDIAN__
addi rSTR1, rSTR1, 8
addi rSTR2, rSTR2, 8
-#endif
bne cr1, L(dLcr1x)
subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
bne L(d00)
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
li rRTN, 0
blr
@@ -326,52 +282,22 @@ L(dP2x):
.align 4
L(dP3):
mtctr r0
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD3, 0, rSTR1
- ldbrx rWORD4, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD3, 0(rSTR1)
- ld rWORD4, 0(rSTR2)
-#endif
+ LD rWORD3, 0, rSTR1
+ LD rWORD4, 0, rSTR2
cmpld cr1, rWORD3, rWORD4
L(dP3e):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD5, 0, rSTR1
- ldbrx rWORD6, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD5, 8(rSTR1)
- ld rWORD6, 8(rSTR2)
-#endif
+ LD rWORD5, rOFF8, rSTR1
+ LD rWORD6, rOFF8, rSTR2
cmpld cr6, rWORD5, rWORD6
blt cr7, L(dP3x)
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD7, 0, rSTR1
- ldbrx rWORD8, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD7, 16(rSTR1)
- ld rWORD8, 16(rSTR2)
-#endif
+ LD rWORD7, rOFF16, rSTR1
+ LD rWORD8, rOFF16, rSTR2
cmpld cr5, rWORD7, rWORD8
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 24(rSTR1)
- ld rWORD2, 24(rSTR2)
-#endif
+ LD rWORD1, rOFF24, rSTR1
+ LD rWORD2, rOFF24, rSTR2
cmpld cr7, rWORD1, rWORD2
-#ifndef __LITTLE_ENDIAN__
addi rSTR1, rSTR1, 16
addi rSTR2, rSTR2, 16
-#endif
bne cr1, L(dLcr1)
bne cr6, L(dLcr6)
b L(dLoop1)
@@ -380,26 +306,21 @@ L(dP3e):
registers. */
.align 4
L(dP3x):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 16(rSTR1)
- ld rWORD2, 16(rSTR2)
-#endif
+ LD rWORD1, rOFF16, rSTR1
+ LD rWORD2, rOFF16, rSTR2
cmpld cr7, rWORD1, rWORD2
sldi. r12, rN, 3
bne cr1, L(dLcr1x)
-#ifndef __LITTLE_ENDIAN__
addi rSTR1, rSTR1, 16
addi rSTR2, rSTR2, 16
-#endif
bne cr6, L(dLcr6x)
subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
bne cr7, L(dLcr7x)
bne L(d00)
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
li rRTN, 0
blr
@@ -407,46 +328,20 @@ L(dP3x):
.align 4
L(dP4):
mtctr r0
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 0(rSTR1)
- ld rWORD2, 0(rSTR2)
-#endif
+ LD rWORD1, 0, rSTR1
+ LD rWORD2, 0, rSTR2
cmpld cr7, rWORD1, rWORD2
L(dP4e):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD3, 0, rSTR1
- ldbrx rWORD4, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD3, 8(rSTR1)
- ld rWORD4, 8(rSTR2)
-#endif
+ LD rWORD3, rOFF8, rSTR1
+ LD rWORD4, rOFF8, rSTR2
cmpld cr1, rWORD3, rWORD4
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD5, 0, rSTR1
- ldbrx rWORD6, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD5, 16(rSTR1)
- ld rWORD6, 16(rSTR2)
-#endif
+ LD rWORD5, rOFF16, rSTR1
+ LD rWORD6, rOFF16, rSTR2
cmpld cr6, rWORD5, rWORD6
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD7, 0, rSTR1
- ldbrx rWORD8, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ldu rWORD7, 24(rSTR1)
- ldu rWORD8, 24(rSTR2)
-#endif
+ LD rWORD7, rOFF24, rSTR1
+ LD rWORD8, rOFF24, rSTR2
+ addi rSTR1, rSTR1, 24
+ addi rSTR2, rSTR2, 24
cmpld cr5, rWORD7, rWORD8
bne cr7, L(dLcr7)
bne cr1, L(dLcr1)
@@ -454,51 +349,25 @@ L(dP4e):
/* This is the primary loop */
.align 4
L(dLoop):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 8(rSTR1)
- ld rWORD2, 8(rSTR2)
-#endif
+ LD rWORD1, rOFF8, rSTR1
+ LD rWORD2, rOFF8, rSTR2
cmpld cr1, rWORD3, rWORD4
bne cr6, L(dLcr6)
L(dLoop1):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD3, 0, rSTR1
- ldbrx rWORD4, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD3, 16(rSTR1)
- ld rWORD4, 16(rSTR2)
-#endif
+ LD rWORD3, rOFF16, rSTR1
+ LD rWORD4, rOFF16, rSTR2
cmpld cr6, rWORD5, rWORD6
bne cr5, L(dLcr5)
L(dLoop2):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD5, 0, rSTR1
- ldbrx rWORD6, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD5, 24(rSTR1)
- ld rWORD6, 24(rSTR2)
-#endif
+ LD rWORD5, rOFF24, rSTR1
+ LD rWORD6, rOFF24, rSTR2
cmpld cr5, rWORD7, rWORD8
bne cr7, L(dLcr7)
L(dLoop3):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD7, 0, rSTR1
- ldbrx rWORD8, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ldu rWORD7, 32(rSTR1)
- ldu rWORD8, 32(rSTR2)
-#endif
+ LD rWORD7, rOFF32, rSTR1
+ LD rWORD8, rOFF32, rSTR2
+ addi rSTR1, rSTR1, 32
+ addi rSTR2, rSTR2, 32
bne cr1, L(dLcr1)
cmpld cr7, rWORD1, rWORD2
bdnz L(dLoop)
@@ -519,62 +388,75 @@ L(d14):
sldi. r12, rN, 3
bne cr5, L(dLcr5)
L(d04):
- ld rWORD8, -8(r1)
- ld rWORD7, -16(r1)
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
- beq L(zeroLength)
+ beq L(duzeroLength)
/* At this point we have a remainder of 1 to 7 bytes to compare. Since
we are aligned it is safe to load the whole double word, and use
shift right double to eliminate bits beyond the compare length. */
L(d00):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 8(rSTR1)
- ld rWORD2, 8(rSTR2)
-#endif
+ LD rWORD1, rOFF8, rSTR1
+ LD rWORD2, rOFF8, rSTR2
srd rWORD1, rWORD1, rN
srd rWORD2, rWORD2, rN
cmpld cr7, rWORD1, rWORD2
bne cr7, L(dLcr7x)
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
li rRTN, 0
blr
.align 4
L(dLcr7):
- ld rWORD8, -8(r1)
- ld rWORD7, -16(r1)
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
L(dLcr7x):
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
li rRTN, 1
bgtlr cr7
li rRTN, -1
blr
.align 4
L(dLcr1):
- ld rWORD8, -8(r1)
- ld rWORD7, -16(r1)
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
L(dLcr1x):
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
li rRTN, 1
bgtlr cr1
li rRTN, -1
blr
.align 4
L(dLcr6):
- ld rWORD8, -8(r1)
- ld rWORD7, -16(r1)
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
L(dLcr6x):
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
li rRTN, 1
bgtlr cr6
li rRTN, -1
blr
.align 4
L(dLcr5):
- ld rWORD8, -8(r1)
- ld rWORD7, -16(r1)
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
L(dLcr5x):
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
li rRTN, 1
bgtlr cr5
li rRTN, -1
@@ -583,10 +465,6 @@ L(dLcr5x):
.align 4
L(bytealigned):
mtctr rN
-#if 0
-/* Huh? We've already branched on cr6! */
- beq cr6, L(zeroLength)
-#endif
/* We need to prime this loop. This loop is swing modulo scheduled
to avoid pipe delays. The dependent instruction latencies (load to
@@ -685,6 +563,7 @@ L(b11):
L(bx12):
sub rRTN, rWORD1, rWORD2
blr
+
.align 4
L(zeroLength):
li rRTN, 0
@@ -705,42 +584,36 @@ L(zeroLength):
we need to adjust the length (rN) and special case the loop
versioning for the first DW. This ensures that the loop count is
correct and the first DW (shifted) is in the expected register pair. */
-#define rSHL r29 /* Unaligned shift left count. */
-#define rSHR r28 /* Unaligned shift right count. */
-#define rWORD8_SHIFT r27 /* Left rotation temp for rWORD2. */
-#define rWORD2_SHIFT r26 /* Left rotation temp for rWORD4. */
-#define rWORD4_SHIFT r25 /* Left rotation temp for rWORD6. */
-#define rWORD6_SHIFT r24 /* Left rotation temp for rWORD8. */
L(unaligned):
- std rSHL, -24(r1)
- cfi_offset(rSHL, -24)
+ std rSHL, rSHLSAVE(r1)
+ cfi_offset(rSHL, rSHLSAVE)
clrldi rSHL, rSTR2, 61
beq cr6, L(duzeroLength)
- std rSHR, -32(r1)
- cfi_offset(rSHR, -32)
+ std rSHR, rSHRSAVE(r1)
+ cfi_offset(rSHR, rSHRSAVE)
beq cr5, L(DWunaligned)
- std rWORD8_SHIFT, -40(r1)
- cfi_offset(rWORD8_SHIFT, -40)
+ std rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
+ cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
/* Adjust the logical start of rSTR2 to compensate for the extra bits
in the 1st rSTR1 DW. */
sub rWORD8_SHIFT, rSTR2, r12
/* But do not attempt to address the DW before that DW that contains
the actual start of rSTR2. */
clrrdi rSTR2, rSTR2, 3
- std rWORD2_SHIFT, -48(r1)
- cfi_offset(rWORD2_SHIFT, -48)
+ std rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
+ cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
/* Compute the left/right shift counts for the unaligned rSTR2,
compensating for the logical (DW aligned) start of rSTR1. */
clrldi rSHL, rWORD8_SHIFT, 61
clrrdi rSTR1, rSTR1, 3
- std rWORD4_SHIFT, -56(r1)
- cfi_offset(rWORD4_SHIFT, -56)
+ std rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
+ cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
sldi rSHL, rSHL, 3
cmpld cr5, rWORD8_SHIFT, rSTR2
add rN, rN, r12
sldi rWORD6, r12, 3
- std rWORD6_SHIFT, -64(r1)
- cfi_offset(rWORD6_SHIFT, -64)
+ std rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
+ cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
subfic rSHR, rSHL, 64
srdi r0, rN, 5 /* Divide by 32 */
andi. r12, rN, 24 /* Get the DW remainder */
@@ -750,25 +623,13 @@ L(unaligned):
this may cross a page boundary and cause a page fault. */
li rWORD8, 0
blt cr5, L(dus0)
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD8, 0, rSTR2
+ LD rWORD8, 0, rSTR2
addi rSTR2, rSTR2, 8
-#else
- ld rWORD8, 0(rSTR2)
- addi rSTR2, rSTR2, 8
-#endif
sld rWORD8, rWORD8, rSHL
L(dus0):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 0(rSTR1)
- ld rWORD2, 0(rSTR2)
-#endif
+ LD rWORD1, 0, rSTR1
+ LD rWORD2, 0, rSTR2
cmpldi cr1, r12, 16
cmpldi cr7, rN, 32
srd r12, rWORD2, rSHR
@@ -796,12 +657,7 @@ L(dusP1):
beq L(duZeroReturn)
li r0, 0
ble cr7, L(dutrim)
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD2, 0, rSTR2
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD2, 8(rSTR2)
-#endif
+ LD rWORD2, rOFF8, rSTR2
srd r0, rWORD2, rSHR
b L(dutrim)
/* Remainder is 16 */
@@ -832,27 +688,21 @@ L(duPs4):
compare length is at least 8 bytes. */
.align 4
L(DWunaligned):
- std rWORD8_SHIFT, -40(r1)
- cfi_offset(rWORD8_SHIFT, -40)
+ std rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
+ cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
clrrdi rSTR2, rSTR2, 3
- std rWORD2_SHIFT, -48(r1)
- cfi_offset(rWORD2_SHIFT, -48)
+ std rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
+ cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
srdi r0, rN, 5 /* Divide by 32 */
- std rWORD4_SHIFT, -56(r1)
- cfi_offset(rWORD4_SHIFT, -56)
+ std rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
+ cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
andi. r12, rN, 24 /* Get the DW remainder */
- std rWORD6_SHIFT, -64(r1)
- cfi_offset(rWORD6_SHIFT, -64)
+ std rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
+ cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
sldi rSHL, rSHL, 3
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD6, 0, rSTR2
+ LD rWORD6, 0, rSTR2
+ LD rWORD8, rOFF8, rSTR2
addi rSTR2, rSTR2, 8
- ldbrx rWORD8, 0, rSTR2
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD6, 0(rSTR2)
- ldu rWORD8, 8(rSTR2)
-#endif
cmpldi cr1, r12, 16
cmpldi cr7, rN, 32
clrldi rN, rN, 61
@@ -867,52 +717,26 @@ L(DWunaligned):
.align 4
L(duP1):
srd r12, rWORD8, rSHR
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD7, 0, rSTR1
- addi rSTR1, rSTR1, 8
-#else
- ld rWORD7, 0(rSTR1)
-#endif
+ LD rWORD7, 0, rSTR1
sld rWORD8_SHIFT, rWORD8, rSHL
or rWORD8, r12, rWORD6_SHIFT
blt cr7, L(duP1x)
L(duP1e):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 8(rSTR1)
- ld rWORD2, 8(rSTR2)
-#endif
+ LD rWORD1, rOFF8, rSTR1
+ LD rWORD2, rOFF8, rSTR2
cmpld cr5, rWORD7, rWORD8
srd r0, rWORD2, rSHR
sld rWORD2_SHIFT, rWORD2, rSHL
or rWORD2, r0, rWORD8_SHIFT
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD3, 0, rSTR1
- ldbrx rWORD4, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD3, 16(rSTR1)
- ld rWORD4, 16(rSTR2)
-#endif
+ LD rWORD3, rOFF16, rSTR1
+ LD rWORD4, rOFF16, rSTR2
cmpld cr7, rWORD1, rWORD2
srd r12, rWORD4, rSHR
sld rWORD4_SHIFT, rWORD4, rSHL
bne cr5, L(duLcr5)
or rWORD4, r12, rWORD2_SHIFT
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD5, 0, rSTR1
- ldbrx rWORD6, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD5, 24(rSTR1)
- ld rWORD6, 24(rSTR2)
-#endif
+ LD rWORD5, rOFF24, rSTR1
+ LD rWORD6, rOFF24, rSTR2
cmpld cr1, rWORD3, rWORD4
srd r0, rWORD6, rSHR
sld rWORD6_SHIFT, rWORD6, rSHL
@@ -932,82 +756,47 @@ L(duP1x):
beq L(duZeroReturn)
li r0, 0
ble cr7, L(dutrim)
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD2, 0, rSTR2
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD2, 8(rSTR2)
-#endif
+ LD rWORD2, rOFF8, rSTR2
srd r0, rWORD2, rSHR
b L(dutrim)
/* Remainder is 16 */
.align 4
L(duP2):
srd r0, rWORD8, rSHR
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD5, 0, rSTR1
- addi rSTR1, rSTR1, 8
-#else
- ld rWORD5, 0(rSTR1)
-#endif
+ LD rWORD5, 0, rSTR1
or rWORD6, r0, rWORD6_SHIFT
sld rWORD6_SHIFT, rWORD8, rSHL
L(duP2e):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD7, 0, rSTR1
- ldbrx rWORD8, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD7, 8(rSTR1)
- ld rWORD8, 8(rSTR2)
-#endif
+ LD rWORD7, rOFF8, rSTR1
+ LD rWORD8, rOFF8, rSTR2
cmpld cr6, rWORD5, rWORD6
srd r12, rWORD8, rSHR
sld rWORD8_SHIFT, rWORD8, rSHL
or rWORD8, r12, rWORD6_SHIFT
blt cr7, L(duP2x)
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 16(rSTR1)
- ld rWORD2, 16(rSTR2)
-#endif
+ LD rWORD1, rOFF16, rSTR1
+ LD rWORD2, rOFF16, rSTR2
cmpld cr5, rWORD7, rWORD8
bne cr6, L(duLcr6)
srd r0, rWORD2, rSHR
sld rWORD2_SHIFT, rWORD2, rSHL
or rWORD2, r0, rWORD8_SHIFT
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD3, 0, rSTR1
- ldbrx rWORD4, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD3, 24(rSTR1)
- ld rWORD4, 24(rSTR2)
-#endif
+ LD rWORD3, rOFF24, rSTR1
+ LD rWORD4, rOFF24, rSTR2
cmpld cr7, rWORD1, rWORD2
bne cr5, L(duLcr5)
srd r12, rWORD4, rSHR
sld rWORD4_SHIFT, rWORD4, rSHL
or rWORD4, r12, rWORD2_SHIFT
-#ifndef __LITTLE_ENDIAN__
addi rSTR1, rSTR1, 8
addi rSTR2, rSTR2, 8
-#endif
cmpld cr1, rWORD3, rWORD4
b L(duLoop2)
.align 4
L(duP2x):
cmpld cr5, rWORD7, rWORD8
-#ifndef __LITTLE_ENDIAN__
addi rSTR1, rSTR1, 8
addi rSTR2, rSTR2, 8
-#endif
bne cr6, L(duLcr6)
sldi. rN, rN, 3
bne cr5, L(duLcr5)
@@ -1015,12 +804,7 @@ L(duP2x):
beq L(duZeroReturn)
li r0, 0
ble cr7, L(dutrim)
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD2, 0, rSTR2
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD2, 8(rSTR2)
-#endif
+ LD rWORD2, rOFF8, rSTR2
srd r0, rWORD2, rSHR
b L(dutrim)
@@ -1028,73 +812,39 @@ L(duP2x):
.align 4
L(duP3):
srd r12, rWORD8, rSHR
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD3, 0, rSTR1
- addi rSTR1, rSTR1, 8
-#else
- ld rWORD3, 0(rSTR1)
-#endif
+ LD rWORD3, 0, rSTR1
sld rWORD4_SHIFT, rWORD8, rSHL
or rWORD4, r12, rWORD6_SHIFT
L(duP3e):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD5, 0, rSTR1
- ldbrx rWORD6, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD5, 8(rSTR1)
- ld rWORD6, 8(rSTR2)
-#endif
+ LD rWORD5, rOFF8, rSTR1
+ LD rWORD6, rOFF8, rSTR2
cmpld cr1, rWORD3, rWORD4
srd r0, rWORD6, rSHR
sld rWORD6_SHIFT, rWORD6, rSHL
or rWORD6, r0, rWORD4_SHIFT
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD7, 0, rSTR1
- ldbrx rWORD8, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD7, 16(rSTR1)
- ld rWORD8, 16(rSTR2)
-#endif
+ LD rWORD7, rOFF16, rSTR1
+ LD rWORD8, rOFF16, rSTR2
cmpld cr6, rWORD5, rWORD6
bne cr1, L(duLcr1)
srd r12, rWORD8, rSHR
sld rWORD8_SHIFT, rWORD8, rSHL
or rWORD8, r12, rWORD6_SHIFT
blt cr7, L(duP3x)
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 24(rSTR1)
- ld rWORD2, 24(rSTR2)
-#endif
+ LD rWORD1, rOFF24, rSTR1
+ LD rWORD2, rOFF24, rSTR2
cmpld cr5, rWORD7, rWORD8
bne cr6, L(duLcr6)
srd r0, rWORD2, rSHR
sld rWORD2_SHIFT, rWORD2, rSHL
or rWORD2, r0, rWORD8_SHIFT
-#ifndef __LITTLE_ENDIAN__
addi rSTR1, rSTR1, 16
addi rSTR2, rSTR2, 16
-#endif
cmpld cr7, rWORD1, rWORD2
b L(duLoop1)
.align 4
L(duP3x):
-#ifndef __LITTLE_ENDIAN__
addi rSTR1, rSTR1, 16
addi rSTR2, rSTR2, 16
-#endif
-#if 0
-/* Huh? We've already branched on cr1! */
- bne cr1, L(duLcr1)
-#endif
cmpld cr5, rWORD7, rWORD8
bne cr6, L(duLcr6)
sldi. rN, rN, 3
@@ -1103,12 +853,7 @@ L(duP3x):
beq L(duZeroReturn)
li r0, 0
ble cr7, L(dutrim)
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD2, 0, rSTR2
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD2, 8(rSTR2)
-#endif
+ LD rWORD2, rOFF8, rSTR2
srd r0, rWORD2, rSHR
b L(dutrim)
@@ -1117,51 +862,27 @@ L(duP3x):
L(duP4):
mtctr r0
srd r0, rWORD8, rSHR
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- addi rSTR1, rSTR1, 8
-#else
- ld rWORD1, 0(rSTR1)
-#endif
+ LD rWORD1, 0, rSTR1
sld rWORD2_SHIFT, rWORD8, rSHL
or rWORD2, r0, rWORD6_SHIFT
L(duP4e):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD3, 0, rSTR1
- ldbrx rWORD4, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD3, 8(rSTR1)
- ld rWORD4, 8(rSTR2)
-#endif
+ LD rWORD3, rOFF8, rSTR1
+ LD rWORD4, rOFF8, rSTR2
cmpld cr7, rWORD1, rWORD2
srd r12, rWORD4, rSHR
sld rWORD4_SHIFT, rWORD4, rSHL
or rWORD4, r12, rWORD2_SHIFT
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD5, 0, rSTR1
- ldbrx rWORD6, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD5, 16(rSTR1)
- ld rWORD6, 16(rSTR2)
-#endif
+ LD rWORD5, rOFF16, rSTR1
+ LD rWORD6, rOFF16, rSTR2
cmpld cr1, rWORD3, rWORD4
bne cr7, L(duLcr7)
srd r0, rWORD6, rSHR
sld rWORD6_SHIFT, rWORD6, rSHL
or rWORD6, r0, rWORD4_SHIFT
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD7, 0, rSTR1
- ldbrx rWORD8, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ldu rWORD7, 24(rSTR1)
- ldu rWORD8, 24(rSTR2)
-#endif
+ LD rWORD7, rOFF24, rSTR1
+ LD rWORD8, rOFF24, rSTR2
+ addi rSTR1, rSTR1, 24
+ addi rSTR2, rSTR2, 24
cmpld cr6, rWORD5, rWORD6
bne cr1, L(duLcr1)
srd r12, rWORD8, rSHR
@@ -1172,60 +893,34 @@ L(duP4e):
/* This is the primary loop */
.align 4
L(duLoop):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 8(rSTR1)
- ld rWORD2, 8(rSTR2)
-#endif
+ LD rWORD1, rOFF8, rSTR1
+ LD rWORD2, rOFF8, rSTR2
cmpld cr1, rWORD3, rWORD4
bne cr6, L(duLcr6)
srd r0, rWORD2, rSHR
sld rWORD2_SHIFT, rWORD2, rSHL
or rWORD2, r0, rWORD8_SHIFT
L(duLoop1):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD3, 0, rSTR1
- ldbrx rWORD4, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD3, 16(rSTR1)
- ld rWORD4, 16(rSTR2)
-#endif
+ LD rWORD3, rOFF16, rSTR1
+ LD rWORD4, rOFF16, rSTR2
cmpld cr6, rWORD5, rWORD6
bne cr5, L(duLcr5)
srd r12, rWORD4, rSHR
sld rWORD4_SHIFT, rWORD4, rSHL
or rWORD4, r12, rWORD2_SHIFT
L(duLoop2):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD5, 0, rSTR1
- ldbrx rWORD6, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD5, 24(rSTR1)
- ld rWORD6, 24(rSTR2)
-#endif
+ LD rWORD5, rOFF24, rSTR1
+ LD rWORD6, rOFF24, rSTR2
cmpld cr5, rWORD7, rWORD8
bne cr7, L(duLcr7)
srd r0, rWORD6, rSHR
sld rWORD6_SHIFT, rWORD6, rSHL
or rWORD6, r0, rWORD4_SHIFT
L(duLoop3):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD7, 0, rSTR1
- ldbrx rWORD8, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ldu rWORD7, 32(rSTR1)
- ldu rWORD8, 32(rSTR2)
-#endif
+ LD rWORD7, rOFF32, rSTR1
+ LD rWORD8, rOFF32, rSTR2
+ addi rSTR1, rSTR1, 32
+ addi rSTR2, rSTR2, 32
cmpld cr7, rWORD1, rWORD2
bne cr1, L(duLcr1)
srd r12, rWORD8, rSHR
@@ -1234,10 +929,6 @@ L(duLoop3):
bdnz L(duLoop)
L(duL4):
-#if 0
-/* Huh? We've already branched on cr1! */
- bne cr1, L(duLcr1)
-#endif
cmpld cr1, rWORD3, rWORD4
bne cr6, L(duLcr6)
cmpld cr6, rWORD5, rWORD6
@@ -1264,99 +955,102 @@ L(du14):
beq L(duZeroReturn)
li r0, 0
ble cr7, L(dutrim)
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD2, 0, rSTR2
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD2, 8(rSTR2)
-#endif
+ LD rWORD2, rOFF8, rSTR2
srd r0, rWORD2, rSHR
.align 4
L(dutrim):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
-#else
- ld rWORD1, 8(rSTR1)
-#endif
+ LD rWORD1, rOFF8, rSTR1
ld rWORD8, -8(r1)
subfic rN, rN, 64 /* Shift count is 64 - (rN * 8). */
or rWORD2, r0, rWORD8_SHIFT
- ld rWORD7, -16(r1)
- ld rSHL, -24(r1)
+ ld rWORD7, rWORD7SAVE(r1)
+ ld rSHL, rSHLSAVE(r1)
srd rWORD1, rWORD1, rN
srd rWORD2, rWORD2, rN
- ld rSHR, -32(r1)
- ld rWORD8_SHIFT, -40(r1)
+ ld rSHR, rSHRSAVE(r1)
+ ld rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
li rRTN, 0
cmpld cr7, rWORD1, rWORD2
- ld rWORD2_SHIFT, -48(r1)
- ld rWORD4_SHIFT, -56(r1)
+ ld rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
+ ld rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
beq cr7, L(dureturn24)
li rRTN, 1
- ld rWORD6_SHIFT, -64(r1)
+ ld rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
bgtlr cr7
li rRTN, -1
blr
.align 4
L(duLcr7):
- ld rWORD8, -8(r1)
- ld rWORD7, -16(r1)
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
li rRTN, 1
bgt cr7, L(dureturn29)
- ld rSHL, -24(r1)
- ld rSHR, -32(r1)
+ ld rSHL, rSHLSAVE(r1)
+ ld rSHR, rSHRSAVE(r1)
li rRTN, -1
b L(dureturn27)
.align 4
L(duLcr1):
- ld rWORD8, -8(r1)
- ld rWORD7, -16(r1)
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
li rRTN, 1
bgt cr1, L(dureturn29)
- ld rSHL, -24(r1)
- ld rSHR, -32(r1)
+ ld rSHL, rSHLSAVE(r1)
+ ld rSHR, rSHRSAVE(r1)
li rRTN, -1
b L(dureturn27)
.align 4
L(duLcr6):
- ld rWORD8, -8(r1)
- ld rWORD7, -16(r1)
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
li rRTN, 1
bgt cr6, L(dureturn29)
- ld rSHL, -24(r1)
- ld rSHR, -32(r1)
+ ld rSHL, rSHLSAVE(r1)
+ ld rSHR, rSHRSAVE(r1)
li rRTN, -1
b L(dureturn27)
.align 4
L(duLcr5):
- ld rWORD8, -8(r1)
- ld rWORD7, -16(r1)
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
li rRTN, 1
bgt cr5, L(dureturn29)
- ld rSHL, -24(r1)
- ld rSHR, -32(r1)
+ ld rSHL, rSHLSAVE(r1)
+ ld rSHR, rSHRSAVE(r1)
li rRTN, -1
b L(dureturn27)
+
.align 3
L(duZeroReturn):
li rRTN, 0
.align 4
L(dureturn):
- ld rWORD8, -8(r1)
- ld rWORD7, -16(r1)
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
L(dureturn29):
- ld rSHL, -24(r1)
- ld rSHR, -32(r1)
+ ld rSHL, rSHLSAVE(r1)
+ ld rSHR, rSHRSAVE(r1)
L(dureturn27):
- ld rWORD8_SHIFT, -40(r1)
-L(dureturn26):
- ld rWORD2_SHIFT, -48(r1)
-L(dureturn25):
- ld rWORD4_SHIFT, -56(r1)
+ ld rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
+ ld rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
+ ld rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
L(dureturn24):
- ld rWORD6_SHIFT, -64(r1)
+ ld rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
blr
+
L(duzeroLength):
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
li rRTN, 0
blr
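
[Note: the LD macro introduced above selects ldbrx on little-endian and ldx on big-endian, so every 8-byte chunk is loaded with its lowest-addressed byte in the most significant position; one unsigned doubleword compare then orders the buffers exactly as a byte-by-byte memcmp would. A hedged C sketch of the same idea, with __builtin_bswap64 standing in for ldbrx; illustrative, not glibc code:]

    #include <stdint.h>
    #include <string.h>

    /* Compare one 8-byte chunk the way the assembly does: put the
       lowest-addressed byte into the most significant bits, then
       compare as unsigned 64-bit integers.  */
    static int
    cmp_chunk (const unsigned char *a, const unsigned char *b)
    {
      uint64_t wa, wb;
      memcpy (&wa, a, 8);
      memcpy (&wb, b, 8);
    #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
      wa = __builtin_bswap64 (wa);   /* What ldbrx does in hardware.  */
      wb = __builtin_bswap64 (wb);
    #endif
      return wa < wb ? -1 : wa > wb ? 1 : 0;
    }
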
diff --git a/sysdeps/powerpc/powerpc64/power7/memcpy.S b/sysdeps/powerpc/powerpc64/power7/memcpy.S
index bbfd381b1b..8c8834e2a2 100644
--- a/sysdeps/powerpc/powerpc64/power7/memcpy.S
+++ b/sysdeps/powerpc/powerpc64/power7/memcpy.S
@@ -1,5 +1,5 @@
/* Optimized memcpy implementation for PowerPC64/POWER7.
- Copyright (C) 2010-2014 Free Software Foundation, Inc.
+ Copyright (C) 2010-2015 Free Software Foundation, Inc.
Contributed by Luis Machado <luisgpm@br.ibm.com>.
This file is part of the GNU C Library.
@@ -36,16 +36,11 @@ EALIGN (memcpy, 5, 0)
ble cr1, L(copy_LT_32) /* If move < 32 bytes use short move
code. */
-#ifdef __LITTLE_ENDIAN__
-/* In little-endian mode, power7 takes an alignment trap on any lxvd2x
- or stxvd2x crossing a 32-byte boundary, so ensure the aligned_copy
- loop is only used for quadword aligned copies. */
+/* Align copies that use VSX instructions to quadwords.  This avoids
+   alignment traps when memcpy is used on non-cacheable memory (for
+   instance, memory-mapped I/O).  */
andi. 10,3,15
clrldi 11,4,60
-#else
- andi. 10,3,7 /* Check alignment of DST. */
- clrldi 11,4,61 /* Check alignment of SRC. */
-#endif
cmpld cr6,10,11 /* SRC and DST alignments match? */
mr dst,3
@@ -53,13 +48,9 @@ EALIGN (memcpy, 5, 0)
beq L(aligned_copy)
mtocrf 0x01,0
-#ifdef __LITTLE_ENDIAN__
clrldi 0,0,60
-#else
- clrldi 0,0,61
-#endif
-/* Get the DST and SRC aligned to 8 bytes (16 for little-endian). */
+/* Get the DST and SRC aligned to 16 bytes. */
1:
bf 31,2f
lbz 6,0(src)
@@ -79,14 +70,12 @@ EALIGN (memcpy, 5, 0)
stw 6,0(dst)
addi dst,dst,4
8:
-#ifdef __LITTLE_ENDIAN__
bf 28,16f
ld 6,0(src)
addi src,src,8
std 6,0(dst)
addi dst,dst,8
16:
-#endif
subf cnt,0,cnt
/* Main aligned copy loop. Copies 128 bytes at a time. */
@@ -298,9 +287,6 @@ L(copy_LE_8):
.align 4
L(copy_GE_32_unaligned):
clrldi 0,0,60 /* Number of bytes until the 1st dst quadword. */
-#ifndef __LITTLE_ENDIAN__
- andi. 10,3,15 /* Check alignment of DST (against quadwords). */
-#endif
srdi 9,cnt,4 /* Number of full quadwords remaining. */
beq L(copy_GE_32_unaligned_cont)
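
[Note: the memcpy change above drops the endianness conditionals and always aligns both pointers to 16 bytes before the lxvd2x/stxvd2x loop, so the VSX accesses never trap on non-cacheable mappings. A sketch of the alignment test the prologue performs, mirroring "andi. 10,3,15" and "clrldi 11,4,60"; illustrative C under that reading:]

    #include <stdint.h>

    /* The aligned path is taken only when DST and SRC have the same
       address residue modulo 16; after copying 0-15 head bytes, both
       pointers are then quadword aligned.  */
    static inline int
    same_quadword_phase (const void *dst, const void *src)
    {
      return ((uintptr_t) dst & 15) == ((uintptr_t) src & 15);
    }
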
diff --git a/sysdeps/powerpc/powerpc64/power7/memmove.S b/sysdeps/powerpc/powerpc64/power7/memmove.S
new file mode 100644
index 0000000000..3bd4b4bb3f
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/memmove.S
@@ -0,0 +1,831 @@
+/* Optimized memmove implementation for PowerPC64/POWER7.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+
+/* void* [r3] memmove (void *dest [r3], const void *src [r4], size_t len [r5])
+
+ This optimization checks whether 'dest' overlaps with 'src'. If it does
+ not, it performs an optimized forward copy (the POWER7 memcpy, embedded
+ here to save some cycles).
+ If source and destination overlap, an optimized backwards copy is used
+ instead. */
+
+ .machine power7
+EALIGN (memmove, 5, 0)
+ CALL_MCOUNT 3
+
+L(_memmove):
+ subf r9,r4,r3
+ cmpld cr7,r9,r5
+ blt cr7,L(memmove_bwd)
+
+ cmpldi cr1,r5,31
+ neg 0,3
+ ble cr1, L(copy_LT_32) /* If move < 32 bytes use short move
+ code. */
+
+ andi. 10,3,15
+ clrldi 11,4,60
+ cmpld cr6,10,11 /* SRC and DST alignments match? */
+
+ mr r11,3
+ bne cr6,L(copy_GE_32_unaligned)
+ beq L(aligned_copy)
+
+ mtocrf 0x01,0
+ clrldi 0,0,60
+
+/* Get the DST and SRC aligned to 16 bytes. */
+1:
+ bf 31,2f
+ lbz 6,0(r4)
+ addi r4,r4,1
+ stb 6,0(r11)
+ addi r11,r11,1
+2:
+ bf 30,4f
+ lhz 6,0(r4)
+ addi r4,r4,2
+ sth 6,0(r11)
+ addi r11,r11,2
+4:
+ bf 29,8f
+ lwz 6,0(r4)
+ addi r4,r4,4
+ stw 6,0(r11)
+ addi r11,r11,4
+8:
+ bf 28,16f
+ ld 6,0(r4)
+ addi r4,r4,8
+ std 6,0(r11)
+ addi r11,r11,8
+16:
+ subf r5,0,r5
+
+/* Main aligned copy loop. Copies 128 bytes at a time. */
+L(aligned_copy):
+ li 6,16
+ li 7,32
+ li 8,48
+ mtocrf 0x02,r5
+ srdi 12,r5,7
+ cmpdi 12,0
+ beq L(aligned_tail)
+ lxvd2x 6,0,r4
+ lxvd2x 7,r4,6
+ mtctr 12
+ b L(aligned_128loop)
+
+ .align 4
+L(aligned_128head):
+ /* for the 2nd + iteration of this loop. */
+ lxvd2x 6,0,r4
+ lxvd2x 7,r4,6
+L(aligned_128loop):
+ lxvd2x 8,r4,7
+ lxvd2x 9,r4,8
+ stxvd2x 6,0,r11
+ addi r4,r4,64
+ stxvd2x 7,r11,6
+ stxvd2x 8,r11,7
+ stxvd2x 9,r11,8
+ lxvd2x 6,0,r4
+ lxvd2x 7,r4,6
+ addi r11,r11,64
+ lxvd2x 8,r4,7
+ lxvd2x 9,r4,8
+ addi r4,r4,64
+ stxvd2x 6,0,r11
+ stxvd2x 7,r11,6
+ stxvd2x 8,r11,7
+ stxvd2x 9,r11,8
+ addi r11,r11,64
+ bdnz L(aligned_128head)
+
+L(aligned_tail):
+ mtocrf 0x01,r5
+ bf 25,32f
+ lxvd2x 6,0,r4
+ lxvd2x 7,r4,6
+ lxvd2x 8,r4,7
+ lxvd2x 9,r4,8
+ addi r4,r4,64
+ stxvd2x 6,0,r11
+ stxvd2x 7,r11,6
+ stxvd2x 8,r11,7
+ stxvd2x 9,r11,8
+ addi r11,r11,64
+32:
+ bf 26,16f
+ lxvd2x 6,0,r4
+ lxvd2x 7,r4,6
+ addi r4,r4,32
+ stxvd2x 6,0,r11
+ stxvd2x 7,r11,6
+ addi r11,r11,32
+16:
+ bf 27,8f
+ lxvd2x 6,0,r4
+ addi r4,r4,16
+ stxvd2x 6,0,r11
+ addi r11,r11,16
+8:
+ bf 28,4f
+ ld 6,0(r4)
+ addi r4,r4,8
+ std 6,0(r11)
+ addi r11,r11,8
+4: /* Copies 4~7 bytes. */
+ bf 29,L(tail2)
+ lwz 6,0(r4)
+ stw 6,0(r11)
+ bf 30,L(tail5)
+ lhz 7,4(r4)
+ sth 7,4(r11)
+ bflr 31
+ lbz 8,6(r4)
+ stb 8,6(r11)
+ /* Return original DST pointer. */
+ blr
+
+/* Handle copies of 0~31 bytes. */
+ .align 4
+L(copy_LT_32):
+ mr r11,3
+ cmpldi cr6,r5,8
+ mtocrf 0x01,r5
+ ble cr6,L(copy_LE_8)
+
+ /* At least 9 bytes to go. */
+ neg 8,4
+ andi. 0,8,3
+ cmpldi cr1,r5,16
+ beq L(copy_LT_32_aligned)
+
+ /* Force 4-byte alignment for SRC. */
+ mtocrf 0x01,0
+ subf r5,0,r5
+2:
+ bf 30,1f
+ lhz 6,0(r4)
+ addi r4,r4,2
+ sth 6,0(r11)
+ addi r11,r11,2
+1:
+ bf 31,L(end_4bytes_alignment)
+ lbz 6,0(r4)
+ addi r4,r4,1
+ stb 6,0(r11)
+ addi r11,r11,1
+
+ .align 4
+L(end_4bytes_alignment):
+ cmpldi cr1,r5,16
+ mtocrf 0x01,r5
+
+L(copy_LT_32_aligned):
+ /* At least 6 bytes to go, and SRC is word-aligned. */
+ blt cr1,8f
+
+ /* Copy 16 bytes. */
+ lwz 6,0(r4)
+ lwz 7,4(r4)
+ stw 6,0(r11)
+ lwz 8,8(r4)
+ stw 7,4(r11)
+ lwz 6,12(r4)
+ addi r4,r4,16
+ stw 8,8(r11)
+ stw 6,12(r11)
+ addi r11,r11,16
+8: /* Copy 8 bytes. */
+ bf 28,L(tail4)
+ lwz 6,0(r4)
+ lwz 7,4(r4)
+ addi r4,r4,8
+ stw 6,0(r11)
+ stw 7,4(r11)
+ addi r11,r11,8
+
+ .align 4
+/* Copies 4~7 bytes. */
+L(tail4):
+ bf 29,L(tail2)
+ lwz 6,0(r4)
+ stw 6,0(r11)
+ bf 30,L(tail5)
+ lhz 7,4(r4)
+ sth 7,4(r11)
+ bflr 31
+ lbz 8,6(r4)
+ stb 8,6(r11)
+ /* Return original DST pointer. */
+ blr
+
+ .align 4
+/* Copies 2~3 bytes. */
+L(tail2):
+ bf 30,1f
+ lhz 6,0(r4)
+ sth 6,0(r11)
+ bflr 31
+ lbz 7,2(r4)
+ stb 7,2(r11)
+ blr
+
+ .align 4
+L(tail5):
+ bflr 31
+ lbz 6,4(r4)
+ stb 6,4(r11)
+ blr
+
+ .align 4
+1:
+ bflr 31
+ lbz 6,0(r4)
+ stb 6,0(r11)
+ /* Return original DST pointer. */
+ blr
+
+/* Handles copies of 0~8 bytes. */
+ .align 4
+L(copy_LE_8):
+ bne cr6,L(tail4)
+
+ /* Though we could've used ld/std here, they are still
+ slow for unaligned cases. */
+
+ lwz 6,0(r4)
+ lwz 7,4(r4)
+ stw 6,0(r11)
+ stw 7,4(r11)
+ blr
+
+
+/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
+ SRC is not. Use aligned quadword loads from SRC, shifted to realign
+ the data, allowing for aligned DST stores. */
+ .align 4
+L(copy_GE_32_unaligned):
+ clrldi 0,0,60 /* Number of bytes until the 1st r11 quadword. */
+ srdi 9,r5,4 /* Number of full quadwords remaining. */
+
+ beq L(copy_GE_32_unaligned_cont)
+
+ /* DST is not quadword aligned, get it aligned. */
+
+ mtocrf 0x01,0
+ subf r5,0,r5
+
+ /* Vector instructions work best when proper alignment (16-bytes)
+ is present. Move 0~15 bytes as needed to get DST quadword-aligned. */
+1:
+ bf 31,2f
+ lbz 6,0(r4)
+ addi r4,r4,1
+ stb 6,0(r11)
+ addi r11,r11,1
+2:
+ bf 30,4f
+ lhz 6,0(r4)
+ addi r4,r4,2
+ sth 6,0(r11)
+ addi r11,r11,2
+4:
+ bf 29,8f
+ lwz 6,0(r4)
+ addi r4,r4,4
+ stw 6,0(r11)
+ addi r11,r11,4
+8:
+ bf 28,0f
+ ld 6,0(r4)
+ addi r4,r4,8
+ std 6,0(r11)
+ addi r11,r11,8
+0:
+ srdi 9,r5,4 /* Number of full quadwords remaining. */
+
+ /* The proper alignment is present; it is OK to copy the bytes now. */
+L(copy_GE_32_unaligned_cont):
+
+ /* Setup two indexes to speed up the indexed vector operations. */
+ clrldi 10,r5,60
+ li 6,16 /* Index for 16-bytes offsets. */
+ li 7,32 /* Index for 32-bytes offsets. */
+ cmpldi cr1,10,0
+ srdi 8,r5,5 /* Setup the loop counter. */
+ mtocrf 0x01,9
+ cmpldi cr6,9,1
+#ifdef __LITTLE_ENDIAN__
+ lvsr 5,0,r4
+#else
+ lvsl 5,0,r4
+#endif
+ lvx 3,0,r4
+ li 0,0
+ bf 31,L(setup_unaligned_loop)
+
+ /* Copy another 16 bytes to align to 32-bytes due to the loop. */
+ lvx 4,r4,6
+#ifdef __LITTLE_ENDIAN__
+ vperm 6,4,3,5
+#else
+ vperm 6,3,4,5
+#endif
+ addi r4,r4,16
+ stvx 6,0,r11
+ addi r11,r11,16
+ vor 3,4,4
+ clrrdi 0,r4,60
+
+L(setup_unaligned_loop):
+ mtctr 8
+ ble cr6,L(end_unaligned_loop)
+
+ /* Copy 32 bytes at a time using vector instructions. */
+ .align 4
+L(unaligned_loop):
+
+ /* Note: vr6/vr10 may contain data that was already copied,
+ but in order to get proper alignment, we may have to copy
+ some portions again. This is faster than having unaligned
+ vector instructions though. */
+
+ lvx 4,r4,6
+#ifdef __LITTLE_ENDIAN__
+ vperm 6,4,3,5
+#else
+ vperm 6,3,4,5
+#endif
+ lvx 3,r4,7
+#ifdef __LITTLE_ENDIAN__
+ vperm 10,3,4,5
+#else
+ vperm 10,4,3,5
+#endif
+ addi r4,r4,32
+ stvx 6,0,r11
+ stvx 10,r11,6
+ addi r11,r11,32
+ bdnz L(unaligned_loop)
+
+ clrrdi 0,r4,60
+
+ .align 4
+L(end_unaligned_loop):
+
+ /* Check for tail bytes. */
+ mtocrf 0x01,r5
+ beqlr cr1
+
+ add r4,r4,0
+
+ /* We have 1~15 tail bytes to copy, and DST is quadword aligned. */
+ /* Copy 8 bytes. */
+ bf 28,4f
+ lwz 6,0(r4)
+ lwz 7,4(r4)
+ addi r4,r4,8
+ stw 6,0(r11)
+ stw 7,4(r11)
+ addi r11,r11,8
+4: /* Copy 4~7 bytes. */
+ bf 29,L(tail2)
+ lwz 6,0(r4)
+ stw 6,0(r11)
+ bf 30,L(tail5)
+ lhz 7,4(r4)
+ sth 7,4(r11)
+ bflr 31
+ lbz 8,6(r4)
+ stb 8,6(r11)
+ /* Return original DST pointer. */
+ blr
+
+ /* Start of the backward memcpy implementation: the algorithm first
+ checks whether src and dest share the same alignment; if they do, it
+ aligns both to 16 bytes and copies using VSX instructions.
+ If they do not, it aligns dest to 16 bytes and uses VMX (Altivec)
+ instructions to read two 16-byte chunks at a time, shift/permute the
+ bytes read, and write them aligned to dest. */
+L(memmove_bwd):
+ cmpldi cr1,r5,31
+ /* Copy is done backwards: update the pointers and check alignment. */
+ add r11,r3,r5
+ add r4,r4,r5
+ mr r0,r11
+ ble cr1, L(copy_LT_32_bwd) /* If move < 32 bytes use short move
+ code. */
+
+ andi. r10,r11,15 /* Check if r11 is aligned to 16 bytes */
+ clrldi r9,r4,60 /* Check if r4 is aligned to 16 bytes */
+ cmpld cr6,r10,r9 /* SRC and DST alignments match? */
+
+ bne cr6,L(copy_GE_32_unaligned_bwd)
+ beq L(aligned_copy_bwd)
+
+ mtocrf 0x01,r0
+ clrldi r0,r0,60
+
+/* Get the DST and SRC aligned to 16 bytes. */
+1:
+ bf 31,2f
+ lbz r6,-1(r4)
+ subi r4,r4,1
+ stb r6,-1(r11)
+ subi r11,r11,1
+2:
+ bf 30,4f
+ lhz r6,-2(r4)
+ subi r4,r4,2
+ sth r6,-2(r11)
+ subi r11,r11,2
+4:
+ bf 29,8f
+ lwz r6,-4(r4)
+ subi r4,r4,4
+ stw r6,-4(r11)
+ subi r11,r11,4
+8:
+ bf 28,16f
+ ld r6,-8(r4)
+ subi r4,r4,8
+ std r6,-8(r11)
+ subi r11,r11,8
+16:
+ subf r5,0,r5
+
+/* Main aligned copy loop. Copies 128 bytes at a time. */
+L(aligned_copy_bwd):
+ li r6,-16
+ li r7,-32
+ li r8,-48
+ li r9,-64
+ mtocrf 0x02,r5
+ srdi r12,r5,7
+ cmpdi r12,0
+ beq L(aligned_tail_bwd)
+ lxvd2x v6,r4,r6
+ lxvd2x v7,r4,r7
+ mtctr 12
+ b L(aligned_128loop_bwd)
+
+ .align 4
+L(aligned_128head_bwd):
+ /* for the 2nd + iteration of this loop. */
+ lxvd2x v6,r4,r6
+ lxvd2x v7,r4,r7
+L(aligned_128loop_bwd):
+ lxvd2x v8,r4,r8
+ lxvd2x v9,r4,r9
+ stxvd2x v6,r11,r6
+ subi r4,r4,64
+ stxvd2x v7,r11,r7
+ stxvd2x v8,r11,r8
+ stxvd2x v9,r11,r9
+ lxvd2x v6,r4,r6
+ lxvd2x v7,r4,r7
+ subi r11,r11,64
+ lxvd2x v8,r4,r8
+ lxvd2x v9,r4,r9
+ subi r4,r4,64
+ stxvd2x v6,r11,r6
+ stxvd2x v7,r11,r7
+ stxvd2x v8,r11,r8
+ stxvd2x v9,r11,r9
+ subi r11,r11,64
+ bdnz L(aligned_128head_bwd)
+
+L(aligned_tail_bwd):
+ mtocrf 0x01,r5
+ bf 25,32f
+ lxvd2x v6,r4,r6
+ lxvd2x v7,r4,r7
+ lxvd2x v8,r4,r8
+ lxvd2x v9,r4,r9
+ subi r4,r4,64
+ stxvd2x v6,r11,r6
+ stxvd2x v7,r11,r7
+ stxvd2x v8,r11,r8
+ stxvd2x v9,r11,r9
+ subi r11,r11,64
+32:
+ bf 26,16f
+ lxvd2x v6,r4,r6
+ lxvd2x v7,r4,r7
+ subi r4,r4,32
+ stxvd2x v6,r11,r6
+ stxvd2x v7,r11,r7
+ subi r11,r11,32
+16:
+ bf 27,8f
+ lxvd2x v6,r4,r6
+ subi r4,r4,16
+ stxvd2x v6,r11,r6
+ subi r11,r11,16
+8:
+ bf 28,4f
+ ld r6,-8(r4)
+ subi r4,r4,8
+ std r6,-8(r11)
+ subi r11,r11,8
+4: /* Copies 4~7 bytes. */
+ bf 29,L(tail2_bwd)
+ lwz r6,-4(r4)
+ stw r6,-4(r11)
+ bf 30,L(tail5_bwd)
+ lhz r7,-6(r4)
+ sth r7,-6(r11)
+ bflr 31
+ lbz r8,-7(r4)
+ stb r8,-7(r11)
+ /* Return original DST pointer. */
+ blr
+
+/* Handle copies of 0~31 bytes. */
+ .align 4
+L(copy_LT_32_bwd):
+ cmpldi cr6,r5,8
+ mtocrf 0x01,r5
+ ble cr6,L(copy_LE_8_bwd)
+
+ /* At least 9 bytes to go. */
+ neg r8,r4
+ andi. r0,r8,3
+ cmpldi cr1,r5,16
+ beq L(copy_LT_32_aligned_bwd)
+
+ /* Force 4-byte alignment for SRC. */
+ mtocrf 0x01,0
+ subf r5,0,r5
+2:
+ bf 30,1f
+ lhz r6,-2(r4)
+ subi r4,r4,2
+ sth r6,-2(r11)
+ subi r11,r11,2
+1:
+ bf 31,L(end_4bytes_alignment_bwd)
+ lbz 6,-1(r4)
+ subi r4,r4,1
+ stb 6,-1(r11)
+ subi r11,r11,1
+
+ .align 4
+L(end_4bytes_alignment_bwd):
+ cmpldi cr1,r5,16
+ mtocrf 0x01,r5
+
+L(copy_LT_32_aligned_bwd):
+ /* At least 6 bytes to go, and SRC is word-aligned. */
+ blt cr1,8f
+
+ /* Copy 16 bytes. */
+ lwz r6,-4(r4)
+ lwz r7,-8(r4)
+ stw r6,-4(r11)
+ lwz r8,-12(r4)
+ stw r7,-8(r11)
+ lwz r6,-16(r4)
+ subi r4,r4,16
+ stw r8,-12(r11)
+ stw r6,-16(r11)
+ subi r11,r11,16
+8: /* Copy 8 bytes. */
+ bf 28,L(tail4_bwd)
+ lwz r6,-4(r4)
+ lwz r7,-8(r4)
+ subi r4,r4,8
+ stw r6,-4(r11)
+ stw r7,-8(r11)
+ subi r11,r11,8
+
+ .align 4
+/* Copies 4~7 bytes. */
+L(tail4_bwd):
+ bf 29,L(tail2_bwd)
+ lwz 6,-4(r4)
+ stw 6,-4(r11)
+ bf 30,L(tail5_bwd)
+ lhz 7,-6(r4)
+ sth 7,-6(r11)
+ bflr 31
+ lbz 8,-7(r4)
+ stb 8,-7(r11)
+ /* Return original DST pointer. */
+ blr
+
+ .align 4
+/* Copies 2~3 bytes. */
+L(tail2_bwd):
+ bf 30,1f
+ lhz 6,-2(r4)
+ sth 6,-2(r11)
+ bflr 31
+ lbz 7,-3(r4)
+ stb 7,-3(r11)
+ blr
+
+ .align 4
+L(tail5_bwd):
+ bflr 31
+ lbz 6,-5(r4)
+ stb 6,-5(r11)
+ blr
+
+ .align 4
+1:
+ bflr 31
+ lbz 6,-1(r4)
+ stb 6,-1(r11)
+ /* Return original DST pointer. */
+ blr
+
+
+/* Handles copies of 0~8 bytes. */
+ .align 4
+L(copy_LE_8_bwd):
+ bne cr6,L(tail4_bwd)
+
+ /* Though we could've used ld/std here, they are still
+ slow for unaligned cases. */
+ lwz 6,-8(r4)
+ lwz 7,-4(r4)
+ stw 6,-8(r11)
+ stw 7,-4(r11)
+ blr
+
+
+/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
+ SRC is not. Use aligned quadword loads from SRC, shifted to realign
+ the data, allowing for aligned DST stores. */
+ .align 4
+L(copy_GE_32_unaligned_bwd):
+ andi. r10,r11,15 /* Check whether DST is aligned to 16 bytes. */
+ srdi r9,r5,4 /* Number of full quadwords remaining. */
+
+ beq L(copy_GE_32_unaligned_cont_bwd)
+
+ /* DST is not quadword aligned and r10 holds the address masked to
+ compare alignments. */
+ mtocrf 0x01,r10
+ subf r5,r10,r5
+
+ /* Vector instructions work best when proper alignment (16-bytes)
+ is present. Move 0~15 bytes as needed to get DST quadword-aligned. */
+1:
+ bf 31,2f
+ lbz r6,-1(r4)
+ subi r4,r4,1
+ stb r6,-1(r11)
+ subi r11,r11,1
+2:
+ bf 30,4f
+ lhz r6,-2(r4)
+ subi r4,r4,2
+ sth r6,-2(r11)
+ subi r11,r11,2
+4:
+ bf 29,8f
+ lwz r6,-4(r4)
+ subi r4,r4,4
+ stw r6,-4(r11)
+ subi r11,r11,4
+8:
+ bf 28,0f
+ ld r6,-8(r4)
+ subi r4,r4,8
+ std r6,-8(r11)
+ subi r11,r11,8
+0:
+ srdi r9,r5,4 /* Number of full quadwords remaining. */
+
+ /* The proper alignment is present; it is OK to copy the bytes now. */
+L(copy_GE_32_unaligned_cont_bwd):
+
+ /* Setup two indexes to speed up the indexed vector operations. */
+ clrldi r10,r5,60
+ li r6,-16 /* Index for 16-bytes offsets. */
+ li r7,-32 /* Index for 32-bytes offsets. */
+ cmpldi cr1,r10,0
+ srdi r8,r5,5 /* Setup the loop counter. */
+ mtocrf 0x01,r9
+ cmpldi cr6,r9,1
+#ifdef __LITTLE_ENDIAN__
+ lvsr v5,r0,r4
+#else
+ lvsl v5,r0,r4
+#endif
+ lvx v3,0,r4
+ li r0,0
+ bf 31,L(setup_unaligned_loop_bwd)
+
+ /* Copy another 16 bytes to align to 32-bytes due to the loop. */
+ lvx v4,r4,r6
+#ifdef __LITTLE_ENDIAN__
+ vperm v6,v3,v4,v5
+#else
+ vperm v6,v4,v3,v5
+#endif
+ subi r4,r4,16
+ stvx v6,r11,r6
+ subi r11,r11,16
+ vor v3,v4,v4
+ clrrdi r0,r4,60
+
+L(setup_unaligned_loop_bwd):
+ mtctr r8
+ ble cr6,L(end_unaligned_loop_bwd)
+
+ /* Copy 32 bytes at a time using vector instructions. */
+ .align 4
+L(unaligned_loop_bwd):
+
+ /* Note: vr6/vr10 may contain data that was already copied,
+ but in order to get proper alignment, we may have to copy
+ some portions again. This is faster than having unaligned
+ vector instructions though. */
+
+ lvx v4,r4,r6
+#ifdef __LITTLE_ENDIAN__
+ vperm v6,v3,v4,v5
+#else
+ vperm v6,v4,v3,v5
+#endif
+ lvx v3,r4,r7
+#ifdef __LITTLE_ENDIAN__
+ vperm v10,v4,v3,v5
+#else
+ vperm v10,v3,v4,v5
+#endif
+ subi r4,r4,32
+ stvx v6,r11,r6
+ stvx v10,r11,r7
+ subi r11,r11,32
+ bdnz L(unaligned_loop_bwd)
+
+ clrrdi r0,r4,60
+
+ .align 4
+L(end_unaligned_loop_bwd):
+
+ /* Check for tail bytes. */
+ mtocrf 0x01,r5
+ beqlr cr1
+
+ add r4,r4,0
+
+ /* We have 1~15 tail bytes to copy, and DST is quadword aligned. */
+ /* Copy 8 bytes. */
+ bf 28,4f
+ lwz r6,-4(r4)
+ lwz r7,-8(r4)
+ subi r4,r4,8
+ stw r6,-4(r11)
+ stw r7,-8(r11)
+ subi r11,r11,8
+4: /* Copy 4~7 bytes. */
+ bf 29,L(tail2_bwd)
+ lwz r6,-4(r4)
+ stw r6,-4(r11)
+ bf 30,L(tail5_bwd)
+ lhz r7,-6(r4)
+ sth r7,-6(r11)
+ bflr 31
+ lbz r8,-7(r4)
+ stb r8,-7(r11)
+ /* Return original DST pointer. */
+ blr
+END_GEN_TB (memmove, TB_TOCLESS)
+libc_hidden_builtin_def (memmove)
+
+
+/* void bcopy(const void *src [r3], void *dest [r4], size_t n [r5])
+ Implemented in this file to avoid the linker creating a stub function
+ call in the branch to '_memmove'. */
+ENTRY (bcopy)
+ mr r6,r3
+ mr r3,r4
+ mr r4,r6
+ b L(_memmove)
+END (bcopy)
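
[Note: the dispatch at the top of memmove ("subf r9,r4,r3; cmpld cr7,r9,r5; blt cr7,L(memmove_bwd)") relies on unsigned wraparound: dest minus src, computed modulo 2^64, is below len exactly when a forward copy would overwrite source bytes not yet read. A hedged C rendering of that test; the function name is illustrative:]

    #include <stdint.h>
    #include <stddef.h>

    /* Forward copy is unsafe only when dest lands inside [src, src+len).
       If dest < src, the unsigned difference wraps to a huge value that
       compares >= len, so the forward path is chosen for free.  */
    static int
    must_copy_backward (const void *dest, const void *src, size_t len)
    {
      return (uintptr_t) dest - (uintptr_t) src < (uintptr_t) len;
    }
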
diff --git a/sysdeps/powerpc/powerpc64/power7/mempcpy.S b/sysdeps/powerpc/powerpc64/power7/mempcpy.S
index a7239eeac1..1cd69df137 100644
--- a/sysdeps/powerpc/powerpc64/power7/mempcpy.S
+++ b/sysdeps/powerpc/powerpc64/power7/mempcpy.S
@@ -1,5 +1,5 @@
/* Optimized mempcpy implementation for POWER7.
- Copyright (C) 2010-2014 Free Software Foundation, Inc.
+ Copyright (C) 2010-2015 Free Software Foundation, Inc.
Contributed by Luis Machado <luisgpm@br.ibm.com>.
This file is part of the GNU C Library.
diff --git a/sysdeps/powerpc/powerpc64/power7/memrchr.S b/sysdeps/powerpc/powerpc64/power7/memrchr.S
index 40e436f853..bd3f085872 100644
--- a/sysdeps/powerpc/powerpc64/power7/memrchr.S
+++ b/sysdeps/powerpc/powerpc64/power7/memrchr.S
@@ -1,5 +1,5 @@
/* Optimized memrchr implementation for PowerPC64/POWER7 using cmpb insn.
- Copyright (C) 2010-2014 Free Software Foundation, Inc.
+ Copyright (C) 2010-2015 Free Software Foundation, Inc.
Contributed by Luis Machado <luisgpm@br.ibm.com>.
This file is part of the GNU C Library.
@@ -29,7 +29,7 @@ ENTRY (__memrchr)
mr r10,r3
clrrdi r6,r7,7
li r9,3<<5
- dcbt r9,r6,16 /* Stream hint, decreasing addresses. */
+ dcbt r9,r6,8 /* Stream hint, decreasing addresses. */
/* Replicate BYTE to doubleword. */
insrdi r4,r4,8,48
diff --git a/sysdeps/powerpc/powerpc64/power7/memset.S b/sysdeps/powerpc/powerpc64/power7/memset.S
index 6b8999dc1f..4c8c06fec9 100644
--- a/sysdeps/powerpc/powerpc64/power7/memset.S
+++ b/sysdeps/powerpc/powerpc64/power7/memset.S
@@ -1,5 +1,5 @@
/* Optimized memset implementation for PowerPC64/POWER7.
- Copyright (C) 2010-2014 Free Software Foundation, Inc.
+ Copyright (C) 2010-2015 Free Software Foundation, Inc.
Contributed by Luis Machado <luisgpm@br.ibm.com>.
This file is part of the GNU C Library.
@@ -383,7 +383,6 @@ L(small):
END_GEN_TB (memset,TB_TOCLESS)
libc_hidden_builtin_def (memset)
-#ifndef NO_BZERO_IMPL
/* Copied from bzero.S to prevent the linker from inserting a stub
between bzero and memset. */
ENTRY (__bzero)
@@ -391,7 +390,7 @@ ENTRY (__bzero)
mr r5,r4
li r4,0
b L(_memset)
-END_GEN_TB (__bzero,TB_TOCLESS)
-
+END (__bzero)
+#ifndef __bzero
weak_alias (__bzero, bzero)
#endif
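
As with bcopy above, __bzero is implemented by rearranging arguments and branching straight into the memset body; a C sketch of the equivalent behavior:

    #include <string.h>

    /* Sketch only: n moves from the second to the third argument and
       the fill byte is fixed at zero, matching the mr/li sequence in
       the assembly above.  */
    void
    __bzero (void *s, size_t n)
    {
      memset (s, 0, n);
    }
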
diff --git a/sysdeps/powerpc/powerpc64/power7/rawmemchr.S b/sysdeps/powerpc/powerpc64/power7/rawmemchr.S
index 56a19bd885..cccac6e7fb 100644
--- a/sysdeps/powerpc/powerpc64/power7/rawmemchr.S
+++ b/sysdeps/powerpc/powerpc64/power7/rawmemchr.S
@@ -1,5 +1,5 @@
/* Optimized rawmemchr implementation for PowerPC64/POWER7 using cmpb insn.
- Copyright (C) 2010-2014 Free Software Foundation, Inc.
+ Copyright (C) 2010-2015 Free Software Foundation, Inc.
Contributed by Luis Machado <luisgpm@br.ibm.com>.
This file is part of the GNU C Library.
diff --git a/sysdeps/powerpc/powerpc64/power7/stpcpy.S b/sysdeps/powerpc/powerpc64/power7/stpcpy.S
index baf6e98826..ef90142932 100644
--- a/sysdeps/powerpc/powerpc64/power7/stpcpy.S
+++ b/sysdeps/powerpc/powerpc64/power7/stpcpy.S
@@ -1,5 +1,5 @@
/* Optimized stpcpy implementation for PowerPC64/POWER7.
- Copyright (C) 2013-2014 Free Software Foundation, Inc.
+ Copyright (C) 2013-2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/powerpc/powerpc64/power7/stpncpy.S b/sysdeps/powerpc/powerpc64/power7/stpncpy.S
new file mode 100644
index 0000000000..c60453a55f
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/stpncpy.S
@@ -0,0 +1,24 @@
+/* Optimized stpncpy implementation for PowerPC64/POWER7.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define USE_AS_STPNCPY
+#include <sysdeps/powerpc/powerpc64/power7/strncpy.S>
+
+weak_alias (__stpncpy, stpncpy)
+libc_hidden_def (__stpncpy)
+libc_hidden_builtin_def (stpncpy)
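
The USE_AS_STPNCPY knob changes only the return value of the shared strncpy body: strncpy returns dst, while stpncpy returns a pointer to the first null byte it wrote (or dst + n if none). A sketch of that contract, with a hypothetical helper name:

    #include <string.h>

    /* Hypothetical helper illustrating the stpncpy return value; not
       part of the implementation.  */
    char *
    stpncpy_return (char *dst, const char *src, size_t n)
    {
      return dst + strnlen (src, n);
    }
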
diff --git a/sysdeps/powerpc/powerpc64/power7/strcasecmp.S b/sysdeps/powerpc/powerpc64/power7/strcasecmp.S
index 417c7e56af..2dcb2bc7dc 100644
--- a/sysdeps/powerpc/powerpc64/power7/strcasecmp.S
+++ b/sysdeps/powerpc/powerpc64/power7/strcasecmp.S
@@ -1,5 +1,5 @@
/* Optimized strcasecmp implementation for PowerPC64.
- Copyright (C) 2011-2014 Free Software Foundation, Inc.
+ Copyright (C) 2011-2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/powerpc/powerpc64/power7/strchr.S b/sysdeps/powerpc/powerpc64/power7/strchr.S
index 1c0a556c04..1ba388c791 100644
--- a/sysdeps/powerpc/powerpc64/power7/strchr.S
+++ b/sysdeps/powerpc/powerpc64/power7/strchr.S
@@ -1,5 +1,5 @@
/* Optimized strchr implementation for PowerPC64/POWER7 using cmpb insn.
- Copyright (C) 2010-2014 Free Software Foundation, Inc.
+ Copyright (C) 2010-2015 Free Software Foundation, Inc.
Contributed by Luis Machado <luisgpm@br.ibm.com>.
This file is part of the GNU C Library.
diff --git a/sysdeps/powerpc/powerpc64/power7/strchrnul.S b/sysdeps/powerpc/powerpc64/power7/strchrnul.S
index 586c76950a..180b72bf5c 100644
--- a/sysdeps/powerpc/powerpc64/power7/strchrnul.S
+++ b/sysdeps/powerpc/powerpc64/power7/strchrnul.S
@@ -1,5 +1,5 @@
/* Optimized strchrnul implementation for PowerPC64/POWER7 using cmpb insn.
- Copyright (C) 2010-2014 Free Software Foundation, Inc.
+ Copyright (C) 2010-2015 Free Software Foundation, Inc.
Contributed by Luis Machado <luisgpm@br.ibm.com>.
This file is part of the GNU C Library.
diff --git a/sysdeps/powerpc/powerpc64/power7/strcmp.S b/sysdeps/powerpc/powerpc64/power7/strcmp.S
new file mode 100644
index 0000000000..6af0e7dad6
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/strcmp.S
@@ -0,0 +1,164 @@
+/* Optimized strcmp implementation for Power7 using 'cmpb' instruction
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* The optimization is achieved here through the cmpb instruction.
+   8-byte-aligned strings are processed with doubleword comparisons,
+   and unaligned strings are handled with a loop-unrolling
+   technique.  */
+
+#include <sysdep.h>
+
+/* int [r3] strcmp (const char *s1 [r3], const char *s2 [r4]) */
+
+ .machine power7
+EALIGN (strcmp, 4, 0)
+ CALL_MCOUNT 2
+
+ or r9, r3, r4
+ rldicl. r10, r9, 0, 61 /* are s1 and s2 8-byte aligned? */
+ bne cr0, L(process_unaligned_bytes)
+ li r5, 0
+
+ .align 4
+/* process input parameters on double word aligned boundary */
+L(unrollDword):
+ ld r8,0(r3)
+ ld r10,0(r4)
+ cmpb r7,r8,r5
+ cmpdi cr7,r7,0
+ mr r9,r7
+ bne cr7,L(null_found)
+ cmpld cr7,r8,r10
+ bne cr7,L(different)
+
+ ld r8,8(r3)
+ ld r10,8(r4)
+ cmpb r7,r8,r5
+ cmpdi cr7,r7,0
+ mr r9,r7
+ bne cr7,L(null_found)
+ cmpld cr7,r8,r10
+ bne cr7,L(different)
+
+ ld r8,16(r3)
+ ld r10,16(r4)
+ cmpb r7,r8,r5
+ cmpdi cr7,r7,0
+ mr r9,r7
+ bne cr7,L(null_found)
+ cmpld cr7,r8,r10
+ bne cr7,L(different)
+
+ ld r8,24(r3)
+ ld r10,24(r4)
+ cmpb r7,r8,r5
+ cmpdi cr7,r7,0
+ mr r9,r7
+ bne cr7,L(null_found)
+ cmpld cr7,r8,r10
+ bne cr7,L(different)
+
+ addi r3, r3, 32
+ addi r4, r4, 32
+ beq cr7, L(unrollDword)
+
+ .align 4
+L(null_found):
+#ifdef __LITTLE_ENDIAN__
+ neg r7,r9
+ and r9,r9,r7
+ li r7,-1
+ cntlzd r9,r9
+ subfic r9,r9,71
+ sld r9,r7,r9
+#else
+ cntlzd r9,r9
+ li r7,-1
+ addi r9,r9,8
+ srd r9,r7,r9
+#endif
+ or r8,r8,r9
+ or r10,r10,r9
+
+L(different):
+ cmpb r9,r8,r10
+#ifdef __LITTLE_ENDIAN__
+ addi r7,r9,1
+ andc r9,r7,r9
+ cntlzd r9,r9
+ subfic r9,r9,63
+#else
+ not r9,r9
+ cntlzd r9,r9
+ subfic r9,r9,56
+#endif
+ srd r3,r8,r9
+ srd r10,r10,r9
+ rldicl r10,r10,0,56
+ rldicl r3,r3,0,56
+ subf r3,r10,r3
+ blr
+
+ .align 4
+L(process_unaligned_bytes):
+ lbz r9, 0(r3) /* load byte from s1 */
+ lbz r10, 0(r4) /* load byte from s2 */
+ cmpdi cr7, r9, 0 /* compare *s1 with NULL */
+ beq cr7, L(diffOfNULL) /* if *s1 is NULL, return *s1 - *s2 */
+ cmplw cr7, r9, r10 /* compare *s1 and *s2 */
+ bne cr7, L(ComputeDiff) /* branch to compute difference and return */
+
+ lbz r9, 1(r3) /* load next byte from s1 */
+ lbz r10, 1(r4) /* load next byte from s2 */
+ cmpdi cr7, r9, 0 /* compare *s1 with NULL */
+ beq cr7, L(diffOfNULL) /* if *s1 is NULL, return *s1 - *s2 */
+ cmplw cr7, r9, r10 /* compare *s1 and *s2 */
+ bne cr7, L(ComputeDiff) /* branch to compute difference and return */
+
+ lbz r9, 2(r3) /* unroll 3rd byte here */
+ lbz r10, 2(r4)
+ cmpdi cr7, r9, 0
+ beq cr7, L(diffOfNULL)
+ cmplw cr7, r9, r10
+ bne cr7, L(ComputeDiff)
+
+ lbz r9, 3(r3) /* unroll 4th byte now */
+ lbz r10, 3(r4)
+ addi r3, r3, 4 /* increment s1 by unroll factor */
+ cmpdi cr7, r9, 0
+ cmplw cr6, r9, r10
+ beq cr7, L(diffOfNULL)
+ addi r4, r4, 4 /* increment s2 by unroll factor */
+ beq cr6, L(process_unaligned_bytes) /* unroll byte processing */
+
+ .align 4
+L(ComputeDiff):
+ extsw r9, r9
+ subf r10, r10, r9 /* compute s1 - s2 */
+ extsw r3, r10
+ blr /* return */
+
+ .align 4
+L(diffOfNULL):
+ li r9, 0
+ subf r10, r10, r9 /* compute s1 - s2 */
+ extsw r3, r10 /* sign extend result */
+ blr /* return */
+
+END (strcmp)
+libc_hidden_builtin_def (strcmp)
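
The cmpb instruction at the heart of this routine compares two doublewords bytewise, producing 0xff in each byte position where the operands match. A C model of its semantics, for illustration only:

    #include <stdint.h>

    /* Model of cmpb: byte i of the result is 0xff where the operand
       bytes are equal, 0x00 otherwise.  cmpb against a register of
       zeros therefore yields a nonzero result exactly when the loaded
       doubleword contains a null byte, which is how the unrolled loop
       above detects the end of the string.  */
    static uint64_t
    cmpb_model (uint64_t a, uint64_t b)
    {
      uint64_t r = 0;
      for (int i = 0; i < 8; i++)
        if (((a >> (8 * i)) & 0xff) == ((b >> (8 * i)) & 0xff))
          r |= (uint64_t) 0xff << (8 * i);
      return r;
    }
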
diff --git a/sysdeps/powerpc/powerpc64/power7/strcpy.S b/sysdeps/powerpc/powerpc64/power7/strcpy.S
index ce71982eaf..70f2987181 100644
--- a/sysdeps/powerpc/powerpc64/power7/strcpy.S
+++ b/sysdeps/powerpc/powerpc64/power7/strcpy.S
@@ -1,5 +1,5 @@
/* Optimized strcpy/stpcpy implementation for PowerPC64/POWER7.
- Copyright (C) 2013-2014 Free Software Foundation, Inc.
+ Copyright (C) 2013-2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -31,8 +31,6 @@
if (((((uintptr_t)dst & 0x7UL) == 0) && ((uintptr_t)src & 0x7UL) == 0))
goto aligned_doubleword_copy;
- if (((((uintptr_t)dst & 0x3UL) == 0) && ((uintptr_t)src & 0x3UL) == 0))
- goto aligned_word_copy;
if (((uintptr_t)dst & 0x7UL) == ((uintptr_t)src & 0x7UL))
goto same_alignment;
goto unaligned;
@@ -70,9 +68,18 @@ EALIGN (FUNC_NAME, 4, 0)
#endif
or rTMP, rSRC, rRTN
clrldi. rTMP, rTMP, 61
- bne L(check_word_alignment)
+ bne L(check_alignment)
b L(aligned_doubleword_copy)
+ .align 4
+L(check_alignment):
+ rldicl rRTNAL, rRTN, 0, 61
+ rldicl rSRCAL, rSRC, 0, 61
+ cmpld cr7, rSRCAL, rRTNAL
+ beq cr7, L(same_alignment)
+ b L(unaligned)
+
+ .align 4
L(same_alignment):
/* Src and dst with same alignment: align both to doubleword. */
mr rALCNT, rRTN
@@ -180,93 +187,249 @@ L(g1):
#endif
blr
-L(check_word_alignment):
- clrldi. rTMP, rTMP, 62
- beq L(aligned_word_copy)
- rldicl rRTNAL, rRTN, 0, 61
- rldicl rSRCAL, rSRC, 0, 61
- cmpld cr7, rSRCAL, rRTNAL
- beq cr7, L(same_alignment)
- b L(unaligned)
-
-/* For word aligned memory, operate using word load and stores. */
.align 4
-L(aligned_word_copy):
- li rMASK, 0
- addi rRTN, rRTN, -4
- lwz rWORD, 0(rSRC)
- b L(g5)
+L(unaligned):
+ cmpdi rSRCAL, 0 /* Check src alignment */
+ beq L(srcaligndstunalign)
+ /* src is unaligned */
+ rlwinm r10, rSRC, 3,26,28 /* Calculate padding. */
+ clrrdi rSRC, rSRC, 3 /* Align the addr to dw boundary */
+ ld rWORD, 0(rSRC) /* Load doubleword from memory. */
+ li rTMP, 0
+ /* Discard bits not part of the string */
+#ifdef __LITTLE_ENDIAN__
+ srd rALT, rWORD, r10
+#else
+ sld rALT, rWORD, r10
+#endif
+ cmpb rTMP, rALT, rTMP /* Compare each byte against null */
+ /* Discard bits not part of the string */
+#ifdef __LITTLE_ENDIAN__
+ sld rTMP, rTMP, r10
+#else
+ srd rTMP, rTMP, r10
+#endif
+ cmpdi rTMP, 0
+ bne L(bytebybyte) /* if it has null, copy byte by byte */
+ subfic r8, r9, 8
+ rlwinm r5, rRTN, 3,26,28 /* Calculate padding in bits. */
+ rldicl r9, rRTN, 0, 61 /* Calculate padding in bytes. */
+ addi rRTN, rRTN, -1
- .align 4
-L(g3): lwzu rALT, 4(rSRC)
- stwu rWORD, 4(rRTN)
- cmpb rTMP, rALT, rMASK
- cmpwi rTMP, 0
- bne L(g4)
- lwzu rWORD, 4(rSRC)
- stwu rALT, 4(rRTN)
-L(g5): cmpb rTMP, rWORD, rMASK
- cmpwi rTMP, 0 /* If rTMP is 0, no null in word. */
- beq L(g3)
-
- mr rALT, rWORD
-/* We've hit the end of the string. Do the rest byte-by-byte. */
-L(g4):
+ cmpdi r5, 0 /* check dest alignment */
+ beq L(srcunaligndstalign)
+
+ /* both src and dst unaligned */
#ifdef __LITTLE_ENDIAN__
- rlwinm. rTMP, rALT, 0, 24, 31
- stbu rALT, 4(rRTN)
- beqlr-
- rlwinm. rTMP, rALT, 24, 24, 31
- stbu rTMP, 1(rRTN)
- beqlr-
- rlwinm. rTMP, rALT, 16, 24, 31
- stbu rTMP, 1(rRTN)
- beqlr-
- rlwinm rTMP, rALT, 8, 24, 31
- stbu rTMP, 1(rRTN)
+ sld rWORD, rALT, r10
+ mr r11, r10
+ addi r11, r11, -8 /* Adjust byte pointer on loaded dw */
#else
- rlwinm. rTMP, rALT, 8, 24, 31
- stbu rTMP, 4(rRTN)
- beqlr
- rlwinm. rTMP, rALT, 16, 24, 31
- stbu rTMP, 1(rRTN)
- beqlr
- rlwinm. rTMP, rALT, 24, 24, 31
- stbu rTMP, 1(rRTN)
- beqlr
- stbu rALT, 1(rRTN)
+ srd rWORD, rALT, r10
+ subfic r11, r10, 64
#endif
- blr
+ /* Is dst alignment greater than src alignment? */
+ cmpd cr7, r5, r10
+ blt cr7, L(dst_align_small)
+ /* src alignment is less than dst */
-/* Oh well. In this case, we just do a byte-by-byte copy. */
- .align 4
-L(unaligned):
- lbz rWORD, 0(rSRC)
- addi rRTN, rRTN, -1
- cmpdi rWORD, 0
- beq L(u2)
-
- .align 5
-L(u0): lbzu rALT, 1(rSRC)
- stbu rWORD, 1(rRTN)
- cmpdi rALT, 0
- beq L(u1)
- lbzu rWORD, 1(rSRC)
+ /* Calculate the dst alignment difference */
+ subfic rALT, r9, 8
+ mtctr rALT
+
+ /* Write until dst is aligned */
+ cmpdi rTMP, rALT, 4
+ blt L(storebyte1) /* less than 4, store byte by byte */
+ beq L(equal1) /* if its 4, store word */
+ addi rTMP, rALT, -4 /* greater than 4, so stb and stw */
+ mtctr rTMP
+L(storebyte1):
+#ifdef __LITTLE_ENDIAN__
+ addi r11, r11, 8 /* Adjust byte pointer on loaded dw */
+#else
+ addi r11, r11, -8
+#endif
+ srd rALT, rWORD, r11
+ stbu rALT, 1(rRTN)
+ bdnz L(storebyte1)
+
+ subfic rALT, r9, 8 /* Check the remaining bytes */
+ cmpdi rTMP, rALT, 4
+ blt L(proceed)
+
+ .align 4
+L(equal1):
+#ifdef __LITTLE_ENDIAN__
+ addi r11, r11, 8 /* Adjust byte pointer on loaded dw */
+ srd rALT, rWORD, r11
+#else
+ subfic r11, r11, 64
+ sld rALT, rWORD, r11
+ srdi rALT, rALT, 32
+#endif
+ stw rALT, 1(rRTN)
+ addi rRTN, rRTN, 4
+
+L(proceed):
+ mr rALT, rWORD
+ /* Calculate the leftover bytes to be written */
+ subfic r11, r10, 64
+ subfic r5, r5, 64
+ subf r5, r5, r11 /* remaining bytes on second dw */
+ subfic r10, r5, 64 /* remaining bytes on first dw */
+ subfic r9, r9, 8
+ subf r8, r9, r8 /* recalculate padding */
+L(srcunaligndstalign):
+ addi rRTN, rRTN, 1
+ subfic r5, r10, 64 /* remaining bytes on second dw */
+ addi rSRC, rSRC, 8
+ li rTMP,0
+ b L(storedouble)
+
+ .align 4
+L(dst_align_small):
+ mtctr r8
+ /* Write until src is aligned */
+L(storebyte2):
+#ifdef __LITTLE_ENDIAN__
+ addi r11, r11, 8 /* Adjust byte pointer on dw */
+#else
+ addi r11, r11, -8
+#endif
+ srd rALT, rWORD, r11
stbu rALT, 1(rRTN)
- cmpdi rWORD, 0
- beq L(u2)
- lbzu rALT, 1(rSRC)
- stbu rWORD, 1(rRTN)
- cmpdi rALT, 0
- beq L(u1)
- lbzu rWORD, 1(rSRC)
+ bdnz L(storebyte2)
+
+ addi rSRC, rSRC, 8 /* Increment src pointer */
+ addi rRTN, rRTN, 1 /* Increment dst pointer */
+ rldicl r8, rRTN, 0, 61 /* Recalculate padding */
+
+ /* src is aligned */
+L(srcaligndstunalign):
+ ld rWORD, 0(rSRC)
+ mr rALT, rWORD
+ li rTMP, 0 /* Check null */
+ cmpb rTMP, rWORD, rTMP
+ cmpdi rTMP, 0
+ bne L(bytebybyte) /* Do byte by byte if there is NULL */
+ rlwinm r5, rRTN, 3,26,28 /* Calculate padding */
+ addi rRTN, rRTN, -1
+ subfic r10, r8, 8
+ /* Write byte by byte until aligned */
+#ifdef __LITTLE_ENDIAN__
+ li r11, -8
+#else
+ li r11, 64
+#endif
+ mtctr r10
+ cmpdi rTMP, r10, 4
+ blt L(storebyte)
+ beq L(equal)
+ addi rTMP, r10, -4
+ mtctr rTMP
+L(storebyte):
+#ifdef __LITTLE_ENDIAN__
+ addi r11, r11, 8 /* Adjust byte pointer on dw */
+#else
+ addi r11, r11, -8
+#endif
+ srd rALT, rWORD, r11
stbu rALT, 1(rRTN)
- cmpdi rWORD, 0
- bne L(u0)
-L(u2): stbu rWORD, 1(rRTN)
- blr
-L(u1): stbu rALT, 1(rRTN)
- blr
+ bdnz L(storebyte)
+
+ cmpdi rTMP, r10, 4
+ blt L(align)
+
+ .align 4
+L(equal):
+#ifdef __LITTLE_ENDIAN__
+ addi r11, r11, 8
+ srd rALT, rWORD, r11
+#else
+ subfic r11, r11, 64
+ sld rALT, rWORD, r11
+ srdi rALT, rALT, 32
+#endif
+ stw rALT, 1(rRTN)
+ addi rRTN, rRTN, 4
+L(align):
+ addi rRTN, rRTN, 1
+ addi rSRC, rSRC, 8 /* Increment src pointer */
+ subfic r10, r5, 64
+ li rTMP, 0
+ /* dst addr aligned to 8 */
+L(storedouble):
+ ld rALT, 0(rSRC) /* load next dw */
+ cmpb rTMP, rALT, rTMP
+ cmpdi rTMP, 0 /* check for null on each new dw */
+ bne L(null)
+#ifdef __LITTLE_ENDIAN__
+ srd r9, rWORD, r10 /* bytes from first dw */
+ sld r11, rALT, r5 /* bytes from second dw */
+#else
+ sld r9, rWORD, r10
+ srd r11, rALT, r5
+#endif
+ or r11, r9, r11 /* make as a single dw */
+ std r11, 0(rRTN) /* store as std on aligned addr */
+ mr rWORD, rALT /* still few bytes left to be written */
+ addi rRTN, rRTN, 8 /* increment dst addr */
+ addi rSRC, rSRC, 8 /* increment src addr */
+ b L(storedouble) /* Loop until NULL */
+
+ .align 4
+
+/* We've hit the end of the string. Do the rest byte-by-byte. */
+L(null):
+ addi rRTN, rRTN, -1
+ mr r10, r5
+ mtctr r8
+#ifdef __LITTLE_ENDIAN__
+ subfic r10, r10, 64
+ addi r10, r10, -8
+#endif
+ cmpdi rTMP, r8, 4
+ blt L(loop)
+
+ /* We can still use stw if leftover >= 4 */
+#ifdef __LITTLE_ENDIAN__
+ addi r10, r10, 8
+ srd r11, rWORD, r10
+#else
+ subfic r10, r10, 64
+ sld r11, rWORD, r10
+ srdi r11, r11, 32
+#endif
+ stw r11, 1(rRTN)
+ addi rRTN, rRTN, 4
+
+ beq L(bytebybyte1)
+ addi r10, r10, 32
+#ifdef __LITTLE_ENDIAN__
+ addi r10, r10, -8
+#else
+ subfic r10, r10, 64
+#endif
+ addi rTMP, r8, -4
+ mtctr rTMP
+ /* remaining byte by byte part of first dw */
+L(loop):
+#ifdef __LITTLE_ENDIAN__
+ addi r10, r10, 8
+#else
+ addi r10, r10, -8
+#endif
+ srd rTMP, rWORD, r10
+ stbu rTMP, 1(rRTN)
+ bdnz L(loop)
+
+L(bytebybyte1):
+ addi rRTN, rRTN, 1
+ /* remaining byte by byte part of second dw */
+L(bytebybyte):
+ addi rRTN, rRTN, -8
+ b L(g1)
+
END (FUNC_NAME)
#ifndef USE_AS_STPCPY
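
The core of the unaligned path added above is the L(storedouble) loop, which keeps the stores aligned by fusing bytes from two consecutive aligned source doublewords with the sld/srd/or sequence. A C sketch of that merge, using big-endian shift directions (little-endian swaps sld and srd) and assuming the shift count a satisfies 0 < a < 64:

    #include <stdint.h>

    /* Sketch of the merge in L(storedouble): a bits come from the
       tail of the first doubleword, 64 - a bits from the head of the
       second, forming one aligned doubleword store.  */
    static uint64_t
    merge_dwords (uint64_t first, uint64_t second, unsigned a)
    {
      return (first << a) | (second >> (64 - a));
    }
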
diff --git a/sysdeps/powerpc/powerpc64/power7/strlen.S b/sysdeps/powerpc/powerpc64/power7/strlen.S
index d023e85938..598fe0b5ff 100644
--- a/sysdeps/powerpc/powerpc64/power7/strlen.S
+++ b/sysdeps/powerpc/powerpc64/power7/strlen.S
@@ -1,5 +1,5 @@
/* Optimized strlen implementation for PowerPC64/POWER7 using cmpb insn.
- Copyright (C) 2010-2014 Free Software Foundation, Inc.
+ Copyright (C) 2010-2015 Free Software Foundation, Inc.
Contributed by Luis Machado <luisgpm@br.ibm.com>.
This file is part of the GNU C Library.
diff --git a/sysdeps/powerpc/powerpc64/power7/strncmp.S b/sysdeps/powerpc/powerpc64/power7/strncmp.S
index 35cc244f36..959eb95752 100644
--- a/sysdeps/powerpc/powerpc64/power7/strncmp.S
+++ b/sysdeps/powerpc/powerpc64/power7/strncmp.S
@@ -1,5 +1,5 @@
/* Optimized strcmp implementation for POWER7/PowerPC64.
- Copyright (C) 2010-2014 Free Software Foundation, Inc.
+ Copyright (C) 2010-2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/powerpc/powerpc64/power7/strncpy.S b/sysdeps/powerpc/powerpc64/power7/strncpy.S
new file mode 100644
index 0000000000..a6c9abf7d9
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/strncpy.S
@@ -0,0 +1,714 @@
+/* Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* Implements the functions
+
+ char * [r3] strncpy (char *dst [r3], const char *src [r4], size_t n [r5])
+
+ AND
+
+ char * [r3] stpncpy (char *dst [r3], const char *src [r4], size_t n [r5])
+
+ The algorithm is as follows:
+ > if src and dest are 8-byte aligned, perform doubleword copies
+ else
+ > copy byte by byte on unaligned addresses.
+
+ The aligned comparisons are made using cmpb instructions. */
+
+/* The optimizations for performance improvement are as follows:
+ 1. data alignment [gain from aligned memory access on read/write]
+ 2. POWER7 gains performance with loop unrolling/unwinding
+ [gain by reduction of branch penalty].
+ 3. The final pad with null bytes is done by calling an optimized
+ memset. */
+
+#ifdef USE_AS_STPNCPY
+# define FUNC_NAME __stpncpy
+#else
+# define FUNC_NAME strncpy
+#endif
+
+#define FRAMESIZE (FRAME_MIN_SIZE+32)
+
+#ifndef MEMSET
+/* For builds with no IFUNC support, local calls should be made to internal
+ GLIBC symbol (created by libc_hidden_builtin_def). */
+# ifdef SHARED
+# define MEMSET __GI_memset
+# else
+# define MEMSET memset
+# endif
+#endif
+
+ .machine power7
+EALIGN(FUNC_NAME, 4, 0)
+ CALL_MCOUNT 3
+
+ mflr r0 /* load link register LR to r0 */
+ or r10, r3, r4 /* to verify source and destination */
+ rldicl. r8, r10, 0, 61 /* is double word aligned .. ? */
+
+ std r19, -8(r1) /* save caller's register r19 */
+ std r18, -16(r1) /* save caller's register r18 */
+ std r0, 16(r1) /* store the link register */
+ stdu r1, -FRAMESIZE(r1) /* create the stack frame */
+
+ mr r9, r3 /* save r3 into r9 for use */
+ mr r18, r3 /* save r3 for retCode of strncpy */
+ bne 0, L(unaligned)
+
+L(aligned):
+ srdi r11, r5, 3 /* compute count for CTR ; count = n/8 */
+ cmpldi cr7, r11, 3 /* if count >= 4, perform unrolling 4 times */
+ ble cr7, L(update1)
+
+ ld r10, 0(r4) /* load doubleWord from src */
+ cmpb r8, r10, r8 /* compare the src dword just read with NULL */
+ cmpdi cr7, r8, 0 /* if cmpb found no NULL, we continue */
+ bne cr7, L(update3)
+
+ std r10, 0(r3) /* copy doubleword at offset=0 */
+ ld r10, 8(r4) /* load next doubleword from offset=8 */
+ cmpb r8, r10, r8 /* compare the src dword just read with NULL */
+ cmpdi cr7, r8, 0 /* if cmpb found no NULL, we continue */
+ bne cr7, L(HopBy8)
+
+ addi r8, r11, -4
+ mr r7, r3
+ srdi r8, r8, 2
+ mr r6, r4
+ addi r8, r8, 1
+ li r12, 0
+ mtctr r8
+ b L(dwordCopy)
+
+ .p2align 4
+L(dWordUnroll):
+ std r8, 16(r9)
+ ld r8, 24(r4) /* load dword,perform loop unrolling again */
+ cmpb r10, r8, r10
+ cmpdi cr7, r10, 0
+ bne cr7, L(HopBy24)
+
+ std r8, 24(r7) /* copy dword at offset=24 */
+ addi r9, r9, 32
+ addi r4, r4, 32
+ bdz L(leftDwords) /* continue with loop on counter */
+
+ ld r3, 32(r6)
+ cmpb r8, r3, r10
+ cmpdi cr7, r8, 0
+ bne cr7, L(update2)
+
+ std r3, 32(r7)
+ ld r10, 40(r6)
+ cmpb r8, r10, r8
+ cmpdi cr7, r8, 0
+ bne cr7, L(HopBy40)
+
+ mr r6, r4 /* update values */
+ mr r7, r9
+ mr r11, r0
+ mr r5, r19
+
+L(dwordCopy):
+ std r10, 8(r9) /* copy dword at offset=8 */
+ addi r19, r5, -32
+ addi r0, r11, -4
+ ld r8, 16(r4)
+ cmpb r10, r8, r12
+ cmpdi cr7, r10, 0
+ beq cr7, L(dWordUnroll)
+
+ addi r9, r9, 16 /* increment dst by 16 */
+ addi r4, r4, 16 /* increment src by 16 */
+ addi r5, r5, -16 /* decrement length 'n' by 16 */
+ addi r0, r11, -2 /* decrement loop counter */
+
+L(dWordUnrollOFF):
+ ld r10, 0(r4) /* load first dword */
+ li r8, 0 /* load mask */
+ cmpb r8, r10, r8
+ cmpdi cr7, r8, 0
+ bne cr7, L(byte_by_byte)
+ mtctr r0
+ li r7, 0
+ b L(CopyDword)
+
+ .p2align 4
+L(loadDWordandCompare):
+ ld r10, 0(r4)
+ cmpb r8, r10, r7
+ cmpdi cr7, r8, 0
+ bne cr7, L(byte_by_byte)
+
+L(CopyDword):
+ addi r9, r9, 8
+ std r10, -8(r9)
+ addi r4, r4, 8
+ addi r5, r5, -8
+ bdnz L(loadDWordandCompare)
+
+L(byte_by_byte):
+ cmpldi cr7, r5, 3
+ ble cr7, L(verifyByte)
+ srdi r10, r5, 2
+ mr r19, r9
+ mtctr r10
+ b L(firstByteUnroll)
+
+ .p2align 4
+L(bytes_unroll):
+ lbz r10, 1(r4) /* load byte from src */
+ cmpdi cr7, r10, 0 /* compare for NULL */
+ stb r10, 1(r19) /* store byte to dst */
+ beq cr7, L(updtDestComputeN2ndByte)
+
+ addi r4, r4, 4 /* advance src */
+
+ lbz r10, -2(r4) /* perform loop unrolling for byte r/w */
+ cmpdi cr7, r10, 0
+ stb r10, 2(r19)
+ beq cr7, L(updtDestComputeN3rdByte)
+
+ lbz r10, -1(r4) /* perform loop unrolling for byte r/w */
+ addi r19, r19, 4
+ cmpdi cr7, r10, 0
+ stb r10, -1(r19)
+ beq cr7, L(ComputeNByte)
+
+ bdz L(update0)
+
+L(firstByteUnroll):
+ lbz r10, 0(r4) /* perform loop unrolling for byte r/w */
+ cmpdi cr7, r10, 0
+ stb r10, 0(r19)
+ bne cr7, L(bytes_unroll)
+ addi r19, r19, 1
+
+L(ComputeNByte):
+ subf r9, r19, r9 /* compute 'n' bytes to fill */
+ add r8, r9, r5
+
+L(zeroFill):
+ cmpdi cr7, r8, 0 /* compare if length is zero */
+ beq cr7, L(update3return)
+
+ mr r3, r19 /* fill buffer with */
+ li r4, 0 /* zero fill buffer */
+ mr r5, r8 /* how many bytes to fill buffer with */
+ bl MEMSET /* call optimized memset */
+ nop
+
+L(update3return):
+#ifdef USE_AS_STPNCPY
+ addi r3, r19, -1 /* update return value */
+#endif
+
+L(hop2return):
+#ifndef USE_AS_STPNCPY
+ mr r3, r18 /* set return value */
+#endif
+ addi r1, r1, FRAMESIZE /* restore stack pointer */
+ ld r0, 16(r1) /* read the saved link register */
+ ld r18, -16(r1) /* restore callers save register, r18 */
+ ld r19, -8(r1) /* restore callers save register, r19 */
+ mtlr r0 /* restore the link register */
+ blr /* return */
+
+ .p2align 4
+L(update0):
+ mr r9, r19
+
+ .p2align 4
+L(verifyByte):
+ rldicl. r8, r5, 0, 62
+#ifdef USE_AS_STPNCPY
+ mr r3, r9
+#endif
+ beq cr0, L(hop2return)
+ mtctr r8
+ addi r4, r4, -1
+ mr r19, r9
+ b L(oneBYone)
+
+ .p2align 4
+L(proceed):
+ bdz L(done)
+
+L(oneBYone):
+ lbzu r10, 1(r4) /* copy byte */
+ addi r19, r19, 1
+ addi r8, r8, -1
+ cmpdi cr7, r10, 0
+ stb r10, -1(r19)
+ bne cr7, L(proceed)
+ b L(zeroFill)
+
+ .p2align 4
+L(done):
+ addi r1, r1, FRAMESIZE /* restore stack pointer */
+#ifdef USE_AS_STPNCPY
+ mr r3, r19 /* set the return value */
+#else
+ mr r3, r18 /* set the return value */
+#endif
+ ld r0, 16(r1) /* read the saved link register */
+ ld r18, -16(r1) /* restore callers save register, r18 */
+ ld r19, -8(r1) /* restore callers save register, r19 */
+ mtlr r0 /* restore the link register */
+ blr /* return */
+
+L(update1):
+ mr r0, r11
+ mr r19, r5
+
+ .p2align 4
+L(leftDwords):
+ cmpdi cr7, r0, 0
+ mr r5, r19
+ bne cr7, L(dWordUnrollOFF)
+ b L(byte_by_byte)
+
+ .p2align 4
+L(updtDestComputeN2ndByte):
+ addi r19, r19, 2 /* update dst by 2 */
+ subf r9, r19, r9 /* compute distance covered */
+ add r8, r9, r5
+ b L(zeroFill)
+
+ .p2align 4
+L(updtDestComputeN3rdByte):
+ addi r19, r19, 3 /* update dst by 3 */
+ subf r9, r19, r9 /* compute distance covered */
+ add r8, r9, r5
+ b L(zeroFill)
+
+ .p2align 4
+L(HopBy24):
+ addi r9, r9, 24 /* increment dst by 24 */
+ addi r4, r4, 24 /* increment src by 24 */
+ addi r5, r5, -24 /* decrement length 'n' by 24 */
+ addi r0, r11, -3 /* decrement loop counter */
+ b L(dWordUnrollOFF)
+
+ .p2align 4
+L(update2):
+ mr r5, r19
+ b L(dWordUnrollOFF)
+
+ .p2align 4
+L(HopBy40):
+ addi r9, r7, 40 /* increment dst by 40 */
+ addi r4, r6, 40 /* increment src by 40 */
+ addi r5, r5, -40 /* decrement length 'n' by 40 */
+ addi r0, r11, -5 /* decrement loop counter */
+ b L(dWordUnrollOFF)
+
+L(update3):
+ mr r0, r11
+ b L(dWordUnrollOFF)
+
+L(HopBy8):
+ addi r9, r3, 8 /* increment dst by 8 */
+ addi r4, r4, 8 /* increment src by 8 */
+ addi r5, r5, -8 /* decrement length 'n' by 8 */
+ addi r0, r11, -1 /* decrement loop counter */
+ b L(dWordUnrollOFF)
+
+L(unaligned):
+ cmpdi r5, 16 /* Proceed byte by byte for n <= 16 */
+ ble L(byte_by_byte)
+ rldicl r7, r3, 0, 61
+ rldicl r6, r4, 0, 61
+ cmpdi r6, 0 /* Check src alignment */
+ beq L(srcaligndstunalign)
+ /* src is unaligned */
+ rlwinm r10, r4, 3,26,28 /* Calculate padding. */
+ clrrdi r4, r4, 3 /* Align the addr to dw boundary */
+ ld r8, 0(r4) /* Load doubleword from memory. */
+ li r0, 0
+ /* Discard bits not part of the string */
+#ifdef __LITTLE_ENDIAN__
+ srd r7, r8, r10
+#else
+ sld r7, r8, r10
+#endif
+ cmpb r0, r7, r0 /* Compare each byte against null */
+ /* Discard bits not part of the string */
+#ifdef __LITTLE_ENDIAN__
+ sld r0, r0, r10
+#else
+ srd r0, r0, r10
+#endif
+ cmpdi r0, 0
+ bne L(bytebybyte) /* if it has null, copy byte by byte */
+ subfic r6, r6, 8
+ rlwinm r12, r3, 3,26,28 /* Calculate padding in bits. */
+ rldicl r9, r3, 0, 61 /* Calculate padding in bytes. */
+ addi r3, r3, -1
+
+ cmpdi r12, 0 /* check dest alignment */
+ beq L(srcunaligndstalign)
+
+ /* both src and dst unaligned */
+#ifdef __LITTLE_ENDIAN__
+ sld r8, r7, r10
+ mr r11, r10
+ addi r11, r11, -8 /* Adjust byte pointer on loaded dw */
+#else
+ srd r8, r7, r10
+ subfic r11, r10, 64
+#endif
+ /* Is dst alignment greater than src alignment? */
+ cmpd cr7, r12, r10
+ ble cr7, L(dst_align_small)
+ /* src alignment is less than dst */
+
+ /* Calculate the dst alignment difference */
+ subfic r7, r9, 8
+ mtctr r7
+
+ /* Write until dst is aligned */
+ cmpdi r0, r7, 4
+ blt L(storebyte1) /* less than 4, store byte by byte */
+ beq L(equal1) /* if its 4, store word */
+ addi r0, r7, -4 /* greater than 4, so stb and stw */
+ mtctr r0
+L(storebyte1):
+#ifdef __LITTLE_ENDIAN__
+ addi r11, r11, 8 /* Adjust byte pointer on loaded dw */
+#else
+ addi r11, r11, -8
+#endif
+ srd r7, r8, r11
+ stbu r7, 1(r3)
+ addi r5, r5, -1
+ bdnz L(storebyte1)
+
+ subfic r7, r9, 8 /* Check the remaining bytes */
+ cmpdi r0, r7, 4
+ blt L(proceed1)
+
+ .align 4
+L(equal1):
+#ifdef __LITTLE_ENDIAN__
+ addi r11, r11, 8 /* Adjust byte pointer on loaded dw */
+ srd r7, r8, r11
+#else
+ subfic r11, r11, 64
+ sld r7, r8, r11
+ srdi r7, r7, 32
+#endif
+ stw r7, 1(r3)
+ addi r3, r3, 4
+ addi r5, r5, -4
+
+L(proceed1):
+ mr r7, r8
+ /* Calculate the leftover bytes to be written */
+ subfic r11, r10, 64
+ subfic r12, r12, 64
+ subf r12, r12, r11 /* remaining bytes on second dw */
+ subfic r10, r12, 64 /* remaining bytes on first dw */
+ subfic r9, r9, 8
+ subf r6, r9, r6 /* recalculate padding */
+L(srcunaligndstalign):
+ addi r3, r3, 1
+ subfic r12, r10, 64 /* remaining bytes on second dw */
+ addi r4, r4, 8
+ li r0,0
+ b L(storedouble)
+
+ .align 4
+L(dst_align_small):
+ mtctr r6
+ /* Write until src is aligned */
+L(storebyte2):
+#ifdef __LITTLE_ENDIAN__
+ addi r11, r11, 8 /* Adjust byte pointer on dw */
+#else
+ addi r11, r11, -8
+#endif
+ srd r7, r8, r11
+ stbu r7, 1(r3)
+ addi r5, r5, -1
+ bdnz L(storebyte2)
+
+ addi r4, r4, 8 /* Increment src pointer */
+ addi r3, r3, 1 /* Increment dst pointer */
+ mr r9, r3
+ li r8, 0
+ cmpd cr7, r12, r10
+ beq cr7, L(aligned)
+ rldicl r6, r3, 0, 61 /* Recalculate padding */
+ mr r7, r6
+
+ /* src is aligned */
+L(srcaligndstunalign):
+ mr r9, r3
+ mr r6, r7
+ ld r8, 0(r4)
+ subfic r10, r7, 8
+ mr r7, r8
+ li r0, 0 /* Check null */
+ cmpb r0, r8, r0
+ cmpdi r0, 0
+ bne L(byte_by_byte) /* Do byte by byte if there is NULL */
+ rlwinm r12, r3, 3,26,28 /* Calculate padding */
+ addi r3, r3, -1
+ /* write byte by byte until aligned */
+#ifdef __LITTLE_ENDIAN__
+ li r11, -8
+#else
+ li r11, 64
+#endif
+ mtctr r10
+ cmpdi r0, r10, 4
+ blt L(storebyte)
+ beq L(equal)
+ addi r0, r10, -4
+ mtctr r0
+L(storebyte):
+#ifdef __LITTLE_ENDIAN__
+ addi r11, r11, 8 /* Adjust byte pointer on dw */
+#else
+ addi r11, r11, -8
+#endif
+ srd r7, r8, r11
+ stbu r7, 1(r3)
+ addi r5, r5, -1
+ bdnz L(storebyte)
+
+ cmpdi r0, r10, 4
+ blt L(align)
+
+ .align 4
+L(equal):
+#ifdef __LITTLE_ENDIAN__
+ addi r11, r11, 8
+ srd r7, r8, r11
+#else
+ subfic r11, r11, 64
+ sld r7, r8, r11
+ srdi r7, r7, 32
+#endif
+ stw r7, 1(r3)
+ addi r5, r5, -4
+ addi r3, r3, 4
+L(align):
+ addi r3, r3, 1
+ addi r4, r4, 8 /* Increment src pointer */
+ subfic r10, r12, 64
+ li r0, 0
+ /* dst addr aligned to 8 */
+L(storedouble):
+ cmpdi r5, 8
+ ble L(null1)
+ ld r7, 0(r4) /* load next dw */
+ cmpb r0, r7, r0
+ cmpdi r0, 0 /* check for null on each new dw */
+ bne L(null)
+#ifdef __LITTLE_ENDIAN__
+ srd r9, r8, r10 /* bytes from first dw */
+ sld r11, r7, r12 /* bytes from second dw */
+#else
+ sld r9, r8, r10
+ srd r11, r7, r12
+#endif
+ or r11, r9, r11 /* make as a single dw */
+ std r11, 0(r3) /* store as std on aligned addr */
+ mr r8, r7 /* still few bytes left to be written */
+ addi r3, r3, 8 /* increment dst addr */
+ addi r4, r4, 8 /* increment src addr */
+ addi r5, r5, -8
+ b L(storedouble) /* Loop until NULL */
+
+ .align 4
+
+/* We've hit the end of the string. Do the rest byte-by-byte. */
+L(null):
+ addi r3, r3, -1
+ mr r10, r12
+ mtctr r6
+#ifdef __LITTLE_ENDIAN__
+ subfic r10, r10, 64
+ addi r10, r10, -8
+#endif
+ cmpdi r0, r5, 4
+ blt L(loop)
+ cmpdi r0, r6, 4
+ blt L(loop)
+
+ /* we can still use stw if leftover >= 4 */
+#ifdef __LITTLE_ENDIAN__
+ addi r10, r10, 8
+ srd r11, r8, r10
+#else
+ subfic r10, r10, 64
+ sld r11, r8, r10
+ srdi r11, r11, 32
+#endif
+ stw r11, 1(r3)
+ addi r5, r5, -4
+ addi r3, r3, 4
+ cmpdi r0, r5, 0
+ beq L(g1)
+ cmpdi r0, r6, 4
+ beq L(bytebybyte1)
+ addi r10, r10, 32
+#ifdef __LITTLE_ENDIAN__
+ addi r10, r10, -8
+#else
+ subfic r10, r10, 64
+#endif
+ addi r0, r6, -4
+ mtctr r0
+ /* remaining byte by byte part of first dw */
+L(loop):
+#ifdef __LITTLE_ENDIAN__
+ addi r10, r10, 8
+#else
+ addi r10, r10, -8
+#endif
+ srd r0, r8, r10
+ stbu r0, 1(r3)
+ addi r5, r5, -1
+ cmpdi r0, r5, 0
+ beq L(g1)
+ bdnz L(loop)
+L(bytebybyte1):
+ addi r3, r3, 1
+ /* remaining byte by byte part of second dw */
+L(bytebybyte):
+ addi r3, r3, -8
+ addi r4, r4, -1
+
+#ifdef __LITTLE_ENDIAN__
+ extrdi. r0, r7, 8, 56
+ stbu r7, 8(r3)
+ addi r5, r5, -1
+ beq L(g2)
+ cmpdi r5, 0
+ beq L(g1)
+ extrdi. r0, r7, 8, 48
+ stbu r0, 1(r3)
+ addi r5, r5, -1
+ beq L(g2)
+ cmpdi r5, 0
+ beq L(g1)
+ extrdi. r0, r7, 8, 40
+ stbu r0, 1(r3)
+ addi r5, r5, -1
+ beq L(g2)
+ cmpdi r5, 0
+ beq L(g1)
+ extrdi. r0, r7, 8, 32
+ stbu r0, 1(r3)
+ addi r5, r5, -1
+ beq L(g2)
+ cmpdi r5, 0
+ beq L(g1)
+ extrdi. r0, r7, 8, 24
+ stbu r0, 1(r3)
+ addi r5, r5, -1
+ beq L(g2)
+ cmpdi r5, 0
+ beq L(g1)
+ extrdi. r0, r7, 8, 16
+ stbu r0, 1(r3)
+ addi r5, r5, -1
+ beq L(g2)
+ cmpdi r5, 0
+ beq L(g1)
+ extrdi. r0, r7, 8, 8
+ stbu r0, 1(r3)
+ addi r5, r5, -1
+ beq L(g2)
+ cmpdi r5, 0
+ beq L(g1)
+ extrdi r0, r7, 8, 0
+ stbu r0, 1(r3)
+ addi r5, r5, -1
+ b L(g2)
+#else
+ extrdi. r0, r7, 8, 0
+ stbu r0, 8(r3)
+ addi r5, r5, -1
+ beq L(g2)
+ cmpdi r5, 0
+ beq L(g1)
+ extrdi. r0, r7, 8, 8
+ stbu r0, 1(r3)
+ addi r5, r5, -1
+ beq L(g2)
+ cmpdi r5, 0
+ beq L(g1)
+ extrdi. r0, r7, 8, 16
+ stbu r0, 1(r3)
+ addi r5, r5, -1
+ beq L(g2)
+ cmpdi r5, 0
+ beq L(g1)
+ extrdi. r0, r7, 8, 24
+ stbu r0, 1(r3)
+ addi r5, r5, -1
+ beq L(g2)
+ cmpdi r5, 0
+ beq L(g1)
+ extrdi. r0, r7, 8, 32
+ stbu r0, 1(r3)
+ addi r5, r5, -1
+ beq L(g2)
+ cmpdi r5, 0
+ beq L(g1)
+ extrdi. r0, r7, 8, 40
+ stbu r0, 1(r3)
+ addi r5, r5, -1
+ beq L(g2)
+ cmpdi r5, 0
+ beq L(g1)
+ extrdi. r0, r7, 8, 48
+ stbu r0, 1(r3)
+ addi r5, r5, -1
+ beq L(g2)
+ cmpdi r5, 0
+ beq L(g1)
+ stbu r7, 1(r3)
+ addi r5, r5, -1
+ b L(g2)
+#endif
+L(g1):
+#ifdef USE_AS_STPNCPY
+ addi r3, r3, 1
+#endif
+L(g2):
+ addi r3, r3, 1
+ mr r19, r3
+ mr r8, r5
+ b L(zeroFill)
+L(null1):
+ mr r9, r3
+ subf r4, r6, r4
+ b L(byte_by_byte)
+END(FUNC_NAME)
+#ifndef USE_AS_STPNCPY
+libc_hidden_builtin_def (strncpy)
+#endif
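
Stripping away the alignment machinery, the routine above has this overall C shape (a sketch, not the actual code): copy up to the null byte within the first n bytes, then let memset pad the remainder with zeros, exactly as the L(zeroFill) path does.

    #include <string.h>

    char *
    strncpy_sketch (char *dst, const char *src, size_t n)
    {
      size_t copied = strnlen (src, n);  /* bytes before null, or n */
      memcpy (dst, src, copied);
      memset (dst + copied, '\0', n - copied);  /* the L(zeroFill) pad */
      return dst;  /* USE_AS_STPNCPY would return dst + copied.  */
    }
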
diff --git a/sysdeps/powerpc/powerpc64/power7/strnlen.S b/sysdeps/powerpc/powerpc64/power7/strnlen.S
index 7993dae69e..2de267fd35 100644
--- a/sysdeps/powerpc/powerpc64/power7/strnlen.S
+++ b/sysdeps/powerpc/powerpc64/power7/strnlen.S
@@ -1,5 +1,5 @@
/* Optimized strnlen implementation for PowerPC64/POWER7 using cmpb insn.
- Copyright (C) 2010-2014 Free Software Foundation, Inc.
+ Copyright (C) 2010-2015 Free Software Foundation, Inc.
Contributed by Luis Machado <luisgpm@br.ibm.com>.
This file is part of the GNU C Library.
diff --git a/sysdeps/powerpc/powerpc64/power7/strrchr.S b/sysdeps/powerpc/powerpc64/power7/strrchr.S
new file mode 100644
index 0000000000..68565c68bc
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/strrchr.S
@@ -0,0 +1,255 @@
+/* Optimized strrchr implementation for PowerPC64/POWER7 using cmpb insn.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* char * [r3] strrchr (char *s [r3], int c [r4]) */
+ .machine power7
+ENTRY (strrchr)
+ CALL_MCOUNT 2
+ dcbt 0,r3
+ clrrdi r8,r3,3 /* Align the address to doubleword boundary. */
+ cmpdi cr7,r4,0
+ ld r12,0(r8) /* Load doubleword from memory. */
+ li r9,0 /* used to store last occurrence */
+ li r0,0 /* Doubleword with null chars to use
+ with cmpb. */
+
+ rlwinm r6,r3,3,26,28 /* Calculate padding. */
+
+ beq cr7,L(null_match)
+
+ /* Replicate byte to doubleword. */
+ insrdi r4,r4,8,48
+ insrdi r4,r4,16,32
+ insrdi r4,r4,32,0
+
+ /* r4 is changed now; if it was passed with more chars,
+ check for null again */
+ cmpdi cr7,r4,0
+ beq cr7,L(null_match)
+ /* Now r4 has a doubleword of c bytes and r0 has
+ a doubleword of null bytes. */
+
+ cmpb r10,r12,r4 /* Compare each byte against c byte. */
+ cmpb r11,r12,r0 /* Compare each byte against null byte. */
+
+ /* Move the doublewords left and right to discard the bits that are
+ not part of the string and bring them back as zeros. */
+#ifdef __LITTLE_ENDIAN__
+ srd r10,r10,r6
+ srd r11,r11,r6
+ sld r10,r10,r6
+ sld r11,r11,r6
+#else
+ sld r10,r10,r6
+ sld r11,r11,r6
+ srd r10,r10,r6
+ srd r11,r11,r6
+#endif
+ or r5,r10,r11 /* OR the results to speed things up. */
+ cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes
+ have been found. */
+ bne cr7,L(done)
+
+L(align):
+ mtcrf 0x01,r8
+
+ /* Are we now aligned to a doubleword boundary? If so, skip to
+ the main loop. Otherwise, go through the alignment code. */
+
+ bt 28,L(loop)
+
+ /* Handle WORD2 of pair. */
+ ldu r12,8(r8)
+ cmpb r10,r12,r4
+ cmpb r11,r12,r0
+ or r5,r10,r11
+ cmpdi cr7,r5,0
+ bne cr7,L(done)
+ b L(loop) /* We branch here (rather than falling through)
+ to skip the nops due to heavy alignment
+ of the loop below. */
+ .p2align 5
+L(loop):
+ /* Load two doublewords, compare and merge in a
+ single register for speed. This is an attempt
+ to speed up the null-checking process for bigger strings. */
+ ld r12,8(r8)
+ ldu r7,16(r8)
+ cmpb r10,r12,r4
+ cmpb r11,r12,r0
+ cmpb r6,r7,r4
+ cmpb r7,r7,r0
+ or r12,r10,r11
+ or r5,r6,r7
+ or r5,r12,r5
+ cmpdi cr7,r5,0
+ beq cr7,L(loop)
+
+ /* OK, one (or both) of the doublewords contains a c/null byte. Check
+ the first doubleword and decrement the address in case the first
+ doubleword really contains a c/null byte. */
+ cmpdi cr6,r12,0
+ addi r8,r8,-8
+ bne cr6,L(done)
+
+ /* The c/null byte must be in the second doubleword. Adjust the
+ address again and move the result of cmpb to r10 so we can calculate
+ the pointer. */
+
+ mr r10,r6
+ mr r11,r7
+ addi r8,r8,8
+
+ /* r10/r11 have the output of the cmpb instructions, that is,
+ 0xff in the same position as the c/null byte in the original
+ doubleword from the string. Use that to calculate the pointer. */
+
+L(done):
+ /* If there is more than one 0xff in r11, find the first position
+ of 0xff in r11 and fill r10 with 0 from that position onward */
+ cmpdi cr7,r11,0
+ beq cr7,L(no_null)
+#ifdef __LITTLE_ENDIAN__
+ addi r3,r11,-1
+ andc r3,r3,r11
+ popcntd r0,r3
+#else
+ cntlzd r0,r11
+#endif
+ subfic r0,r0,63
+ li r6,-1
+#ifdef __LITTLE_ENDIAN__
+ srd r0,r6,r0
+#else
+ sld r0,r6,r0
+#endif
+ and r10,r0,r10
+L(no_null):
+#ifdef __LITTLE_ENDIAN__
+ cntlzd r0,r10 /* Count leading zeros before c matches. */
+ addi r3,r10,-1
+ andc r3,r3,r10
+ addi r10,r11,-1
+ andc r10,r10,r11
+ cmpld cr7,r3,r10
+ bgt cr7,L(no_match)
+#else
+ addi r3,r10,-1 /* Count trailing zeros before c matches. */
+ andc r3,r3,r10
+ popcntd r0,r3
+ cmpld cr7,r11,r10
+ bgt cr7,L(no_match)
+#endif
+ srdi r0,r0,3 /* Convert trailing zeros to bytes. */
+ subfic r0,r0,7
+ add r9,r8,r0 /* Return address of the matching c byte
+ or null in case c was not found. */
+ li r0,0
+ cmpdi cr7,r11,0 /* If r11 == 0, no null's have been found. */
+ beq cr7,L(align)
+
+ .align 4
+L(no_match):
+ mr r3,r9
+ blr
+
+/* We are here because strrchr was called with a null byte. */
+ .align 4
+L(null_match):
+ /* r0 has a doubleword of null bytes. */
+
+ cmpb r5,r12,r0 /* Compare each byte against null bytes. */
+
+ /* Move the doublewords left and right to discard the bits that are
+ not part of the string and bring them back as zeros. */
+#ifdef __LITTLE_ENDIAN__
+ srd r5,r5,r6
+ sld r5,r5,r6
+#else
+ sld r5,r5,r6
+ srd r5,r5,r6
+#endif
+ cmpdi cr7,r5,0 /* If r5 == 0, no null bytes
+ have been found. */
+ bne cr7,L(done_null)
+
+ mtcrf 0x01,r8
+
+ /* Are we now aligned to a quadword boundary? If so, skip to
+ the main loop. Otherwise, go through the alignment code. */
+
+ bt 28,L(loop_null)
+
+ /* Handle WORD2 of pair. */
+ ldu r12,8(r8)
+ cmpb r5,r12,r0
+ cmpdi cr7,r5,0
+ bne cr7,L(done_null)
+ b L(loop_null) /* We branch here (rather than falling through)
+ to skip the nops due to heavy alignment
+ of the loop below. */
+
+ /* Main loop to look for the end of the string. Since it's a
+ small loop (< 8 instructions), align it to 32-bytes. */
+ .p2align 5
+L(loop_null):
+ /* Load two doublewords, compare and merge in a
+ single register for speed. This is an attempt
+ to speed up the null-checking process for bigger strings. */
+ ld r12,8(r8)
+ ldu r11,16(r8)
+ cmpb r5,r12,r0
+ cmpb r10,r11,r0
+ or r6,r5,r10
+ cmpdi cr7,r6,0
+ beq cr7,L(loop_null)
+
+ /* OK, one (or both) of the doublewords contains a null byte. Check
+ the first doubleword and decrement the address in case the first
+ doubleword really contains a null byte. */
+
+ cmpdi cr6,r5,0
+ addi r8,r8,-8
+ bne cr6,L(done_null)
+
+ /* The null byte must be in the second doubleword. Adjust the address
+ again and move the result of cmpb to r10 so we can calculate the
+ pointer. */
+
+ mr r5,r10
+ addi r8,r8,8
+
+ /* r5 has the output of the cmpb instruction, that is, it contains
+ 0xff in the same position as the null byte in the original
+ doubleword from the string. Use that to calculate the pointer. */
+L(done_null):
+#ifdef __LITTLE_ENDIAN__
+ addi r0,r5,-1
+ andc r0,r0,r5
+ popcntd r0,r0
+#else
+ cntlzd r0,r5 /* Count leading zeros before the match. */
+#endif
+ srdi r0,r0,3 /* Convert trailing zeros to bytes. */
+ add r3,r8,r0 /* Return address of the matching null byte. */
+ blr
+END (strrchr)
+weak_alias (strrchr, rindex)
+libc_hidden_builtin_def (strrchr)
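
Byte for byte, the scan above is equivalent to the following C sketch; the assembly gets its speed by matching a doubleword at a time with cmpb and keeping only the address of the last c match (in r9) until the null byte is seen.

    #include <stddef.h>

    char *
    strrchr_sketch (const char *s, int c)
    {
      const char *last = NULL;
      do
        if (*s == (char) c)
          last = s;             /* remember the latest match */
      while (*s++ != '\0');
      return (char *) last;     /* NULL if c never appeared */
    }
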
diff --git a/sysdeps/powerpc/powerpc64/power7/strstr.S b/sysdeps/powerpc/powerpc64/power7/strstr.S
new file mode 100644
index 0000000000..8dca31ce35
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/strstr.S
@@ -0,0 +1,509 @@
+/* Optimized strstr implementation for PowerPC64/POWER7.
+ Copyright (C) 2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* char * [r3] strstr (char *s [r3], char *pat [r4]) */
+
+/* The performance gain is obtained by using aligned memory accesses,
+ * doubleword loads, and the cmpb instruction for quicker comparisons. */
+
+#ifndef STRLEN
+/* For builds with no IFUNC support, local calls should be made to internal
+ GLIBC symbol (created by libc_hidden_builtin_def). */
+# ifdef SHARED
+# define STRLEN __GI_strlen
+# else
+# define STRLEN strlen
+# endif
+#endif
+
+#ifndef STRNLEN
+/* For builds with no IFUNC support, local calls should be made to internal
+ GLIBC symbol (created by libc_hidden_builtin_def). */
+# ifdef SHARED
+# define STRNLEN __GI_strnlen
+# else
+# define STRNLEN strnlen
+# endif
+#endif
+
+#ifndef STRCHR
+# ifdef SHARED
+# define STRCHR __GI_strchr
+# else
+# define STRCHR strchr
+# endif
+#endif
+
+#define FRAMESIZE (FRAME_MIN_SIZE+32)
+ .machine power7
+EALIGN (strstr, 4, 0)
+ CALL_MCOUNT 2
+ mflr r0 /* Load link register LR to r0. */
+ std r31, -8(r1) /* Save callers register r31. */
+ cfi_offset(r31, -8)
+ std r30, -16(r1) /* Save callers register r30. */
+ cfi_offset(r30, -16)
+ std r29, -24(r1) /* Save callers register r29. */
+ cfi_offset(r29, -24)
+ std r0, 16(r1) /* Store the link register. */
+ cfi_offset(lr, 16)
+ stdu r1, -FRAMESIZE(r1) /* Create the stack frame. */
+ cfi_adjust_cfa_offset(FRAMESIZE)
+
+ dcbt 0, r3
+ dcbt 0, r4
+
+ cmpdi cr7, r3, 0
+ beq cr7, L(retnull)
+ cmpdi cr7, r4, 0
+ beq cr7, L(retnull)
+
+ mr r29, r3
+ mr r30, r4
+ mr r3, r4
+ bl STRLEN
+ nop
+
+ cmpdi cr7, r3, 0 /* If search str is null. */
+ beq cr7, L(ret_r3)
+
+ /* Call __strstr_ppc if needle len > 2048 */
+ cmpdi cr7, r3, 2048
+ bgt cr7, L(default)
+
+ mr r31, r3
+ mr r4, r3
+ mr r3, r29
+ bl STRNLEN
+ nop
+
+ cmpd cr7, r3, r31 /* If len(r3) < len(r4). */
+ blt cr7, L(retnull)
+ mr r3, r29
+ lbz r4, 0(r30)
+ bl STRCHR
+ nop
+
+ mr r11, r3
+ /* If first char of search str is not present. */
+ cmpdi cr7, r3, 0
+ ble cr7, L(end)
+
+ rldicl r8, r3, 0, 52 /* Page cross check. */
+ cmpldi cr7, r8, 4096-16
+ bgt cr7, L(bytebybyte)
+
+ rldicl r8, r30, 0, 52
+ cmpldi cr7, r8, 4096-16
+ bgt cr7, L(bytebybyte)
+
+ /* If len(r4) < 8 handle in a different way. */
+ /* Shift position based on null and use cmpb. */
+ cmpdi cr7, r31, 8
+ blt cr7, L(lessthan8)
+
+ /* Len(r4) >= 8 reaches here. */
+ mr r8, r3 /* Save r3 for future use. */
+ mr r4, r30 /* Restore r4. */
+ li r0, 0
+ rlwinm r10, r30, 3, 26, 28 /* Calculate padding in bits. */
+ clrrdi r4, r4, 3 /* Make r4 aligned to 8. */
+ ld r6, 0(r4)
+ addi r4, r4, 8
+ cmpdi cr7, r10, 0 /* Check if it is already aligned. */
+ beq cr7, L(begin1)
+#ifdef __LITTLE_ENDIAN__
+ srd r6, r6, r10 /* Discard unwanted bits. */
+#else
+ sld r6, r6, r10
+#endif
+ ld r9, 0(r4)
+ subfic r10, r10, 64
+#ifdef __LITTLE_ENDIAN__
+ sld r9, r9, r10 /* Discard unwanted bits. */
+#else
+ srd r9, r9, r10
+#endif
+ or r6, r6, r9 /* Form complete search str. */
+L(begin1):
+ mr r29, r6
+ rlwinm r10, r3, 3, 26, 28
+ clrrdi r3, r3, 3
+ ld r5, 0(r3)
+ cmpb r9, r0, r6 /* Check if input has null. */
+ cmpdi cr7, r9, 0
+ bne cr7, L(return3)
+ cmpb r9, r0, r5 /* Check if input has null. */
+#ifdef __LITTLE_ENDIAN__
+ srd r9, r9, r10
+#else
+ sld r9, r9, r10
+#endif
+ cmpdi cr7, r9, 0
+ bne cr7, L(retnull)
+
+ li r12, -8 /* Shift values. */
+ li r11, 72 /* Shift values. */
+ cmpdi cr7, r10, 0
+ beq cr7, L(nextbyte1)
+ mr r12, r10
+ addi r12, r12, -8
+ subfic r11, r12, 64
+
+L(nextbyte1):
+ ldu r7, 8(r3) /* Load next dw. */
+ addi r12, r12, 8 /* Shift one byte and compare. */
+ addi r11, r11, -8
+#ifdef __LITTLE_ENDIAN__
+ srd r9, r5, r12 /* Rotate based on mask. */
+ sld r10, r7, r11
+#else
+ sld r9, r5, r12
+ srd r10, r7, r11
+#endif
+ /* Form single dw from few bytes on first load and second load. */
+ or r10, r9, r10
+ /* Check for null in the formed dw. */
+ cmpb r9, r0, r10
+ cmpdi cr7, r9, 0
+ bne cr7, L(retnull)
+ /* Cmpb search str and input str. */
+ cmpb r9, r10, r6
+ cmpdi cr7, r9, -1
+ beq cr7, L(match)
+ addi r8, r8, 1
+ b L(begin)
+
+ .align 4
+L(match):
+ /* There is a match of 8 bytes, check next bytes. */
+ cmpdi cr7, r31, 8
+ beq cr7, L(return)
+ /* Update next starting point r8. */
+ srdi r9, r11, 3
+ subf r9, r9, r3
+ mr r8, r9
+
+L(secondmatch):
+ mr r5, r7
+ rlwinm r10, r30, 3, 26, 28 /* Calculate padding in bits. */
+ ld r6, 0(r4)
+ addi r4, r4, 8
+ cmpdi cr7, r10, 0 /* Check if it is already aligned. */
+ beq cr7, L(proceed3)
+#ifdef __LITTLE_ENDIAN__
+ srd r6, r6, r10 /* Discard unwanted bits. */
+ cmpb r9, r0, r6
+ sld r9, r9, r10
+#else
+ sld r6, r6, r10
+ cmpb r9, r0, r6
+ srd r9, r9, r10
+#endif
+ cmpdi cr7, r9, 0
+ bne cr7, L(proceed3)
+ ld r9, 0(r4)
+ subfic r10, r10, 64
+#ifdef __LITTLE_ENDIAN__
+ sld r9, r9, r10 /* Discard unwanted bits. */
+#else
+ srd r9, r9, r10
+#endif
+ or r6, r6, r9 /* Form complete search str. */
+
+L(proceed3):
+ li r7, 0
+ addi r3, r3, 8
+ cmpb r9, r0, r5
+ cmpdi cr7, r9, 0
+ bne cr7, L(proceed4)
+ ld r7, 0(r3)
+L(proceed4):
+#ifdef __LITTLE_ENDIAN__
+ srd r9, r5, r12
+ sld r10, r7, r11
+#else
+ sld r9, r5, r12
+ srd r10, r7, r11
+#endif
+ /* Form single dw with few bytes from first and second load. */
+ or r10, r9, r10
+ cmpb r9, r0, r6
+ cmpdi cr7, r9, 0
+ bne cr7, L(return4)
+ /* Check for null in the formed dw. */
+ cmpb r9, r0, r10
+ cmpdi cr7, r9, 0
+ bne cr7, L(retnull)
+ /* If the next 8 bytes don't match, start the search again. */
+ cmpb r9, r10, r6
+ cmpdi cr7, r9, -1
+ bne cr7, L(reset)
+ /* If the next 8 bytes match, load and compare next 8. */
+ b L(secondmatch)
+
+ .align 4
+L(reset):
+ /* Start the search again. */
+ addi r8, r8, 1
+ b L(begin)
+
+ .align 4
+L(return3):
+ /* Count leading zeros and compare partial dw. */
+#ifdef __LITTLE_ENDIAN__
+ addi r7, r9, -1
+ andc r7, r7, r9
+ popcntd r7, r7
+ subfic r7, r7, 64
+ sld r10, r5, r7
+ sld r6, r6, r7
+#else
+ cntlzd r7, r9
+ subfic r7, r7, 64
+ srd r10, r5, r7
+ srd r6, r6, r7
+#endif
+ cmpb r9, r10, r6
+ cmpdi cr7, r9, -1
+ addi r8, r8, 1
+ /* Start search again if there is no match. */
+ bne cr7, L(begin)
+ /* If the words match, update return values. */
+ subfic r7, r7, 64
+ srdi r7, r7, 3
+ add r3, r3, r7
+ subf r3, r31, r3
+ b L(end)
+
+ .align 4
+L(return4):
+ /* Count leading zeros and compare partial dw. */
+#ifdef __LITTLE_ENDIAN__
+ addi r7, r9, -1
+ andc r7, r7, r9
+ popcntd r7, r7
+ subfic r7, r7, 64
+ sld r10, r10, r7
+ sld r6, r6, r7
+#else
+ cntlzd r7, r9
+ subfic r7, r7, 64
+ srd r10, r10, r7
+ srd r6, r6, r7
+#endif
+ cmpb r9, r10, r6
+ cmpdi cr7, r9, -1
+ addi r8, r8, 1
+ bne cr7, L(begin)
+ subfic r7, r7, 64
+ srdi r11, r11, 3
+ subf r3, r11, r3
+ srdi r7, r7, 3
+ add r3, r3, r7
+ subf r3, r31, r3
+ b L(end)
+
+ .align 4
+L(begin):
+ mr r3, r8
+ lbz r4, 0(r30)
+ bl STRCHR
+ nop
+ /* If first char of search str is not present. */
+ cmpdi cr7, r3, 0
+ ble cr7, L(end)
+ mr r8, r3
+ mr r4, r30 /* Restore r4. */
+ li r0, 0
+ mr r6, r29
+ clrrdi r4, r4, 3
+ addi r4, r4, 8
+ b L(begin1)
+
+ /* Handle less than 8 search string. */
+ .align 4
+L(lessthan8):
+ mr r4, r3
+ mr r9, r30
+ li r0, 0
+
+ rlwinm r10, r9, 3, 26, 28 /* Calculate padding in bits. */
+ srdi r8, r10, 3 /* Padding in bytes. */
+ clrrdi r9, r9, 3 /* Make r9 aligned to 8. */
+ ld r6, 0(r9)
+ cmpdi cr7, r10, 0 /* Check if it is already aligned. */
+ beq cr7, L(proceed2)
+#ifdef __LITTLE_ENDIAN__
+ srd r6, r6, r10 /* Discard unwanted bits. */
+#else
+ sld r6, r6, r10
+#endif
+ subfic r8, r8, 8
+ cmpd cr7, r8, r31 /* Next load needed? */
+ bge cr7, L(proceed2)
+ ld r7, 8(r9)
+ subfic r10, r10, 64
+#ifdef __LITTLE_ENDIAN__
+ sld r7, r7, r10 /* Discard unwanted bits. */
+#else
+ srd r7, r7, r10
+#endif
+ or r6, r6, r7 /* Form complete search str. */
+L(proceed2):
+ mr r29, r6
+ rlwinm r10, r3, 3, 26, 28
+ clrrdi r7, r3, 3 /* Make r3 aligned. */
+ ld r5, 0(r7)
+ sldi r8, r31, 3
+ subfic r8, r8, 64
+#ifdef __LITTLE_ENDIAN__
+ sld r6, r6, r8
+ cmpb r9, r0, r5
+ srd r9, r9, r10
+#else
+ srd r6, r6, r8
+ cmpb r9, r0, r5
+ sld r9, r9, r10
+#endif
+ cmpdi cr7, r9, 0
+ bne cr7, L(noload)
+ cmpdi cr7, r10, 0
+ beq cr7, L(continue)
+ ld r7, 8(r7)
+L(continue1):
+ mr r12, r10
+ addi r12, r12, -8
+ subfic r11, r12, 64
+ b L(nextbyte)
+
+ .align 4
+L(continue):
+ ld r7, 8(r7)
+ li r12, -8 /* Shift values. */
+ li r11, 72 /* Shift values. */
+L(nextbyte):
+ addi r12, r12, 8 /* Mask for rotation. */
+ addi r11, r11, -8
+#ifdef __LITTLE_ENDIAN__
+ srd r9, r5, r12
+ sld r10, r7, r11
+ or r10, r9, r10
+ sld r10, r10, r8
+ cmpb r9, r0, r10
+ srd r9, r9, r8
+#else
+ sld r9, r5, r12
+ srd r10, r7, r11
+ or r10, r9, r10
+ srd r10, r10, r8
+ cmpb r9, r0, r10
+ sld r9, r9, r8
+#endif
+ cmpdi cr7, r9, 0
+ bne cr7, L(retnull)
+ cmpb r9, r10, r6
+ cmpdi cr7, r9, -1
+ beq cr7, L(end)
+ addi r3, r4, 1
+ lbz r4, 0(r30)
+ bl STRCHR
+ nop
+ /* If first char of search str is not present. */
+ cmpdi cr7, r3, 0
+ ble cr7, L(end)
+ mr r4, r3
+ mr r6, r29
+ li r0, 0
+ b L(proceed2)
+
+ .align 4
+L(noload):
+ /* Reached null in r3, so skip next load. */
+ li r7, 0
+ b L(continue1)
+
+ .align 4
+L(return):
+ /* Update return values. */
+ srdi r9, r11, 3
+ subf r3, r9, r3
+ b L(end)
+
+ /* Handling byte by byte. */
+ .align 4
+L(bytebybyte):
+ mr r8, r3
+ addi r8, r8, -1
+L(loop1):
+ addi r8, r8, 1
+ mr r3, r8
+ mr r4, r30
+ lbz r6, 0(r4)
+ cmpdi cr7, r6, 0
+ beq cr7, L(updater3)
+L(loop):
+ lbz r5, 0(r3)
+ cmpdi cr7, r5, 0
+ beq cr7, L(retnull)
+ cmpld cr7, r6, r5
+ bne cr7, L(loop1)
+ addi r3, r3, 1
+ addi r4, r4, 1
+ lbz r6, 0(r4)
+ cmpdi cr7, r6, 0
+ beq cr7, L(updater3)
+ b L(loop)
+
+ /* Handling return values. */
+ .align 4
+L(updater3):
+ subf r3, r31, r3 /* Subtract the search-string length from r3. */
+ b L(end)
+
+ .align 4
+L(ret_r3):
+ mr r3, r29 /* Return r3. */
+ b L(end)
+
+ .align 4
+L(retnull):
+ li r3, 0 /* Return NULL. */
+ b L(end)
+
+ .align 4
+L(default):
+ mr r3, r29
+ mr r4, r30
+ bl __strstr_ppc
+ nop
+
+ .align 4
+L(end):
+ addi r1, r1, FRAMESIZE /* Restore stack pointer. */
+ cfi_adjust_cfa_offset(-FRAMESIZE)
+ ld r0, 16(r1) /* Restore the saved link register. */
+ ld r29, -24(r1) /* Restore callers save register r29. */
+ ld r30, -16(r1) /* Restore callers save register r30. */
+ ld r31, -8(r1) /* Restore callers save register r31. */
+ mtlr r0 /* Restore the link register. */
+ blr
+END (strstr)
+libc_hidden_builtin_def (strstr)
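
The overall control flow of this routine reduces to the following C sketch: needle lengths above 2048 fall back to the generic __strstr_ppc (the fallback the file itself calls at L(default)), short haystacks bail out early via strnlen, and the main loop hops between strchr hits for the first needle byte. In the real code the strncmp step below is replaced by the aligned doubleword cmpb comparisons shown above.

    #include <string.h>

    extern char *__strstr_ppc (const char *, const char *);

    char *
    strstr_sketch (const char *s, const char *pat)
    {
      size_t patlen = strlen (pat);
      if (patlen == 0)
        return (char *) s;             /* empty needle matches at s */
      if (patlen > 2048)
        return __strstr_ppc (s, pat);  /* generic fallback */
      if (strnlen (s, patlen) < patlen)
        return NULL;                   /* haystack shorter than needle */
      for (const char *p = strchr (s, pat[0]); p != NULL;
           p = strchr (p + 1, pat[0]))
        if (strncmp (p, pat, patlen) == 0)
          return (char *) p;
      return NULL;
    }
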
diff --git a/sysdeps/powerpc/powerpc64/power7/sub_n.S b/sysdeps/powerpc/powerpc64/power7/sub_n.S
index d6539aa067..460bb1907a 100644
--- a/sysdeps/powerpc/powerpc64/power7/sub_n.S
+++ b/sysdeps/powerpc/powerpc64/power7/sub_n.S
@@ -1,6 +1,6 @@
/* PowerPC64 mpn_lshift -- mpn_add_n/mpn_sub_n -- mpn addition and
subtraction.
- Copyright (C) 2013-2014 Free Software Foundation, Inc.
+ Copyright (C) 2013-2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or