summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Ryan S. Arnold <rsa@linux.vnet.ibm.com> 2012-10-30 17:07:18 -0500
committer Ryan S. Arnold <rsa@linux.vnet.ibm.com> 2012-10-30 17:07:18 -0500
commit 09dec6c37e3cd967f62795320703647f24545e3e (patch)
tree 78b5597c0aa457682a6215382022348ec87d07c5
parent 9f45bfe790a59bfc072ef096b21dc701e03bccf9 (diff)
Correct cacheline size to 32-bytes for ppc405 memset.S (bug 14595).
This patch also creates a version of memset.S for the ppc476 processor which uses a 128-byte cacheline size for dcbz insns.
-rw-r--r-- NEWS 4
-rw-r--r-- ports/ChangeLog.powerpc 9
-rw-r--r-- ports/sysdeps/powerpc/powerpc32/405/memset.S 12
-rw-r--r-- ports/sysdeps/powerpc/powerpc32/476/memset.S 154
4 files changed, 171 insertions, 8 deletions
diff --git a/NEWS b/NEWS
index b54365f1e7..fe569e1240 100644
--- a/NEWS
+++ b/NEWS
@@ -16,8 +16,8 @@ Version 2.17
14303, 14307, 14328, 14331, 14336, 14337, 14347, 14349, 14376, 14417,
14459, 14476, 14477, 14505, 14510, 14516, 14518, 14519, 14530, 14532,
14538, 14543, 14544, 14545, 14557, 14562, 14568, 14576, 14579, 14583,
- 14587, 14602, 14621, 14638, 14645, 14648, 14652, 14660, 14661, 14683,
- 14694, 14716, 14743, 14767, 14783.
+ 14587, 14595, 14602, 14621, 14638, 14645, 14648, 14652, 14660, 14661,
+ 14683, 14694, 14716, 14743, 14767, 14783.
* Support for STT_GNU_IFUNC symbols added for s390 and s390x.
Optimized versions of memcpy, memset, and memcmp added for System z10 and
diff --git a/ports/ChangeLog.powerpc b/ports/ChangeLog.powerpc
index 642e7165c6..e22a7333a3 100644
--- a/ports/ChangeLog.powerpc
+++ b/ports/ChangeLog.powerpc
@@ -1,3 +1,12 @@
+2012-09-25 Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
+ Ryan S. Arnold <rsa@linux.vnet.ibm.com>
+
+ [BZ #14595]
+ * sysdeps/powerpc/powerpc32/476/memset.S: New file copied from
+ 405/memset.S to preserve 128-byte cacheline size.
+ * sysdeps/powerpc/powerpc32/405/memset.S (memset): Fix cacheline size
+ to 32-bytes for 405, 440, and 464 processors.
+
2012-10-19 Roland McGrath <roland@hack.frob.com>
* sysdeps/unix/sysv/linux/powerpc/powerpc32/nofpu/nptl/libc.abilist
diff --git a/ports/sysdeps/powerpc/powerpc32/405/memset.S b/ports/sysdeps/powerpc/powerpc32/405/memset.S
index e132ce3652..c2ee6c593c 100644
--- a/ports/sysdeps/powerpc/powerpc32/405/memset.S
+++ b/ports/sysdeps/powerpc/powerpc32/405/memset.S
@@ -1,5 +1,5 @@
-/* Optimized memset implementation for PowerPC476.
- Copyright (C) 2010 Free Software Foundation, Inc.
+/* Optimized memset for PowerPC405,440,464 (32-byte cacheline).
+ Copyright (C) 2012 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -104,7 +104,7 @@ L(use_dcbz):
add r3,r3,r7
L(skip_string_loop):
- clrlwi r8,r6,25
+ clrlwi r8,r6,27
srwi. r8,r8,4
beq L(dcbz_pre_loop)
mtctr r8
@@ -119,14 +119,14 @@ L(word_loop):
bdnz L(word_loop)
L(dcbz_pre_loop):
- srwi r6,r5,7
+ srwi r6,r5,5
mtctr r6
addi r7,0,0
L(dcbz_loop):
dcbz r3,r7
- addi r3,r3,0x80
- subi r5,r5,0x80
+ addi r3,r3,0x20
+ subi r5,r5,0x20
bdnz L(dcbz_loop)
srwi. r6,r5,4
beq L(postword2_count_loop)
diff --git a/ports/sysdeps/powerpc/powerpc32/476/memset.S b/ports/sysdeps/powerpc/powerpc32/476/memset.S
new file mode 100644
index 0000000000..8b5750442b
--- /dev/null
+++ b/ports/sysdeps/powerpc/powerpc32/476/memset.S
@@ -0,0 +1,154 @@
+/* Optimized memset for PowerPC476 (128-byte cacheline).
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <bp-sym.h>
+#include <bp-asm.h>
+
+/* memset
+
+ r3:destination address and return value
+ r4:source integer to copy
+ r5:byte count
+ r11:source integer to copy in all 32 bits of reg
+ r12:saved destination address (returned in r3)
+
+ Save destination address in r12
+ If destination is unaligned and count is greater than 255 bytes
+ set 0-3 bytes to make destination aligned
+ If count is greater than 255 bytes and setting zero to memory
+ use dcbz to set memory when we can
+ otherwise do the following
+ If 16 or more bytes to set we use 16 byte (4 word) store loop.
+ Finally we set 0-15 extra bytes with string store. */
+
+EALIGN (BP_SYM (memset), 5, 0) /* Entry point; EALIGN comes from an included header, presumably aligning the entry to 2^5 bytes - confirm in sysdep.h.  */
+ rlwinm r11,r4,0,24,31 /* r11 = low byte of the fill value in r4.  */
+ rlwimi r11,r4,8,16,23 /* Insert that byte again one byte higher: low halfword = byte,byte.  */
+ rlwimi r11,r11,16,0,15 /* Copy the low halfword into the high halfword: fill byte in all 4 bytes.  */
+ addi r12,r3,0 /* Keep the original destination; it is restored into r3 at L(end_memset).  */
+ cmpwi r5,0x00FF /* Signed compare of the byte count against 255.  */
+ ble L(preword8_count_loop) /* Count <= 255: skip alignment and dcbz, go straight to the store loops.  */
+ cmpwi r4,0x00 /* Is the whole fill argument zero?  */
+ beq L(use_dcbz) /* Yes: zero memory with dcbz where possible.  */
+ neg r6,r3
+ clrlwi. r6,r6,30 /* r6 = (-dest) & 3 = bytes needed to reach word alignment.  */
+ beq L(preword8_count_loop) /* Already word aligned.  */
+ addi r8,0,1 /* r8 = 1, the per-byte decrement used below.  */
+ mtctr r6 /* CTR = number of alignment bytes (1-3).  */
+ subi r3,r3,1 /* Bias r3 down by one for the update-form store.  */
+
+L(unaligned_bytecopy_loop):
+ stbu r11,0x1(r3) /* Store one fill byte at r3+1 and advance r3.  */
+ subf. r5,r8,r5 /* count -= 1, setting CR0.  */
+ beq L(end_memset) /* Count reached zero.  */
+ bdnz L(unaligned_bytecopy_loop)
+ addi r3,r3,1 /* Undo the bias: r3 points at the next byte to set.  */
+
+L(preword8_count_loop):
+ srwi. r6,r5,4 /* r6 = count / 16 = number of 16-byte chunks.  */
+ beq L(preword2_count_loop) /* Fewer than 16 bytes left.  */
+ mtctr r6
+ addi r3,r3,-4 /* Bias r3 down by four for the update-form word stores.  */
+ mr r8,r11 /* r8..r11 all hold the fill word.  */
+ mr r9,r11
+ mr r10,r11
+
+L(word8_count_loop_no_dcbt):
+ stwu r8,4(r3) /* Four word stores = 16 bytes per iteration.  */
+ stwu r9,4(r3)
+ subi r5,r5,0x10 /* count -= 16.  */
+ stwu r10,4(r3)
+ stwu r11,4(r3)
+ bdnz L(word8_count_loop_no_dcbt)
+ addi r3,r3,4 /* Undo the bias.  */
+
+L(preword2_count_loop):
+ clrlwi. r7,r5,28 /* r7 = count & 15 = trailing bytes.  */
+ beq L(end_memset) /* Nothing left.  */
+ mr r8,r11 /* Reload r8..r10; stswx reads consecutive registers starting at r8.  */
+ mr r9,r11
+ mr r10,r11
+ mtxer r7 /* XER = r7, which supplies the byte count for stswx.  */
+ stswx r8,0,r3 /* Store r7 bytes from r8.. at address r3.  */
+
+L(end_memset):
+ addi r3,r12,0 /* Return the original destination pointer.  */
+ blr
+
+L(use_dcbz): /* Reached only with r4 == 0 and count > 255, so r11 == 0.  */
+ neg r6,r3 /* r6 = -dest; its low bits give the distance to an aligned boundary.  */
+ clrlwi. r7,r6,28 /* r7 = (-dest) & 15 = bytes needed to reach 16-byte alignment.  */
+ beq L(skip_string_loop) /* Already 16-byte aligned.  */
+ mr r8,r11 /* r8..r11 hold the fill word for stswx.  */
+ mr r9,r11
+ mr r10,r11
+ subf r5,r7,r5 /* count -= r7.  */
+ mtxer r7 /* XER = r7, which supplies the byte count for stswx.  */
+ stswx r8,0,r3 /* Store the 1-15 leading bytes.  */
+ add r3,r3,r7 /* r3 is now 16-byte aligned.  */
+
+L(skip_string_loop):
+ clrlwi r8,r6,25 /* r8 = (-dest) & 127 = distance from the original dest to a 128-byte boundary.  */
+ srwi. r8,r8,4 /* r8 = number of whole 16-byte chunks (0-7) in that distance.  */
+ beq L(dcbz_pre_loop) /* Already 128-byte aligned.  */
+ mtctr r8
+
+L(word_loop):
+ stw r11,0(r3) /* Store 16 bytes per iteration until r3 is 128-byte aligned.  */
+ subi r5,r5,0x10 /* count -= 16.  */
+ stw r11,4(r3)
+ stw r11,8(r3)
+ stw r11,12(r3)
+ addi r3,r3,0x10 /* dest += 16.  */
+ bdnz L(word_loop)
+
+L(dcbz_pre_loop):
+ srwi r6,r5,7 /* r6 = count / 128; at least 1, since count > 255 on entry and at most 15+112 bytes were consumed above.  */
+ mtctr r6
+ addi r7,0,0 /* r7 = 0, the index operand for dcbz.  */
+
+L(dcbz_loop):
+ dcbz r3,r7 /* Zero the data cache block at r3+r7; the 0x80 stride below assumes 128-byte blocks.  */
+ addi r3,r3,0x80 /* dest += 128.  */
+ subi r5,r5,0x80 /* count -= 128.  */
+ bdnz L(dcbz_loop)
+ srwi. r6,r5,4 /* r6 = remaining 16-byte chunks.  */
+ beq L(postword2_count_loop) /* Fewer than 16 bytes left.  */
+ mtctr r6
+
+L(postword8_count_loop):
+ stw r11,0(r3) /* Store 16 bytes per iteration.  */
+ subi r5,r5,0x10 /* count -= 16.  */
+ stw r11,4(r3)
+ stw r11,8(r3)
+ stw r11,12(r3)
+ addi r3,r3,0x10 /* dest += 16.  */
+ bdnz L(postword8_count_loop)
+
+L(postword2_count_loop):
+ clrlwi. r7,r5,28 /* r7 = count & 15 = trailing bytes.  */
+ beq L(end_memset) /* Nothing left.  */
+ mr r8,r11 /* r8..r11 hold the fill word for stswx.  */
+ mr r9,r11
+ mr r10,r11
+ mtxer r7 /* XER = r7, which supplies the byte count for stswx.  */
+ stswx r8,0,r3 /* Store the final 1-15 bytes.  */
+ b L(end_memset) /* Restore r3 and return.  */
+END (BP_SYM (memset)) /* END is defined in an included header; presumably closes the function symbol - confirm in sysdep.h.  */
+libc_hidden_builtin_def (memset) /* Defined elsewhere in glibc; presumably creates the hidden internal alias - confirm.  */