summary | refs | log | tree | commit | diff
path: root/sysdeps
diff options
context:
space:
mode:
author David S. Miller <davem@davemloft.net> 2010-03-09 06:42:53 -0800
committer David S. Miller <davem@davemloft.net> 2010-03-09 06:42:53 -0800
commit 462a5227b0d3220ab68f65272bd5b9d6d4f49b1f (patch)
tree c294d0568ec8d2d65520227423def46b215ab297 /sysdeps
parent 2fe000dfd673859eb3b1e1c9739de66445d9fe08 (diff)
sparc: Optimize strlen using techniques from powerpc implementation.
Diffstat (limited to 'sysdeps')
-rw-r--r-- sysdeps/sparc/sparc32/sparcv9/strlen.S | 3
-rw-r--r-- sysdeps/sparc/sparc32/strlen.S | 128
-rw-r--r-- sysdeps/sparc/sparc64/strlen.S | 210
3 files changed, 110 insertions, 231 deletions
diff --git a/sysdeps/sparc/sparc32/sparcv9/strlen.S b/sysdeps/sparc/sparc32/sparcv9/strlen.S
index b8f4dba4f4..28a216c076 100644
--- a/sysdeps/sparc/sparc32/sparcv9/strlen.S
+++ b/sysdeps/sparc/sparc32/sparcv9/strlen.S
@@ -1,4 +1 @@
-#define ASI_PNF 0x82
-#define ASI_BLK_P 0xf0
-#define XCC icc
#include <sparc64/strlen.S>
diff --git a/sysdeps/sparc/sparc32/strlen.S b/sysdeps/sparc/sparc32/strlen.S
index ed92f20e28..2945bb5484 100644
--- a/sysdeps/sparc/sparc32/strlen.S
+++ b/sysdeps/sparc/sparc32/strlen.S
@@ -1,8 +1,9 @@
/* Determine the length of a string.
For SPARC v7.
- Copyright (C) 1996, 1999, 2003 Free Software Foundation, Inc.
+ Copyright (C) 1996, 1999, 2003, 2010 Free Software Foundation, Inc.
This file is part of the GNU C Library.
- Contributed by Jakub Jelinek <jj@ultra.linux.cz>.
+ Contributed by Jakub Jelinek <jj@ultra.linux.cz> and
+ David S. Miller <davem@davemloft.net>.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
@@ -21,86 +22,55 @@
#include <sysdep.h>
- /* Normally, this uses ((xword - 0x01010101) & 0x80808080) test
- to find out if any byte in xword could be zero. This is fast, but
- also gives false alarm for any byte in range 0x81-0xff. It does
- not matter for correctness, as if this test tells us there could
- be some zero byte, we check it byte by byte, but if bytes with
- high bits set are common in the strings, then this will give poor
- performance. You can #define EIGHTBIT_NOT_RARE and the algorithm
- will use one tick slower, but more precise test
- ((xword - 0x01010101) & (~xword) & 0x80808080),
- which does not give any false alarms (but if some bits are set,
- one cannot assume from it which bytes are zero and which are not).
- It is yet to be measured, what is the correct default for glibc
- in these days for an average user.
- */
-
.text
.align 4
ENTRY(strlen)
- mov %o0, %o1
- andcc %o0, 3, %g0
- be 20f
- sethi %hi(0x80808080), %o4
-
- ldub [%o0], %o5
- cmp %o5, 0
- be 21f
- add %o0, 1, %o0
- andcc %o0, 3, %g0
- be 4f
- or %o4, %lo(0x80808080), %o3
- ldub [%o0], %o5
- cmp %o5, 0
- be 22f
- add %o0, 1, %o0
- andcc %o0, 3, %g0
- be 5f
- sethi %hi(0x01010101), %o4
- ldub [%o0], %o5
- cmp %o5, 0
- be 23f
- add %o0, 1, %o0
- b 11f
- or %o4, %lo(0x01010101), %o2
-21: retl
- mov 0, %o0
-22: retl
- mov 1, %o0
-23: retl
- mov 2, %o0
-
-20: or %o4, %lo(0x80808080), %o3
-4: sethi %hi(0x01010101), %o4
-5: or %o4, %lo(0x01010101), %o2
-11: ld [%o0], %o5
-12: sub %o5, %o2, %o4
-#ifdef EIGHTBIT_NOT_RARE
- andn %o4, %o5, %o4
-#endif
- andcc %o4, %o3, %g0
- be 11b
- add %o0, 4, %o0
-
- srl %o5, 24, %g5
- andcc %g5, 0xff, %g0
- be 13f
- add %o0, -4, %o4
- srl %o5, 16, %g5
- andcc %g5, 0xff, %g0
- be 13f
- add %o4, 1, %o4
- srl %o5, 8, %g5
- andcc %g5, 0xff, %g0
- be 13f
- add %o4, 1, %o4
- andcc %o5, 0xff, %g0
- bne,a 12b
- ld [%o0], %o5
- add %o4, 1, %o4
-13: retl
- sub %o4, %o1, %o0
+ mov %o0, %o1 /* strlen: %o0 holds the string pointer on entry; keep the original in %o1 for the final subtraction.  */
+ andn %o0, 0x3, %o0 /* Round %o0 down to a 4-byte boundary.  */
+ /* Fetch the aligned word holding the first string byte and build the constants.  */
+ ld [%o0], %o5 /* %o5 = aligned word that contains the first string byte.  */
+ and %o1, 0x3, %g1 /* %g1 = byte offset of the string start inside that word (0..3).  */
+ mov -1, %g5 /* %g5 = all ones, the source of the lead-in mask.  */
+
+ sethi %hi(0x01010101), %o2 /* High part of 0x01010101; the `or' below supplies the low part.  */
+ sll %g1, 3, %g1 /* Turn the byte offset into a bit count (offset * 8).  */
+
+ or %o2, %lo(0x01010101), %o2 /* %o2 = 0x01010101.  */
+ srl %g5, %g1, %g2 /* %g2 = all ones shifted right by that bit count, i.e. the top bits are clear.  */
+
+ orn %o5, %g2, %o5 /* %o5 |= ~%g2: set the top bits so the bytes ahead of the string start are non-zero.  */
+ sll %o2, 7, %o3 /* %o3 = 0x01010101 << 7 = 0x80808080.  */
+10: add %o0, 4, %o0 /* Loop top: step %o0 past the word currently held in %o5.  */
+ /* Test (word - 0x01010101) & ~word & 0x80808080, non-zero when the word has a zero byte.  */
+ andn %o3, %o5, %g1 /* %g1 = 0x80808080 & ~word.  */
+ sub %o5, %o2, %g2 /* %g2 = word - 0x01010101.  */
+
+ andcc %g1, %g2, %g0 /* Set the condition codes from the combined test; the result itself is discarded.  */
+ be,a 10b /* Result zero: no zero byte in this word, go around again.  */
+ ld [%o0], %o5 /* Delay slot, annulled when the branch is not taken: load the next word.  */
+ /* A zero byte is in %o5; probe the bytes from the most significant one down.  */
+ srl %o5, 24, %g1 /* %g1 = byte 0 (bits 31..24).  */
+
+ andcc %g1, 0xff, %g0 /* Is byte 0 zero?  */
+ be 90f /* Yes: the terminator is at offset 0 of the word.  */
+ sub %o0, 4, %o0 /* Delay slot, always executed: undo the loop's +4 so %o0 addresses the word in %o5.  */
+
+ srl %o5, 16, %g2 /* Bring byte 1 (bits 23..16) down to the low byte of %g2.  */
+
+ andcc %g2, 0xff, %g0 /* Is byte 1 zero?  */
+ be 90f /* Yes: the terminator is at offset 1.  */
+ add %o0, 1, %o0 /* Delay slot, always executed: %o0 = word address + 1.  */
+
+ srl %o5, 8, %g1 /* Bring byte 2 (bits 15..8) down to the low byte of %g1.  */
+
+ andcc %g1, 0xff, %g0 /* Is byte 2 zero?  */
+ be 90f /* Yes: the terminator is at offset 2.  */
+ add %o0, 1, %o0 /* Delay slot, always executed: %o0 = word address + 2.  */
+
+ add %o0, 1, %o0 /* Bytes 0..2 are non-zero, so the terminator is byte 3: %o0 = word address + 3.  */
+
+90: retl /* Return to the caller.  */
+ sub %o0, %o1, %o0 /* Delay slot: result = terminator address - original pointer.  */
END(strlen)
libc_hidden_builtin_def (strlen)
diff --git a/sysdeps/sparc/sparc64/strlen.S b/sysdeps/sparc/sparc64/strlen.S
index cc15e4e3fb..64350fb05e 100644
--- a/sysdeps/sparc/sparc64/strlen.S
+++ b/sysdeps/sparc/sparc64/strlen.S
@@ -1,8 +1,9 @@
/* Determine the length of a string. For SPARC v9.
- Copyright (C) 1998, 1999, 2003 Free Software Foundation, Inc.
+ Copyright (C) 1998, 1999, 2003, 2010 Free Software Foundation, Inc.
This file is part of the GNU C Library.
- Contributed by Jan Vondrak <jvon4518@ss1000.ms.mff.cuni.cz> and
- Jakub Jelinek <jj@ultra.linux.cz>.
+ Contributed by Jan Vondrak <jvon4518@ss1000.ms.mff.cuni.cz>,
+ Jakub Jelinek <jj@ultra.linux.cz>, and
+ David S. Miller <davem@davemloft.net>.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
@@ -20,155 +21,66 @@
02111-1307 USA. */
#include <sysdep.h>
-#include <asm/asi.h>
-
- /* Normally, this uses
- ((xword - 0x0101010101010101) & 0x8080808080808080) test
- to find out if any byte in xword could be zero. This is fast, but
- also gives false alarm for any byte in range 0x81-0xff. It does
- not matter for correctness, as if this test tells us there could
- be some zero byte, we check it byte by byte, but if bytes with
- high bits set are common in the strings, then this will give poor
- performance. You can #define EIGHTBIT_NOT_RARE and the algorithm
- will use one tick slower, but more precise test
- ((xword - 0x0101010101010101) & (~xword) & 0x8080808080808080),
- which does not give any false alarms (but if some bits are set,
- one cannot assume from it which bytes are zero and which are not).
- It is yet to be measured, what is the correct default for glibc
- in these days for an average user.
- */
+
+ .register %g2, #scratch
+ .register %g3, #scratch
.text
.align 32
ENTRY(strlen)
- sethi %hi(0x01010101), %g1 /* IEU0 Group */
- ldub [%o0], %o3 /* Load */
- or %g1, %lo(0x01010101), %g1 /* IEU0 Group */
- mov %o0, %o1 /* IEU1 */
-
- sllx %g1, 32, %g4 /* IEU0 Group */
- andcc %o0, 7, %g0 /* IEU1 */
- or %g1, %g4, %g1 /* IEU0 Group */
- brz,pn %o3, 13f /* CTI+IEU1 */
-
- sllx %g1, 7, %g4 /* IEU0 Group */
- bne,a,pn %icc, 15f /* CTI */
- add %o0, 1, %o0 /* IEU1 */
- /* %g1 = 0x0101010101010101 *
- * %g4 = 0x8080808080808080 *
- * %o0 = string pointer *
- * %o1 = start of string */
-1: ldx [%o0], %o3 /* Load Group */
-
- add %o0, 8, %o0 /* IEU1 */
-2: sub %o3, %g1, %o2 /* IEU0 Group */
-#ifdef EIGHTBIT_NOT_RARE
- andn %o2, %o3, %o5 /* IEU0 Group */
- ldxa [%o0] ASI_PNF, %o3 /* Load */
- andcc %o5, %g4, %g0 /* IEU1 Group */
-#else
- ldxa [%o0] ASI_PNF, %o3 /* Load */
- andcc %o2, %g4, %g0 /* IEU1 Group */
-#endif
-
- be,pt %xcc, 2b /* CTI */
- add %o0, 8, %o0 /* IEU0 */
- addcc %o2, %g1, %g5 /* IEU1 Group */
-#ifdef EIGHTBIT_NOT_RARE
- srlx %o5, 32, %o5 /* IEU0 */
-
-3: andcc %o5, %g4, %g0 /* IEU1 Group */
-#else
- srlx %o2, 32, %o2 /* IEU0 */
-
-3: andcc %o2, %g4, %g0 /* IEU1 Group */
-#endif
- be,pn %xcc, 4f /* CTI */
- srlx %g5, 56, %o2 /* IEU0 */
- andcc %o2, 0xff, %g0 /* IEU1 Group */
-
- be,pn %icc, 12f /* CTI */
- srlx %g5, 48, %o2 /* IEU0 */
- andcc %o2, 0xff, %g0 /* IEU1 Group */
- be,pn %icc, 11f /* CTI */
-
- srlx %g5, 40, %o2 /* IEU0 */
- andcc %o2, 0xff, %g0 /* IEU1 Group */
- be,pn %icc, 10f /* CTI */
- srlx %g5, 32, %o2 /* IEU0 */
-
- andcc %o2, 0xff, %g0 /* IEU1 Group */
- be,pn %icc, 9f /* CTI */
-4: srlx %g5, 24, %o2 /* IEU0 */
- andcc %o2, 0xff, %g0 /* IEU1 Group */
-
- be,pn %icc, 8f /* CTI */
- srlx %g5, 16, %o2 /* IEU0 */
- andcc %o2, 0xff, %g0 /* IEU1 Group */
- be,pn %icc, 7f /* CTI */
-
- srlx %g5, 8, %o2 /* IEU0 */
- andcc %o2, 0xff, %g0 /* IEU1 Group */
- be,pn %icc, 6f /* CTI */
- sub %o3, %g1, %o2 /* IEU0 */
-
- andcc %g5, 0xff, %g0 /* IEU1 Group */
- be,pn %icc, 5f /* CTI */
- ldxa [%o0] ASI_PNF, %o3 /* Load */
- andcc %o2, %g4, %g0 /* IEU1 Group */
-
- be,pt %xcc, 2b /* CTI */
- add %o0, 8, %o0 /* IEU0 */
- addcc %o2, %g1, %g5 /* IEU1 Group */
- ba,pt %xcc, 3b /* CTI */
-
- srlx %o2, 32, %o2 /* IEU0 */
-5: add %o0, -9, %o0 /* IEU0 Group */
- retl /* CTI+IEU1 Group */
- sub %o0, %o1, %o0 /* IEU0 */
-
-6: add %o0, -10, %o0 /* IEU0 Group */
- retl /* CTI+IEU1 Group */
- sub %o0, %o1, %o0 /* IEU0 */
-7: add %o0, -11, %o0 /* IEU0 Group */
-
- retl /* CTI+IEU1 Group */
- sub %o0, %o1, %o0 /* IEU0 */
-8: add %o0, -12, %o0 /* IEU0 Group */
- retl /* CTI+IEU1 Group */
-
- sub %o0, %o1, %o0 /* IEU0 */
-9: add %o0, -13, %o0 /* IEU0 Group */
- retl /* CTI+IEU1 Group */
- sub %o0, %o1, %o0 /* IEU0 */
-
-10: add %o0, -14, %o0 /* IEU0 Group */
- retl /* CTI+IEU1 Group */
- sub %o0, %o1, %o0 /* IEU0 */
-11: add %o0, -15, %o0 /* IEU0 Group */
-
- retl /* CTI+IEU1 Group */
- sub %o0, %o1, %o0 /* IEU0 */
-12: add %o0, -16, %o0 /* IEU0 Group */
- retl /* CTI+IEU1 Group */
-
- sub %o0, %o1, %o0 /* IEU0 */
-13: retl /* CTI+IEU1 Group */
- mov 0, %o0 /* IEU0 */
- nop
-
-15: ldub [%o0], %o3 /* Load Group */
-16: andcc %o0, 7, %g0 /* IEU1 */
- be,pn %icc, 1b /* CTI */
- nop /* IEU0 Group */
-
- add %o0, 1, %o0 /* IEU1 */
- andcc %o3, 0xff, %g0 /* IEU1 Group */
- bne,a,pt %icc, 16b /* CTI */
- lduba [%o0] ASI_PNF, %o3 /* Load */
-
- add %o0, -1, %o0 /* IEU0 Group */
- retl /* CTI+IEU1 Group */
- sub %o0, %o1, %o0 /* IEU0 */
+ mov %o0, %o1 /* strlen: %o0 holds the string pointer on entry; keep the original in %o1 for the final subtraction.  */
+ andn %o0, 0x7, %o0 /* Round %o0 down to an 8-byte boundary.  */
+ /* Fetch the aligned doubleword holding the first string byte and build the constants.  */
+ ldx [%o0], %o5 /* %o5 = aligned doubleword that contains the first string byte.  */
+ and %o1, 0x7, %g1 /* %g1 = byte offset of the string start inside that doubleword (0..7).  */
+ mov -1, %g5 /* %g5 = all ones, the source of the lead-in mask.  */
+
+ sethi %hi(0x01010101), %o2 /* High part of 0x01010101; the `or' below supplies the low part.  */
+ sll %g1, 3, %g1 /* Turn the byte offset into a bit count (offset * 8).  */
+
+ or %o2, %lo(0x01010101), %o2 /* %o2 = 0x01010101.  */
+ srlx %g5, %g1, %o3 /* %o3 = all ones shifted right by that bit count, i.e. the top bits are clear.  */
+
+ sllx %o2, 32, %g1 /* %g1 = 0x01010101 << 32.  */
+ sethi %hi(0x0000ff00), %g5 /* Start building the 0xff00 mask in %g5 (all-ones value no longer needed); completed by the later `or'.  */
+
+ orn %o5, %o3, %o5 /* %o5 |= ~%o3: set the top bits so the bytes ahead of the string start are non-zero.  */
+ or %o2, %g1, %o2 /* %o2 = 0x0101010101010101.  */
+
+ sllx %o2, 7, %o3 /* %o3 = %o2 << 7 = 0x8080808080808080.  */
+10: add %o0, 8, %o0 /* Loop top: step %o0 past the doubleword currently held in %o5.  */
+ /* Test (x - 0x01...01) & ~x & 0x80...80, non-zero when the doubleword has a zero byte.  */
+ andn %o3, %o5, %g1 /* %g1 = 0x80...80 & ~x.  */
+ sub %o5, %o2, %g2 /* %g2 = x - 0x01...01.  */
+
+ andcc %g1, %g2, %g0 /* Set the condition codes from the combined test; the result itself is discarded.  */
+ be,a,pt %xcc, 10b /* 64-bit result zero: no zero byte here, go around again.  */
+ ldx [%o0], %o5 /* Delay slot, annulled when the branch is not taken: load the next doubleword.  */
+ srlx %o5, 32, %g1 /* Zero byte found: %g1 = upper 32 bits of %o5 (its first four bytes).  */
+ /* Repeat the test on the upper half only, to learn which half holds the terminator.  */
+ andn %o3, %g1, %o4 /* %o4 = 0x80...80 & ~upper half.  */
+ sub %g1, %o2, %g2 /* %g2 = upper half - 0x01...01; only its low 32 bits are examined below.  */
+
+ add %o0, 4, %g3 /* %g3 = %o0 + 4, the candidate pointer if the terminator is in the lower half.  */
+ andcc %o4, %g2, %g0 /* %icc reflects the low 32 bits of the test on the upper half.  */
+ movne %icc, %g1, %o5 /* Upper half has a zero byte: examine it (now in the low 32 bits of %o5).  */
+
+ move %icc, %g3, %o0 /* Otherwise the zero byte is in the lower half: advance %o0 by 4, %o5 keeps the lower half in its low 32 bits.  */
+ or %g5, %lo(0x0000ff00), %g5 /* %g5 = 0x0000ff00, mask for byte 2 of the selected word.  */
+ mov 3 - 8, %g2 /* Default adjustment: terminator is byte 3; -8 cancels the loop's +8 on %o0.  */
+ /* Later conditional moves override earlier ones, so the lowest-numbered zero byte wins.  */
+ andcc %o5, %g5, %g0 /* Is byte 2 (bits 15..8) zero?  */
+ srlx %o5, 16, %g1 /* Bring byte 1 (bits 23..16) down to the low byte of %g1; flags are unchanged.  */
+ move %icc, 2 - 8, %g2 /* Byte 2 is zero: adjustment = 2 - 8.  */
+
+ andcc %g1, 0xff, %g0 /* Is byte 1 zero?  */
+ srl %o5, 24, %o5 /* %o5 = byte 0 (bits 31..24) of the selected word; the 32-bit shift drops anything above bit 31.  */
+ move %icc, 1 - 8, %g2 /* Byte 1 is zero: adjustment = 1 - 8.  */
+
+ movrz %o5, 0 - 8, %g2 /* Byte 0 is zero: adjustment = 0 - 8.  */
+ sub %o0, %o1, %o0 /* %o0 = advanced pointer - original pointer.  */
+
+ retl /* Return to the caller.  */
+ add %o0, %g2, %o0 /* Delay slot: apply the byte adjustment to get the length.  */
END(strlen)
libc_hidden_builtin_def (strlen)