summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Andreas Schwab <schwab@redhat.com> 2009-07-22 11:37:18 +0200
committer Andreas Schwab <schwab@redhat.com> 2009-07-22 11:37:18 +0200
commit 64322469ecb5746709e560f36dbc740c1300f978 (patch)
tree adb81205fb9862b78ed02770ea42e96f51e89561
parent 0457885b7efb5731e67202746d476c0d023bf43f (diff)
parent ae612b04cc0716186e0d14e342bee184ba94ac1b (diff)
Merge commit 'origin/master' into fedora/master
-rw-r--r-- ChangeLog 41
-rw-r--r-- locale/C-ctype.c 6
-rw-r--r-- locale/langinfo.h 1
-rw-r--r-- locale/localeinfo.h 4
-rw-r--r-- locale/programs/ld-ctype.c 27
-rw-r--r-- nptl/ChangeLog 6
-rw-r--r-- nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S 8
-rw-r--r-- nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S 31
-rw-r--r-- string/strcasestr.c 10
-rw-r--r-- string/strstr.c 9
-rw-r--r-- sysdeps/x86_64/multiarch/Makefile 4
-rw-r--r-- sysdeps/x86_64/multiarch/strcasestr-c.c 18
-rw-r--r-- sysdeps/x86_64/multiarch/strcasestr.c 3
-rw-r--r-- sysdeps/x86_64/multiarch/strstr-c.c 12
-rw-r--r-- sysdeps/x86_64/multiarch/strstr.c 464
15 files changed, 612 insertions, 32 deletions
diff --git a/ChangeLog b/ChangeLog
index 9ab7a42c63..ab6f980177 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,44 @@
+2009-07-21 Ulrich Drepper <drepper@redhat.com>
+
+ * sysdeps/x86_64/multiarch/strstr.c: Minor cleanups. Remove
+ unnecessary variables. Comment fixes.
+
+2009-07-20 Ulrich Drepper <drepper@redhat.com>
+
+ * sysdeps/x86_64/multiarch/strstr.c [USE_AS_STRCASESTR] (STRSTR_SSE42):
+ Use NONASCII_CASE information provided by the locale to determine
+ whether optimized string load function can be used. Minor cleanups.
+
+2009-07-20 H.J. Lu <hongjiu.lu@intel.com>
+
+ * string/strcasestr.c (STRCASESTR): New macro.
+ (__strcasestr): Renamed to ..
+ (STRCASESTR): ...this.
+ * string/strstr.c (STRSTR): New macro.
+ (strstr): Renamed to ..
+ (STRSTR): ...this.
+ * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
+ strstr-c strcasestr-c
+ (CFLAGS-strstr.c): New.
+ (CFLAGS-strcasestr.c): Likewise.
+ * sysdeps/x86_64/multiarch/strcasestr-c.c: New file.
+ * sysdeps/x86_64/multiarch/strcasestr.c: New file.
+ * sysdeps/x86_64/multiarch/strstr-c.c: New file.
+ * sysdeps/x86_64/multiarch/strstr.c: New file.
+
+2009-07-20 Ulrich Drepper <drepper@redhat.com>
+
+ * locale/localeinfo.h (LIMAGIC): Update value for LC_CTYPE.
+ * locale/langinfo.h: Define _NL_CTYPE_NONASCII_CASE.
+ * locale/C-ctype.c (_nl_C_LC_CTYPE): Add initializer for
+ _NL_CTYPE_NONASCII_CASE.
+ * locale/programs/ld-ctype.c (locale_ctype_t): Add nonascii_case
+ field.
+ (ctype_finish): Check whether there are any 8-bit characters outside
+ the range ASCII has or whether the mapping isn't the same as for
+ ASCII (±0x20). Set nonascii_case appropriately.
+ (ctype_output): Add output handler for nonascii_case.
+
2009-07-17 Ulrich Drepper <drepper@redhat.com>
* sysdeps/generic/sysdep.h: Define cfi_personality, cfi_lsda,
diff --git a/locale/C-ctype.c b/locale/C-ctype.c
index 85f3d2addb..420b08a13f 100644
--- a/locale/C-ctype.c
+++ b/locale/C-ctype.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 1995-2002, 2003 Free Software Foundation, Inc.
+/* Copyright (C) 1995-2002, 2003, 2009 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Ulrich Drepper <drepper@cygnus.com>, 1995.
@@ -528,7 +528,7 @@ _nl_C_LC_CTYPE_width attribute_hidden =
};
/* Number of fields with fixed meanings, starting at 0. */
-#define NR_FIXED 71
+#define NR_FIXED 72
/* Number of class fields, starting at CLASS_OFFSET. */
#define NR_CLASSES 12
/* Number of map fields, starting at MAP_OFFSET. */
@@ -667,6 +667,8 @@ const struct locale_data _nl_C_LC_CTYPE attribute_hidden =
{ .wstr = NULL },
/* _NL_CTYPE_MAP_TO_NONASCII */
{ .word = 0 },
+ /* _NL_CTYPE_NONASCII_CASE */
+ { .word = 0 },
/* NR_CLASSES wctype_tables */
{ .string = (const char *) _nl_C_LC_CTYPE_class_upper.header },
{ .string = (const char *) _nl_C_LC_CTYPE_class_lower.header },
diff --git a/locale/langinfo.h b/locale/langinfo.h
index 59017b31c8..c940c743aa 100644
--- a/locale/langinfo.h
+++ b/locale/langinfo.h
@@ -334,6 +334,7 @@ enum
_NL_CTYPE_TRANSLIT_IGNORE_LEN,
_NL_CTYPE_TRANSLIT_IGNORE,
_NL_CTYPE_MAP_TO_NONASCII,
+ _NL_CTYPE_NONASCII_CASE,
_NL_CTYPE_EXTRA_MAP_1,
_NL_CTYPE_EXTRA_MAP_2,
_NL_CTYPE_EXTRA_MAP_3,
diff --git a/locale/localeinfo.h b/locale/localeinfo.h
index 3661080bb2..19ea41ae6d 100644
--- a/locale/localeinfo.h
+++ b/locale/localeinfo.h
@@ -1,5 +1,5 @@
/* Declarations for internal libc locale interfaces
- Copyright (C) 1995-2003, 2005, 2006, 2007, 2008
+ Copyright (C) 1995-2003, 2005, 2006, 2007, 2008, 2009
Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -35,6 +35,8 @@
#define LIMAGIC(category) \
(category == LC_COLLATE \
? ((unsigned int) (0x20051014 ^ (category))) \
+ : category == LC_CTYPE \
+ ? ((unsigned int) (0x20090720 ^ (category))) \
: ((unsigned int) (0x20031115 ^ (category))))
/* Two special weight constants for the collation data. */
diff --git a/locale/programs/ld-ctype.c b/locale/programs/ld-ctype.c
index d4474bf1a2..376a02c2f0 100644
--- a/locale/programs/ld-ctype.c
+++ b/locale/programs/ld-ctype.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 1995-2006, 2007 Free Software Foundation, Inc.
+/* Copyright (C) 1995-2006, 2007, 2009 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Ulrich Drepper <drepper@gnu.org>, 1995.
@@ -181,6 +181,7 @@ struct locale_ctype_t
size_t default_missing_lineno;
uint32_t to_nonascii;
+ uint32_t nonascii_case;
/* The arrays for the binary representation. */
char_class_t *ctype_b;
@@ -625,6 +626,27 @@ character <SP> not defined in character map")));
else
ctype->class256_collection[space_seq->bytes[0]] |= BIT (tok_print);
+ /* Check whether all single-byte characters map to their upper/lowercase
+ equivalent according to the ASCII rules. */
+ for (cnt = 'A'; cnt <= 'Z'; ++cnt)
+ {
+ uint32_t uppval = ctype->map256_collection[0][cnt];
+ uint32_t lowval = ctype->map256_collection[1][cnt];
+ uint32_t lowuppval = ctype->map256_collection[0][lowval];
+ uint32_t lowlowval = ctype->map256_collection[1][lowval];
+
+ if (uppval != cnt
+ || lowval != cnt + 0x20
+ || lowuppval != cnt
+ || lowlowval != cnt + 0x20)
+ ctype->nonascii_case = 1;
+ }
+ for (cnt = 0; cnt < 256; ++cnt)
+ if (cnt < 'A' || (cnt > 'Z' && cnt < 'a') || cnt > 'z')
+ if (ctype->map256_collection[0][cnt] != cnt
+ || ctype->map256_collection[1][cnt] != cnt)
+ ctype->nonascii_case = 1;
+
/* Now that the tests are done make sure the name array contains all
characters which are handled in the WIDTH section of the
character set definition file. */
@@ -1045,6 +1067,9 @@ ctype_output (struct localedef_t *locale, const struct charmap_t *charmap,
CTYPE_DATA (_NL_CTYPE_MAP_TO_NONASCII,
&ctype->to_nonascii, sizeof (uint32_t));
+ CTYPE_DATA (_NL_CTYPE_NONASCII_CASE,
+ &ctype->nonascii_case, sizeof (uint32_t));
+
case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS_MB_LEN):
iov[2 + elem + offset].iov_base = alloca (sizeof (uint32_t));
iov[2 + elem + offset].iov_len = sizeof (uint32_t);
diff --git a/nptl/ChangeLog b/nptl/ChangeLog
index 3eded66512..1f24aa5849 100644
--- a/nptl/ChangeLog
+++ b/nptl/ChangeLog
@@ -1,3 +1,9 @@
+2009-07-20 Ulrich Drepper <drepper@redhat.com>
+
+ * sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S: Minor
+ optimizations of last changes.
+ * sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S: Likewise.
+
2009-07-19 Ulrich Drepper <drepper@redhat.com>
* sysdeps/unix/sysv/linux/x86_64/lowlevellock.h: Define
diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S
index f81466e1a5..e12790cb96 100644
--- a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S
+++ b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S
@@ -160,16 +160,14 @@ __pthread_cond_timedwait:
movq 8(%rsp), %rdi
movq %r13, %r10
+ movl $FUTEX_WAIT_BITSET, %esi
cmpq $-1, dep_mutex(%rdi)
- movl $FUTEX_WAIT_BITSET, %eax
- movl $(FUTEX_WAIT_BITSET|FUTEX_PRIVATE_FLAG), %esi
- cmove %eax, %esi
je 60f
movq dep_mutex(%rdi), %r8
/* Requeue to a PI mutex if the PI bit is set. */
testl $PI_BIT, MUTEX_KIND(%r8)
- je 60f
+ je 61f
movl $(FUTEX_WAIT_REQUEUE_PI|FUTEX_PRIVATE_FLAG), %esi
xorl %eax, %eax
@@ -191,10 +189,10 @@ __pthread_cond_timedwait:
cmpq $-4095, %rax
jnae 62f
- movl $(FUTEX_WAIT_BITSET|FUTEX_PRIVATE_FLAG), %esi
subq $cond_futex, %rdi
#endif
+61: movl $(FUTEX_WAIT_BITSET|FUTEX_PRIVATE_FLAG), %esi
60: xorl %r15d, %r15d
xorl %eax, %eax
/* The following only works like this because we only support
diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S
index e6323ea3e2..2fab38e277 100644
--- a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S
+++ b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S
@@ -128,28 +128,15 @@ __pthread_cond_wait:
movq 8(%rsp), %rdi
xorq %r10, %r10
movq %r12, %rdx
- // XXX reverse + lea
- addq $cond_futex, %rdi
- cmpq $-1, dep_mutex-cond_futex(%rdi)
-#ifdef __ASSUME_PRIVATE_FUTEX
- movl $FUTEX_WAIT, %eax
- movl $(FUTEX_WAIT|FUTEX_PRIVATE_FLAG), %esi
- cmove %eax, %esi
-#else
- movl $0, %eax
- movl %fs:PRIVATE_FUTEX, %esi
- cmove %eax, %esi
-# if FUTEX_WAIT != 0
-# error "cc destroyed by following orl"
- orl $FUTEX_WAIT, %esi
-# endif
-#endif
+ cmpq $-1, dep_mutex(%rdi)
+ leaq cond_futex(%rdi), %rdi
+ movl $FUTEX_WAIT, %esi
je 60f
movq dep_mutex-cond_futex(%rdi), %r8
/* Requeue to a PI mutex if the PI bit is set. */
testl $PI_BIT, MUTEX_KIND(%r8)
- je 60f
+ je 61f
movl $(FUTEX_WAIT_REQUEUE_PI|FUTEX_PRIVATE_FLAG), %esi
movl $SYS_futex, %eax
@@ -162,9 +149,17 @@ __pthread_cond_wait:
cmpq $-4095, %rax
jnae 62f
- movl $(FUTEX_WAIT|FUTEX_PRIVATE_FLAG), %esi
+# ifndef __ASSUME_PRIVATE_FUTEX
+ movl $FUTEX_WAIT, %esi
+# endif
#endif
+61:
+#ifdef __ASSUME_PRIVATE_FUTEX
+ movl $(FUTEX_WAIT|FUTEX_PRIVATE_FLAG), %esi
+#else
+ orl %fs:PRIVATE_FUTEX, %esi
+#endif
60: xorl %r13d, %r13d
movl $SYS_futex, %eax
syscall
diff --git a/string/strcasestr.c b/string/strcasestr.c
index 92f2eac7c8..088b5d91c7 100644
--- a/string/strcasestr.c
+++ b/string/strcasestr.c
@@ -1,5 +1,6 @@
/* Return the offset of one string within another.
- Copyright (C) 1994, 1996-2000, 2004, 2008 Free Software Foundation, Inc.
+ Copyright (C) 1994, 1996-2000, 2004, 2008, 2009
+ Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -52,11 +53,16 @@
#undef strcasestr
#undef __strcasestr
+#ifndef STRCASESTR
+#define STRCASESTR __strcasestr
+#endif
+
+
/* Find the first occurrence of NEEDLE in HAYSTACK, using
case-insensitive comparison. This function gives unspecified
results in multibyte locales. */
char *
-__strcasestr (const char *haystack_start, const char *needle_start)
+STRCASESTR (const char *haystack_start, const char *needle_start)
{
const char *haystack = haystack_start;
const char *needle = needle_start;
diff --git a/string/strstr.c b/string/strstr.c
index a9dc312992..ef45f82758 100644
--- a/string/strstr.c
+++ b/string/strstr.c
@@ -1,5 +1,6 @@
/* Return the offset of one string within another.
- Copyright (C) 1994,1996,1997,2000,2001,2003,2008 Free Software Foundation, Inc.
+ Copyright (C) 1994,1996,1997,2000,2001,2003,2008,2009
+ Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -40,11 +41,15 @@
#undef strstr
+#ifndef STRSTR
+#define STRSTR strstr
+#endif
+
/* Return the first occurrence of NEEDLE in HAYSTACK. Return HAYSTACK
if NEEDLE is empty, otherwise NULL if NEEDLE is not found in
HAYSTACK. */
char *
-strstr (const char *haystack_start, const char *needle_start)
+STRSTR (const char *haystack_start, const char *needle_start)
{
const char *haystack = haystack_start;
const char *needle = needle_start;
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 71e85f0652..5ce14aad8d 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -6,9 +6,11 @@ endif
ifeq ($(subdir),string)
sysdep_routines += stpncpy-c strncpy-c strncmp-c
ifeq (yes,$(config-cflags-sse4))
-sysdep_routines += strcspn-c strpbrk-c strspn-c
+sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c
CFLAGS-strcspn-c.c += -msse4
CFLAGS-strpbrk-c.c += -msse4
CFLAGS-strspn-c.c += -msse4
+CFLAGS-strstr.c += -msse4
+CFLAGS-strcasestr.c += -msse4
endif
endif
diff --git a/sysdeps/x86_64/multiarch/strcasestr-c.c b/sysdeps/x86_64/multiarch/strcasestr-c.c
new file mode 100644
index 0000000000..e6879531bc
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcasestr-c.c
@@ -0,0 +1,18 @@
+#include "init-arch.h"
+
+#define STRCASESTR __strcasestr_sse2
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name) \
+ __hidden_ver1 (__strcasestr_sse2, __GI_strcasestr, __strcasestr_sse2);
+
+#include "string/strcasestr.c"
+
+extern char *__strcasestr_sse42 (const char *, const char *);
+
+#if 1
+libc_ifunc (__strcasestr,
+ HAS_SSE4_2 ? __strcasestr_sse42 : __strcasestr_sse2);
+#else
+libc_ifunc (__strcasestr,
+ 0 ? __strcasestr_sse42 : __strcasestr_sse2);
+#endif
diff --git a/sysdeps/x86_64/multiarch/strcasestr.c b/sysdeps/x86_64/multiarch/strcasestr.c
new file mode 100644
index 0000000000..064e3ef4fd
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcasestr.c
@@ -0,0 +1,3 @@
+#define USE_AS_STRCASESTR
+#define STRSTR_SSE42 __strcasestr_sse42
+#include "strstr.c"
diff --git a/sysdeps/x86_64/multiarch/strstr-c.c b/sysdeps/x86_64/multiarch/strstr-c.c
new file mode 100644
index 0000000000..cff99b71ef
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strstr-c.c
@@ -0,0 +1,12 @@
+#include "init-arch.h"
+
+#define STRSTR __strstr_sse2
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name) \
+ __hidden_ver1 (__strstr_sse2, __GI_strstr, __strstr_sse2);
+
+#include "string/strstr.c"
+
+extern char *__strstr_sse42 (const char *, const char *);
+
+libc_ifunc (strstr, HAS_SSE4_2 ? __strstr_sse42 : __strstr_sse2);
diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c
new file mode 100644
index 0000000000..76d5ad16df
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strstr.c
@@ -0,0 +1,464 @@
+/* strstr with SSE4.2 intrinsics
+ Copyright (C) 2009 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <nmmintrin.h>
+
+#ifndef STRSTR_SSE42
+# define STRSTR_SSE42 __strstr_sse42
+#endif
+
+#ifdef USE_AS_STRCASESTR
+# include <ctype.h>
+# include <locale/localeinfo.h>
+
+# define LOADBYTE(C) tolower (C)
+# define CMPBYTE(C1, C2) (tolower (C1) == tolower (C2))
+#else
+# define LOADBYTE(C) (C)
+# define CMPBYTE(C1, C2) ((C1) == (C2))
+#endif
+
+/* We use 0x0c ordered-compare:
+ _SIDD_UBYTE_OPS
+ | _SIDD_CMP_EQUAL_ORDERED
+ | _SIDD_LEAST_SIGNIFICANT
+ on pcmpistri to do the scanning and string comparison requirements of
+ sub-string match. In the scanning phase, we process Cflag and ECX
+ index to locate the first fragment match; once the first fragment
+ match position has been identified, we do comparison of subsequent
+ string fragments until we can conclude false or true match; when
+ concluding a false match, we may need to repeat scanning process
+ from next relevant offset in the target string.
+
+ In the scanning phase we have 4 cases:
+ case ECX CFlag ZFlag SFlag
+ 1 16 0 0 0
+ 2a 16 0 0 1
+ 2b 16 0 1 0
+ 2c 16 0 1 1
+
+ 1. No ordered-comparison match, both 16B fragments are valid, so
+ continue to next fragment.
+ 2. No ordered-comparison match, there is EOS in either fragment,
+ 2a. Zflg = 0, Sflg = 1, we continue
+ 2b. Zflg = 1, Sflg = 0, we conclude no match and return.
+ 2c. Zflg = 1, Sflg = 1, length determines match or no match
+
+ In the string comparison phase, the 1st fragment match is fixed up
+ to produce ECX = 0. Subsequent fragment compare of nonzero index
+ and no match conclude a false match.
+
+ case ECX CFlag ZFlag SFlag
+ 3 X 1 0 0/1
+ 4a 0 1 0 0
+ 4b 0 1 0 1
+ 4c 0 < X 1 0 0/1
+ 5 16 0 1 0
+
+ 3. An initial ordered-comparison fragment match, we fix up to do
+ subsequent string comparison
+ 4a. Continuation of fragment comparison of a string compare.
+ 4b. EOS reached in the reference string, we conclude true match and
+ return
+ 4c. String compare failed if index is nonzero, we need to go back to
+ scanning
+ 5. failed string compare, go back to scanning
+ */
+
+/* Fix-up of removal of unneeded data due to 16B aligned load
+ parameters:
+ value: 16B data loaded from 16B aligned address.
+ offset: Offset of target data address relative to 16B aligned load
+ address.
+ */
+
+static __inline__ __m128i
+__m128i_shift_right (__m128i value, int offset)
+{
+ switch (offset)
+ {
+ case 1:
+ value = _mm_srli_si128 (value, 1);
+ break;
+ case 2:
+ value = _mm_srli_si128 (value, 2);
+ break;
+ case 3:
+ value = _mm_srli_si128 (value, 3);
+ break;
+ case 4:
+ value = _mm_srli_si128 (value, 4);
+ break;
+ case 5:
+ value = _mm_srli_si128 (value, 5);
+ break;
+ case 6:
+ value = _mm_srli_si128 (value, 6);
+ break;
+ case 7:
+ value = _mm_srli_si128 (value, 7);
+ break;
+ case 8:
+ value = _mm_srli_si128 (value, 8);
+ break;
+ case 9:
+ value = _mm_srli_si128 (value, 9);
+ break;
+ case 10:
+ value = _mm_srli_si128 (value, 10);
+ break;
+ case 11:
+ value = _mm_srli_si128 (value, 11);
+ break;
+ case 12:
+ value = _mm_srli_si128 (value, 12);
+ break;
+ case 13:
+ value = _mm_srli_si128 (value, 13);
+ break;
+ case 14:
+ value = _mm_srli_si128 (value, 14);
+ break;
+ case 15:
+ value = _mm_srli_si128 (value, 15);
+ break;
+ }
+ return value;
+}
+
+/* Simple replacement of movdqu to address 4KB boundary cross issue.
+ If EOS occurs within less than 16B before 4KB boundary, we don't
+ cross to next page. */
+
+static __m128i
+__attribute__ ((section (".text.sse4.2")))
+__m128i_strloadu (const unsigned char * p)
+{
+ int offset = ((size_t) p & (16 - 1));
+
+ if (offset && (int) ((size_t) p & 0xfff) > 0xff0)
+ {
+ __m128i a = _mm_load_si128 ((__m128i *) (p - offset));
+ __m128i zero = _mm_setzero_si128 ();
+ int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (a, zero));
+ if ((bmsk >> offset) != 0)
+ return __m128i_shift_right (a, offset);
+ }
+ return _mm_loadu_si128 ((__m128i *) p);
+}
+
+#ifdef USE_AS_STRCASESTR
+
+/* Similar to __m128i_strloadu. Convert to lower case for POSIX/C
+ locale. */
+
+static __m128i
+__attribute__ ((section (".text.sse4.2")))
+__m128i_strloadu_tolower_posix (const unsigned char * p)
+{
+ __m128i frag = __m128i_strloadu (p);
+
+ /* Convert frag to lower case for POSIX/C locale. */
+ __m128i rangeuc = _mm_set_epi64x (0x0, 0x5a41);
+ __m128i u2ldelta = _mm_set1_epi64x (0xe0e0e0e0e0e0e0e0);
+ __m128i mask1 = _mm_cmpistrm (rangeuc, frag, 0x44);
+ __m128i mask2 = _mm_blendv_epi8 (u2ldelta, frag, mask1);
+ mask2 = _mm_sub_epi8 (mask2, u2ldelta);
+ return _mm_blendv_epi8 (frag, mask2, mask1);
+}
+
+/* Similar to __m128i_strloadu. Convert to lower case for non-POSIX/C
+ locale. */
+
+static __m128i
+__attribute__ ((section (".text.sse4.2")))
+__m128i_strloadu_tolower (const unsigned char * p)
+{
+ union
+ {
+ char b[16];
+ __m128i x;
+ } u;
+
+ for (int i = 0; i < 16; i++)
+ if (p[i] == 0)
+ {
+ u.b[i] = 0;
+ break;
+ }
+ else
+ u.b[i] = tolower (p[i]);
+
+ return u.x;
+}
+#endif
+
+/* Calculate Knuth-Morris-Pratt string searching algorithm (or KMP
+ algorithm) overlap for a fully populated 16B vector.
+ Input parameter: 1st 16Byte loaded from the reference string of a
+ strstr function.
+ We don't use KMP algorithm if reference string is less than 16B.
+ */
+
+static int
+__inline__ __attribute__ ((__always_inline__,))
+KMP16Bovrlap (__m128i s2)
+{
+ __m128i b = _mm_unpacklo_epi8 (s2, s2);
+ __m128i a = _mm_unpacklo_epi8 (b, b);
+ a = _mm_shuffle_epi32 (a, 0);
+ b = _mm_srli_si128 (s2, sizeof (char));
+ int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (b, a));
+
+ /* _BitScanForward(&k1, bmsk); */
+ int k1;
+ __asm ("bsfl %[bmsk], %[k1]" : [k1] "=r" (k1) : [bmsk] "r" (bmsk));
+ if (!bmsk)
+ return 16;
+ else if (bmsk == 0x7fff)
+ return 1;
+ else if (!k1)
+ {
+ /* There are at least two distinct chars in s2. If bytes 0 and 1 are
+ identical and the distinct value lies farther down, we can deduce
+ that the next byte offset to restart the full compare is no earlier
+ than byte 3. */
+ return 3;
+ }
+ else
+ {
+ /* Byte 1 is not degenerated to byte 0. */
+ return k1 + 1;
+ }
+}
+
+char *
+__attribute__ ((section (".text.sse4.2")))
+STRSTR_SSE42 (const unsigned char *s1, const unsigned char *s2)
+{
+#define p1 s1
+ const unsigned char *p2 = s2;
+
+ if (p2[0] == '\0')
+ return (char *) p1;
+
+ if (p1[0] == '\0')
+ return NULL;
+
+ /* Check if p1 length is 1 byte long. */
+ if (p1[1] == '\0')
+ return p2[1] == '\0' && CMPBYTE (p1[0], p2[0]) ? (char *) p1 : NULL;
+
+#ifdef USE_AS_STRCASESTR
+ __m128i (*strloadu) (const unsigned char *);
+
+ if (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_NONASCII_CASE) == 0)
+ strloadu = __m128i_strloadu_tolower_posix;
+ else
+ strloadu = __m128i_strloadu_tolower;
+#else
+# define strloadu __m128i_strloadu
+#endif
+
+ /* p1 > 1 byte long. Load up to 16 bytes of fragment. */
+ __m128i frag1 = strloadu (p1);
+
+ __m128i frag2;
+ if (p2[1] != '\0')
+ /* p2 is > 1 byte long. */
+ frag2 = strloadu (p2);
+ else
+ frag2 = _mm_insert_epi8 (_mm_setzero_si128 (), LOADBYTE (p2[0]), 0);
+
+ /* Unsigned bytes, equal order, does frag2 have a null? */
+ int cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
+ int cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
+ int cmp = _mm_cmpistri (frag2, frag1, 0x0c);
+ int cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
+ if (cmp_s & cmp_c)
+ {
+ int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (frag2,
+ _mm_setzero_si128 ()));
+ int len;
+ __asm ("bsfl %[bmsk], %[len]"
+ : [len] "=r" (len) : [bmsk] "r" (bmsk));
+ p1 += cmp;
+ if ((len + cmp) <= 16)
+ return (char *) p1;
+
+ /* Load up to 16 bytes of fragment. */
+ frag1 = strloadu (p1);
+ cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
+ cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
+ cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
+ cmp = _mm_cmpistri (frag2, frag1, 0x0c);
+ if ((len + cmp) <= 16)
+ return (char *) p1 + cmp;
+ }
+
+ if (cmp_s)
+ {
+ /* Adjust addr for 16B alignment in ensuing loop. */
+ while (!cmp_z)
+ {
+ p1 += cmp;
+ /* Load up to 16 bytes of fragment. */
+ frag1 = strloadu (p1);
+ cmp = _mm_cmpistri (frag2, frag1, 0x0c);
+ cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
+ cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
+ /* Because s2 < 16 bytes and we adjusted p1 by non-zero cmp
+ once already, this time cmp will be zero and we can exit. */
+ if ((!cmp) & cmp_c)
+ break;
+ }
+
+ if (!cmp_c)
+ return NULL;
+
+ /* Since s2 is less than 16 bytes, cmp_c is a definitive
+ determination of a full match. */
+ return (char *) p1 + cmp;
+ }
+
+ /* General case, s2 is at least 16 bytes or more.
+ First, the common case of false-match at first byte of p2. */
+ const unsigned char *pt = NULL;
+ int kmp_fwd = 0;
+re_trace:
+ while (!cmp_c)
+ {
+ /* frag1 has null. */
+ if (cmp_z)
+ return NULL;
+
+ /* frag 1 has no null, advance 16 bytes. */
+ p1 += 16;
+ /* Load up to 16 bytes of fragment. */
+ frag1 = strloadu (p1);
+ /* Unsigned bytes, equal order, is there a partial match? */
+ cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
+ cmp = _mm_cmpistri (frag2, frag1, 0x0c);
+ cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
+ }
+
+ /* Next, handle initial positive match as first byte of p2. We have
+ a partial fragment match, make full determination until we reached
+ end of s2. */
+ if (!cmp)
+ {
+ if (cmp_z)
+ return (char *) p1;
+
+ pt = p1;
+ p1 += 16;
+ p2 += 16;
+ /* Load up to 16 bytes of fragment. */
+ frag2 = strloadu (p2);
+ }
+ else
+ {
+ /* Adjust 16B alignment. */
+ p1 += cmp;
+ pt = p1;
+ }
+
+ /* Load up to 16 bytes of fragment. */
+ frag1 = strloadu (p1);
+
+ /* Unsigned bytes, equal order, does frag2 have a null? */
+ cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
+ cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
+ cmp = _mm_cmpistri (frag2, frag1, 0x0c);
+ cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
+ while (!(cmp | cmp_z | cmp_s))
+ {
+ p1 += 16;
+ p2 += 16;
+ /* Load up to 16 bytes of fragment. */
+ frag2 = strloadu (p2);
+ /* Load up to 16 bytes of fragment. */
+ frag1 = strloadu (p1);
+ /* Unsigned bytes, equal order, does frag2 have a null? */
+ cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
+ cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
+ cmp = _mm_cmpistri (frag2, frag1, 0x0c);
+ cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
+ }
+
+ /* Full determination yielded a false result, retrace s1 to next
+ starting position.
+ Zflg 1 0 1 0/1
+ Sflg 0 1 1 0/1
+ cmp na 0 0 >0
+ action done done continue continue if s2 < s1
+ false match retrace s1 else false
+ */
+
+ if (cmp_s & !cmp)
+ return (char *) pt;
+ if (cmp_z)
+ {
+ if (!cmp_s)
+ return NULL;
+
+ /* Handle both zero and sign flag set and s1 is shorter in
+ length. */
+ __m128i zero = _mm_setzero_si128 ();
+ int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag2));
+ int bmsk1 = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag1));
+ int len;
+ int len1;
+ __asm ("bsfl %[bmsk], %[len]"
+ : [len] "=r" (len) : [bmsk] "r" (bmsk));
+ __asm ("bsfl %[bmsk1], %[len1]"
+ : [len1] "=r" (len1) : [bmsk1] "r" (bmsk1));
+ if (len >= len1)
+ return NULL;
+ }
+ else if (!cmp)
+ return (char *) pt;
+
+ /* Otherwise, we have to retrace and continue. Default of multiple
+ paths that need to retrace from next byte in s1. */
+ p2 = s2;
+ frag2 = strloadu (p2);
+
+ if (!kmp_fwd)
+ kmp_fwd = KMP16Bovrlap (frag2);
+
+ /* KMP algorithm predicted overlap needs to be corrected for
+ partial fragment compare. */
+ p1 = pt + (kmp_fwd > cmp ? cmp : kmp_fwd);
+
+ /* Since s2 is at least 16 bytes long, we're certain there is no
+ match. */
+ if (p1[0] == '\0')
+ return NULL;
+
+ /* Load up to 16 bytes of fragment. */
+ frag1 = strloadu (p1);
+
+ /* Unsigned bytes, equal order, is there a partial match? */
+ cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
+ cmp = _mm_cmpistri (frag2, frag1, 0x0c);
+ cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
+ goto re_trace;
+}