From d28797e426aa7e4e380a7ae10faf4aa3c4767e0b Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Thu, 23 Jul 2009 13:15:17 -0700 Subject: Perform test for Atom x86-64 in central place and handle it. There will be more than one function which, in multiarch mode, wants to use SSSE3. We should not test in each of them for Atoms with slow SSSE3. Instead, disable the SSSE3 bit in the startup code for such machines. --- sysdeps/x86_64/multiarch/init-arch.c | 8 +++++++- sysdeps/x86_64/multiarch/strcpy.S | 13 +++---------- 2 files changed, 10 insertions(+), 11 deletions(-) (limited to 'sysdeps') diff --git a/sysdeps/x86_64/multiarch/init-arch.c b/sysdeps/x86_64/multiarch/init-arch.c index 29e687344d..35fd19af0e 100644 --- a/sysdeps/x86_64/multiarch/init-arch.c +++ b/sysdeps/x86_64/multiarch/init-arch.c @@ -68,7 +68,13 @@ __init_cpu_features (void) __cpu_features.model += extended_model; } else if (__cpu_features.family == 0x06) - __cpu_features.model += extended_model; + { + __cpu_features.model += extended_model; + + if (__cpu_features.model == 0x1c) + /* Avoid SSSE3 on Atom since it is slow. */ + __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx &= ~(1 << 9); + } } /* This spells out "AuthenticAMD". */ else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65) diff --git a/sysdeps/x86_64/multiarch/strcpy.S b/sysdeps/x86_64/multiarch/strcpy.S index 25cd01307d..7e400a9140 100644 --- a/sysdeps/x86_64/multiarch/strcpy.S +++ b/sysdeps/x86_64/multiarch/strcpy.S @@ -64,16 +64,9 @@ ENTRY(STRCPY) call __init_cpu_features 1: leaq STRCPY_SSE2(%rip), %rax testl $(1<<9), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip) - jz 3f -/* Avoid SSSE3 strcpy on Atom since it is slow. 
*/ - cmpl $1, __cpu_features+KIND_OFFSET(%rip) - jne 2f - cmpl $6, __cpu_features+FAMILY_OFFSET(%rip) - jne 2f - cmpl $28, __cpu_features+MODEL_OFFSET(%rip) - jz 3f -2: leaq STRCPY_SSSE3(%rip), %rax -3: ret + jz 2f + leaq STRCPY_SSSE3(%rip), %rax +2: ret END(STRCPY) .section .text.ssse3,"ax",@progbits -- cgit v1.2.3 From 3e9099b4f6666cd05b62d2829f65161daddb151b Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Thu, 23 Jul 2009 13:42:46 -0700 Subject: Add more cache descriptors for L3 caches on x86 and x86-64. The most recent AP 485 describes a few more cache descriptors for L3 caches with 24-way associativity. --- ChangeLog | 4 ++++ sysdeps/unix/sysv/linux/i386/sysconf.c | 3 +++ sysdeps/x86_64/cacheinfo.c | 3 +++ 3 files changed, 10 insertions(+) (limited to 'sysdeps') diff --git a/ChangeLog b/ChangeLog index 1498e4a3c0..29e4468ce4 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,9 @@ 2009-07-23 Ulrich Drepper + * sysdeps/unix/sysv/linux/i386/sysconf.c (intel_02_known): Add more + cache descriptors. + * sysdeps/x86_64/cacheinfo.c (intel_02_known): Likewise. + * sysdeps/x86_64/multiarch/init-arch.c (__init_cpu_features): Reset SSSE3 bit for Atoms. 
* sysdeps/x86_64/multiarch/strcpy.S: No need to perform Atom test diff --git a/sysdeps/unix/sysv/linux/i386/sysconf.c b/sysdeps/unix/sysv/linux/i386/sysconf.c index efe1a639cd..ff3cf9f7c7 100644 --- a/sysdeps/unix/sysv/linux/i386/sysconf.c +++ b/sysdeps/unix/sysv/linux/i386/sysconf.c @@ -138,6 +138,9 @@ static const struct intel_02_cache_info { 0xe3, 16, 64, M(_SC_LEVEL3_CACHE_SIZE), 2097152 }, { 0xe3, 16, 64, M(_SC_LEVEL3_CACHE_SIZE), 4194304 }, { 0xe4, 16, 64, M(_SC_LEVEL3_CACHE_SIZE), 8388608 }, + { 0xea, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 12582912 }, + { 0xeb, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 18874368 }, + { 0xec, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 25165824 }, }; #define nintel_02_known (sizeof (intel_02_known) / sizeof (intel_02_known[0])) diff --git a/sysdeps/x86_64/cacheinfo.c b/sysdeps/x86_64/cacheinfo.c index 362687c181..07939099b9 100644 --- a/sysdeps/x86_64/cacheinfo.c +++ b/sysdeps/x86_64/cacheinfo.c @@ -100,6 +100,9 @@ static const struct intel_02_cache_info { 0xe3, 16, 64, M(_SC_LEVEL3_CACHE_SIZE), 2097152 }, { 0xe3, 16, 64, M(_SC_LEVEL3_CACHE_SIZE), 4194304 }, { 0xe4, 16, 64, M(_SC_LEVEL3_CACHE_SIZE), 8388608 }, + { 0xea, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 12582912 }, + { 0xeb, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 18874368 }, + { 0xec, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 25165824 }, }; #define nintel_02_known (sizeof (intel_02_known) / sizeof (intel_02_known [0])) -- cgit v1.2.3 From b2509a1e380bc92ee6ae6437103d349e1f517773 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Thu, 23 Jul 2009 14:03:53 -0700 Subject: Avoid cpuid instructions in cache info discovery. When multiarch is enabled we have this information stored. Use it. 
--- ChangeLog | 4 ++++ sysdeps/x86_64/cacheinfo.c | 50 ++++++++++++++++++++++++++++------------------ 2 files changed, 35 insertions(+), 19 deletions(-) (limited to 'sysdeps') diff --git a/ChangeLog b/ChangeLog index 29e4468ce4..f0a6675016 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,9 @@ 2009-07-23 Ulrich Drepper + * sysdeps/x86_64/cacheinfo.c [USE_MULTIARCH]: Rearrange code to + avoid additional cpuid instructions. Most of the information is + stored somewhere. + * sysdeps/unix/sysv/linux/i386/sysconf.c (intel_02_known): Add more cache descriptors. * sysdeps/x86_64/cacheinfo.c (intel_02_known): Likewise. diff --git a/sysdeps/x86_64/cacheinfo.c b/sysdeps/x86_64/cacheinfo.c index 07939099b9..75b81958dd 100644 --- a/sysdeps/x86_64/cacheinfo.c +++ b/sysdeps/x86_64/cacheinfo.c @@ -25,6 +25,17 @@ #ifdef USE_MULTIARCH # include "multiarch/init-arch.h" + +# define is_intel __cpu_features.kind == arch_kind_intel +# define is_amd __cpu_features.kind == arch_kind_amd +# define max_cpuid __cpu_features.max_cpuid +#else + /* This spells out "GenuineIntel". */ +# define is_intel \ + ebx == 0x756e6547 && ecx == 0x6c65746e && edx == 0x49656e69 + /* This spells out "AuthenticAMD". */ +# define is_amd \ + ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65 #endif static const struct intel_02_cache_info @@ -155,6 +166,12 @@ intel_check_word (int name, unsigned int value, bool *has_level_2, /* Intel reused this value. For family 15, model 6 it specifies the 3rd level cache. Otherwise the 2nd level cache. 
*/ + unsigned int family; + unsigned int model; +#ifdef USE_MULTIARCH + family = __cpu_features.family; + model = __cpu_features.model; +#else unsigned int eax; unsigned int ebx; unsigned int ecx; @@ -163,9 +180,10 @@ intel_check_word (int name, unsigned int value, bool *has_level_2, : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) : "0" (1)); - unsigned int family = ((eax >> 20) & 0xff) + ((eax >> 8) & 0xf); - unsigned int model = ((((eax >>16) & 0xf) << 4) - + ((eax >> 4) & 0xf)); + family = ((eax >> 20) & 0xff) + ((eax >> 8) & 0xf); + model = (((eax >>16) & 0xf) << 4) + ((eax >> 4) & 0xf); +#endif + if (family == 15 && model == 6) { /* The level 3 cache is encoded for this model like @@ -397,21 +415,24 @@ long int attribute_hidden __cache_sysconf (int name) { +#ifdef USE_MULTIARCH + if (__cpu_features.kind == arch_kind_unknown) + __init_cpu_features (); +#else /* Find out what brand of processor. */ - unsigned int eax; + unsigned int max_cpuid; unsigned int ebx; unsigned int ecx; unsigned int edx; asm volatile ("cpuid" - : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) + : "=a" (max_cpuid), "=b" (ebx), "=c" (ecx), "=d" (edx) : "0" (0)); +#endif - /* This spells out "GenuineIntel". */ - if (ebx == 0x756e6547 && ecx == 0x6c65746e && edx == 0x49656e69) - return handle_intel (name, eax); + if (is_intel) + return handle_intel (name, max_cpuid); - /* This spells out "AuthenticAMD". */ - if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65) + if (is_amd) return handle_amd (name); // XXX Fill in more vendors. @@ -460,20 +481,11 @@ init_cacheinfo (void) #ifdef USE_MULTIARCH if (__cpu_features.kind == arch_kind_unknown) __init_cpu_features (); -# define is_intel __cpu_features.kind == arch_kind_intel -# define is_amd __cpu_features.kind == arch_kind_amd -# define max_cpuid __cpu_features.max_cpuid #else int max_cpuid; asm volatile ("cpuid" : "=a" (max_cpuid), "=b" (ebx), "=c" (ecx), "=d" (edx) : "0" (0)); - /* This spells out "GenuineIntel". 
*/ -# define is_intel \ - ebx == 0x756e6547 && ecx == 0x6c65746e && edx == 0x49656e69 - /* This spells out "AuthenticAMD". */ -# define is_amd \ - ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65 #endif if (is_intel) -- cgit v1.2.3 From f957edded874c786c51bc9264fa244cb18bc6568 Mon Sep 17 00:00:00 2001 From: Andreas Krebbel Date: Fri, 24 Jul 2009 08:29:06 -0700 Subject: S/390: Hardware iconv modules. --- ChangeLog | 16 + sysdeps/s390/dl-procinfo.c | 10 +- sysdeps/s390/dl-procinfo.h | 9 +- sysdeps/s390/s390-64/Makefile | 67 ++++ sysdeps/s390/s390-64/iso-8859-1_cp037_z900.c | 238 +++++++++++++ sysdeps/s390/s390-64/utf16-utf32-z9.c | 325 +++++++++++++++++ sysdeps/s390/s390-64/utf8-utf16-z9.c | 463 ++++++++++++++++++++++++ sysdeps/s390/s390-64/utf8-utf32-z9.c | 508 +++++++++++++++++++++++++++ 8 files changed, 1628 insertions(+), 8 deletions(-) create mode 100644 sysdeps/s390/s390-64/iso-8859-1_cp037_z900.c create mode 100644 sysdeps/s390/s390-64/utf16-utf32-z9.c create mode 100644 sysdeps/s390/s390-64/utf8-utf16-z9.c create mode 100644 sysdeps/s390/s390-64/utf8-utf32-z9.c (limited to 'sysdeps') diff --git a/ChangeLog b/ChangeLog index f0a6675016..4bdd2f5e35 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,19 @@ +2009-06-16 Andreas Krebbel + + * sysdeps/s390/dl-procinfo.c (_dl_s390_cap_flags): "hpage", + "etf3enh" and "highgprs" added. + (_dl_s390_platforms): "z10" added. + * sysdeps/s390/dl-procinfo.h (_DL_HWCAP_COUNT, _DL_PLATFORMS_COUNT): + Increased for the new entries. + (HWCAP enum): HWCAP_S390_HPAGE, HWCAP_S390_ETF3EH and + HWCAP_S390_HIGH_GPRS added. + + * sysdeps/s390/s390-64/Makefile: Adjusted to build the new modules. + * sysdeps/s390/s390-64/iso-8859-1_cp037_z900.c: New file. + * sysdeps/s390/s390-64/utf16-utf32-z9.c: New file. + * sysdeps/s390/s390-64/utf8-utf16-z9.c: New file. + * sysdeps/s390/s390-64/utf8-utf32-z9.c: New file. 
+ 2009-07-23 Ulrich Drepper * sysdeps/x86_64/cacheinfo.c [USE_MULTIARCH]: Rearrange code to diff --git a/sysdeps/s390/dl-procinfo.c b/sysdeps/s390/dl-procinfo.c index 32c6aef951..d51d7b2379 100644 --- a/sysdeps/s390/dl-procinfo.c +++ b/sysdeps/s390/dl-procinfo.c @@ -1,5 +1,5 @@ /* Data for s390 version of processor capability information. - Copyright (C) 2006 Free Software Foundation, Inc. + Copyright (C) 2006, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Martin Schwidefsky , 2006. @@ -47,11 +47,11 @@ #if !defined PROCINFO_DECL && defined SHARED ._dl_s390_cap_flags #else -PROCINFO_CLASS const char _dl_s390_cap_flags[7][6] +PROCINFO_CLASS const char _dl_s390_cap_flags[10][8] #endif #ifndef PROCINFO_DECL = { - "esan3", "zarch", "stfle", "msa", "ldisp", "eimm", "dfp" + "esan3", "zarch", "stfle", "msa", "ldisp", "eimm", "dfp", "hpage", "etf3enh", "highgprs" } #endif #if !defined SHARED || defined PROCINFO_DECL @@ -63,11 +63,11 @@ PROCINFO_CLASS const char _dl_s390_cap_flags[7][6] #if !defined PROCINFO_DECL && defined SHARED ._dl_s390_platforms #else -PROCINFO_CLASS const char _dl_s390_platforms[4][7] +PROCINFO_CLASS const char _dl_s390_platforms[5][7] #endif #ifndef PROCINFO_DECL = { - "g5", "z900", "z990", "z9-109" + "g5", "z900", "z990", "z9-109", "z10" } #endif #if !defined SHARED || defined PROCINFO_DECL diff --git a/sysdeps/s390/dl-procinfo.h b/sysdeps/s390/dl-procinfo.h index 178d7cc017..0a7ebd3be9 100644 --- a/sysdeps/s390/dl-procinfo.h +++ b/sysdeps/s390/dl-procinfo.h @@ -1,5 +1,5 @@ /* s390 version of processor capability information handling macros. - Copyright (C) 2006 Free Software Foundation, Inc. + Copyright (C) 2006, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Martin Schwidefsky , 2006. 
@@ -22,9 +22,9 @@ #define _DL_PROCINFO_H 1 #include -#define _DL_HWCAP_COUNT 7 +#define _DL_HWCAP_COUNT 10 -#define _DL_PLATFORMS_COUNT 4 +#define _DL_PLATFORMS_COUNT 5 /* The kernel provides up to 32 capability bits with elf_hwcap. */ #define _DL_FIRST_PLATFORM 32 @@ -45,6 +45,9 @@ enum HWCAP_S390_LDISP = 1 << 4, HWCAP_S390_EIMM = 1 << 5, HWCAP_S390_DFP = 1 << 6, + HWCAP_S390_HPAGE = 1 << 7, + HWCAP_S390_ETF3EH = 1 << 8, + HWCAP_S390_HIGH_GPRS = 1 << 9, }; #define HWCAP_IMPORTANT (HWCAP_S390_ZARCH | HWCAP_S390_LDISP \ diff --git a/sysdeps/s390/s390-64/Makefile b/sysdeps/s390/s390-64/Makefile index 0a5051449d..1814f37abd 100644 --- a/sysdeps/s390/s390-64/Makefile +++ b/sysdeps/s390/s390-64/Makefile @@ -9,3 +9,70 @@ CFLAGS-rtld.c += -Wno-uninitialized -Wno-unused CFLAGS-dl-load.c += -Wno-unused CFLAGS-dl-reloc.c += -Wno-unused endif + +ifeq ($(subdir),iconvdata) +ISO-8859-1_CP037_Z900-routines := iso-8859-1_cp037_z900 +ISO-8859-1_CP037_Z900-map := gconv.map + +UTF8_UTF32_Z9-routines := utf8-utf32-z9 +UTF8_UTF32_Z9-map := gconv.map + +UTF16_UTF32_Z9-routines := utf16-utf32-z9 +UTF16_UTF32_Z9-map := gconv.map + +UTF8_UTF16_Z9-routines := utf8-utf16-z9 +UTF8_UTF16_Z9-map := gconv.map + +s390x-iconv-modules = ISO-8859-1_CP037_Z900 UTF8_UTF16_Z9 UTF16_UTF32_Z9 UTF8_UTF32_Z9 + +extra-modules-left += $(s390x-iconv-modules) +include extra-module.mk + +extra-objs += $(addsuffix .so, $(s390x-iconv-modules)) +install-others += $(patsubst %, $(inst_gconvdir)/%.so, $(s390x-iconv-modules)) + +distribute += iso-8859-1_cp037_z900.c utf8-utf32-z9.c utf16-utf32-z9.c utf8-utf16-z9.c + +$(patsubst %, $(inst_gconvdir)/%.so, $(s390x-iconv-modules)) : \ +$(inst_gconvdir)/%.so: $(objpfx)%.so $(+force) + $(do-install-program) + +$(objpfx)gconv-modules-s390: gconv-modules $(+force) + cp $< $@ + echo >> $@ + echo "# S/390 hardware accelerated modules" >> $@ + echo -n "module ISO-8859-1// IBM037// " >> $@ + echo " ISO-8859-1_CP037_Z900 1" >> $@ + echo -n "module IBM037// ISO-8859-1// " >> $@ + 
echo " ISO-8859-1_CP037_Z900 1" >> $@ + echo -n "module ISO-10646/UTF8/ UTF-32// " >> $@ + echo " UTF8_UTF32_Z9 1" >> $@ + echo -n "module UTF-32BE// ISO-10646/UTF8/ " >> $@ + echo " UTF8_UTF32_Z9 1" >> $@ + echo -n "module ISO-10646/UTF8/ UTF-32BE// " >> $@ + echo " UTF8_UTF32_Z9 1" >> $@ + echo -n "module UTF-16BE// UTF-32// " >> $@ + echo " UTF16_UTF32_Z9 1" >> $@ + echo -n "module UTF-32BE// UTF-16// " >> $@ + echo " UTF16_UTF32_Z9 1" >> $@ + echo -n "module INTERNAL UTF-16// " >> $@ + echo " UTF16_UTF32_Z9 1" >> $@ + echo -n "module UTF-32BE// UTF-16BE// " >> $@ + echo " UTF16_UTF32_Z9 1" >> $@ + echo -n "module INTERNAL UTF-16BE// " >> $@ + echo " UTF16_UTF32_Z9 1" >> $@ + echo -n "module UTF-16BE// UTF-32BE// " >> $@ + echo " UTF16_UTF32_Z9 1" >> $@ + echo -n "module UTF-16BE// INTERNAL " >> $@ + echo " UTF16_UTF32_Z9 1" >> $@ + echo -n "module UTF-16BE// ISO-10646/UTF8/ " >> $@ + echo " UTF8_UTF16_Z9 1" >> $@ + echo -n "module ISO-10646/UTF8/ UTF-16// " >> $@ + echo " UTF8_UTF16_Z9 1" >> $@ + echo -n "module ISO-10646/UTF8/ UTF-16BE// " >> $@ + echo " UTF8_UTF16_Z9 1" >> $@ + +$(inst_gconvdir)/gconv-modules: $(objpfx)gconv-modules-s390 $(+force) + $(do-install) + +endif diff --git a/sysdeps/s390/s390-64/iso-8859-1_cp037_z900.c b/sysdeps/s390/s390-64/iso-8859-1_cp037_z900.c new file mode 100644 index 0000000000..d4c4931f22 --- /dev/null +++ b/sysdeps/s390/s390-64/iso-8859-1_cp037_z900.c @@ -0,0 +1,238 @@ +/* Conversion between ISO 8859-1 and IBM037. + + This module uses the Z900 variant of the Translate One To One + instruction. + Copyright (C) 1997-2009 Free Software Foundation, Inc. + + Author: Andreas Krebbel + Based on the work by Ulrich Drepper , 1997. + + Thanks to Daniel Appich who covered the relevant performance work + in his diploma thesis. 
+ + This is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include +#include + +// conversion table from ISO-8859-1 to IBM037 +static const unsigned char table_iso8859_1_to_cp037[256] +__attribute__ ((aligned (8))) = +{ + [0x00] = 0x00, [0x01] = 0x01, [0x02] = 0x02, [0x03] = 0x03, + [0x04] = 0x37, [0x05] = 0x2D, [0x06] = 0x2E, [0x07] = 0x2F, + [0x08] = 0x16, [0x09] = 0x05, [0x0A] = 0x25, [0x0B] = 0x0B, + [0x0C] = 0x0C, [0x0D] = 0x0D, [0x0E] = 0x0E, [0x0F] = 0x0F, + [0x10] = 0x10, [0x11] = 0x11, [0x12] = 0x12, [0x13] = 0x13, + [0x14] = 0x3C, [0x15] = 0x3D, [0x16] = 0x32, [0x17] = 0x26, + [0x18] = 0x18, [0x19] = 0x19, [0x1A] = 0x3F, [0x1B] = 0x27, + [0x1C] = 0x1C, [0x1D] = 0x1D, [0x1E] = 0x1E, [0x1F] = 0x1F, + [0x20] = 0x40, [0x21] = 0x5A, [0x22] = 0x7F, [0x23] = 0x7B, + [0x24] = 0x5B, [0x25] = 0x6C, [0x26] = 0x50, [0x27] = 0x7D, + [0x28] = 0x4D, [0x29] = 0x5D, [0x2A] = 0x5C, [0x2B] = 0x4E, + [0x2C] = 0x6B, [0x2D] = 0x60, [0x2E] = 0x4B, [0x2F] = 0x61, + [0x30] = 0xF0, [0x31] = 0xF1, [0x32] = 0xF2, [0x33] = 0xF3, + [0x34] = 0xF4, [0x35] = 0xF5, [0x36] = 0xF6, [0x37] = 0xF7, + [0x38] = 0xF8, [0x39] = 0xF9, [0x3A] = 0x7A, [0x3B] = 0x5E, + [0x3C] = 0x4C, [0x3D] = 0x7E, [0x3E] = 0x6E, [0x3F] = 0x6F, + [0x40] = 0x7C, [0x41] = 0xC1, [0x42] = 0xC2, [0x43] = 0xC3, + [0x44] = 0xC4, [0x45] = 0xC5, [0x46] = 0xC6, [0x47] = 0xC7, + 
[0x48] = 0xC8, [0x49] = 0xC9, [0x4A] = 0xD1, [0x4B] = 0xD2, + [0x4C] = 0xD3, [0x4D] = 0xD4, [0x4E] = 0xD5, [0x4F] = 0xD6, + [0x50] = 0xD7, [0x51] = 0xD8, [0x52] = 0xD9, [0x53] = 0xE2, + [0x54] = 0xE3, [0x55] = 0xE4, [0x56] = 0xE5, [0x57] = 0xE6, + [0x58] = 0xE7, [0x59] = 0xE8, [0x5A] = 0xE9, [0x5B] = 0xBA, + [0x5C] = 0xE0, [0x5D] = 0xBB, [0x5E] = 0xB0, [0x5F] = 0x6D, + [0x60] = 0x79, [0x61] = 0x81, [0x62] = 0x82, [0x63] = 0x83, + [0x64] = 0x84, [0x65] = 0x85, [0x66] = 0x86, [0x67] = 0x87, + [0x68] = 0x88, [0x69] = 0x89, [0x6A] = 0x91, [0x6B] = 0x92, + [0x6C] = 0x93, [0x6D] = 0x94, [0x6E] = 0x95, [0x6F] = 0x96, + [0x70] = 0x97, [0x71] = 0x98, [0x72] = 0x99, [0x73] = 0xA2, + [0x74] = 0xA3, [0x75] = 0xA4, [0x76] = 0xA5, [0x77] = 0xA6, + [0x78] = 0xA7, [0x79] = 0xA8, [0x7A] = 0xA9, [0x7B] = 0xC0, + [0x7C] = 0x4F, [0x7D] = 0xD0, [0x7E] = 0xA1, [0x7F] = 0x07, + [0x80] = 0x20, [0x81] = 0x21, [0x82] = 0x22, [0x83] = 0x23, + [0x84] = 0x24, [0x85] = 0x15, [0x86] = 0x06, [0x87] = 0x17, + [0x88] = 0x28, [0x89] = 0x29, [0x8A] = 0x2A, [0x8B] = 0x2B, + [0x8C] = 0x2C, [0x8D] = 0x09, [0x8E] = 0x0A, [0x8F] = 0x1B, + [0x90] = 0x30, [0x91] = 0x31, [0x92] = 0x1A, [0x93] = 0x33, + [0x94] = 0x34, [0x95] = 0x35, [0x96] = 0x36, [0x97] = 0x08, + [0x98] = 0x38, [0x99] = 0x39, [0x9A] = 0x3A, [0x9B] = 0x3B, + [0x9C] = 0x04, [0x9D] = 0x14, [0x9E] = 0x3E, [0x9F] = 0xFF, + [0xA0] = 0x41, [0xA1] = 0xAA, [0xA2] = 0x4A, [0xA3] = 0xB1, + [0xA4] = 0x9F, [0xA5] = 0xB2, [0xA6] = 0x6A, [0xA7] = 0xB5, + [0xA8] = 0xBD, [0xA9] = 0xB4, [0xAA] = 0x9A, [0xAB] = 0x8A, + [0xAC] = 0x5F, [0xAD] = 0xCA, [0xAE] = 0xAF, [0xAF] = 0xBC, + [0xB0] = 0x90, [0xB1] = 0x8F, [0xB2] = 0xEA, [0xB3] = 0xFA, + [0xB4] = 0xBE, [0xB5] = 0xA0, [0xB6] = 0xB6, [0xB7] = 0xB3, + [0xB8] = 0x9D, [0xB9] = 0xDA, [0xBA] = 0x9B, [0xBB] = 0x8B, + [0xBC] = 0xB7, [0xBD] = 0xB8, [0xBE] = 0xB9, [0xBF] = 0xAB, + [0xC0] = 0x64, [0xC1] = 0x65, [0xC2] = 0x62, [0xC3] = 0x66, + [0xC4] = 0x63, [0xC5] = 0x67, [0xC6] = 0x9E, [0xC7] = 0x68, + [0xC8] = 0x74, 
[0xC9] = 0x71, [0xCA] = 0x72, [0xCB] = 0x73, + [0xCC] = 0x78, [0xCD] = 0x75, [0xCE] = 0x76, [0xCF] = 0x77, + [0xD0] = 0xAC, [0xD1] = 0x69, [0xD2] = 0xED, [0xD3] = 0xEE, + [0xD4] = 0xEB, [0xD5] = 0xEF, [0xD6] = 0xEC, [0xD7] = 0xBF, + [0xD8] = 0x80, [0xD9] = 0xFD, [0xDA] = 0xFE, [0xDB] = 0xFB, + [0xDC] = 0xFC, [0xDD] = 0xAD, [0xDE] = 0xAE, [0xDF] = 0x59, + [0xE0] = 0x44, [0xE1] = 0x45, [0xE2] = 0x42, [0xE3] = 0x46, + [0xE4] = 0x43, [0xE5] = 0x47, [0xE6] = 0x9C, [0xE7] = 0x48, + [0xE8] = 0x54, [0xE9] = 0x51, [0xEA] = 0x52, [0xEB] = 0x53, + [0xEC] = 0x58, [0xED] = 0x55, [0xEE] = 0x56, [0xEF] = 0x57, + [0xF0] = 0x8C, [0xF1] = 0x49, [0xF2] = 0xCD, [0xF3] = 0xCE, + [0xF4] = 0xCB, [0xF5] = 0xCF, [0xF6] = 0xCC, [0xF7] = 0xE1, + [0xF8] = 0x70, [0xF9] = 0xDD, [0xFA] = 0xDE, [0xFB] = 0xDB, + [0xFC] = 0xDC, [0xFD] = 0x8D, [0xFE] = 0x8E, [0xFF] = 0xDF +}; + +// conversion table from IBM037 to ISO-8859-1 +static const unsigned char table_cp037_iso8859_1[256] +__attribute__ ((aligned (8))) = +{ + [0x00] = 0x00, [0x01] = 0x01, [0x02] = 0x02, [0x03] = 0x03, + [0x04] = 0x9C, [0x05] = 0x09, [0x06] = 0x86, [0x07] = 0x7F, + [0x08] = 0x97, [0x09] = 0x8D, [0x0A] = 0x8E, [0x0B] = 0x0B, + [0x0C] = 0x0C, [0x0D] = 0x0D, [0x0E] = 0x0E, [0x0F] = 0x0F, + [0x10] = 0x10, [0x11] = 0x11, [0x12] = 0x12, [0x13] = 0x13, + [0x14] = 0x9D, [0x15] = 0x85, [0x16] = 0x08, [0x17] = 0x87, + [0x18] = 0x18, [0x19] = 0x19, [0x1A] = 0x92, [0x1B] = 0x8F, + [0x1C] = 0x1C, [0x1D] = 0x1D, [0x1E] = 0x1E, [0x1F] = 0x1F, + [0x20] = 0x80, [0x21] = 0x81, [0x22] = 0x82, [0x23] = 0x83, + [0x24] = 0x84, [0x25] = 0x0A, [0x26] = 0x17, [0x27] = 0x1B, + [0x28] = 0x88, [0x29] = 0x89, [0x2A] = 0x8A, [0x2B] = 0x8B, + [0x2C] = 0x8C, [0x2D] = 0x05, [0x2E] = 0x06, [0x2F] = 0x07, + [0x30] = 0x90, [0x31] = 0x91, [0x32] = 0x16, [0x33] = 0x93, + [0x34] = 0x94, [0x35] = 0x95, [0x36] = 0x96, [0x37] = 0x04, + [0x38] = 0x98, [0x39] = 0x99, [0x3A] = 0x9A, [0x3B] = 0x9B, + [0x3C] = 0x14, [0x3D] = 0x15, [0x3E] = 0x9E, [0x3F] = 0x1A, + [0x40] = 
0x20, [0x41] = 0xA0, [0x42] = 0xE2, [0x43] = 0xE4, + [0x44] = 0xE0, [0x45] = 0xE1, [0x46] = 0xE3, [0x47] = 0xE5, + [0x48] = 0xE7, [0x49] = 0xF1, [0x4A] = 0xA2, [0x4B] = 0x2E, + [0x4C] = 0x3C, [0x4D] = 0x28, [0x4E] = 0x2B, [0x4F] = 0x7C, + [0x50] = 0x26, [0x51] = 0xE9, [0x52] = 0xEA, [0x53] = 0xEB, + [0x54] = 0xE8, [0x55] = 0xED, [0x56] = 0xEE, [0x57] = 0xEF, + [0x58] = 0xEC, [0x59] = 0xDF, [0x5A] = 0x21, [0x5B] = 0x24, + [0x5C] = 0x2A, [0x5D] = 0x29, [0x5E] = 0x3B, [0x5F] = 0xAC, + [0x60] = 0x2D, [0x61] = 0x2F, [0x62] = 0xC2, [0x63] = 0xC4, + [0x64] = 0xC0, [0x65] = 0xC1, [0x66] = 0xC3, [0x67] = 0xC5, + [0x68] = 0xC7, [0x69] = 0xD1, [0x6A] = 0xA6, [0x6B] = 0x2C, + [0x6C] = 0x25, [0x6D] = 0x5F, [0x6E] = 0x3E, [0x6F] = 0x3F, + [0x70] = 0xF8, [0x71] = 0xC9, [0x72] = 0xCA, [0x73] = 0xCB, + [0x74] = 0xC8, [0x75] = 0xCD, [0x76] = 0xCE, [0x77] = 0xCF, + [0x78] = 0xCC, [0x79] = 0x60, [0x7A] = 0x3A, [0x7B] = 0x23, + [0x7C] = 0x40, [0x7D] = 0x27, [0x7E] = 0x3D, [0x7F] = 0x22, + [0x80] = 0xD8, [0x81] = 0x61, [0x82] = 0x62, [0x83] = 0x63, + [0x84] = 0x64, [0x85] = 0x65, [0x86] = 0x66, [0x87] = 0x67, + [0x88] = 0x68, [0x89] = 0x69, [0x8A] = 0xAB, [0x8B] = 0xBB, + [0x8C] = 0xF0, [0x8D] = 0xFD, [0x8E] = 0xFE, [0x8F] = 0xB1, + [0x90] = 0xB0, [0x91] = 0x6A, [0x92] = 0x6B, [0x93] = 0x6C, + [0x94] = 0x6D, [0x95] = 0x6E, [0x96] = 0x6F, [0x97] = 0x70, + [0x98] = 0x71, [0x99] = 0x72, [0x9A] = 0xAA, [0x9B] = 0xBA, + [0x9C] = 0xE6, [0x9D] = 0xB8, [0x9E] = 0xC6, [0x9F] = 0xA4, + [0xA0] = 0xB5, [0xA1] = 0x7E, [0xA2] = 0x73, [0xA3] = 0x74, + [0xA4] = 0x75, [0xA5] = 0x76, [0xA6] = 0x77, [0xA7] = 0x78, + [0xA8] = 0x79, [0xA9] = 0x7A, [0xAA] = 0xA1, [0xAB] = 0xBF, + [0xAC] = 0xD0, [0xAD] = 0xDD, [0xAE] = 0xDE, [0xAF] = 0xAE, + [0xB0] = 0x5E, [0xB1] = 0xA3, [0xB2] = 0xA5, [0xB3] = 0xB7, + [0xB4] = 0xA9, [0xB5] = 0xA7, [0xB6] = 0xB6, [0xB7] = 0xBC, + [0xB8] = 0xBD, [0xB9] = 0xBE, [0xBA] = 0x5B, [0xBB] = 0x5D, + [0xBC] = 0xAF, [0xBD] = 0xA8, [0xBE] = 0xB4, [0xBF] = 0xD7, + [0xC0] = 0x7B, [0xC1] = 
0x41, [0xC2] = 0x42, [0xC3] = 0x43, + [0xC4] = 0x44, [0xC5] = 0x45, [0xC6] = 0x46, [0xC7] = 0x47, + [0xC8] = 0x48, [0xC9] = 0x49, [0xCA] = 0xAD, [0xCB] = 0xF4, + [0xCC] = 0xF6, [0xCD] = 0xF2, [0xCE] = 0xF3, [0xCF] = 0xF5, + [0xD0] = 0x7D, [0xD1] = 0x4A, [0xD2] = 0x4B, [0xD3] = 0x4C, + [0xD4] = 0x4D, [0xD5] = 0x4E, [0xD6] = 0x4F, [0xD7] = 0x50, + [0xD8] = 0x51, [0xD9] = 0x52, [0xDA] = 0xB9, [0xDB] = 0xFB, + [0xDC] = 0xFC, [0xDD] = 0xF9, [0xDE] = 0xFA, [0xDF] = 0xFF, + [0xE0] = 0x5C, [0xE1] = 0xF7, [0xE2] = 0x53, [0xE3] = 0x54, + [0xE4] = 0x55, [0xE5] = 0x56, [0xE6] = 0x57, [0xE7] = 0x58, + [0xE8] = 0x59, [0xE9] = 0x5A, [0xEA] = 0xB2, [0xEB] = 0xD4, + [0xEC] = 0xD6, [0xED] = 0xD2, [0xEE] = 0xD3, [0xEF] = 0xD5, + [0xF0] = 0x30, [0xF1] = 0x31, [0xF2] = 0x32, [0xF3] = 0x33, + [0xF4] = 0x34, [0xF5] = 0x35, [0xF6] = 0x36, [0xF7] = 0x37, + [0xF8] = 0x38, [0xF9] = 0x39, [0xFA] = 0xB3, [0xFB] = 0xDB, + [0xFC] = 0xDC, [0xFD] = 0xD9, [0xFE] = 0xDA, [0xFF] = 0x9F +}; + +/* Definitions used in the body of the `gconv' function. */ +#define CHARSET_NAME "ISO-8859-1//" +#define FROM_LOOP iso8859_1_to_cp037_z900 +#define TO_LOOP cp037_to_iso8859_1_z900 +#define DEFINE_INIT 1 +#define DEFINE_FINI 1 +#define MIN_NEEDED_FROM 1 +#define MIN_NEEDED_TO 1 + +/* The Z900 variant of troo forces us to always specify a test + character which ends the translation. So if we run into the + situation where the translation has been interrupted due to the + test character we translate the character by hand and jump back + into the instruction. */ + +#define TROO_LOOP(TABLE) \ + { \ + register const unsigned char test asm ("0") = 0; \ + register const unsigned char *pTable asm ("1") = TABLE; \ + register unsigned char *pOutput asm ("2") = outptr; \ + register uint64_t length asm ("3"); \ + const unsigned char* pInput = inptr; \ + uint64_t tmp; \ + \ + length = (inend - inptr < outend - outptr \ + ? 
inend - inptr : outend - outptr); \ + \ + asm volatile ("0: \n\t" \ + " troo %0,%1 \n\t" \ + " jz 1f \n\t" \ + " jo 0b \n\t" \ + " llgc %3,0(%1) \n\t" \ + " la %3,0(%3,%4) \n\t" \ + " mvc 0(1,%0),0(%3) \n\t" \ + " aghi %1,1 \n\t" \ + " aghi %0,1 \n\t" \ + " aghi %2,-1 \n\t" \ + " j 0b \n\t" \ + "1: \n" \ + \ + : "+a" (pOutput), "+a" (pInput), "+d" (length), "=&a" (tmp) \ + : "a" (pTable), "d" (test) \ + : "cc"); \ + \ + inptr = pInput; \ + outptr = pOutput; \ + } + +/* First define the conversion function from ISO 8859-1 to CP037. */ +#define MIN_NEEDED_INPUT MIN_NEEDED_FROM +#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO +#define LOOPFCT FROM_LOOP +#define BODY TROO_LOOP (table_iso8859_1_to_cp037) + +#include + + +/* Next, define the conversion function from CP037 to ISO 8859-1. */ +#define MIN_NEEDED_INPUT MIN_NEEDED_TO +#define MIN_NEEDED_OUTPUT MIN_NEEDED_FROM +#define LOOPFCT TO_LOOP +#define BODY TROO_LOOP (table_cp037_iso8859_1); + +#include + + +/* Now define the toplevel functions. */ +#include diff --git a/sysdeps/s390/s390-64/utf16-utf32-z9.c b/sysdeps/s390/s390-64/utf16-utf32-z9.c new file mode 100644 index 0000000000..c9bccc98a7 --- /dev/null +++ b/sysdeps/s390/s390-64/utf16-utf32-z9.c @@ -0,0 +1,325 @@ +/* Conversion between UTF-16 and UTF-32 BE/internal. + + This module uses the Z9-109 variants of the Convert Unicode + instructions. + Copyright (C) 1997-2009 Free Software Foundation, Inc. + + Author: Andreas Krebbel + Based on the work by Ulrich Drepper , 1997. + + Thanks to Daniel Appich who covered the relevant performance work + in his diploma thesis. + + This is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + This is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include +#include +#include +#include +#include + +/* UTF-32 big endian byte order mark. */ +#define BOM_UTF32 0x0000feffu + +/* UTF-16 big endian byte order mark. */ +#define BOM_UTF16 0xfeff + +#define DEFINE_INIT 0 +#define DEFINE_FINI 0 +#define MIN_NEEDED_FROM 2 +#define MAX_NEEDED_FROM 4 +#define MIN_NEEDED_TO 4 +#define FROM_LOOP from_utf16_loop +#define TO_LOOP to_utf16_loop +#define FROM_DIRECTION (dir == from_utf16) +#define PREPARE_LOOP \ + enum direction dir = ((struct utf16_data *) step->__data)->dir; \ + int emit_bom = ((struct utf16_data *) step->__data)->emit_bom; \ + \ + if (emit_bom && !data->__internal_use \ + && data->__invocation_counter == 0) \ + { \ + if (dir == to_utf16) \ + { \ + /* Emit the UTF-16 Byte Order Mark. */ \ + if (__builtin_expect (outbuf + 2 > outend, 0)) \ + return __GCONV_FULL_OUTPUT; \ + \ + put16u (outbuf, BOM_UTF16); \ + outbuf += 2; \ + } \ + else \ + { \ + /* Emit the UTF-32 Byte Order Mark. */ \ + if (__builtin_expect (outbuf + 4 > outend, 0)) \ + return __GCONV_FULL_OUTPUT; \ + \ + put32u (outbuf, BOM_UTF32); \ + outbuf += 4; \ + } \ + } + +/* Direction of the transformation. */ +enum direction +{ + illegal_dir, + to_utf16, + from_utf16 +}; + +struct utf16_data +{ + enum direction dir; + int emit_bom; +}; + + +extern int gconv_init (struct __gconv_step *step); +int +gconv_init (struct __gconv_step *step) +{ + /* Determine which direction. 
*/ + struct utf16_data *new_data; + enum direction dir = illegal_dir; + int emit_bom; + int result; + + emit_bom = (__strcasecmp (step->__to_name, "UTF-32//") == 0 + || __strcasecmp (step->__to_name, "UTF-16//") == 0); + + if (__strcasecmp (step->__from_name, "UTF-16BE//") == 0 + && (__strcasecmp (step->__to_name, "UTF-32//") == 0 + || __strcasecmp (step->__to_name, "UTF-32BE//") == 0 + || __strcasecmp (step->__to_name, "INTERNAL") == 0)) + { + dir = from_utf16; + } + else if ((__strcasecmp (step->__to_name, "UTF-16//") == 0 + || __strcasecmp (step->__to_name, "UTF-16BE//") == 0) + && (__strcasecmp (step->__from_name, "UTF-32BE//") == 0 + || __strcasecmp (step->__from_name, "INTERNAL") == 0)) + { + dir = to_utf16; + } + + result = __GCONV_NOCONV; + if (dir != illegal_dir) + { + new_data = (struct utf16_data *) malloc (sizeof (struct utf16_data)); + + result = __GCONV_NOMEM; + if (new_data != NULL) + { + new_data->dir = dir; + new_data->emit_bom = emit_bom; + step->__data = new_data; + + if (dir == from_utf16) + { + step->__min_needed_from = MIN_NEEDED_FROM; + step->__max_needed_from = MIN_NEEDED_FROM; + step->__min_needed_to = MIN_NEEDED_TO; + step->__max_needed_to = MIN_NEEDED_TO; + } + else + { + step->__min_needed_from = MIN_NEEDED_TO; + step->__max_needed_from = MIN_NEEDED_TO; + step->__min_needed_to = MIN_NEEDED_FROM; + step->__max_needed_to = MIN_NEEDED_FROM; + } + + step->__stateful = 0; + + result = __GCONV_OK; + } + } + + return result; +} + + +extern void gconv_end (struct __gconv_step *data); +void +gconv_end (struct __gconv_step *data) +{ + free (data->__data); +} + +/* The macro for the hardware loop. This is used for both + directions. 
*/ +#define HARDWARE_CONVERT(INSTRUCTION) \ + { \ + register const unsigned char* pInput asm ("8") = inptr; \ + register unsigned long long inlen asm ("9") = inend - inptr; \ + register unsigned char* pOutput asm ("10") = outptr; \ + register unsigned long long outlen asm("11") = outend - outptr; \ + uint64_t cc = 0; \ + \ + asm volatile ("0: " INSTRUCTION " \n\t" \ + " jo 0b \n\t" \ + " ipm %2 \n" \ + : "+a" (pOutput), "+a" (pInput), "+d" (cc), \ + "+d" (outlen), "+d" (inlen) \ + : \ + : "cc", "memory"); \ + \ + inptr = pInput; \ + outptr = pOutput; \ + cc >>= 28; \ + \ + if (cc == 1) \ + { \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + else if (cc == 2) \ + { \ + result = __GCONV_ILLEGAL_INPUT; \ + break; \ + } \ + } + +/* Conversion function from UTF-16 to UTF-32 internal/BE. */ + +#define MIN_NEEDED_INPUT MIN_NEEDED_FROM +#define MAX_NEEDED_INPUT MAX_NEEDED_FROM +#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO +#define LOOPFCT FROM_LOOP +/* The software routine is copied from utf-16.c (minus bytes + swapping). */ +#define BODY \ + { \ + if (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) \ + { \ + HARDWARE_CONVERT ("cu24 %0, %1, 1"); \ + if (inptr != inend) \ + { \ + /* Check if the third byte is \ + a valid start of a UTF-16 surrogate. */ \ + if (inend - inptr == 3 && (inptr[3] & 0xfc) != 0xdc) \ + STANDARD_FROM_LOOP_ERR_HANDLER (3); \ + \ + result = __GCONV_INCOMPLETE_INPUT; \ + break; \ + } \ + continue; \ + } \ + \ + uint16_t u1 = get16 (inptr); \ + \ + if (__builtin_expect (u1 < 0xd800, 1) || u1 > 0xdfff) \ + { \ + /* No surrogate. */ \ + put32 (outptr, u1); \ + inptr += 2; \ + } \ + else \ + { \ + /* It's a surrogate character. At least the first word says \ + it is. */ \ + if (__builtin_expect (inptr + 4 > inend, 0)) \ + { \ + /* We don't have enough input for another complete input \ + character. 
*/ \ + result = __GCONV_INCOMPLETE_INPUT; \ + break; \ + } \ + \ + inptr += 2; \ + uint16_t u2 = get16 (inptr); \ + if (__builtin_expect (u2 < 0xdc00, 0) \ + || __builtin_expect (u2 > 0xdfff, 0)) \ + { \ + /* This is no valid second word for a surrogate. */ \ + inptr -= 2; \ + STANDARD_FROM_LOOP_ERR_HANDLER (2); \ + } \ + \ + put32 (outptr, ((u1 - 0xd7c0) << 10) + (u2 - 0xdc00)); \ + inptr += 2; \ + } \ + outptr += 4; \ + } +#define LOOP_NEED_FLAGS +#include + +/* Conversion from UTF-32 internal/BE to UTF-16. */ + +#define MIN_NEEDED_INPUT MIN_NEEDED_TO +#define MIN_NEEDED_OUTPUT MIN_NEEDED_FROM +#define MAX_NEEDED_OUTPUT MAX_NEEDED_FROM +#define LOOPFCT TO_LOOP +/* The software routine is copied from utf-16.c (minus bytes + swapping). */ +#define BODY \ + { \ + if (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) \ + { \ + HARDWARE_CONVERT ("cu42 %0, %1"); \ + \ + if (inptr != inend) \ + { \ + result = __GCONV_INCOMPLETE_INPUT; \ + break; \ + } \ + continue; \ + } \ + \ + uint32_t c = get32 (inptr); \ + \ + if (__builtin_expect (c <= 0xd7ff, 1) \ + || (c >=0xdc00 && c <= 0xffff)) \ + { \ + /* Two UTF-16 chars. */ \ + put16 (outptr, c); \ + } \ + else if (__builtin_expect (c >= 0x10000, 1) \ + && __builtin_expect (c <= 0x10ffff, 1)) \ + { \ + /* Four UTF-16 chars. */ \ + uint16_t zabcd = ((c & 0x1f0000) >> 16) - 1; \ + uint16_t out; \ + \ + /* Generate a surrogate character. */ \ + if (__builtin_expect (outptr + 4 > outend, 0)) \ + { \ + /* Overflow in the output buffer. 
*/ \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + \ + out = 0xd800; \ + out |= (zabcd & 0xff) << 6; \ + out |= (c >> 10) & 0x3f; \ + put16 (outptr, out); \ + outptr += 2; \ + \ + out = 0xdc00; \ + out |= c & 0x3ff; \ + put16 (outptr, out); \ + } \ + else \ + { \ + STANDARD_TO_LOOP_ERR_HANDLER (4); \ + } \ + outptr += 2; \ + inptr += 4; \ + } +#define LOOP_NEED_FLAGS +#include + +#include diff --git a/sysdeps/s390/s390-64/utf8-utf16-z9.c b/sysdeps/s390/s390-64/utf8-utf16-z9.c new file mode 100644 index 0000000000..3fe3652b35 --- /dev/null +++ b/sysdeps/s390/s390-64/utf8-utf16-z9.c @@ -0,0 +1,463 @@ +/* Conversion between UTF-16 and UTF-32 BE/internal. + + This module uses the Z9-109 variants of the Convert Unicode + instructions. + Copyright (C) 1997-2009 Free Software Foundation, Inc. + + Author: Andreas Krebbel + Based on the work by Ulrich Drepper , 1997. + + Thanks to Daniel Appich who covered the relevant performance work + in his diploma thesis. + + This is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include +#include +#include +#include +#include + +/* UTF-16 big endian byte order mark. 
*/ +#define BOM_UTF16 0xfeff + +#define DEFINE_INIT 0 +#define DEFINE_FINI 0 +#define MIN_NEEDED_FROM 1 +#define MAX_NEEDED_FROM 4 +#define MIN_NEEDED_TO 2 +#define MAX_NEEDED_TO 4 +#define FROM_LOOP from_utf8_loop +#define TO_LOOP to_utf8_loop +#define FROM_DIRECTION (dir == from_utf8) +#define PREPARE_LOOP \ + enum direction dir = ((struct utf8_data *) step->__data)->dir; \ + int emit_bom = ((struct utf8_data *) step->__data)->emit_bom; \ + \ + if (emit_bom && !data->__internal_use \ + && data->__invocation_counter == 0) \ + { \ + /* Emit the UTF-16 Byte Order Mark. */ \ + if (__builtin_expect (outbuf + 2 > outend, 0)) \ + return __GCONV_FULL_OUTPUT; \ + \ + put16u (outbuf, BOM_UTF16); \ + outbuf += 2; \ + } + +/* Direction of the transformation. */ +enum direction +{ + illegal_dir, + to_utf8, + from_utf8 +}; + +struct utf8_data +{ + enum direction dir; + int emit_bom; +}; + + +extern int gconv_init (struct __gconv_step *step); +int +gconv_init (struct __gconv_step *step) +{ + /* Determine which direction. 
*/ + struct utf8_data *new_data; + enum direction dir = illegal_dir; + int emit_bom; + int result; + + emit_bom = (__strcasecmp (step->__to_name, "UTF-16//") == 0); + + if (__strcasecmp (step->__from_name, "ISO-10646/UTF8/") == 0 + && (__strcasecmp (step->__to_name, "UTF-16//") == 0 + || __strcasecmp (step->__to_name, "UTF-16BE//") == 0)) + { + dir = from_utf8; + } + else if (__strcasecmp (step->__from_name, "UTF-16BE//") == 0 + && __strcasecmp (step->__to_name, "ISO-10646/UTF8/") == 0) + { + dir = to_utf8; + } + + result = __GCONV_NOCONV; + if (dir != illegal_dir) + { + new_data = (struct utf8_data *) malloc (sizeof (struct utf8_data)); + + result = __GCONV_NOMEM; + if (new_data != NULL) + { + new_data->dir = dir; + new_data->emit_bom = emit_bom; + step->__data = new_data; + + if (dir == from_utf8) + { + step->__min_needed_from = MIN_NEEDED_FROM; + step->__max_needed_from = MIN_NEEDED_FROM; + step->__min_needed_to = MIN_NEEDED_TO; + step->__max_needed_to = MIN_NEEDED_TO; + } + else + { + step->__min_needed_from = MIN_NEEDED_TO; + step->__max_needed_from = MIN_NEEDED_TO; + step->__min_needed_to = MIN_NEEDED_FROM; + step->__max_needed_to = MIN_NEEDED_FROM; + } + + step->__stateful = 0; + + result = __GCONV_OK; + } + } + + return result; +} + + +extern void gconv_end (struct __gconv_step *data); +void +gconv_end (struct __gconv_step *data) +{ + free (data->__data); +} + +/* The macro for the hardware loop. This is used for both + directions. 
*/ +#define HARDWARE_CONVERT(INSTRUCTION) \ + { \ + register const unsigned char* pInput asm ("8") = inptr; \ + register unsigned long long inlen asm ("9") = inend - inptr; \ + register unsigned char* pOutput asm ("10") = outptr; \ + register unsigned long long outlen asm("11") = outend - outptr; \ + uint64_t cc = 0; \ + \ + asm volatile ("0: " INSTRUCTION " \n\t" \ + " jo 0b \n\t" \ + " ipm %2 \n" \ + : "+a" (pOutput), "+a" (pInput), "+d" (cc), \ + "+d" (outlen), "+d" (inlen) \ + : \ + : "cc", "memory"); \ + \ + inptr = pInput; \ + outptr = pOutput; \ + cc >>= 28; \ + \ + if (cc == 1) \ + { \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + else if (cc == 2) \ + { \ + result = __GCONV_ILLEGAL_INPUT; \ + break; \ + } \ + } + +/* Conversion function from UTF-8 to UTF-16. */ + +#define MIN_NEEDED_INPUT MIN_NEEDED_FROM +#define MAX_NEEDED_INPUT MAX_NEEDED_FROM +#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO +#define LOOPFCT FROM_LOOP +/* The software implementation is based on the code in gconv_simple.c. */ +#define BODY \ + { \ + if (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) \ + { \ + HARDWARE_CONVERT ("cu12 %0, %1, 1"); \ + \ + if (inptr != inend) \ + { \ + int i; \ + for (i = 1; inptr + i < inend; ++i) \ + if ((inptr[i] & 0xc0) != 0x80) \ + break; \ + \ + if (__builtin_expect (inptr + i == inend, 1)) \ + { \ + result = __GCONV_INCOMPLETE_INPUT; \ + break; \ + } \ + STANDARD_FROM_LOOP_ERR_HANDLER (i); \ + } \ + continue; \ + } \ + \ + /* Next input byte. */ \ + uint16_t ch = *inptr; \ + \ + if (__builtin_expect (ch < 0x80, 1)) \ + { \ + /* One byte sequence. */ \ + ++inptr; \ + } \ + else \ + { \ + uint_fast32_t cnt; \ + uint_fast32_t i; \ + \ + if (ch >= 0xc2 && ch < 0xe0) \ + { \ + /* We expect two bytes. The first byte cannot be 0xc0 \ + or 0xc1, otherwise the wide character could have been \ + represented using a single byte. */ \ + cnt = 2; \ + ch &= 0x1f; \ + } \ + else if (__builtin_expect ((ch & 0xf0) == 0xe0, 1)) \ + { \ + /* We expect three bytes. 
*/ \ + cnt = 3; \ + ch &= 0x0f; \ + } \ + else if (__builtin_expect ((ch & 0xf8) == 0xf0, 1)) \ + { \ + /* We expect four bytes. */ \ + cnt = 4; \ + ch &= 0x07; \ + } \ + else \ + { \ + /* Search the end of this ill-formed UTF-8 character. This \ + is the next byte with (x & 0xc0) != 0x80. */ \ + i = 0; \ + do \ + ++i; \ + while (inptr + i < inend \ + && (*(inptr + i) & 0xc0) == 0x80 \ + && i < 5); \ + \ + errout: \ + STANDARD_FROM_LOOP_ERR_HANDLER (i); \ + } \ + \ + if (__builtin_expect (inptr + cnt > inend, 0)) \ + { \ + /* We don't have enough input. But before we report \ + that check that all the bytes are correct. */ \ + for (i = 1; inptr + i < inend; ++i) \ + if ((inptr[i] & 0xc0) != 0x80) \ + break; \ + \ + if (__builtin_expect (inptr + i == inend, 1)) \ + { \ + result = __GCONV_INCOMPLETE_INPUT; \ + break; \ + } \ + \ + goto errout; \ + } \ + \ + if (cnt == 4) \ + { \ + /* For 4 byte UTF-8 chars two UTF-16 chars (high and \ + low) are needed. */ \ + uint16_t zabcd, high, low; \ + \ + if (__builtin_expect (outptr + 4 > outend, 0)) \ + { \ + /* Overflow in the output buffer. */ \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + \ + /* See Principles of Operations cu12. */ \ + zabcd = (((inptr[0] & 0x7) << 2) | \ + ((inptr[1] & 0x30) >> 4)) - 1; \ + \ + /* z-bit must be zero after subtracting 1. */ \ + if (zabcd & 0x10) \ + STANDARD_FROM_LOOP_ERR_HANDLER (4) \ + \ + high = (uint16_t)(0xd8 << 8); /* high surrogate id */ \ + high |= zabcd << 6; /* abcd bits */ \ + high |= (inptr[1] & 0xf) << 2; /* efgh bits */ \ + high |= (inptr[2] & 0x30) >> 4; /* ij bits */ \ + \ + low = (uint16_t)(0xdc << 8); /* low surrogate id */ \ + low |= ((uint16_t)inptr[2] & 0xc) << 6; /* kl bits */ \ + low |= (inptr[2] & 0x3) << 6; /* mn bits */ \ + low |= inptr[3] & 0x3f; /* opqrst bits */ \ + \ + put16 (outptr, high); \ + outptr += 2; \ + put16 (outptr, low); \ + outptr += 2; \ + inptr += 4; \ + continue; \ + } \ + else \ + { \ + /* Read the possible remaining bytes. 
*/ \ + for (i = 1; i < cnt; ++i) \ + { \ + uint16_t byte = inptr[i]; \ + \ + if ((byte & 0xc0) != 0x80) \ + /* This is an illegal encoding. */ \ + break; \ + \ + ch <<= 6; \ + ch |= byte & 0x3f; \ + } \ + inptr += cnt; \ + \ + } \ + } \ + /* Now adjust the pointers and store the result. */ \ + *((uint16_t *) outptr) = ch; \ + outptr += sizeof (uint16_t); \ + } + +#define LOOP_NEED_FLAGS +#include + +/* Conversion from UTF-16 to UTF-8. */ + +#define MIN_NEEDED_INPUT MIN_NEEDED_TO +#define MIN_NEEDED_OUTPUT MIN_NEEDED_FROM +#define MAX_NEEDED_OUTPUT MAX_NEEDED_FROM +#define LOOPFCT TO_LOOP +/* The software routine is based on the functionality of the S/390 + hardware instruction (cu21) as described in the Principles of + Operation. */ +#define BODY \ + { \ + if (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) \ + { \ + HARDWARE_CONVERT ("cu21 %0, %1"); \ + if (inptr != inend) \ + { \ + /* Check if the third byte is \ + a valid start of a UTF-16 surrogate. */ \ + if (inend - inptr == 3 && (inptr[3] & 0xfc) != 0xdc) \ + STANDARD_TO_LOOP_ERR_HANDLER (3); \ + \ + result = __GCONV_INCOMPLETE_INPUT; \ + break; \ + } \ + continue; \ + } \ + \ + uint16_t c = get16 (inptr); \ + \ + if (__builtin_expect (c <= 0x007f, 1)) \ + { \ + /* Single byte UTF-8 char. */ \ + *outptr = c & 0xff; \ + outptr++; \ + } \ + else if (c >= 0x0080 && c <= 0x07ff) \ + { \ + /* Two byte UTF-8 char. */ \ + \ + if (__builtin_expect (outptr + 2 > outend, 0)) \ + { \ + /* Overflow in the output buffer. */ \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + \ + outptr[0] = 0xc0; \ + outptr[0] |= c >> 6; \ + \ + outptr[1] = 0x80; \ + outptr[1] |= c & 0x3f; \ + \ + outptr += 2; \ + } \ + else if (c >= 0x0800 && c <= 0xd7ff) \ + { \ + /* Three byte UTF-8 char. */ \ + \ + if (__builtin_expect (outptr + 3 > outend, 0)) \ + { \ + /* Overflow in the output buffer. 
*/ \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + outptr[0] = 0xe0; \ + outptr[0] |= c >> 12; \ + \ + outptr[1] = 0x80; \ + outptr[1] |= (c >> 6) & 0x3f; \ + \ + outptr[2] = 0x80; \ + outptr[2] |= c & 0x3f; \ + \ + outptr += 3; \ + } \ + else if (c >= 0xd800 && c <= 0xdbff) \ + { \ + /* Four byte UTF-8 char. */ \ + uint16_t low, uvwxy; \ + \ + if (__builtin_expect (outptr + 4 > outend, 0)) \ + { \ + /* Overflow in the output buffer. */ \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + inptr += 2; \ + if (__builtin_expect (inptr + 2 > inend, 0)) \ + { \ + result = __GCONV_INCOMPLETE_INPUT; \ + break; \ + } \ + \ + low = get16 (inptr); \ + \ + if ((low & 0xfc00) != 0xdc00) \ + { \ + inptr -= 2; \ + STANDARD_TO_LOOP_ERR_HANDLER (2); \ + } \ + uvwxy = ((c >> 6) & 0xf) + 1; \ + outptr[0] = 0xf0; \ + outptr[0] |= uvwxy >> 2; \ + \ + outptr[1] = 0x80; \ + outptr[1] |= (uvwxy << 4) & 0x30; \ + outptr[1] |= (c >> 2) & 0x0f; \ + \ + outptr[2] = 0x80; \ + outptr[2] |= (c & 0x03) << 4; \ + outptr[2] |= (low >> 6) & 0x0f; \ + \ + outptr[3] = 0x80; \ + outptr[3] |= low & 0x3f; \ + \ + outptr += 4; \ + } \ + else \ + { \ + STANDARD_TO_LOOP_ERR_HANDLER (2); \ + } \ + inptr += 2; \ + } +#define LOOP_NEED_FLAGS +#include + +#include diff --git a/sysdeps/s390/s390-64/utf8-utf32-z9.c b/sysdeps/s390/s390-64/utf8-utf32-z9.c new file mode 100644 index 0000000000..14847e2403 --- /dev/null +++ b/sysdeps/s390/s390-64/utf8-utf32-z9.c @@ -0,0 +1,508 @@ +/* Conversion between UTF-8 and UTF-32 BE/internal. + + This module uses the Z9-109 variants of the Convert Unicode + instructions. + Copyright (C) 1997-2009 Free Software Foundation, Inc. + + Author: Andreas Krebbel + Based on the work by Ulrich Drepper , 1997. + + Thanks to Daniel Appich who covered the relevant performance work + in his diploma thesis. 
+ + This is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include +#include +#include +#include +#include + +/* UTF-32 big endian byte order mark. */ +#define BOM 0x0000feffu + +#define DEFINE_INIT 0 +#define DEFINE_FINI 0 +/* These definitions apply to the UTF-8 to UTF-32 direction. The + software implementation for UTF-8 still supports multibyte + characters up to 6 bytes whereas the hardware variant does not. */ +#define MIN_NEEDED_FROM 1 +#define MAX_NEEDED_FROM 6 +#define MIN_NEEDED_TO 4 +#define FROM_LOOP from_utf8_loop +#define TO_LOOP to_utf8_loop +#define FROM_DIRECTION (dir == from_utf8) +#define PREPARE_LOOP \ + enum direction dir = ((struct utf8_data *) step->__data)->dir; \ + int emit_bom = ((struct utf8_data *) step->__data)->emit_bom; \ + \ + if (emit_bom && !data->__internal_use \ + && data->__invocation_counter == 0) \ + { \ + /* Emit the Byte Order Mark. */ \ + if (__builtin_expect (outbuf + 4 > outend, 0)) \ + return __GCONV_FULL_OUTPUT; \ + \ + put32u (outbuf, BOM); \ + outbuf += 4; \ + } + +/* Direction of the transformation. */ +enum direction +{ + illegal_dir, + to_utf8, + from_utf8 +}; + +struct utf8_data +{ + enum direction dir; + int emit_bom; +}; + + +extern int gconv_init (struct __gconv_step *step); +int +gconv_init (struct __gconv_step *step) +{ + /* Determine which direction. 
*/ + struct utf8_data *new_data; + enum direction dir = illegal_dir; + int emit_bom; + int result; + + emit_bom = (__strcasecmp (step->__to_name, "UTF-32//") == 0); + + if (__strcasecmp (step->__from_name, "ISO-10646/UTF8/") == 0 + && (__strcasecmp (step->__to_name, "UTF-32//") == 0 + || __strcasecmp (step->__to_name, "UTF-32BE//") == 0 + || __strcasecmp (step->__to_name, "INTERNAL") == 0)) + { + dir = from_utf8; + } + else if (__strcasecmp (step->__to_name, "ISO-10646/UTF8/") == 0 + && (__strcasecmp (step->__from_name, "UTF-32BE//") == 0 + || __strcasecmp (step->__from_name, "INTERNAL") == 0)) + { + dir = to_utf8; + } + + result = __GCONV_NOCONV; + if (dir != illegal_dir) + { + new_data = (struct utf8_data *) malloc (sizeof (struct utf8_data)); + + result = __GCONV_NOMEM; + if (new_data != NULL) + { + new_data->dir = dir; + new_data->emit_bom = emit_bom; + step->__data = new_data; + + if (dir == from_utf8) + { + step->__min_needed_from = MIN_NEEDED_FROM; + step->__max_needed_from = MIN_NEEDED_FROM; + step->__min_needed_to = MIN_NEEDED_TO; + step->__max_needed_to = MIN_NEEDED_TO; + } + else + { + step->__min_needed_from = MIN_NEEDED_TO; + step->__max_needed_from = MIN_NEEDED_TO; + step->__min_needed_to = MIN_NEEDED_FROM; + step->__max_needed_to = MIN_NEEDED_FROM; + } + + step->__stateful = 0; + + result = __GCONV_OK; + } + } + + return result; +} + + +extern void gconv_end (struct __gconv_step *data); +void +gconv_end (struct __gconv_step *data) +{ + free (data->__data); +} + +/* The macro for the hardware loop. This is used for both + directions. 
*/ +#define HARDWARE_CONVERT(INSTRUCTION) \ + { \ + register const unsigned char* pInput asm ("8") = inptr; \ + register unsigned long long inlen asm ("9") = inend - inptr; \ + register unsigned char* pOutput asm ("10") = outptr; \ + register unsigned long long outlen asm("11") = outend - outptr; \ + uint64_t cc = 0; \ + \ + asm volatile ("0: " INSTRUCTION " \n\t" \ + " jo 0b \n\t" \ + " ipm %2 \n" \ + : "+a" (pOutput), "+a" (pInput), "+d" (cc), \ + "+d" (outlen), "+d" (inlen) \ + : \ + : "cc", "memory"); \ + \ + inptr = pInput; \ + outptr = pOutput; \ + cc >>= 28; \ + \ + if (cc == 1) \ + { \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + else if (cc == 2) \ + { \ + result = __GCONV_ILLEGAL_INPUT; \ + break; \ + } \ + } + +/* Conversion function from UTF-8 to UTF-32 internal/BE. */ + +#define MIN_NEEDED_INPUT MIN_NEEDED_FROM +#define MAX_NEEDED_INPUT MAX_NEEDED_FROM +#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO +#define LOOPFCT FROM_LOOP +/* The software routine is copied from gconv_simple.c. */ +#define BODY \ + { \ + if (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) \ + { \ + HARDWARE_CONVERT ("cu14 %0, %1, 1"); \ + \ + if (inptr != inend) \ + { \ + int i; \ + for (i = 1; inptr + i < inend; ++i) \ + if ((inptr[i] & 0xc0) != 0x80) \ + break; \ + \ + if (__builtin_expect (inptr + i == inend, 1)) \ + { \ + result = __GCONV_INCOMPLETE_INPUT; \ + break; \ + } \ + STANDARD_FROM_LOOP_ERR_HANDLER (i); \ + } \ + continue; \ + } \ + \ + /* Next input byte. */ \ + uint32_t ch = *inptr; \ + \ + if (__builtin_expect (ch < 0x80, 1)) \ + { \ + /* One byte sequence. */ \ + ++inptr; \ + } \ + else \ + { \ + uint_fast32_t cnt; \ + uint_fast32_t i; \ + \ + if (ch >= 0xc2 && ch < 0xe0) \ + { \ + /* We expect two bytes. The first byte cannot be 0xc0 or \ + 0xc1, otherwise the wide character could have been \ + represented using a single byte. */ \ + cnt = 2; \ + ch &= 0x1f; \ + } \ + else if (__builtin_expect ((ch & 0xf0) == 0xe0, 1)) \ + { \ + /* We expect three bytes. 
*/ \ + cnt = 3; \ + ch &= 0x0f; \ + } \ + else if (__builtin_expect ((ch & 0xf8) == 0xf0, 1)) \ + { \ + /* We expect four bytes. */ \ + cnt = 4; \ + ch &= 0x07; \ + } \ + else if (__builtin_expect ((ch & 0xfc) == 0xf8, 1)) \ + { \ + /* We expect five bytes. */ \ + cnt = 5; \ + ch &= 0x03; \ + } \ + else if (__builtin_expect ((ch & 0xfe) == 0xfc, 1)) \ + { \ + /* We expect six bytes. */ \ + cnt = 6; \ + ch &= 0x01; \ + } \ + else \ + { \ + /* Search the end of this ill-formed UTF-8 character. This \ + is the next byte with (x & 0xc0) != 0x80. */ \ + i = 0; \ + do \ + ++i; \ + while (inptr + i < inend \ + && (*(inptr + i) & 0xc0) == 0x80 \ + && i < 5); \ + \ + errout: \ + STANDARD_FROM_LOOP_ERR_HANDLER (i); \ + } \ + \ + if (__builtin_expect (inptr + cnt > inend, 0)) \ + { \ + /* We don't have enough input. But before we report \ + that check that all the bytes are correct. */ \ + for (i = 1; inptr + i < inend; ++i) \ + if ((inptr[i] & 0xc0) != 0x80) \ + break; \ + \ + if (__builtin_expect (inptr + i == inend, 1)) \ + { \ + result = __GCONV_INCOMPLETE_INPUT; \ + break; \ + } \ + \ + goto errout; \ + } \ + \ + /* Read the possible remaining bytes. */ \ + for (i = 1; i < cnt; ++i) \ + { \ + uint32_t byte = inptr[i]; \ + \ + if ((byte & 0xc0) != 0x80) \ + /* This is an illegal encoding. */ \ + break; \ + \ + ch <<= 6; \ + ch |= byte & 0x3f; \ + } \ + \ + /* If i < cnt, some trail byte was not >= 0x80, < 0xc0. \ + If cnt > 2 and ch < 2^(5*cnt-4), the wide character ch could \ + have been represented with fewer than cnt bytes. */ \ + if (i < cnt || (cnt > 2 && (ch >> (5 * cnt - 4)) == 0)) \ + { \ + /* This is an illegal encoding. */ \ + goto errout; \ + } \ + \ + inptr += cnt; \ + } \ + \ + /* Now adjust the pointers and store the result. */ \ + *((uint32_t *) outptr) = ch; \ + outptr += sizeof (uint32_t); \ + } +#define LOOP_NEED_FLAGS + +#define STORE_REST \ + { \ + /* We store the remaining bytes while converting them into the UCS4 \ + format. 
We can assume that the first byte in the buffer is \ + correct and that it requires a larger number of bytes than there \ + are in the input buffer. */ \ + wint_t ch = **inptrp; \ + size_t cnt, r; \ + \ + state->__count = inend - *inptrp; \ + \ + if (ch >= 0xc2 && ch < 0xe0) \ + { \ + /* We expect two bytes. The first byte cannot be 0xc0 or \ + 0xc1, otherwise the wide character could have been \ + represented using a single byte. */ \ + cnt = 2; \ + ch &= 0x1f; \ + } \ + else if (__builtin_expect ((ch & 0xf0) == 0xe0, 1)) \ + { \ + /* We expect three bytes. */ \ + cnt = 3; \ + ch &= 0x0f; \ + } \ + else if (__builtin_expect ((ch & 0xf8) == 0xf0, 1)) \ + { \ + /* We expect four bytes. */ \ + cnt = 4; \ + ch &= 0x07; \ + } \ + else if (__builtin_expect ((ch & 0xfc) == 0xf8, 1)) \ + { \ + /* We expect five bytes. */ \ + cnt = 5; \ + ch &= 0x03; \ + } \ + else \ + { \ + /* We expect six bytes. */ \ + cnt = 6; \ + ch &= 0x01; \ + } \ + \ + /* The first byte is already consumed. */ \ + r = cnt - 1; \ + while (++(*inptrp) < inend) \ + { \ + ch <<= 6; \ + ch |= **inptrp & 0x3f; \ + --r; \ + } \ + \ + /* Shift for the so far missing bytes. */ \ + ch <<= r * 6; \ + \ + /* Store the number of bytes expected for the entire sequence. */ \ + state->__count |= cnt << 8; \ + \ + /* Store the value. */ \ + state->__value.__wch = ch; \ + } + +#define UNPACK_BYTES \ + { \ + static const unsigned char inmask[5] = { 0xc0, 0xe0, 0xf0, 0xf8, 0xfc }; \ + wint_t wch = state->__value.__wch; \ + size_t ntotal = state->__count >> 8; \ + \ + inlen = state->__count & 255; \ + \ + bytebuf[0] = inmask[ntotal - 2]; \ + \ + do \ + { \ + if (--ntotal < inlen) \ + bytebuf[ntotal] = 0x80 | (wch & 0x3f); \ + wch >>= 6; \ + } \ + while (ntotal > 1); \ + \ + bytebuf[0] |= wch; \ + } + +#define CLEAR_STATE \ + state->__count = 0 + +#include + +/* Conversion from UTF-32 internal/BE to UTF-8. 
*/ + +#define MIN_NEEDED_INPUT MIN_NEEDED_TO +#define MIN_NEEDED_OUTPUT MIN_NEEDED_FROM +#define MAX_NEEDED_OUTPUT MAX_NEEDED_FROM +#define LOOPFCT TO_LOOP +/* The software routine mimics the S/390 cu41 instruction. */ +#define BODY \ + { \ + if (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) \ + { \ + HARDWARE_CONVERT ("cu41 %0, %1"); \ + \ + if (inptr != inend) \ + { \ + result = __GCONV_INCOMPLETE_INPUT; \ + break; \ + } \ + continue; \ + } \ + \ + uint32_t wc = *((const uint32_t *) inptr); \ + \ + if (__builtin_expect (wc <= 0x7f, 1)) \ + { \ + /* Single UTF-8 char. */ \ + *outptr = (uint8_t)wc; \ + outptr++; \ + } \ + else if (wc <= 0x7ff) \ + { \ + /* Two UTF-8 chars. */ \ + if (__builtin_expect (outptr + 2 > outend, 0)) \ + { \ + /* Overflow in the output buffer. */ \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + \ + outptr[0] = 0xc0; \ + outptr[0] |= wc >> 6; \ + \ + outptr[1] = 0x80; \ + outptr[1] |= wc & 0x3f; \ + \ + outptr += 2; \ + } \ + else if (wc <= 0xffff) \ + { \ + /* Three UTF-8 chars. */ \ + if (__builtin_expect (outptr + 3 > outend, 0)) \ + { \ + /* Overflow in the output buffer. */ \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + outptr[0] = 0xe0; \ + outptr[0] |= wc >> 12; \ + \ + outptr[1] = 0x80; \ + outptr[1] |= (wc >> 6) & 0x3f; \ + \ + outptr[2] = 0x80; \ + outptr[2] |= wc & 0x3f; \ + \ + outptr += 3; \ + } \ + else if (wc <= 0x10ffff) \ + { \ + /* Four UTF-8 chars. */ \ + if (__builtin_expect (outptr + 4 > outend, 0)) \ + { \ + /* Overflow in the output buffer. 
*/ \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + outptr[0] = 0xf0; \ + outptr[0] |= wc >> 18; \ + \ + outptr[1] = 0x80; \ + outptr[1] |= (wc >> 12) & 0x3f; \ + \ + outptr[2] = 0x80; \ + outptr[2] |= (wc >> 6) & 0x3f; \ + \ + outptr[3] = 0x80; \ + outptr[3] |= wc & 0x3f; \ + \ + outptr += 4; \ + } \ + else \ + { \ + STANDARD_TO_LOOP_ERR_HANDLER (4); \ + } \ + inptr += 4; \ + } +#define LOOP_NEED_FLAGS +#include + +#include -- cgit v1.2.3 From 89749d1970ad7326672c5dfcc765777d8d48acec Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Fri, 24 Jul 2009 08:32:47 -0700 Subject: White space fixes in last checkin. --- sysdeps/s390/s390-64/utf16-utf32-z9.c | 4 ++-- sysdeps/s390/s390-64/utf8-utf16-z9.c | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'sysdeps') diff --git a/sysdeps/s390/s390-64/utf16-utf32-z9.c b/sysdeps/s390/s390-64/utf16-utf32-z9.c index c9bccc98a7..868dea68ca 100644 --- a/sysdeps/s390/s390-64/utf16-utf32-z9.c +++ b/sysdeps/s390/s390-64/utf16-utf32-z9.c @@ -103,7 +103,7 @@ gconv_init (struct __gconv_step *step) if (__strcasecmp (step->__from_name, "UTF-16BE//") == 0 && (__strcasecmp (step->__to_name, "UTF-32//") == 0 || __strcasecmp (step->__to_name, "UTF-32BE//") == 0 - || __strcasecmp (step->__to_name, "INTERNAL") == 0)) + || __strcasecmp (step->__to_name, "INTERNAL") == 0)) { dir = from_utf16; } @@ -179,7 +179,7 @@ gconv_end (struct __gconv_step *data) \ inptr = pInput; \ outptr = pOutput; \ - cc >>= 28; \ + cc >>= 28; \ \ if (cc == 1) \ { \ diff --git a/sysdeps/s390/s390-64/utf8-utf16-z9.c b/sysdeps/s390/s390-64/utf8-utf16-z9.c index 3fe3652b35..531d3ebd4b 100644 --- a/sysdeps/s390/s390-64/utf8-utf16-z9.c +++ b/sysdeps/s390/s390-64/utf8-utf16-z9.c @@ -187,14 +187,14 @@ gconv_end (struct __gconv_step *data) if (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) \ { \ HARDWARE_CONVERT ("cu12 %0, %1, 1"); \ - \ + \ if (inptr != inend) \ { \ int i; \ for (i = 1; inptr + i < inend; ++i) \ if ((inptr[i] & 0xc0) != 0x80) \ break; \ - \ 
+ \ if (__builtin_expect (inptr + i == inend, 1)) \ { \ result = __GCONV_INCOMPLETE_INPUT; \ @@ -275,7 +275,7 @@ gconv_end (struct __gconv_step *data) /* For 4 byte UTF-8 chars two UTF-16 chars (high and \ low) are needed. */ \ uint16_t zabcd, high, low; \ - \ + \ if (__builtin_expect (outptr + 4 > outend, 0)) \ { \ /* Overflow in the output buffer. */ \ @@ -300,7 +300,7 @@ gconv_end (struct __gconv_step *data) low |= ((uint16_t)inptr[2] & 0xc) << 6; /* kl bits */ \ low |= (inptr[2] & 0x3) << 6; /* mn bits */ \ low |= inptr[3] & 0x3f; /* opqrst bits */ \ - \ + \ put16 (outptr, high); \ outptr += 2; \ put16 (outptr, low); \ @@ -382,7 +382,7 @@ gconv_end (struct __gconv_step *data) \ outptr[0] = 0xc0; \ outptr[0] |= c >> 6; \ - \ + \ outptr[1] = 0x80; \ outptr[1] |= c & 0x3f; \ \ -- cgit v1.2.3 From 7c36ced067da038ea42c909b9f7c3bad5f35d7d0 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Fri, 24 Jul 2009 08:34:47 -0700 Subject: More white space fixes. --- sysdeps/s390/s390-64/utf8-utf32-z9.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'sysdeps') diff --git a/sysdeps/s390/s390-64/utf8-utf32-z9.c b/sysdeps/s390/s390-64/utf8-utf32-z9.c index 14847e2403..17ef8bc890 100644 --- a/sysdeps/s390/s390-64/utf8-utf32-z9.c +++ b/sysdeps/s390/s390-64/utf8-utf32-z9.c @@ -55,7 +55,7 @@ /* Emit the Byte Order Mark. 
*/ \ if (__builtin_expect (outbuf + 4 > outend, 0)) \ return __GCONV_FULL_OUTPUT; \ - \ + \ put32u (outbuf, BOM); \ outbuf += 4; \ } @@ -90,7 +90,7 @@ gconv_init (struct __gconv_step *step) if (__strcasecmp (step->__from_name, "ISO-10646/UTF8/") == 0 && (__strcasecmp (step->__to_name, "UTF-32//") == 0 || __strcasecmp (step->__to_name, "UTF-32BE//") == 0 - || __strcasecmp (step->__to_name, "INTERNAL") == 0)) + || __strcasecmp (step->__to_name, "INTERNAL") == 0)) { dir = from_utf8; } @@ -165,7 +165,7 @@ gconv_end (struct __gconv_step *data) \ inptr = pInput; \ outptr = pOutput; \ - cc >>= 28; \ + cc >>= 28; \ \ if (cc == 1) \ { \ @@ -186,7 +186,7 @@ gconv_end (struct __gconv_step *data) #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO #define LOOPFCT FROM_LOOP /* The software routine is copied from gconv_simple.c. */ -#define BODY \ +#define BODY \ { \ if (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) \ { \ -- cgit v1.2.3 From da331e8e14f17c973444a0cbf62d90c72097b135 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Fri, 24 Jul 2009 13:01:17 -0700 Subject: Don't automatically use /lib/modules/* headers. Ever since the /usr/include/linux headers got cleaned up this isn't necessary. Meanwhile everybody should have these cleanups. --- ChangeLog | 6 ++++++ sysdeps/unix/sysv/linux/configure | 11 ----------- sysdeps/unix/sysv/linux/configure.in | 13 ------------- 3 files changed, 6 insertions(+), 24 deletions(-) (limited to 'sysdeps') diff --git a/ChangeLog b/ChangeLog index 4bdd2f5e35..e95e94b4b2 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +2009-07-24 Ulrich Drepper + + * sysdeps/unix/sysv/linux/configure.in: Don't automatically include + /lib/modules/* headers anymore. We have sane headers in the standard + place now. 
+ 2009-06-16 Andreas Krebbel * sysdeps/s390/dl-procinfo.c (_dl_s390_cap_flags): "hpage", diff --git a/sysdeps/unix/sysv/linux/configure b/sysdeps/unix/sysv/linux/configure index 253e9c57ff..199457a3ac 100644 --- a/sysdeps/unix/sysv/linux/configure +++ b/sysdeps/unix/sysv/linux/configure @@ -1,17 +1,6 @@ # This file is generated from configure.in by Autoconf. DO NOT EDIT! # Local configure fragment for sysdeps/unix/sysv/linux. -# The Linux kernel headers can be found in -# /lib/modules/$(uname -r)/build/include -# Check whether this directory is available. -if test -z "$sysheaders" && - test "x$cross_compiling" = xno && - test -d /lib/modules/`uname -r`/build/include; then - sysheaders="/lib/modules/`uname -r`/build/include" - ccheaders=`$CC -print-file-name=include` - SYSINCLUDES="-I $sysheaders" -fi - # Don't bother trying to generate any glue code to be compatible with the # existing system library, because we are the only system library. inhibit_glue=yes diff --git a/sysdeps/unix/sysv/linux/configure.in b/sysdeps/unix/sysv/linux/configure.in index 5330e98c2d..8f00407a8b 100644 --- a/sysdeps/unix/sysv/linux/configure.in +++ b/sysdeps/unix/sysv/linux/configure.in @@ -1,19 +1,6 @@ GLIBC_PROVIDES dnl See aclocal.m4 in the top level source directory. # Local configure fragment for sysdeps/unix/sysv/linux. -# The Linux kernel headers can be found in -# /lib/modules/$(uname -r)/build/include -# Check whether this directory is available. -if test -z "$sysheaders" && - test "x$cross_compiling" = xno && - test -d /lib/modules/`uname -r`/build/include; then - sysheaders="/lib/modules/`uname -r`/build/include" - ccheaders=`$CC -print-file-name=include` - dnl We don't have to use -nostdinc. We just want one more directory - dnl to be used. - SYSINCLUDES="-I $sysheaders" -fi - # Don't bother trying to generate any glue code to be compatible with the # existing system library, because we are the only system library. 
inhibit_glue=yes -- cgit v1.2.3 From 29e92fa5cd9c5c09a9c78563f35729fec9075e7f Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Sat, 25 Jul 2009 12:02:47 -0700 Subject: Optimize x86-64 SSE4.2 strcmp. The file contained some code which was never used. Don't compile it in. --- ChangeLog | 5 +++++ sysdeps/x86_64/multiarch/strcmp.S | 5 +++++ 2 files changed, 10 insertions(+) (limited to 'sysdeps') diff --git a/ChangeLog b/ChangeLog index e95e94b4b2..8246614d1a 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2009-07-25 Ulrich Drepper + + * sysdeps/x86_64/multiarch/strcmp.S: Exclude unused code from being + compiled in. + 2009-07-24 Ulrich Drepper * sysdeps/unix/sysv/linux/configure.in: Don't automatically include diff --git a/sysdeps/x86_64/multiarch/strcmp.S b/sysdeps/x86_64/multiarch/strcmp.S index 37985036aa..f9cf943e32 100644 --- a/sysdeps/x86_64/multiarch/strcmp.S +++ b/sysdeps/x86_64/multiarch/strcmp.S @@ -1457,6 +1457,9 @@ LABEL(use_sse4_2_ret): sub %edx, %eax ret +#if 0 + /* This code was in the origial submission but isn't used. + --drepper */ .p2align 4 LABEL(aftertail): pcmpeqb %xmm3, %xmm1 @@ -1467,6 +1470,8 @@ LABEL(aftertail): .p2align 4 LABEL(exit): lea -16(%r9, %rcx), %rax /* locate the exact offset for rdi */ +#endif + LABEL(less32bytes): lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */ lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */ -- cgit v1.2.3 From 657317537c09b82a2feb1194fda045f63e3a1222 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Sat, 25 Jul 2009 12:29:04 -0700 Subject: Handle missing NSS modules and those without callbacks. getaddrinfo didn't update the status variable in that round of the loop if no callback was used. 
--- ChangeLog | 4 ++++ sysdeps/posix/getaddrinfo.c | 2 ++ 2 files changed, 6 insertions(+) (limited to 'sysdeps') diff --git a/ChangeLog b/ChangeLog index 8246614d1a..f47b0897a4 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,9 @@ 2009-07-25 Ulrich Drepper + [BZ #10448] + * sysdeps/posix/getaddrinfo.c (gaih_inet): If NSS module contains no + callback we must touch the status to avoid using stale value. + * sysdeps/x86_64/multiarch/strcmp.S: Exclude unused code from being compiled in. diff --git a/sysdeps/posix/getaddrinfo.c b/sysdeps/posix/getaddrinfo.c index d346c621fb..a788d18fee 100644 --- a/sysdeps/posix/getaddrinfo.c +++ b/sysdeps/posix/getaddrinfo.c @@ -833,6 +833,8 @@ gaih_inet (const char *name, const struct gaih_service *service, && inet6_status != NSS_STATUS_UNAVAIL) status = inet6_status; } + else + status = NSS_STATUS_UNAVAIL; } if (nss_next_action (nip, status) == NSS_ACTION_RETURN) -- cgit v1.2.3 From 4e5b5821bf58ddc30d455ee4968623f3334fbe28 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Sat, 25 Jul 2009 19:15:14 -0700 Subject: Some some optimizations for x86-64 strcmp. --- ChangeLog | 3 +++ sysdeps/x86_64/multiarch/strcmp.S | 13 ++++--------- 2 files changed, 7 insertions(+), 9 deletions(-) (limited to 'sysdeps') diff --git a/ChangeLog b/ChangeLog index f47b0897a4..abfe7dbfbc 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,8 @@ 2009-07-25 Ulrich Drepper + * sysdeps/x86_64/multiarch/strcmp.S: Some more optimizations for + modern processor versions. Patch by H.J. Lu . + [BZ #10448] * sysdeps/posix/getaddrinfo.c (gaih_inet): If NSS module contains no callback we must touch the status to avoid using stale value. 
diff --git a/sysdeps/x86_64/multiarch/strcmp.S b/sysdeps/x86_64/multiarch/strcmp.S index f9cf943e32..15148e4f7f 100644 --- a/sysdeps/x86_64/multiarch/strcmp.S +++ b/sysdeps/x86_64/multiarch/strcmp.S @@ -120,10 +120,8 @@ STRCMP_SSE42: ja LABEL(crosscache) /* rsi: 16-byte load will cross cache line */ cmp $0x30, %eax ja LABEL(crosscache) /* rdi: 16-byte load will cross cache line */ - movlpd (%rdi), %xmm1 - movlpd (%rsi), %xmm2 - movhpd 8(%rdi), %xmm1 - movhpd 8(%rsi), %xmm2 + movdqu (%rdi), %xmm1 + movdqu (%rsi), %xmm2 pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */ pcmpeqb %xmm1, %xmm0 /* Any null chars? */ pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */ @@ -1492,11 +1490,8 @@ LABEL(less16bytes): sub %rdx, %r11 jbe LABEL(strcmp_exitz) #endif - xor %ecx, %ecx /* clear %ecx */ - xor %eax, %eax /* clear %eax */ - - movb (%rsi, %rdx), %cl - movb (%rdi, %rdx), %al + movzbl (%rsi, %rdx), %ecx + movzbl (%rdi, %rdx), %eax sub %ecx, %eax ret -- cgit v1.2.3 From aa7492d20e5a2cef54dab7b41f534048b3eca479 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Sun, 26 Jul 2009 12:55:03 -0700 Subject: Compatibility of signalfd/eventfd with older kernels. --- ChangeLog | 8 ++++++++ sysdeps/unix/sysv/linux/eventfd.c | 15 ++++++++++++--- sysdeps/unix/sysv/linux/kernel-features.h | 2 ++ sysdeps/unix/sysv/linux/signalfd.c | 15 ++++++++++++--- 4 files changed, 34 insertions(+), 6 deletions(-) (limited to 'sysdeps') diff --git a/ChangeLog b/ChangeLog index 977f0f8936..d32f15b08a 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,13 @@ 2009-07-26 Ulrich Drepper + [BZ #10422] + * sysdeps/unix/sysv/linux/eventfd.c: Add compatibility for old + kernels, dropped when eventfd2 support was added. + * sysdeps/unix/sysv/linux/signalfd.c: Add compatibility for old + kernels, dropped when signalfd4 support was added. + * sysdeps/unix/sysv/linux/kernel-features.h: More CLOEXEC syscalls + added, name them. 
+ [BZ #10452] * resolv/res_send.c (send_dg): Pass full SERVFAIL, NOTIMP, REFUSED replies up. diff --git a/sysdeps/unix/sysv/linux/eventfd.c b/sysdeps/unix/sysv/linux/eventfd.c index 4cd557983e..7f69ecdb8c 100644 --- a/sysdeps/unix/sysv/linux/eventfd.c +++ b/sysdeps/unix/sysv/linux/eventfd.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007, 2008 Free Software Foundation, Inc. +/* Copyright (C) 2007, 2008, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -19,14 +19,21 @@ #include #include #include +#include int eventfd (int count, int flags) { #ifdef __NR_eventfd2 - return INLINE_SYSCALL (eventfd2, 2, count, flags); -#else + int res = INLINE_SYSCALL (eventfd2, 2, count, flags); +# ifndef __ASSUME_EVENTFD2 + if (res != -1 || errno != ENOSYS) +# endif + return res; +#endif + +#ifndef __ASSUME_EVENTFD2 /* The old system call has no flag parameter which is bad. So we have to wait until we have to support to pass additional values to the kernel (sys_indirect) before implementing setting flags like @@ -43,5 +50,7 @@ eventfd (int count, int flags) __set_errno (ENOSYS); return -1; # endif +#elif !defined __NR_eventfd2 +# error "__ASSUME_EVENTFD2 defined but not __NR_eventfd2" #endif } diff --git a/sysdeps/unix/sysv/linux/kernel-features.h b/sysdeps/unix/sysv/linux/kernel-features.h index 4562515790..ff065effb5 100644 --- a/sysdeps/unix/sysv/linux/kernel-features.h +++ b/sysdeps/unix/sysv/linux/kernel-features.h @@ -516,6 +516,8 @@ # define __ASSUME_SOCK_CLOEXEC 1 # define __ASSUME_IN_NONBLOCK 1 # define __ASSUME_PIPE2 1 +# define __ASSUME_EVENTFD2 1 +# define __ASSUME_SIGNALFD4 1 #endif /* Support for the accept4 syscall was added in 2.6.28. 
*/ diff --git a/sysdeps/unix/sysv/linux/signalfd.c b/sysdeps/unix/sysv/linux/signalfd.c index 9898f29231..c2d974a45d 100644 --- a/sysdeps/unix/sysv/linux/signalfd.c +++ b/sysdeps/unix/sysv/linux/signalfd.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007, 2008 Free Software Foundation, Inc. +/* Copyright (C) 2007, 2008, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -20,14 +20,21 @@ #include #include #include +#include int signalfd (int fd, const sigset_t *mask, int flags) { #ifdef __NR_signalfd4 - return INLINE_SYSCALL (signalfd4, 4, fd, mask, _NSIG / 8, flags); -#else + int res = INLINE_SYSCALL (signalfd4, 4, fd, mask, _NSIG / 8, flags); +# ifndef __ASSUME_SIGNALFD4 + if (res != -1 || errno != ENOSYS) +# endif + return res; +#endif + +#ifndef __ASSUME_SIGNALFD4 /* The old system call has no flag parameter which is bad. So we have to wait until we have to support to pass additional values to the kernel (sys_indirect) before implementing setting flags like @@ -44,5 +51,7 @@ signalfd (int fd, const sigset_t *mask, int flags) __set_errno (ENOSYS); return -1; # endif +#elif !defined __NR_signalfd4 +# error "__ASSUME_SIGNALFD4 defined but not __NR_signalfd4" #endif } -- cgit v1.2.3 From 7956a3d27c6552f57c8b1c3893d55e501fe30e14 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Sun, 26 Jul 2009 13:32:28 -0700 Subject: Add SSE2 support to str{,n}cmp for x86-64. 
--- sysdeps/x86_64/multiarch/Makefile | 2 +- sysdeps/x86_64/multiarch/strcmp.S | 361 ++----- sysdeps/x86_64/multiarch/strncmp-c.c | 8 - sysdeps/x86_64/strcmp.S | 1948 +++++++++++++++++++++++++++++++++- sysdeps/x86_64/strncmp.S | 3 + 5 files changed, 2055 insertions(+), 267 deletions(-) delete mode 100644 sysdeps/x86_64/multiarch/strncmp-c.c create mode 100644 sysdeps/x86_64/strncmp.S (limited to 'sysdeps') diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 5ce14aad8d..b066402204 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -4,7 +4,7 @@ gen-as-const-headers += ifunc-defines.sym endif ifeq ($(subdir),string) -sysdep_routines += stpncpy-c strncpy-c strncmp-c +sysdep_routines += stpncpy-c strncpy-c ifeq (yes,$(config-cflags-sse4)) sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c CFLAGS-strcspn-c.c += -msse4 diff --git a/sysdeps/x86_64/multiarch/strcmp.S b/sysdeps/x86_64/multiarch/strcmp.S index 15148e4f7f..1a315737af 100644 --- a/sysdeps/x86_64/multiarch/strcmp.S +++ b/sysdeps/x86_64/multiarch/strcmp.S @@ -28,9 +28,9 @@ /* calculate left number to compare */ \ lea -16(%rcx, %r11), %r9; \ cmp %r9, %r11; \ - jb LABEL(strcmp_exitz); \ + jb LABEL(strcmp_exitz_sse4_2); \ test %r9, %r9; \ - je LABEL(strcmp_exitz); \ + je LABEL(strcmp_exitz_sse4_2); \ mov %r9, %r11 #define STRCMP_SSE42 __strncmp_sse42 @@ -106,9 +106,9 @@ STRCMP_SSE42: */ #ifdef USE_AS_STRNCMP test %rdx, %rdx - je LABEL(strcmp_exitz) + je LABEL(strcmp_exitz_sse4_2) cmp $1, %rdx - je LABEL(Byte0) + je LABEL(Byte0_sse4_2) mov %rdx, %r11 #endif mov %esi, %ecx @@ -117,9 +117,9 @@ STRCMP_SSE42: and $0x3f, %rcx /* rsi alignment in cache line */ and $0x3f, %rax /* rdi alignment in cache line */ cmp $0x30, %ecx - ja LABEL(crosscache) /* rsi: 16-byte load will cross cache line */ + ja LABEL(crosscache_sse4_2)/* rsi: 16-byte load will cross cache line */ cmp $0x30, %eax - ja LABEL(crosscache) /* rdi: 16-byte load will 
cross cache line */ + ja LABEL(crosscache_sse4_2)/* rdi: 16-byte load will cross cache line */ movdqu (%rdi), %xmm1 movdqu (%rsi), %xmm2 pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */ @@ -128,10 +128,10 @@ STRCMP_SSE42: psubb %xmm0, %xmm1 /* packed sub of comparison results*/ pmovmskb %xmm1, %edx sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */ - jnz LABEL(less16bytes) /* If not, find different value or null char */ + jnz LABEL(less16bytes_sse4_2)/* If not, find different value or null char */ #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) /* finish comparision */ + jbe LABEL(strcmp_exitz_sse4_2)/* finish comparision */ #endif add $16, %rsi /* prepare to search next 16 bytes */ add $16, %rdi /* prepare to search next 16 bytes */ @@ -142,7 +142,7 @@ STRCMP_SSE42: * below to use. */ .p2align 4 -LABEL(crosscache): +LABEL(crosscache_sse4_2): and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */ and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */ mov $0xffff, %edx /* for equivalent offset */ @@ -150,15 +150,15 @@ LABEL(crosscache): and $0xf, %ecx /* offset of rsi */ and $0xf, %eax /* offset of rdi */ cmp %eax, %ecx - je LABEL(ashr_0) /* rsi and rdi relative offset same */ - ja LABEL(bigger) + je LABEL(ashr_0_sse4_2) /* rsi and rdi relative offset same */ + ja LABEL(bigger_sse4_2) mov %edx, %r8d /* r8d is offset flag for exit tail */ xchg %ecx, %eax xchg %rsi, %rdi -LABEL(bigger): +LABEL(bigger_sse4_2): lea 15(%rax), %r9 sub %rcx, %r9 - lea LABEL(unaligned_table)(%rip), %r10 + lea LABEL(unaligned_table_sse4_2)(%rip), %r10 movslq (%r10, %r9,4), %r9 lea (%r10, %r9), %r10 jmp *%r10 /* jump to corresponding case */ @@ -169,7 +169,7 @@ LABEL(bigger): * n(0~15) n(0~15) 15(15+ n-n) ashr_0 */ .p2align 4 -LABEL(ashr_0): +LABEL(ashr_0_sse4_2): movdqa (%rsi), %xmm1 pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */ @@ -184,7 +184,7 @@ LABEL(ashr_0): * edx must be the same with r9d if in left byte (16-rcx) is 
equal to * the start from (16-rax) and no null char was seen. */ - jne LABEL(less32bytes) /* mismatch or null char */ + jne LABEL(less32bytes_sse4_2) /* mismatch or null char */ UPDATE_STRNCMP_COUNTER mov $16, %rcx mov $16, %r9 @@ -203,7 +203,7 @@ LABEL(ashr_0_use_sse4_2): jbe LABEL(ashr_0_use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif movdqa (%rdi,%rdx), %xmm0 @@ -212,17 +212,17 @@ LABEL(ashr_0_use_sse4_2): jbe LABEL(ashr_0_use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif jmp LABEL(ashr_0_use_sse4_2) .p2align 4 LABEL(ashr_0_use_sse4_2_exit): - jnc LABEL(strcmp_exitz) + jnc LABEL(strcmp_exitz_sse4_2) #ifdef USE_AS_STRNCMP sub %rcx, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif lea -16(%rdx, %rcx), %rcx movzbl (%rdi, %rcx), %eax @@ -239,7 +239,7 @@ LABEL(ashr_0_use_sse4_2_exit): * n(15) n -15 0(15 +(n-15) - n) ashr_1 */ .p2align 4 -LABEL(ashr_1): +LABEL(ashr_1_sse4_2): pxor %xmm0, %xmm0 movdqa (%rdi), %xmm2 movdqa (%rsi), %xmm1 @@ -251,7 +251,7 @@ LABEL(ashr_1): shr %cl, %edx /* adjust 0xffff for offset */ shr %cl, %r9d /* adjust for 16-byte offset */ sub %r9d, %edx - jnz LABEL(less32bytes) /* mismatch or null char seen */ + jnz LABEL(less32bytes_sse4_2)/* mismatch or null char seen */ movdqa (%rdi), %xmm3 UPDATE_STRNCMP_COUNTER @@ -279,7 +279,7 @@ LABEL(loop_ashr_1_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx @@ -292,7 +292,7 @@ LABEL(loop_ashr_1_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx jmp LABEL(loop_ashr_1_use_sse4_2) @@ -318,7 +318,7 @@ LABEL(nibble_ashr_1_use_sse4_2): * n(14~15) n -14 1(15 +(n-14) - n) ashr_2 */ .p2align 4 -LABEL(ashr_2): +LABEL(ashr_2_sse4_2): pxor %xmm0, 
%xmm0 movdqa (%rdi), %xmm2 movdqa (%rsi), %xmm1 @@ -330,7 +330,7 @@ LABEL(ashr_2): shr %cl, %edx shr %cl, %r9d sub %r9d, %edx - jnz LABEL(less32bytes) + jnz LABEL(less32bytes_sse4_2) movdqa (%rdi), %xmm3 UPDATE_STRNCMP_COUNTER @@ -358,7 +358,7 @@ LABEL(loop_ashr_2_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx @@ -371,7 +371,7 @@ LABEL(loop_ashr_2_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx jmp LABEL(loop_ashr_2_use_sse4_2) @@ -397,7 +397,7 @@ LABEL(nibble_ashr_2_use_sse4_2): * n(13~15) n -13 2(15 +(n-13) - n) ashr_3 */ .p2align 4 -LABEL(ashr_3): +LABEL(ashr_3_sse4_2): pxor %xmm0, %xmm0 movdqa (%rdi), %xmm2 movdqa (%rsi), %xmm1 @@ -409,7 +409,7 @@ LABEL(ashr_3): shr %cl, %edx shr %cl, %r9d sub %r9d, %edx - jnz LABEL(less32bytes) + jnz LABEL(less32bytes_sse4_2) movdqa (%rdi), %xmm3 UPDATE_STRNCMP_COUNTER @@ -437,7 +437,7 @@ LABEL(loop_ashr_3_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx @@ -450,7 +450,7 @@ LABEL(loop_ashr_3_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx jmp LABEL(loop_ashr_3_use_sse4_2) @@ -476,7 +476,7 @@ LABEL(nibble_ashr_3_use_sse4_2): * n(12~15) n -12 3(15 +(n-12) - n) ashr_4 */ .p2align 4 -LABEL(ashr_4): +LABEL(ashr_4_sse4_2): pxor %xmm0, %xmm0 movdqa (%rdi), %xmm2 movdqa (%rsi), %xmm1 @@ -488,7 +488,7 @@ LABEL(ashr_4): shr %cl, %edx shr %cl, %r9d sub %r9d, %edx - jnz LABEL(less32bytes) + jnz LABEL(less32bytes_sse4_2) movdqa (%rdi), %xmm3 UPDATE_STRNCMP_COUNTER @@ -517,7 +517,7 @@ LABEL(loop_ashr_4_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe 
LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx @@ -530,7 +530,7 @@ LABEL(loop_ashr_4_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx jmp LABEL(loop_ashr_4_use_sse4_2) @@ -556,7 +556,7 @@ LABEL(nibble_ashr_4_use_sse4_2): * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5 */ .p2align 4 -LABEL(ashr_5): +LABEL(ashr_5_sse4_2): pxor %xmm0, %xmm0 movdqa (%rdi), %xmm2 movdqa (%rsi), %xmm1 @@ -568,7 +568,7 @@ LABEL(ashr_5): shr %cl, %edx shr %cl, %r9d sub %r9d, %edx - jnz LABEL(less32bytes) + jnz LABEL(less32bytes_sse4_2) movdqa (%rdi), %xmm3 UPDATE_STRNCMP_COUNTER @@ -597,7 +597,7 @@ LABEL(loop_ashr_5_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx @@ -611,7 +611,7 @@ LABEL(loop_ashr_5_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx jmp LABEL(loop_ashr_5_use_sse4_2) @@ -637,7 +637,7 @@ LABEL(nibble_ashr_5_use_sse4_2): * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6 */ .p2align 4 -LABEL(ashr_6): +LABEL(ashr_6_sse4_2): pxor %xmm0, %xmm0 movdqa (%rdi), %xmm2 movdqa (%rsi), %xmm1 @@ -649,7 +649,7 @@ LABEL(ashr_6): shr %cl, %edx shr %cl, %r9d sub %r9d, %edx - jnz LABEL(less32bytes) + jnz LABEL(less32bytes_sse4_2) movdqa (%rdi), %xmm3 UPDATE_STRNCMP_COUNTER @@ -678,7 +678,7 @@ LABEL(loop_ashr_6_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx @@ -691,7 +691,7 @@ LABEL(loop_ashr_6_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx jmp LABEL(loop_ashr_6_use_sse4_2) @@ -717,7 +717,7 @@ LABEL(nibble_ashr_6_use_sse4_2): * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7 */ 
.p2align 4 -LABEL(ashr_7): +LABEL(ashr_7_sse4_2): pxor %xmm0, %xmm0 movdqa (%rdi), %xmm2 movdqa (%rsi), %xmm1 @@ -729,7 +729,7 @@ LABEL(ashr_7): shr %cl, %edx shr %cl, %r9d sub %r9d, %edx - jnz LABEL(less32bytes) + jnz LABEL(less32bytes_sse4_2) movdqa (%rdi), %xmm3 UPDATE_STRNCMP_COUNTER @@ -758,7 +758,7 @@ LABEL(loop_ashr_7_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx @@ -771,7 +771,7 @@ LABEL(loop_ashr_7_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx jmp LABEL(loop_ashr_7_use_sse4_2) @@ -797,7 +797,7 @@ LABEL(nibble_ashr_7_use_sse4_2): * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8 */ .p2align 4 -LABEL(ashr_8): +LABEL(ashr_8_sse4_2): pxor %xmm0, %xmm0 movdqa (%rdi), %xmm2 movdqa (%rsi), %xmm1 @@ -809,7 +809,7 @@ LABEL(ashr_8): shr %cl, %edx shr %cl, %r9d sub %r9d, %edx - jnz LABEL(less32bytes) + jnz LABEL(less32bytes_sse4_2) movdqa (%rdi), %xmm3 UPDATE_STRNCMP_COUNTER @@ -838,7 +838,7 @@ LABEL(loop_ashr_8_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx @@ -851,7 +851,7 @@ LABEL(loop_ashr_8_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx jmp LABEL(loop_ashr_8_use_sse4_2) @@ -877,7 +877,7 @@ LABEL(nibble_ashr_8_use_sse4_2): * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9 */ .p2align 4 -LABEL(ashr_9): +LABEL(ashr_9_sse4_2): pxor %xmm0, %xmm0 movdqa (%rdi), %xmm2 movdqa (%rsi), %xmm1 @@ -889,7 +889,7 @@ LABEL(ashr_9): shr %cl, %edx shr %cl, %r9d sub %r9d, %edx - jnz LABEL(less32bytes) + jnz LABEL(less32bytes_sse4_2) movdqa (%rdi), %xmm3 UPDATE_STRNCMP_COUNTER @@ -919,7 +919,7 @@ LABEL(loop_ashr_9_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef 
USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx @@ -932,7 +932,7 @@ LABEL(loop_ashr_9_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx jmp LABEL(loop_ashr_9_use_sse4_2) @@ -958,7 +958,7 @@ LABEL(nibble_ashr_9_use_sse4_2): * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10 */ .p2align 4 -LABEL(ashr_10): +LABEL(ashr_10_sse4_2): pxor %xmm0, %xmm0 movdqa (%rdi), %xmm2 movdqa (%rsi), %xmm1 @@ -970,7 +970,7 @@ LABEL(ashr_10): shr %cl, %edx shr %cl, %r9d sub %r9d, %edx - jnz LABEL(less32bytes) + jnz LABEL(less32bytes_sse4_2) movdqa (%rdi), %xmm3 UPDATE_STRNCMP_COUNTER @@ -999,7 +999,7 @@ LABEL(loop_ashr_10_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx @@ -1012,7 +1012,7 @@ LABEL(loop_ashr_10_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx jmp LABEL(loop_ashr_10_use_sse4_2) @@ -1038,7 +1038,7 @@ LABEL(nibble_ashr_10_use_sse4_2): * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11 */ .p2align 4 -LABEL(ashr_11): +LABEL(ashr_11_sse4_2): pxor %xmm0, %xmm0 movdqa (%rdi), %xmm2 movdqa (%rsi), %xmm1 @@ -1050,7 +1050,7 @@ LABEL(ashr_11): shr %cl, %edx shr %cl, %r9d sub %r9d, %edx - jnz LABEL(less32bytes) + jnz LABEL(less32bytes_sse4_2) movdqa (%rdi), %xmm3 UPDATE_STRNCMP_COUNTER @@ -1079,7 +1079,7 @@ LABEL(loop_ashr_11_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx @@ -1092,7 +1092,7 @@ LABEL(loop_ashr_11_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx jmp LABEL(loop_ashr_11_use_sse4_2) @@ -1118,7 +1118,7 
@@ LABEL(nibble_ashr_11_use_sse4_2): * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12 */ .p2align 4 -LABEL(ashr_12): +LABEL(ashr_12_sse4_2): pxor %xmm0, %xmm0 movdqa (%rdi), %xmm2 movdqa (%rsi), %xmm1 @@ -1130,7 +1130,7 @@ LABEL(ashr_12): shr %cl, %edx shr %cl, %r9d sub %r9d, %edx - jnz LABEL(less32bytes) + jnz LABEL(less32bytes_sse4_2) movdqa (%rdi), %xmm3 UPDATE_STRNCMP_COUNTER @@ -1159,7 +1159,7 @@ LABEL(loop_ashr_12_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx @@ -1172,7 +1172,7 @@ LABEL(loop_ashr_12_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx jmp LABEL(loop_ashr_12_use_sse4_2) @@ -1198,7 +1198,7 @@ LABEL(nibble_ashr_12_use_sse4_2): * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13 */ .p2align 4 -LABEL(ashr_13): +LABEL(ashr_13_sse4_2): pxor %xmm0, %xmm0 movdqa (%rdi), %xmm2 movdqa (%rsi), %xmm1 @@ -1210,7 +1210,7 @@ LABEL(ashr_13): shr %cl, %edx shr %cl, %r9d sub %r9d, %edx - jnz LABEL(less32bytes) + jnz LABEL(less32bytes_sse4_2) movdqa (%rdi), %xmm3 UPDATE_STRNCMP_COUNTER @@ -1240,7 +1240,7 @@ LABEL(loop_ashr_13_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx @@ -1253,7 +1253,7 @@ LABEL(loop_ashr_13_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx jmp LABEL(loop_ashr_13_use_sse4_2) @@ -1279,7 +1279,7 @@ LABEL(nibble_ashr_13_use_sse4_2): * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14 */ .p2align 4 -LABEL(ashr_14): +LABEL(ashr_14_sse4_2): pxor %xmm0, %xmm0 movdqa (%rdi), %xmm2 movdqa (%rsi), %xmm1 @@ -1291,7 +1291,7 @@ LABEL(ashr_14): shr %cl, %edx shr %cl, %r9d sub %r9d, %edx - jnz LABEL(less32bytes) + jnz LABEL(less32bytes_sse4_2) movdqa 
(%rdi), %xmm3 UPDATE_STRNCMP_COUNTER @@ -1321,7 +1321,7 @@ LABEL(loop_ashr_14_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx @@ -1334,7 +1334,7 @@ LABEL(loop_ashr_14_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx jmp LABEL(loop_ashr_14_use_sse4_2) @@ -1360,7 +1360,7 @@ LABEL(nibble_ashr_14_use_sse4_2): * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15 */ .p2align 4 -LABEL(ashr_15): +LABEL(ashr_15_sse4_2): pxor %xmm0, %xmm0 movdqa (%rdi), %xmm2 movdqa (%rsi), %xmm1 @@ -1372,7 +1372,7 @@ LABEL(ashr_15): shr %cl, %edx shr %cl, %r9d sub %r9d, %edx - jnz LABEL(less32bytes) + jnz LABEL(less32bytes_sse4_2) movdqa (%rdi), %xmm3 @@ -1404,7 +1404,7 @@ LABEL(loop_ashr_15_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx @@ -1417,7 +1417,7 @@ LABEL(loop_ashr_15_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx jmp LABEL(loop_ashr_15_use_sse4_2) @@ -1439,56 +1439,37 @@ LABEL(nibble_ashr_use_sse4_2_exit): pcmpistri $0x1a,(%rsi,%rdx), %xmm0 .p2align 4 LABEL(use_sse4_2_exit): - jnc LABEL(strcmp_exitz) + jnc LABEL(strcmp_exitz_sse4_2) #ifdef USE_AS_STRNCMP sub %rcx, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add %rcx, %rdx lea -16(%rdi, %r9), %rdi movzbl (%rdi, %rdx), %eax movzbl (%rsi, %rdx), %edx test %r8d, %r8d - jz LABEL(use_sse4_2_ret) + jz LABEL(use_sse4_2_ret_sse4_2) xchg %eax, %edx -LABEL(use_sse4_2_ret): +LABEL(use_sse4_2_ret_sse4_2): sub %edx, %eax ret -#if 0 - /* This code was in the origial submission but isn't used. 
- --drepper */ - .p2align 4 -LABEL(aftertail): - pcmpeqb %xmm3, %xmm1 - psubb %xmm0, %xmm1 - pmovmskb %xmm1, %edx - not %edx - - .p2align 4 -LABEL(exit): - lea -16(%r9, %rcx), %rax /* locate the exact offset for rdi */ -#endif - -LABEL(less32bytes): +LABEL(less32bytes_sse4_2): lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */ lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */ test %r8d, %r8d - jz LABEL(ret) + jz LABEL(ret_sse4_2) xchg %rsi, %rdi /* recover original order according to flag(%r8d) */ .p2align 4 -LABEL(ret): -LABEL(less16bytes): - /* - * Check to see if BSF is fast on this processor. If not, use a different - * exit tail. - */ +LABEL(ret_sse4_2): +LABEL(less16bytes_sse4_2): bsf %rdx, %rdx /* find and store bit index in %rdx */ #ifdef USE_AS_STRNCMP sub %rdx, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif movzbl (%rsi, %rdx), %ecx movzbl (%rdi, %rdx), %eax @@ -1496,139 +1477,15 @@ LABEL(less16bytes): sub %ecx, %eax ret -LABEL(strcmp_exitz): +LABEL(strcmp_exitz_sse4_2): xor %eax, %eax ret .p2align 4 -LABEL(Byte0): - /* - * never need to handle byte 0 for strncmpy -#ifdef USE_AS_STRNCMP - sub $0, %r11 - jbe LABEL(strcmp_exitz) -#endif - */ +LABEL(Byte0_sse4_2): movzx (%rsi), %ecx movzx (%rdi), %eax - sub %ecx, %eax - ret - - .p2align 4 -LABEL(Byte1): - -#ifdef USE_AS_STRNCMP - sub $1, %r11 - jbe LABEL(strcmp_exitz) -#endif - movzx 1(%rsi), %ecx - movzx 1(%rdi), %eax - - sub %ecx, %eax - ret - - .p2align 4 -LABEL(Byte2): - -#ifdef USE_AS_STRNCMP - sub $2, %r11 - jbe LABEL(strcmp_exitz) -#endif - movzx 2(%rsi), %ecx - movzx 2(%rdi), %eax - - sub %ecx, %eax - ret - - .p2align 4 -LABEL(Byte3): - -#ifdef USE_AS_STRNCMP - sub $3, %r11 - jbe LABEL(strcmp_exitz) -#endif - movzx 3(%rsi), %ecx - movzx 3(%rdi), %eax - - sub %ecx, %eax - ret - - .p2align 4 -LABEL(Byte4): - -#ifdef USE_AS_STRNCMP - sub $4, %r11 - jbe LABEL(strcmp_exitz) -#endif - movzx 4(%rsi), %ecx - movzx 4(%rdi), %eax - - 
sub %ecx, %eax - ret - - .p2align 4 -LABEL(Byte5): - -#ifdef USE_AS_STRNCMP - sub $5, %r11 - jbe LABEL(strcmp_exitz) -#endif - movzx 5(%rsi), %ecx - movzx 5(%rdi), %eax - - sub %ecx, %eax - ret - - .p2align 4 -LABEL(Byte6): - -#ifdef USE_AS_STRNCMP - sub $6, %r11 - jbe LABEL(strcmp_exitz) -#endif - movzx 6(%rsi), %ecx - movzx 6(%rdi), %eax - - sub %ecx, %eax - ret - - .p2align 4 -LABEL(next_8_bytes): - add $8, %rdi - add $8, %rsi -#ifdef USE_AS_STRNCMP - sub $8, %r11 - jbe LABEL(strcmp_exitz) -#endif - test $0x01, %dh - jnz LABEL(Byte0) - - test $0x02, %dh - jnz LABEL(Byte1) - - test $0x04, %dh - jnz LABEL(Byte2) - - test $0x08, %dh - jnz LABEL(Byte3) - - test $0x10, %dh - jnz LABEL(Byte4) - - test $0x20, %dh - jnz LABEL(Byte5) - - test $0x40, %dh - jnz LABEL(Byte6) - -#ifdef USE_AS_STRNCMP - sub $7, %r11 - jbe LABEL(strcmp_exitz) -#endif - movzx 7(%rsi), %ecx - movzx 7(%rdi), %eax - sub %ecx, %eax ret cfi_endproc @@ -1636,24 +1493,24 @@ LABEL(next_8_bytes): /* Put all SSE 4.2 functions together. 
*/ .section .rodata.sse4.2,"a",@progbits - .p2align 4 -LABEL(unaligned_table): - .int LABEL(ashr_1) - LABEL(unaligned_table) - .int LABEL(ashr_2) - LABEL(unaligned_table) - .int LABEL(ashr_3) - LABEL(unaligned_table) - .int LABEL(ashr_4) - LABEL(unaligned_table) - .int LABEL(ashr_5) - LABEL(unaligned_table) - .int LABEL(ashr_6) - LABEL(unaligned_table) - .int LABEL(ashr_7) - LABEL(unaligned_table) - .int LABEL(ashr_8) - LABEL(unaligned_table) - .int LABEL(ashr_9) - LABEL(unaligned_table) - .int LABEL(ashr_10) - LABEL(unaligned_table) - .int LABEL(ashr_11) - LABEL(unaligned_table) - .int LABEL(ashr_12) - LABEL(unaligned_table) - .int LABEL(ashr_13) - LABEL(unaligned_table) - .int LABEL(ashr_14) - LABEL(unaligned_table) - .int LABEL(ashr_15) - LABEL(unaligned_table) - .int LABEL(ashr_0) - LABEL(unaligned_table) + .p2align 3 +LABEL(unaligned_table_sse4_2): + .int LABEL(ashr_1_sse4_2) - LABEL(unaligned_table_sse4_2) + .int LABEL(ashr_2_sse4_2) - LABEL(unaligned_table_sse4_2) + .int LABEL(ashr_3_sse4_2) - LABEL(unaligned_table_sse4_2) + .int LABEL(ashr_4_sse4_2) - LABEL(unaligned_table_sse4_2) + .int LABEL(ashr_5_sse4_2) - LABEL(unaligned_table_sse4_2) + .int LABEL(ashr_6_sse4_2) - LABEL(unaligned_table_sse4_2) + .int LABEL(ashr_7_sse4_2) - LABEL(unaligned_table_sse4_2) + .int LABEL(ashr_8_sse4_2) - LABEL(unaligned_table_sse4_2) + .int LABEL(ashr_9_sse4_2) - LABEL(unaligned_table_sse4_2) + .int LABEL(ashr_10_sse4_2) - LABEL(unaligned_table_sse4_2) + .int LABEL(ashr_11_sse4_2) - LABEL(unaligned_table_sse4_2) + .int LABEL(ashr_12_sse4_2) - LABEL(unaligned_table_sse4_2) + .int LABEL(ashr_13_sse4_2) - LABEL(unaligned_table_sse4_2) + .int LABEL(ashr_14_sse4_2) - LABEL(unaligned_table_sse4_2) + .int LABEL(ashr_15_sse4_2) - LABEL(unaligned_table_sse4_2) + .int LABEL(ashr_0_sse4_2) - LABEL(unaligned_table_sse4_2) # undef ENTRY @@ -1673,6 +1530,4 @@ LABEL(unaligned_table): .globl __GI_STRCMP; __GI_STRCMP = STRCMP_SSE2 #endif -#ifndef USE_AS_STRNCMP #include "../strcmp.S" -#endif 
diff --git a/sysdeps/x86_64/multiarch/strncmp-c.c b/sysdeps/x86_64/multiarch/strncmp-c.c deleted file mode 100644 index d4f74a418d..0000000000 --- a/sysdeps/x86_64/multiarch/strncmp-c.c +++ /dev/null @@ -1,8 +0,0 @@ -#ifdef SHARED -#define STRNCMP __strncmp_sse2 -#undef libc_hidden_builtin_def -#define libc_hidden_builtin_def(name) \ - __hidden_ver1 (__strncmp_sse2, __GI_strncmp, __strncmp_sse2); -#endif - -#include "strncmp.c" diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S index 119b88e40b..340a64ba35 100644 --- a/sysdeps/x86_64/strcmp.S +++ b/sysdeps/x86_64/strcmp.S @@ -1,8 +1,10 @@ /* Highly optimized version for x86-64. - Copyright (C) 1999, 2000, 2002, 2003, 2005 Free Software Foundation, Inc. + Copyright (C) 1999, 2000, 2002, 2003, 2005, 2009 + Free Software Foundation, Inc. This file is part of the GNU C Library. Based on i686 version contributed by Ulrich Drepper , 1999. + Updated with SSE2 support contributed by Intel Corporation. The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public @@ -24,8 +26,35 @@ #include "bp-sym.h" #include "bp-asm.h" - .text -ENTRY (BP_SYM (strcmp)) +#undef UPDATE_STRNCMP_COUNTER + +#ifndef LABEL +#define LABEL(l) L(l) +#endif + +#ifdef USE_AS_STRNCMP +/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz + if the new counter > the old one or is 0. */ +# define UPDATE_STRNCMP_COUNTER \ + /* calculate left number to compare */ \ + lea -16(%rcx, %r11), %r9; \ + cmp %r9, %r11; \ + jb LABEL(strcmp_exitz); \ + test %r9, %r9; \ + je LABEL(strcmp_exitz); \ + mov %r9, %r11 + +#else +# define UPDATE_STRNCMP_COUNTER +# ifndef STRCMP +# define STRCMP strcmp +# endif +#endif + + .text +ENTRY (BP_SYM (STRCMP)) +#ifdef NOT_IN_libc +/* Simple version since we can't use SSE registers in ld.so. 
*/ L(oop): movb (%rdi), %al cmpb (%rsi), %al jne L(neq) @@ -41,5 +70,1914 @@ L(neq): movl $1, %eax movl $-1, %ecx cmovbl %ecx, %eax ret -END (BP_SYM (strcmp)) -libc_hidden_builtin_def (strcmp) +END (BP_SYM (STRCMP)) +#else /* NOT_IN_libc */ +/* + * This implementation uses SSE to compare up to 16 bytes at a time. + */ +#ifdef USE_AS_STRNCMP + test %rdx, %rdx + je LABEL(strcmp_exitz) + cmp $1, %rdx + je LABEL(Byte0) + mov %rdx, %r11 +#endif + mov %esi, %ecx + mov %edi, %eax +/* Use 64bit AND here to avoid long NOP padding. */ + and $0x3f, %rcx /* rsi alignment in cache line */ + and $0x3f, %rax /* rdi alignment in cache line */ + cmp $0x30, %ecx + ja LABEL(crosscache) /* rsi: 16-byte load will cross cache line */ + cmp $0x30, %eax + ja LABEL(crosscache) /* rdi: 16-byte load will cross cache line */ + movlpd (%rdi), %xmm1 + movlpd (%rsi), %xmm2 + movhpd 8(%rdi), %xmm1 + movhpd 8(%rsi), %xmm2 + pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */ + pcmpeqb %xmm1, %xmm0 /* Any null chars? */ + pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */ + jnz LABEL(less16bytes) /* If not, find different value or null char */ +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) /* finish comparision */ +#endif + add $16, %rsi /* prepare to search next 16 bytes */ + add $16, %rdi /* prepare to search next 16 bytes */ + + /* + * Determine source and destination string offsets from 16-byte alignment. + * Use relative offset difference between the two to determine which case + * below to use. 
+ */ + .p2align 4 +LABEL(crosscache): + and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */ + and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */ + mov $0xffff, %edx /* for equivalent offset */ + xor %r8d, %r8d + and $0xf, %ecx /* offset of rsi */ + and $0xf, %eax /* offset of rdi */ + cmp %eax, %ecx + je LABEL(ashr_0) /* rsi and rdi relative offset same */ + ja LABEL(bigger) + mov %edx, %r8d /* r8d is offset flag for exit tail */ + xchg %ecx, %eax + xchg %rsi, %rdi +LABEL(bigger): + lea 15(%rax), %r9 + sub %rcx, %r9 + lea LABEL(unaligned_table)(%rip), %r10 + movslq (%r10, %r9,4), %r9 + lea (%r10, %r9), %r10 + jmp *%r10 /* jump to corresponding case */ + +/* + * The following cases will be handled by ashr_0 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(0~15) n(0~15) 15(15+ n-n) ashr_0 + */ + .p2align 4 +LABEL(ashr_0): + + movdqa (%rsi), %xmm1 + pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */ + pcmpeqb %xmm1, %xmm0 /* Any null chars? */ + pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %r9d + shr %cl, %edx /* adjust 0xffff for offset */ + shr %cl, %r9d /* adjust for 16-byte offset */ + sub %r9d, %edx + /* + * edx must be the same with r9d if in left byte (16-rcx) is equal to + * the start from (16-rax) and no null char was seen. + */ + jne LABEL(less32bytes) /* mismatch or null char */ + UPDATE_STRNCMP_COUNTER + mov $16, %rcx + mov $16, %r9 + pxor %xmm0, %xmm0 /* clear xmm0, may have changed above */ + + /* + * Now both strings are aligned at 16-byte boundary. Loop over strings + * checking 32-bytes per iteration. 
+ */ + .p2align 4 +LABEL(loop_ashr_0): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) /* mismatch or null char seen */ + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rcx + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rcx + jmp LABEL(loop_ashr_0) + +/* + * The following cases will be handled by ashr_1 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(15) n -15 0(15 +(n-15) - n) ashr_1 + */ + .p2align 4 +LABEL(ashr_1): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 /* Any null chars? */ + pslldq $15, %xmm2 /* shift first string to align with second */ + pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %r9d + shr %cl, %edx /* adjust 0xffff for offset */ + shr %cl, %r9d /* adjust for 16-byte offset */ + sub %r9d, %edx + jnz LABEL(less32bytes) /* mismatch or null char seen */ + movdqa (%rdi), %xmm3 + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads*/ + mov $1, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 1(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_1): + add $16, %r10 + jg LABEL(nibble_ashr_1) /* cross page boundary */ + +LABEL(gobble_ashr_1): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 /* store for next cycle */ + + psrldq $1, %xmm3 + pslldq $15, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_1) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 /* store for next cycle */ + + psrldq $1, %xmm3 + pslldq $15, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_1) + + /* + * Nibble avoids loads across page boundary. This is to avoid a potential + * access into unmapped memory. + */ + .p2align 4 +LABEL(nibble_ashr_1): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char*/ + pmovmskb %xmm0, %edx + test $0xfffe, %edx + jnz LABEL(ashr_1_exittail) /* find null char*/ + +#ifdef USE_AS_STRNCMP + cmp $14, %r11 + jbe LABEL(ashr_1_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 /* substract 4K from %r10 */ + jmp LABEL(gobble_ashr_1) + + /* + * Once find null char, determine if there is a string mismatch + * before the null char. 
+ */ + .p2align 4 +LABEL(ashr_1_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $1, %xmm0 + psrldq $1, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_2 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(14~15) n -14 1(15 +(n-14) - n) ashr_2 + */ + .p2align 4 +LABEL(ashr_2): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $14, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $2, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 2(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_2): + add $16, %r10 + jg LABEL(nibble_ashr_2) + +LABEL(gobble_ashr_2): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $2, %xmm3 + pslldq $14, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_2) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $2, %xmm3 + pslldq $14, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp 
LABEL(loop_ashr_2) + + .p2align 4 +LABEL(nibble_ashr_2): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xfffc, %edx + jnz LABEL(ashr_2_exittail) + +#ifdef USE_AS_STRNCMP + cmp $13, %r11 + jbe LABEL(ashr_2_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_2) + + .p2align 4 +LABEL(ashr_2_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $2, %xmm0 + psrldq $2, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_3 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(13~15) n -13 2(15 +(n-13) - n) ashr_3 + */ + .p2align 4 +LABEL(ashr_3): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $13, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $3, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 3(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_3): + add $16, %r10 + jg LABEL(nibble_ashr_3) + +LABEL(gobble_ashr_3): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $3, %xmm3 + pslldq $13, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_3) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $3, %xmm3 + pslldq $13, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_3) + + .p2align 4 +LABEL(nibble_ashr_3): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xfff8, %edx + jnz LABEL(ashr_3_exittail) + +#ifdef USE_AS_STRNCMP + cmp $12, %r11 + jbe LABEL(ashr_3_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_3) + + .p2align 4 +LABEL(ashr_3_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $3, %xmm0 + psrldq $3, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_4 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(12~15) n -12 3(15 +(n-12) - n) ashr_4 + */ + .p2align 4 +LABEL(ashr_4): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $12, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + 
movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $4, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 4(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_4): + add $16, %r10 + jg LABEL(nibble_ashr_4) + +LABEL(gobble_ashr_4): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $4, %xmm3 + pslldq $12, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_4) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $4, %xmm3 + pslldq $12, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_4) + + .p2align 4 +LABEL(nibble_ashr_4): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xfff0, %edx + jnz LABEL(ashr_4_exittail) + +#ifdef USE_AS_STRNCMP + cmp $11, %r11 + jbe LABEL(ashr_4_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_4) + + .p2align 4 +LABEL(ashr_4_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $4, %xmm0 + psrldq $4, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_5 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case 
+ * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5 + */ + .p2align 4 +LABEL(ashr_5): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $11, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $5, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 5(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_5): + add $16, %r10 + jg LABEL(nibble_ashr_5) + +LABEL(gobble_ashr_5): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $5, %xmm3 + pslldq $11, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_5) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $5, %xmm3 + pslldq $11, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_5) + + .p2align 4 +LABEL(nibble_ashr_5): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xffe0, %edx + jnz LABEL(ashr_5_exittail) + +#ifdef USE_AS_STRNCMP + cmp $10, %r11 + jbe LABEL(ashr_5_exittail) +#endif 
+ + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_5) + + .p2align 4 +LABEL(ashr_5_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $5, %xmm0 + psrldq $5, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_6 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6 + */ + .p2align 4 +LABEL(ashr_6): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $10, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $6, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 6(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_6): + add $16, %r10 + jg LABEL(nibble_ashr_6) + +LABEL(gobble_ashr_6): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $6, %xmm3 + pslldq $10, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_6) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $6, %xmm3 + pslldq $10, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe 
LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_6) + + .p2align 4 +LABEL(nibble_ashr_6): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xffc0, %edx + jnz LABEL(ashr_6_exittail) + +#ifdef USE_AS_STRNCMP + cmp $9, %r11 + jbe LABEL(ashr_6_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_6) + + .p2align 4 +LABEL(ashr_6_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $6, %xmm0 + psrldq $6, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_7 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7 + */ + .p2align 4 +LABEL(ashr_7): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $9, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $7, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 7(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_7): + add $16, %r10 + jg LABEL(nibble_ashr_7) + +LABEL(gobble_ashr_7): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $7, %xmm3 + pslldq $9, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_7) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $7, %xmm3 + pslldq $9, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_7) + + .p2align 4 +LABEL(nibble_ashr_7): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xff80, %edx + jnz LABEL(ashr_7_exittail) + +#ifdef USE_AS_STRNCMP + cmp $8, %r11 + jbe LABEL(ashr_7_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_7) + + .p2align 4 +LABEL(ashr_7_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $7, %xmm0 + psrldq $7, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_8 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8 + */ + .p2align 4 +LABEL(ashr_8): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $8, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa 
(%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $8, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 8(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_8): + add $16, %r10 + jg LABEL(nibble_ashr_8) + +LABEL(gobble_ashr_8): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $8, %xmm3 + pslldq $8, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_8) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $8, %xmm3 + pslldq $8, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_8) + + .p2align 4 +LABEL(nibble_ashr_8): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xff00, %edx + jnz LABEL(ashr_8_exittail) + +#ifdef USE_AS_STRNCMP + cmp $7, %r11 + jbe LABEL(ashr_8_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_8) + + .p2align 4 +LABEL(ashr_8_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $8, %xmm0 + psrldq $8, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_9 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * 
n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9 + */ + .p2align 4 +LABEL(ashr_9): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $7, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $9, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 9(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_9): + add $16, %r10 + jg LABEL(nibble_ashr_9) + +LABEL(gobble_ashr_9): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $9, %xmm3 + pslldq $7, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_9) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $9, %xmm3 + pslldq $7, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 /* store for next cycle */ + jmp LABEL(loop_ashr_9) + + .p2align 4 +LABEL(nibble_ashr_9): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xfe00, %edx + jnz LABEL(ashr_9_exittail) + +#ifdef USE_AS_STRNCMP + cmp $6, %r11 + jbe 
LABEL(ashr_9_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_9) + + .p2align 4 +LABEL(ashr_9_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $9, %xmm0 + psrldq $9, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_10 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10 + */ + .p2align 4 +LABEL(ashr_10): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $6, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $10, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 10(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_10): + add $16, %r10 + jg LABEL(nibble_ashr_10) + +LABEL(gobble_ashr_10): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $10, %xmm3 + pslldq $6, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_10) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $10, %xmm3 + pslldq $6, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_10) + + .p2align 4 +LABEL(nibble_ashr_10): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xfc00, %edx + jnz LABEL(ashr_10_exittail) + +#ifdef USE_AS_STRNCMP + cmp $5, %r11 + jbe LABEL(ashr_10_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_10) + + .p2align 4 +LABEL(ashr_10_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $10, %xmm0 + psrldq $10, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_11 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11 + */ + .p2align 4 +LABEL(ashr_11): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $5, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz 
LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $11, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 11(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_11): + add $16, %r10 + jg LABEL(nibble_ashr_11) + +LABEL(gobble_ashr_11): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $11, %xmm3 + pslldq $5, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_11) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $11, %xmm3 + pslldq $5, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_11) + + .p2align 4 +LABEL(nibble_ashr_11): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xf800, %edx + jnz LABEL(ashr_11_exittail) + +#ifdef USE_AS_STRNCMP + cmp $4, %r11 + jbe LABEL(ashr_11_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_11) + + .p2align 4 +LABEL(ashr_11_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $11, %xmm0 + psrldq $11, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_12 + * rcx(offset of rsi) rax(offset of rdi) 
relative offset corresponding case + * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12 + */ + .p2align 4 +LABEL(ashr_12): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $4, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $12, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 12(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_12): + add $16, %r10 + jg LABEL(nibble_ashr_12) + +LABEL(gobble_ashr_12): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $12, %xmm3 + pslldq $4, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_12) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $12, %xmm3 + pslldq $4, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_12) + + .p2align 4 +LABEL(nibble_ashr_12): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xf000, %edx + jnz LABEL(ashr_12_exittail) + +#ifdef USE_AS_STRNCMP + cmp 
$3, %r11 + jbe LABEL(ashr_12_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_12) + + .p2align 4 +LABEL(ashr_12_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $12, %xmm0 + psrldq $12, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_13 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13 + */ + .p2align 4 +LABEL(ashr_13): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $3, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $13, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 13(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_13): + add $16, %r10 + jg LABEL(nibble_ashr_13) + +LABEL(gobble_ashr_13): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $13, %xmm3 + pslldq $3, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_13) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $13, %xmm3 + pslldq $3, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_13) + + .p2align 4 +LABEL(nibble_ashr_13): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xe000, %edx + jnz LABEL(ashr_13_exittail) + +#ifdef USE_AS_STRNCMP + cmp $2, %r11 + jbe LABEL(ashr_13_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_13) + + .p2align 4 +LABEL(ashr_13_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $13, %xmm0 + psrldq $13, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_14 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14 + */ + .p2align 4 +LABEL(ashr_14): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $2, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz 
LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $14, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 14(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_14): + add $16, %r10 + jg LABEL(nibble_ashr_14) + +LABEL(gobble_ashr_14): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $14, %xmm3 + pslldq $2, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_14) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $14, %xmm3 + pslldq $2, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_14) + + .p2align 4 +LABEL(nibble_ashr_14): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xc000, %edx + jnz LABEL(ashr_14_exittail) + +#ifdef USE_AS_STRNCMP + cmp $1, %r11 + jbe LABEL(ashr_14_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_14) + + .p2align 4 +LABEL(ashr_14_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $14, %xmm0 + psrldq $14, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_15 + * rcx(offset of rsi) rax(offset of rdi) 
relative offset corresponding case + * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15 + */ + .p2align 4 +LABEL(ashr_15): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $1, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $15, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 15(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_15): + add $16, %r10 + jg LABEL(nibble_ashr_15) + +LABEL(gobble_ashr_15): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $15, %xmm3 + pslldq $1, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_15) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $15, %xmm3 + pslldq $1, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_15) + + .p2align 4 +LABEL(nibble_ashr_15): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0x8000, %edx + jnz LABEL(ashr_15_exittail) + +#ifdef USE_AS_STRNCMP + 
test %r11, %r11 + je LABEL(ashr_15_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_15) + + .p2align 4 +LABEL(ashr_15_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $15, %xmm3 + psrldq $15, %xmm0 + + .p2align 4 +LABEL(aftertail): + pcmpeqb %xmm3, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + not %edx + + .p2align 4 +LABEL(exit): + lea -16(%r9, %rcx), %rax /* locate the exact offset for rdi */ +LABEL(less32bytes): + lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */ + lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */ + test %r8d, %r8d + jz LABEL(ret) + xchg %rsi, %rdi /* recover original order according to flag(%r8d) */ + + .p2align 4 +LABEL(ret): +LABEL(less16bytes): + bsf %rdx, %rdx /* find and store bit index in %rdx */ + +#ifdef USE_AS_STRNCMP + sub %rdx, %r11 + jbe LABEL(strcmp_exitz) +#endif + movzbl (%rsi, %rdx), %ecx + movzbl (%rdi, %rdx), %eax + + sub %ecx, %eax + ret + +LABEL(strcmp_exitz): + xor %eax, %eax + ret + + .p2align 4 +LABEL(Byte0): + movzx (%rsi), %ecx + movzx (%rdi), %eax + + sub %ecx, %eax + ret +END (BP_SYM (STRCMP)) + + .section .rodata,"a",@progbits + .p2align 3 +LABEL(unaligned_table): + .int LABEL(ashr_1) - LABEL(unaligned_table) + .int LABEL(ashr_2) - LABEL(unaligned_table) + .int LABEL(ashr_3) - LABEL(unaligned_table) + .int LABEL(ashr_4) - LABEL(unaligned_table) + .int LABEL(ashr_5) - LABEL(unaligned_table) + .int LABEL(ashr_6) - LABEL(unaligned_table) + .int LABEL(ashr_7) - LABEL(unaligned_table) + .int LABEL(ashr_8) - LABEL(unaligned_table) + .int LABEL(ashr_9) - LABEL(unaligned_table) + .int LABEL(ashr_10) - LABEL(unaligned_table) + .int LABEL(ashr_11) - LABEL(unaligned_table) + .int LABEL(ashr_12) - LABEL(unaligned_table) + .int LABEL(ashr_13) - LABEL(unaligned_table) + .int LABEL(ashr_14) - LABEL(unaligned_table) + .int LABEL(ashr_15) - LABEL(unaligned_table) + .int LABEL(ashr_0) - LABEL(unaligned_table) +#endif /* NOT_IN_libc */ 
+libc_hidden_builtin_def (STRCMP) diff --git a/sysdeps/x86_64/strncmp.S b/sysdeps/x86_64/strncmp.S new file mode 100644 index 0000000000..0af34e7f15 --- /dev/null +++ b/sysdeps/x86_64/strncmp.S @@ -0,0 +1,3 @@ +#define STRCMP strncmp +#define USE_AS_STRNCMP +#include "strcmp.S" -- cgit v1.2.3 From 16d2ea4c821502948d193a152c8b151f5497a0d3 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Sun, 26 Jul 2009 16:10:00 -0700 Subject: Make sure no code in ld.so uses xmm/ymm registers on x86-64. This patch introduces a test to make sure no function modifies the xmm/ymm registers. With the exception of the auditing functions. The test is probably too pessimistic. All code linked into ld.so is checked. Perhaps at some point the callgraph starting from _dl_fixup and _dl_profile_fixup is checked and we can start using faster SSE-using functions in parts of ld.so. --- ChangeLog | 14 ++ sysdeps/x86_64/Makefile | 4 + sysdeps/x86_64/multiarch/rtld-rawmemchr.c | 1 + sysdeps/x86_64/multiarch/rtld-strlen.S | 1 + sysdeps/x86_64/rtld-memchr.c | 1 + sysdeps/x86_64/rtld-memcmp.c | 1 + sysdeps/x86_64/rtld-rawmemchr.c | 1 + sysdeps/x86_64/rtld-strchr.S | 291 ++++++++++++++++++++++++++++++ sysdeps/x86_64/rtld-strcmp.S | 28 +++ sysdeps/x86_64/rtld-strlen.S | 139 ++++++++++++++ sysdeps/x86_64/tst-xmmymm.sh | 17 ++ 11 files changed, 498 insertions(+) create mode 100644 sysdeps/x86_64/multiarch/rtld-rawmemchr.c create mode 100644 sysdeps/x86_64/multiarch/rtld-strlen.S create mode 100644 sysdeps/x86_64/rtld-memchr.c create mode 100644 sysdeps/x86_64/rtld-memcmp.c create mode 100644 sysdeps/x86_64/rtld-rawmemchr.c create mode 100644 sysdeps/x86_64/rtld-strchr.S create mode 100644 sysdeps/x86_64/rtld-strcmp.S create mode 100644 sysdeps/x86_64/rtld-strlen.S create mode 100755 sysdeps/x86_64/tst-xmmymm.sh (limited to 'sysdeps') diff --git a/ChangeLog b/ChangeLog index 7adbf1f7a8..12ac487385 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,17 @@ +2009-07-26 Ulrich Drepper + + * 
sysdeps/x86_64/tst-xmmymm.sh: New file. Check whether any of the + functions used in ld.so modify xmm/ymm registers. + * sysdeps/x86_64/Makefile: Hook new test up. + * sysdeps/x86_64/rtld-memchr.c: New file. + * sysdeps/x86_64/rtld-memcmp.c: New file. + * sysdeps/x86_64/rtld-rawmemchr.c: New file. + * sysdeps/x86_64/rtld-strchr.S: New file. + * sysdeps/x86_64/rtld-strcmp.S: New file. + * sysdeps/x86_64/rtld-strlen.S: New file. + * sysdeps/x86_64/multiarch/rtld-rawmemchr.c: New file. + * sysdeps/x86_64/multiarch/rtld-strlen.S: New file. + 2009-07-26 H.J. Lu * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Remove diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile index 78fdb04fcb..57cd88432a 100644 --- a/sysdeps/x86_64/Makefile +++ b/sysdeps/x86_64/Makefile @@ -19,6 +19,10 @@ ifeq ($(subdir),elf) sysdep-dl-routines += tlsdesc dl-tlsdesc sysdep_routines += tlsdesc dl-tlsdesc sysdep-rtld-routines += tlsdesc dl-tlsdesc + +tests: $(objpfx)tst-xmmymm.out +$(objpfx)tst-xmmymm.out: ../sysdeps/x86_64/tst-xmmymm.sh $(objpfx)ld.so + $(SHELL) -e $< $(objpfx) > $@ endif ifeq ($(subdir),csu) diff --git a/sysdeps/x86_64/multiarch/rtld-rawmemchr.c b/sysdeps/x86_64/multiarch/rtld-rawmemchr.c new file mode 100644 index 0000000000..53a90675ab --- /dev/null +++ b/sysdeps/x86_64/multiarch/rtld-rawmemchr.c @@ -0,0 +1 @@ +#include "../rtld-rawmemchr.c" diff --git a/sysdeps/x86_64/multiarch/rtld-strlen.S b/sysdeps/x86_64/multiarch/rtld-strlen.S new file mode 100644 index 0000000000..596e0549ea --- /dev/null +++ b/sysdeps/x86_64/multiarch/rtld-strlen.S @@ -0,0 +1 @@ +#include "../rtld-strlen.S" diff --git a/sysdeps/x86_64/rtld-memchr.c b/sysdeps/x86_64/rtld-memchr.c new file mode 100644 index 0000000000..f63fefbcec --- /dev/null +++ b/sysdeps/x86_64/rtld-memchr.c @@ -0,0 +1 @@ +#include diff --git a/sysdeps/x86_64/rtld-memcmp.c b/sysdeps/x86_64/rtld-memcmp.c new file mode 100644 index 0000000000..2ee40328b8 --- /dev/null +++ b/sysdeps/x86_64/rtld-memcmp.c @@ -0,0 +1 @@ 
+#include diff --git a/sysdeps/x86_64/rtld-rawmemchr.c b/sysdeps/x86_64/rtld-rawmemchr.c new file mode 100644 index 0000000000..2b9189393c --- /dev/null +++ b/sysdeps/x86_64/rtld-rawmemchr.c @@ -0,0 +1 @@ +#include diff --git a/sysdeps/x86_64/rtld-strchr.S b/sysdeps/x86_64/rtld-strchr.S new file mode 100644 index 0000000000..8934697972 --- /dev/null +++ b/sysdeps/x86_64/rtld-strchr.S @@ -0,0 +1,291 @@ +/* strchr (str, ch) -- Return pointer to first occurrence of CH in STR. + For AMD x86-64. + Copyright (C) 2002, 2005 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include +#include "asm-syntax.h" +#include "bp-sym.h" +#include "bp-asm.h" + + + .text +ENTRY (BP_SYM (strchr)) + + /* Before we start with the main loop we process single bytes + until the source pointer is aligned. This has two reasons: + 1. aligned 64-bit memory access is faster + and (more important) + 2. we process in the main loop 64 bit in one step although + we don't know the end of the string. But accessing at + 8-byte alignment guarantees that we never access illegal + memory if this would not also be done by the trivial + implementation (this is because all processor inherent + boundaries are multiples of 8). 
*/ + + movq %rdi, %rdx + andl $7, %edx /* Mask alignment bits */ + movq %rdi, %rax /* duplicate destination. */ + jz 1f /* aligned => start loop */ + neg %edx + addl $8, %edx /* Align to 8 bytes. */ + + /* Search the first bytes directly. */ +0: movb (%rax), %cl /* load byte */ + cmpb %cl,%sil /* compare byte. */ + je 6f /* target found */ + testb %cl,%cl /* is byte NUL? */ + je 7f /* yes => return NULL */ + incq %rax /* increment pointer */ + decl %edx + jnz 0b + + +1: + /* At the moment %rsi contains C. What we need for the + algorithm is C in all bytes of the register. Avoid + operations on 16 bit words because these require an + prefix byte (and one more cycle). */ + /* Populate 8 bit data to full 64-bit. */ + movabs $0x0101010101010101,%r9 + movzbl %sil,%edx + imul %rdx,%r9 + + movq $0xfefefefefefefeff, %r8 /* Save magic. */ + + /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to + change any of the hole bits of LONGWORD. + + 1) Is this safe? Will it catch all the zero bytes? + Suppose there is a byte with all zeros. Any carry bits + propagating from its left will fall into the hole at its + least significant bit and stop. Since there will be no + carry from its most significant bit, the LSB of the + byte to the left will be unchanged, and the zero will be + detected. + + 2) Is this worthwhile? Will it ignore everything except + zero bytes? Suppose every byte of QUARDWORD has a bit set + somewhere. There will be a carry into bit 8. If bit 8 + is set, this will carry into bit 16. If bit 8 is clear, + one of bits 9-15 must be set, so there will be a carry + into bit 16. Similarly, there will be a carry into bit + 24 tec.. If one of bits 54-63 is set, there will be a carry + into bit 64 (=carry flag), so all of the hole bits will + be changed. + + 3) But wait! Aren't we looking for C, not zero? + Good point. So what we do is XOR LONGWORD with a longword, + each of whose bytes is C. This turns each byte that is C + into a zero. 
*/ + + .p2align 4 +4: + /* Main Loop is unrolled 4 times. */ + /* First unroll. */ + movq (%rax), %rcx /* get double word (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + xorq %r9, %rcx /* XOR with qword c|...|c => bytes of str == c + are now 0 */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 3f /* found c => return pointer */ + + /* The quadword we looked at does not contain the value we're looking + for. Let's search now whether we have reached the end of the + string. */ + xorq %r9, %rcx /* restore original dword without reload */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 7f /* highest byte is NUL => return NULL */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 7f /* found NUL => return NULL */ + + /* Second unroll. */ + movq (%rax), %rcx /* get double word (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + xorq %r9, %rcx /* XOR with qword c|...|c => bytes of str == c + are now 0 */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. 
*/ + jnz 3f /* found c => return pointer */ + + /* The quadword we looked at does not contain the value we're looking + for. Let's search now whether we have reached the end of the + string. */ + xorq %r9, %rcx /* restore original dword without reload */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 7f /* highest byte is NUL => return NULL */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 7f /* found NUL => return NULL */ + /* Third unroll. */ + movq (%rax), %rcx /* get double word (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + xorq %r9, %rcx /* XOR with qword c|...|c => bytes of str == c + are now 0 */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 3f /* found c => return pointer */ + + /* The quadword we looked at does not contain the value we're looking + for. Let's search now whether we have reached the end of the + string. */ + xorq %r9, %rcx /* restore original dword without reload */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 7f /* highest byte is NUL => return NULL */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 7f /* found NUL => return NULL */ + /* Fourth unroll. 
*/ + movq (%rax), %rcx /* get double word (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + xorq %r9, %rcx /* XOR with qword c|...|c => bytes of str == c + are now 0 */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 3f /* found c => return pointer */ + + /* The quadword we looked at does not contain the value we're looking + for. Let's search now whether we have reached the end of the + string. */ + xorq %r9, %rcx /* restore original dword without reload */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 7f /* highest byte is NUL => return NULL */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jz 4b /* no NUL found => restart loop */ + + +7: /* Return NULL. */ + xorl %eax, %eax + retq + + + /* We now scan for the byte in which the character was matched. + But we have to take care of the case that a NUL char is + found before this in the dword. Note that we XORed %rcx + with the byte we're looking for, therefore the tests below look + reversed. */ + + + .p2align 4 /* Align, it's a jump target. */ +3: movq %r9,%rdx /* move to %rdx so that we can access bytes */ + subq $8,%rax /* correct pointer increment. */ + testb %cl, %cl /* is first byte C? */ + jz 6f /* yes => return pointer */ + cmpb %dl, %cl /* is first byte NUL? */ + je 7b /* yes => return NULL */ + incq %rax /* increment pointer */ + + testb %ch, %ch /* is second byte C? 
*/ + jz 6f /* yes => return pointer */ + cmpb %dl, %ch /* is second byte NUL? */ + je 7b /* yes => return NULL? */ + incq %rax /* increment pointer */ + + shrq $16, %rcx /* make upper bytes accessible */ + testb %cl, %cl /* is third byte C? */ + jz 6f /* yes => return pointer */ + cmpb %dl, %cl /* is third byte NUL? */ + je 7b /* yes => return NULL */ + incq %rax /* increment pointer */ + + testb %ch, %ch /* is fourth byte C? */ + jz 6f /* yes => return pointer */ + cmpb %dl, %ch /* is fourth byte NUL? */ + je 7b /* yes => return NULL? */ + incq %rax /* increment pointer */ + + shrq $16, %rcx /* make upper bytes accessible */ + testb %cl, %cl /* is fifth byte C? */ + jz 6f /* yes => return pointer */ + cmpb %dl, %cl /* is fifth byte NUL? */ + je 7b /* yes => return NULL */ + incq %rax /* increment pointer */ + + testb %ch, %ch /* is sixth byte C? */ + jz 6f /* yes => return pointer */ + cmpb %dl, %ch /* is sixth byte NUL? */ + je 7b /* yes => return NULL? */ + incq %rax /* increment pointer */ + + shrq $16, %rcx /* make upper bytes accessible */ + testb %cl, %cl /* is seventh byte C? */ + jz 6f /* yes => return pointer */ + cmpb %dl, %cl /* is seventh byte NUL? */ + je 7b /* yes => return NULL */ + + /* It must be in the eigth byte and it cannot be NUL. */ + incq %rax + +6: + nop + retq +END (BP_SYM (strchr)) + +weak_alias (BP_SYM (strchr), BP_SYM (index)) +libc_hidden_builtin_def (strchr) diff --git a/sysdeps/x86_64/rtld-strcmp.S b/sysdeps/x86_64/rtld-strcmp.S new file mode 100644 index 0000000000..a25535c161 --- /dev/null +++ b/sysdeps/x86_64/rtld-strcmp.S @@ -0,0 +1,28 @@ +#include +#include "asm-syntax.h" +#include "bp-sym.h" +#include "bp-asm.h" + +#ifndef LABEL +#define LABEL(l) L(l) +#endif + + .text +ENTRY (BP_SYM (STRCMP)) +/* Simple version since we can't use SSE registers in ld.so. 
*/ +L(oop): movb (%rdi), %al + cmpb (%rsi), %al + jne L(neq) + incq %rdi + incq %rsi + testb %al, %al + jnz L(oop) + + xorl %eax, %eax + ret + +L(neq): movl $1, %eax + movl $-1, %ecx + cmovbl %ecx, %eax + ret +END (BP_SYM (STRCMP)) diff --git a/sysdeps/x86_64/rtld-strlen.S b/sysdeps/x86_64/rtld-strlen.S new file mode 100644 index 0000000000..fd950edaaa --- /dev/null +++ b/sysdeps/x86_64/rtld-strlen.S @@ -0,0 +1,139 @@ +/* strlen(str) -- determine the length of the string STR. + Copyright (C) 2002, 2003 Free Software Foundation, Inc. + Based on i486 version contributed by Ulrich Drepper . + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include +#include "asm-syntax.h" +#include "bp-sym.h" +#include "bp-asm.h" + + + .text +ENTRY (strlen) + movq %rdi, %rcx /* Duplicate source pointer. */ + andl $7, %ecx /* mask alignment bits */ + movq %rdi, %rax /* duplicate destination. */ + jz 1f /* aligned => start loop */ + + neg %ecx /* We need to align to 8 bytes. */ + addl $8,%ecx + /* Search the first bytes directly. */ +0: cmpb $0x0,(%rax) /* is byte NUL? */ + je 2f /* yes => return */ + incq %rax /* increment pointer */ + decl %ecx + jnz 0b + +1: movq $0xfefefefefefefeff,%r8 /* Save magic. */ + + .p2align 4 /* Align loop. 
*/ +4: /* Main Loop is unrolled 4 times. */ + /* First unroll. */ + movq (%rax), %rcx /* get double word (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 3f /* found NUL => return pointer */ + + /* Second unroll. */ + movq (%rax), %rcx /* get double word (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 3f /* found NUL => return pointer */ + + /* Third unroll. */ + movq (%rax), %rcx /* get double word (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 3f /* found NUL => return pointer */ + + /* Fourth unroll. */ + movq (%rax), %rcx /* get double word (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. 
We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jz 4b /* no NUL found => continue loop */ + + .p2align 4 /* Align, it's a jump target. */ +3: subq $8,%rax /* correct pointer increment. */ + + testb %cl, %cl /* is first byte NUL? */ + jz 2f /* yes => return */ + incq %rax /* increment pointer */ + + testb %ch, %ch /* is second byte NUL? */ + jz 2f /* yes => return */ + incq %rax /* increment pointer */ + + testl $0x00ff0000, %ecx /* is third byte NUL? */ + jz 2f /* yes => return pointer */ + incq %rax /* increment pointer */ + + testl $0xff000000, %ecx /* is fourth byte NUL? */ + jz 2f /* yes => return pointer */ + incq %rax /* increment pointer */ + + shrq $32, %rcx /* look at other half. */ + + testb %cl, %cl /* is first byte NUL? */ + jz 2f /* yes => return */ + incq %rax /* increment pointer */ + + testb %ch, %ch /* is second byte NUL? */ + jz 2f /* yes => return */ + incq %rax /* increment pointer */ + + testl $0xff0000, %ecx /* is third byte NUL? */ + jz 2f /* yes => return pointer */ + incq %rax /* increment pointer */ +2: + subq %rdi, %rax /* compute difference to string start */ + ret +END (strlen) +libc_hidden_builtin_def (strlen) diff --git a/sysdeps/x86_64/tst-xmmymm.sh b/sysdeps/x86_64/tst-xmmymm.sh new file mode 100755 index 0000000000..0735276e6d --- /dev/null +++ b/sysdeps/x86_64/tst-xmmymm.sh @@ -0,0 +1,17 @@ +#! 
/bin/sh +objpfx="$1" + +tmp=$(mktemp ${objpfx}tst-xmmymm.XXXXXX) +trap 'rm -f "$tmp"' 1 2 3 15 + +objdump -d "${objpfx}ld.so" | +awk 'BEGIN { last="" } /^[[:xdigit:]]* <[_[:alnum:]]*>:$/ { fct=substr($2, 2, length($2)-3) } /,%[xy]mm[[:digit:]]*$/ { if (last != fct) { print fct; last=fct} }' | +tee "$tmp" + +echo "Functions which incorrectly modify xmm/ymm registers:" +err=1 +egrep -vs '^_dl_runtime_profile$' "$tmp" || err=0 +if test $err -eq 0; then echo "None"; fi + +rm "$tmp" +exit $err -- cgit v1.2.3