From 4a22fa60cd42eba5ab1931547be33f7764ef6f73 Mon Sep 17 00:00:00 2001
From: Jakub Jelinek <jakub@redhat.com>
Date: Thu, 2 Mar 2006 09:06:20 +0000
Subject: Updated to fedora-glibc-20060302T0855

---
 sysdeps/ia64/memccpy.S                        |  55 ++-
 sysdeps/mach/hurd/Subdirs                     |  10 +-
 sysdeps/rs6000/add_n.s                        |  81 ----
 sysdeps/rs6000/addmul_1.s                     | 123 ------
 sysdeps/rs6000/ffs.c                          |  42 --
 sysdeps/rs6000/lshift.s                       |  59 ---
 sysdeps/rs6000/memcopy.h                      |  86 ----
 sysdeps/rs6000/mul_1.s                        | 110 -----
 sysdeps/rs6000/rshift.s                       |  57 ---
 sysdeps/rs6000/sub_n.s                        |  82 ----
 sysdeps/rs6000/submul_1.s                     | 128 ------
 sysdeps/sparc/fpu/fraiseexcpt.c               |  34 +-
 sysdeps/sparc/sparc32/fpu/libm-test-ulps      |   9 +
 sysdeps/sparc/sparc32/sparcv9v/memcpy.S       |   2 +
 sysdeps/sparc/sparc32/sparcv9v/memset.S       |   2 +
 sysdeps/sparc/sparc64/fpu/libm-test-ulps      |   9 +
 sysdeps/sparc/sparc64/sparcv9v/memcpy.S       | 593 ++++++++++++++++++++++++++
 sysdeps/sparc/sparc64/sparcv9v/memset.S       | 127 ++++++
 sysdeps/unix/sysv/linux/i386/fxstatat.c       |   2 +-
 sysdeps/unix/sysv/linux/s390/s390-64/sysdep.h |   2 +-
 20 files changed, 826 insertions(+), 787 deletions(-)
 delete mode 100644 sysdeps/rs6000/add_n.s
 delete mode 100644 sysdeps/rs6000/addmul_1.s
 delete mode 100644 sysdeps/rs6000/ffs.c
 delete mode 100644 sysdeps/rs6000/lshift.s
 delete mode 100644 sysdeps/rs6000/memcopy.h
 delete mode 100644 sysdeps/rs6000/mul_1.s
 delete mode 100644 sysdeps/rs6000/rshift.s
 delete mode 100644 sysdeps/rs6000/sub_n.s
 delete mode 100644 sysdeps/rs6000/submul_1.s
 create mode 100644 sysdeps/sparc/sparc32/sparcv9v/memcpy.S
 create mode 100644 sysdeps/sparc/sparc32/sparcv9v/memset.S
 create mode 100644 sysdeps/sparc/sparc64/sparcv9v/memcpy.S
 create mode 100644 sysdeps/sparc/sparc64/sparcv9v/memset.S

diff --git a/sysdeps/ia64/memccpy.S b/sysdeps/ia64/memccpy.S
index 53c43c512b..dd638d43c8 100644
--- a/sysdeps/ia64/memccpy.S
+++ b/sysdeps/ia64/memccpy.S
@@ -1,6 +1,6 @@
 /* Optimized version of the memccpy() function.
    This file is part of the GNU C Library.
-   Copyright (C) 2000, 2001, 2003 Free Software Foundation, Inc.
+   Copyright (C) 2000,2001,2003,2006 Free Software Foundation, Inc.
    Contributed by Dan Pop <Dan.Pop@cern.ch>.

    The GNU C Library is free software; you can redistribute it and/or
@@ -183,27 +183,64 @@ ENTRY(memccpy)
 br.ret.sptk.many b0
 .recovery1:
- adds src = -(MEMLAT + 6 + 1) * 8, asrc
+#if MEMLAT != 6
+# error "MEMLAT must be 6!"
+#endif
+ adds src = -8, asrc
 mov loopcnt = ar.lc
- mov tmp = ar.ec ;;
+ mov tmp = ar.ec
+ ;;
+(p[0]) adds src = -8, src
+ ;;
+(p[1]) adds src = -8, src
 sub sh1 = (MEMLAT + 6 + 1), tmp
- shr.u sh2 = sh2, 3
- ;;
+ ;;
+(p[2]) adds src = -8, src
+ ;;
+(p[3]) adds src = -8, src
 shl loopcnt = loopcnt, 3
- sub src = src, sh2
+ ;;
+(p[4]) adds src = -8, src
+ ;;
+(p[5]) adds src = -8, src
 shl sh1 = sh1, 3
+ ;;
+(p[6]) adds src = -8, src
+ ;;
+(p[7]) adds src = -8, src
 shl tmp = tmp, 3
 ;;
+(p[8]) adds src = -8, src
+ ;;
+(p[9]) adds src = -8, src
+ shr.u sh2 = sh2, 3
+ ;;
+(p[10]) adds src = -8, src
+ ;;
+(p[11]) adds src = -8, src
 add len = len, loopcnt
- add src = sh1, src ;;
+ ;;
+ sub src = src, sh2
+ ;;
 add len = tmp, len
-.back1:
+ add src = sh1, src
 br.cond.sptk .cpyfew

 .recovery2:
- add tmp = -(MEMLAT + 3) * 8, src
+#if MEMLAT != 6
+# error "MEMLAT must be 6!"
+#endif + add tmp = -8, src (p7) br.cond.spnt .gotit ;; +(p[0]) add tmp = -8, tmp ;; +(p[1]) add tmp = -8, tmp ;; +(p[2]) add tmp = -8, tmp ;; +(p[3]) add tmp = -8, tmp ;; +(p[4]) add tmp = -8, tmp ;; +(p[5]) add tmp = -8, tmp ;; +(p[6]) add tmp = -8, tmp ;; +(p[7]) add tmp = -8, tmp ;; ld8 r[MEMLAT+2] = [tmp] ;; xor pos0[1] = r[MEMLAT+2], charx8 ;; czx1.r pos0[1] = pos0[1] ;; diff --git a/sysdeps/mach/hurd/Subdirs b/sysdeps/mach/hurd/Subdirs index 16b8348437..7a7757582a 100644 --- a/sysdeps/mach/hurd/Subdirs +++ b/sysdeps/mach/hurd/Subdirs @@ -1 +1,9 @@ -hurd +# This file says that the hurd subdirectory should appear before all others. +# The mach and hurd subdirectories have many generated header files which +# much of the rest of the library depends on, so it is best to build them +# first (and mach before hurd, at that). The before-compile additions in +# sysdeps/{mach,hurd}/Makefile should make it reliably work for these files +# not to exist when making in other directories, but it will be slower that +# way with more somewhat expensive `make' invocations. + +first hurd diff --git a/sysdeps/rs6000/add_n.s b/sysdeps/rs6000/add_n.s deleted file mode 100644 index 216874e7a4..0000000000 --- a/sysdeps/rs6000/add_n.s +++ /dev/null @@ -1,81 +0,0 @@ -# IBM POWER __mpn_add_n -- Add two limb vectors of equal, non-zero length. - -# Copyright (C) 1992, 1994, 1995, 1996 Free Software Foundation, Inc. - -# This file is part of the GNU MP Library. - -# The GNU MP Library is free software; you can redistribute it and/or modify -# it under the terms of the GNU Lesser General Public License as published by -# the Free Software Foundation; either version 2.1 of the License, or (at your -# option) any later version. - -# The GNU MP Library is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public -# License for more details. - -# You should have received a copy of the GNU Lesser General Public License -# along with the GNU MP Library; see the file COPYING.LIB. If not, write to -# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, -# MA 02111-1307, USA. - - -# INPUT PARAMETERS -# res_ptr r3 -# s1_ptr r4 -# s2_ptr r5 -# size r6 - - .toc - .extern __mpn_add_n[DS] - .extern .__mpn_add_n -.csect [PR] - .align 2 - .globl __mpn_add_n - .globl .__mpn_add_n - .csect __mpn_add_n[DS] -__mpn_add_n: - .long .__mpn_add_n, TOC[tc0], 0 - .csect [PR] -.__mpn_add_n: - andil. 10,6,1 # odd or even number of limbs? - l 8,0(4) # load least significant s1 limb - l 0,0(5) # load least significant s2 limb - cal 3,-4(3) # offset res_ptr, it's updated before it's used - sri 10,6,1 # count for unrolled loop - a 7,0,8 # add least significant limbs, set cy - mtctr 10 # copy count into CTR - beq 0,Leven # branch if even # of limbs (# of limbs >= 2) - -# We have an odd # of limbs. Add the first limbs separately. - cmpi 1,10,0 # is count for unrolled loop zero? - bne 1,L1 # branch if not - st 7,4(3) - aze 3,10 # use the fact that r10 is zero... - br # return - -# We added least significant limbs. Now reload the next limbs to enter loop. 
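For readers following the deleted routine: __mpn_add_n adds two equal-length limb vectors and returns the carry out of the most significant limb. A plain C statement of those semantics (an illustrative sketch with GMP-style naming, not code from this patch) makes the odd/even split above and the two-limb unrolling below easier to check:

#include <stddef.h>
#include <stdint.h>

/* res[i] = s1[i] + s2[i] + carry, for i = 0 .. size-1; returns the
   final carry.  The assembly adds two limbs per loop iteration, which
   is why an odd limb count is peeled off before the main loop.  */
static uint32_t
ref_add_n (uint32_t *res, const uint32_t *s1, const uint32_t *s2,
           size_t size)
{
  uint32_t cy = 0;
  for (size_t i = 0; i < size; i++)
    {
      uint32_t a = s1[i];
      uint32_t sum = a + s2[i] + cy;
      /* A wrap-around means a carry out of this limb.  */
      cy = cy ? (sum <= a) : (sum < a);
      res[i] = sum;
    }
  return cy;
}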
-L1: lu 8,4(4) # load s1 limb and update s1_ptr - lu 0,4(5) # load s2 limb and update s2_ptr - stu 7,4(3) - ae 7,0,8 # add limbs, set cy -Leven: lu 9,4(4) # load s1 limb and update s1_ptr - lu 10,4(5) # load s2 limb and update s2_ptr - bdz Lend # If done, skip loop - -Loop: lu 8,4(4) # load s1 limb and update s1_ptr - lu 0,4(5) # load s2 limb and update s2_ptr - ae 11,9,10 # add previous limbs with cy, set cy - stu 7,4(3) # - lu 9,4(4) # load s1 limb and update s1_ptr - lu 10,4(5) # load s2 limb and update s2_ptr - ae 7,0,8 # add previous limbs with cy, set cy - stu 11,4(3) # - bdn Loop # decrement CTR and loop back - -Lend: ae 11,9,10 # add limbs with cy, set cy - st 7,4(3) # - st 11,8(3) # - lil 3,0 # load cy into ... - aze 3,3 # ... return value register - br diff --git a/sysdeps/rs6000/addmul_1.s b/sysdeps/rs6000/addmul_1.s deleted file mode 100644 index 7cd743cede..0000000000 --- a/sysdeps/rs6000/addmul_1.s +++ /dev/null @@ -1,123 +0,0 @@ -# IBM POWER __mpn_addmul_1 -- Multiply a limb vector with a limb and add -# the result to a second limb vector. - -# Copyright (C) 1992, 1994 Free Software Foundation, Inc. - -# This file is part of the GNU MP Library. - -# The GNU MP Library is free software; you can redistribute it and/or modify -# it under the terms of the GNU Lesser General Public License as published by -# the Free Software Foundation; either version 2.1 of the License, or (at your -# option) any later version. - -# The GNU MP Library is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public -# License for more details. - -# You should have received a copy of the GNU Lesser General Public License -# along with the GNU MP Library; see the file COPYING.LIB. If not, write to -# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, -# MA 02111-1307, USA. - - -# INPUT PARAMETERS -# res_ptr r3 -# s1_ptr r4 -# size r5 -# s2_limb r6 - -# The RS/6000 has no unsigned 32x32->64 bit multiplication instruction. To -# obtain that operation, we have to use the 32x32->64 signed multiplication -# instruction, and add the appropriate compensation to the high limb of the -# result. We add the multiplicand if the multiplier has its most significant -# bit set, and we add the multiplier if the multiplicand has its most -# significant bit set. We need to preserve the carry flag between each -# iteration, so we have to compute the compensation carefully (the natural, -# srai+and doesn't work). Since the POWER architecture has a branch unit -# we can branch in zero cycles, so that's how we perform the additions. 
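The compensation described in the comment above can be checked against a small C model (a sketch of the identity only; the function name and the use of 64-bit arithmetic are illustrative, not glibc's):

#include <stdint.h>

/* Unsigned 32x32->64 product built from the signed multiply, the only
   widening multiply POWER provides.  Treating an operand with its top
   bit set as signed makes the product too small by 2^32 times the
   other operand, so the high limb is patched up exactly as the comment
   says: add the multiplicand if the multiplier has its most
   significant bit set, and add the multiplier if the multiplicand
   does.  */
static uint64_t
umul32x32 (uint32_t a, uint32_t b)
{
  int64_t s = (int64_t) (int32_t) a * (int32_t) b;
  uint32_t hi = (uint64_t) s >> 32;
  uint32_t lo = (uint32_t) s;

  if ((int32_t) b < 0)   /* multiplier's MSB set: add the multiplicand */
    hi += a;
  if ((int32_t) a < 0)   /* multiplicand's MSB set: add the multiplier */
    hi += b;

  return ((uint64_t) hi << 32) | lo;
}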
- - .toc - .csect .__mpn_addmul_1[PR] - .align 2 - .globl __mpn_addmul_1 - .globl .__mpn_addmul_1 - .csect __mpn_addmul_1[DS] -__mpn_addmul_1: - .long .__mpn_addmul_1[PR], TOC[tc0], 0 - .csect .__mpn_addmul_1[PR] -.__mpn_addmul_1: - - cal 3,-4(3) - l 0,0(4) - cmpi 0,6,0 - mtctr 5 - mul 9,0,6 - srai 7,0,31 - and 7,7,6 - mfmq 8 - cax 9,9,7 - l 7,4(3) - a 8,8,7 # add res_limb - blt Lneg -Lpos: bdz Lend - -Lploop: lu 0,4(4) - stu 8,4(3) - cmpi 0,0,0 - mul 10,0,6 - mfmq 0 - ae 8,0,9 # low limb + old_cy_limb + old cy - l 7,4(3) - aze 10,10 # propagate cy to new cy_limb - a 8,8,7 # add res_limb - bge Lp0 - cax 10,10,6 # adjust high limb for negative limb from s1 -Lp0: bdz Lend0 - lu 0,4(4) - stu 8,4(3) - cmpi 0,0,0 - mul 9,0,6 - mfmq 0 - ae 8,0,10 - l 7,4(3) - aze 9,9 - a 8,8,7 - bge Lp1 - cax 9,9,6 # adjust high limb for negative limb from s1 -Lp1: bdn Lploop - - b Lend - -Lneg: cax 9,9,0 - bdz Lend -Lnloop: lu 0,4(4) - stu 8,4(3) - cmpi 0,0,0 - mul 10,0,6 - mfmq 7 - ae 8,7,9 - l 7,4(3) - ae 10,10,0 # propagate cy to new cy_limb - a 8,8,7 # add res_limb - bge Ln0 - cax 10,10,6 # adjust high limb for negative limb from s1 -Ln0: bdz Lend0 - lu 0,4(4) - stu 8,4(3) - cmpi 0,0,0 - mul 9,0,6 - mfmq 7 - ae 8,7,10 - l 7,4(3) - ae 9,9,0 # propagate cy to new cy_limb - a 8,8,7 # add res_limb - bge Ln1 - cax 9,9,6 # adjust high limb for negative limb from s1 -Ln1: bdn Lnloop - b Lend - -Lend0: cal 9,0(10) -Lend: st 8,4(3) - aze 3,9 - br diff --git a/sysdeps/rs6000/ffs.c b/sysdeps/rs6000/ffs.c deleted file mode 100644 index 619412cb50..0000000000 --- a/sysdeps/rs6000/ffs.c +++ /dev/null @@ -1,42 +0,0 @@ -/* ffs -- find first set bit in a word, counted from least significant end. - For IBM rs6000. - Copyright (C) 1991, 1992, 1997, 2004, 2005 Free Software Foundation, Inc. - This file is part of the GNU C Library. - Contributed by Torbjorn Granlund (tege@sics.se). - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ - -#include - -#undef ffs - -#ifdef __GNUC__ - -int -__ffs (x) - int x; -{ - int cnt; - - asm ("cntlz %0,%1" : "=r" (cnt) : "r" (x & -x)); - return 32 - cnt; -} -weak_alias (__ffs, ffs) -libc_hidden_builtin_def (ffs) - -#else -#include -#endif diff --git a/sysdeps/rs6000/lshift.s b/sysdeps/rs6000/lshift.s deleted file mode 100644 index 8ccba7407e..0000000000 --- a/sysdeps/rs6000/lshift.s +++ /dev/null @@ -1,59 +0,0 @@ -# IBM POWER __mpn_lshift -- - -# Copyright (C) 1992, 1994 Free Software Foundation, Inc. - -# This file is part of the GNU MP Library. - -# The GNU MP Library is free software; you can redistribute it and/or modify -# it under the terms of the GNU Lesser General Public License as published by -# the Free Software Foundation; either version 2.1 of the License, or (at your -# option) any later version. 
- -# The GNU MP Library is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public -# License for more details. - -# You should have received a copy of the GNU Lesser General Public License -# along with the GNU MP Library; see the file COPYING.LIB. If not, write to -# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, -# MA 02111-1307, USA. - - -# INPUT PARAMETERS -# res_ptr r3 -# s_ptr r4 -# size r5 -# cnt r6 - - .toc - .extern __mpn_lshift[DS] - .extern .__mpn_lshift -.csect [PR] - .align 2 - .globl __mpn_lshift - .globl .__mpn_lshift - .csect __mpn_lshift[DS] -__mpn_lshift: - .long .__mpn_lshift, TOC[tc0], 0 - .csect [PR] -.__mpn_lshift: - sli 0,5,2 - cax 9,3,0 - cax 4,4,0 - sfi 8,6,32 - mtctr 5 # put limb count in CTR loop register - lu 0,-4(4) # read most significant limb - sre 3,0,8 # compute carry out limb, and init MQ register - bdz Lend2 # if just one limb, skip loop - lu 0,-4(4) # read 2:nd most significant limb - sreq 7,0,8 # compute most significant limb of result - bdz Lend # if just two limb, skip loop -Loop: lu 0,-4(4) # load next lower limb - stu 7,-4(9) # store previous result during read latency - sreq 7,0,8 # compute result limb - bdn Loop # loop back until CTR is zero -Lend: stu 7,-4(9) # store 2:nd least significant limb -Lend2: sle 7,0,6 # compute least significant limb - st 7,-4(9) # store it" \ - br diff --git a/sysdeps/rs6000/memcopy.h b/sysdeps/rs6000/memcopy.h deleted file mode 100644 index 8bdb6e9766..0000000000 --- a/sysdeps/rs6000/memcopy.h +++ /dev/null @@ -1,86 +0,0 @@ -/* Copyright (C) 1991, 1997 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ - -#include - -#undef OP_T_THRES -#define OP_T_THRES 32 - -#undef BYTE_COPY_FWD -#define BYTE_COPY_FWD(dst_bp, src_bp, nbytes) \ - do \ - { \ - size_t __nbytes = nbytes; \ - asm volatile("mtspr 1,%2\n" \ - "lsx 6,0,%1\n" \ - "stsx 6,0,%0" : /* No outputs. */ : \ - "b" (dst_bp), "b" (src_bp), "r" (__nbytes) : \ - "6", "7", "8", "9", "10", "11", "12", "13"); \ - dst_bp += __nbytes; \ - src_bp += __nbytes; \ - } while (0) - -#undef BYTE_COPY_BWD -#define BYTE_COPY_BWD(dst_ep, src_ep, nbytes) \ - do \ - { \ - size_t __nbytes = (nbytes); \ - dst_ep -= __nbytes; \ - src_ep -= __nbytes; \ - asm volatile("mtspr 1,%2\n" \ - "lsx 6,0,%1\n" \ - "stsx 6,0,%0" : /* No outputs. 
*/ : \ - "b" (dst_ep), "b" (src_ep), "r" (__nbytes) : \ - "6", "7", "8", "9", "10", "11", "12", "13"); \ - } while (0) - -#undef WORD_COPY_FWD -#define WORD_COPY_FWD(dst_bp, src_bp, nbytes_left, nbytes) \ - do \ - { \ - size_t __nblocks = (nbytes) / 32; \ - if (__nblocks != 0) \ - asm volatile("mtctr %4\n" \ - "lsi 6,%1,32\n" \ - "ai %1,%1,32\n" \ - "stsi 6,%0,32\n" \ - "ai %0,%0,32\n" \ - "bdn $-16" : \ - "=b" (dst_bp), "=b" (src_bp) : \ - "0" (dst_bp), "1" (src_bp), "r" (__nblocks) : \ - "6", "7", "8", "9", "10", "11", "12", "13"); \ - (nbytes_left) = (nbytes) % 32; \ - } while (0) - -#undef WORD_COPY_BWD -#define WORD_COPY_BWD(dst_ep, src_ep, nbytes_left, nbytes) \ - do \ - { \ - size_t __nblocks = (nbytes) / 32; \ - if (__nblocks != 0) \ - asm volatile("mtctr %4\n" \ - "ai %1,%1,-32\n" \ - "lsi 6,%1,32\n" \ - "ai %0,%0,-32\n" \ - "stsi 6,%0,32\n" \ - "bdn $-16" : \ - "=b" (dst_ep), "=b" (src_ep) : \ - "0" (dst_ep), "1" (src_ep), "r" (__nblocks) : \ - "6", "7", "8", "9", "10", "11", "12", "13"); \ - (nbytes_left) = (nbytes) % 32; \ - } while (0) diff --git a/sysdeps/rs6000/mul_1.s b/sysdeps/rs6000/mul_1.s deleted file mode 100644 index c0feef4b72..0000000000 --- a/sysdeps/rs6000/mul_1.s +++ /dev/null @@ -1,110 +0,0 @@ -# IBM POWER __mpn_mul_1 -- Multiply a limb vector with a limb and store -# the result in a second limb vector. - -# Copyright (C) 1992, 1994 Free Software Foundation, Inc. - -# This file is part of the GNU MP Library. - -# The GNU MP Library is free software; you can redistribute it and/or modify -# it under the terms of the GNU Lesser General Public License as published by -# the Free Software Foundation; either version 2.1 of the License, or (at your -# option) any later version. - -# The GNU MP Library is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public -# License for more details. - -# You should have received a copy of the GNU Lesser General Public License -# along with the GNU MP Library; see the file COPYING.LIB. If not, write to -# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, -# MA 02111-1307, USA. - - -# INPUT PARAMETERS -# res_ptr r3 -# s1_ptr r4 -# size r5 -# s2_limb r6 - -# The RS/6000 has no unsigned 32x32->64 bit multiplication instruction. To -# obtain that operation, we have to use the 32x32->64 signed multiplication -# instruction, and add the appropriate compensation to the high limb of the -# result. We add the multiplicand if the multiplier has its most significant -# bit set, and we add the multiplier if the multiplicand has its most -# significant bit set. We need to preserve the carry flag between each -# iteration, so we have to compute the compensation carefully (the natural, -# srai+and doesn't work). Since the POWER architecture has a branch unit -# we can branch in zero cycles, so that's how we perform the additions. 
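As with __mpn_addmul_1 above, a C statement of what __mpn_mul_1 computes helps when reading the interleaved assembly below (an illustrative sketch, not code from this patch): each source limb is multiplied by s2_limb, the low half of the product is stored, and the high half is carried into the next limb.

#include <stddef.h>
#include <stdint.h>

/* Returns the carry-out limb, i.e. the high limb of the last product
   plus any carry that propagated into it.  */
static uint32_t
ref_mul_1 (uint32_t *res, const uint32_t *s1, size_t size,
           uint32_t s2_limb)
{
  uint32_t cy_limb = 0;
  for (size_t i = 0; i < size; i++)
    {
      uint64_t prod = (uint64_t) s1[i] * s2_limb + cy_limb;
      res[i] = (uint32_t) prod;             /* low limb */
      cy_limb = (uint32_t) (prod >> 32);    /* carried into next limb */
    }
  return cy_limb;
}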
- - .toc - .csect .__mpn_mul_1[PR] - .align 2 - .globl __mpn_mul_1 - .globl .__mpn_mul_1 - .csect __mpn_mul_1[DS] -__mpn_mul_1: - .long .__mpn_mul_1[PR], TOC[tc0], 0 - .csect .__mpn_mul_1[PR] -.__mpn_mul_1: - - cal 3,-4(3) - l 0,0(4) - cmpi 0,6,0 - mtctr 5 - mul 9,0,6 - srai 7,0,31 - and 7,7,6 - mfmq 8 - ai 0,0,0 # reset carry - cax 9,9,7 - blt Lneg -Lpos: bdz Lend -Lploop: lu 0,4(4) - stu 8,4(3) - cmpi 0,0,0 - mul 10,0,6 - mfmq 0 - ae 8,0,9 - bge Lp0 - cax 10,10,6 # adjust high limb for negative limb from s1 -Lp0: bdz Lend0 - lu 0,4(4) - stu 8,4(3) - cmpi 0,0,0 - mul 9,0,6 - mfmq 0 - ae 8,0,10 - bge Lp1 - cax 9,9,6 # adjust high limb for negative limb from s1 -Lp1: bdn Lploop - b Lend - -Lneg: cax 9,9,0 - bdz Lend -Lnloop: lu 0,4(4) - stu 8,4(3) - cmpi 0,0,0 - mul 10,0,6 - cax 10,10,0 # adjust high limb for negative s2_limb - mfmq 0 - ae 8,0,9 - bge Ln0 - cax 10,10,6 # adjust high limb for negative limb from s1 -Ln0: bdz Lend0 - lu 0,4(4) - stu 8,4(3) - cmpi 0,0,0 - mul 9,0,6 - cax 9,9,0 # adjust high limb for negative s2_limb - mfmq 0 - ae 8,0,10 - bge Ln1 - cax 9,9,6 # adjust high limb for negative limb from s1 -Ln1: bdn Lnloop - b Lend - -Lend0: cal 9,0(10) -Lend: st 8,4(3) - aze 3,9 - br diff --git a/sysdeps/rs6000/rshift.s b/sysdeps/rs6000/rshift.s deleted file mode 100644 index 145218fabd..0000000000 --- a/sysdeps/rs6000/rshift.s +++ /dev/null @@ -1,57 +0,0 @@ -# IBM POWER __mpn_rshift -- - -# Copyright (C) 1992, 1994 Free Software Foundation, Inc. - -# This file is part of the GNU MP Library. - -# The GNU MP Library is free software; you can redistribute it and/or modify -# it under the terms of the GNU Lesser General Public License as published by -# the Free Software Foundation; either version 2.1 of the License, or (at your -# option) any later version. - -# The GNU MP Library is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public -# License for more details. - -# You should have received a copy of the GNU Lesser General Public License -# along with the GNU MP Library; see the file COPYING.LIB. If not, write to -# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, -# MA 02111-1307, USA. 
- - -# INPUT PARAMETERS -# res_ptr r3 -# s_ptr r4 -# size r5 -# cnt r6 - - .toc - .extern __mpn_rshift[DS] - .extern .__mpn_rshift -.csect [PR] - .align 2 - .globl __mpn_rshift - .globl .__mpn_rshift - .csect __mpn_rshift[DS] -__mpn_rshift: - .long .__mpn_rshift, TOC[tc0], 0 - .csect [PR] -.__mpn_rshift: - sfi 8,6,32 - mtctr 5 # put limb count in CTR loop register - l 0,0(4) # read least significant limb - ai 9,3,-4 # adjust res_ptr since it's offset in the stu:s - sle 3,0,8 # compute carry limb, and init MQ register - bdz Lend2 # if just one limb, skip loop - lu 0,4(4) # read 2:nd least significant limb - sleq 7,0,8 # compute least significant limb of result - bdz Lend # if just two limb, skip loop -Loop: lu 0,4(4) # load next higher limb - stu 7,4(9) # store previous result during read latency - sleq 7,0,8 # compute result limb - bdn Loop # loop back until CTR is zero -Lend: stu 7,4(9) # store 2:nd most significant limb -Lend2: sre 7,0,6 # compute most significant limb - st 7,4(9) # store it" \ - br diff --git a/sysdeps/rs6000/sub_n.s b/sysdeps/rs6000/sub_n.s deleted file mode 100644 index d931870935..0000000000 --- a/sysdeps/rs6000/sub_n.s +++ /dev/null @@ -1,82 +0,0 @@ -# IBM POWER __mpn_sub_n -- Subtract two limb vectors of equal, non-zero length. - -# Copyright (C) 1992, 1994, 1995, 1996 Free Software Foundation, Inc. - -# This file is part of the GNU MP Library. - -# The GNU MP Library is free software; you can redistribute it and/or modify -# it under the terms of the GNU Lesser General Public License as published by -# the Free Software Foundation; either version 2.1 of the License, or (at your -# option) any later version. - -# The GNU MP Library is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public -# License for more details. - -# You should have received a copy of the GNU Lesser General Public License -# along with the GNU MP Library; see the file COPYING.LIB. If not, write to -# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, -# MA 02111-1307, USA. - - -# INPUT PARAMETERS -# res_ptr r3 -# s1_ptr r4 -# s2_ptr r5 -# size r6 - - .toc - .extern __mpn_sub_n[DS] - .extern .__mpn_sub_n -.csect [PR] - .align 2 - .globl __mpn_sub_n - .globl .__mpn_sub_n - .csect __mpn_sub_n[DS] -__mpn_sub_n: - .long .__mpn_sub_n, TOC[tc0], 0 - .csect [PR] -.__mpn_sub_n: - andil. 10,6,1 # odd or even number of limbs? - l 8,0(4) # load least significant s1 limb - l 0,0(5) # load least significant s2 limb - cal 3,-4(3) # offset res_ptr, it's updated before it's used - sri 10,6,1 # count for unrolled loop - sf 7,0,8 # subtract least significant limbs, set cy - mtctr 10 # copy count into CTR - beq 0,Leven # branch if even # of limbs (# of limbs >= 2) - -# We have an odd # of limbs. Add the first limbs separately. - cmpi 1,10,0 # is count for unrolled loop zero? - bne 1,L1 # branch if not - st 7,4(3) - sfe 3,0,0 # load !cy into ... - sfi 3,3,0 # ... return value register - br # return - -# We added least significant limbs. Now reload the next limbs to enter loop. 
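One subtlety in the deleted routine: POWER's subtract-from instructions record carry = NOT borrow, which is why the return paths above and below go through an sfe/sfi pair to invert the flag before handing the borrow back. The semantics in C are simply (illustrative sketch, not code from this patch):

#include <stddef.h>
#include <stdint.h>

/* res[i] = s1[i] - s2[i] - borrow, for i = 0 .. size-1; returns the
   borrow out of the most significant limb.  */
static uint32_t
ref_sub_n (uint32_t *res, const uint32_t *s1, const uint32_t *s2,
           size_t size)
{
  uint32_t brw = 0;
  for (size_t i = 0; i < size; i++)
    {
      uint32_t a = s1[i], b = s2[i];
      res[i] = a - b - brw;
      /* Borrow whenever the true difference would be negative.  */
      brw = brw ? (a <= b) : (a < b);
    }
  return brw;
}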
-L1: lu 8,4(4) # load s1 limb and update s1_ptr - lu 0,4(5) # load s2 limb and update s2_ptr - stu 7,4(3) - sfe 7,0,8 # subtract limbs, set cy -Leven: lu 9,4(4) # load s1 limb and update s1_ptr - lu 10,4(5) # load s2 limb and update s2_ptr - bdz Lend # If done, skip loop - -Loop: lu 8,4(4) # load s1 limb and update s1_ptr - lu 0,4(5) # load s2 limb and update s2_ptr - sfe 11,10,9 # subtract previous limbs with cy, set cy - stu 7,4(3) # - lu 9,4(4) # load s1 limb and update s1_ptr - lu 10,4(5) # load s2 limb and update s2_ptr - sfe 7,0,8 # subtract previous limbs with cy, set cy - stu 11,4(3) # - bdn Loop # decrement CTR and loop back - -Lend: sfe 11,10,9 # subtract limbs with cy, set cy - st 7,4(3) # - st 11,8(3) # - sfe 3,0,0 # load !cy into ... - sfi 3,3,0 # ... return value register - br diff --git a/sysdeps/rs6000/submul_1.s b/sysdeps/rs6000/submul_1.s deleted file mode 100644 index 41095ab001..0000000000 --- a/sysdeps/rs6000/submul_1.s +++ /dev/null @@ -1,128 +0,0 @@ -# IBM POWER __mpn_submul_1 -- Multiply a limb vector with a limb and subtract -# the result from a second limb vector. - -# Copyright (C) 1992, 1994 Free Software Foundation, Inc. - -# This file is part of the GNU MP Library. - -# The GNU MP Library is free software; you can redistribute it and/or modify -# it under the terms of the GNU Lesser General Public License as published by -# the Free Software Foundation; either version 2.1 of the License, or (at your -# option) any later version. - -# The GNU MP Library is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public -# License for more details. - -# You should have received a copy of the GNU Lesser General Public License -# along with the GNU MP Library; see the file COPYING.LIB. If not, write to -# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, -# MA 02111-1307, USA. - - -# INPUT PARAMETERS -# res_ptr r3 -# s1_ptr r4 -# size r5 -# s2_limb r6 - -# The RS/6000 has no unsigned 32x32->64 bit multiplication instruction. To -# obtain that operation, we have to use the 32x32->64 signed multiplication -# instruction, and add the appropriate compensation to the high limb of the -# result. We add the multiplicand if the multiplier has its most significant -# bit set, and we add the multiplier if the multiplicand has its most -# significant bit set. We need to preserve the carry flag between each -# iteration, so we have to compute the compensation carefully (the natural, -# srai+and doesn't work). Since the POWER architecture has a branch unit -# we can branch in zero cycles, so that's how we perform the additions. 
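__mpn_submul_1 combines the two previous ideas: it multiplies like __mpn_mul_1 and subtracts the products from the destination, folding the subtraction borrow into the carry limb. A reference sketch in C (illustrative only, not code from this patch):

#include <stddef.h>
#include <stdint.h>

/* res[i] -= s1[i] * s2_limb, propagating carries and borrows upward;
   returns the final carry limb, to be subtracted from the next higher
   limb of the destination.  */
static uint32_t
ref_submul_1 (uint32_t *res, const uint32_t *s1, size_t size,
              uint32_t s2_limb)
{
  uint32_t cy_limb = 0;
  for (size_t i = 0; i < size; i++)
    {
      uint64_t prod = (uint64_t) s1[i] * s2_limb + cy_limb;
      uint32_t lo = (uint32_t) prod;
      uint32_t r = res[i];

      res[i] = r - lo;
      /* The borrow from the subtraction joins the product's high limb;
         this cannot overflow 32 bits.  */
      cy_limb = (uint32_t) (prod >> 32) + (r < lo);
    }
  return cy_limb;
}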
- - .toc - .csect .__mpn_submul_1[PR] - .align 2 - .globl __mpn_submul_1 - .globl .__mpn_submul_1 - .csect __mpn_submul_1[DS] -__mpn_submul_1: - .long .__mpn_submul_1[PR], TOC[tc0], 0 - .csect .__mpn_submul_1[PR] -.__mpn_submul_1: - - cal 3,-4(3) - l 0,0(4) - cmpi 0,6,0 - mtctr 5 - mul 9,0,6 - srai 7,0,31 - and 7,7,6 - mfmq 11 - cax 9,9,7 - l 7,4(3) - sf 8,11,7 # add res_limb - a 11,8,11 # invert cy (r11 is junk) - blt Lneg -Lpos: bdz Lend - -Lploop: lu 0,4(4) - stu 8,4(3) - cmpi 0,0,0 - mul 10,0,6 - mfmq 0 - ae 11,0,9 # low limb + old_cy_limb + old cy - l 7,4(3) - aze 10,10 # propagate cy to new cy_limb - sf 8,11,7 # add res_limb - a 11,8,11 # invert cy (r11 is junk) - bge Lp0 - cax 10,10,6 # adjust high limb for negative limb from s1 -Lp0: bdz Lend0 - lu 0,4(4) - stu 8,4(3) - cmpi 0,0,0 - mul 9,0,6 - mfmq 0 - ae 11,0,10 - l 7,4(3) - aze 9,9 - sf 8,11,7 - a 11,8,11 # invert cy (r11 is junk) - bge Lp1 - cax 9,9,6 # adjust high limb for negative limb from s1 -Lp1: bdn Lploop - - b Lend - -Lneg: cax 9,9,0 - bdz Lend -Lnloop: lu 0,4(4) - stu 8,4(3) - cmpi 0,0,0 - mul 10,0,6 - mfmq 7 - ae 11,7,9 - l 7,4(3) - ae 10,10,0 # propagate cy to new cy_limb - sf 8,11,7 # add res_limb - a 11,8,11 # invert cy (r11 is junk) - bge Ln0 - cax 10,10,6 # adjust high limb for negative limb from s1 -Ln0: bdz Lend0 - lu 0,4(4) - stu 8,4(3) - cmpi 0,0,0 - mul 9,0,6 - mfmq 7 - ae 11,7,10 - l 7,4(3) - ae 9,9,0 # propagate cy to new cy_limb - sf 8,11,7 # add res_limb - a 11,8,11 # invert cy (r11 is junk) - bge Ln1 - cax 9,9,6 # adjust high limb for negative limb from s1 -Ln1: bdn Lnloop - b Lend - -Lend0: cal 9,0(10) -Lend: st 8,4(3) - aze 3,9 - br diff --git a/sysdeps/sparc/fpu/fraiseexcpt.c b/sysdeps/sparc/fpu/fraiseexcpt.c index 0d45ec82d2..cbb8be80ec 100644 --- a/sysdeps/sparc/fpu/fraiseexcpt.c +++ b/sysdeps/sparc/fpu/fraiseexcpt.c @@ -25,12 +25,12 @@ int __feraiseexcept (int excepts) { - static volatile double sink; static const struct { double zero, one, max, min, sixteen, pi; } c = { 0.0, 1.0, DBL_MAX, DBL_MIN, 16.0, M_PI }; + double d; /* Raise exceptions represented by EXPECTS. But we must raise only one signal at a time. It is important the if the overflow/underflow @@ -39,24 +39,44 @@ __feraiseexcept (int excepts) /* First: invalid exception. */ if ((FE_INVALID & excepts) != 0) - /* One example of a invalid operation is 0/0. */ - sink = c.zero / c.zero; + { + /* One example of a invalid operation is 0/0. */ + __asm ("" : "=e" (d) : "0" (c.zero)); + d /= c.zero; + __asm __volatile ("" : : "e" (d)); + } /* Next: division by zero. */ if ((FE_DIVBYZERO & excepts) != 0) - sink = c.one / c.zero; + { + __asm ("" : "=e" (d) : "0" (c.one)); + d /= c.zero; + __asm __volatile ("" : : "e" (d)); + } /* Next: overflow. */ if ((FE_OVERFLOW & excepts) != 0) - sink = c.max * c.max; + { + __asm ("" : "=e" (d) : "0" (c.max)); + d *= d; + __asm __volatile ("" : : "e" (d)); + } /* Next: underflow. */ if ((FE_UNDERFLOW & excepts) != 0) - sink = c.min / c.sixteen; + { + __asm ("" : "=e" (d) : "0" (c.min)); + d /= c.sixteen; + __asm __volatile ("" : : "e" (d)); + } /* Last: inexact. */ if ((FE_INEXACT & excepts) != 0) - sink = c.one / c.pi; + { + __asm ("" : "=e" (d) : "0" (c.one)); + d /= c.pi; + __asm __volatile ("" : : "e" (d)); + } /* Success. 
*/
   return 0;
diff --git a/sysdeps/sparc/sparc32/fpu/libm-test-ulps b/sysdeps/sparc/sparc32/fpu/libm-test-ulps
index 40d563971a..ccf53788a6 100644
--- a/sysdeps/sparc/sparc32/fpu/libm-test-ulps
+++ b/sysdeps/sparc/sparc32/fpu/libm-test-ulps
@@ -465,6 +465,11 @@ ifloat: 2
 ildouble: 1
 ldouble: 1

+# exp2
+Test "exp2 (10) == 1024":
+ildouble: 2
+ldouble: 2
+
 # expm1
 Test "expm1 (0.75) == 1.11700001661267466854536981983709561":
 double: 1
@@ -1192,6 +1197,10 @@ ifloat: 2
 ildouble: 1
 ldouble: 1

+Function: "exp2":
+ildouble: 2
+ldouble: 2
+
 Function: "expm1":
 double: 1
 float: 1
diff --git a/sysdeps/sparc/sparc32/sparcv9v/memcpy.S b/sysdeps/sparc/sparc32/sparcv9v/memcpy.S
new file mode 100644
index 0000000000..4c05f57bc2
--- /dev/null
+++ b/sysdeps/sparc/sparc32/sparcv9v/memcpy.S
@@ -0,0 +1,2 @@
+#define XCC icc
+#include <sparc64/sparcv9v/memcpy.S>
diff --git a/sysdeps/sparc/sparc32/sparcv9v/memset.S b/sysdeps/sparc/sparc32/sparcv9v/memset.S
new file mode 100644
index 0000000000..5e46c7489f
--- /dev/null
+++ b/sysdeps/sparc/sparc32/sparcv9v/memset.S
@@ -0,0 +1,2 @@
+#define XCC icc
+#include <sparc64/sparcv9v/memset.S>
diff --git a/sysdeps/sparc/sparc64/fpu/libm-test-ulps b/sysdeps/sparc/sparc64/fpu/libm-test-ulps
index 5719a7ca54..db5543e9eb 100644
--- a/sysdeps/sparc/sparc64/fpu/libm-test-ulps
+++ b/sysdeps/sparc/sparc64/fpu/libm-test-ulps
@@ -465,6 +465,11 @@ ifloat: 2
 ildouble: 1
 ldouble: 1

+# exp2
+Test "exp2 (10) == 1024":
+ildouble: 2
+ldouble: 2
+
 # expm1
 Test "expm1 (0.75) == 1.11700001661267466854536981983709561":
 double: 1
@@ -1192,6 +1197,10 @@ ifloat: 2
 ildouble: 1
 ldouble: 1

+Function: "exp2":
+ildouble: 2
+ldouble: 2
+
 Function: "expm1":
 double: 1
 float: 1
diff --git a/sysdeps/sparc/sparc64/sparcv9v/memcpy.S b/sysdeps/sparc/sparc64/sparcv9v/memcpy.S
new file mode 100644
index 0000000000..05c837fa25
--- /dev/null
+++ b/sysdeps/sparc/sparc64/sparcv9v/memcpy.S
@@ -0,0 +1,593 @@
+/* Copy SIZE bytes from SRC to DEST.  For SUN4V Niagara.
+   Copyright (C) 2006 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by David S. Miller (davem@davemloft.net)
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.
*/
+
+#include <sysdep.h>
+
+#define ASI_BLK_INIT_QUAD_LDD_P 0xe2
+#define ASI_P 0x80
+#define ASI_PNF 0x82
+
+#define LOAD(type,addr,dest) type##a [addr] ASI_P, dest
+#define LOAD_TWIN(addr_reg,dest0,dest1) \
+ ldda [addr_reg] ASI_BLK_INIT_QUAD_LDD_P, dest0
+
+#define STORE(type,src,addr) type src, [addr]
+#define STORE_INIT(src,addr) stxa src, [addr] %asi
+
+#ifndef XCC
+#define USE_BPR
+#define XCC xcc
+#endif
+
+ .register %g2,#scratch
+ .register %g3,#scratch
+ .register %g6,#scratch
+
+ .text
+ .align 32
+
+ENTRY(bcopy)
+ sub %o1, %o0, %o4
+ mov %o0, %g4
+ cmp %o4, %o2
+ mov %o1, %o0
+ bgeu,pt %XCC, 100f
+ mov %g4, %o1
+#ifndef USE_BPR
+ srl %o2, 0, %o2
+#endif
+ brnz,pn %o2, 220f
+ add %o0, %o2, %o0
+ retl
+ nop
+END(bcopy)
+
+ .align 32
+ENTRY(memcpy)
+100: /* %o0=dst, %o1=src, %o2=len */
+ mov %o0, %g5
+ cmp %o2, 0
+ be,pn %XCC, 85f
+218: or %o0, %o1, %o3
+ cmp %o2, 16
+ blu,a,pn %XCC, 80f
+ or %o3, %o2, %o3
+
+ /* 2 blocks (128 bytes) is the minimum we can do the block
+  * copy with.  We need to ensure that we'll iterate at least
+  * once in the block copy loop.  At worst we'll need to align
+  * the destination to a 64-byte boundary which can chew up
+  * to (64 - 1) bytes from the length before we perform the
+  * block copy loop.
+  */
+ cmp %o2, (2 * 64)
+ blu,pt %XCC, 70f
+ andcc %o3, 0x7, %g0
+
+ /* %o0: dst
+  * %o1: src
+  * %o2: len  (known to be >= 128)
+  *
+  * The block copy loops will use %o4/%o5,%g2/%g3 as
+  * temporaries while copying the data.
+  */
+
+ LOAD(prefetch, %o1, #one_read)
+ wr %g0, ASI_BLK_INIT_QUAD_LDD_P, %asi
+
+ /* Align destination on 64-byte boundary.  */
+ andcc %o0, (64 - 1), %o4
+ be,pt %XCC, 2f
+ sub %o4, 64, %o4
+ sub %g0, %o4, %o4 ! bytes to align dst
+ sub %o2, %o4, %o2
+1: subcc %o4, 1, %o4
+ LOAD(ldub, %o1, %g1)
+ STORE(stb, %g1, %o0)
+ add %o1, 1, %o1
+ bne,pt %XCC, 1b
+ add %o0, 1, %o0
+
+ /* If the source is on a 16-byte boundary we can do
+  * the direct block copy loop.  If it is 8-byte aligned
+  * we can do the 16-byte loads offset by -8 bytes and the
+  * init stores offset by one register.
+  *
+  * If the source is not even 8-byte aligned, we need to do
+  * shifting and masking (basically integer faligndata).
+  *
+  * The careful bit with init stores is that if we store
+  * to any part of the cache line we have to store the whole
+  * cacheline else we can end up with corrupt L2 cache line
+  * contents.  Since the loop works on 64-bytes of 64-byte
+  * aligned store data at a time, this is easy to ensure.
+  */
+2:
+ andcc %o1, (16 - 1), %o4
+ andn %o2, (64 - 1), %g1 ! block copy loop iterator
+ sub %o2, %g1, %o2 ! final sub-block copy bytes
+ be,pt %XCC, 50f
+ cmp %o4, 8
+ be,a,pt %XCC, 10f
+ sub %o1, 0x8, %o1
+
+ /* Neither 8-byte nor 16-byte aligned, shift and mask.
*/ + mov %g1, %o4 + and %o1, 0x7, %g1 + sll %g1, 3, %g1 + mov 64, %o3 + andn %o1, 0x7, %o1 + LOAD(ldx, %o1, %g2) + sub %o3, %g1, %o3 + sllx %g2, %g1, %g2 + +#define SWIVEL_ONE_DWORD(SRC, TMP1, TMP2, PRE_VAL, PRE_SHIFT, POST_SHIFT, DST)\ + LOAD(ldx, SRC, TMP1); \ + srlx TMP1, PRE_SHIFT, TMP2; \ + or TMP2, PRE_VAL, TMP2; \ + STORE_INIT(TMP2, DST); \ + sllx TMP1, POST_SHIFT, PRE_VAL; + +1: add %o1, 0x8, %o1 + SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x00) + add %o1, 0x8, %o1 + SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x08) + add %o1, 0x8, %o1 + SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x10) + add %o1, 0x8, %o1 + SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x18) + add %o1, 32, %o1 + LOAD(prefetch, %o1, #one_read) + sub %o1, 32 - 8, %o1 + SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x20) + add %o1, 8, %o1 + SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x28) + add %o1, 8, %o1 + SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x30) + add %o1, 8, %o1 + SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x38) + subcc %o4, 64, %o4 + bne,pt %XCC, 1b + add %o0, 64, %o0 + +#undef SWIVEL_ONE_DWORD + + srl %g1, 3, %g1 + ba,pt %XCC, 60f + add %o1, %g1, %o1 + +10: /* Destination is 64-byte aligned, source was only 8-byte + * aligned but it has been subtracted by 8 and we perform + * one twin load ahead, then add 8 back into source when + * we finish the loop. + */ + LOAD_TWIN(%o1, %o4, %o5) +1: add %o1, 16, %o1 + LOAD_TWIN(%o1, %g2, %g3) + add %o1, 16 + 32, %o1 + LOAD(prefetch, %o1, #one_read) + sub %o1, 32, %o1 + STORE_INIT(%o5, %o0 + 0x00) ! initializes cache line + STORE_INIT(%g2, %o0 + 0x08) + LOAD_TWIN(%o1, %o4, %o5) + add %o1, 16, %o1 + STORE_INIT(%g3, %o0 + 0x10) + STORE_INIT(%o4, %o0 + 0x18) + LOAD_TWIN(%o1, %g2, %g3) + add %o1, 16, %o1 + STORE_INIT(%o5, %o0 + 0x20) + STORE_INIT(%g2, %o0 + 0x28) + LOAD_TWIN(%o1, %o4, %o5) + STORE_INIT(%g3, %o0 + 0x30) + STORE_INIT(%o4, %o0 + 0x38) + subcc %g1, 64, %g1 + bne,pt %XCC, 1b + add %o0, 64, %o0 + + ba,pt %XCC, 60f + add %o1, 0x8, %o1 + +50: /* Destination is 64-byte aligned, and source is 16-byte + * aligned. + */ +1: LOAD_TWIN(%o1, %o4, %o5) + add %o1, 16, %o1 + LOAD_TWIN(%o1, %g2, %g3) + add %o1, 16 + 32, %o1 + LOAD(prefetch, %o1, #one_read) + sub %o1, 32, %o1 + STORE_INIT(%o4, %o0 + 0x00) ! initializes cache line + STORE_INIT(%o5, %o0 + 0x08) + LOAD_TWIN(%o1, %o4, %o5) + add %o1, 16, %o1 + STORE_INIT(%g2, %o0 + 0x10) + STORE_INIT(%g3, %o0 + 0x18) + LOAD_TWIN(%o1, %g2, %g3) + add %o1, 16, %o1 + STORE_INIT(%o4, %o0 + 0x20) + STORE_INIT(%o5, %o0 + 0x28) + STORE_INIT(%g2, %o0 + 0x30) + STORE_INIT(%g3, %o0 + 0x38) + subcc %g1, 64, %g1 + bne,pt %XCC, 1b + add %o0, 64, %o0 + /* fall through */ + +60: + /* %o2 contains any final bytes still needed to be copied + * over. If anything is left, we copy it one byte at a time. 
+ */ + wr %g0, ASI_PNF, %asi + brz,pt %o2, 85f + sub %o0, %o1, %o3 + ba,a,pt %XCC, 90f + + .align 64 +70: /* 16 < len <= 64 */ + bne,pn %XCC, 75f + sub %o0, %o1, %o3 + +72: + andn %o2, 0xf, %o4 + and %o2, 0xf, %o2 +1: subcc %o4, 0x10, %o4 + LOAD(ldx, %o1, %o5) + add %o1, 0x08, %o1 + LOAD(ldx, %o1, %g1) + sub %o1, 0x08, %o1 + STORE(stx, %o5, %o1 + %o3) + add %o1, 0x8, %o1 + STORE(stx, %g1, %o1 + %o3) + bgu,pt %XCC, 1b + add %o1, 0x8, %o1 +73: andcc %o2, 0x8, %g0 + be,pt %XCC, 1f + nop + sub %o2, 0x8, %o2 + LOAD(ldx, %o1, %o5) + STORE(stx, %o5, %o1 + %o3) + add %o1, 0x8, %o1 +1: andcc %o2, 0x4, %g0 + be,pt %XCC, 1f + nop + sub %o2, 0x4, %o2 + LOAD(lduw, %o1, %o5) + STORE(stw, %o5, %o1 + %o3) + add %o1, 0x4, %o1 +1: cmp %o2, 0 + be,pt %XCC, 85f + nop + ba,pt %XCC, 90f + nop + +75: + andcc %o0, 0x7, %g1 + sub %g1, 0x8, %g1 + be,pn %icc, 2f + sub %g0, %g1, %g1 + sub %o2, %g1, %o2 + +1: subcc %g1, 1, %g1 + LOAD(ldub, %o1, %o5) + STORE(stb, %o5, %o1 + %o3) + bgu,pt %icc, 1b + add %o1, 1, %o1 + +2: add %o1, %o3, %o0 + andcc %o1, 0x7, %g1 + bne,pt %icc, 8f + sll %g1, 3, %g1 + + cmp %o2, 16 + bgeu,pt %icc, 72b + nop + ba,a,pt %XCC, 73b + +8: mov 64, %o3 + andn %o1, 0x7, %o1 + LOAD(ldx, %o1, %g2) + sub %o3, %g1, %o3 + andn %o2, 0x7, %o4 + sllx %g2, %g1, %g2 +1: add %o1, 0x8, %o1 + LOAD(ldx, %o1, %g3) + subcc %o4, 0x8, %o4 + srlx %g3, %o3, %o5 + or %o5, %g2, %o5 + STORE(stx, %o5, %o0) + add %o0, 0x8, %o0 + bgu,pt %icc, 1b + sllx %g3, %g1, %g2 + + srl %g1, 3, %g1 + andcc %o2, 0x7, %o2 + be,pn %icc, 85f + add %o1, %g1, %o1 + ba,pt %XCC, 90f + sub %o0, %o1, %o3 + + .align 64 +80: /* 0 < len <= 16 */ + andcc %o3, 0x3, %g0 + bne,pn %XCC, 90f + sub %o0, %o1, %o3 + +1: + subcc %o2, 4, %o2 + LOAD(lduw, %o1, %g1) + STORE(stw, %g1, %o1 + %o3) + bgu,pt %XCC, 1b + add %o1, 4, %o1 + +85: retl + mov %g5, %o0 + + .align 32 +90: + subcc %o2, 1, %o2 + LOAD(ldub, %o1, %g1) + STORE(stb, %g1, %o1 + %o3) + bgu,pt %XCC, 90b + add %o1, 1, %o1 + retl + mov %g5, %o0 + +END(memcpy) + +#define RMOVE_BIGCHUNK(src, dst, offset, t0, t1, t2, t3) \ + ldx [%src - offset - 0x20], %t0; \ + ldx [%src - offset - 0x18], %t1; \ + ldx [%src - offset - 0x10], %t2; \ + ldx [%src - offset - 0x08], %t3; \ + stw %t0, [%dst - offset - 0x1c]; \ + srlx %t0, 32, %t0; \ + stw %t0, [%dst - offset - 0x20]; \ + stw %t1, [%dst - offset - 0x14]; \ + srlx %t1, 32, %t1; \ + stw %t1, [%dst - offset - 0x18]; \ + stw %t2, [%dst - offset - 0x0c]; \ + srlx %t2, 32, %t2; \ + stw %t2, [%dst - offset - 0x10]; \ + stw %t3, [%dst - offset - 0x04]; \ + srlx %t3, 32, %t3; \ + stw %t3, [%dst - offset - 0x08]; + +#define RMOVE_BIGALIGNCHUNK(src, dst, offset, t0, t1, t2, t3) \ + ldx [%src - offset - 0x20], %t0; \ + ldx [%src - offset - 0x18], %t1; \ + ldx [%src - offset - 0x10], %t2; \ + ldx [%src - offset - 0x08], %t3; \ + stx %t0, [%dst - offset - 0x20]; \ + stx %t1, [%dst - offset - 0x18]; \ + stx %t2, [%dst - offset - 0x10]; \ + stx %t3, [%dst - offset - 0x08]; \ + ldx [%src - offset - 0x40], %t0; \ + ldx [%src - offset - 0x38], %t1; \ + ldx [%src - offset - 0x30], %t2; \ + ldx [%src - offset - 0x28], %t3; \ + stx %t0, [%dst - offset - 0x40]; \ + stx %t1, [%dst - offset - 0x38]; \ + stx %t2, [%dst - offset - 0x30]; \ + stx %t3, [%dst - offset - 0x28]; + +#define RMOVE_LASTCHUNK(src, dst, offset, t0, t1, t2, t3) \ + ldx [%src + offset + 0x00], %t0; \ + ldx [%src + offset + 0x08], %t1; \ + stw %t0, [%dst + offset + 0x04]; \ + srlx %t0, 32, %t2; \ + stw %t2, [%dst + offset + 0x00]; \ + stw %t1, [%dst + offset + 0x0c]; \ + srlx %t1, 32, %t3; \ + stw %t3, [%dst + offset + 
0x08]; + +#define RMOVE_LASTALIGNCHUNK(src, dst, offset, t0, t1) \ + ldx [%src + offset + 0x00], %t0; \ + ldx [%src + offset + 0x08], %t1; \ + stx %t0, [%dst + offset + 0x00]; \ + stx %t1, [%dst + offset + 0x08]; + + .align 32 +228: andcc %o2, 1, %g0 + be,pt %icc, 2f+4 +1: ldub [%o1 - 1], %o5 + sub %o1, 1, %o1 + sub %o0, 1, %o0 + subcc %o2, 1, %o2 + be,pn %xcc, 229f + stb %o5, [%o0] +2: ldub [%o1 - 1], %o5 + sub %o0, 2, %o0 + ldub [%o1 - 2], %g5 + sub %o1, 2, %o1 + subcc %o2, 2, %o2 + stb %o5, [%o0 + 1] + bne,pt %xcc, 2b + stb %g5, [%o0] +229: retl + mov %g4, %o0 +out: retl + mov %g5, %o0 + + .align 32 +ENTRY(memmove) + mov %o0, %g5 +#ifndef USE_BPR + srl %o2, 0, %o2 +#endif + brz,pn %o2, out + sub %o0, %o1, %o4 + cmp %o4, %o2 + bgeu,pt %XCC, 218b + mov %o0, %g4 + add %o0, %o2, %o0 +220: add %o1, %o2, %o1 + cmp %o2, 15 + bleu,pn %xcc, 228b + andcc %o0, 7, %g2 + sub %o0, %o1, %g5 + andcc %g5, 3, %o5 + bne,pn %xcc, 232f + andcc %o1, 3, %g0 + be,a,pt %xcc, 236f + andcc %o1, 4, %g0 + andcc %o1, 1, %g0 + be,pn %xcc, 4f + andcc %o1, 2, %g0 + ldub [%o1 - 1], %g2 + sub %o1, 1, %o1 + sub %o0, 1, %o0 + sub %o2, 1, %o2 + be,pn %xcc, 5f + stb %g2, [%o0] +4: lduh [%o1 - 2], %g2 + sub %o1, 2, %o1 + sub %o0, 2, %o0 + sub %o2, 2, %o2 + sth %g2, [%o0] +5: andcc %o1, 4, %g0 +236: be,a,pn %xcc, 2f + andcc %o2, -128, %g6 + lduw [%o1 - 4], %g5 + sub %o1, 4, %o1 + sub %o0, 4, %o0 + sub %o2, 4, %o2 + stw %g5, [%o0] + andcc %o2, -128, %g6 +2: be,pn %xcc, 235f + andcc %o0, 4, %g0 + be,pn %xcc, 282f + 4 +5: RMOVE_BIGCHUNK(o1, o0, 0x00, g1, g3, g5, o5) + RMOVE_BIGCHUNK(o1, o0, 0x20, g1, g3, g5, o5) + RMOVE_BIGCHUNK(o1, o0, 0x40, g1, g3, g5, o5) + RMOVE_BIGCHUNK(o1, o0, 0x60, g1, g3, g5, o5) + subcc %g6, 128, %g6 + sub %o1, 128, %o1 + bne,pt %xcc, 5b + sub %o0, 128, %o0 +235: andcc %o2, 0x70, %g6 +41: be,pn %xcc, 280f + andcc %o2, 8, %g0 + +279: rd %pc, %o5 + sll %g6, 1, %g5 + sub %o1, %g6, %o1 + sub %o5, %g5, %o5 + jmpl %o5 + %lo(280f - 279b), %g0 + sub %o0, %g6, %o0 + RMOVE_LASTCHUNK(o1, o0, 0x60, g2, g3, g5, o5) + RMOVE_LASTCHUNK(o1, o0, 0x50, g2, g3, g5, o5) + RMOVE_LASTCHUNK(o1, o0, 0x40, g2, g3, g5, o5) + RMOVE_LASTCHUNK(o1, o0, 0x30, g2, g3, g5, o5) + RMOVE_LASTCHUNK(o1, o0, 0x20, g2, g3, g5, o5) + RMOVE_LASTCHUNK(o1, o0, 0x10, g2, g3, g5, o5) + RMOVE_LASTCHUNK(o1, o0, 0x00, g2, g3, g5, o5) +280: be,pt %xcc, 281f + andcc %o2, 4, %g0 + ldx [%o1 - 8], %g2 + sub %o0, 8, %o0 + stw %g2, [%o0 + 4] + sub %o1, 8, %o1 + srlx %g2, 32, %g2 + stw %g2, [%o0] +281: be,pt %xcc, 1f + andcc %o2, 2, %g0 + lduw [%o1 - 4], %g2 + sub %o1, 4, %o1 + stw %g2, [%o0 - 4] + sub %o0, 4, %o0 +1: be,pt %xcc, 1f + andcc %o2, 1, %g0 + lduh [%o1 - 2], %g2 + sub %o1, 2, %o1 + sth %g2, [%o0 - 2] + sub %o0, 2, %o0 +1: be,pt %xcc, 211f + nop + ldub [%o1 - 1], %g2 + stb %g2, [%o0 - 1] +211: retl + mov %g4, %o0 + +282: RMOVE_BIGALIGNCHUNK(o1, o0, 0x00, g1, g3, g5, o5) + RMOVE_BIGALIGNCHUNK(o1, o0, 0x40, g1, g3, g5, o5) + subcc %g6, 128, %g6 + sub %o1, 128, %o1 + bne,pt %xcc, 282b + sub %o0, 128, %o0 + andcc %o2, 0x70, %g6 + be,pn %xcc, 284f + andcc %o2, 8, %g0 + +283: rd %pc, %o5 + sub %o1, %g6, %o1 + sub %o5, %g6, %o5 + jmpl %o5 + %lo(284f - 283b), %g0 + sub %o0, %g6, %o0 + RMOVE_LASTALIGNCHUNK(o1, o0, 0x60, g2, g3) + RMOVE_LASTALIGNCHUNK(o1, o0, 0x50, g2, g3) + RMOVE_LASTALIGNCHUNK(o1, o0, 0x40, g2, g3) + RMOVE_LASTALIGNCHUNK(o1, o0, 0x30, g2, g3) + RMOVE_LASTALIGNCHUNK(o1, o0, 0x20, g2, g3) + RMOVE_LASTALIGNCHUNK(o1, o0, 0x10, g2, g3) + RMOVE_LASTALIGNCHUNK(o1, o0, 0x00, g2, g3) +284: be,pt %xcc, 285f + andcc %o2, 4, %g0 + ldx [%o1 - 8], %g2 + 
sub %o0, 8, %o0 + sub %o1, 8, %o1 + stx %g2, [%o0] +285: be,pt %xcc, 1f + andcc %o2, 2, %g0 + lduw [%o1 - 4], %g2 + sub %o0, 4, %o0 + sub %o1, 4, %o1 + stw %g2, [%o0] +1: be,pt %xcc, 1f + andcc %o2, 1, %g0 + lduh [%o1 - 2], %g2 + sub %o0, 2, %o0 + sub %o1, 2, %o1 + sth %g2, [%o0] +1: be,pt %xcc, 1f + nop + ldub [%o1 - 1], %g2 + stb %g2, [%o0 - 1] +1: retl + mov %g4, %o0 + +232: ldub [%o1 - 1], %g5 + sub %o1, 1, %o1 + sub %o0, 1, %o0 + subcc %o2, 1, %o2 + bne,pt %xcc, 232b + stb %g5, [%o0] +234: retl + mov %g4, %o0 +END(memmove) + +#ifdef USE_BPR +weak_alias (memcpy, __align_cpy_1) +weak_alias (memcpy, __align_cpy_2) +weak_alias (memcpy, __align_cpy_4) +weak_alias (memcpy, __align_cpy_8) +weak_alias (memcpy, __align_cpy_16) +#endif +libc_hidden_builtin_def (memcpy) +libc_hidden_builtin_def (memmove) diff --git a/sysdeps/sparc/sparc64/sparcv9v/memset.S b/sysdeps/sparc/sparc64/sparcv9v/memset.S new file mode 100644 index 0000000000..7a51ef77dc --- /dev/null +++ b/sysdeps/sparc/sparc64/sparcv9v/memset.S @@ -0,0 +1,127 @@ +/* Set a block of memory to some byte value. For SUN4V Niagara. + Copyright (C) 2006 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by David S. Miller (davem@davemloft.net) + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. 
*/
+
+#include <sysdep.h>
+
+#define ASI_BLK_INIT_QUAD_LDD_P 0xe2
+#define ASI_P 0x80
+#define ASI_PNF 0x82
+
+#ifndef XCC
+#define USE_BPR
+#define XCC xcc
+#endif
+
+ .register %g2,#scratch
+
+ .text
+ .align 32
+
+ENTRY(memset)
+ /* %o0=buf, %o1=pat, %o2=len */
+ and %o1, 0xff, %o3
+ mov %o2, %o1
+ sllx %o3, 8, %g1
+ or %g1, %o3, %o2
+ sllx %o2, 16, %g1
+ or %g1, %o2, %o2
+ sllx %o2, 32, %g1
+ ba,pt %XCC, 1f
+ or %g1, %o2, %o2
+
+ENTRY(__bzero)
+ clr %o2
+1: brz,pn %o1, 90f
+ mov %o0, %o3
+
+ wr %g0, ASI_P, %asi
+
+ cmp %o1, 15
+ bl,pn %icc, 70f
+ andcc %o0, 0x7, %g1
+ be,pt %XCC, 2f
+ mov 8, %g2
+ sub %g2, %g1, %g1
+ sub %o1, %g1, %o1
+1: stba %o2, [%o0 + 0x00] %asi
+ subcc %g1, 1, %g1
+ bne,pt %XCC, 1b
+ add %o0, 1, %o0
+2: cmp %o1, 128
+ bl,pn %icc, 60f
+ andcc %o0, (64 - 1), %g1
+ be,pt %XCC, 40f
+ mov 64, %g2
+ sub %g2, %g1, %g1
+ sub %o1, %g1, %o1
+1: stxa %o2, [%o0 + 0x00] %asi
+ subcc %g1, 8, %g1
+ bne,pt %XCC, 1b
+ add %o0, 8, %o0
+
+40:
+ wr %g0, ASI_BLK_INIT_QUAD_LDD_P, %asi
+ andn %o1, (64 - 1), %g1
+ sub %o1, %g1, %o1
+50:
+ stxa %o2, [%o0 + 0x00] %asi
+ stxa %o2, [%o0 + 0x08] %asi
+ stxa %o2, [%o0 + 0x10] %asi
+ stxa %o2, [%o0 + 0x18] %asi
+ stxa %o2, [%o0 + 0x20] %asi
+ stxa %o2, [%o0 + 0x28] %asi
+ stxa %o2, [%o0 + 0x30] %asi
+ stxa %o2, [%o0 + 0x38] %asi
+ subcc %g1, 64, %g1
+ bne,pt %XCC, 50b
+ add %o0, 64, %o0
+
+ wr %g0, ASI_P, %asi
+ brz,pn %o1, 80f
+60:
+ andncc %o1, 0x7, %g1
+ be,pn %XCC, 2f
+ sub %o1, %g1, %o1
+1: stxa %o2, [%o0 + 0x00] %asi
+ subcc %g1, 8, %g1
+ bne,pt %XCC, 1b
+ add %o0, 8, %o0
+2: brz,pt %o1, 80f
+ nop
+
+70:
+1: stba %o2, [%o0 + 0x00] %asi
+ subcc %o1, 1, %o1
+ bne,pt %icc, 1b
+ add %o0, 1, %o0
+
+ /* fallthrough */
+
+80:
+ wr %g0, ASI_PNF, %asi
+
+90:
+ retl
+ mov %o3, %o0
+END(__bzero)
+END(memset)
+
+libc_hidden_builtin_def (memset)
+weak_alias (__bzero, bzero)
diff --git a/sysdeps/unix/sysv/linux/i386/fxstatat.c b/sysdeps/unix/sysv/linux/i386/fxstatat.c
index b077435553..94f6e81186 100644
--- a/sysdeps/unix/sysv/linux/i386/fxstatat.c
+++ b/sysdeps/unix/sysv/linux/i386/fxstatat.c
@@ -172,5 +172,5 @@ libc_hidden_def (__fxstatat)
 #ifdef XSTAT_IS_XSTAT64
 # undef __fxstatat64
 strong_alias (__fxstatat, __fxstatat64);
-libc_hidden_def (__fxstatat64)
+libc_hidden_ver (__fxstatat, __fxstatat64)
 #endif
diff --git a/sysdeps/unix/sysv/linux/s390/s390-64/sysdep.h b/sysdeps/unix/sysv/linux/s390/s390-64/sysdep.h
index fc80c9ff86..9ddec8e041 100644
--- a/sysdeps/unix/sysv/linux/s390/s390-64/sysdep.h
+++ b/sysdeps/unix/sysv/linux/s390/s390-64/sysdep.h
@@ -268,7 +268,7 @@ register unsigned long gpr6 asm ("6") = (unsigned long)(arg5);

 #define DECLARGS_6(arg1, arg2, arg3, arg4, arg5, arg6) \
 	DECLARGS_5(arg1, arg2, arg3, arg4, arg5) \
-	register unsigned long gpr6 asm ("7") = (unsigned long)(arg6);
+	register unsigned long gpr7 asm ("7") = (unsigned long)(arg6);

 #define ASMFMT_0
 #define ASMFMT_1 , "0" (gpr2)
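A note on the final s390-64 hunk: DECLARGS_6 expands after DECLARGS_5, which has already declared gpr6 bound to hardware register 6, so the sixth-argument variable had to be renamed gpr7 to match its asm("7") binding rather than redeclaring gpr6. The pattern in play is GCC's explicit register variables; here is a simplified, self-contained sketch (a made-up two-argument helper, not the glibc macros; it assumes s390x Linux, where "svc 0" takes the syscall number in %r1 and arguments starting at %r2):

static inline long
my_syscall2 (long nr, long arg1, long arg2)
{
  register unsigned long gpr1 __asm__ ("1") = nr;    /* syscall number */
  register unsigned long gpr2 __asm__ ("2") = arg1;  /* first argument */
  register unsigned long gpr3 __asm__ ("3") = arg2;  /* second argument */

  /* Each local is pinned to the register the kernel ABI expects; the
     variable names are arbitrary but must be distinct, which is exactly
     what the gpr6 -> gpr7 rename in the hunk above restores.  */
  __asm__ __volatile__ ("svc 0"
                        : "+d" (gpr2)
                        : "d" (gpr1), "d" (gpr3)
                        : "memory");
  return (long) gpr2;   /* result comes back in %r2 */
}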