Sat Oct 14 02:52:36 1995 Ulrich Drepper <drepper@ipd.info.uni-karlsruhe.de>

* malloc/malloc.c (_malloc_internal): Performance fix. Move if statement out of loop. * stdio/_itoa.c, stdio/_itoa.h: Complete rewrite. Much faster implementation using GMP functions. Contributed by Torbjorn Granlund and Ulrich Drepper. * stdio/test_rdwr.c: Include <errno.h>. * sysdeps/i386/i586/Implies: New file. New highly optimized string functions for i[345]86. * sysdeps/i386/memchr.S, sysdeps/i386/memcmp.S: New files. * sysdeps/i386/stpcpy.S, sysdeps/i386/stpncpy.S: New files. * sysdeps/i386/strchr.S, sysdeps/i386/strcspn.S: New files. * sysdeps/i386/strpbrk.S, sysdeps/i386/strrchr.S: New files. * sysdeps/i386/strspn.S, sysdeps/i386/i486/strcat.S: New files. * sysdeps/i386/i486/strlen.S, sysdeps/i386/i586/strchr.S: New files. * sysdeps/i386/i586/strlen.S: New file. * sysdeps/i386/memchr.c: Removed. There is now an assembler version. * sysdeps/i386/i586/memcopy.h (WORD_COPY_BWD): Parameters did not correspond to used values. * sysdeps/unix/sysv/linux/nfs/nfs.h: New file. Simply a wrapper around a kernel header file. * sysdeps/unix/sysv/linux/Dist: Add it. * sysdeps/unix/sysv/linux/Makefile [$(subdir)=sunrpc] (headers): Likewise. * sysdeps/unix/sysv/linux/local_lim.h: Rewrite. Instead of defining ourself we use a kernel header file. * sysdeps/unix/sysv/linux/i386/sysdep.h (DO_CALL): Optimize system call handler for i586. * sysdeps/unix/sysv/linux/sys/param.h: Add copyright and clean up. Sat Oct 14 02:52:36 1995 Ulrich Drepper <drepper@ipd.info.uni-karlsruhe.de> * malloc/malloc.c (_malloc_internal): Performance fix. Move if statement out of loop. * stdio/_itoa.c, stdio/_itoa.h: Complete rewrite. Much faster implementation using GMP functions. Contributed by Torbjorn Granlund and Ulrich Drepper. * stdio/test_rdwr.c: Include <errno.h>. * sysdeps/i386/i586/Implies: New file. New highly optimized string functions for i[345]86. * sysdeps/i386/memchr.S, sysdeps/i386/memcmp.S: New files. * sysdeps/i386/stpcpy.S, sysdeps/i386/stpncpy.S: New files. 
* sysdeps/i386/strchr.S, sysdeps/i386/strcspn.S: New files. * sysdeps/i386/strpbrk.S, sysdeps/i386/strrchr.S: New files. * sysdeps/i386/strspn.S, sysdeps/i386/i486/strcat.S: New files. * sysdeps/i386/i486/strlen.S, sysdeps/i386/i586/strchr.S: New files. * sysdeps/i386/i586/strlen.S: New file. * sysdeps/i386/memchr.c: Removed. There is now an assembler version. * sysdeps/i386/i586/memcopy.h (WORD_COPY_BWD): Parameters did not correspond to used values. * sysdeps/unix/sysv/linux/nfs/nfs.h: New file. Simply a wrapper around a kernel header file. * sysdeps/unix/sysv/linux/Dist: Add it. * sysdeps/unix/sysv/linux/Makefile [$(subdir)=sunrpc] (headers): Likewise. * sysdeps/unix/sysv/linux/local_lim.h: Rewrite. Instead of defining ourself we use a kernel header file. * sysdeps/unix/sysv/linux/i386/sysdep.h (DO_CALL): Optimize system call handler for i586. * sysdeps/unix/sysv/linux/sys/param.h: Add copyright and clean up.
author: Roland McGrath <roland@gnu.org> 1995-10-16 01:37:51 +0000
committer: Roland McGrath <roland@gnu.org> 1995-10-16 01:37:51 +0000
commit: 8f5ca04bc7fd53741d80117df992995ace8f6d2d (patch)
tree: e39c13fc198b22ec55647259a8080051988e8c69 /sysdeps
parent: 5d82cf5c55f56ae10d3b0a205d1fcc7de1cf56a0 (diff)
97 files changed, 9062 insertions, 138 deletions
diff --git a/sysdeps/alpha/add_n.s b/sysdeps/alpha/add_n.s
new file mode 100644
index 0000000000..e1ad4600f5
--- /dev/null
+++ b/sysdeps/alpha/add_n.s
@@ -0,0 +1,119 @@
+ # Alpha __mpn_add_n -- Add two limb vectors of the same length > 0 and
+ # store sum in a third limb vector.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	$16
+ # s1_ptr	$17
+ # s2_ptr	$18
+ # size		$19
+
+	.set	noreorder
+	.set	noat
+.text
+	.align	3
+	.globl	__mpn_add_n
+	.ent	__mpn_add_n
+__mpn_add_n:
+	.frame	$30,0,$26,0
+
+	ldq	$3,0($17)	# $3 = first s1 limb
+	ldq	$4,0($18)	# $4 = first s2 limb
+
+	subq	$19,1,$19	# size-1: the last limb is handled at .Lend
+	and	$19,4-1,$2	# number of limbs in first loop
+	bis	$31,$31,$0	# $0 = carry-in, initially 0
+	beq	$2,.L0		# if multiple of 4 limbs, skip first loop
+
+	subq	$19,$2,$19	# remaining count for the 4-way loop
+
+.Loop0:	subq	$2,1,$2
+	ldq	$5,8($17)	# preload next s1 limb
+	addq	$4,$0,$4	# s2 limb + carry-in
+	ldq	$6,8($18)	# preload next s2 limb
+	cmpult	$4,$0,$1	# $1 = carry out of that add
+	addq	$3,$4,$4	# + s1 limb
+	cmpult	$4,$3,$0	# $0 = carry out of this add
+	stq	$4,0($16)
+	or	$0,$1,$0	# combined carry for the next limb
+
+	addq	$17,8,$17
+	addq	$18,8,$18
+	bis	$5,$5,$3	# move preloaded limbs into $3/$4
+	bis	$6,$6,$4
+	addq	$16,8,$16
+	bne	$2,.Loop0
+
+.L0:	beq	$19,.Lend	# nothing left for the 4-way loop
+
+	.align	3
+.Loop:	subq	$19,4,$19	# four limbs per iteration
+
+	ldq	$5,8($17)
+	addq	$4,$0,$4	# limb 0: same add/carry pattern as .Loop0
+	ldq	$6,8($18)
+	cmpult	$4,$0,$1
+	addq	$3,$4,$4
+	cmpult	$4,$3,$0
+	stq	$4,0($16)
+	or	$0,$1,$0
+
+	ldq	$3,16($17)
+	addq	$6,$0,$6	# limb 1 (operands in $5/$6)
+	ldq	$4,16($18)
+	cmpult	$6,$0,$1
+	addq	$5,$6,$6
+	cmpult	$6,$5,$0
+	stq	$6,8($16)
+	or	$0,$1,$0
+
+	ldq	$5,24($17)
+	addq	$4,$0,$4	# limb 2 (operands in $3/$4)
+	ldq	$6,24($18)
+	cmpult	$4,$0,$1
+	addq	$3,$4,$4
+	cmpult	$4,$3,$0
+	stq	$4,16($16)
+	or	$0,$1,$0
+
+	ldq	$3,32($17)	# preload operands for the next iteration / .Lend
+	addq	$6,$0,$6	# limb 3 (operands in $5/$6)
+	ldq	$4,32($18)
+	cmpult	$6,$0,$1
+	addq	$5,$6,$6
+	cmpult	$6,$5,$0
+	stq	$6,24($16)
+	or	$0,$1,$0
+
+	addq	$17,32,$17
+	addq	$18,32,$18
+	addq	$16,32,$16
+	bne	$19,.Loop
+
+.Lend:	addq	$4,$0,$4	# last limb, already loaded in $3/$4
+	cmpult	$4,$0,$1
+	addq	$3,$4,$4
+	cmpult	$4,$3,$0
+	stq	$4,0($16)
+	or	$0,$1,$0	# $0 = final carry, left in $0 on return
+	ret	$31,($26),1
+
+	.end	__mpn_add_n
diff --git a/sysdeps/alpha/addmul_1.s b/sysdeps/alpha/addmul_1.s
new file mode 100644
index 0000000000..46d277df6e
--- /dev/null
+++ b/sysdeps/alpha/addmul_1.s
@@ -0,0 +1,100 @@
+ # Alpha 21064 __mpn_addmul_1 -- Multiply a limb vector with a limb and add
+ # the result to a second limb vector.
+
+ # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	r16
+ # s1_ptr	r17
+ # size		r18
+ # s2_limb	r19
+
+ # This code runs at 42 cycles/limb on the 21064.
+
+ # To improve performance for long multiplications, we would use
+ # 'fetch' for S1 and 'fetch_m' for RES.  It's not obvious how to use
+ # these instructions without slowing down the general code: 1. We can
+ # only have two prefetches in operation at any time in the Alpha
+ # architecture.  2. There will seldom be any special alignment
+ # between RES_PTR and S1_PTR.  Maybe we can simply divide the current
+ # loop into an inner and outer loop, having the inner loop handle
+ # exactly one prefetch block?
+
+	.set	noreorder
+	.set	noat
+.text
+	.align	3
+	.globl	__mpn_addmul_1
+	.ent	__mpn_addmul_1 2
+__mpn_addmul_1:
+	.frame	$30,0,$26
+
+	ldq	$2,0($17)	# $2 = s1_limb
+	addq	$17,8,$17	# s1_ptr++
+	subq	$18,1,$18	# size--
+	mulq	$2,$19,$3	# $3 = prod_low
+	ldq	$5,0($16)	# $5 = *res_ptr
+	umulh	$2,$19,$0	# $0 = prod_high
+	beq	$18,Lend1	# jump if size was == 1
+	ldq	$2,0($17)	# $2 = s1_limb
+	addq	$17,8,$17	# s1_ptr++
+	subq	$18,1,$18	# size--
+	addq	$5,$3,$3	# $3 = *res_ptr + prod_low
+	cmpult	$3,$5,$4	# $4 = carry from that add
+	stq	$3,0($16)
+	addq	$16,8,$16	# res_ptr++
+	beq	$18,Lend2	# jump if size was == 2
+
+	.align	3
+Loop:	mulq	$2,$19,$3	# $3 = prod_low
+	ldq	$5,0($16)	# $5 = *res_ptr
+	addq	$4,$0,$0	# cy_limb = cy_limb + 'cy'
+	subq	$18,1,$18	# size--
+	umulh	$2,$19,$4	# $4 = cy_limb
+	ldq	$2,0($17)	# $2 = s1_limb
+	addq	$17,8,$17	# s1_ptr++
+	addq	$3,$0,$3	# $3 = cy_limb + prod_low
+	cmpult	$3,$0,$0	# $0 = carry from (cy_limb + prod_low)
+	addq	$5,$3,$3	# add in *res_ptr
+	cmpult	$3,$5,$5	# $5 = carry from adding *res_ptr
+	stq	$3,0($16)
+	addq	$16,8,$16	# res_ptr++
+	addq	$5,$0,$0	# combine carries
+	bne	$18,Loop
+
+Lend2:	mulq	$2,$19,$3	# $3 = prod_low
+	ldq	$5,0($16)	# $5 = *res_ptr
+	addq	$4,$0,$0	# cy_limb = cy_limb + 'cy'
+	umulh	$2,$19,$4	# $4 = cy_limb
+	addq	$3,$0,$3	# $3 = cy_limb + prod_low
+	cmpult	$3,$0,$0	# $0 = carry from (cy_limb + prod_low)
+	addq	$5,$3,$3	# add in *res_ptr
+	cmpult	$3,$5,$5	# $5 = carry from adding *res_ptr
+	stq	$3,0($16)
+	addq	$5,$0,$0	# combine carries
+	addq	$4,$0,$0	# cy_limb = prod_high + cy
+	ret	$31,($26),1
+Lend1:	addq	$5,$3,$3	# size == 1: $3 = *res_ptr + prod_low
+	cmpult	$3,$5,$5	# $5 = carry from that add
+	stq	$3,0($16)
+	addq	$0,$5,$0	# $0 = prod_high + carry
+	ret	$31,($26),1
+
+	.end	__mpn_addmul_1
diff --git a/sysdeps/alpha/alphaev5/add_n.s b/sysdeps/alpha/alphaev5/add_n.s
new file mode 100644
index 0000000000..2aaf041774
--- /dev/null
+++ b/sysdeps/alpha/alphaev5/add_n.s
@@ -0,0 +1,118 @@
+ # Alpha __mpn_add_n -- Add two limb vectors of the same length > 0 and
+ # store sum in a third limb vector.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	$16
+ # s1_ptr	$17
+ # s2_ptr	$18
+ # size		$19
+
+	.set	noreorder
+	.set	noat
+.text
+	.align	3
+	.globl	__mpn_add_n
+	.ent	__mpn_add_n
+__mpn_add_n:
+	.frame	$30,0,$26,0
+
+	ldq	$3,0($17)	# $3 = first s1 limb
+	ldq	$4,0($18)	# $4 = first s2 limb
+
+	subq	$19,1,$19	# size-1: the last limb is handled at .Lend
+	and	$19,4-1,$2	# number of limbs in first loop
+	bis	$31,$31,$0	# $0 = carry-in, initially 0
+	beq	$2,.L0		# if multiple of 4 limbs, skip first loop
+
+	subq	$19,$2,$19	# remaining count for the 4-way loop
+
+.Loop0:	subq	$2,1,$2
+	ldq	$5,8($17)	# preload next s1 limb
+	addq	$4,$0,$4	# s2 limb + carry-in
+	ldq	$6,8($18)	# preload next s2 limb
+	cmpult	$4,$0,$1	# $1 = carry out of that add
+	addq	$3,$4,$4	# + s1 limb
+	cmpult	$4,$3,$0	# $0 = carry out of this add
+	stq	$4,0($16)
+	or	$0,$1,$0	# combined carry for the next limb
+
+	addq	$17,8,$17
+	addq	$18,8,$18
+	bis	$5,$5,$3	# move preloaded limbs into $3/$4
+	bis	$6,$6,$4
+	addq	$16,8,$16
+	bne	$2,.Loop0
+
+.L0:	beq	$19,.Lend	# nothing left for the 4-way loop
+
+	.align	4
+.Loop:	subq	$19,4,$19	# four limbs per iteration
+	unop
+
+	ldq	$6,8($18)
+	addq	$4,$0,$0	# $0 = s2 limb 0 + carry-in
+	ldq	$5,8($17)
+	cmpult	$0,$4,$1	# $1 = carry out of that add
+	ldq	$4,16($18)
+	addq	$3,$0,$20	# $20 = result limb 0
+	cmpult	$20,$3,$0	# $0 = carry out of this add
+	ldq	$3,16($17)
+	or	$0,$1,$0	# combined carry
+	addq	$6,$0,$0
+	cmpult	$0,$6,$1
+	ldq	$6,24($18)
+	addq	$5,$0,$21	# $21 = result limb 1
+	cmpult	$21,$5,$0
+	ldq	$5,24($17)
+	or	$0,$1,$0
+	addq	$4,$0,$0
+	cmpult	$0,$4,$1
+	ldq	$4,32($18)	# preload s2 limb for the next iteration / .Lend
+	addq	$3,$0,$22	# $22 = result limb 2
+	cmpult	$22,$3,$0
+	ldq	$3,32($17)	# preload s1 limb for the next iteration / .Lend
+	or	$0,$1,$0
+	addq	$6,$0,$0
+	cmpult	$0,$6,$1
+	addq	$5,$0,$23	# $23 = result limb 3
+	cmpult	$23,$5,$0
+	or	$0,$1,$0
+
+	stq	$20,0($16)	# all four results are stored after the sums
+	stq	$21,8($16)
+	stq	$22,16($16)
+	stq	$23,24($16)
+
+	addq	$17,32,$17
+	addq	$18,32,$18
+	addq	$16,32,$16
+	bne	$19,.Loop
+
+.Lend:	addq	$4,$0,$4	# last limb, already loaded in $3/$4
+	cmpult	$4,$0,$1
+	addq	$3,$4,$4
+	cmpult	$4,$3,$0
+	stq	$4,0($16)
+	or	$0,$1,$0	# $0 = final carry, left in $0 on return
+	ret	$31,($26),1
+
+	.end	__mpn_add_n
diff --git a/sysdeps/alpha/alphaev5/lshift.s b/sysdeps/alpha/alphaev5/lshift.s
new file mode 100644
index 0000000000..fdb089550f
--- /dev/null
+++ b/sysdeps/alpha/alphaev5/lshift.s
@@ -0,0 +1,175 @@
+ # Alpha EV5 __mpn_lshift --
+
+ # Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	r16
+ # s1_ptr	r17
+ # size		r18
+ # cnt		r19
+
+ # This code runs at 4.25 cycles/limb on the EV5.
+
+	.set	noreorder
+	.set	noat
+.text
+	.align	3
+	.globl	__mpn_lshift
+	.ent	__mpn_lshift
+__mpn_lshift:
+	.frame	$30,0,$26,0
+
+	s8addq	$18,$17,$17	# make r17 point at end of s1
+	ldq	$4,-8($17)	# load first limb
+	subq	$31,$19,$20	# $20 = -cnt, used as the complementary shift count
+	s8addq	$18,$16,$16	# make r16 point at end of RES
+	subq	$18,1,$18	# size-1: the last limb is stored separately
+	and	$18,4-1,$28	# number of limbs in first loop
+	srl	$4,$20,$0	# compute function result
+
+	beq	$28,L0
+	subq	$18,$28,$18	# remaining count for the unrolled code
+
+	.align	3
+Loop0:	ldq	$3,-16($17)	# next lower limb
+	subq	$16,8,$16
+	sll	$4,$19,$5	# high part from current limb
+	subq	$17,8,$17
+	subq	$28,1,$28
+	srl	$3,$20,$6	# low part from next lower limb
+	or	$3,$3,$4	# next limb becomes current limb
+	or	$5,$6,$8
+	stq	$8,0($16)
+	bne	$28,Loop0
+
+L0:	sll	$4,$19,$24	# $24 = pending high part of current limb
+	beq	$18,Lend
+ # warm up phase 1
+	ldq	$1,-16($17)
+	subq	$18,4,$18
+	ldq	$2,-24($17)
+	ldq	$3,-32($17)
+	ldq	$4,-40($17)
+	beq	$18,Lcool1	# only these 4 limbs left: drain via phase 1
+ # warm up phase 2
+	srl	$1,$20,$7
+	sll	$1,$19,$21
+	srl	$2,$20,$8
+	ldq	$1,-48($17)
+	sll	$2,$19,$22
+	ldq	$2,-56($17)
+	srl	$3,$20,$5
+	or	$7,$24,$7
+	sll	$3,$19,$23
+	or	$8,$21,$8
+	srl	$4,$20,$6
+	ldq	$3,-64($17)
+	sll	$4,$19,$24
+	ldq	$4,-72($17)
+	subq	$18,4,$18
+	beq	$18,Lcool2	# results in $7/$8/$5/$6 are pending: drain via phase 2
+	.align  4
+ # main loop
+Loop:	stq	$7,-8($16)
+	or	$5,$22,$5
+	stq	$8,-16($16)
+	or	$6,$23,$6
+
+	srl	$1,$20,$7
+	subq	$18,4,$18
+	sll	$1,$19,$21
+	unop	# ldq	$31,-96($17)
+
+	srl	$2,$20,$8
+	ldq	$1,-80($17)
+	sll	$2,$19,$22
+	ldq	$2,-88($17)
+
+	stq	$5,-24($16)
+	or	$7,$24,$7
+	stq	$6,-32($16)
+	or	$8,$21,$8
+
+	srl	$3,$20,$5
+	unop	# ldq	$31,-96($17)
+	sll	$3,$19,$23
+	subq	$16,32,$16
+
+	srl	$4,$20,$6
+	ldq	$3,-96($17)
+	sll	$4,$19,$24
+	ldq	$4,-104($17)
+
+	subq	$17,32,$17
+	bne	$18,Loop
+	unop
+	unop
+ # cool down phase 2/1
+Lcool2:	stq	$7,-8($16)
+	or	$5,$22,$5
+	stq	$8,-16($16)
+	or	$6,$23,$6
+	srl	$1,$20,$7
+	sll	$1,$19,$21
+	srl	$2,$20,$8
+	sll	$2,$19,$22
+	stq	$5,-24($16)
+	or	$7,$24,$7
+	stq	$6,-32($16)
+	or	$8,$21,$8
+	srl	$3,$20,$5
+	sll	$3,$19,$23
+	srl	$4,$20,$6
+	sll	$4,$19,$24
+ # cool down phase 2/2
+	stq	$7,-40($16)
+	or	$5,$22,$5
+	stq	$8,-48($16)
+	or	$6,$23,$6
+	stq	$5,-56($16)
+	stq	$6,-64($16)
+ # cool down phase 2/3
+	stq	$24,-72($16)	# lowest result limb: shifted-left part only
+	ret	$31,($26),1
+
+ # cool down phase 1/1
+Lcool1:	srl	$1,$20,$7
+	sll	$1,$19,$21
+	srl	$2,$20,$8
+	sll	$2,$19,$22
+	srl	$3,$20,$5
+	or	$7,$24,$7
+	sll	$3,$19,$23
+	or	$8,$21,$8
+	srl	$4,$20,$6
+	sll	$4,$19,$24
+ # cool down phase 1/2
+	stq	$7,-8($16)
+	or	$5,$22,$5
+	stq	$8,-16($16)
+	or	$6,$23,$6
+	stq	$5,-24($16)
+	stq	$6,-32($16)
+	stq	$24,-40($16)	# lowest result limb: shifted-left part only
+	ret	$31,($26),1
+
+Lend:	stq	$24,-8($16)	# no unrolled work: store the pending limb
+	ret	$31,($26),1
+	.end	__mpn_lshift
diff --git a/sysdeps/alpha/alphaev5/rshift.s b/sysdeps/alpha/alphaev5/rshift.s
new file mode 100644
index 0000000000..1da9960b46
--- /dev/null
+++ b/sysdeps/alpha/alphaev5/rshift.s
@@ -0,0 +1,173 @@
+ # Alpha EV5 __mpn_rshift --
+
+ # Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	r16
+ # s1_ptr	r17
+ # size		r18
+ # cnt		r19
+
+ # This code runs at 4.25 cycles/limb on the EV5.
+
+	.set	noreorder
+	.set	noat
+.text
+	.align	3
+	.globl	__mpn_rshift
+	.ent	__mpn_rshift
+__mpn_rshift:
+	.frame	$30,0,$26,0
+
+	ldq	$4,0($17)	# load first limb
+	subq	$31,$19,$20	# $20 = -cnt, used as the complementary shift count
+	subq	$18,1,$18	# size-1: the last limb is stored separately
+	and	$18,4-1,$28	# number of limbs in first loop
+	sll	$4,$20,$0	# compute function result
+
+	beq	$28,L0
+	subq	$18,$28,$18	# remaining count for the unrolled code
+
+	.align	3
+Loop0:	ldq	$3,8($17)	# next higher limb
+	addq	$16,8,$16
+	srl	$4,$19,$5	# low part from current limb
+	addq	$17,8,$17
+	subq	$28,1,$28
+	sll	$3,$20,$6	# high part from next higher limb
+	or	$3,$3,$4	# next limb becomes current limb
+	or	$5,$6,$8
+	stq	$8,-8($16)
+	bne	$28,Loop0
+
+L0:	srl	$4,$19,$24	# $24 = pending low part of current limb
+	beq	$18,Lend
+ # warm up phase 1
+	ldq	$1,8($17)
+	subq	$18,4,$18
+	ldq	$2,16($17)
+	ldq	$3,24($17)
+	ldq	$4,32($17)
+	beq	$18,Lcool1	# only these 4 limbs left: drain via phase 1
+ # warm up phase 2
+	sll	$1,$20,$7
+	srl	$1,$19,$21
+	sll	$2,$20,$8
+	ldq	$1,40($17)
+	srl	$2,$19,$22
+	ldq	$2,48($17)
+	sll	$3,$20,$5
+	or	$7,$24,$7
+	srl	$3,$19,$23
+	or	$8,$21,$8
+	sll	$4,$20,$6
+	ldq	$3,56($17)
+	srl	$4,$19,$24
+	ldq	$4,64($17)
+	subq	$18,4,$18
+	beq	$18,Lcool2	# results in $7/$8/$5/$6 are pending: drain via phase 2
+	.align  4
+ # main loop
+Loop:	stq	$7,0($16)
+	or	$5,$22,$5
+	stq	$8,8($16)
+	or	$6,$23,$6
+
+	sll	$1,$20,$7
+	subq	$18,4,$18
+	srl	$1,$19,$21
+	unop	# ldq	$31,-96($17)
+
+	sll	$2,$20,$8
+	ldq	$1,72($17)
+	srl	$2,$19,$22
+	ldq	$2,80($17)
+
+	stq	$5,16($16)
+	or	$7,$24,$7
+	stq	$6,24($16)
+	or	$8,$21,$8
+
+	sll	$3,$20,$5
+	unop	# ldq	$31,-96($17)
+	srl	$3,$19,$23
+	addq	$16,32,$16
+
+	sll	$4,$20,$6
+	ldq	$3,88($17)
+	srl	$4,$19,$24
+	ldq	$4,96($17)
+
+	addq	$17,32,$17
+	bne	$18,Loop
+	unop
+	unop
+ # cool down phase 2/1
+Lcool2:	stq	$7,0($16)
+	or	$5,$22,$5
+	stq	$8,8($16)
+	or	$6,$23,$6
+	sll	$1,$20,$7
+	srl	$1,$19,$21
+	sll	$2,$20,$8
+	srl	$2,$19,$22
+	stq	$5,16($16)
+	or	$7,$24,$7
+	stq	$6,24($16)
+	or	$8,$21,$8
+	sll	$3,$20,$5
+	srl	$3,$19,$23
+	sll	$4,$20,$6
+	srl	$4,$19,$24
+ # cool down phase 2/2
+	stq	$7,32($16)
+	or	$5,$22,$5
+	stq	$8,40($16)
+	or	$6,$23,$6
+	stq	$5,48($16)
+	stq	$6,56($16)
+ # cool down phase 2/3
+	stq	$24,64($16)	# highest result limb: shifted-right part only
+	ret	$31,($26),1
+
+ # cool down phase 1/1
+Lcool1:	sll	$1,$20,$7
+	srl	$1,$19,$21
+	sll	$2,$20,$8
+	srl	$2,$19,$22
+	sll	$3,$20,$5
+	or	$7,$24,$7
+	srl	$3,$19,$23
+	or	$8,$21,$8
+	sll	$4,$20,$6
+	srl	$4,$19,$24
+ # cool down phase 1/2
+	stq	$7,0($16)
+	or	$5,$22,$5
+	stq	$8,8($16)
+	or	$6,$23,$6
+	stq	$5,16($16)
+	stq	$6,24($16)
+	stq	$24,32($16)	# highest result limb: shifted-right part only
+	ret	$31,($26),1
+
+Lend:	stq	$24,0($16)	# no unrolled work: store the pending limb
+	ret	$31,($26),1
+	.end	__mpn_rshift
diff --git a/sysdeps/alpha/lshift.s b/sysdeps/alpha/lshift.s
new file mode 100644
index 0000000000..c28434926b
--- /dev/null
+++ b/sysdeps/alpha/lshift.s
@@ -0,0 +1,108 @@
+ # Alpha 21064 __mpn_lshift --
+
+ # Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	r16
+ # s1_ptr	r17
+ # size		r18
+ # cnt		r19
+
+ # This code runs at 4.8 cycles/limb on the 21064.  With infinite unrolling,
+ # it would take 4 cycles/limb.  It should be possible to get down to 3
+ # cycles/limb since both ldq and stq can be paired with the other used
+ # instructions.  But there are many restrictions in the 21064 pipeline that
+ # makes it hard, if not impossible, to get down to 3 cycles/limb:
+
+ # 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay.
+ # 2. Only aligned instruction pairs can be paired.
+ # 3. The store buffer or silo might not be able to deal with the bandwidth.
+
+	.set	noreorder
+	.set	noat
+.text
+	.align	3
+	.globl	__mpn_lshift
+	.ent	__mpn_lshift
+__mpn_lshift:
+	.frame	$30,0,$26,0
+
+	s8addq	$18,$17,$17	# make r17 point at end of s1
+	ldq	$4,-8($17)	# load first limb
+	subq	$17,8,$17	# r17 now points at the limb just loaded
+	subq	$31,$19,$7	# $7 = -cnt, used as the complementary shift count
+	s8addq	$18,$16,$16	# make r16 point at end of RES
+	subq	$18,1,$18	# size-1: the last limb is stored at Lend
+	and	$18,4-1,$20	# number of limbs in first loop
+	srl	$4,$7,$0	# compute function result
+
+	beq	$20,L0
+	subq	$18,$20,$18	# remaining count for the 4-way loop
+
+	.align	3
+Loop0:
+	ldq	$3,-8($17)	# next lower limb
+	subq	$16,8,$16
+	subq	$17,8,$17
+	subq	$20,1,$20
+	sll	$4,$19,$5	# high part from current limb
+	srl	$3,$7,$6	# low part from next lower limb
+	bis	$3,$3,$4	# next limb becomes current limb
+	bis	$5,$6,$8
+	stq	$8,0($16)
+	bne	$20,Loop0
+
+L0:	beq	$18,Lend	# nothing left for the 4-way loop
+
+	.align	3
+Loop:	ldq	$3,-8($17)
+	subq	$16,32,$16	# four result limbs per iteration
+	subq	$18,4,$18
+	sll	$4,$19,$5
+	srl	$3,$7,$6
+
+	ldq	$4,-16($17)
+	sll	$3,$19,$1
+	bis	$5,$6,$8
+	stq	$8,24($16)
+	srl	$4,$7,$2
+
+	ldq	$3,-24($17)
+	sll	$4,$19,$5
+	bis	$1,$2,$8
+	stq	$8,16($16)
+	srl	$3,$7,$6
+
+	ldq	$4,-32($17)	# this limb stays in $4 for the next iteration / Lend
+	sll	$3,$19,$1
+	bis	$5,$6,$8
+	stq	$8,8($16)
+	srl	$4,$7,$2
+
+	subq	$17,32,$17
+	bis	$1,$2,$8
+	stq	$8,0($16)
+
+	bgt	$18,Loop
+
+Lend:	sll	$4,$19,$8	# lowest result limb: shifted-left part only
+	stq	$8,-8($16)
+	ret	$31,($26),1
+	.end	__mpn_lshift
diff --git a/sysdeps/alpha/mul_1.s b/sysdeps/alpha/mul_1.s
new file mode 100644
index 0000000000..3ef194d7e6
--- /dev/null
+++ b/sysdeps/alpha/mul_1.s
@@ -0,0 +1,84 @@
+ # Alpha 21064 __mpn_mul_1 -- Multiply a limb vector with a limb and store
+ # the result in a second limb vector.
+
+ # Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	r16
+ # s1_ptr	r17
+ # size		r18
+ # s2_limb	r19
+
+ # This code runs at 42 cycles/limb on the EV4 and 18 cycles/limb on the EV5.
+
+ # To improve performance for long multiplications, we would use
+ # 'fetch' for S1 and 'fetch_m' for RES.  It's not obvious how to use
+ # these instructions without slowing down the general code: 1. We can
+ # only have two prefetches in operation at any time in the Alpha
+ # architecture.  2. There will seldom be any special alignment
+ # between RES_PTR and S1_PTR.  Maybe we can simply divide the current
+ # loop into an inner and outer loop, having the inner loop handle
+ # exactly one prefetch block?
+
+	.set	noreorder
+	.set	noat
+.text
+	.align	3
+	.globl	__mpn_mul_1
+	.ent	__mpn_mul_1 2
+__mpn_mul_1:
+	.frame	$30,0,$26
+
+	ldq	$2,0($17)	# $2 = s1_limb
+	subq	$18,1,$18	# size--
+	mulq	$2,$19,$3	# $3 = prod_low
+	bic	$31,$31,$4	# clear cy_limb
+	umulh	$2,$19,$0	# $0 = prod_high
+	beq	$18,Lend1	# jump if size was == 1
+	ldq	$2,8($17)	# $2 = s1_limb
+	subq	$18,1,$18	# size--
+	stq	$3,0($16)	# first result limb is prod_low unchanged
+	beq	$18,Lend2	# jump if size was == 2
+
+	.align	3
+Loop:	mulq	$2,$19,$3	# $3 = prod_low
+	addq	$4,$0,$0	# cy_limb = cy_limb + 'cy'
+	subq	$18,1,$18	# size--
+	umulh	$2,$19,$4	# $4 = cy_limb
+	ldq	$2,16($17)	# $2 = s1_limb
+	addq	$17,8,$17	# s1_ptr++
+	addq	$3,$0,$3	# $3 = cy_limb + prod_low
+	stq	$3,8($16)	# store before res_ptr is advanced below
+	cmpult	$3,$0,$0	# $0 = carry from (cy_limb + prod_low)
+	addq	$16,8,$16	# res_ptr++
+	bne	$18,Loop
+
+Lend2:	mulq	$2,$19,$3	# $3 = prod_low
+	addq	$4,$0,$0	# cy_limb = cy_limb + 'cy'
+	umulh	$2,$19,$4	# $4 = cy_limb
+	addq	$3,$0,$3	# $3 = cy_limb + prod_low
+	cmpult	$3,$0,$0	# $0 = carry from (cy_limb + prod_low)
+	stq	$3,8($16)
+	addq	$4,$0,$0	# cy_limb = prod_high + cy
+	ret	$31,($26),1
+Lend1:	stq	$3,0($16)	# size == 1: store prod_low; $0 still holds prod_high
+	ret	$31,($26),1
+
+	.end	__mpn_mul_1
diff --git a/sysdeps/alpha/rshift.s b/sysdeps/alpha/rshift.s
new file mode 100644
index 0000000000..74eab0434a
--- /dev/null
+++ b/sysdeps/alpha/rshift.s
@@ -0,0 +1,106 @@
+ # Alpha 21064 __mpn_rshift --
+
+ # Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	r16
+ # s1_ptr	r17
+ # size		r18
+ # cnt		r19
+
+ # This code runs at 4.8 cycles/limb on the 21064.  With infinite unrolling,
+ # it would take 4 cycles/limb.  It should be possible to get down to 3
+ # cycles/limb since both ldq and stq can be paired with the other used
+ # instructions.  But there are many restrictions in the 21064 pipeline that
+ # makes it hard, if not impossible, to get down to 3 cycles/limb:
+
+ # 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay.
+ # 2. Only aligned instruction pairs can be paired.
+ # 3. The store buffer or silo might not be able to deal with the bandwidth.
+      
+	.set	noreorder
+	.set	noat
+.text
+	.align	3
+	.globl	__mpn_rshift
+	.ent	__mpn_rshift
+__mpn_rshift:
+	.frame	$30,0,$26,0
+
+	ldq	$4,0($17)	# load first limb
+	addq	$17,8,$17	# r17 now points at the next limb
+	subq	$31,$19,$7	# $7 = -cnt, used as the complementary shift count
+	subq	$18,1,$18	# size-1: the last limb is stored at Lend
+	and	$18,4-1,$20	# number of limbs in first loop
+	sll	$4,$7,$0	# compute function result
+
+	beq	$20,L0
+	subq	$18,$20,$18	# remaining count for the 4-way loop
+
+	.align	3
+Loop0:
+	ldq	$3,0($17)	# next higher limb
+	addq	$16,8,$16
+	addq	$17,8,$17
+	subq	$20,1,$20
+	srl	$4,$19,$5	# low part from current limb
+	sll	$3,$7,$6	# high part from next higher limb
+	bis	$3,$3,$4	# next limb becomes current limb
+	bis	$5,$6,$8
+	stq	$8,-8($16)
+	bne	$20,Loop0
+
+L0:	beq	$18,Lend	# nothing left for the 4-way loop
+
+	.align	3
+Loop:	ldq	$3,0($17)
+	addq	$16,32,$16	# four result limbs per iteration
+	subq	$18,4,$18
+	srl	$4,$19,$5
+	sll	$3,$7,$6
+
+	ldq	$4,8($17)
+	srl	$3,$19,$1
+	bis	$5,$6,$8
+	stq	$8,-32($16)
+	sll	$4,$7,$2
+
+	ldq	$3,16($17)
+	srl	$4,$19,$5
+	bis	$1,$2,$8
+	stq	$8,-24($16)
+	sll	$3,$7,$6
+
+	ldq	$4,24($17)	# this limb stays in $4 for the next iteration / Lend
+	srl	$3,$19,$1
+	bis	$5,$6,$8
+	stq	$8,-16($16)
+	sll	$4,$7,$2
+
+	addq	$17,32,$17
+	bis	$1,$2,$8
+	stq	$8,-8($16)
+
+	bgt	$18,Loop
+
+Lend:	srl	$4,$19,$8	# highest result limb: shifted-right part only
+	stq	$8,0($16)
+	ret	$31,($26),1
+	.end	__mpn_rshift
diff --git a/sysdeps/alpha/sub_n.s b/sysdeps/alpha/sub_n.s
new file mode 100644
index 0000000000..5200025b41
--- /dev/null
+++ b/sysdeps/alpha/sub_n.s
@@ -0,0 +1,119 @@
+ # Alpha __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+ # store difference in a third limb vector.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	$16
+ # s1_ptr	$17
+ # s2_ptr	$18
+ # size		$19
+
+	.set	noreorder
+	.set	noat
+.text
+	.align	3
+	.globl	__mpn_sub_n
+	.ent	__mpn_sub_n
+__mpn_sub_n:
+	.frame	$30,0,$26,0
+
+	ldq	$3,0($17)	# $3 = first s1 limb
+	ldq	$4,0($18)	# $4 = first s2 limb
+
+	subq	$19,1,$19	# size-1: the last limb is handled at .Lend
+	and	$19,4-1,$2	# number of limbs in first loop
+	bis	$31,$31,$0	# $0 = borrow-in, initially 0
+	beq	$2,.L0		# if multiple of 4 limbs, skip first loop
+
+	subq	$19,$2,$19	# remaining count for the 4-way loop
+
+.Loop0:	subq	$2,1,$2
+	ldq	$5,8($17)	# preload next s1 limb
+	addq	$4,$0,$4	# s2 limb + borrow-in
+	ldq	$6,8($18)	# preload next s2 limb
+	cmpult	$4,$0,$1	# $1 = carry out of that add
+	subq	$3,$4,$4	# s1 limb - (s2 limb + borrow-in)
+	cmpult	$3,$4,$0	# $0 = 1 if the subtraction wrapped
+	stq	$4,0($16)
+	or	$0,$1,$0	# combined borrow for the next limb
+
+	addq	$17,8,$17
+	addq	$18,8,$18
+	bis	$5,$5,$3	# move preloaded limbs into $3/$4
+	bis	$6,$6,$4
+	addq	$16,8,$16
+	bne	$2,.Loop0
+
+.L0:	beq	$19,.Lend	# nothing left for the 4-way loop
+
+	.align	3
+.Loop:	subq	$19,4,$19	# four limbs per iteration
+
+	ldq	$5,8($17)
+	addq	$4,$0,$4	# limb 0: same borrow pattern as .Loop0
+	ldq	$6,8($18)
+	cmpult	$4,$0,$1
+	subq	$3,$4,$4
+	cmpult	$3,$4,$0
+	stq	$4,0($16)
+	or	$0,$1,$0
+
+	ldq	$3,16($17)
+	addq	$6,$0,$6	# limb 1 (operands in $5/$6)
+	ldq	$4,16($18)
+	cmpult	$6,$0,$1
+	subq	$5,$6,$6
+	cmpult	$5,$6,$0
+	stq	$6,8($16)
+	or	$0,$1,$0
+
+	ldq	$5,24($17)
+	addq	$4,$0,$4	# limb 2 (operands in $3/$4)
+	ldq	$6,24($18)
+	cmpult	$4,$0,$1
+	subq	$3,$4,$4
+	cmpult	$3,$4,$0
+	stq	$4,16($16)
+	or	$0,$1,$0
+
+	ldq	$3,32($17)	# preload operands for the next iteration / .Lend
+	addq	$6,$0,$6	# limb 3 (operands in $5/$6)
+	ldq	$4,32($18)
+	cmpult	$6,$0,$1
+	subq	$5,$6,$6
+	cmpult	$5,$6,$0
+	stq	$6,24($16)
+	or	$0,$1,$0
+
+	addq	$17,32,$17
+	addq	$18,32,$18
+	addq	$16,32,$16
+	bne	$19,.Loop
+
+.Lend:	addq	$4,$0,$4	# last limb, already loaded in $3/$4
+	cmpult	$4,$0,$1
+	subq	$3,$4,$4
+	cmpult	$3,$4,$0
+	stq	$4,0($16)
+	or	$0,$1,$0	# $0 = final borrow, left in $0 on return
+	ret	$31,($26),1
+
+	.end	__mpn_sub_n
diff --git a/sysdeps/alpha/submul_1.s b/sysdeps/alpha/submul_1.s
new file mode 100644
index 0000000000..acaa11c545
--- /dev/null
+++ b/sysdeps/alpha/submul_1.s
@@ -0,0 +1,100 @@
+ # Alpha 21064 __mpn_submul_1 -- Multiply a limb vector with a limb and
+ # subtract the result from a second limb vector.
+
+ # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	r16
+ # s1_ptr	r17
+ # size		r18
+ # s2_limb	r19
+
+ # This code runs at 42 cycles/limb on the 21064.
+
+ # To improve performance for long multiplications, we would use
+ # 'fetch' for S1 and 'fetch_m' for RES.  It's not obvious how to use
+ # these instructions without slowing down the general code: 1. We can
+ # only have two prefetches in operation at any time in the Alpha
+ # architecture.  2. There will seldom be any special alignment
+ # between RES_PTR and S1_PTR.  Maybe we can simply divide the current
+ # loop into an inner and outer loop, having the inner loop handle
+ # exactly one prefetch block?
+
+	.set	noreorder
+	.set	noat
+.text
+	.align	3
+	.globl	__mpn_submul_1
+	.ent	__mpn_submul_1 2
+__mpn_submul_1:
+	.frame	$30,0,$26
+
+	ldq	$2,0($17)	# $2 = s1_limb
+	addq	$17,8,$17	# s1_ptr++
+	subq	$18,1,$18	# size--
+	mulq	$2,$19,$3	# $3 = prod_low
+	ldq	$5,0($16)	# $5 = *res_ptr
+	umulh	$2,$19,$0	# $0 = prod_high
+	beq	$18,Lend1	# jump if size was == 1
+	ldq	$2,0($17)	# $2 = s1_limb
+	addq	$17,8,$17	# s1_ptr++
+	subq	$18,1,$18	# size--
+	subq	$5,$3,$3	# $3 = *res_ptr - prod_low
+	cmpult	$5,$3,$4	# $4 = borrow from that subtraction
+	stq	$3,0($16)
+	addq	$16,8,$16	# res_ptr++
+	beq	$18,Lend2	# jump if size was == 2
+
+	.align	3
+Loop:	mulq	$2,$19,$3	# $3 = prod_low
+	ldq	$5,0($16)	# $5 = *res_ptr
+	addq	$4,$0,$0	# cy_limb = cy_limb + 'cy'
+	subq	$18,1,$18	# size--
+	umulh	$2,$19,$4	# $4 = cy_limb
+	ldq	$2,0($17)	# $2 = s1_limb
+	addq	$17,8,$17	# s1_ptr++
+	addq	$3,$0,$3	# $3 = cy_limb + prod_low
+	cmpult	$3,$0,$0	# $0 = carry from (cy_limb + prod_low)
+	subq	$5,$3,$3	# $3 = *res_ptr - (cy_limb + prod_low)
+	cmpult	$5,$3,$5	# $5 = borrow from that subtraction
+	stq	$3,0($16)
+	addq	$16,8,$16	# res_ptr++
+	addq	$5,$0,$0	# combine carries
+	bne	$18,Loop
+
+Lend2:	mulq	$2,$19,$3	# $3 = prod_low
+	ldq	$5,0($16)	# $5 = *res_ptr
+	addq	$4,$0,$0	# cy_limb = cy_limb + 'cy'
+	umulh	$2,$19,$4	# $4 = cy_limb
+	addq	$3,$0,$3	# $3 = cy_limb + prod_low
+	cmpult	$3,$0,$0	# $0 = carry from (cy_limb + prod_low)
+	subq	$5,$3,$3	# $3 = *res_ptr - (cy_limb + prod_low)
+	cmpult	$5,$3,$5	# $5 = borrow from that subtraction
+	stq	$3,0($16)
+	addq	$5,$0,$0	# combine carries
+	addq	$4,$0,$0	# cy_limb = prod_high + cy
+	ret	$31,($26),1
+Lend1:	subq	$5,$3,$3	# size == 1: $3 = *res_ptr - prod_low
+	cmpult	$5,$3,$5	# $5 = borrow from that subtraction
+	stq	$3,0($16)
+	addq	$0,$5,$0	# $0 = prod_high + borrow
+	ret	$31,($26),1
+
+	.end	__mpn_submul_1
diff --git a/sysdeps/alpha/udiv_qrnnd.S b/sysdeps/alpha/udiv_qrnnd.S
index 942d7a884b..bafafd672e 100644
--- a/sysdeps/alpha/udiv_qrnnd.S
+++ b/sysdeps/alpha/udiv_qrnnd.S
@@ -134,7 +134,7 @@ Loop2:	cmplt	n0,0,tmp
 	ret	$31,($26),1
 
 Odd:
-	/* q' in n0.  r' in n1.  */
+	/* q' in n0. r' in n1 */
 	addq	n1,n0,n1
 	cmpult	n1,n0,tmp	# tmp := carry from addq
 	beq	tmp,LLp6
diff --git a/sysdeps/generic/divmod_1.c b/sysdeps/generic/divmod_1.c
index d156eeb00d..2989d36708 100644
--- a/sysdeps/generic/divmod_1.c
+++ b/sysdeps/generic/divmod_1.c
@@ -83,14 +83,12 @@ __mpn_divmod_1 (quot_ptr, dividend_ptr, dividend_size, divisor_limb)
 	     result is a (N+1)-bit approximation to 1/DIVISOR_LIMB, with the
 	     most significant bit (with weight 2**N) implicit.  */
 
-#if 0 /* This can't happen when normalization_steps != 0 */
 	  /* Special case for DIVISOR_LIMB == 100...000.  */
 	  if (divisor_limb << 1 == 0)
 	    divisor_limb_inverted = ~(mp_limb) 0;
 	  else
-#endif
-	  udiv_qrnnd (divisor_limb_inverted, dummy,
-		      -divisor_limb, 0, divisor_limb);
+	    udiv_qrnnd (divisor_limb_inverted, dummy,
+			-divisor_limb, 0, divisor_limb);
 
 	  n1 = dividend_ptr[dividend_size - 1];
 	  r = n1 >> (BITS_PER_MP_LIMB - normalization_steps);
diff --git a/sysdeps/generic/mod_1.c b/sysdeps/generic/mod_1.c
index ae4ed0914f..8a49fb4be0 100644
--- a/sysdeps/generic/mod_1.c
+++ b/sysdeps/generic/mod_1.c
@@ -3,8 +3,6 @@
    Return the single-limb remainder.
    There are no constraints on the value of the divisor.
 
-   QUOT_PTR and DIVIDEND_PTR might point to the same limb.
-
 Copyright (C) 1991, 1993, 1994, Free Software Foundation, Inc.
 
 This file is part of the GNU MP Library.
diff --git a/sysdeps/hppa/add_n.s b/sysdeps/hppa/add_n.s
new file mode 100644
index 0000000000..7f3e32342b
--- /dev/null
+++ b/sysdeps/hppa/add_n.s
@@ -0,0 +1,57 @@
+; HP-PA  __mpn_add_n -- Add two limb vectors of the same length > 0 and store
+; sum in a third limb vector.
+
+; Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr	gr26
+; s1_ptr	gr25
+; s2_ptr	gr24
+; size		gr23
+
+; One might want to unroll this as for other processors, but it turns
+; out that the data cache contention after a store makes such
+; unrolling useless.  We can't come under 5 cycles/limb anyway.
+
+	.code
+	.export		__mpn_add_n
+__mpn_add_n
+	.proc
+	.callinfo	frame=0,no_calls
+	.entry
+
+	ldws,ma		4(0,%r25),%r20	; r20 = *s1_ptr, then s1_ptr += 4
+	ldws,ma		4(0,%r24),%r19	; r19 = *s2_ptr, then s2_ptr += 4
+
+	addib,=		-1,%r23,L$end	; check for (SIZE == 1)
+	 add		%r20,%r19,%r28	; add first limbs ignoring cy
+
+L$loop	ldws,ma		4(0,%r25),%r20	; fetch the next pair of limbs
+	ldws,ma		4(0,%r24),%r19
+	stws,ma		%r28,4(0,%r26)	; store previous sum, then res_ptr += 4
+	addib,<>	-1,%r23,L$loop	; size--, loop until it reaches 0
+	 addc		%r20,%r19,%r28	; delay slot: add limbs plus carry in
+
+L$end	stws		%r28,0(0,%r26)	; store the last sum limb
+	bv		0(%r2)		; return through r2
+	 addc		%r0,%r0,%r28	; delay slot: r28 = carry out
+
+	.exit
+	.procend
diff --git a/sysdeps/hppa/hppa1.1/addmul_1.s b/sysdeps/hppa/hppa1.1/addmul_1.s
new file mode 100644
index 0000000000..a9dfdd1c28
--- /dev/null
+++ b/sysdeps/hppa/hppa1.1/addmul_1.s
@@ -0,0 +1,101 @@
+; HP-PA-1.1 __mpn_addmul_1 -- Multiply a limb vector with a limb and
+; add the result to a second limb vector.
+
+; Copyright (C) 1992, 1993, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr	r26
+; s1_ptr	r25
+; size		r24
+; s2_limb	r23
+
+; This runs at 11 cycles/limb on a PA7000.  With the used instructions, it
+; can not become faster due to data cache contention after a store.  On the
+; PA7100 it runs at 10 cycles/limb, and that can not be improved either,
+; since only the xmpyu does not need the integer pipeline, so the only
+; dual-issue we will get are addc+xmpyu.  Unrolling could gain a cycle/limb
+; on the PA7100.
+
+; There are some ideas described in mul_1.s that apply to this code too.
+
+	.code
+	.export		__mpn_addmul_1
+__mpn_addmul_1
+	.proc
+	.callinfo	frame=64,no_calls
+	.entry
+
+	ldo		64(%r30),%r30		; allocate 64 bytes of stack
+	fldws,ma	4(%r25),%fr5		; fr5 = *s1_ptr, then s1_ptr += 4
+	stw		%r23,-16(%r30)		; move s2_limb ...
+	addib,=		-1,%r24,L$just_one_limb
+	 fldws		-16(%r30),%fr4		; ... into fr4
+	add		%r0,%r0,%r0		; clear carry
+	xmpyu		%fr4,%fr5,%fr6		; fr6 = first 64-bit product
+	fldws,ma	4(%r25),%fr7
+	fstds		%fr6,-16(%r30)		; spill product so integer regs can read it
+	xmpyu		%fr4,%fr7,%fr8		; fr8 = second 64-bit product
+	ldw		-12(%r30),%r19		; least significant limb in product
+	ldw		-16(%r30),%r28		; most significant limb in product
+
+	fstds		%fr8,-16(%r30)
+	addib,=		-1,%r24,L$end
+	 ldw		-12(%r30),%r1		; delay slot: low limb of second product
+
+; Main loop
+L$loop	ldws		0(%r26),%r29		; r29 = *res_ptr
+	fldws,ma	4(%r25),%fr5		; fetch the next s1 limb
+	add		%r29,%r19,%r19		; add pending low limb, sets carry
+	stws,ma		%r19,4(%r26)		; store result, then res_ptr += 4
+	addc		%r28,%r1,%r19		; next pending = high + next low + carry
+	xmpyu		%fr4,%fr5,%fr6
+	ldw		-16(%r30),%r28		; high limb of the product spilled earlier
+	fstds		%fr6,-16(%r30)		; spill the new product over it
+	addc		%r0,%r28,%r28		; fold carry into the high limb
+	addib,<>	-1,%r24,L$loop
+	 ldw		-12(%r30),%r1		; delay slot: low limb of the new product
+
+L$end	ldw		0(%r26),%r29		; drain the pipeline: two limbs remain
+	add		%r29,%r19,%r19
+	stws,ma		%r19,4(%r26)
+	addc		%r28,%r1,%r19
+	ldw		-16(%r30),%r28
+	ldws		0(%r26),%r29
+	addc		%r0,%r28,%r28
+	add		%r29,%r19,%r19
+	stws,ma		%r19,4(%r26)
+	addc		%r0,%r28,%r28		; r28 = final carry limb
+	bv		0(%r2)
+	 ldo		-64(%r30),%r30		; delay slot: release the stack frame
+
+L$just_one_limb
+	xmpyu		%fr4,%fr5,%fr6
+	ldw		0(%r26),%r29		; r29 = *res_ptr
+	fstds		%fr6,-16(%r30)
+	ldw		-12(%r30),%r1		; low limb of the product
+	ldw		-16(%r30),%r28		; high limb of the product
+	add		%r29,%r1,%r19
+	stw		%r19,0(%r26)
+	addc		%r0,%r28,%r28		; r28 = high limb + carry
+	bv		0(%r2)
+	 ldo		-64(%r30),%r30		; delay slot: release the stack frame
+
+	.exit
+	.procend
diff --git a/sysdeps/hppa/hppa1.1/mul_1.s b/sysdeps/hppa/hppa1.1/mul_1.s
new file mode 100644
index 0000000000..ebf0778b90
--- /dev/null
+++ b/sysdeps/hppa/hppa1.1/mul_1.s
@@ -0,0 +1,97 @@
+; HP-PA-1.1 __mpn_mul_1 -- Multiply a limb vector with a limb and store
+; the result in a second limb vector.
+
+; Copyright (C) 1992, 1993, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr	r26
+; s1_ptr	r25
+; size		r24
+; s2_limb	r23
+
+; This runs at 9 cycles/limb on a PA7000.  With the used instructions, it can
+; not become faster due to data cache contention after a store.  On the
+; PA7100 it runs at 7 cycles/limb, and that can not be improved either, since
+; only the xmpyu does not need the integer pipeline, so the only dual-issue
+; we will get are addc+xmpyu.  Unrolling would not help either CPU.
+
+; We could use fldds to read two limbs at a time from the S1 array, and that
+; could bring down the times to 8.5 and 6.5 cycles/limb for the PA7000 and
+; PA7100, respectively.  We don't do that since it does not seem worth the
+; (alignment) troubles...
+
+; At least the PA7100 is rumored to be able to deal with cache-misses
+; without stalling instruction issue.  If this is true, and the cache is
+; actually also lockup-free, we should use a deeper software pipeline, and
+; load from S1 very early!  (The loads and stores to -12(sp) will surely be
+; in the cache.)
+
+	.code
+	.export		__mpn_mul_1
+__mpn_mul_1
+	.proc
+	.callinfo	frame=64,no_calls
+	.entry
+
+	ldo		64(%r30),%r30		; allocate 64 bytes of stack
+	fldws,ma	4(%r25),%fr5		; fr5 = *s1_ptr, then s1_ptr += 4
+	stw		%r23,-16(%r30)		; move s2_limb ...
+	addib,=		-1,%r24,L$just_one_limb
+	 fldws		-16(%r30),%fr4		; ... into fr4
+	add		%r0,%r0,%r0		; clear carry
+	xmpyu		%fr4,%fr5,%fr6		; fr6 = first 64-bit product
+	fldws,ma	4(%r25),%fr7
+	fstds	 	%fr6,-16(%r30)		; spill product so integer regs can read it
+	xmpyu		%fr4,%fr7,%fr8		; fr8 = second 64-bit product
+	ldw		-12(%r30),%r19		; least significant limb in product
+	ldw		-16(%r30),%r28		; most significant limb in product
+
+	fstds		%fr8,-16(%r30)
+	addib,=		-1,%r24,L$end
+	 ldw		-12(%r30),%r1		; delay slot: low limb of second product
+
+; Main loop
+L$loop	fldws,ma	4(%r25),%fr5		; fetch the next s1 limb
+	stws,ma		%r19,4(%r26)		; store pending limb, then res_ptr += 4
+	addc		%r28,%r1,%r19		; next pending = high + next low + carry
+	xmpyu		%fr4,%fr5,%fr6
+	ldw		-16(%r30),%r28		; high limb of the product spilled earlier
+	fstds		%fr6,-16(%r30)		; spill the new product over it
+	addib,<>	-1,%r24,L$loop
+	 ldw		-12(%r30),%r1		; delay slot: low limb of the new product
+
+L$end	stws,ma		%r19,4(%r26)		; drain the pipeline: two limbs remain
+	addc		%r28,%r1,%r19
+	ldw		-16(%r30),%r28
+	stws,ma		%r19,4(%r26)
+	addc		%r0,%r28,%r28		; r28 = final carry limb
+	bv		0(%r2)
+	 ldo		-64(%r30),%r30		; delay slot: release the stack frame
+
+L$just_one_limb
+	xmpyu		%fr4,%fr5,%fr6
+	fstds		%fr6,-16(%r30)
+	ldw		-16(%r30),%r28		; r28 = high limb of the product
+	ldo		-64(%r30),%r30		; release the stack frame
+	bv		0(%r2)
+	 fstws		%fr6R,0(%r26)		; delay slot: store low limb straight from fr6R
+
+	.exit
+	.procend
diff --git a/sysdeps/hppa/hppa1.1/submul_1.s b/sysdeps/hppa/hppa1.1/submul_1.s
new file mode 100644
index 0000000000..44cabf4690
--- /dev/null
+++ b/sysdeps/hppa/hppa1.1/submul_1.s
@@ -0,0 +1,110 @@
+; HP-PA-1.1 __mpn_submul_1 -- Multiply a limb vector with a limb and
+; subtract the result from a second limb vector.
+
+; Copyright (C) 1992, 1993, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr	r26
+; s1_ptr	r25
+; size		r24
+; s2_limb	r23
+
+; This runs at 12 cycles/limb on a PA7000.  With the used instructions, it
+; can not become faster due to data cache contention after a store.  On the
+; PA7100 it runs at 11 cycles/limb, and that can not be improved either,
+; since only the xmpyu does not need the integer pipeline, so the only
+; dual-issue we will get are addc+xmpyu.  Unrolling could gain a cycle/limb
+; on the PA7100.
+
+; There are some ideas described in mul_1.s that apply to this code too.
+
+; It seems possible to make this run as fast as __mpn_addmul_1, if we use
+; 	sub,>>=	%r29,%r19,%r22
+;	addi	1,%r28,%r28
+; but that requires reworking the hairy software pipeline...
+
+	.code
+	.export		__mpn_submul_1
+__mpn_submul_1
+	.proc
+	.callinfo	frame=64,no_calls
+	.entry
+
+	ldo		64(%r30),%r30		; allocate 64 bytes of stack
+	fldws,ma	4(%r25),%fr5		; fr5 = *s1_ptr, then s1_ptr += 4
+	stw		%r23,-16(%r30)		; move s2_limb ...
+	addib,=		-1,%r24,L$just_one_limb
+	 fldws		-16(%r30),%fr4		; ... into fr4
+	add		%r0,%r0,%r0		; clear carry
+	xmpyu		%fr4,%fr5,%fr6		; fr6 = first 64-bit product
+	fldws,ma	4(%r25),%fr7
+	fstds		%fr6,-16(%r30)		; spill product so integer regs can read it
+	xmpyu		%fr4,%fr7,%fr8		; fr8 = second 64-bit product
+	ldw		-12(%r30),%r19		; least significant limb in product
+	ldw		-16(%r30),%r28		; most significant limb in product
+
+	fstds		%fr8,-16(%r30)
+	addib,=		-1,%r24,L$end
+	 ldw		-12(%r30),%r1		; delay slot: low limb of second product
+
+; Main loop
+L$loop	ldws		0(%r26),%r29		; r29 = *res_ptr
+	fldws,ma	4(%r25),%fr5		; fetch the next s1 limb
+	sub		%r29,%r19,%r22		; r22 = *res_ptr - pending limb
+	add		%r22,%r19,%r0		; result discarded: carry set iff sub borrowed
+	stws,ma		%r22,4(%r26)		; store result, then res_ptr += 4
+	addc		%r28,%r1,%r19		; next pending = high + next low + carry
+	xmpyu		%fr4,%fr5,%fr6
+	ldw		-16(%r30),%r28		; high limb of the product spilled earlier
+	fstds		%fr6,-16(%r30)		; spill the new product over it
+	addc		%r0,%r28,%r28		; fold carry into the high limb
+	addib,<>	-1,%r24,L$loop
+	 ldw		-12(%r30),%r1		; delay slot: low limb of the new product
+
+L$end	ldw		0(%r26),%r29		; drain the pipeline: two limbs remain
+	sub		%r29,%r19,%r22
+	add		%r22,%r19,%r0		; carry set iff the sub borrowed
+	stws,ma		%r22,4(%r26)
+	addc		%r28,%r1,%r19
+	ldw		-16(%r30),%r28
+	ldws		0(%r26),%r29
+	addc		%r0,%r28,%r28
+	sub		%r29,%r19,%r22
+	add		%r22,%r19,%r0		; carry set iff the sub borrowed
+	stws,ma		%r22,4(%r26)
+	addc		%r0,%r28,%r28		; r28 = final carry limb
+	bv		0(%r2)
+	 ldo		-64(%r30),%r30		; delay slot: release the stack frame
+
+L$just_one_limb
+	xmpyu		%fr4,%fr5,%fr6
+	ldw		0(%r26),%r29		; r29 = *res_ptr
+	fstds		%fr6,-16(%r30)
+	ldw		-12(%r30),%r1		; low limb of the product
+	ldw		-16(%r30),%r28		; high limb of the product
+	sub		%r29,%r1,%r22
+	add		%r22,%r1,%r0		; carry set iff the sub borrowed
+	stw		%r22,0(%r26)
+	addc		%r0,%r28,%r28		; r28 = high limb + borrow
+	bv		0(%r2)
+	 ldo		-64(%r30),%r30		; delay slot: release the stack frame
+
+	.exit
+	.procend
diff --git a/sysdeps/hppa/hppa1.1/udiv_qrnnd.s b/sysdeps/hppa/hppa1.1/udiv_qrnnd.s
new file mode 100644
index 0000000000..4ffef3a4fb
--- /dev/null
+++ b/sysdeps/hppa/hppa1.1/udiv_qrnnd.s
@@ -0,0 +1,74 @@
+; HP-PA  __udiv_qrnnd division support, used from longlong.h.
+; This version runs fast on PA 7000 and later.
+
+; Copyright (C) 1993, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+; rem_ptr	gr26
+; n1		gr25
+; n0		gr24
+; d		gr23
+
+	.code
+L$0000	.word		0x43f00000	; IEEE double 2^64, high word
+	.word		0x0		; ... low word
+	.export		__udiv_qrnnd
+__udiv_qrnnd
+	.proc
+	.callinfo	frame=64,no_calls
+	.entry
+	ldo		64(%r30),%r30		; allocate 64 bytes of stack
+
+	stws		%r25,-16(0,%r30)	; n_hi
+	stws		%r24,-12(0,%r30)	; n_lo
+	ldil		L'L$0000,%r19
+	ldo		R'L$0000(%r19),%r19	; r19 = address of the 2^64 constant
+	fldds		-16(0,%r30),%fr5	; fr5 = n1:n0 as a 64-bit integer
+	stws		%r23,-12(0,%r30)	; d, reloaded into fr6R below
+	comib,<=	0,%r25,L$1		; n1 non-negative as signed: skip fixup
+	fcnvxf,dbl,dbl	%fr5,%fr5		; delay slot: signed 64-bit int -> double
+	fldds		0(0,%r19),%fr4
+	fadd,dbl	%fr4,%fr5,%fr5		; add 2^64 to undo the signed conversion
+L$1
+	fcpy,sgl	%fr0,%fr6L		; presumably zeroes fr6L (d unsigned) - confirm
+	fldws		-12(0,%r30),%fr6R	; fr6R = d
+	fcnvxf,dbl,dbl	%fr6,%fr4		; fr4 = d as a double
+
+	fdiv,dbl	%fr5,%fr4,%fr5		; fr5 = n / d
+
+	fcnvfx,dbl,dbl	%fr5,%fr4		; fr4 = quotient estimate as an integer
+	fstws		%fr4R,-16(%r30)
+	xmpyu		%fr4R,%fr6R,%fr6	; fr6 = q * d
+	ldws		-16(%r30),%r28		; r28 = q
+	fstds		%fr6,-16(0,%r30)
+	ldws		-12(0,%r30),%r21	; low word of q * d
+	ldws		-16(0,%r30),%r20	; high word of q * d
+	sub		%r24,%r21,%r22		; r19:r22 = n - q * d
+	subb		%r25,%r20,%r19
+	comib,=		0,%r19,L$2		; high word zero: q needs no correction
+	ldo		-64(%r30),%r30		; delay slot: release the stack frame
+
+	add		%r22,%r23,%r22		; remainder += d
+	ldo		-1(%r28),%r28		; q -= 1
+L$2	bv		0(%r2)
+	stws		%r22,0(0,%r26)		; delay slot: *rem_ptr = remainder
+
+	.exit
+	.procend
diff --git a/sysdeps/hppa/lshift.s b/sysdeps/hppa/lshift.s
new file mode 100644
index 0000000000..0479f4a281
--- /dev/null
+++ b/sysdeps/hppa/lshift.s
@@ -0,0 +1,65 @@
+; HP-PA  __mpn_lshift -- Shift a limb vector left and store the result.
+
+; Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr	gr26
+; s_ptr		gr25
+; size		gr24
+; cnt		gr23
+
+	.code
+	.export		__mpn_lshift
+__mpn_lshift
+	.proc
+	.callinfo	frame=64,no_calls
+	.entry
+
+	sh2add		%r24,%r25,%r25	; s_ptr += 4*size (just past the last limb)
+	sh2add		%r24,%r26,%r26	; res_ptr += 4*size
+	ldws,mb		-4(0,%r25),%r22	; pre-decrement, load the last limb
+	subi		32,%r23,%r1
+	mtsar		%r1		; shift amount register = 32 - cnt
+	addib,=		-1,%r24,L$0004	; size == 1: only the final store is left
+	vshd		%r0,%r22,%r28		; compute carry out limb
+	ldws,mb		-4(0,%r25),%r29
+	addib,=		-1,%r24,L$0002
+	vshd		%r22,%r29,%r20	; delay slot: combine two neighbouring limbs
+
+L$loop	ldws,mb		-4(0,%r25),%r22	; unrolled twice: r22 and r29 alternate
+	stws,mb		%r20,-4(0,%r26)
+	addib,=		-1,%r24,L$0003
+	vshd		%r29,%r22,%r20
+	ldws,mb		-4(0,%r25),%r29
+	stws,mb		%r20,-4(0,%r26)
+	addib,<>	-1,%r24,L$loop
+	vshd		%r22,%r29,%r20
+
+L$0002	stws,mb		%r20,-4(0,%r26)	; exit with the lowest limb in r29
+	vshd		%r29,%r0,%r20	; lowest result limb: zeros shifted in
+	bv		0(%r2)
+	stw		%r20,-4(0,%r26)	; delay slot: store it
+L$0003	stws,mb		%r20,-4(0,%r26)	; exit with the lowest limb in r22
+L$0004	vshd		%r22,%r0,%r20	; lowest result limb: zeros shifted in
+	bv		0(%r2)
+	stw		%r20,-4(0,%r26)	; delay slot: store it
+
+	.exit
+	.procend
diff --git a/sysdeps/hppa/rshift.s b/sysdeps/hppa/rshift.s
new file mode 100644
index 0000000000..18d33f2f86
--- /dev/null
+++ b/sysdeps/hppa/rshift.s
@@ -0,0 +1,62 @@
+; HP-PA  __mpn_rshift -- Shift a limb vector right and store the result.
+
+; Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr	gr26
+; s_ptr		gr25
+; size		gr24
+; cnt		gr23
+
+	.code
+	.export		__mpn_rshift
+__mpn_rshift
+	.proc
+	.callinfo	frame=64,no_calls
+	.entry
+
+	ldws,ma		4(0,%r25),%r22	; r22 = first limb, then s_ptr += 4
+	mtsar		%r23		; shift amount register = cnt
+	addib,=		-1,%r24,L$0004	; size == 1: only the final store is left
+	vshd		%r22,%r0,%r28		; compute carry out limb
+	ldws,ma		4(0,%r25),%r29
+	addib,=		-1,%r24,L$0002
+	vshd		%r29,%r22,%r20	; delay slot: combine two neighbouring limbs
+
+L$loop	ldws,ma		4(0,%r25),%r22	; unrolled twice: r22 and r29 alternate
+	stws,ma		%r20,4(0,%r26)
+	addib,=		-1,%r24,L$0003
+	vshd		%r22,%r29,%r20
+	ldws,ma		4(0,%r25),%r29
+	stws,ma		%r20,4(0,%r26)
+	addib,<>	-1,%r24,L$loop
+	vshd		%r29,%r22,%r20
+
+L$0002	stws,ma		%r20,4(0,%r26)	; exit with the last limb in r29
+	vshd		%r0,%r29,%r20	; last result limb: zeros shifted in
+	bv		0(%r2)
+	stw		%r20,0(0,%r26)	; delay slot: store it
+L$0003	stws,ma		%r20,4(0,%r26)	; exit with the last limb in r22
+L$0004	vshd		%r0,%r22,%r20	; last result limb: zeros shifted in
+	bv		0(%r2)
+	stw		%r20,0(0,%r26)	; delay slot: store it
+
+	.exit
+	.procend
diff --git a/sysdeps/hppa/sub_n.s b/sysdeps/hppa/sub_n.s
new file mode 100644
index 0000000000..daae46e61d
--- /dev/null
+++ b/sysdeps/hppa/sub_n.s
@@ -0,0 +1,58 @@
+; HP-PA  __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+; store difference in a third limb vector.
+
+; Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr	gr26
+; s1_ptr	gr25
+; s2_ptr	gr24
+; size		gr23
+
+; One might want to unroll this as for other processors, but it turns
+; out that the data cache contention after a store makes such
+; unrolling useless.  We can't come under 5 cycles/limb anyway.
+
+	.code
+	.export		__mpn_sub_n
+__mpn_sub_n
+	.proc
+	.callinfo	frame=0,no_calls
+	.entry
+
+	ldws,ma		4(0,%r25),%r20	; r20 = *s1_ptr, then s1_ptr += 4
+	ldws,ma		4(0,%r24),%r19	; r19 = *s2_ptr, then s2_ptr += 4
+
+	addib,=		-1,%r23,L$end	; check for (SIZE == 1)
+	 sub		%r20,%r19,%r28	; subtract first limbs ignoring cy
+
+L$loop	ldws,ma		4(0,%r25),%r20	; fetch the next pair of limbs
+	ldws,ma		4(0,%r24),%r19
+	stws,ma		%r28,4(0,%r26)	; store previous difference, res_ptr += 4
+	addib,<>	-1,%r23,L$loop	; size--, loop until it reaches 0
+	 subb		%r20,%r19,%r28	; delay slot: subtract limbs with borrow in
+
+L$end	stws		%r28,0(0,%r26)	; store the last difference limb
+	addc		%r0,%r0,%r28	; r28 = carry bit left by the last subtract
+	bv		0(%r2)
+	 subi		1,%r28,%r28	; delay slot: r28 = 1 - carry, the borrow out
+
+	.exit
+	.procend
diff --git a/sysdeps/hppa/udiv_qrnnd.s b/sysdeps/hppa/udiv_qrnnd.s
new file mode 100644
index 0000000000..0b069bf7f9
--- /dev/null
+++ b/sysdeps/hppa/udiv_qrnnd.s
@@ -0,0 +1,285 @@
+; HP-PA  __udiv_qrnnd division support, used from longlong.h.
+; This version runs fast on pre-PA7000 CPUs.
+
+; Copyright (C) 1993, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+; rem_ptr	gr26
+; n1		gr25
+; n0		gr24
+; d		gr23
+
+; The code size is a bit excessive.  We could merge the last two ds;addc
+; sequences by simply moving the "bb,< Odd" instruction down.  The only
+; trouble is the FFFFFFFF code that would need some hacking.
+
+	.code
+	.export		__udiv_qrnnd
+__udiv_qrnnd
+	.proc
+	.callinfo	frame=0,no_calls
+	.entry
+
+	comb,<		%r23,0,L$largedivisor	; d negative as signed (top bit set)?
+	 sub		%r0,%r23,%r1		; clear cy as side-effect
+	ds		%r0,%r1,%r0		; prime the divide-step state
+	addc		%r24,%r24,%r24		; shift n0 left, carry joins the next ds
+	ds		%r25,%r23,%r25		; divide step 1 of 32
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25		; divide step 8
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25		; divide step 16
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25		; divide step 24
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r28		; quotient bits so far move to r28
+	ds		%r25,%r23,%r25		; divide step 32
+	comclr,>=	%r25,%r0,%r0		; remainder >= 0: skip the correction
+	addl		%r25,%r23,%r25		; else add d back
+	stws		%r25,0(0,%r26)		; *rem_ptr = remainder
+	bv		0(%r2)
+	 addc		%r28,%r28,%r28		; delay slot: shift in the last quotient bit
+
+L$largedivisor
+	extru		%r24,31,1,%r19		; r19 = n0 & 1
+	bb,<		%r23,31,L$odd
+	 extru		%r23,30,31,%r22		; r22 = d >> 1
+	shd		%r25,%r24,1,%r24	; r24 = new n0
+	extru		%r25,30,31,%r25		; r25 = new n1
+	sub		%r0,%r22,%r21
+	ds		%r0,%r21,%r0		; prime the divide-step state
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25		; divide step 1 of 32
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25		; divide step 8
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25		; divide step 16
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25		; divide step 24
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25		; divide step 32
+	comclr,>=	%r25,%r0,%r0		; remainder >= 0: skip the correction
+	addl		%r25,%r22,%r25		; else add d >> 1 back
+	sh1addl		%r25,%r19,%r25		; remainder = 2 * r + (n0 & 1)
+	stws		%r25,0(0,%r26)		; *rem_ptr = remainder
+	bv		0(%r2)
+	 addc		%r24,%r24,%r28		; delay slot: shift in the last quotient bit
+
+L$odd	addib,sv,n	1,%r22,L$FF..		; r22 = (d / 2 + 1)
+	shd		%r25,%r24,1,%r24	; r24 = new n0
+	extru		%r25,30,31,%r25		; r25 = new n1
+	sub		%r0,%r22,%r21
+	ds		%r0,%r21,%r0		; prime the divide-step state
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25		; divide step 1 of 32
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25		; divide step 8
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25		; divide step 16
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25		; divide step 24
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25		; divide step 32
+	addc		%r24,%r24,%r28		; r28 = q', last quotient bit shifted in
+	comclr,>=	%r25,%r0,%r0		; remainder >= 0: skip the correction
+	addl		%r25,%r22,%r25		; else add (d / 2 + 1) back
+	sh1addl		%r25,%r19,%r25		; r' = 2 * r + (n0 & 1)
+; We have computed (n1,,n0) / (d + 1), q' = r28, r' = r25
+	add,nuv		%r28,%r25,%r25		; r' += q'; skip next insn unless it wrapped
+	addl		%r25,%r1,%r25		; r1 = -d, so this is r' -= d
+	addc		%r0,%r28,%r28		; q' += carry from the add above
+	sub,<<		%r25,%r23,%r0		; compare r' with d; skip next if r' < d
+	addl		%r25,%r1,%r25		; r' -= d
+	stws		%r25,0(0,%r26)		; *rem_ptr = r'
+	bv		0(%r2)
+	 addc		%r0,%r28,%r28		; delay slot: q' += carry left by the compare
+
+; This is just a special case of the code above.
+; We come here when d == 0xFFFFFFFF
+L$FF..	add,uv		%r25,%r24,%r24		; r = n1 + n0; skip next insn if it wrapped
+	sub,<<		%r24,%r23,%r0		; compare r with d; skip next if r < d
+	ldo		1(%r24),%r24		; r += 1, same as r -= d when d == 0xFFFFFFFF
+	stws		%r24,0(0,%r26)		; *rem_ptr = r
+	bv		0(%r2)
+	 addc		%r0,%r25,%r28		; delay slot: q = n1 + carry
+
+	.exit
+	.procend
diff --git a/sysdeps/i386/add_n.S b/sysdeps/i386/add_n.S
index c4e71ea8c7..c3b3c3e4e1 100644
--- a/sysdeps/i386/add_n.S
+++ b/sysdeps/i386/add_n.S
@@ -1,7 +1,7 @@
 /* i80386 __mpn_add_n -- Add two limb vectors of the same length > 0 and store
 sum in a third limb vector.
 
-Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
 
 This file is part of the GNU MP Library.
 
@@ -54,14 +54,18 @@ C_SYMBOL_NAME(__mpn_add_n:)
 	subl	%eax,%edx		/* ... enter the loop */
 	shrl	$2,%eax			/* restore previous value */
 #ifdef PIC
-	call	here
-here:	leal	(Loop - 3 - here)(%eax,%eax,8),%eax
-	addl	%eax,(%esp)
-	ret
+/* Calculate start address in loop for PIC.  Due to limitations in some
+   assemblers, Loop-L0-3 cannot be put into the leal */
+	call	L0
+L0:	leal	(%eax,%eax,8),%eax
+	addl	(%esp),%eax
+	addl	$(Loop-L0-3),%eax 
+	addl	$4,%esp
 #else
-	leal	(Loop - 3)(%eax,%eax,8),%eax	/* calc start addr in loop */
-	jmp	*%eax			/* jump into loop */
+/* Calculate start address in loop for non-PIC.  */
+ 	leal	(Loop - 3)(%eax,%eax,8),%eax
 #endif
+	jmp	*%eax			/* jump into loop */
 	ALIGN (3)
 Loop:	movl	(%esi),%eax
 	adcl	(%edx),%eax
diff --git a/sysdeps/i386/gmp-mparam.h b/sysdeps/i386/gmp-mparam.h
new file mode 100644
index 0000000000..687f12aa35
--- /dev/null
+++ b/sysdeps/i386/gmp-mparam.h
@@ -0,0 +1,28 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Library General Public License as published by
+the Free Software Foundation; either version 2 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+License for more details.
+
+You should have received a copy of the GNU Library General Public License
+along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
+
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
+#define BITS_PER_LONGINT 32
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+#define IEEE_DOUBLE_BIG_ENDIAN 0
diff --git a/sysdeps/i386/i486/strcat.S b/sysdeps/i386/i486/strcat.S
new file mode 100644
index 0000000000..e3d2181bdb
--- /dev/null
+++ b/sysdeps/i386/i486/strcat.S
@@ -0,0 +1,260 @@
+/* strcat(dest, src) -- Append SRC on the end of DEST.
+For Intel 80x86, x>=4.
+Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+Contributed by Ulrich Drepper <drepper@ipd.info.uni-karlsruhe.de>.
+Optimised a little by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+This file is part of the GNU C Library.
+
+The GNU C Library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+The GNU C Library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with the GNU C Library; see the file COPYING.LIB.  If
+not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+/*
+   INPUT PARAMETERS:
+   dest		(sp + 4)
+   src		(sp + 8)
+*/
+
+	.text
+ENTRY (strcat)
+	pushl %edi		/* Save callee-saved register.  */
+
+	movl 12(%esp), %ecx	/* load source pointer */
+	movl 8(%esp), %edx	/* load destination pointer */
+
+	testb $0xff, (%ecx)	/* Is source string empty? */
+	jz L8			/* yes => return */
+
+	/* Test the first bytes separately until destination is aligned.  */
+	testb $3, %dl		/* destination pointer aligned? */
+	jz L1			/* yes => begin scan loop */
+	testb $0xff, (%edx)	/* is end of string? */
+	jz L2			/* yes => start appending */
+	incl %edx		/* increment destination pointer */
+
+	testb $3, %dl		/* destination pointer aligned? */
+	jz L1			/* yes => begin scan loop */
+	testb $0xff, (%edx)	/* is end of string? */
+	jz L2			/* yes => start appending */
+	incl %edx		/* increment destination pointer */
+
+	testb $3, %dl		/* destination pointer aligned? */
+	jz L1			/* yes => begin scan loop */
+	testb $0xff, (%edx)	/* is end of string? */
+	jz L2			/* yes => start appending */
+	incl %edx		/* increment destination pointer */
+
+	/* Now we are aligned.  Begin scan loop.  */
+	jmp L1
+
+	ALIGN(4)
+
+L4:	addl $16,%edx		/* increment destination pointer for round */
+
+L1:	movl (%edx), %eax	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+
+	/* If you compare this with the algorithm in memchr.S you will
+	   notice that here is an `xorl' statement missing.  But you must
+	   not forget that we are looking for C == 0 and `xorl $0, %eax'
+	   is a no-op.  */
+
+	addl %eax, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+
+	/* According to the algorithm we had to reverse the effect of the
+	   XOR first and then test the overflow bits.  But because the
+	   following XOR would destroy the carry flag and it would (in a
+	   representation with more than 32 bits) not alter the last
+	   overflow, we can now test this condition.  If no carry is signaled
+	   no overflow must have occurred in the last byte => it was 0.	*/
+	jnc L3
+
+	/* We are only interested in carry bits that change due to the
+	   previous add, so remove original bits */
+	xorl %eax, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+
+	/* Now test for the other three overflow bits.  */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+
+	/* If at least one byte of the word is NUL we don't get 0 in %edi.  */
+	jnz L3
+
+	movl 4(%edx), %eax	/* get next word from destination */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %eax, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L5			/* highest byte is NUL => stop scanning */
+	xorl %eax, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L5			/* one byte is NUL => stop scanning */
+
+	movl 8(%edx), %eax	/* get next word from destination */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %eax, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L6			/* highest byte is NUL => stop scanning */
+	xorl %eax, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L6			/* one byte is NUL => stop scanning */
+
+	movl 12(%edx), %eax	/* get next word from destination */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %eax, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L7			/* highest byte is NUL => stop scanning */
+	xorl %eax, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jz L4			/* no byte is NUL => continue scanning */
+
+L7:	addl $4, %edx		/* adjust destination pointer */
+L6:	addl $4, %edx
+L5:	addl $4, %edx
+
+L3:	testb %al, %al		/* is first byte NUL? */
+	jz L2			/* yes => start copying */
+	incl %edx		/* increment destination pointer */
+
+	testb %ah, %ah		/* is second byte NUL? */
+	jz L2			/* yes => start copying */
+	incl %edx		/* increment destination pointer */
+
+	testl $0xff0000, %eax	/* is third byte NUL? */
+	jz L2			/* yes => start copying */
+	incl %edx		/* fourth byte must be the NUL */
+
+L2:	subl %ecx, %edx		/* %edx = dest end - src, so (%ecx,%edx)
+				   addresses the destination */
+
+	/* Now we have to align the source pointer.  */
+	testb $3, %cl		/* pointer correctly aligned? */
+	jz L29			/* yes => start copy loop */
+	movb (%ecx), %al	/* get first byte */
+	movb %al, (%ecx,%edx)	/* and store it */
+	andb %al, %al		/* is byte NUL?  (byte test: upper %eax is stale) */
+	jz L8			/* yes => return */
+	incl %ecx		/* increment pointer */
+
+	testb $3, %cl		/* pointer correctly aligned? */
+	jz L29			/* yes => start copy loop */
+	movb (%ecx), %al	/* get next byte */
+	movb %al, (%ecx,%edx)	/* and store it */
+	andb %al, %al		/* is byte NUL? */
+	jz L8			/* yes => return */
+	incl %ecx		/* increment pointer */
+
+	testb $3, %cl		/* pointer correctly aligned? */
+	jz L29			/* yes => start copy loop */
+	movb (%ecx), %al	/* get next byte */
+	movb %al, (%ecx,%edx)	/* and store it */
+	andb %al, %al		/* is byte NUL? */
+	jz L8			/* yes => return */
+	incl %ecx		/* increment pointer */
+
+	/* Now we are aligned.  */
+	jmp L29			/* start copy loop */
+
+	ALIGN(4)
+
+L28:	movl %eax, 12(%ecx,%edx)/* store word at destination */
+	addl $16, %ecx		/* adjust pointer for full round */
+
+L29:	movl (%ecx), %eax	/* get word from source */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %eax, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L9			/* highest byte is NUL => stop copying */
+	xorl %eax, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L9			/* one byte is NUL => stop copying */
+	movl %eax, (%ecx,%edx)	/* store word to destination */
+
+	movl 4(%ecx), %eax	/* get word from source */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %eax, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L91			/* highest byte is NUL => stop copying */
+	xorl %eax, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L91			/* one byte is NUL => stop copying */
+	movl %eax, 4(%ecx,%edx)	/* store word to destination */
+
+	movl 8(%ecx), %eax	/* get word from source */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %eax, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L92			/* highest byte is NUL => stop copying */
+	xorl %eax, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L92			/* one byte is NUL => stop copying */
+	movl %eax, 8(%ecx,%edx)	/* store word to destination */
+
+	movl 12(%ecx), %eax	/* get word from source */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %eax, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L93			/* highest byte is NUL => stop copying */
+	xorl %eax, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jz L28			/* no byte is NUL => carry on copying */
+
+L93:	addl $4, %ecx		/* adjust pointer */
+L92:	addl $4, %ecx
+L91:	addl $4, %ecx
+
+L9:	movb %al, (%ecx,%edx)	/* store first byte of last word */
+	orb %al, %al		/* is it NUL? */
+	jz L8			/* yes => return */
+
+	movb %ah, 1(%ecx,%edx)	/* store second byte of last word */
+	orb %ah, %ah		/* is it NUL? */
+	jz L8			/* yes => return */
+
+	shrl $16, %eax		/* make upper bytes accessible */
+	movb %al, 2(%ecx,%edx)	/* store third byte of last word */
+	orb %al, %al		/* is it NUL? */
+	jz L8			/* yes => return */
+
+	movb %ah, 3(%ecx,%edx)	/* store fourth byte of last word */
+
+L8:	movl 8(%esp), %eax	/* start address of destination is result */
+	popl %edi		/* restore saved register */
+
+	ret
diff --git a/sysdeps/i386/i486/strlen.S b/sysdeps/i386/i486/strlen.S
new file mode 100644
index 0000000000..276563b11a
--- /dev/null
+++ b/sysdeps/i386/i486/strlen.S
@@ -0,0 +1,132 @@
+/* strlen(str) -- determine the length of the string STR.
+Optimized for Intel 80x86, x>=4.
+Copyright (C) 1991, 1992, 1993, 1994, 1995 Free Software Foundation, Inc.
+Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>.
+This file is part of the GNU C Library.
+
+The GNU C Library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+The GNU C Library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with the GNU C Library; see the file COPYING.LIB.  If
+not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+/*
+   INPUT PARAMETERS:
+   str		(sp + 4)
+*/
+
+	.text
+ENTRY (strlen)
+	movl 4(%esp), %ecx	/* get string pointer */
+	movl %ecx, %eax		/* duplicate it; %eax is the scan pointer */
+
+	andl $3, %ecx		/* mask alignment bits (leaves %ch == 0) */
+	jz L1			/* aligned => start loop */
+	cmpb %ch, (%eax)	/* is byte NUL?  (%ch is 0 here) */
+	je L2			/* yes => return */
+	incl %eax		/* increment pointer */
+
+	xorl $3, %ecx		/* was alignment = 3? */
+	jz L1			/* yes => now it is aligned and start loop */
+	cmpb %ch, (%eax)	/* is byte NUL? */
+	je L2			/* yes => return */
+	addl $1, %eax		/* increment pointer */
+
+	subl $1, %ecx		/* was alignment = 2? */
+	jz L1			/* yes => now it is aligned and start loop */
+	cmpb %ch, (%eax)	/* is byte NUL? */
+	je L2			/* yes => return */
+
+/* Don't change the above `addl $1, %eax' and `subl $1, %ecx' into `incl %eax'
+   and `decl %ecx' resp.  The additional two bytes per instruction make the
+   label L4 to be aligned on a 16 byte boundary with nops.
+
+   The following `subl $15, %eax' is part of this trick, too.  Together with
+   the next instruction (`addl $16, %eax') it is in fact an `incl %eax', just
+   as expected from the algorithm.  But doing so has the advantage that
+   no jump to label L1 is necessary and so the pipeline is not flushed.  */
+
+	subl $15, %eax		/* effectively +1 */
+
+
+L4:	addl $16, %eax		/* adjust pointer for full loop */
+
+L1:	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edx	/* magic value */
+	addl %ecx, %edx		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L3			/* highest byte is NUL => return pointer */
+	xorl %ecx, %edx		/* (word+magic)^word */
+	orl $0xfefefeff, %edx	/* set all non-carry bits */
+	incl %edx		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L3			/* found NUL => return pointer */
+
+	movl 4(%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edx	/* magic value */
+	addl %ecx, %edx		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L5			/* highest byte is NUL => return pointer */
+	xorl %ecx, %edx		/* (word+magic)^word */
+	orl $0xfefefeff, %edx	/* set all non-carry bits */
+	incl %edx		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L5			/* found NUL => return pointer */
+
+	movl 8(%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edx	/* magic value */
+	addl %ecx, %edx		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L6			/* highest byte is NUL => return pointer */
+	xorl %ecx, %edx		/* (word+magic)^word */
+	orl $0xfefefeff, %edx	/* set all non-carry bits */
+	incl %edx		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L6			/* found NUL => return pointer */
+
+	movl 12(%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edx	/* magic value */
+	addl %ecx, %edx		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L7			/* highest byte is NUL => return pointer */
+	xorl %ecx, %edx		/* (word+magic)^word */
+	orl $0xfefefeff, %edx	/* set all non-carry bits */
+	incl %edx		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jz L4			/* no NUL found => continue loop */
+
+L7:	addl $4, %eax		/* advance to the word that holds the NUL */
+L6:	addl $4, %eax
+L5:	addl $4, %eax
+
+L3:	testb %cl, %cl		/* is first byte NUL? */
+	jz L2			/* yes => return */
+	incl %eax		/* increment pointer */
+
+	testb %ch, %ch		/* is second byte NUL? */
+	jz L2			/* yes => return */
+	incl %eax		/* increment pointer */
+
+	testl $0xff0000, %ecx	/* is third byte NUL? */
+	jz L2			/* yes => return pointer */
+	incl %eax		/* otherwise the fourth byte is the NUL */
+
+L2:	subl 4(%esp), %eax	/* length = NUL address - string start */
+
+	ret
diff --git a/sysdeps/i386/i586/Implies b/sysdeps/i386/i586/Implies
new file mode 100644
index 0000000000..477cd741ce
--- /dev/null
+++ b/sysdeps/i386/i586/Implies
@@ -0,0 +1,2 @@
+# Code optimized for i486 is better than simple i386 code.
+i386/i486
diff --git a/sysdeps/i386/i586/add_n.S b/sysdeps/i386/i586/add_n.S
new file mode 100644
index 0000000000..9be45ed93b
--- /dev/null
+++ b/sysdeps/i386/i586/add_n.S
@@ -0,0 +1,136 @@
+/* Pentium __mpn_add_n -- Add two limb vectors of the same length > 0 and store
+   sum in a third limb vector.
+
+Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Library General Public License as published by
+the Free Software Foundation; either version 2 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+License for more details.
+
+You should have received a copy of the GNU Library General Public License
+along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
+
+/*
+   INPUT PARAMETERS
+   res_ptr	(sp + 4)
+   s1_ptr	(sp + 8)
+   s2_ptr	(sp + 12)
+   size		(sp + 16)
+*/
+
+#define r1	%eax
+#define r2	%edx
+#define src1	%esi
+#define src2	%ebp
+#define dst	%edi
+#define x	%ebx
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+.text
+	ALIGN (3)
+	.globl C_SYMBOL_NAME(__mpn_add_n)
+C_SYMBOL_NAME(__mpn_add_n:)
+	pushl	%edi
+	pushl	%esi
+	pushl	%ebx
+	pushl	%ebp
+
+	movl	20(%esp),dst		/* res_ptr */
+	movl	24(%esp),src1		/* s1_ptr */
+	movl	28(%esp),src2		/* s2_ptr */
+	movl	32(%esp),%ecx		/* size */
+
+	movl	(src2),x		/* preload first s2 limb */
+
+	decl	%ecx			/* size - 1: last limb is added at Lend2 */
+	movl	%ecx,r2
+	shrl	$3,%ecx			/* number of 8-limb rounds for Loop */
+	andl	$7,r2			/* leftover limbs for Loop2 */
+	testl	%ecx,%ecx		/* zero carry flag */
+	jz	Lend
+	pushl	r2			/* r2 is used as scratch inside Loop */
+
+	ALIGN (3)
+Loop:	movl	28(dst),%eax		/* fetch destination cache line */
+	leal	32(dst),dst		/* leal: advance without touching carry */
+
+L1:	movl	(src1),r1
+	movl	4(src1),r2
+	adcl	x,r1
+	movl	4(src2),x
+	adcl	x,r2
+	movl	8(src2),x		/* preload s2 limb for the next pair */
+	movl	r1,-32(dst)
+	movl	r2,-28(dst)
+
+L2:	movl	8(src1),r1
+	movl	12(src1),r2
+	adcl	x,r1
+	movl	12(src2),x
+	adcl	x,r2
+	movl	16(src2),x
+	movl	r1,-24(dst)
+	movl	r2,-20(dst)
+
+L3:	movl	16(src1),r1
+	movl	20(src1),r2
+	adcl	x,r1
+	movl	20(src2),x
+	adcl	x,r2
+	movl	24(src2),x
+	movl	r1,-16(dst)
+	movl	r2,-12(dst)
+
+L4:	movl	24(src1),r1
+	movl	28(src1),r2
+	adcl	x,r1
+	movl	28(src2),x
+	adcl	x,r2
+	movl	32(src2),x		/* preload s2 limb for the next round */
+	movl	r1,-8(dst)
+	movl	r2,-4(dst)
+
+	leal	32(src1),src1
+	leal	32(src2),src2
+	decl	%ecx			/* decl leaves the carry flag intact */
+	jnz	Loop
+
+	popl	r2			/* restore leftover limb count */
+Lend:
+	decl	r2			/* test r2 w/o clobbering carry */
+	js	Lend2			/* r2 was 0 => no leftover limbs */
+	incl	r2
+Loop2:
+	leal	4(dst),dst
+	movl	(src1),r1
+	adcl	x,r1
+	movl	4(src2),x		/* preload next s2 limb */
+	movl	r1,-4(dst)
+	leal	4(src1),src1
+	leal	4(src2),src2
+	decl	r2
+	jnz	Loop2
+Lend2:
+	movl	(src1),r1		/* final limb */
+	adcl	x,r1
+	movl	r1,(dst)
+
+	sbbl	%eax,%eax		/* %eax = -carry */
+	negl	%eax			/* return carry-out (0 or 1) */
+
+	popl	%ebp
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	ret
diff --git a/sysdeps/i386/i586/addmul_1.S b/sysdeps/i386/i586/addmul_1.S
new file mode 100644
index 0000000000..b222840591
--- /dev/null
+++ b/sysdeps/i386/i586/addmul_1.S
@@ -0,0 +1,84 @@
+/* Pentium __mpn_addmul_1 -- Multiply a limb vector with a limb and add
+   the result to a second limb vector.
+
+Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Library General Public License as published by
+the Free Software Foundation; either version 2 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+License for more details.
+
+You should have received a copy of the GNU Library General Public License
+along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
+
+/*
+   INPUT PARAMETERS
+   res_ptr	(sp + 4)
+   s1_ptr	(sp + 8)
+   size		(sp + 12)
+   s2_limb	(sp + 16)
+*/
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define res_ptr edi
+#define s1_ptr esi
+#define s2_limb ebp
+
+	TEXT
+	ALIGN (3)
+	GLOBL	C_SYMBOL_NAME(__mpn_addmul_1)
+	.type	C_SYMBOL_NAME(__mpn_addmul_1),@function
+C_SYMBOL_NAME(__mpn_addmul_1:)
+
+	INSN1(push,l	,R(edi))
+	INSN1(push,l	,R(esi))
+	INSN1(push,l	,R(ebx))
+	INSN1(push,l	,R(ebp))
+
+	INSN2(mov,l	,R(res_ptr),MEM_DISP(esp,20))	/* res_ptr argument */
+	INSN2(mov,l	,R(s1_ptr),MEM_DISP(esp,24))	/* s1_ptr argument */
+	INSN2(mov,l	,R(ecx),MEM_DISP(esp,28))	/* size argument */
+	INSN2(mov,l	,R(s2_limb),MEM_DISP(esp,32))	/* s2_limb argument */
+
+	INSN2(lea,l	,R(res_ptr),MEM_INDEX(res_ptr,ecx,4))	/* point past the end */
+	INSN2(lea,l	,R(s1_ptr),MEM_INDEX(s1_ptr,ecx,4))
+	INSN1(neg,l	,R(ecx))	/* index runs from -size up to 0 */
+	INSN2(xor,l	,R(edx),R(edx))	/* initial carry limb = 0 */
+	ALIGN (3)
+Loop:
+	INSN2(mov,l	,R(ebx),R(edx))	/* ebx = carry limb from previous round */
+	INSN2(mov,l	,R(eax),MEM_INDEX(s1_ptr,ecx,4))
+
+	INSN1(mul,l	,R(s2_limb))	/* edx:eax = s1 limb * s2_limb */
+
+	INSN2(add,l	,R(eax),R(ebx))	/* add incoming carry limb */
+	INSN2(mov,l	,R(ebx),MEM_INDEX(res_ptr,ecx,4))
+
+	INSN2(adc,l	,R(edx),$0)
+	INSN2(add,l	,R(ebx),R(eax))	/* add low product to the result limb */
+
+	INSN2(adc,l	,R(edx),$0)	/* edx = carry limb for the next round */
+	INSN2(mov,l	,MEM_INDEX(res_ptr,ecx,4),R(ebx))
+
+	INSN1(inc,l	,R(ecx))
+	INSN1(jnz,	,Loop)
+
+
+	INSN2(mov,l	,R(eax),R(edx))	/* return the final carry limb */
+	INSN1(pop,l	,R(ebp))
+	INSN1(pop,l	,R(ebx))
+	INSN1(pop,l	,R(esi))
+	INSN1(pop,l	,R(edi))
+	ret
+Lfe1:
+	.size	C_SYMBOL_NAME(__mpn_addmul_1),Lfe1-C_SYMBOL_NAME(__mpn_addmul_1)
diff --git a/sysdeps/i386/i586/lshift.S b/sysdeps/i386/i586/lshift.S
new file mode 100644
index 0000000000..b9f8131297
--- /dev/null
+++ b/sysdeps/i386/i586/lshift.S
@@ -0,0 +1,213 @@
+/* Pentium optimized __mpn_lshift -- shift a limb vector left by CNT bits.
+
+Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Library General Public License as published by
+the Free Software Foundation; either version 2 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+License for more details.
+
+You should have received a copy of the GNU Library General Public License
+along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
+
+/*
+  INPUT PARAMETERS
+  res_ptr	(sp + 4)
+  s_ptr		(sp + 8)
+  size		(sp + 12)
+  cnt		(sp + 16)
+*/
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+.text
+	ALIGN (3)
+	.globl C_SYMBOL_NAME(__mpn_lshift)
+C_SYMBOL_NAME(__mpn_lshift:)
+	pushl	%edi
+	pushl	%esi
+	pushl	%ebx
+	pushl	%ebp
+
+	movl	20(%esp),%edi		/* res_ptr */
+	movl	24(%esp),%esi		/* s_ptr */
+	movl	28(%esp),%ebp		/* size */
+	movl	32(%esp),%ecx		/* cnt */
+
+	cmp	$1,%ecx			/* faster code exists for cnt == 1 */
+	jne	Lnormal
+	leal	4(%esi),%eax
+	cmpl	%edi,%eax
+	jnc	Lspecial		/* jump if s_ptr + 1 >= res_ptr */
+	leal	(%esi,%ebp,4),%eax
+	cmpl	%eax,%edi
+	jnc	Lspecial		/* jump if res_ptr >= s_ptr + size */
+
+Lnormal:
+	leal	-4(%edi,%ebp,4),%edi	/* point at most significant limbs */
+	leal	-4(%esi,%ebp,4),%esi
+
+	movl	(%esi),%edx
+	subl	$4,%esi
+	xorl	%eax,%eax
+	shldl	%cl,%edx,%eax		/* compute carry limb */
+	pushl	%eax			/* push carry limb onto stack */
+
+	decl	%ebp			/* size - 1: last limb is stored at Lend2 */
+	pushl	%ebp			/* keep count; low 3 bits used at Lend */
+	shrl	$3,%ebp			/* number of 8-limb rounds */
+	jz	Lend
+
+	movl	(%edi),%eax		/* fetch destination cache line */
+
+	ALIGN	(2)
+Loop:	movl	-28(%edi),%eax		/* fetch destination cache line */
+	movl	%edx,%ebx
+
+	movl	(%esi),%eax
+	movl	-4(%esi),%edx
+	shldl	%cl,%eax,%ebx
+	shldl	%cl,%edx,%eax
+	movl	%ebx,(%edi)
+	movl	%eax,-4(%edi)
+
+	movl	-8(%esi),%ebx
+	movl	-12(%esi),%eax
+	shldl	%cl,%ebx,%edx
+	shldl	%cl,%eax,%ebx
+	movl	%edx,-8(%edi)
+	movl	%ebx,-12(%edi)
+
+	movl	-16(%esi),%edx
+	movl	-20(%esi),%ebx
+	shldl	%cl,%edx,%eax
+	shldl	%cl,%ebx,%edx
+	movl	%eax,-16(%edi)
+	movl	%edx,-20(%edi)
+
+	movl	-24(%esi),%eax
+	movl	-28(%esi),%edx
+	shldl	%cl,%eax,%ebx
+	shldl	%cl,%edx,%eax
+	movl	%ebx,-24(%edi)
+	movl	%eax,-28(%edi)
+
+	subl	$32,%esi
+	subl	$32,%edi
+	decl	%ebp
+	jnz	Loop
+
+Lend:	popl	%ebp
+	andl	$7,%ebp			/* leftover limbs after the 8-limb rounds */
+	jz	Lend2
+Loop2:	movl	(%esi),%eax
+	shldl	%cl,%eax,%edx
+	movl	%edx,(%edi)
+	movl	%eax,%edx
+	subl	$4,%esi
+	subl	$4,%edi
+	decl	%ebp
+	jnz	Loop2
+
+Lend2:	shll	%cl,%edx		/* compute least significant limb */
+	movl	%edx,(%edi)		/* store it */
+
+	popl	%eax			/* pop carry limb */
+
+	popl	%ebp
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	ret
+
+/* Shift-by-1 fast path.  It walks up from the least significant limb, so
+   it is entered only when that order cannot clobber unread source limbs.  */
+
+Lspecial:
+	movl	(%esi),%edx
+	addl	$4,%esi
+
+	decl	%ebp			/* size - 1: last limb is stored at LL1 */
+	pushl	%ebp
+	shrl	$3,%ebp			/* number of 8-limb rounds */
+
+	addl	%edx,%edx		/* shift first limb; carry = bit shifted out */
+	incl	%ebp			/* incl/decl: test %ebp for zero ... */
+	decl	%ebp			/* ... without clobbering carry */
+	jz	LLend
+
+	movl	(%edi),%eax		/* fetch destination cache line */
+
+	ALIGN	(2)
+LLoop:	movl	28(%edi),%eax		/* fetch destination cache line */
+	movl	%edx,%ebx
+
+	movl	(%esi),%eax
+	movl	4(%esi),%edx
+	adcl	%eax,%eax
+	movl	%ebx,(%edi)
+	adcl	%edx,%edx
+	movl	%eax,4(%edi)
+
+	movl	8(%esi),%ebx
+	movl	12(%esi),%eax
+	adcl	%ebx,%ebx
+	movl	%edx,8(%edi)
+	adcl	%eax,%eax
+	movl	%ebx,12(%edi)
+
+	movl	16(%esi),%edx
+	movl	20(%esi),%ebx
+	adcl	%edx,%edx
+	movl	%eax,16(%edi)
+	adcl	%ebx,%ebx
+	movl	%edx,20(%edi)
+
+	movl	24(%esi),%eax
+	movl	28(%esi),%edx
+	adcl	%eax,%eax
+	movl	%ebx,24(%edi)
+	adcl	%edx,%edx
+	movl	%eax,28(%edi)
+
+	leal	32(%esi),%esi		/* use leal not to clobber carry */
+	leal	32(%edi),%edi
+	decl	%ebp
+	jnz	LLoop
+
+LLend:	popl	%ebp
+	sbbl	%eax,%eax		/* save carry in %eax */
+	andl	$7,%ebp			/* leftover limbs after the 8-limb rounds */
+	jz	LLend2
+	addl	%eax,%eax		/* restore carry from eax */
+LLoop2:	movl	%edx,%ebx
+	movl	(%esi),%edx
+	adcl	%edx,%edx
+	movl	%ebx,(%edi)
+
+	leal	4(%esi),%esi		/* use leal not to clobber carry */
+	leal	4(%edi),%edi
+	decl	%ebp
+	jnz	LLoop2
+
+	jmp	LL1
+LLend2:	addl	%eax,%eax		/* restore carry from eax */
+LL1:	movl	%edx,(%edi)		/* store last limb */
+
+	sbbl	%eax,%eax		/* %eax = -carry */
+	negl	%eax			/* return the bit shifted out (0 or 1) */
+
+	popl	%ebp
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	ret
diff --git a/sysdeps/i386/i586/memcopy.h b/sysdeps/i386/i586/memcopy.h
index a9bb9e7a40..0a8768788e 100644
--- a/sysdeps/i386/i586/memcopy.h
+++ b/sysdeps/i386/i586/memcopy.h
@@ -1,5 +1,5 @@
 /* memcopy.h -- definitions for memory copy functions.  Pentium version.
-   Copyright (C) 1994 Free Software Foundation, Inc.
+   Copyright (C) 1994, 1995 Free Software Foundation, Inc.
    Contributed by Torbjorn Granlund (tege@sics.se).
 
 This file is part of the GNU C Library.
@@ -88,7 +88,7 @@ Cambridge, MA 02139, USA.  */
 		    "subl	$32,%2\n"				\
 		    "jns	1b\n"					\
 		    "2: addl	$32,%2" :				\
-		    "=r" (dst_bp), "=r" (src_bp), "=r" (nbytes_left) :	\
-		    "0" (dst_bp), "1" (src_bp), "2" (nbytes) :		\
+		    "=r" (dst_ep), "=r" (src_ep), "=r" (nbytes_left) :	\
+		    "0" (dst_ep), "1" (src_ep), "2" (nbytes) :		\
 		    "ax", "dx");					\
     } while (0)
diff --git a/sysdeps/i386/i586/mul_1.S b/sysdeps/i386/i586/mul_1.S
new file mode 100644
index 0000000000..2b7258e130
--- /dev/null
+++ b/sysdeps/i386/i586/mul_1.S
@@ -0,0 +1,78 @@
+/* Pentium __mpn_mul_1 -- Multiply a limb vector with a limb and store
+   the result in a second limb vector.
+
+Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Library General Public License as published by
+the Free Software Foundation; either version 2 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+License for more details.
+
+You should have received a copy of the GNU Library General Public License
+along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
+
+/*
+   INPUT PARAMETERS
+   res_ptr	(sp + 4)
+   s1_ptr	(sp + 8)
+   size		(sp + 12)
+   s2_limb	(sp + 16)
+*/
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define res_ptr edi
+#define s1_ptr esi
+#define size ecx
+#define s2_limb ebp
+
+	TEXT
+	ALIGN (3)
+	GLOBL	C_SYMBOL_NAME(__mpn_mul_1)
+C_SYMBOL_NAME(__mpn_mul_1:)
+
+	INSN1(push,l	,R(edi))
+	INSN1(push,l	,R(esi))
+	INSN1(push,l	,R(ebx))
+	INSN1(push,l	,R(ebp))
+
+	INSN2(mov,l	,R(res_ptr),MEM_DISP(esp,20))	/* res_ptr argument */
+	INSN2(mov,l	,R(s1_ptr),MEM_DISP(esp,24))	/* s1_ptr argument */
+	INSN2(mov,l	,R(size),MEM_DISP(esp,28))	/* size argument */
+	INSN2(mov,l	,R(s2_limb),MEM_DISP(esp,32))	/* s2_limb argument */
+
+	INSN2(lea,l	,R(res_ptr),MEM_INDEX(res_ptr,size,4))	/* point past the end */
+	INSN2(lea,l	,R(s1_ptr),MEM_INDEX(s1_ptr,size,4))
+	INSN1(neg,l	,R(size))	/* index runs from -size up to 0 */
+	INSN2(xor,l	,R(edx),R(edx))	/* initial carry limb = 0 */
+	ALIGN (3)
+Loop:
+	INSN2(mov,l	,R(ebx),R(edx))	/* ebx = carry limb from previous round */
+	INSN2(mov,l	,R(eax),MEM_INDEX(s1_ptr,size,4))
+
+	INSN1(mul,l	,R(s2_limb))	/* edx:eax = s1 limb * s2_limb */
+
+	INSN2(add,l	,R(eax),R(ebx))	/* add incoming carry limb */
+
+	INSN2(adc,l	,R(edx),$0)	/* edx = carry limb for the next round */
+	INSN2(mov,l	,MEM_INDEX(res_ptr,size,4),R(eax))
+
+	INSN1(inc,l	,R(size))
+	INSN1(jnz,	,Loop)
+
+
+	INSN2(mov,l	,R(eax),R(edx))	/* return the final carry limb */
+	INSN1(pop,l	,R(ebp))
+	INSN1(pop,l	,R(ebx))
+	INSN1(pop,l	,R(esi))
+	INSN1(pop,l	,R(edi))
+	ret
diff --git a/sysdeps/i386/i586/rshift.S b/sysdeps/i386/i586/rshift.S
new file mode 100644
index 0000000000..51cde8f07f
--- /dev/null
+++ b/sysdeps/i386/i586/rshift.S
@@ -0,0 +1,213 @@
+/* Pentium optimized __mpn_rshift -- 
+
+Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Library General Public License as published by
+the Free Software Foundation; either version 2 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+License for more details.
+
+You should have received a copy of the GNU Library General Public License
+along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
+
+/*
+  INPUT PARAMETERS
+  res_ptr	(sp + 4)
+  s_ptr		(sp + 8)
+  size		(sp + 12)
+  cnt		(sp + 16)
+*/
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+.text
+	ALIGN (3)
+	.globl C_SYMBOL_NAME(__mpn_rshift)
+C_SYMBOL_NAME(__mpn_rshift:)
+	pushl	%edi			/* save every register the body overwrites */
+	pushl	%esi
+	pushl	%ebx
+	pushl	%ebp
+
+	movl	20(%esp),%edi		/* res_ptr */
+	movl	24(%esp),%esi		/* s_ptr */
+	movl	28(%esp),%ebp		/* size */
+	movl	32(%esp),%ecx		/* cnt */
+
+	cmp	$1,%ecx			/* only cnt == 1 may use the rcr based code */
+	jne	Lnormal
+	movl	%edi,%eax
+	subl	%esi,%eax		/* res_ptr - s_ptr, wraps if res_ptr < s_ptr */
+	cmpl	%ebp,%eax
+	jnc	Lspecial		/* unsigned difference >= size => Lspecial */
+
+Lnormal:
+	movl	(%esi),%edx		/* least significant source limb */
+	addl	$4,%esi
+	xorl	%eax,%eax
+	shrdl	%cl,%edx,%eax		/* compute carry limb */
+	pushl	%eax			/* push carry limb onto stack */
+
+	decl	%ebp			/* size - 1: the last limb is done at Lend2 */
+	pushl	%ebp			/* keep it for the remainder count at Lend */
+	shrl	$3,%ebp			/* number of 8 limb rounds */
+	jz	Lend
+
+	movl	(%edi),%eax		/* fetch destination cache line */
+
+	ALIGN	(2)
+Loop:	movl	28(%edi),%eax		/* fetch destination cache line */
+	movl	%edx,%ebx		/* limb carried over from the previous round */
+
+	movl	(%esi),%eax
+	movl	4(%esi),%edx
+	shrdl	%cl,%eax,%ebx
+	shrdl	%cl,%edx,%eax
+	movl	%ebx,(%edi)
+	movl	%eax,4(%edi)
+
+	movl	8(%esi),%ebx
+	movl	12(%esi),%eax
+	shrdl	%cl,%ebx,%edx
+	shrdl	%cl,%eax,%ebx
+	movl	%edx,8(%edi)
+	movl	%ebx,12(%edi)
+
+	movl	16(%esi),%edx
+	movl	20(%esi),%ebx
+	shrdl	%cl,%edx,%eax
+	shrdl	%cl,%ebx,%edx
+	movl	%eax,16(%edi)
+	movl	%edx,20(%edi)
+
+	movl	24(%esi),%eax
+	movl	28(%esi),%edx
+	shrdl	%cl,%eax,%ebx
+	shrdl	%cl,%edx,%eax
+	movl	%ebx,24(%edi)
+	movl	%eax,28(%edi)
+
+	addl	$32,%esi
+	addl	$32,%edi
+	decl	%ebp
+	jnz	Loop
+
+Lend:	popl	%ebp			/* size - 1 again */
+	andl	$7,%ebp			/* limbs left over after the 8 limb rounds */
+	jz	Lend2
+Loop2:	movl	(%esi),%eax
+	shrdl	%cl,%eax,%edx		/* compute result limb */
+	movl	%edx,(%edi)
+	movl	%eax,%edx
+	addl	$4,%esi
+	addl	$4,%edi
+	decl	%ebp
+	jnz	Loop2
+
+Lend2:	shrl	%cl,%edx		/* compute most significant limb */
+	movl	%edx,(%edi)		/* store it */
+
+	popl	%eax			/* pop carry limb */
+
+	popl	%ebp
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	ret
+
+/* We loop from the most significant end of the arrays here, which is only
+   permissible if the source and destination don't overlap, since the
+   function is documented to work for overlapping source and destination.
+   NOTE(review): the entry test is unsigned, so res_ptr < s_ptr lands here too.  */
+
+Lspecial:
+	leal	-4(%edi,%ebp,4),%edi	/* address of the most significant result limb */
+	leal	-4(%esi,%ebp,4),%esi	/* address of the most significant source limb */
+
+	movl	(%esi),%edx
+	subl	$4,%esi
+
+	decl	%ebp			/* size - 1: the last limb is stored at LL1 */
+	pushl	%ebp
+	shrl	$3,%ebp			/* number of 8 limb rounds */
+
+	shrl	$1,%edx			/* shifted out bit goes to the carry flag */
+	incl	%ebp			/* inc + dec: test %ebp for zero ... */
+	decl	%ebp			/* ... without touching the carry flag */
+	jz	LLend
+
+	movl	(%edi),%eax		/* fetch destination cache line */
+
+	ALIGN	(2)
+LLoop:	movl	-28(%edi),%eax		/* fetch destination cache line */
+	movl	%edx,%ebx
+
+	movl	(%esi),%eax
+	movl	-4(%esi),%edx
+	rcrl	$1,%eax			/* carry flag links each limb to the next lower one */
+	movl	%ebx,(%edi)
+	rcrl	$1,%edx
+	movl	%eax,-4(%edi)
+
+	movl	-8(%esi),%ebx
+	movl	-12(%esi),%eax
+	rcrl	$1,%ebx
+	movl	%edx,-8(%edi)
+	rcrl	$1,%eax
+	movl	%ebx,-12(%edi)
+
+	movl	-16(%esi),%edx
+	movl	-20(%esi),%ebx
+	rcrl	$1,%edx
+	movl	%eax,-16(%edi)
+	rcrl	$1,%ebx
+	movl	%edx,-20(%edi)
+
+	movl	-24(%esi),%eax
+	movl	-28(%esi),%edx
+	rcrl	$1,%eax
+	movl	%ebx,-24(%edi)
+	rcrl	$1,%edx
+	movl	%eax,-28(%edi)
+
+	leal	-32(%esi),%esi		/* use leal not to clobber carry */
+	leal	-32(%edi),%edi
+	decl	%ebp
+	jnz	LLoop
+
+LLend:	popl	%ebp
+	sbbl	%eax,%eax		/* save carry in %eax */
+	andl	$7,%ebp
+	jz	LLend2
+	addl	%eax,%eax		/* restore carry from eax */
+LLoop2:	movl	%edx,%ebx
+	movl	(%esi),%edx
+	rcrl	$1,%edx
+	movl	%ebx,(%edi)
+
+	leal	-4(%esi),%esi		/* use leal not to clobber carry */
+	leal	-4(%edi),%edi
+	decl	%ebp
+	jnz	LLoop2
+
+	jmp	LL1
+LLend2:	addl	%eax,%eax		/* restore carry from eax */
+LL1:	movl	%edx,(%edi)		/* store last limb */
+
+	movl	$0,%eax			/* movl leaves the carry flag alone */
+	rcrl	$1,%eax			/* return the shifted out bit in bit 31 */
+
+	popl	%ebp
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	ret
diff --git a/sysdeps/i386/i586/strchr.S b/sysdeps/i386/i586/strchr.S
new file mode 100644
index 0000000000..982c80ec9a
--- /dev/null
+++ b/sysdeps/i386/i586/strchr.S
@@ -0,0 +1,334 @@
+/* strchr -- find character CH in a NUL terminated string.
+Highly optimized version for ix86, x>=5.
+Copyright (C) 1995 Free Software Foundation, Inc.
+This file is part of the GNU C Library.
+Contributed by Ulrich Drepper, <drepper@gnu.ai.mit.edu>.
+
+The GNU C Library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+The GNU C Library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with the GNU C Library; see the file COPYING.LIB.  If
+not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA.  */
+
+#include <sysdep.h>
+
+/* This version is especially optimized for the i586 (and following?)
+   processors.  This is mainly done by using the two pipelines.  The
+   version optimized for i486 is weak in this aspect because to get
+   as much parallelism we have to execute some *more* instructions.
+
+   The code below is structured to reflect the pairing of the instructions
+   as *I think* it is.  I have no processor data book to verify this.
+   If you find something you think is incorrect let me know.  */
+
+
+/* The magic value which is used throughout in the whole code.  */
+#define magic 0xfefefeff
+
+/*
+   INPUT PARAMETERS:
+   str		(sp + 4)
+   ch		(sp + 8)
+*/
+
+	.text
+ENTRY (strchr)
+	pushl %edi		/* Save callee-saved registers.  */
+	pushl %esi
+
+	pushl %ebx
+	pushl %ebp
+
+	movl 20(%esp), %eax	/* get string pointer */
+	movl 24(%esp), %edx	/* get character we are looking for */
+
+	movl %eax, %edi		/* duplicate string pointer for later */
+	xorl %ecx, %ecx		/* clear %ecx */
+
+	/* At the moment %edx contains C.  What we need for the
+	   algorithm is C in all bytes of the dword.  Avoid
+	   operations on 16 bit words because these require a
+	   prefix byte (and one more cycle).  */
+	movb %dl, %dh		/* now it is 0|0|c|c */
+	movb %dl, %cl		/* we construct the lower half in %ecx */
+
+	shll $16, %edx		/* now %edx is c|c|0|0 */
+	movb %cl, %ch		/* now %ecx is 0|0|c|c */
+
+	orl %ecx, %edx		/* and finally c|c|c|c */
+	andl $3, %edi		/* mask alignment bits */
+
+	jz L11			/* alignment is 0 => start loop */
+
+	movb (%eax), %cl	/* load single byte */
+	cmpb %cl, %dl		/* is byte == C? */
+
+	je L2			/* yes => return pointer */
+
+	cmp $0, %cl		/* is byte NUL? */
+	je L3			/* yes => return NULL */
+
+	incl %eax		/* increment pointer */
+	cmp $3, %edi		/* was alignment == 3? */
+
+	je L11			/* yes => start loop */
+
+	movb (%eax), %cl	/* load single byte */
+	cmpb %cl, %dl		/* is byte == C? */
+
+	je L2			/* yes => return pointer */
+
+	cmp $0, %cl		/* is byte NUL? */
+	je L3			/* yes => return NULL */
+
+	incl %eax		/* increment pointer */
+	cmp $2, %edi		/* was alignment == 2? */
+
+	je L11			/* yes => start loop */
+
+	movb (%eax), %cl	/* load single byte */
+	cmpb %cl, %dl		/* is byte == C? */
+
+	je L2			/* yes => return pointer */
+
+	cmp $0, %cl		/* is byte NUL? */
+	je L3			/* yes => return NULL */
+
+	incl %eax		/* increment pointer */
+
+	/* The following code is the preparation for the loop.  The
+	   four instructions up to `L1' will not be executed in the loop
+	   because the same code is found at the end of the loop, but
+	   there it is executed in parallel with other instructions.  */
+L11:	movl (%eax), %ecx
+	movl $magic, %ebp
+
+	movl $magic, %edi
+	addl %ecx, %ebp
+
+	/* The main loop: it looks complex and indeed it is.  I would
+	   love to say `it was hard to write, so it should be hard to
+	   read' but I will give some more hints.  To fully understand
+	   this code you should first take a look at the i486 version.
+	   The basic algorithm is the same, but here the code is organized
+	   in a way which permits to use both pipelines all the time.
+
+	   I tried to make it a bit more understandable by indenting
+	   the code according to stage in the algorithm.  It goes as
+	   follows:
+		check for 0 in 1st word
+			check for C in 1st word
+					check for 0 in 2nd word
+						check for C in 2nd word
+		check for 0 in 3rd word
+			check for C in 3rd word
+					check for 0 in 4th word
+						check for C in 4th word
+
+	   Please note that doing the test for NUL before the test for
+	   C allows us to overlap the test for 0 in the next word with
+	   the test for C.  */
+
+L1:	xorl %ecx, %ebp			/* (word+magic)^word */
+	addl %ecx, %edi			/* add magic word */
+
+	leal 4(%eax), %eax		/* increment pointer */
+	jnc L4				/* previous addl caused overflow? */
+
+		movl %ecx, %ebx		/* duplicate original word */
+	orl $magic, %ebp		/* ((word+magic)^word)|magic */
+
+	addl $1, %ebp			/* ((word+magic)^word)|magic == 0xf..f? */
+	jne L4				/* no => we found word with NUL */
+
+		movl $magic, %esi	/* load magic value */
+		xorl %edx, %ebx		/* clear words which are C */
+
+					movl (%eax), %ecx
+		addl %ebx, %esi		/* (word+magic) */
+
+					movl $magic, %edi
+		jnc L5			/* previous addl caused overflow? */
+
+					movl %edi, %ebp
+		xorl %ebx, %esi		/* (word+magic)^word */
+
+					addl %ecx, %ebp
+		orl $magic, %esi	/* ((word+magic)^word)|magic */
+
+		addl $1, %esi		/* ((word+magic)^word)|magic==0xf..f?*/
+		jne L5			/* no => we found word with C */
+
+					xorl %ecx, %ebp
+					addl %ecx, %edi
+
+					leal 4(%eax), %eax
+					jnc L4
+
+						movl %ecx, %ebx
+					orl $magic, %ebp
+
+					addl $1, %ebp
+					jne L4
+
+						movl $magic, %esi
+						xorl %edx, %ebx
+
+	movl (%eax), %ecx
+						addl %ebx, %esi
+
+	movl $magic, %edi
+						jnc L5
+
+	movl %edi, %ebp
+						xorl %ebx, %esi
+
+	addl %ecx, %ebp
+						orl $magic, %esi
+
+						addl $1, %esi
+						jne L5
+
+	xorl %ecx, %ebp
+	addl %ecx, %edi
+
+	leal 4(%eax), %eax
+	jnc L4
+
+		movl %ecx, %ebx
+	orl $magic, %ebp
+
+	addl $1, %ebp
+	jne L4
+
+		movl $magic, %esi
+		xorl %edx, %ebx
+
+					movl (%eax), %ecx
+		addl %ebx, %esi
+
+					movl $magic, %edi
+		jnc L5
+
+					movl %edi, %ebp
+		xorl %ebx, %esi
+
+					addl %ecx, %ebp
+		orl $magic, %esi
+
+		addl $1, %esi
+		jne L5
+
+					xorl %ecx, %ebp
+					addl %ecx, %edi
+
+					leal 4(%eax), %eax
+					jnc L4
+
+						movl %ecx, %ebx
+					orl $magic, %ebp
+
+					addl $1, %ebp
+					jne L4
+
+						movl $magic, %esi
+						xorl %edx, %ebx
+
+	movl (%eax), %ecx
+						addl %ebx, %esi
+
+	movl $magic, %edi
+						jnc L5
+
+	movl %edi, %ebp
+						xorl %ebx, %esi
+
+	addl %ecx, %ebp
+						orl $magic, %esi
+
+						addl $1, %esi
+
+						je L1
+
+	/* We know there is no NUL byte but a C byte in the word.
+	   %ebx contains NUL in this particular byte.  */
+L5:	subl $4, %eax		/* adjust pointer */
+	testb %bl, %bl		/* first byte == C? */
+
+	jz L2			/* yes => return pointer */
+
+	incl %eax		/* increment pointer */
+	testb %bh, %bh		/* second byte == C? */
+
+	jz L2			/* yes => return pointer */
+
+	shrl $16, %ebx		/* make upper bytes accessible */
+	incl %eax		/* increment pointer */
+
+	cmp $0, %bl		/* third byte == C */
+	je L2			/* yes => return pointer */
+
+	incl %eax		/* increment pointer */
+
+L2:	popl %ebp		/* restore saved registers */
+	popl %ebx
+
+	popl %esi
+	popl %edi
+
+	ret
+
+	/* We know there is a NUL byte in the word.  But we have to test
+	   whether there is a C byte before it in the word.  */
+L4:	subl $4, %eax		/* adjust pointer */
+	cmpb %dl, %cl		/* first byte == C? */
+
+	je L2			/* yes => return pointer */
+
+	cmpb $0, %cl		/* first byte == NUL? */
+	je L3			/* yes => return NULL */
+
+	incl %eax		/* increment pointer */
+
+	cmpb %dl, %ch		/* second byte == C? */
+	je L2			/* yes => return pointer */
+
+	cmpb $0, %ch		/* second byte == NUL? */
+	je L3			/* yes => return NULL */
+
+	shrl $16, %ecx		/* make upper bytes accessible */
+	incl %eax		/* increment pointer */
+
+	cmpb %dl, %cl		/* third byte == C? */
+	je L2			/* yes => return pointer */
+
+	cmpb $0, %cl		/* third byte == NUL? */
+	je L3			/* yes => return NULL */
+
+	incl %eax		/* increment pointer */
+
+	/* The test for the fourth byte is necessary!  */
+	cmpb %dl, %ch		/* fourth byte == C? */
+	je L2			/* yes => return pointer */
+
+L3:	xorl %eax, %eax		/* set return value = NULL */
+
+	popl %ebp		/* restore saved registers */
+	popl %ebx
+
+	popl %esi
+	popl %edi
+
+	ret
+
+#undef index
+weak_alias (strchr, index)
diff --git a/sysdeps/i386/i586/strlen.S b/sysdeps/i386/i586/strlen.S
new file mode 100644
index 0000000000..b807ed4b4f
--- /dev/null
+++ b/sysdeps/i386/i586/strlen.S
@@ -0,0 +1,185 @@
+/* strlen -- Compute length of NUL terminated string.
+Highly optimized version for ix86, x>=5.
+Copyright (C) 1995 Free Software Foundation, Inc.
+This file is part of the GNU C Library.
+Contributed by Ulrich Drepper, <drepper@gnu.ai.mit.edu>.
+
+The GNU C Library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+The GNU C Library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with the GNU C Library; see the file COPYING.LIB.  If
+not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA.  */
+
+#include <sysdep.h>
+
+/* This version is especially optimized for the i586 (and following?)
+   processors.  This is mainly done by using the two pipelines.  The
+   version optimized for i486 is weak in this aspect because to get
+   as much parallelism we have to execute some *more* instructions.
+
+   The code below is structured to reflect the pairing of the instructions
+   as *I think* it is.  I have no processor data book to verify this.
+   If you find something you think is incorrect let me know.  */
+
+
+/* The magic value which is used throughout in the whole code.  */
+#define magic 0xfefefeff
+
+/*
+   INPUT PARAMETERS:
+   str		(sp + 4)
+*/
+
+	.text
+ENTRY(strlen)
+	movl 4(%esp), %eax	/* get string pointer */
+
+	movl %eax, %ecx		/* duplicate it */
+	andl $3, %ecx		/* mask alignment bits */
+
+	jz L11			/* aligned => start loop */
+
+	cmpb %ch, (%eax)	/* is byte NUL?  (%ch is 0 as %ecx <= 3) */
+	je L2			/* yes => return */
+
+	incl %eax		/* increment pointer */
+	cmpl $3, %ecx		/* was alignment = 3? */
+
+	je L11			/* yes => now it is aligned and start loop */
+
+	cmpb %ch, (%eax)	/* is byte NUL? */
+	je L2			/* yes => return */
+
+	incl %eax		/* increment pointer */
+	cmpl $2, %ecx		/* was alignment = 2? */
+
+	je L11			/* yes => now it is aligned and start loop */
+
+	cmpb %ch, (%eax)	/* is byte NUL? */
+	je L2			/* yes => return */
+
+	incl %eax		/* increment pointer */
+
+      /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to
+	 change any of the hole bits of LONGWORD.
+
+	 1) Is this safe?  Will it catch all the zero bytes?
+	 Suppose there is a byte with all zeros.  Any carry bits
+	 propagating from its left will fall into the hole at its
+	 least significant bit and stop.  Since there will be no
+	 carry from its most significant bit, the LSB of the
+	 byte to the left will be unchanged, and the zero will be
+	 detected.
+
+	 2) Is this worthwhile?  Will it ignore everything except
+	 zero bytes?  Suppose every byte of LONGWORD has a bit set
+	 somewhere.  There will be a carry into bit 8.	If bit 8
+	 is set, this will carry into bit 16.  If bit 8 is clear,
+	 one of bits 9-15 must be set, so there will be a carry
+	 into bit 16.  Similarly, there will be a carry into bit
+	 24.  If one of bits 24-31 is set, there will be a carry
+	 into bit 32 (=carry flag), so all of the hole bits will
+	 be changed.  */
+L11:	xorl %edx, %edx		/* We need %edx == 0 for later */
+
+L1:
+	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+	addl $4, %eax		/* adjust pointer for *next* word */
+
+	subl %ecx, %edx		/* first step to negate word */
+	addl $magic, %ecx	/* add magic word */
+
+	decl %edx		/* complete negation of word (keeps CF) */
+	jnc L3			/* previous addl caused overflow? */
+
+	xorl %ecx, %edx		/* (word+magic)^~word */
+
+	andl $~magic, %edx	/* any hole bit left unchanged? */
+
+	jne L3			/* yes => determine byte */
+
+
+	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+	addl $4, %eax		/* adjust pointer for *next* word */
+
+	subl %ecx, %edx		/* first step to negate word */
+	addl $magic, %ecx	/* add magic word */
+
+	decl %edx		/* complete negation of word (keeps CF) */
+	jnc L3			/* previous addl caused overflow? */
+
+	xorl %ecx, %edx		/* (word+magic)^~word */
+
+	andl $~magic, %edx	/* any hole bit left unchanged? */
+
+	jne L3			/* yes => determine byte */
+
+
+	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+	addl $4, %eax		/* adjust pointer for *next* word */
+
+	subl %ecx, %edx		/* first step to negate word */
+	addl $magic, %ecx	/* add magic word */
+
+	decl %edx		/* complete negation of word (keeps CF) */
+	jnc L3			/* previous addl caused overflow? */
+
+	xorl %ecx, %edx		/* (word+magic)^~word */
+
+	andl $~magic, %edx	/* any hole bit left unchanged? */
+
+	jne L3			/* yes => determine byte */
+
+
+	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+	addl $4, %eax		/* adjust pointer for *next* word */
+
+	subl %ecx, %edx		/* first step to negate word */
+	addl $magic, %ecx	/* add magic word */
+
+	decl %edx		/* complete negation of word (keeps CF) */
+	jnc L3			/* previous addl caused overflow? */
+
+	xorl %ecx, %edx		/* (word+magic)^~word */
+
+	andl $~magic, %edx	/* any hole bit left unchanged? */
+
+	je L1			/* no => start loop again */
+
+
+	/* Every exit from the loop arrives here with %ecx still holding
+	   word+magic (the loop never undoes the addl), so the original
+	   word is restored once, before its bytes are examined.  */
+L3:	subl $4, %eax		/* correct too early pointer increment */
+	subl $magic, %ecx	/* undo the addl to restore word */
+
+	testb %cl, %cl		/* lowest byte NUL? */
+	jz L2			/* yes => return */
+
+	inc %eax		/* increment pointer */
+	testb %ch, %ch		/* second byte NUL? */
+
+	jz L2			/* yes => return */
+
+	shrl $16, %ecx		/* make upper bytes accessible */
+	incl %eax		/* increment pointer */
+
+	cmpb $0, %cl		/* is third byte NUL? */
+	jz L2			/* yes => return */
+
+	incl %eax		/* increment pointer */
+
+L2:	subl 4(%esp), %eax	/* now compute the length as difference
+				   between start and terminating NUL
+				   character */
+
+	ret
diff --git a/sysdeps/i386/i586/sub_n.S b/sysdeps/i386/i586/sub_n.S
new file mode 100644
index 0000000000..1382e665f6
--- /dev/null
+++ b/sysdeps/i386/i586/sub_n.S
@@ -0,0 +1,136 @@
+/* Pentium __mpn_sub_n -- Subtract two limb vectors of the same length > 0
+   and store difference in a third limb vector.
+
+Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Library General Public License as published by
+the Free Software Foundation; either version 2 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+License for more details.
+
+You should have received a copy of the GNU Library General Public License
+along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
+
+/*
+   INPUT PARAMETERS
+   res_ptr	(sp + 4)
+   s1_ptr	(sp + 8)
+   s2_ptr	(sp + 12)
+   size		(sp + 16)
+*/
+
+#define r1	%eax
+#define r2	%edx
+#define src1	%esi
+#define src2	%ebp
+#define dst	%edi
+#define x	%ebx
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+.text
+	ALIGN (3)
+	.globl C_SYMBOL_NAME(__mpn_sub_n)
+C_SYMBOL_NAME(__mpn_sub_n:)
+	pushl	%edi			/* save every register the body overwrites */
+	pushl	%esi
+	pushl	%ebx
+	pushl	%ebp
+
+	movl	20(%esp),dst		/* res_ptr */
+	movl	24(%esp),src1		/* s1_ptr */
+	movl	28(%esp),src2		/* s2_ptr */
+	movl	32(%esp),%ecx		/* size */
+
+	movl	(src2),x		/* x always holds the next s2 limb, loaded one step ahead */
+
+	decl	%ecx			/* size - 1: the last limb is handled at Lend2 */
+	movl	%ecx,r2
+	shrl	$3,%ecx			/* number of 8 limb rounds */
+	andl	$7,r2			/* limbs left over for Loop2 */
+	testl	%ecx,%ecx		/* zero carry flag */
+	jz	Lend
+	pushl	r2			/* r2 (%edx) is reused as a temporary inside Loop */
+
+	ALIGN (3)
+Loop:	movl	28(dst),%eax		/* fetch destination cache line */
+	leal	32(dst),dst		/* leal leaves the borrow in CF untouched */
+
+L1:	movl	(src1),r1
+	movl	4(src1),r2
+	sbbl	x,r1			/* r1 = s1 limb - s2 limb - borrow */
+	movl	4(src2),x
+	sbbl	x,r2
+	movl	8(src2),x
+	movl	r1,-32(dst)		/* dst was already advanced by 32 */
+	movl	r2,-28(dst)
+
+L2:	movl	8(src1),r1
+	movl	12(src1),r2
+	sbbl	x,r1
+	movl	12(src2),x
+	sbbl	x,r2
+	movl	16(src2),x
+	movl	r1,-24(dst)
+	movl	r2,-20(dst)
+
+L3:	movl	16(src1),r1
+	movl	20(src1),r2
+	sbbl	x,r1
+	movl	20(src2),x
+	sbbl	x,r2
+	movl	24(src2),x
+	movl	r1,-16(dst)
+	movl	r2,-12(dst)
+
+L4:	movl	24(src1),r1
+	movl	28(src1),r2
+	sbbl	x,r1
+	movl	28(src2),x
+	sbbl	x,r2
+	movl	32(src2),x		/* first s2 limb for whatever runs next */
+	movl	r1,-8(dst)
+	movl	r2,-4(dst)
+
+	leal	32(src1),src1		/* leal and decl both keep CF intact */
+	leal	32(src2),src2
+	decl	%ecx
+	jnz	Loop
+
+	popl	r2			/* remainder count saved before Loop */
+Lend:
+	decl	r2			/* test r2 w/o clobbering carry */
+	js	Lend2			/* r2 was 0 => only the last limb is left */
+	incl	r2
+Loop2:
+	leal	4(dst),dst
+	movl	(src1),r1
+	sbbl	x,r1
+	movl	4(src2),x
+	movl	r1,-4(dst)
+	leal	4(src1),src1
+	leal	4(src2),src2
+	decl	r2
+	jnz	Loop2
+Lend2:
+	movl	(src1),r1		/* last limb: nothing further to preload */
+	sbbl	x,r1
+	movl	r1,(dst)
+
+	sbbl	%eax,%eax		/* %eax = -1 if a borrow is pending, else 0 */
+	negl	%eax			/* return the borrow as 1 or 0 */
+
+	popl	%ebp
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	ret
diff --git a/sysdeps/i386/i586/submul_1.S b/sysdeps/i386/i586/submul_1.S
new file mode 100644
index 0000000000..14bfe54e24
--- /dev/null
+++ b/sysdeps/i386/i586/submul_1.S
@@ -0,0 +1,82 @@
+/* Pentium __mpn_submul_1 -- Multiply a limb vector with a limb and subtract
+   the result from a second limb vector.
+
+Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Library General Public License as published by
+the Free Software Foundation; either version 2 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+License for more details.
+
+You should have received a copy of the GNU Library General Public License
+along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
+
+/*
+   INPUT PARAMETERS
+   res_ptr	(sp + 4)
+   s1_ptr	(sp + 8)
+   size		(sp + 12)
+   s2_limb	(sp + 16)
+*/
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+#define res_ptr edi
+#define s1_ptr esi
+#define size ecx
+#define s2_limb ebp
+
+	TEXT
+	ALIGN (3)
+	GLOBL	C_SYMBOL_NAME(__mpn_submul_1)
+C_SYMBOL_NAME(__mpn_submul_1:)
+/* Subtract S1_PTR[0..SIZE-1] * S2_LIMB from the limbs at RES_PTR in place, return the last carry limb in eax.  */
+	INSN1(push,l	,R(edi))	/* save every register the body overwrites */
+	INSN1(push,l	,R(esi))
+	INSN1(push,l	,R(ebx))
+	INSN1(push,l	,R(ebp))
+
+	INSN2(mov,l	,R(res_ptr),MEM_DISP(esp,20))	/* four pushes moved the arguments up by 16 bytes */
+	INSN2(mov,l	,R(s1_ptr),MEM_DISP(esp,24))
+	INSN2(mov,l	,R(size),MEM_DISP(esp,28))
+	INSN2(mov,l	,R(s2_limb),MEM_DISP(esp,32))
+
+	INSN2(lea,l	,R(res_ptr),MEM_INDEX(res_ptr,size,4))	/* point just past the last limb ... */
+	INSN2(lea,l	,R(s1_ptr),MEM_INDEX(s1_ptr,size,4))
+	INSN1(neg,l	,R(size))	/* ... and index with -size..-1, counting up to 0 */
+	INSN2(xor,l	,R(edx),R(edx))	/* carry limb into the first iteration is 0 */
+	ALIGN (3)
+Loop:
+	INSN2(mov,l	,R(ebx),R(edx))	/* ebx = carry limb from the previous iteration */
+	INSN2(mov,l	,R(eax),MEM_INDEX(s1_ptr,size,4))	/* eax = next source limb */
+
+	INSN1(mul,l	,R(s2_limb))	/* edx:eax = eax * s2_limb */
+
+	INSN2(add,l	,R(eax),R(ebx))	/* add the incoming carry limb to the low half */
+	INSN2(mov,l	,R(ebx),MEM_INDEX(res_ptr,size,4))	/* ebx = current result limb; mov keeps CF */
+
+	INSN2(adc,l	,R(edx),$0)	/* fold the carry of the add into the high half */
+	INSN2(sub,l	,R(ebx),R(eax))	/* result limb - low half of the product */
+
+	INSN2(adc,l	,R(edx),$0)	/* a borrow from the sub also goes into the carry limb */
+	INSN2(mov,l	,MEM_INDEX(res_ptr,size,4),R(ebx))	/* store the updated result limb */
+
+	INSN1(inc,l	,R(size))	/* index reaches 0 after the last limb */
+	INSN1(jnz,	,Loop)
+
+
+	INSN2(mov,l	,R(eax),R(edx))	/* return value = final carry limb */
+	INSN1(pop,l	,R(ebp))
+	INSN1(pop,l	,R(ebx))
+	INSN1(pop,l	,R(esi))
+	INSN1(pop,l	,R(edi))
+	ret
diff --git a/sysdeps/i386/memchr.S b/sysdeps/i386/memchr.S
new file mode 100644
index 0000000000..9931f97241
--- /dev/null
+++ b/sysdeps/i386/memchr.S
@@ -0,0 +1,315 @@
+/* memchr (str, ch, n) -- Return pointer to first occurrence of CH in STR less
+   than N.
+For Intel 80x86, x>=3.
+Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
+Optimised a little by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+This file is part of the GNU C Library.
+
+This version is developed using the same algorithm as the fast C
+version which carries the following introduction:
+
+Based on strlen implementation by Torbjorn Granlund (tege@sics.se),
+with help from Dan Sahlin (dan@sics.se) and
+commentary by Jim Blandy (jimb@ai.mit.edu);
+adaptation to memchr suggested by Dick Karpinski (dick@cca.ucsf.edu),
+and implemented by Roland McGrath (roland@ai.mit.edu).
+
+The GNU C Library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+The GNU C Library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with the GNU C Library; see the file COPYING.LIB.  If
+not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+/*
+   INPUT PARAMETERS:
+   str		(sp + 4)
+   c		(sp + 8)
+   len		(sp + 12)
+*/
+
+	.text
+ENTRY (memchr)
+	/* Save callee-safe registers used in this function.  */
+	pushl %esi
+	pushl %edi
+
+	/* Load parameters into registers.  */
+	movl 12(%esp), %eax	/* str: pointer to memory block.  */
+	movl 16(%esp), %edx	/* c: byte we are looking for.  */
+	movl 20(%esp), %esi	/* len: length of memory block.  */
+
+	/* If my must not test more than three characters test
+	   them one by one.  This is especially true for 0.  */
+	cmpl $4, %esi
+	jb L3
+
+	/* At the moment %edx contains C.  What we need for the
+	   algorithm is C in all bytes of the dword.  Avoid
+	   operations on 16 bit words because these require an
+	   prefix byte (and one more cycle).  */
+	movb %dl, %dh		/* Now it is 0|0|c|c */
+	movl %edx, %ecx
+	shll $16, %edx		/* Now c|c|0|0 */
+	movw %cx, %dx		/* And finally c|c|c|c */
+
+	/* Better performance can be achieved if the word (32
+	   bit) memory access is aligned on a four-byte-boundary.
+	   So process first bytes one by one until boundary is
+	   reached. Don't use a loop for better performance.  */
+
+	testb $3, %eax		/* correctly aligned ? */
+	je L2			/* yes => begin loop */
+	cmpb %dl, (%eax)	/* compare byte */
+	je L9			/* target found => return */
+	incl %eax		/* increment source pointer */
+	decl %esi		/* decrement length counter */
+	je L4			/* len==0 => return NULL */
+
+	testb $3, %eax		/* correctly aligned ? */
+	je L2			/* yes => begin loop */
+	cmpb %dl, (%eax)	/* compare byte */
+	je L9			/* target found => return */
+	incl %eax		/* increment source pointer */
+	decl %esi		/* decrement length counter */
+	je L4			/* len==0 => return NULL */
+
+	testb $3, %eax		/* correctly aligned ? */
+	je L2			/* yes => begin loop */
+	cmpb %dl, (%eax)	/* compare byte */
+	je L9			/* target found => return */
+	incl %eax		/* increment source pointer */
+	decl %esi		/* decrement length counter */
+	/* no test for len==0 here, because this is done in the
+	   loop head */
+	jmp L2
+
+      /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to
+	 change any of the hole bits of LONGWORD.
+
+	 1) Is this safe?  Will it catch all the zero bytes?
+	 Suppose there is a byte with all zeros.  Any carry bits
+	 propagating from its left will fall into the hole at its
+	 least significant bit and stop.  Since there will be no
+	 carry from its most significant bit, the LSB of the
+	 byte to the left will be unchanged, and the zero will be
+	 detected.
+
+	 2) Is this worthwhile?  Will it ignore everything except
+	 zero bytes?  Suppose every byte of LONGWORD has a bit set
+	 somewhere.  There will be a carry into bit 8.	If bit 8
+	 is set, this will carry into bit 16.  If bit 8 is clear,
+	 one of bits 9-15 must be set, so there will be a carry
+	 into bit 16.  Similarly, there will be a carry into bit
+	 24.  If one of bits 24-31 is set, there will be a carry
+	 into bit 32 (=carry flag), so all of the hole bits will
+	 be changed.
+
+	 3) But wait!  Aren't we looking for C, not zero?
+	 Good point.  So what we do is XOR LONGWORD with a longword,
+	 each of whose bytes is C.  This turns each byte that is C
+	 into a zero.  */
+
+
+	/* Each round the main loop processes 16 bytes.  */
+
+	ALIGN (4)
+
+L1:	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+
+	/* According to the algorithm we had to reverse the effect of the
+	   XOR first and then test the overflow bits.  But because the
+	   following XOR would destroy the carry flag and it would (in a
+	   representation with more than 32 bits) not alter then last
+	   overflow, we can now test this condition.  If no carry is signaled
+	   no overflow must have occured in the last byte => it was 0.	*/
+	jnc L8
+
+	/* We are only interested in carry bits that change due to the
+	   previous add, so remove original bits */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+
+	/* Now test for the other three overflow bits.  */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+
+	/* If at least one byte of the word is C we don't get 0 in %edi.  */
+	jnz L8			/* found it => return pointer */
+
+	/* This process is unfolded four times for better performance.
+	   we don't increment the source pointer each time.  Instead we
+	   use offsets and increment by 16 in each run of the loop.  But
+	   before probing for the matching byte we need some extra code
+	   (following LL(13) below).  Even the len can be compared with
+	   constants instead of decrementing each time.  */
+
+	movl 4(%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L7			/* highest byte is C => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L7			/* found it => return pointer */
+
+	movl 8(%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L6			/* highest byte is C => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L6			/* found it => return pointer */
+
+	movl 12(%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L5			/* highest byte is C => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L5			/* found it => return pointer */
+
+	/* Adjust both counters for a full round, i.e. 16 bytes.  */
+	addl $16, %eax
+L2:	subl $16, %esi
+	jae L1			/* Still more than 16 bytes remaining */
+
+	/* Process remaining bytes separately.  */
+	cmpl $4-16, %esi	/* rest < 4 bytes? */
+	jb L3			/* yes, than test byte by byte */
+
+	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L8			/* highest byte is C => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jne L8			/* found it => return pointer */
+	addl $4, %eax		/* adjust source pointer */
+
+	cmpl $8-16, %esi	/* rest < 8 bytes? */
+	jb L3			/* yes, than test byte by byte */
+
+	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L8			/* highest byte is C => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jne L8			/* found it => return pointer */
+	addl $4, %eax		/* adjust source pointer */
+
+	cmpl $12-16, %esi	/* rest < 12 bytes? */
+	jb L3			/* yes, than test byte by byte */
+
+	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L8			/* highest byte is C => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jne L8			/* found it => return pointer */
+	addl $4, %eax		/* adjust source pointer */
+
+	/* Check the remaining bytes one by one.  */
+L3:	andl $3, %esi		/* mask out uninteresting bytes */
+	jz L4			/* no remaining bytes => return NULL */
+
+	cmpb %dl, (%eax)	/* compare byte with C */
+	je L9			/* equal, than return pointer */
+	incl %eax		/* increment source pointer */
+	decl %esi		/* decrement length */
+	jz L4			/* no remaining bytes => return NULL */
+
+	cmpb %dl, (%eax)	/* compare byte with C */
+	je L9			/* equal, than return pointer */
+	incl %eax		/* increment source pointer */
+	decl %esi		/* decrement length */
+	jz L4			/* no remaining bytes => return NULL */
+
+	cmpb %dl, (%eax)	/* compare byte with C */
+	je L9			/* equal, than return pointer */
+
+L4:	/* no byte found => return NULL */
+	xorl %eax, %eax
+	jmp L9
+
+	/* add missing source pointer increments */
+L5:	addl $4, %eax
+L6:	addl $4, %eax
+L7:	addl $4, %eax
+
+	/* Test for the matching byte in the word.  %ecx contains a NUL
+	   char in the byte which originally was the byte we are looking
+	   at.  */
+L8:	testb %cl, %cl		/* test first byte in dword */
+	jz L9			/* if zero => return pointer */
+	incl %eax		/* increment source pointer */
+
+	testb %ch, %ch		/* test second byte in dword */
+	jz L9			/* if zero => return pointer */
+	incl %eax		/* increment source pointer */
+
+	testl $0xff0000, %ecx	/* test third byte in dword */
+	jz L9			/* if zero => return pointer */
+	incl %eax		/* increment source pointer */
+
+	/* No further test needed since we know it is one of the four bytes.  */
+
+L9:	popl %edi		/* pop saved registers */
+	popl %esi
+
+	ret
diff --git a/sysdeps/i386/memchr.c b/sysdeps/i386/memchr.c
deleted file mode 100644
index ff0f8d9044..0000000000
--- a/sysdeps/i386/memchr.c
+++ /dev/null
@@ -1,48 +0,0 @@
-/* memchr (str, ch, n) -- Return pointer to first occurrence of CH in STR less
-   than N.
-   For Intel 80x86, x>=3.
-   Copyright (C) 1991, 1992, 1993 Free Software Foundation, Inc.
-   Contributed by Torbjorn Granlund (tege@sics.se).
-
-The GNU C Library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Library General Public License as
-published by the Free Software Foundation; either version 2 of the
-License, or (at your option) any later version.
-
-The GNU C Library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-Library General Public License for more details.
-
-You should have received a copy of the GNU Library General Public
-License along with the GNU C Library; see the file COPYING.LIB.  If
-not, write to the Free Software Foundation, Inc., 675 Mass Ave,
-Cambridge, MA 02139, USA.  */
-
-#include <ansidecl.h>
-#include <string.h>
-
-#ifdef	__GNUC__
-
-PTR
-DEFUN(memchr, (str, c, len),
-      CONST PTR str AND int c AND size_t len)
-{
-  PTR retval;
-  asm("cld\n"			/* Search forward.  */
-      "testl %1,%1\n"		/* Clear Z flag, to handle LEN == 0.  */
-      /* Some old versions of gas need `repne' instead of `repnz'.  */
-      "repnz\n"			/* Search for C in al.  */
-      "scasb\n"
-      "movl %2,%0\n"		/* Set %0 to 0 (without affecting Z flag).  */
-      "jnz done\n"		/* Jump if we found nothing equal to C.  */
-      "leal -1(%1),%0\n"	/* edi has been incremented.  Return edi-1.  */
-      "done:" :
-      "=a" (retval), "=D" (str), "=c" (len) :
-      "0" (c), "1" (str), "2" (len));
-  return retval;
-}
-
-#else
-#include <sysdeps/generic/memchr.c>
-#endif
diff --git a/sysdeps/i386/memcmp.S b/sysdeps/i386/memcmp.S
new file mode 100644
index 0000000000..f16b44a1b4
--- /dev/null
+++ b/sysdeps/i386/memcmp.S
@@ -0,0 +1,68 @@
+/* memcmp -- compare two memory blocks for differences in the first COUNT
+	     bytes.
+Copyright (C) 1995 Free Software Foundation, Inc.
+This file is part of the GNU C Library.
+
+The GNU C Library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+The GNU C Library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with the GNU C Library; see the file COPYING.LIB.  If
+not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+/*
+   INPUT PARAMETERS:
+   block1	(sp + 4)
+   block2	(sp + 8)
+   len		(sp + 12)
+*/
+
+	.text
+ENTRY (memcmp)
+	pushl %esi		/* Save %esi; arguments now start at 8(%esp).  */
+	movl %edi, %edx		/* %edx is otherwise unused here, so it can
+				   hold %edi.  Faster than a second push.  */
+
+	movl 8(%esp), %esi	/* Load address of block #1.  */
+	movl 12(%esp), %edi	/* Load address of block #2.  */
+	movl 16(%esp), %ecx	/* Load maximal length of compare area.  */
+
+	cld			/* Compare upwards (string ops increment).  */
+
+	xorl %eax, %eax		/* Default result 0; also sets ZF for len == 0.  */
+
+	repe			/* Compare at most %ecx bytes, stop at the  */
+	cmpsb			/* first pair of bytes that differ.  */
+	jz L1			/* All equal (or nothing compared) => return 0.  */
+
+	/* The memory blocks are not equal.  So the result of the last
+	   subtraction (block #1 byte minus block #2 byte) is present in
+	   the carry flag.  It is set when the byte in block #2 is bigger.
+	   In this case we have to return -1 (=0xffffffff), else 1.  */
+	sbbl %eax, %eax		/* This is tricky.  %eax - %eax - CF yields 0
+				   or 0xffffffff depending on the carry.  */
+
+	/* At this point %eax == 0, if the byte of block #1 was bigger, and
+	   0xffffffff if the last byte of block #2 was bigger.  The latter
+	   case is already correct but the former needs a little adjustment.
+	   Note that the following operation does not change 0xffffffff.  */
+	orb $1, %al		/* Change 0 to 1.  */
+
+L1:	popl %esi		/* Restore registers.  */
+	movl %edx, %edi
+
+	ret
+
+#undef bcmp
+weak_alias (memcmp, bcmp)
diff --git a/sysdeps/i386/stpcpy.S b/sysdeps/i386/stpcpy.S
new file mode 100644
index 0000000000..f38a9089bc
--- /dev/null
+++ b/sysdeps/i386/stpcpy.S
@@ -0,0 +1,87 @@
+/* stpcpy -- copy SRC to DEST returning the address of the terminating '\0'
+	     in DEST.
+For Intel 80x86, x>=3.
+Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+Contributed by Ulrich Drepper (drepper@gnu.ai.mit.edu).
+This file is part of the GNU C Library.
+
+The GNU C Library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+The GNU C Library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with the GNU C Library; see the file COPYING.LIB.  If
+not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA.  */
+
+/* This function is defined neither in ANSI nor POSIX standards but is
+   also not invented here.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+/*
+   INPUT PARAMETERS:
+   dest		(sp + 4)
+   src		(sp + 8)
+*/
+
+	.text
+ENTRY (__stpcpy)
+	movl 4(%esp), %eax	/* load destination pointer */
+	movl 8(%esp), %ecx	/* load source pointer */
+
+	subl %eax, %ecx		/* %ecx = src - dest, so (%eax,%ecx) is the
+				   source byte belonging to dest byte (%eax) */
+
+	/* Here we would like to write
+
+	subl $4, %eax
+	ALIGN (4)
+
+	but the assembler is too smart and optimizes for the shortest
+	form where the number only needs one byte.  But if we could
+	have the long form we would not need the alignment.  */
+
+	.byte 0x81, 0xe8	/* This is `subl $0x00000004, %eax' */
+	.long 0x00000004	/* (pre-bias: L1 adds 4 before the first copy) */
+
+	/* Four times unfolded loop with only one loop counter.  This
+	   is achieved by the use of index+base addressing mode.  As the
+	   loop counter we use the destination address because this is
+	   also the result.  */
+L1:	addl $4, %eax		/* advance to the next group of four bytes */
+
+	movb (%eax,%ecx), %dl	/* load current char */
+	movb %dl, (%eax)	/* and store it */
+	testb %dl, %dl		/* was it NUL? */
+	jz L2			/* yes, %eax already points at it => exit */
+
+	movb 1(%eax,%ecx), %dl	/* load current char */
+	movb %dl, 1(%eax)	/* and store it */
+	testb %dl, %dl		/* was it NUL? */
+	jz L3			/* yes, exit adding 1 to %eax */
+
+	movb 2(%eax,%ecx), %dl	/* load current char */
+	movb %dl, 2(%eax)	/* and store it */
+	testb %dl, %dl		/* was it NUL? */
+	jz L4			/* yes, exit adding 2 to %eax */
+
+	movb 3(%eax,%ecx), %dl	/* load current char */
+	movb %dl, 3(%eax)	/* and store it */
+	testb %dl, %dl		/* was it NUL? */
+	jnz L1			/* no, then continue loop */
+
+	incl %eax		/* NUL was at offset 3: fall through adds 3 */
+L4:	incl %eax		/* NUL was at offset 2: adds 2 */
+L3:	incl %eax		/* NUL was at offset 1: adds 1 */
+L2:				/* %eax = address of the NUL stored in dest */
+	ret
+
+weak_alias (__stpcpy, stpcpy)
diff --git a/sysdeps/i386/stpncpy.S b/sysdeps/i386/stpncpy.S
new file mode 100644
index 0000000000..59192e66c9
--- /dev/null
+++ b/sysdeps/i386/stpncpy.S
@@ -0,0 +1,143 @@
+/* stpncpy -- copy no more than N bytes from SRC to DEST, returning the
+	      address of the terminating '\0' in DEST.
+For Intel 80x86, x>=3.
+Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
+Some bug fixes by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+  - original wrote n+1 chars in some cases.
+  - stpncpy() ought to behave like strncpy() ie. not null-terminate
+    if limited by n.  glibc-1.09 stpncpy() does this.
+This file is part of the GNU C Library.
+
+The GNU C Library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+The GNU C Library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with the GNU C Library; see the file COPYING.LIB.  If
+not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+/*
+   INPUT PARAMETERS:
+   dest		(sp + 4)
+   src		(sp + 8)
+   maxlen	(sp + 12)
+*/
+
+	.text
+ENTRY (__stpncpy)
+
+	pushl %esi		/* save %esi; arguments now start at 8(%esp) */
+
+	movl 8(%esp), %eax	/* load destination pointer */
+	movl 12(%esp), %esi	/* load source pointer */
+	movl 16(%esp), %ecx	/* load maximal length */
+
+	subl %eax, %esi		/* %esi = src - dest, so (%eax,%esi) is the
+				   source byte belonging to dest byte (%eax) */
+	jmp L1			/* jump to loop "head" */
+
+	ALIGN(4)
+
+	/* Four times unfolded loop with two loop counters.  We get
+	   the third value (the source address) by using the index+base
+	   addressing mode.  */
+L2:	movb (%eax,%esi), %dl	/* load current char */
+	movb %dl, (%eax)	/* and store it */
+	testb %dl, %dl		/* was it NUL? */
+	jz L7			/* yes, then exit */
+
+	movb 1(%eax,%esi), %dl	/* load current char */
+	movb %dl, 1(%eax)	/* and store it */
+	testb %dl, %dl		/* was it NUL? */
+	jz L6			/* yes, then exit */
+
+	movb 2(%eax,%esi), %dl	/* load current char */
+	movb %dl, 2(%eax)	/* and store it */
+	testb %dl, %dl		/* was it NUL? */
+	jz L5			/* yes, then exit */
+
+	movb 3(%eax,%esi), %dl	/* load current char */
+	movb %dl, 3(%eax)	/* and store it */
+	testb %dl, %dl		/* was it NUL? */
+	jz L4			/* yes, then exit */
+
+	addl $4, %eax		/* increment loop counter for full round */
+
+L1:	subl $4, %ecx		/* at least 4 bytes still allowed? */
+	jae L2			/* yes, then go to start of loop */
+
+	/* The at most 3 remaining bytes are not processed in a loop.  */
+
+	addl $4, %ecx		/* correct above subtraction */
+	jz L9			/* maximal allowed char reached => go to end */
+
+	movb (%eax,%esi), %dl	/* load current char */
+	movb %dl, (%eax)	/* and store it */
+	testb %dl, %dl		/* was it NUL? */
+	jz L3			/* yes, then exit */
+
+	incl %eax		/* increment pointer */
+	decl %ecx		/* decrement length counter */
+	jz L9			/* no more allowed => exit */
+
+	movb (%eax,%esi), %dl	/* load current char */
+	movb %dl, (%eax)	/* and store it */
+	testb %dl, %dl		/* was it NUL? */
+	jz L3			/* yes, then exit */
+
+	incl %eax		/* increment pointer */
+	decl %ecx		/* decrement length counter */
+	jz L9			/* no more allowed => exit */
+
+	movb (%eax,%esi), %dl	/* load current char */
+	movb %dl, (%eax)	/* and store it */
+	testb %dl, %dl		/* was it NUL? */
+	jz L3			/* yes, then exit */
+
+	incl %eax		/* increment pointer */
+	jmp L9			/* we don't have to test for counter underflow
+				   because we know we had at most 3 bytes
+				   remaining => exit */
+
+	/* When coming from the main loop we have to adjust the pointer.  */
+L4:	decl %ecx		/* decrement counter */
+	incl %eax		/* increment pointer */
+
+L5:	decl %ecx		/* decrement counter */
+	incl %eax		/* increment pointer */
+
+L6:	decl %ecx		/* decrement counter */
+	incl %eax		/* increment pointer */
+L7:				/* here %eax points at the NUL just stored */
+
+	addl $3, %ecx		/* correct pre-decrementation of counter
+				   at the beginning of the loop; but why 3
+				   and not 4?  Very simple, we have to count
+				   the NUL char we already wrote.  */
+	jz L9			/* counter is also 0 => exit */
+
+	/* We now have to fill the rest of the buffer with NUL.  This
+	   is done in a tricky way.  Please note that the addressing mode
+	   used below is not the same we used above.  Here we use the
+	   %ecx register, filling from the end of the buffer backwards.  */
+L8:
+	movb $0, (%ecx,%eax)	/* store NUL char */
+L3:	decl %ecx		/* all bytes written? */
+	jnz L8			/* no, then again */
+
+L9:	popl %esi		/* restore saved register content */
+
+	ret
+
+weak_alias (__stpncpy, stpncpy)
diff --git a/sysdeps/i386/strchr.S b/sysdeps/i386/strchr.S
new file mode 100644
index 0000000000..de947cd252
--- /dev/null
+++ b/sysdeps/i386/strchr.S
@@ -0,0 +1,278 @@
+/* strchr (str, ch) -- Return pointer to first occurrence of CH in STR.
+For Intel 80x86, x>=3.
+Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
+Some optimisations by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+This file is part of the GNU C Library.
+
+The GNU C Library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+The GNU C Library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with the GNU C Library; see the file COPYING.LIB.  If
+not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+/*
+   INPUT PARAMETERS:
+   str		(sp + 4)
+   ch		(sp + 8)
+*/
+
+	.text
+ENTRY (strchr)
+	pushl %edi		/* Save %edi; arguments now start at 8(%esp).  */
+
+	movl 8(%esp), %eax	/* get string pointer */
+	movl 12(%esp), %edx	/* get character we are looking for */
+
+	/* At the moment %edx contains C.  What we need for the
+	   algorithm is C in all bytes of the dword.  Operations on
+	   16 bit words require a prefix byte (and one more cycle),
+	   so only the final merge below uses one.  */
+	movb %dl, %dh		/* low word is now c|c */
+	movl %edx, %ecx		/* keep a copy of c|c in %cx */
+	shll $16, %edx		/* now it is c|c|0|0 */
+	movw %cx, %dx		/* and finally c|c|c|c */
+
+	/* Before we start with the main loop we process single bytes
+	   until the source pointer is aligned.  This has two reasons:
+	   1. aligned 32-bit memory access is faster
+	   and (more important)
+	   2. we process in the main loop 32 bit in one step although
+	      we don't know the end of the string.  But accessing at
+	      4-byte alignment guarantees that we never access illegal
+	      memory if this would not also be done by the trivial
+	      implementation (this is because all processor inherent
+	      boundaries are multiples of 4).  */
+
+	testb $3, %al		/* correctly aligned ? (low two address bits) */
+	jz L11			/* yes => begin loop */
+	movb (%eax), %cl	/* load byte in question (we need it twice) */
+	cmpb %cl, %dl		/* compare byte */
+	je L6			/* target found => return */
+	testb %cl, %cl		/* is NUL? */
+	jz L2			/* yes => return NULL */
+	incl %eax		/* increment pointer */
+
+	testb $3, %al		/* correctly aligned ? (low two address bits) */
+	jz L11			/* yes => begin loop */
+	movb (%eax), %cl	/* load byte in question (we need it twice) */
+	cmpb %cl, %dl		/* compare byte */
+	je L6			/* target found => return */
+	testb %cl, %cl		/* is NUL? */
+	jz L2			/* yes => return NULL */
+	incl %eax		/* increment pointer */
+
+	testb $3, %al		/* correctly aligned ? (low two address bits) */
+	jz L11			/* yes => begin loop */
+	movb (%eax), %cl	/* load byte in question (we need it twice) */
+	cmpb %cl, %dl		/* compare byte */
+	je L6			/* target found => return */
+	testb %cl, %cl		/* is NUL? */
+	jz L2			/* yes => return NULL */
+	incl %eax		/* increment pointer */
+
+	/* Now we have reached alignment.  */
+	jmp L11			/* begin loop */
+
+      /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to
+	 change any of the hole bits of LONGWORD.
+
+	 1) Is this safe?  Will it catch all the zero bytes?
+	 Suppose there is a byte with all zeros.  Any carry bits
+	 propagating from its left will fall into the hole at its
+	 least significant bit and stop.  Since there will be no
+	 carry from its most significant bit, the LSB of the
+	 byte to the left will be unchanged, and the zero will be
+	 detected.
+
+	 2) Is this worthwhile?  Will it ignore everything except
+	 zero bytes?  Suppose every byte of LONGWORD has a bit set
+	 somewhere.  There will be a carry into bit 8.	If bit 8
+	 is set, this will carry into bit 16.  If bit 8 is clear,
+	 one of bits 9-15 must be set, so there will be a carry
+	 into bit 16.  Similarly, there will be a carry into bit
+	 24.  If one of bits 24-31 is set, there will be a carry
+	 into bit 32 (=carry flag), so all of the hole bits will
+	 be changed.
+
+	 3) But wait!  Aren't we looking for C, not zero?
+	 Good point.  So what we do is XOR LONGWORD with a longword,
+	 each of whose bytes is C.  This turns each byte that is C
+	 into a zero.  */
+
+	/* Each round the main loop processes 16 bytes.  */
+
+	ALIGN(4)
+
+L1:	addl $16, %eax		/* adjust pointer for whole round */
+
+L11:	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* C */
+
+	/* According to the algorithm we had to reverse the effect of the
+	   XOR first and then test the overflow bits.  But because the
+	   following XOR would destroy the carry flag and it would (in a
+	   representation with more than 32 bits) not alter the last
+	   overflow, we can now test this condition.  If no carry is signaled
+	   no overflow must have occurred in the last byte => it was 0.	*/
+	jnc L7
+
+	/* We are only interested in carry bits that change due to the
+	   previous add, so remove original bits */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+
+	/* Now test for the other three overflow bits.  */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+
+	/* If at least one byte of the word is C we don't get 0 in %edi.  */
+	jnz L7			/* found it => return pointer */
+
+	/* Now we made sure the dword does not contain the character we are
+	   looking for.  But because we deal with strings we have to check
+	   for the end of string before testing the next dword.  */
+
+	xorl %edx, %ecx		/* restore original dword without reload */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L2			/* highest byte is NUL => return NULL */
+	xorl %ecx, %edi		/* (word+magic)^word */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L2			/* found NUL => return NULL */
+
+	movl 4(%eax), %ecx	/* get word (= 4 bytes) in question */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* C */
+	jnc L71			/* highest byte is C => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L71			/* found it => return pointer */
+	xorl %edx, %ecx		/* restore original dword without reload */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L2			/* highest byte is NUL => return NULL */
+	xorl %ecx, %edi		/* (word+magic)^word */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L2			/* found NUL => return NULL */
+
+	movl 8(%eax), %ecx	/* get word (= 4 bytes) in question */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* C */
+	jnc L72			/* highest byte is C => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L72			/* found it => return pointer */
+	xorl %edx, %ecx		/* restore original dword without reload */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L2			/* highest byte is NUL => return NULL */
+	xorl %ecx, %edi		/* (word+magic)^word */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L2			/* found NUL => return NULL */
+
+	movl 12(%eax), %ecx	/* get word (= 4 bytes) in question */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* C */
+	jnc L73			/* highest byte is C => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L73			/* found it => return pointer */
+	xorl %edx, %ecx		/* restore original dword without reload */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L2			/* highest byte is NUL => return NULL */
+	xorl %ecx, %edi		/* (word+magic)^word */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jz L1			/* no NUL found => restart loop */
+
+L2:	/* Return NULL.  */
+	xorl %eax, %eax		/* load NULL in return value register */
+	popl %edi		/* restore saved register content */
+	ret
+
+L73:	addl $4, %eax		/* match in 4th dword: adds 12 in total */
+L72:	addl $4, %eax		/* match in 3rd dword: adds 8 in total */
+L71:	addl $4, %eax		/* match in 2nd dword: adds 4 */
+
+	/* We now scan for the byte in which the character was matched.
+	   %ecx still holds dword ^ c|c|c|c, so a byte equal to C reads
+	   as 0 and a NUL byte reads as C.  But we have to take care of
+	   the case that a NUL char is found before this in the dword.  */
+
+L7:	testb %cl, %cl		/* is first byte C? */
+	jz L6			/* yes => return pointer */
+	cmpb %dl, %cl		/* is first byte NUL? */
+	je L2			/* yes => return NULL */
+	incl %eax		/* it's not in the first byte */
+
+	testb %ch, %ch		/* is second byte C? */
+	jz L6			/* yes => return pointer */
+	cmpb %dl, %ch		/* is second byte NUL? */
+	je L2			/* yes => return NULL */
+	incl %eax		/* it's not in the second byte */
+
+	shrl $16, %ecx		/* make upper byte accessible */
+	testb %cl, %cl		/* is third byte C? */
+	jz L6			/* yes => return pointer */
+	cmpb %dl, %cl		/* is third byte NUL? */
+	je L2			/* yes => return NULL */
+
+	/* It must be in the fourth byte and it cannot be NUL.  */
+	incl %eax
+
+L6:	popl %edi		/* restore saved register content */
+
+	ret
+
+weak_alias (strchr, index)
diff --git a/sysdeps/i386/strcspn.S b/sysdeps/i386/strcspn.S
new file mode 100644
index 0000000000..b0e789b490
--- /dev/null
+++ b/sysdeps/i386/strcspn.S
@@ -0,0 +1,176 @@
+/* strcspn (str, ss) -- Return the length of the initial segment of STR
+			which contains no characters from SS.
+For Intel 80x86, x>=3.
+Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
+Bug fixes by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+This file is part of the GNU C Library.
+
+The GNU C Library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+The GNU C Library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with the GNU C Library; see the file COPYING.LIB.  If
+not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+/*
+   INPUT PARAMETERS:
+   str		(sp + 4)
+   stopset	(sp + 8)
+*/
+
+	.text
+ENTRY (strcspn)
+	movl 4(%esp), %edx	/* get string pointer */
+	movl 8(%esp), %eax	/* get stopset pointer */
+
+	/* First we create a table with flags for all possible characters.
+	   For the ASCII (7bit/8bit) or ISO-8859-X character sets which are
+	   supported by the C string functions we have 256 characters.
+	   Before inserting marks for the stop characters we clear the whole
+	   table.  The unrolled form is much faster than a loop.  */
+	xorl %ecx, %ecx		/* %ecx = 0 !!! */
+
+	pushl %ecx		/* make a 256 bytes long block filled with 0 */
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl $0		/* These immediate values make the label 2 */
+	pushl $0		/* to be aligned on a 16 byte boundary to */
+	pushl $0		/* get a better performance of the loop.  */
+	pushl $0
+	pushl $0
+	pushl $0
+
+/* For understanding the following code remember that %ecx == 0 now.
+   Although all the following instruction only modify %cl we always
+   have a correct zero-extended 32-bit value in %ecx.  */
+
+/* Don't change the "testb $0xff,%%cl" to "testb %%cl,%%cl".  We want
+   longer instructions so that the next loop aligns without adding nops.  */
+
+L2:	movb (%eax), %cl	/* get byte from stopset */
+	testb %cl, %cl		/* is NUL char? */
+	jz L1			/* yes => start compare loop */
+	movb %cl, (%esp,%ecx)	/* set corresponding byte in stopset table */
+
+	movb 1(%eax), %cl	/* get byte from stopset */
+	testb $0xff, %cl	/* is NUL char? */
+	jz L1			/* yes => start compare loop */
+	movb %cl, (%esp,%ecx)	/* set corresponding byte in stopset table */
+
+	movb 2(%eax), %cl	/* get byte from stopset */
+	testb $0xff, %cl	/* is NUL char? */
+	jz L1			/* yes => start compare loop */
+	movb %cl, (%esp,%ecx)	/* set corresponding byte in stopset table */
+
+	movb 3(%eax), %cl	/* get byte from stopset */
+	addl $4, %eax		/* increment stopset pointer */
+	movb %cl, (%esp,%ecx)	/* set corresponding byte in stopset table */
+	testb $0xff, %cl	/* is NUL char? */
+	jnz L2			/* no => process next dword from stopset */
+
+L1:	leal -4(%edx), %eax	/* prepare loop */
+
+	/* We use a neat trick for the following loop.  Normally we would
+	   have to test for two termination conditions
+	   1. a character in the stopset was found
+	   and
+	   2. the end of the string was found
+	   But as a sign that the chracter is in the stopset we store its
+	   value in the table.  But the value of NUL is NUL so the loop
+	   terminates for NUL in every case.  */
+
+L3:	addl $4, %eax		/* adjust pointer for full loop round */
+
+	movb (%eax), %cl	/* get byte from string */
+	cmpb %cl, (%esp,%ecx)	/* is it contained in stopset? */
+	je L4			/* yes => return */
+
+	movb 1(%eax), %cl	/* get byte from string */
+	cmpb %cl, (%esp,%ecx)	/* is it contained in stopset? */
+	je L5			/* yes => return */
+
+	movb 2(%eax), %cl	/* get byte from string */
+	cmpb %cl, (%esp,%ecx)	/* is it contained in stopset? */
+	je L6			/* yes => return */
+
+	movb 3(%eax), %cl	/* get byte from string */
+	cmpb %cl, (%esp,%ecx)	/* is it contained in stopset? */
+	jne L3			/* yes => return */
+
+	incl %eax		/* adjust pointer */
+L6:	incl %eax
+L5:	incl %eax
+
+L4:	subl %edx, %eax		/* we have to return the number of valid
+				   characters, so compute distance to first
+				   non-valid character */
+	addl $256, %esp		/* remove stopset */
+
+	ret
diff --git a/sysdeps/i386/strpbrk.S b/sysdeps/i386/strpbrk.S
new file mode 100644
index 0000000000..245bf1a935
--- /dev/null
+++ b/sysdeps/i386/strpbrk.S
@@ -0,0 +1,177 @@
+/* strpbrk (str, ss) -- Return a pointer to the first character in STR
+			which is contained in SS, or NULL if there is none.
+For Intel 80x86, x>=3.
+Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
+Bug fixes by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+This file is part of the GNU C Library.
+
+The GNU C Library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+The GNU C Library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with the GNU C Library; see the file COPYING.LIB.  If
+not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+/*
+   INPUT PARAMETERS:
+   str		(sp + 4)
+   stopset	(sp + 8)
+*/
+
+	.text
+ENTRY (strpbrk)
+	movl 4(%esp), %edx	/* get string pointer */
+	movl 8(%esp), %eax	/* get stopset pointer */
+
+	/* First we create a table with flags for all possible characters.
+	   For the ASCII (7bit/8bit) or ISO-8859-X character sets which are
+	   supported by the C string functions we have 256 characters.
+	   Before inserting marks for the stop characters we clear the whole
+	   table.  The unrolled form is much faster than a loop.  */
+	xorl %ecx, %ecx		/* %ecx = 0 !!! */
+
+	pushl %ecx		/* make a 256 bytes long block filled with 0 */
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl $0		/* These immediate values make the label 2 */
+	pushl $0		/* to be aligned on a 16 byte boundary to */
+	pushl $0		/* get a better performance of the loop.  */
+	pushl $0
+	pushl $0
+	pushl $0
+
+/* For understanding the following code remember that %ecx == 0 now.
+   Although all the following instruction only modify %cl we always
+   have a correct zero-extended 32-bit value in %ecx.  */
+
+/* Don't change the "testb $0xff,%%cl" to "testb %%cl,%%cl".  We want
+   longer instructions so that the next loop aligns without adding nops.  */
+
+L2:	movb (%eax), %cl	/* get byte from stopset */
+	testb %cl, %cl		/* is NUL char? */
+	jz L1			/* yes => start compare loop */
+	movb %cl, (%esp,%ecx)	/* set corresponding byte in stopset table */
+
+	movb 1(%eax), %cl	/* get byte from stopset */
+	testb $0xff, %cl	/* is NUL char? */
+	jz L1			/* yes => start compare loop */
+	movb %cl, (%esp,%ecx)	/* set corresponding byte in stopset table */
+
+	movb 2(%eax), %cl	/* get byte from stopset */
+	testb $0xff, %cl	/* is NUL char? */
+	jz L1			/* yes => start compare loop */
+	movb %cl, (%esp,%ecx)	/* set corresponding byte in stopset table */
+
+	movb 3(%eax), %cl	/* get byte from stopset */
+	addl $4, %eax		/* increment stopset pointer */
+	movb %cl, (%esp,%ecx)	/* set corresponding byte in stopset table */
+	testb $0xff, %cl	/* is NUL char? */
+	jnz L2			/* no => process next dword from stopset */
+
+L1:	leal -4(%edx), %eax	/* prepare loop */
+
+	/* We use a neat trick for the following loop.  Normally we would
+	   have to test for two termination conditions
+	   1. a character in the stopset was found
+	   and
+	   2. the end of the string was found
+	   But as a sign that the chracter is in the stopset we store its
+	   value in the table.  But the value of NUL is NUL so the loop
+	   terminates for NUL in every case.  */
+
+L3:	addl $4, %eax		/* adjust pointer for full loop round */
+
+	movb (%eax), %cl	/* get byte from string */
+	cmpb %cl, (%esp,%ecx)	/* is it contained in stopset? */
+	je L4			/* yes => return */
+
+	movb 1(%eax), %cl	/* get byte from string */
+	cmpb %cl, (%esp,%ecx)	/* is it contained in stopset? */
+	je L5			/* yes => return */
+
+	movb 2(%eax), %cl	/* get byte from string */
+	cmpb %cl, (%esp,%ecx)	/* is it contained in stopset? */
+	je L6			/* yes => return */
+
+	movb 3(%eax), %cl	/* get byte from string */
+	cmpb %cl, (%esp,%ecx)	/* is it contained in stopset? */
+	jne L3			/* yes => return */
+
+	incl %eax		/* adjust pointer */
+L6:	incl %eax
+L5:	incl %eax
+
+L4:	addl $256, %esp		/* remove stopset */
+
+	orb %cl, %cl		/* was last character NUL? */
+	jnz L7			/* no => return pointer */
+	xorl %eax, %eax		/* return NULL */
+
+L7:	ret
diff --git a/sysdeps/i386/strrchr.S b/sysdeps/i386/strrchr.S
new file mode 100644
index 0000000000..468a940d74
--- /dev/null
+++ b/sysdeps/i386/strrchr.S
@@ -0,0 +1,321 @@
+/* strrchr (str, ch) -- Return pointer to last occurrence of CH in STR.
+For Intel 80x86, x>=3.
+Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
+Some optimisations by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+This file is part of the GNU C Library.
+
+The GNU C Library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+The GNU C Library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with the GNU C Library; see the file COPYING.LIB.  If
+not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+/*
+   INPUT PARAMETERS:
+   str		(sp + 4)
+   ch		(sp + 8)
+*/
+
+	.text
+ENTRY (strrchr)		/* strrchr (str, ch): pointer to last CH in STR, or NULL */
+	pushl %edi		/* Save callee-saved registers used here.  */
+	pushl %esi
+
+	xorl %eax, %eax		/* result = NULL until a match is seen */
+	movl 12(%esp), %esi	/* get string pointer */
+	movl 16(%esp), %ecx	/* get character we are looking for */
+
+	/* At the moment %ecx contains C.  What we need for the
+	   algorithm is C in all bytes of the dword.  Avoid
+	   operations on 16 bit words because these require a
+	   prefix byte (and one more cycle).  */
+	movb %cl, %ch		/* now it is 0|0|c|c */
+	movl %ecx, %edx
+	shll $16, %ecx		/* now it is c|c|0|0 */
+	movw %dx, %cx		/* and finally c|c|c|c */
+
+	/* Before we start with the main loop we process single bytes
+	   until the source pointer is aligned.  This has two reasons:
+	   1. aligned 32-bit memory access is faster
+	   and (more important)
+	   2. we process in the main loop 32 bit in one step although
+	      we don't know the end of the string.  But accessing at
+	      4-byte alignment guarantees that we never access illegal
+	      memory if this would not also be done by the trivial
+	      implementation (this is because all processor inherent
+	      boundaries are multiples of 4).  */
+
+	testl $3, %esi		/* correctly aligned ? */
+	jz L19			/* yes => begin loop */
+	movb (%esi), %dl	/* load byte in question (we need it twice) */
+	cmpb %dl, %cl		/* compare byte */
+	jne L11			/* no match => keep previous result */
+	movl %esi, %eax		/* remember pointer as possible result */
+L11:	orb %dl, %dl		/* is NUL? */
+	jz L2			/* yes => return result found so far */
+	incl %esi		/* increment pointer */
+
+	testl $3, %esi		/* correctly aligned ? */
+	jz L19			/* yes => begin loop */
+	movb (%esi), %dl	/* load byte in question (we need it twice) */
+	cmpb %dl, %cl		/* compare byte */
+	jne L12			/* no match => keep previous result */
+	movl %esi, %eax		/* remember pointer as possible result */
+L12:	orb %dl, %dl		/* is NUL? */
+	jz L2			/* yes => return result found so far */
+	incl %esi		/* increment pointer */
+
+	testl $3, %esi		/* correctly aligned ? */
+	jz L19			/* yes => begin loop */
+	movb (%esi), %dl	/* load byte in question (we need it twice) */
+	cmpb %dl, %cl		/* compare byte */
+	jne L13			/* no match => keep previous result */
+	movl %esi, %eax		/* remember pointer as possible result */
+L13:	orb %dl, %dl		/* is NUL?  (test the loaded byte, not C) */
+	jz L2			/* yes => return result found so far */
+	incl %esi		/* increment pointer */
+
+	/* Now we have reached alignment.  */
+	jmp L19			/* begin loop */
+
+      /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to
+	 change any of the hole bits of LONGWORD.
+
+	 1) Is this safe?  Will it catch all the zero bytes?
+	 Suppose there is a byte with all zeros.  Any carry bits
+	 propagating from its left will fall into the hole at its
+	 least significant bit and stop.  Since there will be no
+	 carry from its most significant bit, the LSB of the
+	 byte to the left will be unchanged, and the zero will be
+	 detected.
+
+	 2) Is this worthwhile?  Will it ignore everything except
+	 zero bytes?  Suppose every byte of LONGWORD has a bit set
+	 somewhere.  There will be a carry into bit 8.	If bit 8
+	 is set, this will carry into bit 16.  If bit 8 is clear,
+	 one of bits 9-15 must be set, so there will be a carry
+	 into bit 16.  Similarly, there will be a carry into bit
+	 24.  If one of bits 24-31 is set, there will be a carry
+	 into bit 32 (=carry flag), so all of the hole bits will
+	 be changed.
+
+	 3) But wait!  Aren't we looking for C, not zero?
+	 Good point.  So what we do is XOR LONGWORD with a longword,
+	 each of whose bytes is C.  This turns each byte that is C
+	 into a zero.  */
+
+	/* Each round the main loop processes 16 bytes.  */
+
+	/* Jump to here when the character is detected.  We chose this
+	   way around because the character one is looking for is not
+	   as frequent as the rest and taking a conditional jump is more
+	   expensive than ignoring it.
+
+	   Some more words to the code below: it might not be obvious why
+	   we decrement the source pointer here.  In the loop the pointer
+	   is not pre-incremented and so it still points before the word
+	   we are looking at.  But you should take a look at the instruction
+	   which gets executed before we get into the loop: `addl $16, %esi'.
+	   This makes the following subs into adds.  */
+
+	/* These fill bytes make the main loop be correctly aligned.
+	   We cannot use align because it is not the following instruction
+	   which should be aligned.  */
+	.byte 0, 0, 0, 0, 0, 0, 0, 0
+
+L4:	subl $4, %esi		/* adjust pointer */
+L41:	subl $4, %esi
+L42:	subl $4, %esi
+L43:	testl $0xff000000, %edx	/* is highest byte == C? */
+	jnz L33			/* no => try other bytes */
+	leal 15(%esi), %eax	/* store address as result */
+	jmp L1			/* and start loop again */
+
+L3:	subl $4, %esi		/* adjust pointer */
+L31:	subl $4, %esi
+L32:	subl $4, %esi
+L33:	testl $0xff0000, %edx	/* is C in third byte? */
+	jnz L51			/* no => try other bytes */
+	leal 14(%esi), %eax	/* store address as result */
+	jmp L1			/* and start loop again */
+
+L51:
+	/* At this point we know that the byte is in one of the lower bytes.
+	   We make a guess and correct it if necessary.  This reduces the
+	   number of necessary jumps.  */
+	leal 12(%esi), %eax	/* guess address of lowest byte as result */
+	testb %dh, %dh		/* second byte != C, i.e. guess correct? */
+	jnz L1			/* yes => start loop */
+	leal 13(%esi), %eax	/* correct guess to second byte */
+
+L1:	addl $16, %esi		/* increment pointer for full round */
+
+L19:	movl (%esi), %edx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %edx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+
+	/* The carry out of the highest byte is only visible in the carry
+	   flag and the following XOR would destroy that flag.  So we have
+	   to test this condition right now, before looking at the other
+	   three carry bits.  If no carry is signaled no overflow occurred
+	   in the last byte => it was 0, i.e. the word contains the
+	   terminating NUL.  */
+
+	jnc L20			/* found NUL => check last word */
+
+	/* We are only interested in carry bits that change due to the
+	   previous add, so remove original bits */
+	xorl %edx, %edi		/* (word+magic)^word */
+
+	/* Now test for the other three overflow bits.  */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+
+	/* If at least one byte of the word is NUL we don't get 0 in %edi.  */
+	jnz L20			/* found NUL => check last word */
+
+	/* Now we made sure the dword does not contain a NUL byte, so all
+	   four bytes belong to the string.  Next we test whether the
+	   dword contains the character we are looking for.  */
+
+	xorl %ecx, %edx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %edx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L4			/* highest byte is C => examine dword */
+	xorl %edx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L3			/* C is detected in the word => examine it */
+
+	movl 4(%esi), %edx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %edx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L21			/* found NUL => check last word */
+	xorl %edx, %edi		/* (word+magic)^word */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L21			/* found NUL => check last word */
+	xorl %ecx, %edx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %edx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L41			/* highest byte is C => examine dword */
+	xorl %edx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L31			/* C is detected in the word => examine it */
+
+	movl 8(%esi), %edx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %edx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L22			/* found NUL => check last word */
+	xorl %edx, %edi		/* (word+magic)^word */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L22			/* found NUL => check last word */
+	xorl %ecx, %edx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %edx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L42			/* highest byte is C => examine dword */
+	xorl %edx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L32			/* C is detected in the word => examine it */
+
+	movl 12(%esi), %edx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %edx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L23			/* found NUL => check last word */
+	xorl %edx, %edi		/* (word+magic)^word */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz L23			/* found NUL => check last word */
+	xorl %ecx, %edx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	movl $0xfefefeff, %edi	/* magic value */
+	addl %edx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L43			/* highest byte is C => examine dword */
+	xorl %edx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jz L1			/* C is not detected => restart loop */
+	jmp L33			/* examine word */
+
+L23:	addl $4, %esi		/* adjust pointer */
+L22:	addl $4, %esi
+L21:	addl $4, %esi
+
+	/* What remains to do is to test which byte the NUL char is and
+	   whether the searched character appears in one of the bytes
+	   before.  A special case is that the searched byte may be NUL.
+	   In this case a pointer to the terminating NUL char has to be
+	   returned.  */
+
+L20:	cmpb %cl, %dl		/* is first byte == C? */
+	jne L24			/* no => skip */
+	movl %esi, %eax		/* store address as result */
+L24:	testb %dl, %dl		/* is first byte == NUL? */
+	jz L2			/* yes => return */
+
+	cmpb %cl, %dh		/* is second byte == C? */
+	jne L25			/* no => skip */
+	leal 1(%esi), %eax	/* store address as result */
+L25:	testb %dh, %dh		/* is second byte == NUL? */
+	jz L2			/* yes => return */
+
+	shrl $16,%edx		/* make upper bytes accessible */
+	cmpb %cl, %dl		/* is third byte == C */
+	jne L26			/* no => skip */
+	leal 2(%esi), %eax	/* store address as result */
+L26:	testb %dl, %dl		/* is third byte == NUL */
+	jz L2			/* yes => return */
+
+	cmpb %cl, %dh		/* is fourth byte == C */
+	jne L2			/* no => skip */
+	leal 3(%esi), %eax	/* store address as result */
+
+L2:	popl %esi		/* restore saved register content */
+	popl %edi
+
+	ret
+
+weak_alias (strrchr, rindex)
diff --git a/sysdeps/i386/strspn.S b/sysdeps/i386/strspn.S
new file mode 100644
index 0000000000..1a02026285
--- /dev/null
+++ b/sysdeps/i386/strspn.S
@@ -0,0 +1,176 @@
+/* strspn (str, ss) -- Return the length of the initial segment of STR
+			which contains only characters from SS.
+For Intel 80x86, x>=3.
+Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
+Bug fixes by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+This file is part of the GNU C Library.
+
+The GNU C Library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+The GNU C Library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with the GNU C Library; see the file COPYING.LIB.  If
+not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+/*
+   INPUT PARAMETERS:
+   str		(sp + 4)
+   skipset	(sp + 8)
+*/
+
+	.text
+ENTRY (strspn)
+	movl 4(%esp), %edx	/* get string pointer */
+	movl 8(%esp), %eax	/* get skipset pointer */
+
+	/* First we create a table with flags for all possible characters.
+	   For the ASCII (7bit/8bit) or ISO-8859-X character sets which are
+	   supported by the C string functions we have 256 characters.
+	   Before inserting marks for the stop characters we clear the whole
+	   table.  The unrolled form is much faster than a loop.  */
+	xorl %ecx, %ecx		/* %ecx = 0 !!! */
+
+	pushl %ecx		/* make a 256 bytes long block filled with 0 */
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl %ecx
+	pushl $0		/* These immediate values make the label 2 */
+	pushl $0		/* to be aligned on a 16 byte boundary to */
+	pushl $0		/* get a better performance of the loop.  */
+	pushl $0
+	pushl $0
+	pushl $0
+
+/* For understanding the following code remember that %ecx == 0 now.
+   Although all the following instruction only modify %cl we always
+   have a correct zero-extended 32-bit value in %ecx.  */
+
+/* Don't change the "testb $0xff,%%cl" to "testb %%cl,%%cl".  We want
+   longer instructions so that the next loop aligns without adding nops.  */
+
+L2:	movb (%eax), %cl	/* get byte from stopset */
+	testb %cl, %cl		/* is NUL char? */
+	jz L1			/* yes => start compare loop */
+	movb %cl, (%esp,%ecx)	/* set corresponding byte in stopset table */
+
+	movb 1(%eax), %cl	/* get byte from stopset */
+	testb $0xff, %cl	/* is NUL char? */
+	jz L1			/* yes => start compare loop */
+	movb %cl, (%esp,%ecx)	/* set corresponding byte in stopset table */
+
+	movb 2(%eax), %cl	/* get byte from stopset */
+	testb $0xff, %cl	/* is NUL char? */
+	jz L1			/* yes => start compare loop */
+	movb %cl, (%esp,%ecx)	/* set corresponding byte in stopset table */
+
+	movb 3(%eax), %cl	/* get byte from stopset */
+	addl $4, %eax		/* increment stopset pointer */
+	movb %cl, (%esp,%ecx)	/* set corresponding byte in stopset table */
+	testb $0xff, %cl	/* is NUL char? */
+	jnz L2			/* no => process next dword from stopset */
+
+L1:	leal -4(%edx), %eax	/* prepare loop */
+
+	/* We use a neat trick for the following loop.  Normally we would
+	   have to test for two termination conditions
+	   1. a character in the stopset was found
+	   and
+	   2. the end of the string was found
+	   But as a sign that the chracter is in the stopset we store its
+	   value in the table.  But the value of NUL is NUL so the loop
+	   terminates for NUL in every case.  */
+
+L3:	addl $4, %eax		/* adjust pointer for full loop round */
+
+	movb (%eax), %cl	/* get byte from string */
+	testb %cl, (%esp,%ecx)	/* is it contained in skipset? */
+	jz L4			/* no => return */
+
+	movb 1(%eax), %cl	/* get byte from string */
+	testb %cl, (%esp,%ecx)	/* is it contained in skipset? */
+	jz L5			/* no => return */
+
+	movb 2(%eax), %cl	/* get byte from string */
+	testb %cl, (%esp,%ecx)	/* is it contained in skipset? */
+	jz L6			/* no => return */
+
+	movb 3(%eax), %cl	/* get byte from string */
+	testb %cl, (%esp,%ecx)	/* is it contained in skipset? */
+	jnz L3			/* yes => start loop again */
+
+	incl %eax		/* adjust pointer */
+L6:	incl %eax
+L5:	incl %eax
+
+L4:	subl %edx, %eax		/* we have to return the number of valid
+				   characters, so compute distance to first
+				   non-valid character */
+	addl $256, %esp		/* remove stopset */
+
+	ret
diff --git a/sysdeps/i386/sub_n.S b/sysdeps/i386/sub_n.S
index 64d2c25293..e18a70885b 100644
--- a/sysdeps/i386/sub_n.S
+++ b/sysdeps/i386/sub_n.S
@@ -1,7 +1,7 @@
 /* i80386 __mpn_sub_n -- Add two limb vectors of the same length > 0 and store
    sum in a third limb vector.
 
-Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
 
 This file is part of the GNU MP Library.
 
@@ -37,10 +37,10 @@ C_SYMBOL_NAME(__mpn_sub_n:)
 	pushl %edi
 	pushl %esi
 
-	movl 12(%esp),%edi	/* res_ptr */
-	movl 16(%esp),%esi	/* s1_ptr */
-	movl 20(%esp),%edx	/* s2_ptr */
-	movl 24(%esp),%ecx	/* size */
+	movl 12(%esp),%edi		/* res_ptr */
+	movl 16(%esp),%esi		/* s1_ptr */
+	movl 20(%esp),%edx		/* s2_ptr */
+	movl 24(%esp),%ecx		/* size */
 
 	movl	%ecx,%eax
 	shrl	$3,%ecx			/* compute count for unrolled loop */
@@ -54,14 +54,18 @@ C_SYMBOL_NAME(__mpn_sub_n:)
 	subl	%eax,%edx		/* ... enter the loop */
 	shrl	$2,%eax			/* restore previous value */
 #ifdef PIC
-	call	here
-here:	leal	(Loop - 3 - here)(%eax,%eax,8),%eax
-	addl	%eax,(%esp)
-	ret
+/* Calculate start address in loop for PIC.  Due to limitations in some
+   assemblers, Loop-L0-3 cannot be put into the leal */
+	call	L0
+L0:	leal	(%eax,%eax,8),%eax
+	addl	(%esp),%eax
+	addl	$(Loop-L0-3),%eax 
+	addl	$4,%esp
 #else
-	leal	(Loop - 3)(%eax,%eax,8),%eax	/* calc start addr in loop */
-	jmp	*%eax			/* jump into loop */
+/* Calculate start address in loop for non-PIC.  */
+ 	leal	(Loop - 3)(%eax,%eax,8),%eax
 #endif
+	jmp	*%eax			/* jump into loop */
 	ALIGN (3)
 Loop:	movl	(%esi),%eax
 	sbbl	(%edx),%eax
diff --git a/sysdeps/i960/add_n.s b/sysdeps/i960/add_n.s
new file mode 100644
index 0000000000..6031f6d4c3
--- /dev/null
+++ b/sysdeps/i960/add_n.s
@@ -0,0 +1,21 @@
+.text
+	.align 4
+	.globl ___mpn_add_n
+___mpn_add_n:			# g0 = res_ptr, g1 = s1_ptr, g2 = s2_ptr, g3 = limb count (per the uses below)
+	mov	0,g6		# clear carry-save register
+	cmpo	1,0		# clear cy
+
+Loop:	subo	1,g3,g3		# update loop counter
+	ld	(g1),g5		# load from s1_ptr
+	addo	4,g1,g1		# s1_ptr++
+	ld	(g2),g4		# load from s2_ptr
+	addo	4,g2,g2		# s2_ptr++
+	cmpo	g6,1		# restore cy from g6, relies on cy being 0
+	addc	g4,g5,g4	# main add
+	subc	0,0,g6		# save cy in g6
+	st	g4,(g0)		# store result to res_ptr
+	addo	4,g0,g0		# res_ptr++
+	cmpobne	0,g3,Loop	# when branch is taken, clears C bit
+
+	mov	g6,g0		# hand the saved carry back in g0 (presumably the return register)
+	ret
diff --git a/sysdeps/i960/addmul_1.s b/sysdeps/i960/addmul_1.s
new file mode 100644
index 0000000000..1a3de95e50
--- /dev/null
+++ b/sysdeps/i960/addmul_1.s
@@ -0,0 +1,26 @@
+.text				# ___mpn_addmul_1: res[i] += s1[i] * limb, returns carry limb
+	.align	4
+	.globl	___mpn_addmul_1
+___mpn_addmul_1:		# as used below: g0 = res, g1 = s1, g2 = count, g3 = multiplier
+	subo	g2,0,g2		# g2 = 0 - g2: negated count, used as the running index
+	shlo	2,g2,g4		# g4 = g2 * 4 (negated byte length)
+	subo	g4,g1,g1	# g1 -= g4: source base moved past the last limb
+	subo	g4,g0,g13	# g13 = g0 - g4: destination base moved the same way
+	mov	0,g0		# g0 = carry limb, initially 0
+
+	cmpo	1,0		# clear C bit on AC.cc
+
+Loop:	ld	(g1)[g2*4],g5	# fetch next source limb
+	emul	g3,g5,g6	# g6 (low) / g7 (high) = g3 * g5
+	ld	(g13)[g2*4],g5	# fetch the destination limb to accumulate into
+
+	addc	g0,g6,g6	# relies on that C bit is clear
+	addc	0,g7,g7		# fold the carry of that add into the high word
+	addc	g5,g6,g6	# relies on that C bit is clear
+	st	g6,(g13)[g2*4]	# write back the accumulated low word
+	addc	0,g7,g0		# new carry limb = high word + carry
+
+	addo	g2,1,g2		# step index towards zero
+	cmpobne	0,g2,Loop	# when branch is taken, clears C bit
+
+	ret			# g0 holds the final carry limb
diff --git a/sysdeps/i960/mul_1.s b/sysdeps/i960/mul_1.s
new file mode 100644
index 0000000000..e75ea42d39
--- /dev/null
+++ b/sysdeps/i960/mul_1.s
@@ -0,0 +1,23 @@
+.text				# ___mpn_mul_1: res[i] = s1[i] * limb, returns carry limb
+	.align	4
+	.globl	___mpn_mul_1
+___mpn_mul_1:			# as used below: g0 = res, g1 = s1, g2 = count, g3 = multiplier
+	subo	g2,0,g2		# g2 = 0 - g2: negated count, used as the running index
+	shlo	2,g2,g4		# g4 = g2 * 4 (negated byte length)
+	subo	g4,g1,g1	# g1 -= g4: source base moved past the last limb
+	subo	g4,g0,g13	# g13 = g0 - g4: destination base moved the same way
+	mov	0,g0		# g0 = carry limb, initially 0
+
+	cmpo	1,0		# clear C bit on AC.cc
+
+Loop:	ld	(g1)[g2*4],g5	# fetch next source limb
+	emul	g3,g5,g6	# g6 (low) / g7 (high) = g3 * g5
+
+	addc	g0,g6,g6	# relies on that C bit is clear
+	st	g6,(g13)[g2*4]	# store low word plus previous carry limb
+	addc	0,g7,g0		# new carry limb = high word + carry
+
+	addo	g2,1,g2		# step index towards zero
+	cmpobne	0,g2,Loop	# when branch is taken, clears C bit
+
+	ret			# g0 holds the final carry limb
diff --git a/sysdeps/i960/sub_n.s b/sysdeps/i960/sub_n.s
new file mode 100644
index 0000000000..13ebbfa9f2
--- /dev/null
+++ b/sysdeps/i960/sub_n.s
@@ -0,0 +1,21 @@
+.text				# ___mpn_sub_n: subtract two limb vectors, one limb per pass
+	.align 4
+	.globl ___mpn_sub_n
+___mpn_sub_n:			# g0 = res_ptr, g1 = s1_ptr, g2 = s2_ptr, g3 = loop counter
+	mov	1,g6		# set carry-save register
+	cmpo	1,0		# clear cy
+	# cy is parked in g6 because the loop-closing compare overwrites the condition code.
+Loop:	subo	1,g3,g3		# update loop counter
+	ld	(g1),g5		# load from s1_ptr
+	addo	4,g1,g1		# s1_ptr++
+	ld	(g2),g4		# load from s2_ptr
+	addo	4,g2,g2		# s2_ptr++
+	cmpo	g6,1		# restore cy from g6, relies on cy being 0
+	subc	g4,g5,g4	# main subtract
+	subc	0,0,g6		# save cy in g6
+	st	g4,(g0)		# store result to res_ptr
+	addo	4,g0,g0		# res_ptr++
+	cmpobne	0,g3,Loop	# when branch is taken, cy will be 0
+	# NOTE(review): confirm "subc 0,0,g6" leaves the 0/1 value that "cmpo g6,1" tests for.
+	mov	g6,g0		# hand the carry-save register back in g0
+	ret
diff --git a/sysdeps/m88k/m88100/add_n.s b/sysdeps/m88k/m88100/add_n.s
new file mode 100644
index 0000000000..7e4ccccb90
--- /dev/null
+++ b/sysdeps/m88k/m88100/add_n.s
@@ -0,0 +1,103 @@
+; mc88100 __mpn_add_n -- Add two limb vectors of the same length > 0 and store
+; sum in a third limb vector.
+
+; Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr	r2
+; s1_ptr	r3
+; s2_ptr	r4
+; size		r5
+
+; This code has been optimized to run one instruction per clock, avoiding
+; load stalls and writeback contention.  As a result, the instruction
+; order is not always natural.
+
+; The speed is about 4.6 clocks/limb + 18 clocks/limb-vector on an 88100,
+; but on the 88110, it seems to run much slower, 6.6 clocks/limb.
+
+	text
+	align	 16
+	global	 ___mpn_add_n
+___mpn_add_n:
+	ld	r6,r3,0			; read first limb from s1_ptr
+	extu	r10,r5,3		; r10 = size / 8, counter for the 8-limb unrolled loop
+	ld	r7,r4,0			; read first limb from s2_ptr
+
+	subu.co	r5,r0,r5		; (clear carry as side effect)
+	mak	r5,r5,3<4>		; r5 = ((-size) & 7) << 4: code offset past Lbase
+	bcnd	eq0,r5,Lzero		; size is a multiple of 8: enter at Lzero
+
+	or	r12,r0,lo16(Lbase)
+	or.u	r12,r12,hi16(Lbase)
+	addu	r12,r12,r5		; r12 is address for entering in loop
+
+	extu	r5,r5,2			; divide by 4
+	subu	r2,r2,r5		; adjust res_ptr
+	subu	r3,r3,r5		; adjust s1_ptr
+	subu	r4,r4,r5		; adjust s2_ptr
+
+	or	r8,r6,r0		; r8 = copy of the first s1 limb
+
+	jmp.n	r12
+	 or	r9,r7,r0		; (delay slot) r9 = copy of the first s2 limb
+
+Loop:	addu	r3,r3,32		; advance all three pointers by 8 limbs
+	st	r8,r2,28		; store the sum left pending by the previous pass
+	addu	r4,r4,32
+	ld	r6,r3,0
+	addu	r2,r2,32
+	ld	r7,r4,0
+Lzero:	subu	r10,r10,1		; add 0 + 8r limbs (adj loop cnt)
+Lbase:	ld	r8,r3,4
+	addu.cio r6,r6,r7
+	ld	r9,r4,4
+	st	r6,r2,0
+	ld	r6,r3,8			; add 7 + 8r limbs
+	addu.cio r8,r8,r9
+	ld	r7,r4,8
+	st	r8,r2,4
+	ld	r8,r3,12		; add 6 + 8r limbs
+	addu.cio r6,r6,r7
+	ld	r9,r4,12
+	st	r6,r2,8
+	ld	r6,r3,16		; add 5 + 8r limbs
+	addu.cio r8,r8,r9
+	ld	r7,r4,16
+	st	r8,r2,12
+	ld	r8,r3,20		; add 4 + 8r limbs
+	addu.cio r6,r6,r7
+	ld	r9,r4,20
+	st	r6,r2,16
+	ld	r6,r3,24		; add 3 + 8r limbs
+	addu.cio r8,r8,r9
+	ld	r7,r4,24
+	st	r8,r2,20
+	ld	r8,r3,28		; add 2 + 8r limbs
+	addu.cio r6,r6,r7
+	ld	r9,r4,28
+	st	r6,r2,24
+	bcnd.n	ne0,r10,Loop		; add 1 + 8r limbs
+	 addu.cio r8,r8,r9
+
+	st	r8,r2,28		; store most significant limb
+
+	jmp.n	 r1
+	 addu.ci r2,r0,r0		; return carry-out from most sign. limb
diff --git a/sysdeps/m88k/m88100/mul_1.s b/sysdeps/m88k/m88100/mul_1.s
new file mode 100644
index 0000000000..35c238d570
--- /dev/null
+++ b/sysdeps/m88k/m88100/mul_1.s
@@ -0,0 +1,128 @@
+; mc88100 __mpn_mul_1 -- Multiply a limb vector with a single limb and
+; store the product in a second limb vector.
+
+; Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr	r2
+; s1_ptr	r3
+; size		r4
+; s2_limb	r5
+
+; Common overhead is about 11 cycles/invocation.
+
+; The speed for S2_LIMB >= 0x10000 is approximately 21 cycles/limb.  (The
+; pipeline stalls 2 cycles due to WB contention.)
+
+; The speed for S2_LIMB < 0x10000 is approximately 16 cycles/limb.  (The
+; pipeline stalls 2 cycles due to WB contention and 1 cycle due to latency.)
+
+; To enhance speed:
+; 1. Unroll main loop 4-8 times.
+; 2. Schedule code to avoid WB contention.  It might be tempting to move the
+;    ld instruction in the loops down to save 2 cycles (less WB contention),
+;    but that loses because the ultimate value will be read from outside
+;    the allocated space.  But if we handle the ultimate multiplication in
+;    the tail, we can do this.
+; 3. Make the multiplication with fewer instructions.  I think the code for
+;    (S2_LIMB >= 0x10000) is not minimal.
+; With these techniques the (S2_LIMB >= 0x10000) case would run in 17 or
+; less cycles/limb; the (S2_LIMB < 0x10000) case would run in 11
+; cycles/limb.  (Assuming infinite unrolling.)
+
+	text
+	align	 16
+	global	 ___mpn_mul_1
+___mpn_mul_1:
+
+	; Make S1_PTR and RES_PTR point at the end of their blocks
+	; and negate SIZE.
+	lda	 r3,r3[r4]
+	lda	 r6,r2[r4]		; RES_PTR in r6 since r2 is retval
+	subu	 r4,r0,r4
+
+	addu.co	 r2,r0,r0		; r2 = cy = 0
+	ld	 r9,r3[r4]		; r9 = first s1 limb
+	mask	 r7,r5,0xffff		; r7 = lo(S2_LIMB)
+	extu	 r8,r5,16		; r8 = hi(S2_LIMB)
+	bcnd.n	 eq0,r8,Lsmall		; jump if (hi(S2_LIMB) == 0)
+	 subu	 r6,r6,4		; (delay slot) bias r6: stores use the already-incremented index
+
+; General code for any value of S2_LIMB.
+
+	; Make a stack frame and save r25 and r26
+	subu	 r31,r31,16
+	st.d	 r25,r31,8
+
+	; Enter the loop in the middle
+	br.n	L1
+	addu	 r4,r4,1
+
+Loop:
+	ld	 r9,r3[r4]
+	st	 r26,r6[r4]
+; bcnd	ne0,r0,0			; bubble
+	addu	 r4,r4,1
+L1:	mul	 r26,r9,r5		; low word of product	mul_1	WB ld
+	mask	 r12,r9,0xffff		; r12 = lo(s1_limb)	mask_1
+	mul	 r11,r12,r7		; r11 =  prod_0		mul_2	WB mask_1
+	mul	 r10,r12,r8		; r10 = prod_1a		mul_3
+	extu	 r13,r9,16		; r13 = hi(s1_limb)	extu_1	WB mul_1
+	mul	 r12,r13,r7		; r12 = prod_1b		mul_4	WB extu_1
+	mul	 r25,r13,r8		; r25  = prod_2		mul_5	WB mul_2
+	extu	 r11,r11,16		; r11 = hi(prod_0)	extu_2	WB mul_3
+	addu	 r10,r10,r11		;			addu_1	WB extu_2
+; bcnd	ne0,r0,0			; bubble			WB addu_1
+	addu.co	 r10,r10,r12		;				WB mul_4
+	mask.u	 r10,r10,0xffff		; move the 16 most significant bits...
+	addu.ci	 r10,r10,r0		; ...to the low half of the word...
+	rot	 r10,r10,16		; ...and put carry in pos 16.
+	addu.co	 r26,r26,r2		; add old carry limb
+	bcnd.n	 ne0,r4,Loop
+	 addu.ci r2,r25,r10		; compute new carry limb
+
+	st	 r26,r6[r4]		; store the last product limb
+	ld.d	 r25,r31,8		; restore r25 and r26
+	jmp.n	 r1
+	 addu	 r31,r31,16		; (delay slot) pop the stack frame
+
+; Fast code for S2_LIMB < 0x10000
+Lsmall:
+	; Enter the loop in the middle
+	br.n	SL1
+	addu	 r4,r4,1
+
+SLoop:
+	ld	 r9,r3[r4]		;
+	st	 r8,r6[r4]		;
+	addu	 r4,r4,1		;
+SL1:	mul	 r8,r9,r5		; low word of product
+	mask	 r12,r9,0xffff		; r12 = lo(s1_limb)
+	extu	 r13,r9,16		; r13 = hi(s1_limb)
+	mul	 r11,r12,r7		; r11 =  prod_0
+	mul	 r12,r13,r7		; r12 = prod_1b
+	addu.cio r8,r8,r2		; add old carry limb
+	extu	 r10,r11,16		; r10 = hi(prod_0)
+	addu	 r10,r10,r12		;
+	bcnd.n	 ne0,r4,SLoop
+	extu	 r2,r10,16		; r2 = new carry limb
+
+	jmp.n	 r1
+	st	 r8,r6[r4]
diff --git a/sysdeps/m88k/m88100/sub_n.s b/sysdeps/m88k/m88100/sub_n.s
new file mode 100644
index 0000000000..3963cd5479
--- /dev/null
+++ b/sysdeps/m88k/m88100/sub_n.s
@@ -0,0 +1,104 @@
+; mc88100 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+; store difference in a third limb vector.
+
+; Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr	r2
+; s1_ptr	r3
+; s2_ptr	r4
+; size		r5
+
+; This code has been optimized to run one instruction per clock, avoiding
+; load stalls and writeback contention.  As a result, the instruction
+; order is not always natural.
+
+; The speed is about 4.6 clocks/limb + 18 clocks/limb-vector on an 88100,
+; but on the 88110, it seems to run much slower, 6.6 clocks/limb.
+
+	text
+	align	 16
+	global	 ___mpn_sub_n
+___mpn_sub_n:
+	ld	r6,r3,0			; read first limb from s1_ptr
+	extu	r10,r5,3		; r10 = size / 8, counter for the 8-limb unrolled loop
+	ld	r7,r4,0			; read first limb from s2_ptr
+	subu.co	r0,r0,r0		; set carry (0 - 0 does not borrow): carry = NOT borrow
+	subu	r5,r0,r5		; r5 = -size; plain subu so the carry stays set
+	mak	r5,r5,3<4>		; r5 = ((-size) & 7) << 4: code offset past Lbase
+	bcnd	eq0,r5,Lzero		; size is a multiple of 8: enter at Lzero
+
+	or	r12,r0,lo16(Lbase)
+	or.u	r12,r12,hi16(Lbase)
+	addu	r12,r12,r5		; r12 is address for entering in loop
+
+	extu	r5,r5,2			; divide by 4
+	subu	r2,r2,r5		; adjust res_ptr
+	subu	r3,r3,r5		; adjust s1_ptr
+	subu	r4,r4,r5		; adjust s2_ptr
+
+	or	r8,r6,r0		; r8 = copy of the first s1 limb
+
+	jmp.n	r12
+	 or	r9,r7,r0		; (delay slot) r9 = copy of the first s2 limb
+
+Loop:	addu	r3,r3,32		; advance all three pointers by 8 limbs
+	st	r8,r2,28		; store the difference left pending by the previous pass
+	addu	r4,r4,32
+	ld	r6,r3,0
+	addu	r2,r2,32
+	ld	r7,r4,0
+Lzero:	subu	r10,r10,1		; subtract 0 + 8r limbs (adj loop cnt)
+Lbase:	ld	r8,r3,4
+	subu.cio r6,r6,r7
+	ld	r9,r4,4
+	st	r6,r2,0
+	ld	r6,r3,8			; subtract 7 + 8r limbs
+	subu.cio r8,r8,r9
+	ld	r7,r4,8
+	st	r8,r2,4
+	ld	r8,r3,12		; subtract 6 + 8r limbs
+	subu.cio r6,r6,r7
+	ld	r9,r4,12
+	st	r6,r2,8
+	ld	r6,r3,16		; subtract 5 + 8r limbs
+	subu.cio r8,r8,r9
+	ld	r7,r4,16
+	st	r8,r2,12
+	ld	r8,r3,20		; subtract 4 + 8r limbs
+	subu.cio r6,r6,r7
+	ld	r9,r4,20
+	st	r6,r2,16
+	ld	r6,r3,24		; subtract 3 + 8r limbs
+	subu.cio r8,r8,r9
+	ld	r7,r4,24
+	st	r8,r2,20
+	ld	r8,r3,28		; subtract 2 + 8r limbs
+	subu.cio r6,r6,r7
+	ld	r9,r4,28
+	st	r6,r2,24
+	bcnd.n	ne0,r10,Loop		; subtract 1 + 8r limbs
+	 subu.cio r8,r8,r9
+
+	st	r8,r2,28		; store most significant limb
+
+	addu.ci r2,r0,r0		; return carry-out from most sign. limb
+	jmp.n	 r1
+	 xor	r2,r2,1			; (delay slot) invert: returned value is the borrow
diff --git a/sysdeps/m88k/m88110/mul_1.s b/sysdeps/m88k/m88110/mul_1.s
new file mode 100644
index 0000000000..08c3ca07ee
--- /dev/null
+++ b/sysdeps/m88k/m88110/mul_1.s
@@ -0,0 +1,84 @@
+; mc88110 __mpn_mul_1 -- Multiply a limb vector with a single limb and
+; store the product in a second limb vector.
+
+; Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Library General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+; License for more details.
+
+; You should have received a copy of the GNU Library General Public License
+; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr	r2
+; s1_ptr	r3
+; size		r4
+; s2_limb	r5
+
+	text
+	align	16
+	global	___mpn_mul_1
+___mpn_mul_1:
+	; Make S1_PTR and RES_PTR point at the end of their blocks
+	; and negate SIZE.
+	lda	 r3,r3[r4]
+	lda	 r8,r2[r4]		; RES_PTR in r8 since r2 is retval
+	subu	 r4,r0,r4
+
+	addu.co	 r2,r0,r0		; r2 = cy = 0
+
+	ld	 r6,r3[r4]		; r6 = first s1 limb
+	addu	 r4,r4,1
+	mulu.d	 r10,r6,r5		; r10 (high) / r11 (low) = r6 * s2_limb
+	bcnd.n	 eq0,r4,Lend		; single-limb operand: only the tail is needed
+	 subu	 r8,r8,8		; (delay slot) bias r8: stores use an index two limbs ahead
+
+Loop:	ld	 r6,r3[r4]		; fetch next s1 limb while the product is consumed
+	addu.cio r9,r11,r2		; r9 = low product word + carry limb
+	or	 r2,r10,r0		; could be avoided if unrolled
+	addu	 r4,r4,1
+	mulu.d	 r10,r6,r5		; start the next product
+	bcnd.n	 ne0,r4,Loop
+	 st	 r9,r8[r4]		; (delay slot) store the finished limb
+
+Lend:	addu.cio r9,r11,r2		; last low word + carry limb
+	st	 r9,r8,4		; store most significant result limb
+	jmp.n	 r1
+	 addu.ci r2,r10,r0		; (delay slot) return high word + carry
+
+; This is the Right Way to do this on '110.  4 cycles / 64-bit limb.
+;	ld.d	r10,
+;	mulu.d
+;	addu.cio
+;	addu.cio
+;	st.d
+;	mulu.d	,r11,r5
+;	ld.d	r12,
+;	mulu.d	,r10,r5
+;	addu.cio
+;	addu.cio
+;	st.d
+;	mulu.d
+;	ld.d	r10,
+;	mulu.d
+;	addu.cio
+;	addu.cio
+;	st.d
+;	mulu.d
+;	ld.d	r10,
+;	mulu.d
+;	addu.cio
+;	addu.cio
+;	st.d
+;	mulu.d
diff --git a/sysdeps/mips/add_n.s b/sysdeps/mips/add_n.s
new file mode 100644
index 0000000000..c82910816e
--- /dev/null
+++ b/sysdeps/mips/add_n.s
@@ -0,0 +1,119 @@
+ # MIPS2 __mpn_add_n -- Add two limb vectors of the same length > 0 and
+ # store sum in a third limb vector.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	$4
+ # s1_ptr	$5
+ # s2_ptr	$6
+ # size		$7
+
+	.text
+	.align	2
+	.globl	__mpn_add_n
+	.ent	__mpn_add_n
+__mpn_add_n:
+	.set	noreorder
+	.set	nomacro
+
+	lw	$10,0($5)	# first s1 limb
+	lw	$11,0($6)	# first s2 limb
+
+	addiu	$7,$7,-1	# size - 1: the final limb is finished at .Lend
+	and	$9,$7,4-1	# number of limbs in first loop
+	beq	$9,$0,.L0	# if multiple of 4 limbs, skip first loop
+	 move	$2,$0		# (delay slot) carry = 0
+
+	subu	$7,$7,$9	# what remains is a multiple of 4
+
+.Loop0:	addiu	$9,$9,-1
+	lw	$12,4($5)	# preload next s1 limb
+	addu	$11,$11,$2	# s2 limb + carry-in
+	lw	$13,4($6)	# preload next s2 limb
+	sltu	$8,$11,$2	# $8 = 1 if that add wrapped
+	addu	$11,$10,$11	# + s1 limb
+	sltu	$2,$11,$10	# $2 = 1 if that add wrapped
+	sw	$11,0($4)	# store sum limb
+	or	$2,$2,$8	# carry-out = either wrap
+
+	addiu	$5,$5,4
+	addiu	$6,$6,4
+	move	$10,$12		# preloaded limbs become the current pair
+	move	$11,$13
+	bne	$9,$0,.Loop0
+	 addiu	$4,$4,4
+
+.L0:	beq	$7,$0,.Lend
+	 nop
+
+.Loop:	addiu	$7,$7,-4	# four limbs per pass
+
+	lw	$12,4($5)	# same carry scheme as .Loop0; the current pair
+	addu	$11,$11,$2	# alternates between $10/$11 and $12/$13
+	lw	$13,4($6)
+	sltu	$8,$11,$2
+	addu	$11,$10,$11
+	sltu	$2,$11,$10
+	sw	$11,0($4)
+	or	$2,$2,$8
+
+	lw	$10,8($5)
+	addu	$13,$13,$2
+	lw	$11,8($6)
+	sltu	$8,$13,$2
+	addu	$13,$12,$13
+	sltu	$2,$13,$12
+	sw	$13,4($4)
+	or	$2,$2,$8
+
+	lw	$12,12($5)
+	addu	$11,$11,$2
+	lw	$13,12($6)
+	sltu	$8,$11,$2
+	addu	$11,$10,$11
+	sltu	$2,$11,$10
+	sw	$11,8($4)
+	or	$2,$2,$8
+
+	lw	$10,16($5)
+	addu	$13,$13,$2
+	lw	$11,16($6)
+	sltu	$8,$13,$2
+	addu	$13,$12,$13
+	sltu	$2,$13,$12
+	sw	$13,12($4)
+	or	$2,$2,$8
+
+	addiu	$5,$5,16
+	addiu	$6,$6,16
+
+	bne	$7,$0,.Loop
+	 addiu	$4,$4,16
+
+.Lend:	addu	$11,$11,$2	# last limb: s2 limb + carry-in
+	sltu	$8,$11,$2
+	addu	$11,$10,$11
+	sltu	$2,$11,$10
+	sw	$11,0($4)
+	j	$31
+	or	$2,$2,$8	# (delay slot) $2 = carry-out of the whole add
+
+	.end	__mpn_add_n
diff --git a/sysdeps/mips/addmul_1.s b/sysdeps/mips/addmul_1.s
new file mode 100644
index 0000000000..abc2fb8dcf
--- /dev/null
+++ b/sysdeps/mips/addmul_1.s
@@ -0,0 +1,96 @@
+ # MIPS __mpn_addmul_1 -- Multiply a limb vector with a single limb and
+ # add the product to a second limb vector.
+
+ # Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	$4
+ # s1_ptr	$5
+ # size		$6
+ # s2_limb	$7
+
+	.text
+	.align	 4
+	.globl	 __mpn_addmul_1
+	.ent	__mpn_addmul_1
+__mpn_addmul_1:
+	.set    noreorder
+	.set    nomacro
+
+ # warm up phase 0
+	lw	$8,0($5)	# first s1 limb
+
+ # warm up phase 1
+	addiu	$5,$5,4
+	multu	$8,$7		# start first product; read back later via mflo/mfhi
+
+	addiu	$6,$6,-1
+	beq	$6,$0,$LC0	# one limb only: just drain the product
+	 move	$2,$0		# zero cy2
+
+	addiu	$6,$6,-1
+	beq	$6,$0,$LC1	# two limbs: skip the steady-state loop
+	lw	$8,0($5)	# load new s1 limb as early as possible
+
+Loop:	lw	$10,0($4)	# res limb to accumulate into
+	mflo	$3		# low word of the pending product
+	mfhi	$9		# high word of the pending product
+	addiu	$5,$5,4
+	addu	$3,$3,$2	# add old carry limb to low product limb
+	multu	$8,$7		# start the next product
+	lw	$8,0($5)	# load new s1 limb as early as possible
+	addiu	$6,$6,-1	# decrement loop counter
+	sltu	$2,$3,$2	# carry from previous addition -> $2
+	addu	$3,$10,$3	# add the res limb
+	sltu	$10,$3,$10	# carry from that addition -> $10
+	addu	$2,$2,$10	# combine both carries
+	sw	$3,0($4)
+	addiu	$4,$4,4
+	bne	$6,$0,Loop	# should be "bnel"
+	 addu	$2,$9,$2	# add high product limb and carry from addition
+
+ # cool down phase 1
+$LC1:	lw	$10,0($4)
+	mflo	$3
+	mfhi	$9
+	addu	$3,$3,$2
+	sltu	$2,$3,$2
+	multu	$8,$7		# last product, for the final limb
+	addu	$3,$10,$3
+	sltu	$10,$3,$10
+	addu	$2,$2,$10
+	sw	$3,0($4)
+	addiu	$4,$4,4
+	addu	$2,$9,$2	# add high product limb and carry from addition
+
+ # cool down phase 0
+$LC0:	lw	$10,0($4)
+	mflo	$3
+	mfhi	$9
+	addu	$3,$3,$2
+	sltu	$2,$3,$2
+	addu	$3,$10,$3
+	sltu	$10,$3,$10
+	addu	$2,$2,$10
+	sw	$3,0($4)
+	j	$31
+	addu	$2,$9,$2	# add high product limb and carry from addition
+
+	.end	__mpn_addmul_1
diff --git a/sysdeps/mips/lshift.s b/sysdeps/mips/lshift.s
new file mode 100644
index 0000000000..ce33e7c84c
--- /dev/null
+++ b/sysdeps/mips/lshift.s
@@ -0,0 +1,94 @@
+ # MIPS2 __mpn_lshift -- Shift a limb vector left by cnt bits.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	$4
+ # src_ptr	$5
+ # size		$6
+ # cnt		$7
+
+	.text
+	.align	2
+	.globl	__mpn_lshift
+	.ent	__mpn_lshift
+__mpn_lshift:
+	.set	noreorder
+	.set	nomacro
+
+	sll	$2,$6,2		# $2 = size in bytes
+	addu	$5,$5,$2	# make r5 point at end of src
+	lw	$10,-4($5)	# load first limb
+	subu	$13,$0,$7	# $13 = -cnt; as a shift amount (low 5 bits) it acts as 32-cnt
+	addu	$4,$4,$2	# make r4 point at end of res
+	addiu	$6,$6,-1	# size - 1: the lowest limb is finished at .Lend
+	and	$9,$6,4-1	# number of limbs in first loop
+	beq	$9,$0,.L0	# if multiple of 4 limbs, skip first loop
+	 srl	$2,$10,$13	# compute function result
+
+	subu	$6,$6,$9	# what remains is a multiple of 4
+
+.Loop0:	lw	$3,-8($5)	# next lower source limb
+	addiu	$4,$4,-4
+	addiu	$5,$5,-4
+	addiu	$9,$9,-1
+	sll	$11,$10,$7	# current limb shifted up by cnt
+	srl	$12,$3,$13	# bits carried in from the limb below
+	move	$10,$3		# lower limb becomes the current one
+	or	$8,$11,$12
+	bne	$9,$0,.Loop0
+	 sw	$8,0($4)	# (delay slot) store result limb
+
+.L0:	beq	$6,$0,.Lend
+	 nop
+
+.Loop:	lw	$3,-8($5)	# four limbs per pass, walking downwards
+	addiu	$4,$4,-16
+	addiu	$6,$6,-4
+	sll	$11,$10,$7
+	srl	$12,$3,$13
+
+	lw	$10,-12($5)
+	sll	$14,$3,$7
+	or	$8,$11,$12
+	sw	$8,12($4)
+	srl	$9,$10,$13
+
+	lw	$3,-16($5)
+	sll	$11,$10,$7
+	or	$8,$14,$9
+	sw	$8,8($4)
+	srl	$12,$3,$13
+
+	lw	$10,-20($5)
+	sll	$14,$3,$7
+	or	$8,$11,$12
+	sw	$8,4($4)
+	srl	$9,$10,$13
+
+	addiu	$5,$5,-16
+	or	$8,$14,$9
+	bgtz	$6,.Loop
+	 sw	$8,0($4)
+
+.Lend:	sll	$8,$10,$7	# lowest limb: nothing shifts in from below
+	j	$31
+	sw	$8,-4($4)	# (delay slot) store lowest result limb
+	.end	__mpn_lshift
diff --git a/sysdeps/mips/mips3/add_n.s b/sysdeps/mips/mips3/add_n.s
new file mode 100644
index 0000000000..b5257804ad
--- /dev/null
+++ b/sysdeps/mips/mips3/add_n.s
@@ -0,0 +1,119 @@
+ # MIPS3 __mpn_add_n -- Add two limb vectors of the same length > 0 and
+ # store sum in a third limb vector.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	$4
+ # s1_ptr	$5
+ # s2_ptr	$6
+ # size		$7
+
+	.text
+	.align	2
+	.globl	__mpn_add_n
+	.ent	__mpn_add_n
+__mpn_add_n:
+	.set	noreorder
+	.set	nomacro
+
+	ld	$10,0($5)	# first s1 limb (8 bytes)
+	ld	$11,0($6)	# first s2 limb
+
+	daddiu	$7,$7,-1	# size - 1: the final limb is finished at .Lend
+	and	$9,$7,4-1	# number of limbs in first loop
+	beq	$9,$0,.L0	# if multiple of 4 limbs, skip first loop
+	 move	$2,$0		# (delay slot) carry = 0
+
+	dsubu	$7,$7,$9	# what remains is a multiple of 4
+
+.Loop0:	daddiu	$9,$9,-1
+	ld	$12,8($5)	# preload next s1 limb
+	daddu	$11,$11,$2	# s2 limb + carry-in
+	ld	$13,8($6)	# preload next s2 limb
+	sltu	$8,$11,$2	# $8 = 1 if that add wrapped
+	daddu	$11,$10,$11	# + s1 limb
+	sltu	$2,$11,$10	# $2 = 1 if that add wrapped
+	sd	$11,0($4)	# store sum limb
+	or	$2,$2,$8	# carry-out = either wrap
+
+	daddiu	$5,$5,8
+	daddiu	$6,$6,8
+	move	$10,$12		# preloaded limbs become the current pair
+	move	$11,$13
+	bne	$9,$0,.Loop0
+	 daddiu	$4,$4,8
+
+.L0:	beq	$7,$0,.Lend
+	 nop
+
+.Loop:	daddiu	$7,$7,-4	# four limbs per pass
+
+	ld	$12,8($5)	# same carry scheme as .Loop0; the current pair
+	daddu	$11,$11,$2	# alternates between $10/$11 and $12/$13
+	ld	$13,8($6)
+	sltu	$8,$11,$2
+	daddu	$11,$10,$11
+	sltu	$2,$11,$10
+	sd	$11,0($4)
+	or	$2,$2,$8
+
+	ld	$10,16($5)
+	daddu	$13,$13,$2
+	ld	$11,16($6)
+	sltu	$8,$13,$2
+	daddu	$13,$12,$13
+	sltu	$2,$13,$12
+	sd	$13,8($4)
+	or	$2,$2,$8
+
+	ld	$12,24($5)
+	daddu	$11,$11,$2
+	ld	$13,24($6)
+	sltu	$8,$11,$2
+	daddu	$11,$10,$11
+	sltu	$2,$11,$10
+	sd	$11,16($4)
+	or	$2,$2,$8
+
+	ld	$10,32($5)
+	daddu	$13,$13,$2
+	ld	$11,32($6)
+	sltu	$8,$13,$2
+	daddu	$13,$12,$13
+	sltu	$2,$13,$12
+	sd	$13,24($4)
+	or	$2,$2,$8
+
+	daddiu	$5,$5,32
+	daddiu	$6,$6,32
+
+	bne	$7,$0,.Loop
+	 daddiu	$4,$4,32
+
+.Lend:	daddu	$11,$11,$2	# last limb: s2 limb + carry-in
+	sltu	$8,$11,$2
+	daddu	$11,$10,$11
+	sltu	$2,$11,$10
+	sd	$11,0($4)
+	j	$31
+	or	$2,$2,$8	# (delay slot) $2 = carry-out of the whole add
+
+	.end	__mpn_add_n
diff --git a/sysdeps/mips/mips3/addmul_1.s b/sysdeps/mips/mips3/addmul_1.s
new file mode 100644
index 0000000000..7af0172614
--- /dev/null
+++ b/sysdeps/mips/mips3/addmul_1.s
@@ -0,0 +1,96 @@
+ # MIPS3 __mpn_addmul_1 -- Multiply a limb vector with a single limb and
+ # add the product to a second limb vector.
+
+ # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	$4
+ # s1_ptr	$5
+ # size		$6
+ # s2_limb	$7
+
+	.text
+	.align	4
+	.globl	__mpn_addmul_1
+	.ent	__mpn_addmul_1
+__mpn_addmul_1:
+	.set    noreorder
+	.set    nomacro
+
+ # warm up phase 0
+	ld	$8,0($5)	# first s1 limb (8 bytes)
+
+ # warm up phase 1
+	daddiu	$5,$5,8
+	dmultu	$8,$7		# start first product; read back later via mflo/mfhi
+
+	daddiu	$6,$6,-1
+	beq	$6,$0,$LC0	# one limb only: just drain the product
+	 move	$2,$0		# zero cy2
+
+	daddiu	$6,$6,-1
+	beq	$6,$0,$LC1	# two limbs: skip the steady-state loop
+	ld	$8,0($5)	# load new s1 limb as early as possible
+
+Loop:	ld	$10,0($4)	# res limb to accumulate into
+	mflo	$3		# low word of the pending product
+	mfhi	$9		# high word of the pending product
+	daddiu	$5,$5,8
+	daddu	$3,$3,$2	# add old carry limb to low product limb
+	dmultu	$8,$7		# start the next product
+	ld	$8,0($5)	# load new s1 limb as early as possible
+	daddiu	$6,$6,-1	# decrement loop counter
+	sltu	$2,$3,$2	# carry from previous addition -> $2
+	daddu	$3,$10,$3	# add the res limb
+	sltu	$10,$3,$10	# carry from that addition -> $10
+	daddu	$2,$2,$10	# combine both carries
+	sd	$3,0($4)
+	daddiu	$4,$4,8
+	bne	$6,$0,Loop	# should be "bnel"
+	 daddu	$2,$9,$2	# add high product limb and carry from addition
+
+ # cool down phase 1
+$LC1:	ld	$10,0($4)
+	mflo	$3
+	mfhi	$9
+	daddu	$3,$3,$2
+	sltu	$2,$3,$2
+	dmultu	$8,$7		# last product, for the final limb
+	daddu	$3,$10,$3
+	sltu	$10,$3,$10
+	daddu	$2,$2,$10
+	sd	$3,0($4)
+	daddiu	$4,$4,8
+	daddu	$2,$9,$2	# add high product limb and carry from addition
+
+ # cool down phase 0
+$LC0:	ld	$10,0($4)
+	mflo	$3
+	mfhi	$9
+	daddu	$3,$3,$2
+	sltu	$2,$3,$2
+	daddu	$3,$10,$3
+	sltu	$10,$3,$10
+	daddu	$2,$2,$10
+	sd	$3,0($4)
+	j	$31
+	daddu	$2,$9,$2	# add high product limb and carry from addition
+
+	.end	__mpn_addmul_1
diff --git a/sysdeps/mips/mips3/gmp-mparam.h b/sysdeps/mips/mips3/gmp-mparam.h
new file mode 100644
index 0000000000..a801b35d7a
--- /dev/null
+++ b/sysdeps/mips/mips3/gmp-mparam.h
@@ -0,0 +1,26 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Library General Public License as published by
+the Free Software Foundation; either version 2 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+License for more details.
+
+You should have received a copy of the GNU Library General Public License
+along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
+
+#define BITS_PER_MP_LIMB 64	/* limb width in bits */
+#define BYTES_PER_MP_LIMB 8	/* limb width in bytes (64 / 8) */
+#define BITS_PER_LONGINT 32	/* NOTE(review): narrower than a limb -- confirm for the target ABI */
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
diff --git a/sysdeps/mips/mips3/lshift.s b/sysdeps/mips/mips3/lshift.s
new file mode 100644
index 0000000000..c05dcafffd
--- /dev/null
+++ b/sysdeps/mips/mips3/lshift.s
@@ -0,0 +1,94 @@
+ # MIPS3 __mpn_lshift -- Shift a limb vector left by cnt bits.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	$4
+ # src_ptr	$5
+ # size		$6
+ # cnt		$7
+
+	.text
+	.align	2
+	.globl	__mpn_lshift
+	.ent	__mpn_lshift
+__mpn_lshift:
+	.set	noreorder
+	.set	nomacro
+
+	dsll	$2,$6,3		# $2 = size in bytes (8 per limb)
+	daddu	$5,$5,$2	# make r5 point at end of src
+	ld	$10,-8($5)	# load first limb
+	dsubu	$13,$0,$7	# $13 = -cnt; as a shift amount (low 6 bits) it acts as 64-cnt
+	daddu	$4,$4,$2	# make r4 point at end of res
+	daddiu	$6,$6,-1	# size - 1: the lowest limb is finished at .Lend
+	and	$9,$6,4-1	# number of limbs in first loop
+	beq	$9,$0,.L0	# if multiple of 4 limbs, skip first loop
+	 dsrl	$2,$10,$13	# compute function result
+
+	dsubu	$6,$6,$9	# what remains is a multiple of 4
+
+.Loop0:	ld	$3,-16($5)	# next lower source limb
+	daddiu	$4,$4,-8
+	daddiu	$5,$5,-8
+	daddiu	$9,$9,-1
+	dsll	$11,$10,$7	# current limb shifted up by cnt
+	dsrl	$12,$3,$13	# bits carried in from the limb below
+	move	$10,$3		# lower limb becomes the current one
+	or	$8,$11,$12
+	bne	$9,$0,.Loop0
+	 sd	$8,0($4)	# (delay slot) store result limb
+
+.L0:	beq	$6,$0,.Lend
+	 nop
+
+.Loop:	ld	$3,-16($5)	# four limbs per pass, walking downwards
+	daddiu	$4,$4,-32
+	daddiu	$6,$6,-4
+	dsll	$11,$10,$7
+	dsrl	$12,$3,$13
+
+	ld	$10,-24($5)
+	dsll	$14,$3,$7
+	or	$8,$11,$12
+	sd	$8,24($4)
+	dsrl	$9,$10,$13
+
+	ld	$3,-32($5)
+	dsll	$11,$10,$7
+	or	$8,$14,$9
+	sd	$8,16($4)
+	dsrl	$12,$3,$13
+
+	ld	$10,-40($5)
+	dsll	$14,$3,$7
+	or	$8,$11,$12
+	sd	$8,8($4)
+	dsrl	$9,$10,$13
+
+	daddiu	$5,$5,-32
+	or	$8,$14,$9
+	bgtz	$6,.Loop
+	 sd	$8,0($4)
+
+.Lend:	dsll	$8,$10,$7	# lowest limb: nothing shifts in from below
+	j	$31
+	sd	$8,-8($4)	# (delay slot) store lowest result limb
+	.end	__mpn_lshift
diff --git a/sysdeps/mips/mips3/mul_1.s b/sysdeps/mips/mips3/mul_1.s
new file mode 100644
index 0000000000..87954e5bc3
--- /dev/null
+++ b/sysdeps/mips/mips3/mul_1.s
@@ -0,0 +1,84 @@
+ # MIPS3 __mpn_mul_1 -- Multiply a limb vector with a single limb and
+ # store the product in a second limb vector.
+
+ # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	$4
+ # s1_ptr	$5
+ # size		$6
+ # s2_limb	$7
+
+	.text
+	.align	4
+	.globl	__mpn_mul_1
+	.ent	__mpn_mul_1
+__mpn_mul_1:			# res[i] = low limb of s1[i] * s2_limb + carry; returns final carry limb in $2
+	.set    noreorder	# branch delay slots below are filled by hand
+	.set    nomacro
+
+ # warm up phase 0: load the first s1 limb
+	ld	$8,0($5)
+
+ # warm up phase 1: start the first multiplication
+	daddiu	$5,$5,8
+	dmultu	$8,$7		# HI:LO = s1 limb * s2_limb (unsigned)
+
+	daddiu	$6,$6,-1
+	beq	$6,$0,$LC0	# size == 1: only the product in flight remains
+	 move	$2,$0		# zero cy2
+
+	daddiu	$6,$6,-1
+	beq	$6,$0,$LC1	# size == 2: skip the loop
+	ld	$8,0($5)	# (delay slot) load new s1 limb as early as possible
+
+Loop:	mflo	$10		# low limb of the product started earlier
+	mfhi	$9		# high limb of that product
+	daddiu	$5,$5,8
+	daddu	$10,$10,$2	# add old carry limb to low product limb
+	dmultu	$8,$7		# start the next multiplication
+	ld	$8,0($5)	# load new s1 limb as early as possible
+	daddiu	$6,$6,-1	# decrement loop counter
+	sltu	$2,$10,$2	# carry from previous addition -> $2
+	sd	$10,0($4)	# store result limb
+	daddiu	$4,$4,8
+	bne	$6,$0,Loop	# should be "bnel"
+	 daddu	$2,$9,$2	# add high product limb and carry from addition
+
+ # cool down phase 1: finish one product, start the last multiplication
+$LC1:	mflo	$10
+	mfhi	$9
+	daddu	$10,$10,$2	# add old carry limb to low product limb
+	sltu	$2,$10,$2	# carry from that addition -> $2
+	dmultu	$8,$7		# last multiplication
+	sd	$10,0($4)
+	daddiu	$4,$4,8
+	daddu	$2,$9,$2	# add high product limb and carry from addition
+
+ # cool down phase 0: finish the last product and return
+$LC0:	mflo	$10
+	mfhi	$9
+	daddu	$10,$10,$2	# add old carry limb to low product limb
+	sltu	$2,$10,$2	# carry from that addition -> $2
+	sd	$10,0($4)	# store last result limb
+	j	$31		# return
+	daddu	$2,$9,$2	# add high product limb and carry from addition
+
+	.end	__mpn_mul_1
diff --git a/sysdeps/mips/mips3/rshift.s b/sysdeps/mips/mips3/rshift.s
new file mode 100644
index 0000000000..e0e2ca2c5f
--- /dev/null
+++ b/sysdeps/mips/mips3/rshift.s
@@ -0,0 +1,91 @@
+ # MIPS3 __mpn_rshift -- Shift a limb vector right by cnt bits.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	$4
+ # src_ptr	$5
+ # size		$6
+ # cnt		$7
+
+	.text
+	.align	2
+	.globl	__mpn_rshift
+	.ent	__mpn_rshift
+__mpn_rshift:			# shift size limbs at src_ptr right by cnt bits into res_ptr
+	.set	noreorder	# branch delay slots below are filled by hand
+	.set	nomacro
+
+	ld	$10,0($5)	# load first limb
+	dsubu	$13,$0,$7	# $13 = -cnt; shifts use its low 6 bits, i.e. 64-cnt for cnt in 1..63
+	daddiu	$6,$6,-1	# $6 = size-1 = limbs still to be loaded
+	and	$9,$6,4-1	# number of limbs in first loop
+	beq	$9,$0,.L0	# if multiple of 4 limbs, skip first loop
+	 dsll	$2,$10,$13	# (delay slot) compute function result: bits shifted out of first limb
+
+	dsubu	$6,$6,$9	# $6 = limbs left for the unrolled loop (a multiple of 4)
+
+.Loop0:	ld	$3,8($5)	# next higher limb
+	daddiu	$4,$4,8
+	daddiu	$5,$5,8
+	daddiu	$9,$9,-1
+	dsrl	$11,$10,$7	# current limb >> cnt
+	dsll	$12,$3,$13	# low cnt bits of the next higher limb, moved to the top
+	move	$10,$3		# next higher limb becomes the current one
+	or	$8,$11,$12	# combine both parts into one result limb
+	bne	$9,$0,.Loop0
+	 sd	$8,-8($4)	# (delay slot) store result limb
+
+.L0:	beq	$6,$0,.Lend	# no limbs left for the unrolled loop
+	 nop
+
+.Loop:	ld	$3,8($5)	# 4-way unrolled; $10 and $3 alternate as the current limb
+	daddiu	$4,$4,32
+	daddiu	$6,$6,-4
+	dsrl	$11,$10,$7
+	dsll	$12,$3,$13
+
+	ld	$10,16($5)
+	dsrl	$14,$3,$7
+	or	$8,$11,$12
+	sd	$8,-32($4)	# first result limb of this round
+	dsll	$9,$10,$13
+
+	ld	$3,24($5)
+	dsrl	$11,$10,$7
+	or	$8,$14,$9
+	sd	$8,-24($4)	# second result limb
+	dsll	$12,$3,$13
+
+	ld	$10,32($5)
+	dsrl	$14,$3,$7
+	or	$8,$11,$12
+	sd	$8,-16($4)	# third result limb
+	dsll	$9,$10,$13
+
+	daddiu	$5,$5,32
+	or	$8,$14,$9
+	bgtz	$6,.Loop
+	 sd	$8,-8($4)	# (delay slot) fourth result limb
+
+.Lend:	dsrl	$8,$10,$7	# last limb: shift only, zeros enter at the top
+	j	$31		# return; function result is in $2
+	sd	$8,0($4)	# (delay slot) store last result limb
+	.end	__mpn_rshift
diff --git a/sysdeps/mips/mips3/sub_n.s b/sysdeps/mips/mips3/sub_n.s
new file mode 100644
index 0000000000..9a45ffde5a
--- /dev/null
+++ b/sysdeps/mips/mips3/sub_n.s
@@ -0,0 +1,119 @@
+ # MIPS3 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+ # store difference in a third limb vector.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	$4
+ # s1_ptr	$5
+ # s2_ptr	$6
+ # size		$7
+
+	.text
+	.align	2
+	.globl	__mpn_sub_n
+	.ent	__mpn_sub_n
+__mpn_sub_n:			# res = s1 - s2 over size limbs; returns the final borrow (0 or 1) in $2
+	.set	noreorder	# branch delay slots below are filled by hand
+	.set	nomacro
+
+	ld	$10,0($5)	# $10 = first s1 limb
+	ld	$11,0($6)	# $11 = first s2 limb
+
+	daddiu	$7,$7,-1	# $7 = size-1; the last limb is handled at .Lend
+	and	$9,$7,4-1	# number of limbs in first loop
+	beq	$9,$0,.L0	# if multiple of 4 limbs, skip first loop
+	 move	$2,$0		# (delay slot) clear the borrow
+
+	dsubu	$7,$7,$9	# $7 = limbs left for the unrolled loop (a multiple of 4)
+
+.Loop0:	daddiu	$9,$9,-1
+	ld	$12,8($5)	# load next s1 limb early
+	daddu	$11,$11,$2	# s2 limb + borrow in
+	ld	$13,8($6)	# load next s2 limb early
+	sltu	$8,$11,$2	# $8 = 1 if that addition wrapped around
+	dsubu	$11,$10,$11	# difference limb
+	sltu	$2,$10,$11	# $2 = 1 if the subtraction wrapped around
+	sd	$11,0($4)
+	or	$2,$2,$8	# borrow out = either wrap-around
+
+	daddiu	$5,$5,8
+	daddiu	$6,$6,8
+	move	$10,$12		# next limbs become the current ones
+	move	$11,$13
+	bne	$9,$0,.Loop0
+	 daddiu	$4,$4,8		# (delay slot) advance res_ptr
+
+.L0:	beq	$7,$0,.Lend	# no limbs left for the unrolled loop
+	 nop
+
+.Loop:	daddiu	$7,$7,-4	# 4-way unrolled; ($10,$11) and ($12,$13) alternate as current limbs
+
+	ld	$12,8($5)
+	daddu	$11,$11,$2	# s2 limb + borrow in
+	ld	$13,8($6)
+	sltu	$8,$11,$2	# wrap-around of that addition
+	dsubu	$11,$10,$11	# difference limb
+	sltu	$2,$10,$11	# wrap-around of the subtraction
+	sd	$11,0($4)
+	or	$2,$2,$8	# borrow out
+
+	ld	$10,16($5)
+	daddu	$13,$13,$2
+	ld	$11,16($6)
+	sltu	$8,$13,$2
+	dsubu	$13,$12,$13
+	sltu	$2,$12,$13
+	sd	$13,8($4)
+	or	$2,$2,$8
+
+	ld	$12,24($5)
+	daddu	$11,$11,$2
+	ld	$13,24($6)
+	sltu	$8,$11,$2
+	dsubu	$11,$10,$11
+	sltu	$2,$10,$11
+	sd	$11,16($4)
+	or	$2,$2,$8
+
+	ld	$10,32($5)
+	daddu	$13,$13,$2
+	ld	$11,32($6)
+	sltu	$8,$13,$2
+	dsubu	$13,$12,$13
+	sltu	$2,$12,$13
+	sd	$13,24($4)
+	or	$2,$2,$8
+
+	daddiu	$5,$5,32
+	daddiu	$6,$6,32
+
+	bne	$7,$0,.Loop
+	 daddiu	$4,$4,32	# (delay slot) advance res_ptr
+
+.Lend:	daddu	$11,$11,$2	# last limb: s2 limb + borrow in
+	sltu	$8,$11,$2	# wrap-around of that addition
+	dsubu	$11,$10,$11	# difference limb
+	sltu	$2,$10,$11	# wrap-around of the subtraction
+	sd	$11,0($4)
+	j	$31		# return
+	or	$2,$2,$8	# (delay slot) final borrow -> $2
+
+	.end	__mpn_sub_n
diff --git a/sysdeps/mips/mips3/submul_1.s b/sysdeps/mips/mips3/submul_1.s
new file mode 100644
index 0000000000..f28c6a5167
--- /dev/null
+++ b/sysdeps/mips/mips3/submul_1.s
@@ -0,0 +1,96 @@
+ # MIPS3 __mpn_submul_1 -- Multiply a limb vector with a single limb and
+ # subtract the product from a second limb vector.
+
+ # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	$4
+ # s1_ptr	$5
+ # size		$6
+ # s2_limb	$7
+
+	.text
+	.align	4
+	.globl	__mpn_submul_1
+	.ent	__mpn_submul_1
+__mpn_submul_1:			# res[i] -= s1[i] * s2_limb with carry; returns final carry limb in $2
+	.set    noreorder	# branch delay slots below are filled by hand
+	.set    nomacro
+
+ # warm up phase 0: load the first s1 limb
+	ld	$8,0($5)
+
+ # warm up phase 1: start the first multiplication
+	daddiu	$5,$5,8
+	dmultu	$8,$7		# HI:LO = s1 limb * s2_limb (unsigned)
+
+	daddiu	$6,$6,-1
+	beq	$6,$0,$LC0	# size == 1: only the product in flight remains
+	 move	$2,$0		# zero cy2
+
+	daddiu	$6,$6,-1
+	beq	$6,$0,$LC1	# size == 2: skip the loop
+	ld	$8,0($5)	# (delay slot) load new s1 limb as early as possible
+
+Loop:	ld	$10,0($4)	# res limb to subtract from
+	mflo	$3		# low limb of the product started earlier
+	mfhi	$9		# high limb of that product
+	daddiu	$5,$5,8
+	daddu	$3,$3,$2	# add old carry limb to low product limb
+	dmultu	$8,$7		# start the next multiplication
+	ld	$8,0($5)	# load new s1 limb as early as possible
+	daddiu	$6,$6,-1	# decrement loop counter
+	sltu	$2,$3,$2	# carry from previous addition -> $2
+	dsubu	$3,$10,$3	# res limb - (low product limb + carry)
+	sgtu	$10,$3,$10	# $10 = 1 if that subtraction wrapped around
+	daddu	$2,$2,$10	# combine the addition carry and the borrow
+	sd	$3,0($4)	# store result limb
+	daddiu	$4,$4,8
+	bne	$6,$0,Loop	# should be "bnel"
+	 daddu	$2,$9,$2	# add high product limb and carry from addition
+
+ # cool down phase 1: finish one product, start the last multiplication
+$LC1:	ld	$10,0($4)
+	mflo	$3
+	mfhi	$9
+	daddu	$3,$3,$2	# add old carry limb to low product limb
+	sltu	$2,$3,$2	# carry from that addition -> $2
+	dmultu	$8,$7		# last multiplication
+	dsubu	$3,$10,$3	# res limb - (low product limb + carry)
+	sgtu	$10,$3,$10	# $10 = 1 if that subtraction wrapped around
+	daddu	$2,$2,$10	# combine the addition carry and the borrow
+	sd	$3,0($4)
+	daddiu	$4,$4,8
+	daddu	$2,$9,$2	# add high product limb and carry from addition
+
+ # cool down phase 0: finish the last product and return
+$LC0:	ld	$10,0($4)
+	mflo	$3
+	mfhi	$9
+	daddu	$3,$3,$2	# add old carry limb to low product limb
+	sltu	$2,$3,$2	# carry from that addition -> $2
+	dsubu	$3,$10,$3	# res limb - (low product limb + carry)
+	sgtu	$10,$3,$10	# $10 = 1 if that subtraction wrapped around
+	daddu	$2,$2,$10	# combine the addition carry and the borrow
+	sd	$3,0($4)	# store last result limb
+	j	$31		# return
+	daddu	$2,$9,$2	# add high product limb and carry from addition
+
+	.end	__mpn_submul_1
diff --git a/sysdeps/mips/mul_1.s b/sysdeps/mips/mul_1.s
new file mode 100644
index 0000000000..01327e22d8
--- /dev/null
+++ b/sysdeps/mips/mul_1.s
@@ -0,0 +1,84 @@
+ # MIPS __mpn_mul_1 -- Multiply a limb vector with a single limb and
+ # store the product in a second limb vector.
+
+ # Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	$4
+ # s1_ptr	$5
+ # size		$6
+ # s2_limb	$7
+
+	.text
+	.align	 4
+	.globl	 __mpn_mul_1
+	.ent	__mpn_mul_1
+__mpn_mul_1:			# res[i] = low limb of s1[i] * s2_limb + carry; returns final carry limb in $2
+	.set    noreorder	# branch delay slots below are filled by hand
+	.set    nomacro
+
+ # warm up phase 0: load the first s1 limb
+	lw	$8,0($5)
+
+ # warm up phase 1: start the first multiplication
+	addiu	$5,$5,4
+	multu	$8,$7		# HI:LO = s1 limb * s2_limb (unsigned)
+
+	addiu	$6,$6,-1
+	beq	$6,$0,$LC0	# size == 1: only the product in flight remains
+	 move	$2,$0		# zero cy2
+
+	addiu	$6,$6,-1
+	beq	$6,$0,$LC1	# size == 2: skip the loop
+	lw	$8,0($5)	# (delay slot) load new s1 limb as early as possible
+
+Loop:	mflo	$10		# low limb of the product started earlier
+	mfhi	$9		# high limb of that product
+	addiu	$5,$5,4
+	addu	$10,$10,$2	# add old carry limb to low product limb
+	multu	$8,$7		# start the next multiplication
+	lw	$8,0($5)	# load new s1 limb as early as possible
+	addiu	$6,$6,-1	# decrement loop counter
+	sltu	$2,$10,$2	# carry from previous addition -> $2
+	sw	$10,0($4)	# store result limb
+	addiu	$4,$4,4
+	bne	$6,$0,Loop	# should be "bnel"
+	 addu	$2,$9,$2	# add high product limb and carry from addition
+
+ # cool down phase 1: finish one product, start the last multiplication
+$LC1:	mflo	$10
+	mfhi	$9
+	addu	$10,$10,$2	# add old carry limb to low product limb
+	sltu	$2,$10,$2	# carry from that addition -> $2
+	multu	$8,$7		# last multiplication
+	sw	$10,0($4)
+	addiu	$4,$4,4
+	addu	$2,$9,$2	# add high product limb and carry from addition
+
+ # cool down phase 0: finish the last product and return
+$LC0:	mflo	$10
+	mfhi	$9
+	addu	$10,$10,$2	# add old carry limb to low product limb
+	sltu	$2,$10,$2	# carry from that addition -> $2
+	sw	$10,0($4)	# store last result limb
+	j	$31		# return
+	addu	$2,$9,$2	# add high product limb and carry from addition
+
+	.end	__mpn_mul_1
diff --git a/sysdeps/mips/rshift.s b/sysdeps/mips/rshift.s
new file mode 100644
index 0000000000..6941691373
--- /dev/null
+++ b/sysdeps/mips/rshift.s
@@ -0,0 +1,91 @@
+ # MIPS2 __mpn_rshift -- Shift a limb vector right by cnt bits.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	$4
+ # src_ptr	$5
+ # size		$6
+ # cnt		$7
+
+	.text
+	.align	2
+	.globl	__mpn_rshift
+	.ent	__mpn_rshift
+__mpn_rshift:			# shift size limbs at src_ptr right by cnt bits into res_ptr
+	.set	noreorder	# branch delay slots below are filled by hand
+	.set	nomacro
+
+	lw	$10,0($5)	# load first limb
+	subu	$13,$0,$7	# $13 = -cnt; shifts use its low 5 bits, i.e. 32-cnt for cnt in 1..31
+	addiu	$6,$6,-1	# $6 = size-1 = limbs still to be loaded
+	and	$9,$6,4-1	# number of limbs in first loop
+	beq	$9,$0,.L0	# if multiple of 4 limbs, skip first loop
+	 sll	$2,$10,$13	# (delay slot) compute function result: bits shifted out of first limb
+
+	subu	$6,$6,$9	# $6 = limbs left for the unrolled loop (a multiple of 4)
+
+.Loop0:	lw	$3,4($5)	# next higher limb
+	addiu	$4,$4,4
+	addiu	$5,$5,4
+	addiu	$9,$9,-1
+	srl	$11,$10,$7	# current limb >> cnt
+	sll	$12,$3,$13	# low cnt bits of the next higher limb, moved to the top
+	move	$10,$3		# next higher limb becomes the current one
+	or	$8,$11,$12	# combine both parts into one result limb
+	bne	$9,$0,.Loop0
+	 sw	$8,-4($4)	# (delay slot) store result limb
+
+.L0:	beq	$6,$0,.Lend	# no limbs left for the unrolled loop
+	 nop
+
+.Loop:	lw	$3,4($5)	# 4-way unrolled; $10 and $3 alternate as the current limb
+	addiu	$4,$4,16
+	addiu	$6,$6,-4
+	srl	$11,$10,$7
+	sll	$12,$3,$13
+
+	lw	$10,8($5)
+	srl	$14,$3,$7
+	or	$8,$11,$12
+	sw	$8,-16($4)	# first result limb of this round
+	sll	$9,$10,$13
+
+	lw	$3,12($5)
+	srl	$11,$10,$7
+	or	$8,$14,$9
+	sw	$8,-12($4)	# second result limb
+	sll	$12,$3,$13
+
+	lw	$10,16($5)
+	srl	$14,$3,$7
+	or	$8,$11,$12
+	sw	$8,-8($4)	# third result limb
+	sll	$9,$10,$13
+
+	addiu	$5,$5,16
+	or	$8,$14,$9
+	bgtz	$6,.Loop
+	 sw	$8,-4($4)	# (delay slot) fourth result limb
+
+.Lend:	srl	$8,$10,$7	# last limb: shift only, zeros enter at the top
+	j	$31		# return; function result is in $2
+	sw	$8,0($4)	# (delay slot) store last result limb
+	.end	__mpn_rshift
diff --git a/sysdeps/mips/sub_n.s b/sysdeps/mips/sub_n.s
new file mode 100644
index 0000000000..63f3b55354
--- /dev/null
+++ b/sysdeps/mips/sub_n.s
@@ -0,0 +1,119 @@
+ # MIPS2 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+ # store difference in a third limb vector.
+
+ # Copyright (C) 1995 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	$4
+ # s1_ptr	$5
+ # s2_ptr	$6
+ # size		$7
+
+	.text
+	.align	2
+	.globl	__mpn_sub_n
+	.ent	__mpn_sub_n
+__mpn_sub_n:			# res = s1 - s2 over size limbs; returns the final borrow (0 or 1) in $2
+	.set	noreorder	# branch delay slots below are filled by hand
+	.set	nomacro
+
+	lw	$10,0($5)	# $10 = first s1 limb
+	lw	$11,0($6)	# $11 = first s2 limb
+
+	addiu	$7,$7,-1	# $7 = size-1; the last limb is handled at .Lend
+	and	$9,$7,4-1	# number of limbs in first loop
+	beq	$9,$0,.L0	# if multiple of 4 limbs, skip first loop
+	 move	$2,$0		# (delay slot) clear the borrow
+
+	subu	$7,$7,$9	# $7 = limbs left for the unrolled loop (a multiple of 4)
+
+.Loop0:	addiu	$9,$9,-1
+	lw	$12,4($5)	# load next s1 limb early
+	addu	$11,$11,$2	# s2 limb + borrow in
+	lw	$13,4($6)	# load next s2 limb early
+	sltu	$8,$11,$2	# $8 = 1 if that addition wrapped around
+	subu	$11,$10,$11	# difference limb
+	sltu	$2,$10,$11	# $2 = 1 if the subtraction wrapped around
+	sw	$11,0($4)
+	or	$2,$2,$8	# borrow out = either wrap-around
+
+	addiu	$5,$5,4
+	addiu	$6,$6,4
+	move	$10,$12		# next limbs become the current ones
+	move	$11,$13
+	bne	$9,$0,.Loop0
+	 addiu	$4,$4,4		# (delay slot) advance res_ptr
+
+.L0:	beq	$7,$0,.Lend	# no limbs left for the unrolled loop
+	 nop
+
+.Loop:	addiu	$7,$7,-4	# 4-way unrolled; ($10,$11) and ($12,$13) alternate as current limbs
+
+	lw	$12,4($5)
+	addu	$11,$11,$2	# s2 limb + borrow in
+	lw	$13,4($6)
+	sltu	$8,$11,$2	# wrap-around of that addition
+	subu	$11,$10,$11	# difference limb
+	sltu	$2,$10,$11	# wrap-around of the subtraction
+	sw	$11,0($4)
+	or	$2,$2,$8	# borrow out
+
+	lw	$10,8($5)
+	addu	$13,$13,$2
+	lw	$11,8($6)
+	sltu	$8,$13,$2
+	subu	$13,$12,$13
+	sltu	$2,$12,$13
+	sw	$13,4($4)
+	or	$2,$2,$8
+
+	lw	$12,12($5)
+	addu	$11,$11,$2
+	lw	$13,12($6)
+	sltu	$8,$11,$2
+	subu	$11,$10,$11
+	sltu	$2,$10,$11
+	sw	$11,8($4)
+	or	$2,$2,$8
+
+	lw	$10,16($5)
+	addu	$13,$13,$2
+	lw	$11,16($6)
+	sltu	$8,$13,$2
+	subu	$13,$12,$13
+	sltu	$2,$12,$13
+	sw	$13,12($4)
+	or	$2,$2,$8
+
+	addiu	$5,$5,16
+	addiu	$6,$6,16
+
+	bne	$7,$0,.Loop
+	 addiu	$4,$4,16	# (delay slot) advance res_ptr
+
+.Lend:	addu	$11,$11,$2	# last limb: s2 limb + borrow in
+	sltu	$8,$11,$2	# wrap-around of that addition
+	subu	$11,$10,$11	# difference limb
+	sltu	$2,$10,$11	# wrap-around of the subtraction
+	sw	$11,0($4)
+	j	$31		# return
+	or	$2,$2,$8	# (delay slot) final borrow -> $2
+
+	.end	__mpn_sub_n
diff --git a/sysdeps/mips/submul_1.s b/sysdeps/mips/submul_1.s
new file mode 100644
index 0000000000..616dd1b47c
--- /dev/null
+++ b/sysdeps/mips/submul_1.s
@@ -0,0 +1,96 @@
+ # MIPS __mpn_submul_1 -- Multiply a limb vector with a single limb and
+ # subtract the product from a second limb vector.
+
+ # Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Library General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Library General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr	$4
+ # s1_ptr	$5
+ # size		$6
+ # s2_limb	$7
+
+	.text
+	.align	 4
+	.globl	 __mpn_submul_1
+	.ent	__mpn_submul_1
+__mpn_submul_1:			# res[i] -= s1[i] * s2_limb with carry; returns final carry limb in $2
+	.set    noreorder	# branch delay slots below are filled by hand
+	.set    nomacro
+
+ # warm up phase 0: load the first s1 limb
+	lw	$8,0($5)
+
+ # warm up phase 1: start the first multiplication
+	addiu	$5,$5,4
+	multu	$8,$7		# HI:LO = s1 limb * s2_limb (unsigned)
+
+	addiu	$6,$6,-1
+	beq	$6,$0,$LC0	# size == 1: only the product in flight remains
+	 move	$2,$0		# zero cy2
+
+	addiu	$6,$6,-1
+	beq	$6,$0,$LC1	# size == 2: skip the loop
+	lw	$8,0($5)	# (delay slot) load new s1 limb as early as possible
+
+Loop:	lw	$10,0($4)	# res limb to subtract from
+	mflo	$3		# low limb of the product started earlier
+	mfhi	$9		# high limb of that product
+	addiu	$5,$5,4
+	addu	$3,$3,$2	# add old carry limb to low product limb
+	multu	$8,$7		# start the next multiplication
+	lw	$8,0($5)	# load new s1 limb as early as possible
+	addiu	$6,$6,-1	# decrement loop counter
+	sltu	$2,$3,$2	# carry from previous addition -> $2
+	subu	$3,$10,$3	# res limb - (low product limb + carry)
+	sgtu	$10,$3,$10	# $10 = 1 if that subtraction wrapped around
+	addu	$2,$2,$10	# combine the addition carry and the borrow
+	sw	$3,0($4)	# store result limb
+	addiu	$4,$4,4
+	bne	$6,$0,Loop	# should be "bnel"
+	 addu	$2,$9,$2	# add high product limb and carry from addition
+
+ # cool down phase 1: finish one product, start the last multiplication
+$LC1:	lw	$10,0($4)
+	mflo	$3
+	mfhi	$9
+	addu	$3,$3,$2	# add old carry limb to low product limb
+	sltu	$2,$3,$2	# carry from that addition -> $2
+	multu	$8,$7		# last multiplication
+	subu	$3,$10,$3	# res limb - (low product limb + carry)
+	sgtu	$10,$3,$10	# $10 = 1 if that subtraction wrapped around
+	addu	$2,$2,$10	# combine the addition carry and the borrow
+	sw	$3,0($4)
+	addiu	$4,$4,4
+	addu	$2,$9,$2	# add high product limb and carry from addition
+
+ # cool down phase 0: finish the last product and return
+$LC0:	lw	$10,0($4)
+	mflo	$3
+	mfhi	$9
+	addu	$3,$3,$2	# add old carry limb to low product limb
+	sltu	$2,$3,$2	# carry from that addition -> $2
+	subu	$3,$10,$3	# res limb - (low product limb + carry)
+	sgtu	$10,$3,$10	# $10 = 1 if that subtraction wrapped around
+	addu	$2,$2,$10	# combine the addition carry and the borrow
+	sw	$3,0($4)	# store last result limb
+	j	$31		# return
+	addu	$2,$9,$2	# add high product limb and carry from addition
+
+	.end	__mpn_submul_1
diff --git a/sysdeps/rs6000/add_n.s b/sysdeps/rs6000/add_n.s
new file mode 100644
index 0000000000..34ad9e1d2d
--- /dev/null
+++ b/sysdeps/rs6000/add_n.s
@@ -0,0 +1,54 @@
+# IBM POWER __mpn_add_n -- Add two limb vectors of equal, non-zero length.
+
+# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+# License for more details.
+
+# You should have received a copy of the GNU Library General Public License
+# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr	r3
+# s1_ptr	r4
+# s2_ptr	r5
+# size		r6
+
+	.toc
+	.extern __mpn_add_n[DS]
+	.extern .__mpn_add_n
+.csect [PR]
+	.align 2
+	.globl __mpn_add_n
+	.globl .__mpn_add_n
+	.csect __mpn_add_n[DS]
+__mpn_add_n:
+	.long .__mpn_add_n, TOC[tc0], 0	# descriptor words: code address, TOC anchor, 0
+	.csect [PR]
+.__mpn_add_n:
+	mtctr	6		# copy size into CTR
+	l	8,0(4)		# load least significant s1 limb
+	l	0,0(5)		# load least significant s2 limb
+	cal	3,-4(3)		# offset res_ptr, it's updated before used
+	a	7,0,8		# add least significant limbs, set cy
+	bdz	Lend		# If done, skip loop
+Loop:	lu	8,4(4)		# load s1 limb and update s1_ptr
+	lu	0,4(5)		# load s2 limb and update s2_ptr
+	stu	7,4(3)		# store previous limb in load latency slot
+	ae	7,0,8		# add new limbs with cy, set cy
+	bdn	Loop		# decrement CTR and loop back
+Lend:	st	7,4(3)		# store ultimate result limb
+	lil	3,0		# load cy into ...
+	aze	3,3		# ... return value register
+	br			# return to caller
diff --git a/sysdeps/rs6000/addmul_1.s b/sysdeps/rs6000/addmul_1.s
new file mode 100644
index 0000000000..862b6139fe
--- /dev/null
+++ b/sysdeps/rs6000/addmul_1.s
@@ -0,0 +1,122 @@
+# IBM POWER __mpn_addmul_1 -- Multiply a limb vector with a limb and add
+# the result to a second limb vector.
+
+# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+# License for more details.
+
+# You should have received a copy of the GNU Library General Public License
+# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr	r3
+# s1_ptr	r4
+# size		r5
+# s2_limb	r6
+
+# The RS/6000 has no unsigned 32x32->64 bit multiplication instruction.  To
+# obtain that operation, we have to use the 32x32->64 signed multiplication
+# instruction, and add the appropriate compensation to the high limb of the
+# result.  We add the multiplicand if the multiplier has its most significant
+# bit set, and we add the multiplier if the multiplicand has its most
+# significant bit set.  We need to preserve the carry flag between each
+# iteration, so we have to compute the compensation carefully (the natural,
+# srai+and doesn't work).  Since the POWER architecture has a branch unit
+# we can branch in zero cycles, so that's how we perform the additions.
+
+	.toc
+	.csect .__mpn_addmul_1[PR]
+	.align 2
+	.globl __mpn_addmul_1
+	.globl .__mpn_addmul_1
+	.csect __mpn_addmul_1[DS]
+__mpn_addmul_1:
+	.long .__mpn_addmul_1[PR], TOC[tc0], 0	# descriptor words: code address, TOC anchor, 0
+	.csect .__mpn_addmul_1[PR]
+.__mpn_addmul_1:
+
+	cal	3,-4(3)		# offset res_ptr; accesses below use displacement 4
+	l	0,0(4)		# load first s1 limb
+	cmpi	0,6,0		# test sign of s2_limb for the blt below
+	mtctr	5		# copy size into CTR
+	mul	9,0,6		# signed product: high word -> r9, low word -> MQ
+	srai	7,0,31		# r7 = all ones if the s1 limb is negative, else 0
+	and	7,7,6		# r7 = s2_limb if the s1 limb is negative, else 0
+	mfmq	8		# r8 = low product word
+	cax	9,9,7		# adjust high limb for negative limb from s1
+	l	7,4(3)		# load res limb
+	a	8,8,7		# add res_limb
+	blt	Lneg		# s2_limb negative: use the loops that also add the s1 limb
+Lpos:	bdz	Lend		# only one limb: done
+
+Lploop:	lu	0,4(4)		# load next s1 limb and update s1_ptr
+	stu	8,4(3)		# store previous result limb and update res_ptr
+	cmpi	0,0,0		# test sign of the s1 limb for the bge below
+	mul	10,0,6		# signed product: high word -> r10, low word -> MQ
+	mfmq	0		# r0 = low product word
+	ae	8,0,9		# low limb + old_cy_limb + old cy
+	l	7,4(3)		# load res limb
+	aze	10,10		# propagate cy to new cy_limb
+	a	8,8,7		# add res_limb
+	bge	Lp0		# s1 limb not negative: no adjustment
+	cax	10,10,6		# adjust high limb for negative limb from s1
+Lp0:	bdz	Lend0		# done; the cy_limb is in r10
+	lu	0,4(4)		# second copy of the loop body with r9/r10 swapped
+	stu	8,4(3)
+	cmpi	0,0,0
+	mul	9,0,6
+	mfmq	0
+	ae	8,0,10		# low limb + old_cy_limb + old cy
+	l	7,4(3)
+	aze	9,9		# propagate cy to new cy_limb
+	a	8,8,7		# add res_limb
+	bge	Lp1
+	cax	9,9,6		# adjust high limb for negative limb from s1
+Lp1:	bdn	Lploop		# decrement CTR and loop back
+
+	b	Lend		# done; the cy_limb is in r9
+
+Lneg:	cax	9,9,0		# adjust high limb for negative s2_limb
+	bdz	Lend		# only one limb: done
+Lnloop:	lu	0,4(4)		# load next s1 limb and update s1_ptr
+	stu	8,4(3)		# store previous result limb and update res_ptr
+	cmpi	0,0,0		# test sign of the s1 limb for the bge below
+	mul	10,0,6		# signed product: high word -> r10, low word -> MQ
+	mfmq	7		# r7 = low product word (r0 still holds the s1 limb)
+	ae	8,7,9		# low limb + old_cy_limb + old cy
+	l	7,4(3)		# load res limb
+	ae	10,10,0		# propagate cy to new cy_limb, adding the s1 limb as well
+	a	8,8,7		# add res_limb
+	bge	Ln0		# s1 limb not negative: no adjustment
+	cax	10,10,6		# adjust high limb for negative limb from s1
+Ln0:	bdz	Lend0		# done; the cy_limb is in r10
+	lu	0,4(4)		# second copy of the loop body with r9/r10 swapped
+	stu	8,4(3)
+	cmpi	0,0,0
+	mul	9,0,6
+	mfmq	7
+	ae	8,7,10		# low limb + old_cy_limb + old cy
+	l	7,4(3)
+	ae	9,9,0		# propagate cy to new cy_limb
+	a	8,8,7		# add res_limb
+	bge	Ln1
+	cax	9,9,6		# adjust high limb for negative limb from s1
+Ln1:	bdn	Lnloop		# decrement CTR and loop back
+	b	Lend		# done; the cy_limb is in r9
+
+Lend0:	cal	9,0(10)		# copy the cy_limb from r10 to r9
+Lend:	st	8,4(3)		# store last result limb
+	aze	3,9		# return value = cy_limb + cy
+	br			# return to caller
diff --git a/sysdeps/rs6000/lshift.s b/sysdeps/rs6000/lshift.s
new file mode 100644
index 0000000000..69c7502061
--- /dev/null
+++ b/sysdeps/rs6000/lshift.s
@@ -0,0 +1,58 @@
+# IBM POWER __mpn_lshift -- Shift a limb vector left by cnt bits.
+
+# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+# License for more details.
+
+# You should have received a copy of the GNU Library General Public License
+# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr	r3
+# s_ptr		r4
+# size		r5
+# cnt		r6
+
+	.toc
+	.extern __mpn_lshift[DS]
+	.extern .__mpn_lshift
+.csect [PR]
+	.align 2
+	.globl __mpn_lshift
+	.globl .__mpn_lshift
+	.csect __mpn_lshift[DS]
+__mpn_lshift:
+	.long .__mpn_lshift, TOC[tc0], 0	# descriptor words: code address, TOC anchor, 0
+	.csect [PR]
+.__mpn_lshift:
+	sli	0,5,2		# r0 = size * 4 = byte length of the vectors
+	cax	9,3,0		# r9 = end of res
+	cax	4,4,0		# r4 = end of src
+	sfi	8,6,32		# r8 = 32 - cnt
+	mtctr	5		# put limb count in CTR loop register
+	lu	0,-4(4)		# read most significant limb
+	sre	3,0,8		# compute carry out limb, and init MQ register
+	bdz	Lend2		# if just one limb, skip loop
+	lu	0,-4(4)		# read 2:nd most significant limb
+	sreq	7,0,8		# compute most significant limb of result
+	bdz	Lend		# if just two limb, skip loop
+Loop:	lu	0,-4(4)		# load next lower limb
+	stu	7,-4(9)		# store previous result during read latency
+	sreq	7,0,8		# compute result limb
+	bdn	Loop		# loop back until CTR is zero
+Lend:	stu	7,-4(9)		# store 2:nd least significant limb
+Lend2:	sle	7,0,6		# compute least significant limb
+	st      7,-4(9)		# store it
+	br			# return to caller; carry out limb is in r3
diff --git a/sysdeps/rs6000/mul_1.s b/sysdeps/rs6000/mul_1.s
new file mode 100644
index 0000000000..f4fa894339
--- /dev/null
+++ b/sysdeps/rs6000/mul_1.s
@@ -0,0 +1,109 @@
+# IBM POWER __mpn_mul_1 -- Multiply a limb vector with a limb and store
+# the result in a second limb vector.
+
+# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+# License for more details.
+
+# You should have received a copy of the GNU Library General Public License
+# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr	r3
+# s1_ptr	r4
+# size		r5
+# s2_limb	r6
+
+# The RS/6000 has no unsigned 32x32->64 bit multiplication instruction.  To
+# obtain that operation, we have to use the 32x32->64 signed multiplication
+# instruction, and add the appropriate compensation to the high limb of the
+# result.  We add the multiplicand if the multiplier has its most significant
+# bit set, and we add the multiplier if the multiplicand has its most
+# significant bit set.  We need to preserve the carry flag between each
+# iteration, so we have to compute the compensation carefully (the natural,
+# srai+and doesn't work).  Since the POWER architecture has a branch unit
+# we can branch in zero cycles, so that's how we perform the additions.
+
+	.toc
+	.csect .__mpn_mul_1[PR]
+	.align 2
+	.globl __mpn_mul_1
+	.globl .__mpn_mul_1
+	.csect __mpn_mul_1[DS]
+__mpn_mul_1:
+	.long .__mpn_mul_1[PR], TOC[tc0], 0	# descriptor words: code address, TOC anchor, 0
+	.csect .__mpn_mul_1[PR]
+.__mpn_mul_1:
+
+	cal	3,-4(3)		# offset res_ptr; stores below use displacement 4
+	l	0,0(4)		# load first s1 limb
+	cmpi	0,6,0		# test sign of s2_limb for the blt below
+	mtctr	5		# copy size into CTR
+	mul	9,0,6		# signed product: high word -> r9, low word -> MQ
+	srai	7,0,31		# r7 = all ones if the s1 limb is negative, else 0
+	and	7,7,6		# r7 = s2_limb if the s1 limb is negative, else 0
+	mfmq	8		# r8 = low product word
+	ai	0,0,0		# reset carry
+	cax	9,9,7		# adjust high limb for negative limb from s1
+	blt	Lneg		# s2_limb negative: use the loops that also add the s1 limb
+Lpos:	bdz	Lend		# only one limb: done
+Lploop:	lu	0,4(4)		# load next s1 limb and update s1_ptr
+	stu	8,4(3)		# store previous result limb and update res_ptr
+	cmpi	0,0,0		# test sign of the s1 limb for the bge below
+	mul	10,0,6		# signed product: high word -> r10, low word -> MQ
+	mfmq	0		# r0 = low product word
+	ae	8,0,9		# low limb + old_cy_limb + old cy
+	bge	Lp0		# s1 limb not negative: no adjustment
+	cax	10,10,6		# adjust high limb for negative limb from s1
+Lp0:	bdz	Lend0		# done; the cy_limb is in r10
+	lu	0,4(4)		# second copy of the loop body with r9/r10 swapped
+	stu	8,4(3)
+	cmpi	0,0,0
+	mul	9,0,6
+	mfmq	0
+	ae	8,0,10		# low limb + old_cy_limb + old cy
+	bge	Lp1
+	cax	9,9,6		# adjust high limb for negative limb from s1
+Lp1:	bdn	Lploop		# decrement CTR and loop back
+	b	Lend		# done; the cy_limb is in r9
+
+Lneg:	cax	9,9,0		# adjust high limb for negative s2_limb
+	bdz	Lend		# only one limb: done
+Lnloop:	lu	0,4(4)		# load next s1 limb and update s1_ptr
+	stu	8,4(3)		# store previous result limb and update res_ptr
+	cmpi	0,0,0		# test sign of the s1 limb for the bge below
+	mul	10,0,6		# signed product: high word -> r10, low word -> MQ
+	cax	10,10,0		# adjust high limb for negative s2_limb
+	mfmq	0		# r0 = low product word
+	ae	8,0,9		# low limb + old_cy_limb + old cy
+	bge	Ln0		# s1 limb not negative: no adjustment
+	cax	10,10,6		# adjust high limb for negative limb from s1
+Ln0:	bdz	Lend0		# done; the cy_limb is in r10
+	lu	0,4(4)		# second copy of the loop body with r9/r10 swapped
+	stu	8,4(3)
+	cmpi	0,0,0
+	mul	9,0,6
+	cax	9,9,0		# adjust high limb for negative s2_limb
+	mfmq	0
+	ae	8,0,10		# low limb + old_cy_limb + old cy
+	bge	Ln1
+	cax	9,9,6		# adjust high limb for negative limb from s1
+Ln1:	bdn	Lnloop		# decrement CTR and loop back
+	b	Lend		# done; the cy_limb is in r9
+
+Lend0:	cal	9,0(10)		# copy the cy_limb from r10 to r9
+Lend:	st	8,4(3)		# store last result limb
+	aze	3,9		# return value = cy_limb + cy
+	br			# return to caller
diff --git a/sysdeps/rs6000/rshift.s b/sysdeps/rs6000/rshift.s
new file mode 100644
index 0000000000..6056acc753
--- /dev/null
+++ b/sysdeps/rs6000/rshift.s
@@ -0,0 +1,56 @@
+# IBM POWER __mpn_rshift -- Shift a limb vector right by cnt bits.
+
+# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+# License for more details.
+
+# You should have received a copy of the GNU Library General Public License
+# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr	r3
+# s_ptr		r4
+# size		r5
+# cnt		r6
+
+	.toc
+	.extern __mpn_rshift[DS]
+	.extern .__mpn_rshift
+.csect [PR]
+	.align 2
+	.globl __mpn_rshift
+	.globl .__mpn_rshift
+	.csect __mpn_rshift[DS]
+__mpn_rshift:
+	.long .__mpn_rshift, TOC[tc0], 0	# descriptor words: code address, TOC anchor, 0
+	.csect [PR]
+.__mpn_rshift:
+	sfi	8,6,32		# r8 = 32 - cnt
+	mtctr	5		# put limb count in CTR loop register
+	l	0,0(4)		# read least significant limb
+	ai	9,3,-4		# adjust res_ptr since it's offset in the stu:s
+	sle	3,0,8		# compute carry limb, and init MQ register
+	bdz	Lend2		# if just one limb, skip loop
+	lu	0,4(4)		# read 2:nd least significant limb
+	sleq	7,0,8		# compute least significant limb of result
+	bdz	Lend		# if just two limb, skip loop
+Loop:	lu	0,4(4)		# load next higher limb
+	stu	7,4(9)		# store previous result during read latency
+	sleq	7,0,8		# compute result limb
+	bdn	Loop		# loop back until CTR is zero
+Lend:	stu	7,4(9)		# store 2:nd most significant limb
+Lend2:	sre	7,0,6		# compute most significant limb
+	st      7,4(9)		# store it
+	br			# return to caller; carry limb is in r3
diff --git a/sysdeps/rs6000/sub_n.s b/sysdeps/rs6000/sub_n.s
new file mode 100644
index 0000000000..402fdcefc4
--- /dev/null
+++ b/sysdeps/rs6000/sub_n.s
@@ -0,0 +1,55 @@
+# IBM POWER __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+# store difference in a third limb vector.
+
+# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+# License for more details.
+
+# You should have received a copy of the GNU Library General Public License
+# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr	r3
+# s1_ptr	r4
+# s2_ptr	r5
+# size		r6
+
+	.toc
+	.extern __mpn_sub_n[DS]
+	.extern .__mpn_sub_n
+.csect [PR]
+	.align 2
+	.globl __mpn_sub_n
+	.globl .__mpn_sub_n
+	.csect __mpn_sub_n[DS]
+__mpn_sub_n:
+	.long .__mpn_sub_n, TOC[tc0], 0
+	.csect [PR]
+.__mpn_sub_n:
+	mtctr	6		# copy size into CTR
+	l	8,0(4)		# load least significant s1 limb
+	l	0,0(5)		# load least significant s2 limb
+	cal	3,-4(3)		# offset res_ptr, it's updated before used
+	sf	7,0,8		# r7 = s1 limb - s2 limb (least significant), set cy
+	bdz	Lend		# If done, skip loop
+Loop:	lu	8,4(4)		# load s1 limb and update s1_ptr
+	lu	0,4(5)		# load s2 limb and update s2_ptr
+	stu	7,4(3)		# store previous limb in load latency slot
+	sfe	7,0,8		# subtract new limbs with cy, set cy
+	bdn	Loop		# decrement CTR and loop back
+Lend:	st	7,4(3)		# store ultimate result limb
+	sfe	3,0,0		# load !cy into ...
+	sfi	3,3,0		# ... return value register
+	br			# return, r3 = borrow out (0 or 1)
diff --git a/sysdeps/rs6000/submul_1.s b/sysdeps/rs6000/submul_1.s
new file mode 100644
index 0000000000..252633261d
--- /dev/null
+++ b/sysdeps/rs6000/submul_1.s
@@ -0,0 +1,127 @@
+# IBM POWER __mpn_submul_1 -- Multiply a limb vector with a limb and subtract
+# the result from a second limb vector.
+
+# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+# License for more details.
+
+# You should have received a copy of the GNU Library General Public License
+# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr	r3
+# s1_ptr	r4
+# size		r5
+# s2_limb	r6
+
+# The RS/6000 has no unsigned 32x32->64 bit multiplication instruction.  To
+# obtain that operation, we have to use the 32x32->64 signed multiplication
+# instruction, and add the appropriate compensation to the high limb of the
+# result.  We add the multiplicand if the multiplier has its most significant
+# bit set, and we add the multiplier if the multiplicand has its most
+# significant bit set.  We need to preserve the carry flag between each
+# iteration, so we have to compute the compensation carefully (the natural,
+# srai+and doesn't work).  Since the POWER architecture has a branch unit
+# we can branch in zero cycles, so that's how we perform the additions.
+
+	.toc
+	.csect .__mpn_submul_1[PR]
+	.align 2
+	.globl __mpn_submul_1
+	.globl .__mpn_submul_1
+	.csect __mpn_submul_1[DS]
+__mpn_submul_1:
+	.long .__mpn_submul_1[PR], TOC[tc0], 0
+	.csect .__mpn_submul_1[PR]
+.__mpn_submul_1:
+
+	cal	3,-4(3)		# offset res_ptr, it is updated before use
+	l	0,0(4)		# load least significant s1 limb
+	cmpi	0,6,0		# test sign of s2_limb for the blt below
+	mtctr	5		# copy size into CTR
+	mul	9,0,6		# r9 = high product limb, MQ = low product limb
+	srai	7,0,31		# r7 = sign mask of the s1 limb
+	and	7,7,6		# r7 = s2_limb if the s1 limb is negative, else 0
+	mfmq	11		# r11 = low product limb
+	cax	9,9,7		# adjust high limb for negative limb from s1
+	l	7,4(3)		# load res_limb
+	sf	8,11,7		# r8 = res_limb - low product limb
+	a	11,8,11		# invert cy (r11 is junk)
+	blt	Lneg		# s2_limb is negative, use the Lneg loop
+Lpos:	bdz	Lend		# if just one limb, skip loop
+
+Lploop:	lu	0,4(4)		# load next s1 limb and update s1_ptr
+	stu	8,4(3)		# store previous result limb, update res_ptr
+	cmpi	0,0,0		# test sign of the s1 limb for the bge below
+	mul	10,0,6		# r10 = high product limb, MQ = low product limb
+	mfmq	0		# r0 = low product limb
+	ae	11,0,9		# low limb + old_cy_limb + old cy
+	l	7,4(3)		# load res_limb
+	aze	10,10		# propagate cy to new cy_limb
+	sf	8,11,7		# r8 = res_limb - r11
+	a	11,8,11		# invert cy (r11 is junk)
+	bge	Lp0		# s1 limb not negative, no adjustment
+	cax	10,10,6		# adjust high limb for negative limb from s1
+Lp0:	bdz	Lend0		# done, final cy_limb is in r10
+	lu	0,4(4)		# second unrolled copy, r9 and r10 swap roles
+	stu	8,4(3)
+	cmpi	0,0,0
+	mul	9,0,6
+	mfmq	0
+	ae	11,0,10
+	l	7,4(3)
+	aze	9,9
+	sf	8,11,7		# r8 = res_limb - r11
+	a	11,8,11		# invert cy (r11 is junk)
+	bge	Lp1
+	cax	9,9,6		# adjust high limb for negative limb from s1
+Lp1:	bdn	Lploop
+
+	b	Lend
+
+Lneg:	cax	9,9,0		# adjust high limb for negative s2_limb
+	bdz	Lend		# if just one limb, skip loop
+Lnloop:	lu	0,4(4)		# load next s1 limb and update s1_ptr
+	stu	8,4(3)		# store previous result limb, update res_ptr
+	cmpi	0,0,0		# test sign of the s1 limb for the bge below
+	mul	10,0,6		# r10 = high product limb, MQ = low product limb
+	mfmq	7		# r7 = low product limb (r0 keeps the s1 limb)
+	ae	11,7,9		# low limb + old_cy_limb + old cy
+	l	7,4(3)		# load res_limb
+	ae	10,10,0		# propagate cy to new cy_limb, add s1 limb
+	sf	8,11,7		# r8 = res_limb - r11
+	a	11,8,11		# invert cy (r11 is junk)
+	bge	Ln0		# s1 limb not negative, no adjustment
+	cax	10,10,6		# adjust high limb for negative limb from s1
+Ln0:	bdz	Lend0		# done, final cy_limb is in r10
+	lu	0,4(4)		# second unrolled copy, r9 and r10 swap roles
+	stu	8,4(3)
+	cmpi	0,0,0
+	mul	9,0,6
+	mfmq	7
+	ae	11,7,10
+	l	7,4(3)
+	ae	9,9,0		# propagate cy to new cy_limb, add s1 limb
+	sf	8,11,7		# r8 = res_limb - r11
+	a	11,8,11		# invert cy (r11 is junk)
+	bge	Ln1
+	cax	9,9,6		# adjust high limb for negative limb from s1
+Ln1:	bdn	Lnloop
+	b	Lend
+
+Lend0:	cal	9,0(10)		# move final cy_limb from r10 to r9
+Lend:	st	8,4(3)		# store ultimate result limb
+	aze	3,9		# return value r3 = cy_limb + cy
+	br
diff --git a/sysdeps/sparc/add_n.S b/sysdeps/sparc/add_n.S
index 3be3e39b86..13704d32d2 100644
--- a/sysdeps/sparc/add_n.S
+++ b/sysdeps/sparc/add_n.S
@@ -1,7 +1,7 @@
 ! sparc __mpn_add_n -- Add two limb vectors of the same length > 0 and store
 ! sum in a third limb vector.
 
-! Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+! Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
 
 ! This file is part of the GNU MP Library.
 
@@ -39,20 +39,25 @@ C_SYMBOL_NAME(__mpn_add_n):
 	sub	%g0,%o3,%o3
 	andcc	%o3,(16-1),%o3
 	be	Lzero
-	 nop
+	 mov	%o4,%g2			! put first s1_limb in g2 too
 
 	sll	%o3,2,%o3		! multiply by 4
 	sub	%o0,%o3,%o0		! adjust res_ptr
 	sub	%o1,%o3,%o1		! adjust s1_ptr
 	sub	%o2,%o3,%o2		! adjust s2_ptr
 
-	mov	%o4,%g2
-
+#if PIC
+	mov	%o7,%g4			! Save return address register
+0:	call	1f			! %o7 = address of this call (label 0)
+	add	%o7,Lbase-0b,%g3	! Delay slot: %g3 = run-time address of Lbase
+1:	mov	%g4,%o7			! Restore return address register
+#else
 	sethi	%hi(Lbase),%g3
 	or	%g3,%lo(Lbase),%g3
+#endif
 	sll	%o3,2,%o3		! multiply by 4
 	jmp	%g3+%o3
-	 mov	%o5,%g3
+	 mov	%o5,%g3			! put first s2_limb in g3 too
 
 Loop:	addxcc	%g2,%g3,%o3
 	add	%o1,64,%o1
diff --git a/sysdeps/sparc/sparc8/addmul_1.S b/sysdeps/sparc/sparc8/addmul_1.S
index fbaacfda4f..d1de0c3649 100644
--- a/sysdeps/sparc/sparc8/addmul_1.S
+++ b/sysdeps/sparc/sparc8/addmul_1.S
@@ -37,8 +37,15 @@ C_SYMBOL_NAME(__mpn_addmul_1):
 
 	sll	%o2,4,%g1
 	and	%g1,(4-1)<<4,%g1
+#if PIC
+	mov	%o7,%g4			! Save return address register
+0:	call	1f			! %o7 = address of this call (label 0)
+	add	%o7,LL-0b,%g3		! Delay slot: %g3 = run-time address of LL
+1:	mov	%g4,%o7			! Restore return address register
+#else
 	sethi	%hi(LL),%g3
 	or	%g3,%lo(LL),%g3
+#endif
 	jmp	%g3+%g1
 	nop
 LL:
diff --git a/sysdeps/sparc/sparc8/mul_1.S b/sysdeps/sparc/sparc8/mul_1.S
index 9c21768eb1..42717be33b 100644
--- a/sysdeps/sparc/sparc8/mul_1.S
+++ b/sysdeps/sparc/sparc8/mul_1.S
@@ -34,8 +34,15 @@
 C_SYMBOL_NAME(__mpn_mul_1):
 	sll	%o2,4,%g1
 	and	%g1,(4-1)<<4,%g1
+#if PIC
+	mov	%o7,%g4			! Save return address register
+0:	call	1f			! %o7 = address of this call (label 0)
+	add	%o7,LL-0b,%g3		! Delay slot: %g3 = run-time address of LL
+1:	mov	%g4,%o7			! Restore return address register
+#else
 	sethi	%hi(LL),%g3
 	or	%g3,%lo(LL),%g3
+#endif
 	jmp	%g3+%g1
 	ld	[%o1+0],%o4	! 1
 LL:
diff --git a/sysdeps/sparc/sub_n.S b/sysdeps/sparc/sub_n.S
index 7a167b2ac1..6264344009 100644
--- a/sysdeps/sparc/sub_n.S
+++ b/sysdeps/sparc/sub_n.S
@@ -1,7 +1,7 @@
 ! sparc __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
 ! store difference in a third limb vector.
 
-! Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+! Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
 
 ! This file is part of the GNU MP Library.
 
@@ -39,20 +39,25 @@ C_SYMBOL_NAME(__mpn_sub_n):
 	sub	%g0,%o3,%o3
 	andcc	%o3,(16-1),%o3
 	be	Lzero
-	 nop
+	 mov	%o4,%g2			! put first s1_limb in g2 too
 
 	sll	%o3,2,%o3		! multiply by 4
 	sub	%o0,%o3,%o0		! adjust res_ptr
 	sub	%o1,%o3,%o1		! adjust s1_ptr
 	sub	%o2,%o3,%o2		! adjust s2_ptr
 
-	mov	%o4,%g2
-
+#if PIC
+	mov	%o7,%g4			! Save return address register
+0:	call	1f			! %o7 = address of this call (label 0)
+	add	%o7,Lbase-0b,%g3	! Delay slot: %g3 = run-time address of Lbase
+1:	mov	%g4,%o7			! Restore return address register
+#else
 	sethi	%hi(Lbase),%g3
 	or	%g3,%lo(Lbase),%g3
+#endif
 	sll	%o3,2,%o3		! multiply by 4
 	jmp	%g3+%o3
-	 mov	%o5,%g3
+	 mov	%o5,%g3			! put first s2_limb in g3 too
 
 Loop:	subxcc	%g2,%g3,%o3
 	add	%o1,64,%o1
diff --git a/sysdeps/unix/sysv/linux/Dist b/sysdeps/unix/sysv/linux/Dist
index db5ff9596a..d6124bd2ba 100644
--- a/sysdeps/unix/sysv/linux/Dist
+++ b/sysdeps/unix/sysv/linux/Dist
@@ -1,2 +1,3 @@
 sys/socketcall.h
 sys/timex.h
+nfs/nfs.h
diff --git a/sysdeps/unix/sysv/linux/Makefile b/sysdeps/unix/sysv/linux/Makefile
index 6e1dd8ccb2..fcacc53993 100644
--- a/sysdeps/unix/sysv/linux/Makefile
+++ b/sysdeps/unix/sysv/linux/Makefile
@@ -20,7 +20,11 @@ sysdep_routines := $(sysdep_routines) ipc
 endif
 
 ifeq ($(subdir), socket)
-headers += sys/socketcall.h 
+headers += sys/socketcall.h
+endif
+
+ifeq ($(subdir), sunrpc)
+headers += nfs/nfs.h
 endif
 
 config-LDFLAGS = -Wl,-dynamic-linker=/lib/ld-gnu.so.1
diff --git a/sysdeps/unix/sysv/linux/i386/sysdep.h b/sysdeps/unix/sysv/linux/i386/sysdep.h
index 7fe4d414e3..a40ca86e40 100644
--- a/sysdeps/unix/sysv/linux/i386/sysdep.h
+++ b/sysdeps/unix/sysv/linux/i386/sysdep.h
@@ -93,43 +93,61 @@ Cambridge, MA 02139, USA.  */
    (2 * movl is less expensive than pushl + popl).
 
    Second unlike for the other registers we don't save the content of
-   %ecx and %edx when we have than 1 and 2 registers resp.  */
+   %ecx and %edx when we have more than 1 and 2 arguments resp.
+
+   The code below might look a bit long but we have to take care of
+   the pipelined processors (i586 and up).  Here the `pushl' and `popl'
+   instructions are marked as NP (not pairable) but the exception is
+   two consecutive instructions of this kind.  This gives no penalty on
+   i386 and i486 processors though.  */
 
 #undef	DO_CALL
 #define DO_CALL(args)					      		      \
+    PUSHARGS_##args							      \
     DOARGS_##args							      \
-    int $0x80;								      \
-    UNDOARGS_##args
+    int $0x80								      \
+    POPARGS_##args
 
+#define PUSHARGS_0	/* No arguments to push.  */
 #define	DOARGS_0	/* No arguments to frob.  */
-#define	UNDOARGS_0	/* No arguments to unfrob.  */
-#define	_DOARGS_0(n)	/* No arguments to frob.  */
-#define	_UNDOARGS_0	/* No arguments to unfrob.  */
-
-#define	DOARGS_1	movl %ebx, %edx; movl 4(%esp), %ebx; DOARGS_0
-#define	UNDOARGS_1	UNDOARGS_0; movl %edx, %ebx
-#define	_DOARGS_1(n)	pushl %ebx; movl n+4(%esp), %ebx; _DOARGS_0 (n)
-#define	_UNDOARGS_1	_UNDOARGS_0; popl %ebx
-
-#define	DOARGS_2	movl 8(%esp), %ecx; DOARGS_1
-#define	UNDOARGS_2	UNDOARGS_1
+#define	POPARGS_0	/* No arguments to pop.  */
+#define	_PUSHARGS_0	/* No arguments to push.  */
+#define _DOARGS_0(n)	/* No arguments to frob.  */
+#define	_POPARGS_0	/* No arguments to pop.  */
+
+#define PUSHARGS_1	movl %ebx, %edx; PUSHARGS_0
+#define	DOARGS_1	_DOARGS_1 (4)
+#define	POPARGS_1	POPARGS_0; movl %edx, %ebx
+#define	_PUSHARGS_1	pushl %ebx; _PUSHARGS_0
+#define _DOARGS_1(n)	movl n(%esp), %ebx; _DOARGS_0(n-4)
+#define	_POPARGS_1	_POPARGS_0; popl %ebx
+
+#define PUSHARGS_2	PUSHARGS_1
+#define	DOARGS_2	_DOARGS_2 (8)
+#define	POPARGS_2	POPARGS_1
+#define _PUSHARGS_2	_PUSHARGS_1
 #define	_DOARGS_2(n)	movl n(%esp), %ecx; _DOARGS_1 (n-4)
-#define	_UNDOARGS_2	_UNDOARGS_1
+#define	_POPARGS_2	_POPARGS_1
 
-#define DOARGS_3	_DOARGS_3 (12)
-#define UNDOARGS_3	_UNDOARGS_3
+#define PUSHARGS_3	_PUSHARGS_3	/* %edx carries arg 3, so %ebx is pushed.  */
+#define DOARGS_3	_DOARGS_3 (16)	/* Offset of arg 3 after one pushl.  */
+#define POPARGS_3	_POPARGS_3
+#define _PUSHARGS_3	_PUSHARGS_2
 #define _DOARGS_3(n)	movl n(%esp), %edx; _DOARGS_2 (n-4)
-#define _UNDOARGS_3	_UNDOARGS_2
-
-#define DOARGS_4	_DOARGS_4 (16)
-#define UNDOARGS_4	_UNDOARGS_4
-#define _DOARGS_4(n)	pushl %esi; movl n+4(%esp), %esi; _DOARGS_3 (n)
-#define _UNDOARGS_4	_UNDOARGS_3; popl %esi
-
-#define DOARGS_5	_DOARGS_5 (20)
-#define UNDOARGS_5	_UNDOARGS_5
-#define _DOARGS_5(n)	pushl %edi; movl n+4(%esp), %edi; _DOARGS_4 (n)
-#define _UNDOARGS_5	_UNDOARGS_4; popl %edi
-
+#define _POPARGS_3	_POPARGS_2
+
+#define PUSHARGS_4	_PUSHARGS_4
+#define DOARGS_4	_DOARGS_4 (24)
+#define POPARGS_4	_POPARGS_4
+#define _PUSHARGS_4	pushl %esi; _PUSHARGS_3
+#define _DOARGS_4(n)	movl n(%esp), %esi; _DOARGS_3 (n-4)
+#define _POPARGS_4	_POPARGS_3; popl %esi
+
+#define PUSHARGS_5	_PUSHARGS_5
+#define DOARGS_5	_DOARGS_5 (32)
+#define POPARGS_5	_POPARGS_5
+#define _PUSHARGS_5	pushl %edi; _PUSHARGS_4
+#define _DOARGS_5(n)	movl n(%esp), %edi; _DOARGS_4 (n-4)
+#define _POPARGS_5	_POPARGS_4; popl %edi
 
 #endif	/* ASSEMBLER */
diff --git a/sysdeps/unix/sysv/linux/local_lim.h b/sysdeps/unix/sysv/linux/local_lim.h
index bfc65bd6fd..a1c81d87d9 100644
--- a/sysdeps/unix/sysv/linux/local_lim.h
+++ b/sysdeps/unix/sysv/linux/local_lim.h
@@ -1,6 +1,6 @@
-/* Minimum guaranteed maximum values for system limits.  Hurd version.
+/* Minimum guaranteed maximum values for system limits.  Linux version.
 
-Copyright (C) 1993, 1994 Free Software Foundation, Inc.
+Copyright (C) 1993, 1994, 1995 Free Software Foundation, Inc.
 This file is part of the GNU C Library.
 
 The GNU C Library is free software; you can redistribute it and/or
@@ -18,14 +18,5 @@ License along with the GNU C Library; see the file COPYING.LIB.  If
 not, write to the Free Software Foundation, Inc., 675 Mass Ave,
 Cambridge, MA 02139, USA.  */
 
-/* Linux has a fixed limit of supplementary groups allocated with a
-   process.  This value is determined by the size of the `groups'
-   member of the `task_struct' structure in <linux/sched.h>.  */
-   
-#define NGROUPS_MAX	32
-
-
-/* Maximum size of file names.  Not all file system types support
-   this size but it is only a maximum value.  */
-
-#define NAME_MAX	255
+/* The kernel sources contain a file with all the needed information.  */
+#include <linux/limits.h>
diff --git a/sysdeps/unix/sysv/linux/nfs/nfs.h b/sysdeps/unix/sysv/linux/nfs/nfs.h
new file mode 100644
index 0000000000..61e4b656d7
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/nfs/nfs.h
@@ -0,0 +1 @@
+#include <linux/nfs.h>
diff --git a/sysdeps/unix/sysv/linux/sys/param.h b/sysdeps/unix/sysv/linux/sys/param.h
index 652605e92a..a2d4984166 100644
--- a/sysdeps/unix/sysv/linux/sys/param.h
+++ b/sysdeps/unix/sysv/linux/sys/param.h
@@ -1,3 +1,21 @@
+/* Copyright (C) 1995 Free Software Foundation, Inc.
+This file is part of the GNU C Library.
+
+The GNU C Library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+The GNU C Library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with the GNU C Library; see the file COPYING.LIB.  If
+not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA.  */
+
 #ifndef _SYS_PARAM_H
 #define _SYS_PARAM_H
 
@@ -7,26 +25,21 @@
 
 #include <sys/types.h>
 
-/* Don't change it. H.J. */
-#ifdef OLD_LINUX
-#undef	MAXHOSTNAMELEN
-#define MAXHOSTNAMELEN 	8	/* max length of hostname */
-#endif
 
 #ifndef howmany
-#define howmany(x, y)	(((x)+((y)-1))/(y))
+# define howmany(x, y)	(((x)+((y)-1))/(y))
 #endif
 
 #ifndef roundup
-#define roundup(x, y)	((((x)+((y)-1))/(y))*(y))
+# define roundup(x, y)	((((x)+((y)-1))/(y))*(y))
 #endif
 
 #define MAXPATHLEN      PATH_MAX
 #define NOFILE          OPEN_MAX
 
 /*  Following the information of some of the kernel people I here assume
- *  that block size (i.e. the value of stat.st_blocks) for all filesystem
- *  is 512 bytes.  If not tell me or HJ.  -- Uli */
+    that block size (i.e. the value of stat.st_blocks) for all filesystem
+    is 512 bytes.  If not tell HJ, Roland, or me.  -- drepper */
 #define DEV_BSIZE       512
 
 #endif
diff --git a/sysdeps/vax/add_n.s b/sysdeps/vax/add_n.s
new file mode 100644
index 0000000000..c89b226051
--- /dev/null
+++ b/sysdeps/vax/add_n.s
@@ -0,0 +1,47 @@
+# VAX __mpn_add_n -- Add two limb vectors of the same length > 0 and store
+# sum in a third limb vector.
+
+# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+# License for more details.
+
+# You should have received a copy of the GNU Library General Public License
+# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr	4(ap)
+# s1_ptr	8(ap)
+# s2_ptr	12(ap)
+# size		16(ap)
+
+.text
+	.align 1
+.globl ___mpn_add_n
+___mpn_add_n:
+	.word	0x0		# entry mask: no registers saved
+	movl	16(ap),r0	# r0 = size (loop counter)
+	movl	12(ap),r1	# r1 = s2_ptr
+	movl	8(ap),r2	# r2 = s1_ptr
+	movl	4(ap),r3	# r3 = res_ptr
+	subl2	r4,r4		# clear r4 and the carry flag
+
+Loop:
+	movl	(r2)+,r4	# fetch next s1 limb
+	adwc	(r1)+,r4	# add s2 limb and incoming carry
+	movl	r4,(r3)+	# store sum limb
+	jsobgtr	r0,Loop		# decrement size, loop while > 0
+
+	adwc	r0,r0		# r0 is 0 here, so r0 = carry out
+	ret
diff --git a/sysdeps/vax/addmul_1.s b/sysdeps/vax/addmul_1.s
new file mode 100644
index 0000000000..8e83204b81
--- /dev/null
+++ b/sysdeps/vax/addmul_1.s
@@ -0,0 +1,125 @@
+# VAX __mpn_addmul_1 -- Multiply a limb vector with a limb and add
+# the result to a second limb vector.
+
+# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+# License for more details.
+
+# You should have received a copy of the GNU Library General Public License
+# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr	4(ap)
+# s1_ptr	8(ap)
+# size		12(ap)
+# s2_limb	16(ap)
+
+.text
+	.align 1
+.globl ___mpn_addmul_1
+___mpn_addmul_1:
+	.word	0xfc0		# entry mask: save r6-r11
+	movl	12(ap),r4	# r4 = size
+	movl	8(ap),r8	# r8 = s1_ptr
+	movl	4(ap),r9	# r9 = res_ptr
+	movl	16(ap),r6	# r6 = s2_limb, sets N for the jlss below
+	jlss	s2_big		# s2_limb has its most significant bit set
+
+	clrl	r3		# carry limb for the mid-loop entry at L1
+	incl	r4
+	ashl	$-1,r4,r7	# r7 = (size + 1) / 2, two limbs per iteration
+	jlbc	r4,L1		# odd size: enter the loop at its second half
+	clrl	r11		# carry limb for the entry at Loop1
+
+# Loop for S2_LIMB < 0x80000000
+Loop1:	movl	(r8)+,r1	# fetch s1 limb, N is set if it is negative
+	jlss	L1n0
+	emul	r1,r6,$0,r2	# r3:r2 = signed product
+	addl2	r11,r2		# add carry limb to low product limb
+	adwc	$0,r3		# propagate carry into high limb
+	addl2	r2,(r9)+	# add low limb to the result limb
+	adwc	$0,r3		# propagate carry into high limb
+L1:	movl	(r8)+,r1	# fetch s1 limb, N is set if it is negative
+	jlss	L1n1
+L1p1:	emul	r1,r6,$0,r10	# r11:r10 = signed product
+	addl2	r3,r10
+	adwc	$0,r11
+	addl2	r10,(r9)+
+	adwc	$0,r11
+
+	jsobgtr	r7,Loop1
+	movl	r11,r0		# return the final carry limb
+	ret
+
+L1n0:	emul	r1,r6,$0,r2
+	addl2	r11,r2
+	adwc	r6,r3		# carry plus s2_limb, adjusts for negative s1 limb
+	addl2	r2,(r9)+
+	adwc	$0,r3
+	movl	(r8)+,r1
+	jgeq	L1p1
+L1n1:	emul	r1,r6,$0,r10
+	addl2	r3,r10
+	adwc	r6,r11		# carry plus s2_limb, adjusts for negative s1 limb
+	addl2	r10,(r9)+
+	adwc	$0,r11
+
+	jsobgtr	r7,Loop1
+	movl	r11,r0		# return the final carry limb
+	ret
+
+
+s2_big:	clrl	r3		# carry limb for the mid-loop entry at L2
+	incl	r4
+	ashl	$-1,r4,r7	# r7 = (size + 1) / 2, two limbs per iteration
+	jlbc	r4,L2		# odd size: enter the loop at its second half
+	clrl	r11		# carry limb for the entry at Loop2
+
+# Loop for S2_LIMB >= 0x80000000
+Loop2:	movl	(r8)+,r1	# fetch s1 limb, N is set if it is negative
+	jlss	L2n0
+	emul	r1,r6,$0,r2
+	addl2	r11,r2
+	adwc	r1,r3		# carry plus s1 limb, adjusts for negative s2_limb
+	addl2	r2,(r9)+
+	adwc	$0,r3
+L2:	movl	(r8)+,r1
+	jlss	L2n1
+L2p1:	emul	r1,r6,$0,r10
+	addl2	r3,r10
+	adwc	r1,r11		# carry plus s1 limb, adjusts for negative s2_limb
+	addl2	r10,(r9)+
+	adwc	$0,r11
+
+	jsobgtr	r7,Loop2
+	movl	r11,r0		# return the final carry limb
+	ret
+
+L2n0:	emul	r1,r6,$0,r2
+	addl2	r11,r2
+	adwc	r6,r3		# both operands negative: add s2_limb here ...
+	addl2	r2,(r9)+
+	adwc	r1,r3		# ... and the s1 limb here
+	movl	(r8)+,r1
+	jgeq	L2p1
+L2n1:	emul	r1,r6,$0,r10
+	addl2	r3,r10
+	adwc	r6,r11		# both operands negative: add s2_limb here ...
+	addl2	r10,(r9)+
+	adwc	r1,r11		# ... and the s1 limb here
+
+	jsobgtr	r7,Loop2
+	movl	r11,r0		# return the final carry limb
+	ret
diff --git a/sysdeps/vax/gmp-mparam.h b/sysdeps/vax/gmp-mparam.h
new file mode 100644
index 0000000000..687f12aa35
--- /dev/null
+++ b/sysdeps/vax/gmp-mparam.h
@@ -0,0 +1,28 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Library General Public License as published by
+the Free Software Foundation; either version 2 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+License for more details.
+
+You should have received a copy of the GNU Library General Public License
+along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
+
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
+#define BITS_PER_LONGINT 32
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+#define IEEE_DOUBLE_BIG_ENDIAN 0
diff --git a/sysdeps/vax/mul_1.s b/sysdeps/vax/mul_1.s
new file mode 100644
index 0000000000..3fe375bacf
--- /dev/null
+++ b/sysdeps/vax/mul_1.s
@@ -0,0 +1,122 @@
+# VAX __mpn_mul_1 -- Multiply a limb vector with a limb and store
+# the result in a second limb vector.
+
+# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+# License for more details.
+
+# You should have received a copy of the GNU Library General Public License
+# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr	4(ap)
+# s1_ptr	8(ap)
+# size		12(ap)
+# s2_limb	16(ap)
+
+.text
+	.align 1
+.globl ___mpn_mul_1
+___mpn_mul_1:
+	.word	0xfc0		# entry mask: save r6-r11
+	movl	12(ap),r4	# r4 = size
+	movl	8(ap),r8	# r8 = s1_ptr
+	movl	4(ap),r9	# r9 = res_ptr
+	movl	16(ap),r6	# r6 = s2_limb, sets N for the jlss below
+	jlss	s2_big		# s2_limb has its most significant bit set
+
+# One might want to combine the addl2 and the store below, but that
+# is actually just slower according to my timing tests.  (VAX 3600)
+
+	clrl	r3		# carry limb for the mid-loop entry at L1
+	incl	r4
+	ashl	$-1,r4,r7	# r7 = (size + 1) / 2, two limbs per iteration
+	jlbc	r4,L1		# odd size: enter the loop at its second half
+	clrl	r11		# carry limb for the entry at Loop1
+
+# Loop for S2_LIMB < 0x80000000
+Loop1:	movl	(r8)+,r1	# fetch s1 limb, N is set if it is negative
+	jlss	L1n0
+	emul	r1,r6,$0,r2	# r3:r2 = signed product
+	addl2	r11,r2		# add carry limb to low product limb
+	adwc	$0,r3		# propagate carry into high limb
+	movl	r2,(r9)+	# store result limb
+L1:	movl	(r8)+,r1	# fetch s1 limb, N is set if it is negative
+	jlss	L1n1
+L1p1:	emul	r1,r6,$0,r10	# r11:r10 = signed product
+	addl2	r3,r10
+	adwc	$0,r11
+	movl	r10,(r9)+
+
+	jsobgtr	r7,Loop1
+	movl	r11,r0		# return the final carry limb
+	ret
+
+L1n0:	emul	r1,r6,$0,r2
+	addl2	r11,r2
+	adwc	r6,r3		# carry plus s2_limb, adjusts for negative s1 limb
+	movl	r2,(r9)+
+	movl	(r8)+,r1
+	jgeq	L1p1
+L1n1:	emul	r1,r6,$0,r10
+	addl2	r3,r10
+	adwc	r6,r11		# carry plus s2_limb, adjusts for negative s1 limb
+	movl	r10,(r9)+
+
+	jsobgtr	r7,Loop1
+	movl	r11,r0		# return the final carry limb
+	ret
+
+
+s2_big:	clrl	r3		# carry limb for the mid-loop entry at L2
+	incl	r4
+	ashl	$-1,r4,r7	# r7 = (size + 1) / 2, two limbs per iteration
+	jlbc	r4,L2		# odd size: enter the loop at its second half
+	clrl	r11		# carry limb for the entry at Loop2
+
+# Loop for S2_LIMB >= 0x80000000
+Loop2:	movl	(r8)+,r1	# fetch s1 limb, N is set if it is negative
+	jlss	L2n0
+	emul	r1,r6,$0,r2
+	addl2	r11,r2
+	adwc	r1,r3		# carry plus s1 limb, adjusts for negative s2_limb
+	movl	r2,(r9)+
+L2:	movl	(r8)+,r1
+	jlss	L2n1
+L2p1:	emul	r1,r6,$0,r10
+	addl2	r3,r10
+	adwc	r1,r11		# carry plus s1 limb, adjusts for negative s2_limb
+	movl	r10,(r9)+
+
+	jsobgtr	r7,Loop2
+	movl	r11,r0		# return the final carry limb
+	ret
+
+L2n0:	emul	r1,r6,$0,r2
+	addl2	r1,r3		# both operands negative: add the s1 limb ...
+	addl2	r11,r2
+	adwc	r6,r3		# ... and s2_limb plus carry
+	movl	r2,(r9)+
+	movl	(r8)+,r1
+	jgeq	L2p1
+L2n1:	emul	r1,r6,$0,r10
+	addl2	r1,r11		# both operands negative: add the s1 limb ...
+	addl2	r3,r10
+	adwc	r6,r11		# ... and s2_limb plus carry
+	movl	r10,(r9)+
+
+	jsobgtr	r7,Loop2
+	movl	r11,r0		# return the final carry limb
+	ret
diff --git a/sysdeps/vax/sub_n.s b/sysdeps/vax/sub_n.s
new file mode 100644
index 0000000000..300b4dee8f
--- /dev/null
+++ b/sysdeps/vax/sub_n.s
@@ -0,0 +1,47 @@
+# VAX __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and store
+# difference in a third limb vector.
+
+# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+# License for more details.
+
+# You should have received a copy of the GNU Library General Public License
+# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr	4(ap)
+# s1_ptr	8(ap)
+# s2_ptr	12(ap)
+# size		16(ap)
+
+.text
+	.align 1
+.globl ___mpn_sub_n
+___mpn_sub_n:
+	.word	0x0		# entry mask: no registers saved
+	movl	16(ap),r0	# r0 = size (loop counter)
+	movl	12(ap),r1	# r1 = s2_ptr
+	movl	8(ap),r2	# r2 = s1_ptr
+	movl	4(ap),r3	# r3 = res_ptr
+	subl2	r4,r4		# clear r4 and the carry (borrow) flag
+
+Loop:
+	movl	(r2)+,r4	# fetch next s1 limb
+	sbwc	(r1)+,r4	# subtract s2 limb and incoming borrow
+	movl	r4,(r3)+	# store difference limb
+	jsobgtr	r0,Loop		# decrement size, loop while > 0
+
+	adwc	r0,r0		# r0 is 0 here, so r0 = borrow out
+	ret
diff --git a/sysdeps/vax/submul_1.s b/sysdeps/vax/submul_1.s
new file mode 100644
index 0000000000..875cbfd651
--- /dev/null
+++ b/sysdeps/vax/submul_1.s
@@ -0,0 +1,125 @@
+# VAX __mpn_submul_1 -- Multiply a limb vector with a limb and subtract
+# the result from a second limb vector.
+
+# Copyright (C) 1992, 1994 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+# License for more details.
+
+# You should have received a copy of the GNU Library General Public License
+# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr	4(ap)
+# s1_ptr	8(ap)
+# size		12(ap)
+# s2_limb	16(ap)
+
+.text
+	.align 1
+.globl ___mpn_submul_1
+___mpn_submul_1:
+	.word	0xfc0		# entry mask: save r6-r11
+	movl	12(ap),r4	# r4 = size
+	movl	8(ap),r8	# r8 = s1_ptr
+	movl	4(ap),r9	# r9 = res_ptr
+	movl	16(ap),r6	# r6 = s2_limb, sets N for the jlss below
+	jlss	s2_big		# s2_limb has its most significant bit set
+
+	clrl	r3		# carry limb for the mid-loop entry at L1
+	incl	r4
+	ashl	$-1,r4,r7	# r7 = (size + 1) / 2, two limbs per iteration
+	jlbc	r4,L1		# odd size: enter the loop at its second half
+	clrl	r11		# carry limb for the entry at Loop1
+
+# Loop for S2_LIMB < 0x80000000
+Loop1:	movl	(r8)+,r1	# fetch s1 limb, N is set if it is negative
+	jlss	L1n0
+	emul	r1,r6,$0,r2	# r3:r2 = signed product
+	addl2	r11,r2		# add carry limb to low product limb
+	adwc	$0,r3		# propagate carry into high limb
+	subl2	r2,(r9)+	# subtract low limb from the result limb
+	adwc	$0,r3		# propagate borrow into high limb
+L1:	movl	(r8)+,r1	# fetch s1 limb, N is set if it is negative
+	jlss	L1n1
+L1p1:	emul	r1,r6,$0,r10	# r11:r10 = signed product
+	addl2	r3,r10
+	adwc	$0,r11
+	subl2	r10,(r9)+
+	adwc	$0,r11
+
+	jsobgtr	r7,Loop1
+	movl	r11,r0		# return the final carry limb
+	ret
+
+L1n0:	emul	r1,r6,$0,r2
+	addl2	r11,r2
+	adwc	r6,r3		# carry plus s2_limb, adjusts for negative s1 limb
+	subl2	r2,(r9)+
+	adwc	$0,r3
+	movl	(r8)+,r1
+	jgeq	L1p1
+L1n1:	emul	r1,r6,$0,r10
+	addl2	r3,r10
+	adwc	r6,r11		# carry plus s2_limb, adjusts for negative s1 limb
+	subl2	r10,(r9)+
+	adwc	$0,r11
+
+	jsobgtr	r7,Loop1
+	movl	r11,r0		# return the final carry limb
+	ret
+
+
+s2_big:	clrl	r3		# carry limb for the mid-loop entry at L2
+	incl	r4
+	ashl	$-1,r4,r7	# r7 = (size + 1) / 2, two limbs per iteration
+	jlbc	r4,L2		# odd size: enter the loop at its second half
+	clrl	r11		# carry limb for the entry at Loop2
+
+# Loop for S2_LIMB >= 0x80000000
+Loop2:	movl	(r8)+,r1	# fetch s1 limb, N is set if it is negative
+	jlss	L2n0
+	emul	r1,r6,$0,r2
+	addl2	r11,r2
+	adwc	r1,r3		# carry plus s1 limb, adjusts for negative s2_limb
+	subl2	r2,(r9)+
+	adwc	$0,r3
+L2:	movl	(r8)+,r1
+	jlss	L2n1
+L2p1:	emul	r1,r6,$0,r10
+	addl2	r3,r10
+	adwc	r1,r11		# carry plus s1 limb, adjusts for negative s2_limb
+	subl2	r10,(r9)+
+	adwc	$0,r11
+
+	jsobgtr	r7,Loop2
+	movl	r11,r0		# return the final carry limb
+	ret
+
+L2n0:	emul	r1,r6,$0,r2
+	addl2	r11,r2
+	adwc	r6,r3		# both operands negative: add s2_limb here ...
+	subl2	r2,(r9)+
+	adwc	r1,r3		# ... and the s1 limb here
+	movl	(r8)+,r1
+	jgeq	L2p1
+L2n1:	emul	r1,r6,$0,r10
+	addl2	r3,r10
+	adwc	r6,r11		# both operands negative: add s2_limb here ...
+	subl2	r10,(r9)+
+	adwc	r1,r11		# ... and the s1 limb here
+
+	jsobgtr	r7,Loop2
+	movl	r11,r0		# return the final carry limb
+	ret
diff --git a/sysdeps/z8000/add_n.s b/sysdeps/z8000/add_n.s
new file mode 100644
index 0000000000..21efaf5714
--- /dev/null
+++ b/sysdeps/z8000/add_n.s
@@ -0,0 +1,52 @@
+! Z8000 __mpn_add_n -- Add two limb vectors of equal, non-zero length.
+
+! Copyright (C) 1993, 1994 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Library General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+! License for more details.
+
+! You should have received a copy of the GNU Library General Public License
+! along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+! the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr	r7
+! s1_ptr	r6
+! s2_ptr	r5
+! size		r4
+
+! If we are really crazy, we can use push to write a few result words
+! backwards, using push just because it is faster than reg+disp.  We'd
+! then add 2x the number of words written to r7...
+
+	unseg
+	.text
+	even
+	global ___mpn_add_n
+___mpn_add_n:
+	pop	r0,@r6		! fetch first s1 limb, advance s1_ptr
+	pop	r1,@r5		! fetch first s2 limb, advance s2_ptr
+	add	r0,r1		! add least significant limbs, set cy
+	ld	@r7,r0		! store first result limb
+	dec	r4		! one limb done
+	jr	eq,Lend		! if just one limb, skip loop
+Loop:	pop	r0,@r6		! fetch next s1 limb
+	pop	r1,@r5		! fetch next s2 limb
+	adc	r0,r1		! add limbs with cy, set cy
+	inc	r7,#2		! advance res_ptr to the next limb
+	ld	@r7,r0		! store result limb
+	dec	r4
+	jr	ne,Loop		! loop until size reaches zero
+Lend:	ld	r2,r4		! use 0 already in r4
+	adc	r2,r2		! r2 = carry out
+	ret	t
diff --git a/sysdeps/z8000/mul_1.s b/sysdeps/z8000/mul_1.s
new file mode 100644
index 0000000000..2075225d11
--- /dev/null
+++ b/sysdeps/z8000/mul_1.s
@@ -0,0 +1,67 @@
+! Z8000 __mpn_mul_1 -- Multiply a limb vector with a limb and store
+! the result in a second limb vector.
+
+! Copyright (C) 1993, 1994 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Library General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+! License for more details.
+
+! You should have received a copy of the GNU Library General Public License
+! along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+! the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr	r7
+! s1_ptr	r6
+! size		r5
+! s2_limb	r4
+
+	unseg
+	.text
+	even
+	global ___mpn_mul_1
+___mpn_mul_1:
+	sub	r2,r2		! zero carry limb
+	and	r4,r4		! set sign flag from s2_limb
+	jr	mi,Lneg		! separate loop when s2_limb has its msb set
+
+Lpos:	pop	r1,@r6		! fetch s1 limb, advance s1_ptr
+	ld	r9,r1		! copy limb into the low half of rr8
+	mult	rr8,r4		! rr8 = signed product (r8 high, r9 low)
+	and	r1,r1		! set sign flag from the loaded limb
+	jr	mi,Lp		! branch if loaded limb's msb is set
+	add	r8,r4		! hi_limb += sign_comp2; NOTE(review): skipped when msb is set -- confirm polarity
+Lp:	add	r9,r2		! lo_limb += cy_limb
+	xor	r2,r2		! clear r2 ahead of the adc that picks up the carry
+	adc	r2,r8		! new cy_limb = hi_limb + carry
+	ld	@r7,r9		! store result limb
+	inc	r7,#2		! advance res_ptr
+	dec	r5
+	jr	ne,Lpos		! loop until size reaches zero
+	ret t
+
+Lneg:	pop	r1,@r6		! fetch s1 limb, advance s1_ptr
+	ld	r9,r1		! copy limb into the low half of rr8
+	mult	rr8,r4		! rr8 = signed product (r8 high, r9 low)
+	add	r8,r1		! hi_limb += sign_comp1
+	and	r1,r1		! set sign flag from the loaded limb
+	jr	mi,Ln		! branch if loaded limb's msb is set
+	add	r8,r4		! hi_limb += sign_comp2; NOTE(review): skipped when msb is set -- confirm polarity
+Ln:	add	r9,r2		! lo_limb += cy_limb
+	xor	r2,r2		! clear r2 ahead of the adc that picks up the carry
+	adc	r2,r8		! new cy_limb = hi_limb + carry
+	ld	@r7,r9		! store result limb
+	inc	r7,#2		! advance res_ptr
+	dec	r5
+	jr	ne,Lneg		! loop until size reaches zero
+	ret t
diff --git a/sysdeps/z8000/sub_n.s b/sysdeps/z8000/sub_n.s
new file mode 100644
index 0000000000..f75ef22d04
--- /dev/null
+++ b/sysdeps/z8000/sub_n.s
@@ -0,0 +1,53 @@
+! Z8000 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+! store difference in a third limb vector.
+
+! Copyright (C) 1993, 1994 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Library General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+! License for more details.
+
+! You should have received a copy of the GNU Library General Public License
+! along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+! the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr	r7
+! s1_ptr	r6
+! s2_ptr	r5
+! size		r4
+
+! If we are really crazy, we can use push to write a few result words
+! backwards, using push just because it is faster than reg+disp.  We'd
+! then add 2x the number of words written to r7...
+
+	unseg
+	.text
+	even
+	global ___mpn_sub_n
+___mpn_sub_n:
+	pop	r0,@r6		! fetch first s1 limb, advance s1_ptr
+	pop	r1,@r5		! fetch first s2 limb, advance s2_ptr
+	sub	r0,r1		! subtract least significant limbs, set cy
+	ld	@r7,r0		! store first result limb
+	dec	r4		! one limb done
+	jr	eq,Lend		! if just one limb, skip loop
+Loop:	pop	r0,@r6		! fetch next s1 limb
+	pop	r1,@r5		! fetch next s2 limb
+	sbc	r0,r1		! subtract limbs with cy, set cy
+	inc	r7,#2		! advance res_ptr to the next limb
+	ld	@r7,r0		! store result limb
+	dec	r4
+	jr	ne,Loop		! loop until size reaches zero
+Lend:	ld	r2,r4		! use 0 already in r4
+	adc	r2,r2		! r2 = borrow out
+	ret	t
author	Roland McGrath <roland@gnu.org>	1995-10-16 01:37:51 +0000
committer	Roland McGrath <roland@gnu.org>	1995-10-16 01:37:51 +0000
commit	8f5ca04bc7fd53741d80117df992995ace8f6d2d (patch)
tree	e39c13fc198b22ec55647259a8080051988e8c69 /sysdeps
parent	5d82cf5c55f56ae10d3b0a205d1fcc7de1cf56a0 (diff)