summaryrefslogtreecommitdiff
path: root/ports/sysdeps/alpha/alphaev6
diff options
context:
space:
mode:
Diffstat (limited to 'ports/sysdeps/alpha/alphaev6')
-rw-r--r--ports/sysdeps/alpha/alphaev6/Implies1
-rw-r--r--ports/sysdeps/alpha/alphaev6/addmul_1.S477
-rw-r--r--ports/sysdeps/alpha/alphaev6/fpu/e_sqrt.S44
-rw-r--r--ports/sysdeps/alpha/alphaev6/fpu/e_sqrtf.S44
-rw-r--r--ports/sysdeps/alpha/alphaev6/memcpy.S255
-rw-r--r--ports/sysdeps/alpha/alphaev6/memset.S223
-rw-r--r--ports/sysdeps/alpha/alphaev6/stxcpy.S314
-rw-r--r--ports/sysdeps/alpha/alphaev6/stxncpy.S392
8 files changed, 1750 insertions, 0 deletions
diff --git a/ports/sysdeps/alpha/alphaev6/Implies b/ports/sysdeps/alpha/alphaev6/Implies
new file mode 100644
index 0000000000..0e7fc170ba
--- /dev/null
+++ b/ports/sysdeps/alpha/alphaev6/Implies
@@ -0,0 +1 @@
+alpha/alphaev5
diff --git a/ports/sysdeps/alpha/alphaev6/addmul_1.S b/ports/sysdeps/alpha/alphaev6/addmul_1.S
new file mode 100644
index 0000000000..c663f0a1d2
--- /dev/null
+++ b/ports/sysdeps/alpha/alphaev6/addmul_1.S
@@ -0,0 +1,477 @@
+ # Alpha ev6 mpn_addmul_1 -- Multiply a limb vector with a limb and add
+ # the result to a second limb vector.
+ #
+ # Copyright (C) 2000 Free Software Foundation, Inc.
+ #
+ # This file is part of the GNU MP Library.
+ #
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published
+ # by the Free Software Foundation; either version 2.1 of the License, or (at
+ # your option) any later version.
+ #
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+ #
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>.
+
+ # INPUT PARAMETERS
+ # res_ptr $16
+ # s1_ptr $17
+ # size $18
+ # s2_limb $19
+ #
+ # This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and
+ # exactly 3.625 cycles/limb on EV6...
+ #
+ # This code was written in close cooperation with ev6 pipeline expert
+ # Steve Root (root@toober.hlo.dec.com). Any errors are tege's fault, though.
+ #
+ # Register usages for unrolled loop:
+ # 0-3 mul's
+ # 4-7 acc's
+ # 8-15 mul results
+ # 20,21 carry's
+ # 22,23 save for stores
+ #
+ # Sustains 8 mul-adds in 29 cycles in the unrolled inner loop.
+ #
+ # The stores can issue a cycle late so we have paired no-op's to 'catch'
+ # them, so that further disturbance to the schedule is damped.
+ #
+ # We couldn't pair the loads, because the entangled schedule of the
+ # carry's has to happen on one side {0} of the machine. Note, the total
+ # use of U0, and the total use of L0 (after attending to the stores).
+ # which is part of the reason why....
+ #
+ # This is a great schedule for the d_cache, a poor schedule for the
+ # b_cache. The lockup on U0 means that any stall can't be recovered
+ # from. Consider a ldq in L1. say that load gets stalled because it
+ # collides with a fill from the b_Cache. On the next cycle, this load
+ # gets priority. It first looks at L0, and goes there. The instruction
+ # we intended for L0 gets to look at L1, which is NOT where we want
+ # it. It either stalls 1, because it can't go in L0, or goes there, and
+ # causes a further instruction to stall.
+ #
+ # So for b_cache, we're likely going to want to put one or more cycles
+ # back into the code! And, of course, put in prefetches. For the
+ # accumulator, lds, intent to modify. For the multiplier, you might
+ # want ldq, evict next, if you're not wanting to use it again soon. Use
+ # 256 ahead of present pointer value. At a place where we have an mt
+ # followed by a bookkeeping, put the bookkeeping in upper, and the
+ # prefetch into lower.
+ #
+ # Note, the usage of physical registers per cycle is smoothed off, as
+ # much as possible.
+ #
+ # Note, the ldq's and stq's are at the end of the quadpacks. note, we'd
+ # like not to have a ldq or stq to precede a conditional branch in a
+ # quadpack. The conditional branch moves the retire pointer one cycle
+ # later.
+ #
+ # Optimization notes:
+ # Callee-saves regs: $9 $10 $11 $12 $13 $14 $15 $26 ?$27?
+ # Reserved regs: $29 $30 $31
+ # Free caller-saves regs in unrolled code: $24 $25 $28
+ # We should swap some of the callee-saves regs for some of the free
+ # caller-saves regs, saving some overhead cycles.
+ # Most importantly, we should write fast code for the 0-7 case.
+ # The code we use there is for the 21164, and runs at 7 cycles/limb
+ # on the 21264. Should not be hard, if we write specialized code for
+ # 1-7 limbs (the one for 0 limbs should be straightforward). We then just
+ # need a jump table indexed by the low 3 bits of the count argument.
+
+ .set noreorder
+ .set noat
+ .text
+ # __mpn_addmul_1: res_ptr[i] += s1_ptr[i] * s2_limb for each of the size limbs; the final carry limb is left in $0. A size of 0 is not checked for -- assumes size >= 1, TODO confirm.
+ .globl __mpn_addmul_1
+ .ent __mpn_addmul_1
+__mpn_addmul_1:
+ .frame $30,0,$26,0
+ .prologue 0
+ # Sizes below 8 use the plain one-limb-per-trip loop; larger sizes go to $Large.
+ cmpult $18, 8, $1 # $1 = (size < 8), unsigned compare
+ beq $1, $Large # size >= 8: use the unrolled code
+
+ ldq $2, 0($17) # $2 = s1_limb
+ addq $17, 8, $17 # s1_ptr++
+ subq $18, 1, $18 # size--
+ mulq $2, $19, $3 # $3 = prod_low
+ ldq $5, 0($16) # $5 = *res_ptr
+ umulh $2, $19, $0 # $0 = prod_high
+ beq $18, $Lend0b # jump if size was == 1
+ ldq $2, 0($17) # $2 = s1_limb
+ addq $17, 8, $17 # s1_ptr++
+ subq $18, 1, $18 # size--
+ addq $5, $3, $3 # $3 = *res_ptr + prod_low
+ cmpult $3, $5, $4 # $4 = carry out of that add
+ stq $3, 0($16) # store the first result limb
+ addq $16, 8, $16 # res_ptr++
+ beq $18, $Lend0a # jump if size was == 2
+
+ .align 3
+$Loop0: mulq $2, $19, $3 # $3 = prod_low
+ ldq $5, 0($16) # $5 = *res_ptr
+ addq $4, $0, $0 # cy_limb = cy_limb + 'cy'
+ subq $18, 1, $18 # size--
+ umulh $2, $19, $4 # $4 = cy_limb
+ ldq $2, 0($17) # $2 = s1_limb
+ addq $17, 8, $17 # s1_ptr++
+ addq $3, $0, $3 # $3 = cy_limb + prod_low
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
+ addq $5, $3, $3 # add in the old *res_ptr
+ cmpult $3, $5, $5 # $5 = carry from that add
+ stq $3, 0($16) # store the result limb
+ addq $16, 8, $16 # res_ptr++
+ addq $5, $0, $0 # combine carries
+ bne $18, $Loop0 # loop while limbs remain; one limb is still pending on exit
+$Lend0a:
+ mulq $2, $19, $3 # $3 = prod_low
+ ldq $5, 0($16) # $5 = *res_ptr
+ addq $4, $0, $0 # cy_limb = cy_limb + 'cy'
+ umulh $2, $19, $4 # $4 = cy_limb
+ addq $3, $0, $3 # $3 = cy_limb + prod_low
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
+ addq $5, $3, $3 # add in the old *res_ptr
+ cmpult $3, $5, $5 # $5 = carry from that add
+ stq $3, 0($16) # store the last result limb
+ addq $5, $0, $0 # combine carries
+ addq $4, $0, $0 # cy_limb = prod_high + cy
+ ret $31, ($26), 1 # return with the carry limb in $0
+$Lend0b:
+ addq $5, $3, $3 # size was 1: $3 = *res_ptr + prod_low
+ cmpult $3, $5, $5 # $5 = carry out of that add
+ stq $3, 0($16) # store the only result limb
+ addq $0, $5, $0 # carry limb = prod_high + carry
+ ret $31, ($26), 1 # return with the carry limb in $0
+
+$Large:
+ lda $30, -240($30) # allocate 240 bytes of stack; NOTE(review): only offsets 8..56 are used below and .frame above still says 0 -- confirm
+ stq $9, 8($30) # save $9..$15; the unrolled code below overwrites them
+ stq $10, 16($30)
+ stq $11, 24($30)
+ stq $12, 32($30)
+ stq $13, 40($30)
+ stq $14, 48($30)
+ stq $15, 56($30)
+
+ and $18, 7, $20 # count for the first loop, 0-7
+ srl $18, 3, $18 # count for unrolled loop
+ bis $31, $31, $0 # carry limb = 0
+ beq $20, $Lunroll # size is a multiple of 8: no leftover limbs to do first
+ ldq $2, 0($17) # $2 = s1_limb
+ addq $17, 8, $17 # s1_ptr++
+ subq $20, 1, $20 # size--
+ mulq $2, $19, $3 # $3 = prod_low
+ ldq $5, 0($16) # $5 = *res_ptr
+ umulh $2, $19, $0 # $0 = prod_high
+ beq $20, $Lend1b # jump if size was == 1
+ ldq $2, 0($17) # $2 = s1_limb
+ addq $17, 8, $17 # s1_ptr++
+ subq $20, 1, $20 # size--
+ addq $5, $3, $3 # $3 = *res_ptr + prod_low
+ cmpult $3, $5, $4 # $4 = carry out of that add
+ stq $3, 0($16) # store the first result limb
+ addq $16, 8, $16 # res_ptr++
+ beq $20, $Lend1a # jump if size was == 2
+
+ .align 3
+$Loop1: mulq $2, $19, $3 # $3 = prod_low
+ ldq $5, 0($16) # $5 = *res_ptr
+ addq $4, $0, $0 # cy_limb = cy_limb + 'cy'
+ subq $20, 1, $20 # size--
+ umulh $2, $19, $4 # $4 = cy_limb
+ ldq $2, 0($17) # $2 = s1_limb
+ addq $17, 8, $17 # s1_ptr++
+ addq $3, $0, $3 # $3 = cy_limb + prod_low
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
+ addq $5, $3, $3 # add in the old *res_ptr
+ cmpult $3, $5, $5 # $5 = carry from that add
+ stq $3, 0($16) # store the result limb
+ addq $16, 8, $16 # res_ptr++
+ addq $5, $0, $0 # combine carries
+ bne $20, $Loop1 # loop while leftover limbs remain
+
+$Lend1a:
+ mulq $2, $19, $3 # $3 = prod_low
+ ldq $5, 0($16) # $5 = *res_ptr
+ addq $4, $0, $0 # cy_limb = cy_limb + 'cy'
+ umulh $2, $19, $4 # $4 = cy_limb
+ addq $3, $0, $3 # $3 = cy_limb + prod_low
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
+ addq $5, $3, $3 # add in the old *res_ptr
+ cmpult $3, $5, $5 # $5 = carry from that add
+ stq $3, 0($16) # store the last leftover limb
+ addq $16, 8, $16 # res_ptr++
+ addq $5, $0, $0 # combine carries
+ addq $4, $0, $0 # cy_limb = prod_high + cy
+ br $31, $Lunroll # carry limb in $0 feeds the unrolled code
+$Lend1b:
+ addq $5, $3, $3 # exactly one leftover limb: $3 = *res_ptr + prod_low
+ cmpult $3, $5, $5 # $5 = carry out of that add
+ stq $3, 0($16) # store it
+ addq $16, 8, $16 # res_ptr++
+ addq $0, $5, $0 # carry limb = prod_high + carry
+
+$Lunroll:
+ lda $17, -16($17) # L1 bookkeeping: bias s1_ptr by -16, startup offsets begin at 16
+ lda $16, -16($16) # L1 bookkeeping: bias res_ptr by -16 likewise
+ bis $0, $31, $12 # $12 = incoming carry limb
+
+ # ____ UNROLLED LOOP SOFTWARE PIPELINE STARTUP ____
+ # The startup and finish sequences together handle one group of 8 limbs, hence the -1 on $18 below.
+ ldq $2, 16($17) # L1
+ ldq $3, 24($17) # L1
+ lda $18, -1($18) # L1 bookkeeping
+ ldq $6, 16($16) # L1
+ ldq $7, 24($16) # L1
+ ldq $0, 32($17) # L1
+ mulq $19, $2, $13 # U1
+ ldq $1, 40($17) # L1
+ umulh $19, $2, $14 # U1
+ mulq $19, $3, $15 # U1
+ lda $17, 64($17) # L1 bookkeeping
+ ldq $4, 32($16) # L1
+ ldq $5, 40($16) # L1
+ umulh $19, $3, $8 # U1
+ ldq $2, -16($17) # L1
+ mulq $19, $0, $9 # U1
+ ldq $3, -8($17) # L1
+ umulh $19, $0, $10 # U1
+ addq $6, $13, $6 # L0 lo + acc
+ mulq $19, $1, $11 # U1
+ cmpult $6, $13, $20 # L0 lo add => carry
+ lda $16, 64($16) # L1 bookkeeping
+ addq $6, $12, $22 # U0 hi add => answer
+ cmpult $22, $12, $21 # L0 hi add => carry
+ addq $14, $20, $14 # U0 hi mul + carry
+ ldq $6, -16($16) # L1
+ addq $7, $15, $23 # L0 lo + acc
+ addq $14, $21, $14 # U0 hi mul + carry
+ ldq $7, -8($16) # L1
+ umulh $19, $1, $12 # U1
+ cmpult $23, $15, $20 # L0 lo add => carry
+ addq $23, $14, $23 # U0 hi add => answer
+ ldq $0, 0($17) # L1
+ mulq $19, $2, $13 # U1
+ cmpult $23, $14, $21 # L0 hi add => carry
+ addq $8, $20, $8 # U0 hi mul + carry
+ ldq $1, 8($17) # L1
+ umulh $19, $2, $14 # U1
+ addq $4, $9, $4 # L0 lo + acc
+ stq $22, -48($16) # L0
+ stq $23, -40($16) # L1
+ mulq $19, $3, $15 # U1
+ addq $8, $21, $8 # U0 hi mul + carry
+ cmpult $4, $9, $20 # L0 lo add => carry
+ addq $4, $8, $22 # U0 hi add => answer
+ ble $18, $Lend # U1 bookkeeping: no full loop trips left, go drain the pipeline
+
+ # ____ MAIN UNROLLED LOOP: 8 limbs per trip, $18 = trips still to run ____
+ .align 4
+$Loop:
+ bis $31, $31, $31 # U1 mt
+ cmpult $22, $8, $21 # L0 hi add => carry
+ addq $10, $20, $10 # U0 hi mul + carry
+ ldq $4, 0($16) # L1
+
+ bis $31, $31, $31 # U1 mt
+ addq $5, $11, $23 # L0 lo + acc
+ addq $10, $21, $10 # L0 hi mul + carry
+ ldq $5, 8($16) # L1
+
+ umulh $19, $3, $8 # U1
+ cmpult $23, $11, $20 # L0 lo add => carry
+ addq $23, $10, $23 # U0 hi add => answer
+ ldq $2, 16($17) # L1
+
+ mulq $19, $0, $9 # U1
+ cmpult $23, $10, $21 # L0 hi add => carry
+ addq $12, $20, $12 # U0 hi mul + carry
+ ldq $3, 24($17) # L1
+
+ umulh $19, $0, $10 # U1
+ addq $6, $13, $6 # L0 lo + acc
+ stq $22, -32($16) # L0
+ stq $23, -24($16) # L1
+
+ bis $31, $31, $31 # L0 st slosh
+ mulq $19, $1, $11 # U1
+ bis $31, $31, $31 # L1 st slosh
+ addq $12, $21, $12 # U0 hi mul + carry
+
+ cmpult $6, $13, $20 # L0 lo add => carry
+ bis $31, $31, $31 # U1 mt
+ lda $18, -1($18) # L1 bookkeeping
+ addq $6, $12, $22 # U0 hi add => answer
+
+ bis $31, $31, $31 # U1 mt
+ cmpult $22, $12, $21 # L0 hi add => carry
+ addq $14, $20, $14 # U0 hi mul + carry
+ ldq $6, 16($16) # L1
+
+ bis $31, $31, $31 # U1 mt
+ addq $7, $15, $23 # L0 lo + acc
+ addq $14, $21, $14 # U0 hi mul + carry
+ ldq $7, 24($16) # L1
+
+ umulh $19, $1, $12 # U1
+ cmpult $23, $15, $20 # L0 lo add => carry
+ addq $23, $14, $23 # U0 hi add => answer
+ ldq $0, 32($17) # L1
+
+ mulq $19, $2, $13 # U1
+ cmpult $23, $14, $21 # L0 hi add => carry
+ addq $8, $20, $8 # U0 hi mul + carry
+ ldq $1, 40($17) # L1
+
+ umulh $19, $2, $14 # U1
+ addq $4, $9, $4 # U0 lo + acc
+ stq $22, -16($16) # L0
+ stq $23, -8($16) # L1
+
+ bis $31, $31, $31 # L0 st slosh
+ mulq $19, $3, $15 # U1
+ bis $31, $31, $31 # L1 st slosh
+ addq $8, $21, $8 # L0 hi mul + carry
+
+ cmpult $4, $9, $20 # L0 lo add => carry
+ bis $31, $31, $31 # U1 mt
+ lda $17, 64($17) # L1 bookkeeping
+ addq $4, $8, $22 # U0 hi add => answer
+
+ bis $31, $31, $31 # U1 mt
+ cmpult $22, $8, $21 # L0 hi add => carry
+ addq $10, $20, $10 # U0 hi mul + carry
+ ldq $4, 32($16) # L1
+
+ bis $31, $31, $31 # U1 mt
+ addq $5, $11, $23 # L0 lo + acc
+ addq $10, $21, $10 # L0 hi mul + carry
+ ldq $5, 40($16) # L1
+
+ umulh $19, $3, $8 # U1
+ cmpult $23, $11, $20 # L0 lo add => carry
+ addq $23, $10, $23 # U0 hi add => answer
+ ldq $2, -16($17) # L1
+
+ mulq $19, $0, $9 # U1
+ cmpult $23, $10, $21 # L0 hi add => carry
+ addq $12, $20, $12 # U0 hi mul + carry
+ ldq $3, -8($17) # L1
+
+ umulh $19, $0, $10 # U1
+ addq $6, $13, $6 # L0 lo + acc
+ stq $22, 0($16) # L0
+ stq $23, 8($16) # L1
+
+ bis $31, $31, $31 # L0 st slosh
+ mulq $19, $1, $11 # U1
+ bis $31, $31, $31 # L1 st slosh
+ addq $12, $21, $12 # U0 hi mul + carry
+
+ cmpult $6, $13, $20 # L0 lo add => carry
+ bis $31, $31, $31 # U1 mt
+ lda $16, 64($16) # L1 bookkeeping
+ addq $6, $12, $22 # U0 hi add => answer
+
+ bis $31, $31, $31 # U1 mt
+ cmpult $22, $12, $21 # L0 hi add => carry
+ addq $14, $20, $14 # U0 hi mul + carry
+ ldq $6, -16($16) # L1
+
+ bis $31, $31, $31 # U1 mt
+ addq $7, $15, $23 # L0 lo + acc
+ addq $14, $21, $14 # U0 hi mul + carry
+ ldq $7, -8($16) # L1
+
+ umulh $19, $1, $12 # U1
+ cmpult $23, $15, $20 # L0 lo add => carry
+ addq $23, $14, $23 # U0 hi add => answer
+ ldq $0, 0($17) # L1
+
+ mulq $19, $2, $13 # U1
+ cmpult $23, $14, $21 # L0 hi add => carry
+ addq $8, $20, $8 # U0 hi mul + carry
+ ldq $1, 8($17) # L1
+
+ umulh $19, $2, $14 # U1
+ addq $4, $9, $4 # L0 lo + acc
+ stq $22, -48($16) # L0
+ stq $23, -40($16) # L1
+
+ bis $31, $31, $31 # L0 st slosh
+ mulq $19, $3, $15 # U1
+ bis $31, $31, $31 # L1 st slosh
+ addq $8, $21, $8 # U0 hi mul + carry
+
+ cmpult $4, $9, $20 # L0 lo add => carry
+ addq $4, $8, $22 # U0 hi add => answer
+ bis $31, $31, $31 # L1 mt
+ bgt $18, $Loop # U1 bookkeeping
+
+# ____ UNROLLED LOOP SOFTWARE PIPELINE FINISH: stores the last 6 result limbs ____
+$Lend:
+ cmpult $22, $8, $21 # L0 hi add => carry
+ addq $10, $20, $10 # U0 hi mul + carry
+ ldq $4, 0($16) # L1
+ addq $5, $11, $23 # L0 lo + acc
+ addq $10, $21, $10 # L0 hi mul + carry
+ ldq $5, 8($16) # L1
+ umulh $19, $3, $8 # U1
+ cmpult $23, $11, $20 # L0 lo add => carry
+ addq $23, $10, $23 # U0 hi add => answer
+ mulq $19, $0, $9 # U1
+ cmpult $23, $10, $21 # L0 hi add => carry
+ addq $12, $20, $12 # U0 hi mul + carry
+ umulh $19, $0, $10 # U1
+ addq $6, $13, $6 # L0 lo + acc
+ stq $22, -32($16) # L0
+ stq $23, -24($16) # L1
+ mulq $19, $1, $11 # U1
+ addq $12, $21, $12 # U0 hi mul + carry
+ cmpult $6, $13, $20 # L0 lo add => carry
+ addq $6, $12, $22 # U0 hi add => answer
+ cmpult $22, $12, $21 # L0 hi add => carry
+ addq $14, $20, $14 # U0 hi mul + carry
+ addq $7, $15, $23 # L0 lo + acc
+ addq $14, $21, $14 # U0 hi mul + carry
+ umulh $19, $1, $12 # U1
+ cmpult $23, $15, $20 # L0 lo add => carry
+ addq $23, $14, $23 # U0 hi add => answer
+ cmpult $23, $14, $21 # L0 hi add => carry
+ addq $8, $20, $8 # U0 hi mul + carry
+ addq $4, $9, $4 # U0 lo + acc
+ stq $22, -16($16) # L0
+ stq $23, -8($16) # L1
+ bis $31, $31, $31 # L0 st slosh
+ addq $8, $21, $8 # L0 hi mul + carry
+ cmpult $4, $9, $20 # L0 lo add => carry
+ addq $4, $8, $22 # U0 hi add => answer
+ cmpult $22, $8, $21 # L0 hi add => carry
+ addq $10, $20, $10 # U0 hi mul + carry
+ addq $5, $11, $23 # L0 lo + acc
+ addq $10, $21, $10 # L0 hi mul + carry
+ cmpult $23, $11, $20 # L0 lo add => carry
+ addq $23, $10, $23 # U0 hi add => answer
+ cmpult $23, $10, $21 # L0 hi add => carry
+ addq $12, $20, $12 # U0 hi mul + carry
+ stq $22, 0($16) # L0
+ stq $23, 8($16) # L1
+ addq $12, $21, $0 # U0 hi mul + carry => carry limb left in $0
+
+ ldq $9, 8($30) # restore $9..$15 saved at $Large
+ ldq $10, 16($30)
+ ldq $11, 24($30)
+ ldq $12, 32($30)
+ ldq $13, 40($30)
+ ldq $14, 48($30)
+ ldq $15, 56($30)
+ lda $30, 240($30) # release the 240-byte stack area
+ ret $31, ($26), 1 # return with the carry limb in $0
+
+ .end __mpn_addmul_1
diff --git a/ports/sysdeps/alpha/alphaev6/fpu/e_sqrt.S b/ports/sysdeps/alpha/alphaev6/fpu/e_sqrt.S
new file mode 100644
index 0000000000..c4625d0eca
--- /dev/null
+++ b/ports/sysdeps/alpha/alphaev6/fpu/e_sqrt.S
@@ -0,0 +1,44 @@
+/* Copyright (C) 2000-2012 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+ .arch ev6
+ .set noreorder
+ .set noat
+/* __ieee754_sqrt: square root of the value in $f16, computed by the hardware sqrtt (T_floating, i.e. IEEE double) instruction; the result is left in $f0.  */
+ENTRY(__ieee754_sqrt)
+#ifdef PROF
+ ldgp gp, 0(pv) /* profiling build: establish gp from pv */
+ lda AT, _mcount /* AT = address of _mcount */
+ jsr AT, (AT), _mcount /* call _mcount; the return address is placed in AT */
+ .prologue 1
+#else
+ .prologue 0
+#endif
+
+ .align 4
+#ifdef _IEEE_FP_INEXACT
+ sqrtt/suid $f16, $f0 /* qualifiers per the Alpha handbook: s = software completion, u = underflow enabled, i = inexact enabled, d = dynamic rounding */
+#else
+ sqrtt/sud $f16, $f0 /* same as above but without the inexact-enable qualifier */
+#endif
+ ret /* return; the result is in $f0 */
+ nop /* presumably padding so the four instructions fill the 16-byte aligned group -- confirm */
+ nop
+
+END(__ieee754_sqrt)
diff --git a/ports/sysdeps/alpha/alphaev6/fpu/e_sqrtf.S b/ports/sysdeps/alpha/alphaev6/fpu/e_sqrtf.S
new file mode 100644
index 0000000000..5681f3a947
--- /dev/null
+++ b/ports/sysdeps/alpha/alphaev6/fpu/e_sqrtf.S
@@ -0,0 +1,44 @@
+/* Copyright (C) 2000-2012 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+ .arch ev6
+ .set noreorder
+ .set noat
+/* __ieee754_sqrtf: square root of the value in $f16, computed by the hardware sqrts (S_floating, i.e. IEEE single) instruction; the result is left in $f0.  */
+ENTRY(__ieee754_sqrtf)
+#ifdef PROF
+ ldgp gp, 0(pv) /* profiling build: establish gp from pv */
+ lda AT, _mcount /* AT = address of _mcount */
+ jsr AT, (AT), _mcount /* call _mcount; the return address is placed in AT */
+ .prologue 1
+#else
+ .prologue 0
+#endif
+
+ .align 4
+#ifdef _IEEE_FP_INEXACT
+ sqrts/suid $f16, $f0 /* qualifiers per the Alpha handbook: s = software completion, u = underflow enabled, i = inexact enabled, d = dynamic rounding */
+#else
+ sqrts/sud $f16, $f0 /* same as above but without the inexact-enable qualifier */
+#endif
+ ret /* return; the result is in $f0 */
+ nop /* presumably padding so the four instructions fill the 16-byte aligned group -- confirm */
+ nop
+
+END(__ieee754_sqrtf)
diff --git a/ports/sysdeps/alpha/alphaev6/memcpy.S b/ports/sysdeps/alpha/alphaev6/memcpy.S
new file mode 100644
index 0000000000..99fbda2141
--- /dev/null
+++ b/ports/sysdeps/alpha/alphaev6/memcpy.S
@@ -0,0 +1,255 @@
+/* Copyright (C) 2000, 2003 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ EV6 optimized by Rick Gorton <rick.gorton@alpha-processor.com>.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/*
+ * Much of the information about 21264 scheduling/coding comes from:
+ * Compiler Writer's Guide for the Alpha 21264
+ * abbreviated as 'CWG' in other comments here
+ * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
+ * Scheduling notation:
+ * E - either cluster
+ * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
+ * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
+ *
+ * Temp usage notes:
+ * $0 - destination address
+ * $1,$2, - scratch
+ */
+
+#include <sysdep.h>
+
+ .arch ev6
+ .set noreorder
+ .set noat
+/* memcpy: copy $18 bytes from ($17) to ($16).  The original destination is left in $0 as the return value.  */
+ENTRY(memcpy)
+ .prologue 0
+
+ mov $16, $0 # E : copy dest to return
+ ble $18, $nomoredata # U : count <= 0 (signed test): nothing to copy
+ xor $16, $17, $1 # E : are source and dest alignments the same?
+ and $1, 7, $1 # E : are they the same mod 8?
+
+ bne $1, $misaligned # U : Nope - gotta do this the slow way
+ /* source and dest are same mod 8 address */
+ and $16, 7, $1 # E : Are both 0mod8?
+ beq $1, $both_0mod8 # U : Yes
+ nop # E :
+
+ /*
+ * source and dest are same misalignment. move a byte at a time
+ * until a 0mod8 alignment for both is reached.
+ * At least one byte more to move
+ */
+
+$head_align:
+ ldbu $1, 0($17) # L : grab a byte
+ subq $18, 1, $18 # E : count--
+ addq $17, 1, $17 # E : src++
+ stb $1, 0($16) # L :
+ addq $16, 1, $16 # E : dest++
+ and $16, 7, $1 # E : Are we at 0mod8 yet?
+ ble $18, $nomoredata # U : done with the copy?
+ bne $1, $head_align # U :
+ /* Both pointers are 0mod8 from here on.  A count of 127 or less skips the unrolled loop.  */
+$both_0mod8:
+ cmple $18, 127, $1 # E : Can we unroll the loop?
+ bne $1, $no_unroll # U :
+ and $16, 63, $1 # E : get mod64 alignment
+ beq $1, $do_unroll # U : no single quads to fiddle
+ /* Copy one quadword per trip until dest is 0mod64 (count is at least 128 on entry here).  */
+$single_head_quad:
+ ldq $1, 0($17) # L : get 8 bytes
+ subq $18, 8, $18 # E : count -= 8
+ addq $17, 8, $17 # E : src += 8
+ nop # E :
+
+ stq $1, 0($16) # L : store
+ addq $16, 8, $16 # E : dest += 8
+ and $16, 63, $1 # E : get mod64 alignment
+ bne $1, $single_head_quad # U : still not fully aligned
+ /* dest is 0mod64: start the wh64 address one 64-byte block ahead and recheck the count.  */
+$do_unroll:
+ addq $16, 64, $7 # E : Initial (+1 trip) wh64 address
+ cmple $18, 127, $1 # E : Can we go through the unrolled loop?
+ bne $1, $tail_quads # U : Nope
+ nop # E :
+ /* Unrolled loop: 64 bytes per trip, moved as two 32-byte halves, with one wh64 hint per trip.  */
+$unroll_body:
+ wh64 ($7) # L1 : memory subsystem hint: 64 bytes at
+ # ($7) are about to be over-written
+ ldq $6, 0($17) # L0 : bytes 0..7
+ nop # E :
+ nop # E :
+
+ ldq $4, 8($17) # L : bytes 8..15
+ ldq $5, 16($17) # L : bytes 16..23
+ addq $7, 64, $7 # E : Update next wh64 address
+ nop # E :
+
+ ldq $3, 24($17) # L : bytes 24..31
+ addq $16, 64, $1 # E : fallback value for wh64
+ nop # E :
+ nop # E :
+
+ addq $17, 32, $17 # E : src += 32 bytes
+ stq $6, 0($16) # L : bytes 0..7
+ nop # E :
+ nop # E :
+
+ stq $4, 8($16) # L : bytes 8..15
+ stq $5, 16($16) # L : bytes 16..23
+ subq $18, 192, $2 # E : At least two more trips to go?
+ nop # E :
+
+ stq $3, 24($16) # L : bytes 24..31
+ addq $16, 32, $16 # E : dest += 32 bytes
+ nop # E :
+ nop # E :
+
+ ldq $6, 0($17) # L : bytes 0..7
+ ldq $4, 8($17) # L : bytes 8..15
+ cmovlt $2, $1, $7 # E : Latency 2, extra map slot - Use
+ # fallback wh64 address if < 2 more trips
+ nop # E :
+
+ ldq $5, 16($17) # L : bytes 16..23
+ ldq $3, 24($17) # L : bytes 24..31
+ addq $16, 32, $16 # E : dest += 32
+ subq $18, 64, $18 # E : count -= 64
+
+ addq $17, 32, $17 # E : src += 32
+ stq $6, -32($16) # L : bytes 0..7
+ stq $4, -24($16) # L : bytes 8..15
+ cmple $18, 63, $1 # E : At least one more trip?
+
+ stq $5, -16($16) # L : bytes 16..23
+ stq $3, -8($16) # L : bytes 24..31
+ nop # E :
+ beq $1, $unroll_body # U : count >= 64: go around again
+ /* At most 127 bytes remain on every path into here: copy whole quadwords, then trailing bytes.  */
+$tail_quads:
+$no_unroll:
+ .align 4
+ subq $18, 8, $18 # E : At least a quad left?
+ blt $18, $less_than_8 # U : Nope
+ nop # E :
+ nop # E :
+
+$move_a_quad:
+ ldq $1, 0($17) # L : fetch 8
+ subq $18, 8, $18 # E : count -= 8
+ addq $17, 8, $17 # E : src += 8
+ nop # E :
+
+ stq $1, 0($16) # L : store 8
+ addq $16, 8, $16 # E : dest += 8
+ bge $18, $move_a_quad # U :
+ nop # E :
+
+$less_than_8:
+ .align 4
+ addq $18, 8, $18 # E : add back for trailing bytes
+ ble $18, $nomoredata # U : All-done
+ nop # E :
+ nop # E :
+
+ /* Trailing bytes */
+$tail_bytes:
+ subq $18, 1, $18 # E : count--
+ ldbu $1, 0($17) # L : fetch a byte
+ addq $17, 1, $17 # E : src++
+ nop # E :
+
+ stb $1, 0($16) # L : store a byte
+ addq $16, 1, $16 # E : dest++
+ bgt $18, $tail_bytes # U : more to be done?
+ nop # E :
+
+ /* branching to exit takes 3 extra cycles, so replicate exit here */
+ ret $31, ($26), 1 # L0 :
+ nop # E :
+ nop # E :
+ nop # E :
+ /* src and dest differ mod 8.  The running dest pointer is kept in $4 for the rest of this path.  */
+$misaligned:
+ mov $0, $4 # E : dest temp
+ and $0, 7, $1 # E : dest alignment mod8
+ beq $1, $dest_0mod8 # U : life doesnt totally suck
+ nop
+
+$aligndest:
+ ble $18, $nomoredata # U :
+ ldbu $1, 0($17) # L : fetch a byte
+ subq $18, 1, $18 # E : count--
+ addq $17, 1, $17 # E : src++
+
+ stb $1, 0($4) # L : store it
+ addq $4, 1, $4 # E : dest++
+ and $4, 7, $1 # E : dest 0mod8 yet?
+ bne $1, $aligndest # U : go until we are aligned.
+
+ /* Source has unknown alignment, but dest is known to be 0mod8 */
+$dest_0mod8:
+ subq $18, 8, $18 # E : At least a quad left?
+ blt $18, $misalign_tail # U : Nope
+ ldq_u $3, 0($17) # L : seed (rotating load) of 8 bytes
+ nop # E :
+ /* Each trip: $3 holds the previous ldq_u data and $16 (scratch on this path) receives the next; extql/extqh shift both by the src byte offset and bis merges them into one aligned quad.  */
+$mis_quad:
+ ldq_u $16, 8($17) # L : Fetch next 8
+ extql $3, $17, $3 # U : masking
+ extqh $16, $17, $1 # U : masking
+ bis $3, $1, $1 # E : merged bytes to store
+
+ subq $18, 8, $18 # E : count -= 8
+ addq $17, 8, $17 # E : src += 8
+ stq $1, 0($4) # L : store 8 (aligned)
+ mov $16, $3 # E : "rotate" source data
+
+ addq $4, 8, $4 # E : dest += 8
+ bge $18, $mis_quad # U : More quads to move
+ nop
+ nop
+
+$misalign_tail:
+ addq $18, 8, $18 # E : account for tail stuff
+ ble $18, $nomoredata # U :
+ nop
+ nop
+
+$misalign_byte:
+ ldbu $1, 0($17) # L : fetch 1
+ subq $18, 1, $18 # E : count--
+ addq $17, 1, $17 # E : src++
+ nop # E :
+
+ stb $1, 0($4) # L : store
+ addq $4, 1, $4 # E : dest++
+ bgt $18, $misalign_byte # U : more to go?
+ nop
+
+ /* Common exit.  $0 still holds the original dest: it is written only once, at entry.  */
+$nomoredata:
+ ret $31, ($26), 1 # L0 :
+ nop # E :
+ nop # E :
+ nop # E :
+
+END(memcpy)
+libc_hidden_builtin_def (memcpy)
diff --git a/ports/sysdeps/alpha/alphaev6/memset.S b/ports/sysdeps/alpha/alphaev6/memset.S
new file mode 100644
index 0000000000..78b43b2e8d
--- /dev/null
+++ b/ports/sysdeps/alpha/alphaev6/memset.S
@@ -0,0 +1,223 @@
+/* Copyright (C) 2000, 2003 Free Software Foundation, Inc.
+ Contributed by Richard Henderson (rth@tamu.edu)
+ EV6 optimized by Rick Gorton <rick.gorton@alpha-processor.com>.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+ .arch ev6
+ .set noat
+ .set noreorder
+/* memset: store the low byte of $17 into $18 bytes starting at ($16).  The original destination is left in $0 as the return value.  */
+ENTRY(memset)
+#ifdef PROF
+ ldgp gp, 0(pv)
+ lda AT, _mcount
+ jsr AT, (AT), _mcount
+ .prologue 1
+#else
+ .prologue 0
+#endif
+
+ /*
+ * Serious stalling happens. The only way to mitigate this is to
+ * undertake a major re-write to interleave the constant materialization
+ * with other parts of the fall-through code. This is important, even
+ * though it makes maintenance tougher.
+ * Do this later.
+ */
+ and $17, 255, $1 # E : 00000000000000ch
+ insbl $17, 1, $2 # U : 000000000000ch00
+ mov $16, $0 # E : return value
+ ble $18, $end # U : zero length requested? (signed test)
+
+ addq $18, $16, $6 # E : max address to write to
+ or $1, $2, $17 # E : 000000000000chch
+ insbl $1, 2, $3 # U : 0000000000ch0000
+ insbl $1, 3, $4 # U : 00000000ch000000
+
+ or $3, $4, $3 # E : 00000000chch0000
+ inswl $17, 4, $5 # U : 0000chch00000000
+ xor $16, $6, $1 # E : will complete write be within one quadword?
+ inswl $17, 6, $2 # U : chch000000000000
+
+ or $17, $3, $17 # E : 00000000chchchch
+ or $2, $5, $2 # E : chchchch00000000
+ bic $1, 7, $1 # E : fit within a single quadword?
+ and $16, 7, $3 # E : Target addr misalignment
+
+ or $17, $2, $17 # E : chchchchchchchch
+ beq $1, $within_quad # U :
+ nop # E :
+ beq $3, $aligned # U : target is 0mod8
+
+ /*
+ * Target address is misaligned, and won't fit within a quadword.
+ */
+ ldq_u $4, 0($16) # L : Fetch first partial
+ mov $16, $5 # E : Save the address
+ insql $17, $16, $2 # U : Insert new bytes
+ subq $3, 8, $3 # E : Invert (for addressing uses)
+
+ addq $18, $3, $18 # E : $18 is new count ($3 is negative)
+ mskql $4, $16, $4 # U : clear relevant parts of the quad
+ subq $16, $3, $16 # E : $16 is new aligned destination
+ or $2, $4, $1 # E : Final bytes
+
+ nop
+ stq_u $1,0($5) # L : Store result
+ nop
+ nop
+
+ .align 4
+$aligned:
+ /*
+ * We are now guaranteed to be quad aligned, with at least
+ * one partial quad to write.
+ */
+
+ sra $18, 3, $3 # U : Number of remaining quads to write
+ and $18, 7, $18 # E : Number of trailing bytes to write
+ mov $16, $5 # E : Save dest address
+ beq $3, $no_quad # U : tail stuff only
+
+ /*
+ * It's worth the effort to unroll this and use wh64 if possible.
+ * At this point, entry values are:
+ * $16 Current destination address
+ * $5 A copy of $16
+ * $6 The max quadword address to write to
+ * $18 Number trailer bytes
+ * $3 Number quads to write
+ */
+
+ and $16, 0x3f, $2 # E : Forward work (only useful for unrolled loop)
+ subq $3, 16, $4 # E : Only try to unroll if > 128 bytes
+ subq $2, 0x40, $1 # E : bias counter (aligning stuff 0mod64)
+ blt $4, $loop # U : fewer than 16 quads: use the simple loop
+
+ /*
+ * We know we've got at least 16 quads, minimum of one trip
+ * through unrolled loop. Do a quad at a time to get us 0mod64
+ * aligned.
+ */
+
+ nop # E :
+ nop # E :
+ nop # E :
+ beq $1, $bigalign # U : NOTE(review): $1 is in [-64,-8] here, so this looks never taken -- confirm
+ /* Store single quads until $5 is 0mod64; $1 counts up from its negative bias to zero, and $4 ends up equal to the aligned $5.  */
+$alignmod64:
+ stq $17, 0($5) # L :
+ subq $3, 1, $3 # E : For consistency later
+ addq $1, 8, $1 # E : Increment towards zero for alignment
+ addq $5, 8, $4 # E : Initial wh64 address (filler instruction)
+
+ nop
+ nop
+ addq $5, 8, $5 # E : Inc address
+ blt $1, $alignmod64 # U :
+
+$bigalign:
+ /*
+ * $3 - number quads left to go
+ * $5 - target address (aligned 0mod64)
+ * $17 - mask of stuff to store
+ * Scratch registers available: $7, $2, $4, $1
+ * We know that we'll be taking a minimum of one trip through.
+ * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
+ * Assumes the wh64 needs to be for 2 trips through the loop in the future.
+ * The wh64 is issued on for the starting destination address for trip +2
+ * through the loop, and if there are less than two trips left, the target
+ * address will be for the current trip.
+ */
+
+$do_wh64:
+ wh64 ($4) # L1 : memory subsystem write hint
+ subq $3, 24, $2 # E : For determining future wh64 addresses
+ stq $17, 0($5) # L :
+ nop # E :
+
+ addq $5, 128, $4 # E : speculative target of next wh64
+ stq $17, 8($5) # L :
+ stq $17, 16($5) # L :
+ addq $5, 64, $7 # E : Fallback address for wh64 (== next trip addr)
+
+ stq $17, 24($5) # L :
+ stq $17, 32($5) # L :
+ cmovlt $2, $7, $4 # E : Latency 2, extra mapping cycle
+ nop
+
+ stq $17, 40($5) # L :
+ stq $17, 48($5) # L :
+ subq $3, 16, $2 # E : Repeat the loop at least once more?
+ nop
+
+ stq $17, 56($5) # L :
+ addq $5, 64, $5 # E :
+ subq $3, 8, $3 # E :
+ bge $2, $do_wh64 # U :
+
+ nop
+ nop
+ nop
+ beq $3, $no_quad # U : Might have finished already
+
+ .align 4
+ /*
+ * Simple loop for trailing quadwords, or for small amounts
+ * of data (where we can't use an unrolled loop and wh64)
+ */
+$loop:
+ stq $17, 0($5) # L :
+ subq $3, 1, $3 # E : Decrement number quads left
+ addq $5, 8, $5 # E : Inc address
+ bne $3, $loop # U : more?
+
+$no_quad:
+ /*
+ * Write 0..7 trailing bytes.
+ */
+ nop # E :
+ beq $18, $end # U : All done?
+ ldq $7, 0($5) # L :
+ mskqh $7, $6, $2 # U : Mask final quad
+
+ insqh $17, $6, $4 # U : New bits
+ or $2, $4, $1 # E : Put it all together
+ stq $1, 0($5) # L : And back to memory
+ ret $31,($26),1 # L0 :
+ /* The whole write lies inside one quadword: read-modify-write it, keeping the old bytes below $16 and from $6 upward.  */
+$within_quad:
+ ldq_u $1, 0($16) # L :
+ insql $17, $16, $2 # U : New bits
+ mskql $1, $16, $4 # U : Clear old
+ or $2, $4, $2 # E : New result
+
+ mskql $2, $6, $4 # U : keep the merged bytes below the end address $6
+ mskqh $1, $6, $2 # U : keep the original bytes from $6 upward
+ or $2, $4, $1 # E :
+ stq_u $1, 0($16) # L :
+
+$end:
+ nop
+ nop
+ nop
+ ret $31,($26),1 # L0 :
+
+ END(memset)
+libc_hidden_builtin_def (memset)
diff --git a/ports/sysdeps/alpha/alphaev6/stxcpy.S b/ports/sysdeps/alpha/alphaev6/stxcpy.S
new file mode 100644
index 0000000000..bdc8e72eb6
--- /dev/null
+++ b/ports/sysdeps/alpha/alphaev6/stxcpy.S
@@ -0,0 +1,314 @@
+/* Copyright (C) 2000-2012 Free Software Foundation, Inc.
+ Contributed by Richard Henderson (rth@tamu.edu)
+ EV6 optimized by Rick Gorton <rick.gorton@alpha-processor.com>.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Copy a null-terminated string from SRC to DST.
+
+ This is an internal routine used by strcpy, stpcpy, and strcat.
+ As such, it uses special linkage conventions to make implementation
+ of these public functions more efficient.
+
+ On input:
+ t9 = return address
+ a0 = DST
+ a1 = SRC
+
+ On output:
+ t8 = bitmask (with one bit set) indicating the last byte written
+ a0 = unaligned address of the last *word* written
+
+ Furthermore, v0, a3-a5, t11, and t12 are untouched.
+*/
+
+
+#include <sysdep.h>
+
+ .arch ev6
+ .set noat
+ .set noreorder
+
+ .text
+ .type __stxcpy, @function
+ .globl __stxcpy
+ .usepv __stxcpy, no
+
+ cfi_startproc
+ cfi_return_column (t9)
+
+ /* On entry to this basic block:
+ t0 == the first destination word for masking back in
+ t1 == the first source word. */
+ .align 4
+stxcpy_aligned:
+ /* Create the 1st output word and detect 0's in the 1st input word. */
+ lda t2, -1 # E : build a mask against false zero
+ mskqh t2, a1, t2 # U : detection in the src word (stall)
+ mskqh t1, a1, t3 # U : t3 = src bytes from the start offset up
+ ornot t1, t2, t2 # E : bytes below the start offset read as 0xff (stall)
+
+ mskql t0, a1, t0 # U : assemble the first output word
+ cmpbge zero, t2, t10 # E : bits set iff null found
+ or t0, t3, t1 # E : t1 = kept dst bytes | src bytes (stall)
+ bne t10, $a_eos # U : null in the first word: finish up (stall)
+
+ /* On entry to this basic block:
+ t0 == not read by this loop ($a_eos reloads it from memory)
+ t1 == the word to store next, holding no terminating null. */
+ /* Nops here to separate store quads from load quads */
+
+$a_loop:
+ stq_u t1, 0(a0) # L : store the completed word
+ addq a0, 8, a0 # E :
+ nop
+ nop
+
+ ldq_u t1, 0(a1) # L : Latency=3 fetch the next source word
+ addq a1, 8, a1 # E :
+ cmpbge zero, t1, t10 # E : bits set iff null found (3 cycle stall)
+ beq t10, $a_loop # U : no null yet, keep copying (stall for t10)
+
+ /* Take care of the final (partial) word store.
+ On entry to this basic block we have:
+ t1 == the source word containing the null
+ t10 == the cmpbge mask that found it. */
+$a_eos:
+ negq t10, t6 # E : find low bit set
+ and t10, t6, t8 # E : t8 = bit of the first null byte (stall)
+ /* For the sake of the cache, don't read a destination word
+ if we're not going to need it. */
+ and t8, 0x80, t6 # E : null in byte 7 means a full-word store (stall)
+ bne t6, 1f # U : (stall)
+
+ /* We're doing a partial word store and so need to combine
+ our source and original destination words. */
+ ldq_u t0, 0(a0) # L : Latency=3
+ subq t8, 1, t6 # E : t6 = bytes below the null
+ zapnot t1, t6, t1 # U : clear src bytes >= null (stall)
+ or t8, t6, t10 # E : t10 = bytes up to and including the null (stall)
+
+ zap t0, t10, t0 # U : clear dst bytes <= null
+ or t0, t1, t1 # E : merge kept dst bytes with the string tail (stall)
+ nop
+ nop
+
+1: stq_u t1, 0(a0) # L : write the last word
+ ret (t9) # L0 : Latency=3
+ nop
+ nop
+
+ .align 4
+__stxcpy:
+ /* Are source and destination co-aligned? */
+ xor a0, a1, t0 # E : differing address bits
+ unop # E :
+ and t0, 7, t0 # E : do the low three bits differ? (stall)
+ bne t0, $unaligned # U : (stall)
+
+ /* We are co-aligned; take care of a partial first word. */
+ ldq_u t1, 0(a1) # L : load first src word
+ and a0, 7, t0 # E : take care not to load a word ...
+ addq a1, 8, a1 # E :
+ beq t0, stxcpy_aligned # U : ... if we won't need it (stall)
+
+ ldq_u t0, 0(a0) # L : dest starts mid-word: fetch bytes to preserve
+ br stxcpy_aligned # L0 : Latency=3
+ nop
+ nop
+
+
+/* The source and destination are not co-aligned. Align the destination
+ and cope. We have to be very careful about not reading too much and
+ causing a SEGV. */
+
+ .align 4
+$u_head:
+ /* We know just enough now to be able to assemble the first
+ full source word. We can still find a zero at the end of it
+ that prevents us from outputting the whole thing.
+
+ On entry to this basic block:
+ t0 == the first dest word, for masking back in if needed, else 0
+ t1 == the first source word as loaded (not yet shifted)
+ t6 == bytemask that is -1 in dest word bytes */
+
+ ldq_u t2, 8(a1) # L : load second src word
+ addq a1, 8, a1 # E :
+ extql t1, a1, t1 # U : shift first src word into place (stall on a1)
+ extqh t2, a1, t4 # U : low bytes of second word, shifted up (stall on a1)
+
+ mskql t0, a0, t0 # U : keep dest bytes below the start
+ or t1, t4, t1 # E : first aligned src word complete
+ mskqh t1, a0, t1 # U : drop src bytes below the start (stall on t1)
+ or t0, t1, t1 # E : first output word complete (stall on t1)
+
+ or t1, t6, t6 # E : hide preserved dest bytes from the zero test
+ cmpbge zero, t6, t10 # E : (stall)
+ lda t6, -1 # E : for masking just below
+ bne t10, $u_final # U : (stall)
+
+ mskql t6, a1, t6 # U : mask out the bits we have
+ or t6, t2, t2 # E : already extracted before (stall)
+ cmpbge zero, t2, t10 # E : testing eos (stall)
+ bne t10, $u_late_head_exit # U : (stall)
+
+ /* Finally, we've got all the stupid leading edge cases taken care
+ of and we can set up to enter the main loop. */
+
+ stq_u t1, 0(a0) # L : store first output word
+ addq a0, 8, a0 # E :
+ extql t2, a1, t0 # U : position ho-bits of lo word
+ ldq_u t2, 8(a1) # L : read next high-order source word
+
+ addq a1, 8, a1 # E :
+ cmpbge zero, t2, t10 # E : (stall for t2)
+ nop # E :
+ bne t10, $u_eos # U : (stall)
+
+ /* Unaligned copy main loop. In order to avoid reading too much,
+ the loop is structured to detect zeros in aligned source words.
+ This has, unfortunately, effectively pulled half of a loop
+ iteration out into the head and half into the tail, but it does
+ prevent nastiness from accumulating in the very thing we want
+ to run as fast as possible.
+
+ On entry to this basic block:
+ t0 == the shifted high-order bits from the previous source word
+ t2 == the unshifted current source word
+
+ We further know that t2 does not contain a null terminator. */
+
+ .align 3
+$u_loop:
+ extqh t2, a1, t1 # U : extract high bits for current word
+ addq a1, 8, a1 # E : advance src; low three bits unchanged (stall)
+ extql t2, a1, t3 # U : extract low bits for next time (stall)
+ addq a0, 8, a0 # E : advance dst; the store below uses -8
+
+ or t0, t1, t1 # E : current dst word now complete
+ ldq_u t2, 0(a1) # L : Latency=3 load high word for next time
+ stq_u t1, -8(a0) # L : save the current word (stall)
+ mov t3, t0 # E : carry the leftover bits into the next trip
+
+ cmpbge zero, t2, t10 # E : test new word for eos
+ beq t10, $u_loop # U : no null yet, go around again (stall)
+ nop
+ nop
+
+ /* We've found a zero somewhere in the source word we just read.
+ If it resides in the lower half, we have one (probably partial)
+ word to write out, and if it resides in the upper half, we
+ have one full and one partial word left to write out.
+
+ On entry to this basic block:
+ t0 == the shifted high-order bits from the previous source word
+ t2 == the unshifted current source word. */
+$u_eos:
+ extqh t2, a1, t1 # U : low bytes of the current word, shifted up
+ or t0, t1, t1 # E : first (partial) source word complete (stall)
+ cmpbge zero, t1, t10 # E : is the null in this first part? (stall)
+ bne t10, $u_final # U : yes: one word left to write (stall)
+
+$u_late_head_exit:
+ stq_u t1, 0(a0) # L : the null was in the high-order bits
+ addq a0, 8, a0 # E :
+ extql t2, a1, t1 # U : rest of the current word forms the last word
+ cmpbge zero, t1, t10 # E : locate the null, then fall into $u_final (stall)
+
+ /* Take care of a final (probably partial) result word.
+ On entry to this basic block:
+ t1 == assembled source word
+ t10 == cmpbge mask that found the null. */
+$u_final:
+ negq t10, t6 # E : isolate low bit set
+ and t6, t10, t8 # E : t8 = bit of the first null byte (stall)
+ and t8, 0x80, t6 # E : avoid dest word load if we can (stall)
+ bne t6, 1f # U : null in byte 7: store the whole word (stall)
+
+ ldq_u t0, 0(a0) # L : fetch dest bytes that must survive
+ subq t8, 1, t6 # E : t6 = bytes below the null
+ or t6, t8, t10 # E : t10 = bytes up to and including the null (stall)
+ zapnot t1, t6, t1 # U : kill source bytes >= null (stall)
+
+ zap t0, t10, t0 # U : kill dest bytes <= null (2 cycle data stall)
+ or t0, t1, t1 # E : merge kept dst bytes with the string tail (stall)
+ nop
+ nop
+
+1: stq_u t1, 0(a0) # L : write the last word
+ ret (t9) # L0 : Latency=3
+ nop
+ nop
+
+ /* Unaligned copy entry point. */
+ .align 4
+$unaligned:
+
+ ldq_u t1, 0(a1) # L : load first source word
+ and a0, 7, t4 # E : find dest misalignment
+ and a1, 7, t5 # E : find src misalignment
+ /* Conditionally load the first destination word and a bytemask
+ with 0xff indicating that the destination byte is sacrosanct. */
+ mov zero, t0 # E : default: no dest bytes to keep
+
+ mov zero, t6 # E : default: empty sacrosanct mask
+ beq t4, 1f # U : dest aligned: skip the dest load
+ ldq_u t0, 0(a0) # L : first dest word
+ lda t6, -1 # E :
+
+ mskql t6, a0, t6 # U : 0xff in bytes below the dest start
+ nop
+ nop
+ nop
+1:
+ subq a1, t4, a1 # E : sub dest misalignment from src addr
+ /* If source misalignment is larger than dest misalignment, we need
+ extra startup checks to avoid SEGV. */
+ cmplt t4, t5, t8 # E : t8 = (dest misalign < src misalign)
+ beq t8, $u_head # U : not larger: no extra checks needed
+ lda t2, -1 # E : mask out leading garbage in source
+
+ mskqh t2, t5, t2 # U : 0xff in bytes at or above the src start
+ ornot t1, t2, t3 # E : leading garbage reads as 0xff (stall)
+ cmpbge zero, t3, t10 # E : is there a zero? (stall)
+ beq t10, $u_head # U : no zero: carry on with the head (stall)
+
+ /* At this point we've found a zero in the first partial word of
+ the source. We need to isolate the valid source data and mask
+ it into the original destination data. (Incidentally, we know
+ that we'll need at least one byte of that original dest word.) */
+
+ ldq_u t0, 0(a0) # L : first dest word (not loaded above if dest was aligned)
+ negq t10, t6 # E : build bitmask of bytes <= zero
+ and t6, t10, t8 # E : t8 = bit of the first null byte (stall)
+ and a1, 7, t5 # E : shift amount used by srl below
+
+ subq t8, 1, t6 # E :
+ or t6, t8, t10 # E : (stall)
+ srl t8, t5, t8 # U : adjust final null return value
+ zapnot t2, t10, t2 # U : prepare source word; mirror changes (stall)
+
+ and t1, t2, t1 # E : to source validity mask
+ extql t2, a1, t2 # U : shift the validity mask into dest position
+ extql t1, a1, t1 # U : shift the source bytes likewise (stall)
+ andnot t0, t2, t0 # E : zero place for source to reside (stall)
+
+ or t0, t1, t1 # E : and put it there
+ stq_u t1, 0(a0) # L : (stall)
+ ret (t9) # L0 : Latency=3
+
+ cfi_endproc
diff --git a/ports/sysdeps/alpha/alphaev6/stxncpy.S b/ports/sysdeps/alpha/alphaev6/stxncpy.S
new file mode 100644
index 0000000000..28495df004
--- /dev/null
+++ b/ports/sysdeps/alpha/alphaev6/stxncpy.S
@@ -0,0 +1,392 @@
+/* Copyright (C) 2000-2012 Free Software Foundation, Inc.
+ Contributed by Richard Henderson (rth@tamu.edu)
+ EV6 optimized by Rick Gorton <rick.gorton@alpha-processor.com>.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Copy no more than COUNT bytes of the null-terminated string from
+ SRC to DST.
+
+ This is an internal routine used by strncpy, stpncpy, and strncat.
+ As such, it uses special linkage conventions to make implementation
+ of these public functions more efficient.
+
+ On input:
+ t9 = return address
+ a0 = DST
+ a1 = SRC
+ a2 = COUNT
+
+ Furthermore, COUNT may not be zero.
+
+ On output:
+ t0 = last word written
+ t8 = bitmask (with one bit set) indicating the last byte written
+ t10 = bitmask (with one bit set) indicating the byte position of
+ the end of the range specified by COUNT
+ a0 = unaligned address of the last *word* written
+ a2 = the number of full words left in COUNT
+
+ Furthermore, v0, a3-a5, t11, and t12 are untouched.
+*/
+
+#include <sysdep.h>
+
+ .arch ev6
+ .set noat
+ .set noreorder
+
+ .text
+ .type __stxncpy, @function
+ .globl __stxncpy
+ .usepv __stxncpy, no
+
+ cfi_startproc
+ cfi_return_column (t9)
+
+ /* On entry to this basic block:
+ t0 == the first destination word for masking back in
+ t1 == the first source word. */
+ .align 4
+stxncpy_aligned:
+ /* Create the 1st output word and detect 0's in the 1st input word. */
+ lda t2, -1 # E : build a mask against false zero
+ mskqh t2, a1, t2 # U : detection in the src word (stall)
+ mskqh t1, a1, t3 # U : t3 = src bytes from the start offset up
+ ornot t1, t2, t2 # E : bytes below the start offset read as 0xff (stall)
+
+ mskql t0, a1, t0 # U : assemble the first output word
+ cmpbge zero, t2, t7 # E : bits set iff null found
+ or t0, t3, t0 # E : t0 = kept dst bytes | src bytes (stall)
+ beq a2, $a_eoc # U : no full words left in COUNT
+
+ bne t7, $a_eos # U : null in the first word
+ nop
+ nop
+ nop
+
+ /* On entry to this basic block:
+ t0 == a source word not containing a null. */
+
+ /*
+ * nops here to:
+ * separate store quads from load quads
+ * limit of 1 bcond/quad to permit training
+ */
+$a_loop:
+ stq_u t0, 0(a0) # L : store the completed word
+ addq a0, 8, a0 # E :
+ subq a2, 1, a2 # E : one fewer full word left in COUNT
+ nop
+
+ ldq_u t0, 0(a1) # L : fetch the next source word
+ addq a1, 8, a1 # E :
+ cmpbge zero, t0, t7 # E : bits set iff null found
+ beq a2, $a_eoc # U : no full words left in COUNT
+
+ beq t7, $a_loop # U : no null yet, keep copying
+ nop
+ nop
+ nop
+
+ /* Take care of the final (partial) word store. At this point
+ the end-of-count bit is set in t7 iff it applies.
+
+ On entry to this basic block we have:
+ t0 == the source word containing the null
+ t7 == the cmpbge mask that found it. */
+$a_eos:
+ negq t7, t8 # E : find low bit set
+ and t7, t8, t8 # E : t8 = bit of the last byte to write (stall)
+ /* For the sake of the cache, don't read a destination word
+ if we're not going to need it. */
+ and t8, 0x80, t6 # E : last byte is byte 7: full-word store (stall)
+ bne t6, 1f # U : (stall)
+
+ /* We're doing a partial word store and so need to combine
+ our source and original destination words. */
+ ldq_u t1, 0(a0) # L :
+ subq t8, 1, t6 # E : t6 = bytes below the last byte
+ or t8, t6, t7 # E : t7 = bytes up to and including it (stall)
+ zapnot t0, t7, t0 # U : clear src bytes > null (stall)
+
+ zap t1, t7, t1 # U : clear dst bytes <= null
+ or t0, t1, t0 # E : merge kept dst bytes with the string tail (stall)
+ nop
+ nop
+
+1: stq_u t0, 0(a0) # L : write the last word
+ ret (t9) # L0 : Latency=3
+ nop
+ nop
+
+ /* Add the end-of-count bit to the eos detection bitmask. */
+$a_eoc:
+ or t10, t7, t7 # E : t7 |= end-of-count bit
+ br $a_eos # L0 : Latency=3
+ nop
+ nop
+
+ .align 4
+__stxncpy:
+ /* Are source and destination co-aligned? */
+ lda t2, -1 # E : all ones, shifted to LONG_MAX below
+ xor a0, a1, t1 # E : differing address bits
+ and a0, 7, t0 # E : find dest misalignment
+ nop # E :
+
+ srl t2, 1, t2 # U : t2 = LONG_MAX
+ and t1, 7, t1 # E : nonzero iff not co-aligned
+ cmovlt a2, t2, a2 # E : bound count to LONG_MAX (stall)
+ nop # E :
+
+ addq a2, t0, a2 # E : bias count by dest misalignment
+ subq a2, 1, a2 # E : offset of the last byte allowed (stall)
+ and a2, 7, t2 # E : its position within its word (stall)
+ lda t10, 1 # E :
+
+ srl a2, 3, a2 # U : a2 = loop counter = (count - 1)/8
+ sll t10, t2, t10 # U : t10 = bitmask of last count byte
+ nop # E :
+ bne t1, $unaligned # U : (stall)
+
+ /* We are co-aligned; take care of a partial first word. */
+ ldq_u t1, 0(a1) # L : load first src word
+ addq a1, 8, a1 # E :
+ beq t0, stxncpy_aligned # U : avoid loading dest word if not needed
+ ldq_u t0, 0(a0) # L : dest starts mid-word: fetch bytes to preserve
+
+ br stxncpy_aligned # L0 : Latency=3
+ nop
+ nop
+ nop
+
+
+
+/* The source and destination are not co-aligned. Align the destination
+ and cope. We have to be very careful about not reading too much and
+ causing a SEGV. */
+
+ .align 4
+$u_head:
+ /* We know just enough now to be able to assemble the first
+ full source word. We can still find a zero at the end of it
+ that prevents us from outputting the whole thing.
+
+ On entry to this basic block:
+ t0 == the first dest word, unmasked
+ t1 == the shifted low bits of the first source word
+ t6 == bytemask that is -1 in dest word bytes */
+
+ ldq_u t2, 8(a1) # L : Latency=3 load second src word
+ addq a1, 8, a1 # E :
+ mskql t0, a0, t0 # U : mask trailing garbage in dst
+ extqh t2, a1, t4 # U : low bytes of second word, shifted up (3 cycle stall on t2)
+
+ or t1, t4, t1 # E : first aligned src word complete (stall)
+ mskqh t1, a0, t1 # U : mask leading garbage in src (stall)
+ or t0, t1, t0 # E : first output word complete (stall)
+ or t0, t6, t6 # E : mask original data for zero test (stall)
+
+ cmpbge zero, t6, t7 # E : nulls in the first output word
+ beq a2, $u_eocfin # U : no full words left in COUNT
+ lda t6, -1 # E : for the mask built below
+ nop
+
+ bne t7, $u_final # U : null in the first word
+ mskql t6, a1, t6 # U : mask out bits already seen
+ stq_u t0, 0(a0) # L : store first output word
+ or t6, t2, t2 # E : seen bytes can no longer test as zero
+
+ cmpbge zero, t2, t7 # E : find nulls in second partial
+ addq a0, 8, a0 # E :
+ subq a2, 1, a2 # E : one fewer full word left in COUNT
+ bne t7, $u_late_head_exit # U :
+
+ /* Finally, we've got all the stupid leading edge cases taken care
+ of and we can set up to enter the main loop. */
+ extql t2, a1, t1 # U : position hi-bits of lo word
+ beq a2, $u_eoc # U : no full words left in COUNT
+ ldq_u t2, 8(a1) # L : read next high-order source word
+ addq a1, 8, a1 # E :
+
+ extqh t2, a1, t0 # U : position lo-bits of hi word (stall)
+ cmpbge zero, t2, t7 # E : test new word for eos
+ nop
+ bne t7, $u_eos # U : null found in the new word
+
+ /* Unaligned copy main loop. In order to avoid reading too much,
+ the loop is structured to detect zeros in aligned source words.
+ This has, unfortunately, effectively pulled half of a loop
+ iteration out into the head and half into the tail, but it does
+ prevent nastiness from accumulating in the very thing we want
+ to run as fast as possible.
+
+ On entry to this basic block:
+ t0 == the shifted low-order bits from the current source word
+ t1 == the shifted high-order bits from the previous source word
+ t2 == the unshifted current source word
+
+ We further know that t2 does not contain a null terminator. */
+
+ .align 4
+$u_loop:
+ or t0, t1, t0 # E : current dst word now complete
+ subq a2, 1, a2 # E : decrement word count
+ extql t2, a1, t1 # U : extract high bits for next time
+ addq a0, 8, a0 # E : advance dst; the store below uses -8
+
+ stq_u t0, -8(a0) # L : save the current word
+ beq a2, $u_eoc # U : no full words left in COUNT
+ ldq_u t2, 8(a1) # L : Latency=3 load high word for next time
+ addq a1, 8, a1 # E : advance src to the word just loaded
+
+ extqh t2, a1, t0 # U : extract low bits (2 cycle stall)
+ cmpbge zero, t2, t7 # E : test new word for eos
+ nop
+ beq t7, $u_loop # U : no null yet, go around again
+
+ /* We've found a zero somewhere in the source word we just read.
+ If it resides in the lower half, we have one (probably partial)
+ word to write out, and if it resides in the upper half, we
+ have one full and one partial word left to write out.
+
+ On entry to this basic block:
+ t0 == the shifted low-order bits from the current source word
+ t1 == the shifted high-order bits from the previous source word
+ t2 == the unshifted current source word. */
+$u_eos:
+ or t0, t1, t0 # E : first (partial) source word complete
+ nop
+ cmpbge zero, t0, t7 # E : is the null in this first part? (stall)
+ bne t7, $u_final # U : yes: one word left to write (stall)
+
+ stq_u t0, 0(a0) # L : the null was in the high-order bits
+ addq a0, 8, a0 # E :
+ subq a2, 1, a2 # E : one fewer full word left in COUNT
+ nop
+
+$u_late_head_exit:
+ extql t2, a1, t0 # U : rest of the current word forms the last word
+ cmpbge zero, t0, t7 # E : locate the null in it
+ or t7, t10, t6 # E : same mask plus the end-of-count bit (stall)
+ cmoveq a2, t6, t7 # E : taken when a2 == 0; Latency=2, extra map slot (stall)
+
+ /* Take care of a final (probably partial) result word.
+ On entry to this basic block:
+ t0 == assembled source word
+ t7 == cmpbge mask that found the null. */
+$u_final:
+ negq t7, t6 # E : isolate low bit set
+ and t6, t7, t8 # E : t8 = bit of the last byte to write (stall)
+ and t8, 0x80, t6 # E : avoid dest word load if we can (stall)
+ bne t6, 1f # U : last byte is byte 7: store the whole word (stall)
+
+ ldq_u t1, 0(a0) # L : fetch dest bytes that must survive
+ subq t8, 1, t6 # E : t6 = bytes below the last byte
+ or t6, t8, t7 # E : t7 = bytes up to and including it (stall)
+ zapnot t0, t7, t0 # U : kill source bytes > null
+
+ zap t1, t7, t1 # U : kill dest bytes <= null
+ or t0, t1, t0 # E : merge kept dst bytes with the string tail (stall)
+ nop
+ nop
+
+1: stq_u t0, 0(a0) # L : write the last word
+ ret (t9) # L0 : Latency=3
+
+ /* Got to end-of-count before end of string.
+ On entry to this basic block:
+ t1 == the shifted high-order bits from the previous source word */
+$u_eoc:
+ and a1, 7, t6 # E : byte offset between src and dst words
+ sll t10, t6, t6 # U : shift the end-of-count bit by that offset (stall)
+ and t6, 0xff, t6 # E : nonzero iff it stays within the 8-bit mask (stall)
+ bne t6, 1f # U : then skip the final load (stall)
+
+ ldq_u t2, 8(a1) # L : load final src word
+ nop
+ extqh t2, a1, t0 # U : extract low bits for last word (stall)
+ or t1, t0, t1 # E : last word complete (stall)
+
+1: cmpbge zero, t1, t7 # E : nulls in the last word
+ mov t1, t0 # E : $u_final takes the word in t0
+
+$u_eocfin: # end-of-count, final word
+ or t10, t7, t7 # E : add the end-of-count bit
+ br $u_final # L0 : Latency=3
+
+ /* Unaligned copy entry point. */
+ .align 4
+$unaligned:
+
+ ldq_u t1, 0(a1) # L : load first source word
+ and a0, 7, t4 # E : find dest misalignment
+ and a1, 7, t5 # E : find src misalignment
+ /* Conditionally load the first destination word and a bytemask
+ with 0xff indicating that the destination byte is sacrosanct. */
+ mov zero, t0 # E : default: no dest bytes to keep
+
+ mov zero, t6 # E : default: empty sacrosanct mask
+ beq t4, 1f # U : dest aligned: skip the dest load
+ ldq_u t0, 0(a0) # L : first dest word
+ lda t6, -1 # E :
+
+ mskql t6, a0, t6 # U : 0xff in bytes below the dest start
+ nop
+ nop
+1: subq a1, t4, a1 # E : sub dest misalignment from src addr
+
+ /* If source misalignment is larger than dest misalignment, we need
+ extra startup checks to avoid SEGV. */
+
+ cmplt t4, t5, t8 # E : t8 = (dest misalign < src misalign)
+ extql t1, a1, t1 # U : shift src into place
+ lda t2, -1 # E : for creating masks later
+ beq t8, $u_head # U : not larger: no extra checks needed (stall)
+
+ mskqh t2, t5, t2 # U : begin src byte validity mask
+ cmpbge zero, t1, t7 # E : is there a zero?
+ extql t2, a1, t2 # U : shift the validity mask like the src
+ or t7, t10, t5 # E : test for end-of-count too
+
+ cmpbge zero, t2, t3 # E : t3 = bytes holding no valid src data
+ cmoveq a2, t5, t7 # E : taken when a2 == 0; Latency=2, extra map slot
+ nop # E : keep with cmoveq
+ andnot t7, t3, t7 # E : ignore bytes holding no valid src data (stall)
+
+ beq t7, $u_head # U : nothing ends here: do the normal head
+ /* At this point we've found a zero in the first partial word of
+ the source. We need to isolate the valid source data and mask
+ it into the original destination data. (Incidentally, we know
+ that we'll need at least one byte of that original dest word.) */
+ ldq_u t0, 0(a0) # L : first dest word (not loaded above if dest was aligned)
+ negq t7, t6 # E : build bitmask of bytes <= zero
+ mskqh t1, t4, t1 # U : drop src bytes below the dest start
+
+ and t6, t7, t8 # E : t8 = bit of the last byte to write
+ subq t8, 1, t6 # E : (stall)
+ or t6, t8, t7 # E : (stall)
+ zapnot t2, t7, t2 # U : prepare source word; mirror changes (stall)
+
+ zapnot t1, t7, t1 # U : to source validity mask
+ andnot t0, t2, t0 # E : zero place for source to reside
+ or t0, t1, t0 # E : and put it there (stall both t0, t1)
+ stq_u t0, 0(a0) # L : (stall)
+
+ ret (t9) # L0 : Latency=3
+
+ cfi_endproc