summaryrefslogtreecommitdiff
path: root/sysdeps/alpha/alphaev6/memset.S
diff options
context:
space:
mode:
Diffstat (limited to 'sysdeps/alpha/alphaev6/memset.S')
-rw-r--r--sysdeps/alpha/alphaev6/memset.S224
1 files changed, 0 insertions, 224 deletions
diff --git a/sysdeps/alpha/alphaev6/memset.S b/sysdeps/alpha/alphaev6/memset.S
deleted file mode 100644
index 3b3c4ba061..0000000000
--- a/sysdeps/alpha/alphaev6/memset.S
+++ /dev/null
@@ -1,224 +0,0 @@
-/* Copyright (C) 2000, 2003 Free Software Foundation, Inc.
- Contributed by Richard Henderson (rth@tamu.edu)
- EV6 optimized by Rick Gorton <rick.gorton@alpha-processor.com>.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, write to the Free
- Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
- 02111-1307 USA. */
-
-#include <sysdep.h>
-
- .arch ev6
- .set noat
- .set noreorder
-
-ENTRY(memset)
-#ifdef PROF
- ldgp gp, 0(pv)
- lda AT, _mcount
- jsr AT, (AT), _mcount # profiling builds only: call _mcount on entry
- .prologue 1
-#else
- .prologue 0
-#endif
-
- /*
- * Replicate the low byte of $17 into all eight bytes of $17 (the fill
- * pattern); $0 keeps the original $16 as the return value. Serious
- * stalling happens here. The only mitigation is a major re-write that
- * interleaves the constant materialization with other fall-through code;
- * important, though it makes maintenance tougher. Do this later.
- */
- and $17, 255, $1 # E : 00000000000000ch
- insbl $17, 1, $2 # U : 000000000000ch00
- mov $16, $0 # E : return value (original destination)
- ble $18, $end # U : count <= 0 (signed test): nothing to write
-
- addq $18, $16, $6 # E : $6 = dest + count (one past the last byte)
- or $1, $2, $17 # E : 000000000000chch
- insbl $1, 2, $3 # U : 0000000000ch0000
- insbl $1, 3, $4 # U : 00000000ch000000
-
- or $3, $4, $3 # E : 00000000chch0000
- inswl $17, 4, $5 # U : 0000chch00000000
- xor $16, $6, $1 # E : bits in which start and end addresses differ
- inswl $17, 6, $2 # U : chch000000000000
-
- or $17, $3, $17 # E : 00000000chchchch
- or $2, $5, $2 # E : chchchch00000000
- bic $1, 7, $1 # E : zero iff start and end lie in the same quadword
- and $16, 7, $3 # E : Target addr misalignment
-
- or $17, $2, $17 # E : chchchchchchchch
- beq $1, $within_quad # U : start and end in the same quadword
- nop # E :
- beq $3, $aligned # U : target is 0mod8
-
- /*
- * Target is misaligned and the fill runs past its first quadword: merge the head.
- */
- ldq_u $4, 0($16) # L : Fetch first partial
- mov $16, $5 # E : Save the address
- insql $17, $16, $2 # U : Insert new bytes (from the start offset upward)
- subq $3, 8, $3 # E : $3 = misalignment - 8 (negative head byte count)
-
- addq $18, $3, $18 # E : $18 is new count ($3 is negative)
- mskql $4, $16, $4 # U : keep only the old bytes below the start offset
- subq $16, $3, $16 # E : $16 is new aligned destination
- or $2, $4, $1 # E : Final bytes
-
- nop
- stq_u $1,0($5) # L : Store result
- nop
- nop
-
- .align 4
-$aligned:
- /*
- * $16 is now quadword aligned and $18 holds the bytes still to write
- * (this may be zero when the head store above covered everything).
- */
-
- sra $18, 3, $3 # U : Number of remaining quads to write
- and $18, 7, $18 # E : Number of trailing bytes to write
- mov $16, $5 # E : $5 = running store address
- beq $3, $no_quad # U : tail stuff only
-
- /*
- * It's worth the effort to unroll this and use wh64 if possible.
- * At this point, entry values are:
- * $16 Current destination address
- * $5 A copy of $16
- * $6 End address (original destination + count)
- * $18 Number of trailing bytes
- * $3 Number of quads to write
- */
-
- and $16, 0x3f, $2 # E : Forward work (only useful for unrolled loop)
- subq $3, 16, $4 # E : Only try to unroll if >= 16 quads (128 bytes)
- subq $2, 0x40, $1 # E : bias counter: (dest & 63) - 64, always negative
- blt $4, $loop # U : fewer than 16 quads: use the simple loop
-
- /*
- * We know we've got at least 16 quads, minimum of one trip
- * through unrolled loop. Do a quad at a time to get us 0mod64
- * aligned.
- */
-
- nop # E :
- nop # E :
- nop # E :
- beq $1, $bigalign # U : NOTE(review): $1 < 0 here, so this looks never taken
-
-$alignmod64:
- stq $17, 0($5) # L : store one quad of fill
- subq $3, 1, $3 # E : For consistency later
- addq $1, 8, $1 # E : Increment towards zero for alignment
- addq $5, 8, $4 # E : Initial wh64 address (filler instruction)
-
- nop
- nop
- addq $5, 8, $5 # E : Inc address
- blt $1, $alignmod64 # U : repeat until the bias counter reaches zero
-
-$bigalign:
- /*
- * $3 - number quads left to go
- * $5 - target address (aligned 0mod64)
- * $17 - mask of stuff to store
- * Scratch registers available: $7, $2, $4, $1
- * We know that we'll be taking a minimum of one trip through.
- * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
- * Assumes the wh64 needs to be for 2 trips through the loop in the future.
- * The wh64 is issued for the starting destination address of trip +2
- * through the loop, and if there are fewer than two trips left, the target
- * address will be for the current trip.
- */
-
-$do_wh64:
- wh64 ($4) # L1 : memory subsystem write hint
- subq $3, 24, $2 # E : For determining future wh64 addresses (< 0: under 24 quads left)
- stq $17, 0($5) # L :
- nop # E :
-
- addq $5, 128, $4 # E : speculative target of next wh64
- stq $17, 8($5) # L :
- stq $17, 16($5) # L :
- addq $5, 64, $7 # E : Fallback address for wh64 (== next trip addr)
-
- stq $17, 24($5) # L :
- stq $17, 32($5) # L :
- cmovlt $2, $7, $4 # E : Latency 2, extra mapping cycle (pick the fallback address)
- nop
-
- stq $17, 40($5) # L :
- stq $17, 48($5) # L :
- subq $3, 16, $2 # E : Repeat the loop at least once more?
- nop
-
- stq $17, 56($5) # L :
- addq $5, 64, $5 # E : advance past the 64 bytes just written
- subq $3, 8, $3 # E : 8 quads were written this trip
- bge $2, $do_wh64 # U : at least 8 quads still remain: go again
-
- nop
- nop
- nop
- beq $3, $no_quad # U : Might have finished already
-
- .align 4
- /*
- * Simple loop for trailing quadwords, or for small amounts
- * of data (where we can't use an unrolled loop and wh64)
- */
-$loop:
- stq $17, 0($5) # L :
- subq $3, 1, $3 # E : Decrement number quads left
- addq $5, 8, $5 # E : Inc address
- bne $3, $loop # U : more?
-
-$no_quad:
- /*
- * Write 0..7 trailing bytes.
- */
- nop # E :
- beq $18, $end # U : All done?
- ldq $7, 0($5) # L : fetch the existing final quad
- mskqh $7, $6, $2 # U : Mask final quad (clear its low ($6 & 7) bytes)
-
- insqh $17, $6, $4 # U : New bits (fill in the low ($6 & 7) bytes)
- or $2, $4, $1 # E : Put it all together
- stq $1, 0($5) # L : And back to memory
- ret $31,($26),1 # L0 :
-
-$within_quad:
- ldq_u $1, 0($16) # L : fetch the quadword containing the destination
- insql $17, $16, $2 # U : New bits
- mskql $1, $16, $4 # U : Clear old
- or $2, $4, $2 # E : New result
-
- mskql $2, $6, $4 # U : keep merged bytes below the end offset
- mskqh $1, $6, $2 # U : keep original bytes from the end offset upward
- or $2, $4, $1 # E : combine the two halves
- stq_u $1, 0($16) # L : write the quadword back
-
-$end:
- nop
- nop
- nop
- ret $31,($26),1 # L0 :
-
- END(memset)
-libc_hidden_builtin_def (memset)