summaryrefslogtreecommitdiff
path: root/sysdeps/x86_64/memset.S
diff options
context:
space:
mode:
authorAndreas Jaeger <aj@suse.de>2002-08-31 17:45:33 +0000
committerAndreas Jaeger <aj@suse.de>2002-08-31 17:45:33 +0000
commit78df0fcb80247ca7573a8ed07cc992b7031674c1 (patch)
tree339b581727c7ae5782f5118bf98567acda987553 /sysdeps/x86_64/memset.S
parent7c9466bc7688e084cfbf9311eb91bdbaed1ea888 (diff)
Update.
* sysdeps/x86_64/dl-machine.h (elf_machine_runtime_setup): Declare external functions with hidden attribute. (elf_machine_rela): Optimize. * sysdeps/x86_64/memset.S: New file. * sysdeps/x86_64/bzero.S: New file. * sysdeps/x86_64/stpcpy.S: New file. * sysdeps/x86_64/strcat.S: New file. * sysdeps/x86_64/strchr.S: New file. * sysdeps/x86_64/strcpy.S: New file. * sysdeps/x86_64/strcspn.S: New file. * sysdeps/x86_64/strlen.S: New file. * sysdeps/x86_64/strpbrk.S: New file. * sysdeps/x86_64/strspn.S: New file. * sysdeps/x86_64/strcmp.S: New file. * sysdeps/x86_64/strtok_r.S: New file. * sysdeps/x86_64/strtok.S: New file. * sysdeps/x86_64/memcpy.S: New file. * sysdeps/x86_64/mempcpy.S: New file.
Diffstat (limited to 'sysdeps/x86_64/memset.S')
-rw-r--r--sysdeps/x86_64/memset.S131
1 files changed, 131 insertions, 0 deletions
diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
new file mode 100644
index 0000000000..b95ca40b2f
--- /dev/null
+++ b/sysdeps/x86_64/memset.S
@@ -0,0 +1,131 @@
+/* memset/bzero -- set memory area to CH/0
+ Optimized version for x86-64.
+ Copyright (C) 2002 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Andreas Jaeger <aj@suse.de>.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+#include "bp-sym.h"
+#include "bp-asm.h"
+
+/* BEWARE: `#ifdef memset' means that memset is redefined as `bzero'.
+ BZERO_P is 1 in that case and 0 otherwise. It is set through an
+ explicit #ifdef because a `defined' operator that only appears as the
+ result of macro expansion has undefined behaviour in ISO C. */
+#ifdef memset
+# define BZERO_P 1
+#else
+# define BZERO_P 0
+#endif
+
+/* Regions of at least LARGE bytes (counted after the destination has been
+ aligned) are written with non-temporal stores. The value is somewhat
+ experimental and could be made dependent on the cache size. The
+ leading `$' makes the macro usable directly as an immediate operand. */
+#define LARGE $120000
+
+ .text
+/* memset: rdi = destination, sil = fill byte, rdx = length.
+ bzero (when BZERO_P): rdi = destination, rsi = length.
+
+ Register use in the code below:
+ rdi original destination, kept for the memset return value
+ rcx write cursor
+ rdx number of bytes still to store
+ sil fill byte
+ r8 fill byte replicated into all eight bytes
+ rax loop counter; loaded with the return value at the end
+
+ The code below uses only these registers and does not touch the stack. */
+ENTRY (memset)
+#if BZERO_P
+ mov %rsi,%rdx /* Adjust parameter: the length arrives in rsi. */
+ xorq %rsi,%rsi /* Fill with 0s. */
+#endif
+ cmp $0x7,%rdx /* Check for small length. */
+ mov %rdi,%rcx /* Write cursor; mov leaves the flags of cmp intact. */
+ jbe 7f /* At most 7 bytes: use the byte loop only. */
+
+#if BZERO_P
+ mov %rsi,%r8 /* Just copy 0. */
+#else
+ /* Populate 8 bit data to full 64-bit. */
+ movabs $0x0101010101010101,%r8
+ movzbl %sil,%eax
+ imul %rax,%r8
+#endif
+ test $0x7,%edi /* Check for alignment. */
+ je 2f
+
+ .p2align 4
+1: /* Align ptr to 8 byte. The length is at least 8 here, so the
+ at most 7 bytes stored by this loop cannot exhaust it. */
+ mov %sil,(%rcx)
+ dec %rdx
+ inc %rcx
+ test $0x7,%ecx
+ jne 1b
+
+2: /* Check for really large regions. */
+ mov %rdx,%rax
+ shr $0x6,%rax /* rax = number of 64-byte blocks. */
+ je 4f /* None: go straight to the tail. */
+ cmp LARGE, %rdx
+ jae 11f /* Large region: use non-temporal stores. */
+
+ .p2align 4
+3: /* Store 64 bytes per iteration. */
+ mov %r8,(%rcx)
+ mov %r8,0x8(%rcx)
+ mov %r8,0x10(%rcx)
+ mov %r8,0x18(%rcx)
+ mov %r8,0x20(%rcx)
+ mov %r8,0x28(%rcx)
+ mov %r8,0x30(%rcx)
+ mov %r8,0x38(%rcx)
+ add $0x40,%rcx
+ dec %rax
+ jne 3b
+
+4: /* Store the final bytes (fewer than 64). */
+ and $0x3f,%edx /* Writing edx also clears the upper half of rdx. */
+ mov %rdx,%rax
+ shr $0x3,%rax /* rax = number of 8-byte words. */
+ je 6f
+
+5: /* First in chunks of 8 bytes. */
+ mov %r8,(%rcx)
+ add $0x8,%rcx
+ dec %rax
+ jne 5b
+6:
+ and $0x7,%edx /* Fewer than 8 bytes remain. */
+7: /* Also the entry point for lengths of at most 7. */
+ test %rdx,%rdx
+ je 9f
+8: /* And finally as bytes (up to 7). */
+ mov %sil,(%rcx)
+ inc %rcx
+ dec %rdx
+ jne 8b
+9:
+#if BZERO_P
+ nop
+#else
+ /* Load result (only if used as memset). */
+ mov %rdi,%rax /* start address of destination is result */
+#endif
+ retq
+
+ .p2align 4
+11: /* Store 64 bytes without polluting the cache. */
+ /* We could use movntdq %xmm0,(%rcx) here to further
+ speed up for large cases but let's not use XMM registers. */
+ movnti %r8,(%rcx)
+ movnti %r8,0x8(%rcx)
+ movnti %r8,0x10(%rcx)
+ movnti %r8,0x18(%rcx)
+ movnti %r8,0x20(%rcx)
+ movnti %r8,0x28(%rcx)
+ movnti %r8,0x30(%rcx)
+ movnti %r8,0x38(%rcx)
+ add $0x40,%rcx
+ dec %rax
+ jne 11b
+ /* Non-temporal stores are weakly ordered. Fence them so that they
+ are globally visible before any store issued after this point,
+ including stores the caller performs once we have returned. */
+ sfence
+ jmp 4b
+
+END (memset)