/* Optimized memcpy for x86-64.

   Copyright (C) 2007 Free Software Foundation, Inc.
   Contributed by Evandro Menezes, 2007.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307 USA.  */

#include <sysdep.h>
#include "asm-syntax.h"

/* Stack slots in the red-zone.  */

#ifdef USE_AS_MEMPCPY
# define RETVAL (0)
#else
# define RETVAL (-8)
#endif
#define SAVE0   (RETVAL - 8)
#define SAVE1   (SAVE0 - 8)
#define SAVE2   (SAVE1 - 8)
#define SAVE3   (SAVE2 - 8)

        .text

#if defined PIC && !defined NOT_IN_libc
ENTRY (__memcpy_chk)

        cmpq    %rdx, %rcx
        jb      HIDDEN_JUMPTARGET (__chk_fail)

END (__memcpy_chk)
#endif

ENTRY(memcpy)                           /* (void *, const void*, size_t) */

/* Handle tiny blocks.  */

L(1try):                                /* up to 32B */
        cmpq    $32, %rdx
#ifndef USE_AS_MEMPCPY
        movq    %rdi, %rax              /* save return value */
#endif
        jae     L(1after)

L(1):                                   /* 1-byte once */
        testb   $1, %dl
        jz      L(1a)

        movzbl  (%rsi), %ecx
        movb    %cl, (%rdi)

        incq    %rsi
        incq    %rdi

        .p2align 4,, 4

L(1a):                                  /* 2-byte once */
        testb   $2, %dl
        jz      L(1b)

        movzwl  (%rsi), %ecx
        movw    %cx, (%rdi)

        addq    $2, %rsi
        addq    $2, %rdi

        .p2align 4,, 4

L(1b):                                  /* 4-byte once */
        testb   $4, %dl
        jz      L(1c)

        movl    (%rsi), %ecx
        movl    %ecx, (%rdi)

        addq    $4, %rsi
        addq    $4, %rdi

        .p2align 4,, 4

L(1c):                                  /* 8-byte once */
        testb   $8, %dl
        jz      L(1d)

        movq    (%rsi), %rcx
        movq    %rcx, (%rdi)

        addq    $8, %rsi
        addq    $8, %rdi

        .p2align 4,, 4

L(1d):                                  /* 16-byte loop */
        andl    $0xf0, %edx
        jz      L(exit)

        .p2align 4

L(1loop):
        movq    (%rsi), %rcx
        movq    8 (%rsi), %r8
        movq    %rcx, (%rdi)
        movq    %r8, 8 (%rdi)

        subl    $16, %edx

        leaq    16 (%rsi), %rsi
        leaq    16 (%rdi), %rdi

        jnz     L(1loop)

        .p2align 4,, 4

L(exit):                                /* exit */
#ifdef USE_AS_MEMPCPY
        movq    %rdi, %rax              /* return value */
#else
        rep
#endif
        retq

        .p2align 4

L(1after):
#ifndef USE_AS_MEMPCPY
        movq    %rax, RETVAL (%rsp)     /* save return value */
#endif

/* Align to the natural word size.  */

L(aligntry):
        movl    %esi, %ecx              /* align by destination */

        andl    $7, %ecx
        jz      L(alignafter)           /* already aligned */

L(align):                               /* align */
        leaq    -8 (%rcx, %rdx), %rdx   /* calculate remaining bytes */
        subl    $8, %ecx

        .p2align 4

L(alignloop):                           /* 1-byte alignment loop */
        movzbl  (%rsi), %eax
        movb    %al, (%rdi)

        incl    %ecx

        leaq    1 (%rsi), %rsi
        leaq    1 (%rdi), %rdi

        jnz     L(alignloop)

        .p2align 4

L(alignafter):
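/* For reference, a rough C sketch of the tiny-block path above
   (L(1) through L(1loop)), which also serves as the tail handler for
   the larger paths below (residual counts are always below 128 bytes).
   The name tiny_copy is illustrative and not part of glibc; the
   fixed-size memcpy calls stand in for the possibly unaligned
   movw/movl/movq moves that the assembly performs directly, which
   x86-64 permits:

       #include <stddef.h>
       #include <string.h>

       // Copy n < 128 bytes: peel 1-, 2-, 4- and 8-byte moves off the
       // low bits of n, then move the remaining 16-byte groups.
       static unsigned char *
       tiny_copy (unsigned char *d, const unsigned char *s, size_t n)
       {
         if (n & 1) { *d = *s; d += 1; s += 1; }
         if (n & 2) { memcpy (d, s, 2); d += 2; s += 2; }
         if (n & 4) { memcpy (d, s, 4); d += 4; s += 4; }
         if (n & 8) { memcpy (d, s, 8); d += 8; s += 8; }
         for (n &= 0xf0; n != 0; n -= 16)      // 16-byte loop
           { memcpy (d, s, 16); d += 16; s += 16; }
         return d;                             // mempcpy-style result
       }

   Each small, constant-size memcpy call typically compiles to a single
   load/store pair, matching the move sequence in the assembly.  */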
/* Loop to handle mid-sized blocks.  */

L(32try):                               /* up to 1KB */
        cmpq    $1024, %rdx
        ja      L(32after)

L(32):                                  /* 32-byte loop */
        movl    %edx, %ecx
        shrl    $5, %ecx
        jz      L(32skip)

        .p2align 4

L(32loop):
        decl    %ecx

        movq    (%rsi), %rax
        movq    8 (%rsi), %r8
        movq    16 (%rsi), %r9
        movq    24 (%rsi), %r10

        movq    %rax, (%rdi)
        movq    %r8, 8 (%rdi)
        movq    %r9, 16 (%rdi)
        movq    %r10, 24 (%rdi)

        leaq    32 (%rsi), %rsi
        leaq    32 (%rdi), %rdi

        jz      L(32skip)               /* help out smaller blocks */

        decl    %ecx

        movq    (%rsi), %rax
        movq    8 (%rsi), %r8
        movq    16 (%rsi), %r9
        movq    24 (%rsi), %r10

        movq    %rax, (%rdi)
        movq    %r8, 8 (%rdi)
        movq    %r9, 16 (%rdi)
        movq    %r10, 24 (%rdi)

        leaq    32 (%rsi), %rsi
        leaq    32 (%rdi), %rdi

        jnz     L(32loop)

        .p2align 4

L(32skip):
        andl    $31, %edx               /* check for left overs */
#ifdef USE_AS_MEMPCPY
        jnz     L(1)

        movq    %rdi, %rax
#else
        movq    RETVAL (%rsp), %rax
        jnz     L(1)

        rep
#endif
        retq                            /* exit */

        .p2align 4

L(32after):

/* In order to minimize code-size in RTLD, algorithms specific for
   larger blocks are excluded when building for RTLD.  */

/* Handle large blocks smaller than 1/2 L1.  */

L(fasttry):                             /* first 1/2 L1 */
#ifndef NOT_IN_libc                     /* only up to this algorithm outside of libc.so */
        movq    __x86_64_core_cache_size_half (%rip), %r11
        cmpq    %rdx, %r11              /* calculate the smaller of */
        cmovaq  %rdx, %r11              /* remaining bytes and 1/2 L1 */
#endif

L(fast):                                /* good ol' MOVS */
#ifndef NOT_IN_libc
        movq    %r11, %rcx
        andq    $-8, %r11
#else
        movq    %rdx, %rcx
#endif
        shrq    $3, %rcx
        jz      L(fastskip)

        rep
        movsq

        .p2align 4,, 4

L(fastskip):
#ifndef NOT_IN_libc
        subq    %r11, %rdx              /* check for more */
        testq   $-8, %rdx
        jnz     L(fastafter)
#endif

        andl    $7, %edx                /* check for left overs */
#ifdef USE_AS_MEMPCPY
        jnz     L(1)

        movq    %rdi, %rax
#else
        movq    RETVAL (%rsp), %rax
        jnz     L(1)

        rep
#endif
        retq                            /* exit */

#ifndef NOT_IN_libc                     /* none of the algorithms below for RTLD */

        .p2align 4

L(fastafter):

/* Handle large blocks smaller than 1/2 L2.  */

L(pretry):                              /* first 1/2 L2 */
        movq    __x86_64_shared_cache_size_half (%rip), %r8
        cmpq    %rdx, %r8               /* calculate the lesser of */
        cmovaq  %rdx, %r8               /* remaining bytes and 1/2 L2 */

L(pre):                                 /* 64-byte with prefetching */
        movq    %r8, %rcx
        andq    $-64, %r8
        shrq    $6, %rcx
        jz      L(preskip)

        movq    %r14, SAVE0 (%rsp)
        cfi_rel_offset (%r14, SAVE0)
        movq    %r13, SAVE1 (%rsp)
        cfi_rel_offset (%r13, SAVE1)
        movq    %r12, SAVE2 (%rsp)
        cfi_rel_offset (%r12, SAVE2)
        movq    %rbx, SAVE3 (%rsp)
        cfi_rel_offset (%rbx, SAVE3)

        cmpl    $0, __x86_64_prefetchw (%rip)
        jz      L(preloop)              /* check if PREFETCHW OK */

        .p2align 4

/* ... when PREFETCHW is available (less cache-probe traffic in MP systems).  */

L(prewloop):                            /* cache-line in state M */
        decq    %rcx

        movq    (%rsi), %rax
        movq    8 (%rsi), %rbx
        movq    16 (%rsi), %r9
        movq    24 (%rsi), %r10
        movq    32 (%rsi), %r11
        movq    40 (%rsi), %r12
        movq    48 (%rsi), %r13
        movq    56 (%rsi), %r14

        prefetcht0 0 + 896 (%rsi)
        prefetcht0 64 + 896 (%rsi)

        movq    %rax, (%rdi)
        movq    %rbx, 8 (%rdi)
        movq    %r9, 16 (%rdi)
        movq    %r10, 24 (%rdi)
        movq    %r11, 32 (%rdi)
        movq    %r12, 40 (%rdi)
        movq    %r13, 48 (%rdi)
        movq    %r14, 56 (%rdi)

        leaq    64 (%rsi), %rsi
        leaq    64 (%rdi), %rdi

        jz      L(prebail)

        decq    %rcx

        movq    (%rsi), %rax
        movq    8 (%rsi), %rbx
        movq    16 (%rsi), %r9
        movq    24 (%rsi), %r10
        movq    32 (%rsi), %r11
        movq    40 (%rsi), %r12
        movq    48 (%rsi), %r13
        movq    56 (%rsi), %r14

        movq    %rax, (%rdi)
        movq    %rbx, 8 (%rdi)
        movq    %r9, 16 (%rdi)
        movq    %r10, 24 (%rdi)
        movq    %r11, 32 (%rdi)
        movq    %r12, 40 (%rdi)
        movq    %r13, 48 (%rdi)
        movq    %r14, 56 (%rdi)

        prefetchw 896 - 64 (%rdi)
        prefetchw 896 - 0 (%rdi)

        leaq    64 (%rsi), %rsi
        leaq    64 (%rdi), %rdi

        jnz     L(prewloop)
        jmp     L(prebail)
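/* The 64-byte loops above and below fetch roughly 896 bytes (14 cache
   lines) ahead of the current position, so the data for future
   iterations is already in flight while the current iteration retires.
   PREFETCHW is used for the destination when the CPU advertises it
   (__x86_64_prefetchw != 0) because it requests the line with intent
   to write, avoiding a separate read-for-ownership later and hence the
   extra cache-probe traffic noted above.  A hedged GCC-style C sketch
   of one iteration (the name copy64_prefetch and the use of the GCC
   builtins are illustrative, not part of glibc; the distance 896 is
   taken from the code above):

       // Copy one 64-byte block, prefetching ~14 cache lines ahead.
       // __builtin_prefetch (p, 0, 3) maps to PREFETCHT0, while
       // __builtin_prefetch (p, 1, 3) requests a write-intent prefetch
       // (PREFETCHW when the target supports it).
       static void
       copy64_prefetch (unsigned char *d, const unsigned char *s)
       {
         __builtin_prefetch (s + 896, 0, 3);   // source, read intent
         __builtin_prefetch (d + 896, 1, 3);   // destination, write intent
         __builtin_memcpy (d, s, 64);          // the eight movq pairs above
       }
*/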
        .p2align 4

/* ... when PREFETCHW is not available.  */

L(preloop):                             /* cache-line in state E */
        decq    %rcx

        movq    (%rsi), %rax
        movq    8 (%rsi), %rbx
        movq    16 (%rsi), %r9
        movq    24 (%rsi), %r10
        movq    32 (%rsi), %r11
        movq    40 (%rsi), %r12
        movq    48 (%rsi), %r13
        movq    56 (%rsi), %r14

        prefetcht0 896 + 0 (%rsi)
        prefetcht0 896 + 64 (%rsi)

        movq    %rax, (%rdi)
        movq    %rbx, 8 (%rdi)
        movq    %r9, 16 (%rdi)
        movq    %r10, 24 (%rdi)
        movq    %r11, 32 (%rdi)
        movq    %r12, 40 (%rdi)
        movq    %r13, 48 (%rdi)
        movq    %r14, 56 (%rdi)

        leaq    64 (%rsi), %rsi
        leaq    64 (%rdi), %rdi

        jz      L(prebail)

        decq    %rcx

        movq    (%rsi), %rax
        movq    8 (%rsi), %rbx
        movq    16 (%rsi), %r9
        movq    24 (%rsi), %r10
        movq    32 (%rsi), %r11
        movq    40 (%rsi), %r12
        movq    48 (%rsi), %r13
        movq    56 (%rsi), %r14

        prefetcht0 896 - 64 (%rdi)
        prefetcht0 896 - 0 (%rdi)

        movq    %rax, (%rdi)
        movq    %rbx, 8 (%rdi)
        movq    %r9, 16 (%rdi)
        movq    %r10, 24 (%rdi)
        movq    %r11, 32 (%rdi)
        movq    %r12, 40 (%rdi)
        movq    %r13, 48 (%rdi)
        movq    %r14, 56 (%rdi)

        leaq    64 (%rsi), %rsi
        leaq    64 (%rdi), %rdi

        jnz     L(preloop)

L(prebail):
        movq    SAVE3 (%rsp), %rbx
        cfi_restore (%rbx)
        movq    SAVE2 (%rsp), %r12
        cfi_restore (%r12)
        movq    SAVE1 (%rsp), %r13
        cfi_restore (%r13)
        movq    SAVE0 (%rsp), %r14
        cfi_restore (%r14)

/*      .p2align 4 */

L(preskip):
        subq    %r8, %rdx               /* check for more */
        testq   $-64, %rdx
        jnz     L(preafter)

        andl    $63, %edx               /* check for left overs */
#ifdef USE_AS_MEMPCPY
        jnz     L(1)

        movq    %rdi, %rax
#else
        movq    RETVAL (%rsp), %rax
        jnz     L(1)

        rep
#endif
        retq                            /* exit */

        .p2align 4

L(preafter):

/* Loop to handle huge blocks.  */

L(NTtry):

L(NT):                                  /* non-temporal 128-byte */
        movq    %rdx, %rcx
        shrq    $7, %rcx
        jz      L(NTskip)

        movq    %r14, SAVE0 (%rsp)
        cfi_rel_offset (%r14, SAVE0)
        movq    %r13, SAVE1 (%rsp)
        cfi_rel_offset (%r13, SAVE1)
        movq    %r12, SAVE2 (%rsp)
        cfi_rel_offset (%r12, SAVE2)

        .p2align 4

L(NTloop):
        prefetchnta 768 (%rsi)
        prefetchnta 832 (%rsi)

        decq    %rcx

        movq    (%rsi), %rax
        movq    8 (%rsi), %r8
        movq    16 (%rsi), %r9
        movq    24 (%rsi), %r10
        movq    32 (%rsi), %r11
        movq    40 (%rsi), %r12
        movq    48 (%rsi), %r13
        movq    56 (%rsi), %r14

        movntiq %rax, (%rdi)
        movntiq %r8, 8 (%rdi)
        movntiq %r9, 16 (%rdi)
        movntiq %r10, 24 (%rdi)
        movntiq %r11, 32 (%rdi)
        movntiq %r12, 40 (%rdi)
        movntiq %r13, 48 (%rdi)
        movntiq %r14, 56 (%rdi)

        movq    64 (%rsi), %rax
        movq    72 (%rsi), %r8
        movq    80 (%rsi), %r9
        movq    88 (%rsi), %r10
        movq    96 (%rsi), %r11
        movq    104 (%rsi), %r12
        movq    112 (%rsi), %r13
        movq    120 (%rsi), %r14

        movntiq %rax, 64 (%rdi)
        movntiq %r8, 72 (%rdi)
        movntiq %r9, 80 (%rdi)
        movntiq %r10, 88 (%rdi)
        movntiq %r11, 96 (%rdi)
        movntiq %r12, 104 (%rdi)
        movntiq %r13, 112 (%rdi)
        movntiq %r14, 120 (%rdi)

        leaq    128 (%rsi), %rsi
        leaq    128 (%rdi), %rdi

        jnz     L(NTloop)

        sfence                          /* serialize memory stores */

        movq    SAVE2 (%rsp), %r12
        cfi_restore (%r12)
        movq    SAVE1 (%rsp), %r13
        cfi_restore (%r13)
        movq    SAVE0 (%rsp), %r14
        cfi_restore (%r14)

L(NTskip):
        andl    $127, %edx              /* check for left overs */
#ifdef USE_AS_MEMPCPY
        jnz     L(1)

        movq    %rdi, %rax
#else
        movq    RETVAL (%rsp), %rax
        jnz     L(1)

        rep
#endif
        retq                            /* exit */

#endif /* !NOT_IN_libc */

END(memcpy)

#ifndef USE_AS_MEMPCPY
libc_hidden_builtin_def (memcpy)
#endif
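/* For reference, a hedged C sketch of the non-temporal stage above,
   using the SSE2 MOVNTI intrinsic.  Streaming stores bypass the cache
   hierarchy, which is why that loop is only chosen for blocks larger
   than half the shared cache and why it ends with SFENCE before
   ordinary code runs again.  The function name nt_copy and its
   parameter conventions are illustrative, not part of glibc:

       #include <emmintrin.h>   // _mm_stream_si64 (SSE2, x86-64 only)
       #include <xmmintrin.h>   // _mm_sfence
       #include <string.h>      // memcpy, size_t

       // len is a multiple of 8; dst should be 8-byte aligned.
       static void
       nt_copy (char *dst, const char *src, size_t len)
       {
         for (size_t i = 0; i < len; i += 8)
           {
             long long v;
             memcpy (&v, src + i, 8);                      // cached load
             _mm_stream_si64 ((long long *) (dst + i), v); // MOVNTI store
           }
         _mm_sfence ();   // order the weakly-ordered streaming stores
                          // ahead of any subsequent ordinary stores
       }
*/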