/* Copyright (C) 2011-2014 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Chris Metcalf, 2011.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Word-at-a-time memcpy built on the __insn_prefetch, __insn_dblalign
   and __insn_wh64 intrinsics.  */

/* NOTE(review): the five include directives below carry no header name
   in this copy of the file (presumably stripped together with the other
   <...> text).  The code needs declarations for size_t, uintptr_t,
   uint8_t/uint16_t/uint32_t, op_t, OPSIZ, OP_T_THRES and
   CHIP_L2_LINE_SIZE; restore the header names from the upstream source
   before building.  */
#include
#include
#include
#include
#include

/* How many cache lines ahead should we prefetch?  */
#define PREFETCH_LINES_AHEAD 3

/* Copy N bytes from SRCV to DSTV and return DSTV.

   Copies shorter than 16 bytes are done one byte at a time.  Longer
   copies prefetch the start of the source, byte-copy until the
   destination is op_t-aligned, and then take one of two paths:

   - source not op_t-aligned: an inlined word copy that loads aligned
     source words and combines neighbouring pairs with __insn_dblalign;
   - source op_t-aligned: whole-word copies, moving one full L2 cache
     line per iteration (with prefetch and __insn_wh64) for as long as
     at least one line remains.

   If fewer than a word of bytes is left over, they are taken from one
   last word (FINAL) and stored with 4-, 2- and 1-byte writes whose
   order depends on endianness.  */
void *
__memcpy (void *__restrict dstv, const void *__restrict srcv, size_t n)
{
  char *__restrict dst1 = (char *) dstv;
  const char *__restrict src1 = (const char *) srcv;
  const char *__restrict src1_end;
  const char *__restrict prefetch;
  op_t *__restrict dst8; /* 8-byte pointer to destination memory.  */
  op_t final; /* Final bytes to write to trailing word, if any.  */
  long i;

  /* Short copy: plain byte loop, no alignment or prefetch work.  */
  if (n < 16)
    {
      for (; n; n--)
        *dst1++ = *src1++;
      return dstv;
    }

  /* Locate the end of source memory we will copy (address of the last
     source byte).  Don't prefetch past this.  */
  src1_end = src1 + n - 1;

  /* Prefetch ahead a few cache lines, but not past the end: once the
     next line would start at or beyond src1_end, the prefetch pointer
     is reset to src1 instead.  */
  prefetch = src1;
  for (i = 0; i < PREFETCH_LINES_AHEAD; i++)
    {
      __insn_prefetch (prefetch);
      prefetch += CHIP_L2_LINE_SIZE ();
      prefetch = (prefetch < src1_end) ? prefetch : src1;
    }

  /* Copy bytes until dst is word-aligned.  N is at least 16 here, so
     the fewer-than-sizeof (op_t) bytes consumed cannot exhaust it.  */
  for (; (uintptr_t) dst1 & (sizeof (op_t) - 1); n--)
    *dst1++ = *src1++;

  /* 8-byte pointer to destination memory.  */
  dst8 = (op_t *) dst1;

  /* The __builtin_expect hint marks a misaligned source as the unlikely
     case.  */
  if (__builtin_expect ((uintptr_t) src1 & (sizeof (op_t) - 1), 0))
    {
      /* Misaligned copy.  Use glibc's _wordcopy_fwd_dest_aligned, but
         inline it to avoid prologue/epilogue.  TODO: Consider
         prefetching and using wh64 as well.  */
      void * srci;
      op_t a0, a1, a2, a3;
      long int dstp = (long int) dst1;
      long int srcp = (long int) src1;
      /* Number of whole destination words to produce.  */
      long int len = n / OPSIZ;

      /* Save the initial source pointer so we know the number of
         bytes to shift for merging two unaligned results.  It is
         passed unchanged as the third operand of every
         __insn_dblalign below.  */
      srci = (void *) srcp;

      /* Make SRCP aligned by rounding it down to the beginning of the
         `op_t' it points in the middle of.  */
      srcp &= -OPSIZ;

      /* Enter the 4-way unrolled loop at the point matching len % 4.
         Each case preloads the first two aligned source words into
         the registers its entry label expects and biases LEN so that
         the `len -= 4' at the bottom of the loop lands exactly on
         zero; the last destination word is always stored at do0.  */
      switch (len % 4)
        {
        case 2:
          a1 = ((op_t *) srcp)[0];
          a2 = ((op_t *) srcp)[1];
          len += 2;
          srcp += 2 * OPSIZ;
          goto do1;
        case 3:
          a0 = ((op_t *) srcp)[0];
          a1 = ((op_t *) srcp)[1];
          len += 1;
          srcp += 2 * OPSIZ;
          goto do2;
        case 0:
          if (OP_T_THRES <= 3 * OPSIZ && len == 0)
            return dstv;
          a3 = ((op_t *) srcp)[0];
          a0 = ((op_t *) srcp)[1];
          len += 0;
          srcp += 2 * OPSIZ;
          goto do3;
        case 1:
          a2 = ((op_t *) srcp)[0];
          a3 = ((op_t *) srcp)[1];
          srcp += 2 * OPSIZ;
          len -= 1;
          /* Exactly one word to produce: skip the loop entirely.  */
          if (OP_T_THRES <= 3 * OPSIZ && len == 0)
            goto do0;
          goto do4; /* No-op.  */
        }

      /* Each step loads the next aligned source word, merges the two
         previously loaded words with __insn_dblalign, and stores the
         merged word to the destination.  The registers a0..a3 rotate
         so that every loaded word is used in two consecutive merges.  */
      do
        {
        do4:
          a0 = ((op_t *) srcp)[0];
          a2 = __insn_dblalign (a2, a3, srci);
          ((op_t *) dstp)[0] = a2;
          srcp += OPSIZ;
          dstp += OPSIZ;
        do3:
          a1 = ((op_t *) srcp)[0];
          a3 = __insn_dblalign (a3, a0, srci);
          ((op_t *) dstp)[0] = a3;
          srcp += OPSIZ;
          dstp += OPSIZ;
        do2:
          a2 = ((op_t *) srcp)[0];
          a0 = __insn_dblalign (a0, a1, srci);
          ((op_t *) dstp)[0] = a0;
          srcp += OPSIZ;
          dstp += OPSIZ;
        do1:
          a3 = ((op_t *) srcp)[0];
          a1 = __insn_dblalign (a1, a2, srci);
          ((op_t *) dstp)[0] = a1;
          srcp += OPSIZ;
          dstp += OPSIZ;
          len -= 4;
        }
      while (len != 0);

      /* This is the right position for do0.  Please don't move
         it into the loop.  */
    do0:
      ((op_t *) dstp)[0] = __insn_dblalign (a2, a3, srci);

      /* Bytes left over after the whole words.  */
      n = n % OPSIZ;
      if (n == 0)
        return dstv;

      /* Load one more source word only if it starts at or before the
         last source byte; otherwise merge with zero.  */
      a0 = ((const char *) srcp <= src1_end) ? ((op_t *) srcp)[0] : 0;

      final = __insn_dblalign (a3, a0, srci);
      /* DSTP was not advanced after the do0 store, so the trailing
         bytes belong one word further on.  */
      dst8 = (op_t *)(dstp + OPSIZ);
    }
  else
    {
      /* Aligned copy.  */
      const op_t *__restrict src8 = (const op_t *) src1;

      /* src8 and dst8 are both word-aligned.  */
      if (n >= CHIP_L2_LINE_SIZE ())
        {
          /* Copy until 'dst' is cache-line-aligned.  */
          for (; (uintptr_t) dst8 & (CHIP_L2_LINE_SIZE () - 1);
               n -= sizeof (op_t))
            *dst8++ = *src8++;

          /* Copy one full cache line (eight op_t words) per pass.  */
          for (; n >= CHIP_L2_LINE_SIZE ();)
            {
              op_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

              /* Prefetch and advance to next line to prefetch, but
                 don't go past the end: once the next line would start
                 at or beyond src1_end, fall back to the current read
                 position src8.  */
              __insn_prefetch (prefetch);
              prefetch += CHIP_L2_LINE_SIZE ();
              prefetch = (prefetch < src1_end) ? prefetch :
                (const char *) src8;

              /* Do all the loads before wh64.  This is necessary if
                 [src8, src8+7] and [dst8, dst8+7] share the same
                 cache line and dst8 <= src8, as can be the case when
                 called from memmove, or with code tested on x86 whose
                 memcpy always works with forward copies.  */
              tmp0 = *src8++;
              tmp1 = *src8++;
              tmp2 = *src8++;
              tmp3 = *src8++;
              tmp4 = *src8++;
              tmp5 = *src8++;
              tmp6 = *src8++;
              tmp7 = *src8++;

              __insn_wh64 (dst8);

              *dst8++ = tmp0;
              *dst8++ = tmp1;
              *dst8++ = tmp2;
              *dst8++ = tmp3;
              *dst8++ = tmp4;
              *dst8++ = tmp5;
              *dst8++ = tmp6;
              *dst8++ = tmp7;

              /* Hard-coded line size; the check below rejects any
                 other value of CHIP_L2_LINE_SIZE () at compile time.  */
              n -= 64;
            }
#if CHIP_L2_LINE_SIZE() != 64
# error "Fix code that assumes particular L2 cache line size."
#endif
        }

      /* Copy the remaining whole words one at a time.  */
      for (; n >= sizeof (op_t); n -= sizeof (op_t))
        *dst8++ = *src8++;

      /* Hinted as the likely case: nothing left after whole words.  */
      if (__builtin_expect (n == 0, 1))
        return dstv;

      /* Whole-word load; only N of its bytes are stored below.  */
      final = *src8;
    }

  /* n != 0 if we get here.  Write out any trailing bytes.  */
  dst1 = (char *) dst8;
#ifndef __BIG_ENDIAN__
  /* Little-endian: the bytes to store start at the low end of FINAL.
     Store the low 4, then 2, then 1 bytes as selected by the bits of
     N, shifting the consumed bytes out after each store.  */
  if (n & 4)
    {
      *(uint32_t *) dst1 = final;
      dst1 += 4;
      final >>= 32;
      n &= 3;
    }
  if (n & 2)
    {
      *(uint16_t *) dst1 = final;
      dst1 += 2;
      final >>= 16;
      n &= 1;
    }
  if (n)
    *(uint8_t *) dst1 = final;
#else
  /* Big-endian: the bytes to store start at the high end of FINAL.
     At each width either store the upper half of the current window,
     or, when that width is not needed, shift the upper half down so
     the next, narrower store sees the same leading bytes.  */
  if (n & 4)
    {
      *(uint32_t *) dst1 = final >> 32;
      dst1 += 4;
    }
  else
    {
      final >>= 32;
    }
  if (n & 2)
    {
      *(uint16_t *) dst1 = final >> 16;
      dst1 += 2;
    }
  else
    {
      final >>= 16;
    }
  if (n & 1)
    *(uint8_t *) dst1 = final >> 8;
#endif

  return dstv;
}
/* Publish __memcpy under the name memcpy through the weak_alias and
   libc_hidden_builtin_def macros, which are defined outside this file.  */
weak_alias (__memcpy, memcpy)
libc_hidden_builtin_def (memcpy)