author    | Andreas Schwab <schwab@redhat.com> | 2009-07-20 11:02:11 +0200
committer | Andreas Schwab <schwab@redhat.com> | 2009-07-20 11:02:11 +0200
commit    | 53924a77a2b827e7f9af6424a6a30224d09692d1 (patch)
tree      | ba5d034a512524339fcfed113518eb83201fdc23 /sysdeps
parent    | 8ecde8e8c2a8e77804f954afffd9efe0ab951e52 (diff)
parent    | 42e69bcf1137fccfd7a95645a9d316c6490b9ff9 (diff)
Merge commit 'origin/master' into fedora/master
Diffstat (limited to 'sysdeps')
29 files changed, 3497 insertions, 67 deletions
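The largest part of the diff below teaches the x86-64 audit trampoline (sysdeps/x86_64/dl-trampoline.S) to save and restore the AVX ymm registers when the CPU supports them, caching the result of a one-time CPUID probe in L(have_avx) (1 = AVX present, -1 = absent, 0 = not yet probed). As a rough C sketch of that probe — bit 28 of ECX from CPUID leaf 1 indicates AVX — note this is only an illustration, not the glibc code; the helper name cpu_has_avx, the static have_avx variable, and the use of <cpuid.h>/__get_cpuid are assumptions for the sketch (the real check is done inline in assembly):

```c
#include <cpuid.h>   /* GCC/Clang CPUID helper; assumption for this sketch */

/* Cached result: 0 = unknown, 1 = AVX available, -1 = not available,
   mirroring the tri-state kept in L(have_avx) by dl-trampoline.S.  */
static int have_avx;

static int
cpu_has_avx (void)
{
  if (have_avx == 0)
    {
      unsigned int eax, ebx, ecx, edx;
      /* CPUID leaf 1 returns the feature flags; ECX bit 28 is AVX.  */
      if (__get_cpuid (1, &eax, &ebx, &ecx, &edx) && (ecx & (1u << 28)))
        have_avx = 1;
      else
        have_avx = -1;
    }
  return have_avx > 0;
}
```

In the trampoline itself, the vmovdqu/ymm save and compare paths are taken only when the cached flag is positive; otherwise the code keeps the plain xmm saves, which are always written anyway for backward compatibility with existing audit modules.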
diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h index b1af7fde0a..30f9d23091 100644 --- a/sysdeps/generic/ldsodefs.h +++ b/sysdeps/generic/ldsodefs.h @@ -335,6 +335,10 @@ struct audit_ifaces extern int _dl_name_match_p (const char *__name, const struct link_map *__map) internal_function; +/* Compute next higher prime number. */ +extern unsigned long int _dl_higher_prime_number (unsigned long int n) + internal_function; + /* Function used as argument for `_dl_receive_error' function. The arguments are the error code, error string, and the objname the error occurred in. */ @@ -383,6 +387,21 @@ struct rtld_global allocated by rtld. Later it keeps the size of the map. It might be reset if in _dl_close if the last global object is removed. */ size_t _ns_global_scope_alloc; + /* Search table for unique objects. */ + struct unique_sym_table + { + __rtld_lock_recursive_t lock; + struct unique_sym + { + uint32_t hashval; + const char *name; + const ElfW(Sym) *sym; + const struct link_map *map; + } *entries; + size_t size; + size_t n_elements; + void (*free) (void *); + } _ns_unique_sym_table; /* Keep track of changes to each namespace' list. */ struct r_debug _ns_debug; } _dl_ns[DL_NNS]; diff --git a/sysdeps/generic/sysdep.h b/sysdeps/generic/sysdep.h index 15d951c777..54884d9afe 100644 --- a/sysdeps/generic/sysdep.h +++ b/sysdeps/generic/sysdep.h @@ -1,5 +1,5 @@ /* Generic asm macros used on many machines. - Copyright (C) 1991,92,93,96,98,2002,2003 Free Software Foundation, Inc. + Copyright (C) 1991,92,93,96,98,2002,2003,2009 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -39,13 +39,13 @@ #ifdef __ASSEMBLER__ /* Mark the end of function named SYM. This is used on some platforms to generate correct debugging information. */ -#ifndef END -#define END(sym) -#endif +# ifndef END +# define END(sym) +# endif -#ifndef JUMPTARGET -#define JUMPTARGET(sym) sym -#endif +# ifndef JUMPTARGET +# define JUMPTARGET(sym) sym +# endif /* Makros to generate eh_frame unwind information. */ # ifdef HAVE_ASM_CFI_DIRECTIVES @@ -65,6 +65,8 @@ # define cfi_remember_state .cfi_remember_state # define cfi_restore_state .cfi_restore_state # define cfi_window_save .cfi_window_save +# define cfi_personality(enc, exp) .cfi_personality enc, exp +# define cfi_lsda(enc, exp) .cfi_lsda enc, exp # else # define cfi_startproc # define cfi_endproc @@ -82,6 +84,8 @@ # define cfi_remember_state # define cfi_restore_state # define cfi_window_save +# define cfi_personality(enc, exp) +# define cfi_lsda(enc, exp) # endif #else /* ! ASSEMBLER */ @@ -116,6 +120,10 @@ ".cfi_restore_state" # define CFI_WINDOW_SAVE \ ".cfi_window_save" +# define CFI_PERSONALITY(enc, exp) \ + ".cfi_personality " CFI_STRINGIFY(enc) "," CFI_STRINGIFY(exp) +# define CFI_LSDA(enc, exp) \ + ".cfi_lsda " CFI_STRINGIFY(enc) "," CFI_STRINGIFY(exp) # else # define CFI_STARTPROC # define CFI_ENDPROC @@ -132,6 +140,27 @@ # define CFI_REMEMBER_STATE # define CFI_RESTORE_STATE # define CFI_WINDOW_SAVE +# define CFI_PERSONALITY(enc, exp) +# define CFI_LSDA(enc, exp) # endif #endif /* __ASSEMBLER__ */ + +/* Values used for encoding parameter of cfi_personality and cfi_lsda. 
*/ +#define DW_EH_PE_absptr 0x00 +#define DW_EH_PE_omit 0xff +#define DW_EH_PE_uleb128 0x01 +#define DW_EH_PE_udata2 0x02 +#define DW_EH_PE_udata4 0x03 +#define DW_EH_PE_udata8 0x04 +#define DW_EH_PE_sleb128 0x09 +#define DW_EH_PE_sdata2 0x0a +#define DW_EH_PE_sdata4 0x0b +#define DW_EH_PE_sdata8 0x0c +#define DW_EH_PE_signed 0x08 +#define DW_EH_PE_pcrel 0x10 +#define DW_EH_PE_textrel 0x20 +#define DW_EH_PE_datarel 0x30 +#define DW_EH_PE_funcrel 0x40 +#define DW_EH_PE_aligned 0x50 +#define DW_EH_PE_indirect 0x80 diff --git a/sysdeps/i386/configure b/sysdeps/i386/configure index d1d4dc15a7..cbc8cd9206 100755 --- a/sysdeps/i386/configure +++ b/sysdeps/i386/configure @@ -1,10 +1,42 @@ +as_nl=' +' +export as_nl +# Printing a long string crashes Solaris 7 /usr/bin/printf. +as_echo='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\' +as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo +as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo$as_echo +if (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then + as_echo='printf %s\n' + as_echo_n='printf %s' +else + if test "X`(/usr/ucb/echo -n -n $as_echo) 2>/dev/null`" = "X-n $as_echo"; then + as_echo_body='eval /usr/ucb/echo -n "$1$as_nl"' + as_echo_n='/usr/ucb/echo -n' + else + as_echo_body='eval expr "X$1" : "X\\(.*\\)"' + as_echo_n_body='eval + arg=$1; + case $arg in + *"$as_nl"*) + expr "X$arg" : "X\\(.*\\)$as_nl"; + arg=`expr "X$arg" : ".*$as_nl\\(.*\\)"`;; + esac; + expr "X$arg" : "X\\(.*\\)" | tr -d "$as_nl" + ' + export as_echo_n_body + as_echo_n='sh -c $as_echo_n_body as_echo' + fi + export as_echo_body + as_echo='sh -c $as_echo_body as_echo' +fi + # This file is generated from configure.in by Autoconf. DO NOT EDIT! # Local configure fragment for sysdeps/i386. -echo "$as_me:$LINENO: checking if -g produces usable source locations for assembler-with-cpp" >&5 -echo $ECHO_N "checking if -g produces usable source locations for assembler-with-cpp... $ECHO_C" >&6 +{ $as_echo "$as_me:$LINENO: checking if -g produces usable source locations for assembler-with-cpp" >&5 +$as_echo_n "checking if -g produces usable source locations for assembler-with-cpp... " >&6; } if test "${libc_cv_cpp_asm_debuginfo+set}" = set; then - echo $ECHO_N "(cached) $ECHO_C" >&6 + $as_echo_n "(cached) " >&6 else cat > conftest.S <<EOF #include "confdefs.h" @@ -27,7 +59,7 @@ if { ac_try='${CC-cc} $CPPFLAGS $ASFLAGS -g -c conftest.S 1>&5' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? - echo "$as_me:$LINENO: \$? = $ac_status" >&5 + $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; } && { ac_pattern='conftest\.S' { ac_try='readelf --debug-dump=line conftest.o | @@ -35,7 +67,7 @@ if { ac_try='${CC-cc} $CPPFLAGS $ASFLAGS -g -c conftest.S 1>&5' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? - echo "$as_me:$LINENO: \$? = $ac_status" >&5 + $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; } }; then libc_cv_cpp_asm_debuginfo=yes @@ -44,11 +76,36 @@ else fi rm -f conftest* fi -echo "$as_me:$LINENO: result: $libc_cv_cpp_asm_debuginfo" >&5 -echo "${ECHO_T}$libc_cv_cpp_asm_debuginfo" >&6 +{ $as_echo "$as_me:$LINENO: result: $libc_cv_cpp_asm_debuginfo" >&5 +$as_echo "$libc_cv_cpp_asm_debuginfo" >&6; } if test $libc_cv_cpp_asm_debuginfo = yes; then cat >>confdefs.h <<\_ACEOF #define HAVE_CPP_ASM_DEBUGINFO 1 _ACEOF fi + +{ $as_echo "$as_me:$LINENO: checking for SSE4 support" >&5 +$as_echo_n "checking for SSE4 support... 
" >&6; } +if test "${libc_cv_cc_sse4+set}" = set; then + $as_echo_n "(cached) " >&6 +else + if { ac_try='${CC-cc} -msse4 -xc /dev/null -S -o /dev/null' + { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 + (eval $ac_try) 2>&5 + ac_status=$? + $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); }; }; then + libc_cv_cc_sse4=yes +else + libc_cv_cc_sse4=no +fi +fi +{ $as_echo "$as_me:$LINENO: result: $libc_cv_cc_sse4" >&5 +$as_echo "$libc_cv_cc_sse4" >&6; } +if test $libc_cv_cc_sse4 = yes; then + cat >>confdefs.h <<\_ACEOF +#define HAVE_SSE4_SUPPORT 1 +_ACEOF + +fi diff --git a/sysdeps/i386/configure.in b/sysdeps/i386/configure.in index 028e1ae8e1..44f53a57a0 100644 --- a/sysdeps/i386/configure.in +++ b/sysdeps/i386/configure.in @@ -33,3 +33,14 @@ rm -f conftest*])AC_SUBST(libc_cv_cpp_asm_debuginfo) if test $libc_cv_cpp_asm_debuginfo = yes; then AC_DEFINE(HAVE_CPP_ASM_DEBUGINFO) fi + +dnl Check if -msse4 works. +AC_CACHE_CHECK(for SSE4 support, libc_cv_cc_sse4, [dnl +if AC_TRY_COMMAND([${CC-cc} -msse4 -xc /dev/null -S -o /dev/null]); then + libc_cv_cc_sse4=yes +else + libc_cv_cc_sse4=no +fi]) +if test $libc_cv_cc_sse4 = yes; then + AC_DEFINE(HAVE_SSE4_SUPPORT) +fi diff --git a/sysdeps/powerpc/sysdep.h b/sysdeps/powerpc/sysdep.h index 43edeb71eb..f5c79c54ef 100644 --- a/sysdeps/powerpc/sysdep.h +++ b/sysdeps/powerpc/sysdep.h @@ -44,8 +44,8 @@ #define PPC_FEATURE_PA6T 0x00000800 /* PA Semi 6T Core */ #define PPC_FEATURE_HAS_DFP 0x00000400 /* Decimal FP Unit */ #define PPC_FEATURE_POWER6_EXT 0x00000200 /* P6 + mffgpr/mftgpr */ -#define PPC_FEATURE_HAS_VSX 0x00000100 /* P7 Vector Extension. */ -#define PPC_FEATURE_ARCH_2_06 0x00000080 /* ISA 2.06 */ +#define PPC_FEATURE_ARCH_2_06 0x00000100 /* ISA 2.06 */ +#define PPC_FEATURE_HAS_VSX 0x00000080 /* P7 Vector Extension. */ #define PPC_FEATURE_970 (PPC_FEATURE_POWER4 + PPC_FEATURE_HAS_ALTIVEC) #ifdef __ASSEMBLER__ diff --git a/sysdeps/unix/sysv/linux/sys/epoll.h b/sysdeps/unix/sysv/linux/sys/epoll.h index 12de0bcfe2..ca1d3d0459 100644 --- a/sysdeps/unix/sysv/linux/sys/epoll.h +++ b/sysdeps/unix/sysv/linux/sys/epoll.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2002-2006, 2007, 2008 Free Software Foundation, Inc. +/* Copyright (C) 2002-2006, 2007, 2008, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -31,7 +31,7 @@ typedef __sigset_t sigset_t; #endif -/* Flags to be passed to epoll_create2. */ +/* Flags to be passed to epoll_create1. */ enum { EPOLL_CLOEXEC = 02000000, diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile index da82093381..78fdb04fcb 100644 --- a/sysdeps/x86_64/Makefile +++ b/sysdeps/x86_64/Makefile @@ -4,6 +4,7 @@ long-double-fcts = yes ifeq ($(subdir),csu) sysdep_routines += hp-timing elide-routines.os += hp-timing +gen-as-const-headers += link-defines.sym endif ifeq ($(subdir),gmon) diff --git a/sysdeps/x86_64/bits/link.h b/sysdeps/x86_64/bits/link.h index 5676b78753..643a293bb0 100644 --- a/sysdeps/x86_64/bits/link.h +++ b/sysdeps/x86_64/bits/link.h @@ -65,10 +65,19 @@ __END_DECLS /* Registers for entry into PLT on x86-64. 
*/ # if __GNUC_PREREQ (4,0) typedef float La_x86_64_xmm __attribute__ ((__vector_size__ (16))); +typedef float La_x86_64_ymm __attribute__ ((__vector_size__ (32))); # else typedef float La_x86_64_xmm __attribute__ ((__mode__ (__V4SF__))); # endif +typedef union +{ +# if __GNUC_PREREQ (4,0) + La_x86_64_ymm ymm[2]; +# endif + La_x86_64_xmm xmm[4]; +} La_x86_64_vector __attribute__ ((aligned(16))); + typedef struct La_x86_64_regs { uint64_t lr_rdx; @@ -80,6 +89,7 @@ typedef struct La_x86_64_regs uint64_t lr_rbp; uint64_t lr_rsp; La_x86_64_xmm lr_xmm[8]; + La_x86_64_vector lr_vector[8]; } La_x86_64_regs; /* Return values for calls from PLT on x86-64. */ @@ -91,6 +101,8 @@ typedef struct La_x86_64_retval La_x86_64_xmm lrv_xmm1; long double lrv_st0; long double lrv_st1; + La_x86_64_vector lrv_vector0; + La_x86_64_vector lrv_vector1; } La_x86_64_retval; diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S index d8d9bc12a4..49d239f075 100644 --- a/sysdeps/x86_64/dl-trampoline.S +++ b/sysdeps/x86_64/dl-trampoline.S @@ -17,7 +17,9 @@ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. */ +#include <config.h> #include <sysdep.h> +#include <link-defines.h> .text .globl _dl_runtime_resolve @@ -89,25 +91,85 @@ _dl_runtime_profile: /* Actively align the La_x86_64_regs structure. */ andq $0xfffffffffffffff0, %rsp - subq $192, %rsp # sizeof(La_x86_64_regs) +# ifdef HAVE_AVX_SUPPORT + /* sizeof(La_x86_64_regs). Need extra space for 8 SSE registers + to detect if any xmm0-xmm7 registers are changed by audit + module. */ + subq $(LR_SIZE + XMM_SIZE*8), %rsp +# else + subq $LR_SIZE, %rsp # sizeof(La_x86_64_regs) +# endif movq %rsp, 24(%rbx) - movq %rdx, (%rsp) # Fill the La_x86_64_regs structure. - movq %r8, 8(%rsp) - movq %r9, 16(%rsp) - movq %rcx, 24(%rsp) - movq %rsi, 32(%rsp) - movq %rdi, 40(%rsp) - movq %rbp, 48(%rsp) + /* Fill the La_x86_64_regs structure. */ + movq %rdx, LR_RDX_OFFSET(%rsp) + movq %r8, LR_R8_OFFSET(%rsp) + movq %r9, LR_R9_OFFSET(%rsp) + movq %rcx, LR_RCX_OFFSET(%rsp) + movq %rsi, LR_RSI_OFFSET(%rsp) + movq %rdi, LR_RDI_OFFSET(%rsp) + movq %rbp, LR_RBP_OFFSET(%rsp) + leaq 48(%rbx), %rax - movq %rax, 56(%rsp) - movaps %xmm0, 64(%rsp) - movaps %xmm1, 80(%rsp) - movaps %xmm2, 96(%rsp) - movaps %xmm3, 112(%rsp) - movaps %xmm4, 128(%rsp) - movaps %xmm5, 144(%rsp) - movaps %xmm7, 160(%rsp) + movq %rax, LR_RSP_OFFSET(%rsp) + + /* We always store the XMM registers even if AVX is available. + This is to provide backward binary compatility for existing + audit modules. */ + movaps %xmm0, (LR_XMM_OFFSET)(%rsp) + movaps %xmm1, (LR_XMM_OFFSET + XMM_SIZE)(%rsp) + movaps %xmm2, (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp) + movaps %xmm3, (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp) + movaps %xmm4, (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp) + movaps %xmm5, (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp) + movaps %xmm6, (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp) + movaps %xmm7, (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp) + +# ifdef HAVE_AVX_SUPPORT + .data +L(have_avx): + .zero 4 + .size L(have_avx), 4 + .previous + + cmpl $0, L(have_avx)(%rip) + jne 1f + movq %rbx, %r11 # Save rbx + movl $1, %eax + cpuid + movq %r11,%rbx # Restore rbx + movl $1, %eax + testl $(1 << 28), %ecx + jne 2f + negl %eax +2: movl %eax, L(have_avx)(%rip) + cmpl $0, %eax + +1: js L(no_avx1) + + /* This is to support AVX audit modules. 
*/ + vmovdqu %ymm0, (LR_VECTOR_OFFSET)(%rsp) + vmovdqu %ymm1, (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp) + vmovdqu %ymm2, (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp) + vmovdqu %ymm3, (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp) + vmovdqu %ymm4, (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp) + vmovdqu %ymm5, (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp) + vmovdqu %ymm6, (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp) + vmovdqu %ymm7, (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp) + + /* Save xmm0-xmm7 registers to detect if any of them are + changed by audit module. */ + vmovdqa %xmm0, (LR_SIZE)(%rsp) + vmovdqa %xmm1, (LR_SIZE + XMM_SIZE)(%rsp) + vmovdqa %xmm2, (LR_SIZE + XMM_SIZE*2)(%rsp) + vmovdqa %xmm3, (LR_SIZE + XMM_SIZE*3)(%rsp) + vmovdqa %xmm4, (LR_SIZE + XMM_SIZE*4)(%rsp) + vmovdqa %xmm5, (LR_SIZE + XMM_SIZE*5)(%rsp) + vmovdqa %xmm6, (LR_SIZE + XMM_SIZE*6)(%rsp) + vmovdqa %xmm7, (LR_SIZE + XMM_SIZE*7)(%rsp) + +L(no_avx1): +# endif movq %rsp, %rcx # La_x86_64_regs pointer to %rcx. movq 48(%rbx), %rdx # Load return address if needed. @@ -119,27 +181,87 @@ _dl_runtime_profile: movq %rax, %r11 # Save return value. movq 8(%rbx), %rax # Get back register content. - movq (%rsp), %rdx - movq 8(%rsp), %r8 - movq 16(%rsp), %r9 - movaps 64(%rsp), %xmm0 - movaps 80(%rsp), %xmm1 - movaps 96(%rsp), %xmm2 - movaps 112(%rsp), %xmm3 - movaps 128(%rsp), %xmm4 - movaps 144(%rsp), %xmm5 - movaps 160(%rsp), %xmm7 - + movq LR_RDX_OFFSET(%rsp), %rdx + movq LR_R8_OFFSET(%rsp), %r8 + movq LR_R9_OFFSET(%rsp), %r9 + + movaps (LR_XMM_OFFSET)(%rsp), %xmm0 + movaps (LR_XMM_OFFSET + XMM_SIZE)(%rsp), %xmm1 + movaps (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp), %xmm2 + movaps (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp), %xmm3 + movaps (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp), %xmm4 + movaps (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp), %xmm5 + movaps (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm6 + movaps (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm7 + +# ifdef HAVE_AVX_SUPPORT + cmpl $0, L(have_avx)(%rip) + js L(no_avx2) + + /* Check if any xmm0-xmm7 registers are changed by audit + module. */ + vpcmpeqq (LR_SIZE)(%rsp), %xmm0, %xmm8 + vpmovmskb %xmm8, %esi + cmpl $0xffff, %esi + je 1f + vmovdqu (LR_VECTOR_OFFSET)(%rsp), %ymm0 + +1: vpcmpeqq (LR_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm8 + vpmovmskb %xmm8, %esi + cmpl $0xffff, %esi + je 1f + vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp), %ymm1 + +1: vpcmpeqq (LR_SIZE + XMM_SIZE*2)(%rsp), %xmm2, %xmm8 + vpmovmskb %xmm8, %esi + cmpl $0xffff, %esi + je 1f + vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp), %ymm2 + +1: vpcmpeqq (LR_SIZE + XMM_SIZE*3)(%rsp), %xmm3, %xmm8 + vpmovmskb %xmm8, %esi + cmpl $0xffff, %esi + je 1f + vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp), %ymm3 + +1: vpcmpeqq (LR_SIZE + XMM_SIZE*4)(%rsp), %xmm4, %xmm8 + vpmovmskb %xmm8, %esi + cmpl $0xffff, %esi + je 1f + vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp), %ymm4 + +1: vpcmpeqq (LR_SIZE + XMM_SIZE*5)(%rsp), %xmm5, %xmm8 + vpmovmskb %xmm8, %esi + cmpl $0xffff, %esi + je 1f + vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp), %ymm5 + +1: vpcmpeqq (LR_SIZE + XMM_SIZE*6)(%rsp), %xmm6, %xmm8 + vpmovmskb %xmm8, %esi + cmpl $0xffff, %esi + je 1f + vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp), %ymm6 + +1: vpcmpeqq (LR_SIZE + XMM_SIZE*7)(%rsp), %xmm7, %xmm8 + vpmovmskb %xmm8, %esi + cmpl $0xffff, %esi + je 1f + vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp), %ymm7 + +L(no_avx2): +1: +# endif movq 16(%rbx), %r10 # Anything in framesize? testq %r10, %r10 - jns 1f + jns 3f /* There's nothing in the frame size, so there will be no call to the _dl_call_pltexit. 
*/ - movq 24(%rsp), %rcx # Get back registers content. - movq 32(%rsp), %rsi - movq 40(%rsp), %rdi + /* Get back registers content. */ + movq LR_RCX_OFFSET(%rsp), %rcx + movq LR_RSI_OFFSET(%rsp), %rsi + movq LR_RDI_OFFSET(%rsp), %rdi movq %rbx, %rsp movq (%rsp), %rbx @@ -151,7 +273,7 @@ _dl_runtime_profile: cfi_adjust_cfa_offset(-48) jmp *%r11 # Jump to function address. -1: +3: cfi_adjust_cfa_offset(48) cfi_rel_offset(%rbx, 0) cfi_def_cfa_register(%rbx) @@ -161,7 +283,7 @@ _dl_runtime_profile: temporary buffer of the size specified by the 'framesize' returned from _dl_profile_fixup */ - leaq 56(%rbx), %rsi # stack + leaq LR_RSP_OFFSET(%rbx), %rsi # stack addq $8, %r10 andq $0xfffffffffffffff0, %r10 movq %r10, %rcx @@ -183,31 +305,80 @@ _dl_runtime_profile: _dl_call_pltexit. The La_x86_64_regs is being pointed by rsp now, so we just need to allocate the sizeof(La_x86_64_retval) space on the stack, since the alignment has already been taken care of. */ - - subq $80, %rsp # sizeof(La_x86_64_retval) +# ifdef HAVE_AVX_SUPPORT + /* sizeof(La_x86_64_retval). Need extra space for 2 SSE + registers to detect if xmm0/xmm1 registers are changed + by audit module. */ + subq $(LRV_SIZE + XMM_SIZE*2), %rsp +# else + subq $LRV_SIZE, %rsp # sizeof(La_x86_64_retval) +# endif movq %rsp, %rcx # La_x86_64_retval argument to %rcx. - movq %rax, (%rcx) # Fill in the La_x86_64_retval structure. - movq %rdx, 8(%rcx) - movaps %xmm0, 16(%rcx) - movaps %xmm1, 32(%rcx) - fstpt 48(%rcx) - fstpt 64(%rcx) + /* Fill in the La_x86_64_retval structure. */ + movq %rax, LRV_RAX_OFFSET(%rcx) + movq %rdx, LRV_RDX_OFFSET(%rcx) + + movaps %xmm0, LRV_XMM0_OFFSET(%rcx) + movaps %xmm1, LRV_XMM1_OFFSET(%rcx) + +# ifdef HAVE_AVX_SUPPORT + cmpl $0, L(have_avx)(%rip) + js L(no_avx3) + + /* This is to support AVX audit modules. */ + vmovdqu %ymm0, LRV_VECTOR0_OFFSET(%rcx) + vmovdqu %ymm1, LRV_VECTOR1_OFFSET(%rcx) + + /* Save xmm0/xmm1 registers to detect if they are changed + by audit module. */ + vmovdqa %xmm0, (LRV_SIZE)(%rcx) + vmovdqa %xmm1, (LRV_SIZE + XMM_SIZE)(%rcx) + +L(no_avx3): +# endif + + fstpt LRV_ST0_OFFSET(%rcx) + fstpt LRV_ST1_OFFSET(%rcx) movq 24(%rbx), %rdx # La_x86_64_regs argument to %rdx. movq 40(%rbx), %rsi # Copy args pushed by PLT in register. movq 32(%rbx), %rdi # %rdi: link_map, %rsi: reloc_index call _dl_call_pltexit - movq (%rsp), %rax # Restore return registers. - movq 8(%rsp), %rdx - movaps 16(%rsp), %xmm0 - movaps 32(%rsp), %xmm1 - fldt 64(%rsp) - fldt 48(%rsp) + /* Restore return registers. */ + movq LRV_RAX_OFFSET(%rsp), %rax + movq LRV_RDX_OFFSET(%rsp), %rdx + + movaps LRV_XMM0_OFFSET(%rsp), %xmm0 + movaps LRV_XMM1_OFFSET(%rsp), %xmm1 + +# ifdef HAVE_AVX_SUPPORT + cmpl $0, L(have_avx)(%rip) + js L(no_avx4) + + /* Check if xmm0/xmm1 registers are changed by audit module. 
*/ + vpcmpeqq (LRV_SIZE)(%rsp), %xmm0, %xmm2 + vpmovmskb %xmm2, %esi + cmpl $0xffff, %esi + je 1f + vmovdqu LRV_VECTOR0_OFFSET(%rsp), %ymm0 + +1: vpcmpeqq (LRV_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm2 + vpmovmskb %xmm2, %esi + cmpl $0xffff, %esi + je 1f + vmovdqu LRV_VECTOR1_OFFSET(%rsp), %ymm1 + +L(no_avx4): +1: +# endif + + fldt LRV_ST1_OFFSET(%rsp) + fldt LRV_ST0_OFFSET(%rsp) movq %rbx, %rsp - movq (%rsp), %rbx + movq (%rsp), %rbx cfi_restore(rbx) cfi_def_cfa_register(%rsp) diff --git a/sysdeps/x86_64/elf/configure b/sysdeps/x86_64/elf/configure index 774654997d..221e74c2b8 100755 --- a/sysdeps/x86_64/elf/configure +++ b/sysdeps/x86_64/elf/configure @@ -79,3 +79,28 @@ cat >>confdefs.h <<\_ACEOF #define PI_STATIC_AND_HIDDEN 1 _ACEOF + +{ $as_echo "$as_me:$LINENO: checking for AVX support" >&5 +$as_echo_n "checking for AVX support... " >&6; } +if test "${libc_cv_cc_avx+set}" = set; then + $as_echo_n "(cached) " >&6 +else + if { ac_try='${CC-cc} -mavx -xc /dev/null -S -o /dev/null' + { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 + (eval $ac_try) 2>&5 + ac_status=$? + $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); }; }; then + libc_cv_cc_avx=yes +else + libc_cv_cc_avx=no +fi +fi +{ $as_echo "$as_me:$LINENO: result: $libc_cv_cc_avx" >&5 +$as_echo "$libc_cv_cc_avx" >&6; } +if test $libc_cv_cc_avx = yes; then + cat >>confdefs.h <<\_ACEOF +#define HAVE_AVX_SUPPORT 1 +_ACEOF + +fi diff --git a/sysdeps/x86_64/elf/configure.in b/sysdeps/x86_64/elf/configure.in index 9cb59d009c..14d1875302 100644 --- a/sysdeps/x86_64/elf/configure.in +++ b/sysdeps/x86_64/elf/configure.in @@ -32,3 +32,14 @@ fi dnl It is always possible to access static and hidden symbols in an dnl position independent way. AC_DEFINE(PI_STATIC_AND_HIDDEN) + +dnl Check if -mavx works. 
+AC_CACHE_CHECK(for AVX support, libc_cv_cc_avx, [dnl +if AC_TRY_COMMAND([${CC-cc} -mavx -xc /dev/null -S -o /dev/null]); then + libc_cv_cc_avx=yes +else + libc_cv_cc_avx=no +fi]) +if test $libc_cv_cc_avx = yes; then + AC_DEFINE(HAVE_AVX_SUPPORT) +fi diff --git a/sysdeps/x86_64/link-defines.sym b/sysdeps/x86_64/link-defines.sym new file mode 100644 index 0000000000..1694d883ad --- /dev/null +++ b/sysdeps/x86_64/link-defines.sym @@ -0,0 +1,28 @@ +#include "link.h" +#include <stddef.h> + +-- +VECTOR_SIZE sizeof (La_x86_64_vector) +XMM_SIZE sizeof (La_x86_64_xmm) + +LR_SIZE sizeof (struct La_x86_64_regs) +LR_RDX_OFFSET offsetof (struct La_x86_64_regs, lr_rdx) +LR_R8_OFFSET offsetof (struct La_x86_64_regs, lr_r8) +LR_R9_OFFSET offsetof (struct La_x86_64_regs, lr_r9) +LR_RCX_OFFSET offsetof (struct La_x86_64_regs, lr_rcx) +LR_RSI_OFFSET offsetof (struct La_x86_64_regs, lr_rsi) +LR_RDI_OFFSET offsetof (struct La_x86_64_regs, lr_rdi) +LR_RBP_OFFSET offsetof (struct La_x86_64_regs, lr_rbp) +LR_RSP_OFFSET offsetof (struct La_x86_64_regs, lr_rsp) +LR_XMM_OFFSET offsetof (struct La_x86_64_regs, lr_xmm) +LR_VECTOR_OFFSET offsetof (struct La_x86_64_regs, lr_vector) + +LRV_SIZE sizeof (struct La_x86_64_retval) +LRV_RAX_OFFSET offsetof (struct La_x86_64_retval, lrv_rax) +LRV_RDX_OFFSET offsetof (struct La_x86_64_retval, lrv_rdx) +LRV_XMM0_OFFSET offsetof (struct La_x86_64_retval, lrv_xmm0) +LRV_XMM1_OFFSET offsetof (struct La_x86_64_retval, lrv_xmm1) +LRV_ST0_OFFSET offsetof (struct La_x86_64_retval, lrv_st0) +LRV_ST1_OFFSET offsetof (struct La_x86_64_retval, lrv_st1) +LRV_VECTOR0_OFFSET offsetof (struct La_x86_64_retval, lrv_vector0) +LRV_VECTOR1_OFFSET offsetof (struct La_x86_64_retval, lrv_vector1) diff --git a/sysdeps/x86_64/memcmp.S b/sysdeps/x86_64/memcmp.S new file mode 100644 index 0000000000..a9fe13ae58 --- /dev/null +++ b/sysdeps/x86_64/memcmp.S @@ -0,0 +1,359 @@ +/* memcmp with SSE2 + Copyright (C) 2009 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <sysdep.h> + + .text +ENTRY (memcmp) + test %rdx, %rdx + jz L(finz) + cmpq $1, %rdx + jle L(finr1b) + subq %rdi, %rsi + movq %rdx, %r10 + cmpq $32, %r10 + jge L(gt32) + /* Handle small chunks and last block of less than 32 bytes. 
*/ +L(small): + testq $1, %r10 + jz L(s2b) + movzbl (%rdi), %eax + movzbl (%rdi, %rsi), %edx + subq $1, %r10 + je L(finz1) + addq $1, %rdi + subl %edx, %eax + jnz L(exit) +L(s2b): + testq $2, %r10 + jz L(s4b) + movzwl (%rdi), %eax + movzwl (%rdi, %rsi), %edx + subq $2, %r10 + je L(fin2_7) + addq $2, %rdi + cmpl %edx, %eax + jnz L(fin2_7) +L(s4b): + testq $4, %r10 + jz L(s8b) + movl (%rdi), %eax + movl (%rdi, %rsi), %edx + subq $4, %r10 + je L(fin2_7) + addq $4, %rdi + cmpl %edx, %eax + jnz L(fin2_7) +L(s8b): + testq $8, %r10 + jz L(s16b) + movq (%rdi), %rax + movq (%rdi, %rsi), %rdx + subq $8, %r10 + je L(fin2_7) + addq $8, %rdi + cmpq %rdx, %rax + jnz L(fin2_7) +L(s16b): + movdqu (%rdi), %xmm1 + movdqu (%rdi, %rsi), %xmm0 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + xorl %eax, %eax + subl $0xffff, %edx + jz L(finz) + bsfl %edx, %ecx + leaq (%rdi, %rcx), %rcx + movzbl (%rcx), %eax + movzbl (%rsi, %rcx), %edx + jmp L(finz1) + + .p2align 4,, 4 +L(finr1b): + movzbl (%rdi), %eax + movzbl (%rsi), %edx +L(finz1): + subl %edx, %eax +L(exit): + ret + + .p2align 4,, 4 +L(fin2_7): + cmpq %rdx, %rax + jz L(finz) + movq %rax, %r11 + subq %rdx, %r11 + bsfq %r11, %rcx + sarq $3, %rcx + salq $3, %rcx + sarq %cl, %rax + movzbl %al, %eax + sarq %cl, %rdx + movzbl %dl, %edx + subl %edx, %eax + ret + + .p2align 4,, 4 +L(finz): + xorl %eax, %eax + ret + + /* For blocks bigger than 32 bytes + 1. Advance one of the addr pointer to be 16B aligned. + 2. Treat the case of both addr pointers aligned to 16B + separately to avoid movdqu. + 3. Handle any blocks of greater than 64 consecutive bytes with + unrolling to reduce branches. + 4. At least one addr pointer is 16B aligned, use memory version + of pcmbeqb. + */ + .p2align 4,, 4 +L(gt32): + movq %rdx, %r11 + addq %rdi, %r11 + movq %rdi, %r8 + + andq $15, %r8 + jz L(16am) + /* Both pointers may be misaligned. */ + movdqu (%rdi), %xmm1 + movdqu (%rdi, %rsi), %xmm0 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + subl $0xffff, %edx + jnz L(neq) + neg %r8 + leaq 16(%rdi, %r8), %rdi +L(16am): + /* Handle two 16B aligned pointers separately. */ + testq $15, %rsi + jz L(ATR) + testq $16, %rdi + jz L(A32) + movdqu (%rdi, %rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi +L(A32): + movq %r11, %r10 + andq $-32, %r10 + cmpq %r10, %rdi + jge L(mt16) + /* Pre-unroll to be ready for unrolled 64B loop. 
*/ + testq $32, %rdi + jz L(A64) + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + +L(A64): + movq %r11, %r10 + andq $-64, %r10 + cmpq %r10, %rdi + jge L(mt32) + +L(A64main): + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + cmpq %rdi, %r10 + jne L(A64main) + +L(mt32): + movq %r11, %r10 + andq $-32, %r10 + cmpq %r10, %rdi + jge L(mt16) + +L(A32main): + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + cmpq %rdi, %r10 + jne L(A32main) +L(mt16): + subq %rdi, %r11 + je L(finz) + movq %r11, %r10 + jmp L(small) + + .p2align 4,, 4 +L(neq): + bsfl %edx, %ecx + movzbl (%rdi, %rcx), %eax + addq %rdi, %rsi + movzbl (%rsi,%rcx), %edx + jmp L(finz1) + + .p2align 4,, 4 +L(ATR): + movq %r11, %r10 + andq $-32, %r10 + cmpq %r10, %rdi + jge L(mt16) + testq $16, %rdi + jz L(ATR32) + + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + cmpq %rdi, %r10 + je L(mt16) + +L(ATR32): + movq %r11, %r10 + andq $-64, %r10 + testq $32, %rdi + jz L(ATR64) + + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + +L(ATR64): + cmpq %rdi, %r10 + je L(mt32) + +L(ATR64main): + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + cmpq %rdi, %r10 + jne L(ATR64main) + + movq %r11, %r10 + andq $-32, %r10 + cmpq %r10, %rdi + jge L(mt16) + +L(ATR32res): + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + cmpq %r10, %rdi + jne L(ATR32res) + + subq %rdi, %r11 + je L(finz) + movq %r11, %r10 + jmp L(small) + /* Align to 16byte to improve instruction fetch. 
*/ + .p2align 4,, 4 +END(memcmp) + +#undef bcmp +weak_alias (memcmp, bcmp) +libc_hidden_builtin_def (memcmp) diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 1c35e1ffb4..71e85f0652 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -4,5 +4,11 @@ gen-as-const-headers += ifunc-defines.sym endif ifeq ($(subdir),string) -sysdep_routines += strncmp-c +sysdep_routines += stpncpy-c strncpy-c strncmp-c +ifeq (yes,$(config-cflags-sse4)) +sysdep_routines += strcspn-c strpbrk-c strspn-c +CFLAGS-strcspn-c.c += -msse4 +CFLAGS-strpbrk-c.c += -msse4 +CFLAGS-strspn-c.c += -msse4 +endif endif diff --git a/sysdeps/x86_64/multiarch/rawmemchr.S b/sysdeps/x86_64/multiarch/rawmemchr.S index 93ca631633..d4f265f430 100644 --- a/sysdeps/x86_64/multiarch/rawmemchr.S +++ b/sysdeps/x86_64/multiarch/rawmemchr.S @@ -77,6 +77,7 @@ __rawmemchr_sse42: # undef ENTRY # define ENTRY(name) \ .type __rawmemchr_sse2, @function; \ + .align 16; \ __rawmemchr_sse2: cfi_startproc; \ CALL_MCOUNT # undef END diff --git a/sysdeps/x86_64/multiarch/stpcpy.S b/sysdeps/x86_64/multiarch/stpcpy.S new file mode 100644 index 0000000000..b63d308edc --- /dev/null +++ b/sysdeps/x86_64/multiarch/stpcpy.S @@ -0,0 +1,7 @@ +#define USE_AS_STPCPY +#define STRCPY __stpcpy +#include "strcpy.S" + +weak_alias (__stpcpy, stpcpy) +libc_hidden_def (__stpcpy) +libc_hidden_builtin_def (stpcpy) diff --git a/sysdeps/x86_64/multiarch/stpncpy-c.c b/sysdeps/x86_64/multiarch/stpncpy-c.c new file mode 100644 index 0000000000..2fde77dcab --- /dev/null +++ b/sysdeps/x86_64/multiarch/stpncpy-c.c @@ -0,0 +1,8 @@ +#define STPNCPY __stpncpy_sse2 +#ifdef SHARED +#undef libc_hidden_def +#define libc_hidden_def(name) \ + __hidden_ver1 (__stpncpy_sse2, __GI___stpncpy, __stpncpy_sse2); +#endif + +#include "stpncpy.c" diff --git a/sysdeps/x86_64/multiarch/stpncpy.S b/sysdeps/x86_64/multiarch/stpncpy.S new file mode 100644 index 0000000000..ff89a89491 --- /dev/null +++ b/sysdeps/x86_64/multiarch/stpncpy.S @@ -0,0 +1,6 @@ +#define STRCPY __stpncpy +#define USE_AS_STPCPY +#define USE_AS_STRNCPY +#include "strcpy.S" + +weak_alias (__stpncpy, stpncpy) diff --git a/sysdeps/x86_64/multiarch/strcmp.S b/sysdeps/x86_64/multiarch/strcmp.S index 2f4bf17d95..37985036aa 100644 --- a/sysdeps/x86_64/multiarch/strcmp.S +++ b/sysdeps/x86_64/multiarch/strcmp.S @@ -1659,6 +1659,7 @@ LABEL(unaligned_table): # undef ENTRY # define ENTRY(name) \ .type STRCMP_SSE2, @function; \ + .align 16; \ STRCMP_SSE2: cfi_startproc; \ CALL_MCOUNT # undef END diff --git a/sysdeps/x86_64/multiarch/strcpy.S b/sysdeps/x86_64/multiarch/strcpy.S new file mode 100644 index 0000000000..25cd01307d --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcpy.S @@ -0,0 +1,1918 @@ +/* strcpy with SSSE3 + Copyright (C) 2009 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <sysdep.h> +#include <ifunc-defines.h> + +#if !defined (USE_AS_STPCPY) && !defined (USE_AS_STRNCPY) +# ifndef STRCPY +# define STRCPY strcpy +# endif +#endif + +#ifdef USE_AS_STPCPY +# ifdef USE_AS_STRNCPY +# define STRCPY_SSSE3 __stpncpy_ssse3 +# define STRCPY_SSE2 __stpncpy_sse2 +# define __GI_STRCPY __GI_stpncpy +# else +# define STRCPY_SSSE3 __stpcpy_ssse3 +# define STRCPY_SSE2 __stpcpy_sse2 +# define __GI_STRCPY __GI_stpcpy +# define __GI___STRCPY __GI___stpcpy +# endif +#else +# ifdef USE_AS_STRNCPY +# define STRCPY_SSSE3 __strncpy_ssse3 +# define STRCPY_SSE2 __strncpy_sse2 +# define __GI_STRCPY __GI_strncpy +# else +# define STRCPY_SSSE3 __strcpy_ssse3 +# define STRCPY_SSE2 __strcpy_sse2 +# define __GI_STRCPY __GI_strcpy +# endif +#endif + +#ifndef LABEL +#define LABEL(l) L(l) +#endif + +/* Define multiple versions only for the definition in libc. */ +#ifndef NOT_IN_libc + .text +ENTRY(STRCPY) + .type STRCPY, @gnu_indirect_function + cmpl $0, __cpu_features+KIND_OFFSET(%rip) + jne 1f + call __init_cpu_features +1: leaq STRCPY_SSE2(%rip), %rax + testl $(1<<9), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip) + jz 3f +/* Avoid SSSE3 strcpy on Atom since it is slow. */ + cmpl $1, __cpu_features+KIND_OFFSET(%rip) + jne 2f + cmpl $6, __cpu_features+FAMILY_OFFSET(%rip) + jne 2f + cmpl $28, __cpu_features+MODEL_OFFSET(%rip) + jz 3f +2: leaq STRCPY_SSSE3(%rip), %rax +3: ret +END(STRCPY) + + .section .text.ssse3,"ax",@progbits +STRCPY_SSSE3: + cfi_startproc + CALL_MCOUNT + +/* + * This implementation uses SSE to copy up to 16 bytes at a time. + */ +#ifdef USE_AS_STRNCPY + test %rdx, %rdx + jz LABEL(strncpy_exitz) + mov %rdx, %r8 +#else + xor %edx, %edx +#endif + mov %esi, %ecx + and $0xfffffffffffffff0, %rsi /*force rsi 16 byte align*/ + and $15, %ecx + mov %rdi, %rax /*store return parameter*/ + + + pxor %xmm0, %xmm0 /* clear %xmm0 */ + pcmpeqb (%rsi), %xmm0 /* compare 16 bytes in (%rsi) and %xmm0 for equality, try to find null char*/ + pmovmskb %xmm0, %edx /* move each byte mask of %xmm0 to edx*/ + shr %cl, %edx /* get real bits left in edx*/ + test %edx, %edx /* edx must be 0 if there is no null char from rsi+%rcx */ + jnz LABEL(less16bytes) + +#ifdef USE_AS_STRNCPY + lea -16(%r8,%rcx), %r11 + cmp $0, %r11 + jle LABEL(less16bytes) /* if r8 + rcx <= 16, branch to less16bytes. 
*/ +#endif + + mov %rcx, %r9 + or %edi, %ecx + and $15, %ecx + lea -16(%r9), %r10 + jz LABEL(ashr_0) /* ecx must be 0 if offset of rsi and rdi is 16 byte align*/ + + neg %r10 /* store the rest in rsi aligned 16 bytes for unaligned_exit*/ + + pxor %xmm0, %xmm0 /* clear %xmm0, may be polluted by unaligned operation*/ + pcmpeqb 16(%rsi), %xmm0 /* compare 16 bytes in (%rsi) and %xmm0 for equality, try to find null char*/ + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(less32bytes) + /* + * at least 16 byte available to fill destination rdi + */ +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(less32bytes_strncpy_truncation) +#endif + mov (%rsi, %r9), %rdx + mov %rdx, (%rdi) + mov 8(%rsi, %r9), %rdx + mov %rdx, 8(%rdi) + + /* + * so far destatination rdi may be aligned by 16, re-calculate rsi to jump + * crossponding case + * rcx is offset of rsi + * rax is offset of rdi + */ + + and $0xfffffffffffffff0, %rdi /* force rdi 16 byte align */ + mov %rax, %rdx /* rax store orignal rdi */ + xor %rdi, %rdx /* equal to and $15, %rdx */ +#ifdef USE_AS_STRNCPY + add %rdx, %r8 +#endif + + add $16, %rdi /* next 16 bytes for rdi */ + sub %rdx, %r9 + + lea 16(%r9, %rsi), %rsi /*re-calculate rsi by (16 - rdx)+ rcx */ + mov %esi, %ecx /*store offset of rsi */ + and $0xfffffffffffffff0, %rsi /* force rsi 16 byte align */ + + and $15, %ecx /* ecx must be 0 if rdx is equal to rcx*/ + jz LABEL(ashr_0) + + lea -16(%rcx), %r10 + mov %rcx, %r9 + neg %r10 + lea LABEL(unaligned_table)(%rip), %r11 + movslq (%r11, %rcx,4), %rcx + lea (%r11, %rcx), %rcx + jmp *%rcx + + /* + * The following cases will be handled by ashr_0 & ashr_0_start + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * 0 0 0 ashr_0 + * n(1~15) n(1~15) 0 ashr_0_start + * + */ + .p2align 5 +LABEL(ashr_0): +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_aligned) +#endif + movdqa (%rsi), %xmm1 /* fetch first 16 bytes from rsi */ + movdqa %xmm1, (%rdi) /* store first 16 bytes into rdi */ + add $16, %rsi + add $16, %rdi + pcmpeqb (%rsi), %xmm0 /* compare 16 bytes in (%rsi) and %xmm0 for equality, try to find null char */ + pmovmskb %xmm0, %edx /* move each byte mask of %xmm0 to edx*/ + + test %edx, %edx /* edx must be 0 if there is no null char in rsi*/ + jnz LABEL(aligned_16bytes) + +LABEL(ashr_0_loop): +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_aligned) +#endif + movdqa (%rsi, %rcx), %xmm1 + movdqa %xmm1, (%rdi, %rcx) + add $16, %rcx + pcmpeqb (%rsi, %rcx), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(aligned_exit) + +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_aligned) +#endif + movdqa (%rsi, %rcx), %xmm1 + movdqa %xmm1, (%rdi, %rcx) + add $16, %rcx + pcmpeqb (%rsi, %rcx), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(aligned_exit) + +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_aligned) +#endif + movdqa (%rsi, %rcx), %xmm1 + movdqa %xmm1, (%rdi, %rcx) + add $16, %rcx + pcmpeqb (%rsi, %rcx), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(aligned_exit) + +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_aligned) +#endif + movdqa (%rsi, %rcx), %xmm1 + movdqa %xmm1, (%rdi, %rcx) + add $16, %rcx + pcmpeqb (%rsi, %rcx), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jz LABEL(ashr_0_loop) + + jmp LABEL(aligned_exit) + .p2align 4 + +/* + * The following cases will be handled by ashr_15 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(15) n - 15 
15((16 - (n -15) + n)%16 ashr_15 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_15): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_15_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $15, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $15, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_15_use_ssse3) + +/* + * The following cases will be handled by ashr_14 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(14~15) n - 14 14((16 - (n -14) + n)%16 ashr_14 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_14): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_14_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $14, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $14, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_14_use_ssse3) + +/* + * The following cases will be handled by ashr_13 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(13~15) n - 13 13((16 - (n -13) + n)%16 ashr_13 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_13): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_13_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $13, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $13, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe 
LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_13_use_ssse3) + +/* + * The following cases will be handled by ashr_12 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(12~15) n - 12 12((16 - (n -12) + n)%16 ashr_12 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_12): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_12_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $12, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $12, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_12_use_ssse3) + +/* + * The following cases will be handled by ashr_11 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(11~15) n - 11 11((16 - (n -11) + n)%16 ashr_11 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_11): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_11_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $11, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $11, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_11_use_ssse3) + +/* + * The following cases will be handled by ashr_10 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(10~15) n - 10 10((16 - (n -10) + n)%16 ashr_10 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_10): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_10_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $10, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) 
+#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $10, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_10_use_ssse3) + +/* + * The following cases will be handled by ashr_9 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(9~15) n - 9 9((16 - (n -9) + n)%16 ashr_9 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_9): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_9_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $9, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $9, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_9_use_ssse3) + +/* + * The following cases will be handled by ashr_8 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(8~15) n - 8 8((16 - (n -8) + n)%16 ashr_8 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_8): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_8_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $8, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $8, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_8_use_ssse3) + +/* + * The following cases will be handled by ashr_7 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(7~15) n - 7 7((16 - (n -7) + n)%16 ashr_7 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_7): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + .p2align 4 + +LABEL(ashr_7_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $7, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef 
USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $7, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_7_use_ssse3) + +/* + * The following cases will be handled by ashr_6 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(6~15) n - 6 6((16 - (n -6) + n)%16 ashr_6 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_6): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_6_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $6, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $6, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_6_use_ssse3) + + /* + * The following cases will be handled by ashr_5 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(5~15) n - 5 5((16 - (n -5) + n)%16 ashr_5 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_5): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_5_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $5, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $5, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_5_use_ssse3) + +/* + * + * The following cases will be handled by ashr_4 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(4~15) n - 4 4((16 - (n -4) + n)%16 ashr_4 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_4): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_4_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz 
LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $4, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $4, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_4_use_ssse3) + +/* + * + * The following cases will be handled by ashr_3 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(3~15) n - 3 3((16 - (n -3) + n)%16 ashr_3 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_3): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_3_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $3, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $3, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_3_use_ssse3) + +/* + * + * The following cases will be handled by ashr_2 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(2~15) n - 2 2((16 - (n -2) + n)%16 ashr_2 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_2): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_2_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $2, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $2, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_2_use_ssse3) + +/* + * + * The following cases will be handled by ashr_1 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(1~15) n - 1 1 ((16 - (n -1) + n)%16 ashr_1 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_1): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY 
+ cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_1_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $1, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + palignr $1, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_1_use_ssse3) + + .p2align 4 +LABEL(less32bytes): + xor %ecx, %ecx +LABEL(unaligned_exit): + add %r9, %rsi /* r9 stores original offset of rsi*/ + mov %rcx, %r9 + mov %r10, %rcx + shl %cl, %edx /* after shl, calculate the exact number to be filled*/ + mov %r9, %rcx + .p2align 4 +LABEL(aligned_exit): + add %rcx, %rdi /*locate exact address for rdi */ +LABEL(less16bytes): + add %rcx, %rsi /*locate exact address for rsi */ +LABEL(aligned_16bytes): +#ifdef USE_AS_STRNCPY + mov $1, %r9d + lea -1(%r8), %rcx + shl %cl, %r9d + cmp $32, %r8 + ja LABEL(strncpy_tail) + or %r9d, %edx +LABEL(strncpy_tail): +#endif + bsf %rdx, %rcx /*If a least significant 1 bit in %rdx is found, its bit index is stored in %rcx*/ + lea LABEL(tail_table)(%rip), %r11 + movslq (%r11, %rcx,4), %rcx + lea (%r11, %rcx), %rcx + jmp *%rcx + +#ifdef USE_AS_STRNCPY + .p2align 4 +LABEL(less32bytes_strncpy_truncation): + xor %ecx, %ecx +LABEL(strncpy_truncation_unaligned): + add %r9, %rsi +LABEL(strncpy_truncation_aligned): + add %rcx, %rdi + add %rcx, %rsi + add $16, %r8 + lea -1(%r8), %rcx + lea LABEL(tail_table)(%rip), %r11 + movslq (%r11, %rcx,4), %rcx + lea (%r11, %rcx), %rcx + jmp *%rcx + .p2align 4 +LABEL(strncpy_exitz): + mov %rdi, %rax + ret +#endif + +#ifdef USE_AS_STRNCPY + .p2align 4 +LABEL(strncpy_fill_tail): + mov %rax, %rdx + movzx %cl, %rax + mov %r8, %rcx + add %rax, %rdi + xor %eax, %eax + shr $3, %ecx + jz LABEL(strncpy_fill_less_8) + + rep stosq +LABEL(strncpy_fill_less_8): + mov %r8, %rcx + and $7, %ecx + jz LABEL(strncpy_fill_return) +LABEL(strncpy_fill_less_7): + sub $1, %ecx + mov %al, (%rdi, %rcx) + jnz LABEL(strncpy_fill_less_7) +LABEL(strncpy_fill_return): +#ifdef USE_AS_STPCPY + cmpb $1, (%rdx) + sbb $-1, %rdx +#endif + mov %rdx, %rax + ret +#endif + .p2align 4 +LABEL(tail_0): + mov (%rsi), %cl + mov %cl, (%rdi) +#ifdef USE_AS_STPCPY + mov %rdi, %rax +#endif +#ifdef USE_AS_STRNCPY + mov $1, %cl + sub $1, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_1): + mov (%rsi), %cx + mov %cx, (%rdi) +#ifdef USE_AS_STPCPY + lea 1(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $2, %cl + sub $2, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_2): + mov (%rsi), %cx + mov %cx, (%rdi) + mov 1(%rsi), %cx + mov %cx, 1(%rdi) +#ifdef USE_AS_STPCPY + lea 2(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $3, %cl + sub $3, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_3): + mov (%rsi), %ecx + mov %ecx, (%rdi) +#ifdef 
USE_AS_STPCPY + lea 3(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $4, %cl + sub $4, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_4): + mov (%rsi), %ecx + mov %ecx, (%rdi) + mov 1(%rsi), %edx + mov %edx, 1(%rdi) +#ifdef USE_AS_STPCPY + lea 4(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $5, %cl + sub $5, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_5): + mov (%rsi), %ecx + mov %ecx, (%rdi) + mov 2(%rsi), %edx + mov %edx, 2(%rdi) +#ifdef USE_AS_STPCPY + lea 5(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $6, %cl + sub $6, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_6): + mov (%rsi), %ecx + mov %ecx, (%rdi) + mov 3(%rsi), %edx + mov %edx,3(%rdi) +#ifdef USE_AS_STPCPY + lea 6(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $7, %cl + sub $7, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_7): + mov (%rsi), %rcx + mov %rcx, (%rdi) +#ifdef USE_AS_STPCPY + lea 7(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $8, %cl + sub $8, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_8): + + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 5(%rsi), %edx + mov %edx, 5(%rdi) +#ifdef USE_AS_STPCPY + lea 8(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $9, %cl + sub $9, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_9): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 6(%rsi), %edx + mov %edx, 6(%rdi) +#ifdef USE_AS_STPCPY + lea 9(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $10, %cl + sub $10, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_10): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 7(%rsi), %edx + mov %edx, 7(%rdi) +#ifdef USE_AS_STPCPY + lea 10(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $11, %cl + sub $11, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_11): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %edx + mov %edx, 8(%rdi) +#ifdef USE_AS_STPCPY + lea 11(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $12, %cl + sub $12, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_12): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 5(%rsi), %rcx + mov %rcx, 5(%rdi) +#ifdef USE_AS_STPCPY + lea 12(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $13, %cl + sub $13, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_13): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 6(%rsi), %rcx + mov %rcx, 6(%rdi) +#ifdef USE_AS_STPCPY + lea 13(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $14, %cl + sub $14, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_14): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 7(%rsi), %rcx + mov %rcx, 7(%rdi) +#ifdef USE_AS_STPCPY + lea 14(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov 
$15, %cl + sub $15, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + +LABEL(tail_15): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) +#ifdef USE_AS_STPCPY + lea 15(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $16, %cl + sub $16, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + + ret + + .p2align 4 +LABEL(tail_16): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %cl + mov %cl, 16(%rdi) +#ifdef USE_AS_STPCPY + lea 16(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $17, %cl + sub $17, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_17): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %cx + mov %cx, 16(%rdi) +#ifdef USE_AS_STPCPY + lea 17(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $18, %cl + sub $18, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_18): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 15(%rsi), %ecx + mov %ecx,15(%rdi) +#ifdef USE_AS_STPCPY + lea 18(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $19, %cl + sub $19, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_19): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %ecx + mov %ecx, 16(%rdi) +#ifdef USE_AS_STPCPY + lea 19(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $20, %cl + sub $20, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_20): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 13(%rsi), %rcx + mov %rcx, 13(%rdi) +#ifdef USE_AS_STPCPY + lea 20(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $21, %cl + sub $21, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_21): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 14(%rsi), %rcx + mov %rcx, 14(%rdi) +#ifdef USE_AS_STPCPY + lea 21(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $22, %cl + sub $22, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_22): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 15(%rsi), %rcx + mov %rcx, 15(%rdi) +#ifdef USE_AS_STPCPY + lea 22(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $23, %cl + sub $23, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_23): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) +#ifdef USE_AS_STPCPY + lea 23(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $24, %cl + sub $24, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + + ret + + .p2align 4 +LABEL(tail_24): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) + mov 21(%rsi), %edx + mov %edx, 21(%rdi) 
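Each LABEL(tail_N) stub finishes the copy of the final N+1 bytes with a few 1/2/4/8-byte moves that are allowed to overlap (tail_24, for example, copies bytes 0-7, 8-15, 16-23 and then an overlapping dword at 21-24), avoiding a byte loop. Under USE_AS_STRNCPY the stub then falls into strncpy_fill_tail to zero-pad the remainder; under USE_AS_STPCPY the cmpb $1,(%rax) / sbb $-1,%rax pair adjusts the return value without a branch: cmpb sets the carry flag exactly when the byte is zero, and sbb then adds 1 - CF to %rax. A tiny C equivalent of that adjustment (hypothetical helper, not from the patch):

#include <stddef.h>

/* Branch-free equivalent of "cmpb $1, (%rax); sbb $-1, %rax" in the
   STPCPY paths: step past the last byte written unless it is already
   the terminating NUL.  */
static inline char *
stpcpy_adjust (char *end)
{
  return end + (*end != '\0');
}

A compiler may lower the comparison to a setcc/add pair rather than cmp/sbb, but the observable result is the same pointer.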
+#ifdef USE_AS_STPCPY + lea 24(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $25, %cl + sub $25, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_25): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) + mov 22(%rsi), %edx + mov %edx, 22(%rdi) +#ifdef USE_AS_STPCPY + lea 25(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $26, %cl + sub $26, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_26): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) + mov 23(%rsi), %edx + mov %edx, 23(%rdi) +#ifdef USE_AS_STPCPY + lea 26(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $27, %cl + sub $27, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_27): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) + mov 24(%rsi), %edx + mov %edx, 24(%rdi) +#ifdef USE_AS_STPCPY + lea 27(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $28, %cl + sub $28, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_28): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) + mov 21(%rsi), %rdx + mov %rdx, 21(%rdi) +#ifdef USE_AS_STPCPY + lea 28(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $29, %cl + sub $29, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + + ret + + .p2align 4 +LABEL(tail_29): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) + mov 22(%rsi), %rdx + mov %rdx, 22(%rdi) +#ifdef USE_AS_STPCPY + lea 29(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $30, %cl + sub $30, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + + ret + + + .p2align 4 +LABEL(tail_30): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) + mov 23(%rsi), %rdx + mov %rdx, 23(%rdi) +#ifdef USE_AS_STPCPY + lea 30(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $31, %cl + sub $31, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_31): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) + mov 24(%rsi), %rdx + mov %rdx, 24(%rdi) +#ifdef USE_AS_STPCPY + lea 31(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $32, %cl + sub $32, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + cfi_endproc + .size STRCPY_SSSE3, .-STRCPY_SSSE3 + + .p2align 4 + .section .rodata.ssse3,"a",@progbits +LABEL(tail_table): + .int LABEL(tail_0) - LABEL(tail_table) + .int LABEL(tail_1) - LABEL(tail_table) + .int LABEL(tail_2) - LABEL(tail_table) + .int LABEL(tail_3) - LABEL(tail_table) + .int LABEL(tail_4) - LABEL(tail_table) + .int LABEL(tail_5) - LABEL(tail_table) + .int LABEL(tail_6) - LABEL(tail_table) + .int LABEL(tail_7) - LABEL(tail_table) + .int LABEL(tail_8) - 
LABEL(tail_table) + .int LABEL(tail_9) - LABEL(tail_table) + .int LABEL(tail_10) - LABEL(tail_table) + .int LABEL(tail_11) - LABEL(tail_table) + .int LABEL(tail_12) - LABEL(tail_table) + .int LABEL(tail_13) - LABEL(tail_table) + .int LABEL(tail_14) - LABEL(tail_table) + .int LABEL(tail_15) - LABEL(tail_table) + .int LABEL(tail_16) - LABEL(tail_table) + .int LABEL(tail_17) - LABEL(tail_table) + .int LABEL(tail_18) - LABEL(tail_table) + .int LABEL(tail_19) - LABEL(tail_table) + .int LABEL(tail_20) - LABEL(tail_table) + .int LABEL(tail_21) - LABEL(tail_table) + .int LABEL(tail_22) - LABEL(tail_table) + .int LABEL(tail_23) - LABEL(tail_table) + .int LABEL(tail_24) - LABEL(tail_table) + .int LABEL(tail_25) - LABEL(tail_table) + .int LABEL(tail_26) - LABEL(tail_table) + .int LABEL(tail_27) - LABEL(tail_table) + .int LABEL(tail_28) - LABEL(tail_table) + .int LABEL(tail_29) - LABEL(tail_table) + .int LABEL(tail_30) - LABEL(tail_table) + .int LABEL(tail_31) - LABEL(tail_table) + + .p2align 4 +LABEL(unaligned_table): + .int LABEL(ashr_0) - LABEL(unaligned_table) + .int LABEL(ashr_1) - LABEL(unaligned_table) + .int LABEL(ashr_2) - LABEL(unaligned_table) + .int LABEL(ashr_3) - LABEL(unaligned_table) + .int LABEL(ashr_4) - LABEL(unaligned_table) + .int LABEL(ashr_5) - LABEL(unaligned_table) + .int LABEL(ashr_6) - LABEL(unaligned_table) + .int LABEL(ashr_7) - LABEL(unaligned_table) + .int LABEL(ashr_8) - LABEL(unaligned_table) + .int LABEL(ashr_9) - LABEL(unaligned_table) + .int LABEL(ashr_10) - LABEL(unaligned_table) + .int LABEL(ashr_11) - LABEL(unaligned_table) + .int LABEL(ashr_12) - LABEL(unaligned_table) + .int LABEL(ashr_13) - LABEL(unaligned_table) + .int LABEL(ashr_14) - LABEL(unaligned_table) + .int LABEL(ashr_15) - LABEL(unaligned_table) + +# undef ENTRY +# define ENTRY(name) \ + .type STRCPY_SSE2, @function; \ + .align 16; \ + STRCPY_SSE2: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size STRCPY_SSE2, .-STRCPY_SSE2 +# undef libc_hidden_builtin_def +/* It doesn't make sense to send libc-internal strcpy calls through a PLT. + The speedup we get from using SSSE3 instruction is likely eaten away + by the indirect call in the PLT. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_STRCPY; __GI_STRCPY = STRCPY_SSE2 +# undef libc_hidden_def +# define libc_hidden_def(name) \ + .globl __GI___STRCPY; __GI___STRCPY = STRCPY_SSE2 +#endif + +#ifndef USE_AS_STRNCPY +#include "../strcpy.S" +#endif diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c new file mode 100644 index 0000000000..4512267d3f --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcspn-c.c @@ -0,0 +1,312 @@ +/* strcspn with SSE4.2 intrinsics + Copyright (C) 2009 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
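The tail_table and unaligned_table just above store 32-bit label offsets relative to the table itself (".int LABEL(tail_N) - LABEL(tail_table)") rather than absolute addresses, so they need no dynamic relocations and remain position independent; the dispatch sequence earlier in the file (movslq of the entry, lea of table plus offset, jmp *%rcx) rebuilds the absolute target at run time. A rough C illustration of the same idea follows; converting between object and function pointers is not strictly portable C, and every name apart from the label names is made up:

#include <stdint.h>
#include <stdio.h>

static void tail_0 (void) { puts ("copy 1 byte"); }
static void tail_1 (void) { puts ("copy 2 bytes"); }

/* 32-bit offsets of the handlers relative to the table itself.  The
   assembly computes these at assembly time; here they are filled in at
   run time purely to demonstrate the dispatch arithmetic.  */
static int32_t table[2];

static void
dispatch (int idx)
{
  /* movslq (%r11,%rcx,4),%rcx ; lea (%r11,%rcx),%rcx ; jmp *%rcx  */
  void (*fn) (void) = (void (*) (void)) ((char *) table + table[idx]);
  fn ();
}

int
main (void)
{
  /* Assumes code and data are within +-2 GiB of each other (the
     default small code model), so the differences fit in 32 bits.  */
  table[0] = (int32_t) ((intptr_t) &tail_0 - (intptr_t) table);
  table[1] = (int32_t) ((intptr_t) &tail_1 - (intptr_t) table);
  dispatch (1);               /* prints "copy 2 bytes" */
  return 0;
}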
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <nmmintrin.h> +#include <string.h> + +/* We use 0x2: + _SIDD_SBYTE_OPS + | _SIDD_CMP_EQUAL_ANY + | _SIDD_POSITIVE_POLARITY + | _SIDD_LEAST_SIGNIFICANT + on pcmpistri to compare xmm/mem128 + + 0 1 2 3 4 5 6 7 8 9 A B C D E F + X X X X X X X X X X X X X X X X + + against xmm + + 0 1 2 3 4 5 6 7 8 9 A B C D E F + A A A A A A A A A A A A A A A A + + to find out if the first 16byte data element has any byte A and + the offset of the first byte. There are 3 cases: + + 1. The first 16byte data element has the byte A at the offset X. + 2. The first 16byte data element has EOS and doesn't have the byte A. + 3. The first 16byte data element is valid and doesn't have the byte A. + + Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases: + + 1 X 1 0/1 0 + 2 16 0 1 0 + 3 16 0 0 0 + + We exit from the loop for cases 1 and 2 with jbe which branches + when either CFlag or ZFlag is 1. If CFlag == 1, ECX has the offset + X for case 1. */ + +#ifndef STRCSPN_SSE2 +# define STRCSPN_SSE2 __strcspn_sse2 +# define STRCSPN_SSE42 __strcspn_sse42 +#endif + +#ifdef USE_AS_STRPBRK +# define RETURN(val1, val2) return val1 +#else +# define RETURN(val1, val2) return val2 +#endif + +extern +#ifdef USE_AS_STRPBRK +char * +#else +size_t +#endif +STRCSPN_SSE2 (const char *, const char *); + + +#ifdef USE_AS_STRPBRK +char * +#else +size_t +#endif +__attribute__ ((section (".text.sse4.2"))) +STRCSPN_SSE42 (const char *s, const char *a) +{ + if (*a == 0) + RETURN (NULL, strlen (s)); + + const char *aligned; + __m128i mask; + int offset = (int) ((size_t) a & 15); + if (offset != 0) + { + /* Load masks. */ + aligned = (const char *) ((size_t) a & 0xfffffffffffffff0L); + __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); + + switch (offset) + { + case 1: + mask = _mm_srli_si128 (mask0, 1); + break; + case 2: + mask = _mm_srli_si128 (mask0, 2); + break; + case 3: + mask = _mm_srli_si128 (mask0, 3); + break; + case 4: + mask = _mm_srli_si128 (mask0, 4); + break; + case 5: + mask = _mm_srli_si128 (mask0, 5); + break; + case 6: + mask = _mm_srli_si128 (mask0, 6); + break; + case 7: + mask = _mm_srli_si128 (mask0, 7); + break; + case 8: + mask = _mm_srli_si128 (mask0, 8); + break; + case 9: + mask = _mm_srli_si128 (mask0, 9); + break; + case 10: + mask = _mm_srli_si128 (mask0, 10); + break; + case 11: + mask = _mm_srli_si128 (mask0, 11); + break; + case 12: + mask = _mm_srli_si128 (mask0, 12); + break; + case 13: + mask = _mm_srli_si128 (mask0, 13); + break; + case 14: + mask = _mm_srli_si128 (mask0, 14); + break; + case 15: + mask = _mm_srli_si128 (mask0, 15); + break; + } + + /* Find where the NULL terminator is. */ + int length = _mm_cmpistri (mask, mask, 0x3a); + if (length == 16 - offset) + { + /* There is no NULL terminator. */ + __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16)); + int index = _mm_cmpistri (mask1, mask1, 0x3a); + length += index; + + /* Don't use SSE4.2 if the length of A > 16. */ + if (length > 16) + return STRCSPN_SSE2 (s, a); + + if (index != 0) + { + /* Combine mask0 and mask1. 
*/ + switch (offset) + { + case 1: + mask = _mm_alignr_epi8 (mask1, mask0, 1); + break; + case 2: + mask = _mm_alignr_epi8 (mask1, mask0, 2); + break; + case 3: + mask = _mm_alignr_epi8 (mask1, mask0, 3); + break; + case 4: + mask = _mm_alignr_epi8 (mask1, mask0, 4); + break; + case 5: + mask = _mm_alignr_epi8 (mask1, mask0, 5); + break; + case 6: + mask = _mm_alignr_epi8 (mask1, mask0, 6); + break; + case 7: + mask = _mm_alignr_epi8 (mask1, mask0, 7); + break; + case 8: + mask = _mm_alignr_epi8 (mask1, mask0, 8); + break; + case 9: + mask = _mm_alignr_epi8 (mask1, mask0, 9); + break; + case 10: + mask = _mm_alignr_epi8 (mask1, mask0, 10); + break; + case 11: + mask = _mm_alignr_epi8 (mask1, mask0, 11); + break; + case 12: + mask = _mm_alignr_epi8 (mask1, mask0, 12); + break; + case 13: + mask = _mm_alignr_epi8 (mask1, mask0, 13); + break; + case 14: + mask = _mm_alignr_epi8 (mask1, mask0, 14); + break; + case 15: + mask = _mm_alignr_epi8 (mask1, mask0, 15); + break; + } + } + } + } + else + { + /* A is aligned. */ + mask = _mm_load_si128 ((__m128i *) a); + + /* Find where the NULL terminator is. */ + int length = _mm_cmpistri (mask, mask, 0x3a); + if (length == 16) + { + /* There is no NULL terminator. Don't use SSE4.2 if the length + of A > 16. */ + if (a[16] != 0) + return STRCSPN_SSE2 (s, a); + } + } + + offset = (int) ((size_t) s & 15); + if (offset != 0) + { + /* Check partial string. */ + aligned = (const char *) ((size_t) s & 0xfffffffffffffff0L); + __m128i value = _mm_load_si128 ((__m128i *) aligned); + + switch (offset) + { + case 1: + value = _mm_srli_si128 (value, 1); + break; + case 2: + value = _mm_srli_si128 (value, 2); + break; + case 3: + value = _mm_srli_si128 (value, 3); + break; + case 4: + value = _mm_srli_si128 (value, 4); + break; + case 5: + value = _mm_srli_si128 (value, 5); + break; + case 6: + value = _mm_srli_si128 (value, 6); + break; + case 7: + value = _mm_srli_si128 (value, 7); + break; + case 8: + value = _mm_srli_si128 (value, 8); + break; + case 9: + value = _mm_srli_si128 (value, 9); + break; + case 10: + value = _mm_srli_si128 (value, 10); + break; + case 11: + value = _mm_srli_si128 (value, 11); + break; + case 12: + value = _mm_srli_si128 (value, 12); + break; + case 13: + value = _mm_srli_si128 (value, 13); + break; + case 14: + value = _mm_srli_si128 (value, 14); + break; + case 15: + value = _mm_srli_si128 (value, 15); + break; + } + + int length = _mm_cmpistri (mask, value, 0x2); + /* No need to check ZFlag since ZFlag is always 1. */ + int cflag = _mm_cmpistrc (mask, value, 0x2); + if (cflag) + RETURN ((char *) (s + length), length); + /* Find where the NULL terminator is. */ + int index = _mm_cmpistri (value, value, 0x3a); + if (index < 16 - offset) + RETURN (NULL, index); + aligned += 16; + } + else + aligned = s; + + while (1) + { + __m128i value = _mm_load_si128 ((__m128i *) aligned); + int index = _mm_cmpistri (mask, value, 0x2); + int cflag = _mm_cmpistrc (mask, value, 0x2); + int zflag = _mm_cmpistrz (mask, value, 0x2); + if (cflag) + RETURN ((char *) (aligned + index), (size_t) (aligned + index - s)); + if (zflag) + RETURN (NULL, + /* Find where the NULL terminator is. 
*/ + (size_t) (aligned + _mm_cmpistri (value, value, 0x3a) - s)); + aligned += 16; + } +} diff --git a/sysdeps/x86_64/multiarch/strcspn.S b/sysdeps/x86_64/multiarch/strcspn.S new file mode 100644 index 0000000000..cc75ab70e6 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcspn.S @@ -0,0 +1,82 @@ +/* Multiple versions of strcspn + Copyright (C) 2009 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <config.h> + +#ifdef HAVE_SSE4_SUPPORT + +#include <sysdep.h> +#include <ifunc-defines.h> + +#ifdef USE_AS_STRPBRK +#define STRCSPN_SSE42 __strpbrk_sse42 +#define STRCSPN_SSE2 __strpbrk_sse2 +#define __GI_STRCSPN __GI_strpbrk +#else +#ifndef STRCSPN +#define STRCSPN strcspn +#define STRCSPN_SSE42 __strcspn_sse42 +#define STRCSPN_SSE2 __strcspn_sse2 +#define __GI_STRCSPN __GI_strcspn +#endif +#endif + +/* Define multiple versions only for the definition in libc. Don't + define multiple versions for strpbrk in static library since we + need strpbrk before the initialization happened. */ +#if (defined SHARED || !defined USE_AS_STRPBRK) && !defined NOT_IN_libc + .text +ENTRY(STRCSPN) + .type STRCSPN, @gnu_indirect_function + cmpl $0, __cpu_features+KIND_OFFSET(%rip) + jne 1f + call __init_cpu_features +1: leaq STRCSPN_SSE2(%rip), %rax + testl $(1<<20), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip) + jz 2f + leaq STRCSPN_SSE42(%rip), %rax +2: ret +END(STRCSPN) + +# undef ENTRY +# define ENTRY(name) \ + .type STRCSPN_SSE2, @function; \ + .globl STRCSPN_SSE2; \ + .align 16; \ + STRCSPN_SSE2: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size STRCSPN_SSE2, .-STRCSPN_SSE2 +# undef libc_hidden_builtin_def +/* It doesn't make sense to send libc-internal strcspn calls through a PLT. + The speedup we get from using SSE4.2 instruction is likely eaten away + by the indirect call in the PLT. 
*/ +# define libc_hidden_builtin_def(name) \ + .globl __GI_STRCSPN; __GI_STRCSPN = STRCSPN_SSE2 +#endif + +#endif /* HAVE_SSE4_SUPPORT */ + +#ifdef USE_AS_STRPBRK +#include "../strpbrk.S" +#else +#include "../strcspn.S" +#endif diff --git a/sysdeps/x86_64/multiarch/strlen.S b/sysdeps/x86_64/multiarch/strlen.S index 79e6a977ec..82b03ccc28 100644 --- a/sysdeps/x86_64/multiarch/strlen.S +++ b/sysdeps/x86_64/multiarch/strlen.S @@ -77,6 +77,7 @@ __strlen_sse42: # undef ENTRY # define ENTRY(name) \ .type __strlen_sse2, @function; \ + .align 16; \ __strlen_sse2: cfi_startproc; \ CALL_MCOUNT # undef END diff --git a/sysdeps/x86_64/multiarch/strncpy-c.c b/sysdeps/x86_64/multiarch/strncpy-c.c new file mode 100644 index 0000000000..296c32cb5d --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncpy-c.c @@ -0,0 +1,8 @@ +#define STRNCPY __strncpy_sse2 +#ifdef SHARED +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(name) \ + __hidden_ver1 (__strncpy_sse2, __GI_strncpy, __strncpy_sse2); +#endif + +#include "strncpy.c" diff --git a/sysdeps/x86_64/multiarch/strncpy.S b/sysdeps/x86_64/multiarch/strncpy.S new file mode 100644 index 0000000000..327a4ce447 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncpy.S @@ -0,0 +1,3 @@ +#define STRCPY strncpy +#define USE_AS_STRNCPY +#include "strcpy.S" diff --git a/sysdeps/x86_64/multiarch/strpbrk-c.c b/sysdeps/x86_64/multiarch/strpbrk-c.c new file mode 100644 index 0000000000..c58dcb5605 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strpbrk-c.c @@ -0,0 +1,4 @@ +#define USE_AS_STRPBRK +#define STRCSPN_SSE2 __strpbrk_sse2 +#define STRCSPN_SSE42 __strpbrk_sse42 +#include "strcspn-c.c" diff --git a/sysdeps/x86_64/multiarch/strpbrk.S b/sysdeps/x86_64/multiarch/strpbrk.S new file mode 100644 index 0000000000..ed5bca6a94 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strpbrk.S @@ -0,0 +1,3 @@ +#define STRCSPN strpbrk +#define USE_AS_STRPBRK +#include "strcspn.S" diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c new file mode 100644 index 0000000000..5b99f0d383 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strspn-c.c @@ -0,0 +1,284 @@ +/* strspn with SSE4.2 intrinsics + Copyright (C) 2009 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <nmmintrin.h> +#include <string.h> + +/* We use 0x12: + _SIDD_SBYTE_OPS + | _SIDD_CMP_EQUAL_ANY + | _SIDD_NEGATIVE_POLARITY + | _SIDD_LEAST_SIGNIFICANT + on pcmpistri to compare xmm/mem128 + + 0 1 2 3 4 5 6 7 8 9 A B C D E F + X X X X X X X X X X X X X X X X + + against xmm + + 0 1 2 3 4 5 6 7 8 9 A B C D E F + A A A A A A A A A A A A A A A A + + to find out if the first 16byte data element has any non-A byte and + the offset of the first byte. There are 2 cases: + + 1. 
The first 16byte data element has the non-A byte, including + EOS, at the offset X. + 2. The first 16byte data element is valid and doesn't have the non-A + byte. + + Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases: + + case ECX CFlag ZFlag SFlag + 1 X 1 0/1 0 + 2 16 0 0 0 + + We exit from the loop for case 1. */ + +extern size_t __strspn_sse2 (const char *, const char *); + + +size_t +__attribute__ ((section (".text.sse4.2"))) +__strspn_sse42 (const char *s, const char *a) +{ + if (*a == 0) + return 0; + + const char *aligned; + __m128i mask; + int offset = (int) ((size_t) a & 15); + if (offset != 0) + { + /* Load masks. */ + aligned = (const char *) ((size_t) a & 0xfffffffffffffff0L); + __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); + + switch (offset) + { + case 1: + mask = _mm_srli_si128 (mask0, 1); + break; + case 2: + mask = _mm_srli_si128 (mask0, 2); + break; + case 3: + mask = _mm_srli_si128 (mask0, 3); + break; + case 4: + mask = _mm_srli_si128 (mask0, 4); + break; + case 5: + mask = _mm_srli_si128 (mask0, 5); + break; + case 6: + mask = _mm_srli_si128 (mask0, 6); + break; + case 7: + mask = _mm_srli_si128 (mask0, 7); + break; + case 8: + mask = _mm_srli_si128 (mask0, 8); + break; + case 9: + mask = _mm_srli_si128 (mask0, 9); + break; + case 10: + mask = _mm_srli_si128 (mask0, 10); + break; + case 11: + mask = _mm_srli_si128 (mask0, 11); + break; + case 12: + mask = _mm_srli_si128 (mask0, 12); + break; + case 13: + mask = _mm_srli_si128 (mask0, 13); + break; + case 14: + mask = _mm_srli_si128 (mask0, 14); + break; + case 15: + mask = _mm_srli_si128 (mask0, 15); + break; + } + + /* Find where the NULL terminator is. */ + int length = _mm_cmpistri (mask, mask, 0x3a); + if (length == 16 - offset) + { + /* There is no NULL terminator. */ + __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16)); + int index = _mm_cmpistri (mask1, mask1, 0x3a); + length += index; + + /* Don't use SSE4.2 if the length of A > 16. */ + if (length > 16) + return __strspn_sse2 (s, a); + + if (index != 0) + { + /* Combine mask0 and mask1. */ + switch (offset) + { + case 1: + mask = _mm_alignr_epi8 (mask1, mask0, 1); + break; + case 2: + mask = _mm_alignr_epi8 (mask1, mask0, 2); + break; + case 3: + mask = _mm_alignr_epi8 (mask1, mask0, 3); + break; + case 4: + mask = _mm_alignr_epi8 (mask1, mask0, 4); + break; + case 5: + mask = _mm_alignr_epi8 (mask1, mask0, 5); + break; + case 6: + mask = _mm_alignr_epi8 (mask1, mask0, 6); + break; + case 7: + mask = _mm_alignr_epi8 (mask1, mask0, 7); + break; + case 8: + mask = _mm_alignr_epi8 (mask1, mask0, 8); + break; + case 9: + mask = _mm_alignr_epi8 (mask1, mask0, 9); + break; + case 10: + mask = _mm_alignr_epi8 (mask1, mask0, 10); + break; + case 11: + mask = _mm_alignr_epi8 (mask1, mask0, 11); + break; + case 12: + mask = _mm_alignr_epi8 (mask1, mask0, 12); + break; + case 13: + mask = _mm_alignr_epi8 (mask1, mask0, 13); + break; + case 14: + mask = _mm_alignr_epi8 (mask1, mask0, 14); + break; + case 15: + mask = _mm_alignr_epi8 (mask1, mask0, 15); + break; + } + } + } + } + else + { + /* A is aligned. */ + mask = _mm_load_si128 ((__m128i *) a); + + /* Find where the NULL terminator is. */ + int length = _mm_cmpistri (mask, mask, 0x3a); + if (length == 16) + { + /* There is no NULL terminator. Don't use SSE4.2 if the length + of A > 16. */ + if (a[16] != 0) + return __strspn_sse2 (s, a); + } + } + + offset = (int) ((size_t) s & 15); + if (offset != 0) + { + /* Check partial string. 
*/ + aligned = (const char *) ((size_t) s & 0xfffffffffffffff0L); + __m128i value = _mm_load_si128 ((__m128i *) aligned); + + switch (offset) + { + case 1: + value = _mm_srli_si128 (value, 1); + break; + case 2: + value = _mm_srli_si128 (value, 2); + break; + case 3: + value = _mm_srli_si128 (value, 3); + break; + case 4: + value = _mm_srli_si128 (value, 4); + break; + case 5: + value = _mm_srli_si128 (value, 5); + break; + case 6: + value = _mm_srli_si128 (value, 6); + break; + case 7: + value = _mm_srli_si128 (value, 7); + break; + case 8: + value = _mm_srli_si128 (value, 8); + break; + case 9: + value = _mm_srli_si128 (value, 9); + break; + case 10: + value = _mm_srli_si128 (value, 10); + break; + case 11: + value = _mm_srli_si128 (value, 11); + break; + case 12: + value = _mm_srli_si128 (value, 12); + break; + case 13: + value = _mm_srli_si128 (value, 13); + break; + case 14: + value = _mm_srli_si128 (value, 14); + break; + case 15: + value = _mm_srli_si128 (value, 15); + break; + } + + int length = _mm_cmpistri (mask, value, 0x12); + /* No need to check CFlag since it is always 1. */ + if (length < 16 - offset) + return length; + /* Find where the NULL terminator is. */ + int index = _mm_cmpistri (value, value, 0x3a); + if (index < 16 - offset) + return length; + aligned += 16; + } + else + aligned = s; + + while (1) + { + __m128i value = _mm_load_si128 ((__m128i *) aligned); + int index = _mm_cmpistri (mask, value, 0x12); + int cflag = _mm_cmpistrc (mask, value, 0x12); + if (cflag) + return (size_t) (aligned + index - s); + aligned += 16; + } +} diff --git a/sysdeps/x86_64/multiarch/strspn.S b/sysdeps/x86_64/multiarch/strspn.S new file mode 100644 index 0000000000..4183a2cf60 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strspn.S @@ -0,0 +1,63 @@ +/* Multiple versions of strspn + Copyright (C) 2009 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <config.h> + +#ifdef HAVE_SSE4_SUPPORT + +#include <sysdep.h> +#include <ifunc-defines.h> + +/* Define multiple versions only for the definition in libc. 
*/ +#ifndef NOT_IN_libc + .text +ENTRY(strspn) + .type strspn, @gnu_indirect_function + cmpl $0, __cpu_features+KIND_OFFSET(%rip) + jne 1f + call __init_cpu_features +1: leaq __strspn_sse2(%rip), %rax + testl $(1<<20), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip) + jz 2f + leaq __strspn_sse42(%rip), %rax +2: ret +END(strspn) + +# undef ENTRY +# define ENTRY(name) \ + .type __strspn_sse2, @function; \ + .globl __strspn_sse2; \ + .align 16; \ + __strspn_sse2: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __strspn_sse2, .-__strspn_sse2 +# undef libc_hidden_builtin_def +/* It doesn't make sense to send libc-internal strspn calls through a PLT. + The speedup we get from using SSE4.2 instruction is likely eaten away + by the indirect call in the PLT. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_strspn; __GI_strspn = __strspn_sse2 +#endif + +#endif /* HAVE_SSE4_SUPPORT */ + +#include "../strspn.S" |
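The strcspn-c.c and strspn-c.c additions above (and the strpbrk variant built from the same source) are organised around a single PCMPISTRI loop: the accept/reject set is loaded into one xmm register (sets longer than 16 bytes fall back to the SSE2 implementation), and each 16-byte chunk of the string is compared against it with mode 0x02 (equal-any, positive polarity) for strcspn/strpbrk or 0x12 (equal-any, negative polarity) for strspn, while mode 0x3a locates the NUL terminator inside a chunk. A stripped-down sketch of the aligned strcspn main loop, assuming both pointers are already 16-byte aligned and the set fits in one register (illustrative only, not the patch code):

#include <nmmintrin.h>          /* SSE4.2: _mm_cmpistr*  */
#include <stddef.h>

static size_t
strcspn_core (const char *s, __m128i set)
{
  const char *p = s;
  for (;;)
    {
      __m128i chunk = _mm_load_si128 ((const __m128i *) p);
      /* Mode 0x02: compare each byte of CHUNK against every byte of
         SET and report the index of the first match.  */
      int idx = _mm_cmpistri (set, chunk, 0x02);
      if (_mm_cmpistrc (set, chunk, 0x02))      /* CFlag: match found */
        return (size_t) (p + idx - s);
      if (_mm_cmpistrz (set, chunk, 0x02))      /* ZFlag: chunk has NUL */
        /* Mode 0x3a finds the NUL terminator within the chunk.  */
        return (size_t) (p + _mm_cmpistri (chunk, chunk, 0x3a) - s);
      p += 16;
    }
}

strspn runs the same loop with mode 0x12, so the reported index is the first byte not in the set; that also covers the terminating NUL, which is why its main loop only needs to test the carry flag.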
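The strcspn.S and strspn.S wrappers select an implementation once at startup: the exported symbol is typed @gnu_indirect_function, and the resolver returns the SSE4.2 variant only when bit 20 of CPUID leaf 1's ECX word (the SSE4.2 feature bit, the $(1<<20) test against __cpu_features) is set, otherwise the SSE2 version. A hedged C analogue of that resolver using GCC's ifunc attribute; the __strspn_* names come from the patch, everything else (including the use of __builtin_cpu_supports instead of the hand-rolled __cpu_features check) is illustrative:

#include <stddef.h>

extern size_t __strspn_sse2 (const char *s, const char *a);
extern size_t __strspn_sse42 (const char *s, const char *a);

/* Resolver: runs during relocation, before the function is first
   called, and decides which implementation the indirect symbol will
   point at.  */
static __typeof__ (__strspn_sse2) *
strspn_resolver (void)
{
  __builtin_cpu_init ();
  if (__builtin_cpu_supports ("sse4.2"))
    return __strspn_sse42;
  return __strspn_sse2;
}

/* Hypothetical exported name; the real patch binds this to strspn.  */
size_t strspn_ifunc (const char *s, const char *a)
     __attribute__ ((ifunc ("strspn_resolver")));

Because the choice is made once per process, the steady-state cost is the same as a normal PLT call, which is also why the hidden __GI_* aliases are bound directly to the SSE2 entry points: libc-internal callers skip the indirection entirely.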