| author    | Andreas Schwab <aschwab@redhat.com>               | 2009-06-26 13:14:24 +0200 |
|-----------|---------------------------------------------------|---------------------------|
| committer | Andreas Schwab <aschwab@redhat.com>               | 2009-06-26 13:14:24 +0200 |
| commit    | a879c2cb1753c3aa868aa1e9d2107e9f0deb0dc2 (patch)  |                           |
| tree      | a919978ad8a178a0e6013f1ee43ab54f60a13493 /sysdeps |                           |
| parent    | 5f9df8e7b478cafd4528a133201f4611a963292e (diff)   |                           |
| parent    | 44d20bca52ace85850012b0ead37b360e3ecd96e (diff)   |                           |
Merge commit 'origin/master' into fedora/master
Diffstat (limited to 'sysdeps')
| mode       | file                                           | lines changed |
|------------|------------------------------------------------|---------------|
| -rw-r--r-- | sysdeps/generic/pty-private.h                  |    5 |
| -rw-r--r-- | sysdeps/powerpc/powerpc32/____longjmp_chk.S    |   23 |
| -rw-r--r-- | sysdeps/powerpc/powerpc32/power4/fpu/w_sqrt.S  |    4 |
| -rw-r--r-- | sysdeps/powerpc/powerpc32/power4/fpu/w_sqrtf.S |    4 |
| -rw-r--r-- | sysdeps/powerpc/powerpc32/power5/fpu/w_sqrt.S  |    4 |
| -rw-r--r-- | sysdeps/powerpc/powerpc32/power5/fpu/w_sqrtf.S |    4 |
| -rw-r--r-- | sysdeps/powerpc/powerpc32/power6/memcpy.S      |   56 |
| -rw-r--r-- | sysdeps/powerpc/powerpc32/power6/memset.S      |    6 |
| -rw-r--r-- | sysdeps/unix/grantpt.c                         |   28 |
| -rw-r--r-- | sysdeps/unix/sysv/linux/bits/socket.h          |    6 |
| -rw-r--r-- | sysdeps/unix/sysv/linux/grantpt.c              |   88 |
| -rw-r--r-- | sysdeps/unix/sysv/linux/lddlibc4.c             |    4 |
| -rw-r--r-- | sysdeps/unix/sysv/linux/net/if_arp.h           |    4 |
| -rw-r--r-- | sysdeps/unix/sysv/linux/sparc/bits/socket.h    |   16 |
| -rw-r--r-- | sysdeps/x86_64/memchr.S                        |    6 |
| -rw-r--r-- | sysdeps/x86_64/multiarch/Makefile              |    4 |
| -rw-r--r-- | sysdeps/x86_64/multiarch/init-arch.h           |    3 |
| -rw-r--r-- | sysdeps/x86_64/multiarch/strcmp.S              | 1677 |
| -rw-r--r-- | sysdeps/x86_64/multiarch/strncmp-c.c           |    8 |
| -rw-r--r-- | sysdeps/x86_64/multiarch/strncmp.S             |    3 |
20 files changed, 1808 insertions, 145 deletions
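The largest addition below, sysdeps/x86_64/multiarch/strcmp.S, registers STRCMP as a `@gnu_indirect_function`: its resolver tests CPUID leaf 1, ECX bit 20 (the same bit the new `HAS_SSE4_2` macro in init-arch.h checks) and returns either `STRCMP_SSE42` or `STRCMP_SSE2`. As a rough C-level sketch of that selection idea only, not the commit's actual resolver (the stub names here are hypothetical; the real variants are the assembly routines in the diff):

```c
#include <cpuid.h>    /* __get_cpuid; GCC/Clang, x86 only */
#include <stdio.h>
#include <string.h>

typedef int (*cmp_fn) (const char *, const char *);

/* Stand-ins for the two variants; in the commit these are the
   assembly implementations selected by the IFUNC resolver.  */
static int cmp_sse2_stub (const char *a, const char *b)  { return strcmp (a, b); }
static int cmp_sse42_stub (const char *a, const char *b) { return strcmp (a, b); }

/* Same test as the resolver and the new HAS_SSE4_2 macro:
   CPUID leaf 1, ECX bit 20 set means SSE4.2 is available.  */
static cmp_fn
pick_strcmp (void)
{
  unsigned int eax, ebx, ecx, edx;
  if (__get_cpuid (1, &eax, &ebx, &ecx, &edx) && (ecx & (1u << 20)))
    return cmp_sse42_stub;
  return cmp_sse2_stub;
}

int
main (void)
{
  cmp_fn cmp = pick_strcmp ();
  printf ("strcmp result: %d\n", cmp ("abc", "abd"));
  return 0;
}
```

In glibc itself the check runs once per process through `__init_cpu_features` and the cached `__cpu_features` data, so the resolver never re-executes CPUID on every call.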
diff --git a/sysdeps/generic/pty-private.h b/sysdeps/generic/pty-private.h index d6ec2cee68..493f40551d 100644 --- a/sysdeps/generic/pty-private.h +++ b/sysdeps/generic/pty-private.h @@ -1,5 +1,5 @@ /* Internal defenitions and declarations for pseudo terminal functions. - Copyright (C) 1998, 1999 Free Software Foundation, Inc. + Copyright (C) 1998, 1999, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Zack Weinberg <zack@rabi.phys.columbia.edu>, 1998. @@ -39,7 +39,8 @@ enum /* failure modes */ FAIL_EBADF = 1, FAIL_EINVAL, FAIL_EACCES, - FAIL_EXEC + FAIL_EXEC, + FAIL_ENOMEM }; #endif /* pty-private.h */ diff --git a/sysdeps/powerpc/powerpc32/____longjmp_chk.S b/sysdeps/powerpc/powerpc32/____longjmp_chk.S index 5c1f648661..510ce5250d 100644 --- a/sysdeps/powerpc/powerpc32/____longjmp_chk.S +++ b/sysdeps/powerpc/powerpc32/____longjmp_chk.S @@ -26,11 +26,30 @@ #define __longjmp ____longjmp_chk +#ifdef PIC +# ifdef HAVE_ASM_PPC_REL16 +# define LOAD_ARG \ + bcl 20,31,1f; \ +1: mflr r3; \ + addis r3,r3,_GLOBAL_OFFSET_TABLE_-1b@ha; \ + addi r3,r3,_GLOBAL_OFFSET_TABLE_-1b@l; \ + lwz r3,.LC0@got(r3) +# else +# define LOAD_ARG \ + bl _GLOBAL_OFFSET_TABLE_-4@local; \ + mflr r3; \ + lwz r3,.LC0@got(r3) +# endif +#else +# define LOAD_ARG \ + lis r3,.LC0@ha; \ + la r3,.LC0@l(r3) +#endif + #define CHECK_SP(reg) \ cmplw reg, r1; \ bge+ .Lok; \ - lis r3,.LC0@ha; \ - la r3,.LC0@l(r3); \ + LOAD_ARG; \ bl HIDDEN_JUMPTARGET (__fortify_fail); \ .Lok: diff --git a/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrt.S b/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrt.S index 6aef4e301b..95a0b3915d 100644 --- a/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrt.S +++ b/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrt.S @@ -60,8 +60,8 @@ EALIGN (__sqrt, 5, 0) fmr fp12,fp2 stw r0,20(r1) stw r30,8(r1) - cfi_offset(lr,20) - cfi_offset(r30,8) + cfi_offset(lr,20-16) + cfi_offset(r30,8-16) #ifdef SHARED # ifdef HAVE_ASM_PPC_REL16 bcl 20,31,.LCF1 diff --git a/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrtf.S b/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrtf.S index e5b8b9d565..c31555194b 100644 --- a/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrtf.S +++ b/sysdeps/powerpc/powerpc32/power4/fpu/w_sqrtf.S @@ -60,8 +60,8 @@ EALIGN (__sqrtf, 5, 0) fmr fp12,fp2 stw r0,20(r1) stw r30,8(r1) - cfi_offset(lr,20) - cfi_offset(r30,8) + cfi_offset(lr,20-16) + cfi_offset(r30,8-16) #ifdef SHARED # ifdef HAVE_ASM_PPC_REL16 bcl 20,31,.LCF1 diff --git a/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrt.S b/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrt.S index 925930bf77..105b5912a1 100644 --- a/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrt.S +++ b/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrt.S @@ -60,8 +60,8 @@ EALIGN (__sqrt, 5, 0) fmr fp12,fp2 stw r0,20(r1) stw r30,8(r1) - cfi_offset(lr,20) - cfi_offset(r30,8) + cfi_offset(lr,20-16) + cfi_offset(r30,8-16) #ifdef SHARED # ifdef HAVE_ASM_PPC_REL16 bcl 20,31,.LCF1 diff --git a/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrtf.S b/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrtf.S index 891e69c9c0..14bc0a2ceb 100644 --- a/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrtf.S +++ b/sysdeps/powerpc/powerpc32/power5/fpu/w_sqrtf.S @@ -60,8 +60,8 @@ EALIGN (__sqrtf, 5, 0) fmr fp12,fp2 stw r0,20(r1) stw r30,8(r1) - cfi_offset(lr,20) - cfi_offset(r30,8) + cfi_offset(lr,20-16) + cfi_offset(r30,8-16) #ifdef SHARED # ifdef HAVE_ASM_PPC_REL16 bcl 20,31,.LCF1 diff --git a/sysdeps/powerpc/powerpc32/power6/memcpy.S b/sysdeps/powerpc/powerpc32/power6/memcpy.S index ba45fd250c..156b0bd8cc 100644 --- 
a/sysdeps/powerpc/powerpc32/power6/memcpy.S +++ b/sysdeps/powerpc/powerpc32/power6/memcpy.S @@ -1,5 +1,5 @@ /* Optimized memcpy implementation for PowerPC32 on POWER6. - Copyright (C) 2003, 2006 Free Software Foundation, Inc. + Copyright (C) 2003, 2006, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -25,9 +25,9 @@ Returns 'dst'. Memcpy handles short copies (< 32-bytes) using a binary move blocks - (no loops) of lwz/stw. The tail (remaining 1-3) bytes is handled - with the appropriate combination of byte and halfword load/stores. - There is minimal effort to optimize the alignment of short moves. + (no loops) of lwz/stw. The tail (remaining 1-3) bytes is handled + with the appropriate combination of byte and halfword load/stores. + There is minimal effort to optimize the alignment of short moves. Longer moves (>= 32-bytes) justify the effort to get at least the destination word (4-byte) aligned. Further optimization is @@ -80,11 +80,11 @@ EALIGN (BP_SYM (memcpy), 5, 0) bne- cr6,L(wdu) /* If source is not word aligned. .L6 */ clrlwi 11,31,30 /* calculate the number of tail bytes */ b L(word_aligned) - /* Copy words from source to destination, assuming the destination is + /* Copy words from source to destination, assuming the destination is aligned on a word boundary. At this point we know there are at least 29 bytes left (32-3) to copy. - The next step is to determine if the source is also word aligned. + The next step is to determine if the source is also word aligned. If not branch to the unaligned move code at .L6. which uses a load, shift, store strategy. @@ -100,9 +100,9 @@ EALIGN (BP_SYM (memcpy), 5, 0) /* Move words where destination and source are word aligned. Use an unrolled loop to copy 4 words (16-bytes) per iteration. - If the the copy is not an exact multiple of 16 bytes, 1-3 + If the the copy is not an exact multiple of 16 bytes, 1-3 words are copied as needed to set up the main loop. After - the main loop exits there may be a tail of 1-3 bytes. These bytes are + the main loop exits there may be a tail of 1-3 bytes. These bytes are copied a halfword/byte at a time as needed to preserve alignment. */ L(word_aligned): mtcrf 0x01,9 @@ -121,7 +121,7 @@ L(word_aligned): addi 10,3,8 bf 31,4f lwz 0,8(12) - stw 0,8(3) + stw 0,8(3) blt cr1,3f addi 11,12,12 addi 10,3,12 @@ -135,7 +135,7 @@ L(word_aligned): addi 11,12,4 stw 6,0(3) addi 10,3,4 - + .align 4 4: lwz 6,0(11) @@ -149,14 +149,14 @@ L(word_aligned): addi 11,11,16 addi 10,10,16 bdnz 4b -3: +3: clrrwi 0,31,2 mtcrf 0x01,31 beq cr6,0f .L9: add 3,3,0 add 12,12,0 - + /* At this point we have a tail of 0-3 bytes and we know that the destination is word aligned. */ 2: bf 30,1f @@ -175,7 +175,7 @@ L(word_aligned): addi 1,1,32 blr -/* Copy up to 31 bytes. This divided into two cases 0-8 bytes and 9-31 +/* Copy up to 31 bytes. This divided into two cases 0-8 bytes and 9-31 bytes. Each case is handled without loops, using binary (1,2,4,8) tests. @@ -208,7 +208,7 @@ L(word_unaligned_short): andi. 0,8,3 beq cr6,L(wus_8) /* Handle moves of 8 bytes. */ /* At least 9 bytes left. Get the source word aligned. */ - cmpldi cr1,5,16 + cmplwi cr1,5,16 mr 12,4 ble cr6,L(wus_4) /* Handle moves of 0-8 bytes. */ mr 11,3 @@ -241,7 +241,7 @@ L(wus_tail): /* At least 6 bytes left and the source is word aligned. This allows some speculative loads up front. 
*/ /* We need to special case the fall-through because the biggest delays - are due to address computation not being ready in time for the + are due to address computation not being ready in time for the AGEN. */ lwz 6,0(12) lwz 7,4(12) @@ -336,7 +336,7 @@ L(wus_tail4): /* Move 4 bytes. */ L(wus_tail2): /* Move 2-3 bytes. */ bf 30,L(wus_tail1) lhz 6,0(12) - sth 6,0(11) + sth 6,0(11) bf 31,L(wus_tailX) lbz 7,2(12) stb 7,2(11) @@ -368,7 +368,7 @@ L(wus_4): stw 6,0(3) bf 30,L(wus_5) lhz 7,4(4) - sth 7,4(3) + sth 7,4(3) bf 31,L(wus_0) lbz 8,6(4) stb 8,6(3) @@ -386,7 +386,7 @@ L(wus_5): L(wus_2): /* Move 2-3 bytes. */ bf 30,L(wus_1) lhz 6,0(4) - sth 6,0(3) + sth 6,0(3) bf 31,L(wus_0) lbz 7,2(4) stb 7,2(3) @@ -410,13 +410,13 @@ L(wdu): /* Copy words where the destination is aligned but the source is not. For power4, power5 and power6 machines there is penalty for - unaligned loads (src) that cross 32-byte, cacheline, or page + unaligned loads (src) that cross 32-byte, cacheline, or page boundaries. So we want to use simple (unaligned) loads where posible but avoid them where we know the load would span a 32-byte - boundary. + boundary. At this point we know we have at least 29 (32-3) bytes to copy - the src is unaligned. and we may cross at least one 32-byte + the src is unaligned. and we may cross at least one 32-byte boundary. Also we have the following regester values: r3 == adjusted dst, word aligned r4 == unadjusted src @@ -427,7 +427,7 @@ L(wdu): r31 == adjusted len First we need to copy word upto but not crossing the next 32-byte - boundary. Then perform aligned loads just before and just after + boundary. Then perform aligned loads just before and just after the boundary and use shifts and or to gernerate the next aligned word for dst. If more then 32 bytes remain we copy (unaligned src) the next 7 words and repeat the loop until less then 32-bytes @@ -442,7 +442,7 @@ L(wdu): mr 4,12 /* restore unaligned adjusted src ptr */ clrlwi 0,12,27 /* Find dist from previous 32-byte boundary. */ slwi 10,10,3 /* calculate number of bits to shift 1st word left */ - cmplwi cr5,0,16 + cmplwi cr5,0,16 subfic 8,0,32 /* Number of bytes to next 32-byte boundary. */ mtcrf 0x01,8 @@ -532,7 +532,7 @@ L(wdu_32): lwz 6,0(12) cmplwi cr6,31,4 srwi 8,31,5 /* calculate the 32 byte loop count */ - slw 0,6,10 + slw 0,6,10 clrlwi 31,31,27 /* The remaining bytes, < 32. */ blt cr5,L(wdu_32tail) mtctr 8 @@ -543,7 +543,7 @@ L(wdu_loop32): lwz 8,4(12) addi 12,12,32 lwz 7,4(4) - srw 8,8,9 + srw 8,8,9 or 0,0,8 stw 0,0(3) stw 7,4(3) @@ -562,7 +562,7 @@ L(wdu_loop32): stw 6,24(3) stw 7,28(3) addi 3,3,32 - slw 0,8,10 + slw 0,8,10 bdnz+ L(wdu_loop32) L(wdu_32tail): @@ -571,7 +571,7 @@ L(wdu_32tail): blt cr6,L(wdu_4tail) /* calculate and store the final word */ lwz 8,4(12) - srw 8,8,9 + srw 8,8,9 or 6,0,8 b L(wdu_32tailx) #endif @@ -816,7 +816,7 @@ L(wdu_4tail): beq cr6,L(wdus_0) /* If the tail is 0 bytes we are done! */ bf 30,L(wdus_3) lhz 7,0(4) - sth 7,0(3) + sth 7,0(3) bf 31,L(wdus_0) lbz 8,2(4) stb 8,2(3) diff --git a/sysdeps/powerpc/powerpc32/power6/memset.S b/sysdeps/powerpc/powerpc32/power6/memset.S index 10fb7b9786..cc65b7be9a 100644 --- a/sysdeps/powerpc/powerpc32/power6/memset.S +++ b/sysdeps/powerpc/powerpc32/power6/memset.S @@ -1,5 +1,5 @@ /* Optimized 32-bit memset implementation for POWER6. - Copyright (C) 1997,99, 2000,02,03,06,2007 Free Software Foundation, Inc. + Copyright (C) 1997,99,2000,02,03,06,2007,2009 Free Software Foundation, Inc. This file is part of the GNU C Library. 
The GNU C Library is free software; you can redistribute it and/or @@ -240,7 +240,7 @@ L(nzCacheAligned256): cmplwi cr1,rLEN,256 addi rMEMP3,rMEMP,64 #ifdef NOT_IN_libc -/* When we are not in libc we should use only GPRs to avoid the FPU lock +/* When we are not in libc we should use only GPRs to avoid the FPU lock interrupt. */ stw rCHR,0(rMEMP) stw rCHR,4(rMEMP) @@ -381,7 +381,7 @@ L(cacheAligned): blt cr1,L(cacheAligned1) li rMEMP2,128 L(cacheAlignedx): - cmpldi cr5,rLEN,640 + cmplwi cr5,rLEN,640 blt cr6,L(cacheAligned128) bgt cr5,L(cacheAligned512) cmplwi cr6,rLEN,512 diff --git a/sysdeps/unix/grantpt.c b/sysdeps/unix/grantpt.c index bdedbacec8..8c299e9147 100644 --- a/sysdeps/unix/grantpt.c +++ b/sysdeps/unix/grantpt.c @@ -1,4 +1,4 @@ -/* Copyright (C) 1998, 2000, 2001, 2002 Free Software Foundation, Inc. +/* Copyright (C) 1998, 2000, 2001, 2002, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Zack Weinberg <zack@rabi.phys.columbia.edu>, 1998. @@ -19,6 +19,7 @@ #include <assert.h> #include <errno.h> +#include <fcntl.h> #include <grp.h> #include <limits.h> #include <stdlib.h> @@ -115,8 +116,24 @@ grantpt (int fd) gid_t gid; pid_t pid; - if (pts_name (fd, &buf, sizeof (_buf))) - return -1; + if (__builtin_expect (pts_name (fd, &buf, sizeof (_buf)), 0)) + { + int save_errno = errno; + + /* Check, if the file descriptor is valid. pts_name returns the + wrong errno number, so we cannot use that. */ + if (__libc_fcntl (fd, F_GETFD) == -1 && errno == EBADF) + return -1; + + /* If the filedescriptor is no TTY, grantpt has to set errno + to EINVAL. */ + if (save_errno == ENOTTY) + __set_errno (EINVAL); + else + __set_errno (save_errno); + + return -1; + } if (__xstat64 (_STAT_VER, buf, &st) < 0) goto cleanup; @@ -185,7 +202,7 @@ grantpt (int fd) if (!WIFEXITED (w)) __set_errno (ENOEXEC); else - switch (WEXITSTATUS(w)) + switch (WEXITSTATUS (w)) { case 0: retval = 0; @@ -202,6 +219,9 @@ grantpt (int fd) case FAIL_EXEC: __set_errno (ENOEXEC); break; + case FAIL_ENOMEM: + __set_errno (ENOMEM); + break; default: assert(! "getpt: internal error: invalid exit code from pt_chown"); diff --git a/sysdeps/unix/sysv/linux/bits/socket.h b/sysdeps/unix/sysv/linux/bits/socket.h index 88062e59ad..f23b338a35 100644 --- a/sysdeps/unix/sysv/linux/bits/socket.h +++ b/sysdeps/unix/sysv/linux/bits/socket.h @@ -1,5 +1,5 @@ /* System-specific socket constants and types. Linux version. - Copyright (C) 1991, 1992, 1994-2001, 2004, 2006, 2007, 2008 + Copyright (C) 1991, 1992, 1994-2001, 2004, 2006, 2007, 2008, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -108,7 +108,8 @@ enum __socket_type #define PF_RXRPC 33 /* RxRPC sockets. */ #define PF_ISDN 34 /* mISDN sockets. */ #define PF_PHONET 35 /* Phonet sockets. */ -#define PF_MAX 36 /* For now.. */ +#define PF_IEEE802154 36 /* IEEE 802.15.4 sockets. */ +#define PF_MAX 37 /* For now.. */ /* Address families. */ #define AF_UNSPEC PF_UNSPEC @@ -148,6 +149,7 @@ enum __socket_type #define AF_RXRPC PF_RXRPC #define AF_ISDN PF_ISDN #define AF_PHONET PF_PHONET +#define AF_IEEE802154 PF_IEEE802154 #define AF_MAX PF_MAX /* Socket level values. Others are defined in the appropriate headers. diff --git a/sysdeps/unix/sysv/linux/grantpt.c b/sysdeps/unix/sysv/linux/grantpt.c deleted file mode 100644 index c858f89c8b..0000000000 --- a/sysdeps/unix/sysv/linux/grantpt.c +++ /dev/null @@ -1,88 +0,0 @@ -/* Copyright (C) 1998, 1999, 2001, 2002, 2009 Free Software Foundation, Inc. 
- This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ - -#include <errno.h> -#include <fcntl.h> -#include <limits.h> -#include <stdlib.h> -#include <sys/statfs.h> - -#include "linux_fsinfo.h" - -/* Prototype for function that changes ownership and access permission - for slave pseudo terminals that do not live on a `devpts' - filesystem. */ -static int __unix_grantpt (int fd); - -/* Prototype for private function that gets the name of the slave - pseudo terminal in a safe way. */ -static int pts_name (int fd, char **pts, size_t buf_len); - -/* Change the ownership and access permission of the slave pseudo - terminal associated with the master pseudo terminal specified - by FD. */ -int -grantpt (int fd) -{ - struct statfs fsbuf; -#ifdef PATH_MAX - char _buf[PATH_MAX]; -#else - char _buf[512]; -#endif - char *buf = _buf; - - if (__builtin_expect (pts_name (fd, &buf, sizeof (_buf)), 0)) - { - int save_errno = errno; - - /* Check, if the file descriptor is valid. pts_name returns the - wrong errno number, so we cannot use that. */ - if (__libc_fcntl (fd, F_GETFD) == -1 && errno == EBADF) - return -1; - - /* If the filedescriptor is no TTY, grantpt has to set errno - to EINVAL. */ - if (save_errno == ENOTTY) - __set_errno (EINVAL); - else - __set_errno (save_errno); - - return -1; - } - - if (__statfs (buf, &fsbuf) < 0) - return -1; - - /* If the slave pseudo terminal lives on a `devpts' filesystem, the - ownership is already set and the access permission might already - be set. */ - if (fsbuf.f_type == DEVPTS_SUPER_MAGIC || fsbuf.f_type == DEVFS_SUPER_MAGIC) - { - struct stat64 st; - - if (fstat (fd, &st) == 0 - && (st.st_mode & ACCESSPERMS) == (S_IRUSR|S_IWUSR|S_IWGRP)) - return 0; - } - - return __unix_grantpt (fd); -} - -#define grantpt static __unix_grantpt -#include <sysdeps/unix/grantpt.c> diff --git a/sysdeps/unix/sysv/linux/lddlibc4.c b/sysdeps/unix/sysv/linux/lddlibc4.c index 694d1291cd..6d57190508 100644 --- a/sysdeps/unix/sysv/linux/lddlibc4.c +++ b/sysdeps/unix/sysv/linux/lddlibc4.c @@ -59,8 +59,8 @@ main (int argc, char *argv[]) if (strcmp (argv[1], "--help") == 0) { printf (gettext ("Usage: lddlibc4 FILE\n\n")); - printf (gettext ("For bug reporting instructions, please see:\n\ -<http://www.gnu.org/software/libc/bugs.html>.\n")); + fputs (gettext ("For bug reporting instructions, please see:\n\ +<http://www.gnu.org/software/libc/bugs.html>.\n"), stdout); return 0; } else if (strcmp (argv[1], "--version") == 0) diff --git a/sysdeps/unix/sysv/linux/net/if_arp.h b/sysdeps/unix/sysv/linux/net/if_arp.h index 9608652ee4..97cb61f62b 100644 --- a/sysdeps/unix/sysv/linux/net/if_arp.h +++ b/sysdeps/unix/sysv/linux/net/if_arp.h @@ -1,5 +1,5 @@ /* Definitions for Address Resolution Protocol. - Copyright (C) 1997,1999,2001,2006 Free Software Foundation, Inc. 
+ Copyright (C) 1997,1999,2001,2006,2009 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. @@ -128,6 +128,8 @@ struct arphdr #define ARPHRD_IEEE80211 801 /* IEEE 802.11. */ #define ARPHRD_IEEE80211_PRISM 802 /* IEEE 802.11 + Prism2 header. */ #define ARPHRD_IEEE80211_RADIOTAP 803 /* IEEE 802.11 + radiotap header. */ +#define ARPHRD_IEEE802154 804 /* IEEE 802.15.4 header. */ +#define ARPHRD_IEEE802154_PHY 805 /* IEEE 802.15.4 PHY header. */ #define ARPHRD_VOID 0xFFFF /* Void type, nothing is known. */ #define ARPHRD_NONE 0xFFFE /* Zero header length. */ diff --git a/sysdeps/unix/sysv/linux/sparc/bits/socket.h b/sysdeps/unix/sysv/linux/sparc/bits/socket.h index d43a3cdf79..a148072095 100644 --- a/sysdeps/unix/sysv/linux/sparc/bits/socket.h +++ b/sysdeps/unix/sysv/linux/sparc/bits/socket.h @@ -1,5 +1,5 @@ /* System-specific socket constants and types. Linux/SPARC version. - Copyright (C) 1991, 1992, 1994-2001, 2004, 2006, 2007, 2008 + Copyright (C) 1991, 1992, 1994-2001, 2004, 2006, 2007, 2008, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -94,15 +94,21 @@ enum __socket_type #define PF_ASH 18 /* Ash. */ #define PF_ECONET 19 /* Acorn Econet. */ #define PF_ATMSVC 20 /* ATM SVCs. */ +#define PF_RDS 21 /* RDS sockets. */ #define PF_SNA 22 /* Linux SNA Project */ #define PF_IRDA 23 /* IRDA sockets. */ #define PF_PPPOX 24 /* PPPoX sockets. */ #define PF_WANPIPE 25 /* Wanpipe API sockets. */ +#define PF_LLC 26 /* Linux LLC. */ +#define PF_CAN 29 /* Controller Area Network. */ +#define PF_TIPC 30 /* TIPC sockets. */ #define PF_BLUETOOTH 31 /* Bluetooth sockets. */ #define PF_IUCV 32 /* IUCV sockets. */ #define PF_RXRPC 33 /* RxRPC sockets. */ #define PF_ISDN 34 /* mISDN sockets. */ -#define PF_MAX 35 /* For now.. */ +#define PF_PHONET 35 /* Phonet sockets. */ +#define PF_IEEE802154 36 /* IEEE 802.15.4 sockets. */ +#define PF_MAX 37 /* For now.. */ /* Address families. */ #define AF_UNSPEC PF_UNSPEC @@ -129,14 +135,20 @@ enum __socket_type #define AF_ASH PF_ASH #define AF_ECONET PF_ECONET #define AF_ATMSVC PF_ATMSVC +#define AF_RDS PF_RDS #define AF_SNA PF_SNA #define AF_IRDA PF_IRDA #define AF_PPPOX PF_PPPOX #define AF_WANPIPE PF_WANPIPE +#define AF_LLC PF_LLC +#define AF_CAN PF_CAN +#define AF_TIPC PF_TIPC #define AF_BLUETOOTH PF_BLUETOOTH #define AF_IUCV PF_IUCV #define AF_RXRPC PF_RXRPC #define AF_ISDN PF_ISDN +#define AF_PHONET PF_PHONET +#define AF_IEEE802154 PF_IEEE802154 #define AF_MAX PF_MAX /* Socket level values. Others are defined in the appropriate headers. 
diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S index 54b7af534c..6082aa7f76 100644 --- a/sysdeps/x86_64/memchr.S +++ b/sysdeps/x86_64/memchr.S @@ -41,7 +41,7 @@ ENTRY (memchr) movl $16, %esi jnz 1f cmpq %rsi, %rdx - jle 3f + jbe 3f 2: movdqa (%rdi,%rsi), %xmm0 leaq 16(%rsi), %rsi @@ -50,7 +50,7 @@ ENTRY (memchr) testl %ecx, %ecx jnz 1f cmpq %rsi, %rdx - jg 2b + ja 2b 3: xorl %eax, %eax ret @@ -60,7 +60,7 @@ ENTRY (memchr) addq %rcx, %rax leaq -16(%rsi,%rcx), %rsi cmpq %rsi, %rdx - jle 3b + jbe 3b ret END (memchr) diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 33d98c36e6..1c35e1ffb4 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -2,3 +2,7 @@ ifeq ($(subdir),csu) aux += init-arch gen-as-const-headers += ifunc-defines.sym endif + +ifeq ($(subdir),string) +sysdep_routines += strncmp-c +endif diff --git a/sysdeps/x86_64/multiarch/init-arch.h b/sysdeps/x86_64/multiarch/init-arch.h index f160ba2a94..5c4892de38 100644 --- a/sysdeps/x86_64/multiarch/init-arch.h +++ b/sysdeps/x86_64/multiarch/init-arch.h @@ -56,3 +56,6 @@ extern void __init_cpu_features (void) attribute_hidden; #define HAS_POPCOUNT \ ((__cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx & (1 << 23)) != 0) + +#define HAS_SSE4_2 \ + ((__cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx & (1 << 20)) != 0) diff --git a/sysdeps/x86_64/multiarch/strcmp.S b/sysdeps/x86_64/multiarch/strcmp.S new file mode 100644 index 0000000000..2f4bf17d95 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcmp.S @@ -0,0 +1,1677 @@ +/* strcmp with SSE4.2 + Copyright (C) 2009 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <sysdep.h> +#include <ifunc-defines.h> + +#ifdef USE_AS_STRNCMP +/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz + if the new counter > the old one or is 0. */ +#define UPDATE_STRNCMP_COUNTER \ + /* calculate left number to compare */ \ + lea -16(%rcx, %r11), %r9; \ + cmp %r9, %r11; \ + jb LABEL(strcmp_exitz); \ + test %r9, %r9; \ + je LABEL(strcmp_exitz); \ + mov %r9, %r11 + +#define STRCMP_SSE42 __strncmp_sse42 +#define STRCMP_SSE2 __strncmp_sse2 +#define __GI_STRCMP __GI_strncmp +#else +#define UPDATE_STRNCMP_COUNTER +#ifndef STRCMP +#define STRCMP strcmp +#define STRCMP_SSE42 __strcmp_sse42 +#define STRCMP_SSE2 __strcmp_sse2 +#define __GI_STRCMP __GI_strcmp +#endif +#endif + +#ifndef LABEL +#define LABEL(l) L(l) +#endif + +/* Define multiple versions only for the definition in libc. Don't + define multiple versions for strncmp in static library since we + need strncmp before the initialization happened. 
*/ +#if (defined SHARED || !defined USE_AS_STRNCMP) && !defined NOT_IN_libc + .text +ENTRY(STRCMP) + .type STRCMP, @gnu_indirect_function + cmpl $0, __cpu_features+KIND_OFFSET(%rip) + jne 1f + call __init_cpu_features +1: leaq STRCMP_SSE2(%rip), %rax + testl $(1<<20), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip) + jz 2f + leaq STRCMP_SSE42(%rip), %rax +2: ret +END(STRCMP) + +/* We use 0x1a: + _SIDD_SBYTE_OPS + | _SIDD_CMP_EQUAL_EACH + | _SIDD_NEGATIVE_POLARITY + | _SIDD_LEAST_SIGNIFICANT + on pcmpistri to find out if two 16byte data elements are the same + and the offset of the first different byte. There are 4 cases: + + 1. Both 16byte data elements are valid and identical. + 2. Both 16byte data elements have EOS and identical. + 3. Both 16byte data elements are valid and they differ at offset X. + 4. At least one 16byte data element has EOS at offset X. Two 16byte + data elements must differ at or before offset X. + + Here is the table of ECX, CFlag, ZFlag and SFlag for 4 cases: + + case ECX CFlag ZFlag SFlag + 1 16 0 0 0 + 2 16 0 1 1 + 3 X 1 0 0 + 4 0 <= X 1 0/1 0/1 + + We exit from the loop for cases 2, 3 and 4 with jbe which branches + when either CFlag or ZFlag is 1. If CFlag == 0, we return 0 for + case 2. */ + + /* Put all SSE 4.2 functions together. */ + .section .text.sse4.2,"ax",@progbits + .align 16 + .type STRCMP_SSE42, @function +STRCMP_SSE42: + cfi_startproc + CALL_MCOUNT + +/* + * This implementation uses SSE to compare up to 16 bytes at a time. + */ +#ifdef USE_AS_STRNCMP + test %rdx, %rdx + je LABEL(strcmp_exitz) + cmp $1, %rdx + je LABEL(Byte0) + mov %rdx, %r11 +#endif + mov %esi, %ecx + mov %edi, %eax +/* Use 64bit AND here to avoid long NOP padding. */ + and $0x3f, %rcx /* rsi alignment in cache line */ + and $0x3f, %rax /* rdi alignment in cache line */ + cmp $0x30, %ecx + ja LABEL(crosscache) /* rsi: 16-byte load will cross cache line */ + cmp $0x30, %eax + ja LABEL(crosscache) /* rdi: 16-byte load will cross cache line */ + movlpd (%rdi), %xmm1 + movlpd (%rsi), %xmm2 + movhpd 8(%rdi), %xmm1 + movhpd 8(%rsi), %xmm2 + pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */ + pcmpeqb %xmm1, %xmm0 /* Any null chars? */ + pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */ + jnz LABEL(less16bytes) /* If not, find different value or null char */ +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) /* finish comparision */ +#endif + add $16, %rsi /* prepare to search next 16 bytes */ + add $16, %rdi /* prepare to search next 16 bytes */ + + /* + * Determine source and destination string offsets from 16-byte alignment. + * Use relative offset difference between the two to determine which case + * below to use. 
+ */ + .p2align 4 +LABEL(crosscache): + and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */ + and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */ + mov $0xffff, %edx /* for equivalent offset */ + xor %r8d, %r8d + and $0xf, %ecx /* offset of rsi */ + and $0xf, %eax /* offset of rdi */ + cmp %eax, %ecx + je LABEL(ashr_0) /* rsi and rdi relative offset same */ + ja LABEL(bigger) + mov %edx, %r8d /* r8d is offset flag for exit tail */ + xchg %ecx, %eax + xchg %rsi, %rdi +LABEL(bigger): + lea 15(%rax), %r9 + sub %rcx, %r9 + lea LABEL(unaligned_table)(%rip), %r10 + movslq (%r10, %r9,4), %r9 + lea (%r10, %r9), %r10 + jmp *%r10 /* jump to corresponding case */ + +/* + * The following cases will be handled by ashr_0 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(0~15) n(0~15) 15(15+ n-n) ashr_0 + */ + .p2align 4 +LABEL(ashr_0): + + movdqa (%rsi), %xmm1 + pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */ + pcmpeqb %xmm1, %xmm0 /* Any null chars? */ + pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %r9d + shr %cl, %edx /* adjust 0xffff for offset */ + shr %cl, %r9d /* adjust for 16-byte offset */ + sub %r9d, %edx + /* + * edx must be the same with r9d if in left byte (16-rcx) is equal to + * the start from (16-rax) and no null char was seen. + */ + jne LABEL(less32bytes) /* mismatch or null char */ + UPDATE_STRNCMP_COUNTER + mov $16, %rcx + mov $16, %r9 + pxor %xmm0, %xmm0 /* clear xmm0, may have changed above */ + + /* + * Now both strings are aligned at 16-byte boundary. Loop over strings + * checking 32-bytes per iteration. + */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + .p2align 4 +LABEL(ashr_0_use_sse4_2): + movdqa (%rdi,%rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + lea 16(%rdx), %rdx + jbe LABEL(ashr_0_use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + movdqa (%rdi,%rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + lea 16(%rdx), %rdx + jbe LABEL(ashr_0_use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + jmp LABEL(ashr_0_use_sse4_2) + + + .p2align 4 +LABEL(ashr_0_use_sse4_2_exit): + jnc LABEL(strcmp_exitz) +#ifdef USE_AS_STRNCMP + sub %rcx, %r11 + jbe LABEL(strcmp_exitz) +#endif + lea -16(%rdx, %rcx), %rcx + movzbl (%rdi, %rcx), %eax + movzbl (%rsi, %rcx), %edx + sub %edx, %eax + ret + + + + +/* + * The following cases will be handled by ashr_1 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(15) n -15 0(15 +(n-15) - n) ashr_1 + */ + .p2align 4 +LABEL(ashr_1): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 /* Any null chars? */ + pslldq $15, %xmm2 /* shift first string to align with second */ + pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %r9d + shr %cl, %edx /* adjust 0xffff for offset */ + shr %cl, %r9d /* adjust for 16-byte offset */ + sub %r9d, %edx + jnz LABEL(less32bytes) /* mismatch or null char seen */ + movdqa (%rdi), %xmm3 + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads*/ + mov $1, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 1(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_1_use_sse4_2): + add $16, %r10 + jg LABEL(nibble_ashr_1_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $1, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_1_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $1, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_1_use_sse4_2) + + .p2align 4 +LABEL(nibble_ashr_1_use_sse4_2): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $1, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 +#ifdef USE_AS_STRNCMP + cmp %r11, %rcx + jae LABEL(nibble_ashr_use_sse4_2_exit) +#endif + cmp $14, %ecx + ja LABEL(loop_ashr_1_use_sse4_2) + + jmp LABEL(nibble_ashr_use_sse4_2_exit) + +/* + * The following cases will be handled by ashr_2 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(14~15) n -14 1(15 +(n-14) - n) ashr_2 + */ + .p2align 4 +LABEL(ashr_2): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $14, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $2, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 2(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_2_use_sse4_2): + add $16, %r10 + jg LABEL(nibble_ashr_2_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $2, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_2_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $2, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_2_use_sse4_2) + + .p2align 4 +LABEL(nibble_ashr_2_use_sse4_2): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $2, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 +#ifdef USE_AS_STRNCMP + cmp %r11, %rcx + jae LABEL(nibble_ashr_use_sse4_2_exit) +#endif + cmp $13, %ecx + ja LABEL(loop_ashr_2_use_sse4_2) + + jmp LABEL(nibble_ashr_use_sse4_2_exit) + +/* + * The following cases will be handled by ashr_3 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(13~15) n -13 2(15 +(n-13) - n) ashr_3 + */ + .p2align 4 +LABEL(ashr_3): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $13, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $3, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 3(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + +LABEL(loop_ashr_3_use_sse4_2): + add $16, %r10 + jg LABEL(nibble_ashr_3_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $3, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_3_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $3, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_3_use_sse4_2) + + .p2align 4 +LABEL(nibble_ashr_3_use_sse4_2): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $3, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 +#ifdef USE_AS_STRNCMP + cmp %r11, %rcx + jae LABEL(nibble_ashr_use_sse4_2_exit) +#endif + cmp $12, %ecx + ja LABEL(loop_ashr_3_use_sse4_2) + + jmp LABEL(nibble_ashr_use_sse4_2_exit) + +/* + * The following cases will be handled by ashr_4 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(12~15) n -12 3(15 +(n-12) - n) ashr_4 + */ + .p2align 4 +LABEL(ashr_4): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $12, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $4, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 4(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_4_use_sse4_2): + add $16, %r10 + jg LABEL(nibble_ashr_4_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $4, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_4_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $4, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_4_use_sse4_2) + + .p2align 4 +LABEL(nibble_ashr_4_use_sse4_2): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $4, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 +#ifdef USE_AS_STRNCMP + cmp %r11, %rcx + jae LABEL(nibble_ashr_use_sse4_2_exit) +#endif + cmp $11, %ecx + ja LABEL(loop_ashr_4_use_sse4_2) + + jmp LABEL(nibble_ashr_use_sse4_2_exit) + +/* + * The following cases will be handled by ashr_5 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5 + */ + .p2align 4 +LABEL(ashr_5): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $11, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $5, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 5(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_5_use_sse4_2): + add $16, %r10 + jg LABEL(nibble_ashr_5_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $5, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_5_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + + palignr $5, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_5_use_sse4_2) + + .p2align 4 +LABEL(nibble_ashr_5_use_sse4_2): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $5, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 +#ifdef USE_AS_STRNCMP + cmp %r11, %rcx + jae LABEL(nibble_ashr_use_sse4_2_exit) +#endif + cmp $10, %ecx + ja LABEL(loop_ashr_5_use_sse4_2) + + jmp LABEL(nibble_ashr_use_sse4_2_exit) + +/* + * The following cases will be handled by ashr_6 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6 + */ + .p2align 4 +LABEL(ashr_6): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $10, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $6, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 6(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_6_use_sse4_2): + add $16, %r10 + jg LABEL(nibble_ashr_6_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $6, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_6_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $6, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_6_use_sse4_2) + + .p2align 4 +LABEL(nibble_ashr_6_use_sse4_2): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $6, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 +#ifdef USE_AS_STRNCMP + cmp %r11, %rcx + jae LABEL(nibble_ashr_use_sse4_2_exit) +#endif + cmp $9, %ecx + ja LABEL(loop_ashr_6_use_sse4_2) + + jmp LABEL(nibble_ashr_use_sse4_2_exit) + +/* + * The following cases will be handled by ashr_7 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7 + */ + .p2align 4 +LABEL(ashr_7): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $9, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $7, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 7(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_7_use_sse4_2): + add $16, %r10 + jg LABEL(nibble_ashr_7_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $7, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_7_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $7, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_7_use_sse4_2) + + .p2align 4 +LABEL(nibble_ashr_7_use_sse4_2): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $7, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 +#ifdef USE_AS_STRNCMP + cmp %r11, %rcx + jae LABEL(nibble_ashr_use_sse4_2_exit) +#endif + cmp $8, %ecx + ja LABEL(loop_ashr_7_use_sse4_2) + + jmp LABEL(nibble_ashr_use_sse4_2_exit) + +/* + * The following cases will be handled by ashr_8 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8 + */ + .p2align 4 +LABEL(ashr_8): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $8, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $8, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 8(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_8_use_sse4_2): + add $16, %r10 + jg LABEL(nibble_ashr_8_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $8, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_8_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $8, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_8_use_sse4_2) + + .p2align 4 +LABEL(nibble_ashr_8_use_sse4_2): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $8, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 +#ifdef USE_AS_STRNCMP + cmp %r11, %rcx + jae LABEL(nibble_ashr_use_sse4_2_exit) +#endif + cmp $7, %ecx + ja LABEL(loop_ashr_8_use_sse4_2) + + jmp LABEL(nibble_ashr_use_sse4_2_exit) + +/* + * The following cases will be handled by ashr_9 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9 + */ + .p2align 4 +LABEL(ashr_9): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $7, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $9, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 9(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_9_use_sse4_2): + add $16, %r10 + jg LABEL(nibble_ashr_9_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + + palignr $9, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_9_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $9, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_9_use_sse4_2) + + .p2align 4 +LABEL(nibble_ashr_9_use_sse4_2): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $9, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 +#ifdef USE_AS_STRNCMP + cmp %r11, %rcx + jae LABEL(nibble_ashr_use_sse4_2_exit) +#endif + cmp $6, %ecx + ja LABEL(loop_ashr_9_use_sse4_2) + + jmp LABEL(nibble_ashr_use_sse4_2_exit) + +/* + * The following cases will be handled by ashr_10 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10 + */ + .p2align 4 +LABEL(ashr_10): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $6, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $10, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 10(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_10_use_sse4_2): + add $16, %r10 + jg LABEL(nibble_ashr_10_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $10, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_10_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $10, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_10_use_sse4_2) + + .p2align 4 +LABEL(nibble_ashr_10_use_sse4_2): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $10, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 +#ifdef USE_AS_STRNCMP + cmp %r11, %rcx + jae LABEL(nibble_ashr_use_sse4_2_exit) +#endif + cmp $5, %ecx + ja LABEL(loop_ashr_10_use_sse4_2) + + jmp LABEL(nibble_ashr_use_sse4_2_exit) + +/* + * The following cases will be handled by ashr_11 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11 + */ + .p2align 4 +LABEL(ashr_11): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $5, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $11, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 11(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_11_use_sse4_2): + add $16, %r10 + jg LABEL(nibble_ashr_11_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $11, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_11_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $11, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_11_use_sse4_2) + + .p2align 4 +LABEL(nibble_ashr_11_use_sse4_2): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $11, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 +#ifdef USE_AS_STRNCMP + cmp %r11, %rcx + jae LABEL(nibble_ashr_use_sse4_2_exit) +#endif + cmp $4, %ecx + ja LABEL(loop_ashr_11_use_sse4_2) + + jmp LABEL(nibble_ashr_use_sse4_2_exit) + +/* + * The following cases will be handled by ashr_12 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12 + */ + .p2align 4 +LABEL(ashr_12): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $4, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $12, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 12(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_12_use_sse4_2): + add $16, %r10 + jg LABEL(nibble_ashr_12_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $12, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_12_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $12, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_12_use_sse4_2) + + .p2align 4 +LABEL(nibble_ashr_12_use_sse4_2): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $12, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 +#ifdef USE_AS_STRNCMP + cmp %r11, %rcx + jae LABEL(nibble_ashr_use_sse4_2_exit) +#endif + cmp $3, %ecx + ja LABEL(loop_ashr_12_use_sse4_2) + + jmp LABEL(nibble_ashr_use_sse4_2_exit) + +/* + * The following cases will be handled by ashr_13 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13 + */ + .p2align 4 +LABEL(ashr_13): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $3, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $13, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 13(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_13_use_sse4_2): + add $16, %r10 + jg LABEL(nibble_ashr_13_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $13, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_13_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $13, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_13_use_sse4_2) + + .p2align 4 +LABEL(nibble_ashr_13_use_sse4_2): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $13, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 +#ifdef USE_AS_STRNCMP + cmp %r11, %rcx + jae LABEL(nibble_ashr_use_sse4_2_exit) +#endif + cmp $2, %ecx + ja LABEL(loop_ashr_13_use_sse4_2) + + jmp LABEL(nibble_ashr_use_sse4_2_exit) + +/* + * The following cases will be handled by ashr_14 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14 + */ + .p2align 4 +LABEL(ashr_14): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $2, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $14, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 14(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_14_use_sse4_2): + add $16, %r10 + jg LABEL(nibble_ashr_14_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $14, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_14_use_sse4_2) + + movdqa (%rdi, %rdx), %xmm0 + palignr $14, -16(%rdi, %rdx), %xmm0 + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + jbe LABEL(use_sse4_2_exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_14_use_sse4_2) + + .p2align 4 +LABEL(nibble_ashr_14_use_sse4_2): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $14, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 +#ifdef USE_AS_STRNCMP + cmp %r11, %rcx + jae LABEL(nibble_ashr_use_sse4_2_exit) +#endif + cmp $1, %ecx + ja LABEL(loop_ashr_14_use_sse4_2) + + jmp LABEL(nibble_ashr_use_sse4_2_exit) + +/* + * The following cases will be handled by ashr_15 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15 + */ + .p2align 4 +LABEL(ashr_15): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $1, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $15, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */
+ lea 15(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
+
+ .p2align 4
+LABEL(loop_ashr_15_use_sse4_2):
+ add $16, %r10
+ jg LABEL(nibble_ashr_15_use_sse4_2)
+
+ movdqa (%rdi, %rdx), %xmm0
+ palignr $15, -16(%rdi, %rdx), %xmm0
+ pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+ jbe LABEL(use_sse4_2_exit)
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+
+ add $16, %rdx
+ add $16, %r10
+ jg LABEL(nibble_ashr_15_use_sse4_2)
+
+ movdqa (%rdi, %rdx), %xmm0
+ palignr $15, -16(%rdi, %rdx), %xmm0
+ pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+ jbe LABEL(use_sse4_2_exit)
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+ add $16, %rdx
+ jmp LABEL(loop_ashr_15_use_sse4_2)
+
+ .p2align 4
+LABEL(nibble_ashr_15_use_sse4_2):
+ sub $0x1000, %r10
+ movdqa -16(%rdi, %rdx), %xmm0
+ psrldq $15, %xmm0
+ pcmpistri $0x3a,%xmm0, %xmm0
+#ifdef USE_AS_STRNCMP
+ cmp %r11, %rcx
+ jae LABEL(nibble_ashr_use_sse4_2_exit)
+#endif
+ cmp $0, %ecx
+ ja LABEL(loop_ashr_15_use_sse4_2)
+
+LABEL(nibble_ashr_use_sse4_2_exit):
+ pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+ .p2align 4
+LABEL(use_sse4_2_exit):
+ jnc LABEL(strcmp_exitz)
+#ifdef USE_AS_STRNCMP
+ sub %rcx, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+ add %rcx, %rdx
+ lea -16(%rdi, %r9), %rdi
+ movzbl (%rdi, %rdx), %eax
+ movzbl (%rsi, %rdx), %edx
+ test %r8d, %r8d
+ jz LABEL(use_sse4_2_ret)
+ xchg %eax, %edx
+LABEL(use_sse4_2_ret):
+ sub %edx, %eax
+ ret
+
+ .p2align 4
+LABEL(aftertail):
+ pcmpeqb %xmm3, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ not %edx
+
+ .p2align 4
+LABEL(exit):
+ lea -16(%r9, %rcx), %rax /* locate the exact offset for rdi */
+LABEL(less32bytes):
+ lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */
+ lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */
+ test %r8d, %r8d
+ jz LABEL(ret)
+ xchg %rsi, %rdi /* recover original order according to flag(%r8d) */
+
+ .p2align 4
+LABEL(ret):
+LABEL(less16bytes):
+ /*
+ * %rdx holds a mask of the positions where the two blocks differ or where
+ * the terminating NUL was found; locate the first such byte pair. 
+ */
+ bsf %rdx, %rdx /* find and store bit index in %rdx */
+
+#ifdef USE_AS_STRNCMP
+ sub %rdx, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+ xor %ecx, %ecx /* clear %ecx */
+ xor %eax, %eax /* clear %eax */
+
+ movb (%rsi, %rdx), %cl
+ movb (%rdi, %rdx), %al
+
+ sub %ecx, %eax
+ ret
+
+LABEL(strcmp_exitz):
+ xor %eax, %eax
+ ret
+
+ .p2align 4
+LABEL(Byte0):
+ /*
+ * never need to handle byte 0 for strncmp
+#ifdef USE_AS_STRNCMP
+ sub $0, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+ */
+ movzx (%rsi), %ecx
+ movzx (%rdi), %eax
+
+ sub %ecx, %eax
+ ret
+
+ .p2align 4
+LABEL(Byte1):
+
+#ifdef USE_AS_STRNCMP
+ sub $1, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+ movzx 1(%rsi), %ecx
+ movzx 1(%rdi), %eax
+
+ sub %ecx, %eax
+ ret
+
+ .p2align 4
+LABEL(Byte2):
+
+#ifdef USE_AS_STRNCMP
+ sub $2, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+ movzx 2(%rsi), %ecx
+ movzx 2(%rdi), %eax
+
+ sub %ecx, %eax
+ ret
+
+ .p2align 4
+LABEL(Byte3):
+
+#ifdef USE_AS_STRNCMP
+ sub $3, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+ movzx 3(%rsi), %ecx
+ movzx 3(%rdi), %eax
+
+ sub %ecx, %eax
+ ret
+
+ .p2align 4
+LABEL(Byte4):
+
+#ifdef USE_AS_STRNCMP
+ sub $4, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+ movzx 4(%rsi), %ecx
+ movzx 4(%rdi), %eax
+
+ sub %ecx, %eax
+ ret
+
+ .p2align 4
+LABEL(Byte5):
+
+#ifdef USE_AS_STRNCMP
+ sub $5, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+ movzx 5(%rsi), %ecx
+ movzx 5(%rdi), %eax
+
+ sub %ecx, %eax
+ ret
+
+ .p2align 4
+LABEL(Byte6):
+
+#ifdef USE_AS_STRNCMP
+ sub $6, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+ movzx 6(%rsi), %ecx
+ movzx 6(%rdi), %eax
+
+ sub %ecx, %eax
+ ret
+
+ .p2align 4
+LABEL(next_8_bytes):
+ add $8, %rdi
+ add $8, %rsi
+#ifdef USE_AS_STRNCMP
+ sub $8, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+ test $0x01, %dh
+ jnz LABEL(Byte0)
+
+ test $0x02, %dh
+ jnz LABEL(Byte1)
+
+ test $0x04, %dh
+ jnz LABEL(Byte2)
+
+ test $0x08, %dh
+ jnz LABEL(Byte3)
+
+ test $0x10, %dh
+ jnz LABEL(Byte4)
+
+ test $0x20, %dh
+ jnz LABEL(Byte5)
+
+ test $0x40, %dh
+ jnz LABEL(Byte6)
+
+#ifdef USE_AS_STRNCMP
+ sub $7, %r11
+ jbe LABEL(strcmp_exitz)
+#endif
+ movzx 7(%rsi), %ecx
+ movzx 7(%rdi), %eax
+
+ sub %ecx, %eax
+ ret
+ cfi_endproc
+ .size STRCMP_SSE42, .-STRCMP_SSE42
+
+ /* Put all SSE 4.2 functions together. */
+ .section .rodata.sse4.2,"a",@progbits
+ .p2align 4
+LABEL(unaligned_table):
+ .int LABEL(ashr_1) - LABEL(unaligned_table)
+ .int LABEL(ashr_2) - LABEL(unaligned_table)
+ .int LABEL(ashr_3) - LABEL(unaligned_table)
+ .int LABEL(ashr_4) - LABEL(unaligned_table)
+ .int LABEL(ashr_5) - LABEL(unaligned_table)
+ .int LABEL(ashr_6) - LABEL(unaligned_table)
+ .int LABEL(ashr_7) - LABEL(unaligned_table)
+ .int LABEL(ashr_8) - LABEL(unaligned_table)
+ .int LABEL(ashr_9) - LABEL(unaligned_table)
+ .int LABEL(ashr_10) - LABEL(unaligned_table)
+ .int LABEL(ashr_11) - LABEL(unaligned_table)
+ .int LABEL(ashr_12) - LABEL(unaligned_table)
+ .int LABEL(ashr_13) - LABEL(unaligned_table)
+ .int LABEL(ashr_14) - LABEL(unaligned_table)
+ .int LABEL(ashr_15) - LABEL(unaligned_table)
+ .int LABEL(ashr_0) - LABEL(unaligned_table)
+
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type STRCMP_SSE2, @function; \
+ STRCMP_SSE2: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size STRCMP_SSE2, .-STRCMP_SSE2
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal strcmp calls through a PLT.
+   The speedup we get from using SSE4.2 instructions is likely eaten away
+   by the indirect call in the PLT. 
 */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_STRCMP; __GI_STRCMP = STRCMP_SSE2
+#endif
+
+#ifndef USE_AS_STRNCMP
+#include "../strcmp.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/strncmp-c.c b/sysdeps/x86_64/multiarch/strncmp-c.c
new file mode 100644
index 0000000000..d4f74a418d
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncmp-c.c
@@ -0,0 +1,8 @@
+#ifdef SHARED
+#define STRNCMP __strncmp_sse2
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name) \
+  __hidden_ver1 (__strncmp_sse2, __GI_strncmp, __strncmp_sse2);
+#endif
+
+#include "strncmp.c"
diff --git a/sysdeps/x86_64/multiarch/strncmp.S b/sysdeps/x86_64/multiarch/strncmp.S
new file mode 100644
index 0000000000..0af34e7f15
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncmp.S
@@ -0,0 +1,3 @@
+#define STRCMP strncmp
+#define USE_AS_STRNCMP
+#include "strcmp.S"
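
A note on the ashr_N loops in the strcmp.S hunk above, sketched rather than taken from the patch: each loop keeps a counter in %r10 that starts at the shifted source address's offset within a 4 KiB page minus the page size and grows by 16 per iteration; as soon as it turns positive, the next aligned 16-byte load would touch the following page, so the code detours through the byte-granular "nibble" path before resuming. A minimal C model of that guard, assuming 4 KiB pages and using invented names (VEC, PAGE, next_load_would_cross_page):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum { VEC = 16, PAGE = 4096 };

/* Mirrors: lea N(%rdi),%r10; and $0xfff,%r10; sub $0x1000,%r10;
   then one "add $16,%r10; jg nibble_..." check per 16-byte step.  */
static bool
next_load_would_cross_page (const void *base, unsigned shift, unsigned step)
{
  long r10 = (long) (((uintptr_t) base + shift) & (PAGE - 1)) - PAGE;
  r10 += (long) (step + 1) * VEC;
  return r10 > 0;   /* "jg" taken: run the slow in-page tail first.  */
}

int
main (void)
{
  /* With a page-aligned base and shift 15, steps 0..254 stay inside the
     page; step 255 is the first whose load would spill into the next one.  */
  static char base[PAGE] __attribute__ ((aligned (PAGE)));
  for (unsigned step = 0; step < 300; ++step)
    if (next_load_would_cross_page (base, 15, step))
      {
        printf ("first page-crossing step: %u\n", step);
        break;
      }
  return 0;
}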
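
The pcmpistri immediates used throughout ($0x1a in the main loops, $0x3a in the nibble paths) are OR-combinations of the SSE4.2 string-compare control fields. The constants below are defined locally so the snippet compiles without -msse4.2, but their values match the _SIDD_* macros from <nmmintrin.h>; the compile-time asserts merely confirm the decomposition:

/* Signed-byte, element-wise equality, negated result, lowest index first.  */
#define SBYTE_OPS                 0x02  /* compare 16 signed bytes */
#define CMP_EQUAL_EACH            0x08  /* element-wise equality test */
#define NEGATIVE_POLARITY         0x10  /* set result bits where bytes differ */
#define MASKED_NEGATIVE_POLARITY  0x30  /* same, but only up to the NUL */
#define LEAST_SIGNIFICANT         0x00  /* %ecx receives the lowest such index */

_Static_assert ((SBYTE_OPS | CMP_EQUAL_EACH | NEGATIVE_POLARITY
                 | LEAST_SIGNIFICANT) == 0x1a, "main-loop immediate");
_Static_assert ((SBYTE_OPS | CMP_EQUAL_EACH | MASKED_NEGATIVE_POLARITY
                 | LEAST_SIGNIFICANT) == 0x3a, "nibble-path immediate");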
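
When one of the loops leaves through use_sse4_2_exit, %rcx holds the index reported by pcmpistri, %r9 the fixed byte offset set up next to each ashr_N label, and %r8d records whether the two arguments were exchanged during alignment setup. A plain-C restatement of that tail (parameter names invented; a paraphrase, not the patch's code):

#include <stddef.h>

/* Final result once the first interesting byte pair is known: subtract the
   bytes as unsigned chars, undoing any earlier operand swap.  */
int
sse42_exit_tail (const unsigned char *rdi, const unsigned char *rsi,
                 size_t rdx, size_t rcx, size_t r9, int swapped /* %r8d */)
{
  rdx += rcx;                      /* add %rcx, %rdx */
  rdi = rdi - 16 + r9;             /* lea -16(%rdi, %r9), %rdi */
  int a = rdi[rdx];                /* movzbl (%rdi, %rdx), %eax */
  int b = rsi[rdx];                /* movzbl (%rsi, %rdx), %edx */
  return swapped ? b - a : a - b;  /* xchg if %r8d != 0, then sub %edx, %eax */
}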
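
The two new files at the end of the diff show how one body serves two entry points: strncmp.S just renames the entry point and defines USE_AS_STRNCMP so the strcmp.S body also maintains the %r11 length counter, while strncmp-c.c builds the plain C strncmp as __strncmp_sse2 for the non-SSE4.2 fallback and points the internal __GI_strncmp alias at it. The same "one body, two entry points" pattern as a self-contained C sketch (compare_body, my_strcmp, my_strncmp are invented names):

#include <stddef.h>

/* Shared body: strcmp behaves like strncmp with an effectively unlimited
   length, mirroring how strcmp.S is assembled once with and once without
   USE_AS_STRNCMP.  */
static int
compare_body (const unsigned char *s1, const unsigned char *s2, size_t n)
{
  for (; n != 0; --n, ++s1, ++s2)
    if (*s1 != *s2 || *s1 == '\0')
      return (int) *s1 - (int) *s2;
  return 0;
}

int
my_strcmp (const char *a, const char *b)
{
  return compare_body ((const unsigned char *) a,
                       (const unsigned char *) b, (size_t) -1);
}

int
my_strncmp (const char *a, const char *b, size_t n)
{
  return compare_body ((const unsigned char *) a,
                       (const unsigned char *) b, n);
}

Note also the #ifndef USE_AS_STRNCMP guard around the #include of ../strcmp.S: the SSE2 assembly fallback is reused only for strcmp itself, which is why the strncmp fallback comes from the C strncmp.c instead.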