Merge commit 'refs/top-bases/t/hurdsig-fixes' into t/hurdsig-fixes

author: Samuel Thibault <samuel.thibault@ens-lyon.org> 2016-10-09 19:34:06 +0200
committer: Samuel Thibault <samuel.thibault@ens-lyon.org> 2016-10-09 19:34:06 +0200
commit: 6772d640a4f4874166a61f1859e1660a2913a89d (patch)
tree: 839fea4d5dcefab75577cecb563ccad4234eb953 /sysdeps/x86_64
parent: f98906bbb57cb495b4501afc5f18604ef3a94e2a (diff)
parent: 7bb5f8a836b916d6ebf7b6921b136e99cea2442d (diff)
381 files changed, 2909 insertions, 3519 deletions
diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile
index ef70a50c84..67ed5ba213 100644
--- a/sysdeps/x86_64/Makefile
+++ b/sysdeps/x86_64/Makefile
@@ -19,8 +19,17 @@ gen-as-const-headers += locale-defines.sym
 endif
 
 ifeq ($(subdir),elf)
+# There is no good reason to use MMX in x86-64 ld.so with GCC.
+CFLAGS-.os += $(if $(filter $(@F),$(patsubst %,%.os,$(all-rtld-routines))),\
+		   -mno-mmx)
+
 sysdep-dl-routines += tlsdesc dl-tlsdesc
 
+tests += ifuncmain8
+modules-names += ifuncmod8
+
+$(objpfx)ifuncmain8: $(objpfx)ifuncmod8.so
+
 tests += tst-quad1 tst-quad2
 modules-names += tst-quadmod1 tst-quadmod2
 
@@ -34,10 +43,12 @@ tests-pie += $(quad-pie-test)
 $(objpfx)tst-quad1pie: $(objpfx)tst-quadmod1pie.o
 $(objpfx)tst-quad2pie: $(objpfx)tst-quadmod2pie.o
 
-tests += tst-audit3 tst-audit4 tst-audit5 tst-audit10
-ifeq (yes,$(config-cflags-avx))
-tests += tst-audit6 tst-audit7
-endif
+tests += tst-audit3 tst-audit4 tst-audit5 tst-audit6 tst-audit7 tst-audit10
+
+tests += tst-split-dynreloc
+LDFLAGS-tst-split-dynreloc = -Wl,-T,$(..)sysdeps/x86_64/tst-split-dynreloc.lds
+tst-split-dynreloc-ENV = LD_BIND_NOW=1
+
 modules-names += tst-auditmod3a tst-auditmod3b \
 		tst-auditmod4a tst-auditmod4b \
 		tst-auditmod5a tst-auditmod5b \
@@ -70,18 +81,13 @@ $(objpfx)tst-audit10: $(objpfx)tst-auditmod10a.so
 $(objpfx)tst-audit10.out: $(objpfx)tst-auditmod10b.so
 tst-audit10-ENV = LD_AUDIT=$(objpfx)tst-auditmod10b.so
 
-ifeq (yes,$(config-cflags-avx))
-AVX-CFLAGS=-mavx
-ifeq (yes,$(config-cflags-novzeroupper))
-AVX-CFLAGS+=-mno-vzeroupper
-endif
+AVX-CFLAGS=-mavx -mno-vzeroupper
 CFLAGS-tst-audit4.c += $(AVX-CFLAGS)
 CFLAGS-tst-auditmod4a.c += $(AVX-CFLAGS)
 CFLAGS-tst-auditmod4b.c += $(AVX-CFLAGS)
 CFLAGS-tst-auditmod6b.c += $(AVX-CFLAGS)
 CFLAGS-tst-auditmod6c.c += $(AVX-CFLAGS)
 CFLAGS-tst-auditmod7b.c += $(AVX-CFLAGS)
-endif
 ifeq (yes,$(config-cflags-avx512))
 AVX512-CFLAGS = -mavx512f
 CFLAGS-tst-audit10.c += $(AVX512-CFLAGS)
diff --git a/sysdeps/x86_64/__longjmp.S b/sysdeps/x86_64/__longjmp.S
index a410efb08c..c164626577 100644
--- a/sysdeps/x86_64/__longjmp.S
+++ b/sysdeps/x86_64/__longjmp.S
@@ -1,4 +1,4 @@
-/* Copyright (C) 2001-2015 Free Software Foundation, Inc.
+/* Copyright (C) 2001-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/_mcount.S b/sysdeps/x86_64/_mcount.S
index 01787f95f7..5d7edd2a29 100644
--- a/sysdeps/x86_64/_mcount.S
+++ b/sysdeps/x86_64/_mcount.S
@@ -1,5 +1,5 @@
 /* Machine-specific calling sequence for `mcount' profiling function.  x86-64 version.
-   Copyright (C) 2002-2015 Free Software Foundation, Inc.
+   Copyright (C) 2002-2016 Free Software Foundation, Inc.
    Contributed by Andreas Jaeger <aj@suse.de>.
    This file is part of the GNU C Library.
 
diff --git a/sysdeps/x86_64/add_n.S b/sysdeps/x86_64/add_n.S
index b8e7c3e067..fc99811476 100644
--- a/sysdeps/x86_64/add_n.S
+++ b/sysdeps/x86_64/add_n.S
@@ -1,6 +1,6 @@
 /* x86-64 __mpn_add_n -- Add two limb vectors of the same length > 0 and store
    sum in a third limb vector.
-   Copyright (C) 2006-2015 Free Software Foundation, Inc.
+   Copyright (C) 2006-2016 Free Software Foundation, Inc.
    This file is part of the GNU MP Library.
 
    The GNU MP Library is free software; you can redistribute it and/or modify
diff --git a/sysdeps/x86_64/addmul_1.S b/sysdeps/x86_64/addmul_1.S
index 829e01eff9..ab7c2fa701 100644
--- a/sysdeps/x86_64/addmul_1.S
+++ b/sysdeps/x86_64/addmul_1.S
@@ -1,6 +1,6 @@
 /* x86-64 __mpn_addmul_1 -- Multiply a limb vector with a limb and add
    the result to a second limb vector.
-   Copyright (C) 2003-2015 Free Software Foundation, Inc.
+   Copyright (C) 2003-2016 Free Software Foundation, Inc.
    This file is part of the GNU MP Library.
 
    The GNU MP Library is free software; you can redistribute it and/or modify
diff --git a/sysdeps/x86_64/bits/atomic.h b/sysdeps/x86_64/atomic-machine.h
index 337b334db1..a5b86eb3ce 100644
--- a/sysdeps/x86_64/bits/atomic.h
+++ b/sysdeps/x86_64/atomic-machine.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2002-2015 Free Software Foundation, Inc.
+/* Copyright (C) 2002-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.
 
@@ -56,11 +56,7 @@ typedef uintmax_t uatomic_max_t;
 #endif
 
 #define __HAVE_64B_ATOMICS 1
-#if __GNUC_PREREQ (4, 7)
 #define USE_ATOMIC_COMPILER_BUILTINS 1
-#else
-#define USE_ATOMIC_COMPILER_BUILTINS 0
-#endif
 
 #define atomic_compare_and_exchange_val_acq(mem, newval, oldval) \
   __sync_val_compare_and_swap (mem, oldval, newval)
diff --git a/sysdeps/x86_64/backtrace.c b/sysdeps/x86_64/backtrace.c
index 2a3848d20f..e04407c516 100644
--- a/sysdeps/x86_64/backtrace.c
+++ b/sysdeps/x86_64/backtrace.c
@@ -1,5 +1,5 @@
 /* Return backtrace of current program state.
-   Copyright (C) 2003-2015 Free Software Foundation, Inc.
+   Copyright (C) 2003-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Jakub Jelinek <jakub@redhat.com>, 2003.
 
@@ -17,7 +17,7 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#include <bits/libc-lock.h>
+#include <libc-lock.h>
 #include <dlfcn.h>
 #include <execinfo.h>
 #include <stdlib.h>
@@ -92,11 +92,13 @@ backtrace_helper (struct _Unwind_Context *ctx, void *a)
 }
 
 int
-__backtrace (array, size)
-     void **array;
-     int size;
+__backtrace (void **array, int size)
 {
   struct trace_arg arg = { .array = array, .cfa = 0, .size = size, .cnt = -1 };
+
+  if (size <= 0)
+    return 0;
+
 #ifdef SHARED
   __libc_once_define (static, once);
 
@@ -105,8 +107,7 @@ __backtrace (array, size)
     return 0;
 #endif
 
-  if (size >= 1)
-    unwind_backtrace (backtrace_helper, &arg);
+  unwind_backtrace (backtrace_helper, &arg);
 
   /* _Unwind_Backtrace seems to put NULL address above
      _start.  Fix it up here.  */
diff --git a/sysdeps/x86_64/bsd-_setjmp.S b/sysdeps/x86_64/bsd-_setjmp.S
index fed6afd97b..1a2a94f1a6 100644
--- a/sysdeps/x86_64/bsd-_setjmp.S
+++ b/sysdeps/x86_64/bsd-_setjmp.S
@@ -1,5 +1,5 @@
 /* BSD `_setjmp' entry point to `sigsetjmp (..., 0)'.  x86-64 version.
-   Copyright (C) 1994-2015 Free Software Foundation, Inc.
+   Copyright (C) 1994-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/bsd-setjmp.S b/sysdeps/x86_64/bsd-setjmp.S
index 6a078dd786..11d9d8daa0 100644
--- a/sysdeps/x86_64/bsd-setjmp.S
+++ b/sysdeps/x86_64/bsd-setjmp.S
@@ -1,5 +1,5 @@
 /* BSD `setjmp' entry point to `sigsetjmp (..., 1)'.  x86-64 version.
-   Copyright (C) 1994-2015 Free Software Foundation, Inc.
+   Copyright (C) 1994-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/cacheinfo.c b/sysdeps/x86_64/cacheinfo.c
index b99fb9a762..96463df064 100644
--- a/sysdeps/x86_64/cacheinfo.c
+++ b/sysdeps/x86_64/cacheinfo.c
@@ -1,5 +1,5 @@
 /* x86_64 cache info.
-   Copyright (C) 2003-2015 Free Software Foundation, Inc.
+   Copyright (C) 2003-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -21,40 +21,11 @@
 #include <stdlib.h>
 #include <unistd.h>
 #include <cpuid.h>
+#include <init-arch.h>
 
-#ifndef __cpuid_count
-/* FIXME: Provide __cpuid_count if it isn't defined.  Copied from gcc
-   4.4.0.  Remove this if gcc 4.4 is the minimum requirement.  */
-# if defined(__i386__) && defined(__PIC__)
-/* %ebx may be the PIC register.  */
-#  define __cpuid_count(level, count, a, b, c, d)		\
-  __asm__ ("xchg{l}\t{%%}ebx, %1\n\t"			\
-	   "cpuid\n\t"					\
-	   "xchg{l}\t{%%}ebx, %1\n\t"			\
-	   : "=a" (a), "=r" (b), "=c" (c), "=d" (d)	\
-	   : "0" (level), "2" (count))
-# else
-#  define __cpuid_count(level, count, a, b, c, d)		\
-  __asm__ ("cpuid\n\t"					\
-	   : "=a" (a), "=b" (b), "=c" (c), "=d" (d)	\
-	   : "0" (level), "2" (count))
-# endif
-#endif
-
-#ifdef USE_MULTIARCH
-# include "multiarch/init-arch.h"
-
-# define is_intel __cpu_features.kind == arch_kind_intel
-# define is_amd __cpu_features.kind == arch_kind_amd
-# define max_cpuid __cpu_features.max_cpuid
-#else
-  /* This spells out "GenuineIntel".  */
-# define is_intel \
-  ebx == 0x756e6547 && ecx == 0x6c65746e && edx == 0x49656e69
-  /* This spells out "AuthenticAMD".  */
-# define is_amd \
-  ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65
-#endif
+#define is_intel GLRO(dl_x86_cpu_features).kind == arch_kind_intel
+#define is_amd GLRO(dl_x86_cpu_features).kind == arch_kind_amd
+#define max_cpuid GLRO(dl_x86_cpu_features).max_cpuid
 
 static const struct intel_02_cache_info
 {
@@ -235,21 +206,8 @@ intel_check_word (int name, unsigned int value, bool *has_level_2,
 	      /* Intel reused this value.  For family 15, model 6 it
 		 specifies the 3rd level cache.  Otherwise the 2nd
 		 level cache.  */
-	      unsigned int family;
-	      unsigned int model;
-#ifdef USE_MULTIARCH
-	      family = __cpu_features.family;
-	      model = __cpu_features.model;
-#else
-	      unsigned int eax;
-	      unsigned int ebx;
-	      unsigned int ecx;
-	      unsigned int edx;
-	      __cpuid (1, eax, ebx, ecx, edx);
-
-	      family = ((eax >> 20) & 0xff) + ((eax >> 8) & 0xf);
-	      model = (((eax >>16) & 0xf) << 4) + ((eax >> 4) & 0xf);
-#endif
+	      unsigned int family = GLRO(dl_x86_cpu_features).family;
+	      unsigned int model = GLRO(dl_x86_cpu_features).model;
 
 	      if (family == 15 && model == 6)
 		{
@@ -476,18 +434,6 @@ long int
 attribute_hidden
 __cache_sysconf (int name)
 {
-#ifdef USE_MULTIARCH
-  if (__cpu_features.kind == arch_kind_unknown)
-    __init_cpu_features ();
-#else
-  /* Find out what brand of processor.  */
-  unsigned int max_cpuid;
-  unsigned int ebx;
-  unsigned int ecx;
-  unsigned int edx;
-  __cpuid (0, max_cpuid, ebx, ecx, edx);
-#endif
-
   if (is_intel)
     return handle_intel (name, max_cpuid);
 
@@ -523,18 +469,6 @@ long int __x86_raw_shared_cache_size attribute_hidden = 1024 * 1024;
 int __x86_prefetchw attribute_hidden;
 #endif
 
-#ifndef DISABLE_PREFERRED_MEMORY_INSTRUCTION
-/* Instructions preferred for memory and string routines.
-
-  0: Regular instructions
-  1: MMX instructions
-  2: SSE2 instructions
-  3: SSSE3 instructions
-
-  */
-int __x86_preferred_memory_instruction attribute_hidden;
-#endif
-
 
 static void
 __attribute__((constructor))
@@ -551,14 +485,6 @@ init_cacheinfo (void)
   unsigned int level;
   unsigned int threads = 0;
 
-#ifdef USE_MULTIARCH
-  if (__cpu_features.kind == arch_kind_unknown)
-    __init_cpu_features ();
-#else
-  int max_cpuid;
-  __cpuid (0, max_cpuid, ebx, ecx, edx);
-#endif
-
   if (is_intel)
     {
       data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, max_cpuid);
@@ -574,34 +500,13 @@ init_cacheinfo (void)
 	  shared = handle_intel (_SC_LEVEL2_CACHE_SIZE, max_cpuid);
 	}
 
-      unsigned int ebx_1;
-
-#ifdef USE_MULTIARCH
-      eax = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].eax;
-      ebx_1 = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ebx;
-      ecx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx;
-      edx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].edx;
-#else
-      __cpuid (1, eax, ebx_1, ecx, edx);
-#endif
-
-      unsigned int family = (eax >> 8) & 0x0f;
-      unsigned int model = (eax >> 4) & 0x0f;
-      unsigned int extended_model = (eax >> 12) & 0xf0;
-
-#ifndef DISABLE_PREFERRED_MEMORY_INSTRUCTION
-      /* Intel prefers SSSE3 instructions for memory/string routines
-	 if they are available.  */
-      if ((ecx & 0x200))
-	__x86_preferred_memory_instruction = 3;
-      else
-	__x86_preferred_memory_instruction = 2;
-#endif
-
       /* Figure out the number of logical threads that share the
 	 highest cache level.  */
       if (max_cpuid >= 4)
 	{
+	  unsigned int family = GLRO(dl_x86_cpu_features).family;
+	  unsigned int model = GLRO(dl_x86_cpu_features).model;
+
 	  int i = 0;
 
 	  /* Query until desired cache level is enumerated.  */
@@ -653,7 +558,6 @@ init_cacheinfo (void)
 	  threads += 1;
 	  if (threads > 2 && level == 2 && family == 6)
 	    {
-	      model += extended_model;
 	      switch (model)
 		{
 		case 0x57:
@@ -676,7 +580,9 @@ init_cacheinfo (void)
 	intel_bug_no_cache_info:
 	  /* Assume that all logical threads share the highest cache level.  */
 
-	  threads = (ebx_1 >> 16) & 0xff;
+	  threads
+	    = ((GLRO(dl_x86_cpu_features).cpuid[COMMON_CPUID_INDEX_1].ebx
+		>> 16) & 0xff);
 	}
 
       /* Cap usage of highest cache level to the number of supported
@@ -691,25 +597,6 @@ init_cacheinfo (void)
       long int core = handle_amd (_SC_LEVEL2_CACHE_SIZE);
       shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);
 
-#ifndef DISABLE_PREFERRED_MEMORY_INSTRUCTION
-# ifdef USE_MULTIARCH
-      eax = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].eax;
-      ebx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ebx;
-      ecx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx;
-      edx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].edx;
-# else
-      __cpuid (1, eax, ebx, ecx, edx);
-# endif
-
-      /* AMD prefers SSSE3 instructions for memory/string routines
-	 if they are avaiable, otherwise it prefers integer
-	 instructions.  */
-      if ((ecx & 0x200))
-	__x86_preferred_memory_instruction = 3;
-      else
-	__x86_preferred_memory_instruction = 0;
-#endif
-
       /* Get maximum extended function. */
       __cpuid (0x80000000, max_cpuid_ex, ebx, ecx, edx);
 
diff --git a/sysdeps/x86_64/configure b/sysdeps/x86_64/configure
index 552f535ac6..c72b9d3184 100644
--- a/sysdeps/x86_64/configure
+++ b/sysdeps/x86_64/configure
@@ -1,100 +1,6 @@
-
-# ac_fn_c_check_header_compile LINENO HEADER VAR INCLUDES
-# -------------------------------------------------------
-# Tests whether HEADER exists and can be compiled using the include files in
-# INCLUDES, setting the cache variable VAR accordingly.
-ac_fn_c_check_header_compile ()
-{
-  as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
-  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5
-$as_echo_n "checking for $2... " >&6; }
-if eval \${$3+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-$4
-#include <$2>
-_ACEOF
-if ac_fn_c_try_compile "$LINENO"; then :
-  eval "$3=yes"
-else
-  eval "$3=no"
-fi
-rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
-fi
-eval ac_res=\$$3
-	       { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
-$as_echo "$ac_res" >&6; }
-  eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
-
-} # ac_fn_c_check_header_compile
 # This file is generated from configure.ac by Autoconf.  DO NOT EDIT!
  # Local configure fragment for sysdeps/x86_64.
 
-
-ac_fn_c_check_header_compile "$LINENO" "cpuid.h" "ac_cv_header_cpuid_h" "/* No default includes.  */
-"
-if test "x$ac_cv_header_cpuid_h" = xyes; then :
-
-else
-  as_fn_error $? "gcc must provide the <cpuid.h> header" "$LINENO" 5
-fi
-
-
-
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for SSE4 support" >&5
-$as_echo_n "checking for SSE4 support... " >&6; }
-if ${libc_cv_cc_sse4+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  if { ac_try='${CC-cc} -msse4 -xc /dev/null -S -o /dev/null'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; }; then :
-  libc_cv_cc_sse4=yes
-else
-  libc_cv_cc_sse4=no
-fi
-
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_cc_sse4" >&5
-$as_echo "$libc_cv_cc_sse4" >&6; }
-if test $libc_cv_cc_sse4 = yes; then
-  $as_echo "#define HAVE_SSE4_SUPPORT 1" >>confdefs.h
-
-fi
-config_vars="$config_vars
-config-cflags-sse4 = $libc_cv_cc_sse4"
-
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for AVX support" >&5
-$as_echo_n "checking for AVX support... " >&6; }
-if ${libc_cv_cc_avx+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  if { ac_try='${CC-cc} -mavx -xc /dev/null -S -o /dev/null'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; }; then :
-  libc_cv_cc_avx=yes
-else
-  libc_cv_cc_avx=no
-fi
-
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_cc_avx" >&5
-$as_echo "$libc_cv_cc_avx" >&6; }
-if test $libc_cv_cc_avx = yes; then
-  $as_echo "#define HAVE_AVX_SUPPORT 1" >>confdefs.h
-
-fi
-config_vars="$config_vars
-config-cflags-avx = $libc_cv_cc_avx"
-
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for AVX512 support in assembler" >&5
 $as_echo_n "checking for AVX512 support in assembler... " >&6; }
 if ${libc_cv_asm_avx512+:} false; then :
@@ -149,80 +55,6 @@ fi
 config_vars="$config_vars
 config-cflags-avx512 = $libc_cv_cc_avx512"
 
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for AVX encoding of SSE instructions" >&5
-$as_echo_n "checking for AVX encoding of SSE instructions... " >&6; }
-if ${libc_cv_cc_sse2avx+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  if { ac_try='${CC-cc} -msse2avx -xc /dev/null -S -o /dev/null'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; }; then :
-  libc_cv_cc_sse2avx=yes
-else
-  libc_cv_cc_sse2avx=no
-fi
-
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_cc_sse2avx" >&5
-$as_echo "$libc_cv_cc_sse2avx" >&6; }
-if test $libc_cv_cc_sse2avx = yes; then
-  $as_echo "#define HAVE_SSE2AVX_SUPPORT 1" >>confdefs.h
-
-fi
-config_vars="$config_vars
-config-cflags-sse2avx = $libc_cv_cc_sse2avx"
-
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for FMA4 support" >&5
-$as_echo_n "checking for FMA4 support... " >&6; }
-if ${libc_cv_cc_fma4+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  if { ac_try='${CC-cc} -mfma4 -xc /dev/null -S -o /dev/null'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; }; then :
-  libc_cv_cc_fma4=yes
-else
-  libc_cv_cc_fma4=no
-fi
-
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_cc_fma4" >&5
-$as_echo "$libc_cv_cc_fma4" >&6; }
-if test $libc_cv_cc_fma4 = yes; then
-  $as_echo "#define HAVE_FMA4_SUPPORT 1" >>confdefs.h
-
-fi
-config_vars="$config_vars
-have-mfma4 = $libc_cv_cc_fma4"
-
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for -mno-vzeroupper support" >&5
-$as_echo_n "checking for -mno-vzeroupper support... " >&6; }
-if ${libc_cv_cc_novzeroupper+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  if { ac_try='${CC-cc} -mno-vzeroupper -xc /dev/null -S -o /dev/null'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; }; then :
-  libc_cv_cc_novzeroupper=yes
-else
-  libc_cv_cc_novzeroupper=no
-fi
-
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_cc_novzeroupper" >&5
-$as_echo "$libc_cv_cc_novzeroupper" >&6; }
-config_vars="$config_vars
-config-cflags-novzeroupper = $libc_cv_cc_novzeroupper"
-
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for Intel MPX support" >&5
 $as_echo_n "checking for Intel MPX support... " >&6; }
 if ${libc_cv_asm_mpx+:} false; then :
@@ -250,32 +82,6 @@ if test $libc_cv_asm_mpx == yes; then
 
 fi
 
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for AVX2 support" >&5
-$as_echo_n "checking for AVX2 support... " >&6; }
-if ${libc_cv_cc_avx2+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  if { ac_try='${CC-cc} -mavx2 -xc /dev/null -S -o /dev/null'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; }; then :
-  libc_cv_cc_avx2=yes
-else
-  libc_cv_cc_avx2=no
-fi
-
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_cc_avx2" >&5
-$as_echo "$libc_cv_cc_avx2" >&6; }
-if test $libc_cv_cc_avx2 = yes; then
-  $as_echo "#define HAVE_AVX2_SUPPORT 1" >>confdefs.h
-
-fi
-config_vars="$config_vars
-config-cflags-avx2 = $libc_cv_cc_avx2"
-
 if test x"$build_mathvec" = xnotset; then
   build_mathvec=yes
 fi
diff --git a/sysdeps/x86_64/configure.ac b/sysdeps/x86_64/configure.ac
index e7208c9b30..37b1059af3 100644
--- a/sysdeps/x86_64/configure.ac
+++ b/sysdeps/x86_64/configure.ac
@@ -1,28 +1,6 @@
 GLIBC_PROVIDES dnl See aclocal.m4 in the top level source directory.
 # Local configure fragment for sysdeps/x86_64.
 
-AC_CHECK_HEADER([cpuid.h], ,
-  [AC_MSG_ERROR([gcc must provide the <cpuid.h> header])],
-  [/* No default includes.  */])
-
-dnl Check if -msse4 works.
-AC_CACHE_CHECK(for SSE4 support, libc_cv_cc_sse4, [dnl
-LIBC_TRY_CC_OPTION([-msse4], [libc_cv_cc_sse4=yes], [libc_cv_cc_sse4=no])
-])
-if test $libc_cv_cc_sse4 = yes; then
-  AC_DEFINE(HAVE_SSE4_SUPPORT)
-fi
-LIBC_CONFIG_VAR([config-cflags-sse4], [$libc_cv_cc_sse4])
-
-dnl Check if -mavx works.
-AC_CACHE_CHECK(for AVX support, libc_cv_cc_avx, [dnl
-LIBC_TRY_CC_OPTION([-mavx], [libc_cv_cc_avx=yes], [libc_cv_cc_avx=no])
-])
-if test $libc_cv_cc_avx = yes; then
-  AC_DEFINE(HAVE_AVX_SUPPORT)
-fi
-LIBC_CONFIG_VAR([config-cflags-avx], [$libc_cv_cc_avx])
-
 dnl Check if asm supports AVX512.
 AC_CACHE_CHECK(for AVX512 support in assembler, libc_cv_asm_avx512, [dnl
 cat > conftest.s <<\EOF
@@ -48,34 +26,6 @@ if test $libc_cv_cc_avx512 = yes; then
 fi
 LIBC_CONFIG_VAR([config-cflags-avx512], [$libc_cv_cc_avx512])
 
-dnl Check if -msse2avx works.
-AC_CACHE_CHECK(for AVX encoding of SSE instructions, libc_cv_cc_sse2avx, [dnl
-LIBC_TRY_CC_OPTION([-msse2avx],
-		   [libc_cv_cc_sse2avx=yes],
-		   [libc_cv_cc_sse2avx=no])
-])
-if test $libc_cv_cc_sse2avx = yes; then
-  AC_DEFINE(HAVE_SSE2AVX_SUPPORT)
-fi
-LIBC_CONFIG_VAR([config-cflags-sse2avx], [$libc_cv_cc_sse2avx])
-
-dnl Check if -mfma4 works.
-AC_CACHE_CHECK(for FMA4 support, libc_cv_cc_fma4, [dnl
-LIBC_TRY_CC_OPTION([-mfma4], [libc_cv_cc_fma4=yes], [libc_cv_cc_fma4=no])
-])
-if test $libc_cv_cc_fma4 = yes; then
-  AC_DEFINE(HAVE_FMA4_SUPPORT)
-fi
-LIBC_CONFIG_VAR([have-mfma4], [$libc_cv_cc_fma4])
-
-dnl Check if -mno-vzeroupper works.
-AC_CACHE_CHECK(for -mno-vzeroupper support, libc_cv_cc_novzeroupper, [dnl
-LIBC_TRY_CC_OPTION([-mno-vzeroupper],
-		   [libc_cv_cc_novzeroupper=yes],
-		   [libc_cv_cc_novzeroupper=no])
-])
-LIBC_CONFIG_VAR([config-cflags-novzeroupper], [$libc_cv_cc_novzeroupper])
-
 dnl Check whether asm supports Intel MPX
 AC_CACHE_CHECK(for Intel MPX support, libc_cv_asm_mpx, [dnl
 cat > conftest.s <<\EOF
@@ -91,15 +41,6 @@ if test $libc_cv_asm_mpx == yes; then
   AC_DEFINE(HAVE_MPX_SUPPORT)
 fi
 
-dnl Check if -mavx2 works.
-AC_CACHE_CHECK(for AVX2 support, libc_cv_cc_avx2, [dnl
-LIBC_TRY_CC_OPTION([-mavx2], [libc_cv_cc_avx2=yes], [libc_cv_cc_avx2=no])
-])
-if test $libc_cv_cc_avx2 = yes; then
-  AC_DEFINE(HAVE_AVX2_SUPPORT)
-fi
-LIBC_CONFIG_VAR([config-cflags-avx2], [$libc_cv_cc_avx2])
-
 if test x"$build_mathvec" = xnotset; then
   build_mathvec=yes
 fi
diff --git a/sysdeps/x86_64/crti.S b/sysdeps/x86_64/crti.S
index 595b0fe83b..a34525974a 100644
--- a/sysdeps/x86_64/crti.S
+++ b/sysdeps/x86_64/crti.S
@@ -1,5 +1,5 @@
 /* Special .init and .fini section support for x86-64.
-   Copyright (C) 2012-2015 Free Software Foundation, Inc.
+   Copyright (C) 2012-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/crtn.S b/sysdeps/x86_64/crtn.S
index e2d6de73e4..b2fa0c6765 100644
--- a/sysdeps/x86_64/crtn.S
+++ b/sysdeps/x86_64/crtn.S
@@ -1,5 +1,5 @@
 /* Special .init and .fini section support for x86-64.
-   Copyright (C) 2012-2015 Free Software Foundation, Inc.
+   Copyright (C) 2012-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/dl-irel.h b/sysdeps/x86_64/dl-irel.h
index d0fa4330cc..80d7d1dd78 100644
--- a/sysdeps/x86_64/dl-irel.h
+++ b/sysdeps/x86_64/dl-irel.h
@@ -1,6 +1,6 @@
 /* Machine-dependent ELF indirect relocation inline functions.
    x86-64 version.
-   Copyright (C) 2009-2015 Free Software Foundation, Inc.
+   Copyright (C) 2009-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/dl-lookupcfg.h b/sysdeps/x86_64/dl-lookupcfg.h
index 310f261fec..033b475889 100644
--- a/sysdeps/x86_64/dl-lookupcfg.h
+++ b/sysdeps/x86_64/dl-lookupcfg.h
@@ -1,5 +1,5 @@
 /* Configuration of lookup functions.
-   Copyright (C) 2005-2015 Free Software Foundation, Inc.
+   Copyright (C) 2005-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -26,6 +26,7 @@
 
 struct link_map;
 
-extern void internal_function _dl_unmap (struct link_map *map);
+extern void _dl_unmap (struct link_map *map)
+  internal_function attribute_hidden;
 
 #define DL_UNMAP(map) _dl_unmap (map)
diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
index cae6db3560..980ca73cf2 100644
--- a/sysdeps/x86_64/dl-machine.h
+++ b/sysdeps/x86_64/dl-machine.h
@@ -1,5 +1,5 @@
 /* Machine-dependent ELF dynamic relocation inline functions.  x86-64 version.
-   Copyright (C) 2001-2015 Free Software Foundation, Inc.
+   Copyright (C) 2001-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Andreas Jaeger <aj@suse.de>.
 
@@ -26,6 +26,7 @@
 #include <sysdep.h>
 #include <tls.h>
 #include <dl-tlsdesc.h>
+#include <cpu-features.c>
 
 /* Return nonzero iff ELF header is compatible with the running host.  */
 static inline int __attribute__ ((unused))
@@ -65,8 +66,12 @@ static inline int __attribute__ ((unused, always_inline))
 elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
 {
   Elf64_Addr *got;
-  extern void _dl_runtime_resolve (ElfW(Word)) attribute_hidden;
-  extern void _dl_runtime_profile (ElfW(Word)) attribute_hidden;
+  extern void _dl_runtime_resolve_sse (ElfW(Word)) attribute_hidden;
+  extern void _dl_runtime_resolve_avx (ElfW(Word)) attribute_hidden;
+  extern void _dl_runtime_resolve_avx512 (ElfW(Word)) attribute_hidden;
+  extern void _dl_runtime_profile_sse (ElfW(Word)) attribute_hidden;
+  extern void _dl_runtime_profile_avx (ElfW(Word)) attribute_hidden;
+  extern void _dl_runtime_profile_avx512 (ElfW(Word)) attribute_hidden;
 
   if (l->l_info[DT_JMPREL] && lazy)
     {
@@ -94,7 +99,12 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
 	 end in this function.  */
       if (__glibc_unlikely (profile))
 	{
-	  *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile;
+	  if (HAS_ARCH_FEATURE (AVX512F_Usable))
+	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx512;
+	  else if (HAS_ARCH_FEATURE (AVX_Usable))
+	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx;
+	  else
+	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_sse;
 
 	  if (GLRO(dl_profile) != NULL
 	      && _dl_name_match_p (GLRO(dl_profile), l))
@@ -103,9 +113,17 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
 	    GL(dl_profile_map) = l;
 	}
       else
-	/* This function will get called to fix up the GOT entry indicated by
-	   the offset on the stack, and then jump to the resolved address.  */
-	*(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve;
+	{
+	  /* This function will get called to fix up the GOT entry
+	     indicated by the offset on the stack, and then jump to
+	     the resolved address.  */
+	  if (HAS_ARCH_FEATURE (AVX512F_Usable))
+	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve_avx512;
+	  else if (HAS_ARCH_FEATURE (AVX_Usable))
+	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve_avx;
+	  else
+	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve_sse;
+	}
     }
 
   if (l->l_info[ADDRIDX (DT_TLSDESC_GOT)] && lazy)
@@ -205,6 +223,8 @@ dl_platform_init (void)
   if (GLRO(dl_platform) != NULL && *GLRO(dl_platform) == '\0')
     /* Avoid an empty string which would disturb us.  */
     GLRO(dl_platform) = NULL;
+
+  init_cpu_features (&GLRO(dl_x86_cpu_features));
 }
 
 static inline ElfW(Addr)
diff --git a/sysdeps/x86_64/dl-procinfo.c b/sysdeps/x86_64/dl-procinfo.c
new file mode 100644
index 0000000000..4625695dfb
--- /dev/null
+++ b/sysdeps/x86_64/dl-procinfo.c
@@ -0,0 +1,57 @@
+/* Data for x86-64 version of processor capability information.
+   Copyright (C) 2015-2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* If anything should be added here check whether the size of each string
+   is still ok with the given array size.
+
+   All the #ifdefs in the definitions are quite irritating but
+   necessary if we want to avoid duplicating the information.  There
+   are three different modes:
+
+   - PROCINFO_DECL is defined.  This means we are only interested in
+     declarations.
+
+   - PROCINFO_DECL is not defined:
+
+     + if SHARED is defined the file is included in an array
+       initializer.  The .element = { ... } syntax is needed.
+
+     + if SHARED is not defined a normal array initialization is
+       needed.
+  */
+
+#ifndef PROCINFO_CLASS
+# define PROCINFO_CLASS
+#endif
+
+#if !defined PROCINFO_DECL && defined SHARED
+  ._dl_x86_cpu_features
+#else
+PROCINFO_CLASS struct cpu_features _dl_x86_cpu_features
+#endif
+#ifndef PROCINFO_DECL
+= { }
+#endif
+#if !defined SHARED || defined PROCINFO_DECL
+;
+#else
+,
+#endif
+
+#undef PROCINFO_DECL
+#undef PROCINFO_CLASS
diff --git a/sysdeps/x86_64/dl-tls.h b/sysdeps/x86_64/dl-tls.h
index 285799b674..0f101e6ac6 100644
--- a/sysdeps/x86_64/dl-tls.h
+++ b/sysdeps/x86_64/dl-tls.h
@@ -1,5 +1,5 @@
 /* Thread-local storage handling in the ELF dynamic linker.  x86-64 version.
-   Copyright (C) 2002-2015 Free Software Foundation, Inc.
+   Copyright (C) 2002-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/dl-tlsdesc.S b/sysdeps/x86_64/dl-tlsdesc.S
index edb6328f8e..3cb7c3d031 100644
--- a/sysdeps/x86_64/dl-tlsdesc.S
+++ b/sysdeps/x86_64/dl-tlsdesc.S
@@ -1,5 +1,5 @@
 /* Thread-local storage handling in the ELF dynamic linker.  x86_64 version.
-   Copyright (C) 2004-2015 Free Software Foundation, Inc.
+   Copyright (C) 2004-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/dl-tlsdesc.h b/sysdeps/x86_64/dl-tlsdesc.h
index cf32328264..11e1a50b8f 100644
--- a/sysdeps/x86_64/dl-tlsdesc.h
+++ b/sysdeps/x86_64/dl-tlsdesc.h
@@ -1,6 +1,6 @@
 /* Thread-local storage descriptor handling in the ELF dynamic linker.
    x86_64 version.
-   Copyright (C) 2005-2015 Free Software Foundation, Inc.
+   Copyright (C) 2005-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -60,8 +60,9 @@ extern ptrdiff_t attribute_hidden
   _dl_tlsdesc_resolve_hold(struct tlsdesc *on_rax);
 
 # ifdef SHARED
-extern void *internal_function _dl_make_tlsdesc_dynamic (struct link_map *map,
-							 size_t ti_offset);
+extern void *_dl_make_tlsdesc_dynamic (struct link_map *map,
+				       size_t ti_offset)
+  internal_function attribute_hidden;
 
 extern ptrdiff_t attribute_hidden _dl_tlsdesc_dynamic(struct tlsdesc *);
 # endif
diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
index 678c57fc24..9fb6b13983 100644
--- a/sysdeps/x86_64/dl-trampoline.S
+++ b/sysdeps/x86_64/dl-trampoline.S
@@ -1,5 +1,5 @@
 /* PLT trampolines.  x86-64 version.
-   Copyright (C) 2004-2015 Free Software Foundation, Inc.
+   Copyright (C) 2004-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -20,23 +20,40 @@
 #include <sysdep.h>
 #include <link-defines.h>
 
-#if (RTLD_SAVESPACE_SSE % 32) != 0
-# error RTLD_SAVESPACE_SSE must be aligned to 32 bytes
+#ifndef DL_STACK_ALIGNMENT
+/* Due to GCC bug:
+
+   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
+
+   __tls_get_addr may be called with 8-byte stack alignment.  Although
+   this bug has been fixed in GCC 4.9.4, 5.3 and 6, we can't assume
+   that stack will be always aligned at 16 bytes.  We use unaligned
+   16-byte move to load and store SSE registers, which has no penalty
+   on modern processors if stack is 16-byte aligned.  */
+# define DL_STACK_ALIGNMENT 8
+#endif
+
+#ifndef DL_RUNIME_UNALIGNED_VEC_SIZE
+/* The maximum size of unaligned vector load and store.  */
+# define DL_RUNIME_UNALIGNED_VEC_SIZE 16
 #endif
 
+/* True if _dl_runtime_resolve should align stack to VEC_SIZE bytes.  */
+#define DL_RUNIME_RESOLVE_REALIGN_STACK \
+  (VEC_SIZE > DL_STACK_ALIGNMENT \
+   && VEC_SIZE > DL_RUNIME_UNALIGNED_VEC_SIZE)
+
+/* Align vector register save area to 16 bytes.  */
+#define REGISTER_SAVE_VEC_OFF	0
+
 /* Area on stack to save and restore registers used for parameter
    passing when calling _dl_fixup.  */
 #ifdef __ILP32__
-/* X32 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX.  */
-# define REGISTER_SAVE_AREA	(8 * 7)
-# define REGISTER_SAVE_RAX	0
+# define REGISTER_SAVE_RAX	(REGISTER_SAVE_VEC_OFF + VEC_SIZE * 8)
 # define PRESERVE_BND_REGS_PREFIX
 #else
-/* X86-64 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as BND0,
-   BND1, BND2, BND3.  */
-# define REGISTER_SAVE_AREA	(8 * 7 + 16 * 4)
 /* Align bound register save area to 16 bytes.  */
-# define REGISTER_SAVE_BND0	0
+# define REGISTER_SAVE_BND0	(REGISTER_SAVE_VEC_OFF + VEC_SIZE * 8)
 # define REGISTER_SAVE_BND1	(REGISTER_SAVE_BND0 + 16)
 # define REGISTER_SAVE_BND2	(REGISTER_SAVE_BND1 + 16)
 # define REGISTER_SAVE_BND3	(REGISTER_SAVE_BND2 + 16)
@@ -54,386 +71,61 @@
 #define REGISTER_SAVE_R8	(REGISTER_SAVE_RDI + 8)
 #define REGISTER_SAVE_R9	(REGISTER_SAVE_R8 + 8)
 
-	.text
-	.globl _dl_runtime_resolve
-	.type _dl_runtime_resolve, @function
-	.align 16
-	cfi_startproc
-_dl_runtime_resolve:
-	cfi_adjust_cfa_offset(16) # Incorporate PLT
-	subq $REGISTER_SAVE_AREA,%rsp
-	cfi_adjust_cfa_offset(REGISTER_SAVE_AREA)
-	# Preserve registers otherwise clobbered.
-	movq %rax, REGISTER_SAVE_RAX(%rsp)
-	movq %rcx, REGISTER_SAVE_RCX(%rsp)
-	movq %rdx, REGISTER_SAVE_RDX(%rsp)
-	movq %rsi, REGISTER_SAVE_RSI(%rsp)
-	movq %rdi, REGISTER_SAVE_RDI(%rsp)
-	movq %r8, REGISTER_SAVE_R8(%rsp)
-	movq %r9, REGISTER_SAVE_R9(%rsp)
-#ifndef __ILP32__
-	# We also have to preserve bound registers.  These are nops if
-	# Intel MPX isn't available or disabled.
-# ifdef HAVE_MPX_SUPPORT
-	bndmov %bnd0, REGISTER_SAVE_BND0(%rsp)
-	bndmov %bnd1, REGISTER_SAVE_BND1(%rsp)
-	bndmov %bnd2, REGISTER_SAVE_BND2(%rsp)
-	bndmov %bnd3, REGISTER_SAVE_BND3(%rsp)
-# else
-#  if REGISTER_SAVE_BND0 == 0
-	.byte 0x66,0x0f,0x1b,0x04,0x24
-#  else
-	.byte 0x66,0x0f,0x1b,0x44,0x24,REGISTER_SAVE_BND0
-#  endif
-	.byte 0x66,0x0f,0x1b,0x4c,0x24,REGISTER_SAVE_BND1
-	.byte 0x66,0x0f,0x1b,0x54,0x24,REGISTER_SAVE_BND2
-	.byte 0x66,0x0f,0x1b,0x5c,0x24,REGISTER_SAVE_BND3
-# endif
-#endif
-	# Copy args pushed by PLT in register.
-	# %rdi: link_map, %rsi: reloc_index
-	movq (REGISTER_SAVE_AREA + 8)(%rsp), %rsi
-	movq REGISTER_SAVE_AREA(%rsp), %rdi
-	call _dl_fixup		# Call resolver.
-	movq %rax, %r11		# Save return value
-#ifndef __ILP32__
-	# Restore bound registers.  These are nops if Intel MPX isn't
-	# avaiable or disabled.
-# ifdef HAVE_MPX_SUPPORT
-	bndmov REGISTER_SAVE_BND3(%rsp), %bnd3
-	bndmov REGISTER_SAVE_BND2(%rsp), %bnd2
-	bndmov REGISTER_SAVE_BND1(%rsp), %bnd1
-	bndmov REGISTER_SAVE_BND0(%rsp), %bnd0
-# else
-	.byte 0x66,0x0f,0x1a,0x5c,0x24,REGISTER_SAVE_BND3
-	.byte 0x66,0x0f,0x1a,0x54,0x24,REGISTER_SAVE_BND2
-	.byte 0x66,0x0f,0x1a,0x4c,0x24,REGISTER_SAVE_BND1
-#  if REGISTER_SAVE_BND0 == 0
-	.byte 0x66,0x0f,0x1a,0x04,0x24
-#  else
-	.byte 0x66,0x0f,0x1a,0x44,0x24,REGISTER_SAVE_BND0
-#  endif
-# endif
-#endif
-	# Get register content back.
-	movq REGISTER_SAVE_R9(%rsp), %r9
-	movq REGISTER_SAVE_R8(%rsp), %r8
-	movq REGISTER_SAVE_RDI(%rsp), %rdi
-	movq REGISTER_SAVE_RSI(%rsp), %rsi
-	movq REGISTER_SAVE_RDX(%rsp), %rdx
-	movq REGISTER_SAVE_RCX(%rsp), %rcx
-	movq REGISTER_SAVE_RAX(%rsp), %rax
-	# Adjust stack(PLT did 2 pushes)
-	addq $(REGISTER_SAVE_AREA + 16), %rsp
-	cfi_adjust_cfa_offset(-(REGISTER_SAVE_AREA + 16))
-	# Preserve bound registers.
-	PRESERVE_BND_REGS_PREFIX
-	jmp *%r11		# Jump to function address.
-	cfi_endproc
-	.size _dl_runtime_resolve, .-_dl_runtime_resolve
-
-
-#ifndef PROF
-	.globl _dl_runtime_profile
-	.type _dl_runtime_profile, @function
-	.align 16
-	cfi_startproc
-
-_dl_runtime_profile:
-	cfi_adjust_cfa_offset(16) # Incorporate PLT
-	/* The La_x86_64_regs data structure pointed to by the
-	   fourth paramater must be 16-byte aligned.  This must
-	   be explicitly enforced.  We have the set up a dynamically
-	   sized stack frame.  %rbx points to the top half which
-	   has a fixed size and preserves the original stack pointer.  */
-
-	subq $32, %rsp		# Allocate the local storage.
-	cfi_adjust_cfa_offset(32)
-	movq %rbx, (%rsp)
-	cfi_rel_offset(%rbx, 0)
+#define RESTORE_AVX
 
-	/* On the stack:
-		56(%rbx)	parameter #1
-		48(%rbx)	return address
-
-		40(%rbx)	reloc index
-		32(%rbx)	link_map
-
-		24(%rbx)	La_x86_64_regs pointer
-		16(%rbx)	framesize
-		 8(%rbx)	rax
-		  (%rbx)	rbx
-	*/
-
-	movq %rax, 8(%rsp)
-	movq %rsp, %rbx
-	cfi_def_cfa_register(%rbx)
-
-	/* Actively align the La_x86_64_regs structure.  */
-	andq $0xfffffffffffffff0, %rsp
-# if defined HAVE_AVX_SUPPORT || defined HAVE_AVX512_ASM_SUPPORT
-	/* sizeof(La_x86_64_regs).  Need extra space for 8 SSE registers
-	   to detect if any xmm0-xmm7 registers are changed by audit
-	   module.  */
-	subq $(LR_SIZE + XMM_SIZE*8), %rsp
+#ifdef HAVE_AVX512_ASM_SUPPORT
+# define VEC_SIZE		64
+# define VMOVA			vmovdqa64
+# if DL_RUNIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT
+#  define VMOV			vmovdqa64
 # else
-	subq $LR_SIZE, %rsp		# sizeof(La_x86_64_regs)
-# endif
-	movq %rsp, 24(%rbx)
-
-	/* Fill the La_x86_64_regs structure.  */
-	movq %rdx, LR_RDX_OFFSET(%rsp)
-	movq %r8,  LR_R8_OFFSET(%rsp)
-	movq %r9,  LR_R9_OFFSET(%rsp)
-	movq %rcx, LR_RCX_OFFSET(%rsp)
-	movq %rsi, LR_RSI_OFFSET(%rsp)
-	movq %rdi, LR_RDI_OFFSET(%rsp)
-	movq %rbp, LR_RBP_OFFSET(%rsp)
-
-	leaq 48(%rbx), %rax
-	movq %rax, LR_RSP_OFFSET(%rsp)
-
-	/* We always store the XMM registers even if AVX is available.
-	   This is to provide backward binary compatibility for existing
-	   audit modules.  */
-	movaps %xmm0,		   (LR_XMM_OFFSET)(%rsp)
-	movaps %xmm1, (LR_XMM_OFFSET +   XMM_SIZE)(%rsp)
-	movaps %xmm2, (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp)
-	movaps %xmm3, (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp)
-	movaps %xmm4, (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp)
-	movaps %xmm5, (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp)
-	movaps %xmm6, (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp)
-	movaps %xmm7, (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp)
-
-# ifndef __ILP32__
-#  ifdef HAVE_MPX_SUPPORT
-	bndmov %bnd0, 		   (LR_BND_OFFSET)(%rsp)  # Preserve bound
-	bndmov %bnd1, (LR_BND_OFFSET +   BND_SIZE)(%rsp)  # registers. Nops if
-	bndmov %bnd2, (LR_BND_OFFSET + BND_SIZE*2)(%rsp)  # MPX not available
-	bndmov %bnd3, (LR_BND_OFFSET + BND_SIZE*3)(%rsp)  # or disabled.
-#  else
-	.byte 0x66,0x0f,0x1b,0x84,0x24;.long (LR_BND_OFFSET)
-	.byte 0x66,0x0f,0x1b,0x8c,0x24;.long (LR_BND_OFFSET + BND_SIZE)
-	.byte 0x66,0x0f,0x1b,0x94,0x24;.long (LR_BND_OFFSET + BND_SIZE*2)
-	.byte 0x66,0x0f,0x1b,0x9c,0x24;.long (LR_BND_OFFSET + BND_SIZE*3)
-#  endif
-# endif
-
-# if defined HAVE_AVX_SUPPORT || defined HAVE_AVX512_ASM_SUPPORT
-	.data
-L(have_avx):
-	.zero 4
-	.size L(have_avx), 4
-	.previous
-
-	cmpl	$0, L(have_avx)(%rip)
-	jne	L(defined)
-	movq	%rbx, %r11		# Save rbx
-	movl	$1, %eax
-	cpuid
-	movq	%r11,%rbx		# Restore rbx
-	xorl	%eax, %eax
-	// AVX and XSAVE supported?
-	andl	$((1 << 28) | (1 << 27)), %ecx
-	cmpl	$((1 << 28) | (1 << 27)), %ecx
-	jne	10f
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-	// AVX512 supported in processor?
-	movq	%rbx, %r11		# Save rbx
-	xorl	%ecx, %ecx
-	mov	$0x7, %eax
-	cpuid
-	andl	$(1 << 16), %ebx
-#  endif
-	xorl	%ecx, %ecx
-	// Get XFEATURE_ENABLED_MASK
-	xgetbv
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-	test	%ebx, %ebx
-	movq	%r11, %rbx		# Restore rbx
-	je	20f
-	// Verify that XCR0[7:5] = '111b' and
-	// XCR0[2:1] = '11b' which means
-	// that zmm state is enabled
-	andl	$0xe6, %eax
-	cmpl	$0xe6, %eax
-	jne	20f
-	movl	%eax, L(have_avx)(%rip)
-L(avx512):
-#   define RESTORE_AVX
-#   define VMOV    vmovdqu64
-#   define VEC(i)  zmm##i
-#   define MORE_CODE
-#   include "dl-trampoline.h"
-#   undef VMOV
-#   undef VEC
-#   undef RESTORE_AVX
-#  endif
-20:	andl	$0x6, %eax
-10:	subl	$0x5, %eax
-	movl	%eax, L(have_avx)(%rip)
-	cmpl	$0, %eax
-
-L(defined):
-	js	L(no_avx)
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-	cmpl	$0xe6, L(have_avx)(%rip)
-	je	L(avx512)
-#  endif
-
-#  define RESTORE_AVX
-#  define VMOV    vmovdqu
-#  define VEC(i)  ymm##i
-#  define MORE_CODE
-#  include "dl-trampoline.h"
-
-	.align 16
-L(no_avx):
+#  define VMOV			vmovdqu64
 # endif
-
-# undef RESTORE_AVX
+# define VEC(i)			zmm##i
+# define _dl_runtime_resolve	_dl_runtime_resolve_avx512
+# define _dl_runtime_profile	_dl_runtime_profile_avx512
 # include "dl-trampoline.h"
-
-	cfi_endproc
-	.size _dl_runtime_profile, .-_dl_runtime_profile
+# undef _dl_runtime_resolve
+# undef _dl_runtime_profile
+# undef VEC
+# undef VMOV
+# undef VMOVA
+# undef VEC_SIZE
+#else
+strong_alias (_dl_runtime_resolve_avx, _dl_runtime_resolve_avx512)
+	.hidden _dl_runtime_resolve_avx512
+strong_alias (_dl_runtime_profile_avx, _dl_runtime_profile_avx512)
+	.hidden _dl_runtime_profile_avx512
 #endif
 
-
-#ifdef SHARED
-	.globl _dl_x86_64_save_sse
-	.type _dl_x86_64_save_sse, @function
-	.align 16
-	cfi_startproc
-_dl_x86_64_save_sse:
-# if defined HAVE_AVX_SUPPORT || defined HAVE_AVX512_ASM_SUPPORT
-	cmpl	$0, L(have_avx)(%rip)
-	jne	L(defined_5)
-	movq	%rbx, %r11		# Save rbx
-	movl	$1, %eax
-	cpuid
-	movq	%r11,%rbx		# Restore rbx
-	xorl	%eax, %eax
-	// AVX and XSAVE supported?
-	andl	$((1 << 28) | (1 << 27)), %ecx
-	cmpl	$((1 << 28) | (1 << 27)), %ecx
-	jne	1f
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-	// AVX512 supported in a processor?
-	movq	%rbx, %r11              # Save rbx
-	xorl	%ecx,%ecx
-	mov	$0x7,%eax
-	cpuid
-	andl	$(1 << 16), %ebx
-#  endif
-	xorl	%ecx, %ecx
-	// Get XFEATURE_ENABLED_MASK
-	xgetbv
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-	test	%ebx, %ebx
-	movq	%r11, %rbx		# Restore rbx
-	je	2f
-	// Verify that XCR0[7:5] = '111b' and
-	// XCR0[2:1] = '11b' which means
-	// that zmm state is enabled
-	andl	$0xe6, %eax
-	movl	%eax, L(have_avx)(%rip)
-	cmpl	$0xe6, %eax
-	je	L(avx512_5)
-#  endif
-
-2:	andl	$0x6, %eax
-1:	subl	$0x5, %eax
-	movl	%eax, L(have_avx)(%rip)
-	cmpl	$0, %eax
-
-L(defined_5):
-	js	L(no_avx5)
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-	cmpl	$0xe6, L(have_avx)(%rip)
-	je	L(avx512_5)
-#  endif
-
-	vmovdqa %ymm0, %fs:RTLD_SAVESPACE_SSE+0*YMM_SIZE
-	vmovdqa %ymm1, %fs:RTLD_SAVESPACE_SSE+1*YMM_SIZE
-	vmovdqa %ymm2, %fs:RTLD_SAVESPACE_SSE+2*YMM_SIZE
-	vmovdqa %ymm3, %fs:RTLD_SAVESPACE_SSE+3*YMM_SIZE
-	vmovdqa %ymm4, %fs:RTLD_SAVESPACE_SSE+4*YMM_SIZE
-	vmovdqa %ymm5, %fs:RTLD_SAVESPACE_SSE+5*YMM_SIZE
-	vmovdqa %ymm6, %fs:RTLD_SAVESPACE_SSE+6*YMM_SIZE
-	vmovdqa %ymm7, %fs:RTLD_SAVESPACE_SSE+7*YMM_SIZE
-	ret
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-L(avx512_5):
-	vmovdqu64 %zmm0, %fs:RTLD_SAVESPACE_SSE+0*ZMM_SIZE
-	vmovdqu64 %zmm1, %fs:RTLD_SAVESPACE_SSE+1*ZMM_SIZE
-	vmovdqu64 %zmm2, %fs:RTLD_SAVESPACE_SSE+2*ZMM_SIZE
-	vmovdqu64 %zmm3, %fs:RTLD_SAVESPACE_SSE+3*ZMM_SIZE
-	vmovdqu64 %zmm4, %fs:RTLD_SAVESPACE_SSE+4*ZMM_SIZE
-	vmovdqu64 %zmm5, %fs:RTLD_SAVESPACE_SSE+5*ZMM_SIZE
-	vmovdqu64 %zmm6, %fs:RTLD_SAVESPACE_SSE+6*ZMM_SIZE
-	vmovdqu64 %zmm7, %fs:RTLD_SAVESPACE_SSE+7*ZMM_SIZE
-	ret
-#  endif
-L(no_avx5):
-# endif
-	movdqa	%xmm0, %fs:RTLD_SAVESPACE_SSE+0*XMM_SIZE
-	movdqa	%xmm1, %fs:RTLD_SAVESPACE_SSE+1*XMM_SIZE
-	movdqa	%xmm2, %fs:RTLD_SAVESPACE_SSE+2*XMM_SIZE
-	movdqa	%xmm3, %fs:RTLD_SAVESPACE_SSE+3*XMM_SIZE
-	movdqa	%xmm4, %fs:RTLD_SAVESPACE_SSE+4*XMM_SIZE
-	movdqa	%xmm5, %fs:RTLD_SAVESPACE_SSE+5*XMM_SIZE
-	movdqa	%xmm6, %fs:RTLD_SAVESPACE_SSE+6*XMM_SIZE
-	movdqa	%xmm7, %fs:RTLD_SAVESPACE_SSE+7*XMM_SIZE
-	ret
-	cfi_endproc
-	.size _dl_x86_64_save_sse, .-_dl_x86_64_save_sse
-
-
-	.globl _dl_x86_64_restore_sse
-	.type _dl_x86_64_restore_sse, @function
-	.align 16
-	cfi_startproc
-_dl_x86_64_restore_sse:
-# if defined HAVE_AVX_SUPPORT || defined HAVE_AVX512_ASM_SUPPORT
-	cmpl	$0, L(have_avx)(%rip)
-	js	L(no_avx6)
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-	cmpl	$0xe6, L(have_avx)(%rip)
-	je	L(avx512_6)
-#  endif
-
-	vmovdqa %fs:RTLD_SAVESPACE_SSE+0*YMM_SIZE, %ymm0
-	vmovdqa %fs:RTLD_SAVESPACE_SSE+1*YMM_SIZE, %ymm1
-	vmovdqa %fs:RTLD_SAVESPACE_SSE+2*YMM_SIZE, %ymm2
-	vmovdqa %fs:RTLD_SAVESPACE_SSE+3*YMM_SIZE, %ymm3
-	vmovdqa %fs:RTLD_SAVESPACE_SSE+4*YMM_SIZE, %ymm4
-	vmovdqa %fs:RTLD_SAVESPACE_SSE+5*YMM_SIZE, %ymm5
-	vmovdqa %fs:RTLD_SAVESPACE_SSE+6*YMM_SIZE, %ymm6
-	vmovdqa %fs:RTLD_SAVESPACE_SSE+7*YMM_SIZE, %ymm7
-	ret
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-L(avx512_6):
-	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+0*ZMM_SIZE, %zmm0
-	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+1*ZMM_SIZE, %zmm1
-	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+2*ZMM_SIZE, %zmm2
-	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+3*ZMM_SIZE, %zmm3
-	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+4*ZMM_SIZE, %zmm4
-	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+5*ZMM_SIZE, %zmm5
-	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+6*ZMM_SIZE, %zmm6
-	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+7*ZMM_SIZE, %zmm7
-	ret
-#  endif
-L(no_avx6):
-# endif
-	movdqa	%fs:RTLD_SAVESPACE_SSE+0*XMM_SIZE, %xmm0
-	movdqa	%fs:RTLD_SAVESPACE_SSE+1*XMM_SIZE, %xmm1
-	movdqa	%fs:RTLD_SAVESPACE_SSE+2*XMM_SIZE, %xmm2
-	movdqa	%fs:RTLD_SAVESPACE_SSE+3*XMM_SIZE, %xmm3
-	movdqa	%fs:RTLD_SAVESPACE_SSE+4*XMM_SIZE, %xmm4
-	movdqa	%fs:RTLD_SAVESPACE_SSE+5*XMM_SIZE, %xmm5
-	movdqa	%fs:RTLD_SAVESPACE_SSE+6*XMM_SIZE, %xmm6
-	movdqa	%fs:RTLD_SAVESPACE_SSE+7*XMM_SIZE, %xmm7
-	ret
-	cfi_endproc
-	.size _dl_x86_64_restore_sse, .-_dl_x86_64_restore_sse
+#define VEC_SIZE		32
+#define VMOVA			vmovdqa
+#if DL_RUNIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT
+# define VMOV			vmovdqa
+#else
+# define VMOV			vmovdqu
+#endif
+#define VEC(i)			ymm##i
+#define _dl_runtime_resolve	_dl_runtime_resolve_avx
+#define _dl_runtime_profile	_dl_runtime_profile_avx
+#include "dl-trampoline.h"
+#undef _dl_runtime_resolve
+#undef _dl_runtime_profile
+#undef VEC
+#undef VMOV
+#undef VMOVA
+#undef VEC_SIZE
+
+/* movaps/movups is 1-byte shorter.  */
+#define VEC_SIZE		16
+#define VMOVA			movaps
+#if DL_RUNIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT
+# define VMOV			movaps
+#else
+# define VMOV			movups
 #endif
+#define VEC(i)			xmm##i
+#define _dl_runtime_resolve	_dl_runtime_resolve_sse
+#define _dl_runtime_profile	_dl_runtime_profile_sse
+#undef RESTORE_AVX
+#include "dl-trampoline.h"
diff --git a/sysdeps/x86_64/dl-trampoline.h b/sysdeps/x86_64/dl-trampoline.h
index d542428ac2..f4191833ab 100644
--- a/sysdeps/x86_64/dl-trampoline.h
+++ b/sysdeps/x86_64/dl-trampoline.h
@@ -1,6 +1,5 @@
-/* Partial PLT profile trampoline to save and restore x86-64 vector
-   registers.
-   Copyright (C) 2009-2015 Free Software Foundation, Inc.
+/* PLT trampolines.  x86-64 version.
+   Copyright (C) 2009-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -17,16 +16,248 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#ifdef RESTORE_AVX
+#undef REGISTER_SAVE_AREA_RAW
+#ifdef __ILP32__
+/* X32 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as VEC0 to
+   VEC7.  */
+# define REGISTER_SAVE_AREA_RAW	(8 * 7 + VEC_SIZE * 8)
+#else
+/* X86-64 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as
+   BND0, BND1, BND2, BND3 and VEC0 to VEC7. */
+# define REGISTER_SAVE_AREA_RAW	(8 * 7 + 16 * 4 + VEC_SIZE * 8)
+#endif
+
+#undef REGISTER_SAVE_AREA
+#undef LOCAL_STORAGE_AREA
+#undef BASE
+#if DL_RUNIME_RESOLVE_REALIGN_STACK
+# define REGISTER_SAVE_AREA	(REGISTER_SAVE_AREA_RAW + 8)
+/* Local stack area before jumping to function address: RBX.  */
+# define LOCAL_STORAGE_AREA	8
+# define BASE			rbx
+# if (REGISTER_SAVE_AREA % VEC_SIZE) != 0
+#  error REGISTER_SAVE_AREA must be multples of VEC_SIZE
+# endif
+#else
+# define REGISTER_SAVE_AREA	REGISTER_SAVE_AREA_RAW
+/* Local stack area before jumping to function address:  All saved
+   registers.  */
+# define LOCAL_STORAGE_AREA	REGISTER_SAVE_AREA
+# define BASE			rsp
+# if (REGISTER_SAVE_AREA % 16) != 8
+#  error REGISTER_SAVE_AREA must be odd multples of 8
+# endif
+#endif
+
+	.text
+	.globl _dl_runtime_resolve
+	.hidden _dl_runtime_resolve
+	.type _dl_runtime_resolve, @function
+	.align 16
+	cfi_startproc
+_dl_runtime_resolve:
+	cfi_adjust_cfa_offset(16) # Incorporate PLT
+#if DL_RUNIME_RESOLVE_REALIGN_STACK
+# if LOCAL_STORAGE_AREA != 8
+#  error LOCAL_STORAGE_AREA must be 8
+# endif
+	pushq %rbx			# push subtracts stack by 8.
+	cfi_adjust_cfa_offset(8)
+	cfi_rel_offset(%rbx, 0)
+	mov %RSP_LP, %RBX_LP
+	cfi_def_cfa_register(%rbx)
+	and $-VEC_SIZE, %RSP_LP
+#endif
+	sub $REGISTER_SAVE_AREA, %RSP_LP
+	cfi_adjust_cfa_offset(REGISTER_SAVE_AREA)
+	# Preserve registers otherwise clobbered.
+	movq %rax, REGISTER_SAVE_RAX(%rsp)
+	movq %rcx, REGISTER_SAVE_RCX(%rsp)
+	movq %rdx, REGISTER_SAVE_RDX(%rsp)
+	movq %rsi, REGISTER_SAVE_RSI(%rsp)
+	movq %rdi, REGISTER_SAVE_RDI(%rsp)
+	movq %r8, REGISTER_SAVE_R8(%rsp)
+	movq %r9, REGISTER_SAVE_R9(%rsp)
+	VMOV %VEC(0), (REGISTER_SAVE_VEC_OFF)(%rsp)
+	VMOV %VEC(1), (REGISTER_SAVE_VEC_OFF + VEC_SIZE)(%rsp)
+	VMOV %VEC(2), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 2)(%rsp)
+	VMOV %VEC(3), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 3)(%rsp)
+	VMOV %VEC(4), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 4)(%rsp)
+	VMOV %VEC(5), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 5)(%rsp)
+	VMOV %VEC(6), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 6)(%rsp)
+	VMOV %VEC(7), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 7)(%rsp)
+#ifndef __ILP32__
+	# We also have to preserve bound registers.  These are nops if
+	# Intel MPX isn't available or disabled.
+# ifdef HAVE_MPX_SUPPORT
+	bndmov %bnd0, REGISTER_SAVE_BND0(%rsp)
+	bndmov %bnd1, REGISTER_SAVE_BND1(%rsp)
+	bndmov %bnd2, REGISTER_SAVE_BND2(%rsp)
+	bndmov %bnd3, REGISTER_SAVE_BND3(%rsp)
+# else
+#  if REGISTER_SAVE_BND0 == 0
+	.byte 0x66,0x0f,0x1b,0x04,0x24
+#  else
+	.byte 0x66,0x0f,0x1b,0x44,0x24,REGISTER_SAVE_BND0
+#  endif
+	.byte 0x66,0x0f,0x1b,0x4c,0x24,REGISTER_SAVE_BND1
+	.byte 0x66,0x0f,0x1b,0x54,0x24,REGISTER_SAVE_BND2
+	.byte 0x66,0x0f,0x1b,0x5c,0x24,REGISTER_SAVE_BND3
+# endif
+#endif
+	# Copy args pushed by PLT in register.
+	# %rdi: link_map, %rsi: reloc_index
+	mov (LOCAL_STORAGE_AREA + 8)(%BASE), %RSI_LP
+	mov LOCAL_STORAGE_AREA(%BASE), %RDI_LP
+	call _dl_fixup		# Call resolver.
+	mov %RAX_LP, %R11_LP	# Save return value
+#ifndef __ILP32__
+	# Restore bound registers.  These are nops if Intel MPX isn't
+	# avaiable or disabled.
+# ifdef HAVE_MPX_SUPPORT
+	bndmov REGISTER_SAVE_BND3(%rsp), %bnd3
+	bndmov REGISTER_SAVE_BND2(%rsp), %bnd2
+	bndmov REGISTER_SAVE_BND1(%rsp), %bnd1
+	bndmov REGISTER_SAVE_BND0(%rsp), %bnd0
+# else
+	.byte 0x66,0x0f,0x1a,0x5c,0x24,REGISTER_SAVE_BND3
+	.byte 0x66,0x0f,0x1a,0x54,0x24,REGISTER_SAVE_BND2
+	.byte 0x66,0x0f,0x1a,0x4c,0x24,REGISTER_SAVE_BND1
+#  if REGISTER_SAVE_BND0 == 0
+	.byte 0x66,0x0f,0x1a,0x04,0x24
+#  else
+	.byte 0x66,0x0f,0x1a,0x44,0x24,REGISTER_SAVE_BND0
+#  endif
+# endif
+#endif
+	# Get register content back.
+	movq REGISTER_SAVE_R9(%rsp), %r9
+	movq REGISTER_SAVE_R8(%rsp), %r8
+	movq REGISTER_SAVE_RDI(%rsp), %rdi
+	movq REGISTER_SAVE_RSI(%rsp), %rsi
+	movq REGISTER_SAVE_RDX(%rsp), %rdx
+	movq REGISTER_SAVE_RCX(%rsp), %rcx
+	movq REGISTER_SAVE_RAX(%rsp), %rax
+	VMOV (REGISTER_SAVE_VEC_OFF)(%rsp), %VEC(0)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE)(%rsp), %VEC(1)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 2)(%rsp), %VEC(2)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 3)(%rsp), %VEC(3)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 4)(%rsp), %VEC(4)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 5)(%rsp), %VEC(5)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 6)(%rsp), %VEC(6)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 7)(%rsp), %VEC(7)
+#if DL_RUNIME_RESOLVE_REALIGN_STACK
+	mov %RBX_LP, %RSP_LP
+	cfi_def_cfa_register(%rsp)
+	movq (%rsp), %rbx
+	cfi_restore(%rbx)
+#endif
+	# Adjust stack(PLT did 2 pushes)
+	add $(LOCAL_STORAGE_AREA + 16), %RSP_LP
+	cfi_adjust_cfa_offset(-(LOCAL_STORAGE_AREA + 16))
+	# Preserve bound registers.
+	PRESERVE_BND_REGS_PREFIX
+	jmp *%r11		# Jump to function address.
+	cfi_endproc
+	.size _dl_runtime_resolve, .-_dl_runtime_resolve
+
+
+#ifndef PROF
+# if (LR_VECTOR_OFFSET % VEC_SIZE) != 0
+#  error LR_VECTOR_OFFSET must be multples of VEC_SIZE
+# endif
+
+	.globl _dl_runtime_profile
+	.hidden _dl_runtime_profile
+	.type _dl_runtime_profile, @function
+	.align 16
+_dl_runtime_profile:
+	cfi_startproc
+	cfi_adjust_cfa_offset(16) # Incorporate PLT
+	/* The La_x86_64_regs data structure pointed to by the
+	   fourth paramater must be VEC_SIZE-byte aligned.  This must
+	   be explicitly enforced.  We have the set up a dynamically
+	   sized stack frame.  %rbx points to the top half which
+	   has a fixed size and preserves the original stack pointer.  */
+
+	sub $32, %RSP_LP	# Allocate the local storage.
+	cfi_adjust_cfa_offset(32)
+	movq %rbx, (%rsp)
+	cfi_rel_offset(%rbx, 0)
+
+	/* On the stack:
+		56(%rbx)	parameter #1
+		48(%rbx)	return address
+
+		40(%rbx)	reloc index
+		32(%rbx)	link_map
+
+		24(%rbx)	La_x86_64_regs pointer
+		16(%rbx)	framesize
+		 8(%rbx)	rax
+		  (%rbx)	rbx
+	*/
+
+	movq %rax, 8(%rsp)
+	mov %RSP_LP, %RBX_LP
+	cfi_def_cfa_register(%rbx)
+
+	/* Actively align the La_x86_64_regs structure.  */
+	and $-VEC_SIZE, %RSP_LP
+	/* sizeof(La_x86_64_regs).  Need extra space for 8 SSE registers
+	   to detect if any xmm0-xmm7 registers are changed by audit
+	   module.  */
+	sub $(LR_SIZE + XMM_SIZE*8), %RSP_LP
+	movq %rsp, 24(%rbx)
+
+	/* Fill the La_x86_64_regs structure.  */
+	movq %rdx, LR_RDX_OFFSET(%rsp)
+	movq %r8,  LR_R8_OFFSET(%rsp)
+	movq %r9,  LR_R9_OFFSET(%rsp)
+	movq %rcx, LR_RCX_OFFSET(%rsp)
+	movq %rsi, LR_RSI_OFFSET(%rsp)
+	movq %rdi, LR_RDI_OFFSET(%rsp)
+	movq %rbp, LR_RBP_OFFSET(%rsp)
+
+	lea 48(%rbx), %RAX_LP
+	movq %rax, LR_RSP_OFFSET(%rsp)
+
+	/* We always store the XMM registers even if AVX is available.
+	   This is to provide backward binary compatibility for existing
+	   audit modules.  */
+	movaps %xmm0,		   (LR_XMM_OFFSET)(%rsp)
+	movaps %xmm1, (LR_XMM_OFFSET +   XMM_SIZE)(%rsp)
+	movaps %xmm2, (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp)
+	movaps %xmm3, (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp)
+	movaps %xmm4, (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp)
+	movaps %xmm5, (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp)
+	movaps %xmm6, (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp)
+	movaps %xmm7, (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp)
+
+# ifndef __ILP32__
+#  ifdef HAVE_MPX_SUPPORT
+	bndmov %bnd0, 		   (LR_BND_OFFSET)(%rsp)  # Preserve bound
+	bndmov %bnd1, (LR_BND_OFFSET +   BND_SIZE)(%rsp)  # registers. Nops if
+	bndmov %bnd2, (LR_BND_OFFSET + BND_SIZE*2)(%rsp)  # MPX not available
+	bndmov %bnd3, (LR_BND_OFFSET + BND_SIZE*3)(%rsp)  # or disabled.
+#  else
+	.byte 0x66,0x0f,0x1b,0x84,0x24;.long (LR_BND_OFFSET)
+	.byte 0x66,0x0f,0x1b,0x8c,0x24;.long (LR_BND_OFFSET + BND_SIZE)
+	.byte 0x66,0x0f,0x1b,0x94,0x24;.long (LR_BND_OFFSET + BND_SIZE*2)
+	.byte 0x66,0x0f,0x1b,0x9c,0x24;.long (LR_BND_OFFSET + BND_SIZE*3)
+#  endif
+# endif
+
+# ifdef RESTORE_AVX
 	/* This is to support AVX audit modules.  */
-	VMOV %VEC(0),		      (LR_VECTOR_OFFSET)(%rsp)
-	VMOV %VEC(1), (LR_VECTOR_OFFSET +   VECTOR_SIZE)(%rsp)
-	VMOV %VEC(2), (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp)
-	VMOV %VEC(3), (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp)
-	VMOV %VEC(4), (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp)
-	VMOV %VEC(5), (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp)
-	VMOV %VEC(6), (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp)
-	VMOV %VEC(7), (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp)
+	VMOVA %VEC(0),		      (LR_VECTOR_OFFSET)(%rsp)
+	VMOVA %VEC(1), (LR_VECTOR_OFFSET +   VECTOR_SIZE)(%rsp)
+	VMOVA %VEC(2), (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp)
+	VMOVA %VEC(3), (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp)
+	VMOVA %VEC(4), (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp)
+	VMOVA %VEC(5), (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp)
+	VMOVA %VEC(6), (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp)
+	VMOVA %VEC(7), (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp)
 
 	/* Save xmm0-xmm7 registers to detect if any of them are
 	   changed by audit module.  */
@@ -38,7 +269,7 @@
 	vmovdqa %xmm5, (LR_SIZE + XMM_SIZE*5)(%rsp)
 	vmovdqa %xmm6, (LR_SIZE + XMM_SIZE*6)(%rsp)
 	vmovdqa %xmm7, (LR_SIZE + XMM_SIZE*7)(%rsp)
-#endif
+# endif
 
 	mov %RSP_LP, %RCX_LP	# La_x86_64_regs pointer to %rcx.
 	mov 48(%rbx), %RDX_LP	# Load return address if needed.
@@ -63,7 +294,7 @@
 	movaps (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm6
 	movaps (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm7
 
-#ifdef RESTORE_AVX
+# ifdef RESTORE_AVX
 	/* Check if any xmm0-xmm7 registers are changed by audit
 	   module.  */
 	vpcmpeqq (LR_SIZE)(%rsp), %xmm0, %xmm8
@@ -72,7 +303,7 @@
 	je 2f
 	vmovdqa	%xmm0, (LR_VECTOR_OFFSET)(%rsp)
 	jmp 1f
-2:	VMOV (LR_VECTOR_OFFSET)(%rsp), %VEC(0)
+2:	VMOVA (LR_VECTOR_OFFSET)(%rsp), %VEC(0)
 	vmovdqa	%xmm0, (LR_XMM_OFFSET)(%rsp)
 
 1:	vpcmpeqq (LR_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm8
@@ -81,7 +312,7 @@
 	je 2f
 	vmovdqa	%xmm1, (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp)
 	jmp 1f
-2:	VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp), %VEC(1)
+2:	VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp), %VEC(1)
 	vmovdqa	%xmm1, (LR_XMM_OFFSET + XMM_SIZE)(%rsp)
 
 1:	vpcmpeqq (LR_SIZE + XMM_SIZE*2)(%rsp), %xmm2, %xmm8
@@ -90,7 +321,7 @@
 	je 2f
 	vmovdqa	%xmm2, (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp)
 	jmp 1f
-2:	VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp), %VEC(2)
+2:	VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp), %VEC(2)
 	vmovdqa	%xmm2, (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp)
 
 1:	vpcmpeqq (LR_SIZE + XMM_SIZE*3)(%rsp), %xmm3, %xmm8
@@ -99,7 +330,7 @@
 	je 2f
 	vmovdqa	%xmm3, (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp)
 	jmp 1f
-2:	VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp), %VEC(3)
+2:	VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp), %VEC(3)
 	vmovdqa	%xmm3, (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp)
 
 1:	vpcmpeqq (LR_SIZE + XMM_SIZE*4)(%rsp), %xmm4, %xmm8
@@ -108,7 +339,7 @@
 	je 2f
 	vmovdqa	%xmm4, (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp)
 	jmp 1f
-2:	VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp), %VEC(4)
+2:	VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp), %VEC(4)
 	vmovdqa	%xmm4, (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp)
 
 1:	vpcmpeqq (LR_SIZE + XMM_SIZE*5)(%rsp), %xmm5, %xmm8
@@ -117,7 +348,7 @@
 	je 2f
 	vmovdqa	%xmm5, (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp)
 	jmp 1f
-2:	VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp), %VEC(5)
+2:	VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp), %VEC(5)
 	vmovdqa	%xmm5, (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp)
 
 1:	vpcmpeqq (LR_SIZE + XMM_SIZE*6)(%rsp), %xmm6, %xmm8
@@ -126,7 +357,7 @@
 	je 2f
 	vmovdqa	%xmm6, (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp)
 	jmp 1f
-2:	VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp), %VEC(6)
+2:	VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp), %VEC(6)
 	vmovdqa	%xmm6, (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp)
 
 1:	vpcmpeqq (LR_SIZE + XMM_SIZE*7)(%rsp), %xmm7, %xmm8
@@ -135,25 +366,25 @@
 	je 2f
 	vmovdqa	%xmm7, (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp)
 	jmp 1f
-2:	VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp), %VEC(7)
+2:	VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp), %VEC(7)
 	vmovdqa	%xmm7, (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp)
 
 1:
-#endif
+# endif
 
-#ifndef __ILP32__
-# ifdef HAVE_MPX_SUPPORT
+# ifndef __ILP32__
+#  ifdef HAVE_MPX_SUPPORT
 	bndmov              (LR_BND_OFFSET)(%rsp), %bnd0  # Restore bound
 	bndmov (LR_BND_OFFSET +   BND_SIZE)(%rsp), %bnd1  # registers.
 	bndmov (LR_BND_OFFSET + BND_SIZE*2)(%rsp), %bnd2
 	bndmov (LR_BND_OFFSET + BND_SIZE*3)(%rsp), %bnd3
-# else
+#  else
 	.byte 0x66,0x0f,0x1a,0x84,0x24;.long (LR_BND_OFFSET)
 	.byte 0x66,0x0f,0x1a,0x8c,0x24;.long (LR_BND_OFFSET + BND_SIZE)
 	.byte 0x66,0x0f,0x1a,0x94,0x24;.long (LR_BND_OFFSET + BND_SIZE*2)
 	.byte 0x66,0x0f,0x1a,0x9c,0x24;.long (LR_BND_OFFSET + BND_SIZE*3)
+#  endif
 # endif
-#endif
 
 	mov  16(%rbx), %R10_LP	# Anything in framesize?
 	test %R10_LP, %R10_LP
@@ -168,12 +399,12 @@
 	movq LR_RSI_OFFSET(%rsp), %rsi
 	movq LR_RDI_OFFSET(%rsp), %rdi
 
-	movq %rbx, %rsp
+	mov %RBX_LP, %RSP_LP
 	movq (%rsp), %rbx
-	cfi_restore(rbx)
+	cfi_restore(%rbx)
 	cfi_def_cfa_register(%rsp)
 
-	addq $48, %rsp		# Adjust the stack to the return value
+	add $48, %RSP_LP	# Adjust the stack to the return value
 				# (eats the reloc index and link_map)
 	cfi_adjust_cfa_offset(-48)
 	PRESERVE_BND_REGS_PREFIX
@@ -189,13 +420,13 @@
 	   temporary buffer of the size specified by the 'framesize'
 	   returned from _dl_profile_fixup */
 
-	leaq LR_RSP_OFFSET(%rbx), %rsi	# stack
-	addq $8, %r10
-	andq $0xfffffffffffffff0, %r10
-	movq %r10, %rcx
-	subq %r10, %rsp
-	movq %rsp, %rdi
-	shrq $3, %rcx
+	lea LR_RSP_OFFSET(%rbx), %RSI_LP # stack
+	add $8, %R10_LP
+	and $-16, %R10_LP
+	mov %R10_LP, %RCX_LP
+	sub %R10_LP, %RSP_LP
+	mov %RSP_LP, %RDI_LP
+	shr $3, %RCX_LP
 	rep
 	movsq
 
@@ -206,21 +437,21 @@
 	PRESERVE_BND_REGS_PREFIX
 	call *%r11
 
-	mov 24(%rbx), %rsp	# Drop the copied stack content
+	mov 24(%rbx), %RSP_LP	# Drop the copied stack content
 
 	/* Now we have to prepare the La_x86_64_retval structure for the
 	   _dl_call_pltexit.  The La_x86_64_regs is being pointed by rsp now,
 	   so we just need to allocate the sizeof(La_x86_64_retval) space on
 	   the stack, since the alignment has already been taken care of. */
-#ifdef RESTORE_AVX
+# ifdef RESTORE_AVX
 	/* sizeof(La_x86_64_retval).  Need extra space for 2 SSE
 	   registers to detect if xmm0/xmm1 registers are changed
 	   by audit module.  */
-	subq $(LRV_SIZE + XMM_SIZE*2), %rsp
-#else
-	subq $LRV_SIZE, %rsp	# sizeof(La_x86_64_retval)
-#endif
-	movq %rsp, %rcx		# La_x86_64_retval argument to %rcx.
+	sub $(LRV_SIZE + XMM_SIZE*2), %RSP_LP
+# else
+	sub $LRV_SIZE, %RSP_LP	# sizeof(La_x86_64_retval)
+# endif
+	mov %RSP_LP, %RCX_LP	# La_x86_64_retval argument to %rcx.
 
 	/* Fill in the La_x86_64_retval structure.  */
 	movq %rax, LRV_RAX_OFFSET(%rcx)
@@ -229,26 +460,26 @@
 	movaps %xmm0, LRV_XMM0_OFFSET(%rcx)
 	movaps %xmm1, LRV_XMM1_OFFSET(%rcx)
 
-#ifdef RESTORE_AVX
+# ifdef RESTORE_AVX
 	/* This is to support AVX audit modules.  */
-	VMOV %VEC(0), LRV_VECTOR0_OFFSET(%rcx)
-	VMOV %VEC(1), LRV_VECTOR1_OFFSET(%rcx)
+	VMOVA %VEC(0), LRV_VECTOR0_OFFSET(%rcx)
+	VMOVA %VEC(1), LRV_VECTOR1_OFFSET(%rcx)
 
 	/* Save xmm0/xmm1 registers to detect if they are changed
 	   by audit module.  */
 	vmovdqa %xmm0,		  (LRV_SIZE)(%rcx)
 	vmovdqa %xmm1, (LRV_SIZE + XMM_SIZE)(%rcx)
-#endif
+# endif
 
-#ifndef __ILP32__
-# ifdef HAVE_MPX_SUPPORT
+# ifndef __ILP32__
+#  ifdef HAVE_MPX_SUPPORT
 	bndmov %bnd0, LRV_BND0_OFFSET(%rcx)  # Preserve returned bounds.
 	bndmov %bnd1, LRV_BND1_OFFSET(%rcx)
-# else
+#  else
 	.byte  0x66,0x0f,0x1b,0x81;.long (LRV_BND0_OFFSET)
 	.byte  0x66,0x0f,0x1b,0x89;.long (LRV_BND1_OFFSET)
+#  endif
 # endif
-#endif
 
 	fstpt LRV_ST0_OFFSET(%rcx)
 	fstpt LRV_ST1_OFFSET(%rcx)
@@ -265,50 +496,47 @@
 	movaps LRV_XMM0_OFFSET(%rsp), %xmm0
 	movaps LRV_XMM1_OFFSET(%rsp), %xmm1
 
-#ifdef RESTORE_AVX
+# ifdef RESTORE_AVX
 	/* Check if xmm0/xmm1 registers are changed by audit module.  */
 	vpcmpeqq (LRV_SIZE)(%rsp), %xmm0, %xmm2
 	vpmovmskb %xmm2, %esi
 	cmpl $0xffff, %esi
 	jne 1f
-	VMOV LRV_VECTOR0_OFFSET(%rsp), %VEC(0)
+	VMOVA LRV_VECTOR0_OFFSET(%rsp), %VEC(0)
 
 1:	vpcmpeqq (LRV_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm2
 	vpmovmskb %xmm2, %esi
 	cmpl $0xffff, %esi
 	jne 1f
-	VMOV LRV_VECTOR1_OFFSET(%rsp), %VEC(1)
+	VMOVA LRV_VECTOR1_OFFSET(%rsp), %VEC(1)
 
 1:
-#endif
+# endif
 
-#ifndef __ILP32__
-# ifdef HAVE_MPX_SUPPORT
+# ifndef __ILP32__
+#  ifdef HAVE_MPX_SUPPORT
 	bndmov LRV_BND0_OFFSET(%rsp), %bnd0  # Restore bound registers.
 	bndmov LRV_BND1_OFFSET(%rsp), %bnd1
-# else
+#  else
 	.byte  0x66,0x0f,0x1a,0x84,0x24;.long (LRV_BND0_OFFSET)
 	.byte  0x66,0x0f,0x1a,0x8c,0x24;.long (LRV_BND1_OFFSET)
+#  endif
 # endif
-#endif
 
 	fldt LRV_ST1_OFFSET(%rsp)
 	fldt LRV_ST0_OFFSET(%rsp)
 
-	movq %rbx, %rsp
+	mov %RBX_LP, %RSP_LP
 	movq (%rsp), %rbx
-	cfi_restore(rbx)
+	cfi_restore(%rbx)
 	cfi_def_cfa_register(%rsp)
 
-	addq $48, %rsp		# Adjust the stack to the return value
+	add $48, %RSP_LP	# Adjust the stack to the return value
 				# (eats the reloc index and link_map)
 	cfi_adjust_cfa_offset(-48)
 	PRESERVE_BND_REGS_PREFIX
 	retq
 
-#ifdef MORE_CODE
-	cfi_adjust_cfa_offset(48)
-	cfi_rel_offset(%rbx, 0)
-	cfi_def_cfa_register(%rbx)
-# undef MORE_CODE
+	cfi_endproc
+	.size _dl_runtime_profile, .-_dl_runtime_profile
 #endif
diff --git a/sysdeps/x86_64/ffs.c b/sysdeps/x86_64/ffs.c
index 48feb4aba2..be5b6c8589 100644
--- a/sysdeps/x86_64/ffs.c
+++ b/sysdeps/x86_64/ffs.c
@@ -1,7 +1,7 @@
 /* ffs -- find first set bit in a word, counted from least significant end.
    For AMD x86-64.
    This file is part of the GNU C Library.
-   Copyright (C) 1991-2015 Free Software Foundation, Inc.
+   Copyright (C) 1991-2016 Free Software Foundation, Inc.
    Contributed by Ulrich Drepper <drepper@cygnus.com>.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/ffsll.c b/sysdeps/x86_64/ffsll.c
index 1c26679da7..c0f5abc446 100644
--- a/sysdeps/x86_64/ffsll.c
+++ b/sysdeps/x86_64/ffsll.c
@@ -1,7 +1,7 @@
 /* ffsll -- find first set bit in a word, counted from least significant end.
    For AMD x86-64.
    This file is part of the GNU C Library.
-   Copyright (C) 1991-2015 Free Software Foundation, Inc.
+   Copyright (C) 1991-2016 Free Software Foundation, Inc.
    Contributed by Ulrich Drepper <drepper@cygnus.com>.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/Makefile b/sysdeps/x86_64/fpu/Makefile
index 1ebe5118bf..88742faff1 100644
--- a/sysdeps/x86_64/fpu/Makefile
+++ b/sysdeps/x86_64/fpu/Makefile
@@ -20,7 +20,9 @@ libmvec-support += svml_d_cos2_core svml_d_cos4_core_avx \
 		   svml_d_pow_data svml_s_powf4_core svml_s_powf8_core_avx \
 		   svml_s_powf8_core svml_s_powf16_core svml_s_powf_data \
 		   svml_s_sincosf4_core svml_s_sincosf8_core_avx \
-		   svml_s_sincosf8_core svml_s_sincosf16_core init-arch
+		   svml_s_sincosf8_core svml_s_sincosf16_core svml_finite_alias
+
+libmvec-static-only-routines = svml_finite_alias
 endif
 
 # Variables for libmvec tests.
diff --git a/sysdeps/x86_64/fpu/e_exp2l.S b/sysdeps/x86_64/fpu/e_exp2l.S
index 7d42a932db..0e059b7565 100644
--- a/sysdeps/x86_64/fpu/e_exp2l.S
+++ b/sysdeps/x86_64/fpu/e_exp2l.S
@@ -6,7 +6,17 @@
  */
 
 #include <machine/asm.h>
+#include <x86_64-math-asm.h>
 
+DEFINE_LDBL_MIN
+
+#ifdef PIC
+# define MO(op) op##(%rip)
+#else
+# define MO(op) op
+#endif
+
+	.text
 ENTRY(__ieee754_exp2l)
 	fldt	8(%rsp)
 /* I added the following ugly construct because exp(+-Inf) resulted
@@ -36,6 +46,7 @@ ENTRY(__ieee754_exp2l)
 	faddp				/* 2^(fract(x)) */
 	fscale				/* e^x */
 	fstp	%st(1)
+	LDBL_CHECK_FORCE_UFLOW_NONNEG_NAN
 	ret
 
 1:	testl	$0x200, %eax		/* Test sign.  */
diff --git a/sysdeps/x86_64/fpu/e_expf.S b/sysdeps/x86_64/fpu/e_expf.S
index 34453ca409..d4b63a8d8e 100644
--- a/sysdeps/x86_64/fpu/e_expf.S
+++ b/sysdeps/x86_64/fpu/e_expf.S
@@ -1,5 +1,5 @@
 /* Optimized __ieee754_expf function.
-   Copyright (C) 2012-2015 Free Software Foundation, Inc.
+   Copyright (C) 2012-2016 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
 
diff --git a/sysdeps/x86_64/fpu/e_expl.S b/sysdeps/x86_64/fpu/e_expl.S
index 14dd29dcad..8b3ddaec59 100644
--- a/sysdeps/x86_64/fpu/e_expl.S
+++ b/sysdeps/x86_64/fpu/e_expl.S
@@ -23,6 +23,7 @@
  */
 
 #include <machine/asm.h>
+#include <x86_64-math-asm.h>
 
 #ifdef USE_AS_EXP10L
 # define IEEE754_EXPL __ieee754_exp10l
@@ -65,10 +66,7 @@ c1:	.byte 0x20, 0xfa, 0xee, 0xc2, 0x5f, 0x70, 0xa5, 0xec, 0xed, 0x3f
 csat:	.byte 0, 0, 0, 0, 0, 0, 0, 0x80, 0x0e, 0x40
 	.byte 0, 0, 0, 0, 0, 0
 	ASM_SIZE_DIRECTIVE(csat)
-	.type cmin,@object
-cmin:	.byte 0, 0, 0, 0, 0, 0, 0, 0x80, 0x1, 0
-	.byte 0, 0, 0, 0, 0, 0
-	ASM_SIZE_DIRECTIVE(cmin)
+DEFINE_LDBL_MIN
 #endif
 
 #ifdef PIC
@@ -192,17 +190,9 @@ ENTRY(IEEE754_EXPL)
 	fstp	%st(1)		/* 2  */
 	fscale			/* 2 scale factor is st(1); base^x */
 	fstp	%st(1)		/* 1  */
-	/* Ensure underflow for tiny result.  */
-	fldt	MO(cmin)	/* 2 cmin  */
-	fld	%st(1)		/* 3  */
-	fcomip	%st(1), %st	/* 2  */
-	fstp	%st		/* 1  */
-	jnc	6f
-	fld	%st
-	fmul	%st
-	fstp	%st
+	LDBL_CHECK_FORCE_UFLOW_NONNEG
 #endif
-6:	fstp	%st(1)		/* 0  */
+	fstp	%st(1)		/* 0  */
 	jmp	2f
 1:
 #ifdef USE_AS_EXPM1L
diff --git a/sysdeps/x86_64/fpu/e_log10l.S b/sysdeps/x86_64/fpu/e_log10l.S
index 2607ad199b..8fa61644c1 100644
--- a/sysdeps/x86_64/fpu/e_log10l.S
+++ b/sysdeps/x86_64/fpu/e_log10l.S
@@ -79,7 +79,13 @@ ENTRY(__log10l_finite)
 	fnstsw			// x-1 : x : log10(2)
 	andb	$0x45, %ah
 	jz	2b
-	fstp	%st(1)		// x-1 : log10(2)
+	fxam
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x40, %ah
+	jne	6f
+	fabs			// log10(1) is +0 in all rounding modes.
+6:	fstp	%st(1)		// x-1 : log10(2)
 	fyl2xp1			// log10(x)
 	ret
 END(__log10l_finite)
diff --git a/sysdeps/x86_64/fpu/e_log2l.S b/sysdeps/x86_64/fpu/e_log2l.S
index c12906d456..a063255ddd 100644
--- a/sysdeps/x86_64/fpu/e_log2l.S
+++ b/sysdeps/x86_64/fpu/e_log2l.S
@@ -78,7 +78,13 @@ ENTRY(__log2l_finite)
 	fnstsw			// x-1 : x : 1
 	andb	$0x45, %ah
 	jz	2b
-	fstp	%st(1)		// x-1 : 1
+	fxam
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x40, %ah
+	jne	6f
+	fabs			// log2(1) is +0 in all rounding modes.
+6:	fstp	%st(1)		// x-1 : 1
 	fyl2xp1			// log(x)
 	ret
 END (__log2l_finite)
diff --git a/sysdeps/x86_64/fpu/e_logl.S b/sysdeps/x86_64/fpu/e_logl.S
index 047b8db88a..dbe6fd59dc 100644
--- a/sysdeps/x86_64/fpu/e_logl.S
+++ b/sysdeps/x86_64/fpu/e_logl.S
@@ -81,7 +81,13 @@ ENTRY(__logl_finite)
 	fnstsw			// x-1 : x : log(2)
 	andb	$0x45, %ah
 	jz	2b
-	fstp	%st(1)		// x-1 : log(2)
+	fxam
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x40, %ah
+	jne	7f
+	fabs			// log(1) is +0 in all rounding modes.
+7:	fstp	%st(1)		// x-1 : log(2)
 	fyl2xp1			// log(x)
 	ret
 END (__logl_finite)
diff --git a/sysdeps/x86_64/fpu/e_powl.S b/sysdeps/x86_64/fpu/e_powl.S
index 358abb8dcb..1f68cf0102 100644
--- a/sysdeps/x86_64/fpu/e_powl.S
+++ b/sysdeps/x86_64/fpu/e_powl.S
@@ -1,5 +1,5 @@
 /* ix87 specific implementation of pow function.
-   Copyright (C) 1996-2015 Free Software Foundation, Inc.
+   Copyright (C) 1996-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
 
@@ -18,6 +18,7 @@
    <http://www.gnu.org/licenses/>.  */
 
 #include <machine/asm.h>
+#include <x86_64-math-asm.h>
 
 	.section .rodata.cst8,"aM",@progbits,8
 
@@ -59,6 +60,7 @@ minfinity:
 mzero:
 	.byte 0, 0, 0, 0, 0, 0, 0, 0x80
 	ASM_SIZE_DIRECTIVE(minf_mzero)
+DEFINE_LDBL_MIN
 
 #ifdef PIC
 # define MO(op) op##(%rip)
@@ -175,6 +177,7 @@ ENTRY(__ieee754_powl)
 	orl	%edx, %ecx
 	jnz	6b
 	fstp	%st(0)		// ST*x
+	LDBL_CHECK_FORCE_UFLOW_NONNAN
 	ret
 
 	/* y is �NAN */
diff --git a/sysdeps/x86_64/fpu/e_sqrt.c b/sysdeps/x86_64/fpu/e_sqrt.c
index b587f1cfb7..4b86434913 100644
--- a/sysdeps/x86_64/fpu/e_sqrt.c
+++ b/sysdeps/x86_64/fpu/e_sqrt.c
@@ -1,5 +1,5 @@
 /* Square root of floating point number.
-   Copyright (C) 2002-2015 Free Software Foundation, Inc.
+   Copyright (C) 2002-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/e_sqrtf.c b/sysdeps/x86_64/fpu/e_sqrtf.c
index 386ca1ce1d..639137b735 100644
--- a/sysdeps/x86_64/fpu/e_sqrtf.c
+++ b/sysdeps/x86_64/fpu/e_sqrtf.c
@@ -1,5 +1,5 @@
 /* Square root of floating point number.
-   Copyright (C) 2002-2015 Free Software Foundation, Inc.
+   Copyright (C) 2002-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/fclrexcpt.c b/sysdeps/x86_64/fpu/fclrexcpt.c
index ec03e1e2c2..a8e00c0141 100644
--- a/sysdeps/x86_64/fpu/fclrexcpt.c
+++ b/sysdeps/x86_64/fpu/fclrexcpt.c
@@ -1,5 +1,5 @@
 /* Clear given exceptions in current floating-point environment.
-   Copyright (C) 2001-2015 Free Software Foundation, Inc.
+   Copyright (C) 2001-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/fedisblxcpt.c b/sysdeps/x86_64/fpu/fedisblxcpt.c
index 95f585aefc..f1ea6cfa97 100644
--- a/sysdeps/x86_64/fpu/fedisblxcpt.c
+++ b/sysdeps/x86_64/fpu/fedisblxcpt.c
@@ -1,5 +1,5 @@
 /* Disable floating-point exceptions.
-   Copyright (C) 2001-2015 Free Software Foundation, Inc.
+   Copyright (C) 2001-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Andreas Jaeger <aj@suse.de>, 2001.
 
diff --git a/sysdeps/x86_64/fpu/feenablxcpt.c b/sysdeps/x86_64/fpu/feenablxcpt.c
index e04875fe21..df4c628b8d 100644
--- a/sysdeps/x86_64/fpu/feenablxcpt.c
+++ b/sysdeps/x86_64/fpu/feenablxcpt.c
@@ -1,5 +1,5 @@
 /* Enable floating-point exceptions.
-   Copyright (C) 2001-2015 Free Software Foundation, Inc.
+   Copyright (C) 2001-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Andreas Jaeger <aj@suse.de>, 2001.
 
diff --git a/sysdeps/x86_64/fpu/fegetenv.c b/sysdeps/x86_64/fpu/fegetenv.c
index 7314cee7e6..a28efb36f3 100644
--- a/sysdeps/x86_64/fpu/fegetenv.c
+++ b/sysdeps/x86_64/fpu/fegetenv.c
@@ -1,5 +1,5 @@
 /* Store current floating-point environment.
-   Copyright (C) 2001-2015 Free Software Foundation, Inc.
+   Copyright (C) 2001-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/fegetexcept.c b/sysdeps/x86_64/fpu/fegetexcept.c
index 27a0803aa2..8acd0382a0 100644
--- a/sysdeps/x86_64/fpu/fegetexcept.c
+++ b/sysdeps/x86_64/fpu/fegetexcept.c
@@ -1,5 +1,5 @@
 /* Get enabled floating-point exceptions.
-   Copyright (C) 2001-2015 Free Software Foundation, Inc.
+   Copyright (C) 2001-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Andreas Jaeger <aj@suse.de>, 2001.
 
diff --git a/sysdeps/x86_64/fpu/fegetround.c b/sysdeps/x86_64/fpu/fegetround.c
index b515d8afe7..296d366560 100644
--- a/sysdeps/x86_64/fpu/fegetround.c
+++ b/sysdeps/x86_64/fpu/fegetround.c
@@ -1,5 +1,5 @@
 /* Return current rounding direction.
-   Copyright (C) 1997-2015 Free Software Foundation, Inc.
+   Copyright (C) 1997-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
 
diff --git a/sysdeps/x86_64/fpu/feholdexcpt.c b/sysdeps/x86_64/fpu/feholdexcpt.c
index 615b702135..a040c3dea5 100644
--- a/sysdeps/x86_64/fpu/feholdexcpt.c
+++ b/sysdeps/x86_64/fpu/feholdexcpt.c
@@ -1,5 +1,5 @@
 /* Store current floating-point environment and clear exceptions.
-   Copyright (C) 2001-2015 Free Software Foundation, Inc.
+   Copyright (C) 2001-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/fesetenv.c b/sysdeps/x86_64/fpu/fesetenv.c
index 3e3fd8492d..355d02aaa6 100644
--- a/sysdeps/x86_64/fpu/fesetenv.c
+++ b/sysdeps/x86_64/fpu/fesetenv.c
@@ -1,5 +1,5 @@
 /* Install given floating-point environment.
-   Copyright (C) 2001-2015 Free Software Foundation, Inc.
+   Copyright (C) 2001-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -17,9 +17,15 @@
    <http://www.gnu.org/licenses/>.  */
 
 #include <fenv.h>
+#include <fpu_control.h>
 #include <assert.h>
 
 
+/* All exceptions, including the x86-specific "denormal operand"
+   exception.  */
+#define FE_ALL_EXCEPT_X86 (FE_ALL_EXCEPT | __FE_DENORM)
+
+
 int
 __fesetenv (const fenv_t *envp)
 {
@@ -34,43 +40,61 @@ __fesetenv (const fenv_t *envp)
 
   if (envp == FE_DFL_ENV)
     {
-      temp.__control_word |= FE_ALL_EXCEPT;
+      temp.__control_word |= FE_ALL_EXCEPT_X86;
       temp.__control_word &= ~FE_TOWARDZERO;
-      temp.__status_word &= ~FE_ALL_EXCEPT;
+      temp.__control_word |= _FPU_EXTENDED;
+      temp.__status_word &= ~FE_ALL_EXCEPT_X86;
       temp.__eip = 0;
       temp.__cs_selector = 0;
       temp.__opcode = 0;
       temp.__data_offset = 0;
       temp.__data_selector = 0;
+      /* Clear SSE exceptions.  */
+      temp.__mxcsr &= ~FE_ALL_EXCEPT_X86;
       /* Set mask for SSE MXCSR.  */
-      temp.__mxcsr |= (FE_ALL_EXCEPT << 7);
+      temp.__mxcsr |= (FE_ALL_EXCEPT_X86 << 7);
       /* Set rounding to FE_TONEAREST.  */
       temp.__mxcsr &= ~ 0x6000;
       temp.__mxcsr |= (FE_TONEAREST << 3);
+      /* Clear the FZ and DAZ bits.  */
+      temp.__mxcsr &= ~0x8040;
     }
   else if (envp == FE_NOMASK_ENV)
     {
       temp.__control_word &= ~(FE_ALL_EXCEPT | FE_TOWARDZERO);
-      temp.__status_word &= ~FE_ALL_EXCEPT;
+      /* Keep the "denormal operand" exception masked.  */
+      temp.__control_word |= __FE_DENORM;
+      temp.__control_word |= _FPU_EXTENDED;
+      temp.__status_word &= ~FE_ALL_EXCEPT_X86;
       temp.__eip = 0;
       temp.__cs_selector = 0;
       temp.__opcode = 0;
       temp.__data_offset = 0;
       temp.__data_selector = 0;
+      /* Clear SSE exceptions.  */
+      temp.__mxcsr &= ~FE_ALL_EXCEPT_X86;
       /* Set mask for SSE MXCSR.  */
       /* Set rounding to FE_TONEAREST.  */
       temp.__mxcsr &= ~ 0x6000;
       temp.__mxcsr |= (FE_TONEAREST << 3);
       /* Do not mask exceptions.  */
       temp.__mxcsr &= ~(FE_ALL_EXCEPT << 7);
+      /* Keep the "denormal operand" exception masked.  */
+      temp.__mxcsr |= (__FE_DENORM << 7);
+      /* Clear the FZ and DAZ bits.  */
+      temp.__mxcsr &= ~0x8040;
     }
   else
     {
-      temp.__control_word &= ~(FE_ALL_EXCEPT | FE_TOWARDZERO);
+      temp.__control_word &= ~(FE_ALL_EXCEPT_X86
+			       | FE_TOWARDZERO
+			       | _FPU_EXTENDED);
       temp.__control_word |= (envp->__control_word
-			      & (FE_ALL_EXCEPT | FE_TOWARDZERO));
-      temp.__status_word &= ~FE_ALL_EXCEPT;
-      temp.__status_word |= envp->__status_word & FE_ALL_EXCEPT;
+			      & (FE_ALL_EXCEPT_X86
+				 | FE_TOWARDZERO
+				 | _FPU_EXTENDED));
+      temp.__status_word &= ~FE_ALL_EXCEPT_X86;
+      temp.__status_word |= envp->__status_word & FE_ALL_EXCEPT_X86;
       temp.__eip = envp->__eip;
       temp.__cs_selector = envp->__cs_selector;
       temp.__opcode = envp->__opcode;
diff --git a/sysdeps/x86_64/fpu/fesetround.c b/sysdeps/x86_64/fpu/fesetround.c
index 2a9c351142..475d63f4db 100644
--- a/sysdeps/x86_64/fpu/fesetround.c
+++ b/sysdeps/x86_64/fpu/fesetround.c
@@ -1,5 +1,5 @@
 /* Set current rounding direction.
-   Copyright (C) 2001-2015 Free Software Foundation, Inc.
+   Copyright (C) 2001-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/feupdateenv.c b/sysdeps/x86_64/fpu/feupdateenv.c
index 99dfdd8f5c..f035d57ca8 100644
--- a/sysdeps/x86_64/fpu/feupdateenv.c
+++ b/sysdeps/x86_64/fpu/feupdateenv.c
@@ -1,5 +1,5 @@
 /* Install given floating-point environment and raise exceptions.
-   Copyright (C) 1997-2015 Free Software Foundation, Inc.
+   Copyright (C) 1997-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
 
diff --git a/sysdeps/x86_64/fpu/fgetexcptflg.c b/sysdeps/x86_64/fpu/fgetexcptflg.c
index e4f321e239..938cf3e62b 100644
--- a/sysdeps/x86_64/fpu/fgetexcptflg.c
+++ b/sysdeps/x86_64/fpu/fgetexcptflg.c
@@ -1,5 +1,5 @@
 /* Store current representation for exceptions.
-   Copyright (C) 2001-2015 Free Software Foundation, Inc.
+   Copyright (C) 2001-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/fraiseexcpt.c b/sysdeps/x86_64/fpu/fraiseexcpt.c
index 3cd924647e..e2abbbec33 100644
--- a/sysdeps/x86_64/fpu/fraiseexcpt.c
+++ b/sysdeps/x86_64/fpu/fraiseexcpt.c
@@ -1,5 +1,5 @@
 /* Raise given exceptions.
-   Copyright (C) 2001-2015 Free Software Foundation, Inc.
+   Copyright (C) 2001-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/fsetexcptflg.c b/sysdeps/x86_64/fpu/fsetexcptflg.c
index f7915e3fdd..76f7bad9a8 100644
--- a/sysdeps/x86_64/fpu/fsetexcptflg.c
+++ b/sysdeps/x86_64/fpu/fsetexcptflg.c
@@ -1,5 +1,5 @@
 /* Set floating-point environment exception handling.
-   Copyright (C) 2001-2015 Free Software Foundation, Inc.
+   Copyright (C) 2001-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/ftestexcept.c b/sysdeps/x86_64/fpu/ftestexcept.c
index 1e67c2fe21..c8f2c01c67 100644
--- a/sysdeps/x86_64/fpu/ftestexcept.c
+++ b/sysdeps/x86_64/fpu/ftestexcept.c
@@ -1,5 +1,5 @@
 /* Test exception in current environment.
-   Copyright (C) 2001-2015 Free Software Foundation, Inc.
+   Copyright (C) 2001-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/libm-test-ulps b/sysdeps/x86_64/fpu/libm-test-ulps
index de7d420aef..445b47527d 100644
--- a/sysdeps/x86_64/fpu/libm-test-ulps
+++ b/sysdeps/x86_64/fpu/libm-test-ulps
@@ -32,34 +32,34 @@ ildouble: 2
 ldouble: 2
 
 Function: "acosh":
-double: 1
+double: 2
 float: 2
-idouble: 1
+idouble: 2
 ifloat: 2
 ildouble: 2
 ldouble: 2
 
 Function: "acosh_downward":
-double: 1
-float: 1
-idouble: 1
-ifloat: 1
+double: 2
+float: 2
+idouble: 2
+ifloat: 2
 ildouble: 4
 ldouble: 4
 
 Function: "acosh_towardzero":
 double: 2
-float: 1
+float: 2
 idouble: 2
-ifloat: 1
+ifloat: 2
 ildouble: 4
 ldouble: 4
 
 Function: "acosh_upward":
 double: 2
-float: 1
+float: 2
 idouble: 2
-ifloat: 1
+ifloat: 2
 ildouble: 3
 ldouble: 3
 
@@ -98,8 +98,8 @@ double: 1
 float: 1
 idouble: 1
 ifloat: 1
-ildouble: 2
-ldouble: 2
+ildouble: 3
+ldouble: 3
 
 Function: "asinh_downward":
 double: 3
@@ -122,8 +122,8 @@ double: 3
 float: 3
 idouble: 3
 ifloat: 3
-ildouble: 4
-ldouble: 4
+ildouble: 5
+ldouble: 5
 
 Function: "atan":
 float: 1
@@ -186,18 +186,18 @@ ildouble: 1
 ldouble: 1
 
 Function: "atanh":
-double: 1
+double: 2
 float: 2
-idouble: 1
+idouble: 2
 ifloat: 2
 ildouble: 3
 ldouble: 3
 
 Function: "atanh_downward":
 double: 3
-float: 2
+float: 3
 idouble: 3
-ifloat: 2
+ifloat: 3
 ildouble: 5
 ldouble: 5
 
@@ -210,9 +210,9 @@ ildouble: 4
 ldouble: 4
 
 Function: "atanh_upward":
-double: 2
+double: 3
 float: 3
-idouble: 2
+idouble: 3
 ifloat: 3
 ildouble: 5
 ldouble: 5
@@ -668,9 +668,9 @@ ildouble: 1
 ldouble: 1
 
 Function: "cbrt_upward":
-double: 4
+double: 5
 float: 1
-idouble: 4
+idouble: 5
 ifloat: 1
 ildouble: 1
 ldouble: 1
@@ -869,11 +869,11 @@ ldouble: 3
 
 Function: Real part of "clog":
 double: 3
-float: 2
+float: 3
 idouble: 3
-ifloat: 2
-ildouble: 4
-ldouble: 4
+ifloat: 3
+ildouble: 3
+ldouble: 3
 
 Function: Imaginary part of "clog":
 float: 1
@@ -883,9 +883,9 @@ ldouble: 1
 
 Function: Real part of "clog10":
 double: 3
-float: 3
+float: 4
 idouble: 3
-ifloat: 3
+ifloat: 4
 ildouble: 4
 ldouble: 4
 
@@ -898,10 +898,10 @@ ildouble: 2
 ldouble: 2
 
 Function: Real part of "clog10_downward":
-double: 6
-float: 6
-idouble: 6
-ifloat: 6
+double: 5
+float: 4
+idouble: 5
+ifloat: 4
 ildouble: 8
 ldouble: 8
 
@@ -910,14 +910,14 @@ double: 2
 float: 4
 idouble: 2
 ifloat: 4
-ildouble: 2
-ldouble: 2
+ildouble: 3
+ldouble: 3
 
 Function: Real part of "clog10_towardzero":
 double: 5
-float: 4
+float: 5
 idouble: 5
-ifloat: 4
+ifloat: 5
 ildouble: 8
 ldouble: 8
 
@@ -930,28 +930,28 @@ ildouble: 3
 ldouble: 3
 
 Function: Real part of "clog10_upward":
-double: 8
+double: 6
 float: 5
-idouble: 8
+idouble: 6
 ifloat: 5
-ildouble: 6
-ldouble: 6
+ildouble: 8
+ldouble: 8
 
 Function: Imaginary part of "clog10_upward":
 double: 2
-float: 3
+float: 4
 idouble: 2
-ifloat: 3
+ifloat: 4
 ildouble: 3
 ldouble: 3
 
 Function: Real part of "clog_downward":
-double: 7
-float: 5
-idouble: 7
-ifloat: 5
-ildouble: 7
-ldouble: 7
+double: 4
+float: 3
+idouble: 4
+ifloat: 3
+ildouble: 5
+ldouble: 5
 
 Function: Imaginary part of "clog_downward":
 double: 1
@@ -962,28 +962,28 @@ ildouble: 1
 ldouble: 1
 
 Function: Real part of "clog_towardzero":
-double: 7
-float: 5
-idouble: 7
-ifloat: 5
-ildouble: 8
-ldouble: 8
+double: 4
+float: 4
+idouble: 4
+ifloat: 4
+ildouble: 5
+ldouble: 5
 
 Function: Imaginary part of "clog_towardzero":
 double: 1
-float: 2
+float: 3
 idouble: 1
-ifloat: 2
+ifloat: 3
 ildouble: 1
 ldouble: 1
 
 Function: Real part of "clog_upward":
-double: 8
-float: 5
-idouble: 8
-ifloat: 5
-ildouble: 6
-ldouble: 6
+double: 4
+float: 3
+idouble: 4
+ifloat: 3
+ildouble: 4
+ldouble: 4
 
 Function: Imaginary part of "clog_upward":
 double: 1
@@ -1019,14 +1019,14 @@ Function: "cos_vlen16":
 float: 1
 
 Function: "cos_vlen2":
-double: 1
+double: 2
 
 Function: "cos_vlen4":
-double: 1
+double: 2
 float: 1
 
 Function: "cos_vlen4_avx2":
-double: 1
+double: 2
 
 Function: "cos_vlen8":
 double: 1
@@ -1040,7 +1040,7 @@ double: 1
 float: 1
 idouble: 1
 ifloat: 1
-ildouble: 1
+ildouble: 2
 ldouble: 2
 
 Function: "cosh_downward":
@@ -1264,25 +1264,25 @@ ildouble: 2
 ldouble: 2
 
 Function: Real part of "csqrt_downward":
-double: 4
+double: 5
 float: 4
-idouble: 4
+idouble: 5
 ifloat: 4
-ildouble: 4
-ldouble: 4
+ildouble: 5
+ldouble: 5
 
 Function: Imaginary part of "csqrt_downward":
 double: 4
 float: 3
 idouble: 4
 ifloat: 3
-ildouble: 3
-ldouble: 3
+ildouble: 4
+ldouble: 4
 
 Function: Real part of "csqrt_towardzero":
-double: 3
+double: 4
 float: 3
-idouble: 3
+idouble: 4
 ifloat: 3
 ildouble: 4
 ldouble: 4
@@ -1292,8 +1292,8 @@ double: 4
 float: 3
 idouble: 4
 ifloat: 3
-ildouble: 3
-ldouble: 3
+ildouble: 4
+ldouble: 4
 
 Function: Real part of "csqrt_upward":
 double: 5
@@ -1308,8 +1308,8 @@ double: 3
 float: 3
 idouble: 3
 ifloat: 3
-ildouble: 3
-ldouble: 3
+ildouble: 4
+ldouble: 4
 
 Function: Real part of "ctan":
 double: 1
@@ -1472,17 +1472,17 @@ ildouble: 1
 ldouble: 1
 
 Function: "erfc":
-double: 2
+double: 3
 float: 2
-idouble: 2
+idouble: 3
 ifloat: 2
-ildouble: 2
-ldouble: 2
+ildouble: 3
+ldouble: 3
 
 Function: "erfc_downward":
-double: 4
+double: 5
 float: 6
-idouble: 4
+idouble: 5
 ifloat: 6
 ildouble: 4
 ldouble: 4
@@ -1496,12 +1496,12 @@ ildouble: 4
 ldouble: 4
 
 Function: "erfc_upward":
-double: 4
+double: 5
 float: 6
-idouble: 4
+idouble: 5
 ifloat: 6
-ildouble: 4
-ldouble: 4
+ildouble: 5
+ldouble: 5
 
 Function: "exp":
 ildouble: 1
@@ -1578,12 +1578,14 @@ ldouble: 1
 Function: "exp_towardzero":
 double: 1
 idouble: 1
-ildouble: 1
-ldouble: 1
+ildouble: 2
+ldouble: 2
 
 Function: "exp_upward":
 double: 1
+float: 1
 idouble: 1
+ifloat: 1
 ildouble: 1
 ldouble: 1
 
@@ -1625,9 +1627,9 @@ ldouble: 4
 
 Function: "expm1_towardzero":
 double: 1
-float: 1
+float: 2
 idouble: 1
-ifloat: 1
+ifloat: 2
 ildouble: 4
 ldouble: 4
 
@@ -1640,36 +1642,36 @@ ildouble: 4
 ldouble: 4
 
 Function: "gamma":
-double: 2
-float: 2
-idouble: 2
-ifloat: 2
-ildouble: 2
-ldouble: 2
-
-Function: "gamma_downward":
 double: 4
-float: 3
+float: 4
 idouble: 4
-ifloat: 3
-ildouble: 6
-ldouble: 6
+ifloat: 4
+ildouble: 4
+ldouble: 4
+
+Function: "gamma_downward":
+double: 5
+float: 4
+idouble: 5
+ifloat: 4
+ildouble: 7
+ldouble: 7
 
 Function: "gamma_towardzero":
-double: 4
-float: 3
-idouble: 4
-ifloat: 3
-ildouble: 6
-ldouble: 6
+double: 5
+float: 4
+idouble: 5
+ifloat: 4
+ildouble: 7
+ldouble: 7
 
 Function: "gamma_upward":
-double: 4
-float: 3
-idouble: 4
-ifloat: 3
-ildouble: 4
-ldouble: 4
+double: 5
+float: 5
+idouble: 5
+ifloat: 5
+ildouble: 6
+ldouble: 6
 
 Function: "hypot":
 double: 1
@@ -1792,36 +1794,36 @@ ildouble: 5
 ldouble: 5
 
 Function: "lgamma":
-double: 2
-float: 2
-idouble: 2
-ifloat: 2
-ildouble: 2
-ldouble: 2
-
-Function: "lgamma_downward":
 double: 4
-float: 3
+float: 4
 idouble: 4
-ifloat: 3
-ildouble: 6
-ldouble: 6
+ifloat: 4
+ildouble: 4
+ldouble: 4
+
+Function: "lgamma_downward":
+double: 5
+float: 4
+idouble: 5
+ifloat: 4
+ildouble: 7
+ldouble: 7
 
 Function: "lgamma_towardzero":
-double: 4
-float: 3
-idouble: 4
-ifloat: 3
-ildouble: 6
-ldouble: 6
+double: 5
+float: 4
+idouble: 5
+ifloat: 4
+ildouble: 7
+ldouble: 7
 
 Function: "lgamma_upward":
-double: 4
-float: 3
-idouble: 4
-ifloat: 3
-ildouble: 4
-ldouble: 4
+double: 5
+float: 5
+idouble: 5
+ifloat: 5
+ildouble: 6
+ldouble: 6
 
 Function: "log":
 float: 1
@@ -1874,16 +1876,16 @@ double: 2
 float: 2
 idouble: 2
 ifloat: 2
-ildouble: 3
-ldouble: 3
+ildouble: 4
+ldouble: 4
 
 Function: "log1p_towardzero":
 double: 2
 float: 2
 idouble: 2
 ifloat: 2
-ildouble: 3
-ldouble: 3
+ildouble: 4
+ldouble: 4
 
 Function: "log1p_upward":
 double: 2
@@ -1938,7 +1940,9 @@ ildouble: 2
 ldouble: 2
 
 Function: "log_upward":
+double: 1
 float: 2
+idouble: 1
 ifloat: 2
 ildouble: 1
 ldouble: 1
@@ -1964,8 +1968,8 @@ Function: "log_vlen8_avx2":
 float: 2
 
 Function: "pow":
-float: 3
-ifloat: 3
+float: 1
+ifloat: 1
 ildouble: 1
 ldouble: 1
 
@@ -2001,25 +2005,25 @@ ldouble: 2
 
 Function: "pow_downward":
 double: 1
-float: 3
+float: 1
 idouble: 1
-ifloat: 3
+ifloat: 1
 ildouble: 4
 ldouble: 4
 
 Function: "pow_towardzero":
 double: 1
-float: 4
+float: 1
 idouble: 1
-ifloat: 4
+ifloat: 1
 ildouble: 1
 ldouble: 1
 
 Function: "pow_upward":
 double: 1
-float: 4
+float: 1
 idouble: 1
-ifloat: 4
+ifloat: 1
 ildouble: 2
 ldouble: 2
 
@@ -2050,14 +2054,14 @@ ldouble: 1
 Function: "sin_downward":
 double: 1
 idouble: 1
-ildouble: 2
-ldouble: 2
+ildouble: 3
+ldouble: 3
 
 Function: "sin_towardzero":
 double: 1
 idouble: 1
-ildouble: 1
-ldouble: 1
+ildouble: 2
+ldouble: 2
 
 Function: "sin_upward":
 double: 1
@@ -2111,14 +2115,14 @@ Function: "sincos_vlen16":
 float: 1
 
 Function: "sincos_vlen2":
-double: 1
+double: 2
 
 Function: "sincos_vlen4":
-double: 1
+double: 2
 float: 1
 
 Function: "sincos_vlen4_avx2":
-double: 1
+double: 2
 
 Function: "sincos_vlen8":
 double: 1
@@ -2162,24 +2166,24 @@ ldouble: 5
 Function: "tan":
 float: 1
 ifloat: 1
-ildouble: 1
-ldouble: 1
+ildouble: 2
+ldouble: 2
 
 Function: "tan_downward":
 double: 1
 float: 2
 idouble: 1
 ifloat: 2
-ildouble: 2
-ldouble: 2
+ildouble: 3
+ldouble: 3
 
 Function: "tan_towardzero":
 double: 1
 float: 1
 idouble: 1
 ifloat: 1
-ildouble: 2
-ldouble: 2
+ildouble: 3
+ldouble: 3
 
 Function: "tan_upward":
 double: 1
@@ -2194,8 +2198,8 @@ double: 2
 float: 2
 idouble: 2
 ifloat: 2
-ildouble: 2
-ldouble: 2
+ildouble: 3
+ldouble: 3
 
 Function: "tanh_downward":
 double: 3
@@ -2222,36 +2226,36 @@ ildouble: 4
 ldouble: 4
 
 Function: "tgamma":
-double: 4
+double: 5
 float: 5
-idouble: 4
+idouble: 5
 ifloat: 5
-ildouble: 3
-ldouble: 3
+ildouble: 5
+ldouble: 5
 
 Function: "tgamma_downward":
-double: 4
-float: 4
-idouble: 4
-ifloat: 4
-ildouble: 3
-ldouble: 3
+double: 5
+float: 5
+idouble: 5
+ifloat: 5
+ildouble: 5
+ldouble: 5
 
 Function: "tgamma_towardzero":
 double: 5
 float: 5
 idouble: 5
 ifloat: 5
-ildouble: 3
-ldouble: 3
+ildouble: 5
+ldouble: 5
 
 Function: "tgamma_upward":
 double: 5
 float: 5
 idouble: 5
 ifloat: 5
-ildouble: 3
-ldouble: 3
+ildouble: 5
+ldouble: 5
 
 Function: "y0":
 double: 2
diff --git a/sysdeps/x86_64/fpu/math-tests-arch.h b/sysdeps/x86_64/fpu/math-tests-arch.h
index e8833bfe0a..867152046e 100644
--- a/sysdeps/x86_64/fpu/math-tests-arch.h
+++ b/sysdeps/x86_64/fpu/math-tests-arch.h
@@ -1,5 +1,5 @@
 /* Runtime architecture check for math tests. x86_64 version.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -19,66 +19,36 @@
 #if defined REQUIRE_AVX
 # include <init-arch.h>
 
-/* Set to 1 if AVX supported.  */
-static int avx_usable;
-
-# define INIT_ARCH_EXT                                         \
-  do                                                           \
-    {                                                          \
-      __init_cpu_features ();                                  \
-      avx_usable = __cpu_features.feature[index_AVX_Usable]    \
-                   & bit_AVX_Usable;                           \
-    }                                                          \
-  while (0)
+# define INIT_ARCH_EXT
 
 # define CHECK_ARCH_EXT                                        \
   do                                                           \
     {                                                          \
-      if (!avx_usable) return;                                 \
+      if (!HAS_ARCH_FEATURE (AVX_Usable)) return;              \
     }                                                          \
   while (0)
 
 #elif defined REQUIRE_AVX2
 # include <init-arch.h>
 
-  /* Set to 1 if AVX2 supported.  */
-  static int avx2_usable;
-
-# define INIT_ARCH_EXT                                         \
-  do                                                           \
-    {                                                          \
-      __init_cpu_features ();                                  \
-      avx2_usable = __cpu_features.feature[index_AVX2_Usable]  \
-                  & bit_AVX2_Usable;                           \
-    }                                                          \
-  while (0)
+# define INIT_ARCH_EXT
 
 # define CHECK_ARCH_EXT                                        \
   do                                                           \
     {                                                          \
-      if (!avx2_usable) return;                                \
+      if (!HAS_ARCH_FEATURE (AVX2_Usable)) return;             \
     }                                                          \
   while (0)
 
 #elif defined REQUIRE_AVX512F
 # include <init-arch.h>
 
-  /* Set to 1 if supported.  */
-  static int avx512f_usable;
-
-# define INIT_ARCH_EXT                                                \
-  do                                                                  \
-    {                                                                 \
-      __init_cpu_features ();                                         \
-      avx512f_usable = __cpu_features.feature[index_AVX512F_Usable]   \
-		       & bit_AVX512F_Usable;                          \
-    }                                                                 \
-  while (0)
+# define INIT_ARCH_EXT
 
 # define CHECK_ARCH_EXT                                        \
   do                                                           \
     {                                                          \
-      if (!avx512f_usable) return;                             \
+      if (!HAS_ARCH_FEATURE (AVX512F_Usable)) return;          \
     }                                                          \
   while (0)
 
diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
index 86ea473b4f..34542155aa 100644
--- a/sysdeps/x86_64/fpu/multiarch/Makefile
+++ b/sysdeps/x86_64/fpu/multiarch/Makefile
@@ -2,7 +2,6 @@ ifeq ($(subdir),math)
 libm-sysdep_routines += s_floor-c s_ceil-c s_floorf-c s_ceilf-c \
 			s_rint-c s_rintf-c s_nearbyint-c s_nearbyintf-c
 
-ifeq ($(have-mfma4),yes)
 libm-sysdep_routines += e_exp-fma4 e_log-fma4 e_pow-fma4 s_atan-fma4 \
 			e_asin-fma4 e_atan2-fma4 s_sin-fma4 s_tan-fma4 \
 			mplog-fma4 mpa-fma4 slowexp-fma4 slowpow-fma4 \
@@ -16,7 +15,7 @@ CFLAGS-e_asin-fma4.c = -mfma4
 CFLAGS-e_atan2-fma4.c = -mfma4
 CFLAGS-e_exp-fma4.c = -mfma4
 CFLAGS-e_log-fma4.c = -mfma4
-CFLAGS-e_pow-fma4.c = -mfma4
+CFLAGS-e_pow-fma4.c = -mfma4 $(config-cflags-nofma)
 CFLAGS-halfulp-fma4.c = -mfma4
 CFLAGS-mpa-fma4.c = -mfma4
 CFLAGS-mpatan-fma4.c = -mfma4
@@ -31,9 +30,7 @@ CFLAGS-slowexp-fma4.c = -mfma4
 CFLAGS-slowpow-fma4.c = -mfma4
 CFLAGS-s_sin-fma4.c = -mfma4
 CFLAGS-s_tan-fma4.c = -mfma4
-endif
 
-ifeq ($(config-cflags-sse2avx),yes)
 libm-sysdep_routines += e_exp-avx e_log-avx s_atan-avx \
 			e_atan2-avx s_sin-avx s_tan-avx \
 			mplog-avx mpa-avx slowexp-avx \
@@ -50,7 +47,6 @@ CFLAGS-s_sin-avx.c = -msse2avx -DSSE2AVX
 CFLAGS-slowexp-avx.c = -msse2avx -DSSE2AVX
 CFLAGS-s_tan-avx.c = -msse2avx -DSSE2AVX
 endif
-endif
 
 ifeq ($(subdir),mathvec)
 libmvec-sysdep_routines += svml_d_cos2_core_sse4 svml_d_cos4_core_avx2 \
diff --git a/sysdeps/x86_64/fpu/multiarch/e_asin.c b/sysdeps/x86_64/fpu/multiarch/e_asin.c
index 55865c02f3..111a5b99bd 100644
--- a/sysdeps/x86_64/fpu/multiarch/e_asin.c
+++ b/sysdeps/x86_64/fpu/multiarch/e_asin.c
@@ -1,7 +1,6 @@
-#ifdef HAVE_FMA4_SUPPORT
-# include <init-arch.h>
-# include <math.h>
-# include <math_private.h>
+#include <init-arch.h>
+#include <math.h>
+#include <math_private.h>
 
 extern double __ieee754_acos_sse2 (double);
 extern double __ieee754_asin_sse2 (double);
@@ -9,16 +8,19 @@ extern double __ieee754_acos_fma4 (double);
 extern double __ieee754_asin_fma4 (double);
 
 libm_ifunc (__ieee754_acos,
-	    HAS_FMA4 ? __ieee754_acos_fma4 : __ieee754_acos_sse2);
+	    HAS_ARCH_FEATURE (FMA4_Usable)
+	    ? __ieee754_acos_fma4
+	    : __ieee754_acos_sse2);
 strong_alias (__ieee754_acos, __acos_finite)
 
 libm_ifunc (__ieee754_asin,
-	    HAS_FMA4 ? __ieee754_asin_fma4 : __ieee754_asin_sse2);
+	    HAS_ARCH_FEATURE (FMA4_Usable)
+	    ? __ieee754_asin_fma4
+	    : __ieee754_asin_sse2);
 strong_alias (__ieee754_asin, __asin_finite)
 
-# define __ieee754_acos __ieee754_acos_sse2
-# define __ieee754_asin __ieee754_asin_sse2
-#endif
+#define __ieee754_acos __ieee754_acos_sse2
+#define __ieee754_asin __ieee754_asin_sse2
 
 
 #include <sysdeps/ieee754/dbl-64/e_asin.c>
diff --git a/sysdeps/x86_64/fpu/multiarch/e_atan2.c b/sysdeps/x86_64/fpu/multiarch/e_atan2.c
index 547681cb59..9ca3c02a44 100644
--- a/sysdeps/x86_64/fpu/multiarch/e_atan2.c
+++ b/sysdeps/x86_64/fpu/multiarch/e_atan2.c
@@ -1,25 +1,18 @@
-#if defined HAVE_FMA4_SUPPORT || defined HAVE_AVX_SUPPORT
-# include <init-arch.h>
-# include <math.h>
-# include <math_private.h>
+#include <init-arch.h>
+#include <math.h>
+#include <math_private.h>
 
 extern double __ieee754_atan2_sse2 (double, double);
 extern double __ieee754_atan2_avx (double, double);
-# ifdef HAVE_FMA4_SUPPORT
 extern double __ieee754_atan2_fma4 (double, double);
-# else
-#  undef HAS_FMA4
-#  define HAS_FMA4 0
-#  define __ieee754_atan2_fma4 ((void *) 0)
-# endif
 
 libm_ifunc (__ieee754_atan2,
-	    HAS_FMA4 ? __ieee754_atan2_fma4
-	    : (HAS_AVX ? __ieee754_atan2_avx : __ieee754_atan2_sse2));
+	    HAS_ARCH_FEATURE (FMA4_Usable) ? __ieee754_atan2_fma4
+	    : (HAS_ARCH_FEATURE (AVX_Usable)
+	       ? __ieee754_atan2_avx : __ieee754_atan2_sse2));
 strong_alias (__ieee754_atan2, __atan2_finite)
 
-# define __ieee754_atan2 __ieee754_atan2_sse2
-#endif
+#define __ieee754_atan2 __ieee754_atan2_sse2
 
 
 #include <sysdeps/ieee754/dbl-64/e_atan2.c>
diff --git a/sysdeps/x86_64/fpu/multiarch/e_exp.c b/sysdeps/x86_64/fpu/multiarch/e_exp.c
index d244954056..b7d7b5ff27 100644
--- a/sysdeps/x86_64/fpu/multiarch/e_exp.c
+++ b/sysdeps/x86_64/fpu/multiarch/e_exp.c
@@ -1,25 +1,18 @@
-#if defined HAVE_FMA4_SUPPORT || defined HAVE_AVX_SUPPORT
-# include <init-arch.h>
-# include <math.h>
-# include <math_private.h>
+#include <init-arch.h>
+#include <math.h>
+#include <math_private.h>
 
 extern double __ieee754_exp_sse2 (double);
 extern double __ieee754_exp_avx (double);
-# ifdef HAVE_FMA4_SUPPORT
 extern double __ieee754_exp_fma4 (double);
-# else
-#  undef HAS_FMA4
-#  define HAS_FMA4 0
-#  define __ieee754_exp_fma4 ((void *) 0)
-# endif
 
 libm_ifunc (__ieee754_exp,
-	    HAS_FMA4 ? __ieee754_exp_fma4
-	    : (HAS_AVX ? __ieee754_exp_avx : __ieee754_exp_sse2));
+	    HAS_ARCH_FEATURE (FMA4_Usable) ? __ieee754_exp_fma4
+	    : (HAS_ARCH_FEATURE (AVX_Usable)
+	       ? __ieee754_exp_avx : __ieee754_exp_sse2));
 strong_alias (__ieee754_exp, __exp_finite)
 
-# define __ieee754_exp __ieee754_exp_sse2
-#endif
+#define __ieee754_exp __ieee754_exp_sse2
 
 
 #include <sysdeps/ieee754/dbl-64/e_exp.c>
diff --git a/sysdeps/x86_64/fpu/multiarch/e_log.c b/sysdeps/x86_64/fpu/multiarch/e_log.c
index 98054737bd..cf9533d6c0 100644
--- a/sysdeps/x86_64/fpu/multiarch/e_log.c
+++ b/sysdeps/x86_64/fpu/multiarch/e_log.c
@@ -1,25 +1,18 @@
-#if defined HAVE_FMA4_SUPPORT || defined HAVE_AVX_SUPPORT
-# include <init-arch.h>
-# include <math.h>
-# include <math_private.h>
+#include <init-arch.h>
+#include <math.h>
+#include <math_private.h>
 
 extern double __ieee754_log_sse2 (double);
 extern double __ieee754_log_avx (double);
-# ifdef HAVE_FMA4_SUPPORT
 extern double __ieee754_log_fma4 (double);
-# else
-#  undef HAS_FMA4
-#  define HAS_FMA4 0
-#  define __ieee754_log_fma4 ((void *) 0)
-# endif
 
 libm_ifunc (__ieee754_log,
-	    HAS_FMA4 ? __ieee754_log_fma4
-	    : (HAS_AVX ? __ieee754_log_avx : __ieee754_log_sse2));
+	    HAS_ARCH_FEATURE (FMA4_Usable) ? __ieee754_log_fma4
+	    : (HAS_ARCH_FEATURE (AVX_Usable)
+	       ? __ieee754_log_avx : __ieee754_log_sse2));
 strong_alias (__ieee754_log, __log_finite)
 
-# define __ieee754_log __ieee754_log_sse2
-#endif
+#define __ieee754_log __ieee754_log_sse2
 
 
 #include <sysdeps/ieee754/dbl-64/e_log.c>
diff --git a/sysdeps/x86_64/fpu/multiarch/e_pow.c b/sysdeps/x86_64/fpu/multiarch/e_pow.c
index 433cce0de6..a5c5d89c3e 100644
--- a/sysdeps/x86_64/fpu/multiarch/e_pow.c
+++ b/sysdeps/x86_64/fpu/multiarch/e_pow.c
@@ -1,16 +1,17 @@
-#ifdef HAVE_FMA4_SUPPORT
-# include <init-arch.h>
-# include <math.h>
-# include <math_private.h>
+#include <init-arch.h>
+#include <math.h>
+#include <math_private.h>
 
 extern double __ieee754_pow_sse2 (double, double);
 extern double __ieee754_pow_fma4 (double, double);
 
-libm_ifunc (__ieee754_pow, HAS_FMA4 ? __ieee754_pow_fma4 : __ieee754_pow_sse2);
+libm_ifunc (__ieee754_pow,
+	    HAS_ARCH_FEATURE (FMA4_Usable)
+	    ? __ieee754_pow_fma4
+	    : __ieee754_pow_sse2);
 strong_alias (__ieee754_pow, __pow_finite)
 
-# define __ieee754_pow __ieee754_pow_sse2
-#endif
+#define __ieee754_pow __ieee754_pow_sse2
 
 
 #include <sysdeps/ieee754/dbl-64/e_pow.c>
diff --git a/sysdeps/x86_64/fpu/multiarch/s_atan.c b/sysdeps/x86_64/fpu/multiarch/s_atan.c
index ae16d7c9bb..742e95cb96 100644
--- a/sysdeps/x86_64/fpu/multiarch/s_atan.c
+++ b/sysdeps/x86_64/fpu/multiarch/s_atan.c
@@ -1,22 +1,15 @@
-#if defined HAVE_FMA4_SUPPORT || defined HAVE_AVX_SUPPORT
-# include <init-arch.h>
-# include <math.h>
+#include <init-arch.h>
+#include <math.h>
 
 extern double __atan_sse2 (double);
 extern double __atan_avx (double);
-# ifdef HAVE_FMA4_SUPPORT
 extern double __atan_fma4 (double);
-# else
-#  undef HAS_FMA4
-#  define HAS_FMA4 0
-#  define __atan_fma4 ((void *) 0)
-# endif
 
-libm_ifunc (atan, (HAS_FMA4 ? __atan_fma4 :
-		   HAS_AVX ? __atan_avx : __atan_sse2));
+libm_ifunc (atan, (HAS_ARCH_FEATURE (FMA4_Usable) ? __atan_fma4 :
+		   HAS_ARCH_FEATURE (AVX_Usable)
+		   ? __atan_avx : __atan_sse2));
 
-# define atan __atan_sse2
-#endif
+#define atan __atan_sse2
 
 
 #include <sysdeps/ieee754/dbl-64/s_atan.c>
diff --git a/sysdeps/x86_64/fpu/multiarch/s_ceil.S b/sysdeps/x86_64/fpu/multiarch/s_ceil.S
index 00ecede74d..40fa729955 100644
--- a/sysdeps/x86_64/fpu/multiarch/s_ceil.S
+++ b/sysdeps/x86_64/fpu/multiarch/s_ceil.S
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2015 Free Software Foundation, Inc.
+/* Copyright (C) 2011-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@gmail.come>, 2011.
 
@@ -22,10 +22,9 @@
 
 ENTRY(__ceil)
 	.type	__ceil, @gnu_indirect_function
-	call	__get_cpu_features@plt
-	movq	%rax, %rdx
+	LOAD_RTLD_GLOBAL_RO_RDX
 	leaq	__ceil_sse41(%rip), %rax
-	testl	$bit_SSE4_1, CPUID_OFFSET+index_SSE4_1(%rdx)
+	HAS_CPU_FEATURE (SSE4_1)
 	jnz	2f
 	leaq	__ceil_c(%rip), %rax
 2:	ret
diff --git a/sysdeps/x86_64/fpu/multiarch/s_ceilf.S b/sysdeps/x86_64/fpu/multiarch/s_ceilf.S
index c8ed70553e..9a06a5c174 100644
--- a/sysdeps/x86_64/fpu/multiarch/s_ceilf.S
+++ b/sysdeps/x86_64/fpu/multiarch/s_ceilf.S
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2015 Free Software Foundation, Inc.
+/* Copyright (C) 2011-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@gmail.come>, 2011.
 
@@ -22,10 +22,9 @@
 
 ENTRY(__ceilf)
 	.type	__ceilf, @gnu_indirect_function
-	call	__get_cpu_features@plt
-	movq	%rax, %rdx
+	LOAD_RTLD_GLOBAL_RO_RDX
 	leaq	__ceilf_sse41(%rip), %rax
-	testl	$bit_SSE4_1, CPUID_OFFSET+index_SSE4_1(%rdx)
+	HAS_CPU_FEATURE (SSE4_1)
 	jnz	2f
 	leaq	__ceilf_c(%rip), %rax
 2:	ret
diff --git a/sysdeps/x86_64/fpu/multiarch/s_floor.S b/sysdeps/x86_64/fpu/multiarch/s_floor.S
index 952ffaa314..57a0eee5ba 100644
--- a/sysdeps/x86_64/fpu/multiarch/s_floor.S
+++ b/sysdeps/x86_64/fpu/multiarch/s_floor.S
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2015 Free Software Foundation, Inc.
+/* Copyright (C) 2011-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@gmail.come>, 2011.
 
@@ -22,10 +22,9 @@
 
 ENTRY(__floor)
 	.type	__floor, @gnu_indirect_function
-	call	__get_cpu_features@plt
-	movq	%rax, %rdx
+	LOAD_RTLD_GLOBAL_RO_RDX
 	leaq	__floor_sse41(%rip), %rax
-	testl	$bit_SSE4_1, CPUID_OFFSET+index_SSE4_1(%rdx)
+	HAS_CPU_FEATURE (SSE4_1)
 	jnz	2f
 	leaq	__floor_c(%rip), %rax
 2:	ret
diff --git a/sysdeps/x86_64/fpu/multiarch/s_floorf.S b/sysdeps/x86_64/fpu/multiarch/s_floorf.S
index c8231e86b3..74a149a950 100644
--- a/sysdeps/x86_64/fpu/multiarch/s_floorf.S
+++ b/sysdeps/x86_64/fpu/multiarch/s_floorf.S
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2015 Free Software Foundation, Inc.
+/* Copyright (C) 2011-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@gmail.come>, 2011.
 
@@ -22,10 +22,9 @@
 
 ENTRY(__floorf)
 	.type	__floorf, @gnu_indirect_function
-	call	__get_cpu_features@plt
-	movq	%rax, %rdx
+	LOAD_RTLD_GLOBAL_RO_RDX
 	leaq	__floorf_sse41(%rip), %rax
-	testl	$bit_SSE4_1, CPUID_OFFSET+index_SSE4_1(%rdx)
+	HAS_CPU_FEATURE (SSE4_1)
 	jnz	2f
 	leaq	__floorf_c(%rip), %rax
 2:	ret
diff --git a/sysdeps/x86_64/fpu/multiarch/s_fma.c b/sysdeps/x86_64/fpu/multiarch/s_fma.c
index 0963a0b36a..1de1a84cbe 100644
--- a/sysdeps/x86_64/fpu/multiarch/s_fma.c
+++ b/sysdeps/x86_64/fpu/multiarch/s_fma.c
@@ -1,5 +1,5 @@
 /* FMA version of fma.
-   Copyright (C) 2009-2015 Free Software Foundation, Inc.
+   Copyright (C) 2009-2016 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
 
@@ -21,8 +21,6 @@
 #include <math.h>
 #include <init-arch.h>
 
-#ifdef HAVE_AVX_SUPPORT
-
 extern double __fma_sse2 (double x, double y, double z) attribute_hidden;
 
 
@@ -34,25 +32,19 @@ __fma_fma3 (double x, double y, double z)
 }
 
 
-# ifdef HAVE_FMA4_SUPPORT
 static double
 __fma_fma4 (double x, double y, double z)
 {
   asm ("vfmaddsd %3, %2, %1, %0" : "=x" (x) : "x" (x), "x" (y), "x" (z));
   return x;
 }
-# else
-#  undef HAS_FMA4
-#  define HAS_FMA4 0
-#  define __fma_fma4 ((void *) 0)
-# endif
 
 
-libm_ifunc (__fma, HAS_FMA
-	    ? __fma_fma3 : (HAS_FMA4 ? __fma_fma4 : __fma_sse2));
+libm_ifunc (__fma, HAS_ARCH_FEATURE (FMA_Usable)
+	    ? __fma_fma3 : (HAS_ARCH_FEATURE (FMA4_Usable)
+			    ? __fma_fma4 : __fma_sse2));
 weak_alias (__fma, fma)
 
-# define __fma __fma_sse2
-#endif
+#define __fma __fma_sse2
 
 #include <sysdeps/ieee754/dbl-64/s_fma.c>
diff --git a/sysdeps/x86_64/fpu/multiarch/s_fmaf.c b/sysdeps/x86_64/fpu/multiarch/s_fmaf.c
index 6046961f86..8905e4b54f 100644
--- a/sysdeps/x86_64/fpu/multiarch/s_fmaf.c
+++ b/sysdeps/x86_64/fpu/multiarch/s_fmaf.c
@@ -1,5 +1,5 @@
 /* FMA version of fmaf.
-   Copyright (C) 2009-2015 Free Software Foundation, Inc.
+   Copyright (C) 2009-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -20,8 +20,6 @@
 #include <math.h>
 #include <init-arch.h>
 
-#ifdef HAVE_AVX_SUPPORT
-
 extern float __fmaf_sse2 (float x, float y, float z) attribute_hidden;
 
 
@@ -33,25 +31,19 @@ __fmaf_fma3 (float x, float y, float z)
 }
 
 
-# ifdef HAVE_FMA4_SUPPORT
 static float
 __fmaf_fma4 (float x, float y, float z)
 {
   asm ("vfmaddss %3, %2, %1, %0" : "=x" (x) : "x" (x), "x" (y), "x" (z));
   return x;
 }
-# else
-#  undef HAS_FMA4
-#  define HAS_FMA4 0
-#  define __fmaf_fma4 ((void *) 0)
-# endif
 
 
-libm_ifunc (__fmaf, HAS_FMA
-	    ? __fmaf_fma3 : (HAS_FMA4 ? __fmaf_fma4 : __fmaf_sse2));
+libm_ifunc (__fmaf, HAS_ARCH_FEATURE (FMA_Usable)
+	    ? __fmaf_fma3 : (HAS_ARCH_FEATURE (FMA4_Usable)
+			     ? __fmaf_fma4 : __fmaf_sse2));
 weak_alias (__fmaf, fmaf)
 
-# define __fmaf __fmaf_sse2
-#endif
+#define __fmaf __fmaf_sse2
 
 #include <sysdeps/ieee754/dbl-64/s_fmaf.c>
diff --git a/sysdeps/x86_64/fpu/multiarch/s_nearbyint.S b/sysdeps/x86_64/fpu/multiarch/s_nearbyint.S
index b5d32b5873..5091cf5813 100644
--- a/sysdeps/x86_64/fpu/multiarch/s_nearbyint.S
+++ b/sysdeps/x86_64/fpu/multiarch/s_nearbyint.S
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2015 Free Software Foundation, Inc.
+/* Copyright (C) 2011-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@gmail.come>, 2011.
 
@@ -22,10 +22,9 @@
 
 ENTRY(__nearbyint)
 	.type	__nearbyint, @gnu_indirect_function
-	call	__get_cpu_features@plt
-	movq	%rax, %rdx
+	LOAD_RTLD_GLOBAL_RO_RDX
 	leaq	__nearbyint_sse41(%rip), %rax
-	testl	$bit_SSE4_1, CPUID_OFFSET+index_SSE4_1(%rdx)
+	HAS_CPU_FEATURE (SSE4_1)
 	jnz	2f
 	leaq	__nearbyint_c(%rip), %rax
 2:	ret
diff --git a/sysdeps/x86_64/fpu/multiarch/s_nearbyintf.S b/sysdeps/x86_64/fpu/multiarch/s_nearbyintf.S
index cd7e177a55..4a13700001 100644
--- a/sysdeps/x86_64/fpu/multiarch/s_nearbyintf.S
+++ b/sysdeps/x86_64/fpu/multiarch/s_nearbyintf.S
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2015 Free Software Foundation, Inc.
+/* Copyright (C) 2011-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@gmail.come>, 2011.
 
@@ -22,10 +22,9 @@
 
 ENTRY(__nearbyintf)
 	.type	__nearbyintf, @gnu_indirect_function
-	call	__get_cpu_features@plt
-	movq	%rax, %rdx
+	LOAD_RTLD_GLOBAL_RO_RDX
 	leaq	__nearbyintf_sse41(%rip), %rax
-	testl	$bit_SSE4_1, CPUID_OFFSET+index_SSE4_1(%rdx)
+	HAS_CPU_FEATURE (SSE4_1)
 	jnz	2f
 	leaq	__nearbyintf_c(%rip), %rax
 2:	ret
diff --git a/sysdeps/x86_64/fpu/multiarch/s_rint.S b/sysdeps/x86_64/fpu/multiarch/s_rint.S
index f52cef65db..1c0d1e14b7 100644
--- a/sysdeps/x86_64/fpu/multiarch/s_rint.S
+++ b/sysdeps/x86_64/fpu/multiarch/s_rint.S
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2015 Free Software Foundation, Inc.
+/* Copyright (C) 2011-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@gmail.come>, 2011.
 
@@ -22,10 +22,9 @@
 
 ENTRY(__rint)
 	.type	__rint, @gnu_indirect_function
-	call	__get_cpu_features@plt
-	movq	%rax, %rdx
+	LOAD_RTLD_GLOBAL_RO_RDX
 	leaq	__rint_sse41(%rip), %rax
-	testl	$bit_SSE4_1, CPUID_OFFSET+index_SSE4_1(%rdx)
+	HAS_CPU_FEATURE (SSE4_1)
 	jnz	2f
 	leaq	__rint_c(%rip), %rax
 2:	ret
diff --git a/sysdeps/x86_64/fpu/multiarch/s_rintf.S b/sysdeps/x86_64/fpu/multiarch/s_rintf.S
index e2608d4c4e..8e42fa561f 100644
--- a/sysdeps/x86_64/fpu/multiarch/s_rintf.S
+++ b/sysdeps/x86_64/fpu/multiarch/s_rintf.S
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2015 Free Software Foundation, Inc.
+/* Copyright (C) 2011-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@gmail.come>, 2011.
 
@@ -22,10 +22,9 @@
 
 ENTRY(__rintf)
 	.type	__rintf, @gnu_indirect_function
-	call	__get_cpu_features@plt
-	movq	%rax, %rdx
+	LOAD_RTLD_GLOBAL_RO_RDX
 	leaq	__rintf_sse41(%rip), %rax
-	testl	$bit_SSE4_1, CPUID_OFFSET+index_SSE4_1(%rdx)
+	HAS_CPU_FEATURE (SSE4_1)
 	jnz	2f
 	leaq	__rintf_c(%rip), %rax
 2:	ret
diff --git a/sysdeps/x86_64/fpu/multiarch/s_sin.c b/sysdeps/x86_64/fpu/multiarch/s_sin.c
index a0c2521c98..8ffd3e7125 100644
--- a/sysdeps/x86_64/fpu/multiarch/s_sin.c
+++ b/sysdeps/x86_64/fpu/multiarch/s_sin.c
@@ -1,33 +1,26 @@
-#if defined HAVE_FMA4_SUPPORT || defined HAVE_AVX_SUPPORT
-# include <init-arch.h>
-# include <math.h>
-# undef NAN
+#include <init-arch.h>
+#include <math.h>
+#undef NAN
 
 extern double __cos_sse2 (double);
 extern double __sin_sse2 (double);
 extern double __cos_avx (double);
 extern double __sin_avx (double);
-# ifdef HAVE_FMA4_SUPPORT
 extern double __cos_fma4 (double);
 extern double __sin_fma4 (double);
-# else
-#  undef HAS_FMA4
-#  define HAS_FMA4 0
-#  define __cos_fma4 ((void *) 0)
-#  define __sin_fma4 ((void *) 0)
-# endif
 
-libm_ifunc (__cos, (HAS_FMA4 ? __cos_fma4 :
-		    HAS_AVX ? __cos_avx : __cos_sse2));
+libm_ifunc (__cos, (HAS_ARCH_FEATURE (FMA4_Usable) ? __cos_fma4 :
+		    HAS_ARCH_FEATURE (AVX_Usable)
+		    ? __cos_avx : __cos_sse2));
 weak_alias (__cos, cos)
 
-libm_ifunc (__sin, (HAS_FMA4 ? __sin_fma4 :
-		    HAS_AVX ? __sin_avx : __sin_sse2));
+libm_ifunc (__sin, (HAS_ARCH_FEATURE (FMA4_Usable) ? __sin_fma4 :
+		    HAS_ARCH_FEATURE (AVX_Usable)
+		    ? __sin_avx : __sin_sse2));
 weak_alias (__sin, sin)
 
-# define __cos __cos_sse2
-# define __sin __sin_sse2
-#endif
+#define __cos __cos_sse2
+#define __sin __sin_sse2
 
 
 #include <sysdeps/ieee754/dbl-64/s_sin.c>
diff --git a/sysdeps/x86_64/fpu/multiarch/s_tan.c b/sysdeps/x86_64/fpu/multiarch/s_tan.c
index 904308fada..25f3bca07e 100644
--- a/sysdeps/x86_64/fpu/multiarch/s_tan.c
+++ b/sysdeps/x86_64/fpu/multiarch/s_tan.c
@@ -1,22 +1,15 @@
-#if defined HAVE_FMA4_SUPPORT || defined HAVE_AVX_SUPPORT
-# include <init-arch.h>
-# include <math.h>
+#include <init-arch.h>
+#include <math.h>
 
 extern double __tan_sse2 (double);
 extern double __tan_avx (double);
-# ifdef HAVE_FMA4_SUPPORT
 extern double __tan_fma4 (double);
-# else
-#  undef HAS_FMA4
-#  define HAS_FMA4 0
-#  define __tan_fma4 ((void *) 0)
-# endif
 
-libm_ifunc (tan, (HAS_FMA4 ? __tan_fma4 :
-		  HAS_AVX ? __tan_avx : __tan_sse2));
+libm_ifunc (tan, (HAS_ARCH_FEATURE (FMA4_Usable) ? __tan_fma4 :
+		  HAS_ARCH_FEATURE (AVX_Usable)
+		  ? __tan_avx : __tan_sse2));
 
-# define tan __tan_sse2
-#endif
+#define tan __tan_sse2
 
 
 #include <sysdeps/ieee754/dbl-64/s_tan.c>
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core.S
index 5f67d83bd4..7d720e2fcb 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core.S
@@ -1,5 +1,5 @@
 /* Multiple versions of vectorized cos, vector length is 2.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVbN2v_cos)
         .type   _ZGVbN2v_cos, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVbN2v_cos_sse4(%rip), %rax
-        testl   $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVbN2v_cos_sse4(%rip), %rax
+	HAS_CPU_FEATURE (SSE4_1)
         jz      2f
         ret
 2:      leaq    _ZGVbN2v_cos_sse2(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core_sse4.S
index 4420edcae0..088fcae067 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core_sse4.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core_sse4.S
@@ -1,5 +1,5 @@
 /* Function cos vectorized with SSE4.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core.S
index 5babb834ad..65a3570d2e 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core.S
@@ -1,5 +1,5 @@
 /* Multiple versions of vectorized cos, vector length is 4.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVdN4v_cos)
         .type   _ZGVdN4v_cos, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVdN4v_cos_avx2(%rip), %rax
-        testl   $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVdN4v_cos_avx2(%rip), %rax
+	HAS_ARCH_FEATURE (AVX2_Usable)
         jz      2f
         ret
 2:      leaq    _ZGVdN4v_cos_sse_wrapper(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core_avx2.S
index 9a776e7df7..4e653216d9 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core_avx2.S
@@ -1,5 +1,5 @@
 /* Function cos vectorized with AVX2.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S
index d0f4f27f46..3e7f16d44e 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S
@@ -1,5 +1,5 @@
 /* Multiple versions of vectorized cos, vector length is 8.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -22,14 +22,12 @@
 	.text
 ENTRY (_ZGVeN8v_cos)
         .type   _ZGVeN8v_cos, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
+	LOAD_RTLD_GLOBAL_RO_RDX
 1:      leaq    _ZGVeN8v_cos_skx(%rip), %rax
-        testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
+	HAS_ARCH_FEATURE (AVX512DQ_Usable)
         jnz     2f
         leaq    _ZGVeN8v_cos_knl(%rip), %rax
-        testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
+	HAS_ARCH_FEATURE (AVX512F_Usable)
         jnz     2f
         leaq    _ZGVeN8v_cos_avx2_wrapper(%rip), %rax
 2:      ret
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
index b376155210..1cac1d827a 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
@@ -1,5 +1,5 @@
 /* Function cos vectorized with AVX-512, KNL and SKX versions.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core.S
index ef3dc49a1c..136c67a550 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core.S
@@ -1,5 +1,5 @@
 /* Multiple versions of vectorized exp.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVbN2v_exp)
         .type   _ZGVbN2v_exp, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVbN2v_exp_sse4(%rip), %rax
-        testl   $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVbN2v_exp_sse4(%rip), %rax
+	HAS_CPU_FEATURE (SSE4_1)
         jz      2f
         ret
 2:      leaq    _ZGVbN2v_exp_sse2(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core_sse4.S
index 1f5445924a..445b230152 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core_sse4.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core_sse4.S
@@ -1,5 +1,5 @@
 /* Function exp vectorized with SSE4.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core.S
index 7f2ebdef67..9d6a47be0a 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core.S
@@ -1,5 +1,5 @@
 /* Multiple versions of vectorized exp.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVdN4v_exp)
         .type   _ZGVdN4v_exp, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVdN4v_exp_avx2(%rip), %rax
-        testl   $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVdN4v_exp_avx2(%rip), %rax
+	HAS_ARCH_FEATURE (AVX2_Usable)
         jz      2f
         ret
 2:      leaq    _ZGVdN4v_exp_sse_wrapper(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core_avx2.S
index a34e267433..25f9e28941 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core_avx2.S
@@ -1,5 +1,5 @@
 /* Function exp vectorized with AVX2.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S
index 7b7c07d926..317ee36e61 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S
@@ -1,5 +1,5 @@
 /* Multiple versions of vectorized exp.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -22,14 +22,12 @@
 	.text
 ENTRY (_ZGVeN8v_exp)
         .type   _ZGVeN8v_exp, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVeN8v_exp_skx(%rip), %rax
-        testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVeN8v_exp_skx(%rip), %rax
+	HAS_ARCH_FEATURE (AVX512DQ_Usable)
         jnz     2f
         leaq    _ZGVeN8v_exp_knl(%rip), %rax
-        testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
+	HAS_ARCH_FEATURE (AVX512F_Usable)
         jnz     2f
         leaq    _ZGVeN8v_exp_avx2_wrapper(%rip), %rax
 2:      ret
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core_avx512.S
index 049a7e49cd..74f1d2ce7b 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core_avx512.S
@@ -1,5 +1,5 @@
 /* Function exp vectorized with AVX-512. KNL and SKX versions.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core.S
index 38d369fc3c..03d86a3e63 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core.S
@@ -1,5 +1,5 @@
 /* Multiple versions of vectorized log.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -22,11 +22,9 @@
         .text
 ENTRY (_ZGVbN2v_log)
         .type   _ZGVbN2v_log, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVbN2v_log_sse4(%rip), %rax
-        testl   $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVbN2v_log_sse4(%rip), %rax
+	HAS_CPU_FEATURE (SSE4_1)
         jz      2f
         ret
 2:      leaq    _ZGVbN2v_log_sse2(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core_sse4.S
index 82f3d8215d..5d254288f6 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core_sse4.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core_sse4.S
@@ -1,5 +1,5 @@
 /* Function log vectorized with SSE4.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core.S
index ddb6105405..9f6ddbef15 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core.S
@@ -1,5 +1,5 @@
 /* Multiple versions of vectorized log.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVdN4v_log)
         .type   _ZGVdN4v_log, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVdN4v_log_avx2(%rip), %rax
-        testl   $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVdN4v_log_avx2(%rip), %rax
+	HAS_ARCH_FEATURE (AVX2_Usable)
         jz      2f
         ret
 2:      leaq    _ZGVdN4v_log_sse_wrapper(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core_avx2.S
index 816aede395..5da298747d 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core_avx2.S
@@ -1,5 +1,5 @@
 /* Function log vectorized with AVX2.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S
index 76375fdae0..2e1a1da1a5 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S
@@ -1,5 +1,5 @@
 /* Multiple versions of vectorized log.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -22,14 +22,12 @@
 	.text
 ENTRY (_ZGVeN8v_log)
         .type   _ZGVeN8v_log, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVeN8v_log_skx(%rip), %rax
-        testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVeN8v_log_skx(%rip), %rax
+	HAS_ARCH_FEATURE (AVX512DQ_Usable)
         jnz     2f
         leaq    _ZGVeN8v_log_knl(%rip), %rax
-        testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
+	HAS_ARCH_FEATURE (AVX512F_Usable)
         jnz     2f
         leaq    _ZGVeN8v_log_avx2_wrapper(%rip), %rax
 2:      ret
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
index b0f3dd580c..dca8e61f34 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
@@ -1,5 +1,5 @@
 /* Function log vectorized with AVX-512. KNL and SKX versions.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core.S
index f111388922..4a50246889 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core.S
@@ -1,5 +1,5 @@
 /* Multiple versions of vectorized pow.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVbN2vv_pow)
         .type   _ZGVbN2vv_pow, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVbN2vv_pow_sse4(%rip), %rax
-        testl   $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVbN2vv_pow_sse4(%rip), %rax
+	HAS_CPU_FEATURE (SSE4_1)
         jz      2f
         ret
 2:      leaq    _ZGVbN2vv_pow_sse2(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core_sse4.S
index 9f6ec29ac5..064d170878 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core_sse4.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core_sse4.S
@@ -1,5 +1,5 @@
 /* Function pow vectorized with SSE4.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core.S
index 21e3070a42..fb9f989adc 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core.S
@@ -1,5 +1,5 @@
 /* Multiple versions of vectorized pow.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVdN4vv_pow)
         .type   _ZGVdN4vv_pow, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVdN4vv_pow_avx2(%rip), %rax
-        testl   $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVdN4vv_pow_avx2(%rip), %rax
+	HAS_ARCH_FEATURE (AVX2_Usable)
         jz      2f
         ret
 2:      leaq    _ZGVdN4vv_pow_sse_wrapper(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core_avx2.S
index f1f1f35ca2..f2a73ffe1e 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core_avx2.S
@@ -1,5 +1,5 @@
 /* Function pow vectorized with AVX2.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S
index c1e5e76f92..30bc53f2f7 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S
@@ -1,5 +1,5 @@
 /* Multiple versions of vectorized pow.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -22,14 +22,12 @@
 	.text
 ENTRY (_ZGVeN8vv_pow)
         .type   _ZGVeN8vv_pow, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVeN8vv_pow_skx(%rip), %rax
-        testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVeN8vv_pow_skx(%rip), %rax
+	HAS_ARCH_FEATURE (AVX512DQ_Usable)
         jnz     2f
         leaq    _ZGVeN8vv_pow_knl(%rip), %rax
-        testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
+	HAS_ARCH_FEATURE (AVX512F_Usable)
         jnz     2f
         leaq    _ZGVeN8vv_pow_avx2_wrapper(%rip), %rax
 2:      ret
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core_avx512.S
index 8dd89c8ebb..4a515233fc 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core_avx512.S
@@ -1,5 +1,5 @@
 /* Function pow vectorized with AVX-512. KNL and SKX versions.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core.S
index 29bd0a7b4d..112bec2224 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core.S
@@ -1,5 +1,5 @@
 /* Multiple versions of vectorized sin.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVbN2v_sin)
         .type   _ZGVbN2v_sin, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVbN2v_sin_sse4(%rip), %rax
-        testl   $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVbN2v_sin_sse4(%rip), %rax
+	HAS_CPU_FEATURE (SSE4_1)
         jz      2f
         ret
 2:      leaq    _ZGVbN2v_sin_sse2(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core_sse4.S
index 3a1ccbf139..5755ce6f74 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core_sse4.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core_sse4.S
@@ -1,5 +1,5 @@
 /* Function sin vectorized with SSE4.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core.S
index c3a453a477..700a1c629d 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core.S
@@ -1,5 +1,5 @@
 /* Multiple versions of vectorized sin, vector length is 4.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVdN4v_sin)
         .type   _ZGVdN4v_sin, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVdN4v_sin_avx2(%rip), %rax
-        testl   $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVdN4v_sin_avx2(%rip), %rax
+	HAS_ARCH_FEATURE (AVX2_Usable)
         jz      2f
         ret
 2:      leaq    _ZGVdN4v_sin_sse_wrapper(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core_avx2.S
index 6bf8b32b4f..46b557158a 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core_avx2.S
@@ -1,5 +1,5 @@
 /* Function sin vectorized with AVX2.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S
index 131f2f47c5..5afce0ed88 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S
@@ -1,5 +1,5 @@
 /* Multiple versions of vectorized sin.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -22,14 +22,12 @@
 	.text
 ENTRY (_ZGVeN8v_sin)
         .type   _ZGVeN8v_sin, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVeN8v_sin_skx(%rip), %rax
-        testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVeN8v_sin_skx(%rip), %rax
+	HAS_ARCH_FEATURE (AVX512DQ_Usable)
         jnz     2f
         leaq    _ZGVeN8v_sin_knl(%rip), %rax
-        testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
+	HAS_ARCH_FEATURE (AVX512F_Usable)
         jnz     2f
         leaq    _ZGVeN8v_sin_avx2_wrapper(%rip), %rax
 2:      ret
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
index 422f6e8b0f..6c565f3861 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
@@ -1,5 +1,5 @@
 /* Function sin vectorized with AVX-512, KNL and SKX versions.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core.S
index e8e5771808..883d7d33a4 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core.S
@@ -1,5 +1,5 @@
 /* Multiple versions of vectorized sincos.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVbN2vvv_sincos)
         .type   _ZGVbN2vvv_sincos, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVbN2vvv_sincos_sse4(%rip), %rax
-        testl   $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVbN2vvv_sincos_sse4(%rip), %rax
+	HAS_CPU_FEATURE (SSE4_1)
         jz      2f
         ret
 2:      leaq    _ZGVbN2vvv_sincos_sse2(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core_sse4.S
index b504d1d732..65ad540122 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core_sse4.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core_sse4.S
@@ -1,5 +1,5 @@
 /* Function sincos vectorized with SSE4.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core.S
index 64744ffa62..69a3f74650 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core.S
@@ -1,5 +1,5 @@
 /* Multiple versions of vectorized sincos.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVdN4vvv_sincos)
         .type   _ZGVdN4vvv_sincos, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVdN4vvv_sincos_avx2(%rip), %rax
-        testl   $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVdN4vvv_sincos_avx2(%rip), %rax
+	HAS_ARCH_FEATURE (AVX2_Usable)
         jz      2f
         ret
 2:      leaq    _ZGVdN4vvv_sincos_sse_wrapper(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core_avx2.S
index dca5604111..60d03e9f8b 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core_avx2.S
@@ -1,5 +1,5 @@
 /* Function sincos vectorized with AVX2.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S
index e33109099e..64cb08c5d1 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S
@@ -1,5 +1,5 @@
 /* Multiple versions of vectorized sincos.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -22,14 +22,12 @@
 	.text
 ENTRY (_ZGVeN8vvv_sincos)
         .type   _ZGVeN8vvv_sincos, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVeN8vvv_sincos_skx(%rip), %rax
-        testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVeN8vvv_sincos_skx(%rip), %rax
+	HAS_ARCH_FEATURE (AVX512DQ_Usable)
         jnz     2f
         leaq    _ZGVeN8vvv_sincos_knl(%rip), %rax
-        testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
+	HAS_ARCH_FEATURE (AVX512F_Usable)
         jnz     2f
         leaq    _ZGVeN8vvv_sincos_avx2_wrapper(%rip), %rax
 2:      ret
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
index e8388325f7..44700f90b8 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
@@ -1,5 +1,5 @@
 /* Function sincos vectorized with AVX-512. KNL and SKX versions.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S
index 0654d3c19b..755254a280 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S
@@ -1,5 +1,5 @@
 /* Multiple versions of vectorized cosf.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -22,14 +22,12 @@
 	.text
 ENTRY (_ZGVeN16v_cosf)
         .type   _ZGVeN16v_cosf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVeN16v_cosf_skx(%rip), %rax
-        testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVeN16v_cosf_skx(%rip), %rax
+	HAS_ARCH_FEATURE (AVX512DQ_Usable)
         jnz     2f
         leaq    _ZGVeN16v_cosf_knl(%rip), %rax
-        testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
+	HAS_ARCH_FEATURE (AVX512F_Usable)
         jnz     2f
         leaq    _ZGVeN16v_cosf_avx2_wrapper(%rip), %rax
 2:      ret
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
index e777476d73..5004cd4758 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
@@ -1,5 +1,5 @@
 /* Function cosf vectorized with AVX-512. KNL and SKX versions.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core.S
index fa2363bb1f..ad7de18851 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core.S
@@ -1,5 +1,5 @@
 /* Multiple versions of vectorized cosf, vector length is 4.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVbN4v_cosf)
         .type   _ZGVbN4v_cosf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVbN4v_cosf_sse4(%rip), %rax
-        testl   $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVbN4v_cosf_sse4(%rip), %rax
+	HAS_CPU_FEATURE (SSE4_1)
         jz      2f
         ret
 2:      leaq    _ZGVbN4v_cosf_sse2(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core_sse4.S
index bdb6591905..d23ff72a30 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core_sse4.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core_sse4.S
@@ -1,5 +1,5 @@
 /* Function cosf vectorized with SSE4.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core.S
index e14bba4a76..602c70e324 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core.S
@@ -1,5 +1,5 @@
 /* Multiple versions of vectorized cosf, vector length is 8.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVdN8v_cosf)
         .type   _ZGVdN8v_cosf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVdN8v_cosf_avx2(%rip), %rax
-        testl   $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVdN8v_cosf_avx2(%rip), %rax
+	HAS_ARCH_FEATURE (AVX2_Usable)
         jz      2f
         ret
 2:      leaq    _ZGVdN8v_cosf_sse_wrapper(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core_avx2.S
index 1efc943295..513f3c0a29 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core_avx2.S
@@ -1,5 +1,5 @@
 /* Function cosf vectorized with AVX2.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S
index 62858eb39e..f990d36483 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S
@@ -1,5 +1,5 @@
 /* Multiple versions of vectorized expf.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -22,14 +22,12 @@
 	.text
 ENTRY (_ZGVeN16v_expf)
         .type   _ZGVeN16v_expf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVeN16v_expf_skx(%rip), %rax
-        testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVeN16v_expf_skx(%rip), %rax
+	HAS_ARCH_FEATURE (AVX512DQ_Usable)
         jnz     2f
         leaq    _ZGVeN16v_expf_knl(%rip), %rax
-        testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
+	HAS_ARCH_FEATURE (AVX512F_Usable)
         jnz     2f
         leaq    _ZGVeN16v_expf_avx2_wrapper(%rip), %rax
 2:      ret
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
index ec69055351..7eb7a1b775 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
@@ -1,5 +1,5 @@
 /* Function expf vectorized with AVX-512. KNL and SKX versions.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core.S
index 37d38bc6f8..2fbe6d475e 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core.S
@@ -1,5 +1,5 @@
 /* Multiple versions of vectorized expf.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVbN4v_expf)
         .type   _ZGVbN4v_expf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVbN4v_expf_sse4(%rip), %rax
-        testl   $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVbN4v_expf_sse4(%rip), %rax
+	HAS_CPU_FEATURE (SSE4_1)
         jz      2f
         ret
 2:      leaq    _ZGVbN4v_expf_sse2(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core_sse4.S
index fcc1859c3a..c6f91e8dc1 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core_sse4.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core_sse4.S
@@ -1,5 +1,5 @@
 /* Function expf vectorized with SSE4.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core.S
index e3dc1b1038..7d19bb423d 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core.S
@@ -1,5 +1,5 @@
 /* Multiple versions of vectorized expf.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVdN8v_expf)
         .type   _ZGVdN8v_expf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVdN8v_expf_avx2(%rip), %rax
-        testl   $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVdN8v_expf_avx2(%rip), %rax
+	HAS_ARCH_FEATURE (AVX2_Usable)
         jz      2f
         ret
 2:      leaq    _ZGVdN8v_expf_sse_wrapper(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core_avx2.S
index c876ecc03e..c6be6954f7 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core_avx2.S
@@ -1,5 +1,5 @@
 /* Function expf vectorized with AVX2.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S
index 68c57e4386..9efb2fb7df 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S
@@ -1,5 +1,5 @@
 /* Multiple versions of vectorized logf.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -22,14 +22,12 @@
 	.text
 ENTRY (_ZGVeN16v_logf)
         .type   _ZGVeN16v_logf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVeN16v_logf_skx(%rip), %rax
-        testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVeN16v_logf_skx(%rip), %rax
+	HAS_ARCH_FEATURE (AVX512DQ_Usable)
         jnz     2f
         leaq    _ZGVeN16v_logf_knl(%rip), %rax
-        testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
+	HAS_ARCH_FEATURE (AVX512F_Usable)
         jnz     2f
         leaq    _ZGVeN16v_logf_avx2_wrapper(%rip), %rax
 2:      ret
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
index 86fcab6e63..6209058381 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
@@ -1,5 +1,5 @@
 /* Function logf vectorized with AVX-512. KNL and SKX versions.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core.S
index 153ed8ebc2..c85615ac25 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core.S
@@ -1,5 +1,5 @@
 /* Multiple versions of vectorized logf.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVbN4v_logf)
         .type   _ZGVbN4v_logf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVbN4v_logf_sse4(%rip), %rax
-        testl   $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVbN4v_logf_sse4(%rip), %rax
+	HAS_CPU_FEATURE (SSE4_1)
         jz      2f
         ret
 2:      leaq    _ZGVbN4v_logf_sse2(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core_sse4.S
index 68f11033d9..1ce9838513 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core_sse4.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core_sse4.S
@@ -1,5 +1,5 @@
 /* Function logf vectorized with SSE4.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core.S
index 6f50bf6bdb..8f6d83dd56 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core.S
@@ -1,5 +1,5 @@
 /* Multiple versions of vectorized logf.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVdN8v_logf)
         .type   _ZGVdN8v_logf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVdN8v_logf_avx2(%rip), %rax
-        testl   $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVdN8v_logf_avx2(%rip), %rax
+	HAS_ARCH_FEATURE (AVX2_Usable)
         jz      2f
         ret
 2:      leaq    _ZGVdN8v_logf_sse_wrapper(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core_avx2.S
index 1f08b4218a..91fb549ce6 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core_avx2.S
@@ -1,5 +1,5 @@
 /* Function logf vectorized with AVX2.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S
index 3aa9f952ce..80048ce977 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S
@@ -1,5 +1,5 @@
 /* Multiple versions of vectorized powf.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -22,14 +22,12 @@
 	.text
 ENTRY (_ZGVeN16vv_powf)
         .type   _ZGVeN16vv_powf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVeN16vv_powf_skx(%rip), %rax
-        testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVeN16vv_powf_skx(%rip), %rax
+	HAS_ARCH_FEATURE (AVX512DQ_Usable)
         jnz     2f
         leaq    _ZGVeN16vv_powf_knl(%rip), %rax
-        testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
+	HAS_ARCH_FEATURE (AVX512F_Usable)
         jnz     2f
         leaq    _ZGVeN16vv_powf_avx2_wrapper(%rip), %rax
 2:      ret
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
index 4b61974cb6..45d48723af 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
@@ -1,5 +1,5 @@
 /* Function powf vectorized with AVX-512. KNL and SKX versions.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core.S
index f88b9ca6d4..b46821189b 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core.S
@@ -1,5 +1,5 @@
 /* Multiple versions of vectorized powf.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVbN4vv_powf)
         .type   _ZGVbN4vv_powf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVbN4vv_powf_sse4(%rip), %rax
-        testl   $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVbN4vv_powf_sse4(%rip), %rax
+	HAS_CPU_FEATURE (SSE4_1)
         jz      2f
         ret
 2:      leaq    _ZGVbN4vv_powf_sse2(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core_sse4.S
index 6068f51c46..420f98c6a6 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core_sse4.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core_sse4.S
@@ -1,5 +1,5 @@
 /* Function powf vectorized with SSE4.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core.S
index 4552e573a9..945908a2ff 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core.S
@@ -1,5 +1,5 @@
 /* Multiple versions of vectorized powf.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVdN8vv_powf)
         .type   _ZGVdN8vv_powf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVdN8vv_powf_avx2(%rip), %rax
-        testl   $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVdN8vv_powf_avx2(%rip), %rax
+	HAS_ARCH_FEATURE (AVX2_Usable)
         jz      2f
         ret
 2:      leaq    _ZGVdN8vv_powf_sse_wrapper(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core_avx2.S
index cfb86c7851..4446859130 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core_avx2.S
@@ -1,5 +1,5 @@
 /* Function powf vectorized with AVX2.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S
index bdcabab6e2..16cee0c676 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S
@@ -1,5 +1,5 @@
 /* Multiple versions of vectorized sincosf.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -22,14 +22,12 @@
 	.text
 ENTRY (_ZGVeN16vvv_sincosf)
         .type   _ZGVeN16vvv_sincosf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVeN16vvv_sincosf_skx(%rip), %rax
-        testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVeN16vvv_sincosf_skx(%rip), %rax
+	HAS_ARCH_FEATURE (AVX512DQ_Usable)
         jnz     2f
         leaq    _ZGVeN16vvv_sincosf_knl(%rip), %rax
-        testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
+	HAS_ARCH_FEATURE (AVX512F_Usable)
         jnz     2f
         leaq    _ZGVeN16vvv_sincosf_avx2_wrapper(%rip), %rax
 2:      ret
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
index efff91bb0d..758aeeaeed 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
@@ -1,5 +1,5 @@
 /* Function sincosf vectorized with AVX-512. KNL and SKX versions.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core.S
index 610046b587..d72b4049e2 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core.S
@@ -1,5 +1,5 @@
 /* Multiple versions of vectorized sincosf.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVbN4vvv_sincosf)
         .type   _ZGVbN4vvv_sincosf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVbN4vvv_sincosf_sse4(%rip), %rax
-        testl   $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVbN4vvv_sincosf_sse4(%rip), %rax
+	HAS_CPU_FEATURE (SSE4_1)
         jz      2f
         ret
 2:      leaq    _ZGVbN4vvv_sincosf_sse2(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core_sse4.S
index 4d846b5d7e..643fc0ca3b 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core_sse4.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core_sse4.S
@@ -1,5 +1,5 @@
 /* Function sincosf vectorized with SSE4.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core.S
index 9e5be67fc9..0123b8024e 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core.S
@@ -1,5 +1,5 @@
 /* Multiple versions of vectorized sincosf.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVdN8vvv_sincosf)
         .type   _ZGVdN8vvv_sincosf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVdN8vvv_sincosf_avx2(%rip), %rax
-        testl   $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVdN8vvv_sincosf_avx2(%rip), %rax
+	HAS_ARCH_FEATURE (AVX2_Usable)
         jz      2f
         ret
 2:      leaq    _ZGVdN8vvv_sincosf_sse_wrapper(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core_avx2.S
index 0108fd5126..f2a0ba7116 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core_avx2.S
@@ -1,5 +1,5 @@
 /* Function sincosf vectorized with AVX2.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S
index 3ec78a0b5e..2212cdd94d 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S
@@ -1,5 +1,5 @@
 /* Multiple versions of vectorized sinf.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -22,14 +22,12 @@
 	.text
 ENTRY (_ZGVeN16v_sinf)
         .type   _ZGVeN16v_sinf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVeN16v_sinf_skx(%rip), %rax
-        testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVeN16v_sinf_skx(%rip), %rax
+	HAS_ARCH_FEATURE (AVX512DQ_Usable)
         jnz     2f
         leaq    _ZGVeN16v_sinf_knl(%rip), %rax
-        testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
+	HAS_ARCH_FEATURE (AVX512F_Usable)
         jnz     2f
         leaq    _ZGVeN16v_sinf_avx2_wrapper(%rip), %rax
 2:      ret
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
index f13ed96af8..61d8d3793a 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
@@ -1,5 +1,5 @@
 /* Function sinf vectorized with AVX-512. KNL and SKX versions.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core.S
index cf1e4df406..b31554730d 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core.S
@@ -1,5 +1,5 @@
 /* Multiple versions of vectorized sinf.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVbN4v_sinf)
         .type   _ZGVbN4v_sinf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVbN4v_sinf_sse4(%rip), %rax
-        testl   $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVbN4v_sinf_sse4(%rip), %rax
+	HAS_CPU_FEATURE (SSE4_1)
         jz      2f
         ret
 2:      leaq    _ZGVbN4v_sinf_sse2(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core_sse4.S
index b8b852bcae..5268ab1f09 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core_sse4.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core_sse4.S
@@ -1,5 +1,5 @@
 /* Function sinf vectorized with SSE4.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core.S
index b28bf3cabc..47fe0a4adc 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core.S
@@ -1,5 +1,5 @@
 /* Multiple versions of vectorized sinf, vector length is 8.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVdN8v_sinf)
         .type   _ZGVdN8v_sinf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
+	LOAD_RTLD_GLOBAL_RO_RDX
 1:      leaq    _ZGVdN8v_sinf_avx2(%rip), %rax
-        testl   $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+	HAS_ARCH_FEATURE (AVX2_Usable)
         jz      2f
         ret
 2:      leaq    _ZGVdN8v_sinf_sse_wrapper(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core_avx2.S
index a130d25fce..9fdaadb2e8 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core_avx2.S
@@ -1,5 +1,5 @@
 /* Function sinf vectorized with AVX2.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/printf_fphex.c b/sysdeps/x86_64/fpu/printf_fphex.c
index 7b900caa88..0fbaa3748e 100644
--- a/sysdeps/x86_64/fpu/printf_fphex.c
+++ b/sysdeps/x86_64/fpu/printf_fphex.c
@@ -1,5 +1,5 @@
 /* Print floating point number in hexadecimal notation according to ISO C99.
-   Copyright (C) 1997-2015 Free Software Foundation, Inc.
+   Copyright (C) 1997-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/s_copysign.S b/sysdeps/x86_64/fpu/s_copysign.S
index 0576343595..18f568f46f 100644
--- a/sysdeps/x86_64/fpu/s_copysign.S
+++ b/sysdeps/x86_64/fpu/s_copysign.S
@@ -1,5 +1,5 @@
 /* copy sign, double version.
-   Copyright (C) 2002-2015 Free Software Foundation, Inc.
+   Copyright (C) 2002-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Andreas Jaeger <aj@suse.de>, 2002.
 
diff --git a/sysdeps/x86_64/fpu/s_copysignf.S b/sysdeps/x86_64/fpu/s_copysignf.S
index 4961afca46..00a1fabaee 100644
--- a/sysdeps/x86_64/fpu/s_copysignf.S
+++ b/sysdeps/x86_64/fpu/s_copysignf.S
@@ -1,5 +1,5 @@
 /* copy sign, double version.
-   Copyright (C) 2002-2015 Free Software Foundation, Inc.
+   Copyright (C) 2002-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Andreas Jaeger <aj@suse.de>, 2002.
 
diff --git a/sysdeps/x86_64/fpu/s_cosf.S b/sysdeps/x86_64/fpu/s_cosf.S
index b7868ceb20..31968e498f 100644
--- a/sysdeps/x86_64/fpu/s_cosf.S
+++ b/sysdeps/x86_64/fpu/s_cosf.S
@@ -1,5 +1,5 @@
 /* Optimized cosf function.
-   Copyright (C) 2012-2015 Free Software Foundation, Inc.
+   Copyright (C) 2012-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -310,8 +310,14 @@ L(arg_inf_or_nan):
 	/* Here if |x| is Inf or NAN */
 	jne	L(skip_errno_setting)	/* in case of x is NaN */
 
+	/* Align stack to 16 bytes.  */
+	subq	$8, %rsp
+	cfi_adjust_cfa_offset (8)
 	/* Here if x is Inf. Set errno to EDOM.  */
 	call	JUMPTARGET(__errno_location)
+	addq	$8, %rsp
+	cfi_adjust_cfa_offset (-8)
+
 	movl	$EDOM, (%rax)
 
 	.p2align	4
diff --git a/sysdeps/x86_64/fpu/s_fabs.c b/sysdeps/x86_64/fpu/s_fabs.c
index 5e4f1b390f..d3a313fdf5 100644
--- a/sysdeps/x86_64/fpu/s_fabs.c
+++ b/sysdeps/x86_64/fpu/s_fabs.c
@@ -1,5 +1,5 @@
 /* Absolute value of floating point number.
-   Copyright (C) 2002-2015 Free Software Foundation, Inc.
+   Copyright (C) 2002-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/s_fabsf.c b/sysdeps/x86_64/fpu/s_fabsf.c
index a80c2589fa..e6dcda9433 100644
--- a/sysdeps/x86_64/fpu/s_fabsf.c
+++ b/sysdeps/x86_64/fpu/s_fabsf.c
@@ -1,5 +1,5 @@
 /* Absolute value of floating point number.
-   Copyright (C) 2002-2015 Free Software Foundation, Inc.
+   Copyright (C) 2002-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/s_fabsl.S b/sysdeps/x86_64/fpu/s_fabsl.S
index 8d4694b978..6881ff11c7 100644
--- a/sysdeps/x86_64/fpu/s_fabsl.S
+++ b/sysdeps/x86_64/fpu/s_fabsl.S
@@ -1,5 +1,5 @@
 /* Absolute value of floating point number.
-   Copyright (C) 2002-2015 Free Software Foundation, Inc.
+   Copyright (C) 2002-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/s_fdiml.S b/sysdeps/x86_64/fpu/s_fdiml.S
index ae7490a2a9..f9f1e20259 100644
--- a/sysdeps/x86_64/fpu/s_fdiml.S
+++ b/sysdeps/x86_64/fpu/s_fdiml.S
@@ -1,5 +1,5 @@
 /* Compute positive difference.
-   Copyright (C) 1997-2015 Free Software Foundation, Inc.
+   Copyright (C) 1997-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
 
diff --git a/sysdeps/x86_64/fpu/s_fmax.S b/sysdeps/x86_64/fpu/s_fmax.S
index 9857ab0b30..02096c0aea 100644
--- a/sysdeps/x86_64/fpu/s_fmax.S
+++ b/sysdeps/x86_64/fpu/s_fmax.S
@@ -1,5 +1,5 @@
 /* Compute maximum of two numbers, regarding NaN as missing argument.
-   Copyright (C) 2002-2015 Free Software Foundation, Inc.
+   Copyright (C) 2002-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Andreas Jaeger <aj@suse.de>, 2002.
 
diff --git a/sysdeps/x86_64/fpu/s_fmaxf.S b/sysdeps/x86_64/fpu/s_fmaxf.S
index 0aa9d20cd2..28e129701e 100644
--- a/sysdeps/x86_64/fpu/s_fmaxf.S
+++ b/sysdeps/x86_64/fpu/s_fmaxf.S
@@ -1,5 +1,5 @@
 /* Compute maximum of two numbers, regarding NaN as missing argument.
-   Copyright (C) 2002-2015 Free Software Foundation, Inc.
+   Copyright (C) 2002-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Andreas Jaeger <aj@suse.de>, 2002.
 
diff --git a/sysdeps/x86_64/fpu/s_fmaxl.S b/sysdeps/x86_64/fpu/s_fmaxl.S
index 11827134c0..f0c2bc0d56 100644
--- a/sysdeps/x86_64/fpu/s_fmaxl.S
+++ b/sysdeps/x86_64/fpu/s_fmaxl.S
@@ -1,5 +1,5 @@
 /* Compute maximum of two numbers, regarding NaN as missing argument.
-   Copyright (C) 1997-2015 Free Software Foundation, Inc.
+   Copyright (C) 1997-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
 
diff --git a/sysdeps/x86_64/fpu/s_fmin.S b/sysdeps/x86_64/fpu/s_fmin.S
index 9bd00a70b2..fb14e2f3ed 100644
--- a/sysdeps/x86_64/fpu/s_fmin.S
+++ b/sysdeps/x86_64/fpu/s_fmin.S
@@ -1,5 +1,5 @@
 /* Compute minimum of two numbers, regarding NaN as missing argument.
-   Copyright (C) 2002-2015 Free Software Foundation, Inc.
+   Copyright (C) 2002-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Andreas Jaeger <aj@suse.de>, 2002.
 
diff --git a/sysdeps/x86_64/fpu/s_fminf.S b/sysdeps/x86_64/fpu/s_fminf.S
index 996c34b1a0..c8d6d0fd33 100644
--- a/sysdeps/x86_64/fpu/s_fminf.S
+++ b/sysdeps/x86_64/fpu/s_fminf.S
@@ -1,5 +1,5 @@
 /* Compute minimum of two numbers, regarding NaN as missing argument.
-   Copyright (C) 2002-2015 Free Software Foundation, Inc.
+   Copyright (C) 2002-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Andreas Jaeger <aj@suse.de>, 2002.
 
diff --git a/sysdeps/x86_64/fpu/s_fminl.S b/sysdeps/x86_64/fpu/s_fminl.S
index be9571b4f1..f1a06d29d7 100644
--- a/sysdeps/x86_64/fpu/s_fminl.S
+++ b/sysdeps/x86_64/fpu/s_fminl.S
@@ -1,5 +1,5 @@
 /* Compute minimum of two numbers, regarding NaN as missing argument.
-   Copyright (C) 1997-2015 Free Software Foundation, Inc.
+   Copyright (C) 1997-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
 
diff --git a/sysdeps/x86_64/fpu/s_llrint.S b/sysdeps/x86_64/fpu/s_llrint.S
index e822c06070..6634c653ea 100644
--- a/sysdeps/x86_64/fpu/s_llrint.S
+++ b/sysdeps/x86_64/fpu/s_llrint.S
@@ -1,6 +1,6 @@
 /* Round argument to nearest integral value according to current rounding
    direction.
-   Copyright (C) 2002-2015 Free Software Foundation, Inc.
+   Copyright (C) 2002-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Andreas Jaeger <aj@suse.d>, 2002.
 
@@ -26,5 +26,7 @@ ENTRY(__llrint)
 	ret
 END(__llrint)
 weak_alias (__llrint, llrint)
+#ifndef __ILP32__
 strong_alias (__llrint, __lrint)
 weak_alias (__llrint, lrint)
+#endif
diff --git a/sysdeps/x86_64/fpu/s_llrintf.S b/sysdeps/x86_64/fpu/s_llrintf.S
index 6825511a57..5ac03dffd9 100644
--- a/sysdeps/x86_64/fpu/s_llrintf.S
+++ b/sysdeps/x86_64/fpu/s_llrintf.S
@@ -1,6 +1,6 @@
 /* Round argument to nearest integral value according to current rounding
    direction.
-   Copyright (C) 2002-2015 Free Software Foundation, Inc.
+   Copyright (C) 2002-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Andreas Jaeger <aj@suse.d>, 2002.
 
@@ -26,5 +26,7 @@ ENTRY(__llrintf)
 	ret
 END(__llrintf)
 weak_alias (__llrintf, llrintf)
+#ifndef __ILP32__
 strong_alias (__llrintf, __lrintf)
 weak_alias (__llrintf, lrintf)
+#endif
diff --git a/sysdeps/x86_64/fpu/s_llrintl.S b/sysdeps/x86_64/fpu/s_llrintl.S
index abe3a5bc0b..5f4d827dff 100644
--- a/sysdeps/x86_64/fpu/s_llrintl.S
+++ b/sysdeps/x86_64/fpu/s_llrintl.S
@@ -1,6 +1,6 @@
 /* Round argument to nearest integral value according to current rounding
    direction.
-   Copyright (C) 1997-2015 Free Software Foundation, Inc.
+   Copyright (C) 1997-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -28,6 +28,7 @@ ENTRY(__llrintl)
 	ret
 END(__llrintl)
 weak_alias (__llrintl, llrintl)
+#ifndef __ILP32__
 strong_alias (__llrintl, __lrintl)
 weak_alias (__llrintl, lrintl)
-
+#endif
diff --git a/sysdeps/x86_64/fpu/s_nearbyintl.S b/sysdeps/x86_64/fpu/s_nearbyintl.S
index dab2750a23..76d41bdd52 100644
--- a/sysdeps/x86_64/fpu/s_nearbyintl.S
+++ b/sysdeps/x86_64/fpu/s_nearbyintl.S
@@ -8,14 +8,16 @@
 
 ENTRY(__nearbyintl)
 	fldt	8(%rsp)
-	fnstcw	-4(%rsp)
-	movl	-4(%rsp), %eax
+	fnstenv	-28(%rsp)
+	movl	-28(%rsp), %eax
 	orl	$0x20, %eax
-	movl	%eax, -8(%rsp)
-	fldcw	-8(%rsp)
+	movl	%eax, -32(%rsp)
+	fldcw	-32(%rsp)
 	frndint
-	fclex
-	fldcw	-4(%rsp)
+	fnstsw
+	andl	$0x1, %eax
+	orl	%eax, -24(%rsp)
+	fldenv	-28(%rsp)
 	ret
 END (__nearbyintl)
 weak_alias (__nearbyintl, nearbyintl)
diff --git a/sysdeps/x86_64/fpu/s_scalbnl.S b/sysdeps/x86_64/fpu/s_scalbnl.S
index d0e9301eed..6c7683c32b 100644
--- a/sysdeps/x86_64/fpu/s_scalbnl.S
+++ b/sysdeps/x86_64/fpu/s_scalbnl.S
@@ -15,4 +15,3 @@ ENTRY(__scalbnl)
 	fstp	%st(1)
 	ret
 END (__scalbnl)
-weak_alias (__scalbnl, scalbnl)
diff --git a/sysdeps/x86_64/fpu/s_signbit.S b/sysdeps/x86_64/fpu/s_signbit.S
index a327c45330..92a79d3123 100644
--- a/sysdeps/x86_64/fpu/s_signbit.S
+++ b/sysdeps/x86_64/fpu/s_signbit.S
@@ -1,5 +1,5 @@
 /* Return nonzero value if number is negative.
-   Copyright (C) 2009-2015 Free Software Foundation, Inc.
+   Copyright (C) 2009-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@redha.com>, 2009.
 
diff --git a/sysdeps/x86_64/fpu/s_signbitf.S b/sysdeps/x86_64/fpu/s_signbitf.S
index 90994705c7..885645372e 100644
--- a/sysdeps/x86_64/fpu/s_signbitf.S
+++ b/sysdeps/x86_64/fpu/s_signbitf.S
@@ -1,5 +1,5 @@
 /* Return nonzero value if number is negative.
-   Copyright (C) 2009-2015 Free Software Foundation, Inc.
+   Copyright (C) 2009-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@redha.com>, 2009.
 
diff --git a/sysdeps/x86_64/fpu/s_sincosf.S b/sysdeps/x86_64/fpu/s_sincosf.S
index 21db70a88b..5e7cbe57e3 100644
--- a/sysdeps/x86_64/fpu/s_sincosf.S
+++ b/sysdeps/x86_64/fpu/s_sincosf.S
@@ -1,5 +1,5 @@
 /* Optimized sincosf function.
-   Copyright (C) 2012-2015 Free Software Foundation, Inc.
+   Copyright (C) 2012-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -354,8 +354,14 @@ L(arg_inf_or_nan):
 	/* Here if |x| is Inf or NAN */
 	jne	L(skip_errno_setting)	/* in case of x is NaN */
 
+	/* Align stack to 16 bytes.  */
+	subq	$8, %rsp
+	cfi_adjust_cfa_offset (8)
 	/* Here if x is Inf. Set errno to EDOM.  */
 	call	JUMPTARGET(__errno_location)
+	addq	$8, %rsp
+	cfi_adjust_cfa_offset (-8)
+
 	movl	$EDOM, (%rax)
 
 	.p2align	4
diff --git a/sysdeps/x86_64/fpu/s_sinf.S b/sysdeps/x86_64/fpu/s_sinf.S
index dc921641de..c980c6e207 100644
--- a/sysdeps/x86_64/fpu/s_sinf.S
+++ b/sysdeps/x86_64/fpu/s_sinf.S
@@ -1,5 +1,5 @@
 /* Optimized sinf function.
-   Copyright (C) 2012-2015 Free Software Foundation, Inc.
+   Copyright (C) 2012-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -336,8 +336,14 @@ L(arg_inf_or_nan):
 	/* Here if |x| is Inf or NAN */
 	jne	L(skip_errno_setting)	/* in case of x is NaN */
 
+	/* Align stack to 16 bytes.  */
+	subq	$8, %rsp
+	cfi_adjust_cfa_offset (8)
 	/* Here if x is Inf. Set errno to EDOM.  */
 	call	JUMPTARGET(__errno_location)
+	addq	$8, %rsp
+	cfi_adjust_cfa_offset (-8)
+
 	movl	$EDOM, (%rax)
 
 	.p2align	4
diff --git a/sysdeps/x86_64/fpu/s_truncl.S b/sysdeps/x86_64/fpu/s_truncl.S
index 6ba4a27cad..c37cf00241 100644
--- a/sysdeps/x86_64/fpu/s_truncl.S
+++ b/sysdeps/x86_64/fpu/s_truncl.S
@@ -1,5 +1,5 @@
 /* Truncate long double value.
-   Copyright (C) 1997-2015 Free Software Foundation, Inc.
+   Copyright (C) 1997-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
 
diff --git a/sysdeps/x86_64/fpu/svml_d_cos2_core.S b/sysdeps/x86_64/fpu/svml_d_cos2_core.S
index a26beca4a1..7f62d29917 100644
--- a/sysdeps/x86_64/fpu/svml_d_cos2_core.S
+++ b/sysdeps/x86_64/fpu/svml_d_cos2_core.S
@@ -1,5 +1,5 @@
 /* Function cos vectorized with SSE2.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_d_cos4_core.S b/sysdeps/x86_64/fpu/svml_d_cos4_core.S
index 35996b7318..b92ff13b86 100644
--- a/sysdeps/x86_64/fpu/svml_d_cos4_core.S
+++ b/sysdeps/x86_64/fpu/svml_d_cos4_core.S
@@ -1,5 +1,5 @@
 /* Function cos vectorized with AVX2, wrapper version.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_d_cos4_core_avx.S b/sysdeps/x86_64/fpu/svml_d_cos4_core_avx.S
index bf10b01cc5..a3da721e35 100644
--- a/sysdeps/x86_64/fpu/svml_d_cos4_core_avx.S
+++ b/sysdeps/x86_64/fpu/svml_d_cos4_core_avx.S
@@ -1,5 +1,5 @@
 /* Function cos vectorized in AVX ISA as wrapper to SSE4 ISA version.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_d_cos8_core.S b/sysdeps/x86_64/fpu/svml_d_cos8_core.S
index 1ba10e8c9b..e5d986d11a 100644
--- a/sysdeps/x86_64/fpu/svml_d_cos8_core.S
+++ b/sysdeps/x86_64/fpu/svml_d_cos8_core.S
@@ -1,5 +1,5 @@
 /* Function cos vectorized with AVX-512, wrapper to AVX2.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_d_exp2_core.S b/sysdeps/x86_64/fpu/svml_d_exp2_core.S
index ca3dd76364..9e511037a1 100644
--- a/sysdeps/x86_64/fpu/svml_d_exp2_core.S
+++ b/sysdeps/x86_64/fpu/svml_d_exp2_core.S
@@ -1,5 +1,5 @@
 /* Function exp vectorized with SSE2.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_d_exp4_core.S b/sysdeps/x86_64/fpu/svml_d_exp4_core.S
index d497811980..8cac8adbc7 100644
--- a/sysdeps/x86_64/fpu/svml_d_exp4_core.S
+++ b/sysdeps/x86_64/fpu/svml_d_exp4_core.S
@@ -1,5 +1,5 @@
 /* Function exp vectorized with AVX2, wrapper version.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_d_exp4_core_avx.S b/sysdeps/x86_64/fpu/svml_d_exp4_core_avx.S
index 5dd2f6cd17..1a0fbf574a 100644
--- a/sysdeps/x86_64/fpu/svml_d_exp4_core_avx.S
+++ b/sysdeps/x86_64/fpu/svml_d_exp4_core_avx.S
@@ -1,5 +1,5 @@
 /* Function exp vectorized in AVX ISA as wrapper to SSE4 ISA version.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_d_exp8_core.S b/sysdeps/x86_64/fpu/svml_d_exp8_core.S
index 3e273a3e71..2486e888a4 100644
--- a/sysdeps/x86_64/fpu/svml_d_exp8_core.S
+++ b/sysdeps/x86_64/fpu/svml_d_exp8_core.S
@@ -1,5 +1,5 @@
 /* Function exp vectorized with AVX-512. Wrapper to AVX2 version.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_d_exp_data.S b/sysdeps/x86_64/fpu/svml_d_exp_data.S
index 66fa3b88d7..6d1acbdd21 100644
--- a/sysdeps/x86_64/fpu/svml_d_exp_data.S
+++ b/sysdeps/x86_64/fpu/svml_d_exp_data.S
@@ -1,5 +1,5 @@
 /* Data for vector function exp.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_d_exp_data.h b/sysdeps/x86_64/fpu/svml_d_exp_data.h
index 71ebdb799e..f993403d47 100644
--- a/sysdeps/x86_64/fpu/svml_d_exp_data.h
+++ b/sysdeps/x86_64/fpu/svml_d_exp_data.h
@@ -1,5 +1,5 @@
 /* Offsets for data table for function exp.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_d_log2_core.S b/sysdeps/x86_64/fpu/svml_d_log2_core.S
index daa63b583f..8ea40fee56 100644
--- a/sysdeps/x86_64/fpu/svml_d_log2_core.S
+++ b/sysdeps/x86_64/fpu/svml_d_log2_core.S
@@ -1,5 +1,5 @@
 /* Function log vectorized with SSE2.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_d_log4_core.S b/sysdeps/x86_64/fpu/svml_d_log4_core.S
index 009c93c837..72813d8921 100644
--- a/sysdeps/x86_64/fpu/svml_d_log4_core.S
+++ b/sysdeps/x86_64/fpu/svml_d_log4_core.S
@@ -1,5 +1,5 @@
 /* Function log vectorized with AVX2, wrapper version.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_d_log4_core_avx.S b/sysdeps/x86_64/fpu/svml_d_log4_core_avx.S
index 554fc45712..6ca1139931 100644
--- a/sysdeps/x86_64/fpu/svml_d_log4_core_avx.S
+++ b/sysdeps/x86_64/fpu/svml_d_log4_core_avx.S
@@ -1,5 +1,5 @@
 /* Function log vectorized in AVX ISA as wrapper to SSE4 ISA version.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_d_log8_core.S b/sysdeps/x86_64/fpu/svml_d_log8_core.S
index 9728305f17..6850fd9a44 100644
--- a/sysdeps/x86_64/fpu/svml_d_log8_core.S
+++ b/sysdeps/x86_64/fpu/svml_d_log8_core.S
@@ -1,5 +1,5 @@
 /* Function log vectorized with AVX-512. Wrapper to AVX2 version.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_d_log_data.S b/sysdeps/x86_64/fpu/svml_d_log_data.S
index 1ce78e2c8b..9ab541b23f 100644
--- a/sysdeps/x86_64/fpu/svml_d_log_data.S
+++ b/sysdeps/x86_64/fpu/svml_d_log_data.S
@@ -1,5 +1,5 @@
 /* Data for function log.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_d_log_data.h b/sysdeps/x86_64/fpu/svml_d_log_data.h
index 8ca55a8010..30c2b54a4b 100644
--- a/sysdeps/x86_64/fpu/svml_d_log_data.h
+++ b/sysdeps/x86_64/fpu/svml_d_log_data.h
@@ -1,5 +1,5 @@
 /* Offsets for data table for function log.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_d_pow2_core.S b/sysdeps/x86_64/fpu/svml_d_pow2_core.S
index 0b726a1eaf..b25515c825 100644
--- a/sysdeps/x86_64/fpu/svml_d_pow2_core.S
+++ b/sysdeps/x86_64/fpu/svml_d_pow2_core.S
@@ -1,5 +1,5 @@
 /* Function pow vectorized with SSE2.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_d_pow4_core.S b/sysdeps/x86_64/fpu/svml_d_pow4_core.S
index 9eb47ab8c9..547993799e 100644
--- a/sysdeps/x86_64/fpu/svml_d_pow4_core.S
+++ b/sysdeps/x86_64/fpu/svml_d_pow4_core.S
@@ -1,5 +1,5 @@
 /* Function pow vectorized with AVX2, wrapper version.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_d_pow4_core_avx.S b/sysdeps/x86_64/fpu/svml_d_pow4_core_avx.S
index 6c7b59995d..4e4e9867b4 100644
--- a/sysdeps/x86_64/fpu/svml_d_pow4_core_avx.S
+++ b/sysdeps/x86_64/fpu/svml_d_pow4_core_avx.S
@@ -1,5 +1,5 @@
 /* Function pow vectorized in AVX ISA as wrapper to SSE4 ISA version.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_d_pow8_core.S b/sysdeps/x86_64/fpu/svml_d_pow8_core.S
index cd99457843..372e5a9c83 100644
--- a/sysdeps/x86_64/fpu/svml_d_pow8_core.S
+++ b/sysdeps/x86_64/fpu/svml_d_pow8_core.S
@@ -1,5 +1,5 @@
 /* Function pow vectorized with AVX-512. Wrapper to AVX2 version.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_d_pow_data.S b/sysdeps/x86_64/fpu/svml_d_pow_data.S
index 1e0733e0a6..8481f95455 100644
--- a/sysdeps/x86_64/fpu/svml_d_pow_data.S
+++ b/sysdeps/x86_64/fpu/svml_d_pow_data.S
@@ -1,5 +1,5 @@
 /* Data for function pow.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_d_pow_data.h b/sysdeps/x86_64/fpu/svml_d_pow_data.h
index a1b9f9bc46..239ba96984 100644
--- a/sysdeps/x86_64/fpu/svml_d_pow_data.h
+++ b/sysdeps/x86_64/fpu/svml_d_pow_data.h
@@ -1,5 +1,5 @@
 /* Offsets for data table for function pow.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_d_sin2_core.S b/sysdeps/x86_64/fpu/svml_d_sin2_core.S
index c619dab966..f6ec13104b 100644
--- a/sysdeps/x86_64/fpu/svml_d_sin2_core.S
+++ b/sysdeps/x86_64/fpu/svml_d_sin2_core.S
@@ -1,5 +1,5 @@
 /* Function sin vectorized with SSE2.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_d_sin4_core.S b/sysdeps/x86_64/fpu/svml_d_sin4_core.S
index f650d461a5..95a1dec6f6 100644
--- a/sysdeps/x86_64/fpu/svml_d_sin4_core.S
+++ b/sysdeps/x86_64/fpu/svml_d_sin4_core.S
@@ -1,5 +1,5 @@
 /* Function sin vectorized with AVX2, wrapper version.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_d_sin4_core_avx.S b/sysdeps/x86_64/fpu/svml_d_sin4_core_avx.S
index a21ffafa32..29d1526a12 100644
--- a/sysdeps/x86_64/fpu/svml_d_sin4_core_avx.S
+++ b/sysdeps/x86_64/fpu/svml_d_sin4_core_avx.S
@@ -1,5 +1,5 @@
 /* Function sin vectorized in AVX ISA as wrapper to SSE4 ISA version.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_d_sin8_core.S b/sysdeps/x86_64/fpu/svml_d_sin8_core.S
index 2e78b5e35a..abd86b3d98 100644
--- a/sysdeps/x86_64/fpu/svml_d_sin8_core.S
+++ b/sysdeps/x86_64/fpu/svml_d_sin8_core.S
@@ -1,5 +1,5 @@
 /* Function sin vectorized with AVX-512, wrapper to AVX2 version.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_d_sincos2_core.S b/sysdeps/x86_64/fpu/svml_d_sincos2_core.S
index bd089e1ed0..74afa0a677 100644
--- a/sysdeps/x86_64/fpu/svml_d_sincos2_core.S
+++ b/sysdeps/x86_64/fpu/svml_d_sincos2_core.S
@@ -1,5 +1,5 @@
 /* Function sincos vectorized with SSE2.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_d_sincos4_core.S b/sysdeps/x86_64/fpu/svml_d_sincos4_core.S
index d67cd30132..2c0b011fb3 100644
--- a/sysdeps/x86_64/fpu/svml_d_sincos4_core.S
+++ b/sysdeps/x86_64/fpu/svml_d_sincos4_core.S
@@ -1,5 +1,5 @@
 /* Function sincos vectorized with AVX2, wrapper version.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_d_sincos4_core_avx.S b/sysdeps/x86_64/fpu/svml_d_sincos4_core_avx.S
index 4f3f15aea6..e4320a97c7 100644
--- a/sysdeps/x86_64/fpu/svml_d_sincos4_core_avx.S
+++ b/sysdeps/x86_64/fpu/svml_d_sincos4_core_avx.S
@@ -1,5 +1,5 @@
 /* Function sincos vectorized in AVX ISA as wrapper to SSE4 ISA version.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_d_sincos8_core.S b/sysdeps/x86_64/fpu/svml_d_sincos8_core.S
index e7f7121fa0..68d490e5bc 100644
--- a/sysdeps/x86_64/fpu/svml_d_sincos8_core.S
+++ b/sysdeps/x86_64/fpu/svml_d_sincos8_core.S
@@ -1,5 +1,5 @@
 /* Function sincos vectorized with AVX-512. Wrapper to AVX2 version.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_d_trig_data.S b/sysdeps/x86_64/fpu/svml_d_trig_data.S
index d3b30598cc..887dacee91 100644
--- a/sysdeps/x86_64/fpu/svml_d_trig_data.S
+++ b/sysdeps/x86_64/fpu/svml_d_trig_data.S
@@ -1,5 +1,5 @@
 /* Data for vectorized sin, cos, sincos.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_d_trig_data.h b/sysdeps/x86_64/fpu/svml_d_trig_data.h
index 1395337c7e..4617b5e0c3 100644
--- a/sysdeps/x86_64/fpu/svml_d_trig_data.h
+++ b/sysdeps/x86_64/fpu/svml_d_trig_data.h
@@ -1,5 +1,5 @@
 /* Offsets for data table for vectorized sin, cos, sincos.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h b/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h
index 5c0ff897c0..54f4f58371 100644
--- a/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h
+++ b/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h
@@ -1,5 +1,5 @@
 /* Wrapper implementations of vector math functions.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_finite_alias.S b/sysdeps/x86_64/fpu/svml_finite_alias.S
new file mode 100644
index 0000000000..2dcfc37590
--- /dev/null
+++ b/sysdeps/x86_64/fpu/svml_finite_alias.S
@@ -0,0 +1,58 @@
+/* These aliases added as workaround to exclude unnecessary symbol
+   aliases in libmvec.so while compiler creates the vector names
+   based on scalar asm name.  Corresponding discussion is at
+   <https://gcc.gnu.org/ml/gcc/2015-06/msg00173.html>.
+   Copyright (C) 2015-2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#define ALIAS_IMPL(alias, target) \
+ENTRY (alias); \
+	jmp *target@GOTPCREL(%rip); \
+END (alias)
+
+	.text
+ALIAS_IMPL (_ZGVbN2v___log_finite, _ZGVbN2v_log)
+ALIAS_IMPL (_ZGVcN4v___log_finite, _ZGVcN4v_log)
+ALIAS_IMPL (_ZGVdN4v___log_finite, _ZGVdN4v_log)
+ALIAS_IMPL (_ZGVeN8v___log_finite, _ZGVeN8v_log)
+
+ALIAS_IMPL (_ZGVbN4v___logf_finite, _ZGVbN4v_logf)
+ALIAS_IMPL (_ZGVcN8v___logf_finite, _ZGVcN8v_logf)
+ALIAS_IMPL (_ZGVdN8v___logf_finite, _ZGVdN8v_logf)
+ALIAS_IMPL (_ZGVeN16v___logf_finite, _ZGVeN16v_logf)
+
+ALIAS_IMPL (_ZGVbN2v___exp_finite, _ZGVbN2v_exp)
+ALIAS_IMPL (_ZGVcN4v___exp_finite, _ZGVcN4v_exp)
+ALIAS_IMPL (_ZGVdN4v___exp_finite, _ZGVdN4v_exp)
+ALIAS_IMPL (_ZGVeN8v___exp_finite, _ZGVeN8v_exp)
+
+ALIAS_IMPL (_ZGVbN4v___expf_finite, _ZGVbN4v_expf)
+ALIAS_IMPL (_ZGVcN8v___expf_finite, _ZGVcN8v_expf)
+ALIAS_IMPL (_ZGVdN8v___expf_finite, _ZGVdN8v_expf)
+ALIAS_IMPL (_ZGVeN16v___expf_finite, _ZGVeN16v_expf)
+
+ALIAS_IMPL (_ZGVbN2vv___pow_finite, _ZGVbN2vv_pow)
+ALIAS_IMPL (_ZGVcN4vv___pow_finite, _ZGVcN4vv_pow)
+ALIAS_IMPL (_ZGVdN4vv___pow_finite, _ZGVdN4vv_pow)
+ALIAS_IMPL (_ZGVeN8vv___pow_finite, _ZGVeN8vv_pow)
+
+ALIAS_IMPL (_ZGVbN4vv___powf_finite, _ZGVbN4vv_powf)
+ALIAS_IMPL (_ZGVcN8vv___powf_finite, _ZGVcN8vv_powf)
+ALIAS_IMPL (_ZGVdN8vv___powf_finite, _ZGVdN8vv_powf)
+ALIAS_IMPL (_ZGVeN16vv___powf_finite, _ZGVeN16vv_powf)
diff --git a/sysdeps/x86_64/fpu/svml_s_cosf16_core.S b/sysdeps/x86_64/fpu/svml_s_cosf16_core.S
index e623df5dc3..9ca4fbfaa8 100644
--- a/sysdeps/x86_64/fpu/svml_s_cosf16_core.S
+++ b/sysdeps/x86_64/fpu/svml_s_cosf16_core.S
@@ -1,5 +1,5 @@
 /* Function cosf vectorized with AVX-512. Wrapper to AVX2 version.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_s_cosf4_core.S b/sysdeps/x86_64/fpu/svml_s_cosf4_core.S
index 9875cd7f71..363090c54a 100644
--- a/sysdeps/x86_64/fpu/svml_s_cosf4_core.S
+++ b/sysdeps/x86_64/fpu/svml_s_cosf4_core.S
@@ -1,5 +1,5 @@
 /* Function cosf vectorized with SSE2, wrapper version.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_s_cosf8_core.S b/sysdeps/x86_64/fpu/svml_s_cosf8_core.S
index 376ee358ae..26a6a4e4d6 100644
--- a/sysdeps/x86_64/fpu/svml_s_cosf8_core.S
+++ b/sysdeps/x86_64/fpu/svml_s_cosf8_core.S
@@ -1,5 +1,5 @@
 /* Function cosf vectorized with AVX2, wrapper version.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_s_cosf8_core_avx.S b/sysdeps/x86_64/fpu/svml_s_cosf8_core_avx.S
index a443fd28ad..6c210d98ce 100644
--- a/sysdeps/x86_64/fpu/svml_s_cosf8_core_avx.S
+++ b/sysdeps/x86_64/fpu/svml_s_cosf8_core_avx.S
@@ -1,5 +1,5 @@
 /* Function cosf vectorized in AVX ISA as wrapper to SSE4 ISA version.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_s_expf16_core.S b/sysdeps/x86_64/fpu/svml_s_expf16_core.S
index d9d355c372..d8eecac674 100644
--- a/sysdeps/x86_64/fpu/svml_s_expf16_core.S
+++ b/sysdeps/x86_64/fpu/svml_s_expf16_core.S
@@ -1,5 +1,5 @@
 /* Function expf vectorized with AVX-512. Wrapper to AVX2 version.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_s_expf4_core.S b/sysdeps/x86_64/fpu/svml_s_expf4_core.S
index 71c5da4657..65b5d1a3ce 100644
--- a/sysdeps/x86_64/fpu/svml_s_expf4_core.S
+++ b/sysdeps/x86_64/fpu/svml_s_expf4_core.S
@@ -1,5 +1,5 @@
 /* Function expf vectorized with SSE2.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_s_expf8_core.S b/sysdeps/x86_64/fpu/svml_s_expf8_core.S
index d254a992a4..e3cf975bf6 100644
--- a/sysdeps/x86_64/fpu/svml_s_expf8_core.S
+++ b/sysdeps/x86_64/fpu/svml_s_expf8_core.S
@@ -1,5 +1,5 @@
 /* Function expf vectorized with AVX2, wrapper version.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_s_expf8_core_avx.S b/sysdeps/x86_64/fpu/svml_s_expf8_core_avx.S
index ece40ba972..90469d7dcf 100644
--- a/sysdeps/x86_64/fpu/svml_s_expf8_core_avx.S
+++ b/sysdeps/x86_64/fpu/svml_s_expf8_core_avx.S
@@ -1,5 +1,5 @@
 /* Function expf vectorized in AVX ISA as wrapper to SSE4 ISA version.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_s_expf_data.S b/sysdeps/x86_64/fpu/svml_s_expf_data.S
index eee9d69e31..4b644082b6 100644
--- a/sysdeps/x86_64/fpu/svml_s_expf_data.S
+++ b/sysdeps/x86_64/fpu/svml_s_expf_data.S
@@ -1,5 +1,5 @@
 /* Data for function expf.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_s_expf_data.h b/sysdeps/x86_64/fpu/svml_s_expf_data.h
index beaa290540..3610633c96 100644
--- a/sysdeps/x86_64/fpu/svml_s_expf_data.h
+++ b/sysdeps/x86_64/fpu/svml_s_expf_data.h
@@ -1,5 +1,5 @@
 /* Offsets for data table for vector function expf.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_s_logf16_core.S b/sysdeps/x86_64/fpu/svml_s_logf16_core.S
index 47ae7855a3..cc2e97df78 100644
--- a/sysdeps/x86_64/fpu/svml_s_logf16_core.S
+++ b/sysdeps/x86_64/fpu/svml_s_logf16_core.S
@@ -1,5 +1,5 @@
 /* Function logf vectorized with AVX-512. Wrapper to AVX2 version.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_s_logf4_core.S b/sysdeps/x86_64/fpu/svml_s_logf4_core.S
index 09be406d3c..195f328d92 100644
--- a/sysdeps/x86_64/fpu/svml_s_logf4_core.S
+++ b/sysdeps/x86_64/fpu/svml_s_logf4_core.S
@@ -1,5 +1,5 @@
 /* Function logf vectorized with SSE2.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_s_logf8_core.S b/sysdeps/x86_64/fpu/svml_s_logf8_core.S
index cf4e9be537..8bb6926667 100644
--- a/sysdeps/x86_64/fpu/svml_s_logf8_core.S
+++ b/sysdeps/x86_64/fpu/svml_s_logf8_core.S
@@ -1,5 +1,5 @@
 /* Function logf vectorized with AVX2, wrapper version.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_s_logf8_core_avx.S b/sysdeps/x86_64/fpu/svml_s_logf8_core_avx.S
index 7ab572bb30..c2efba23f2 100644
--- a/sysdeps/x86_64/fpu/svml_s_logf8_core_avx.S
+++ b/sysdeps/x86_64/fpu/svml_s_logf8_core_avx.S
@@ -1,5 +1,5 @@
 /* Function logf vectorized in AVX ISA as wrapper to SSE4 ISA version.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_s_logf_data.S b/sysdeps/x86_64/fpu/svml_s_logf_data.S
index 1e7f7015d3..a5675f5c7a 100644
--- a/sysdeps/x86_64/fpu/svml_s_logf_data.S
+++ b/sysdeps/x86_64/fpu/svml_s_logf_data.S
@@ -1,5 +1,5 @@
 /* Data for vector function logf.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_s_logf_data.h b/sysdeps/x86_64/fpu/svml_s_logf_data.h
index d42411a849..619d5c4bd1 100644
--- a/sysdeps/x86_64/fpu/svml_s_logf_data.h
+++ b/sysdeps/x86_64/fpu/svml_s_logf_data.h
@@ -1,5 +1,5 @@
 /* Offsets for data table for vectorized function logf.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_s_powf16_core.S b/sysdeps/x86_64/fpu/svml_s_powf16_core.S
index efd84c2fff..cb52af0c6b 100644
--- a/sysdeps/x86_64/fpu/svml_s_powf16_core.S
+++ b/sysdeps/x86_64/fpu/svml_s_powf16_core.S
@@ -1,5 +1,5 @@
 /* Function powf vectorized with AVX-512. Wrapper to AVX2 version.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_s_powf4_core.S b/sysdeps/x86_64/fpu/svml_s_powf4_core.S
index 81f0f530de..88fae60892 100644
--- a/sysdeps/x86_64/fpu/svml_s_powf4_core.S
+++ b/sysdeps/x86_64/fpu/svml_s_powf4_core.S
@@ -1,5 +1,5 @@
 /* Function powf vectorized with SSE2.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_s_powf8_core.S b/sysdeps/x86_64/fpu/svml_s_powf8_core.S
index 8fed6c7c86..8ea44897c1 100644
--- a/sysdeps/x86_64/fpu/svml_s_powf8_core.S
+++ b/sysdeps/x86_64/fpu/svml_s_powf8_core.S
@@ -1,5 +1,5 @@
 /* Function powf vectorized with AVX2, wrapper version.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_s_powf8_core_avx.S b/sysdeps/x86_64/fpu/svml_s_powf8_core_avx.S
index eeeb66d46e..b5e4e5e6ef 100644
--- a/sysdeps/x86_64/fpu/svml_s_powf8_core_avx.S
+++ b/sysdeps/x86_64/fpu/svml_s_powf8_core_avx.S
@@ -1,5 +1,5 @@
 /* Function powf vectorized in AVX ISA as wrapper to SSE4 ISA version.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_s_powf_data.S b/sysdeps/x86_64/fpu/svml_s_powf_data.S
index 4a4799ae4f..fc1a3d9390 100644
--- a/sysdeps/x86_64/fpu/svml_s_powf_data.S
+++ b/sysdeps/x86_64/fpu/svml_s_powf_data.S
@@ -1,5 +1,5 @@
 /* Data for function powf.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_s_powf_data.h b/sysdeps/x86_64/fpu/svml_s_powf_data.h
index d847368e4b..514004238a 100644
--- a/sysdeps/x86_64/fpu/svml_s_powf_data.h
+++ b/sysdeps/x86_64/fpu/svml_s_powf_data.h
@@ -1,5 +1,5 @@
 /* Offsets for data table for function powf.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_s_sincosf16_core.S b/sysdeps/x86_64/fpu/svml_s_sincosf16_core.S
index 992f9a91cc..5cbf10b8da 100644
--- a/sysdeps/x86_64/fpu/svml_s_sincosf16_core.S
+++ b/sysdeps/x86_64/fpu/svml_s_sincosf16_core.S
@@ -1,5 +1,5 @@
 /* Function sincosf vectorized with AVX-512. Wrapper to AVX2 version.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_s_sincosf4_core.S b/sysdeps/x86_64/fpu/svml_s_sincosf4_core.S
index d402ffba15..1a7d2733af 100644
--- a/sysdeps/x86_64/fpu/svml_s_sincosf4_core.S
+++ b/sysdeps/x86_64/fpu/svml_s_sincosf4_core.S
@@ -1,5 +1,5 @@
 /* Function sincosf vectorized with SSE2.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_s_sincosf8_core.S b/sysdeps/x86_64/fpu/svml_s_sincosf8_core.S
index eec7de87d5..74d1dfd1a8 100644
--- a/sysdeps/x86_64/fpu/svml_s_sincosf8_core.S
+++ b/sysdeps/x86_64/fpu/svml_s_sincosf8_core.S
@@ -1,5 +1,5 @@
 /* Function sincosf vectorized with AVX2, wrapper version.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_s_sincosf8_core_avx.S b/sysdeps/x86_64/fpu/svml_s_sincosf8_core_avx.S
index c247444dfc..55b8b2d768 100644
--- a/sysdeps/x86_64/fpu/svml_s_sincosf8_core_avx.S
+++ b/sysdeps/x86_64/fpu/svml_s_sincosf8_core_avx.S
@@ -1,5 +1,5 @@
 /* Function sincosf vectorized in AVX ISA as wrapper to SSE4 ISA version.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_s_sinf16_core.S b/sysdeps/x86_64/fpu/svml_s_sinf16_core.S
index add6e0fd43..d7a31e1ea6 100644
--- a/sysdeps/x86_64/fpu/svml_s_sinf16_core.S
+++ b/sysdeps/x86_64/fpu/svml_s_sinf16_core.S
@@ -1,5 +1,5 @@
 /* Function sinf vectorized with AVX-512. Wrapper to AVX2 version.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_s_sinf4_core.S b/sysdeps/x86_64/fpu/svml_s_sinf4_core.S
index 2349c7b788..6f10137134 100644
--- a/sysdeps/x86_64/fpu/svml_s_sinf4_core.S
+++ b/sysdeps/x86_64/fpu/svml_s_sinf4_core.S
@@ -1,5 +1,5 @@
 /* Function sinf vectorized with SSE2.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_s_sinf8_core.S b/sysdeps/x86_64/fpu/svml_s_sinf8_core.S
index fe31e3793e..c459658688 100644
--- a/sysdeps/x86_64/fpu/svml_s_sinf8_core.S
+++ b/sysdeps/x86_64/fpu/svml_s_sinf8_core.S
@@ -1,5 +1,5 @@
 /* Function sinf vectorized with AVX2, wrapper version.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_s_sinf8_core_avx.S b/sysdeps/x86_64/fpu/svml_s_sinf8_core_avx.S
index f54be48ee3..5e95aa2e02 100644
--- a/sysdeps/x86_64/fpu/svml_s_sinf8_core_avx.S
+++ b/sysdeps/x86_64/fpu/svml_s_sinf8_core_avx.S
@@ -1,5 +1,5 @@
 /* Function sinf vectorized in AVX ISA as wrapper to SSE4 ISA version.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_s_trig_data.S b/sysdeps/x86_64/fpu/svml_s_trig_data.S
index 07fc7d272d..b61aa6abb9 100644
--- a/sysdeps/x86_64/fpu/svml_s_trig_data.S
+++ b/sysdeps/x86_64/fpu/svml_s_trig_data.S
@@ -1,5 +1,5 @@
 /* Data for function cosf.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_s_trig_data.h b/sysdeps/x86_64/fpu/svml_s_trig_data.h
index 5a91dad41c..2e469a918a 100644
--- a/sysdeps/x86_64/fpu/svml_s_trig_data.h
+++ b/sysdeps/x86_64/fpu/svml_s_trig_data.h
@@ -1,5 +1,5 @@
 /* Offsets for data table for vectorized sinf, cosf, sincosf.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h b/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h
index d255d195ee..b1a03be3d9 100644
--- a/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h
+++ b/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h
@@ -1,5 +1,5 @@
 /* Wrapper implementations of vector math functions.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c
index 4e764f2475..a9d15979aa 100644
--- a/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c
@@ -1,5 +1,5 @@
 /* Wrapper part of tests for SSE ISA versions of vector math functions.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/test-double-vlen2.c b/sysdeps/x86_64/fpu/test-double-vlen2.c
index 2b6896425e..c7a3dff747 100644
--- a/sysdeps/x86_64/fpu/test-double-vlen2.c
+++ b/sysdeps/x86_64/fpu/test-double-vlen2.c
@@ -1,5 +1,5 @@
 /* Tests for SSE ISA versions of vector math functions.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c
index bc2fd16c5a..eb6a531502 100644
--- a/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c
@@ -1,5 +1,5 @@
 /* Wrapper part of tests for AVX2 ISA versions of vector math functions.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/test-double-vlen4-avx2.c b/sysdeps/x86_64/fpu/test-double-vlen4-avx2.c
index 56723ab4d7..0cadef03d6 100644
--- a/sysdeps/x86_64/fpu/test-double-vlen4-avx2.c
+++ b/sysdeps/x86_64/fpu/test-double-vlen4-avx2.c
@@ -1,5 +1,5 @@
 /* Tests for AVX2 ISA versions of vector math functions.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c
index a711c9e1c3..52b81da3ee 100644
--- a/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c
@@ -1,5 +1,5 @@
 /* Wrapper part of tests for AVX ISA versions of vector math functions.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/test-double-vlen4.c b/sysdeps/x86_64/fpu/test-double-vlen4.c
index f0813437b4..9ae97f1388 100644
--- a/sysdeps/x86_64/fpu/test-double-vlen4.c
+++ b/sysdeps/x86_64/fpu/test-double-vlen4.c
@@ -1,5 +1,5 @@
 /* Tests for AVX ISA versions of vector math functions.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c
index 942c42b83b..c10bb9cb4a 100644
--- a/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c
@@ -1,5 +1,5 @@
 /* Wrapper part of tests for AVX-512 versions of vector math functions.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/test-double-vlen8.c b/sysdeps/x86_64/fpu/test-double-vlen8.c
index 1e23b83418..4fb6c8d196 100644
--- a/sysdeps/x86_64/fpu/test-double-vlen8.c
+++ b/sysdeps/x86_64/fpu/test-double-vlen8.c
@@ -1,5 +1,5 @@
 /* Tests for AVX-512 versions of vector math functions.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c
index bc98e78ff0..dc09e4a338 100644
--- a/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c
@@ -1,5 +1,5 @@
 /* Wrapper part of tests for AVX-512 ISA versions of vector math functions.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/test-float-vlen16.c b/sysdeps/x86_64/fpu/test-float-vlen16.c
index d7f683f09c..882bfc840d 100644
--- a/sysdeps/x86_64/fpu/test-float-vlen16.c
+++ b/sysdeps/x86_64/fpu/test-float-vlen16.c
@@ -1,5 +1,5 @@
 /* Tests for AVX-512 ISA versions of vector math functions.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c
index 39254efed4..0bb9818146 100644
--- a/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c
@@ -1,5 +1,5 @@
 /* Wrapper part of tests for SSE ISA versions of vector math functions.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/test-float-vlen4.c b/sysdeps/x86_64/fpu/test-float-vlen4.c
index e56d64260e..f6a4cf5c1e 100644
--- a/sysdeps/x86_64/fpu/test-float-vlen4.c
+++ b/sysdeps/x86_64/fpu/test-float-vlen4.c
@@ -1,5 +1,5 @@
 /* Tests for SSE ISA versions of vector math functions.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c
index 6bd0d50779..4985ac2379 100644
--- a/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c
@@ -1,5 +1,5 @@
 /* Wrapper part of tests for AVX2 ISA versions of vector math functions.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/test-float-vlen8-avx2.c b/sysdeps/x86_64/fpu/test-float-vlen8-avx2.c
index 0012082b8e..7a416385b6 100644
--- a/sysdeps/x86_64/fpu/test-float-vlen8-avx2.c
+++ b/sysdeps/x86_64/fpu/test-float-vlen8-avx2.c
@@ -1,5 +1,5 @@
 /* Tests for AVX2 ISA versions of vector math functions.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c
index 2fec906de0..9cc2883399 100644
--- a/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c
@@ -1,5 +1,5 @@
 /* Wrapper part of tests for AVX ISA versions of vector math functions.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/test-float-vlen8.c b/sysdeps/x86_64/fpu/test-float-vlen8.c
index 891e58ff88..c92a50ae7e 100644
--- a/sysdeps/x86_64/fpu/test-float-vlen8.c
+++ b/sysdeps/x86_64/fpu/test-float-vlen8.c
@@ -1,5 +1,5 @@
 /* Tests for AVX ISA versions of vector math functions.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/fpu/x86_64-math-asm.h b/sysdeps/x86_64/fpu/x86_64-math-asm.h
new file mode 100644
index 0000000000..db3f9f78b0
--- /dev/null
+++ b/sysdeps/x86_64/fpu/x86_64-math-asm.h
@@ -0,0 +1,74 @@
+/* Helper macros for x86_64 libm functions.
+   Copyright (C) 2015-2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _X86_64_MATH_ASM_H
+#define _X86_64_MATH_ASM_H 1
+
+/* Define constants for the minimum value of a floating-point
+   type.  */
+#define DEFINE_LDBL_MIN					\
+	.section .rodata.cst16,"aM",@progbits,16;	\
+	.p2align 4;					\
+	.type ldbl_min,@object;				\
+ldbl_min:						\
+	.byte 0, 0, 0, 0, 0, 0, 0, 0x80, 0x1, 0;	\
+	.byte 0, 0, 0, 0, 0, 0;				\
+	.size ldbl_min, .-ldbl_min;
+
+/* Force an underflow exception if the given value (nonnegative or
+   NaN) is subnormal.  The relevant constant for the minimum of the
+   type must have been defined, the MO macro must have been defined
+   for access to memory operands, and, if PIC, the PIC register must
+   have been loaded.  */
+#define LDBL_CHECK_FORCE_UFLOW_NONNEG_NAN	\
+	fldt	MO(ldbl_min);			\
+	fld	%st(1);				\
+	fucomip	%st(1), %st(0);			\
+	fstp	%st(0);				\
+	jnc 6464f;				\
+	fld	%st(0);				\
+	fmul	%st(0);				\
+	fstp	%st(0);				\
+6464:
+
+/* Likewise, but the argument is not a NaN.  */
+#define LDBL_CHECK_FORCE_UFLOW_NONNAN		\
+	fldt	MO(ldbl_min);			\
+	fld	%st(1);				\
+	fabs;					\
+	fcomip	%st(1), %st(0);			\
+	fstp	%st(0);				\
+	jnc 6464f;				\
+	fld	%st(0);				\
+	fmul	%st(0);				\
+	fstp	%st(0);				\
+6464:
+
+/* Likewise, but the argument is nonnegative and not a NaN.  */
+#define LDBL_CHECK_FORCE_UFLOW_NONNEG		\
+	fldt	MO(ldbl_min);			\
+	fld	%st(1);				\
+	fcomip	%st(1), %st(0);			\
+	fstp	%st(0);				\
+	jnc 6464f;				\
+	fld	%st(0);				\
+	fmul	%st(0);				\
+	fstp	%st(0);				\
+6464:
+
+#endif /* x86_64-math-asm.h.  */
diff --git a/sysdeps/x86_64/hp-timing.h b/sysdeps/x86_64/hp-timing.h
index 493f9735bd..65381b314d 100644
--- a/sysdeps/x86_64/hp-timing.h
+++ b/sysdeps/x86_64/hp-timing.h
@@ -1,5 +1,5 @@
 /* High precision, low overhead timing functions.  x86-64 version.
-   Copyright (C) 2002-2015 Free Software Foundation, Inc.
+   Copyright (C) 2002-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/htonl.S b/sysdeps/x86_64/htonl.S
index 85a690f5b1..c92fae8791 100644
--- a/sysdeps/x86_64/htonl.S
+++ b/sysdeps/x86_64/htonl.S
@@ -1,5 +1,5 @@
 /* Change byte order in word.  For AMD x86-64.
-   Copyright (C) 1997-2015 Free Software Foundation, Inc.
+   Copyright (C) 1997-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/ifuncmain8.c b/sysdeps/x86_64/ifuncmain8.c
new file mode 100644
index 0000000000..448ab96bfa
--- /dev/null
+++ b/sysdeps/x86_64/ifuncmain8.c
@@ -0,0 +1,32 @@
+/* Test IFUNC selector with floating-point parameters.
+   Copyright (C) 2015-2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <stdlib.h>
+
+extern float foo (float);
+
+static int
+do_test (void)
+{
+  if (foo (2) != 3)
+    abort ();
+  return 0;
+}
+
+#define TEST_FUNCTION do_test ()
+#include "../test-skeleton.c"
diff --git a/sysdeps/x86_64/ifuncmod8.c b/sysdeps/x86_64/ifuncmod8.c
new file mode 100644
index 0000000000..c00436799c
--- /dev/null
+++ b/sysdeps/x86_64/ifuncmod8.c
@@ -0,0 +1,36 @@
+/* Test IFUNC selector with floating-point parameters.
+   Copyright (C) 2015-2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <emmintrin.h>
+
+void * foo_ifunc (void) __asm__ ("foo");
+__asm__(".type foo, %gnu_indirect_function");
+
+static float
+foo_impl (float x)
+{
+  return x + 1;
+}
+
+void *
+foo_ifunc (void)
+{
+  __m128i xmm = _mm_set1_epi32 (-1);
+  asm volatile ("movdqa %0, %%xmm0" : : "x" (xmm) : "xmm0" );
+  return foo_impl;
+}
diff --git a/sysdeps/x86_64/jmpbuf-offsets.h b/sysdeps/x86_64/jmpbuf-offsets.h
index 03176a91f0..da71e555f7 100644
--- a/sysdeps/x86_64/jmpbuf-offsets.h
+++ b/sysdeps/x86_64/jmpbuf-offsets.h
@@ -1,5 +1,5 @@
 /* Private macros for accessing __jmp_buf contents.  x86-64 version.
-   Copyright (C) 2006-2015 Free Software Foundation, Inc.
+   Copyright (C) 2006-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/jmpbuf-unwind.h b/sysdeps/x86_64/jmpbuf-unwind.h
index 3d9b2b589f..aa0642b54a 100644
--- a/sysdeps/x86_64/jmpbuf-unwind.h
+++ b/sysdeps/x86_64/jmpbuf-unwind.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2003-2015 Free Software Foundation, Inc.
+/* Copyright (C) 2003-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Jakub Jelinek <jakub@redhat.com>, 2003.
 
diff --git a/sysdeps/x86_64/ldsodefs.h b/sysdeps/x86_64/ldsodefs.h
index 84d36e82be..6a96c53721 100644
--- a/sysdeps/x86_64/ldsodefs.h
+++ b/sysdeps/x86_64/ldsodefs.h
@@ -1,5 +1,5 @@
 /* Run-time dynamic linker data structures for loaded ELF shared objects.
-   Copyright (C) 1995-2015 Free Software Foundation, Inc.
+   Copyright (C) 1995-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -20,6 +20,7 @@
 #define	_X86_64_LDSODEFS_H	1
 
 #include <elf.h>
+#include <cpu-features.h>
 
 struct La_x86_64_regs;
 struct La_x86_64_retval;
diff --git a/sysdeps/x86_64/localplt.data b/sysdeps/x86_64/localplt.data
index d140476dfe..f168b143ff 100644
--- a/sysdeps/x86_64/localplt.data
+++ b/sysdeps/x86_64/localplt.data
@@ -3,17 +3,17 @@
 # users can define their own functions and have library internals call them.
 # Linker in binutils 2.26 and newer consolidates R_X86_64_JUMP_SLOT
 # relocation with R_X86_64_GLOB_DAT relocation against the same symbol.
-libc.so: calloc
+libc.so: calloc + RELA R_X86_64_GLOB_DAT
 libc.so: free + RELA R_X86_64_GLOB_DAT
 libc.so: malloc + RELA R_X86_64_GLOB_DAT
-libc.so: memalign
-libc.so: realloc
+libc.so: memalign + RELA R_X86_64_GLOB_DAT
+libc.so: realloc + RELA R_X86_64_GLOB_DAT
 libm.so: matherr
 # The dynamic loader uses __libc_memalign internally to allocate aligned
 # TLS storage. The other malloc family of functions are expected to allow
 # user symbol interposition.
-ld.so: __libc_memalign
-ld.so: malloc
-ld.so: calloc
-ld.so: realloc
+ld.so: __libc_memalign + RELA R_X86_64_GLOB_DAT
+ld.so: malloc + RELA R_X86_64_GLOB_DAT
+ld.so: calloc + RELA R_X86_64_GLOB_DAT
+ld.so: realloc + RELA R_X86_64_GLOB_DAT
 ld.so: free + RELA R_X86_64_GLOB_DAT
diff --git a/sysdeps/x86_64/lshift.S b/sysdeps/x86_64/lshift.S
index 03fb631207..49cbfbaf3d 100644
--- a/sysdeps/x86_64/lshift.S
+++ b/sysdeps/x86_64/lshift.S
@@ -1,5 +1,5 @@
 /* x86-64 __mpn_lshift --
-   Copyright (C) 2007-2015 Free Software Foundation, Inc.
+   Copyright (C) 2007-2016 Free Software Foundation, Inc.
    This file is part of the GNU MP Library.
 
    The GNU MP Library is free software; you can redistribute it and/or modify
diff --git a/sysdeps/x86_64/machine-gmon.h b/sysdeps/x86_64/machine-gmon.h
index 51cf7793b4..3d9ce5c44e 100644
--- a/sysdeps/x86_64/machine-gmon.h
+++ b/sysdeps/x86_64/machine-gmon.h
@@ -1,5 +1,5 @@
 /* x86-64-specific implementation of profiling support.
-   Copyright (C) 2002-2015 Free Software Foundation, Inc.
+   Copyright (C) 2002-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Andreas Jaeger <aj@suse.de>, 2002.
 
diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
index fae85caae1..132eacba8f 100644
--- a/sysdeps/x86_64/memchr.S
+++ b/sysdeps/x86_64/memchr.S
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2015 Free Software Foundation, Inc.
+/* Copyright (C) 2011-2016 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
 
diff --git a/sysdeps/x86_64/memcmp.S b/sysdeps/x86_64/memcmp.S
index f636716b64..3fb018a772 100644
--- a/sysdeps/x86_64/memcmp.S
+++ b/sysdeps/x86_64/memcmp.S
@@ -1,5 +1,5 @@
 /* memcmp with SSE2
-   Copyright (C) 2009-2015 Free Software Foundation, Inc.
+   Copyright (C) 2009-2016 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
 
diff --git a/sysdeps/x86_64/memcpy.S b/sysdeps/x86_64/memcpy.S
index eea8c2a5af..f6e3d9396c 100644
--- a/sysdeps/x86_64/memcpy.S
+++ b/sysdeps/x86_64/memcpy.S
@@ -1,7 +1,7 @@
 /*
    Optimized memcpy for x86-64.
 
-   Copyright (C) 2007-2015 Free Software Foundation, Inc.
+   Copyright (C) 2007-2016 Free Software Foundation, Inc.
    Contributed by Evandro Menezes <evandro.menezes@amd.com>, 2007.
 
    This file is part of the GNU C Library.
diff --git a/sysdeps/x86_64/memcpy_chk.S b/sysdeps/x86_64/memcpy_chk.S
index f1a5ac4b23..2296b55119 100644
--- a/sysdeps/x86_64/memcpy_chk.S
+++ b/sysdeps/x86_64/memcpy_chk.S
@@ -1,5 +1,5 @@
 /* Checking memcpy for x86-64.
-   Copyright (C) 2004-2015 Free Software Foundation, Inc.
+   Copyright (C) 2004-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/memmove.c b/sysdeps/x86_64/memmove.c
index e0694a859f..07f81852d6 100644
--- a/sysdeps/x86_64/memmove.c
+++ b/sysdeps/x86_64/memmove.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2015 Free Software Foundation, Inc.
+/* Copyright (C) 2011-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/mempcpy_chk.S b/sysdeps/x86_64/mempcpy_chk.S
index 968e7edf3f..390abc68dd 100644
--- a/sysdeps/x86_64/mempcpy_chk.S
+++ b/sysdeps/x86_64/mempcpy_chk.S
@@ -1,5 +1,5 @@
 /* Checking mempcpy for x86-64.
-   Copyright (C) 2004-2015 Free Software Foundation, Inc.
+   Copyright (C) 2004-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S
index 707b8390db..840de30cd7 100644
--- a/sysdeps/x86_64/memrchr.S
+++ b/sysdeps/x86_64/memrchr.S
@@ -1,6 +1,6 @@
 /* fast SSE2 memrchr with 64 byte loop and pmaxub instruction using
 
-   Copyright (C) 2011-2015 Free Software Foundation, Inc.
+   Copyright (C) 2011-2016 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
 
diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index e4962546c4..4cf0da0fb8 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -1,6 +1,6 @@
 /* memset/bzero -- set memory area to CH/0
    Optimized version for x86-64.
-   Copyright (C) 2002-2015 Free Software Foundation, Inc.
+   Copyright (C) 2002-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -24,7 +24,7 @@
 ENTRY(__bzero)
 	movq	%rdi, %rax /* Set return value.  */
 	movq	%rsi, %rdx /* Set n.  */
-	pxor	%xmm8, %xmm8
+	pxor	%xmm0, %xmm0
 	jmp	L(entry_from_bzero)
 END(__bzero)
 weak_alias (__bzero, bzero)
@@ -33,10 +33,10 @@ weak_alias (__bzero, bzero)
 ENTRY(__memset_tail)
 	movq	%rcx, %rax /* Set return value.  */
 
-	movd	%esi, %xmm8
-	punpcklbw	%xmm8, %xmm8
-	punpcklwd	%xmm8, %xmm8
-	pshufd	$0, %xmm8, %xmm8
+	movd	%esi, %xmm0
+	punpcklbw	%xmm0, %xmm0
+	punpcklwd	%xmm0, %xmm0
+	pshufd	$0, %xmm0, %xmm0
 
 	jmp	L(entry_from_bzero)
 END(__memset_tail)
@@ -50,57 +50,57 @@ END_CHK (__memset_chk)
 #endif
 
 ENTRY (memset)
-	movd	%esi, %xmm8
+	movd	%esi, %xmm0
 	movq	%rdi, %rax
-	punpcklbw	%xmm8, %xmm8
-	punpcklwd	%xmm8, %xmm8
-	pshufd	$0, %xmm8, %xmm8
+	punpcklbw	%xmm0, %xmm0
+	punpcklwd	%xmm0, %xmm0
+	pshufd	$0, %xmm0, %xmm0
 L(entry_from_bzero):
 	cmpq	$64, %rdx
 	ja	L(loop_start)
 	cmpq	$16, %rdx
 	jbe	L(less_16_bytes)
 	cmpq	$32, %rdx
-	movdqu	%xmm8, (%rdi)
-	movdqu	%xmm8, -16(%rdi,%rdx)
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm0, -16(%rdi,%rdx)
 	ja	L(between_32_64_bytes)
 L(return):
 	rep
 	ret
 	.p2align 4
 L(between_32_64_bytes):
-	movdqu	%xmm8, 16(%rdi)
-	movdqu	%xmm8, -32(%rdi,%rdx)
+	movdqu	%xmm0, 16(%rdi)
+	movdqu	%xmm0, -32(%rdi,%rdx)
 	ret
 	.p2align 4
 L(loop_start):
 	leaq	64(%rdi), %rcx
-	movdqu	%xmm8, (%rdi)
+	movdqu	%xmm0, (%rdi)
 	andq	$-64, %rcx
-	movdqu	%xmm8, -16(%rdi,%rdx)
-	movdqu	%xmm8, 16(%rdi)
-	movdqu	%xmm8, -32(%rdi,%rdx)
-	movdqu	%xmm8, 32(%rdi)
-	movdqu	%xmm8, -48(%rdi,%rdx)
-	movdqu	%xmm8, 48(%rdi)
-	movdqu	%xmm8, -64(%rdi,%rdx)
+	movdqu	%xmm0, -16(%rdi,%rdx)
+	movdqu	%xmm0, 16(%rdi)
+	movdqu	%xmm0, -32(%rdi,%rdx)
+	movdqu	%xmm0, 32(%rdi)
+	movdqu	%xmm0, -48(%rdi,%rdx)
+	movdqu	%xmm0, 48(%rdi)
+	movdqu	%xmm0, -64(%rdi,%rdx)
 	addq	%rdi, %rdx
 	andq	$-64, %rdx
 	cmpq	%rdx, %rcx
 	je	L(return)
 	.p2align 4
 L(loop):
-	movdqa	%xmm8, (%rcx)
-	movdqa	%xmm8, 16(%rcx)
-	movdqa	%xmm8, 32(%rcx)
-	movdqa	%xmm8, 48(%rcx)
+	movdqa	%xmm0, (%rcx)
+	movdqa	%xmm0, 16(%rcx)
+	movdqa	%xmm0, 32(%rcx)
+	movdqa	%xmm0, 48(%rcx)
 	addq	$64, %rcx
 	cmpq	%rcx, %rdx
 	jne	L(loop)
 	rep
 	ret
 L(less_16_bytes):
-	movq %xmm8, %rcx
+	movq %xmm0, %rcx
 	testb	$24, %dl
 	jne	L(between8_16bytes)
 	testb	$4, %dl
diff --git a/sysdeps/x86_64/memset_chk.S b/sysdeps/x86_64/memset_chk.S
index 70204267ca..95bb5d0e94 100644
--- a/sysdeps/x86_64/memset_chk.S
+++ b/sysdeps/x86_64/memset_chk.S
@@ -1,5 +1,5 @@
 /* Checking memset for x86-64.
-   Copyright (C) 2004-2015 Free Software Foundation, Inc.
+   Copyright (C) 2004-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/memusage.h b/sysdeps/x86_64/memusage.h
index e915c1a672..fc102c4252 100644
--- a/sysdeps/x86_64/memusage.h
+++ b/sysdeps/x86_64/memusage.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2001-2015 Free Software Foundation, Inc.
+/* Copyright (C) 2001-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/mul_1.S b/sysdeps/x86_64/mul_1.S
index 2fb8fad2bd..88b8f920a1 100644
--- a/sysdeps/x86_64/mul_1.S
+++ b/sysdeps/x86_64/mul_1.S
@@ -1,6 +1,6 @@
 /* AMD64 __mpn_mul_1 -- Multiply a limb vector with a limb and store
    the result in a second limb vector.
-   Copyright (C) 2003-2015 Free Software Foundation, Inc.
+   Copyright (C) 2003-2016 Free Software Foundation, Inc.
    This file is part of the GNU MP Library.
 
    The GNU MP Library is free software; you can redistribute it and/or modify
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index d7002a9df3..d234f4ab66 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -1,5 +1,4 @@
 ifeq ($(subdir),csu)
-aux += init-arch
 tests += test-multiarch
 gen-as-const-headers += ifunc-defines.sym
 endif
@@ -8,31 +7,26 @@ ifeq ($(subdir),string)
 
 sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
 		   strcmp-sse2-unaligned strncmp-ssse3 \
-		   memcmp-sse4 memcpy-ssse3 \
-		   memcpy-sse2-unaligned mempcpy-ssse3 \
-		   memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
-		   memmove-avx-unaligned memcpy-avx-unaligned mempcpy-avx-unaligned \
-		   memmove-ssse3-back strcasecmp_l-ssse3 \
+		   memcmp-sse4 memcpy-ssse3 memcpy-sse2-unaligned \
+		   memcpy-avx512-no-vzeroupper mempcpy-ssse3 memmove-ssse3 \
+		   memcpy-ssse3-back mempcpy-ssse3-back memmove-avx-unaligned \
+		   memcpy-avx-unaligned mempcpy-avx-unaligned \
+		   mempcpy-avx512-no-vzeroupper memmove-ssse3-back \
+		   memmove-avx512-no-vzeroupper strcasecmp_l-ssse3 \
 		   strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
 		   strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
 		   strcpy-sse2-unaligned strncpy-sse2-unaligned \
 		   stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
 		   strcat-sse2-unaligned strncat-sse2-unaligned \
-		   strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned
-
-ifeq (yes,$(config-cflags-sse4))
-sysdep_routines += strcspn-c strpbrk-c strspn-c varshift
+		   strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \
+		   strcspn-c strpbrk-c strspn-c varshift memset-avx2 \
+		   memset-avx512-no-vzeroupper
 CFLAGS-varshift.c += -msse4
 CFLAGS-strcspn-c.c += -msse4
 CFLAGS-strpbrk-c.c += -msse4
 CFLAGS-strspn-c.c += -msse4
 endif
 
-ifeq (yes,$(config-cflags-avx2))
-sysdep_routines += memset-avx2
-endif
-endif
-
 ifeq ($(subdir),wcsmbs)
 sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c wcscpy-ssse3 wcscpy-c
 endif
diff --git a/sysdeps/x86_64/multiarch/Versions b/sysdeps/x86_64/multiarch/Versions
deleted file mode 100644
index 59b185ac8d..0000000000
--- a/sysdeps/x86_64/multiarch/Versions
+++ /dev/null
@@ -1,5 +0,0 @@
-libc {
-  GLIBC_PRIVATE {
-    __get_cpu_features;
-  }
-}
diff --git a/sysdeps/x86_64/multiarch/cacheinfo.c b/sysdeps/x86_64/multiarch/cacheinfo.c
deleted file mode 100644
index f87b8dce6b..0000000000
--- a/sysdeps/x86_64/multiarch/cacheinfo.c
+++ /dev/null
@@ -1,2 +0,0 @@
-#define DISABLE_PREFERRED_MEMORY_INSTRUCTION
-#include "../cacheinfo.c"
diff --git a/sysdeps/x86_64/multiarch/ifunc-defines.sym b/sysdeps/x86_64/multiarch/ifunc-defines.sym
index a410d8808f..3df946f343 100644
--- a/sysdeps/x86_64/multiarch/ifunc-defines.sym
+++ b/sysdeps/x86_64/multiarch/ifunc-defines.sym
@@ -4,7 +4,6 @@
 --
 
 CPU_FEATURES_SIZE	sizeof (struct cpu_features)
-KIND_OFFSET		offsetof (struct cpu_features, kind)
 CPUID_OFFSET		offsetof (struct cpu_features, cpuid)
 CPUID_SIZE		sizeof (struct cpuid_registers)
 CPUID_EAX_OFFSET	offsetof (struct cpuid_registers, eax)
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index b64e4f1532..188b6d36c6 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -1,5 +1,5 @@
 /* Enumerate available IFUNC implementations of a function.  x86-64 version.
-   Copyright (C) 2012-2015 Free Software Foundation, Inc.
+   Copyright (C) 2012-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -20,10 +20,11 @@
 #include <string.h>
 #include <wchar.h>
 #include <ifunc-impl-list.h>
+#include <sysdep.h>
 #include "init-arch.h"
 
 /* Maximum number of IFUNC implementations.  */
-#define MAX_IFUNC	4
+#define MAX_IFUNC	5
 
 /* Fill ARRAY of MAX elements with IFUNC implementations for function
    NAME supported on target machine and return the number of valid
@@ -39,48 +40,77 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/memcmp.S.  */
   IFUNC_IMPL (i, name, memcmp,
-	      IFUNC_IMPL_ADD (array, i, memcmp, HAS_SSE4_1,
+	      IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSE4_1),
 			      __memcmp_sse4_1)
-	      IFUNC_IMPL_ADD (array, i, memcmp, HAS_SSSE3, __memcmp_ssse3)
+	      IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSSE3),
+			      __memcmp_ssse3)
 	      IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2))
 
-  /* Support sysdeps/x86_64/multiarch/memmove_chk.S.  */
+  /* Support sysdeps/x86_64/multiarch/memmove_chk.c.  */
   IFUNC_IMPL (i, name, __memmove_chk,
-	      IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_AVX,
+#ifdef HAVE_AVX512_ASM_SUPPORT
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __memmove_chk_avx512_no_vzeroupper)
+#endif
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __memmove_chk_avx_unaligned)
-	      IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+			      HAS_CPU_FEATURE (SSSE3),
 			      __memmove_chk_ssse3_back)
-	      IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+			      HAS_CPU_FEATURE (SSSE3),
 			      __memmove_chk_ssse3)
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
 			      __memmove_chk_sse2))
 
   /* Support sysdeps/x86_64/multiarch/memmove.S.  */
   IFUNC_IMPL (i, name, memmove,
-	      IFUNC_IMPL_ADD (array, i, memmove, HAS_AVX,
+	      IFUNC_IMPL_ADD (array, i, memmove,
+			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __memmove_avx_unaligned)
-	      IFUNC_IMPL_ADD (array, i, memmove, HAS_SSSE3,
+#ifdef HAVE_AVX512_ASM_SUPPORT
+	      IFUNC_IMPL_ADD (array, i, memmove,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __memmove_avx512_no_vzeroupper)
+#endif
+	      IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
 			      __memmove_ssse3_back)
-	      IFUNC_IMPL_ADD (array, i, memmove, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
 			      __memmove_ssse3)
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_sse2))
 
-#ifdef HAVE_AVX2_SUPPORT
   /* Support sysdeps/x86_64/multiarch/memset_chk.S.  */
   IFUNC_IMPL (i, name, __memset_chk,
-	      IFUNC_IMPL_ADD (array, i, __memset_chk, 1, __memset_chk_sse2)
-	      IFUNC_IMPL_ADD (array, i, __memset_chk, HAS_AVX2,
-			      __memset_chk_avx2))
+	      IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
+			      __memset_chk_sse2)
+	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+			      HAS_ARCH_FEATURE (AVX2_Usable),
+			      __memset_chk_avx2)
+#ifdef HAVE_AVX512_ASM_SUPPORT
+	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __memset_chk_avx512_no_vzeroupper)
+#endif
+	      )
 
   /* Support sysdeps/x86_64/multiarch/memset.S.  */
   IFUNC_IMPL (i, name, memset,
 	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_sse2)
-	      IFUNC_IMPL_ADD (array, i, memset, HAS_AVX2, __memset_avx2))
+	      IFUNC_IMPL_ADD (array, i, memset,
+			      HAS_ARCH_FEATURE (AVX2_Usable),
+			      __memset_avx2)
+#ifdef HAVE_AVX512_ASM_SUPPORT
+	      IFUNC_IMPL_ADD (array, i, memset,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __memset_avx512_no_vzeroupper)
 #endif
+	     )
 
   /* Support sysdeps/x86_64/multiarch/stpncpy.S.  */
   IFUNC_IMPL (i, name, stpncpy,
-	      IFUNC_IMPL_ADD (array, i, stpncpy, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, stpncpy, HAS_CPU_FEATURE (SSSE3),
 			      __stpncpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, stpncpy, 1,
 			      __stpncpy_sse2_unaligned)
@@ -88,38 +118,42 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/stpcpy.S.  */
   IFUNC_IMPL (i, name, stpcpy,
-	      IFUNC_IMPL_ADD (array, i, stpcpy, HAS_SSSE3, __stpcpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, stpcpy, HAS_CPU_FEATURE (SSSE3),
+			      __stpcpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2))
 
   /* Support sysdeps/x86_64/multiarch/strcasecmp_l.S.  */
   IFUNC_IMPL (i, name, strcasecmp,
-	      IFUNC_IMPL_ADD (array, i, strcasecmp, HAS_AVX,
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __strcasecmp_avx)
-	      IFUNC_IMPL_ADD (array, i, strcasecmp, HAS_SSE4_2,
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      HAS_CPU_FEATURE (SSE4_2),
 			      __strcasecmp_sse42)
-	      IFUNC_IMPL_ADD (array, i, strcasecmp, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      HAS_CPU_FEATURE (SSSE3),
 			      __strcasecmp_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strcasecmp, 1, __strcasecmp_sse2))
 
   /* Support sysdeps/x86_64/multiarch/strcasecmp_l.S.  */
   IFUNC_IMPL (i, name, strcasecmp_l,
-	      IFUNC_IMPL_ADD (array, i, strcasecmp_l, HAS_AVX,
+	      IFUNC_IMPL_ADD (array, i, strcasecmp_l,
+			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __strcasecmp_l_avx)
-	      IFUNC_IMPL_ADD (array, i, strcasecmp_l, HAS_SSE4_2,
+	      IFUNC_IMPL_ADD (array, i, strcasecmp_l,
+			      HAS_CPU_FEATURE (SSE4_2),
 			      __strcasecmp_l_sse42)
-	      IFUNC_IMPL_ADD (array, i, strcasecmp_l, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, strcasecmp_l,
+			      HAS_CPU_FEATURE (SSSE3),
 			      __strcasecmp_l_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strcasecmp_l, 1,
 			      __strcasecmp_l_sse2))
 
-  /* Support sysdeps/x86_64/multiarch/strcasestr.c.  */
-  IFUNC_IMPL (i, name, strcasestr,
-	      IFUNC_IMPL_ADD (array, i, strcasestr, 1, __strcasestr_sse2))
-
   /* Support sysdeps/x86_64/multiarch/strcat.S.  */
   IFUNC_IMPL (i, name, strcat,
-	      IFUNC_IMPL_ADD (array, i, strcat, HAS_SSSE3, __strcat_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strcat, HAS_CPU_FEATURE (SSSE3),
+			      __strcat_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2))
 
@@ -130,48 +164,57 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strcmp.S.  */
   IFUNC_IMPL (i, name, strcmp,
-	      IFUNC_IMPL_ADD (array, i, strcmp, HAS_SSE4_2, __strcmp_sse42)
-	      IFUNC_IMPL_ADD (array, i, strcmp, HAS_SSSE3, __strcmp_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSE4_2),
+			      __strcmp_sse42)
+	      IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSSE3),
+			      __strcmp_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2))
 
   /* Support sysdeps/x86_64/multiarch/strcpy.S.  */
   IFUNC_IMPL (i, name, strcpy,
-	      IFUNC_IMPL_ADD (array, i, strcpy, HAS_SSSE3, __strcpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, strcpy, HAS_CPU_FEATURE (SSSE3),
+			      __strcpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2))
 
   /* Support sysdeps/x86_64/multiarch/strcspn.S.  */
   IFUNC_IMPL (i, name, strcspn,
-	      IFUNC_IMPL_ADD (array, i, strcspn, HAS_SSE4_2,
+	      IFUNC_IMPL_ADD (array, i, strcspn, HAS_CPU_FEATURE (SSE4_2),
 			      __strcspn_sse42)
 	      IFUNC_IMPL_ADD (array, i, strcspn, 1, __strcspn_sse2))
 
   /* Support sysdeps/x86_64/multiarch/strncase_l.S.  */
   IFUNC_IMPL (i, name, strncasecmp,
-	      IFUNC_IMPL_ADD (array, i, strncasecmp, HAS_AVX,
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __strncasecmp_avx)
-	      IFUNC_IMPL_ADD (array, i, strncasecmp, HAS_SSE4_2,
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      HAS_CPU_FEATURE (SSE4_2),
 			      __strncasecmp_sse42)
-	      IFUNC_IMPL_ADD (array, i, strncasecmp, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      HAS_CPU_FEATURE (SSSE3),
 			      __strncasecmp_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strncasecmp, 1,
 			      __strncasecmp_sse2))
 
   /* Support sysdeps/x86_64/multiarch/strncase_l.S.  */
   IFUNC_IMPL (i, name, strncasecmp_l,
-	      IFUNC_IMPL_ADD (array, i, strncasecmp_l, HAS_AVX,
+	      IFUNC_IMPL_ADD (array, i, strncasecmp_l,
+			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __strncasecmp_l_avx)
-	      IFUNC_IMPL_ADD (array, i, strncasecmp_l, HAS_SSE4_2,
+	      IFUNC_IMPL_ADD (array, i, strncasecmp_l,
+			      HAS_CPU_FEATURE (SSE4_2),
 			      __strncasecmp_l_sse42)
-	      IFUNC_IMPL_ADD (array, i, strncasecmp_l, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, strncasecmp_l,
+			      HAS_CPU_FEATURE (SSSE3),
 			      __strncasecmp_l_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strncasecmp_l, 1,
 			      __strncasecmp_l_sse2))
 
   /* Support sysdeps/x86_64/multiarch/strncat.S.  */
   IFUNC_IMPL (i, name, strncat,
-	      IFUNC_IMPL_ADD (array, i, strncat, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, strncat, HAS_CPU_FEATURE (SSSE3),
 			      __strncat_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strncat, 1,
 			      __strncat_sse2_unaligned)
@@ -179,7 +222,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strncpy.S.  */
   IFUNC_IMPL (i, name, strncpy,
-	      IFUNC_IMPL_ADD (array, i, strncpy, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, strncpy, HAS_CPU_FEATURE (SSSE3),
 			      __strncpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strncpy, 1,
 			      __strncpy_sse2_unaligned)
@@ -187,14 +230,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strpbrk.S.  */
   IFUNC_IMPL (i, name, strpbrk,
-	      IFUNC_IMPL_ADD (array, i, strpbrk, HAS_SSE4_2,
+	      IFUNC_IMPL_ADD (array, i, strpbrk, HAS_CPU_FEATURE (SSE4_2),
 			      __strpbrk_sse42)
 	      IFUNC_IMPL_ADD (array, i, strpbrk, 1, __strpbrk_sse2))
 
 
   /* Support sysdeps/x86_64/multiarch/strspn.S.  */
   IFUNC_IMPL (i, name, strspn,
-	      IFUNC_IMPL_ADD (array, i, strspn, HAS_SSE4_2, __strspn_sse42)
+	      IFUNC_IMPL_ADD (array, i, strspn, HAS_CPU_FEATURE (SSE4_2),
+			      __strspn_sse42)
 	      IFUNC_IMPL_ADD (array, i, strspn, 1, __strspn_sse2))
 
   /* Support sysdeps/x86_64/multiarch/strstr.c.  */
@@ -204,65 +248,95 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/wcscpy.S.  */
   IFUNC_IMPL (i, name, wcscpy,
-	      IFUNC_IMPL_ADD (array, i, wcscpy, HAS_SSSE3, __wcscpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, wcscpy, HAS_CPU_FEATURE (SSSE3),
+			      __wcscpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, wcscpy, 1, __wcscpy_sse2))
 
   /* Support sysdeps/x86_64/multiarch/wmemcmp.S.  */
   IFUNC_IMPL (i, name, wmemcmp,
-	      IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_SSE4_1,
+	      IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSE4_1),
 			      __wmemcmp_sse4_1)
-	      IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSSE3),
 			      __wmemcmp_ssse3)
 	      IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2))
 
 #ifdef SHARED
   /* Support sysdeps/x86_64/multiarch/memcpy_chk.S.  */
   IFUNC_IMPL (i, name, __memcpy_chk,
-	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_AVX,
+#ifdef HAVE_AVX512_ASM_SUPPORT
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __memcpy_chk_avx512_no_vzeroupper)
+#endif
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __memcpy_chk_avx_unaligned)
-	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+			      HAS_CPU_FEATURE (SSSE3),
 			      __memcpy_chk_ssse3_back)
-	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+			      HAS_CPU_FEATURE (SSSE3),
 			      __memcpy_chk_ssse3)
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
 			      __memcpy_chk_sse2))
 
   /* Support sysdeps/x86_64/multiarch/memcpy.S.  */
   IFUNC_IMPL (i, name, memcpy,
-	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_AVX,
+	      IFUNC_IMPL_ADD (array, i, memcpy,
+			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __memcpy_avx_unaligned)
-	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
 			      __memcpy_ssse3_back)
-	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3, __memcpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
+			      __memcpy_ssse3)
+#ifdef HAVE_AVX512_ASM_SUPPORT
+	      IFUNC_IMPL_ADD (array, i, memcpy,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __memcpy_avx512_no_vzeroupper)
+#endif
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2))
 
   /* Support sysdeps/x86_64/multiarch/mempcpy_chk.S.  */
   IFUNC_IMPL (i, name, __mempcpy_chk,
-	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_AVX,
+#ifdef HAVE_AVX512_ASM_SUPPORT
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __mempcpy_chk_avx512_no_vzeroupper)
+#endif
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __mempcpy_chk_avx_unaligned)
-	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+			      HAS_CPU_FEATURE (SSSE3),
 			      __mempcpy_chk_ssse3_back)
-	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+			      HAS_CPU_FEATURE (SSSE3),
 			      __mempcpy_chk_ssse3)
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
 			      __mempcpy_chk_sse2))
 
   /* Support sysdeps/x86_64/multiarch/mempcpy.S.  */
   IFUNC_IMPL (i, name, mempcpy,
-	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_AVX,
+#ifdef HAVE_AVX512_ASM_SUPPORT
+	      IFUNC_IMPL_ADD (array, i, mempcpy,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __mempcpy_avx512_no_vzeroupper)
+#endif
+	      IFUNC_IMPL_ADD (array, i, mempcpy,
+			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __mempcpy_avx_unaligned)
-	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
 			      __mempcpy_ssse3_back)
-	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
 			      __mempcpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_sse2))
 
   /* Support sysdeps/x86_64/multiarch/strncmp.S.  */
   IFUNC_IMPL (i, name, strncmp,
-	      IFUNC_IMPL_ADD (array, i, strncmp, HAS_SSE4_2,
+	      IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSE4_2),
 			      __strncmp_sse42)
-	      IFUNC_IMPL_ADD (array, i, strncmp, HAS_SSSE3,
+	      IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSSE3),
 			      __strncmp_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_sse2))
 #endif
diff --git a/sysdeps/x86_64/multiarch/init-arch.c b/sysdeps/x86_64/multiarch/init-arch.c
deleted file mode 100644
index aaad5fa841..0000000000
--- a/sysdeps/x86_64/multiarch/init-arch.c
+++ /dev/null
@@ -1,223 +0,0 @@
-/* Initialize CPU feature data.
-   This file is part of the GNU C Library.
-   Copyright (C) 2008-2015 Free Software Foundation, Inc.
-   Contributed by Ulrich Drepper <drepper@redhat.com>.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <atomic.h>
-#include <cpuid.h>
-#include "init-arch.h"
-
-
-struct cpu_features __cpu_features attribute_hidden;
-
-
-static void
-get_common_indeces (unsigned int *family, unsigned int *model)
-{
-  __cpuid (1, __cpu_features.cpuid[COMMON_CPUID_INDEX_1].eax,
-	   __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ebx,
-	   __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx,
-	   __cpu_features.cpuid[COMMON_CPUID_INDEX_1].edx);
-
-  unsigned int eax = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].eax;
-  *family = (eax >> 8) & 0x0f;
-  *model = (eax >> 4) & 0x0f;
-}
-
-
-void
-__init_cpu_features (void)
-{
-  unsigned int ebx;
-  unsigned int ecx;
-  unsigned int edx;
-  unsigned int family = 0;
-  unsigned int model = 0;
-  enum cpu_features_kind kind;
-
-  __cpuid (0, __cpu_features.max_cpuid, ebx, ecx, edx);
-
-  /* This spells out "GenuineIntel".  */
-  if (ebx == 0x756e6547 && ecx == 0x6c65746e && edx == 0x49656e69)
-    {
-      kind = arch_kind_intel;
-
-      get_common_indeces (&family, &model);
-
-      unsigned int eax = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].eax;
-      unsigned int extended_family = (eax >> 20) & 0xff;
-      unsigned int extended_model = (eax >> 12) & 0xf0;
-      if (family == 0x0f)
-	{
-	  family += extended_family;
-	  model += extended_model;
-	}
-      else if (family == 0x06)
-	{
-	  ecx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx;
-	  model += extended_model;
-	  switch (model)
-	    {
-	    case 0x1c:
-	    case 0x26:
-	      /* BSF is slow on Atom.  */
-	      __cpu_features.feature[index_Slow_BSF] |= bit_Slow_BSF;
-	      break;
-
-	    case 0x37:
-	    case 0x4a:
-	    case 0x4d:
-	    case 0x5a:
-	    case 0x5d:
-	      /* Unaligned load versions are faster than SSSE3
-		 on Silvermont.  */
-#if index_Fast_Unaligned_Load != index_Prefer_PMINUB_for_stringop
-# error index_Fast_Unaligned_Load != index_Prefer_PMINUB_for_stringop
-#endif
-#if index_Fast_Unaligned_Load != index_Slow_SSE4_2
-# error index_Fast_Unaligned_Load != index_Slow_SSE4_2
-#endif
-	      __cpu_features.feature[index_Fast_Unaligned_Load]
-		|= (bit_Fast_Unaligned_Load
-		    | bit_Prefer_PMINUB_for_stringop
-		    | bit_Slow_SSE4_2);
-	      break;
-
-	    default:
-	      /* Unknown family 0x06 processors.  Assuming this is one
-		 of Core i3/i5/i7 processors if AVX is available.  */
-	      if ((ecx & bit_AVX) == 0)
-		break;
-
-	    case 0x1a:
-	    case 0x1e:
-	    case 0x1f:
-	    case 0x25:
-	    case 0x2c:
-	    case 0x2e:
-	    case 0x2f:
-	      /* Rep string instructions, copy backward, unaligned loads
-		 and pminub are fast on Intel Core i3, i5 and i7.  */
-#if index_Fast_Rep_String != index_Fast_Copy_Backward
-# error index_Fast_Rep_String != index_Fast_Copy_Backward
-#endif
-#if index_Fast_Rep_String != index_Fast_Unaligned_Load
-# error index_Fast_Rep_String != index_Fast_Unaligned_Load
-#endif
-#if index_Fast_Rep_String != index_Prefer_PMINUB_for_stringop
-# error index_Fast_Rep_String != index_Prefer_PMINUB_for_stringop
-#endif
-	      __cpu_features.feature[index_Fast_Rep_String]
-		|= (bit_Fast_Rep_String
-		    | bit_Fast_Copy_Backward
-		    | bit_Fast_Unaligned_Load
-		    | bit_Prefer_PMINUB_for_stringop);
-	      break;
-	    }
-	}
-    }
-  /* This spells out "AuthenticAMD".  */
-  else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
-    {
-      kind = arch_kind_amd;
-
-      get_common_indeces (&family, &model);
-
-      ecx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx;
-
-      unsigned int eax;
-      __cpuid (0x80000000, eax, ebx, ecx, edx);
-      if (eax >= 0x80000001)
-	__cpuid (0x80000001,
-		 __cpu_features.cpuid[COMMON_CPUID_INDEX_80000001].eax,
-		 __cpu_features.cpuid[COMMON_CPUID_INDEX_80000001].ebx,
-		 __cpu_features.cpuid[COMMON_CPUID_INDEX_80000001].ecx,
-		 __cpu_features.cpuid[COMMON_CPUID_INDEX_80000001].edx);
-    }
-  else
-    kind = arch_kind_other;
-
-  if (__cpu_features.max_cpuid >= 7)
-    __cpuid_count (7, 0,
-		   __cpu_features.cpuid[COMMON_CPUID_INDEX_7].eax,
-		   __cpu_features.cpuid[COMMON_CPUID_INDEX_7].ebx,
-		   __cpu_features.cpuid[COMMON_CPUID_INDEX_7].ecx,
-		   __cpu_features.cpuid[COMMON_CPUID_INDEX_7].edx);
-
-  /* Can we call xgetbv?  */
-  if (CPUID_OSXSAVE)
-    {
-      unsigned int xcrlow;
-      unsigned int xcrhigh;
-      asm ("xgetbv" : "=a" (xcrlow), "=d" (xcrhigh) : "c" (0));
-      /* Is YMM and XMM state usable?  */
-      if ((xcrlow & (bit_YMM_state | bit_XMM_state)) ==
-	  (bit_YMM_state | bit_XMM_state))
-	{
-	  /* Determine if AVX is usable.  */
-	  if (CPUID_AVX)
-	    __cpu_features.feature[index_AVX_Usable] |= bit_AVX_Usable;
-#if index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
-# error index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
-#endif
-	  /* Determine if AVX2 is usable.  Unaligned load with 256-bit
-	     AVX registers are faster on processors with AVX2.  */
-	  if (CPUID_AVX2)
-	    __cpu_features.feature[index_AVX2_Usable]
-	      |= bit_AVX2_Usable | bit_AVX_Fast_Unaligned_Load;
-	  /* Check if OPMASK state, upper 256-bit of ZMM0-ZMM15 and
-	     ZMM16-ZMM31 state are enabled.  */
-	  if ((xcrlow & (bit_Opmask_state | bit_ZMM0_15_state
-			 | bit_ZMM16_31_state)) ==
-	      (bit_Opmask_state | bit_ZMM0_15_state | bit_ZMM16_31_state))
-	    {
-	      /* Determine if AVX512F is usable.  */
-	      if (CPUID_AVX512F)
-		{
-		  __cpu_features.feature[index_AVX512F_Usable]
-		    |= bit_AVX512F_Usable;
-		  /* Determine if AVX512DQ is usable.  */
-		  if (CPUID_AVX512DQ)
-		    __cpu_features.feature[index_AVX512DQ_Usable]
-		      |= bit_AVX512DQ_Usable;
-		}
-	    }
-	  /* Determine if FMA is usable.  */
-	  if (CPUID_FMA)
-	    __cpu_features.feature[index_FMA_Usable] |= bit_FMA_Usable;
-	  /* Determine if FMA4 is usable.  */
-	  if (CPUID_FMA4)
-	    __cpu_features.feature[index_FMA4_Usable] |= bit_FMA4_Usable;
-	}
-    }
-
-  __cpu_features.family = family;
-  __cpu_features.model = model;
-  atomic_write_barrier ();
-  __cpu_features.kind = kind;
-}
-
-#undef __get_cpu_features
-
-const struct cpu_features *
-__get_cpu_features (void)
-{
-  if (__cpu_features.kind == arch_kind_unknown)
-    __init_cpu_features ();
-
-  return &__cpu_features;
-}
diff --git a/sysdeps/x86_64/multiarch/init-arch.h b/sysdeps/x86_64/multiarch/init-arch.h
deleted file mode 100644
index cfc6e7049e..0000000000
--- a/sysdeps/x86_64/multiarch/init-arch.h
+++ /dev/null
@@ -1,206 +0,0 @@
-/* This file is part of the GNU C Library.
-   Copyright (C) 2008-2015 Free Software Foundation, Inc.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#define bit_Fast_Rep_String		(1 << 0)
-#define bit_Fast_Copy_Backward		(1 << 1)
-#define bit_Slow_BSF			(1 << 2)
-#define bit_Fast_Unaligned_Load		(1 << 4)
-#define bit_Prefer_PMINUB_for_stringop	(1 << 5)
-#define bit_AVX_Usable			(1 << 6)
-#define bit_FMA_Usable			(1 << 7)
-#define bit_FMA4_Usable			(1 << 8)
-#define bit_Slow_SSE4_2			(1 << 9)
-#define bit_AVX2_Usable			(1 << 10)
-#define bit_AVX_Fast_Unaligned_Load	(1 << 11)
-#define bit_AVX512F_Usable		(1 << 12)
-#define bit_AVX512DQ_Usable		(1 << 13)
-
-/* CPUID Feature flags.  */
-
-/* COMMON_CPUID_INDEX_1.  */
-#define bit_SSE2	(1 << 26)
-#define bit_SSSE3	(1 << 9)
-#define bit_SSE4_1	(1 << 19)
-#define bit_SSE4_2	(1 << 20)
-#define bit_OSXSAVE	(1 << 27)
-#define bit_AVX		(1 << 28)
-#define bit_POPCOUNT	(1 << 23)
-#define bit_FMA		(1 << 12)
-#define bit_FMA4	(1 << 16)
-
-/* COMMON_CPUID_INDEX_7.  */
-#define bit_RTM		(1 << 11)
-#define bit_AVX2	(1 << 5)
-#define bit_AVX512F	(1 << 16)
-#define bit_AVX512DQ	(1 << 17)
-
-/* XCR0 Feature flags.  */
-#define bit_XMM_state  (1 << 1)
-#define bit_YMM_state  (2 << 1)
-#define bit_Opmask_state	(1 << 5)
-#define bit_ZMM0_15_state	(1 << 6)
-#define bit_ZMM16_31_state	(1 << 7)
-
-/* The integer bit array index for the first set of internal feature bits.  */
-# define FEATURE_INDEX_1 0
-
-/* The current maximum size of the feature integer bit array.  */
-# define FEATURE_INDEX_MAX 1
-
-#ifdef	__ASSEMBLER__
-
-# include <ifunc-defines.h>
-
-# define index_SSE2	COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_EDX_OFFSET
-# define index_SSSE3	COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
-# define index_SSE4_1	COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
-# define index_SSE4_2	COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
-# define index_AVX	COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
-# define index_AVX2	COMMON_CPUID_INDEX_7*CPUID_SIZE+CPUID_EBX_OFFSET
-
-# define index_Fast_Rep_String		FEATURE_INDEX_1*FEATURE_SIZE
-# define index_Fast_Copy_Backward	FEATURE_INDEX_1*FEATURE_SIZE
-# define index_Slow_BSF			FEATURE_INDEX_1*FEATURE_SIZE
-# define index_Fast_Unaligned_Load	FEATURE_INDEX_1*FEATURE_SIZE
-# define index_Prefer_PMINUB_for_stringop FEATURE_INDEX_1*FEATURE_SIZE
-# define index_AVX_Usable		FEATURE_INDEX_1*FEATURE_SIZE
-# define index_FMA_Usable		FEATURE_INDEX_1*FEATURE_SIZE
-# define index_FMA4_Usable		FEATURE_INDEX_1*FEATURE_SIZE
-# define index_Slow_SSE4_2		FEATURE_INDEX_1*FEATURE_SIZE
-# define index_AVX2_Usable		FEATURE_INDEX_1*FEATURE_SIZE
-# define index_AVX_Fast_Unaligned_Load	FEATURE_INDEX_1*FEATURE_SIZE
-# define index_AVX512F_Usable		FEATURE_INDEX_1*FEATURE_SIZE
-# define index_AVX512DQ_Usable		FEATURE_INDEX_1*FEATURE_SIZE
-
-#else	/* __ASSEMBLER__ */
-
-# include <sys/param.h>
-
-enum
-  {
-    COMMON_CPUID_INDEX_1 = 0,
-    COMMON_CPUID_INDEX_7,
-    COMMON_CPUID_INDEX_80000001,	/* for AMD */
-    /* Keep the following line at the end.  */
-    COMMON_CPUID_INDEX_MAX
-  };
-
-extern struct cpu_features
-{
-  enum cpu_features_kind
-    {
-      arch_kind_unknown = 0,
-      arch_kind_intel,
-      arch_kind_amd,
-      arch_kind_other
-    } kind;
-  int max_cpuid;
-  struct cpuid_registers
-  {
-    unsigned int eax;
-    unsigned int ebx;
-    unsigned int ecx;
-    unsigned int edx;
-  } cpuid[COMMON_CPUID_INDEX_MAX];
-  unsigned int family;
-  unsigned int model;
-  unsigned int feature[FEATURE_INDEX_MAX];
-} __cpu_features attribute_hidden;
-
-
-extern void __init_cpu_features (void) attribute_hidden;
-# define INIT_ARCH() \
-  do							\
-    if (__cpu_features.kind == arch_kind_unknown)	\
-      __init_cpu_features ();				\
-  while (0)
-
-/* Used from outside libc.so to get access to the CPU features structure.  */
-extern const struct cpu_features *__get_cpu_features (void)
-     __attribute__ ((const));
-
-# if IS_IN (libc)
-#  define __get_cpu_features()	(&__cpu_features)
-# endif
-
-# define HAS_CPU_FEATURE(idx, reg, bit) \
-  ((__get_cpu_features ()->cpuid[idx].reg & (bit)) != 0)
-
-/* Following are the feature tests used throughout libc.  */
-
-/* CPUID_* evaluates to true if the feature flag is enabled.
-   We always use &__cpu_features because the HAS_CPUID_* macros
-   are called only within __init_cpu_features, where we can't
-   call __get_cpu_features without infinite recursion.  */
-# define HAS_CPUID_FLAG(idx, reg, bit) \
-  (((&__cpu_features)->cpuid[idx].reg & (bit)) != 0)
-
-# define CPUID_OSXSAVE \
-  HAS_CPUID_FLAG (COMMON_CPUID_INDEX_1, ecx, bit_OSXSAVE)
-# define CPUID_AVX \
-  HAS_CPUID_FLAG (COMMON_CPUID_INDEX_1, ecx, bit_AVX)
-# define CPUID_FMA \
-  HAS_CPUID_FLAG (COMMON_CPUID_INDEX_1, ecx, bit_FMA)
-# define CPUID_FMA4 \
-  HAS_CPUID_FLAG (COMMON_CPUID_INDEX_80000001, ecx, bit_FMA4)
-# define CPUID_RTM \
-  HAS_CPUID_FLAG (COMMON_CPUID_INDEX_7, ebx, bit_RTM)
-# define CPUID_AVX2 \
-  HAS_CPUID_FLAG (COMMON_CPUID_INDEX_7, ebx, bit_AVX2)
-# define CPUID_AVX512F \
-  HAS_CPUID_FLAG (COMMON_CPUID_INDEX_7, ebx, bit_AVX512F)
-# define CPUID_AVX512DQ \
-  HAS_CPUID_FLAG (COMMON_CPUID_INDEX_7, ebx, bit_AVX512DQ)
-
-/* HAS_* evaluates to true if we may use the feature at runtime.  */
-# define HAS_SSE2	HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, edx, bit_SSE2)
-# define HAS_POPCOUNT	HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, bit_POPCOUNT)
-# define HAS_SSSE3	HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, bit_SSSE3)
-# define HAS_SSE4_1	HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, bit_SSE4_1)
-# define HAS_SSE4_2	HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, bit_SSE4_2)
-# define HAS_RTM	HAS_CPU_FEATURE (COMMON_CPUID_INDEX_7, ebx, bit_RTM)
-
-# define index_Fast_Rep_String		FEATURE_INDEX_1
-# define index_Fast_Copy_Backward	FEATURE_INDEX_1
-# define index_Slow_BSF			FEATURE_INDEX_1
-# define index_Fast_Unaligned_Load	FEATURE_INDEX_1
-# define index_Prefer_PMINUB_for_stringop FEATURE_INDEX_1
-# define index_AVX_Usable		FEATURE_INDEX_1
-# define index_FMA_Usable		FEATURE_INDEX_1
-# define index_FMA4_Usable		FEATURE_INDEX_1
-# define index_Slow_SSE4_2		FEATURE_INDEX_1
-# define index_AVX2_Usable		FEATURE_INDEX_1
-# define index_AVX_Fast_Unaligned_Load	FEATURE_INDEX_1
-# define index_AVX512F_Usable		FEATURE_INDEX_1
-# define index_AVX512DQ_Usable		FEATURE_INDEX_1
-
-# define HAS_ARCH_FEATURE(name) \
-  ((__get_cpu_features ()->feature[index_##name] & (bit_##name)) != 0)
-
-# define HAS_FAST_REP_STRING		HAS_ARCH_FEATURE (Fast_Rep_String)
-# define HAS_FAST_COPY_BACKWARD		HAS_ARCH_FEATURE (Fast_Copy_Backward)
-# define HAS_SLOW_BSF			HAS_ARCH_FEATURE (Slow_BSF)
-# define HAS_FAST_UNALIGNED_LOAD	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
-# define HAS_AVX			HAS_ARCH_FEATURE (AVX_Usable)
-# define HAS_AVX2			HAS_ARCH_FEATURE (AVX2_Usable)
-# define HAS_AVX512F			HAS_ARCH_FEATURE (AVX512F_Usable)
-# define HAS_AVX512DQ			HAS_ARCH_FEATURE (AVX512DQ_Usable)
-# define HAS_FMA			HAS_ARCH_FEATURE (FMA_Usable)
-# define HAS_FMA4			HAS_ARCH_FEATURE (FMA4_Usable)
-# define HAS_AVX_FAST_UNALIGNED_LOAD	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
-
-#endif	/* __ASSEMBLER__ */
diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
index 533fece51a..786f87282c 100644
--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
+++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S
@@ -1,5 +1,5 @@
 /* memcmp with SSE4.1, wmemcmp with SSE4.1
-   Copyright (C) 2010-2015 Free Software Foundation, Inc.
+   Copyright (C) 2010-2016 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
 
diff --git a/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
index 948148b1cd..a22f399e02 100644
--- a/sysdeps/x86_64/multiarch/memcmp-ssse3.S
+++ b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
@@ -1,5 +1,5 @@
 /* memcmp with SSSE3, wmemcmp with SSSE3
-   Copyright (C) 2011-2015 Free Software Foundation, Inc.
+   Copyright (C) 2011-2016 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
 
diff --git a/sysdeps/x86_64/multiarch/memcmp.S b/sysdeps/x86_64/multiarch/memcmp.S
index f8b46363d0..b5a1cc202e 100644
--- a/sysdeps/x86_64/multiarch/memcmp.S
+++ b/sysdeps/x86_64/multiarch/memcmp.S
@@ -1,6 +1,6 @@
 /* Multiple versions of memcmp
    All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2010-2015 Free Software Foundation, Inc.
+   Copyright (C) 2010-2016 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
 
@@ -26,16 +26,13 @@
 	.text
 ENTRY(memcmp)
 	.type	memcmp, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features(%rip)
-	jne	1f
-	call	__init_cpu_features
-
-1:	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+	HAS_CPU_FEATURE (SSSE3)
 	jnz	2f
 	leaq	__memcmp_sse2(%rip), %rax
 	ret
 
-2:	testl	$bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+2:	HAS_CPU_FEATURE (SSE4_1)
 	jz	3f
 	leaq	__memcmp_sse4_1(%rip), %rax
 	ret
diff --git a/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
index 9f033f5456..74fed186e9 100644
--- a/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
+++ b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
@@ -1,5 +1,5 @@
 /* memcpy with AVX
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S
new file mode 100644
index 0000000000..1bb12e81b0
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S
@@ -0,0 +1,408 @@
+/* memcpy optimized with AVX512 for KNL hardware.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#if defined HAVE_AVX512_ASM_SUPPORT && IS_IN (libc) \
+    && (defined SHARED \
+	|| defined USE_AS_MEMMOVE \
+	|| !defined USE_MULTIARCH)
+
+#include "asm-syntax.h"
+#ifndef MEMCPY
+# define MEMCPY		__memcpy_avx512_no_vzeroupper
+# define MEMCPY_CHK	__memcpy_chk_avx512_no_vzeroupper
+#endif
+
+	.section .text,"ax",@progbits
+#if !defined USE_AS_BCOPY
+ENTRY (MEMCPY_CHK)
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMCPY_CHK)
+#endif
+
+ENTRY (MEMCPY)
+	mov	%rdi, %rax
+#ifdef USE_AS_MEMPCPY
+	add	%rdx, %rax
+#endif
+	lea	(%rsi, %rdx), %rcx
+	lea	(%rdi, %rdx), %r9
+	cmp	$512, %rdx
+	ja	L(512bytesormore)
+
+L(check):
+	cmp	$16, %rdx
+	jbe	L(less_16bytes)
+	cmp	$256, %rdx
+	jb	L(less_256bytes)
+	vmovups	(%rsi), %zmm0
+	vmovups 0x40(%rsi), %zmm1
+	vmovups 0x80(%rsi), %zmm2
+	vmovups 0xC0(%rsi), %zmm3
+	vmovups	-0x100(%rcx), %zmm4
+	vmovups -0xC0(%rcx), %zmm5
+	vmovups -0x80(%rcx), %zmm6
+	vmovups -0x40(%rcx), %zmm7
+	vmovups %zmm0, (%rdi)
+	vmovups %zmm1, 0x40(%rdi)
+	vmovups %zmm2, 0x80(%rdi)
+	vmovups %zmm3, 0xC0(%rdi)
+	vmovups	%zmm4, -0x100(%r9)
+	vmovups %zmm5, -0xC0(%r9)
+	vmovups %zmm6, -0x80(%r9)
+	vmovups %zmm7, -0x40(%r9)
+	ret
+
+L(less_256bytes):
+	cmp	$128, %dl
+	jb	L(less_128bytes)
+	vmovups	(%rsi), %zmm0
+	vmovups 0x40(%rsi), %zmm1
+	vmovups -0x80(%rcx), %zmm2
+	vmovups -0x40(%rcx), %zmm3
+	vmovups	%zmm0, (%rdi)
+	vmovups %zmm1, 0x40(%rdi)
+	vmovups %zmm2, -0x80(%r9)
+	vmovups %zmm3, -0x40(%r9)
+	ret
+
+L(less_128bytes):
+	cmp	$64, %dl
+	jb	L(less_64bytes)
+	vmovdqu (%rsi), %ymm0
+	vmovdqu 0x20(%rsi), %ymm1
+	vmovdqu -0x40(%rcx), %ymm2
+	vmovdqu -0x20(%rcx), %ymm3
+	vmovdqu %ymm0, (%rdi)
+	vmovdqu %ymm1, 0x20(%rdi)
+	vmovdqu %ymm2, -0x40(%r9)
+	vmovdqu %ymm3, -0x20(%r9)
+	ret
+
+L(less_64bytes):
+	cmp	$32, %dl
+	jb	L(less_32bytes)
+	vmovdqu	(%rsi), %ymm0
+	vmovdqu -0x20(%rcx), %ymm1
+	vmovdqu	%ymm0, (%rdi)
+	vmovdqu	%ymm1, -0x20(%r9)
+	ret
+
+L(less_32bytes):
+	vmovdqu (%rsi), %xmm0
+	vmovdqu -0x10(%rcx), %xmm1
+	vmovdqu %xmm0, (%rdi)
+	vmovdqu %xmm1, -0x10(%r9)
+	ret
+
+L(less_16bytes):
+	cmp	$8, %dl
+	jb	L(less_8bytes)
+	movq	(%rsi), %rsi
+	movq	-0x8(%rcx), %rcx
+	movq	%rsi, (%rdi)
+	movq	%rcx, -0x8(%r9)
+	ret
+
+L(less_8bytes):
+	cmp	$4, %dl
+	jb	L(less_4bytes)
+	mov	(%rsi), %esi
+	mov	-0x4(%rcx), %ecx
+	mov	%esi, (%rdi)
+	mov	%ecx, -0x4(%r9)
+	ret
+
+L(less_4bytes):
+	cmp	$2, %dl
+	jb	L(less_2bytes)
+	mov	(%rsi), %si
+	mov	-0x2(%rcx), %cx
+	mov	%si, (%rdi)
+	mov	%cx, -0x2(%r9)
+	ret
+
+L(less_2bytes):
+	cmp	$1, %dl
+	jb	L(less_1bytes)
+	mov	(%rsi), %cl
+	mov	%cl, (%rdi)
+L(less_1bytes):
+	ret
+
+L(512bytesormore):
+#ifdef SHARED_CACHE_SIZE_HALF
+	mov	$SHARED_CACHE_SIZE_HALF, %r8
+#else
+	mov	__x86_shared_cache_size_half(%rip), %r8
+#endif
+	cmp	%r8, %rdx
+	jae	L(preloop_large)
+	cmp	$1024, %rdx
+	ja	L(1024bytesormore)
+	prefetcht1 (%rsi)
+	prefetcht1 0x40(%rsi)
+	prefetcht1 0x80(%rsi)
+	prefetcht1 0xC0(%rsi)
+	prefetcht1 0x100(%rsi)
+	prefetcht1 0x140(%rsi)
+	prefetcht1 0x180(%rsi)
+	prefetcht1 0x1C0(%rsi)
+	prefetcht1 -0x200(%rcx)
+	prefetcht1 -0x1C0(%rcx)
+	prefetcht1 -0x180(%rcx)
+	prefetcht1 -0x140(%rcx)
+	prefetcht1 -0x100(%rcx)
+	prefetcht1 -0xC0(%rcx)
+	prefetcht1 -0x80(%rcx)
+	prefetcht1 -0x40(%rcx)
+	vmovups	(%rsi), %zmm0
+	vmovups 0x40(%rsi), %zmm1
+	vmovups 0x80(%rsi), %zmm2
+	vmovups 0xC0(%rsi), %zmm3
+	vmovups	0x100(%rsi), %zmm4
+	vmovups 0x140(%rsi), %zmm5
+	vmovups 0x180(%rsi), %zmm6
+	vmovups 0x1C0(%rsi), %zmm7
+	vmovups	-0x200(%rcx), %zmm8
+	vmovups -0x1C0(%rcx), %zmm9
+	vmovups -0x180(%rcx), %zmm10
+	vmovups -0x140(%rcx), %zmm11
+	vmovups	-0x100(%rcx), %zmm12
+	vmovups -0xC0(%rcx), %zmm13
+	vmovups -0x80(%rcx), %zmm14
+	vmovups -0x40(%rcx), %zmm15
+	vmovups %zmm0, (%rdi)
+	vmovups %zmm1, 0x40(%rdi)
+	vmovups %zmm2, 0x80(%rdi)
+	vmovups %zmm3, 0xC0(%rdi)
+	vmovups %zmm4, 0x100(%rdi)
+	vmovups %zmm5, 0x140(%rdi)
+	vmovups %zmm6, 0x180(%rdi)
+	vmovups %zmm7, 0x1C0(%rdi)
+	vmovups	%zmm8, -0x200(%r9)
+	vmovups %zmm9, -0x1C0(%r9)
+	vmovups %zmm10, -0x180(%r9)
+	vmovups %zmm11, -0x140(%r9)
+	vmovups	%zmm12, -0x100(%r9)
+	vmovups %zmm13, -0xC0(%r9)
+	vmovups %zmm14, -0x80(%r9)
+	vmovups %zmm15, -0x40(%r9)
+	ret
+
+L(1024bytesormore):
+	cmp	%rsi, %rdi
+	ja	L(1024bytesormore_bkw)
+	sub	$512, %r9
+	vmovups -0x200(%rcx), %zmm8
+	vmovups -0x1C0(%rcx), %zmm9
+	vmovups -0x180(%rcx), %zmm10
+	vmovups -0x140(%rcx), %zmm11
+	vmovups	-0x100(%rcx), %zmm12
+	vmovups -0xC0(%rcx), %zmm13
+	vmovups -0x80(%rcx), %zmm14
+	vmovups -0x40(%rcx), %zmm15
+	prefetcht1 (%rsi)
+	prefetcht1 0x40(%rsi)
+	prefetcht1 0x80(%rsi)
+	prefetcht1 0xC0(%rsi)
+	prefetcht1 0x100(%rsi)
+	prefetcht1 0x140(%rsi)
+	prefetcht1 0x180(%rsi)
+	prefetcht1 0x1C0(%rsi)
+
+/* Loop with unaligned memory access.  */
+L(gobble_512bytes_loop):
+	vmovups	(%rsi), %zmm0
+	vmovups 0x40(%rsi), %zmm1
+	vmovups 0x80(%rsi), %zmm2
+	vmovups 0xC0(%rsi), %zmm3
+	vmovups	0x100(%rsi), %zmm4
+	vmovups 0x140(%rsi), %zmm5
+	vmovups 0x180(%rsi), %zmm6
+	vmovups 0x1C0(%rsi), %zmm7
+	add	$512, %rsi
+	prefetcht1 (%rsi)
+	prefetcht1 0x40(%rsi)
+	prefetcht1 0x80(%rsi)
+	prefetcht1 0xC0(%rsi)
+	prefetcht1 0x100(%rsi)
+	prefetcht1 0x140(%rsi)
+	prefetcht1 0x180(%rsi)
+	prefetcht1 0x1C0(%rsi)
+	vmovups	%zmm0, (%rdi)
+	vmovups %zmm1, 0x40(%rdi)
+	vmovups %zmm2, 0x80(%rdi)
+	vmovups %zmm3, 0xC0(%rdi)
+	vmovups	%zmm4, 0x100(%rdi)
+	vmovups %zmm5, 0x140(%rdi)
+	vmovups %zmm6, 0x180(%rdi)
+	vmovups %zmm7, 0x1C0(%rdi)
+	add	$512, %rdi
+	cmp	%r9, %rdi
+	jb	L(gobble_512bytes_loop)
+	vmovups %zmm8, (%r9)
+	vmovups %zmm9, 0x40(%r9)
+	vmovups %zmm10, 0x80(%r9)
+	vmovups %zmm11, 0xC0(%r9)
+	vmovups %zmm12, 0x100(%r9)
+	vmovups %zmm13, 0x140(%r9)
+	vmovups %zmm14, 0x180(%r9)
+	vmovups %zmm15, 0x1C0(%r9)
+	ret
+
+L(1024bytesormore_bkw):
+	add	$512, %rdi
+	vmovups	0x1C0(%rsi), %zmm8
+	vmovups 0x180(%rsi), %zmm9
+	vmovups 0x140(%rsi), %zmm10
+	vmovups 0x100(%rsi), %zmm11
+	vmovups	0xC0(%rsi), %zmm12
+	vmovups 0x80(%rsi), %zmm13
+	vmovups 0x40(%rsi), %zmm14
+	vmovups (%rsi), %zmm15
+	prefetcht1 -0x40(%rcx)
+	prefetcht1 -0x80(%rcx)
+	prefetcht1 -0xC0(%rcx)
+	prefetcht1 -0x100(%rcx)
+	prefetcht1 -0x140(%rcx)
+	prefetcht1 -0x180(%rcx)
+	prefetcht1 -0x1C0(%rcx)
+	prefetcht1 -0x200(%rcx)
+
+/* Backward loop with unaligned memory access.  */
+L(gobble_512bytes_loop_bkw):
+	vmovups -0x40(%rcx), %zmm0
+	vmovups -0x80(%rcx), %zmm1
+	vmovups -0xC0(%rcx), %zmm2
+	vmovups	-0x100(%rcx), %zmm3
+	vmovups -0x140(%rcx), %zmm4
+	vmovups -0x180(%rcx), %zmm5
+	vmovups -0x1C0(%rcx), %zmm6
+	vmovups	-0x200(%rcx), %zmm7
+	sub	$512, %rcx
+	prefetcht1 -0x40(%rcx)
+	prefetcht1 -0x80(%rcx)
+	prefetcht1 -0xC0(%rcx)
+	prefetcht1 -0x100(%rcx)
+	prefetcht1 -0x140(%rcx)
+	prefetcht1 -0x180(%rcx)
+	prefetcht1 -0x1C0(%rcx)
+	prefetcht1 -0x200(%rcx)
+	vmovups %zmm0, -0x40(%r9)
+	vmovups %zmm1, -0x80(%r9)
+	vmovups %zmm2, -0xC0(%r9)
+	vmovups	%zmm3, -0x100(%r9)
+	vmovups %zmm4, -0x140(%r9)
+	vmovups %zmm5, -0x180(%r9)
+	vmovups %zmm6, -0x1C0(%r9)
+	vmovups	%zmm7, -0x200(%r9)
+	sub	$512, %r9
+	cmp	%rdi, %r9
+	ja	L(gobble_512bytes_loop_bkw)
+	vmovups %zmm8, -0x40(%rdi)
+	vmovups %zmm9, -0x80(%rdi)
+	vmovups %zmm10, -0xC0(%rdi)
+	vmovups %zmm11, -0x100(%rdi)
+	vmovups %zmm12, -0x140(%rdi)
+	vmovups %zmm13, -0x180(%rdi)
+	vmovups %zmm14, -0x1C0(%rdi)
+	vmovups %zmm15, -0x200(%rdi)
+	ret
+
+L(preloop_large):
+	cmp	%rsi, %rdi
+	ja	L(preloop_large_bkw)
+	vmovups	(%rsi), %zmm4
+	vmovups	0x40(%rsi), %zmm5
+
+/* Align destination for access with non-temporal stores in the loop.  */
+	mov	%rdi, %r8
+	and	$-0x80, %rdi
+	add	$0x80, %rdi
+	sub	%rdi, %r8
+	sub	%r8, %rsi
+	add	%r8, %rdx
+L(gobble_256bytes_nt_loop):
+	prefetcht1 0x200(%rsi)
+	prefetcht1 0x240(%rsi)
+	prefetcht1 0x280(%rsi)
+	prefetcht1 0x2C0(%rsi)
+	prefetcht1 0x300(%rsi)
+	prefetcht1 0x340(%rsi)
+	prefetcht1 0x380(%rsi)
+	prefetcht1 0x3C0(%rsi)
+	vmovdqu64 (%rsi), %zmm0
+	vmovdqu64 0x40(%rsi), %zmm1
+	vmovdqu64 0x80(%rsi), %zmm2
+	vmovdqu64 0xC0(%rsi), %zmm3
+	vmovntdq %zmm0, (%rdi)
+	vmovntdq %zmm1, 0x40(%rdi)
+	vmovntdq %zmm2, 0x80(%rdi)
+	vmovntdq %zmm3, 0xC0(%rdi)
+	sub	$256, %rdx
+	add	$256, %rsi
+	add	$256, %rdi
+	cmp	$256, %rdx
+	ja	L(gobble_256bytes_nt_loop)
+	sfence
+	vmovups	%zmm4, (%rax)
+	vmovups	%zmm5, 0x40(%rax)
+	jmp	L(check)
+
+L(preloop_large_bkw):
+	vmovups -0x80(%rcx), %zmm4
+	vmovups -0x40(%rcx), %zmm5
+
+/* Align end of destination for access with non-temporal stores.  */
+	mov	%r9, %r8
+	and	$-0x80, %r9
+	sub	%r9, %r8
+	sub	%r8, %rcx
+	sub	%r8, %rdx
+	add	%r9, %r8
+L(gobble_256bytes_nt_loop_bkw):
+	prefetcht1 -0x400(%rcx)
+	prefetcht1 -0x3C0(%rcx)
+	prefetcht1 -0x380(%rcx)
+	prefetcht1 -0x340(%rcx)
+	prefetcht1 -0x300(%rcx)
+	prefetcht1 -0x2C0(%rcx)
+	prefetcht1 -0x280(%rcx)
+	prefetcht1 -0x240(%rcx)
+	vmovdqu64 -0x100(%rcx), %zmm0
+	vmovdqu64 -0xC0(%rcx), %zmm1
+	vmovdqu64 -0x80(%rcx), %zmm2
+	vmovdqu64 -0x40(%rcx), %zmm3
+	vmovntdq %zmm0,	-0x100(%r9)
+	vmovntdq %zmm1,	-0xC0(%r9)
+	vmovntdq %zmm2,	-0x80(%r9)
+	vmovntdq %zmm3,	-0x40(%r9)
+	sub	$256, %rdx
+	sub	$256, %rcx
+	sub	$256, %r9
+	cmp	$256, %rdx
+	ja	L(gobble_256bytes_nt_loop_bkw)
+	sfence
+	vmovups	%zmm4, -0x80(%r8)
+	vmovups	%zmm5, -0x40(%r8)
+	jmp	L(check)
+END (MEMCPY)
+#endif
diff --git a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
index c5450af25a..c4509831fa 100644
--- a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
@@ -1,5 +1,5 @@
 /* memcpy with unaliged loads
-   Copyright (C) 2013-2015 Free Software Foundation, Inc.
+   Copyright (C) 2013-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -16,6 +16,8 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
+#if IS_IN (libc)
+
 #include <sysdep.h>
 
 #include "asm-syntax.h"
@@ -169,3 +171,5 @@ L(between_5_8):
 	movl	%eax, -4(%rdi,%rdx)
 	jmp	L(return)
 END(__memcpy_sse2_unaligned)
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
index 30e0d1c575..08b41e9e5a 100644
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
@@ -1,5 +1,5 @@
 /* memcpy with SSSE3 and REP string
-   Copyright (C) 2010-2015 Free Software Foundation, Inc.
+   Copyright (C) 2010-2016 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
 
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
index 33cc493dd4..95de9695f9 100644
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
@@ -1,5 +1,5 @@
 /* memcpy with SSSE3
-   Copyright (C) 2010-2015 Free Software Foundation, Inc.
+   Copyright (C) 2010-2016 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
 
diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S
index 4e18cd3070..64a1bcd137 100644
--- a/sysdeps/x86_64/multiarch/memcpy.S
+++ b/sysdeps/x86_64/multiarch/memcpy.S
@@ -1,6 +1,6 @@
 /* Multiple versions of memcpy
    All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2010-2015 Free Software Foundation, Inc.
+   Copyright (C) 2010-2016 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
 
@@ -29,22 +29,28 @@
 	.text
 ENTRY(__new_memcpy)
 	.type	__new_memcpy, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features(%rip)
-	jne	1f
-	call	__init_cpu_features
+	LOAD_RTLD_GLOBAL_RO_RDX
+#ifdef HAVE_AVX512_ASM_SUPPORT
+	HAS_ARCH_FEATURE (AVX512F_Usable)
+	jz	1f
+	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+	jz	1f
+	leaq    __memcpy_avx512_no_vzeroupper(%rip), %rax
+	ret
+#endif
 1:	leaq	__memcpy_avx_unaligned(%rip), %rax
-	testl	$bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
-	jz 1f
+	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+	jz 2f
 	ret
-1:	leaq	__memcpy_sse2(%rip), %rax
-	testl	$bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
-	jnz	2f
+2:	leaq	__memcpy_sse2(%rip), %rax
+	HAS_ARCH_FEATURE (Slow_BSF)
+	jnz	3f
 	leaq	__memcpy_sse2_unaligned(%rip), %rax
 	ret
-2:	testl   $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
-	jz 3f
+3:	HAS_CPU_FEATURE (SSSE3)
+	jz 4f
 	leaq    __memcpy_ssse3(%rip), %rax
-3:	ret
+4:	ret
 END(__new_memcpy)
 
 # undef ENTRY
diff --git a/sysdeps/x86_64/multiarch/memcpy_chk.S b/sysdeps/x86_64/multiarch/memcpy_chk.S
index 1e756ea0c2..648217e971 100644
--- a/sysdeps/x86_64/multiarch/memcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/memcpy_chk.S
@@ -1,6 +1,6 @@
 /* Multiple versions of __memcpy_chk
    All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2010-2015 Free Software Foundation, Inc.
+   Copyright (C) 2010-2016 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
 
@@ -29,17 +29,23 @@
 	.text
 ENTRY(__memcpy_chk)
 	.type	__memcpy_chk, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features(%rip)
-	jne	1f
-	call	__init_cpu_features
+	LOAD_RTLD_GLOBAL_RO_RDX
+#ifdef HAVE_AVX512_ASM_SUPPORT
+	HAS_ARCH_FEATURE (AVX512F_Usable)
+	jz      1f
+	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+	jz      1f
+	leaq    __memcpy_chk_avx512_no_vzeroupper(%rip), %rax
+	ret
+#endif
 1:	leaq	__memcpy_chk_sse2(%rip), %rax
-	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	HAS_CPU_FEATURE (SSSE3)
 	jz	2f
 	leaq	__memcpy_chk_ssse3(%rip), %rax
-	testl	$bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
+	HAS_ARCH_FEATURE (Fast_Copy_Backward)
 	jz	2f
 	leaq	__memcpy_chk_ssse3_back(%rip), %rax
-	testl   $bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
+	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
 	jz  2f
 	leaq    __memcpy_chk_avx_unaligned(%rip), %rax
 2:	ret
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned.S
index 01eac94889..75e35f2957 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned.S
@@ -1,5 +1,5 @@
 /* memmove with AVX
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
new file mode 100644
index 0000000000..518d1fec35
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
@@ -0,0 +1,22 @@
+/* memmove optimized with AVX512 for KNL hardware.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define USE_AS_MEMMOVE
+#define MEMCPY		__memmove_avx512_no_vzeroupper
+#define MEMCPY_CHK	__memmove_chk_avx512_no_vzeroupper
+#include "memcpy-avx512-no-vzeroupper.S"
diff --git a/sysdeps/x86_64/multiarch/memmove.c b/sysdeps/x86_64/multiarch/memmove.c
index dd153a3eaa..8da5640bb0 100644
--- a/sysdeps/x86_64/multiarch/memmove.c
+++ b/sysdeps/x86_64/multiarch/memmove.c
@@ -1,6 +1,6 @@
 /* Multiple versions of memmove.
    All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2010-2015 Free Software Foundation, Inc.
+   Copyright (C) 2010-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -36,6 +36,9 @@ extern __typeof (__redirect_memmove) __memmove_sse2 attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_ssse3 attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_ssse3_back attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_avx_unaligned attribute_hidden;
+# ifdef HAVE_AVX512_ASM_SUPPORT
+  extern __typeof (__redirect_memmove) __memmove_avx512_no_vzeroupper attribute_hidden;
+# endif
 
 #endif
 
@@ -49,12 +52,18 @@ extern __typeof (__redirect_memmove) __memmove_avx_unaligned attribute_hidden;
    ifunc symbol properly.  */
 extern __typeof (__redirect_memmove) __libc_memmove;
 libc_ifunc (__libc_memmove,
-	    HAS_AVX_FAST_UNALIGNED_LOAD
+#ifdef HAVE_AVX512_ASM_SUPPORT
+	    HAS_ARCH_FEATURE (AVX512F_Usable)
+	      && HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+	    ? __memmove_avx512_no_vzeroupper
+	    :
+#endif
+	    (HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
 	    ? __memmove_avx_unaligned
-	    : (HAS_SSSE3
-	       ? (HAS_FAST_COPY_BACKWARD
+	    : (HAS_CPU_FEATURE (SSSE3)
+	       ? (HAS_ARCH_FEATURE (Fast_Copy_Backward)
 	          ? __memmove_ssse3_back : __memmove_ssse3)
-	       : __memmove_sse2));
+	       : __memmove_sse2)));
 
 strong_alias (__libc_memmove, memmove)
 
diff --git a/sysdeps/x86_64/multiarch/memmove_chk.c b/sysdeps/x86_64/multiarch/memmove_chk.c
index 8b12d002dc..f64da63180 100644
--- a/sysdeps/x86_64/multiarch/memmove_chk.c
+++ b/sysdeps/x86_64/multiarch/memmove_chk.c
@@ -1,6 +1,6 @@
 /* Multiple versions of __memmove_chk.
    All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2010-2015 Free Software Foundation, Inc.
+   Copyright (C) 2010-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -26,12 +26,21 @@ extern __typeof (__memmove_chk) __memmove_chk_sse2 attribute_hidden;
 extern __typeof (__memmove_chk) __memmove_chk_ssse3 attribute_hidden;
 extern __typeof (__memmove_chk) __memmove_chk_ssse3_back attribute_hidden;
 extern __typeof (__memmove_chk) __memmove_chk_avx_unaligned attribute_hidden;
+# ifdef HAVE_AVX512_ASM_SUPPORT
+  extern __typeof (__memmove_chk) __memmove_chk_avx512_no_vzeroupper attribute_hidden;
+# endif
 
 #include "debug/memmove_chk.c"
 
 libc_ifunc (__memmove_chk,
-	    HAS_AVX_FAST_UNALIGNED_LOAD ? __memmove_chk_avx_unaligned :
-	    (HAS_SSSE3
-	    ? (HAS_FAST_COPY_BACKWARD
+#ifdef HAVE_AVX512_ASM_SUPPORT
+	    HAS_ARCH_FEATURE (AVX512F_Usable)
+	      && HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+	    ? __memmove_chk_avx512_no_vzeroupper
+	    :
+#endif
+	    HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) ? __memmove_chk_avx_unaligned :
+	    (HAS_CPU_FEATURE (SSSE3)
+	    ? (HAS_ARCH_FEATURE (Fast_Copy_Backward)
 	       ? __memmove_chk_ssse3_back : __memmove_chk_ssse3)
 	    : __memmove_chk_sse2));
diff --git a/sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S b/sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S
index 128ff832fb..241378e770 100644
--- a/sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S
+++ b/sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S
@@ -1,5 +1,5 @@
 /* mempcpy with AVX
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/multiarch/mempcpy-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/mempcpy-avx512-no-vzeroupper.S
new file mode 100644
index 0000000000..fcc0945ea7
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/mempcpy-avx512-no-vzeroupper.S
@@ -0,0 +1,22 @@
+/* mempcpy optimized with AVX512 for KNL hardware.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define USE_AS_MEMPCPY
+#define MEMCPY		__mempcpy_avx512_no_vzeroupper
+#define MEMCPY_CHK	__mempcpy_chk_avx512_no_vzeroupper
+#include "memcpy-avx512-no-vzeroupper.S"
diff --git a/sysdeps/x86_64/multiarch/mempcpy.S b/sysdeps/x86_64/multiarch/mempcpy.S
index 2eaacdf049..ed78623565 100644
--- a/sysdeps/x86_64/multiarch/mempcpy.S
+++ b/sysdeps/x86_64/multiarch/mempcpy.S
@@ -1,6 +1,6 @@
 /* Multiple versions of mempcpy
    All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2010-2015 Free Software Foundation, Inc.
+   Copyright (C) 2010-2016 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
 
@@ -27,17 +27,23 @@
 #if defined SHARED && IS_IN (libc)
 ENTRY(__mempcpy)
 	.type	__mempcpy, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features(%rip)
-	jne	1f
-	call	__init_cpu_features
+	LOAD_RTLD_GLOBAL_RO_RDX
+#ifdef HAVE_AVX512_ASM_SUPPORT
+	HAS_ARCH_FEATURE (AVX512F_Usable)
+	jz	1f
+	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+	jz	1f
+	leaq    __mempcpy_avx512_no_vzeroupper(%rip), %rax
+	ret
+#endif
 1:	leaq	__mempcpy_sse2(%rip), %rax
-	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	HAS_CPU_FEATURE (SSSE3)
 	jz	2f
 	leaq	__mempcpy_ssse3(%rip), %rax
-	testl	$bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
+	HAS_ARCH_FEATURE (Fast_Copy_Backward)
 	jz	2f
 	leaq	__mempcpy_ssse3_back(%rip), %rax
-	testl	$bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
+	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
 	jz	2f
 	leaq	__mempcpy_avx_unaligned(%rip), %rax
 2:	ret
diff --git a/sysdeps/x86_64/multiarch/mempcpy_chk.S b/sysdeps/x86_64/multiarch/mempcpy_chk.S
index 17b84701b0..6e8a89d38c 100644
--- a/sysdeps/x86_64/multiarch/mempcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/mempcpy_chk.S
@@ -1,6 +1,6 @@
 /* Multiple versions of __mempcpy_chk
    All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2010-2015 Free Software Foundation, Inc.
+   Copyright (C) 2010-2016 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
 
@@ -29,17 +29,23 @@
 	.text
 ENTRY(__mempcpy_chk)
 	.type	__mempcpy_chk, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features(%rip)
-	jne	1f
-	call	__init_cpu_features
+	LOAD_RTLD_GLOBAL_RO_RDX
+#ifdef HAVE_AVX512_ASM_SUPPORT
+	HAS_ARCH_FEATURE (AVX512F_Usable)
+	jz	1f
+	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+	jz	1f
+	leaq    __mempcpy_chk_avx512_no_vzeroupper(%rip), %rax
+	ret
+#endif
 1:	leaq	__mempcpy_chk_sse2(%rip), %rax
-	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	HAS_CPU_FEATURE (SSSE3)
 	jz	2f
 	leaq	__mempcpy_chk_ssse3(%rip), %rax
-	testl	$bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
+	HAS_ARCH_FEATURE (Fast_Copy_Backward)
 	jz	2f
 	leaq	__mempcpy_chk_ssse3_back(%rip), %rax
-	testl	$bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
+	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
 	jz	2f
 	leaq	__mempcpy_chk_avx_unaligned(%rip), %rax
 2:	ret
diff --git a/sysdeps/x86_64/multiarch/memset-avx2.S b/sysdeps/x86_64/multiarch/memset-avx2.S
index 28eabade35..df634728d4 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2.S
@@ -1,5 +1,5 @@
 /* memset with AVX2
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S
new file mode 100644
index 0000000000..1e638d7ac2
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S
@@ -0,0 +1,194 @@
+/* memset optimized with AVX512 for KNL hardware.
+   Copyright (C) 2015-2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#if defined HAVE_AVX512_ASM_SUPPORT && IS_IN (libc)
+
+#include "asm-syntax.h"
+#ifndef MEMSET
+# define MEMSET __memset_avx512_no_vzeroupper
+# define MEMSET_CHK __memset_chk_avx512_no_vzeroupper
+#endif
+
+	.section .text,"ax",@progbits
+#if defined PIC
+ENTRY (MEMSET_CHK)
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMSET_CHK)
+#endif
+
+ENTRY (MEMSET)
+	vpxor	%xmm0, %xmm0, %xmm0
+	vmovd	%esi, %xmm1
+	lea	(%rdi, %rdx), %rsi
+	mov	%rdi, %rax
+	vpshufb	%xmm0, %xmm1, %xmm0
+	cmp	$16, %rdx
+	jb	L(less_16bytes)
+	cmp	$512, %rdx
+	vbroadcastss	%xmm0, %zmm2
+	ja	L(512bytesormore)
+	cmp	$256, %rdx
+	jb	L(less_256bytes)
+	vmovups	%zmm2, (%rdi)
+	vmovups %zmm2, 0x40(%rdi)
+	vmovups %zmm2, 0x80(%rdi)
+	vmovups %zmm2, 0xC0(%rdi)
+	vmovups %zmm2, -0x100(%rsi)
+	vmovups %zmm2, -0xC0(%rsi)
+	vmovups %zmm2, -0x80(%rsi)
+	vmovups %zmm2, -0x40(%rsi)
+	ret
+
+L(less_256bytes):
+	cmp	$128, %dl
+	jb	L(less_128bytes)
+	vmovups	%zmm2, (%rdi)
+	vmovups %zmm2, 0x40(%rdi)
+	vmovups %zmm2, -0x80(%rsi)
+	vmovups %zmm2, -0x40(%rsi)
+	ret
+
+L(less_128bytes):
+	cmp	$64, %dl
+	jb	L(less_64bytes)
+	vmovups	%zmm2, (%rdi)
+	vmovups	%zmm2, -0x40(%rsi)
+	ret
+
+L(less_64bytes):
+	cmp	$32, %dl
+	jb	L(less_32bytes)
+	vmovdqu	%ymm2, (%rdi)
+	vmovdqu %ymm2, -0x20(%rsi)
+	ret
+
+L(less_32bytes):
+	vmovdqu %xmm0, (%rdi)
+	vmovdqu %xmm0, -0x10(%rsi)
+	ret
+
+L(less_16bytes):
+	cmp	$8, %dl
+	jb	L(less_8bytes)
+	vmovq	%xmm0, (%rdi)
+	vmovq	%xmm0, -0x08(%rsi)
+	ret
+
+L(less_8bytes):
+	vmovd	%xmm0, %ecx
+	cmp	$4, %dl
+	jb	L(less_4bytes)
+	mov	%ecx, (%rdi)
+	mov	%ecx, -0x04(%rsi)
+	ret
+
+L(less_4bytes):
+	cmp	$2, %dl
+	jb	L(less_2bytes)
+	mov	%cx, (%rdi)
+	mov	%cx, -0x02(%rsi)
+	ret
+
+L(less_2bytes):
+	cmp	$1, %dl
+	jb	L(less_1bytes)
+	mov	%cl, (%rdi)
+L(less_1bytes):
+	ret
+
+L(512bytesormore):
+	mov	__x86_shared_cache_size_half(%rip), %rcx
+	cmp	%rcx, %rdx
+	ja	L(preloop_large)
+	cmp	$1024, %rdx
+	ja	L(1024bytesormore)
+
+	vmovups	%zmm2, (%rdi)
+	vmovups	%zmm2, 0x40(%rdi)
+	vmovups	%zmm2, 0x80(%rdi)
+	vmovups	%zmm2, 0xC0(%rdi)
+	vmovups	%zmm2, 0x100(%rdi)
+	vmovups	%zmm2, 0x140(%rdi)
+	vmovups	%zmm2, 0x180(%rdi)
+	vmovups	%zmm2, 0x1C0(%rdi)
+	vmovups %zmm2, -0x200(%rsi)
+	vmovups %zmm2, -0x1C0(%rsi)
+	vmovups %zmm2, -0x180(%rsi)
+	vmovups %zmm2, -0x140(%rsi)
+	vmovups %zmm2, -0x100(%rsi)
+	vmovups %zmm2, -0xC0(%rsi)
+	vmovups %zmm2, -0x80(%rsi)
+	vmovups %zmm2, -0x40(%rsi)
+	ret
+
+/* Align on 64 and loop with aligned stores.  */
+L(1024bytesormore):
+	sub	$0x100, %rsi
+	vmovups	%zmm2, (%rax)
+	and	$-0x40, %rdi
+	add	$0x40, %rdi
+
+L(gobble_256bytes_loop):
+	vmovaps	%zmm2, (%rdi)
+	vmovaps	%zmm2, 0x40(%rdi)
+	vmovaps	%zmm2, 0x80(%rdi)
+	vmovaps	%zmm2, 0xC0(%rdi)
+	add	$0x100, %rdi
+	cmp	%rsi, %rdi
+	jb	L(gobble_256bytes_loop)
+	vmovups %zmm2, (%rsi)
+	vmovups %zmm2, 0x40(%rsi)
+	vmovups %zmm2, 0x80(%rsi)
+	vmovups %zmm2, 0xC0(%rsi)
+	ret
+
+/* Align on 128 and loop with non-temporal stores.  */
+L(preloop_large):
+	and	$-0x80, %rdi
+	add	$0x80, %rdi
+	vmovups	%zmm2, (%rax)
+	vmovups	%zmm2, 0x40(%rax)
+	sub	$0x200, %rsi
+
+L(gobble_512bytes_nt_loop):
+	vmovntdq %zmm2, (%rdi)
+	vmovntdq %zmm2, 0x40(%rdi)
+	vmovntdq %zmm2, 0x80(%rdi)
+	vmovntdq %zmm2, 0xC0(%rdi)
+	vmovntdq %zmm2, 0x100(%rdi)
+	vmovntdq %zmm2, 0x140(%rdi)
+	vmovntdq %zmm2, 0x180(%rdi)
+	vmovntdq %zmm2, 0x1C0(%rdi)
+	add	$0x200, %rdi
+	cmp	%rsi, %rdi
+	jb	L(gobble_512bytes_nt_loop)
+	sfence
+	vmovups %zmm2, (%rsi)
+	vmovups %zmm2, 0x40(%rsi)
+	vmovups %zmm2, 0x80(%rsi)
+	vmovups %zmm2, 0xC0(%rsi)
+	vmovups	%zmm2, 0x100(%rsi)
+	vmovups	%zmm2, 0x140(%rsi)
+	vmovups	%zmm2, 0x180(%rsi)
+	vmovups	%zmm2, 0x1C0(%rsi)
+	ret
+END (MEMSET)
+#endif
diff --git a/sysdeps/x86_64/multiarch/memset.S b/sysdeps/x86_64/multiarch/memset.S
index c5f1fb340e..8e3b9b9764 100644
--- a/sysdeps/x86_64/multiarch/memset.S
+++ b/sysdeps/x86_64/multiarch/memset.S
@@ -1,6 +1,6 @@
 /* Multiple versions of memset
    All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -17,45 +17,48 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#ifdef HAVE_AVX2_SUPPORT
 #include <sysdep.h>
 #include <shlib-compat.h>
 #include <init-arch.h>
 
 /* Define multiple versions only for the definition in lib.  */
-# if IS_IN (libc)
+#if IS_IN (libc)
 ENTRY(memset)
 	.type	memset, @gnu_indirect_function
-	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
-	jne	1f
-	call	__init_cpu_features
-1:	leaq	__memset_sse2(%rip), %rax
-	testl	$bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+	leaq	__memset_sse2(%rip), %rax
+	HAS_ARCH_FEATURE (AVX2_Usable)
 	jz	2f
 	leaq	__memset_avx2(%rip), %rax
+#ifdef HAVE_AVX512_ASM_SUPPORT
+	HAS_ARCH_FEATURE (AVX512F_Usable)
+	jz	2f
+	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+	jz	2f
+	leaq	__memset_avx512_no_vzeroupper(%rip), %rax
+#endif
 2:	ret
 END(memset)
-# endif
+#endif
 
-# if IS_IN (libc)
-#  undef memset
-#  define memset __memset_sse2
+#if IS_IN (libc)
+# undef memset
+# define memset __memset_sse2
 
-#  undef __memset_chk
-#  define __memset_chk __memset_chk_sse2
+# undef __memset_chk
+# define __memset_chk __memset_chk_sse2
 
-#  ifdef SHARED
-#  undef libc_hidden_builtin_def
+# ifdef SHARED
+# undef libc_hidden_builtin_def
 /* It doesn't make sense to send libc-internal memset calls through a PLT.
    The speedup we get from using GPR instruction is likely eaten away
    by the indirect call in the PLT.  */
-#  define libc_hidden_builtin_def(name) \
+# define libc_hidden_builtin_def(name) \
 	.globl __GI_memset; __GI_memset = __memset_sse2
-#  endif
-
-#  undef strong_alias
-#  define strong_alias(original, alias)
 # endif
+
+# undef strong_alias
+# define strong_alias(original, alias)
 #endif
 
 #include "../memset.S"
diff --git a/sysdeps/x86_64/multiarch/memset_chk.S b/sysdeps/x86_64/multiarch/memset_chk.S
index 64fed3118a..9a7b270274 100644
--- a/sysdeps/x86_64/multiarch/memset_chk.S
+++ b/sysdeps/x86_64/multiarch/memset_chk.S
@@ -1,6 +1,6 @@
 /* Multiple versions of memset_chk
    All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   Copyright (C) 2014-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -22,16 +22,21 @@
 
 /* Define multiple versions only for the definition in lib.  */
 #if IS_IN (libc)
-# if defined SHARED && defined HAVE_AVX2_SUPPORT
+# ifdef SHARED
 ENTRY(__memset_chk)
 	.type	__memset_chk, @gnu_indirect_function
-	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
-	jne	1f
-	call	__init_cpu_features
-1:	leaq	__memset_chk_sse2(%rip), %rax
-	testl	$bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+	leaq	__memset_chk_sse2(%rip), %rax
+	HAS_ARCH_FEATURE (AVX2_Usable)
 	jz	2f
 	leaq	__memset_chk_avx2(%rip), %rax
+#ifdef HAVE_AVX512_ASM_SUPPORT
+	HAS_ARCH_FEATURE (AVX512F_Usable)
+	jz	2f
+	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+	jz	2f
+	leaq	__memset_chk_avx512_no_vzeroupper(%rip), %rax
+#endif
 2:	ret
 END(__memset_chk)
 
diff --git a/sysdeps/x86_64/multiarch/rtld-memcmp.c b/sysdeps/x86_64/multiarch/rtld-memcmp.c
deleted file mode 100644
index 0f271356c2..0000000000
--- a/sysdeps/x86_64/multiarch/rtld-memcmp.c
+++ /dev/null
@@ -1 +0,0 @@
-#include "../rtld-memcmp.c"
diff --git a/sysdeps/x86_64/multiarch/rtld-memset.S b/sysdeps/x86_64/multiarch/rtld-memset.S
deleted file mode 100644
index 8092aa07da..0000000000
--- a/sysdeps/x86_64/multiarch/rtld-memset.S
+++ /dev/null
@@ -1 +0,0 @@
-#include "../rtld-memset.S"
diff --git a/sysdeps/x86_64/multiarch/sched_cpucount.c b/sysdeps/x86_64/multiarch/sched_cpucount.c
index 72ad7b01a8..b75aeb79b2 100644
--- a/sysdeps/x86_64/multiarch/sched_cpucount.c
+++ b/sysdeps/x86_64/multiarch/sched_cpucount.c
@@ -1,6 +1,6 @@
 /* Count bits in CPU set.  x86-64 multi-arch version.
    This file is part of the GNU C Library.
-   Copyright (C) 2008-2015 Free Software Foundation, Inc.
+   Copyright (C) 2008-2016 Free Software Foundation, Inc.
    Contributed by Ulrich Drepper <drepper@redhat.com>.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -33,4 +33,4 @@
 #undef __sched_cpucount
 
 libc_ifunc (__sched_cpucount,
-	    HAS_POPCOUNT ? popcount_cpucount : generic_cpucount);
+	    HAS_CPU_FEATURE (POPCOUNT) ? popcount_cpucount : generic_cpucount);
diff --git a/sysdeps/x86_64/multiarch/strcasestr.c b/sysdeps/x86_64/multiarch/strcasestr.c
deleted file mode 100644
index 834e656a2c..0000000000
--- a/sysdeps/x86_64/multiarch/strcasestr.c
+++ /dev/null
@@ -1,13 +0,0 @@
-/* Multiple versions of strcasestr
-   All versions must be listed in ifunc-impl-list.c.  */
-
-#include "init-arch.h"
-
-#define STRCASESTR __strcasestr_sse2
-
-#include "string/strcasestr.c"
-
-extern __typeof (__strcasestr_sse2) __strcasestr_sse2 attribute_hidden;
-
-libc_ifunc (__strcasestr,
-	    __strcasestr_sse2);
diff --git a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
index 81f1b40ef6..3a694d45c2 100644
--- a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
@@ -1,5 +1,5 @@
 /* strcat with SSE2
-   Copyright (C) 2011-2015 Free Software Foundation, Inc.
+   Copyright (C) 2011-2016 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
 
diff --git a/sysdeps/x86_64/multiarch/strcat-ssse3.S b/sysdeps/x86_64/multiarch/strcat-ssse3.S
index d7b990725a..96184d0f0f 100644
--- a/sysdeps/x86_64/multiarch/strcat-ssse3.S
+++ b/sysdeps/x86_64/multiarch/strcat-ssse3.S
@@ -1,5 +1,5 @@
 /* strcat with SSSE3
-   Copyright (C) 2011-2015 Free Software Foundation, Inc.
+   Copyright (C) 2011-2016 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
 
diff --git a/sysdeps/x86_64/multiarch/strcat.S b/sysdeps/x86_64/multiarch/strcat.S
index 44993fade5..7bb38e68ad 100644
--- a/sysdeps/x86_64/multiarch/strcat.S
+++ b/sysdeps/x86_64/multiarch/strcat.S
@@ -1,6 +1,6 @@
 /* Multiple versions of strcat
    All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2009-2015 Free Software Foundation, Inc.
+   Copyright (C) 2009-2016 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
 
@@ -47,14 +47,12 @@
 	.text
 ENTRY(STRCAT)
 	.type	STRCAT, @gnu_indirect_function
-	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
-	jne	1f
-	call	__init_cpu_features
-1:	leaq	STRCAT_SSE2_UNALIGNED(%rip), %rax
-	testl	$bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+	leaq	STRCAT_SSE2_UNALIGNED(%rip), %rax
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
 	jnz	2f
 	leaq	STRCAT_SSE2(%rip), %rax
-	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	HAS_CPU_FEATURE (SSSE3)
 	jz	2f
 	leaq	STRCAT_SSSE3(%rip), %rax
 2:	ret
diff --git a/sysdeps/x86_64/multiarch/strchr-sse2-no-bsf.S b/sysdeps/x86_64/multiarch/strchr-sse2-no-bsf.S
index 0398650a01..979d112b28 100644
--- a/sysdeps/x86_64/multiarch/strchr-sse2-no-bsf.S
+++ b/sysdeps/x86_64/multiarch/strchr-sse2-no-bsf.S
@@ -1,5 +1,5 @@
 /* strchr with SSE2 without bsf
-   Copyright (C) 2011-2015 Free Software Foundation, Inc.
+   Copyright (C) 2011-2016 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
 
diff --git a/sysdeps/x86_64/multiarch/strchr.S b/sysdeps/x86_64/multiarch/strchr.S
index af55fac398..40683ad32b 100644
--- a/sysdeps/x86_64/multiarch/strchr.S
+++ b/sysdeps/x86_64/multiarch/strchr.S
@@ -1,5 +1,5 @@
 /* Multiple versions of strchr
-   Copyright (C) 2009-2015 Free Software Foundation, Inc.
+   Copyright (C) 2009-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -25,11 +25,9 @@
 	.text
 ENTRY(strchr)
 	.type	strchr, @gnu_indirect_function
-	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
-	jne	1f
-	call	__init_cpu_features
-1:	leaq	__strchr_sse2(%rip), %rax
-2:	testl	$bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+	leaq	__strchr_sse2(%rip), %rax
+2:	HAS_ARCH_FEATURE (Slow_BSF)
 	jz	3f
 	leaq    __strchr_sse2_no_bsf(%rip), %rax
 3:	ret
diff --git a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
index 20b65fa775..bf555b4066 100644
--- a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
@@ -1,5 +1,5 @@
 /* strcmp with unaligned loads
-   Copyright (C) 2013-2015 Free Software Foundation, Inc.
+   Copyright (C) 2013-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -16,6 +16,8 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
+#if IS_IN (libc)
+
 #include "sysdep.h"
 
 ENTRY ( __strcmp_sse2_unaligned)
@@ -207,3 +209,5 @@ L(different):
 	subl	%ecx, %eax
 	ret
 END (__strcmp_sse2_unaligned)
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
index 4dff0a564b..70df84ae32 100644
--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
+++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S
@@ -1,5 +1,5 @@
 /* strcmp with SSE4.2
-   Copyright (C) 2009-2015 Free Software Foundation, Inc.
+   Copyright (C) 2009-2016 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
 
diff --git a/sysdeps/x86_64/multiarch/strcmp.S b/sysdeps/x86_64/multiarch/strcmp.S
index f50f26c393..0e4a113f61 100644
--- a/sysdeps/x86_64/multiarch/strcmp.S
+++ b/sysdeps/x86_64/multiarch/strcmp.S
@@ -1,5 +1,5 @@
 /* Multiple versions of strcmp
-   Copyright (C) 2009-2015 Free Software Foundation, Inc.
+   Copyright (C) 2009-2016 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
 
@@ -84,24 +84,20 @@
 	.text
 ENTRY(STRCMP)
 	.type	STRCMP, @gnu_indirect_function
-	/* Manually inlined call to __get_cpu_features.  */
-	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
-	jne	1f
-	call	__init_cpu_features
-1:
+	LOAD_RTLD_GLOBAL_RO_RDX
 #ifdef USE_AS_STRCMP
 	leaq	__strcmp_sse2_unaligned(%rip), %rax
-	testl   $bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
 	jnz     3f
 #else
-	testl	$bit_Slow_SSE4_2, __cpu_features+FEATURE_OFFSET+index_Slow_SSE4_2(%rip)
+	HAS_ARCH_FEATURE (Slow_SSE4_2)
 	jnz	2f
 	leaq	STRCMP_SSE42(%rip), %rax
-	testl	$bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
+	HAS_CPU_FEATURE (SSE4_2)
 	jnz	3f
 #endif
 2:	leaq	STRCMP_SSSE3(%rip), %rax
-	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	HAS_CPU_FEATURE (SSSE3)
 	jnz	3f
 	leaq	STRCMP_SSE2(%rip), %rax
 3:	ret
@@ -110,23 +106,17 @@ END(STRCMP)
 # ifdef USE_AS_STRCASECMP_L
 ENTRY(__strcasecmp)
 	.type	__strcasecmp, @gnu_indirect_function
-	/* Manually inlined call to __get_cpu_features.  */
-	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
-	jne	1f
-	call	__init_cpu_features
-1:
-#  ifdef HAVE_AVX_SUPPORT
+	LOAD_RTLD_GLOBAL_RO_RDX
 	leaq	__strcasecmp_avx(%rip), %rax
-	testl	$bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+	HAS_ARCH_FEATURE (AVX_Usable)
 	jnz	3f
-#  endif
-	testl	$bit_Slow_SSE4_2, __cpu_features+FEATURE_OFFSET+index_Slow_SSE4_2(%rip)
+	HAS_ARCH_FEATURE (Slow_SSE4_2)
 	jnz	2f
 	leaq	__strcasecmp_sse42(%rip), %rax
-	testl	$bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
+	HAS_CPU_FEATURE (SSE4_2)
 	jnz	3f
 2:	leaq	__strcasecmp_ssse3(%rip), %rax
-	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	HAS_CPU_FEATURE (SSSE3)
 	jnz	3f
 	leaq	__strcasecmp_sse2(%rip), %rax
 3:	ret
@@ -136,23 +126,17 @@ weak_alias (__strcasecmp, strcasecmp)
 # ifdef USE_AS_STRNCASECMP_L
 ENTRY(__strncasecmp)
 	.type	__strncasecmp, @gnu_indirect_function
-	/* Manually inlined call to __get_cpu_features.  */
-	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
-	jne	1f
-	call	__init_cpu_features
-1:
-#  ifdef HAVE_AVX_SUPPORT
+	LOAD_RTLD_GLOBAL_RO_RDX
 	leaq	__strncasecmp_avx(%rip), %rax
-	testl	$bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+	HAS_ARCH_FEATURE (AVX_Usable)
 	jnz	3f
-#  endif
-	testl	$bit_Slow_SSE4_2, __cpu_features+FEATURE_OFFSET+index_Slow_SSE4_2(%rip)
+	HAS_ARCH_FEATURE (Slow_SSE4_2)
 	jnz	2f
 	leaq	__strncasecmp_sse42(%rip), %rax
-	testl	$bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
+	HAS_CPU_FEATURE (SSE4_2)
 	jnz	3f
 2:	leaq	__strncasecmp_ssse3(%rip), %rax
-	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	HAS_CPU_FEATURE (SSSE3)
 	jnz	3f
 	leaq	__strncasecmp_sse2(%rip), %rax
 3:	ret
@@ -167,16 +151,14 @@ weak_alias (__strncasecmp, strncasecmp)
 # include "strcmp-sse42.S"
 
 
-# ifdef HAVE_AVX_SUPPORT
-#  if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
-#   define LABEL(l) .L##l##_avx
-#   define GLABEL(l) l##_avx
-#   define USE_AVX 1
-#   undef STRCMP_SSE42
-#   define STRCMP_SSE42 STRCMP_AVX
-#   define SECTION avx
-#   include "strcmp-sse42.S"
-#  endif
+# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+#  define LABEL(l) .L##l##_avx
+#  define GLABEL(l) l##_avx
+#  define USE_AVX 1
+#  undef STRCMP_SSE42
+#  define STRCMP_SSE42 STRCMP_AVX
+#  define SECTION avx
+#  include "strcmp-sse42.S"
 # endif
 
 
diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
index 8f03d1db24..caa74be2c2 100644
--- a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
@@ -1,5 +1,5 @@
 /* strcpy with SSE2 and unaligned load
-   Copyright (C) 2011-2015 Free Software Foundation, Inc.
+   Copyright (C) 2011-2016 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
 
diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
index 1f22c9a918..5bdb7671cf 100644
--- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S
+++ b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
@@ -1,5 +1,5 @@
 /* strcpy with SSSE3
-   Copyright (C) 2011-2015 Free Software Foundation, Inc.
+   Copyright (C) 2011-2016 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
 
diff --git a/sysdeps/x86_64/multiarch/strcpy.S b/sysdeps/x86_64/multiarch/strcpy.S
index 9464ee8b63..024f6ef899 100644
--- a/sysdeps/x86_64/multiarch/strcpy.S
+++ b/sysdeps/x86_64/multiarch/strcpy.S
@@ -1,6 +1,6 @@
 /* Multiple versions of strcpy
    All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2009-2015 Free Software Foundation, Inc.
+   Copyright (C) 2009-2016 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
 
@@ -61,14 +61,12 @@
 	.text
 ENTRY(STRCPY)
 	.type	STRCPY, @gnu_indirect_function
-	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
-	jne	1f
-	call	__init_cpu_features
-1:	leaq	STRCPY_SSE2_UNALIGNED(%rip), %rax
-	testl	$bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+	leaq	STRCPY_SSE2_UNALIGNED(%rip), %rax
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
 	jnz	2f
 	leaq	STRCPY_SSE2(%rip), %rax
-	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	HAS_CPU_FEATURE (SSSE3)
 	jz	2f
 	leaq	STRCPY_SSSE3(%rip), %rax
 2:	ret
diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c
index 60b2ed7a3f..91b804ddd6 100644
--- a/sysdeps/x86_64/multiarch/strcspn-c.c
+++ b/sysdeps/x86_64/multiarch/strcspn-c.c
@@ -1,5 +1,5 @@
 /* strcspn with SSE4.2 intrinsics
-   Copyright (C) 2009-2015 Free Software Foundation, Inc.
+   Copyright (C) 2009-2016 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
 
diff --git a/sysdeps/x86_64/multiarch/strcspn.S b/sysdeps/x86_64/multiarch/strcspn.S
index 95e882c443..8e7ff1c663 100644
--- a/sysdeps/x86_64/multiarch/strcspn.S
+++ b/sysdeps/x86_64/multiarch/strcspn.S
@@ -1,6 +1,6 @@
 /* Multiple versions of strcspn
    All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2009-2015 Free Software Foundation, Inc.
+   Copyright (C) 2009-2016 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
 
@@ -19,9 +19,6 @@
    <http://www.gnu.org/licenses/>.  */
 
 #include <config.h>
-
-#ifdef HAVE_SSE4_SUPPORT
-
 #include <sysdep.h>
 #include <init-arch.h>
 
@@ -45,11 +42,9 @@
 	.text
 ENTRY(STRCSPN)
 	.type	STRCSPN, @gnu_indirect_function
-	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
-	jne	1f
-	call	__init_cpu_features
-1:	leaq	STRCSPN_SSE2(%rip), %rax
-	testl	$bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+	leaq	STRCSPN_SSE2(%rip), %rax
+	HAS_CPU_FEATURE (SSE4_2)
 	jz	2f
 	leaq	STRCSPN_SSE42(%rip), %rax
 2:	ret
@@ -66,7 +61,6 @@ END(STRCSPN)
 # define END(name) \
 	cfi_endproc; .size STRCSPN_SSE2, .-STRCSPN_SSE2
 #endif
-#endif /* HAVE_SSE4_SUPPORT */
 
 #ifdef USE_AS_STRPBRK
 #include "../strpbrk.S"
diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c
index 6b0c80aa43..9675f9360e 100644
--- a/sysdeps/x86_64/multiarch/strspn-c.c
+++ b/sysdeps/x86_64/multiarch/strspn-c.c
@@ -1,5 +1,5 @@
 /* strspn with SSE4.2 intrinsics
-   Copyright (C) 2009-2015 Free Software Foundation, Inc.
+   Copyright (C) 2009-2016 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
 
diff --git a/sysdeps/x86_64/multiarch/strspn.S b/sysdeps/x86_64/multiarch/strspn.S
index b734c1729a..4942826b24 100644
--- a/sysdeps/x86_64/multiarch/strspn.S
+++ b/sysdeps/x86_64/multiarch/strspn.S
@@ -1,6 +1,6 @@
 /* Multiple versions of strspn
    All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2009-2015 Free Software Foundation, Inc.
+   Copyright (C) 2009-2016 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
 
@@ -19,9 +19,6 @@
    <http://www.gnu.org/licenses/>.  */
 
 #include <config.h>
-
-#ifdef HAVE_SSE4_SUPPORT
-
 #include <sysdep.h>
 #include <init-arch.h>
 
@@ -30,11 +27,9 @@
 	.text
 ENTRY(strspn)
 	.type	strspn, @gnu_indirect_function
-	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
-	jne	1f
-	call	__init_cpu_features
-1:	leaq	__strspn_sse2(%rip), %rax
-	testl	$bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+	leaq	__strspn_sse2(%rip), %rax
+	HAS_CPU_FEATURE (SSE4_2)
 	jz	2f
 	leaq	__strspn_sse42(%rip), %rax
 2:	ret
@@ -52,6 +47,4 @@ END(strspn)
 	cfi_endproc; .size __strspn_sse2, .-__strspn_sse2
 #endif
 
-#endif /* HAVE_SSE4_SUPPORT */
-
 #include "../strspn.S"
diff --git a/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S
index 4f0e2ebdab..4ead1dfaf5 100644
--- a/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S
@@ -1,5 +1,5 @@
 /* strstr with unaligned loads
-   Copyright (C) 2009-2015 Free Software Foundation, Inc.
+   Copyright (C) 2009-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c
index 507994bd38..eecba2243e 100644
--- a/sysdeps/x86_64/multiarch/strstr.c
+++ b/sysdeps/x86_64/multiarch/strstr.c
@@ -1,6 +1,6 @@
 /* Multiple versions of strstr.
    All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2012-2015 Free Software Foundation, Inc.
+   Copyright (C) 2012-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -41,7 +41,10 @@ extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden;
 /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
    ifunc symbol properly.  */
 extern __typeof (__redirect_strstr) __libc_strstr;
-libc_ifunc (__libc_strstr, HAS_FAST_UNALIGNED_LOAD ? __strstr_sse2_unaligned : __strstr_sse2)
+libc_ifunc (__libc_strstr,
+	    HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+	    ? __strstr_sse2_unaligned
+	    : __strstr_sse2)
 
 #undef strstr
 strong_alias (__libc_strstr, strstr)
diff --git a/sysdeps/x86_64/multiarch/test-multiarch.c b/sysdeps/x86_64/multiarch/test-multiarch.c
index 949d26e550..4eb0c16cd8 100644
--- a/sysdeps/x86_64/multiarch/test-multiarch.c
+++ b/sysdeps/x86_64/multiarch/test-multiarch.c
@@ -1,6 +1,6 @@
 /* Test CPU feature data.
    This file is part of the GNU C Library.
-   Copyright (C) 2012-2015 Free Software Foundation, Inc.
+   Copyright (C) 2012-2016 Free Software Foundation, Inc.
 
    The GNU C Library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
@@ -75,12 +75,18 @@ do_test (int argc, char **argv)
   int fails;
 
   get_cpuinfo ();
-  fails = check_proc ("avx", HAS_AVX, "HAS_AVX");
-  fails += check_proc ("fma4", HAS_FMA4, "HAS_FMA4");
-  fails += check_proc ("sse4_2", HAS_SSE4_2, "HAS_SSE4_2");
-  fails += check_proc ("sse4_1", HAS_SSE4_1, "HAS_SSE4_1");
-  fails += check_proc ("ssse3", HAS_SSSE3, "HAS_SSSE3");
-  fails += check_proc ("popcnt", HAS_POPCOUNT, "HAS_POPCOUNT");
+  fails = check_proc ("avx", HAS_ARCH_FEATURE (AVX_Usable),
+		      "HAS_ARCH_FEATURE (AVX_Usable)");
+  fails += check_proc ("fma4", HAS_ARCH_FEATURE (FMA4_Usable),
+		       "HAS_ARCH_FEATURE (FMA4_Usable)");
+  fails += check_proc ("sse4_2", HAS_CPU_FEATURE (SSE4_2),
+		       "HAS_CPU_FEATURE (SSE4_2)");
+  fails += check_proc ("sse4_1", HAS_CPU_FEATURE (SSE4_1)
+		       , "HAS_CPU_FEATURE (SSE4_1)");
+  fails += check_proc ("ssse3", HAS_CPU_FEATURE (SSSE3),
+		       "HAS_CPU_FEATURE (SSSE3)");
+  fails += check_proc ("popcnt", HAS_CPU_FEATURE (POPCOUNT),
+		       "HAS_CPU_FEATURE (POPCOUNT)");
 
   printf ("%d differences between /proc/cpuinfo and glibc code.\n", fails);
 
diff --git a/sysdeps/x86_64/multiarch/varshift.c b/sysdeps/x86_64/multiarch/varshift.c
index 0007ef79e5..7921be5b57 100644
--- a/sysdeps/x86_64/multiarch/varshift.c
+++ b/sysdeps/x86_64/multiarch/varshift.c
@@ -1,5 +1,5 @@
 /* Helper for variable shifts of SSE registers.
-   Copyright (C) 2010-2015 Free Software Foundation, Inc.
+   Copyright (C) 2010-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/multiarch/varshift.h b/sysdeps/x86_64/multiarch/varshift.h
index 30ace3d914..7b27d0e9dd 100644
--- a/sysdeps/x86_64/multiarch/varshift.h
+++ b/sysdeps/x86_64/multiarch/varshift.h
@@ -1,5 +1,5 @@
 /* Helper for variable shifts of SSE registers.
-   Copyright (C) 2010-2015 Free Software Foundation, Inc.
+   Copyright (C) 2010-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/multiarch/wcscpy-ssse3.S b/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
index 8097862574..341e57a5ca 100644
--- a/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
+++ b/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
@@ -1,5 +1,5 @@
 /* wcscpy with SSSE3
-   Copyright (C) 2011-2015 Free Software Foundation, Inc.
+   Copyright (C) 2011-2016 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
 
diff --git a/sysdeps/x86_64/multiarch/wcscpy.S b/sysdeps/x86_64/multiarch/wcscpy.S
index ff2f5a73d1..8e7270b9c7 100644
--- a/sysdeps/x86_64/multiarch/wcscpy.S
+++ b/sysdeps/x86_64/multiarch/wcscpy.S
@@ -1,6 +1,6 @@
 /* Multiple versions of wcscpy
    All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2011-2015 Free Software Foundation, Inc.
+   Copyright (C) 2011-2016 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
 
@@ -27,11 +27,8 @@
 	.text
 ENTRY(wcscpy)
 	.type	wcscpy, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features(%rip)
-	jne	1f
-	call	__init_cpu_features
-
-1:	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+	HAS_CPU_FEATURE (SSSE3)
 	jnz	2f
 	leaq	__wcscpy_sse2(%rip), %rax
 	ret
diff --git a/sysdeps/x86_64/multiarch/wmemcmp.S b/sysdeps/x86_64/multiarch/wmemcmp.S
index 109e2457fe..b510f756e2 100644
--- a/sysdeps/x86_64/multiarch/wmemcmp.S
+++ b/sysdeps/x86_64/multiarch/wmemcmp.S
@@ -1,6 +1,6 @@
 /* Multiple versions of wmemcmp
    All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2011-2015 Free Software Foundation, Inc.
+   Copyright (C) 2011-2016 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
 
@@ -26,16 +26,13 @@
 	.text
 ENTRY(wmemcmp)
 	.type	wmemcmp, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features(%rip)
-	jne	1f
-	call	__init_cpu_features
-
-1:	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+	HAS_CPU_FEATURE (SSSE3)
 	jnz	2f
 	leaq	__wmemcmp_sse2(%rip), %rax
 	ret
 
-2:	testl	$bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+2:	HAS_CPU_FEATURE (SSE4_1)
 	jz	3f
 	leaq	__wmemcmp_sse4_1(%rip), %rax
 	ret
diff --git a/sysdeps/x86_64/nptl/Makefile b/sysdeps/x86_64/nptl/Makefile
index 14fb69a94d..9b64b533ee 100644
--- a/sysdeps/x86_64/nptl/Makefile
+++ b/sysdeps/x86_64/nptl/Makefile
@@ -1,4 +1,4 @@
-# Copyright (C) 2002-2015 Free Software Foundation, Inc.
+# Copyright (C) 2002-2016 Free Software Foundation, Inc.
 # This file is part of the GNU C Library.
 
 # The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/nptl/pthread_spin_lock.S b/sysdeps/x86_64/nptl/pthread_spin_lock.S
index d1a9b68028..b871241617 100644
--- a/sysdeps/x86_64/nptl/pthread_spin_lock.S
+++ b/sysdeps/x86_64/nptl/pthread_spin_lock.S
@@ -1,4 +1,4 @@
-/* Copyright (C) 2012-2015 Free Software Foundation, Inc.
+/* Copyright (C) 2012-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -16,11 +16,9 @@
    <http://www.gnu.org/licenses/>.  */
 
 #include <lowlevellock.h>
+#include <sysdep.h>
 
-	.globl	pthread_spin_lock
-	.type	pthread_spin_lock,@function
-	.align	16
-pthread_spin_lock:
+ENTRY(pthread_spin_lock)
 1:	LOCK
 	decl	0(%rdi)
 	jne	2f
@@ -33,4 +31,4 @@ pthread_spin_lock:
 	cmpl	$0, 0(%rdi)
 	jg	1b
 	jmp	2b
-	.size	pthread_spin_lock,.-pthread_spin_lock
+END(pthread_spin_lock)
diff --git a/sysdeps/x86_64/nptl/pthread_spin_trylock.S b/sysdeps/x86_64/nptl/pthread_spin_trylock.S
index 6b58929ef4..c9c53171fe 100644
--- a/sysdeps/x86_64/nptl/pthread_spin_trylock.S
+++ b/sysdeps/x86_64/nptl/pthread_spin_trylock.S
@@ -1,4 +1,4 @@
-/* Copyright (C) 2002-2015 Free Software Foundation, Inc.
+/* Copyright (C) 2002-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.
 
@@ -17,6 +17,7 @@
    <http://www.gnu.org/licenses/>.  */
 
 #include <pthread-errnos.h>
+#include <sysdep.h>
 
 
 #ifdef UP
@@ -25,10 +26,7 @@
 # define LOCK lock
 #endif
 
-	.globl	pthread_spin_trylock
-	.type	pthread_spin_trylock,@function
-	.align	16
-pthread_spin_trylock:
+ENTRY(pthread_spin_trylock)
 	movl	$1, %eax
 	xorl	%ecx, %ecx
 	LOCK
@@ -36,4 +34,4 @@ pthread_spin_trylock:
 	movl	$EBUSY, %eax
 	cmovel	%ecx, %eax
 	retq
-	.size	pthread_spin_trylock,.-pthread_spin_trylock
+END(pthread_spin_trylock)
diff --git a/sysdeps/x86_64/nptl/pthread_spin_unlock.S b/sysdeps/x86_64/nptl/pthread_spin_unlock.S
index 74d7dd6430..188de2e8cb 100644
--- a/sysdeps/x86_64/nptl/pthread_spin_unlock.S
+++ b/sysdeps/x86_64/nptl/pthread_spin_unlock.S
@@ -1,4 +1,4 @@
-/* Copyright (C) 2002-2015 Free Software Foundation, Inc.
+/* Copyright (C) 2002-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.
 
@@ -16,14 +16,13 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-	.globl	pthread_spin_unlock
-	.type	pthread_spin_unlock,@function
-	.align	16
-pthread_spin_unlock:
+#include <sysdep.h>
+
+ENTRY(pthread_spin_unlock)
 	movl	$1, (%rdi)
 	xorl	%eax, %eax
 	retq
-	.size	pthread_spin_unlock,.-pthread_spin_unlock
+END(pthread_spin_unlock)
 
 	/* The implementation of pthread_spin_init is identical.  */
 	.globl	pthread_spin_init
diff --git a/sysdeps/x86_64/nptl/pthreaddef.h b/sysdeps/x86_64/nptl/pthreaddef.h
index 9c7130dae2..9397efc631 100644
--- a/sysdeps/x86_64/nptl/pthreaddef.h
+++ b/sysdeps/x86_64/nptl/pthreaddef.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2002-2015 Free Software Foundation, Inc.
+/* Copyright (C) 2002-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.
 
diff --git a/sysdeps/x86_64/nptl/tcb-offsets.sym b/sysdeps/x86_64/nptl/tcb-offsets.sym
index 729d1da38f..aeb752673a 100644
--- a/sysdeps/x86_64/nptl/tcb-offsets.sym
+++ b/sysdeps/x86_64/nptl/tcb-offsets.sym
@@ -16,7 +16,6 @@ VGETCPU_CACHE_OFFSET	offsetof (tcbhead_t, vgetcpu_cache)
 #ifndef __ASSUME_PRIVATE_FUTEX
 PRIVATE_FUTEX		offsetof (tcbhead_t, private_futex)
 #endif
-RTLD_SAVESPACE_SSE	offsetof (tcbhead_t, rtld_savespace_sse)
 
 -- Not strictly offsets, but these values are also used in the TCB.
 TCB_CANCELSTATE_BITMASK	 CANCELSTATE_BITMASK
diff --git a/sysdeps/x86_64/nptl/tls.h b/sysdeps/x86_64/nptl/tls.h
index d7543c651f..2b061a07c6 100644
--- a/sysdeps/x86_64/nptl/tls.h
+++ b/sysdeps/x86_64/nptl/tls.h
@@ -1,5 +1,5 @@
 /* Definition for thread-local data handling.  nptl/x86_64 version.
-   Copyright (C) 2002-2015 Free Software Foundation, Inc.
+   Copyright (C) 2002-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -67,14 +67,15 @@ typedef struct
 # else
   int __glibc_reserved1;
 # endif
-  int rtld_must_xmm_save;
+  int __glibc_unused1;
   /* Reservation of some values for the TM ABI.  */
   void *__private_tm[4];
   /* GCC split stack support.  */
   void *__private_ss;
   long int __glibc_reserved2;
-  /* Have space for the post-AVX register size.  */
-  __128bits rtld_savespace_sse[8][4] __attribute__ ((aligned (32)));
+  /* Must be kept even if it is no longer used by glibc since programs,
+     like AddressSanitizer, depend on the size of tcbhead_t.  */
+  __128bits __glibc_unused2[8][4] __attribute__ ((aligned (32)));
 
   void *__padding[8];
 } tcbhead_t;
@@ -384,41 +385,6 @@ typedef struct
 # define THREAD_GSCOPE_WAIT() \
   GL(dl_wait_lookup_done) ()
 
-
-# ifdef SHARED
-/* Defined in dl-trampoline.S.  */
-extern void _dl_x86_64_save_sse (void);
-extern void _dl_x86_64_restore_sse (void);
-
-# define RTLD_CHECK_FOREIGN_CALL \
-  (THREAD_GETMEM (THREAD_SELF, header.rtld_must_xmm_save) != 0)
-
-/* NB: Don't use the xchg operation because that would imply a lock
-   prefix which is expensive and unnecessary.  The cache line is also
-   not contested at all.  */
-#  define RTLD_ENABLE_FOREIGN_CALL \
-  int old_rtld_must_xmm_save = THREAD_GETMEM (THREAD_SELF,		      \
-					      header.rtld_must_xmm_save);     \
-  THREAD_SETMEM (THREAD_SELF, header.rtld_must_xmm_save, 1)
-
-#  define RTLD_PREPARE_FOREIGN_CALL \
-  do if (THREAD_GETMEM (THREAD_SELF, header.rtld_must_xmm_save))	      \
-    {									      \
-      _dl_x86_64_save_sse ();						      \
-      THREAD_SETMEM (THREAD_SELF, header.rtld_must_xmm_save, 0);	      \
-    }									      \
-  while (0)
-
-#  define RTLD_FINALIZE_FOREIGN_CALL \
-  do {									      \
-    if (THREAD_GETMEM (THREAD_SELF, header.rtld_must_xmm_save) == 0)	      \
-      _dl_x86_64_restore_sse ();					      \
-    THREAD_SETMEM (THREAD_SELF, header.rtld_must_xmm_save,		      \
-		   old_rtld_must_xmm_save);				      \
-  } while (0)
-# endif
-
-
 #endif /* __ASSEMBLER__ */
 
 #endif	/* tls.h */
diff --git a/sysdeps/x86_64/rawmemchr.S b/sysdeps/x86_64/rawmemchr.S
index ec2cb9c76c..f90b7921a1 100644
--- a/sysdeps/x86_64/rawmemchr.S
+++ b/sysdeps/x86_64/rawmemchr.S
@@ -1,6 +1,6 @@
 /* fast SSE2 memchr with 64 byte loop and pmaxub instruction using
 
-   Copyright (C) 2011-2015 Free Software Foundation, Inc.
+   Copyright (C) 2011-2016 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
 
diff --git a/sysdeps/x86_64/rshift.S b/sysdeps/x86_64/rshift.S
index 4166e612d5..c88c6d82bb 100644
--- a/sysdeps/x86_64/rshift.S
+++ b/sysdeps/x86_64/rshift.S
@@ -1,5 +1,5 @@
 /* x86-64 __mpn_rshift --
-   Copyright (C) 2007-2015 Free Software Foundation, Inc.
+   Copyright (C) 2007-2016 Free Software Foundation, Inc.
    This file is part of the GNU MP Library.
 
    The GNU MP Library is free software; you can redistribute it and/or modify
diff --git a/sysdeps/x86_64/rtld-memcmp.c b/sysdeps/x86_64/rtld-memcmp.c
deleted file mode 100644
index 2ee40328b8..0000000000
--- a/sysdeps/x86_64/rtld-memcmp.c
+++ /dev/null
@@ -1 +0,0 @@
-#include <string/memcmp.c>
diff --git a/sysdeps/x86_64/rtld-strchr.S b/sysdeps/x86_64/rtld-strchr.S
deleted file mode 100644
index cc694d71b6..0000000000
--- a/sysdeps/x86_64/rtld-strchr.S
+++ /dev/null
@@ -1,288 +0,0 @@
-/* strchr (str, ch) -- Return pointer to first occurrence of CH in STR.
-   For AMD x86-64.
-   Copyright (C) 2002-2015 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-#include "asm-syntax.h"
-
-
-	.text
-ENTRY (strchr)
-
-	/* Before we start with the main loop we process single bytes
-	   until the source pointer is aligned.  This has two reasons:
-	   1. aligned 64-bit memory access is faster
-	   and (more important)
-	   2. we process in the main loop 64 bit in one step although
-	      we don't know the end of the string.  But accessing at
-	      8-byte alignment guarantees that we never access illegal
-	      memory if this would not also be done by the trivial
-	      implementation (this is because all processor inherent
-	      boundaries are multiples of 8).  */
-
-	movq	%rdi, %rdx
-	andl	$7, %edx	/* Mask alignment bits  */
-	movq	%rdi, %rax	/* duplicate destination.  */
-	jz	1f		/* aligned => start loop */
-	neg	%edx
-	addl	$8, %edx	/* Align to 8 bytes.  */
-
-	/* Search the first bytes directly.  */
-0:	movb	(%rax), %cl	/* load byte  */
-	cmpb	%cl,%sil	/* compare byte.  */
-	je	6f		/* target found */
-	testb	%cl,%cl		/* is byte NUL? */
-	je	7f		/* yes => return NULL */
-	incq	%rax		/* increment pointer */
-	decl	%edx
-	jnz	0b
-
-
-1:
-	/* At the moment %rsi contains C.  What we need for the
-	   algorithm is C in all bytes of the register.  Avoid
-	   operations on 16 bit words because these require an
-	   prefix byte (and one more cycle).  */
-	/* Populate 8 bit data to full 64-bit.  */
-	movabs	$0x0101010101010101,%r9
-	movzbl	%sil,%edx
-	imul	%rdx,%r9
-
-	movq $0xfefefefefefefeff, %r8 /* Save magic.  */
-
-      /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to
-	 change any of the hole bits of LONGWORD.
-
-	 1) Is this safe?  Will it catch all the zero bytes?
-	 Suppose there is a byte with all zeros.  Any carry bits
-	 propagating from its left will fall into the hole at its
-	 least significant bit and stop.  Since there will be no
-	 carry from its most significant bit, the LSB of the
-	 byte to the left will be unchanged, and the zero will be
-	 detected.
-
-	 2) Is this worthwhile?  Will it ignore everything except
-	 zero bytes?  Suppose every byte of QUARDWORD has a bit set
-	 somewhere.  There will be a carry into bit 8.	If bit 8
-	 is set, this will carry into bit 16.  If bit 8 is clear,
-	 one of bits 9-15 must be set, so there will be a carry
-	 into bit 16.  Similarly, there will be a carry into bit
-	 24 tec..  If one of bits 54-63 is set, there will be a carry
-	 into bit 64 (=carry flag), so all of the hole bits will
-	 be changed.
-
-	 3) But wait!  Aren't we looking for C, not zero?
-	 Good point.  So what we do is XOR LONGWORD with a longword,
-	 each of whose bytes is C.  This turns each byte that is C
-	 into a zero.  */
-
-	.p2align 4
-4:
-	/* Main Loop is unrolled 4 times.  */
-	/* First unroll.  */
-	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
-	addq $8,%rax		/* adjust pointer for next word */
-	movq %r8, %rdx		/* magic value */
-	xorq %r9, %rcx		/* XOR with qword c|...|c => bytes of str == c
-				   are now 0 */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 3f			/* highest byte is NUL => return pointer */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 3f			/* found c => return pointer */
-
-	/* The quadword we looked at does not contain the value we're looking
-	   for.  Let's search now whether we have reached the end of the
-	   string.  */
-	xorq %r9, %rcx		/* restore original dword without reload */
-	movq %r8, %rdx		/* magic value */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 7f			/* highest byte is NUL => return NULL */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 7f			/* found NUL => return NULL */
-
-	/* Second unroll.  */
-	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
-	addq $8,%rax		/* adjust pointer for next word */
-	movq %r8, %rdx		/* magic value */
-	xorq %r9, %rcx		/* XOR with qword c|...|c => bytes of str == c
-				   are now 0 */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 3f			/* highest byte is NUL => return pointer */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 3f			/* found c => return pointer */
-
-	/* The quadword we looked at does not contain the value we're looking
-	   for.  Let's search now whether we have reached the end of the
-	   string.  */
-	xorq %r9, %rcx		/* restore original dword without reload */
-	movq %r8, %rdx		/* magic value */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 7f			/* highest byte is NUL => return NULL */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 7f			/* found NUL => return NULL */
-	/* Third unroll.  */
-	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
-	addq $8,%rax		/* adjust pointer for next word */
-	movq %r8, %rdx		/* magic value */
-	xorq %r9, %rcx		/* XOR with qword c|...|c => bytes of str == c
-				   are now 0 */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 3f			/* highest byte is NUL => return pointer */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 3f			/* found c => return pointer */
-
-	/* The quadword we looked at does not contain the value we're looking
-	   for.  Let's search now whether we have reached the end of the
-	   string.  */
-	xorq %r9, %rcx		/* restore original dword without reload */
-	movq %r8, %rdx		/* magic value */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 7f			/* highest byte is NUL => return NULL */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 7f			/* found NUL => return NULL */
-	/* Fourth unroll.  */
-	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
-	addq $8,%rax		/* adjust pointer for next word */
-	movq %r8, %rdx		/* magic value */
-	xorq %r9, %rcx		/* XOR with qword c|...|c => bytes of str == c
-				   are now 0 */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 3f			/* highest byte is NUL => return pointer */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 3f			/* found c => return pointer */
-
-	/* The quadword we looked at does not contain the value we're looking
-	   for.  Let's search now whether we have reached the end of the
-	   string.  */
-	xorq %r9, %rcx		/* restore original dword without reload */
-	movq %r8, %rdx		/* magic value */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 7f			/* highest byte is NUL => return NULL */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jz 4b			/* no NUL found => restart loop */
-
-
-7:	/* Return NULL.  */
-	xorl %eax, %eax
-	retq
-
-
-	/* We now scan for the byte in which the character was matched.
-	   But we have to take care of the case that a NUL char is
-	   found before this in the dword.  Note that we XORed %rcx
-	   with the byte we're looking for, therefore the tests below look
-	   reversed.  */
-
-
-	.p2align 4		/* Align, it's a jump target.  */
-3:	movq	%r9,%rdx	/* move to %rdx so that we can access bytes */
-	subq	$8,%rax		/* correct pointer increment.  */
-	testb %cl, %cl		/* is first byte C? */
-	jz 6f			/* yes => return pointer */
-	cmpb %dl, %cl		/* is first byte NUL? */
-	je 7b			/* yes => return NULL */
-	incq %rax		/* increment pointer */
-
-	testb %ch, %ch		/* is second byte C? */
-	jz 6f			/* yes => return pointer */
-	cmpb %dl, %ch		/* is second byte NUL? */
-	je 7b			/* yes => return NULL? */
-	incq %rax		/* increment pointer */
-
-	shrq $16, %rcx		/* make upper bytes accessible */
-	testb %cl, %cl		/* is third byte C? */
-	jz 6f			/* yes => return pointer */
-	cmpb %dl, %cl		/* is third byte NUL? */
-	je 7b			/* yes => return NULL */
-	incq %rax		/* increment pointer */
-
-	testb %ch, %ch		/* is fourth byte C? */
-	jz 6f			/* yes => return pointer */
-	cmpb %dl, %ch		/* is fourth byte NUL? */
-	je 7b			/* yes => return NULL? */
-	incq %rax		/* increment pointer */
-
-	shrq $16, %rcx		/* make upper bytes accessible */
-	testb %cl, %cl		/* is fifth byte C? */
-	jz 6f			/* yes => return pointer */
-	cmpb %dl, %cl		/* is fifth byte NUL? */
-	je 7b			/* yes => return NULL */
-	incq %rax		/* increment pointer */
-
-	testb %ch, %ch		/* is sixth byte C? */
-	jz 6f			/* yes => return pointer */
-	cmpb %dl, %ch		/* is sixth byte NUL? */
-	je 7b			/* yes => return NULL? */
-	incq %rax		/* increment pointer */
-
-	shrq $16, %rcx		/* make upper bytes accessible */
-	testb %cl, %cl		/* is seventh byte C? */
-	jz 6f			/* yes => return pointer */
-	cmpb %dl, %cl		/* is seventh byte NUL? */
-	je 7b			/* yes => return NULL */
-
-	/* It must be in the eigth byte and it cannot be NUL.  */
-	incq %rax
-
-6:
-	nop
-	retq
-END (strchr)
-
-weak_alias (strchr, index)
-libc_hidden_builtin_def (strchr)
diff --git a/sysdeps/x86_64/rtld-strlen.S b/sysdeps/x86_64/rtld-strlen.S
deleted file mode 100644
index 1328652154..0000000000
--- a/sysdeps/x86_64/rtld-strlen.S
+++ /dev/null
@@ -1,136 +0,0 @@
-/* strlen(str) -- determine the length of the string STR.
-   Copyright (C) 2002-2015 Free Software Foundation, Inc.
-   Based on i486 version contributed by Ulrich Drepper <drepper@redhat.com>.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-#include "asm-syntax.h"
-
-
-	.text
-ENTRY (strlen)
-	movq %rdi, %rcx		/* Duplicate source pointer. */
-	andl $7, %ecx		/* mask alignment bits */
-	movq %rdi, %rax		/* duplicate destination.  */
-	jz 1f			/* aligned => start loop */
-
-	neg %ecx		/* We need to align to 8 bytes.  */
-	addl $8,%ecx
-	/* Search the first bytes directly.  */
-0:	cmpb $0x0,(%rax)	/* is byte NUL? */
-	je 2f			/* yes => return */
-	incq %rax		/* increment pointer */
-	decl %ecx
-	jnz 0b
-
-1:	movq $0xfefefefefefefeff,%r8 /* Save magic.  */
-
-	.p2align 4		/* Align loop.  */
-4:	/* Main Loop is unrolled 4 times.  */
-	/* First unroll.  */
-	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
-	addq $8,%rax		/* adjust pointer for next word */
-	movq %r8, %rdx		/* magic value */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 3f			/* highest byte is NUL => return pointer */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 3f			/* found NUL => return pointer */
-
-	/* Second unroll.  */
-	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
-	addq $8,%rax		/* adjust pointer for next word */
-	movq %r8, %rdx		/* magic value */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 3f			/* highest byte is NUL => return pointer */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 3f			/* found NUL => return pointer */
-
-	/* Third unroll.  */
-	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
-	addq $8,%rax		/* adjust pointer for next word */
-	movq %r8, %rdx		/* magic value */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 3f			/* highest byte is NUL => return pointer */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 3f			/* found NUL => return pointer */
-
-	/* Fourth unroll.  */
-	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
-	addq $8,%rax		/* adjust pointer for next word */
-	movq %r8, %rdx		/* magic value */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 3f			/* highest byte is NUL => return pointer */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jz 4b			/* no NUL found => continue loop */
-
-	.p2align 4		/* Align, it's a jump target.  */
-3:	subq $8,%rax		/* correct pointer increment.  */
-
-	testb %cl, %cl		/* is first byte NUL? */
-	jz 2f			/* yes => return */
-	incq %rax		/* increment pointer */
-
-	testb %ch, %ch		/* is second byte NUL? */
-	jz 2f			/* yes => return */
-	incq %rax		/* increment pointer */
-
-	testl $0x00ff0000, %ecx /* is third byte NUL? */
-	jz 2f			/* yes => return pointer */
-	incq %rax		/* increment pointer */
-
-	testl $0xff000000, %ecx /* is fourth byte NUL? */
-	jz 2f			/* yes => return pointer */
-	incq %rax		/* increment pointer */
-
-	shrq $32, %rcx		/* look at other half.  */
-
-	testb %cl, %cl		/* is first byte NUL? */
-	jz 2f			/* yes => return */
-	incq %rax		/* increment pointer */
-
-	testb %ch, %ch		/* is second byte NUL? */
-	jz 2f			/* yes => return */
-	incq %rax		/* increment pointer */
-
-	testl $0xff0000, %ecx	/* is third byte NUL? */
-	jz 2f			/* yes => return pointer */
-	incq %rax		/* increment pointer */
-2:
-	subq %rdi, %rax		/* compute difference to string start */
-	ret
-END (strlen)
-libc_hidden_builtin_def (strlen)
diff --git a/sysdeps/x86_64/sched_cpucount.c b/sysdeps/x86_64/sched_cpucount.c
index 72e67aa999..0834e711b3 100644
--- a/sysdeps/x86_64/sched_cpucount.c
+++ b/sysdeps/x86_64/sched_cpucount.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2015 Free Software Foundation, Inc.
+/* Copyright (C) 2007-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/setjmp.S b/sysdeps/x86_64/setjmp.S
index 774aaf1e8d..3e93967c2f 100644
--- a/sysdeps/x86_64/setjmp.S
+++ b/sysdeps/x86_64/setjmp.S
@@ -1,5 +1,5 @@
 /* setjmp for x86-64.
-   Copyright (C) 2001-2015 Free Software Foundation, Inc.
+   Copyright (C) 2001-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/stackinfo.h b/sysdeps/x86_64/stackinfo.h
index 5b81d808d0..848aa7754c 100644
--- a/sysdeps/x86_64/stackinfo.h
+++ b/sysdeps/x86_64/stackinfo.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2001-2015 Free Software Foundation, Inc.
+/* Copyright (C) 2001-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/start.S b/sysdeps/x86_64/start.S
index 0d27a38e9c..1374974307 100644
--- a/sysdeps/x86_64/start.S
+++ b/sysdeps/x86_64/start.S
@@ -1,5 +1,5 @@
 /* Startup code compliant to the ELF x86-64 ABI.
-   Copyright (C) 2001-2015 Free Software Foundation, Inc.
+   Copyright (C) 2001-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Andreas Jaeger <aj@suse.de>, 2001.
 
diff --git a/sysdeps/x86_64/stpcpy_chk.S b/sysdeps/x86_64/stpcpy_chk.S
deleted file mode 100644
index 905e8d7ee3..0000000000
--- a/sysdeps/x86_64/stpcpy_chk.S
+++ /dev/null
@@ -1,3 +0,0 @@
-#define USE_AS_STPCPY_CHK
-#define STRCPY_CHK __stpcpy_chk
-#include <sysdeps/x86_64/strcpy_chk.S>
diff --git a/sysdeps/x86_64/strcat.S b/sysdeps/x86_64/strcat.S
index affb15a32e..dadf4c76b2 100644
--- a/sysdeps/x86_64/strcat.S
+++ b/sysdeps/x86_64/strcat.S
@@ -1,6 +1,6 @@
 /* strcat(dest, src) -- Append SRC on the end of DEST.
    Optimized for x86-64.
-   Copyright (C) 2002-2015 Free Software Foundation, Inc.
+   Copyright (C) 2002-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Andreas Jaeger <aj@suse.de>, 2002.
 
diff --git a/sysdeps/x86_64/strchr.S b/sysdeps/x86_64/strchr.S
index d7955af8d2..4431fee648 100644
--- a/sysdeps/x86_64/strchr.S
+++ b/sysdeps/x86_64/strchr.S
@@ -1,6 +1,6 @@
 /* strchr (str, ch) -- Return pointer to first occurrence of CH in STR.
    For AMD x86-64.
-   Copyright (C) 2009-2015 Free Software Foundation, Inc.
+   Copyright (C) 2009-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/strchrnul.S b/sysdeps/x86_64/strchrnul.S
index e8cab0dd16..7b52d699ee 100644
--- a/sysdeps/x86_64/strchrnul.S
+++ b/sysdeps/x86_64/strchrnul.S
@@ -1,7 +1,7 @@
 /* strchrnul (str, ch) -- Return pointer to first occurrence of CH in STR
 	or terminating NUL byte.
    For AMD x86-64.
-   Copyright (C) 2009-2015 Free Software Foundation, Inc.
+   Copyright (C) 2009-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
index 1329649d3a..c5c44d4e27 100644
--- a/sysdeps/x86_64/strcmp.S
+++ b/sysdeps/x86_64/strcmp.S
@@ -1,5 +1,5 @@
 /* Highly optimized version for x86-64.
-   Copyright (C) 1999-2015 Free Software Foundation, Inc.
+   Copyright (C) 1999-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Based on i686 version contributed by Ulrich Drepper
    <drepper@cygnus.com>, 1999.
@@ -29,13 +29,6 @@
 #endif
 
 #ifdef USE_AS_STRNCMP
-/* The simplified code below is not set up to handle strncmp() so far.
-   Should this become necessary it has to be implemented.  For now
-   just report the problem.  */
-# if !IS_IN (libc)
-#  error "strncmp not implemented so far"
-# endif
-
 /* Since the counter, %r11, is unsigned, we branch to strcmp_exitz
    if the new counter > the old one or is 0.  */
 # define UPDATE_STRNCMP_COUNTER				\
@@ -50,20 +43,10 @@
 #elif defined USE_AS_STRCASECMP_L
 # include "locale-defines.h"
 
-/* No support for strcasecmp outside libc so far since it is not needed.  */
-# if !IS_IN (libc)
-#  error "strcasecmp_l not implemented so far"
-# endif
-
 # define UPDATE_STRNCMP_COUNTER
 #elif defined USE_AS_STRNCASECMP_L
 # include "locale-defines.h"
 
-/* No support for strncasecmp outside libc so far since it is not needed.  */
-# if !IS_IN (libc)
-#  error "strncasecmp_l not implemented so far"
-# endif
-
 # define UPDATE_STRNCMP_COUNTER				\
 	/* calculate left number to compare */		\
 	lea	-16(%rcx, %r11), %r9;			\
@@ -126,63 +109,44 @@ libc_hidden_def (__strncasecmp)
 #endif
 
 ENTRY (STRCMP)
-#if !IS_IN (libc)
-/* Simple version since we can't use SSE registers in ld.so.  */
-L(oop):	movb	(%rdi), %al
-	cmpb	(%rsi), %al
-	jne	L(neq)
-	incq	%rdi
-	incq	%rsi
-	testb	%al, %al
-	jnz	L(oop)
-
-	xorl	%eax, %eax
-	ret
-
-L(neq):	movl	$1, %eax
-	movl	$-1, %ecx
-	cmovbl	%ecx, %eax
-	ret
-END (STRCMP)
-#else	/* !IS_IN (libc) */
-# ifdef USE_AS_STRCASECMP_L
+#ifdef USE_AS_STRCASECMP_L
 	/* We have to fall back on the C implementation for locales
 	   with encodings not matching ASCII for single bytes.  */
-#  if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
 	mov	LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP
-#  else
+# else
 	mov	(%rdx), %RAX_LP
-#  endif
+# endif
 	testl	$1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
 	jne	__strcasecmp_l_nonascii
-# elif defined USE_AS_STRNCASECMP_L
+#elif defined USE_AS_STRNCASECMP_L
 	/* We have to fall back on the C implementation for locales
 	   with encodings not matching ASCII for single bytes.  */
-#  if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
 	mov	LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP
-#  else
+# else
 	mov	(%rcx), %RAX_LP
-#  endif
+# endif
 	testl	$1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
 	jne	__strncasecmp_l_nonascii
-# endif
+#endif
 
 /*
  * This implementation uses SSE to compare up to 16 bytes at a time.
  */
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	test	%rdx, %rdx
 	je	LABEL(strcmp_exitz)
 	cmp	$1, %rdx
 	je	LABEL(Byte0)
 	mov	%rdx, %r11
-# endif
+#endif
 	mov	%esi, %ecx
 	mov	%edi, %eax
 /* Use 64bit AND here to avoid long NOP padding.  */
 	and	$0x3f, %rcx		/* rsi alignment in cache line */
 	and	$0x3f, %rax		/* rdi alignment in cache line */
-# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 	.section .rodata.cst16,"aM",@progbits,16
 	.align 16
 .Lbelowupper:
@@ -196,12 +160,12 @@ END (STRCMP)
 	.quad	0x2020202020202020
 	.previous
 	movdqa	.Lbelowupper(%rip), %xmm5
-#  define UCLOW_reg %xmm5
+# define UCLOW_reg %xmm5
 	movdqa	.Ltopupper(%rip), %xmm6
-#  define UCHIGH_reg %xmm6
+# define UCHIGH_reg %xmm6
 	movdqa	.Ltouppermask(%rip), %xmm7
-#  define LCQWORD_reg %xmm7
-# endif
+# define LCQWORD_reg %xmm7
+#endif
 	cmp	$0x30, %ecx
 	ja	LABEL(crosscache)	/* rsi: 16-byte load will cross cache line */
 	cmp	$0x30, %eax
@@ -210,8 +174,8 @@ END (STRCMP)
 	movlpd	(%rsi), %xmm2
 	movhpd	8(%rdi), %xmm1
 	movhpd	8(%rsi), %xmm2
-# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
-#  define TOLOWER(reg1, reg2) \
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# define TOLOWER(reg1, reg2) \
 	movdqa	reg1, %xmm8;					\
 	movdqa	UCHIGH_reg, %xmm9;				\
 	movdqa	reg2, %xmm10;					\
@@ -227,9 +191,9 @@ END (STRCMP)
 	por	%xmm8, reg1;					\
 	por	%xmm10, reg2
 	TOLOWER (%xmm1, %xmm2)
-# else
-#  define TOLOWER(reg1, reg2)
-# endif
+#else
+# define TOLOWER(reg1, reg2)
+#endif
 	pxor	%xmm0, %xmm0		/* clear %xmm0 for null char checks */
 	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
 	pcmpeqb	%xmm2, %xmm1		/* compare first 16 bytes for equality */
@@ -237,10 +201,10 @@ END (STRCMP)
 	pmovmskb %xmm1, %edx
 	sub	$0xffff, %edx		/* if first 16 bytes are same, edx == 0xffff */
 	jnz	LABEL(less16bytes)	/* If not, find different value or null char */
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)	/* finish comparision */
-# endif
+#endif
 	add	$16, %rsi		/* prepare to search next 16 bytes */
 	add	$16, %rdi		/* prepare to search next 16 bytes */
 
@@ -282,13 +246,13 @@ LABEL(ashr_0):
 	movdqa	(%rsi), %xmm1
 	pxor	%xmm0, %xmm0			/* clear %xmm0 for null char check */
 	pcmpeqb	%xmm1, %xmm0			/* Any null chars? */
-# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpeqb	(%rdi), %xmm1			/* compare 16 bytes for equality */
-# else
+#else
 	movdqa	(%rdi), %xmm2
 	TOLOWER (%xmm1, %xmm2)
 	pcmpeqb	%xmm2, %xmm1			/* compare 16 bytes for equality */
-# endif
+#endif
 	psubb	%xmm0, %xmm1			/* packed sub of comparison results*/
 	pmovmskb %xmm1, %r9d
 	shr	%cl, %edx			/* adjust 0xffff for offset */
@@ -321,10 +285,10 @@ LABEL(loop_ashr_0):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)		/* mismatch or null char seen */
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 	add	$16, %rcx
 	movdqa	(%rsi, %rcx), %xmm1
 	movdqa	(%rdi, %rcx), %xmm2
@@ -336,10 +300,10 @@ LABEL(loop_ashr_0):
 	pmovmskb %xmm1, %edx
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 	add	$16, %rcx
 	jmp	LABEL(loop_ashr_0)
 
@@ -388,13 +352,13 @@ LABEL(gobble_ashr_1):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4		 /* store for next cycle */
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$1, %xmm3
 	pslldq	$15, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$1, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -404,10 +368,10 @@ LABEL(gobble_ashr_1):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
 
@@ -418,13 +382,13 @@ LABEL(gobble_ashr_1):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4		/* store for next cycle */
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$1, %xmm3
 	pslldq	$15, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$1, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -434,10 +398,10 @@ LABEL(gobble_ashr_1):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
 	jmp	LABEL(loop_ashr_1)
@@ -453,10 +417,10 @@ LABEL(nibble_ashr_1):
 	test	$0xfffe, %edx
 	jnz	LABEL(ashr_1_exittail)	/* find null char*/
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$15, %r11
 	jbe	LABEL(ashr_1_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10		/* substract 4K from %r10 */
@@ -518,13 +482,13 @@ LABEL(gobble_ashr_2):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$2, %xmm3
 	pslldq	$14, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$2, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -534,10 +498,10 @@ LABEL(gobble_ashr_2):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -549,13 +513,13 @@ LABEL(gobble_ashr_2):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$2, %xmm3
 	pslldq	$14, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$2, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -565,10 +529,10 @@ LABEL(gobble_ashr_2):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -581,10 +545,10 @@ LABEL(nibble_ashr_2):
 	test	$0xfffc, %edx
 	jnz	LABEL(ashr_2_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$14, %r11
 	jbe	LABEL(ashr_2_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -643,13 +607,13 @@ LABEL(gobble_ashr_3):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$3, %xmm3
 	pslldq	$13, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$3, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -659,10 +623,10 @@ LABEL(gobble_ashr_3):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -674,13 +638,13 @@ LABEL(gobble_ashr_3):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$3, %xmm3
 	pslldq	$13, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$3, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -690,10 +654,10 @@ LABEL(gobble_ashr_3):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -706,10 +670,10 @@ LABEL(nibble_ashr_3):
 	test	$0xfff8, %edx
 	jnz	LABEL(ashr_3_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$13, %r11
 	jbe	LABEL(ashr_3_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -768,13 +732,13 @@ LABEL(gobble_ashr_4):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$4, %xmm3
 	pslldq	$12, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$4, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -784,10 +748,10 @@ LABEL(gobble_ashr_4):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -799,13 +763,13 @@ LABEL(gobble_ashr_4):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$4, %xmm3
 	pslldq	$12, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$4, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -815,10 +779,10 @@ LABEL(gobble_ashr_4):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -831,10 +795,10 @@ LABEL(nibble_ashr_4):
 	test	$0xfff0, %edx
 	jnz	LABEL(ashr_4_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$12, %r11
 	jbe	LABEL(ashr_4_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -893,13 +857,13 @@ LABEL(gobble_ashr_5):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$5, %xmm3
 	pslldq	$11, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$5, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -909,10 +873,10 @@ LABEL(gobble_ashr_5):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -924,13 +888,13 @@ LABEL(gobble_ashr_5):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$5, %xmm3
 	pslldq	$11, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$5, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -940,10 +904,10 @@ LABEL(gobble_ashr_5):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -956,10 +920,10 @@ LABEL(nibble_ashr_5):
 	test	$0xffe0, %edx
 	jnz	LABEL(ashr_5_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$11, %r11
 	jbe	LABEL(ashr_5_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -1018,13 +982,13 @@ LABEL(gobble_ashr_6):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$6, %xmm3
 	pslldq	$10, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$6, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1034,10 +998,10 @@ LABEL(gobble_ashr_6):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1049,13 +1013,13 @@ LABEL(gobble_ashr_6):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$6, %xmm3
 	pslldq	$10, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$6, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1065,10 +1029,10 @@ LABEL(gobble_ashr_6):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1081,10 +1045,10 @@ LABEL(nibble_ashr_6):
 	test	$0xffc0, %edx
 	jnz	LABEL(ashr_6_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$10, %r11
 	jbe	LABEL(ashr_6_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -1143,13 +1107,13 @@ LABEL(gobble_ashr_7):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$7, %xmm3
 	pslldq	$9, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$7, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1159,10 +1123,10 @@ LABEL(gobble_ashr_7):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1174,13 +1138,13 @@ LABEL(gobble_ashr_7):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$7, %xmm3
 	pslldq	$9, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$7, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1190,10 +1154,10 @@ LABEL(gobble_ashr_7):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1206,10 +1170,10 @@ LABEL(nibble_ashr_7):
 	test	$0xff80, %edx
 	jnz	LABEL(ashr_7_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$9, %r11
 	jbe	LABEL(ashr_7_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -1268,13 +1232,13 @@ LABEL(gobble_ashr_8):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$8, %xmm3
 	pslldq	$8, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$8, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1284,10 +1248,10 @@ LABEL(gobble_ashr_8):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1299,13 +1263,13 @@ LABEL(gobble_ashr_8):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$8, %xmm3
 	pslldq	$8, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$8, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1315,10 +1279,10 @@ LABEL(gobble_ashr_8):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1331,10 +1295,10 @@ LABEL(nibble_ashr_8):
 	test	$0xff00, %edx
 	jnz	LABEL(ashr_8_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$8, %r11
 	jbe	LABEL(ashr_8_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -1393,13 +1357,13 @@ LABEL(gobble_ashr_9):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$9, %xmm3
 	pslldq	$7, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$9, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1409,10 +1373,10 @@ LABEL(gobble_ashr_9):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1424,13 +1388,13 @@ LABEL(gobble_ashr_9):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$9, %xmm3
 	pslldq	$7, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$9, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1440,10 +1404,10 @@ LABEL(gobble_ashr_9):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3		/* store for next cycle */
@@ -1456,10 +1420,10 @@ LABEL(nibble_ashr_9):
 	test	$0xfe00, %edx
 	jnz	LABEL(ashr_9_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$7, %r11
 	jbe	LABEL(ashr_9_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -1518,13 +1482,13 @@ LABEL(gobble_ashr_10):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$10, %xmm3
 	pslldq	$6, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$10, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1534,10 +1498,10 @@ LABEL(gobble_ashr_10):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1549,13 +1513,13 @@ LABEL(gobble_ashr_10):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$10, %xmm3
 	pslldq	$6, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$10, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1565,10 +1529,10 @@ LABEL(gobble_ashr_10):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1581,10 +1545,10 @@ LABEL(nibble_ashr_10):
 	test	$0xfc00, %edx
 	jnz	LABEL(ashr_10_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$6, %r11
 	jbe	LABEL(ashr_10_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -1643,13 +1607,13 @@ LABEL(gobble_ashr_11):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$11, %xmm3
 	pslldq	$5, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$11, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1659,10 +1623,10 @@ LABEL(gobble_ashr_11):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1674,13 +1638,13 @@ LABEL(gobble_ashr_11):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$11, %xmm3
 	pslldq	$5, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$11, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1690,10 +1654,10 @@ LABEL(gobble_ashr_11):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1706,10 +1670,10 @@ LABEL(nibble_ashr_11):
 	test	$0xf800, %edx
 	jnz	LABEL(ashr_11_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$5, %r11
 	jbe	LABEL(ashr_11_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -1768,13 +1732,13 @@ LABEL(gobble_ashr_12):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$12, %xmm3
 	pslldq	$4, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$12, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1784,10 +1748,10 @@ LABEL(gobble_ashr_12):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1799,13 +1763,13 @@ LABEL(gobble_ashr_12):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$12, %xmm3
 	pslldq	$4, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$12, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1815,10 +1779,10 @@ LABEL(gobble_ashr_12):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1831,10 +1795,10 @@ LABEL(nibble_ashr_12):
 	test	$0xf000, %edx
 	jnz	LABEL(ashr_12_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$4, %r11
 	jbe	LABEL(ashr_12_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -1893,13 +1857,13 @@ LABEL(gobble_ashr_13):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$13, %xmm3
 	pslldq	$3, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$13, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1909,10 +1873,10 @@ LABEL(gobble_ashr_13):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1924,13 +1888,13 @@ LABEL(gobble_ashr_13):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$13, %xmm3
 	pslldq	$3, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$13, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1940,10 +1904,10 @@ LABEL(gobble_ashr_13):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1956,10 +1920,10 @@ LABEL(nibble_ashr_13):
 	test	$0xe000, %edx
 	jnz	LABEL(ashr_13_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$3, %r11
 	jbe	LABEL(ashr_13_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -2018,13 +1982,13 @@ LABEL(gobble_ashr_14):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$14, %xmm3
 	pslldq	$2, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$14, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -2034,10 +1998,10 @@ LABEL(gobble_ashr_14):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -2049,13 +2013,13 @@ LABEL(gobble_ashr_14):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$14, %xmm3
 	pslldq	$2, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$14, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -2065,10 +2029,10 @@ LABEL(gobble_ashr_14):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP | defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP | defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -2081,10 +2045,10 @@ LABEL(nibble_ashr_14):
 	test	$0xc000, %edx
 	jnz	LABEL(ashr_14_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$2, %r11
 	jbe	LABEL(ashr_14_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -2145,13 +2109,13 @@ LABEL(gobble_ashr_15):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$15, %xmm3
 	pslldq	$1, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$15, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -2161,10 +2125,10 @@ LABEL(gobble_ashr_15):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -2176,13 +2140,13 @@ LABEL(gobble_ashr_15):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$15, %xmm3
 	pslldq	$1, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$15, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -2192,10 +2156,10 @@ LABEL(gobble_ashr_15):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -2208,10 +2172,10 @@ LABEL(nibble_ashr_15):
 	test	$0x8000, %edx
 	jnz	LABEL(ashr_15_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmpq	$1, %r11
 	jbe	LABEL(ashr_15_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -2246,18 +2210,18 @@ LABEL(ret):
 LABEL(less16bytes):
 	bsf	%rdx, %rdx		/* find and store bit index in %rdx */
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	%rdx, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 	movzbl	(%rsi, %rdx), %ecx
 	movzbl	(%rdi, %rdx), %eax
 
-# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
 	movl	(%rdx,%rcx,4), %ecx
 	movl	(%rdx,%rax,4), %eax
-# endif
+#endif
 
 	sub	%ecx, %eax
 	ret
@@ -2271,11 +2235,11 @@ LABEL(Byte0):
 	movzx	(%rsi), %ecx
 	movzx	(%rdi), %eax
 
-# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
 	movl	(%rdx,%rcx,4), %ecx
 	movl	(%rdx,%rax,4), %eax
-# endif
+#endif
 
 	sub	%ecx, %eax
 	ret
@@ -2300,5 +2264,4 @@ LABEL(unaligned_table):
 	.int	LABEL(ashr_14) - LABEL(unaligned_table)
 	.int	LABEL(ashr_15) - LABEL(unaligned_table)
 	.int	LABEL(ashr_0) - LABEL(unaligned_table)
-#endif /* !IS_IN (libc) */
 libc_hidden_builtin_def (STRCMP)
diff --git a/sysdeps/x86_64/strcpy.S b/sysdeps/x86_64/strcpy.S
index 23231088fd..3f90c0020a 100644
--- a/sysdeps/x86_64/strcpy.S
+++ b/sysdeps/x86_64/strcpy.S
@@ -1,5 +1,5 @@
 /* strcpy/stpcpy implementation for x86-64.
-   Copyright (C) 2002-2015 Free Software Foundation, Inc.
+   Copyright (C) 2002-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Andreas Jaeger <aj@suse.de>, 2002.
 
diff --git a/sysdeps/x86_64/strcpy_chk.S b/sysdeps/x86_64/strcpy_chk.S
deleted file mode 100644
index 24e51c66f1..0000000000
--- a/sysdeps/x86_64/strcpy_chk.S
+++ /dev/null
@@ -1,208 +0,0 @@
-/* strcpy/stpcpy checking implementation for x86-64.
-   Copyright (C) 2002-2015 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-   Contributed by Andreas Jaeger <aj@suse.de>, 2002.
-   Adopted into checking version by Jakub Jelinek <jakub@redhat.com>.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-#include "asm-syntax.h"
-
-#ifndef USE_AS_STPCPY_CHK
-# define STRCPY_CHK __strcpy_chk
-#endif
-
-	.text
-ENTRY (STRCPY_CHK)
-	movq	%rsi, %rcx	/* Source register. */
-	andl	$7, %ecx	/* mask alignment bits */
-#ifndef USE_AS_STPCPY_CHK
-	movq	%rdi, %r10	/* Duplicate destination pointer.  */
-#endif
-	jz 5f			/* aligned => start loop */
-
-	cmpq	$8, %rdx	/* Check if only few bytes left in
-				   destination.  */
-	jb	50f
-
-	subq	$8, %rcx	/* We need to align to 8 bytes.  */
-	addq	%rcx, %rdx	/* Subtract count of stored bytes
-				   in the cycle below from destlen.  */
-
-	/* Search the first bytes directly.  */
-0:
-	movb	(%rsi), %al	/* Fetch a byte */
-	testb	%al, %al	/* Is it NUL? */
-	movb	%al, (%rdi)	/* Store it */
-	jz	4f		/* If it was NUL, done! */
-	incq	%rsi
-	incq	%rdi
-	incl	%ecx
-	jnz	0b
-
-5:
-	movq $0xfefefefefefefeff,%r8
-	cmpq	$32, %rdx	/* Are there enough bytes in destination
-				   for the next unrolled round?  */
-	jb	60f		/* If not, avoid the unrolled loop.  */
-
-	/* Now the sources is aligned.  Unfortunatly we cannot force
-	   to have both source and destination aligned, so ignore the
-	   alignment of the destination.  */
-	.p2align 4
-1:
-	/* 1st unroll.  */
-	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
-	addq	$8, %rsi	/* Adjust pointer for next word.  */
-	movq	%rax, %r9	/* Save a copy for NUL finding.  */
-	addq	%r8, %r9	/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc	3f		/* highest byte is NUL => return pointer */
-	xorq	%rax, %r9	/* (word+magic)^word */
-	orq	%r8, %r9	/* set all non-carry bits */
-	incq	%r9		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-
-	jnz	3f		/* found NUL => return pointer */
-
-	movq	%rax, (%rdi)	/* Write value to destination.  */
-	addq	$8, %rdi	/* Adjust pointer.  */
-
-	/* 2nd unroll.  */
-	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
-	addq	$8, %rsi	/* Adjust pointer for next word.  */
-	movq	%rax, %r9	/* Save a copy for NUL finding.  */
-	addq	%r8, %r9	/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc	3f		/* highest byte is NUL => return pointer */
-	xorq	%rax, %r9	/* (word+magic)^word */
-	orq	%r8, %r9	/* set all non-carry bits */
-	incq	%r9		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-
-	jnz	3f		/* found NUL => return pointer */
-
-	movq	%rax, (%rdi)	/* Write value to destination.  */
-	addq	$8, %rdi	/* Adjust pointer.  */
-
-	/* 3rd unroll.  */
-	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
-	addq	$8, %rsi	/* Adjust pointer for next word.  */
-	movq	%rax, %r9	/* Save a copy for NUL finding.  */
-	addq	%r8, %r9	/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc	3f		/* highest byte is NUL => return pointer */
-	xorq	%rax, %r9	/* (word+magic)^word */
-	orq	%r8, %r9	/* set all non-carry bits */
-	incq	%r9		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-
-	jnz	3f		/* found NUL => return pointer */
-
-	movq	%rax, (%rdi)	/* Write value to destination.  */
-	addq	$8, %rdi	/* Adjust pointer.  */
-
-	/* 4th unroll.  */
-	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
-	addq	$8, %rsi	/* Adjust pointer for next word.  */
-	movq	%rax, %r9	/* Save a copy for NUL finding.  */
-	addq	%r8, %r9	/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc	3f		/* highest byte is NUL => return pointer */
-	xorq	%rax, %r9	/* (word+magic)^word */
-	orq	%r8, %r9	/* set all non-carry bits */
-	incq	%r9		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-
-	jnz	3f		/* found NUL => return pointer */
-
-	subq	$32, %rdx	/* Adjust destlen.  */
-	movq	%rax, (%rdi)	/* Write value to destination.  */
-	addq	$8, %rdi	/* Adjust pointer.  */
-	cmpq	$32, %rdx	/* Are there enough bytes in destination
-				   for the next unrolled round?  */
-	jae	1b		/* Next iteration.  */
-
-60:
-	cmpq	$8, %rdx	/* Are there enough bytes in destination
-				   for the next unrolled round?  */
-	jb	50f		/* Now, copy and check byte by byte.  */
-
-	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
-	addq	$8, %rsi	/* Adjust pointer for next word.  */
-	movq	%rax, %r9	/* Save a copy for NUL finding.  */
-	addq	%r8, %r9	/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc	3f		/* highest byte is NUL => return pointer */
-	xorq	%rax, %r9	/* (word+magic)^word */
-	orq	%r8, %r9	/* set all non-carry bits */
-	incq	%r9		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-
-	jnz	3f		/* found NUL => return pointer */
-
-	subq	$8, %rdx	/* Adjust destlen.  */
-	movq	%rax, (%rdi)	/* Write value to destination.  */
-	addq	$8, %rdi	/* Adjust pointer.  */
-	jmp	60b		/* Next iteration.  */
-
-	/* Do the last few bytes. %rax contains the value to write.
-	   The loop is unrolled twice.  */
-	.p2align 4
-3:
-	/* Note that stpcpy needs to return with the value of the NUL
-	   byte.  */
-	movb	%al, (%rdi)	/* 1st byte.  */
-	testb	%al, %al	/* Is it NUL.  */
-	jz	4f		/* yes, finish.  */
-	incq	%rdi		/* Increment destination.  */
-	movb	%ah, (%rdi)	/* 2nd byte.  */
-	testb	%ah, %ah	/* Is it NUL?.  */
-	jz	4f		/* yes, finish.  */
-	incq	%rdi		/* Increment destination.  */
-	shrq	$16, %rax	/* Shift...  */
-	jmp	3b		/* and look at next two bytes in %rax.  */
-
-51:
-	/* Search the bytes directly, checking for overflows.  */
-	incq	%rsi
-	incq	%rdi
-	decq	%rdx
-	jz	HIDDEN_JUMPTARGET (__chk_fail)
-52:
-	movb	(%rsi), %al	/* Fetch a byte */
-	testb	%al, %al	/* Is it NUL? */
-	movb	%al, (%rdi)	/* Store it */
-	jnz	51b		/* If it was NUL, done! */
-4:
-#ifdef USE_AS_STPCPY_CHK
-	movq	%rdi, %rax	/* Destination is return value.  */
-#else
-	movq	%r10, %rax	/* Source is return value.  */
-#endif
-	retq
-
-50:
-	testq	%rdx, %rdx
-	jnz	52b
-	jmp	HIDDEN_JUMPTARGET (__chk_fail)
-
-END (STRCPY_CHK)
diff --git a/sysdeps/x86_64/strcspn.S b/sysdeps/x86_64/strcspn.S
index c6c20f001c..de526c8fdd 100644
--- a/sysdeps/x86_64/strcspn.S
+++ b/sysdeps/x86_64/strcspn.S
@@ -1,7 +1,7 @@
 /* strcspn (str, ss) -- Return the length of the initial segment of STR
 			which contains no characters from SS.
    For AMD x86-64.
-   Copyright (C) 1994-2015 Free Software Foundation, Inc.
+   Copyright (C) 1994-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>.
    Bug fixes by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>.
diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
index c382c8d23e..12f63ad1bb 100644
--- a/sysdeps/x86_64/strlen.S
+++ b/sysdeps/x86_64/strlen.S
@@ -1,5 +1,5 @@
 /* SSE2 version of strlen.
-   Copyright (C) 2012-2015 Free Software Foundation, Inc.
+   Copyright (C) 2012-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -20,7 +20,7 @@
 
 /* Long lived register in strlen(s), strnlen(s, n) are:
 
-	%xmm11 - zero
+	%xmm3 - zero
 	%rdi   - s
 	%r10  (s+n) & (~(64-1))
 	%r11   s+n
@@ -32,14 +32,14 @@ ENTRY(strlen)
 
 /* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx.  */
 #define FIND_ZERO	\
-	pcmpeqb	(%rax), %xmm8;	\
-	pcmpeqb	16(%rax), %xmm9;	\
-	pcmpeqb	32(%rax), %xmm10;	\
-	pcmpeqb	48(%rax), %xmm11;	\
-	pmovmskb	%xmm8, %esi;	\
-	pmovmskb	%xmm9, %edx;	\
-	pmovmskb	%xmm10, %r8d;	\
-	pmovmskb	%xmm11, %ecx;	\
+	pcmpeqb	(%rax), %xmm0;	\
+	pcmpeqb	16(%rax), %xmm1;	\
+	pcmpeqb	32(%rax), %xmm2;	\
+	pcmpeqb	48(%rax), %xmm3;	\
+	pmovmskb	%xmm0, %esi;	\
+	pmovmskb	%xmm1, %edx;	\
+	pmovmskb	%xmm2, %r8d;	\
+	pmovmskb	%xmm3, %ecx;	\
 	salq	$16, %rdx;	\
 	salq	$16, %rcx;	\
 	orq	%rsi, %rdx;	\
@@ -63,10 +63,10 @@ L(n_nonzero):
 	mov	%rsi, %r11
 #endif
 
-	pxor	%xmm8, %xmm8
-	pxor	%xmm9, %xmm9
-	pxor	%xmm10, %xmm10
-	pxor	%xmm11, %xmm11
+	pxor	%xmm0, %xmm0
+	pxor	%xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	pxor	%xmm3, %xmm3
 	movq	%rdi, %rax
 	movq	%rdi, %rcx
 	andq	$4095, %rcx
@@ -103,9 +103,9 @@ L(n_nonzero):
 	FIND_ZERO
 #else
 	/* Test first 16 bytes unaligned.  */
-	movdqu	(%rax), %xmm12
-	pcmpeqb	%xmm8, %xmm12
-	pmovmskb	%xmm12, %edx
+	movdqu	(%rax), %xmm4
+	pcmpeqb	%xmm0, %xmm4
+	pmovmskb	%xmm4, %edx
 	test	%edx, %edx
 	je 	L(next48_bytes)
 	bsf	%edx, %eax /* If eax is zeroed 16bit bsf can be used.  */
@@ -114,12 +114,12 @@ L(n_nonzero):
 L(next48_bytes):
 /* Same as FIND_ZERO except we do not check first 16 bytes.  */
 	andq	$-16, %rax
-	pcmpeqb 16(%rax), %xmm9
-	pcmpeqb 32(%rax), %xmm10
-	pcmpeqb 48(%rax), %xmm11
-	pmovmskb	%xmm9, %edx
-	pmovmskb	%xmm10, %r8d
-	pmovmskb	%xmm11, %ecx
+	pcmpeqb 16(%rax), %xmm1
+	pcmpeqb 32(%rax), %xmm2
+	pcmpeqb 48(%rax), %xmm3
+	pmovmskb	%xmm1, %edx
+	pmovmskb	%xmm2, %r8d
+	pmovmskb	%xmm3, %ecx
 	salq	$16, %rdx
 	salq	$16, %rcx
 	orq	%r8, %rcx
@@ -127,7 +127,7 @@ L(next48_bytes):
 	orq	%rcx, %rdx
 #endif
 
-	/* When no zero byte is found xmm9-11 are zero so we do not have to
+	/* When no zero byte is found xmm1-3 are zero so we do not have to
 	   zero them.  */
 	PROLOG(loop)
 
@@ -149,9 +149,9 @@ L(strnlen_ret):
 #endif
 	.p2align 4
 L(loop_init):
-	pxor	%xmm9, %xmm9
-	pxor	%xmm10, %xmm10
-	pxor	%xmm11, %xmm11
+	pxor	%xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	pxor	%xmm3, %xmm3
 #ifdef AS_STRNLEN
 	.p2align 4
 L(loop):
@@ -160,12 +160,12 @@ L(loop):
 	cmpq	%rax, %r10
 	je	L(exit_end)
 
-	movdqa	(%rax), %xmm8
-	pminub	16(%rax), %xmm8
-	pminub	32(%rax), %xmm8
-	pminub	48(%rax), %xmm8
-	pcmpeqb	%xmm11, %xmm8
-	pmovmskb	%xmm8, %edx
+	movdqa	(%rax), %xmm0
+	pminub	16(%rax), %xmm0
+	pminub	32(%rax), %xmm0
+	pminub	48(%rax), %xmm0
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb	%xmm0, %edx
 	testl	%edx, %edx
 	jne	L(exit)
 	jmp	L(loop)
@@ -174,7 +174,7 @@ L(loop):
 L(exit_end):
 	cmp	%rax, %r11
 	je	L(first) /* Do not read when end is at page boundary.  */
-	pxor	%xmm8, %xmm8
+	pxor	%xmm0, %xmm0
 	FIND_ZERO
 
 L(first):
@@ -186,7 +186,7 @@ L(first):
 
 	.p2align 4
 L(exit):
-	pxor	%xmm8, %xmm8
+	pxor	%xmm0, %xmm0
 	FIND_ZERO
 
 	bsfq	%rdx, %rdx
@@ -200,23 +200,23 @@ L(exit):
 	.p2align 4
 L(loop):
 
-	movdqa	64(%rax), %xmm8
-	pminub	80(%rax), %xmm8
-	pminub	96(%rax), %xmm8
-	pminub	112(%rax), %xmm8
-	pcmpeqb	%xmm11, %xmm8
-	pmovmskb	%xmm8, %edx
+	movdqa	64(%rax), %xmm0
+	pminub	80(%rax), %xmm0
+	pminub	96(%rax), %xmm0
+	pminub	112(%rax), %xmm0
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb	%xmm0, %edx
 	testl	%edx, %edx
 	jne	L(exit64)
 
 	subq	$-128, %rax
 
-	movdqa	(%rax), %xmm8
-	pminub	16(%rax), %xmm8
-	pminub	32(%rax), %xmm8
-	pminub	48(%rax), %xmm8
-	pcmpeqb	%xmm11, %xmm8
-	pmovmskb	%xmm8, %edx
+	movdqa	(%rax), %xmm0
+	pminub	16(%rax), %xmm0
+	pminub	32(%rax), %xmm0
+	pminub	48(%rax), %xmm0
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb	%xmm0, %edx
 	testl	%edx, %edx
 	jne	L(exit0)
 	jmp	L(loop)
@@ -225,7 +225,7 @@ L(loop):
 L(exit64):
 	addq	$64, %rax
 L(exit0):
-	pxor	%xmm8, %xmm8
+	pxor	%xmm0, %xmm0
 	FIND_ZERO
 
 	bsfq	%rdx, %rdx
diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
index 14a3abafb3..de0be762ed 100644
--- a/sysdeps/x86_64/strrchr.S
+++ b/sysdeps/x86_64/strrchr.S
@@ -1,5 +1,5 @@
 /* strrchr (str, ch) -- Return pointer to last occurrence of CH in STR.
-   Copyright (C) 2013-2015 Free Software Foundation, Inc.
+   Copyright (C) 2013-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/strspn.S b/sysdeps/x86_64/strspn.S
index 62f061bc8f..49dd4ba9f5 100644
--- a/sysdeps/x86_64/strspn.S
+++ b/sysdeps/x86_64/strspn.S
@@ -1,7 +1,7 @@
 /* strspn (str, ss) -- Return the length of the initial segment of STR
 			which contains only characters from SS.
    For AMD x86-64.
-   Copyright (C) 1994-2015 Free Software Foundation, Inc.
+   Copyright (C) 1994-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>.
    Bug fixes by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>.
diff --git a/sysdeps/x86_64/strtok.S b/sysdeps/x86_64/strtok.S
index f24be7aae7..bd5b103d50 100644
--- a/sysdeps/x86_64/strtok.S
+++ b/sysdeps/x86_64/strtok.S
@@ -1,6 +1,6 @@
 /* strtok (str, delim) -- Return next DELIM separated token from STR.
    For AMD x86-64.
-   Copyright (C) 1998-2015 Free Software Foundation, Inc.
+   Copyright (C) 1998-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Based on i686 version contributed by Ulrich Drepper
    <drepper@cygnus.com>, 1998.
diff --git a/sysdeps/x86_64/sub_n.S b/sysdeps/x86_64/sub_n.S
index 4879ace6a3..cc9bc48b01 100644
--- a/sysdeps/x86_64/sub_n.S
+++ b/sysdeps/x86_64/sub_n.S
@@ -1,6 +1,6 @@
 /* x86-64 __mpn_sub_n -- Add two limb vectors of the same length > 0 and store
    sum in a third limb vector.
-   Copyright (C) 2006-2015 Free Software Foundation, Inc.
+   Copyright (C) 2006-2016 Free Software Foundation, Inc.
    This file is part of the GNU MP Library.
 
    The GNU MP Library is free software; you can redistribute it and/or modify
diff --git a/sysdeps/x86_64/submul_1.S b/sysdeps/x86_64/submul_1.S
index f5468b97e3..3037cb9c45 100644
--- a/sysdeps/x86_64/submul_1.S
+++ b/sysdeps/x86_64/submul_1.S
@@ -1,6 +1,6 @@
 /* x86-64 __mpn_submul_1 -- Multiply a limb vector with a limb and subtract
    the result from a second limb vector.
-   Copyright (C) 2003-2015 Free Software Foundation, Inc.
+   Copyright (C) 2003-2016 Free Software Foundation, Inc.
    This file is part of the GNU MP Library.
 
    The GNU MP Library is free software; you can redistribute it and/or modify
diff --git a/sysdeps/x86_64/sysdep.h b/sysdeps/x86_64/sysdep.h
index e79a3974fd..fbe3560588 100644
--- a/sysdeps/x86_64/sysdep.h
+++ b/sysdeps/x86_64/sysdep.h
@@ -1,5 +1,5 @@
 /* Assembler macros for x86-64.
-   Copyright (C) 2001-2015 Free Software Foundation, Inc.
+   Copyright (C) 2001-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/tlsdesc.c b/sysdeps/x86_64/tlsdesc.c
index 6807fa26ec..aff8b67941 100644
--- a/sysdeps/x86_64/tlsdesc.c
+++ b/sysdeps/x86_64/tlsdesc.c
@@ -1,5 +1,5 @@
 /* Manage TLS descriptors.  x86_64 version.
-   Copyright (C) 2005-2015 Free Software Foundation, Inc.
+   Copyright (C) 2005-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/tst-audit.h b/sysdeps/x86_64/tst-audit.h
index f2bf3aa008..94e9dd5282 100644
--- a/sysdeps/x86_64/tst-audit.h
+++ b/sysdeps/x86_64/tst-audit.h
@@ -1,6 +1,6 @@
 /* Definitions for testing PLT entry/exit auditing.  x86_64 version.
 
-   Copyright (C) 2012-2015 Free Software Foundation, Inc.
+   Copyright (C) 2012-2016 Free Software Foundation, Inc.
 
    This file is part of the GNU C Library.
 
diff --git a/sysdeps/x86_64/tst-audit10.c b/sysdeps/x86_64/tst-audit10.c
index 6919871564..d104341be8 100644
--- a/sysdeps/x86_64/tst-audit10.c
+++ b/sysdeps/x86_64/tst-audit10.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2012-2015 Free Software Foundation, Inc.
+/* Copyright (C) 2012-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/tst-auditmod10a.c b/sysdeps/x86_64/tst-auditmod10a.c
index dc0d276c54..e94dbaf7fe 100644
--- a/sysdeps/x86_64/tst-auditmod10a.c
+++ b/sysdeps/x86_64/tst-auditmod10a.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2012-2015 Free Software Foundation, Inc.
+/* Copyright (C) 2012-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/tst-auditmod10b.c b/sysdeps/x86_64/tst-auditmod10b.c
index 0eb36747d2..ad6fcafdda 100644
--- a/sysdeps/x86_64/tst-auditmod10b.c
+++ b/sysdeps/x86_64/tst-auditmod10b.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2012-2015 Free Software Foundation, Inc.
+/* Copyright (C) 2012-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/tst-mallocalign1.c b/sysdeps/x86_64/tst-mallocalign1.c
index 89f4bed0be..3897af86c1 100644
--- a/sysdeps/x86_64/tst-mallocalign1.c
+++ b/sysdeps/x86_64/tst-mallocalign1.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2012-2015 Free Software Foundation, Inc.
+/* Copyright (C) 2012-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/tst-quad1.c b/sysdeps/x86_64/tst-quad1.c
index c24182c6b6..1cb63a748f 100644
--- a/sysdeps/x86_64/tst-quad1.c
+++ b/sysdeps/x86_64/tst-quad1.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2012-2015 Free Software Foundation, Inc.
+/* Copyright (C) 2012-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/tst-quadmod1.S b/sysdeps/x86_64/tst-quadmod1.S
index 3902850654..588c5016b6 100644
--- a/sysdeps/x86_64/tst-quadmod1.S
+++ b/sysdeps/x86_64/tst-quadmod1.S
@@ -1,4 +1,4 @@
-/* Copyright (C) 2012-2015 Free Software Foundation, Inc.
+/* Copyright (C) 2012-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/tst-quadmod2.S b/sysdeps/x86_64/tst-quadmod2.S
index 1d515a8530..7409a9eaa3 100644
--- a/sysdeps/x86_64/tst-quadmod2.S
+++ b/sysdeps/x86_64/tst-quadmod2.S
@@ -1,4 +1,4 @@
-/* Copyright (C) 2012-2015 Free Software Foundation, Inc.
+/* Copyright (C) 2012-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/tst-split-dynreloc.c b/sysdeps/x86_64/tst-split-dynreloc.c
new file mode 100644
index 0000000000..2f9e9b9477
--- /dev/null
+++ b/sysdeps/x86_64/tst-split-dynreloc.c
@@ -0,0 +1,28 @@
+/* This test will be used to create an executable with a specific
+   section layout in which .rela.dyn and .rela.plt are not contiguous.
+   For x86 case, readelf will report something like:
+
+   ...
+   [10] .rela.dyn         RELA
+   [11] .bar              PROGBITS
+   [12] .rela.plt         RELA
+   ...
+
+   This is important as this case was not correctly handled by dynamic
+   linker in the bind-now case, and the second section was never
+   processed.  */
+
+#include <stdio.h>
+
+const int __attribute__ ((section(".bar"))) bar = 0x12345678;
+static const char foo[] = "foo";
+
+static int
+do_test (void)
+{
+  printf ("%s %d\n", foo, bar);
+  return 0;
+}
+
+#define TEST_FUNCTION do_test ()
+#include "../test-skeleton.c"
diff --git a/sysdeps/x86_64/tst-split-dynreloc.lds b/sysdeps/x86_64/tst-split-dynreloc.lds
new file mode 100644
index 0000000000..2229e698c9
--- /dev/null
+++ b/sysdeps/x86_64/tst-split-dynreloc.lds
@@ -0,0 +1,5 @@
+SECTIONS
+{
+   .bar : { *(.bar) }
+}
+INSERT AFTER .rela.dyn;
diff --git a/sysdeps/x86_64/tst-stack-align.h b/sysdeps/x86_64/tst-stack-align.h
index 8d91a4c81e..24e8e61c35 100644
--- a/sysdeps/x86_64/tst-stack-align.h
+++ b/sysdeps/x86_64/tst-stack-align.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2003-2015 Free Software Foundation, Inc.
+/* Copyright (C) 2003-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/wcschr.S b/sysdeps/x86_64/wcschr.S
index dad111ff38..8604289e46 100644
--- a/sysdeps/x86_64/wcschr.S
+++ b/sysdeps/x86_64/wcschr.S
@@ -1,5 +1,5 @@
 /* wcschr with SSSE3
-   Copyright (C) 2011-2015 Free Software Foundation, Inc.
+   Copyright (C) 2011-2016 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
 
diff --git a/sysdeps/x86_64/wcscmp.S b/sysdeps/x86_64/wcscmp.S
index bd36d9f5d9..705a73b10e 100644
--- a/sysdeps/x86_64/wcscmp.S
+++ b/sysdeps/x86_64/wcscmp.S
@@ -1,5 +1,5 @@
 /* Optimized wcscmp for x86-64 with SSE2.
-   Copyright (C) 2011-2015 Free Software Foundation, Inc.
+   Copyright (C) 2011-2016 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
 
diff --git a/sysdeps/x86_64/wcslen.S b/sysdeps/x86_64/wcslen.S
index 8c1210ed8d..7a9175eefe 100644
--- a/sysdeps/x86_64/wcslen.S
+++ b/sysdeps/x86_64/wcslen.S
@@ -1,5 +1,5 @@
 /* Optimized wcslen for x86-64 with SSE2.
-   Copyright (C) 2011-2015 Free Software Foundation, Inc.
+   Copyright (C) 2011-2016 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
 
diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
index 9e28aac7f0..fb192f3ecf 100644
--- a/sysdeps/x86_64/wcsrchr.S
+++ b/sysdeps/x86_64/wcsrchr.S
@@ -1,5 +1,5 @@
 /* wcsrchr with SSSE3
-   Copyright (C) 2011-2015 Free Software Foundation, Inc.
+   Copyright (C) 2011-2016 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
 
diff --git a/sysdeps/x86_64/x32/dl-machine.h b/sysdeps/x86_64/x32/dl-machine.h
index f5efaa5060..47132fcd96 100644
--- a/sysdeps/x86_64/x32/dl-machine.h
+++ b/sysdeps/x86_64/x32/dl-machine.h
@@ -1,5 +1,5 @@
 /* Machine-dependent ELF dynamic relocation inline functions.  x32 version.
-   Copyright (C) 2012-2015 Free Software Foundation, Inc.
+   Copyright (C) 2012-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/rtld-memset.S b/sysdeps/x86_64/x32/fpu/s_lrint.S
index f8df3334bc..aa68863553 100644
--- a/sysdeps/x86_64/rtld-memset.S
+++ b/sysdeps/x86_64/x32/fpu/s_lrint.S
@@ -1,6 +1,6 @@
-/* memset implementation for the dynamic linker.  This is separate from the
-   libc implementation to avoid writing to SSE registers.
-   Copyright (C) 2013-2015 Free Software Foundation, Inc.
+/* Round argument to nearest integral value according to current rounding
+   direction.
+   Copyright (C) 2015-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -18,20 +18,10 @@
    <http://www.gnu.org/licenses/>.  */
 
 #include <sysdep.h>
-#include "asm-syntax.h"
-
 
 	.text
-/* void *memset (void *dest, char c, size_t count)
-   dest	 => %rdi
-   c	 => %rsi
-   count => %rdx  */
-ENTRY (memset)
-	mov	%rdx, %rcx
-	movzbl	%sil, %eax
-	mov	%rdi, %rdx
-	rep	stosb
-	mov	%rdx, %rax
+ENTRY(__lrint)
+	cvtsd2si %xmm0,%eax
 	ret
-END (memset)
-libc_hidden_builtin_def (memset)
+END(__lrint)
+weak_alias (__lrint, lrint)
diff --git a/sysdeps/x86_64/x32/fpu/s_lrintf.S b/sysdeps/x86_64/x32/fpu/s_lrintf.S
new file mode 100644
index 0000000000..bb5b1665bd
--- /dev/null
+++ b/sysdeps/x86_64/x32/fpu/s_lrintf.S
@@ -0,0 +1,27 @@
+/* Round argument to nearest integral value according to current rounding
+   direction.
+   Copyright (C) 2015-2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+	.text
+ENTRY(__lrintf)
+	cvtss2si %xmm0,%eax
+	ret
+END(__lrintf)
+weak_alias (__lrintf, lrintf)
diff --git a/sysdeps/x86_64/x32/fpu/s_lrintl.S b/sysdeps/x86_64/x32/fpu/s_lrintl.S
new file mode 100644
index 0000000000..6bc8f6fdb9
--- /dev/null
+++ b/sysdeps/x86_64/x32/fpu/s_lrintl.S
@@ -0,0 +1,30 @@
+/* Round argument to nearest integral value according to current rounding
+   direction.
+   Copyright (C) 1997-2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+	.text
+ENTRY(__lrintl)
+	fldt	8(%rsp)
+	fistpl	-4(%rsp)
+	fwait
+	movl	-4(%rsp),%eax
+	ret
+END(__lrintl)
+weak_alias (__lrintl, lrintl)
diff --git a/sysdeps/x86_64/x32/gmp-mparam.h b/sysdeps/x86_64/x32/gmp-mparam.h
index 2125a70a85..df37442bfb 100644
--- a/sysdeps/x86_64/x32/gmp-mparam.h
+++ b/sysdeps/x86_64/x32/gmp-mparam.h
@@ -1,6 +1,6 @@
 /* gmp-mparam.h -- Compiler/machine parameter header file.
 
-Copyright (C) 2012-2015 Free Software Foundation, Inc.
+Copyright (C) 2012-2016 Free Software Foundation, Inc.
 
 This file is part of the GNU MP Library.
 
diff --git a/sysdeps/x86_64/x32/nptl/tls.h b/sysdeps/x86_64/x32/nptl/tls.h
index bdc74a10a4..245623494b 100644
--- a/sysdeps/x86_64/x32/nptl/tls.h
+++ b/sysdeps/x86_64/x32/nptl/tls.h
@@ -1,5 +1,5 @@
 /* Definition for thread-local data handling.  nptl/x32 version.
-   Copyright (C) 2012-2015 Free Software Foundation, Inc.
+   Copyright (C) 2012-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
diff --git a/sysdeps/x86_64/x32/sysdep.h b/sysdeps/x86_64/x32/sysdep.h
index 0cbc1a083f..17a1446796 100644
--- a/sysdeps/x86_64/x32/sysdep.h
+++ b/sysdeps/x86_64/x32/sysdep.h
@@ -1,5 +1,5 @@
 /* Assembler macros for x32.
-   Copyright (C) 2012-2015 Free Software Foundation, Inc.
+   Copyright (C) 2012-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -90,7 +90,3 @@
 # define R15_LP	"r15d"
 
 #endif	/* __ASSEMBLER__ */
-
-/* On x32, it is not required to normalize a 64-bit value before using
-   it as a 32-bit value.  */
-#define REGISTER_CAST_INT32_TO_INT64 0
author	Samuel Thibault <samuel.thibault@ens-lyon.org>	2016-10-09 19:34:06 +0200
committer	Samuel Thibault <samuel.thibault@ens-lyon.org>	2016-10-09 19:34:06 +0200
commit	6772d640a4f4874166a61f1859e1660a2913a89d (patch)
tree	839fea4d5dcefab75577cecb563ccad4234eb953 /sysdeps/x86_64
parent	f98906bbb57cb495b4501afc5f18604ef3a94e2a (diff)
parent	7bb5f8a836b916d6ebf7b6921b136e99cea2442d (diff)