From: Leonardo Sandoval <***@linux.intel.com>
Date: 2018-12-05 22:58:39 UTC
Optimize the strcat/strcpy/stpcpy routines and their length-bounded
(strn*/stpn*) variants with AVX2. The implementations use vector comparisons
as much as possible. Observed speedups compared to sse2_unaligned:
strcpy 1.7x
strncpy 1.4x
strcat 1.5x
strncat 1.4x
stpcpy 1.5x
stpncpy 1.4x
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
strcat-avx2, strncat-avx2, strcpy-avx2, strncpy-avx2,
stpcpy-avx2 and stpncpy-avx2.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c (__libc_ifunc_impl_list):
Add tests for __strcat_avx2, __strncat_avx2, __strcpy_avx2,
__strncpy_avx2, __stpcpy_avx2 and __stpncpy_avx2.
* sysdeps/x86_64/multiarch/{ifunc-unaligned-ssse3.h =>
ifunc-strcpy.h}: Rename header to a more generic name.
* sysdeps/x86_64/multiarch/ifunc-strcpy.h (IFUNC_SELECTOR): Return
OPTIMIZE (avx2) on AVX2 machines if AVX unaligned loads are fast and
Prefer_No_VZEROUPPER is not set.
* sysdeps/x86_64/multiarch/ifunc-strcpy-erms.h: New file.
(IFUNC_SELECTOR): Likewise, but also check for ERMS
availability.
* sysdeps/x86_64/multiarch/strcpy-avx2.S: New file; AVX2 implementation
of strcpy.
* sysdeps/x86_64/multiarch/stpcpy-avx2.c: New file.
* sysdeps/x86_64/multiarch/stpncpy-avx2.c: Likewise.
* sysdeps/x86_64/multiarch/strcat-avx2.c: Likewise.
* sysdeps/x86_64/multiarch/stpcpy.c: Include ifunc-strcpy-erms.h
instead of ifunc-unaligned-ssse3.h.
* sysdeps/x86_64/multiarch/stpncpy.c: Likewise.
* sysdeps/x86_64/multiarch/strcat.c: Include ifunc-strcpy.h instead
of ifunc-unaligned-ssse3.h.
* sysdeps/x86_64/multiarch/strcpy.c: Likewise.
* sysdeps/x86_64/multiarch/strncat.c: Include ifunc-strcpy-erms.h
instead of ifunc-unaligned-ssse3.h.
* sysdeps/x86_64/multiarch/strncpy.c: Likewise.
* sysdeps/x86_64/multiarch/strncat-avx2.c: New file. Builds
__strncat_avx2 from the generic string/strncat.c using optimized
helper routines.
* sysdeps/x86_64/multiarch/strncpy-avx2.c: Likewise, but from
string/strncpy.c.
---
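For readers skimming the assembly: the core of strcpy-avx2.S below is a
32-byte null-byte scan done with vector compares (vpcmpeqb/vpmovmskb/bsf).
A rough stand-alone intrinsics sketch of that idea -- illustrative only,
not code from this patch -- looks like this:

    #include <immintrin.h>

    /* Scan one 32-byte block for a terminating null byte.  Return 1 and
       store its byte offset in *OFFSET if one is found, else return 0.
       The real strcpy-avx2.S first aligns the source pointer so the
       32-byte load cannot cross into an unmapped page.  */
    static inline int
    has_nul_byte (const char *src, unsigned int *offset)
    {
      __m256i chunk = _mm256_loadu_si256 ((const __m256i *) src);
      __m256i eq0 = _mm256_cmpeq_epi8 (chunk, _mm256_setzero_si256 ());
      unsigned int mask = _mm256_movemask_epi8 (eq0);
      if (mask == 0)
        return 0;
      *offset = __builtin_ctz (mask);
      return 1;
    }
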
Changes since v1:
- Decouple the non-strcpy routines from strcpy-avx2.S so that they use the
  generic routines (defined in the string/ folder) together with optimized
  IA helper routines (defined in sysdeps/x86_64/multiarch/); a condensed
  example of the pattern follows.
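
Each of the new C wrappers follows the same pattern: redirect the
string/memory primitives used by the generic implementation to their
optimized AVX2/ERMS counterparts, then include the generic source.
Condensed from stpncpy-avx2.c in the diff below:

    #define STPNCPY __stpncpy_avx2

    extern typeof (strnlen) __strnlen_avx2;
    extern typeof (memcpy) __memmove_avx_unaligned_erms attribute_hidden;
    extern typeof (memset) __memset_avx2_unaligned_erms attribute_hidden;

    /* Make the generic code call the optimized IA helpers.  */
    #define __strnlen __strnlen_avx2
    #define memcpy __memmove_avx_unaligned_erms
    #define memset __memset_avx2_unaligned_erms

    #include <string/stpncpy.c>
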
sysdeps/x86_64/multiarch/Makefile | 3 +
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 +
sysdeps/x86_64/multiarch/ifunc-strcpy-erms.h | 47 +++
...ifunc-unaligned-ssse3.h => ifunc-strcpy.h} | 8 +-
sysdeps/x86_64/multiarch/stpcpy-avx2.c | 43 +++
sysdeps/x86_64/multiarch/stpcpy.c | 2 +-
sysdeps/x86_64/multiarch/stpncpy-avx2.c | 33 ++
sysdeps/x86_64/multiarch/stpncpy.c | 2 +-
sysdeps/x86_64/multiarch/strcat-avx2.c | 36 ++
sysdeps/x86_64/multiarch/strcat.c | 2 +-
sysdeps/x86_64/multiarch/strcpy-avx2.S | 333 ++++++++++++++++++
sysdeps/x86_64/multiarch/strcpy.c | 2 +-
sysdeps/x86_64/multiarch/strncat-avx2.c | 42 +++
sysdeps/x86_64/multiarch/strncat.c | 2 +-
sysdeps/x86_64/multiarch/strncpy-avx2.c | 33 ++
sysdeps/x86_64/multiarch/strncpy.c | 2 +-
16 files changed, 595 insertions(+), 7 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/ifunc-strcpy-erms.h
rename sysdeps/x86_64/multiarch/{ifunc-unaligned-ssse3.h => ifunc-strcpy.h} (82%)
create mode 100644 sysdeps/x86_64/multiarch/stpcpy-avx2.c
create mode 100644 sysdeps/x86_64/multiarch/stpncpy-avx2.c
create mode 100644 sysdeps/x86_64/multiarch/strcat-avx2.c
create mode 100644 sysdeps/x86_64/multiarch/strcpy-avx2.S
create mode 100644 sysdeps/x86_64/multiarch/strncat-avx2.c
create mode 100644 sysdeps/x86_64/multiarch/strncpy-avx2.c
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index bb5e9707352..395e432c092 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -24,11 +24,14 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
strchr-sse2 strchrnul-sse2 strchr-avx2 strchrnul-avx2 \
strrchr-sse2 strrchr-avx2 \
strlen-sse2 strnlen-sse2 strlen-avx2 strnlen-avx2 \
+ strcat-avx2 strncat-avx2 \
strcat-ssse3 strncat-ssse3\
+ strcpy-avx2 strncpy-avx2 \
strcpy-sse2 stpcpy-sse2 \
strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
strcpy-sse2-unaligned strncpy-sse2-unaligned \
stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
+ stpcpy-avx2 stpncpy-avx2 \
strcat-sse2 \
strcat-sse2-unaligned strncat-sse2-unaligned \
strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 9aaaef7251b..8b55bb69540 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -199,6 +199,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL (i, name, stpncpy,
IFUNC_IMPL_ADD (array, i, stpncpy, HAS_CPU_FEATURE (SSSE3),
__stpncpy_ssse3)
+ IFUNC_IMPL_ADD (array, i, stpncpy, HAS_ARCH_FEATURE (AVX2_Usable),
+ __stpncpy_avx2)
IFUNC_IMPL_ADD (array, i, stpncpy, 1,
__stpncpy_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_sse2))
@@ -207,6 +209,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL (i, name, stpcpy,
IFUNC_IMPL_ADD (array, i, stpcpy, HAS_CPU_FEATURE (SSSE3),
__stpcpy_ssse3)
+ IFUNC_IMPL_ADD (array, i, stpcpy, HAS_ARCH_FEATURE (AVX2_Usable),
+ __stpcpy_avx2)
IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2))
@@ -239,6 +243,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strcat.c. */
IFUNC_IMPL (i, name, strcat,
+ IFUNC_IMPL_ADD (array, i, strcat, HAS_ARCH_FEATURE (AVX2_Usable),
+ __strcat_avx2)
IFUNC_IMPL_ADD (array, i, strcat, HAS_CPU_FEATURE (SSSE3),
__strcat_ssse3)
IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2_unaligned)
@@ -280,6 +286,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strcpy.c. */
IFUNC_IMPL (i, name, strcpy,
+ IFUNC_IMPL_ADD (array, i, strcpy, HAS_ARCH_FEATURE (AVX2_Usable),
+ __strcpy_avx2)
IFUNC_IMPL_ADD (array, i, strcpy, HAS_CPU_FEATURE (SSSE3),
__strcpy_ssse3)
IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned)
@@ -321,6 +329,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strncat.c. */
IFUNC_IMPL (i, name, strncat,
+ IFUNC_IMPL_ADD (array, i, strncat, HAS_ARCH_FEATURE (AVX2_Usable),
+ __strncat_avx2)
IFUNC_IMPL_ADD (array, i, strncat, HAS_CPU_FEATURE (SSSE3),
__strncat_ssse3)
IFUNC_IMPL_ADD (array, i, strncat, 1,
@@ -329,6 +339,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strncpy.c. */
IFUNC_IMPL (i, name, strncpy,
+ IFUNC_IMPL_ADD (array, i, strncpy, HAS_ARCH_FEATURE (AVX2_Usable),
+ __strncpy_avx2)
IFUNC_IMPL_ADD (array, i, strncpy, HAS_CPU_FEATURE (SSSE3),
__strncpy_ssse3)
IFUNC_IMPL_ADD (array, i, strncpy, 1,
diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy-erms.h b/sysdeps/x86_64/multiarch/ifunc-strcpy-erms.h
new file mode 100644
index 00000000000..db73e8d2720
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/ifunc-strcpy-erms.h
@@ -0,0 +1,47 @@
+/* Common definition for ifunc selections optimized with SSE2, unaligned
+ SSE2, SSSE3, AVX2 and ERMS.
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <init-arch.h>
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
+ attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+ const struct cpu_features* cpu_features = __get_cpu_features ();
+
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
+ && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
+ && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)
+ && CPU_FEATURES_CPU_P (cpu_features, ERMS))
+ return OPTIMIZE (avx2);
+
+ if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
+ return OPTIMIZE (sse2_unaligned);
+
+ if (CPU_FEATURES_CPU_P (cpu_features, SSSE3))
+ return OPTIMIZE (ssse3);
+
+ return OPTIMIZE (sse2);
+}
diff --git a/sysdeps/x86_64/multiarch/ifunc-unaligned-ssse3.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
similarity index 82%
rename from sysdeps/x86_64/multiarch/ifunc-unaligned-ssse3.h
rename to sysdeps/x86_64/multiarch/ifunc-strcpy.h
index 81805f98323..092b368477c 100644
--- a/sysdeps/x86_64/multiarch/ifunc-unaligned-ssse3.h
+++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
@@ -1,5 +1,5 @@
/* Common definition for ifunc selections optimized with SSE2, unaligned
- SSE2 and SSSE3.
+ SSE2, SSSE3 and AVX2.
All versions must be listed in ifunc-impl-list.c.
Copyright (C) 2017-2018 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -24,12 +24,18 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
static inline void *
IFUNC_SELECTOR (void)
{
const struct cpu_features* cpu_features = __get_cpu_features ();
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
+ && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
+ && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+ return OPTIMIZE (avx2);
+
if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
return OPTIMIZE (sse2_unaligned);
diff --git a/sysdeps/x86_64/multiarch/stpcpy-avx2.c b/sysdeps/x86_64/multiarch/stpcpy-avx2.c
new file mode 100644
index 00000000000..fc49f92914c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpcpy-avx2.c
@@ -0,0 +1,43 @@
+/* stpcpy with AVX2
+ Copyright (C) 2011-2018 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+#include <string.h>
+
+#define STPCPY __stpcpy_avx2
+
+extern typeof (strlen) __strlen_avx2;
+extern typeof (memcpy) __memmove_avx_unaligned_erms attribute_hidden;
+
+#define strlen __strlen_avx2
+#define memcpy __memmove_avx_unaligned_erms
+
+#undef weak_alias
+#define weak_alias(name, aliasname)
+
+#undef libc_hidden_def
+#define libc_hidden_def(name)
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <string/stpcpy.c>
+
+#endif
+
diff --git a/sysdeps/x86_64/multiarch/stpcpy.c b/sysdeps/x86_64/multiarch/stpcpy.c
index f74a54b153a..59c9e976ff1 100644
--- a/sysdeps/x86_64/multiarch/stpcpy.c
+++ b/sysdeps/x86_64/multiarch/stpcpy.c
@@ -28,7 +28,7 @@
# undef __stpcpy
# define SYMBOL_NAME stpcpy
-# include "ifunc-unaligned-ssse3.h"
+# include "ifunc-strcpy-erms.h"
libc_ifunc_redirected (__redirect_stpcpy, __stpcpy, IFUNC_SELECTOR ());
diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2.c b/sysdeps/x86_64/multiarch/stpncpy-avx2.c
new file mode 100644
index 00000000000..7940c67358b
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpncpy-avx2.c
@@ -0,0 +1,33 @@
+/* Copyright (C) 2014-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <string.h>
+
+#define STPNCPY __stpncpy_avx2
+
+#undef libc_hidden_def
+#define libc_hidden_def(name)
+
+extern typeof (strnlen) __strnlen_avx2;
+extern typeof (memcpy) __memmove_avx_unaligned_erms attribute_hidden;
+extern typeof (memset) __memset_avx2_unaligned_erms attribute_hidden;
+
+#define __strnlen __strnlen_avx2
+#define memcpy __memmove_avx_unaligned_erms
+#define memset __memset_avx2_unaligned_erms
+
+#include <string/stpncpy.c>
diff --git a/sysdeps/x86_64/multiarch/stpncpy.c b/sysdeps/x86_64/multiarch/stpncpy.c
index 28842ece2b0..bf96f2520bb 100644
--- a/sysdeps/x86_64/multiarch/stpncpy.c
+++ b/sysdeps/x86_64/multiarch/stpncpy.c
@@ -26,7 +26,7 @@
# undef __stpncpy
# define SYMBOL_NAME stpncpy
-# include "ifunc-unaligned-ssse3.h"
+# include "ifunc-strcpy-erms.h"
libc_ifunc_redirected (__redirect_stpncpy, __stpncpy, IFUNC_SELECTOR ());
diff --git a/sysdeps/x86_64/multiarch/strcat-avx2.c b/sysdeps/x86_64/multiarch/strcat-avx2.c
new file mode 100644
index 00000000000..e06ffbd0050
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcat-avx2.c
@@ -0,0 +1,36 @@
+/* strcat with AVX2
+ Copyright (C) 2011-2018 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+#include <string.h>
+
+#define STRCAT __strcat_avx2
+
+#undef libc_hidden_def
+#define libc_hidden_def(name)
+
+extern typeof (strcpy) __strcpy_avx2;
+extern typeof (strlen) __strlen_avx2;
+
+#define strcpy __strcpy_avx2
+#define strlen __strlen_avx2
+#include <string/strcat.c>
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/strcat.c b/sysdeps/x86_64/multiarch/strcat.c
index 1922c0a0da4..186b8882ea5 100644
--- a/sysdeps/x86_64/multiarch/strcat.c
+++ b/sysdeps/x86_64/multiarch/strcat.c
@@ -24,7 +24,7 @@
# undef strcat
# define SYMBOL_NAME strcat
-# include "ifunc-unaligned-ssse3.h"
+# include "ifunc-strcpy.h"
libc_ifunc_redirected (__redirect_strcat, strcat, IFUNC_SELECTOR ());
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
new file mode 100644
index 00000000000..c340381a933
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -0,0 +1,333 @@
+/* strcpy with AVX2
+ Copyright (C) 2011-2018 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+#include <sysdep.h>
+
+#define STRCPY __strcpy_avx2
+
+/* Number of bytes in a vector register */
+#define VEC_SIZE 32
+
+#define VZEROUPPER vzeroupper
+
+/* define special purpose registers */
+#define xmmZ xmm0 /* zero xmm register */
+#define ymmZ ymm0 /* zero ymm register */
+#define ymmM ymm1 /* mask register */
+
+ .section .text.avx,"ax",@progbits
+ENTRY (STRCPY)
+ mov %rsi, %rcx
+ mov %rdi, %rax /* save result */
+
+ vpxor %xmmZ, %xmmZ, %xmmZ
+
+ and $((VEC_SIZE * 4) - 1), %ecx
+ cmp $(VEC_SIZE * 2), %ecx
+ jbe L(SourceStringAlignmentLessTwoVecSize)
+
+ and $-VEC_SIZE, %rsi
+ and $(VEC_SIZE - 1), %ecx
+
+ vpcmpeqb (%rsi), %ymmZ, %ymmM
+ vpmovmskb %ymmM, %edx
+ shr %cl, %rdx
+
+ test %edx, %edx
+ jnz L(CopyVecSizeTail)
+
+ vpcmpeqb VEC_SIZE(%rsi), %ymmZ, %ymm2
+ vpmovmskb %ymm2, %edx
+
+ test %edx, %edx
+ jnz L(CopyTwoVecSize)
+
+ vmovdqu (%rsi, %rcx), %ymm2 /* copy VEC_SIZE bytes */
+ vmovdqu %ymm2, (%rdi)
+
+/* If source address alignment != destination address alignment */
+ .p2align 4
+L(UnalignVecSizeBoth):
+ sub %rcx, %rdi
+ mov $VEC_SIZE, %rcx
+ vmovdqa (%rsi, %rcx), %ymm2
+ vmovdqu %ymm2, (%rdi, %rcx)
+ vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
+ vpcmpeqb %ymm2, %ymmZ, %ymmM
+ vpmovmskb %ymmM, %edx
+ add $VEC_SIZE, %rcx
+ test %edx, %edx
+ jnz L(CopyVecSize)
+
+ vmovdqu %ymm2, (%rdi, %rcx)
+ vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
+ vpcmpeqb %ymm3, %ymmZ, %ymmM
+ vpmovmskb %ymmM, %edx
+ add $VEC_SIZE, %rcx
+ test %edx, %edx
+ jnz L(CopyVecSize)
+
+ vmovdqu %ymm3, (%rdi, %rcx)
+ vmovdqa VEC_SIZE(%rsi, %rcx), %ymm4
+ vpcmpeqb %ymm4, %ymmZ, %ymmM
+ vpmovmskb %ymmM, %edx
+ add $VEC_SIZE, %rcx
+ test %edx, %edx
+ jnz L(CopyVecSize)
+
+ vmovdqu %ymm4, (%rdi, %rcx)
+ vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
+ vpcmpeqb %ymm2, %ymmZ, %ymmM
+ vpmovmskb %ymmM, %edx
+ add $VEC_SIZE, %rcx
+ test %edx, %edx
+ jnz L(CopyVecSize)
+
+ vmovdqu %ymm2, (%rdi, %rcx)
+ vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
+ vpcmpeqb %ymm2, %ymmZ, %ymmM
+ vpmovmskb %ymmM, %edx
+ add $VEC_SIZE, %rcx
+ test %edx, %edx
+ jnz L(CopyVecSize)
+
+ vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
+ vmovdqu %ymm2, (%rdi, %rcx)
+ vpcmpeqb %ymm3, %ymmZ, %ymmM
+ vpmovmskb %ymmM, %edx
+ add $VEC_SIZE, %rcx
+ test %edx, %edx
+ jnz L(CopyVecSize)
+
+ vmovdqu %ymm3, (%rdi, %rcx)
+ mov %rsi, %rdx
+ lea VEC_SIZE(%rsi, %rcx), %rsi
+ and $-(VEC_SIZE * 4), %rsi
+ sub %rsi, %rdx
+ sub %rdx, %rdi
+L(UnalignedFourVecSizeLoop):
+ vmovdqa (%rsi), %ymm4
+ vmovdqa VEC_SIZE(%rsi), %ymm5
+ vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
+ vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
+ vpminub %ymm5, %ymm4, %ymm2
+ vpminub %ymm7, %ymm6, %ymm3
+ vpminub %ymm2, %ymm3, %ymm3
+ vpcmpeqb %ymmM, %ymm3, %ymm3
+ vpmovmskb %ymm3, %edx
+ test %edx, %edx
+ jnz L(UnalignedFourVecSizeLeave)
+
+L(UnalignedFourVecSizeLoop_start):
+ add $(VEC_SIZE * 4), %rdi
+ add $(VEC_SIZE * 4), %rsi
+ vmovdqu %ymm4, -(VEC_SIZE * 4)(%rdi)
+ vmovdqa (%rsi), %ymm4
+ vmovdqu %ymm5, -(VEC_SIZE * 3)(%rdi)
+ vmovdqa VEC_SIZE(%rsi), %ymm5
+ vpminub %ymm5, %ymm4, %ymm2
+ vmovdqu %ymm6, -(VEC_SIZE * 2)(%rdi)
+ vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
+ vmovdqu %ymm7, -VEC_SIZE(%rdi)
+ vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
+ vpminub %ymm7, %ymm6, %ymm3
+ vpminub %ymm2, %ymm3, %ymm3
+ vpcmpeqb %ymmM, %ymm3, %ymm3
+ vpmovmskb %ymm3, %edx
+ test %edx, %edx
+ jz L(UnalignedFourVecSizeLoop_start)
+
+L(UnalignedFourVecSizeLeave):
+ vpcmpeqb %ymm4, %ymmZ, %ymmM
+ vpmovmskb %ymmM, %edx
+ test %edx, %edx
+ jnz L(CopyVecSizeUnaligned_0)
+
+ vpcmpeqb %ymm5, %ymmZ, %ymmM
+ vpmovmskb %ymmM, %ecx
+ test %ecx, %ecx
+ jnz L(CopyVecSizeUnaligned_16)
+
+ vpcmpeqb %ymm6, %ymmZ, %ymmM
+ vpmovmskb %ymmM, %edx
+ test %edx, %edx
+ jnz L(CopyVecSizeUnaligned_32)
+
+ vpcmpeqb %ymm7, %ymmZ, %ymmM
+ vpmovmskb %ymmM, %ecx
+ bsf %ecx, %edx
+ vmovdqu %ymm4, (%rdi)
+ vmovdqu %ymm5, VEC_SIZE(%rdi)
+ vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
+ add $(VEC_SIZE * 3), %rsi
+ add $(VEC_SIZE * 3), %rdi
+ jmp L(CopyVecSizeExit)
+
+/* If source address alignment == destination address alignment */
+
+L(SourceStringAlignmentLessTwoVecSize):
+ vmovdqu (%rsi), %ymm3
+ vmovdqu VEC_SIZE(%rsi), %ymm2
+ vpcmpeqb %ymm3, %ymmZ, %ymmM
+ vpmovmskb %ymmM, %edx
+
+ test %edx, %edx
+ jnz L(CopyVecSizeTail1)
+
+ vmovdqu %ymm3, (%rdi)
+ vpcmpeqb %ymm2, %ymmZ, %ymmM
+ vpmovmskb %ymmM, %edx
+
+ test %edx, %edx
+ jnz L(CopyTwoVecSize1)
+
+ and $-VEC_SIZE, %rsi
+ and $(VEC_SIZE - 1), %ecx
+ jmp L(UnalignVecSizeBoth)
+
+/*------End of main part with loops---------------------*/
+
+/* Case1 */
+
+ .p2align 4
+L(CopyVecSize):
+ add %rcx, %rdi
+L(CopyVecSizeTail):
+ add %rcx, %rsi
+L(CopyVecSizeTail1):
+ bsf %edx, %edx
+L(CopyVecSizeExit):
+ cmp $32, %edx
+ jae L(Exit32_63)
+ cmp $16, %edx
+ jae L(Exit16_31)
+ cmp $8, %edx
+ jae L(Exit8_15)
+ cmp $4, %edx
+ jae L(Exit4_7)
+ cmp $3, %edx
+ je L(Exit3)
+ cmp $1, %edx
+ ja L(Exit2)
+ je L(Exit1)
+ movb $0, (%rdi)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(CopyTwoVecSize1):
+ add $VEC_SIZE, %rsi
+ add $VEC_SIZE, %rdi
+ jmp L(CopyVecSizeTail1)
+
+ .p2align 4
+L(CopyTwoVecSize):
+ bsf %edx, %edx
+ add %rcx, %rsi
+ add $VEC_SIZE, %edx
+ sub %ecx, %edx
+ jmp L(CopyVecSizeExit)
+
+ .p2align 4
+L(CopyVecSizeUnaligned_0):
+ bsf %edx, %edx
+ jmp L(CopyVecSizeExit)
+
+ .p2align 4
+L(CopyVecSizeUnaligned_16):
+ bsf %ecx, %edx
+ vmovdqu %ymm4, (%rdi)
+ add $VEC_SIZE, %rsi
+ add $VEC_SIZE, %rdi
+ jmp L(CopyVecSizeExit)
+
+ .p2align 4
+L(CopyVecSizeUnaligned_32):
+ bsf %edx, %edx
+ vmovdqu %ymm4, (%rdi)
+ vmovdqu %ymm5, VEC_SIZE(%rdi)
+ add $(VEC_SIZE * 2), %rsi
+ add $(VEC_SIZE * 2), %rdi
+ jmp L(CopyVecSizeExit)
+
+/*----------- Exit labels below copy the final 1-VEC_SIZE and 1-(VEC_SIZE * 2) bytes -----------*/
+
+ .p2align 4
+L(Exit1):
+ movzwl (%rsi), %edx
+ mov %dx, (%rdi)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit2):
+ movzwl (%rsi), %ecx
+ mov %cx, (%rdi)
+ movb $0, 2(%rdi)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit3):
+ mov (%rsi), %edx
+ mov %edx, (%rdi)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit4_7):
+ mov (%rsi), %ecx
+ mov %ecx, (%rdi)
+ mov -3(%rsi, %rdx), %ecx
+ mov %ecx, -3(%rdi, %rdx)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit8_15):
+ mov (%rsi), %rcx
+ mov -7(%rsi, %rdx), %r9
+ mov %rcx, (%rdi)
+ mov %r9, -7(%rdi, %rdx)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit16_31):
+ vmovdqu (%rsi), %xmm2
+ vmovdqu -15(%rsi, %rdx), %xmm3
+ vmovdqu %xmm2, (%rdi)
+ vmovdqu %xmm3, -15(%rdi, %rdx)
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(Exit32_63):
+ vmovdqu (%rsi), %ymm2
+ vmovdqu -31(%rsi, %rdx), %ymm3
+ vmovdqu %ymm2, (%rdi)
+ vmovdqu %ymm3, -31(%rdi, %rdx)
+ VZEROUPPER
+ ret
+
+
+END (STRCPY)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strcpy.c b/sysdeps/x86_64/multiarch/strcpy.c
index ce819dd2321..f9b0dddf4dd 100644
--- a/sysdeps/x86_64/multiarch/strcpy.c
+++ b/sysdeps/x86_64/multiarch/strcpy.c
@@ -24,7 +24,7 @@
# undef strcpy
# define SYMBOL_NAME strcpy
-# include "ifunc-unaligned-ssse3.h"
+# include "ifunc-strcpy.h"
libc_ifunc_redirected (__redirect_strcpy, strcpy, IFUNC_SELECTOR ());
diff --git a/sysdeps/x86_64/multiarch/strncat-avx2.c b/sysdeps/x86_64/multiarch/strncat-avx2.c
new file mode 100644
index 00000000000..017d079ceba
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncat-avx2.c
@@ -0,0 +1,42 @@
+/* strncat with AVX2
+ Copyright (C) 2011-2018 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+#include <string.h>
+
+#define STRNCAT __strncat_avx2
+
+#undef libc_hidden_def
+#define libc_hidden_def(name)
+
+extern typeof (strncpy) __strncpy_avx2;
+extern typeof (strlen) __strlen_avx2;
+extern typeof (strnlen) __strnlen_avx2;
+extern typeof (memcpy) __memmove_avx_unaligned_erms attribute_hidden;
+
+#define strncpy __strncpy_avx2
+#define strlen __strlen_avx2
+#define memcpy __memmove_avx_unaligned_erms
+#define __strnlen __strnlen_avx2
+
+#include <string/strncat.c>
+
+#endif
+
diff --git a/sysdeps/x86_64/multiarch/strncat.c b/sysdeps/x86_64/multiarch/strncat.c
index 2546277450b..d44b4338bb9 100644
--- a/sysdeps/x86_64/multiarch/strncat.c
+++ b/sysdeps/x86_64/multiarch/strncat.c
@@ -24,7 +24,7 @@
# undef strncat
# define SYMBOL_NAME strncat
-# include "ifunc-unaligned-ssse3.h"
+# include "ifunc-strcpy-erms.h"
libc_ifunc_redirected (__redirect_strncat, strncat, IFUNC_SELECTOR ());
strong_alias (strncat, __strncat);
diff --git a/sysdeps/x86_64/multiarch/strncpy-avx2.c b/sysdeps/x86_64/multiarch/strncpy-avx2.c
new file mode 100644
index 00000000000..a3e6aff26db
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncpy-avx2.c
@@ -0,0 +1,33 @@
+/* Copyright (C) 2014-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <string.h>
+
+#define STRNCPY __strncpy_avx2
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+extern typeof (strnlen) __strnlen_avx2;
+extern typeof (memcpy) __memmove_avx_unaligned_erms attribute_hidden;
+extern typeof (memset) __memset_avx2_unaligned_erms attribute_hidden;
+
+#define __strnlen __strnlen_avx2
+#define memcpy __memmove_avx_unaligned_erms
+#define memset __memset_avx2_unaligned_erms
+
+#include <string/strncpy.c>
diff --git a/sysdeps/x86_64/multiarch/strncpy.c b/sysdeps/x86_64/multiarch/strncpy.c
index 37aae2c3ba1..90032d1dd86 100644
--- a/sysdeps/x86_64/multiarch/strncpy.c
+++ b/sysdeps/x86_64/multiarch/strncpy.c
@@ -24,7 +24,7 @@
# undef strncpy
# define SYMBOL_NAME strncpy
-# include "ifunc-unaligned-ssse3.h"
+# include "ifunc-strcpy-erms.h"
libc_ifunc_redirected (__redirect_strncpy, strncpy, IFUNC_SELECTOR ());
--
2.19.2