Based on:
https://sourceware.org/bugzilla/show_bug.cgi?id=23196
https://sourceware.org/git/gitweb.cgi?p=glibc.git;a=patch;h=6b4362f2cbb6ef6e265d9f216f3c13d84405a1c0
https://sourceware.org/git/gitweb.cgi?p=glibc.git;a=patch;h=58ad5f8a646338b2ee3f2136336dcf731e97ab4d

From 6b4362f2cbb6ef6e265d9f216f3c13d84405a1c0 Mon Sep 17 00:00:00 2001
From: Andreas Schwab <schwab@suse.de>
Date: Thu, 24 May 2018 14:39:18 +0200
Subject: [PATCH] Don't write beyond destination in
 __mempcpy_avx512_no_vzeroupper (bug 23196)

When compiled as mempcpy, the return value is the end of the destination
buffer, thus it cannot be used to refer to the start of it.

From 58ad5f8a646338b2ee3f2136336dcf731e97ab4d Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Wed, 23 May 2018 03:59:56 -0700
Subject: [PATCH] Add a test case for [BZ #23196]

	[BZ #23196]
	* string/test-memcpy.c (do_test1): New function.
	(test_main): Call it.
---
 string/test-memcpy.c                          | 47 +++++++++++++++++++
 string/test-mempcpy.c                         |  1 +
 .../multiarch/memcpy-avx512-no-vzeroupper.S   |  5 +-
 3 files changed, 51 insertions(+), 2 deletions(-)

diff --git a/string/test-memcpy.c b/string/test-memcpy.c
index 2a0994c..4c2b9e0 100644
--- a/string/test-memcpy.c
+++ b/string/test-memcpy.c
@@ -206,6 +206,50 @@ do_random_tests (void)
     }
 }
 
+/* BZ #23196: DEST ends at a PROT_NONE guard page, so overruns fault.  */
+static void
+do_test1 (void)
+{
+  size_t size = 0x100000;
+  size_t map_size = size * 2 + page_size;
+  void *large_buf = mmap (NULL, map_size, PROT_READ | PROT_WRITE,
+			  MAP_PRIVATE | MAP_ANON, -1, 0);
+
+  if (large_buf == MAP_FAILED)
+    {
+      puts ("Failed to allocate large_buf, skipping do_test1");
+      return;
+    }
+
+  if (mprotect (large_buf + size, page_size, PROT_NONE))
+    error (EXIT_FAILURE, errno, "mprotect failed");
+
+  size_t array_size = size / sizeof (uint32_t);
+  uint32_t *dest = large_buf;
+  uint32_t *src = large_buf + size + page_size;
+
+  for (size_t i = 0; i < array_size; i++)
+    src[i] = (uint32_t) i;
+
+  FOR_EACH_IMPL (impl, 0)
+    {
+      memset (dest, -1, size);
+      CALL (impl, (char *) dest, (char *) src, size);
+      for (size_t i = 0; i < array_size; i++)
+	if (dest[i] != src[i])
+	  {
+	    error (0, 0,
+		   "Wrong result in function %s dst \"%p\" src \"%p\" offset \"%zu\"",
+		   impl->name, dest, src, i);
+	    ret = 1;
+	    break;
+	  }
+    }
+
+  /* One call releases both buffers and the guard page between them.  */
+  munmap (large_buf, map_size);
+}
+
 int
 test_main (void)
 {
@@ -247,6 +291,9 @@ test_main (void)
   do_test (0, 0, getpagesize ());
 
   do_random_tests ();
+
+  do_test1 ();
+
   return ret;
 }
 
diff --git a/string/test-mempcpy.c b/string/test-mempcpy.c
index f4969c2..d180230 100644
--- a/string/test-mempcpy.c
+++ b/string/test-mempcpy.c
@@ -18,6 +18,7 @@
    <http://www.gnu.org/licenses/>.  */
 
 #define MEMCPY_RESULT(dst, len) (dst) + (len)
+#define MIN_PAGE_SIZE 131072
 #define TEST_MAIN
 #define TEST_NAME "mempcpy"
 #include "test-string.h"
diff --git a/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S
index 1bb12e8..96c369e 100644
--- a/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S
+++ b/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S
@@ -333,6 +333,7 @@ L(preloop_large):
 	vmovups	(%rsi), %zmm4
 	vmovups	0x40(%rsi), %zmm5
 
+	mov	%rdi, %r11
 /* Align destination for access with non-temporal stores in the loop.  */
 	mov	%rdi, %r8
 	and	$-0x80, %rdi
@@ -363,8 +364,8 @@ L(gobble_256bytes_nt_loop):
 	cmp	$256, %rdx
 	ja	L(gobble_256bytes_nt_loop)
 	sfence
-	vmovups	%zmm4, (%rax)
-	vmovups	%zmm5, 0x40(%rax)
+	vmovups	%zmm4, (%r11)
+	vmovups	%zmm5, 0x40(%r11)
 	jmp	L(check)
 
 L(preloop_large_bkw):
-- 
2.17.1

