x86_64: movntdq rarely stores bad data (movdqu works fine). Kernel bug, fried CPU or glibc bug?

From: Sergei Trofimovich
Date: Sat Jun 16 2018 - 17:23:06 EST


TL;DR: on master string/test-memmove glibc test fails on my machine
and I don't know why. Other tests work fine.

$ elf/ld.so --inhibit-cache --library-path . string/test-memmove
simple_memmove __memmove_ssse3_rep __memmove_ssse3 __memmove_sse2_unaligned __memmove_ia32
string/test-memmove: Wrong result in function __memmove_sse2_unaligned dst "0x70000084" src "0x70000000" offset "43297733"

https://sourceware.org/git/?p=glibc.git;a=blob;f=string/test-memmove.c;h=64e3651ba40604e47ddf6d633f4d0aea4644f60a;hb=HEAD

Long story:

I've trimmed __memmove_sse2_unaligned implementation down to
test-memmove-xmm-unaligned.c (attached). It's supposed to show
failed memmove attempts when those happen:

$ gcc -ggdb3 -O2 -m32 test-memmove-xmm-unaligned.c -o test-memmove-xmm-unaligned -Wall && ./test-memmove-xmm-unaligned
Bad result in memmove(dst=0xe7d44110, src=0xe7d44010, len=134217728): offset= 3786689; expected=0039C7C1( 3786689) actual=0039C7C3( 3786691) bit_mismatch=00000002; iteration=1
Bad result in memmove(dst=0xe7d44110, src=0xe7d44010, len=134217728): offset= 3786689; expected=0039C7C1( 3786689) actual=0039C7C3( 3786691) bit_mismatch=00000002; iteration=3
Bad result in memmove(dst=0xe7d44110, src=0xe7d44010, len=134217728): offset= 5448641; expected=005323C1( 5448641) actual=005323C3( 5448643) bit_mismatch=00000002; iteration=5
Bad result in memmove(dst=0xe7d44110, src=0xe7d44010, len=134217728): offset=29022145; expected=01BAD7C1(29022145) actual=01BAD7C3(29022147) bit_mismatch=00000002; iteration=9

$ gcc -ggdb3 -O2 -m64 test-memmove-xmm-unaligned.c -o test-memmove-xmm-unaligned -Wall && ./test-memmove-xmm-unaligned
Bad result in memmove(dst=0x7fa4658bf110, src=0x7fa4658bf010, len=134217728): offset=25257857; expected=01816781(25257857) actual=01816783(25257859) bit_mismatch=00000002; iteration=43
Bad result in memmove(dst=0x7fa4658bf110, src=0x7fa4658bf010, len=134217728): offset=28109697; expected=01ACEB81(28109697) actual=01ACEB83(28109699) bit_mismatch=00000002; iteration=112
Bad result in memmove(dst=0x7fa4658bf110, src=0x7fa4658bf010, len=134217728): offset=18257633; expected=011696E1(18257633) actual=011696E3(18257635) bit_mismatch=00000002; iteration=363
Bad result in memmove(dst=0x7fa4658bf110, src=0x7fa4658bf010, len=134217728): offset=26981249; expected=019BB381(26981249) actual=019BB383(26981251) bit_mismatch=00000002; iteration=437

Note it is a single-bit corruption happening occasionally (not on every iteration).
-m32 is way more error-prone than -m64.

Test example roughly implements these 2 loops:
This fails:
sfence
loop {
movdqu [src++],%xmm0
movntdq %xmm0,[dst++]
}
sfence
This works:
sfence
loop {
movdqu [src++],%xmm0
movdqu %xmm0,[dst++]
}
sfence

Failures happen only on sandybridge CPU:
Intel(R) Core(TM) i7-2700K CPU @ 3.50GHz
kernel is 4.17.0-11928-g2837461dbe6f.

Problem is not reproducible instantly after reboot. Machine has to be
heavily loaded to start corrupting memory. A few hours of memtest86+
does not reveal any memory failures.

I wonder if anyone else can reproduce this failure or should I start
looking for a new CPU.

From the above it looks like as if movntdq does not play well with XMM
context save/restore and there is an 'mfence' missing somewhere in
interrupt handling.

If there are no obvious problems with glibc's memmove() or my small test,
what can I do to rule out / pin down a hardware or kernel problem?

Thanks!

--

Sergei
/*
Test as:
$ gcc -ggdb3 -O2 -m32 test-memmove-xmm-unaligned.c -o test-memmove-xmm-unaligned -Wall && ./test-memmove-xmm-unaligned
Error example:
Bad result in memmove(dst=0xd7cf5094, src=0xd7cf5010, len=268435456): offset= 8031729; expected=007A8DF1( 8031729) actual=007A8DF3( 8031731) bit_mismatch=00000002; iteration=2
Bad result in memmove(dst=0xd7cf5094, src=0xd7cf5010, len=268435456): offset=43626993; expected=0299B1F1(43626993) actual=0299B1F3(43626995) bit_mismatch=00000002; iteration=3
Bad result in memmove(dst=0xd7cf5094, src=0xd7cf5010, len=268435456): offset=25404913; expected=0183A5F1(25404913) actual=0183A5F3(25404915) bit_mismatch=00000002; iteration=4
...
*/

#include <string.h> /* memmove */
#include <stdlib.h> /* exit */
#include <stdio.h> /* fprintf */

#include <sys/mman.h> /* mlock() */
#include <emmintrin.h> /* movdqu, sfence, movntdq */

typedef unsigned int u32; /* 32-bit element type filled and verified by the test */

/*
 * Intrinsics reproduction of the backward-copy inner loop of glibc's
 * __memmove_sse2_unaligned: 8 unaligned SSE2 loads (movdqu) followed by
 * 8 non-temporal stores (movntdq), bracketed by sfence.
 *
 * dest, src - destination/source, each 'items' __m128i elements long
 * items     - element count; MUST be a nonzero multiple of 8: the loop
 *             condition is 'items != 0' while the step is 'items -= 8',
 *             so any other count wraps the unsigned counter past zero
 *             and runs out of bounds.
 *
 * NOTE(review): this function deliberately mirrors the exact instruction
 * sequence of the glibc assembly (link below) to reproduce a suspected
 * CPU/kernel interaction with non-temporal stores — do not "clean up"
 * the structure, including the dead if(0) branch, which documents the
 * movdqu variant that does NOT misbehave.
 */
static void memmove_si128u (__m128i_u * dest, __m128i_u const *src, size_t items) __attribute__((noinline));
static void memmove_si128u (__m128i_u * dest, __m128i_u const *src, size_t items)
{
// emulate behaviour of optimised block for __memmove_sse2_unaligned:
// sfence
// loop(backwards) {
// 8x movdqu mem->%xmm{N}
// 8x movntdq %xmm{N}->mem
// }
// source: https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S;h=9aa17de99c9c3415a9b5ac28fd9f1eb4457f916d;hb=HEAD#l244

// ASSUME: if ((unintptr_t)dest > (unintptr_t)src) {
// Copy backwards (highest element first) so an overlapping dest > src
// region is moved correctly, as memmove requires.
dest += items - 1;
src += items - 1;
_mm_sfence();
for (; items != 0; items-=8, dest-=8, src-=8)
{
__m128i xmm0 = _mm_loadu_si128(src-0); // movdqu
__m128i xmm1 = _mm_loadu_si128(src-1); // movdqu
__m128i xmm2 = _mm_loadu_si128(src-2); // movdqu
__m128i xmm3 = _mm_loadu_si128(src-3); // movdqu
__m128i xmm4 = _mm_loadu_si128(src-4); // movdqu
__m128i xmm5 = _mm_loadu_si128(src-5); // movdqu
__m128i xmm6 = _mm_loadu_si128(src-6); // movdqu
__m128i xmm7 = _mm_loadu_si128(src-7); // movdqu
if (0)
{
// this would work:
_mm_storeu_si128(dest-0, xmm0);// movdqu
_mm_storeu_si128(dest-1, xmm1);// movdqu
_mm_storeu_si128(dest-2, xmm2);// movdqu
_mm_storeu_si128(dest-3, xmm3);// movdqu
_mm_storeu_si128(dest-4, xmm4);// movdqu
_mm_storeu_si128(dest-5, xmm5);// movdqu
_mm_storeu_si128(dest-6, xmm6);// movdqu
_mm_storeu_si128(dest-7, xmm7);// movdqu
}
else
{
// Non-temporal (write-combining) stores: the path that occasionally
// produces single-bit corruption on the reporter's Sandy Bridge CPU.
_mm_stream_si128(dest-0, xmm0); // movntdq
_mm_stream_si128(dest-1, xmm1); // movntdq
_mm_stream_si128(dest-2, xmm2); // movntdq
_mm_stream_si128(dest-3, xmm3); // movntdq
_mm_stream_si128(dest-4, xmm4); // movntdq
_mm_stream_si128(dest-5, xmm5); // movntdq
_mm_stream_si128(dest-6, xmm6); // movntdq
_mm_stream_si128(dest-7, xmm7); // movntdq
}
}
// Drain the write-combining buffers so the stores are globally visible
// before the caller reads the destination back.
_mm_sfence();
}

/*
 * One reproduction round: fill the first half of 'buf' with the ramp
 * 0,1,2,..., copy it 64 u32s (256 bytes) forward with the movntdq-based
 * memmove_si128u(), then verify the destination word by word and report
 * every mismatch on stderr.
 *
 * buf          - base of the working buffer allocated by main()
 * buf_elements - total number of u32 elements in 'buf' (only the first
 *                half is used as source; dest overlaps it at +64)
 * iter         - caller's iteration counter, echoed in error reports
 */
static void do_memmove (u32 * buf, size_t buf_elements, size_t iter) __attribute__((noinline));
static void do_memmove (u32 * buf, size_t buf_elements, size_t iter)
{
    size_t elements_to_move = buf_elements / 2;

    // "memset" buffer with 0, 1, 2, 3, ...
    // NOTE(review): u32 loop index assumes elements_to_move <= UINT_MAX
    // (true for the 256 MiB buffer used by main()).
    for (u32 i = 0; i < elements_to_move; i++) buf[i] = i;

    // dest overlaps source with dest > src, forcing the backward-copy path.
    u32 * dst = buf + 64;

    // __memmove_sse2_unaligned
    // memmove(dst, buf, elements_to_move * sizeof (u32));
    memmove_si128u((__m128i_u *)dst, (__m128i_u const *)buf, elements_to_move * sizeof (u32) / sizeof (__m128i));

    // validate target buffer with 0, 1, 2, 3, ...
    for (u32 i = 0; i < elements_to_move; i++)
    {
        u32 v = dst[i];
        if (v != i)
            // %zu (not %zd) is the correct conversion for the size_t
            // byte count; %zd is for the signed counterpart.
            fprintf (stderr,
                "Bad result in memmove(dst=%p, src=%p, len=%zu)"
                ": offset=%8u; expected=%08X(%8u) actual=%08X(%8u) bit_mismatch=%08X; iteration=%zu\n",
                dst, buf, elements_to_move * sizeof (u32),
                i, i, i, v, v, v^i, iter);
    }
}

/*
 * Allocate a 256 MiB buffer, pin it, and run do_memmove() forever,
 * waiting for the sporadic non-temporal-store corruption to appear.
 */
int main (void)
{
    size_t size = 256 * 1024 * 1024;
    void * buf = malloc(size);
    if (buf == NULL)
    {
        // Without the buffer there is nothing to test; fail loudly
        // instead of dereferencing NULL in do_memmove().
        fprintf (stderr, "failed to allocate %zu bytes\n", size);
        return 1;
    }
    // Best effort: pin the buffer so swapping cannot masquerade as the
    // corruption under investigation. Failure (e.g. RLIMIT_MEMLOCK) is
    // reported but not fatal.
    if (mlock (buf, size) != 0)
        perror ("mlock (non-fatal)");
    // wait for a failure
    for (size_t n = 0; ;++n) {
        do_memmove(buf, size / sizeof (u32), n);
    }
    free(buf); /* unreachable: the loop above never exits */
}

Attachment: pgpwJsnD5nF5Q.pgp
Description: Цифровая подпись OpenPGP