Hi,
I’m using memtester to test the memory of my Unmatched board. It reports a failure at ‘Bit Spread: testing 48’. The failure is reproducible, but the failing offset is different each time.
root@ubuntu:~# memtester 15000M
memtester version 4.6.0 (64-bit)
Copyright (C) 2001-2020 Charles Cazabon.
Licensed under the GNU General Public License version 2 (only).
pagesize is 4096
pagesizemask is 0xfffffffffffff000
want 15000MB (15728640000 bytes)
got 15000MB (15728640000 bytes), trying mlock ...locked.
Loop 1:
Stuck Address : ok
Random Value : ok
Compare XOR : ok
Compare SUB : ok
Compare MUL : ok
Compare DIV : ok
Compare OR : ok
Compare AND : ok
Sequential Increment: ok
Solid Bits : ok
Block Sequential : ok
Checkerboard : ok
Bit Spread : testing 48FAILURE: 0x0002000000000000 != 0x0005000000000000 at offset 0x00000000a7da5ec0.
FAILURE: 0xfffdffffffffffff != 0xfffaffffffffffff at offset 0x00000000a7da5ec8.
FAILURE: 0x0002000000000000 != 0x0005000000000000 at offset 0x00000000a7da5ed0.
FAILURE: 0xfffdffffffffffff != 0xfffaffffffffffff at offset 0x00000000a7da5ed8.
FAILURE: 0x0002000000000000 != 0x0005000000000000 at offset 0x00000000a7da5ee0.
FAILURE: 0xfffdffffffffffff != 0xfffaffffffffffff at offset 0x00000000a7da5ee8.
FAILURE: 0x0002000000000000 != 0x0005000000000000 at offset 0x00000000a7da5ef0.
FAILURE: 0xfffdffffffffffff != 0xfffaffffffffffff at offset 0x00000000a7da5ef8.
FAILURE: 0x0002000000000000 != 0x0005000000000000 at offset 0x00000000fbbed440.
FAILURE: 0xfffdffffffffffff != 0xfffaffffffffffff at offset 0x00000000fbbed448.
FAILURE: 0x0002000000000000 != 0x0005000000000000 at offset 0x00000000fbbed450.
FAILURE: 0xfffdffffffffffff != 0xfffaffffffffffff at offset 0x00000000fbbed458.
FAILURE: 0x0002000000000000 != 0x0005000000000000 at offset 0x00000000fbbed460.
FAILURE: 0xfffdffffffffffff != 0xfffaffffffffffff at offset 0x00000000fbbed468.
FAILURE: 0x0002000000000000 != 0x0005000000000000 at offset 0x00000000fbbed470.
FAILURE: 0xfffdffffffffffff != 0xfffaffffffffffff at offset 0x00000000fbbed478.
FAILURE: 0x0002000000000000 != 0x0005000000000000 at offset 0x000000013fbc1280.
FAILURE: 0xfffdffffffffffff != 0xfffaffffffffffff at offset 0x000000013fbc1288.
FAILURE: 0x0002000000000000 != 0x0005000000000000 at offset 0x000000013fbc1290.
FAILURE: 0xfffdffffffffffff != 0xfffaffffffffffff at offset 0x000000013fbc1298.
FAILURE: 0x0002000000000000 != 0x0005000000000000 at offset 0x000000013fbc12a0.
FAILURE: 0xfffdffffffffffff != 0xfffaffffffffffff at offset 0x000000013fbc12a8.
FAILURE: 0x0002000000000000 != 0x0005000000000000 at offset 0x000000013fbc12b0.
FAILURE: 0xfffdffffffffffff != 0xfffaffffffffffff at offset 0x000000013fbc12b8.
Bit Flip : setting 307^C
root@ubuntu:~#
My test environment is Ubuntu 24.04.2 (fresh install; memtester installed with `apt install memtester`; gcc 13.3.0), with no GPU installed.
There is an old post describing a similar problem, so I doubt that this is really caused by faulty DRAM. It would be appreciated if you could run memtester on your board and see if the same problem exists.
However, you need to wait 10+ hours for stock memtester to reach ‘Bit Spread: testing 48’. To make life easier, I cut the memtester code in question down to a simple snippet:
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#define FMT_TARGET "0x%016lx"
#define UL_ONEBITS 0xffffffffffffffffUL
#define UL_LEN 64
#define ONE 0x00000001UL
typedef unsigned long ul;
typedef unsigned long volatile ulv;
/*
 * Compares two equally sized regions word by word.
 *
 * bufa, bufb: start of the two regions to compare.
 * count:      number of unsigned-long words in each region.
 *
 * Every mismatching word is reported on stderr together with its byte
 * offset from the start of the region; the scan continues after a mismatch
 * so that all differing words are listed.
 *
 * Returns 0 when all words match, -1 when at least one word differs.
 */
int __attribute__ ((noinline)) compare_regions(ulv *bufa, ulv *bufb, size_t count) {
    int r = 0;

    for (size_t i = 0; i < count; i++) {
        /*
         * The buffers are volatile, so every access is a fresh load. Read
         * each word exactly once and report the very values that were
         * compared, rather than loading memory a second time for the
         * message.
         */
        ul a = bufa[i];
        ul b = bufb[i];

        if (a != b) {
            fprintf(stderr,
                    "FAILURE: " FMT_TARGET " != " FMT_TARGET " at offset " FMT_TARGET ".\n",
                    a, b, (ul) (i * sizeof(ul)));
            r = -1;
        }
    }
    return r;
}
int __attribute__ ((noinline)) test_bitspread_comparison(ulv *bufa, ulv *bufb, size_t count) {
ulv *p1 = bufa;
ulv *p2 = bufb;
unsigned int j;
size_t i;
printf(" ");
fflush(stdout);
for (j = 47; j < UL_LEN * 2; j++) {
printf("\b\b\b\b\b\b\b\b\b\b\b");
p1 = (ulv *) bufa;
p2 = (ulv *) bufb;
printf("setting %3u", j);
fflush(stdout);
for (i = 0; i < count; i++) {
if (j < UL_LEN) { /* Walk it up. */
*p1++ = *p2++ = (i % 2 == 0)
? (ONE << j) | (ONE << (j + 2))
: UL_ONEBITS ^ ((ONE << j)
| (ONE << (j + 2)));
} else { /* Walk it back down. */
*p1++ = *p2++ = (i % 2 == 0)
? (ONE << (UL_LEN * 2 - 1 - j)) | (ONE << (UL_LEN * 2 + 1 - j))
: UL_ONEBITS ^ (ONE << (UL_LEN * 2 - 1 - j)
| (ONE << (UL_LEN * 2 + 1 - j)));
}
}
printf("\b\b\b\b\b\b\b\b\b\b\b");
printf("testing %3u", j);
fflush(stdout);
if (compare_regions(bufa, bufb, count)) {
return -1;
}
}
printf("\b\b\b\b\b\b\b\b\b\b\b \b\b\b\b\b\b\b\b\b\b\b");
fflush(stdout);
return 0;
}
/* Size in bytes of the anonymous mapping under test (15 * 1024^3). */
#define MEMSZ ((size_t)15 * 1024 * 1024 * 1024)

/*
 * Maps MEMSZ bytes of anonymous memory, tries to lock it, splits it into
 * two halves and runs the bit-spread test on them in an endless loop,
 * swapping the roles of the two halves every round. The loop never exits
 * on its own; the program is meant to be interrupted by the user.
 */
int main(void)
{
    printf("sz=%zu\n", MEMSZ);
    void *mem = mmap(NULL, MEMSZ, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    /* mmap signals failure with MAP_FAILED, not with a null pointer. */
    if (mem == MAP_FAILED) { perror("mmap failed"); exit(1); }
    printf("mem=%p\n", mem);
    printf("mlock...\n");
    /* A failed mlock is only reported; the test still runs on unlocked memory. */
    if (mlock(mem, MEMSZ) != 0) { perror("mlock failed"); fputs("can't mlock!!! not root?\n", stderr); }

    /* Each buffer covers half of the mapping, counted in words. */
    size_t count = MEMSZ / 2 / sizeof(ul);
    ulv *bufa;
    ulv *bufb;
    for (size_t round = 0; ; round++) {
        printf("round %zu\n", round);
        if (round % 2 == 0) {
            bufa = mem;
            bufb = bufa + count;
        } else {
            bufb = mem;
            bufa = bufb + count;
        }
        /* %p expects a void pointer, so convert explicitly. */
        printf("bufa=%p\n", (void *) bufa);
        printf("bufb=%p\n", (void *) bufb);
        test_bitspread_comparison(bufa, bufb, count);
    }
}
You can compile it with `gcc -O2 -o test test.c` (you can also try `clang`), run it with `./test`, and check whether it outputs `FAILURE: ...` like this:
root@ubuntu:~# gcc -O2 -o test test.c
root@ubuntu:~# ./test
sz=16106127360
mem=0x360f200000
mlock...
round 0
bufa=0x360f200000
bufb=0x37ef200000
testing 48FAILURE: 0x0005000000000000 != 0x0002000000000000 at offset 0x0000000027ff5e40.
FAILURE: 0xfffaffffffffffff != 0xfffdffffffffffff at offset 0x0000000027ff5e48.
FAILURE: 0x0005000000000000 != 0x0002000000000000 at offset 0x0000000027ff5e50.
FAILURE: 0xfffaffffffffffff != 0xfffdffffffffffff at offset 0x0000000027ff5e58.
FAILURE: 0x0005000000000000 != 0x0002000000000000 at offset 0x0000000027ff5e60.
FAILURE: 0xfffaffffffffffff != 0xfffdffffffffffff at offset 0x0000000027ff5e68.
FAILURE: 0x0005000000000000 != 0x0002000000000000 at offset 0x0000000027ff5e70.
FAILURE: 0xfffaffffffffffff != 0xfffdffffffffffff at offset 0x0000000027ff5e78.