GCC 12 miscompilation of volatile asm (was: Re: [PATCH] arm64/io: Remind compiler that there is a memory side effect)

From: Mark Rutland
Date: Tue Apr 05 2022 - 19:19:05 EST


Hi all,

[adding kernel folk who work on asm stuff]

As a heads-up, GCC 12 (not yet released) appears to erroneously optimize away
calls to functions with volatile asm. Szabolcs has raised an issue on the GCC
bugzilla:

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105160

... which is a P1 release blocker, and is currently being investigated.

Jeremy originally reported this as an issue with {readl,writel}_relaxed(), but
the underlying problem doesn't have anything to do with those specifically.

I'm dumping a bunch of info here largely for posterity / archival, and to find
out who (from the kernel side) is willing and able to test proposed compiler
fixes, once those are available.

I'm happy to do so for aarch64; Peter, I assume you'd be happy to look at the
x86 side?

This is a generic issue rather than an architecture-specific one, so I wrote
test cases for both aarch64 and x86_64. Those are inline later in this mail,
and currently you can see them on compiler explorer:

aarch64: https://godbolt.org/z/vMczqjYvs

x86_64: https://godbolt.org/z/cveff9hq5



My aarch64 test case is:

| #define sysreg_read(regname) \
| ({ \
| unsigned long __sr_val; \
| asm volatile( \
| "mrs %0, " #regname "\n" \
| : "=r" (__sr_val)); \
| \
| __sr_val; \
| })
|
| #define sysreg_write(regname, __sw_val) \
| do { \
| asm volatile( \
| "msr " #regname ", %0\n" \
| : \
| : "r" (__sw_val)); \
| } while (0)
|
| #define isb() \
| do { \
| asm volatile( \
| "isb" \
| : \
| : \
| : "memory"); \
| } while (0)
|
| static unsigned long sctlr_read(void)
| {
| return sysreg_read(sctlr_el1);
| }
|
| static void sctlr_write(unsigned long val)
| {
| sysreg_write(sctlr_el1, val);
| }
|
| static void sctlr_rmw(void)
| {
| unsigned long val;
|
| val = sctlr_read();
| val |= 1UL << 7;
| sctlr_write(val);
| }
|
| void sctlr_read_multiple(void)
| {
| sctlr_read();
| sctlr_read();
| sctlr_read();
| sctlr_read();
| }
|
| void sctlr_write_multiple(void)
| {
| sctlr_write(0);
| sctlr_write(0);
| sctlr_write(0);
| sctlr_write(0);
| sctlr_write(0);
| }
|
| void sctlr_rmw_multiple(void)
| {
| sctlr_rmw();
| sctlr_rmw();
| sctlr_rmw();
| sctlr_rmw();
| }
|
| void function(void)
| {
| sctlr_read_multiple();
| sctlr_write_multiple();
| sctlr_rmw_multiple();
|
| isb();
| }

Per compiler explorer (https://godbolt.org/z/vMczqjYvs) GCC trunk currently
compiles this as:

| sctlr_rmw:
| mrs x0, sctlr_el1
| orr x0, x0, 128
| msr sctlr_el1, x0
| ret
| sctlr_read_multiple:
| mrs x0, sctlr_el1
| mrs x0, sctlr_el1
| mrs x0, sctlr_el1
| mrs x0, sctlr_el1
| ret
| sctlr_write_multiple:
| mov x0, 0
| msr sctlr_el1, x0
| msr sctlr_el1, x0
| msr sctlr_el1, x0
| msr sctlr_el1, x0
| msr sctlr_el1, x0
| ret
| sctlr_rmw_multiple:
| ret
| function:
| isb
| ret

Whereas GCC 11.2 compiles this as:

| sctlr_rmw:
| mrs x0, sctlr_el1
| orr x0, x0, 128
| msr sctlr_el1, x0
| ret
| sctlr_read_multiple:
| mrs x0, sctlr_el1
| mrs x0, sctlr_el1
| mrs x0, sctlr_el1
| mrs x0, sctlr_el1
| ret
| sctlr_write_multiple:
| mov x0, 0
| msr sctlr_el1, x0
| msr sctlr_el1, x0
| msr sctlr_el1, x0
| msr sctlr_el1, x0
| msr sctlr_el1, x0
| ret
| sctlr_rmw_multiple:
| stp x29, x30, [sp, -16]!
| mov x29, sp
| bl sctlr_rmw
| bl sctlr_rmw
| bl sctlr_rmw
| bl sctlr_rmw
| ldp x29, x30, [sp], 16
| ret
| function:
| stp x29, x30, [sp, -16]!
| mov x29, sp
| bl sctlr_read_multiple
| bl sctlr_write_multiple
| bl sctlr_rmw_multiple
| isb
| ldp x29, x30, [sp], 16
| ret



My x86_64 test case is:

| unsigned long rdmsr(unsigned long reg)
| {
| unsigned int lo, hi;
|
| asm volatile(
| "rdmsr"
| : "=d" (hi), "=a" (lo)
| : "c" (reg)
| );
|
| return ((unsigned long)hi << 32) | lo;
| }
|
| void wrmsr(unsigned long reg, unsigned long val)
| {
| unsigned int lo = val;
| unsigned int hi = val >> 32;
|
| asm volatile(
| "wrmsr"
| :
| : "d" (hi), "a" (lo), "c" (reg)
| );
| }
|
| void msr_rmw_set_bits(unsigned long reg, unsigned long bits)
| {
| unsigned long val;
|
| val = rdmsr(reg);
| val |= bits;
| wrmsr(reg, val);
| }
|
| void func_with_msr_side_effects(unsigned long reg)
| {
| msr_rmw_set_bits(reg, 1UL << 0);
| msr_rmw_set_bits(reg, 1UL << 1);
| msr_rmw_set_bits(reg, 1UL << 2);
| msr_rmw_set_bits(reg, 1UL << 3);
| }

Per compiler explorer (https://godbolt.org/z/cveff9hq5) GCC trunk currently
compiles this as:

| msr_rmw_set_bits:
| mov rcx, rdi
| rdmsr
| sal rdx, 32
| mov eax, eax
| or rax, rsi
| or rax, rdx
| mov rdx, rax
| shr rdx, 32
| wrmsr
| ret
| func_with_msr_side_effects:
| ret

While GCC 11.2 compiles that as:

| msr_rmw_set_bits:
| mov rcx, rdi
| rdmsr
| sal rdx, 32
| mov eax, eax
| or rax, rsi
| or rax, rdx
| mov rdx, rax
| shr rdx, 32
| wrmsr
| ret
| func_with_msr_side_effects:
| push rbp
| push rbx
| mov rbx, rdi
| mov rbp, rsi
| call msr_rmw_set_bits
| mov rsi, rbp
| mov rdi, rbx
| call msr_rmw_set_bits
| mov rsi, rbp
| mov rdi, rbx
| call msr_rmw_set_bits
| mov rsi, rbp
| mov rdi, rbx
| call msr_rmw_set_bits
| pop rbx
| pop rbp
| ret

Thanks,
Mark.