Re: [patch 00/38] x86/retbleed: Call depth tracking mitigation

From: Thomas Gleixner
Date: Mon Jul 18 2022 - 15:33:12 EST


On Mon, Jul 18 2022 at 21:29, Thomas Gleixner wrote:
>> The implementation falls back to the allocated thunks when padding is not
>> available. I'll send out the GCC patch and the required kernel patch as a
>> reply to this series after polishing it a bit.
>
> Here it goes. GCC hackery first.

And the kernel counterpart.

---
Subject: x86/callthunks: Put thunks into compiler provided padding area
From: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Date: Fri, 15 Jul 2022 16:12:47 +0200

- NOT FOR INCLUSION -

Let the compiler add 16 bytes of padding in front of each function entry
point and put the call depth accounting there. That avoids calling out
into the module area and reduces ITLB pressure.

Not-Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
---
arch/x86/Kconfig | 14 ++++++
arch/x86/Makefile | 4 +
arch/x86/kernel/callthunks.c | 99 ++++++++++++++++++++++++++++++++++++++++++-
3 files changed, 115 insertions(+), 2 deletions(-)

--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2440,6 +2440,9 @@ config CC_HAS_SLS
config CC_HAS_RETURN_THUNK
def_bool $(cc-option,-mfunction-return=thunk-extern)

+config CC_HAS_PADDING
+ def_bool $(cc-option,-mforce-function-padding)
+
config HAVE_CALL_THUNKS
def_bool y
depends on RETHUNK && OBJTOOL
@@ -2512,6 +2515,17 @@ config CALL_DEPTH_TRACKING
of this option is marginal as the call depth tracking is using
run-time generated call thunks and call patching.

+config CALL_THUNKS_IN_PADDING
+ bool "Put call depth into padding area before function"
+ depends on CALL_DEPTH_TRACKING && CC_HAS_PADDING
+ default n
+ help
+ Put the call depth accounting into a padding area before the
+ function entry. This padding area is generated by the
+ compiler. This increases text size by ~5%. For non affected
+ systems this space is unused. On affected SKL systems this
+ results in a significant performance gain.
+
config CALL_THUNKS_DEBUG
bool "Enable call thunks and call depth tracking debugging"
depends on CALL_DEPTH_TRACKING
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -197,6 +197,10 @@ ifdef CONFIG_SLS
KBUILD_CFLAGS += -mharden-sls=all
endif

+ifdef CONFIG_CALL_THUNKS_IN_PADDING
+ KBUILD_CFLAGS += -mforce-function-padding
+endif
+
KBUILD_LDFLAGS += -m elf_$(UTS_MACHINE)

ifdef CONFIG_LTO_CLANG
--- a/arch/x86/kernel/callthunks.c
+++ b/arch/x86/kernel/callthunks.c
@@ -92,6 +92,7 @@ struct thunk_mem {

struct thunk_mem_area {
struct thunk_mem *tmem;
+ unsigned long *dests;
unsigned long start;
unsigned long nthunks;
};
@@ -181,6 +182,16 @@ static __init_or_module void callthunk_f
tmem->base + area->start * callthunk_desc.thunk_size,
area->start, area->nthunks);

+ /* Remove thunks in the padding area */
+ for (i = 0; area->dests && i < area->nthunks; i++) {
+ void *dest = (void *)area->dests[i];
+
+ if (!dest)
+ continue;
+ pr_info("Remove %px at index %u\n", dest, i);
+ btree_remove64(&call_thunks, (unsigned long)dest);
+ }
+
/* Jump starts right after the template */
thunk = tmem->base + area->start * callthunk_desc.thunk_size;
tp = thunk + callthunk_desc.template_size;
@@ -204,6 +215,7 @@ static __init_or_module void callthunk_f
size = area->nthunks * callthunk_desc.thunk_size;
text_poke_set_locked(thunk, 0xcc, size);
}
+ vfree(area->dests);
kfree(area);
}

@@ -289,7 +301,8 @@ patch_paravirt_call_sites(struct paravir
patch_call(p->instr, layout);
}

-static struct thunk_mem_area *callthunks_alloc(unsigned int nthunks)
+static struct thunk_mem_area *callthunks_alloc(unsigned int nthunks,
+ bool module)
{
struct thunk_mem_area *area;
unsigned int size, mapsize;
@@ -299,6 +312,13 @@ static struct thunk_mem_area *callthunks
if (!area)
return NULL;

+ if (module) {
+ area->dests = vzalloc(nthunks * sizeof(unsigned long));
+ if (!area->dests)
+ goto free_area;
+ pr_info("Allocated dests array: %px\n", area->dests);
+ }
+
list_for_each_entry(tmem, &thunk_mem_list, list) {
unsigned long start;

@@ -340,6 +360,7 @@ static struct thunk_mem_area *callthunks
free_tmem:
kfree(tmem);
free_area:
+ vfree(area->dests);
kfree(area);
return NULL;
}
@@ -372,6 +393,73 @@ static __init_or_module int callthunk_se
return 0;
}

+int setup_padding_thunks(s32 *start, s32 *end, struct thunk_mem_area *area,
+ struct module_layout *layout)
+{
+ int nthunks = 0, idx = 0;
+ s32 *s;
+
+ if (callthunk_desc.template_size > 16)
+ return 0;
+
+ for (s = start; s < end; s++) {
+ void *thunk, *tp, *dest = (void *)s + *s;
+ unsigned long key = (unsigned long)dest;
+ int fail, i;
+ u8 opcode;
+
+ if (is_inittext(layout, dest)) {
+ prdbg("Ignoring init dest: %pS %px\n", dest, dest);
+ return 0;
+ }
+
+ /* Multiple symbols can have the same location. */
+ if (btree_lookup64(&call_thunks, key)) {
+ prdbg("Ignoring duplicate dest: %pS %px\n", dest, dest);
+ continue;
+ }
+
+ thunk = tp = dest - 16;
+ prdbg("Probing dest: %pS %px at %px\n", dest, dest, tp);
+ pagefault_disable();
+ fail = 0;
+ for (i = 0; !fail && i < 16; i++) {
+ if (get_kernel_nofault(opcode, tp + i)) {
+ fail = 1;
+ } else if (opcode != 0xcc) {
+ fail = 2;
+ }
+ }
+ pagefault_enable();
+ switch (fail) {
+ case 1:
+ prdbg("Faulted for dest: %pS %px\n", dest, dest);
+ nthunks++;
+ continue;
+ case 2:
+ prdbg("No padding for dest: %pS %px\n", dest, dest);
+ nthunks++;
+ continue;
+ }
+
+ prdbg("Thunk for dest: %pS %px at %px\n", dest, dest, tp);
+ memcpy(tp, callthunk_desc.template, callthunk_desc.template_size);
+ tp += callthunk_desc.template_size;
+ memcpy(tp, x86_nops[6], 6);
+
+ if (area->dests) {
+ pr_info("Insert %px at index %d\n", dest, idx);
+ area->dests[idx++] = key;
+ }
+
+ fail = btree_insert64(&call_thunks, key, (void *)thunk, GFP_KERNEL);
+ if (fail)
+ return fail;
+ }
+ prdbg("%d external thunks required\n", nthunks);
+ return 0;
+}
+
static __init_or_module int callthunks_setup(struct callthunk_sites *cs,
struct module_layout *layout)
{
@@ -394,7 +482,7 @@ static __init_or_module int callthunks_s
if (!nthunks)
goto patch;

- area = callthunks_alloc(nthunks);
+ area = callthunks_alloc(nthunks, !!layout->mtn.mod);
if (!area)
return -ENOMEM;

@@ -420,6 +508,13 @@ static __init_or_module int callthunks_s
prdbg("Using thunk vbuf %px\n", vbuf);
}

+ if (IS_ENABLED(CONFIG_CALL_THUNKS_IN_PADDING)) {
+ ret = setup_padding_thunks(cs->syms_start, cs->syms_end,
+ area, layout);
+ if (ret < 0)
+ goto fail;
+ }
+
for (s = cs->syms_start; s < cs->syms_end; s++) {
void *dest = (void *)s + *s;