Re: [PATCH 1/3] x86/mm: Fix potential overflow in user_pcid_flush_mask
From: Dave Hansen
Date: Mon Jun 02 2025 - 12:55:35 EST
On 6/2/25 06:30, Rik van Riel wrote:
> Currently no system with AMD INVLPGB support requires the page table
> isolation mitigation. However, people could still enable PTI manually,
> or a vulnerability could be found in the future that makes PTI useful
> on certain AMD CPUs.
>
> Additionally, there are systems that support Intel RAR TLB invalidation,
> where PTI is a useful mitigation.
Let's just leave this mention of RAR out for now.
> The combination of PTI and broadcast TLB flush has a problem:
> - invalidate_user_asid() sets a bit corresponding to the process PCID in user_pcid_flush_mask
> - SWITCH_TO_USER_CR3 tests and clears a bit corresponding to the process PCID in user_pcid_flush_mask
The other bit of background here is that there are currently only 6
PCIDs that might need to be flushed in this way (TLB_NR_DYN_ASIDS).
There are obviously more than 6 bits in an unsigned long, so this is all
fine.
But the INVLPGB support vastly expanded the number of ASIDs that might
be used.
> diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
> index e9b81876ebe4..cc9935bbbd45 100644
> --- a/arch/x86/include/asm/tlbflush.h
> +++ b/arch/x86/include/asm/tlbflush.h
> @@ -23,6 +23,31 @@ void __flush_tlb_all(void);
> #define TLB_FLUSH_ALL -1UL
> #define TLB_GENERATION_INVALID 0
>
> +/*
> + * When enabled, MITIGATION_PAGE_TABLE_ISOLATION consumes a single bit for
> + * user/kernel switches
> + */
> +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
> +# define PTI_CONSUMED_PCID_BITS 1
> +#else
> +# define PTI_CONSUMED_PCID_BITS 0
> +#endif
> +
> +#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS)
> +
> +/*
> + * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account
> + * for them being zero-based. Another -1 is because PCID 0 is reserved for
> + * use by non-PCID-aware users.
> + */
> +#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2)
> +
> +#ifdef CONFIG_BROADCAST_TLB_FLUSH
> +# define CR3_AVAIL_PCID_LONGS ((1 << CR3_AVAIL_PCID_BITS) / BITS_PER_LONG)
> +#else
> +# define CR3_AVAIL_PCID_LONGS 1
> +#endif
I wonder if we can make this easier to understand. Is there something
preventing us from using good old DECLARE_BITMAP()?
DECLARE_BITMAP(user_pcid_flush_mask, MAX_KERN_ASID);
#ifdef CONFIG_BROADCAST_TLB_FLUSH
# define MAX_KERN_PCID (1 << CR3_AVAIL_PCID_BITS - 1)
#else
/* PCID 0 is reserved. Dynamic asids 0->5 map to PCIDs 1->6 */
# define MAX_KERN_PCID (TLB_NR_DYN_ASIDS + 1)
#endif
The TLB_NR_DYN_ASIDS portion of this could even be a preparatory patch
before the CONFIG_BROADCAST_TLB_FLUSH gets added. That might also make
the whole thing more clear.
> void cr4_update_irqsoff(unsigned long set, unsigned long clear);
> unsigned long cr4_read_shadow(void);
>
> @@ -115,14 +140,6 @@ struct tlb_state {
> */
> u8 lam;
> #endif
> -
> - /*
> - * Mask that contains TLB_NR_DYN_ASIDS+1 bits to indicate
> - * the corresponding user PCID needs a flush next time we
> - * switch to it; see SWITCH_TO_USER_CR3.
> - */
> - unsigned short user_pcid_flush_mask;
> -
> /*
> * Access to this CR4 shadow and to H/W CR4 is protected by
> * disabling interrupts when modifying either one.
> @@ -149,6 +166,15 @@ struct tlb_state {
> * context 0.
> */
> struct tlb_context ctxs[TLB_NR_DYN_ASIDS];
> +
> +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
> + /*
> + * Mask that contains TLB_NR_DYN_ASIDS+1 bits to indicate
> + * the corresponding user PCID needs a flush next time we
> + * switch to it; see SWITCH_TO_USER_CR3.
> + */
> + unsigned long user_pcid_flush_mask[CR3_AVAIL_PCID_LONGS];
> +#endif
> };
> DECLARE_PER_CPU_ALIGNED(struct tlb_state, cpu_tlbstate);
This adds an #ifdef. I guess it makes sense to do it for the now larger
user_pcid_flush_mask[] while it didn't for a single long. But that's
another logically separate bit that adds complexity to reading this
whole mess.
Honestly, I'd just leave this out for the bug fix. If someone really
cares, we can come back and fix it up in mainline.
> diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
> index 6259b474073b..8c41a2e5a53e 100644
> --- a/arch/x86/kernel/asm-offsets.c
> +++ b/arch/x86/kernel/asm-offsets.c
> @@ -103,8 +103,10 @@ static void __used common(void)
> BLANK();
> DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
>
> +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
> /* TLB state for the entry code */
> OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask);
> +#endif
Because it necessitates this hunk too...
> /* Layout info for cpu_entry_area */
> OFFSET(CPU_ENTRY_AREA_entry_stack, cpu_entry_area, entry_stack_page);
> diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
> index 39f80111e6f1..f5761e8be77f 100644
> --- a/arch/x86/mm/tlb.c
> +++ b/arch/x86/mm/tlb.c
> @@ -90,25 +90,6 @@
> *
> */
>
> -/*
> - * When enabled, MITIGATION_PAGE_TABLE_ISOLATION consumes a single bit for
> - * user/kernel switches
> - */
> -#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
> -# define PTI_CONSUMED_PCID_BITS 1
> -#else
> -# define PTI_CONSUMED_PCID_BITS 0
> -#endif
> -
> -#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS)
> -
> -/*
> - * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account
> - * for them being zero-based. Another -1 is because PCID 0 is reserved for
> - * use by non-PCID-aware users.
> - */
> -#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2)
> -
> /*
> * Given @asid, compute kPCID
> */
> @@ -543,10 +524,7 @@ static void broadcast_tlb_flush(struct flush_tlb_info *info)
> */
> static inline void invalidate_user_asid(u16 asid)
> {
> - /* There is no user ASID if address space separation is off */
> - if (!IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION))
> - return;
> -
> +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
> /*
> * We only have a single ASID if PCID is off and the CR3
> * write will have flushed it.
> @@ -557,8 +535,8 @@ static inline void invalidate_user_asid(u16 asid)
> if (!static_cpu_has(X86_FEATURE_PTI))
> return;
>
> - __set_bit(kern_pcid(asid),
> - (unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask));
> + __set_bit(kern_pcid(asid), this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask[0]));
> +#endif
> }
... and this one.