Re: x86: CR4 update when IRQs are enabled

From: Andy Lutomirski
Date: Wed Nov 15 2017 - 22:44:57 EST


I think Nadav is right here. IMO the right fix is to rename the
functions to cr4_set_bits_irqs_off() etc., add a warning (if lockdep
is on), and fix the callers.
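
Roughly like this, say (untested sketch; if lockdep has no ready-made
helper for this, an open-coded WARN_ON_ONCE(!irqs_disabled()) under
CONFIG_LOCKDEP would do):

static inline void cr4_set_bits_irqs_off(unsigned long mask)
{
	unsigned long cr4;

	/* Untested sketch: warn if a caller left interrupts enabled. */
#ifdef CONFIG_LOCKDEP
	WARN_ON_ONCE(!irqs_disabled());
#endif
	cr4 = this_cpu_read(cpu_tlbstate.cr4);
	if ((cr4 | mask) != cr4) {
		cr4 |= mask;
		this_cpu_write(cpu_tlbstate.cr4, cr4);
		__write_cr4(cr4);
	}
}

Callers that currently run with IRQs on would then grow an explicit
local_irq_save()/local_irq_restore() around the call, which makes the
requirement visible at the call site.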

On Wed, Nov 15, 2017 at 4:34 PM, Nadav Amit <nadav.amit@xxxxxxxxx> wrote:
> hpa@xxxxxxxxx wrote:
>
>> On November 15, 2017 3:31:50 PM PST, Nadav Amit <nadav.amit@xxxxxxxxx> wrote:
>>> Ping?
>>>
>>> Nadav Amit <nadav.amit@xxxxxxxxx> wrote:
>>>
>>>> CC'ing more people, and adding a patch to clarify...
>>>>
>>>> Nadav Amit <nadav.amit@xxxxxxxxx> wrote:
>>>>
>>>>> I am puzzled by the comment in tlb_state.cr4, which says:
>>>>>
>>>>> /*
>>>>> * Access to this CR4 shadow and to H/W CR4 is protected by
>>>>> * disabling interrupts when modifying either one.
>>>>> */
>>>>>
>>>>> This does not seem to be true, and adding a warning to CR4 writes when
>>>>> !irqs_disabled() immediately fires in identify_cpu() and
>>>>> __mcheck_cpu_init_generic(). While these two are called during boot, I think
>>>>> there are other CR4 changes with IRQs enabled, for example, PR_SET_TSC.
>>>>>
>>>>> So my question(s): Is the comment correct? Is the current behavior correct?
>>>>
>>>> So here is what I have in mind. I am not sure whether CONFIG_DEBUG_PREEMPT is
>>>> the right #ifdef. Let me know what you think.
>>>>
>>>> -- >8 --
>>>>
>>>> Subject: [PATCH] x86: disable IRQs before changing CR4
>>>>
>>>> CR4 changes need to be performed while IRQs are disabled in order to
>>>> update the CR4 shadow and the actual register atomically.
>>>>
>>>> Signed-off-by: Nadav Amit <namit@xxxxxxxxxx>
>>>> ---
>>>> arch/x86/include/asm/tlbflush.h      | 18 ++++++++++++------
>>>> arch/x86/kernel/cpu/common.c         | 13 ++++++++++++-
>>>> arch/x86/kernel/cpu/mcheck/mce.c     |  3 +++
>>>> arch/x86/kernel/cpu/mcheck/p5.c      |  4 ++++
>>>> arch/x86/kernel/cpu/mcheck/winchip.c |  3 +++
>>>> arch/x86/kernel/process.c            | 14 ++++++++++++--
>>>> 6 files changed, 46 insertions(+), 9 deletions(-)
>>>>
>>>> diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
>>>> index 50ea3482e1d1..bc70dd1cc7c6 100644
>>>> --- a/arch/x86/include/asm/tlbflush.h
>>>> +++ b/arch/x86/include/asm/tlbflush.h
>>>> @@ -89,6 +89,15 @@ static inline void cr4_init_shadow(void)
>>>> this_cpu_write(cpu_tlbstate.cr4, __read_cr4());
>>>> }
>>>>
>>>> +static inline void update_cr4(unsigned long cr4)
>>>> +{
>>>> +#ifdef CONFIG_DEBUG_PREEMPT
>>>> + WARN_ON_ONCE(!irqs_disabled());
>>>> +#endif
>>>> + this_cpu_write(cpu_tlbstate.cr4, cr4);
>>>> + __write_cr4(cr4);
>>>> +}
>>>> +
>>>> /* Set in this cpu's CR4. */
>>>> static inline void cr4_set_bits(unsigned long mask)
>>>> {
>>>> @@ -97,8 +106,7 @@ static inline void cr4_set_bits(unsigned long mask)
>>>> cr4 = this_cpu_read(cpu_tlbstate.cr4);
>>>> if ((cr4 | mask) != cr4) {
>>>> cr4 |= mask;
>>>> - this_cpu_write(cpu_tlbstate.cr4, cr4);
>>>> - __write_cr4(cr4);
>>>> + update_cr4(cr4);
>>>> }
>>>> }
>>>>
>>>> @@ -110,8 +118,7 @@ static inline void cr4_clear_bits(unsigned long mask)
>>>> cr4 = this_cpu_read(cpu_tlbstate.cr4);
>>>> if ((cr4 & ~mask) != cr4) {
>>>> cr4 &= ~mask;
>>>> - this_cpu_write(cpu_tlbstate.cr4, cr4);
>>>> - __write_cr4(cr4);
>>>> + update_cr4(cr4);
>>>> }
>>>> }
>>>>
>>>> @@ -121,8 +128,7 @@ static inline void cr4_toggle_bits(unsigned long mask)
>>>> cr4 = this_cpu_read(cpu_tlbstate.cr4);
>>>> cr4 ^= mask;
>>>> - this_cpu_write(cpu_tlbstate.cr4, cr4);
>>>> - __write_cr4(cr4);
>>>> + update_cr4(cr4);
>>>> }
>>>>
>>>> /* Read the CR4 shadow. */
>>>> diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
>>>> index c8b39870f33e..82e6b41fd5e9 100644
>>>> --- a/arch/x86/kernel/cpu/common.c
>>>> +++ b/arch/x86/kernel/cpu/common.c
>>>> @@ -318,6 +318,8 @@ static bool pku_disabled;
>>>>
>>>> static __always_inline void setup_pku(struct cpuinfo_x86 *c)
>>>> {
>>>> + unsigned long flags;
>>>> +
>>>> /* check the boot processor, plus compile options for PKU: */
>>>> if (!cpu_feature_enabled(X86_FEATURE_PKU))
>>>> return;
>>>> @@ -327,7 +329,10 @@ static __always_inline void setup_pku(struct cpuinfo_x86 *c)
>>>> if (pku_disabled)
>>>> return;
>>>>
>>>> + local_irq_save(flags);
>>>> cr4_set_bits(X86_CR4_PKE);
>>>> + local_irq_restore(flags);
>>>> +
>>>> /*
>>>> * Seting X86_CR4_PKE will cause the X86_FEATURE_OSPKE
>>>> * cpuid bit to be set. We need to ensure that we
>>>> @@ -1069,6 +1074,7 @@ static void validate_apic_and_package_id(struct cpuinfo_x86 *c)
>>>> */
>>>> static void identify_cpu(struct cpuinfo_x86 *c)
>>>> {
>>>> + unsigned long flags;
>>>> int i;
>>>>
>>>> c->loops_per_jiffy = loops_per_jiffy;
>>>> @@ -1121,9 +1127,14 @@ static void identify_cpu(struct cpuinfo_x86 *c)
>>>> /* Disable the PN if appropriate */
>>>> squash_the_stupid_serial_number(c);
>>>>
>>>> - /* Set up SMEP/SMAP */
>>>> + /*
>>>> + * Set up SMEP/SMAP. Disable interrupts to prevent triggering a warning,
>>>> + * as CR4 changes must be done with interrupts disabled.
>>>> + */
>>>> + local_irq_save(flags);
>>>> setup_smep(c);
>>>> setup_smap(c);
>>>> + local_irq_restore(flags);
>>>>
>>>> /*
>>>> * The vendor-specific functions might have changed features.
>>>> diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
>>>> index 3b413065c613..497c07e33c25 100644
>>>> --- a/arch/x86/kernel/cpu/mcheck/mce.c
>>>> +++ b/arch/x86/kernel/cpu/mcheck/mce.c
>>>> @@ -1508,6 +1508,7 @@ static void __mcheck_cpu_init_generic(void)
>>>> {
>>>> enum mcp_flags m_fl = 0;
>>>> mce_banks_t all_banks;
>>>> + unsigned long flags;
>>>> u64 cap;
>>>>
>>>> if (!mca_cfg.bootlog)
>>>> @@ -1519,7 +1520,9 @@ static void __mcheck_cpu_init_generic(void)
>>>> bitmap_fill(all_banks, MAX_NR_BANKS);
>>>> machine_check_poll(MCP_UC | m_fl, &all_banks);
>>>>
>>>> + local_irq_save(flags);
>>>> cr4_set_bits(X86_CR4_MCE);
>>>> + local_irq_restore(flags);
>>>>
>>>> rdmsrl(MSR_IA32_MCG_CAP, cap);
>>>> if (cap & MCG_CTL_P)
>>>> diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
>>>> index 2a0717bf8033..d5d4963415e9 100644
>>>> --- a/arch/x86/kernel/cpu/mcheck/p5.c
>>>> +++ b/arch/x86/kernel/cpu/mcheck/p5.c
>>>> @@ -42,6 +42,7 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code)
>>>> /* Set up machine check reporting for processors with Intel style MCE: */
>>>> void intel_p5_mcheck_init(struct cpuinfo_x86 *c)
>>>> {
>>>> + unsigned long flags;
>>>> u32 l, h;
>>>>
>>>> /* Default P5 to off as its often misconnected: */
>>>> @@ -62,7 +63,10 @@ void intel_p5_mcheck_init(struct cpuinfo_x86 *c)
>>>> pr_info("Intel old style machine check architecture supported.\n");
>>>>
>>>> /* Enable MCE: */
>>>> + local_irq_save(flags);
>>>> cr4_set_bits(X86_CR4_MCE);
>>>> + local_irq_restore(flags);
>>>> +
>>>> pr_info("Intel old style machine check reporting enabled on
>>> CPU#%d.\n",
>>>> smp_processor_id());
>>>> }
>>>> diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
>>>> index c6a722e1d011..6dd985e3849d 100644
>>>> --- a/arch/x86/kernel/cpu/mcheck/winchip.c
>>>> +++ b/arch/x86/kernel/cpu/mcheck/winchip.c
>>>> @@ -26,6 +26,7 @@ static void winchip_machine_check(struct pt_regs *regs, long error_code)
>>>> /* Set up machine check reporting on the Winchip C6 series */
>>>> void winchip_mcheck_init(struct cpuinfo_x86 *c)
>>>> {
>>>> + unsigned long flags;
>>>> u32 lo, hi;
>>>>
>>>> machine_check_vector = winchip_machine_check;
>>>> @@ -37,7 +38,9 @@ void winchip_mcheck_init(struct cpuinfo_x86 *c)
>>>> lo &= ~(1<<4); /* Enable MCE */
>>>> wrmsr(MSR_IDT_FCR1, lo, hi);
>>>>
>>>> + local_irq_save(flags);
>>>> cr4_set_bits(X86_CR4_MCE);
>>>> + local_irq_restore(flags);
>>>>
>>>> pr_info("Winchip machine check reporting enabled on CPU#0.\n");
>>>> }
>>>> diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
>>>> index 3ca198080ea9..09e44e784745 100644
>>>> --- a/arch/x86/kernel/process.c
>>>> +++ b/arch/x86/kernel/process.c
>>>> @@ -127,25 +127,35 @@ void flush_thread(void)
>>>>
>>>> void disable_TSC(void)
>>>> {
>>>> + unsigned long flags;
>>>> +
>>>> preempt_disable();
>>>> - if (!test_and_set_thread_flag(TIF_NOTSC))
>>>> + if (!test_and_set_thread_flag(TIF_NOTSC)) {
>>>> /*
>>>> * Must flip the CPU state synchronously with
>>>> * TIF_NOTSC in the current running context.
>>>> */
>>>> + local_irq_save(flags);
>>>> cr4_set_bits(X86_CR4_TSD);
>>>> + local_irq_restore(flags);
>>>> + }
>>>> preempt_enable();
>>>> }
>>>>
>>>> static void enable_TSC(void)
>>>> {
>>>> + unsigned long flags;
>>>> +
>>>> preempt_disable();
>>>> - if (test_and_clear_thread_flag(TIF_NOTSC))
>>>> + if (test_and_clear_thread_flag(TIF_NOTSC)) {
>>>> /*
>>>> * Must flip the CPU state synchronously with
>>>> * TIF_NOTSC in the current running context.
>>>> */
>>>> + local_irq_save(flags);
>>>> cr4_clear_bits(X86_CR4_TSD);
>>>> + local_irq_restore(flags);
>>>> + }
>>>> preempt_enable();
>>>> }
>>
>> This is wrong on at least two levels: first of all, this should not be wrapped around the abstracted operations but put inside them if relevant. Second, I suspect that this is not a requirement at all, but rather that as long as the hardware register is written second, we should always be safe.
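>>
>> For the first point, if the IRQ disabling is needed at all, a rough
>> untested sketch of what I mean:
>>
>> static inline void cr4_set_bits(unsigned long mask)
>> {
>> 	unsigned long cr4, flags;
>>
>> 	/* Keep the IRQ disabling inside the primitive instead of
>> 	 * open-coding it at every call site. */
>> 	local_irq_save(flags);
>> 	cr4 = this_cpu_read(cpu_tlbstate.cr4);
>> 	if ((cr4 | mask) != cr4) {
>> 		cr4 |= mask;
>> 		this_cpu_write(cpu_tlbstate.cr4, cr4);
>> 		__write_cr4(cr4);
>> 	}
>> 	local_irq_restore(flags);
>> }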
>
> Thanks for your reply.
>
> Can you please explain your suspicion? Here is a simple execution that may
> fail (assume cr4 is initially zero):
>
> CPU0
> ====
> cr4_set_bits(1)
> => cpu_tlbstate.cr4 = 1
> => interrupt
>            cr4_set_bits(2)
>            cpu_tlbstate.cr4 = 3
>            __write_cr4(3)
> => __write_cr4(1) [ and should have been 3 ]
>
> Am I missing anything?
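>
> In code terms, the window is inside the current cr4_set_bits() (an
> annotated copy, for illustration):
>
> 	cr4 = this_cpu_read(cpu_tlbstate.cr4);  /* reads 0 */
> 	if ((cr4 | mask) != cr4) {
> 		cr4 |= mask;                            /* cr4 = 1 */
> 		this_cpu_write(cpu_tlbstate.cr4, cr4);  /* shadow = 1 */
> 		/* <-- interrupt here: the handler sets shadow = 3, CR4 = 3 */
> 		__write_cr4(cr4);                       /* writes back stale 1 */
> 	}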
>
> Now, this may be only a theoretical issue at the moment, since I did not
> find any CR4 bit change that actually happens in an interrupt handler.
>
> Anyhow, if you want me to submit a fix, please let me know what other levels
> of wrongness you had in mind.
>
> Regards,
> Nadav
>