Re: [PATCH v2] PCI/MSI: Avoid torn updates to MSI pairs

From: Thomas Gleixner
Date: Thu Jan 23 2020 - 13:17:02 EST


Evan,

Thomas Gleixner <tglx@xxxxxxxxxxxxx> writes:
> This is not yet debugged fully and as this is happening on MSI-X I'm not
> really convinced yet that your 'torn write' theory holds.

can you please apply the debug patch below and run your test. When the
failure happens, stop the tracer and collect the trace.

Another question. Did you ever try to change the affinity of that
interrupt without hotplug rapidly while the device makes traffic? If
not, it would be interesting whether this leads to a failure as well.

Thanks

tglx

8<---------------

--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -964,6 +964,8 @@ void irq_force_complete_move(struct irq_
if (!vector)
goto unlock;

+ trace_printk("IRQ %u vector %u irq inprogress %u\n", vector,
+ irqd->irq, apicd->move_in_progress);
/*
* This is tricky. If the cleanup of the old vector has not been
* done yet, then the following setaffinity call will fail with
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -244,6 +244,8 @@ u64 arch_irq_stat(void)

desc = __this_cpu_read(vector_irq[vector]);
if (likely(!IS_ERR_OR_NULL(desc))) {
+ trace_printk("Handle vector %u IRQ %u\n", vector,
+ desc->irq_data.irq);
if (IS_ENABLED(CONFIG_X86_32))
handle_irq(desc, regs);
else
@@ -252,10 +254,18 @@ u64 arch_irq_stat(void)
ack_APIC_irq();

if (desc == VECTOR_UNUSED) {
+ trace_printk("Handle unused vector %u\n", vector);
pr_emerg_ratelimited("%s: %d.%d No irq handler for vector\n",
__func__, smp_processor_id(),
vector);
} else {
+ if (desc == VECTOR_SHUTDOWN) {
+ trace_printk("Handle shutdown vector %u\n",
+ vector);
+ } else if (desc == VECTOR_RETRIGGERED) {
+ trace_printk("Handle retriggered vector %u\n",
+ vector);
+ }
__this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
}
}
@@ -373,9 +383,14 @@ void fixup_irqs(void)
if (IS_ERR_OR_NULL(__this_cpu_read(vector_irq[vector])))
continue;

+ desc = __this_cpu_read(vector_irq[vector]);
+ trace_printk("FIXUP: %u\n", desc->irq_data.irq);
+
irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
if (irr & (1 << (vector % 32))) {
desc = __this_cpu_read(vector_irq[vector]);
+ trace_printk("FIXUP: %u IRR pending\n",
+ desc->irq_data.irq);

raw_spin_lock(&desc->lock);
data = irq_desc_get_irq_data(desc);
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -122,6 +122,10 @@ static bool migrate_one_irq(struct irq_d
affinity = cpu_online_mask;
brokeaff = true;
}
+
+ trace_printk("IRQ: %d maskchip %d wasmasked %d break %d\n",
+ d->irq, maskchip, irqd_irq_masked(d), brokeaff);
+
/*
* Do not set the force argument of irq_do_set_affinity() as this
* disables the masking of offline CPUs from the supplied affinity