Re: [PROBLEM] Frequently get "irq 31: nobody cared" when passing through 2x GPUs that share same pci switch via vfio

From: Matthew Ruffell
Date: Wed Nov 24 2021 - 00:52:36 EST


Hi Alex,

I have forward ported your patch to 5.16-rc2 to account for the vfio module
refactor that happened recently. Attached below.

Have you had an opportunity to research if it is possible to conditionalise
clearing DisINTx by looking at the interrupt status and seeing if there is a
pending interrupt but no handler set?

We are testing a 5.16-rc2 kernel with the patch applied on Nathan's server
currently, and we are also trying out the pci=clearmsi command line parameter
that was discussed on linux-pci a few years ago in [1][2][3][4] along with
setting snd-hda-intel.enable_msi=1 to see if it helps the crashkernel not get
stuck copying IR tables.

[1] https://marc.info/?l=linux-pci&m=153988799707413
[2] https://lore.kernel.org/linux-pci/20181018183721.27467-1-gpiccoli@xxxxxxxxxxxxx/
[3] https://lore.kernel.org/linux-pci/20181018183721.27467-2-gpiccoli@xxxxxxxxxxxxx/
[4] https://lore.kernel.org/linux-pci/20181018183721.27467-3-gpiccoli@xxxxxxxxxxxxx/

I will let you know how we get on.

Thanks,
Matthew

diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index f948e6cd2993..cbca207ddc45 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -276,6 +276,7 @@ int vfio_pci_core_enable(struct vfio_pci_core_device *vdev)
vdev->pci_2_3 = pci_intx_mask_supported(pdev);
}

+ vfio_intx_stub_init(vdev);
pci_read_config_word(pdev, PCI_COMMAND, &cmd);
if (vdev->pci_2_3 && (cmd & PCI_COMMAND_INTX_DISABLE)) {
cmd &= ~PCI_COMMAND_INTX_DISABLE;
@@ -365,6 +366,14 @@ void vfio_pci_core_disable(struct
vfio_pci_core_device *vdev)
kfree(dummy_res);
}

+ /*
+ * Set known command register state, disabling MSI/X (via busmaster)
+ * and INTx directly. At this point we can teardown the INTx stub
+ * handler initialized from the SET_IRQS teardown above.
+ */
+ pci_write_config_word(pdev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE);
+ vfio_intx_stub_exit(vdev);
+
vdev->needs_reset = true;

/*
@@ -382,12 +391,6 @@ void vfio_pci_core_disable(struct
vfio_pci_core_device *vdev)
pci_save_state(pdev);
}

- /*
- * Disable INTx and MSI, presumably to avoid spurious interrupts
- * during reset. Stolen from pci_reset_function()
- */
- pci_write_config_word(pdev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE);
-
/*
* Try to get the locks ourselves to prevent a deadlock. The
* success of this is dependent on being able to lock the device,
diff --git a/drivers/vfio/pci/vfio_pci_intrs.c
b/drivers/vfio/pci/vfio_pci_intrs.c
index 6069a11fb51a..98cf528aa175 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -139,6 +139,44 @@ static irqreturn_t vfio_intx_handler(int irq, void *dev_id)
return ret;
}

+static irqreturn_t vfio_intx_stub(int irq, void *dev_id)
+{
+ struct vfio_pci_core_device *vdev = dev_id;
+
+ if (pci_check_and_mask_intx(vdev->pdev))
+ return IRQ_HANDLED;
+
+ return IRQ_NONE;
+}
+
+void vfio_intx_stub_init(struct vfio_pci_core_device *vdev)
+{
+ char *name;
+
+ if (vdev->nointx || !vdev->pci_2_3 || !vdev->pdev->irq)
+ return;
+
+ name = kasprintf(GFP_KERNEL, "vfio-intx-stub(%s)",
+ pci_name(vdev->pdev));
+ if (!name)
+ return;
+
+ if (request_irq(vdev->pdev->irq, vfio_intx_stub,
+ IRQF_SHARED, name, vdev))
+ kfree(name);
+
+ vdev->intx_stub = true;
+}
+
+void vfio_intx_stub_exit(struct vfio_pci_core_device *vdev)
+{
+ if (!vdev->intx_stub)
+ return;
+
+ kfree(free_irq(vdev->pdev->irq, vdev));
+ vdev->intx_stub = false;
+}
+
static int vfio_intx_enable(struct vfio_pci_core_device *vdev)
{
if (!is_irq_none(vdev))
@@ -153,6 +191,8 @@ static int vfio_intx_enable(struct
vfio_pci_core_device *vdev)

vdev->num_ctx = 1;

+ vfio_intx_stub_exit(vdev);
+
/*
* If the virtual interrupt is masked, restore it. Devices
* supporting DisINTx can be masked at the hardware level
@@ -231,6 +271,7 @@ static void vfio_intx_disable(struct
vfio_pci_core_device *vdev)
vdev->irq_type = VFIO_PCI_NUM_IRQS;
vdev->num_ctx = 0;
kfree(vdev->ctx);
+ vfio_intx_stub_init(vdev);
}

/*
@@ -258,6 +299,8 @@ static int vfio_msi_enable(struct
vfio_pci_core_device *vdev, int nvec, bool msi
if (!vdev->ctx)
return -ENOMEM;

+ vfio_intx_stub_exit(vdev);
+
/* return the number of supported vectors if we can't get all: */
cmd = vfio_pci_memory_lock_and_enable(vdev);
ret = pci_alloc_irq_vectors(pdev, 1, nvec, flag);
@@ -266,6 +309,7 @@ static int vfio_msi_enable(struct
vfio_pci_core_device *vdev, int nvec, bool msi
pci_free_irq_vectors(pdev);
vfio_pci_memory_unlock_and_restore(vdev, cmd);
kfree(vdev->ctx);
+ vfio_intx_stub_init(vdev);
return ret;
}
vfio_pci_memory_unlock_and_restore(vdev, cmd);
@@ -388,6 +432,7 @@ static int vfio_msi_set_block(struct
vfio_pci_core_device *vdev, unsigned start,
static void vfio_msi_disable(struct vfio_pci_core_device *vdev, bool msix)
{
struct pci_dev *pdev = vdev->pdev;
+ pci_dev_flags_t dev_flags = pdev->dev_flags;
int i;
u16 cmd;

@@ -399,19 +444,22 @@ static void vfio_msi_disable(struct
vfio_pci_core_device *vdev, bool msix)
vfio_msi_set_block(vdev, 0, vdev->num_ctx, NULL, msix);

cmd = vfio_pci_memory_lock_and_enable(vdev);
- pci_free_irq_vectors(pdev);
- vfio_pci_memory_unlock_and_restore(vdev, cmd);

/*
- * Both disable paths above use pci_intx_for_msi() to clear DisINTx
- * via their shutdown paths. Restore for NoINTx devices.
+ * XXX pci_intx_for_msi() will clear DisINTx, which can trigger an
+ * INTx storm even before we return from pci_free_irq_vectors(), even
+ * as we'll restore the previous command register immediately after.
+ * Hack around it by masking in a dev_flag to prevent such behavior.
*/
- if (vdev->nointx)
- pci_intx(pdev, 0);
+ pdev->dev_flags |= PCI_DEV_FLAGS_MSI_INTX_DISABLE_BUG;
+ pci_free_irq_vectors(pdev);
+ pdev->dev_flags = dev_flags;

+ vfio_pci_memory_unlock_and_restore(vdev, cmd);
vdev->irq_type = VFIO_PCI_NUM_IRQS;
vdev->num_ctx = 0;
kfree(vdev->ctx);
+ vfio_intx_stub_init(vdev);
}

/*
diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
index ef9a44b6cf5d..58e1029eb083 100644
--- a/include/linux/vfio_pci_core.h
+++ b/include/linux/vfio_pci_core.h
@@ -124,6 +124,7 @@ struct vfio_pci_core_device {
bool needs_reset;
bool nointx;
bool needs_pm_restore;
+ bool intx_stub;
struct pci_saved_state *pci_saved_state;
struct pci_saved_state *pm_save;
int ioeventfds_nr;
@@ -145,6 +146,9 @@ struct vfio_pci_core_device {
#define is_irq_none(vdev) (!(is_intx(vdev) || is_msi(vdev) || is_msix(vdev)))
#define irq_is(vdev, type) (vdev->irq_type == type)

+extern void vfio_intx_stub_init(struct vfio_pci_core_device *vdev);
+extern void vfio_intx_stub_exit(struct vfio_pci_core_device *vdev);
+
extern void vfio_pci_intx_mask(struct vfio_pci_core_device *vdev);
extern void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev);