Re: [PATCH] genirq/msi: Make sure early activation of all PCI MSIs

From: Marc Zyngier
Date: Thu Jan 21 2021 - 16:26:47 EST


Hi Shameer,

On Thu, 21 Jan 2021 11:02:47 +0000,
Shameer Kolothum <shameerali.kolothum.thodi@xxxxxxxxxx> wrote:
>
> We currently do early activation of MSI irqs for PCI/MSI based on
> the MSI_FLAG_ACTIVATE_EARLY flag. Though this activates all the
> allocated MSIs in the case of MSI-X, it only does so for the
> base irq in the case of MSI. This is because, for MSI, there
> is only one msi_desc entry for all the 32 irqs it can support
> and the current implementation iterates over the msi entries and
> ends up activating the base irq only.
>
> The above creates an issue on platforms where the msi controller
> supports direct injection of vLPIs(eg: ARM GICv4 ITS). On these
> platforms, upon irq activation, ITS driver maps the event to an
> ITT entry. And for Guest pass-through to work, early mapping of
> all the dev MSI vectors is required. Otherwise, the vfio irq
> bypass manager registration will fail. eg, On a HiSilicon D06
> platform with GICv4 enabled, Guest boot with zip dev pass-through
> reports,
>
> "vfio-pci 0000:75:00.1: irq bypass producer (token 0000000006e5176a)
> registration fails: 66311"
>
> and Guest boot fails.
>
> This is traced to,
>    kvm_arch_irq_bypass_add_producer
>      kvm_vgic_v4_set_forwarding
>        vgic_its_resolve_lpi --> returns E_ITS_INT_UNMAPPED_INTERRUPT
>
> Hence make sure we activate all the irqs for both MSI and MSI-x cases.
>
> Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@xxxxxxxxxx>
> ---
> It is unclear to me whether not performing the early activation of all
> MSI irqs was deliberate and has consequences on any other platforms.
> Please let me know.

Probably just an oversight.

>
> Thanks,
> Shameer
> ---
> kernel/irq/msi.c | 114 +++++++++++++++++++++++++++++++++++++----------
> 1 file changed, 90 insertions(+), 24 deletions(-)
>
> diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
> index 2c0c4d6d0f83..eec187fc32a9 100644
> --- a/kernel/irq/msi.c
> +++ b/kernel/irq/msi.c
> @@ -395,6 +395,78 @@ static bool msi_check_reservation_mode(struct irq_domain *domain,
> return desc->msi_attrib.is_msix || desc->msi_attrib.maskbit;
> }
>
> +static void msi_domain_deactivate_irq(struct irq_domain *domain, int irq)
> +{
> + struct irq_data *irqd;
> +
> + irqd = irq_domain_get_irq_data(domain, irq);
> + if (irqd_is_activated(irqd))
> + irq_domain_deactivate_irq(irqd);
> +}
> +
> +static int msi_domain_activate_irq(struct irq_domain *domain,
> + int irq, bool can_reserve)
> +{
> + struct irq_data *irqd;
> +
> + irqd = irq_domain_get_irq_data(domain, irq);
> + if (!can_reserve) {
> + irqd_clr_can_reserve(irqd);
> + if (domain->flags & IRQ_DOMAIN_MSI_NOMASK_QUIRK)
> + irqd_set_msi_nomask_quirk(irqd);
> + }
> + return irq_domain_activate_irq(irqd, can_reserve);
> +}
> +
> +static int msi_domain_activate_msix_irqs(struct irq_domain *domain,
> + struct device *dev, bool can_reserve)
> +{
> + struct msi_desc *desc;
> + int ret, irq;
> +
> + for_each_msi_entry(desc, dev) {
> + irq = desc->irq;
> + ret = msi_domain_activate_irq(domain, irq, can_reserve);
> + if (ret)
> + goto out;
> + }
> + return 0;
> +
> +out:
> + for_each_msi_entry(desc, dev) {
> + if (irq == desc->irq)
> + break;
> + msi_domain_deactivate_irq(domain, desc->irq);
> + }
> + return ret;
> +}
> +
> +static int msi_domain_activate_msi_irqs(struct irq_domain *domain,
> + struct device *dev, bool can_reserve)
> +{
> + struct msi_desc *desc;
> + int i, ret, base, irq;
> +
> + desc = first_msi_entry(dev);
> + base = desc->irq;
> +
> + for (i = 0; i < desc->nvec_used; i++) {
> + irq = base + i;
> + ret = msi_domain_activate_irq(domain, irq, can_reserve);
> + if (ret)
> + goto out;
> + }
> + return 0;
> +
> +out:
> + for (i = 0; i < desc->nvec_used; i++) {
> + if (irq == base + i)
> + break;
> + msi_domain_deactivate_irq(domain, base + i);
> + }
> + return ret;
> +}
> +
> int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
> int nvec)
> {
> @@ -443,21 +515,25 @@ int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
> else
> dev_dbg(dev, "irq [%d-%d] for MSI\n",
> virq, virq + desc->nvec_used - 1);
> - /*
> - * This flag is set by the PCI layer as we need to activate
> - * the MSI entries before the PCI layer enables MSI in the
> - * card. Otherwise the card latches a random msi message.
> - */
> - if (!(info->flags & MSI_FLAG_ACTIVATE_EARLY))
> - continue;
> + }
>
> - irq_data = irq_domain_get_irq_data(domain, desc->irq);
> - if (!can_reserve) {
> - irqd_clr_can_reserve(irq_data);
> - if (domain->flags & IRQ_DOMAIN_MSI_NOMASK_QUIRK)
> - irqd_set_msi_nomask_quirk(irq_data);
> - }
> - ret = irq_domain_activate_irq(irq_data, can_reserve);
> + /*
> + * This flag is set by the PCI layer as we need to activate
> + * the MSI entries before the PCI layer enables MSI in the
> + * card. Otherwise the card latches a random msi message.
> + * Early activation is also required when the msi controller
> + * supports direct injection of virtual LPIs(eg. ARM GICv4 ITS).
> + * Otherwise, the DevID/EventID -> LPI translation for pass-through
> + * devices will fail. Make sure we do activate all the allocated
> + * irqs for MSI and MSI-X cases.
> + */
> + if ((info->flags & MSI_FLAG_ACTIVATE_EARLY)) {
> + desc = first_msi_entry(dev);
> +
> + if (desc->msi_attrib.is_msix)
> + ret = msi_domain_activate_msix_irqs(domain, dev, can_reserve);
> + else
> + ret = msi_domain_activate_msi_irqs(domain, dev, can_reserve);
> if (ret)
> goto cleanup;
> }
> @@ -475,16 +551,6 @@ int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
> return 0;
>
> cleanup:
> - for_each_msi_entry(desc, dev) {
> - struct irq_data *irqd;
> -
> - if (desc->irq == virq)
> - break;
> -
> - irqd = irq_domain_get_irq_data(domain, desc->irq);
> - if (irqd_is_activated(irqd))
> - irq_domain_deactivate_irq(irqd);
> - }
> msi_domain_free_irqs(domain, dev);
> return ret;
> }
> --
> 2.17.1
>
>

I find this pretty complicated, and the I'd like to avoid injecting
the PCI MSI-vs-MSI-X concept in something that is supposed to be
bus-agnostic.

What's wrong with the following (untested) patch, which looks much
simpler?

Thanks,

M.

diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 2c0c4d6d0f83..a930d03c2c67 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -451,15 +451,17 @@ int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
if (!(info->flags & MSI_FLAG_ACTIVATE_EARLY))
continue;

- irq_data = irq_domain_get_irq_data(domain, desc->irq);
- if (!can_reserve) {
- irqd_clr_can_reserve(irq_data);
- if (domain->flags & IRQ_DOMAIN_MSI_NOMASK_QUIRK)
- irqd_set_msi_nomask_quirk(irq_data);
+ for (i = 0; i < desc->nvec_used; i++) {
+ irq_data = irq_domain_get_irq_data(domain, desc->irq + i);
+ if (!can_reserve) {
+ irqd_clr_can_reserve(irq_data);
+ if (domain->flags & IRQ_DOMAIN_MSI_NOMASK_QUIRK)
+ irqd_set_msi_nomask_quirk(irq_data);
+ }
+ ret = irq_domain_activate_irq(irq_data, can_reserve);
+ if (ret)
+ goto cleanup;
}
- ret = irq_domain_activate_irq(irq_data, can_reserve);
- if (ret)
- goto cleanup;
}

/*
@@ -478,12 +480,14 @@ int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
for_each_msi_entry(desc, dev) {
struct irq_data *irqd;

+ for (i = 0; i < desc->nvec_used; i++) {
+ irqd = irq_domain_get_irq_data(domain, desc->irq + i);
+ if (irqd_is_activated(irqd))
+ irq_domain_deactivate_irq(irqd);
+ }
+
if (desc->irq == virq)
break;
-
- irqd = irq_domain_get_irq_data(domain, desc->irq);
- if (irqd_is_activated(irqd))
- irq_domain_deactivate_irq(irqd);
}
msi_domain_free_irqs(domain, dev);
return ret;

--
Without deviation from the norm, progress is not possible.