RE: [PATCH v2] PCI: hv: Only reuse existing IRTE allocation for Multi-MSI
From: Michael Kelley (LINUX)
Date: Tue Oct 25 2022 - 14:18:37 EST
From: Dexuan Cui <decui@xxxxxxxxxxxxx> Sent: Monday, October 24, 2022 4:34 PM
>
> Jeffrey added Multi-MSI support to the pci-hyperv driver by the 4 patches:
> 08e61e861a0e ("PCI: hv: Fix multi-MSI to allow more than one MSI vector")
> 455880dfe292 ("PCI: hv: Fix hv_arch_irq_unmask() for multi-MSI")
> b4b77778ecc5 ("PCI: hv: Reuse existing IRTE allocation in compose_msi_msg()")
> a2bad844a67b ("PCI: hv: Fix interrupt mapping for multi-MSI")
>
> It turns out that the third patch (b4b77778ecc5) causes a performance
> regression because all the interrupts now happen on 1 physical CPU (or two
> pCPUs, if one pCPU doesn't have enough vectors). When a guest has many PCI
> devices, it may suffer from soft lockups if the workload is heavy, e.g.,
> see https://lwn.net/ml/linux-kernel/20220804025104.15673-1-decui@xxxxxxxxxxxxx/
>
> Commit b4b77778ecc5 itself is good. The real issue is that the hypercall in
> hv_irq_unmask() -> hv_arch_irq_unmask() ->
> hv_do_hypercall(HVCALL_RETARGET_INTERRUPT...) only changes the target
> virtual CPU rather than physical CPU; with b4b77778ecc5, the pCPU is
> determined only once in hv_compose_msi_msg() where only vCPU0 is specified;
> consequently the hypervisor only uses 1 target pCPU for all the interrupts.
>
> Note: before b4b77778ecc5, the pCPU is determined twice, and when the pCPU
> is determinted the second time, the vCPU in the effective affinity mask is
> used (i.e., it isn't always vCPU0), so the hypervisor chooses a different
> pCPU for each interrupt.
>
> The hypercall will be fixed in future to update the pCPU as well, but
> that will take quite a while, so let's restore the old behavior in
> hv_compose_msi_msg(), i.e., don't reuse the existing IRTE allocation for
> single-MSI and MSI-X; for multi-MSI, we choose the vCPU in a round-robin
> manner for each PCI device, so the interrupts of different devices can
> happen on different pCPUs, though the interrupts of each device happen on
> some single pCPU.
>
> The hypercall fix may not be backported to all old versions of Hyper-V, so
> we want to have this guest side change for ever (or at least till we're sure
> the old affected versions of Hyper-V are no longer supported).
>
> Fixes: b4b77778ecc5 ("PCI: hv: Reuse existing IRTE allocation in compose_msi_msg()")
> Co-developed-by: Jeffrey Hugo <quic_jhugo@xxxxxxxxxxx>
> Signed-off-by: Jeffrey Hugo <quic_jhugo@xxxxxxxxxxx>
> Co-developed-by: Carl Vanderlip <quic_carlv@xxxxxxxxxxx>
> Signed-off-by: Carl Vanderlip <quic_carlv@xxxxxxxxxxx>
> Signed-off-by: Dexuan Cui <decui@xxxxxxxxxxxxx>
>
> ---
> v1 is here:
>
> https://lwn.net/ml/linux-kernel/20220804025104.15673-1-decui@xxxxxxxxxxxxx/
>
> Changes in v2:
> round-robin the vCPU for multi-MSI.
> The commit message is re-worked.
> Added Jeff and Carl's Co-developed-by and Signed-off-by.
>
>
> drivers/pci/controller/pci-hyperv.c | 61 ++++++++++++++++++++++-------
> 1 file changed, 46 insertions(+), 15 deletions(-)
>
> diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
> index e7c6f6629e7c..7ac080f95259 100644
> --- a/drivers/pci/controller/pci-hyperv.c
> +++ b/drivers/pci/controller/pci-hyperv.c
> @@ -1613,7 +1613,7 @@ static void hv_pci_compose_compl(void *context, struct
> pci_response *resp,
> }
>
> static u32 hv_compose_msi_req_v1(
> - struct pci_create_interrupt *int_pkt, const struct cpumask *affinity,
> + struct pci_create_interrupt *int_pkt,
> u32 slot, u8 vector, u8 vector_count)
> {
> int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE;
> @@ -1640,18 +1640,39 @@ static int hv_compose_msi_req_get_cpu(const struct cpumask *affinity)
> return cpumask_first_and(affinity, cpu_online_mask);
> }
>
> -static u32 hv_compose_msi_req_v2(
> - struct pci_create_interrupt2 *int_pkt, const struct cpumask *affinity,
> - u32 slot, u8 vector, u8 vector_count)
> +/*
> + * Make sure the dummy vCPU values for multi-MSI don't all point to vCPU0.
> + */
> +static int hv_compose_multi_msi_req_get_cpu(void)
> {
> + static DEFINE_SPINLOCK(multi_msi_cpu_lock);
> +
> + /* -1 means starting with CPU 0 */
> + static int cpu_next = -1;
> +
> + unsigned long flags;
> int cpu;
>
> + spin_lock_irqsave(&multi_msi_cpu_lock, flags);
> +
> + cpu_next = cpumask_next_wrap(cpu_next, cpu_online_mask, nr_cpu_ids,
> + false);
I have a modest concern about using cpu_online_mask. The CPUs in this mask
can change if a CPU is taken online or offline in Linux. I don't think there's
a requirement to pick on an online CPU, especially since if we pick a CPU that's
online now, it might not be online later. Using cpu_present_mask would be
more correct. That's the CPUs that are present in the VM, which won't change
over time since Hyper-V doesn't hot-add or hot-remove CPUs in a VM.
A similar concern applies to hv_compose_msi_req_get_cpu().
Michael
> + cpu = cpu_next;
> +
> + spin_unlock_irqrestore(&multi_msi_cpu_lock, flags);
> +
> + return cpu;
> +}
> +
> +static u32 hv_compose_msi_req_v2(
> + struct pci_create_interrupt2 *int_pkt, int cpu,
> + u32 slot, u8 vector, u8 vector_count)
> +{
> int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE2;
> int_pkt->wslot.slot = slot;
> int_pkt->int_desc.vector = vector;
> int_pkt->int_desc.vector_count = vector_count;
> int_pkt->int_desc.delivery_mode = DELIVERY_MODE;
> - cpu = hv_compose_msi_req_get_cpu(affinity);
> int_pkt->int_desc.processor_array[0] =
> hv_cpu_number_to_vp_number(cpu);
> int_pkt->int_desc.processor_count = 1;
> @@ -1660,18 +1681,15 @@ static u32 hv_compose_msi_req_v2(
> }
>
> static u32 hv_compose_msi_req_v3(
> - struct pci_create_interrupt3 *int_pkt, const struct cpumask *affinity,
> + struct pci_create_interrupt3 *int_pkt, int cpu,
> u32 slot, u32 vector, u8 vector_count)
> {
> - int cpu;
> -
> int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE3;
> int_pkt->wslot.slot = slot;
> int_pkt->int_desc.vector = vector;
> int_pkt->int_desc.reserved = 0;
> int_pkt->int_desc.vector_count = vector_count;
> int_pkt->int_desc.delivery_mode = DELIVERY_MODE;
> - cpu = hv_compose_msi_req_get_cpu(affinity);
> int_pkt->int_desc.processor_array[0] =
> hv_cpu_number_to_vp_number(cpu);
> int_pkt->int_desc.processor_count = 1;
> @@ -1710,12 +1728,18 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
> struct pci_create_interrupt3 v3;
> } int_pkts;
> } __packed ctxt;
> + bool multi_msi;
> u64 trans_id;
> u32 size;
> int ret;
> + int cpu;
> +
> + msi_desc = irq_data_get_msi_desc(data);
> + multi_msi = !msi_desc->pci.msi_attrib.is_msix &&
> + msi_desc->nvec_used > 1;
>
> /* Reuse the previous allocation */
> - if (data->chip_data) {
> + if (data->chip_data && multi_msi) {
> int_desc = data->chip_data;
> msg->address_hi = int_desc->address >> 32;
> msg->address_lo = int_desc->address & 0xffffffff;
> @@ -1723,7 +1747,6 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
> return;
> }
>
> - msi_desc = irq_data_get_msi_desc(data);
> pdev = msi_desc_to_pci_dev(msi_desc);
> dest = irq_data_get_effective_affinity_mask(data);
> pbus = pdev->bus;
> @@ -1733,11 +1756,18 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
> if (!hpdev)
> goto return_null_message;
>
> + /* Free any previous message that might have already been composed. */
> + if (data->chip_data && !multi_msi) {
> + int_desc = data->chip_data;
> + data->chip_data = NULL;
> + hv_int_desc_free(hpdev, int_desc);
> + }
> +
> int_desc = kzalloc(sizeof(*int_desc), GFP_ATOMIC);
> if (!int_desc)
> goto drop_reference;
>
> - if (!msi_desc->pci.msi_attrib.is_msix && msi_desc->nvec_used > 1) {
> + if (multi_msi) {
> /*
> * If this is not the first MSI of Multi MSI, we already have
> * a mapping. Can exit early.
> @@ -1762,9 +1792,11 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
> */
> vector = 32;
> vector_count = msi_desc->nvec_used;
> + cpu = hv_compose_multi_msi_req_get_cpu();
> } else {
> vector = hv_msi_get_int_vector(data);
> vector_count = 1;
> + cpu = hv_compose_msi_req_get_cpu(dest);
> }
>
> memset(&ctxt, 0, sizeof(ctxt));
> @@ -1775,7 +1807,6 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
> switch (hbus->protocol_version) {
> case PCI_PROTOCOL_VERSION_1_1:
> size = hv_compose_msi_req_v1(&ctxt.int_pkts.v1,
> - dest,
> hpdev->desc.win_slot.slot,
> vector,
> vector_count);
> @@ -1784,7 +1815,7 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
> case PCI_PROTOCOL_VERSION_1_2:
> case PCI_PROTOCOL_VERSION_1_3:
> size = hv_compose_msi_req_v2(&ctxt.int_pkts.v2,
> - dest,
> + cpu,
> hpdev->desc.win_slot.slot,
> vector,
> vector_count);
> @@ -1792,7 +1823,7 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
>
> case PCI_PROTOCOL_VERSION_1_4:
> size = hv_compose_msi_req_v3(&ctxt.int_pkts.v3,
> - dest,
> + cpu,
> hpdev->desc.win_slot.slot,
> vector,
> vector_count);
> --
> 2.25.1