Re: [PATCH 4/4] drivers:pci:hv: New paravirtual PCI front-end for Hyper-V VMs

From: Thomas Gleixner
Date: Sun Aug 02 2015 - 04:48:14 EST


On Sun, 2 Aug 2015, jakeo@xxxxxxxxxxxxx wrote:
> +#include <linux/kernel.h>
> +#include <linux/jiffies.h>
> +#include <linux/mman.h>
> +#include <linux/delay.h>
> +#include <linux/init.h>
> +#include <linux/module.h>
> +#include <linux/slab.h>
> +#include <linux/kthread.h>
> +#include <linux/completion.h>
> +#include <linux/notifier.h>
> +#include <linux/of_pci.h>
> +#include <linux/of_platform.h>
> +#include <linux/pci.h>
> +#include <linux/semaphore.h>
> +#include <linux/irqdomain.h>
> +#include <asm/irqdomain.h>
> +#include <linux/msi.h>

I seriously doubt, that you need all of these includes.

> +/* Interrupt management hooks */
> +
> +/**
> + * hv_msi_free() - Free the MSI.
> + * @domain: The interrupt domain pointer
> + * @info: Extra MSI-related context
> + * @virq: Identifies the IRQ.
> + *
> + * The Hyper-V parent partition and hypervisor are tracking the
> + * messages that are in use, keeping the interrupt redirection
> + * table up to date. This callback sends a message that frees
> + * the the IRT entry and related tracking nonsense.
> + */
> +void hv_msi_free(struct irq_domain *domain, struct msi_domain_info *info,
> + unsigned int virq)

static?

> +{
> + struct pci_delete_interrupt *int_pkt;
> + struct {
> + struct pci_packet pkt;
> + u8 buffer[sizeof(struct pci_delete_interrupt) -
> + sizeof(struct pci_message)];
> + } ctxt;
> + struct hv_pcibus_device *hbus;
> + struct hv_pci_dev *hpdev;
> + struct irq_desc *desc;
> + struct msi_desc *msi;
> + struct tran_int_desc *int_desc;
> + struct irq_desc *irq_desc;
> +
> + desc = irq_to_desc(virq);
> + msi = irq_desc_get_msi_desc(desc);

Why do you need to lookup the irq descriptor if you want the msi descriptor?

> + hbus = info->data;
> + hpdev = lookup_hv_dev(hbus, devfn_to_wslot(msi->dev->devfn));
> + if (!hpdev)
> + return;
> +
> + int_desc = irq_get_handler_data(virq);

I don't think this is a proper storage point. The data is domain/chip
specific, right? So, what's wrong with storing it in irq_data::chip_data?

> + if (int_desc) {
> + memset(&ctxt, 0, sizeof(ctxt));
> + int_pkt = (struct pci_delete_interrupt *)&ctxt.pkt.message;
> + int_pkt->message_type.message_type =
> + PCI_DELETE_INTERRUPT_MESSAGE;
> + int_pkt->wslot.slot = hpdev->desc.win_slot.slot;
> + int_pkt->int_desc = *int_desc;
> + vmbus_sendpacket(hbus->hdev->channel, int_pkt, sizeof(*int_pkt),
> + (unsigned long)&ctxt.pkt, VM_PKT_DATA_INBAND,
> + 0);
> + kfree(int_desc);

Free before clearing the reference?

> + irq_desc = irq_to_desc(virq);

Do you expect the descriptor to change between the lookup above and
this one?

> + irq_desc->irq_data.handler_data = NULL;

You are not supposed to fiddle in irq_desc or irq_data internals. We
have helper functions for that.

> + }
> +
> + hv_pcichild_dec(hpdev, hv_pcidev_ref_by_slot);
> +}
> +
> +int hv_set_affinity(struct irq_data *data, const struct cpumask *dest,
> + bool force)

static, if at all.

> +{
> + struct irq_data *parent = data->parent_data;
> +
> + return parent->chip->irq_set_affinity(parent, dest, force);
> +}

irq_chip_set_affinity_parent ???

> +/**
> + * hv_compose_msi_msg() - Supplies a valid MSI address/data
> + * @data: Everything about this MSI
> + * @msg: Buffer that is filled in by this function
> + *
> + * This function unpacks the IRQ looking for target CPU set, IDT
> + * vector and mode and sends a message to the parent partition
> + * asking for a mapping for that tuple in this partition. The
> + * response supplies a data value and address to which that data
> + * should be written to trigger that interrupt.
> + */
> +void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)

static ?

> +{
> + struct irq_cfg *cfg = irqd_cfg(data);
> + struct hv_pcibus_device *hbus;
> + struct hv_pci_dev *hpdev;
> + struct pci_bus *pbus;
> + struct pci_create_interrupt *int_pkt;
> + struct compose_comp_ctxt comp;
> + struct tran_int_desc *int_desc;
> + struct irq_desc *irq_desc;
> + struct {
> + struct pci_packet pkt;
> + u8 buffer[sizeof(struct pci_create_interrupt) -
> + sizeof(struct pci_message)];
> + } ctxt;
> + int cpu;
> + int ret;
> +
> + pbus = data->msi_desc->dev->bus;
> + hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata);
> + hpdev = lookup_hv_dev(hbus, devfn_to_wslot(data->msi_desc->dev->devfn));
> +
> + if (!hpdev)
> + goto return_null_message;
> +
> + int_desc = kzalloc(sizeof(*int_desc), GFP_KERNEL);
> + if (!int_desc)
> + goto return_null_message;
> +
> + memset(&ctxt, 0, sizeof(ctxt));
> + init_completion(&comp.comp_pkt.host_event);
> + ctxt.pkt.completion_func = hv_pci_compose_compl;
> + ctxt.pkt.compl_ctxt = &comp;
> + int_pkt = (struct pci_create_interrupt *)&ctxt.pkt.message;
> + int_pkt->message_type.message_type = PCI_CREATE_INTERRUPT_MESSAGE;
> + int_pkt->wslot.slot = hpdev->desc.win_slot.slot;
> + int_pkt->int_desc.vector = cfg->vector;
> + int_pkt->int_desc.vector_count = 1;
> + int_pkt->int_desc.delivery_mode =
> + (apic->irq_delivery_mode == dest_LowestPrio) ? 1 : 0;
> +
> + /*
> + * Cut down interrupt targets to a single processor. Hyper-V
> + * currently only supports picking the target of an MSI by
> + * sending a message to the parent partition and receiving a
> + * response. This can't really work in the context of
> + * irq_set_affinity, which sometimes needs to run in an environment
> + * that can't wait for a completion. Until a future version of
> + * Hyper-V exists that can retarget an interrupt without waiting,
> + * this code picks a single virtual processor as the target. This
> + * comment and the following lines should be deleted when that
> + * happens.
> + */
> + cpu = cpumask_any_and(data->affinity, cpu_online_mask);

We have accessors for this for a reason. data->affinity will be gone
after 4.3-rc1.

> + cpumask_clear(data->affinity);
> + cpumask_set_cpu(cpu, data->affinity);

> + /*
> + * This bit doesn't have to work on machines with more than 64
> + * processors because Hyper-V only supports 64 in a guest.
> + */
> + for_each_cpu(cpu, data->affinity) {
> + if (cpu_is_offline(cpu))
> + continue;
> + int_pkt->int_desc.cpu_mask |=
> + (1 << vmbus_cpu_number_to_vp_number(cpu));
> + }

ROTFL. What's the point of this loop? To find the single already known
cpu which is in the mask?

> + ret = vmbus_sendpacket(hpdev->hbus->hdev->channel, int_pkt,
> + sizeof(*int_pkt), (unsigned long)&ctxt.pkt,
> + VM_PKT_DATA_INBAND,
> + VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
> + if (!ret)
> + wait_for_completion(&comp.comp_pkt.host_event);
> +
> + if (comp.comp_pkt.completion_status < 0) {
> + pr_err("Request for interrupt failed: 0x%x",
> + comp.comp_pkt.completion_status);
> + goto return_null_message;
> + }
> +
> + /*
> + * Record the assignment so that this can be unwound later. Using
> + * irq_set_handler_data() here would be appropriate, but the lock
> + * it takes is already held.
> + */
> + *int_desc = comp.int_desc;
> + irq_desc = irq_to_desc(data->irq);
> + irq_desc->irq_data.handler_data = int_desc;

You're really a fan of redundant lookups. How is irq_desc->irq_data
going to be different from data?

> + /* Pass up the result. */
> + msg->address_hi = comp.int_desc.address >> 32;
> + msg->address_lo = comp.int_desc.address & 0xffffffff;
> + msg->data = comp.int_desc.data;
> +
> + hv_pcichild_dec(hpdev, hv_pcidev_ref_by_slot);
> + return;
> +
> +return_null_message:

Leaks int_desc and a refcount on hpdev.

> + msg->address_hi = 0;
> + msg->address_lo = 0;
> + msg->data = 0;
> +}
> +
> +/* HW Interrupt Chip Descriptor */
> +static struct irq_chip hv_msi_irq_chip = {
> + .name = "Hyper-V PCIe MSI",
> + .irq_compose_msi_msg = hv_compose_msi_msg,
> + .irq_set_affinity = hv_set_affinity,
> + .irq_ack = irq_chip_ack_parent,

Please format this readable:

.name = "Hyper-V PCIe MSI",
.irq_compose_msi_msg = hv_compose_msi_msg,
.irq_set_affinity = hv_set_affinity,
.irq_ack = irq_chip_ack_parent,

Hmm?

> +};
> +
> +static irq_hw_number_t hv_msi_domain_ops_get_hwirq(struct msi_domain_info *info,
> + msi_alloc_info_t *arg)
> +{
> + return arg->msi_hwirq;
> +}
> +
> +static int hv_msi_domain_ops_prepare(struct irq_domain *domain,
> + struct device *dev, int nvec,
> + msi_alloc_info_t *arg)
> +{
> + struct pci_dev *pdev = to_pci_dev(dev);
> + struct msi_desc *desc = first_pci_msi_entry(pdev);
> +
> + memset(arg, 0, sizeof(*arg));
> + arg->msi_dev = pdev;
> + if (desc->msi_attrib.is_msix) {
> + arg->type = X86_IRQ_ALLOC_TYPE_MSIX;
> + } else {
> + arg->type = X86_IRQ_ALLOC_TYPE_MSI;
> + arg->flags |= X86_IRQ_ALLOC_CONTIGUOUS_VECTORS;
> + }
> +
> + return 0;
> +}

Pretty much a copy of the x86 code, right?

I wonder whether this MSI infrastructure code would be better
seperated out and moved to arch/x86/hyperv/msi.c or
arch/x86/kernel/apic/hvmsi.c. It's small enough to be built in. So all
you'd need to export is hv_pcie_init_irq_domain and
hv_pcie_free_irq_domain.

Thanks,

tglx
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/