Re: [RFC v2 1/5] vfio/type1: Introduce iova list and add iommu aperture validity check

From: Alex Williamson
Date: Wed Jan 17 2018 - 19:04:47 EST


On Fri, 12 Jan 2018 16:45:27 +0000
Shameer Kolothum <shameerali.kolothum.thodi@xxxxxxxxxx> wrote:

> This introduces an iova list that is valid for dma mappings. Make
> sure the new iommu aperture window is valid and doesn't conflict
> with any existing dma mappings during attach. Also update the iova
> list with new aperture window during attach/detach.
>
> Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@xxxxxxxxxx>
> ---
> drivers/vfio/vfio_iommu_type1.c | 177 ++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 177 insertions(+)
>
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index e30e29a..11cbd49 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -60,6 +60,7 @@
>
> struct vfio_iommu {
> struct list_head domain_list;
> + struct list_head iova_list;
> struct vfio_domain *external_domain; /* domain for external user */
> struct mutex lock;
> struct rb_root dma_list;
> @@ -92,6 +93,12 @@ struct vfio_group {
> struct list_head next;
> };
>
> +struct vfio_iova {
> + struct list_head list;
> + phys_addr_t start;
> + phys_addr_t end;
> +};

dma_list uses dma_addr_t for the iova. IOVAs are naturally DMA
addresses, why are we using phys_addr_t?

> +
> /*
> * Guest RAM pinning working set or DMA target
> */
> @@ -1192,6 +1199,123 @@ static bool vfio_iommu_has_sw_msi(struct iommu_group *group, phys_addr_t *base)
> return ret;
> }
>
> +static int vfio_insert_iova(phys_addr_t start, phys_addr_t end,
> + struct list_head *head)
> +{
> + struct vfio_iova *region;
> +
> + region = kmalloc(sizeof(*region), GFP_KERNEL);
> + if (!region)
> + return -ENOMEM;
> +
> + INIT_LIST_HEAD(&region->list);
> + region->start = start;
> + region->end = end;
> +
> + list_add_tail(&region->list, head);
> + return 0;
> +}

As I'm reading through this series, I'm learning that there are a lot
of assumptions and subtle details that should be documented. For
instance, the IOMMU API only provides a single geometry and we build
upon that here as this patch creates a list, but there's only a single
entry for now. The following patches carve that single iova range into
pieces and somewhat subtly use the list_head passed to keep the list
sorted, allowing the first/last_entry tricks used throughout. Subtle
interfaces are prone to bugs.

> +
> +/*
> + * Find whether a mem region overlaps with existing dma mappings
> + */
> +static bool vfio_find_dma_overlap(struct vfio_iommu *iommu,
> + phys_addr_t start, phys_addr_t end)
> +{
> + struct rb_node *n = rb_first(&iommu->dma_list);
> +
> + for (; n; n = rb_next(n)) {
> + struct vfio_dma *dma;
> +
> + dma = rb_entry(n, struct vfio_dma, node);
> +
> + if (end < dma->iova)
> + break;
> + if (start >= dma->iova + dma->size)
> + continue;
> + return true;
> + }
> +
> + return false;
> +}

Why do we need this in addition to the existing vfio_find_dma()? Why
doesn't this use the tree structure of the dma_list?

> +
> +/*
> + * Check the new iommu aperture is a valid one
> + */
> +static int vfio_iommu_valid_aperture(struct vfio_iommu *iommu,
> + phys_addr_t start,
> + phys_addr_t end)
> +{
> + struct vfio_iova *first, *last;
> + struct list_head *iova = &iommu->iova_list;
> +
> + if (list_empty(iova))
> + return 0;
> +
> + /* Check if new one is outside the current aperture */

"Disjoint sets"

> + first = list_first_entry(iova, struct vfio_iova, list);
> + last = list_last_entry(iova, struct vfio_iova, list);
> + if ((start > last->end) || (end < first->start))
> + return -EINVAL;
> +
> + /* Check for any existing dma mappings outside the new start */
> + if (start > first->start) {
> + if (vfio_find_dma_overlap(iommu, first->start, start - 1))
> + return -EINVAL;
> + }
> +
> + /* Check for any existing dma mappings outside the new end */
> + if (end < last->end) {
> + if (vfio_find_dma_overlap(iommu, end + 1, last->end))
> + return -EINVAL;
> + }
> +
> + return 0;
> +}

I think this returns an int because you want to use it for the return
value below, but it really seems like a bool question, ie. does this
aperture conflict with existing mappings. Additionally, the aperture
is valid, it was provided to us by the IOMMU API, the question is
whether it conflicts. Please also name consistently to the other
functions in this patch, vfio_iommu_aper_xxxx().

> +
> +/*
> + * Adjust the iommu aperture window if new aperture is a valid one
> + */
> +static int vfio_iommu_iova_aper_adjust(struct vfio_iommu *iommu,
> + phys_addr_t start,
> + phys_addr_t end)

Perhaps "resize", "prune", or "shrink" to make it more clear what is
being adjusted?

> +{
> + struct vfio_iova *node, *next;
> + struct list_head *iova = &iommu->iova_list;
> +
> + if (list_empty(iova))
> + return vfio_insert_iova(start, end, iova);
> +
> + /* Adjust iova list start */
> + list_for_each_entry_safe(node, next, iova, list) {
> + if (start < node->start)
> + break;
> + if ((start >= node->start) && (start <= node->end)) {

start == node->end results in a zero sized node. s/<=/</

> + node->start = start;
> + break;
> + }
> + /* Delete nodes before new start */
> + list_del(&node->list);
> + kfree(node);
> + }
> +
> + /* Adjust iova list end */
> + list_for_each_entry_safe(node, next, iova, list) {
> + if (end > node->end)
> + continue;
> +
> + if ((end >= node->start) && (end <= node->end)) {

end == node->start results in a zero sized node. s/>=/>/

> + node->end = end;
> + continue;
> + }
> + /* Delete nodes after new end */
> + list_del(&node->list);
> + kfree(node);
> + }
> +
> + return 0;
> +}
> +
> static int vfio_iommu_type1_attach_group(void *iommu_data,
> struct iommu_group *iommu_group)
> {
> @@ -1202,6 +1326,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
> int ret;
> bool resv_msi, msi_remap;
> phys_addr_t resv_msi_base;
> + struct iommu_domain_geometry geo;
>
> mutex_lock(&iommu->lock);
>
> @@ -1271,6 +1396,14 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
> if (ret)
> goto out_domain;
>
> + /* Get aperture info */
> + iommu_domain_get_attr(domain->domain, DOMAIN_ATTR_GEOMETRY, &geo);
> +
> + ret = vfio_iommu_valid_aperture(iommu, geo.aperture_start,
> + geo.aperture_end);
> + if (ret)
> + goto out_detach;
> +
> resv_msi = vfio_iommu_has_sw_msi(iommu_group, &resv_msi_base);
>
> INIT_LIST_HEAD(&domain->group_list);
> @@ -1327,6 +1460,11 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
> goto out_detach;
> }
>
> + ret = vfio_iommu_iova_aper_adjust(iommu, geo.aperture_start,
> + geo.aperture_end);
> + if (ret)
> + goto out_detach;
> +
> list_add(&domain->next, &iommu->domain_list);
>
> mutex_unlock(&iommu->lock);
> @@ -1392,6 +1530,35 @@ static void vfio_sanity_check_pfn_list(struct vfio_iommu *iommu)
> WARN_ON(iommu->notifier.head);
> }
>
> +/*
> + * Called when a domain is removed in detach. It is possible that
> + * the removed domain decided the iova aperture window. Modify the
> + * iova aperture with the smallest window among existing domains.
> + */
> +static void vfio_iommu_iova_aper_refresh(struct vfio_iommu *iommu)
> +{
> + struct vfio_domain *domain;
> + struct iommu_domain_geometry geo;
> + struct vfio_iova *node;
> + phys_addr_t start = 0;
> + phys_addr_t end = (phys_addr_t)~0;
> +
> + list_for_each_entry(domain, &iommu->domain_list, next) {
> + iommu_domain_get_attr(domain->domain, DOMAIN_ATTR_GEOMETRY,
> + &geo);
> + if (geo.aperture_start > start)
> + start = geo.aperture_start;
> + if (geo.aperture_end < end)
> + end = geo.aperture_end;
> + }
> +
> + /* modify iova aperture limits */
> + node = list_first_entry(&iommu->iova_list, struct vfio_iova, list);
> + node->start = start;
> + node = list_last_entry(&iommu->iova_list, struct vfio_iova, list);
> + node->end = end;

We can do this because the new aperture is the same or bigger than the
current aperture, never smaller. That's not fully obvious and should
be noted in the comment. Perhaps this function should be "expand"
rather than "refresh".

> +}
> +
> static void vfio_iommu_type1_detach_group(void *iommu_data,
> struct iommu_group *iommu_group)
> {
> @@ -1445,6 +1612,7 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
> iommu_domain_free(domain->domain);
> list_del(&domain->next);
> kfree(domain);
> + vfio_iommu_iova_aper_refresh(iommu);
> }
> break;
> }
> @@ -1475,6 +1643,7 @@ static void *vfio_iommu_type1_open(unsigned long arg)
> }
>
> INIT_LIST_HEAD(&iommu->domain_list);
> + INIT_LIST_HEAD(&iommu->iova_list);
> iommu->dma_list = RB_ROOT;
> mutex_init(&iommu->lock);
> BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);
> @@ -1502,6 +1671,7 @@ static void vfio_iommu_type1_release(void *iommu_data)
> {
> struct vfio_iommu *iommu = iommu_data;
> struct vfio_domain *domain, *domain_tmp;
> + struct vfio_iova *iova, *iova_tmp;
>
> if (iommu->external_domain) {
> vfio_release_domain(iommu->external_domain, true);
> @@ -1517,6 +1687,13 @@ static void vfio_iommu_type1_release(void *iommu_data)
> list_del(&domain->next);
> kfree(domain);
> }
> +
> + list_for_each_entry_safe(iova, iova_tmp,
> + &iommu->iova_list, list) {
> + list_del(&iova->list);
> + kfree(iova);
> + }
> +
> kfree(iommu);
> }
>