Re: [RFC v2 1/5] vfio/type1: Introduce iova list and add iommu aperture validity check

From: Auger Eric
Date: Tue Jan 23 2018 - 03:25:19 EST


Hi Shameer,

On 18/01/18 01:04, Alex Williamson wrote:
> On Fri, 12 Jan 2018 16:45:27 +0000
> Shameer Kolothum <shameerali.kolothum.thodi@xxxxxxxxxx> wrote:
>
>> This introduces an iova list that is valid for dma mappings. Make
>> sure the new iommu aperture window is valid and doesn't conflict
>> with any existing dma mappings during attach. Also update the iova
>> list with new aperture window during attach/detach.
>>
>> Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@xxxxxxxxxx>
>> ---
>> drivers/vfio/vfio_iommu_type1.c | 177 ++++++++++++++++++++++++++++++++++++++++
>> 1 file changed, 177 insertions(+)
>>
>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
>> index e30e29a..11cbd49 100644
>> --- a/drivers/vfio/vfio_iommu_type1.c
>> +++ b/drivers/vfio/vfio_iommu_type1.c
>> @@ -60,6 +60,7 @@
>>
>> struct vfio_iommu {
>> struct list_head domain_list;
>> + struct list_head iova_list;
>> struct vfio_domain *external_domain; /* domain for external user */
>> struct mutex lock;
>> struct rb_root dma_list;
>> @@ -92,6 +93,12 @@ struct vfio_group {
>> struct list_head next;
>> };
>>
>> +struct vfio_iova {
>> + struct list_head list;
>> + phys_addr_t start;
>> + phys_addr_t end;
>> +};
>
> dma_list uses dma_addr_t for the iova. IOVAs are naturally DMA
> addresses, why are we using phys_addr_t?
>
>> +
>> /*
>> * Guest RAM pinning working set or DMA target
>> */
>> @@ -1192,6 +1199,123 @@ static bool vfio_iommu_has_sw_msi(struct iommu_group *group, phys_addr_t *base)
>> return ret;
>> }
>>
>> +static int vfio_insert_iova(phys_addr_t start, phys_addr_t end,
>> + struct list_head *head)
>> +{
>> + struct vfio_iova *region;
>> +
>> + region = kmalloc(sizeof(*region), GFP_KERNEL);
>> + if (!region)
>> + return -ENOMEM;
>> +
>> + INIT_LIST_HEAD(&region->list);
>> + region->start = start;
>> + region->end = end;
>> +
>> + list_add_tail(&region->list, head);
>> + return 0;
>> +}
>
> As I'm reading through this series, I'm learning that there are a lot
> of assumptions and subtle details that should be documented. For
> instance, the IOMMU API only provides a single geometry and we build
> upon that here as this patch creates a list, but there's only a single
> entry for now. The following patches carve that single iova range into
> pieces and somewhat subtly use the list_head passed to keep the list
> sorted, allowing the first/last_entry tricks used throughout. Subtle
> interfaces are prone to bugs.
>
>> +
>> +/*
>> + * Find whether a mem region overlaps with existing dma mappings
>> + */
>> +static bool vfio_find_dma_overlap(struct vfio_iommu *iommu,
>> + phys_addr_t start, phys_addr_t end)
>> +{
>> + struct rb_node *n = rb_first(&iommu->dma_list);
>> +
>> + for (; n; n = rb_next(n)) {
>> + struct vfio_dma *dma;
>> +
>> + dma = rb_entry(n, struct vfio_dma, node);
>> +
>> + if (end < dma->iova)
>> + break;
>> + if (start >= dma->iova + dma->size)
>> + continue;
>> + return true;
>> + }
>> +
>> + return false;
>> +}
>
> Why do we need this in addition to the existing vfio_find_dma()? Why
> doesn't this use the tree structure of the dma_list?
>
>> +
>> +/*
>> + * Check the new iommu aperture is a valid one
>> + */
>> +static int vfio_iommu_valid_aperture(struct vfio_iommu *iommu,
>> + phys_addr_t start,
>> + phys_addr_t end)
>> +{
>> + struct vfio_iova *first, *last;
>> + struct list_head *iova = &iommu->iova_list;
>> +
>> + if (list_empty(iova))
>> + return 0;
>> +
>> + /* Check if new one is outside the current aperture */
>
> "Disjoint sets"
>
>> + first = list_first_entry(iova, struct vfio_iova, list);
>> + last = list_last_entry(iova, struct vfio_iova, list);
>> + if ((start > last->end) || (end < first->start))
>> + return -EINVAL;
>> +
>> + /* Check for any existing dma mappings outside the new start */
>> + if (start > first->start) {
>> + if (vfio_find_dma_overlap(iommu, first->start, start - 1))
>> + return -EINVAL;
>> + }
>> +
>> + /* Check for any existing dma mappings outside the new end */
>> + if (end < last->end) {
>> + if (vfio_find_dma_overlap(iommu, end + 1, last->end))
>> + return -EINVAL;
>> + }
>> +
>> + return 0;
>> +}
>
> I think this returns an int because you want to use it for the return
> value below, but it really seems like a bool question, ie. does this
> aperture conflict with existing mappings. Additionally, the aperture
> is valid, it was provided to us by the IOMMU API, the question is
> whether it conflicts. Please also name consistently to the other
> functions in this patch, vfio_iommu_aper_xxxx().
>
>> +
>> +/*
>> + * Adjust the iommu aperture window if new aperture is a valid one
>> + */
>> +static int vfio_iommu_iova_aper_adjust(struct vfio_iommu *iommu,
>> + phys_addr_t start,
>> + phys_addr_t end)
>
> Perhaps "resize", "prune", or "shrink" to make it more clear what is
> being adjusted?
>
>> +{
>> + struct vfio_iova *node, *next;
>> + struct list_head *iova = &iommu->iova_list;
>> +
>> + if (list_empty(iova))
>> + return vfio_insert_iova(start, end, iova);
>> +
>> + /* Adjust iova list start */
>> + list_for_each_entry_safe(node, next, iova, list) {
>> + if (start < node->start)
>> + break;
>> + if ((start >= node->start) && (start <= node->end)) {
>
> start == node->end results in a zero sized node. s/<=/</
>
>> + node->start = start;
>> + break;
>> + }
>> + /* Delete nodes before new start */
>> + list_del(&node->list);
>> + kfree(node);
>> + }
>> +
>> + /* Adjust iova list end */
>> + list_for_each_entry_safe(node, next, iova, list) {
>> + if (end > node->end)
>> + continue;
>> +
>> + if ((end >= node->start) && (end <= node->end)) {
>
> end == node->start results in a zero sized node. s/>=/>/
>
>> + node->end = end;
>> + continue;
>> + }
>> + /* Delete nodes after new end */
>> + list_del(&node->list);
>> + kfree(node);
>> + }
>> +
>> + return 0;
>> +}
>> +
>> static int vfio_iommu_type1_attach_group(void *iommu_data,
>> struct iommu_group *iommu_group)
>> {
>> @@ -1202,6 +1326,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>> int ret;
>> bool resv_msi, msi_remap;
>> phys_addr_t resv_msi_base;
>> + struct iommu_domain_geometry geo;
>>
>> mutex_lock(&iommu->lock);
>>
>> @@ -1271,6 +1396,14 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>> if (ret)
>> goto out_domain;
>>
>> + /* Get aperture info */
>> + iommu_domain_get_attr(domain->domain, DOMAIN_ATTR_GEOMETRY, &geo);
>> +
>> + ret = vfio_iommu_valid_aperture(iommu, geo.aperture_start,
>> + geo.aperture_end);
>> + if (ret)
>> + goto out_detach;
>> +
>> resv_msi = vfio_iommu_has_sw_msi(iommu_group, &resv_msi_base);
>>
>> INIT_LIST_HEAD(&domain->group_list);
>> @@ -1327,6 +1460,11 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>> goto out_detach;
>> }
>>
>> + ret = vfio_iommu_iova_aper_adjust(iommu, geo.aperture_start,
>> + geo.aperture_end);
>> + if (ret)
>> + goto out_detach;
>> +
>> list_add(&domain->next, &iommu->domain_list);
>>
>> mutex_unlock(&iommu->lock);
>> @@ -1392,6 +1530,35 @@ static void vfio_sanity_check_pfn_list(struct vfio_iommu *iommu)
>> WARN_ON(iommu->notifier.head);
>> }
>>
>> +/*
>> + * Called when a domain is removed in detach. It is possible that
>> + * the removed domain decided the iova aperture window. Modify the
>> + * iova aperture with the smallest window among existing domains.
>> + */
>> +static void vfio_iommu_iova_aper_refresh(struct vfio_iommu *iommu)
>> +{
>> + struct vfio_domain *domain;
>> + struct iommu_domain_geometry geo;
>> + struct vfio_iova *node;
>> + phys_addr_t start = 0;
>> + phys_addr_t end = (phys_addr_t)~0;
>> +
>> + list_for_each_entry(domain, &iommu->domain_list, next) {
>> + iommu_domain_get_attr(domain->domain, DOMAIN_ATTR_GEOMETRY,
>> + &geo);
>> + if (geo.aperture_start > start)
>> + start = geo.aperture_start;
>> + if (geo.aperture_end < end)
>> + end = geo.aperture_end;
>> + }
>> +
>> + /* modify iova aperture limits */
>> + node = list_first_entry(&iommu->iova_list, struct vfio_iova, list);
>> + node->start = start;
>> + node = list_last_entry(&iommu->iova_list, struct vfio_iova, list);
>> + node->end = end;
>
> We can do this because the new aperture is the same or bigger than the
> current aperture, never smaller. That's not fully obvious and should
> be noted in the comment. Perhaps this function should be "expand"
> rather than "refresh".
This one is not obvious to me either:
assuming you have 2 domains, resp with aperture 1 and 2, resulting into
aperture 3. Holes are created by resv regions for instance. If you
remove domain 1, don't you get 4) instead of 2)?

1) |------------|
+
2) |---| |--| |-----|
=
3) |-| |--|


4) |---| |----------------|

Thanks

Eric
>
>> +}
>> +
>> static void vfio_iommu_type1_detach_group(void *iommu_data,
>> struct iommu_group *iommu_group)
>> {
>> @@ -1445,6 +1612,7 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
>> iommu_domain_free(domain->domain);
>> list_del(&domain->next);
>> kfree(domain);
>> + vfio_iommu_iova_aper_refresh(iommu);
>> }
>> break;
>> }
>> @@ -1475,6 +1643,7 @@ static void *vfio_iommu_type1_open(unsigned long arg)
>> }
>>
>> INIT_LIST_HEAD(&iommu->domain_list);
>> + INIT_LIST_HEAD(&iommu->iova_list);
>> iommu->dma_list = RB_ROOT;
>> mutex_init(&iommu->lock);
>> BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);
>> @@ -1502,6 +1671,7 @@ static void vfio_iommu_type1_release(void *iommu_data)
>> {
>> struct vfio_iommu *iommu = iommu_data;
>> struct vfio_domain *domain, *domain_tmp;
>> + struct vfio_iova *iova, *iova_tmp;
>>
>> if (iommu->external_domain) {
>> vfio_release_domain(iommu->external_domain, true);
>> @@ -1517,6 +1687,13 @@ static void vfio_iommu_type1_release(void *iommu_data)
>> list_del(&domain->next);
>> kfree(domain);
>> }
>> +
>> + list_for_each_entry_safe(iova, iova_tmp,
>> + &iommu->iova_list, list) {
>> + list_del(&iova->list);
>> + kfree(iova);
>> + }
>> +
>> kfree(iommu);
>> }
>>
>