Re: [PATCH RFC RESEND 5/6] iommu/riscv: support nested iommu for creating domains owned by userspace

From: Jason Gunthorpe
Date: Tue May 07 2024 - 11:07:21 EST


On Tue, May 07, 2024 at 10:25:59PM +0800, Zong Li wrote:
> This patch implements .domain_alloc_user operation for creating domains
> owned by userspace, e.g. through IOMMUFD. Add s2 domain for parent
> domain for second stage, s1 domain will be the first stage.
>
> Don't remove IOMMU private data of dev in blocked domain, because it
> holds the user data of device, which is used when attaching device into
> s1 domain.
>
> Signed-off-by: Zong Li <zong.li@xxxxxxxxxx>
> ---
> drivers/iommu/riscv/iommu.c | 227 ++++++++++++++++++++++++++++++++++-
> include/uapi/linux/iommufd.h | 17 +++
> 2 files changed, 243 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
> index 072251f6ad85..7eda850df475 100644
> --- a/drivers/iommu/riscv/iommu.c
> +++ b/drivers/iommu/riscv/iommu.c
> @@ -827,6 +827,7 @@ static int riscv_iommu_iodir_set_mode(struct riscv_iommu_device *iommu,
>
> /* This struct contains protection domain specific IOMMU driver data. */
> struct riscv_iommu_domain {
> + struct riscv_iommu_domain *s2;
> struct iommu_domain domain;
> struct list_head bonds;
> spinlock_t lock; /* protect bonds list updates. */
> @@ -844,6 +845,7 @@ struct riscv_iommu_domain {
> /* Private IOMMU data for managed devices, dev_iommu_priv_* */
> struct riscv_iommu_info {
> struct riscv_iommu_domain *domain;
> + struct riscv_iommu_dc dc_user;
> };
>
> /* Linkage between an iommu_domain and attached devices. */
> @@ -1454,7 +1456,6 @@ static int riscv_iommu_attach_blocking_domain(struct iommu_domain *iommu_domain,
>
> riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, 0, 0);
> riscv_iommu_bond_unlink(info->domain, dev);
> - info->domain = NULL;

Nope, whatever is going on here, this can't be correct.

Once a domain is detached the driver must drop all references to it
immediately.

> return 0;
> }
> @@ -1486,6 +1487,229 @@ static struct iommu_domain riscv_iommu_identity_domain = {
> }
> };
>
> +/**
> + * Nested IOMMU operations
> + */
> +
> +static int riscv_iommu_attach_dev_nested(struct iommu_domain *domain, struct device *dev)
> +{
> + struct riscv_iommu_domain *riscv_domain = iommu_domain_to_riscv(domain);
> + struct riscv_iommu_device *iommu = dev_to_iommu(dev);
> + struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
> +
> + if (riscv_domain->numa_node == NUMA_NO_NODE)
> + riscv_domain->numa_node = dev_to_node(iommu->dev);

Why? The kernel doesn't do any memory allocation for nested domains,
does it?

> + riscv_iommu_bond_unlink(info->domain, dev);
> +
> + if (riscv_iommu_bond_link(riscv_domain, dev))
> + return -ENOMEM;
> +
> + riscv_iommu_iotlb_inval(riscv_domain, 0, ULONG_MAX);

Why? The cache tags should be in good shape before they are assigned
to the device. Wiping it on every attach is just needless flushing
that isn't useful.

> + riscv_iommu_iodir_update(iommu, dev, info->dc_user.fsc, info->dc_user.ta,
> + info->dc_user.iohgatp);

This isn't ideal. Ideally the driver should make the transition from S2
to S2+S1 and back hitless. Wiping the valid bit as part of the
programming is not good.

As I said before, this driver probably needs the programming sequencer
from ARM.

> + info->domain = riscv_domain;
> +
> + return 0;
> +}
> +
> +static void riscv_iommu_domain_free_nested(struct iommu_domain *domain)
> +{
> + struct riscv_iommu_domain *riscv_domain = iommu_domain_to_riscv(domain);
> +
> + kfree(riscv_domain);
> +}

Doesn't the driver already have this function?
> +static struct iommu_domain *
> +riscv_iommu_domain_alloc_nested(struct device *dev,
> + struct iommu_domain *parent,
> + const struct iommu_user_data *user_data)
> +{
> + struct riscv_iommu_domain *s2_domain = iommu_domain_to_riscv(parent);
> + struct riscv_iommu_domain *s1_domain;
> + struct riscv_iommu_device *iommu = dev_to_iommu(dev);
> + struct iommu_hwpt_riscv_iommu arg;
> + int ret, va_bits;
> +
> + if (user_data->type != IOMMU_HWPT_DATA_RISCV_IOMMU)
> + return ERR_PTR(-EOPNOTSUPP);
> +
> + if (parent->type != IOMMU_DOMAIN_UNMANAGED)
> + return ERR_PTR(-EINVAL);
> +
> + ret = iommu_copy_struct_from_user(&arg,
> + user_data,
> + IOMMU_HWPT_DATA_RISCV_IOMMU,
> + out_event_uptr);
> + if (ret)
> + return ERR_PTR(ret);
> +
> + s1_domain = kzalloc(sizeof(*s1_domain), GFP_KERNEL);
> + if (!s1_domain)
> + return ERR_PTR(-ENOMEM);
> +
> + spin_lock_init(&s1_domain->lock);
> + INIT_LIST_HEAD_RCU(&s1_domain->bonds);
> +
> + s1_domain->pscid = ida_alloc_range(&riscv_iommu_pscids, 1,
> + RISCV_IOMMU_MAX_PSCID, GFP_KERNEL);
> + if (s1_domain->pscid < 0) {
> + iommu_free_page(s1_domain->pgd_root);
> + kfree(s1_domain);
> + return ERR_PTR(-ENOMEM);
> + }
> +
> + /* Get device context of stage-1 from user*/
> + ret = riscv_iommu_get_dc_user(dev, &arg);
> + if (ret) {
> + kfree(s1_domain);
> + return ERR_PTR(-EINVAL);
> + }
> +
> + if (!iommu) {
> + va_bits = VA_BITS;
> + } else if (iommu->caps & RISCV_IOMMU_CAP_S_SV57) {
> + va_bits = 57;
> + } else if (iommu->caps & RISCV_IOMMU_CAP_S_SV48) {
> + va_bits = 48;
> + } else if (iommu->caps & RISCV_IOMMU_CAP_S_SV39) {
> + va_bits = 39;
> + } else {
> + dev_err(dev, "cannot find supported page table mode\n");
> + return ERR_PTR(-ENODEV);
> + }
> +
> + /*
> + * The ops->domain_alloc_user could be directly called by the iommufd core,
> + * instead of iommu core. So, this function need to set the default value of
> + * following data member:
> + * - domain->pgsize_bitmap
> + * - domain->geometry
> + * - domain->type
> + * - domain->ops
> + */
> + s1_domain->s2 = s2_domain;
> + s1_domain->domain.type = IOMMU_DOMAIN_NESTED;
> + s1_domain->domain.ops = &riscv_iommu_nested_domain_ops;

> + s1_domain->domain.pgsize_bitmap = SZ_4K;
> + s1_domain->domain.geometry.aperture_start = 0;
> + s1_domain->domain.geometry.aperture_end = DMA_BIT_MASK(va_bits - 1);
> + s1_domain->domain.geometry.force_aperture = true;

nested domains don't support paging/map so they don't use any of
this. Don't set it. Will remove the confusing va_bit stuff.


> +static struct iommu_domain *
> +riscv_iommu_domain_alloc_user(struct device *dev, u32 flags,
> + struct iommu_domain *parent,
> + const struct iommu_user_data *user_data)
> +{
> + struct iommu_domain *domain;
> + struct riscv_iommu_domain *riscv_domain;
> +
> + /* Allocate stage-1 domain if it has stage-2 parent domain */
> + if (parent)
> + return riscv_iommu_domain_alloc_nested(dev, parent, user_data);
> +
> + if (flags & ~((IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
> + return ERR_PTR(-EOPNOTSUPP);

This if should be moved up to the top and this driver does not support
dirty_tracking, so don't test it.

> + if (user_data)
> + return ERR_PTR(-EINVAL);

Really? Don't you need to ask for the s2?

> + /* domain_alloc_user op needs to be fully initialized */
> + domain = iommu_domain_alloc(dev->bus);

Please no, structure your driver properly so you can call only
internal functions. This was a hack for intel only.

> + if (!domain)
> + return ERR_PTR(-ENOMEM);
> +
> + /*
> + * We assume that nest-parent or g-stage only will come here

No... That isn't right. Normal non-virtualization paging domains come
here too.

The default of this function, with an empty user_data, should be to
create the same domain as alloc_domain_paging.


> + * TODO: Shadow page table doesn't be supported now.
> + * We currently can't distinguish g-stage and shadow
> + * page table here. Shadow page table shouldn't be
> + * put at stage-2.
> + */
> + riscv_domain = iommu_domain_to_riscv(domain);

What is a shadow page table?

> + /* pgd_root may be allocated in .domain_alloc_paging */
> + if (riscv_domain->pgd_root)
> + iommu_free_page(riscv_domain->pgd_root);

And this is gross, structure your driver right so you don't have to
undo what prior functions did.

> + riscv_domain->pgd_root = iommu_alloc_pages_node(riscv_domain->numa_node,
> + GFP_KERNEL_ACCOUNT,
> + 2);
> + if (!riscv_domain->pgd_root)
> + return ERR_PTR(-ENOMEM);
> +
> + riscv_domain->gscid = ida_alloc_range(&riscv_iommu_gscids, 1,
> + RISCV_IOMMU_MAX_GSCID, GFP_KERNEL);
> + if (riscv_domain->gscid < 0) {
> + iommu_free_pages(riscv_domain->pgd_root, 2);
> + kfree(riscv_domain);
> + return ERR_PTR(-ENOMEM);
> + }
> +
> + return domain;
> +}
> +
> static void *riscv_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
> {
> struct riscv_iommu_device *iommu = dev_to_iommu(dev);
> @@ -1587,6 +1811,7 @@ static const struct iommu_ops riscv_iommu_ops = {
> .blocked_domain = &riscv_iommu_blocking_domain,
> .release_domain = &riscv_iommu_blocking_domain,
> .domain_alloc_paging = riscv_iommu_alloc_paging_domain,
> + .domain_alloc_user = riscv_iommu_domain_alloc_user,
> .def_domain_type = riscv_iommu_device_domain_type,
> .device_group = riscv_iommu_device_group,
> .probe_device = riscv_iommu_probe_device,
> diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
> index ec9aafd7d373..e10b6e236647 100644
> --- a/include/uapi/linux/iommufd.h
> +++ b/include/uapi/linux/iommufd.h
> @@ -390,14 +390,31 @@ struct iommu_hwpt_vtd_s1 {
> __u32 __reserved;
> };
>
> +/**
> + * struct iommu_hwpt_riscv_iommu - RISCV IOMMU stage-1 device context table
> + * info (IOMMU_HWPT_TYPE_RISCV_IOMMU)
> + * @dc_len: Length of device context
> + * @dc_uptr: User pointer to the address of device context
> + * @event_len: Length of an event record
> + * @out_event_uptr: User pointer to the address of event record
> + */
> +struct iommu_hwpt_riscv_iommu {
> + __aligned_u64 dc_len;
> + __aligned_u64 dc_uptr;

So far other drivers have been inlining their "device context" in this
struct, why do you need a pointer and length?

> + __aligned_u64 event_len;
> + __aligned_u64 out_event_uptr;

Weird? Why?

Jason