Re: [PATCH] iommu/arm-smmu-v3: add nr_ats_masters to avoid unnecessary operations

From: John Garry
Date: Mon Aug 12 2019 - 06:42:33 EST


On 01/08/2019 13:20, Zhen Lei wrote:
When (smmu_domain->smmu->features & ARM_SMMU_FEAT_ATS) is true, even if a
smmu domain does not contain any ats master, the operations of
arm_smmu_atc_inv_to_cmd() and lock protection in arm_smmu_atc_inv_domain()
are always executed. This will impact performance, especially in
multi-core and stress scenarios. For my FIO test scenario, about 8%
performance reduced.

In fact, we can use a atomic member to record how many ats masters the
smmu contains. And check that without traverse the list and check all
masters one by one in the lock protection.


Hi Will, Robin, Jean-Philippe,

Can you kindly check this issue? We have seen a signifigant performance regression here.

Thanks!

Fixes: 9ce27afc0830 ("iommu/arm-smmu-v3: Add support for PCI ATS")
Signed-off-by: Zhen Lei <thunder.leizhen@xxxxxxxxxx>
---
drivers/iommu/arm-smmu-v3.c | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index a9a9fabd396804a..1b370d9aca95f94 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -631,6 +631,7 @@ struct arm_smmu_domain {

struct io_pgtable_ops *pgtbl_ops;
bool non_strict;
+ atomic_t nr_ats_masters;


It's not ideal to keep a separate count of ats masters...hmmm


enum arm_smmu_domain_stage stage;
union {
@@ -1531,7 +1532,7 @@ static int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain,
struct arm_smmu_cmdq_ent cmd;
struct arm_smmu_master *master;

- if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_ATS))
+ if (!atomic_read(&smmu_domain->nr_ats_masters))
return 0;

The rest of the code is here:

arm_smmu_atc_inv_to_cmd(ssid, iova, size, &cmd);

spin_lock_irqsave(&smmu_domain->devices_lock, flags);
list_for_each_entry(master, &smmu_domain->devices, domain_head)
ret |= arm_smmu_atc_inv_master(master, &cmd);
spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);

return ret ? -ETIMEDOUT : 0;
}

Not directly related to leizhen's issue: Could RCU protection be used for this list iteration? I can't imagine that the devices list changes often. And also we already protect the cmdq in arm_smmu_atc_inv_master().


arm_smmu_atc_inv_to_cmd(ssid, iova, size, &cmd);
@@ -1869,6 +1870,7 @@ static int arm_smmu_enable_ats(struct arm_smmu_master *master)
size_t stu;
struct pci_dev *pdev;
struct arm_smmu_device *smmu = master->smmu;
+ struct arm_smmu_domain *smmu_domain = master->domain;
struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(master->dev);

if (!(smmu->features & ARM_SMMU_FEAT_ATS) || !dev_is_pci(master->dev) ||
@@ -1887,12 +1889,15 @@ static int arm_smmu_enable_ats(struct arm_smmu_master *master)
return ret;

master->ats_enabled = true;
+ atomic_inc(&smmu_domain->nr_ats_masters);
+
return 0;
}

static void arm_smmu_disable_ats(struct arm_smmu_master *master)
{
struct arm_smmu_cmdq_ent cmd;
+ struct arm_smmu_domain *smmu_domain = master->domain;

if (!master->ats_enabled || !dev_is_pci(master->dev))
return;
@@ -1901,6 +1906,7 @@ static void arm_smmu_disable_ats(struct arm_smmu_master *master)
arm_smmu_atc_inv_master(master, &cmd);
pci_disable_ats(to_pci_dev(master->dev));
master->ats_enabled = false;
+ atomic_dec(&smmu_domain->nr_ats_masters);
}

static void arm_smmu_detach_dev(struct arm_smmu_master *master)
@@ -1915,10 +1921,10 @@ static void arm_smmu_detach_dev(struct arm_smmu_master *master)
list_del(&master->domain_head);
spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);

- master->domain = NULL;
arm_smmu_install_ste_for_dev(master);

arm_smmu_disable_ats(master);
+ master->domain = NULL;
}

static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)