Re: [PATCH]iommu-iotlb-flushing

From: mark gross
Date: Fri Feb 29 2008 - 18:13:47 EST


On Sat, Feb 23, 2008 at 12:05:17AM -0800, Andrew Morton wrote:
> On Wed, 20 Feb 2008 16:06:23 -0800 mark gross <mgross@xxxxxxxxxxxxxxx> wrote:
>
> > The following patch is for batching up the flushing of the IOTLB for
> > the DMAR implementation found in the Intel VT-d hardware. It works by
> > building a list of to be flushed IOTLB entries and a bitmap list of
> > which DMAR engine they are from.
> >
> > After either a high water mark (250 accessible via debugfs) or 10ms the
> > list of iova's will be reclaimed and the DMAR engines associated are
> > IOTLB-flushed.
> >
> > This approach recovers 15 to 20% of the performance lost when using the
> > IOMMU for my netperf udp stream benchmark with small packets. It can be
> > disabled with a kernel boot parameter
> > "intel_iommu=strict".
> >
> > Its use does weaken the IOMMU protections a bit.
> >
> > I would like to see this go into MM for a while and then onto mainline.
> >
> > ...
> >
> > +static struct timer_list unmap_timer =
> > + TIMER_INITIALIZER(flush_unmaps_timeout, 0, 0);
>
> Could use DEFINE_TIMER here.

fixed.

>
> > +struct unmap_list {
> > + struct list_head list;
> > + struct dmar_domain *domain;
> > + struct iova *iova;
> > +};
>
> unmap_list doens't seem to be used anywhere?

fixed.

>
> > +static struct intel_iommu *g_iommus;
> > +/* bitmap for indexing intel_iommus */
> > +static unsigned long *g_iommus_to_flush;
> > +static int g_num_of_iommus;
> > +
> > +static DEFINE_SPINLOCK(async_umap_flush_lock);
> > +static LIST_HEAD(unmaps_to_do);
> > +
> > +static int timer_on;
> > +static long list_size;
> > +static int high_watermark;
> > +
> > +static struct dentry *intel_iommu_debug, *debug;
> > +
> > +
> > static void domain_remove_dev_info(struct dmar_domain *domain);
> >
> > static int dmar_disabled;
> > static int __initdata dmar_map_gfx = 1;
> > static int dmar_forcedac;
> > +static int intel_iommu_strict;
> >
> > #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
> > static DEFINE_SPINLOCK(device_domain_lock);
> > @@ -73,9 +103,13 @@
> > printk(KERN_INFO
> > "Intel-IOMMU: disable GFX device mapping\n");
> > } else if (!strncmp(str, "forcedac", 8)) {
> > - printk (KERN_INFO
> > + printk(KERN_INFO
> > "Intel-IOMMU: Forcing DAC for PCI devices\n");
> > dmar_forcedac = 1;
> > + } else if (!strncmp(str, "strict", 8)) {
>
> s/8/6/

fixed.

>
> > + printk(KERN_INFO
> > + "Intel-IOMMU: disable batched IOTLB flush\n");
> > + intel_iommu_strict = 1;
> > }
> >
> > str += strcspn(str, ",");
> > @@ -965,17 +999,13 @@
> > set_bit(0, iommu->domain_ids);
> > return 0;
> > }
> > -
> > -static struct intel_iommu *alloc_iommu(struct dmar_drhd_unit *drhd)
> > +static struct intel_iommu *alloc_iommu(struct intel_iommu *iommu,
> > + struct dmar_drhd_unit *drhd)
> > {
> > - struct intel_iommu *iommu;
> > int ret;
> > int map_size;
> > u32 ver;
> >
> > - iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
> > - if (!iommu)
> > - return NULL;
> > iommu->reg = ioremap(drhd->reg_base_addr, PAGE_SIZE_4K);
> > if (!iommu->reg) {
> > printk(KERN_ERR "IOMMU: can't map the region\n");
> > @@ -1396,7 +1426,7 @@
> > int index;
> >
> > while (dev) {
> > - for (index = 0; index < cnt; index ++)
> > + for (index = 0; index < cnt; index++)
> > if (dev == devices[index])
> > return 1;
> >
> > @@ -1661,7 +1691,7 @@
> > struct dmar_rmrr_unit *rmrr;
> > struct pci_dev *pdev;
> > struct intel_iommu *iommu;
> > - int ret, unit = 0;
> > + int nlongs, i, ret, unit = 0;
> >
> > /*
> > * for each drhd
> > @@ -1672,7 +1702,30 @@
> > for_each_drhd_unit(drhd) {
> > if (drhd->ignored)
> > continue;
> > - iommu = alloc_iommu(drhd);
> > + g_num_of_iommus++;
>
> No locking needed for g_num_of_iommus?

Yup, not needed. Only the single threaded __init path increments this
and after that only read accesses happen.

>
> > + }
> > +
> > + nlongs = BITS_TO_LONGS(g_num_of_iommus);
>
> Would this code be neater if it used the <linux/bitmap.h> stuff?
>

I just looked, I don't think bitmap.h helps me with my bitmap use in
this module.


> > + g_iommus_to_flush = kzalloc(nlongs * sizeof(unsigned long), GFP_KERNEL);
> > + if (!g_iommus_to_flush) {
> > + printk(KERN_ERR "Intel-IOMMU: "
> > + "Allocating bitmap array failed\n");
> > + return -ENOMEM;
>
> Are you sure we aren't leaking anything here? Like the alloc_iommu() above?
>

Fixed. Was leaking the g_iommus_to_flush on the error path.


> > + }
> > +
> > + g_iommus = kzalloc(g_num_of_iommus * sizeof(*iommu), GFP_KERNEL);
> > + if (!g_iommus) {
> > + kfree(g_iommus_to_flush);
> > + ret = -ENOMEM;
> > + goto error;
> > + }
> > +
> > + i = 0;
> > + for_each_drhd_unit(drhd) {
> > + if (drhd->ignored)
> > + continue;
> > + iommu = alloc_iommu(&g_iommus[i], drhd);
> > + i++;
> > if (!iommu) {
> > ret = -ENOMEM;
> > goto error;
> > @@ -1705,7 +1758,6 @@
> > * endfor
> > */
> > for_each_rmrr_units(rmrr) {
> > - int i;
> > for (i = 0; i < rmrr->devices_cnt; i++) {
> > pdev = rmrr->devices[i];
> > /* some BIOS lists non-exist devices in DMAR table */
> > @@ -1909,6 +1961,54 @@
> > return 0;
> > }
> >
> > +static void flush_unmaps(void)
> > +{
> > + struct iova *node, *n;
> > + unsigned long flags;
> > + int i;
> > +
> > + spin_lock_irqsave(&async_umap_flush_lock, flags);
> > + timer_on = 0;
> > +
> > + /*just flush them all*/
>
> I'm surprised that checkpatch didn't grump about the odd commenting style.
>

fixed.

> > + for (i = 0; i < g_num_of_iommus; i++) {
> > + if (test_and_clear_bit(i, g_iommus_to_flush))
> > + iommu_flush_iotlb_global(&g_iommus[i], 0);
> > + }
> > +
> > + list_for_each_entry_safe(node, n, &unmaps_to_do, list) {
> > + /* free iova */
> > + list_del(&node->list);
> > + __free_iova(&((struct dmar_domain *)node->dmar)->iovad, node);
> > +
> > + }
> > + list_size = 0;
> > + spin_unlock_irqrestore(&async_umap_flush_lock, flags);
> > +}
> > +
> > +static void flush_unmaps_timeout(unsigned long data)
> > +{
> > + flush_unmaps();
> > +}
> > +
> > +static void add_unmap(struct dmar_domain *dom, struct iova *iova)
> > +{
> > + unsigned long flags;
> > +
> > + spin_lock_irqsave(&async_umap_flush_lock, flags);
>
> How scalable is this?

It's not very scalable, but I'm doing an insert walk of a list and need the locks.
On the bright side, it's way better than spin locking and polling the
IOTLB bit until it clears on every unmap.

>
> > + iova->dmar = dom;
> > + list_add(&iova->list, &unmaps_to_do);
> > + set_bit((dom->iommu - g_iommus), g_iommus_to_flush);
> > +
> > + if (!timer_on) {
> > + unmap_timer.expires = jiffies + msecs_to_jiffies(10);
> > + mod_timer(&unmap_timer, unmap_timer.expires);
>
> No, this modifies unmap_timer.expires twice. Might be racy too. You want
>
> mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
>

Fixed.

> > + timer_on = 1;
> > + }
> > + list_size++;
> > + spin_unlock_irqrestore(&async_umap_flush_lock, flags);
> > +}
> > +
> > static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
> > size_t size, int dir)
> > {
> > @@ -1936,13 +2036,21 @@
> > dma_pte_clear_range(domain, start_addr, start_addr + size);
> > /* free page tables */
> > dma_pte_free_pagetable(domain, start_addr, start_addr + size);
> > -
> > - if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
> > - size >> PAGE_SHIFT_4K, 0))
> > - iommu_flush_write_buffer(domain->iommu);
> > -
> > - /* free iova */
> > - __free_iova(&domain->iovad, iova);
> > + if (intel_iommu_strict) {
> > + if (iommu_flush_iotlb_psi(domain->iommu,
> > + domain->id, start_addr, size >> PAGE_SHIFT_4K, 0))
> > + iommu_flush_write_buffer(domain->iommu);
> > + /* free iova */
> > + __free_iova(&domain->iovad, iova);
> > + } else {
> > + add_unmap(domain, iova);
> > + /*
> > + * queue up the release of the unmap to save the 1/6th of the
> > + * cpu used up by the iotlb flush operation...
> > + */
> > + if (list_size > high_watermark)
> > + flush_unmaps();
> > + }
> > }
> >
> > static void * intel_alloc_coherent(struct device *hwdev, size_t size,
> > @@ -2266,6 +2374,10 @@
> > if (dmar_table_init())
> > return -ENODEV;
> >
> > + high_watermark = 250;
> > + intel_iommu_debug = debugfs_create_dir("intel_iommu", NULL);
> > + debug = debugfs_create_u32("high_watermark", S_IWUGO | S_IRUGO,
> > + intel_iommu_debug, &high_watermark);
> > iommu_init_mempool();
> > dmar_init_reserved_ranges();
> >
> > @@ -2281,6 +2393,7 @@
> > printk(KERN_INFO
> > "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
> >
> > + init_timer(&unmap_timer);
>
> I see timers being added, but I see no del_timer_sync()s added on cleanup
> paths. Are you sure that we don't have races on various teardown paths?
>

The only error path to delete the single timer created is at the bottom
of intel_iommu_init. I don't think there is any error clean up path
where the timer would need to be deleted.

> > force_iommu = 1;
> > dma_ops = &intel_dma_ops;
> > return 0;
> > Index: linux-2.6.24-mm1/drivers/pci/iova.h
> > ===================================================================
> > --- linux-2.6.24-mm1.orig/drivers/pci/iova.h 2008-02-12 07:12:06.000000000 -0800
> > +++ linux-2.6.24-mm1/drivers/pci/iova.h 2008-02-12 07:39:53.000000000 -0800
> > @@ -23,6 +23,8 @@
> > struct rb_node node;
> > unsigned long pfn_hi; /* IOMMU dish out addr hi */
> > unsigned long pfn_lo; /* IOMMU dish out addr lo */
> > + struct list_head list;
> > + void *dmar;
> > };
> >
> > /* holds all the iova translations for a domain */
> > Index: linux-2.6.24-mm1/Documentation/kernel-parameters.txt
> > ===================================================================
> > --- linux-2.6.24-mm1.orig/Documentation/kernel-parameters.txt 2008-02-12 07:12:06.000000000 -0800
> > +++ linux-2.6.24-mm1/Documentation/kernel-parameters.txt 2008-02-13 11:17:22.000000000 -0800
> > @@ -822,6 +822,10 @@
> > than 32 bit addressing. The default is to look
> > for translation below 32 bit and if not available
> > then look in the higher range.
> > + strict [Default Off]
> > + With this option on every unmap_single operation will
> > + result in a hardware IOTLB flush operation as opposed
> > + to batching them for performance.
>
> boot-time options suck. Is it not possible to tweak this at runtime?

Yes, I could easily create a sysfs or debugfs mechanism for turning it
on / off at run time. I would like input on the preferred way to do this.
sysfs or debugfs?


Thanks again for your review on this. The following is the updated
patch.

--mgross

Signed-off-by: <mgross@xxxxxxxxxxxxxxx>



Index: linux-2.6.25-rc2-mm1/drivers/pci/intel-iommu.c
===================================================================
--- linux-2.6.25-rc2-mm1.orig/drivers/pci/intel-iommu.c 2008-02-26 11:15:46.000000000 -0800
+++ linux-2.6.25-rc2-mm1/drivers/pci/intel-iommu.c 2008-02-29 14:36:55.000000000 -0800
@@ -21,6 +21,7 @@

#include <linux/init.h>
#include <linux/bitmap.h>
+#include <linux/debugfs.h>
#include <linux/slab.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
@@ -30,6 +31,7 @@
#include <linux/dmar.h>
#include <linux/dma-mapping.h>
#include <linux/mempool.h>
+#include <linux/timer.h>
#include "iova.h"
#include "intel-iommu.h"
#include <asm/proto.h> /* force_iommu in this header in x86-64*/
@@ -50,11 +52,32 @@

#define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)

+
+static void flush_unmaps_timeout(unsigned long data);
+
+DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);
+
+static struct intel_iommu *g_iommus;
+/* bitmap for indexing intel_iommus */
+static unsigned long *g_iommus_to_flush;
+static int g_num_of_iommus;
+
+static DEFINE_SPINLOCK(async_umap_flush_lock);
+static LIST_HEAD(unmaps_to_do);
+
+static int timer_on;
+static long list_size;
+static int high_watermark;
+
+static struct dentry *intel_iommu_debug, *debug;
+
+
static void domain_remove_dev_info(struct dmar_domain *domain);

static int dmar_disabled;
static int __initdata dmar_map_gfx = 1;
static int dmar_forcedac;
+static int intel_iommu_strict;

#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
static DEFINE_SPINLOCK(device_domain_lock);
@@ -73,9 +96,13 @@
printk(KERN_INFO
"Intel-IOMMU: disable GFX device mapping\n");
} else if (!strncmp(str, "forcedac", 8)) {
- printk (KERN_INFO
+ printk(KERN_INFO
"Intel-IOMMU: Forcing DAC for PCI devices\n");
dmar_forcedac = 1;
+ } else if (!strncmp(str, "strict", 6)) {
+ printk(KERN_INFO
+ "Intel-IOMMU: disable batched IOTLB flush\n");
+ intel_iommu_strict = 1;
}

str += strcspn(str, ",");
@@ -965,17 +992,13 @@
set_bit(0, iommu->domain_ids);
return 0;
}
-
-static struct intel_iommu *alloc_iommu(struct dmar_drhd_unit *drhd)
+static struct intel_iommu *alloc_iommu(struct intel_iommu *iommu,
+ struct dmar_drhd_unit *drhd)
{
- struct intel_iommu *iommu;
int ret;
int map_size;
u32 ver;

- iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
- if (!iommu)
- return NULL;
iommu->reg = ioremap(drhd->reg_base_addr, PAGE_SIZE_4K);
if (!iommu->reg) {
printk(KERN_ERR "IOMMU: can't map the region\n");
@@ -1396,7 +1419,7 @@
int index;

while (dev) {
- for (index = 0; index < cnt; index ++)
+ for (index = 0; index < cnt; index++)
if (dev == devices[index])
return 1;

@@ -1661,7 +1684,7 @@
struct dmar_rmrr_unit *rmrr;
struct pci_dev *pdev;
struct intel_iommu *iommu;
- int ret, unit = 0;
+ int nlongs, i, ret, unit = 0;

/*
* for each drhd
@@ -1672,7 +1695,35 @@
for_each_drhd_unit(drhd) {
if (drhd->ignored)
continue;
- iommu = alloc_iommu(drhd);
+ g_num_of_iommus++;
+ /*
+ * lock not needed as this is only incremented in the single
+ * threaded kernel __init code path all other access are read
+ * only
+ */
+ }
+
+ nlongs = BITS_TO_LONGS(g_num_of_iommus);
+ g_iommus_to_flush = kzalloc(nlongs * sizeof(unsigned long), GFP_KERNEL);
+ if (!g_iommus_to_flush) {
+ printk(KERN_ERR "Intel-IOMMU: "
+ "Allocating bitmap array failed\n");
+ return -ENOMEM;
+ }
+
+ g_iommus = kzalloc(g_num_of_iommus * sizeof(*iommu), GFP_KERNEL);
+ if (!g_iommus) {
+ kfree(g_iommus_to_flush);
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ i = 0;
+ for_each_drhd_unit(drhd) {
+ if (drhd->ignored)
+ continue;
+ iommu = alloc_iommu(&g_iommus[i], drhd);
+ i++;
if (!iommu) {
ret = -ENOMEM;
goto error;
@@ -1705,7 +1756,6 @@
* endfor
*/
for_each_rmrr_units(rmrr) {
- int i;
for (i = 0; i < rmrr->devices_cnt; i++) {
pdev = rmrr->devices[i];
/* some BIOS lists non-exist devices in DMAR table */
@@ -1761,6 +1811,7 @@
iommu = drhd->iommu;
free_iommu(iommu);
}
+ kfree(g_iommus);
return ret;
}

@@ -1909,6 +1960,53 @@
return 0;
}

+static void flush_unmaps(void)
+{
+ struct iova *node, *n;
+ unsigned long flags;
+ int i;
+
+ spin_lock_irqsave(&async_umap_flush_lock, flags);
+ timer_on = 0;
+
+ /* just flush them all */
+ for (i = 0; i < g_num_of_iommus; i++) {
+ if (test_and_clear_bit(i, g_iommus_to_flush))
+ iommu_flush_iotlb_global(&g_iommus[i], 0);
+ }
+
+ list_for_each_entry_safe(node, n, &unmaps_to_do, list) {
+ /* free iova */
+ list_del(&node->list);
+ __free_iova(&((struct dmar_domain *)node->dmar)->iovad, node);
+
+ }
+ list_size = 0;
+ spin_unlock_irqrestore(&async_umap_flush_lock, flags);
+}
+
+static void flush_unmaps_timeout(unsigned long data)
+{
+ flush_unmaps();
+}
+
+static void add_unmap(struct dmar_domain *dom, struct iova *iova)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&async_umap_flush_lock, flags);
+ iova->dmar = dom;
+ list_add(&iova->list, &unmaps_to_do);
+ set_bit((dom->iommu - g_iommus), g_iommus_to_flush);
+
+ if (!timer_on) {
+ mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
+ timer_on = 1;
+ }
+ list_size++;
+ spin_unlock_irqrestore(&async_umap_flush_lock, flags);
+}
+
static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
size_t size, int dir)
{
@@ -1936,13 +2034,21 @@
dma_pte_clear_range(domain, start_addr, start_addr + size);
/* free page tables */
dma_pte_free_pagetable(domain, start_addr, start_addr + size);
-
- if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
- size >> PAGE_SHIFT_4K, 0))
- iommu_flush_write_buffer(domain->iommu);
-
- /* free iova */
- __free_iova(&domain->iovad, iova);
+ if (intel_iommu_strict) {
+ if (iommu_flush_iotlb_psi(domain->iommu,
+ domain->id, start_addr, size >> PAGE_SHIFT_4K, 0))
+ iommu_flush_write_buffer(domain->iommu);
+ /* free iova */
+ __free_iova(&domain->iovad, iova);
+ } else {
+ add_unmap(domain, iova);
+ /*
+ * queue up the release of the unmap to save the 1/6th of the
+ * cpu used up by the iotlb flush operation...
+ */
+ if (list_size > high_watermark)
+ flush_unmaps();
+ }
}

static void * intel_alloc_coherent(struct device *hwdev, size_t size,
@@ -2266,6 +2372,10 @@
if (dmar_table_init())
return -ENODEV;

+ high_watermark = 250;
+ intel_iommu_debug = debugfs_create_dir("intel_iommu", NULL);
+ debug = debugfs_create_u32("high_watermark", S_IWUGO | S_IRUGO,
+ intel_iommu_debug, &high_watermark);
iommu_init_mempool();
dmar_init_reserved_ranges();

@@ -2281,6 +2391,7 @@
printk(KERN_INFO
"PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");

+ init_timer(&unmap_timer);
force_iommu = 1;
dma_ops = &intel_dma_ops;
return 0;
Index: linux-2.6.25-rc2-mm1/drivers/pci/iova.h
===================================================================
--- linux-2.6.25-rc2-mm1.orig/drivers/pci/iova.h 2008-02-26 11:15:46.000000000 -0800
+++ linux-2.6.25-rc2-mm1/drivers/pci/iova.h 2008-02-29 14:11:52.000000000 -0800
@@ -23,6 +23,8 @@
struct rb_node node;
unsigned long pfn_hi; /* IOMMU dish out addr hi */
unsigned long pfn_lo; /* IOMMU dish out addr lo */
+ struct list_head list;
+ void *dmar;
};

/* holds all the iova translations for a domain */
Index: linux-2.6.25-rc2-mm1/Documentation/kernel-parameters.txt
===================================================================
--- linux-2.6.25-rc2-mm1.orig/Documentation/kernel-parameters.txt 2008-02-26 11:16:01.000000000 -0800
+++ linux-2.6.25-rc2-mm1/Documentation/kernel-parameters.txt 2008-02-26 13:44:06.000000000 -0800
@@ -830,6 +830,10 @@
than 32 bit addressing. The default is to look
for translation below 32 bit and if not available
then look in the higher range.
+ strict [Default Off]
+ With this option on every unmap_single operation will
+ result in a hardware IOTLB flush operation as opposed
+ to batching them for performance.

io_delay= [X86-32,X86-64] I/O delay method
0x80
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/