[RFC v1 3/4] swiotlb: Allow dynamic allocation of bounce buffers

From: Petr Tesarik
Date: Mon Mar 20 2023 - 08:29:58 EST


From: Petr Tesarik <petr.tesarik.ext@xxxxxxxxxx>

The software IO TLB was designed with the assumption that it is not
used much, especially on 64-bit systems, so a small fixed memory
area (currently 64 MiB) is sufficient to handle the few cases which
still require a bounce buffer. However, these cases are not so rare
in some circumstances.

First, if SEV is active, all DMA must be done through shared
unencrypted pages, and SWIOTLB is used to make this happen without
changing device drivers. The software IO TLB size is increased to
6% of total memory in sev_setup_arch(), but that is more of an
approximation. The actual requirements may vary depending on which
drivers are used and the amount of I/O.

Second, on the Raspberry Pi 4, swiotlb is used by dma-buf for pages
moved from the rendering GPU (v3d driver), which can access all
memory, to the display output (vc4 driver), which is connected to a
bus with an address limit of 1 GiB and no IOMMU. These buffers can
be large (several megabytes) and cannot be handled by SWIOTLB,
because they exceed the maximum segment size of 256 KiB. Such mapping
failures can be easily reproduced on a Raspberry Pi4: Starting
GNOME remote desktop results in a flood of kernel messages like
these:

[ 387.937625] vc4-drm gpu: swiotlb buffer is full (sz: 524288 bytes), total 32768 (slots), used 3136 (slots)
[ 387.960381] vc4-drm gpu: swiotlb buffer is full (sz: 815104 bytes), total 32768 (slots), used 2 (slots)

This second example cannot even be solved without increasing the
segment size (and the complexity of {map,unmap}_single size). At
that point, it's better to allocate bounce buffers dynamically with
dma_direct_alloc_pages().

One caveat is that the DMA API often takes only the address of a
buffer, and the implementation (direct or IOMMU) checks whether it
belongs to the software IO TLB. This is easy if the IO TLB is a
single chunk of physically contiguous memory, but not if some
buffers are allocated dynamically. Testing on a Raspberry Pi 4
shows that there can be 1k+ such buffers. This requires something
better than a linked list. I'm using a maple tree to track
dynamically allocated buffers. This data structure was invented for
a similar use case, but there are some challenges:

1. The value is limited to ULONG_MAX, which is too little both for
physical addresses (e.g. x86 PAE or 32-bit ARM LPAE) and DMA
addresses (e.g. Xen guests on 32-bit ARM).

2. Since buffers are currently allocated with page granularity, a
PFN can be used instead. However, some values are reserved by
the maple tree implementation. Liam suggests using
xa_mk_value() in that case, but that reduces the usable range by
half. Luckily, 31 bits are still enough to hold a PFN on all
32-bit platforms.

3. Software IO TLB is used from interrupt context. The maple tree
implementation is not IRQ-safe (MT_FLAGS_LOCK_IRQ does nothing
AFAICS). Instead, I use an external lock, spin_lock_irqsave() and
spin_unlock_irqrestore().

Note that bounce buffers are never allocated dynamically if the
software IO TLB is in fact a DMA restricted pool, which is intended
to stay in its designated location in physical memory.

Signed-off-by: Petr Tesarik <petr.tesarik.ext@xxxxxxxxxx>
---
include/linux/swiotlb.h | 11 ++-
kernel/dma/swiotlb.c | 156 +++++++++++++++++++++++++++++++++++++---
2 files changed, 157 insertions(+), 10 deletions(-)

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index b71adba03dc7..0ef27d6491b9 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -7,6 +7,7 @@
#include <linux/init.h>
#include <linux/types.h>
#include <linux/limits.h>
+#include <linux/maple_tree.h>
#include <linux/spinlock.h>

struct device;
@@ -87,6 +88,8 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t phys,
* @for_alloc: %true if the pool is used for memory allocation
* @nareas: The area number in the pool.
* @area_nslabs: The slot number in the area.
+ * @dyn_lock: Protect dynamically allocated slots.
+ * @dyn_slots: Dynamically allocated slots.
*/
struct io_tlb_mem {
phys_addr_t start;
@@ -102,9 +105,13 @@ struct io_tlb_mem {
unsigned int area_nslabs;
struct io_tlb_area *areas;
struct io_tlb_slot *slots;
+ spinlock_t dyn_lock;
+ struct maple_tree dyn_slots;
};
extern struct io_tlb_mem io_tlb_default_mem;

+bool is_swiotlb_dyn(struct io_tlb_mem *mem, phys_addr_t paddr);
+
static inline bool is_swiotlb_fixed(struct io_tlb_mem *mem, phys_addr_t paddr)
{
return paddr >= mem->start && paddr < mem->end;
@@ -114,7 +121,9 @@ static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr)
{
struct io_tlb_mem *mem = dev->dma_io_tlb_mem;

- return mem && is_swiotlb_fixed(mem, paddr);
+ return mem &&
+ (is_swiotlb_fixed(mem, paddr) ||
+ is_swiotlb_dyn(mem, paddr));
}

static inline bool is_swiotlb_force_bounce(struct device *dev)
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index e8608bcb205e..c6a0b8f2aa6f 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -41,6 +41,7 @@
#include <linux/string.h>
#include <linux/swiotlb.h>
#include <linux/types.h>
+#include <linux/xarray.h>
#ifdef CONFIG_DMA_RESTRICTED_POOL
#include <linux/of.h>
#include <linux/of_fdt.h>
@@ -68,6 +69,13 @@ struct io_tlb_slot {
unsigned int list;
};

+struct io_tlb_dyn_slot {
+ phys_addr_t orig_addr; /* original buffer address; passed to swiotlb_copy() when bouncing */
+ size_t alloc_size; /* size requested for the bounce buffer (before page alignment) */
+ struct page *page; /* pages returned by dma_direct_alloc_pages() */
+ dma_addr_t dma_addr; /* DMA address filled in by dma_direct_alloc_pages() */
+};
+
static bool swiotlb_force_bounce;
static bool swiotlb_force_disable;

@@ -292,6 +300,10 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
mem->slots[i].alloc_size = 0;
}

+ spin_lock_init(&mem->dyn_lock);
+ mt_init_flags(&mem->dyn_slots, MT_FLAGS_LOCK_EXTERN);
+ mt_set_external_lock(&mem->dyn_slots, &mem->dyn_lock);
+
/*
* If swiotlb_unencrypted_base is set, the bounce buffer memory will
* be remapped and cleared in swiotlb_update_mem_attributes.
@@ -516,6 +528,115 @@ void __init swiotlb_exit(void)
memset(mem, 0, sizeof(*mem));
}

+/*
+ * Look up the tracking slot stored in @mem->dyn_slots under the key
+ * derived from the page frame number of @paddr.
+ *
+ * The lookup runs under @mem->dyn_lock with interrupts disabled.
+ * Returns the slot, or NULL if nothing is stored under that key.
+ */
+static struct io_tlb_dyn_slot *swiotlb_dyn_slot(struct io_tlb_mem *mem,
+						phys_addr_t paddr)
+{
+	struct io_tlb_dyn_slot *found;
+	unsigned long irqflags;
+	unsigned long key;
+
+	key = (uintptr_t)xa_mk_value(PHYS_PFN(paddr));
+
+	spin_lock_irqsave(&mem->dyn_lock, irqflags);
+	/* Search is limited to the single key: start == max. */
+	found = mt_find(&mem->dyn_slots, &key, key);
+	spin_unlock_irqrestore(&mem->dyn_lock, irqflags);
+
+	return found;
+}
+
+/*
+ * Tell whether swiotlb_dyn_slot() finds a dynamically allocated slot
+ * in @mem for the physical address @paddr.
+ */
+bool is_swiotlb_dyn(struct io_tlb_mem *mem, phys_addr_t paddr)
+{
+	return swiotlb_dyn_slot(mem, paddr) != NULL;
+}
+
+/*
+ * Bounce for a dynamically allocated buffer: find the slot for
+ * @tlb_addr and hand the transfer of @size bytes in direction @dir
+ * over to swiotlb_copy(). Nothing is done if no slot is found.
+ */
+static void swiotlb_dyn_bounce(struct device *dev, phys_addr_t tlb_addr,
+			       size_t size, enum dma_data_direction dir)
+{
+	struct io_tlb_dyn_slot *slot;
+	unsigned char *buf;
+	unsigned int offset;
+
+	slot = swiotlb_dyn_slot(dev->dma_io_tlb_mem, tlb_addr);
+	if (slot) {
+		/* Offset of @tlb_addr from the start of the slot's pages. */
+		offset = tlb_addr - page_to_phys(slot->page);
+		buf = page_address(slot->page) + offset;
+
+		swiotlb_copy(dev, slot->orig_addr, buf, size,
+			     slot->alloc_size, offset, dir);
+	}
+}
+
+/*
+ * Try to map @orig_addr through a dynamically allocated bounce buffer.
+ *
+ * A tracking slot is allocated with kmalloc() and the buffer itself with
+ * dma_direct_alloc_pages(). The slot is then stored in @mem->dyn_slots
+ * for the whole range of page frames of the buffer, so that a lookup
+ * with any address inside the buffer finds it.
+ *
+ * Requests smaller than a page, or with alignment constraints
+ * (@alloc_align_mask or the device's min align mask), are rejected.
+ *
+ * Returns the physical address of the bounce buffer, or
+ * (phys_addr_t)DMA_MAPPING_ERROR if the request was rejected or an
+ * allocation failed.
+ */
+static phys_addr_t swiotlb_dyn_map(struct device *dev, phys_addr_t orig_addr,
+		size_t alloc_size, unsigned int alloc_align_mask,
+		enum dma_data_direction dir, unsigned long attrs)
+{
+	struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+	struct io_tlb_dyn_slot *slot;
+	unsigned long first;
+	unsigned long last;
+	unsigned long flags;
+	phys_addr_t paddr;
+	gfp_t gfp;
+	int err;
+
+	/* Allocation has page granularity. Avoid small buffers. */
+	if (alloc_size < PAGE_SIZE)
+		goto err;
+
+	/* DMA direct does not deal with physical address constraints. */
+	if (alloc_align_mask || dma_get_min_align_mask(dev))
+		goto err;
+
+	gfp = (attrs & DMA_ATTR_MAY_SLEEP) ? GFP_KERNEL : GFP_NOWAIT;
+	slot = kmalloc(sizeof(*slot), gfp | __GFP_NOWARN);
+	if (!slot)
+		goto err;
+
+	slot->orig_addr = orig_addr;
+	slot->alloc_size = alloc_size;
+	slot->page = dma_direct_alloc_pages(dev, PAGE_ALIGN(alloc_size),
+					    &slot->dma_addr, dir,
+					    gfp | __GFP_NOWARN);
+	if (!slot->page)
+		goto err_free_slot;
+
+	/*
+	 * Lookups encode the PFN of an address with xa_mk_value(), which
+	 * places consecutive PFNs two index units apart. The end of the
+	 * range must therefore be the encoded PFN of the last page; adding
+	 * a page count to the encoded first index would leave the upper
+	 * part of the buffer outside the stored range.
+	 */
+	paddr = page_to_phys(slot->page);
+	first = (uintptr_t)xa_mk_value(PHYS_PFN(paddr));
+	last = (uintptr_t)xa_mk_value(PHYS_PFN(paddr) +
+				      PFN_UP(alloc_size) - 1);
+	spin_lock_irqsave(&mem->dyn_lock, flags);
+	err = mtree_store_range(&mem->dyn_slots, first, last,
+				slot, GFP_NOWAIT | __GFP_NOWARN);
+	spin_unlock_irqrestore(&mem->dyn_lock, flags);
+	if (err)
+		goto err_free_dma;
+
+	return paddr;
+
+err_free_dma:
+	dma_direct_free_pages(dev, slot->alloc_size, slot->page,
+			      slot->dma_addr, dir);
+
+err_free_slot:
+	kfree(slot);
+err:
+	return (phys_addr_t)DMA_MAPPING_ERROR;
+}
+
+/*
+ * Release a dynamically allocated bounce buffer.
+ *
+ * The slot tracking @tlb_addr is removed from @mem->dyn_slots, its
+ * pages are returned with dma_direct_free_pages() and the slot itself
+ * is freed. Nothing is done if no slot is tracked for @tlb_addr.
+ */
+static void swiotlb_dyn_unmap(struct device *dev, phys_addr_t tlb_addr,
+			      enum dma_data_direction dir)
+{
+	unsigned long index = (uintptr_t)xa_mk_value(PHYS_PFN(tlb_addr));
+	struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+	struct io_tlb_dyn_slot *slot;
+	unsigned long flags;
+
+	/*
+	 * Take the slot from mtree_erase() itself, which returns the
+	 * entry it removed. A preceding mt_find() would advance the
+	 * index past the entry it found, so erasing at that index
+	 * afterwards would miss this slot and could drop a neighbouring
+	 * one instead.
+	 */
+	spin_lock_irqsave(&mem->dyn_lock, flags);
+	slot = mtree_erase(&mem->dyn_slots, index);
+	spin_unlock_irqrestore(&mem->dyn_lock, flags);
+
+	/* Not a tracked dynamic buffer: there is nothing to free. */
+	if (!slot)
+		return;
+
+	dma_direct_free_pages(dev, slot->alloc_size, slot->page,
+			      slot->dma_addr, dir);
+	kfree(slot);
+}
+
/*
* Return the offset into a iotlb slot required to keep the device happy.
*/
@@ -524,11 +645,8 @@ static unsigned int swiotlb_align_offset(struct device *dev, u64 addr)
return addr & dma_get_min_align_mask(dev) & (IO_TLB_SIZE - 1);
}

-/*
- * Bounce: copy the swiotlb buffer from or back to the original dma location
- */
-static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size,
- enum dma_data_direction dir)
+static void swiotlb_fixed_bounce(struct device *dev, phys_addr_t tlb_addr,
+ size_t size, enum dma_data_direction dir)
{
struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
int index = (tlb_addr - mem->start) >> IO_TLB_SHIFT;
@@ -608,6 +726,18 @@ static void swiotlb_copy(struct device *dev, phys_addr_t orig_addr,
}
}

+/*
+ * Bounce: copy the swiotlb buffer from or back to the original dma location.
+ *
+ * Addresses for which is_swiotlb_fixed() is true are handled by
+ * swiotlb_fixed_bounce(); all others go to swiotlb_dyn_bounce().
+ */
+static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size,
+			   enum dma_data_direction dir)
+{
+	if (!is_swiotlb_fixed(dev->dma_io_tlb_mem, tlb_addr)) {
+		swiotlb_dyn_bounce(dev, tlb_addr, size, dir);
+		return;
+	}
+
+	swiotlb_fixed_bounce(dev, tlb_addr, size, dir);
+}
+
static inline phys_addr_t slot_addr(phys_addr_t start, phys_addr_t idx)
{
return start + (idx << IO_TLB_SHIFT);
@@ -799,8 +929,13 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
return (phys_addr_t)DMA_MAPPING_ERROR;
}

- tlb_addr = swiotlb_fixed_map(dev, orig_addr, alloc_size,
- alloc_align_mask, attrs);
+ tlb_addr = (phys_addr_t)DMA_MAPPING_ERROR;
+ if (!is_swiotlb_for_alloc(dev))
+ tlb_addr = swiotlb_dyn_map(dev, orig_addr, alloc_size,
+ alloc_align_mask, dir, attrs);
+ if (tlb_addr == (phys_addr_t)DMA_MAPPING_ERROR)
+ tlb_addr = swiotlb_fixed_map(dev, orig_addr, alloc_size,
+ alloc_align_mask, attrs);

if (tlb_addr == (phys_addr_t)DMA_MAPPING_ERROR) {
if (!(attrs & DMA_ATTR_NO_WARN))
@@ -882,7 +1017,10 @@ void swiotlb_tbl_unmap_single(struct device *dev, phys_addr_t tlb_addr,
(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL))
swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_FROM_DEVICE);

- swiotlb_release_slots(dev, tlb_addr);
+ if (is_swiotlb_fixed(dev->dma_io_tlb_mem, tlb_addr))
+ swiotlb_release_slots(dev, tlb_addr);
+ else
+ swiotlb_dyn_unmap(dev, tlb_addr, dir);
}

void swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr,
@@ -1013,7 +1151,7 @@ bool swiotlb_free(struct device *dev, struct page *page, size_t size)
{
phys_addr_t tlb_addr = page_to_phys(page);

- if (!is_swiotlb_buffer(dev, tlb_addr))
+ if (!is_swiotlb_fixed(dev->dma_io_tlb_mem, tlb_addr))
return false;

swiotlb_release_slots(dev, tlb_addr);
--
2.25.1