[PATCH 4/5] vfio/type1: Flush CPU caches on DMA pages in non-coherent domains

From: Yan Zhao
Date: Tue May 07 2024 - 02:24:02 EST


Flush CPU cache on DMA pages before mapping them into the first
non-coherent domain (domain that does not enforce cache coherency, i.e. CPU
caches are not force-snooped) and after unmapping them from the last
domain.

Devices attached to non-coherent domains can execute non-coherent DMAs
(DMAs that lack CPU cache snooping) to access physical memory with CPU
caches bypassed.

Such a scenario could be exploited by a malicious guest, allowing it to
access stale host data in memory rather than the data initialized by the
host (e.g., zeros) in the cache, thus posing a risk of an information
leakage attack.

Furthermore, the host kernel (e.g. a ksm thread) might encounter
inconsistent data between the CPU cache and memory (left by a malicious
guest) after a page is unpinned for DMA but before it's recycled.

Therefore, it is required to flush the CPU cache before a page is
accessible to non-coherent DMAs and after the page is inaccessible to
non-coherent DMAs.

However, the CPU cache is not flushed immediately when the page is unmapped
from the last non-coherent domain. Instead, the flushing is performed
lazily, right before the page is unpinned.
Take the following example to illustrate the process. The CPU cache is
flushed right before step 2 and step 5.
1. A page is mapped into a coherent domain.
2. The page is mapped into a non-coherent domain.
3. The page is unmapped from the non-coherent domain, e.g. due to hot-unplug.
4. The page is unmapped from the coherent domain.
5. The page is unpinned.

Reasons for adopting this lazy flushing design include:
- There are several unmap paths but only one unpin path. Lazily flushing
before unpin wipes out the inconsistency between cache and physical
memory before a page is globally visible, and produces code that is
simpler, more maintainable and easier to backport.
- It avoids dividing a large unmap range into several smaller ones or
allocating additional memory to hold the IOVA to HPA relationship.

Reported-by: Jason Gunthorpe <jgg@xxxxxxxxxx>
Closes: https://lore.kernel.org/lkml/20240109002220.GA439767@xxxxxxxxxx
Fixes: 73fa0d10d077 ("vfio: Type1 IOMMU implementation")
Cc: Alex Williamson <alex.williamson@xxxxxxxxxx>
Cc: Jason Gunthorpe <jgg@xxxxxxxxxx>
Cc: Kevin Tian <kevin.tian@xxxxxxxxx>
Signed-off-by: Yan Zhao <yan.y.zhao@xxxxxxxxx>
---
drivers/vfio/vfio_iommu_type1.c | 51 +++++++++++++++++++++++++++++++++
1 file changed, 51 insertions(+)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index b5c15fe8f9fc..ce873f4220bf 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -74,6 +74,7 @@ struct vfio_iommu {
bool v2;
bool nesting;
bool dirty_page_tracking;
+ bool has_noncoherent_domain;
struct list_head emulated_iommu_groups;
};

@@ -99,6 +100,7 @@ struct vfio_dma {
unsigned long *bitmap;
struct mm_struct *mm;
size_t locked_vm;
+ bool cache_flush_required; /* For noncoherent domain */
};

struct vfio_batch {
@@ -716,6 +718,9 @@ static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
long unlocked = 0, locked = 0;
long i;

+ if (dma->cache_flush_required)
+ arch_clean_nonsnoop_dma(pfn << PAGE_SHIFT, npage << PAGE_SHIFT);
+
for (i = 0; i < npage; i++, iova += PAGE_SIZE) {
if (put_pfn(pfn++, dma->prot)) {
unlocked++;
@@ -1099,6 +1104,8 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
&iotlb_gather);
}

+ dma->cache_flush_required = false;
+
if (do_accounting) {
vfio_lock_acct(dma, -unlocked, true);
return 0;
@@ -1120,6 +1127,21 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
iommu->dma_avail++;
}

+static void vfio_update_noncoherent_domain_state(struct vfio_iommu *iommu)
+{
+ struct vfio_domain *domain;
+ bool has_noncoherent = false; /* remains false unless a domain without enforced coherency is found */
+
+ list_for_each_entry(domain, &iommu->domain_list, next) {
+ if (domain->enforce_cache_coherency)
+ continue; /* this domain enforces coherency; keep scanning */
+
+ has_noncoherent = true;
+ break; /* a single non-coherent domain decides the result */
+ }
+ iommu->has_noncoherent_domain = has_noncoherent; /* recompute from scratch on every call */
+}
+
static void vfio_update_pgsize_bitmap(struct vfio_iommu *iommu)
{
struct vfio_domain *domain;
@@ -1455,6 +1477,12 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,

vfio_batch_init(&batch);

+ /*
+ * Record necessity to flush CPU cache to make sure CPU cache is flushed
+ * for both pin & map and unmap & unpin (for unwind) paths.
+ */
+ dma->cache_flush_required = iommu->has_noncoherent_domain;
+
while (size) {
/* Pin a contiguous chunk of memory */
npage = vfio_pin_pages_remote(dma, vaddr + dma->size,
@@ -1466,6 +1494,10 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
break;
}

+ if (dma->cache_flush_required)
+ arch_clean_nonsnoop_dma(pfn << PAGE_SHIFT,
+ npage << PAGE_SHIFT);
+
/* Map it! */
ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage,
dma->prot);
@@ -1683,9 +1715,14 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
for (; n; n = rb_next(n)) {
struct vfio_dma *dma;
dma_addr_t iova;
+ bool cache_flush_required;

dma = rb_entry(n, struct vfio_dma, node);
iova = dma->iova;
+ cache_flush_required = !domain->enforce_cache_coherency &&
+ !dma->cache_flush_required;
+ if (cache_flush_required)
+ dma->cache_flush_required = true;

while (iova < dma->iova + dma->size) {
phys_addr_t phys;
@@ -1737,6 +1774,9 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
size = npage << PAGE_SHIFT;
}

+ if (cache_flush_required)
+ arch_clean_nonsnoop_dma(phys, size);
+
ret = iommu_map(domain->domain, iova, phys, size,
dma->prot | IOMMU_CACHE,
GFP_KERNEL_ACCOUNT);
@@ -1801,6 +1841,7 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
vfio_unpin_pages_remote(dma, iova, phys >> PAGE_SHIFT,
size >> PAGE_SHIFT, true);
}
+ dma->cache_flush_required = false;
}

vfio_batch_fini(&batch);
@@ -1828,6 +1869,9 @@ static void vfio_test_domain_fgsp(struct vfio_domain *domain, struct list_head *
if (!pages)
return;

+ if (!domain->enforce_cache_coherency)
+ arch_clean_nonsnoop_dma(page_to_phys(pages), PAGE_SIZE * 2);
+
list_for_each_entry(region, regions, list) {
start = ALIGN(region->start, PAGE_SIZE * 2);
if (start >= region->end || (region->end - start < PAGE_SIZE * 2))
@@ -1847,6 +1891,9 @@ static void vfio_test_domain_fgsp(struct vfio_domain *domain, struct list_head *
break;
}

+ if (!domain->enforce_cache_coherency)
+ arch_clean_nonsnoop_dma(page_to_phys(pages), PAGE_SIZE * 2);
+
__free_pages(pages, order);
}

@@ -2308,6 +2355,8 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,

list_add(&domain->next, &iommu->domain_list);
vfio_update_pgsize_bitmap(iommu);
+ if (!domain->enforce_cache_coherency)
+ vfio_update_noncoherent_domain_state(iommu);
done:
/* Delete the old one and insert new iova list */
vfio_iommu_iova_insert_copy(iommu, &iova_copy);
@@ -2508,6 +2557,8 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
}
iommu_domain_free(domain->domain);
list_del(&domain->next);
+ if (!domain->enforce_cache_coherency)
+ vfio_update_noncoherent_domain_state(iommu);
kfree(domain);
vfio_iommu_aper_expand(iommu, &iova_copy);
vfio_update_pgsize_bitmap(iommu);
--
2.17.1