[PATCH 5/5] iommufd: Flush CPU caches on DMA pages in non-coherent domains

From: Yan Zhao
Date: Tue May 07 2024 - 02:25:07 EST


Flush CPU cache on DMA pages before mapping them into the first
non-coherent domain (domain that does not enforce cache coherency, i.e. CPU
caches are not force-snooped) and after unmapping them from the last
domain.

Devices attached to non-coherent domains can execute non-coherent DMAs
(DMAs that lack CPU cache snooping) to access physical memory with CPU
caches bypassed.

Such a scenario could be exploited by a malicious guest, allowing them to
access stale host data in memory rather than the data initialized by the
host (e.g., zeros) in the cache, thus posing a risk of information leakage
attack.

Furthermore, the host kernel (e.g. a ksm thread) might encounter
inconsistent data between the CPU cache and memory (left by a malicious
guest) after a page is unpinned for DMA but before it's recycled.

Therefore, it is required to flush the CPU cache before a page is
accessible to non-coherent DMAs and after the page is inaccessible to
non-coherent DMAs.

However, the CPU cache is not flushed immediately when the page is unmapped
from the last non-coherent domain. Instead, the flushing is performed
lazily, right before the page is unpinned.
Take the following example to illustrate the process. The CPU cache is
flushed right before step 2 and step 5.
1. A page is mapped into a coherent domain.
2. The page is mapped into a non-coherent domain.
3. The page is unmapped from the non-coherent domain e.g.due to hot-unplug.
4. The page is unmapped from the coherent domain.
5. The page is unpinned.

Reasons for adopting this lazily flushing design include:
- There're several unmap paths and only one unpin path. Lazily flush before
unpin wipes out the inconsistency between cache and physical memory
before a page is globally visible and produces code that is simpler, more
maintainable and easier to backport.
- Avoid dividing a large unmap range into several smaller ones or
allocating additional memory to hold IOVA to HPA relationship.

Unlike "has_noncoherent_domain" flag used in vfio_iommu, the
"noncoherent_domain_cnt" counter is implemented in io_pagetable to track
whether an iopt has non-coherent domains attached.
Such a difference is because in iommufd only hwpt of type paging contains
flag "enforce_cache_coherency" and iommu domains in io_pagetable has no
flag "enforce_cache_coherency" as that in vfio_domain.
A counter in io_pagetable can avoid traversing ioas->hwpt_list and holding
ioas->mutex.

Reported-by: Jason Gunthorpe <jgg@xxxxxxxxxx>
Closes: https://lore.kernel.org/lkml/20240109002220.GA439767@xxxxxxxxxx
Fixes: e8d57210035b ("iommufd: Add kAPI toward external drivers for physical devices")
Cc: Alex Williamson <alex.williamson@xxxxxxxxxx>
Cc: Jason Gunthorpe <jgg@xxxxxxxxxx>
Cc: Kevin Tian <kevin.tian@xxxxxxxxx>
Signed-off-by: Yan Zhao <yan.y.zhao@xxxxxxxxx>
---
drivers/iommu/iommufd/hw_pagetable.c | 19 +++++++++--
drivers/iommu/iommufd/io_pagetable.h | 5 +++
drivers/iommu/iommufd/iommufd_private.h | 1 +
drivers/iommu/iommufd/pages.c | 44 +++++++++++++++++++++++--
4 files changed, 65 insertions(+), 4 deletions(-)

diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c
index 33d142f8057d..e3099d732c5c 100644
--- a/drivers/iommu/iommufd/hw_pagetable.c
+++ b/drivers/iommu/iommufd/hw_pagetable.c
@@ -14,12 +14,18 @@ void iommufd_hwpt_paging_destroy(struct iommufd_object *obj)
container_of(obj, struct iommufd_hwpt_paging, common.obj);

if (!list_empty(&hwpt_paging->hwpt_item)) {
+ struct io_pagetable *iopt = &hwpt_paging->ioas->iopt;
mutex_lock(&hwpt_paging->ioas->mutex);
list_del(&hwpt_paging->hwpt_item);
mutex_unlock(&hwpt_paging->ioas->mutex);

- iopt_table_remove_domain(&hwpt_paging->ioas->iopt,
- hwpt_paging->common.domain);
+ iopt_table_remove_domain(iopt, hwpt_paging->common.domain);
+
+ if (!hwpt_paging->enforce_cache_coherency) {
+ down_write(&iopt->domains_rwsem);
+ iopt->noncoherent_domain_cnt--;
+ up_write(&iopt->domains_rwsem);
+ }
}

if (hwpt_paging->common.domain)
@@ -176,6 +182,12 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
goto out_abort;
}

+ if (!hwpt_paging->enforce_cache_coherency) {
+ down_write(&ioas->iopt.domains_rwsem);
+ ioas->iopt.noncoherent_domain_cnt++;
+ up_write(&ioas->iopt.domains_rwsem);
+ }
+
rc = iopt_table_add_domain(&ioas->iopt, hwpt->domain);
if (rc)
goto out_detach;
@@ -183,6 +195,9 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
return hwpt_paging;

out_detach:
+ down_write(&ioas->iopt.domains_rwsem);
+ ioas->iopt.noncoherent_domain_cnt--;
+ up_write(&ioas->iopt.domains_rwsem);
if (immediate_attach)
iommufd_hw_pagetable_detach(idev);
out_abort:
diff --git a/drivers/iommu/iommufd/io_pagetable.h b/drivers/iommu/iommufd/io_pagetable.h
index 0ec3509b7e33..557da8fb83d9 100644
--- a/drivers/iommu/iommufd/io_pagetable.h
+++ b/drivers/iommu/iommufd/io_pagetable.h
@@ -198,6 +198,11 @@ struct iopt_pages {
void __user *uptr;
bool writable:1;
u8 account_mode;
+ /*
+ * CPU cache flush is required before mapping the pages to or after
+ * unmapping it from a noncoherent domain
+ */
+ bool cache_flush_required:1;

struct xarray pinned_pfns;
/* Of iopt_pages_access::node */
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 991f864d1f9b..fc77fd43b232 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -53,6 +53,7 @@ struct io_pagetable {
struct rb_root_cached reserved_itree;
u8 disable_large_pages;
unsigned long iova_alignment;
+ unsigned int noncoherent_domain_cnt;
};

void iopt_init_table(struct io_pagetable *iopt);
diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c
index 528f356238b3..8f4b939cba5b 100644
--- a/drivers/iommu/iommufd/pages.c
+++ b/drivers/iommu/iommufd/pages.c
@@ -272,6 +272,17 @@ struct pfn_batch {
unsigned int total_pfns;
};

+static void iopt_cache_flush_pfn_batch(struct pfn_batch *batch)
+{
+ unsigned long cur, i;
+
+ for (cur = 0; cur < batch->end; cur++) {
+ for (i = 0; i < batch->npfns[cur]; i++)
+ arch_clean_nonsnoop_dma(PFN_PHYS(batch->pfns[cur] + i),
+ PAGE_SIZE);
+ }
+}
+
static void batch_clear(struct pfn_batch *batch)
{
batch->total_pfns = 0;
@@ -637,10 +648,18 @@ static void batch_unpin(struct pfn_batch *batch, struct iopt_pages *pages,
while (npages) {
size_t to_unpin = min_t(size_t, npages,
batch->npfns[cur] - first_page_off);
+ unsigned long pfn = batch->pfns[cur] + first_page_off;
+
+ /*
+ * Lazily flushing CPU caches when a page is about to be
+ * unpinned if the page was mapped into a noncoherent domain
+ */
+ if (pages->cache_flush_required)
+ arch_clean_nonsnoop_dma(pfn << PAGE_SHIFT,
+ to_unpin << PAGE_SHIFT);

unpin_user_page_range_dirty_lock(
- pfn_to_page(batch->pfns[cur] + first_page_off),
- to_unpin, pages->writable);
+ pfn_to_page(pfn), to_unpin, pages->writable);
iopt_pages_sub_npinned(pages, to_unpin);
cur++;
first_page_off = 0;
@@ -1358,10 +1377,17 @@ int iopt_area_fill_domain(struct iopt_area *area, struct iommu_domain *domain)
{
unsigned long done_end_index;
struct pfn_reader pfns;
+ bool cache_flush_required;
int rc;

lockdep_assert_held(&area->pages->mutex);

+ cache_flush_required = area->iopt->noncoherent_domain_cnt &&
+ !area->pages->cache_flush_required;
+
+ if (cache_flush_required)
+ area->pages->cache_flush_required = true;
+
rc = pfn_reader_first(&pfns, area->pages, iopt_area_index(area),
iopt_area_last_index(area));
if (rc)
@@ -1369,6 +1395,9 @@ int iopt_area_fill_domain(struct iopt_area *area, struct iommu_domain *domain)

while (!pfn_reader_done(&pfns)) {
done_end_index = pfns.batch_start_index;
+ if (cache_flush_required)
+ iopt_cache_flush_pfn_batch(&pfns.batch);
+
rc = batch_to_domain(&pfns.batch, domain, area,
pfns.batch_start_index);
if (rc)
@@ -1413,6 +1442,7 @@ int iopt_area_fill_domains(struct iopt_area *area, struct iopt_pages *pages)
unsigned long unmap_index;
struct pfn_reader pfns;
unsigned long index;
+ bool cache_flush_required;
int rc;

lockdep_assert_held(&area->iopt->domains_rwsem);
@@ -1426,9 +1456,19 @@ int iopt_area_fill_domains(struct iopt_area *area, struct iopt_pages *pages)
if (rc)
goto out_unlock;

+ cache_flush_required = area->iopt->noncoherent_domain_cnt &&
+ !pages->cache_flush_required;
+
+ if (cache_flush_required)
+ pages->cache_flush_required = true;
+
while (!pfn_reader_done(&pfns)) {
done_first_end_index = pfns.batch_end_index;
done_all_end_index = pfns.batch_start_index;
+
+ if (cache_flush_required)
+ iopt_cache_flush_pfn_batch(&pfns.batch);
+
xa_for_each(&area->iopt->domains, index, domain) {
rc = batch_to_domain(&pfns.batch, domain, area,
pfns.batch_start_index);
--
2.17.1