[PATCH 3/5] x86/mm: Introduce and export interface arch_clean_nonsnoop_dma()

From: Yan Zhao
Date: Tue May 07 2024 - 02:21:39 EST


Introduce and export the interface arch_clean_nonsnoop_dma() to flush CPU
caches for memory involved in non-coherent DMAs (DMAs that lack CPU cache
snooping).

When the IOMMU does not enforce cache coherency, devices are allowed to
perform non-coherent DMAs. This poses a risk of information leakage when
the device is assigned to a VM: a malicious guest could retrieve stale host
data through non-coherent DMA reads of physical memory while data
initialized by the host (e.g., zeros) still resides in the CPU cache.

Additionally, the host kernel (e.g., a KSM kthread) may read inconsistent
data from the CPU cache/memory (left behind by a malicious guest) after a
page is unpinned from non-coherent DMA but before it is freed.

Therefore, VFIO/IOMMUFD must flush the CPU caches for pages involved in
non-coherent DMAs before mapping them into the IOMMU and after unmapping
them from it.

Introduce and export an interface that accepts a contiguous physical
address range as input, to help VFIO/IOMMUFD flush CPU caches in an
architecture-specific way (currently x86 only).
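
For illustration, the intended call pattern looks roughly like the sketch
below. The pin/map steps are placeholders (not part of this patch), and it
assumes the pinned pages form one physically contiguous range:

	/*
	 * Sketch only: flush before the device can DMA to the pages, and
	 * again after unpin, before the host frees or reuses the pages.
	 */
	arch_clean_nonsnoop_dma(page_to_phys(page), npages << PAGE_SHIFT);
	/* ... map into the IOMMU; device performs non-coherent DMA ... */
	/* ... unmap from the IOMMU and unpin the pages ... */
	arch_clean_nonsnoop_dma(page_to_phys(page), npages << PAGE_SHIFT);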

CLFLUSH on MMIO ranges is generally undesired on x86 and can even cause an
MCE on certain platforms (e.g., executing CLFLUSH on the VGA range
0xA0000-0xBFFFF triggers an MCE on some platforms). Meanwhile, some MMIO
ranges are cacheable and do demand CLFLUSH (e.g., certain MMIO ranges for
PMEM). Hence, adopt a method that checks the host PAT/MTRR to identify
uncacheable memory.

This implementation always performs CLFLUSH on "pfn_valid() && !reserved"
pages (since they cannot be MMIO).
For reserved or !pfn_valid() PFNs, it checks the host PAT/MTRR to skip
physical ranges that are uncacheable on the host and performs CLFLUSH on
the remaining cacheable ranges.
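
In outline, the per-PFN policy implemented below is:

	pfn_valid(pfn) && (!PageReserved(page) || zero pfn)
		-> CLFLUSH through a kmap_local_page() mapping
	otherwise (reserved or !pfn_valid())
		-> skip if the host PAT/MTRR reports the PFN uncacheable,
		   else CLFLUSH through a memremap(..., MEMREMAP_WB) mapping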

Cc: Alex Williamson <alex.williamson@xxxxxxxxxx>
Cc: Jason Gunthorpe <jgg@xxxxxxxxxx>
Cc: Kevin Tian <kevin.tian@xxxxxxxxx>
Suggested-by: Jason Gunthorpe <jgg@xxxxxxxxxx>
Signed-off-by: Yan Zhao <yan.y.zhao@xxxxxxxxx>
---
 arch/x86/include/asm/cacheflush.h |  3 ++
 arch/x86/mm/pat/set_memory.c      | 88 +++++++++++++++++++++++++++++++
 include/linux/cacheflush.h        |  6 +++
 3 files changed, 97 insertions(+)

diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index b192d917a6d0..b63607994285 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -10,4 +10,7 @@

void clflush_cache_range(void *addr, unsigned int size);

+void arch_clean_nonsnoop_dma(phys_addr_t phys_addr, size_t length);
+#define arch_clean_nonsnoop_dma arch_clean_nonsnoop_dma
+
#endif /* _ASM_X86_CACHEFLUSH_H */
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 80c9037ffadf..7ff08ad20369 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -34,6 +34,7 @@
#include <asm/memtype.h>
#include <asm/hyperv-tlfs.h>
#include <asm/mshyperv.h>
+#include <asm/mtrr.h>

#include "../mm_internal.h"

@@ -349,6 +350,93 @@ void arch_invalidate_pmem(void *addr, size_t size)
EXPORT_SYMBOL_GPL(arch_invalidate_pmem);
#endif

+/*
+ * Flush a pfn_valid() page that is !PageReserved() (or is the zero page).
+ */
+static void clflush_page(struct page *page)
+{
+	const int size = boot_cpu_data.x86_clflush_size;
+	unsigned int i;
+	void *va;
+
+	va = kmap_local_page(page);
+
+	/* CLFLUSHOPT is unordered and requires a full memory barrier */
+	mb();
+	for (i = 0; i < PAGE_SIZE; i += size)
+		clflushopt(va + i);
+	/* CLFLUSHOPT is unordered and requires a full memory barrier */
+	mb();
+
+	kunmap_local(va);
+}
+
+/*
+ * Flush a reserved page or a !pfn_valid() PFN.
+ * The flush is skipped if the PFN is mapped with an uncacheable type, i.e.
+ * - the PAT type is UC/UC-/WC when PAT is enabled,
+ * - the MTRR type is UC/WC/WT/WP when PAT is disabled
+ *   (no CLFLUSH is needed even though WT/WP are cacheable).
+ */
+static void clflush_reserved_or_invalid_pfn(unsigned long pfn)
+{
+	const int size = boot_cpu_data.x86_clflush_size;
+	unsigned int i;
+	void *va;
+
+	if (!pat_enabled()) {
+		u64 start = PFN_PHYS(pfn), end = start + PAGE_SIZE;
+		u8 mtrr_type, uniform;
+
+		mtrr_type = mtrr_type_lookup(start, end, &uniform);
+		if (mtrr_type != MTRR_TYPE_WRBACK)
+			return;
+	} else if (pat_pfn_immune_to_uc_mtrr(pfn)) {
+		return;
+	}
+
+	va = memremap(PFN_PHYS(pfn), PAGE_SIZE, MEMREMAP_WB);
+	if (!va)
+		return;
+
+	/* CLFLUSHOPT is unordered and requires a full memory barrier */
+	mb();
+	for (i = 0; i < PAGE_SIZE; i += size)
+		clflushopt(va + i);
+	/* CLFLUSHOPT is unordered and requires a full memory barrier */
+	mb();
+
+	memunmap(va);
+}
+
+static inline void clflush_pfn(unsigned long pfn)
+{
+	if (pfn_valid(pfn) &&
+	    (!PageReserved(pfn_to_page(pfn)) || is_zero_pfn(pfn)))
+		return clflush_page(pfn_to_page(pfn));
+
+	clflush_reserved_or_invalid_pfn(pfn);
+}
+
+/**
+ * arch_clean_nonsnoop_dma - flush a cache range for non-coherent DMAs
+ *                           (DMAs that lack CPU cache snooping)
+ * @phys_addr: physical address start
+ * @length: number of bytes to flush
+ */
+void arch_clean_nonsnoop_dma(phys_addr_t phys_addr, size_t length)
+{
+	unsigned long nrpages, pfn;
+	unsigned long i;
+
+	pfn = PHYS_PFN(phys_addr);
+	nrpages = PAGE_ALIGN((phys_addr & ~PAGE_MASK) + length) >> PAGE_SHIFT;
+
+	for (i = 0; i < nrpages; i++, pfn++)
+		clflush_pfn(pfn);
+}
+EXPORT_SYMBOL_GPL(arch_clean_nonsnoop_dma);
+
#ifdef CONFIG_ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION
bool cpu_cache_has_invalidate_memregion(void)
{
diff --git a/include/linux/cacheflush.h b/include/linux/cacheflush.h
index 55f297b2c23f..0bfc6551c6d3 100644
--- a/include/linux/cacheflush.h
+++ b/include/linux/cacheflush.h
@@ -26,4 +26,10 @@ static inline void flush_icache_pages(struct vm_area_struct *vma,

#define flush_icache_page(vma, page) flush_icache_pages(vma, page, 1)

+#ifndef arch_clean_nonsnoop_dma
+static inline void arch_clean_nonsnoop_dma(phys_addr_t phys, size_t length)
+{
+}
+#endif
+
#endif /* _LINUX_CACHEFLUSH_H */
--
2.17.1