Re: [PATCH v3 1/1] iommu/sva: Invalidate KVA range on kernel TLB flush

From: Ethan Zhao
Date: Sun Aug 10 2025 - 03:20:22 EST




On 8/8/2025 1:15 PM, Baolu Lu wrote:
On 8/7/25 23:31, Dave Hansen wrote:
+void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
+{
+    struct page *page = virt_to_page(pte);
+
+    guard(spinlock)(&kernel_pte_work.lock);
+    list_add(&page->lru, &kernel_pte_work.list);
+    schedule_work(&kernel_pte_work.work);
+}
diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h
index 3c8ec3bfea44..716ebab67636 100644
--- a/include/asm-generic/pgalloc.h
+++ b/include/asm-generic/pgalloc.h
@@ -46,6 +46,7 @@ static inline pte_t *pte_alloc_one_kernel_noprof(struct mm_struct *mm)
  #define pte_alloc_one_kernel(...) alloc_hooks(pte_alloc_one_kernel_noprof(__VA_ARGS__))
  #endif

+#ifndef __HAVE_ARCH_PTE_FREE_KERNEL
  /**
   * pte_free_kernel - free PTE-level kernel page table memory
   * @mm: the mm_struct of the current context
@@ -55,6 +56,7 @@ static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
  {
      pagetable_dtor_free(virt_to_ptdesc(pte));
  }
+#endif

  /**
   * __pte_alloc_one - allocate memory for a PTE-level user page table

I'd much rather the arch-generic code looked like this:

#ifdef CONFIG_ASYNC_PGTABLE_FREE
// code and struct here, or dump them over in some
// other file and do this in a header
#else
static void pte_free_kernel_async(struct page *page) {}
#endif

void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
{
     struct page *page = virt_to_page(pte);

     if (IS_ENABLED(CONFIG_ASYNC_PGTABLE_FREE))
          pte_free_kernel_async(page);
     else
          pagetable_dtor_free(page_ptdesc(page));
}

Then in Kconfig, you end up with something like:

config ASYNC_PGTABLE_FREE
    def_bool y
    depends on INTEL_IOMMU_WHATEVER

That very much tells much more of the whole story in code. It also gives
the x86 folks that compile out the IOMMU the exact same code as the
arch-generic folks. It_also_ makes it dirt simple and obvious for the
x86 folks to optimize out the async behavior if they don't like it in
the future by replacing the compile-time IOMMU check with a runtime one.
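
For instance (a rough sketch only; "kernel_pte_free_async_key" is an
invented name and no such static key exists today), the runtime check
could look something like:

/*
 * Hypothetical runtime variant: instead of keying off the Kconfig
 * option at compile time, gate the async path on a static key that the
 * IOMMU driver defines and enables only while SVA is actually in use.
 */
DECLARE_STATIC_KEY_FALSE(kernel_pte_free_async_key);

void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
{
     struct page *page = virt_to_page(pte);

     if (static_branch_unlikely(&kernel_pte_free_async_key))
          pte_free_kernel_async(page);
     else
          pagetable_dtor_free(page_ptdesc(page));
}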

Also, if another crazy IOMMU implementation comes along that happens to
do what the x86 IOMMUs do, then they have a single Kconfig switch to
flip. If they follow what this patch tries to do, they'll start by
copying and pasting the x86 implementation.

I'll do it like this.  Does that look good to you?

diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 70d29b14d851..6f1113e024fa 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -160,6 +160,7 @@ config IOMMU_DMA
 # Shared Virtual Addressing
 config IOMMU_SVA
     select IOMMU_MM_DATA
+    select ASYNC_PGTABLE_FREE if X86
     bool

 config IOMMU_IOPF
diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h
index 3c8ec3bfea44..dbddacdca2ce 100644
--- a/include/asm-generic/pgalloc.h
+++ b/include/asm-generic/pgalloc.h
@@ -46,6 +46,19 @@ static inline pte_t *pte_alloc_one_kernel_noprof(struct mm_struct *mm)
 #define pte_alloc_one_kernel(...) alloc_hooks(pte_alloc_one_kernel_noprof(__VA_ARGS__))
 #endif

+#ifdef CONFIG_ASYNC_PGTABLE_FREE
+struct pgtable_free_work {
+    struct list_head list;
+    spinlock_t lock;
+    struct work_struct work;
+};
+extern struct pgtable_free_work kernel_pte_work;
+
+void pte_free_kernel_async(struct ptdesc *ptdesc);
+#else
+static inline void pte_free_kernel_async(struct ptdesc *ptdesc) {}
+#endif
+
 /**
  * pte_free_kernel - free PTE-level kernel page table memory
  * @mm: the mm_struct of the current context
@@ -53,7 +66,12 @@ static inline pte_t *pte_alloc_one_kernel_noprof(struct mm_struct *mm)
  */
 static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 {
-    pagetable_dtor_free(virt_to_ptdesc(pte));
+    struct ptdesc *ptdesc = virt_to_ptdesc(pte);
+
+    if (IS_ENABLED(CONFIG_ASYNC_PGTABLE_FREE))
+        pte_free_kernel_async(ptdesc);
+    else
+        pagetable_dtor_free(ptdesc);
 }

 /**
diff --git a/mm/Kconfig b/mm/Kconfig
index e443fe8cd6cf..528550cfa7fe 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1346,6 +1346,13 @@ config LOCK_MM_AND_FIND_VMA
 config IOMMU_MM_DATA
     bool

+config ASYNC_PGTABLE_FREE
+    bool "Asynchronous kernel page table freeing"
+    help
+      Perform kernel page table freeing asynchronously. This is required
+      for systems with IOMMU Shared Virtual Address (SVA) to flush IOTLB
+      paging structure caches.
+
 config EXECMEM
     bool

diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 567e2d084071..6639ee6641d4 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -13,6 +13,7 @@
 #include <linux/swap.h>
 #include <linux/swapops.h>
 #include <linux/mm_inline.h>
+#include <linux/iommu.h>
 #include <asm/pgalloc.h>
 #include <asm/tlb.h>

@@ -406,3 +407,32 @@ pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
     pte_unmap_unlock(pte, ptl);
     goto again;
 }
+
+#ifdef CONFIG_ASYNC_PGTABLE_FREE
+static void kernel_pte_work_func(struct work_struct *work);
+struct pgtable_free_work kernel_pte_work = {
+    .list = LIST_HEAD_INIT(kernel_pte_work.list),
+    .lock = __SPIN_LOCK_UNLOCKED(kernel_pte_work.lock),
+    .work = __WORK_INITIALIZER(kernel_pte_work.work, kernel_pte_work_func),
+};
+
+static void kernel_pte_work_func(struct work_struct *work)
+{
+    struct ptdesc *ptdesc, *next;
+
+    iommu_sva_invalidate_kva_range(0, TLB_FLUSH_ALL);
+
+    guard(spinlock)(&kernel_pte_work.lock);
+    list_for_each_entry_safe(ptdesc, next, &kernel_pte_work.list, pt_list) {
+        list_del_init(&ptdesc->pt_list);
+        pagetable_dtor_free(ptdesc);
+    }
+}
+
+void pte_free_kernel_async(struct ptdesc *ptdesc)
+{
+    guard(spinlock)(&kernel_pte_work.lock);
+    list_add(&ptdesc->pt_list, &kernel_pte_work.list);
+    schedule_work(&kernel_pte_work.work);
+}
+#endif

kernel_pte_work.list is a global shared variable, and taking
kernel_pte_work.lock on every add and drain forces the producer
pte_free_kernel() and the consumer kernel_pte_work_func() to operate in
serialized timing. On a large system, I don't think you designed that
contention in deliberately :)
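
Just to illustrate (a rough, untested sketch; it assumes a "struct
llist_node pt_llist" member were added to struct ptdesc, which does not
exist today), the producer side could be made lock-free with an llist,
so only the worker ever walks a detached batch:

#include <linux/llist.h>
#include <linux/workqueue.h>

static void kernel_pte_work_func(struct work_struct *work);

static LLIST_HEAD(kernel_pte_free_list);
static DECLARE_WORK(kernel_pte_work, kernel_pte_work_func);

void pte_free_kernel_async(struct ptdesc *ptdesc)
{
     /* lock-free push; any number of CPUs can free concurrently */
     llist_add(&ptdesc->pt_llist, &kernel_pte_free_list);
     schedule_work(&kernel_pte_work);
}

static void kernel_pte_work_func(struct work_struct *work)
{
     /* detach the whole batch in one shot, no lock shared with producers */
     struct llist_node *batch = llist_del_all(&kernel_pte_free_list);
     struct ptdesc *ptdesc, *next;

     /* flush IOTLB paging-structure caches before the pages are reused */
     iommu_sva_invalidate_kva_range(0, TLB_FLUSH_ALL);

     llist_for_each_entry_safe(ptdesc, next, batch, pt_llist)
          pagetable_dtor_free(ptdesc);
}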

Thanks,
Ethan