Re: [PATCH v3 2/2] vfio/type1: optimize vfio_unpin_pages_remote() for large folio

From: David Hildenbrand
Date: Mon Jun 16 2025 - 07:19:29 EST


On 16.06.25 13:13, lizhe.67@xxxxxxxxxxxxx wrote:
On Mon, 16 Jun 2025 10:14:23 +0200, david@xxxxxxxxxx wrote:

drivers/vfio/vfio_iommu_type1.c | 55 +++++++++++++++++++++++++++------
1 file changed, 46 insertions(+), 9 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index e952bf8bdfab..09ecc546ece8 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -469,17 +469,28 @@ static bool is_invalid_reserved_pfn(unsigned long pfn)
        return true;
}

-static int put_pfn(unsigned long pfn, int prot)
+static inline void _put_pfns(struct page *page, int npages, int prot)
{
-       if (!is_invalid_reserved_pfn(pfn)) {
-               struct page *page = pfn_to_page(pfn);
+       unpin_user_page_range_dirty_lock(page, npages, prot & IOMMU_WRITE);
+}

-               unpin_user_pages_dirty_lock(&page, 1, prot & IOMMU_WRITE);
-               return 1;
+/*
+ * The caller must ensure that these npages PFNs belong to the same folio.
+ */
+static inline int put_pfns(unsigned long pfn, int npages, int prot)
+{
+       if (!is_invalid_reserved_pfn(pfn)) {
+               _put_pfns(pfn_to_page(pfn), npages, prot);
+               return npages;
        }
        return 0;
}

+static inline int put_pfn(unsigned long pfn, int prot)
+{
+       return put_pfns(pfn, 1, prot);
+}
+
#define VFIO_BATCH_MAX_CAPACITY (PAGE_SIZE / sizeof(struct page *))

static void __vfio_batch_init(struct vfio_batch *batch, bool single)
@@ -806,11 +817,37 @@ static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
                                    bool do_accounting)
{
        long unlocked = 0, locked = vpfn_pages(dma, iova, npage);
-       long i;

-       for (i = 0; i < npage; i++)
-               if (put_pfn(pfn++, dma->prot))
-                       unlocked++;
+       while (npage) {
+               long nr_pages = 1;
+
+               if (!is_invalid_reserved_pfn(pfn)) {
+                       struct page *page = pfn_to_page(pfn);
+                       struct folio *folio = page_folio(page);
+                       long folio_pages_num = folio_nr_pages(folio);
+
+                       /*
+                        * For a folio, it represents a physically
+                        * contiguous set of bytes, and all of its pages
+                        * share the same invalid/reserved state.
+                        *
+                        * Here, our PFNs are contiguous. Therefore, if we
+                        * detect that the current PFN belongs to a large
+                        * folio, we can batch the operations for the next
+                        * nr_pages PFNs.
+                        */
+                       if (folio_pages_num > 1)
+                               nr_pages = min_t(long, npage,
+                                                folio_pages_num -
+                                                folio_page_idx(folio, page));
+
+                       _put_pfns(page, nr_pages, dma->prot);


This is sneaky. You interpret the page pointer as an actual page array,
assuming that it will give you the right values when advancing by
nr_pages in that array.

This is mostly true, but with !CONFIG_SPARSEMEM_VMEMMAP it is not
universally true for very large folios (e.g., in a 1 GiB hugetlb folio
when we cross the 128 MiB mark on x86).

Not sure if that could already trigger here, but it is subtle.

As previously mentioned in the email, the code here functions
correctly.
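
To spell out the subtlety: with CONFIG_SPARSEMEM_VMEMMAP the memmap is
virtually contiguous, so plain "page + i" is valid anywhere; without it,
the memmap is only contiguous within a memory section, and crossing a
section boundary inside a very large folio has to go through the PFN,
which is what nth_page() does. unpin_user_page_range_dirty_lock() already
advances through its range via nth_page() in its folio-walking helper,
which is why handing it a page pointer plus a count stays correct here.
A minimal sketch of the distinction (the helper name is made up for
illustration; the #ifdef mirrors the nth_page() definition):

/* Hypothetical helper, for illustration only; not part of either patch. */
static struct page *nth_page_of(struct page *page, unsigned long i)
{
#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
        /* The memmap may not be virtually contiguous: go via the PFN. */
        return pfn_to_page(page_to_pfn(page) + i);
#else
        /* The memmap is virtually contiguous: pointer arithmetic is fine. */
        return page + i;
#endif
}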

+                       unlocked += nr_pages;

We could do slightly better here, as we already have the folio. We would
add an unpin_user_folio_dirty_locked(), similar to unpin_user_folio().

Instead of _put_pfns, we would be calling

unpin_user_folio_dirty_locked(folio, nr_pages, dma->prot & IOMMU_WRITE);

Thank you so much for your suggestion. Does this implementation of
unpin_user_folio_dirty_locked() look viable to you?

diff --git a/include/linux/mm.h b/include/linux/mm.h
index fdda6b16263b..567c9dae9088 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1689,6 +1689,8 @@ void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages,
                                      bool make_dirty);
void unpin_user_pages(struct page **pages, unsigned long npages);
void unpin_user_folio(struct folio *folio, unsigned long npages);
+void unpin_user_folio_dirty_locked(struct folio *folio,
+               unsigned long npages, bool make_dirty);
void unpin_folios(struct folio **folios, unsigned long nfolios);

static inline bool is_cow_mapping(vm_flags_t flags)
diff --git a/mm/gup.c b/mm/gup.c
index 84461d384ae2..2f1e14a79463 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -360,11 +360,8 @@ void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages,
        for (i = 0; i < npages; i += nr) {
                folio = gup_folio_range_next(page, npages, i, &nr);
-               if (make_dirty && !folio_test_dirty(folio)) {
-                       folio_lock(folio);
-                       folio_mark_dirty(folio);
-                       folio_unlock(folio);
-               }
+               if (make_dirty && !folio_test_dirty(folio))
+                       folio_mark_dirty_lock(folio);
                gup_put_folio(folio, nr, FOLL_PIN);

We can call unpin_user_folio_dirty_locked(). :)

        }
}
@@ -435,6 +432,26 @@ void unpin_user_folio(struct folio *folio, unsigned long npages)
}
EXPORT_SYMBOL(unpin_user_folio);
+/**
+ * unpin_user_folio_dirty_locked() - release pages of a folio and
+ * optionally dirty

"conditionally mark a folio dirty and unpin it"

Because that's the sequence in which it is done.

+ *
+ * @folio: pointer to folio to be released
+ * @npages: number of pages of same folio

Can we change that to "nrefs" or rather "npins"?

+ * @make_dirty: whether to mark the folio dirty
+ *
+ * Mark the folio as being modified if @make_dirty is true. Then
+ * release npages of the folio.

Similarly, adjust the doc here.

+ */
+void unpin_user_folio_dirty_locked(struct folio *folio,
+               unsigned long npages, bool make_dirty)
+{
+       if (make_dirty && !folio_test_dirty(folio))
+               folio_mark_dirty_lock(folio);
+       gup_put_folio(folio, npages, FOLL_PIN);
+}
+EXPORT_SYMBOL(unpin_user_folio_dirty_locked);

Yes, should probably go into a separate cleanup patch.
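
For reference, a sketch of how the helper might read with the
documentation comments above folded in, and of how the
vfio_unpin_pages_remote() loop could then unpin through the folio it
already looked up. The wording and the "npins" name are only suggestions
pulled from this review, not a posted revision, and the pfn/npage advance
at the bottom of the loop is assumed, since the quoted hunk ends before
those lines.

/**
 * unpin_user_folio_dirty_locked() - conditionally mark a folio dirty and
 *                                   unpin it
 * @folio:      pointer to the folio to be unpinned
 * @npins:      number of pins to drop from the folio
 * @make_dirty: whether to mark the folio dirty
 *
 * If @make_dirty is true and the folio is not already dirty, mark it dirty
 * (taking the folio lock as needed). Then drop @npins pins from the folio.
 */
void unpin_user_folio_dirty_locked(struct folio *folio,
                unsigned long npins, bool make_dirty)
{
        if (make_dirty && !folio_test_dirty(folio))
                folio_mark_dirty_lock(folio);
        gup_put_folio(folio, npins, FOLL_PIN);
}
EXPORT_SYMBOL(unpin_user_folio_dirty_locked);

The caller side would then look roughly like this:

        while (npage) {
                long nr_pages = 1;

                if (!is_invalid_reserved_pfn(pfn)) {
                        struct page *page = pfn_to_page(pfn);
                        struct folio *folio = page_folio(page);
                        long folio_pages_num = folio_nr_pages(folio);

                        if (folio_pages_num > 1)
                                nr_pages = min_t(long, npage,
                                                 folio_pages_num -
                                                 folio_page_idx(folio, page));

                        /* Unpin via the folio instead of _put_pfns(). */
                        unpin_user_folio_dirty_locked(folio, nr_pages,
                                                      dma->prot & IOMMU_WRITE);
                        unlocked += nr_pages;
                }

                /* Assumed advance, not visible in the quoted hunk. */
                pfn += nr_pages;
                npage -= nr_pages;
        }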

--
Cheers,

David / dhildenb