[PATCH] io_uring/rsrc: Add support for multi-folio buffer coalescing

From: Chenliang Li
Date: Mon May 06 2024 - 05:01:23 EST


Currently, a fixed buffer made up of pages that all belong to the
same folio (huge page) can be coalesced into a single bvec entry at
registration. This patch extends that to coalesce fixed buffers
spanning multiple folios, by:
1. Adding a helper function and a helper struct to do the coalescing
work at buffer registration;
2. Adding the bvec setup procedure for the coalesced path;
3. Storing page_mask and page_shift in io_mapped_ubuf for later use
in io_import_fixed (a sketch of the resulting layout follows below).
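
As a sketch of the resulting layout (hypothetical numbers, assuming
4K base pages and 2M huge pages): a 4M buffer that starts 8K into
its first huge page would be coalesced into 3 bvec entries instead
of 1024 page-sized ones:

  bvec[0]: bv_offset = 8K, bv_len = 2M - 8K  (tail of the first folio)
  bvec[1]: bv_offset = 0,  bv_len = 2M       (one full middle folio)
  bvec[2]: bv_offset = 0,  bv_len = 8K       (head of the last folio)

io_import_fixed then resolves a buffer offset at folio granularity
rather than PAGE_SIZE, using the stored fields:

  seg_skip = 1 + (offset >> imu->page_shift);
  iter->iov_offset = offset & ~imu->page_mask;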

Signed-off-by: Chenliang Li <cliang01.li@xxxxxxxxxxx>
---
io_uring/rsrc.c | 156 +++++++++++++++++++++++++++++++++++-------------
io_uring/rsrc.h | 9 +++
2 files changed, 124 insertions(+), 41 deletions(-)

diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 65417c9553b1..f9e11131c9a5 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -871,6 +871,80 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
return ret;
}

+/*
+ * For coalescing to work, the buffer must consist of one or more
+ * folios, and all folios except the first and last one must be
+ * of the same size.
+ */
+static bool io_sqe_buffer_try_coalesce(struct page **pages,
+ unsigned int nr_pages,
+ struct io_imu_folio_stats *stats)
+{
+ struct folio *folio = NULL, *first_folio = NULL;
+ unsigned int page_cnt;
+ int i, j;
+
+ if (nr_pages <= 1)
+ return false;
+
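+ /* Buffers of single-page folios gain nothing from coalescing */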
+ first_folio = page_folio(pages[0]);
+ stats->full_folio_pcnt = folio_nr_pages(first_folio);
+ if (stats->full_folio_pcnt == 1)
+ return false;
+
+ stats->folio_shift = folio_shift(first_folio);
+
+ folio = first_folio;
+ page_cnt = 1;
+ stats->nr_folios = 1;
+ /*
+ * Check:
+ * 1. Pages must be contiguous;
+ * 2. All folios should have the same page count
+ * except the first and last one
+ */
+ for (i = 1; i < nr_pages; i++) {
+ if (page_folio(pages[i]) != folio ||
+ pages[i] != pages[i-1] + 1) {
+ if (folio == first_folio)
+ stats->first_folio_pcnt = page_cnt;
+ else if (page_cnt != stats->full_folio_pcnt)
+ return false;
+ folio = page_folio(pages[i]);
+ page_cnt = 1;
+ stats->nr_folios++;
+ continue;
+ }
+ page_cnt++;
+ }
+ if (folio == first_folio)
+ stats->first_folio_pcnt = page_cnt;
+
+ if (stats->first_folio_pcnt > 1)
+ /*
+ * The pages are bound to the folio, it doesn't
+ * actually unpin them but drops all but one reference,
+ * which is usually put down by io_buffer_unmap().
+ * Note, needs a better helper.
+ */
+ unpin_user_pages(&pages[1], stats->first_folio_pcnt - 1);
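+ /*
+ * Do the same for the remaining folios: from here on j always
+ * indexes the head page of the folio being processed, and only
+ * that page keeps its pin.
+ */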
+ j = stats->first_folio_pcnt;
+ nr_pages -= stats->first_folio_pcnt;
+ for (i = 1; i < stats->nr_folios; i++) {
+ unsigned int nr_unpin;
+
+ nr_unpin = min_t(unsigned int, nr_pages - 1,
+ stats->full_folio_pcnt - 1);
+ if (!nr_unpin)
+ continue;
+ unpin_user_pages(&pages[j+1], nr_unpin);
+ j += stats->full_folio_pcnt;
+ nr_pages -= stats->full_folio_pcnt;
+ }
+
+ return true;
+}
+
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
struct io_mapped_ubuf **pimu,
struct page **last_hpage)
@@ -879,8 +953,9 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
struct page **pages = NULL;
unsigned long off;
size_t size;
- int ret, nr_pages, i;
- struct folio *folio = NULL;
+ int ret, nr_pages, nr_bvecs, i, j;
+ bool coalesced;
+ struct io_imu_folio_stats stats;

*pimu = (struct io_mapped_ubuf *)&dummy_ubuf;
if (!iov->iov_base)
@@ -895,39 +970,26 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
goto done;
}

- /* If it's a huge page, try to coalesce them into a single bvec entry */
- if (nr_pages > 1) {
- folio = page_folio(pages[0]);
- for (i = 1; i < nr_pages; i++) {
- /*
- * Pages must be consecutive and on the same folio for
- * this to work
- */
- if (page_folio(pages[i]) != folio ||
- pages[i] != pages[i - 1] + 1) {
- folio = NULL;
- break;
- }
- }
- if (folio) {
- /*
- * The pages are bound to the folio, it doesn't
- * actually unpin them but drops all but one reference,
- * which is usually put down by io_buffer_unmap().
- * Note, needs a better helper.
- */
- unpin_user_pages(&pages[1], nr_pages - 1);
- nr_pages = 1;
- }
- }
-
- imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
+ /* If the buffer is backed by huge pages, try to coalesce the folios into fewer bvec entries */
+ coalesced = io_sqe_buffer_try_coalesce(pages, nr_pages, &stats);
+ nr_bvecs = nr_pages;
+ if (coalesced)
+ nr_bvecs = stats.nr_folios;
+ imu = kvmalloc(struct_size(imu, bvec, nr_bvecs), GFP_KERNEL);
if (!imu)
goto done;

ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
if (ret) {
- unpin_user_pages(pages, nr_pages);
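+ /*
+ * After coalescing, only the head page of each folio still
+ * holds a pin; drop exactly those on failure.
+ */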
+ if (coalesced) {
+ unpin_user_page(pages[0]);
+ j = stats.first_folio_pcnt;
+ for (i = 1; i < stats.nr_folios; i++) {
+ unpin_user_page(pages[j]);
+ j += stats.full_folio_pcnt;
+ }
+ } else {
+ unpin_user_pages(pages, nr_pages);
+ }
goto done;
}

@@ -936,12 +998,29 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
/* store original address for later verification */
imu->ubuf = (unsigned long) iov->iov_base;
imu->ubuf_end = imu->ubuf + iov->iov_len;
- imu->nr_bvecs = nr_pages;
+ imu->nr_bvecs = nr_bvecs;
+ imu->page_shift = PAGE_SHIFT;
+ imu->page_mask = PAGE_MASK;
+ if (coalesced) {
+ imu->page_shift = stats.folio_shift;
+ imu->page_mask = ~((1UL << stats.folio_shift) - 1);
+ }
*pimu = imu;
ret = 0;

- if (folio) {
- bvec_set_page(&imu->bvec[0], pages[0], size, off);
+ if (coalesced) {
+ size_t vec_len;
+
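+ /*
+ * The first bvec covers the tail of the first folio starting
+ * at off; each later bvec covers a whole folio, with the last
+ * one trimmed by the remaining size.
+ */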
+ vec_len = min_t(size_t, size, PAGE_SIZE * stats.first_folio_pcnt - off);
+ bvec_set_page(&imu->bvec[0], pages[0], vec_len, off);
+ size -= vec_len;
+ j = stats.first_folio_pcnt;
+ for (i = 1; i < nr_bvecs; i++) {
+ vec_len = min_t(size_t, size, PAGE_SIZE * stats.full_folio_pcnt);
+ bvec_set_page(&imu->bvec[i], pages[j], vec_len, 0);
+ size -= vec_len;
+ j += stats.full_folio_pcnt;
+ }
goto done;
}
for (i = 0; i < nr_pages; i++) {
@@ -1049,7 +1128,7 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
* we know that:
*
* 1) it's a BVEC iter, we set it up
- * 2) all bvecs are PAGE_SIZE in size, except potentially the
+ * 2) all bvecs are the same size, except potentially the
* first and last bvec
*
* So just find our index, and adjust the iterator afterwards.
@@ -1061,11 +1140,6 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
const struct bio_vec *bvec = imu->bvec;

if (offset < bvec->bv_len) {
- /*
- * Note, huge pages buffers consists of one large
- * bvec entry and should always go this way. The other
- * branch doesn't expect non PAGE_SIZE'd chunks.
- */
iter->count -= offset;
iter->iov_offset = offset;
@@ -1075,12 +1149,12 @@ int io_import_fixed(int ddir, struct iov_iter *iter,

/* skip first vec */
offset -= bvec->bv_len;
- seg_skip = 1 + (offset >> PAGE_SHIFT);
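+ /* bvec chunks are folio-sized once coalesced, PAGE_SIZE otherwise */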
+ seg_skip = 1 + (offset >> imu->page_shift);

iter->bvec = bvec + seg_skip;
iter->nr_segs -= seg_skip;
iter->count -= bvec->bv_len + offset;
- iter->iov_offset = offset & ~PAGE_MASK;
+ iter->iov_offset = offset & ~imu->page_mask;
}
}

diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index c032ca3436ca..4c655e446150 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -47,9 +47,18 @@ struct io_mapped_ubuf {
u64 ubuf_end;
unsigned int nr_bvecs;
unsigned long acct_pages;
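+ /* bvec chunk granularity: folio-sized if coalesced, else PAGE_SIZE */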
+ unsigned int page_shift;
+ unsigned long page_mask;
struct bio_vec bvec[] __counted_by(nr_bvecs);
};

+struct io_imu_folio_stats {
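+ /*
+ * first_folio_pcnt: buffer pages residing in the first folio
+ * full_folio_pcnt: pages per full (middle) folio
+ * nr_folios: folios backing the buffer (== bvec entries)
+ * folio_shift: log2 of the full folio size
+ */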
+ unsigned int first_folio_pcnt;
+ unsigned int full_folio_pcnt;
+ unsigned int nr_folios;
+ unsigned int folio_shift;
+};
+
void io_rsrc_node_ref_zero(struct io_rsrc_node *node);
void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node);
struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx);
--
2.34.1