[RFC PATCH] iov_iter: Add an iterator-of-iterators

From: David Howells
Date: Wed Mar 22 2023 - 14:17:08 EST


Trond Myklebust <trondmy@xxxxxxxxxxxxxxx> wrote:

> Add an enum iter_type for ITER_ITER ? :-)

Well, you asked for it... It's actually fairly straightforward once
ITER_PIPE is removed.

---
iov_iter: Add an iterator-of-iterators

Provide an I/O iterator that takes an array of iterators and iterates over
them in turn. Then make the sunrpc service code (and thus nfsd) use it.

In this particular instance, svc_tcp_sendmsg() sets up an array of
three iterators: one for the marker+header, one for the body and an
optional one for the tail, then sets msg_iter to be an
iterator-of-iterators across them.

Signed-off-by: David Howells <dhowells@xxxxxxxxxx>
---
include/linux/uio.h | 19 +++-
lib/iov_iter.c | 233 +++++++++++++++++++++++++++++++++++++++++++++++++--
net/sunrpc/svcsock.c | 29 +++---
3 files changed, 258 insertions(+), 23 deletions(-)

diff --git a/include/linux/uio.h b/include/linux/uio.h
index 74598426edb4..321381d3d616 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -27,6 +27,7 @@ enum iter_type {
ITER_XARRAY,
ITER_DISCARD,
ITER_UBUF,
+ ITER_ITERLIST,
};

#define ITER_SOURCE 1 // == WRITE
@@ -43,17 +44,17 @@ struct iov_iter {
bool nofault;
bool data_source;
bool user_backed;
- union {
- size_t iov_offset;
- int last_offset;
- };
+ bool spliceable;
+ size_t iov_offset;
size_t count;
+ size_t orig_count;
union {
const struct iovec *iov;
const struct kvec *kvec;
const struct bio_vec *bvec;
struct xarray *xarray;
void __user *ubuf;
+ struct iov_iter *iterlist;
};
union {
unsigned long nr_segs;
@@ -104,6 +105,11 @@ static inline bool iov_iter_is_xarray(const struct iov_iter *i)
return iov_iter_type(i) == ITER_XARRAY;
}

+/* Report whether @i is an iterator-of-iterators, i.e. of type ITER_ITERLIST. */
+static inline bool iov_iter_is_iterlist(const struct iov_iter *i)
+{
+	enum iter_type type = iov_iter_type(i);
+
+	return type == ITER_ITERLIST;
+}
+
static inline unsigned char iov_iter_rw(const struct iov_iter *i)
{
return i->data_source ? WRITE : READ;
@@ -238,6 +244,8 @@ void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_
void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count);
void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xarray *xarray,
loff_t start, size_t count);
+void iov_iter_iterlist(struct iov_iter *i, unsigned int direction, struct iov_iter *iterlist,
+ unsigned long nr_segs, size_t count);
ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages,
size_t maxsize, unsigned maxpages, size_t *start,
iov_iter_extraction_t extraction_flags);
@@ -345,7 +353,8 @@ static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction,
.user_backed = true,
.data_source = direction,
.ubuf = buf,
- .count = count
+ .count = count,
+ .orig_count = count,
};
}
/* Flags for iov_iter_get/extract_pages*() */
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index fad95e4cf372..34ce3b958b6c 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -282,7 +282,8 @@ void iov_iter_init(struct iov_iter *i, unsigned int direction,
.iov = iov,
.nr_segs = nr_segs,
.iov_offset = 0,
- .count = count
+ .count = count,
+ .orig_count = count,
};
}
EXPORT_SYMBOL(iov_iter_init);
@@ -364,6 +365,26 @@ size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
if (WARN_ON_ONCE(!i->data_source))
return 0;

+ if (unlikely(iov_iter_is_iterlist(i))) {
+ size_t copied = 0;
+
+ while (bytes && i->count) {
+ size_t part = min(bytes, i->iterlist->count), n;
+
+ if (part > 0)
+ n = _copy_from_iter(addr, part, i->iterlist);
+ addr += n;
+ copied += n;
+ bytes -= n;
+ i->count -= n;
+ if (n < part || !bytes)
+ break;
+ i->iterlist++;
+ i->nr_segs--;
+ }
+ return copied;
+ }
+
if (user_backed_iter(i))
might_fault();
iterate_and_advance(i, bytes, base, len, off,
@@ -380,6 +401,27 @@ size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
if (WARN_ON_ONCE(!i->data_source))
return 0;

+ if (unlikely(iov_iter_is_iterlist(i))) {
+ size_t copied = 0;
+
+ while (bytes && i->count) {
+ size_t part = min(bytes, i->iterlist->count), n;
+
+ if (part > 0)
+ n = _copy_from_iter_nocache(addr, part,
+ i->iterlist);
+ addr += n;
+ copied += n;
+ bytes -= n;
+ i->count -= n;
+ if (n < part || !bytes)
+ break;
+ i->iterlist++;
+ i->nr_segs--;
+ }
+ return copied;
+ }
+
iterate_and_advance(i, bytes, base, len, off,
__copy_from_user_inatomic_nocache(addr + off, base, len),
memcpy(addr + off, base, len)
@@ -411,6 +453,27 @@ size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
if (WARN_ON_ONCE(!i->data_source))
return 0;

+ if (unlikely(iov_iter_is_iterlist(i))) {
+ size_t copied = 0;
+
+ while (bytes && i->count) {
+ size_t part = min(bytes, i->iterlist->count), n;
+
+ if (part > 0)
+ n = _copy_from_iter_flushcache(addr, part,
+ i->iterlist);
+ addr += n;
+ copied += n;
+ bytes -= n;
+ i->count -= n;
+ if (n < part || !bytes)
+ break;
+ i->iterlist++;
+ i->nr_segs--;
+ }
+ return copied;
+ }
+
iterate_and_advance(i, bytes, base, len, off,
__copy_from_user_flushcache(addr + off, base, len),
memcpy_flushcache(addr + off, base, len)
@@ -514,7 +577,31 @@ EXPORT_SYMBOL(iov_iter_zero);
size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, size_t bytes,
struct iov_iter *i)
{
- char *kaddr = kmap_atomic(page), *p = kaddr + offset;
+ char *kaddr, *p;
+
+ if (unlikely(iov_iter_is_iterlist(i))) {
+ size_t copied = 0;
+
+ while (bytes && i->count) {
+ size_t part = min(bytes, i->iterlist->count), n;
+
+ if (part > 0)
+ n = copy_page_from_iter_atomic(page, offset, part,
+ i->iterlist);
+ offset += n;
+ copied += n;
+ bytes -= n;
+ i->count -= n;
+ if (n < part || !bytes)
+ break;
+ i->iterlist++;
+ i->nr_segs--;
+ }
+ return copied;
+ }
+
+ kaddr = kmap_atomic(page);
+ p = kaddr + offset;
if (!page_copy_sane(page, offset, bytes)) {
kunmap_atomic(kaddr);
return 0;
@@ -585,19 +672,49 @@ void iov_iter_advance(struct iov_iter *i, size_t size)
iov_iter_bvec_advance(i, size);
} else if (iov_iter_is_discard(i)) {
i->count -= size;
+ }else if (iov_iter_is_iterlist(i)) {
+ i->count -= size;
+ for (;;) {
+ size_t part = min(size, i->iterlist->count);
+
+ if (part > 0)
+ iov_iter_advance(i->iterlist, part);
+ size -= part;
+ if (!size)
+ break;
+ i->iterlist++;
+ i->nr_segs--;
+ }
}
}
EXPORT_SYMBOL(iov_iter_advance);

+/*
+ * Undo @unroll bytes of progress on an ITER_ITERLIST iterator.
+ *
+ * Starting at the current sub-iterator, each sub-iterator is reverted by at
+ * most the amount it has consumed so far (orig_count - count); whatever is
+ * still owed is then taken from the preceding list entry, stepping
+ * i->iterlist back and i->nr_segs up by one each time.
+ *
+ * i->count is not touched here.  Nothing in this function stops the walk at
+ * the first list entry, so @unroll must not exceed what was consumed through
+ * the list.
+ */
+static void iov_iter_revert_iterlist(struct iov_iter *i, size_t unroll)
+{
+	size_t consumed, chunk;
+
+	do {
+		consumed = i->iterlist->orig_count - i->iterlist->count;
+		chunk = min(unroll, consumed);
+
+		if (chunk)
+			iov_iter_revert(i->iterlist, chunk);
+		unroll -= chunk;
+
+		/* More to give back: continue with the previous sub-iterator. */
+		if (unroll) {
+			i->iterlist--;
+			i->nr_segs++;
+		}
+	} while (unroll);
+}
+
void iov_iter_revert(struct iov_iter *i, size_t unroll)
{
if (!unroll)
return;
- if (WARN_ON(unroll > MAX_RW_COUNT))
+ if (WARN_ON(unroll > i->orig_count - i->count))
return;
i->count += unroll;
if (unlikely(iov_iter_is_discard(i)))
return;
+ if (unlikely(iov_iter_is_iterlist(i)))
+ return iov_iter_revert_iterlist(i, unroll);
if (unroll <= i->iov_offset) {
i->iov_offset -= unroll;
return;
@@ -641,6 +758,8 @@ EXPORT_SYMBOL(iov_iter_revert);
*/
size_t iov_iter_single_seg_count(const struct iov_iter *i)
{
+ if (iov_iter_is_iterlist(i))
+ i = i->iterlist;
if (i->nr_segs > 1) {
if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
return min(i->count, i->iov->iov_len - i->iov_offset);
@@ -662,7 +781,8 @@ void iov_iter_kvec(struct iov_iter *i, unsigned int direction,
.kvec = kvec,
.nr_segs = nr_segs,
.iov_offset = 0,
- .count = count
+ .count = count,
+ .orig_count = count,
};
}
EXPORT_SYMBOL(iov_iter_kvec);
@@ -678,7 +798,8 @@ void iov_iter_bvec(struct iov_iter *i, unsigned int direction,
.bvec = bvec,
.nr_segs = nr_segs,
.iov_offset = 0,
- .count = count
+ .count = count,
+ .orig_count = count,
};
}
EXPORT_SYMBOL(iov_iter_bvec);
@@ -706,6 +827,7 @@ void iov_iter_xarray(struct iov_iter *i, unsigned int direction,
.xarray = xarray,
.xarray_start = start,
.count = count,
+ .orig_count = count,
.iov_offset = 0
};
}
@@ -727,11 +849,47 @@ void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count)
.iter_type = ITER_DISCARD,
.data_source = false,
.count = count,
+ .orig_count = count,
.iov_offset = 0
};
}
EXPORT_SYMBOL(iov_iter_discard);

+/**
+ * iov_iter_iterlist - Initialise an I/O iterator that is a list of iterators
+ * @iter: The iterator to initialise.
+ * @direction: The direction of the transfer; must be WRITE.
+ * @iterlist: The array of iterators to iterate over in turn.
+ * @nr_segs: The number of elements in @iterlist.
+ * @count: The total size of the I/O buffer in bytes.
+ *
+ * Set up an I/O iterator that walks over an array of other iterators, one
+ * after another.  @iterlist is referenced, not copied, and consuming data
+ * from @iter advances the listed iterators themselves.
+ *
+ * It's only available as a source iterator (for WRITE).  Every iterator in
+ * the list must itself be a source iterator and none of them can be of
+ * ITER_ITERLIST type; a violation of any of these triggers BUG_ON().
+ *
+ * @count is stored as given; it is not checked against the sum of the counts
+ * of the listed iterators.
+ *
+ * NOTE(review): an earlier wording also required the listed iterators to "be
+ * the same"; nothing here checks that - confirm whether a same-type
+ * restriction is intended.
+ */
+void iov_iter_iterlist(struct iov_iter *iter, unsigned int direction,
+ struct iov_iter *iterlist, unsigned long nr_segs,
+ size_t count)
+{
+ unsigned long i;
+
+ BUG_ON(direction != WRITE);
+ /* Reject nested iterator lists and non-source sub-iterators up front. */
+ for (i = 0; i < nr_segs; i++) {
+ BUG_ON(iterlist[i].iter_type == ITER_ITERLIST);
+ BUG_ON(!iterlist[i].data_source);
+ }
+
+ /* Fields not named here (e.g. iov_offset) are zero-initialised. */
+ *iter = (struct iov_iter){
+ .iter_type = ITER_ITERLIST,
+ .data_source = true,
+ .count = count,
+ .orig_count = count,
+ .iterlist = iterlist,
+ .nr_segs = nr_segs,
+ };
+}
+EXPORT_SYMBOL(iov_iter_iterlist);
+
static bool iov_iter_aligned_iovec(const struct iov_iter *i, unsigned addr_mask,
unsigned len_mask)
{
@@ -879,6 +1037,15 @@ unsigned long iov_iter_alignment(const struct iov_iter *i)
if (iov_iter_is_xarray(i))
return (i->xarray_start + i->iov_offset) | i->count;

+ if (iov_iter_is_iterlist(i)) {
+ unsigned long align = 0;
+ unsigned int j;
+
+ for (j = 0; j < i->nr_segs; j++)
+ align |= iov_iter_alignment(&i->iterlist[j]);
+ return align;
+ }
+
return 0;
}
EXPORT_SYMBOL(iov_iter_alignment);
@@ -1078,6 +1245,18 @@ static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i,
}
if (iov_iter_is_xarray(i))
return iter_xarray_get_pages(i, pages, maxsize, maxpages, start);
+ if (iov_iter_is_iterlist(i)) {
+ ssize_t size;
+
+ while (!i->iterlist->count) {
+ i->iterlist++;
+ i->nr_segs--;
+ }
+ size = __iov_iter_get_pages_alloc(i->iterlist, pages, maxsize, maxpages,
+ start, extraction_flags);
+ i->count -= size;
+ return size;
+ }
return -EFAULT;
}

@@ -1126,6 +1305,31 @@ ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i,
}
EXPORT_SYMBOL(iov_iter_get_pages_alloc2);

+/*
+ * Copy up to @bytes from an ITER_ITERLIST iterator into @addr, folding a
+ * checksum of the copied data into *@csum.
+ *
+ * Exhausted sub-iterators are skipped.  Each remaining one is handed to
+ * csum_and_copy_from_iter() in turn until @bytes have been copied, the list
+ * runs out, or a sub-iterator yields nothing.
+ *
+ * Each sub-iterator is summed on its own, starting from zero, and the result
+ * is merged with csum_block_add() at the byte offset the piece occupies in
+ * the combined stream.  Passing the running checksum straight down would
+ * treat every piece as starting at offset 0, which gives the wrong byte
+ * alignment for data that follows an odd number of bytes.
+ *
+ * Returns the number of bytes copied; i->count is reduced by the same amount.
+ */
+static size_t csum_and_copy_from_iterlist(void *addr, size_t bytes, __wsum *csum,
+					  struct iov_iter *i)
+{
+	size_t copied = 0;
+
+	while (i->count && i->nr_segs) {
+		struct iov_iter *j = i->iterlist;
+		__wsum part = 0;
+		size_t n;
+
+		if (j->count == 0) {
+			i->iterlist++;
+			i->nr_segs--;
+			continue;
+		}
+
+		n = csum_and_copy_from_iter(addr, bytes - copied, &part, j);
+		if (n == 0)
+			break;
+
+		/* Only the parity of the offset matters to csum_block_add(). */
+		*csum = csum_block_add(*csum, part, copied);
+		addr += n;
+		copied += n;
+		i->count -= n;
+	}
+
+	return copied;
+}
+
size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
struct iov_iter *i)
{
@@ -1133,6 +1337,8 @@ size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
sum = *csum;
if (WARN_ON_ONCE(!i->data_source))
return 0;
+ if (iov_iter_is_iterlist(i))
+ return csum_and_copy_from_iterlist(addr, bytes, csum, i);

iterate_and_advance(i, bytes, base, len, off, ({
next = csum_and_copy_from_user(base, addr + off, len);
@@ -1236,6 +1442,21 @@ static int bvec_npages(const struct iov_iter *i, int maxpages)
return npages;
}

+/*
+ * Estimate how many pages the remaining data of an ITER_ITERLIST iterator
+ * spans, capped at @maxpages, by summing iov_iter_npages() over the
+ * sub-iterators that the remaining i->count bytes reach into.
+ *
+ * The walk is bounded by i->nr_segs and stops once i->count bytes have been
+ * accounted for.  It therefore stays inside the list even when i->count is
+ * not exactly the sum of the sub-iterator counts.  A sub-iterator holding
+ * more than the remaining byte budget is still counted in full, so the
+ * result may overestimate.
+ */
+static int iterlist_npages(const struct iov_iter *i, int maxpages)
+{
+	size_t remaining = i->count;
+	unsigned long seg;
+	int npages = 0;
+
+	for (seg = 0; seg < i->nr_segs && remaining; seg++) {
+		const struct iov_iter *p = &i->iterlist[seg];
+
+		/* Saturate rather than wrap if this entry overshoots the budget. */
+		remaining -= min(remaining, p->count);
+		npages += iov_iter_npages(p, maxpages - npages);
+		if (unlikely(npages >= maxpages))
+			return maxpages;
+	}
+	return npages;
+}
+
int iov_iter_npages(const struct iov_iter *i, int maxpages)
{
if (unlikely(!i->count))
@@ -1255,6 +1476,8 @@ int iov_iter_npages(const struct iov_iter *i, int maxpages)
int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE);
return min(npages, maxpages);
}
+ if (iov_iter_is_iterlist(i))
+ return iterlist_npages(i, maxpages);
return 0;
}
EXPORT_SYMBOL(iov_iter_npages);
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 1d0f0f764e16..030a1fa5171b 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1073,11 +1073,13 @@ static int svc_tcp_sendmsg(struct socket *sock, struct xdr_buf *xdr,
{
const struct kvec *head = xdr->head;
const struct kvec *tail = xdr->tail;
+ struct iov_iter iters[3];
+ struct bio_vec head_bv, tail_bv;
struct msghdr msg = {
- .msg_flags = MSG_SPLICE_PAGES,
+ .msg_flags = 0, //MSG_SPLICE_PAGES,
};
- void *m, *h, *t;
- int ret, n = xdr_buf_pagecount(xdr), size;
+ void *m, *t;
+ int ret, n = 2, size;

*sentp = 0;
ret = xdr_alloc_bvec(xdr, GFP_KERNEL);
@@ -1089,27 +1091,28 @@ static int svc_tcp_sendmsg(struct socket *sock, struct xdr_buf *xdr,
if (!m)
return -ENOMEM;

- h = m + sizeof(marker);
- t = h + head->iov_len;
+ memcpy(m, &marker, sizeof(marker));
+ if (head->iov_len)
+ memcpy(m + sizeof(marker), head->iov_base, head->iov_len);
+ bvec_set_virt(&head_bv, m, sizeof(marker) + head->iov_len);
+ iov_iter_bvec(&iters[0], ITER_SOURCE, &head_bv, 1,
+ sizeof(marker) + head->iov_len);

- bvec_set_virt(&xdr->bvec[-1], m, sizeof(marker) + head->iov_len);
- n++;
+ iov_iter_bvec(&iters[1], ITER_SOURCE, xdr->bvec,
+ xdr_buf_pagecount(xdr), xdr->page_len);

if (tail->iov_len) {
t = page_frag_alloc(NULL, tail->iov_len, GFP_KERNEL);
if (!t)
return -ENOMEM;
- bvec_set_virt(&xdr->bvec[n], t, tail->iov_len);
memcpy(t, tail->iov_base, tail->iov_len);
+ bvec_set_virt(&tail_bv, t, tail->iov_len);
+ iov_iter_bvec(&iters[2], ITER_SOURCE, &tail_bv, 1, tail->iov_len);
n++;
}

- memcpy(m, &marker, sizeof(marker));
- if (head->iov_len)
- memcpy(h, head->iov_base, head->iov_len);
-
size = sizeof(marker) + head->iov_len + xdr->page_len + tail->iov_len;
- iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, xdr->bvec - 1, n, size);
+ iov_iter_iterlist(&msg.msg_iter, ITER_SOURCE, iters, n, size);

ret = sock_sendmsg(sock, &msg);
if (ret < 0)