[PATCH v3 3/4] netfs: Add a function to extract a UBUF or IOVEC into a BVEC iterator

From: David Howells
Date: Fri Dec 02 2022 - 04:45:27 EST


Add a function to extract the pages from a user-space supplied iterator
(UBUF- or IOVEC-type) into a BVEC-type iterator, retaining the pages by
getting a ref on them (WRITE) or pinning them (READ) as we go.

This is useful in three situations:

(1) A userspace thread may have a sibling that unmaps or remaps the
process's VM during the operation, changing the assignment of the
pages and potentially causing an error. Retaining the pages keeps
some pages around, even if this occurs; futher, we find out at the
point of extraction if EFAULT is going to be incurred.

(2) Pages might get swapped out/discarded if not retained, so we want to
retain them to avoid the reload causing a deadlock due to a DIO
from/to an mmapped region on the same file.

(3) The iterator may get passed to sendmsg() by the filesystem. If a
fault occurs, we may get a short write to a TCP stream that's then
tricky to recover from.

We don't deal with other types of iterator here, leaving it to other
mechanisms to retain the pages (eg. PG_locked, PG_writeback and the pipe
lock).

Changes:
========
ver #3)
- Switch to using EXPORT_SYMBOL_GPL to prevent indirect 3rd-party access
to get/pin_user_pages_fast()[1].

Signed-off-by: David Howells <dhowells@xxxxxxxxxx>
cc: Jeff Layton <jlayton@xxxxxxxxxx>
cc: Steve French <sfrench@xxxxxxxxx>
cc: Shyam Prasad N <nspmangalore@xxxxxxxxx>
cc: Rohith Surabattula <rohiths.msft@xxxxxxxxx>
cc: linux-cachefs@xxxxxxxxxx
cc: linux-cifs@xxxxxxxxxxxxxxx
cc: linux-fsdevel@xxxxxxxxxxxxxxx
Link: https://lore.kernel.org/r/Y3zFzdWnWlEJ8X8/@infradead.org/ [1]
Link: https://lore.kernel.org/r/166697255265.61150.6289490555867717077.stgit@xxxxxxxxxxxxxxxxxxxxxx/ # rfc
Link: https://lore.kernel.org/r/166732026503.3186319.12020462741051772825.stgit@xxxxxxxxxxxxxxxxxxxxxx/ # rfc
Link: https://lore.kernel.org/r/166869690376.3723671.8813331570219190705.stgit@xxxxxxxxxxxxxxxxxxxxxx/ # rfc
---

fs/netfs/Makefile | 1
fs/netfs/iterator.c | 99 +++++++++++++++++++++++++++++++++++++++++++++++++
include/linux/netfs.h | 3 +
3 files changed, 103 insertions(+)
create mode 100644 fs/netfs/iterator.c

diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile
index f684c0cd1ec5..386d6fb92793 100644
--- a/fs/netfs/Makefile
+++ b/fs/netfs/Makefile
@@ -3,6 +3,7 @@
netfs-y := \
buffered_read.o \
io.o \
+ iterator.o \
main.o \
objects.o

diff --git a/fs/netfs/iterator.c b/fs/netfs/iterator.c
new file mode 100644
index 000000000000..82a691b233ef
--- /dev/null
+++ b/fs/netfs/iterator.c
@@ -0,0 +1,99 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Iterator helpers.
+ *
+ * Copyright (C) 2022 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@xxxxxxxxxx)
+ */
+
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/uio.h>
+#include <linux/netfs.h>
+#include "internal.h"
+
+/**
+ * netfs_extract_user_iter - Extract the pages from a user iterator into a bvec
+ * @orig: The original iterator
+ * @orig_len: The amount of iterator to copy
+ * @new: The iterator to be set up
+ * @cleanup_mode: Where to indicate the cleanup mode
+ *
+ * Extract the page fragments from the given amount of the source iterator and
+ * build up a second iterator that refers to all of those bits. This allows
+ * the original iterator to disposed of.
+ *
+ * On success, the number of elements in the bvec is returned, the original
+ * iterator will have been advanced by the amount extracted and @*cleanup_mode
+ * will have been set to FOLL_GET, FOLL_PIN or 0.
+ */
+ssize_t netfs_extract_user_iter(struct iov_iter *orig, size_t orig_len,
+ struct iov_iter *new, unsigned int *cleanup_mode)
+{
+ struct bio_vec *bv = NULL;
+ struct page **pages;
+ unsigned int cur_npages;
+ unsigned int max_pages;
+ unsigned int npages = 0;
+ unsigned int i;
+ ssize_t ret;
+ size_t count = orig_len, offset, len;
+ size_t bv_size, pg_size;
+
+ if (WARN_ON_ONCE(!iter_is_ubuf(orig) && !iter_is_iovec(orig)))
+ return -EIO;
+
+ max_pages = iov_iter_npages(orig, INT_MAX);
+ bv_size = array_size(max_pages, sizeof(*bv));
+ bv = kvmalloc(bv_size, GFP_KERNEL);
+ if (!bv)
+ return -ENOMEM;
+
+ *cleanup_mode = 0;
+
+ /* Put the page list at the end of the bvec list storage. bvec
+ * elements are larger than page pointers, so as long as we work
+ * 0->last, we should be fine.
+ */
+ pg_size = array_size(max_pages, sizeof(*pages));
+ pages = (void *)bv + bv_size - pg_size;
+
+ while (count && npages < max_pages) {
+ ret = iov_iter_extract_pages(orig, &pages, count,
+ max_pages - npages, &offset,
+ cleanup_mode);
+ if (ret < 0) {
+ pr_err("Couldn't get user pages (rc=%zd)\n", ret);
+ break;
+ }
+
+ if (ret > count) {
+ pr_err("get_pages rc=%zd more than %zu\n", ret, count);
+ break;
+ }
+
+ count -= ret;
+ ret += offset;
+ cur_npages = DIV_ROUND_UP(ret, PAGE_SIZE);
+
+ if (npages + cur_npages > max_pages) {
+ pr_err("Out of bvec array capacity (%u vs %u)\n",
+ npages + cur_npages, max_pages);
+ break;
+ }
+
+ for (i = 0; i < cur_npages; i++) {
+ len = ret > PAGE_SIZE ? PAGE_SIZE : ret;
+ bv[npages + i].bv_page = *pages++;
+ bv[npages + i].bv_offset = offset;
+ bv[npages + i].bv_len = len - offset;
+ ret -= len;
+ offset = 0;
+ }
+
+ npages += cur_npages;
+ }
+
+ iov_iter_bvec(new, iov_iter_rw(orig), bv, npages, orig_len - count);
+ return npages;
+}
+EXPORT_SYMBOL_GPL(netfs_extract_user_iter);
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index f2402ddeafbf..eed84474e4cf 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -288,6 +288,9 @@ void netfs_get_subrequest(struct netfs_io_subrequest *subreq,
void netfs_put_subrequest(struct netfs_io_subrequest *subreq,
bool was_async, enum netfs_sreq_ref_trace what);
void netfs_stats_show(struct seq_file *);
+ssize_t netfs_extract_user_iter(struct iov_iter *orig, size_t orig_len,
+ struct iov_iter *new,
+ unsigned int *cleanup_mode);

/**
* netfs_inode - Get the netfs inode context from the inode