[RFC PATCH 12/28] af_unix: Support MSG_SPLICE_PAGES

From: David Howells
Date: Thu Mar 16 2023 - 11:28:34 EST


Make AF_UNIX sendmsg() support MSG_SPLICE_PAGES.  When the flag is given and
the source iterator is of ITER_BVEC type, the pages are spliced in from the
iterator; otherwise the data is copied in.

This allows ->sendpage() to be replaced by something that can handle
multiple multipage folios in a single transaction.

Signed-off-by: David Howells <dhowells@xxxxxxxxxx>
cc: "David S. Miller" <davem@xxxxxxxxxxxxx>
cc: Eric Dumazet <edumazet@xxxxxxxxxx>
cc: Jakub Kicinski <kuba@xxxxxxxxxx>
cc: Paolo Abeni <pabeni@xxxxxxxxxx>
cc: Jens Axboe <axboe@xxxxxxxxx>
cc: Matthew Wilcox <willy@xxxxxxxxxxxxx>
cc: netdev@xxxxxxxxxxxxxxx
---
net/unix/af_unix.c | 84 +++++++++++++++++++++++++++++++++++++---------
1 file changed, 68 insertions(+), 16 deletions(-)

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 347122c3575e..6f3454db9c53 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2151,6 +2151,44 @@ static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other
}
#endif

+/*
+ * Splice up to @maxsize bytes of pages from a BVEC-type iterator into @skb's
+ * fragments, advance the iterator and return the byte count (may be short).
+ */
+static ssize_t unix_extract_bvec_to_skb(struct sk_buff *skb,
+					struct iov_iter *iter, ssize_t maxsize)
+{
+	const struct bio_vec *bv = iter->bvec;
+	unsigned long start = iter->iov_offset;
+	unsigned int i;
+	ssize_t ret = 0;
+
+	for (i = 0; i < iter->nr_segs; i++) {
+		size_t off, len;
+
+		len = bv[i].bv_len;
+		if (start >= len) {	/* segment lies before the start offset */
+			start -= len;
+			continue;
+		}
+
+		len = min_t(size_t, maxsize, len - start);
+		off = bv[i].bv_offset + start;
+		if (skb_append_pagefrags(skb, bv[i].bv_page, off, len) < 0)
+			break;	/* stop here; report what was added so far */
+
+		ret += len;
+		maxsize -= len;
+		if (maxsize <= 0)
+			break;
+		start = 0;	/* start offset only applies to the first segment used */
+	}
+
+	if (ret > 0)
+		iov_iter_advance(iter, ret);
+	return ret;
+}
+
static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
size_t len)
{
@@ -2194,19 +2232,25 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
while (sent < len) {
size = len - sent;

- /* Keep two messages in the pipe so it schedules better */
- size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
+ if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
+ skb = sock_alloc_send_pskb(sk, 0, 0,
+ msg->msg_flags & MSG_DONTWAIT,
+ &err, 0);
+ } else {
+ /* Keep two messages in the pipe so it schedules better */
+ size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);

- /* allow fallback to order-0 allocations */
- size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
+ /* allow fallback to order-0 allocations */
+ size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);

- data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
+ data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));

- data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
+ data_len = min_t(size_t, size, PAGE_ALIGN(data_len));

- skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
- msg->msg_flags & MSG_DONTWAIT, &err,
- get_order(UNIX_SKB_FRAGS_SZ));
+ skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
+ msg->msg_flags & MSG_DONTWAIT, &err,
+ get_order(UNIX_SKB_FRAGS_SZ));
+ }
if (!skb)
goto out_err;

@@ -2218,13 +2262,21 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
}
fds_sent = true;

- skb_put(skb, size - data_len);
- skb->data_len = data_len;
- skb->len = size;
- err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
- if (err) {
- kfree_skb(skb);
- goto out_err;
+ if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
+ size = unix_extract_bvec_to_skb(skb, &msg->msg_iter, size);
+ skb->data_len += size;
+ skb->len += size;
+ skb->truesize += size;
+ refcount_add(size, &sk->sk_wmem_alloc);
+ } else {
+ skb_put(skb, size - data_len);
+ skb->data_len = data_len;
+ skb->len = size;
+ err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
+ if (err) {
+ kfree_skb(skb);
+ goto out_err;
+ }
}

unix_state_lock(other);