[PATCH net-next 1/2] virtio-net: re-enable XDP_REDIRECT for mergeable buffers

From: Jason Wang
Date: Wed Feb 28 2018 - 22:19:29 EST


XDP_REDIRECT support for mergeable buffers was removed by commit
7324f5399b06 ("virtio_net: disable XDP_REDIRECT in receive_mergeable()
case") because we don't reserve enough tailroom for struct
skb_shared_info, which breaks the buffer layout assumption XDP makes.
Other complaints are the complex linearization logic and the EWMA
buffer size estimation, which increases the likelihood of having to
linearize.

Fix this by reserving enough headroom and tailroom up front and by
using a fixed receive buffer size instead of the EWMA estimate when
XDP is enabled, so the linearization path in receive_mergeable() can
be dropped. Buffers that were refilled before the XDP program was
attached (and therefore lack the required headroom) are handed to the
generic XDP routine instead. With these changes, XDP_REDIRECT can be
re-enabled for mergeable buffers.

Signed-off-by: Jason Wang <jasowang@xxxxxxxxxx>
---
drivers/net/virtio_net.c | 107 +++++++++++++++++++++++++++++------------------
1 file changed, 67 insertions(+), 40 deletions(-)
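
Note for reviewers (not part of the commit message): below is a small,
stand-alone userspace sketch of the buffer sizing this patch moves to.
VIRTIO_XDP_HEADROOM, the skb_shared_info size and the cache line sizes
are stand-in constants, and the virtio header length is omitted; the
point is only to show that, with XDP enabled, headroom plus data plus
tailroom is sized to fit in a single page instead of following the
EWMA estimate.

#include <stdio.h>

#define PAGE_SIZE		4096u
#define L1_CACHE_BYTES		64u
#define SMP_CACHE_BYTES		64u
#define VIRTIO_XDP_HEADROOM	256u	/* stand-in for the kernel's value */
#define SHINFO_SIZE		320u	/* stand-in for sizeof(struct skb_shared_info) */

#define ALIGN(x, a)		(((x) + (a) - 1) & ~((a) - 1))
#define SKB_DATA_ALIGN(x)	ALIGN(x, SMP_CACHE_BYTES)

/* Mirrors get_mergeable_buf_len(): when XDP reserves extra room, use a
 * fixed buffer size so headroom + data + tailroom fit in one page;
 * otherwise fall back to the clamped EWMA estimate. */
static unsigned int mergeable_buf_len(unsigned int avg_pkt_len,
				      unsigned int min_buf_len,
				      unsigned int room)
{
	unsigned int len = avg_pkt_len;

	if (room)
		return PAGE_SIZE - room;

	if (len < min_buf_len)
		len = min_buf_len;
	if (len > PAGE_SIZE)
		len = PAGE_SIZE;
	return ALIGN(len, L1_CACHE_BYTES);
}

int main(void)
{
	unsigned int headroom = VIRTIO_XDP_HEADROOM;
	unsigned int tailroom = headroom ? SHINFO_SIZE : 0;
	unsigned int room = SKB_DATA_ALIGN(headroom + tailroom);

	/* XDP attached: fixed size, leaving room for skb_shared_info. */
	printf("XDP on : room=%u buf=%u\n", room,
	       mergeable_buf_len(1500, 128, room));

	/* No XDP: EWMA-driven estimate, aligned to a cache line. */
	printf("XDP off: buf=%u\n", mergeable_buf_len(1500, 128, 0));
	return 0;
}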

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 9bb9e56..81190ba 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -537,6 +537,26 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
return NULL;
}

+static struct sk_buff *virtnet_skb_xdp(struct receive_queue *rq,
+ struct sk_buff *skb)
+{
+ struct bpf_prog *xdp_prog;
+ int ret;
+
+ rcu_read_lock();
+ xdp_prog = rcu_dereference(rq->xdp_prog);
+ if (xdp_prog) {
+ ret = do_xdp_generic(xdp_prog, skb);
+ if (ret != XDP_PASS) {
+ rcu_read_unlock();
+ return NULL;
+ }
+ }
+ rcu_read_unlock();
+
+ return skb;
+}
+
static struct sk_buff *receive_small(struct net_device *dev,
struct virtnet_info *vi,
struct receive_queue *rq,
@@ -689,31 +709,30 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
struct bpf_prog *xdp_prog;
unsigned int truesize;
unsigned int headroom = mergeable_ctx_to_headroom(ctx);
- bool sent;
+ bool sent, skb_xdp = false;
+ int err;

head_skb = NULL;

rcu_read_lock();
xdp_prog = rcu_dereference(rq->xdp_prog);
if (xdp_prog) {
- struct page *xdp_page;
struct xdp_buff xdp;
void *data;
u32 act;

- /* This happens when rx buffer size is underestimated */
+ /* This happens when the rx buffer size is underestimated
+ * or the headroom is not enough because the buffer
+ * was refilled before XDP was set. In both cases,
+ * for simplicity, we offload the packet to the
+ * generic XDP routine. This should only happen for
+ * the first several packets, so we don't care much
+ * about its performance.
+ */
if (unlikely(num_buf > 1 ||
headroom < virtnet_get_headroom(vi))) {
- /* linearize data for XDP */
- xdp_page = xdp_linearize_page(rq, &num_buf,
- page, offset,
- VIRTIO_XDP_HEADROOM,
- &len);
- if (!xdp_page)
- goto err_xdp;
- offset = VIRTIO_XDP_HEADROOM;
- } else {
- xdp_page = page;
+ skb_xdp = true;
+ goto skb_xdp;
}

/* Transient failure which in theory could occur if
@@ -727,7 +746,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
/* Allow consuming headroom but reserve enough space to push
* the descriptor on if we get an XDP_TX return code.
*/
- data = page_address(xdp_page) + offset;
+ data = page_address(page) + offset;
xdp.data_hard_start = data - VIRTIO_XDP_HEADROOM + vi->hdr_len;
xdp.data = data + vi->hdr_len;
xdp_set_data_meta_invalid(&xdp);
@@ -736,9 +755,6 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,

act = bpf_prog_run_xdp(xdp_prog, &xdp);

- if (act != XDP_PASS)
- ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len);
-
switch (act) {
case XDP_PASS:
/* recalculate offset to account for any header
@@ -746,28 +762,22 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
* skb and avoid using offset
*/
offset = xdp.data -
- page_address(xdp_page) - vi->hdr_len;
-
- /* We can only create skb based on xdp_page. */
- if (unlikely(xdp_page != page)) {
- rcu_read_unlock();
- put_page(page);
- head_skb = page_to_skb(vi, rq, xdp_page,
- offset, len, PAGE_SIZE);
- return head_skb;
- }
+ page_address(page) - vi->hdr_len;
break;
case XDP_TX:
sent = __virtnet_xdp_xmit(vi, &xdp);
if (unlikely(!sent)) {
trace_xdp_exception(vi->dev, xdp_prog, act);
- if (unlikely(xdp_page != page))
- put_page(xdp_page);
goto err_xdp;
}
*xdp_xmit = true;
- if (unlikely(xdp_page != page))
+ rcu_read_unlock();
+ goto xdp_xmit;
+ case XDP_REDIRECT:
+ err = xdp_do_redirect(dev, &xdp, xdp_prog);
+ if (err)
goto err_xdp;
+ *xdp_xmit = true;
rcu_read_unlock();
goto xdp_xmit;
default:
@@ -775,13 +785,12 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
case XDP_ABORTED:
trace_xdp_exception(vi->dev, xdp_prog, act);
case XDP_DROP:
- if (unlikely(xdp_page != page))
- __free_pages(xdp_page, 0);
goto err_xdp;
}
}
rcu_read_unlock();

+skb_xdp:
truesize = mergeable_ctx_to_truesize(ctx);
if (unlikely(len > truesize)) {
pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
@@ -848,7 +857,11 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
}
}

- ewma_pkt_len_add(&rq->mrg_avg_pkt_len, head_skb->len);
+ if (skb_xdp)
+ head_skb = virtnet_skb_xdp(rq, head_skb);
+ else
+ ewma_pkt_len_add(&rq->mrg_avg_pkt_len, head_skb->len);
+
return head_skb;

err_xdp:
@@ -1013,13 +1026,18 @@ static int add_recvbuf_big(struct virtnet_info *vi, struct receive_queue *rq,
}

static unsigned int get_mergeable_buf_len(struct receive_queue *rq,
- struct ewma_pkt_len *avg_pkt_len)
+ struct ewma_pkt_len *avg_pkt_len,
+ unsigned int room)
{
const size_t hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
unsigned int len;

- len = hdr_len + clamp_t(unsigned int, ewma_pkt_len_read(avg_pkt_len),
+ if (room)
+ return PAGE_SIZE - room;
+
+ len = hdr_len + clamp_t(unsigned int, ewma_pkt_len_read(avg_pkt_len),
rq->min_buf_len, PAGE_SIZE - hdr_len);
+
return ALIGN(len, L1_CACHE_BYTES);
}

@@ -1028,21 +1046,27 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
{
struct page_frag *alloc_frag = &rq->alloc_frag;
unsigned int headroom = virtnet_get_headroom(vi);
+ unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
+ unsigned int room = SKB_DATA_ALIGN(headroom + tailroom);
char *buf;
void *ctx;
int err;
unsigned int len, hole;

- len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len);
- if (unlikely(!skb_page_frag_refill(len + headroom, alloc_frag, gfp)))
+ /* Extra tailroom is needed to satisfy XDP's assumption. This
+ * means rx frag coalescing won't work, but since we have
+ * disabled GSO for XDP, it won't be a big issue.
+ */
+ len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room);
+ if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
return -ENOMEM;

buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
buf += headroom; /* advance address leaving hole at front of pkt */
get_page(alloc_frag->page);
- alloc_frag->offset += len + headroom;
+ alloc_frag->offset += len + room;
hole = alloc_frag->size - alloc_frag->offset;
- if (hole < len + headroom) {
+ if (hole < len + room) {
/* To avoid internal fragmentation, if there is very likely not
* enough space for another buffer, add the remaining space to
* the current buffer.
@@ -2576,12 +2600,15 @@ static ssize_t mergeable_rx_buffer_size_show(struct netdev_rx_queue *queue,
{
struct virtnet_info *vi = netdev_priv(queue->dev);
unsigned int queue_index = get_netdev_rx_queue_index(queue);
+ unsigned int headroom = virtnet_get_headroom(vi);
+ unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
struct ewma_pkt_len *avg;

BUG_ON(queue_index >= vi->max_queue_pairs);
avg = &vi->rq[queue_index].mrg_avg_pkt_len;
return sprintf(buf, "%u\n",
- get_mergeable_buf_len(&vi->rq[queue_index], avg));
+ get_mergeable_buf_len(&vi->rq[queue_index], avg,
+ SKB_DATA_ALIGN(headroom + tailroom)));
}

static struct rx_queue_attribute mergeable_rx_buffer_size_attribute =
--
2.7.4