Re: [PATCH bpf-next 1/3] xsk: Support UMEM chunk_size > PAGE_SIZE

From: Maciej Fijalkowski
Date: Tue Mar 21 2023 - 06:48:56 EST


On Sun, Mar 19, 2023 at 08:56:54PM +0100, Kal Conley wrote:
> Add core AF_XDP support for chunk sizes larger than PAGE_SIZE. This
> enables sending/receiving jumbo Ethernet frames up to the theoretical
> maximum of 64 KiB. For chunk sizes > PAGE_SIZE, the UMEM is required
> to consist of HugeTLB VMAs (and be hugepage aligned). Initially, only
> XDP_COPY mode is usable pending future driver work.
>
> For consistency, check for HugeTLB pages during UMEM registration. This
> implies that hugepages are required for XDP_COPY mode despite DMA not
> being used. This restriction is desirable since it ensures user software
> can take advantage of future driver support.
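
To make sure I am reading the uapi side right: a minimal userspace sketch
like the one below (assuming the default 2 MiB hugepage size; the helper
name, sizes and error handling are illustrative only, not from this patch)
should now succeed where it previously got -EINVAL from xdp_umem_reg(),
correct?

#include <stdint.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <linux/if_xdp.h>

/* Register a hugepage-backed UMEM with a 16 KiB chunk size on an
 * existing AF_XDP socket. All values here are illustrative.
 */
static int reg_big_chunk_umem(int xsk_fd)
{
        size_t len = 32UL << 20;  /* 32 MiB: a multiple of 2 MiB */
        void *area = mmap(NULL, len, PROT_READ | PROT_WRITE,
                          MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
                          -1, 0);
        struct xdp_umem_reg mr = {
                .addr = (uintptr_t)area,
                .len = len,
                .chunk_size = 16384,  /* > PAGE_SIZE, power of 2 */
                .headroom = 0,
        };

        if (area == MAP_FAILED)
                return -1;
        return setsockopt(xsk_fd, SOL_XDP, XDP_UMEM_REG, &mr,
                          sizeof(mr));
}

(With MAP_HUGETLB the returned address is hugepage aligned, so the new
IS_ALIGNED() check in xdp_umem_pin_pages() is satisfied automatically.)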
>
> Even in HugeTLB mode, continue to do page accounting using order-0
> (4 KiB) pages. This minimizes the size of this change and reduces the
> risk of impacting driver code. Taking full advantage of hugepages for
> accounting should improve XDP performance in the general case.
>
> No significant change in RX/TX performance was observed with this patch.
> A few data points are reproduced below:
>
> Machine : Dell PowerEdge R940
> CPU : Intel(R) Xeon(R) Platinum 8168 CPU @ 2.70GHz
> NIC : MT27700 Family [ConnectX-4]

Can you tell us a bit more about your environment setup? From what I can
tell, mlx5 (the ConnectX-4 driver) does not support XDP multi-buffer yet,
so were you testing this feature with an XDP program attached in generic
mode?

Were you using xdpsock -S or xdpsock -c? Or maybe something else entirely?

What MTU was set on the NIC?

>
> +-----+------------+-------------+---------------+
> | | frame size | packet size | rxdrop (Mpps) |
> +-----+------------+-------------+---------------+
> | old | 4000 | 320 | 15.7 |
> | new | 4000 | 320 | 15.8 |
> +-----+------------+-------------+---------------+
> | old | 4096 | 320 | 16.4 |
> | new | 4096 | 320 | 16.3 |
> +-----+------------+-------------+---------------+
> | new | 9000 | 320 | 6.3 |
> | new | 10240 | 9000 | 0.4 |
> +-----+------------+-------------+---------------+
>
> Signed-off-by: Kal Conley <kal.conley@xxxxxxxxxxx>
> ---
> include/net/xdp_sock.h | 1 +
> include/net/xdp_sock_drv.h | 6 +++++
> include/net/xsk_buff_pool.h | 4 +++-
> net/xdp/xdp_umem.c | 46 +++++++++++++++++++++++++++++--------
> net/xdp/xsk.c | 3 +++
> net/xdp/xsk_buff_pool.c | 16 +++++++++----
> 6 files changed, 61 insertions(+), 15 deletions(-)
>
> diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
> index 3057e1a4a11c..e562ac3f5295 100644
> --- a/include/net/xdp_sock.h
> +++ b/include/net/xdp_sock.h
> @@ -28,6 +28,7 @@ struct xdp_umem {
> struct user_struct *user;
> refcount_t users;
> u8 flags;
> + bool hugetlb;
> bool zc;
> struct page **pgs;
> int id;
> diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h
> index 9c0d860609ba..eb630d17f994 100644
> --- a/include/net/xdp_sock_drv.h
> +++ b/include/net/xdp_sock_drv.h
> @@ -12,6 +12,12 @@
> #define XDP_UMEM_MIN_CHUNK_SHIFT 11
> #define XDP_UMEM_MIN_CHUNK_SIZE (1 << XDP_UMEM_MIN_CHUNK_SHIFT)
>
> +/* Allow chunk sizes up to the maximum size of an ethernet frame (64 KiB).
> + * Larger chunks are not guaranteed to fit in a single SKB.
> + */
> +#define XDP_UMEM_MAX_CHUNK_SHIFT 16
> +#define XDP_UMEM_MAX_CHUNK_SIZE (1 << XDP_UMEM_MAX_CHUNK_SHIFT)
> +
> #ifdef CONFIG_XDP_SOCKETS
>
> void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries);
> diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h
> index 3e952e569418..69e3970da092 100644
> --- a/include/net/xsk_buff_pool.h
> +++ b/include/net/xsk_buff_pool.h
> @@ -78,6 +78,7 @@ struct xsk_buff_pool {
> u8 cached_need_wakeup;
> bool uses_need_wakeup;
> bool dma_need_sync;
> + bool hugetlb;
> bool unaligned;
> void *addrs;
> /* Mutual exclusion of the completion ring in the SKB mode. Two cases to protect:
> @@ -175,7 +176,8 @@ static inline void xp_dma_sync_for_device(struct xsk_buff_pool *pool,
> static inline bool xp_desc_crosses_non_contig_pg(struct xsk_buff_pool *pool,
> u64 addr, u32 len)
> {
> - bool cross_pg = (addr & (PAGE_SIZE - 1)) + len > PAGE_SIZE;
> + bool cross_pg = pool->hugetlb ? (addr & (HPAGE_SIZE - 1)) + len > HPAGE_SIZE :
> + (addr & (PAGE_SIZE - 1)) + len > PAGE_SIZE;
>
> if (likely(!cross_pg))
> return false;
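
To check my understanding of this hunk (assuming 2 MiB hugepages): for a
descriptor with addr = 0x100800 and len = 0x1000, the old check gives
0x800 + 0x1000 > 0x1000, i.e. a 4 KiB page boundary is crossed, while the
new hugetlb branch gives 0x100800 + 0x1000 <= 0x200000, so the descriptor
stays inside a single hugepage and skips the non-contiguity handling.
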
> diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
> index 02207e852d79..c96eefb9f5ae 100644
> --- a/net/xdp/xdp_umem.c
> +++ b/net/xdp/xdp_umem.c
> @@ -10,6 +10,7 @@
> #include <linux/uaccess.h>
> #include <linux/slab.h>
> #include <linux/bpf.h>
> +#include <linux/hugetlb_inline.h>
> #include <linux/mm.h>
> #include <linux/netdevice.h>
> #include <linux/rtnetlink.h>
> @@ -19,6 +20,9 @@
> #include "xdp_umem.h"
> #include "xsk_queue.h"
>
> +static_assert(XDP_UMEM_MIN_CHUNK_SIZE <= PAGE_SIZE);
> +static_assert(XDP_UMEM_MAX_CHUNK_SIZE <= HPAGE_SIZE);
> +
> static DEFINE_IDA(umem_ida);
>
> static void xdp_umem_unpin_pages(struct xdp_umem *umem)
> @@ -91,7 +95,26 @@ void xdp_put_umem(struct xdp_umem *umem, bool defer_cleanup)
> }
> }
>
> -static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address)
> +/* Returns true if the UMEM contains HugeTLB pages exclusively, false otherwise.
> + *
> + * The mmap_lock must be held by the caller.
> + */
> +static bool xdp_umem_is_hugetlb(struct xdp_umem *umem, unsigned long address)
> +{
> + unsigned long end = address + umem->npgs * PAGE_SIZE;
> + struct vm_area_struct *vma;
> + struct vma_iterator vmi;
> +
> + vma_iter_init(&vmi, current->mm, address);
> + for_each_vma_range(vmi, vma, end) {
> + if (!is_vm_hugetlb_page(vma))
> + return false;
> + }
> +
> + return true;
> +}
> +
> +static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address, bool hugetlb)
> {
> unsigned int gup_flags = FOLL_WRITE;
> long npgs;
> @@ -102,8 +125,17 @@ static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address)
> return -ENOMEM;
>
> mmap_read_lock(current->mm);
> +
> + umem->hugetlb = IS_ALIGNED(address, HPAGE_SIZE) && xdp_umem_is_hugetlb(umem, address);
> + if (hugetlb && !umem->hugetlb) {
> + mmap_read_unlock(current->mm);
> + err = -EINVAL;
> + goto out_pgs;
> + }
> +
> npgs = pin_user_pages(address, umem->npgs,
> gup_flags | FOLL_LONGTERM, &umem->pgs[0], NULL);
> +
> mmap_read_unlock(current->mm);
>
> if (npgs != umem->npgs) {
> @@ -152,20 +184,14 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
> {
> bool unaligned_chunks = mr->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG;
> u32 chunk_size = mr->chunk_size, headroom = mr->headroom;
> + bool hugetlb = chunk_size > PAGE_SIZE;
> u64 addr = mr->addr, size = mr->len;
> u32 chunks_rem, npgs_rem;
> u64 chunks, npgs;
> int err;
>
> - if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) {
> - /* Strictly speaking we could support this, if:
> - * - huge pages, or*
> - * - using an IOMMU, or
> - * - making sure the memory area is consecutive
> - * but for now, we simply say "computer says no".
> - */
> + if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > XDP_UMEM_MAX_CHUNK_SIZE)
> return -EINVAL;
> - }
>
> if (mr->flags & ~XDP_UMEM_UNALIGNED_CHUNK_FLAG)
> return -EINVAL;
> @@ -215,7 +241,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
> if (err)
> return err;
>
> - err = xdp_umem_pin_pages(umem, (unsigned long)addr);
> + err = xdp_umem_pin_pages(umem, (unsigned long)addr, hugetlb);
> if (err)
> goto out_account;
>
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index 2ac58b282b5e..3899a2d235bb 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -421,6 +421,9 @@ static void xsk_destruct_skb(struct sk_buff *skb)
> sock_wfree(skb);
> }
>
> +/* Chunks must fit in the SKB `frags` array. */
> +static_assert(XDP_UMEM_MAX_CHUNK_SIZE / PAGE_SIZE <= MAX_SKB_FRAGS);
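
Spelling out the arithmetic behind this assert: XDP_UMEM_MAX_CHUNK_SIZE /
PAGE_SIZE is 65536 / 4096 = 16 pages per chunk with 4 KiB pages, and if I
read skbuff.h right, MAX_SKB_FRAGS there is 65536 / PAGE_SIZE + 1 = 17,
so the assert holds; with larger page sizes the left-hand side only
shrinks.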
> +
> static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> struct xdp_desc *desc)
> {
> diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
> index b2df1e0f8153..777e8a38a232 100644
> --- a/net/xdp/xsk_buff_pool.c
> +++ b/net/xdp/xsk_buff_pool.c
> @@ -80,6 +80,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
> pool->headroom = umem->headroom;
> pool->chunk_size = umem->chunk_size;
> pool->chunk_shift = ffs(umem->chunk_size) - 1;
> + pool->hugetlb = umem->hugetlb;
> pool->unaligned = unaligned;
> pool->frame_len = umem->chunk_size - umem->headroom -
> XDP_PACKET_HEADROOM;
> @@ -369,16 +370,23 @@ void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs)
> }
> EXPORT_SYMBOL(xp_dma_unmap);
>
> -static void xp_check_dma_contiguity(struct xsk_dma_map *dma_map)
> +/* HugeTLB pools consider contiguity at hugepage granularity only. Hence, all
> + * order-0 pages within a hugepage have the same contiguity value.
> + */
> +static void xp_check_dma_contiguity(struct xsk_dma_map *dma_map, bool hugetlb)
> {
> + u32 page_size = hugetlb ? HPAGE_SIZE : PAGE_SIZE;
> + u32 n = page_size >> PAGE_SHIFT;
> u32 i;
>
> - for (i = 0; i < dma_map->dma_pages_cnt - 1; i++) {
> - if (dma_map->dma_pages[i] + PAGE_SIZE == dma_map->dma_pages[i + 1])
> + for (i = 0; i + n < dma_map->dma_pages_cnt; i++) {
> + if (dma_map->dma_pages[i] + page_size == dma_map->dma_pages[i + n])
> dma_map->dma_pages[i] |= XSK_NEXT_PG_CONTIG_MASK;
> else
> dma_map->dma_pages[i] &= ~XSK_NEXT_PG_CONTIG_MASK;
> }
> + for (; i < dma_map->dma_pages_cnt; i++)
> + dma_map->dma_pages[i] &= ~XSK_NEXT_PG_CONTIG_MASK;
> }
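
Just to spell out the indexing here (assuming 2 MiB hugepages): n =
HPAGE_SIZE >> PAGE_SHIFT = 512, so dma_pages[i] keeps
XSK_NEXT_PG_CONTIG_MASK only when the entry one full hugepage ahead,
dma_pages[i + 512], is exactly HPAGE_SIZE away in DMA space. The second
loop then clears the flag for the final 512 entries, which have nothing a
hugepage ahead to compare against. For non-hugetlb pools n = 1 and the
old behaviour is preserved.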
>
> static int xp_init_dma_info(struct xsk_buff_pool *pool, struct xsk_dma_map *dma_map)
> @@ -441,7 +449,7 @@ int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev,
> }
>
> if (pool->unaligned)
> - xp_check_dma_contiguity(dma_map);
> + xp_check_dma_contiguity(dma_map, pool->hugetlb);
>
> err = xp_init_dma_info(pool, dma_map);
> if (err) {
> --
> 2.39.2
>