[RFC PATCH 10/10] memory-provider: add dmabuf devmem provider

From: Mina Almasry
Date: Mon Jul 10 2023 - 18:35:55 EST


Use Jakub's memory provider PoC API:
https://github.com/kuba-moo/linux/tree/pp-providers

to implement a dmabuf devmem memory provider. The provider allocates
NET_RX dmabuf pages to the page pool. This abstracts any custom memory
allocation or freeing changes for devmem TCP from drivers using the
page pool.

The memory provider allocates NET_RX pages from the
dmabuf pages provided by the driver. These pages are ZONE_DEVICE pages
with the sg dma_addrs stored in the zone_device_data entry in the page.
The page pool entries in struct page are in a union with the ZONE_DEVICE
entries, and - without special handling - the page pool would
accidentally overwrite the data in the ZONE_DEVICE fields.

To solve this, the memory provider converts the page from a ZONE_DEVICE
page to a ZONE_NORMAL page upon giving it to the page pool, and converts
it back to ZONE_DEVICE page upon getting it back from the page pool.
This is safe to do because the NET_RX pages are dmabuf pages created to
hold the dma_addr in the dma_buf_map_attachment() sg_table entries, and
are only used with code that handles them specifically.

However, since dmabuf pages can now also be page pool pages, we need
to update 2 places to detect this correctly:

1. is_dma_buf_page() needs to be updated to correctly detect dmabuf
pages after they've been inserted into the pool.

2. dma_buf_page_to_dma_addr() needs to be updated. For page pool pages,
the dma_addr exists in page->dma_addr. For non page pool pages, the
dma_addr exists in page->zone_device_data.

Signed-off-by: Mina Almasry <almasrymina@xxxxxxxxxx>
---
include/linux/dma-buf.h | 29 ++++++++++-
include/net/page_pool.h | 20 ++++++++
net/core/page_pool.c | 104 ++++++++++++++++++++++++++++++++++++----
3 files changed, 143 insertions(+), 10 deletions(-)

diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
index 93228a2fec47..896359fa998d 100644
--- a/include/linux/dma-buf.h
+++ b/include/linux/dma-buf.h
@@ -692,15 +692,26 @@ static inline bool is_dma_buf_pages_file(struct file *file)

struct page *dma_buf_pages_net_rx_alloc(struct dma_buf_pages *priv);

+static inline bool is_dma_buf_page_net_rx(struct page *page)
+{
+ struct dma_buf_pages *priv; /* owning pool's mp_priv, set below */
+
+ return (is_page_pool_page(page) && (priv = page->pp->mp_priv) &&
+ priv->pgmap.ops == &dma_buf_pgmap_ops);
+}
+
static inline bool is_dma_buf_page(struct page *page)
{
return (is_zone_device_page(page) && page->pgmap &&
- page->pgmap->ops == &dma_buf_pgmap_ops);
+ page->pgmap->ops == &dma_buf_pgmap_ops) ||
+ is_dma_buf_page_net_rx(page); /* or a dmabuf page held by a page pool */
}

static inline dma_addr_t dma_buf_page_to_dma_addr(struct page *page)
{
- return (dma_addr_t)page->zone_device_data;
+ return is_dma_buf_page_net_rx(page) ?
+ (dma_addr_t)page->dma_addr : /* page pool page */
+ (dma_addr_t)page->zone_device_data; /* ZONE_DEVICE page */
}

static inline int dma_buf_map_sg(struct device *dev, struct scatterlist *sg,
@@ -718,6 +729,16 @@ static inline int dma_buf_map_sg(struct device *dev, struct scatterlist *sg,

return nents;
}
+
+static inline bool is_dma_buf_pages_priv(void *ptr)
+{
+ struct dma_buf_pages *priv = (struct dma_buf_pages *)ptr;
+
+ if (!priv || priv->pgmap.ops != &dma_buf_pgmap_ops)
+ return false; /* NULL, or pgmap ops are not dma_buf_pgmap_ops */
+
+ return true;
+}
#else
static inline bool is_dma_buf_page(struct page *page)
{
@@ -745,6 +766,10 @@ static inline struct page *dma_buf_pages_net_rx_alloc(struct dma_buf_pages *priv
return NULL;
}

+static inline bool is_dma_buf_pages_priv(void *ptr)
+{
+ return false; /* #else stub: no pointer is ever accepted */
+}
#endif


diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index 7b6668479baf..a57757a13cc8 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -157,6 +157,7 @@ enum pp_memory_provider_type {
PP_MP_HUGE_SPLIT, /* 2MB, online page alloc */
PP_MP_HUGE, /* 2MB, all memory pre-allocated */
PP_MP_HUGE_1G, /* 1G pages, MEP, pre-allocated */
+ PP_MP_DMABUF_DEVMEM, /* dmabuf devmem provider */
};

struct pp_memory_provider_ops {
@@ -170,6 +171,7 @@ extern const struct pp_memory_provider_ops basic_ops;
extern const struct pp_memory_provider_ops hugesp_ops;
extern const struct pp_memory_provider_ops huge_ops;
extern const struct pp_memory_provider_ops huge_1g_ops;
+extern const struct pp_memory_provider_ops dmabuf_devmem_ops;

struct page_pool {
struct page_pool_params p;
@@ -420,4 +422,22 @@ static inline void page_pool_nid_changed(struct page_pool *pool, int new_nid)
page_pool_update_nid(pool, new_nid);
}

+static inline bool is_page_pool_page(struct page *page)
+{
+ /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation
+ * in order to preserve any existing bits, such as bit 0 for the
+ * head page of a compound page and bit 1 for a pfmemalloc page, so
+ * mask those bits on the freeing side when doing the check below;
+ * page_is_pfmemalloc() is checked in __page_pool_put_page()
+ * to avoid recycling the pfmemalloc page.
+ */
+ if (unlikely((page->pp_magic & ~0x3UL) != PP_SIGNATURE))
+ return false;
+
+ if (!page->pp) /* signature matches but no owning pool is recorded */
+ return false;
+
+ return true;
+}
+
#endif /* _NET_PAGE_POOL_H */
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index df3f431fcff3..e626d4e309c1 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -236,6 +236,9 @@ static int page_pool_init(struct page_pool *pool,
case PP_MP_HUGE_1G:
pool->mp_ops = &huge_1g_ops;
break;
+ case PP_MP_DMABUF_DEVMEM:
+ pool->mp_ops = &dmabuf_devmem_ops;
+ break;
default:
err = -EINVAL;
goto free_ptr_ring;
@@ -975,14 +978,7 @@ bool page_pool_return_skb_page(struct page *page, bool napi_safe)

page = compound_head(page);

- /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation
- * in order to preserve any existing bits, such as bit 0 for the
- * head page of compound page and bit 1 for pfmemalloc page, so
- * mask those bits for freeing side when doing below checking,
- * and page_is_pfmemalloc() is checked in __page_pool_put_page()
- * to avoid recycling the pfmemalloc page.
- */
- if (unlikely((page->pp_magic & ~0x3UL) != PP_SIGNATURE))
+ if (!is_page_pool_page(page))
return false;

pp = page->pp;
@@ -1538,3 +1534,95 @@ const struct pp_memory_provider_ops huge_1g_ops = {
.alloc_pages = mp_huge_1g_alloc_pages,
.release_page = mp_huge_1g_release,
};
+
+/*** "Dmabuf devmem page" ***/
+
+/* Dmabuf devmem memory provider allocates DMA_BUF_PAGES_NET_RX pages which are
+ * backing the dma_buf_map_attachment() from the NIC to the device memory.
+ *
+ * These pages are wrappers around the dma_addr of the sg entries in the
+ * sg_table returned from dma_buf_map_attachment(). They can be passed to the
+ * networking stack, which will generate devmem skbs from them and process them
+ * correctly.
+ */
+static int mp_dmabuf_devmem_init(struct page_pool *pool)
+{
+ struct dma_buf_pages *priv;
+
+ priv = pool->mp_priv;
+ if (!is_dma_buf_pages_priv(priv))
+ return -EINVAL; /* mp_priv must be a valid dma_buf_pages */
+
+ return 0;
+}
+
+static void mp_dmabuf_devmem_destroy(struct page_pool *pool)
+{ /* nothing to undo: mp_dmabuf_devmem_init() only validates mp_priv */
+}
+
+static struct page *mp_dmabuf_devmem_alloc_pages(struct page_pool *pool,
+ gfp_t gfp)
+{
+ struct dma_buf_pages *priv = pool->mp_priv;
+ dma_addr_t dma_addr;
+ struct page *page;
+
+ page = dma_buf_pages_net_rx_alloc(priv);
+ if (!page)
+ return page; /* NULL: the dmabuf allocator returned no page */
+
+ /* It shouldn't be possible for the allocation to give us a page not
+ * belonging to this page_pool's pgmap.
+ */
+ BUG_ON(page->pgmap != &priv->pgmap);
+
+ /* dma_buf_pages_net_rx_alloc() allocates a ZONE_DEVICE page.
+ * Prepare to convert it into a page_pool page. Save zone_device_data
+ * (which holds the dma_addr) so it can be stored in page->dma_addr.
+ *
+ * DMA_BUF_PAGES_NET_RX are dmabuf pages created specifically to wrap
+ * the dma_addr of the sg_table into a struct page. These pages are
+ * used by code specifically equipped to handle them, so this
+ * conversion from ZONE_DEVICE page to page pool page should be safe.
+ */
+ dma_addr = (dma_addr_t)page->zone_device_data;
+
+ set_page_zone(page, ZONE_NORMAL);
+ page->pp_magic = 0;
+ page_pool_set_pp_info(pool, page);
+
+ page->dma_addr = dma_addr;
+
+ return page;
+}
+
+static bool mp_dmabuf_devmem_release_page(struct page_pool *pool,
+ struct page *page)
+{
+ struct dma_buf_pages *priv = pool->mp_priv;
+ unsigned long dma_addr = page->dma_addr;
+
+ page_pool_clear_pp_info(page);
+
+ /* As the page pool releases the page, restore it to a ZONE_DEVICE
+ * page so that it gets freed through
+ * page->pgmap->ops->page_free().
+ */
+ set_page_zone(page, ZONE_DEVICE);
+ page->zone_device_data = (void*)dma_addr;
+ page->pgmap = &priv->pgmap;
+ put_page(page);
+
+ /* Return false here as we don't want the page pool touching the page
+ * after it has been released to us.
+ */
+ return false;
+}
+
+const struct pp_memory_provider_ops dmabuf_devmem_ops = { /* selected by PP_MP_DMABUF_DEVMEM */
+ .init = mp_dmabuf_devmem_init,
+ .destroy = mp_dmabuf_devmem_destroy,
+ .alloc_pages = mp_dmabuf_devmem_alloc_pages,
+ .release_page = mp_dmabuf_devmem_release_page,
+};
+EXPORT_SYMBOL(dmabuf_devmem_ops);
--
2.41.0.390.g38632f3daf-goog