[PATCH RFC] tun: dma engine support

From: Michael S. Tsirkin
Date: Mon Oct 11 2010 - 16:59:16 EST


Simple hack to use dma engine for tun RX.
Only one skb in flight at the moment.

Signed-off-by: Michael S. Tsirkin <mst@xxxxxxxxxx>
---

I am still looking at handling multiple skbs, but
sending this out for early flames and improvement suggestions.

Loopback testing seems to show only minor performance gains:
this is not really surprising as data is hot in cache already.
Where I would expect this to help more is with incoming
traffic from an external NIC. This still needs to be tested.

drivers/dma/Kconfig | 2 +-
drivers/dma/iovlock.c | 2 +-
drivers/net/tun.c | 389 ++++++++++++++++++++++++++++++++++++++++++++++++-
3 files changed, 390 insertions(+), 3 deletions(-)

diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index 9520cf0..7e82c00 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -202,7 +202,7 @@ comment "DMA Clients"
depends on DMA_ENGINE

config NET_DMA
- bool "Network: TCP receive copy offload"
+ bool "Network: TCP/TUN receive copy offload"
depends on DMA_ENGINE && NET
default (INTEL_IOATDMA || FSL_DMA)
help
diff --git a/drivers/dma/iovlock.c b/drivers/dma/iovlock.c
index c6917e8..121d7fd 100644
--- a/drivers/dma/iovlock.c
+++ b/drivers/dma/iovlock.c
@@ -138,7 +138,7 @@ void dma_unpin_iovec_pages(struct dma_pinned_list *pinned_list)

kfree(pinned_list);
}
-
+EXPORT_SYMBOL_GPL(dma_unpin_iovec_pages);

/*
* We have already pinned down the pages we will be using in the iovecs.
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 55f3a3e..ddbfbc8 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -62,6 +62,8 @@
#include <linux/nsproxy.h>
#include <linux/virtio_net.h>
#include <linux/rcupdate.h>
+#include <linux/dmaengine.h>
+#include <linux/pagemap.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
@@ -70,6 +72,9 @@
#include <asm/system.h>
#include <asm/uaccess.h>

+/* Smallest packet length, in bytes, that tun_put_user() hands to the DMA engine. */
+int tun_dma_copybreak = 0x10000;
+module_param_named(dma_copybreak, tun_dma_copybreak, int, 0644);
+MODULE_PARM_DESC(dma_copybreak, "Use DMA engine for messages of this length and up");
/* Uncomment to enable debugging */
/* #define TUN_DEBUG 1 */

@@ -547,6 +552,364 @@ static inline struct sk_buff *tun_alloc_skb(struct tun_struct *tun,
return skb;
}

+#ifdef CONFIG_NET_DMA
+/* The below duplicates code from net/core and drivers/dma
+ * with the minor twist that these functions work on a const
+ * iovec with an offset. TODO: move it there? */
+/* Count the pages touched by the user range [iov_base, iov_base + iov_len). */
+static int num_pages_spanned(void __user * iov_base, size_t iov_len)
+{
+	unsigned long first_byte = (unsigned long)iov_base;
+	unsigned long span_start = first_byte & PAGE_MASK;
+	unsigned long span_end = PAGE_ALIGN(first_byte + iov_len);
+
+	return (span_end - span_start) >> PAGE_SHIFT;
+}
+
+/*
+ * Pin down all the iovec pages needed for len bytes, starting iov_offset
+ * bytes into the iovec array.
+ * Return a struct dma_pinned_list to keep track of pages pinned down, or
+ * NULL if the allocation or the pinning failed.
+ *
+ * Leading iovecs that lie wholly before iov_offset are skipped, so
+ * page_list[0] describes the first iovec that actually receives data.
+ * The number of iovecs is not passed in: the caller must make sure the
+ * array covers at least iov_offset + len bytes.
+ *
+ * We are allocating a single chunk of memory, and then carving it up into
+ * 3 sections, the latter 2 whose size depends on the number of iovecs and the
+ * total number of pages, respectively.
+ */
+static struct dma_pinned_list *dma_pin_const_iovec_pages(const struct iovec *iov,
+		size_t iov_offset, size_t len)
+{
+	struct dma_pinned_list *local_list;
+	struct page **pages;
+	int i;
+	int ret;
+	int nr_iovecs = 0;
+	size_t iovec_len_used = 0;
+	int iovec_pages_used = 0;
+	void __user *iov_base;
+	size_t iov_len;
+
+	/* determine how many iovecs/pages there are, up front */
+	do {
+		iov_len = iov[nr_iovecs].iov_len;
+		/*
+		 * Skip offset as required.  Only leading iovecs may be
+		 * skipped: once an iovec has been counted, iov and nr_iovecs
+		 * must advance together.
+		 */
+		if (!nr_iovecs && iov_offset >= iov_len) {
+			iov_offset -= iov_len;
+			++iov;
+			continue;
+		}
+		iov_base = iov[nr_iovecs].iov_base;
+		/* only the first counted iovec is entered part-way */
+		if (!nr_iovecs) {
+			iov_base += iov_offset;
+			iov_len -= iov_offset;
+		}
+		iovec_len_used += iov_len;
+		iovec_pages_used += num_pages_spanned(iov_base, iov_len);
+		nr_iovecs++;
+	} while (iovec_len_used < len);
+
+	/* single kmalloc for pinned list, page_list[], and the page arrays */
+	local_list = kmalloc(sizeof(*local_list)
+		+ (nr_iovecs * sizeof (struct dma_page_list))
+		+ (iovec_pages_used * sizeof (struct page*)), GFP_KERNEL);
+	if (!local_list)
+		goto out;
+
+	/* list of pages starts right after the page list array */
+	pages = (struct page **) &local_list->page_list[nr_iovecs];
+
+	local_list->nr_iovecs = 0;
+
+	for (i = 0; i < nr_iovecs; i++) {
+		struct dma_page_list *page_list = &local_list->page_list[i];
+
+		/*
+		 * Must describe exactly the range counted above, otherwise
+		 * more pages are pinned than the pages array has room for.
+		 */
+		iov_len = iov[i].iov_len - iov_offset;
+		iov_base = iov[i].iov_base + iov_offset;
+		iov_offset = 0;
+
+		page_list->nr_pages = num_pages_spanned(iov_base, iov_len);
+		page_list->base_address = iov_base;
+
+		page_list->pages = pages;
+		pages += page_list->nr_pages;
+
+		/* pin pages down */
+		ret = get_user_pages_fast(
+			(unsigned long)iov_base,
+			page_list->nr_pages,
+			1,	/* write */
+			page_list->pages);
+
+		/* nothing pinned for this entry: unpin the earlier ones only */
+		if (unlikely(ret < 0))
+			goto unpin;
+
+		local_list->nr_iovecs = i + 1;
+
+		/* partially pinned: record how many so they get released */
+		if (unlikely(ret != page_list->nr_pages)) {
+			page_list->nr_pages = ret;
+			goto unpin;
+		}
+
+	}
+
+	return local_list;
+
+unpin:
+	dma_unpin_iovec_pages(local_list);
+out:
+	return NULL;
+}
+
+/*
+ * Copy len bytes from the kernel buffer kdata into the iovec array,
+ * starting iov_offset bytes into it.
+ * We have already pinned down the pages we will be using in the iovecs.
+ * Each entry in iov array has corresponding entry in pinned_list->page_list.
+ * Using array indexing to keep iov[] and page_list[] in sync.
+ * The iovec is const and is never modified: space that was already filled
+ * is skipped by means of iov_offset.
+ * iov array length remaining guaranteed to be bigger than len.
+ *
+ * Without a channel or a pinned list the copy is done by
+ * memcpy_toiovecend() and its result is returned; otherwise the cookie of
+ * the last queued copy is returned.
+ */
+dma_cookie_t dma_memcpy_to_iovecend(struct dma_chan *chan, const struct iovec *iov,
+		struct dma_pinned_list *pinned_list, unsigned char *kdata,
+		size_t iov_offset, size_t len)
+{
+	int iov_byte_offset;
+	int copy;
+	dma_cookie_t dma_cookie = 0;
+	int iovec_idx;
+	int page_idx;
+	size_t iov_len;
+	unsigned long iov_base;
+
+	/* same fallback rule as dma_memcpy_pg_to_const_iovec() */
+	if (!chan || !pinned_list)
+		return memcpy_toiovecend(iov, kdata, iov_offset, len);
+
+	for (iovec_idx = 0; iovec_idx < pinned_list->nr_iovecs; ++iovec_idx) {
+		struct dma_page_list *page_list;
+
+		iov_len = iov[iovec_idx].iov_len;
+		/* skip already used-up iovecs */
+		if (iov_len <= iov_offset) {
+			iov_offset -= iov_len;
+			continue;
+		}
+
+		page_list = &pinned_list->page_list[iovec_idx];
+
+		iov_base = (unsigned long)iov[iovec_idx].iov_base + iov_offset;
+		iov_len -= iov_offset;
+		iov_offset = 0;
+		/* position of the first destination byte within the pinned pages */
+		iov_byte_offset = iov_base & ~PAGE_MASK;
+		page_idx = ((iov_base & PAGE_MASK)
+			- ((unsigned long)page_list->base_address & PAGE_MASK)) >> PAGE_SHIFT;
+
+		/* break up copies to not cross page boundary */
+		while (iov_len) {
+			copy = min_t(int, PAGE_SIZE - iov_byte_offset, len);
+			copy = min_t(int, copy, iov_len);
+
+			dma_cookie = dma_async_memcpy_buf_to_pg(chan,
+					page_list->pages[page_idx],
+					iov_byte_offset,
+					kdata,
+					copy);
+			/* poll for a descriptor slot */
+			if (unlikely(dma_cookie < 0)) {
+				dma_async_issue_pending(chan);
+				continue;
+			}
+
+			len -= copy;
+			iov_len -= copy;
+
+			if (!len)
+				return dma_cookie;
+
+			kdata += copy;
+			iov_byte_offset = 0;
+			page_idx++;
+		}
+	}
+
+	/* really bad if we ever run out of iovecs */
+	BUG();
+	return -EFAULT;
+}
+
+/*
+ * Copy len bytes from page (starting at offset) into the iovec array,
+ * iov_offset bytes in, one destination page at a time.
+ * Without a channel or a pinned list the source page is kmap()ed and the
+ * copy is done by memcpy_toiovecend().
+ */
+dma_cookie_t dma_memcpy_pg_to_const_iovec(struct dma_chan *chan, const struct iovec *iov,
+		struct dma_pinned_list *pinned_list, struct page *page,
+		unsigned int offset, size_t iov_offset, size_t len)
+{
+	dma_cookie_t cookie = 0;
+	int idx;
+
+	/* this needs as-yet-unimplemented buf-to-buff, so punt. */
+	/* TODO: use dma for this */
+	if (!chan || !pinned_list) {
+		u8 *src = kmap(page);
+		int rc = memcpy_toiovecend(iov, src + offset, iov_offset, len);
+
+		kunmap(page);
+		return rc;
+	}
+
+	for (idx = 0; idx < pinned_list->nr_iovecs; ++idx) {
+		struct dma_page_list *pinned;
+		unsigned long dst;
+		size_t room = iov[idx].iov_len;
+		int in_page;
+		int pg;
+
+		/* skip already used-up iovecs */
+		if (room <= iov_offset) {
+			iov_offset -= room;
+			continue;
+		}
+
+		pinned = &pinned_list->page_list[idx];
+		dst = (unsigned long)iov[idx].iov_base + iov_offset;
+		room -= iov_offset;
+		iov_offset = 0;
+
+		/* locate the first destination byte within the pinned pages */
+		in_page = dst & ~PAGE_MASK;
+		pg = ((dst & PAGE_MASK)
+			- ((unsigned long)pinned->base_address & PAGE_MASK)) >> PAGE_SHIFT;
+
+		/* break up copies to not cross page boundary */
+		while (room) {
+			int chunk = min_t(int, PAGE_SIZE - in_page, len);
+
+			chunk = min_t(int, chunk, room);
+			cookie = dma_async_memcpy_pg_to_pg(chan,
+					pinned->pages[pg],
+					in_page,
+					page,
+					offset,
+					chunk);
+			/* poll for a descriptor slot */
+			if (unlikely(cookie < 0)) {
+				dma_async_issue_pending(chan);
+				continue;
+			}
+
+			len -= chunk;
+			room -= chunk;
+			if (!len)
+				return cookie;
+
+			offset += chunk;
+			in_page = 0;
+			pg++;
+		}
+	}
+
+	/* really bad if we ever run out of iovecs */
+	BUG();
+	return -EFAULT;
+}
+
+/**
+ * dma_skb_copy_datagram_const_iovec - Copy a datagram to a const iovec.
+ * @chan: DMA channel handed to the copy helpers
+ * @skb: buffer to copy
+ * @offset: offset in the buffer to start copying from
+ * @to: io vector to copy to
+ * @iov_offset: offset in the io vector to start copying to
+ * @len: amount of data to copy from buffer to iovec
+ * @pinned_list: locked iovec buffer data
+ *
+ * Copies the linear header first, then the page fragments, then recurses
+ * into each skb on the fragment list.
+ *
+ * Returns the cookie of the last copy once all @len bytes have been
+ * handed off, after also storing it in skb->dma_cookie; returns -EFAULT
+ * if a copy helper reports a negative cookie or the skb runs out of data
+ * before @len bytes were copied.
+ *
+ * Note: the iovec is not modified during the copy.
+ * Note: pinned_list is assumed pinned with the same offset.
+ */
+dma_cookie_t dma_skb_copy_datagram_const_iovec(struct dma_chan *chan,
+ struct sk_buff *skb, int offset, const struct iovec *to,
+ size_t iov_offset,
+ size_t len, struct dma_pinned_list *pinned_list)
+{
+ int start = skb_headlen(skb);
+ int i, copy = start - offset;
+ struct sk_buff *frag_iter;
+ dma_cookie_t cookie = 0;
+
+ /* Copy header. */
+ if (copy > 0) {
+ if (copy > len)
+ copy = len;
+ cookie = dma_memcpy_to_iovecend(chan, to, pinned_list,
+ skb->data + offset, iov_offset,
+ copy);
+ if (cookie < 0)
+ goto fault;
+ len -= copy;
+ if (len == 0)
+ goto end;
+ offset += copy;
+ iov_offset += copy;
+ }
+
+ /* Copy paged appendix. Hmm... why does this look so complicated? */
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + skb_shinfo(skb)->frags[i].size;
+ copy = end - offset;
+ if (copy > 0) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ struct page *page = frag->page;
+
+ if (copy > len)
+ copy = len;
+
+ cookie = dma_memcpy_pg_to_const_iovec(chan, to, pinned_list, page,
+ frag->page_offset + offset - start, iov_offset, copy);
+ if (cookie < 0)
+ goto fault;
+ len -= copy;
+ if (len == 0)
+ goto end;
+ offset += copy;
+ iov_offset += copy;
+ }
+ start = end;
+ }
+
+ skb_walk_frags(skb, frag_iter) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + frag_iter->len;
+ copy = end - offset;
+ if (copy > 0) {
+ if (copy > len)
+ copy = len;
+ cookie = dma_skb_copy_datagram_const_iovec(chan, frag_iter,
+ offset - start,
+ to, iov_offset, copy,
+ pinned_list);
+ if (cookie < 0)
+ goto fault;
+ len -= copy;
+ if (len == 0)
+ goto end;
+ offset += copy;
+ iov_offset += copy;
+ }
+ start = end;
+ }
+
+end:
+ if (!len) {
+ skb->dma_cookie = cookie;
+ return cookie;
+ }
+
+fault:
+ return -EFAULT;
+}
+#endif
+
/* Get packet from user space buffer */
static __inline__ ssize_t tun_get_user(struct tun_struct *tun,
const struct iovec *iv, size_t count,
@@ -706,6 +1069,9 @@ static __inline__ ssize_t tun_put_user(struct tun_struct *tun,
{
struct tun_pi pi = { 0, skb->protocol };
ssize_t total = 0;
+ struct dma_chan *dma_chan;
+ struct dma_pinned_list *pinned_list;
+ int dma_cookie;

if (!(tun->flags & TUN_NO_PI)) {
if ((len -= sizeof(pi)) < 0)
@@ -768,8 +1134,29 @@ static __inline__ ssize_t tun_put_user(struct tun_struct *tun,
}

len = min_t(int, skb->len, len);
-
+#ifdef CONFIG_NET_DMA
+
+ if (len < tun_dma_copybreak)
+ goto copy;
+
+ dma_chan = dma_find_channel(DMA_MEMCPY);
+ if (!dma_chan)
+ goto copy;
+ pinned_list = dma_pin_const_iovec_pages(iv, total, len);
+ if (!pinned_list)
+ goto copy;
+ dma_cookie = dma_skb_copy_datagram_const_iovec(dma_chan, skb, 0, iv,
+ total, len, pinned_list);
+ if (dma_cookie >= 0) {
+ dma_async_memcpy_issue_pending(dma_chan);
+ dma_sync_wait(dma_chan, dma_cookie);
+ }
+ dma_unpin_iovec_pages(pinned_list);
+ goto done;
+#endif
+copy:
skb_copy_datagram_const_iovec(skb, 0, iv, total, len);
+done:
total += skb->len;

tun->dev->stats.tx_packets++;
--
1.7.3-rc1
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/