Re: [RFC][PATCH] VM deadlock prevention core -v3

From: Evgeniy Polyakov
Date: Thu Aug 10 2006 - 10:01:07 EST


On Thu, Aug 10, 2006 at 03:32:49PM +0200, Peter Zijlstra (a.p.zijlstra@xxxxxxxxx) wrote:
> Hi,

Hello, Peter.

> So I try again, please tell me if I'm still on crack and should go detox.
> However if you do so, I kindly request some words on the how and why of it.

I think you should talk to a doctor in that case, not to kernel
hackers :)

I have some comments about the implementation, not the overall design,
since our points of view there are somewhat diametrically opposed.

...

> --- linux-2.6.orig/net/core/skbuff.c
> +++ linux-2.6/net/core/skbuff.c
> @@ -43,6 +43,7 @@
> #include <linux/kernel.h>
> #include <linux/sched.h>
> #include <linux/mm.h>
> +#include <linux/pagemap.h>
> #include <linux/interrupt.h>
> #include <linux/in.h>
> #include <linux/inet.h>
> @@ -125,6 +126,8 @@ EXPORT_SYMBOL(skb_truesize_bug);
> *
> */
>
> +#define ceiling_log2(x) fls((x) - 1)
> +
> /**
> * __alloc_skb - allocate a network buffer
> * @size: size to allocate
> @@ -147,6 +150,59 @@ struct sk_buff *__alloc_skb(unsigned int
> struct sk_buff *skb;
> u8 *data;
>
> + size = SKB_DATA_ALIGN(size);
> +
> + if (gfp_mask & __GFP_MEMALLOC) {
> + /*
> + * Fallback allocation for memalloc reserves.
> + *
> + * the page is populated like so:
> + *
> + * struct sk_buff
> + * [ struct sk_buff ]
> + * [ atomic_t ]
> + * unsigned int
> + * struct skb_shared_info
> + * char []
> + *
> + * We have to do higher order allocations for icky jumbo
> + * frame drivers :-(. They really should be migrated to
> + * scather/gather DMA and use skb fragments.
> + */
> + unsigned int data_offset =
> + sizeof(struct sk_buff) + sizeof(unsigned int);
> + unsigned long length = size + data_offset +
> + sizeof(struct skb_shared_info);
> + unsigned int pages;
> + unsigned int order;
> + struct page *page;
> + void *kaddr;
> +
> + /*
> + * Force fclone alloc in order to fudge a lacking in skb_clone().
> + */
> + fclone = 1;
> + if (fclone) {
> + data_offset += sizeof(struct sk_buff) + sizeof(atomic_t);
> + length += sizeof(struct sk_buff) + sizeof(atomic_t);
> + }
> + pages = (length + PAGE_SIZE - 1) >> PAGE_SHIFT;
> + order = ceiling_log2(pages);
> + skb = NULL;
> + if (!(page = alloc_pages(gfp_mask & ~__GFP_HIGHMEM, order)))
> + goto out;
> +
> + kaddr = pfn_to_kaddr(page_to_pfn(page));
> + skb = (struct sk_buff *)kaddr;
> +
> + *((unsigned int *)(kaddr + data_offset -
> + sizeof(unsigned int))) = order;
> + data = (u8 *)(kaddr + data_offset);
> +

Tricky, but since you are using your own allocator here, you could make it
less aggressive - i.e. not round the size up to a power-of-two number of
pages.
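
Something along these lines (completely untested, and assuming split_page()
can be used from here) would at least give the unneeded tail pages back, so
a jumbo frame does not burn a full power-of-two order from the reserve:

	unsigned int i;

	pages = (length + PAGE_SIZE - 1) >> PAGE_SHIFT;
	order = ceiling_log2(pages);
	page = alloc_pages(gfp_mask & ~__GFP_HIGHMEM, order);
	if (!page)
		goto out;

	/* Hand back the pages we do not actually need.  The free path
	 * then has to remember 'pages' instead of 'order' and free the
	 * remaining pages one by one. */
	split_page(page, order);
	for (i = pages; i < (1 << order); i++)
		__free_page(page + i);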

> + goto allocated;
> + }
> +
> cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
>
> /* Get the HEAD */
> @@ -155,12 +211,13 @@ struct sk_buff *__alloc_skb(unsigned int
> goto out;
>
> /* Get the DATA. Size must match skb_add_mtu(). */
> - size = SKB_DATA_ALIGN(size);

Bad sign - the special case is starting to reshape the common path.

> data = ____kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
> if (!data)
> goto nodata;
>
> +struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
> + unsigned length, gfp_t gfp_mask)
> +{
> + struct sk_buff *skb;
> +
> + WARN_ON(gfp_mask & (__GFP_NOMEMALLOC | __GFP_MEMALLOC));
> + gfp_mask &= ~(__GFP_NOMEMALLOC | __GFP_MEMALLOC);
> +
> + skb = ___netdev_alloc_skb(dev, length, gfp_mask | __GFP_NOMEMALLOC);
> + if (skb)
> + goto done;
> +
> + if (atomic_read(&dev->rx_reserve_used) >=
> + dev->rx_reserve * dev->memalloc_socks)
> + goto out;
> +
> + /*
> + * pre-inc guards against a race with netdev_wait_memalloc()
> + */
> + atomic_inc(&dev->rx_reserve_used);
> + skb = ___netdev_alloc_skb(dev, length, gfp_mask | __GFP_MEMALLOC);
> + if (unlikely(!skb)) {
> + atomic_dec(&dev->rx_reserve_used);
> + goto out;
> + }

Since you have added an atomic operation in that path anyway, you could
take a reference on the device instead, and then not have to care that it
can disappear.
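
Roughly (untested sketch), in the __GFP_MEMALLOC branch above:

	dev_hold(dev);
	skb = ___netdev_alloc_skb(dev, length, gfp_mask | __GFP_MEMALLOC);
	if (unlikely(!skb)) {
		dev_put(dev);
		goto out;
	}
	/* ... and a matching dev_put() when the skb is freed, instead of
	 * having netdev_wait_memalloc() poll rx_reserve_used. */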

> +done:
> + skb->dev = dev;
> +out:
> + return skb;
> +}
> +
> static void skb_drop_list(struct sk_buff **listp)
> {
> struct sk_buff *list = *listp;
> @@ -313,10 +417,35 @@ static void skb_release_data(struct sk_b
> if (skb_shinfo(skb)->frag_list)
> skb_drop_fraglist(skb);
>
> - kfree(skb->head);
> + if (!skb->memalloc)
> + kfree(skb->head);
> + skb->head = NULL;
> }
> }
>
> +/**
> + * free_skb_pages - frees a memalloced skbuff
> + * @cache: fake %kmem_cache argument
> + * @objp: %sk_buff pointer
> + *
> + * Function is made to look like %kmem_cache_free so we can easily
> + * substitue the free function in %kfree_skbmem.
> + */
> +static void free_skb_pages(struct kmem_cache *cache, void *objp)
> +{
> + struct sk_buff *skb = (struct sk_buff *)objp;
> + /*
> + * The input_dev is the initial input device;
> + * we have it pinned by virtue of rx_reserve_used not being zero.
> + */
> + struct net_device *dev = skb->input_dev ?: skb->dev;
> + unsigned int order =
> + *(unsigned int *)(skb->head - sizeof(unsigned int));
> + if (!skb->head)
> + atomic_dec(&dev->rx_reserve_used);
> + free_pages((unsigned long)skb, order);
> +}
> +
> /*
> * Free an skbuff by memory without cleaning the state.
> */
> @@ -324,17 +453,21 @@ void kfree_skbmem(struct sk_buff *skb)
> {
> struct sk_buff *other;
> atomic_t *fclone_ref;
> + void (*free_skb)(struct kmem_cache *, void *);
>
> skb_release_data(skb);
> +
> + free_skb = skb->memalloc ? free_skb_pages : kmem_cache_free;
> +
> switch (skb->fclone) {
> case SKB_FCLONE_UNAVAILABLE:
> - kmem_cache_free(skbuff_head_cache, skb);
> + free_skb(skbuff_head_cache, skb);
> break;
>
> case SKB_FCLONE_ORIG:
> fclone_ref = (atomic_t *) (skb + 2);
> if (atomic_dec_and_test(fclone_ref))
> - kmem_cache_free(skbuff_fclone_cache, skb);
> + free_skb(skbuff_fclone_cache, skb);
> break;
>
> case SKB_FCLONE_CLONE:
> @@ -347,7 +480,7 @@ void kfree_skbmem(struct sk_buff *skb)
> skb->fclone = SKB_FCLONE_UNAVAILABLE;
>
> if (atomic_dec_and_test(fclone_ref))
> - kmem_cache_free(skbuff_fclone_cache, other);
> + free_skb(skbuff_fclone_cache, other);
> break;
> };
> }
> @@ -434,6 +567,12 @@ struct sk_buff *skb_clone(struct sk_buff
> n->fclone = SKB_FCLONE_CLONE;
> atomic_inc(fclone_ref);
> } else {
> + /*
> + * should we special-case skb->memalloc cloning?
> + * for now fudge it by forcing fast-clone alloc.
> + */
> + BUG_ON(skb->memalloc);
> +
> n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
> if (!n)
> return NULL;

Ugh... cloning is one of the giant's shoulders the Linux network
stack is standing on...

> @@ -686,6 +825,8 @@ int pskb_expand_head(struct sk_buff *skb
> if (skb_shared(skb))
> BUG();
>
> + BUG_ON(skb->memalloc);
> +
> size = SKB_DATA_ALIGN(size);
>
> data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);

And that is a bug.
That operation can happen even during normal receive processing.
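
For example, anything on the receive path that has to un-share the skb or
grow its header (netfilter mangling, header pushes, ...) goes through
skb_cow(), which is roughly (simplified):

	static inline int skb_cow(struct sk_buff *skb, unsigned int headroom)
	{
		int delta = headroom - skb_headroom(skb);

		if (delta > 0 || skb_cloned(skb))
			return pskb_expand_head(skb, delta > 0 ? delta : 0,
						0, GFP_ATOMIC);
		return 0;
	}

so a memalloc skb will hit that BUG_ON() sooner or later.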

> +/**
> + * dev_adjust_memalloc - adjust the global memalloc reserve for this device
> + * @dev: device that has memalloc demands
> + * @nr_socks: number of new %SOCK_MEMALLOC sockets
> + *
> + * This function adjusts the memalloc reserve based on device
> + * demand. For each %SOCK_MEMALLOC socket this device will reserve
> + * 2 * %MAX_PHYS_SEGMENTS pages for outbound traffic (assumption:
> + * each %SOCK_MEMALLOC socket will have a %request_queue associated)
> + * and @dev->rx_reserve mtu pages.
> + */
> +int dev_adjust_memalloc(struct net_device *dev, int nr_socks)
> +{
> + unsigned long flags;
> + unsigned long reserve;
> + int err;
> +
> + spin_lock_irqsave(&dev->memalloc_lock, flags);
> +
> + dev->memalloc_socks += nr_socks;
> + BUG_ON(dev->memalloc_socks < 0);
> +
> + reserve = dev->memalloc_socks *
> + (2 * MAX_PHYS_SEGMENTS + /* outbound */
> + dev->rx_reserve * mtu_pages(dev->mtu)); /* inbound */
> +
> + err = adjust_memalloc_reserve(reserve - dev->memalloc_reserve);
> + if (err) {
> + printk(KERN_WARNING
> + "%s: Unable to change RX reserve to: %lu, error: %d\n",
> + dev->name, reserve, err);
> + goto unlock;
> + }
> + dev->memalloc_reserve = reserve;
> +
> +unlock:
> + spin_unlock_irqrestore(&dev->memalloc_lock, flags);
> + return err;
> +}
> +EXPORT_SYMBOL_GPL(dev_adjust_memalloc);
>
> /*
> * Device change register/unregister. These are not inline or static
> @@ -2464,6 +2527,9 @@ int dev_set_mtu(struct net_device *dev,
> err = dev->change_mtu(dev, new_mtu);
> else
> dev->mtu = new_mtu;
> +
> + dev_adjust_memalloc(dev, 0);
> +
> if (!err && dev->flags & IFF_UP)
> raw_notifier_call_chain(&netdev_chain,
> NETDEV_CHANGEMTU, dev);
> @@ -2900,6 +2966,7 @@ int register_netdevice(struct net_device
> #ifdef CONFIG_NET_CLS_ACT
> spin_lock_init(&dev->ingress_lock);
> #endif
> + spin_lock_init(&dev->memalloc_lock);
>
> ret = alloc_divert_blk(dev);
> if (ret)
> @@ -3106,6 +3173,28 @@ static void netdev_wait_allrefs(struct n
> }
> }
>
> +/* netdev_wait_memalloc - wait for all outstanding memalloc skbs
> + *
> + * This is called when unregistering network devices.
> + *
> + * Better make sure the skb -> dev mapping is correct, if we leak
> + * some this will stall forever.
> + */
> +static void netdev_wait_memalloc(struct net_device *dev)
> +{
> + unsigned long warning_time = jiffies;
> + while (atomic_read(&dev->rx_reserve_used) != 0) {
> + msleep(250);
> + if (time_after(jiffies, warning_time + 10 * HZ)) {
> + printk(KERN_EMERG "netdev_wait_memalloc: "
> + "waiting for %s to become free. SKB "
> + "count = %d\n",
> + dev->name, atomic_read(&dev->rx_reserve_used));
> + warning_time = jiffies;
> + }
> + }
> +}
> +
> /* The sequence is:
> *
> * rtnl_lock();
> @@ -3165,6 +3254,14 @@ void netdev_run_todo(void)
>
> netdev_wait_allrefs(dev);
>
> + netdev_wait_memalloc(dev);
> +
> + /* Get rid of any SOCK_MEMALLOC reserves. */
> + if (dev->memalloc_reserve) {
> + BUG_ON(!dev->memalloc_socks);
> + dev_adjust_memalloc(dev, -dev->memalloc_socks);
> + }
> +
> /* paranoia */
> BUG_ON(atomic_read(&dev->refcnt));
> BUG_TRAP(!dev->ip_ptr);
> Index: linux-2.6/net/ipv4/af_inet.c
> ===================================================================
> --- linux-2.6.orig/net/ipv4/af_inet.c
> +++ linux-2.6/net/ipv4/af_inet.c
> @@ -132,6 +132,14 @@ void inet_sock_destruct(struct sock *sk)
> {
> struct inet_sock *inet = inet_sk(sk);
>
> + if (sk_is_memalloc(sk)) {
> + struct net_device *dev = ip_dev_find(inet->rcv_saddr);
> + if (dev) {
> + dev_adjust_memalloc(dev, -1);
> + dev_put(dev);
> + }
> + }
> +

This looks very strange - you decrement the same counter both in the socket
destruction code and in the netdevice destruction code.
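
I.e. from this patch:

	inet_sock_destruct():	dev_adjust_memalloc(dev, -1);
	netdev_run_todo():	dev_adjust_memalloc(dev, -dev->memalloc_socks);

and the first one additionally relies on ip_dev_find(inet->rcv_saddr)
returning the same device the reserve was originally charged to.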

--
Evgeniy Polyakov