[RFC PATCH] net, tun: remove the flow cache
From: Zhi Yong Wu
Date: Tue Dec 17 2013 - 02:26:43 EST
From: Zhi Yong Wu <wuzhy@xxxxxxxxxxxxxxxxxx>
The flow cache is an extremely broken concept: it can grow without bound
and is vulnerable to DoS attacks. This patch removes it from the tuntap
driver and instead uses a simpler mechanism for its flow control.
Signed-off-by: Zhi Yong Wu <wuzhy@xxxxxxxxxxxxxxxxxx>
---
drivers/net/tun.c | 208 +++--------------------------------------------------
1 files changed, 10 insertions(+), 198 deletions(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 7c8343a..7c27fdc 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -32,12 +32,15 @@
*
* Daniel Podlejski <underley@xxxxxxxxxxxxxxx>
* Modifications for 2.3.99-pre5 kernel.
+ *
+ * Zhi Yong Wu <wuzhy@xxxxxxxxxxxxxxxxxx>
+ * Remove the flow cache.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#define DRV_NAME "tun"
-#define DRV_VERSION "1.6"
+#define DRV_VERSION "1.7"
#define DRV_DESCRIPTION "Universal TUN/TAP device driver"
#define DRV_COPYRIGHT "(C) 1999-2004 Max Krasnyansky <maxk@xxxxxxxxxxxx>"
@@ -146,18 +149,6 @@ struct tun_file {
struct tun_struct *detached;
};
-struct tun_flow_entry {
- struct hlist_node hash_link;
- struct rcu_head rcu;
- struct tun_struct *tun;
-
- u32 rxhash;
- int queue_index;
- unsigned long updated;
-};
-
-#define TUN_NUM_FLOW_ENTRIES 1024
-
/* Since the socket were moved to tun_file, to preserve the behavior of persist
* device, socket filter, sndbuf and vnet header size were restore when the
* file were attached to a persist device.
@@ -184,163 +175,11 @@ struct tun_struct {
int debug;
#endif
spinlock_t lock;
- struct hlist_head flows[TUN_NUM_FLOW_ENTRIES];
- struct timer_list flow_gc_timer;
- unsigned long ageing_time;
unsigned int numdisabled;
struct list_head disabled;
void *security;
- u32 flow_count;
};
-static inline u32 tun_hashfn(u32 rxhash)
-{
- return rxhash & 0x3ff;
-}
-
-static struct tun_flow_entry *tun_flow_find(struct hlist_head *head, u32 rxhash)
-{
- struct tun_flow_entry *e;
-
- hlist_for_each_entry_rcu(e, head, hash_link) {
- if (e->rxhash == rxhash)
- return e;
- }
- return NULL;
-}
-
-static struct tun_flow_entry *tun_flow_create(struct tun_struct *tun,
- struct hlist_head *head,
- u32 rxhash, u16 queue_index)
-{
- struct tun_flow_entry *e = kmalloc(sizeof(*e), GFP_ATOMIC);
-
- if (e) {
- tun_debug(KERN_INFO, tun, "create flow: hash %u index %u\n",
- rxhash, queue_index);
- e->updated = jiffies;
- e->rxhash = rxhash;
- e->queue_index = queue_index;
- e->tun = tun;
- hlist_add_head_rcu(&e->hash_link, head);
- ++tun->flow_count;
- }
- return e;
-}
-
-static void tun_flow_delete(struct tun_struct *tun, struct tun_flow_entry *e)
-{
- tun_debug(KERN_INFO, tun, "delete flow: hash %u index %u\n",
- e->rxhash, e->queue_index);
- hlist_del_rcu(&e->hash_link);
- kfree_rcu(e, rcu);
- --tun->flow_count;
-}
-
-static void tun_flow_flush(struct tun_struct *tun)
-{
- int i;
-
- spin_lock_bh(&tun->lock);
- for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) {
- struct tun_flow_entry *e;
- struct hlist_node *n;
-
- hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link)
- tun_flow_delete(tun, e);
- }
- spin_unlock_bh(&tun->lock);
-}
-
-static void tun_flow_delete_by_queue(struct tun_struct *tun, u16 queue_index)
-{
- int i;
-
- spin_lock_bh(&tun->lock);
- for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) {
- struct tun_flow_entry *e;
- struct hlist_node *n;
-
- hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) {
- if (e->queue_index == queue_index)
- tun_flow_delete(tun, e);
- }
- }
- spin_unlock_bh(&tun->lock);
-}
-
-static void tun_flow_cleanup(unsigned long data)
-{
- struct tun_struct *tun = (struct tun_struct *)data;
- unsigned long delay = tun->ageing_time;
- unsigned long next_timer = jiffies + delay;
- unsigned long count = 0;
- int i;
-
- tun_debug(KERN_INFO, tun, "tun_flow_cleanup\n");
-
- spin_lock_bh(&tun->lock);
- for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) {
- struct tun_flow_entry *e;
- struct hlist_node *n;
-
- hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) {
- unsigned long this_timer;
- count++;
- this_timer = e->updated + delay;
- if (time_before_eq(this_timer, jiffies))
- tun_flow_delete(tun, e);
- else if (time_before(this_timer, next_timer))
- next_timer = this_timer;
- }
- }
-
- if (count)
- mod_timer(&tun->flow_gc_timer, round_jiffies_up(next_timer));
- spin_unlock_bh(&tun->lock);
-}
-
-static void tun_flow_update(struct tun_struct *tun, u32 rxhash,
- struct tun_file *tfile)
-{
- struct hlist_head *head;
- struct tun_flow_entry *e;
- unsigned long delay = tun->ageing_time;
- u16 queue_index = tfile->queue_index;
-
- if (!rxhash)
- return;
- else
- head = &tun->flows[tun_hashfn(rxhash)];
-
- rcu_read_lock();
-
- /* We may get a very small possibility of OOO during switching, not
- * worth to optimize.*/
- if (tun->numqueues == 1 || tfile->detached)
- goto unlock;
-
- e = tun_flow_find(head, rxhash);
- if (likely(e)) {
- /* TODO: keep queueing to old queue until it's empty? */
- e->queue_index = queue_index;
- e->updated = jiffies;
- } else {
- spin_lock_bh(&tun->lock);
- if (!tun_flow_find(head, rxhash) &&
- tun->flow_count < MAX_TAP_FLOWS)
- tun_flow_create(tun, head, rxhash, queue_index);
-
- if (!timer_pending(&tun->flow_gc_timer))
- mod_timer(&tun->flow_gc_timer,
- round_jiffies_up(jiffies + delay));
- spin_unlock_bh(&tun->lock);
- }
-
-unlock:
- rcu_read_unlock();
-}
-
/* We try to identify a flow through its rxhash first. The reason that
* we do not check rxq no. is becuase some cards(e.g 82599), chooses
* the rxq based on the txq where the last packet of the flow comes. As
@@ -351,7 +190,6 @@ unlock:
static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb)
{
struct tun_struct *tun = netdev_priv(dev);
- struct tun_flow_entry *e;
u32 txq = 0;
u32 numqueues = 0;
@@ -360,12 +198,11 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb)
txq = skb_get_rxhash(skb);
if (txq) {
- e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq);
- if (e)
- txq = e->queue_index;
- else
+ txq = skb_get_queue_mapping(skb);
+ if (!txq) {
/* use multiply and shift instead of expensive divide */
txq = ((u64)txq * numqueues) >> 32;
+ }
} else if (likely(skb_rx_queue_recorded(skb))) {
txq = skb_get_rx_queue(skb);
while (unlikely(txq >= numqueues))
@@ -439,7 +276,6 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
tun_disable_queue(tun, tfile);
synchronize_net();
- tun_flow_delete_by_queue(tun, tun->numqueues + 1);
/* Drop read queue */
tun_queue_purge(tfile);
tun_set_real_num_queues(tun);
@@ -858,25 +694,6 @@ static const struct net_device_ops tap_netdev_ops = {
#endif
};
-static void tun_flow_init(struct tun_struct *tun)
-{
- int i;
-
- for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++)
- INIT_HLIST_HEAD(&tun->flows[i]);
-
- tun->ageing_time = TUN_FLOW_EXPIRE;
- setup_timer(&tun->flow_gc_timer, tun_flow_cleanup, (unsigned long)tun);
- mod_timer(&tun->flow_gc_timer,
- round_jiffies_up(jiffies + tun->ageing_time));
-}
-
-static void tun_flow_uninit(struct tun_struct *tun)
-{
- del_timer_sync(&tun->flow_gc_timer);
- tun_flow_flush(tun);
-}
-
/* Initialize net device. */
static void tun_net_init(struct net_device *dev)
{
@@ -986,7 +803,6 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
int copylen;
bool zerocopy = false;
int err;
- u32 rxhash;
if (!(tun->flags & TUN_NO_PI)) {
if (len < sizeof(pi))
@@ -1146,13 +962,12 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
skb_reset_network_header(skb);
skb_probe_transport_header(skb, 0);
- rxhash = skb_get_rxhash(skb);
+ skb_set_queue_mapping(skb, tfile->queue_index);
netif_rx_ni(skb);
tun->dev->stats.rx_packets++;
tun->dev->stats.rx_bytes += len;
- tun_flow_update(tun, rxhash, tfile);
return total_len;
}
@@ -1370,7 +1185,6 @@ static void tun_free_netdev(struct net_device *dev)
struct tun_struct *tun = netdev_priv(dev);
BUG_ON(!(list_empty(&tun->disabled)));
- tun_flow_uninit(tun);
security_tun_dev_free_security(tun->security);
free_netdev(dev);
}
@@ -1644,7 +1458,6 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
goto err_free_dev;
tun_net_init(dev);
- tun_flow_init(tun);
dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST |
TUN_USER_FEATURES | NETIF_F_HW_VLAN_CTAG_TX |
@@ -1655,7 +1468,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
INIT_LIST_HEAD(&tun->disabled);
err = tun_attach(tun, file, false);
if (err < 0)
- goto err_free_flow;
+ goto err_free_security;
err = register_netdevice(tun->dev);
if (err < 0)
@@ -1705,8 +1518,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
err_detach:
tun_detach_all(dev);
-err_free_flow:
- tun_flow_uninit(tun);
+err_free_security:
security_tun_dev_free_security(tun->security);
err_free_dev:
free_netdev(dev);
--
1.7.6.5
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/