Re: [RFC] Use RCU for tcp_ehash lookup

From: Srivatsa Vaddagiri
Date: Wed Sep 01 2004 - 06:36:48 EST


On Tue, Aug 31, 2004 at 03:54:20PM +0200, Andi Kleen wrote:
> I bet also when you just do rdtsc timing for the TCP receive
> path the cycle numbers will be way down (excluding the copy).

I got cycle numbers for the lookup routine (with CONFIG_PREEMPT turned off).
They were taken on a 900MHz 8-way Intel P3 SMP box. The results are below:


-------------------------------------------------------------------------------
                             |       2.6.8.1        |  2.6.8.1 + my patch
-------------------------------------------------------------------------------
Average cycles spent in      |       2970.65        |       668.227
__tcp_v4_lookup_established  | (~3.3 microseconds)  | (~0.74 microseconds)
-------------------------------------------------------------------------------

This represents a reduction of roughly 77.5% in cycles spent in the lookup
(about a 4.4x speedup)!
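
Averages like these can be collected with something as simple as TSC
instrumentation around the lookup. A minimal sketch of that kind of
measurement is below; the wrapper and counter names are hypothetical, are not
part of the patch, and the plain global counters are not SMP-exact (which is
fine for a rough average):

/* Illustrative only: time __tcp_v4_lookup_established() with the TSC.
 * rdtscll() comes from <asm/msr.h> on i386. The accumulators are plain
 * globals, so this gives a rough, not SMP-exact, average.
 */
#include <asm/msr.h>

static unsigned long long lookup_cycles;	/* total cycles observed */
static unsigned long lookup_calls;		/* number of timed lookups */

static inline struct sock *timed_lookup(u32 saddr, u16 sport,
					u32 daddr, u16 hnum, int dif)
{
	unsigned long long before, after;
	struct sock *sk;

	rdtscll(before);
	sk = __tcp_v4_lookup_established(saddr, sport, daddr, hnum, dif);
	rdtscll(after);

	lookup_cycles += after - before;
	lookup_calls++;		/* average = lookup_cycles / lookup_calls */

	return sk;
}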


>
> And it should also fix the performance problems with
> cat /proc/net/tcp on ppc64/ia64 for large hash tables because the rw locks
> are gone.

But spinlocks are in! Would that still improve performance compared to rw
locks? (See my earlier note, where I explained that the lookup done for
/proc/net/tcp is _not_ lock-free yet.)
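
To be clear about what _is_ lock-free after the patch: the established-hash
lookup on the receive path drops the bucket lock entirely and relies on RCU,
roughly as in the sketch below (a simplified view of the
__tcp_v4_lookup_established hunk further down; tcp_match() is a hypothetical
stand-in for TCP_IPV4_MATCH(), not a real helper). The /proc/net/tcp and
tcpdiag walks keep the per-bucket spinlock so that a socket being moved by
__tcp_tw_hashdance is not reported twice.

/* Simplified view of the lock-free lookup; see the real hunk in
 * net/ipv4/tcp_ipv4.c below. tcp_match() is a hypothetical stand-in
 * for TCP_IPV4_MATCH().
 */
static struct sock *rcu_lookup_sketch(struct tcp_ehash_bucket *head,
				      int (*tcp_match)(struct sock *sk))
{
	struct sock *sk;
	struct hlist_node *node;

	rcu_read_lock();			/* no bucket lock taken */
	sk_for_each(sk, node, &head->chain) {	/* hlist_for_each_entry_rcu */
		if (tcp_match(sk)) {
			sock_hold(sk);	/* pin it before leaving the RCU
					 * read-side critical section */
			rcu_read_unlock();
			return sk;
		}
	}
	rcu_read_unlock();
	return NULL;
}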

> I haven't studied it in detail (yet), just two minor style
> comments:

[snip]

> Can you rewrite that without goto?

[snip]

> And that too.

I have avoided the gotos in the updated patch below.

Thanks!!


---

linux-2.6.8.1-vatsa/include/net/sock.h | 22 +++++++++--
linux-2.6.8.1-vatsa/include/net/tcp.h | 24 +++++++++---
linux-2.6.8.1-vatsa/net/core/sock.c | 11 +++++
linux-2.6.8.1-vatsa/net/ipv4/tcp.c | 2 -
linux-2.6.8.1-vatsa/net/ipv4/tcp_diag.c | 11 +++--
linux-2.6.8.1-vatsa/net/ipv4/tcp_ipv4.c | 50 ++++++++++++++++-----------
linux-2.6.8.1-vatsa/net/ipv4/tcp_minisocks.c | 47 ++++++++++++++++++++-----
linux-2.6.8.1-vatsa/net/ipv6/tcp_ipv6.c | 22 +++++++----
8 files changed, 135 insertions(+), 54 deletions(-)

diff -puN include/net/sock.h~tcp_ehash include/net/sock.h
--- linux-2.6.8.1/include/net/sock.h~tcp_ehash 2004-08-25 18:06:42.000000000 +0530
+++ linux-2.6.8.1-vatsa/include/net/sock.h 2004-09-01 10:09:43.000000000 +0530
@@ -50,6 +50,7 @@
#include <linux/security.h>

#include <linux/filter.h>
+#include <linux/rcupdate.h>

#include <asm/atomic.h>
#include <net/dst.h>
@@ -178,6 +179,7 @@ struct sock_common {
* @sk_error_report - callback to indicate errors (e.g. %MSG_ERRQUEUE)
* @sk_backlog_rcv - callback to process the backlog
* @sk_destruct - called at sock freeing time, i.e. when all refcnt == 0
+ * @sk_rcu - RCU callback structure
*/
struct sock {
/*
@@ -266,6 +268,7 @@ struct sock {
int (*sk_backlog_rcv)(struct sock *sk,
struct sk_buff *skb);
void (*sk_destruct)(struct sock *sk);
+ struct rcu_head sk_rcu;
};

/*
@@ -350,7 +353,7 @@ static __inline__ int sk_del_node_init(s

static __inline__ void __sk_add_node(struct sock *sk, struct hlist_head *list)
{
- hlist_add_head(&sk->sk_node, list);
+ hlist_add_head_rcu(&sk->sk_node, list);
}

static __inline__ void sk_add_node(struct sock *sk, struct hlist_head *list)
@@ -371,7 +374,7 @@ static __inline__ void sk_add_bind_node(
}

#define sk_for_each(__sk, node, list) \
- hlist_for_each_entry(__sk, node, list, sk_node)
+ hlist_for_each_entry_rcu(__sk, node, list, sk_node)
#define sk_for_each_from(__sk, node) \
if (__sk && ({ node = &(__sk)->sk_node; 1; })) \
hlist_for_each_entry_from(__sk, node, sk_node)
@@ -703,6 +706,7 @@ extern void FASTCALL(release_sock(struct
extern struct sock * sk_alloc(int family, int priority, int zero_it,
kmem_cache_t *slab);
extern void sk_free(struct sock *sk);
+extern void sk_free_rcu(struct rcu_head *head);

extern struct sk_buff *sock_wmalloc(struct sock *sk,
unsigned long size, int force,
@@ -888,8 +892,18 @@ static inline void sk_filter_charge(stru
/* Ungrab socket and destroy it, if it was the last reference. */
static inline void sock_put(struct sock *sk)
{
- if (atomic_dec_and_test(&sk->sk_refcnt))
- sk_free(sk);
+ while (atomic_dec_and_test(&sk->sk_refcnt)) {
+ /* Restore ref count and schedule callback.
+ * If we don't restore ref count, then the callback can be
+ * scheduled by more than one CPU.
+ */
+ atomic_inc(&sk->sk_refcnt);
+
+ if (atomic_read(&sk->sk_refcnt) == 1) {
+ call_rcu(&sk->sk_rcu, sk_free_rcu);
+ break;
+ }
+ }
}

/* Detach socket from process context.
diff -puN include/net/tcp.h~tcp_ehash include/net/tcp.h
--- linux-2.6.8.1/include/net/tcp.h~tcp_ehash 2004-08-25 18:06:42.000000000 +0530
+++ linux-2.6.8.1-vatsa/include/net/tcp.h 2004-09-01 10:13:40.000000000 +0530
@@ -30,6 +30,7 @@
#include <linux/slab.h>
#include <linux/cache.h>
#include <linux/percpu.h>
+#include <linux/rcupdate.h>
#include <net/checksum.h>
#include <net/sock.h>
#include <net/snmp.h>
@@ -44,7 +45,7 @@
* for the rest. I'll experiment with dynamic table growth later.
*/
struct tcp_ehash_bucket {
- rwlock_t lock;
+ spinlock_t lock;
struct hlist_head chain;
} __attribute__((__aligned__(8)));

@@ -222,12 +223,13 @@ struct tcp_tw_bucket {
struct in6_addr tw_v6_rcv_saddr;
int tw_v6_ipv6only;
#endif
+ struct rcu_head tw_rcu;
};

static __inline__ void tw_add_node(struct tcp_tw_bucket *tw,
struct hlist_head *list)
{
- hlist_add_head(&tw->tw_node, list);
+ hlist_add_head_rcu(&tw->tw_node, list);
}

static __inline__ void tw_add_bind_node(struct tcp_tw_bucket *tw,
@@ -305,14 +307,22 @@ static inline int tcp_v6_ipv6only(const
#endif

extern kmem_cache_t *tcp_timewait_cachep;
+extern void tcp_tw_free(struct rcu_head *head);

static inline void tcp_tw_put(struct tcp_tw_bucket *tw)
{
- if (atomic_dec_and_test(&tw->tw_refcnt)) {
-#ifdef INET_REFCNT_DEBUG
- printk(KERN_DEBUG "tw_bucket %p released\n", tw);
-#endif
- kmem_cache_free(tcp_timewait_cachep, tw);
+ while (atomic_dec_and_test(&tw->tw_refcnt)) {
+ /* Restore ref count and schedule callback.
+ * If we don't restore ref count, then the callback can be
+ * scheduled by more than one CPU.
+ */
+
+ atomic_inc(&tw->tw_refcnt);
+
+ if (atomic_read(&tw->tw_refcnt) == 1) {
+ call_rcu(&tw->tw_rcu, tcp_tw_free);
+ break;
+ }
}
}

diff -puN net/core/sock.c~tcp_ehash net/core/sock.c
--- linux-2.6.8.1/net/core/sock.c~tcp_ehash 2004-08-25 18:06:42.000000000 +0530
+++ linux-2.6.8.1-vatsa/net/core/sock.c 2004-08-26 16:53:14.000000000 +0530
@@ -657,6 +657,16 @@ void sk_free(struct sock *sk)
module_put(owner);
}

+/* RCU callback to free a socket */
+
+void sk_free_rcu(struct rcu_head *head)
+{
+ struct sock *sk = container_of(head, struct sock, sk_rcu);
+
+ if (atomic_dec_and_test(&sk->sk_refcnt))
+ sk_free(sk);
+}
+
void __init sk_init(void)
{
sk_cachep = kmem_cache_create("sock", sizeof(struct sock), 0,
@@ -1347,6 +1357,7 @@ EXPORT_SYMBOL(__lock_sock);
EXPORT_SYMBOL(__release_sock);
EXPORT_SYMBOL(sk_alloc);
EXPORT_SYMBOL(sk_free);
+EXPORT_SYMBOL(sk_free_rcu);
EXPORT_SYMBOL(sk_send_sigurg);
EXPORT_SYMBOL(sock_alloc_send_pskb);
EXPORT_SYMBOL(sock_alloc_send_skb);
diff -puN net/ipv4/tcp_ipv4.c~tcp_ehash net/ipv4/tcp_ipv4.c
--- linux-2.6.8.1/net/ipv4/tcp_ipv4.c~tcp_ehash 2004-08-25 18:06:42.000000000 +0530
+++ linux-2.6.8.1-vatsa/net/ipv4/tcp_ipv4.c 2004-08-25 18:07:27.000000000 +0530
@@ -351,7 +351,8 @@ void tcp_listen_wlock(void)
static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
{
struct hlist_head *list;
- rwlock_t *lock;
+ rwlock_t *lock = NULL;
+ spinlock_t *slock = NULL;

BUG_TRAP(sk_unhashed(sk));
if (listen_possible && sk->sk_state == TCP_LISTEN) {
@@ -360,14 +361,16 @@ static __inline__ void __tcp_v4_hash(str
tcp_listen_wlock();
} else {
list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain;
- lock = &tcp_ehash[sk->sk_hashent].lock;
- write_lock(lock);
+ slock = &tcp_ehash[sk->sk_hashent].lock;
+ spin_lock(slock);
}
__sk_add_node(sk, list);
sock_prot_inc_use(sk->sk_prot);
- write_unlock(lock);
- if (listen_possible && sk->sk_state == TCP_LISTEN)
+ if (listen_possible && sk->sk_state == TCP_LISTEN) {
+ write_unlock(lock);
wake_up(&tcp_lhash_wait);
+ } else
+ spin_unlock(slock);
}

static void tcp_v4_hash(struct sock *sk)
@@ -381,7 +384,8 @@ static void tcp_v4_hash(struct sock *sk)

void tcp_unhash(struct sock *sk)
{
- rwlock_t *lock;
+ rwlock_t *lock = NULL;
+ spinlock_t *slock = NULL;

if (sk_unhashed(sk))
goto ende;
@@ -392,17 +396,20 @@ void tcp_unhash(struct sock *sk)
lock = &tcp_lhash_lock;
} else {
struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
- lock = &head->lock;
- write_lock_bh(&head->lock);
+ slock = &head->lock;
+ spin_lock_bh(&head->lock);
}

if (__sk_del_node_init(sk))
sock_prot_dec_use(sk->sk_prot);
- write_unlock_bh(lock);

+ if (sk->sk_state != TCP_LISTEN)
+ spin_unlock_bh(slock);
+ else {
+ write_unlock_bh(lock);
ende:
- if (sk->sk_state == TCP_LISTEN)
wake_up(&tcp_lhash_wait);
+ }
}

/* Don't inline this cruft. Here are some nice properties to
@@ -494,7 +501,7 @@ static inline struct sock *__tcp_v4_look
*/
int hash = tcp_hashfn(daddr, hnum, saddr, sport);
head = &tcp_ehash[hash];
- read_lock(&head->lock);
+ rcu_read_lock();
sk_for_each(sk, node, &head->chain) {
if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
goto hit; /* You sunk my battleship! */
@@ -507,7 +514,7 @@ static inline struct sock *__tcp_v4_look
}
sk = NULL;
out:
- read_unlock(&head->lock);
+ rcu_read_unlock();
return sk;
hit:
sock_hold(sk);
@@ -559,7 +566,7 @@ static int __tcp_v4_check_established(st
struct hlist_node *node;
struct tcp_tw_bucket *tw;

- write_lock(&head->lock);
+ spin_lock(&head->lock);

/* Check TIME-WAIT sockets first. */
sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
@@ -614,7 +621,7 @@ unique:
BUG_TRAP(sk_unhashed(sk));
__sk_add_node(sk, &head->chain);
sock_prot_inc_use(sk->sk_prot);
- write_unlock(&head->lock);
+ spin_unlock(&head->lock);

if (twp) {
*twp = tw;
@@ -630,7 +637,7 @@ unique:
return 0;

not_unique:
- write_unlock(&head->lock);
+ spin_unlock(&head->lock);
return -EADDRNOTAVAIL;
}

@@ -2228,7 +2235,10 @@ static void *established_get_first(struc
struct hlist_node *node;
struct tcp_tw_bucket *tw;

- read_lock(&tcp_ehash[st->bucket].lock);
+ /* Take the spinlock. Otherwise a dancing socket
+ * (__tcp_tw_hashdance) may be reported twice!
+ */
+ spin_lock(&tcp_ehash[st->bucket].lock);
sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
if (sk->sk_family != st->family) {
continue;
@@ -2245,7 +2255,7 @@ static void *established_get_first(struc
rc = tw;
goto out;
}
- read_unlock(&tcp_ehash[st->bucket].lock);
+ spin_unlock(&tcp_ehash[st->bucket].lock);
st->state = TCP_SEQ_STATE_ESTABLISHED;
}
out:
@@ -2272,10 +2282,10 @@ get_tw:
cur = tw;
goto out;
}
- read_unlock(&tcp_ehash[st->bucket].lock);
+ spin_unlock(&tcp_ehash[st->bucket].lock);
st->state = TCP_SEQ_STATE_ESTABLISHED;
if (++st->bucket < tcp_ehash_size) {
- read_lock(&tcp_ehash[st->bucket].lock);
+ spin_lock(&tcp_ehash[st->bucket].lock);
sk = sk_head(&tcp_ehash[st->bucket].chain);
} else {
cur = NULL;
@@ -2385,7 +2395,7 @@ static void tcp_seq_stop(struct seq_file
case TCP_SEQ_STATE_TIME_WAIT:
case TCP_SEQ_STATE_ESTABLISHED:
if (v)
- read_unlock(&tcp_ehash[st->bucket].lock);
+ spin_unlock(&tcp_ehash[st->bucket].lock);
local_bh_enable();
break;
}
diff -puN net/ipv4/tcp.c~tcp_ehash net/ipv4/tcp.c
--- linux-2.6.8.1/net/ipv4/tcp.c~tcp_ehash 2004-08-25 18:06:42.000000000 +0530
+++ linux-2.6.8.1-vatsa/net/ipv4/tcp.c 2004-08-25 18:07:27.000000000 +0530
@@ -2258,7 +2258,7 @@ void __init tcp_init(void)
if (!tcp_ehash)
panic("Failed to allocate TCP established hash table\n");
for (i = 0; i < (tcp_ehash_size << 1); i++) {
- tcp_ehash[i].lock = RW_LOCK_UNLOCKED;
+ tcp_ehash[i].lock = SPIN_LOCK_UNLOCKED;
INIT_HLIST_HEAD(&tcp_ehash[i].chain);
}

diff -puN net/ipv4/tcp_diag.c~tcp_ehash net/ipv4/tcp_diag.c
--- linux-2.6.8.1/net/ipv4/tcp_diag.c~tcp_ehash 2004-08-25 18:06:42.000000000 +0530
+++ linux-2.6.8.1-vatsa/net/ipv4/tcp_diag.c 2004-08-25 18:07:27.000000000 +0530
@@ -522,7 +522,10 @@ skip_listen_ht:
if (i > s_i)
s_num = 0;

- read_lock_bh(&head->lock);
+ /* Take the spinlock. Otherwise a dancing socket
+ * (__tcp_tw_hashdance) may be reported twice!
+ */
+ spin_lock_bh(&head->lock);

num = 0;
sk_for_each(sk, node, &head->chain) {
@@ -542,7 +545,7 @@ skip_listen_ht:
if (tcpdiag_fill(skb, sk, r->tcpdiag_ext,
NETLINK_CB(cb->skb).pid,
cb->nlh->nlmsg_seq) <= 0) {
- read_unlock_bh(&head->lock);
+ spin_unlock_bh(&head->lock);
goto done;
}
++num;
@@ -568,13 +571,13 @@ skip_listen_ht:
if (tcpdiag_fill(skb, sk, r->tcpdiag_ext,
NETLINK_CB(cb->skb).pid,
cb->nlh->nlmsg_seq) <= 0) {
- read_unlock_bh(&head->lock);
+ spin_unlock_bh(&head->lock);
goto done;
}
++num;
}
}
- read_unlock_bh(&head->lock);
+ spin_unlock_bh(&head->lock);
}

done:
diff -puN net/ipv4/tcp_minisocks.c~tcp_ehash net/ipv4/tcp_minisocks.c
--- linux-2.6.8.1/net/ipv4/tcp_minisocks.c~tcp_ehash 2004-08-25 18:06:42.000000000 +0530
+++ linux-2.6.8.1-vatsa/net/ipv4/tcp_minisocks.c 2004-08-26 16:58:08.000000000 +0530
@@ -64,14 +64,14 @@ static void tcp_timewait_kill(struct tcp

/* Unlink from established hashes. */
ehead = &tcp_ehash[tw->tw_hashent];
- write_lock(&ehead->lock);
+ spin_lock(&ehead->lock);
if (hlist_unhashed(&tw->tw_node)) {
- write_unlock(&ehead->lock);
+ spin_unlock(&ehead->lock);
return;
}
__hlist_del(&tw->tw_node);
sk_node_init(&tw->tw_node);
- write_unlock(&ehead->lock);
+ spin_unlock(&ehead->lock);

/* Disassociate with bind bucket. */
bhead = &tcp_bhash[tcp_bhashfn(tw->tw_num)];
@@ -308,17 +308,28 @@ static void __tcp_tw_hashdance(struct so
tw_add_bind_node(tw, &tw->tw_tb->owners);
spin_unlock(&bhead->lock);

- write_lock(&ehead->lock);
+ spin_lock(&ehead->lock);

- /* Step 2: Remove SK from established hash. */
- if (__sk_del_node_init(sk))
- sock_prot_dec_use(sk->sk_prot);
+ /*
+ * We have to be careful here since there could be racing
+ * (lock-free) lookups happening on other CPUs. If we remove SK first
+ * and then add TW, then there is a tiny window where this socket is
+ * in neither the established half nor the TIMEWAIT half of the ehash
+ * table. Lookups occurring in that window can drop packets!
+ * Hence we first add TW and then remove SK, with a barrier in between.
+ */

- /* Step 3: Hash TW into TIMEWAIT half of established hash table. */
+ /* Step 2: Hash TW into TIMEWAIT half of established hash table. */
tw_add_node(tw, &(ehead + tcp_ehash_size)->chain);
atomic_inc(&tw->tw_refcnt);

- write_unlock(&ehead->lock);
+ smp_wmb();
+
+ /* Step 3: Remove SK from established hash. */
+ if (__sk_del_node_init(sk))
+ sock_prot_dec_use(sk->sk_prot);
+
+ spin_unlock(&ehead->lock);
}

/*
@@ -1069,11 +1080,29 @@ int tcp_child_process(struct sock *paren
return ret;
}

+/* RCU callback to free a timewait bucket */
+
+void tcp_tw_free(struct rcu_head *head)
+{
+ struct tcp_tw_bucket *tw =
+ container_of(head, struct tcp_tw_bucket, tw_rcu);
+
+ if (atomic_dec_and_test(&tw->tw_refcnt)) {
+#ifdef INET_REFCNT_DEBUG
+ printk(KERN_DEBUG "tw_bucket %p released\n", tw);
+#endif
+ kmem_cache_free(tcp_timewait_cachep, tw);
+ }
+}
+
+
+
EXPORT_SYMBOL(tcp_check_req);
EXPORT_SYMBOL(tcp_child_process);
EXPORT_SYMBOL(tcp_create_openreq_child);
EXPORT_SYMBOL(tcp_timewait_state_process);
EXPORT_SYMBOL(tcp_tw_deschedule);
+EXPORT_SYMBOL(tcp_tw_free);

#ifdef CONFIG_SYSCTL
EXPORT_SYMBOL(sysctl_tcp_tw_recycle);
diff -puN net/ipv6/tcp_ipv6.c~tcp_ehash net/ipv6/tcp_ipv6.c
--- linux-2.6.8.1/net/ipv6/tcp_ipv6.c~tcp_ehash 2004-08-25 18:06:42.000000000 +0530
+++ linux-2.6.8.1-vatsa/net/ipv6/tcp_ipv6.c 2004-08-25 18:07:27.000000000 +0530
@@ -210,7 +210,8 @@ fail:
static __inline__ void __tcp_v6_hash(struct sock *sk)
{
struct hlist_head *list;
- rwlock_t *lock;
+ rwlock_t *lock = NULL;
+ spinlock_t *slock = NULL;

BUG_TRAP(sk_unhashed(sk));

@@ -221,13 +222,16 @@ static __inline__ void __tcp_v6_hash(str
} else {
sk->sk_hashent = tcp_v6_sk_hashfn(sk);
list = &tcp_ehash[sk->sk_hashent].chain;
- lock = &tcp_ehash[sk->sk_hashent].lock;
- write_lock(lock);
+ slock = &tcp_ehash[sk->sk_hashent].lock;
+ spin_lock(slock);
}

__sk_add_node(sk, list);
sock_prot_inc_use(sk->sk_prot);
- write_unlock(lock);
+ if (sk->sk_state == TCP_LISTEN)
+ write_unlock(lock);
+ else
+ spin_unlock(slock);
}


@@ -307,7 +311,7 @@ static inline struct sock *__tcp_v6_look
*/
hash = tcp_v6_hashfn(daddr, hnum, saddr, sport);
head = &tcp_ehash[hash];
- read_lock(&head->lock);
+ rcu_read_lock();
sk_for_each(sk, node, &head->chain) {
/* For IPV6 do the cheaper port and family tests first. */
if(TCP_IPV6_MATCH(sk, saddr, daddr, ports, dif))
@@ -326,12 +330,12 @@ static inline struct sock *__tcp_v6_look
goto hit;
}
}
- read_unlock(&head->lock);
+ rcu_read_unlock();
return NULL;

hit:
sock_hold(sk);
- read_unlock(&head->lock);
+ rcu_read_unlock();
return sk;
}

@@ -452,7 +456,7 @@ static int tcp_v6_check_established(stru
struct hlist_node *node;
struct tcp_tw_bucket *tw;

- write_lock_bh(&head->lock);
+ spin_lock_bh(&head->lock);

/* Check TIME-WAIT sockets first. */
sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
@@ -491,7 +495,7 @@ unique:
__sk_add_node(sk, &head->chain);
sk->sk_hashent = hash;
sock_prot_inc_use(sk->sk_prot);
- write_unlock_bh(&head->lock);
+ spin_unlock_bh(&head->lock);

if (tw) {
/* Silly. Should hash-dance instead... */

_
--


Thanks and Regards,
Srivatsa Vaddagiri,
Linux Technology Center,
IBM Software Labs,
Bangalore, INDIA - 560017