[PATCH] net/tcp: introduce TRACE_EVENT for TCP/IPv4 state transition

From: Yafang Shao
Date: Thu Nov 09 2017 - 01:01:49 EST


With this newly introduced TRACE_EVENT, it will be very easy to monitor
TCP/IPv4 state transitions.

A new TRACE_SYSTEM named tcp is added, in which we can trace other TCP
events as well.

Two helpers are added,
static inline void __tcp_set_state(struct sock *sk, int state)
static inline void __sk_state_store(struct sock *sk, int newstate)

When changing the TCP/IPv4 state, use one of these two helpers or
tcp_set_state() instead of assigning a value to sk_state directly.

Signed-off-by: Yafang Shao <laoar.shao@xxxxxxxxx>
---
include/net/tcp.h | 16 ++++++++++++
include/trace/events/tcp.h | 58 +++++++++++++++++++++++++++++++++++++++++
net/ipv4/inet_connection_sock.c | 9 ++++---
net/ipv4/inet_hashtables.c | 2 +-
net/ipv4/tcp.c | 2 +-
5 files changed, 82 insertions(+), 5 deletions(-)
create mode 100644 include/trace/events/tcp.h

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 89974c5..a8336d3 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -49,6 +49,7 @@
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/bpf-cgroup.h>
+#include <trace/events/tcp.h>

extern struct inet_hashinfo tcp_hashinfo;

@@ -1284,6 +1285,21 @@ static inline bool tcp_checksum_complete(struct sk_buff *skb)
#endif
void tcp_set_state(struct sock *sk, int state);

+/*
+ * Fire the tcp_set_state tracepoint (old, new), then store the new state.
+ */
+static inline void __tcp_set_state(struct sock *sk, int state)
+{
+ trace_tcp_set_state(sk, sk->sk_state, state); /* sk_state is still the old state here */
+ sk->sk_state = state; /* plain assignment */
+}
+
+static inline void __sk_state_store(struct sock *sk, int newstate)
+{
+ trace_tcp_set_state(sk, sk->sk_state, newstate); /* traced before the store below */
+ sk_state_store(sk, newstate); /* store through sk_state_store() rather than assigning */
+}
+
void tcp_done(struct sock *sk);

int tcp_abort(struct sock *sk, int err);
diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
new file mode 100644
index 0000000..abf65af
--- /dev/null
+++ b/include/trace/events/tcp.h
@@ -0,0 +1,58 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM tcp
+
+#if !defined(_TRACE_TCP_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_TCP_H
+
+#include <linux/tracepoint.h>
+#include <net/sock.h>
+#include <net/inet_timewait_sock.h>
+#include <net/request_sock.h>
+#include <net/inet_sock.h>
+#include <net/tcp_states.h>
+
+TRACE_EVENT(tcp_set_state, /* records the IPv4 address/port pairs plus old and new state */
+ TP_PROTO(struct sock *sk, int oldstate, int newstate),
+ TP_ARGS(sk, oldstate, newstate),
+
+ TP_STRUCT__entry(
+ __field(__be32, dst) /* addresses are copied as-is, no byte swap */
+ __field(__be32, src)
+ __field(__u16, dport) /* ports go through ntohs(), except ir_num below */
+ __field(__u16, sport)
+ __field(int, oldstate)
+ __field(int, newstate)
+ ),
+
+ TP_fast_assign(
+ if (oldstate == TCP_TIME_WAIT) { /* oldstate picks how sk is read: here via inet_twsk() */
+ __entry->dst = inet_twsk(sk)->tw_daddr;
+ __entry->src = inet_twsk(sk)->tw_rcv_saddr;
+ __entry->dport = ntohs(inet_twsk(sk)->tw_dport);
+ __entry->sport = ntohs(inet_twsk(sk)->tw_sport);
+ } else if (oldstate == TCP_NEW_SYN_RECV) { /* read via inet_rsk(inet_reqsk(sk)) */
+ __entry->dst = inet_rsk(inet_reqsk(sk))->ir_rmt_addr;
+ __entry->src = inet_rsk(inet_reqsk(sk))->ir_loc_addr;
+ __entry->dport =
+ ntohs(inet_rsk(inet_reqsk(sk))->ir_rmt_port);
+ __entry->sport = inet_rsk(inet_reqsk(sk))->ir_num; /* no ntohs(): presumably already host order -- confirm */
+ } else { /* every other old state: read via inet_sk() */
+ __entry->dst = inet_sk(sk)->inet_daddr;
+ __entry->src = inet_sk(sk)->inet_rcv_saddr;
+ __entry->dport = ntohs(inet_sk(sk)->inet_dport);
+ __entry->sport = ntohs(inet_sk(sk)->inet_sport);
+ }
+
+ __entry->oldstate = oldstate;
+ __entry->newstate = newstate;
+ ),
+
+ TP_printk("%08X:%04X %08X:%04X, %02x %02x", /* src:sport dst:dport, oldstate newstate -- all hex */
+ __entry->src, __entry->sport, __entry->dst, __entry->dport,
+ __entry->oldstate, __entry->newstate)
+);
+
+#endif
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index c039c93..307a046 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -27,6 +27,9 @@
#include <net/sock_reuseport.h>
#include <net/addrconf.h>

+#define CREATE_TRACE_POINTS
+#include <trace/events/tcp.h>
+
#ifdef INET_CSK_DEBUG
const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
EXPORT_SYMBOL(inet_csk_timer_bug_msg);
@@ -786,7 +789,7 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
if (newsk) {
struct inet_connection_sock *newicsk = inet_csk(newsk);

- newsk->sk_state = TCP_SYN_RECV;
+ __tcp_set_state(newsk, TCP_SYN_RECV);
newicsk->icsk_bind_hash = NULL;

inet_sk(newsk)->inet_dport = inet_rsk(req)->ir_rmt_port;
@@ -880,7 +883,7 @@ int inet_csk_listen_start(struct sock *sk, int backlog)
* It is OK, because this socket enters to hash table only
* after validation is complete.
*/
- sk_state_store(sk, TCP_LISTEN);
+ __sk_state_store(sk, TCP_LISTEN);
if (!sk->sk_prot->get_port(sk, inet->inet_num)) {
inet->inet_sport = htons(inet->inet_num);

@@ -891,7 +894,7 @@ int inet_csk_listen_start(struct sock *sk, int backlog)
return 0;
}

- sk->sk_state = TCP_CLOSE;
+ __tcp_set_state(sk, TCP_CLOSE);
return err;
}
EXPORT_SYMBOL_GPL(inet_csk_listen_start);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 597bb4c..0f45d456 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -430,7 +430,7 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk)
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
} else {
percpu_counter_inc(sk->sk_prot->orphan_count);
- sk->sk_state = TCP_CLOSE;
+ __tcp_set_state(sk, TCP_CLOSE);
sock_set_flag(sk, SOCK_DEAD);
inet_csk_destroy_sock(sk);
}
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 5091402..984dce6 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2040,7 +2040,7 @@ void tcp_set_state(struct sock *sk, int state)
/* Change state AFTER socket is unhashed to avoid closed
* socket sitting in hash tables.
*/
- sk_state_store(sk, state);
+ __sk_state_store(sk, state);

#ifdef STATE_TRACE
SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]);
--
1.8.3.1