Re: 2.6.23-rc8-mm2 - tcp_fastretrans_alert() WARNING

From: Ilpo Järvinen
Date: Tue Oct 02 2007 - 06:26:28 EST


On Mon, 1 Oct 2007, Cedric Le Goater wrote:

> got it !
>
> r3-06.test.meiosys.com login: WARNING: at /home/legoater/linux/2.6.23-rc8-mm2/net/ipv4/tcp_input.c:2314 tcp_fastretrans_alert()
>
> Call Trace:
> <IRQ> [<ffffffff8040fdc3>] tcp_ack+0xcd6/0x18af
[...snip...]
>
> TCP 0

Hmm, so it's SACK then...

> I wasn't doing any particular test on n/w so it took me a while to figure
> out how I was triggering the WARNING. Apparently, this is happening when I
> run ketchup, but not always. This test machine is behind many firewall &
> routers so it might be a reason.
>
> I'm trying to get the WARNING and the tcpdump output for it but for the
> moment, it seems it's beyond my reach :/

I'm currently out of ideas where it could come from... so lets try
brute-force checking as your test case is not very high-speed... This
could hide it though... :-(

Please put the patch below on top of clean rc8-mm2 (it includes the patch
I gave you last time) and try to reproduce.... These counter bugs can
survive for sometime until !sacked_out condition occurs, so the patch
below tries to find that out when inconsisteny occurs for the first time
regardless of sacked_out (I also removed some statics which hopefully
reduces compiler inlining for easier reading of the output). I tried this
myself (except for verify()s in frto funcs and minor printout
modifications), didn't trigger for me.

--
i.

---
include/net/tcp.h | 3 +
net/ipv4/tcp_input.c | 23 +++++++++--
net/ipv4/tcp_ipv4.c | 102 +++++++++++++++++++++++++++++++++++++++++++++++++
net/ipv4/tcp_output.c | 6 ++-
4 files changed, 128 insertions(+), 6 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 991ccdc..54a0d91 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -43,6 +43,9 @@

#include <linux/seq_file.h>

+extern void tcp_verify_fackets(struct sock *sk);
+extern void tcp_print_queue(struct sock *sk);
+
extern struct inet_hashinfo tcp_hashinfo;

extern atomic_t tcp_orphan_count;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index e22ffe7..1d7367d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1140,7 +1140,7 @@ static int tcp_check_dsack(struct tcp_sock *tp, struct sk_buff *ack_skb,
return dup_sack;
}

-static int
+int
tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_una)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1160,6 +1160,8 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
int first_sack_index;

if (!tp->sacked_out) {
+ if (WARN_ON(tp->fackets_out))
+ tcp_print_queue(sk);
tp->fackets_out = 0;
tp->highest_sack = tp->snd_una;
}
@@ -1420,6 +1422,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
}
}
}
+ tcp_verify_fackets(sk);

/* Check for lost retransmit. This superb idea is
* borrowed from "ratehalving". Event "C".
@@ -1632,13 +1635,14 @@ void tcp_enter_frto(struct sock *sk)
tcp_set_ca_state(sk, TCP_CA_Disorder);
tp->high_seq = tp->snd_nxt;
tp->frto_counter = 1;
+ tcp_verify_fackets(sk);
}

/* Enter Loss state after F-RTO was applied. Dupack arrived after RTO,
* which indicates that we should follow the traditional RTO recovery,
* i.e. mark everything lost and do go-back-N retransmission.
*/
-static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag)
+void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
@@ -1675,6 +1679,7 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag)
}
}
tcp_verify_left_out(tp);
+ tcp_verify_fackets(sk);

tp->snd_cwnd = tcp_packets_in_flight(tp) + allowed_segments;
tp->snd_cwnd_cnt = 0;
@@ -1753,6 +1758,7 @@ void tcp_enter_loss(struct sock *sk, int how)
}
}
tcp_verify_left_out(tp);
+ tcp_verify_fackets(sk);

tp->reordering = min_t(unsigned int, tp->reordering,
sysctl_tcp_reordering);
@@ -2308,7 +2314,7 @@ static void tcp_mtup_probe_success(struct sock *sk, struct sk_buff *skb)
* It does _not_ decide what to send, it is made in function
* tcp_xmit_retransmit_queue().
*/
-static void
+void
tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag)
{
struct inet_connection_sock *icsk = inet_csk(sk);
@@ -2322,8 +2328,11 @@ tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag)
if (!tp->packets_out)
tp->sacked_out = 0;

- if (WARN_ON(!tp->sacked_out && tp->fackets_out))
+ if (WARN_ON(!tp->sacked_out && tp->fackets_out)) {
+ printk(KERN_ERR "TCP %d\n", tcp_is_reno(tp));
+ tcp_print_queue(sk);
tp->fackets_out = 0;
+ }

/* Now state machine starts.
* A. ECE, hence prohibit cwnd undoing, the reduction is required. */
@@ -2333,6 +2342,8 @@ tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag)
/* B. In all the states check for reneging SACKs. */
if (tp->sacked_out && tcp_check_sack_reneging(sk))
return;
+
+ tcp_verify_fackets(sk);

/* C. Process data loss notification, provided it is valid. */
if ((flag&FLAG_DATA_LOST) &&
@@ -2572,7 +2583,7 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
* is before the ack sequence we can discard it as it's confirmed to have
* arrived at the other end.
*/
-static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p)
+int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p)
{
struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -2694,6 +2705,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p)
ca_ops->pkts_acked(sk, pkts_acked, rtt_us);
}
}
+ tcp_verify_fackets(sk);
+

#if FASTRETRANS_DEBUG > 0
BUG_TRAP((int)tp->sacked_out >= 0);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 7fed0a6..8b18757 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -108,6 +108,108 @@ struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
.lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
};

+void tcp_print_queue(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+ char s[50+1];
+ char i[50+1];
+ int idx = 0;
+ u32 hs = tp->highest_sack;
+
+ if (!tp->sacked_out)
+ hs = tp->snd_una;
+
+ tcp_for_write_queue(skb, sk) {
+ if (skb == tcp_send_head(sk))
+ break;
+
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) {
+ if (skb->len < tp->mss_cache)
+ s[idx] = 's';
+ else
+ s[idx] = 'S';
+ } else {
+ s[idx] = '-';
+ }
+ if ((TCP_SKB_CB(skb)->seq == hs) && (tp->fastpath_skb_hint == skb))
+ i[idx] = 'x';
+ else if (tp->fastpath_skb_hint == skb)
+ i[idx] = 'f';
+ else if (TCP_SKB_CB(skb)->seq == hs)
+ i[idx] = 'h';
+ else
+ i[idx] = ' ';
+
+ if (++idx >= 50) {
+ s[idx] = 0;
+ i[idx] = 0;
+ printk(KERN_ERR "TCP wq(s) %s\n", s);
+ printk(KERN_ERR "TCP wq(i) %s\n", i);
+ idx = 0;
+ }
+ }
+ if (idx) {
+ s[idx] = '<';
+ s[idx+1] = 0;
+ i[idx] = '<';
+ i[idx+1] = 0;
+ printk(KERN_ERR "TCP wq(s) %s\n", s);
+ printk(KERN_ERR "TCP wq(i) %s\n", i);
+ }
+ printk(KERN_ERR "s%u f%u p%u seq: su%u hs%u sn%u (%u)\n",
+ tp->sacked_out, tp->fackets_out, tp->packets_out,
+ tp->snd_una, tp->highest_sack, tp->snd_nxt,
+ ((tp->fastpath_skb_hint == NULL) ? 0 :
+ TCP_SKB_CB(tp->fastpath_skb_hint)->seq));
+}
+
+void tcp_verify_fackets(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+ u32 fackets = 0;
+ int hisack_valid = 0;
+ int err = 0;
+
+ if (tcp_is_reno(tp))
+ return;
+
+ if (!tp->sacked_out) {
+ if (WARN_ON(tp->fackets_out))
+ err = 1;
+ else if (tp->fastpath_skb_hint == NULL)
+ return;
+ }
+
+ /* ...expensive processing here... */
+ tcp_for_write_queue(skb, sk) {
+ if (skb == tcp_send_head(sk))
+ break;
+
+ if (tp->sacked_out && (TCP_SKB_CB(skb)->seq == tp->highest_sack)) {
+ hisack_valid = 1;
+ if (WARN_ON(tp->fackets_out != fackets + tcp_skb_pcount(skb)))
+ err = 1;
+ }
+
+ if (skb == tp->fastpath_skb_hint)
+ if (WARN_ON(fackets != tp->fastpath_cnt_hint))
+ err = 1;
+
+ if (WARN_ON((fackets > tp->fackets_out) && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)))
+ err = 1;
+
+ fackets += tcp_skb_pcount(skb);
+ }
+
+ if (WARN_ON(tp->sacked_out && !hisack_valid))
+ err = 1;
+
+ if (err)
+ tcp_print_queue(sk);
+}
+
static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
{
return inet_csk_get_port(&tcp_hashinfo, sk, snum,
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 6199abe..4c70caf 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -773,6 +773,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss
tcp_verify_left_out(tp);
}
tcp_adjust_fackets_out(tp, skb, diff);
+
+ tcp_verify_fackets(sk);
}

/* Link BUFF into the send queue. */
@@ -1688,7 +1690,7 @@ u32 __tcp_select_window(struct sock *sk)
}

/* Attempt to collapse two adjacent SKB's during retransmission. */
-static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now)
+void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *next_skb = tcp_write_queue_next(sk, skb);
@@ -1764,6 +1766,8 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m
if (tp->fastpath_skb_hint == next_skb)
tp->fastpath_skb_hint = skb;

+ tcp_verify_fackets(sk);
+
sk_stream_free_skb(sk, next_skb);
}
}
--
1.5.0.6

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/