Re: select()/socket has problems under 2.2.x.

Andrea Arcangeli (andrea@e-mind.com)
Sun, 7 Mar 1999 19:19:36 +0100 (CET)


Today I again thought a bit about the issue.

Since I still think that the user can set any sndbuf/rcvbuf and expect to
still have a working socket, and I think that sndbuf/rcvbuf is just an
hint on how much data to queue, I taken the approch to smart change rcvbuf
when it is going to cause deadlocking. I think this approch is better than
going to play with the mss of the sender because the other end could not
care about our advertised mss and because this way will automagically work
also if somebody will set the rcvbuf while the tcp connection is just
established.

I also fixed the TCP to always send back an ack to the other end (since I
understood by Alan's email of yesterday, that this thing has to happen) if
we are forced to drop the incoming packet because we have just a lot of
data queued.

Here a new whole patch against 2.2.2:

Index: tcp_input.c
===================================================================
RCS file: /var/cvs/linux/net/ipv4/tcp_input.c,v
retrieving revision 1.1.1.6
retrieving revision 1.1.2.22
diff -u -r1.1.1.6 -r1.1.2.22
--- tcp_input.c 1999/02/23 16:48:26 1.1.1.6
+++ linux/net/ipv4/tcp_input.c 1999/03/07 18:09:32 1.1.2.22
@@ -55,6 +55,19 @@
* work without delayed acks.
* Andi Kleen: Process packets with PSH set in the
* fast path.
+ * Andrea Arcangeli: TCP_NODELAY disable delacks.
+ * Andrea Arcangeli: Delay also the first ack to be able to
+ * join the ack in the first data packet.
+ * Andrea Arcangeli: Don't wakeup reader if we received an
+ * out of order packet.
+ * Andrea Arcangeli: Fixed the rcvbuf deadlock. Since rcvbuf
+ * is really only an hint, now we set it
+ * to 3 * pmtu before going to
+ * deadlocking. This will also improve
+ * transfer throughput.
+ * Andrea Arcangeli: Always send back an ack even if we
+ * have to reject the incoming packet
+ * because our rcvbuf is full.
*/

#include <linux/config.h>
@@ -99,12 +112,11 @@
if(tp->ato == 0) {
tp->lrcvtime = jiffies;

- /* Help sender leave slow start quickly,
- * and also makes sure we do not take this
- * branch ever again for this connection.
+ /*
+ * Don't enter quickack mode to be able to join the first
+ * ack with a data packet. -arca
*/
- tp->ato = 1;
- tcp_enter_quickack_mode(tp);
+ tp->ato = (HZ+49)/50;
} else {
int m = jiffies - tp->lrcvtime;

@@ -135,9 +147,9 @@
*/
if(th->psh && (skb->len < (tp->mss_cache >> 1))) {
/* Preserve the quickack state. */
- if((tp->ato & 0x7fffffff) > HZ/50)
+ if((tp->ato & 0x7fffffff) > (HZ+49)/50)
tp->ato = ((tp->ato & 0x80000000) |
- (HZ/50));
+ ((HZ+49)/50));
}
}

@@ -241,7 +253,7 @@
extern __inline__ int tcp_paws_discard(struct tcp_opt *tp, struct tcphdr *th, unsigned len)
{
/* ts_recent must be younger than 24 days */
- return (((jiffies - tp->ts_recent_stamp) >= PAWS_24DAYS) ||
+ return (((s32)(jiffies - tp->ts_recent_stamp) >= PAWS_24DAYS) ||
(((s32)(tp->rcv_tsval-tp->ts_recent) < 0) &&
/* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM */
(len != (th->doff * 4))));
@@ -1354,7 +1366,7 @@
}
}

-static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
+static int tcp_data_queue(struct sock *sk, struct sk_buff *skb)
{
struct sk_buff *skb1;
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
@@ -1384,7 +1396,7 @@
tp->pred_flags = htonl(((tp->tcp_header_len >> 2) << 28) |
(0x10 << 16) |
tp->snd_wnd);
- return;
+ return 0;
}

/* An old packet, either a retransmit or some packet got lost. */
@@ -1393,7 +1405,7 @@
SOCK_DEBUG(sk, "retransmit received: seq %X\n", TCP_SKB_CB(skb)->seq);
tcp_enter_quickack_mode(tp);
kfree_skb(skb);
- return;
+ return 1;
}

if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
@@ -1415,7 +1427,7 @@
SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);

- if (skb_peek(&tp->out_of_order_queue) == NULL) {
+ if (!skb_queue_len(&tp->out_of_order_queue)) {
/* Initial out of order segment, build 1 SACK. */
if(tp->sack_ok) {
tp->num_sacks = 1;
@@ -1458,6 +1470,7 @@
}
}
}
+ return 1;
}


@@ -1471,6 +1484,7 @@
{
struct tcphdr *th;
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ int ret;

th = skb->h.th;
skb_pull(skb, th->doff*4);
@@ -1484,16 +1498,46 @@
* Make sure to do this before moving snd_nxt, otherwise
* data might be acked for that we don't have enough room.
*/
- if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) {
- if (prune_queue(sk) < 0) {
- /* Still not enough room. That can happen when
- * skb->true_size differs significantly from skb->len.
+ bigger_rcvbuf:
+ if (atomic_read(&sk->rmem_alloc) >= sk->rcvbuf)
+ {
+ extern __u32 sysctl_rmem_max;
+ int good_rcvbuf = min(tp->pmtu_cookie * 3, sysctl_rmem_max);
+#if 1
+ printk(KERN_DEBUG "tcp_data: rmem %d, rcvbuf %d, len %d, "
+ "skb->len %d, skb->truesize %d, good_rcvbuf %d\n",
+ atomic_read(&sk->rmem_alloc), sk->rcvbuf, len,
+ skb->len, skb->truesize, good_rcvbuf);
+#endif
+ if (sk->rcvbuf < good_rcvbuf)
+ {
+ sk->rcvbuf = good_rcvbuf;
+ goto bigger_rcvbuf;
+ }
+ /*
+ * It seems we can't queue/receive the packet, so try to
+ * shrink something if possible. -arca
+ */
+ if (prune_queue(sk) < 0)
+ {
+ /*
+ * Bad we aren't been able to shrink the out of
+ * order queue so we _must_ drop the packet. But
+ * remeber to send an ack back to the other end
+ * to tell him that we had to drop the packet, this
+ * avoid deadlocking of the tcp connection on the
+ * wire, but it doesn't mean we'll be able to continue
+ * to receive data in the future if something in the
+ * rmem_alloc and rcvbuf won't change. -arca
*/
+ tp->delayed_acks++;
+ tcp_enter_quickack_mode(tp);
+ tp->pred_flags = 0;
return 0;
}
}

- tcp_data_queue(sk, skb);
+ ret = tcp_data_queue(sk, skb);

if (before(tp->rcv_nxt, tp->copied_seq)) {
printk(KERN_DEBUG "*** tcp.c:tcp_data bug acked < copied\n");
@@ -1503,7 +1547,7 @@
/* Above, tcp_data_queue() increments delayed_acks appropriately.
* Now tell the user we may have some data.
*/
- if (!sk->dead) {
+ if (!sk->dead && !ret) {
SOCK_DEBUG(sk, "Data wakeup.\n");
sk->data_ready(sk,0);
}
@@ -1580,7 +1624,9 @@
/* We entered "quick ACK" mode or... */
tcp_in_quickack_mode(tp) ||
/* We have out of order data */
- (skb_peek(&tp->out_of_order_queue) != NULL)) {
+ skb_queue_len(&tp->out_of_order_queue) ||
+ /* TCP_NODELAY is set */
+ sk->nonagle == 1) {
/* Then ack it now */
tcp_send_ack(sk);
} else {
Index: tcp_ipv4.c
===================================================================
RCS file: /var/cvs/linux/net/ipv4/tcp_ipv4.c,v
retrieving revision 1.1.1.4
retrieving revision 1.1.2.6
diff -u -r1.1.1.4 -r1.1.2.6
--- tcp_ipv4.c 1999/02/20 15:43:23 1.1.1.4
+++ linux/net/ipv4/tcp_ipv4.c 1999/03/07 18:09:32 1.1.2.6
@@ -566,7 +566,7 @@
struct sk_buff *buff;
struct rtable *rt;
u32 daddr, nexthop;
- int tmp;
+ int tmp, pmtu;

if (sk->state != TCP_CLOSE)
return(-EISCONN);
@@ -642,8 +642,9 @@
/* Reset mss clamp */
tp->mss_clamp = ~0;

+ pmtu = rt->u.dst.pmtu;
if (!ip_dont_fragment(sk, &rt->u.dst) &&
- rt->u.dst.pmtu > 576 && rt->rt_dst != rt->rt_gateway) {
+ pmtu > 576 && rt->rt_dst != rt->rt_gateway) {
/* Clamp mss at maximum of 536 and user_mss.
Probably, user ordered to override tiny segment size
in gatewayed case.
@@ -651,7 +652,17 @@
tp->mss_clamp = max(tp->user_mss, 536);
}

- tcp_connect(sk, buff, rt->u.dst.pmtu);
+ tcp_connect(sk, buff, pmtu);
+
+ /*
+ * set a rasonable sndbuf/rcvbuf to improve performances and
+ * make sure to avoid future deadlocks. -arca
+ */
+ pmtu *= 3;
+ if (sk->rcvbuf < pmtu)
+ sk->rcvbuf = min(pmtu, sysctl_rmem_max);
+ if (sk->sndbuf < pmtu)
+ sk->sndbuf = min(pmtu, sysctl_wmem_max);
return 0;
}

Index: tcp_output.c
===================================================================
RCS file: /var/cvs/linux/net/ipv4/tcp_output.c,v
retrieving revision 1.1.1.5
diff -u -r1.1.1.5 tcp_output.c
--- tcp_output.c 1999/02/23 16:48:26 1.1.1.5
+++ linux/net/ipv4/tcp_output.c 1999/03/07 17:37:13
@@ -580,6 +580,19 @@
if(tp->af_specific->rebuild_header(sk))
return 1; /* Routing failure or similar. */

+ /* Solaris sucks. */
+ if(skb->len > 0 &&
+ (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
+ tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
+#if 1
+ printk("TCP: Doing Solaris hack for [%p:%08x:%04x:%08x]\n",
+ skb, sk->daddr, sk->dport, tp->snd_una);
+#endif
+ TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1;
+ skb_trim(skb, 0);
+ skb->csum = 0;
+ }
+
/* Ok, we're gonna send it out, update state. */
TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_RETRANS;
tp->retrans_out++;
@@ -973,6 +986,8 @@
timeout = tp->ato;
if (timeout > max_timeout)
timeout = max_timeout;
+ if (timeout < (HZ+49)/50)
+ timeout = (HZ+49)/50;
timeout += jiffies;

/* Use new timeout only if there wasn't a older one earlier. */
@@ -980,7 +995,7 @@
tp->delack_timer.expires = timeout;
add_timer(&tp->delack_timer);
} else {
- if (timeout < tp->delack_timer.expires)
+ if (time_before(timeout, tp->delack_timer.expires))
mod_timer(&tp->delack_timer, timeout);
}
}

Comments?

I would like if people who experienced deadlocks in their TCP connection
would try my patch and feedback. Thanks.

Andrea Arcangeli

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.rutgers.edu
Please read the FAQ at http://www.tux.org/lkml/