Re: [RFCv2 3/3] tcp: Wait for sufficient data in tcp_mtu_probe

From: Eric Dumazet
Date: Wed May 26 2021 - 10:35:44 EST


On Wed, May 26, 2021 at 12:38 PM Leonard Crestez <cdleonard@xxxxxxxxx> wrote:
>
> According to RFC4821 Section 7.4 "Protocols MAY delay sending non-probes
> in order to accumulate enough data" but linux almost never does that.
>
> Implement this by returning 0 from tcp_mtu_probe if not enough data is
> queued locally but some packets are still in flight. This makes mtu
> probing more likely to happen for applications that do small writes.
>
> Only doing this if packets are in flight should ensure that writing will
> be attempted again later. This is similar to how tcp_mtu_probe already
> returns zero if the probe doesn't fit inside the receiver window or the
> congestion window.
>
> Control this with a sysctl because this implies a latency tradeoff but
> only up to one RTT.
>
> Signed-off-by: Leonard Crestez <cdleonard@xxxxxxxxx>
> ---
> Documentation/networking/ip-sysctl.rst | 5 +++++
> include/net/netns/ipv4.h | 1 +
> net/ipv4/sysctl_net_ipv4.c | 7 +++++++
> net/ipv4/tcp_ipv4.c | 1 +
> net/ipv4/tcp_output.c | 18 ++++++++++++++----
> 5 files changed, 28 insertions(+), 4 deletions(-)
>
> diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
> index 7ab52a105a5d..967b7fac35b1 100644
> --- a/Documentation/networking/ip-sysctl.rst
> +++ b/Documentation/networking/ip-sysctl.rst
> @@ -349,10 +349,15 @@ tcp_mtu_probe_floor - INTEGER
> If MTU probing is enabled this caps the minimum MSS used for search_low
> for the connection.
>
> Default : 48
>
> +tcp_mtu_probe_waitdata - BOOLEAN
> + Wait for enough data for an mtu probe to accumulate on the sender.
> +
> + Default: 1
> +
> tcp_mtu_probe_rack - BOOLEAN
> Try to use shorter probes if RACK is also enabled
>
> Default: 1
>
> diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
> index b4ff12f25a7f..366e7b325778 100644
> --- a/include/net/netns/ipv4.h
> +++ b/include/net/netns/ipv4.h
> @@ -112,10 +112,11 @@ struct netns_ipv4 {
> #ifdef CONFIG_NET_L3_MASTER_DEV
> u8 sysctl_tcp_l3mdev_accept;
> #endif
> u8 sysctl_tcp_mtu_probing;
> int sysctl_tcp_mtu_probe_floor;
> + int sysctl_tcp_mtu_probe_waitdata;

If this is a boolean, you should use u8, and place this field to avoid
adding a hole.

> int sysctl_tcp_mtu_probe_rack;
> int sysctl_tcp_base_mss;
> int sysctl_tcp_min_snd_mss;
> int sysctl_tcp_probe_threshold;
> u32 sysctl_tcp_probe_interval;
> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
> index 275c91fb9cf8..53868b812958 100644
> --- a/net/ipv4/sysctl_net_ipv4.c
> +++ b/net/ipv4/sysctl_net_ipv4.c
> @@ -847,10 +847,17 @@ static struct ctl_table ipv4_net_table[] = {
> .mode = 0644,
> .proc_handler = proc_dointvec_minmax,
> .extra1 = &tcp_min_snd_mss_min,
> .extra2 = &tcp_min_snd_mss_max,
> },
> + {
> + .procname = "tcp_mtu_probe_waitdata",
> + .data = &init_net.ipv4.sysctl_tcp_mtu_probe_waitdata,
> + .maxlen = sizeof(int),
> + .mode = 0644,
> + .proc_handler = proc_dointvec,

If this is a boolean, please use proc_dou8vec_minmax, and SYSCTL_ZERO/SYSCTL_ONE

> + },
> {
> .procname = "tcp_mtu_probe_rack",
> .data = &init_net.ipv4.sysctl_tcp_mtu_probe_rack,
> .maxlen = sizeof(int),
> .mode = 0644,
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index ed8af4a7325b..940df2ae4636 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -2892,10 +2892,11 @@ static int __net_init tcp_sk_init(struct net *net)
> net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
> net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
> net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
> net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
> net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
> + net->ipv4.sysctl_tcp_mtu_probe_waitdata = 1;
> net->ipv4.sysctl_tcp_mtu_probe_rack = 1;
>
> net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
> net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
> net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index 362f97cfb09e..268e1bac001f 100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -2394,14 +2394,10 @@ static int tcp_mtu_probe(struct sock *sk)
> */
> tcp_mtu_check_reprobe(sk);
> return -1;
> }
>
> - /* Have enough data in the send queue to probe? */
> - if (tp->write_seq - tp->snd_nxt < size_needed)
> - return -1;
> -
> /* Can probe fit inside congestion window? */
> if (packets_needed > tp->snd_cwnd)
> return -1;
>
> /* Can probe fit inside receiver window? If not then skip probing.
> @@ -2411,10 +2407,24 @@ static int tcp_mtu_probe(struct sock *sk)
> * clear below.
> */
> if (tp->snd_wnd < size_needed)
> return -1;
>
> + /* Have enough data in the send queue to probe? */
> + if (tp->write_seq - tp->snd_nxt < size_needed) {
> + /* If packets are already in flight it's safe to wait for more data to
> + * accumulate on the sender because writing will be triggered as ACKs
> + * arrive.
> + * If no packets are in flight returning zero can stall.
> + */
> + if (net->ipv4.sysctl_tcp_mtu_probe_waitdata &&

I have serious doubts about RPC traffic.
Adding one RTT latency is going to make this unlikely to be used.

> + tcp_packets_in_flight(tp))
> + return 0;
> + else
> + return -1;
> + }
> +
> /* Do we need for more acks to clear the receive window? */
> if (after(tp->snd_nxt + size_needed, tcp_wnd_end(tp)))
> return 0;
>
> /* Do we need the congestion window to clear? */
> --
> 2.25.1
>