summaryrefslogtreecommitdiff
path: root/net/ipv4
diff options
context:
space:
mode:
authorEric Dumazet <edumazet@google.com>2025-11-06 11:52:36 +0000
committerJakub Kicinski <kuba@kernel.org>2025-11-07 18:41:44 -0800
commit416dd649f3aa3774907c668167a29c668dbc634b (patch)
treed229dab8ca41a6570c62cf3b3dc510b786cb8575 /net/ipv4
parent45cb3c6fbe875e1fda037a798e4c95c43b901d17 (diff)
tcp: add net.ipv4.tcp_comp_sack_rtt_percent
TCP SACK compression has been added in 2018 in commit 5d9f4262b7ea ("tcp: add SACK compression"). It is working great for WAN flows (with large RTT). Wifi in particular gets a significant boost _when_ ACK are suppressed. Add a new sysctl so that we can tune the very conservative 5 % value that has been used so far in this formula, so that small RTT flows can benefit from this feature. delay = min ( 5 % of RTT, 1 ms) This patch adds new tcp_comp_sack_rtt_percent sysctl to ease experiments and tuning. Given that we cap the delay to 1ms (tcp_comp_sack_delay_ns sysctl), set the default value to 33 %. Quoting Neal Cardwell ( https://lore.kernel.org/netdev/CADVnQymZ1tFnEA1Q=vtECs0=Db7zHQ8=+WCQtnhHFVbEOzjVnQ@mail.gmail.com/ ) The rationale for 33% is basically to try to facilitate pipelining, where there are always at least 3 ACKs and 3 GSO/TSO skbs per SRTT, so that the path can maintain a budget for 3 full-sized GSO/TSO skbs "in flight" at all times: + 1 skb in the qdisc waiting to be sent by the NIC next + 1 skb being sent by the NIC (being serialized by the NIC out onto the wire) + 1 skb being received and aggregated by the receiver machine's aggregation mechanism (some combination of LRO, GRO, and sack compression) Note that this is basically the same magic number (3) and the same rationales as: (a) tcp_tso_should_defer() ensuring that we defer sending data for no longer than cwnd/tcp_tso_win_divisor (where tcp_tso_win_divisor = 3), and (b) bbr_quantization_budget() ensuring that cwnd is at least 3 GSO/TSO skbs to maintain pipelining and full throughput at low RTTs Signed-off-by: Eric Dumazet <edumazet@google.com> Reviewed-by: Neal Cardwell <ncardwell@google.com> Link: https://patch.msgid.link/20251106115236.3450026-1-edumazet@google.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/sysctl_net_ipv4.c9
-rw-r--r--net/ipv4/tcp_input.c26
-rw-r--r--net/ipv4/tcp_ipv4.c1
3 files changed, 28 insertions, 8 deletions
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 0c7c8f9041cb..35367f8e2da3 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -1452,6 +1452,15 @@ static struct ctl_table ipv4_net_table[] = {
.proc_handler = proc_doulongvec_minmax,
},
{
+ .procname = "tcp_comp_sack_rtt_percent",
+ .data = &init_net.ipv4.sysctl_tcp_comp_sack_rtt_percent,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = SYSCTL_ONE_THOUSAND,
+ },
+ {
.procname = "tcp_comp_sack_slack_ns",
.data = &init_net.ipv4.sysctl_tcp_comp_sack_slack_ns,
.maxlen = sizeof(unsigned long),
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 804ec56bdd24..9df5d7515605 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5893,7 +5893,9 @@ static inline void tcp_data_snd_check(struct sock *sk)
static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
{
struct tcp_sock *tp = tcp_sk(sk);
- unsigned long rtt, delay;
+ struct net *net = sock_net(sk);
+ unsigned long rtt;
+ u64 delay;
/* More than one full frame received... */
if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
@@ -5912,7 +5914,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
* Defer the ack until tcp_release_cb().
*/
if (sock_owned_by_user_nocheck(sk) &&
- READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_backlog_ack_defer)) {
+ READ_ONCE(net->ipv4.sysctl_tcp_backlog_ack_defer)) {
set_bit(TCP_ACK_DEFERRED, &sk->sk_tsq_flags);
return;
}
@@ -5927,7 +5929,7 @@ send_now:
}
if (!tcp_is_sack(tp) ||
- tp->compressed_ack >= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr))
+ tp->compressed_ack >= READ_ONCE(net->ipv4.sysctl_tcp_comp_sack_nr))
goto send_now;
if (tp->compressed_ack_rcv_nxt != tp->rcv_nxt) {
@@ -5942,18 +5944,26 @@ send_now:
if (hrtimer_is_queued(&tp->compressed_ack_timer))
return;
- /* compress ack timer : 5 % of rtt, but no more than tcp_comp_sack_delay_ns */
+ /* compress ack timer : comp_sack_rtt_percent of rtt,
+ * but no more than tcp_comp_sack_delay_ns.
+ */
rtt = tp->rcv_rtt_est.rtt_us;
if (tp->srtt_us && tp->srtt_us < rtt)
rtt = tp->srtt_us;
- delay = min_t(unsigned long,
- READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns),
- rtt * (NSEC_PER_USEC >> 3)/20);
+ /* delay = (rtt >> 3) * NSEC_PER_USEC * comp_sack_rtt_percent / 100
+ * ->
+ * delay = rtt * 1.25 * comp_sack_rtt_percent
+ */
+ delay = (u64)(rtt + (rtt >> 2)) *
+ READ_ONCE(net->ipv4.sysctl_tcp_comp_sack_rtt_percent);
+
+ delay = min(delay, READ_ONCE(net->ipv4.sysctl_tcp_comp_sack_delay_ns));
+
sock_hold(sk);
hrtimer_start_range_ns(&tp->compressed_ack_timer, ns_to_ktime(delay),
- READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_slack_ns),
+ READ_ONCE(net->ipv4.sysctl_tcp_comp_sack_slack_ns),
HRTIMER_MODE_REL_PINNED_SOFT);
}
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 0cfebac33a91..a7d9fec2950b 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -3595,6 +3595,7 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
net->ipv4.sysctl_tcp_comp_sack_nr = 44;
+ net->ipv4.sysctl_tcp_comp_sack_rtt_percent = 33;
net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;