summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/networking/ip-sysctl.rst6
-rw-r--r--Documentation/networking/net_cachelines/tcp_sock.rst3
-rw-r--r--include/linux/tcp.h4
-rw-r--r--include/net/netns/ipv4.h1
-rw-r--r--include/net/tcp.h3
-rw-r--r--include/net/tcp_ecn.h52
-rw-r--r--net/ipv4/sysctl_net_ipv4.c9
-rw-r--r--net/ipv4/tcp.c5
-rw-r--r--net/ipv4/tcp_input.c4
-rw-r--r--net/ipv4/tcp_ipv4.c1
-rw-r--r--net/ipv4/tcp_minisocks.c2
-rw-r--r--net/ipv4/tcp_output.c26
12 files changed, 107 insertions, 9 deletions
diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
index 1c206501b973..a06cb99d66dc 100644
--- a/Documentation/networking/ip-sysctl.rst
+++ b/Documentation/networking/ip-sysctl.rst
@@ -487,6 +487,12 @@ tcp_ecn_option - INTEGER
Default: 2
+tcp_ecn_option_beacon - INTEGER
+ Control Accurate ECN (AccECN) option sending frequency per RTT and it
+ takes effect only when tcp_ecn_option is set to 2.
+
+ Default: 3 (AccECN will be send at least 3 times per RTT)
+
tcp_ecn_fallback - BOOLEAN
If the kernel detects that ECN connection misbehaves, enable fall
back to non-ECN. Currently, this knob implements the fallback
diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst b/Documentation/networking/net_cachelines/tcp_sock.rst
index b941151f8c0a..d4dc01800945 100644
--- a/Documentation/networking/net_cachelines/tcp_sock.rst
+++ b/Documentation/networking/net_cachelines/tcp_sock.rst
@@ -109,6 +109,9 @@ u8:2 syn_ect_snt write_mostly read_w
u8:2 syn_ect_rcv read_mostly read_write
u8:2 accecn_minlen write_mostly read_write
u8:2 est_ecnfield read_write
+u8:2 accecn_opt_demand read_mostly read_write
+u8:2 prev_ecnfield read_write
+u64 accecn_opt_tstamp read_write
u8:4 accecn_fail_mode
u32 lost read_mostly tcp_ack
u32 app_limited read_write read_mostly tcp_rate_check_app_limited,tcp_rate_skb_sent(tx);tcp_rate_gen(rx)
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 73557656cb2d..f637b659b35a 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -275,6 +275,7 @@ struct tcp_sock {
u32 mdev_us; /* medium deviation */
u32 rtt_seq; /* sequence number to update rttvar */
u64 tcp_wstamp_ns; /* departure time for next sent data packet */
+ u64 accecn_opt_tstamp; /* Last AccECN option sent timestamp */
struct list_head tsorted_sent_queue; /* time-sorted sent but un-SACKed skbs */
struct sk_buff *highest_sack; /* skb just after the highest
* skb with SACKed bit set
@@ -296,7 +297,8 @@ struct tcp_sock {
unused2:4;
u8 accecn_minlen:2,/* Minimum length of AccECN option sent */
est_ecnfield:2,/* ECN field for AccECN delivered estimates */
- unused3:4;
+ accecn_opt_demand:2,/* Demand AccECN option for n next ACKs */
+ prev_ecnfield:2; /* ECN bits from the previous segment */
__be32 pred_flags;
u64 tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */
u64 tcp_mstamp; /* most recent packet received/sent */
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index acbb7dd497e1..34eb3aecb3f2 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -149,6 +149,7 @@ struct netns_ipv4 {
u8 sysctl_tcp_ecn;
u8 sysctl_tcp_ecn_option;
+ u8 sysctl_tcp_ecn_option_beacon;
u8 sysctl_tcp_ecn_fallback;
u8 sysctl_ip_default_ttl;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 6be29129465e..78dd7b8a4145 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -100,6 +100,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
/* Maximal number of window scale according to RFC1323 */
#define TCP_MAX_WSCALE 14U
+/* Default sending frequency of accurate ECN option per RTT */
+#define TCP_ACCECN_OPTION_BEACON 3
+
/* urg_data states */
#define TCP_URG_VALID 0x0100
#define TCP_URG_NOTYET 0x0200
diff --git a/include/net/tcp_ecn.h b/include/net/tcp_ecn.h
index 08c7f4757e4e..133fb6b79500 100644
--- a/include/net/tcp_ecn.h
+++ b/include/net/tcp_ecn.h
@@ -176,6 +176,17 @@ static inline void tcp_accecn_third_ack(struct sock *sk,
}
}
+/* Demand the minimum # to send AccECN optnio */
+static inline void tcp_accecn_opt_demand_min(struct sock *sk,
+ u8 opt_demand_min)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u8 opt_demand;
+
+ opt_demand = max_t(u8, opt_demand_min, tp->accecn_opt_demand);
+ tp->accecn_opt_demand = opt_demand;
+}
+
/* Maps IP ECN field ECT/CE code point to AccECN option field number, given
* we are sending fields with Accurate ECN Order 1: ECT(1), CE, ECT(0).
*/
@@ -256,6 +267,7 @@ static inline void tcp_ecn_received_counters(struct sock *sk,
u8 ecnfield = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK;
u8 is_ce = INET_ECN_is_ce(ecnfield);
struct tcp_sock *tp = tcp_sk(sk);
+ bool ecn_edge;
if (!INET_ECN_is_not_ect(ecnfield)) {
u32 pcount = is_ce * max_t(u16, 1, skb_shinfo(skb)->gso_segs);
@@ -274,9 +286,34 @@ static inline void tcp_ecn_received_counters(struct sock *sk,
if (len > 0) {
u8 minlen = tcp_ecnfield_to_accecn_optfield(ecnfield);
+ u32 oldbytes = tp->received_ecn_bytes[ecnfield - 1];
+ u32 bytes_mask = GENMASK_U32(31, 22);
+
tp->received_ecn_bytes[ecnfield - 1] += len;
tp->accecn_minlen = max_t(u8, tp->accecn_minlen,
minlen);
+
+ /* Send AccECN option at least once per 2^22-byte
+ * increase in any ECN byte counter.
+ */
+ if ((tp->received_ecn_bytes[ecnfield - 1] ^ oldbytes) &
+ bytes_mask) {
+ tcp_accecn_opt_demand_min(sk, 1);
+ }
+ }
+ }
+
+ ecn_edge = tp->prev_ecnfield != ecnfield;
+ if (ecn_edge || is_ce) {
+ tp->prev_ecnfield = ecnfield;
+ /* Demand Accurate ECN change-triggered ACKs. Two ACK are
+ * demanded to indicate unambiguously the ecnfield value
+ * in the latter ACK.
+ */
+ if (tcp_ecn_mode_accecn(tp)) {
+ if (ecn_edge)
+ inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
+ tp->accecn_opt_demand = 2;
}
}
}
@@ -349,6 +386,7 @@ static inline void tcp_accecn_init_counters(struct tcp_sock *tp)
__tcp_accecn_init_bytes_counters(tp->received_ecn_bytes);
__tcp_accecn_init_bytes_counters(tp->delivered_ecn_bytes);
tp->accecn_minlen = 0;
+ tp->accecn_opt_demand = 0;
tp->est_ecnfield = 0;
}
@@ -431,6 +469,7 @@ static inline void tcp_ecn_rcv_synack(struct sock *sk, const struct tcphdr *th,
default:
tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN);
tp->syn_ect_rcv = ip_dsfield & INET_ECN_MASK;
+ tp->accecn_opt_demand = 2;
if (INET_ECN_is_ce(ip_dsfield) &&
tcp_accecn_validate_syn_feedback(sk, ace,
tp->syn_ect_snt)) {
@@ -451,6 +490,7 @@ static inline void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th,
} else {
tp->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield &
INET_ECN_MASK;
+ tp->prev_ecnfield = tp->syn_ect_rcv;
tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN);
}
}
@@ -542,4 +582,16 @@ tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th)
th->ece = 1;
}
+static inline bool tcp_accecn_option_beacon_check(const struct sock *sk)
+{
+ u32 ecn_beacon = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option_beacon);
+ const struct tcp_sock *tp = tcp_sk(sk);
+
+ if (!ecn_beacon)
+ return false;
+
+ return tcp_stamp_us_delta(tp->tcp_mstamp, tp->accecn_opt_tstamp) * ecn_beacon >=
+ (tp->srtt_us >> 3);
+}
+
#endif /* _LINUX_TCP_ECN_H */
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 4a697acb4e85..24dbc603cc44 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -741,6 +741,15 @@ static struct ctl_table ipv4_net_table[] = {
.extra2 = SYSCTL_TWO,
},
{
+ .procname = "tcp_ecn_option_beacon",
+ .data = &init_net.ipv4.sysctl_tcp_ecn_option_beacon,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_THREE,
+ },
+ {
.procname = "tcp_ecn_fallback",
.data = &init_net.ipv4.sysctl_tcp_ecn_fallback,
.maxlen = sizeof(u8),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 8c4a4b8666fc..090f9ac43d4c 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3410,6 +3410,8 @@ int tcp_disconnect(struct sock *sk, int flags)
tp->delivered_ce = 0;
tp->accecn_fail_mode = 0;
tcp_accecn_init_counters(tp);
+ tp->prev_ecnfield = 0;
+ tp->accecn_opt_tstamp = 0;
if (icsk->icsk_ca_initialized && icsk->icsk_ca_ops->release)
icsk->icsk_ca_ops->release(sk);
memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
@@ -5134,11 +5136,12 @@ static void __init tcp_struct_check(void)
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, lsndtime);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, mdev_us);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_wstamp_ns);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, accecn_opt_tstamp);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, rtt_seq);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tsorted_sent_queue);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, highest_sack);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, ecn_flags);
- CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_tx, 89);
+ CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_tx, 97);
/* TXRX read-write hotpath cache lines */
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, pred_flags);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index e898a76c485e..87154fd86167 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6121,8 +6121,10 @@ step1:
* RFC 5961 4.2 : Send a challenge ack
*/
if (th->syn) {
- if (tcp_ecn_mode_accecn(tp))
+ if (tcp_ecn_mode_accecn(tp)) {
accecn_reflector = true;
+ tcp_accecn_opt_demand_min(sk, 1);
+ }
if (sk->sk_state == TCP_SYN_RECV && sk->sk_socket && th->ack &&
TCP_SKB_CB(skb)->seq + 1 == TCP_SKB_CB(skb)->end_seq &&
TCP_SKB_CB(skb)->seq + 1 == tp->rcv_nxt &&
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index aa8dbfe20924..6a63be1f6461 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -3562,6 +3562,7 @@ static int __net_init tcp_sk_init(struct net *net)
{
net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN;
net->ipv4.sysctl_tcp_ecn_option = TCP_ACCECN_OPTION_FULL;
+ net->ipv4.sysctl_tcp_ecn_option_beacon = TCP_ACCECN_OPTION_BEACON;
net->ipv4.sysctl_tcp_ecn_fallback = 1;
net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 1dbcc09ff7a9..193343494558 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -463,6 +463,8 @@ static void tcp_ecn_openreq_child(struct sock *sk,
tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN);
tp->syn_ect_snt = treq->syn_ect_snt;
tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt);
+ tp->prev_ecnfield = treq->syn_ect_rcv;
+ tp->accecn_opt_demand = 1;
tcp_ecn_received_counters_payload(sk, skb);
} else {
tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ?
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 34e5c83bbace..f897c2594954 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -705,8 +705,12 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp,
*ptr++ = htonl(((e0b & 0xffffff) << 8) |
TCPOPT_NOP);
}
- if (tp)
+ if (tp) {
tp->accecn_minlen = 0;
+ tp->accecn_opt_tstamp = tp->tcp_mstamp;
+ if (tp->accecn_opt_demand)
+ tp->accecn_opt_demand--;
+ }
}
if (unlikely(OPTION_SACK_ADVERTISE & options)) {
@@ -1149,11 +1153,16 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
opts->num_sack_blocks = 0;
}
- if (tcp_ecn_mode_accecn(tp) &&
- READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option)) {
- opts->use_synack_ecn_bytes = 0;
- size += tcp_options_fit_accecn(opts, tp->accecn_minlen,
- MAX_TCP_OPTION_SPACE - size);
+ if (tcp_ecn_mode_accecn(tp)) {
+ int ecn_opt = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option);
+
+ if (ecn_opt &&
+ (ecn_opt >= TCP_ACCECN_OPTION_FULL || tp->accecn_opt_demand ||
+ tcp_accecn_option_beacon_check(sk))) {
+ opts->use_synack_ecn_bytes = 0;
+ size += tcp_options_fit_accecn(opts, tp->accecn_minlen,
+ MAX_TCP_OPTION_SPACE - size);
+ }
}
if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp,
@@ -2863,6 +2872,11 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
sent_pkts = 0;
tcp_mstamp_refresh(tp);
+
+ /* AccECN option beacon depends on mstamp, it may change mss */
+ if (tcp_ecn_mode_accecn(tp) && tcp_accecn_option_beacon_check(sk))
+ mss_now = tcp_current_mss(sk);
+
if (!push_one) {
/* Do MTU probing. */
result = tcp_mtu_probe(sk);