diff options
| author | Kuniyuki Iwashima <kuniyu@google.com> | 2025-10-14 23:54:55 +0000 |
|---|---|---|
| committer | Martin KaFai Lau <martin.lau@kernel.org> | 2025-10-16 12:04:47 -0700 |
| commit | 7c268eaeec6388b7bee36aef3fb5e62c9222ad3b (patch) | |
| tree | 1529d1d9e676c103e0ea8c0354984f74a592db14 /net | |
| parent | 4a997d49d92ad9dda603f60881faa6c800d435e9 (diff) | |
net: Allow opt-out from global protocol memory accounting.
Some protocols (e.g., TCP, UDP) implement memory accounting for socket
buffers and charge memory to per-protocol global counters pointed to by
sk->sk_proto->memory_allocated.
Sometimes, system processes do not want that limitation. For a similar
purpose, there is SO_RESERVE_MEM for sockets under memcg.
Also, by opting out of the per-protocol accounting, sockets under memcg
can avoid paying costs for two orthogonal memory accounting mechanisms.
A microbenchmark result is in the subsequent bpf patch.
Let's allow opt-out from the per-protocol memory accounting if
sk->sk_bypass_prot_mem is true.
sk->sk_bypass_prot_mem and sk->sk_prot are placed in the same cache
line, and sk_has_account() always fetches sk->sk_prot before accessing
sk->sk_bypass_prot_mem, so there is no extra cache miss for this patch.
The following patches will set sk->sk_bypass_prot_mem to true, and
then, the per-protocol memory accounting will be skipped.
Note that this does NOT disable memcg, but rather the per-protocol one.
Another option not to use the hole in struct sock_common is create
sk_prot variants like tcp_prot_bypass, but this would complicate
SOCKMAP logic, tcp_bpf_prots etc.
Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Link: https://patch.msgid.link/20251014235604.3057003-3-kuniyu@google.com
Diffstat (limited to 'net')
| -rw-r--r-- | net/core/sock.c | 32 | ||||
| -rw-r--r-- | net/ipv4/tcp.c | 3 | ||||
| -rw-r--r-- | net/ipv4/tcp_output.c | 7 | ||||
| -rw-r--r-- | net/mptcp/protocol.c | 7 | ||||
| -rw-r--r-- | net/tls/tls_device.c | 3 |
5 files changed, 39 insertions, 13 deletions
diff --git a/net/core/sock.c b/net/core/sock.c index 08ae20069b6d..5bf208579c02 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1046,9 +1046,13 @@ static int sock_reserve_memory(struct sock *sk, int bytes) if (!charged) return -ENOMEM; + if (sk->sk_bypass_prot_mem) + goto success; + /* pre-charge to forward_alloc */ sk_memory_allocated_add(sk, pages); allocated = sk_memory_allocated(sk); + /* If the system goes into memory pressure with this * precharge, give up and return error. */ @@ -1057,6 +1061,8 @@ static int sock_reserve_memory(struct sock *sk, int bytes) mem_cgroup_sk_uncharge(sk, pages); return -ENOMEM; } + +success: sk_forward_alloc_add(sk, pages << PAGE_SHIFT); WRITE_ONCE(sk->sk_reserved_mem, @@ -3145,8 +3151,11 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation))) return true; - sk_enter_memory_pressure(sk); + if (!sk->sk_bypass_prot_mem) + sk_enter_memory_pressure(sk); + sk_stream_moderate_sndbuf(sk); + return false; } EXPORT_SYMBOL(sk_page_frag_refill); @@ -3263,10 +3272,12 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) { bool memcg_enabled = false, charged = false; struct proto *prot = sk->sk_prot; - long allocated; + long allocated = 0; - sk_memory_allocated_add(sk, amt); - allocated = sk_memory_allocated(sk); + if (!sk->sk_bypass_prot_mem) { + sk_memory_allocated_add(sk, amt); + allocated = sk_memory_allocated(sk); + } if (mem_cgroup_sk_enabled(sk)) { memcg_enabled = true; @@ -3275,6 +3286,9 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) goto suppress_allocation; } + if (!allocated) + return 1; + /* Under limit. */ if (allocated <= sk_prot_mem_limits(sk, 0)) { sk_leave_memory_pressure(sk); @@ -3353,7 +3367,8 @@ suppress_allocation: trace_sock_exceed_buf_limit(sk, prot, allocated, kind); - sk_memory_allocated_sub(sk, amt); + if (allocated) + sk_memory_allocated_sub(sk, amt); if (charged) mem_cgroup_sk_uncharge(sk, amt); @@ -3392,11 +3407,14 @@ EXPORT_SYMBOL(__sk_mem_schedule); */ void __sk_mem_reduce_allocated(struct sock *sk, int amount) { - sk_memory_allocated_sub(sk, amount); - if (mem_cgroup_sk_enabled(sk)) mem_cgroup_sk_uncharge(sk, amount); + if (sk->sk_bypass_prot_mem) + return; + + sk_memory_allocated_sub(sk, amount); + if (sk_under_global_memory_pressure(sk) && (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) sk_leave_memory_pressure(sk); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4d720aa09a4c..54def27326f1 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -928,7 +928,8 @@ struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp, } __kfree_skb(skb); } else { - sk->sk_prot->enter_memory_pressure(sk); + if (!sk->sk_bypass_prot_mem) + tcp_enter_memory_pressure(sk); sk_stream_moderate_sndbuf(sk); } return NULL; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index b94efb3050d2..7f5df7a71f62 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3743,12 +3743,17 @@ void sk_forced_mem_schedule(struct sock *sk, int size) delta = size - sk->sk_forward_alloc; if (delta <= 0) return; + amt = sk_mem_pages(delta); sk_forward_alloc_add(sk, amt << PAGE_SHIFT); - sk_memory_allocated_add(sk, amt); if (mem_cgroup_sk_enabled(sk)) mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge() | __GFP_NOFAIL); + + if (sk->sk_bypass_prot_mem) + return; + + sk_memory_allocated_add(sk, amt); } /* Send a FIN. The caller locks the socket for us. diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 0292162a14ee..94a5f6dcc577 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1065,11 +1065,12 @@ static void mptcp_enter_memory_pressure(struct sock *sk) mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - if (first) + if (first && !ssk->sk_bypass_prot_mem) { tcp_enter_memory_pressure(ssk); - sk_stream_moderate_sndbuf(ssk); + first = false; + } - first = false; + sk_stream_moderate_sndbuf(ssk); } __mptcp_sync_sndbuf(sk); } diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index a64ae15b1a60..caa2b5d24622 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -373,7 +373,8 @@ static int tls_do_allocation(struct sock *sk, if (!offload_ctx->open_record) { if (unlikely(!skb_page_frag_refill(prepend_size, pfrag, sk->sk_allocation))) { - READ_ONCE(sk->sk_prot)->enter_memory_pressure(sk); + if (!sk->sk_bypass_prot_mem) + READ_ONCE(sk->sk_prot)->enter_memory_pressure(sk); sk_stream_moderate_sndbuf(sk); return -ENOMEM; } |