summary | refs | log | tree | commit | diff
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r-- net/ipv4/Kconfig | 4
-rw-r--r-- net/ipv4/af_inet.c | 21
-rw-r--r-- net/ipv4/arp.c | 4
-rw-r--r-- net/ipv4/icmp.c | 191
-rw-r--r-- net/ipv4/inet_connection_sock.c | 25
-rw-r--r-- net/ipv4/inet_hashtables.c | 8
-rw-r--r-- net/ipv4/inet_timewait_sock.c | 35
-rw-r--r-- net/ipv4/ip_input.c | 4
-rw-r--r-- net/ipv4/sysctl_net_ipv4.c | 11
-rw-r--r-- net/ipv4/tcp.c | 121
-rw-r--r-- net/ipv4/tcp_input.c | 14
-rw-r--r-- net/ipv4/tcp_ipv4.c | 137
-rw-r--r-- net/ipv4/tcp_lp.c | 7
-rw-r--r-- net/ipv4/tcp_minisocks.c | 2
-rw-r--r-- net/ipv4/tcp_output.c | 7
15 files changed, 377 insertions, 214 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 12850a277251..b71c22475c51 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -760,9 +760,7 @@ config TCP_AO
config TCP_MD5SIG
bool "TCP: MD5 Signature Option support (RFC2385)"
- select CRYPTO
- select CRYPTO_MD5
- select TCP_SIGPOOL
+ select CRYPTO_LIB_MD5
help
RFC2385 specifies a method of giving MD5 protection to TCP sessions.
Its main (only?) use is to protect BGP sessions between core routers
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 3109c5ec38f3..0784e2a873a1 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -755,6 +755,26 @@ EXPORT_SYMBOL(inet_stream_connect);
void __inet_accept(struct socket *sock, struct socket *newsock, struct sock *newsk)
{
+ if (mem_cgroup_sockets_enabled) {
+ gfp_t gfp = GFP_KERNEL | __GFP_NOFAIL;
+
+ mem_cgroup_sk_alloc(newsk);
+
+ if (mem_cgroup_from_sk(newsk)) {
+ int amt;
+
+ /* The socket has not been accepted yet, no need
+ * to look at newsk->sk_wmem_queued.
+ */
+ amt = sk_mem_pages(newsk->sk_forward_alloc +
+ atomic_read(&newsk->sk_rmem_alloc));
+ if (amt)
+ mem_cgroup_sk_charge(newsk, amt, gfp);
+ }
+
+ kmem_cache_charge(newsk, gfp);
+ }
+
sock_rps_record_flow(newsk);
WARN_ON(!((1 << newsk->sk_state) &
(TCPF_ESTABLISHED | TCPF_SYN_RECV |
@@ -768,6 +788,7 @@ void __inet_accept(struct socket *sock, struct socket *newsock, struct sock *new
newsock->state = SS_CONNECTED;
}
+EXPORT_SYMBOL_GPL(__inet_accept);
/*
* Accept a pending connection. The TCP layer now gives BSD semantics.
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 833f2cf97178..f3bfecf8a234 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -1217,10 +1217,10 @@ int arp_invalidate(struct net_device *dev, __be32 ip, bool force)
err = neigh_update(neigh, NULL, NUD_FAILED,
NEIGH_UPDATE_F_OVERRIDE|
NEIGH_UPDATE_F_ADMIN, 0);
- write_lock_bh(&tbl->lock);
+ spin_lock_bh(&tbl->lock);
neigh_release(neigh);
neigh_remove_one(neigh);
- write_unlock_bh(&tbl->lock);
+ spin_unlock_bh(&tbl->lock);
}
return err;
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 1b7fb5d935ed..4abbec2f47ef 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -582,6 +582,185 @@ relookup_failed:
return ERR_PTR(err);
}
+struct icmp_ext_iio_addr4_subobj {
+ __be16 afi;
+ __be16 reserved;
+ __be32 addr4;
+};
+
+static unsigned int icmp_ext_iio_len(void)
+{
+ return sizeof(struct icmp_extobj_hdr) +
+ /* ifIndex */
+ sizeof(__be32) +
+ /* Interface Address Sub-Object */
+ sizeof(struct icmp_ext_iio_addr4_subobj) +
+ /* Interface Name Sub-Object. Length must be a multiple of 4
+ * bytes.
+ */
+ ALIGN(sizeof(struct icmp_ext_iio_name_subobj), 4) +
+ /* MTU */
+ sizeof(__be32);
+}
+
+static unsigned int icmp_ext_max_len(u8 ext_objs)
+{
+ unsigned int ext_max_len;
+
+ ext_max_len = sizeof(struct icmp_ext_hdr);
+
+ if (ext_objs & BIT(ICMP_ERR_EXT_IIO_IIF))
+ ext_max_len += icmp_ext_iio_len();
+
+ return ext_max_len;
+}
+
+static __be32 icmp_ext_iio_addr4_find(const struct net_device *dev)
+{
+ struct in_device *in_dev;
+ struct in_ifaddr *ifa;
+
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev)
+ return 0;
+
+ /* It is unclear from RFC 5837 which IP address should be chosen, but
+ * it makes sense to choose a global unicast address.
+ */
+ in_dev_for_each_ifa_rcu(ifa, in_dev) {
+ if (READ_ONCE(ifa->ifa_flags) & IFA_F_SECONDARY)
+ continue;
+ if (ifa->ifa_scope != RT_SCOPE_UNIVERSE ||
+ ipv4_is_multicast(ifa->ifa_address))
+ continue;
+ return ifa->ifa_address;
+ }
+
+ return 0;
+}
+
+static void icmp_ext_iio_iif_append(struct net *net, struct sk_buff *skb,
+ int iif)
+{
+ struct icmp_ext_iio_name_subobj *name_subobj;
+ struct icmp_extobj_hdr *objh;
+ struct net_device *dev;
+ __be32 data;
+
+ if (!iif)
+ return;
+
+ /* Add the fields in the order specified by RFC 5837. */
+ objh = skb_put(skb, sizeof(*objh));
+ objh->class_num = ICMP_EXT_OBJ_CLASS_IIO;
+ objh->class_type = ICMP_EXT_CTYPE_IIO_ROLE(ICMP_EXT_CTYPE_IIO_ROLE_IIF);
+
+ data = htonl(iif);
+ skb_put_data(skb, &data, sizeof(__be32));
+ objh->class_type |= ICMP_EXT_CTYPE_IIO_IFINDEX;
+
+ rcu_read_lock();
+
+ dev = dev_get_by_index_rcu(net, iif);
+ if (!dev)
+ goto out;
+
+ data = icmp_ext_iio_addr4_find(dev);
+ if (data) {
+ struct icmp_ext_iio_addr4_subobj *addr4_subobj;
+
+ addr4_subobj = skb_put_zero(skb, sizeof(*addr4_subobj));
+ addr4_subobj->afi = htons(ICMP_AFI_IP);
+ addr4_subobj->addr4 = data;
+ objh->class_type |= ICMP_EXT_CTYPE_IIO_IPADDR;
+ }
+
+ name_subobj = skb_put_zero(skb, ALIGN(sizeof(*name_subobj), 4));
+ name_subobj->len = ALIGN(sizeof(*name_subobj), 4);
+ netdev_copy_name(dev, name_subobj->name);
+ objh->class_type |= ICMP_EXT_CTYPE_IIO_NAME;
+
+ data = htonl(READ_ONCE(dev->mtu));
+ skb_put_data(skb, &data, sizeof(__be32));
+ objh->class_type |= ICMP_EXT_CTYPE_IIO_MTU;
+
+out:
+ rcu_read_unlock();
+ objh->length = htons(skb_tail_pointer(skb) - (unsigned char *)objh);
+}
+
+static void icmp_ext_objs_append(struct net *net, struct sk_buff *skb,
+ u8 ext_objs, int iif)
+{
+ if (ext_objs & BIT(ICMP_ERR_EXT_IIO_IIF))
+ icmp_ext_iio_iif_append(net, skb, iif);
+}
+
+static struct sk_buff *
+icmp_ext_append(struct net *net, struct sk_buff *skb_in, struct icmphdr *icmph,
+ unsigned int room, int iif)
+{
+ unsigned int payload_len, ext_max_len, ext_len;
+ struct icmp_ext_hdr *ext_hdr;
+ struct sk_buff *skb;
+ u8 ext_objs;
+ int nhoff;
+
+ switch (icmph->type) {
+ case ICMP_DEST_UNREACH:
+ case ICMP_TIME_EXCEEDED:
+ case ICMP_PARAMETERPROB:
+ break;
+ default:
+ return NULL;
+ }
+
+ ext_objs = READ_ONCE(net->ipv4.sysctl_icmp_errors_extension_mask);
+ if (!ext_objs)
+ return NULL;
+
+ ext_max_len = icmp_ext_max_len(ext_objs);
+ if (ICMP_EXT_ORIG_DGRAM_MIN_LEN + ext_max_len > room)
+ return NULL;
+
+ skb = skb_clone(skb_in, GFP_ATOMIC);
+ if (!skb)
+ return NULL;
+
+ nhoff = skb_network_offset(skb);
+ payload_len = min(skb->len - nhoff, ICMP_EXT_ORIG_DGRAM_MIN_LEN);
+
+ if (!pskb_network_may_pull(skb, payload_len))
+ goto free_skb;
+
+ if (pskb_trim(skb, nhoff + ICMP_EXT_ORIG_DGRAM_MIN_LEN) ||
+ __skb_put_padto(skb, nhoff + ICMP_EXT_ORIG_DGRAM_MIN_LEN, false))
+ goto free_skb;
+
+ if (pskb_expand_head(skb, 0, ext_max_len, GFP_ATOMIC))
+ goto free_skb;
+
+ ext_hdr = skb_put_zero(skb, sizeof(*ext_hdr));
+ ext_hdr->version = ICMP_EXT_VERSION_2;
+
+ icmp_ext_objs_append(net, skb, ext_objs, iif);
+
+ /* Do not send an empty extension structure. */
+ ext_len = skb_tail_pointer(skb) - (unsigned char *)ext_hdr;
+ if (ext_len == sizeof(*ext_hdr))
+ goto free_skb;
+
+ ext_hdr->checksum = ip_compute_csum(ext_hdr, ext_len);
+ /* The length of the original datagram in 32-bit words (RFC 4884). */
+ icmph->un.reserved[1] = ICMP_EXT_ORIG_DGRAM_MIN_LEN / sizeof(u32);
+
+ return skb;
+
+free_skb:
+ consume_skb(skb);
+ return NULL;
+}
+
/*
* Send an ICMP message in response to a situation
*
@@ -601,6 +780,7 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
struct icmp_bxm icmp_param;
struct rtable *rt = skb_rtable(skb_in);
bool apply_ratelimit = false;
+ struct sk_buff *ext_skb;
struct ipcm_cookie ipc;
struct flowi4 fl4;
__be32 saddr;
@@ -770,7 +950,12 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
if (room <= (int)sizeof(struct iphdr))
goto ende;
- icmp_param.data_len = skb_in->len - icmp_param.offset;
+ ext_skb = icmp_ext_append(net, skb_in, &icmp_param.data.icmph, room,
+ parm->iif);
+ if (ext_skb)
+ icmp_param.skb = ext_skb;
+
+ icmp_param.data_len = icmp_param.skb->len - icmp_param.offset;
if (icmp_param.data_len > room)
icmp_param.data_len = room;
icmp_param.head_len = sizeof(struct icmphdr);
@@ -785,6 +970,9 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
trace_icmp_send(skb_in, type, code);
icmp_push_reply(sk, &icmp_param, &fl4, &ipc, &rt);
+
+ if (ext_skb)
+ consume_skb(ext_skb);
ende:
ip_rt_put(rt);
out_unlock:
@@ -1502,6 +1690,7 @@ static int __net_init icmp_sk_init(struct net *net)
net->ipv4.sysctl_icmp_ratelimit = 1 * HZ;
net->ipv4.sysctl_icmp_ratemask = 0x1818;
net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr = 0;
+ net->ipv4.sysctl_icmp_errors_extension_mask = 0;
net->ipv4.sysctl_icmp_msgs_per_sec = 1000;
net->ipv4.sysctl_icmp_msgs_burst = 50;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index cdd1e12aac8c..3b83b66b2284 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -712,31 +712,6 @@ struct sock *inet_csk_accept(struct sock *sk, struct proto_accept_arg *arg)
release_sock(sk);
- if (mem_cgroup_sockets_enabled) {
- gfp_t gfp = GFP_KERNEL | __GFP_NOFAIL;
- int amt = 0;
-
- /* atomically get the memory usage, set and charge the
- * newsk->sk_memcg.
- */
- lock_sock(newsk);
-
- mem_cgroup_sk_alloc(newsk);
- if (mem_cgroup_from_sk(newsk)) {
- /* The socket has not been accepted yet, no need
- * to look at newsk->sk_wmem_queued.
- */
- amt = sk_mem_pages(newsk->sk_forward_alloc +
- atomic_read(&newsk->sk_rmem_alloc));
- }
-
- if (amt)
- mem_cgroup_sk_charge(newsk, amt, gfp);
- kmem_cache_charge(newsk, gfp);
-
- release_sock(newsk);
- }
-
if (req)
reqsk_put(req);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index b7024e3d9ac3..f5826ec4bcaa 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -720,8 +720,11 @@ bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk)
spin_lock(lock);
if (osk) {
WARN_ON_ONCE(sk->sk_hash != osk->sk_hash);
- ret = sk_nulls_del_node_init_rcu(osk);
- } else if (found_dup_sk) {
+ ret = sk_nulls_replace_node_init_rcu(osk, sk);
+ goto unlock;
+ }
+
+ if (found_dup_sk) {
*found_dup_sk = inet_ehash_lookup_by_sk(sk, list);
if (*found_dup_sk)
ret = false;
@@ -730,6 +733,7 @@ bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk)
if (ret)
__sk_nulls_add_node_rcu(sk, list);
+unlock:
spin_unlock(lock);
return ret;
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index c96d61d08854..d4c781a0667f 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -88,12 +88,6 @@ void inet_twsk_put(struct inet_timewait_sock *tw)
}
EXPORT_SYMBOL_GPL(inet_twsk_put);
-static void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw,
- struct hlist_nulls_head *list)
-{
- hlist_nulls_add_head_rcu(&tw->tw_node, list);
-}
-
static void inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo)
{
__inet_twsk_schedule(tw, timeo, false);
@@ -113,13 +107,12 @@ void inet_twsk_hashdance_schedule(struct inet_timewait_sock *tw,
{
const struct inet_sock *inet = inet_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
- struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash);
spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
struct inet_bind_hashbucket *bhead, *bhead2;
- /* Step 1: Put TW into bind hash. Original socket stays there too.
- Note, that any socket with inet->num != 0 MUST be bound in
- binding cache, even if it is closed.
+ /* Put TW into bind hash. Original socket stays there too.
+ * Note, that any socket with inet->num != 0 MUST be bound in
+ * binding cache, even if it is closed.
*/
bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num,
hashinfo->bhash_size)];
@@ -141,19 +134,6 @@ void inet_twsk_hashdance_schedule(struct inet_timewait_sock *tw,
spin_lock(lock);
- /* Step 2: Hash TW into tcp ehash chain */
- inet_twsk_add_node_rcu(tw, &ehead->chain);
-
- /* Step 3: Remove SK from hash chain */
- if (__sk_nulls_del_node_init_rcu(sk))
- sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
-
-
- /* Ensure above writes are committed into memory before updating the
- * refcount.
- * Provides ordering vs later refcount_inc().
- */
- smp_wmb();
/* tw_refcnt is set to 3 because we have :
* - one reference for bhash chain.
* - one reference for ehash chain.
@@ -163,6 +143,15 @@ void inet_twsk_hashdance_schedule(struct inet_timewait_sock *tw,
*/
refcount_set(&tw->tw_refcnt, 3);
+ /* Ensure tw_refcnt has been set before tw is published.
+ * smp_wmb() provides the necessary memory barrier to enforce this
+ * ordering.
+ */
+ smp_wmb();
+
+ hlist_nulls_replace_init_rcu(&sk->sk_nulls_node, &tw->tw_node);
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+
inet_twsk_schedule(tw, timeo);
spin_unlock(lock);
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 273578579a6b..19d3141dad1f 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -141,6 +141,8 @@
#include <linux/mroute.h>
#include <linux/netlink.h>
#include <net/dst_metadata.h>
+#include <net/udp.h>
+#include <net/tcp.h>
/*
* Process Router Attention IP option (RFC 2113)
@@ -317,8 +319,6 @@ static bool ip_can_use_hint(const struct sk_buff *skb, const struct iphdr *iph,
ip_hdr(hint)->tos == iph->tos;
}
-int tcp_v4_early_demux(struct sk_buff *skb);
-enum skb_drop_reason udp_v4_early_demux(struct sk_buff *skb);
static int ip_rcv_finish_core(struct net *net,
struct sk_buff *skb, struct net_device *dev,
const struct sk_buff *hint)
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 24dbc603cc44..0c7c8f9041cb 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -48,6 +48,8 @@ static int tcp_plb_max_rounds = 31;
static int tcp_plb_max_cong_thresh = 256;
static unsigned int tcp_tw_reuse_delay_max = TCP_PAWS_MSL * MSEC_PER_SEC;
static int tcp_ecn_mode_max = 2;
+static u32 icmp_errors_extension_mask_all =
+ GENMASK_U8(ICMP_ERR_EXT_COUNT - 1, 0);
/* obsolete */
static int sysctl_tcp_low_latency __read_mostly;
@@ -675,6 +677,15 @@ static struct ctl_table ipv4_net_table[] = {
.extra2 = SYSCTL_ONE
},
{
+ .procname = "icmp_errors_extension_mask",
+ .data = &init_net.ipv4.sysctl_icmp_errors_extension_mask,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = &icmp_errors_extension_mask_all,
+ },
+ {
.procname = "icmp_ratelimit",
.data = &init_net.ipv4.sysctl_icmp_ratelimit,
.maxlen = sizeof(int),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 8a18aeca7ab0..a9345aa5a2e5 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -243,7 +243,7 @@
#define pr_fmt(fmt) "TCP: " fmt
-#include <crypto/hash.h>
+#include <crypto/md5.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/types.h>
@@ -253,7 +253,6 @@
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/skbuff.h>
-#include <linux/scatterlist.h>
#include <linux/splice.h>
#include <linux/net.h>
#include <linux/socket.h>
@@ -425,7 +424,6 @@ void tcp_md5_destruct_sock(struct sock *sk)
tcp_clear_md5_list(sk);
kfree(rcu_replace_pointer(tp->md5sig_info, NULL, 1));
static_branch_slow_dec_deferred(&tcp_md5_needed);
- tcp_md5_release_sigpool();
}
}
EXPORT_IPV6_MOD_GPL(tcp_md5_destruct_sock);
@@ -928,7 +926,8 @@ struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp,
}
__kfree_skb(skb);
} else {
- sk->sk_prot->enter_memory_pressure(sk);
+ if (!sk->sk_bypass_prot_mem)
+ tcp_enter_memory_pressure(sk);
sk_stream_moderate_sndbuf(sk);
}
return NULL;
@@ -1557,8 +1556,10 @@ void __tcp_cleanup_rbuf(struct sock *sk, int copied)
time_to_ack = true;
}
}
- if (time_to_ack)
+ if (time_to_ack) {
+ tcp_mstamp_refresh(tp);
tcp_send_ack(sk);
+ }
}
void tcp_cleanup_rbuf(struct sock *sk, int copied)
@@ -3583,9 +3584,12 @@ static int tcp_repair_options_est(struct sock *sk, sockptr_t optbuf,
DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled);
EXPORT_IPV6_MOD(tcp_tx_delay_enabled);
-static void tcp_enable_tx_delay(void)
+static void tcp_enable_tx_delay(struct sock *sk, int val)
{
- if (!static_branch_unlikely(&tcp_tx_delay_enabled)) {
+ struct tcp_sock *tp = tcp_sk(sk);
+ s32 delta = (val - tp->tcp_tx_delay) << 3;
+
+ if (val && !static_branch_unlikely(&tcp_tx_delay_enabled)) {
static int __tcp_tx_delay_enabled = 0;
if (cmpxchg(&__tcp_tx_delay_enabled, 0, 1) == 0) {
@@ -3593,6 +3597,22 @@ static void tcp_enable_tx_delay(void)
pr_info("TCP_TX_DELAY enabled\n");
}
}
+ /* If we change tcp_tx_delay on a live flow, adjust tp->srtt_us,
+ * tp->rtt_min, icsk_rto and sk->sk_pacing_rate.
+ * This is best effort.
+ */
+ if (delta && sk->sk_state == TCP_ESTABLISHED) {
+ s64 srtt = (s64)tp->srtt_us + delta;
+
+ tp->srtt_us = clamp_t(s64, srtt, 1, ~0U);
+
+ /* Note: does not deal with non zero icsk_backoff */
+ tcp_set_rto(sk);
+
+ minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U);
+
+ tcp_update_pacing_rate(sk);
+ }
}
/* When set indicates to always queue non-full frames. Later the user clears
@@ -4119,8 +4139,12 @@ ao_parse:
tp->recvmsg_inq = val;
break;
case TCP_TX_DELAY:
- if (val)
- tcp_enable_tx_delay();
+ /* tp->srtt_us is u32, and is shifted by 3 */
+ if (val < 0 || val >= (1U << (31 - 3))) {
+ err = -EINVAL;
+ break;
+ }
+ tcp_enable_tx_delay(sk, val);
WRITE_ONCE(tp->tcp_tx_delay, val);
break;
default:
@@ -4815,52 +4839,45 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
EXPORT_IPV6_MOD(tcp_getsockopt);
#ifdef CONFIG_TCP_MD5SIG
-int tcp_md5_sigpool_id = -1;
-EXPORT_IPV6_MOD_GPL(tcp_md5_sigpool_id);
-
-int tcp_md5_alloc_sigpool(void)
+void tcp_md5_hash_skb_data(struct md5_ctx *ctx, const struct sk_buff *skb,
+ unsigned int header_len)
{
- size_t scratch_size;
- int ret;
+ const unsigned int head_data_len = skb_headlen(skb) > header_len ?
+ skb_headlen(skb) - header_len : 0;
+ const struct skb_shared_info *shi = skb_shinfo(skb);
+ struct sk_buff *frag_iter;
+ unsigned int i;
- scratch_size = sizeof(union tcp_md5sum_block) + sizeof(struct tcphdr);
- ret = tcp_sigpool_alloc_ahash("md5", scratch_size);
- if (ret >= 0) {
- /* As long as any md5 sigpool was allocated, the return
- * id would stay the same. Re-write the id only for the case
- * when previously all MD5 keys were deleted and this call
- * allocates the first MD5 key, which may return a different
- * sigpool id than was used previously.
- */
- WRITE_ONCE(tcp_md5_sigpool_id, ret); /* Avoids the compiler potentially being smart here */
- return 0;
- }
- return ret;
-}
+ md5_update(ctx, (const u8 *)tcp_hdr(skb) + header_len, head_data_len);
-void tcp_md5_release_sigpool(void)
-{
- tcp_sigpool_release(READ_ONCE(tcp_md5_sigpool_id));
-}
+ for (i = 0; i < shi->nr_frags; ++i) {
+ const skb_frag_t *f = &shi->frags[i];
+ u32 p_off, p_len, copied;
+ const void *vaddr;
+ struct page *p;
-void tcp_md5_add_sigpool(void)
-{
- tcp_sigpool_get(READ_ONCE(tcp_md5_sigpool_id));
+ skb_frag_foreach_page(f, skb_frag_off(f), skb_frag_size(f),
+ p, p_off, p_len, copied) {
+ vaddr = kmap_local_page(p);
+ md5_update(ctx, vaddr + p_off, p_len);
+ kunmap_local(vaddr);
+ }
+ }
+
+ skb_walk_frags(skb, frag_iter)
+ tcp_md5_hash_skb_data(ctx, frag_iter, 0);
}
+EXPORT_IPV6_MOD(tcp_md5_hash_skb_data);
-int tcp_md5_hash_key(struct tcp_sigpool *hp,
- const struct tcp_md5sig_key *key)
+void tcp_md5_hash_key(struct md5_ctx *ctx,
+ const struct tcp_md5sig_key *key)
{
u8 keylen = READ_ONCE(key->keylen); /* paired with WRITE_ONCE() in tcp_md5_do_add */
- struct scatterlist sg;
-
- sg_init_one(&sg, key->key, keylen);
- ahash_request_set_crypt(hp->req, &sg, NULL, keylen);
/* We use data_race() because tcp_md5_do_add() might change
* key->key under us
*/
- return data_race(crypto_ahash_update(hp->req));
+ data_race(({ md5_update(ctx, key->key, keylen), 0; }));
}
EXPORT_IPV6_MOD(tcp_md5_hash_key);
@@ -4871,19 +4888,16 @@ tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb,
int family, int l3index, const __u8 *hash_location)
{
/* This gets called for each TCP segment that has TCP-MD5 option.
- * We have 3 drop cases:
- * o No MD5 hash and one expected.
- * o MD5 hash and we're not expecting one.
- * o MD5 hash and its wrong.
+ * We have 2 drop cases:
+ * o An MD5 signature is present, but we're not expecting one.
+ * o The MD5 signature is wrong.
*/
const struct tcp_sock *tp = tcp_sk(sk);
struct tcp_md5sig_key *key;
u8 newhash[16];
- int genhash;
key = tcp_md5_do_lookup(sk, l3index, saddr, family);
-
- if (!key && hash_location) {
+ if (!key) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
trace_tcp_hash_md5_unexpected(sk, skb);
return SKB_DROP_REASON_TCP_MD5UNEXPECTED;
@@ -4894,11 +4908,10 @@ tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb,
* IPv4-mapped case.
*/
if (family == AF_INET)
- genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
+ tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
else
- genhash = tp->af_specific->calc_md5_hash(newhash, key,
- NULL, skb);
- if (genhash || memcmp(hash_location, newhash, 16) != 0) {
+ tp->af_specific->calc_md5_hash(newhash, key, NULL, skb);
+ if (memcmp(hash_location, newhash, 16) != 0) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
trace_tcp_hash_md5_mismatch(sk, skb);
return SKB_DROP_REASON_TCP_MD5FAILURE;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index e4a979b75cc6..6db1d4c36a88 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -937,9 +937,15 @@ void tcp_rcv_space_adjust(struct sock *sk)
trace_tcp_rcv_space_adjust(sk);
- tcp_mstamp_refresh(tp);
+ if (unlikely(!tp->rcv_rtt_est.rtt_us))
+ return;
+
+ /* We do not refresh tp->tcp_mstamp here.
+ * Some platforms have expensive ktime_get() implementations.
+ * Using the last cached value is enough for DRS.
+ */
time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time);
- if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0)
+ if (time < (tp->rcv_rtt_est.rtt_us >> 3))
return;
/* Number of bytes copied to user in last RTT */
@@ -1102,7 +1108,7 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
tp->srtt_us = max(1U, srtt);
}
-static void tcp_update_pacing_rate(struct sock *sk)
+void tcp_update_pacing_rate(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
u64 rate;
@@ -1139,7 +1145,7 @@ static void tcp_update_pacing_rate(struct sock *sk)
/* Calculate rto without backoff. This is the second half of Van Jacobson's
* routine referred to above.
*/
-static void tcp_set_rto(struct sock *sk)
+void tcp_set_rto(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
/* Old crap is replaced with new one. 8)
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index b1fcf3e4e1ce..40a76da5364a 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -53,6 +53,7 @@
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
+#include <linux/fips.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
@@ -86,14 +87,13 @@
#include <linux/btf_ids.h>
#include <linux/skbuff_ref.h>
-#include <crypto/hash.h>
-#include <linux/scatterlist.h>
+#include <crypto/md5.h>
#include <trace/events/tcp.h>
#ifdef CONFIG_TCP_MD5SIG
-static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
- __be32 daddr, __be32 saddr, const struct tcphdr *th);
+static void tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
+ __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif
struct inet_hashinfo tcp_hashinfo;
@@ -754,7 +754,6 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
struct tcp_md5sig_key *key = NULL;
unsigned char newhash[16];
struct sock *sk1 = NULL;
- int genhash;
#endif
u64 transmit_time = 0;
struct sock *ctl_sk;
@@ -840,11 +839,9 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
if (!key)
goto out;
-
- genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
- if (genhash || memcmp(md5_hash_location, newhash, 16) != 0)
+ tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
+ if (memcmp(md5_hash_location, newhash, 16) != 0)
goto out;
-
}
if (key) {
@@ -1425,13 +1422,13 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
struct tcp_sock *tp = tcp_sk(sk);
if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
- if (tcp_md5_alloc_sigpool())
- return -ENOMEM;
+ if (fips_enabled) {
+ pr_warn_once("TCP-MD5 support is disabled due to FIPS\n");
+ return -EOPNOTSUPP;
+ }
- if (tcp_md5sig_info_add(sk, GFP_KERNEL)) {
- tcp_md5_release_sigpool();
+ if (tcp_md5sig_info_add(sk, GFP_KERNEL))
return -ENOMEM;
- }
if (!static_branch_inc(&tcp_md5_needed.key)) {
struct tcp_md5sig_info *md5sig;
@@ -1439,7 +1436,6 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
rcu_assign_pointer(tp->md5sig_info, NULL);
kfree_rcu(md5sig, rcu);
- tcp_md5_release_sigpool();
return -EUSERS;
}
}
@@ -1456,12 +1452,9 @@ int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
struct tcp_sock *tp = tcp_sk(sk);
if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
- tcp_md5_add_sigpool();
- if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) {
- tcp_md5_release_sigpool();
+ if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC)))
return -ENOMEM;
- }
if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
struct tcp_md5sig_info *md5sig;
@@ -1470,7 +1463,6 @@ int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
rcu_assign_pointer(tp->md5sig_info, NULL);
kfree_rcu(md5sig, rcu);
- tcp_md5_release_sigpool();
return -EUSERS;
}
}
@@ -1578,66 +1570,44 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
cmd.tcpm_key, cmd.tcpm_keylen);
}
-static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp,
- __be32 daddr, __be32 saddr,
- const struct tcphdr *th, int nbytes)
+static void tcp_v4_md5_hash_headers(struct md5_ctx *ctx,
+ __be32 daddr, __be32 saddr,
+ const struct tcphdr *th, int nbytes)
{
- struct tcp4_pseudohdr *bp;
- struct scatterlist sg;
- struct tcphdr *_th;
-
- bp = hp->scratch;
- bp->saddr = saddr;
- bp->daddr = daddr;
- bp->pad = 0;
- bp->protocol = IPPROTO_TCP;
- bp->len = cpu_to_be16(nbytes);
-
- _th = (struct tcphdr *)(bp + 1);
- memcpy(_th, th, sizeof(*th));
- _th->check = 0;
+ struct {
+ struct tcp4_pseudohdr ip;
+ struct tcphdr tcp;
+ } h;
- sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
- ahash_request_set_crypt(hp->req, &sg, NULL,
- sizeof(*bp) + sizeof(*th));
- return crypto_ahash_update(hp->req);
+ h.ip.saddr = saddr;
+ h.ip.daddr = daddr;
+ h.ip.pad = 0;
+ h.ip.protocol = IPPROTO_TCP;
+ h.ip.len = cpu_to_be16(nbytes);
+ h.tcp = *th;
+ h.tcp.check = 0;
+ md5_update(ctx, (const u8 *)&h, sizeof(h.ip) + sizeof(h.tcp));
}
-static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
- __be32 daddr, __be32 saddr, const struct tcphdr *th)
+static noinline_for_stack void
+tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
+ __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
- struct tcp_sigpool hp;
+ struct md5_ctx ctx;
- if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
- goto clear_hash_nostart;
-
- if (crypto_ahash_init(hp.req))
- goto clear_hash;
- if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2))
- goto clear_hash;
- if (tcp_md5_hash_key(&hp, key))
- goto clear_hash;
- ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
- if (crypto_ahash_final(hp.req))
- goto clear_hash;
-
- tcp_sigpool_end(&hp);
- return 0;
-
-clear_hash:
- tcp_sigpool_end(&hp);
-clear_hash_nostart:
- memset(md5_hash, 0, 16);
- return 1;
+ md5_init(&ctx);
+ tcp_v4_md5_hash_headers(&ctx, daddr, saddr, th, th->doff << 2);
+ tcp_md5_hash_key(&ctx, key);
+ md5_final(&ctx, md5_hash);
}
-int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
- const struct sock *sk,
- const struct sk_buff *skb)
+noinline_for_stack void
+tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
+ const struct sock *sk, const struct sk_buff *skb)
{
const struct tcphdr *th = tcp_hdr(skb);
- struct tcp_sigpool hp;
__be32 saddr, daddr;
+ struct md5_ctx ctx;
if (sk) { /* valid for establish/request sockets */
saddr = sk->sk_rcv_saddr;
@@ -1648,30 +1618,11 @@ int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
daddr = iph->daddr;
}
- if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
- goto clear_hash_nostart;
-
- if (crypto_ahash_init(hp.req))
- goto clear_hash;
-
- if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len))
- goto clear_hash;
- if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2))
- goto clear_hash;
- if (tcp_md5_hash_key(&hp, key))
- goto clear_hash;
- ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
- if (crypto_ahash_final(hp.req))
- goto clear_hash;
-
- tcp_sigpool_end(&hp);
- return 0;
-
-clear_hash:
- tcp_sigpool_end(&hp);
-clear_hash_nostart:
- memset(md5_hash, 0, 16);
- return 1;
+ md5_init(&ctx);
+ tcp_v4_md5_hash_headers(&ctx, daddr, saddr, th, skb->len);
+ tcp_md5_hash_skb_data(&ctx, skb, th->doff << 2);
+ tcp_md5_hash_key(&ctx, key);
+ md5_final(&ctx, md5_hash);
}
EXPORT_IPV6_MOD(tcp_v4_md5_hash_skb);
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c
index 52fe17167460..976b56644a8a 100644
--- a/net/ipv4/tcp_lp.c
+++ b/net/ipv4/tcp_lp.c
@@ -23,9 +23,9 @@
* Original Author:
* Aleksandar Kuzmanovic <akuzma@northwestern.edu>
* Available from:
- * http://www.ece.rice.edu/~akuzma/Doc/akuzma/TCP-LP.pdf
+ * https://users.cs.northwestern.edu/~akuzma/doc/TCP-LP-ToN.pdf
* Original implementation for 2.4.19:
- * http://www-ece.rice.edu/networks/TCP-LP/
+ * https://users.cs.northwestern.edu/~akuzma/rice/TCP-LP/linux/tcp-lp-linux.htm
*
* 2.6.x module Authors:
* Wong Hoi Sing, Edison <hswong3i@gmail.com>
@@ -113,6 +113,8 @@ static void tcp_lp_init(struct sock *sk)
/**
* tcp_lp_cong_avoid
* @sk: socket to avoid congesting
+ * @ack: current ack sequence number
+ * @acked: number of ACKed packets
*
* Implementation of cong_avoid.
* Will only call newReno CA when away from inference.
@@ -261,6 +263,7 @@ static void tcp_lp_rtt_sample(struct sock *sk, u32 rtt)
/**
* tcp_lp_pkts_acked
* @sk: socket requiring congestion avoidance calculations
+ * @sample: ACK sample containing timing and rate information
*
* Implementation of pkts_acked.
* Deal with active drop under Early Congestion Indication.
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 2ec8c6f1cdcc..ded2cf1f6006 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -312,7 +312,6 @@ static void tcp_time_wait_init(struct sock *sk, struct tcp_timewait_sock *tcptw)
return;
if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key))
goto out_free;
- tcp_md5_add_sigpool();
}
return;
out_free:
@@ -406,7 +405,6 @@ void tcp_twsk_destructor(struct sock *sk)
if (twsk->tw_md5_key) {
kfree(twsk->tw_md5_key);
static_branch_slow_dec_deferred(&tcp_md5_needed);
- tcp_md5_release_sigpool();
}
}
#endif
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index b94efb3050d2..7f5df7a71f62 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3743,12 +3743,17 @@ void sk_forced_mem_schedule(struct sock *sk, int size)
delta = size - sk->sk_forward_alloc;
if (delta <= 0)
return;
+
amt = sk_mem_pages(delta);
sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
- sk_memory_allocated_add(sk, amt);
if (mem_cgroup_sk_enabled(sk))
mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge() | __GFP_NOFAIL);
+
+ if (sk->sk_bypass_prot_mem)
+ return;
+
+ sk_memory_allocated_add(sk, amt);
}
/* Send a FIN. The caller locks the socket for us.