[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20230921133021.1995349-3-edumazet@google.com>
Date: Thu, 21 Sep 2023 13:30:15 +0000
From: Eric Dumazet <edumazet@...gle.com>
To: "David S . Miller" <davem@...emloft.net>, Jakub Kicinski <kuba@...nel.org>,
Paolo Abeni <pabeni@...hat.com>
Cc: David Ahern <dsahern@...nel.org>, netdev@...r.kernel.org, eric.dumazet@...il.com,
Eric Dumazet <edumazet@...gle.com>
Subject: [PATCH net-next 2/8] inet: implement lockless IP_MTU_DISCOVER
inet->pmtudisc can be read locklessly.
Implement proper lockless reads and writes to inet->pmtudisc
ip_sock_set_mtu_discover() can now be called from arbitrary
contexts.
Signed-off-by: Eric Dumazet <edumazet@...gle.com>
---
include/net/ip.h | 13 ++++++++-----
net/ipv4/ip_output.c | 7 ++++---
net/ipv4/ip_sockglue.c | 17 ++++++-----------
net/ipv4/ping.c | 2 +-
net/ipv4/raw.c | 2 +-
net/ipv4/udp.c | 2 +-
net/netfilter/ipvs/ip_vs_sync.c | 2 +-
7 files changed, 22 insertions(+), 23 deletions(-)
diff --git a/include/net/ip.h b/include/net/ip.h
index 3489a1cca5e7bc315ba646f6bc125b2b6ded9416..46933a0d98eac2db40c2e88006125588b8f8143e 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -434,19 +434,22 @@ int ip_dont_fragment(const struct sock *sk, const struct dst_entry *dst)
static inline bool ip_sk_accept_pmtu(const struct sock *sk)
{
- return inet_sk(sk)->pmtudisc != IP_PMTUDISC_INTERFACE &&
- inet_sk(sk)->pmtudisc != IP_PMTUDISC_OMIT;
+ u8 pmtudisc = READ_ONCE(inet_sk(sk)->pmtudisc);
+
+ return pmtudisc != IP_PMTUDISC_INTERFACE &&
+ pmtudisc != IP_PMTUDISC_OMIT;
}
static inline bool ip_sk_use_pmtu(const struct sock *sk)
{
- return inet_sk(sk)->pmtudisc < IP_PMTUDISC_PROBE;
+ return READ_ONCE(inet_sk(sk)->pmtudisc) < IP_PMTUDISC_PROBE;
}
static inline bool ip_sk_ignore_df(const struct sock *sk)
{
- return inet_sk(sk)->pmtudisc < IP_PMTUDISC_DO ||
- inet_sk(sk)->pmtudisc == IP_PMTUDISC_OMIT;
+ u8 pmtudisc = READ_ONCE(inet_sk(sk)->pmtudisc);
+
+ return pmtudisc < IP_PMTUDISC_DO || pmtudisc == IP_PMTUDISC_OMIT;
}
static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst,
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index adad16f1e872ce20941a087b3965fdb040868d4e..2be281f184a5fe5a695ccd51fabe69fa45bea0b8 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1387,8 +1387,8 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
struct ip_options *opt = NULL;
struct rtable *rt = (struct rtable *)cork->dst;
struct iphdr *iph;
+ u8 pmtudisc, ttl;
__be16 df = 0;
- __u8 ttl;
skb = __skb_dequeue(queue);
if (!skb)
@@ -1418,8 +1418,9 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
/* DF bit is set when we want to see DF on outgoing frames.
* If ignore_df is set too, we still allow to fragment this frame
* locally. */
- if (inet->pmtudisc == IP_PMTUDISC_DO ||
- inet->pmtudisc == IP_PMTUDISC_PROBE ||
+ pmtudisc = READ_ONCE(inet->pmtudisc);
+ if (pmtudisc == IP_PMTUDISC_DO ||
+ pmtudisc == IP_PMTUDISC_PROBE ||
(skb->len <= dst_mtu(&rt->dst) &&
ip_dont_fragment(sk, &rt->dst)))
df = htons(IP_DF);
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 4ad3003378ae6b186513000264f77b54a7babe6d..6d874cc03c8b4e88d79ebc50a6db105606b6ae60 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -622,9 +622,7 @@ int ip_sock_set_mtu_discover(struct sock *sk, int val)
{
if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT)
return -EINVAL;
- lock_sock(sk);
- inet_sk(sk)->pmtudisc = val;
- release_sock(sk);
+ WRITE_ONCE(inet_sk(sk)->pmtudisc, val);
return 0;
}
EXPORT_SYMBOL(ip_sock_set_mtu_discover);
@@ -1050,6 +1048,8 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
return -EINVAL;
WRITE_ONCE(inet->mc_ttl, val);
return 0;
+ case IP_MTU_DISCOVER:
+ return ip_sock_set_mtu_discover(sk, val);
}
err = 0;
@@ -1107,11 +1107,6 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
case IP_TOS: /* This sets both TOS and Precedence */
__ip_sock_set_tos(sk, val);
break;
- case IP_MTU_DISCOVER:
- if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT)
- goto e_inval;
- inet->pmtudisc = val;
- break;
case IP_UNICAST_IF:
{
struct net_device *dev = NULL;
@@ -1595,6 +1590,9 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
case IP_MULTICAST_TTL:
val = READ_ONCE(inet->mc_ttl);
goto copyval;
+ case IP_MTU_DISCOVER:
+ val = READ_ONCE(inet->pmtudisc);
+ goto copyval;
}
if (needs_rtnl)
@@ -1634,9 +1632,6 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
case IP_TOS:
val = inet->tos;
break;
- case IP_MTU_DISCOVER:
- val = inet->pmtudisc;
- break;
case IP_MTU:
{
struct dst_entry *dst;
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 4dd809b7b18867154df42bc28809b886913e253c..50d12b0c8d46fdcd9b448c3ebc90395ebf426075 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -551,7 +551,7 @@ void ping_err(struct sk_buff *skb, int offset, u32 info)
case ICMP_DEST_UNREACH:
if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
ipv4_sk_update_pmtu(skb, sk, info);
- if (inet_sock->pmtudisc != IP_PMTUDISC_DONT) {
+ if (READ_ONCE(inet_sock->pmtudisc) != IP_PMTUDISC_DONT) {
err = EMSGSIZE;
harderr = 1;
break;
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 4b5db5d1edc279df1fd7412af2845a7a79c95ec8..ade1aecd7c71184d753a28a67bc9b30087247db4 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -239,7 +239,7 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info)
if (code > NR_ICMP_UNREACH)
break;
if (code == ICMP_FRAG_NEEDED) {
- harderr = inet->pmtudisc != IP_PMTUDISC_DONT;
+ harderr = READ_ONCE(inet->pmtudisc) != IP_PMTUDISC_DONT;
err = EMSGSIZE;
} else {
err = icmp_err_convert[code].errno;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index c3ff984b63547daf0ecfb4ab96956aee2f8d589d..731a723dc80816f0b5b0803d7397f7e9e8cd8b09 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -750,7 +750,7 @@ int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
case ICMP_DEST_UNREACH:
if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
ipv4_sk_update_pmtu(skb, sk, info);
- if (inet->pmtudisc != IP_PMTUDISC_DONT) {
+ if (READ_ONCE(inet->pmtudisc) != IP_PMTUDISC_DONT) {
err = EMSGSIZE;
harderr = 1;
break;
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 3eed1670224888acf639cff06537ddf2505461bb..4f6c795588fbdbf084154025b8172e0fd2ea7384 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -1335,7 +1335,7 @@ static void set_mcast_pmtudisc(struct sock *sk, int val)
/* setsockopt(sock, SOL_IP, IP_MTU_DISCOVER, &val, sizeof(val)); */
lock_sock(sk);
- inet->pmtudisc = val;
+ WRITE_ONCE(inet->pmtudisc, val);
#ifdef CONFIG_IP_VS_IPV6
if (sk->sk_family == AF_INET6) {
struct ipv6_pinfo *np = inet6_sk(sk);
--
2.42.0.459.ge4e396fd5e-goog
Powered by blists - more mailing lists