[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250902183603.740428-1-edumazet@google.com>
Date: Tue, 2 Sep 2025 18:36:03 +0000
From: Eric Dumazet <edumazet@...gle.com>
To: "David S . Miller" <davem@...emloft.net>, Jakub Kicinski <kuba@...nel.org>,
Paolo Abeni <pabeni@...hat.com>
Cc: Simon Horman <horms@...nel.org>, netdev@...r.kernel.org, eric.dumazet@...il.com,
Eric Dumazet <edumazet@...gle.com>, syzbot+50603c05bbdf4dfdaffa@...kaller.appspotmail.com,
Sebastian Andrzej Siewior <bigeasy@...utronix.de>, Kuniyuki Iwashima <kuniyu@...gle.com>
Subject: [PATCH net] net: lockless sock_i_ino()
Followup of commit c51da3f7a161 ("net: remove sock_i_uid()")
A recent syzbot report was the trigger for this change.
Over the years, we had many problems caused by the
read_lock[_bh](&sk->sk_callback_lock) in sock_i_uid().
We could fix smc_diag_dump_proto() or make a more radical move:
Instead of waiting for new syzbot reports, cache the socket
inode number in sk->sk_ino, so that we no longer
need to acquire sk->sk_callback_lock in sock_i_ino().
This makes socket dumps faster (one less cache line miss,
and two atomic ops avoided).
Prior art:
commit 25a9c8a4431c ("netlink: Add __sock_i_ino() for __netlink_diag_dump().")
commit 4f9bf2a2f5aa ("tcp: Don't acquire inet_listen_hashbucket::lock with disabled BH.")
commit efc3dbc37412 ("rds: Make rds_sock_lock BH rather than IRQ safe.")
Fixes: d2d6422f8bd1 ("x86: Allow to enable PREEMPT_RT.")
Reported-by: syzbot+50603c05bbdf4dfdaffa@...kaller.appspotmail.com
Closes: https://lore.kernel.org/netdev/68b73804.050a0220.3db4df.01d8.GAE@google.com/T/#u
Signed-off-by: Eric Dumazet <edumazet@...gle.com>
Cc: Sebastian Andrzej Siewior <bigeasy@...utronix.de>
Cc: Kuniyuki Iwashima <kuniyu@...gle.com>
---
include/net/sock.h | 17 +++++++++++++----
net/core/sock.c | 22 ----------------------
net/mptcp/protocol.c | 1 -
net/netlink/diag.c | 2 +-
4 files changed, 14 insertions(+), 28 deletions(-)
diff --git a/include/net/sock.h b/include/net/sock.h
index c8a4b283df6f..fb13322a11fc 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -285,6 +285,7 @@ struct sk_filter;
* @sk_ack_backlog: current listen backlog
* @sk_max_ack_backlog: listen backlog set in listen()
* @sk_uid: user id of owner
+ * @sk_ino: inode number (zero if orphaned)
* @sk_prefer_busy_poll: prefer busypolling over softirq processing
* @sk_busy_poll_budget: napi processing budget when busypolling
* @sk_priority: %SO_PRIORITY setting
@@ -518,6 +519,7 @@ struct sock {
u32 sk_ack_backlog;
u32 sk_max_ack_backlog;
kuid_t sk_uid;
+ unsigned long sk_ino;
spinlock_t sk_peer_lock;
int sk_bind_phc;
struct pid *sk_peer_pid;
@@ -2056,6 +2058,10 @@ static inline int sk_rx_queue_get(const struct sock *sk)
static inline void sk_set_socket(struct sock *sk, struct socket *sock)
{
sk->sk_socket = sock;
+ if (sock) {
+ WRITE_ONCE(sk->sk_uid, SOCK_INODE(sock)->i_uid);
+ WRITE_ONCE(sk->sk_ino, SOCK_INODE(sock)->i_ino);
+ }
}
static inline wait_queue_head_t *sk_sleep(struct sock *sk)
@@ -2077,6 +2083,7 @@ static inline void sock_orphan(struct sock *sk)
sk_set_socket(sk, NULL);
sk->sk_wq = NULL;
/* Note: sk_uid is unchanged. */
+ WRITE_ONCE(sk->sk_ino, 0);
write_unlock_bh(&sk->sk_callback_lock);
}
@@ -2087,20 +2094,22 @@ static inline void sock_graft(struct sock *sk, struct socket *parent)
rcu_assign_pointer(sk->sk_wq, &parent->wq);
parent->sk = sk;
sk_set_socket(sk, parent);
- WRITE_ONCE(sk->sk_uid, SOCK_INODE(parent)->i_uid);
security_sock_graft(sk, parent);
write_unlock_bh(&sk->sk_callback_lock);
}
+static inline unsigned long sock_i_ino(const struct sock *sk)
+{
+ /* Paired with WRITE_ONCE() in sock_graft() and sock_orphan() */
+ return READ_ONCE(sk->sk_ino);
+}
+
static inline kuid_t sk_uid(const struct sock *sk)
{
/* Paired with WRITE_ONCE() in sockfs_setattr() */
return READ_ONCE(sk->sk_uid);
}
-unsigned long __sock_i_ino(struct sock *sk);
-unsigned long sock_i_ino(struct sock *sk);
-
static inline kuid_t sock_net_uid(const struct net *net, const struct sock *sk)
{
return sk ? sk_uid(sk) : make_kuid(net->user_ns, 0);
diff --git a/net/core/sock.c b/net/core/sock.c
index 7c26ec8dce63..158bddd23134 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2780,28 +2780,6 @@ void sock_pfree(struct sk_buff *skb)
EXPORT_SYMBOL(sock_pfree);
#endif /* CONFIG_INET */
-unsigned long __sock_i_ino(struct sock *sk)
-{
- unsigned long ino;
-
- read_lock(&sk->sk_callback_lock);
- ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
- read_unlock(&sk->sk_callback_lock);
- return ino;
-}
-EXPORT_SYMBOL(__sock_i_ino);
-
-unsigned long sock_i_ino(struct sock *sk)
-{
- unsigned long ino;
-
- local_bh_disable();
- ino = __sock_i_ino(sk);
- local_bh_enable();
- return ino;
-}
-EXPORT_SYMBOL(sock_i_ino);
-
/*
* Allocate a skb from the socket's send buffer.
*/
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 9a287b75c1b3..e6fd97b21e9e 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -3554,7 +3554,6 @@ void mptcp_sock_graft(struct sock *sk, struct socket *parent)
write_lock_bh(&sk->sk_callback_lock);
rcu_assign_pointer(sk->sk_wq, &parent->wq);
sk_set_socket(sk, parent);
- WRITE_ONCE(sk->sk_uid, SOCK_INODE(parent)->i_uid);
write_unlock_bh(&sk->sk_callback_lock);
}
diff --git a/net/netlink/diag.c b/net/netlink/diag.c
index 61981e01fd6f..b8e58132e8af 100644
--- a/net/netlink/diag.c
+++ b/net/netlink/diag.c
@@ -168,7 +168,7 @@ static int __netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
NLM_F_MULTI,
- __sock_i_ino(sk)) < 0) {
+ sock_i_ino(sk)) < 0) {
ret = 1;
break;
}
--
2.51.0.338.gd7d06c2dae-goog
Powered by blists - more mailing lists