[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <CC9C9B9C-6F03-4ABD-A180-95100737B2EE@icloud.com>
Date: Tue, 17 Oct 2023 20:37:43 -0700
From: Christoph Paasch <christophpaasch@...oud.com>
To: Eric Dumazet <edumazet@...gle.com>
Cc: David Miller <davem@...emloft.net>,
Jakub Kicinski <kuba@...nel.org>,
Paolo Abeni <pabeni@...hat.com>,
David Ahern <dsahern@...nel.org>,
netdev <netdev@...r.kernel.org>,
Eric Dumazet <eric.dumazet@...il.com>
Subject: Re: [PATCH v2 net-next 3/8] inet: implement lockless IP_TOS
Hello Eric,
> On Sep 21, 2023, at 8:42 PM, Eric Dumazet <edumazet@...gle.com> wrote:
>
> Some reads of inet->tos are racy.
>
> Add needed READ_ONCE() annotations and convert IP_TOS option lockless.
>
> v2: missing changes in include/net/route.h (David Ahern)
>
> Signed-off-by: Eric Dumazet <edumazet@...gle.com>
> ---
> include/net/ip.h | 3 +-
> include/net/route.h | 4 +--
> net/dccp/ipv4.c | 2 +-
> net/ipv4/inet_diag.c | 2 +-
> net/ipv4/ip_output.c | 4 +--
> net/ipv4/ip_sockglue.c | 29 ++++++++-----------
> net/ipv4/tcp_ipv4.c | 9 +++---
> net/mptcp/sockopt.c | 8 ++---
> net/sctp/protocol.c | 4 +--
> .../selftests/net/mptcp/mptcp_connect.sh | 2 +-
> 10 files changed, 31 insertions(+), 36 deletions(-)
This patch causes a NULL-pointer deref in my syzkaller instances:
BUG: kernel NULL pointer dereference, address: 0000000000000000
#PF: supervisor read access in kernel mode
#PF: error_code(0x0000) - not-present page
PGD 12bad6067 P4D 12bad6067 PUD 12bad5067 PMD 0
Oops: 0000 [#1] PREEMPT SMP
CPU: 1 PID: 2750 Comm: syz-executor.5 Not tainted 6.6.0-rc4-g7a5720a344e7 #49
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-2.el7 04/01/2014
RIP: 0010:tcp_get_metrics+0x118/0x8f0 net/ipv4/tcp_metrics.c:321
Code: c7 44 24 70 02 00 8b 03 89 44 24 48 c7 44 24 4c 00 00 00 00 66 c7 44 24 58 02 00 66 ba 02 00 b1 01 89 4c 24 04 4c 89 7c 24 10 <49> 8b 0f 48 8b 89 50 05 00 00 48 89 4c 24 30 33 81 00 02 00 00 69
RSP: 0018:ffffc90000af79b8 EFLAGS: 00010293
RAX: 000000000100007f RBX: ffff88812ae8f500 RCX: ffff88812b5f8f01
RDX: 0000000000000002 RSI: ffffffff8300f080 RDI: 0000000000000002
RBP: 0000000000000002 R08: 0000000000000003 R09: ffffffff8205eca0
R10: 0000000000000002 R11: ffff88812b5f8f00 R12: ffff88812a9e0580
R13: 0000000000000000 R14: ffff88812ae8fbd2 R15: 0000000000000000
FS: 00007f70a006b640(0000) GS:ffff88813bd00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000000 CR3: 000000012bad7003 CR4: 0000000000170ee0
Call Trace:
<TASK>
tcp_fastopen_cache_get+0x32/0x140 net/ipv4/tcp_metrics.c:567
tcp_fastopen_cookie_check+0x28/0x180 net/ipv4/tcp_fastopen.c:419
tcp_connect+0x9c8/0x12a0 net/ipv4/tcp_output.c:3839
tcp_v4_connect+0x645/0x6e0 net/ipv4/tcp_ipv4.c:323
__inet_stream_connect+0x120/0x590 net/ipv4/af_inet.c:676
tcp_sendmsg_fastopen+0x2d6/0x3a0 net/ipv4/tcp.c:1021
tcp_sendmsg_locked+0x1957/0x1b00 net/ipv4/tcp.c:1073
tcp_sendmsg+0x30/0x50 net/ipv4/tcp.c:1336
__sock_sendmsg+0x83/0xd0 net/socket.c:730
__sys_sendto+0x20a/0x2a0 net/socket.c:2194
__do_sys_sendto net/socket.c:2206 [inline]
__se_sys_sendto net/socket.c:2202 [inline]
__x64_sys_sendto+0x28/0x30 net/socket.c:2202
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x47/0xa0 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x6e/0xd8
The reason is that setting IP_TOS calls sk_reset_dst, which then causes these issues in the places where we assume that the dst in the socket is set (specifically, the tcp_connect-path).
Here is the syzkaller reproducer:
# {Threaded:true Repeat:true RepeatTimes:0 Procs:1 Slowdown:1 Sandbox:none SandboxArg:0 Leak:false NetInjection:false NetDevices:true NetReset:false Cgroups:false BinfmtMisc:false CloseFDs:true KCSAN:false DevlinkPCI:false NicVF:false USB:false VhciInjection:false Wifi:false IEEE802154:false Sysctl:false Swap:false UseTmpDir:false HandleSegv:false Repro:false Trace:false LegacyOptions:{Collide:false Fault:false FaultCall:0 FaultNth:0}}
r0 = socket$inet(0x2, 0x1, 0x0)
sendto$inet(r0, 0x0, 0x0, 0x20000841, &(0x7f0000000080)={0x2, 0x4e20, @dev={0xac, 0x14, 0x14, 0x15}}, 0x10) (async)
setsockopt$inet_int(r0, 0x0, 0x1, &(0x7f00000002c0)=0x81, 0x4)
Cheers,
Christoph
>
> diff --git a/include/net/ip.h b/include/net/ip.h
> index 46933a0d98eac2db40c2e88006125588b8f8143e..6fbc0dcf4b9780d60b5e5d6f84d6017fbf57d0ae 100644
> --- a/include/net/ip.h
> +++ b/include/net/ip.h
> @@ -258,7 +258,7 @@ static inline u8 ip_sendmsg_scope(const struct inet_sock *inet,
>
> static inline __u8 get_rttos(struct ipcm_cookie* ipc, struct inet_sock *inet)
> {
> - return (ipc->tos != -1) ? RT_TOS(ipc->tos) : RT_TOS(inet->tos);
> + return (ipc->tos != -1) ? RT_TOS(ipc->tos) : RT_TOS(READ_ONCE(inet->tos));
> }
>
> /* datagram.c */
> @@ -810,6 +810,5 @@ int ip_sock_set_mtu_discover(struct sock *sk, int val);
> void ip_sock_set_pktinfo(struct sock *sk);
> void ip_sock_set_recverr(struct sock *sk);
> void ip_sock_set_tos(struct sock *sk, int val);
> -void __ip_sock_set_tos(struct sock *sk, int val);
>
> #endif /* _IP_H */
> diff --git a/include/net/route.h b/include/net/route.h
> index 51a45b1887b562bfb473f9f8c50897d5d3073476..5c248a8e3d0e3ed757ad95f546032c2c49729eec 100644
> --- a/include/net/route.h
> +++ b/include/net/route.h
> @@ -37,7 +37,7 @@
>
> #define RTO_ONLINK 0x01
>
> -#define RT_CONN_FLAGS(sk) (RT_TOS(inet_sk(sk)->tos) | sock_flag(sk, SOCK_LOCALROUTE))
> +#define RT_CONN_FLAGS(sk) (RT_TOS(READ_ONCE(inet_sk(sk)->tos)) | sock_flag(sk, SOCK_LOCALROUTE))
> #define RT_CONN_FLAGS_TOS(sk,tos) (RT_TOS(tos) | sock_flag(sk, SOCK_LOCALROUTE))
>
> static inline __u8 ip_sock_rt_scope(const struct sock *sk)
> @@ -50,7 +50,7 @@ static inline __u8 ip_sock_rt_scope(const struct sock *sk)
>
> static inline __u8 ip_sock_rt_tos(const struct sock *sk)
> {
> - return RT_TOS(inet_sk(sk)->tos);
> + return RT_TOS(READ_ONCE(inet_sk(sk)->tos));
> }
>
> struct ip_tunnel_info;
> diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
> index 69453b936bd557c77a790a27ff64cc91e5a58296..1b8cbfda6e5dbd098a58d92639a64bc8db83ff23 100644
> --- a/net/dccp/ipv4.c
> +++ b/net/dccp/ipv4.c
> @@ -511,7 +511,7 @@ static int dccp_v4_send_response(const struct sock *sk, struct request_sock *req
> err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
> ireq->ir_rmt_addr,
> rcu_dereference(ireq->ireq_opt),
> - inet_sk(sk)->tos);
> + READ_ONCE(inet_sk(sk)->tos));
> rcu_read_unlock();
> err = net_xmit_eval(err);
> }
> diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
> index e13a84433413ed88088435ff8e11efeb30fc3cca..1f2d7a8bd060e59baeb00fcb1c6aabfcb3bb213d 100644
> --- a/net/ipv4/inet_diag.c
> +++ b/net/ipv4/inet_diag.c
> @@ -134,7 +134,7 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
> * hence this needs to be included regardless of socket family.
> */
> if (ext & (1 << (INET_DIAG_TOS - 1)))
> - if (nla_put_u8(skb, INET_DIAG_TOS, inet->tos) < 0)
> + if (nla_put_u8(skb, INET_DIAG_TOS, READ_ONCE(inet->tos)) < 0)
> goto errout;
>
> #if IS_ENABLED(CONFIG_IPV6)
> diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
> index 2be281f184a5fe5a695ccd51fabe69fa45bea0b8..85320f92e8363d59e92c54139044cbab7e0561fa 100644
> --- a/net/ipv4/ip_output.c
> +++ b/net/ipv4/ip_output.c
> @@ -544,7 +544,7 @@ EXPORT_SYMBOL(__ip_queue_xmit);
>
> int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
> {
> - return __ip_queue_xmit(sk, skb, fl, inet_sk(sk)->tos);
> + return __ip_queue_xmit(sk, skb, fl, READ_ONCE(inet_sk(sk)->tos));
> }
> EXPORT_SYMBOL(ip_queue_xmit);
>
> @@ -1438,7 +1438,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
> iph = ip_hdr(skb);
> iph->version = 4;
> iph->ihl = 5;
> - iph->tos = (cork->tos != -1) ? cork->tos : inet->tos;
> + iph->tos = (cork->tos != -1) ? cork->tos : READ_ONCE(inet->tos);
> iph->frag_off = df;
> iph->ttl = ttl;
> iph->protocol = sk->sk_protocol;
> diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
> index 6d874cc03c8b4e88d79ebc50a6db105606b6ae60..50c008efbb6de7303621dd30b178c90cb3f5a2fc 100644
> --- a/net/ipv4/ip_sockglue.c
> +++ b/net/ipv4/ip_sockglue.c
> @@ -585,25 +585,20 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
> return err;
> }
>
> -void __ip_sock_set_tos(struct sock *sk, int val)
> +void ip_sock_set_tos(struct sock *sk, int val)
> {
> + u8 old_tos = READ_ONCE(inet_sk(sk)->tos);
> +
> if (sk->sk_type == SOCK_STREAM) {
> val &= ~INET_ECN_MASK;
> - val |= inet_sk(sk)->tos & INET_ECN_MASK;
> + val |= old_tos & INET_ECN_MASK;
> }
> - if (inet_sk(sk)->tos != val) {
> - inet_sk(sk)->tos = val;
> + if (old_tos != val) {
> + WRITE_ONCE(inet_sk(sk)->tos, val);
> WRITE_ONCE(sk->sk_priority, rt_tos2priority(val));
> sk_dst_reset(sk);
> }
> }
> -
> -void ip_sock_set_tos(struct sock *sk, int val)
> -{
> - lock_sock(sk);
> - __ip_sock_set_tos(sk, val);
> - release_sock(sk);
> -}
> EXPORT_SYMBOL(ip_sock_set_tos);
>
> void ip_sock_set_freebind(struct sock *sk)
> @@ -1050,6 +1045,9 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
> return 0;
> case IP_MTU_DISCOVER:
> return ip_sock_set_mtu_discover(sk, val);
> + case IP_TOS: /* This sets both TOS and Precedence */
> + ip_sock_set_tos(sk, val);
> + return 0;
> }
>
> err = 0;
> @@ -1104,9 +1102,6 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
> }
> }
> break;
> - case IP_TOS: /* This sets both TOS and Precedence */
> - __ip_sock_set_tos(sk, val);
> - break;
> case IP_UNICAST_IF:
> {
> struct net_device *dev = NULL;
> @@ -1593,6 +1588,9 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
> case IP_MTU_DISCOVER:
> val = READ_ONCE(inet->pmtudisc);
> goto copyval;
> + case IP_TOS:
> + val = READ_ONCE(inet->tos);
> + goto copyval;
> }
>
> if (needs_rtnl)
> @@ -1629,9 +1627,6 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
> return -EFAULT;
> return 0;
> }
> - case IP_TOS:
> - val = inet->tos;
> - break;
> case IP_MTU:
> {
> struct dst_entry *dst;
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index f13eb7e23d03f3681055257e6ebea0612ae3f9b3..1f89ba58e71eff74d8ed75019de9e70d2f4d5926 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -1024,10 +1024,11 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
> if (skb) {
> __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
>
> - tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
> - (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
> - (inet_sk(sk)->tos & INET_ECN_MASK) :
> - inet_sk(sk)->tos;
> + tos = READ_ONCE(inet_sk(sk)->tos);
> +
> + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
> + tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
> + (tos & INET_ECN_MASK);
>
> if (!INET_ECN_is_capable(tos) &&
> tcp_bpf_ca_needs_ecn((struct sock *)req))
> diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
> index 8260202c00669fd7d2eed2f94a3c2cf225a0d89c..155e8472ba9b83c35c6f827b2bb35c0be4127917 100644
> --- a/net/mptcp/sockopt.c
> +++ b/net/mptcp/sockopt.c
> @@ -734,11 +734,11 @@ static int mptcp_setsockopt_v4_set_tos(struct mptcp_sock *msk, int optname,
>
> lock_sock(sk);
> sockopt_seq_inc(msk);
> - val = inet_sk(sk)->tos;
> + val = READ_ONCE(inet_sk(sk)->tos);
> mptcp_for_each_subflow(msk, subflow) {
> struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
>
> - __ip_sock_set_tos(ssk, val);
> + ip_sock_set_tos(ssk, val);
> }
> release_sock(sk);
>
> @@ -1343,7 +1343,7 @@ static int mptcp_getsockopt_v4(struct mptcp_sock *msk, int optname,
>
> switch (optname) {
> case IP_TOS:
> - return mptcp_put_int_option(msk, optval, optlen, inet_sk(sk)->tos);
> + return mptcp_put_int_option(msk, optval, optlen, READ_ONCE(inet_sk(sk)->tos));
> }
>
> return -EOPNOTSUPP;
> @@ -1411,7 +1411,7 @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk)
> ssk->sk_bound_dev_if = sk->sk_bound_dev_if;
> ssk->sk_incoming_cpu = sk->sk_incoming_cpu;
> ssk->sk_ipv6only = sk->sk_ipv6only;
> - __ip_sock_set_tos(ssk, inet_sk(sk)->tos);
> + ip_sock_set_tos(ssk, inet_sk(sk)->tos);
>
> if (sk->sk_userlocks & tx_rx_locks) {
> ssk->sk_userlocks |= sk->sk_userlocks & tx_rx_locks;
> diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
> index 2185f44198deb002bc8ed7f1b0f3fe02d6bb9f09..94c6dd53cd62d1fa6236d07946e8d5ff68eb587d 100644
> --- a/net/sctp/protocol.c
> +++ b/net/sctp/protocol.c
> @@ -426,7 +426,7 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
> struct dst_entry *dst = NULL;
> union sctp_addr *daddr = &t->ipaddr;
> union sctp_addr dst_saddr;
> - __u8 tos = inet_sk(sk)->tos;
> + u8 tos = READ_ONCE(inet_sk(sk)->tos);
>
> if (t->dscp & SCTP_DSCP_SET_MASK)
> tos = t->dscp & SCTP_DSCP_VAL_MASK;
> @@ -1057,7 +1057,7 @@ static inline int sctp_v4_xmit(struct sk_buff *skb, struct sctp_transport *t)
> struct flowi4 *fl4 = &t->fl.u.ip4;
> struct sock *sk = skb->sk;
> struct inet_sock *inet = inet_sk(sk);
> - __u8 dscp = inet->tos;
> + __u8 dscp = READ_ONCE(inet->tos);
> __be16 df = 0;
>
> pr_debug("%s: skb:%p, len:%d, src:%pI4, dst:%pI4\n", __func__, skb,
> diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh
> index b1fc8afd072dc6ddde8d561a675a5549a9a37dba..61a2a1988ce69ffa17e0dd8e629eac550f4f7d99 100755
> --- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh
> +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh
> @@ -716,7 +716,7 @@ run_test_transparent()
> # the required infrastructure in MPTCP sockopt code. To support TOS, the
> # following function has been exported (T). Not great but better than
> # checking for a specific kernel version.
> - if ! mptcp_lib_kallsyms_has "T __ip_sock_set_tos$"; then
> + if ! mptcp_lib_kallsyms_has "T ip_sock_set_tos$"; then
> echo "INFO: ${msg} not supported by the kernel: SKIP"
> mptcp_lib_result_skip "${TEST_GROUP}"
> return
> --
> 2.42.0.515.g380fc7ccd1-goog
>
>
Powered by blists - more mailing lists