[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Date: Sat, 6 Jun 2020 18:26:51 +0200
From: Jakub Sitnicki <jakub@...udflare.com>
To: John Fastabend <john.fastabend@...il.com>
Cc: netdev@...r.kernel.org, bpf@...r.kernel.org
Subject: Re: [bpf-next PATCH 1/3] bpf: refactor sockmap redirect code so its
easy to reuse
On Fri, 29 May 2020 23:06:41 +0000
John Fastabend <john.fastabend@...il.com> wrote:
> We will need this block of code called from tls context shortly
> lets refactor the redirect logic so its easy to use. This also
> cleans up the switch stmt so we have fewer fallthrough cases.
>
> No logic changes are intended.
>
> Fixes: d829e9c4112b5 ("tls: convert to generic sk_msg interface")
> Signed-off-by: John Fastabend <john.fastabend@...il.com>
> ---
> net/core/skmsg.c | 55 +++++++++++++++++++++++++++++++++---------------------
> 1 file changed, 34 insertions(+), 21 deletions(-)
>
> diff --git a/net/core/skmsg.c b/net/core/skmsg.c
> index c479372..9d72f71 100644
> --- a/net/core/skmsg.c
> +++ b/net/core/skmsg.c
> @@ -682,13 +682,43 @@ static struct sk_psock *sk_psock_from_strp(struct strparser *strp)
> return container_of(parser, struct sk_psock, parser);
> }
>
> -static void sk_psock_verdict_apply(struct sk_psock *psock,
> - struct sk_buff *skb, int verdict)
> +static void sk_psock_skb_redirect(struct sk_psock *psock, struct sk_buff *skb)
> {
> struct sk_psock *psock_other;
> struct sock *sk_other;
> bool ingress;
>
> + sk_other = tcp_skb_bpf_redirect_fetch(skb);
> + if (unlikely(!sk_other)) {
> + kfree_skb(skb);
> + return;
> + }
> + psock_other = sk_psock(sk_other);
I think we're not in RCU read-side critical section and so lockdep-RCU
[0] is complaining about accessing sk_user_data with rcu_dereference:
bash-5.0# ./test_sockmap
# 1/ 6 sockmap::txmsg test passthrough:OK
# 2/ 6 sockmap::txmsg test redirect:OK
# 3/ 6 sockmap::txmsg test drop:OK
# 4/ 6 sockmap::txmsg test ingress redirect:OK
[ 96.791996]
[ 96.792211] =============================
[ 96.792763] WARNING: suspicious RCU usage
[ 96.793297] 5.7.0-rc7-02964-g615b5749876a-dirty #692 Not tainted
[ 96.794032] -----------------------------
[ 96.794480] include/linux/skmsg.h:284 suspicious rcu_dereference_check() usage!
[ 96.795154]
[ 96.795154] other info that might help us debug this:
[ 96.795154]
[ 96.795926]
[ 96.795926] rcu_scheduler_active = 2, debug_locks = 1
[ 96.796556] 1 lock held by test_sockmap/1060:
[ 96.796970] #0: ffff8880a0d35f20 (sk_lock-AF_INET){+.+.}-{0:0}, at: tls_sw_recvmsg+0x238/0xc10
[ 96.797813]
[ 96.797813] stack backtrace:
[ 96.798224] CPU: 1 PID: 1060 Comm: test_sockmap Not tainted 5.7.0-rc7-02964-g615b5749876a-dirty #692
[ 96.799071] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-buildvm-ppc64le-16.ppc.fedoraproject.org-3.fc31 04/01/2014
[ 96.800294] Call Trace:
[ 96.800543] dump_stack+0x97/0xe0
[ 96.800864] sk_psock_skb_redirect.isra.0+0xa6/0x1b0
[ 96.801338] sk_psock_tls_strp_read+0x298/0x310
[ 96.801769] tls_sw_recvmsg+0xa47/0xc10
[ 96.802144] ? decrypt_skb+0x80/0x80
[ 96.802491] ? lock_downgrade+0x330/0x330
[ 96.802887] inet_recvmsg+0xae/0x2a0
[ 96.803224] ? rw_copy_check_uvector+0x15e/0x1b0
[ 96.803669] ? inet_sendpage+0xc0/0xc0
[ 96.804029] ____sys_recvmsg+0x110/0x210
[ 96.804417] ? kernel_recvmsg+0x60/0x60
[ 96.804785] ? copy_msghdr_from_user+0x91/0xd0
[ 96.805200] ? __copy_msghdr_from_user+0x230/0x230
[ 96.805665] ? lock_acquire+0x120/0x4b0
[ 96.806025] ? match_held_lock+0x1b/0x230
[ 96.806417] ___sys_recvmsg+0xb8/0x100
[ 96.806778] ? ___sys_sendmsg+0x110/0x110
[ 96.807155] ? lock_downgrade+0x330/0x330
[ 96.807555] ? __fget_light+0xad/0x110
[ 96.807917] ? sockfd_lookup_light+0x91/0xb0
[ 96.808334] __sys_recvmsg+0x87/0xe0
[ 96.808674] ? __sys_recvmsg_sock+0x70/0x70
[ 96.809064] ? rcu_read_lock_sched_held+0x81/0xb0
[ 96.809540] ? switch_fpu_return+0x1/0x250
[ 96.809948] ? do_syscall_64+0x5f/0x9a0
[ 96.810325] do_syscall_64+0xad/0x9a0
[ 96.810676] ? handle_mm_fault+0x21e/0x3d0
[ 96.811060] ? syscall_return_slowpath+0x530/0x530
[ 96.811524] ? trace_hardirqs_on_thunk+0x1a/0x1c
[ 96.811955] ? lockdep_hardirqs_off+0xb5/0x100
[ 96.812383] ? trace_hardirqs_off_thunk+0x1a/0x1c
[ 96.812871] entry_SYSCALL_64_after_hwframe+0x49/0xb3
[ 96.813425] RIP: 0033:0x7f92ff15c737
[ 96.813762] Code: 02 b8 ff ff ff ff eb bd 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 2f 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 89 54 24 1c 48 89 74 24 10
[ 96.815477] RSP: 002b:00007ffd80734b68 EFLAGS: 00000246 ORIG_RAX: 000000000000002f
[ 96.816177] RAX: ffffffffffffffda RBX: 000000000000001c RCX: 00007f92ff15c737
[ 96.816847] RDX: 0000000000004000 RSI: 00007ffd80734bd0 RDI: 000000000000001c
[ 96.817602] RBP: 00007ffd80734c50 R08: 00007ffd80734bc0 R09: 0000000000000060
[ 96.818351] R10: fffffffffffffb15 R11: 0000000000000246 R12: 0000000000000000
[ 96.819107] R13: 0000000000004000 R14: 00007f92fef7a6b8 R15: 00007ffd80734d50
[0] https://www.kernel.org/doc/Documentation/RCU/lockdep-splat.txt
> + if (!psock_other || sock_flag(sk_other, SOCK_DEAD) ||
> + !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) {
> + kfree_skb(skb);
> + return;
> + }
> +
> + ingress = tcp_skb_bpf_ingress(skb);
> + if ((!ingress && sock_writeable(sk_other)) ||
> + (ingress &&
> + atomic_read(&sk_other->sk_rmem_alloc) <=
> + sk_other->sk_rcvbuf)) {
> + if (!ingress)
> + skb_set_owner_w(skb, sk_other);
> + skb_queue_tail(&psock_other->ingress_skb, skb);
> + schedule_work(&psock_other->work);
> + } else {
> + kfree_skb(skb);
> + }
> +}
> +
> +static void sk_psock_verdict_apply(struct sk_psock *psock,
> + struct sk_buff *skb, int verdict)
> +{
> + struct sock *sk_other;
> +
> switch (verdict) {
> case __SK_PASS:
> sk_other = psock->sk;
> @@ -707,25 +737,8 @@ static void sk_psock_verdict_apply(struct sk_psock *psock,
> }
> goto out_free;
> case __SK_REDIRECT:
> - sk_other = tcp_skb_bpf_redirect_fetch(skb);
> - if (unlikely(!sk_other))
> - goto out_free;
> - psock_other = sk_psock(sk_other);
> - if (!psock_other || sock_flag(sk_other, SOCK_DEAD) ||
> - !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED))
> - goto out_free;
> - ingress = tcp_skb_bpf_ingress(skb);
> - if ((!ingress && sock_writeable(sk_other)) ||
> - (ingress &&
> - atomic_read(&sk_other->sk_rmem_alloc) <=
> - sk_other->sk_rcvbuf)) {
> - if (!ingress)
> - skb_set_owner_w(skb, sk_other);
> - skb_queue_tail(&psock_other->ingress_skb, skb);
> - schedule_work(&psock_other->work);
> - break;
> - }
> - /* fall-through */
> + sk_psock_skb_redirect(psock, skb);
> + break;
> case __SK_DROP:
> /* fall-through */
> default:
>
Powered by blists - more mailing lists