Message-ID: <87tty55aou.fsf@cloudflare.com>
Date: Tue, 28 Mar 2023 14:09:27 +0200
From: Jakub Sitnicki <jakub@...udflare.com>
To: John Fastabend <john.fastabend@...il.com>
Cc: cong.wang@...edance.com, daniel@...earbox.net, lmb@...valent.com,
edumazet@...gle.com, bpf@...r.kernel.org, netdev@...r.kernel.org,
ast@...nel.org, andrii@...nel.org, will@...valent.com
Subject: Re: [PATCH bpf v2 02/12] bpf: sockmap, convert schedule_work into
delayed_work
On Mon, Mar 27, 2023 at 10:54 AM -07, John Fastabend wrote:
> Sk_buffs are fed into sockmap verdict programs either from a strparser
> (when the user wants to control how skbs are framed by attaching
> another parser program) or directly through tcp_read_sock. The
> tcp_read_sock path is the preferred method for performance when the
> BPF program itself handles the stream parsing.
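
Side note for readers following along: which of the two paths is taken
depends on how the programs are attached to the sockmap. A rough
userspace sketch using libbpf; the fd variables are placeholders:

  #include <bpf/bpf.h>   /* libbpf */

  /* Strparser path: a parser program controls framing, and the
   * verdict program then runs on each parsed record.
   */
  bpf_prog_attach(parser_fd, sockmap_fd, BPF_SK_SKB_STREAM_PARSER, 0);
  bpf_prog_attach(verdict_fd, sockmap_fd, BPF_SK_SKB_STREAM_VERDICT, 0);

  /* Direct path (the fast path described above): verdict program
   * only; skbs are fed to it via tcp_read_sock().
   */
  bpf_prog_attach(verdict_fd, sockmap_fd, BPF_SK_SKB_VERDICT, 0);
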
>
> The flow for Cilium's common use case with a stream parser is,
>
> tcp_read_sock()
>   sk_psock_verdict_recv()
>     ret = bpf_prog_run_pin_on_cpu()
>     sk_psock_verdict_apply(sock, skb, ret)
>       // if the system is under memory pressure or the app is slow we
>       // may need to queue the skb. Do this queuing through ingress_skb
>       // and then kick the workqueue to wake up the handler
>       skb_queue_tail(ingress_skb, skb)
>       schedule_work(work);
>
>
> The work queue is wired up to sk_psock_backlog(). This will then walk
> the ingress_skb list that holds the sk_buffs which could not be handled
> immediately, but should be OK to run at some later point. However, it
> is possible that the workqueue doing this work still hits an error when
> sending the skb. When this happens the skbuff is requeued on a
> temporary 'state' struct kept with the workqueue. This is necessary
> because it is possible to partially send an skbuff before hitting an
> error, and we need to know how and where to restart when the workqueue
> runs next.
>
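On the 'state' struct mentioned above, a sketch of its layout around
this series, for readers following along (see skmsg.h for the
authoritative definition):

  /* Cached backlog state: the skb that could not be fully sent, how
   * many bytes remain (len), and the offset to restart from (off) on
   * the next workqueue run.
   */
  struct sk_psock_work_state {
          struct sk_buff          *skb;
          u32                     len;
          u32                     off;
  };
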
> Now for the trouble: we don't re-kick the workqueue. This can cause a
> stall where the skbuff we just cached in the state variable might never
> be sent. This happens when it is the last packet in a flow and no
> further packets arrive that would cause the system to kick the
> workqueue from that side.
>
> To fix this we could do a simple schedule_work(), but while under
> memory pressure it makes sense to back off somewhat instead of retrying
> repeatedly. So instead, convert schedule_work to schedule_delayed_work
> and add backoff logic that reschedules from the backlog queue on
> errors. It is not obvious what a good backoff is, so use '1'.
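
Worth noting for readers: the delay argument to schedule_delayed_work()
is in jiffies, so '1' defers the retry by a single tick, roughly 1-10 ms
depending on CONFIG_HZ. The retry pattern the patch adopts, in sketch
form:

  /* Soft error: keep the partially sent skb in work_state and retry
   * after one jiffy; a delay of 0 would behave like schedule_work().
   */
  sk_psock_skb_state(psock, state, skb, len, off);
  schedule_delayed_work(&psock->work, 1);
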
>
> While testing we observed some flakes when running the NGINX compliance
> test with sockmap; we attributed these failed tests to this bug and a
> subsequent issue.
>
> Fixes: 04919bed948dc ("tcp: Introduce tcp_read_skb()")
> Tested-by: William Findlay <will@...valent.com>
> Signed-off-by: John Fastabend <john.fastabend@...il.com>
> ---
> include/linux/skmsg.h | 2 +-
> net/core/skmsg.c | 19 ++++++++++++-------
> net/core/sock_map.c | 3 ++-
> 3 files changed, 15 insertions(+), 9 deletions(-)
>
> diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
> index 84f787416a54..904ff9a32ad6 100644
> --- a/include/linux/skmsg.h
> +++ b/include/linux/skmsg.h
> @@ -105,7 +105,7 @@ struct sk_psock {
> struct proto *sk_proto;
> struct mutex work_mutex;
> struct sk_psock_work_state work_state;
> - struct work_struct work;
> + struct delayed_work work;
> struct rcu_work rwork;
> };
>
> diff --git a/net/core/skmsg.c b/net/core/skmsg.c
> index 2b6d9519ff29..96a6a3a74a67 100644
> --- a/net/core/skmsg.c
> +++ b/net/core/skmsg.c
> @@ -481,7 +481,7 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
> }
> out:
> if (psock->work_state.skb && copied > 0)
> - schedule_work(&psock->work);
> + schedule_delayed_work(&psock->work, 0);
> return copied;
> }
> EXPORT_SYMBOL_GPL(sk_msg_recvmsg);
> @@ -639,7 +639,8 @@ static void sk_psock_skb_state(struct sk_psock *psock,
>
> static void sk_psock_backlog(struct work_struct *work)
> {
> - struct sk_psock *psock = container_of(work, struct sk_psock, work);
> + struct delayed_work *dwork = to_delayed_work(work);
> + struct sk_psock *psock = container_of(dwork, struct sk_psock, work);
> struct sk_psock_work_state *state = &psock->work_state;
> struct sk_buff *skb = NULL;
> bool ingress;
> @@ -679,6 +680,10 @@ static void sk_psock_backlog(struct work_struct *work)
> if (ret == -EAGAIN) {
> sk_psock_skb_state(psock, state, skb,
> len, off);
> +
> + // Delay slightly to prioritize any
> + // other work that might be here.
> + schedule_delayed_work(&psock->work, 1);
Do I understand correctly that this means we can back out the changes
from commit bec217197b41 ("skmsg: Schedule psock work if the cached skb
exists on the psock")?

Nit: comment syntax. Networking code uses /* ... */ rather than //.
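
If I read it right, the check in question is the one visible in the
sk_msg_recvmsg() hunk above, now converted to the delayed variant:

  out:
          if (psock->work_state.skb && copied > 0)
                  schedule_delayed_work(&psock->work, 0);
          return copied;
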
> goto end;
> }
> /* Hard errors break pipe and stop xmit. */
> @@ -733,7 +738,7 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node)
> INIT_LIST_HEAD(&psock->link);
> spin_lock_init(&psock->link_lock);
>
> - INIT_WORK(&psock->work, sk_psock_backlog);
> + INIT_DELAYED_WORK(&psock->work, sk_psock_backlog);
> mutex_init(&psock->work_mutex);
> INIT_LIST_HEAD(&psock->ingress_msg);
> spin_lock_init(&psock->ingress_lock);
> @@ -822,7 +827,7 @@ static void sk_psock_destroy(struct work_struct *work)
>
> sk_psock_done_strp(psock);
>
> - cancel_work_sync(&psock->work);
> + cancel_delayed_work_sync(&psock->work);
> mutex_destroy(&psock->work_mutex);
>
> psock_progs_drop(&psock->progs);
> @@ -937,7 +942,7 @@ static int sk_psock_skb_redirect(struct sk_psock *from, struct sk_buff *skb)
> }
>
> skb_queue_tail(&psock_other->ingress_skb, skb);
> - schedule_work(&psock_other->work);
> + schedule_delayed_work(&psock_other->work, 0);
> spin_unlock_bh(&psock_other->ingress_lock);
> return 0;
> }
> @@ -1017,7 +1022,7 @@ static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb,
> spin_lock_bh(&psock->ingress_lock);
> if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
> skb_queue_tail(&psock->ingress_skb, skb);
> - schedule_work(&psock->work);
> + schedule_delayed_work(&psock->work, 0);
> err = 0;
> }
> spin_unlock_bh(&psock->ingress_lock);
> @@ -1048,7 +1053,7 @@ static void sk_psock_write_space(struct sock *sk)
> psock = sk_psock(sk);
> if (likely(psock)) {
> if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
> - schedule_work(&psock->work);
> + schedule_delayed_work(&psock->work, 0);
> write_space = psock->saved_write_space;
> }
> rcu_read_unlock();
> diff --git a/net/core/sock_map.c b/net/core/sock_map.c
> index a68a7290a3b2..d38267201892 100644
> --- a/net/core/sock_map.c
> +++ b/net/core/sock_map.c
> @@ -1624,9 +1624,10 @@ void sock_map_close(struct sock *sk, long timeout)
> rcu_read_unlock();
> sk_psock_stop(psock);
> release_sock(sk);
> - cancel_work_sync(&psock->work);
> + cancel_delayed_work_sync(&psock->work);
> sk_psock_put(sk, psock);
> }
> +
> /* Make sure we do not recurse. This is a bug.
> * Leak the socket instead of crashing on a stack overflow.
> */