[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <0e1656dc-b67c-ec65-83a4-6709fb186061@linux.dev>
Date: Mon, 15 May 2023 15:52:42 -0700
From: Martin KaFai Lau <martin.lau@...ux.dev>
To: "D. Wythe" <alibuda@...ux.alibaba.com>
Cc: kuba@...nel.org, davem@...emloft.net, netdev@...r.kernel.org,
linux-s390@...r.kernel.org, linux-rdma@...r.kernel.org, bpf@...r.kernel.org,
kgraul@...ux.ibm.com, wenjia@...ux.ibm.com, jaka@...ux.ibm.com,
ast@...nel.org, daniel@...earbox.net, andrii@...nel.org, pabeni@...hat.com,
song@...nel.org, sdf@...gle.com, haoluo@...gle.com, yhs@...com,
edumazet@...gle.com, john.fastabend@...il.com, kpsingh@...nel.org,
jolsa@...nel.org, guwen@...ux.alibaba.com
Subject: Re: [PATCH bpf-next v1 2/5] net/smc: allow smc to negotiate protocols
on policies
On 5/11/23 11:24 PM, D. Wythe wrote:
> From: "D. Wythe" <alibuda@...ux.alibaba.com>
>
> As we all know, the SMC protocol is not suitable for all scenarios,
> especially for short-lived connections. However, most applications cannot
> guarantee that there are no such scenarios at all. Therefore, apps
> may need some specific strategies to decide whether we need to use SMC
> or not.
>
> Just like the congestion control implementation in TCP, this patch
> provides a generic negotiator implementation. If necessary,
> we can provide different protocol negotiation strategies for
> apps based on this implementation.
>
> But most importantly, this patch provides the possibility of
> eBPF injection, allowing users to implement their own protocol
> negotiation policy in userspace.
>
> Signed-off-by: D. Wythe <alibuda@...ux.alibaba.com>
> ---
> include/net/smc.h | 32 +++++++++++
> net/Makefile | 1 +
> net/smc/Kconfig | 11 ++++
> net/smc/af_smc.c | 134 ++++++++++++++++++++++++++++++++++++++++++++++-
> net/smc/smc_negotiator.c | 119 +++++++++++++++++++++++++++++++++++++++++
> net/smc/smc_negotiator.h | 116 ++++++++++++++++++++++++++++++++++++++++
> 6 files changed, 412 insertions(+), 1 deletion(-)
> create mode 100644 net/smc/smc_negotiator.c
> create mode 100644 net/smc/smc_negotiator.h
>
> diff --git a/include/net/smc.h b/include/net/smc.h
> index 6d076f5..191061c 100644
> --- a/include/net/smc.h
> +++ b/include/net/smc.h
> @@ -296,6 +296,8 @@ struct smc_sock { /* smc sock container */
> atomic_t queued_smc_hs; /* queued smc handshakes */
> struct inet_connection_sock_af_ops af_ops;
> const struct inet_connection_sock_af_ops *ori_af_ops;
> + /* protocol negotiator ops */
> + const struct smc_sock_negotiator_ops *negotiator_ops;
> /* original af ops */
> int sockopt_defer_accept;
> /* sockopt TCP_DEFER_ACCEPT
> @@ -316,4 +318,34 @@ struct smc_sock { /* smc sock container */
> */
> };
>
> +#ifdef CONFIG_SMC_BPF
> +/* BPF struct ops for smc protocol negotiator */
> +struct smc_sock_negotiator_ops {
> +
> + struct list_head list;
> +
> + /* ops name */
> + char name[16];
> + /* key for name */
> + u32 key;
> +
> + /* init with sk */
> + void (*init)(struct sock *sk);
> +
> + /* release with sk */
> + void (*release)(struct sock *sk);
> +
> + /* advice for negotiate */
> + int (*negotiate)(struct sock *sk);
> +
> + /* info gathering timing */
> + void (*collect_info)(struct sock *sk, int timing);
> +
> + /* module owner */
> + struct module *owner;
> +};
> +#else
> +struct smc_sock_negotiator_ops {};
> +#endif
> +
> #endif /* _SMC_H */
> diff --git a/net/Makefile b/net/Makefile
> index 4c4dc53..222916a 100644
> --- a/net/Makefile
> +++ b/net/Makefile
> @@ -52,6 +52,7 @@ obj-$(CONFIG_TIPC) += tipc/
> obj-$(CONFIG_NETLABEL) += netlabel/
> obj-$(CONFIG_IUCV) += iucv/
> obj-$(CONFIG_SMC) += smc/
> +obj-$(CONFIG_SMC_BPF) += smc/smc_negotiator.o
> obj-$(CONFIG_RFKILL) += rfkill/
> obj-$(CONFIG_NET_9P) += 9p/
> obj-$(CONFIG_CAIF) += caif/
> diff --git a/net/smc/Kconfig b/net/smc/Kconfig
> index 1ab3c5a..bdcc9f1 100644
> --- a/net/smc/Kconfig
> +++ b/net/smc/Kconfig
> @@ -19,3 +19,14 @@ config SMC_DIAG
> smcss.
>
> if unsure, say Y.
> +
> +config SMC_BPF
> + bool "SMC: support eBPF" if SMC
so smc_negotiator will always be in the kernel image even if af_smc is compiled
as a module? If SMC_BPF needs to support af_smc as a module, proper
implementation needs to be added to bpf_struct_ops to support modules first. It
is work-in-progress.
> + depends on BPF_SYSCALL
> + default n
> + help
> + Supports eBPF to allow user-mode participation in SMC's protocol process
> + via eBPF programs. Alternatively, obtain information about the SMC socks
> + through the eBPF programs.
> +
> + If unsure, say N.
> diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
> index 50c38b6..7406fd4 100644
> --- a/net/smc/af_smc.c
> +++ b/net/smc/af_smc.c
> @@ -52,6 +52,7 @@
> #include "smc_close.h"
> #include "smc_stats.h"
> #include "smc_tracepoint.h"
> +#include "smc_negotiator.h"
> #include "smc_sysctl.h"
>
> static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group
> @@ -68,6 +69,119 @@
> static void smc_tcp_listen_work(struct work_struct *);
> static void smc_connect_work(struct work_struct *);
>
> +#ifdef CONFIG_SMC_BPF
> +
> +/* Check if sock should use smc */
> +int smc_sock_should_select_smc(const struct smc_sock *smc)
> +{
> + const struct smc_sock_negotiator_ops *ops;
> + int ret;
> +
> + rcu_read_lock();
> + ops = READ_ONCE(smc->negotiator_ops);
> +
> + /* No negotiator_ops supply or no negotiate func set,
> + * always pass it.
> + */
> + if (!ops || !ops->negotiate) {
A smc_sock_negotiator_ops without ->negotiate? Is it useful at all to allow the
register in the first place?
> + rcu_read_unlock();
> + return SK_PASS;
> + }
> +
> + ret = ops->negotiate((struct sock *)&smc->sk);
> + rcu_read_unlock();
> + return ret;
> +}
> +
> +void smc_sock_perform_collecting_info(const struct smc_sock *smc, int timing)
> +{
> + const struct smc_sock_negotiator_ops *ops;
> +
> + rcu_read_lock();
> + ops = READ_ONCE(smc->negotiator_ops);
> +
> + if (!ops || !ops->collect_info) {
> + rcu_read_unlock();
> + return;
> + }
> +
> + ops->collect_info((struct sock *)&smc->sk, timing);
> + rcu_read_unlock();
> +}
> +
> +int smc_sock_assign_negotiator_ops(struct smc_sock *smc, const char *name)
> +{
> + struct smc_sock_negotiator_ops *ops;
> + int ret = -EINVAL;
> +
> + /* already set */
> + if (READ_ONCE(smc->negotiator_ops))
> + smc_sock_cleanup_negotiator_ops(smc, /* might be still referenced */ false);
> +
> + /* Just for clear negotiator_ops */
> + if (!name || !strlen(name))
> + return 0;
> +
> + rcu_read_lock();
> + ops = smc_negotiator_ops_get_by_name(name);
> + if (likely(ops)) {
> + if (unlikely(!bpf_try_module_get(ops, ops->owner))) {
> + ret = -EACCES;
> + } else {
> + WRITE_ONCE(smc->negotiator_ops, ops);
> + /* make sure ops can be seen */
> + smp_wmb();
This rcu_read_lock(), WRITE_ONCE, and smp_wmb() combo looks very suspicious.
smc->negotiator_ops is protected by rcu (+refcnt) or lock_sock()?
I am going to stop reviewing here.
> + if (ops->init)
> + ops->init(&smc->sk);
> + ret = 0;
> + }
> + }
> + rcu_read_unlock();
> + return ret;
> +}
> +
> +void smc_sock_cleanup_negotiator_ops(struct smc_sock *smc, bool no_more)
> +{
> + const struct smc_sock_negotiator_ops *ops;
> +
> + ops = READ_ONCE(smc->negotiator_ops);
> +
> + /* not all smc sock has negotiator_ops */
> + if (!ops)
> + return;
> +
> + might_sleep();
> +
> + /* Just ensure data integrity */
> + WRITE_ONCE(smc->negotiator_ops, NULL);
> + /* make sure NULL can be seen */
> + smp_wmb();
> + /* if the socks may have references to the negotiator ops to be removed.
> + * it means that we might need to wait for the readers of ops
> + * to complete. It's slow though.
> + */
> + if (unlikely(!no_more))
> + synchronize_rcu();
> + if (ops->release)
> + ops->release(&smc->sk);
> + bpf_module_put(ops, ops->owner);
> +}
> +
> +void smc_sock_clone_negotiator_ops(struct sock *parent, struct sock *child)
> +{
> + const struct smc_sock_negotiator_ops *ops;
> +
> + rcu_read_lock();
> + ops = READ_ONCE(smc_sk(parent)->negotiator_ops);
> + if (ops && bpf_try_module_get(ops, ops->owner)) {
> + smc_sk(child)->negotiator_ops = ops;
> + if (ops->init)
> + ops->init(child);
> + }
> + rcu_read_unlock();
> +}
> +#endif
> +
> int smc_nl_dump_hs_limitation(struct sk_buff *skb, struct netlink_callback *cb)
> {
> struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb);
> @@ -166,6 +280,9 @@ static bool smc_hs_congested(const struct sock *sk)
> if (workqueue_congested(WORK_CPU_UNBOUND, smc_hs_wq))
> return true;
>
> + if (!smc_sock_should_select_smc(smc))
> + return true;
> +
> return false;
> }
>
> @@ -320,6 +437,9 @@ static int smc_release(struct socket *sock)
> sock_hold(sk); /* sock_put below */
> smc = smc_sk(sk);
>
> + /* trigger info gathering if needed.*/
> + smc_sock_perform_collecting_info(smc, SMC_SOCK_CLOSED_TIMING);
> +
> old_state = sk->sk_state;
>
> /* cleanup for a dangling non-blocking connect */
> @@ -356,6 +476,9 @@ static int smc_release(struct socket *sock)
>
> static void smc_destruct(struct sock *sk)
> {
> + /* cleanup negotiator_ops if set */
> + smc_sock_cleanup_negotiator_ops(smc_sk(sk), /* no longer used */ true);
> +
> if (sk->sk_state != SMC_CLOSED)
> return;
> if (!sock_flag(sk, SOCK_DEAD))
> @@ -1627,7 +1750,14 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr,
> }
>
> smc_copy_sock_settings_to_clc(smc);
> - tcp_sk(smc->clcsock->sk)->syn_smc = 1;
> + /* accept out connection as SMC connection */
> + if (smc_sock_should_select_smc(smc) == SK_PASS) {
> + tcp_sk(smc->clcsock->sk)->syn_smc = 1;
> + } else {
> + tcp_sk(smc->clcsock->sk)->syn_smc = 0;
> + smc_switch_to_fallback(smc, /* active fallback */ 0);
> + }
> +
> if (smc->connect_nonblock) {
> rc = -EALREADY;
> goto out;
> @@ -1679,6 +1809,8 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
> }
> *new_smc = smc_sk(new_sk);
>
> + smc_sock_clone_negotiator_ops(lsk, new_sk);
> +
> mutex_lock(&lsmc->clcsock_release_lock);
> if (lsmc->clcsock)
> rc = kernel_accept(lsmc->clcsock, &new_clcsock, SOCK_NONBLOCK);
Powered by blists - more mailing lists