[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <f72b77c3-15ac-3de3-5bce-c263564c1487@iogearbox.net>
Date: Fri, 17 Mar 2023 16:23:48 +0100
From: Daniel Borkmann <daniel@...earbox.net>
To: Kui-Feng Lee <kuifeng@...a.com>, bpf@...r.kernel.org,
ast@...nel.org, martin.lau@...ux.dev, song@...nel.org,
kernel-team@...a.com, andrii@...nel.org, sdf@...gle.com
Cc: netdev@...r.kernel.org, Eric Dumazet <edumazet@...gle.com>
Subject: Re: [PATCH bpf-next v7 2/8] net: Update an existing TCP congestion
control algorithm.
On 3/16/23 3:36 AM, Kui-Feng Lee wrote:
> This feature lets you immediately transition to another congestion
> control algorithm or implementation with the same name. Once a name
> is updated, new connections will apply this new algorithm.
>
> The purpose is to update a customized algorithm implemented in BPF
> struct_ops with a new version on the flight. The following is an
> example of using the userspace API implemented in later BPF patches.
>
> link = bpf_map__attach_struct_ops(skel->maps.ca_update_1);
> .......
> err = bpf_link__update_map(link, skel->maps.ca_update_2);
>
> We first load and register an algorithm implemented in BPF struct_ops,
> then swap it out with a new one using the same name. After that, newly
> created connections will apply the updated algorithm, while older ones
> retain the previous version already applied.
>
> This patch also takes this chance to refactor the ca validation into
> the new tcp_validate_congestion_control() function.
>
> Cc: netdev@...r.kernel.org, Eric Dumazet <edumazet@...gle.com>
> Signed-off-by: Kui-Feng Lee <kuifeng@...a.com>
> ---
> include/net/tcp.h | 3 +++
> net/ipv4/tcp_cong.c | 60 +++++++++++++++++++++++++++++++++++++++------
> 2 files changed, 56 insertions(+), 7 deletions(-)
>
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index db9f828e9d1e..2abb755e6a3a 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -1117,6 +1117,9 @@ struct tcp_congestion_ops {
>
> int tcp_register_congestion_control(struct tcp_congestion_ops *type);
> void tcp_unregister_congestion_control(struct tcp_congestion_ops *type);
> +int tcp_update_congestion_control(struct tcp_congestion_ops *type,
> + struct tcp_congestion_ops *old_type);
> +int tcp_validate_congestion_control(struct tcp_congestion_ops *ca);
>
> void tcp_assign_congestion_control(struct sock *sk);
> void tcp_init_congestion_control(struct sock *sk);
> diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
> index db8b4b488c31..c90791ae8389 100644
> --- a/net/ipv4/tcp_cong.c
> +++ b/net/ipv4/tcp_cong.c
> @@ -75,14 +75,8 @@ struct tcp_congestion_ops *tcp_ca_find_key(u32 key)
> return NULL;
> }
>
> -/*
> - * Attach new congestion control algorithm to the list
> - * of available options.
> - */
> -int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
> +int tcp_validate_congestion_control(struct tcp_congestion_ops *ca)
> {
> - int ret = 0;
> -
> /* all algorithms must implement these */
> if (!ca->ssthresh || !ca->undo_cwnd ||
> !(ca->cong_avoid || ca->cong_control)) {
> @@ -90,6 +84,20 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
> return -EINVAL;
> }
>
> + return 0;
> +}
> +
> +/* Attach new congestion control algorithm to the list
> + * of available options.
> + */
> +int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
> +{
> + int ret;
> +
> + ret = tcp_validate_congestion_control(ca);
> + if (ret)
> + return ret;
> +
> ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name));
>
> spin_lock(&tcp_cong_list_lock);
> @@ -130,6 +138,44 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
> }
> EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
>
> +/* Replace a registered old ca with a new one.
> + *
> + * The new ca must have the same name as the old one, that has been
> + * registered.
> + */
> +int tcp_update_congestion_control(struct tcp_congestion_ops *ca, struct tcp_congestion_ops *old_ca)
> +{
> + struct tcp_congestion_ops *existing;
> + int ret;
> +
> + ret = tcp_validate_congestion_control(ca);
> + if (ret)
> + return ret;
> +
> + ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name));
> +
> + spin_lock(&tcp_cong_list_lock);
> + existing = tcp_ca_find_key(old_ca->key);
> + if (ca->key == TCP_CA_UNSPEC || !existing || strcmp(existing->name, ca->name)) {
> + pr_notice("%s not registered or non-unique key\n",
> + ca->name);
> + ret = -EINVAL;
> + } else if (existing != old_ca) {
> + pr_notice("invalid old congestion control algorithm to replace\n");
> + ret = -EINVAL;
> + } else {
> + /* Add the new one before removing the old one to keep
> + * one implementation available all the time.
> + */
> + list_add_tail_rcu(&ca->list, &tcp_cong_list);
> + list_del_rcu(&existing->list);
> + pr_debug("%s updated\n", ca->name);
> + }
> + spin_unlock(&tcp_cong_list_lock);
> +
> + return ret;
> +}
Was wondering if we could have tcp_register_congestion_control and tcp_update_congestion_control
could be refactored for reuse. Maybe like below. From the function itself what is not clear whether
callers that replace an existing one should do the synchronize_rcu() themselves or if this should
be part of tcp_update_congestion_control?
int tcp_check_congestion_control(struct tcp_congestion_ops *ca)
{
/* All algorithms must implement these. */
if (!ca->ssthresh || !ca->undo_cwnd ||
!(ca->cong_avoid || ca->cong_control)) {
pr_err("%s does not implement required ops\n", ca->name);
return -EINVAL;
}
if (ca->key == TCP_CA_UNSPEC)
ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name));
if (ca->key == TCP_CA_UNSPEC) {
pr_notice("%s results in zero key\n", ca->name);
return -EEXIST;
}
return 0;
}
/* Attach new congestion control algorithm to the list of available
* options or replace an existing one if old is non-NULL.
*/
int tcp_update_congestion_control(struct tcp_congestion_ops *new,
struct tcp_congestion_ops *old)
{
struct tcp_congestion_ops *found;
int ret;
ret = tcp_check_congestion_control(new);
if (ret)
return ret;
if (old &&
(old->key != new->key ||
strcmp(old->name, new->name))) {
pr_notice("%s & %s have non-matching congestion control names\n",
old->name, new->name);
return -EINVAL;
}
spin_lock(&tcp_cong_list_lock);
found = tcp_ca_find_key(new->key);
if (old) {
if (found == old) {
list_add_tail_rcu(&new->list, &tcp_cong_list);
list_del_rcu(&old->list);
} else {
pr_notice("%s not registered\n", old->name);
ret = -EINVAL;
}
} else {
if (found) {
pr_notice("%s already registered\n", new->name);
ret = -EEXIST;
} else {
list_add_tail_rcu(&new->list, &tcp_cong_list);
}
}
spin_unlock(&tcp_cong_list_lock);
return ret;
}
int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
{
return tcp_update_congestion_control(ca, NULL);
}
EXPORT_SYMBOL_GPL(tcp_register_congestion_control);
Powered by blists - more mailing lists