[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <a72b816.ab94.18d37564300.Coremail.00107082@163.com>
Date: Wed, 24 Jan 2024 01:20:16 +0800 (CST)
From: "David Wang" <00107082@....com>
To: "Jozsef Kadlecsik" <kadlec@...ckhole.kfki.hu>
Cc: "Eric Dumazet" <edumazet@...gle.com>,
"Pablo Neira Ayuso" <pablo@...filter.org>,
netfilter-devel@...r.kernel.org,
"David Miller" <davem@...emloft.net>, netdev@...r.kernel.org,
kuba@...nel.org, pabeni@...hat.com,
"Florian Westphal" <fw@...len.de>,
"Ale Crismani" <ale.crismani@...omattic.com>
Subject: Re: [PATCH net 14/14] netfilter: ipset: fix performance regression
in swap operation
At 2024-01-23 17:03:42, "Jozsef Kadlecsik" <kadlec@...ckhole.kfki.hu> wrote:
>Hi,
>
>On Thu, 18 Jan 2024, Jozsef Kadlecsik wrote:
>
>> On Thu, 18 Jan 2024, Eric Dumazet wrote:
>>
>> > On Wed, Jan 17, 2024 at 5:00 PM Pablo Neira Ayuso <pablo@...filter.org> wrote:
>> > >
>> > > The patch "netfilter: ipset: fix race condition between swap/destroy
>> > > and kernel side add/del/test", commit 28628fa9 fixes a race condition.
>> > > But the synchronize_rcu() added to the swap function unnecessarily slows
>> > > it down: it can safely be moved to destroy and use call_rcu() instead.
>> > > Thus we can get back the same performance and preventing the race condition
>> > > at the same time.
>[...]
>> > > +static void
>> > > +ip_set_destroy_set_rcu(struct rcu_head *head)
>> > > +{
>> > > + struct ip_set *set = container_of(head, struct ip_set, rcu);
>> > > +
>> > > + ip_set_destroy_set(set);
>> >
>> > Calling ip_set_destroy_set() from BH (rcu callbacks) is not working.
>>
>> Yeah, it calls cancel_delayed_work_sync() to handle the garbage collector
>> and that can wait. The call can be moved into the main destroy function
>> and let the rcu callback do just the minimal job, however it needs a
>> restructuring. So please skip this patch now.
>
>I reworked the patch to work safely with using call_rcu() and still
>preventing the race condition. According to my tests the patch works as
>intended, but it'd be good to receive feedback that it indeed fixes the
>issue properly.
I apply the patch on 6.8.0-rc1, stress create-add-swap-destroy sequence for 1000000 rounds, and no performance regression observed.
$ time sudo ./a.out
real 0m19.717s
user 0m0.849s
sys 0m17.060s
I also test with two processes create-add-swap-destroy disjoint set of ipset parallelly on a 2-Core VM, no abnormality noticed other than some slower, I guess this is expected, right?
$ time sudo ./a.out
real 0m23.625s
user 0m0.827s
sys 0m20.222s
$ time sudo ./a2.out
real 0m23.838s
user 0m0.835s
sys 0m20.401s
No obverse slab memory increment can be confirmed after several round of stress.
cat /proc/meminfo | grep Slab:
39248 kB - 39368 kB -- 39340 kB - 39388 kB ----- 39308 kB
FYI
David
>
>From 23e85f453da573dd4265e7d1fffd2aeab3369e0d Mon Sep 17 00:00:00 2001
>From: Jozsef Kadlecsik <kadlec@...filter.org>
>Date: Tue, 16 Jan 2024 17:10:45 +0100
>Subject: [PATCH 1/1] netfilter: ipset: fix performance regression in swap
> operation
>
>The patch "netfilter: ipset: fix race condition between swap/destroy
>and kernel side add/del/test", commit 28628fa9 fixes a race condition.
>But the synchronize_rcu() added to the swap function unnecessarily slows
>it down: it can safely be moved to destroy and use call_rcu() instead.
>
>Eric Dumazet pointed out that simply calling the destroy functions as
>rcu callback does not work: sets with timeout use garbage collectors
>which need cancelling at destroy which can wait. Therefore the destroy
>functions are split into two: cancelling garbage collectors safely at
>executing the command received by netlink and moving the remaining
>part only into the rcu callback.
>
>Link: https://lore.kernel.org/lkml/C0829B10-EAA6-4809-874E-E1E9C05A8D84@automattic.com/
>Fixes: 28628fa952fe ("netfilter: ipset: fix race condition between swap/destroy and kernel side add/del/test")
>Reported-by: Ale Crismani <ale.crismani@...omattic.com>
>Reported-by: David Wang <00107082@....com
>Signed-off-by: Jozsef Kadlecsik <kadlec@...filter.org>
>---
> include/linux/netfilter/ipset/ip_set.h | 4 +++
> net/netfilter/ipset/ip_set_bitmap_gen.h | 14 ++++++++--
> net/netfilter/ipset/ip_set_core.c | 37 +++++++++++++++++++------
> net/netfilter/ipset/ip_set_hash_gen.h | 15 ++++++++--
> net/netfilter/ipset/ip_set_list_set.c | 13 +++++++--
> 5 files changed, 65 insertions(+), 18 deletions(-)
>
>diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
>index e8c350a3ade1..e9f4f845d760 100644
>--- a/include/linux/netfilter/ipset/ip_set.h
>+++ b/include/linux/netfilter/ipset/ip_set.h
>@@ -186,6 +186,8 @@ struct ip_set_type_variant {
> /* Return true if "b" set is the same as "a"
> * according to the create set parameters */
> bool (*same_set)(const struct ip_set *a, const struct ip_set *b);
>+ /* Cancel ongoing garbage collectors before destroying the set*/
>+ void (*cancel_gc)(struct ip_set *set);
> /* Region-locking is used */
> bool region_lock;
> };
>@@ -242,6 +244,8 @@ extern void ip_set_type_unregister(struct ip_set_type *set_type);
>
> /* A generic IP set */
> struct ip_set {
>+ /* For call_cru in destroy */
>+ struct rcu_head rcu;
> /* The name of the set */
> char name[IPSET_MAXNAMELEN];
> /* Lock protecting the set data */
>diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h
>index 26ab0e9612d8..9523104a90da 100644
>--- a/net/netfilter/ipset/ip_set_bitmap_gen.h
>+++ b/net/netfilter/ipset/ip_set_bitmap_gen.h
>@@ -28,6 +28,7 @@
> #define mtype_del IPSET_TOKEN(MTYPE, _del)
> #define mtype_list IPSET_TOKEN(MTYPE, _list)
> #define mtype_gc IPSET_TOKEN(MTYPE, _gc)
>+#define mtype_cancel_gc IPSET_TOKEN(MTYPE, _cancel_gc)
> #define mtype MTYPE
>
> #define get_ext(set, map, id) ((map)->extensions + ((set)->dsize * (id)))
>@@ -57,9 +58,6 @@ mtype_destroy(struct ip_set *set)
> {
> struct mtype *map = set->data;
>
>- if (SET_WITH_TIMEOUT(set))
>- del_timer_sync(&map->gc);
>-
> if (set->dsize && set->extensions & IPSET_EXT_DESTROY)
> mtype_ext_cleanup(set);
> ip_set_free(map->members);
>@@ -288,6 +286,15 @@ mtype_gc(struct timer_list *t)
> add_timer(&map->gc);
> }
>
>+static void
>+mtype_cancel_gc(struct ip_set *set)
>+{
>+ struct mtype *map = set->data;
>+
>+ if (SET_WITH_TIMEOUT(set))
>+ del_timer_sync(&map->gc);
>+}
>+
> static const struct ip_set_type_variant mtype = {
> .kadt = mtype_kadt,
> .uadt = mtype_uadt,
>@@ -301,6 +308,7 @@ static const struct ip_set_type_variant mtype = {
> .head = mtype_head,
> .list = mtype_list,
> .same_set = mtype_same_set,
>+ .cancel_gc = mtype_cancel_gc,
> };
>
> #endif /* __IP_SET_BITMAP_IP_GEN_H */
>diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
>index 4c133e06be1d..bcaad9c009fe 100644
>--- a/net/netfilter/ipset/ip_set_core.c
>+++ b/net/netfilter/ipset/ip_set_core.c
>@@ -1182,6 +1182,14 @@ ip_set_destroy_set(struct ip_set *set)
> kfree(set);
> }
>
>+static void
>+ip_set_destroy_set_rcu(struct rcu_head *head)
>+{
>+ struct ip_set *set = container_of(head, struct ip_set, rcu);
>+
>+ ip_set_destroy_set(set);
>+}
>+
> static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
> const struct nlattr * const attr[])
> {
>@@ -1193,8 +1201,6 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
> if (unlikely(protocol_min_failed(attr)))
> return -IPSET_ERR_PROTOCOL;
>
>- /* Must wait for flush to be really finished in list:set */
>- rcu_barrier();
>
> /* Commands are serialized and references are
> * protected by the ip_set_ref_lock.
>@@ -1206,8 +1212,10 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
> * counter, so if it's already zero, we can proceed
> * without holding the lock.
> */
>- read_lock_bh(&ip_set_ref_lock);
> if (!attr[IPSET_ATTR_SETNAME]) {
>+ /* Must wait for flush to be really finished in list:set */
>+ rcu_barrier();
>+ read_lock_bh(&ip_set_ref_lock);
> for (i = 0; i < inst->ip_set_max; i++) {
> s = ip_set(inst, i);
> if (s && (s->ref || s->ref_netlink)) {
>@@ -1221,6 +1229,8 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
> s = ip_set(inst, i);
> if (s) {
> ip_set(inst, i) = NULL;
>+ /* Must cancel garbage collectors */
>+ s->variant->cancel_gc(s);
> ip_set_destroy_set(s);
> }
> }
>@@ -1228,6 +1238,9 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
> inst->is_destroyed = false;
> } else {
> u32 flags = flag_exist(info->nlh);
>+ u16 features = 0;
>+
>+ read_lock_bh(&ip_set_ref_lock);
> s = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]),
> &i);
> if (!s) {
>@@ -1238,10 +1251,16 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
> ret = -IPSET_ERR_BUSY;
> goto out;
> }
>+ features = s->type->features;
> ip_set(inst, i) = NULL;
> read_unlock_bh(&ip_set_ref_lock);
>-
>- ip_set_destroy_set(s);
>+ if (features & IPSET_TYPE_NAME) {
>+ /* Must wait for flush to be really finished */
>+ rcu_barrier();
>+ }
>+ /* Must cancel garbage collectors */
>+ s->variant->cancel_gc(s);
>+ call_rcu(&s->rcu, ip_set_destroy_set_rcu);
> }
> return 0;
> out:
>@@ -1394,9 +1413,6 @@ static int ip_set_swap(struct sk_buff *skb, const struct nfnl_info *info,
> ip_set(inst, to_id) = from;
> write_unlock_bh(&ip_set_ref_lock);
>
>- /* Make sure all readers of the old set pointers are completed. */
>- synchronize_rcu();
>-
> return 0;
> }
>
>@@ -2409,8 +2425,11 @@ ip_set_fini(void)
> {
> nf_unregister_sockopt(&so_set);
> nfnetlink_subsys_unregister(&ip_set_netlink_subsys);
>-
> unregister_pernet_subsys(&ip_set_net_ops);
>+
>+ /* Wait for call_rcu() in destroy */
>+ rcu_barrier();
>+
> pr_debug("these are the famous last words\n");
> }
>
>diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
>index 7c2399541771..c62998b46f00 100644
>--- a/net/netfilter/ipset/ip_set_hash_gen.h
>+++ b/net/netfilter/ipset/ip_set_hash_gen.h
>@@ -221,6 +221,7 @@ static const union nf_inet_addr zeromask = {};
> #undef mtype_gc_do
> #undef mtype_gc
> #undef mtype_gc_init
>+#undef mtype_cancel_gc
> #undef mtype_variant
> #undef mtype_data_match
>
>@@ -265,6 +266,7 @@ static const union nf_inet_addr zeromask = {};
> #define mtype_gc_do IPSET_TOKEN(MTYPE, _gc_do)
> #define mtype_gc IPSET_TOKEN(MTYPE, _gc)
> #define mtype_gc_init IPSET_TOKEN(MTYPE, _gc_init)
>+#define mtype_cancel_gc IPSET_TOKEN(MTYPE, _cancel_gc)
> #define mtype_variant IPSET_TOKEN(MTYPE, _variant)
> #define mtype_data_match IPSET_TOKEN(MTYPE, _data_match)
>
>@@ -449,9 +451,6 @@ mtype_destroy(struct ip_set *set)
> struct htype *h = set->data;
> struct list_head *l, *lt;
>
>- if (SET_WITH_TIMEOUT(set))
>- cancel_delayed_work_sync(&h->gc.dwork);
>-
> mtype_ahash_destroy(set, ipset_dereference_nfnl(h->table), true);
> list_for_each_safe(l, lt, &h->ad) {
> list_del(l);
>@@ -598,6 +597,15 @@ mtype_gc_init(struct htable_gc *gc)
> queue_delayed_work(system_power_efficient_wq, &gc->dwork, HZ);
> }
>
>+static void
>+mtype_cancel_gc(struct ip_set *set)
>+{
>+ struct htype *h = set->data;
>+
>+ if (SET_WITH_TIMEOUT(set))
>+ cancel_delayed_work_sync(&h->gc.dwork);
>+}
>+
> static int
> mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
> struct ip_set_ext *mext, u32 flags);
>@@ -1440,6 +1448,7 @@ static const struct ip_set_type_variant mtype_variant = {
> .uref = mtype_uref,
> .resize = mtype_resize,
> .same_set = mtype_same_set,
>+ .cancel_gc = mtype_cancel_gc,
> .region_lock = true,
> };
>
>diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c
>index e162636525cf..6c3f28bc59b3 100644
>--- a/net/netfilter/ipset/ip_set_list_set.c
>+++ b/net/netfilter/ipset/ip_set_list_set.c
>@@ -426,9 +426,6 @@ list_set_destroy(struct ip_set *set)
> struct list_set *map = set->data;
> struct set_elem *e, *n;
>
>- if (SET_WITH_TIMEOUT(set))
>- timer_shutdown_sync(&map->gc);
>-
> list_for_each_entry_safe(e, n, &map->members, list) {
> list_del(&e->list);
> ip_set_put_byindex(map->net, e->id);
>@@ -545,6 +542,15 @@ list_set_same_set(const struct ip_set *a, const struct ip_set *b)
> a->extensions == b->extensions;
> }
>
>+static void
>+list_set_cancel_gc(struct ip_set *set)
>+{
>+ struct list_set *map = set->data;
>+
>+ if (SET_WITH_TIMEOUT(set))
>+ timer_shutdown_sync(&map->gc);
>+}
>+
> static const struct ip_set_type_variant set_variant = {
> .kadt = list_set_kadt,
> .uadt = list_set_uadt,
>@@ -558,6 +564,7 @@ static const struct ip_set_type_variant set_variant = {
> .head = list_set_head,
> .list = list_set_list,
> .same_set = list_set_same_set,
>+ .cancel_gc = list_set_cancel_gc,
> };
>
> static void
>--
>2.39.2
>
>Best regards,
>Jozsef
>--
>E-mail : kadlec@...ckhole.kfki.hu, kadlecsik.jozsef@...ner.hu
>PGP key : https://wigner.hu/~kadlec/pgp_public_key.txt
>Address : Wigner Research Centre for Physics
> H-1525 Budapest 114, POB. 49, Hungary
Powered by blists - more mailing lists