Date:   Wed, 29 Nov 2017 08:14:38 -0800
From:   Alexander Duyck <alexander.duyck@...il.com>
To:     Paolo Abeni <pabeni@...hat.com>
Cc:     Netdev <netdev@...r.kernel.org>,
        Jamal Hadi Salim <jhs@...atatu.com>,
        Cong Wang <xiyou.wangcong@...il.com>,
        Jiri Pirko <jiri@...nulli.us>,
        "David S. Miller" <davem@...emloft.net>
Subject: Re: [RFC PATCH] net_sched: bulk free tcf_block

On Wed, Nov 29, 2017 at 6:25 AM, Paolo Abeni <pabeni@...hat.com> wrote:
> Currently, deleting a qdisc with a large number of children and filters
> can take a lot of time:
>
> tc qdisc add dev lo root htb
> for I in `seq 1 1000`; do
>         tc class add dev lo parent 1: classid 1:$I htb rate 100kbit
>         tc qdisc add dev lo parent 1:$I handle $((I + 1)): htb
>         for J in `seq 1 10`; do
>                 tc filter add dev lo parent $((I + 1)): u32 match ip src 1.1.1.$J
>         done
> done
> time tc qdisc del dev lo root
>
> real    0m54.764s
> user    0m0.023s
> sys     0m0.000s
>
> This is due to the multiple rcu_barrier() calls, one for each tcf_block
> freed, all invoked with the rtnl lock held. Most other network-related
> tasks will block for the whole duration.
>
> This change implements bulk freeing of tcf_blocks at destroy() time:
> when deleting a qdisc hierarchy, the to-be-deleted blocks are queued
> on an rtnl_lock-protected list, and a single rcu_barrier() is invoked
> per burst.
>
> The burst is flushed after the deletion of the topmost qdisc of the
> destroyed hierarchy, and all the queued blocks are freed by a single
> delayed work call.
>
> After this change, the previous example gives:
>
> real    0m0.193s
> user    0m0.000s
> sys     0m0.016s
>
> Signed-off-by: Paolo Abeni <pabeni@...hat.com>

While I agree there is an issue, I don't think this is being approached
in quite the right way. The question I have is: why aren't we using the
standard RCU approach for this and simply using either call_rcu or
kfree_rcu to free the object?
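
Roughly what I had in mind is something like this (untested sketch; it
assumes a rcu_head is added to struct tcf_block, and it hand-waves over
where the chain 0 puts would end up):

static void tcf_block_free_rcu(struct rcu_head *head)
{
	struct tcf_block *block = container_of(head, struct tcf_block, rcu);

	/* Runs after a grace period has elapsed; the final kfree()
	 * doesn't need the rtnl lock.
	 */
	kfree(block);
}

	/* in tcf_block_put_ext(), instead of rcu_barrier() + deferred work: */
	call_rcu(&block->rcu, tcf_block_free_rcu);

That way each block is freed asynchronously after a grace period, and
nothing ever has to sit in rcu_barrier() with the rtnl lock held.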

> ---
> This patch adds two new list_head fields to tcf_block. They could be
> replaced with a single pointer, open-coding singly linked list
> manipulation in cls_api.c, if a lower memory footprint is required.
> ---
>  include/net/pkt_cls.h     |  1 +
>  include/net/sch_generic.h |  5 +++
>  net/sched/cls_api.c       | 78 +++++++++++++++++++++++++++++++++++------------
>  net/sched/sch_api.c       |  1 +
>  net/sched/sch_generic.c   | 17 +++++++++++
>  net/sched/sch_ingress.c   |  1 +
>  6 files changed, 83 insertions(+), 20 deletions(-)
>
> diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
> index 0105445cab83..12777cfae77c 100644
> --- a/include/net/pkt_cls.h
> +++ b/include/net/pkt_cls.h
> @@ -45,6 +45,7 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q,
>  void tcf_block_put(struct tcf_block *block);
>  void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q,
>                        struct tcf_block_ext_info *ei);
> +void tcf_flush_blocks(void);
>
>  static inline struct Qdisc *tcf_block_q(struct tcf_block *block)
>  {
> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> index 65d0d25f2648..99846ee644a8 100644
> --- a/include/net/sch_generic.h
> +++ b/include/net/sch_generic.h
> @@ -71,6 +71,7 @@ struct Qdisc {
>                                       * qdisc_tree_decrease_qlen() should stop.
>                                       */
>  #define TCQ_F_INVISIBLE                0x80 /* invisible by default in dump */
> +#define TCQ_F_DELETING         0x100 /* destroy in progress */
>         u32                     limit;
>         const struct Qdisc_ops  *ops;
>         struct qdisc_size_table __rcu *stab;
> @@ -279,6 +280,10 @@ struct tcf_block {
>         struct Qdisc *q;
>         struct list_head cb_list;
>         struct work_struct work;
> +
> +       /* TODO: use a single list, to avoid wasting too much memory */
> +       struct list_head del_list;
> +       struct list_head del_head;
>  };
>

This is just adding yet another layer of deferred freeing. We already
have RCU; why don't we just use that?

>  static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz)
> diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
> index 7d97f612c9b9..446b16c1f532 100644
> --- a/net/sched/cls_api.c
> +++ b/net/sched/cls_api.c
> @@ -37,6 +37,61 @@ static LIST_HEAD(tcf_proto_base);
>  /* Protects list of registered TC modules. It is pure SMP lock. */
>  static DEFINE_RWLOCK(cls_mod_lock);
>
> +/* List of tcf_blocks queued for deletion. By bulk freeing them we avoid
> + * the rcu_barrier() storm at root_qdisc->destroy() time.
> + */
> +static LIST_HEAD(queued_blocks);
> +
> +static void queue_for_deletion(struct tcf_block *block)
> +{
> +       if (WARN_ON(!list_empty(&block->del_list)))
> +               return;
> +
> +       ASSERT_RTNL();
> +       list_add(&block->del_list, &queued_blocks);
> +}
> +
> +static void flush_blocks(struct work_struct *work)
> +{
> +       struct tcf_block *block, *tmp_block;
> +       struct tcf_chain *chain, *tmp;
> +       struct list_head *head;
> +
> +       head = &container_of(work, struct tcf_block, work)->del_head;
> +       rtnl_lock();
> +       list_for_each_entry(block, head, del_list)
> +               /* Only chain 0 should still be here. */
> +               list_for_each_entry_safe(chain, tmp, &block->chain_list, list)
> +                       tcf_chain_put(chain);
> +       rtnl_unlock();
> +
> +       list_for_each_entry_safe(block, tmp_block, head, del_list)
> +               kfree(block);
> +}
> +
> +void tcf_flush_blocks(void)
> +{
> +       struct tcf_block *head;
> +       LIST_HEAD(flush_burst);
> +
> +       ASSERT_RTNL();
> +       if (list_empty(&queued_blocks))
> +               return;
> +
> +       head = list_first_entry(&queued_blocks, struct tcf_block, del_list);
> +       INIT_LIST_HEAD(&head->del_head);
> +       list_splice_init(&queued_blocks, &head->del_head);
> +       INIT_WORK(&head->work, flush_blocks);
> +
> +       /* Wait for existing RCU callbacks to cool down and make sure their
> +        * work items have been queued before this. We cannot flush pending
> +        * work items here because we are holding the RTNL lock.
> +        */
> +       rcu_barrier();
> +       schedule_work(&head->work);
> +}
> +EXPORT_SYMBOL_GPL(tcf_flush_blocks);
> +
>  /* Find classifier type by string name */
>
>  static const struct tcf_proto_ops *tcf_proto_lookup_ops(const char *kind)
> @@ -288,6 +343,7 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q,
>                 return -ENOMEM;
>         INIT_LIST_HEAD(&block->chain_list);
>         INIT_LIST_HEAD(&block->cb_list);
> +       INIT_LIST_HEAD(&block->del_list);
>
>         /* Create chain 0 by default, it has to be always present. */
>         chain = tcf_chain_create(block, 0);
> @@ -330,19 +386,6 @@ int tcf_block_get(struct tcf_block **p_block,
>  }
>  EXPORT_SYMBOL(tcf_block_get);
>
> -static void tcf_block_put_final(struct work_struct *work)
> -{
> -       struct tcf_block *block = container_of(work, struct tcf_block, work);
> -       struct tcf_chain *chain, *tmp;
> -
> -       rtnl_lock();
> -       /* Only chain 0 should be still here. */
> -       list_for_each_entry_safe(chain, tmp, &block->chain_list, list)
> -               tcf_chain_put(chain);

So it seems like the heart of the problem is right here. The
tcf_chain_put() call updates the reference count, and I would assume
that is the only bit that really requires you to still be holding
the rtnl lock.

The question I would have is whether anything accesses the reference
count or manipulates the list itself without holding the rtnl lock. If
not, you could look at converting this whole thing from a plain list to
an rculist, which seems like it would save you a lot of trouble. As far
as I can tell, the only thing you would then really have to worry about
is the freeing of the chain itself, which would happen in an rcu
callback instead of with the rtnl lock held.
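
Something like this completely untested sketch, assuming a rcu_head is
added to struct tcf_chain and that anything walking block->chain_list
without the rtnl lock holds rcu_read_lock():

static void tcf_chain_free_rcu(struct rcu_head *head)
{
	/* The final free runs after a grace period, without the rtnl lock. */
	kfree(container_of(head, struct tcf_chain, rcu));
}

	/* Writers unlink under rtnl; lockless readers walk the list with
	 * list_for_each_entry_rcu().
	 */
	list_del_rcu(&chain->list);
	call_rcu(&chain->rcu, tcf_chain_free_rcu);

With that, tcf_block_put_ext() could drop its chain references directly,
and the rcu_barrier()/work item dance would go away entirely.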

> -       rtnl_unlock();
> -       kfree(block);
> -}
> -
>  /* XXX: Standalone actions are not allowed to jump to any chain, and bound
>   * actions should be all removed after flushing. However, filters are now
>   * destroyed in tc filter workqueue with RTNL lock, they can not race here.
> @@ -357,13 +400,7 @@ void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q,
>
>         tcf_block_offload_unbind(block, q, ei);
>
> -       INIT_WORK(&block->work, tcf_block_put_final);
> -       /* Wait for existing RCU callbacks to cool down, make sure their works
> -        * have been queued before this. We can not flush pending works here
> -        * because we are holding the RTNL lock.
> -        */
> -       rcu_barrier();
> -       tcf_queue_work(&block->work);
> +       queue_for_deletion(block);
>  }
>  EXPORT_SYMBOL(tcf_block_put_ext);
>
> @@ -920,6 +957,7 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
>                 if (tp_created)
>                         tcf_proto_destroy(tp);
>         }
> +       tcf_flush_blocks();
>
>  errout:
>         if (chain)
> diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
> index b6c4f536876b..5fe2dcb73bfd 100644
> --- a/net/sched/sch_api.c
> +++ b/net/sched/sch_api.c
> @@ -1645,6 +1645,7 @@ static int tclass_del_notify(struct net *net,
>                 kfree_skb(skb);
>                 return err;
>         }
> +       tcf_flush_blocks();
>
>         return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
>                               n->nlmsg_flags & NLM_F_ECHO);
> diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
> index 3839cbbdc32b..299c5d33916a 100644
> --- a/net/sched/sch_generic.c
> +++ b/net/sched/sch_generic.c
> @@ -28,6 +28,7 @@
>  #include <linux/if_vlan.h>
>  #include <net/sch_generic.h>
>  #include <net/pkt_sched.h>
> +#include <net/pkt_cls.h>
>  #include <net/dst.h>
>  #include <trace/events/qdisc.h>
>
> @@ -708,11 +709,25 @@ static void qdisc_free(struct Qdisc *qdisc)
>  void qdisc_destroy(struct Qdisc *qdisc)
>  {
>         const struct Qdisc_ops  *ops = qdisc->ops;
> +       struct Qdisc *p;
> +       bool flush;
>
>         if (qdisc->flags & TCQ_F_BUILTIN ||
>             !refcount_dec_and_test(&qdisc->refcnt))
>                 return;
>
> +       /* We can avoid flushing the pending blocks if this qdisc is a child
> +        * deleted by a recursive destroy() call and the parent qdisc is
> +        * already removed.
> +        */
> +       qdisc->flags |= TCQ_F_DELETING;
> +       if (qdisc->parent != TC_H_ROOT) {
> +               p = qdisc_lookup(qdisc_dev(qdisc), TC_H_MAJ(qdisc->parent));
> +               flush = p && !(p->flags & TCQ_F_DELETING);
> +       } else {
> +               flush = true;
> +       }
> +
>  #ifdef CONFIG_NET_SCHED
>         qdisc_hash_del(qdisc);
>
> @@ -723,6 +738,8 @@ void qdisc_destroy(struct Qdisc *qdisc)
>                 ops->reset(qdisc);
>         if (ops->destroy)
>                 ops->destroy(qdisc);
> +       if (flush)
> +               tcf_flush_blocks();
>
>         module_put(ops->owner);
>         dev_put(qdisc_dev(qdisc));
> diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
> index 5ecc38f35d47..7329b8f55160 100644
> --- a/net/sched/sch_ingress.c
> +++ b/net/sched/sch_ingress.c
> @@ -201,6 +201,7 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt)
>
>  err_egress_block_get:
>         tcf_block_put_ext(q->ingress_block, sch, &q->ingress_block_info);
> +       tcf_flush_blocks();
>         return err;
>  }
>
> --
> 2.13.6
>
