lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Date:   Wed, 29 Nov 2017 15:25:54 +0100
From:   Paolo Abeni <pabeni@...hat.com>
To:     netdev@...r.kernel.org
Cc:     Jamal Hadi Salim <jhs@...atatu.com>,
        Cong Wang <xiyou.wangcong@...il.com>,
        Jiri Pirko <jiri@...nulli.us>,
        "David S. Miller" <davem@...emloft.net>
Subject: [RFC PATCH] net_sched: bulk free tcf_block

Currently deleting qdisc with a large number of children and filters
can take a lot of time:

tc qdisc add dev lo root htb
for I in `seq 1 1000`; do
	tc class add dev lo parent 1: classid 1:$I htb rate 100kbit
	tc qdisc add dev lo parent 1:$I handle $((I + 1)): htb
	for J in `seq 1 10`; do
		tc filter add dev lo parent $((I + 1)): u32 match ip src 1.1.1.$J
	done
done
time tc qdisc del dev lo root

real	0m54.764s
user	0m0.023s
sys	0m0.000s

This is due to the multiple rcu_barrier() calls, one for each tcf_block
freed, invoked with the rtnl lock held. Most other network related
tasks will block in this timespan.

This change implements bulk free of tcf_block at destroy() time:
when deleting a qdisc hierarchy, the to-be-deleted blocks are queued
in an rtnl_lock-protected list, and a single rcu_barrier() is invoked
for each burst.

The burst is flushed after the deletion of the topmost qdisc of the
destroyed hierarchy and all the queued blocks are deleted with a
single delayed work call.

After this change, the previous example gives:

real	0m0.193s
user	0m0.000s
sys	0m0.016s

Signed-off-by: Paolo Abeni <pabeni@...hat.com>
---
This patch adds 2 new list_head fields to tcf_block; they could be
replaced with a single pointer, open-coding singly linked list
manipulation in cls_api.c, if a lower memory impact is required.
---
 include/net/pkt_cls.h     |  1 +
 include/net/sch_generic.h |  5 +++
 net/sched/cls_api.c       | 78 +++++++++++++++++++++++++++++++++++------------
 net/sched/sch_api.c       |  1 +
 net/sched/sch_generic.c   | 17 +++++++++++
 net/sched/sch_ingress.c   |  1 +
 6 files changed, 83 insertions(+), 20 deletions(-)

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 0105445cab83..12777cfae77c 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -45,6 +45,7 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q,
 void tcf_block_put(struct tcf_block *block);
 void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q,
 		       struct tcf_block_ext_info *ei);
+void tcf_flush_blocks(void);
 
 static inline struct Qdisc *tcf_block_q(struct tcf_block *block)
 {
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 65d0d25f2648..99846ee644a8 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -71,6 +71,7 @@ struct Qdisc {
 				      * qdisc_tree_decrease_qlen() should stop.
 				      */
 #define TCQ_F_INVISIBLE		0x80 /* invisible by default in dump */
+#define TCQ_F_DELETING		0x100
 	u32			limit;
 	const struct Qdisc_ops	*ops;
 	struct qdisc_size_table	__rcu *stab;
@@ -279,6 +280,10 @@ struct tcf_block {
 	struct Qdisc *q;
 	struct list_head cb_list;
 	struct work_struct work;
+
+	/* TODO: use a single list, to avoid wasting too much memory */
+	struct list_head del_list;
+	struct list_head del_head;
 };
 
 static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz)
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 7d97f612c9b9..446b16c1f532 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -37,6 +37,61 @@ static LIST_HEAD(tcf_proto_base);
 /* Protects list of registered TC modules. It is pure SMP lock. */
 static DEFINE_RWLOCK(cls_mod_lock);
 
+/* List of tcf_blocks queued for deletion. By bulk freeing them we avoid
+ * the rcu_barrier() storm at root_qdisc->destroy() time
+ */
+static LIST_HEAD(queued_blocks);
+
+static void queue_for_deletion(struct tcf_block *block)
+{
+	if (WARN_ON(!list_empty(&block->del_list)))
+		return;
+
+	ASSERT_RTNL();
+	list_add(&block->del_list, &queued_blocks);
+}
+
+static void flush_blocks(struct work_struct *work)
+{
+	struct tcf_block *block, *tmp_block;
+	struct tcf_chain *chain, *tmp;
+	struct list_head *head;
+
+	head = &container_of(work, struct tcf_block, work)->del_head;
+	rtnl_lock();
+	list_for_each_entry(block, head, del_list)
+		/* Only chain 0 should be still here. */
+		list_for_each_entry_safe(chain, tmp, &block->chain_list, list)
+			tcf_chain_put(chain);
+	rtnl_unlock();
+
+	list_for_each_entry_safe(block, tmp_block, head, del_list)
+		kfree(block);
+}
+
+void tcf_flush_blocks(void)
+{
+	struct tcf_block *head;
+	LIST_HEAD(flush_burst);
+
+	ASSERT_RTNL();
+	if (list_empty(&queued_blocks))
+		return;
+
+	head = list_first_entry(&queued_blocks, struct tcf_block, del_list);
+	INIT_LIST_HEAD(&head->del_head);
+	list_splice_init(&queued_blocks, &head->del_head);
+	INIT_WORK(&head->work, flush_blocks);
+
+	/* Wait for existing RCU callbacks to cool down, make sure their works
+	 * have been queued before this. We can not flush pending works here
+	 * because we are holding the RTNL lock.
+	 */
+	rcu_barrier();
+	schedule_work(&head->work);
+}
+EXPORT_SYMBOL_GPL(tcf_flush_blocks);
+
 /* Find classifier type by string name */
 
 static const struct tcf_proto_ops *tcf_proto_lookup_ops(const char *kind)
@@ -288,6 +343,7 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q,
 		return -ENOMEM;
 	INIT_LIST_HEAD(&block->chain_list);
 	INIT_LIST_HEAD(&block->cb_list);
+	INIT_LIST_HEAD(&block->del_list);
 
 	/* Create chain 0 by default, it has to be always present. */
 	chain = tcf_chain_create(block, 0);
@@ -330,19 +386,6 @@ int tcf_block_get(struct tcf_block **p_block,
 }
 EXPORT_SYMBOL(tcf_block_get);
 
-static void tcf_block_put_final(struct work_struct *work)
-{
-	struct tcf_block *block = container_of(work, struct tcf_block, work);
-	struct tcf_chain *chain, *tmp;
-
-	rtnl_lock();
-	/* Only chain 0 should be still here. */
-	list_for_each_entry_safe(chain, tmp, &block->chain_list, list)
-		tcf_chain_put(chain);
-	rtnl_unlock();
-	kfree(block);
-}
-
 /* XXX: Standalone actions are not allowed to jump to any chain, and bound
  * actions should be all removed after flushing. However, filters are now
  * destroyed in tc filter workqueue with RTNL lock, they can not race here.
@@ -357,13 +400,7 @@ void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q,
 
 	tcf_block_offload_unbind(block, q, ei);
 
-	INIT_WORK(&block->work, tcf_block_put_final);
-	/* Wait for existing RCU callbacks to cool down, make sure their works
-	 * have been queued before this. We can not flush pending works here
-	 * because we are holding the RTNL lock.
-	 */
-	rcu_barrier();
-	tcf_queue_work(&block->work);
+	queue_for_deletion(block);
 }
 EXPORT_SYMBOL(tcf_block_put_ext);
 
@@ -920,6 +957,7 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
 		if (tp_created)
 			tcf_proto_destroy(tp);
 	}
+	tcf_flush_blocks();
 
 errout:
 	if (chain)
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index b6c4f536876b..5fe2dcb73bfd 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1645,6 +1645,7 @@ static int tclass_del_notify(struct net *net,
 		kfree_skb(skb);
 		return err;
 	}
+	tcf_flush_blocks();
 
 	return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
 			      n->nlmsg_flags & NLM_F_ECHO);
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 3839cbbdc32b..299c5d33916a 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -28,6 +28,7 @@
 #include <linux/if_vlan.h>
 #include <net/sch_generic.h>
 #include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
 #include <net/dst.h>
 #include <trace/events/qdisc.h>
 
@@ -708,11 +709,25 @@ static void qdisc_free(struct Qdisc *qdisc)
 void qdisc_destroy(struct Qdisc *qdisc)
 {
 	const struct Qdisc_ops  *ops = qdisc->ops;
+	struct Qdisc *p;
+	bool flush;
 
 	if (qdisc->flags & TCQ_F_BUILTIN ||
 	    !refcount_dec_and_test(&qdisc->refcnt))
 		return;
 
+	/* we can avoid flushing the pending blocks if this qdisc is a child
+	 * deleted by a recursive destroy() call and the parent qdisc is
+	 * already removed.
+	 */
+	qdisc->flags |= TCQ_F_DELETING;
+	if (qdisc->parent != TC_H_ROOT) {
+		p = qdisc_lookup(qdisc_dev(qdisc), TC_H_MAJ(qdisc->parent));
+		flush = p && !(p->flags & TCQ_F_DELETING);
+	} else {
+		flush = true;
+	}
+
 #ifdef CONFIG_NET_SCHED
 	qdisc_hash_del(qdisc);
 
@@ -723,6 +738,8 @@ void qdisc_destroy(struct Qdisc *qdisc)
 		ops->reset(qdisc);
 	if (ops->destroy)
 		ops->destroy(qdisc);
+	if (flush)
+		tcf_flush_blocks();
 
 	module_put(ops->owner);
 	dev_put(qdisc_dev(qdisc));
diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
index 5ecc38f35d47..7329b8f55160 100644
--- a/net/sched/sch_ingress.c
+++ b/net/sched/sch_ingress.c
@@ -201,6 +201,7 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt)
 
 err_egress_block_get:
 	tcf_block_put_ext(q->ingress_block, sch, &q->ingress_block_info);
+	tcf_flush_blocks();
 	return err;
 }
 
-- 
2.13.6

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ