lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <1286035915.2582.2472.camel@edumazet-laptop>
Date:	Sat, 02 Oct 2010 18:11:55 +0200
From:	Eric Dumazet <eric.dumazet@...il.com>
To:	Jarek Poplawski <jarkao2@...il.com>
Cc:	hadi@...erus.ca, David Miller <davem@...emloft.net>,
	netdev <netdev@...r.kernel.org>
Subject: [PATCH net-next V3] net: dynamic ingress_queue allocation

Le samedi 02 octobre 2010 à 11:32 +0200, Jarek Poplawski a écrit :
> On Fri, Oct 01, 2010 at 03:56:28PM +0200, Eric Dumazet wrote:
>  
> >  static void netdev_init_queue_locks(struct net_device *dev)
> >  {
> >  	netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
> > -	__netdev_init_queue_locks_one(dev, &dev->ingress_queue, NULL);
> > +	__netdev_init_queue_locks_one(dev, dev_ingress_queue(dev), NULL);
> 
> Is dev_ingress_queue(dev) not NULL anytime here?

Yes, but I felt this could be removed later. If you feel it's OK to remove
it right now, I am OK too ;)

> 
> >  }
> >  
> >  unsigned long netdev_fix_features(unsigned long features, const char *name)
> > @@ -5447,16 +5448,37 @@ static void netdev_init_one_queue(struct net_device *dev,
> >  				  struct netdev_queue *queue,
> >  				  void *_unused)
> >  {
> > -	queue->dev = dev;
> > +	if (queue)
> > +		queue->dev = dev;
> >  }
> >  
> >  static void netdev_init_queues(struct net_device *dev)
> >  {
> > -	netdev_init_one_queue(dev, &dev->ingress_queue, NULL);
> > +	netdev_init_one_queue(dev, dev_ingress_queue(dev), NULL);
> 
> Is dev_ingress_queue(dev) not NULL anytime here?
> 

Yes

> >  	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
> >  	spin_lock_init(&dev->tx_global_lock);
> >  }
> >  
> > +struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
> > +{
> > +	struct netdev_queue *queue = dev_ingress_queue(dev);
> > +
> > +#ifdef CONFIG_NET_CLS_ACT
> > +	if (queue)
> > +		return queue;
> > +	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
> > +	if (!queue)
> > +		return NULL;
> > +	netdev_init_one_queue(dev, queue, NULL);
> > +	__netdev_init_queue_locks_one(dev, queue, NULL);
> > +	queue->qdisc = &noop_qdisc;
> > +	queue->qdisc_sleeping = &noop_qdisc;
> > +	smp_wmb();
> 
> Why don't we need smp_rmb() in handle_ing()?
> 

I only wanted to see if Al Viro was using ingress on his Alpha
machine ;)

I am going to use the regular RCU API to make the code easier to understand :)


> > +	dev->ingress_queue = queue;
> > +#endif
> > +	return queue;
> > +}
> > +
> >  /**
> >   *	alloc_netdev_mq - allocate network device
> >   *	@sizeof_priv:	size of private data to allocate space for
> > @@ -5559,6 +5581,8 @@ void free_netdev(struct net_device *dev)
> >  
> >  	kfree(dev->_tx);
> >  
> > +	kfree(dev_ingress_queue(dev));
> > +
> >  	/* Flush device addresses */
> >  	dev_addr_flush(dev);
> >  
> > diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
> > index b802078..8635110 100644
> > --- a/net/sched/sch_api.c
> > +++ b/net/sched/sch_api.c
> > @@ -240,7 +240,10 @@ struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
> >  	if (q)
> >  		goto out;
> >  
> > -	q = qdisc_match_from_root(dev->ingress_queue.qdisc_sleeping, handle);
> > +	if (!dev_ingress_queue(dev))
> > +		goto out;
> > +	q = qdisc_match_from_root(dev_ingress_queue(dev)->qdisc_sleeping,
> > +				  handle);
> 


> I'd prefer:
>  +	if (dev_ingress_queue(dev))
>  +		q = qdisc_match_from_root(dev_ingress_queue(dev)->qdisc_sleeping,
> 

Yes

> >  out:
> >  	return q;
> >  }
> > @@ -690,6 +693,8 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
> >  		    (new && new->flags & TCQ_F_INGRESS)) {
> >  			num_q = 1;
> >  			ingress = 1;
> > +			if (!dev_ingress_queue(dev))
> > +				return -ENOENT;
> 
> Is this test really needed here?

To avoid a NULL dereference some lines later.
Do I have a guarantee it's not NULL here?

> 
> >  		}
> >  
> >  		if (dev->flags & IFF_UP)
> > @@ -701,7 +706,7 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
> >  		}
> >  
> >  		for (i = 0; i < num_q; i++) {
> > -			struct netdev_queue *dev_queue = &dev->ingress_queue;
> > +			struct netdev_queue *dev_queue = dev_ingress_queue(dev);
> >  
> >  			if (!ingress)
> >  				dev_queue = netdev_get_tx_queue(dev, i);
> > @@ -979,7 +984,8 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
> >  					return -ENOENT;
> >  				q = qdisc_leaf(p, clid);
> >  			} else { /* ingress */
> > -				q = dev->ingress_queue.qdisc_sleeping;
> > +				if (dev_ingress_queue(dev))
> > +					q = dev_ingress_queue(dev)->qdisc_sleeping;
> >  			}
> >  		} else {
> >  			q = dev->qdisc;
> > @@ -1044,7 +1050,8 @@ replay:
> >  					return -ENOENT;
> >  				q = qdisc_leaf(p, clid);
> >  			} else { /*ingress */
> > -				q = dev->ingress_queue.qdisc_sleeping;
> > +				if (dev_ingress_queue_create(dev))
> > +					q = dev_ingress_queue(dev)->qdisc_sleeping;
> 
> I wonder if doing dev_ingress_queue_create() just before qdisc_create()
> (and the test here) isn't more readable.

Sorry, I don't understand. I want to create ingress_queue only if the user
wants it. If we set up (egress) traffic shaping, there is no need to set up
ingress_queue.

> >  
> > @@ -753,7 +755,7 @@ void dev_activate(struct net_device *dev)
> >  
> >  	need_watchdog = 0;
> >  	netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog);
> > -	transition_one_qdisc(dev, &dev->ingress_queue, NULL);
> > +	transition_one_qdisc(dev, dev_ingress_queue(dev), NULL);
> 
> I'd prefer here and similarly later:
> 
>  +	if (dev_ingress_queue(dev))
>  +		transition_one_qdisc(dev, dev_ingress_queue(dev), NULL);
> 
> to show NULL dev_queue is only legal in this one case.

OK, thanks a lot for the extended review Jarek (and Jamal of course)

Here is the V3 then.

[PATCH net-next V3] net: dynamic ingress_queue allocation

Since ingress is not used very much, and net_device->ingress_queue is
quite a big object (128 or 256 bytes), allocate it dynamically, only when
needed (tc qdisc add dev eth0 ingress ...)

The dev_ingress_queue(dev) helper should be used only with RTNL held.

Signed-off-by: Eric Dumazet <eric.dumazet@...il.com>
---
V3: add RCU annotations & address Jarek's comments
 include/linux/netdevice.h |    2 -
 include/linux/rtnetlink.h |    8 ++++++
 net/core/dev.c            |   34 ++++++++++++++++++++++-------
 net/sched/sch_api.c       |   42 ++++++++++++++++++++++++------------
 net/sched/sch_generic.c   |   12 ++++++----
 5 files changed, 71 insertions(+), 27 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ceed347..92d81ed 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -986,7 +986,7 @@ struct net_device {
 	rx_handler_func_t	*rx_handler;
 	void			*rx_handler_data;
 
-	struct netdev_queue	ingress_queue; /* use two cache lines */
+	struct netdev_queue __rcu *ingress_queue;
 
 /*
  * Cache lines mostly used on transmit path
diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 68c436b..0bb7b48 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -6,6 +6,7 @@
 #include <linux/if_link.h>
 #include <linux/if_addr.h>
 #include <linux/neighbour.h>
+#include <linux/netdevice.h>
 
 /* rtnetlink families. Values up to 127 are reserved for real address
  * families, values above 128 may be used arbitrarily.
@@ -769,6 +770,13 @@ extern int lockdep_rtnl_is_held(void);
 #define rtnl_dereference(p)					\
 	rcu_dereference_check(p, lockdep_rtnl_is_held())
 
+static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev)
+{
+	return rtnl_dereference(dev->ingress_queue);
+}
+
+extern struct netdev_queue *dev_ingress_queue_create(struct net_device *dev);
+
 extern void rtnetlink_init(void);
 extern void __rtnl_unlock(void);
 
diff --git a/net/core/dev.c b/net/core/dev.c
index a313bab..b078ec8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2702,11 +2702,10 @@ EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
  * the ingress scheduler, you just cant add policies on ingress.
  *
  */
-static int ing_filter(struct sk_buff *skb)
+static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
 {
 	struct net_device *dev = skb->dev;
 	u32 ttl = G_TC_RTTL(skb->tc_verd);
-	struct netdev_queue *rxq;
 	int result = TC_ACT_OK;
 	struct Qdisc *q;
 
@@ -2720,8 +2719,6 @@ static int ing_filter(struct sk_buff *skb)
 	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
 
-	rxq = &dev->ingress_queue;
-
 	q = rxq->qdisc;
 	if (q != &noop_qdisc) {
 		spin_lock(qdisc_lock(q));
@@ -2737,7 +2734,9 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb,
 					 struct packet_type **pt_prev,
 					 int *ret, struct net_device *orig_dev)
 {
-	if (skb->dev->ingress_queue.qdisc == &noop_qdisc)
+	struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
+
+	if (!rxq || rxq->qdisc == &noop_qdisc)
 		goto out;
 
 	if (*pt_prev) {
@@ -2745,7 +2744,7 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb,
 		*pt_prev = NULL;
 	}
 
-	switch (ing_filter(skb)) {
+	switch (ing_filter(skb, rxq)) {
 	case TC_ACT_SHOT:
 	case TC_ACT_STOLEN:
 		kfree_skb(skb);
@@ -4940,7 +4939,6 @@ static void __netdev_init_queue_locks_one(struct net_device *dev,
 static void netdev_init_queue_locks(struct net_device *dev)
 {
 	netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
-	__netdev_init_queue_locks_one(dev, &dev->ingress_queue, NULL);
 }
 
 unsigned long netdev_fix_features(unsigned long features, const char *name)
@@ -5452,11 +5450,29 @@ static void netdev_init_one_queue(struct net_device *dev,
 
 static void netdev_init_queues(struct net_device *dev)
 {
-	netdev_init_one_queue(dev, &dev->ingress_queue, NULL);
 	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
 	spin_lock_init(&dev->tx_global_lock);
 }
 
+struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
+{
+	struct netdev_queue *queue = dev_ingress_queue(dev);
+
+#ifdef CONFIG_NET_CLS_ACT
+	if (queue)
+		return queue;
+	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
+	if (!queue)
+		return NULL;
+	netdev_init_one_queue(dev, queue, NULL);
+	__netdev_init_queue_locks_one(dev, queue, NULL);
+	queue->qdisc = &noop_qdisc;
+	queue->qdisc_sleeping = &noop_qdisc;
+	rcu_assign_pointer(dev->ingress_queue, queue);
+#endif
+	return queue;
+}
+
 /**
  *	alloc_netdev_mq - allocate network device
  *	@sizeof_priv:	size of private data to allocate space for
@@ -5559,6 +5575,8 @@ void free_netdev(struct net_device *dev)
 
 	kfree(dev->_tx);
 
+	kfree(rcu_dereference_raw(dev->ingress_queue));
+
 	/* Flush device addresses */
 	dev_addr_flush(dev);
 
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index b802078..b22ca2d 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -240,7 +240,10 @@ struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
 	if (q)
 		goto out;
 
-	q = qdisc_match_from_root(dev->ingress_queue.qdisc_sleeping, handle);
+	if (dev_ingress_queue(dev))
+		q = qdisc_match_from_root(
+			dev_ingress_queue(dev)->qdisc_sleeping,
+			handle);
 out:
 	return q;
 }
@@ -690,6 +693,8 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
 		    (new && new->flags & TCQ_F_INGRESS)) {
 			num_q = 1;
 			ingress = 1;
+			if (!dev_ingress_queue(dev))
+				return -ENOENT;
 		}
 
 		if (dev->flags & IFF_UP)
@@ -701,7 +706,7 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
 		}
 
 		for (i = 0; i < num_q; i++) {
-			struct netdev_queue *dev_queue = &dev->ingress_queue;
+			struct netdev_queue *dev_queue = dev_ingress_queue(dev);
 
 			if (!ingress)
 				dev_queue = netdev_get_tx_queue(dev, i);
@@ -979,7 +984,8 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 					return -ENOENT;
 				q = qdisc_leaf(p, clid);
 			} else { /* ingress */
-				q = dev->ingress_queue.qdisc_sleeping;
+				if (dev_ingress_queue(dev))
+					q = dev_ingress_queue(dev)->qdisc_sleeping;
 			}
 		} else {
 			q = dev->qdisc;
@@ -1043,8 +1049,9 @@ replay:
 				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
 					return -ENOENT;
 				q = qdisc_leaf(p, clid);
-			} else { /*ingress */
-				q = dev->ingress_queue.qdisc_sleeping;
+			} else { /* ingress */
+				if (dev_ingress_queue_create(dev))
+					q = dev_ingress_queue(dev)->qdisc_sleeping;
 			}
 		} else {
 			q = dev->qdisc;
@@ -1123,11 +1130,14 @@ replay:
 create_n_graft:
 	if (!(n->nlmsg_flags&NLM_F_CREATE))
 		return -ENOENT;
-	if (clid == TC_H_INGRESS)
-		q = qdisc_create(dev, &dev->ingress_queue, p,
-				 tcm->tcm_parent, tcm->tcm_parent,
-				 tca, &err);
-	else {
+	if (clid == TC_H_INGRESS) {
+		if (dev_ingress_queue(dev))
+			q = qdisc_create(dev, dev_ingress_queue(dev), p,
+					 tcm->tcm_parent, tcm->tcm_parent,
+					 tca, &err);
+		else
+			err = -ENOENT;
+	} else {
 		struct netdev_queue *dev_queue;
 
 		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
@@ -1304,8 +1314,10 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
 		if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
 			goto done;
 
-		dev_queue = &dev->ingress_queue;
-		if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0)
+		dev_queue = dev_ingress_queue(dev);
+		if (dev_queue &&
+		    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
+				       &q_idx, s_q_idx) < 0)
 			goto done;
 
 cont:
@@ -1595,8 +1607,10 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
 	if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
 		goto done;
 
-	dev_queue = &dev->ingress_queue;
-	if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0)
+	dev_queue = dev_ingress_queue(dev);
+	if (dev_queue &&
+	    tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
+				&t, s_t) < 0)
 		goto done;
 
 done:
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 545278a..3d57681 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -753,7 +753,8 @@ void dev_activate(struct net_device *dev)
 
 	need_watchdog = 0;
 	netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog);
-	transition_one_qdisc(dev, &dev->ingress_queue, NULL);
+	if (dev_ingress_queue(dev))
+		transition_one_qdisc(dev, dev_ingress_queue(dev), NULL);
 
 	if (need_watchdog) {
 		dev->trans_start = jiffies;
@@ -812,7 +813,8 @@ static bool some_qdisc_is_busy(struct net_device *dev)
 void dev_deactivate(struct net_device *dev)
 {
 	netdev_for_each_tx_queue(dev, dev_deactivate_queue, &noop_qdisc);
-	dev_deactivate_queue(dev, &dev->ingress_queue, &noop_qdisc);
+	if (dev_ingress_queue(dev))
+		dev_deactivate_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
 
 	dev_watchdog_down(dev);
 
@@ -838,7 +840,8 @@ void dev_init_scheduler(struct net_device *dev)
 {
 	dev->qdisc = &noop_qdisc;
 	netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc);
-	dev_init_scheduler_queue(dev, &dev->ingress_queue, &noop_qdisc);
+	if (dev_ingress_queue(dev))
+		dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
 
 	setup_timer(&dev->watchdog_timer, dev_watchdog, (unsigned long)dev);
 }
@@ -861,7 +864,8 @@ static void shutdown_scheduler_queue(struct net_device *dev,
 void dev_shutdown(struct net_device *dev)
 {
 	netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
-	shutdown_scheduler_queue(dev, &dev->ingress_queue, &noop_qdisc);
+	if (dev_ingress_queue(dev))
+		shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
 	qdisc_destroy(dev->qdisc);
 	dev->qdisc = &noop_qdisc;
 


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ