Date:	Tue, 13 Apr 2010 10:08:32 +0200
From:	Eric Dumazet <eric.dumazet@...il.com>
To:	Changli Gao <xiaosuo@...il.com>
Cc:	"David S. Miller" <davem@...emloft.net>, netdev@...r.kernel.org
Subject: Re: [PATCH v2] net: batch skb dequeueing from softnet
 input_pkt_queue

On Tuesday, 13 April 2010 at 23:38 +0800, Changli Gao wrote:
> batch skb dequeueing from softnet input_pkt_queue
> 
> batch skb dequeueing from softnet input_pkt_queue to reduce potential lock
> contention and irq disabling/enabling.
> 

Very interesting idea, but the implementation is too complex, and probably
buggy, in an area that too few people understand today.

Could you keep it as simple as possible?

> Signed-off-by: Changli Gao <xiaosuo@...il.com>
> ----
>  include/linux/netdevice.h |    2 ++
>  net/core/dev.c            |   45 +++++++++++++++++++++++++++++++++++++++------
>  2 files changed, 41 insertions(+), 6 deletions(-)
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index d1a21b5..bc7a0d7 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -1335,6 +1335,8 @@ struct softnet_data {
>  	struct call_single_data	csd ____cacheline_aligned_in_smp;
>  #endif
>  	struct sk_buff_head	input_pkt_queue;
> +	struct sk_buff_head	processing_queue;

	Probably not necessary.

> +	volatile bool		flush_processing_queue;

Use of 'volatile' is strongly discouraged, I would say forbidden.

It's usually a sign of 'I don't know exactly what memory ordering I need,
so I throw in a volatile just in case'. We live in a world full of RCU and
read, write, and full barriers, and these APIs are well documented.
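
If a flag really is needed here, the documented pattern is a plain bool
plus an explicit barrier pair (or ACCESS_ONCE()), so the required ordering
is spelled out at every access site. A sketch only, assuming the flag
stays at all (which I doubt it should):

	/* writer side, e.g. flush_backlog(), after the list updates */
	queue->flush_processing_queue = true;
	smp_wmb();		/* publish the flag after the queue changes */

	/* reader side, e.g. process_backlog() */
	if (ACCESS_ONCE(queue->flush_processing_queue)) {
		smp_rmb();	/* pairs with the writer's smp_wmb() */
		queue->flush_processing_queue = false;
		/* ... free the stale skbs ... */
	}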

>  	struct napi_struct	backlog;
>  };
>  
> diff --git a/net/core/dev.c b/net/core/dev.c
> index a10a216..ac24293 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -2324,6 +2324,11 @@ static void trigger_softirq(void *data)
>  }
>  #endif /* CONFIG_SMP */
>  
> +static inline u32 softnet_input_qlen(struct softnet_data *queue)
> +{
> +	return queue->input_pkt_queue.qlen + queue->processing_queue.qlen;
> +}
> +
>  /*
>   * enqueue_to_backlog is called to queue an skb to a per CPU backlog
>   * queue (may be a remote CPU queue).
> @@ -2339,8 +2344,8 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu)
>  	__get_cpu_var(netdev_rx_stat).total++;
>  
>  	rps_lock(queue);
> -	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
> -		if (queue->input_pkt_queue.qlen) {
> +	if (softnet_input_qlen(queue) <= netdev_max_backlog) {
> +		if (softnet_input_qlen(queue)) {
>  enqueue:
>  			__skb_queue_tail(&queue->input_pkt_queue, skb);
>  			rps_unlock(queue);
> @@ -2803,6 +2808,7 @@ static void flush_backlog(void *arg)
>  			__skb_unlink(skb, &queue->input_pkt_queue);
>  			kfree_skb(skb);
>  		}
> +	queue->flush_processing_queue = true;

	Probably not necessary.

>  	rps_unlock(queue);
>  }
>  
> @@ -3112,14 +3118,23 @@ static int process_backlog(struct napi_struct *napi, int quota)
>  	struct softnet_data *queue = &__get_cpu_var(softnet_data);
>  	unsigned long start_time = jiffies;
>  
> +	if (queue->flush_processing_queue) {

Really... this is bloat IMHO

> +		struct sk_buff *skb;
> +
> +		queue->flush_processing_queue = false;
> +		while ((skb = __skb_dequeue(&queue->processing_queue)))
> +			kfree_skb(skb);
> +	}
> +
>  	napi->weight = weight_p;
>  	do {
>  		struct sk_buff *skb;
>  
>  		local_irq_disable();
>  		rps_lock(queue);
> -		skb = __skb_dequeue(&queue->input_pkt_queue);
> -		if (!skb) {
> +		skb_queue_splice_tail_init(&queue->input_pkt_queue,
> +					   &queue->processing_queue);
> +		if (skb_queue_empty(&queue->processing_queue)) {
>  			__napi_complete(napi);
>  			rps_unlock(queue);
>  			local_irq_enable();
> @@ -3128,9 +3143,22 @@ static int process_backlog(struct napi_struct *napi, int quota)
>  		rps_unlock(queue);
>  		local_irq_enable();
>  
> -		__netif_receive_skb(skb);
> -	} while (++work < quota && jiffies == start_time);
> +		while ((skb = __skb_dequeue(&queue->processing_queue))) {
> +			__netif_receive_skb(skb);
> +			if (++work < quota && jiffies == start_time)
> +				continue;
> +			if (!queue->flush_processing_queue)
> +				goto out;
> +			queue->flush_processing_queue = false;

Once again... so much code for an unlikely event...

> +			while ((skb = __skb_dequeue(&queue->processing_queue))) {
> +				__netif_receive_skb(skb);
> +				++work;
> +			}
> +			goto out;
> +		}
> +	} while (1);
>  
> +out:
>  	return work;
>  }
>  
> @@ -5487,6 +5515,9 @@ static int dev_cpu_callback(struct notifier_block *nfb,
>  	raise_softirq_irqoff(NET_TX_SOFTIRQ);
>  	local_irq_enable();
>  
> +	while ((skb = __skb_dequeue(&oldsd->processing_queue)))
> +		netif_rx(skb);
> +
>  	/* Process offline CPU's input_pkt_queue */
>  	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
>  		netif_rx(skb);
> @@ -5709,6 +5740,8 @@ static int __init net_dev_init(void)
>  
>  		queue = &per_cpu(softnet_data, i);
>  		skb_queue_head_init(&queue->input_pkt_queue);
> +		skb_queue_head_init(&queue->processing_queue);
> +		queue->flush_processing_queue = false;
>  		queue->completion_queue = NULL;
>  		INIT_LIST_HEAD(&queue->poll_list);
>  

I advise keeping it simple.

My suggestion would be to limit this patch to process_backlog() only.

Really, if you touch other areas, there is too much risk.

Perform a sort of skb_queue_splice_tail_init() into a local (stack) queue,
but the trick is to not touch input_pkt_queue.qlen, so that we don't slow
down enqueue_to_backlog().

Process at most 'quota' skbs (or stop once the jiffies limit is hit).

Relock the queue, then:
input_pkt_queue.qlen -= number_of_handled_skbs;

In the unlikely event we have unprocessed skbs left in the local queue,
re-insert the remaining skbs at the head of input_pkt_queue.

Check whether input_pkt_queue.qlen is 0 or not to decide whether to call
__napi_complete(napi) or not :)
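
Something along these lines, as a rough and untested sketch of the above
(the qlen save/restore is just one way of expressing 'don't touch qlen';
adjust to taste):

	static int process_backlog(struct napi_struct *napi, int quota)
	{
		struct softnet_data *queue = &__get_cpu_var(softnet_data);
		unsigned long start_time = jiffies;
		struct sk_buff_head local_q;
		struct sk_buff *skb;
		unsigned int qlen;
		int work = 0;

		napi->weight = weight_p;
		__skb_queue_head_init(&local_q);

		local_irq_disable();
		rps_lock(queue);
		qlen = queue->input_pkt_queue.qlen;
		if (!qlen) {
			__napi_complete(napi);
			rps_unlock(queue);
			local_irq_enable();
			return 0;
		}
		skb_queue_splice_tail_init(&queue->input_pkt_queue, &local_q);
		/* restore qlen so that enqueue_to_backlog() on other CPUs
		 * still sees the true backlog length */
		queue->input_pkt_queue.qlen = qlen;
		rps_unlock(queue);
		local_irq_enable();

		/* process the detached skbs with irqs on and no lock held */
		while ((skb = __skb_dequeue(&local_q))) {
			__netif_receive_skb(skb);
			if (++work >= quota || jiffies != start_time)
				break;
		}

		local_irq_disable();
		rps_lock(queue);
		queue->input_pkt_queue.qlen -= work;
		if (!skb_queue_empty(&local_q)) {
			/* unlikely: quota or jiffies limit hit, so re-insert
			 * the leftovers at the head to preserve ordering.
			 * Their count is still included in qlen, so splice
			 * the list pointers only. */
			local_q.qlen = 0;
			skb_queue_splice(&local_q, &queue->input_pkt_queue);
		} else if (skb_queue_empty(&queue->input_pkt_queue)) {
			__napi_complete(napi);
		}
		rps_unlock(queue);
		local_irq_enable();

		return work;
	}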


