Date:	Sat, 21 Nov 2009 01:03:22 -0800
From:	Tom Herbert <therbert@...gle.com>
To:	Eric Dumazet <eric.dumazet@...il.com>
Cc:	David Miller <davem@...emloft.net>,
	Linux Netdev List <netdev@...r.kernel.org>,
	Andi Kleen <andi@...stfloor.org>
Subject: Re: [PATCH v4 1/1] rps: core implementation

>>   *   The DEVICE structure.
>>   *   Actually, this whole structure is a big mistake.  It mixes I/O
>>   *   data with strictly "high-level" data, and it has to know about
>> @@ -861,6 +884,9 @@ struct net_device {
>>
>>       struct netdev_queue     rx_queue;
>>
>> +     struct dev_rps_maps     *dev_rps_maps;  /* Per-NAPI maps for
>> +                                                receive packet steering */
>> +
>>       struct netdev_queue     *_tx ____cacheline_aligned_in_smp;
>>
>>       /* Number of TX queues allocated at alloc_netdev_mq() time  */
>> @@ -1276,6 +1302,9 @@ struct softnet_data {
>>       struct Qdisc            *output_queue;
>>       struct sk_buff_head     input_pkt_queue;
>>       struct list_head        poll_list;
>> +
>> +     struct call_single_data csd;
>
> This is problematic. softnet_data used to be used only by the local cpu.
> With RPS, other cpus are going to access csd, input_pkt_queue and backlog,
> and dirty its cache lines.
>
> Maybe we should split softnet_data in two cache lines, one private,
> one shared, and
> DEFINE_PER_CPU(struct softnet_data, softnet_data);
> ->
> DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
>

That would make sense.
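
Something along these lines, I take it?  (A rough sketch; which fields end up
in which half is just a guess, the point is the private/shared split and the
per-cpu alignment.)

        struct softnet_data {
                /* fields touched only by the owning cpu */
                struct Qdisc            *output_queue;
                struct list_head        poll_list;
                struct sk_buff          *completion_queue;

                /* fields remote cpus write under RPS, kept on their own
                 * cache line so they don't dirty the private part */
                struct call_single_data csd ____cacheline_aligned_in_smp;
                struct sk_buff_head     input_pkt_queue;
                struct napi_struct      backlog;
        };

        DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);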

> Also you need to change comment at start of struct softnet_data,
> since it is wrong after RPS.
>
> /*
>  * Incoming packets are placed on per-cpu queues so that
>  * no locking is needed.
>  */
>
Okay.
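
Maybe something along these lines (wording is just a first stab):

        /*
         * Incoming packets are placed on per-cpu queues.  With RPS a
         * remote cpu may also enqueue onto this queue, so the queue is
         * protected by input_pkt_queue.lock rather than being strictly
         * cpu-local.
         */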


>> -static u32 skb_tx_hashrnd;
>> +static u32 hashrnd;
> adding a __read_mostly here could help :)
>

Yep.
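
Will do, i.e.:

        static u32 hashrnd __read_mostly;  /* written once at init, read per packet */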

>>
>>  u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
>>  {
>> @@ -1852,7 +1852,7 @@ u16 skb_tx_hash(const struct net_device *dev,
>> const struct sk_buff *skb)
>>       else
>>               hash = skb->protocol;
>>
>> -     hash = jhash_1word(hash, skb_tx_hashrnd);
>> +     hash = jhash_1word(hash, hashrnd);
>>
>>       return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
>>  }
>> @@ -2073,6 +2073,146 @@ int weight_p __read_mostly = 64;            /*
>> old backlog weight */
>>
>>  DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
>>
>> +/*
>> + * get_rps_cpu is called from netif_receive_skb and returns the target
>> + * CPU from the RPS map of the receiving NAPI instance for a given skb.
>> + */
>> +static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb)
>> +{
>> +     u32 addr1, addr2, ports;
>> +     struct ipv6hdr *ip6;
>> +     struct iphdr *ip;
>> +     u32 hash, ihl;
>> +     u8 ip_proto;
>> +     int cpu = -1;
>> +     struct dev_rps_maps *drmap;
>> +     struct rps_map *map = NULL;
>> +     u16 index;
>> +
>
>> +     rcu_read_lock();
> If called from netif_receive_skb(), we are already in an rcu protected section,
> but this could be a cleanup patch, because many other parts of the stack could
> be changed as well.
>
So the convention would be to not call these here, then?

>
>> +/*
>> + * enqueue_to_backlog is called to queue an skb to a per CPU backlog
>> + * queue (may be a remote CPU queue).
>> + */
>> +static int enqueue_to_backlog(struct sk_buff *skb, int cpu)
>> +{
>> +     struct softnet_data *queue;
>> +     unsigned long flags;
>> +
>> +     queue = &per_cpu(softnet_data, cpu);
>> +
>
>
>> +     local_irq_save(flags);
>> +     __get_cpu_var(netdev_rx_stat).total++;
>> +
>> +     spin_lock(&queue->input_pkt_queue.lock);
>
> could reduce to :
>        percpu_add(netdev_rx_stat.total, 1);
>        spin_lock_irqsave(&queue->input_pkt_queue.lock, flags);
>
Would it make sense to use percpu_add() in dev.c just for this, when other
parts of dev.c would still use __get_cpu_var(stat)++?  Also, I think
this results in more instructions...

>> +     if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
>> +             if (queue->input_pkt_queue.qlen) {
>> +enqueue:
>> +                     __skb_queue_tail(&queue->input_pkt_queue, skb);
>> +                     spin_unlock_irqrestore(&queue->input_pkt_queue.lock,
>> +                         flags);
>> +                     return NET_RX_SUCCESS;
>> +             }
>> +
>> +             /* Schedule NAPI for backlog device */
>
>> +             if (napi_schedule_prep(&queue->backlog)) {
>
>  Is it legal (or wanted at all) to call napi_schedule_prep() on a remote cpu's backlog ?
>

I think it is useful.  If the remote backlog is already scheduled then there
is no need to send an IPI, so this means there would only ever be one IPI
pending for a backlog queue, and hence we can get away with only having
one call_single_data per backlog.  Without the napi schedule, we
would probably need a call structure for each backlog per CPU -- and
we'd still need some shared bits between the source and target CPUs
anyway to tell when the IPI has completed (we want to avoid blocking on
the call_single_data).
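
Roughly, the guard works like this (same code as in the patch, just with the
reasoning spelled out in comments):

        if (napi_schedule_prep(&queue->backlog)) {
                /* We won the test-and-set on NAPI_STATE_SCHED, so the
                 * backlog was idle: exactly one wakeup is now owed,
                 * either a local __napi_schedule() or a single IPI via
                 * queue->csd sent later from net_rps_action().
                 */
                if (cpu != smp_processor_id()) {
                        cpu_set(cpu, get_cpu_var(rps_remote_softirq_cpus));
                        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
                } else
                        __napi_schedule(&queue->backlog);
        }
        /* else: the backlog is already scheduled, or an IPI for it is
         * already pending, so queueing the skb is all that is needed. */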

>> +                     if (cpu != smp_processor_id()) {
>> +                             cpu_set(cpu,
>> +                                 get_cpu_var(rps_remote_softirq_cpus));
>> +                             __raise_softirq_irqoff(NET_RX_SOFTIRQ);
>> +                     } else
>> +                             __napi_schedule(&queue->backlog);
>
> Why not the simpler :
>        if (cpu != smp_processor_id())
>                cpu_set(cpu, get_cpu_var(rps_remote_softirq_cpus));
>        else
>                napi_schedule(&queue->backlog);
>
> This way we don't touch the remote backlog (and this backlog can stay in the private
> cache line of the remote cpu)
>
>> +             }
>> +             goto enqueue;
>> +     }
>> +
>
>> +     spin_unlock(&queue->input_pkt_queue.lock);
>> +
>> +     __get_cpu_var(netdev_rx_stat).dropped++;
>> +     local_irq_restore(flags);
>
> ->
>        spin_unlock_irqrestore(&queue->input_pkt_queue.lock, flags);
>        percpu_add(netdev_rx_stat.dropped, 1);
>
>
> +/*
> + * net_rps_action sends any pending IPI's for rps.  This is only called from
> + * softirq and interrupts must be enabled.
> + */
> +static void net_rps_action(void)
> +{
> +       int cpu;
> +
> +       /* Send pending IPI's to kick RPS processing on remote cpus. */
> +       for_each_cpu_mask_nr(cpu, __get_cpu_var(rps_remote_softirq_cpus)) {
> +               struct softnet_data *queue = &per_cpu(softnet_data, cpu);
> +               __smp_call_function_single(cpu, &queue->csd, 0);
> +       }
> +       cpus_clear(__get_cpu_var(rps_remote_softirq_cpus));
> +}
>
>
> net_rps_action() might not be a very descriptive name, and it is a bit expensive...
>
> Did you try smp_call_function_many() ?
> (I suspect you did, but found it was not that well optimized ?)
>
> CC Andi to get feedback from him :)
>
> static void net_rps_remcpus_fire(void)
> {
>        smp_call_function_many(__get_cpu_var(rps_remote_softirq_cpus),
>                               trigger_softirq, NULL, 0);
> }
>

I'm thinking it is better to avoid possibly blocking on a locked cfd_data.
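
For context, each backlog only needs its csd wired up once; a rough sketch of
what that wiring might look like (where the init lives and the handler body
are my assumptions, trigger_softirq is the name from your snippet above):

        static void trigger_softirq(void *data)
        {
                struct softnet_data *queue = data;

                /* runs in the IPI hardirq on the target cpu, so irqs are
                 * already off and __napi_schedule() is safe */
                __napi_schedule(&queue->backlog);
        }

        /* per-cpu setup, e.g. from net_dev_init(): */
        queue->csd.func  = trigger_softirq;
        queue->csd.info  = queue;
        queue->csd.flags = 0;

With one csd per backlog, and the napi_schedule_prep() check guaranteeing at
most one IPI in flight per backlog, __smp_call_function_single(cpu, &queue->csd, 0)
shouldn't have to wait on anything, whereas smp_call_function_many() can block
on the shared cfd_data.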

Tom