netdev - Re: [PATCH] ifb: add multi-queue support

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <412e6f7f0911111912q27f2b0aate56c637349292c3f@mail.gmail.com>
Date:	Thu, 12 Nov 2009 11:12:17 +0800
From:	Changli Gao <xiaosuo@...il.com>
To:	Patrick McHardy <kaber@...sh.net>
Cc:	"David S. Miller" <davem@...emloft.net>,
	Stephen Hemminger <shemminger@...tta.com>,
	Eric Dumazet <eric.dumazet@...il.com>,
	Tom Herbert <therbert@...gle.com>, netdev@...r.kernel.org
Subject: Re: [PATCH] ifb: add multi-queue support

On Wed, Nov 11, 2009 at 11:59 PM, Patrick McHardy <kaber@...sh.net> wrote:
> Changli Gao wrote:
>> diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c
>> index 69c2566..ac04e85 100644
>> --- a/drivers/net/ifb.c
>> +++ b/drivers/net/ifb.c
>> ...
>> +/* Number of ifb devices to be set up by this module. */
>>  static int numifbs = 2;
>> +module_param(numifbs, int, 0444);
>> +MODULE_PARM_DESC(numifbs, "Number of ifb devices");
>>
>> -static void ri_tasklet(unsigned long dev);
>> -static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev);
>> -static int ifb_open(struct net_device *dev);
>> -static int ifb_close(struct net_device *dev);
>> +/* Number of TX queues per ifb */
>> +static int numtxqs = 1;
>> +module_param(numtxqs, int, 0444);
>> +MODULE_PARM_DESC(numtxqs, "Number of TX queues per ifb");
>
> unsigned?

Yea, unsigned is better, and I need to check whether its value is
smaller than 1 somewhere. The same will be done with IFLA_NTXQ.

>
>> +             while ((skb = skb_dequeue(&pq->tq)) != NULL) {
>> +                     u32 from = G_TC_FROM(skb->tc_verd);
>> +
>> +                     skb->tc_verd = 0;
>> +                     skb->tc_verd = SET_TC_NCLS(skb->tc_verd);
>> +                     txq->tx_packets++;
>> +                     txq->tx_bytes +=skb->len;
>> +
>> +                     rcu_read_lock();
>> +                     skb->dev = dev_get_by_index_rcu(&init_net, skb->iif);
>> +                     if (!skb->dev) {
>> +                             rcu_read_unlock();
>> +                             dev_kfree_skb(skb);
>> +                             txq->tx_dropped++;
>> +                             break;
>> +                     }
>> +                     rcu_read_unlock();
>> +                     skb->iif = dev->ifindex;
>
> What protects the device from disappearing here and below during
> dev_queue_xmit() and netif_rx_ni()?

For dev_queue_xmit(), dev is holded by skb->_dst, so there is no
problem. But for netif_rx_ni(), I don't know how to prevent the device
disappearing, and it seems that all the NIC drivers have this problem.
Maybe there was the assumption about the execution context of
netif_rx() before. Now softirq can't be executed by softirqd, so the
packet receiving path maybe interleaved. I don't know how to prevent
it happening.


>> +
>> +static u16 ifb_select_queue(struct net_device *dev, struct sk_buff *skb)
>> +{
>> +     u32 addr1, addr2;
>> +     u32 hash, ihl;
>> +     union {
>> +             u16 in16[2];
>> +             u32 in32;
>> +     } ports;
>> +     u8 ip_proto;
>> +
>> +     if ((hash = skb_rx_queue_recorded(skb))) {
>> +             while (hash >= dev->real_num_tx_queues)
>> +                     hash -= dev->real_num_tx_queues;
>> +             return hash;
>> +     }
>> +
>> +     switch (skb->protocol) {
>> +     case __constant_htons(ETH_P_IP):
>> +             if (!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)))
>> +                     ip_proto = ip_hdr(skb)->protocol;
>> +             else
>> +                     ip_proto = 0;
>
> So fragments will get reordered?

Yea. It is OK, as the number of fragments isn't large. If we decide to
avoid this, we have to reasm them as netfilter does. and it isn't
efficiency.

>
>> +             addr1 = ip_hdr(skb)->saddr;
>> +             addr2 = ip_hdr(skb)->daddr;
>> +             ihl = ip_hdr(skb)->ihl << 2;
>
> ip_hdrlen()?

Yea, I'll use ip_hdrlen() instead.

>
>> +             break;
>> +     case __constant_htons(ETH_P_IPV6):
>> +             ip_proto = ipv6_hdr(skb)->nexthdr;
>> +             addr1 = ipv6_hdr(skb)->saddr.s6_addr32[3];
>> +             addr2 = ipv6_hdr(skb)->daddr.s6_addr32[3];
>> +             ihl = 10;
>
> Where does 10 come from?

It should be 40, after reviewing IPV6, I found that I need to loop
until finding the right protocol value.

>
>> +             break;
>> +     default:
>> +             return 0;
>
> Perhaps hash on skb->protocol here.

use return skb->protocol % dev->real_num_tx_queues; instead.

>
>> +     }
>> +     if (addr1 > addr2)
>> +             swap(addr1, addr2);
>> +
>> +     switch (ip_proto) {
>> +     case IPPROTO_TCP:
>> +     case IPPROTO_UDP:
>> +     case IPPROTO_DCCP:
>> +     case IPPROTO_ESP:
>> +     case IPPROTO_AH:
>> +     case IPPROTO_SCTP:
>> +     case IPPROTO_UDPLITE:
>> +             ports.in32 = *((u32 *) (skb_network_header(skb) + ihl));
>> +             if (ports.in16[0] > ports.in16[1])
>> +                     swap(ports.in16[0], ports.in16[1]);
>> +             break;
>> +
>> +     default:
>> +             ports.in32 = 0;
>> +             break;
>> +     }
>> +
>> +     hash = jhash_3words(addr1, addr2, ports.in32,
>> +                         simple_tx_hashrnd ^ ip_proto);
>> +
>> +     return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
>> +}
>> +
>> +static int ifb_init(struct net_device *dev)
>> +{
>> +     struct ifb_private *dp = netdev_priv(dev);
>> +     struct ifb_private_q *pq = dp->pq;
>> +     int i;
>> +
>> +     pq = kmalloc(sizeof(*pq) * dev->real_num_tx_queues, GFP_KERNEL);
>
> kcalloc()?

OK.

>> +     if (pq == NULL)
>> +             return -ENOMEM;
>> +     dp->pq = pq;
>> +
>> +     for (i = 0; i < dev->real_num_tx_queues; i++) {
>> +             pq[i].dev = dev;
>> +             skb_queue_head_init(&pq[i].rq);
>> +             skb_queue_head_init(&pq[i].tq);
>> +             init_waitqueue_head(&pq[i].wq);
>> +             pq[i].rx_packets = 0;
>> +             pq[i].rx_bytes = 0;
>> +             pq[i].rx_dropped = 0;
>> +     }
>> +
>> +     return 0;
>> +}
>
>> +static int ifb_get_tx_queues(struct net *net, struct nlattr *tb[],
>> +                          unsigned int *num_tx_queues,
>> +                          unsigned int *real_num_tx_queues)
>> +{
>> +     if (tb[IFLA_NTXQ]) {
>> +             *num_tx_queues = nla_get_u16(tb[IFLA_NTXQ]);
>
> We currently use unsigned ints for the queue number, so please
> use an u32 for the attribute as well.
>
>> +             *real_num_tx_queues = *num_tx_queues;
>> +     } else {
>> +             *num_tx_queues = numtxqs;
>> +             *real_num_tx_queues = numtxqs;
>> +     }
>> +
>> +     return 0;
>> +}
>> +
>
        u16                     (*ndo_select_queue)(struct net_device *dev,
                                                    struct sk_buff *skb);
use u16 as the return value so ..., and I think u16 is big enough. If
you insist on this, I'll use u32 instead.


-- 
Regards，
Changli Gao(xiaosuo@...il.com)
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html