Message-ID: <d53e8ed1-33a7-a29a-c53a-0be9e1a97901@redhat.com>
Date: Tue, 5 Dec 2017 15:29:52 +0800
From: Jason Wang <jasowang@...hat.com>
To: Willem de Bruijn <willemdebruijn.kernel@...il.com>
Cc: Network Development <netdev@...r.kernel.org>,
LKML <linux-kernel@...r.kernel.org>,
"Michael S. Tsirkin" <mst@...hat.com>,
Tom Herbert <tom@...bertland.com>, aconole@...hat.com,
wexu@...hat.com
Subject: Re: [PATCH net-next V3] tun: add eBPF based queue selection method
On 2017/12/05 08:16, Willem de Bruijn wrote:
> On Mon, Dec 4, 2017 at 4:31 AM, Jason Wang <jasowang@...hat.com> wrote:
>> This patch introduces an eBPF based queue selection method. With this,
>> the policy could be offloaded to userspace completely through a new
>> ioctl TUNSETSTEERINGEBPF.
>>
>> Signed-off-by: Jason Wang <jasowang@...hat.com>
>> ---
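[For context, attaching a steering program through the new
TUNSETSTEERINGEBPF ioctl would look roughly like this from userspace.
This is a hypothetical sketch, not taken from the patch: it assumes the
ioctl takes a pointer to the program fd and that the program is an
ordinary socket filter whose return value is interpreted as the queue
index.]

        #include <err.h>
        #include <sys/ioctl.h>
        #include <linux/if_tun.h>       /* TUNSETSTEERINGEBPF */

        /* tun_fd:  an open, attached tun queue fd.
         * prog_fd: fd from bpf(BPF_PROG_LOAD, ...); its return value
         *          is taken as the target txq (assumed convention).
         */
        static void attach_steering(int tun_fd, int prog_fd)
        {
                if (ioctl(tun_fd, TUNSETSTEERINGEBPF, &prog_fd) < 0)
                        err(1, "TUNSETSTEERINGEBPF");
        }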
>> +static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb)
>> +{
>> +       struct tun_steering_prog *prog;
>> +       u16 ret = 0;
>> +
>> +       prog = rcu_dereference(tun->steering_prog);
>> +       if (prog)
>> +               ret = bpf_prog_run_clear_cb(prog->prog, skb);
> This dereferences tun->steering_prog for a second time. It is safe
> in this load balancing case to assign a few extra packets to queue 0.
> But the issue can also be avoided by replacing the function with a
> direct call in tun_net_xmit:
>
> struct tun_steering_prog *s = rcu_dereference(tun->steering_prog);
> if (s)
>         ret = bpf_prog_run_clear_cb(s->prog, skb) % tun->numqueues;
Right.
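Something like this then, as you show above (untested sketch; it
assumes the rcu_read_lock() taken at the top of tun_net_xmit() is
still held, so one dereference covers both the check and the run):

        struct tun_steering_prog *s;
        u16 ret = 0;

        /* Single rcu_dereference(): the same snapshot decides whether
         * a program is attached and which program gets run, so we
         * cannot race with a concurrent TUNSETSTEERINGEBPF update.
         */
        s = rcu_dereference(tun->steering_prog);
        if (s)
                ret = bpf_prog_run_clear_cb(s->prog, skb) % tun->numqueues;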
>
>> /* Net device start xmit */
>> -static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>> +static void tun_automq_xmit(struct tun_struct *tun, struct sk_buff *skb)
>> {
>> -       struct tun_struct *tun = netdev_priv(dev);
>> -       int txq = skb->queue_mapping;
>> -       struct tun_file *tfile;
>> -       u32 numqueues = 0;
>> -
>> -       rcu_read_lock();
>> -       tfile = rcu_dereference(tun->tfiles[txq]);
>> -       numqueues = READ_ONCE(tun->numqueues);
>> -
>> -       /* Drop packet if interface is not attached */
>> -       if (txq >= numqueues)
>> -               goto drop;
>> -
>> #ifdef CONFIG_RPS
>> -       if (numqueues == 1 && static_key_false(&rps_needed)) {
>> +       if (tun->numqueues == 1 && static_key_false(&rps_needed)) {
>>                 /* Select queue was not called for the skbuff, so we extract the
>>                  * RPS hash and save it into the flow_table here.
>>                  */
>> @@ -969,6 +986,26 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>>                 }
>>         }
>> #endif
>> +}
>> +
>> +/* Net device start xmit */
>> +static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>> +{
>> +       struct tun_struct *tun = netdev_priv(dev);
>> +       int txq = skb->queue_mapping;
>> +       struct tun_file *tfile;
>> +       u32 numqueues = 0;
>> +
>> +       rcu_read_lock();
>> +       tfile = rcu_dereference(tun->tfiles[txq]);
>> +       numqueues = READ_ONCE(tun->numqueues);
> Now tun->numqueues is read twice, reversing commit fa35864e0bb7
> ("tuntap: Fix for a race in accessing numqueues"). I don't see anything
> left that would cause a divide by zero after the relevant code was
> converted from divide to multiply and subsequently even removed.
>
> But if it's safe to read multiple times, might as well remove the READ_ONCE.
Good point, but since READ_ONCE() here was not introduced by this
patch, we'd better change that in a separate patch.
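For reference, what fa35864e0bb7 did was snapshot the queue count once
and use the same value for both the bounds check and any later
arithmetic, along these lines:

        /* Read tun->numqueues exactly once: a queue can be detached
         * between two reads, so the check and the use must see the
         * same snapshot.
         */
        u32 numqueues = READ_ONCE(tun->numqueues);

        if (txq >= numqueues)
                goto drop;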
>
>> @@ -1551,7 +1588,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
>>         int copylen;
>>         bool zerocopy = false;
>>         int err;
>> -       u32 rxhash;
>> +       u32 rxhash = 0;
>>         int skb_xdp = 1;
>>         bool frags = tun_napi_frags_enabled(tun);
>>
>> @@ -1739,7 +1776,10 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
>>                 rcu_read_unlock();
>>         }
>>
>> -       rxhash = __skb_get_hash_symmetric(skb);
>> +       rcu_read_lock();
>> +       if (!rcu_dereference(tun->steering_prog))
>> +               rxhash = __skb_get_hash_symmetric(skb);
>> +       rcu_read_unlock();
>>
>>         if (frags) {
>>                 /* Exercise flow dissector code path. */
>> @@ -1783,7 +1823,9 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
>>         u64_stats_update_end(&stats->syncp);
>>         put_cpu_ptr(stats);
>>
>> -       tun_flow_update(tun, rxhash, tfile);
>> +       if (rxhash)
>> +               tun_flow_update(tun, rxhash, tfile);
>> +
> Nit: zero is a valid hash? In which case, an int64_t initialized to -1 is the
> safer check.
It looks like it can't be zero. E.g. looking at
__flow_hash_from_keys(), it does:
static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval)
{
        u32 hash;

        __flow_hash_consistentify(keys);

        hash = __flow_hash_words(flow_keys_hash_start(keys),
                                 flow_keys_hash_length(keys), keyval);
        if (!hash)
                hash = 1;

        return hash;
}
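So a computed hash can never be zero here, and 0 is a safe "not
computed" sentinel. If we wanted a sentinel outside the u32 range
instead, as you suggest, the change would look roughly like this
(hypothetical sketch, not part of this patch):

        s64 rxhash = -1;        /* no valid u32 hash is negative */

        /* ... */

        rcu_read_lock();
        if (!rcu_dereference(tun->steering_prog))
                rxhash = __skb_get_hash_symmetric(skb);
        rcu_read_unlock();

        /* ... */

        if (rxhash >= 0)
                tun_flow_update(tun, (u32)rxhash, tfile);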
Thanks