lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:	Sat, 11 Sep 2010 00:47:18 +0800
From:	Changli Gao <xiaosuo@...il.com>
To:	Eric Dumazet <eric.dumazet@...il.com>
Cc:	"David S. Miller" <davem@...emloft.net>,
	Oliver Hartkopp <socketcan@...tkopp.net>,
	"Michael S. Tsirkin" <mst@...hat.com>, netdev@...r.kernel.org
Subject: Re: [PATCH] net: af_packet: don't call tpacket_destruct_skb() until
 the skb is sent out

On Fri, Sep 10, 2010 at 10:26 PM, Eric Dumazet <eric.dumazet@...il.com> wrote:
> Le vendredi 10 septembre 2010 à 21:22 +0800, Changli Gao a écrit :
>> Since skb->destructor() is used to account socket memory, and maybe called
>> before the skb is sent out, a corrupt skb maybe sent out finally.
>>
>> A new destructor is added into structure skb_shared_info(), and it won't
>> be called until the last reference to the data of a skb is put. af_packet
>> uses this destructor instead.
>>
>
> Hi Changli
>
>>  static void tpacket_destruct_skb(struct sk_buff *skb)
>>  {
>> -     struct packet_sock *po = pkt_sk(skb->sk);
>> -     void *ph;
>> -
>> -     BUG_ON(skb == NULL);
>> +     struct tpacket_destructor_arg *arg = skb_shinfo(skb)->destructor_arg;
>> +     struct packet_sock *po = pkt_sk(arg->sk);
>> +     void *ph = arg->ph;
>>
>>       if (likely(po->tx_ring.pg_vec)) {
>> -             ph = skb_shinfo(skb)->destructor_arg;
>>               BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
>>               BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
>>               atomic_dec(&po->tx_ring.pending);
>>               __packet_set_status(po, ph, TP_STATUS_AVAILABLE);
>>       }
>>
>> +     skb->sk = arg->sk;
>>       sock_wfree(skb);
>
> Are you sure sock_wfree(skb) is still needed ?

sock_wfree(skb) is also used to wake up the users who sleep on
poll(2). If sock_wfree(skb) is moved into skb->destructor(), and
called before skb is sent out, pollers will be waked up without
POLLOUT, and since the later skb_shinfo(skb)->destructor() doesn't
wake up the pollers, POLLOUT events will be lost, and the poller will
be blocked forever.

>
>
>> +     kfree(arg);
>
> this new kmalloc()/kfree() for each sent packet wont please the guys
> using af_packet/mmap interface...

Embed these two pointers into skb_shared_info? It may slow the others.

>
>>  }
>>
>>  static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
>> @@ -862,7 +867,6 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
>>       skb->dev = dev;
>>       skb->priority = po->sk.sk_priority;
>>       skb->mark = po->sk.sk_mark;
>> -     skb_shinfo(skb)->destructor_arg = ph.raw;
>>
>>       switch (po->tp_version) {
>>       case TPACKET_V2:
>> @@ -884,9 +888,8 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
>>       to_write = tp_len;
>>
>>       if (sock->type == SOCK_DGRAM) {
>> -             err = dev_hard_header(skb, dev, ntohs(proto), addr,
>> -                             NULL, tp_len);
>> -             if (unlikely(err < 0))
>> +             if (unlikely(dev_hard_header(skb, dev, ntohs(proto), addr,
>> +                                          NULL, tp_len) < 0))
>>                       return -EINVAL;
>>       } else if (dev->hard_header_len) {
>>               /* net device doesn't like empty head */
>> @@ -897,8 +900,7 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
>>               }
>>
>>               skb_push(skb, dev->hard_header_len);
>> -             err = skb_store_bits(skb, 0, data,
>> -                             dev->hard_header_len);
>> +             err = skb_store_bits(skb, 0, data, dev->hard_header_len);
>>               if (unlikely(err))
>>                       return err;
>>
>> @@ -906,7 +908,6 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
>>               to_write -= dev->hard_header_len;
>>       }
>>
>> -     err = -EFAULT;
>>       page = virt_to_page(data);
>>       offset = offset_in_page(data);
>>       len_max = PAGE_SIZE - offset;
>> @@ -994,6 +995,8 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
>>               size_max = dev->mtu + reserve;
>>
>>       do {
>> +             struct tpacket_destructor_arg *arg;
>> +
>>               ph = packet_current_frame(po, &po->tx_ring,
>>                               TP_STATUS_SEND_REQUEST);
>>
>> @@ -1028,7 +1031,16 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
>>                       }
>>               }
>>
>> -             skb->destructor = tpacket_destruct_skb;
>> +             arg = kmalloc(sizeof(*arg), GFP_KERNEL);
>> +             if (unlikely(arg == NULL)) {
>> +                     err = -ENOBUFS;
>> +                     goto out_status;
>> +             }
>> +             arg->sk = &po->sk;
>> +             arg->ph = ph;
>> +             skb_shinfo(skb)->destructor_arg = arg;
>> +             skb->destructor = NULL;
>
> why setting skb->destructor to NULL here ?

Let skb_shinfo(skb)->destructor() do all the things which used to be
done by skb->destructor().

>
>> +             skb_shinfo(skb)->destructor = tpacket_destruct_skb;
>>               __packet_set_status(po, ph, TP_STATUS_SENDING);
>>               atomic_inc(&po->tx_ring.pending);
>>
>
> I dont yet understand how this can prevent af_unix module being unloaded
> while packets are in flight

This issue isn't addressed, and I think it should be fixed in a separate patch.

>
> I believe sock_wfree() should be avoided (since early orphaning occurs),
> to reduce number of atomic ops to the minimum.
>
> af_packet/mmap users want fast operations, we should not use
> sock_wfree() for them, because max number of in flight packets is known
> (tx ring buffer)
>
>

But the users rely on the kernel to inform them there is available
frame for use.

-- 
Regards,
Changli Gao(xiaosuo@...il.com)
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ