[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <1284128770.24675.41.camel@edumazet-laptop>
Date: Fri, 10 Sep 2010 16:26:10 +0200
From: Eric Dumazet <eric.dumazet@...il.com>
To: Changli Gao <xiaosuo@...il.com>
Cc: "David S. Miller" <davem@...emloft.net>,
Oliver Hartkopp <socketcan@...tkopp.net>,
"Michael S. Tsirkin" <mst@...hat.com>, netdev@...r.kernel.org
Subject: Re: [PATCH] net: af_packet: don't call tpacket_destruct_skb()
until the skb is sent out
Le vendredi 10 septembre 2010 à 21:22 +0800, Changli Gao a écrit :
> Since skb->destructor() is used to account socket memory, and maybe called
> before the skb is sent out, a corrupt skb maybe sent out finally.
>
> A new destructor is added into structure skb_shared_info(), and it won't
> be called until the last reference to the data of a skb is put. af_packet
> uses this destructor instead.
>
Hi Changli
> Signed-off-by: Changli Gao <xiaosuo@...il.com>
> ---
> include/linux/skbuff.h | 1 +
> net/core/skbuff.c | 19 ++++++++++++++-----
> net/packet/af_packet.c | 38 +++++++++++++++++++++++++-------------
> 3 files changed, 40 insertions(+), 18 deletions(-)
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 9e8085a..f874c13 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -191,6 +191,7 @@ struct skb_shared_info {
> __u8 tx_flags;
> struct sk_buff *frag_list;
> struct skb_shared_hwtstamps hwtstamps;
> + void (*destructor)(struct sk_buff *skb);
>
> /*
> * Warning : all fields before dataref are cleared in __alloc_skb()
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 2d1bc76..ff37e54 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -332,10 +332,14 @@ static void skb_release_data(struct sk_buff *skb)
> if (!skb->cloned ||
> !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
> &skb_shinfo(skb)->dataref)) {
> - if (skb_shinfo(skb)->nr_frags) {
> + struct skb_shared_info *shinfo = skb_shinfo(skb);
> +
> + if (shinfo->destructor)
> + shinfo->destructor(skb);
> + if (shinfo->nr_frags) {
> int i;
> - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
> - put_page(skb_shinfo(skb)->frags[i].page);
> + for (i = 0; i < shinfo->nr_frags; i++)
> + put_page(shinfo->frags[i].page);
> }
>
> if (skb_has_frag_list(skb))
> @@ -497,9 +501,12 @@ bool skb_recycle_check(struct sk_buff *skb, int skb_size)
> if (skb_shared(skb) || skb_cloned(skb))
> return false;
>
> + shinfo = skb_shinfo(skb);
> + if (shinfo->destructor)
> + return false;
> +
> skb_release_head_state(skb);
>
> - shinfo = skb_shinfo(skb);
> memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
> atomic_set(&shinfo->dataref, 1);
>
> @@ -799,7 +806,9 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
>
> memcpy((struct skb_shared_info *)(data + size),
> skb_shinfo(skb),
> - offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));
> + offsetof(struct skb_shared_info,
> + frags[skb_shinfo(skb)->nr_frags]));
> + skb_shinfo(skb)->destructor = NULL;
>
> /* Check if we can avoid taking references on fragments if we own
> * the last reference on skb->head. (see skb_release_data())
> diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
> index 3616f27..7e16b55 100644
> --- a/net/packet/af_packet.c
> +++ b/net/packet/af_packet.c
> @@ -823,22 +823,27 @@ ring_is_full:
> goto drop_n_restore;
> }
>
> +struct tpacket_destructor_arg {
> + struct sock *sk;
> + void *ph;
> +};
> +
> static void tpacket_destruct_skb(struct sk_buff *skb)
> {
> - struct packet_sock *po = pkt_sk(skb->sk);
> - void *ph;
> -
> - BUG_ON(skb == NULL);
> + struct tpacket_destructor_arg *arg = skb_shinfo(skb)->destructor_arg;
> + struct packet_sock *po = pkt_sk(arg->sk);
> + void *ph = arg->ph;
>
> if (likely(po->tx_ring.pg_vec)) {
> - ph = skb_shinfo(skb)->destructor_arg;
> BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
> BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
> atomic_dec(&po->tx_ring.pending);
> __packet_set_status(po, ph, TP_STATUS_AVAILABLE);
> }
>
> + skb->sk = arg->sk;
> sock_wfree(skb);
Are you sure sock_wfree(skb) is still needed?
> + kfree(arg);
This new kmalloc()/kfree() for each sent packet won't please the people
using the af_packet/mmap interface.
> }
>
> static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
> @@ -862,7 +867,6 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
> skb->dev = dev;
> skb->priority = po->sk.sk_priority;
> skb->mark = po->sk.sk_mark;
> - skb_shinfo(skb)->destructor_arg = ph.raw;
>
> switch (po->tp_version) {
> case TPACKET_V2:
> @@ -884,9 +888,8 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
> to_write = tp_len;
>
> if (sock->type == SOCK_DGRAM) {
> - err = dev_hard_header(skb, dev, ntohs(proto), addr,
> - NULL, tp_len);
> - if (unlikely(err < 0))
> + if (unlikely(dev_hard_header(skb, dev, ntohs(proto), addr,
> + NULL, tp_len) < 0))
> return -EINVAL;
> } else if (dev->hard_header_len) {
> /* net device doesn't like empty head */
> @@ -897,8 +900,7 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
> }
>
> skb_push(skb, dev->hard_header_len);
> - err = skb_store_bits(skb, 0, data,
> - dev->hard_header_len);
> + err = skb_store_bits(skb, 0, data, dev->hard_header_len);
> if (unlikely(err))
> return err;
>
> @@ -906,7 +908,6 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
> to_write -= dev->hard_header_len;
> }
>
> - err = -EFAULT;
> page = virt_to_page(data);
> offset = offset_in_page(data);
> len_max = PAGE_SIZE - offset;
> @@ -994,6 +995,8 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
> size_max = dev->mtu + reserve;
>
> do {
> + struct tpacket_destructor_arg *arg;
> +
> ph = packet_current_frame(po, &po->tx_ring,
> TP_STATUS_SEND_REQUEST);
>
> @@ -1028,7 +1031,16 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
> }
> }
>
> - skb->destructor = tpacket_destruct_skb;
> + arg = kmalloc(sizeof(*arg), GFP_KERNEL);
> + if (unlikely(arg == NULL)) {
> + err = -ENOBUFS;
> + goto out_status;
> + }
> + arg->sk = &po->sk;
> + arg->ph = ph;
> + skb_shinfo(skb)->destructor_arg = arg;
> + skb->destructor = NULL;
Why set skb->destructor to NULL here?
> + skb_shinfo(skb)->destructor = tpacket_destruct_skb;
> __packet_set_status(po, ph, TP_STATUS_SENDING);
> atomic_inc(&po->tx_ring.pending);
>
I don't yet understand how this can prevent the af_unix module from being
unloaded while packets are in flight.
I believe sock_wfree() should be avoided (since early orphaning occurs),
to reduce the number of atomic ops to the minimum.
af_packet/mmap users want fast operations; we should not use
sock_wfree() for them, because the maximum number of in-flight packets
is known (tx ring buffer).
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists