[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1266523748.15681.25.camel@w-sridhar.beaverton.ibm.com>
Date: Thu, 18 Feb 2010 12:09:08 -0800
From: Sridhar Samudrala <sri@...ibm.com>
To: Arnd Bergmann <arnd@...db.de>
Cc: David Miller <davem@...emloft.net>, kaber@...sh.net,
eswierk@...stanetworks.com, netdev@...r.kernel.org
Subject: Re: [PATCH 1/3] macvtap: rework object lifetime rules
On Thu, 2010-02-18 at 16:45 +0100, Arnd Bergmann wrote:
> This reworks the change done by the previous patch
> in a more complete way.
>
> The original macvtap code has a number of problems
> resulting from the use of RCU for protecting the
> access to struct macvtap_queue from open files.
>
> This includes
> - need for GFP_ATOMIC allocations for skbs
> - potential deadlocks when copy_*_user sleeps
> - inability to work with vhost-net
>
> Changing the lifetime of macvtap_queue to always
> depend on the open file solves all these. The
> RCU reference simply moves one step down to
> the reference on the macvlan_dev, which we
> only need for nonblocking operations.
>
> Signed-off-by: Arnd Bergmann <arnd@...db.de>
Reviewed and tested.
Acked-by: Sridhar Samudrala <sri@...ibm.com>
> ---
> drivers/net/macvtap.c | 183 ++++++++++++++++++++++++-------------------------
> 1 files changed, 91 insertions(+), 92 deletions(-)
>
> diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
> index fe7656b..7050997 100644
> --- a/drivers/net/macvtap.c
> +++ b/drivers/net/macvtap.c
> @@ -60,30 +60,19 @@ static struct cdev macvtap_cdev;
>
> /*
> * RCU usage:
> - * The macvtap_queue is referenced both from the chardev struct file
> - * and from the struct macvlan_dev using rcu_read_lock.
> + * The macvtap_queue and the macvlan_dev are loosely coupled, the
> + * pointers from one to the other can only be read while rcu_read_lock
> + * or macvtap_lock is held.
> *
> - * We never actually update the contents of a macvtap_queue atomically
> - * with RCU but it is used for race-free destruction of a queue when
> - * either the file or the macvlan_dev goes away. Pointers back to
> - * the dev and the file are implicitly valid as long as the queue
> - * exists.
> + * Both the file and the macvlan_dev hold a reference on the macvtap_queue
> + * through sock_hold(&q->sk). When the macvlan_dev goes away first,
> + * q->vlan becomes inaccessible. When the files gets closed,
> + * macvtap_get_queue() fails.
> *
> - * The callbacks from macvlan are always done with rcu_read_lock held
> - * already. For calls from file_operations, we use the rcu_read_lock_bh
> - * to get a reference count on the socket and the device.
> - *
> - * When destroying a queue, we remove the pointers from the file and
> - * from the dev and then synchronize_rcu to make sure no thread is
> - * still using the queue. There may still be references to the struct
> - * sock inside of the queue from outbound SKBs, but these never
> - * reference back to the file or the dev. The data structure is freed
> - * through __sk_free when both our references and any pending SKBs
> - * are gone.
> - *
> - * macvtap_lock is only used to prevent multiple concurrent open()
> - * calls to assign a new vlan->tap pointer. It could be moved into
> - * the macvlan_dev itself but is extremely rarely used.
> + * There may still be references to the struct sock inside of the
> + * queue from outbound SKBs, but these never reference back to the
> + * file or the dev. The data structure is freed through __sk_free
> + * when both our references and any pending SKBs are gone.
> */
> static DEFINE_SPINLOCK(macvtap_lock);
>
> @@ -101,11 +90,12 @@ static int macvtap_set_queue(struct net_device *dev, struct file *file,
> goto out;
>
> err = 0;
> - q->vlan = vlan;
> + rcu_assign_pointer(q->vlan, vlan);
> rcu_assign_pointer(vlan->tap, q);
> + sock_hold(&q->sk);
>
> q->file = file;
> - rcu_assign_pointer(file->private_data, q);
> + file->private_data = q;
>
> out:
> spin_unlock(&macvtap_lock);
> @@ -113,28 +103,25 @@ out:
> }
>
> /*
> - * We must destroy each queue exactly once, when either
> - * the netdev or the file go away.
> + * The file owning the queue got closed, give up both
> + * the reference that the files holds as well as the
> + * one from the macvlan_dev if that still exists.
> *
> * Using the spinlock makes sure that we don't get
> * to the queue again after destroying it.
> - *
> - * synchronize_rcu serializes with the packet flow
> - * that uses rcu_read_lock.
> */
> -static void macvtap_del_queue(struct macvtap_queue **qp)
> +static void macvtap_put_queue(struct macvtap_queue *q)
> {
> - struct macvtap_queue *q;
> + struct macvlan_dev *vlan;
>
> spin_lock(&macvtap_lock);
> - q = rcu_dereference(*qp);
> - if (!q) {
> - spin_unlock(&macvtap_lock);
> - return;
> + vlan = rcu_dereference(q->vlan);
> + if (vlan) {
> + rcu_assign_pointer(vlan->tap, NULL);
> + rcu_assign_pointer(q->vlan, NULL);
> + sock_put(&q->sk);
> }
>
> - rcu_assign_pointer(q->vlan->tap, NULL);
> - rcu_assign_pointer(q->file->private_data, NULL);
> spin_unlock(&macvtap_lock);
>
> synchronize_rcu();
> @@ -152,29 +139,29 @@ static struct macvtap_queue *macvtap_get_queue(struct net_device *dev,
> return rcu_dereference(vlan->tap);
> }
>
> +/*
> + * The net_device is going away, give up the reference
> + * that it holds on the queue (all the queues one day)
> + * and safely set the pointer from the queues to NULL.
> + */
> static void macvtap_del_queues(struct net_device *dev)
> {
> struct macvlan_dev *vlan = netdev_priv(dev);
> - macvtap_del_queue(&vlan->tap);
> -}
> -
> -static inline struct macvtap_queue *macvtap_file_get_queue(struct file *file)
> -{
> struct macvtap_queue *q;
> - rcu_read_lock_bh();
> - q = rcu_dereference(file->private_data);
> - if (q) {
> - sock_hold(&q->sk);
> - dev_hold(q->vlan->dev);
> +
> + spin_lock(&macvtap_lock);
> + q = rcu_dereference(vlan->tap);
> + if (!q) {
> + spin_unlock(&macvtap_lock);
> + return;
> }
> - rcu_read_unlock_bh();
> - return q;
> -}
>
> -static inline void macvtap_file_put_queue(struct macvtap_queue *q)
> -{
> + rcu_assign_pointer(vlan->tap, NULL);
> + rcu_assign_pointer(q->vlan, NULL);
> + spin_unlock(&macvtap_lock);
> +
> + synchronize_rcu();
> sock_put(&q->sk);
> - dev_put(q->vlan->dev);
> }
>
> /*
> @@ -284,7 +271,6 @@ static int macvtap_open(struct inode *inode, struct file *file)
> q->sock.type = SOCK_RAW;
> q->sock.state = SS_CONNECTED;
> sock_init_data(&q->sock, &q->sk);
> - q->sk.sk_allocation = GFP_ATOMIC; /* for now */
> q->sk.sk_write_space = macvtap_sock_write_space;
>
> err = macvtap_set_queue(dev, file, q);
> @@ -300,13 +286,14 @@ out:
>
> static int macvtap_release(struct inode *inode, struct file *file)
> {
> - macvtap_del_queue((struct macvtap_queue **)&file->private_data);
> + struct macvtap_queue *q = file->private_data;
> + macvtap_put_queue(q);
> return 0;
> }
>
> static unsigned int macvtap_poll(struct file *file, poll_table * wait)
> {
> - struct macvtap_queue *q = macvtap_file_get_queue(file);
> + struct macvtap_queue *q = file->private_data;
> unsigned int mask = POLLERR;
>
> if (!q)
> @@ -323,7 +310,6 @@ static unsigned int macvtap_poll(struct file *file, poll_table * wait)
> sock_writeable(&q->sk)))
> mask |= POLLOUT | POLLWRNORM;
>
> - macvtap_file_put_queue(q);
> out:
> return mask;
> }
> @@ -334,6 +320,7 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q,
> int noblock)
> {
> struct sk_buff *skb;
> + struct macvlan_dev *vlan;
> size_t len = count;
> int err;
>
> @@ -341,26 +328,37 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q,
> return -EINVAL;
>
> skb = sock_alloc_send_skb(&q->sk, NET_IP_ALIGN + len, noblock, &err);
> -
> - if (!skb) {
> - macvlan_count_rx(q->vlan, 0, false, false);
> - return err;
> - }
> + if (!skb)
> + goto err;
>
> skb_reserve(skb, NET_IP_ALIGN);
> skb_put(skb, count);
>
> - if (skb_copy_datagram_from_iovec(skb, 0, iv, 0, len)) {
> - macvlan_count_rx(q->vlan, 0, false, false);
> - kfree_skb(skb);
> - return -EFAULT;
> - }
> + err = skb_copy_datagram_from_iovec(skb, 0, iv, 0, len);
> + if (err)
> + goto err;
>
> skb_set_network_header(skb, ETH_HLEN);
> -
> - macvlan_start_xmit(skb, q->vlan->dev);
> + rcu_read_lock_bh();
> + vlan = rcu_dereference(q->vlan);
> + if (vlan)
> + macvlan_start_xmit(skb, vlan->dev);
> + else
> + kfree_skb(skb);
> + rcu_read_unlock_bh();
>
> return count;
> +
> +err:
> + rcu_read_lock_bh();
> + vlan = rcu_dereference(q->vlan);
> + if (vlan)
> + macvlan_count_rx(q->vlan, 0, false, false);
> + rcu_read_unlock_bh();
> +
> + kfree_skb(skb);
> +
> + return err;
> }
>
> static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv,
> @@ -368,15 +366,10 @@ static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv,
> {
> struct file *file = iocb->ki_filp;
> ssize_t result = -ENOLINK;
> - struct macvtap_queue *q = macvtap_file_get_queue(file);
> -
> - if (!q)
> - goto out;
> + struct macvtap_queue *q = file->private_data;
>
> result = macvtap_get_user(q, iv, iov_length(iv, count),
> file->f_flags & O_NONBLOCK);
> - macvtap_file_put_queue(q);
> -out:
> return result;
> }
>
> @@ -385,14 +378,17 @@ static ssize_t macvtap_put_user(struct macvtap_queue *q,
> const struct sk_buff *skb,
> const struct iovec *iv, int len)
> {
> - struct macvlan_dev *vlan = q->vlan;
> + struct macvlan_dev *vlan;
> int ret;
>
> len = min_t(int, skb->len, len);
>
> ret = skb_copy_datagram_const_iovec(skb, 0, iv, 0, len);
>
> + rcu_read_lock_bh();
> + vlan = rcu_dereference(q->vlan);
> macvlan_count_rx(vlan, len, ret == 0, 0);
> + rcu_read_unlock_bh();
>
> return ret ? ret : len;
> }
> @@ -401,14 +397,16 @@ static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv,
> unsigned long count, loff_t pos)
> {
> struct file *file = iocb->ki_filp;
> - struct macvtap_queue *q = macvtap_file_get_queue(file);
> + struct macvtap_queue *q = file->private_data;
>
> DECLARE_WAITQUEUE(wait, current);
> struct sk_buff *skb;
> ssize_t len, ret = 0;
>
> - if (!q)
> - return -ENOLINK;
> + if (!q) {
> + ret = -ENOLINK;
> + goto out;
> + }
>
> len = iov_length(iv, count);
> if (len < 0) {
> @@ -444,7 +442,6 @@ static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv,
> remove_wait_queue(q->sk.sk_sleep, &wait);
>
> out:
> - macvtap_file_put_queue(q);
> return ret;
> }
>
> @@ -454,12 +451,13 @@ out:
> static long macvtap_ioctl(struct file *file, unsigned int cmd,
> unsigned long arg)
> {
> - struct macvtap_queue *q;
> + struct macvtap_queue *q = file->private_data;
> + struct macvlan_dev *vlan;
> void __user *argp = (void __user *)arg;
> struct ifreq __user *ifr = argp;
> unsigned int __user *up = argp;
> unsigned int u;
> - char devname[IFNAMSIZ];
> + int ret;
>
> switch (cmd) {
> case TUNSETIFF:
> @@ -471,16 +469,21 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd,
> return 0;
>
> case TUNGETIFF:
> - q = macvtap_file_get_queue(file);
> - if (!q)
> + rcu_read_lock_bh();
> + vlan = rcu_dereference(q->vlan);
> + if (vlan)
> + dev_hold(vlan->dev);
> + rcu_read_unlock_bh();
> +
> + if (!vlan)
> return -ENOLINK;
> - memcpy(devname, q->vlan->dev->name, sizeof(devname));
> - macvtap_file_put_queue(q);
>
> + ret = 0;
> if (copy_to_user(&ifr->ifr_name, q->vlan->dev->name, IFNAMSIZ) ||
> put_user((TUN_TAP_DEV | TUN_NO_PI), &ifr->ifr_flags))
> - return -EFAULT;
> - return 0;
> + ret = -EFAULT;
> + dev_put(vlan->dev);
> + return ret;
>
> case TUNGETFEATURES:
> if (put_user((IFF_TAP | IFF_NO_PI), up))
> @@ -491,11 +494,7 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd,
> if (get_user(u, up))
> return -EFAULT;
>
> - q = macvtap_file_get_queue(file);
> - if (!q)
> - return -ENOLINK;
> q->sk.sk_sndbuf = u;
> - macvtap_file_put_queue(q);
> return 0;
>
> case TUNSETOFFLOAD:
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists