lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20120821144013.GA7784@redhat.com>
Date:	Tue, 21 Aug 2012 17:40:14 +0300
From:	"Michael S. Tsirkin" <mst@...hat.com>
To:	Rafael Aquini <aquini@...hat.com>
Cc:	linux-mm@...ck.org, linux-kernel@...r.kernel.org,
	virtualization@...ts.linux-foundation.org,
	Rusty Russell <rusty@...tcorp.com.au>,
	Rik van Riel <riel@...hat.com>, Mel Gorman <mel@....ul.ie>,
	Andi Kleen <andi@...stfloor.org>,
	Andrew Morton <akpm@...ux-foundation.org>,
	Konrad Rzeszutek Wilk <konrad.wilk@...cle.com>,
	Minchan Kim <minchan@...nel.org>
Subject: Re: [PATCH v8 3/5] virtio_balloon: introduce migration primitives to
 balloon pages

On Tue, Aug 21, 2012 at 09:47:46AM -0300, Rafael Aquini wrote:
> Memory fragmentation introduced by ballooning might reduce significantly
> the number of 2MB contiguous memory blocks that can be used within a guest,
> thus imposing performance penalties associated with the reduced number of
> transparent huge pages that could be used by the guest workload.
> 
> Besides making balloon pages movable at allocation time and introducing
> the necessary primitives to perform balloon page migration/compaction,
> this patch also introduces the following locking scheme to provide the
> proper synchronization and protection for struct virtio_balloon elements
> against concurrent accesses due to parallel operations introduced by
> memory compaction / page migration.
>  - balloon_lock (mutex) : synchronizes the access demand to elements of
> 			  struct virtio_balloon and its queue operations;
>  - pages_lock (spinlock): special protection to balloon pages list against
> 			  concurrent list handling operations;
>  - virtio_baloon->pages list handling sync by RCU operations;
> 
> Signed-off-by: Rafael Aquini <aquini@...hat.com>
> ---
>  drivers/virtio/virtio_balloon.c | 210 +++++++++++++++++++++++++++++++++++++---
>  1 file changed, 199 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
> index 0908e60..bda7bb0 100644
> --- a/drivers/virtio/virtio_balloon.c
> +++ b/drivers/virtio/virtio_balloon.c
> @@ -27,6 +27,7 @@
>  #include <linux/delay.h>
>  #include <linux/slab.h>
>  #include <linux/module.h>
> +#include <linux/balloon_compaction.h>
>  
>  /*
>   * Balloon device works in 4K page units.  So each page is pointed to by
> @@ -35,6 +36,12 @@
>   */
>  #define VIRTIO_BALLOON_PAGES_PER_PAGE (PAGE_SIZE >> VIRTIO_BALLOON_PFN_SHIFT)
>  
> +/* flags used to hint compaction procedures about the balloon device status */
> +enum balloon_status_flags {
> +	BALLOON_REMOVAL = 0,	/* balloon device is under removal steps */
> +	BALLOON_OK,		/* balloon device is up and running */
> +};
> +
>  struct virtio_balloon
>  {
>  	struct virtio_device *vdev;
> @@ -46,11 +53,24 @@ struct virtio_balloon
>  	/* The thread servicing the balloon. */
>  	struct task_struct *thread;
>  
> +	/* balloon special page->mapping */
> +	struct address_space *mapping;
> +
> +	/* Synchronize access/update to this struct virtio_balloon elements */
> +	struct mutex balloon_lock;
> +
>  	/* Waiting for host to ack the pages we released. */
>  	wait_queue_head_t acked;
>  
>  	/* Number of balloon pages we've told the Host we're not using. */
>  	unsigned int num_pages;
> +
> +	/* balloon device status flag */
> +	unsigned short balloon_status;
> +
> +	/* Protect 'pages' list against concurrent handling */
> +	spinlock_t pages_lock;
> +
>  	/*
>  	 * The pages we've told the Host we're not using.
>  	 * Each page on this list adds VIRTIO_BALLOON_PAGES_PER_PAGE
> @@ -122,13 +142,17 @@ static void set_page_pfns(u32 pfns[], struct page *page)
>  
>  static void fill_balloon(struct virtio_balloon *vb, size_t num)
>  {
> +	/* Get the proper GFP alloc mask from vb->mapping flags */
> +	gfp_t vb_gfp_mask = mapping_gfp_mask(vb->mapping);
> +
>  	/* We can only do one array worth at a time. */
>  	num = min(num, ARRAY_SIZE(vb->pfns));
>  
> +	mutex_lock(&vb->balloon_lock);
>  	for (vb->num_pfns = 0; vb->num_pfns < num;
>  	     vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) {
> -		struct page *page = alloc_page(GFP_HIGHUSER | __GFP_NORETRY |
> -					__GFP_NOMEMALLOC | __GFP_NOWARN);
> +		struct page *page = alloc_page(vb_gfp_mask | __GFP_NORETRY |
> +					       __GFP_NOWARN | __GFP_NOMEMALLOC);
>  		if (!page) {
>  			if (printk_ratelimit())
>  				dev_printk(KERN_INFO, &vb->vdev->dev,
> @@ -141,7 +165,10 @@ static void fill_balloon(struct virtio_balloon *vb, size_t num)
>  		set_page_pfns(vb->pfns + vb->num_pfns, page);
>  		vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE;
>  		totalram_pages--;
> -		list_add(&page->lru, &vb->pages);
> +		spin_lock(&vb->pages_lock);
> +		list_add_rcu(&page->lru, &vb->pages);
> +		assign_balloon_mapping(page, vb->mapping);
> +		spin_unlock(&vb->pages_lock);
>  	}
>  
>  	/* Didn't get any?  Oh well. */
> @@ -149,6 +176,7 @@ static void fill_balloon(struct virtio_balloon *vb, size_t num)
>  		return;
>  
>  	tell_host(vb, vb->inflate_vq);
> +	mutex_unlock(&vb->balloon_lock);
>  }
>  
>  static void release_pages_by_pfn(const u32 pfns[], unsigned int num)
> @@ -169,21 +197,48 @@ static void leak_balloon(struct virtio_balloon *vb, size_t num)
>  	/* We can only do one array worth at a time. */
>  	num = min(num, ARRAY_SIZE(vb->pfns));
>  
> +	mutex_lock(&vb->balloon_lock);
>  	for (vb->num_pfns = 0; vb->num_pfns < num;
>  	     vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) {
> -		page = list_first_entry(&vb->pages, struct page, lru);
> -		list_del(&page->lru);
> +		/*
> +		 * We can race against 'virtballoon_isolatepage()' and end up
> +		 * stumbling across a _temporarily_ empty 'pages' list.
> +		 */
> +		spin_lock(&vb->pages_lock);
> +		page = list_first_or_null_rcu(&vb->pages, struct page, lru);

Why is list_first_or_null_rcu called outside
RCU critical section here?

> +		if (!page) {
> +			spin_unlock(&vb->pages_lock);
> +			break;
> +		}
> +		/*
> +		 * It is safe now to drop page->mapping and delete this page
> +		 * from balloon page list, since we are grabbing 'pages_lock'
> +		 * which prevents 'virtballoon_isolatepage()' from acting.
> +		 */
> +		clear_balloon_mapping(page);
> +		list_del_rcu(&page->lru);
> +		spin_unlock(&vb->pages_lock);
>  		set_page_pfns(vb->pfns + vb->num_pfns, page);
>  		vb->num_pages -= VIRTIO_BALLOON_PAGES_PER_PAGE;
>  	}
> +	/*
> +	 * Syncrhonize RCU grace period and wait for all RCU read critical side
> +	 * sections to finish before proceeding with page release steps.
> +	 * This avoids compaction/migration callback races against balloon
> +	 * device removal steps.
> +	 */
> +	synchronize_rcu();
>  
>  	/*
>  	 * Note that if
>  	 * virtio_has_feature(vdev, VIRTIO_BALLOON_F_MUST_TELL_HOST);
>  	 * is true, we *have* to do it in this order
>  	 */
> -	tell_host(vb, vb->deflate_vq);
> -	release_pages_by_pfn(vb->pfns, vb->num_pfns);
> +	if (vb->num_pfns > 0) {
> +		tell_host(vb, vb->deflate_vq);
> +		release_pages_by_pfn(vb->pfns, vb->num_pfns);
> +	}
> +	mutex_unlock(&vb->balloon_lock);
>  }
>  
>  static inline void update_stat(struct virtio_balloon *vb, int idx,
> @@ -239,6 +294,7 @@ static void stats_handle_request(struct virtio_balloon *vb)
>  	struct scatterlist sg;
>  	unsigned int len;
>  
> +	mutex_lock(&vb->balloon_lock);
>  	vb->need_stats_update = 0;
>  	update_balloon_stats(vb);
>  
> @@ -249,6 +305,7 @@ static void stats_handle_request(struct virtio_balloon *vb)
>  	if (virtqueue_add_buf(vq, &sg, 1, 0, vb, GFP_KERNEL) < 0)
>  		BUG();
>  	virtqueue_kick(vq);
> +	mutex_unlock(&vb->balloon_lock);
>  }
>  
>  static void virtballoon_changed(struct virtio_device *vdev)
> @@ -261,22 +318,27 @@ static void virtballoon_changed(struct virtio_device *vdev)
>  static inline s64 towards_target(struct virtio_balloon *vb)
>  {
>  	__le32 v;
> -	s64 target;
> +	s64 target, actual;
>  
> +	mutex_lock(&vb->balloon_lock);
> +	actual = vb->num_pages;
>  	vb->vdev->config->get(vb->vdev,
>  			      offsetof(struct virtio_balloon_config, num_pages),
>  			      &v, sizeof(v));
>  	target = le32_to_cpu(v);
> -	return target - vb->num_pages;
> +	mutex_unlock(&vb->balloon_lock);
> +	return target - actual;
>  }
>  
>  static void update_balloon_size(struct virtio_balloon *vb)
>  {
> -	__le32 actual = cpu_to_le32(vb->num_pages);
> -
> +	__le32 actual;
> +	mutex_lock(&vb->balloon_lock);
> +	actual = cpu_to_le32(vb->num_pages);
>  	vb->vdev->config->set(vb->vdev,
>  			      offsetof(struct virtio_balloon_config, actual),
>  			      &actual, sizeof(actual));
> +	mutex_unlock(&vb->balloon_lock);
>  }
>  
>  static int balloon(void *_vballoon)
> @@ -339,9 +401,117 @@ static int init_vqs(struct virtio_balloon *vb)
>  	return 0;
>  }
>  
> +#ifdef CONFIG_BALLOON_COMPACTION
> +/*
> + * Populate balloon_mapping->a_ops callback method to perform the balloon
> + * page migration task.
> + *
> + * After a ballooned page gets isolated by compaction procedures, this is the
> + * function that performs the page migration on behalf of move_to_new_page(),
> + * when the last calls (page)->mapping->a_ops->migratepage.
> + *
> + * Page migration for virtio balloon is done in a simple swap fashion which
> + * follows these two steps:
> + *  1) insert newpage into vb->pages list and update the host about it;
> + *  2) update the host about the removed old page from vb->pages list;
> + */
> +int virtballoon_migratepage(struct address_space *mapping,
> +		struct page *newpage, struct page *page, enum migrate_mode mode)
> +{
> +	struct virtio_balloon *vb = (void *)__page_balloon_device(page);

Don't cast to void *.

> +
> +	/* at this point, besides very unlikely, a NULL *vb is a serious bug */

A bit of a strange comment. Either it's unlikely or a bug :)
Besides, vb is checked for NULL value, not *vb.

> +	BUG_ON(!vb);
> +
> +	/*
> +	 * Skip page migration step if the memory balloon device is under its
> +	 * removal procedure, to avoid racing against module unload.
> +	 *

What kind of race does this fix? Why does not
leaking all pages fix it?

> +	 * If there still are isolated (balloon) pages under migration lists,
> +	 * 'virtballoon_putbackpage()' will take care of them properly, before
> +	 * the module unload finishes.
> +	 */
> +	if (vb->balloon_status == BALLOON_REMOVAL)
> +		return -EAGAIN;
> +
> +	mutex_lock(&vb->balloon_lock);
> +
> +	/* balloon's page migration 1st step */
> +	vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE;
> +	spin_lock(&vb->pages_lock);
> +	list_add_rcu(&newpage->lru, &vb->pages);
> +	spin_unlock(&vb->pages_lock);
> +	set_page_pfns(vb->pfns, newpage);
> +	tell_host(vb, vb->inflate_vq);
> +
> +	/* balloon's page migration 2nd step */
> +	vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE;
> +	set_page_pfns(vb->pfns, page);
> +	tell_host(vb, vb->deflate_vq);
> +
> +	mutex_unlock(&vb->balloon_lock);
> +	return 0;
> +}
> +
> +/*
> + * Populate balloon_mapping->a_ops callback method to help compaction on
> + * isolating a page from the balloon page list for posterior migration.
> + */
> +int virtballoon_isolatepage(struct page *page)
> +{
> +	struct virtio_balloon *vb = (void *)__page_balloon_device(page);

Cast here looks wrong.

> +	int ret = 0;
> +	/*
> +	 * If we stumble across a NULL *vb here, it means this page has been
> +	 * already released by 'leak_balloon()'.
> +	 *
> +	 * We also skip the page isolation step if the memory balloon device is
> +	 * under its removal procedure, to avoid racing against module unload.
> +	 */

What kind of race do you have in mind here?
Doesn't leaking all pages in module removal address it?

> +	if (vb


How can vb be NULL? Pls document.

> && (vb->balloon_status != BALLOON_REMOVAL)) {

Read of balloon_status needs some kind of memory barrier I think.

> +		spin_lock(&vb->pages_lock);
> +		/*
> +		 * virtballoon_isolatepage() can race against leak_balloon(),
> +		 * and (wrongly) isolate a page that is about to be freed.
> +		 * Test page->mapping under pages_lock to close that window.
> +		 */
> +		if (rcu_access_pointer(page->mapping) == vb->mapping) {
> +			/* It is safe to isolate this page, now */
> +			list_del_rcu(&page->lru);
> +			ret = 1;
> +		}
> +		spin_unlock(&vb->pages_lock);
> +	}
> +	return ret;
> +}
> +
> +/*
> + * Populate balloon_mapping->a_ops callback method to help compaction on
> + * re-inserting a not-migrated isolated page into the balloon page list.
> + */
> +void virtballoon_putbackpage(struct page *page)
> +{
> +	struct virtio_balloon *vb = (void *)__page_balloon_device(page);
> +
> +	/* at this point, besides very unlikely, a NULL *vb is a serious bug */

A bit of a strange comment. Either it's unlikely or a bug :)
Besides, vb is checked for NULL value, not *vb.

> +	BUG_ON(!vb);
> +
> +	spin_lock(&vb->pages_lock);
> +	list_add_rcu(&page->lru, &vb->pages);
> +	spin_unlock(&vb->pages_lock);
> +}
> +#endif /* CONFIG_BALLOON_COMPACTION */
> +
> +/* define the balloon_mapping->a_ops callbacks to allow compaction/migration */
> +static DEFINE_BALLOON_MAPPING_AOPS(virtio_balloon_aops,
> +				   virtballoon_migratepage,
> +				   virtballoon_isolatepage,
> +				   virtballoon_putbackpage);
> +
>  static int virtballoon_probe(struct virtio_device *vdev)
>  {
>  	struct virtio_balloon *vb;
> +	struct address_space *vb_mapping;
>  	int err;
>  
>  	vdev->priv = vb = kmalloc(sizeof(*vb), GFP_KERNEL);
> @@ -351,12 +521,24 @@ static int virtballoon_probe(struct virtio_device *vdev)
>  	}
>  
>  	INIT_LIST_HEAD(&vb->pages);
> +	spin_lock_init(&vb->pages_lock);
> +	mutex_init(&vb->balloon_lock);
> +
>  	vb->num_pages = 0;
>  	init_waitqueue_head(&vb->config_change);
>  	init_waitqueue_head(&vb->acked);
>  	vb->vdev = vdev;
>  	vb->need_stats_update = 0;
>  
> +	/* Allocate a special page->mapping for this balloon device */

This comment is not helpful, pls remove.

> +	vb_mapping = alloc_balloon_mapping((void *)vb, &virtio_balloon_aops);

Cast to void * is not needed.

> +	if (!vb_mapping) {
> +		err = -ENOMEM;
> +		goto out_free_vb;
> +	}
> +	/* Store the page->mapping reference for this balloon device */
> +	vb->mapping = vb_mapping;

This comment is not helpful, pls remove.

> +
>  	err = init_vqs(vb);
>  	if (err)
>  		goto out_free_vb;
> @@ -367,12 +549,14 @@ static int virtballoon_probe(struct virtio_device *vdev)
>  		goto out_del_vqs;
>  	}
>  
> +	vb->balloon_status = BALLOON_OK;
>  	return 0;
>  
>  out_del_vqs:
>  	vdev->config->del_vqs(vdev);
>  out_free_vb:
>  	kfree(vb);
> +	kfree(vb_mapping);

I think it's better to wrap free for vb mapping too.

>  out:
>  	return err;
>  }
> @@ -394,8 +578,10 @@ static void __devexit virtballoon_remove(struct virtio_device *vdev)
>  {
>  	struct virtio_balloon *vb = vdev->priv;
>  
> +	vb->balloon_status = BALLOON_REMOVAL;

This needs some kind of barrier.

>  	kthread_stop(vb->thread);
>  	remove_common(vb);
> +	kfree(vb->mapping);
>  	kfree(vb);
>  }
>  
> @@ -409,6 +595,7 @@ static int virtballoon_freeze(struct virtio_device *vdev)
>  	 * function is called.
>  	 */
>  
> +	vb->balloon_status = BALLOON_REMOVAL;

Here module is not going away so why change status?

>  	remove_common(vb);
>  	return 0;
>  }
> @@ -424,6 +611,7 @@ static int virtballoon_restore(struct virtio_device *vdev)
>  
>  	fill_balloon(vb, towards_target(vb));
>  	update_balloon_size(vb);
> +	vb->balloon_status = BALLOON_OK;

Isn't this too late to set status here? We filled the balloon ...


>  	return 0;
>  }
>  #endif
> -- 
> 1.7.11.4
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ