lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20190205153607-mutt-send-email-mst@kernel.org>
Date:   Tue, 5 Feb 2019 15:45:14 -0500
From:   "Michael S. Tsirkin" <mst@...hat.com>
To:     Nitesh Narayan Lal <nitesh@...hat.com>
Cc:     kvm@...r.kernel.org, linux-kernel@...r.kernel.org,
        pbonzini@...hat.com, lcapitulino@...hat.com, pagupta@...hat.com,
        wei.w.wang@...el.com, yang.zhang.wz@...il.com, riel@...riel.com,
        david@...hat.com, dodgen@...gle.com, konrad.wilk@...cle.com,
        dhildenb@...hat.com, aarcange@...hat.com
Subject: Re: [RFC][Patch v8 6/7] KVM: Enables the kernel to isolate and
 report free pages

On Mon, Feb 04, 2019 at 03:18:53PM -0500, Nitesh Narayan Lal wrote:
> This patch enables the kernel to scan the per cpu array and
> compress it by removing the repetitive/re-allocated pages.
> Once the per cpu array is completely filled with pages in the
> buddy it wakes up the kernel per cpu thread which re-scans the
> entire per cpu array by acquiring a zone lock corresponding to
> the page which is being scanned. If the page is still free and
> present in the buddy it tries to isolate the page and adds it
> to another per cpu array.
> 
> Once this scanning process is complete and if there are any
> isolated pages added to the new per cpu array kernel thread
> invokes hyperlist_ready().
> 
> In hyperlist_ready() a hypercall is made to report these pages to
> the host using the virtio-balloon framework. In order to do so
> another virtqueue 'hinting_vq' is added to the balloon framework.
> As the host frees all the reported pages, the kernel thread returns
> them back to the buddy.
> 
> Signed-off-by: Nitesh Narayan Lal <nitesh@...hat.com>


This looks kind of like what early iterations of Wei's patches did.

But this has lots of issues, for example you might end up with
a hypercall per 4K page.

So in the end, he switched over to reporting only
MAX_ORDER - 1 pages.

Would that be a good idea for you too?

An alternative would be a different much lighter weight
way to report these pages and to free them on the host.

> ---
>  drivers/virtio/virtio_balloon.c     |  56 +++++++-
>  include/linux/page_hinting.h        |  18 ++-
>  include/uapi/linux/virtio_balloon.h |   1 +
>  mm/page_alloc.c                     |   2 +-
>  virt/kvm/page_hinting.c             | 202 +++++++++++++++++++++++++++-
>  5 files changed, 269 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
> index 728ecd1eea30..8af34e0b9a32 100644
> --- a/drivers/virtio/virtio_balloon.c
> +++ b/drivers/virtio/virtio_balloon.c
> @@ -57,13 +57,15 @@ enum virtio_balloon_vq {
>  	VIRTIO_BALLOON_VQ_INFLATE,
>  	VIRTIO_BALLOON_VQ_DEFLATE,
>  	VIRTIO_BALLOON_VQ_STATS,
> +	VIRTIO_BALLOON_VQ_HINTING,
>  	VIRTIO_BALLOON_VQ_FREE_PAGE,
>  	VIRTIO_BALLOON_VQ_MAX
>  };
>  
>  struct virtio_balloon {
>  	struct virtio_device *vdev;
> -	struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq;
> +	struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq,
> +								*hinting_vq;
>  
>  	/* Balloon's own wq for cpu-intensive work items */
>  	struct workqueue_struct *balloon_wq;
> @@ -122,6 +124,40 @@ static struct virtio_device_id id_table[] = {
>  	{ 0 },
>  };
>  
> +#ifdef CONFIG_KVM_FREE_PAGE_HINTING
> +void virtballoon_page_hinting(struct virtio_balloon *vb, u64 gvaddr,
> +			      int hyper_entries)
> +{
> +	u64 gpaddr = virt_to_phys((void *)gvaddr);
> +
> +	virtqueue_add_desc(vb->hinting_vq, gpaddr, hyper_entries, 0);
> +	virtqueue_kick_sync(vb->hinting_vq);
> +}
> +
> +static void hinting_ack(struct virtqueue *vq)
> +{
> +	struct virtio_balloon *vb = vq->vdev->priv;
> +
> +	wake_up(&vb->acked);
> +}
> +
> +static void enable_hinting(struct virtio_balloon *vb)
> +{
> +	guest_page_hinting_flag = 1;
> +	static_branch_enable(&guest_page_hinting_key);
> +	request_hypercall = (void *)&virtballoon_page_hinting;
> +	balloon_ptr = vb;
> +	WARN_ON(smpboot_register_percpu_thread(&hinting_threads));
> +}
> +
> +static void disable_hinting(void)
> +{
> +	guest_page_hinting_flag = 0;
> +	static_branch_enable(&guest_page_hinting_key);
> +	balloon_ptr = NULL;
> +}
> +#endif
> +
>  static u32 page_to_balloon_pfn(struct page *page)
>  {
>  	unsigned long pfn = page_to_pfn(page);
> @@ -481,6 +517,7 @@ static int init_vqs(struct virtio_balloon *vb)
>  	names[VIRTIO_BALLOON_VQ_DEFLATE] = "deflate";
>  	names[VIRTIO_BALLOON_VQ_STATS] = NULL;
>  	names[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
> +	names[VIRTIO_BALLOON_VQ_HINTING] = NULL;
>  
>  	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
>  		names[VIRTIO_BALLOON_VQ_STATS] = "stats";
> @@ -492,11 +529,18 @@ static int init_vqs(struct virtio_balloon *vb)
>  		callbacks[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
>  	}
>  
> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HINTING)) {
> +		names[VIRTIO_BALLOON_VQ_HINTING] = "hinting_vq";
> +		callbacks[VIRTIO_BALLOON_VQ_HINTING] = hinting_ack;
> +	}
>  	err = vb->vdev->config->find_vqs(vb->vdev, VIRTIO_BALLOON_VQ_MAX,
>  					 vqs, callbacks, names, NULL, NULL);
>  	if (err)
>  		return err;
>  
> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HINTING))
> +		vb->hinting_vq = vqs[VIRTIO_BALLOON_VQ_HINTING];
> +
>  	vb->inflate_vq = vqs[VIRTIO_BALLOON_VQ_INFLATE];
>  	vb->deflate_vq = vqs[VIRTIO_BALLOON_VQ_DEFLATE];
>  	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
> @@ -908,6 +952,11 @@ static int virtballoon_probe(struct virtio_device *vdev)
>  		if (err)
>  			goto out_del_balloon_wq;
>  	}
> +
> +#ifdef CONFIG_KVM_FREE_PAGE_HINTING
> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HINTING))
> +		enable_hinting(vb);
> +#endif
>  	virtio_device_ready(vdev);
>  
>  	if (towards_target(vb))
> @@ -950,6 +999,10 @@ static void virtballoon_remove(struct virtio_device *vdev)
>  	cancel_work_sync(&vb->update_balloon_size_work);
>  	cancel_work_sync(&vb->update_balloon_stats_work);
>  
> +#ifdef CONFIG_KVM_FREE_PAGE_HINTING
> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HINTING))
> +		disable_hinting();
> +#endif
>  	if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
>  		cancel_work_sync(&vb->report_free_page_work);
>  		destroy_workqueue(vb->balloon_wq);
> @@ -1009,6 +1062,7 @@ static unsigned int features[] = {
>  	VIRTIO_BALLOON_F_MUST_TELL_HOST,
>  	VIRTIO_BALLOON_F_STATS_VQ,
>  	VIRTIO_BALLOON_F_DEFLATE_ON_OOM,
> +	VIRTIO_BALLOON_F_HINTING,
>  	VIRTIO_BALLOON_F_FREE_PAGE_HINT,
>  	VIRTIO_BALLOON_F_PAGE_POISON,
>  };
> diff --git a/include/linux/page_hinting.h b/include/linux/page_hinting.h
> index e800c6b07561..3ba8c1f3b4a4 100644
> --- a/include/linux/page_hinting.h
> +++ b/include/linux/page_hinting.h
> @@ -1,15 +1,12 @@
>  #include <linux/smpboot.h>
>  
> -/*
> - * Size of the array which is used to store the freed pages is defined by
> - * MAX_FGPT_ENTRIES. If possible, we have to find a better way using which
> - * we can get rid of the hardcoded array size.
> - */
>  #define MAX_FGPT_ENTRIES	1000
>  /*
>   * hypervisor_pages - It is a dummy structure passed with the hypercall.
> - * @pfn: page frame number for the page which needs to be sent to the host.
> - * @order: order of the page needs to be reported to the host.
> + * @pfn - page frame number for the page which is to be freed.
> + * @pages - number of pages which are supposed to be freed.
> + * A global array object is used to hold the list of pfn and pages and is
> + * passed as part of the hypercall.
>   */
>  struct hypervisor_pages {
>  	unsigned long pfn;
> @@ -19,11 +16,18 @@ struct hypervisor_pages {
>  extern int guest_page_hinting_flag;
>  extern struct static_key_false guest_page_hinting_key;
>  extern struct smp_hotplug_thread hinting_threads;
> +extern void (*request_hypercall)(void *, u64, int);
> +extern void *balloon_ptr;
>  extern bool want_page_poisoning;
>  
>  int guest_page_hinting_sysctl(struct ctl_table *table, int write,
>  			      void __user *buffer, size_t *lenp, loff_t *ppos);
>  void guest_free_page(struct page *page, int order);
> +extern int __isolate_free_page(struct page *page, unsigned int order);
> +extern void free_one_page(struct zone *zone,
> +			  struct page *page, unsigned long pfn,
> +			  unsigned int order,
> +			  int migratetype);
>  
>  static inline void disable_page_poisoning(void)
>  {

I guess you will want to put this in some other header.  Function
declarations belong close to where they are implemented, not used.

> diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
> index a1966cd7b677..2b0f62814e22 100644
> --- a/include/uapi/linux/virtio_balloon.h
> +++ b/include/uapi/linux/virtio_balloon.h
> @@ -36,6 +36,7 @@
>  #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM	2 /* Deflate balloon on OOM */
>  #define VIRTIO_BALLOON_F_FREE_PAGE_HINT	3 /* VQ to report free pages */
>  #define VIRTIO_BALLOON_F_PAGE_POISON	4 /* Guest is using page poisoning */
> +#define VIRTIO_BALLOON_F_HINTING	5 /* Page hinting virtqueue */
>  
>  /* Size of a PFN in the balloon interface. */
>  #define VIRTIO_BALLOON_PFN_SHIFT 12
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index d295c9bc01a8..93224cba9243 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1199,7 +1199,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
>  	spin_unlock(&zone->lock);
>  }
>  
> -static void free_one_page(struct zone *zone,
> +void free_one_page(struct zone *zone,
>  				struct page *page, unsigned long pfn,
>  				unsigned int order,
>  				int migratetype)
> diff --git a/virt/kvm/page_hinting.c b/virt/kvm/page_hinting.c
> index be529f6f2bc0..315099fcda43 100644
> --- a/virt/kvm/page_hinting.c
> +++ b/virt/kvm/page_hinting.c
> @@ -1,6 +1,8 @@
>  #include <linux/gfp.h>
>  #include <linux/mm.h>
> +#include <linux/page_ref.h>
>  #include <linux/kvm_host.h>
> +#include <linux/sort.h>
>  #include <linux/kernel.h>
>  
>  /*
> @@ -39,6 +41,11 @@ int guest_page_hinting_flag;
>  EXPORT_SYMBOL(guest_page_hinting_flag);
>  static DEFINE_PER_CPU(struct task_struct *, hinting_task);
>  
> +void (*request_hypercall)(void *, u64, int);
> +EXPORT_SYMBOL(request_hypercall);
> +void *balloon_ptr;
> +EXPORT_SYMBOL(balloon_ptr);
> +
>  int guest_page_hinting_sysctl(struct ctl_table *table, int write,
>  			      void __user *buffer, size_t *lenp,
>  			      loff_t *ppos)
> @@ -55,18 +62,201 @@ int guest_page_hinting_sysctl(struct ctl_table *table, int write,
>  	return ret;
>  }
>  
> +void hyperlist_ready(struct hypervisor_pages *guest_isolated_pages, int entries)
> +{
> +	int i = 0;
> +	int mt = 0;
> +
> +	if (balloon_ptr)
> +		request_hypercall(balloon_ptr, (u64)&guest_isolated_pages[0],
> +				  entries);
> +
> +	while (i < entries) {
> +		struct page *page = pfn_to_page(guest_isolated_pages[i].pfn);
> +
> +		mt = get_pageblock_migratetype(page);
> +		free_one_page(page_zone(page), page, page_to_pfn(page),
> +			      guest_isolated_pages[i].order, mt);
> +		i++;
> +	}
> +}
> +
> +struct page *get_buddy_page(struct page *page)
> +{
> +	unsigned long pfn = page_to_pfn(page);
> +	unsigned int order;
> +
> +	for (order = 0; order < MAX_ORDER; order++) {
> +		struct page *page_head = page - (pfn & ((1 << order) - 1));
> +
> +		if (PageBuddy(page_head) && page_private(page_head) >= order)
> +			return page_head;
> +	}
> +	return NULL;
> +}
> +
>  static void hinting_fn(unsigned int cpu)
>  {
>  	struct page_hinting *page_hinting_obj = this_cpu_ptr(&hinting_obj);
> +	int idx = 0, ret = 0;
> +	struct zone *zone_cur;
> +	unsigned long flags = 0;
> +
> +	while (idx < MAX_FGPT_ENTRIES) {
> +		unsigned long pfn = page_hinting_obj->kvm_pt[idx].pfn;
> +		unsigned long pfn_end = page_hinting_obj->kvm_pt[idx].pfn +
> +			(1 << page_hinting_obj->kvm_pt[idx].order) - 1;
> +
> +		while (pfn <= pfn_end) {
> +			struct page *page = pfn_to_page(pfn);
> +			struct page *buddy_page = NULL;
> +
> +			zone_cur = page_zone(page);
> +			spin_lock_irqsave(&zone_cur->lock, flags);
> +
> +			if (PageCompound(page)) {
> +				struct page *head_page = compound_head(page);
> +				unsigned long head_pfn = page_to_pfn(head_page);
> +				unsigned int alloc_pages =
> +					1 << compound_order(head_page);
> +
> +				pfn = head_pfn + alloc_pages;
> +				spin_unlock_irqrestore(&zone_cur->lock, flags);
> +				continue;
> +			}
> +
> +			if (page_ref_count(page)) {
> +				pfn++;
> +				spin_unlock_irqrestore(&zone_cur->lock, flags);
> +				continue;
> +			}
> +
> +			if (PageBuddy(page)) {
> +				int buddy_order = page_private(page);
>  
> +				ret = __isolate_free_page(page, buddy_order);
> +				if (!ret) {
> +				} else {
> +					int l_idx = page_hinting_obj->hyp_idx;
> +					struct hypervisor_pages *l_obj =
> +					page_hinting_obj->hypervisor_pagelist;
> +
> +					l_obj[l_idx].pfn = pfn;
> +					l_obj[l_idx].order = buddy_order;
> +					page_hinting_obj->hyp_idx += 1;
> +				}
> +				pfn = pfn + (1 << buddy_order);
> +				spin_unlock_irqrestore(&zone_cur->lock, flags);
> +				continue;
> +			}
> +
> +			buddy_page = get_buddy_page(page);
> +			if (buddy_page) {
> +				int buddy_order = page_private(buddy_page);
> +
> +				ret = __isolate_free_page(buddy_page,
> +							  buddy_order);
> +				if (!ret) {
> +				} else {
> +					int l_idx = page_hinting_obj->hyp_idx;
> +					struct hypervisor_pages *l_obj =
> +					page_hinting_obj->hypervisor_pagelist;
> +					unsigned long buddy_pfn =
> +						page_to_pfn(buddy_page);
> +
> +					l_obj[l_idx].pfn = buddy_pfn;
> +					l_obj[l_idx].order = buddy_order;
> +					page_hinting_obj->hyp_idx += 1;
> +				}
> +				pfn = page_to_pfn(buddy_page) +
> +					(1 << buddy_order);
> +				spin_unlock_irqrestore(&zone_cur->lock, flags);
> +				continue;
> +			}
> +			spin_unlock_irqrestore(&zone_cur->lock, flags);
> +			pfn++;
> +		}
> +		page_hinting_obj->kvm_pt[idx].pfn = 0;
> +		page_hinting_obj->kvm_pt[idx].order = -1;
> +		page_hinting_obj->kvm_pt[idx].zonenum = -1;
> +		idx++;
> +	}
> +	if (page_hinting_obj->hyp_idx > 0) {
> +		hyperlist_ready(page_hinting_obj->hypervisor_pagelist,
> +				page_hinting_obj->hyp_idx);
> +		page_hinting_obj->hyp_idx = 0;
> +	}
>  	page_hinting_obj->kvm_pt_idx = 0;
>  	put_cpu_var(hinting_obj);
>  }
>  
> +int if_exist(struct page *page)
> +{
> +	int i = 0;
> +	struct page_hinting *page_hinting_obj = this_cpu_ptr(&hinting_obj);
> +
> +	while (i < MAX_FGPT_ENTRIES) {
> +		if (page_to_pfn(page) == page_hinting_obj->kvm_pt[i].pfn)
> +			return 1;
> +		i++;
> +	}
> +	return 0;
> +}
> +
> +void pack_array(void)
> +{
> +	int i = 0, j = 0;
> +	struct page_hinting *page_hinting_obj = this_cpu_ptr(&hinting_obj);
> +
> +	while (i < MAX_FGPT_ENTRIES) {
> +		if (page_hinting_obj->kvm_pt[i].pfn != 0) {
> +			if (i != j) {
> +				page_hinting_obj->kvm_pt[j].pfn =
> +					page_hinting_obj->kvm_pt[i].pfn;
> +				page_hinting_obj->kvm_pt[j].order =
> +					page_hinting_obj->kvm_pt[i].order;
> +				page_hinting_obj->kvm_pt[j].zonenum =
> +					page_hinting_obj->kvm_pt[i].zonenum;
> +			}
> +			j++;
> +		}
> +		i++;
> +	}
> +	i = j;
> +	page_hinting_obj->kvm_pt_idx = j;
> +	while (j < MAX_FGPT_ENTRIES) {
> +		page_hinting_obj->kvm_pt[j].pfn = 0;
> +		page_hinting_obj->kvm_pt[j].order = -1;
> +		page_hinting_obj->kvm_pt[j].zonenum = -1;
> +		j++;
> +	}
> +}
> +
>  void scan_array(void)
>  {
>  	struct page_hinting *page_hinting_obj = this_cpu_ptr(&hinting_obj);
> +	int i = 0;
>  
> +	while (i < MAX_FGPT_ENTRIES) {
> +		struct page *page =
> +			pfn_to_page(page_hinting_obj->kvm_pt[i].pfn);
> +		struct page *buddy_page = get_buddy_page(page);
> +
> +		if (!PageBuddy(page) && buddy_page) {
> +			if (if_exist(buddy_page)) {
> +				page_hinting_obj->kvm_pt[i].pfn = 0;
> +				page_hinting_obj->kvm_pt[i].order = -1;
> +				page_hinting_obj->kvm_pt[i].zonenum = -1;
> +			} else {
> +				page_hinting_obj->kvm_pt[i].pfn =
> +					page_to_pfn(buddy_page);
> +				page_hinting_obj->kvm_pt[i].order =
> +					page_private(buddy_page);
> +			}
> +		}
> +		i++;
> +	}
> +	pack_array();
>  	if (page_hinting_obj->kvm_pt_idx == MAX_FGPT_ENTRIES)
>  		wake_up_process(__this_cpu_read(hinting_task));
>  }
> @@ -111,8 +301,18 @@ void guest_free_page(struct page *page, int order)
>  		page_hinting_obj->kvm_pt[page_hinting_obj->kvm_pt_idx].order =
>  							order;
>  		page_hinting_obj->kvm_pt_idx += 1;
> -		if (page_hinting_obj->kvm_pt_idx == MAX_FGPT_ENTRIES)
> +		if (page_hinting_obj->kvm_pt_idx == MAX_FGPT_ENTRIES) {
> +			/*
> +			 * We are depending on the buddy free-list to identify
> +			 * if a page is free or not. Hence, we are dumping all
> +			 * the per-cpu pages back into the buddy allocator. This
> +			 * will ensure less failures when we try to isolate free
> +			 * captured pages and hence more memory reporting to the
> +			 * host.
> +			 */
> +			drain_local_pages(NULL);
>  			scan_array();
> +		}
>  	}
>  	local_irq_restore(flags);
>  }
> -- 
> 2.17.2

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ