lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:   Tue, 5 Feb 2019 16:54:03 -0500
From:   Nitesh Narayan Lal <nitesh@...hat.com>
To:     "Michael S. Tsirkin" <mst@...hat.com>
Cc:     kvm@...r.kernel.org, linux-kernel@...r.kernel.org,
        pbonzini@...hat.com, lcapitulino@...hat.com, pagupta@...hat.com,
        wei.w.wang@...el.com, yang.zhang.wz@...il.com, riel@...riel.com,
        david@...hat.com, dodgen@...gle.com, konrad.wilk@...cle.com,
        dhildenb@...hat.com, aarcange@...hat.com
Subject: Re: [RFC][Patch v8 6/7] KVM: Enables the kernel to isolate and report
 free pages


On 2/5/19 3:45 PM, Michael S. Tsirkin wrote:
> On Mon, Feb 04, 2019 at 03:18:53PM -0500, Nitesh Narayan Lal wrote:
>> This patch enables the kernel to scan the per cpu array and
>> compress it by removing the repetitive/re-allocated pages.
>> Once the per cpu array is completely filled with pages in the
>> buddy it wakes up the kernel per cpu thread which re-scans the
>> entire per cpu array by acquiring a zone lock corresponding to
>> the page which is being scanned. If the page is still free and
>> present in the buddy it tries to isolate the page and adds it
>> to another per cpu array.
>>
>> Once this scanning process is complete and if there are any
>> isolated pages added to the new per cpu array kernel thread
>> invokes hyperlist_ready().
>>
>> In hyperlist_ready() a hypercall is made to report these pages to
>> the host using the virtio-balloon framework. In order to do so
>> another virtqueue 'hinting_vq' is added to the balloon framework.
>> As the host frees all the reported pages, the kernel thread returns
>> them back to the buddy.
>>
>> Signed-off-by: Nitesh Narayan Lal <nitesh@...hat.com>
>
> This looks kind of like what early iterations of Wei's patches did.
>
> But this has lots of issues, for example you might end up with
> a hypercall per a 4K page.
> So in the end, he switched over to just reporting only
> MAX_ORDER - 1 pages.
You mean that I should only capture/attempt to isolate pages with order
MAX_ORDER - 1?
>
> Would that be a good idea for you too?
Will it help if we have a threshold value based on the amount of memory
captured instead of the number of entries/pages in the array?
>
> An alternative would be a different much lighter weight
> way to report these pages and to free them on the host.
>
>> ---
>>  drivers/virtio/virtio_balloon.c     |  56 +++++++-
>>  include/linux/page_hinting.h        |  18 ++-
>>  include/uapi/linux/virtio_balloon.h |   1 +
>>  mm/page_alloc.c                     |   2 +-
>>  virt/kvm/page_hinting.c             | 202 +++++++++++++++++++++++++++-
>>  5 files changed, 269 insertions(+), 10 deletions(-)
>>
>> diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
>> index 728ecd1eea30..8af34e0b9a32 100644
>> --- a/drivers/virtio/virtio_balloon.c
>> +++ b/drivers/virtio/virtio_balloon.c
>> @@ -57,13 +57,15 @@ enum virtio_balloon_vq {
>>  	VIRTIO_BALLOON_VQ_INFLATE,
>>  	VIRTIO_BALLOON_VQ_DEFLATE,
>>  	VIRTIO_BALLOON_VQ_STATS,
>> +	VIRTIO_BALLOON_VQ_HINTING,
>>  	VIRTIO_BALLOON_VQ_FREE_PAGE,
>>  	VIRTIO_BALLOON_VQ_MAX
>>  };
>>  
>>  struct virtio_balloon {
>>  	struct virtio_device *vdev;
>> -	struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq;
>> +	struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq,
>> +								*hinting_vq;
>>  
>>  	/* Balloon's own wq for cpu-intensive work items */
>>  	struct workqueue_struct *balloon_wq;
>> @@ -122,6 +124,40 @@ static struct virtio_device_id id_table[] = {
>>  	{ 0 },
>>  };
>>  
>> +#ifdef CONFIG_KVM_FREE_PAGE_HINTING
>> +void virtballoon_page_hinting(struct virtio_balloon *vb, u64 gvaddr,
>> +			      int hyper_entries)
>> +{
>> +	u64 gpaddr = virt_to_phys((void *)gvaddr);
>> +
>> +	virtqueue_add_desc(vb->hinting_vq, gpaddr, hyper_entries, 0);
>> +	virtqueue_kick_sync(vb->hinting_vq);
>> +}
>> +
>> +static void hinting_ack(struct virtqueue *vq)
>> +{
>> +	struct virtio_balloon *vb = vq->vdev->priv;
>> +
>> +	wake_up(&vb->acked);
>> +}
>> +
>> +static void enable_hinting(struct virtio_balloon *vb)
>> +{
>> +	guest_page_hinting_flag = 1;
>> +	static_branch_enable(&guest_page_hinting_key);
>> +	request_hypercall = (void *)&virtballoon_page_hinting;
>> +	balloon_ptr = vb;
>> +	WARN_ON(smpboot_register_percpu_thread(&hinting_threads));
>> +}
>> +
>> +static void disable_hinting(void)
>> +{
>> +	guest_page_hinting_flag = 0;
>> +	static_branch_enable(&guest_page_hinting_key);
>> +	balloon_ptr = NULL;
>> +}
>> +#endif
>> +
>>  static u32 page_to_balloon_pfn(struct page *page)
>>  {
>>  	unsigned long pfn = page_to_pfn(page);
>> @@ -481,6 +517,7 @@ static int init_vqs(struct virtio_balloon *vb)
>>  	names[VIRTIO_BALLOON_VQ_DEFLATE] = "deflate";
>>  	names[VIRTIO_BALLOON_VQ_STATS] = NULL;
>>  	names[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
>> +	names[VIRTIO_BALLOON_VQ_HINTING] = NULL;
>>  
>>  	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
>>  		names[VIRTIO_BALLOON_VQ_STATS] = "stats";
>> @@ -492,11 +529,18 @@ static int init_vqs(struct virtio_balloon *vb)
>>  		callbacks[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
>>  	}
>>  
>> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HINTING)) {
>> +		names[VIRTIO_BALLOON_VQ_HINTING] = "hinting_vq";
>> +		callbacks[VIRTIO_BALLOON_VQ_HINTING] = hinting_ack;
>> +	}
>>  	err = vb->vdev->config->find_vqs(vb->vdev, VIRTIO_BALLOON_VQ_MAX,
>>  					 vqs, callbacks, names, NULL, NULL);
>>  	if (err)
>>  		return err;
>>  
>> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HINTING))
>> +		vb->hinting_vq = vqs[VIRTIO_BALLOON_VQ_HINTING];
>> +
>>  	vb->inflate_vq = vqs[VIRTIO_BALLOON_VQ_INFLATE];
>>  	vb->deflate_vq = vqs[VIRTIO_BALLOON_VQ_DEFLATE];
>>  	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
>> @@ -908,6 +952,11 @@ static int virtballoon_probe(struct virtio_device *vdev)
>>  		if (err)
>>  			goto out_del_balloon_wq;
>>  	}
>> +
>> +#ifdef CONFIG_KVM_FREE_PAGE_HINTING
>> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HINTING))
>> +		enable_hinting(vb);
>> +#endif
>>  	virtio_device_ready(vdev);
>>  
>>  	if (towards_target(vb))
>> @@ -950,6 +999,10 @@ static void virtballoon_remove(struct virtio_device *vdev)
>>  	cancel_work_sync(&vb->update_balloon_size_work);
>>  	cancel_work_sync(&vb->update_balloon_stats_work);
>>  
>> +#ifdef CONFIG_KVM_FREE_PAGE_HINTING
>> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HINTING))
>> +		disable_hinting();
>> +#endif
>>  	if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
>>  		cancel_work_sync(&vb->report_free_page_work);
>>  		destroy_workqueue(vb->balloon_wq);
>> @@ -1009,6 +1062,7 @@ static unsigned int features[] = {
>>  	VIRTIO_BALLOON_F_MUST_TELL_HOST,
>>  	VIRTIO_BALLOON_F_STATS_VQ,
>>  	VIRTIO_BALLOON_F_DEFLATE_ON_OOM,
>> +	VIRTIO_BALLOON_F_HINTING,
>>  	VIRTIO_BALLOON_F_FREE_PAGE_HINT,
>>  	VIRTIO_BALLOON_F_PAGE_POISON,
>>  };
>> diff --git a/include/linux/page_hinting.h b/include/linux/page_hinting.h
>> index e800c6b07561..3ba8c1f3b4a4 100644
>> --- a/include/linux/page_hinting.h
>> +++ b/include/linux/page_hinting.h
>> @@ -1,15 +1,12 @@
>>  #include <linux/smpboot.h>
>>  
>> -/*
>> - * Size of the array which is used to store the freed pages is defined by
>> - * MAX_FGPT_ENTRIES. If possible, we have to find a better way using which
>> - * we can get rid of the hardcoded array size.
>> - */
>>  #define MAX_FGPT_ENTRIES	1000
>>  /*
>>   * hypervisor_pages - It is a dummy structure passed with the hypercall.
>> - * @pfn: page frame number for the page which needs to be sent to the host.
>> - * @order: order of the page needs to be reported to the host.
>> + * @pfn - page frame number for the page which is to be freed.
>> + * @pages - number of pages which are supposed to be freed.
>> + * A global array object is used to to hold the list of pfn and pages and is
>> + * passed as part of the hypercall.
>>   */
>>  struct hypervisor_pages {
>>  	unsigned long pfn;
>> @@ -19,11 +16,18 @@ struct hypervisor_pages {
>>  extern int guest_page_hinting_flag;
>>  extern struct static_key_false guest_page_hinting_key;
>>  extern struct smp_hotplug_thread hinting_threads;
>> +extern void (*request_hypercall)(void *, u64, int);
>> +extern void *balloon_ptr;
>>  extern bool want_page_poisoning;
>>  
>>  int guest_page_hinting_sysctl(struct ctl_table *table, int write,
>>  			      void __user *buffer, size_t *lenp, loff_t *ppos);
>>  void guest_free_page(struct page *page, int order);
>> +extern int __isolate_free_page(struct page *page, unsigned int order);
>> +extern void free_one_page(struct zone *zone,
>> +			  struct page *page, unsigned long pfn,
>> +			  unsigned int order,
>> +			  int migratetype);
>>  
>>  static inline void disable_page_poisoning(void)
>>  {
> I guess you will want to put this in some other header.  Function
> declarations belong close to where they are implemented, not used.
I will find a better place.
>
>> diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
>> index a1966cd7b677..2b0f62814e22 100644
>> --- a/include/uapi/linux/virtio_balloon.h
>> +++ b/include/uapi/linux/virtio_balloon.h
>> @@ -36,6 +36,7 @@
>>  #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM	2 /* Deflate balloon on OOM */
>>  #define VIRTIO_BALLOON_F_FREE_PAGE_HINT	3 /* VQ to report free pages */
>>  #define VIRTIO_BALLOON_F_PAGE_POISON	4 /* Guest is using page poisoning */
>> +#define VIRTIO_BALLOON_F_HINTING	5 /* Page hinting virtqueue */
>>  
>>  /* Size of a PFN in the balloon interface. */
>>  #define VIRTIO_BALLOON_PFN_SHIFT 12
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index d295c9bc01a8..93224cba9243 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -1199,7 +1199,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
>>  	spin_unlock(&zone->lock);
>>  }
>>  
>> -static void free_one_page(struct zone *zone,
>> +void free_one_page(struct zone *zone,
>>  				struct page *page, unsigned long pfn,
>>  				unsigned int order,
>>  				int migratetype)
>> diff --git a/virt/kvm/page_hinting.c b/virt/kvm/page_hinting.c
>> index be529f6f2bc0..315099fcda43 100644
>> --- a/virt/kvm/page_hinting.c
>> +++ b/virt/kvm/page_hinting.c
>> @@ -1,6 +1,8 @@
>>  #include <linux/gfp.h>
>>  #include <linux/mm.h>
>> +#include <linux/page_ref.h>
>>  #include <linux/kvm_host.h>
>> +#include <linux/sort.h>
>>  #include <linux/kernel.h>
>>  
>>  /*
>> @@ -39,6 +41,11 @@ int guest_page_hinting_flag;
>>  EXPORT_SYMBOL(guest_page_hinting_flag);
>>  static DEFINE_PER_CPU(struct task_struct *, hinting_task);
>>  
>> +void (*request_hypercall)(void *, u64, int);
>> +EXPORT_SYMBOL(request_hypercall);
>> +void *balloon_ptr;
>> +EXPORT_SYMBOL(balloon_ptr);
>> +
>>  int guest_page_hinting_sysctl(struct ctl_table *table, int write,
>>  			      void __user *buffer, size_t *lenp,
>>  			      loff_t *ppos)
>> @@ -55,18 +62,201 @@ int guest_page_hinting_sysctl(struct ctl_table *table, int write,
>>  	return ret;
>>  }
>>  
>> +void hyperlist_ready(struct hypervisor_pages *guest_isolated_pages, int entries)
>> +{
>> +	int i = 0;
>> +	int mt = 0;
>> +
>> +	if (balloon_ptr)
>> +		request_hypercall(balloon_ptr, (u64)&guest_isolated_pages[0],
>> +				  entries);
>> +
>> +	while (i < entries) {
>> +		struct page *page = pfn_to_page(guest_isolated_pages[i].pfn);
>> +
>> +		mt = get_pageblock_migratetype(page);
>> +		free_one_page(page_zone(page), page, page_to_pfn(page),
>> +			      guest_isolated_pages[i].order, mt);
>> +		i++;
>> +	}
>> +}
>> +
>> +struct page *get_buddy_page(struct page *page)
>> +{
>> +	unsigned long pfn = page_to_pfn(page);
>> +	unsigned int order;
>> +
>> +	for (order = 0; order < MAX_ORDER; order++) {
>> +		struct page *page_head = page - (pfn & ((1 << order) - 1));
>> +
>> +		if (PageBuddy(page_head) && page_private(page_head) >= order)
>> +			return page_head;
>> +	}
>> +	return NULL;
>> +}
>> +
>>  static void hinting_fn(unsigned int cpu)
>>  {
>>  	struct page_hinting *page_hinting_obj = this_cpu_ptr(&hinting_obj);
>> +	int idx = 0, ret = 0;
>> +	struct zone *zone_cur;
>> +	unsigned long flags = 0;
>> +
>> +	while (idx < MAX_FGPT_ENTRIES) {
>> +		unsigned long pfn = page_hinting_obj->kvm_pt[idx].pfn;
>> +		unsigned long pfn_end = page_hinting_obj->kvm_pt[idx].pfn +
>> +			(1 << page_hinting_obj->kvm_pt[idx].order) - 1;
>> +
>> +		while (pfn <= pfn_end) {
>> +			struct page *page = pfn_to_page(pfn);
>> +			struct page *buddy_page = NULL;
>> +
>> +			zone_cur = page_zone(page);
>> +			spin_lock_irqsave(&zone_cur->lock, flags);
>> +
>> +			if (PageCompound(page)) {
>> +				struct page *head_page = compound_head(page);
>> +				unsigned long head_pfn = page_to_pfn(head_page);
>> +				unsigned int alloc_pages =
>> +					1 << compound_order(head_page);
>> +
>> +				pfn = head_pfn + alloc_pages;
>> +				spin_unlock_irqrestore(&zone_cur->lock, flags);
>> +				continue;
>> +			}
>> +
>> +			if (page_ref_count(page)) {
>> +				pfn++;
>> +				spin_unlock_irqrestore(&zone_cur->lock, flags);
>> +				continue;
>> +			}
>> +
>> +			if (PageBuddy(page)) {
>> +				int buddy_order = page_private(page);
>>  
>> +				ret = __isolate_free_page(page, buddy_order);
>> +				if (!ret) {
>> +				} else {
>> +					int l_idx = page_hinting_obj->hyp_idx;
>> +					struct hypervisor_pages *l_obj =
>> +					page_hinting_obj->hypervisor_pagelist;
>> +
>> +					l_obj[l_idx].pfn = pfn;
>> +					l_obj[l_idx].order = buddy_order;
>> +					page_hinting_obj->hyp_idx += 1;
>> +				}
>> +				pfn = pfn + (1 << buddy_order);
>> +				spin_unlock_irqrestore(&zone_cur->lock, flags);
>> +				continue;
>> +			}
>> +
>> +			buddy_page = get_buddy_page(page);
>> +			if (buddy_page) {
>> +				int buddy_order = page_private(buddy_page);
>> +
>> +				ret = __isolate_free_page(buddy_page,
>> +							  buddy_order);
>> +				if (!ret) {
>> +				} else {
>> +					int l_idx = page_hinting_obj->hyp_idx;
>> +					struct hypervisor_pages *l_obj =
>> +					page_hinting_obj->hypervisor_pagelist;
>> +					unsigned long buddy_pfn =
>> +						page_to_pfn(buddy_page);
>> +
>> +					l_obj[l_idx].pfn = buddy_pfn;
>> +					l_obj[l_idx].order = buddy_order;
>> +					page_hinting_obj->hyp_idx += 1;
>> +				}
>> +				pfn = page_to_pfn(buddy_page) +
>> +					(1 << buddy_order);
>> +				spin_unlock_irqrestore(&zone_cur->lock, flags);
>> +				continue;
>> +			}
>> +			spin_unlock_irqrestore(&zone_cur->lock, flags);
>> +			pfn++;
>> +		}
>> +		page_hinting_obj->kvm_pt[idx].pfn = 0;
>> +		page_hinting_obj->kvm_pt[idx].order = -1;
>> +		page_hinting_obj->kvm_pt[idx].zonenum = -1;
>> +		idx++;
>> +	}
>> +	if (page_hinting_obj->hyp_idx > 0) {
>> +		hyperlist_ready(page_hinting_obj->hypervisor_pagelist,
>> +				page_hinting_obj->hyp_idx);
>> +		page_hinting_obj->hyp_idx = 0;
>> +	}
>>  	page_hinting_obj->kvm_pt_idx = 0;
>>  	put_cpu_var(hinting_obj);
>>  }
>>  
>> +int if_exist(struct page *page)
>> +{
>> +	int i = 0;
>> +	struct page_hinting *page_hinting_obj = this_cpu_ptr(&hinting_obj);
>> +
>> +	while (i < MAX_FGPT_ENTRIES) {
>> +		if (page_to_pfn(page) == page_hinting_obj->kvm_pt[i].pfn)
>> +			return 1;
>> +		i++;
>> +	}
>> +	return 0;
>> +}
>> +
>> +void pack_array(void)
>> +{
>> +	int i = 0, j = 0;
>> +	struct page_hinting *page_hinting_obj = this_cpu_ptr(&hinting_obj);
>> +
>> +	while (i < MAX_FGPT_ENTRIES) {
>> +		if (page_hinting_obj->kvm_pt[i].pfn != 0) {
>> +			if (i != j) {
>> +				page_hinting_obj->kvm_pt[j].pfn =
>> +					page_hinting_obj->kvm_pt[i].pfn;
>> +				page_hinting_obj->kvm_pt[j].order =
>> +					page_hinting_obj->kvm_pt[i].order;
>> +				page_hinting_obj->kvm_pt[j].zonenum =
>> +					page_hinting_obj->kvm_pt[i].zonenum;
>> +			}
>> +			j++;
>> +		}
>> +		i++;
>> +	}
>> +	i = j;
>> +	page_hinting_obj->kvm_pt_idx = j;
>> +	while (j < MAX_FGPT_ENTRIES) {
>> +		page_hinting_obj->kvm_pt[j].pfn = 0;
>> +		page_hinting_obj->kvm_pt[j].order = -1;
>> +		page_hinting_obj->kvm_pt[j].zonenum = -1;
>> +		j++;
>> +	}
>> +}
>> +
>>  void scan_array(void)
>>  {
>>  	struct page_hinting *page_hinting_obj = this_cpu_ptr(&hinting_obj);
>> +	int i = 0;
>>  
>> +	while (i < MAX_FGPT_ENTRIES) {
>> +		struct page *page =
>> +			pfn_to_page(page_hinting_obj->kvm_pt[i].pfn);
>> +		struct page *buddy_page = get_buddy_page(page);
>> +
>> +		if (!PageBuddy(page) && buddy_page) {
>> +			if (if_exist(buddy_page)) {
>> +				page_hinting_obj->kvm_pt[i].pfn = 0;
>> +				page_hinting_obj->kvm_pt[i].order = -1;
>> +				page_hinting_obj->kvm_pt[i].zonenum = -1;
>> +			} else {
>> +				page_hinting_obj->kvm_pt[i].pfn =
>> +					page_to_pfn(buddy_page);
>> +				page_hinting_obj->kvm_pt[i].order =
>> +					page_private(buddy_page);
>> +			}
>> +		}
>> +		i++;
>> +	}
>> +	pack_array();
>>  	if (page_hinting_obj->kvm_pt_idx == MAX_FGPT_ENTRIES)
>>  		wake_up_process(__this_cpu_read(hinting_task));
>>  }
>> @@ -111,8 +301,18 @@ void guest_free_page(struct page *page, int order)
>>  		page_hinting_obj->kvm_pt[page_hinting_obj->kvm_pt_idx].order =
>>  							order;
>>  		page_hinting_obj->kvm_pt_idx += 1;
>> -		if (page_hinting_obj->kvm_pt_idx == MAX_FGPT_ENTRIES)
>> +		if (page_hinting_obj->kvm_pt_idx == MAX_FGPT_ENTRIES) {
>> +			/*
>> +			 * We are depending on the buddy free-list to identify
>> +			 * if a page is free or not. Hence, we are dumping all
>> +			 * the per-cpu pages back into the buddy allocator. This
>> +			 * will ensure less failures when we try to isolate free
>> +			 * captured pages and hence more memory reporting to the
>> +			 * host.
>> +			 */
>> +			drain_local_pages(NULL);
>>  			scan_array();
>> +		}
>>  	}
>>  	local_irq_restore(flags);
>>  }
>> -- 
>> 2.17.2
-- 
Regards
Nitesh



Download attachment "signature.asc" of type "application/pgp-signature" (834 bytes)

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ