[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20190205165514-mutt-send-email-mst@kernel.org>
Date: Tue, 5 Feb 2019 16:55:44 -0500
From: "Michael S. Tsirkin" <mst@...hat.com>
To: Nitesh Narayan Lal <nitesh@...hat.com>
Cc: kvm@...r.kernel.org, linux-kernel@...r.kernel.org,
pbonzini@...hat.com, lcapitulino@...hat.com, pagupta@...hat.com,
wei.w.wang@...el.com, yang.zhang.wz@...il.com, riel@...riel.com,
david@...hat.com, dodgen@...gle.com, konrad.wilk@...cle.com,
dhildenb@...hat.com, aarcange@...hat.com
Subject: Re: [RFC][Patch v8 6/7] KVM: Enables the kernel to isolate and
report free pages
On Tue, Feb 05, 2019 at 04:54:03PM -0500, Nitesh Narayan Lal wrote:
>
> On 2/5/19 3:45 PM, Michael S. Tsirkin wrote:
> > On Mon, Feb 04, 2019 at 03:18:53PM -0500, Nitesh Narayan Lal wrote:
> >> This patch enables the kernel to scan the per cpu array and
> >> compress it by removing the repetitive/re-allocated pages.
> >> Once the per cpu array is completely filled with pages in the
> >> buddy it wakes up the kernel per cpu thread which re-scans the
> >> entire per cpu array by acquiring a zone lock corresponding to
> >> the page which is being scanned. If the page is still free and
> >> present in the buddy it tries to isolate the page and adds it
> >> to another per cpu array.
> >>
> >> Once this scanning process is complete and if there are any
> >> isolated pages added to the new per cpu array kernel thread
> >> invokes hyperlist_ready().
> >>
> >> In hyperlist_ready() a hypercall is made to report these pages to
> >> the host using the virtio-balloon framework. In order to do so
> >> another virtqueue 'hinting_vq' is added to the balloon framework.
> >> As the host frees all the reported pages, the kernel thread returns
> >> them back to the buddy.
> >>
> >> Signed-off-by: Nitesh Narayan Lal <nitesh@...hat.com>
> >
> > This looks kind of like what early iterations of Wei's patches did.
> >
> > But this has lots of issues, for example you might end up with
> > a hypercall per a 4K page.
> > So in the end, he switched over to just reporting only
> > MAX_ORDER - 1 pages.
> You mean that I should only capture/attempt to isolate pages with order
> MAX_ORDER - 1?
> >
> > Would that be a good idea for you too?
> Will it help if we have a threshold value based on the amount of memory
> captured instead of the number of entries/pages in the array?
This is what Wei's patches do at least.
> >
> > An alternative would be a different much lighter weight
> > way to report these pages and to free them on the host.
> >
> >> ---
> >> drivers/virtio/virtio_balloon.c | 56 +++++++-
> >> include/linux/page_hinting.h | 18 ++-
> >> include/uapi/linux/virtio_balloon.h | 1 +
> >> mm/page_alloc.c | 2 +-
> >> virt/kvm/page_hinting.c | 202 +++++++++++++++++++++++++++-
> >> 5 files changed, 269 insertions(+), 10 deletions(-)
> >>
> >> diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
> >> index 728ecd1eea30..8af34e0b9a32 100644
> >> --- a/drivers/virtio/virtio_balloon.c
> >> +++ b/drivers/virtio/virtio_balloon.c
> >> @@ -57,13 +57,15 @@ enum virtio_balloon_vq {
> >> VIRTIO_BALLOON_VQ_INFLATE,
> >> VIRTIO_BALLOON_VQ_DEFLATE,
> >> VIRTIO_BALLOON_VQ_STATS,
> >> + VIRTIO_BALLOON_VQ_HINTING,
> >> VIRTIO_BALLOON_VQ_FREE_PAGE,
> >> VIRTIO_BALLOON_VQ_MAX
> >> };
> >>
> >> struct virtio_balloon {
> >> struct virtio_device *vdev;
> >> - struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq;
> >> + struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq,
> >> + *hinting_vq;
> >>
> >> /* Balloon's own wq for cpu-intensive work items */
> >> struct workqueue_struct *balloon_wq;
> >> @@ -122,6 +124,40 @@ static struct virtio_device_id id_table[] = {
> >> { 0 },
> >> };
> >>
> >> +#ifdef CONFIG_KVM_FREE_PAGE_HINTING
> >> +void virtballoon_page_hinting(struct virtio_balloon *vb, u64 gvaddr,
> >> + int hyper_entries)
> >> +{
> >> + u64 gpaddr = virt_to_phys((void *)gvaddr);
> >> +
> >> + virtqueue_add_desc(vb->hinting_vq, gpaddr, hyper_entries, 0);
> >> + virtqueue_kick_sync(vb->hinting_vq);
> >> +}
> >> +
> >> +static void hinting_ack(struct virtqueue *vq)
> >> +{
> >> + struct virtio_balloon *vb = vq->vdev->priv;
> >> +
> >> + wake_up(&vb->acked);
> >> +}
> >> +
> >> +static void enable_hinting(struct virtio_balloon *vb)
> >> +{
> >> + guest_page_hinting_flag = 1;
> >> + static_branch_enable(&guest_page_hinting_key);
> >> + request_hypercall = (void *)&virtballoon_page_hinting;
> >> + balloon_ptr = vb;
> >> + WARN_ON(smpboot_register_percpu_thread(&hinting_threads));
> >> +}
> >> +
> >> +static void disable_hinting(void)
> >> +{
> >> + guest_page_hinting_flag = 0;
> >> + static_branch_enable(&guest_page_hinting_key);
> >> + balloon_ptr = NULL;
> >> +}
> >> +#endif
> >> +
> >> static u32 page_to_balloon_pfn(struct page *page)
> >> {
> >> unsigned long pfn = page_to_pfn(page);
> >> @@ -481,6 +517,7 @@ static int init_vqs(struct virtio_balloon *vb)
> >> names[VIRTIO_BALLOON_VQ_DEFLATE] = "deflate";
> >> names[VIRTIO_BALLOON_VQ_STATS] = NULL;
> >> names[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
> >> + names[VIRTIO_BALLOON_VQ_HINTING] = NULL;
> >>
> >> if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
> >> names[VIRTIO_BALLOON_VQ_STATS] = "stats";
> >> @@ -492,11 +529,18 @@ static int init_vqs(struct virtio_balloon *vb)
> >> callbacks[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
> >> }
> >>
> >> + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HINTING)) {
> >> + names[VIRTIO_BALLOON_VQ_HINTING] = "hinting_vq";
> >> + callbacks[VIRTIO_BALLOON_VQ_HINTING] = hinting_ack;
> >> + }
> >> err = vb->vdev->config->find_vqs(vb->vdev, VIRTIO_BALLOON_VQ_MAX,
> >> vqs, callbacks, names, NULL, NULL);
> >> if (err)
> >> return err;
> >>
> >> + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HINTING))
> >> + vb->hinting_vq = vqs[VIRTIO_BALLOON_VQ_HINTING];
> >> +
> >> vb->inflate_vq = vqs[VIRTIO_BALLOON_VQ_INFLATE];
> >> vb->deflate_vq = vqs[VIRTIO_BALLOON_VQ_DEFLATE];
> >> if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
> >> @@ -908,6 +952,11 @@ static int virtballoon_probe(struct virtio_device *vdev)
> >> if (err)
> >> goto out_del_balloon_wq;
> >> }
> >> +
> >> +#ifdef CONFIG_KVM_FREE_PAGE_HINTING
> >> + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HINTING))
> >> + enable_hinting(vb);
> >> +#endif
> >> virtio_device_ready(vdev);
> >>
> >> if (towards_target(vb))
> >> @@ -950,6 +999,10 @@ static void virtballoon_remove(struct virtio_device *vdev)
> >> cancel_work_sync(&vb->update_balloon_size_work);
> >> cancel_work_sync(&vb->update_balloon_stats_work);
> >>
> >> +#ifdef CONFIG_KVM_FREE_PAGE_HINTING
> >> + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HINTING))
> >> + disable_hinting();
> >> +#endif
> >> if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
> >> cancel_work_sync(&vb->report_free_page_work);
> >> destroy_workqueue(vb->balloon_wq);
> >> @@ -1009,6 +1062,7 @@ static unsigned int features[] = {
> >> VIRTIO_BALLOON_F_MUST_TELL_HOST,
> >> VIRTIO_BALLOON_F_STATS_VQ,
> >> VIRTIO_BALLOON_F_DEFLATE_ON_OOM,
> >> + VIRTIO_BALLOON_F_HINTING,
> >> VIRTIO_BALLOON_F_FREE_PAGE_HINT,
> >> VIRTIO_BALLOON_F_PAGE_POISON,
> >> };
> >> diff --git a/include/linux/page_hinting.h b/include/linux/page_hinting.h
> >> index e800c6b07561..3ba8c1f3b4a4 100644
> >> --- a/include/linux/page_hinting.h
> >> +++ b/include/linux/page_hinting.h
> >> @@ -1,15 +1,12 @@
> >> #include <linux/smpboot.h>
> >>
> >> -/*
> >> - * Size of the array which is used to store the freed pages is defined by
> >> - * MAX_FGPT_ENTRIES. If possible, we have to find a better way using which
> >> - * we can get rid of the hardcoded array size.
> >> - */
> >> #define MAX_FGPT_ENTRIES 1000
> >> /*
> >> * hypervisor_pages - It is a dummy structure passed with the hypercall.
> >> - * @pfn: page frame number for the page which needs to be sent to the host.
> >> - * @order: order of the page needs to be reported to the host.
> >> + * @pfn - page frame number for the page which is to be freed.
> >> + * @pages - number of pages which are supposed to be freed.
> >> + * A global array object is used to to hold the list of pfn and pages and is
> >> + * passed as part of the hypercall.
> >> */
> >> struct hypervisor_pages {
> >> unsigned long pfn;
> >> @@ -19,11 +16,18 @@ struct hypervisor_pages {
> >> extern int guest_page_hinting_flag;
> >> extern struct static_key_false guest_page_hinting_key;
> >> extern struct smp_hotplug_thread hinting_threads;
> >> +extern void (*request_hypercall)(void *, u64, int);
> >> +extern void *balloon_ptr;
> >> extern bool want_page_poisoning;
> >>
> >> int guest_page_hinting_sysctl(struct ctl_table *table, int write,
> >> void __user *buffer, size_t *lenp, loff_t *ppos);
> >> void guest_free_page(struct page *page, int order);
> >> +extern int __isolate_free_page(struct page *page, unsigned int order);
> >> +extern void free_one_page(struct zone *zone,
> >> + struct page *page, unsigned long pfn,
> >> + unsigned int order,
> >> + int migratetype);
> >>
> >> static inline void disable_page_poisoning(void)
> >> {
> > I guess you will want to put this in some other header. Function
> > declarations belong close to where they are implemented, not used.
> I will find a better place.
> >
> >> diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
> >> index a1966cd7b677..2b0f62814e22 100644
> >> --- a/include/uapi/linux/virtio_balloon.h
> >> +++ b/include/uapi/linux/virtio_balloon.h
> >> @@ -36,6 +36,7 @@
> >> #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM 2 /* Deflate balloon on OOM */
> >> #define VIRTIO_BALLOON_F_FREE_PAGE_HINT 3 /* VQ to report free pages */
> >> #define VIRTIO_BALLOON_F_PAGE_POISON 4 /* Guest is using page poisoning */
> >> +#define VIRTIO_BALLOON_F_HINTING 5 /* Page hinting virtqueue */
> >>
> >> /* Size of a PFN in the balloon interface. */
> >> #define VIRTIO_BALLOON_PFN_SHIFT 12
> >> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> >> index d295c9bc01a8..93224cba9243 100644
> >> --- a/mm/page_alloc.c
> >> +++ b/mm/page_alloc.c
> >> @@ -1199,7 +1199,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
> >> spin_unlock(&zone->lock);
> >> }
> >>
> >> -static void free_one_page(struct zone *zone,
> >> +void free_one_page(struct zone *zone,
> >> struct page *page, unsigned long pfn,
> >> unsigned int order,
> >> int migratetype)
> >> diff --git a/virt/kvm/page_hinting.c b/virt/kvm/page_hinting.c
> >> index be529f6f2bc0..315099fcda43 100644
> >> --- a/virt/kvm/page_hinting.c
> >> +++ b/virt/kvm/page_hinting.c
> >> @@ -1,6 +1,8 @@
> >> #include <linux/gfp.h>
> >> #include <linux/mm.h>
> >> +#include <linux/page_ref.h>
> >> #include <linux/kvm_host.h>
> >> +#include <linux/sort.h>
> >> #include <linux/kernel.h>
> >>
> >> /*
> >> @@ -39,6 +41,11 @@ int guest_page_hinting_flag;
> >> EXPORT_SYMBOL(guest_page_hinting_flag);
> >> static DEFINE_PER_CPU(struct task_struct *, hinting_task);
> >>
> >> +void (*request_hypercall)(void *, u64, int);
> >> +EXPORT_SYMBOL(request_hypercall);
> >> +void *balloon_ptr;
> >> +EXPORT_SYMBOL(balloon_ptr);
> >> +
> >> int guest_page_hinting_sysctl(struct ctl_table *table, int write,
> >> void __user *buffer, size_t *lenp,
> >> loff_t *ppos)
> >> @@ -55,18 +62,201 @@ int guest_page_hinting_sysctl(struct ctl_table *table, int write,
> >> return ret;
> >> }
> >>
> >> +void hyperlist_ready(struct hypervisor_pages *guest_isolated_pages, int entries)
> >> +{
> >> + int i = 0;
> >> + int mt = 0;
> >> +
> >> + if (balloon_ptr)
> >> + request_hypercall(balloon_ptr, (u64)&guest_isolated_pages[0],
> >> + entries);
> >> +
> >> + while (i < entries) {
> >> + struct page *page = pfn_to_page(guest_isolated_pages[i].pfn);
> >> +
> >> + mt = get_pageblock_migratetype(page);
> >> + free_one_page(page_zone(page), page, page_to_pfn(page),
> >> + guest_isolated_pages[i].order, mt);
> >> + i++;
> >> + }
> >> +}
> >> +
> >> +struct page *get_buddy_page(struct page *page)
> >> +{
> >> + unsigned long pfn = page_to_pfn(page);
> >> + unsigned int order;
> >> +
> >> + for (order = 0; order < MAX_ORDER; order++) {
> >> + struct page *page_head = page - (pfn & ((1 << order) - 1));
> >> +
> >> + if (PageBuddy(page_head) && page_private(page_head) >= order)
> >> + return page_head;
> >> + }
> >> + return NULL;
> >> +}
> >> +
> >> static void hinting_fn(unsigned int cpu)
> >> {
> >> struct page_hinting *page_hinting_obj = this_cpu_ptr(&hinting_obj);
> >> + int idx = 0, ret = 0;
> >> + struct zone *zone_cur;
> >> + unsigned long flags = 0;
> >> +
> >> + while (idx < MAX_FGPT_ENTRIES) {
> >> + unsigned long pfn = page_hinting_obj->kvm_pt[idx].pfn;
> >> + unsigned long pfn_end = page_hinting_obj->kvm_pt[idx].pfn +
> >> + (1 << page_hinting_obj->kvm_pt[idx].order) - 1;
> >> +
> >> + while (pfn <= pfn_end) {
> >> + struct page *page = pfn_to_page(pfn);
> >> + struct page *buddy_page = NULL;
> >> +
> >> + zone_cur = page_zone(page);
> >> + spin_lock_irqsave(&zone_cur->lock, flags);
> >> +
> >> + if (PageCompound(page)) {
> >> + struct page *head_page = compound_head(page);
> >> + unsigned long head_pfn = page_to_pfn(head_page);
> >> + unsigned int alloc_pages =
> >> + 1 << compound_order(head_page);
> >> +
> >> + pfn = head_pfn + alloc_pages;
> >> + spin_unlock_irqrestore(&zone_cur->lock, flags);
> >> + continue;
> >> + }
> >> +
> >> + if (page_ref_count(page)) {
> >> + pfn++;
> >> + spin_unlock_irqrestore(&zone_cur->lock, flags);
> >> + continue;
> >> + }
> >> +
> >> + if (PageBuddy(page)) {
> >> + int buddy_order = page_private(page);
> >>
> >> + ret = __isolate_free_page(page, buddy_order);
> >> + if (!ret) {
> >> + } else {
> >> + int l_idx = page_hinting_obj->hyp_idx;
> >> + struct hypervisor_pages *l_obj =
> >> + page_hinting_obj->hypervisor_pagelist;
> >> +
> >> + l_obj[l_idx].pfn = pfn;
> >> + l_obj[l_idx].order = buddy_order;
> >> + page_hinting_obj->hyp_idx += 1;
> >> + }
> >> + pfn = pfn + (1 << buddy_order);
> >> + spin_unlock_irqrestore(&zone_cur->lock, flags);
> >> + continue;
> >> + }
> >> +
> >> + buddy_page = get_buddy_page(page);
> >> + if (buddy_page) {
> >> + int buddy_order = page_private(buddy_page);
> >> +
> >> + ret = __isolate_free_page(buddy_page,
> >> + buddy_order);
> >> + if (!ret) {
> >> + } else {
> >> + int l_idx = page_hinting_obj->hyp_idx;
> >> + struct hypervisor_pages *l_obj =
> >> + page_hinting_obj->hypervisor_pagelist;
> >> + unsigned long buddy_pfn =
> >> + page_to_pfn(buddy_page);
> >> +
> >> + l_obj[l_idx].pfn = buddy_pfn;
> >> + l_obj[l_idx].order = buddy_order;
> >> + page_hinting_obj->hyp_idx += 1;
> >> + }
> >> + pfn = page_to_pfn(buddy_page) +
> >> + (1 << buddy_order);
> >> + spin_unlock_irqrestore(&zone_cur->lock, flags);
> >> + continue;
> >> + }
> >> + spin_unlock_irqrestore(&zone_cur->lock, flags);
> >> + pfn++;
> >> + }
> >> + page_hinting_obj->kvm_pt[idx].pfn = 0;
> >> + page_hinting_obj->kvm_pt[idx].order = -1;
> >> + page_hinting_obj->kvm_pt[idx].zonenum = -1;
> >> + idx++;
> >> + }
> >> + if (page_hinting_obj->hyp_idx > 0) {
> >> + hyperlist_ready(page_hinting_obj->hypervisor_pagelist,
> >> + page_hinting_obj->hyp_idx);
> >> + page_hinting_obj->hyp_idx = 0;
> >> + }
> >> page_hinting_obj->kvm_pt_idx = 0;
> >> put_cpu_var(hinting_obj);
> >> }
> >>
> >> +int if_exist(struct page *page)
> >> +{
> >> + int i = 0;
> >> + struct page_hinting *page_hinting_obj = this_cpu_ptr(&hinting_obj);
> >> +
> >> + while (i < MAX_FGPT_ENTRIES) {
> >> + if (page_to_pfn(page) == page_hinting_obj->kvm_pt[i].pfn)
> >> + return 1;
> >> + i++;
> >> + }
> >> + return 0;
> >> +}
> >> +
> >> +void pack_array(void)
> >> +{
> >> + int i = 0, j = 0;
> >> + struct page_hinting *page_hinting_obj = this_cpu_ptr(&hinting_obj);
> >> +
> >> + while (i < MAX_FGPT_ENTRIES) {
> >> + if (page_hinting_obj->kvm_pt[i].pfn != 0) {
> >> + if (i != j) {
> >> + page_hinting_obj->kvm_pt[j].pfn =
> >> + page_hinting_obj->kvm_pt[i].pfn;
> >> + page_hinting_obj->kvm_pt[j].order =
> >> + page_hinting_obj->kvm_pt[i].order;
> >> + page_hinting_obj->kvm_pt[j].zonenum =
> >> + page_hinting_obj->kvm_pt[i].zonenum;
> >> + }
> >> + j++;
> >> + }
> >> + i++;
> >> + }
> >> + i = j;
> >> + page_hinting_obj->kvm_pt_idx = j;
> >> + while (j < MAX_FGPT_ENTRIES) {
> >> + page_hinting_obj->kvm_pt[j].pfn = 0;
> >> + page_hinting_obj->kvm_pt[j].order = -1;
> >> + page_hinting_obj->kvm_pt[j].zonenum = -1;
> >> + j++;
> >> + }
> >> +}
> >> +
> >> void scan_array(void)
> >> {
> >> struct page_hinting *page_hinting_obj = this_cpu_ptr(&hinting_obj);
> >> + int i = 0;
> >>
> >> + while (i < MAX_FGPT_ENTRIES) {
> >> + struct page *page =
> >> + pfn_to_page(page_hinting_obj->kvm_pt[i].pfn);
> >> + struct page *buddy_page = get_buddy_page(page);
> >> +
> >> + if (!PageBuddy(page) && buddy_page) {
> >> + if (if_exist(buddy_page)) {
> >> + page_hinting_obj->kvm_pt[i].pfn = 0;
> >> + page_hinting_obj->kvm_pt[i].order = -1;
> >> + page_hinting_obj->kvm_pt[i].zonenum = -1;
> >> + } else {
> >> + page_hinting_obj->kvm_pt[i].pfn =
> >> + page_to_pfn(buddy_page);
> >> + page_hinting_obj->kvm_pt[i].order =
> >> + page_private(buddy_page);
> >> + }
> >> + }
> >> + i++;
> >> + }
> >> + pack_array();
> >> if (page_hinting_obj->kvm_pt_idx == MAX_FGPT_ENTRIES)
> >> wake_up_process(__this_cpu_read(hinting_task));
> >> }
> >> @@ -111,8 +301,18 @@ void guest_free_page(struct page *page, int order)
> >> page_hinting_obj->kvm_pt[page_hinting_obj->kvm_pt_idx].order =
> >> order;
> >> page_hinting_obj->kvm_pt_idx += 1;
> >> - if (page_hinting_obj->kvm_pt_idx == MAX_FGPT_ENTRIES)
> >> + if (page_hinting_obj->kvm_pt_idx == MAX_FGPT_ENTRIES) {
> >> + /*
> >> + * We are depending on the buddy free-list to identify
> >> + * if a page is free or not. Hence, we are dumping all
> >> + * the per-cpu pages back into the buddy allocator. This
> >> + * will ensure less failures when we try to isolate free
> >> + * captured pages and hence more memory reporting to the
> >> + * host.
> >> + */
> >> + drain_local_pages(NULL);
> >> scan_array();
> >> + }
> >> }
> >> local_irq_restore(flags);
> >> }
> >> --
> >> 2.17.2
> --
> Regards
> Nitesh
>
Powered by blists - more mailing lists