lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20090402053114.GF1117@x200.localdomain>
Date:	Wed, 1 Apr 2009 22:31:14 -0700
From:	Chris Wright <chrisw@...hat.com>
To:	Anthony Liguori <anthony@...emonkey.ws>
Cc:	Chris Wright <chrisw@...hat.com>,
	Andrea Arcangeli <aarcange@...hat.com>,
	Izik Eidus <ieidus@...hat.com>, linux-kernel@...r.kernel.org,
	kvm@...r.kernel.org, linux-mm@...ck.org, avi@...hat.com,
	riel@...hat.com, jeremy@...p.org, mtosatti@...hat.com,
	hugh@...itas.com, corbet@....net, yaniv@...hat.com,
	dmonakhov@...nvz.org
Subject: [PATCH 5/4] update ksm userspace interfaces

* Anthony Liguori (anthony@...emonkey.ws) wrote:
> Using an interface like madvise() would force the issue to be dealt with  
> properly from the start :-)

Yeah, I'm not at all opposed to it.

This updates to madvise for register and sysfs for control.

madvise issues:
- MADV_SHAREABLE
  - register only ATM, can add MADV_UNSHAREABLE to allow an app to proactively
    unregister, but need a cleanup when ->mm goes away via exit/exec
  - will register a region per vma, should probably push the whole thing
    into vma rather than keep [mm,addr,len] tuple in ksm
sysfs issues:
- none really, i added a reporting mechanism for number of pages shared,
  doesn't decrement on COW
- could use some extra sanity checks

It compiles!  Diff output is hard to read, I can send a 4/4 w/ this
patch rolled in for easier review.

Signed-off-by: Chris Wright <chrisw@...hat.com>
---
 include/asm-generic/mman.h |    1 +
 include/linux/ksm.h        |   63 +--------
 mm/ksm.c                   |  352 ++++++++++++++++----------------------------
 mm/madvise.c               |   18 +++
 4 files changed, 149 insertions(+), 285 deletions(-)

diff --git a/include/asm-generic/mman.h b/include/asm-generic/mman.h
index 5e3dde2..a1c1d5c 100644
--- a/include/asm-generic/mman.h
+++ b/include/asm-generic/mman.h
@@ -34,6 +34,7 @@
 #define MADV_REMOVE	9		/* remove these pages & resources */
 #define MADV_DONTFORK	10		/* don't inherit across fork */
 #define MADV_DOFORK	11		/* do inherit across fork */
+#define MADV_SHAREABLE	12		/* can share identical pages */
 
 /* compatibility flags */
 #define MAP_FILE	0
diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index 5776dce..e032f6f 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -1,69 +1,8 @@
 #ifndef __LINUX_KSM_H
 #define __LINUX_KSM_H
 
-/*
- * Userspace interface for /dev/ksm - kvm shared memory
- */
-
-#include <linux/types.h>
-#include <linux/ioctl.h>
-
-#include <asm/types.h>
-
-#define KSM_API_VERSION 1
-
 #define ksm_control_flags_run 1
 
-/* for KSM_REGISTER_MEMORY_REGION */
-struct ksm_memory_region {
-	__u32 npages; /* number of pages to share */
-	__u32 pad;
-	__u64 addr; /* the begining of the virtual address */
-        __u64 reserved_bits;
-};
-
-struct ksm_kthread_info {
-	__u32 sleep; /* number of microsecoends to sleep */
-	__u32 pages_to_scan; /* number of pages to scan */
-	__u32 flags; /* control flags */
-        __u32 pad;
-        __u64 reserved_bits;
-};
-
-#define KSMIO 0xAB
-
-/* ioctls for /dev/ksm */
-
-#define KSM_GET_API_VERSION              _IO(KSMIO,   0x00)
-/*
- * KSM_CREATE_SHARED_MEMORY_AREA - create the shared memory reagion fd
- */
-#define KSM_CREATE_SHARED_MEMORY_AREA    _IO(KSMIO,   0x01) /* return SMA fd */
-/*
- * KSM_START_STOP_KTHREAD - control the kernel thread scanning speed
- * (can stop the kernel thread from working by setting running = 0)
- */
-#define KSM_START_STOP_KTHREAD		 _IOW(KSMIO,  0x02,\
-					      struct ksm_kthread_info)
-/*
- * KSM_GET_INFO_KTHREAD - return information about the kernel thread
- * scanning speed.
- */
-#define KSM_GET_INFO_KTHREAD		 _IOW(KSMIO,  0x03,\
-					      struct ksm_kthread_info)
-
-
-/* ioctls for SMA fds */
-
-/*
- * KSM_REGISTER_MEMORY_REGION - register virtual address memory area to be
- * scanned by kvm.
- */
-#define KSM_REGISTER_MEMORY_REGION       _IOW(KSMIO,  0x20,\
-					      struct ksm_memory_region)
-/*
- * KSM_REMOVE_MEMORY_REGION - remove virtual address memory area from ksm.
- */
-#define KSM_REMOVE_MEMORY_REGION         _IO(KSMIO,   0x21)
+long ksm_register_memory(struct vm_area_struct *, unsigned long, unsigned long);
 
 #endif
diff --git a/mm/ksm.c b/mm/ksm.c
index eba4c09..fcbf76e 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -17,7 +17,6 @@
 #include <linux/errno.h>
 #include <linux/mm.h>
 #include <linux/fs.h>
-#include <linux/miscdevice.h>
 #include <linux/vmalloc.h>
 #include <linux/file.h>
 #include <linux/mman.h>
@@ -38,6 +37,7 @@
 #include <linux/rbtree.h>
 #include <linux/anon_inodes.h>
 #include <linux/ksm.h>
+#include <linux/kobject.h>
 
 #include <asm/tlbflush.h>
 
@@ -55,20 +55,11 @@ MODULE_PARM_DESC(rmap_hash_size, "Hash table size for the reverse mapping");
  */
 struct ksm_mem_slot {
 	struct list_head link;
-	struct list_head sma_link;
 	struct mm_struct *mm;
 	unsigned long addr;	/* the begining of the virtual address */
 	unsigned npages;	/* number of pages to share */
 };
 
-/*
- * ksm_sma - shared memory area, each process have its own sma that contain the
- * information about the slots that it own
- */
-struct ksm_sma {
-	struct list_head sma_slots;
-};
-
 /**
  * struct ksm_scan - cursor for scanning
  * @slot_index: the current slot we are scanning
@@ -190,6 +181,7 @@ static struct kmem_cache *rmap_item_cache;
 
 static int kthread_sleep; /* sleep time of the kernel thread */
 static int kthread_pages_to_scan; /* npages to scan for the kernel thread */
+static unsigned long ksm_pages_shared;
 static struct ksm_scan kthread_ksm_scan;
 static int ksmd_flags;
 static struct task_struct *kthread;
@@ -363,22 +355,12 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
 	free_rmap_item(rmap_item);
 }
 
-static void remove_page_from_tree(struct mm_struct *mm,
-				  unsigned long addr)
-{
-	struct rmap_item *rmap_item;
-
-	rmap_item = get_rmap_item(mm, addr);
-	if (!rmap_item)
-		return;
-	remove_rmap_item_from_tree(rmap_item);
-	return;
-}
-
-static int ksm_sma_ioctl_register_memory_region(struct ksm_sma *ksm_sma,
-						struct ksm_memory_region *mem)
+long ksm_register_memory(struct vm_area_struct *vma, unsigned long start,
+			unsigned long end)
 {
 	struct ksm_mem_slot *slot;
+	int npages = (end - start) >> PAGE_SHIFT;
+
 	int ret = -EPERM;
 
 	slot = kzalloc(sizeof(struct ksm_mem_slot), GFP_KERNEL);
@@ -390,13 +372,12 @@ static int ksm_sma_ioctl_register_memory_region(struct ksm_sma *ksm_sma,
 	slot->mm = get_task_mm(current);
 	if (!slot->mm)
 		goto out_free;
-	slot->addr = mem->addr;
-	slot->npages = mem->npages;
+	slot->addr = start;
+	slot->npages = npages;
 
 	down_write(&slots_lock);
 
 	list_add_tail(&slot->link, &slots);
-	list_add_tail(&slot->sma_link, &ksm_sma->sma_slots);
 
 	up_write(&slots_lock);
 	return 0;
@@ -407,76 +388,6 @@ out:
 	return ret;
 }
 
-static void remove_mm_from_hash_and_tree(struct mm_struct *mm)
-{
-	struct ksm_mem_slot *slot;
-	int pages_count;
-
-	list_for_each_entry(slot, &slots, link)
-		if (slot->mm == mm)
-			break;
-	BUG_ON(!slot);
-
-	root_unstable_tree = RB_ROOT;
-	for (pages_count = 0; pages_count < slot->npages; ++pages_count)
-		remove_page_from_tree(mm, slot->addr +
-				      pages_count * PAGE_SIZE);
-	list_del(&slot->link);
-}
-
-static int ksm_sma_ioctl_remove_memory_region(struct ksm_sma *ksm_sma)
-{
-	struct ksm_mem_slot *slot, *node;
-
-	down_write(&slots_lock);
-	list_for_each_entry_safe(slot, node, &ksm_sma->sma_slots, sma_link) {
-		remove_mm_from_hash_and_tree(slot->mm);
-		mmput(slot->mm);
-		list_del(&slot->sma_link);
-		kfree(slot);
-	}
-	up_write(&slots_lock);
-	return 0;
-}
-
-static int ksm_sma_release(struct inode *inode, struct file *filp)
-{
-	struct ksm_sma *ksm_sma = filp->private_data;
-	int r;
-
-	r = ksm_sma_ioctl_remove_memory_region(ksm_sma);
-	kfree(ksm_sma);
-	return r;
-}
-
-static long ksm_sma_ioctl(struct file *filp,
-			  unsigned int ioctl, unsigned long arg)
-{
-	struct ksm_sma *sma = filp->private_data;
-	void __user *argp = (void __user *)arg;
-	int r = EINVAL;
-
-	switch (ioctl) {
-	case KSM_REGISTER_MEMORY_REGION: {
-		struct ksm_memory_region ksm_memory_region;
-
-		r = -EFAULT;
-		if (copy_from_user(&ksm_memory_region, argp,
-				   sizeof(ksm_memory_region)))
-			goto out;
-		r = ksm_sma_ioctl_register_memory_region(sma,
-							 &ksm_memory_region);
-		break;
-	}
-	case KSM_REMOVE_MEMORY_REGION:
-		r = ksm_sma_ioctl_remove_memory_region(sma);
-		break;
-	}
-
-out:
-	return r;
-}
-
 static unsigned long addr_in_vma(struct vm_area_struct *vma, struct page *page)
 {
 	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -652,6 +563,8 @@ static int try_to_merge_two_pages(struct mm_struct *mm1, struct page *page1,
 		prot = vma->vm_page_prot;
 		pgprot_val(prot) &= ~_PAGE_RW;
 		ret = try_to_merge_one_page(mm1, vma, page1, page2, prot);
+		if (!ret)
+			ksm_pages_shared++;
 	} else {
 		struct page *kpage;
 
@@ -701,6 +614,8 @@ static int try_to_merge_two_pages(struct mm_struct *mm1, struct page *page1,
 					put_page(tmppage[0]);
 				}
 				up_read(&mm1->mmap_sem);
+			} else {
+				ksm_pages_shared++;
 			}
 		}
 		put_page(kpage);
@@ -1181,9 +1096,6 @@ static void scan_update_old_index(struct ksm_scan *ksm_scan)
  * @scan_npages - number of pages we are want to scan before we return from this
  * @function.
  *
- * (this function can be called from the kernel thread scanner, or from 
- *  userspace ioctl context scanner)
- *
  *  The function return -EAGAIN in case there are not slots to scan.
  */
 static int ksm_scan_start(struct ksm_scan *ksm_scan, unsigned int scan_npages)
@@ -1231,154 +1143,148 @@ out:
 	return ret;
 }
 
-static struct file_operations ksm_sma_fops = {
-	.release        = ksm_sma_release,
-	.unlocked_ioctl = ksm_sma_ioctl,
-	.compat_ioctl   = ksm_sma_ioctl,
-};
-
-static int ksm_dev_ioctl_create_shared_memory_area(void)
+int kthread_ksm_scan_thread(void *nothing)
 {
-	int fd = -1;
-	struct ksm_sma *ksm_sma;
+	while (!kthread_should_stop()) {
+		if (ksmd_flags & ksm_control_flags_run) {
+			down_read(&kthread_lock);
+			ksm_scan_start(&kthread_ksm_scan,
+				       kthread_pages_to_scan);
+			up_read(&kthread_lock);
+			schedule_timeout_interruptible(
+					usecs_to_jiffies(kthread_sleep));
+		} else {
+			wait_event_interruptible(kthread_wait,
+					ksmd_flags & ksm_control_flags_run ||
+					kthread_should_stop());
+		}
+	}
+	return 0;
+}
 
-	ksm_sma = kmalloc(sizeof(struct ksm_sma), GFP_KERNEL);
-	if (!ksm_sma)
-		goto out;
+#define KSM_ATTR_RO(_name) \
+	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
 
-	INIT_LIST_HEAD(&ksm_sma->sma_slots);
+#define KSM_ATTR(_name) \
+	static struct kobj_attribute _name##_attr = \
+		__ATTR(_name, 0644, _name##_show, _name##_store)
 
-	fd = anon_inode_getfd("ksm-sma", &ksm_sma_fops, ksm_sma, 0);
-	if (fd < 0)
-		goto out_free;
+static ssize_t sleep_show(struct kobject *kobj, struct kobj_attribute *attr,
+			  char *buf)
+{
+	unsigned int usecs;
 
-	return fd;
-out_free:
-	kfree(ksm_sma);
-out:
-	return fd;
+	down_read(&kthread_lock);
+	usecs = kthread_sleep;
+	up_read(&kthread_lock);
+
+	return sprintf(buf, "%u\n", usecs);
 }
 
-/*
- * ksm_dev_ioctl_start_stop_kthread - control the kernel thread scanning running
- * speed.
- * This function allow us to control on the time the kernel thread will sleep
- * how many pages it will scan between sleep and sleep, and how many pages it
- * will maximum merge between sleep and sleep.
- */
-static int ksm_dev_ioctl_start_stop_kthread(struct ksm_kthread_info *info)
+static ssize_t sleep_store(struct kobject *kobj,
+				   struct kobj_attribute *attr,
+				   const char *buf, size_t count)
 {
-	int ret = 0;
-
-	down_write(&kthread_lock);
+	unsigned long usecs;
+	int err;
 
-	if (info->flags & ksm_control_flags_run) {
-		if (!info->pages_to_scan) {
-			ret = EPERM;
-			up_write(&kthread_lock);
-			goto out;
-		}
-	}
+	err = strict_strtoul(buf, 10, &usecs);
+	if (err)
+		return 0;
 
-	kthread_sleep = info->sleep;
-	kthread_pages_to_scan = info->pages_to_scan;
-	ksmd_flags = info->flags;
+	/* TODO sanitize usecs */
 
+	down_write(&kthread_lock);
+	kthread_sleep = usecs;
 	up_write(&kthread_lock);
 
-	if (ksmd_flags & ksm_control_flags_run)
-		wake_up_interruptible(&kthread_wait);
-
-out:
-	return ret;
+	return count;
 }
+KSM_ATTR(sleep);
 
-/*
- * ksm_dev_ioctl_get_info_kthread - write into info the scanning information
- * of the ksm kernel thread
- */
-static void ksm_dev_ioctl_get_info_kthread(struct ksm_kthread_info *info)
+static ssize_t pages_to_scan_show(struct kobject *kobj,
+				  struct kobj_attribute *attr, char *buf)
 {
+	unsigned long nr_pages;
+
 	down_read(&kthread_lock);
+	nr_pages = kthread_pages_to_scan;
+	up_read(&kthread_lock);
 
-	info->sleep = kthread_sleep;
-	info->pages_to_scan = kthread_pages_to_scan;
-	info->flags = ksmd_flags;
+	return sprintf(buf, "%lu\n", nr_pages);
+}
 
-	up_read(&kthread_lock);
+static ssize_t pages_to_scan_store(struct kobject *kobj,
+				   struct kobj_attribute *attr,
+				   const char *buf, size_t count)
+{
+	int err;
+	unsigned long nr_pages;
+
+	err = strict_strtoul(buf, 10, &nr_pages);
+	if (err)
+		return 0;
+
+	down_write(&kthread_lock);
+	kthread_pages_to_scan = nr_pages;
+	up_write(&kthread_lock);
+
+	return count;
 }
+KSM_ATTR(pages_to_scan);
 
-static long ksm_dev_ioctl(struct file *filp,
-			  unsigned int ioctl, unsigned long arg)
+static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
+			char *buf)
 {
-	void __user *argp = (void __user *)arg;
-	long r = -EINVAL;
-
-	switch (ioctl) {
-	case KSM_GET_API_VERSION:
-		r = KSM_API_VERSION;
-		break;
-	case KSM_CREATE_SHARED_MEMORY_AREA:
-		r = ksm_dev_ioctl_create_shared_memory_area();
-		break;
-	case KSM_START_STOP_KTHREAD: {
-		struct ksm_kthread_info info;
-
-		r = -EFAULT;
-		if (copy_from_user(&info, argp,
-				   sizeof(struct ksm_kthread_info)))
-			break;
-
-		r = ksm_dev_ioctl_start_stop_kthread(&info);
-		break;
-		}
-	case KSM_GET_INFO_KTHREAD: {
-		struct ksm_kthread_info info;
-
-		ksm_dev_ioctl_get_info_kthread(&info);
-		r = -EFAULT;
-		if (copy_to_user(argp, &info,
-				 sizeof(struct ksm_kthread_info)))
-			break;
-		r = 0;
-		break;
-	}
-	default:
-		break;
-	}
-	return r;
+	unsigned long run;
+
+	down_read(&kthread_lock);
+	run = ksmd_flags;
+	up_read(&kthread_lock);
+
+	return sprintf(buf, "%lu\n", run);
 }
 
-static struct file_operations ksm_chardev_ops = {
-	.unlocked_ioctl = ksm_dev_ioctl,
-	.compat_ioctl   = ksm_dev_ioctl,
-	.owner          = THIS_MODULE,
-};
+static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
+			 const char *buf, size_t count)
+{
+	int err;
+	unsigned long run;
 
-static struct miscdevice ksm_dev = {
-	KSM_MINOR,
-	"ksm",
-	&ksm_chardev_ops,
-};
+	err = strict_strtoul(buf, 10, &run);
+	if (err)
+		return 0;
 
-int kthread_ksm_scan_thread(void *nothing)
+	down_write(&kthread_lock);
+	ksmd_flags = run;
+	up_write(&kthread_lock);
+
+	if (ksmd_flags)
+		wake_up_interruptible(&kthread_wait);
+
+	return count;
+}
+KSM_ATTR(run);
+
+static ssize_t pages_shared_show(struct kobject *kobj,
+				 struct kobj_attribute *attr, char *buf)
 {
-	while (!kthread_should_stop()) {
-		if (ksmd_flags & ksm_control_flags_run) {
-			down_read(&kthread_lock);
-			ksm_scan_start(&kthread_ksm_scan,
-				       kthread_pages_to_scan);
-			up_read(&kthread_lock);
-			schedule_timeout_interruptible(
-					usecs_to_jiffies(kthread_sleep));
-		} else {
-			wait_event_interruptible(kthread_wait,
-					ksmd_flags & ksm_control_flags_run ||
-					kthread_should_stop());
-		}
-	}
-	return 0;
+	return sprintf(buf, "%lu\n", ksm_pages_shared);
 }
+KSM_ATTR_RO(pages_shared);
+
+static struct attribute *ksm_attrs[] = {
+	&sleep_attr.attr,
+	&pages_to_scan_attr.attr,
+	&run_attr.attr,
+	&pages_shared_attr.attr,
+	NULL,
+};
+
+static struct attribute_group ksm_attr_group = {
+	.attrs = ksm_attrs,
+	.name = "ksm",
+};
 
 static int __init ksm_init(void)
 {
@@ -1399,9 +1305,9 @@ static int __init ksm_init(void)
 		goto out_free2;
 	}
 
-	r = misc_register(&ksm_dev);
+	r = sysfs_create_group(mm_kobj, &ksm_attr_group);
 	if (r) {
-		printk(KERN_ERR "ksm: misc device register failed\n");
+		printk(KERN_ERR "ksm: sysfs file creation failed\n");
 		goto out_free3;
 	}
 
@@ -1420,7 +1326,7 @@ out:
 
 static void __exit ksm_exit(void)
 {
-	misc_deregister(&ksm_dev);
+	sysfs_remove_group(mm_kobj, &ksm_attr_group);
 	ksmd_flags = ksm_control_flags_run;
 	kthread_stop(kthread);
 	rmap_hash_free();
diff --git a/mm/madvise.c b/mm/madvise.c
index b9ce574..16bc7fa 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -11,6 +11,7 @@
 #include <linux/mempolicy.h>
 #include <linux/hugetlb.h>
 #include <linux/sched.h>
+#include <linux/ksm.h>
 
 /*
  * Any behaviour which results in changes to the vma->vm_flags needs to
@@ -208,6 +209,18 @@ static long madvise_remove(struct vm_area_struct *vma,
 	return error;
 }
 
+/*
+ * Application allows pages to be shared with other pages of identical
+ * content.
+ *
+ */
+static long madvise_shareable(struct vm_area_struct *vma,
+				struct vm_area_struct **prev,
+				unsigned long start, unsigned long end)
+{
+	return ksm_register_memory(vma, start, end);
+}
+
 static long
 madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
 		unsigned long start, unsigned long end, int behavior)
@@ -238,6 +251,9 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
 		error = madvise_dontneed(vma, prev, start, end);
 		break;
 
+	case MADV_SHAREABLE:
+		error = madvise_shareable(vma, prev, start, end);
+		break;
 	default:
 		error = -EINVAL;
 		break;
@@ -269,6 +285,8 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
  *		so the kernel can free resources associated with it.
  *  MADV_REMOVE - the application wants to free up the given range of
  *		pages and associated backing store.
+ *  MADV_SHAREABLE - the application agrees that pages in the given
+ *		range can be shared w/ other pages of identical content.
  *
  * return values:
  *  zero    - success
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ