Date:	Wed, 21 Mar 2007 14:43:48 -0500
From:	Adam Litke <agl@...ibm.com>
To:	Andrew Morton <akpm@...ux-foundation.org>
Cc:	Arjan van de Ven <arjan@...radead.org>,
	William Lee Irwin III <wli@...omorphy.com>,
	Christoph Hellwig <hch@...radead.org>,
	Ken Chen <kenchen@...gle.com>, linux-mm@...ck.org,
	linux-kernel@...r.kernel.org
Subject: pagetable_ops: Hugetlb character device example

The main reason I am advocating a set of pagetable_operations is to
enable the development of a new hugetlb interface.  During the hugetlb
BOFs at OLS last year, we talked about a character device that would
behave like /dev/zero.  Many people said they just wanted to create
MAP_PRIVATE hugetlb mappings without all the fuss of the hugetlbfs
filesystem.  /dev/zero is a familiar interface for getting anonymous
memory, so bringing that model to huge pages would make programming
with anonymous huge pages easier.
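
To make that concrete, here is a rough sketch of what using such a
device could look like from user space.  The /dev/page-huge path and
the 2MB size are assumptions for illustration only; the actual node
name depends on how the device below ends up being named, and the huge
page size depends on the architecture.

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>

#define HUGE_PAGE_SIZE	(2UL * 1024 * 1024)	/* assumed 2MB huge pages */

int main(void)
{
	/* Open the hypothetical huge page character device */
	int fd = open("/dev/page-huge", O_RDWR);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Private mapping of one huge page, /dev/zero style */
	char *p = mmap(NULL, HUGE_PAGE_SIZE, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	memset(p, 0, HUGE_PAGE_SIZE);	/* fault in and touch the huge page */
	munmap(p, HUGE_PAGE_SIZE);
	close(fd);
	return 0;
}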

The pagetable_operations API opens up possibilities to do some
additional (and completely sane) things.  For example, I have a patch
that alters the character device code below to make use of a hugetlb
ZERO_PAGE.  This eliminates almost all of the up-front fault time,
allowing pages to be COW'ed only when they are first written to.  We
cannot do things like this in hugetlbfs anymore because it has a
complex set of existing semantics to preserve.
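
As a rough sketch (not part of the patch in this mail), the pte_none()
branch of the fault handler below could then look something like this,
where huge_zero_page is a hypothetical shared, pre-zeroed huge page and
reference counting is omitted for brevity:

	if (pte_none(entry)) {
		ret = VM_FAULT_MINOR;
		spin_lock(&mm->page_table_lock);
		if (pte_none(*ptep)) {
			/* Map the shared zero page read-only: no allocation
			 * and no clear_huge_page() at fault time.  The first
			 * write fault then goes through hugetlb_cow(), which
			 * allocates and populates a private huge page. */
			new_entry = make_huge_pte(vma, huge_zero_page, 0);
			set_huge_pte_at(mm, address, ptep, new_entry);
		}
		goto out;
	}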

The following patch is an example of what a simple pagetable_operations
consumer could look like.  It depends on some other cleanups I am
working on (removal of is_file_hugepages(), ...hugetlbfs/inode.c vs.
mm/hugetlb.c separation, etc.), so it is unlikely to apply to any trees
you may have.  I do think it makes a useful illustration of the
legitimate things that can be done with a pagetable_operations
interface.
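
For reference, the hooks used below correspond to an operations
structure roughly along these lines.  This is only a sketch, with the
types inferred from the hugetlb functions assigned to each hook; the
real declaration lives in the pagetable_operations patches themselves
and may differ:

struct pagetable_operations_struct {
	int (*copy_vma)(struct mm_struct *dst, struct mm_struct *src,
			struct vm_area_struct *vma);
	int (*pin_pages)(struct mm_struct *mm, struct vm_area_struct *vma,
			 struct page **pages, struct vm_area_struct **vmas,
			 unsigned long *position, int *length, int i);
	void (*unmap_page_range)(struct vm_area_struct *vma,
				 unsigned long start, unsigned long end);
	void (*change_protection)(struct vm_area_struct *vma,
				  unsigned long address, unsigned long end,
				  pgprot_t newprot);
	void (*free_pgtable_range)(struct mmu_gather **tlb,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling);
	int (*fault)(struct mm_struct *mm, struct vm_area_struct *vma,
		     unsigned long address, int write_access);
};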

commit be72df1c616fb662693a8d4410ce3058f20c71f3
Author: Adam Litke <agl@...ibm.com>
Date:   Tue Feb 13 14:18:21 2007 -0800

diff --git a/drivers/char/Makefile b/drivers/char/Makefile
index fc11063..c5e755b 100644
--- a/drivers/char/Makefile
+++ b/drivers/char/Makefile
@@ -100,6 +100,7 @@ obj-$(CONFIG_IPMI_HANDLER)	+= ipmi/
 
 obj-$(CONFIG_HANGCHECK_TIMER)	+= hangcheck-timer.o
 obj-$(CONFIG_TCG_TPM)		+= tpm/
+obj-$(CONFIG_HUGETLB_PAGE)	+= page.o
 
 # Files generated that shall be removed upon make clean
 clean-files := consolemap_deftbl.c defkeymap.c
diff --git a/drivers/char/page.c b/drivers/char/page.c
new file mode 100644
index 0000000..e903028
--- /dev/null
+++ b/drivers/char/page.c
@@ -0,0 +1,141 @@
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/init.h>
+#include <linux/device.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/hugetlb.h>
+
+static const struct {
+	unsigned int    minor;
+	char            *name;
+	umode_t         mode;
+} devlist[] = {
+	{1, "page-huge", S_IRUGO | S_IWUGO},
+};
+
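+/* Should never be called: faults are handled by pagetable_ops->fault */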
+static struct page *page_nopage(struct vm_area_struct *vma,
+			unsigned long address, int *unused)
+{
+	BUG();
+	return NULL;
+}
+
+static struct vm_operations_struct page_vm_ops = {
+	.nopage	= page_nopage,
+};
+
+static int page_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+			unsigned long address, int write_access)
+{
+	pte_t *ptep;
+	pte_t entry, new_entry;
+	int ret = VM_FAULT_MINOR;
+	static DEFINE_MUTEX(hugetlb_instantiation_mutex);
+
+	ptep = huge_pte_alloc(mm, address);
+	if (!ptep)
+		return VM_FAULT_OOM;
+
+	mutex_lock(&hugetlb_instantiation_mutex);
+	entry = *ptep;
+	if (pte_none(entry)) {
+		struct page *page;
+
+		page = alloc_huge_page(vma, address);
+		if (!page) {
+			/* Don't leak the instantiation mutex on failure */
+			mutex_unlock(&hugetlb_instantiation_mutex);
+			return VM_FAULT_OOM;
+		}
+		clear_huge_page(page, address);
+
+		ret = VM_FAULT_MINOR;
+		spin_lock(&mm->page_table_lock);
+		if (!pte_none(*ptep)) {
+			/* Raced with another thread; drop the unused page */
+			put_page(page);
+			goto out;
+		}
+		add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
+		new_entry = make_huge_pte(vma, page, 0);
+		set_huge_pte_at(mm, address, ptep, new_entry);
+		goto out;
+	}
+
+	spin_lock(&mm->page_table_lock);
+	/* Check for a racing update before calling hugetlb_cow */
+	if (likely(pte_same(entry, *ptep)))
+		if (write_access && !pte_write(entry))
+			ret = hugetlb_cow(mm, vma, address, ptep, entry);
+
+out:
+	spin_unlock(&mm->page_table_lock);
+	mutex_unlock(&hugetlb_instantiation_mutex);
+	return ret;
+}
+
+
+static struct pagetable_operations_struct page_pagetable_ops = {
+	.copy_vma		= copy_hugetlb_page_range,
+	.pin_pages		= follow_hugetlb_page,
+	.unmap_page_range	= unmap_hugepage_range,
+	.change_protection	= hugetlb_change_protection,
+	.free_pgtable_range	= hugetlb_free_pgd_range,
+	.fault			= page_fault,
+};
+
+static int page_mmap(struct file * file, struct vm_area_struct *vma)
+{
+	if (vma->vm_flags & VM_SHARED)
+		return -EINVAL;
+
+	if (vma->vm_pgoff)
+		return -EINVAL;
+
+	if (vma->vm_start & ~HPAGE_MASK)
+		return -EINVAL;
+
+	if (vma->vm_end & ~HPAGE_MASK)
+		return -EINVAL;
+
+	if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
+		return -EINVAL;
+
+	vma->vm_flags |= (VM_HUGETLB | VM_RESERVED);
+	vma->vm_ops = &page_vm_ops;
+	vma->pagetable_ops = &page_pagetable_ops;
+
+	return 0;
+}
+
+const struct file_operations page_file_operations = {
+	.mmap			= page_mmap,
+	.get_unmapped_area	= hugetlb_get_unmapped_area,
+	.prepare_unmapped_area	= prepare_hugepage_range,
+};
+
+static struct class *page_class;
+
+static int __init chr_dev_init(void)
+{
+	int major, i;
+
+	printk("Initializing page devices...");
+	major = register_chrdev(0, "page", &page_file_operations);
+	if (major < 0) {
+		printk("failed\n");
+		return major;
+	}
+	printk("(%i:0)\n", major);
+
+	page_class = class_create(THIS_MODULE, "page");
+	for (i = 0; i < ARRAY_SIZE(devlist); i++)
+		class_device_create(page_class, NULL,
+			MKDEV(major, devlist[i].minor),
+			NULL, devlist[i].name);
+
+	return 0;
+}
+fs_initcall(chr_dev_init);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 4fc0bca..edd4944 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -590,6 +590,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
+	BUG_ON(!has_pt_op(vma, fault));
 	spin_lock(&mm->page_table_lock);
 	while (vaddr < vma->vm_end && remainder) {
 		pte_t *pte;

-- 
Adam Litke - (agl at us.ibm.com)
IBM Linux Technology Center

