Message-Id: <20100907114505.fc40ea3d.kamezawa.hiroyu@jp.fujitsu.com>
Date: Tue, 7 Sep 2010 11:45:05 +0900
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@...fujitsu.com>
To: "linux-mm@...ck.org" <linux-mm@...ck.org>
Cc: "linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
"minchan.kim@...il.com" <minchan.kim@...il.com>,
Mel Gorman <mel@....ul.ie>,
"kosaki.motohiro@...fujitsu.com" <kosaki.motohiro@...fujitsu.com>
Subject: [RFC][PATCH] big continuous memory allocator v2
This is a page allocator based on the memory migration/hotplug code.
It has passed some small tests and is hopefully easier to read than the previous version.
==
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@...fujitsu.com>
This patch adds a memory allocator for contiguous memory larger than MAX_ORDER.
alloc_contig_pages(hint, size, node);
This function allocates 'size' contiguous pages whose physical addresses are
above 'hint' and which reside on 'node'. 'hint' is specified as a pfn and
'size' as a number of pages.
Each allocated page has page_count() set to 1.
The return value is the first page of the allocated range.
free_contig_pages(page, nr_pages)
This frees 'nr_pages' pages starting at 'page'.
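For illustration only (not part of this patch; the driver function names and
the buffer size below are made up), a caller might use the interface roughly
like this:

        /*
         * Hypothetical caller. Only alloc_contig_pages() and
         * free_contig_pages() come from this patch; everything else
         * here is invented for the example.
         */
        #include <linux/mm.h>
        #include <linux/page-isolation.h>

        #define MY_NR_PAGES     (4 * MAX_ORDER_NR_PAGES)   /* > MAX_ORDER */

        static struct page *my_buf;

        static int my_driver_grab_buffer(void)
        {
                /* hint = 0: search from the start of memory, node = -1: any node */
                my_buf = alloc_contig_pages(0, MY_NR_PAGES, -1);
                if (!my_buf)
                        return -ENOMEM; /* NULL is an ordinary, expected outcome */
                return 0;
        }

        static void my_driver_put_buffer(void)
        {
                free_contig_pages(my_buf, MY_NR_PAGES);
        }

Note that the allocation path migrates pages and may sleep for a long time,
so it must not be called from atomic context.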
This patch does the following:
- finds an area which can be ISOLATED, skipping memory holes.
- migrates LRU pages out of the area.
- steals the chunk of pages from the allocator.
Most of the code is for detecting a candidate range for the allocation.
The migration/isolation parts reuse the memory hotplug code.
This is fully experimental and written as an example.
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@...fujitsu.com>
---
include/linux/page-isolation.h | 9 +
mm/memory_hotplug.c | 86 -----------
mm/page_alloc.c | 28 +++
mm/page_isolation.c | 301 +++++++++++++++++++++++++++++++++++++++++
4 files changed, 340 insertions(+), 84 deletions(-)
Index: kametest/mm/page_isolation.c
===================================================================
--- kametest.orig/mm/page_isolation.c
+++ kametest/mm/page_isolation.c
@@ -3,8 +3,11 @@
*/
#include <linux/mm.h>
+#include <linux/swap.h>
#include <linux/page-isolation.h>
#include <linux/pageblock-flags.h>
+#include <linux/mm_inline.h>
+#include <linux/migrate.h>
#include "internal.h"
static inline struct page *
@@ -140,3 +143,301 @@ int test_pages_isolated(unsigned long st
spin_unlock_irqrestore(&zone->lock, flags);
return ret ? 0 : -EBUSY;
}
+
+#define MIGRATION_RETRY (5)
+
+/*
+ * Scanning pfns is much easier than scanning the LRU list.
+ * Scan pfns from start to end and return the pfn of the first LRU page,
+ * or 'end' if none is found.
+ */
+unsigned long scan_lru_pages(unsigned long start, unsigned long end)
+{
+ unsigned long pfn;
+ struct page *page;
+
+ for (pfn = start; pfn < end; pfn++) {
+ if (pfn_valid(pfn)) {
+ page = pfn_to_page(pfn);
+ if (PageLRU(page))
+ return pfn;
+ }
+ }
+ return pfn;
+}
+
+/* Migrate all LRU pages in the range to somewhere else */
+static struct page *
+hotremove_migrate_alloc(struct page *page, unsigned long private, int **x)
+{
+ /* This should be improooooved!! */
+ return alloc_page(GFP_HIGHUSER_MOVABLE);
+}
+
+#define NR_MOVE_AT_ONCE_PAGES (256)
+int do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
+{
+ unsigned long pfn;
+ struct page *page;
+ int move_pages = NR_MOVE_AT_ONCE_PAGES;
+ int not_managed = 0;
+ int ret = 0;
+ LIST_HEAD(source);
+
+ for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) {
+ if (!pfn_valid(pfn))
+ continue;
+ page = pfn_to_page(pfn);
+ if (!page_count(page))
+ continue;
+ /*
+ * We can skip free pages. And we can only deal with pages on
+ * LRU.
+ */
+ ret = isolate_lru_page(page);
+ if (!ret) { /* Success */
+ list_add_tail(&page->lru, &source);
+ move_pages--;
+ inc_zone_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_cache(page));
+
+ } else {
+ /* Because we don't hold the big zone->lock, we should
+ check this again here. */
+ if (page_count(page))
+ not_managed++;
+#ifdef CONFIG_DEBUG_VM
+ printk(KERN_ALERT "removing pfn %lx from LRU failed\n",
+ pfn);
+ dump_page(page);
+#endif
+ }
+ }
+ ret = -EBUSY;
+ if (not_managed) {
+ if (!list_empty(&source))
+ putback_lru_pages(&source);
+ goto out;
+ }
+ ret = 0;
+ if (list_empty(&source))
+ goto out;
+ /* this function returns # of failed pages */
+ ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1);
+
+out:
+ return ret;
+}
+
+
+/*
+ * An interface for isolating pages of a specified size and range.
+ * The purpose is to return contiguous free pages larger than MAX_ORDER.
+ * The code below is very slow and sleeps; never call it from
+ * performance-critical code.
+ */
+
+struct page_range {
+ unsigned long base, end, pages;
+};
+
+static inline unsigned long MAX_O_ALIGN(unsigned long x) {
+ return ALIGN(x, MAX_ORDER_NR_PAGES);
+}
+
+static inline unsigned long MAX_O_BASE(unsigned long x) {
+ return (x & ~(MAX_ORDER_NR_PAGES - 1));
+}
+
+int __get_contig_block(unsigned long pfn, unsigned long nr_pages, void *arg)
+{
+ struct page_range *blockinfo = arg;
+ unsigned long end;
+
+ end = pfn + nr_pages;
+ pfn = MAX_O_ALIGN(pfn);
+ end = MAX_O_BASE(end);
+ if (end < pfn)
+ return 0;
+ if (end - pfn >= blockinfo->pages) {
+ blockinfo->base = pfn;
+ blockinfo->end = end;
+ return 1;
+ }
+ return 0;
+}
+
+static void __trim_zone(struct page_range *range)
+{
+ struct zone *zone;
+ unsigned long pfn;
+ /*
+ * In most cases, each zone's [start_pfn, end_pfn) does not overlap
+ * with any other zone's. But some arches allow overlap, and we need
+ * to check for it here.
+ */
+ for (pfn = range->base, zone = page_zone(pfn_to_page(pfn));
+ pfn < range->end;
+ pfn += MAX_ORDER_NR_PAGES) {
+
+ if (zone != page_zone(pfn_to_page(pfn)))
+ break;
+ }
+ range->end = min(pfn, range->end);
+ return;
+}
+static unsigned long __find_contig_block(unsigned long base,
+ unsigned long end, unsigned long pages)
+{
+ unsigned long pfn;
+ struct page_range blockinfo;
+ int ret;
+
+ /* Skip memory holes */
+retry:
+ blockinfo.base = base;
+ blockinfo.end = end;
+ blockinfo.pages = pages;
+ /*
+ * returns a contiguous page range within [base, end) which is
+ * at least 'pages' pages long.
+ */
+ ret = walk_system_ram_range(base, end - base, &blockinfo,
+ __get_contig_block);
+ if (!ret)
+ return 0;
+
+ __trim_zone(&blockinfo);
+ /* OK, we found a contiguous memory chunk of sufficient size. Isolate it. */
+ for (pfn = blockinfo.base; pfn + pages < blockinfo.end;
+ pfn += MAX_ORDER_NR_PAGES) {
+ /*
+ * Now we know [base, end) is one contiguous chunk, so we
+ * don't need to worry about memory holes here.
+ */
+ if (!start_isolate_page_range(pfn, pfn + pages))
+ return pfn;
+ }
+ /* failed */
+ if (blockinfo.end + pages < end) {
+ /* Move base address and find the next block of RAM. */
+ base = blockinfo.end;
+ goto retry;
+ }
+ return 0;
+}
+
+/**
+ * alloc_contig_pages - allocate contiguous physical pages
+ * @hint: the lowest pfn at which to start searching for free space
+ * @size: size of the requested area (in # of pages)
+ * @node: the node to allocate memory from. If -1, any node is used.
+ *
+ * Searches the physical memory map for an area of @size pages and checks
+ * whether contiguous free space can be created there. If it seems possible,
+ * tries to create the contiguous space with page migration.
+ *
+ * Returns the first page of the contiguous block, or NULL on failure. Each
+ * page in the area has page_count() set to 1. Because this function does
+ * page migration, it is very heavy and may sleep for some time. Callers
+ * must be aware that a NULL return is not a special case.
+ *
+ * For now, the returned range is aligned to MAX_ORDER.
+ */
+
+struct page *alloc_contig_pages(unsigned long hint,
+ unsigned long size, int node)
+{
+ unsigned long base, found, end, pages, start;
+ struct page *ret = NULL;
+ int migration_failed;
+ struct zone *zone;
+
+ hint = MAX_O_ALIGN(hint);
+ /* the request size is rounded up to MAX_ORDER alignment */
+ pages = MAX_O_ALIGN(size);
+ found = 0;
+retry:
+ for_each_populated_zone(zone) {
+ unsigned long zone_end_pfn;
+
+ if (node >= 0 && node != zone_to_nid(zone))
+ continue;
+ if (zone->present_pages < pages)
+ continue;
+ base = MAX_O_ALIGN(zone->zone_start_pfn);
+ base = max(base, hint);
+ zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
+ if (base + pages > zone_end_pfn)
+ continue;
+ found = __find_contig_block(base, zone_end_pfn, pages);
+ /* Next try will see the next block. */
+ hint = base + MAX_ORDER_NR_PAGES;
+ if (found)
+ break;
+ }
+
+ if (!found)
+ goto out;
+ /*
+ * OK, at this point we have contiguous pageblocks marked as ISOLATE;
+ * try migration.
+ *
+ * FIXME: the logic for detecting permanent migration failure is not
+ * very precise.
+ */
+ end = found + pages;
+ /* scan_lru_pages() finds the next PG_lru page in the range */
+ for (start = scan_lru_pages(found, end), migration_failed = 0;
+ start < end;
+ start = scan_lru_pages(start, end)) {
+ if (do_migrate_range(start, end)) {
+ /* would it be better to try another block? */
+ if (++migration_failed >= MIGRATION_RETRY)
+ break;
+ /* take a rest and synchronize LRU etc. */
+ lru_add_drain_all();
+ flush_scheduled_work();
+ cond_resched();
+ drain_all_pages();
+ } else /* reset migration_failure counter */
+ migration_failed = 0;
+ }
+
+ lru_add_drain_all();
+ flush_scheduled_work();
+ drain_all_pages();
+ /* Check all pages are isolated */
+ if (test_pages_isolated(found, end)) {
+ undo_isolate_page_range(found, pages);
+ /* We failed at [start...???) migration. */
+ hint = MAX_O_ALIGN(start + 1);
+ goto retry; /* goto next chunk */
+ }
+ /*
+ * OK, at this point [found, found + pages) is isolated and free.
+ * alloc_contig_freed_pages() removes the pages from the buddy free
+ * lists and sets page_count() = 1 on each of them.
+ */
+ ret = pfn_to_page(found);
+ alloc_contig_freed_pages(found, found + pages);
+ /* unset ISOLATE */
+ undo_isolate_page_range(found, pages);
+ /* Free the unnecessary tail pages */
+ for (start = found + size; start < found + pages; start++)
+ __free_page(pfn_to_page(start));
+out:
+ return ret;
+
+}
+
+
+void free_contig_pages(struct page *page, int nr_pages)
+{
+ int i;
+ for (i = 0; i < nr_pages; i++)
+ __free_page(page + i);
+}
+
+EXPORT_SYMBOL_GPL(alloc_contig_pages);
+EXPORT_SYMBOL_GPL(free_contig_pages);
Index: kametest/include/linux/page-isolation.h
===================================================================
--- kametest.orig/include/linux/page-isolation.h
+++ kametest/include/linux/page-isolation.h
@@ -33,5 +33,14 @@ test_pages_isolated(unsigned long start_
extern int set_migratetype_isolate(struct page *page);
extern void unset_migratetype_isolate(struct page *page);
+/* For contiguous memory alloc */
+extern int do_migrate_range(unsigned long start_pfn, unsigned long end_pfn);
+extern void alloc_contig_freed_pages(unsigned long pfn, unsigned long end);
+extern unsigned long scan_lru_pages(unsigned long start, unsigned long end);
+
+
+extern struct page *alloc_contig_pages(unsigned long hint,
+ unsigned long size, int node);
+extern void free_contig_pages(struct page *page, int nr_pages);
#endif
Index: kametest/mm/memory_hotplug.c
===================================================================
--- kametest.orig/mm/memory_hotplug.c
+++ kametest/mm/memory_hotplug.c
@@ -568,7 +568,7 @@ out:
}
EXPORT_SYMBOL_GPL(add_memory);
-#ifdef CONFIG_MEMORY_HOTREMOVE
+#if defined(CONFIG_MEMORY_HOTREMOVE) || defined(CONFIG_CONTIG_ALLOC)
/*
* A free page on the buddy free lists (not the per-cpu lists) has PageBuddy
* set and the size of the free page is given by page_order(). Using this,
@@ -643,87 +643,6 @@ static int test_pages_in_a_zone(unsigned
}
/*
- * Scanning pfn is much easier than scanning lru list.
- * Scan pfn from start to end and Find LRU page.
- */
-int scan_lru_pages(unsigned long start, unsigned long end)
-{
- unsigned long pfn;
- struct page *page;
- for (pfn = start; pfn < end; pfn++) {
- if (pfn_valid(pfn)) {
- page = pfn_to_page(pfn);
- if (PageLRU(page))
- return pfn;
- }
- }
- return 0;
-}
-
-static struct page *
-hotremove_migrate_alloc(struct page *page, unsigned long private, int **x)
-{
- /* This should be improooooved!! */
- return alloc_page(GFP_HIGHUSER_MOVABLE);
-}
-
-#define NR_OFFLINE_AT_ONCE_PAGES (256)
-static int
-do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
-{
- unsigned long pfn;
- struct page *page;
- int move_pages = NR_OFFLINE_AT_ONCE_PAGES;
- int not_managed = 0;
- int ret = 0;
- LIST_HEAD(source);
-
- for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) {
- if (!pfn_valid(pfn))
- continue;
- page = pfn_to_page(pfn);
- if (!page_count(page))
- continue;
- /*
- * We can skip free pages. And we can only deal with pages on
- * LRU.
- */
- ret = isolate_lru_page(page);
- if (!ret) { /* Success */
- list_add_tail(&page->lru, &source);
- move_pages--;
- inc_zone_page_state(page, NR_ISOLATED_ANON +
- page_is_file_cache(page));
-
- } else {
- /* Becasue we don't have big zone->lock. we should
- check this again here. */
- if (page_count(page))
- not_managed++;
-#ifdef CONFIG_DEBUG_VM
- printk(KERN_ALERT "removing pfn %lx from LRU failed\n",
- pfn);
- dump_page(page);
-#endif
- }
- }
- ret = -EBUSY;
- if (not_managed) {
- if (!list_empty(&source))
- putback_lru_pages(&source);
- goto out;
- }
- ret = 0;
- if (list_empty(&source))
- goto out;
- /* this function returns # of failed pages */
- ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1);
-
-out:
- return ret;
-}
-
-/*
* remove from free_area[] and mark all as Reserved.
*/
static int
@@ -740,7 +659,6 @@ offline_isolated_pages(unsigned long sta
walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL,
offline_isolated_pages_cb);
}
-
/*
* Check all pages in range, recoreded as memory resource, are isolated.
*/
@@ -833,7 +751,7 @@ repeat:
}
pfn = scan_lru_pages(start_pfn, end_pfn);
- if (pfn) { /* We have page on LRU */
+ if (pfn != end_pfn) { /* We have page on LRU */
ret = do_migrate_range(pfn, end_pfn);
if (!ret) {
drain = 1;
Index: kametest/mm/page_alloc.c
===================================================================
--- kametest.orig/mm/page_alloc.c
+++ kametest/mm/page_alloc.c
@@ -5401,6 +5401,34 @@ out:
spin_unlock_irqrestore(&zone->lock, flags);
}
+void alloc_contig_freed_pages(unsigned long pfn, unsigned long end)
+{
+ struct page *page;
+ struct zone *zone;
+ int order;
+ unsigned long start = pfn;
+
+ zone = page_zone(pfn_to_page(pfn));
+ spin_lock_irq(&zone->lock);
+ while (pfn < end) {
+ VM_BUG_ON(!pfn_valid(pfn));
+ page = pfn_to_page(pfn);
+ VM_BUG_ON(page_count(page));
+ VM_BUG_ON(!PageBuddy(page));
+ list_del(&page->lru);
+ order = page_order(page);
+ zone->free_area[order].nr_free--;
+ rmv_page_order(page);
+ __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order));
+ pfn += 1 << order;
+ }
+ spin_unlock_irq(&zone->lock);
+
+ /* After this, pages in the range can be freed one by one */
+ for (pfn = start; pfn < end; pfn++)
+ prep_new_page(pfn_to_page(pfn), 0, 0);
+}
+
#ifdef CONFIG_MEMORY_HOTREMOVE
/*
* All pages in the range must be isolated before calling this.