Message-Id: <20130501175012.7229.24490.sendpatchset@srdronam.in.ibm.com>
Date:	Wed, 01 May 2013 23:20:12 +0530
From:	Srikar Dronamraju <srikar@...ux.vnet.ibm.com>
To:	Ingo Molnar <mingo@...nel.org>,
	Andrea Arcangeli <aarcange@...hat.com>
Cc:	Mel Gorman <mgorman@...e.de>,
	Peter Zijlstra <a.p.zijlstra@...llo.nl>,
	Rik van Riel <riel@...hat.com>,
	LKML <linux-kernel@...r.kernel.org>
Subject: [PATCH 1/2] numa: Track last pid accessing a page.

From: Srikar Dronamraju <srikar@...ux.vnet.ibm.com>
Date: Tue, 30 Apr 2013 01:18:08 -0500
Subject: [PATCH 1/2] numa: Track last pid accessing a page.

This change is mostly extracted from commit ff2a9f9 ("numa, mm, sched:
Implement last-CPU+PID hash tracking") in tip/numa/core.

We rely on the page::last_nid field (embedded in the remaining bits of
the page-flags field) to drive NUMA placement: it tells us from which
node a page was last accessed. Extending it to last_nidpid also records
which task performed that access.
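
For illustration, the nid+pid packing that the new helpers implement
can be tried in user space. This is a minimal sketch assuming the
8-bit pid field added by this patch; the 10-bit nid width merely
stands in for NODES_SHIFT and the program itself is hypothetical:

  /* nidpid-sketch.c: illustrate the nid+pid packing */
  #include <stdio.h>

  #define NIDPID_PID_BITS   8
  #define NIDPID_PID_MASK   ((1 << NIDPID_PID_BITS) - 1)
  #define NIDPID_NID_BITS   10   /* stand-in for NODES_SHIFT */
  #define NIDPID_NID_MASK   ((1 << NIDPID_NID_BITS) - 1)

  static int nid_pid_to_nidpid(int nid, int pid)
  {
          return ((nid & NIDPID_NID_MASK) << NIDPID_PID_BITS) |
                 (pid & NIDPID_PID_MASK);
  }

  static int nidpid_to_nid(int nidpid)
  {
          return (nidpid >> NIDPID_PID_BITS) & NIDPID_NID_MASK;
  }

  static int nidpid_to_pid(int nidpid)
  {
          return nidpid & NIDPID_PID_MASK;
  }

  int main(void)
  {
          int nidpid = nid_pid_to_nidpid(2, 1234);  /* node 2, pid 1234 */

          /* Only the low 8 pid bits survive: 1234 & 255 == 210 */
          printf("nidpid=0x%x nid=%d pid=%d\n",
                 nidpid, nidpid_to_nid(nidpid), nidpid_to_pid(nidpid));
          return 0;
  }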

Consider a page that is mostly private, i.e. accessed mostly by a
single task. If that task is moved to a different node, migrate the
page on its first access from the new node.
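
Continuing the sketch above, the fault-side decision amounts to roughly
the following. This is illustrative pseudocode of the mempolicy.c
change further below, not the exact kernel code; should_migrate() is a
made-up helper:

  /* Should the page follow the faulting task to this_nid? */
  static int should_migrate(int last_nidpid, int this_nid, int this_pid)
  {
          int last_pid = nidpid_to_pid(last_nidpid);
          int last_nid = nidpid_to_nid(last_nidpid);

          /* Private page: the same (truncated) pid touched it last,
           * or the last access already came from the target node. */
          return last_pid == (this_pid & NIDPID_PID_MASK) ||
                 last_nid == this_nid;
  }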

The cost is 8 more bits of the page flags - this space is still
available on 64-bit systems.

There is a potential for false sharing if the PIDs of two tasks are
equal modulo 256 - this degrades the statistics somewhat but does not
invalidate them. Related tasks are typically launched close together
in time, so their PIDs are close and such collisions between them are
unlikely.
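
For example (assumed pid values, purely illustrative): pids 1000 and
1256 collide, since 1000 & 255 == 1256 & 255 == 232, so their accesses
are attributed to the same pid slot; pids 1000 and 1001, typical for
tasks started together, remain distinct (232 vs 233).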

Cc: Peter Zijlstra <a.p.zijlstra@...llo.nl>
Cc: Andrea Arcangeli <aarcange@...hat.com>
Cc: Rik van Riel <riel@...hat.com>
Cc: Mel Gorman <mgorman@...e.de>
Originally-from: Ingo Molnar <mingo@...nel.org>
Signed-off-by: Srikar Dronamraju <srikar@...ux.vnet.ibm.com>
---
 include/linux/mm.h                |   72 ++++++++++++++++++++++++-------------
 include/linux/mm_types.h          |    4 +-
 include/linux/page-flags-layout.h |   25 ++++++++-----
 mm/huge_memory.c                  |    2 +-
 mm/memory.c                       |    4 +-
 mm/mempolicy.c                    |   20 ++++++++---
 mm/migrate.c                      |    4 +-
 mm/mm_init.c                      |   10 +++---
 mm/mmzone.c                       |   14 ++++----
 mm/page_alloc.c                   |    4 +-
 10 files changed, 99 insertions(+), 60 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index e2091b8..2e3a3db 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -582,11 +582,11 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
  * sets it, so none of the operations on it need to be atomic.
  */
 
-/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_NID] | ... | FLAGS | */
+/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_NIDPID] | ... | FLAGS | */
 #define SECTIONS_PGOFF		((sizeof(unsigned long)*8) - SECTIONS_WIDTH)
 #define NODES_PGOFF		(SECTIONS_PGOFF - NODES_WIDTH)
 #define ZONES_PGOFF		(NODES_PGOFF - ZONES_WIDTH)
-#define LAST_NID_PGOFF		(ZONES_PGOFF - LAST_NID_WIDTH)
+#define LAST_NIDPID_PGOFF	(ZONES_PGOFF - LAST_NIDPID_WIDTH)
 
 /*
  * Define the bit shifts to access each section.  For non-existent
@@ -596,7 +596,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
 #define SECTIONS_PGSHIFT	(SECTIONS_PGOFF * (SECTIONS_WIDTH != 0))
 #define NODES_PGSHIFT		(NODES_PGOFF * (NODES_WIDTH != 0))
 #define ZONES_PGSHIFT		(ZONES_PGOFF * (ZONES_WIDTH != 0))
-#define LAST_NID_PGSHIFT	(LAST_NID_PGOFF * (LAST_NID_WIDTH != 0))
+#define LAST_NIDPID_PGSHIFT	(LAST_NIDPID_PGOFF * (LAST_NIDPID_WIDTH != 0))
 
 /* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */
 #ifdef NODE_NOT_IN_PAGE_FLAGS
@@ -618,7 +618,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
 #define ZONES_MASK		((1UL << ZONES_WIDTH) - 1)
 #define NODES_MASK		((1UL << NODES_WIDTH) - 1)
 #define SECTIONS_MASK		((1UL << SECTIONS_WIDTH) - 1)
-#define LAST_NID_MASK		((1UL << LAST_NID_WIDTH) - 1)
+#define LAST_NIDPID_MASK	((1UL << LAST_NIDPID_WIDTH) - 1)
 #define ZONEID_MASK		((1UL << ZONEID_SHIFT) - 1)
 
 static inline enum zone_type page_zonenum(const struct page *page)
@@ -662,51 +662,73 @@ static inline int page_to_nid(const struct page *page)
 #endif
 
 #ifdef CONFIG_NUMA_BALANCING
-#ifdef LAST_NID_NOT_IN_PAGE_FLAGS
-static inline int page_nid_xchg_last(struct page *page, int nid)
+
+static inline int nidpid_to_nid(int nidpid)
 {
-	return xchg(&page->_last_nid, nid);
+	return (nidpid >> NIDPID_PID_BITS) & NIDPID_NID_MASK;
 }
 
-static inline int page_nid_last(struct page *page)
+static inline int nidpid_to_pid(int nidpid)
 {
-	return page->_last_nid;
+	return nidpid & NIDPID_PID_MASK;
 }
-static inline void page_nid_reset_last(struct page *page)
+
+static inline int nid_pid_to_nidpid(int nid, int pid)
 {
-	page->_last_nid = -1;
+	return ((nid & NIDPID_NID_MASK) << NIDPID_PID_BITS) | (pid & NIDPID_PID_MASK);
 }
-#else
-static inline int page_nid_last(struct page *page)
+
+#ifdef LAST_NIDPID_NOT_IN_PAGE_FLAGS
+static inline int page_xchg_last_nidpid(struct page *page, int nidpid)
 {
-	return (page->flags >> LAST_NID_PGSHIFT) & LAST_NID_MASK;
+	return xchg(&page->_last_nidpid, nidpid);
 }
 
-extern int page_nid_xchg_last(struct page *page, int nid);
-
-static inline void page_nid_reset_last(struct page *page)
+static inline int page_last_nidpid(struct page *page)
 {
-	int nid = (1 << LAST_NID_SHIFT) - 1;
+	return page->_last_nidpid;
+}
 
-	page->flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT);
-	page->flags |= (nid & LAST_NID_MASK) << LAST_NID_PGSHIFT;
+static inline void reset_page_last_nidpid(struct page *page)
+{
+	page->_last_nidpid = -1;
 }
-#endif /* LAST_NID_NOT_IN_PAGE_FLAGS */
+
 #else
-static inline int page_nid_xchg_last(struct page *page, int nid)
+
+extern int page_xchg_last_nidpid(struct page *page, int nidpid);
+static inline int page_last_nidpid(struct page *page)
+{
+	return (page->flags >> LAST_NIDPID_PGSHIFT) & LAST_NIDPID_MASK;
+}
+
+static inline void reset_page_last_nidpid(struct page *page)
+{
+	page_xchg_last_nidpid(page, -1);
+}
+#endif /* LAST_NIDPID_NOT_IN_PAGE_FLAGS */
+
+static inline int page_last_pid(struct page *page)
+{
+	return nidpid_to_pid(page_last_nidpid(page));
+}
+
+#else /* !CONFIG_NUMA_BALANCING: */
+static inline int page_xchg_last_nidpid(struct page *page, int nidpid)
 {
 	return page_to_nid(page);
 }
 
-static inline int page_nid_last(struct page *page)
+static inline int page_last_nidpid(struct page *page)
 {
 	return page_to_nid(page);
 }
 
-static inline void page_nid_reset_last(struct page *page)
+static inline void reset_page_last_nidpid(struct page *page)
 {
 }
-#endif
+
+#endif /* !CONFIG_NUMA_BALANCING */
 
 static inline struct zone *page_zone(const struct page *page)
 {
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index ace9a5f..ccb20b9 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -174,8 +174,8 @@ struct page {
 	void *shadow;
 #endif
 
-#ifdef LAST_NID_NOT_IN_PAGE_FLAGS
-	int _last_nid;
+#ifdef LAST_NIDPID_NOT_IN_PAGE_FLAGS
+	int _last_nidpid;
 #endif
 }
 /*
diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h
index 93506a1..c17279a 100644
--- a/include/linux/page-flags-layout.h
+++ b/include/linux/page-flags-layout.h
@@ -39,9 +39,9 @@
  * lookup is necessary.
  *
  * No sparsemem or sparsemem vmemmap: |       NODE     | ZONE |          ... | FLAGS |
- *         " plus space for last_nid: |       NODE     | ZONE | LAST_NID ... | FLAGS |
+ *         " plus space for last_nid: |       NODE     | ZONE | LAST_NIDPID ... | FLAGS |
  * classic sparse with space for node:| SECTION | NODE | ZONE |          ... | FLAGS |
- *         " plus space for last_nid: | SECTION | NODE | ZONE | LAST_NID ... | FLAGS |
+ *         " plus space for last_nid: | SECTION | NODE | ZONE | LAST_NIDPID ... | FLAGS |
  * classic sparse no space for node:  | SECTION |     ZONE    | ... | FLAGS |
  */
 #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
@@ -61,16 +61,23 @@
 #define NODES_WIDTH		0
 #endif
 
+/* Reduce false sharing: */
+#define NIDPID_PID_BITS		8
+#define NIDPID_PID_MASK		((1 << NIDPID_PID_BITS)-1)
+
+#define NIDPID_NID_BITS		NODES_SHIFT
+#define NIDPID_NID_MASK		((1 << NIDPID_NID_BITS)-1)
+
 #ifdef CONFIG_NUMA_BALANCING
-#define LAST_NID_SHIFT NODES_SHIFT
+# define LAST_NIDPID_SHIFT	(NIDPID_NID_BITS+NIDPID_PID_BITS)
 #else
-#define LAST_NID_SHIFT 0
+# define LAST_NIDPID_SHIFT	0
 #endif
 
-#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT+LAST_NID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
-#define LAST_NID_WIDTH LAST_NID_SHIFT
+#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT+LAST_NIDPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
+# define LAST_NIDPID_WIDTH	LAST_NIDPID_SHIFT
 #else
-#define LAST_NID_WIDTH 0
+# define LAST_NIDPID_WIDTH	0
 #endif
 
 /*
@@ -81,8 +88,8 @@
 #define NODE_NOT_IN_PAGE_FLAGS
 #endif
 
-#if defined(CONFIG_NUMA_BALANCING) && LAST_NID_WIDTH == 0
-#define LAST_NID_NOT_IN_PAGE_FLAGS
+#if defined(CONFIG_NUMA_BALANCING) && LAST_NIDPID_WIDTH == 0
+# define LAST_NIDPID_NOT_IN_PAGE_FLAGS
 #endif
 
 #endif /* _LINUX_PAGE_FLAGS_LAYOUT */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e2f7f5a..798297a 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1639,7 +1639,7 @@ static void __split_huge_page_refcount(struct page *page)
 		page_tail->mapping = page->mapping;
 
 		page_tail->index = page->index + i;
-		page_nid_xchg_last(page_tail, page_nid_last(page));
+		page_xchg_last_nidpid(page_tail, page_last_nidpid(page));
 
 		BUG_ON(!PageAnon(page_tail));
 		BUG_ON(!PageUptodate(page_tail));
diff --git a/mm/memory.c b/mm/memory.c
index ba94dec..e819b3e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -69,8 +69,8 @@
 
 #include "internal.h"
 
-#ifdef LAST_NID_NOT_IN_PAGE_FLAGS
-#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_nid.
+#ifdef LAST_NIDPID_NOT_IN_PAGE_FLAGS
+#warning Unfortunate NUMA config, growing page-frame for last_nidpid.
 #endif
 
 #ifndef CONFIG_NEED_MULTIPLE_NODES
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 7431001..4aa64dd 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2286,11 +2286,13 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long 
 		BUG();
 	}
 
+#ifdef CONFIG_NUMA_BALANCING
 	/* Migrate the page towards the node whose CPU is referencing it */
 	if (pol->flags & MPOL_F_MORON) {
-		int last_nid;
+		int last_nidpid, this_nidpid;
 
 		polnid = numa_node_id();
+		this_nidpid = nid_pid_to_nidpid(polnid, current->pid);
 
 		/*
 		 * Multi-stage node selection is used in conjunction
@@ -2313,11 +2315,19 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long 
 		 * it less likely we act on an unlikely task<->page
 		 * relation.
 		 */
-		last_nid = page_nid_xchg_last(page, polnid);
-		if (last_nid != polnid)
-			goto out;
+		last_nidpid = page_xchg_last_nidpid(page, this_nidpid);
+		if (curnid != polnid) {
+			int last_pid = nidpid_to_pid(last_nidpid);
+			int this_pid = current->pid & NIDPID_PID_MASK;
+
+			/* Freshly allocated pages not accessed by anyone else yet: */
+			if (last_pid == this_pid || last_pid == -1 ||
+					(nidpid_to_nid(last_nidpid) == polnid))
+				ret = polnid;
+		}
+		goto out;
 	}
-
+#endif
 	if (curnid != polnid)
 		ret = polnid;
 out:
diff --git a/mm/migrate.c b/mm/migrate.c
index 3bbaf5d..74fcd76 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1478,7 +1478,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
 					  __GFP_NOWARN) &
 					 ~GFP_IOFS, 0);
 	if (newpage)
-		page_nid_xchg_last(newpage, page_nid_last(page));
+		page_xchg_last_nidpid(newpage, page_last_nidpid(page));
 
 	return newpage;
 }
@@ -1660,7 +1660,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 	if (!new_page)
 		goto out_fail;
 
-	page_nid_xchg_last(new_page, page_nid_last(page));
+	page_xchg_last_nidpid(new_page, page_last_nidpid(page));
 
 	isolated = numamigrate_isolate_page(pgdat, page);
 	if (!isolated) {
diff --git a/mm/mm_init.c b/mm/mm_init.c
index c280a02..0a0c0d3 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -69,26 +69,26 @@ void __init mminit_verify_pageflags_layout(void)
 	unsigned long or_mask, add_mask;
 
 	shift = 8 * sizeof(unsigned long);
-	width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_NID_SHIFT;
+	width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_NIDPID_SHIFT;
 	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
 		"Section %d Node %d Zone %d Lastnid %d Flags %d\n",
 		SECTIONS_WIDTH,
 		NODES_WIDTH,
 		ZONES_WIDTH,
-		LAST_NID_WIDTH,
+		LAST_NIDPID_WIDTH,
 		NR_PAGEFLAGS);
 	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
 		"Section %d Node %d Zone %d Lastnid %d\n",
 		SECTIONS_SHIFT,
 		NODES_SHIFT,
 		ZONES_SHIFT,
-		LAST_NID_SHIFT);
+		LAST_NIDPID_SHIFT);
 	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts",
 		"Section %lu Node %lu Zone %lu Lastnid %lu\n",
 		(unsigned long)SECTIONS_PGSHIFT,
 		(unsigned long)NODES_PGSHIFT,
 		(unsigned long)ZONES_PGSHIFT,
-		(unsigned long)LAST_NID_PGSHIFT);
+		(unsigned long)LAST_NIDPID_PGSHIFT);
 	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid",
 		"Node/Zone ID: %lu -> %lu\n",
 		(unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT),
@@ -100,7 +100,7 @@ void __init mminit_verify_pageflags_layout(void)
 	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
 		"Node not in page flags");
 #endif
-#ifdef LAST_NID_NOT_IN_PAGE_FLAGS
+#ifdef LAST_NIDPID_NOT_IN_PAGE_FLAGS
 	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
 		"Last nid not in page flags");
 #endif
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 2ac0afb..a9958a1 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -97,20 +97,20 @@ void lruvec_init(struct lruvec *lruvec)
 		INIT_LIST_HEAD(&lruvec->lists[lru]);
 }
 
-#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_NID_NOT_IN_PAGE_FLAGS)
-int page_nid_xchg_last(struct page *page, int nid)
+#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_NIDPID_NOT_IN_PAGE_FLAGS)
+int page_xchg_last_nidpid(struct page *page, int nidpid)
 {
 	unsigned long old_flags, flags;
-	int last_nid;
+	int last_nidpid;
 
 	do {
 		old_flags = flags = page->flags;
-		last_nid = page_nid_last(page);
+		last_nidpid = (flags >> LAST_NIDPID_PGSHIFT) & LAST_NIDPID_MASK;
 
-		flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT);
-		flags |= (nid & LAST_NID_MASK) << LAST_NID_PGSHIFT;
+		flags &= ~(LAST_NIDPID_MASK << LAST_NIDPID_PGSHIFT);
+		flags |= (nidpid & LAST_NIDPID_MASK) << LAST_NIDPID_PGSHIFT;
 	} while (unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags));
 
-	return last_nid;
+	return last_nidpid;
 }
 #endif
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8fcced7..d4d0540 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -613,7 +613,7 @@ static inline int free_pages_check(struct page *page)
 		bad_page(page);
 		return 1;
 	}
-	page_nid_reset_last(page);
+	reset_page_last_nidpid(page);
 	if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
 		page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
 	return 0;
@@ -3910,7 +3910,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 		mminit_verify_page_links(page, zone, nid, pfn);
 		init_page_count(page);
 		page_mapcount_reset(page);
-		page_nid_reset_last(page);
+		reset_page_last_nidpid(page);
 		SetPageReserved(page);
 		/*
 		 * Mark the block movable so that blocks are reserved for
-- 
1.7.1


-- 
Thanks and Regards
Srikar Dronamraju
