linux-kernel - [tip:sched/numa] sched/numa: Introduce sys_numa

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <tip-nf7brf1b88w0ojwuf689xeet@git.kernel.org>
Date:	Sat, 19 May 2012 04:36:58 -0700
From:	tip-bot for Peter Zijlstra <a.p.zijlstra@...llo.nl>
To:	linux-tip-commits@...r.kernel.org
Cc:	linux-kernel@...r.kernel.org, hpa@...or.com, mingo@...nel.org,
	torvalds@...ux-foundation.org, a.p.zijlstra@...llo.nl,
	pjt@...gle.com, cl@...ux.com, riel@...hat.com,
	akpm@...ux-foundation.org, bharata.rao@...il.com,
	aarcange@...hat.com, Lee.Schermerhorn@...com,
	suresh.b.siddha@...el.com, danms@...ibm.com, tglx@...utronix.de
Subject: [tip:sched/numa] sched/numa: Introduce sys_numa_{t,m}bind()

Commit-ID:  a2dae61eb839a60e27321ed4f0a88dd3536be998
Gitweb:     http://git.kernel.org/tip/a2dae61eb839a60e27321ed4f0a88dd3536be998
Author:     Peter Zijlstra <a.p.zijlstra@...llo.nl>
AuthorDate: Thu, 8 Mar 2012 17:56:08 +0100
Committer:  Ingo Molnar <mingo@...nel.org>
CommitDate: Sat, 19 May 2012 12:55:28 +0200

sched/numa: Introduce sys_numa_{t,m}bind()

Now that we have a NUMA process scheduler, provide a syscall
interface for finer granularity NUMA balancing. In particular
this allows setting up NUMA groups of threads and vmas within
a process.

For this we introduce two new syscalls:

 sys_numa_tbind(int tig, int ng_id, unsigned long flags);

Bind a thread to a numa group, query its binding or create a new group:

 sys_numa_tbind(tid, -1, 0);	 // create new group, return new ng_id
 sys_numa_tbind(tid, -2, 0);	 // returns existing ng_id
 sys_numa_tbind(tid, ng_id, 0);  // set ng_id

 Returns:
  -ESRCH	tid->task resolution failed
  -EINVAL	task didn't have a ng_id, flags was wrong
  -EPERM	we don't have privileges over tid

sys_numa_mbind(unsigned long addr, unsigned long len, int ng_id, unsigned long flags);

Bind a memory region to a numa group.

 sys_numa_mbind(addr, len, ng_id, 0);

create a non-mergable vma over [addr,addr+len) and assign an mpol
binding it to the numa group identified by ng_id.

These system calls together can create thread and vma groups which
will be scheduled as a single entity.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@...llo.nl>
Cc: Suresh Siddha <suresh.b.siddha@...el.com>
Cc: Paul Turner <pjt@...gle.com>
Cc: Dan Smith <danms@...ibm.com>
Cc: Bharata B Rao <bharata.rao@...il.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@...com>
Cc: Christoph Lameter <cl@...ux.com>
Cc: Rik van Riel <riel@...hat.com>
Cc: Andrea Arcangeli <aarcange@...hat.com>
Cc: Andrew Morton <akpm@...ux-foundation.org>
Cc: Linus Torvalds <torvalds@...ux-foundation.org>
Link: http://lkml.kernel.org/n/tip-nf7brf1b88w0ojwuf689xeet@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@...nel.org>
---
 arch/x86/syscalls/syscall_32.tbl |    2 +
 arch/x86/syscalls/syscall_64.tbl |    2 +
 include/asm-generic/unistd.h     |    6 +-
 include/linux/mempolicy.h        |   27 ++
 include/linux/mm.h               |   16 +
 include/linux/sched.h            |    2 +
 include/linux/syscalls.h         |    3 +
 kernel/exit.c                    |    1 +
 kernel/sched/numa.c              |  646 ++++++++++++++++++++++++++++++++++++++
 kernel/sys_ni.c                  |    4 +
 mm/memory.c                      |    5 +-
 mm/mempolicy.c                   |    8 +-
 12 files changed, 718 insertions(+), 4 deletions(-)

diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index 29f9f05..38954c5 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -355,3 +355,5 @@
 346	i386	setns			sys_setns
 347	i386	process_vm_readv	sys_process_vm_readv		compat_sys_process_vm_readv
 348	i386	process_vm_writev	sys_process_vm_writev		compat_sys_process_vm_writev
+349	i386	numa_mbind		sys_numa_mbind			compat_sys_numa_mbind
+350	i386	numa_tbind		sys_numa_tbind			compat_sys_numa_tbind
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index dd29a9e..63c5285 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -318,6 +318,8 @@
 309	common	getcpu			sys_getcpu
 310	64	process_vm_readv	sys_process_vm_readv
 311	64	process_vm_writev	sys_process_vm_writev
+312	64	numa_mbind		sys_numa_mbind
+313	64	numa_tbind		sys_numa_tbind
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
 # for native 64-bit operation.
diff --git a/include/asm-generic/unistd.h b/include/asm-generic/unistd.h
index 991ef01..4dfef49 100644
--- a/include/asm-generic/unistd.h
+++ b/include/asm-generic/unistd.h
@@ -691,9 +691,13 @@ __SC_COMP(__NR_process_vm_readv, sys_process_vm_readv, \
 #define __NR_process_vm_writev 271
 __SC_COMP(__NR_process_vm_writev, sys_process_vm_writev, \
           compat_sys_process_vm_writev)
+#define __NR_numa_mbind 272
+__SC_COMP(__NR_numa_mbind, sys_numa_mbind, compat_sys_ms_mbind)
+#define __NR_numa_tbind 273
+__SC_COMP(__NR_numa_tbind, sys_numa_tbind, compat_sys_ms_tbind)
 
 #undef __NR_syscalls
-#define __NR_syscalls 272
+#define __NR_syscalls 274
 
 /*
  * All syscalls below here should go away really,
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index af2983b..6e1029f 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -79,6 +79,8 @@ enum mpol_rebind_step {
 #include <linux/nodemask.h>
 #include <linux/pagemap.h>
 #include <linux/migrate.h>
+#include <linux/list.h>
+#include <linux/sched.h>
 
 struct mm_struct;
 
@@ -110,6 +112,10 @@ struct mempolicy {
 	atomic_t refcnt;
 	unsigned short mode; 	/* See MPOL_* above */
 	unsigned short flags;	/* See set_mempolicy() MPOL_F_* above */
+	struct numa_group *numa_group;
+	struct list_head ng_entry;
+	struct vm_area_struct *vma;
+	struct rcu_head rcu;
 	union {
 		short 		 preferred_node; /* preferred */
 		nodemask_t	 nodes;		/* interleave/bind */
@@ -397,6 +403,27 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol,
 }
 
 #endif /* CONFIG_NUMA */
+
+#ifdef CONFIG_NUMA
+
+extern void __numa_task_exit(struct task_struct *);
+extern void numa_vma_link(struct vm_area_struct *, struct vm_area_struct *);
+extern void numa_vma_unlink(struct vm_area_struct *);
+
+static inline void numa_task_exit(struct task_struct *p)
+{
+	if (p->numa_group)
+		__numa_task_exit(p);
+}
+
+#else /* CONFIG_NUMA */
+
+static inline void numa_task_exit(struct task_struct *p) { }
+static inline void numa_vma_link(struct vm_area_struct *new, struct vm_area_struct *old) { }
+static inline void numa_vma_unlink(struct vm_area_struct *vma) { }
+
+#endif /* CONFIG_NUMA */
+
 #endif /* __KERNEL__ */
 
 #endif
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a89817e..eda8271 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1076,19 +1076,35 @@ static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
 	return (unsigned long)val;
 }
 
+#ifdef CONFIG_NUMA
+extern void __numa_add_rss_counter(struct vm_area_struct *, int, long);
+
+static inline
+void numa_add_rss_counter(struct vm_area_struct *vma, int member, long value)
+{
+	if (vma->vm_policy) /* XXX sodding include dependecies */
+		__numa_add_rss_counter(vma, member, value);
+}
+#else /* !CONFIG_NUMA */
+static inline void numa_add_rss_counter(struct vm_area_struct *vma, int member, long value) { }
+#endif /* CONFIG_NUMA */
+
 static inline void add_rss_counter(struct vm_area_struct *vma, int member, long value)
 {
 	atomic_long_add(value, &vma->vm_mm->rss_stat.count[member]);
+	numa_add_rss_counter(vma, member, value);
 }
 
 static inline void inc_rss_counter(struct vm_area_struct *vma, int member)
 {
 	atomic_long_inc(&vma->vm_mm->rss_stat.count[member]);
+	numa_add_rss_counter(vma, member, 1);
 }
 
 static inline void dec_rss_counter(struct vm_area_struct *vma, int member)
 {
 	atomic_long_dec(&vma->vm_mm->rss_stat.count[member]);
+	numa_add_rss_counter(vma, member, -1);
 }
 
 static inline unsigned long get_mm_rss(struct mm_struct *mm)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index fe6e535..024a5f9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1512,6 +1512,8 @@ struct task_struct {
 	short il_next;
 	short pref_node_fork;
 	int node;
+	struct numa_group *numa_group;
+	struct list_head ng_entry;
 #endif
 	struct rcu_head rcu;
 
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 3de3acb..9c6f47a 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -857,5 +857,8 @@ asmlinkage long sys_process_vm_writev(pid_t pid,
 				      const struct iovec __user *rvec,
 				      unsigned long riovcnt,
 				      unsigned long flags);
+asmlinkage long sys_numa_mbind(unsigned long addr, unsigned long len,
+			       int ng_id, unsigned long flags);
+asmlinkage long sys_numa_tbind(int tid, int ng_id, unsigned long flags);
 
 #endif
diff --git a/kernel/exit.c b/kernel/exit.c
index d8bd3b42..a0c1b54 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1018,6 +1018,7 @@ void do_exit(long code)
 	mpol_put(tsk->mempolicy);
 	tsk->mempolicy = NULL;
 	task_unlock(tsk);
+	numa_task_exit(tsk);
 #endif
 #ifdef CONFIG_FUTEX
 	if (unlikely(current->pi_state_cache))
diff --git a/kernel/sched/numa.c b/kernel/sched/numa.c
index 77fa7d4..b2aba37 100644
--- a/kernel/sched/numa.c
+++ b/kernel/sched/numa.c
@@ -14,6 +14,7 @@
 
 #include <linux/mempolicy.h>
 #include <linux/kthread.h>
+#include <linux/compat.h>
 
 #include "sched.h"
 
@@ -852,3 +853,648 @@ static __init int numa_init(void)
 	return 0;
 }
 early_initcall(numa_init);
+
+
+/*
+ *  numa_group bits
+ */
+
+#include <linux/idr.h>
+#include <linux/srcu.h>
+#include <linux/syscalls.h>
+
+struct numa_group {
+	spinlock_t		lock;
+	int			id;
+
+	struct mm_rss_stat	rss;
+
+	struct list_head	tasks;
+	struct list_head	vmas;
+
+	const struct cred	*cred;
+	atomic_t		ref;
+
+	struct numa_entity	numa_entity;
+
+	struct rcu_head		rcu;
+};
+
+static struct srcu_struct ng_srcu;
+
+static DEFINE_MUTEX(numa_group_idr_lock);
+static DEFINE_IDR(numa_group_idr);
+
+static inline struct numa_group *ne_ng(struct numa_entity *ne)
+{
+	return container_of(ne, struct numa_group, numa_entity);
+}
+
+static inline bool ng_tryget(struct numa_group *ng)
+{
+	return atomic_inc_not_zero(&ng->ref);
+}
+
+static inline void ng_get(struct numa_group *ng)
+{
+	atomic_inc(&ng->ref);
+}
+
+static void __ng_put_rcu(struct rcu_head *rcu)
+{
+	struct numa_group *ng = container_of(rcu, struct numa_group, rcu);
+
+	put_cred(ng->cred);
+	kfree(ng);
+}
+
+static void __ng_put(struct numa_group *ng)
+{
+	mutex_lock(&numa_group_idr_lock);
+	idr_remove(&numa_group_idr, ng->id);
+	mutex_unlock(&numa_group_idr_lock);
+
+	WARN_ON(!list_empty(&ng->tasks));
+	WARN_ON(!list_empty(&ng->vmas));
+
+	dequeue_ne(&ng->numa_entity);
+
+	call_rcu(&ng->rcu, __ng_put_rcu);
+}
+
+static inline void ng_put(struct numa_group *ng)
+{
+	if (atomic_dec_and_test(&ng->ref))
+		__ng_put(ng);
+}
+
+/*
+ * numa_ops
+ */
+
+static unsigned long numa_group_mem_load(struct numa_entity *ne)
+{
+	struct numa_group *ng = ne_ng(ne);
+
+	return atomic_long_read(&ng->rss.count[MM_ANONPAGES]);
+}
+
+static unsigned long numa_group_cpu_load(struct numa_entity *ne)
+{
+	struct numa_group *ng = ne_ng(ne);
+	unsigned long load = 0;
+	struct task_struct *p;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(p, &ng->tasks, ng_entry)
+		load += p->numa_contrib;
+	rcu_read_unlock();
+
+	return load;
+}
+
+static void numa_group_mem_migrate(struct numa_entity *ne, int node)
+{
+	struct numa_group *ng = ne_ng(ne);
+	struct vm_area_struct *vma;
+	struct mempolicy *mpol;
+	struct mm_struct *mm;
+	int idx;
+
+	/*
+	 * Horrid code this..
+	 *
+	 * The main problem is that ng->lock nests inside mmap_sem [
+	 * numa_vma_{,un}link() gets called under mmap_sem ]. But here we need
+	 * to iterate that list and acquire mmap_sem for each entry.
+	 *
+	 * We start here with no locks held. numa_vma_unlink() is used to add
+	 * an SRCU delayed reference count to the mpols. This allows us to do
+	 * lockless iteration of the list.
+	 *
+	 * Once we have an mpol we need to acquire mmap_sem, this too isn't
+	 * straight fwd, take ng->lock to pin mpol->vma due to its
+	 * serialization against numa_vma_unlink(). While that vma pointer is
+	 * stable the vma->vm_mm pointer must be good too, so acquire an extra
+	 * reference to the mm.
+	 *
+	 * This reference keeps mm stable so we can drop ng->lock and acquire
+	 * mmap_sem. After which mpol->vma is stable again since the memory map
+	 * is stable. So verify ->vma is still good (numa_vma_unlink clears it)
+	 * and the mm is still the same (paranoia, can't see how that could
+	 * happen).
+	 */
+
+	idx = srcu_read_lock(&ng_srcu);
+	list_for_each_entry_rcu(mpol, &ng->vmas, ng_entry) {
+		nodemask_t mask = nodemask_of_node(node);
+
+		spin_lock(&ng->lock); /* pin mpol->vma */
+		vma = mpol->vma;
+		if (!vma) {
+			spin_unlock(&ng->lock);
+			continue;
+		}
+		mm = vma->vm_mm;
+		atomic_inc(&mm->mm_users); /* pin mm */
+		spin_unlock(&ng->lock);
+
+		down_read(&mm->mmap_sem);
+		vma = mpol->vma;
+		if (!vma)
+			goto unlock_next;
+
+		mpol_rebind_policy(mpol, &mask, MPOL_REBIND_ONCE);
+		lazy_migrate_vma(vma, node);
+unlock_next:
+		up_read(&mm->mmap_sem);
+		mmput(mm);
+	}
+	srcu_read_unlock(&ng_srcu, idx);
+}
+
+static void numa_group_cpu_migrate(struct numa_entity *ne, int node)
+{
+	struct numa_group *ng = ne_ng(ne);
+	struct task_struct *p;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(p, &ng->tasks, ng_entry)
+		sched_setnode(p, node);
+	rcu_read_unlock();
+}
+
+static bool numa_group_can_migrate(struct numa_entity *ne, int node)
+{
+	struct numa_group *ng = ne_ng(ne);
+	struct task_struct *t;
+	bool allowed = false;
+	u64 runtime = 0;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(t, &ng->tasks, ng_entry) {
+		allowed = __task_can_migrate(t, &runtime, node);
+		if (!allowed)
+			break;
+	}
+	rcu_read_unlock();
+
+	/*
+	 * Don't bother migrating memory if there's less than 1 second
+	 * of runtime on the tasks.
+	 */
+	return allowed && runtime > NSEC_PER_SEC;
+}
+
+static bool numa_group_tryget(struct numa_entity *ne)
+{
+	/*
+	 * See process_tryget(), similar but against ng_put().
+	 */
+	return ng_tryget(ne_ng(ne));
+}
+
+static void numa_group_put(struct numa_entity *ne)
+{
+	ng_put(ne_ng(ne));
+}
+
+static const struct numa_ops numa_group_ops = {
+	.mem_load	= numa_group_mem_load,
+	.cpu_load	= numa_group_cpu_load,
+
+	.mem_migrate	= numa_group_mem_migrate,
+	.cpu_migrate	= numa_group_cpu_migrate,
+
+	.can_migrate	= numa_group_can_migrate,
+
+	.tryget		= numa_group_tryget,
+	.put		= numa_group_put,
+};
+
+static struct numa_group *lock_p_ng(struct task_struct *p)
+{
+	struct numa_group *ng;
+
+	for (;;) {
+		ng = ACCESS_ONCE(p->numa_group);
+		if (!ng)
+			return NULL;
+
+		spin_lock(&ng->lock);
+		if (p->numa_group == ng)
+			break;
+		spin_unlock(&ng->lock);
+	}
+
+	return ng;
+}
+
+void __numa_task_exit(struct task_struct *p)
+{
+	struct numa_group *ng;
+
+	ng = lock_p_ng(p);
+	if (ng) {
+		list_del_rcu(&p->ng_entry);
+		p->numa_group = NULL;
+		spin_unlock(&ng->lock);
+
+		ng_put(ng);
+	}
+}
+
+/*
+ * memory (vma) accounting/tracking
+ *
+ * We assume a 1:1 relation between vmas and mpols and keep a list of mpols in
+ * the numa_group, and a vma backlink in the mpol.
+ */
+
+void numa_vma_link(struct vm_area_struct *new, struct vm_area_struct *old)
+{
+	struct numa_group *ng = NULL;
+
+	if (old && old->vm_policy)
+		ng = old->vm_policy->numa_group;
+
+	if (!ng && new->vm_policy)
+		ng = new->vm_policy->numa_group;
+
+	if (!ng)
+		return;
+
+	ng_get(ng);
+	new->vm_policy->numa_group = ng;
+	new->vm_policy->vma = new;
+
+	spin_lock(&ng->lock);
+	list_add_rcu(&new->vm_policy->ng_entry, &ng->vmas);
+	spin_unlock(&ng->lock);
+}
+
+void __numa_add_rss_counter(struct vm_area_struct *vma, int member, long value)
+{
+	/*
+	 * Since the caller passes the vma argument, the caller is responsible
+	 * for making sure the vma is stable, hence the ->vm_policy->numa_group
+	 * dereference is safe. (caller usually has vma->vm_mm->mmap_sem for
+	 * reading).
+	 */
+	if (vma->vm_policy->numa_group)
+		atomic_long_add(value, &vma->vm_policy->numa_group->rss.count[member]);
+}
+
+static void __mpol_put_rcu(struct rcu_head *rcu)
+{
+	struct mempolicy *mpol = container_of(rcu, struct mempolicy, rcu);
+	mpol_put(mpol);
+}
+
+void numa_vma_unlink(struct vm_area_struct *vma)
+{
+	struct mempolicy *mpol;
+	struct numa_group *ng;
+
+	if (!vma)
+		return;
+
+	mpol = vma->vm_policy;
+	if (!mpol)
+		return;
+
+	ng = mpol->numa_group;
+	if (!ng)
+		return;
+
+	spin_lock(&ng->lock);
+	list_del_rcu(&mpol->ng_entry);
+	/*
+	 * Rediculous, see numa_group_mem_migrate.
+	 */
+	mpol->vma = NULL;
+	mpol_get(mpol);
+	call_srcu(&ng_srcu, &mpol->rcu, __mpol_put_rcu);
+	spin_unlock(&ng->lock);
+
+	ng_put(ng);
+}
+
+/*
+ * syscall bits
+ */
+
+#define MS_ID_GET	-2
+#define MS_ID_NEW	-1
+
+static struct numa_group *ng_create(struct task_struct *p)
+{
+	struct numa_group *ng;
+	int node, err;
+
+	ng = kzalloc(sizeof(*ng), GFP_KERNEL);
+	if (!ng)
+		goto fail;
+
+	err = idr_pre_get(&numa_group_idr, GFP_KERNEL);
+	if (!err)
+		goto fail_alloc;
+
+	mutex_lock(&numa_group_idr_lock);
+	err = idr_get_new(&numa_group_idr, ng, &ng->id);
+	mutex_unlock(&numa_group_idr_lock);
+
+	if (err)
+		goto fail_alloc;
+
+	spin_lock_init(&ng->lock);
+	atomic_set(&ng->ref, 1);
+	ng->cred = get_task_cred(p);
+	INIT_LIST_HEAD(&ng->tasks);
+	INIT_LIST_HEAD(&ng->vmas);
+	init_ne(&ng->numa_entity, &numa_group_ops);
+
+	dequeue_ne(&p->mm->numa);
+	node = find_idlest_node(tsk_home_node(p));
+	enqueue_ne(&ng->numa_entity, node);
+
+	return ng;
+
+fail_alloc:
+	kfree(ng);
+fail:
+	return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * More or less equal to ptrace_may_access(); XXX
+ */
+static int ng_allowed(struct numa_group *ng, struct task_struct *p)
+{
+	const struct cred *cred = ng->cred, *tcred;
+
+	rcu_read_lock();
+	tcred = __task_cred(p);
+	if (cred->user->user_ns == tcred->user->user_ns &&
+	    (cred->uid == tcred->euid &&
+	     cred->uid == tcred->suid &&
+	     cred->uid == tcred->uid  &&
+	     cred->gid == tcred->egid &&
+	     cred->gid == tcred->sgid &&
+	     cred->gid == tcred->gid))
+		goto ok;
+	if (ns_capable(tcred->user->user_ns, CAP_SYS_PTRACE))
+		goto ok;
+	rcu_read_unlock();
+	return -EPERM;
+
+ok:
+	rcu_read_unlock();
+	return 0;
+}
+
+static struct numa_group *ng_lookup(int ng_id, struct task_struct *p)
+{
+	struct numa_group *ng;
+
+	rcu_read_lock();
+again:
+	ng = idr_find(&numa_group_idr, ng_id);
+	if (!ng) {
+		rcu_read_unlock();
+		return ERR_PTR(-EINVAL);
+	}
+	if (ng_allowed(ng, p)) {
+		rcu_read_unlock();
+		return ERR_PTR(-EPERM);
+	}
+	if (!ng_tryget(ng))
+		goto again;
+	rcu_read_unlock();
+
+	return ng;
+}
+
+static int ng_task_assign(struct task_struct *p, int ng_id)
+{
+	struct numa_group *old_ng, *ng;
+
+	ng = ng_lookup(ng_id, p);
+	if (IS_ERR(ng))
+		return PTR_ERR(ng);
+
+	old_ng = lock_p_ng(p);
+	if (old_ng) {
+		/*
+		 * Special numa_group that assists in serializing the
+		 * p->numa_group hand-over. Assume concurrent ng_task_assign()
+		 * invocation, only one can remove the old_ng, but both need
+		 * to serialize against RCU.
+		 *
+		 * Therefore we cannot clear p->numa_group, this would lead to
+		 * the second not observing old_ng and thus missing the RCU
+		 * sync.
+		 *
+		 * We also cannot set p->numa_group to ng, since then we'd
+		 * try to remove ourselves from a list we're not on yet --
+		 * double list_del_rcu() invocation.
+		 *
+		 * Solve this by using this special intermediate numa_group,
+		 * we set p->numa_group to this object so that the second
+		 * observes a !NULL numa_group, however we skip the
+		 * list_del_rcu() when we find this special group avoiding the
+		 * double delete.
+		 */
+		static struct numa_group __ponies = {
+			.lock = __SPIN_LOCK_UNLOCKED(__ponies.lock),
+		};
+
+		if (likely(old_ng != &__ponies)) {
+			list_del_rcu(&p->ng_entry);
+			p->numa_group = &__ponies;
+		}
+		spin_unlock(&old_ng->lock);
+
+		if (unlikely(old_ng == &__ponies))
+			old_ng = NULL; /* avoid ng_put() */
+
+		/*
+		 * We have to wait for the old ng_entry users to go away before
+		 * we can re-use the link entry for the new list.
+		 */
+		synchronize_rcu();
+	}
+
+	spin_lock(&ng->lock);
+	p->numa_group = ng;
+	list_add_rcu(&p->ng_entry, &ng->tasks);
+	spin_unlock(&ng->lock);
+
+	sched_setnode(p, ng->numa_entity.node);
+
+	if (old_ng)
+		ng_put(old_ng);
+
+	return ng_id;
+}
+
+static struct task_struct *find_get_task(pid_t tid)
+{
+	struct task_struct *p;
+
+	rcu_read_lock();
+	if (!tid)
+		p = current;
+	else
+		p = find_task_by_vpid(tid);
+
+	if (p->flags & PF_EXITING)
+		p = NULL;
+
+	if (p)
+		get_task_struct(p);
+	rcu_read_unlock();
+
+	if (!p)
+		return ERR_PTR(-ESRCH);
+
+	return p;
+}
+
+/*
+ * Bind a thread to a numa group or query its binding or create a new group.
+ *
+ * sys_numa_tbind(tid, -1, 0);	  // create new group, return new ng_id
+ * sys_numa_tbind(tid, -2, 0);	  // returns existing ng_id
+ * sys_numa_tbind(tid, ng_id, 0); // set ng_id
+ *
+ * Returns:
+ *  -ESRCH	tid->task resolution failed
+ *  -EINVAL	task didn't have a ng_id, flags was wrong
+ *  -EPERM	we don't have privileges over tid
+ *
+ */
+SYSCALL_DEFINE3(numa_tbind, int, tid, int, ng_id, unsigned long, flags)
+{
+	struct task_struct *p = find_get_task(tid);
+	struct numa_group *ng = NULL;
+	int orig_ng_id = ng_id;
+
+	if (IS_ERR(p))
+		return PTR_ERR(p);
+
+	if (flags) {
+		ng_id = -EINVAL;
+		goto out;
+	}
+
+	switch (ng_id) {
+	case MS_ID_GET:
+		ng_id = -EINVAL;
+		rcu_read_lock();
+		ng = rcu_dereference(p->numa_group);
+		if (ng)
+			ng_id = ng->id;
+		rcu_read_unlock();
+		break;
+
+	case MS_ID_NEW:
+		ng = ng_create(p);
+		if (IS_ERR(ng)) {
+			ng_id = PTR_ERR(ng);
+			break;
+		}
+		ng_id = ng->id;
+		/* fall through */
+
+	default:
+		ng_id = ng_task_assign(p, ng_id);
+		if (ng && orig_ng_id < 0)
+			ng_put(ng);
+		break;
+	}
+
+out:
+	put_task_struct(p);
+	return ng_id;
+}
+
+/*
+ * Bind a memory region to a numa group.
+ *
+ * sys_numa_mbind(addr, len, ng_id, 0);
+ *
+ * create a non-mergable vma over [addr,addr+len) and assign a mpol binding it
+ * to the numa group identified by ng_id.
+ *
+ */
+SYSCALL_DEFINE4(numa_mbind, unsigned long, addr, unsigned long, len,
+			    int, ng_id, unsigned long, flags)
+{
+	struct mm_struct *mm = current->mm;
+	struct mempolicy *mpol;
+	struct numa_group *ng;
+	nodemask_t mask;
+	int err = 0;
+
+	if (flags)
+		return -EINVAL;
+
+	if (addr & ~PAGE_MASK)
+		return -EINVAL;
+
+	ng = ng_lookup(ng_id, current);
+	if (IS_ERR(ng))
+		return PTR_ERR(ng);
+
+	mask = nodemask_of_node(ng->numa_entity.node);
+	mpol = mpol_new(MPOL_BIND, 0, &mask);
+	if (!mpol) {
+		ng_put(ng);
+		return -ENOMEM;
+	}
+	mpol->flags |= MPOL_MF_LAZY;
+	mpol->numa_group = ng;
+
+	down_write(&mm->mmap_sem);
+	err = mpol_do_mbind(addr, len, mpol, MPOL_BIND,
+			&mask, MPOL_MF_MOVE|MPOL_MF_LAZY);
+	up_write(&mm->mmap_sem);
+	mpol_put(mpol);
+	ng_put(ng);
+
+	if (!err) {
+		/*
+		 * There's a small overlap between ng and mm here, we only
+		 * remove the mm after we associate the VMAs with the ng. Since
+		 * lazy_migrate_vma() checks the mempolicy hierarchy this works
+		 * out fine.
+		 */
+		dequeue_ne(&mm->numa);
+	}
+
+	return err;
+}
+
+#ifdef CONFIG_COMPAT
+
+asmlinkage long compat_sys_numa_mbind(compat_ulong_t addr, compat_ulong_t len,
+				      compat_int_t ng_id, compat_ulong_t flags)
+{
+	return sys_numa_mbind(addr, len, ng_id, flags);
+}
+
+asmlinkage long compat_sys_numa_tbind(compat_int_t tid, compat_int_t ng_id,
+				      compat_ulong_t flags)
+{
+	return sys_numa_tbind(tid, ng_id, flags);
+}
+
+#endif /* CONFIG_COMPAT */
+
+static __init int numa_group_init(void)
+{
+	init_srcu_struct(&ng_srcu);
+	return 0;
+}
+early_initcall(numa_group_init);
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 47bfa16..3f5a59f 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -103,6 +103,10 @@ cond_syscall(sys_set_mempolicy);
 cond_syscall(compat_sys_mbind);
 cond_syscall(compat_sys_get_mempolicy);
 cond_syscall(compat_sys_set_mempolicy);
+cond_syscall(sys_numa_mbind);
+cond_syscall(compat_sys_numa_mbind);
+cond_syscall(sys_numa_tbind);
+cond_syscall(compat_sys_numa_tbind);
 cond_syscall(sys_add_key);
 cond_syscall(sys_request_key);
 cond_syscall(sys_keyctl);
diff --git a/mm/memory.c b/mm/memory.c
index 6b39551c..4614be0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -144,9 +144,10 @@ static void add_rss_counter_fast(struct vm_area_struct *vma, int member, int val
 {
 	struct task_struct *task = current;
 
-	if (likely(task->mm == vma->vm_mm))
+	if (likely(task->mm == vma->vm_mm)) {
 		task->rss_stat.count[member] += val;
-	else
+		numa_add_rss_counter(vma, member, val);
+	} else
 		add_rss_counter(vma, member, val);
 }
 #define inc_rss_counter_fast(vma, member) add_rss_counter_fast(vma, member, 1)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index df29149..6cdfdba 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -291,12 +291,13 @@ struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 		mode = MPOL_PREFERRED;
 	} else if (nodes_empty(*nodes))
 		return ERR_PTR(-EINVAL);
-	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
+	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL | __GFP_ZERO);
 	if (!policy)
 		return ERR_PTR(-ENOMEM);
 	atomic_set(&policy->refcnt, 1);
 	policy->mode = mode;
 	policy->flags = flags;
+	INIT_LIST_HEAD(&policy->ng_entry);
 
 	return policy;
 }
@@ -611,6 +612,9 @@ static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
 	if (!err) {
 		mpol_get(new);
 		vma->vm_policy = new;
+		numa_vma_link(vma, NULL);
+		if (old)
+			numa_vma_unlink(old->vma);
 		mpol_put(old);
 	}
 	return err;
@@ -2063,11 +2067,13 @@ int vma_dup_policy(struct vm_area_struct *new, struct vm_area_struct *old)
 	if (IS_ERR(mpol))
 		return PTR_ERR(mpol);
 	vma_set_policy(new, mpol);
+	numa_vma_link(new, old);
 	return 0;
 }
 
 void vma_put_policy(struct vm_area_struct *vma)
 {
+	numa_vma_unlink(vma);
 	mpol_put(vma_policy(vma));
 }
 
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/