Message-Id: <1375170505-5967-2-git-send-email-srikar@linux.vnet.ibm.com>
Date:	Tue, 30 Jul 2013 13:18:16 +0530
From:	Srikar Dronamraju <srikar@...ux.vnet.ibm.com>
To:	Mel Gorman <mgorman@...e.de>,
	Peter Zijlstra <a.p.zijlstra@...llo.nl>,
	Ingo Molnar <mingo@...nel.org>
Cc:	Andrea Arcangeli <aarcange@...hat.com>,
	Johannes Weiner <hannes@...xchg.org>,
	Linux-MM <linux-mm@...ck.org>,
	LKML <linux-kernel@...r.kernel.org>,
	Preeti U Murthy <preeti@...ux.vnet.ibm.com>,
	Linus Torvalds <torvalds@...ux-foundation.org>,
	Srikar Dronamraju <srikar@...ux.vnet.ibm.com>
Subject: [RFC PATCH 01/10] sched: Introduce per node numa weights

The load balancer spreads load evenly for fairness and to maintain balance
across the different scheduling domains. However, where possible, related
tasks should be scheduled in the same domain (especially at node domains)
to allow them more local memory accesses. This consolidation can be done
without affecting fairness and while keeping the domains balanced.

To better consolidate load, account task weights per mm, per node. These
statistics are used in later patches to select more appropriate tasks
during load balancing; an illustrative sketch of such a consumer appears
after the patch.

TODO: Capture and use the actual task load weights rather than raw task
counts.

Signed-off-by: Srikar Dronamraju <srikar@...ux.vnet.ibm.com>
---
 fs/exec.c                |    7 +++++++
 include/linux/mm_types.h |    1 +
 kernel/fork.c            |   10 +++++++---
 kernel/sched/fair.c      |   34 ++++++++++++++++++++++++++++++++++
 4 files changed, 49 insertions(+), 3 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index a96a488..b086e9e 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -833,6 +833,13 @@ static int exec_mmap(struct mm_struct *mm)
 	activate_mm(active_mm, mm);
 	task_unlock(tsk);
 	arch_pick_mmap_layout(mm);
+#ifdef CONFIG_NUMA_BALANCING
+	mm->numa_weights = kzalloc(sizeof(atomic_t) * (nr_node_ids + 1), GFP_KERNEL);
+	if (mm->numa_weights) {
+		atomic_inc(&mm->numa_weights[cpu_to_node(task_cpu(tsk))]);
+		atomic_inc(&mm->numa_weights[nr_node_ids]);
+	}
+#endif
 	if (old_mm) {
 		up_read(&old_mm->mmap_sem);
 		BUG_ON(active_mm != old_mm);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index ace9a5f..45d02df 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -435,6 +435,7 @@ struct mm_struct {
 	 * a different node than Make PTE Scan Go Now.
 	 */
 	int first_nid;
+	atomic_t *numa_weights;
 #endif
 	struct uprobes_state uprobes_state;
 };
diff --git a/kernel/fork.c b/kernel/fork.c
index 1766d32..21421bd 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -617,6 +617,9 @@ void mmput(struct mm_struct *mm)
 		khugepaged_exit(mm); /* must run before exit_mmap */
 		exit_mmap(mm);
 		set_mm_exe_file(mm, NULL);
+#ifdef CONFIG_NUMA_BALANCING
+		kfree(mm->numa_weights);
+#endif
 		if (!list_empty(&mm->mmlist)) {
 			spin_lock(&mmlist_lock);
 			list_del(&mm->mmlist);
@@ -823,9 +826,6 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	mm->pmd_huge_pte = NULL;
 #endif
-#ifdef CONFIG_NUMA_BALANCING
-	mm->first_nid = NUMA_PTE_SCAN_INIT;
-#endif
 	if (!mm_init(mm, tsk))
 		goto fail_nomem;
 
@@ -844,6 +844,10 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
 	if (mm->binfmt && !try_module_get(mm->binfmt->module))
 		goto free_pt;
 
+#ifdef CONFIG_NUMA_BALANCING
+	mm->first_nid = NUMA_PTE_SCAN_INIT;
+	mm->numa_weights = kzalloc(sizeof(atomic_t) * (nr_node_ids + 1), GFP_KERNEL);
+#endif
 	return mm;
 
 free_pt:
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7a33e59..8a2b5aa 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -995,10 +995,40 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
 		}
 	}
 }
+
+static void account_numa_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p)
+{
+	struct mm_struct *mm = p->mm;
+	struct rq *rq = rq_of(cfs_rq);
+	int curnode = cpu_to_node(cpu_of(rq));
+
+	if (mm && mm->numa_weights) {
+		atomic_inc(&mm->numa_weights[curnode]);
+		atomic_inc(&mm->numa_weights[nr_node_ids]);
+	}
+}
+
+static void account_numa_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p)
+{
+	struct mm_struct *mm = p->mm;
+	struct rq *rq = rq_of(cfs_rq);
+	int curnode = cpu_to_node(cpu_of(rq));
+
+	if (mm && mm->numa_weights) {
+		atomic_dec(&mm->numa_weights[curnode]);
+		atomic_dec(&mm->numa_weights[nr_node_ids]);
+	}
+}
 #else
 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
 {
 }
+static void account_numa_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p)
+{
+}
+static void account_numa_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p)
+{
+}
 #endif /* CONFIG_NUMA_BALANCING */
 
 static void
@@ -1713,6 +1743,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	if (se != cfs_rq->curr)
 		__enqueue_entity(cfs_rq, se);
 	se->on_rq = 1;
+	if (entity_is_task(se))
+		account_numa_enqueue(cfs_rq, task_of(se));
 
 	if (cfs_rq->nr_running == 1) {
 		list_add_leaf_cfs_rq(cfs_rq);
@@ -1810,6 +1842,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
 	update_min_vruntime(cfs_rq);
 	update_cfs_shares(cfs_rq);
+	if (entity_is_task(se))
+		account_numa_dequeue(cfs_rq, task_of(se));
 }
 
 /*
-- 
1.7.1
