lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:   Thu, 10 Aug 2017 17:45:18 +0200
From:   Peter Zijlstra <peterz@...radead.org>
To:     "Ofer Levi(SW)" <oferle@...lanox.com>
Cc:     "rusty@...tcorp.com.au" <rusty@...tcorp.com.au>,
        "mingo@...hat.com" <mingo@...hat.com>,
        "Vineet.Gupta1@...opsys.com" <Vineet.Gupta1@...opsys.com>,
        "linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
        Tejun Heo <tj@...nel.org>
Subject: Re: hotplug support for arch/arc/plat-eznps platform

On Thu, Aug 10, 2017 at 11:19:05AM +0200, Peter Zijlstra wrote:
> On Thu, Aug 10, 2017 at 07:40:16AM +0000, Ofer Levi(SW) wrote:
> > Well, this definitely have pleased the little toy :)
> > Thank you. I really appreciate your time and effort.
> > 
> > If I may, one more newbie question. What do I need to do for the two patches to find 
> > their way into formal kernel code?
> 
> I'll split the first patch into two separate patches and line them up.
> 
> I'm not sure about this last patch, I'll speak with Ingo once he's back
> to see what would be the thing to do here.
> 
> I suspect we can make it work, that sysctl stuff is only debug crud
> after all and that should never get in the way of getting work done.

Can you test this instead of the second patch? It should have the same
effect.

BTW, what physical size does your toy have? I'm thinking its less than
multiple racks worth like the SGI systems were.

---
Subject: sched/debug: Optimize sched_domain sysctl generation
From: Peter Zijlstra <peterz@...radead.org>
Date: Thu Aug 10 17:10:26 CEST 2017

Currently we unconditionally destroy all sysctl bits and regenerate
them after we've rebuild the domains (even if that rebuild is a
no-op).

And since we unconditionally (re)build the sysctl for all possible
CPUs, onlining all CPUs gets us O(n^2) time. Instead change this to
only rebuild the bits for CPUs we've actually installed new domains
on.

Reported-by: "Ofer Levi(SW)" <oferle@...lanox.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@...radead.org>
---
 kernel/sched/debug.c    |   68 ++++++++++++++++++++++++++++++++++++++----------
 kernel/sched/sched.h    |    4 ++
 kernel/sched/topology.c |    1 
 3 files changed, 59 insertions(+), 14 deletions(-)

--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -327,38 +327,78 @@ static struct ctl_table *sd_alloc_ctl_cp
 	return table;
 }
 
+static cpumask_var_t sd_sysctl_cpus;
 static struct ctl_table_header *sd_sysctl_header;
+
 void register_sched_domain_sysctl(void)
 {
-	int i, cpu_num = num_possible_cpus();
-	struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
+	static struct ctl_table *cpu_entries;
+	static struct ctl_table **cpu_idx;
 	char buf[32];
+	int i;
+
+	if (!cpu_entries) {
+		cpu_entries = sd_alloc_ctl_entry(num_possible_cpus() + 1);
+		if (!cpu_entries)
+			return;
+
+		WARN_ON(sd_ctl_dir[0].child);
+		sd_ctl_dir[0].child = cpu_entries;
+	}
+
+	if (!cpu_idx) {
+		struct ctl_table *e = cpu_entries;
+
+		cpu_idx = kcalloc(nr_cpu_ids, sizeof(struct ctl_table*), GFP_KERNEL);
+		if (!cpu_idx)
+			return;
+
+		/* deal with sparse possible map */
+		for_each_possible_cpu(i) {
+			cpu_idx[i] = e;
+			e++;
+		}
+	}
 
-	WARN_ON(sd_ctl_dir[0].child);
-	sd_ctl_dir[0].child = entry;
+	if (!cpumask_available(sd_sysctl_cpus)) {
+		if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL))
+			return;
 
-	if (entry == NULL)
-		return;
+		/* init to possible to not have holes in @cpu_entries */
+		cpumask_copy(sd_sysctl_cpus, cpu_possible_mask);
+	}
+
+	for_each_cpu(i, sd_sysctl_cpus) {
+		struct ctl_table *e = cpu_idx[i];
+
+		if (e->child)
+			sd_free_ctl_entry(&e->child);
+
+		if (!e->procname) {
+			snprintf(buf, 32, "cpu%d", i);
+			e->procname = kstrdup(buf, GFP_KERNEL);
+		}
+		e->mode = 0555;
+		e->child = sd_alloc_ctl_cpu_table(i);
 
-	for_each_possible_cpu(i) {
-		snprintf(buf, 32, "cpu%d", i);
-		entry->procname = kstrdup(buf, GFP_KERNEL);
-		entry->mode = 0555;
-		entry->child = sd_alloc_ctl_cpu_table(i);
-		entry++;
+		__cpumask_clear_cpu(i, sd_sysctl_cpus);
 	}
 
 	WARN_ON(sd_sysctl_header);
 	sd_sysctl_header = register_sysctl_table(sd_ctl_root);
 }
 
+void dirty_sched_domain_sysctl(int cpu)
+{
+	if (cpumask_available(sd_sysctl_cpus))
+		__cpumask_set_cpu(cpu, sd_sysctl_cpus);
+}
+
 /* may be called multiple times per register */
 void unregister_sched_domain_sysctl(void)
 {
 	unregister_sysctl_table(sd_sysctl_header);
 	sd_sysctl_header = NULL;
-	if (sd_ctl_dir[0].child)
-		sd_free_ctl_entry(&sd_ctl_dir[0].child);
 }
 #endif /* CONFIG_SYSCTL */
 #endif /* CONFIG_SMP */
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1120,11 +1120,15 @@ extern int group_balance_cpu(struct sche
 
 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
 void register_sched_domain_sysctl(void);
+void dirty_sched_domain_sysctl(int cpu);
 void unregister_sched_domain_sysctl(void);
 #else
 static inline void register_sched_domain_sysctl(void)
 {
 }
+static inline void dirty_sched_domain_sysctl(int cpu)
+{
+}
 static inline void unregister_sched_domain_sysctl(void)
 {
 }
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -461,6 +461,7 @@ cpu_attach_domain(struct sched_domain *s
 	rq_attach_root(rq, rd);
 	tmp = rq->sd;
 	rcu_assign_pointer(rq->sd, sd);
+	dirty_sched_domain_sysctl(cpu);
 	destroy_sched_domains(tmp);
 
 	update_top_cache_domain(cpu);

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ