Date:	Sun, 29 Mar 2015 22:13:33 -0400
From:	David Ahern <david.ahern@...cle.com>
To:	efault@....de, peterz@...radead.org, mingo@...nel.org
Cc:	linux-kernel@...r.kernel.org, David Ahern <david.ahern@...cle.com>
Subject: [RFC PATCH] sched: Add cpu based entries to debugfs

Currently, sched_debug can be added to the kernel command line to dump
domain information during boot. This method is not practical on systems
with a large number of CPUs.
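
For reference, the existing mechanism is: boot with sched_debug on the
kernel command line (CONFIG_SCHED_DEBUG=y) and the full domain tree is
printed to the kernel log, along the lines of:

    $ dmesg | grep -A1 'CPU0 attaching sched-domain'
    CPU0 attaching sched-domain:
     domain 0: span 0-7 level SMT

With 1024 CPUs that dump is unwieldy, hence the debugfs approach below.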

This patch adds per-cpu entries to debugfs under a sched directory.
Reading a CPU's file shows its domain information in a human-readable
format:

$ cat /sys/kernel/debug/sched/cpu0
domain 0 / SMT:
    flags: 0x2af:  load-balance new-idle exec fork affine cpu-capacity share-pkg-resources
    span: 0-7
    groups:
        0 (cpu_capacity = 147)
        1 (cpu_capacity = 147)
        2 (cpu_capacity = 147)
        3 (cpu_capacity = 147)
        4 (cpu_capacity = 147)
        5 (cpu_capacity = 147)
        6 (cpu_capacity = 147)
        7 (cpu_capacity = 147)

domain 2 / DIE:
    flags: 0x102f:  load-balance new-idle exec fork affine prefer-sibling
    span: 0-127
    groups:
        0-7 (cpu_capacity = 1176)
        8-15 (cpu_capacity = 1176)
        16-23 (cpu_capacity = 1176)
        24-31 (cpu_capacity = 1176)
        32-39 (cpu_capacity = 1176)
        40-47 (cpu_capacity = 1176)
        48-55 (cpu_capacity = 1176)
        56-63 (cpu_capacity = 1176)
        64-71 (cpu_capacity = 1176)
        72-79 (cpu_capacity = 1176)
        80-87 (cpu_capacity = 1176)
        88-95 (cpu_capacity = 1176)
        96-103 (cpu_capacity = 1176)
        104-111 (cpu_capacity = 1176)
        112-119 (cpu_capacity = 1176)
        120-127 (cpu_capacity = 1176)

domain 3 / NUMA:
    flags: 0x642f:  load-balance new-idle exec fork affine serialize overlap numa
    span: 0-1023
    groups:
        0-127 (cpu_capacity = 18816)
        128-255 (cpu_capacity = 18816)
        256-383 (cpu_capacity = 18816)
        384-511 (cpu_capacity = 18816)
        512-639 (cpu_capacity = 18816)
        640-767 (cpu_capacity = 18816)
        768-895 (cpu_capacity = 18816)
        896-1023 (cpu_capacity = 18816)
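
(The flags value decodes bit-by-bit against the names that follow it: for
example, 0x2af has bits 0-3, 5, 7 and 9 set, which map to load-balance,
new-idle, exec, fork, affine, cpu-capacity and share-pkg-resources.)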

Before spending too much time formalizing this, I wanted to see whether
you would entertain the idea of making this information available via
debugfs. Note that the patch moves the existing sched_features file to
sched/features -- I am not sure how acceptable it is to move files in
debugfs.

TODO: handle CPU hotplug (see the sketch below).
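
A rough sketch of the hotplug handling, against the current cpu notifier
API (not part of this patch; names reuse the helpers added below):

  static int sched_debugfs_cpu_notify(struct notifier_block *nb,
                                      unsigned long action, void *hcpu)
  {
          long cpu = (long) hcpu;

          switch (action & ~CPU_TASKS_FROZEN) {
          case CPU_ONLINE:
                  /* new CPU: create its debugfs file */
                  sched_debugfs_add_cpu(cpu);
                  break;
          case CPU_DEAD:
                  /* CPU went away: remove its file */
                  debugfs_remove(d_sched_cpu[cpu]);
                  d_sched_cpu[cpu] = NULL;
                  break;
          }
          return NOTIFY_OK;
  }

  static struct notifier_block sched_debugfs_cpu_nb = {
          .notifier_call = sched_debugfs_cpu_notify,
  };

plus a register_cpu_notifier(&sched_debugfs_cpu_nb) call in
sched_init_debug().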

Signed-off-by: David Ahern <david.ahern@...cle.com>
---
 kernel/sched/core.c | 176 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 173 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 62671f53202a..b4d8d0c8260e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -268,12 +268,182 @@ static const struct file_operations sched_feat_fops = {
 	.release	= single_release,
 };
 
+static const char * const sd_flag_names[] = {
+	"load-balance",
+	"new-idle",
+	"exec",
+	"fork",
+	"wake",
+	"affine",
+	"",			/* bit 6: unused */
+	"cpu-capacity",
+	"power-domain",
+	"share-pkg-resources",
+	"serialize",
+	"asym-packing",
+	"prefer-sibling",
+	"overlap",
+	"numa",
+	"",			/* bit 15: unused */
+};
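+
+/* Dump one scheduling domain for @cpu, flagging basic topology errors. */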
+static void sched_cpu_domain_show(struct seq_file *m, struct sched_domain *sd,
+				  int cpu)
+{
+	struct cpumask groupmask;
+	struct sched_group *group = sd->groups;
+	int i;
+
+	cpumask_clear(&groupmask);
+
+	seq_printf(m, "domain %d / %s:\n", sd->level, sd->name);
+	seq_printf(m, "    flags: 0x%x: ", sd->flags);
+
+	for (i = 0; i < ARRAY_SIZE(sd_flag_names); ++i) {
+		if (sd->flags & (1 << i))
+			seq_printf(m, " %s", sd_flag_names[i]);
+	}
+	seq_puts(m, "\n");
+
+	if (!(sd->flags & SD_LOAD_BALANCE) && sd->parent)
+		seq_puts(m, "           ERROR: !SD_LOAD_BALANCE domain has parent\n");
+
+	seq_printf(m, "    span: %*pbl\n",
+		   cpumask_pr_args(sched_domain_span(sd)));
+
+	if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
+		seq_printf(m, "    ERROR: domain->span does not contain CPU%d\n", cpu);
+
+	if (!cpumask_test_cpu(cpu, sched_group_cpus(group)))
+		seq_printf(m, "    ERROR: domain->groups does not contain CPU%d\n", cpu);
+
+	seq_puts(m, "    groups:\n");
+	do {
+		if (!group) {
+			seq_puts(m, "            ERROR: group is NULL\n");
+			break;
+		}
+
+		/*
+		 * Even though we initialize ->capacity to something semi-sane,
+		 * we leave capacity_orig unset. This allows us to detect if
+		 * domain iteration is still funny without causing /0 traps.
+		 */
+		if (!group->sgc->capacity_orig) {
+			seq_puts(m, "        ERROR: domain->cpu_capacity not set\n");
+			break;
+		}
+
+		if (!cpumask_weight(sched_group_cpus(group))) {
+			seq_puts(m, "        ERROR: empty group\n");
+			break;
+		}
+
+		if (!(sd->flags & SD_OVERLAP) &&
+		    cpumask_intersects(&groupmask, sched_group_cpus(group))) {
+			seq_puts(m, "        ERROR: repeated CPUs\n");
+			break;
+		}
+
+		cpumask_or(&groupmask, &groupmask, sched_group_cpus(group));
+
+		seq_printf(m, "        %*pbl",
+			   cpumask_pr_args(sched_group_cpus(group)));
+
+		if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
+			seq_printf(m, " (cpu_capacity = %d)",
+				   group->sgc->capacity);
+		}
+		seq_puts(m, "\n");
+
+		group = group->next;
+	} while (group != sd->groups);
+
+	if (!cpumask_equal(sched_domain_span(sd), &groupmask))
+		seq_puts(m, "    ERROR: groups don't span domain->span\n");
+
+	if (sd->parent &&
+	    !cpumask_subset(&groupmask, sched_domain_span(sd->parent))) {
+		seq_puts(m, "    ERROR: parent span is not a superset of domain->span\n");
+	}
+}
+
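+/* Show the domain hierarchy for the CPU encoded in m->private. */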
+static int sched_cpu_show(struct seq_file *m, void *unused)
+{
+	struct sched_domain *sd;
+	int cpu = (int) ((long) m->private);
+
+	if (cpu < 0 || cpu >= CONFIG_NR_CPUS) {
+		seq_printf(m, "invalid CPU, %d\n", cpu);
+		return 0;
+	}
+
+	/* the sd hierarchy is RCU protected; see for_each_domain() */
+	rcu_read_lock();
+	for_each_domain(cpu, sd) {
+		sched_cpu_domain_show(m, sd, cpu);
+		seq_puts(m, "\n");
+	}
+	rcu_read_unlock();
+
+	return 0;
+}
+
+static int sched_cpu_open(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, sched_cpu_show, inode->i_private);
+}
+
+static const struct file_operations sched_cpu_fops = {
+	.open		= sched_cpu_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static struct dentry *d_sched_debug;
+static struct dentry *d_sched_cpu[CONFIG_NR_CPUS];
+
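+/* Create /sys/kernel/debug/sched/cpu<N> for @cpu. */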
+static int sched_debugfs_add_cpu(int cpu)
+{
+	char buf[32];
+	long lcpu = cpu;
+
+	snprintf(buf, sizeof(buf), "cpu%d", cpu);
+	d_sched_cpu[cpu] = debugfs_create_file(buf, 0444, d_sched_debug,
+						(void *) lcpu, &sched_cpu_fops);
+
+	if (d_sched_cpu[cpu] == NULL)
+		pr_warn("Failed to create debugfs entry for cpu %d\n", cpu);
+
+	return 0;
+}
+
 static __init int sched_init_debug(void)
 {
-	debugfs_create_file("sched_features", 0644, NULL, NULL,
+	int cpu;
+	int rc = 0;
+
+	d_sched_debug = debugfs_create_dir("sched", NULL);
+	if (!d_sched_debug) {
+		pr_warn("Could not create debugfs 'sched' entry\n");
+		return 0;
+	}
+
+	debugfs_create_file("features", 0644, d_sched_debug, NULL,
 			&sched_feat_fops);
 
-	return 0;
+	for_each_online_cpu(cpu) {
+		rc = sched_debugfs_add_cpu(cpu);
+		if (rc)
+			goto out;
+	}
+
+out:
+	return rc;
 }
 late_initcall(sched_init_debug);
 #endif /* CONFIG_SCHED_DEBUG */
@@ -6689,7 +6850,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
 
 		if (!cpumask_subset(sched_domain_span(child),
 				    sched_domain_span(sd))) {
-			pr_err("BUG: arch topology borken\n");
+			pr_err("BUG: arch topology broken\n");
 #ifdef CONFIG_SCHED_DEBUG
 			pr_err("     the %s domain not a subset of the %s domain\n",
 					child->name, sd->name);
-- 
2.3.0
