linux-kernel - Re: [PATCH v4] sched/debug: Use sched_debug_lock to serialize use of cgroup

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <YGwmsePipAYoAI2H@hirez.programming.kicks-ass.net>
Date:   Tue, 6 Apr 2021 11:15:29 +0200
From:   Peter Zijlstra <peterz@...radead.org>
To:     Waiman Long <longman@...hat.com>
Cc:     Ingo Molnar <mingo@...hat.com>, Juri Lelli <juri.lelli@...hat.com>,
        Vincent Guittot <vincent.guittot@...aro.org>,
        Dietmar Eggemann <dietmar.eggemann@....com>,
        Steven Rostedt <rostedt@...dmis.org>,
        Ben Segall <bsegall@...gle.com>, Mel Gorman <mgorman@...e.de>,
        Daniel Bristot de Oliveira <bristot@...hat.com>,
        Bharata B Rao <bharata@...ux.vnet.ibm.com>,
        Phil Auld <pauld@...hat.com>,
        Daniel Thompson <daniel.thompson@...aro.org>,
        linux-kernel@...r.kernel.org
Subject: Re: [PATCH v4] sched/debug: Use sched_debug_lock to serialize use of
 cgroup_path[] only

On Mon, Apr 05, 2021 at 07:42:03PM -0400, Waiman Long wrote:
> The handling of sysrq key can be activated by echoing the key to
> /proc/sysrq-trigger or via the magic key sequence typed into a terminal
> that is connected to the system in some way (serial, USB or other mean).
> In the former case, the handling is done in a user context. In the
> latter case, it is likely to be in an interrupt context.

> [ 7809.796281]  </NMI>
> [ 7809.796282]  _raw_spin_lock_irqsave+0x32/0x40
> [ 7809.796283]  print_cpu+0x261/0x7c0
> [ 7809.796283]  sysrq_sched_debug_show+0x34/0x50
> [ 7809.796284]  sysrq_handle_showstate+0xc/0x20
> [ 7809.796284]  __handle_sysrq.cold.11+0x48/0xfb
> [ 7809.796285]  write_sysrq_trigger+0x2b/0x30
> [ 7809.796285]  proc_reg_write+0x39/0x60
> [ 7809.796286]  vfs_write+0xa5/0x1a0
> [ 7809.796286]  ksys_write+0x4f/0xb0
> [ 7809.796287]  do_syscall_64+0x5b/0x1a0
> [ 7809.796287]  entry_SYSCALL_64_after_hwframe+0x65/0xca
> [ 7809.796288] RIP: 0033:0x7fabe4ceb648
> 
> The purpose of sched_debug_lock is to serialize the use of the global
> cgroup_path[] buffer in print_cpu(). The rests of the printk calls don't
> need serialization from sched_debug_lock.

> The print_cpu() function has two callers - sched_debug_show() and
> sysrq_sched_debug_show(). 

So what idiot is doing sysrq and that proc file at the same time? Why is
it a problem now?

> @@ -470,16 +468,49 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
>  #endif
>  
>  #ifdef CONFIG_CGROUP_SCHED
> +static DEFINE_SPINLOCK(sched_debug_lock);
>  static char group_path[PATH_MAX];
> +static enum {
> +	TOKEN_NONE,
> +	TOKEN_ACQUIRED,
> +	TOKEN_NA	/* Not applicable */
> +} console_token = TOKEN_ACQUIRED;

> +/*
> + * All the print_cpu() callers from sched_debug_show() will be allowed
> + * to contend for sched_debug_lock and use group_path[] as their SEQ_printf()
> + * calls will be much faster. However only one print_cpu() caller from
> + * sysrq_sched_debug_show() which outputs to the console will be allowed
> + * to use group_path[]. Another parallel console writer will have to use
> + * a shorter stack buffer instead. Since the console output will be garbled
> + * anyway, truncation of some cgroup paths shouldn't be a big issue.
> + */
> +#define SEQ_printf_task_group_path(m, tg, fmt...)			\
> +{									\
> +	unsigned long flags;						\
> +	int token = m ? TOKEN_NA					\
> +		      : xchg_acquire(&console_token, TOKEN_NONE);	\
> +									\
> +	if (token == TOKEN_NONE) {					\
> +		char buf[128];						\
> +		task_group_path(tg, buf, sizeof(buf));			\
> +		SEQ_printf(m, fmt, buf);				\
> +	} else {							\
> +		spin_lock_irqsave(&sched_debug_lock, flags);		\
> +		task_group_path(tg, group_path, sizeof(group_path));	\
> +		SEQ_printf(m, fmt, group_path);				\
> +		spin_unlock_irqrestore(&sched_debug_lock, flags);	\
> +		if (token == TOKEN_ACQUIRED)				\
> +			smp_store_release(&console_token, token);	\
> +	}								\
>  }

This is disgusting... you have an open-coded test-and-set lock like
thing *AND* a spinlock, what gives?


What's wrong with something simple like this?

---
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4b49cc2af5c4..2ac2977f3b96 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -8,8 +8,6 @@
  */
 #include "sched.h"
 
-static DEFINE_SPINLOCK(sched_debug_lock);
-
 /*
  * This allows printing both to /proc/sched_debug and
  * to the console
@@ -470,6 +468,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
 #endif
 
 #ifdef CONFIG_CGROUP_SCHED
+static DEFINE_SPINLOCK(group_path_lock);
 static char group_path[PATH_MAX];
 
 static char *task_group_path(struct task_group *tg)
@@ -481,6 +480,22 @@ static char *task_group_path(struct task_group *tg)
 
 	return group_path;
 }
+
+#define SEQ_printf_task_group_path(m, tg)				\
+do {									\
+	if (spin_trylock(&group_path_lock)) {				\
+		task_group_path(tg, group_path, sizeof(group_path));	\
+		SEQ_printf(m, "%s", group_path);			\
+		spin_unlock(&group_path_lock);				\
+	} else {							\
+		SEQ_printf(m, "looser!");				\
+	}
+} while (0)
+
+#else
+
+#define SEQ_printf_task_group_path(m, tg) do { } while (0)
+
 #endif
 
 static void
@@ -505,9 +520,8 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
 #ifdef CONFIG_NUMA_BALANCING
 	SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p));
 #endif
-#ifdef CONFIG_CGROUP_SCHED
-	SEQ_printf(m, " %s", task_group_path(task_group(p)));
-#endif
+	SEQ_printf(m, " ");
+	SEQ_printf_task_group_path(m, task_group(p));
 
 	SEQ_printf(m, "\n");
 }
@@ -541,13 +555,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 	struct sched_entity *last;
 	unsigned long flags;
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
 	SEQ_printf(m, "\n");
-	SEQ_printf(m, "cfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg));
-#else
+	SEQ_printf(m, "cfs_rq[%d]:", cpu);
+	SEQ_printf_task_group_path(m, cfs_rq->tg);
 	SEQ_printf(m, "\n");
-	SEQ_printf(m, "cfs_rq[%d]:\n", cpu);
-#endif
 	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "exec_clock",
 			SPLIT_NS(cfs_rq->exec_clock));
 
@@ -612,13 +623,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 
 void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
 {
-#ifdef CONFIG_RT_GROUP_SCHED
 	SEQ_printf(m, "\n");
-	SEQ_printf(m, "rt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg));
-#else
+	SEQ_printf(m, "rt_rq[%d]:", cpu);
+	SEQ_printf_task_group_path(m, rt_rq->tg);
 	SEQ_printf(m, "\n");
-	SEQ_printf(m, "rt_rq[%d]:\n", cpu);
-#endif
 
 #define P(x) \
 	SEQ_printf(m, "  .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
@@ -666,7 +674,6 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq)
 static void print_cpu(struct seq_file *m, int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
-	unsigned long flags;
 
 #ifdef CONFIG_X86
 	{
@@ -717,13 +724,11 @@ do {									\
 	}
 #undef P
 
-	spin_lock_irqsave(&sched_debug_lock, flags);
 	print_cfs_stats(m, cpu);
 	print_rt_stats(m, cpu);
 	print_dl_stats(m, cpu);
 
 	print_rq(m, rq, cpu);
-	spin_unlock_irqrestore(&sched_debug_lock, flags);
 	SEQ_printf(m, "\n");
 }