linux-kernel - [PATCH 20/39] sched_ext: Print debug dump after an error exit

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20240501151312.635565-21-tj@kernel.org>
Date: Wed,  1 May 2024 05:09:55 -1000
From: Tejun Heo <tj@...nel.org>
To: torvalds@...ux-foundation.org,
	mingo@...hat.com,
	peterz@...radead.org,
	juri.lelli@...hat.com,
	vincent.guittot@...aro.org,
	dietmar.eggemann@....com,
	rostedt@...dmis.org,
	bsegall@...gle.com,
	mgorman@...e.de,
	bristot@...hat.com,
	vschneid@...hat.com,
	ast@...nel.org,
	daniel@...earbox.net,
	andrii@...nel.org,
	martin.lau@...nel.org,
	joshdon@...gle.com,
	brho@...gle.com,
	pjt@...gle.com,
	derkling@...gle.com,
	haoluo@...gle.com,
	dvernet@...a.com,
	dschatzberg@...a.com,
	dskarlat@...cmu.edu,
	riel@...riel.com,
	changwoo@...lia.com,
	himadrics@...ia.fr,
	memxor@...il.com,
	andrea.righi@...onical.com,
	joel@...lfernandes.org
Cc: linux-kernel@...r.kernel.org,
	bpf@...r.kernel.org,
	kernel-team@...a.com,
	Tejun Heo <tj@...nel.org>
Subject: [PATCH 20/39] sched_ext: Print debug dump after an error exit

If a BPF scheduler triggers an error, the scheduler is aborted and the
system is reverted to the built-in scheduler. In the process, a lot of
information which may be useful for figuring out what happened can be lost.

This patch adds debug dump which captures information which may be useful
for debugging including runqueue and runnable thread states at the time of
failure. The following shows a debug dump after triggering the watchdog:

  # os/work/tools/sched_ext/build/bin/scx_qmap -t 100
  enq=0, dsp=0, delta=0, deq=0
  enq=21, dsp=21, delta=0, deq=0
  enq=96, dsp=96, delta=0, deq=1

  DEBUG DUMP
  ================================================================================

  kworker/u16:0[11] triggered exit kind 1026:
    runnable task stall (scx_qmap[1524] failed to run for 5.659s)

  Backtrace:
    scx_watchdog_workfn+0x138/0x1c0
    process_scheduled_works+0x245/0x4e0
    worker_thread+0x270/0x360
    kthread+0xeb/0x110
    ret_from_fork+0x36/0x40
    ret_from_fork_asm+0x11/0x20

  Runqueue states
  ---------------

  CPU 2   : nr_run=1 ops_qseq=34
	    curr=kworker/u16:0[11] class=ext_sched_class

   *R kworker/u16:0[11] +0ms
	scx_state/flags=3/0xd ops_state/qseq=0/0
	sticky/holding_cpu=-1/-1 dsq_id=(n/a)
	cpus=ff

      scx_ops_error_irq_workfn+0x25d/0x340
      irq_work_run_list+0x7d/0xc0
      irq_work_run+0x18/0x30
      __sysvec_irq_work+0x38/0x100
      sysvec_irq_work+0x69/0x80
      asm_sysvec_irq_work+0x1b/0x20
      scx_watchdog_workfn+0x15d/0x1c0
      process_scheduled_works+0x245/0x4e0
      worker_thread+0x270/0x360
      kthread+0xeb/0x110
      ret_from_fork+0x36/0x40
      ret_from_fork_asm+0x11/0x20

  CPU 7   : nr_run=1 ops_qseq=11
	    curr=swapper/7[0] class=idle_sched_class

    R scx_qmap[1524] -5659ms
	scx_state/flags=3/0x9 ops_state/qseq=2/3
	sticky/holding_cpu=-1/-1 dsq_id=(n/a)
	cpus=ff

      common_nsleep+0x34/0x50
      __x64_sys_clock_nanosleep+0xd9/0x120
      do_syscall_64+0x7e/0x150
      entry_SYSCALL_64_after_hwframe+0x46/0x4e

  ================================================================================

  EXIT: runnable task stall (scx_qmap[1524] failed to run for 5.659s)

It shows that CPU 2 was running the watchdog when it triggered the error
condition and the scx_qmap thread has been queued on CPU 7 for over 5
seconds but failed to run. This dump has proved pretty useful for developing
and debugging BPF schedulers.

Currently, it uses fixed 32k buffer and doesn't provide any way for the BPF
scheduler to add additional information. These will be improved in the
future.

Signed-off-by: Tejun Heo <tj@...nel.org>
Reviewed-by: David Vernet <dvernet@...a.com>
---
 kernel/sched/ext.c                           | 122 ++++++++++++++++++-
 tools/sched_ext/include/scx/compat.h         |  11 +-
 tools/sched_ext/include/scx/user_exit_info.h |  19 +++
 tools/sched_ext/scx_qmap.c                   |  10 +-
 tools/sched_ext/scx_simple.c                 |   2 +-
 5 files changed, 155 insertions(+), 9 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index ff080b5f0330..4ffa42e5d7dd 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -12,6 +12,7 @@ enum scx_consts {
 
 	SCX_EXIT_BT_LEN			= 64,
 	SCX_EXIT_MSG_LEN		= 1024,
+	SCX_EXIT_DUMP_DFL_LEN		= 32768,
 };
 
 enum scx_exit_kind {
@@ -48,6 +49,9 @@ struct scx_exit_info {
 
 	/* informational message */
 	char			*msg;
+
+	/* debug dump */
+	char			*dump;
 };
 
 /* sched_ext_ops.flags */
@@ -330,6 +334,12 @@ struct sched_ext_ops {
 	 */
 	u32 timeout_ms;
 
+	/**
+	 * exit_dump_len - scx_exit_info.dump buffer length. If 0, the default
+	 * value of 32768 is used.
+	 */
+	u32 exit_dump_len;
+
 	/**
 	 * name - BPF scheduler's name
 	 *
@@ -2888,12 +2898,13 @@ static void scx_ops_bypass(bool bypass)
 
 static void free_exit_info(struct scx_exit_info *ei)
 {
+	kfree(ei->dump);
 	kfree(ei->msg);
 	kfree(ei->bt);
 	kfree(ei);
 }
 
-static struct scx_exit_info *alloc_exit_info(void)
+static struct scx_exit_info *alloc_exit_info(size_t exit_dump_len)
 {
 	struct scx_exit_info *ei;
 
@@ -2903,8 +2914,9 @@ static struct scx_exit_info *alloc_exit_info(void)
 
 	ei->bt = kcalloc(sizeof(ei->bt[0]), SCX_EXIT_BT_LEN, GFP_KERNEL);
 	ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL);
+	ei->dump = kzalloc(exit_dump_len, GFP_KERNEL);
 
-	if (!ei->bt || !ei->msg) {
+	if (!ei->bt || !ei->msg || !ei->dump) {
 		free_exit_info(ei);
 		return NULL;
 	}
@@ -3104,8 +3116,101 @@ static void scx_ops_disable(enum scx_exit_kind kind)
 	schedule_scx_ops_disable_work();
 }
 
+static void scx_dump_task(struct seq_buf *s, struct task_struct *p, char marker,
+			  unsigned long now)
+{
+	static unsigned long bt[SCX_EXIT_BT_LEN];
+	char dsq_id_buf[19] = "(n/a)";
+	unsigned long ops_state = atomic_long_read(&p->scx.ops_state);
+	unsigned int bt_len;
+	size_t avail, used;
+	char *buf;
+
+	if (p->scx.dsq)
+		scnprintf(dsq_id_buf, sizeof(dsq_id_buf), "0x%llx",
+			  (unsigned long long)p->scx.dsq->id);
+
+	seq_buf_printf(s, "\n %c%c %s[%d] %+ldms\n",
+		       marker, task_state_to_char(p), p->comm, p->pid,
+		       jiffies_delta_msecs(p->scx.runnable_at, now));
+	seq_buf_printf(s, "      scx_state/flags=%u/0x%x ops_state/qseq=%lu/%lu\n",
+		       scx_get_task_state(p),
+		       p->scx.flags & ~SCX_TASK_STATE_MASK,
+		       ops_state & SCX_OPSS_STATE_MASK,
+		       ops_state >> SCX_OPSS_QSEQ_SHIFT);
+	seq_buf_printf(s, "      sticky/holding_cpu=%d/%d dsq_id=%s\n",
+		       p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf);
+	seq_buf_printf(s, "      cpus=%*pb\n\n", cpumask_pr_args(p->cpus_ptr));
+
+	bt_len = stack_trace_save_tsk(p, bt, SCX_EXIT_BT_LEN, 1);
+
+	avail = seq_buf_get_buf(s, &buf);
+	used = stack_trace_snprint(buf, avail, bt, bt_len, 3);
+	seq_buf_commit(s, used < avail ? used : -1);
+}
+
+static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
+{
+	const char trunc_marker[] = "\n\n~~~~ TRUNCATED ~~~~\n";
+	unsigned long now = jiffies;
+	struct seq_buf s;
+	size_t avail, used;
+	char *buf;
+	int cpu;
+
+	if (dump_len <= sizeof(trunc_marker))
+		return;
+
+	seq_buf_init(&s, ei->dump, dump_len - sizeof(trunc_marker));
+
+	seq_buf_printf(&s, "%s[%d] triggered exit kind %d:\n  %s (%s)\n\n",
+		       current->comm, current->pid, ei->kind, ei->reason, ei->msg);
+	seq_buf_printf(&s, "Backtrace:\n");
+	avail = seq_buf_get_buf(&s, &buf);
+	used = stack_trace_snprint(buf, avail, ei->bt, ei->bt_len, 1);
+	seq_buf_commit(&s, used < avail ? used : -1);
+
+	seq_buf_printf(&s, "\nRunqueue states\n");
+	seq_buf_printf(&s, "---------------\n");
+
+	for_each_possible_cpu(cpu) {
+		struct rq *rq = cpu_rq(cpu);
+		struct rq_flags rf;
+		struct task_struct *p;
+
+		rq_lock(rq, &rf);
+
+		if (list_empty(&rq->scx.runnable_list) &&
+		    rq->curr->sched_class == &idle_sched_class)
+			goto next;
+
+		seq_buf_printf(&s, "\nCPU %-4d: nr_run=%u ops_qseq=%lu\n",
+			       cpu, rq->scx.nr_running, rq->scx.ops_qseq);
+		seq_buf_printf(&s, "          curr=%s[%d] class=%ps\n",
+			       rq->curr->comm, rq->curr->pid,
+			       rq->curr->sched_class);
+
+		if (rq->curr->sched_class == &ext_sched_class)
+			scx_dump_task(&s, rq->curr, '*', now);
+
+		list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node)
+			scx_dump_task(&s, p, ' ', now);
+	next:
+		rq_unlock(rq, &rf);
+	}
+
+	if (seq_buf_has_overflowed(&s))
+		memcpy(ei->dump + seq_buf_used(&s) - 1, trunc_marker,
+		       sizeof(trunc_marker));
+}
+
 static void scx_ops_error_irq_workfn(struct irq_work *irq_work)
 {
+	struct scx_exit_info *ei = scx_exit_info;
+
+	if (ei->kind >= SCX_EXIT_ERROR)
+		scx_dump_state(ei, scx_ops.exit_dump_len);
+
 	schedule_scx_ops_disable_work();
 }
 
@@ -3131,6 +3236,13 @@ static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind,
 	vscnprintf(ei->msg, SCX_EXIT_MSG_LEN, fmt, args);
 	va_end(args);
 
+	/*
+	 * Set ei->kind and ->reason for scx_dump_state(). They'll be set again
+	 * in scx_ops_disable_workfn().
+	 */
+	ei->kind = kind;
+	ei->reason = scx_exit_reason(ei->kind);
+
 	irq_work_queue(&scx_ops_error_irq_work);
 }
 
@@ -3192,7 +3304,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 	if (ret < 0)
 		goto err;
 
-	scx_exit_info = alloc_exit_info();
+	scx_exit_info = alloc_exit_info(ops->exit_dump_len);
 	if (!scx_exit_info) {
 		ret = -ENOMEM;
 		goto err_del;
@@ -3572,6 +3684,10 @@ static int bpf_scx_init_member(const struct btf_type *t,
 			return -E2BIG;
 		ops->timeout_ms = *(u32 *)(udata + moff);
 		return 1;
+	case offsetof(struct sched_ext_ops, exit_dump_len):
+		ops->exit_dump_len =
+			*(u32 *)(udata + moff) ?: SCX_EXIT_DUMP_DFL_LEN;
+		return 1;
 	}
 
 	return 0;
diff --git a/tools/sched_ext/include/scx/compat.h b/tools/sched_ext/include/scx/compat.h
index 2a66f3eb87a9..2be79bd88a25 100644
--- a/tools/sched_ext/include/scx/compat.h
+++ b/tools/sched_ext/include/scx/compat.h
@@ -126,7 +126,8 @@ static inline bool __COMPAT_struct_has_field(const char *type, const char *field
  * and attach it, backward compatibility is automatically maintained where
  * reasonable.
  *
- * - sched_ext_ops.tick(): Ignored on older kernels with a warning.
+ * - ops.tick(): Ignored on older kernels with a warning.
+ * - ops.exit_dump_len: Cleared to zero on older kernels with a warning.
  */
 #define SCX_OPS_OPEN(__ops_name, __scx_name) ({					\
 	struct __scx_name *__skel;						\
@@ -136,7 +137,13 @@ static inline bool __COMPAT_struct_has_field(const char *type, const char *field
 	__skel; 								\
 })
 
-#define SCX_OPS_LOAD(__skel, __ops_name, __scx_name) ({				\
+#define SCX_OPS_LOAD(__skel, __ops_name, __scx_name, __uei_name) ({		\
+	UEI_SET_SIZE(__skel, __ops_name, __uei_name);				\
+	if (!__COMPAT_struct_has_field("sched_ext_ops", "exit_dump_len") &&	\
+	    (__skel)->struct_ops.__ops_name->exit_dump_len) {			\
+		fprintf(stderr, "WARNING: kernel doesn't support setting exit dump len\n"); \
+		(__skel)->struct_ops.__ops_name->exit_dump_len = 0;		\
+	}									\
 	if (!__COMPAT_struct_has_field("sched_ext_ops", "tick") &&		\
 	    (__skel)->struct_ops.__ops_name->tick) {				\
 		fprintf(stderr, "WARNING: kernel doesn't support ops.tick()\n"); \
diff --git a/tools/sched_ext/include/scx/user_exit_info.h b/tools/sched_ext/include/scx/user_exit_info.h
index 8c3b7fac4d05..cf4293cb250e 100644
--- a/tools/sched_ext/include/scx/user_exit_info.h
+++ b/tools/sched_ext/include/scx/user_exit_info.h
@@ -13,6 +13,7 @@
 enum uei_sizes {
 	UEI_REASON_LEN		= 128,
 	UEI_MSG_LEN		= 1024,
+	UEI_DUMP_DFL_LEN	= 32768,
 };
 
 struct user_exit_info {
@@ -28,6 +29,8 @@ struct user_exit_info {
 #include <bpf/bpf_core_read.h>
 
 #define UEI_DEFINE(__name)							\
+	char RESIZABLE_ARRAY(data, __name##_dump);				\
+	const volatile u32 __name##_dump_len;					\
 	struct user_exit_info __name SEC(".data")
 
 #define UEI_RECORD(__uei_name, __ei) ({						\
@@ -35,6 +38,8 @@ struct user_exit_info {
 				  sizeof(__uei_name.reason), (__ei)->reason);	\
 	bpf_probe_read_kernel_str(__uei_name.msg,				\
 				  sizeof(__uei_name.msg), (__ei)->msg);		\
+	bpf_probe_read_kernel_str(__uei_name##_dump,				\
+				  __uei_name##_dump_len, (__ei)->dump);		\
 	if (bpf_core_field_exists((__ei)->exit_code))				\
 		__uei_name.exit_code = (__ei)->exit_code;			\
 	/* use __sync to force memory barrier */				\
@@ -47,6 +52,13 @@ struct user_exit_info {
 #include <stdio.h>
 #include <stdbool.h>
 
+/* no need to call the following explicitly if SCX_OPS_LOAD() is used */
+#define UEI_SET_SIZE(__skel, __ops_name, __uei_name) ({				\
+	u32 __len = (__skel)->struct_ops.__ops_name->exit_dump_len ?: UEI_DUMP_DFL_LEN; \
+	(__skel)->rodata->__uei_name##_dump_len = __len;			\
+	RESIZE_ARRAY(data, __uei_name##_dump, __len);				\
+})
+
 #define UEI_EXITED(__skel, __uei_name) ({					\
 	/* use __sync to force memory barrier */				\
 	__sync_val_compare_and_swap(&(__skel)->data->__uei_name.kind, -1, -1);	\
@@ -54,6 +66,13 @@ struct user_exit_info {
 
 #define UEI_REPORT(__skel, __uei_name) ({					\
 	struct user_exit_info *__uei = &(__skel)->data->__uei_name;		\
+	char *__uei_dump = (__skel)->data_##__uei_name##_dump->__uei_name##_dump; \
+	if (__uei_dump[0] != '\0') {						\
+		fputs("\nDEBUG DUMP\n", stderr);				\
+		fputs("================================================================================\n\n", stderr); \
+		fputs(__uei_dump, stderr);					\
+		fputs("\n================================================================================\n\n", stderr); \
+	}									\
 	fprintf(stderr, "EXIT: %s", __uei->reason);				\
 	if (__uei->msg[0] != '\0')						\
 		fprintf(stderr, " (%s)", __uei->msg);				\
diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c
index d2b98ef3ead2..28fd5aa4e62c 100644
--- a/tools/sched_ext/scx_qmap.c
+++ b/tools/sched_ext/scx_qmap.c
@@ -20,7 +20,7 @@ const char help_fmt[] =
 "See the top-level comment in .bpf.c for more details.\n"
 "\n"
 "Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-b COUNT]\n"
-"       [-d PID] [-p] [-v]\n"
+"       [-d PID] [-D LEN] [-p] [-v]\n"
 "\n"
 "  -s SLICE_US   Override slice duration\n"
 "  -e COUNT      Trigger scx_bpf_error() after COUNT enqueues\n"
@@ -28,6 +28,7 @@ const char help_fmt[] =
 "  -T COUNT      Stall every COUNT'th kernel thread\n"
 "  -b COUNT      Dispatch upto COUNT tasks together\n"
 "  -d PID        Disallow a process from switching into SCHED_EXT (-1 for self)\n"
+"  -D LEN        Set scx_exit_info.dump buffer length\n"
 "  -p            Switch only tasks on SCHED_EXT policy intead of all\n"
 "  -v            Print libbpf debug messages\n"
 "  -h            Display this help and exit\n";
@@ -59,7 +60,7 @@ int main(int argc, char **argv)
 
 	skel = SCX_OPS_OPEN(qmap_ops, scx_qmap);
 
-	while ((opt = getopt(argc, argv, "s:e:t:T:b:d:pvh")) != -1) {
+	while ((opt = getopt(argc, argv, "s:e:t:T:b:d:D:pvh")) != -1) {
 		switch (opt) {
 		case 's':
 			skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
@@ -81,6 +82,9 @@ int main(int argc, char **argv)
 			if (skel->rodata->disallow_tgid < 0)
 				skel->rodata->disallow_tgid = getpid();
 			break;
+		case 'D':
+			skel->struct_ops.qmap_ops->exit_dump_len = strtoul(optarg, NULL, 0);
+			break;
 		case 'p':
 			skel->rodata->switch_partial = true;
 			skel->struct_ops.qmap_ops->flags |= __COMPAT_SCX_OPS_SWITCH_PARTIAL;
@@ -94,7 +98,7 @@ int main(int argc, char **argv)
 		}
 	}
 
-	SCX_OPS_LOAD(skel, qmap_ops, scx_qmap);
+	SCX_OPS_LOAD(skel, qmap_ops, scx_qmap, uei);
 	link = SCX_OPS_ATTACH(skel, qmap_ops);
 
 	while (!exit_req && !UEI_EXITED(skel, uei)) {
diff --git a/tools/sched_ext/scx_simple.c b/tools/sched_ext/scx_simple.c
index 08c741f56685..9ffa8d084228 100644
--- a/tools/sched_ext/scx_simple.c
+++ b/tools/sched_ext/scx_simple.c
@@ -80,7 +80,7 @@ int main(int argc, char **argv)
 		}
 	}
 
-	SCX_OPS_LOAD(skel, simple_ops, scx_simple);
+	SCX_OPS_LOAD(skel, simple_ops, scx_simple, uei);
 	link = SCX_OPS_ATTACH(skel, simple_ops);
 
 	while (!exit_req && !UEI_EXITED(skel, uei)) {
-- 
2.44.0