lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20230801090124.8050-4-zegao@tencent.com>
Date:   Tue,  1 Aug 2023 17:01:21 +0800
From:   Ze Gao <zegao2021@...il.com>
To:     Adrian Hunter <adrian.hunter@...el.com>,
        Alexander Shishkin <alexander.shishkin@...ux.intel.com>,
        Arnaldo Carvalho de Melo <acme@...nel.org>,
        Ian Rogers <irogers@...gle.com>,
        Ingo Molnar <mingo@...hat.com>, Jiri Olsa <jolsa@...nel.org>,
        Mark Rutland <mark.rutland@....com>,
        Masami Hiramatsu <mhiramat@...nel.org>,
        Namhyung Kim <namhyung@...nel.org>,
        Peter Zijlstra <peterz@...radead.org>,
        Steven Rostedt <rostedt@...dmis.org>
Cc:     linux-kernel@...r.kernel.org, linux-perf-users@...r.kernel.org,
        linux-trace-kernel@...r.kernel.org,
        linux-trace-devel@...r.kernel.org, Ze Gao <zegao@...cent.com>
Subject: [RFC PATCH v3 3/6] sched, tracing: add to report task state in symbolic chars

Internal representations of task state are likely to be changed
or ordered, and reporting them to userspace without exporting
them as part of API is basically wrong, which can easily break
a userspace observability tool as kernel evolves. For example,
perf suffers from this and still reports wrong states as of this
writing.

OTOH, some masqueraded states like TASK_REPORT_IDLE and
TASK_REPORT_MAX are also reported inadvertently, which confuses
things even more and most userspace tools do not even take them
into consideration.

So add a new variable in company with the old raw value to
report task state in symbolic chars, which are self-explaining
and no further translation is needed. Of course this does not
break any userspace tool.

Note for PREEMPT_ACTIVE, we introduce 'p' to report it and use
the old conventions for the rest.

Signed-off-by: Ze Gao <zegao@...cent.com>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@...nel.org>
Acked-by: Ian Rogers <irogers@...gle.com>
---
 include/trace/events/sched.h | 54 +++++++++++++++++++++---------------
 1 file changed, 32 insertions(+), 22 deletions(-)

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index fbb99a61f714..e507901bcab8 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -6,6 +6,7 @@
 #define _TRACE_SCHED_H
 
 #include <linux/kthread.h>
+#include <linux/sched.h>
 #include <linux/sched/numa_balancing.h>
 #include <linux/tracepoint.h>
 #include <linux/binfmts.h>
@@ -214,6 +215,27 @@ static inline long __trace_sched_switch_state(bool preempt,
 
 	return state ? (1 << (state - 1)) : state;
 }
+
+static inline char __trace_sched_switch_state_char(bool preempt,
+						   unsigned int prev_state,
+						   struct task_struct *p)
+{
+	long state;
+
+#ifdef CONFIG_SCHED_DEBUG
+	BUG_ON(p != current);
+#endif /* CONFIG_SCHED_DEBUG */
+
+	/*
+	 * For PREEMPT_ACTIVE, we introduce 'p' to report it and use the old
+	 * conventions for the rest.
+	 */
+	if (preempt)
+		return 'p';
+
+	state = __task_state_index(prev_state, p->exit_state);
+	return task_index_to_char(state);
+}
 #endif /* CREATE_TRACE_POINTS */
 
 /*
@@ -233,6 +255,7 @@ TRACE_EVENT(sched_switch,
 		__field(	pid_t,	prev_pid			)
 		__field(	int,	prev_prio			)
 		__field(	long,	prev_state			)
+		__field(	char,	prev_state_char			)
 		__array(	char,	next_comm,	TASK_COMM_LEN	)
 		__field(	pid_t,	next_pid			)
 		__field(	int,	next_prio			)
@@ -240,32 +263,19 @@ TRACE_EVENT(sched_switch,
 
 	TP_fast_assign(
 		memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
-		__entry->prev_pid	= prev->pid;
-		__entry->prev_prio	= prev->prio;
-		__entry->prev_state	= __trace_sched_switch_state(preempt, prev_state, prev);
+		__entry->prev_pid		= prev->pid;
+		__entry->prev_prio		= prev->prio;
+		__entry->prev_state		= __trace_sched_switch_state(preempt, prev_state, prev);
+		__entry->prev_state_char	= __trace_sched_switch_state_char(preempt, prev_state, prev);
 		memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
-		__entry->next_pid	= next->pid;
-		__entry->next_prio	= next->prio;
+		__entry->next_pid		= next->pid;
+		__entry->next_prio		= next->prio;
 		/* XXX SCHED_DEADLINE */
 	),
 
-	TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d",
-		__entry->prev_comm, __entry->prev_pid, __entry->prev_prio,
-
-		(__entry->prev_state & (TASK_REPORT_MAX - 1)) ?
-		  __print_flags(__entry->prev_state & (TASK_REPORT_MAX - 1), "|",
-				{ TASK_INTERRUPTIBLE, "S" },
-				{ TASK_UNINTERRUPTIBLE, "D" },
-				{ __TASK_STOPPED, "T" },
-				{ __TASK_TRACED, "t" },
-				{ EXIT_DEAD, "X" },
-				{ EXIT_ZOMBIE, "Z" },
-				{ TASK_PARKED, "P" },
-				{ TASK_DEAD, "I" }) :
-		  "R",
-
-		__entry->prev_state & TASK_REPORT_MAX ? "+" : "",
-		__entry->next_comm, __entry->next_pid, __entry->next_prio)
+	TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%c ==> next_comm=%s next_pid=%d next_prio=%d",
+		__entry->prev_comm, __entry->prev_pid, __entry->prev_prio, __entry->prev_state_char, __entry->next_comm,
+		__entry->next_pid, __entry->next_prio)
 );
 
 /*
-- 
2.40.1

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ