[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20230725072254.32045-2-zegao@tencent.com>
Date: Tue, 25 Jul 2023 15:22:52 +0800
From: Ze Gao <zegao2021@...il.com>
To: Ingo Molnar <mingo@...hat.com>,
Peter Zijlstra <peterz@...radead.org>,
Steven Rostedt <rostedt@...dmis.org>,
Namhyung Kim <namhyung@...nel.org>,
Ian Rogers <irogers@...gle.com>,
Arnaldo Carvalho de Melo <acme@...nel.org>,
Jiri Olsa <jolsa@...nel.org>,
Mark Rutland <mark.rutland@....com>,
Masami Hiramatsu <mhiramat@...nel.org>,
Adrian Hunter <adrian.hunter@...el.com>,
Alexander Shishkin <alexander.shishkin@...ux.intel.com>
Cc: linux-kernel@...r.kernel.org, linux-perf-users@...r.kernel.org,
linux-trace-kernel@...r.kernel.org, Ze Gao <zegao@...cent.com>
Subject: [RFC PATCH 1/3] sched, tracing: report task state in symbolic chars instead
Internal representations of task state are likely to be changed or reordered,
and reporting them to userspace without exporting them as part of the API is not
a good choice, as it can easily break userspace observability tools as the kernel
evolves. For example, perf suffers from this and still reports wrong states
as of this patch.
OTOH, some masqueraded states like TASK_REPORT_IDLE and TASK_REPORT_MAX are
also reported inadvertently, which confuses things even more.
So report task state as a symbolic char instead, which is self-explanatory and
needs no further translation.
Note that for PREEMPT_ACTIVE, we introduce 'p' to report it and use the old
conventions for the rest.
Signed-off-by: Ze Gao <zegao@...cent.com>
---
include/trace/events/sched.h | 41 +++++++++++++-----------------------
1 file changed, 15 insertions(+), 26 deletions(-)
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index fbb99a61f714..51102a7c0c2d 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -6,6 +6,7 @@
#define _TRACE_SCHED_H
#include <linux/kthread.h>
+#include <linux/sched.h>
#include <linux/sched/numa_balancing.h>
#include <linux/tracepoint.h>
#include <linux/binfmts.h>
@@ -187,7 +188,7 @@ DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new,
TP_ARGS(p));
#ifdef CREATE_TRACE_POINTS
-static inline long __trace_sched_switch_state(bool preempt,
+static inline const char __trace_sched_switch_state(bool preempt,
unsigned int prev_state,
struct task_struct *p)
{
@@ -197,13 +198,6 @@ static inline long __trace_sched_switch_state(bool preempt,
BUG_ON(p != current);
#endif /* CONFIG_SCHED_DEBUG */
- /*
- * Preemption ignores task state, therefore preempted tasks are always
- * RUNNING (we will not have dequeued if state != RUNNING).
- */
- if (preempt)
- return TASK_REPORT_MAX;
-
/*
* task_state_index() uses fls() and returns a value from 0-8 range.
* Decrement it by 1 (except TASK_RUNNING state i.e 0) before using
@@ -212,7 +206,16 @@ static inline long __trace_sched_switch_state(bool preempt,
*/
state = __task_state_index(prev_state, p->exit_state);
- return state ? (1 << (state - 1)) : state;
+ /*
+ * Preemption ignores task state, therefore preempted tasks are always
+ * RUNNING (we will not have dequeued if state != RUNNING).
+ * Here, we use 'p' to denote this case and only for this case.
+ */
+ if (preempt)
+ return 'p';
+
+
+ return task_index_to_char(state);
}
#endif /* CREATE_TRACE_POINTS */
@@ -232,7 +235,7 @@ TRACE_EVENT(sched_switch,
__array( char, prev_comm, TASK_COMM_LEN )
__field( pid_t, prev_pid )
__field( int, prev_prio )
- __field( long, prev_state )
+ __field( char, prev_state )
__array( char, next_comm, TASK_COMM_LEN )
__field( pid_t, next_pid )
__field( int, next_prio )
@@ -249,22 +252,8 @@ TRACE_EVENT(sched_switch,
/* XXX SCHED_DEADLINE */
),
- TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d",
- __entry->prev_comm, __entry->prev_pid, __entry->prev_prio,
-
- (__entry->prev_state & (TASK_REPORT_MAX - 1)) ?
- __print_flags(__entry->prev_state & (TASK_REPORT_MAX - 1), "|",
- { TASK_INTERRUPTIBLE, "S" },
- { TASK_UNINTERRUPTIBLE, "D" },
- { __TASK_STOPPED, "T" },
- { __TASK_TRACED, "t" },
- { EXIT_DEAD, "X" },
- { EXIT_ZOMBIE, "Z" },
- { TASK_PARKED, "P" },
- { TASK_DEAD, "I" }) :
- "R",
-
- __entry->prev_state & TASK_REPORT_MAX ? "+" : "",
+ TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%c ==> next_comm=%s next_pid=%d next_prio=%d",
+ __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, __entry->prev_state,
__entry->next_comm, __entry->next_pid, __entry->next_prio)
);
--
2.40.1
Powered by blists - more mailing lists