lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:   Fri, 22 Sep 2017 13:54:30 +0200
From:   Peter Zijlstra <peterz@...radead.org>
To:     Markus Trippelsdorf <markus@...ppelsdorf.de>
Cc:     Tejun Heo <tj@...nel.org>, linux-kernel@...r.kernel.org,
        "Luis R. Rodriguez" <mcgrof@...nel.org>,
        "Eric W. Biederman" <ebiederm@...ssion.com>,
        "Paul E. McKenney" <paulmck@...ux.vnet.ibm.com>,
        Linus Torvalds <torvalds@...ux-foundation.org>,
        Steven Rostedt <rostedt@...dmis.org>,
        Thomas Gleixner <tglx@...utronix.de>,
        Ingo Molnar <mingo@...nel.org>
Subject: [RFC][PATCH] sched: Cleanup task->state printing

On Fri, Sep 22, 2017 at 11:35:33AM +0200, Markus Trippelsdorf wrote:
> > It seems to work. Simply returning "I (idle)" from get_task_state() in
> > fs/proc/array.c when the state is TASK_IDLE does the trick.
> > I've tested top, htop and ps.

I ended up with the below; there was quite a lot of inconsistent state
printing around it seems.

I should probably split this thing into a bunch of patches :/

Alongside an explicit idle state, this also exposes TASK_PARKED,
although arguably we could map that to idle too. Opinions?

---
 fs/proc/array.c                   | 35 ++++++++++++-----------
 include/linux/sched.h             | 58 +++++++++++++++++++++++----------------
 include/trace/events/sched.h      | 24 +++++++++++-----
 kernel/sched/core.c               | 22 ++++++++++++++-
 kernel/sched/debug.c              |  2 --
 kernel/trace/trace_output.c       | 21 ++++----------
 kernel/trace/trace_sched_wakeup.c | 12 ++++----
 7 files changed, 103 insertions(+), 71 deletions(-)

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 88c355574aa0..5a076854857f 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -118,28 +118,31 @@ static inline void task_name(struct seq_file *m, struct task_struct *p)
  * simple bit tests.
  */
 static const char * const task_state_array[] = {
-	"R (running)",		/*   0 */
-	"S (sleeping)",		/*   1 */
-	"D (disk sleep)",	/*   2 */
-	"T (stopped)",		/*   4 */
-	"t (tracing stop)",	/*   8 */
-	"X (dead)",		/*  16 */
-	"Z (zombie)",		/*  32 */
+	/* states inside TASK_REPORT */
+
+	"R (running)",		/* 0x00 */
+	"S (sleeping)",		/* 0x01 */
+	"D (disk sleep)",	/* 0x02 */
+	"T (stopped)",		/* 0x04 */
+	"t (tracing stop)",	/* 0x08 */
+	"X (dead)",		/* 0x10 */
+	"Z (zombie)",		/* 0x20 */
+	"P (parked)",		/* 0x40 */
+
+	/* extra states, beyond TASK_REPORT */
+
+	"I (idle)",		/* 0x80 */
 };
 
 static inline const char *get_task_state(struct task_struct *tsk)
 {
-	unsigned int state = (tsk->state | tsk->exit_state) & TASK_REPORT;
+	unsigned int tsk_state = READ_ONCE(tsk->state);
+	unsigned int state = (tsk_state | tsk->exit_state) & TASK_REPORT;
 
-	/*
-	 * Parked tasks do not run; they sit in __kthread_parkme().
-	 * Without this check, we would report them as running, which is
-	 * clearly wrong, so we report them as sleeping instead.
-	 */
-	if (tsk->state == TASK_PARKED)
-		state = TASK_INTERRUPTIBLE;
+	if (tsk_state == TASK_IDLE)
+		state = TASK_REPORT_MAX;
 
-	BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != ARRAY_SIZE(task_state_array)-1);
+	BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != ARRAY_SIZE(task_state_array)-2);
 
 	return task_state_array[fls(state)];
 }
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 68b38335d33c..7ae81efb17bd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -65,25 +65,27 @@ struct task_group;
  */
 
 /* Used in tsk->state: */
-#define TASK_RUNNING			0
-#define TASK_INTERRUPTIBLE		1
-#define TASK_UNINTERRUPTIBLE		2
-#define __TASK_STOPPED			4
-#define __TASK_TRACED			8
+#define TASK_RUNNING			0x0000
+#define TASK_INTERRUPTIBLE		0x0001
+#define TASK_UNINTERRUPTIBLE		0x0002
+#define __TASK_STOPPED			0x0004
+#define __TASK_TRACED			0x0008
 /* Used in tsk->exit_state: */
-#define EXIT_DEAD			16
-#define EXIT_ZOMBIE			32
+#define EXIT_DEAD			0x0010
+#define EXIT_ZOMBIE			0x0020
 #define EXIT_TRACE			(EXIT_ZOMBIE | EXIT_DEAD)
 /* Used in tsk->state again: */
-#define TASK_DEAD			64
-#define TASK_WAKEKILL			128
-#define TASK_WAKING			256
-#define TASK_PARKED			512
-#define TASK_NOLOAD			1024
-#define TASK_NEW			2048
-#define TASK_STATE_MAX			4096
+#define TASK_PARKED			0x0040
+#define TASK_REPORT_MAX			0x0080
 
-#define TASK_STATE_TO_CHAR_STR		"RSDTtXZxKWPNn"
+/* Not in TASK_REPORT: */
+#define TASK_DEAD			0x0080
+#define TASK_WAKEKILL			0x0100
+#define TASK_WAKING			0x0200
+#define TASK_NOLOAD			0x0400
+#define TASK_NEW			0x0800
+
+#define TASK_STATE_MAX			0x1000
 
 /* Convenience macros for the sake of set_current_state: */
 #define TASK_KILLABLE			(TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
@@ -96,10 +98,11 @@ struct task_group;
 #define TASK_NORMAL			(TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)
 #define TASK_ALL			(TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED)
 
-/* get_task_state(): */
+/* task_state_to_char(), get_task_state(), trace_sched_switch() */
 #define TASK_REPORT			(TASK_RUNNING | TASK_INTERRUPTIBLE | \
 					 TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \
-					 __TASK_TRACED | EXIT_ZOMBIE | EXIT_DEAD)
+					 __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \
+					 TASK_PARKED)
 
 #define task_is_traced(task)		((task->state & __TASK_TRACED) != 0)
 
@@ -1244,17 +1247,24 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk)
 	return task_pgrp_nr_ns(tsk, &init_pid_ns);
 }
 
-static inline char task_state_to_char(struct task_struct *task)
+static inline char __task_state_to_char(unsigned int state)
 {
-	const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
-	unsigned long state = task->state;
+	static const char state_char[] = "RSDTtXZPI";
 
-	state = state ? __ffs(state) + 1 : 0;
+	BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != sizeof(state_char) - 3);
+
+	return state_char[fls(state)];
+}
+
+static inline char task_state_to_char(struct task_struct *task)
+{
+	unsigned int tsk_state = READ_ONCE(task->state);
+	unsigned int state = (tsk_state | task->exit_state) & TASK_REPORT;
 
-	/* Make sure the string lines up properly with the number of task states: */
-	BUILD_BUG_ON(sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1);
+	if (tsk_state == TASK_IDLE)
+		state = TASK_REPORT_MAX;
 
-	return state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?';
+	return __task_state_to_char(state);
 }
 
 /**
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index ae1409ffe99a..af1858a01335 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -106,6 +106,8 @@ DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new,
 #ifdef CREATE_TRACE_POINTS
 static inline long __trace_sched_switch_state(bool preempt, struct task_struct *p)
 {
+	unsigned int state = READ_ONCE(p->state);
+
 #ifdef CONFIG_SCHED_DEBUG
 	BUG_ON(p != current);
 #endif /* CONFIG_SCHED_DEBUG */
@@ -114,10 +116,18 @@ static inline long __trace_sched_switch_state(bool preempt, struct task_struct *
 	 * Preemption ignores task state, therefore preempted tasks are always
 	 * RUNNING (we will not have dequeued if state != RUNNING).
 	 */
-	return preempt ? TASK_RUNNING | TASK_STATE_MAX : p->state;
+	if (preempt)
+		return TASK_REPORT_MAX << 1;
+
+	if (state == TASK_IDLE)
+		return TASK_REPORT_MAX;
+
+	return (state | p->exit_state) & TASK_REPORT;
 }
 #endif /* CREATE_TRACE_POINTS */
 
+#define TRACE_REPORT_MASK ((TASK_REPORT_MAX << 1) - 1)
+
 /*
  * Tracepoint for task switches, performed by the scheduler:
  */
@@ -152,13 +162,13 @@ TRACE_EVENT(sched_switch,
 
 	TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d",
 		__entry->prev_comm, __entry->prev_pid, __entry->prev_prio,
-		__entry->prev_state & (TASK_STATE_MAX-1) ?
-		  __print_flags(__entry->prev_state & (TASK_STATE_MAX-1), "|",
+
+		__entry->prev_state & TRACE_REPORT_MASK ?
+		  __print_flags(__entry->prev_state & TRACE_REPORT_MASK, "|",
 				{ 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" },
-				{ 16, "Z" }, { 32, "X" }, { 64, "x" },
-				{ 128, "K" }, { 256, "W" }, { 512, "P" },
-				{ 1024, "N" }) : "R",
-		__entry->prev_state & TASK_STATE_MAX ? "+" : "",
+				{ 16, "X" }, { 32, "Z" }, { 64, "P" },
+				{ 128, "I" }) : "R",
+		__entry->prev_state & (TASK_REPORT_MAX << 1) ? "+" : "",
 		__entry->next_comm, __entry->next_pid, __entry->next_prio)
 );
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 703f5831738e..431e2d6c709e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5164,6 +5164,26 @@ void sched_show_task(struct task_struct *p)
 	put_task_stack(p);
 }
 
+static inline bool state_filter_match(unsigned long state_filter, struct task_struct *p)
+{
+	/* no filter, everything matches */
+	if (!state_filter)
+		return true;
+
+	/* filter, but doesn't match */
+	if (!(p->state & state_filter))
+		return false;
+
+	/*
+	 * When looking for TASK_UNINTERRUPTIBLE, skip TASK_IDLE, but allow
+	 * TASK_KILLABLE.
+	 */
+	if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE)
+		return false;
+
+	return true;
+}
+
 void show_state_filter(unsigned long state_filter)
 {
 	struct task_struct *g, *p;
@@ -5186,7 +5206,7 @@ void show_state_filter(unsigned long state_filter)
 		 */
 		touch_nmi_watchdog();
 		touch_all_softlockup_watchdogs();
-		if (!state_filter || (p->state & state_filter))
+		if (state_filter_match(state_filter, p))
 			sched_show_task(p);
 	}
 
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4a23bbc3111b..244619e402cc 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -461,8 +461,6 @@ static char *task_group_path(struct task_group *tg)
 }
 #endif
 
-static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
-
 static void
 print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
 {
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index bac629af2285..c738e764e2a5 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -656,15 +656,6 @@ int trace_print_lat_context(struct trace_iterator *iter)
 	return !trace_seq_has_overflowed(s);
 }
 
-static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
-
-static int task_state_char(unsigned long state)
-{
-	int bit = state ? __ffs(state) + 1 : 0;
-
-	return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?';
-}
-
 /**
  * ftrace_find_event - find a registered event
  * @type: the type of event to look for
@@ -930,8 +921,8 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
 
 	trace_assign_type(field, iter->ent);
 
-	T = task_state_char(field->next_state);
-	S = task_state_char(field->prev_state);
+	T = __task_state_to_char(field->next_state);
+	S = __task_state_to_char(field->prev_state);
 	trace_find_cmdline(field->next_pid, comm);
 	trace_seq_printf(&iter->seq,
 			 " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
@@ -966,8 +957,8 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
 	trace_assign_type(field, iter->ent);
 
 	if (!S)
-		S = task_state_char(field->prev_state);
-	T = task_state_char(field->next_state);
+		S = __task_state_to_char(field->prev_state);
+	T = __task_state_to_char(field->next_state);
 	trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n",
 			 field->prev_pid,
 			 field->prev_prio,
@@ -1002,8 +993,8 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
 	trace_assign_type(field, iter->ent);
 
 	if (!S)
-		S = task_state_char(field->prev_state);
-	T = task_state_char(field->next_state);
+		S = __task_state_to_char(field->prev_state);
+	T = __task_state_to_char(field->next_state);
 
 	SEQ_PUT_HEX_FIELD(s, field->prev_pid);
 	SEQ_PUT_HEX_FIELD(s, field->prev_prio);
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index ddec53b67646..b14caa0afd35 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -380,7 +380,7 @@ probe_wakeup_migrate_task(void *ignore, struct task_struct *task, int cpu)
 }
 
 static void
-tracing_sched_switch_trace(struct trace_array *tr,
+tracing_sched_switch_trace(bool preempt, struct trace_array *tr,
 			   struct task_struct *prev,
 			   struct task_struct *next,
 			   unsigned long flags, int pc)
@@ -397,10 +397,10 @@ tracing_sched_switch_trace(struct trace_array *tr,
 	entry	= ring_buffer_event_data(event);
 	entry->prev_pid			= prev->pid;
 	entry->prev_prio		= prev->prio;
-	entry->prev_state		= prev->state;
+	entry->prev_state		= __trace_sched_switch_state(preempt, prev);
 	entry->next_pid			= next->pid;
 	entry->next_prio		= next->prio;
-	entry->next_state		= next->state;
+	entry->next_state		= __trace_sched_switch_state(false, next);
 	entry->next_cpu	= task_cpu(next);
 
 	if (!call_filter_check_discard(call, entry, buffer, event))
@@ -425,10 +425,10 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 	entry	= ring_buffer_event_data(event);
 	entry->prev_pid			= curr->pid;
 	entry->prev_prio		= curr->prio;
-	entry->prev_state		= curr->state;
+	entry->prev_state		= __trace_sched_switch_state(false, curr);
 	entry->next_pid			= wakee->pid;
 	entry->next_prio		= wakee->prio;
-	entry->next_state		= wakee->state;
+	entry->next_state		= __trace_sched_switch_state(false, wakee);
 	entry->next_cpu			= task_cpu(wakee);
 
 	if (!call_filter_check_discard(call, entry, buffer, event))
@@ -482,7 +482,7 @@ probe_wakeup_sched_switch(void *ignore, bool preempt,
 	data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu);
 
 	__trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
-	tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
+	tracing_sched_switch_trace(preempt, wakeup_trace, prev, next, flags, pc);
 
 	T0 = data->preempt_timestamp;
 	T1 = ftrace_now(cpu);

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ