linux-kernel - Re: [PATCH] sched: Add trace for task wake up latency and leave running time

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <20200821184955.GT3982@worktop.programming.kicks-ass.net>
Date:   Fri, 21 Aug 2020 20:49:55 +0200
From:   Peter Zijlstra <peterz@...radead.org>
To:     Dongjiu Geng <gengdongjiu@...wei.com>
Cc:     mingo@...hat.com, juri.lelli@...hat.com,
        vincent.guittot@...aro.org, dietmar.eggemann@....com,
        rostedt@...dmis.org, bsegall@...gle.com, mgorman@...e.de,
        thara.gopinath@...aro.org, pauld@...hat.com,
        vincent.donnefort@....com, rdunlap@...radead.org,
        linux-kernel@...r.kernel.org, linuxarm@...wei.com
Subject: Re: [PATCH] sched: Add trace for task wake up latency and leave
 running time


> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 93ecd930efd3..edb622c40a90 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1324,6 +1324,13 @@ struct task_struct {
>  	/* CPU-specific state of this task: */
>  	struct thread_struct		thread;
>  
> +	/* Task wake up time stamp */
> +	u64				ts_wakeup;
> +	/* Previous task switch out time stamp */
> +	u64				pre_ts_end;
> +	/* Next task switch in time stamp */
> +	u64				next_ts_start;
> +	bool				wakeup_state;
>  	/*
>  	 * WARNING: on x86, 'thread_struct' contains a variable-sized
>  	 * structure.  It *MUST* be at the end of 'task_struct'.

^^^ did you read that comment?

> diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
> index fec25b9cfbaf..e99c6d573a42 100644
> --- a/include/trace/events/sched.h
> +++ b/include/trace/events/sched.h
> @@ -183,6 +183,72 @@ TRACE_EVENT(sched_switch,
>  		__entry->next_comm, __entry->next_pid, __entry->next_prio)
>  );
>  
> +DECLARE_EVENT_CLASS(sched_latency_template,
> +
> +	TP_PROTO(bool preempt,
> +		 struct task_struct *prev,
> +		 struct task_struct *next,
> +		 u64 time),
> +
> +	TP_ARGS(preempt, prev, next, time),
> +
> +	TP_STRUCT__entry(
> +		__array(	char,	prev_comm,	TASK_COMM_LEN	)
> +		__field(	pid_t,	prev_pid			)
> +		__field(	int,	prev_prio			)
> +		__field(	long,	prev_state			)
> +		__array(	char,	next_comm,	TASK_COMM_LEN	)
> +		__field(	pid_t,	next_pid			)
> +		__field(	int,	next_prio			)
> +		__field(	u64,	time				)
> +	),
> +
> +	TP_fast_assign(
> +		memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
> +		__entry->prev_pid	= prev->pid;
> +		__entry->prev_prio	= prev->prio;
> +		__entry->prev_state	= __trace_sched_switch_state(preempt, prev);
> +		memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
> +		__entry->next_pid	= next->pid;
> +		__entry->next_prio	= next->prio;
> +		__entry->time		= time;
> +		/* XXX SCHED_DEADLINE */
> +	),
> +
> +	TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d passed time=%llu (ns)",
> +		__entry->prev_comm, __entry->prev_pid, __entry->prev_prio,
> +
> +		(__entry->prev_state & (TASK_REPORT_MAX - 1)) ?
> +		  __print_flags(__entry->prev_state & (TASK_REPORT_MAX - 1), "|",
> +				{ TASK_INTERRUPTIBLE, "S" },
> +				{ TASK_UNINTERRUPTIBLE, "D" },
> +				{ __TASK_STOPPED, "T" },
> +				{ __TASK_TRACED, "t" },
> +				{ EXIT_DEAD, "X" },
> +				{ EXIT_ZOMBIE, "Z" },
> +				{ TASK_PARKED, "P" },
> +				{ TASK_DEAD, "I" }) :
> +		  "R",
> +
> +		__entry->prev_state & TASK_REPORT_MAX ? "+" : "",
> +		__entry->next_comm, __entry->next_pid, __entry->next_prio,
> +		__entry->time)
> +);

NAK, that tracepoint is already broken, we don't want to proliferate the
broken.

> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 8471a0f7eb32..b5a1928dc948 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -2464,6 +2464,8 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
>  {
>  	check_preempt_curr(rq, p, wake_flags);
>  	p->state = TASK_RUNNING;
> +	p->ts_wakeup = local_clock();
> +	p->wakeup_state = true;
>  	trace_sched_wakeup(p);
>  
>  #ifdef CONFIG_SMP

NAK, userless overhead.

> @@ -2846,6 +2848,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
>  		success = 1;
>  		trace_sched_waking(p);
>  		p->state = TASK_RUNNING;
> +		p->ts_wakeup = local_clock();
> +		p->wakeup_state = true;
>  		trace_sched_wakeup(p);
>  		goto out;
>  	}

idem


> @@ -3355,6 +3362,8 @@ void wake_up_new_task(struct task_struct *p)
>  	post_init_entity_util_avg(p);
>  
>  	activate_task(rq, p, ENQUEUE_NOCLOCK);
> +	p->ts_wakeup = local_clock();
> +	p->wakeup_state = true;
>  	trace_sched_wakeup_new(p);
>  	check_preempt_curr(rq, p, WF_FORK);
>  #ifdef CONFIG_SMP

idem

> @@ -4521,8 +4530,19 @@ static void __sched notrace __schedule(bool preempt)
>  
>  		psi_sched_switch(prev, next, !task_on_rq_queued(prev));
>  
> +		next->next_ts_start = prev->pre_ts_end = local_clock();
> +
>  		trace_sched_switch(preempt, prev, next);
>  
> +		if (next->ts_wakeup && next->wakeup_state) {
> +			trace_sched_wakeup_latency(preempt, prev, next,
> +				next->next_ts_start - next->ts_wakeup);
> +			next->wakeup_state = false;
> +		}
> +
> +		trace_sched_leave_running_time(preempt, prev,
> +			next, next->next_ts_start - next->pre_ts_end);
> +
>  		/* Also unlocks the rq: */
>  		rq = context_switch(rq, prev, next, &rf);
>  	} else {

This really isn't ever going to happen.