linux-kernel - Re: [PATCH] sched: fix task and run queue run

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20150930154413.GO3604@twins.programming.kicks-ass.net>
Date:	Wed, 30 Sep 2015 17:44:13 +0200
From:	Peter Zijlstra <peterz@...radead.org>
To:	"Meyer, Mike" <Mike.Meyer@...adata.com>
Cc:	"linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
	"mingo@...hat.com" <mingo@...hat.com>
Subject: Re: [PATCH] sched: fix task and run queue run_delay inconsistencies

On Wed, Sep 23, 2015 at 12:37:18AM +0000, Meyer, Mike wrote:
> During evaluation of some performance data, it was discovered thread
> and run queue run_delay accounting data was inconsistent with the other
> accounting data that was collected.  Further investigation found under
> certain circumstances execution time was leaking into the task and
> run queue accounting of run_delay.
> 
> Consider the following sequence:
> 
>     a. thread is running.
>     b. thread moves beween cgroups, changes scheduling class or priority.
>     c. thread sleeps OR
>     d. thread involuntarily gives up cpu.
> 
> a. implies:
> 
>     thread->sched_info.last_queued = 0
> 
> a. and b. results in the following:
> 
>     1. dequeue_task(rq, thread)
> 
>            sched_info_dequeued(rq, thread)
>                delta = 0
> 
>                sched_info_reset_dequeued(thread)
>                    thread->sched_info.last_queued = 0
> 
>                thread->sched_info.run_delay += delta
> 
>     2. enqueue_task(rq, thread)
> 
>            sched_info_queued(rq, thread)
> 
>                /* thread is still on cpu at this point. */
>                thread->sched_info.last_queued = task_rq(thread)->clock;
> 
> c. results in:
> 
>     dequeue_task(rq, thread)
> 
>         sched_info_dequeued(rq, thread)
> 
>             /* delta is execution time not run_delay. */
>             delta = task_rq(thread)->clock - thread->sched_info.last_queued
> 
>         sched_info_reset_dequeued(thread)
>             thread->sched_info.last_queued = 0
> 
>         thread->sched_info.run_delay += delta
> 
>     Since thread was running between enqueue_task(rq, thread) and
>     dequeue_task(rq, thread), the delta above is really execution
>     time and not run_delay.
> 
> d. results in:
> 
>     __sched_info_switch(thread, next_thread)
> 
>         sched_info_depart(rq, thread)
> 
>             sched_info_queued(rq, thread)
> 
>                 /* last_queued not updated due to being non-zero */
>                 return
> 
>     Since thread was running between enqueue_task(rq, thread) and
>     __sched_info_switch(thread, next_thread), the execution time
>     between enqueue_task(rq, thread) and
>     __sched_info_switch(thread, next_thread) now will become
>     associated with run_delay due to when last_queued was last updated.
> 
> The proposed patch addresses the issue by calling
> sched_info_reset_dequeued(thread) following the call to
> enqueue_task(rq, thread) for running threads in situations in which
> thread->sched_info.last_queued should remain 0.

Would something like the below; which avoids calling
sched_info_{de,}queued() for these sites also work?

It even shrinks the code (due to inlining {en,de}queue_task()):

$ size defconfig-build/kernel/sched/core.o defconfig-build/kernel/sched/core.o.orig
   text    data     bss     dec     hex filename
  64019   23378    2344   89741   15e8d defconfig-build/kernel/sched/core.o
  64149   23378    2344   89871   15f0f defconfig-build/kernel/sched/core.o.orig

---
 kernel/sched/core.c  | 42 ++++++++++++++++++++++++------------------
 kernel/sched/sched.h | 14 ++++++++------
 2 files changed, 32 insertions(+), 24 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fe819298c220..c5d579ad70cd 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -827,17 +827,19 @@ static void set_load_weight(struct task_struct *p)
 	load->inv_weight = prio_to_wmult[prio];
 }
 
-static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
+static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 {
 	update_rq_clock(rq);
-	sched_info_queued(rq, p);
+	if (!(flags & ENQUEUE_TEMP))
+		sched_info_queued(rq, p);
 	p->sched_class->enqueue_task(rq, p, flags);
 }
 
-static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
+static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 {
 	update_rq_clock(rq);
-	sched_info_dequeued(rq, p);
+	if (!(flags & DEQUEUE_TEMP))
+		sched_info_dequeued(rq, p);
 	p->sched_class->dequeue_task(rq, p, flags);
 }
 
@@ -1178,7 +1180,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 		 * holding rq->lock.
 		 */
 		lockdep_assert_held(&rq->lock);
-		dequeue_task(rq, p, 0);
+		dequeue_task(rq, p, DEQUEUE_TEMP);
 	}
 	if (running)
 		put_prev_task(rq, p);
@@ -1188,7 +1190,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 	if (running)
 		p->sched_class->set_curr_task(rq);
 	if (queued)
-		enqueue_task(rq, p, 0);
+		enqueue_task(rq, p, ENQUEUE_TEMP);
 }
 
 /*
@@ -3300,7 +3302,7 @@ EXPORT_SYMBOL(default_wake_function);
  */
 void rt_mutex_setprio(struct task_struct *p, int prio)
 {
-	int oldprio, queued, running, enqueue_flag = 0;
+	int oldprio, queued, running, enqueue_flag = ENQUEUE_TEMP;
 	struct rq *rq;
 	const struct sched_class *prev_class;
 
@@ -3332,7 +3334,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 	queued = task_on_rq_queued(p);
 	running = task_current(rq, p);
 	if (queued)
-		dequeue_task(rq, p, 0);
+		dequeue_task(rq, p, DEQUEUE_TEMP);
 	if (running)
 		put_prev_task(rq, p);
 
@@ -3350,7 +3352,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 		if (!dl_prio(p->normal_prio) ||
 		    (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
 			p->dl.dl_boosted = 1;
-			enqueue_flag = ENQUEUE_REPLENISH;
+			enqueue_flag |= ENQUEUE_REPLENISH;
 		} else
 			p->dl.dl_boosted = 0;
 		p->sched_class = &dl_sched_class;
@@ -3358,7 +3360,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 		if (dl_prio(oldprio))
 			p->dl.dl_boosted = 0;
 		if (oldprio < prio)
-			enqueue_flag = ENQUEUE_HEAD;
+			enqueue_flag |= ENQUEUE_HEAD;
 		p->sched_class = &rt_sched_class;
 	} else {
 		if (dl_prio(oldprio))
@@ -3410,7 +3412,7 @@ void set_user_nice(struct task_struct *p, long nice)
 	}
 	queued = task_on_rq_queued(p);
 	if (queued)
-		dequeue_task(rq, p, 0);
+		dequeue_task(rq, p, DEQUEUE_TEMP);
 
 	p->static_prio = NICE_TO_PRIO(nice);
 	set_load_weight(p);
@@ -3419,7 +3421,7 @@ void set_user_nice(struct task_struct *p, long nice)
 	delta = p->prio - old_prio;
 
 	if (queued) {
-		enqueue_task(rq, p, 0);
+		enqueue_task(rq, p, ENQUEUE_TEMP);
 		/*
 		 * If the task increased its priority or is running and
 		 * lowered its priority, then reschedule its CPU:
@@ -3921,7 +3923,7 @@ static int __sched_setscheduler(struct task_struct *p,
 	queued = task_on_rq_queued(p);
 	running = task_current(rq, p);
 	if (queued)
-		dequeue_task(rq, p, 0);
+		dequeue_task(rq, p, DEQUEUE_TEMP);
 	if (running)
 		put_prev_task(rq, p);
 
@@ -3931,11 +3933,15 @@ static int __sched_setscheduler(struct task_struct *p,
 	if (running)
 		p->sched_class->set_curr_task(rq);
 	if (queued) {
+		int enqueue_flags = ENQUEUE_TEMP;
 		/*
 		 * We enqueue to tail when the priority of a task is
 		 * increased (user space view).
 		 */
-		enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);
+		if (oldprio <= p->prio)
+			enqueue_flags |= ENQUEUE_HEAD;
+
+		enqueue_task(rq, p, enqueue_flags);
 	}
 
 	check_class_changed(rq, p, prev_class, oldprio);
@@ -5084,7 +5090,7 @@ void sched_setnuma(struct task_struct *p, int nid)
 	running = task_current(rq, p);
 
 	if (queued)
-		dequeue_task(rq, p, 0);
+		dequeue_task(rq, p, DEQUEUE_TEMP);
 	if (running)
 		put_prev_task(rq, p);
 
@@ -5093,7 +5099,7 @@ void sched_setnuma(struct task_struct *p, int nid)
 	if (running)
 		p->sched_class->set_curr_task(rq);
 	if (queued)
-		enqueue_task(rq, p, 0);
+		enqueue_task(rq, p, ENQUEUE_TEMP);
 	task_rq_unlock(rq, p, &flags);
 }
 #endif /* CONFIG_NUMA_BALANCING */
@@ -7712,7 +7718,7 @@ void sched_move_task(struct task_struct *tsk)
 	queued = task_on_rq_queued(tsk);
 
 	if (queued)
-		dequeue_task(rq, tsk, 0);
+		dequeue_task(rq, tsk, DEQUEUE_TEMP);
 	if (unlikely(running))
 		put_prev_task(rq, tsk);
 
@@ -7736,7 +7742,7 @@ void sched_move_task(struct task_struct *tsk)
 	if (unlikely(running))
 		tsk->sched_class->set_curr_task(rq);
 	if (queued)
-		enqueue_task(rq, tsk, 0);
+		enqueue_task(rq, tsk, ENQUEUE_TEMP);
 
 	task_rq_unlock(rq, tsk, &flags);
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index af6f252e7e34..d97a8d1abc66 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1150,16 +1150,18 @@ static const u32 prio_to_wmult[40] = {
  /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
 };
 
-#define ENQUEUE_WAKEUP		1
-#define ENQUEUE_HEAD		2
+#define ENQUEUE_WAKEUP		0x01
+#define ENQUEUE_HEAD		0x02
 #ifdef CONFIG_SMP
-#define ENQUEUE_WAKING		4	/* sched_class::task_waking was called */
+#define ENQUEUE_WAKING		0x04	/* sched_class::task_waking was called */
 #else
-#define ENQUEUE_WAKING		0
+#define ENQUEUE_WAKING		0x00
 #endif
-#define ENQUEUE_REPLENISH	8
+#define ENQUEUE_REPLENISH	0x08
+#define ENQUEUE_TEMP		0x10
 
-#define DEQUEUE_SLEEP		1
+#define DEQUEUE_SLEEP		0x01
+#define DEQUEUE_TEMP		0x02
 
 #define RETRY_TASK		((void *)-1UL)
 
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/