lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Date:	Wed, 3 Feb 2016 13:55:50 -0500
From:	Steven Rostedt <rostedt@...dmis.org>
To:	Peter Zijlstra <peterz@...radead.org>,
	Ingo Molnar <mingo@...nel.org>
Cc:	LKML <linux-kernel@...r.kernel.org>,
	Clark Williams <williams@...hat.com>,
	John Kacur <jkacur@...hat.com>,
	Daniel Bristot de Oliveira <bristot@...hat.com>,
	Juri Lelli <juri.lelli@...il.com>
Subject: [BUG] Corrupted SCHED_DEADLINE bandwidth with cpusets

There's an accounting issue with the SCHED_DEADLINE and the creation of
cpusets. If a SCHED_DEADLINE task already exists and a new root domain
is created, the calculation of the bandwidth among the root domains
gets corrupted.

For the reproducer, I downloaded Juri's tests:

  https://github.com/jlelli/tests.git 

    For his burn.c file.

  https://github.com/jlelli/schedtool-dl.git

    For his modified schedtool utility.


I have a kernel with my patches that show the bandwidth:

 # grep dl /proc/sched_debug        
dl_rq[0]:
  .dl_nr_running                 : 0
  .dl_bw->bw                     : 996147
  .dl_bw->total_bw               : 0
dl_rq[1]:
  .dl_nr_running                 : 0
  .dl_bw->bw                     : 996147
  .dl_bw->total_bw               : 0
dl_rq[2]:
  .dl_nr_running                 : 0
  .dl_bw->bw                     : 996147
  .dl_bw->total_bw               : 0
dl_rq[3]:
  .dl_nr_running                 : 0
  .dl_bw->bw                     : 996147
  .dl_bw->total_bw               : 0
dl_rq[4]:
  .dl_nr_running                 : 0
  .dl_bw->bw                     : 996147
  .dl_bw->total_bw               : 0
dl_rq[5]:
  .dl_nr_running                 : 0
  .dl_bw->bw                     : 996147
  .dl_bw->total_bw               : 0
dl_rq[6]:
  .dl_nr_running                 : 0
  .dl_bw->bw                     : 996147
  .dl_bw->total_bw               : 0
dl_rq[7]:
  .dl_nr_running                 : 0
  .dl_bw->bw                     : 996147
  .dl_bw->total_bw               : 0


Note: the sched_rt runtime and period are 950000 and 1000000
respectively, and the bw ratio is (95/100) << 20 == 996147.

This isn't the way I first discovered the issue, but it appears to be
the quickest way to reproduce it.

Make sure there are no other cpusets. As libvirt created some, I had to
remove them first:

 # rmdir /sys/fs/cgroup/cpuset/libvirt/{qemu,}


 # burn&
 # schedtool -E -t 2000000:20000000 $!

 # grep dl /proc/sched_debug
dl_rq[0]:
  .dl_nr_running                 : 0
  .dl_bw->bw                     : 996147
  .dl_bw->total_bw               : 104857
dl_rq[1]:
  .dl_nr_running                 : 0
  .dl_bw->bw                     : 996147
  .dl_bw->total_bw               : 104857
dl_rq[2]:
  .dl_nr_running                 : 0
  .dl_bw->bw                     : 996147
  .dl_bw->total_bw               : 104857
dl_rq[3]:
  .dl_nr_running                 : 0
  .dl_bw->bw                     : 996147
  .dl_bw->total_bw               : 104857
dl_rq[4]:
  .dl_nr_running                 : 0
  .dl_bw->bw                     : 996147
  .dl_bw->total_bw               : 104857
dl_rq[5]:
  .dl_nr_running                 : 0
  .dl_bw->bw                     : 996147
  .dl_bw->total_bw               : 104857
dl_rq[6]:
  .dl_nr_running                 : 0
  .dl_bw->bw                     : 996147
  .dl_bw->total_bw               : 104857
dl_rq[7]:
  .dl_nr_running                 : 0
  .dl_bw->bw                     : 996147
  .dl_bw->total_bw               : 104857

Note: (2/20) << 20 == 104857

 # echo 0 > /sys/fs/cgroup/cpuset/cpuset.sched_load_balance

 # grep dl /proc/sched_debug                                       
dl_rq[0]:
  .dl_nr_running                 : 0
  .dl_bw->bw                     : 996147
  .dl_bw->total_bw               : 0
dl_rq[1]:
  .dl_nr_running                 : 0
  .dl_bw->bw                     : 996147
  .dl_bw->total_bw               : 0
dl_rq[2]:
  .dl_nr_running                 : 0
  .dl_bw->bw                     : 996147
  .dl_bw->total_bw               : 0
dl_rq[3]:
  .dl_nr_running                 : 0
  .dl_bw->bw                     : 996147
  .dl_bw->total_bw               : 0
dl_rq[4]:
  .dl_nr_running                 : 0
  .dl_bw->bw                     : 996147
  .dl_bw->total_bw               : 0
dl_rq[5]:
  .dl_nr_running                 : 0
  .dl_bw->bw                     : 996147
  .dl_bw->total_bw               : 0
dl_rq[6]:
  .dl_nr_running                 : 0
  .dl_bw->bw                     : 996147
  .dl_bw->total_bw               : 0
dl_rq[7]:
  .dl_nr_running                 : 0
  .dl_bw->bw                     : 996147
  .dl_bw->total_bw               : 0

Notice that after removing load_balancing from the main cpuset, all the
totals went to zero.

Let's see what happens when we kill the task.

 # killall burn

 # grep dl /proc/sched_debug
dl_rq[0]:
  .dl_nr_running                 : 0
  .dl_bw->bw                     : 996147
  .dl_bw->total_bw               : -104857
dl_rq[1]:
  .dl_nr_running                 : 0
  .dl_bw->bw                     : 996147
  .dl_bw->total_bw               : -104857
dl_rq[2]:
  .dl_nr_running                 : 0
  .dl_bw->bw                     : 996147
  .dl_bw->total_bw               : -104857
dl_rq[3]:
  .dl_nr_running                 : 0
  .dl_bw->bw                     : 996147
  .dl_bw->total_bw               : -104857
dl_rq[4]:
  .dl_nr_running                 : 0
  .dl_bw->bw                     : 996147
  .dl_bw->total_bw               : -104857
dl_rq[5]:
  .dl_nr_running                 : 0
  .dl_bw->bw                     : 996147
  .dl_bw->total_bw               : -104857
dl_rq[6]:
  .dl_nr_running                 : 0
  .dl_bw->bw                     : 996147
  .dl_bw->total_bw               : -104857
dl_rq[7]:
  .dl_nr_running                 : 0
  .dl_bw->bw                     : 996147
  .dl_bw->total_bw               : -104857

They all went negative!

Not good, but we can recover...

 # echo 1 > /sys/fs/cgroup/cpuset/cpuset.sched_load_balance 

 # grep dl /proc/sched_debug
dl_rq[0]:
  .dl_nr_running                 : 0
  .dl_bw->bw                     : 996147
  .dl_bw->total_bw               : 0
dl_rq[1]:
  .dl_nr_running                 : 0
  .dl_bw->bw                     : 996147
  .dl_bw->total_bw               : 0
dl_rq[2]:
  .dl_nr_running                 : 0
  .dl_bw->bw                     : 996147
  .dl_bw->total_bw               : 0
dl_rq[3]:
  .dl_nr_running                 : 0
  .dl_bw->bw                     : 996147
  .dl_bw->total_bw               : 0
dl_rq[4]:
  .dl_nr_running                 : 0
  .dl_bw->bw                     : 996147
  .dl_bw->total_bw               : 0
dl_rq[5]:
  .dl_nr_running                 : 0
  .dl_bw->bw                     : 996147
  .dl_bw->total_bw               : 0
dl_rq[6]:
  .dl_nr_running                 : 0
  .dl_bw->bw                     : 996147
  .dl_bw->total_bw               : 0
dl_rq[7]:
  .dl_nr_running                 : 0
  .dl_bw->bw                     : 996147
  .dl_bw->total_bw               : 0

Playing with this a bit more, I found that it appears that setting
load_balance to 1 in the toplevel cpuset always resets the deadline
bandwidth whether or not it should be. At least that's a way to recover
from things not working anymore, but I still believe this is a bug.

Things can get even worse when adding a bunch of cpusets. But it
appears to always be cleared if you kill all sched_deadline tasks and
set the toplevel cpuset sched_load_balance to 1.


I traced this with the below patch and have this:

       schedtool-2279  [004] d...   124.195359: __sched_setscheduler: dl_b=ffff88011a01d040 new=104857 old=0
       schedtool-2279  [004] d...   124.195361: __dl_add: (ffff88011a01d040) total=0 add tsk=104857
       schedtool-2279  [004] d...   124.195362: __dl_add: new total=104857
       schedtool-2279  [004] d...   124.195364: <stack trace>
 => __dl_add
 => __sched_setscheduler
 => SyS_sched_setattr
 => entry_SYSCALL_64_fastpath
            bash-2213  [003] d...   142.917342: rq_attach_root: old_rd refcount=8
            bash-2213  [003] d...   142.917344: rq_attach_root: add new rd ffffffff81fcd9c0 (ffffffff81fcda00) total=0
            bash-2213  [003] d...   142.917351: <stack trace>
 => rq_attach_root
 => cpu_attach_domain
 => partition_sched_domains
 => rebuild_sched_domains_locked
 => update_flag
 => cpuset_write_u64
 => cgroup_file_write
 => kernfs_fop_write
 => __vfs_write
 => vfs_write
 => SyS_write
 => entry_SYSCALL_64_fastpath
            bash-2213  [003] d...   142.917387: rq_attach_root: old_rd refcount=7
            bash-2213  [003] d...   142.917388: rq_attach_root: add new rd ffffffff81fcd9c0 (ffffffff81fcda00) total=0
            bash-2213  [003] d...   142.917393: <stack trace>
 => rq_attach_root
 => cpu_attach_domain
 => partition_sched_domains
 => rebuild_sched_domains_locked
 => update_flag
 => cpuset_write_u64
 => cgroup_file_write
 => kernfs_fop_write
 => __vfs_write
 => vfs_write
 => SyS_write
 => entry_SYSCALL_64_fastpath
            bash-2213  [003] d...   142.917420: rq_attach_root: old_rd refcount=6
            bash-2213  [003] d...   142.917420: rq_attach_root: add new rd ffffffff81fcd9c0 (ffffffff81fcda00) total=0
            bash-2213  [003] d...   142.917425: <stack trace>
 => rq_attach_root
 => cpu_attach_domain
 => partition_sched_domains
 => rebuild_sched_domains_locked
 => update_flag
 => cpuset_write_u64
 => cgroup_file_write
 => kernfs_fop_write
 => __vfs_write
 => vfs_write
 => SyS_write
 => entry_SYSCALL_64_fastpath
            bash-2213  [003] d...   142.917452: rq_attach_root: old_rd refcount=5
            bash-2213  [003] d...   142.917452: rq_attach_root: add new rd ffffffff81fcd9c0 (ffffffff81fcda00) total=0
            bash-2213  [003] d...   142.917457: <stack trace>
 => rq_attach_root
 => cpu_attach_domain
 => partition_sched_domains
 => rebuild_sched_domains_locked
 => update_flag
 => cpuset_write_u64
 => cgroup_file_write
 => kernfs_fop_write
 => __vfs_write
 => vfs_write
 => SyS_write
 => entry_SYSCALL_64_fastpath
            bash-2213  [003] d...   142.917485: rq_attach_root: old_rd refcount=4
            bash-2213  [003] d...   142.917485: rq_attach_root: add new rd ffffffff81fcd9c0 (ffffffff81fcda00) total=0
            bash-2213  [003] d...   142.917490: <stack trace>
 => rq_attach_root
 => cpu_attach_domain
 => partition_sched_domains
 => rebuild_sched_domains_locked
 => update_flag
 => cpuset_write_u64
 => cgroup_file_write
 => kernfs_fop_write
 => __vfs_write
 => vfs_write
 => SyS_write
 => entry_SYSCALL_64_fastpath
            bash-2213  [003] d...   142.917518: rq_attach_root: old_rd refcount=3
            bash-2213  [003] d...   142.917519: rq_attach_root: add new rd ffffffff81fcd9c0 (ffffffff81fcda00) total=0
            bash-2213  [003] d...   142.917524: <stack trace>
 => rq_attach_root
 => cpu_attach_domain
 => partition_sched_domains
 => rebuild_sched_domains_locked
 => update_flag
 => cpuset_write_u64
 => cgroup_file_write
 => kernfs_fop_write
 => __vfs_write
 => vfs_write
 => SyS_write
 => entry_SYSCALL_64_fastpath
            bash-2213  [003] d...   142.917550: rq_attach_root: old_rd refcount=2
            bash-2213  [003] d...   142.917550: rq_attach_root: add new rd ffffffff81fcd9c0 (ffffffff81fcda00) total=0
            bash-2213  [003] d...   142.917556: <stack trace>
 => rq_attach_root
 => cpu_attach_domain
 => partition_sched_domains
 => rebuild_sched_domains_locked
 => update_flag
 => cpuset_write_u64
 => cgroup_file_write
 => kernfs_fop_write
 => __vfs_write
 => vfs_write
 => SyS_write
 => entry_SYSCALL_64_fastpath
            bash-2213  [003] d...   142.917582: rq_attach_root: old_rd refcount=1
            bash-2213  [003] d...   142.917582: rq_attach_root: old rd ffff88011a01d000 (ffff88011a01d040) total=104857
            bash-2213  [003] d...   142.917582: rq_attach_root: add new rd ffffffff81fcd9c0 (ffffffff81fcda00) total=0
            bash-2213  [003] d...   142.917588: <stack trace>
 => rq_attach_root
 => cpu_attach_domain
 => partition_sched_domains
 => rebuild_sched_domains_locked
 => update_flag
 => cpuset_write_u64
 => cgroup_file_write
 => kernfs_fop_write
 => __vfs_write
 => vfs_write
 => SyS_write
 => entry_SYSCALL_64_fastpath
     kworker/0:2-268   [000] d...   153.335522: task_dead_dl: [0:ffffffff81fcda00] total=0 tsk=104857
     kworker/0:2-268   [000] d...   153.335523: task_dead_dl: new total=-104857
     kworker/0:2-268   [000] d...   153.335528: <stack trace>
 => task_dead_dl
 => finish_task_switch
 => __schedule
 => schedule
 => worker_thread
 => kthread
 => ret_from_fork


Seems to be some disconnect between cgroups and sched deadline root
domains.

-- Steve

---
 kernel/sched/core.c     |    9 +++++++++
 kernel/sched/deadline.c |    5 +++++
 kernel/sched/sched.h    |    6 ++++++
 3 files changed, 20 insertions(+)

Index: linux-trace.git/kernel/sched/core.c
===================================================================
--- linux-trace.git.orig/kernel/sched/core.c	2016-02-03 10:54:55.158659968 -0500
+++ linux-trace.git/kernel/sched/core.c	2016-02-03 13:52:56.213696814 -0500
@@ -2301,6 +2301,7 @@ static int dl_overflow(struct task_struc
 	if (new_bw == p->dl.dl_bw)
 		return 0;
 
+	trace_printk("dl_b=%p new=%lld old=%lld\n", dl_b, new_bw, p->dl.dl_bw);
 	/*
 	 * Either if a task, enters, leave, or stays -deadline but changes
 	 * its parameters, we may need to update accordingly the total
@@ -5068,6 +5069,7 @@ int task_can_attach(struct task_struct *
 		if (overflow)
 			ret = -EBUSY;
 		else {
+			struct dl_bw *src_dl_b;
 			/*
 			 * We reserve space for this task in the destination
 			 * root_domain, as we can't fail after this point.
@@ -5075,6 +5077,8 @@ int task_can_attach(struct task_struct *
 			 * later on (see set_cpus_allowed_dl()).
 			 */
 			__dl_add(dl_b, p->dl.dl_bw);
+			src_dl_b = dl_bw_of(task_cpu(p));
+			trace_printk("source %p total=%lld\n", src_dl_b, src_dl_b->total_bw);
 		}
 		raw_spin_unlock_irqrestore(&dl_b->lock, flags);
 		rcu_read_unlock_sched();
@@ -5636,6 +5640,7 @@ static void rq_attach_root(struct rq *rq
 
 		cpumask_clear_cpu(rq->cpu, old_rd->span);
 
+		trace_printk("old_rd refcount=%d\n", atomic_read(&old_rd->refcount));
 		/*
 		 * If we dont want to free the old_rd yet then
 		 * set old_rd to NULL to skip the freeing later
@@ -5646,7 +5651,11 @@ static void rq_attach_root(struct rq *rq
 	}
 
 	atomic_inc(&rd->refcount);
+	if (old_rd)
+		trace_printk("old rd %p (%p) total=%lld\n", rq->rd, &rq->rd->dl_bw, rq->rd->dl_bw.total_bw);
+	trace_printk("add new rd %p (%p) total=%lld\n", rd, &rd->dl_bw, rd->dl_bw.total_bw);
 	rq->rd = rd;
+	trace_dump_stack(0);
 
 	cpumask_set_cpu(rq->cpu, rd->span);
 	if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
Index: linux-trace.git/kernel/sched/deadline.c
===================================================================
--- linux-trace.git.orig/kernel/sched/deadline.c	2016-02-03 10:21:27.140280992 -0500
+++ linux-trace.git/kernel/sched/deadline.c	2016-02-03 13:44:41.321432980 -0500
@@ -66,6 +66,8 @@ void init_dl_bw(struct dl_bw *dl_b)
 	else
 		dl_b->bw = to_ratio(global_rt_period(), global_rt_runtime());
 	raw_spin_unlock(&def_dl_bandwidth.dl_runtime_lock);
+	trace_printk("clear total_bw %p (was:%lld)\n", dl_b, dl_b->total_bw);
+	trace_dump_stack(0);
 	dl_b->total_bw = 0;
 }
 
@@ -1224,7 +1226,10 @@ static void task_dead_dl(struct task_str
 	 */
 	raw_spin_lock_irq(&dl_b->lock);
 	/* XXX we should retain the bw until 0-lag */
+	trace_printk("[%d:%p] total=%lld tsk=%lld\n", task_cpu(p), dl_b, dl_b->total_bw, p->dl.dl_bw);
 	dl_b->total_bw -= p->dl.dl_bw;
+	trace_printk("new total=%lld\n", dl_b->total_bw);
+	trace_dump_stack(0);
 	raw_spin_unlock_irq(&dl_b->lock);
 }
 
Index: linux-trace.git/kernel/sched/sched.h
===================================================================
--- linux-trace.git.orig/kernel/sched/sched.h	2016-02-03 10:54:55.182659590 -0500
+++ linux-trace.git/kernel/sched/sched.h	2016-02-03 13:44:41.321432980 -0500
@@ -192,13 +192,19 @@ struct dl_bw {
 static inline
 void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
 {
+	trace_printk("(%p) total=%lld sub tsk=%lld\n", dl_b, dl_b->total_bw, tsk_bw);
 	dl_b->total_bw -= tsk_bw;
+	trace_printk("new total=%lld\n", dl_b->total_bw);
+	trace_dump_stack(0);
 }
 
 static inline
 void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
 {
+	trace_printk("(%p) total=%lld add tsk=%lld\n", dl_b, dl_b->total_bw, tsk_bw);
 	dl_b->total_bw += tsk_bw;
+	trace_printk("new total=%lld\n", dl_b->total_bw);
+	trace_dump_stack(0);
 }
 
 static inline

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ