linux-kernel - [PATCH 20/25] sched/kcpustat: Introduce vtime-aware kcpustat accessor

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1542163569-20047-21-git-send-email-frederic@kernel.org>
Date:   Wed, 14 Nov 2018 03:46:04 +0100
From:   Frederic Weisbecker <frederic@...nel.org>
To:     LKML <linux-kernel@...r.kernel.org>
Cc:     Frederic Weisbecker <frederic@...nel.org>,
        Peter Zijlstra <peterz@...radead.org>,
        Wanpeng Li <wanpengli@...cent.com>,
        Thomas Gleixner <tglx@...utronix.de>,
        Yauheni Kaliuta <yauheni.kaliuta@...hat.com>,
        Ingo Molnar <mingo@...nel.org>, Rik van Riel <riel@...hat.com>
Subject: [PATCH 20/25] sched/kcpustat: Introduce vtime-aware kcpustat accessor

Kcpustat is not correctly supported on nohz_full CPUs. The tick doesn't
fire and the cputime therefore doesn't move forward. The issue has shown
up after the vanishing of the remaining 1Hz which has made the issue
visible.

We are solving that with tracking the task running on a CPU through RCU
and reading its vtime delta that we add to the raw kcpustat values.

We make sure that we fetch a coherent raw-kcpustat/vtime-delta couple
sequence while checking that the CPU referred by the target vtime is the
correct one, under the locked vtime seqcount.

Reported-by: Yauheni Kaliuta <yauheni.kaliuta@...hat.com>
Signed-off-by: Frederic Weisbecker <frederic@...nel.org>
Cc: Yauheni Kaliuta <yauheni.kaliuta@...hat.com>
Cc: Thomas Gleixner <tglx@...utronix.de>
Cc: Rik van Riel <riel@...hat.com>
Cc: Peter Zijlstra <peterz@...radead.org>
Cc: Wanpeng Li <wanpengli@...cent.com>
Cc: Ingo Molnar <mingo@...nel.org>
---
 include/linux/kernel_stat.h | 25 +++++++++++++
 kernel/sched/cputime.c      | 90 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 115 insertions(+)

diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 049d973..2d4d301 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -79,6 +79,31 @@ static inline unsigned int kstat_cpu_irqs_sum(unsigned int cpu)
 	return kstat_cpu(cpu).irqs_sum;
 }
 
+
+static inline void kcpustat_cputime_raw(struct kernel_cpustat *kcpustat,
+					u64 *user, u64 *nice, u64 *system,
+					u64 *guest, u64 *guest_nice)
+{
+	*user = kcpustat->cpustat[CPUTIME_USER];
+	*nice = kcpustat->cpustat[CPUTIME_NICE];
+	*system = kcpustat->cpustat[CPUTIME_SYSTEM];
+	*guest = kcpustat->cpustat[CPUTIME_GUEST];
+	*guest_nice = kcpustat->cpustat[CPUTIME_GUEST_NICE];
+}
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
+extern void kcpustat_cputime(struct kernel_cpustat *kcpustat, int cpu,
+			     u64 *user, u64 *nice, u64 *system,
+			     u64 *guest, u64 *guest_nice);
+#else
+static inline void kcpustat_cputime(struct kernel_cpustat *kcpustat, int cpu,
+				    u64 *user, u64 *nice, u64 *system,
+				    u64 *guest, u64 *guest_nice)
+{
+	kcpustat_cputime_raw(kcpustat, user, nice, system, guest, guest_nice);
+}
+#endif
+
 extern void account_user_time(struct task_struct *, u64);
 extern void account_guest_time(struct task_struct *, u64);
 extern void account_system_time(struct task_struct *, int, u64);
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 2b35132..3afde9f 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -1024,4 +1024,94 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
 			*utime += vtime->utime + delta;
 	} while (read_seqcount_retry(&vtime->seqcount, seq));
 }
+
+static int kcpustat_vtime(struct kernel_cpustat *kcpustat, struct vtime *vtime,
+			  int cpu, u64 *user, u64 *nice,
+			  u64 *system, u64 *guest, u64 *guest_nice)
+{
+	unsigned int seq;
+	u64 delta;
+	int err;
+
+	do {
+		seq = read_seqcount_begin(&vtime->seqcount);
+
+		/*
+		 * We raced against context switch, fetch the
+		 * kcpustat task again.
+		 */
+		if (vtime->cpu != cpu && vtime->cpu != -1) {
+			err = -EAGAIN;
+			continue;
+		}
+
+		err = 0;
+
+		kcpustat_cputime_raw(kcpustat, user, nice,
+				     system, guest, guest_nice);
+
+		/* Task is sleeping, dead or idle, nothing to add */
+		if (vtime->state < VTIME_SYS)
+			continue;
+
+		delta = vtime_delta(vtime);
+
+		/*
+		 * Task runs either in user (including guest) or kernel space,
+		 * add pending nohz time to the right place.
+		 */
+		if (vtime->state == VTIME_SYS) {
+			*system += vtime->stime + delta;
+		} else if (vtime->state == VTIME_USER) {
+			if (vtime->nice)
+				*nice += vtime->utime + delta;
+			else
+				*user += vtime->utime + delta;
+		} else {
+			WARN_ON_ONCE(vtime->state != VTIME_GUEST);
+			if (vtime->nice) {
+				*guest_nice += vtime->gtime + delta;
+				*nice += vtime->gtime + delta;
+			} else {
+				*guest += vtime->gtime + delta;
+				*user += vtime->gtime + delta;
+			}
+		}
+	} while (read_seqcount_retry(&vtime->seqcount, seq));
+
+	return err;
+}
+
+void kcpustat_cputime(struct kernel_cpustat *kcpustat, int cpu,
+		      u64 *user, u64 *nice, u64 *system,
+		      u64 *guest, u64 *guest_nice)
+{
+	struct task_struct *curr;
+	struct vtime *vtime;
+	int err;
+
+	if (!vtime_accounting_enabled()) {
+		kcpustat_cputime_raw(kcpustat, user, nice,
+				     system, guest, guest_nice);
+		return;
+	}
+
+	rcu_read_lock();
+
+	do {
+		curr = rcu_dereference(kcpustat->curr);
+		if (!curr) {
+			kcpustat_cputime_raw(kcpustat, user, nice,
+					     system, guest, guest_nice);
+			break;
+		}
+
+		vtime = &curr->vtime;
+		err = kcpustat_vtime(kcpustat, vtime, cpu, user,
+				     nice, system, guest, guest_nice);
+	} while (err == -EAGAIN);
+
+	rcu_read_unlock();
+}
+
 #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
-- 
2.7.4