[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <20240219175735.33171-1-nsaenz@amazon.com>
Date: Mon, 19 Feb 2024 17:57:35 +0000
From: Nicolas Saenz Julienne <nsaenz@...zon.com>
To: <frederic@...nel.org>, <paulmck@...nel.org>
CC: <jalliste@...zon.co.uk>, <nsaenz@...zon.com>, <mhiramat@...nel.org>,
<akpm@...ux-foundation.org>, <pmladek@...e.com>, <rdunlap@...radead.org>,
<tsi@...oix.net>, <nphamcs@...il.com>, <gregkh@...uxfoundation.org>,
<linux-kernel@...r.kernel.org>, <kvm@...r.kernel.org>, <pbonzini@...hat.com>,
<seanjc@...gle.com>
Subject: [RFC] cputime: Introduce option to force full dynticks accounting on NOHZ & NOHZ_IDLE CPUs
Under certain extreme conditions, the tick-based cputime accounting may
produce inaccurate data. For instance, guest CPU usage is sensitive to
interrupts firing right before the tick's expiration. This forces the
guest into kernel context, causing that time slice to be wrongly accounted as
system time. This issue is exacerbated if the interrupt source is in
sync with the tick, significantly skewing usage metrics towards system
time.
On CPUs with full dynticks enabled, cputime accounting leverages the
context tracking subsystem to measure usage, and isn't susceptible to
this sort of race condition. However, this imposes a bigger overhead,
including additional accounting and the extra dyntick tracking during
user<->kernel<->guest transitions (RmW + mb).
So, in order to get the best of both worlds, introduce a cputime
configuration option that allows using the full dynticks accounting
scheme on NOHZ & NOHZ_IDLE CPUs, while avoiding the expensive
user<->kernel<->guest dyntick transitions.
Signed-off-by: Nicolas Saenz Julienne <nsaenz@...zon.com>
Signed-off-by: Jack Allister <jalliste@...zon.co.uk>
---
NOTE: This wasn't tested in depth, and it's mostly intended to highlight
the issue we're trying to solve. Also ccing KVM folks, since it's
relevant to guest CPU usage accounting.
include/linux/context_tracking.h | 4 ++--
include/linux/vtime.h | 6 ++++--
init/Kconfig | 24 +++++++++++++++++++++++-
kernel/context_tracking.c | 25 ++++++++++++++++++++++---
4 files changed, 51 insertions(+), 8 deletions(-)
diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h
index 6e76b9dba00e7..dd9b500359aa6 100644
--- a/include/linux/context_tracking.h
+++ b/include/linux/context_tracking.h
@@ -102,11 +102,11 @@ static __always_inline void context_tracking_guest_exit(void) { }
#define CT_WARN_ON(cond) do { } while (0)
#endif /* !CONFIG_CONTEXT_TRACKING_USER */
-#ifdef CONFIG_CONTEXT_TRACKING_USER_FORCE
+#if defined(CONFIG_CONTEXT_TRACKING_USER_FORCE) || defined(CONFIG_VIRT_CPU_ACCOUNTING_GEN_FORCE)
extern void context_tracking_init(void);
#else
static inline void context_tracking_init(void) { }
-#endif /* CONFIG_CONTEXT_TRACKING_USER_FORCE */
+#endif /* CONFIG_CONTEXT_TRACKING_USER_FORCE || CONFIG_VIRT_CPU_ACCOUNTING_GEN_FORCE */
#ifdef CONFIG_CONTEXT_TRACKING_IDLE
extern void ct_idle_enter(void);
diff --git a/include/linux/vtime.h b/include/linux/vtime.h
index 3684487d01e1c..d78d01eead6e9 100644
--- a/include/linux/vtime.h
+++ b/include/linux/vtime.h
@@ -79,12 +79,14 @@ static inline bool vtime_accounting_enabled(void)
static inline bool vtime_accounting_enabled_cpu(int cpu)
{
- return context_tracking_enabled_cpu(cpu);
+ return IS_ENABLED(CONFIG_VIRT_CPU_ACCOUNTING_GEN_FORCE) ||
+ context_tracking_enabled_cpu(cpu);
}
static inline bool vtime_accounting_enabled_this_cpu(void)
{
- return context_tracking_enabled_this_cpu();
+ return IS_ENABLED(CONFIG_VIRT_CPU_ACCOUNTING_GEN_FORCE) ||
+ context_tracking_enabled_this_cpu();
}
extern void vtime_task_switch_generic(struct task_struct *prev);
diff --git a/init/Kconfig b/init/Kconfig
index 9ffb103fc927b..86877e1f416fc 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -473,6 +473,9 @@ menu "CPU/Task time and stats accounting"
config VIRT_CPU_ACCOUNTING
bool
+config VIRT_CPU_ACCOUNTING_GEN
+ bool
+
choice
prompt "Cputime accounting"
default TICK_CPU_ACCOUNTING
@@ -501,12 +504,13 @@ config VIRT_CPU_ACCOUNTING_NATIVE
this also enables accounting of stolen time on logically-partitioned
systems.
-config VIRT_CPU_ACCOUNTING_GEN
+config VIRT_CPU_ACCOUNTING_DYNTICKS
bool "Full dynticks CPU time accounting"
depends on HAVE_CONTEXT_TRACKING_USER
depends on HAVE_VIRT_CPU_ACCOUNTING_GEN
depends on GENERIC_CLOCKEVENTS
select VIRT_CPU_ACCOUNTING
+ select VIRT_CPU_ACCOUNTING_GEN
select CONTEXT_TRACKING_USER
help
Select this option to enable task and CPU time accounting on full
@@ -520,8 +524,26 @@ config VIRT_CPU_ACCOUNTING_GEN
If unsure, say N.
+config VIRT_CPU_ACCOUNTING_GEN_FORCE
+ bool "Force full dynticks CPU time accounting"
+ depends on HAVE_CONTEXT_TRACKING_USER
+ depends on HAVE_VIRT_CPU_ACCOUNTING_GEN
+ depends on GENERIC_CLOCKEVENTS
+ select VIRT_CPU_ACCOUNTING
+ select VIRT_CPU_ACCOUNTING_GEN
+ select CONTEXT_TRACKING_USER
+ help
+ Select this option to forcibly enable the full dynticks CPU time
+ accounting. This accounting is implemented by watching every
+ kernel-user boundary using the context tracking subsystem. The
+ accounting is thus performed at the expense of some overhead, but is
+ more precise than tick based CPU accounting.
+
+ If unsure, say N.
+
endchoice
+
config IRQ_TIME_ACCOUNTING
bool "Fine granularity task level IRQ time accounting"
depends on HAVE_IRQ_TIME_ACCOUNTING && !VIRT_CPU_ACCOUNTING_NATIVE
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 6ef0b35fc28c5..f70949430cf11 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -537,6 +537,13 @@ void noinstr __ct_user_enter(enum ctx_state state)
*/
raw_atomic_add(state, &ct->state);
}
+
+ if (IS_ENABLED(CONFIG_VIRT_CPU_ACCOUNTING_GEN_FORCE) &&
+ state == CONTEXT_USER) {
+ instrumentation_begin();
+ vtime_user_enter(current);
+ instrumentation_end();
+ }
}
}
context_tracking_recursion_exit();
@@ -645,6 +652,13 @@ void noinstr __ct_user_exit(enum ctx_state state)
*/
raw_atomic_sub(state, &ct->state);
}
+
+ if (IS_ENABLED(CONFIG_VIRT_CPU_ACCOUNTING_GEN_FORCE) &&
+ state == CONTEXT_USER) {
+ instrumentation_begin();
+ vtime_user_exit(current);
+ instrumentation_end();
+ }
}
}
context_tracking_recursion_exit();
@@ -715,13 +729,18 @@ void __init ct_cpu_track_user(int cpu)
initialized = true;
}
-#ifdef CONFIG_CONTEXT_TRACKING_USER_FORCE
+#if defined(CONFIG_CONTEXT_TRACKING_USER_FORCE) || defined(CONFIG_VIRT_CPU_ACCOUNTING_GEN_FORCE)
void __init context_tracking_init(void)
{
int cpu;
- for_each_possible_cpu(cpu)
- ct_cpu_track_user(cpu);
+ if (IS_ENABLED(CONFIG_CONTEXT_TRACKING_USER_FORCE)) {
+ for_each_possible_cpu(cpu)
+ ct_cpu_track_user(cpu);
+ }
+
+ if (IS_ENABLED(CONFIG_VIRT_CPU_ACCOUNTING_GEN_FORCE))
+ static_branch_inc(&context_tracking_key);
}
#endif
--
2.40.1
Powered by blists - more mailing lists