Message-ID: <20241209094941.GF21636@noisy.programming.kicks-ass.net>
Date: Mon, 9 Dec 2024 10:49:41 +0100
From: Peter Zijlstra <peterz@...radead.org>
To: Marcel Ziswiler <marcel.ziswiler@...ethink.co.uk>
Cc: mingo@...hat.com, juri.lelli@...hat.com, vincent.guittot@...aro.org,
dietmar.eggemann@....com, rostedt@...dmis.org, bsegall@...gle.com,
mgorman@...e.de, vschneid@...hat.com, linux-kernel@...r.kernel.org,
kprateek.nayak@....com, wuyun.abel@...edance.com,
youssefesmat@...omium.org, tglx@...utronix.de, efault@....de
Subject: Re: [REGRESSION] Re: [PATCH 00/24] Complete EEVDF
Sorry for the delay, I got laid low by snot monsters :/
On Mon, Dec 02, 2024 at 07:46:21PM +0100, Marcel Ziswiler wrote:
> Unfortunately, once I trigger the failure the system is completely dead and won't allow me to dump the trace
> buffer any longer. So I did the following instead on the serial console terminal:
>
> tail -f /sys/kernel/debug/tracing/trace
>
> Not sure whether there is any better way to go about this. Plus, even though we run the serial console at 1.5
> megabaud, I am not fully sure whether it was able to keep up logging what you are looking for.
Ah, that is unfortunate. There is an ftrace_dump_on_oops option that
might be of help. And yes, dumping trace buffers over a 1.5M baud serial
line is tedious -- been there, done that, got a t-shirt and all that.
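In case it helps, ftrace_dump_on_oops can be armed either on the kernel
command line or at runtime through the sysctl; roughly something like:

  ftrace_dump_on_oops                               (boot parameter)

  echo 1 > /proc/sys/kernel/ftrace_dump_on_oops     (at runtime)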
Still, let me see if perhaps making that WARN in enqueue_dl_entity()
return makes the whole thing less fatal.
I've included traceoff_on_warning and ftrace_dump in the code, so all
you still need to do is enable the stacktrace option:
echo 1 > /sys/kernel/debug/tracing/options/stacktrace
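Since the ftrace_dump will spew the entire buffer over the console when
the WARN hits, it is probably worth capturing the serial output to a
file on the host side as well; something along these lines (device node
and file name are just examples, use whatever your setup has):

  minicom -b 1500000 -D /dev/ttyUSB0 -C eevdf-trace.log

or screen -L with the same device and speed should do.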
> Yes, and do not hesitate to ask for any additional information and so on; we are happy to help. Thanks!
Could I bother you to try again with the below patch?
There are two new hunks vs the previous patch. The hunk in
enqueue_dl_entity() (the very last bit) will stop tracing and dump the
buffers when that condition is hit, in addition to aborting the double
enqueue, hopefully leaving the system in a slightly better state.
The other new hunk is the one for dl_server_stop() (the second hunk).
While going over the code last week I found that this might be a
possible hole leading to the observed double enqueue, so fingers
crossed.
---
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 33b4646f8b24..bd1df7612482 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1223,6 +1223,11 @@ static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_
scoped_guard (rq_lock, rq) {
struct rq_flags *rf = &scope.rf;
+ if (dl_se == &rq->fair_server) {
+ trace_printk("timer fair server %d throttled %d\n",
+ cpu_of(rq), dl_se->dl_throttled);
+ }
+
if (!dl_se->dl_throttled || !dl_se->dl_runtime)
return HRTIMER_NORESTART;
@@ -1674,6 +1679,12 @@ void dl_server_start(struct sched_dl_entity *dl_se)
void dl_server_stop(struct sched_dl_entity *dl_se)
{
+ if (current->dl_server == dl_se) {
+ struct rq *rq = rq_of_dl_se(dl_se);
+ trace_printk("stop fair server %d\n", cpu_of(rq));
+ current->dl_server = NULL;
+ }
+
if (!dl_se->dl_runtime)
return;
@@ -1792,6 +1803,9 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
rq_lock(rq, &rf);
}
+ if (dl_se == &rq->fair_server)
+ trace_printk("inactive fair server %d\n", cpu_of(rq));
+
sched_clock_tick();
update_rq_clock(rq);
@@ -1987,6 +2001,12 @@ update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se,
static void __enqueue_dl_entity(struct sched_dl_entity *dl_se)
{
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+ struct rq *rq = rq_of_dl_se(dl_se);
+
+ if (dl_se == &rq->fair_server) {
+ trace_printk("enqueue fair server %d h_nr_running %d\n",
+ cpu_of(rq), rq->cfs.h_nr_running);
+ }
WARN_ON_ONCE(!RB_EMPTY_NODE(&dl_se->rb_node));
@@ -1998,6 +2018,12 @@ static void __enqueue_dl_entity(struct sched_dl_entity *dl_se)
static void __dequeue_dl_entity(struct sched_dl_entity *dl_se)
{
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+ struct rq *rq = rq_of_dl_se(dl_se);
+
+ if (dl_se == &rq->fair_server) {
+ trace_printk("dequeue fair server %d h_nr_running %d\n",
+ cpu_of(rq), rq->cfs.h_nr_running);
+ }
if (RB_EMPTY_NODE(&dl_se->rb_node))
return;
@@ -2012,7 +2038,11 @@ static void __dequeue_dl_entity(struct sched_dl_entity *dl_se)
static void
enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags)
{
- WARN_ON_ONCE(on_dl_rq(dl_se));
+ if (WARN_ON_ONCE(on_dl_rq(dl_se))) {
+ tracing_off();
+ ftrace_dump(DUMP_ALL);
+ return;
+ }
update_stats_enqueue_dl(dl_rq_of_se(dl_se), dl_se, flags);