[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20191218104941.GR2844@hirez.programming.kicks-ass.net>
Date: Wed, 18 Dec 2019 11:49:41 +0100
From: Peter Zijlstra <peterz@...radead.org>
To: Ming Lei <ming.lei@...hat.com>
Cc: Thomas Gleixner <tglx@...utronix.de>, Jens Axboe <axboe@...nel.dk>,
linux-kernel@...r.kernel.org, linux-block@...r.kernel.org,
Long Li <longli@...rosoft.com>, Ingo Molnar <mingo@...hat.com>,
Christoph Hellwig <hch@....de>,
Keith Busch <keith.busch@...el.com>,
Sagi Grimberg <sagi@...mberg.me>,
John Garry <john.garry@...wei.com>,
Hannes Reinecke <hare@...e.com>
Subject: Re: [RFC PATCH 2/3] softirq: implement interrupt flood detection
On Wed, Dec 18, 2019 at 03:19:41PM +0800, Ming Lei wrote:
> diff --git a/kernel/softirq.c b/kernel/softirq.c
> index 0427a86743a4..f6e434ac4183 100644
> --- a/kernel/softirq.c
> +++ b/kernel/softirq.c
> @@ -25,6 +25,8 @@
> #include <linux/smpboot.h>
> #include <linux/tick.h>
> #include <linux/irq.h>
> +#include <linux/sched.h>
> +#include <linux/sched/clock.h>
>
> #define CREATE_TRACE_POINTS
> #include <trace/events/irq.h>
> @@ -52,6 +54,26 @@ DEFINE_PER_CPU_ALIGNED(irq_cpustat_t, irq_stat);
> EXPORT_PER_CPU_SYMBOL(irq_stat);
> #endif
>
> +#define IRQ_INTERVAL_STAGE1_WEIGHT_BITS ilog2(512)
> +#define IRQ_INTERVAL_STAGE2_WEIGHT_BITS ilog2(128)
That must be the most difficult way of writing 9 and 7 resp.
> +#define IRQ_INTERVAL_THRESHOLD_UNIT_NS 1000
> +
> +#define IRQ_INTERVAL_MIN_THRESHOLD_NS IRQ_INTERVAL_THRESHOLD_UNIT_NS
> +#define IRQ_INTERVAL_MAX_MIN_THRESHOLD_TIME_NS 4000000000
(seriously a name with MAX_MIN in it ?!?)
That's unreadable, we have (4*NSEC_PER_SEC) for that (if I counted the
0s correctly)
These are all a bunch of magic value, any justification for them? Will
they always work?
> +
> +struct irq_interval {
> + u64 clock;
> + int avg;
> + int std_threshold:31;
I know of a standard deviation, but what is a standard threshold?
> + int stage:1;
signed single bit.. there's people that object to that. They figure just
a sign bit isn't much useful.
> +
> + u64 stage_start_clock;
> + unsigned stage1_time;
> + unsigned stage2_time;
> +};
> +DEFINE_PER_CPU(struct irq_interval, avg_irq_interval);
> +
> static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
>
> DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
> @@ -339,6 +361,140 @@ asmlinkage __visible void do_softirq(void)
> local_irq_restore(flags);
> }
>
> +static inline void irq_interval_update_avg(struct irq_interval *inter,
> + u64 now, int weight_bits)
> +{
> + inter->avg = inter->avg - ((inter->avg) >> weight_bits) +
> + ((now - inter->clock) >> weight_bits);
Did you perhaps want to write something like:
s64 sample = now - inter->clock;
inter->avg += (sample - inter->avg) >> weight_bits;
Which is a recognisable form.
It also shows the obvious overflow when sample is large (interrupts
didn't happen for a while). You'll want to clamp @sample to some max.
> + if (unlikely(inter->avg < 0))
> + inter->avg = 0;
And since inter->avg must never be <0, wth are you using a signed
bitfield? This generates shit code. Use an on-stack temporary if
anything:
int avg = inter->avg;
avg += (sample - avg) >> bits;
if (avg < 0)
avg = 0;
inter->avg = avg;
and presto! no signed bitfields required.
> +}
> +
> +/*
> + * Keep the ratio of stage2 time to stage1 time between 1/2 and 1/8. If
> + * it is out of the range, adjust .std_threshold for maintaining the ratio.
it is either @std_threshold or @irq_interval::std_threshold
> + */
> +static inline void irq_interval_update_threshold(struct irq_interval *inter)
> +{
> + if (inter->stage2_time * 2 > inter->stage1_time)
> + inter->std_threshold -= IRQ_INTERVAL_THRESHOLD_UNIT_NS;
> + if (inter->stage2_time * 8 < inter->stage1_time)
> + inter->std_threshold += IRQ_INTERVAL_THRESHOLD_UNIT_NS;
I suppose that will eventually converge.
> + if (inter->std_threshold <= 0)
> + inter->std_threshold = IRQ_INTERVAL_THRESHOLD_UNIT_NS;
I think you'll find you actually meant to write:
if (inter->std_threshold < IRQ_INTERVAL_THRESHOLD_UNIT_NS)
> + if (inter->std_threshold >= 32 * IRQ_INTERVAL_THRESHOLD_UNIT_NS)
> + inter->std_threshold = 32 * IRQ_INTERVAL_THRESHOLD_UNIT_NS;
We actually have a macro for this:
inter->std_threshold = clamp(inter->std_threshold,
IRQ_INTERVAL_THRESHOLD_UNIT_NS,
32 * IRQ_INTERVAL_THRESHOLD_UNIT_NS);
> +}
> +
> +/*
> + * If we stay at stage1 for too long with minimized threshold and low enough
> + * interrupt average interval, there may have risk to lock up CPU.
It's not locked up...
> + */
> +static bool irq_interval_cpu_lockup_risk(struct irq_interval *inter, u64 now)
> +{
> + if (inter->avg > inter->std_threshold)
> + return false;
> +
> + if (inter->std_threshold != IRQ_INTERVAL_MIN_THRESHOLD_NS)
> + return false;
> +
> + if (now - inter->stage_start_clock <=
> + IRQ_INTERVAL_MAX_MIN_THRESHOLD_TIME_NS)
> + return false;
> + return true;
> +}
> +
> +/*
> + * Update average interrupt interval with the Exponential Weighted Moving
> + * Average(EWMA), and implement two-stage interrupt flood detection.
> + *
> + * Use scheduler's runqueue software clock at default for figuring
> + * interrupt interval for saving cost. When the interval becomes zero,
> + * it is reasonable to conclude scheduler's activity on this CPU has been
> + * stopped because of interrupt flood. Then switch to the 2nd stage
> + * detection in which clock is read from hardware, and the detection
> + * result can be more reliable.
> + */
> +static void irq_interval_update(void)
> +{
> + struct irq_interval *inter = raw_cpu_ptr(&avg_irq_interval);
raw_cpu_ptr is wrong, this wants to be this_cpu_ptr()
> + u64 now;
> +
> + if (likely(!inter->stage)) {
> + now = sched_local_rq_clock();
> + irq_interval_update_avg(inter, now,
> + IRQ_INTERVAL_STAGE1_WEIGHT_BITS);
> +
> + if (unlikely(inter->avg < inter->std_threshold / 2 ||
> + irq_interval_cpu_lockup_risk(inter, now))) {
> + inter->stage = 1;
> + now = sched_clock_cpu(smp_processor_id());
> + inter->stage1_time = now - inter->stage_start_clock;
> + inter->stage_start_clock = now;
> +
> + /*
> + * reset as the mean of the min and the max value of
> + * stage2's threshold
> + */
> + inter->avg = inter->std_threshold +
> + (inter->std_threshold >> 2);
> + }
> + } else {
> + now = sched_clock_cpu(smp_processor_id());
> +
> + irq_interval_update_avg(inter, now,
> + IRQ_INTERVAL_STAGE2_WEIGHT_BITS);
> +
> + if (inter->avg > inter->std_threshold * 2) {
> + inter->stage = 0;
> + inter->stage2_time = now - inter->stage_start_clock;
> + inter->stage_start_clock = now;
> +
> + irq_interval_update_threshold(inter);
> + }
> + }
> +}
AFAICT the only reason for much of this complexity is so that you can
use this sched_local_rq_clock() thing, right? Once that reaches a
threshold, you go use the more accurate sched_clock_cpu() and once that
tickles the threshold you call it golden and raise hell.
So pray tell, why did you not integrate this with IRQ_TIME_ACCOUNTING ?
That already takes a timestamp and does most of what you need.
> @@ -356,6 +512,7 @@ void irq_enter(void)
> }
>
> __irq_enter();
> + irq_interval_update();
> }
Arggh.. you're going to make every single interrupt take at least 2
extra cache misses for this gunk?!?
And it lumps all interrupts on a single heap, and doesn't do any of the
otherwise useful things we've been wanting to have IRQ timings for :/
_If_ you want to do something like this, do it like the below. That only
adds a few instruction to irq_exit() and only touches a cacheline that's
already touched.
It computes both the avg duration and the avg inter-arrival-time of
hardirqs. Things get critical when:
inter-arrival-avg < 2*duration-avg
or something like that.
---
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index d43318a489f2..6f5ef70b5a1d 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -50,7 +50,7 @@ static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
void irqtime_account_irq(struct task_struct *curr)
{
struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
- s64 delta;
+ s64 delta, iat;
int cpu;
if (!sched_clock_irqtime)
@@ -58,7 +58,6 @@ void irqtime_account_irq(struct task_struct *curr)
cpu = smp_processor_id();
delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
- irqtime->irq_start_time += delta;
/*
* We do not account for softirq time from ksoftirqd here.
@@ -66,10 +65,21 @@ void irqtime_account_irq(struct task_struct *curr)
* in that case, so as not to confuse scheduler with a special task
* that do not consume any time, but still wants to run.
*/
- if (hardirq_count())
+ if (hardirq_count()) {
irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
- else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
+
+ /* this is irq_exit(), delta is the duration of the hardirq */
+ irqtime->duration_avg += (delta - irqtime->duration_avg) >> 7;
+
+ /* compute the inter arrival time using the previous arrival_time */
+ iat = irqtime->irq_start_time - irqtime->irq_arrival_time;
+ irqtime->irq_arrival_time += iat;
+ irqtime->irq_inter_arrival_avg += (iat - irqtime->inter_arrival_avg) >> 7;
+
+ } else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
+
+ irqtime->irq_start_time += delta;
}
EXPORT_SYMBOL_GPL(irqtime_account_irq);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 280a3c735935..cab07e5a6c11 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2236,6 +2236,9 @@ struct irqtime {
u64 total;
u64 tick_delta;
u64 irq_start_time;
+ u64 irq_duration_avg;
+ u64 irq_arrival_time;
+ u64 irq_inter_arrival_avg;
struct u64_stats_sync sync;
};
Powered by blists - more mailing lists