Message-ID: <431a5acb-4906-4095-8dec-b2d824adaac6@arm.com>
Date: Tue, 11 Mar 2025 17:22:16 +0000
From: Mark Barnett <mark.barnett@....com>
To: Peter Zijlstra <peterz@...radead.org>
Cc: mingo@...hat.com, acme@...nel.org, namhyung@...nel.org,
irogers@...gle.com, ben.gainey@....com, deepak.surti@....com,
ak@...ux.intel.com, will@...nel.org, james.clark@....com,
mark.rutland@....com, alexander.shishkin@...ux.intel.com, jolsa@...nel.org,
adrian.hunter@...el.com, linux-perf-users@...r.kernel.org,
linux-kernel@...r.kernel.org, linux-arm-kernel@...ts.infradead.org
Subject: Re: [PATCH v3 3/5] perf: Allow adding fixed random jitter to the
alternate sampling period
On 3/11/25 11:31, Peter Zijlstra wrote:
> On Fri, Mar 07, 2025 at 08:22:45PM +0000, mark.barnett@....com wrote:
>> From: Ben Gainey <ben.gainey@....com>
>>
>> This change modifies the core perf overflow handler, adding some small
>> random jitter to each sample period whenever an event switches between the
>> two alternate sample periods. A new flag is added to perf_event_attr to
>> opt into this behaviour.
>>
>> This change follows the discussion in [1], where it is recognized that it
>> may be possible for certain patterns of execution to end up with biased
>> results.
>>
>> [1] https://lore.kernel.org/linux-perf-users/Zc24eLqZycmIg3d2@tassilo/
>>
>> Signed-off-by: Ben Gainey <ben.gainey@....com>
>> Signed-off-by: Mark Barnett <mark.barnett@....com>
>> ---
>> include/uapi/linux/perf_event.h | 7 ++++++-
>> kernel/events/core.c | 9 ++++++++-
>> 2 files changed, 14 insertions(+), 2 deletions(-)
>>
>> diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
>> index 499a8673df8e..c0076ce8f80a 100644
>> --- a/include/uapi/linux/perf_event.h
>> +++ b/include/uapi/linux/perf_event.h
>> @@ -461,7 +461,12 @@ struct perf_event_attr {
>> inherit_thread : 1, /* children only inherit if cloned with CLONE_THREAD */
>> remove_on_exec : 1, /* event is removed from task on exec */
>> sigtrap : 1, /* send synchronous SIGTRAP on event */
>> - __reserved_1 : 26;
>> + /*
>> + * Add a limited amount of jitter on each alternate period, where
>> + * the jitter is between [0, (2<<jitter_alt_period) - 1]
>> + */
>> + jitter_alt_period : 3,
>> + __reserved_1 : 23;
>
> So; I've been thinking about this interface.
>
> I think I prefer you keep the existing sample_period/sample_freq working
> as is and simply modulate with random and high-freq components.
>
> A very little like so..
>
> I've made the hf_sample_period 32bit since I figured that ought to be
> enough -- you're aiming at very short periods after all. But there's
> enough unused bits left.
>
> So this has sample_period or sample_freq compute hwc->sample_period_base
> which is first modified with random such that the average is exactly
> sample_period_base (assuming a flat distribution).
>
> This means that sample_period_base is still the right number to use for
> computing freq based things. Additionally, have the 'extra' interrupt
> ignored for adaptive period crud.
>
> Also, someone needs to consider the eBPF hook and what to do with it.
> I've kept the ordering as per this series, but I suspect it's wrong and
> we want this before the BPF hook. Please think about this and explicitly
> mention this in the next series.
>
> Anyway, very much a sketch of things, incomplete and not been near a
> compiler.
>
>
Thanks, Peter!
OK, I see what you mean. Packing the fields into hf_sample makes sense.
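
Just to check I've read the layout right, this is roughly how I'd expect
a tool to drive it -- field names taken from your sketch, the event
choice and numbers are only illustrative, untested:

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <string.h>
#include <unistd.h>

static int open_hf_event(pid_t pid)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size          = sizeof(attr);	/* PERF_ATTR_SIZE_VER9 */
	attr.type          = PERF_TYPE_HARDWARE;
	attr.config        = PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_type   = PERF_SAMPLE_IP | PERF_SAMPLE_PERIOD;
	attr.sample_period = 1000000;		/* base period, as today */

	/* New bits: a short period injected into each base period,
	 * plus roughly +/-256 counts of jitter on the base. */
	attr.hf_sample_period = 5000;
	attr.hf_sample_rand   = 9;

	return syscall(__NR_perf_event_open, &attr, pid, -1, -1, 0);
}
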
I'll have a look at the eBPF hook and see if we need to do anything. The
sample period is always stored in perf_sample_data so it's technically
possible for eBPF programs to identify the high-frequency ones, but it's
not a great API. Maybe we should have an explicit flag.
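
As it stands, the best a program can do is compare the period it sees
against the configured HF period, something like this (untested;
hf_period would be filled in by the loader to match
attr.hf_sample_period):

#include <linux/bpf.h>
#include <linux/bpf_perf_event.h>
#include <bpf/bpf_helpers.h>

const volatile __u64 hf_period = 5000;

SEC("perf_event")
int guess_hf(struct bpf_perf_event_data *ctx)
{
	/*
	 * Heuristic only: a period <= hf_period suggests an HF sample,
	 * but a jittered base period close to hf_period can't be told
	 * apart this way.
	 */
	if (ctx->sample_period <= hf_period)
		bpf_printk("hf sample, period=%llu", ctx->sample_period);

	return 1;	/* non-zero keeps the normal overflow output */
}

char LICENSE[] SEC("license") = "GPL";

An explicit flag in the ctx would make that unambiguous.
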
I have one question about interrupt accounting, below...
>
> ---
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index 76f4265efee9..c5dd6442e96f 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -229,6 +229,10 @@ struct hw_perf_event {
> #define PERF_HES_UPTODATE 0x02 /* event->count up-to-date */
> #define PERF_HES_ARCH 0x04
>
> +#define PERF_HES_HF_ON 0x10 /* using high-freq sampling */
> +#define PERF_HES_HF_SAMPLE 0x20
> +#define PERF_HES_HF_RAND 0x40
> +
> int state;
>
> /*
> @@ -241,6 +245,7 @@ struct hw_perf_event {
> * The period to start the next sample with.
> */
> u64 sample_period;
> + u64 sample_period_base;
>
> union {
> struct { /* Sampling */
> diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
> index 0524d541d4e3..8dbe027f93f1 100644
> --- a/include/uapi/linux/perf_event.h
> +++ b/include/uapi/linux/perf_event.h
> @@ -379,6 +379,7 @@ enum perf_event_read_format {
> #define PERF_ATTR_SIZE_VER6 120 /* add: aux_sample_size */
> #define PERF_ATTR_SIZE_VER7 128 /* add: sig_data */
> #define PERF_ATTR_SIZE_VER8 136 /* add: config3 */
> +#define PERF_ATTR_SIZE_VER9 144 /* add: hf_sample */
>
> /*
> * Hardware event_id to monitor via a performance monitoring event:
> @@ -531,6 +532,14 @@ struct perf_event_attr {
> __u64 sig_data;
>
> __u64 config3; /* extension of config2 */
> + union {
> + __u64 hf_sample;
> + struct {
> + __u64 hf_sample_period : 32,
> + hf_sample_rand : 4,
> + __reserved_4 : 28;
> + };
> + };
> };
>
> /*
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index b87a5ac42ce2..e5a93edf3b5f 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -8,6 +8,7 @@
> * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@....ibm.com>
> */
>
> +#include "linux/random.h"
> #include <linux/fs.h>
> #include <linux/mm.h>
> #include <linux/cpu.h>
> @@ -55,6 +56,7 @@
> #include <linux/pgtable.h>
> #include <linux/buildid.h>
> #include <linux/task_work.h>
> +#include <linux/prandom.h>
>
> #include "internal.h"
>
> @@ -443,6 +445,8 @@ static cpumask_var_t perf_online_pkg_mask;
> static cpumask_var_t perf_online_sys_mask;
> static struct kmem_cache *perf_event_cache;
>
> +static struct rnd_state perf_rand;
> +
> /*
> * perf event paranoia level:
> * -1 - not paranoid at all
> @@ -4233,19 +4237,19 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
>
> period = perf_calculate_period(event, nsec, count);
>
> - delta = (s64)(period - hwc->sample_period);
> + delta = (s64)(period - hwc->sample_period_base);
> if (delta >= 0)
> delta += 7;
> else
> delta -= 7;
> delta /= 8; /* low pass filter */
>
> - sample_period = hwc->sample_period + delta;
> + sample_period = hwc->sample_period_base + delta;
>
> if (!sample_period)
> sample_period = 1;
>
> - hwc->sample_period = sample_period;
> + hwc->sample_period_base = sample_period;
>
> if (local64_read(&hwc->period_left) > 8*sample_period) {
> if (disable)
> @@ -4490,6 +4494,8 @@ void perf_event_task_tick(void)
> if (ctx)
> perf_adjust_freq_unthr_context(ctx, !!throttled);
> rcu_read_unlock();
> +
> + prandom_seed_state(&perf_rand, get_random_u64());
> }
>
> static int event_enable_on_exec(struct perf_event *event,
> @@ -9979,6 +9985,8 @@ static int __perf_event_overflow(struct perf_event *event,
> int throttle, struct perf_sample_data *data,
> struct pt_regs *regs)
> {
> + struct hw_perf_event *hwc = &event->hw;
> + u64 sample_period;
> int events = atomic_read(&event->event_limit);
> int ret = 0;
>
> @@ -9989,15 +9997,50 @@ static int __perf_event_overflow(struct perf_event *event,
> if (unlikely(!is_sampling_event(event)))
> return 0;
>
> - ret = __perf_event_account_interrupt(event, throttle);
> + /*
> + * High Freq samples are injected inside the larger period:
> + *
> + * |------------|-|------------|-|
> + * P0 HF P1 HF
> + *
> + * By ignoring the HF samples, we measure the actual period.
> + */
> + if (!(hwc->state & PERF_HES_HF_SAMPLE))
> + ret = __perf_event_account_interrupt(event, throttle);
>
The high-frequency samples should still contribute to interrupt
accounting/throttling, right? We'd just need to put guards around the
adaptive period stuff so that HF samples don't contribute to the
frequency training.
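
Roughly what I had in mind -- paraphrasing the current
__perf_event_account_interrupt(), throttling path unchanged, not
compiled:

static int __perf_event_account_interrupt(struct perf_event *event, int throttle)
{
	struct hw_perf_event *hwc = &event->hw;
	int ret = 0;

	/*
	 * ... existing throttling bookkeeping unchanged: bump
	 * hwc->interrupts, compare against max_samples_per_tick,
	 * perf_log_throttle() and ret = 1 when throttling ...
	 */

	/*
	 * HF samples are extra interrupts injected inside the base
	 * period; keep them out of the freq training so that
	 * perf_adjust_period() still converges on sample_period_base.
	 */
	if (event->attr.freq && !(hwc->state & PERF_HES_HF_SAMPLE)) {
		u64 now = perf_clock();
		s64 delta = now - hwc->freq_time_stamp;

		hwc->freq_time_stamp = now;

		if (delta > 0 && delta < 2*TICK_NSEC)
			perf_adjust_period(event, delta, hwc->last_period, true);
	}

	return ret;
}

That way the !(hwc->state & PERF_HES_HF_SAMPLE) check above could go
away and the HF interrupts would still count towards
max_samples_per_tick.
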
> if (event->attr.aux_pause)
> perf_event_aux_pause(event->aux_event, true);
>
> + /* XXX interaction between HF samples and BPF */
> if (event->prog && event->prog->type == BPF_PROG_TYPE_PERF_EVENT &&
> !bpf_overflow_handler(event, data, regs))
> goto out;
>
> + sample_period = hwc->sample_period_base;
> + if (hwc->state & PERF_HES_HF_RAND) {
> + u64 rand = 1 << event->attr.hf_sample_rand;
> + sample_period -= rand / 2;
> + sample_period += prandom_u32_state(&perf_rand) & (rand - 1);
> + }
> + if (hwc->state & PERF_HES_HF_ON) {
> + u64 hf_sample_period = event->attr.hf_sample_period;
> +
> + if (sample_period < hf_sample_period) {
> + hwc->state &= ~PERF_HES_HF_ON;
> + goto set_period;
> + }
> +
> + if (!(hwc->state & PERF_HES_HF_SAMPLE)) {
> + hwc->sample_period -= hf_sample_period;
> + hwc->state |= PERF_HES_HF_SAMPLE;
> + } else {
> + hwc->sample_period = hf_sample_period;
> + hwc->state &= ~PERF_HES_HF_SAMPLE;
> + }
> + }
> +set_period:
> + hwc->sample_period = sample_period;
> +
> /*
> * XXX event_limit might not quite work as expected on inherited
> * events
> @@ -12458,8 +12501,11 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
>
> hwc = &event->hw;
> hwc->sample_period = attr->sample_period;
> - if (attr->freq && attr->sample_freq)
> + hwc->sample_period_base = attr->sample_period;
> + if (attr->freq && attr->sample_freq) {
> hwc->sample_period = 1;
> + hwc->sample_period_base = 1;
> + }
> hwc->last_period = hwc->sample_period;
>
> local64_set(&hwc->period_left, hwc->sample_period);
> @@ -13824,6 +13870,7 @@ inherit_event(struct perf_event *parent_event,
> struct hw_perf_event *hwc = &child_event->hw;
>
> hwc->sample_period = sample_period;
> + hwc->sample_period_base = sample_period;
> hwc->last_period = sample_period;
>
> local64_set(&hwc->period_left, sample_period);
> @@ -14279,6 +14326,8 @@ void __init perf_event_init(void)
> {
> int ret;
>
> + prandom_seed_state(&perf_rand, get_random_u64());
> +
> idr_init(&pmu_idr);
>
> perf_event_init_all_cpus();
>