Message-ID: <96cb859f-8265-4f5a-99bb-44cfdcd25b9e@linux.intel.com>
Date: Tue, 17 Dec 2024 12:45:56 -0500
From: "Liang, Kan" <kan.liang@...ux.intel.com>
To: Peter Zijlstra <peterz@...radead.org>
Cc: mingo@...hat.com, acme@...nel.org, namhyung@...nel.org,
irogers@...gle.com, linux-kernel@...r.kernel.org,
linux-perf-users@...r.kernel.org, ak@...ux.intel.com, eranian@...gle.com
Subject: Re: [PATCH V5 4/4] perf/x86/intel: Support PEBS counters snapshotting
On 2024-12-17 8:37 a.m., Peter Zijlstra wrote:
> On Mon, Dec 16, 2024 at 12:45:05PM -0800, kan.liang@...ux.intel.com wrote:
>
>> @@ -2049,6 +2102,40 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
>> }
>> }
>>
>> + if (format_group & (PEBS_DATACFG_CNTR | PEBS_DATACFG_METRICS)) {
>> + struct pebs_cntr_header *cntr = next_record;
>> + int bit;
>> +
>> + next_record += sizeof(struct pebs_cntr_header);
>> +
>> + for_each_set_bit(bit, (unsigned long *)&cntr->cntr, INTEL_PMC_MAX_GENERIC) {
>> + x86_perf_event_update(cpuc->events[bit], (u64 *)next_record);
>> + next_record += sizeof(u64);
>> + }
>
> I still don't much like any of this -- the next_record value might be in
> the past relative to what is already in the event.
>
I forgot to handle the read case. With the below patch, the next_record
value should not be stale, unless there is a HW bug. No matter whether
it's a read, a PMI, or a context switch, perf always stops the PMU,
drains the PEBS buffer, and takes the value from the PEBS record.

There are two places where perf reads the counter directly and updates
event->count: one is x86_pmu_stop(), the other is handling a non-PEBS
event's PMI. In both cases the PMU is stopped and the PEBS buffer has
already been drained, so the next "next_record" must be a newer value.
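
To make the ordering explicit, here is a minimal sketch (illustrative
only; the helper name is hypothetical, the calls are the ones used in
the patch below):

/*
 * Illustrative sketch of the invariant described above: before any
 * path consumes event->count for a PEBS-counter event, the PMU is
 * disabled and the PEBS buffer is drained, so event->count has
 * already been folded up from the newest PEBS record.
 */
static void pebs_counter_sync(struct perf_event *event)
{
	perf_pmu_disable(event->pmu);	/* no PMI, no new PEBS records */
	intel_pmu_drain_pebs_buffer();	/* fold records into event->count */
	perf_pmu_enable(event->pmu);
}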
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 32b28d674641..13ab7fca72dd 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2797,7 +2797,8 @@ static void intel_pmu_read_topdown_event(struct perf_event *event)
 
 static void intel_pmu_read_event(struct perf_event *event)
 {
-	if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)
-		intel_pmu_auto_reload_read(event);
+	if ((event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD) ||
+	    is_pebs_counter_event(event))
+		intel_pmu_pebs_read(event);
 	else if (is_topdown_count(event))
 		intel_pmu_read_topdown_event(event);
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index e06ac9a3cdf8..5e11b38427c7 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -2181,10 +2181,8 @@ get_next_pebs_record_by_bit(void *base, void *top, int bit)
 	return NULL;
 }
 
-void intel_pmu_auto_reload_read(struct perf_event *event)
+void intel_pmu_pebs_read(struct perf_event *event)
 {
-	WARN_ON(!(event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD));
-
 	perf_pmu_disable(event->pmu);
 	intel_pmu_drain_pebs_buffer();
 	perf_pmu_enable(event->pmu);
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index e88dd0fcc4d4..22b0c3e565a6 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -1650,7 +1650,7 @@ void intel_pmu_pebs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sche
 
 void intel_pmu_pebs_update_cfg(struct cpu_hw_events *cpuc, int n, int *assign);
 
-void intel_pmu_auto_reload_read(struct perf_event *event);
+void intel_pmu_pebs_read(struct perf_event *event);
 
 void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr);
> Why can't you use something like the below -- that gives you a count
> value matching the pmc value you put in, as long as it is 'near' the
> current value.
>
> ---
> diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
> index 8f218ac0d445..3cf8b4f2b2c1 100644
> --- a/arch/x86/events/core.c
> +++ b/arch/x86/events/core.c
> @@ -154,6 +154,26 @@ u64 x86_perf_event_update(struct perf_event *event)
> return new_raw_count;
> }
>
> +u64 x86_perf_event_pmc_to_count(struct perf_event *event, u64 pmc)
> +{
> + struct hw_perf_event *hwc = &event->hw;
> + int shift = 64 - x86_pmu.cntval_bits;
> + u64 prev_pmc, prev_count;
> + u64 delta;
> +
> + do {
> + prev_pmc = local64_read(&hwc->prev_count);
> + barrier();
> + prev_count = local64_read(&event->count);
> + barrier();
> + } while (prev_pmc != local64_read(&hwc->prev_count));
Is the "while()" to handle PMI? But there should be no PMI, since the
PMU has been disabled when draining the PEBS buffer.
I'm thinking of introducing a dedicated function to update the count
from the pmc value. It will only be used in drain_pebs(), so we can
avoid extending x86_perf_event_update() in patch 2. (Will do more tests.)
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index e06ac9a3cdf8..7f0b850f7277 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -1969,6 +1969,23 @@ static void adaptive_pebs_save_regs(struct pt_regs *regs,
 
 #define PEBS_LATENCY_MASK			0xffff
 
+void intel_perf_event_pmc_to_count(struct perf_event *event, u64 pmc)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	int shift = 64 - x86_pmu.cntval_bits;
+	u64 prev_pmc;
+	u64 delta;
+
+	prev_pmc = local64_read(&hwc->prev_count);
+
+	delta = (pmc << shift) - (prev_pmc << shift);
+	delta >>= shift;
+
+	local64_add(delta, &event->count);
+	local64_sub(delta, &hwc->period_left);
+	local64_set(&hwc->prev_count, pmc);
+}
+
 /*
  * With adaptive PEBS the layout depends on what fields are configured.
  */
@@ -2109,7 +2126,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
 
 		next_record += sizeof(struct pebs_cntr_header);
 
 		for_each_set_bit(bit, (unsigned long *)&cntr->cntr, INTEL_PMC_MAX_GENERIC) {
-			x86_perf_event_update(cpuc->events[bit], (u64 *)next_record);
+			intel_perf_event_pmc_to_count(cpuc->events[bit], *(u64 *)next_record);
 			next_record += sizeof(u64);
 		}
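
As a side note on the shift arithmetic in intel_perf_event_pmc_to_count()
above, here is a small self-contained illustration (plain C, not kernel
code; assumes a 48-bit counter width) of how shifting both values up by
64 - cntval_bits and the difference back down handles a counter that
wrapped within its architectural width:

#include <stdint.h>
#include <stdio.h>

/* Illustration only: mimics the delta computation for an assumed
 * 48-bit wide counter. */
static uint64_t pmc_delta(uint64_t prev_pmc, uint64_t pmc, int cntval_bits)
{
	int shift = 64 - cntval_bits;
	uint64_t delta;

	/* The left shifts discard bits above the counter width, the
	 * subtraction wraps modulo 2^cntval_bits, and the right shift
	 * scales the result back down. */
	delta = (pmc << shift) - (prev_pmc << shift);
	delta >>= shift;

	return delta;
}

int main(void)
{
	uint64_t prev = 0xffffffffff00ULL;	/* 2^48 - 256 */
	uint64_t now  = 0x100ULL;		/* 256, after the counter wrapped */

	/* Prints "delta = 512": 256 up to the wrap plus 256 after it. */
	printf("delta = %llu\n", (unsigned long long)pmc_delta(prev, now, 48));

	return 0;
}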
Thanks,
Kan
> +
> + delta = (pmc << shift) - (prev_pmc << shift);
> + delta >>= shift;
> +
> + return prev_count + delta;
> +}
> +
> /*
> * Find and validate any extra registers to set up.
> */
>