[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <05159761-274e-4a6b-97d9-a1251d6cac7b@linux.intel.com>
Date: Wed, 11 Feb 2026 14:56:11 +0800
From: "Mi, Dapeng" <dapeng1.mi@...ux.intel.com>
To: Peter Zijlstra <peterz@...radead.org>
Cc: Ingo Molnar <mingo@...hat.com>, Arnaldo Carvalho de Melo
<acme@...nel.org>, Namhyung Kim <namhyung@...nel.org>,
Thomas Gleixner <tglx@...utronix.de>,
Dave Hansen <dave.hansen@...ux.intel.com>, Ian Rogers <irogers@...gle.com>,
Adrian Hunter <adrian.hunter@...el.com>, Jiri Olsa <jolsa@...nel.org>,
Alexander Shishkin <alexander.shishkin@...ux.intel.com>,
Andi Kleen <ak@...ux.intel.com>, Eranian Stephane <eranian@...gle.com>,
Mark Rutland <mark.rutland@....com>, broonie@...nel.org,
Ravi Bangoria <ravi.bangoria@....com>, linux-kernel@...r.kernel.org,
linux-perf-users@...r.kernel.org, Zide Chen <zide.chen@...el.com>,
Falcon Thomas <thomas.falcon@...el.com>, Dapeng Mi <dapeng1.mi@...el.com>,
Xudong Hao <xudong.hao@...el.com>, Kan Liang <kan.liang@...ux.intel.com>
Subject: Re: [Patch v6 12/22] perf: Add sampling support for SIMD registers
On 2/11/2026 4:04 AM, Peter Zijlstra wrote:
> On Mon, Feb 09, 2026 at 03:20:37PM +0800, Dapeng Mi wrote:
>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>> index d487c55a4f3e..5742126f50cc 100644
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
>> @@ -7761,6 +7761,50 @@ perf_output_sample_regs(struct perf_output_handle *handle,
>> }
>> }
>>
>> +static void
>> +perf_output_sample_simd_regs(struct perf_output_handle *handle,
>> + struct perf_event *event,
>> + struct pt_regs *regs,
>> + u64 mask, u32 pred_mask)
>> +{
>> + u16 pred_qwords = event->attr.sample_simd_pred_reg_qwords;
>> + u16 vec_qwords = event->attr.sample_simd_vec_reg_qwords;
>> + u16 nr_vectors;
>> + u16 nr_pred;
>> + int bit;
>> + u64 val;
>> + u16 i;
>> +
>> + nr_vectors = hweight64(mask);
>> + nr_pred = hweight32(pred_mask);
>> +
>> + perf_output_put(handle, nr_vectors);
>> + perf_output_put(handle, vec_qwords);
>> + perf_output_put(handle, nr_pred);
>> + perf_output_put(handle, pred_qwords);
>> +
>> + if (nr_vectors) {
>> + for (bit = 0; bit < sizeof(mask) * BITS_PER_BYTE; bit++) {
>> + if (!(BIT_ULL(bit) & mask))
>> + continue;
>> + for (i = 0; i < vec_qwords; i++) {
>> + val = perf_simd_reg_value(regs, bit, i, false);
>> + perf_output_put(handle, val);
>> + }
>> + }
>> + }
>> + if (nr_pred) {
>> + for (bit = 0; bit < sizeof(pred_mask) * BITS_PER_BYTE; bit++) {
>> + if (!(BIT(bit) & pred_mask))
>> + continue;
>> + for (i = 0; i < pred_qwords; i++) {
>> + val = perf_simd_reg_value(regs, bit, i, true);
>> + perf_output_put(handle, val);
>> + }
>> + }
>> + }
>> +}
> Yeah, that works, but it does make me sad. The existing
> perf_output_sample_regs() has yet another solution.
>
> Wondering how hard it could possibly be to write a for_each_set_bit()
> variant that works on a given word (instead of an array), I did the
> below.
>
> It works (at least, the assembly looks about right); but I'm not sure
> it's all I had hoped for either :-(
Pretty code! It looks like I still haven't gotten used to writing this kind
of macro.
The code looks good to me; I will test it later. Thanks.
>
> ---
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -7754,18 +7754,27 @@ void __weak perf_get_regs_user(struct pe
> regs_user->abi = perf_reg_abi(current);
> }
>
> +/* Until GCC-14+/clang-19+, which have __builtin_ctzg() */
> +#define __ctzg(val, def) \
> + (val) ? _Generic((val), \
> + unsigned int: __builtin_ctz(val), \
> + unsigned long: __builtin_ctzl(val), \
> + unsigned long long: __builtin_ctzll(val)) : (def)
> +
> +#define __next_bit(val, bit) \
> + ({ auto __v = (val); \
> + __v &= GENMASK(sizeof(__v) * BITS_PER_BYTE - 1, bit); \
> + __ctzg(__v, -1); })
> +
> +#define word_for_each_set_bit(bit, val) \
> + for (int bit = 0; bit = __next_bit(val, bit), bit >= 0; bit++)
> +
> static void
> perf_output_sample_regs(struct perf_output_handle *handle,
> struct pt_regs *regs, u64 mask)
> {
> - int bit;
> - DECLARE_BITMAP(_mask, 64);
> -
> - bitmap_from_u64(_mask, mask);
> - for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) {
> - u64 val;
> -
> - val = perf_reg_value(regs, bit);
> + word_for_each_set_bit(bit, mask) {
> + u64 val = perf_reg_value(regs, bit);
> perf_output_put(handle, val);
> }
> }
> @@ -7778,14 +7787,8 @@ perf_output_sample_simd_regs(struct perf
> {
> u16 pred_qwords = event->attr.sample_simd_pred_reg_qwords;
> u16 vec_qwords = event->attr.sample_simd_vec_reg_qwords;
> - u16 nr_vectors;
> - u16 nr_pred;
> - int bit;
> - u64 val;
> - u16 i;
> -
> - nr_vectors = hweight64(mask);
> - nr_pred = hweight32(pred_mask);
> + u16 nr_vectors = hweight64(mask);
> + u16 nr_pred = hweight32(pred_mask);
>
> perf_output_put(handle, nr_vectors);
> perf_output_put(handle, vec_qwords);
> @@ -7793,21 +7796,17 @@ perf_output_sample_simd_regs(struct perf
> perf_output_put(handle, pred_qwords);
>
> if (nr_vectors) {
> - for (bit = 0; bit < sizeof(mask) * BITS_PER_BYTE; bit++) {
> - if (!(BIT_ULL(bit) & mask))
> - continue;
> - for (i = 0; i < vec_qwords; i++) {
> - val = perf_simd_reg_value(regs, bit, i, false);
> + word_for_each_set_bit(bit, mask) {
> + for (int i = 0; i < vec_qwords; i++) {
> + u64 val = perf_simd_reg_value(regs, bit, i, false);
> perf_output_put(handle, val);
> }
> }
> }
> if (nr_pred) {
> - for (bit = 0; bit < sizeof(pred_mask) * BITS_PER_BYTE; bit++) {
> - if (!(BIT(bit) & pred_mask))
> - continue;
> - for (i = 0; i < pred_qwords; i++) {
> - val = perf_simd_reg_value(regs, bit, i, true);
> + word_for_each_set_bit(bit, pred_mask) {
> + for (int i = 0; i < pred_qwords; i++) {
> + u64 val = perf_simd_reg_value(regs, bit, i, true);
> perf_output_put(handle, val);
> }
> }
Powered by blists - more mailing lists