linux-kernel - Re: [PATCH 12/21] perf/x86/intel/lbr: Support Architectural LBR

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <e026c2c7-0187-e2c8-8a1a-58b4b1a66134@linux.intel.com>
Date:   Fri, 19 Jun 2020 15:40:54 -0400
From:   "Liang, Kan" <kan.liang@...ux.intel.com>
To:     Peter Zijlstra <peterz@...radead.org>
Cc:     mingo@...hat.com, acme@...nel.org, tglx@...utronix.de,
        bp@...en8.de, x86@...nel.org, linux-kernel@...r.kernel.org,
        mark.rutland@....com, alexander.shishkin@...ux.intel.com,
        jolsa@...hat.com, namhyung@...nel.org, dave.hansen@...el.com,
        yu-cheng.yu@...el.com, bigeasy@...utronix.de, gorcunov@...il.com,
        hpa@...or.com, alexey.budankov@...ux.intel.com, eranian@...gle.com,
        ak@...ux.intel.com, like.xu@...ux.intel.com,
        yao.jin@...ux.intel.com
Subject: Re: [PATCH 12/21] perf/x86/intel/lbr: Support Architectural LBR



On 6/19/2020 3:08 PM, Peter Zijlstra wrote:
> On Fri, Jun 19, 2020 at 07:04:00AM -0700, kan.liang@...ux.intel.com wrote:
> 
>> +static void intel_pmu_arch_lbr_enable(bool pmi)
>> +{
>> +	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
>> +	u64 debugctl, lbr_ctl = 0, orig_debugctl;
>> +
>> +	if (pmi)
>> +		return;
>> +
>> +	if (cpuc->lbr_ctl)
>> +		lbr_ctl = cpuc->lbr_ctl->config & x86_pmu.lbr_ctl_mask;
>> +	/*
>> +	 * LBR callstack does not work well with FREEZE_LBRS_ON_PMI.
>> +	 * If FREEZE_LBRS_ON_PMI is set, PMI near call/return instructions
>> +	 * may be missed, that can lead to confusing results.
>> +	 */
>> +	rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
>> +	orig_debugctl = debugctl;
>> +	if (lbr_ctl & ARCH_LBR_CALL_STACK)
>> +		debugctl &= ~DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
>> +	else
>> +		debugctl |= DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
>> +	if (orig_debugctl != debugctl)
>> +		wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
>> +
>> +	wrmsrl(MSR_ARCH_LBR_CTL, lbr_ctl | ARCH_LBR_CTL_LBREN);
>> +}
> 
> This is nearly a duplicate of the old one, surely we can do better?

It's similar, but we have to deal with different MSRs and bits.

> 
>> +static void intel_pmu_arch_lbr_restore(void *ctx)
>> +{
>> +	struct x86_perf_task_context_arch_lbr *task_ctx = ctx;
>> +	struct x86_perf_arch_lbr_entry *entries = task_ctx->entries;
>> +	int i;
>> +
>> +	/* Fast reset the LBRs before restore if the call stack is not full. */
>> +	if (!entries[x86_pmu.lbr_nr - 1].lbr_from)
>> +		intel_pmu_arch_lbr_reset();
>> +
>> +	for (i = 0; i < x86_pmu.lbr_nr; i++) {
>> +		if (!entries[i].lbr_from)
>> +			break;
>> +		wrlbr_from(i, entries[i].lbr_from);
>> +		wrlbr_to(i, entries[i].lbr_to);
>> +		wrmsrl(MSR_ARCH_LBR_INFO_0 + i, entries[i].lbr_info);
>> +	}
>> +}
> 
> This too looks very much like the old one.

The difference is the reset part.
For the previous platforms, we restore the saved LBRs first, then reset 
the unsaved LBR MSRs one by one.
Now, for Arch LBR, we have a fast reset method. We reset all LBRs (if we 
know there are unsaved items) first, then restore the saved LBRs.
That would improve the performance for the application with short call 
stack.

> 
>> +static void intel_pmu_arch_lbr_save(void *ctx)
>> +{
>> +	struct x86_perf_task_context_arch_lbr *task_ctx = ctx;
>> +	struct x86_perf_arch_lbr_entry *entries = task_ctx->entries;
>> +	int i;
>> +
>> +	for (i = 0; i < x86_pmu.lbr_nr; i++) {
>> +		entries[i].lbr_from = rdlbr_from(i);
>> +		/* Only save valid branches. */
>> +		if (!entries[i].lbr_from)
>> +			break;
>> +		entries[i].lbr_to = rdlbr_to(i);
>> +		rdmsrl(MSR_ARCH_LBR_INFO_0 + i, entries[i].lbr_info);
>> +	}
>> +
>> +	/* LBR call stack is not full. Reset is required in restore. */
>> +	if (i < x86_pmu.lbr_nr)
>> +		entries[x86_pmu.lbr_nr - 1].lbr_from = 0;
>> +}
> 
> And again..
> 
>> +static void __intel_pmu_arch_lbr_read(struct cpu_hw_events *cpuc, int index,
>> +				      u64 from, u64 to, u64 info)
>> +{
>> +	u64 mis = 0, pred = 0, in_tx = 0, abort = 0, type = 0;
>> +	u32 br_type, to_plm;
>> +	u16 cycles = 0;
>> +
>> +	if (x86_pmu.arch_lbr_mispred) {
>> +		mis = !!(info & ARCH_LBR_INFO_MISPRED);
>> +		pred = !mis;
>> +	}
>> +	in_tx = !!(info & ARCH_LBR_INFO_IN_TSX);
>> +	abort = !!(info & ARCH_LBR_INFO_TSX_ABORT);
>> +	if (x86_pmu.arch_lbr_timed_lbr &&
>> +	    (info & ARCH_LBR_INFO_CYC_CNT_VALID))
>> +		cycles = (info & ARCH_LBR_INFO_CYC_CNT);
>> +
>> +	/*
>> +	 * Parse the branch type recorded in LBR_x_INFO MSR.
>> +	 * Doesn't support OTHER_BRANCH decoding for now.
>> +	 * OTHER_BRANCH branch type still rely on software decoding.
>> +	 */
>> +	if (x86_pmu.arch_lbr_br_type) {
>> +		br_type = (info & ARCH_LBR_INFO_BR_TYPE) >> ARCH_LBR_INFO_BR_TYPE_OFFSET;
>> +
>> +		if (br_type <= ARCH_LBR_BR_TYPE_KNOWN_MAX) {
>> +			to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER;
>> +			type = arch_lbr_br_type_map[br_type] | to_plm;
>> +		}
>> +	}
>> +
>> +	cpuc->lbr_entries[index].from		 = from;
>> +	cpuc->lbr_entries[index].to		 = to;
>> +	cpuc->lbr_entries[index].mispred	 = mis;
>> +	cpuc->lbr_entries[index].predicted	 = pred;
>> +	cpuc->lbr_entries[index].in_tx		 = in_tx;
>> +	cpuc->lbr_entries[index].abort		 = abort;
>> +	cpuc->lbr_entries[index].cycles		 = cycles;
>> +	cpuc->lbr_entries[index].type		 = type;
>> +	cpuc->lbr_entries[index].reserved	 = 0;
>> +}
>> +
>> +static void intel_pmu_arch_lbr_read(struct cpu_hw_events *cpuc)
>> +{
>> +	u64 from, to, info;
>> +	int i;
>> +
>> +	for (i = 0; i < x86_pmu.lbr_nr; i++) {
>> +		from = rdlbr_from(i);
>> +		to   = rdlbr_to(i);
>> +
>> +		/*
>> +		 * Read LBR entries until invalid entry (0s) is detected.
>> +		 */
>> +		if (!from)
>> +			break;
>> +
>> +		rdmsrl(MSR_ARCH_LBR_INFO_0 + i, info);
>> +
>> +		__intel_pmu_arch_lbr_read(cpuc, i, from, to, info);
>> +	}
>> +
>> +	cpuc->lbr_stack.nr = i;
>> +}
> 
>> +static void intel_pmu_store_pebs_arch_lbrs(struct pebs_lbr *pebs_lbr,
>> +					   struct cpu_hw_events *cpuc)
>> +{
>> +	struct pebs_lbr_entry *lbr;
>> +	int i;
>> +
>> +	for (i = 0; i < x86_pmu.lbr_nr; i++) {
>> +		lbr = &pebs_lbr->lbr[i];
>> +
>> +		/*
>> +		 * Read LBR entries until invalid entry (0s) is detected.
>> +		 */
>> +		if (!lbr->from)
>> +			break;
>> +
>> +		__intel_pmu_arch_lbr_read(cpuc, i, lbr->from,
>> +					  lbr->to, lbr->info);
>> +	}
>> +
>> +	cpuc->lbr_stack.nr = i;
>> +	intel_pmu_lbr_filter(cpuc);
>> +}
> 
> Unless I'm reading cross-eyed again, that too is very similar to what we
> already had.
> 

I will try to factor out the common codes for thses functions as many as 
possible.


Thanks,
Kan