Date:   Wed, 23 Nov 2016 04:44:40 -0500
From:   kan.liang@...el.com
To:     peterz@...radead.org, mingo@...hat.com, acme@...nel.org,
        linux-kernel@...r.kernel.org
Cc:     alexander.shishkin@...ux.intel.com, tglx@...utronix.de,
        namhyung@...nel.org, jolsa@...nel.org, adrian.hunter@...el.com,
        wangnan0@...wei.com, mark.rutland@....com, andi@...stfloor.org,
        Kan Liang <kan.liang@...el.com>
Subject: [PATCH 02/14] perf/x86: output NMI overhead

From: Kan Liang <kan.liang@...el.com>

The NMI handler is one of the most significant sources of perf overhead.

There are many NMIs during sampling, and it would be too expensive to log
each one individually. Instead, the accumulated time and NMI count are
output when the event is about to be disabled or the task is scheduled out.
The newly introduced flag PERF_EF_LOG indicates that the overhead log
should be output.

Signed-off-by: Kan Liang <kan.liang@...el.com>
---
 arch/x86/events/core.c          | 19 ++++++++++++++-
 arch/x86/events/perf_event.h    |  2 ++
 include/linux/perf_event.h      |  1 +
 include/uapi/linux/perf_event.h |  2 ++
 kernel/events/core.c            | 54 ++++++++++++++++++++++-------------------
 5 files changed, 52 insertions(+), 26 deletions(-)
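
For reference, the accumulate-then-flush scheme described in the changelog
can be modeled in plain user-space C. This is only a sketch under stated
assumptions: overhead_entry, handle_nmi() and flush_overhead() are
illustrative names standing in for cpu_hw_events::nmi_overhead, the
per-NMI accumulation in perf_calculate_nmi_overhead(), and the
perf_log_overhead() call taken on the PERF_EF_LOG path; it is not the
kernel implementation itself.

/*
 * Minimal user-space model of the accumulate-then-flush pattern:
 * each NMI only bumps two counters, and a single record is emitted
 * when the event is disabled or the task schedules out.
 */
#include <stdint.h>
#include <stdio.h>

struct overhead_entry {
	uint64_t nr;	/* number of NMIs observed since the last flush */
	uint64_t time;	/* accumulated handler time, in ns */
};

static struct overhead_entry nmi_overhead;	/* per-CPU in the kernel */

/* Called once per NMI: only accumulate, never log from here. */
static void handle_nmi(uint64_t start_ns, uint64_t finish_ns)
{
	nmi_overhead.nr++;
	nmi_overhead.time += finish_ns - start_ns;
}

/* Called on the "disable / schedule out" path (PERF_EF_LOG in the patch):
 * emit one record covering the whole batch, then reset the counters. */
static void flush_overhead(void)
{
	if (!nmi_overhead.nr)
		return;
	printf("NMI overhead: %llu NMIs, %llu ns total\n",
	       (unsigned long long)nmi_overhead.nr,
	       (unsigned long long)nmi_overhead.time);
	nmi_overhead.nr = 0;
	nmi_overhead.time = 0;
}

int main(void)
{
	handle_nmi(100, 350);	/* three sampled NMIs ...           */
	handle_nmi(400, 600);
	handle_nmi(700, 950);
	flush_overhead();	/* ... logged as a single record    */
	return 0;
}

Batching this way keeps the per-NMI cost to two additions; the expensive
record emission happens only once per disable or context switch.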

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index d31735f..6c3b0ef 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1397,6 +1397,11 @@ static void x86_pmu_del(struct perf_event *event, int flags)
 
 	perf_event_update_userpage(event);
 
+	if ((flags & PERF_EF_LOG) && cpuc->nmi_overhead.nr) {
+		cpuc->nmi_overhead.cpu = smp_processor_id();
+		perf_log_overhead(event, PERF_NMI_OVERHEAD, &cpuc->nmi_overhead);
+	}
+
 do_del:
 	if (x86_pmu.del) {
 		/*
@@ -1475,11 +1480,21 @@ void perf_events_lapic_init(void)
 	apic_write(APIC_LVTPC, APIC_DM_NMI);
 }
 
+static void
+perf_calculate_nmi_overhead(u64 time)
+{
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+	cpuc->nmi_overhead.nr++;
+	cpuc->nmi_overhead.time += time;
+}
+
 static int
 perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
 {
 	u64 start_clock;
 	u64 finish_clock;
+	u64 clock;
 	int ret;
 
 	/*
@@ -1492,8 +1507,10 @@ perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
 	start_clock = sched_clock();
 	ret = x86_pmu.handle_irq(regs);
 	finish_clock = sched_clock();
+	clock = finish_clock - start_clock;
 
-	perf_sample_event_took(finish_clock - start_clock);
+	perf_calculate_nmi_overhead(clock);
+	perf_sample_event_took(clock);
 
 	return ret;
 }
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 5874d8d..c6b25ac 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -230,6 +230,8 @@ struct cpu_hw_events {
 	struct intel_excl_cntrs		*excl_cntrs;
 	int excl_thread_id; /* 0 or 1 */
 
+	struct perf_overhead_entry     nmi_overhead;
+
 	/*
 	 * AMD specific bits
 	 */
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 3a530a7..632647f 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -313,6 +313,7 @@ struct pmu {
 #define PERF_EF_START	0x01		/* start the counter when adding    */
 #define PERF_EF_RELOAD	0x02		/* reload the counter when starting */
 #define PERF_EF_UPDATE	0x04		/* update the counter when stopping */
+#define PERF_EF_LOG	0x08		/* log accumulated overhead when deleting */
 
 	/*
 	 * Adds/Removes a counter to/from the PMU, can be done inside a
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index ae571be..071323d 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -992,6 +992,8 @@ struct perf_branch_entry {
 };
 
 enum perf_record_overhead_type {
+	PERF_NMI_OVERHEAD	= 0,
+
 	PERF_OVERHEAD_MAX,
 };
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 590c399..d82e6ca 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1785,7 +1785,8 @@ event_filter_match(struct perf_event *event)
 static void
 event_sched_out(struct perf_event *event,
 		  struct perf_cpu_context *cpuctx,
-		  struct perf_event_context *ctx)
+		  struct perf_event_context *ctx,
+		  bool log_overhead)
 {
 	u64 tstamp = perf_event_time(event);
 	u64 delta;
@@ -1810,9 +1811,8 @@ event_sched_out(struct perf_event *event,
 		return;
 
 	perf_pmu_disable(event->pmu);
-
 	event->tstamp_stopped = tstamp;
-	event->pmu->del(event, 0);
+	event->pmu->del(event, log_overhead ? PERF_EF_LOG : 0);
 	event->oncpu = -1;
 	event->state = PERF_EVENT_STATE_INACTIVE;
 	if (event->pending_disable) {
@@ -1835,20 +1835,21 @@ event_sched_out(struct perf_event *event,
 static void
 group_sched_out(struct perf_event *group_event,
 		struct perf_cpu_context *cpuctx,
-		struct perf_event_context *ctx)
+		struct perf_event_context *ctx,
+		bool log_overhead)
 {
 	struct perf_event *event;
 	int state = group_event->state;
 
 	perf_pmu_disable(ctx->pmu);
 
-	event_sched_out(group_event, cpuctx, ctx);
+	event_sched_out(group_event, cpuctx, ctx, log_overhead);
 
 	/*
 	 * Schedule out siblings (if any):
 	 */
 	list_for_each_entry(event, &group_event->sibling_list, group_entry)
-		event_sched_out(event, cpuctx, ctx);
+		event_sched_out(event, cpuctx, ctx, log_overhead);
 
 	perf_pmu_enable(ctx->pmu);
 
@@ -1872,7 +1873,7 @@ __perf_remove_from_context(struct perf_event *event,
 {
 	unsigned long flags = (unsigned long)info;
 
-	event_sched_out(event, cpuctx, ctx);
+	event_sched_out(event, cpuctx, ctx, false);
 	if (flags & DETACH_GROUP)
 		perf_group_detach(event);
 	list_del_event(event, ctx);
@@ -1918,9 +1919,9 @@ static void __perf_event_disable(struct perf_event *event,
 	update_cgrp_time_from_event(event);
 	update_group_times(event);
 	if (event == event->group_leader)
-		group_sched_out(event, cpuctx, ctx);
+		group_sched_out(event, cpuctx, ctx, true);
 	else
-		event_sched_out(event, cpuctx, ctx);
+		event_sched_out(event, cpuctx, ctx, true);
 	event->state = PERF_EVENT_STATE_OFF;
 }
 
@@ -2141,10 +2142,10 @@ group_sched_in(struct perf_event *group_event,
 			event->tstamp_running += now - event->tstamp_stopped;
 			event->tstamp_stopped = now;
 		} else {
-			event_sched_out(event, cpuctx, ctx);
+			event_sched_out(event, cpuctx, ctx, false);
 		}
 	}
-	event_sched_out(group_event, cpuctx, ctx);
+	event_sched_out(group_event, cpuctx, ctx, false);
 
 	pmu->cancel_txn(pmu);
 
@@ -2198,7 +2199,8 @@ static void add_event_to_ctx(struct perf_event *event,
 
 static void ctx_sched_out(struct perf_event_context *ctx,
 			  struct perf_cpu_context *cpuctx,
-			  enum event_type_t event_type);
+			  enum event_type_t event_type,
+			  bool log_overhead);
 static void
 ctx_sched_in(struct perf_event_context *ctx,
 	     struct perf_cpu_context *cpuctx,
@@ -2206,7 +2208,8 @@ ctx_sched_in(struct perf_event_context *ctx,
 	     struct task_struct *task);
 
 static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
-			       struct perf_event_context *ctx)
+			       struct perf_event_context *ctx,
+			       bool log_overhead)
 {
 	if (!cpuctx->task_ctx)
 		return;
@@ -2214,7 +2217,7 @@ static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
 	if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
 		return;
 
-	ctx_sched_out(ctx, cpuctx, EVENT_ALL);
+	ctx_sched_out(ctx, cpuctx, EVENT_ALL, log_overhead);
 }
 
 static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
@@ -2234,7 +2237,7 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
 {
 	perf_pmu_disable(cpuctx->ctx.pmu);
 	if (task_ctx)
-		task_ctx_sched_out(cpuctx, task_ctx);
+		task_ctx_sched_out(cpuctx, task_ctx, false);
 	cpu_ctx_sched_out(cpuctx, EVENT_ALL);
 	perf_event_sched_in(cpuctx, task_ctx, current);
 	perf_pmu_enable(cpuctx->ctx.pmu);
@@ -2281,7 +2284,7 @@ static int  __perf_install_in_context(void *info)
 	}
 
 	if (activate) {
-		ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+		ctx_sched_out(ctx, cpuctx, EVENT_TIME, false);
 		add_event_to_ctx(event, ctx);
 		ctx_resched(cpuctx, task_ctx);
 	} else {
@@ -2396,7 +2399,7 @@ static void __perf_event_enable(struct perf_event *event,
 		return;
 
 	if (ctx->is_active)
-		ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+		ctx_sched_out(ctx, cpuctx, EVENT_TIME, false);
 
 	__perf_event_mark_enabled(event);
 
@@ -2611,7 +2614,8 @@ EXPORT_SYMBOL_GPL(perf_event_refresh);
 
 static void ctx_sched_out(struct perf_event_context *ctx,
 			  struct perf_cpu_context *cpuctx,
-			  enum event_type_t event_type)
+			  enum event_type_t event_type,
+			  bool log_overhead)
 {
 	int is_active = ctx->is_active;
 	struct perf_event *event;
@@ -2662,12 +2666,12 @@ static void ctx_sched_out(struct perf_event_context *ctx,
 	perf_pmu_disable(ctx->pmu);
 	if (is_active & EVENT_PINNED) {
 		list_for_each_entry(event, &ctx->pinned_groups, group_entry)
-			group_sched_out(event, cpuctx, ctx);
+			group_sched_out(event, cpuctx, ctx, log_overhead);
 	}
 
 	if (is_active & EVENT_FLEXIBLE) {
 		list_for_each_entry(event, &ctx->flexible_groups, group_entry)
-			group_sched_out(event, cpuctx, ctx);
+			group_sched_out(event, cpuctx, ctx, log_overhead);
 	}
 	perf_pmu_enable(ctx->pmu);
 }
@@ -2850,7 +2854,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 
 	if (do_switch) {
 		raw_spin_lock(&ctx->lock);
-		task_ctx_sched_out(cpuctx, ctx);
+		task_ctx_sched_out(cpuctx, ctx, false);
 		raw_spin_unlock(&ctx->lock);
 	}
 }
@@ -2958,7 +2962,7 @@ void __perf_event_task_sched_out(struct task_struct *task,
 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
 			      enum event_type_t event_type)
 {
-	ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
+	ctx_sched_out(&cpuctx->ctx, cpuctx, event_type, false);
 }
 
 static void
@@ -3350,7 +3354,7 @@ static int perf_rotate_context(struct perf_cpu_context *cpuctx)
 
 	cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
 	if (ctx)
-		ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
+		ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE, false);
 
 	rotate_ctx(&cpuctx->ctx);
 	if (ctx)
@@ -3415,7 +3419,7 @@ static void perf_event_enable_on_exec(int ctxn)
 
 	cpuctx = __get_cpu_context(ctx);
 	perf_ctx_lock(cpuctx, ctx);
-	ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+	ctx_sched_out(ctx, cpuctx, EVENT_TIME, false);
 	list_for_each_entry(event, &ctx->event_list, event_entry)
 		enabled |= event_enable_on_exec(event, ctx);
 
@@ -10190,7 +10194,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
 	 * in.
 	 */
 	raw_spin_lock_irq(&child_ctx->lock);
-	task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx);
+	task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx, true);
 
 	/*
 	 * Now that the context is inactive, destroy the task <-> ctx relation
-- 
2.5.5
