linux-kernel - Re: [PATCH v7 2/2] perf/core: add mux switch to skip to the current CPU's events list on mux interrupt

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <e7fa57c2-fc86-a21c-9380-5ab1ecc53fe7@linux.intel.com>
Date:   Wed, 23 Aug 2017 21:12:34 +0300
From:   Alexey Budankov <alexey.budankov@...ux.intel.com>
To:     Alexander Shishkin <alexander.shishkin@...ux.intel.com>,
        Peter Zijlstra <peterz@...radead.org>,
        Ingo Molnar <mingo@...hat.com>,
        Arnaldo Carvalho de Melo <acme@...nel.org>
Cc:     Andi Kleen <ak@...ux.intel.com>, Kan Liang <kan.liang@...el.com>,
        Dmitri Prokhorov <Dmitry.Prohorov@...el.com>,
        Valery Cherepennikov <valery.cherepennikov@...el.com>,
        Mark Rutland <mark.rutland@....com>,
        Stephane Eranian <eranian@...gle.com>,
        David Carrillo-Cisneros <davidcc@...gle.com>,
        linux-kernel <linux-kernel@...r.kernel.org>
Subject: Re: [PATCH v7 2/2] perf/core: add mux switch to skip to the current
 CPU's events list on mux interrupt

On 23.08.2017 14:54, Alexander Shishkin wrote:
> Alexey Budankov <alexey.budankov@...ux.intel.com> writes:
> 
>> This patch implements mux switch that triggers skipping to the 
>> current CPU's events list at mulitplexing hrtimer interrupt 
>> handler as well as adoption of the switch in the existing 
>> implementation.
>>
>> perf_event_groups_iterate_cpu() API is introduced to implement 
>> iteration thru the certain CPU groups list skipping groups 
> 
> "through"
> 
>> allocated for the other CPUs.
>>
>> Signed-off-by: Alexey Budankov <alexey.budankov@...ux.intel.com>
>> ---
>>  kernel/events/core.c | 193 ++++++++++++++++++++++++++++++++++++---------------
>>  1 file changed, 137 insertions(+), 56 deletions(-)
>>
>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>> index 08ccfb2..aeb0f81 100644
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
>> @@ -556,11 +556,11 @@ void perf_sample_event_took(u64 sample_len_ns)
>>  static atomic64_t perf_event_id;
>>  
>>  static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
>> -			      enum event_type_t event_type);
>> +			      enum event_type_t event_type, int mux);
>>  
>>  static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
>>  			     enum event_type_t event_type,
>> -			     struct task_struct *task);
>> +			     struct task_struct *task, int mux);
>>  
>>  static void update_context_time(struct perf_event_context *ctx);
>>  static u64 perf_event_time(struct perf_event *event);
>> @@ -702,6 +702,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
>>  	struct perf_cpu_context *cpuctx;
>>  	struct list_head *list;
>>  	unsigned long flags;
>> +	int mux = 0;
>>  
>>  	/*
>>  	 * Disable interrupts and preemption to avoid this CPU's
>> @@ -717,7 +718,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
>>  		perf_pmu_disable(cpuctx->ctx.pmu);
>>  
>>  		if (mode & PERF_CGROUP_SWOUT) {
>> -			cpu_ctx_sched_out(cpuctx, EVENT_ALL);
>> +			cpu_ctx_sched_out(cpuctx, EVENT_ALL, mux);
>>  			/*
>>  			 * must not be done before ctxswout due
>>  			 * to event_filter_match() in event_sched_out()
>> @@ -736,7 +737,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
>>  			 */
>>  			cpuctx->cgrp = perf_cgroup_from_task(task,
>>  							     &cpuctx->ctx);
>> -			cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
>> +			cpu_ctx_sched_in(cpuctx, EVENT_ALL, task, mux);
> 
> 'mux' is always zero in this function, isn't it?
> 
>>  		}
>>  		perf_pmu_enable(cpuctx->ctx.pmu);
>>  		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
>> @@ -1613,8 +1614,16 @@ perf_event_groups_rotate(struct rb_root *groups, int cpu)
>>   */
>>  #define perf_event_groups_for_each(event, iter, tree, node, list, link) \
>>  	   for (iter = rb_first(tree); iter; iter = rb_next(iter))	\
>> -		list_for_each_entry(event, &(rb_entry(iter,		\
>> -			typeof(*event), node)->list), link)
>> +		list_for_each_entry(event, &(rb_entry(iter, 		\
>> +				typeof(*event), node)->list), link)
> 
> Is this an indentation change? What is it doing here?
> 
>> +
>> +/*
>> + * Iterate event groups related to specific cpu.
>> + */
>> +#define perf_event_groups_for_each_cpu(event, cpu, tree, list, link)	\
>> +		list = perf_event_groups_get_list(tree, cpu);		\
>> +		if (list)						\
>> +			list_for_each_entry(event, list, link)
> 
> ..or not, if there's no list.
> 
>>  
>>  /*
>>   * Add a event from the lists for its context.
>> @@ -2397,36 +2406,38 @@ static void add_event_to_ctx(struct perf_event *event,
>>  
>>  static void ctx_sched_out(struct perf_event_context *ctx,
>>  			  struct perf_cpu_context *cpuctx,
>> -			  enum event_type_t event_type);
>> +			  enum event_type_t event_type, int mux);
>>  static void
>>  ctx_sched_in(struct perf_event_context *ctx,
>>  	     struct perf_cpu_context *cpuctx,
>>  	     enum event_type_t event_type,
>> -	     struct task_struct *task);
>> +	     struct task_struct *task, int mux);
>>  
>>  static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
>>  			       struct perf_event_context *ctx,
>>  			       enum event_type_t event_type)
>>  {
>> +	int mux = 0;
>> +
>>  	if (!cpuctx->task_ctx)
>>  		return;
>>  
>>  	if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
>>  		return;
>>  
>> -	ctx_sched_out(ctx, cpuctx, event_type);
>> +	ctx_sched_out(ctx, cpuctx, event_type, mux);
> 
> Just use 0.

Well, I intentionally made this ugly switch named over local variable 
to add clarity into the code so I simply wonder - why do you prefer 
the unnamed variant?

> 
>>  }
>>  
>>  static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
>>  				struct perf_event_context *ctx,
>> -				struct task_struct *task)
>> +				struct task_struct *task, int mux)
>>  {
>> -	cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
>> +	cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task, mux);
>>  	if (ctx)
>> -		ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
>> -	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
>> +		ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task, mux);
>> +	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task, mux);
>>  	if (ctx)
>> -		ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
>> +		ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task, mux);
>>  }
>>  
>>  /*
>> @@ -2450,6 +2461,7 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
>>  {
>>  	enum event_type_t ctx_event_type = event_type & EVENT_ALL;
>>  	bool cpu_event = !!(event_type & EVENT_CPU);
>> +	int mux = 0;
>>  
>>  	/*
>>  	 * If pinned groups are involved, flexible groups also need to be
>> @@ -2470,11 +2482,11 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
>>  	 *  - otherwise, do nothing more.
>>  	 */
>>  	if (cpu_event)
>> -		cpu_ctx_sched_out(cpuctx, ctx_event_type);
>> +		cpu_ctx_sched_out(cpuctx, ctx_event_type, mux);
>>  	else if (ctx_event_type & EVENT_PINNED)
>> -		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
>> +		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE, mux);
>>  
>> -	perf_event_sched_in(cpuctx, task_ctx, current);
>> +	perf_event_sched_in(cpuctx, task_ctx, current, mux);
> 
> Also mux==0 in all cases in this function.
> 
>>  	perf_pmu_enable(cpuctx->ctx.pmu);
>>  }
>>  
>> @@ -2491,7 +2503,7 @@ static int  __perf_install_in_context(void *info)
>>  	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
>>  	struct perf_event_context *task_ctx = cpuctx->task_ctx;
>>  	bool reprogram = true;
>> -	int ret = 0;
>> +	int ret = 0, mux =0;
>>  
>>  	raw_spin_lock(&cpuctx->ctx.lock);
>>  	if (ctx->task) {
>> @@ -2518,7 +2530,7 @@ static int  __perf_install_in_context(void *info)
>>  	}
>>  
>>  	if (reprogram) {
>> -		ctx_sched_out(ctx, cpuctx, EVENT_TIME);
>> +		ctx_sched_out(ctx, cpuctx, EVENT_TIME, mux);
>>  		add_event_to_ctx(event, ctx);
>>  		ctx_resched(cpuctx, task_ctx, get_event_type(event));
>>  	} else {
>> @@ -2655,13 +2667,14 @@ static void __perf_event_enable(struct perf_event *event,
>>  {
>>  	struct perf_event *leader = event->group_leader;
>>  	struct perf_event_context *task_ctx;
>> +	int mux = 0;
>>  
>>  	if (event->state >= PERF_EVENT_STATE_INACTIVE ||
>>  	    event->state <= PERF_EVENT_STATE_ERROR)
>>  		return;
>>  
>>  	if (ctx->is_active)
>> -		ctx_sched_out(ctx, cpuctx, EVENT_TIME);
>> +		ctx_sched_out(ctx, cpuctx, EVENT_TIME, mux);
>>  
>>  	__perf_event_mark_enabled(event);
>>  
>> @@ -2671,7 +2684,7 @@ static void __perf_event_enable(struct perf_event *event,
>>  	if (!event_filter_match(event)) {
>>  		if (is_cgroup_event(event))
>>  			perf_cgroup_defer_enabled(event);
>> -		ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
>> +		ctx_sched_in(ctx, cpuctx, EVENT_TIME, current, mux);
>>  		return;
>>  	}
>>  
>> @@ -2680,7 +2693,7 @@ static void __perf_event_enable(struct perf_event *event,
>>  	 * then don't put it on unless the group is on.
>>  	 */
>>  	if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
>> -		ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
>> +		ctx_sched_in(ctx, cpuctx, EVENT_TIME, current, mux);
> 
> And here.
> 
>>  		return;
>>  	}
>>  
>> @@ -2876,11 +2889,13 @@ EXPORT_SYMBOL_GPL(perf_event_refresh);
>>  
>>  static void ctx_sched_out(struct perf_event_context *ctx,
>>  			  struct perf_cpu_context *cpuctx,
>> -			  enum event_type_t event_type)
>> +			  enum event_type_t event_type, int mux)
>>  {
>>  	int is_active = ctx->is_active;
>> +	struct list_head *group_list;
>>  	struct perf_event *event;
>>  	struct rb_node *node;
>> +	int sw = -1, cpu = smp_processor_id();
> 
> Same thing seems to be happening with 'sw'.
> 
>>  	lockdep_assert_held(&ctx->lock);
>>  
>>  	if (likely(!ctx->nr_events)) {
>> @@ -2926,17 +2941,47 @@ static void ctx_sched_out(struct perf_event_context *ctx,
>>  
>>  	perf_pmu_disable(ctx->pmu);
>>  
>> -	if (is_active & EVENT_PINNED)
>> -		perf_event_groups_for_each(event, node,
>> -				&ctx->pinned_groups, group_node,
>> -				group_list, group_entry)
>> -			group_sched_out(event, cpuctx, ctx);
>> +	if (is_active & EVENT_PINNED) {
>> +		if (mux) {
> 
> So it's 'rotate', really.

Which 'rotate' do you mean? If the local variable from 
perf_rotate_context() then yes - it may be passed to this function 
call from there as mux value, but logically they are still different.

> 
>> +			perf_event_groups_for_each_cpu(event, cpu,
>> +					&ctx->pinned_groups,
>> +					group_list, group_entry) {
>> +					group_sched_out(event, cpuctx, ctx);
>> +			}
>> +			perf_event_groups_for_each_cpu(event, sw,
>> +					&ctx->pinned_groups,
>> +					group_list, group_entry) {
>> +					group_sched_out(event, cpuctx, ctx);
>> +			}
>> +		} else {
>> +			perf_event_groups_for_each(event, node,
>> +					&ctx->pinned_groups, group_node,
>> +					group_list, group_entry) {
>> +					group_sched_out(event, cpuctx, ctx);
>> +			}
>> +		}
>> +	}
>>  
>> -	if (is_active & EVENT_FLEXIBLE)
>> -		perf_event_groups_for_each(event, node,
>> -				&ctx->flexible_groups, group_node,
>> -				group_list, group_entry)
>> -			group_sched_out(event, cpuctx, ctx);
>> +	if (is_active & EVENT_FLEXIBLE) {
>> +		if (mux) {
>> +			perf_event_groups_for_each_cpu(event, cpu,
>> +					&ctx->flexible_groups,
>> +					group_list, group_entry) {
>> +					group_sched_out(event, cpuctx, ctx);
>> +			}
>> +			perf_event_groups_for_each_cpu(event, sw,
>> +					&ctx->flexible_groups,
>> +					group_list, group_entry) {
>> +					group_sched_out(event, cpuctx, ctx);
>> +			}
>> +		} else {
>> +			perf_event_groups_for_each(event, node,
>> +					&ctx->flexible_groups, group_node,
>> +					group_list, group_entry) {
>> +					group_sched_out(event, cpuctx, ctx);
>> +			}
>> +		}
>> +	}
>>  
>>  	perf_pmu_enable(ctx->pmu);
>>  }
>> @@ -3225,9 +3270,9 @@ void __perf_event_task_sched_out(struct task_struct *task,
>>   * Called with IRQs disabled
>>   */
>>  static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
>> -			      enum event_type_t event_type)
>> +			      enum event_type_t event_type, int mux)
>>  {
>> -	ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
>> +	ctx_sched_out(&cpuctx->ctx, cpuctx, event_type, mux);
>>  }
>>  
>>  static void
>> @@ -3287,11 +3332,13 @@ static void
>>  ctx_sched_in(struct perf_event_context *ctx,
>>  	     struct perf_cpu_context *cpuctx,
>>  	     enum event_type_t event_type,
>> -	     struct task_struct *task)
>> +	     struct task_struct *task, int mux)
>>  {
>>  	int is_active = ctx->is_active;
>> +	struct list_head *group_list;
>>  	struct perf_event *event;
>>  	struct rb_node *node;
>> +	int sw = -1, cpu = smp_processor_id();
>>  
>>  	lockdep_assert_held(&ctx->lock);
>>  
>> @@ -3319,35 +3366,69 @@ ctx_sched_in(struct perf_event_context *ctx,
>>  	 * First go through the list and put on any pinned groups
>>  	 * in order to give them the best chance of going on.
>>  	 */
>> -	if (is_active & EVENT_PINNED)
>> -		perf_event_groups_for_each(event, node,
>> -				&ctx->pinned_groups, group_node,
>> -				group_list, group_entry)
>> -			ctx_pinned_sched_in(event, cpuctx, ctx);
>> +	if (is_active & EVENT_PINNED) {
>> +		if (mux) {
>> +			perf_event_groups_for_each_cpu(event, sw,
>> +					&ctx->pinned_groups,
>> +					group_list, group_entry) {
>> +					ctx_pinned_sched_in(event, cpuctx, ctx);
>> +			}
>> +			perf_event_groups_for_each_cpu(event, cpu,
>> +					&ctx->pinned_groups,
>> +					group_list, group_entry) {
>> +					ctx_pinned_sched_in(event, cpuctx, ctx);
>> +			}
>> +		} else {
>> +			perf_event_groups_for_each(event, node,
>> +					&ctx->pinned_groups, group_node,
>> +					group_list, group_entry) {
>> +					ctx_pinned_sched_in(event, cpuctx, ctx);
>> +			}
>> +		}
>> +	}
>>  
>>  	/* Then walk through the lower prio flexible groups */
>>  	if (is_active & EVENT_FLEXIBLE) {
>>  		int can_add_hw = 1;
>> -		perf_event_groups_for_each(event, node,
>> -				&ctx->flexible_groups, group_node,
>> -				group_list, group_entry)
>> -			ctx_flexible_sched_in(event, cpuctx, ctx, &can_add_hw);
>> +		if (mux) {
>> +			perf_event_groups_for_each_cpu(event, sw,
>> +					&ctx->flexible_groups,
>> +					group_list, group_entry) {
>> +					ctx_flexible_sched_in(event, cpuctx,
>> +							ctx, &can_add_hw);
>> +			}
>> +			can_add_hw = 1;
>> +			perf_event_groups_for_each_cpu(event, cpu,
>> +					&ctx->flexible_groups,
>> +					group_list, group_entry) {
>> +					ctx_flexible_sched_in(event, cpuctx,
>> +							ctx, &can_add_hw);
>> +			}
>> +		} else {
>> +			perf_event_groups_for_each(event, node,
>> +					&ctx->flexible_groups, group_node,
>> +					group_list, group_entry) {
>> +					ctx_flexible_sched_in(event, cpuctx,
>> +							ctx, &can_add_hw);
>> +			}
>> +		}
>>  	}
>>  }
>>  
>>  static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
>>  			     enum event_type_t event_type,
>> -			     struct task_struct *task)
>> +			     struct task_struct *task, int mux)
>>  {
>>  	struct perf_event_context *ctx = &cpuctx->ctx;
>>  
>> -	ctx_sched_in(ctx, cpuctx, event_type, task);
>> +	ctx_sched_in(ctx, cpuctx, event_type, task, mux);
>>  }
>>  
>>  static void perf_event_context_sched_in(struct perf_event_context *ctx,
>>  					struct task_struct *task)
>>  {
>>  	struct perf_cpu_context *cpuctx;
>> +	int mux = 0;
>>  
>>  	cpuctx = __get_cpu_context(ctx);
>>  	if (cpuctx->task_ctx == ctx)
>> @@ -3371,8 +3452,8 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
>>  	 * events, no need to flip the cpuctx's events around.
>>  	 */
>>  	if (!RB_EMPTY_ROOT(&ctx->pinned_groups))
>> -		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
>> -	perf_event_sched_in(cpuctx, ctx, task);
>> +		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE, mux);
>> +	perf_event_sched_in(cpuctx, ctx, task, mux);
>>  	perf_pmu_enable(ctx->pmu);
>>  
>>  unlock:
>> @@ -3618,7 +3699,7 @@ static void rotate_ctx(struct perf_event_context *ctx)
>>  static int perf_rotate_context(struct perf_cpu_context *cpuctx)
>>  {
>>  	struct perf_event_context *ctx = NULL;
>> -	int rotate = 0;
>> +	int rotate = 0, mux = 1;
>>  
>>  	if (cpuctx->ctx.nr_events) {
>>  		if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
>> @@ -3637,15 +3718,15 @@ static int perf_rotate_context(struct perf_cpu_context *cpuctx)
>>  	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
>>  	perf_pmu_disable(cpuctx->ctx.pmu);
>>  
>> -	cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
>> +	cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE, mux);
> 
> It's '1'.
> 
>