linux-kernel - Re: [PATCH v9 12/14] x86: perf: intel

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAB7JpEFo=gmx9mFxDi06fueunFrpqgsKtg4_eC-PuUeuCHre1g@mail.gmail.com>
Date:	Thu, 29 Jan 2015 17:03:21 +0200
From:	Alexander Shishkin <alexander.shishkin@...ux.intel.com>
To:	Peter Zijlstra <peterz@...radead.org>
Cc:	Alexander Shishkin <alexander.shishkin@...ux.intel.com>,
	Ingo Molnar <mingo@...hat.com>,
	Linux Kernel Mailing List <linux-kernel@...r.kernel.org>,
	Robert Richter <rric@...nel.org>,
	Frederic Weisbecker <fweisbec@...il.com>,
	Mike Galbraith <efault@....de>,
	Paul Mackerras <paulus@...ba.org>,
	Stephane Eranian <eranian@...gle.com>,
	Andi Kleen <ak@...ux.intel.com>, kan.liang@...el.com,
	adrian.hunter@...el.com, markus.t.metzger@...el.com,
	mathieu.poirier@...aro.org, Kaixu Xia <kaixu.xia@...aro.org>,
	acme@...radead.org
Subject: Re: [PATCH v9 12/14] x86: perf: intel_pt: Intel PT PMU driver

On 29 January 2015 at 13:59, Peter Zijlstra <peterz@...radead.org> wrote:
> I think that logic fails for per-task events that have a cpu set.

True.

>> +static bool exclusive_event_ok(struct perf_event *event,
>> +                            struct perf_event_context *ctx)
>> +{
>> +     struct pmu *pmu = event->pmu;
>> +     bool ret = true;
>> +
>> +     if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
>> +             return true;
>> +
>> +     if (!__exclusive_event_ok(event, ctx))
>> +             return false;
>> +
>> +     if (ctx->task) {
>> +             if (event->cpu != -1) {
>> +                     struct perf_event_context *cpuctx;
>> +
>> +                     cpuctx = &per_cpu_ptr(pmu->pmu_cpu_context, event->cpu)->ctx;
>> +
>> +                     mutex_lock(&cpuctx->mutex);
>> +                     ret = __exclusive_event_ok(event, cpuctx);
>> +                     mutex_unlock(&cpuctx->mutex);
>
> We're already holding ctx->mutex, this should have made lockdep scream.

As I mentioned offline, cpuctx->ctx.mutex is set to a lockdep class of
its own, so lockdep doesn't see this. It is, of course, still a
problem.
But as you pointed out, if we grab the exclusive_cnt for per-task
cpu!=-1 events as well, we don't need to look into other contexts here
at all.

>
>> +             }
>> +     }
>> +
>> +     return ret;
>> +}
>
>
> Would something like this work?
>
> ---
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -168,6 +168,7 @@ struct perf_event;
>  #define PERF_PMU_CAP_NO_INTERRUPT              0x01
>  #define PERF_PMU_CAP_AUX_NO_SG                 0x02
>  #define PERF_PMU_CAP_AUX_SW_DOUBLEBUF          0x04
> +#define PERF_PMU_CAP_EXCLUSIVE                 0x08
>
>  /**
>   * struct pmu - generic performance monitoring unit
> @@ -188,6 +189,7 @@ struct pmu {
>
>         int * __percpu                  pmu_disable_count;
>         struct perf_cpu_context * __percpu pmu_cpu_context;
> +       atomic_t                        exclusive_cnt; /* <0: cpu, >0: tsk */
>         int                             task_ctx_nr;
>         int                             hrtimer_interval_ms;
>
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -3487,6 +3487,9 @@ static void __free_event(struct perf_eve
>         if (event->destroy)
>                 event->destroy(event);
>
> +       if (event->pmu && event->ctx)
> +               exclusive_event_release(event);

It looks like event can be already removed from its context at this
point, so event->ctx will be NULL and the counter would leak or am I
missing something?
I used the event->attach_state & PERF_ATTACH_TASK to see if it's a
per-task counter, which seems reliable even though it might need
documenting.

> +
>         if (event->ctx)
>                 put_ctx(event->ctx);
>
> @@ -7092,6 +7095,7 @@ int perf_pmu_register(struct pmu *pmu, c
>                 pmu->event_idx = perf_event_idx_default;
>
>         list_add_rcu(&pmu->entry, &pmus);
> +       atomic_set(&pmu->exclusive_cnt, 0);
>         ret = 0;
>  unlock:
>         mutex_unlock(&pmus_lock);
> @@ -7537,6 +7541,78 @@ perf_event_set_output(struct perf_event
>         return ret;
>  }
>
> +static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
> +{
> +       if ((e1->pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) &&
> +           (e1->cpu == e2->cpu ||
> +            e1->cpu == -1 ||
> +            e2->cpu == -1))
> +               return true;
> +       return false;
> +}
> +
> +static bool __exclusive_event_ok(struct perf_event *event,
> +                                struct perf_event_context *ctx)
> +{
> +       struct perf_event *iter_event;
> +
> +       list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
> +               if (exclusive_event_match(iter_event, event))
> +                       return false;
> +       }
> +
> +       return true;
> +}
> +
> +static bool exclusive_event_ok(struct perf_event *event,
> +                              struct perf_event_context *ctx)

Then, maybe exclusive_event_installable() is a better name, because
this one only deals with the current context; cpu-wide vs per-task
case is taken care of in perf_event_alloc()/__free_event().

> +{
> +       struct pmu *pmu = event->pmu;
> +       bool ret;
> +
> +       if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
> +               return true;
> +
> +       /*
> +        * exclusive_cnt <0: cpu
> +        *               >0: tsk
> +        */
> +       if (ctx->task) {
> +               if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
> +                       return false;
> +       } else {
> +               if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
> +                       return false;
> +       }

So I would like to keep this bit in perf_event_alloc() path and the
reverse in __free_event(),

> +
> +       mutex_lock(&ctx->lock);
> +       ret = __exclusive_event_ok(event, ctx);
> +       mutex_unlock(&ctx->lock);
> +
> +       if (!ret) {
> +               if (ctx->task)
> +                       atomic_dec(&pmu->exclusive_cnt);
> +               else
> +                       atomic_inc(&pmu->exclusive_cnt);
> +       }

in which case we don't need to undo the counter here, because it will
still go through __free_event() in the error path.

> +
> +       return ret;
> +}
> +
> +static void exclusive_event_release(struct perf_event *event)
> +{
> +       struct perf_event_context *ctx = event->ctx;
> +       struct pmu *pmu = event->pmu;
> +
> +       if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
> +               return;
> +
> +       if (ctx->task)
> +               atomic_dec(&pmu->exclusive_cnt);
> +       else
> +               atomic_inc(&pmu->exclusive_cnt);
> +}
> +
>  static void mutex_lock_double(struct mutex *a, struct mutex *b)
>  {
>         if (b < a)
> @@ -7702,6 +7778,15 @@ SYSCALL_DEFINE5(perf_event_open,
>                 task = NULL;
>         }
>
> +       if (pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) {
> +               err = -EBUSY;
> +               if (group_leader)
> +                       goto err_context;
> +
> +               if (!exclusive_event_ok(event, ctx))
> +                       goto err_context;
> +       }
> +
>         /*
>          * Look up the group leader (we will attach this event to it):
>          */
> @@ -7903,6 +7988,11 @@ perf_event_create_kernel_counter(struct
>                 goto err_free;
>         }
>
> +       if (!exclusive_event_ok(event, ctx)) {
> +               err = -EBUSY;
> +               goto err_context;
> +       }
> +
>         WARN_ON_ONCE(ctx->parent_ctx);
>         mutex_lock(&ctx->mutex);

So we can call exclusive_event_ok() here without dropping the
ctx::mutex, to make sure we don't race with another event creation.

Regards,
--
Alex
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/