include/linux/perf_event.h | 55 +++-- kernel/events/core.c | 604 +++++++++++++++++++++++++++++++++------------ 2 files changed, 491 insertions(+), 168 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 24a6358..8e1967f 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -550,6 +550,22 @@ struct pmu_event_list { struct list_head list; }; +struct perf_event_tstamp { + /* + * These are timestamps used for computing total_time_enabled + * and total_time_running when the event is in INACTIVE or + * ACTIVE state, measured in nanoseconds from an arbitrary point + * in time. + * enabled: the notional time when the event was enabled + * running: the notional time when the event was scheduled on + * stopped: in INACTIVE state, the notional time when the + * event was scheduled off. + */ + u64 enabled; + u64 running; + u64 stopped; +}; + /** * struct perf_event - performance event kernel representation: */ @@ -572,7 +588,20 @@ struct perf_event { */ struct list_head group_entry; struct list_head sibling_list; - + /* + * Node on the pinned or flexible tree located at the event context; + * the node may be empty in case its event is not directly attached + * to the tree but to group_list list of the event directly + * attached to the tree; + */ + struct rb_node group_node; + /* + * List keeps groups allocated for the same cpu; + * the list may be empty in case its event is not directly + * attached to the tree but to group_list list of the event directly + * attached to the tree; + */ + struct list_head group_list; /* * We need storage to track the entries in perf_pmu_migrate_context; we * cannot use the event_entry because of RCU and we want to keep the @@ -611,19 +640,11 @@ struct perf_event { u64 total_time_running; /* - * These are timestamps used for computing total_time_enabled - * and total_time_running when the event is in INACTIVE or - * ACTIVE state, measured in nanoseconds from an arbitrary point - * in time. - * tstamp_enabled: the notional time when the event was enabled - * tstamp_running: the notional time when the event was scheduled on - * tstamp_stopped: in INACTIVE state, the notional time when the - * event was scheduled off. + * tstamp points to the tstamp_data object below or to the object + * located at the event context; */ - u64 tstamp_enabled; - u64 tstamp_running; - u64 tstamp_stopped; - + struct perf_event_tstamp *tstamp; + struct perf_event_tstamp tstamp_data; /* * timestamp shadows the actual context timing but it can * be safely used in NMI interrupt context. 
It reflects the @@ -741,8 +762,8 @@ struct perf_event_context { struct mutex mutex; struct list_head active_ctx_list; - struct list_head pinned_groups; - struct list_head flexible_groups; + struct rb_root pinned_groups; + struct rb_root flexible_groups; struct list_head event_list; int nr_events; int nr_active; @@ -758,6 +779,10 @@ struct perf_event_context { */ u64 time; u64 timestamp; + /* + * Context cache for filtered out events; + */ + struct perf_event_tstamp tstamp_data; /* * These fields let us detect when two contexts have both diff --git a/kernel/events/core.c b/kernel/events/core.c index bc63f8d..2d02f75 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -555,11 +555,11 @@ void perf_sample_event_took(u64 sample_len_ns) static atomic64_t perf_event_id; static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, - enum event_type_t event_type); + enum event_type_t event_type, int mux); static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, enum event_type_t event_type, - struct task_struct *task); + struct task_struct *task, int mux); static void update_context_time(struct perf_event_context *ctx); static u64 perf_event_time(struct perf_event *event); @@ -701,6 +701,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode) struct perf_cpu_context *cpuctx; struct list_head *list; unsigned long flags; + int mux = 0; /* * Disable interrupts and preemption to avoid this CPU's @@ -716,7 +717,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode) perf_pmu_disable(cpuctx->ctx.pmu); if (mode & PERF_CGROUP_SWOUT) { - cpu_ctx_sched_out(cpuctx, EVENT_ALL); + cpu_ctx_sched_out(cpuctx, EVENT_ALL, mux); /* * must not be done before ctxswout due * to event_filter_match() in event_sched_out() @@ -735,7 +736,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode) */ cpuctx->cgrp = perf_cgroup_from_task(task, &cpuctx->ctx); - cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); + cpu_ctx_sched_in(cpuctx, EVENT_ALL, task, mux); } perf_pmu_enable(cpuctx->ctx.pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); @@ -865,10 +866,10 @@ perf_cgroup_mark_enabled(struct perf_event *event, event->cgrp_defer_enabled = 0; - event->tstamp_enabled = tstamp - event->total_time_enabled; + event->tstamp->enabled = tstamp - event->total_time_enabled; list_for_each_entry(sub, &event->sibling_list, group_entry) { if (sub->state >= PERF_EVENT_STATE_INACTIVE) { - sub->tstamp_enabled = tstamp - sub->total_time_enabled; + sub->tstamp->enabled = tstamp - sub->total_time_enabled; sub->cgrp_defer_enabled = 0; } } @@ -1378,6 +1379,9 @@ static void update_context_time(struct perf_event_context *ctx) ctx->time += now - ctx->timestamp; ctx->timestamp = now; + + ctx->tstamp_data.running += ctx->time - ctx->tstamp_data.stopped; + ctx->tstamp_data.stopped = ctx->time; } static u64 perf_event_time(struct perf_event *event) @@ -1419,16 +1423,16 @@ static void update_event_times(struct perf_event *event) else if (ctx->is_active) run_end = ctx->time; else - run_end = event->tstamp_stopped; + run_end = event->tstamp->stopped; - event->total_time_enabled = run_end - event->tstamp_enabled; + event->total_time_enabled = run_end - event->tstamp->enabled; if (event->state == PERF_EVENT_STATE_INACTIVE) - run_end = event->tstamp_stopped; + run_end = event->tstamp->stopped; else run_end = perf_event_time(event); - event->total_time_running = run_end - event->tstamp_running; + event->total_time_running = run_end - event->tstamp->running; } @@ -1458,8 +1462,12 @@ static enum event_type_t 
get_event_type(struct perf_event *event) return event_type; } -static struct list_head * -ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) +/* + * Extract pinned or flexible groups from the context + * based on event attrs bits; + */ +static struct rb_root * +get_event_groups(struct perf_event *event, struct perf_event_context *ctx) { if (event->attr.pinned) return &ctx->pinned_groups; @@ -1467,6 +1475,204 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) return &ctx->flexible_groups; } +static void +perf_event_groups_insert(struct rb_root *groups, + struct perf_event *event); + +static void +perf_event_groups_delete(struct rb_root *groups, + struct perf_event *event); + +/* + * Helper function to insert event into the pinned or + * flexible groups; + */ +static void +add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx) +{ + struct rb_root *groups; + + groups = get_event_groups(event, ctx); + perf_event_groups_insert(groups, event); +} + +/* + * Helper function to delete event from its groups; + */ +static void +del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx) +{ + struct rb_root *groups; + + groups = get_event_groups(event, ctx); + perf_event_groups_delete(groups, event); +} + +/* + * Insert a group into a tree using event->cpu as a key. If event->cpu node + * is already attached to the tree then the event is added to the attached + * group's group_list list. + */ +static void +perf_event_groups_insert(struct rb_root *groups, struct perf_event *event) +{ + struct rb_node **node; + struct rb_node *parent; + struct perf_event *node_event; + + WARN_ON_ONCE(!groups || !event); + WARN_ON_ONCE(!list_empty(&event->group_entry)); + + node = &groups->rb_node; + parent = *node; + + while (*node) { + parent = *node; + node_event = container_of(*node, + struct perf_event, group_node); + + if (event->cpu < node_event->cpu) { + node = &parent->rb_left; + } else if (event->cpu > node_event->cpu) { + node = &parent->rb_right; + } else { + list_add_tail(&event->group_entry, + &node_event->group_list); + return; + } + } + + list_add_tail(&event->group_entry, &event->group_list); + + rb_link_node(&event->group_node, parent, node); + rb_insert_color(&event->group_node, groups); +} + +/* + * Delete a group from a tree. If the group is directly attached to the tree + * it also detaches all groups on the group's group_list list. + */ +static void +perf_event_groups_delete(struct rb_root *groups, struct perf_event *event) +{ + struct perf_event *next; + + WARN_ON_ONCE(!event); + WARN_ON_ONCE(list_empty(&event->group_entry)); + + list_del_init(&event->group_entry); + + if (!RB_EMPTY_NODE(&event->group_node)) { + WARN_ON_ONCE(!groups); + if (!RB_EMPTY_ROOT(groups)) { + if (list_empty(&event->group_list)) { + rb_erase(&event->group_node, groups); + } else { + next = list_first_entry(&event->group_list, + struct perf_event, group_entry); + list_replace_init(&event->group_list, + &next->group_list); + rb_replace_node(&event->group_node, + &next->group_node, groups); + + } + } + RB_CLEAR_NODE(&event->group_node); + } +} + +/* + * Find group list by a cpu key and rotate it. 
+ */ +static void +perf_event_groups_rotate(struct rb_root *groups, int cpu) +{ + struct rb_node *node; + struct perf_event *node_event; + + WARN_ON_ONCE(!groups); + + node = groups->rb_node; + + while (node) { + node_event = container_of(node, + struct perf_event, group_node); + + if (cpu < node_event->cpu) { + node = node->rb_left; + } else if (cpu > node_event->cpu) { + node = node->rb_right; + } else { + list_rotate_left(&node_event->group_list); + break; + } + } +} + +/* + * Find group_list list by a cpu key and call provided callback for every + * group on the list. + */ + +typedef int(*perf_event_groups_iterate_f)(struct perf_event *, void *); + +static void +perf_event_groups_iterate_cpu(struct rb_root *groups, int cpu, + perf_event_groups_iterate_f callback, void *data) +{ + struct rb_node *node; + struct perf_event *event, *node_event; + + WARN_ON_ONCE(!groups); + + node = groups->rb_node; + + while (node) { + node_event = container_of(node, + struct perf_event, group_node); + + if (cpu < node_event->cpu) { + node = node->rb_left; + } else if (cpu > node_event->cpu) { + node = node->rb_right; + } else { + list_for_each_entry(event, &node_event->group_list, + group_entry) + callback(event, data); + break; + } + } +} + +/* + * Iterate event groups and call provided callback for every group in the tree. + * Iteration stops if the callback returns non zero. + */ +static int +perf_event_groups_iterate(struct rb_root *groups, + perf_event_groups_iterate_f callback, void *data) +{ + int ret = 0; + struct rb_node *node; + struct perf_event *event, *node_event; + + WARN_ON_ONCE(!groups); + + for (node = rb_first(groups); node; node = rb_next(node)) { + node_event = container_of(node, struct perf_event, group_node); + list_for_each_entry(event, &node_event->group_list, + group_entry) { + WARN_ON_ONCE(!(event->cpu == node_event->cpu)); + ret = callback(event, data); + if (ret) { + return ret; + } + } + } + + return ret; +} + /* * Add a event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. @@ -1485,12 +1691,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) * perf_group_detach can, at all times, locate all siblings. */ if (event->group_leader == event) { - struct list_head *list; - event->group_caps = event->event_caps; - - list = ctx_group_list(event, ctx); - list_add_tail(&event->group_entry, list); + add_event_to_groups(event, ctx); } list_update_cgroup_event(event, ctx, true); @@ -1681,7 +1883,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) list_del_rcu(&event->event_entry); if (event->group_leader == event) - list_del_init(&event->group_entry); + del_event_from_groups(event, ctx); update_group_times(event); @@ -1701,7 +1903,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) static void perf_group_detach(struct perf_event *event) { struct perf_event *sibling, *tmp; - struct list_head *list = NULL; lockdep_assert_held(&event->ctx->lock); @@ -1722,22 +1923,23 @@ static void perf_group_detach(struct perf_event *event) goto out; } - if (!list_empty(&event->group_entry)) - list = &event->group_entry; - /* * If this was a group event with sibling events then * upgrade the siblings to singleton events by adding them * to whatever list we are on. 
*/ list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { - if (list) - list_move_tail(&sibling->group_entry, list); + sibling->group_leader = sibling; /* Inherit group flags from the previous leader */ sibling->group_caps = event->group_caps; + if (!list_empty(&event->group_entry)) { + list_del_init(&sibling->group_entry); + add_event_to_groups(sibling, event->ctx); + } + WARN_ON_ONCE(sibling->ctx != event->ctx); } @@ -1806,9 +2008,13 @@ event_sched_out(struct perf_event *event, */ if (event->state == PERF_EVENT_STATE_INACTIVE && !event_filter_match(event)) { - delta = tstamp - event->tstamp_stopped; - event->tstamp_running += delta; - event->tstamp_stopped = tstamp; + delta = tstamp - event->tstamp->stopped; + event->tstamp->running += delta; + event->tstamp->stopped = tstamp; + if (event->tstamp != &event->tstamp_data) { + event->tstamp_data = *event->tstamp; + event->tstamp = &event->tstamp_data; + } } if (event->state != PERF_EVENT_STATE_ACTIVE) @@ -1816,7 +2022,7 @@ event_sched_out(struct perf_event *event, perf_pmu_disable(event->pmu); - event->tstamp_stopped = tstamp; + event->tstamp->stopped = tstamp; event->pmu->del(event, 0); event->oncpu = -1; event->state = PERF_EVENT_STATE_INACTIVE; @@ -1861,6 +2067,22 @@ group_sched_out(struct perf_event *group_event, cpuctx->exclusive = 0; } +struct group_sched_params { + struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx; + int can_add_hw; +}; + +static int +group_sched_out_callback(struct perf_event *event, void *data) +{ + struct group_sched_params *params = data; + + group_sched_out(event, params->cpuctx, params->ctx); + + return 0; +} + #define DETACH_GROUP 0x01UL /* @@ -2091,7 +2313,7 @@ event_sched_in(struct perf_event *event, goto out; } - event->tstamp_running += tstamp - event->tstamp_stopped; + event->tstamp->running += tstamp - event->tstamp->stopped; if (!is_software_event(event)) cpuctx->active_oncpu++; @@ -2163,8 +2385,8 @@ group_sched_in(struct perf_event *group_event, simulate = true; if (simulate) { - event->tstamp_running += now - event->tstamp_stopped; - event->tstamp_stopped = now; + event->tstamp->running += now - event->tstamp->stopped; + event->tstamp->stopped = now; } else { event_sched_out(event, cpuctx, ctx); } @@ -2216,43 +2438,45 @@ static void add_event_to_ctx(struct perf_event *event, list_add_event(event, ctx); perf_group_attach(event); - event->tstamp_enabled = tstamp; - event->tstamp_running = tstamp; - event->tstamp_stopped = tstamp; + event->tstamp->enabled = tstamp; + event->tstamp->running = tstamp; + event->tstamp->stopped = tstamp; } static void ctx_sched_out(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, - enum event_type_t event_type); + enum event_type_t event_type, int mux); static void ctx_sched_in(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, enum event_type_t event_type, - struct task_struct *task); + struct task_struct *task, int mux); static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, enum event_type_t event_type) { + int mux = 0; + if (!cpuctx->task_ctx) return; if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) return; - ctx_sched_out(ctx, cpuctx, event_type); + ctx_sched_out(ctx, cpuctx, event_type, mux); } static void perf_event_sched_in(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, - struct task_struct *task) + struct task_struct *task, int mux) { - cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task); + cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task, mux); if (ctx) - 
ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task); - cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task); + ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task, mux); + cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task, mux); if (ctx) - ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); + ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task, mux); } /* @@ -2276,6 +2500,7 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, { enum event_type_t ctx_event_type = event_type & EVENT_ALL; bool cpu_event = !!(event_type & EVENT_CPU); + int mux = 0; /* * If pinned groups are involved, flexible groups also need to be @@ -2296,11 +2521,11 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, * - otherwise, do nothing more. */ if (cpu_event) - cpu_ctx_sched_out(cpuctx, ctx_event_type); + cpu_ctx_sched_out(cpuctx, ctx_event_type, mux); else if (ctx_event_type & EVENT_PINNED) - cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); + cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE, mux); - perf_event_sched_in(cpuctx, task_ctx, current); + perf_event_sched_in(cpuctx, task_ctx, current, mux); perf_pmu_enable(cpuctx->ctx.pmu); } @@ -2318,6 +2543,7 @@ static int __perf_install_in_context(void *info) struct perf_event_context *task_ctx = cpuctx->task_ctx; bool reprogram = true; int ret = 0; + int mux = 0; raw_spin_lock(&cpuctx->ctx.lock); if (ctx->task) { @@ -2344,7 +2570,7 @@ static int __perf_install_in_context(void *info) } if (reprogram) { - ctx_sched_out(ctx, cpuctx, EVENT_TIME); + ctx_sched_out(ctx, cpuctx, EVENT_TIME, mux); add_event_to_ctx(event, ctx); ctx_resched(cpuctx, task_ctx, get_event_type(event)); } else { @@ -2463,10 +2689,10 @@ static void __perf_event_mark_enabled(struct perf_event *event) u64 tstamp = perf_event_time(event); event->state = PERF_EVENT_STATE_INACTIVE; - event->tstamp_enabled = tstamp - event->total_time_enabled; + event->tstamp->enabled = tstamp - event->total_time_enabled; list_for_each_entry(sub, &event->sibling_list, group_entry) { if (sub->state >= PERF_EVENT_STATE_INACTIVE) - sub->tstamp_enabled = tstamp - sub->total_time_enabled; + sub->tstamp->enabled = tstamp - sub->total_time_enabled; } } @@ -2480,13 +2706,14 @@ static void __perf_event_enable(struct perf_event *event, { struct perf_event *leader = event->group_leader; struct perf_event_context *task_ctx; + int mux = 0; if (event->state >= PERF_EVENT_STATE_INACTIVE || event->state <= PERF_EVENT_STATE_ERROR) return; if (ctx->is_active) - ctx_sched_out(ctx, cpuctx, EVENT_TIME); + ctx_sched_out(ctx, cpuctx, EVENT_TIME, mux); __perf_event_mark_enabled(event); @@ -2496,7 +2723,7 @@ static void __perf_event_enable(struct perf_event *event, if (!event_filter_match(event)) { if (is_cgroup_event(event)) perf_cgroup_defer_enabled(event); - ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); + ctx_sched_in(ctx, cpuctx, EVENT_TIME, current, mux); return; } @@ -2505,7 +2732,7 @@ static void __perf_event_enable(struct perf_event *event, * then don't put it on unless the group is on. 
*/ if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) { - ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); + ctx_sched_in(ctx, cpuctx, EVENT_TIME, current, mux); return; } @@ -2701,10 +2928,14 @@ EXPORT_SYMBOL_GPL(perf_event_refresh); static void ctx_sched_out(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, - enum event_type_t event_type) + enum event_type_t event_type, int mux) { int is_active = ctx->is_active; - struct perf_event *event; + struct group_sched_params params = { + .cpuctx = cpuctx, + .ctx = ctx + }; + int cpu = smp_processor_id(); lockdep_assert_held(&ctx->lock); @@ -2751,13 +2982,27 @@ static void ctx_sched_out(struct perf_event_context *ctx, perf_pmu_disable(ctx->pmu); if (is_active & EVENT_PINNED) { - list_for_each_entry(event, &ctx->pinned_groups, group_entry) - group_sched_out(event, cpuctx, ctx); + if (mux) { + perf_event_groups_iterate_cpu(&ctx->pinned_groups, -1, + group_sched_out_callback, ¶ms); + perf_event_groups_iterate_cpu(&ctx->pinned_groups, cpu, + group_sched_out_callback, ¶ms); + } else { + perf_event_groups_iterate(&ctx->pinned_groups, + group_sched_out_callback, ¶ms); + } } if (is_active & EVENT_FLEXIBLE) { - list_for_each_entry(event, &ctx->flexible_groups, group_entry) - group_sched_out(event, cpuctx, ctx); + if (mux) { + perf_event_groups_iterate_cpu(&ctx->flexible_groups, -1, + group_sched_out_callback, ¶ms); + perf_event_groups_iterate_cpu(&ctx->flexible_groups, cpu, + group_sched_out_callback, ¶ms); + } else { + perf_event_groups_iterate(&ctx->flexible_groups, + group_sched_out_callback, ¶ms); + } } perf_pmu_enable(ctx->pmu); } @@ -3046,78 +3291,85 @@ void __perf_event_task_sched_out(struct task_struct *task, * Called with IRQs disabled */ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, - enum event_type_t event_type) + enum event_type_t event_type, int mux) { - ctx_sched_out(&cpuctx->ctx, cpuctx, event_type); + ctx_sched_out(&cpuctx->ctx, cpuctx, event_type, mux); } -static void -ctx_pinned_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx) +static int +ctx_pinned_sched_in(struct perf_event *event, void *data) { - struct perf_event *event; + struct group_sched_params *params = data; - list_for_each_entry(event, &ctx->pinned_groups, group_entry) { - if (event->state <= PERF_EVENT_STATE_OFF) - continue; - if (!event_filter_match(event)) - continue; + if (event->state <= PERF_EVENT_STATE_OFF) + return 0; + if (!event_filter_match(event)) { + if (event->tstamp != ¶ms->ctx->tstamp_data) + event->tstamp = ¶ms->ctx->tstamp_data; + return 0; + } - /* may need to reset tstamp_enabled */ - if (is_cgroup_event(event)) - perf_cgroup_mark_enabled(event, ctx); + /* may need to reset tstamp_enabled */ + if (is_cgroup_event(event)) + perf_cgroup_mark_enabled(event, params->ctx); - if (group_can_go_on(event, cpuctx, 1)) - group_sched_in(event, cpuctx, ctx); + if (group_can_go_on(event, params->cpuctx, 1)) + group_sched_in(event, params->cpuctx, params->ctx); - /* - * If this pinned group hasn't been scheduled, - * put it in error state. - */ - if (event->state == PERF_EVENT_STATE_INACTIVE) { - update_group_times(event); - event->state = PERF_EVENT_STATE_ERROR; - } + /* + * If this pinned group hasn't been scheduled, + * put it in error state. 
+ */ + if (event->state == PERF_EVENT_STATE_INACTIVE) { + update_group_times(event); + event->state = PERF_EVENT_STATE_ERROR; } + + return 0; } -static void -ctx_flexible_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx) +static int +ctx_flexible_sched_in(struct perf_event *event, void *data) { - struct perf_event *event; - int can_add_hw = 1; + struct group_sched_params *params = data; - list_for_each_entry(event, &ctx->flexible_groups, group_entry) { - /* Ignore events in OFF or ERROR state */ - if (event->state <= PERF_EVENT_STATE_OFF) - continue; - /* - * Listen to the 'cpu' scheduling filter constraint - * of events: - */ - if (!event_filter_match(event)) - continue; + /* Ignore events in OFF or ERROR state */ + if (event->state <= PERF_EVENT_STATE_OFF) + return 0; + /* + * Listen to the 'cpu' scheduling filter constraint + * of events: + */ + if (!event_filter_match(event)) { + if (event->tstamp != ¶ms->ctx->tstamp_data) + event->tstamp = ¶ms->ctx->tstamp_data; + return 0; + } - /* may need to reset tstamp_enabled */ - if (is_cgroup_event(event)) - perf_cgroup_mark_enabled(event, ctx); + /* may need to reset tstamp_enabled */ + if (is_cgroup_event(event)) + perf_cgroup_mark_enabled(event, params->ctx); - if (group_can_go_on(event, cpuctx, can_add_hw)) { - if (group_sched_in(event, cpuctx, ctx)) - can_add_hw = 0; - } + if (group_can_go_on(event, params->cpuctx, params->can_add_hw)) { + if (group_sched_in(event, params->cpuctx, params->ctx)) + params->can_add_hw = 0; } + + return 0; } static void ctx_sched_in(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, enum event_type_t event_type, - struct task_struct *task) + struct task_struct *task, int mux) { int is_active = ctx->is_active; - u64 now; + struct group_sched_params params = { + .cpuctx = cpuctx, + .ctx = ctx + }; + int cpu = smp_processor_id(); lockdep_assert_held(&ctx->lock); @@ -3136,7 +3388,7 @@ ctx_sched_in(struct perf_event_context *ctx, if (is_active & EVENT_TIME) { /* start ctx time */ - now = perf_clock(); + u64 now = perf_clock(); ctx->timestamp = now; perf_cgroup_set_timestamp(task, ctx); } @@ -3145,27 +3397,56 @@ ctx_sched_in(struct perf_event_context *ctx, * First go through the list and put on any pinned groups * in order to give them the best chance of going on. 
*/ - if (is_active & EVENT_PINNED) - ctx_pinned_sched_in(ctx, cpuctx); + + if (is_active & EVENT_PINNED) { + if (mux) { + perf_event_groups_iterate_cpu(&ctx->pinned_groups, + -1, ctx_pinned_sched_in, + ¶ms); + perf_event_groups_iterate_cpu(&ctx->pinned_groups, + cpu, ctx_pinned_sched_in, + ¶ms); + } else { + perf_event_groups_iterate(&ctx->pinned_groups, + ctx_pinned_sched_in, + ¶ms); + } + } /* Then walk through the lower prio flexible groups */ - if (is_active & EVENT_FLEXIBLE) - ctx_flexible_sched_in(ctx, cpuctx); + if (is_active & EVENT_FLEXIBLE) { + if (mux) { + params.can_add_hw = 1; + perf_event_groups_iterate_cpu(&ctx->flexible_groups, + -1, ctx_flexible_sched_in, + ¶ms); + params.can_add_hw = 1; + perf_event_groups_iterate_cpu(&ctx->flexible_groups, + cpu, ctx_flexible_sched_in, + ¶ms); + } else { + params.can_add_hw = 1; + perf_event_groups_iterate(&ctx->flexible_groups, + ctx_flexible_sched_in, + ¶ms); + } + } } static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, enum event_type_t event_type, - struct task_struct *task) + struct task_struct *task, int mux) { struct perf_event_context *ctx = &cpuctx->ctx; - ctx_sched_in(ctx, cpuctx, event_type, task); + ctx_sched_in(ctx, cpuctx, event_type, task, mux); } static void perf_event_context_sched_in(struct perf_event_context *ctx, struct task_struct *task) { struct perf_cpu_context *cpuctx; + int mux = 0; cpuctx = __get_cpu_context(ctx); if (cpuctx->task_ctx == ctx) @@ -3181,9 +3462,9 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, * However, if task's ctx is not carrying any pinned * events, no need to flip the cpuctx's events around. */ - if (!list_empty(&ctx->pinned_groups)) - cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - perf_event_sched_in(cpuctx, ctx, task); + if (!RB_EMPTY_ROOT(&ctx->pinned_groups)) + cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE, mux); + perf_event_sched_in(cpuctx, ctx, task, mux); perf_pmu_enable(ctx->pmu); perf_ctx_unlock(cpuctx, ctx); } @@ -3416,14 +3697,19 @@ static void rotate_ctx(struct perf_event_context *ctx) * Rotate the first entry last of non-pinned groups. Rotation might be * disabled by the inheritance code. 
*/ - if (!ctx->rotate_disable) - list_rotate_left(&ctx->flexible_groups); + if (!ctx->rotate_disable) { + int cpu = smp_processor_id(); + + perf_event_groups_rotate(&ctx->flexible_groups, -1); + perf_event_groups_rotate(&ctx->flexible_groups, cpu); + } } static int perf_rotate_context(struct perf_cpu_context *cpuctx) { struct perf_event_context *ctx = NULL; int rotate = 0; + int mux = 1; if (cpuctx->ctx.nr_events) { if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) @@ -3442,15 +3728,15 @@ static int perf_rotate_context(struct perf_cpu_context *cpuctx) perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_pmu_disable(cpuctx->ctx.pmu); - cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); + cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE, mux); if (ctx) - ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); + ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE, mux); rotate_ctx(&cpuctx->ctx); if (ctx) rotate_ctx(ctx); - perf_event_sched_in(cpuctx, ctx, current); + perf_event_sched_in(cpuctx, ctx, current, mux); perf_pmu_enable(cpuctx->ctx.pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); @@ -3502,6 +3788,7 @@ static void perf_event_enable_on_exec(int ctxn) struct perf_event *event; unsigned long flags; int enabled = 0; + int mux = 0; local_irq_save(flags); ctx = current->perf_event_ctxp[ctxn]; @@ -3510,7 +3797,7 @@ static void perf_event_enable_on_exec(int ctxn) cpuctx = __get_cpu_context(ctx); perf_ctx_lock(cpuctx, ctx); - ctx_sched_out(ctx, cpuctx, EVENT_TIME); + ctx_sched_out(ctx, cpuctx, EVENT_TIME, mux); list_for_each_entry(event, &ctx->event_list, event_entry) { enabled |= event_enable_on_exec(event, ctx); event_type |= get_event_type(event); @@ -3523,7 +3810,7 @@ static void perf_event_enable_on_exec(int ctxn) clone_ctx = unclone_ctx(ctx); ctx_resched(cpuctx, ctx, event_type); } else { - ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); + ctx_sched_in(ctx, cpuctx, EVENT_TIME, current, mux); } perf_ctx_unlock(cpuctx, ctx); @@ -3743,8 +4030,8 @@ static void __perf_event_init_context(struct perf_event_context *ctx) raw_spin_lock_init(&ctx->lock); mutex_init(&ctx->mutex); INIT_LIST_HEAD(&ctx->active_ctx_list); - INIT_LIST_HEAD(&ctx->pinned_groups); - INIT_LIST_HEAD(&ctx->flexible_groups); + ctx->pinned_groups = RB_ROOT; + ctx->flexible_groups = RB_ROOT; INIT_LIST_HEAD(&ctx->event_list); atomic_set(&ctx->refcount, 1); } @@ -4843,8 +5130,8 @@ static void calc_timer_values(struct perf_event *event, *now = perf_clock(); ctx_time = event->shadow_ctx_time + *now; - *enabled = ctx_time - event->tstamp_enabled; - *running = ctx_time - event->tstamp_running; + *enabled = ctx_time - event->tstamp->enabled; + *running = ctx_time - event->tstamp->running; } static void perf_event_init_userpage(struct perf_event *event) @@ -9379,6 +9666,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, INIT_LIST_HEAD(&event->group_entry); INIT_LIST_HEAD(&event->event_entry); INIT_LIST_HEAD(&event->sibling_list); + RB_CLEAR_NODE(&event->group_node); + INIT_LIST_HEAD(&event->group_list); INIT_LIST_HEAD(&event->rb_entry); INIT_LIST_HEAD(&event->active_entry); INIT_LIST_HEAD(&event->addr_filters.list); @@ -9392,6 +9681,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, raw_spin_lock_init(&event->addr_filters.lock); atomic_long_set(&event->refcount, 1); + event->tstamp = &event->tstamp_data; event->cpu = cpu; event->attr = *attr; event->group_leader = group_leader; @@ -10767,6 +11057,14 @@ static int inherit_group(struct perf_event *parent_event, return 0; } +struct inherit_task_group_params { + struct task_struct *parent; + struct 
perf_event_context *parent_ctx; + struct task_struct *child; + int ctxn; + int inherited_all; +}; + /* * Creates the child task context and tries to inherit the event-group. * @@ -10779,20 +11077,18 @@ static int inherit_group(struct perf_event *parent_event, * - <0 on error */ static int -inherit_task_group(struct perf_event *event, struct task_struct *parent, - struct perf_event_context *parent_ctx, - struct task_struct *child, int ctxn, - int *inherited_all) +inherit_task_group(struct perf_event *event, void *data) { int ret; struct perf_event_context *child_ctx; + struct inherit_task_group_params *params = data; if (!event->attr.inherit) { - *inherited_all = 0; + params->inherited_all = 0; return 0; } - child_ctx = child->perf_event_ctxp[ctxn]; + child_ctx = params->child->perf_event_ctxp[params->ctxn]; if (!child_ctx) { /* * This is executed from the parent task context, so @@ -10800,18 +11096,19 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, * First allocate and initialize a context for the * child. */ - child_ctx = alloc_perf_context(parent_ctx->pmu, child); + child_ctx = alloc_perf_context(params->parent_ctx->pmu, + params->child); if (!child_ctx) return -ENOMEM; - child->perf_event_ctxp[ctxn] = child_ctx; + params->child->perf_event_ctxp[params->ctxn] = child_ctx; } - ret = inherit_group(event, parent, parent_ctx, - child, child_ctx); + ret = inherit_group(event, params->parent, params->parent_ctx, + params->child, child_ctx); if (ret) - *inherited_all = 0; + params->inherited_all = 0; return ret; } @@ -10823,11 +11120,15 @@ static int perf_event_init_context(struct task_struct *child, int ctxn) { struct perf_event_context *child_ctx, *parent_ctx; struct perf_event_context *cloned_ctx; - struct perf_event *event; struct task_struct *parent = current; - int inherited_all = 1; unsigned long flags; int ret = 0; + struct inherit_task_group_params params = { + .parent = parent, + .child = child, + .ctxn = ctxn, + .inherited_all = 1 + }; if (likely(!parent->perf_event_ctxp[ctxn])) return 0; @@ -10840,6 +11141,8 @@ static int perf_event_init_context(struct task_struct *child, int ctxn) if (!parent_ctx) return 0; + params.parent_ctx = parent_ctx; + /* * No need to check if parent_ctx != NULL here; since we saw * it non-NULL earlier, the only reason for it to become NULL @@ -10857,13 +11160,10 @@ static int perf_event_init_context(struct task_struct *child, int ctxn) * We dont have to disable NMIs - we are only looking at * the list, not manipulating it: */ - list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) { - ret = inherit_task_group(event, parent, parent_ctx, - child, ctxn, &inherited_all); - if (ret) - goto out_unlock; - } - + ret = perf_event_groups_iterate(&parent_ctx->pinned_groups, + inherit_task_group, ¶ms); + if (ret) + goto out_unlock; /* * We can't hold ctx->lock when iterating the ->flexible_group list due * to allocations, but we need to prevent rotation because @@ -10873,19 +11173,17 @@ static int perf_event_init_context(struct task_struct *child, int ctxn) parent_ctx->rotate_disable = 1; raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); - list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { - ret = inherit_task_group(event, parent, parent_ctx, - child, ctxn, &inherited_all); - if (ret) - goto out_unlock; - } + ret = perf_event_groups_iterate(&parent_ctx->flexible_groups, + inherit_task_group, ¶ms); + if (ret) + goto out_unlock; raw_spin_lock_irqsave(&parent_ctx->lock, flags); parent_ctx->rotate_disable = 
0; child_ctx = child->perf_event_ctxp[ctxn]; - if (child_ctx && inherited_all) { + if (child_ctx && params.inherited_all) { /* * Mark the child context as a clone of the parent * context, or of whatever the parent is a clone of.
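
For readers who want to experiment with the data-structure change outside the kernel, below is a minimal userspace sketch of the idea the patch implements: a container keyed by event->cpu, where all groups sharing a key sit on a per-key list, so the scheduler can walk only the cpu == -1 and cpu == <current> entries (the mux path) and rotate a single per-CPU list. This is an illustrative model only, not the patch's code: the struct and function names (group, bucket, groups_insert, ...) are invented for the example, and a sorted singly-linked bucket list stands in for the kernel's rb_root/rb_node so the sketch compiles and runs on its own.

/*
 * Standalone model of the "tree keyed by CPU, list per key" layout.
 * The kernel patch uses struct rb_root/rb_node plus a list_head per
 * node (group_node/group_list); here a sorted singly-linked bucket
 * list is the stand-in so the example needs no kernel headers.
 */
#include <stdio.h>
#include <stdlib.h>

struct group {                 /* models one event group leader */
	int cpu;               /* -1 means "any CPU", like event->cpu */
	const char *name;
	struct group *next;    /* models the per-CPU group_list linkage */
};

struct bucket {                /* models one tree node (one CPU key) */
	int cpu;
	struct group *head, *tail;
	struct bucket *next;   /* stand-in for rb-tree ordering */
};

static struct bucket *buckets; /* stand-in for a ctx->*_groups rb_root */

/* Insert keyed by cpu; same-key groups are appended to the bucket's
 * list, mirroring perf_event_groups_insert() in the patch. */
static void groups_insert(struct group *g)
{
	struct bucket **link = &buckets, *b;

	while (*link && (*link)->cpu < g->cpu)
		link = &(*link)->next;

	if (!*link || (*link)->cpu != g->cpu) {
		b = calloc(1, sizeof(*b));
		b->cpu = g->cpu;
		b->next = *link;
		*link = b;
	}
	b = *link;
	g->next = NULL;
	if (b->tail)
		b->tail->next = g;
	else
		b->head = g;
	b->tail = g;
}

/* Rotate one per-CPU list left, like the list_rotate_left() call made
 * by perf_event_groups_rotate() for the matching cpu key. */
static void groups_rotate(int cpu)
{
	struct bucket *b;
	struct group *first;

	for (b = buckets; b; b = b->next) {
		if (b->cpu != cpu)
			continue;
		first = b->head;
		if (!first || first == b->tail)
			return;
		b->head = first->next;
		first->next = NULL;
		b->tail->next = first;
		b->tail = first;
		return;
	}
}

/* Visit only the groups relevant for one CPU: the -1 ("any CPU")
 * bucket first, then the bucket for that CPU -- the mux == 1 path. */
static void groups_visit_cpu(int cpu)
{
	struct bucket *b;
	struct group *g;

	for (b = buckets; b; b = b->next)
		if (b->cpu == -1 || b->cpu == cpu)
			for (g = b->head; g; g = g->next)
				printf("  cpu %2d: %s\n", b->cpu, g->name);
}

int main(void)
{
	struct group g[] = {
		{ -1, "task-wide A" }, { -1, "task-wide B" },
		{  1, "cpu1 X" },      {  3, "cpu3 Y" },
	};
	for (unsigned i = 0; i < sizeof(g) / sizeof(g[0]); i++)
		groups_insert(&g[i]);

	printf("schedule for cpu 1:\n");
	groups_visit_cpu(1);
	groups_rotate(-1);
	printf("after rotating the -1 bucket:\n");
	groups_visit_cpu(1);
	return 0;
}

Running the model shows the fairness property the patch preserves: rotating only the -1 bucket reorders the task-wide groups without touching the per-CPU ones, and a scheduling pass for CPU 1 never walks the cpu 3 entries at all.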
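
The other half of the patch is the tstamp indirection: an event's tstamp pointer normally refers to its own tstamp_data, but events filtered out on the current CPU can be pointed at the single per-context copy so that one context-time update covers all of them. Below is a minimal sketch of that pointer-aliasing idea, assuming simplified stand-in types rather than the real perf_event / perf_event_context layouts.

/*
 * Sketch of the tstamp redirection.  struct ctx and struct event are
 * simplified stand-ins; only the pointer-aliasing pattern matches the
 * patch.
 */
#include <stdio.h>

struct tstamp { unsigned long long enabled, running, stopped; };

struct ctx   { struct tstamp tstamp_data; };
struct event { struct tstamp *tstamp; struct tstamp tstamp_data; };

int main(void)
{
	struct ctx ctx = { { 0, 0, 0 } };
	struct event ev = { .tstamp = &ev.tstamp_data };

	/* event filtered out: share the context-wide timestamps */
	ev.tstamp = &ctx.tstamp_data;

	/* one context-time update now covers every redirected event */
	ctx.tstamp_data.running += 100;
	ctx.tstamp_data.stopped += 100;

	printf("event sees running=%llu\n", ev.tstamp->running); /* 100 */
	return 0;
}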