From: Steven Rostedt Currently the filter_match_preds() requires a stack to push and pop the preds to determine if the filter matches the record or not. This has two drawbacks: 1) It requires a stack to store state information. As this is done in fast paths we can't allocate the storage for this stack, and we can't use a global as it must be re-entrant. The stack is stored on the kernel stack and this greatly limits how many preds we may allow. 2) All conditions are calculated even when a short circuit exists. a || b will always calculate a and b even though a was determined to be true. Using a tree we can walk a constant structure that will save the state as we go. The algorithm is simply: pred = root; do { switch (move) { case MOVE_DOWN: if (OR or AND) { pred = left; continue; } if (pred == root) break; match = pred->fn(); pred = pred->parent; move = left child ? MOVE_UP_FROM_LEFT : MOVE_UP_FROM_RIGHT; continue; case MOVE_UP_FROM_LEFT: /* Only OR or AND can be a parent */ if (match && OR || !match && AND) { /* short circuit */ if (pred == root) break; pred = pred->parent; move = left child ? MOVE_UP_FROM_LEFT : MOVE_UP_FROM_RIGHT; continue; } pred = pred->right; move = MOVE_DOWN; continue; case MOVE_UP_FROM_RIGHT: if (pred == root) break; pred = pred->parent; move = left child ? MOVE_UP_FROM_LEFT : MOVE_UP_FROM_RIGHT; continue; } done = 1; } while (!done); This way there's no strict limit to how many preds we allow and it also will short circuit the logical operations when possible. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 9 ++- kernel/trace/trace_events_filter.c | 231 +++++++++++++++++++++++++++++------- 2 files changed, 194 insertions(+), 46 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 254d04a..bba34a7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -664,6 +664,7 @@ struct event_filter { int n_preds; /* Number assigned */ int a_preds; /* allocated */ struct filter_pred *preds; + struct filter_pred *root; char *filter_string; }; @@ -675,6 +676,9 @@ struct event_subsystem { int nr_events; }; +#define FILTER_PRED_INVALID ((unsigned short)-1) +#define FILTER_PRED_IS_RIGHT (1 << 15) + struct filter_pred; struct regex; @@ -704,7 +708,10 @@ struct filter_pred { int offset; int not; int op; - int pop_n; + unsigned short index; + unsigned short parent; + unsigned short left; + unsigned short right; }; extern struct list_head ftrace_common_fields; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 23753f6..2677924 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -123,6 +123,11 @@ struct filter_parse_state { } operand; }; +struct pred_stack { + struct filter_pred **preds; + int index; +}; + #define DEFINE_COMPARISON_PRED(type) \ static int filter_pred_##type(struct filter_pred *pred, void *event) \ { \ @@ -357,52 +362,95 @@ static void filter_build_regex(struct filter_pred *pred) pred->not ^= not; } +enum move_type { + MOVE_DOWN, + MOVE_UP_FROM_LEFT, + MOVE_UP_FROM_RIGHT +}; + +static struct filter_pred * +get_pred_parent(struct filter_pred *pred, struct filter_pred *preds, + int index, enum move_type *move) +{ + if (pred->parent & FILTER_PRED_IS_RIGHT) + *move = MOVE_UP_FROM_RIGHT; + else + *move = MOVE_UP_FROM_LEFT; + pred = &preds[pred->parent & ~FILTER_PRED_IS_RIGHT]; + + return pred; +} + /* return 1 if event matches, 0 otherwise (discard) */ int filter_match_preds(struct event_filter *filter, void *rec) { - int match = -1, top = 0, val1 = 0, val2 = 0; - int stack[MAX_FILTER_PRED]; + int match = -1; + enum move_type move = MOVE_DOWN; struct filter_pred *preds; struct filter_pred *pred; + struct filter_pred *root; int n_preds = ACCESS_ONCE(filter->n_preds); - int i; + int done = 0; /* no filter is considered a match */ if (!n_preds) return 1; /* - * n_preds and filter->preds is protect with preemption disabled. + * n_preds, root and filter->preds are protect with preemption disabled. */ preds = rcu_dereference_sched(filter->preds); + root = rcu_dereference_sched(filter->root); + if (!root) + return 1; - for (i = 0; i < n_preds; i++) { - pred = &preds[i]; - if (!pred->pop_n) { + pred = root; + + /* match is currently meaningless */ + match = -1; + + do { + switch (move) { + case MOVE_DOWN: + /* only AND and OR have children */ + if (pred->left != FILTER_PRED_INVALID) { + /* keep going to leaf node */ + pred = &preds[pred->left]; + continue; + } match = pred->fn(pred, rec); - stack[top++] = match; + /* If this pred is the only pred */ + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + case MOVE_UP_FROM_LEFT: + /* Check for short circuits */ + if ((match && pred->op == OP_OR) || + (!match && pred->op == OP_AND)) { + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + } + /* now go down the right side of the tree. */ + pred = &preds[pred->right]; + move = MOVE_DOWN; + continue; + case MOVE_UP_FROM_RIGHT: + /* We finished this equation. */ + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); continue; } - if (pred->pop_n > top) { - WARN_ON_ONCE(1); - return 0; - } - val1 = stack[--top]; - val2 = stack[--top]; - switch (pred->op) { - case OP_AND: - match = val1 && val2; - break; - case OP_OR: - match = val1 || val2; - break; - default: - WARN_ONCE(1, "filter op is not AND or OR"); - } - stack[top++] = match; - } + done = 1; + } while (!done); - return stack[--top]; + return match; } EXPORT_SYMBOL_GPL(filter_match_preds); @@ -539,10 +587,58 @@ static void filter_clear_pred(struct filter_pred *pred) pred->regex.len = 0; } -static int filter_set_pred(struct filter_pred *dest, +static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) +{ + stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL); + if (!stack->preds) + return -ENOMEM; + stack->index = n_preds; + return 0; +} + +static void __free_pred_stack(struct pred_stack *stack) +{ + kfree(stack->preds); + stack->index = 0; +} + +static int __push_pred_stack(struct pred_stack *stack, + struct filter_pred *pred) +{ + int index = stack->index; + + if (WARN_ON(index == 0)) + return -ENOSPC; + + stack->preds[--index] = pred; + stack->index = index; + return 0; +} + +static struct filter_pred * +__pop_pred_stack(struct pred_stack *stack) +{ + struct filter_pred *pred; + int index = stack->index; + + pred = stack->preds[index++]; + if (!pred) + return NULL; + + stack->index = index; + return pred; +} + +static int filter_set_pred(struct event_filter *filter, + int idx, + struct pred_stack *stack, struct filter_pred *src, filter_pred_fn_t fn) { + struct filter_pred *dest = &filter->preds[idx]; + struct filter_pred *left; + struct filter_pred *right; + *dest = *src; if (src->field_name) { dest->field_name = kstrdup(src->field_name, GFP_KERNEL); @@ -550,8 +646,25 @@ static int filter_set_pred(struct filter_pred *dest, return -ENOMEM; } dest->fn = fn; + dest->index = idx; - return 0; + if (dest->op == OP_OR || dest->op == OP_AND) { + right = __pop_pred_stack(stack); + left = __pop_pred_stack(stack); + if (!left || !right) + return -EINVAL; + dest->left = left->index; + dest->right = right->index; + left->parent = dest->index; + right->parent = dest->index | FILTER_PRED_IS_RIGHT; + } else + /* + * Make dest->left invalid to be used as a quick + * way to know this is a leaf node. + */ + dest->left = FILTER_PRED_INVALID; + + return __push_pred_stack(stack, dest); } static void __free_preds(struct event_filter *filter) @@ -574,6 +687,7 @@ static void reset_preds(struct event_filter *filter) int i; filter->n_preds = 0; + filter->root = NULL; if (!filter->preds) return; @@ -708,6 +822,7 @@ static int filter_add_pred_fn(struct filter_parse_state *ps, struct ftrace_event_call *call, struct event_filter *filter, struct filter_pred *pred, + struct pred_stack *stack, filter_pred_fn_t fn) { int idx, err; @@ -719,7 +834,7 @@ static int filter_add_pred_fn(struct filter_parse_state *ps, idx = filter->n_preds; filter_clear_pred(&filter->preds[idx]); - err = filter_set_pred(&filter->preds[idx], pred, fn); + err = filter_set_pred(filter, idx, stack, pred, fn); if (err) return err; @@ -804,6 +919,7 @@ static int filter_add_pred(struct filter_parse_state *ps, struct ftrace_event_call *call, struct event_filter *filter, struct filter_pred *pred, + struct pred_stack *stack, bool dry_run) { struct ftrace_event_field *field; @@ -813,13 +929,10 @@ static int filter_add_pred(struct filter_parse_state *ps, fn = pred->fn = filter_pred_none; - if (pred->op == OP_AND) { - pred->pop_n = 2; + if (pred->op == OP_AND) goto add_pred_fn; - } else if (pred->op == OP_OR) { - pred->pop_n = 2; + else if (pred->op == OP_OR) goto add_pred_fn; - } field = find_event_field(call, pred->field_name); if (!field) { @@ -868,7 +981,7 @@ static int filter_add_pred(struct filter_parse_state *ps, add_pred_fn: if (!dry_run) - return filter_add_pred_fn(ps, call, filter, pred, fn); + return filter_add_pred_fn(ps, call, filter, pred, stack, fn); return 0; } @@ -1249,6 +1362,7 @@ static int replace_preds(struct ftrace_event_call *call, char *operand1 = NULL, *operand2 = NULL; struct filter_pred *pred; struct postfix_elt *elt; + struct pred_stack stack = { }; /* init to NULL */ int err; int n_preds = 0; @@ -1263,9 +1377,12 @@ static int replace_preds(struct ftrace_event_call *call, return err; if (!dry_run) { - err = __alloc_preds(filter, n_preds); + err = __alloc_pred_stack(&stack, n_preds); if (err) return err; + err = __alloc_preds(filter, n_preds); + if (err) + goto fail; } n_preds = 0; @@ -1277,14 +1394,16 @@ static int replace_preds(struct ftrace_event_call *call, operand2 = elt->operand; else { parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0); - return -EINVAL; + err = -EINVAL; + goto fail; } continue; } if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) { parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); - return -ENOSPC; + err = -ENOSPC; + goto fail; } if (elt->op == OP_AND || elt->op == OP_OR) { @@ -1294,22 +1413,44 @@ static int replace_preds(struct ftrace_event_call *call, if (!operand1 || !operand2) { parse_error(ps, FILT_ERR_MISSING_FIELD, 0); - return -EINVAL; + err = -EINVAL; + goto fail; } pred = create_pred(elt->op, operand1, operand2); add_pred: - if (!pred) - return -ENOMEM; - err = filter_add_pred(ps, call, filter, pred, dry_run); + if (!pred) { + err = -ENOMEM; + goto fail; + } + err = filter_add_pred(ps, call, filter, pred, &stack, dry_run); filter_free_pred(pred); if (err) - return err; + goto fail; operand1 = operand2 = NULL; } - return 0; + if (!dry_run) { + /* We should have one item left on the stack */ + pred = __pop_pred_stack(&stack); + if (!pred) + return -EINVAL; + /* This item is where we start from in matching */ + filter->root = pred; + /* Make sure the stack is empty */ + pred = __pop_pred_stack(&stack); + if (WARN_ON(pred)) { + err = -EINVAL; + filter->root = NULL; + goto fail; + } + } + + err = 0; +fail: + __free_pred_stack(&stack); + return err; } static int replace_system_preds(struct event_subsystem *system, -- 1.7.2.3 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/