linux-kernel - Re: [PATCH v7 02/13] user_events: Add minimal support for trace

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20211210194358.e590d49a1620df7345f9f679@kernel.org>
Date:   Fri, 10 Dec 2021 19:43:58 +0900
From:   Masami Hiramatsu <mhiramat@...nel.org>
To:     Beau Belgrave <beaub@...ux.microsoft.com>
Cc:     rostedt@...dmis.org, linux-trace-devel@...r.kernel.org,
        linux-kernel@...r.kernel.org
Subject: Re: [PATCH v7 02/13] user_events: Add minimal support for
 trace_event into ftrace

Hi Beau,

Thanks for updating the patch! I have some comments below.

On Thu,  9 Dec 2021 14:31:59 -0800
Beau Belgrave <beaub@...ux.microsoft.com> wrote:

[..]
> +#define USER_EVENTS_PREFIX_LEN (sizeof(USER_EVENTS_PREFIX)-1)
> +
> +#define FIELD_DEPTH_TYPE 0
> +#define FIELD_DEPTH_NAME 1
> +#define FIELD_DEPTH_SIZE 2
> +
> +/*
> + * Limits how many trace_event calls user processes can create:
> + * Must be multiple of PAGE_SIZE.
> + */
> +#define MAX_PAGES 1
> +#define MAX_EVENTS (MAX_PAGES * PAGE_SIZE)
> +
> +/* Limit how long of an event name plus args within the subsystem. */
> +#define MAX_EVENT_DESC 512
> +#define EVENT_NAME(user_event) ((user_event)->tracepoint.name)
> +#define MAX_FIELD_ARRAY_SIZE (2 * PAGE_SIZE)

I don't recommend to record the event which size is more than a page size...
Maybe 256 entries?
It is also better to limit the total size of the event and the number
of fields (arguments).

Steve, can we write such a big event data on the trace buffer?

[..]
> +
> +static int user_field_array_size(const char *type)
> +{
> +	const char *start = strchr(type, '[');
> +	char val[8];
> +	int size = 0;
> +
> +	if (start == NULL)
> +		return -EINVAL;
> +
> +	start++;
> +
> +	while (*start != ']' && size < (sizeof(val) - 1))
> +		val[size++] = *start++;
> +
> +	if (*start != ']')
> +		return -EINVAL;
> +
> +	val[size] = 0;

It's '\0', not 0.

If I were you, I just use strlcpy(val, start, sizeof(val)), and
strchr(val, ']'). Sometimes using standard libc function will
be easer to understand what it does. :)

> +
> +	if (kstrtouint(val, 0, &size))
> +		return -EINVAL;
> +
> +	if (size > MAX_FIELD_ARRAY_SIZE)
> +		return -EINVAL;
> +
> +	return size;
> +}
> +
> +static int user_field_size(const char *type)
> +{
> +	/* long is not allowed from a user, since it's ambigious in size */
> +	if (strcmp(type, "s64") == 0)
> +		return sizeof(s64);
> +	if (strcmp(type, "u64") == 0)
> +		return sizeof(u64);
> +	if (strcmp(type, "s32") == 0)
> +		return sizeof(s32);
> +	if (strcmp(type, "u32") == 0)
> +		return sizeof(u32);
> +	if (strcmp(type, "int") == 0)
> +		return sizeof(int);
> +	if (strcmp(type, "unsigned int") == 0)
> +		return sizeof(unsigned int);
> +	if (strcmp(type, "s16") == 0)
> +		return sizeof(s16);
> +	if (strcmp(type, "u16") == 0)
> +		return sizeof(u16);
> +	if (strcmp(type, "short") == 0)
> +		return sizeof(short);
> +	if (strcmp(type, "unsigned short") == 0)
> +		return sizeof(unsigned short);
> +	if (strcmp(type, "s8") == 0)
> +		return sizeof(s8);
> +	if (strcmp(type, "u8") == 0)
> +		return sizeof(u8);
> +	if (strcmp(type, "char") == 0)
> +		return sizeof(char);
> +	if (strcmp(type, "unsigned char") == 0)
> +		return sizeof(unsigned char);
> +	if (str_has_prefix(type, "char["))
> +		return user_field_array_size(type);
> +	if (str_has_prefix(type, "unsigned char["))
> +		return user_field_array_size(type);
> +	if (str_has_prefix(type, "__data_loc "))
> +		return sizeof(u32);
> +	if (str_has_prefix(type, "__rel_loc "))
> +		return sizeof(u32);
> +
> +	/* Uknown basic type, error */
> +	return -EINVAL;
> +}
> +
> +static void user_event_destroy_fields(struct user_event *user)
> +{
> +	struct ftrace_event_field *field, *next;
> +	struct list_head *head = &user->fields;
> +
> +	list_for_each_entry_safe(field, next, head, link) {
> +		list_del(&field->link);
> +		kfree(field);
> +	}
> +}
> +
> +static int user_event_add_field(struct user_event *user, const char *type,
> +				const char *name, int offset, int size,
> +				int is_signed, int filter_type)
> +{
> +	struct ftrace_event_field *field;
> +
> +	field = kmalloc(sizeof(*field), GFP_KERNEL);
> +
> +	if (!field)
> +		return -ENOMEM;
> +
> +	field->type = type;
> +	field->name = name;
> +	field->offset = offset;
> +	field->size = size;
> +	field->is_signed = is_signed;
> +	field->filter_type = filter_type;
> +
> +	list_add(&field->link, &user->fields);

I recommend to use list_add_tail() here so that when accessing the
list of field without reverse order. (I found this in [4/13])

> +
> +	return 0;
> +}
> +
> +/*
> + * Parses the values of a field within the description
> + * Format: type name [size]

Hmm, don't you accept redundant spaces and tabs?
If this accepts the redundant spaces/tabs, I recommend you to use
argv_split() instead of strpbrk() etc. e.g.

	int argc, name_idx = 0, size;
	int ret = -EINVAL;
	char **argv;

	argv = argv_split(GFP_KERNEL, field, &argc);
	if (!argv)
		return -ENOMEM;

	if (!strcmp(argv[pos], "__data_loc") ||
	    !strcmp(argv[pos], "__rel_loc")) {
		if (++pos >= argc)
			goto error;
	}
	if (!strcmp(argv[pos], "unsigned")) {
		if (++pos >= argc)
			goto error;
	} else if (!strcmp(argv[pos], "struct")) {
		is_struct = true;
		if (++pos >= argc)
			goto error;
	}
	if (++pos >= argc)
		goto error;
	name_idx = pos++;
	if (pos < argc) {	// size
		if (!is_struct)
			goto error;
		if (kstrtou32(argv[pos++], 10, &size))
			goto error;
	} else
		size = user_field_size(argv[name_idx - 1]);

	if (pos != argc)
		goto error;
	
	// note that type index is always 0 and size must be converted.
	user_event_add_field(user, argv, name_idx, saved_offset, size,
				    type[0] != 'u', FILTER_OTHER);

	ret = 0;
error:
	argv_free(argv);
	return ret;

(This also requires to simplify user_field_size() and remove FIELD_DEPTH_*)
What would you think?
	
> + */
> +static int user_event_parse_field(char *field, struct user_event *user,
> +				  u32 *offset)
> +{
> +	char *part, *type, *name;
> +	u32 depth = 0, saved_offset = *offset;
> +	int len, size = -EINVAL;
> +	bool is_struct = false;
> +
> +	field = skip_spaces(field);
> +
> +	if (*field == 0)
> +		return 0;
> +
> +	/* Handle types that have a space within */
> +	len = str_has_prefix(field, "unsigned ");
> +	if (len)
> +		goto skip_next;
> +
> +	len = str_has_prefix(field, "struct ");
> +	if (len) {
> +		is_struct = true;
> +		goto skip_next;
> +	}
> +
> +	len = str_has_prefix(field, "__data_loc unsigned ");
> +	if (len)
> +		goto skip_next;
> +
> +	len = str_has_prefix(field, "__data_loc ");
> +	if (len)
> +		goto skip_next;
> +
> +	len = str_has_prefix(field, "__rel_loc unsigned ");
> +	if (len)
> +		goto skip_next;
> +
> +	len = str_has_prefix(field, "__rel_loc ");
> +	if (len)
> +		goto skip_next;
> +
> +	goto parse;
> +skip_next:
> +	type = field;
> +	field = strpbrk(field + len, " ");
> +
> +	if (field == NULL)
> +		return -EINVAL;
> +
> +	*field++ = 0;
> +	depth++;
> +parse:
> +	while ((part = strsep(&field, " ")) != NULL) {
> +		switch (depth++) {
> +		case FIELD_DEPTH_TYPE:
> +			type = part;
> +			break;
> +		case FIELD_DEPTH_NAME:
> +			name = part;
> +			break;
> +		case FIELD_DEPTH_SIZE:
> +			if (!is_struct)
> +				return -EINVAL;
> +
> +			if (kstrtou32(part, 10, &size))
> +				return -EINVAL;
> +			break;
> +		default:
> +			return -EINVAL;
> +		}
> +	}
> +
> +	if (depth < FIELD_DEPTH_SIZE)
> +		return -EINVAL;
> +
> +	if (depth == FIELD_DEPTH_SIZE)
> +		size = user_field_size(type);
> +
> +	if (size == 0)
> +		return -EINVAL;
> +
> +	if (size < 0)
> +		return size;
> +
> +	*offset = saved_offset + size;
> +
> +	return user_event_add_field(user, type, name, saved_offset, size,
> +				    type[0] != 'u', FILTER_OTHER);
> +}

[..]
> +
> +/*
> + * Register callback for our events from tracing sub-systems.
> + */
> +static int user_event_reg(struct trace_event_call *call,
> +			  enum trace_reg type,
> +			  void *data)
> +{
> +	struct user_event *user = (struct user_event *)call->data;
> +	int ret = 0;
> +
> +	if (!user)
> +		return -ENOENT;
> +
> +	switch (type) {
> +	case TRACE_REG_REGISTER:
> +		ret = tracepoint_probe_register(call->tp,
> +						call->class->probe,
> +						data);
> +		if (!ret)
> +			goto inc;
> +		break;
> +
> +	case TRACE_REG_UNREGISTER:
> +		tracepoint_probe_unregister(call->tp,
> +					    call->class->probe,
> +					    data);
> +		goto dec;
> +
> +#ifdef CONFIG_PERF_EVENTS
> +	case TRACE_REG_PERF_REGISTER:
> +	case TRACE_REG_PERF_UNREGISTER:
> +	case TRACE_REG_PERF_OPEN:
> +	case TRACE_REG_PERF_CLOSE:
> +	case TRACE_REG_PERF_ADD:
> +	case TRACE_REG_PERF_DEL:
> +		break;
> +#endif

At this moment (in this patch), you can just add a default case,
or just ignore it, because it does nothing.

> +	}
> +
> +	return ret;
> +inc:
> +	atomic_inc(&user->refcnt);
> +	update_reg_page_for(user);
> +	return 0;
> +dec:
> +	update_reg_page_for(user);
> +	atomic_dec(&user->refcnt);
> +	return 0;
> +}
> +

[..]
> +/*
> + * Validates the user payload and writes via iterator.
> + */
> +static ssize_t user_events_write_core(struct file *file, struct iov_iter *i)
> +{
> +	struct user_event_refs *refs;
> +	struct user_event *user = NULL;
> +	struct tracepoint *tp;
> +	ssize_t ret = i->count;
> +	int idx;
> +
> +	if (unlikely(copy_from_iter(&idx, sizeof(idx), i) != sizeof(idx)))
> +		return -EFAULT;
> +
> +	rcu_read_lock_sched();
> +
> +	refs = rcu_dereference_sched(file->private_data);
> +
> +	/*
> +	 * The refs->events array is protected by RCU, and new items may be
> +	 * added. But the user retrieved from indexing into the events array
> +	 * shall be immutable while the file is opened.
> +	 */
> +	if (likely(refs && idx < refs->count))
> +		user = refs->events[idx];
> +
> +	rcu_read_unlock_sched();
> +
> +	if (unlikely(user == NULL))
> +		return -ENOENT;
> +
> +	tp = &user->tracepoint;
> +
> +	/*
> +	 * It's possible key.enabled disables after this check, however
> +	 * we don't mind if a few events are included in this condition.
> +	 */
> +	if (likely(atomic_read(&tp->key.enabled) > 0)) {
> +		struct tracepoint_func *probe_func_ptr;
> +		user_event_func_t probe_func;
> +		void *tpdata;
> +		void *kdata;
> +		u32 datalen;
> +
> +		kdata = kmalloc(i->count, GFP_KERNEL);
> +
> +		if (unlikely(!kdata))
> +			return -ENOMEM;
> +
> +		datalen = copy_from_iter(kdata, i->count, i);

Don't we need to add this datalen to ret?

> +
> +		rcu_read_lock_sched();
> +
> +		probe_func_ptr = rcu_dereference_sched(tp->funcs);
> +
> +		if (probe_func_ptr) {
> +			do {
> +				probe_func = probe_func_ptr->func;
> +				tpdata = probe_func_ptr->data;
> +				probe_func(user, kdata, datalen, tpdata);
> +			} while ((++probe_func_ptr)->func);
> +		}
> +
> +		rcu_read_unlock_sched();
> +
> +		kfree(kdata);
> +	}
> +
> +	return ret;
> +}

Thank you,



-- 
Masami Hiramatsu <mhiramat@...nel.org>