[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <a18e940d-8423-0294-23b4-f2702313f3eb@efficios.com>
Date: Fri, 28 Oct 2022 18:19:10 -0400
From: Mathieu Desnoyers <mathieu.desnoyers@...icios.com>
To: Beau Belgrave <beaub@...ux.microsoft.com>, rostedt@...dmis.org,
mhiramat@...nel.org, dcook@...ux.microsoft.com,
alanau@...ux.microsoft.com
Cc: linux-trace-devel@...r.kernel.org, linux-kernel@...r.kernel.org
Subject: Re: [RFC PATCH 2/2] tracing/user_events: Fixup enable faults asyncly
On 2022-10-27 18:40, Beau Belgrave wrote:
> When events are enabled within the various tracing facilities, such as
> ftrace/perf, the event_mutex is held. As events are enabled pages are
> accessed. We do not want page faults to occur under this lock. Instead
> queue the fault to a workqueue to be handled in a process context safe
> way without the lock.
>
> The enable address is disabled while the async fault-in occurs. This
> ensures that we don't attempt fault-in more than is necessary. Once the
> page has been faulted in, the address write is attempted again. If the
> page couldn't fault-in, then we wait until the next time the event is
> enabled to prevent any potential infinite loops.
I'm also unclear about how the system call initiating the enabled state
change is delayed (or not) when a page fault is queued.
I would expect that when a page fault is needed, after enqueuing work to
the worker thread, the system call initiating the state change would
somehow wait for a completion (after releasing the user events mutex).
That completion would be signaled by the worker thread either if the
page fault fails, or if the state change is done.
Thoughts ?
Thanks,
Mathieu
>
> Signed-off-by: Beau Belgrave <beaub@...ux.microsoft.com>
> ---
> kernel/trace/trace_events_user.c | 125 ++++++++++++++++++++++++++++++-
> 1 file changed, 121 insertions(+), 4 deletions(-)
>
> diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
> index 633f24c2a1ac..f1eb8101e053 100644
> --- a/kernel/trace/trace_events_user.c
> +++ b/kernel/trace/trace_events_user.c
> @@ -81,11 +81,22 @@ struct user_event_enabler {
> struct list_head link;
> struct mm_struct *mm;
> struct file *file;
> + refcount_t refcnt;
> unsigned long enable_addr;
> unsigned int enable_bit: 5,
> - __reserved: 27;
> + __reserved: 26,
> + disabled: 1;
> };
>
> +/* Used for asynchronous faulting in of pages */
> +struct user_event_enabler_fault {
> + struct work_struct work;
> + struct user_event_enabler *enabler;
> + struct user_event *event;
> +};
> +
> +static struct kmem_cache *fault_cache;
> +
> /*
> * Stores per-event properties, as users register events
> * within a file a user_event might be created if it does not
> @@ -236,6 +247,19 @@ static void user_event_enabler_destroy(struct user_event_enabler *enabler)
> kfree(enabler);
> }
>
> +static __always_inline struct user_event_enabler
> +*user_event_enabler_get(struct user_event_enabler *enabler)
> +{
> + refcount_inc(&enabler->refcnt);
> + return enabler;
> +}
> +
> +static void user_event_enabler_put(struct user_event_enabler *enabler)
> +{
> + if (refcount_dec_and_test(&enabler->refcnt))
> + user_event_enabler_destroy(enabler);
> +}
> +
> static void user_event_enabler_remove(struct file *file,
> struct user_event *user)
> {
> @@ -249,13 +273,93 @@ static void user_event_enabler_remove(struct file *file,
> if (enabler->file != file)
> continue;
>
> + enabler->disabled = 0;
> list_del(&enabler->link);
> - user_event_enabler_destroy(enabler);
> + user_event_enabler_put(enabler);
> }
>
> mutex_unlock(&event_mutex);
> }
>
> +static void user_event_enabler_write(struct user_event_enabler *enabler,
> + struct user_event *user);
> +
> +static void user_event_enabler_fault_fixup(struct work_struct *work)
> +{
> + struct user_event_enabler_fault *fault = container_of(
> + work, struct user_event_enabler_fault, work);
> + struct user_event_enabler *enabler = fault->enabler;
> + struct user_event *user = fault->event;
> + struct mm_struct *mm = enabler->mm;
> + unsigned long uaddr = enabler->enable_addr;
> + bool unlocked = false;
> + int ret;
> +
> + might_sleep();
> +
> + mmap_read_lock(mm);
> +
> + ret = fixup_user_fault(mm, uaddr, FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE,
> + &unlocked);
> +
> + mmap_read_unlock(mm);
> +
> + if (ret)
> + pr_warn("user_events: Fixup fault failed with %d "
> + "for mm: 0x%pK offset: 0x%llx event: %s\n", ret, mm,
> + (unsigned long long)uaddr, EVENT_NAME(user));
> +
> + /* Prevent state changes from racing */
> + mutex_lock(&event_mutex);
> +
> + /*
> + * If we managed to get the page, re-issue the write. We do not
> + * want to get into a possible infinite loop, which is why we only
> + * attempt again directly if the page came in. If we couldn't get
> + * the page here, then we will try again the next time the event is
> + * enabled/disabled.
> + */
> + enabler->disabled = 0;
> +
> + if (!ret)
> + user_event_enabler_write(enabler, user);
> +
> + mutex_unlock(&event_mutex);
> +
> + refcount_dec(&user->refcnt);
> + user_event_enabler_put(enabler);
> + kmem_cache_free(fault_cache, fault);
> +}
> +
> +static bool user_event_enabler_queue_fault(struct user_event_enabler *enabler,
> + struct user_event *user)
> +{
> + struct user_event_enabler_fault *fault;
> +
> + fault = kmem_cache_zalloc(fault_cache, GFP_NOWAIT | __GFP_NOWARN);
> +
> + if (!fault)
> + return false;
> +
> + INIT_WORK(&fault->work, user_event_enabler_fault_fixup);
> + fault->enabler = user_event_enabler_get(enabler);
> + fault->event = user;
> +
> + refcount_inc(&user->refcnt);
> + enabler->disabled = 1;
> +
> + if (!schedule_work(&fault->work)) {
> + enabler->disabled = 0;
> + refcount_dec(&user->refcnt);
> + user_event_enabler_put(enabler);
> + kmem_cache_free(fault_cache, fault);
> +
> + return false;
> + }
> +
> + return true;
> +}
> +
> static void user_event_enabler_write(struct user_event_enabler *enabler,
> struct user_event *user)
> {
> @@ -266,6 +370,11 @@ static void user_event_enabler_write(struct user_event_enabler *enabler,
> void *kaddr;
> int ret;
>
> + lockdep_assert_held(&event_mutex);
> +
> + if (unlikely(enabler->disabled))
> + return;
> +
> mmap_read_lock(mm);
>
> ret = pin_user_pages_remote(mm, uaddr, 1, FOLL_WRITE | FOLL_NOFAULT,
> @@ -273,8 +382,10 @@ static void user_event_enabler_write(struct user_event_enabler *enabler,
>
> mmap_read_unlock(mm);
>
> - if (ret <= 0) {
> - pr_warn("user_events: Enable write failed\n");
> + if (unlikely(ret <= 0)) {
> + if (!user_event_enabler_queue_fault(enabler, user))
> + pr_warn("user_events: Unable to queue fault handler\n");
> +
> return;
> }
>
> @@ -321,6 +432,7 @@ static struct user_event_enabler
> enabler->file = file;
> enabler->enable_addr = (unsigned long)reg->enable_addr;
> enabler->enable_bit = reg->enable_bit;
> + refcount_set(&enabler->refcnt, 1);
>
> /* Prevents state changes from racing with new enablers */
> mutex_lock(&event_mutex);
> @@ -1902,6 +2014,11 @@ static int __init trace_events_user_init(void)
> {
> int ret;
>
> + fault_cache = KMEM_CACHE(user_event_enabler_fault, 0);
> +
> + if (!fault_cache)
> + return -ENOMEM;
> +
> init_group = user_event_group_create(&init_user_ns);
>
> if (!init_group)
--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
Powered by blists - more mailing lists