Message-ID: <CACSApvYq=r3YAyZ_XceoRz1BuU+Q+MypXaG_S1fMoYCyFEpbrw@mail.gmail.com>
Date: Tue, 22 Nov 2022 11:18:05 -0500
From: Soheil Hassas Yeganeh <soheil@...gle.com>
To: Paolo Abeni <pabeni@...hat.com>
Cc: linux-fsdevel@...r.kernel.org, Al Viro <viro@...iv.linux.org.uk>,
Davidlohr Bueso <dave@...olabs.net>,
Jason Baron <jbaron@...mai.com>,
Roman Penyaev <rpenyaev@...e.de>, netdev@...r.kernel.org,
Carlos Maiolino <cmaiolino@...hat.com>
Subject: Re: [REPOST PATCH] epoll: use refcount to reduce ep_mutex contention
On Tue, Nov 22, 2022 at 10:43 AM Paolo Abeni <pabeni@...hat.com> wrote:
>
> We are observing huge contention on the epmutex during an http
> connection/rate test:
>
> 83.17% 0.25% nginx [kernel.kallsyms] [k] entry_SYSCALL_64_after_hwframe
> [...]
> |--66.96%--__fput
> |--60.04%--eventpoll_release_file
> |--58.41%--__mutex_lock.isra.6
> |--56.56%--osq_lock
>
> The application is multi-threaded, creates a new epoll entry for
> each incoming connection, and does not delete it before the
> connection shutdown - that is, before the connection's fd close().
>
> Many different threads compete frequently for the epmutex lock,
> affecting the overall performance.
>
> To reduce the contention this patch introduces explicit reference counting
> for the eventpoll struct. Each registered event acquires a reference,
> and references are released at ep_remove() time. ep_free() no longer
> touches the event RB tree; it just unregisters the existing callbacks
> and drops a reference to the ep struct. The struct itself is freed when
> the reference count reaches 0. The reference count updates are protected
> by the mtx mutex so no additional atomic operations are needed.
>
> Since ep_free() no longer competes with eventpoll_release_file()
> for epitem removal, we can drop the epmutex usage at disposal time.
>
> With the patched kernel, in the same connection/rate scenario, the mutex
> operations disappear from the perf report, and the measured connections/rate
> grows by ~60%.
I tried this patch locally and can reproduce the results. Thank you
for the nice optimization!
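
For reference, the workload pattern described in the commit message
boils down to something like this in userspace (simplified sketch, not
the actual test program):

#include <sys/epoll.h>
#include <unistd.h>

/*
 * One epoll watch is added per accepted connection, and the fd is later
 * close()d without EPOLL_CTL_DEL, so the epitem is torn down from the
 * eventpoll_release_file() path, which is where the epmutex contention
 * shows up in the perf report above.
 */
static void handle_conn(int epfd, int conn_fd)
{
	struct epoll_event ev = {
		.events = EPOLLIN,
		.data.fd = conn_fd,
	};

	epoll_ctl(epfd, EPOLL_CTL_ADD, conn_fd, &ev);
	/* ... serve the request from one of many worker threads ... */
	close(conn_fd);		/* no EPOLL_CTL_DEL before close */
}

With many threads doing this concurrently, the eventpoll_release_file()
calls all used to serialize on epmutex.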
> Tested-by: Xiumei Mu <xmu@...hat.com>
> Signed-off-by: Paolo Abeni <pabeni@...hat.com>
> ---
> This is just a repost reaching out to more recipients,
> as suggested by Carlos.
>
> Previous post at:
>
> https://lore.kernel.org/linux-fsdevel/20221122102726.4jremle54zpcapia@andromeda/T/#m6f98d4ccbe0a385d10c04fd4018e782b793944e6
> ---
> fs/eventpoll.c | 113 ++++++++++++++++++++++++++++---------------------
> 1 file changed, 64 insertions(+), 49 deletions(-)
>
> diff --git a/fs/eventpoll.c b/fs/eventpoll.c
> index 52954d4637b5..6e415287aeb8 100644
> --- a/fs/eventpoll.c
> +++ b/fs/eventpoll.c
> @@ -226,6 +226,12 @@ struct eventpoll {
> /* tracks wakeup nests for lockdep validation */
> u8 nests;
> #endif
> +
> + /*
> + * protected by mtx, used to avoid races between ep_free() and
> + * ep_eventpoll_release()
> + */
> + unsigned int refcount;
nitpick: Given that napi_id and nests are both macro-protected, you
might want to pull refcount up right after min_wait_ts, roughly as in
the sketch below.
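
i.e. something like this (just sketching the intent; neighbouring
fields elided):

	u64 min_wait_ts;

	/*
	 * protected by mtx, used to avoid races between ep_free() and
	 * ep_eventpoll_release()
	 */
	unsigned int refcount;

#ifdef CONFIG_NET_RX_BUSY_POLL
	/* used to track busy poll napi_id */
	unsigned int napi_id;
#endif

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	/* tracks wakeup nests for lockdep validation */
	u8 nests;
#endif

so the unconditional field doesn't end up after the config-dependent ones.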
> };
>
> /* Wrapper struct used by poll queueing */
> @@ -240,9 +246,6 @@ struct ep_pqueue {
> /* Maximum number of epoll watched descriptors, per user */
> static long max_user_watches __read_mostly;
>
> -/*
> - * This mutex is used to serialize ep_free() and eventpoll_release_file().
> - */
> static DEFINE_MUTEX(epmutex);
>
> static u64 loop_check_gen = 0;
> @@ -555,8 +558,7 @@ static void ep_remove_wait_queue(struct eppoll_entry *pwq)
>
> /*
> * This function unregisters poll callbacks from the associated file
> - * descriptor. Must be called with "mtx" held (or "epmutex" if called from
> - * ep_free).
> + * descriptor. Must be called with "mtx" held.
> */
> static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
> {
> @@ -679,11 +681,37 @@ static void epi_rcu_free(struct rcu_head *head)
> kmem_cache_free(epi_cache, epi);
> }
>
> +static void ep_get(struct eventpoll *ep)
> +{
> + ep->refcount++;
> +}
> +
> +/*
> + * Returns true if the event poll can be disposed
> + */
> +static bool ep_put(struct eventpoll *ep)
> +{
> + if (--ep->refcount)
> + return false;
> +
> + WARN_ON_ONCE(!RB_EMPTY_ROOT(&ep->rbr.rb_root));
> + return true;
> +}
> +
> +static void ep_dispose(struct eventpoll *ep)
> +{
> + mutex_destroy(&ep->mtx);
> + free_uid(ep->user);
> + wakeup_source_unregister(ep->ws);
> + kfree(ep);
> +}
> +
> /*
> * Removes a "struct epitem" from the eventpoll RB tree and deallocates
> * all the associated resources. Must be called with "mtx" held.
> + * Returns true if the eventpoll can be disposed.
> */
> -static int ep_remove(struct eventpoll *ep, struct epitem *epi)
> +static bool ep_remove(struct eventpoll *ep, struct epitem *epi)
> {
> struct file *file = epi->ffd.file;
> struct epitems_head *to_free;
> @@ -731,28 +759,20 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
> call_rcu(&epi->rcu, epi_rcu_free);
>
> percpu_counter_dec(&ep->user->epoll_watches);
> -
> - return 0;
> + return ep_put(ep);
> }
>
> static void ep_free(struct eventpoll *ep)
> {
> struct rb_node *rbp;
> struct epitem *epi;
> + bool dispose;
>
> /* We need to release all tasks waiting for these file */
> if (waitqueue_active(&ep->poll_wait))
> ep_poll_safewake(ep, NULL);
>
> - /*
> - * We need to lock this because we could be hit by
> - * eventpoll_release_file() while we're freeing the "struct eventpoll".
> - * We do not need to hold "ep->mtx" here because the epoll file
> - * is on the way to be removed and no one has references to it
> - * anymore. The only hit might come from eventpoll_release_file() but
> - * holding "epmutex" is sufficient here.
> - */
> - mutex_lock(&epmutex);
> + mutex_lock(&ep->mtx);
>
> /*
> * Walks through the whole tree by unregistering poll callbacks.
> @@ -765,26 +785,14 @@ static void ep_free(struct eventpoll *ep)
> }
>
> /*
> - * Walks through the whole tree by freeing each "struct epitem". At this
> - * point we are sure no poll callbacks will be lingering around, and also by
> - * holding "epmutex" we can be sure that no file cleanup code will hit
> - * us during this operation. So we can avoid the lock on "ep->lock".
> - * We do not need to lock ep->mtx, either, we only do it to prevent
> - * a lockdep warning.
> + * epitems in the rb tree are freed either with EPOLL_CTL_DEL
> + * or at the relevant file close time by eventpoll_release_file()
> */
> - mutex_lock(&ep->mtx);
> - while ((rbp = rb_first_cached(&ep->rbr)) != NULL) {
> - epi = rb_entry(rbp, struct epitem, rbn);
> - ep_remove(ep, epi);
> - cond_resched();
> - }
> + dispose = ep_put(ep);
> mutex_unlock(&ep->mtx);
>
> - mutex_unlock(&epmutex);
> - mutex_destroy(&ep->mtx);
> - free_uid(ep->user);
> - wakeup_source_unregister(ep->ws);
> - kfree(ep);
> + if (dispose)
> + ep_dispose(ep);
> }
>
> static int ep_eventpoll_release(struct inode *inode, struct file *file)
> @@ -905,6 +913,7 @@ void eventpoll_release_file(struct file *file)
> struct eventpoll *ep;
> struct epitem *epi;
> struct hlist_node *next;
> + bool dispose;
>
> /*
> * We don't want to get "file->f_lock" because it is not
> @@ -912,25 +921,18 @@ void eventpoll_release_file(struct file *file)
> * cleanup path, and this means that no one is using this file anymore.
> * So, for example, epoll_ctl() cannot hit here since if we reach this
> * point, the file counter already went to zero and fget() would fail.
> - * The only hit might come from ep_free() but by holding the mutex
> - * will correctly serialize the operation. We do need to acquire
> - * "ep->mtx" after "epmutex" because ep_remove() requires it when called
> - * from anywhere but ep_free().
> *
> * Besides, ep_remove() acquires the lock, so we can't hold it here.
> */
> - mutex_lock(&epmutex);
> - if (unlikely(!file->f_ep)) {
> - mutex_unlock(&epmutex);
> - return;
> - }
> hlist_for_each_entry_safe(epi, next, file->f_ep, fllink) {
> ep = epi->ep;
> - mutex_lock_nested(&ep->mtx, 0);
> - ep_remove(ep, epi);
> + mutex_lock(&ep->mtx);
> + dispose = ep_remove(ep, epi);
> mutex_unlock(&ep->mtx);
> +
> + if (dispose)
> + ep_dispose(ep);
> }
> - mutex_unlock(&epmutex);
> }
>
> static int ep_alloc(struct eventpoll **pep)
> @@ -953,6 +955,7 @@ static int ep_alloc(struct eventpoll **pep)
> ep->rbr = RB_ROOT_CACHED;
> ep->ovflist = EP_UNACTIVE_PTR;
> ep->user = user;
> + ep->refcount = 1;
>
> *pep = ep;
>
> @@ -1494,6 +1497,12 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
> if (tep)
> mutex_unlock(&tep->mtx);
>
> + /*
> + * ep_remove() calls in the later error paths can't lead to ep_dispose(),
> + * as overall they cause no net refcount change
> + */
> + ep_get(ep);
> +
> /* now check if we've created too many backpaths */
> if (unlikely(full_check && reverse_path_check())) {
> ep_remove(ep, epi);
> @@ -2165,10 +2174,16 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
> error = -EEXIST;
> break;
> case EPOLL_CTL_DEL:
> - if (epi)
> - error = ep_remove(ep, epi);
> - else
> + if (epi) {
> + /*
> + * The eventpoll itself is still alive: the refcount
> + * can't go to zero here.
> + */
> + WARN_ON_ONCE(ep_remove(ep, epi));
There are similar examples of calling ep_remove() without checking the
return value in ep_insert(). I believe we should add a similar comment
there, and maybe a WARN_ON_ONCE as well. It might also be worth adding
a small helper for this repeated pattern, something like the sketch
below?
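
Purely as a rough sketch (helper name made up):

static void ep_remove_safe(struct eventpoll *ep, struct epitem *epi)
{
	/*
	 * The caller holds another reference on the eventpoll, so it is
	 * still alive and disposal must not be triggered from here.
	 */
	WARN_ON_ONCE(ep_remove(ep, epi));
}

Then both the ep_insert() error paths and EPOLL_CTL_DEL could call it,
and the comment would live in one place.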
> + error = 0;
> + } else {
> error = -ENOENT;
> + }
> break;
> case EPOLL_CTL_MOD:
> if (epi) {
> --
> 2.38.1
>