Message-ID: <Z9kya48cbdP6EKmf@p200300d06f3e989853c6b164a74a5758.dip0.t-ipconnect.de>
Date: Tue, 18 Mar 2025 09:44:27 +0100
From: Frederic Weisbecker <frederic@...nel.org>
To: Peter Zijlstra <peterz@...radead.org>
Cc: LKML <linux-kernel@...r.kernel.org>, Ingo Molnar <mingo@...hat.com>,
	Arnaldo Carvalho de Melo <acme@...nel.org>,
	Namhyung Kim <namhyung@...nel.org>,
	Mark Rutland <mark.rutland@....com>,
	Alexander Shishkin <alexander.shishkin@...ux.intel.com>,
	Jiri Olsa <jolsa@...nel.org>, Ian Rogers <irogers@...gle.com>,
	Adrian Hunter <adrian.hunter@...el.com>,
	"Liang, Kan" <kan.liang@...ux.intel.com>,
	Sebastian Andrzej Siewior <bigeasy@...utronix.de>,
	Oleg Nesterov <oleg@...hat.com>, Yi Lai <yi1.lai@...ux.intel.com>,
	syzbot+3c4321e10eea460eb606@...kaller.appspotmail.com
Subject: Re: [PATCH] perf: Fix hang while freeing sigtrap event

Ping.

On Tue, Mar 04, 2025 at 02:54:46PM +0100, Frederic Weisbecker wrote:
> Perf can hang while freeing a sigtrap event if a related deferred
> signal hasn't been delivered before the file is closed:
> 
> perf_event_overflow()
>    task_work_add(perf_pending_task)
> 
> fput()
>    task_work_add(____fput())
> 
> task_work_run()
>     ____fput()
>         perf_release()
>             perf_event_release_kernel()
>                 _free_event()
>                     perf_pending_task_sync()
>                         task_work_cancel() -> FAILED
>                         rcuwait_wait_event()
> 
> Once task_work_run() is running, the list of pending callbacks is
> detached from the task_struct, and from that point on task_work_cancel()
> can't remove any pending but not yet started work item. Hence the
> task_work_cancel() failure and the hang on rcuwait_wait_event().
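>
> To make the failure mode concrete, here is a minimal userspace model of
> the task_work semantics involved. The names mirror the kernel API, but
> this is a standalone sketch, not kernel code: task_work_run() detaches
> the whole callback list up front, so a cancel issued from within one of
> the detached callbacks can no longer find a sibling work item:
>
>     #include <stdbool.h>
>     #include <stddef.h>
>     #include <stdio.h>
>
>     struct callback_head {
>         struct callback_head *next;
>         void (*func)(struct callback_head *);
>     };
>
>     /* Per-task list of pending works (kernel: task->task_works), LIFO. */
>     static struct callback_head *task_works;
>
>     static void task_work_add(struct callback_head *work)
>     {
>         work->next = task_works;
>         task_works = work;
>     }
>
>     /* Cancel can only search the list still attached to the task. */
>     static bool task_work_cancel(struct callback_head *work)
>     {
>         struct callback_head **pprev = &task_works;
>
>         for (; *pprev; pprev = &(*pprev)->next) {
>             if (*pprev == work) {
>                 *pprev = work->next;
>                 return true;
>             }
>         }
>         return false;   /* already detached by task_work_run() */
>     }
>
>     static void task_work_run(void)
>     {
>         /* Detach the whole list before running anything. */
>         struct callback_head *head = task_works;
>
>         task_works = NULL;
>         while (head) {
>             struct callback_head *next = head->next;
>
>             head->func(head);
>             head = next;
>         }
>     }
>
>     static void sigtrap_func(struct callback_head *h)
>     {
>         (void)h;
>         puts("perf_pending_task() runs");
>     }
>
>     /* Stands for perf_pending_task, queued from the overflow IRQ. */
>     static struct callback_head pending_sigtrap = { .func = sigtrap_func };
>
>     static void fput_func(struct callback_head *h)
>     {
>         (void)h;
>         /* ____fput() -> perf_release() -> ... -> perf_pending_task_sync() */
>         if (!task_work_cancel(&pending_sigtrap))
>             puts("cancel failed; rcuwait_wait_event() would block here"
>                  " and the next callback would never get to run");
>     }
>
>     int main(void)
>     {
>         struct callback_head fput_work = { .func = fput_func };
>
>         task_work_add(&pending_sigtrap); /* from the overflow IRQ */
>         task_work_add(&fput_work);       /* from fput(); LIFO => runs first */
>         task_work_run();
>         return 0;
>     }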
> 
> Task work could be changed to remove one work at a time, so that a work
> running on the current task can always cancel a pending one. However,
> the wait / wake design remains subject to inverted dependencies when
> remote targets are involved, as pictured by Oleg:
> 
> T1                                                      T2
> ---                                                    ---
> fd = perf_event_open(pid => T2->pid);                  fd = perf_event_open(pid => T1->pid);
> close(fd)                                              close(fd)
>     <IRQ>                                                  <IRQ>
>     perf_event_overflow()                                  perf_event_overflow()
>        task_work_add(perf_pending_task)                        task_work_add(perf_pending_task)
>     </IRQ>                                                 </IRQ>
>     fput()                                                 fput()
>         task_work_add(____fput())                              task_work_add(____fput())
> 
>     task_work_run()                                        task_work_run()
>         ____fput()                                             ____fput()
>             perf_release()                                         perf_release()
>                 perf_event_release_kernel()                            perf_event_release_kernel()
>                     _free_event()                                          _free_event()
>                         perf_pending_task_sync()                               perf_pending_task_sync()
>                             rcuwait_wait_event()                                   rcuwait_wait_event()
> 
> Therefore the only option left is to acquire a reference to the event
> upon queueing the perf task work and to release it from the task work
> itself, just as was done before 3a5465418f5f ("perf: Fix event leak upon
> exec and file release") but without reintroducing the leaks it fixed.
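>
> In other words, the lifetime rule becomes: whoever queues the task work
> holds a reference on the event, and the task work itself drops it. A
> small standalone C11 model of that rule follows (the kernel side uses
> atomic_long_inc_not_zero() and put_event(); the names here are merely
> illustrative):
>
>     #include <stdatomic.h>
>     #include <stdbool.h>
>     #include <stdlib.h>
>
>     struct event {
>         atomic_long refcount;
>         /* ... */
>     };
>
>     /* kernel: atomic_long_inc_not_zero(): only pin a still-live object. */
>     static bool get_event(struct event *e)
>     {
>         long old = atomic_load(&e->refcount);
>
>         while (old != 0)
>             if (atomic_compare_exchange_weak(&e->refcount, &old, old + 1))
>                 return true;
>         return false;
>     }
>
>     static void put_event(struct event *e)
>     {
>         if (atomic_fetch_sub(&e->refcount, 1) == 1)
>             free(e);    /* kernel: _free_event(), no waiting involved */
>     }
>
>     /* Queue side (kernel: __perf_event_overflow() after a successful
>      * task_work_add()): pin the event for the lifetime of the work. */
>     static void queue_pending(struct event *e)
>     {
>         if (!get_event(e))
>             abort();    /* caller holds a live reference; can't happen
>                            (kernel: WARN_ON_ONCE()) */
>     }
>
>     /* Work side (kernel: perf_pending_task()): drop the queueing
>      * reference, possibly freeing the event, instead of waking up a
>      * waiter that might never be woken. */
>     static void pending_work(struct event *e)
>     {
>         /* ... deliver the sigtrap ... */
>         put_event(e);
>     }
>
>     int main(void)
>     {
>         struct event *e = calloc(1, sizeof(*e));
>
>         atomic_store(&e->refcount, 1);  /* reference owned by the file */
>         queue_pending(e);               /* refcount: 2 */
>         put_event(e);                   /* close(): drop the file ref */
>         pending_work(e);                /* last ref: frees the event */
>         return 0;
>     }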
> 
> Some adjustments are necessary to make it work:
> 
> * A child event might dereference its parent upon freeing. Care must be
>   taken to release the parent last (the resulting put_event() is shown
>   after this list).
>
> * Some places that assume the event has no references left, and can
>   therefore be freed right away, must instead put the reference and
>   let the reference counting do its job.
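>
> For the first point, the resulting put_event() (assembled from the hunk
> in the diff below) caches the parent pointer before freeing, because
> freeing the child may still dereference its parent:
>
>     static void put_event(struct perf_event *event)
>     {
>         struct perf_event *parent;
>
>         if (!atomic_long_dec_and_test(&event->refcount))
>             return;
>
>         /* _free_event() on a child may dereference event->parent, so
>          * only drop the parent reference once the child is gone. */
>         parent = event->parent;
>         _free_event(event);
>
>         /* Matches the refcount bump in inherit_event() */
>         if (parent)
>             put_event(parent);
>     }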
> 
> Cc: Sebastian Andrzej Siewior <bigeasy@...utronix.de>
> Cc: Oleg Nesterov <oleg@...hat.com>
> Reported-by: "Yi Lai" <yi1.lai@...ux.intel.com>
> Closes: https://lore.kernel.org/all/Zx9Losv4YcJowaP%2F@ly-workstation/
> Reported-by: syzbot+3c4321e10eea460eb606@...kaller.appspotmail.com
> Closes: https://lore.kernel.org/all/673adf75.050a0220.87769.0024.GAE@google.com/
> Fixes: 3a5465418f5f ("perf: Fix event leak upon exec and file release")
> Signed-off-by: Frederic Weisbecker <frederic@...nel.org>
> ---
>  include/linux/perf_event.h |  1 -
>  kernel/events/core.c       | 64 +++++++++++---------------------------
>  2 files changed, 18 insertions(+), 47 deletions(-)
> 
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index 76f4265efee9..4e8970da6953 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -832,7 +832,6 @@ struct perf_event {
>  	struct irq_work			pending_disable_irq;
>  	struct callback_head		pending_task;
>  	unsigned int			pending_work;
> -	struct rcuwait			pending_work_wait;
>  
>  	atomic_t			event_limit;
>  
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index b2334d27511b..253791d99e21 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -5355,30 +5355,6 @@ static bool exclusive_event_installable(struct perf_event *event,
>  
>  static void perf_free_addr_filters(struct perf_event *event);
>  
> -static void perf_pending_task_sync(struct perf_event *event)
> -{
> -	struct callback_head *head = &event->pending_task;
> -
> -	if (!event->pending_work)
> -		return;
> -	/*
> -	 * If the task is queued to the current task's queue, we
> -	 * obviously can't wait for it to complete. Simply cancel it.
> -	 */
> -	if (task_work_cancel(current, head)) {
> -		event->pending_work = 0;
> -		local_dec(&event->ctx->nr_no_switch_fast);
> -		return;
> -	}
> -
> -	/*
> -	 * All accesses related to the event are within the same RCU section in
> -	 * perf_pending_task(). The RCU grace period before the event is freed
> -	 * will make sure all those accesses are complete by then.
> -	 */
> -	rcuwait_wait_event(&event->pending_work_wait, !event->pending_work, TASK_UNINTERRUPTIBLE);
> -}
> -
>  /* vs perf_event_alloc() error */
>  static void __free_event(struct perf_event *event)
>  {
> @@ -5433,7 +5409,6 @@ static void _free_event(struct perf_event *event)
>  {
>  	irq_work_sync(&event->pending_irq);
>  	irq_work_sync(&event->pending_disable_irq);
> -	perf_pending_task_sync(event);
>  
>  	unaccount_event(event);
>  
> @@ -5526,10 +5501,17 @@ static void perf_remove_from_owner(struct perf_event *event)
>  
>  static void put_event(struct perf_event *event)
>  {
> +	struct perf_event *parent;
> +
>  	if (!atomic_long_dec_and_test(&event->refcount))
>  		return;
>  
> +	parent = event->parent;
>  	_free_event(event);
> +
> +	/* Matches the refcount bump in inherit_event() */
> +	if (parent)
> +		put_event(parent);
>  }
>  
>  /*
> @@ -5613,11 +5595,6 @@ int perf_event_release_kernel(struct perf_event *event)
>  		if (tmp == child) {
>  			perf_remove_from_context(child, DETACH_GROUP);
>  			list_move(&child->child_list, &free_list);
> -			/*
> -			 * This matches the refcount bump in inherit_event();
> -			 * this can't be the last reference.
> -			 */
> -			put_event(event);
>  		} else {
>  			var = &ctx->refcount;
>  		}
> @@ -5643,7 +5620,8 @@ int perf_event_release_kernel(struct perf_event *event)
>  		void *var = &child->ctx->refcount;
>  
>  		list_del(&child->child_list);
> -		free_event(child);
> +		/* Last reference unless ->pending_task work is pending */
> +		put_event(child);
>  
>  		/*
>  		 * Wake any perf_event_free_task() waiting for this event to be
> @@ -5654,7 +5632,11 @@ int perf_event_release_kernel(struct perf_event *event)
>  	}
>  
>  no_ctx:
> -	put_event(event); /* Must be the 'last' reference */
> +	/*
> +	 * Last reference unless ->pending_task work is pending on this event
> +	 * or any of its children.
> +	 */
> +	put_event(event);
>  	return 0;
>  }
>  EXPORT_SYMBOL_GPL(perf_event_release_kernel);
> @@ -7065,12 +7047,6 @@ static void perf_pending_task(struct callback_head *head)
>  	struct perf_event *event = container_of(head, struct perf_event, pending_task);
>  	int rctx;
>  
> -	/*
> -	 * All accesses to the event must belong to the same implicit RCU read-side
> -	 * critical section as the ->pending_work reset. See comment in
> -	 * perf_pending_task_sync().
> -	 */
> -	rcu_read_lock();
>  	/*
>  	 * If we 'fail' here, that's OK, it means recursion is already disabled
>  	 * and we won't recurse 'further'.
> @@ -7081,9 +7057,8 @@ static void perf_pending_task(struct callback_head *head)
>  		event->pending_work = 0;
>  		perf_sigtrap(event);
>  		local_dec(&event->ctx->nr_no_switch_fast);
> -		rcuwait_wake_up(&event->pending_work_wait);
>  	}
> -	rcu_read_unlock();
> +	put_event(event);
>  
>  	if (rctx >= 0)
>  		perf_swevent_put_recursion_context(rctx);
> @@ -10030,6 +10005,7 @@ static int __perf_event_overflow(struct perf_event *event,
>  		    !task_work_add(current, &event->pending_task, notify_mode)) {
>  			event->pending_work = pending_id;
>  			local_inc(&event->ctx->nr_no_switch_fast);
> +			WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount));
>  
>  			event->pending_addr = 0;
>  			if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR))
> @@ -12382,7 +12358,6 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
>  	init_irq_work(&event->pending_irq, perf_pending_irq);
>  	event->pending_disable_irq = IRQ_WORK_INIT_HARD(perf_pending_disable);
>  	init_task_work(&event->pending_task, perf_pending_task);
> -	rcuwait_init(&event->pending_work_wait);
>  
>  	mutex_init(&event->mmap_mutex);
>  	raw_spin_lock_init(&event->addr_filters.lock);
> @@ -13512,8 +13487,7 @@ perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
>  		 * Kick perf_poll() for is_event_hup();
>  		 */
>  		perf_event_wakeup(parent_event);
> -		free_event(event);
> -		put_event(parent_event);
> +		put_event(event);
>  		return;
>  	}
>  
> @@ -13631,13 +13605,11 @@ static void perf_free_event(struct perf_event *event,
>  	list_del_init(&event->child_list);
>  	mutex_unlock(&parent->child_mutex);
>  
> -	put_event(parent);
> -
>  	raw_spin_lock_irq(&ctx->lock);
>  	perf_group_detach(event);
>  	list_del_event(event, ctx);
>  	raw_spin_unlock_irq(&ctx->lock);
> -	free_event(event);
> +	put_event(event);
>  }
>  
>  /*
> -- 
> 2.48.1
> 
