linux-kernel - Re: 5.13-rt1 + KVM = WARNING: at fs/eventfd.c:74 eventfd

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives

Hash Suite: Windows password security audit tool. GUI, reports in PDF.

[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]

Message-ID: <87pmv23lru.ffs@nanos.tec.linutronix.de>
Date:   Wed, 28 Jul 2021 10:06:29 +0200
From:   Thomas Gleixner <tglx@...utronix.de>
To:     Paolo Bonzini <pbonzini@...hat.com>,
        Jason Wang <jasowang@...hat.com>,
        Daniel Bristot de Oliveira <bristot@...hat.com>,
        Sebastian Andrzej Siewior <bigeasy@...utronix.de>,
        "Michael S. Tsirkin" <mst@...hat.com>,
        Juri Lelli <jlelli@...hat.com>
Cc:     LKML <linux-kernel@...r.kernel.org>,
        Al Viro <viro@...iv.linux.org.uk>,
        He Zhe <zhe.he@...driver.com>
Subject: Re: 5.13-rt1 + KVM = WARNING: at fs/eventfd.c:74 eventfd_signal()

On Wed, Jul 14 2021 at 12:35, Paolo Bonzini wrote:
> On 14/07/21 11:23, Jason Wang wrote:
>   
> +static local_lock_t eventfd_wake_lock = INIT_LOCAL_LOCK(eventfd_wake_lock);
>   DEFINE_PER_CPU(int, eventfd_wake_count);
>   
>   static DEFINE_IDA(eventfd_ida);
> @@ -71,8 +73,11 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
>   	 * it returns true, the eventfd_signal() call should be deferred to a
>   	 * safe context.
>   	 */
> -	if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count)))
> +	local_lock(&eventfd_wake_lock);
> +	if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count))) {
> +		local_unlock(&eventfd_wake_lock);
>   		return 0;
> +	}
>   
>   	spin_lock_irqsave(&ctx->wqh.lock, flags);
>   	this_cpu_inc(eventfd_wake_count);
> @@ -83,6 +88,7 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
>   		wake_up_locked_poll(&ctx->wqh, EPOLLIN);
>   	this_cpu_dec(eventfd_wake_count);
>   	spin_unlock_irqrestore(&ctx->wqh.lock, flags);
> +	local_unlock(&eventfd_wake_lock);

Yes, that cures it, but if you think about what this wants to prevent,
then having the recursion counter per CPU is at least suboptimal.

Something like the untested below perhaps?

Thanks,

        tglx
---
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1695,7 +1695,7 @@ static int aio_poll_wake(struct wait_que
 		list_del(&iocb->ki_list);
 		iocb->ki_res.res = mangle_poll(mask);
 		req->done = true;
-		if (iocb->ki_eventfd && eventfd_signal_count()) {
+		if (iocb->ki_eventfd && !eventfd_signal_allowed()) {
 			iocb = NULL;
 			INIT_WORK(&req->work, aio_poll_put_work);
 			schedule_work(&req->work);
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -25,8 +25,6 @@
 #include <linux/idr.h>
 #include <linux/uio.h>
 
-DEFINE_PER_CPU(int, eventfd_wake_count);
-
 static DEFINE_IDA(eventfd_ida);
 
 struct eventfd_ctx {
@@ -67,21 +65,21 @@ struct eventfd_ctx {
 	 * Deadlock or stack overflow issues can happen if we recurse here
 	 * through waitqueue wakeup handlers. If the caller users potentially
 	 * nested waitqueues with custom wakeup handlers, then it should
-	 * check eventfd_signal_count() before calling this function. If
-	 * it returns true, the eventfd_signal() call should be deferred to a
+	 * check eventfd_signal_allowed() before calling this function. If
+	 * it returns false, the eventfd_signal() call should be deferred to a
 	 * safe context.
 	 */
-	if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count)))
+	if (WARN_ON_ONCE(current->in_eventfd_signal))
 		return 0;
 
 	spin_lock_irqsave(&ctx->wqh.lock, flags);
-	this_cpu_inc(eventfd_wake_count);
+	current->in_eventfd_signal = 1;
 	if (ULLONG_MAX - ctx->count < n)
 		n = ULLONG_MAX - ctx->count;
 	ctx->count += n;
 	if (waitqueue_active(&ctx->wqh))
 		wake_up_locked_poll(&ctx->wqh, EPOLLIN);
-	this_cpu_dec(eventfd_wake_count);
+	current->in_eventfd_signal = 0;
 	spin_unlock_irqrestore(&ctx->wqh.lock, flags);
 
 	return n;
--- a/include/linux/eventfd.h
+++ b/include/linux/eventfd.h
@@ -43,11 +43,9 @@ int eventfd_ctx_remove_wait_queue(struct
 				  __u64 *cnt);
 void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt);
 
-DECLARE_PER_CPU(int, eventfd_wake_count);
-
-static inline bool eventfd_signal_count(void)
+static inline bool eventfd_signal_allowed(void)
 {
-	return this_cpu_read(eventfd_wake_count);
+	return !current->in_eventfd_signal;
 }
 
 #else /* CONFIG_EVENTFD */
@@ -78,9 +76,9 @@ static inline int eventfd_ctx_remove_wai
 	return -ENOSYS;
 }
 
-static inline bool eventfd_signal_count(void)
+static inline bool eventfd_signal_allowed(void)
 {
-	return false;
+	return true;
 }
 
 static inline void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -863,6 +863,8 @@ struct task_struct {
 	/* Used by page_owner=on to detect recursion in page tracking. */
 	unsigned			in_page_owner:1;
 #endif
+	/* Recursion prevention for eventfd_signal() */
+	unsigned			in_eventfd_signal:1;
 
 	unsigned long			atomic_flags; /* Flags requiring atomic access. */