linux-kernel - Re: [PATCH v11 8/8] vhost: use vhost

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <8bfb7d1d-f7d0-94ca-4777-e31a2003027a@oracle.com>
Date:   Mon, 15 May 2023 22:53:15 -0500
From:   Mike Christie <michael.christie@...cle.com>
To:     Linus Torvalds <torvalds@...ux-foundation.org>,
        Oleg Nesterov <oleg@...hat.com>
Cc:     Christian Brauner <brauner@...nel.org>,
        Thorsten Leemhuis <linux@...mhuis.info>,
        nicolas.dichtel@...nd.com,
        Linux kernel regressions list <regressions@...ts.linux.dev>,
        hch@...radead.org, stefanha@...hat.com, jasowang@...hat.com,
        mst@...hat.com, sgarzare@...hat.com,
        virtualization@...ts.linux-foundation.org, ebiederm@...ssion.com,
        konrad.wilk@...cle.com, linux-kernel@...r.kernel.org,
        Jens Axboe <axboe@...nel.dk>
Subject: Re: [PATCH v11 8/8] vhost: use vhost_tasks for worker threads

On 5/15/23 5:54 PM, Linus Torvalds wrote:
> On Mon, May 15, 2023 at 3:23 PM Mike Christie
> <michael.christie@...cle.com> wrote:
>>
>> The vhost layer really doesn't want any signals and wants to work like kthreads
>> for that case. To make it really simple can we do something like this where it
>> separates user and io worker behavior where the major diff is how they handle
>> signals and exit. I also included a fix for the freeze case:
> 
> I don't love the SIGKILL special case, but I also don't find this
> deeply offensive. So if this is what it takes, I'm ok with it.
> 
> I wonder if we could make that special case simply check for "is
> SIGKILL blocked" instead? No normal case will cause that, and it means

Yeah, it's doable. Updated below.

> that a PF_USER_WORKER thread could decide per-thread what it wants to
> do wrt SIGKILL.
> 
> Christian? And I guess we should Cc: Oleg too, since the signal parts
> are an area he's familiar with and has worked on.. Eric Biederman has
> already been on the list and has also been involved
> 
> Oleg: see
> 
>   https://lore.kernel.org/lkml/122b597e-a5fa-daf7-27bb-6f04fa98d496@oracle.com/
> 
> for the context here.

Oleg and Christian,


Below is an updated patch that doesn't check for PF_USER_WORKER in the
signal.c code and instead will check for if we have blocked the signal.




diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 537cbf9a2ade..e0f5ac90a228 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -29,7 +29,6 @@ struct kernel_clone_args {
 	u32 io_thread:1;
 	u32 user_worker:1;
 	u32 no_files:1;
-	u32 ignore_signals:1;
 	unsigned long stack;
 	unsigned long stack_size;
 	unsigned long tls;
diff --git a/kernel/fork.c b/kernel/fork.c
index ed4e01daccaa..fd2970b598b2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2336,8 +2336,15 @@ __latent_entropy struct task_struct *copy_process(
 	p->flags &= ~PF_KTHREAD;
 	if (args->kthread)
 		p->flags |= PF_KTHREAD;
-	if (args->user_worker)
+	if (args->user_worker) {
+		/*
+		 * User worker are similar to io_threads but they do not
+		 * support signals and cleanup is driven via another kernel
+		 * interface so even SIGKILL is blocked.
+		 */
 		p->flags |= PF_USER_WORKER;
+		siginitsetinv(&p->blocked, 0);
+	}
 	if (args->io_thread) {
 		/*
 		 * Mark us an IO worker, and block any signal that isn't
@@ -2517,8 +2524,8 @@ __latent_entropy struct task_struct *copy_process(
 	if (retval)
 		goto bad_fork_cleanup_io;
 
-	if (args->ignore_signals)
-		ignore_signals(p);
+	if (args->user_worker)
+		p->flags |= PF_NOFREEZE;
 
 	stackleak_task_init(p);
 
@@ -2860,7 +2867,6 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
 		.fn		= fn,
 		.fn_arg		= arg,
 		.io_thread	= 1,
-		.user_worker	= 1,
 	};
 
 	return copy_process(NULL, 0, node, &args);
diff --git a/kernel/signal.c b/kernel/signal.c
index 8f6330f0e9ca..bc7e26072437 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -995,6 +995,19 @@ static inline bool wants_signal(int sig, struct task_struct *p)
 	return task_curr(p) || !task_sigpending(p);
 }
 
+static void try_set_pending_sigkill(struct task_struct *t)
+{
+	/*
+	 * User workers don't support signals and their exit is driven through
+	 * their kernel layer, so by default block even SIGKILL.
+	 */
+	if (sigismember(&t->blocked, SIGKILL))
+		return;
+
+	sigaddset(&t->pending.signal, SIGKILL);
+	signal_wake_up(t, 1);
+}
+
 static void complete_signal(int sig, struct task_struct *p, enum pid_type type)
 {
 	struct signal_struct *signal = p->signal;
@@ -1055,8 +1068,7 @@ static void complete_signal(int sig, struct task_struct *p, enum pid_type type)
 			t = p;
 			do {
 				task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
-				sigaddset(&t->pending.signal, SIGKILL);
-				signal_wake_up(t, 1);
+				try_set_pending_sigkill(t);
 			} while_each_thread(p, t);
 			return;
 		}
@@ -1373,8 +1385,7 @@ int zap_other_threads(struct task_struct *p)
 		/* Don't bother with already dead threads */
 		if (t->exit_state)
 			continue;
-		sigaddset(&t->pending.signal, SIGKILL);
-		signal_wake_up(t, 1);
+		try_set_pending_sigkill(t);
 	}
 
 	return count;
diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c
index b7cbd66f889e..2d8d3ebaec4d 100644
--- a/kernel/vhost_task.c
+++ b/kernel/vhost_task.c
@@ -75,13 +75,13 @@ struct vhost_task *vhost_task_create(int (*fn)(void *), void *arg,
 				     const char *name)
 {
 	struct kernel_clone_args args = {
-		.flags		= CLONE_FS | CLONE_UNTRACED | CLONE_VM,
+		.flags		= CLONE_FS | CLONE_UNTRACED | CLONE_VM |
+				  CLONE_THREAD | CLONE_SIGHAND,
 		.exit_signal	= 0,
 		.fn		= vhost_task_fn,
 		.name		= name,
 		.user_worker	= 1,
 		.no_files	= 1,
-		.ignore_signals	= 1,
 	};
 	struct vhost_task *vtsk;
 	struct task_struct *tsk;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d257916f39e5..255a2147e5c1 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1207,12 +1207,12 @@ void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason)
 	DEFINE_WAIT(wait);
 
 	/*
-	 * Do not throttle user workers, kthreads other than kswapd or
+	 * Do not throttle IO/user workers, kthreads other than kswapd or
 	 * workqueues. They may be required for reclaim to make
 	 * forward progress (e.g. journalling workqueues or kthreads).
 	 */
 	if (!current_is_kswapd() &&
-	    current->flags & (PF_USER_WORKER|PF_KTHREAD)) {
+	    current->flags & (PF_USER_WORKER|PF_IO_WORKER|PF_KTHREAD)) {
 		cond_resched();
 		return;
 	}