linux-kernel - Re: [PATCH 3/3] fork, vhost: Use CLONE_THREAD to fix freezer/ps regression

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <723ac1ba-31d2-92fe-4010-42d8cd70d5df@oracle.com>
Date:   Mon, 29 May 2023 14:46:57 -0500
From:   michael.christie@...cle.com
To:     Oleg Nesterov <oleg@...hat.com>
Cc:     "Eric W. Biederman" <ebiederm@...ssion.com>,
        Linus Torvalds <torvalds@...ux-foundation.org>,
        linux@...mhuis.info, nicolas.dichtel@...nd.com, axboe@...nel.dk,
        linux-kernel@...r.kernel.org,
        virtualization@...ts.linux-foundation.org, mst@...hat.com,
        sgarzare@...hat.com, jasowang@...hat.com, stefanha@...hat.com,
        brauner@...nel.org
Subject: Re: [PATCH 3/3] fork, vhost: Use CLONE_THREAD to fix freezer/ps
 regression

On 5/29/23 2:35 PM, Mike Christie wrote:
>> Hmm... If we you CLONE_THREAD the exiting vhost_worker() will auto-reap itself,
> Oh wait, are you saying that when we get auto-reaped then we would do the last
> fput and call the file_operations->release function right? We actually set
> task_struct->files = NULL for the vhost_task task_struct, so I think we call
> release a little sooner than you think.
> 
> vhost_task_create() sets kernel_clone_args->no_files, so the vhost_task task_struc
> that gets created works like kthreads where it doesn't do a CLONE_FILES and it
> doesn't do a dup_fd.
> 
> So when we do de_thread() -> zap_other_threads(), that will kill all the threads
> in the group right? So when they exit, it will call our release function since
> we don't have refcount on ourself.
> 

Just to make sure I'm on the same page now.

In the past thread when were discussing the patch below and you guys were saying
that it doesn't really ignore SIGKILL because we will hit the
SIGNAL_GROUP_EXIT/group_exec_task checks and the parent is going to exit, it was
on purpose.

Instead of a signal to tell me when do exit, I was using the parent exiting and doing
the last fput on the vhost device's file which calls our release function. That then
allowed the vhost code to use the vhost_task to handle the outstanding IOs and then
do vhost_task_should_stop to tell the vhost_task to exit when the oustanding IO
had completed.

On the vhost side of things it's really nice, because all the shutdown paths work
the same and we don't need rcu/locking in the main IO path to handle the vhost_task
exiting while we are using it.


diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 537cbf9a2ade..e0f5ac90a228 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -29,7 +29,6 @@ struct kernel_clone_args {
 	u32 io_thread:1;
 	u32 user_worker:1;
 	u32 no_files:1;
-	u32 ignore_signals:1;
 	unsigned long stack;
 	unsigned long stack_size;
 	unsigned long tls;
diff --git a/kernel/fork.c b/kernel/fork.c
index ed4e01daccaa..fd2970b598b2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2336,8 +2336,15 @@ __latent_entropy struct task_struct *copy_process(
 	p->flags &= ~PF_KTHREAD;
 	if (args->kthread)
 		p->flags |= PF_KTHREAD;
-	if (args->user_worker)
+	if (args->user_worker) {
+		/*
+		 * User worker are similar to io_threads but they do not
+		 * support signals and cleanup is driven via another kernel
+		 * interface so even SIGKILL is blocked.
+		 */
 		p->flags |= PF_USER_WORKER;
+		siginitsetinv(&p->blocked, 0);
+	}
 	if (args->io_thread) {
 		/*
 		 * Mark us an IO worker, and block any signal that isn't
@@ -2517,8 +2524,8 @@ __latent_entropy struct task_struct *copy_process(
 	if (retval)
 		goto bad_fork_cleanup_io;
 
-	if (args->ignore_signals)
-		ignore_signals(p);
+	if (args->user_worker)
+		p->flags |= PF_NOFREEZE;
 
 	stackleak_task_init(p);
 
@@ -2860,7 +2867,6 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
 		.fn		= fn,
 		.fn_arg		= arg,
 		.io_thread	= 1,
-		.user_worker	= 1,
 	};
 
 	return copy_process(NULL, 0, node, &args);
diff --git a/kernel/signal.c b/kernel/signal.c
index 8f6330f0e9ca..bc7e26072437 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -995,6 +995,19 @@ static inline bool wants_signal(int sig, struct task_struct *p)
 	return task_curr(p) || !task_sigpending(p);
 }
 
+static void try_set_pending_sigkill(struct task_struct *t)
+{
+	/*
+	 * User workers don't support signals and their exit is driven through
+	 * their kernel layer, so by default block even SIGKILL.
+	 */
+	if (sigismember(&t->blocked, SIGKILL))
+		return;
+
+	sigaddset(&t->pending.signal, SIGKILL);
+	signal_wake_up(t, 1);
+}
+
 static void complete_signal(int sig, struct task_struct *p, enum pid_type type)
 {
 	struct signal_struct *signal = p->signal;
@@ -1055,8 +1068,7 @@ static void complete_signal(int sig, struct task_struct *p, enum pid_type type)
 			t = p;
 			do {
 				task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
-				sigaddset(&t->pending.signal, SIGKILL);
-				signal_wake_up(t, 1);
+				try_set_pending_sigkill(t);
 			} while_each_thread(p, t);
 			return;
 		}
@@ -1373,8 +1385,7 @@ int zap_other_threads(struct task_struct *p)
 		/* Don't bother with already dead threads */
 		if (t->exit_state)
 			continue;
-		sigaddset(&t->pending.signal, SIGKILL);
-		signal_wake_up(t, 1);
+		try_set_pending_sigkill(t);
 	}
 
 	return count;
diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c
index b7cbd66f889e..2d8d3ebaec4d 100644
--- a/kernel/vhost_task.c
+++ b/kernel/vhost_task.c
@@ -75,13 +75,13 @@ struct vhost_task *vhost_task_create(int (*fn)(void *), void *arg,
 				     const char *name)
 {
 	struct kernel_clone_args args = {
-		.flags		= CLONE_FS | CLONE_UNTRACED | CLONE_VM,
+		.flags		= CLONE_FS | CLONE_UNTRACED | CLONE_VM |
+				  CLONE_THREAD | CLONE_SIGHAND,
 		.exit_signal	= 0,
 		.fn		= vhost_task_fn,
 		.name		= name,
 		.user_worker	= 1,
 		.no_files	= 1,
-		.ignore_signals	= 1,
 	};
 	struct vhost_task *vtsk;
 	struct task_struct *tsk;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d257916f39e5..255a2147e5c1 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1207,12 +1207,12 @@ void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason)
 	DEFINE_WAIT(wait);
 
 	/*
-	 * Do not throttle user workers, kthreads other than kswapd or
+	 * Do not throttle IO/user workers, kthreads other than kswapd or
 	 * workqueues. They may be required for reclaim to make
 	 * forward progress (e.g. journalling workqueues or kthreads).
 	 */
 	if (!current_is_kswapd() &&
-	    current->flags & (PF_USER_WORKER|PF_KTHREAD)) {
+	    current->flags & (PF_USER_WORKER|PF_IO_WORKER|PF_KTHREAD)) {
 		cond_resched();
 		return;
 	}