[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <8bfb7d1d-f7d0-94ca-4777-e31a2003027a@oracle.com>
Date: Mon, 15 May 2023 22:53:15 -0500
From: Mike Christie <michael.christie@...cle.com>
To: Linus Torvalds <torvalds@...ux-foundation.org>,
Oleg Nesterov <oleg@...hat.com>
Cc: Christian Brauner <brauner@...nel.org>,
Thorsten Leemhuis <linux@...mhuis.info>,
nicolas.dichtel@...nd.com,
Linux kernel regressions list <regressions@...ts.linux.dev>,
hch@...radead.org, stefanha@...hat.com, jasowang@...hat.com,
mst@...hat.com, sgarzare@...hat.com,
virtualization@...ts.linux-foundation.org, ebiederm@...ssion.com,
konrad.wilk@...cle.com, linux-kernel@...r.kernel.org,
Jens Axboe <axboe@...nel.dk>
Subject: Re: [PATCH v11 8/8] vhost: use vhost_tasks for worker threads
On 5/15/23 5:54 PM, Linus Torvalds wrote:
> On Mon, May 15, 2023 at 3:23 PM Mike Christie
> <michael.christie@...cle.com> wrote:
>>
>> The vhost layer really doesn't want any signals and wants to work like kthreads
>> for that case. To make it really simple can we do something like this where it
>> separates user and io worker behavior where the major diff is how they handle
>> signals and exit. I also included a fix for the freeze case:
>
> I don't love the SIGKILL special case, but I also don't find this
> deeply offensive. So if this is what it takes, I'm ok with it.
>
> I wonder if we could make that special case simply check for "is
> SIGKILL blocked" instead? No normal case will cause that, and it means
Yeah, it's doable. Updated below.
> that a PF_USER_WORKER thread could decide per-thread what it wants to
> do wrt SIGKILL.
>
> Christian? And I guess we should Cc: Oleg too, since the signal parts
> are an area he's familiar with and has worked on.. Eric Biederman has
> already been on the list and has also been involved
>
> Oleg: see
>
> https://lore.kernel.org/lkml/122b597e-a5fa-daf7-27bb-6f04fa98d496@oracle.com/
>
> for the context here.
Oleg and Christian,
Below is an updated patch that doesn't check for PF_USER_WORKER in the
signal.c code and instead will check for if we have blocked the signal.
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 537cbf9a2ade..e0f5ac90a228 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -29,7 +29,6 @@ struct kernel_clone_args {
u32 io_thread:1;
u32 user_worker:1;
u32 no_files:1;
- u32 ignore_signals:1;
unsigned long stack;
unsigned long stack_size;
unsigned long tls;
diff --git a/kernel/fork.c b/kernel/fork.c
index ed4e01daccaa..fd2970b598b2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2336,8 +2336,15 @@ __latent_entropy struct task_struct *copy_process(
p->flags &= ~PF_KTHREAD;
if (args->kthread)
p->flags |= PF_KTHREAD;
- if (args->user_worker)
+ if (args->user_worker) {
+ /*
+ * User worker are similar to io_threads but they do not
+ * support signals and cleanup is driven via another kernel
+ * interface so even SIGKILL is blocked.
+ */
p->flags |= PF_USER_WORKER;
+ siginitsetinv(&p->blocked, 0);
+ }
if (args->io_thread) {
/*
* Mark us an IO worker, and block any signal that isn't
@@ -2517,8 +2524,8 @@ __latent_entropy struct task_struct *copy_process(
if (retval)
goto bad_fork_cleanup_io;
- if (args->ignore_signals)
- ignore_signals(p);
+ if (args->user_worker)
+ p->flags |= PF_NOFREEZE;
stackleak_task_init(p);
@@ -2860,7 +2867,6 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
.fn = fn,
.fn_arg = arg,
.io_thread = 1,
- .user_worker = 1,
};
return copy_process(NULL, 0, node, &args);
diff --git a/kernel/signal.c b/kernel/signal.c
index 8f6330f0e9ca..bc7e26072437 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -995,6 +995,19 @@ static inline bool wants_signal(int sig, struct task_struct *p)
return task_curr(p) || !task_sigpending(p);
}
+static void try_set_pending_sigkill(struct task_struct *t)
+{
+ /*
+ * User workers don't support signals and their exit is driven through
+ * their kernel layer, so by default block even SIGKILL.
+ */
+ if (sigismember(&t->blocked, SIGKILL))
+ return;
+
+ sigaddset(&t->pending.signal, SIGKILL);
+ signal_wake_up(t, 1);
+}
+
static void complete_signal(int sig, struct task_struct *p, enum pid_type type)
{
struct signal_struct *signal = p->signal;
@@ -1055,8 +1068,7 @@ static void complete_signal(int sig, struct task_struct *p, enum pid_type type)
t = p;
do {
task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
- sigaddset(&t->pending.signal, SIGKILL);
- signal_wake_up(t, 1);
+ try_set_pending_sigkill(t);
} while_each_thread(p, t);
return;
}
@@ -1373,8 +1385,7 @@ int zap_other_threads(struct task_struct *p)
/* Don't bother with already dead threads */
if (t->exit_state)
continue;
- sigaddset(&t->pending.signal, SIGKILL);
- signal_wake_up(t, 1);
+ try_set_pending_sigkill(t);
}
return count;
diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c
index b7cbd66f889e..2d8d3ebaec4d 100644
--- a/kernel/vhost_task.c
+++ b/kernel/vhost_task.c
@@ -75,13 +75,13 @@ struct vhost_task *vhost_task_create(int (*fn)(void *), void *arg,
const char *name)
{
struct kernel_clone_args args = {
- .flags = CLONE_FS | CLONE_UNTRACED | CLONE_VM,
+ .flags = CLONE_FS | CLONE_UNTRACED | CLONE_VM |
+ CLONE_THREAD | CLONE_SIGHAND,
.exit_signal = 0,
.fn = vhost_task_fn,
.name = name,
.user_worker = 1,
.no_files = 1,
- .ignore_signals = 1,
};
struct vhost_task *vtsk;
struct task_struct *tsk;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d257916f39e5..255a2147e5c1 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1207,12 +1207,12 @@ void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason)
DEFINE_WAIT(wait);
/*
- * Do not throttle user workers, kthreads other than kswapd or
+ * Do not throttle IO/user workers, kthreads other than kswapd or
* workqueues. They may be required for reclaim to make
* forward progress (e.g. journalling workqueues or kthreads).
*/
if (!current_is_kswapd() &&
- current->flags & (PF_USER_WORKER|PF_KTHREAD)) {
+ current->flags & (PF_USER_WORKER|PF_IO_WORKER|PF_KTHREAD)) {
cond_resched();
return;
}
Powered by blists - more mailing lists