Date:   Fri, 2 Oct 2020 14:14:24 -0600
From:   Jens Axboe <axboe@...nel.dk>
To:     Thomas Gleixner <tglx@...utronix.de>,
        Oleg Nesterov <oleg@...hat.com>
Cc:     linux-kernel@...r.kernel.org, io-uring@...r.kernel.org,
        peterz@...radead.org
Subject: Re: [PATCH 3/3] task_work: use TIF_TASKWORK if available

On 10/2/20 1:10 PM, Thomas Gleixner wrote:
> On Fri, Oct 02 2020 at 09:52, Jens Axboe wrote:
>> On 10/2/20 9:31 AM, Thomas Gleixner wrote:
>>>> This way task_work_run() doesn't need to clear TIF_NOTIFY_SIGNAL and it can
>>>> have more users.
>>>
>>> I think it's fundamentally wrong that we have several places and several
>>> flags which handle task_work_run() instead of having exactly one place
>>> and one flag.
>>
>> I don't disagree with that. I know it's not happening in this series, but
>> if we go the TIF_NOTIFY_SIGNAL route and get all archs supporting that,
>> then we can kill the signal and notify-resume parts of running task_work.
>> And that leaves us with exactly one place that runs it.
>>
>> So we can potentially improve the current situation in that regard.
> 
> I'll think about it over the weekend.

Thanks, I appreciate it!

Just to drive the point home, we'd end up with something like the
below, which also enables me to remove a nasty sighand->lock deadlock
workaround in io_uring.
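
For reference, the set_notify_signal() helper that task_work_add() ends
up calling is roughly the below (just a sketch of the TIF_NOTIFY_SIGNAL
series helper, the exact code there may differ):

static inline void set_notify_signal(struct task_struct *task)
{
	/*
	 * Set TIF_NOTIFY_SIGNAL and kick the task so it either breaks
	 * out of an interruptible sleep or runs task_work on the next
	 * return to user mode. No sighand->siglock or IRQ disabling is
	 * needed here.
	 */
	if (!test_and_set_tsk_thread_flag(task, TIF_NOTIFY_SIGNAL) &&
	    !wake_up_state(task, TASK_INTERRUPTIBLE))
		kick_process(task);
}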

Not in this patch, but the io_uring cqring_wait() call can also be
removed. Outside of the core calling it in tracehook_notify_signal(),
the only remaining callers are then the cases where kthreads are used
with task_work.
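
And for context, that single core call site would look something like
this (again just a sketch of what tracehook_notify_signal() ends up
doing, not necessarily the exact code in the series):

static inline void tracehook_notify_signal(void)
{
	/*
	 * Called on exit to user mode when TIF_NOTIFY_SIGNAL is set.
	 * Clearing the flag before the barrier pairs with
	 * set_notify_signal() in task_work_add().
	 */
	clear_thread_flag(TIF_NOTIFY_SIGNAL);
	smp_mb__after_atomic();
	if (unlikely(current->task_works))
		task_work_run();
}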


 fs/io_uring.c                  | 41 ++++++++++++----------------------
 include/linux/sched/jobctl.h   |  4 +---
 include/linux/task_work.h      |  4 +---
 include/linux/tracehook.h      |  9 --------
 kernel/signal.c                | 22 ------------------
 kernel/task_work.c             | 40 +++------------------------------
 kernel/time/posix-cpu-timers.c |  2 +-
 7 files changed, 20 insertions(+), 102 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 2a67552a9c2f..3a5f4a7bd369 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1597,12 +1597,12 @@ static void __io_free_req(struct io_kiocb *req)
 		int ret;
 
 		init_task_work(&req->task_work, io_req_task_file_table_put);
-		ret = task_work_add(req->task, &req->task_work, TWA_RESUME);
+		ret = task_work_add(req->task, &req->task_work, true);
 		if (unlikely(ret)) {
 			struct task_struct *tsk;
 
 			tsk = io_wq_get_task(req->ctx->io_wq);
-			task_work_add(tsk, &req->task_work, 0);
+			task_work_add(tsk, &req->task_work, false);
 		}
 	}
 }
@@ -1746,25 +1746,21 @@ static struct io_kiocb *io_req_find_next(struct io_kiocb *req)
 	return __io_req_find_next(req);
 }
 
-static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb,
-				bool twa_signal_ok)
+static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb)
 {
 	struct task_struct *tsk = req->task;
 	struct io_ring_ctx *ctx = req->ctx;
-	int ret, notify;
+	bool notify = false;
+	int ret;
 
 	if (tsk->flags & PF_EXITING)
 		return -ESRCH;
 
 	/*
-	 * SQPOLL kernel thread doesn't need notification, just a wakeup. For
-	 * all other cases, use TWA_SIGNAL unconditionally to ensure we're
-	 * processing task_work. There's no reliable way to tell if TWA_RESUME
-	 * will do the job.
+	 * SQPOLL kernel thread doesn't need notification, just a wakeup.
 	 */
-	notify = 0;
-	if (!(ctx->flags & IORING_SETUP_SQPOLL) && twa_signal_ok)
-		notify = TWA_SIGNAL;
+	if (!(ctx->flags & IORING_SETUP_SQPOLL))
+		notify = true;
 
 	ret = task_work_add(tsk, cb, notify);
 	if (!ret)
@@ -1825,13 +1821,13 @@ static void io_req_task_queue(struct io_kiocb *req)
 	init_task_work(&req->task_work, io_req_task_submit);
 	percpu_ref_get(&req->ctx->refs);
 
-	ret = io_req_task_work_add(req, &req->task_work, true);
+	ret = io_req_task_work_add(req, &req->task_work);
 	if (unlikely(ret)) {
 		struct task_struct *tsk;
 
 		init_task_work(&req->task_work, io_req_task_cancel);
 		tsk = io_wq_get_task(req->ctx->io_wq);
-		task_work_add(tsk, &req->task_work, 0);
+		task_work_add(tsk, &req->task_work, false);
 		wake_up_process(tsk);
 	}
 }
@@ -3056,14 +3052,14 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
 
 	/* submit ref gets dropped, acquire a new one */
 	refcount_inc(&req->refs);
-	ret = io_req_task_work_add(req, &req->task_work, true);
+	ret = io_req_task_work_add(req, &req->task_work);
 	if (unlikely(ret)) {
 		struct task_struct *tsk;
 
 		/* queue just for cancelation */
 		init_task_work(&req->task_work, io_req_task_cancel);
 		tsk = io_wq_get_task(req->ctx->io_wq);
-		task_work_add(tsk, &req->task_work, 0);
+		task_work_add(tsk, &req->task_work, false);
 		wake_up_process(tsk);
 	}
 	return 1;
@@ -4598,7 +4594,6 @@ struct io_poll_table {
 static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
 			   __poll_t mask, task_work_func_t func)
 {
-	bool twa_signal_ok;
 	int ret;
 
 	/* for instances that support it check for an event match first: */
@@ -4613,27 +4608,19 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
 	init_task_work(&req->task_work, func);
 	percpu_ref_get(&req->ctx->refs);
 
-	/*
-	 * If we using the signalfd wait_queue_head for this wakeup, then
-	 * it's not safe to use TWA_SIGNAL as we could be recursing on the
-	 * tsk->sighand->siglock on doing the wakeup. Should not be needed
-	 * either, as the normal wakeup will suffice.
-	 */
-	twa_signal_ok = (poll->head != &req->task->sighand->signalfd_wqh);
-
 	/*
 	 * If this fails, then the task is exiting. When a task exits, the
 	 * work gets canceled, so just cancel this request as well instead
 	 * of executing it. We can't safely execute it anyway, as we may not
 	 * have the needed state needed for it anyway.
 	 */
-	ret = io_req_task_work_add(req, &req->task_work, twa_signal_ok);
+	ret = io_req_task_work_add(req, &req->task_work);
 	if (unlikely(ret)) {
 		struct task_struct *tsk;
 
 		WRITE_ONCE(poll->canceled, true);
 		tsk = io_wq_get_task(req->ctx->io_wq);
-		task_work_add(tsk, &req->task_work, 0);
+		task_work_add(tsk, &req->task_work, false);
 		wake_up_process(tsk);
 	}
 	return 1;
diff --git a/include/linux/sched/jobctl.h b/include/linux/sched/jobctl.h
index d2b4204ba4d3..fa067de9f1a9 100644
--- a/include/linux/sched/jobctl.h
+++ b/include/linux/sched/jobctl.h
@@ -19,7 +19,6 @@ struct task_struct;
 #define JOBCTL_TRAPPING_BIT	21	/* switching to TRACED */
 #define JOBCTL_LISTENING_BIT	22	/* ptracer is listening for events */
 #define JOBCTL_TRAP_FREEZE_BIT	23	/* trap for cgroup freezer */
-#define JOBCTL_TASK_WORK_BIT	24	/* set by TWA_SIGNAL */
 
 #define JOBCTL_STOP_DEQUEUED	(1UL << JOBCTL_STOP_DEQUEUED_BIT)
 #define JOBCTL_STOP_PENDING	(1UL << JOBCTL_STOP_PENDING_BIT)
@@ -29,10 +28,9 @@ struct task_struct;
 #define JOBCTL_TRAPPING		(1UL << JOBCTL_TRAPPING_BIT)
 #define JOBCTL_LISTENING	(1UL << JOBCTL_LISTENING_BIT)
 #define JOBCTL_TRAP_FREEZE	(1UL << JOBCTL_TRAP_FREEZE_BIT)
-#define JOBCTL_TASK_WORK	(1UL << JOBCTL_TASK_WORK_BIT)
 
 #define JOBCTL_TRAP_MASK	(JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY)
-#define JOBCTL_PENDING_MASK	(JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK | JOBCTL_TASK_WORK)
+#define JOBCTL_PENDING_MASK	(JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK)
 
 extern bool task_set_jobctl_pending(struct task_struct *task, unsigned long mask);
 extern void task_clear_jobctl_trapping(struct task_struct *task);
diff --git a/include/linux/task_work.h b/include/linux/task_work.h
index 0fb93aafa478..a221bd5f746c 100644
--- a/include/linux/task_work.h
+++ b/include/linux/task_work.h
@@ -13,9 +13,7 @@ init_task_work(struct callback_head *twork, task_work_func_t func)
 	twork->func = func;
 }
 
-#define TWA_RESUME	1
-#define TWA_SIGNAL	2
-int task_work_add(struct task_struct *task, struct callback_head *twork, int);
+int task_work_add(struct task_struct *task, struct callback_head *twork, bool);
 
 struct callback_head *task_work_cancel(struct task_struct *, task_work_func_t);
 void task_work_run(void);
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 7ec0e94c5250..3a4a35ae87d1 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -178,15 +178,6 @@ static inline void set_notify_resume(struct task_struct *task)
  */
 static inline void tracehook_notify_resume(struct pt_regs *regs)
 {
-	/*
-	 * The caller just cleared TIF_NOTIFY_RESUME. This barrier
-	 * pairs with task_work_add()->set_notify_resume() after
-	 * hlist_add_head(task->task_works);
-	 */
-	smp_mb__after_atomic();
-	if (unlikely(current->task_works))
-		task_work_run();
-
 #ifdef CONFIG_KEYS_REQUEST_CACHE
 	if (unlikely(current->cached_requested_key)) {
 		key_put(current->cached_requested_key);
diff --git a/kernel/signal.c b/kernel/signal.c
index ad52141ab0d2..d44fa9141cef 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2271,8 +2271,6 @@ static void ptrace_do_notify(int signr, int exit_code, int why)
 void ptrace_notify(int exit_code)
 {
 	BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);
-	if (unlikely(current->task_works))
-		task_work_run();
 
 	spin_lock_irq(&current->sighand->siglock);
 	ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED);
@@ -2541,26 +2539,6 @@ bool get_signal(struct ksignal *ksig)
 
 relock:
 	spin_lock_irq(&sighand->siglock);
-	/*
-	 * Make sure we can safely read ->jobctl() in task_work add. As Oleg
-	 * states:
-	 *
-	 * It pairs with mb (implied by cmpxchg) before READ_ONCE. So we
-	 * roughly have
-	 *
-	 *	task_work_add:				get_signal:
-	 *	STORE(task->task_works, new_work);	STORE(task->jobctl);
-	 *	mb();					mb();
-	 *	LOAD(task->jobctl);			LOAD(task->task_works);
-	 *
-	 * and we can rely on STORE-MB-LOAD [ in task_work_add].
-	 */
-	smp_store_mb(current->jobctl, current->jobctl & ~JOBCTL_TASK_WORK);
-	if (unlikely(current->task_works)) {
-		spin_unlock_irq(&sighand->siglock);
-		task_work_run();
-		goto relock;
-	}
 
 	/*
 	 * Every stopped thread goes here after wakeup. Check to see if
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 95604e57af46..e68f5831a078 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -5,34 +5,6 @@
 
 static struct callback_head work_exited; /* all we need is ->next == NULL */
 
-/*
- * TWA_SIGNAL signaling - use TIF_NOTIFY_SIGNAL, if available, as it's faster
- * than TIF_SIGPENDING as there's no dependency on ->sighand. The latter is
- * shared for threads, and can cause contention on sighand->lock. Even for
- * the non-threaded case TIF_NOTIFY_SIGNAL is more efficient, as no locking
- * or IRQ disabling is involved for notification (or running) purposes.
- */
-static void task_work_notify_signal(struct task_struct *task)
-{
-#ifdef TIF_NOTIFY_SIGNAL
-	set_notify_signal(task);
-#else
-	unsigned long flags;
-
-	/*
-	 * Only grab the sighand lock if we don't already have some
-	 * task_work pending. This pairs with the smp_store_mb()
-	 * in get_signal(), see comment there.
-	 */
-	if (!(READ_ONCE(task->jobctl) & JOBCTL_TASK_WORK) &&
-	    lock_task_sighand(task, &flags)) {
-		task->jobctl |= JOBCTL_TASK_WORK;
-		signal_wake_up(task, 0);
-		unlock_task_sighand(task, &flags);
-	}
-#endif
-}
-
 /**
  * task_work_add - ask the @task to execute @work->func()
  * @task: the task which should run the callback
@@ -53,7 +25,7 @@ static void task_work_notify_signal(struct task_struct *task)
  * 0 if succeeds or -ESRCH.
  */
 int
-task_work_add(struct task_struct *task, struct callback_head *work, int notify)
+task_work_add(struct task_struct *task, struct callback_head *work, bool notify)
 {
 	struct callback_head *head;
 
@@ -64,14 +36,8 @@ task_work_add(struct task_struct *task, struct callback_head *work, int notify)
 		work->next = head;
 	} while (cmpxchg(&task->task_works, head, work) != head);
 
-	switch (notify) {
-	case TWA_RESUME:
-		set_notify_resume(task);
-		break;
-	case TWA_SIGNAL:
-		task_work_notify_signal(task);
-		break;
-	}
+	if (notify)
+		set_notify_signal(task);
 
 	return 0;
 }
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index a71758e34e45..51080a1ed11f 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1128,7 +1128,7 @@ static inline void __run_posix_cpu_timers(struct task_struct *tsk)
 
 	/* Schedule task work to actually expire the timers */
 	tsk->posix_cputimers_work.scheduled = true;
-	task_work_add(tsk, &tsk->posix_cputimers_work.work, TWA_RESUME);
+	task_work_add(tsk, &tsk->posix_cputimers_work.work, true);
 }
 
 static inline bool posix_cpu_timers_enable_work(struct task_struct *tsk,

-- 
Jens Axboe
