Message-ID: <20180312195118.rye5lg2pb26kmavn@linutronix.de>
Date: Mon, 12 Mar 2018 20:51:18 +0100
From: Sebastian Andrzej Siewior <bigeasy@...utronix.de>
To: Peter Zijlstra <peterz@...radead.org>
Cc: Corey Minyard <cminyard@...sta.com>,
Thomas Gleixner <tglx@...utronix.de>,
Steven Rostedt <rostedt@...dmis.org>,
linux-rt-users <linux-rt-users@...r.kernel.org>,
linux-kernel <linux-kernel@...r.kernel.org>,
Tejun Heo <tj@...nel.org>
Subject: Re: Warning from swake_up_all in 4.14.15-rt13 non-RT
On 2018-03-12 15:29:33 [+0100], Peter Zijlstra wrote:
> On Mon, Mar 12, 2018 at 03:11:07PM +0100, Sebastian Andrzej Siewior wrote:
> > I assumed you complained about the unbounded list-walk with interrupts
> > disabled (which is cheap but unbound is unbound). So here I suggested I
> > move 20 entries off that list a time and enable interrupts again so an
> > interrupt could set need_resched.
> > But if we get invoked !preemptible then nothing changes.
>
> Right, so any !preemptible invocation of wake_all is bad.

I doubt I can argue for moving the wake_up_all() into a workqueue in a
sane way. Not to mention that it wouldn't do me any good on RT, since I
can't schedule workqueues from !preemptible context.
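
For context: the release callback is invoked from the RCU-sched read
side of the put path, so it runs with preemption disabled. Roughly (from
include/linux/percpu-refcount.h of that era, quoted from memory):

static inline void percpu_ref_put_many(struct percpu_ref *ref, unsigned long nr)
{
	unsigned long __percpu *percpu_count;

	rcu_read_lock_sched();			/* preempt_disable() */

	if (__ref_is_percpu(ref, &percpu_count))
		this_cpu_sub(*percpu_count, nr);
	else if (unlikely(atomic_long_sub_and_test(nr, &ref->count)))
		ref->release(ref);		/* -> blk_queue_usage_counter_release() */

	rcu_read_unlock_sched();
}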

I've been staring at the original issue. The commit that introduced the
RCU-sched usage [0] reads like "it could have been normal RCU, but this
one is faster in my micro benchmark, so let's go for it". Probably
because normal RCU invokes the callbacks every once in a while. So we
could switch it to "normal" RCU instead. Does anyone think that could be
done? percpu_ref_get()/put() is a hot path and I am not sure how much
worse it would get in a real benchmark.
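
Just to illustrate the idea (untested sketch, not a proper patch): the
read side in the snippet above would become rcu_read_lock() /
rcu_read_unlock(), and the percpu -> atomic switch in
lib/percpu-refcount.c would then have to wait for normal RCU instead:

	/* __percpu_ref_switch_to_atomic(), sketch of the flavour switch */
	call_rcu(&ref->rcu, percpu_ref_switch_to_atomic_rcu);	/* was call_rcu_sched() */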

In the meantime I prepared a patch which reverts the swait conversion
and has the release callback invoke swork_queue(), which in turn moves
the wake_up_all() into a different process. Since it triggers rarely, it
should not cause any performance regression. And the wake_up_all() is
then preemptible, so RT is happy.

[0] a4244454df12 ("percpu-refcount: use RCU-sched insted of normal RCU")

--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -381,7 +381,7 @@ void blk_clear_preempt_only(struct reque
 
 	spin_lock_irqsave(q->queue_lock, flags);
 	queue_flag_clear(QUEUE_FLAG_PREEMPT_ONLY, q);
-	swake_up_all(&q->mq_freeze_wq);
+	wake_up_all(&q->mq_freeze_wq);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 EXPORT_SYMBOL_GPL(blk_clear_preempt_only);
@@ -659,7 +659,7 @@ void blk_set_queue_dying(struct request_
 	}
 
 	/* Make blk_queue_enter() reexamine the DYING flag. */
-	swake_up_all(&q->mq_freeze_wq);
+	wake_up_all(&q->mq_freeze_wq);
 }
 EXPORT_SYMBOL_GPL(blk_set_queue_dying);
 
@@ -860,7 +860,7 @@ int blk_queue_enter(struct request_queue
 		 */
 		smp_rmb();
 
-		ret = swait_event_interruptible(q->mq_freeze_wq,
+		ret = wait_event_interruptible(q->mq_freeze_wq,
 				(atomic_read(&q->mq_freeze_depth) == 0 &&
 				 (preempt || !blk_queue_preempt_only(q))) ||
 				blk_queue_dying(q));
@@ -876,12 +876,20 @@ void blk_queue_exit(struct request_queue
 	percpu_ref_put(&q->q_usage_counter);
 }
 
+static void blk_queue_usage_counter_release_swork(struct swork_event *sev)
+{
+	struct request_queue *q =
+		container_of(sev, struct request_queue, mq_pcpu_wake);
+
+	wake_up_all(&q->mq_freeze_wq);
+}
+
 static void blk_queue_usage_counter_release(struct percpu_ref *ref)
 {
 	struct request_queue *q =
 		container_of(ref, struct request_queue, q_usage_counter);
 
-	swake_up_all(&q->mq_freeze_wq);
+	swork_queue(&q->mq_pcpu_wake);
 }
 
 static void blk_rq_timed_out_timer(struct timer_list *t)
@@ -957,7 +965,8 @@ struct request_queue *blk_alloc_queue_no
 	q->bypass_depth = 1;
 	__set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
 
-	init_swait_queue_head(&q->mq_freeze_wq);
+	init_waitqueue_head(&q->mq_freeze_wq);
+	INIT_SWORK(&q->mq_pcpu_wake, blk_queue_usage_counter_release_swork);
 
 	/*
 	 * Init percpu_ref in atomic mode so that it's faster to shutdown.
@@ -3843,6 +3852,7 @@ int __init blk_dev_init(void)
 	blk_requestq_cachep = kmem_cache_create("request_queue",
 			sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
+	BUG_ON(swork_get());
 
 #ifdef CONFIG_DEBUG_FS
 	blk_debugfs_root = debugfs_create_dir("block", NULL);
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -133,14 +133,14 @@ EXPORT_SYMBOL_GPL(blk_freeze_queue_start
 
 void blk_mq_freeze_queue_wait(struct request_queue *q)
 {
-	swait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
+	wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
 }
 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait);
 
 int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
 				     unsigned long timeout)
 {
-	return swait_event_timeout(q->mq_freeze_wq,
+	return wait_event_timeout(q->mq_freeze_wq,
 			percpu_ref_is_zero(&q->q_usage_counter),
 			timeout);
 }
@@ -183,7 +183,7 @@ void blk_mq_unfreeze_queue(struct reques
 	WARN_ON_ONCE(freeze_depth < 0);
 	if (!freeze_depth) {
 		percpu_ref_reinit(&q->q_usage_counter);
-		swake_up_all(&q->mq_freeze_wq);
+		wake_up_all(&q->mq_freeze_wq);
 	}
 }
 EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -29,6 +29,7 @@
 #include <linux/blkzoned.h>
 #include <linux/seqlock.h>
 #include <linux/u64_stats_sync.h>
+#include <linux/swork.h>
 
 struct module;
 struct scsi_ioctl_command;
@@ -647,7 +648,8 @@ struct request_queue {
 	struct throtl_data *td;
 #endif
 	struct rcu_head rcu_head;
-	struct swait_queue_head mq_freeze_wq;
+	wait_queue_head_t mq_freeze_wq;
+	struct swork_event mq_pcpu_wake;
 	struct percpu_ref q_usage_counter;
 	struct list_head all_q_node;