[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1346516916-1991-3-git-send-email-laijs@cn.fujitsu.com>
Date: Sun, 2 Sep 2012 00:28:20 +0800
From: Lai Jiangshan <laijs@...fujitsu.com>
To: Tejun Heo <tj@...nel.org>, linux-kernel@...r.kernel.org
Cc: Lai Jiangshan <laijs@...fujitsu.com>
Subject: [PATCH 02/10 V4] workqueue: fix deadlock in rebind_workers()
Current idle_worker_rebind() has a bug.
idle_worker_rebind() path HOTPLUG path
online
rebind_workers()
wait_event(gcwq->rebind_hold)
woken up but no scheduled rebind_workers() returns (*)
the same cpu offline
the same cpu online again
rebind_workers()
set WORKER_REBIND
scheduled,see the WORKER_REBIND
wait rebind_workers() clear it <--bug--> wait idle_worker_rebind()
rebound.
The two thread wait each other. It is bug.
This fix focuses in (*), rebind_workers() can't returns until all idles
finish waiting on gcwq->rebind_hold(aka: until all idles release the reference
of gcwq->rebind_hold). We add a ref_done to do it. rebind_workers() will
waits on ref_done for all idles to finish wait.
It is now tree-times-hand-shake.
Signed-off-by: Lai Jiangshan <laijs@...fujitsu.com>
---
kernel/workqueue.c | 61 ++++++++++++++++++++++++++++++++++++++++-----------
1 files changed, 48 insertions(+), 13 deletions(-)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 4f252d0..1bfe407 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1305,8 +1305,22 @@ __acquires(&gcwq->lock)
}
struct idle_rebind {
- int cnt; /* # workers to be rebound */
- struct completion done; /* all workers rebound */
+ int idle_cnt; /* # idle workers to be rebound */
+ struct completion idle_done; /* all idle workers rebound */
+
+ /*
+ * notify the rebind_workers() that:
+ * 0. All worker leave rebind_hold:
+ * 1. All idle workers are rebound.
+ * 2. No idle worker has ref to this struct
+ *
+ * @ref_cnt: # idle workers which has ref to this struct
+ * @ref_done: any idle workers has no ref to this struct,
+ * nor rebind_hold.
+ * it also implies that all idle workers are rebound.
+ */
+ int ref_cnt;
+ struct completion ref_done;
};
/*
@@ -1320,12 +1334,19 @@ static void idle_worker_rebind(struct worker *worker)
/* CPU must be online at this point */
WARN_ON(!worker_maybe_bind_and_lock(worker));
- if (!--worker->idle_rebind->cnt)
- complete(&worker->idle_rebind->done);
+ ++worker->idle_rebind->ref_cnt;
+ if (!--worker->idle_rebind->idle_cnt)
+ complete(&worker->idle_rebind->idle_done);
spin_unlock_irq(&worker->pool->gcwq->lock);
/* we did our part, wait for rebind_workers() to finish up */
wait_event(gcwq->rebind_hold, !(worker->flags & WORKER_REBIND));
+
+ /* noify if all idle worker are done(rebond & wait) */
+ spin_lock_irq(&worker->pool->gcwq->lock);
+ if (!--worker->idle_rebind->ref_cnt)
+ complete(&worker->idle_rebind->ref_done);
+ spin_unlock_irq(&worker->pool->gcwq->lock);
}
/*
@@ -1384,14 +1405,18 @@ static void rebind_workers(struct global_cwq *gcwq)
lockdep_assert_held(&pool->manager_mutex);
/*
- * Rebind idle workers. Interlocked both ways. We wait for
- * workers to rebind via @idle_rebind.done. Workers will wait for
- * us to finish up by watching %WORKER_REBIND.
+ * Rebind idle workers. Interlocked both ways in triple waits.
+ * We wait for workers to rebind via @idle_rebind.idle_done.
+ * Workers will wait for us to finish up by watching %WORKER_REBIND.
+ * And them we wait for workers to leave rebind_hold
+ * via @idle_rebind.ref_done.
*/
- init_completion(&idle_rebind.done);
+ init_completion(&idle_rebind.idle_done);
+ init_completion(&idle_rebind.ref_done);
+ idle_rebind.ref_cnt = 1;
retry:
- idle_rebind.cnt = 1;
- INIT_COMPLETION(idle_rebind.done);
+ idle_rebind.idle_cnt = 1;
+ INIT_COMPLETION(idle_rebind.idle_done);
/* set REBIND and kick idle ones, we'll wait for these later */
for_each_worker_pool(pool, gcwq) {
@@ -1403,7 +1428,7 @@ retry:
worker->flags &= ~WORKER_UNBOUND;
worker->flags |= WORKER_REBIND;
- idle_rebind.cnt++;
+ idle_rebind.idle_cnt++;
worker->idle_rebind = &idle_rebind;
/* worker_thread() will call idle_worker_rebind() */
@@ -1411,9 +1436,9 @@ retry:
}
}
- if (--idle_rebind.cnt) {
+ if (--idle_rebind.idle_cnt) {
spin_unlock_irq(&gcwq->lock);
- wait_for_completion(&idle_rebind.done);
+ wait_for_completion(&idle_rebind.idle_done);
spin_lock_irq(&gcwq->lock);
/* busy ones might have become idle while waiting, retry */
goto retry;
@@ -1452,6 +1477,16 @@ retry:
worker->scheduled.next,
work_color_to_flags(WORK_NO_COLOR));
}
+
+ /*
+ * we will leave rebind_workers(), have to wait until no worker
+ * has ref to this idle_rebind nor rebind_hold.
+ */
+ if (--idle_rebind.ref_cnt) {
+ spin_unlock_irq(&gcwq->lock);
+ wait_for_completion(&idle_rebind.ref_done);
+ spin_lock_irq(&gcwq->lock);
+ }
}
static struct worker *alloc_worker(void)
--
1.7.4.4
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists