From: Steven Rostedt Revert Author: Peter Zijlstra Date: Fri Sep 30 11:57:58 2011 +0200 workqueue: Fix cpuhotplug trainwreck As TREAD_BOUND no longer affects cpu down, and this code introduced a lot of races with taking down a CPU. Signed-off-by: Steven Rostedt Signed-off-by: Thomas Gleixner Conflicts: include/linux/cpu.h include/linux/workqueue.h --- include/linux/cpu.h | 14 +- include/linux/workqueue.h | 5 +- kernel/workqueue.c | 561 +++++++++++++++++++++++++++++++++------------ 3 files changed, 419 insertions(+), 161 deletions(-) diff --git a/include/linux/cpu.h b/include/linux/cpu.h index d7d7a12..c7823c5 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -60,16 +60,14 @@ enum { */ CPU_PRI_SCHED_ACTIVE = INT_MAX, CPU_PRI_CPUSET_ACTIVE = INT_MAX - 1, - - /* migration should happen before other stuff but after perf */ - CPU_PRI_PERF = 20, - CPU_PRI_MIGRATION = 10, - CPU_PRI_WORKQUEUE_ACTIVE = 5, /* prepare workqueues for others */ - CPU_PRI_NORMAL = 0, - CPU_PRI_WORKQUEUE_INACTIVE = -5, /* flush workqueues after others */ - CPU_PRI_SCHED_INACTIVE = INT_MIN + 1, CPU_PRI_CPUSET_INACTIVE = INT_MIN, + + /* migration should happen before other stuff but after perf */ + CPU_PRI_PERF = 20, + CPU_PRI_MIGRATION = 10, + /* prepare workqueues for other notifiers */ + CPU_PRI_WORKQUEUE = 5, }; #ifdef CONFIG_SMP diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 5b86348..6c56a14 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -254,10 +254,9 @@ enum { WQ_MEM_RECLAIM = 1 << 3, /* may be used for memory reclaim */ WQ_HIGHPRI = 1 << 4, /* high priority */ WQ_CPU_INTENSIVE = 1 << 5, /* cpu instensive workqueue */ - WQ_NON_AFFINE = 1 << 6, /* free to move works around cpus */ - WQ_DYING = 1 << 7, /* internal: workqueue is dying */ - WQ_RESCUER = 1 << 8, /* internal: workqueue has rescuer */ + WQ_DYING = 1 << 6, /* internal: workqueue is dying */ + WQ_RESCUER = 1 << 7, /* internal: workqueue has rescuer */ WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? */ WQ_MAX_UNBOUND_PER_CPU = 4, /* 4 * #cpus for unbound wq */ diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 53268e3..99be108 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -41,7 +41,6 @@ #include #include #include -#include #include "workqueue_sched.h" @@ -58,10 +57,20 @@ enum { WORKER_DIE = 1 << 1, /* die die die */ WORKER_IDLE = 1 << 2, /* is idle */ WORKER_PREP = 1 << 3, /* preparing to run works */ - WORKER_CPU_INTENSIVE = 1 << 4, /* cpu intensive */ - WORKER_UNBOUND = 1 << 5, /* worker is unbound */ + WORKER_ROGUE = 1 << 4, /* not bound to any cpu */ + WORKER_REBIND = 1 << 5, /* mom is home, come back */ + WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ + WORKER_UNBOUND = 1 << 7, /* worker is unbound */ - WORKER_NOT_RUNNING = WORKER_PREP | WORKER_CPU_INTENSIVE | WORKER_UNBOUND, + WORKER_NOT_RUNNING = WORKER_PREP | WORKER_ROGUE | WORKER_REBIND | + WORKER_CPU_INTENSIVE | WORKER_UNBOUND, + + /* gcwq->trustee_state */ + TRUSTEE_START = 0, /* start */ + TRUSTEE_IN_CHARGE = 1, /* trustee in charge of gcwq */ + TRUSTEE_BUTCHER = 2, /* butcher workers */ + TRUSTEE_RELEASE = 3, /* release workers */ + TRUSTEE_DONE = 4, /* trustee is done */ BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER, @@ -75,6 +84,7 @@ enum { (min two ticks) */ MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */ CREATE_COOLDOWN = HZ, /* time to breath after fail */ + TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */ /* * Rescue workers are used only on emergencies and shared by @@ -126,6 +136,7 @@ struct worker { unsigned long last_active; /* L: last active timestamp */ unsigned int flags; /* X: flags */ int id; /* I: worker id */ + struct work_struct rebind_work; /* L: rebind worker to cpu */ int sleeping; /* None */ }; @@ -153,8 +164,10 @@ struct global_cwq { struct ida worker_ida; /* L: for worker IDs */ + struct task_struct *trustee; /* L: for gcwq shutdown */ + unsigned int trustee_state; /* L: trustee state */ + wait_queue_head_t trustee_wait; /* trustee wait */ struct worker *first_idle; /* L: first idle worker */ - wait_queue_head_t idle_wait; } ____cacheline_aligned_in_smp; /* @@ -960,38 +973,13 @@ static bool is_chained_work(struct workqueue_struct *wq) return false; } -static void ___queue_work(struct workqueue_struct *wq, struct global_cwq *gcwq, - struct work_struct *work) -{ - struct cpu_workqueue_struct *cwq; - struct list_head *worklist; - unsigned int work_flags; - - /* gcwq determined, get cwq and queue */ - cwq = get_cwq(gcwq->cpu, wq); - trace_workqueue_queue_work(gcwq->cpu, cwq, work); - - BUG_ON(!list_empty(&work->entry)); - - cwq->nr_in_flight[cwq->work_color]++; - work_flags = work_color_to_flags(cwq->work_color); - - if (likely(cwq->nr_active < cwq->max_active)) { - trace_workqueue_activate_work(work); - cwq->nr_active++; - worklist = gcwq_determine_ins_pos(gcwq, cwq); - } else { - work_flags |= WORK_STRUCT_DELAYED; - worklist = &cwq->delayed_works; - } - - insert_work(cwq, work, worklist, work_flags); -} - static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, struct work_struct *work) { struct global_cwq *gcwq; + struct cpu_workqueue_struct *cwq; + struct list_head *worklist; + unsigned int work_flags; unsigned long flags; debug_work_activate(work); @@ -1037,32 +1025,27 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, spin_lock_irqsave(&gcwq->lock, flags); } - ___queue_work(wq, gcwq, work); + /* gcwq determined, get cwq and queue */ + cwq = get_cwq(gcwq->cpu, wq); + trace_workqueue_queue_work(cpu, cwq, work); - spin_unlock_irqrestore(&gcwq->lock, flags); -} + BUG_ON(!list_empty(&work->entry)); -/** - * queue_work_on - queue work on specific cpu - * @cpu: CPU number to execute work on - * @wq: workqueue to use - * @work: work to queue - * - * Returns 0 if @work was already on a queue, non-zero otherwise. - * - * We queue the work to a specific CPU, the caller must ensure it - * can't go away. - */ -static int -__queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work) -{ - int ret = 0; + cwq->nr_in_flight[cwq->work_color]++; + work_flags = work_color_to_flags(cwq->work_color); - if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { - __queue_work(cpu, wq, work); - ret = 1; + if (likely(cwq->nr_active < cwq->max_active)) { + trace_workqueue_activate_work(work); + cwq->nr_active++; + worklist = gcwq_determine_ins_pos(gcwq, cwq); + } else { + work_flags |= WORK_STRUCT_DELAYED; + worklist = &cwq->delayed_works; } - return ret; + + insert_work(cwq, work, worklist, work_flags); + + spin_unlock_irqrestore(&gcwq->lock, flags); } /** @@ -1079,19 +1062,34 @@ int queue_work(struct workqueue_struct *wq, struct work_struct *work) { int ret; - ret = __queue_work_on(get_cpu_light(), wq, work); + ret = queue_work_on(get_cpu_light(), wq, work); put_cpu_light(); return ret; } EXPORT_SYMBOL_GPL(queue_work); +/** + * queue_work_on - queue work on specific cpu + * @cpu: CPU number to execute work on + * @wq: workqueue to use + * @work: work to queue + * + * Returns 0 if @work was already on a queue, non-zero otherwise. + * + * We queue the work to a specific CPU, the caller must ensure it + * can't go away. + */ int queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work) { - WARN_ON(wq->flags & WQ_NON_AFFINE); + int ret = 0; - return __queue_work_on(cpu, wq, work); + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { + __queue_work(cpu, wq, work); + ret = 1; + } + return ret; } EXPORT_SYMBOL_GPL(queue_work_on); @@ -1137,8 +1135,6 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, struct timer_list *timer = &dwork->timer; struct work_struct *work = &dwork->work; - WARN_ON((wq->flags & WQ_NON_AFFINE) && cpu != -1); - if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { unsigned int lcpu; @@ -1204,13 +1200,12 @@ static void worker_enter_idle(struct worker *worker) /* idle_list is LIFO */ list_add(&worker->entry, &gcwq->idle_list); - if (gcwq->nr_idle == gcwq->nr_workers) - wake_up_all(&gcwq->idle_wait); - - if (too_many_workers(gcwq) && !timer_pending(&gcwq->idle_timer)) { - mod_timer(&gcwq->idle_timer, - jiffies + IDLE_WORKER_TIMEOUT); - } + if (likely(!(worker->flags & WORKER_ROGUE))) { + if (too_many_workers(gcwq) && !timer_pending(&gcwq->idle_timer)) + mod_timer(&gcwq->idle_timer, + jiffies + IDLE_WORKER_TIMEOUT); + } else + wake_up_all(&gcwq->trustee_wait); /* sanity check nr_running */ WARN_ON_ONCE(gcwq->nr_workers == gcwq->nr_idle && @@ -1302,6 +1297,23 @@ __acquires(&gcwq->lock) } } +/* + * Function for worker->rebind_work used to rebind rogue busy workers + * to the associated cpu which is coming back online. This is + * scheduled by cpu up but can race with other cpu hotplug operations + * and may be executed twice without intervening cpu down. + */ +static void worker_rebind_fn(struct work_struct *work) +{ + struct worker *worker = container_of(work, struct worker, rebind_work); + struct global_cwq *gcwq = worker->gcwq; + + if (worker_maybe_bind_and_lock(worker)) + worker_clr_flags(worker, WORKER_REBIND); + + spin_unlock_irq(&gcwq->lock); +} + static struct worker *alloc_worker(void) { struct worker *worker; @@ -1310,6 +1322,7 @@ static struct worker *alloc_worker(void) if (worker) { INIT_LIST_HEAD(&worker->entry); INIT_LIST_HEAD(&worker->scheduled); + INIT_WORK(&worker->rebind_work, worker_rebind_fn); /* on creation a worker is in !idle && prep state */ worker->flags = WORKER_PREP; } @@ -1649,6 +1662,13 @@ static bool manage_workers(struct worker *worker) gcwq->flags &= ~GCWQ_MANAGING_WORKERS; + /* + * The trustee might be waiting to take over the manager + * position, tell it we're done. + */ + if (unlikely(gcwq->trustee)) + wake_up_all(&gcwq->trustee_wait); + return ret; } @@ -3164,76 +3184,171 @@ EXPORT_SYMBOL_GPL(work_busy); * gcwqs serve mix of short, long and very long running works making * blocked draining impractical. * + * This is solved by allowing a gcwq to be detached from CPU, running + * it with unbound (rogue) workers and allowing it to be reattached + * later if the cpu comes back online. A separate thread is created + * to govern a gcwq in such state and is called the trustee of the + * gcwq. + * + * Trustee states and their descriptions. + * + * START Command state used on startup. On CPU_DOWN_PREPARE, a + * new trustee is started with this state. + * + * IN_CHARGE Once started, trustee will enter this state after + * assuming the manager role and making all existing + * workers rogue. DOWN_PREPARE waits for trustee to + * enter this state. After reaching IN_CHARGE, trustee + * tries to execute the pending worklist until it's empty + * and the state is set to BUTCHER, or the state is set + * to RELEASE. + * + * BUTCHER Command state which is set by the cpu callback after + * the cpu has went down. Once this state is set trustee + * knows that there will be no new works on the worklist + * and once the worklist is empty it can proceed to + * killing idle workers. + * + * RELEASE Command state which is set by the cpu callback if the + * cpu down has been canceled or it has come online + * again. After recognizing this state, trustee stops + * trying to drain or butcher and clears ROGUE, rebinds + * all remaining workers back to the cpu and releases + * manager role. + * + * DONE Trustee will enter this state after BUTCHER or RELEASE + * is complete. + * + * trustee CPU draining + * took over down complete + * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE + * | | ^ + * | CPU is back online v return workers | + * ----------------> RELEASE -------------- */ -static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) -{ - unsigned int cpu = (unsigned long)hcpu; - struct global_cwq *gcwq = get_gcwq(cpu); - struct worker *uninitialized_var(new_worker); - unsigned long flags; +/** + * trustee_wait_event_timeout - timed event wait for trustee + * @cond: condition to wait for + * @timeout: timeout in jiffies + * + * wait_event_timeout() for trustee to use. Handles locking and + * checks for RELEASE request. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * multiple times. To be used by trustee. + * + * RETURNS: + * Positive indicating left time if @cond is satisfied, 0 if timed + * out, -1 if canceled. + */ +#define trustee_wait_event_timeout(cond, timeout) ({ \ + long __ret = (timeout); \ + while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) && \ + __ret) { \ + spin_unlock_irq(&gcwq->lock); \ + __wait_event_timeout(gcwq->trustee_wait, (cond) || \ + (gcwq->trustee_state == TRUSTEE_RELEASE), \ + __ret); \ + spin_lock_irq(&gcwq->lock); \ + } \ + gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret); \ +}) - action &= ~CPU_TASKS_FROZEN; +/** + * trustee_wait_event - event wait for trustee + * @cond: condition to wait for + * + * wait_event() for trustee to use. Automatically handles locking and + * checks for CANCEL request. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * multiple times. To be used by trustee. + * + * RETURNS: + * 0 if @cond is satisfied, -1 if canceled. + */ +#define trustee_wait_event(cond) ({ \ + long __ret1; \ + __ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\ + __ret1 < 0 ? -1 : 0; \ +}) - switch (action) { - case CPU_UP_PREPARE: - BUG_ON(gcwq->first_idle); - new_worker = create_worker(gcwq, false); - if (!new_worker) - return NOTIFY_BAD; - case CPU_UP_CANCELED: - case CPU_ONLINE: - break; - default: - return notifier_from_errno(0); - } +static int __cpuinit trustee_thread(void *__gcwq) +{ + struct global_cwq *gcwq = __gcwq; + struct worker *worker; + struct work_struct *work; + struct hlist_node *pos; + long rc; + int i; - /* some are called w/ irq disabled, don't disturb irq status */ - spin_lock_irqsave(&gcwq->lock, flags); + BUG_ON(gcwq->cpu != smp_processor_id()); - switch (action) { - case CPU_UP_PREPARE: - BUG_ON(gcwq->first_idle); - gcwq->first_idle = new_worker; - break; + spin_lock_irq(&gcwq->lock); + /* + * Claim the manager position and make all workers rogue. + * Trustee must be bound to the target cpu and can't be + * cancelled. + */ + BUG_ON(gcwq->cpu != smp_processor_id()); + rc = trustee_wait_event(!(gcwq->flags & GCWQ_MANAGING_WORKERS)); + BUG_ON(rc < 0); - case CPU_UP_CANCELED: - destroy_worker(gcwq->first_idle); - gcwq->first_idle = NULL; - break; + gcwq->flags |= GCWQ_MANAGING_WORKERS; - case CPU_ONLINE: - spin_unlock_irq(&gcwq->lock); - kthread_bind(gcwq->first_idle->task, cpu); - spin_lock_irq(&gcwq->lock); - gcwq->flags |= GCWQ_MANAGE_WORKERS; - start_worker(gcwq->first_idle); - gcwq->first_idle = NULL; - break; - } + list_for_each_entry(worker, &gcwq->idle_list, entry) + worker->flags |= WORKER_ROGUE; - spin_unlock_irqrestore(&gcwq->lock, flags); + for_each_busy_worker(worker, i, pos, gcwq) + worker->flags |= WORKER_ROGUE; - return notifier_from_errno(0); -} + /* + * Call schedule() so that we cross rq->lock and thus can + * guarantee sched callbacks see the rogue flag. This is + * necessary as scheduler callbacks may be invoked from other + * cpus. + */ + spin_unlock_irq(&gcwq->lock); + schedule(); + spin_lock_irq(&gcwq->lock); -static void flush_gcwq(struct global_cwq *gcwq) -{ - struct work_struct *work, *nw; - struct worker *worker, *n; - LIST_HEAD(non_affine_works); + /* + * Sched callbacks are disabled now. Zap nr_running. After + * this, nr_running stays zero and need_more_worker() and + * keep_working() are always true as long as the worklist is + * not empty. + */ + atomic_set(get_gcwq_nr_running(gcwq->cpu), 0); + spin_unlock_irq(&gcwq->lock); + del_timer_sync(&gcwq->idle_timer); spin_lock_irq(&gcwq->lock); - list_for_each_entry_safe(work, nw, &gcwq->worklist, entry) { - struct workqueue_struct *wq = get_work_cwq(work)->wq; - if (wq->flags & WQ_NON_AFFINE) - list_move(&work->entry, &non_affine_works); - } + /* + * We're now in charge. Notify and proceed to drain. We need + * to keep the gcwq running during the whole CPU down + * procedure as other cpu hotunplug callbacks may need to + * flush currently running tasks. + */ + gcwq->trustee_state = TRUSTEE_IN_CHARGE; + wake_up_all(&gcwq->trustee_wait); - while (!list_empty(&gcwq->worklist)) { + /* + * The original cpu is in the process of dying and may go away + * anytime now. When that happens, we and all workers would + * be migrated to other cpus. Try draining any left work. We + * want to get it over with ASAP - spam rescuers, wake up as + * many idlers as necessary and create new ones till the + * worklist is empty. Note that if the gcwq is frozen, there + * may be frozen works in freezable cwqs. Don't declare + * completion while frozen. + */ + while (gcwq->nr_workers != gcwq->nr_idle || + gcwq->flags & GCWQ_FREEZING || + gcwq->trustee_state == TRUSTEE_IN_CHARGE) { int nr_works = 0; list_for_each_entry(work, &gcwq->worklist, entry) { @@ -3247,55 +3362,200 @@ static void flush_gcwq(struct global_cwq *gcwq) wake_up_process(worker->task); } - spin_unlock_irq(&gcwq->lock); - if (need_to_create_worker(gcwq)) { - worker = create_worker(gcwq, true); - if (worker) + spin_unlock_irq(&gcwq->lock); + worker = create_worker(gcwq, false); + spin_lock_irq(&gcwq->lock); + if (worker) { + worker->flags |= WORKER_ROGUE; start_worker(worker); + } } - wait_event_timeout(gcwq->idle_wait, - gcwq->nr_idle == gcwq->nr_workers, HZ/10); - - spin_lock_irq(&gcwq->lock); + /* give a breather */ + if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0) + break; } - WARN_ON(gcwq->nr_workers != gcwq->nr_idle); + /* + * Either all works have been scheduled and cpu is down, or + * cpu down has already been canceled. Wait for and butcher + * all workers till we're canceled. + */ + do { + rc = trustee_wait_event(!list_empty(&gcwq->idle_list)); + while (!list_empty(&gcwq->idle_list)) + destroy_worker(list_first_entry(&gcwq->idle_list, + struct worker, entry)); + } while (gcwq->nr_workers && rc >= 0); - list_for_each_entry_safe(worker, n, &gcwq->idle_list, entry) - destroy_worker(worker); + /* + * At this point, either draining has completed and no worker + * is left, or cpu down has been canceled or the cpu is being + * brought back up. There shouldn't be any idle one left. + * Tell the remaining busy ones to rebind once it finishes the + * currently scheduled works by scheduling the rebind_work. + */ + WARN_ON(!list_empty(&gcwq->idle_list)); - WARN_ON(gcwq->nr_workers || gcwq->nr_idle); + for_each_busy_worker(worker, i, pos, gcwq) { + struct work_struct *rebind_work = &worker->rebind_work; - spin_unlock_irq(&gcwq->lock); + /* + * Rebind_work may race with future cpu hotplug + * operations. Use a separate flag to mark that + * rebinding is scheduled. + */ + worker->flags |= WORKER_REBIND; + worker->flags &= ~WORKER_ROGUE; - gcwq = get_gcwq(get_cpu_light()); - spin_lock_irq(&gcwq->lock); - list_for_each_entry_safe(work, nw, &non_affine_works, entry) { - list_del_init(&work->entry); - ___queue_work(get_work_cwq(work)->wq, gcwq, work); + /* queue rebind_work, wq doesn't matter, use the default one */ + if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, + work_data_bits(rebind_work))) + continue; + + debug_work_activate(rebind_work); + insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work, + worker->scheduled.next, + work_color_to_flags(WORK_NO_COLOR)); } + + /* relinquish manager role */ + gcwq->flags &= ~GCWQ_MANAGING_WORKERS; + + /* notify completion */ + gcwq->trustee = NULL; + gcwq->trustee_state = TRUSTEE_DONE; + wake_up_all(&gcwq->trustee_wait); spin_unlock_irq(&gcwq->lock); - put_cpu_light(); + return 0; } -static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb, +/** + * wait_trustee_state - wait for trustee to enter the specified state + * @gcwq: gcwq the trustee of interest belongs to + * @state: target state to wait for + * + * Wait for the trustee to reach @state. DONE is already matched. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * multiple times. To be used by cpu_callback. + */ +static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state) +__releases(&gcwq->lock) +__acquires(&gcwq->lock) +{ + if (!(gcwq->trustee_state == state || + gcwq->trustee_state == TRUSTEE_DONE)) { + spin_unlock_irq(&gcwq->lock); + __wait_event(gcwq->trustee_wait, + gcwq->trustee_state == state || + gcwq->trustee_state == TRUSTEE_DONE); + spin_lock_irq(&gcwq->lock); + } +} + +static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; struct global_cwq *gcwq = get_gcwq(cpu); + struct task_struct *new_trustee = NULL; + struct worker *uninitialized_var(new_worker); + unsigned long flags; action &= ~CPU_TASKS_FROZEN; - switch (action) { - case CPU_DOWN_PREPARE: - flush_gcwq(gcwq); - break; - } + switch (action) { + case CPU_DOWN_PREPARE: + new_trustee = kthread_create(trustee_thread, gcwq, + "workqueue_trustee/%d\n", cpu); + if (IS_ERR(new_trustee)) + return notifier_from_errno(PTR_ERR(new_trustee)); + kthread_bind(new_trustee, cpu); + /* fall through */ + case CPU_UP_PREPARE: + BUG_ON(gcwq->first_idle); + new_worker = create_worker(gcwq, false); + if (!new_worker) { + if (new_trustee) + kthread_stop(new_trustee); + return NOTIFY_BAD; + } + break; + case CPU_POST_DEAD: + case CPU_UP_CANCELED: + case CPU_DOWN_FAILED: + case CPU_ONLINE: + break; + case CPU_DYING: + /* + * We access this lockless. We are on the dying CPU + * and called from stomp machine. + * + * Before this, the trustee and all workers except for + * the ones which are still executing works from + * before the last CPU down must be on the cpu. After + * this, they'll all be diasporas. + */ + gcwq->flags |= GCWQ_DISASSOCIATED; + default: + goto out; + } + + /* some are called w/ irq disabled, don't disturb irq status */ + spin_lock_irqsave(&gcwq->lock, flags); + + switch (action) { + case CPU_DOWN_PREPARE: + /* initialize trustee and tell it to acquire the gcwq */ + BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE); + gcwq->trustee = new_trustee; + gcwq->trustee_state = TRUSTEE_START; + wake_up_process(gcwq->trustee); + wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE); + /* fall through */ + case CPU_UP_PREPARE: + BUG_ON(gcwq->first_idle); + gcwq->first_idle = new_worker; + break; + + case CPU_POST_DEAD: + gcwq->trustee_state = TRUSTEE_BUTCHER; + /* fall through */ + case CPU_UP_CANCELED: + destroy_worker(gcwq->first_idle); + gcwq->first_idle = NULL; + break; + case CPU_DOWN_FAILED: + case CPU_ONLINE: + gcwq->flags &= ~GCWQ_DISASSOCIATED; + if (gcwq->trustee_state != TRUSTEE_DONE) { + gcwq->trustee_state = TRUSTEE_RELEASE; + wake_up_process(gcwq->trustee); + wait_trustee_state(gcwq, TRUSTEE_DONE); + } + + /* + * Trustee is done and there might be no worker left. + * Put the first_idle in and request a real manager to + * take a look. + */ + spin_unlock_irq(&gcwq->lock); + kthread_bind(gcwq->first_idle->task, cpu); + spin_lock_irq(&gcwq->lock); + gcwq->flags |= GCWQ_MANAGE_WORKERS; + start_worker(gcwq->first_idle); + gcwq->first_idle = NULL; + break; + } + + spin_unlock_irqrestore(&gcwq->lock, flags); +out: return notifier_from_errno(0); } @@ -3492,8 +3752,7 @@ static int __init init_workqueues(void) unsigned int cpu; int i; - cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_ACTIVE); - hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_INACTIVE); + cpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE); /* initialize gcwqs */ for_each_gcwq_cpu(cpu) { @@ -3516,7 +3775,9 @@ static int __init init_workqueues(void) (unsigned long)gcwq); ida_init(&gcwq->worker_ida); - init_waitqueue_head(&gcwq->idle_wait); + + gcwq->trustee_state = TRUSTEE_DONE; + init_waitqueue_head(&gcwq->trustee_wait); } /* create the initial worker */ -- 1.7.10.4 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/