lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:   Wed, 27 Jun 2018 11:11:06 +0200
From:   Peter Zijlstra <peterz@...radead.org>
To:     "Paul E. McKenney" <paulmck@...ux.vnet.ibm.com>
Cc:     linux-kernel@...r.kernel.org, mingo@...nel.org,
        jiangshanlai@...il.com, dipankar@...ibm.com,
        akpm@...ux-foundation.org, mathieu.desnoyers@...icios.com,
        josh@...htriplett.org, tglx@...utronix.de, rostedt@...dmis.org,
        dhowells@...hat.com, edumazet@...gle.com, fweisbec@...il.com,
        oleg@...hat.com, joel@...lfernandes.org
Subject: Re: [PATCH tip/core/rcu 13/22] rcu: Fix grace-period hangs due to
 race with CPU offline

On Tue, Jun 26, 2018 at 04:40:04PM -0700, Paul E. McKenney wrote:
> The options I have considered are as follows:

> 2.	Stick with the no-failsafe approach, but rely on RCU's grace-period
> 	kthread to wake up later due to its timed wait during the
> 	force-quiescent-state process.  This would be a bit obnoxious,
> 	as it requires passing a don't-wake flag (or some such) up the
> 	quiescent-state reporting mechanism.  It would also needlessly
> 	delay grace-period ends, especially on large systems (RCU scales
> 	up the FQS delay on larger systems to maintain limited CPU
> 	consumption per unit time).
> 
> 3.	Stick with the no-failsafe approach, but have the quiescent-state
> 	reporting code hand back a value indicating that a wakeup is needed.
> 	Also a bit obnoxious, as this value would need to be threaded up
> 	the reporting code's return path.  Simple in theory, but a bit
> 	of an ugly change, especially for the many places in the code that
> 	currently expect quiescent-state reporting to be an unconditional
> 	fire-and-forget operation.

Here's a variant on 2+3: instead of propagating the state back, we
completely ignore whether a wakeup was needed, and then unconditionally
wake the GP kthread from the managing CPU's rcutree_migrate_callbacks()
invocation.

Hotplug is rare (or should damn well be), so doing a spurious wake of
the GP kthread shouldn't matter here.

The extra argument isn't really pretty, but it's not nearly as bad as feared.

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 7832dd556490..d4c38d8d3621 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -104,7 +104,6 @@ struct rcu_state sname##_state = { \
 	.abbr = sabbr, \
 	.exp_mutex = __MUTEX_INITIALIZER(sname##_state.exp_mutex), \
 	.exp_wake_mutex = __MUTEX_INITIALIZER(sname##_state.exp_wake_mutex), \
-	.ofl_lock = __SPIN_LOCK_UNLOCKED(sname##_state.ofl_lock), \
 }
 
 RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
@@ -160,7 +159,8 @@ static int rcu_scheduler_fully_active __read_mostly;
 
 static void
 rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
-		  struct rcu_node *rnp, unsigned long gps, unsigned long flags);
+		  struct rcu_node *rnp, unsigned long gps,
+		  unsigned long flags, bool no_wakeup);
 static void rcu_init_new_rnp(struct rcu_node *rnp_leaf);
 static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf);
 static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
@@ -1928,13 +1928,11 @@ static bool rcu_gp_init(struct rcu_state *rsp)
 	 */
 	rsp->gp_state = RCU_GP_ONOFF;
 	rcu_for_each_leaf_node(rsp, rnp) {
-		spin_lock(&rsp->ofl_lock);
 		raw_spin_lock_irq_rcu_node(rnp);
 		if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
 		    !rnp->wait_blkd_tasks) {
 			/* Nothing to do on this leaf rcu_node structure. */
 			raw_spin_unlock_irq_rcu_node(rnp);
-			spin_unlock(&rsp->ofl_lock);
 			continue;
 		}
 
@@ -1970,7 +1968,6 @@ static bool rcu_gp_init(struct rcu_state *rsp)
 		}
 
 		raw_spin_unlock_irq_rcu_node(rnp);
-		spin_unlock(&rsp->ofl_lock);
 	}
 	rcu_gp_slow(rsp, gp_preinit_delay); /* Races with CPU hotplug. */
 
@@ -2004,7 +2001,7 @@ static bool rcu_gp_init(struct rcu_state *rsp)
 		mask = rnp->qsmask & ~rnp->qsmaskinitnext;
 		rnp->rcu_gp_init_mask = mask;
 		if ((mask || rnp->wait_blkd_tasks) && rcu_is_leaf_node(rnp))
-			rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags);
+			rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags, false);
 		else
 			raw_spin_unlock_irq_rcu_node(rnp);
 		cond_resched_tasks_rcu_qs();
@@ -2247,14 +2244,17 @@ static int __noreturn rcu_gp_kthread(void *arg)
  * just-completed grace period.  Note that the caller must hold rnp->lock,
  * which is released before return.
  */
-static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
+static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags,
+			      bool no_wakeup)
 	__releases(rcu_get_root(rsp)->lock)
 {
 	raw_lockdep_assert_held_rcu_node(rcu_get_root(rsp));
 	WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
 	WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
 	raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags);
-	rcu_gp_kthread_wake(rsp);
+
+	if (!no_wakeup)
+		rcu_gp_kthread_wake(rsp);
 }
 
 /*
@@ -2273,7 +2273,8 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
  */
 static void
 rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
-		  struct rcu_node *rnp, unsigned long gps, unsigned long flags)
+		  struct rcu_node *rnp, unsigned long gps,
+		  unsigned long flags, bool no_wakeup)
 	__releases(rnp->lock)
 {
 	unsigned long oldmask = 0;
@@ -2326,7 +2327,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
 	 * state for this grace period.  Invoke rcu_report_qs_rsp()
 	 * to clean up and start the next grace period if one is needed.
 	 */
-	rcu_report_qs_rsp(rsp, flags); /* releases rnp->lock. */
+	rcu_report_qs_rsp(rsp, flags, no_wakeup); /* releases rnp->lock. */
 }
 
 /*
@@ -2361,7 +2362,7 @@ rcu_report_unblock_qs_rnp(struct rcu_state *rsp,
 		 * Only one rcu_node structure in the tree, so don't
 		 * try to report up to its nonexistent parent!
 		 */
-		rcu_report_qs_rsp(rsp, flags);
+		rcu_report_qs_rsp(rsp, flags, false);
 		return;
 	}
 
@@ -2370,7 +2371,7 @@ rcu_report_unblock_qs_rnp(struct rcu_state *rsp,
 	mask = rnp->grpmask;
 	raw_spin_unlock_rcu_node(rnp);	/* irqs remain disabled. */
 	raw_spin_lock_rcu_node(rnp_p);	/* irqs already disabled. */
-	rcu_report_qs_rnp(mask, rsp, rnp_p, gps, flags);
+	rcu_report_qs_rnp(mask, rsp, rnp_p, gps, flags, false);
 }
 
 /*
@@ -2413,7 +2414,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
 		 */
 		needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
 
-		rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags);
+		rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags, false);
 		/* ^^^ Released rnp->lock */
 		if (needwake)
 			rcu_gp_kthread_wake(rsp);
@@ -2711,7 +2712,7 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp))
 		}
 		if (mask != 0) {
 			/* Idle/offline CPUs, report (releases rnp->lock). */
-			rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags);
+			rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags, false);
 		} else {
 			/* Nothing to do here, so just drop the lock. */
 			raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@@ -3745,7 +3746,7 @@ void rcu_cpu_starting(unsigned int cpu)
 		rdp->rcu_onl_gp_flags = READ_ONCE(rsp->gp_flags);
 		if (rnp->qsmask & mask) { /* RCU waiting on incoming CPU? */
 			/* Report QS -after- changing ->qsmaskinitnext! */
-			rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags);
+			rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags, false);
 		} else {
 			raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 		}
@@ -3768,18 +3769,15 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
 
 	/* Remove outgoing CPU from mask in the leaf rcu_node structure. */
 	mask = rdp->grpmask;
-	spin_lock(&rsp->ofl_lock);
 	raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
 	rdp->rcu_ofl_gp_seq = READ_ONCE(rsp->gp_seq);
 	rdp->rcu_ofl_gp_flags = READ_ONCE(rsp->gp_flags);
+	rnp->qsmaskinitnext &= ~mask;
 	if (rnp->qsmask & mask) { /* RCU waiting on outgoing CPU? */
-		/* Report quiescent state -before- changing ->qsmaskinitnext! */
-		rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags);
+		rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags, true);
 		raw_spin_lock_irqsave_rcu_node(rnp, flags);
 	}
-	rnp->qsmaskinitnext &= ~mask;
 	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
-	spin_unlock(&rsp->ofl_lock);
 }
 
 /*
@@ -3849,6 +3847,14 @@ void rcutree_migrate_callbacks(int cpu)
 {
 	struct rcu_state *rsp;
 
+	/*
+	 * rcu_cleanup_dying_idle_cpu() reports its quiescent state with
+	 * no_wakeup set, so just in case the outgoing CPU needed to wake
+	 * a GP kthread, do so here for every RCU flavor.
+	 */
+	for_each_rcu_flavor(rsp)
+		rcu_gp_kthread_wake(rsp);
+
 	for_each_rcu_flavor(rsp)
 		rcu_migrate_callbacks(cpu, rsp);
 }
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 4e74df768c57..8dab71838141 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -367,10 +367,6 @@ struct rcu_state {
 	const char *name;			/* Name of structure. */
 	char abbr;				/* Abbreviated name. */
 	struct list_head flavors;		/* List of RCU flavors. */
-
-	spinlock_t ofl_lock ____cacheline_internodealigned_in_smp;
-						/* Synchronize offline with */
-						/*  GP pre-initialization. */
 };
 
 /* Values for rcu_state structure's gp_flags field. */

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ