Message-Id: <1519681990-31349-19-git-send-email-paulmck@linux.vnet.ibm.com>
Date:   Mon, 26 Feb 2018 13:53:09 -0800
From:   "Paul E. McKenney" <paulmck@...ux.vnet.ibm.com>
To:     linux-kernel@...r.kernel.org
Cc:     mingo@...nel.org, jiangshanlai@...il.com, dipankar@...ibm.com,
        akpm@...ux-foundation.org, mathieu.desnoyers@...icios.com,
        josh@...htriplett.org, tglx@...utronix.de, peterz@...radead.org,
        rostedt@...dmis.org, dhowells@...hat.com, edumazet@...gle.com,
        fweisbec@...il.com, oleg@...hat.com,
        "Paul E. McKenney" <paulmck@...ux.vnet.ibm.com>
Subject: [PATCH tip/core/rcu 19/20] rcu: Make expedited RCU CPU selection avoid unnecessary stores

This commit reworks the first loop in sync_rcu_exp_select_cpus()
to avoid doing unnecessary stores to other CPUs' rcu_data
structures.  This speeds up that first loop by roughly a factor of
two on an old x86 system.  When the system is mostly idle, this
loop accounts for a large fraction of the overhead of
synchronize_rcu_expedited().  There is less benefit on busy systems
because the overhead of the smp_call_function_single() calls in the
second loop dominates in that case.
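
For illustration, here is a stand-alone user-space sketch of the
store-avoidance pattern; it is not the kernel code itself, and every
name ending in _sketch, as well as the example inputs, is invented for
this example.  The idea is to take the snapshot into a local variable
first and store it into the per-CPU structure only when that CPU will
actually be sent an IPI:

#include <stdio.h>

#define NCPUS_SKETCH 8

struct rdp_sketch {
	int exp_dynticks_snap;		/* written only when it will be read */
} rdp_sketch[NCPUS_SKETCH];

/* Pretend snapshot: even values stand for "idle" (extended QS). */
static int snap_sketch(int cpu)
{
	return cpu;
}

static int in_eqs_sketch(int snap)
{
	return !(snap & 1);
}

int main(void)
{
	unsigned long expmask = 0xb6;	/* CPUs still owing a quiescent state */
	unsigned long mask_ofl_test = 0;
	int self = 5;			/* pretend this is the current CPU */

	for (int cpu = 0; cpu < NCPUS_SKETCH; cpu++) {
		unsigned long bit = 1UL << cpu;

		if (!(expmask & bit))
			continue;	/* skip CPUs not in the mask at all */
		if (cpu == self) {
			mask_ofl_test |= bit;	/* no snapshot, no store */
		} else {
			int snap = snap_sketch(cpu);

			if (in_eqs_sketch(snap))
				mask_ofl_test |= bit;	/* idle: no store */
			else
				rdp_sketch[cpu].exp_dynticks_snap = snap;
		}
	}
	printf("CPUs to IPI: %#lx\n", expmask & ~mask_ofl_test);
	return 0;
}

Run as-is, the sketch reports an IPI mask of 0x82, that is, a store
was needed only for the two non-idle remote CPUs.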

Although busy systems see less benefit, it is not unusual to make
configuration changes involving RCU grace periods (both expedited and
normal) while the system is mostly idle, so this optimization is worth
doing.

While we are in the area, this commit also adds parentheses to arguments
used by the for_each_leaf_node_possible_cpu() macro.
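
The parentheses matter because a macro argument can be an arbitrary
expression.  A minimal user-space example of the failure mode being
guarded against (the macro names here are invented, not the kernel
ones):

#include <stdio.h>

#define TWICE_BAD(x)  (x * 2)      /* argument not parenthesized */
#define TWICE_GOOD(x) ((x) * 2)    /* argument parenthesized */

int main(void)
{
	/* TWICE_BAD(1 + 2)  expands to (1 + 2 * 2)   == 5 */
	/* TWICE_GOOD(1 + 2) expands to ((1 + 2) * 2) == 6 */
	printf("%d %d\n", TWICE_BAD(1 + 2), TWICE_GOOD(1 + 2));
	return 0;
}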

Signed-off-by: Paul E. McKenney <paulmck@...ux.vnet.ibm.com>
---
 kernel/rcu/rcu.h      | 16 +++++++++++++---
 kernel/rcu/tree_exp.h | 21 ++++++++++++++-------
 2 files changed, 27 insertions(+), 10 deletions(-)
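
As a side note for review, a stand-alone user-space sketch of how the
new for_each_leaf_node_cpu_mask() walks a mask: bit N of the mask maps
to CPU (rnp)->grplo + N, and iteration stops once the candidate CPU
passes (rnp)->grphi.  All *_sketch names below are invented for
illustration, and find_next_bit_sketch() only mimics the kernel's
find_next_bit().

#include <stdio.h>

#define BITS_PER_LONG_SKETCH ((int)(sizeof(unsigned long) * 8))

struct rnp_sketch {
	int grplo;	/* lowest CPU covered by this leaf node */
	int grphi;	/* highest CPU covered by this leaf node */
};

/* Index of the next set bit at or after offset, or size if none remain. */
static int find_next_bit_sketch(unsigned long mask, int size, int offset)
{
	for (int i = offset; i < size; i++)
		if (mask & (1UL << i))
			return i;
	return size;
}

int main(void)
{
	struct rnp_sketch rnp = { .grplo = 16, .grphi = 31 };
	unsigned long mask = 0x05;	/* bits 0 and 2: CPUs 16 and 18 */
	int cpu;

	for (cpu = rnp.grplo + find_next_bit_sketch(mask, BITS_PER_LONG_SKETCH, 0);
	     cpu <= rnp.grphi;
	     cpu = rnp.grplo + find_next_bit_sketch(mask, BITS_PER_LONG_SKETCH, cpu + 1 - rnp.grplo))
		printf("visiting CPU %d\n", cpu);	/* prints 16, then 18 */
	return 0;
}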

diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 507a0802c717..1c868bcfd705 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -301,9 +301,19 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt)
  * Iterate over all possible CPUs in a leaf RCU node.
  */
 #define for_each_leaf_node_possible_cpu(rnp, cpu) \
-	for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \
-	     cpu <= rnp->grphi; \
-	     cpu = cpumask_next((cpu), cpu_possible_mask))
+	for ((cpu) = cpumask_next((rnp)->grplo - 1, cpu_possible_mask); \
+	     (cpu) <= rnp->grphi; \
+	     (cpu) = cpumask_next((cpu), cpu_possible_mask))
+
+/*
+ * Iterate over all CPUs in a leaf RCU node's specified mask.
+ */
+#define rcu_find_next_bit(rnp, cpu, mask) \
+	((rnp)->grplo + find_next_bit(&(mask), BITS_PER_LONG, (cpu)))
+#define for_each_leaf_node_cpu_mask(rnp, cpu, mask) \
+	for ((cpu) = rcu_find_next_bit((rnp), 0, (mask)); \
+	     (cpu) <= rnp->grphi; \
+	     (cpu) = rcu_find_next_bit((rnp), (cpu) + 1 - (rnp->grplo), (mask)))
 
 /*
  * Wrappers for the rcu_node::lock acquire and release.
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index fecb6b6ab452..6ad87642f44a 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -383,15 +383,22 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
 
 		/* Each pass checks a CPU for identity, offline, and idle. */
 		mask_ofl_test = 0;
-		for_each_leaf_node_possible_cpu(rnp, cpu) {
+		for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
+			unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
 			struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+			struct rcu_dynticks *rdtp = per_cpu_ptr(&rcu_dynticks, cpu);
+			int snap;
 
-			rdp->exp_dynticks_snap =
-				rcu_dynticks_snap(rdp->dynticks);
 			if (raw_smp_processor_id() == cpu ||
-			    rcu_dynticks_in_eqs(rdp->exp_dynticks_snap) ||
-			    !(rnp->qsmaskinitnext & rdp->grpmask))
-				mask_ofl_test |= rdp->grpmask;
+			    !(rnp->qsmaskinitnext & mask)) {
+				mask_ofl_test |= mask;
+			} else {
+				snap = rcu_dynticks_snap(rdtp);
+				if (rcu_dynticks_in_eqs(snap))
+					mask_ofl_test |= mask;
+				else
+					rdp->exp_dynticks_snap = snap;
+			}
 		}
 		mask_ofl_ipi = rnp->expmask & ~mask_ofl_test;
 
@@ -405,7 +412,7 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
 		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 
 		/* IPI the remaining CPUs for expedited quiescent state. */
-		for_each_leaf_node_possible_cpu(rnp, cpu) {
+		for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
 			unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
 			struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
 
-- 
2.5.2
