Message-Id: <20211007031756.345269-1-eric.dumazet@gmail.com>
Date: Wed, 6 Oct 2021 20:17:56 -0700
From: Eric Dumazet <eric.dumazet@...il.com>
To: Thomas Gleixner <tglx@...utronix.de>
Cc: linux-kernel <linux-kernel@...r.kernel.org>,
Eric Dumazet <edumazet@...gle.com>,
Eric Dumazet <eric.dumazet@...il.com>,
Dave Hansen <dave.hansen@...ux.intel.com>,
Borislav Petkov <bp@...e.de>,
Peter Zijlstra <peterz@...radead.org>
Subject: [PATCH] x86/apic: Reduce cache line misses in __x2apic_send_IPI_mask()
From: Eric Dumazet <edumazet@...gle.com>
Using per-CPU storage for @x86_cpu_to_logical_apicid is not optimal:
although __x2apic_send_IPI_mask() walks the target mask with standard
bitmask operators, a broadcast IPI has to touch at least one cache
line per CPU just to read this one u32 field.

Converting x86_cpu_to_logical_apicid to a plain array reduces the
number of cache lines needed by 16x, because 16 u32 values fit in a
single 64-byte cache line, and the sequential accesses let the CPU
prefetcher kick in nicely.
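As a sanity check on the 16x figure (assuming 64-byte cache lines and
4-byte u32 entries): a full broadcast on a 256-CPU machine touches
256 / 16 = 16 cache lines for this array, instead of up to 256 cache
lines with per-CPU storage.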
Also move @cluster_masks to the read-mostly per-CPU section to avoid
possible false sharing with frequently written per-CPU data.
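To illustrate the false-sharing concern, here is a minimal userspace
sketch (not from this patch; the struct and field names are made up):

/*
 * One thread hammers "hot" while another only reads "ptr".  In
 * layout_bad both fields share a 64-byte cache line, so every write
 * to hot invalidates the reader's cached copy of ptr; in layout_good
 * each field sits in its own cache line.
 */
#include <stdint.h>

struct layout_bad {
	uint64_t hot;		/* frequently written */
	void *ptr;		/* read-mostly, but same cache line */
};

struct layout_good {
	uint64_t hot __attribute__((aligned(64)));
	void *ptr __attribute__((aligned(64)));	/* own cache line */
};

Grouping rarely written per-CPU variables with
DEFINE_PER_CPU_READ_MOSTLY achieves the same kind of separation: they
land in a section away from frequently written per-CPU data.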
Tested on a dual-socket host with 256 CPUs: the average cost of a full
broadcast drops from 33 usec to 11 usec.
Signed-off-by: Eric Dumazet <edumazet@...gle.com>
Cc: Dave Hansen <dave.hansen@...ux.intel.com>
Cc: Borislav Petkov <bp@...e.de>
Cc: Peter Zijlstra (Intel) <peterz@...radead.org>
Cc: Thomas Gleixner <tglx@...utronix.de>
---
arch/x86/kernel/apic/x2apic_cluster.c | 18 +++++++++++-------
1 file changed, 11 insertions(+), 7 deletions(-)
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index f4da9bb69a8859ff10824315388aeb49c2ccfad9..8ba23bfbc91c559e27fefdb8c9d401edd83022b1 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -15,9 +15,14 @@ struct cluster_mask {
struct cpumask mask;
};
-static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
+/* __x2apic_send_IPI_mask() possibly needs to read
+ * x86_cpu_to_logical_apicid for all online CPUs sequentially.
+ * Using a per-CPU variable would cost one cache line per CPU.
+ */
+static u32 x86_cpu_to_logical_apicid[NR_CPUS] __read_mostly;
+
static DEFINE_PER_CPU(cpumask_var_t, ipi_mask);
-static DEFINE_PER_CPU(struct cluster_mask *, cluster_masks);
+static DEFINE_PER_CPU_READ_MOSTLY(struct cluster_mask *, cluster_masks);
static struct cluster_mask *cluster_hotplug_mask;
static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
@@ -27,7 +32,7 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
static void x2apic_send_IPI(int cpu, int vector)
{
- u32 dest = per_cpu(x86_cpu_to_logical_apicid, cpu);
+ u32 dest = x86_cpu_to_logical_apicid[cpu];
/* x2apic MSRs are special and need a special fence: */
weak_wrmsr_fence();
@@ -58,11 +63,10 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
dest = 0;
for_each_cpu_and(clustercpu, tmpmsk, &cmsk->mask)
- dest |= per_cpu(x86_cpu_to_logical_apicid, clustercpu);
+ dest |= x86_cpu_to_logical_apicid[clustercpu];
if (!dest)
continue;
-
__x2apic_send_IPI_dest(dest, vector, APIC_DEST_LOGICAL);
/* Remove cluster CPUs from tmpmask */
cpumask_andnot(tmpmsk, tmpmsk, &cmsk->mask);
@@ -94,7 +98,7 @@ static void x2apic_send_IPI_all(int vector)
static u32 x2apic_calc_apicid(unsigned int cpu)
{
- return per_cpu(x86_cpu_to_logical_apicid, cpu);
+ return x86_cpu_to_logical_apicid[cpu];
}
static void init_x2apic_ldr(void)
@@ -103,7 +107,7 @@ static void init_x2apic_ldr(void)
u32 cluster, apicid = apic_read(APIC_LDR);
unsigned int cpu;
- this_cpu_write(x86_cpu_to_logical_apicid, apicid);
+ x86_cpu_to_logical_apicid[smp_processor_id()] = apicid;
if (cmsk)
goto update;
--
2.33.0.882.g93a45727a2-goog