From 78da06714a7af4fcaba5564b77fe3e6024e953c5 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sat, 4 Mar 2023 13:35:43 -0800
Subject: [PATCH] cpumask: reintrudoce non-MAXSMP cpumask optimizations

Commit aa47a7c215e7 ("lib/cpumask: deprecate nr_cpumask_bits") resulted
in the cpumask operations becoming hugely less efficient, because
suddenly the cpumask was always considered to be variable-sized.

If we have CONFIG_NR_CPUS be 64 (to pick a reasonable value, and one I
happen to use), an operation like "cpumask_clear()" should literally be
a single "store 64 bits of zero into the cpumask".  That was very much
what nr_cpumask_bits was all about.

Without this optimization, instead of being a single store instruction,
cpumask_clear() would generate code like

        movl    nr_cpu_ids(%rip), %edx
        addq    $63, %rdx
        shrq    $3, %rdx
        andl    $-8, %edx
        callq   memset@PLT

on x86-64 to do that equivalent "clear one word", because it wanted to
make everything dynamically based on the actual number of CPU's in the
system.

This does end up tightening the rules a bit: operations that set bits in
the cpumask are limited to the actual nr_cpu_ids limit.  But if you just
clear bits, or scan for bits that are set, we can use the bigger
compile-time constant, which is much more efficient.

In the process, remove 'cpumask_complement()' and 'for_each_cpu_not()'
which were not useful, and which fundamentally have to be limited to
'nr_cpu_ids'.  Better remove them now than have somebody introduce use
of them later.

Of course, with MAXSMP there is no sane small compile-time constant for
the cpumap sizes, and we end up using the actual CPU bits, and will
generate the above kind of horrors regardless.  Please don't use MAXSMP
unless you really expect to have machines with thousands of cores.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 .clang-format           |  1 -
 arch/ia64/kernel/acpi.c |  4 +--
 include/linux/cpumask.h | 68 ++++++++++++++++++++++-------------------
 lib/cpumask_kunit.c     | 12 --------
 4 files changed, 37 insertions(+), 48 deletions(-)

diff --git a/.clang-format b/.clang-format
index 2c61b4553374..d988e9fa9b26 100644
--- a/.clang-format
+++ b/.clang-format
@@ -226,7 +226,6 @@ ForEachMacros:
   - 'for_each_console_srcu'
   - 'for_each_cpu'
   - 'for_each_cpu_and'
-  - 'for_each_cpu_not'
   - 'for_each_cpu_wrap'
   - 'for_each_dapm_widgets'
   - 'for_each_dedup_cand'
diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c
index 96d13cb7c19f..15f6cfddcc08 100644
--- a/arch/ia64/kernel/acpi.c
+++ b/arch/ia64/kernel/acpi.c
@@ -783,11 +783,9 @@ __init void prefill_possible_map(void)
 
 static int _acpi_map_lsapic(acpi_handle handle, int physid, int *pcpu)
 {
-	cpumask_t tmp_map;
 	int cpu;
 
-	cpumask_complement(&tmp_map, cpu_present_mask);
-	cpu = cpumask_first(&tmp_map);
+	cpu = cpumask_first_zero(cpu_present_mask);
 	if (cpu >= nr_cpu_ids)
 		return -EINVAL;
 
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 10c92bd9b807..bd9576e8d856 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -50,8 +50,30 @@ static inline void set_nr_cpu_ids(unsigned int nr)
 #endif
 }
 
-/* Deprecated. Always use nr_cpu_ids. */
-#define nr_cpumask_bits	nr_cpu_ids
+/*
+ * The difference between nr_cpumask_bits and nr_cpu_ids is that
+ * 'nr_cpu_ids' is the actual number of CPU ids in the system, while
+ * nr_cpumask_bits is a "reasonable upper value" that is often more
+ * efficient because it can be a fixed constant.
+ *
+ * So when clearing or traversing a cpumask, use 'nr_cpumask_bits',
+ * but when checking exact limits (and when _setting_ bits), use the
+ * tighter exact limit of 'nr_cpu_ids'.
+ *
+ * NOTE! The code depends on any exyta bits in nr_cpumask_bits a always
+ * being (a) allocated and (b) zero, so that the only effect of using
+ * 'nr_cpumask_bits' is that we might return a higher maximum CPU value
+ * (which is why we have that pattern of
+ *
+ *   Returns >= nr_cpu_ids if no cpus set.
+ *
+ * for many of the functions - they can return that higher value).
+ */
+#ifndef CONFIG_CPUMASK_OFFSTACK
+ #define nr_cpumask_bits ((unsigned int)NR_CPUS)
+#else
+ #define nr_cpumask_bits	nr_cpu_ids
+#endif
 
 /*
  * The following particular system cpumasks and operations manage
@@ -114,7 +136,7 @@ static __always_inline void cpu_max_bits_warn(unsigned int cpu, unsigned int bit
 /* verify cpu argument to cpumask_* operators */
 static __always_inline unsigned int cpumask_check(unsigned int cpu)
 {
-	cpu_max_bits_warn(cpu, nr_cpumask_bits);
+	cpu_max_bits_warn(cpu, nr_cpu_ids);
 	return cpu;
 }
 
@@ -248,16 +270,6 @@ unsigned int cpumask_next_and(int n, const struct cpumask *src1p,
 #define for_each_cpu(cpu, mask)				\
 	for_each_set_bit(cpu, cpumask_bits(mask), nr_cpumask_bits)
 
-/**
- * for_each_cpu_not - iterate over every cpu in a complemented mask
- * @cpu: the (optionally unsigned) integer iterator
- * @mask: the cpumask pointer
- *
- * After the loop, cpu is >= nr_cpu_ids.
- */
-#define for_each_cpu_not(cpu, mask)				\
-	for_each_clear_bit(cpu, cpumask_bits(mask), nr_cpumask_bits)
-
 #if NR_CPUS == 1
 static inline
 unsigned int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap)
@@ -495,10 +507,14 @@ static __always_inline bool cpumask_test_and_clear_cpu(int cpu, struct cpumask *
 /**
  * cpumask_setall - set all cpus (< nr_cpu_ids) in a cpumask
  * @dstp: the cpumask pointer
+ *
+ * Note: since we set bits, we should use the tighter 'bitmap_set()' with
+ * the eact number of bits, not 'bitmap_fill()' that will fill past the
+ * end.
  */
 static inline void cpumask_setall(struct cpumask *dstp)
 {
-	bitmap_fill(cpumask_bits(dstp), nr_cpumask_bits);
+	bitmap_set(cpumask_bits(dstp), 0, nr_cpu_ids);
 }
 
 /**
@@ -569,18 +585,6 @@ static inline bool cpumask_andnot(struct cpumask *dstp,
 					  cpumask_bits(src2p), nr_cpumask_bits);
 }
 
-/**
- * cpumask_complement - *dstp = ~*srcp
- * @dstp: the cpumask result
- * @srcp: the input to invert
- */
-static inline void cpumask_complement(struct cpumask *dstp,
-				      const struct cpumask *srcp)
-{
-	bitmap_complement(cpumask_bits(dstp), cpumask_bits(srcp),
-					      nr_cpumask_bits);
-}
-
 /**
  * cpumask_equal - *src1p == *src2p
  * @src1p: the first input
@@ -648,7 +652,7 @@ static inline bool cpumask_empty(const struct cpumask *srcp)
  */
 static inline bool cpumask_full(const struct cpumask *srcp)
 {
-	return bitmap_full(cpumask_bits(srcp), nr_cpumask_bits);
+	return bitmap_full(cpumask_bits(srcp), nr_cpu_ids);
 }
 
 /**
@@ -694,7 +698,7 @@ static inline void cpumask_shift_left(struct cpumask *dstp,
 				      const struct cpumask *srcp, int n)
 {
 	bitmap_shift_left(cpumask_bits(dstp), cpumask_bits(srcp), n,
-					      nr_cpumask_bits);
+					      nr_cpu_ids);
 }
 
 /**
@@ -742,7 +746,7 @@ static inline void cpumask_copy(struct cpumask *dstp,
 static inline int cpumask_parse_user(const char __user *buf, int len,
 				     struct cpumask *dstp)
 {
-	return bitmap_parse_user(buf, len, cpumask_bits(dstp), nr_cpumask_bits);
+	return bitmap_parse_user(buf, len, cpumask_bits(dstp), nr_cpu_ids);
 }
 
 /**
@@ -757,7 +761,7 @@ static inline int cpumask_parselist_user(const char __user *buf, int len,
 				     struct cpumask *dstp)
 {
 	return bitmap_parselist_user(buf, len, cpumask_bits(dstp),
-				     nr_cpumask_bits);
+				     nr_cpu_ids);
 }
 
 /**
@@ -769,7 +773,7 @@ static inline int cpumask_parselist_user(const char __user *buf, int len,
  */
 static inline int cpumask_parse(const char *buf, struct cpumask *dstp)
 {
-	return bitmap_parse(buf, UINT_MAX, cpumask_bits(dstp), nr_cpumask_bits);
+	return bitmap_parse(buf, UINT_MAX, cpumask_bits(dstp), nr_cpu_ids);
 }
 
 /**
@@ -781,7 +785,7 @@ static inline int cpumask_parse(const char *buf, struct cpumask *dstp)
  */
 static inline int cpulist_parse(const char *buf, struct cpumask *dstp)
 {
-	return bitmap_parselist(buf, cpumask_bits(dstp), nr_cpumask_bits);
+	return bitmap_parselist(buf, cpumask_bits(dstp), nr_cpu_ids);
 }
 
 /**
diff --git a/lib/cpumask_kunit.c b/lib/cpumask_kunit.c
index d1fc6ece21f3..ab798365b7dc 100644
--- a/lib/cpumask_kunit.c
+++ b/lib/cpumask_kunit.c
@@ -23,16 +23,6 @@
 		KUNIT_EXPECT_EQ_MSG((test), mask_weight, iter, MASK_MSG(mask));	\
 	} while (0)
 
-#define EXPECT_FOR_EACH_CPU_NOT_EQ(test, mask)					\
-	do {									\
-		const cpumask_t *m = (mask);					\
-		int mask_weight = cpumask_weight(m);				\
-		int cpu, iter = 0;						\
-		for_each_cpu_not(cpu, m)					\
-			iter++;							\
-		KUNIT_EXPECT_EQ_MSG((test), nr_cpu_ids - mask_weight, iter, MASK_MSG(mask));	\
-	} while (0)
-
 #define EXPECT_FOR_EACH_CPU_OP_EQ(test, op, mask1, mask2)			\
 	do {									\
 		const cpumask_t *m1 = (mask1);					\
@@ -113,14 +103,12 @@ static void test_cpumask_next(struct kunit *test)
 static void test_cpumask_iterators(struct kunit *test)
 {
 	EXPECT_FOR_EACH_CPU_EQ(test, &mask_empty);
-	EXPECT_FOR_EACH_CPU_NOT_EQ(test, &mask_empty);
 	EXPECT_FOR_EACH_CPU_WRAP_EQ(test, &mask_empty);
 	EXPECT_FOR_EACH_CPU_OP_EQ(test, and, &mask_empty, &mask_empty);
 	EXPECT_FOR_EACH_CPU_OP_EQ(test, and, cpu_possible_mask, &mask_empty);
 	EXPECT_FOR_EACH_CPU_OP_EQ(test, andnot, &mask_empty, &mask_empty);
 
 	EXPECT_FOR_EACH_CPU_EQ(test, cpu_possible_mask);
-	EXPECT_FOR_EACH_CPU_NOT_EQ(test, cpu_possible_mask);
 	EXPECT_FOR_EACH_CPU_WRAP_EQ(test, cpu_possible_mask);
 	EXPECT_FOR_EACH_CPU_OP_EQ(test, and, cpu_possible_mask, cpu_possible_mask);
 	EXPECT_FOR_EACH_CPU_OP_EQ(test, andnot, cpu_possible_mask, &mask_empty);
-- 
2.39.1.437.g88ae5c7f81