lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20251116182839.939139-6-lrizzo@google.com>
Date: Sun, 16 Nov 2025 18:28:36 +0000
From: Luigi Rizzo <lrizzo@...gle.com>
To: Thomas Gleixner <tglx@...utronix.de>, Marc Zyngier <maz@...nel.org>, 
	Luigi Rizzo <rizzo.unipi@...il.com>, Paolo Abeni <pabeni@...hat.com>, 
	Andrew Morton <akpm@...ux-foundation.org>, Sean Christopherson <seanjc@...gle.com>, 
	Jacob Pan <jacob.jun.pan@...ux.intel.com>
Cc: linux-kernel@...r.kernel.org, linux-arch@...r.kernel.org, 
	Bjorn Helgaas <bhelgaas@...gle.com>, Willem de Bruijn <willemb@...gle.com>, 
	Luigi Rizzo <lrizzo@...gle.com>
Subject: [PATCH v2 5/8] x86/irq: soft_moderation: add support for posted_msi (intel)

On recent Intel CPUs, kernels compiled with CONFIG_X86_POSTED_MSI=y,
and the boot option "intremap=posted_msi", all MSI interrupts
that hit a CPU issue a single POSTED_MSI interrupt processed by
sysvec_posted_msi_notification() instead of having separate interrupts.

This change adds soft moderation hooks to the above handler.
Soft moderation on posted_msi does not require per-source enable,
irq_moderation.delay_us > 0 suffices.

To test it, run a kernel with the above options and enable moderation by
setting delay_us > 0.  The column "from_msi" in /proc/irq/soft_moderation
will show a non-zero value.

Change-Id: I07b83b428de6f6541e3903b553c1b837c68a0b7d
Signed-off-by: Luigi Rizzo <lrizzo@...gle.com>
---
 arch/x86/kernel/Makefile    |  2 +-
 arch/x86/kernel/irq.c       | 13 +++++++
 kernel/irq/irq_moderation.c | 38 ++++++++++++++++++-
 kernel/irq/irq_moderation.h | 73 ++++++++++++++++++++++++++++++++++++-
 4 files changed, 123 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index bc184dd38d993..530f5b5342eaa 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -46,7 +46,7 @@ KCOV_INSTRUMENT_unwind_guess.o				:= n
 
 CFLAGS_head32.o := -fno-stack-protector
 CFLAGS_head64.o := -fno-stack-protector
-CFLAGS_irq.o := -I $(src)/../include/asm/trace
+CFLAGS_irq.o := -I $(src)/../include/asm/trace -I $(srctree)/kernel/irq
 
 obj-y			+= head_$(BITS).o
 obj-y			+= head$(BITS).o
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 10721a1252269..1abdd21fa5c52 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -13,6 +13,8 @@
 #include <linux/export.h>
 #include <linux/irq.h>
 
+#include <irq_moderation.h>
+
 #include <asm/irq_stack.h>
 #include <asm/apic.h>
 #include <asm/io_apic.h>
@@ -448,6 +450,13 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi_notification)
 	inc_irq_stat(posted_msi_notification_count);
 	irq_enter();
 
+	if (posted_msi_moderation_enabled()) {
+		if (posted_msi_should_rearm(handle_pending_pir(pid->pir, regs)))
+			goto rearm;
+		else
+			goto common_end;
+	}
+
 	/*
 	 * Max coalescing count includes the extra round of handle_pending_pir
 	 * after clearing the outstanding notification bit. Hence, at most
@@ -458,6 +467,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi_notification)
 			break;
 	}
 
+rearm:
 	/*
 	 * Clear outstanding notification bit to allow new IRQ notifications,
 	 * do this last to maximize the window of interrupt coalescing.
@@ -471,6 +481,9 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi_notification)
 	 */
 	handle_pending_pir(pid->pir, regs);
 
+common_end:
+	posted_msi_moderation_epilogue();
+
 	apic_eoi();
 	irq_exit();
 	set_irq_regs(old_regs);
diff --git a/kernel/irq/irq_moderation.c b/kernel/irq/irq_moderation.c
index 72be9e88c3890..2d01e4cd4638b 100644
--- a/kernel/irq/irq_moderation.c
+++ b/kernel/irq/irq_moderation.c
@@ -11,6 +11,13 @@
 #include "internals.h"
 #include "irq_moderation.h"
 
+#ifdef CONFIG_X86
+#include <asm/apic.h>
+#include <asm/irq_remapping.h>
+#else
+static inline bool posted_msi_supported(void) { return false; }
+#endif
+
 /*
  * Platform-wide software interrupt moderation.
  *
@@ -32,6 +39,10 @@
  *   moderation on that CPU/irq. If so, calls disable_irq_nosync() and starts
  *   an hrtimer with appropriate delay.
  *
+ * - Intel only: using "intremap=posted_msi", all the above is done in
+ *   sysvec_posted_msi_notification(). In this case all host device interrupts
+ *   are subject to moderation.
+ *
  * - the timer callback calls enable_irq() for all disabled interrupts on that
  *   CPU. That in turn will generate interrupts if there are pending events.
  *
@@ -230,6 +241,17 @@ static enum hrtimer_restart timer_cb(struct hrtimer *timer)
 
 	ms->rounds_left--;
 
+#ifdef CONFIG_X86_POSTED_MSI
+	if (ms->kick_posted_msi) {
+		if (ms->rounds_left == 0)
+			ms->kick_posted_msi = false;
+		/* Next call will be from timer, count it conditionally. */
+		ms->dont_count = !irq_mod_info.count_timer_calls;
+		ms->timer_calls++;
+		apic->send_IPI_self(POSTED_MSI_NOTIFICATION_VECTOR);
+	}
+#endif
+
 	if (ms->rounds_left > 0) {
 		/* Timer still alive, just call the handlers. */
 		list_for_each_entry_safe(desc, next, &ms->descs, mod.ms_node) {
@@ -332,7 +354,7 @@ static int moderation_show(struct seq_file *p, void *v)
 	}
 
 	seq_printf(p, "\n"
-		   "enabled              %s\n"
+		   "enabled              %s%s\n"
 		   "delay_us             %u\n"
 		   "timer_rounds         %u\n"
 		   "target_irq_rate      %u\n"
@@ -344,6 +366,7 @@ static int moderation_show(struct seq_file *p, void *v)
 		   "decay_factor         %u\n"
 		   "grow_factor          %u\n",
 		   str_yes_no(delay_us > 0),
+		   posted_msi_supported() ? " (also on posted_msi)" : "",
 		   delay_us, irq_mod_info.timer_rounds,
 		   irq_mod_info.target_irq_rate, irq_mod_info.hardirq_percent,
 		   irq_mod_info.update_ms, irq_mod_info.scale_cpus,
@@ -389,6 +412,7 @@ static struct param_names param_names[] = {
 	{},
 	{ "scale_cpus", &irq_mod_info.scale_cpus, 50, 1000 },
 	{ "count_timer_calls", &irq_mod_info.count_timer_calls, 0, 1 },
+	{ "count_msi_calls", &irq_mod_info.count_msi_calls, 0, 1 },
 	{ "decay_factor", &irq_mod_info.decay_factor, 8, 64 },
 	{ "grow_factor", &irq_mod_info.grow_factor, 8, 64 },
 };
@@ -476,6 +500,18 @@ static ssize_t mode_write(struct file *f, const char __user *buf, size_t count,
 	ret = kstrtobool(cmd, &enable);
 	if (!ret)
 		ret = set_moderation_mode(desc, enable);
+	if (ret) {
+		/* extra helpers for prodkernel */
+		if (cmd[count - 1] == '\n')
+			cmd[count - 1] = '\0';
+		ret = 0;
+		if (!strcmp(cmd, "managed"))
+			irqd_set(&desc->irq_data, IRQD_AFFINITY_MANAGED);
+		else if (!strcmp(cmd, "unmanaged"))
+			irqd_clear(&desc->irq_data, IRQD_AFFINITY_MANAGED);
+		else
+			ret = -EINVAL;
+	}
 	return ret ? : count;
 }
 
diff --git a/kernel/irq/irq_moderation.h b/kernel/irq/irq_moderation.h
index 3543e8e8b6e2d..69bbbb7b2ec80 100644
--- a/kernel/irq/irq_moderation.h
+++ b/kernel/irq/irq_moderation.h
@@ -145,7 +145,8 @@ static inline void irq_moderation_adjust_delay(struct irq_mod_state *ms)
 {
 	u64 now, delta_time;
 
-	ms->irq_count++;
+	/* dont_count can only be set in timer calls from posted_msi */
+	ms->irq_count += !ms->dont_count;
 	/* ktime_get_ns() is expensive, don't do too often */
 	if (ms->irq_count & 0xf)
 		return;
@@ -196,6 +197,15 @@ static inline void irq_moderation_hook(struct irq_desc *desc)
 	if (!static_branch_unlikely(&irq_moderation_enabled_key))
 		return;
 
+#ifdef CONFIG_X86_POSTED_MSI
+	if (ms->in_posted_msi) {
+		/* these calls are not moderated */
+		ms->from_posted_msi++;
+		ms->irq_count += irq_mod_info.count_msi_calls;
+		return;
+	}
+#endif
+
 	if (!READ_ONCE(desc->mod.enable))
 		return;
 
@@ -243,6 +253,61 @@ static inline void irq_moderation_epilogue(const struct irq_desc *desc)
 		irq_moderation_start_timer(ms);
 }
 
+#ifdef CONFIG_X86_POSTED_MSI
+/*
+ * Helpers for to sysvec_posted_msi_notification(), use as follows
+ *
+ *	if (posted_msi_moderation_enabled()) {
+ *		if (posted_msi_should_rearm(handle_pending_pir(pid->pir, regs)))
+ *			goto rearm;
+ *		else
+ *			goto common_end;
+ *	}
+ *	...
+ *  common_end:
+ *	posted_msi_moderation_epilogue();
+ */
+static inline bool posted_msi_moderation_enabled(void)
+{
+	struct irq_mod_state *ms = this_cpu_ptr(&irq_mod_state);
+
+	if (!static_branch_unlikely(&irq_moderation_enabled_key))
+		return false;
+	irq_moderation_adjust_delay(ms);
+	/* Tell handlers to not throttle next calls. */
+	ms->in_posted_msi = true;
+	return true;
+}
+
+/* Decide whether or not to rearm posted_msi. */
+static inline bool posted_msi_should_rearm(bool work_done)
+{
+	struct irq_mod_state *ms = this_cpu_ptr(&irq_mod_state);
+
+	/* No rearm if there is a timer pending. */
+	if (ms->rounds_left > 0)
+		return false;
+	/* No work done, can rearm. */
+	if (!work_done)
+		return true;
+	if (!irq_moderation_needed(ms))
+		return true;
+	/* Start the timer, inform the handler, and do not rearm. */
+	ms->kick_posted_msi = true;
+	irq_moderation_start_timer(ms);
+	return false;
+}
+
+/* Cleanup state set in posted_msi_moderation_enabled() */
+static inline void posted_msi_moderation_epilogue(void)
+{
+	struct irq_mod_state *ms = this_cpu_ptr(&irq_mod_state);
+
+	ms->in_posted_msi = false;
+	ms->dont_count = false;
+}
+#endif
+
 void irq_moderation_procfs_add(struct irq_desc *desc, umode_t umode);
 void irq_moderation_procfs_remove(struct irq_desc *desc);
 
@@ -251,6 +316,12 @@ void irq_moderation_procfs_remove(struct irq_desc *desc);
 static inline void irq_moderation_hook(struct irq_desc *desc) {}
 static inline void irq_moderation_epilogue(const struct irq_desc *desc) {}
 
+#ifdef CONFIG_X86_POSTED_MSI
+static inline bool posted_msi_moderation_enabled(void) { return false; }
+static inline bool posted_msi_should_rearm(bool work_done) { return false; }
+static inline void posted_msi_moderation_epilogue(void) {}
+#endif
+
 static inline void irq_moderation_procfs_add(struct irq_desc *desc, umode_t umode) {}
 static inline void irq_moderation_procfs_remove(struct irq_desc *desc) {}
 
-- 
2.52.0.rc1.455.g30608eb744-goog


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ