[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20251116182839.939139-3-lrizzo@google.com>
Date: Sun, 16 Nov 2025 18:28:33 +0000
From: Luigi Rizzo <lrizzo@...gle.com>
To: Thomas Gleixner <tglx@...utronix.de>, Marc Zyngier <maz@...nel.org>,
Luigi Rizzo <rizzo.unipi@...il.com>, Paolo Abeni <pabeni@...hat.com>,
Andrew Morton <akpm@...ux-foundation.org>, Sean Christopherson <seanjc@...gle.com>,
Jacob Pan <jacob.jun.pan@...ux.intel.com>
Cc: linux-kernel@...r.kernel.org, linux-arch@...r.kernel.org,
Bjorn Helgaas <bhelgaas@...gle.com>, Willem de Bruijn <willemb@...gle.com>,
Luigi Rizzo <lrizzo@...gle.com>
Subject: [PATCH v2 2/8] genirq: soft_moderation: add base files, procfs
Add the core files that implement the procfs entries and module
parameters for soft_moderation. This exposes the module parameters under
/sys/module/irq_moderation/parameters and adds the read/write procfs
entries /proc/irq/soft_moderation and /proc/irq/NN/soft_moderation.
Examples:
cat /proc/irq/soft_moderation
echo "delay_us=345" > /proc/irq/soft_moderation
echo 1 | tee /proc/irq/*/nvme*/../soft_moderation
No functional change.
Change-Id: I83fc9568e70885cc02e7fcd4dbe141d9ee329c82
Signed-off-by: Luigi Rizzo <lrizzo@...gle.com>
---
kernel/irq/Makefile | 1 +
kernel/irq/irq_moderation.c | 305 ++++++++++++++++++++++++++++++++++++
kernel/irq/irq_moderation.h | 149 ++++++++++++++++++
kernel/irq/irqdesc.c | 1 +
kernel/irq/proc.c | 3 +
5 files changed, 459 insertions(+)
create mode 100644 kernel/irq/irq_moderation.c
create mode 100644 kernel/irq/irq_moderation.h
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 6ab3a40556670..c06da43d644f2 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -9,6 +9,7 @@ obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o
obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o
obj-$(CONFIG_IRQ_SIM) += irq_sim.o
+obj-$(CONFIG_IRQ_SOFT_MODERATION) += irq_moderation.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o
diff --git a/kernel/irq/irq_moderation.c b/kernel/irq/irq_moderation.c
new file mode 100644
index 0000000000000..3a907b8f65698
--- /dev/null
+++ b/kernel/irq/irq_moderation.c
@@ -0,0 +1,305 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause
+
+#include <linux/cpuhotplug.h>
+#include <linux/irq.h>
+#include <linux/irqdesc.h>
+#include <linux/kernel_stat.h>
+#include <linux/module.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+
+#include "internals.h"
+#include "irq_moderation.h"
+
+/*
+ * Platform-wide software interrupt moderation.
+ *
+ * see Documentation/core-api/irq/irq-moderation.rst
+ *
+ * === MOTIVATION AND OPERATION ===
+ *
+ * Some platforms show reduced I/O performance when the total device interrupt
+ * rate across the entire platform becomes too high. This code implements
+ * per-CPU adaptive moderation based on the total interrupt rate, as opposed
+ * to conventional moderation that operates separately on each source.
+ *
+ * It computes the total interrupt rate and number of sources, and uses the
+ * information to adaptively disable individual interrupts for small amounts
+ * of time using per-CPU hrtimers. Specifically:
+ *
+ * - a hook in handle_irq_event(), which applies only to sources configured
+ * to use moderation, updates statistics and checks whether we need
+ * moderation on that CPU/irq. If so, it calls disable_irq_nosync() and
+ * starts an hrtimer with an appropriate delay.
+ *
+ * - the timer callback calls enable_irq() for all disabled interrupts on that
+ * CPU. That in turn will generate interrupts if there are pending events.
+ *
+ * === CONFIGURATION ===
+ *
+ * The following can be controlled at boot time via module parameters
+ *
+ * irq_moderation.${NAME}=${VALUE}
+ *
+ * or at runtime by writing
+ *
+ * echo "${NAME}=${VALUE}" > /proc/irq/soft_moderation
+ *
+ * delay_us (default 0, suggested 100, range 0-500, 0 DISABLES MODERATION)
+ * Fixed or maximum moderation delay. A reasonable range is 20..100, higher
+ * values can be useful if the hardirq handler is performing a significant
+ * amount of work.
+ *
+ * timer_rounds (default 0, max 20)
+ * Once moderation triggers, periodically run handler zero or more
+ * times using a timer rather than interrupts. This is similar to
+ * napi_defer_hard_irqs on NICs.
+ * A small value may help control load in interrupt-challenged platforms.
+ *
+ * Moderation can be enabled/disabled for individual interrupts with
+ *
+ * echo "on" > /proc/irq/NN/soft_moderation # use "off" to disable
+ *
+ * === MONITORING ===
+ *
+ * cat /proc/irq/soft_moderation shows per-CPU and global statistics.
+ *
+ */
+
+struct irq_mod_info irq_mod_info ____cacheline_aligned;
+
+/* Boot time value, copied to irq_mod_info.delay_us after init. */
+static uint mod_delay_us;
+module_param_named(delay_us, mod_delay_us, uint, 0444);
+MODULE_PARM_DESC(delay_us, "Max moderation delay us, 0 = moderation off, range 0-500.");
+
+module_param_named(timer_rounds, irq_mod_info.timer_rounds, uint, 0444);
+MODULE_PARM_DESC(timer_rounds, "How many timer polls once moderation triggers, range 0-20.");
+
+DEFINE_PER_CPU_ALIGNED(struct irq_mod_state, irq_mod_state);
+
+/*
+ * Reset the per-descriptor moderation fields to their defaults: moderation
+ * disabled and a self-linked list node. Used in desc_set_defaults().
+ */
+void irq_moderation_init_fields(struct irq_desc_mod *mod)
+{
+	mod->enable = false;
+	INIT_LIST_HEAD(&mod->ms_node);
+}
+
+/*
+ * Turn moderation on or off for one interrupt descriptor.
+ *
+ * Enabling is refused with -EOPNOTSUPP for level-type interrupts, for
+ * interrupts that are not single-target, and for chips that provide an
+ * irq_bus_lock or irq_bus_sync_unlock callback. Disabling is never refused.
+ * Returns 0 once desc->mod.enable has been updated.
+ */
+static inline int set_moderation_mode(struct irq_desc *desc, bool enable)
+{
+	struct irq_data *data = &desc->irq_data;
+
+	if (enable) {
+		const struct irq_chip *chip = data->chip;
+
+		/* Moderation is supported only in specific cases. */
+		if (irqd_is_level_type(data))
+			return -EOPNOTSUPP;
+		if (!irqd_is_single_target(data))
+			return -EOPNOTSUPP;
+		if (chip->irq_bus_lock || chip->irq_bus_sync_unlock)
+			return -EOPNOTSUPP;
+	}
+
+	desc->mod.enable = enable;
+	return 0;
+}
+
+/*
+ * Show handler for /proc/irq/soft_moderation.
+ *
+ * Prints a header, one line of statistics for every possible CPU, and then
+ * the global configuration (enabled, delay_us, timer_rounds).
+ * Always returns 0.
+ */
+static int moderation_show(struct seq_file *p, void *v)
+{
+	/* Parameters can change concurrently via moderation_write(). */
+	uint delay_us = READ_ONCE(irq_mod_info.delay_us);
+	uint timer_rounds = READ_ONCE(irq_mod_info.timer_rounds);
+	uint cpu;	/* unsigned, to match the %5u conversion in BODY_FMT */
+
+#define HEAD_FMT "%5s %8s %8s %4s %4s %8s %11s %11s %11s %11s %11s %11s %11s %9s\n"
+#define BODY_FMT "%5u %8u %8u %4u %4u %8u %11u %11u %11u %11u %11u %11u %11u %9u\n"
+
+	seq_printf(p, HEAD_FMT,
+		   "# CPU", "irq/s", "my_irq/s", "cpus", "srcs", "delay_ns",
+		   "irq_hi", "my_irq_hi", "hardirq_hi", "timer_set",
+		   "disable_irq", "from_msi", "timer_calls", "stray_irq");
+
+	for_each_possible_cpu(cpu) {
+		const struct irq_mod_state *ms = per_cpu_ptr(&irq_mod_state, cpu);
+
+		/* The scaled counts are divided by 256, rounding to nearest. */
+		seq_printf(p, BODY_FMT,
+			   cpu, ms->irq_rate, ms->my_irq_rate,
+			   (ms->scaled_cpu_count + 128) / 256,
+			   (ms->scaled_src_count + 128) / 256,
+			   ms->mod_ns, ms->irq_high, ms->my_irq_high,
+			   ms->hardirq_high, ms->timer_set, ms->disable_irq,
+			   ms->from_posted_msi, ms->timer_calls, ms->stray_irq);
+	}
+
+	seq_printf(p, "\n"
+		   "enabled %s\n"
+		   "delay_us %u\n"
+		   "timer_rounds %u\n",
+		   str_yes_no(delay_us > 0),
+		   delay_us, timer_rounds);
+
+#undef BODY_FMT
+#undef HEAD_FMT
+	return 0;
+}
+
+/* Open handler for /proc/irq/soft_moderation: single-record seq_file. */
+static int moderation_open(struct inode *inode, struct file *file)
+{
+	void *priv = pde_data(inode);
+
+	return single_open(file, moderation_show, priv);
+}
+
+/*
+ * Helpers to set and clamp values from procfs or at init.
+ *
+ * Each entry maps a parameter name to the irq_mod_info field it controls
+ * and to the [min, max] range that values are clamped to.
+ */
+struct param_names {
+ const char *name; /* name used in "name=value" procfs writes */
+ uint *val; /* field in irq_mod_info that holds the value */
+ uint min; /* lower clamp bound */
+ uint max; /* upper clamp bound */
+};
+
+static struct param_names param_names[] = {
+ { "delay_us", &irq_mod_info.delay_us, 0, 500 },
+ { "timer_rounds", &irq_mod_info.timer_rounds, 0, 20 },
+ /*
+ * Empty entry indicates the following are not settable from procfs:
+ * moderation_write() stops scanning at the first entry without a name,
+ * while clamp_parameter() scans the whole table.
+ */
+ {},
+ { "update_ms", &irq_mod_info.update_ms, 1, 100 },
+};
+
+/*
+ * Write handler for /proc/irq/soft_moderation.
+ *
+ * Accepts one "name=value" command of at most 39 bytes, where name is one
+ * of the procfs-settable entries of param_names[]. The value is parsed with
+ * kstrtouint() (base 0) and clamped to the range of that parameter.
+ *
+ * Returns count on success, -EFAULT if the user buffer cannot be copied and
+ * -EINVAL for empty, oversized, unknown or malformed commands.
+ */
+static ssize_t moderation_write(struct file *f, const char __user *buf, size_t count, loff_t *ppos)
+{
+	const struct param_names *end = param_names + ARRAY_SIZE(param_names);
+	struct param_names *par;
+	char line[40];
+
+	if (!count || count + 1 > sizeof(line))
+		return -EINVAL;
+	if (copy_from_user(line, buf, count))
+		return -EFAULT;
+	line[count] = '\0';
+
+	/* Only the entries before the empty one can be set from procfs. */
+	for (par = param_names; par < end && par->name; par++) {
+		size_t len = strlen(par->name);
+		uint parsed;
+
+		/* Need "name=" followed by at least one more character. */
+		if (count < len + 2)
+			continue;
+		if (strncmp(line, par->name, len) != 0 || line[len] != '=')
+			continue;
+		if (kstrtouint(line + len + 1, 0, &parsed))
+			return -EINVAL;
+
+		WRITE_ONCE(*par->val, clamp(parsed, par->min, par->max));
+		/* Record last parameter change, for use in the control loop. */
+		irq_mod_info.procfs_write_ns = ktime_get_ns();
+		return count;
+	}
+	return -EINVAL;
+}
+
+/* File operations for /proc/irq/soft_moderation: seq_file read, custom write. */
+static const struct proc_ops proc_ops = {
+	.proc_open	= moderation_open,
+	.proc_release	= single_release,
+	.proc_read	= seq_read,
+	.proc_write	= moderation_write,
+	.proc_lseek	= seq_lseek,
+};
+
+/* Handlers for /proc/irq/NN/soft_moderation */
+
+/*
+ * Show handler: print the moderation state ("on" or "off") of the interrupt,
+ * followed by its number, trigger type, Level/Edge, managed/unmanaged
+ * affinity, lazy/unlazy disable and the name of its flow handler.
+ *
+ * Returns 0, or -ENOENT if no descriptor is attached to the seq_file.
+ */
+static int mode_show(struct seq_file *p, void *v)
+{
+	struct irq_desc *desc = p->private;
+	struct irq_data *irqd;
+
+	if (!desc)
+		return -ENOENT;
+	irqd = &desc->irq_data;
+
+	/*
+	 * The flow handler is printed with %ps. %pB is meant for return
+	 * addresses in backtraces and can resolve the entry point of a
+	 * function to the symbol that precedes it.
+	 */
+	seq_printf(p, "%s irq %u trigger 0x%x %s %smanaged %slazy handle_irq %ps\n",
+		   desc->mod.enable ? "on" : "off", irqd->irq,
+		   irqd_get_trigger_type(irqd),
+		   irqd_is_level_type(irqd) ? "Level" : "Edge",
+		   irqd_affinity_is_managed(irqd) ? "" : "un",
+		   irq_settings_disable_unlazy(desc) ? "un" : "",
+		   desc->handle_irq);
+	return 0;
+}
+
+/*
+ * Write handler for /proc/irq/NN/soft_moderation.
+ *
+ * The input (at most 39 bytes) is parsed with kstrtobool() and the result
+ * is applied with set_moderation_mode().
+ *
+ * Returns count on success, -ENOENT without a descriptor, -EINVAL for empty
+ * or oversized input, -EFAULT if the user buffer cannot be copied, or the
+ * error reported by kstrtobool() or set_moderation_mode().
+ */
+static ssize_t mode_write(struct file *f, const char __user *buf, size_t count, loff_t *ppos)
+{
+	struct irq_desc *desc = pde_data(file_inode(f));
+	char input[40];
+	bool on;
+	int err;
+
+	if (!desc)
+		return -ENOENT;
+	if (!count || count + 1 > sizeof(input))
+		return -EINVAL;
+	if (copy_from_user(input, buf, count))
+		return -EFAULT;
+	input[count] = '\0';
+
+	err = kstrtobool(input, &on);
+	if (err)
+		return err;
+
+	err = set_moderation_mode(desc, on);
+	if (err)
+		return err;
+
+	return count;
+}
+
+/* Open handler for /proc/irq/NN/soft_moderation: single-record seq_file. */
+static int mode_open(struct inode *inode, struct file *file)
+{
+	void *priv = pde_data(inode);
+
+	return single_open(file, mode_show, priv);
+}
+
+/* File operations for /proc/irq/NN/soft_moderation: seq_file read, custom write. */
+static const struct proc_ops mode_ops = {
+	.proc_open	= mode_open,
+	.proc_release	= single_release,
+	.proc_read	= seq_read,
+	.proc_write	= mode_write,
+	.proc_lseek	= seq_lseek,
+};
+
+/*
+ * Create the "soft_moderation" entry in the procfs directory of @desc, with
+ * file mode @umode and the descriptor as private data.
+ */
+void irq_moderation_procfs_add(struct irq_desc *desc, umode_t umode)
+{
+	struct proc_dir_entry *parent = desc->dir;
+
+	proc_create_data("soft_moderation", umode, parent, &mode_ops, desc);
+}
+
+/* Remove the "soft_moderation" entry from the procfs directory of @desc. */
+void irq_moderation_procfs_remove(struct irq_desc *desc)
+{
+	struct proc_dir_entry *parent = desc->dir;
+
+	remove_proc_entry("soft_moderation", parent);
+}
+
+/*
+ * Per-CPU state initialization: reset the list of moderated descriptors of
+ * the CPU this runs on. @data is unused.
+ */
+static void irq_moderation_percpu_init(void *data)
+{
+	INIT_LIST_HEAD(this_cpu_ptr(&irq_mod_state.descs));
+}
+
+/*
+ * CPU hotplug callback registered by init_irq_moderation(). It initializes
+ * the moderation state of the CPU it runs on and always returns 0. @cpu is
+ * not used: the helper relies on this_cpu_ptr(), so this presumably runs on
+ * the CPU that is coming online (TODO confirm for the registered state).
+ */
+static int cpuhp_setup_cb(uint cpu)
+{
+ irq_moderation_percpu_init(NULL);
+ return 0;
+}
+
+/*
+ * Store @val into *@dst, clamped to the range registered for @dst in
+ * param_names[]. Does nothing if @dst is not listed in the table.
+ */
+static void clamp_parameter(uint *dst, uint val)
+{
+	struct param_names *par = param_names;
+	struct param_names *end = par + ARRAY_SIZE(param_names);
+
+	/* Scan the whole table, including the entries after the empty one. */
+	while (par < end && par->val != dst)
+		par++;
+
+	if (par < end)
+		*dst = clamp(val, par->min, par->max);
+}
+
+/*
+ * Boot-time setup: initialize the per-CPU state on all online CPUs and
+ * register a hotplug callback for CPUs that come online later, clamp the
+ * initial parameter values, then create /proc/irq/soft_moderation.
+ *
+ * Returns 0 on success or the negative error reported when registering the
+ * hotplug callback. A failure to create the procfs entry is only logged.
+ */
+static int __init init_irq_moderation(void)
+{
+	uint *cur;
+	int ret;
+
+	on_each_cpu(irq_moderation_percpu_init, NULL, 1);
+	ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "moderation:online",
+					cpuhp_setup_cb, NULL);
+	/* For a dynamic state, a positive value is the allocated state number. */
+	if (ret < 0) {
+		/*
+		 * Without the callback, CPUs that come online later would
+		 * never get their state initialized. Bail out before
+		 * delay_us is set, which leaves moderation off.
+		 */
+		pr_warn("irq_moderation: cannot register hotplug callback (%d)\n", ret);
+		return ret;
+	}
+
+	/*
+	 * Clamp all initial values to the allowed range. This walks the uint
+	 * fields from target_irq_rate up to the pad marker; fields without a
+	 * param_names[] entry are left untouched by clamp_parameter().
+	 */
+	for (cur = &irq_mod_info.target_irq_rate; cur < irq_mod_info.pad; cur++)
+		clamp_parameter(cur, *cur);
+
+	/* Finally, set delay_us to enable moderation if needed. */
+	clamp_parameter(&irq_mod_info.delay_us, mod_delay_us);
+
+	if (!proc_create_data("irq/soft_moderation", 0644, NULL, &proc_ops, NULL))
+		pr_warn("irq_moderation: cannot create /proc/irq/soft_moderation\n");
+	return 0;
+}
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION("1.0");
+MODULE_AUTHOR("Luigi Rizzo <lrizzo@...gle.com>");
+MODULE_DESCRIPTION("Platform wide software interrupt moderation");
+module_init(init_irq_moderation);
diff --git a/kernel/irq/irq_moderation.h b/kernel/irq/irq_moderation.h
new file mode 100644
index 0000000000000..ccb8193482b51
--- /dev/null
+++ b/kernel/irq/irq_moderation.h
@@ -0,0 +1,149 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
+
+#ifndef _LINUX_IRQ_MODERATION_H
+#define _LINUX_IRQ_MODERATION_H
+
+/*
+ * Platform wide software interrupt moderation, see
+ * Documentation/core-api/irq/irq-moderation.rst
+ */
+
+#include <linux/hrtimer.h>
+#include <linux/irq.h>
+#include <linux/irqdesc.h>
+#include <linux/kernel.h>
+
+#ifdef CONFIG_IRQ_SOFT_MODERATION
+
+/**
+ * struct irq_mod_info - global configuration parameters and state
+ * @total_intrs: running count updated every update_ms
+ * @total_cpus: as above, active CPUs in this interval
+ * @procfs_write_ns: last write to /proc/irq/soft_moderation
+ * @delay_us: fixed delay, or maximum for adaptive
+ * @target_irq_rate: target maximum interrupt rate
+ * @hardirq_percent: target maximum hardirq percentage
+ * @timer_rounds: how many timer polls once moderation fires
+ * @update_ms: how often to update delay/rate/fraction
+ * @scale_cpus: (percent) scale factor to estimate active CPUs
+ * @count_timer_calls: count timer calls for irq limits
+ * @count_msi_calls: count calls from posted_msi for irq limits
+ * @decay_factor: smoothing factor for the control loop, keep at 16
+ * @grow_factor: smoothing factor for the control loop, keep it at 8
+ * @pad: zero-length end marker; the init code in irq_moderation.c
+ * uses its address as the end of the uint parameters that
+ * start at @target_irq_rate, so new uint parameters
+ * presumably belong before it (confirm when adding fields)
+ */
+struct irq_mod_info {
+ /* These fields are written to by all CPUs */
+ ____cacheline_aligned
+ atomic_long_t total_intrs;
+ atomic_long_t total_cpus;
+
+ /* These are mostly read (frequently), so use a different cacheline */
+ ____cacheline_aligned
+ u64 procfs_write_ns;
+ uint delay_us;
+ uint target_irq_rate;
+ uint hardirq_percent;
+ uint timer_rounds;
+ uint update_ms;
+ uint scale_cpus;
+ uint count_timer_calls;
+ uint count_msi_calls;
+ uint decay_factor;
+ uint grow_factor;
+ uint pad[];
+};
+
+extern struct irq_mod_info irq_mod_info;
+
+/**
+ * struct irq_mod_state - per-CPU moderation state
+ *
+ * @timer: moderation timer
+ * @descs: list of moderated irq_desc on this CPU
+ *
+ * Counters on last time we updated moderation delay
+ * @last_ns: time of last update
+ * @last_irqtime: from cpustat[CPUTIME_IRQ]
+ * @last_total_irqs: from irq_mod_info
+ * @last_total_cpus: from irq_mod_info
+ *
+ * Local info to control hooks and timer callbacks
+ * @dont_count: do not count this interrupt
+ * @in_posted_msi: don't suppress handle_irq, set in posted_msi handler
+ * @kick_posted_msi: kick posted_msi from the timer callback
+ * @rounds_left: how many rounds left for timer callbacks
+ *
+ * @irq_count: irqs in the last cycle, signed as we also decrement
+ * NOTE(review): described as signed, but the field below is
+ * declared u32 - confirm which one is intended
+ * @update_ns: fetched from irq_mod_info
+ * @delay_ns: fetched from irq_mod_info
+ * @mod_ns: current moderation delay, recomputed every update_ms
+ * @sleep_ns: accumulated time for actual delay
+ *
+ * Statistics
+ * @irq_rate: smoothed global irq rate
+ * @my_irq_rate: smoothed irq rate for this CPU
+ * @scaled_cpu_count: smoothed CPU count (scaled)
+ * @scaled_src_count: smoothed count of irq sources (scaled)
+ * @irq_high: how many times global irq above threshold
+ * @my_irq_high: how many times local irq above threshold
+ * @hardirq_high: how many times local hardirq_percent above threshold
+ * @timer_set: how many timer_set calls
+ * @timer_fire: how many timer_fire, must match timer_set in timer callback
+ * @disable_irq: how many disable_irq calls
+ * @enable_irq: how many enable_irq, must match disable_irq in timer callback
+ * @timer_calls: how many handler calls from timer interrupt
+ * @from_posted_msi: how many calls from posted_msi handler
+ * @stray_irq: how many stray interrupts
+ * @pad: zero-length, cacheline-aligned trailing member; presumably
+ * rounds the structure up to a cacheline multiple
+ */
+struct irq_mod_state {
+ struct hrtimer timer;
+ struct list_head descs;
+
+ /* Counters on last time we updated moderation delay */
+ u64 last_ns;
+ u64 last_irqtime;
+ u64 last_total_irqs;
+ u64 last_total_cpus;
+
+ bool dont_count;
+ bool in_posted_msi;
+ bool kick_posted_msi;
+ u8 rounds_left;
+
+ u32 irq_count;
+ u32 update_ns;
+ u32 delay_ns;
+ u32 mod_ns;
+ u32 sleep_ns;
+
+ /* Statistics */
+ u32 irq_rate;
+ u32 my_irq_rate;
+ u32 scaled_cpu_count;
+ u32 scaled_src_count;
+ u32 irq_high;
+ u32 my_irq_high;
+ u32 hardirq_high;
+ u32 timer_set;
+ u32 timer_fire;
+ u32 disable_irq;
+ u32 enable_irq;
+ u32 timer_calls;
+ u32 from_posted_msi;
+ u32 stray_irq;
+ int pad[] ____cacheline_aligned;
+};
+
+DECLARE_PER_CPU_ALIGNED(struct irq_mod_state, irq_mod_state);
+
+void irq_moderation_procfs_add(struct irq_desc *desc, umode_t umode);
+void irq_moderation_procfs_remove(struct irq_desc *desc);
+
+#else /* CONFIG_IRQ_SOFT_MODERATION */
+
+static inline void irq_moderation_procfs_add(struct irq_desc *desc, umode_t umode) {}
+static inline void irq_moderation_procfs_remove(struct irq_desc *desc) {}
+
+#endif /* !CONFIG_IRQ_SOFT_MODERATION */
+
+#endif /* _LINUX_IRQ_MODERATION_H */
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index db714d3014b5f..e5cdade3dbbce 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -134,6 +134,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
desc->tot_count = 0;
desc->name = NULL;
desc->owner = owner;
+ irq_moderation_init_fields(&desc->mod);
for_each_possible_cpu(cpu)
*per_cpu_ptr(desc->kstat_irqs, cpu) = (struct irqstat) { };
desc_smp_init(desc, node, affinity);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 29c2404e743be..5dcbc36b7de1b 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -14,6 +14,7 @@
#include <linux/mutex.h>
#include "internals.h"
+#include "irq_moderation.h"
/*
* Access rules:
@@ -374,6 +375,7 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
proc_create_single_data("effective_affinity_list", 0444, desc->dir,
irq_effective_aff_list_proc_show, irqp);
# endif
+ irq_moderation_procfs_add(desc, 0644);
#endif
proc_create_single_data("spurious", 0444, desc->dir,
irq_spurious_proc_show, (void *)(long)irq);
@@ -395,6 +397,7 @@ void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
remove_proc_entry("effective_affinity", desc->dir);
remove_proc_entry("effective_affinity_list", desc->dir);
# endif
+ irq_moderation_procfs_remove(desc);
#endif
remove_proc_entry("spurious", desc->dir);
--
2.52.0.rc1.455.g30608eb744-goog
Powered by blists - more mailing lists