Message-ID: <20250116151543.80163-2-changwoo@igalia.com>
Date: Fri, 17 Jan 2025 00:15:37 +0900
From: Changwoo Min <changwoo@...lia.com>
To: tj@...nel.org,
void@...ifault.com,
arighi@...dia.com
Cc: kernel-dev@...lia.com,
linux-kernel@...r.kernel.org,
Changwoo Min <changwoo@...lia.com>
Subject: [PATCH 1/7] sched_ext: Implement event counter infrastructure and add an event
Collect statistics on specific types of behavior in the sched_ext core which
are not easily visible from the outside but are still interesting to an scx
scheduler.

Also, add a core event, SCX_EVENT_INVAL_SELECT_CPU, which counts how many
times ops.select_cpu() returns a CPU that the task cannot use.
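As an illustrative sketch only (not part of this patch's diff), an scx
scheduler could read the aggregated counters from the BPF side roughly as
follows. struct scx_event_stat and the scx_bpf_event_stat() kfunc are what
this patch adds (available via vmlinux.h once applied), while
dump_scx_events() is a hypothetical helper name:

    /*
     * Hypothetical BPF-side usage. Assumes the usual scx BPF includes,
     * i.e. vmlinux.h and <bpf/bpf_helpers.h> for bpf_printk().
     */
    void scx_bpf_event_stat(struct scx_event_stat *event, size_t event__sz) __ksym;

    static void dump_scx_events(void)
    {
            struct scx_event_stat event;

            /* Aggregate the per-CPU counters into a local snapshot. */
            scx_bpf_event_stat(&event, sizeof(event));
            bpf_printk("invalid_select_cpu: %llu", event.INVAL_SELECT_CPU);
    }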
Signed-off-by: Changwoo Min <changwoo@...lia.com>
---
kernel/sched/ext.c | 120 ++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 118 insertions(+), 2 deletions(-)
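Note below the cut line (so not part of the commit message): adding another
counter with this infrastructure only requires the steps sketched below.
SCX_EVENT_FOO and hypothetical_event_site() are hypothetical names that only
illustrate the pattern introduced by this patch:

    /* 1) Add a field; each event is one u64 in struct scx_event_stat. */
    struct scx_event_stat {
            u64 INVAL_SELECT_CPU;
            u64 FOO;
    };

    /* 2) Expose its index, derived from the field offset. */
    enum scx_event_kind {
            SCX_EVENT_BEGIN = 0,
            SCX_EVENT_DEFINE(INVAL_SELECT_CPU),
            SCX_EVENT_DEFINE(FOO),
            SCX_EVENT_END = SCX_EVENT_END_IDX(),
    };

    /* 3) Give it a name for scx_dump_state(). */
    static const char *scx_event_stat_str[] = {
            [SCX_EVENT_INVAL_SELECT_CPU] = "invalid_select_cpu",
            [SCX_EVENT_FOO] = "foo",
    };

    /* 4) Bump it where the event happens. */
    static void hypothetical_event_site(void)
    {
            scx_add_event(FOO, 1);
    }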
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 0bcdd1a31676..7e12d5b8322e 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1458,6 +1458,66 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
return p;
}
+/*
+ * Collection of event counters.
+ */
+struct scx_event_stat {
+ /*
+ * If ops.select_cpu() returns a CPU which can't be used by the task,
+ * the core scheduler code silently picks a fallback CPU.
+ */
+ u64 INVAL_SELECT_CPU;
+};
+
+#define SCX_EVENT_IDX(e) (offsetof(struct scx_event_stat, e) / sizeof(u64))
+#define SCX_EVENT_END_IDX() (sizeof(struct scx_event_stat) / sizeof(u64))
+#define SCX_EVENT_DEFINE(e) SCX_EVENT_##e = SCX_EVENT_IDX(e)
+
+/*
+ * Types of event counters.
+ */
+enum scx_event_kind {
+ SCX_EVENT_BEGIN = 0,
+ SCX_EVENT_DEFINE(INVAL_SELECT_CPU),
+ SCX_EVENT_END = SCX_EVENT_END_IDX(),
+};
+
+static const char *scx_event_stat_str[] = {
+ [SCX_EVENT_INVAL_SELECT_CPU] = "invalid_select_cpu",
+};
+
+/*
+ * The event counters are organized as a per-CPU variable to minimize the
+ * accounting overhead without synchronization. A system-wide view of the
+ * event counters is constructed when requested by scx_bpf_event_stat().
+ */
+static DEFINE_PER_CPU(struct scx_event_stat, event_stats);
+
+/**
+ * scx_add_event - Increase an event counter for 'name' by 'cnt'
+ * @name: an event name defined in struct scx_event_stat
+ * @cnt: the number of times the event occurred
+ */
+#define scx_add_event(name, cnt) ({ \
+ struct scx_event_stat *__e; \
+ __e = get_cpu_ptr(&event_stats); \
+ WRITE_ONCE(__e->name, __e->name + (cnt)); \
+ put_cpu_ptr(&event_stats); \
+})
+
+
+/**
+ * scx_read_event_kind - Read the event counter of type 'kind' from 'e'
+ * @e: a pointer to an event collected by scx_bpf_event_stat()
+ * @kind: an event type defined in scx_event_kind
+ */
+#define scx_read_event_kind(e, kind) ({ \
+ u64 *__e64 = (u64 *)(e); \
+ __e64[kind]; \
+})
+
+static void scx_bpf_event_stat(struct scx_event_stat *event, size_t event__sz);
+
static enum scx_ops_enable_state scx_ops_enable_state(void)
{
return atomic_read(&scx_ops_enable_state_var);
@@ -3607,8 +3667,10 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
*ddsp_taskp = NULL;
- if (ops_cpu_valid(cpu, "from ops.select_cpu()"))
+ if (ops_cpu_valid(cpu, "from ops.select_cpu()")) {
return cpu;
- else
+ } else {
+ scx_add_event(INVAL_SELECT_CPU, 1);
return prev_cpu;
+ }
} else {
bool found;
s32 cpu;
@@ -5053,6 +5115,15 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
scx_rq_clock_invalidate(rq);
}
+ /*
+ * Clear event counters so the next scx scheduler always gets
+ * fresh event counter values.
+ */
+ for_each_possible_cpu(cpu) {
+ struct scx_event_stat *e = per_cpu_ptr(&event_stats, cpu);
+ memset(e, 0, sizeof(*e));
+ }
+
/* no task is on scx, turn off all the switches and flush in-progress calls */
static_branch_disable(&__scx_ops_enabled);
for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++)
@@ -5309,9 +5380,10 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
.at_jiffies = jiffies,
};
struct seq_buf s;
+ struct scx_event_stat event;
unsigned long flags;
char *buf;
- int cpu;
+ int cpu, kind;
spin_lock_irqsave(&dump_lock, flags);
@@ -5417,6 +5489,16 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
rq_unlock(rq, &rf);
}
+ dump_newline(&s);
+ dump_line(&s, "Event counters");
+ dump_line(&s, "--------------");
+
+ scx_bpf_event_stat(&event, sizeof(event));
+ for (kind = SCX_EVENT_BEGIN; kind < SCX_EVENT_END; kind++) {
+ dump_line(&s, "%25s : %llu", scx_event_stat_str[kind],
+ scx_read_event_kind(&event, kind));
+ }
+
if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker))
memcpy(ei->dump + dump_len - sizeof(trunc_marker),
trunc_marker, sizeof(trunc_marker));
@@ -7720,6 +7802,39 @@ __bpf_kfunc u64 scx_bpf_now(void)
return clock;
}
+/**
+ * scx_bpf_event_stat - Collect the system-wide event counters into @event
+ * @event: output buffer provided by a BPF program
+ * @event__sz: @event length, must end in '__sz' for the verifier
+ */
+__bpf_kfunc void scx_bpf_event_stat(struct scx_event_stat *event,
+ size_t event__sz)
+{
+ struct scx_event_stat *e;
+ u64 *event64, *e64;
+ int cpu, kind, event_end;
+
+ /*
+ * We cannot entirely trust a BPF-provided size since a BPF program
+ * might be compiled against a different vmlinux.h, in which
+ * struct scx_event_stat could be larger (a newer vmlinux.h) or
+ * smaller (an older vmlinux.h) than the kernel's. Hence, use the
+ * smaller of the two sizes to avoid memory corruption.
+ */
+ event__sz = min(event__sz, sizeof(*event));
+ event_end = event__sz / sizeof(u64);
+
+ event64 = (u64 *)event;
+ memset(event, 0, event__sz);
+ for_each_possible_cpu(cpu) {
+ e = per_cpu_ptr(&event_stats, cpu);
+ e64 = (u64 *)e;
+ for (kind = 0; kind < event_end; kind++) {
+ event64[kind] += READ_ONCE(e64[kind]);
+ }
+ }
+}
+
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(scx_kfunc_ids_any)
@@ -7752,6 +7867,7 @@ BTF_ID_FLAGS(func, scx_bpf_cpu_rq)
BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE)
#endif
BTF_ID_FLAGS(func, scx_bpf_now)
+BTF_ID_FLAGS(func, scx_bpf_event_stat, KF_TRUSTED_ARGS)
BTF_KFUNCS_END(scx_kfunc_ids_any)
static const struct btf_kfunc_id_set scx_kfunc_set_any = {
--
2.48.1