Message-ID: <20240530170259.852088-1-pasha.tatashin@soleen.com>
Date: Thu, 30 May 2024 17:02:59 +0000
From: Pasha Tatashin <pasha.tatashin@...een.com>
To: akpm@...ux-foundation.org,
jpoimboe@...nel.org,
pasha.tatashin@...een.com,
kent.overstreet@...ux.dev,
peterz@...radead.org,
nphamcs@...il.com,
cerasuolodomenico@...il.com,
surenb@...gle.com,
lizhijian@...itsu.com,
willy@...radead.org,
shakeel.butt@...ux.dev,
vbabka@...e.cz,
ziy@...dia.com,
linux-kernel@...r.kernel.org,
linux-mm@...ck.org
Subject: [PATCH v3] vmstat: Kernel stack usage histogram

Provide a kernel stack usage histogram to aid in optimizing kernel stack
sizes and minimizing memory waste in large-scale environments. The
histogram divides stack usage into power-of-two buckets and reports the
results in /proc/vmstat. This information is especially valuable in
environments with millions of machines, where even small optimizations
can have a significant impact.

The histogram data is presented in /proc/vmstat with entries like
"kstack_1k", "kstack_2k", and so on, indicating the number of threads
that exited with stack usage falling within each respective bucket.

Example outputs:

Intel:
$ grep kstack /proc/vmstat
kstack_1k 3
kstack_2k 188
kstack_4k 11391
kstack_8k 243
kstack_16k 0

ARM with 64K page size:
$ grep kstack /proc/vmstat
kstack_1k 1
kstack_2k 340
kstack_4k 25212
kstack_8k 1659
kstack_16k 0
kstack_32k 0
kstack_64k 0
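
The counters can also be consumed programmatically. Below is a minimal
userspace sketch (illustrative only, not part of this patch) that does
the equivalent of the grep above by filtering the kstack_* lines out of
/proc/vmstat:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char name[64];
	unsigned long long val;
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f) {
		perror("/proc/vmstat");
		return 1;
	}
	/* every /proc/vmstat line is "<counter name> <value>" */
	while (fscanf(f, "%63s %llu", name, &val) == 2) {
		if (!strncmp(name, "kstack_", 7))
			printf("%s %llu\n", name, val);
	}
	fclose(f);
	return 0;
}
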
Signed-off-by: Pasha Tatashin <pasha.tatashin@...een.com>
---
Changelog:
v3:
- Changed from page counts to power-of-two buckets; this is helpful for
  builds with large base pages (e.g. arm64 with 64K pages) when evaluating
  kernel stack internal fragmentation (see the illustrative sketch below).
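
For illustration only (not part of this patch): one way to read the
histogram as a fragmentation estimate is to treat each bucket's upper
bound as the worst-case usage for the exits it counts, so every exit in
a bucket bounded by B left at least THREAD_SIZE - B bytes of its stack
untouched. The userspace sketch below plugs in the counts from the ARM
example above; THREAD_SIZE = 64K is an assumption for that
configuration.

#include <stdio.h>

#define THREAD_SIZE	(64 * 1024)	/* assumed for the 64K-page example */

int main(void)
{
	/* bucket upper bounds in bytes, and the exit counts reported above */
	static const unsigned long bound[] = {
		1024, 2048, 4096, 8192, 16384, 32768, 65536
	};
	static const unsigned long count[] = {
		1, 340, 25212, 1659, 0, 0, 0
	};
	unsigned long long unused = 0, exits = 0;

	for (unsigned int i = 0; i < sizeof(bound) / sizeof(bound[0]); i++) {
		/*
		 * Usage <= bound[i] means at least THREAD_SIZE - bound[i]
		 * bytes of that thread's stack were never touched.
		 */
		unused += (unsigned long long)count[i] * (THREAD_SIZE - bound[i]);
		exits += count[i];
	}
	printf("%llu exits, at least %llu KiB of stack left untouched\n",
	       exits, unused / 1024);
	return 0;
}
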
include/linux/sched/task_stack.h | 49 ++++++++++++++++++++++++++++++--
include/linux/vm_event_item.h | 42 +++++++++++++++++++++++++++
include/linux/vmstat.h | 16 -----------
mm/vmstat.c | 24 ++++++++++++++++
4 files changed, 113 insertions(+), 18 deletions(-)
diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h
index ccd72b978e1f..65e8c9fb7f9b 100644
--- a/include/linux/sched/task_stack.h
+++ b/include/linux/sched/task_stack.h
@@ -95,9 +95,51 @@ static inline int object_is_on_stack(const void *obj)
extern void thread_stack_cache_init(void);
#ifdef CONFIG_DEBUG_STACK_USAGE
+#ifdef CONFIG_VM_EVENT_COUNTERS
+#include <linux/vm_event_item.h>
+
+/* Count the maximum pages reached in kernel stacks */
+static inline void kstack_histogram(unsigned long used_stack)
+{
+ if (used_stack <= 1024)
+ this_cpu_inc(vm_event_states.event[KSTACK_1K]);
+#if THREAD_SIZE > 1024
+ else if (used_stack <= 2048)
+ this_cpu_inc(vm_event_states.event[KSTACK_2K]);
+#endif
+#if THREAD_SIZE > 2048
+ else if (used_stack <= 4096)
+ this_cpu_inc(vm_event_states.event[KSTACK_4K]);
+#endif
+#if THREAD_SIZE > 4096
+ else if (used_stack <= 8192)
+ this_cpu_inc(vm_event_states.event[KSTACK_8K]);
+#endif
+#if THREAD_SIZE > 8192
+ else if (used_stack <= 16384)
+ this_cpu_inc(vm_event_states.event[KSTACK_16K]);
+#endif
+#if THREAD_SIZE > 16384
+ else if (used_stack <= 32768)
+ this_cpu_inc(vm_event_states.event[KSTACK_32K]);
+#endif
+#if THREAD_SIZE > 32768
+ else if (used_stack <= 65536)
+ this_cpu_inc(vm_event_states.event[KSTACK_64K]);
+#endif
+#if THREAD_SIZE > 65536
+ else
+ this_cpu_inc(vm_event_states.event[KSTACK_REST]);
+#endif
+}
+#else /* !CONFIG_VM_EVENT_COUNTERS */
+static inline void kstack_histogram(unsigned long used_stack) {}
+#endif /* CONFIG_VM_EVENT_COUNTERS */
+
static inline unsigned long stack_not_used(struct task_struct *p)
{
unsigned long *n = end_of_stack(p);
+ unsigned long unused_stack;
do { /* Skip over canary */
# ifdef CONFIG_STACK_GROWSUP
@@ -108,10 +150,13 @@ static inline unsigned long stack_not_used(struct task_struct *p)
} while (!*n);
# ifdef CONFIG_STACK_GROWSUP
- return (unsigned long)end_of_stack(p) - (unsigned long)n;
+ unused_stack = (unsigned long)end_of_stack(p) - (unsigned long)n;
# else
- return (unsigned long)n - (unsigned long)end_of_stack(p);
+ unused_stack = (unsigned long)n - (unsigned long)end_of_stack(p);
# endif
+ kstack_histogram(THREAD_SIZE - unused_stack);
+
+ return unused_stack;
}
#endif
extern void set_task_stack_end_magic(struct task_struct *tsk);
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 747943bc8cc2..73fa5fbf33a3 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -154,9 +154,51 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
VMA_LOCK_RETRY,
VMA_LOCK_MISS,
#endif
+#ifdef CONFIG_DEBUG_STACK_USAGE
+ KSTACK_1K,
+#if THREAD_SIZE > 1024
+ KSTACK_2K,
+#endif
+#if THREAD_SIZE > 2048
+ KSTACK_4K,
+#endif
+#if THREAD_SIZE > 4096
+ KSTACK_8K,
+#endif
+#if THREAD_SIZE > 8192
+ KSTACK_16K,
+#endif
+#if THREAD_SIZE > 16384
+ KSTACK_32K,
+#endif
+#if THREAD_SIZE > 32768
+ KSTACK_64K,
+#endif
+#if THREAD_SIZE > 65536
+ KSTACK_REST,
+#endif
+#endif /* CONFIG_DEBUG_STACK_USAGE */
NR_VM_EVENT_ITEMS
};
+#ifdef CONFIG_VM_EVENT_COUNTERS
+/*
+ * Light weight per cpu counter implementation.
+ *
+ * Counters should only be incremented and no critical kernel component
+ * should rely on the counter values.
+ *
+ * Counters are handled completely inline. On many platforms the code
+ * generated will simply be the increment of a global address.
+ */
+
+struct vm_event_state {
+ unsigned long event[NR_VM_EVENT_ITEMS];
+};
+
+DECLARE_PER_CPU(struct vm_event_state, vm_event_states);
+#endif
+
#ifndef CONFIG_TRANSPARENT_HUGEPAGE
#define THP_FILE_ALLOC ({ BUILD_BUG(); 0; })
#define THP_FILE_FALLBACK ({ BUILD_BUG(); 0; })
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 343906a98d6e..18d4a97d3afd 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -41,22 +41,6 @@ enum writeback_stat_item {
};
#ifdef CONFIG_VM_EVENT_COUNTERS
-/*
- * Light weight per cpu counter implementation.
- *
- * Counters should only be incremented and no critical kernel component
- * should rely on the counter values.
- *
- * Counters are handled completely inline. On many platforms the code
- * generated will simply be the increment of a global address.
- */
-
-struct vm_event_state {
- unsigned long event[NR_VM_EVENT_ITEMS];
-};
-
-DECLARE_PER_CPU(struct vm_event_state, vm_event_states);
-
/*
* vm counters are allowed to be racy. Use raw_cpu_ops to avoid the
* local_irq_disable overhead.
diff --git a/mm/vmstat.c b/mm/vmstat.c
index db79935e4a54..21932bd6a449 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1413,6 +1413,30 @@ const char * const vmstat_text[] = {
"vma_lock_retry",
"vma_lock_miss",
#endif
+#ifdef CONFIG_DEBUG_STACK_USAGE
+ "kstack_1k",
+#if THREAD_SIZE > 1024
+ "kstack_2k",
+#endif
+#if THREAD_SIZE > 2048
+ "kstack_4k",
+#endif
+#if THREAD_SIZE > 4096
+ "kstack_8k",
+#endif
+#if THREAD_SIZE > 8192
+ "kstack_16k",
+#endif
+#if THREAD_SIZE > 16384
+ "kstack_32k",
+#endif
+#if THREAD_SIZE > 32768
+ "kstack_64k",
+#endif
+#if THREAD_SIZE > 65536
+ "kstack_rest",
+#endif
+#endif
#endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
};
#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */
--
2.45.1.288.g0e0cd299f1-goog