linux-kernel - [RFC PATCH v1 11/57] fork: Permit boot-time THREAD_SIZE determination

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20241014105912.3207374-11-ryan.roberts@arm.com>
Date: Mon, 14 Oct 2024 11:58:18 +0100
From: Ryan Roberts <ryan.roberts@....com>
To: Andrew Morton <akpm@...ux-foundation.org>,
	Andrey Ryabinin <ryabinin.a.a@...il.com>,
	Anshuman Khandual <anshuman.khandual@....com>,
	Ard Biesheuvel <ardb@...nel.org>,
	Arnd Bergmann <arnd@...db.de>,
	Catalin Marinas <catalin.marinas@....com>,
	David Hildenbrand <david@...hat.com>,
	Greg Marsden <greg.marsden@...cle.com>,
	Ingo Molnar <mingo@...hat.com>,
	Ivan Ivanov <ivan.ivanov@...e.com>,
	Juri Lelli <juri.lelli@...hat.com>,
	Kalesh Singh <kaleshsingh@...gle.com>,
	Marc Zyngier <maz@...nel.org>,
	Mark Rutland <mark.rutland@....com>,
	Matthias Brugger <mbrugger@...e.com>,
	Miroslav Benes <mbenes@...e.cz>,
	Peter Zijlstra <peterz@...radead.org>,
	Vincent Guittot <vincent.guittot@...aro.org>,
	Will Deacon <will@...nel.org>
Cc: Ryan Roberts <ryan.roberts@....com>,
	kasan-dev@...glegroups.com,
	linux-arch@...r.kernel.org,
	linux-arm-kernel@...ts.infradead.org,
	linux-kernel@...r.kernel.org,
	linux-mm@...ck.org
Subject: [RFC PATCH v1 11/57] fork: Permit boot-time THREAD_SIZE determination

THREAD_SIZE defines the size of a kernel thread stack. To date, it has
been set at compile-time. However, when using vmap stacks, the size must
be a multiple of PAGE_SIZE, and given that we are in the process of
supporting boot-time page size selection, THREAD_SIZE must also be
determinable at boot time.

The alternative would be to define THREAD_SIZE for the largest supported
page size, but this would waste memory when using a smaller page size.
For example, arm64 requires THREAD_SIZE to be 16K, but when using 64K
pages and a vmap stack, we must increase the size to 64K. If we required
64K when 4K or 16K page size was in use, we would waste 48K per kernel
thread.

So let's refactor to allow THREAD_SIZE to not be a compile-time
constant. THREAD_SIZE_MAX (and THREAD_ALIGN_MAX) are introduced to
manage the limits, as is done for PAGE_SIZE.

When THREAD_SIZE is a compile-time constant, behaviour and code size
should be unchanged by this patch.

Signed-off-by: Ryan Roberts <ryan.roberts@....com>
---

***NOTE***
Any confused maintainers may want to read the cover note here for context:
https://lore.kernel.org/all/20241014105514.3206191-1-ryan.roberts@arm.com/

 include/asm-generic/vmlinux.lds.h |  6 ++-
 include/linux/sched.h             |  4 +-
 include/linux/thread_info.h       | 10 ++++-
 init/main.c                       |  2 +-
 kernel/fork.c                     | 67 +++++++++++--------------------
 mm/kasan/report.c                 |  3 +-
 6 files changed, 42 insertions(+), 50 deletions(-)

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 5727f883001bb..f19bab7a2e8f9 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -56,6 +56,10 @@
 #define LOAD_OFFSET 0
 #endif
 
+#ifndef THREAD_SIZE_MAX
+#define THREAD_SIZE_MAX		THREAD_SIZE
+#endif
+
 /*
  * Only some architectures want to have the .notes segment visible in
  * a separate PT_NOTE ELF Program Header. When this happens, it needs
@@ -398,7 +402,7 @@
 	init_stack = .;							\
 	KEEP(*(.data..init_task))					\
 	KEEP(*(.data..init_thread_info))				\
-	. = __start_init_stack + THREAD_SIZE;				\
+	. = __start_init_stack + THREAD_SIZE_MAX;			\
 	__end_init_stack = .;
 
 #define JUMP_TABLE_DATA							\
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f8d150343d42d..3de4f655ee492 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1863,14 +1863,14 @@ union thread_union {
 #ifndef CONFIG_THREAD_INFO_IN_TASK
 	struct thread_info thread_info;
 #endif
-	unsigned long stack[THREAD_SIZE/sizeof(long)];
+	unsigned long stack[THREAD_SIZE_MAX/sizeof(long)];
 };
 
 #ifndef CONFIG_THREAD_INFO_IN_TASK
 extern struct thread_info init_thread_info;
 #endif
 
-extern unsigned long init_stack[THREAD_SIZE / sizeof(unsigned long)];
+extern unsigned long init_stack[THREAD_SIZE_MAX / sizeof(unsigned long)];
 
 #ifdef CONFIG_THREAD_INFO_IN_TASK
 # define task_thread_info(task)	(&(task)->thread_info)
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index 9ea0b28068f49..a7ccc448cd298 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -74,7 +74,15 @@ static inline long set_restart_fn(struct restart_block *restart,
 }
 
 #ifndef THREAD_ALIGN
-#define THREAD_ALIGN	THREAD_SIZE
+#define THREAD_ALIGN		THREAD_SIZE
+#endif
+
+#ifndef THREAD_SIZE_MAX
+#define THREAD_SIZE_MAX		THREAD_SIZE
+#endif
+
+#ifndef THREAD_ALIGN_MAX
+#define THREAD_ALIGN_MAX	max(THREAD_ALIGN, THREAD_SIZE_MAX)
 #endif
 
 #define THREADINFO_GFP		(GFP_KERNEL_ACCOUNT | __GFP_ZERO)
diff --git a/init/main.c b/init/main.c
index ba1515eb20b9d..4dc28115fdf57 100644
--- a/init/main.c
+++ b/init/main.c
@@ -797,7 +797,7 @@ void __init __weak smp_prepare_boot_cpu(void)
 {
 }
 
-# if THREAD_SIZE >= PAGE_SIZE
+#ifdef CONFIG_VMAP_STACK
 void __init __weak thread_stack_cache_init(void)
 {
 }
diff --git a/kernel/fork.c b/kernel/fork.c
index ea472566d4fcc..cbc3e73f9b501 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -184,13 +184,7 @@ static inline void free_task_struct(struct task_struct *tsk)
 	kmem_cache_free(task_struct_cachep, tsk);
 }
 
-/*
- * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
- * kmemcache based allocator.
- */
-# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)
-
-#  ifdef CONFIG_VMAP_STACK
+#ifdef CONFIG_VMAP_STACK
 /*
  * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB
  * flush.  Try to minimize the number of calls by caching stacks.
@@ -343,46 +337,21 @@ static void free_thread_stack(struct task_struct *tsk)
 	tsk->stack_vm_area = NULL;
 }
 
-#  else /* !CONFIG_VMAP_STACK */
+#else /* !CONFIG_VMAP_STACK */
 
-static void thread_stack_free_rcu(struct rcu_head *rh)
-{
-	__free_pages(virt_to_page(rh), THREAD_SIZE_ORDER);
-}
-
-static void thread_stack_delayed_free(struct task_struct *tsk)
-{
-	struct rcu_head *rh = tsk->stack;
-
-	call_rcu(rh, thread_stack_free_rcu);
-}
-
-static int alloc_thread_stack_node(struct task_struct *tsk, int node)
-{
-	struct page *page = alloc_pages_node(node, THREADINFO_GFP,
-					     THREAD_SIZE_ORDER);
-
-	if (likely(page)) {
-		tsk->stack = kasan_reset_tag(page_address(page));
-		return 0;
-	}
-	return -ENOMEM;
-}
-
-static void free_thread_stack(struct task_struct *tsk)
-{
-	thread_stack_delayed_free(tsk);
-	tsk->stack = NULL;
-}
-
-#  endif /* CONFIG_VMAP_STACK */
-# else /* !(THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)) */
+/*
+ * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
+ * kmemcache based allocator.
+ */
 
 static struct kmem_cache *thread_stack_cache;
 
 static void thread_stack_free_rcu(struct rcu_head *rh)
 {
-	kmem_cache_free(thread_stack_cache, rh);
+	if (THREAD_SIZE >= PAGE_SIZE)
+		__free_pages(virt_to_page(rh), THREAD_SIZE_ORDER);
+	else
+		kmem_cache_free(thread_stack_cache, rh);
 }
 
 static void thread_stack_delayed_free(struct task_struct *tsk)
@@ -395,7 +364,16 @@ static void thread_stack_delayed_free(struct task_struct *tsk)
 static int alloc_thread_stack_node(struct task_struct *tsk, int node)
 {
 	unsigned long *stack;
-	stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
+	struct page *page;
+
+	if (THREAD_SIZE >= PAGE_SIZE) {
+		page = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER);
+		stack = likely(page) ? page_address(page) : NULL;
+	} else {
+		stack = kmem_cache_alloc_node(thread_stack_cache,
+					      THREADINFO_GFP, node);
+	}
+
 	stack = kasan_reset_tag(stack);
 	tsk->stack = stack;
 	return stack ? 0 : -ENOMEM;
@@ -409,13 +387,16 @@ static void free_thread_stack(struct task_struct *tsk)
 
 void thread_stack_cache_init(void)
 {
+	if (THREAD_SIZE >= PAGE_SIZE)
+		return;
+
 	thread_stack_cache = kmem_cache_create_usercopy("thread_stack",
 					THREAD_SIZE, THREAD_SIZE, 0, 0,
 					THREAD_SIZE, NULL);
 	BUG_ON(thread_stack_cache == NULL);
 }
 
-# endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */
+#endif /* CONFIG_VMAP_STACK */
 
 /* SLAB cache for signal_struct structures (tsk->signal) */
 static struct kmem_cache *signal_cachep;
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index b48c768acc84d..57c877852dbc6 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -365,8 +365,7 @@ static inline bool kernel_or_module_addr(const void *addr)
 static inline bool init_task_stack_addr(const void *addr)
 {
 	return addr >= (void *)&init_thread_union.stack &&
-		(addr <= (void *)&init_thread_union.stack +
-			sizeof(init_thread_union.stack));
+		(addr <= (void *)&init_thread_union.stack + THREAD_SIZE);
 }
 
 static void print_address_description(void *addr, u8 tag,
-- 
2.43.0