lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Date:	Thu, 16 Jul 2015 12:14:37 -0700
From:	Dave Hansen <dave@...1.net>
To:	dave@...1.net
Cc:	tglx@...utronix.de, mingo@...hat.com, hpa@...or.com,
	x86@...nel.org, peterz@...radead.org, bp@...en8.de,
	luto@...capital.net, torvalds@...ux-foundation.org,
	linux-kernel@...r.kernel.org
Subject: [RFC][PATCH] x86, fpu: dynamically allocate 'struct fpu'


The FPU rewrite removed the dynamic allocations of 'struct fpu'.
But, this potentially wastes massive amounts of memory (2k per
task on systems that do not have AVX-512 for instance).

Instead of having a separate slab, this patch just appends the
space that we need to the 'task_struct' which we dynamically
allocate already.  This saves from doing an extra slab allocation
at fork().  The only real downside here is that we have to stick
everything and the end of the task_struct.  But, I think the
BUILD_BUG_ON()s I stuck in there should keep that from being too
fragile.

This survives a quick build and boot in a VM.  Does anyone see any
real downsides to this?

Cc: Thomas Gleixner <tglx@...utronix.de>
Cc: Ingo Molnar <mingo@...hat.com>
Cc: "H. Peter Anvin" <hpa@...or.com>
Cc: x86@...nel.org
Cc: Peter Zijlstra <peterz@...radead.org>
Cc: Borislav Petkov <bp@...en8.de>
Cc: Andy Lutomirski <luto@...capital.net>
Cc: Linus Torvalds <torvalds@...ux-foundation.org>
Cc: linux-kernel@...r.kernel.org

---

 b/arch/x86/include/asm/fpu/types.h |   72 +++++++++++++++++++------------------
 b/arch/x86/include/asm/processor.h |   10 +++--
 b/arch/x86/kernel/fpu/init.c       |   39 ++++++++++++++++++++
 b/arch/x86/kernel/process.c        |    2 -
 b/fs/proc/kcore.c                  |    4 +-
 b/include/linux/sched.h            |   12 +++++-
 b/kernel/fork.c                    |    8 +++-
 7 files changed, 104 insertions(+), 43 deletions(-)

diff -puN arch/x86/include/asm/processor.h~dynamically-allocate-struct-fpu arch/x86/include/asm/processor.h
--- a/arch/x86/include/asm/processor.h~dynamically-allocate-struct-fpu	2015-07-16 10:50:42.340570967 -0700
+++ b/arch/x86/include/asm/processor.h	2015-07-16 11:37:19.182539322 -0700
@@ -390,9 +390,6 @@ struct thread_struct {
 #endif
 	unsigned long		gs;
 
-	/* Floating point and extended processor state */
-	struct fpu		fpu;
-
 	/* Save middle states of ptrace breakpoints */
 	struct perf_event	*ptrace_bps[HBP_NUM];
 	/* Debug status used for traps, single steps, etc... */
@@ -418,6 +415,13 @@ struct thread_struct {
 	unsigned long		iopl;
 	/* Max allowed port in the bitmap, in bytes: */
 	unsigned		io_bitmap_max;
+
+	/* Floating point and extended processor state */
+	struct fpu		fpu;
+	/*
+	 * WARNING: 'fpu' is dynamically-sized.  It *MUST* be at
+	 * the end.
+	 */
 };
 
 /*
diff -puN include/linux/sched.h~dynamically-allocate-struct-fpu include/linux/sched.h
--- a/include/linux/sched.h~dynamically-allocate-struct-fpu	2015-07-16 10:50:42.342571058 -0700
+++ b/include/linux/sched.h	2015-07-16 11:44:10.775174149 -0700
@@ -1522,8 +1522,6 @@ struct task_struct {
 /* hung task detection */
 	unsigned long last_switch_count;
 #endif
-/* CPU-specific state of this task */
-	struct thread_struct thread;
 /* filesystem information */
 	struct fs_struct *fs;
 /* open file information */
@@ -1778,8 +1776,18 @@ struct task_struct {
 	unsigned long	task_state_change;
 #endif
 	int pagefault_disabled;
+/* CPU-specific state of this task */
+	struct thread_struct thread;
+/*
+ * WARNING: on x86, 'thread_struct' contains a variable-sized
+ * structure.  It *MUST* be at the end of 'task_struct'.
+ *
+ * Do not put anything below here!
+ */
 };
 
+extern int arch_task_struct_size(void);
+
 /* Future-safe accessor for struct task_struct's cpus_allowed. */
 #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
 
diff -puN arch/x86/kernel/fpu/init.c~dynamically-allocate-struct-fpu arch/x86/kernel/fpu/init.c
--- a/arch/x86/kernel/fpu/init.c~dynamically-allocate-struct-fpu	2015-07-16 10:50:42.355571648 -0700
+++ b/arch/x86/kernel/fpu/init.c	2015-07-16 12:02:15.284280976 -0700
@@ -136,6 +136,45 @@ static void __init fpu__init_system_gene
 unsigned int xstate_size;
 EXPORT_SYMBOL_GPL(xstate_size);
 
+#define CHECK_MEMBER_AT_END_OF(TYPE, MEMBER)	\
+	BUILD_BUG_ON((sizeof(TYPE) -			\
+			offsetof(TYPE, MEMBER) -	\
+			sizeof(((TYPE *)0)->MEMBER)) > 	\
+			0)				\
+
+/*
+ * We append the 'struct fpu' to the task_struct.
+ */
+int __weak arch_task_struct_size(void)
+{
+	int task_size = sizeof(struct task_struct);
+
+	/*
+	 * Subtract off the static size of the register state.
+	 * It potentially has a bunch of padding.
+	 */
+	task_size -= sizeof(((struct task_struct *)0)->thread.fpu.state);
+
+	/*
+	 * Add back the dynamically-calculated register state
+	 * size.
+	 */
+	task_size += xstate_size;
+
+	/*
+	 * We dynamically size 'struct fpu', so we require that
+	 * it be at the end of 'thread_struct' and that
+	 * 'thread_struct' be at the end of 'task_struct'.  If
+	 * you hit a compile error here, check the structure to
+	 * see if something got added to the end.
+	 */
+	CHECK_MEMBER_AT_END_OF(struct fpu, state);
+	CHECK_MEMBER_AT_END_OF(struct thread_struct, fpu);
+	CHECK_MEMBER_AT_END_OF(struct task_struct, thread);
+
+	return task_size;
+}
+
 /*
  * Set up the xstate_size based on the legacy FPU context size.
  *
diff -puN kernel/fork.c~dynamically-allocate-struct-fpu kernel/fork.c
--- a/kernel/fork.c~dynamically-allocate-struct-fpu	2015-07-16 10:50:42.357571739 -0700
+++ b/kernel/fork.c	2015-07-16 11:25:53.873498188 -0700
@@ -287,15 +287,21 @@ static void set_max_threads(unsigned int
 	max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS);
 }
 
+int __weak arch_task_struct_size(void)
+{
+	return sizeof(struct task_struct);
+}
+
 void __init fork_init(void)
 {
+	int task_struct_size = arch_task_struct_size();
 #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
 #ifndef ARCH_MIN_TASKALIGN
 #define ARCH_MIN_TASKALIGN	L1_CACHE_BYTES
 #endif
 	/* create a slab on which task_structs can be allocated */
 	task_struct_cachep =
-		kmem_cache_create("task_struct", sizeof(struct task_struct),
+		kmem_cache_create("task_struct", task_struct_size,
 			ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);
 #endif
 
diff -puN arch/x86/include/asm/fpu/types.h~dynamically-allocate-struct-fpu arch/x86/include/asm/fpu/types.h
--- a/arch/x86/include/asm/fpu/types.h~dynamically-allocate-struct-fpu	2015-07-16 10:50:42.358571784 -0700
+++ b/arch/x86/include/asm/fpu/types.h	2015-07-16 11:34:14.471174584 -0700
@@ -189,6 +189,7 @@ union fpregs_state {
 	struct fxregs_state		fxsave;
 	struct swregs_state		soft;
 	struct xregs_state		xsave;
+	u8 __padding[PAGE_SIZE];
 };
 
 /*
@@ -198,40 +199,6 @@ union fpregs_state {
  */
 struct fpu {
 	/*
-	 * @state:
-	 *
-	 * In-memory copy of all FPU registers that we save/restore
-	 * over context switches. If the task is using the FPU then
-	 * the registers in the FPU are more recent than this state
-	 * copy. If the task context-switches away then they get
-	 * saved here and represent the FPU state.
-	 *
-	 * After context switches there may be a (short) time period
-	 * during which the in-FPU hardware registers are unchanged
-	 * and still perfectly match this state, if the tasks
-	 * scheduled afterwards are not using the FPU.
-	 *
-	 * This is the 'lazy restore' window of optimization, which
-	 * we track though 'fpu_fpregs_owner_ctx' and 'fpu->last_cpu'.
-	 *
-	 * We detect whether a subsequent task uses the FPU via setting
-	 * CR0::TS to 1, which causes any FPU use to raise a #NM fault.
-	 *
-	 * During this window, if the task gets scheduled again, we
-	 * might be able to skip having to do a restore from this
-	 * memory buffer to the hardware registers - at the cost of
-	 * incurring the overhead of #NM fault traps.
-	 *
-	 * Note that on modern CPUs that support the XSAVEOPT (or other
-	 * optimized XSAVE instructions), we don't use #NM traps anymore,
-	 * as the hardware can track whether FPU registers need saving
-	 * or not. On such CPUs we activate the non-lazy ('eagerfpu')
-	 * logic, which unconditionally saves/restores all FPU state
-	 * across context switches. (if FPU state exists.)
-	 */
-	union fpregs_state		state;
-
-	/*
 	 * @last_cpu:
 	 *
 	 * Records the last CPU on which this context was loaded into
@@ -288,6 +255,43 @@ struct fpu {
 	 * deal with bursty apps that only use the FPU for a short time:
 	 */
 	unsigned char			counter;
+	/*
+	 * @state:
+	 *
+	 * In-memory copy of all FPU registers that we save/restore
+	 * over context switches. If the task is using the FPU then
+	 * the registers in the FPU are more recent than this state
+	 * copy. If the task context-switches away then they get
+	 * saved here and represent the FPU state.
+	 *
+	 * After context switches there may be a (short) time period
+	 * during which the in-FPU hardware registers are unchanged
+	 * and still perfectly match this state, if the tasks
+	 * scheduled afterwards are not using the FPU.
+	 *
+	 * This is the 'lazy restore' window of optimization, which
+	 * we track though 'fpu_fpregs_owner_ctx' and 'fpu->last_cpu'.
+	 *
+	 * We detect whether a subsequent task uses the FPU via setting
+	 * CR0::TS to 1, which causes any FPU use to raise a #NM fault.
+	 *
+	 * During this window, if the task gets scheduled again, we
+	 * might be able to skip having to do a restore from this
+	 * memory buffer to the hardware registers - at the cost of
+	 * incurring the overhead of #NM fault traps.
+	 *
+	 * Note that on modern CPUs that support the XSAVEOPT (or other
+	 * optimized XSAVE instructions), we don't use #NM traps anymore,
+	 * as the hardware can track whether FPU registers need saving
+	 * or not. On such CPUs we activate the non-lazy ('eagerfpu')
+	 * logic, which unconditionally saves/restores all FPU state
+	 * across context switches. (if FPU state exists.)
+	 */
+	union fpregs_state		state;
+	/*
+	 * WARNING: 'state' is dynamically-sized.  Do not put
+	 * anything after it here.
+	 */
 };
 
 #endif /* _ASM_X86_FPU_H */
diff -puN arch/x86/kernel/process.c~dynamically-allocate-struct-fpu arch/x86/kernel/process.c
--- a/arch/x86/kernel/process.c~dynamically-allocate-struct-fpu	2015-07-16 10:50:42.360571875 -0700
+++ b/arch/x86/kernel/process.c	2015-07-16 12:00:59.204808551 -0700
@@ -81,7 +81,7 @@ EXPORT_SYMBOL_GPL(idle_notifier_unregist
  */
 int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 {
-	*dst = *src;
+	memcpy(dst, src, arch_task_struct_size());
 
 	return fpu__copy(&dst->thread.fpu, &src->thread.fpu);
 }
diff -puN fs/proc/kcore.c~dynamically-allocate-struct-fpu fs/proc/kcore.c
--- a/fs/proc/kcore.c~dynamically-allocate-struct-fpu	2015-07-16 11:40:13.787445254 -0700
+++ b/fs/proc/kcore.c	2015-07-16 11:42:04.397453020 -0700
@@ -92,7 +92,7 @@ static size_t get_kcore_size(int *nphdr,
 			     roundup(sizeof(CORE_STR), 4)) +
 			roundup(sizeof(struct elf_prstatus), 4) +
 			roundup(sizeof(struct elf_prpsinfo), 4) +
-			roundup(sizeof(struct task_struct), 4);
+			roundup(arch_task_struct_size(), 4);
 	*elf_buflen = PAGE_ALIGN(*elf_buflen);
 	return size + *elf_buflen;
 }
@@ -415,7 +415,7 @@ static void elf_kcore_store_hdr(char *bu
 	/* set up the task structure */
 	notes[2].name	= CORE_STR;
 	notes[2].type	= NT_TASKSTRUCT;
-	notes[2].datasz	= sizeof(struct task_struct);
+	notes[2].datasz	= arch_task_struct_size();
 	notes[2].data	= current;
 
 	nhdr->p_filesz	+= notesize(&notes[2]);
_
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ