lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1430429035-25563-4-git-send-email-riel@redhat.com>
Date:	Thu, 30 Apr 2015 17:23:55 -0400
From:	riel@...hat.com
To:	linux-kernel@...r.kernel.org
Cc:	x86@...nel.org, williams@...hat.com, luto@...nel.org,
	mingo@...nel.org, bonzini@...hat.com, fweisbec@...hat.com,
	peterz@...radead.org, heiko.carstens@...ibm.com,
	tglx@...utronix.de, Rik van Riel <riel@...hat.com>,
	Ingo Molnar <mingo@...hat.com>,
	Paolo Bonzini <pbonzini@...hat.com>
Subject: [PATCH 3/3] context_tracking,x86: remove extraneous irq disable & enable from context tracking on syscall entry

From: Rik van Riel <riel@...hat.com>

On syscall entry with nohz_full on, we enable interrupts, call user_exit,
disable interrupts, do something, re-enable interrupts, and go on our
merry way.

Profiling shows that a large amount of the nohz_full overhead comes
from the extraneous disabling and re-enabling of interrupts. Andy
suggested simply not enabling interrupts until after the context
tracking code has done its thing, which allows us to skip a whole
interrupt disable & re-enable cycle.

This patch builds on top of these patches by Paolo:
https://lkml.org/lkml/2015/4/28/188
https://lkml.org/lkml/2015/4/29/139

Together with the patch I posted earlier this week
(https://lkml.org/lkml/2015/4/24/394), the syscall path on a nohz_full
CPU seems to be about 10% faster.

My test is a simple microbenchmark that calls getpriority() in a loop
10 million times:

		run time	system time
vanilla		5.49s		2.08s
__acct patch	5.21s		1.92s
both patches	4.88s		1.71s

Cc: Frederic Weisbecker <fweisbec@...hat.com>
Cc: Ingo Molnar <mingo@...hat.com>
Cc: Paolo Bonzini <pbonzini@...hat.com>
Cc: Heiko Carstens <heiko.carstens@...ibm.com>
Cc: Thomas Gleixner <tglx@...utronix.de>
Suggested-by: Andy Lutomirski <amluto@...capital.net>
Signed-off-by: Rik van Riel <riel@...hat.com>
---
 arch/x86/kernel/entry_32.S       |  4 ++--
 arch/x86/kernel/entry_64.S       |  4 ++--
 arch/x86/kernel/ptrace.c         |  6 +++++-
 include/linux/context_tracking.h | 11 +++++++++++
 4 files changed, 20 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 1c309763e321..0bdf8c7057e4 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -406,7 +406,6 @@ ENTRY(ia32_sysenter_target)
 
 	pushl_cfi %eax
 	SAVE_ALL
-	ENABLE_INTERRUPTS(CLBR_NONE)
 
 /*
  * Load the potential sixth argument from user stack.
@@ -424,6 +423,7 @@ ENTRY(ia32_sysenter_target)
 
 	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
 	jnz sysenter_audit
+	ENABLE_INTERRUPTS(CLBR_NONE)
 sysenter_do_call:
 	cmpl $(NR_syscalls), %eax
 	jae sysenter_badsys
@@ -647,7 +647,7 @@ END(work_pending)
 syscall_trace_entry:
 	movl $-ENOSYS,PT_EAX(%esp)
 	movl %esp, %eax
-	call syscall_trace_enter
+	call syscall_trace_enter	/* returns with irqs enabled */
 	/* What it returned is what we'll actually use.  */
 	cmpl $(NR_syscalls), %eax
 	jnae syscall_call
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 02c2eff7478d..f7751da7b53e 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -228,7 +228,6 @@ GLOBAL(system_call_after_swapgs)
 	 * task preemption. We must enable interrupts only after we're done
 	 * with using rsp_scratch:
 	 */
-	ENABLE_INTERRUPTS(CLBR_NONE)
 	pushq_cfi	%r11			/* pt_regs->flags */
 	pushq_cfi	$__USER_CS		/* pt_regs->cs */
 	pushq_cfi	%rcx			/* pt_regs->ip */
@@ -248,6 +247,7 @@ GLOBAL(system_call_after_swapgs)
 
 	testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	jnz tracesys
+	ENABLE_INTERRUPTS(CLBR_NONE)
 system_call_fastpath:
 #if __SYSCALL_MASK == ~0
 	cmpq $__NR_syscall_max,%rax
@@ -313,7 +313,7 @@ GLOBAL(system_call_after_swapgs)
 tracesys:
 	movq %rsp, %rdi
 	movl $AUDIT_ARCH_X86_64, %esi
-	call syscall_trace_enter_phase1
+	call syscall_trace_enter_phase1 /* returns with interrupts enabled */
 	test %rax, %rax
 	jnz tracesys_phase2		/* if needed, run the slow path */
 	RESTORE_C_REGS_EXCEPT_RAX	/* else restore clobbered regs */
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index a7bc79480719..066c86d0b68c 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1456,6 +1456,8 @@ static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
  *
  * NB: We don't have full pt_regs here, but regs->orig_ax and regs->ax
  * are fully functional.
+ * Called with IRQs disabled, to be enabled after the context tracking
+ * code has run.
  *
  * For phase 2's benefit, our return value is:
  * 0:			resume the syscall
@@ -1477,10 +1479,12 @@ unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
 	 * doing anything that could touch RCU.
 	 */
 	if (work & _TIF_NOHZ) {
-		user_exit();
+		user_exit_irqsoff();
 		work &= ~_TIF_NOHZ;
 	}
 
+	local_irq_enable();
+
 #ifdef CONFIG_SECCOMP
 	/*
 	 * Do seccomp first -- it should minimize exposure of other
diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h
index 5d3719aed958..dc3b169b2b70 100644
--- a/include/linux/context_tracking.h
+++ b/include/linux/context_tracking.h
@@ -25,12 +25,23 @@ static inline void user_enter(void)
 		context_tracking_enter(CONTEXT_USER);
 
 }
+
 static inline void user_exit(void)
 {
 	if (context_tracking_is_enabled())
 		context_tracking_exit(CONTEXT_USER);
 }
 
+/* Called with IRQs already disabled. */
+static inline void user_exit_irqsoff(void)
+{
+	if (in_interrupt())
+		return;
+
+	if (context_tracking_is_enabled())
+		__context_tracking_exit(CONTEXT_USER);
+}
+
 static inline enum ctx_state exception_enter(void)
 {
 	enum ctx_state prev_ctx;
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ