lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Date:   Tue,  6 Dec 2016 15:04:45 -0800
From:   Kyle Huey <me@...ehuey.com>
To:     Ingo Molnar <mingo@...hat.com>,
        Thomas Gleixner <tglx@...utronix.de>,
        "H. Peter Anvin" <hpa@...or.com>, x86@...nel.org
Cc:     linux-kernel@...r.kernel.org,
        Robert O'Callahan <robert@...llahan.org>
Subject: [PATCH] x86: Optimize TIF checking in __switch_to_xtra.

For the TIF_BLOCKSTEP and TIF_NOTSC cases, __switch_to_xtra only needs to act
if the value of the flags changes with the task switch. That leads to C code
like:

    if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^
        test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) {
        /* do work */
    }
    if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
        test_tsk_thread_flag(next_p, TIF_NOTSC)) {
        /* do some more work */
    }

GCC compiles that to:

    mov    %rdi,%r13
    mov    (%rdi),%rax
    mov    %rsi,%r12
    mov    (%rsi),%rdx
    shr    $0x19,%rax
    shr    $0x19,%rdx
    and    $0x1,%eax
    and    $0x1,%edx
    cmp    %al,%dl
    je     1f
    /* do work */
1:  mov    (%r13),%rax
    mov    (%r12),%rdx
    shr    $0x10,%rax
    shr    $0x10,%rdx
    and    $0x1,%edx
    and    $0x1,%eax
    cmp    %al,%dl
    je     2f
    /* do some more work */
2:

This does a bunch of unnecessary work. If we are not going to do the
TIF_BLOCKSTEP-specific work here, we don't care about the actual values of the
bits, only whether or not they are different. And if we are going to do that
work, GCC won't reuse the computation anyway, because test_tsk_thread_flag
calls test_bit, which takes a volatile argument.

In https://lkml.org/lkml/2016/12/3/120, Ingo is currently blocking my attempt
to add a third flag of this type.

I propose the following patch, which results in GCC generating this code:

    mov    (%rsi),%rax
    xor    (%rdi),%rax
    mov    %rdi,%rbx
    mov    %rsi,%r12
    test   $0x2000000,%eax
    mov    %rax,%rdx
    jne    3f
1:  test   $0x10000,%edx
    je     2f
    /* do some more work */
2:  /* the rest of the function */
    ret
3:  /* do work */
    mov    (%r12),%rdx
    xor    (%rbx),%rdx
    jmp    1b

When we are not doing the work controlled by these flags this is significantly
better. We only have to load the flags from memory once, and we do a single
XOR that we can test multiple times instead of repeatedly isolating bits.
When we are doing the work controlled by these flags, this is slightly better.
We still get to avoid isolating particular bits, and we simply recompute the
XOR before jumping back if the register was clobbered by the work. Finally,
in either event, additional TIF checks of this form simply become another test
and branch, reducing the overhead of the new check I would like to add.

Signed-off-by: Kyle Huey <khuey@...ehuey.com>
---
 arch/x86/kernel/process.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 0888a879120f..bbe85c377345 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -187,37 +187,45 @@ int set_tsc_mode(unsigned int val)
 	else if (val == PR_TSC_ENABLE)
 		enable_TSC();
 	else
 		return -EINVAL;
 
 	return 0;
 }
 
+static inline int test_tsk_threads_flag_differ(struct task_struct *prev,
+					       struct task_struct *next,
+					       unsigned long mask)
+{
+	unsigned long diff;
+
+	diff = task_thread_info(prev)->flags ^ task_thread_info(next)->flags;
+	return (diff & mask) != 0;
+}
+
 void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
 		      struct tss_struct *tss)
 {
 	struct thread_struct *prev, *next;
 
 	prev = &prev_p->thread;
 	next = &next_p->thread;
 
-	if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^
-	    test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) {
+	if (test_tsk_threads_flag_differ(prev_p, next_p, _TIF_BLOCKSTEP)) {
 		unsigned long debugctl = get_debugctlmsr();
 
 		debugctl &= ~DEBUGCTLMSR_BTF;
 		if (test_tsk_thread_flag(next_p, TIF_BLOCKSTEP))
 			debugctl |= DEBUGCTLMSR_BTF;
 
 		update_debugctlmsr(debugctl);
 	}
 
-	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
-	    test_tsk_thread_flag(next_p, TIF_NOTSC)) {
+	if (test_tsk_threads_flag_differ(prev_p, next_p, _TIF_NOTSC)) {
 		/* prev and next are different */
 		if (test_tsk_thread_flag(next_p, TIF_NOTSC))
 			hard_disable_TSC();
 		else
 			hard_enable_TSC();
 	}
 
 	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
-- 
2.11.0

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ