[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20140425210241.GB30532@phenom.dumpdata.com>
Date: Fri, 25 Apr 2014 17:02:41 -0400
From: Konrad Rzeszutek Wilk <konrad.wilk@...cle.com>
To: "H. Peter Anvin" <hpa@...ux.intel.com>, boris.ostrovsky@...cle.com
Cc: Linux Kernel Mailing List <linux-kernel@...r.kernel.org>,
"H. Peter Anvin" <hpa@...or.com>,
Linus Torvalds <torvalds@...ux-foundation.org>,
Ingo Molnar <mingo@...nel.org>,
Alexander van Heukelum <heukelum@...tmail.fm>,
Andy Lutomirski <amluto@...il.com>,
Boris Ostrovsky <boris.ostrovsky@...cle.com>,
Borislav Petkov <bp@...en8.de>,
Arjan van de Ven <arjan.van.de.ven@...el.com>,
Brian Gerst <brgerst@...il.com>,
Alexandre Julliard <julliard@...ehq.com>,
Andi Kleen <andi@...stfloor.org>,
Thomas Gleixner <tglx@...utronix.de>
Subject: Re: [PATCH] x86-64: espfix for 64-bit mode *PROTOTYPE*
On Tue, Apr 22, 2014 at 06:17:21PM -0700, H. Peter Anvin wrote:
> Another spin of the prototype. This one avoids the espfix for anything
> but #GP, and avoids save/restore/saving registers... one can wonder,
> though, how much that actually matters in practice.
>
> It still does redundant SWAPGS on the slow path. I'm not sure I
> personally care enough to optimize that, as it means some fairly
> significant restructuring of some of the code paths. Some of that
> restructuring might actually be beneficial, but still...
Sorry about being late to the party.
.. snip..
> diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
> new file mode 100644
> index 000000000000..05567d706f92
> --- /dev/null
> +++ b/arch/x86/kernel/espfix_64.c
> @@ -0,0 +1,136 @@
> +/* ----------------------------------------------------------------------- *
> + *
> + * Copyright 2014 Intel Corporation; author: H. Peter Anvin
> + *
> + * This file is part of the Linux kernel, and is made available under
> + * the terms of the GNU General Public License version 2 or (at your
> + * option) any later version; incorporated herein by reference.
> + *
> + * ----------------------------------------------------------------------- */
> +
> +#include <linux/init.h>
> +#include <linux/kernel.h>
> +#include <linux/percpu.h>
> +#include <linux/gfp.h>
> +#include <asm/pgtable.h>
> +
> +#define ESPFIX_STACK_SIZE 64UL
> +#define ESPFIX_STACKS_PER_PAGE (PAGE_SIZE/ESPFIX_STACK_SIZE)
> +
> +#define ESPFIX_MAX_CPUS (ESPFIX_STACKS_PER_PAGE << (PGDIR_SHIFT-PAGE_SHIFT-16))
> +#if CONFIG_NR_CPUS > ESPFIX_MAX_CPUS
> +# error "Need more than one PGD for the ESPFIX hack"
> +#endif
> +
> +#define ESPFIX_BASE_ADDR (-2UL << PGDIR_SHIFT)
> +
> +#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)
> +
> +/* This contains the *bottom* address of the espfix stack */
> +DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack);
> +
> +/* Initialization mutex - should this be a spinlock? */
> +static DEFINE_MUTEX(espfix_init_mutex);
> +
> +/* Page allocation bitmap - each page serves ESPFIX_STACKS_PER_PAGE CPUs */
> +#define ESPFIX_MAX_PAGES DIV_ROUND_UP(CONFIG_NR_CPUS, ESPFIX_STACKS_PER_PAGE)
> +#define ESPFIX_MAP_SIZE DIV_ROUND_UP(ESPFIX_MAX_PAGES, BITS_PER_LONG)
> +static unsigned long espfix_page_alloc_map[ESPFIX_MAP_SIZE];
> +
> +static __page_aligned_bss pud_t espfix_pud_page[PTRS_PER_PUD]
> + __aligned(PAGE_SIZE);
> +
> +/*
> + * This returns the bottom address of the espfix stack for a specific CPU.
> + * The math allows for a non-power-of-two ESPFIX_STACK_SIZE, in which case
> + * we have to account for some amount of padding at the end of each page.
> + */
> +static inline unsigned long espfix_base_addr(unsigned int cpu)
> +{
> + unsigned long page, addr;
> +
> + page = (cpu / ESPFIX_STACKS_PER_PAGE) << PAGE_SHIFT;
> + addr = page + (cpu % ESPFIX_STACKS_PER_PAGE) * ESPFIX_STACK_SIZE;
> + addr = (addr & 0xffffUL) | ((addr & ~0xffffUL) << 16);
> + addr += ESPFIX_BASE_ADDR;
> + return addr;
> +}
> +
> +#define PTE_STRIDE (65536/PAGE_SIZE)
> +#define ESPFIX_PTE_CLONES (PTRS_PER_PTE/PTE_STRIDE)
> +#define ESPFIX_PMD_CLONES PTRS_PER_PMD
> +#define ESPFIX_PUD_CLONES (65536/(ESPFIX_PTE_CLONES*ESPFIX_PMD_CLONES))
> +
> +void init_espfix_this_cpu(void)
> +{
> + unsigned int cpu, page;
> + unsigned long addr;
> + pgd_t pgd, *pgd_p;
> + pud_t pud, *pud_p;
> + pmd_t pmd, *pmd_p;
> + pte_t pte, *pte_p;
> + int n;
> + void *stack_page;
> + pteval_t ptemask;
> +
> + /* We only have to do this once... */
> + if (likely(this_cpu_read(espfix_stack)))
> + return; /* Already initialized */
> +
> + cpu = smp_processor_id();
> + addr = espfix_base_addr(cpu);
> + page = cpu/ESPFIX_STACKS_PER_PAGE;
> +
> + /* Did another CPU already set this up? */
> + if (likely(test_bit(page, espfix_page_alloc_map)))
> + goto done;
> +
> + mutex_lock(&espfix_init_mutex);
> +
> + /* Did we race on the lock? */
> + if (unlikely(test_bit(page, espfix_page_alloc_map)))
> + goto unlock_done;
> +
> + ptemask = __supported_pte_mask;
> +
> + pgd_p = &init_level4_pgt[pgd_index(addr)];
> + pgd = *pgd_p;
> + if (!pgd_present(pgd)) {
> + /* This can only happen on the BSP */
> + pgd = __pgd(__pa_symbol(espfix_pud_page) |
Any particular reason you are using __pgd?
> + (_KERNPG_TABLE & ptemask));
> + set_pgd(pgd_p, pgd);
> + }
> +
> + pud_p = &espfix_pud_page[pud_index(addr)];
> + pud = *pud_p;
> + if (!pud_present(pud)) {
> + pmd_p = (pmd_t *)__get_free_page(PGALLOC_GFP);
> + pud = __pud(__pa(pmd_p) | (_KERNPG_TABLE & ptemask));
__pud?
> + for (n = 0; n < ESPFIX_PUD_CLONES; n++)
> + set_pud(&pud_p[n], pud);
> + }
> +
> + pmd_p = pmd_offset(&pud, addr);
> + pmd = *pmd_p;
> + if (!pmd_present(pmd)) {
> + pte_p = (pte_t *)__get_free_page(PGALLOC_GFP);
> + pmd = __pmd(__pa(pte_p) | (_KERNPG_TABLE & ptemask));
and __pmd?
> + for (n = 0; n < ESPFIX_PMD_CLONES; n++)
> + set_pmd(&pmd_p[n], pmd);
> + }
> +
> + pte_p = pte_offset_kernel(&pmd, addr);
> + stack_page = (void *)__get_free_page(GFP_KERNEL);
> + pte = __pte(__pa(stack_page) | (__PAGE_KERNEL & ptemask));
and __pte instead of the 'pgd', 'pud', 'pmd' and 'pte' macros?
> + for (n = 0; n < ESPFIX_PTE_CLONES; n++)
> + set_pte(&pte_p[n*PTE_STRIDE], pte);
> +
> + /* Job is done for this CPU and any CPU which shares this page */
> + set_bit(page, espfix_page_alloc_map);
> +
> +unlock_done:
> + mutex_unlock(&espfix_init_mutex);
> +done:
> + this_cpu_write(espfix_stack, addr);
> +}
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists