lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <561CEF77.3050309@suse.cz>
Date:	Tue, 13 Oct 2015 13:48:07 +0200
From:	Vlastimil Babka <vbabka@...e.cz>
To:	Martin Schwidefsky <schwidefsky@...ibm.com>
Cc:	linux-mm@...ck.org, linux-kernel@...r.kernel.org,
	Yong Sun <yosun@...e.com>, linux390@...ibm.com,
	linux-s390@...r.kernel.org, Zhang Yi <wetpzy@...il.com>,
	Mel Gorman <mgorman@...e.de>,
	Andrea Arcangeli <aarcange@...hat.com>,
	"Kirill A. Shutemov" <kirill.shutemov@...ux.intel.com>,
	Hugh Dickins <hughd@...gle.com>,
	Dominik Dingel <dingel@...ux.vnet.ibm.com>,
	Heiko Carstens <heiko.carstens@...ibm.com>,
	Christian Borntraeger <borntraeger@...ibm.com>
Subject: Re: [RFC] futex: prevent endless loop on s390x with emulated
 hugepages

On 09/28/2015 01:49 PM, Martin Schwidefsky wrote:
> On Thu, 24 Sep 2015 17:05:48 +0200
> Vlastimil Babka <vbabka@...e.cz> wrote:

[...]

>> However, __get_user_pages_fast() is still broken. The get_user_pages_fast()
>> wrapper will hide this in the common case. The other user of the __ variant
>> is kvm, which is mentioned as the reason for removal of emulated hugepages.
>> The call of page_cache_get_speculative() looks also broken in this scenario
>> on debug builds because of VM_BUG_ON_PAGE(PageTail(page), page). With
>> CONFIG_TINY_RCU enabled, there's plain atomic_inc(&page->_count) which also
>> probably shouldn't happen for a tail page...
>
> It boils down to __get_user_pages_fast being broken for emulated large pages,
> doesn't it? My preferred fix would be to get __get_user_page_fast to work
> in this case.

I agree, but didn't know enough of the architecture to attempt such fix 
:) Thanks!

> For 3.12 a patch would look like this (needs more testing
> though):

FWIW it works for me in the particular LTP test, but as you said, it 
needs more testing and breaking stable would suck.

> --
> diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
> index bb0c157..5948b7f 100644
> --- a/arch/s390/include/asm/pgtable.h
> +++ b/arch/s390/include/asm/pgtable.h
> @@ -370,7 +370,7 @@ static inline int is_module_addr(void *addr)
>   #define _SEGMENT_ENTRY_EMPTY	(_SEGMENT_ENTRY_INVALID)
>
>   #define _SEGMENT_ENTRY_LARGE	0x400	/* STE-format control, large page   */
> -#define _SEGMENT_ENTRY_CO	0x100	/* change-recording override   */
> +#define _SEGMENT_ENTRY_SWLARGE	0x100	/* SW large page bit */
>   #define _SEGMENT_ENTRY_SPLIT	0x001	/* THP splitting bit */
>   #define _SEGMENT_ENTRY_YOUNG	0x002	/* SW segment young bit */
>   #define _SEGMENT_ENTRY_NONE	_SEGMENT_ENTRY_YOUNG
> @@ -391,8 +391,8 @@ static inline int is_module_addr(void *addr)
>   #define _SEGMENT_ENTRY_SPLIT_BIT 0	/* THP splitting bit number */
>
>   /* Set of bits not changed in pmd_modify */
> -#define _SEGMENT_CHG_MASK	(_SEGMENT_ENTRY_ORIGIN | _SEGMENT_ENTRY_LARGE \
> -				 | _SEGMENT_ENTRY_SPLIT | _SEGMENT_ENTRY_CO)
> +#define _SEGMENT_CHG_MASK (_SEGMENT_ENTRY_ORIGIN | _SEGMENT_ENTRY_LARGE \
> +			   | _SEGMENT_ENTRY_SPLIT | _SEGMENT_ENTRY_SWLARGE)
>
>   /* Page status table bits for virtualization */
>   #define PGSTE_ACC_BITS	0xf000000000000000UL
> @@ -563,12 +563,25 @@ static inline int pmd_none(pmd_t pmd)
>   static inline int pmd_large(pmd_t pmd)
>   {
>   #ifdef CONFIG_64BIT
> -	return (pmd_val(pmd) & _SEGMENT_ENTRY_LARGE) != 0;
> +	return (pmd_val(pmd) &
> +		(_SEGMENT_ENTRY_LARGE | _SEGMENT_ENTRY_SWLARGE)) != 0;
>   #else
>   	return 0;
>   #endif
>   }
>
> +static inline pmd_t pmd_swlarge_deref(pmd_t pmd)
> +{
> +	unsigned long origin;
> +
> +	if (pmd_val(pmd) & _SEGMENT_ENTRY_SWLARGE) {
> +		origin = pmd_val(pmd) & _SEGMENT_ENTRY_ORIGIN;
> +		pmd_val(pmd) &= ~_SEGMENT_ENTRY_ORIGIN;
> +		pmd_val(pmd) |= *(unsigned long *) origin;
> +	}
> +	return pmd;
> +}
> +
>   static inline int pmd_prot_none(pmd_t pmd)
>   {
>   	return (pmd_val(pmd) & _SEGMENT_ENTRY_INVALID) &&
> @@ -578,8 +591,10 @@ static inline int pmd_prot_none(pmd_t pmd)
>   static inline int pmd_bad(pmd_t pmd)
>   {
>   #ifdef CONFIG_64BIT
> -	if (pmd_large(pmd))
> +	if (pmd_large(pmd)) {
> +		pmd = pmd_swlarge_deref(pmd);
>   		return (pmd_val(pmd) & ~_SEGMENT_ENTRY_BITS_LARGE) != 0;
> +	}
>   #endif
>   	return (pmd_val(pmd) & ~_SEGMENT_ENTRY_BITS) != 0;
>   }
> @@ -1495,8 +1510,6 @@ static inline int pmd_trans_splitting(pmd_t pmd)
>   static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
>   			      pmd_t *pmdp, pmd_t entry)
>   {
> -	if (!(pmd_val(entry) & _SEGMENT_ENTRY_INVALID) && MACHINE_HAS_EDAT1)
> -		pmd_val(entry) |= _SEGMENT_ENTRY_CO;
>   	*pmdp = entry;
>   }
>
> diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c
> index 46d517c..e159735 100644
> --- a/arch/s390/mm/dump_pagetables.c
> +++ b/arch/s390/mm/dump_pagetables.c
> @@ -129,7 +129,7 @@ static void walk_pte_level(struct seq_file *m, struct pg_state *st,
>   }
>
>   #ifdef CONFIG_64BIT
> -#define _PMD_PROT_MASK (_SEGMENT_ENTRY_PROTECT | _SEGMENT_ENTRY_CO)
> +#define _PMD_PROT_MASK (_SEGMENT_ENTRY_PROTECT)
>   #else
>   #define _PMD_PROT_MASK 0
>   #endif
> @@ -138,7 +138,7 @@ static void walk_pmd_level(struct seq_file *m, struct pg_state *st,
>   			   pud_t *pud, unsigned long addr)
>   {
>   	unsigned int prot;
> -	pmd_t *pmd;
> +	pmd_t *pmd, pmd_val;
>   	int i;
>
>   	for (i = 0; i < PTRS_PER_PMD && addr < max_addr; i++) {
> @@ -146,7 +146,8 @@ static void walk_pmd_level(struct seq_file *m, struct pg_state *st,
>   		pmd = pmd_offset(pud, addr);
>   		if (!pmd_none(*pmd)) {
>   			if (pmd_large(*pmd)) {
> -				prot = pmd_val(*pmd) & _PMD_PROT_MASK;
> +				pmd_val = pmd_swlarge_deref(*pmd);
> +				prot = pmd_val(pmd_val) & _PMD_PROT_MASK;
>   				note_page(m, st, prot, 3);
>   			} else
>   				walk_pte_level(m, st, pmd, addr);
> diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c
> index 5d758db..8dd86a5 100644
> --- a/arch/s390/mm/gup.c
> +++ b/arch/s390/mm/gup.c
> @@ -48,8 +48,9 @@ static inline int gup_pte_range(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
>   	return 1;
>   }
>
> -static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
> -		unsigned long end, int write, struct page **pages, int *nr)
> +static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd_orig, pmd_t pmd,
> +		unsigned long addr, unsigned long end,
> +		int write, struct page **pages, int *nr)
>   {
>   	unsigned long mask, result;
>   	struct page *head, *page, *tail;
> @@ -78,7 +79,7 @@ static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
>   		return 0;
>   	}
>
> -	if (unlikely(pmd_val(pmd) != pmd_val(*pmdp))) {
> +	if (unlikely(pmd_val(pmd_orig) != pmd_val(*pmdp))) {
>   		*nr -= refs;
>   		while (refs--)
>   			put_page(head);
> @@ -103,7 +104,7 @@ static inline int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr,
>   		unsigned long end, int write, struct page **pages, int *nr)
>   {
>   	unsigned long next;
> -	pmd_t *pmdp, pmd;
> +	pmd_t *pmdp, pmd, pmd_orig;
>
>   	pmdp = (pmd_t *) pudp;
>   #ifdef CONFIG_64BIT
> @@ -112,7 +113,7 @@ static inline int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr,
>   	pmdp += pmd_index(addr);
>   #endif
>   	do {
> -		pmd = *pmdp;
> +		pmd = pmd_orig = *pmdp;
>   		barrier();
>   		next = pmd_addr_end(addr, end);
>   		/*
> @@ -127,8 +128,9 @@ static inline int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr,
>   		if (pmd_none(pmd) || pmd_trans_splitting(pmd))
>   			return 0;
>   		if (unlikely(pmd_large(pmd))) {
> -			if (!gup_huge_pmd(pmdp, pmd, addr, next,
> -					  write, pages, nr))
> +			if (!gup_huge_pmd(pmdp, pmd_orig,
> +					  pmd_swlarge_deref(pmd),
> +					  addr, next, write, pages, nr))
>   				return 0;
>   		} else if (!gup_pte_range(pmdp, pmd, addr, next,
>   					  write, pages, nr))
> diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
> index 99a68d5..d4a17d9 100644
> --- a/arch/s390/mm/hugetlbpage.c
> +++ b/arch/s390/mm/hugetlbpage.c
> @@ -98,22 +98,18 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
>   	if (!MACHINE_HAS_HPAGE) {
>   		pmd_val(pmd) &= ~_SEGMENT_ENTRY_ORIGIN;
>   		pmd_val(pmd) |= pte_page(pte)[1].index;
> +		pmd_val(pmd) |= _SEGMENT_ENTRY_SWLARGE;
>   	} else
> -		pmd_val(pmd) |= _SEGMENT_ENTRY_LARGE | _SEGMENT_ENTRY_CO;
> +		pmd_val(pmd) |= _SEGMENT_ENTRY_LARGE;
>   	*(pmd_t *) ptep = pmd;
>   }
>
>   pte_t huge_ptep_get(pte_t *ptep)
>   {
> -	unsigned long origin;
>   	pmd_t pmd;
>
>   	pmd = *(pmd_t *) ptep;
> -	if (!MACHINE_HAS_HPAGE && pmd_present(pmd)) {
> -		origin = pmd_val(pmd) & _SEGMENT_ENTRY_ORIGIN;
> -		pmd_val(pmd) &= ~_SEGMENT_ENTRY_ORIGIN;
> -		pmd_val(pmd) |= *(unsigned long *) origin;
> -	}
> +	pmd = pmd_swlarge_deref(pmd);
>   	return __pmd_to_pte(pmd);
>   }
>
> diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
> index bcfb70b..a4f6f2f 100644
> --- a/arch/s390/mm/vmem.c
> +++ b/arch/s390/mm/vmem.c
> @@ -235,8 +235,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
>   				if (!new_page)
>   					goto out;
>   				pmd_val(*pm_dir) = __pa(new_page) |
> -					_SEGMENT_ENTRY | _SEGMENT_ENTRY_LARGE |
> -					_SEGMENT_ENTRY_CO;
> +					_SEGMENT_ENTRY | _SEGMENT_ENTRY_LARGE;
>   				address = (address + PMD_SIZE) & PMD_MASK;
>   				continue;
>   			}
>

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ