linux-kernel - Re: [PATCH v4 1/3] mm/khugepaged: refactor collapse

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <Y/U9fBxVJdhxiZ1v@x1n>
Date:   Tue, 21 Feb 2023 16:54:04 -0500
From:   Peter Xu <peterx@...hat.com>
To:     David Stevens <stevensd@...omium.org>
Cc:     linux-mm@...ck.org, Matthew Wilcox <willy@...radead.org>,
        Andrew Morton <akpm@...ux-foundation.org>,
        "Kirill A . Shutemov" <kirill@...temov.name>,
        Yang Shi <shy828301@...il.com>,
        David Hildenbrand <david@...hat.com>,
        Hugh Dickins <hughd@...gle.com>, linux-kernel@...r.kernel.org
Subject: Re: [PATCH v4 1/3] mm/khugepaged: refactor collapse_file control flow

On Fri, Feb 17, 2023 at 05:54:37PM +0900, David Stevens wrote:
> From: David Stevens <stevensd@...omium.org>
> 
> Add a rollback label to deal with failure, instead of continuously
> checking for RESULT_SUCCESS, to make it easier to add more failure
> cases. The refactoring also allows the collapse_file tracepoint to
> include hpage on success (instead of NULL).
> 
> Signed-off-by: David Stevens <stevensd@...omium.org>
> ---
>  mm/khugepaged.c | 223 ++++++++++++++++++++++++------------------------
>  1 file changed, 110 insertions(+), 113 deletions(-)
> 
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 8dbc39896811..6a3d6d2e25e0 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1885,6 +1885,12 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
>  	if (result != SCAN_SUCCEED)
>  		goto out;
>  
> +	__SetPageLocked(hpage);
> +	if (is_shmem)
> +		__SetPageSwapBacked(hpage);
> +	hpage->index = start;
> +	hpage->mapping = mapping;
> +
>  	/*
>  	 * Ensure we have slots for all the pages in the range.  This is
>  	 * almost certainly a no-op because most of the pages must be present
> @@ -1897,16 +1903,10 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
>  		xas_unlock_irq(&xas);
>  		if (!xas_nomem(&xas, GFP_KERNEL)) {
>  			result = SCAN_FAIL;
> -			goto out;
> +			goto rollback;
>  		}
>  	} while (1);
>  
> -	__SetPageLocked(hpage);
> -	if (is_shmem)
> -		__SetPageSwapBacked(hpage);
> -	hpage->index = start;
> -	hpage->mapping = mapping;
> -
>  	/*
>  	 * At this point the hpage is locked and not up-to-date.
>  	 * It's safe to insert it into the page cache, because nobody would
> @@ -2123,131 +2123,128 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
>  	 */
>  	try_to_unmap_flush();
>  
> -	if (result == SCAN_SUCCEED) {
> -		/*
> -		 * Replacing old pages with new one has succeeded, now we
> -		 * attempt to copy the contents.
> -		 */
> -		index = start;
> -		list_for_each_entry(page, &pagelist, lru) {
> -			while (index < page->index) {
> -				clear_highpage(hpage + (index % HPAGE_PMD_NR));
> -				index++;
> -			}
> -			if (copy_mc_page(hpage + (page->index % HPAGE_PMD_NR),
> -					 page) > 0) {
> -				result = SCAN_COPY_MC;
> -				break;
> -			}
> -			index++;
> -		}
> -		while (result == SCAN_SUCCEED && index < end) {
> +	if (result != SCAN_SUCCEED)
> +		goto rollback;
> +
> +	/*
> +	 * Replacing old pages with new one has succeeded, now we
> +	 * attempt to copy the contents.
> +	 */
> +	index = start;
> +	list_for_each_entry(page, &pagelist, lru) {
> +		while (index < page->index) {
>  			clear_highpage(hpage + (index % HPAGE_PMD_NR));
>  			index++;
>  		}
> +		if (copy_mc_page(hpage + (page->index % HPAGE_PMD_NR),
> +				 page) > 0) {
> +			result = SCAN_COPY_MC;
> +			goto rollback;
> +		}
> +		index++;
> +	}
> +	while (index < end) {
> +		clear_highpage(hpage + (index % HPAGE_PMD_NR));
> +		index++;
>  	}
>  
> -	if (result == SCAN_SUCCEED) {
> -		/*
> -		 * Copying old pages to huge one has succeeded, now we
> -		 * need to free the old pages.
> -		 */
> -		list_for_each_entry_safe(page, tmp, &pagelist, lru) {
> -			list_del(&page->lru);
> -			page->mapping = NULL;
> -			page_ref_unfreeze(page, 1);
> -			ClearPageActive(page);
> -			ClearPageUnevictable(page);
> -			unlock_page(page);
> -			put_page(page);
> -		}
> +	/*
> +	 * Copying old pages to huge one has succeeded, now we
> +	 * need to free the old pages.
> +	 */
> +	list_for_each_entry_safe(page, tmp, &pagelist, lru) {
> +		list_del(&page->lru);
> +		page->mapping = NULL;
> +		page_ref_unfreeze(page, 1);
> +		ClearPageActive(page);
> +		ClearPageUnevictable(page);
> +		unlock_page(page);
> +		put_page(page);
> +	}
>  
> -		xas_lock_irq(&xas);
> -		if (is_shmem)
> -			__mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr);
> -		else
> -			__mod_lruvec_page_state(hpage, NR_FILE_THPS, nr);
> +	xas_lock_irq(&xas);
> +	if (is_shmem)
> +		__mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr);
> +	else
> +		__mod_lruvec_page_state(hpage, NR_FILE_THPS, nr);
> +
> +	if (nr_none) {
> +		__mod_lruvec_page_state(hpage, NR_FILE_PAGES, nr_none);
> +		/* nr_none is always 0 for non-shmem. */
> +		__mod_lruvec_page_state(hpage, NR_SHMEM, nr_none);
> +	}
> +	/* Join all the small entries into a single multi-index entry. */
> +	xas_set_order(&xas, start, HPAGE_PMD_ORDER);
> +	xas_store(&xas, hpage);
> +	xas_unlock_irq(&xas);
>  
> -		if (nr_none) {
> -			__mod_lruvec_page_state(hpage, NR_FILE_PAGES, nr_none);
> -			/* nr_none is always 0 for non-shmem. */
> -			__mod_lruvec_page_state(hpage, NR_SHMEM, nr_none);
> -		}
> -		/* Join all the small entries into a single multi-index entry. */
> -		xas_set_order(&xas, start, HPAGE_PMD_ORDER);
> -		xas_store(&xas, hpage);
> -		xas_unlock_irq(&xas);
> +	folio = page_folio(hpage);
> +	folio_mark_uptodate(folio);
> +	folio_ref_add(folio, HPAGE_PMD_NR - 1);
>  
> -		folio = page_folio(hpage);
> -		folio_mark_uptodate(folio);
> -		folio_ref_add(folio, HPAGE_PMD_NR - 1);
> +	if (is_shmem)
> +		folio_mark_dirty(folio);
> +	folio_add_lru(folio);
>  
> -		if (is_shmem)
> -			folio_mark_dirty(folio);
> -		folio_add_lru(folio);
> +	/*
> +	 * Remove pte page tables, so we can re-fault the page as huge.
> +	 */
> +	result = retract_page_tables(mapping, start, mm, addr, hpage,
> +				     cc);
> +	unlock_page(hpage);
> +	goto out;
> +
> +rollback:
> +	/* Something went wrong: roll back page cache changes */
> +	xas_lock_irq(&xas);
> +	if (nr_none) {
> +		mapping->nrpages -= nr_none;
> +		shmem_uncharge(mapping->host, nr_none);
> +	}
>  
> -		/*
> -		 * Remove pte page tables, so we can re-fault the page as huge.
> -		 */
> -		result = retract_page_tables(mapping, start, mm, addr, hpage,
> -					     cc);
> -		unlock_page(hpage);
> -		hpage = NULL;
> -	} else {
> -		/* Something went wrong: roll back page cache changes */
> -		xas_lock_irq(&xas);
> -		if (nr_none) {
> -			mapping->nrpages -= nr_none;
> -			shmem_uncharge(mapping->host, nr_none);
> +	xas_set(&xas, start);
> +	xas_for_each(&xas, page, end - 1) {
> +		page = list_first_entry_or_null(&pagelist,
> +				struct page, lru);
> +		if (!page || xas.xa_index < page->index) {
> +			if (!nr_none)
> +				break;
> +			nr_none--;
> +			/* Put holes back where they were */
> +			xas_store(&xas, NULL);
> +			continue;
>  		}
>  
> -		xas_set(&xas, start);
> -		xas_for_each(&xas, page, end - 1) {
> -			page = list_first_entry_or_null(&pagelist,
> -					struct page, lru);
> -			if (!page || xas.xa_index < page->index) {
> -				if (!nr_none)
> -					break;
> -				nr_none--;
> -				/* Put holes back where they were */
> -				xas_store(&xas, NULL);
> -				continue;
> -			}
> +		VM_BUG_ON_PAGE(page->index != xas.xa_index, page);
>  
> -			VM_BUG_ON_PAGE(page->index != xas.xa_index, page);
> +		/* Unfreeze the page. */
> +		list_del(&page->lru);
> +		page_ref_unfreeze(page, 2);
> +		xas_store(&xas, page);
> +		xas_pause(&xas);
> +		xas_unlock_irq(&xas);
> +		unlock_page(page);
> +		putback_lru_page(page);
> +		xas_lock_irq(&xas);
> +	}
> +	VM_BUG_ON(nr_none);
> +	/*
> +	 * Undo the updates of filemap_nr_thps_inc for non-SHMEM file only.
> +	 * This undo is not needed unless failure is due to SCAN_COPY_MC.
> +	 */
> +	if (!is_shmem && result == SCAN_COPY_MC)
> +		filemap_nr_thps_dec(mapping);
>  
> -			/* Unfreeze the page. */
> -			list_del(&page->lru);
> -			page_ref_unfreeze(page, 2);
> -			xas_store(&xas, page);
> -			xas_pause(&xas);
> -			xas_unlock_irq(&xas);
> -			unlock_page(page);
> -			putback_lru_page(page);
> -			xas_lock_irq(&xas);
> -		}
> -		VM_BUG_ON(nr_none);
> -		/*
> -		 * Undo the updates of filemap_nr_thps_inc for non-SHMEM file only.
> -		 * This undo is not needed unless failure is due to SCAN_COPY_MC.
> -		 */
> -		if (!is_shmem && result == SCAN_COPY_MC)
> -			filemap_nr_thps_dec(mapping);
> +	xas_unlock_irq(&xas);
>  
> -		xas_unlock_irq(&xas);
> +	hpage->mapping = NULL;
>  
> -		hpage->mapping = NULL;
> -	}
> +	unlock_page(hpage);
> +	mem_cgroup_uncharge(page_folio(hpage));
> +	put_page(hpage);
>  
> -	if (hpage)
> -		unlock_page(hpage);
>  out:
>  	VM_BUG_ON(!list_empty(&pagelist));
> -	if (hpage) {
> -		mem_cgroup_uncharge(page_folio(hpage));
> -		put_page(hpage);
> -	}

Moving this chunk seems wrong, as it can leak the huge page if
alloc_charge_hpage() failed on mem charging, iiuc.

And I found that keeping it seems wrong either, because if mem charge
failed we'll uncharge even without charging it before.  But that's nothing
about this patch alone.

Posted a patch for this:

https://lore.kernel.org/all/20230221214344.609226-1-peterx@redhat.com/

I _think_ this patch will make sense after rebasing to that fix if that's
correct, but please review it and double check.

Thanks,

> -
>  	trace_mm_khugepaged_collapse_file(mm, hpage, index, is_shmem, addr, file, nr, result);
>  	return result;
>  }
> -- 
> 2.39.2.637.g21b0678d19-goog
> 

-- 
Peter Xu