linux-kernel - Re: [PATCH v18 06/32] mm/thp: narrow lru locking

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <514f6afa-dbf7-11c5-5431-1d558d2c20c9@linux.alibaba.com>
Date:   Fri, 11 Sep 2020 11:37:50 +0800
From:   Alex Shi <alex.shi@...ux.alibaba.com>
To:     Matthew Wilcox <willy@...radead.org>
Cc:     akpm@...ux-foundation.org, mgorman@...hsingularity.net,
        tj@...nel.org, hughd@...gle.com, khlebnikov@...dex-team.ru,
        daniel.m.jordan@...cle.com, hannes@...xchg.org, lkp@...el.com,
        linux-mm@...ck.org, linux-kernel@...r.kernel.org,
        cgroups@...r.kernel.org, shakeelb@...gle.com,
        iamjoonsoo.kim@....com, richard.weiyang@...il.com,
        kirill@...temov.name, alexander.duyck@...il.com,
        rong.a.chen@...el.com, mhocko@...e.com, vdavydov.dev@...il.com,
        shy828301@...il.com, Andrea Arcangeli <aarcange@...hat.com>
Subject: Re: [PATCH v18 06/32] mm/thp: narrow lru locking



在 2020/9/10 下午9:49, Matthew Wilcox 写道:
> On Mon, Aug 24, 2020 at 08:54:39PM +0800, Alex Shi wrote:
>> lru_lock and page cache xa_lock have no reason with current sequence,
>> put them together isn't necessary. let's narrow the lru locking, but
>> left the local_irq_disable to block interrupt re-entry and statistic update.
> 
> What stats are you talking about here?

Hi Matthew,

Thanks for comments!

like __dec_node_page_state(head, NR_SHMEM_THPS); will have preemptive warning...

> 
>> +++ b/mm/huge_memory.c
>> @@ -2397,7 +2397,7 @@ static void __split_huge_page_tail(struct page *head, int tail,
>>  }
>>  
>>  static void __split_huge_page(struct page *page, struct list_head *list,
>> -		pgoff_t end, unsigned long flags)
>> +			      pgoff_t end)
> 
> Please don't change this whitespace.  It's really annoying having to
> adjust the whitespace when renaming a function.  Just two tabs indentation
> to give a clear separation of arguments from code is fine.
> 
> 
> How about this patch instead?  It occurred to me we already have
> perfectly good infrastructure to track whether or not interrupts are
> already disabled, and so we should use that instead of ensuring that
> interrupts are disabled, or tracking that ourselves.

So your proposal looks like;
1, xa_lock_irq(&mapping->i_pages); (optional)
2, spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
3, spin_lock_irqsave(&pgdat->lru_lock, flags);

Is there meaningful for the 2nd and 3rd flags?

IIRC, I had a similar proposal as your, the flags used in xa_lock_irqsave(),
but objected by Hugh.

Thanks
Alex

> 
> But I may have missed something else that's relying on having
> interrupts disabled.  Please check carefully.
> 
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 2ccff8472cd4..74cae6c032f9 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -2376,17 +2376,16 @@ static void __split_huge_page_tail(struct page *head, int tail,
>  }
>  
>  static void __split_huge_page(struct page *page, struct list_head *list,
> -		pgoff_t end, unsigned long flags)
> +		pgoff_t end)
>  {
>  	struct page *head = compound_head(page);
>  	pg_data_t *pgdat = page_pgdat(head);
>  	struct lruvec *lruvec;
>  	struct address_space *swap_cache = NULL;
>  	unsigned long offset = 0;
> +	unsigned long flags;
>  	int i;
>  
> -	lruvec = mem_cgroup_page_lruvec(head, pgdat);
> -
>  	/* complete memcg works before add pages to LRU */
>  	mem_cgroup_split_huge_fixup(head);
>  
> @@ -2395,9 +2394,13 @@ static void __split_huge_page(struct page *page, struct list_head *list,
>  
>  		offset = swp_offset(entry);
>  		swap_cache = swap_address_space(entry);
> -		xa_lock(&swap_cache->i_pages);
> +		xa_lock_irq(&swap_cache->i_pages);
>  	}
>  
> +	/* prevent PageLRU to go away from under us, and freeze lru stats */
> +	spin_lock_irqsave(&pgdat->lru_lock, flags);
> +	lruvec = mem_cgroup_page_lruvec(head, pgdat);
> +
>  	for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
>  		__split_huge_page_tail(head, i, lruvec, list);
>  		/* Some pages can be beyond i_size: drop them from page cache */
> @@ -2417,6 +2420,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
>  	}
>  
>  	ClearPageCompound(head);
> +	spin_unlock_irqrestore(&pgdat->lru_lock, flags);
>  
>  	split_page_owner(head, HPAGE_PMD_ORDER);
>  
> @@ -2425,18 +2429,16 @@ static void __split_huge_page(struct page *page, struct list_head *list,
>  		/* Additional pin to swap cache */
>  		if (PageSwapCache(head)) {
>  			page_ref_add(head, 2);
> -			xa_unlock(&swap_cache->i_pages);
> +			xa_unlock_irq(&swap_cache->i_pages);
>  		} else {
>  			page_ref_inc(head);
>  		}
>  	} else {
>  		/* Additional pin to page cache */
>  		page_ref_add(head, 2);
> -		xa_unlock(&head->mapping->i_pages);
> +		xa_unlock_irq(&head->mapping->i_pages);
>  	}
>  
> -	spin_unlock_irqrestore(&pgdat->lru_lock, flags);
> -
>  	remap_page(head);
>  
>  	for (i = 0; i < HPAGE_PMD_NR; i++) {
> @@ -2574,7 +2576,6 @@ bool can_split_huge_page(struct page *page, int *pextra_pins)
>  int split_huge_page_to_list(struct page *page, struct list_head *list)
>  {
>  	struct page *head = compound_head(page);
> -	struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
>  	struct deferred_split *ds_queue = get_deferred_split_queue(head);
>  	struct anon_vma *anon_vma = NULL;
>  	struct address_space *mapping = NULL;
> @@ -2640,9 +2641,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
>  	unmap_page(head);
>  	VM_BUG_ON_PAGE(compound_mapcount(head), head);
>  
> -	/* prevent PageLRU to go away from under us, and freeze lru stats */
> -	spin_lock_irqsave(&pgdata->lru_lock, flags);
> -
>  	if (mapping) {
>  		XA_STATE(xas, &mapping->i_pages, page_index(head));
>  
> @@ -2650,13 +2648,13 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
>  		 * Check if the head page is present in page cache.
>  		 * We assume all tail are present too, if head is there.
>  		 */
> -		xa_lock(&mapping->i_pages);
> +		xa_lock_irq(&mapping->i_pages);
>  		if (xas_load(&xas) != head)
>  			goto fail;
>  	}
>  
>  	/* Prevent deferred_split_scan() touching ->_refcount */
> -	spin_lock(&ds_queue->split_queue_lock);
> +	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
>  	count = page_count(head);
>  	mapcount = total_mapcount(head);
>  	if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) {
> @@ -2664,7 +2662,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
>  			ds_queue->split_queue_len--;
>  			list_del(page_deferred_list(head));
>  		}
> -		spin_unlock(&ds_queue->split_queue_lock);
> +		spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
>  		if (mapping) {
>  			if (PageSwapBacked(head))
>  				__dec_node_page_state(head, NR_SHMEM_THPS);
> @@ -2672,7 +2670,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
>  				__dec_node_page_state(head, NR_FILE_THPS);
>  		}
>  
> -		__split_huge_page(page, list, end, flags);
> +		__split_huge_page(page, list, end);
>  		if (PageSwapCache(head)) {
>  			swp_entry_t entry = { .val = page_private(head) };
>  
> @@ -2688,10 +2686,9 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
>  			dump_page(page, "total_mapcount(head) > 0");
>  			BUG();
>  		}
> -		spin_unlock(&ds_queue->split_queue_lock);
> +		spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
>  fail:		if (mapping)
> -			xa_unlock(&mapping->i_pages);
> -		spin_unlock_irqrestore(&pgdata->lru_lock, flags);
> +			xa_unlock_irq(&mapping->i_pages);
>  		remap_page(head);
>  		ret = -EBUSY;
>  	}
>