Message-ID: <5f41af0f-4593-3441-12f4-5b0f7e6999ac@redhat.com>
Date:   Mon, 3 Aug 2020 09:57:10 +0200
From:   David Hildenbrand <david@...hat.com>
To:     pullip.cho@...sung.com, akpm@...ux-foundation.org
Cc:     linux-mm@...ck.org, linux-kernel@...r.kernel.org,
        hyesoo.yu@...sung.com, janghyuck.kim@...sung.com
Subject: Re: [PATCH] mm: sort freelist by rank number

On 03.08.20 08:10, pullip.cho@...sung.com wrote:
> From: Cho KyongHo <pullip.cho@...sung.com>
> 
> LPDDR5 introduces a rank switch delay. If three successive DRAM
> accesses happen and the first two access one rank while the last one
> accesses the other rank, the latency of the last access will be longer
> than that of the second one.
> To address this penalty, we can sort the freelist so that one specific
> rank is allocated before the other rank. With this change we expect the
> page allocator to allocate pages from the same rank successively. It
> hopefully improves the proportion of consecutive memory accesses that
> hit the same rank.
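
For concreteness, a rough self-contained user-space sketch of the rank
classification this amounts to (it mirrors __preferred_rank() introduced
further down; the 256 MiB interleave granule and the helper name are made
up for illustration). Pages whose PFN bit at the interleave granule is
clear count as the "preferred" rank and are kept at the head of the
freelists; all other pages are pushed to the tail:

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SHIFT	12
/* hypothetical rank interleave granule: 256 MiB, i.e. lowest CS bit = 28 */
#define RANK_NR_PAGES	((256UL << 20) >> PAGE_SHIFT)

/* same test as the patch's __preferred_rank(): one PFN bit selects the rank */
static bool preferred_rank_pfn(unsigned long pfn)
{
	return !(pfn & RANK_NR_PAGES);
}

int main(void)
{
	unsigned long pfn;

	for (pfn = 0; pfn < 4 * RANK_NR_PAGES; pfn += RANK_NR_PAGES)
		printf("pfn %#lx -> %s rank\n", pfn,
		       preferred_rank_pfn(pfn) ? "preferred" : "other");
	return 0;
}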

This certainly needs performance numbers to justify it ... and I am sorry,
"hopefully improves" is not a valid justification :)

I can imagine that this works well initially, when there hasn't been a
lot of memory fragmentation going on. But once your system has been
under stress for a while, I doubt this will be very useful. Prove me wrong. ;)

... I dislike this manual setting of "dram_rank_granule". Yet another mm
feature that can only be enabled by a magic command line parameter for
which users have to guess the right value.
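
(Just to spell out what a user would have to figure out, with made-up
numbers: if the lowest CS bit were bit 28, the rank interleave granule
would be 2^28 bytes = 256 MiB, so the system would have to be booted with
dram_rank_granule=256M; the early_param handler runs that through
memparse() and divides by PAGE_SIZE, which for 4 KiB pages gives 65536
pages, and anything smaller than pageblock_nr_pages is rejected.)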

(Side note: there have been similar research approaches that improve
energy consumption by switching off ranks when not needed.)

> 
> Signed-off-by: Cho KyongHo <pullip.cho@...sung.com>
> ---
>  mm/Kconfig      |  23 +++++++++++
>  mm/compaction.c |   6 +--
>  mm/internal.h   |  11 ++++++
>  mm/page_alloc.c | 119 +++++++++++++++++++++++++++++++++++++++++++++++++-------
>  mm/shuffle.h    |   6 ++-
>  5 files changed, 144 insertions(+), 21 deletions(-)
> 
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 6c97488..789c02b 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -868,4 +868,27 @@ config ARCH_HAS_HUGEPD
>  config MAPPING_DIRTY_HELPERS
>          bool
>  
> +config RANK_SORTED_FREELIST
> +	bool "Prefer allocating free pages from one half of the DRAM ranks"
> +
> +	help
> +	  The rank switch delay in DRAM accesses becomes larger in LPDDR5 than
> +	  before. If two successive memory accesses go to different ranks in
> +	  LPDDR5, the latency of the second access becomes longer due to the
> +	  rank switch overhead. This is a power-performance tradeoff of LPDDR5.
> +	  By default, sorting the freelist by rank number is disabled even if
> +	  RANK_SORTED_FREELIST is set to y. To enable it, set the
> +	  "dram_rank_granule" boot argument to a value larger than or equal to
> +	  pageblock_nr_pages. The value should be exactly the rank interleaving
> +	  granule that your system is using. The rank interleaving granule is
> +	  2^(the lowest CS bit number). CS stands for Chip Select and is also
> +	  called SS, which stands for Slave Select.
> +	  This is not beneficial to single-rank memory systems. It is also not
> +	  necessary for quad-rank and octal-rank memory systems because they
> +	  are not in the LPDDR5 specification.
> +
> +	  This is marked experimental because it disables freelist shuffling
> +	  (SHUFFLE_PAGE_ALLOCATOR). You must also set the correct rank
> +	  interleaving granule.
> +
>  endmenu
> diff --git a/mm/compaction.c b/mm/compaction.c
> index 061dacf..bee9567 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -1218,8 +1218,7 @@ move_freelist_head(struct list_head *freelist, struct page *freepage)
>  
>  	if (!list_is_last(freelist, &freepage->lru)) {
>  		list_cut_before(&sublist, freelist, &freepage->lru);
> -		if (!list_empty(&sublist))
> -			list_splice_tail(&sublist, freelist);
> +		freelist_splice_tail(&sublist, freelist);
>  	}
>  }
>  
> @@ -1236,8 +1235,7 @@ move_freelist_tail(struct list_head *freelist, struct page *freepage)
>  
>  	if (!list_is_first(freelist, &freepage->lru)) {
>  		list_cut_position(&sublist, freelist, &freepage->lru);
> -		if (!list_empty(&sublist))
> -			list_splice_tail(&sublist, freelist);
> +		freelist_splice_tail(&sublist, freelist);
>  	}
>  }
>  
> diff --git a/mm/internal.h b/mm/internal.h
> index 10c6776..c945b3d 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -266,6 +266,17 @@ int find_suitable_fallback(struct free_area *area, unsigned int order,
>  
>  #endif
>  
> +#ifdef CONFIG_RANK_SORTED_FREELIST
> +void freelist_splice_tail(struct list_head *sublist, struct list_head *freelist);
> +#else
> +#include <linux/list.h>
> +static inline
> +void freelist_splice_tail(struct list_head *sublist, struct list_head *freelist)
> +{
> +	if (!list_empty(sublist))
> +		list_splice_tail(sublist, freelist);
> +}
> +#endif
>  /*
>   * This function returns the order of a free page in the buddy system. In
>   * general, page_zone(page)->lock must be held by the caller to prevent the
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 2824e116..7823a3b 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -854,6 +854,69 @@ compaction_capture(struct capture_control *capc, struct page *page,
>  }
>  #endif /* CONFIG_COMPACTION */
>  
> +#ifdef CONFIG_RANK_SORTED_FREELIST
> +static unsigned long dram_rank_nr_pages __read_mostly;
> +
> +static inline bool preferred_rank_enabled(void)
> +{
> +	return dram_rank_nr_pages >= pageblock_nr_pages;
> +}
> +
> +static int __init dram_rank_granule(char *buf)
> +{
> +	unsigned long val = (unsigned long)(memparse(buf, NULL) / PAGE_SIZE);
> +
> +	if (val < pageblock_nr_pages) {
> +		pr_err("too small rank granule %lu\n", val);
> +		return -EINVAL;
> +	}
> +
> +	dram_rank_nr_pages = val;
> +
> +	return 0;
> +}
> +
> +early_param("dram_rank_granule", dram_rank_granule);
> +
> +static inline bool __preferred_rank(struct page *page)
> +{
> +	return !(page_to_pfn(page) & dram_rank_nr_pages);
> +}
> +
> +static inline bool preferred_rank(struct page *page)
> +{
> +	return !preferred_rank_enabled() || __preferred_rank(page);
> +}
> +
> +void freelist_splice_tail(struct list_head *sublist, struct list_head *freelist)
> +{
> +	while (!list_empty(sublist)) {
> +		struct page *page;
> +
> +		page = list_first_entry(sublist, struct page, lru);
> +		if (!preferred_rank_enabled() || !__preferred_rank(page))
> +			list_move_tail(&page->lru, freelist);
> +		else
> +			list_move(&page->lru, freelist);
> +	}
> +}
> +#else
> +static inline bool __preferred_rank(struct page *page)
> +{
> +	return true;
> +}
> +
> +static inline bool preferred_rank(struct page *page)
> +{
> +	return true;
> +}
> +
> +static inline bool preferred_rank_enabled(void)
> +{
> +	return false;
> +}
> +#endif
> +
>  /* Used for pages not on another list */
>  static inline void add_to_free_list(struct page *page, struct zone *zone,
>  				    unsigned int order, int migratetype)
> @@ -880,7 +943,10 @@ static inline void move_to_free_list(struct page *page, struct zone *zone,
>  {
>  	struct free_area *area = &zone->free_area[order];
>  
> -	list_move(&page->lru, &area->free_list[migratetype]);
> +	if (preferred_rank(page))
> +		list_move(&page->lru, &area->free_list[migratetype]);
> +	else
> +		list_move_tail(&page->lru, &area->free_list[migratetype]);
>  }
>  
>  static inline void del_page_from_free_list(struct page *page, struct zone *zone,
> @@ -1029,7 +1095,9 @@ static inline void __free_one_page(struct page *page,
>  done_merging:
>  	set_page_order(page, order);
>  
> -	if (is_shuffle_order(order))
> +	if (preferred_rank_enabled())
> +		to_tail = !__preferred_rank(page);
> +	else if (is_shuffle_order(order))
>  		to_tail = shuffle_pick_tail();
>  	else
>  		to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order);
> @@ -2257,20 +2325,29 @@ static __always_inline
>  struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
>  						int migratetype)
>  {
> +	int retry = preferred_rank_enabled() ? 2 : 1;
>  	unsigned int current_order;
>  	struct free_area *area;
>  	struct page *page;
>  
> -	/* Find a page of the appropriate size in the preferred list */
> -	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
> -		area = &(zone->free_area[current_order]);
> -		page = get_page_from_free_area(area, migratetype);
> -		if (!page)
> -			continue;
> -		del_page_from_free_list(page, zone, current_order);
> -		expand(zone, page, order, current_order, migratetype);
> -		set_pcppage_migratetype(page, migratetype);
> -		return page;
> +	while (retry-- > 0) {
> +		/* Find a page of the appropriate size in the preferred list */
> +		for (current_order = order; current_order < MAX_ORDER; ++current_order) {
> +			area = &zone->free_area[current_order];
> +			page = get_page_from_free_area(area, migratetype);
> +			if (!page)
> +				continue;
> +			/*
> +			 * On the first try, keep searching upward for a page in
> +			 * the preferred rank even if a free page was found.
> +			 */
> +			if (retry > 0 && !preferred_rank(page))
> +				continue;
> +			del_page_from_free_list(page, zone, current_order);
> +			expand(zone, page, order, current_order, migratetype);
> +			set_pcppage_migratetype(page, migratetype);
> +			return page;
> +		}
>  	}
>  
>  	return NULL;
> @@ -2851,8 +2928,14 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
>  		 * head, thus also in the physical page order. This is useful
>  		 * for IO devices that can merge IO requests if the physical
>  		 * pages are ordered properly.
> +		 * However, if preferred_rank_enabled() is true, we always sort
> +		 * the freelists in the buddy and the pcp in the order of rank
> +		 * number for performance reasons.
>  		 */
> -		list_add_tail(&page->lru, list);
> +		if (preferred_rank_enabled() && __preferred_rank(page))
> +			list_add(&page->lru, list);
> +		else
> +			list_add_tail(&page->lru, list);
>  		alloced++;
>  		if (is_migrate_cma(get_pcppage_migratetype(page)))
>  			__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
> @@ -3136,7 +3219,10 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn)
>  	}
>  
>  	pcp = &this_cpu_ptr(zone->pageset)->pcp;
> -	list_add(&page->lru, &pcp->lists[migratetype]);
> +	if (preferred_rank(page))
> +		list_add(&page->lru, &pcp->lists[migratetype]);
> +	else
> +		list_add_tail(&page->lru, &pcp->lists[migratetype]);
>  	pcp->count++;
>  	if (pcp->count >= pcp->high) {
>  		unsigned long batch = READ_ONCE(pcp->batch);
> @@ -8813,7 +8899,10 @@ static void break_down_buddy_pages(struct zone *zone, struct page *page,
>  			continue;
>  
>  		if (current_buddy != target) {
> -			add_to_free_list(current_buddy, zone, high, migratetype);
> +			if (preferred_rank(current_buddy))
> +				add_to_free_list(current_buddy, zone, high, migratetype);
> +			else
> +				add_to_free_list_tail(current_buddy, zone, high, migratetype);
>  			set_page_order(current_buddy, high);
>  			page = next_page;
>  		}
> diff --git a/mm/shuffle.h b/mm/shuffle.h
> index 71b784f..59cbfde 100644
> --- a/mm/shuffle.h
> +++ b/mm/shuffle.h
> @@ -12,7 +12,8 @@ extern void __shuffle_free_memory(pg_data_t *pgdat);
>  extern bool shuffle_pick_tail(void);
>  static inline void shuffle_free_memory(pg_data_t *pgdat)
>  {
> -	if (!static_branch_unlikely(&page_alloc_shuffle_key))
> +	if (!static_branch_unlikely(&page_alloc_shuffle_key) ||
> +	    preferred_rank_enabled())
>  		return;
>  	__shuffle_free_memory(pgdat);
>  }
> @@ -20,7 +21,8 @@ static inline void shuffle_free_memory(pg_data_t *pgdat)
>  extern void __shuffle_zone(struct zone *z);
>  static inline void shuffle_zone(struct zone *z)
>  {
> -	if (!static_branch_unlikely(&page_alloc_shuffle_key))
> +	if (!static_branch_unlikely(&page_alloc_shuffle_key) ||
> +	    preferred_rank_enabled())
>  		return;
>  	__shuffle_zone(z);
>  }
> 
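
To make the two-pass search added to __rmqueue_smallest() above more
tangible, here is a heavily simplified standalone model (single
migratetype, at most one fake free page per order, no splitting; all
names and contents are made up). The first pass only takes a page from
the preferred rank, even when a lower-order page from the other rank is
available; the second pass takes whatever is left:

#include <stdio.h>

#define MAX_ORDER	4

/* rank of the single fake free page per order; -1 means the list is empty */
static int rank_of[MAX_ORDER] = { -1, 1, -1, 0 };

static int rmqueue_smallest(unsigned int order)
{
	int retry = 2;	/* as if preferred_rank_enabled() were true */

	while (retry-- > 0) {
		for (unsigned int o = order; o < MAX_ORDER; o++) {
			if (rank_of[o] < 0)
				continue;
			/* first pass: skip pages that are not in the preferred rank (0) */
			if (retry > 0 && rank_of[o] != 0)
				continue;
			printf("pass %d: took order-%u page from rank %d\n",
			       2 - retry, o, rank_of[o]);
			rank_of[o] = -1;
			return o;
		}
	}
	return -1;
}

int main(void)
{
	rmqueue_smallest(0);	/* pass 1 takes the order-3 page from rank 0 */
	rmqueue_smallest(0);	/* only rank 1 is left; pass 2 takes the order-1 page */
	return 0;
}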


-- 
Thanks,

David / dhildenb
