lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20190417084501.GE655@dhcp22.suse.cz>
Date:   Wed, 17 Apr 2019 10:45:01 +0200
From:   Michal Hocko <mhocko@...nel.org>
To:     Zhaoyang Huang <huangzhaoyang@...il.com>
Cc:     Andrew Morton <akpm@...ux-foundation.org>,
        Vlastimil Babka <vbabka@...e.cz>,
        Pavel Tatashin <pasha.tatashin@...cle.com>,
        Joonsoo Kim <iamjoonsoo.kim@....com>,
        David Rientjes <rientjes@...gle.com>,
        Zhaoyang Huang <zhaoyang.huang@...soc.com>,
        Roman Gushchin <guro@...com>, Jeff Layton <jlayton@...hat.com>,
        Matthew Wilcox <mawilcox@...rosoft.com>, linux-mm@...ck.org,
        linux-kernel@...r.kernel.org
Subject: Re: [RFC PATCH] mm/workingset : judge file page activity via
 timestamp

Hi,
I do not see http://lkml.kernel.org/r/1554348617-12897-1-git-send-email-huangzhaoyang@gmail.com
discussion reaching a conclusion to change the current workingset
implementation. Therefore, is there any reason to post a new version of
the patch? If yes, it would be really great to see a short summary about
how this version is different from the previous one and how all the
review feedback has been addressed.

On Wed 17-04-19 15:47:26, Zhaoyang Huang wrote:
> From: Zhaoyang Huang <zhaoyang.huang@...soc.com>
> 
> This patch introduce timestamp into workingset's entry and judge if the page
> is active or inactive via active_file/refault_ratio instead of refault distance.
> 
> The original thought is coming from the logs we got from trace_printk in this
> patch, we can find about 1/5 of the file pages' refault are under the
> scenario[1],which will be counted as inactive as they have a long refault distance
> in between access. However, we can also know from the time information that the
> page refault quickly as comparing to the average refault time which is calculated
> by the number of active file and refault ratio. We want to save these kinds of
> pages from evicted earlier as it used to be. The refault ratio is the value
> which can reflect lru's average file access frequency and also can be deemed as a
> prediction of future.
> 
> The patch is tested on an android system and reduce 30% of page faults, while
> 60% of the pages remain the original status as (refault_distance < active_file)
> indicates. Pages status got from ftrace during the test can refer to [2].
> 
> [1]
> system_server workingset_refault: WKST_ACT[0]:rft_dis 265976, act_file 34268 rft_ratio 3047 rft_time 0 avg_rft_time 11 refault 295592 eviction 29616 secs 97 pre_secs 97
> HwBinder:922  workingset_refault: WKST_ACT[0]:rft_dis 264478, act_file 35037 rft_ratio 3070 rft_time 2 avg_rft_time 11 refault 310078 eviction 45600 secs 101 pre_secs 99
> 
> [2]
> WKST_ACT[0]:   original--INACTIVE  commit--ACTIVE
> WKST_ACT[1]:   original--ACTIVE    commit--ACTIVE
> WKST_INACT[0]: original--INACTIVE  commit--INACTIVE
> WKST_INACT[1]: original--ACTIVE    commit--INACTIVE
> 
> Signed-off-by: Zhaoyang Huang <huangzhaoyang@...il.com>
> ---
>  include/linux/mmzone.h |   1 +
>  mm/workingset.c        | 120 +++++++++++++++++++++++++++++++++++++++++++++----
>  2 files changed, 112 insertions(+), 9 deletions(-)
> 
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 32699b2..6f30673 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -240,6 +240,7 @@ struct lruvec {
>  	atomic_long_t			inactive_age;
>  	/* Refaults at the time of last reclaim cycle */
>  	unsigned long			refaults;
> +	atomic_long_t			refaults_ratio;
>  #ifdef CONFIG_MEMCG
>  	struct pglist_data *pgdat;
>  #endif
> diff --git a/mm/workingset.c b/mm/workingset.c
> index 40ee02c..66c177b 100644
> --- a/mm/workingset.c
> +++ b/mm/workingset.c
> @@ -160,6 +160,21 @@
>  			 MEM_CGROUP_ID_SHIFT)
>  #define EVICTION_MASK	(~0UL >> EVICTION_SHIFT)
>  
> +#ifdef CONFIG_64BIT
> +#define EVICTION_SECS_POS_SHIFT 20
> +#define EVICTION_SECS_SHRINK_SHIFT 4
> +#define EVICTION_SECS_POS_MASK  ((1UL << EVICTION_SECS_POS_SHIFT) - 1)
> +#else
> +#ifndef CONFIG_MEMCG
> +#define EVICTION_SECS_POS_SHIFT 12
> +#define EVICTION_SECS_SHRINK_SHIFT 4
> +#define EVICTION_SECS_POS_MASK  ((1UL << EVICTION_SECS_POS_SHIFT) - 1)
> +#else
> +#define EVICTION_SECS_POS_SHIFT 0
> +#define EVICTION_SECS_SHRINK_SHIFT 0
> +#define NO_SECS_IN_WORKINGSET
> +#endif
> +#endif
>  /*
>   * Eviction timestamps need to be able to cover the full range of
>   * actionable refaults. However, bits are tight in the radix tree
> @@ -169,10 +184,54 @@
>   * evictions into coarser buckets by shaving off lower timestamp bits.
>   */
>  static unsigned int bucket_order __read_mostly;
> -
> +#ifdef NO_SECS_IN_WORKINGSET
> +static void pack_secs(unsigned long *peviction) { }
> +static unsigned int unpack_secs(unsigned long entry) {return 0; }
> +#else
> +/*
> + * Shrink the timestamp according to its value and store it together
> + * with the shrink size in the entry.
> + */
> +static void pack_secs(unsigned long *peviction)
> +{
> +	unsigned int secs;
> +	unsigned long eviction;
> +	int order;
> +	int secs_shrink_size;
> +	struct timespec ts;
> +
> +	get_monotonic_boottime(&ts);
> +	secs = (unsigned int)ts.tv_sec ? (unsigned int)ts.tv_sec : 1;
> +	order = get_count_order(secs);
> +	secs_shrink_size = (order <= EVICTION_SECS_POS_SHIFT)
> +			? 0 : (order - EVICTION_SECS_POS_SHIFT);
> +
> +	eviction = *peviction;
> +	eviction = (eviction << EVICTION_SECS_POS_SHIFT)
> +			| ((secs >> secs_shrink_size) & EVICTION_SECS_POS_MASK);
> +	eviction = (eviction << EVICTION_SECS_SHRINK_SHIFT) | (secs_shrink_size & 0xf);
> +	*peviction = eviction;
> +}
> +/*
> + * Unpack the second from the entry and restore the value according to the
> + * shrink size.
> + */
> +static unsigned int unpack_secs(unsigned long entry)
> +{
> +	unsigned int secs;
> +	int secs_shrink_size;
> +
> +	secs_shrink_size = entry & ((1 << EVICTION_SECS_SHRINK_SHIFT) - 1);
> +	entry >>= EVICTION_SECS_SHRINK_SHIFT;
> +	secs = entry & EVICTION_SECS_POS_MASK;
> +	secs = secs << secs_shrink_size;
> +	return secs;
> +}
> +#endif
>  static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction)
>  {
>  	eviction >>= bucket_order;
> +	pack_secs(&eviction);
>  	eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
>  	eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
>  	eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);
> @@ -181,20 +240,24 @@ static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction)
>  }
>  
>  static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
> -			  unsigned long *evictionp)
> +			  unsigned long *evictionp, unsigned int *prev_secs)
>  {
>  	unsigned long entry = (unsigned long)shadow;
>  	int memcgid, nid;
> +	unsigned int secs;
>  
>  	entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT;
>  	nid = entry & ((1UL << NODES_SHIFT) - 1);
>  	entry >>= NODES_SHIFT;
>  	memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
>  	entry >>= MEM_CGROUP_ID_SHIFT;
> +	secs = unpack_secs(entry);
> +	entry >>= (EVICTION_SECS_POS_SHIFT + EVICTION_SECS_SHRINK_SHIFT);
>  
>  	*memcgidp = memcgid;
>  	*pgdat = NODE_DATA(nid);
>  	*evictionp = entry << bucket_order;
> +	*prev_secs = secs;
>  }
>  
>  /**
> @@ -242,9 +305,22 @@ bool workingset_refault(void *shadow)
>  	unsigned long refault;
>  	struct pglist_data *pgdat;
>  	int memcgid;
> +#ifndef NO_SECS_IN_WORKINGSET
> +	unsigned long avg_refault_time;
> +	unsigned long refault_time;
> +	int tradition;
> +	unsigned int prev_secs;
> +	unsigned int secs;
> +	unsigned long refaults_ratio;
> +#endif
> +	struct timespec ts;
> +	/*
> +	convert jiffies to second
> +	*/
> +	get_monotonic_boottime(&ts);
> +	secs = (unsigned int)ts.tv_sec ? (unsigned int)ts.tv_sec : 1;
>  
> -	unpack_shadow(shadow, &memcgid, &pgdat, &eviction);
> -
> +	unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &prev_secs);
>  	rcu_read_lock();
>  	/*
>  	 * Look up the memcg associated with the stored ID. It might
> @@ -288,14 +364,37 @@ bool workingset_refault(void *shadow)
>  	 * list is not a problem.
>  	 */
>  	refault_distance = (refault - eviction) & EVICTION_MASK;
> -
>  	inc_lruvec_state(lruvec, WORKINGSET_REFAULT);
> -
> -	if (refault_distance <= active_file) {
> +#ifndef NO_SECS_IN_WORKINGSET
> +	refaults_ratio = (atomic_long_read(&lruvec->inactive_age) + 1) / secs;
> +	atomic_long_set(&lruvec->refaults_ratio, refaults_ratio);
> +	refault_time = secs - prev_secs;
> +	avg_refault_time = active_file / refaults_ratio;
> +	tradition = !!(refault_distance < active_file);
> +	if (refault_time <= avg_refault_time) {
> +#else
> +	if (refault_distance < active_file) {
> +#endif
>  		inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);
> +#ifndef NO_SECS_IN_WORKINGSET
> +		trace_printk("WKST_ACT[%d]:rft_dis %ld, act_file %ld \
> +				rft_ratio %ld rft_time %ld avg_rft_time %ld \
> +				refault %ld eviction %ld secs %d pre_secs %d\n",
> +				tradition, refault_distance, active_file,
> +				refaults_ratio, refault_time, avg_refault_time,
> +				refault, eviction, secs, prev_secs);
> +#endif
>  		rcu_read_unlock();
>  		return true;
>  	}
> +#ifndef NO_SECS_IN_WORKINGSET
> +	trace_printk("WKST_INACT[%d]:rft_dis %ld, act_file %ld \
> +			rft_ratio %ld rft_time %ld avg_rft_time %ld \
> +			refault %ld eviction %ld secs %d pre_secs %d\n",
> +			tradition, refault_distance, active_file,
> +			refaults_ratio, refault_time, avg_refault_time,
> +			refault, eviction, secs, prev_secs);
> +#endif
>  	rcu_read_unlock();
>  	return false;
>  }
> @@ -513,7 +612,9 @@ static int __init workingset_init(void)
>  	unsigned int max_order;
>  	int ret;
>  
> -	BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT);
> +	BUILD_BUG_ON(BITS_PER_LONG < (EVICTION_SHIFT
> +				+ EVICTION_SECS_POS_SHIFT
> +				+ EVICTION_SECS_SHRINK_SHIFT));
>  	/*
>  	 * Calculate the eviction bucket size to cover the longest
>  	 * actionable refault distance, which is currently half of
> @@ -521,7 +622,8 @@ static int __init workingset_init(void)
>  	 * some more pages at runtime, so keep working with up to
>  	 * double the initial memory by using totalram_pages as-is.
>  	 */
> -	timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
> +	timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT
> +			- EVICTION_SECS_POS_SHIFT - EVICTION_SECS_SHRINK_SHIFT;
>  	max_order = fls_long(totalram_pages - 1);
>  	if (max_order > timestamp_bits)
>  		bucket_order = max_order - timestamp_bits;
> -- 
> 1.9.1

-- 
Michal Hocko
SUSE Labs

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ