linux-kernel - Re: [PATCH 2/2] mm: mempolicy: Interleave policy for tiered memory nodes

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <ZRXFNYyK7gOSt59D@memverge.com>
Date:   Thu, 28 Sep 2023 14:25:57 -0400
From:   Gregory Price <gregory.price@...verge.com>
To:     Ravi Jonnalagadda <ravis.opensrc@...ron.com>
Cc:     linux-mm@...r.kernel.org, linux-cxl@...r.kernel.org,
        linux-kernel@...r.kernel.org, linux-arch@...r.kernel.org,
        linux-api@...r.kernel.org, luto@...nel.org, tglx@...utronix.de,
        mingo@...hat.com, bp@...en8.de, dietmar.eggemann@....com,
        vincent.guittot@...aro.org, dave.hansen@...ux.intel.com,
        hpa@...or.com, arnd@...db.de, akpm@...ux-foundation.org,
        x86@...nel.org, aneesh.kumar@...ux.ibm.com, ying.huang@...el.com,
        jgroves@...ron.com, sthanneeru@...ron.com, emirakhur@...ron.com,
        vtanna@...ron.com
Subject: Re: [PATCH 2/2] mm: mempolicy: Interleave policy for tiered memory
 nodes

On Wed, Sep 27, 2023 at 03:20:02PM +0530, Ravi Jonnalagadda wrote:
> From: Srinivasulu Thanneeru <sthanneeru@...ron.com>
> 
> Existing interleave policy spreads out pages evenly across a set of
> specified nodes, i.e. 1:1 interleave. Upcoming tiered memory systems
> have CPU-less memory nodes with different peak bandwidth and
> latency-bandwidth characteristics. In such systems, we will want to
> use the additional bandwidth provided by lowtier memory for
> bandwidth-intensive applications. However, the default 1:1 interleave
> can lead to suboptimal bandwidth distribution.
> 
> Introduce an interleave policy for multi-tiers that is based on
> interleave weights, where pages are assigned from nodes of the tier
> based on the tier weight.
> 
> For instance, 50:30:20 are the weights of tiers 0, 1, and 3, which
> leads to a 50%/30%/20% traffic breakdown across the three tiers.
> 
> Signed-off-by: Srinivasulu Thanneeru <sthanneeru@...ron.com>
> Co-authored-by: Ravi Jonnalagadda <ravis.opensrc@...ron.com>
> ---
>  include/linux/memory-tiers.h |  25 +++++++-
>  include/linux/sched.h        |   2 +
>  mm/memory-tiers.c            |  31 ++--------
>  mm/mempolicy.c               | 107 +++++++++++++++++++++++++++++++++--
>  4 files changed, 132 insertions(+), 33 deletions(-)
> 
> diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
> index c62d286749d0..74be39cb56c4 100644
> --- a/include/linux/memory-tiers.h
> +++ b/include/linux/memory-tiers.h
> @@ -2,6 +2,7 @@
>  #ifndef _LINUX_MEMORY_TIERS_H
>  #define _LINUX_MEMORY_TIERS_H
>  
> +#include <linux/device.h>
>  #include <linux/types.h>
>  #include <linux/nodemask.h>
>  #include <linux/kref.h>
> @@ -21,7 +22,27 @@
>  
>  #define MAX_TIER_INTERLEAVE_WEIGHT 100
>  
> -struct memory_tier;
> +struct memory_tier {
> +	/* hierarchy of memory tiers */
> +	struct list_head list;
> +	/* list of all memory types part of this tier */
> +	struct list_head memory_types;
> +	/*
> +	 * By default all tiers will have weight as 1, which means they
> +	 * follow default standard allocation.
> +	 */
> +	unsigned short interleave_weight;
> +	/*
> +	 * start value of abstract distance. memory tier maps
> +	 * an abstract distance  range,
> +	 * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE
> +	 */
> +	int adistance_start;
> +	struct device dev;
> +	/* All the nodes that are part of all the lower memory tiers. */
> +	nodemask_t lower_tier_mask;
> +};
> +

I would make this change in a separate pull-ahead patch.  To make review
of the actual functional changes easier.

>  struct memory_dev_type {
>  	/* list of memory types that are part of same tier as this type */
>  	struct list_head tier_sibiling;
> @@ -38,6 +59,8 @@ struct memory_dev_type *alloc_memory_type(int adistance);
>  void put_memory_type(struct memory_dev_type *memtype);
>  void init_node_memory_type(int node, struct memory_dev_type *default_type);
>  void clear_node_memory_type(int node, struct memory_dev_type *memtype);
> +struct memory_tier *node_get_memory_tier(int node);
> +nodemask_t get_memtier_nodemask(struct memory_tier *memtier);
>  #ifdef CONFIG_MIGRATION
>  int next_demotion_node(int node);
>  void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 77f01ac385f7..07ea837c3afb 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1252,7 +1252,9 @@ struct task_struct {
>  	/* Protected by alloc_lock: */
>  	struct mempolicy		*mempolicy;
>  	short				il_prev;
> +	unsigned short			il_count;
>  	short				pref_node_fork;
> +	unsigned int			current_node;
>  #endif
>  #ifdef CONFIG_NUMA_BALANCING
>  	int				numa_scan_seq;
> diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
> index 7e06c9e0fa41..5e2ddc9f994a 100644
> --- a/mm/memory-tiers.c
> +++ b/mm/memory-tiers.c
> @@ -8,27 +8,6 @@
>  
>  #include "internal.h"
>  
> -struct memory_tier {
> -	/* hierarchy of memory tiers */
> -	struct list_head list;
> -	/* list of all memory types part of this tier */
> -	struct list_head memory_types;
> -	/*
> -	 * By default all tiers will have weight as 1, which means they
> -	 * follow default standard allocation.
> -	 */
> -	unsigned short interleave_weight;
> -	/*
> -	 * start value of abstract distance. memory tier maps
> -	 * an abstract distance  range,
> -	 * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE
> -	 */
> -	int adistance_start;
> -	struct device dev;
> -	/* All the nodes that are part of all the lower memory tiers. */
> -	nodemask_t lower_tier_mask;
> -};
> -

See above

>  struct demotion_nodes {
>  	nodemask_t preferred;
>  };
> @@ -115,7 +94,7 @@ static inline struct memory_tier *to_memory_tier(struct device *device)
>  	return container_of(device, struct memory_tier, dev);
>  }
>  
> -static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
> +nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
>  {
>  	nodemask_t nodes = NODE_MASK_NONE;
>  	struct memory_dev_type *memtype;
> @@ -264,7 +243,7 @@ static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memty
>  	return memtier;
>  }
>  
> -static struct memory_tier *__node_get_memory_tier(int node)
> +struct memory_tier *node_get_memory_tier(int node)
>  {
>  	pg_data_t *pgdat;
>  
> @@ -380,7 +359,7 @@ static void disable_all_demotion_targets(void)
>  		 * We are holding memory_tier_lock, it is safe
>  		 * to access pgda->memtier.
>  		 */
> -		memtier = __node_get_memory_tier(node);
> +		memtier = node_get_memory_tier(node);
>  		if (memtier)
>  			memtier->lower_tier_mask = NODE_MASK_NONE;
>  	}
> @@ -417,7 +396,7 @@ static void establish_demotion_targets(void)
>  		best_distance = -1;
>  		nd = &node_demotion[node];
>  
> -		memtier = __node_get_memory_tier(node);
> +		memtier = node_get_memory_tier(node);
>  		if (!memtier || list_is_last(&memtier->list, &memory_tiers))
>  			continue;
>  		/*
> @@ -562,7 +541,7 @@ static bool clear_node_memory_tier(int node)
>  	 * This also enables us to free the destroyed memory tier
>  	 * with kfree instead of kfree_rcu
>  	 */
> -	memtier = __node_get_memory_tier(node);
> +	memtier = node_get_memory_tier(node);
>  	if (memtier) {
>  		struct memory_dev_type *memtype;
>  

See above

> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index 42b5567e3773..4f80c6ee1176 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -100,6 +100,8 @@
>  #include <linux/ctype.h>
>  #include <linux/mm_inline.h>
>  #include <linux/mmu_notifier.h>
> +#include <linux/memory-tiers.h>
> +#include <linux/nodemask.h>
>  #include <linux/printk.h>
>  #include <linux/swapops.h>
>  
> @@ -882,8 +884,11 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
>  
>  	old = current->mempolicy;
>  	current->mempolicy = new;
> -	if (new && new->mode == MPOL_INTERLEAVE)
> +	if (new && new->mode == MPOL_INTERLEAVE) {
>  		current->il_prev = MAX_NUMNODES-1;
> +		current->il_count = 0;
> +		current->current_node = MAX_NUMNODES;
> +	}
>  	task_unlock(current);
>  	mpol_put(old);
>  	ret = 0;
> @@ -1899,13 +1904,76 @@ static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
>  	return nd;
>  }
>  

Should this be a change to MPOL_INTERLEAVE, or should this be a new
memory policy:  MPOL_TIERING?

For the sake of retaining the existing behavior of MPOL_INTERLEAVE as-is
and preventing the introduction of bugs, maybe creating MPOL_TIERING is
preferable?

That would allow the rest of mempolicy to be changed cleanly if the
intent is to override mempolicy with memtier behavior.

> +/* Return interleave weight of node from tier's weight */
> +static unsigned short node_interleave_weight(int nid, nodemask_t pol_nodemask)
> +{
> +	struct memory_tier *memtier;
> +	nodemask_t tier_nodes, tier_and_pol;
> +	unsigned short avrg_weight = 0;
> +	int node, nnodes, reminder;
> +
> +	memtier = node_get_memory_tier(nid);
> +
> +	if (!memtier)
> +		return 0;
> +
> +	tier_nodes = get_memtier_nodemask(memtier);
> +	nodes_and(tier_and_pol, tier_nodes, pol_nodemask);
> +	nnodes = nodes_weight(tier_and_pol);
> +	if (!nnodes)
> +		return 0;
> +
> +	avrg_weight = memtier->interleave_weight / nnodes;
> +	/* Set minimum weight of node as 1 so that at least one page
> +	 * is allocated.
> +	 */
> +	if (!avrg_weight)
> +		return 1;
> +
> +	reminder = memtier->interleave_weight % nnodes;
> +	if (reminder) {
> +		for_each_node_mask(node, tier_and_pol) {
> +			/* Increment target node's weight by 1, if it falls
> +			 * within remaining weightage 'reminder'.
> +			 */
> +			if (node == nid) {
> +				if (reminder > 0)
> +					avrg_weight = avrg_weight + 1;
> +				break;
> +			}
> +			reminder--;
> +		}
> +	}
> +	return avrg_weight;
> +}
> +

With this, there are now 3 components with node masks that have to be
cross-referenced to figure out what is both preferred and what is
allowed:  Mempolicy, Cgroups, MemTier.

is abstracting nodes with memtiers actually preferable to abstracting
devices with numa nodes and simply creating a new memory policy with
weights directly described in the policy?

>  /* Do dynamic interleaving for a process */
>  static unsigned interleave_nodes(struct mempolicy *policy)
>  {
>  	unsigned next;
>  	struct task_struct *me = current;
> +	unsigned short node_weight = 0;
>  
> -	next = next_node_in(me->il_prev, policy->nodes);
> +	/* select current node or next node from nodelist based on
> +	 * available tier interleave weight.
> +	 */
> +	if (me->current_node == MAX_NUMNODES)
> +		next = next_node_in(me->il_prev, policy->nodes);
> +	else
> +		next = me->current_node;
> +	node_weight = node_interleave_weight(next, policy->nodes);
> +	if (!node_weight)
> +		goto set_il_prev;
> +	if (me->il_count < node_weight) {

What happens if a node weight changes to be less than il_count in the
middle of operation?  Is il_count reset when node weights are changed?

Since memtier's weights are global, this can't possibly work as intended.

> +		me->il_count++;
> +		me->current_node = next;
> +		if (me->il_count == node_weight) {
> +			me->current_node = MAX_NUMNODES;
> +			me->il_count = 0;
> +		}
> +	}
> +
> +set_il_prev:
>  	if (next < MAX_NUMNODES)
>  		me->il_prev = next;
>  	return next;
> @@ -1966,9 +2034,10 @@ unsigned int mempolicy_slab_node(void)
>  static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
>  {
>  	nodemask_t nodemask = pol->nodes;
> -	unsigned int target, nnodes;
> -	int i;
> -	int nid;
> +	unsigned int target, nnodes, vnnodes = 0;
> +	unsigned short node_weight = 0;
> +	int nid, vtarget, i;
> +
>  	/*
>  	 * The barrier will stabilize the nodemask in a register or on
>  	 * the stack so that it will stop changing under the code.
> @@ -1981,7 +2050,33 @@ static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
>  	nnodes = nodes_weight(nodemask);
>  	if (!nnodes)
>  		return numa_node_id();
> -	target = (unsigned int)n % nnodes;
> +
> +	/*
> +	 * Calculate the virtual target for @n in a nodelist that is scaled
> +	 * with interleave weights....
> +	 */
> +	for_each_node_mask(nid, nodemask) {
> +		node_weight = node_interleave_weight(nid, nodemask);
> +		if (!node_weight)
> +			continue;
> +		vnnodes += node_weight;
> +	}

Should this be precomputed?  This seems like a considerable amount of
work in the hot path for page allocation and there are many numa nodes.

> +	if (!vnnodes)
> +		return numa_node_id();
> +	vtarget = (int)((unsigned int)n % vnnodes);
> +
> +	/* ...then map it back to the physical nodelist */
> +	target = 0;
> +	for_each_node_mask(nid, nodemask) {
> +		node_weight = node_interleave_weight(nid, nodemask);
> +		if (!node_weight)
> +			continue;
> +		vtarget -= node_weight;
> +		if (vtarget < 0)
> +			break;
> +		target++;
> +	}

See above, now you're double-counting.

> +
>  	nid = first_node(nodemask);
>  	for (i = 0; i < target; i++)
>  		nid = next_node(nid, nodemask);
> -- 
> 2.39.3
>