lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:   Wed, 08 Jun 2022 13:14:55 -0700
From:   Tim Chen <tim.c.chen@...ux.intel.com>
To:     "Aneesh Kumar K.V" <aneesh.kumar@...ux.ibm.com>,
        Ying Huang <ying.huang@...el.com>, linux-mm@...ck.org,
        akpm@...ux-foundation.org
Cc:     Wei Xu <weixugc@...gle.com>, Greg Thelen <gthelen@...gle.com>,
        Yang Shi <shy828301@...il.com>,
        Davidlohr Bueso <dave@...olabs.net>,
        Tim C Chen <tim.c.chen@...el.com>,
        Brice Goglin <brice.goglin@...il.com>,
        Michal Hocko <mhocko@...nel.org>,
        Linux Kernel Mailing List <linux-kernel@...r.kernel.org>,
        Hesham Almatary <hesham.almatary@...wei.com>,
        Dave Hansen <dave.hansen@...el.com>,
        Jonathan Cameron <Jonathan.Cameron@...wei.com>,
        Alistair Popple <apopple@...dia.com>,
        Dan Williams <dan.j.williams@...el.com>,
        Feng Tang <feng.tang@...el.com>,
        Jagdish Gediya <jvgediya@...ux.ibm.com>,
        Baolin Wang <baolin.wang@...ux.alibaba.com>,
        David Rientjes <rientjes@...gle.com>
Subject: Re: [PATCH v5 9/9] mm/demotion: Update node_is_toptier to work with
 memory tiers

On Wed, 2022-06-08 at 20:07 +0530, Aneesh Kumar K.V wrote:
> 
> 
> This is what I am testing now. We still need to closely audit that lock
> free access to the NODE_DATA()->memtier. 

You're referring to this or something else?  This is a write so seems okay.

> +	for_each_node_state(node, N_MEMORY) {
> +		/*
> +		 * Should be safe to do this early in the boot.
> +		 */
> +		NODE_DATA(node)->memtier = memtier;
> +		node_set(node, memtier->nodelist);
> +	}
>  	migrate_on_reclaim_init();


> For v6 I will keep this as a
> separate patch and once we all agree that it is safe, I will fold it
> back.

Please update code that uses __node_get_memory_tier(node) to use
NODE_DATA(node)->memtier;

Otherwise the code looks okay at first glance.

Tim

> 
> diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
> index a388a806b61a..3e733de1a8a0 100644
> --- a/include/linux/memory-tiers.h
> +++ b/include/linux/memory-tiers.h
> @@ -17,7 +17,6 @@
>  #define MAX_MEMORY_TIERS  (MAX_STATIC_MEMORY_TIERS + 2)
>  
>  extern bool numa_demotion_enabled;
> -extern nodemask_t promotion_mask;
>  int node_create_and_set_memory_tier(int node, int tier);
>  int next_demotion_node(int node);
>  int node_set_memory_tier(int node, int tier);
> @@ -25,15 +24,7 @@ int node_get_memory_tier_id(int node);
>  int node_reset_memory_tier(int node, int tier);
>  void node_remove_from_memory_tier(int node);
>  void node_get_allowed_targets(int node, nodemask_t *targets);
> -
> -/*
> - * By default all nodes are top tiper. As we create new memory tiers
> - * we below top tiers we add them to NON_TOP_TIER state.
> - */
> -static inline bool node_is_toptier(int node)
> -{
> -	return !node_isset(node, promotion_mask);
> -}
> +bool node_is_toptier(int node);
>  
>  #else
>  #define numa_demotion_enabled	false
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index aab70355d64f..c4fcfd2b9980 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -928,6 +928,9 @@ typedef struct pglist_data {
>  	/* Per-node vmstats */
>  	struct per_cpu_nodestat __percpu *per_cpu_nodestats;
>  	atomic_long_t		vm_stat[NR_VM_NODE_STAT_ITEMS];
> +#ifdef CONFIG_TIERED_MEMORY
> +	struct memory_tier *memtier;
> +#endif
>  } pg_data_t;
>  
>  #define node_present_pages(nid)	(NODE_DATA(nid)->node_present_pages)
> diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
> index 29a038bb38b0..31ef0fab5f19 100644
> --- a/mm/memory-tiers.c
> +++ b/mm/memory-tiers.c
> @@ -7,6 +7,7 @@
>  #include <linux/random.h>
>  #include <linux/memory.h>
>  #include <linux/idr.h>
> +#include <linux/rcupdate.h>
>  
>  #include "internal.h"
>  
> @@ -26,7 +27,7 @@ struct demotion_nodes {
>  static void establish_migration_targets(void);
>  static DEFINE_MUTEX(memory_tier_lock);
>  static LIST_HEAD(memory_tiers);
> -nodemask_t promotion_mask;
> +static int top_tier_rank;
>  /*
>   * node_demotion[] examples:
>   *
> @@ -135,7 +136,7 @@ static void memory_tier_device_release(struct device *dev)
>  	if (tier->dev.id >= MAX_STATIC_MEMORY_TIERS)
>  		ida_free(&memtier_dev_id, tier->dev.id);
>  
> -	kfree(tier);
> +	kfree_rcu(tier);
>  }
>  
>  /*
> @@ -233,6 +234,70 @@ static struct memory_tier *__get_memory_tier_from_id(int id)
>  	return NULL;
>  }
>  
> +/*
> + * Called with memory_tier_lock. Hence the device references cannot
> + * be dropped during this function.
> + */
> +static void memtier_node_clear(int node, struct memory_tier *memtier)
> +{
> +	pg_data_t *pgdat;
> +
> +	pgdat = NODE_DATA(node);
> +	if (!pgdat)
> +		return;
> +
> +	rcu_assign_pointer(pgdat->memtier, NULL);
> +	/*
> +	 * Make sure the read side sees the NULL value before we clear the node
> +	 * from the nodelist.
> +	 */
> +	synchronize_rcu();
> +	node_clear(node, memtier->nodelist);
> +}
> +
> +static void memtier_node_set(int node, struct memory_tier *memtier)
> +{
> +	pg_data_t *pgdat;
> +
> +	pgdat = NODE_DATA(node);
> +	if (!pgdat)
> +		return;
> +	/*
> +	 * Make sure we mark the memtier NULL before we assign the new memory tier
> +	 * to the NUMA node. This makes sure that anybody looking at NODE_DATA
> +	 * finds a NULL memtier or the one which is still valid.
> +	 */
> +	rcu_assign_pointer(pgdat->memtier, NULL);
> +	synchronize_rcu();
> +	node_set(node, memtier->nodelist);
> +	rcu_assign_pointer(pgdat->memtier, memtier);
> +}
> +
> +bool node_is_toptier(int node)
> +{
> +	bool toptier;
> +	pg_data_t *pgdat;
> +	struct memory_tier *memtier;
> +
> +	pgdat = NODE_DATA(node);
> +	if (!pgdat)
> +		return false;
> +
> +	rcu_read_lock();
> +	memtier = rcu_dereference(pgdat->memtier);
> +	if (!memtier) {
> +		toptier = true;
> +		goto out;
> +	}
> +	if (memtier->rank >= top_tier_rank)
> +		toptier = true;
> +	else
> +		toptier = false;
> +out:
> +	rcu_read_unlock();
> +	return toptier;
> +}
> +
>  static int __node_create_and_set_memory_tier(int node, int tier)
>  {
>  	int ret = 0;
> @@ -253,7 +318,7 @@ static int __node_create_and_set_memory_tier(int node, int tier)
>  			goto out;
>  		}
>  	}
> -	node_set(node, memtier->nodelist);
> +	memtier_node_set(node, memtier);
>  out:
>  	return ret;
>  }
> @@ -275,12 +340,12 @@ int node_create_and_set_memory_tier(int node, int tier)
>  	if (current_tier->dev.id == tier)
>  		goto out;
>  
> -	node_clear(node, current_tier->nodelist);
> +	memtier_node_clear(node, current_tier);
>  
>  	ret = __node_create_and_set_memory_tier(node, tier);
>  	if (ret) {
>  		/* reset it back to older tier */
> -		node_set(node, current_tier->nodelist);
> +		memtier_node_set(node, current_tier);
>  		goto out;
>  	}
>  
> @@ -305,7 +370,7 @@ static int __node_set_memory_tier(int node, int tier)
>  		ret = -EINVAL;
>  		goto out;
>  	}
> -	node_set(node, memtier->nodelist);
> +	memtier_node_set(node, memtier);
>  out:
>  	return ret;
>  }
> @@ -374,12 +439,12 @@ int node_reset_memory_tier(int node, int tier)
>  	if (current_tier->dev.id == tier)
>  		goto out;
>  
> -	node_clear(node, current_tier->nodelist);
> +	memtier_node_clear(node, current_tier);
>  
>  	ret = __node_set_memory_tier(node, tier);
>  	if (ret) {
>  		/* reset it back to older tier */
> -		node_set(node, current_tier->nodelist);
> +		memtier_node_set(node, current_tier);
>  		goto out;
>  	}
>  
> @@ -407,7 +472,7 @@ void node_remove_from_memory_tier(int node)
>  	 * empty then unregister it to make it invisible
>  	 * in sysfs.
>  	 */
> -	node_clear(node, memtier->nodelist);
> +	memtier_node_clear(node, memtier);
>  	if (nodes_empty(memtier->nodelist))
>  		unregister_memory_tier(memtier);
>  
> @@ -570,15 +635,13 @@ static void establish_migration_targets(void)
>  	 * a memory tier, we consider that tier as top tiper from
>  	 * which promotion is not allowed.
>  	 */
> -	promotion_mask = NODE_MASK_NONE;
>  	list_for_each_entry_reverse(memtier, &memory_tiers, list) {
>  		nodes_and(allowed, node_states[N_CPU], memtier->nodelist);
> -		if (nodes_empty(allowed))
> -			nodes_or(promotion_mask, promotion_mask, memtier->nodelist);
> -		else
> +		if (!nodes_empty(allowed)) {
> +			top_tier_rank = memtier->rank;
>  			break;
> +		}
>  	}
> -
>  	pr_emerg("top tier rank is %d\n", top_tier_rank);
>  	allowed = NODE_MASK_NONE;
>  	/*
> @@ -748,7 +811,7 @@ static const struct attribute_group *memory_tier_attr_groups[] = {
>  
>  static int __init memory_tier_init(void)
>  {
> -	int ret;
> +	int ret, node;
>  	struct memory_tier *memtier;
>  
>  	ret = subsys_system_register(&memory_tier_subsys, memory_tier_attr_groups);
> @@ -766,7 +829,13 @@ static int __init memory_tier_init(void)
>  		panic("%s() failed to register memory tier: %d\n", __func__, ret);
>  
>  	/* CPU only nodes are not part of memory tiers. */
> -	memtier->nodelist = node_states[N_MEMORY];
> +	for_each_node_state(node, N_MEMORY) {
> +		/*
> +		 * Should be safe to do this early in the boot.
> +		 */
> +		NODE_DATA(node)->memtier = memtier;
> +		node_set(node, memtier->nodelist);
> +	}
>  	migrate_on_reclaim_init();
>  
>  	return 0;

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ