[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <87a67j1uyk.fsf@yhuang6-desk2.ccr.corp.intel.com>
Date: Thu, 01 Sep 2022 14:15:31 +0800
From: "Huang, Ying" <ying.huang@...el.com>
To: "Aneesh Kumar K.V" <aneesh.kumar@...ux.ibm.com>
Cc: linux-mm@...ck.org, akpm@...ux-foundation.org,
Wei Xu <weixugc@...gle.com>, Yang Shi <shy828301@...il.com>,
Davidlohr Bueso <dave@...olabs.net>,
Tim C Chen <tim.c.chen@...el.com>,
Michal Hocko <mhocko@...nel.org>,
Linux Kernel Mailing List <linux-kernel@...r.kernel.org>,
Hesham Almatary <hesham.almatary@...wei.com>,
Dave Hansen <dave.hansen@...el.com>,
Jonathan Cameron <Jonathan.Cameron@...wei.com>,
Alistair Popple <apopple@...dia.com>,
Dan Williams <dan.j.williams@...el.com>,
Johannes Weiner <hannes@...xchg.org>, jvgediya.oss@...il.com,
Bharata B Rao <bharata@....com>
Subject: Re: [PATCH mm-unstable] mm/demotion: Assign correct memory type for
multiple dax devices with the same node affinity
"Aneesh Kumar K.V" <aneesh.kumar@...ux.ibm.com> writes:
> With multiple dax devices having the same node affinity, the kernel wrongly assigned
> default_dram memory type to some devices after the memory hotplug operation. Fix this by
> not clearing node_memory_types on the dax device remove.
Sorry for the late reply.
Just to confirm: are there multiple dax devices in one NUMA node?
If you can show the steps to reproduce the bug, that will make it even
easier to understand.
Best Regards,
Huang, Ying
> The current kernel cleared node_memory_type on successful removal of a dax device.
> But then we can have multiple dax devices with the same node affinity. Clearing the
> node_memory_type results in assigning other dax devices to the default dram type when
> we bring them online.
>
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@...ux.ibm.com>
> ---
> mm/memory-tiers.c | 37 +++++++++++++++++++++++++++++--------
> 1 file changed, 29 insertions(+), 8 deletions(-)
>
> diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
> index ba844fe9cc8c..c4bd6d052a33 100644
> --- a/mm/memory-tiers.c
> +++ b/mm/memory-tiers.c
> @@ -27,9 +27,14 @@ struct demotion_nodes {
> nodemask_t preferred;
> };
>
> +struct node_memory_type_map {
> + struct memory_dev_type *memtype;
> + int map_count;
> +};
> +
> static DEFINE_MUTEX(memory_tier_lock);
> static LIST_HEAD(memory_tiers);
> -static struct memory_dev_type *node_memory_types[MAX_NUMNODES];
> +static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
> static struct memory_dev_type *default_dram_type;
> #ifdef CONFIG_MIGRATION
> static int top_tier_adistance;
> @@ -386,9 +391,19 @@ static inline void establish_demotion_targets(void) {}
>
> static inline void __init_node_memory_type(int node, struct memory_dev_type *memtype)
> {
> - if (!node_memory_types[node]) {
> - node_memory_types[node] = memtype;
> - kref_get(&memtype->kref);
> + if (!node_memory_types[node].memtype)
> + node_memory_types[node].memtype = memtype;
> + /*
> + * for each device getting added in the same NUMA node
> + * with this specific memtype, bump the map count. We
> + * only take memtype device reference once, so that
> + * changing a node memtype can be done by dropping the
> + * only reference count taken here.
> + */
> +
> + if (node_memory_types[node].memtype == memtype) {
> + if (!node_memory_types[node].map_count++)
> + kref_get(&memtype->kref);
> }
> }
>
> @@ -406,7 +421,7 @@ static struct memory_tier *set_node_memory_tier(int node)
>
> __init_node_memory_type(node, default_dram_type);
>
> - memtype = node_memory_types[node];
> + memtype = node_memory_types[node].memtype;
> node_set(node, memtype->nodes);
> memtier = find_create_memory_tier(memtype);
> if (!IS_ERR(memtier))
> @@ -448,7 +463,7 @@ static bool clear_node_memory_tier(int node)
>
> rcu_assign_pointer(pgdat->memtier, NULL);
> synchronize_rcu();
> - memtype = node_memory_types[node];
> + memtype = node_memory_types[node].memtype;
> node_clear(node, memtype->nodes);
> if (nodes_empty(memtype->nodes)) {
> list_del_init(&memtype->tier_sibiling);
> @@ -502,8 +517,14 @@ EXPORT_SYMBOL_GPL(init_node_memory_type);
> void clear_node_memory_type(int node, struct memory_dev_type *memtype)
> {
> mutex_lock(&memory_tier_lock);
> - if (node_memory_types[node] == memtype) {
> - node_memory_types[node] = NULL;
> + if (node_memory_types[node].memtype == memtype)
> + node_memory_types[node].map_count--;
> + /*
> + * If we unmapped all the attached devices to this node,
> + * clear the node memory type.
> + */
> + if (!node_memory_types[node].map_count) {
> + node_memory_types[node].memtype = NULL;
> kref_put(&memtype->kref, release_memtype);
> }
> mutex_unlock(&memory_tier_lock);
Powered by blists - more mailing lists