lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <87sfn2u0vy.fsf@linux.ibm.com>
Date:   Fri, 15 Jul 2022 15:57:13 +0530
From:   "Aneesh Kumar K.V" <aneesh.kumar@...ux.ibm.com>
To:     "Huang, Ying" <ying.huang@...el.com>
Cc:     linux-mm@...ck.org, akpm@...ux-foundation.org,
        Wei Xu <weixugc@...gle.com>, Yang Shi <shy828301@...il.com>,
        Davidlohr Bueso <dave@...olabs.net>,
        Tim C Chen <tim.c.chen@...el.com>,
        Michal Hocko <mhocko@...nel.org>,
        Linux Kernel Mailing List <linux-kernel@...r.kernel.org>,
        Hesham Almatary <hesham.almatary@...wei.com>,
        Dave Hansen <dave.hansen@...el.com>,
        Jonathan Cameron <Jonathan.Cameron@...wei.com>,
        Alistair Popple <apopple@...dia.com>,
        Dan Williams <dan.j.williams@...el.com>,
        Johannes Weiner <hannes@...xchg.org>, jvgediya.oss@...il.com,
        Jagdish Gediya <jvgediya@...ux.ibm.com>
Subject: Re: [PATCH v9 1/8] mm/demotion: Add support for explicit memory tiers

Aneesh Kumar K V <aneesh.kumar@...ux.ibm.com> writes:

....

> 
>> You dropped the original sysfs interface patches from the series, but
>> the kernel internal implementation is still for the original sysfs
>> interface.  For example, memory tier ID is for the original sysfs
>> interface, not for the new proposed sysfs interface.  So I suggest you
>> to implement with the new interface in mind.  What do you think about
>> the following design?
>> 
>
> Sorry I am not able to follow you here. This patchset completely drops
> exposing memory tiers to userspace via sysfs. Instead it allow
> creation of memory tiers with specific tierID from within the kernel/device driver.
> Default tierID is 200 and dax kmem creates memory tier with tierID 100. 
>
>
>> - Each NUMA node belongs to a memory type, and each memory type
>>   corresponds to a "abstract distance", so each NUMA node corresonds to
>>   a "distance".  For simplicity, we can start with static distances, for
>>   example, DRAM (default): 150, PMEM: 250.  The distance of each NUMA
>>   node can be recorded in a global array,
>> 
>>     int node_distances[MAX_NUMNODES];
>> 
>>   or, just
>> 
>>     pgdat->distance
>> 
>
> I don't follow this. I guess you are trying to have a different design.
> Would it be much easier if you can write this in the form of a patch? 
>
>
>> - Each memory tier corresponds to a range of distance, for example,
>>   0-100, 100-200, 200-300, >300, we can start with static ranges too.
>> 
>> - The core API of memory tier could be
>> 
>>     struct memory_tier *find_create_memory_tier(int distance);
>> 
>>   it will find the memory tier which covers "distance" in the memory
>>   tier list, or create a new memory tier if not found.
>> 
>
> I was expecting this to be internal to dax kmem. How dax kmem maps
> "abstract distance" to a memory tier. At this point this patchset is
> keeping all that for a future patchset. 
>

This shows how i was expecting "abstract distance" to be integrated.

diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c
index 82cae08976bc..1281aec63986 100644
--- a/arch/powerpc/platforms/pseries/papr_scm.c
+++ b/arch/powerpc/platforms/pseries/papr_scm.c
@@ -1332,6 +1332,7 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
 	ndr_desc.mapping = &mapping;
 	ndr_desc.num_mappings = 1;
 	ndr_desc.nd_set = &p->nd_set;
+	ndr_desc.memtier_distance = PMEM_MEMTIER_DEFAULT_DISTANCE;
 
 	if (p->hcall_flush_required) {
 		set_bit(ND_REGION_ASYNC, &ndr_desc.flags);
diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index ae5f4acf2675..7b8cf1f15562 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -2641,6 +2641,10 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc,
 			NUMA_NO_NODE, ndr_desc->numa_node, &res.start, &res.end);
 	}
 
+	/*
+	 * We may want to look at SLIT/HMAT to fine tune this
+	 */
+	ndr_desc->memtier_distance  =  PMEM_MEMTIER_DEFAULT_DISTANCE;
 	/*
 	 * Persistence domain bits are hierarchical, if
 	 * ACPI_NFIT_CAPABILITY_CACHE_FLUSH is set then
diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index 1dad813ee4a6..708a40cf29c0 100644
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -570,8 +570,9 @@ static void dax_region_unregister(void *region)
 }
 
 struct dax_region *alloc_dax_region(struct device *parent, int region_id,
-		struct range *range, int target_node, unsigned int align,
-		unsigned long flags)
+				    struct range *range, int target_node,
+				    int memtier_distance, unsigned int align,
+				    unsigned long flags)
 {
 	struct dax_region *dax_region;
 
@@ -599,6 +600,7 @@ struct dax_region *alloc_dax_region(struct device *parent, int region_id,
 	dax_region->align = align;
 	dax_region->dev = parent;
 	dax_region->target_node = target_node;
+	dax_region->memtier_distance = memtier_distance;
 	ida_init(&dax_region->ida);
 	dax_region->res = (struct resource) {
 		.start = range->start,
@@ -1370,6 +1372,7 @@ struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data)
 
 	dev_dax->dax_dev = dax_dev;
 	dev_dax->target_node = dax_region->target_node;
+	dev_dax->memtier_distance = dax_region->memtier_distance;
 	dev_dax->align = dax_region->align;
 	ida_init(&dev_dax->ida);
 	kref_get(&dax_region->kref);
diff --git a/drivers/dax/bus.h b/drivers/dax/bus.h
index fbb940293d6d..3de4292392dd 100644
--- a/drivers/dax/bus.h
+++ b/drivers/dax/bus.h
@@ -13,8 +13,9 @@ void dax_region_put(struct dax_region *dax_region);
 
 #define IORESOURCE_DAX_STATIC (1UL << 0)
 struct dax_region *alloc_dax_region(struct device *parent, int region_id,
-		struct range *range, int target_node, unsigned int align,
-		unsigned long flags);
+				    struct range *range, int target_node,
+				    int memtier_distance, unsigned int align,
+				    unsigned long flags);
 
 struct dev_dax_data {
 	struct dax_region *dax_region;
diff --git a/drivers/dax/dax-private.h b/drivers/dax/dax-private.h
index 1c974b7caae6..5db382c78d0e 100644
--- a/drivers/dax/dax-private.h
+++ b/drivers/dax/dax-private.h
@@ -31,6 +31,7 @@ void dax_bus_exit(void);
 struct dax_region {
 	int id;
 	int target_node;
+	int memtier_distance;
 	struct kref kref;
 	struct device *dev;
 	unsigned int align;
@@ -64,6 +65,7 @@ struct dev_dax {
 	struct dax_device *dax_dev;
 	unsigned int align;
 	int target_node;
+	int memtier_distance;
 	int id;
 	struct ida ida;
 	struct device dev;
diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c
index 1bf040dbc834..b9f80971c07b 100644
--- a/drivers/dax/hmem/hmem.c
+++ b/drivers/dax/hmem/hmem.c
@@ -26,7 +26,7 @@ static int dax_hmem_probe(struct platform_device *pdev)
 	range.start = res->start;
 	range.end = res->end;
 	dax_region = alloc_dax_region(dev, pdev->id, &range, mri->target_node,
-			PMD_SIZE, 0);
+				      mri->memtier_distance, PMD_SIZE, 0);
 	if (!dax_region)
 		return -ENOMEM;
 
diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
index 0c03889286ac..32878bd96f09 100644
--- a/drivers/dax/kmem.c
+++ b/drivers/dax/kmem.c
@@ -45,13 +45,18 @@ struct dax_kmem_data {
 static unsigned int dax_kmem_memtier = MEMORY_TIER_PMEM;
 module_param(dax_kmem_memtier, uint, 0644);
 
+int find_memtier_from_distance(struct dev_dax *dev_dax)
+{
+	return dax_kmem_memtier + dev_dax->memtier_distance;
+}
+
 static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
 {
 	struct device *dev = &dev_dax->dev;
 	unsigned long total_len = 0;
 	struct dax_kmem_data *data;
 	int i, rc, mapped = 0;
-	int numa_node;
+	int numa_node, mem_tier;
 
 	/*
 	 * Ensure good NUMA information for the persistent memory.
@@ -150,7 +155,8 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
 	}
 
 	dev_set_drvdata(dev, data);
-	node_create_and_set_memory_tier(numa_node, dax_kmem_memtier);
+	mem_tier = find_memtier_from_distance(dev_dax);
+	node_create_and_set_memory_tier(numa_node, mem_tier);
 	return 0;
 
 err_request_mem:
diff --git a/drivers/dax/pmem.c b/drivers/dax/pmem.c
index f050ea78bb83..1b51fc0490de 100644
--- a/drivers/dax/pmem.c
+++ b/drivers/dax/pmem.c
@@ -54,8 +54,10 @@ static struct dev_dax *__dax_pmem_probe(struct device *dev)
 	range = pgmap.range;
 	range.start += offset;
 	dax_region = alloc_dax_region(dev, region_id, &range,
-			nd_region->target_node, le32_to_cpu(pfn_sb->align),
-			IORESOURCE_DAX_STATIC);
+				      nd_region->target_node,
+				      nd_region->memtier_distance,
+				      le32_to_cpu(pfn_sb->align),
+				      IORESOURCE_DAX_STATIC);
 	if (!dax_region)
 		return ERR_PTR(-ENOMEM);
 
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index ec5219680092..cf7a379a2220 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -416,6 +416,7 @@ struct nd_region {
 	u64 ndr_size;
 	u64 ndr_start;
 	int id, num_lanes, ro, numa_node, target_node;
+	int memtier_distance;
 	void *provider_data;
 	struct kernfs_node *bb_state;
 	struct badblocks bb;
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index d976260eca7a..f2067de8d660 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -1019,6 +1019,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
 	nd_region->ro = ro;
 	nd_region->numa_node = ndr_desc->numa_node;
 	nd_region->target_node = ndr_desc->target_node;
+	nd_region->memtier_distance = ndr_desc->memtier_distance;
 	ida_init(&nd_region->ns_ida);
 	ida_init(&nd_region->btt_ida);
 	ida_init(&nd_region->pfn_ida);
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index 0d61e07b6827..bf20e018074f 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -121,6 +121,7 @@ struct nd_region_desc {
 	int num_lanes;
 	int numa_node;
 	int target_node;
+	int memtier_distance;
 	unsigned long flags;
 	struct device_node *of_node;
 	int (*flush)(struct nd_region *nd_region, struct bio *bio);
@@ -224,6 +225,8 @@ struct nvdimm_fw_ops {
 	int (*arm)(struct nvdimm *nvdimm, enum nvdimm_fwa_trigger arg);
 };
 
+#define PMEM_MEMTIER_DEFAULT_DISTANCE  10
+
 void badrange_init(struct badrange *badrange);
 int badrange_add(struct badrange *badrange, u64 addr, u64 length);
 void badrange_forget(struct badrange *badrange, phys_addr_t start,
diff --git a/include/linux/memregion.h b/include/linux/memregion.h
index c04c4fd2e209..5850e2bbbfed 100644
--- a/include/linux/memregion.h
+++ b/include/linux/memregion.h
@@ -6,6 +6,7 @@
 
 struct memregion_info {
 	int target_node;
+	int memtier_distance;
 };
 
 #ifdef CONFIG_MEMREGION

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ