Date:   Wed, 20 Nov 2019 13:25:36 -0600
From:   Scott Cheloha <cheloha@...ux.vnet.ibm.com>
To:     linux-kernel@...r.kernel.org,
        "Rafael J. Wysocki" <rafael@...nel.org>,
        Greg Kroah-Hartman <gregkh@...uxfoundation.org>,
        David Hildenbrand <david@...hat.com>
Cc:     nathanl@...ux.ibm.com, ricklind@...ux.vnet.ibm.com,
        Scott Cheloha <cheloha@...ux.vnet.ibm.com>
Subject: [PATCH] memory subsystem: cache memory blocks in radix tree to accelerate lookup

Searching for a particular memory block by id is an O(n) operation
because each block's device is kept in an unsorted linked list on the
subsystem bus.

Lookup is much faster if we cache the blocks in a radix tree.  Memory
subsystem initialization and hotplug/hotunplug are at least a little
faster for any machine with more than ~100 blocks, and the speedup grows
with the block count.

Signed-off-by: Scott Cheloha <cheloha@...ux.vnet.ibm.com>
---
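For reviewers, the whole change boils down to one pattern.  Here it is
condensed from the diff below (device refcounting and error handling
elided, so this fragment is illustrative, not compilable on its own):

static RADIX_TREE(memory_block_tree, GFP_KERNEL); /* block id -> memory_block */

/* register_memory(): populate the cache once device_register() succeeds */
ret = radix_tree_insert(&memory_block_tree, memory->dev.id, memory);

/* find_memory_block_by_id(): O(tree height) lookup replaces the O(n) bus walk */
mem = radix_tree_lookup(&memory_block_tree, block_id);

/* unregister_memory(): drop the cache entry before the device goes away */
radix_tree_delete(&memory_block_tree, memory->dev.id);
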
On a 40GB Power9 VM I'm seeing nice initialization speed improvements
consistent with the change in data structure.  Here are the per-block
timings before the patch:

# uptime        elapsed         block-id
0.005121        0.000033        0
0.005154        0.000028        1
0.005182        0.000035        2
0.005217        0.000030        3
0.005247        0.000039        4
0.005286        0.000031        5
0.005317        0.000030        6
0.005347        0.000031        7
0.005378        0.000030        8
0.005408        0.000031        9
[...]
0.091603        0.000143        999
0.091746        0.000175        1000
0.091921        0.000143        1001
0.092064        0.000142        1002
0.092206        0.000143        1003
0.092349        0.000143        1004
0.092492        0.000143        1005
0.092635        0.000144        1006
0.092779        0.000143        1007
0.092922        0.000144        1008
[...]
0.301879        0.000258        2038
0.302137        0.000267        2039
0.302404        0.000291        2040
0.302695        0.000259        2041
0.302954        0.000258        2042
0.303212        0.000259        2043
0.303471        0.000260        2044
0.303731        0.000258        2045
0.303989        0.000259        2046
0.304248        0.000260        2047

The per-block cost clearly grows linearly with the block id: registering
block N involves a lookup that scans the ~N devices already on the bus,
which is why the per-block time climbs from ~30us at block 0 to ~260us
at block 2047, and why total initialization time is quadratic in the
block count.

With the patch:

# uptime        elapsed         block-id
0.004701        0.000029        0
0.004730        0.000028        1
0.004758        0.000028        2
0.004786        0.000027        3
0.004813        0.000037        4
0.004850        0.000027        5
0.004877        0.000027        6
0.004904        0.000027        7
0.004931        0.000026        8
0.004957        0.000027        9
[...]
0.032718        0.000027        999
0.032745        0.000027        1000
0.032772        0.000026        1001
0.032798        0.000027        1002
0.032825        0.000027        1003
0.032852        0.000026        1004
0.032878        0.000027        1005
0.032905        0.000027        1006
0.032932        0.000026        1007
0.032958        0.000027        1008
[...]
0.061148        0.000027        2038
0.061175        0.000027        2039
0.061202        0.000026        2040
0.061228        0.000027        2041
0.061255        0.000027        2042
0.061282        0.000026        2043
0.061308        0.000027        2044
0.061335        0.000026        2045
0.061361        0.000026        2046
0.061387        0.000027        2047

With the radix tree the per-block cost flattens out: ~27us per block,
regardless of how many blocks are already registered.

I'm seeing similar improvements on my development PC, but the numbers
are less dramatic because the block size on a PC grows with the amount
of memory, which keeps the block count low.  On powerpc the gains are
much more visible because the block size tops out at 256MB, so larger
machines end up with thousands of blocks.  (The block size is exposed
in /sys/devices/system/memory/block_size_bytes.)
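
The shape of these numbers is easy to reproduce in userspace.  Below is
a toy benchmark (made-up names; a fixed two-level table stands in for
the kernel's radix tree -- this is not the kernel code) that "registers"
NBLOCKS ids and then looks each one up, once through an unsorted list
and once through the table:

/* toy_lookup_bench.c - userspace toy, not the kernel code.
 * Build: cc -O2 toy_lookup_bench.c -o toy_lookup_bench
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>

#define NBLOCKS	2048
#define FANOUT	64	/* 6 bits per level, two levels: ids 0..4095 */

struct block { unsigned long id; struct block *next; };

static struct block *list_head;			/* unsorted list, like the bus */
static struct block *table[FANOUT][FANOUT];	/* fixed-height "radix tree" */

static struct block *list_lookup(unsigned long id)
{
	struct block *b;

	for (b = list_head; b; b = b->next)	/* O(n) scan */
		if (b->id == id)
			return b;
	return NULL;
}

static struct block *table_lookup(unsigned long id)
{
	return table[id / FANOUT][id % FANOUT];	/* O(height) descent */
}

static double now(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec + ts.tv_nsec / 1e9;
}

int main(void)
{
	uintptr_t sink = 0;	/* keeps the lookups from being optimized out */
	unsigned long i;
	double t;

	for (i = 0; i < NBLOCKS; i++) {		/* "register" every block */
		struct block *b = calloc(1, sizeof(*b));

		b->id = i;
		b->next = list_head;
		list_head = b;
		table[i / FANOUT][i % FANOUT] = b;
	}

	t = now();
	for (i = 0; i < NBLOCKS; i++)		/* one lookup per block id */
		sink += (uintptr_t)list_lookup(i);
	printf("list:  %f s\n", now() - t);

	t = now();
	for (i = 0; i < NBLOCKS; i++)
		sink += (uintptr_t)table_lookup(i);
	printf("table: %f s (sink %lx)\n", now() - t, (unsigned long)sink);
	return 0;
}

The absolute numbers from the toy are meaningless; only the scaling
matters.  A real radix tree has more levels and sparse allocation, but
the lookup shape is the same: a fixed number of pointer dereferences
instead of a scan.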

 drivers/base/memory.c | 53 ++++++++++++++++++++++++++-----------------
 1 file changed, 32 insertions(+), 21 deletions(-)

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 84c4e1f72cbd..fc0a4880c321 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -20,6 +20,7 @@
 #include <linux/memory_hotplug.h>
 #include <linux/mm.h>
 #include <linux/mutex.h>
+#include <linux/radix-tree.h>
 #include <linux/stat.h>
 #include <linux/slab.h>
 
@@ -59,6 +60,13 @@ static struct bus_type memory_subsys = {
 	.offline = memory_subsys_offline,
 };
 
+/*
+ * Memory blocks are cached in a local radix tree to avoid
+ * a costly linear search for the corresponding device on
+ * the subsystem bus.
+ */
+static RADIX_TREE(memory_block_tree, GFP_KERNEL);
+
 static BLOCKING_NOTIFIER_HEAD(memory_chain);
 
 int register_memory_notifier(struct notifier_block *nb)
@@ -580,20 +588,14 @@ int __weak arch_get_memory_phys_device(unsigned long start_pfn)
 /* A reference for the returned memory block device is acquired. */
 static struct memory_block *find_memory_block_by_id(unsigned long block_id)
 {
-	struct device *dev;
+	struct memory_block *mem;
 
-	dev = subsys_find_device_by_id(&memory_subsys, block_id, NULL);
-	return dev ? to_memory_block(dev) : NULL;
+	mem = radix_tree_lookup(&memory_block_tree, block_id);
+	if (mem)
+		get_device(&mem->dev);
+	return mem;
 }
 
-/*
- * For now, we have a linear search to go find the appropriate
- * memory_block corresponding to a particular phys_index. If
- * this gets to be a real problem, we can always use a radix
- * tree or something here.
- *
- * This could be made generic for all device subsystems.
- */
 struct memory_block *find_memory_block(struct mem_section *section)
 {
 	unsigned long block_id = base_memory_block_id(__section_nr(section));
@@ -636,9 +638,15 @@ int register_memory(struct memory_block *memory)
 	memory->dev.offline = memory->state == MEM_OFFLINE;
 
 	ret = device_register(&memory->dev);
-	if (ret)
+	if (ret) {
 		put_device(&memory->dev);
-
+		return ret;
+	}
+	ret = radix_tree_insert(&memory_block_tree, memory->dev.id, memory);
+	if (ret) {
+		put_device(&memory->dev);
+		device_unregister(&memory->dev);
+	}
 	return ret;
 }
 
@@ -696,6 +704,8 @@ static void unregister_memory(struct memory_block *memory)
 	if (WARN_ON_ONCE(memory->dev.bus != &memory_subsys))
 		return;
 
+	WARN_ON(radix_tree_delete(&memory_block_tree, memory->dev.id) == NULL);
+
 	/* drop the ref. we got via find_memory_block() */
 	put_device(&memory->dev);
 	device_unregister(&memory->dev);
@@ -851,20 +861,21 @@ void __init memory_dev_init(void)
 int walk_memory_blocks(unsigned long start, unsigned long size,
 		       void *arg, walk_memory_blocks_func_t func)
 {
-	const unsigned long start_block_id = phys_to_block_id(start);
-	const unsigned long end_block_id = phys_to_block_id(start + size - 1);
+	struct radix_tree_iter iter;
+	const unsigned long start_id = phys_to_block_id(start);
+	const unsigned long end_id = phys_to_block_id(start + size - 1);
 	struct memory_block *mem;
-	unsigned long block_id;
+	void **slot;
 	int ret = 0;
 
 	if (!size)
 		return 0;
 
-	for (block_id = start_block_id; block_id <= end_block_id; block_id++) {
-		mem = find_memory_block_by_id(block_id);
-		if (!mem)
-			continue;
-
+	radix_tree_for_each_slot(slot, &memory_block_tree, &iter, start_id) {
+		mem = radix_tree_deref_slot(slot);
+		if (mem->dev.id > end_id)
+			break;
+		get_device(&mem->dev);
 		ret = func(mem, arg);
 		put_device(&mem->dev);
 		if (ret)
 			break;
-- 
2.24.0.rc1
