linux-kernel - Re: Linux 4.9-rc6

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <1482334222.8944.41.camel@edumazet-glaptop3.roam.corp.google.com>
Date:   Wed, 21 Dec 2016 07:30:22 -0800
From:   Eric Dumazet <eric.dumazet@...il.com>
To:     Linus Torvalds <torvalds@...ux-foundation.org>
Cc:     Thorsten Leemhuis <regressions@...mhuis.info>,
        Linux Kernel Mailing List <linux-kernel@...r.kernel.org>,
        Al Viro <viro@...iv.linux.org.uk>,
        David Rientjes <rientjes@...gle.com>,
        Hugh Dickins <hughd@...gle.com>
Subject: Re: Linux 4.9-rc6

On Sun, 2016-12-04 at 09:17 -0800, Eric Dumazet wrote:
> On Sun, 2016-12-04 at 03:10 -0800, Linus Torvalds wrote:
> > 
> > 
> > On Dec 4, 2016 02:43, "Thorsten Leemhuis" <regressions@...mhuis.info>
> > wrote:
> >         
> >         
> >         What's the status of the patch below? From the discussion it looks a
> >         lot like
> >         it was developed to fix a regression in 4.9, but the patch
> >         afaics has
> >         neither hit mainline nor linux-next yet.
> > 
> > 
> > It's not a regression as far as I can tell. It's a small optimization.
> > Maybe.
> > 
> > 
> > It's not going into 4.9, and it's not even clear it's worth it later either,
> > unless somebody has numbers (which I haven't seen)
> > 
> Right, the patch was not in any way ready for 4.9 ;)
> 
> I'll try to complete this for next cycle.

I now have a hacky patch that also adds PMD alignment for large
allocations, and supports hugepages (this last part depends on
CONFIG_HAVE_ARCH_HUGE_VMAP at the moment, so x86/arm64 only so far).

Toshi Kani added pmd_set_huge() in commit e61ce6ade404e ("mm: change
ioremap to set up huge I/O mappings"), I am not sure why vmalloc() was
not considered (or I might have missed it completely)

It seems to provide a gain of about 25 cycles per random access for large
tables on my x86 lab hosts.

(I did a test with a program having 10 Million fds)

Allocations above 2 MB (pages >= 512), like the Dentry cache,
Inode-cache, TCP established hash table, or large alloc_fdmem() ones,
might benefit from this.

lpaa23:~# grep large /proc/vmallocinfo 
0xffffc90000009000-0xffffc9000000c000   12288 alloc_large_system_hash+0x189/0x253 pages=2 vmalloc N0=1 N1=1
0xffffc9000000c000-0xffffc9000000f000   12288 alloc_large_system_hash+0x189/0x253 pages=2 vmalloc N0=1 N1=1
0xffffc9000001e000-0xffffc9000009f000  528384 alloc_large_system_hash+0x189/0x253 pages=128 vmalloc N0=64 N1=64
0xffffc9000009f000-0xffffc900000e0000  266240 alloc_large_system_hash+0x189/0x253 pages=64 vmalloc N0=32 N1=32
0xffffc900001d9000-0xffffc900001dc000   12288 alloc_large_system_hash+0x189/0x253 pages=2 vmalloc N0=1 N1=1
0xffffc90000200000-0xffffc90010201000 268439552 alloc_large_system_hash+0x189/0x253 pages=65536 vmalloc vpages N0=32768 N1=32768
0xffffc90010400000-0xffffc90018401000 134221824 alloc_large_system_hash+0x189/0x253 pages=32768 vmalloc vpages N0=16384 N1=16384
0xffffc90018600000-0xffffc90018a01000 4198400 alloc_large_system_hash+0x189/0x253 pages=1024 vmalloc vpages N0=512 N1=512
0xffffc90018c00000-0xffffc90019001000 4198400 alloc_large_system_hash+0x189/0x253 pages=1024 vmalloc vpages N0=512 N1=512
0xffffc9001b249000-0xffffc9001b34a000 1052672 alloc_large_system_hash+0x189/0x253 pages=256 vmalloc N0=128 N1=128
0xffffc9001b400000-0xffffc9001b801000 4198400 alloc_large_system_hash+0x189/0x253 pages=1024 vmalloc vpages N0=512 N1=512
0xffffc9001ba00000-0xffffc9001bc01000 2101248 alloc_large_system_hash+0x189/0x253 pages=512 vmalloc N0=256 N1=256
0xffffc9001bc01000-0xffffc9001bd02000 1052672 alloc_large_system_hash+0x189/0x253 pages=256 vmalloc N0=128 N1=128
0xffffc9001be00000-0xffffc9001c001000 2101248 alloc_large_system_hash+0x189/0x253 pages=512 vmalloc N0=256 N1=256


I won't be able to split this patch into 3 parts before January 6th, after
my vacation. I am showing the WIP in case anyone is interested in seeing it.

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index a5584384eabc..055b027ee659 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -21,6 +21,7 @@
 #include <linux/debugobjects.h>
 #include <linux/kallsyms.h>
 #include <linux/list.h>
+#include <linux/mempolicy.h>
 #include <linux/notifier.h>
 #include <linux/rbtree.h>
 #include <linux/radix-tree.h>
@@ -154,6 +155,18 @@ static int vmap_pmd_range(pud_t *pud, unsigned long addr,
 		return -ENOMEM;
 	do {
 		next = pmd_addr_end(addr, end);
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+		if (next - addr == PMD_SIZE) {
+			struct page *page = pages[*nr];
+
+			if (compound_order(page) == PMD_SHIFT - PAGE_SHIFT) {
+				if (pmd_set_huge(pmd, page_to_phys(page), prot)) {
+					(*nr) += 1 << (PMD_SHIFT - PAGE_SHIFT);
+					continue;
+				}
+			}
+		}
+#endif
 		if (vmap_pte_range(pmd, addr, next, prot, pages, nr))
 			return -ENOMEM;
 	} while (pmd++, addr = next, addr != end);
@@ -1349,7 +1362,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
 	if (flags & VM_IOREMAP)
 		align = 1ul << clamp_t(int, get_count_order_long(size),
 				       PAGE_SHIFT, IOREMAP_MAX_ORDER);
-
+	else if (size >= PMD_SIZE)
+		align = PMD_SIZE;
 	area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
 	if (unlikely(!area))
 		return NULL;
@@ -1482,11 +1496,14 @@ static void __vunmap(const void *addr, int deallocate_pages)
 	if (deallocate_pages) {
 		int i;
 
-		for (i = 0; i < area->nr_pages; i++) {
+		for (i = 0; i < area->nr_pages;) {
 			struct page *page = area->pages[i];
+			unsigned int order;
 
 			BUG_ON(!page);
-			__free_pages(page, 0);
+			order = compound_order(page);
+			__free_pages(page, order);
+			i += 1 << order;
 		}
 
 		kvfree(area->pages);
@@ -1613,16 +1630,39 @@ EXPORT_SYMBOL(vmap);
 static void *__vmalloc_node(unsigned long size, unsigned long align,
 			    gfp_t gfp_mask, pgprot_t prot,
 			    int node, const void *caller);
+
+static int vmalloc_max_order(int node, int nr_pages)
+{
+	int max_node_order = min(PMD_SHIFT - PAGE_SHIFT, MAX_ORDER - 1);
+
+#if defined(CONFIG_NUMA)
+	if (nr_online_nodes > 1 && node == NUMA_NO_NODE) {
+		struct mempolicy *pol = current->mempolicy;
+		int pages_per_node, nr_nodes;
+
+		if (pol && pol->mode == MPOL_INTERLEAVE) {
+			nr_nodes = nodes_weight(pol->v.nodes);
+			pages_per_node = DIV_ROUND_UP(nr_pages, nr_nodes);
+			max_node_order = min(max_node_order,
+					     ilog2(pages_per_node));
+		}
+	}
+#endif
+	return max_node_order;
+}
+
 static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 				 pgprot_t prot, int node)
 {
 	struct page **pages;
-	unsigned int nr_pages, array_size, i;
+	unsigned int nr_pages, array_size, i, j;
 	const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
 	const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN;
+	int max_node_order;
 
 	nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
 	array_size = (nr_pages * sizeof(struct page *));
+	max_node_order = vmalloc_max_order(node, nr_pages);
 
 	area->nr_pages = nr_pages;
 	/* Please note that the recursion is strictly bounded. */
@@ -1639,20 +1679,31 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 		return NULL;
 	}
 
-	for (i = 0; i < area->nr_pages; i++) {
-		struct page *page;
 
-		if (node == NUMA_NO_NODE)
-			page = alloc_page(alloc_mask);
-		else
-			page = alloc_pages_node(node, alloc_mask, 0);
+	for (i = 0; i < area->nr_pages;) {
+		int order = min(ilog2(area->nr_pages - i), max_node_order);
+		struct page *page;
 
-		if (unlikely(!page)) {
-			/* Successfully allocated i pages, free them in __vunmap() */
-			area->nr_pages = i;
-			goto fail;
+		for (;;) {
+			gfp_t gfp = alloc_mask;
+
+			if (order > 0)
+				gfp = (gfp & ~__GFP_DIRECT_RECLAIM) |
+				      __GFP_NORETRY | __GFP_COMP;
+			if (node == NUMA_NO_NODE)
+				page = alloc_pages(gfp, order);
+			else
+				page = alloc_pages_node(node, gfp, order);
+			if (page)
+				break;
+			if (unlikely(--order < 0)) {
+				/* Successfully allocated i pages, free them in __vunmap() */
+				area->nr_pages = i;
+				goto fail;
+			}
 		}
-		area->pages[i] = page;
+		for (j = 0; j < (1U << order); j++)
+			area->pages[i++] = page++;
 		if (gfpflags_allow_blocking(gfp_mask))
 			cond_resched();
 	}
@@ -2619,9 +2670,13 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v)
 
 		memset(counters, 0, nr_node_ids * sizeof(unsigned int));
 
-		for (nr = 0; nr < v->nr_pages; nr++)
-			counters[page_to_nid(v->pages[nr])]++;
+		for (nr = 0; nr < v->nr_pages;) {
+			struct page *page = v->pages[nr];
+			int npages = 1 << compound_order(page);
 
+			counters[page_to_nid(page)] += npages;
+			nr += npages;
+		}
 		for_each_node_state(nr, N_HIGH_MEMORY)
 			if (counters[nr])
 				seq_printf(m, " N%u=%u", nr, counters[nr]);