[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1650601109.vb3owbt14k.astroid@bobo.none>
Date: Fri, 22 Apr 2022 14:31:33 +1000
From: Nicholas Piggin <npiggin@...il.com>
To: "Edgecombe, Rick P" <rick.p.edgecombe@...el.com>,
"Torvalds, Linus" <torvalds@...ux-foundation.org>
Cc: "akpm@...ux-foundation.org" <akpm@...ux-foundation.org>,
"ast@...nel.org" <ast@...nel.org>, "bp@...en8.de" <bp@...en8.de>,
"bpf@...r.kernel.org" <bpf@...r.kernel.org>,
"daniel@...earbox.net" <daniel@...earbox.net>,
"dborkman@...hat.com" <dborkman@...hat.com>,
"edumazet@...gle.com" <edumazet@...gle.com>,
"hch@...radead.org" <hch@...radead.org>,
"hpa@...or.com" <hpa@...or.com>,
"imbrenda@...ux.ibm.com" <imbrenda@...ux.ibm.com>,
"Kernel-team@...com" <Kernel-team@...com>,
"linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
"linux-mm@...ck.org" <linux-mm@...ck.org>,
"mbenes@...e.cz" <mbenes@...e.cz>,
"mcgrof@...nel.org" <mcgrof@...nel.org>,
"pmladek@...e.com" <pmladek@...e.com>,
"rppt@...nel.org" <rppt@...nel.org>,
"song@...nel.org" <song@...nel.org>,
"songliubraving@...com" <songliubraving@...com>
Subject: Re: [PATCH v4 bpf 0/4] vmalloc: bpf: introduce VM_ALLOW_HUGE_VMAP
Excerpts from Nicholas Piggin's message of April 22, 2022 1:08 pm:
> Excerpts from Edgecombe, Rick P's message of April 22, 2022 12:29 pm:
>> On Fri, 2022-04-22 at 10:12 +1000, Nicholas Piggin wrote:
>>> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
>>> index e163372d3967..70933f4ed069 100644
>>> --- a/mm/vmalloc.c
>>> +++ b/mm/vmalloc.c
>>> @@ -2925,12 +2925,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
>>> if (nr != nr_pages_request)
>>> break;
>>> }
>>> - } else
>>> - /*
>>> - * Compound pages required for remap_vmalloc_page if
>>> - * high-order pages.
>>> - */
>>> - gfp |= __GFP_COMP;
>>> + }
>>>
>>> /* High-order pages or fallback path if "bulk" fails. */
>>>
>>> @@ -2944,6 +2939,13 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
>>> page = alloc_pages_node(nid, gfp, order);
>>> if (unlikely(!page))
>>> break;
>>> + /*
>>> + * Higher order allocations must be able to be treated as
>>> + * independent small pages by callers (as they can with
>>> + * small page allocs).
>>> + */
>>> + if (order)
>>> + split_page(page, order);
>>>
>>> /*
>>> * Careful, we allocate and map page-order pages, but
>>
>> FWIW, I like this direction. I think it needs to free them differently
>> though? Since it currently assumes they are high order pages in that path.
>
> Yeah I got a bit excited there, but fairly sure that's the bug.
> I'll do a proper patch.
So here's the patch on top of the revert. Only tested on a lowly
powerpc machine, but it does fix this simple test case that does
what the drm driver is obviously doing:
size_t sz = PMD_SIZE;
void *mem = vmalloc(sz);
struct page *p = vmalloc_to_page(mem + PAGE_SIZE*3);
p->mapping = NULL;
p->index = 0;
INIT_LIST_HEAD(&p->lru);
vfree(mem);
Without the below fix the same exact problem reproduces:
BUG: Bad page state in process swapper/0 pfn:00743
page:(____ptrval____) refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x743
flags: 0x7ffff000000000(node=0|zone=0|lastcpupid=0x7ffff)
raw: 007ffff000000000 c00c00000001d0c8 c00c00000001d0c8 0000000000000000
raw: 0000000000000000 0000000000000000 00000000ffffffff 0000000000000000
page dumped because: corrupted mapping in tail page
Modules linked in:
CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.18.0-rc3-00082-gfc6fff4a7ce1-dirty #2810
Call Trace:
[c000000002383940] [c0000000006ebb00] dump_stack_lvl+0x74/0xa8 (unreliable)
[c000000002383980] [c0000000003dabdc] bad_page+0x12c/0x170
[c000000002383a00] [c0000000003dad08] free_tail_pages_check+0xe8/0x190
[c000000002383a30] [c0000000003dc45c] free_pcp_prepare+0x31c/0x4e0
[c000000002383a90] [c0000000003df9f0] free_unref_page+0x40/0x1b0
[c000000002383ad0] [c0000000003d7fc8] __vunmap+0x1d8/0x420
[c000000002383b70] [c00000000102e0d8] proc_vmalloc_init+0xdc/0x108
[c000000002383bf0] [c000000000011f80] do_one_initcall+0x60/0x2c0
[c000000002383cc0] [c000000001001658] kernel_init_freeable+0x32c/0x3cc
[c000000002383da0] [c000000000012564] kernel_init+0x34/0x1a0
[c000000002383e10] [c00000000000ce64] ret_from_kernel_thread+0x5c/0x64
Any other concerns with the fix?
Thanks,
Nick
--
mm/vmalloc: huge vmalloc backing pages should be split rather than compound
Huge vmalloc higher-order backing pages were allocated with __GFP_COMP
in order to allow the sub-pages to be refcounted by callers such as
"remap_vmalloc_page [sic]" (remap_vmalloc_range).
However, a similar problem exists for other struct page fields that
callers use. For example, fb_deferred_io_fault() takes a vmalloc'ed page
and not only refcounts it but also uses ->lru, ->mapping and ->index.
This is not compatible with compound sub-pages.
The correct approach is to use split high-order pages for the huge
vmalloc backing. These allow callers to treat them in exactly the same
way as individually-allocated order-0 pages.
Signed-off-by: Nicholas Piggin <npiggin@...il.com>
---
mm/vmalloc.c | 36 +++++++++++++++++++++---------------
1 file changed, 21 insertions(+), 15 deletions(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 0b17498a34f1..09470361dc03 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2653,15 +2653,18 @@ static void __vunmap(const void *addr, int deallocate_pages)
vm_remove_mappings(area, deallocate_pages);
if (deallocate_pages) {
- unsigned int page_order = vm_area_page_order(area);
- int i, step = 1U << page_order;
+ int i;
- for (i = 0; i < area->nr_pages; i += step) {
+ for (i = 0; i < area->nr_pages; i++) {
struct page *page = area->pages[i];
BUG_ON(!page);
- mod_memcg_page_state(page, MEMCG_VMALLOC, -step);
- __free_pages(page, page_order);
+ mod_memcg_page_state(page, MEMCG_VMALLOC, -1);
+ /*
+ * High-order allocs for huge vmallocs are split, so
+ * can be freed as an array of order-0 allocations
+ */
+ __free_pages(page, 0);
cond_resched();
}
atomic_long_sub(area->nr_pages, &nr_vmalloc_pages);
@@ -2914,12 +2917,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
if (nr != nr_pages_request)
break;
}
- } else
- /*
- * Compound pages required for remap_vmalloc_page if
- * high-order pages.
- */
- gfp |= __GFP_COMP;
+ }
/* High-order pages or fallback path if "bulk" fails. */
@@ -2933,6 +2931,15 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
page = alloc_pages_node(nid, gfp, order);
if (unlikely(!page))
break;
+ /*
+ * Higher order allocations must be able to be treated as
+ * independent small pages by callers (as they can with
+ * small-page vmallocs). Some drivers do their own refcounting
+ * on vmalloc_to_page() pages, some use page->mapping,
+ * page->lru, etc.
+ */
+ if (order)
+ split_page(page, order);
/*
* Careful, we allocate and map page-order pages, but
@@ -2992,11 +2999,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
if (gfp_mask & __GFP_ACCOUNT) {
- int i, step = 1U << page_order;
+ int i;
- for (i = 0; i < area->nr_pages; i += step)
- mod_memcg_page_state(area->pages[i], MEMCG_VMALLOC,
- step);
+ for (i = 0; i < area->nr_pages; i++)
+ mod_memcg_page_state(area->pages[i], MEMCG_VMALLOC, 1);
}
/*
--
2.35.1
Powered by blists - more mailing lists