[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <8d9dc103-47c5-4719-971a-31efb091432a@redhat.com>
Date: Wed, 30 Oct 2024 10:01:37 +0100
From: David Hildenbrand <david@...hat.com>
To: John Hubbard <jhubbard@...dia.com>, Alistair Popple <apopple@...dia.com>
Cc: Christoph Hellwig <hch@...radead.org>,
Andrew Morton <akpm@...ux-foundation.org>,
LKML <linux-kernel@...r.kernel.org>, linux-mm@...ck.org,
linux-stable@...r.kernel.org, Vivek Kasireddy <vivek.kasireddy@...el.com>,
Dave Airlie <airlied@...hat.com>, Gerd Hoffmann <kraxel@...hat.com>,
Matthew Wilcox <willy@...radead.org>, Jason Gunthorpe <jgg@...dia.com>,
Peter Xu <peterx@...hat.com>, Arnd Bergmann <arnd@...db.de>,
Daniel Vetter <daniel.vetter@...ll.ch>, Dongwon Kim <dongwon.kim@...el.com>,
Hugh Dickins <hughd@...gle.com>, Junxiao Chang <junxiao.chang@...el.com>,
Mike Kravetz <mike.kravetz@...cle.com>, Oscar Salvador <osalvador@...e.de>
Subject: Re: [PATCH] mm/gup: restore the ability to pin more than 2GB at a
time
On 30.10.24 09:34, David Hildenbrand wrote:
> On 30.10.24 07:50, John Hubbard wrote:
>> On 10/29/24 11:18 PM, Alistair Popple wrote:
>>> John Hubbard <jhubbard@...dia.com> writes:
>>>> On 10/29/24 9:42 PM, Christoph Hellwig wrote:
>>>>> On Tue, Oct 29, 2024 at 09:39:15PM -0700, John Hubbard wrote:
>> ...
>>>>> Because pinning down these amounts of memoryt is completely insane.
>>>>> I don't mind the switch to kvmalloc, but we need to put in an upper
>>>>> bound of what can be pinned.
>>>>
>>>> I'm wondering though, how it is that we decide how much of the user's
>>>> system we prevent them from using? :) People with hardware accelerators
>>>> do not always have page fault capability, and yet these troublesome
>>>> users insist on stacking their system full of DRAM and then pointing
>>>> the accelerator to it.
>>>>
>>>> How would we choose a value? Memory sizes keep going up...
>>>
>>> The obvious answer is you let users decide. I did have a patch series to
>>> do that via a cgroup[1]. However I dropped that series mostly because I
>>> couldn't find any users of such a limit to provide feedback on how they
>>> would use it or how they wanted it to work.
>>>
>>
>> Trawling through the discussion there, I see that Jason Gunthorpe mentioned:
>>
>> "Things like VFIO & KVM use cases effectively pin 90% of all system memory"
>
> The unusual thing is not the amount of system memory we are pinning but
> *how many* pages we try pinning in the single call.
>
> If you stare at vfio_pin_pages_remote, we seem to be batching it.
>
> long req_pages = min_t(long, npage, batch->capacity);
>
> Which is
>
> #define VFIO_BATCH_MAX_CAPACITY (PAGE_SIZE / sizeof(struct page *))
>
>
> So you can fix this in your driver ;)
>
>
> We should maybe try a similar limit internally: if you call
> pin_user_pages_remote() with a large number, we'll cap it at some magic
> value (similar to above). The caller will simply realize that not all
> pages were pinned and will retry.
>
> See get_user_pages_remote(): "Returns either number of pages pinned
> (which may be less than the number requested), or an error. Details
> about the return value:"
>
>
> Alternatively, I recall there was a way to avoid the temporary
> allocation ... let me hack up a prototype real quick.
Completely untested (also note the interesting TODO):
From a23984b4f1a39ec984489fbe16708aedf4f9db95 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@...hat.com>
Date: Wed, 30 Oct 2024 10:00:50 +0100
Subject: [PATCH] tmp
Signed-off-by: David Hildenbrand <david@...hat.com>
---
mm/gup.c | 120 ++++++++++++++++++++++++++++++++++++-------------------
1 file changed, 78 insertions(+), 42 deletions(-)
diff --git a/mm/gup.c b/mm/gup.c
index a82890b46a36..8807b36c2363 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2273,20 +2273,56 @@ struct page *get_dump_page(unsigned long addr)
#endif /* CONFIG_ELF_CORE */
#ifdef CONFIG_MIGRATION
+
+/*
+ * An array of either pages or folios. While we could currently interpret a
+ * list of folios like a list of pages, it would only work as long as
+ * "struct folio" overlays "struct page" -- and it would not allow for
+ * avoiding page_folio() calls.
+ */
+struct pages_or_folios {
+ union {
+ struct page **pages;
+ struct folio **folios;
+ void **entries;
+ };
+ bool has_folios;
+ long nr_entries;
+};
+
+static struct folio *pofs_get_folio(struct pages_or_folios *pofs, long i)
+{
+ if (pofs->has_folios)
+ return pofs->folios[i];
+ return page_folio(pofs->pages[i]);
+}
+
+static void pofs_clear_entry(struct pages_or_folios *pofs, long i)
+{
+ pofs->entries[i] = NULL;
+}
+
+static void pofs_unpin(struct pages_or_folios *pofs)
+{
+ if (pofs->has_folios)
+ unpin_folios(pofs->folios, pofs->nr_entries);
+ else
+ unpin_user_pages(pofs->pages, pofs->nr_entries);
+}
+
/*
* Returns the number of collected folios. Return value is always >= 0.
*/
static unsigned long collect_longterm_unpinnable_folios(
- struct list_head *movable_folio_list,
- unsigned long nr_folios,
- struct folio **folios)
+ struct list_head *movable_folio_list,
+ struct pages_or_folios *pofs)
{
unsigned long i, collected = 0;
struct folio *prev_folio = NULL;
bool drain_allow = true;
- for (i = 0; i < nr_folios; i++) {
- struct folio *folio = folios[i];
+ for (i = 0; i < pofs->nr_entries; i++) {
+ struct folio *folio = pofs_get_folio(pofs, i);
if (folio == prev_folio)
continue;
@@ -2310,6 +2346,11 @@ static unsigned long collect_longterm_unpinnable_folios(
drain_allow = false;
}
+ /*
+ * TODO: if isolation fails we might already have it in the
+ * list, if pages of different folios are interleaved
+ * (e.g., COW). We might want to check all entries in the list.
+ */
if (!folio_isolate_lru(folio))
continue;
@@ -2328,15 +2369,14 @@ static unsigned long collect_longterm_unpinnable_folios(
* failure (or partial success).
*/
static int migrate_longterm_unpinnable_folios(
- struct list_head *movable_folio_list,
- unsigned long nr_folios,
- struct folio **folios)
+ struct list_head *movable_folio_list,
+ struct pages_or_folios *pofs)
{
int ret;
unsigned long i;
- for (i = 0; i < nr_folios; i++) {
- struct folio *folio = folios[i];
+ for (i = 0; i < pofs->nr_entries; i++) {
+ struct folio *folio = pofs_get_folio(pofs, i);
if (folio_is_device_coherent(folio)) {
/*
@@ -2344,7 +2384,7 @@ static int migrate_longterm_unpinnable_folios(
* convert the pin on the source folio to a normal
* reference.
*/
- folios[i] = NULL;
+ pofs_clear_entry(pofs, i);
folio_get(folio);
gup_put_folio(folio, 1, FOLL_PIN);
@@ -2363,8 +2403,8 @@ static int migrate_longterm_unpinnable_folios(
* calling folio_isolate_lru() which takes a reference so the
* folio won't be freed if it's migrating.
*/
- unpin_folio(folios[i]);
- folios[i] = NULL;
+ unpin_folio(pofs_get_folio(pofs, i));
+ pofs_clear_entry(pofs, i);
}
if (!list_empty(movable_folio_list)) {
@@ -2387,12 +2427,24 @@ static int migrate_longterm_unpinnable_folios(
return -EAGAIN;
err:
- unpin_folios(folios, nr_folios);
+ pofs_unpin(pofs);
putback_movable_pages(movable_folio_list);
return ret;
}
+static long check_and_migrate_movable_pages_or_folios(struct pages_or_folios *pofs)
+{
+ LIST_HEAD(movable_folio_list);
+ unsigned long collected;
+
+ collected = collect_longterm_unpinnable_folios(&movable_folio_list, pofs);
+ if (!collected)
+ return 0;
+
+ return migrate_longterm_unpinnable_folios(&movable_folio_list, pofs);
+}
+
/*
* Check whether all folios are *allowed* to be pinned indefinitely (longterm).
* Rather confusingly, all folios in the range are required to be pinned via
@@ -2412,41 +2464,25 @@ static int migrate_longterm_unpinnable_folios(
static long check_and_migrate_movable_folios(unsigned long nr_folios,
struct folio **folios)
{
- unsigned long collected;
- LIST_HEAD(movable_folio_list);
-
- collected = collect_longterm_unpinnable_folios(&movable_folio_list,
- nr_folios, folios);
- if (!collected)
- return 0;
+ struct pages_or_folios pofs = {
+ .folios = folios,
+ .has_folios = true,
+ .nr_entries = nr_folios,
+ };
- return migrate_longterm_unpinnable_folios(&movable_folio_list,
- nr_folios, folios);
+ return check_and_migrate_movable_pages_or_folios(&pofs);
}
-/*
- * This routine just converts all the pages in the @pages array to folios and
- * calls check_and_migrate_movable_folios() to do the heavy lifting.
- *
- * Please see the check_and_migrate_movable_folios() documentation for details.
- */
+/* See check_and_migrate_movable_folios(). */
static long check_and_migrate_movable_pages(unsigned long nr_pages,
struct page **pages)
{
- struct folio **folios;
- long i, ret;
+ struct pages_or_folios pofs = {
+ .pages = pages,
+ .nr_entries = nr_pages,
+ };
- folios = kmalloc_array(nr_pages, sizeof(*folios), GFP_KERNEL);
- if (!folios)
- return -ENOMEM;
-
- for (i = 0; i < nr_pages; i++)
- folios[i] = page_folio(pages[i]);
-
- ret = check_and_migrate_movable_folios(nr_pages, folios);
-
- kfree(folios);
- return ret;
+ return check_and_migrate_movable_pages_or_folios(&pofs);
}
#else
static long check_and_migrate_movable_pages(unsigned long nr_pages,
--
2.47.0
--
Cheers,
David / dhildenb
Powered by blists - more mailing lists