linux-kernel - Re: [PATCH] mm/gup: restore the ability to pin more than 2GB at a time

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <8d9dc103-47c5-4719-971a-31efb091432a@redhat.com>
Date: Wed, 30 Oct 2024 10:01:37 +0100
From: David Hildenbrand <david@...hat.com>
To: John Hubbard <jhubbard@...dia.com>, Alistair Popple <apopple@...dia.com>
Cc: Christoph Hellwig <hch@...radead.org>,
 Andrew Morton <akpm@...ux-foundation.org>,
 LKML <linux-kernel@...r.kernel.org>, linux-mm@...ck.org,
 linux-stable@...r.kernel.org, Vivek Kasireddy <vivek.kasireddy@...el.com>,
 Dave Airlie <airlied@...hat.com>, Gerd Hoffmann <kraxel@...hat.com>,
 Matthew Wilcox <willy@...radead.org>, Jason Gunthorpe <jgg@...dia.com>,
 Peter Xu <peterx@...hat.com>, Arnd Bergmann <arnd@...db.de>,
 Daniel Vetter <daniel.vetter@...ll.ch>, Dongwon Kim <dongwon.kim@...el.com>,
 Hugh Dickins <hughd@...gle.com>, Junxiao Chang <junxiao.chang@...el.com>,
 Mike Kravetz <mike.kravetz@...cle.com>, Oscar Salvador <osalvador@...e.de>
Subject: Re: [PATCH] mm/gup: restore the ability to pin more than 2GB at a
 time

On 30.10.24 09:34, David Hildenbrand wrote:
> On 30.10.24 07:50, John Hubbard wrote:
>> On 10/29/24 11:18 PM, Alistair Popple wrote:
>>> John Hubbard <jhubbard@...dia.com> writes:
>>>> On 10/29/24 9:42 PM, Christoph Hellwig wrote:
>>>>> On Tue, Oct 29, 2024 at 09:39:15PM -0700, John Hubbard wrote:
>> ...
>>>>> Because pinning down these amounts of memoryt is completely insane.
>>>>> I don't mind the switch to kvmalloc, but we need to put in an upper
>>>>> bound of what can be pinned.
>>>>
>>>> I'm wondering though, how it is that we decide how much of the user's
>>>> system we prevent them from using? :)  People with hardware accelerators
>>>> do not always have page fault capability, and yet these troublesome
>>>> users insist on stacking their system full of DRAM and then pointing
>>>> the accelerator to it.
>>>>
>>>> How would we choose a value? Memory sizes keep going up...
>>>
>>> The obvious answer is you let users decide. I did have a patch series to
>>> do that via a cgroup[1]. However I dropped that series mostly because I
>>> couldn't find any users of such a limit to provide feedback on how they
>>> would use it or how they wanted it to work.
>>>
>>
>> Trawling through the discussion there, I see that Jason Gunthorpe mentioned:
>>
>> "Things like VFIO & KVM use cases effectively pin 90% of all system memory"
> 
> The unusual thing is not the amount of system memory we are pinning but
> *how many* pages we try pinning in the single call.
> 
> If you stare at vfio_pin_pages_remote, we seem to be batching it.
> 
> long req_pages = min_t(long, npage, batch->capacity);
> 
> Which is
> 
> #define VFIO_BATCH_MAX_CAPACITY (PAGE_SIZE / sizeof(struct page *))
> 
> 
> So you can fix this in your driver ;)
> 
> 
> We should maybe try a similar limit internally: if you call
> pin_user_pages_remote() with a large number, we'll cap it at some magic
> value (similar to above). The caller will simply realize that not all
> pages were pinned and will retry.
> 
> See get_user_pages_remote(): "Returns either number of pages pinned
> (which may be less than the number requested), or an error. Details
> about the return value:"
> 
> 
> Alternatively, I recall there was a way to avoid the temporary
> allocation ... let me hack up a prototype real quick.

Completely untested (also note the interesting TODO):


 From a23984b4f1a39ec984489fbe16708aedf4f9db95 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@...hat.com>
Date: Wed, 30 Oct 2024 10:00:50 +0100
Subject: [PATCH] tmp

Signed-off-by: David Hildenbrand <david@...hat.com>
---
  mm/gup.c | 120 ++++++++++++++++++++++++++++++++++++-------------------
  1 file changed, 78 insertions(+), 42 deletions(-)

diff --git a/mm/gup.c b/mm/gup.c
index a82890b46a36..8807b36c2363 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2273,20 +2273,56 @@ struct page *get_dump_page(unsigned long addr)
  #endif /* CONFIG_ELF_CORE */
  
  #ifdef CONFIG_MIGRATION
+
+/*
+ * An array of either pages or folios. While we could currently interpret a
+ * list of folios like a list of pages, it would only work as long as
+ * "struct folio" overlays "struct page" -- and it would not allow for
+ * avoiding page_folio() calls.
+ */
+struct pages_or_folios {
+	union {
+		struct page **pages;
+		struct folio **folios;
+		void **entries;
+	};
+	bool has_folios;
+	long nr_entries;
+};
+
+static struct folio *pofs_get_folio(struct pages_or_folios *pofs, long i)
+{
+	if (pofs->has_folios)
+		return pofs->folios[i];
+	return page_folio(pofs->pages[i]);
+}
+
+static void pofs_clear_entry(struct pages_or_folios *pofs, long i)
+{
+	pofs->entries[i] = NULL;
+}
+
+static void pofs_unpin(struct pages_or_folios *pofs)
+{
+	if (pofs->has_folios)
+		unpin_folios(pofs->folios, pofs->nr_entries);
+	else
+		unpin_user_pages(pofs->pages, pofs->nr_entries);
+}
+
  /*
   * Returns the number of collected folios. Return value is always >= 0.
   */
  static unsigned long collect_longterm_unpinnable_folios(
-					struct list_head *movable_folio_list,
-					unsigned long nr_folios,
-					struct folio **folios)
+		struct list_head *movable_folio_list,
+		struct pages_or_folios *pofs)
  {
  	unsigned long i, collected = 0;
  	struct folio *prev_folio = NULL;
  	bool drain_allow = true;
  
-	for (i = 0; i < nr_folios; i++) {
-		struct folio *folio = folios[i];
+	for (i = 0; i < pofs->nr_entries; i++) {
+		struct folio *folio = pofs_get_folio(pofs, i);
  
  		if (folio == prev_folio)
  			continue;
@@ -2310,6 +2346,11 @@ static unsigned long collect_longterm_unpinnable_folios(
  			drain_allow = false;
  		}
  
+		/*
+		 * TODO: if isolation fails we might already have it in the
+		 * list, if pages of different folios are interleaved
+		 * (e.g., COW). We might want to check all entries in the list.
+		 */
  		if (!folio_isolate_lru(folio))
  			continue;
  
@@ -2328,15 +2369,14 @@ static unsigned long collect_longterm_unpinnable_folios(
   * failure (or partial success).
   */
  static int migrate_longterm_unpinnable_folios(
-					struct list_head *movable_folio_list,
-					unsigned long nr_folios,
-					struct folio **folios)
+		struct list_head *movable_folio_list,
+		struct pages_or_folios *pofs)
  {
  	int ret;
  	unsigned long i;
  
-	for (i = 0; i < nr_folios; i++) {
-		struct folio *folio = folios[i];
+	for (i = 0; i < pofs->nr_entries; i++) {
+		struct folio *folio = pofs_get_folio(pofs, i);
  
  		if (folio_is_device_coherent(folio)) {
  			/*
@@ -2344,7 +2384,7 @@ static int migrate_longterm_unpinnable_folios(
  			 * convert the pin on the source folio to a normal
  			 * reference.
  			 */
-			folios[i] = NULL;
+			pofs_clear_entry(pofs, i);
  			folio_get(folio);
  			gup_put_folio(folio, 1, FOLL_PIN);
  
@@ -2363,8 +2403,8 @@ static int migrate_longterm_unpinnable_folios(
  		 * calling folio_isolate_lru() which takes a reference so the
  		 * folio won't be freed if it's migrating.
  		 */
-		unpin_folio(folios[i]);
-		folios[i] = NULL;
+		unpin_folio(pofs_get_folio(pofs, i));
+		pofs_clear_entry(pofs, i);
  	}
  
  	if (!list_empty(movable_folio_list)) {
@@ -2387,12 +2427,24 @@ static int migrate_longterm_unpinnable_folios(
  	return -EAGAIN;
  
  err:
-	unpin_folios(folios, nr_folios);
+	pofs_unpin(pofs);
  	putback_movable_pages(movable_folio_list);
  
  	return ret;
  }
  
+static long check_and_migrate_movable_pages_or_folios(struct pages_or_folios *pofs)
+{
+	LIST_HEAD(movable_folio_list);
+	unsigned long collected;
+
+	collected = collect_longterm_unpinnable_folios(&movable_folio_list, pofs);
+	if (!collected)
+		return 0;
+
+	return migrate_longterm_unpinnable_folios(&movable_folio_list, pofs);
+}
+
  /*
   * Check whether all folios are *allowed* to be pinned indefinitely (longterm).
   * Rather confusingly, all folios in the range are required to be pinned via
@@ -2412,41 +2464,25 @@ static int migrate_longterm_unpinnable_folios(
  static long check_and_migrate_movable_folios(unsigned long nr_folios,
  					     struct folio **folios)
  {
-	unsigned long collected;
-	LIST_HEAD(movable_folio_list);
-
-	collected = collect_longterm_unpinnable_folios(&movable_folio_list,
-						       nr_folios, folios);
-	if (!collected)
-		return 0;
+	struct pages_or_folios pofs = {
+		.folios = folios,
+		.has_folios = true,
+		.nr_entries = nr_folios,
+	};
  
-	return migrate_longterm_unpinnable_folios(&movable_folio_list,
-						  nr_folios, folios);
+	return check_and_migrate_movable_pages_or_folios(&pofs);
  }
  
-/*
- * This routine just converts all the pages in the @pages array to folios and
- * calls check_and_migrate_movable_folios() to do the heavy lifting.
- *
- * Please see the check_and_migrate_movable_folios() documentation for details.
- */
+/* See check_and_migrate_movable_folios(). */
  static long check_and_migrate_movable_pages(unsigned long nr_pages,
  					    struct page **pages)
  {
-	struct folio **folios;
-	long i, ret;
+	struct pages_or_folios pofs = {
+		.pages = pages,
+		.nr_entries = nr_pages,
+	};
  
-	folios = kmalloc_array(nr_pages, sizeof(*folios), GFP_KERNEL);
-	if (!folios)
-		return -ENOMEM;
-
-	for (i = 0; i < nr_pages; i++)
-		folios[i] = page_folio(pages[i]);
-
-	ret = check_and_migrate_movable_folios(nr_pages, folios);
-
-	kfree(folios);
-	return ret;
+	return check_and_migrate_movable_pages_or_folios(&pofs);
  }
  #else
  static long check_and_migrate_movable_pages(unsigned long nr_pages,
-- 
2.47.0


-- 
Cheers,

David / dhildenb