[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <a3619730-7aa8-1f0f-c524-bf37f6fd5125@amd.com>
Date: Fri, 27 May 2022 02:28:47 -0400
From: Felix Kuehling <felix.kuehling@....com>
To: Alistair Popple <apopple@...dia.com>,
Alex Sierra <alex.sierra@....com>
Cc: jgg@...dia.com, david@...hat.com, linux-mm@...ck.org,
rcampbell@...dia.com, linux-ext4@...r.kernel.org,
linux-xfs@...r.kernel.org, amd-gfx@...ts.freedesktop.org,
dri-devel@...ts.freedesktop.org, hch@....de, jglisse@...hat.com,
willy@...radead.org, akpm@...ux-foundation.org
Subject: Re: [PATCH v3 02/13] mm: handling Non-LRU pages returned by
vm_normal_pages
Am 2022-05-25 um 00:11 schrieb Alistair Popple:
> Alex Sierra <alex.sierra@....com> writes:
>
>> With DEVICE_COHERENT, we'll soon have vm_normal_pages() return
>> device-managed anonymous pages that are not LRU pages. Although they
>> behave like normal pages for purposes of mapping in CPU page tables, and
>> for COW, they do not support LRU lists, NUMA migration or THP.
>>
>> We also introduced a FOLL_LRU flag that adds the same behaviour to
>> follow_page and related APIs, to allow callers to specify that they
>> expect to put pages on an LRU list.
> Continuing the follow up from the thread for v2:
>
>>> This means by default GUP can return non-LRU pages. I didn't see
>>> anywhere that would be a problem but I didn't check everything. Did you
>>> check this or is there some other reason I've missed that makes this not
>>> a problem?
>> I have double checked all gup and pin_user_pages callers and none of them seem
>> to have interaction with LRU APIs.
> And actually if I'm understanding things correctly callers of
> GUP/PUP/follow_page_pte() should already expect to get non-LRU pages
> returned:
>
> page = vm_normal_page(vma, address, pte);
> if ((flags & FOLL_LRU) && page && is_zone_device_page(page))
> page = NULL;
> if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
> /*
> * Only return device mapping pages in the FOLL_GET or FOLL_PIN
> * case since they are only valid while holding the pgmap
> * reference.
> */
> *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
> if (*pgmap)
> page = pte_page(pte);
>
> Which I think makes FOLL_LRU confusing, because if I understand correctly
> even with FOLL_LRU it is still possible for follow_page_pte() to return
> a non-LRU page. Could we do something like this to make it consistent:
>
> if ((flags & FOLL_LRU) && (page && is_zone_device_page(page) ||
> !page && pte_devmap(pte)))
This alone won't help if it still goes into the if (!page &&
pte_devmap(pte) ...) afterwards. I think what you're suggesting is:
+ if ((flags & FOLL_LRU) && (page && is_zone_device_page(page) ||
+ !page && pte_devmap(pte)))
+ page = NULL;
- if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
+ else if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
Is that what you meant?
Regards,
Felix
>
> Looking at callers that currently use FOLL_LRU I don't think this would
> change any behaviour as they already filter out devmap through various
> other means.
>
>> Signed-off-by: Alex Sierra <alex.sierra@....com>
>> Acked-by: Felix Kuehling <Felix.Kuehling@....com>
>> ---
>> fs/proc/task_mmu.c | 2 +-
>> include/linux/mm.h | 3 ++-
>> mm/gup.c | 2 ++
>> mm/huge_memory.c | 2 +-
>> mm/khugepaged.c | 9 ++++++---
>> mm/ksm.c | 6 +++---
>> mm/madvise.c | 4 ++--
>> mm/memory.c | 9 ++++++++-
>> mm/mempolicy.c | 2 +-
>> mm/migrate.c | 4 ++--
>> mm/mlock.c | 2 +-
>> mm/mprotect.c | 2 +-
>> 12 files changed, 30 insertions(+), 17 deletions(-)
>>
>> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
>> index f46060eb91b5..5d620733f173 100644
>> --- a/fs/proc/task_mmu.c
>> +++ b/fs/proc/task_mmu.c
>> @@ -1785,7 +1785,7 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
>> return NULL;
>>
>> page = vm_normal_page(vma, addr, pte);
>> - if (!page)
>> + if (!page || is_zone_device_page(page))
>> return NULL;
>>
>> if (PageReserved(page))
>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>> index 9f44254af8ce..d7f253a0c41e 100644
>> --- a/include/linux/mm.h
>> +++ b/include/linux/mm.h
>> @@ -601,7 +601,7 @@ struct vm_operations_struct {
>> #endif
>> /*
>> * Called by vm_normal_page() for special PTEs to find the
>> - * page for @addr. This is useful if the default behavior
>> + * page for @addr. This is useful if the default behavior
>> * (using pte_page()) would not find the correct page.
>> */
>> struct page *(*find_special_page)(struct vm_area_struct *vma,
>> @@ -2929,6 +2929,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
>> #define FOLL_NUMA 0x200 /* force NUMA hinting page fault */
>> #define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */
>> #define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */
>> +#define FOLL_LRU 0x1000 /* return only LRU (anon or page cache) */
>> #define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */
>> #define FOLL_COW 0x4000 /* internal GUP flag */
>> #define FOLL_ANON 0x8000 /* don't do file mappings */
>> diff --git a/mm/gup.c b/mm/gup.c
>> index 501bc150792c..c9cbac06bcc5 100644
>> --- a/mm/gup.c
>> +++ b/mm/gup.c
>> @@ -479,6 +479,8 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
>> }
>>
>> page = vm_normal_page(vma, address, pte);
>> + if ((flags & FOLL_LRU) && page && is_zone_device_page(page))
>> + page = NULL;
>> if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
>> /*
>> * Only return device mapping pages in the FOLL_GET or FOLL_PIN
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 910a138e9859..eed80696c5fd 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -2856,7 +2856,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
>> }
>>
>> /* FOLL_DUMP to ignore special (like zero) pages */
>> - page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
>> + page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP | FOLL_LRU);
>>
>> if (IS_ERR(page))
>> continue;
>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>> index a4e5eaf3eb01..8bf4126b6b9c 100644
>> --- a/mm/khugepaged.c
>> +++ b/mm/khugepaged.c
>> @@ -627,7 +627,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>> goto out;
>> }
>> page = vm_normal_page(vma, address, pteval);
>> - if (unlikely(!page)) {
>> + if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
>> result = SCAN_PAGE_NULL;
>> goto out;
>> }
>> @@ -1276,7 +1276,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
>> writable = true;
>>
>> page = vm_normal_page(vma, _address, pteval);
>> - if (unlikely(!page)) {
>> + if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
>> result = SCAN_PAGE_NULL;
>> goto out_unmap;
>> }
>> @@ -1484,7 +1484,8 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
>> goto abort;
>>
>> page = vm_normal_page(vma, addr, *pte);
>> -
>> + if (WARN_ON_ONCE(page && is_zone_device_page(page)))
>> + page = NULL;
>> /*
>> * Note that uprobe, debugger, or MAP_PRIVATE may change the
>> * page table, but the new page will not be a subpage of hpage.
>> @@ -1502,6 +1503,8 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
>> if (pte_none(*pte))
>> continue;
>> page = vm_normal_page(vma, addr, *pte);
>> + if (WARN_ON_ONCE(page && is_zone_device_page(page)))
>> + goto abort;
>> page_remove_rmap(page, vma, false);
>> }
>>
>> diff --git a/mm/ksm.c b/mm/ksm.c
>> index 063a48eeb5ee..f16056efca21 100644
>> --- a/mm/ksm.c
>> +++ b/mm/ksm.c
>> @@ -474,7 +474,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
>> do {
>> cond_resched();
>> page = follow_page(vma, addr,
>> - FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
>> + FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE | FOLL_LRU);
>> if (IS_ERR_OR_NULL(page))
>> break;
>> if (PageKsm(page))
>> @@ -559,7 +559,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
>> if (!vma)
>> goto out;
>>
>> - page = follow_page(vma, addr, FOLL_GET);
>> + page = follow_page(vma, addr, FOLL_GET | FOLL_LRU);
>> if (IS_ERR_OR_NULL(page))
>> goto out;
>> if (PageAnon(page)) {
>> @@ -2288,7 +2288,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
>> while (ksm_scan.address < vma->vm_end) {
>> if (ksm_test_exit(mm))
>> break;
>> - *page = follow_page(vma, ksm_scan.address, FOLL_GET);
>> + *page = follow_page(vma, ksm_scan.address, FOLL_GET | FOLL_LRU);
>> if (IS_ERR_OR_NULL(*page)) {
>> ksm_scan.address += PAGE_SIZE;
>> cond_resched();
>> diff --git a/mm/madvise.c b/mm/madvise.c
>> index 1873616a37d2..e9c24c834e98 100644
>> --- a/mm/madvise.c
>> +++ b/mm/madvise.c
>> @@ -413,7 +413,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
>> continue;
>>
>> page = vm_normal_page(vma, addr, ptent);
>> - if (!page)
>> + if (!page || is_zone_device_page(page))
>> continue;
>>
>> /*
>> @@ -628,7 +628,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
>> }
>>
>> page = vm_normal_page(vma, addr, ptent);
>> - if (!page)
>> + if (!page || is_zone_device_page(page))
>> continue;
>>
>> /*
>> diff --git a/mm/memory.c b/mm/memory.c
>> index 76e3af9639d9..571a26805ee1 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -621,6 +621,13 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
>> if (is_zero_pfn(pfn))
>> return NULL;
>> if (pte_devmap(pte))
>> +/*
>> + * NOTE: New users of ZONE_DEVICE will not set pte_devmap() and will have
>> + * refcounts incremented on their struct pages when they are inserted into
>> + * PTEs, thus they are safe to return here. Legacy ZONE_DEVICE pages that set
>> + * pte_devmap() do not have refcounts. Example of legacy ZONE_DEVICE is
>> + * MEMORY_DEVICE_FS_DAX type in pmem or virtio_fs drivers.
>> + */
>> return NULL;
>>
>> print_bad_pte(vma, addr, pte, NULL);
>> @@ -4422,7 +4429,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
>> pte = pte_modify(old_pte, vma->vm_page_prot);
>>
>> page = vm_normal_page(vma, vmf->address, pte);
>> - if (!page)
>> + if (!page || is_zone_device_page(page))
>> goto out_map;
>>
>> /* TODO: handle PTE-mapped THP */
>> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
>> index 8c74107a2b15..e32edbecb0cd 100644
>> --- a/mm/mempolicy.c
>> +++ b/mm/mempolicy.c
>> @@ -528,7 +528,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
>> if (!pte_present(*pte))
>> continue;
>> page = vm_normal_page(vma, addr, *pte);
>> - if (!page)
>> + if (!page || is_zone_device_page(page))
>> continue;
>> /*
>> * vm_normal_page() filters out zero pages, but there might
>> diff --git a/mm/migrate.c b/mm/migrate.c
>> index 6c31ee1e1c9b..c5d50e96ecd7 100644
>> --- a/mm/migrate.c
>> +++ b/mm/migrate.c
>> @@ -1611,7 +1611,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
>> goto out;
>>
>> /* FOLL_DUMP to ignore special (like zero) pages */
>> - page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
>> + page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP | FOLL_LRU);
>>
>> err = PTR_ERR(page);
>> if (IS_ERR(page))
>> @@ -1802,7 +1802,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
>> goto set_status;
>>
>> /* FOLL_DUMP to ignore special (like zero) pages */
>> - page = follow_page(vma, addr, FOLL_DUMP);
>> + page = follow_page(vma, addr, FOLL_DUMP | FOLL_LRU);
>>
>> err = PTR_ERR(page);
>> if (IS_ERR(page))
>> diff --git a/mm/mlock.c b/mm/mlock.c
>> index 716caf851043..b14e929084cc 100644
>> --- a/mm/mlock.c
>> +++ b/mm/mlock.c
>> @@ -333,7 +333,7 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
>> if (!pte_present(*pte))
>> continue;
>> page = vm_normal_page(vma, addr, *pte);
>> - if (!page)
>> + if (!page || is_zone_device_page(page))
>> continue;
>> if (PageTransCompound(page))
>> continue;
>> diff --git a/mm/mprotect.c b/mm/mprotect.c
>> index b69ce7a7b2b7..a6f3587ea29a 100644
>> --- a/mm/mprotect.c
>> +++ b/mm/mprotect.c
>> @@ -91,7 +91,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
>> continue;
>>
>> page = vm_normal_page(vma, addr, oldpte);
>> - if (!page || PageKsm(page))
>> + if (!page || is_zone_device_page(page) || PageKsm(page))
>> continue;
>>
>> /* Also skip shared copy-on-write pages */
Powered by blists - more mailing lists