lists.openwall.net | lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening PHC | |
Open Source and information security mailing list archives
| ||
|
Date: Tue, 24 May 2022 14:01:07 -0500 From: "Sierra Guiza, Alejandro (Alex)" <alex.sierra@....com> To: Alistair Popple <apopple@...dia.com> Cc: jgg@...dia.com, david@...hat.com, Felix.Kuehling@....com, linux-mm@...ck.org, rcampbell@...dia.com, linux-ext4@...r.kernel.org, linux-xfs@...r.kernel.org, amd-gfx@...ts.freedesktop.org, dri-devel@...ts.freedesktop.org, hch@....de, jglisse@...hat.com, willy@...radead.org, akpm@...ux-foundation.org Subject: Re: [PATCH v2 11/13] mm: handling Non-LRU pages returned by vm_normal_pages On 5/23/2022 7:02 AM, Alistair Popple wrote: > Technically I think this patch should be earlier in the series. As I > understand it patch 1 allows DEVICE_COHERENT pages to be inserted in the > page tables and therefore makes it possible for page table walkers to > see non-LRU pages. Patch will be reordered in V3. Regards, Alex Sierra > > Some more comments below: > > Alex Sierra <alex.sierra@....com> writes: > >> With DEVICE_COHERENT, we'll soon have vm_normal_pages() return >> device-managed anonymous pages that are not LRU pages. Although they >> behave like normal pages for purposes of mapping in CPU page, and for >> COW. They do not support LRU lists, NUMA migration or THP. >> >> We also introduced a FOLL_LRU flag that adds the same behaviour to >> follow_page and related APIs, to allow callers to specify that they >> expect to put pages on an LRU list. > This means by default GUP can return non-LRU pages. I didn't see > anywhere that would be a problem but I didn't check everything. Did you > check this or is there some other reason I've missed that makes this not > a problem? I have double checked all gup and pin_user_pages callers and none of them seem to have interaction with LRU APIs. Regards, Alex Sierra > > [...] 
> >> diff --git a/mm/khugepaged.c b/mm/khugepaged.c >> index a4e5eaf3eb01..eb3cfd679800 100644 >> --- a/mm/khugepaged.c >> +++ b/mm/khugepaged.c >> @@ -627,7 +627,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, >> goto out; >> } >> page = vm_normal_page(vma, address, pteval); >> - if (unlikely(!page)) { >> + if (unlikely(!page) || unlikely(is_zone_device_page(page))) { >> result = SCAN_PAGE_NULL; >> goto out; >> } >> @@ -1276,7 +1276,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, >> writable = true; >> >> page = vm_normal_page(vma, _address, pteval); >> - if (unlikely(!page)) { >> + if (unlikely(!page) || unlikely(is_zone_device_page(page))) { >> result = SCAN_PAGE_NULL; >> goto out_unmap; >> } >> @@ -1484,7 +1484,8 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) >> goto abort; >> >> page = vm_normal_page(vma, addr, *pte); >> - >> + if (page && is_zone_device_page(page)) >> + page = NULL; >> /* >> * Note that uprobe, debugger, or MAP_PRIVATE may change the >> * page table, but the new page will not be a subpage of hpage. >> @@ -1502,6 +1503,8 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) >> if (pte_none(*pte)) >> continue; >> page = vm_normal_page(vma, addr, *pte); >> + if (page && is_zone_device_page(page)) >> + goto abort; > Are either of these two cases actually possible? DEVICE_COHERENT doesn't > currently support THP, so if I'm understanding correctly we couldn't > have a pte mapped DEVICE_COHERENT THP right? Assuming that's the case I > think WARN_ON_ONCE() would be better. Correct, change included in V3 patch series. Regards, Alex > > Otherwise I think everything else looks reasonable. 
> >> page_remove_rmap(page, vma, false); >> } >> >> diff --git a/mm/ksm.c b/mm/ksm.c >> index 063a48eeb5ee..f16056efca21 100644 >> --- a/mm/ksm.c >> +++ b/mm/ksm.c >> @@ -474,7 +474,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) >> do { >> cond_resched(); >> page = follow_page(vma, addr, >> - FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE); >> + FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE | FOLL_LRU); >> if (IS_ERR_OR_NULL(page)) >> break; >> if (PageKsm(page)) >> @@ -559,7 +559,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item) >> if (!vma) >> goto out; >> >> - page = follow_page(vma, addr, FOLL_GET); >> + page = follow_page(vma, addr, FOLL_GET | FOLL_LRU); >> if (IS_ERR_OR_NULL(page)) >> goto out; >> if (PageAnon(page)) { >> @@ -2288,7 +2288,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) >> while (ksm_scan.address < vma->vm_end) { >> if (ksm_test_exit(mm)) >> break; >> - *page = follow_page(vma, ksm_scan.address, FOLL_GET); >> + *page = follow_page(vma, ksm_scan.address, FOLL_GET | FOLL_LRU); >> if (IS_ERR_OR_NULL(*page)) { >> ksm_scan.address += PAGE_SIZE; >> cond_resched(); >> diff --git a/mm/madvise.c b/mm/madvise.c >> index 1873616a37d2..e9c24c834e98 100644 >> --- a/mm/madvise.c >> +++ b/mm/madvise.c >> @@ -413,7 +413,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, >> continue; >> >> page = vm_normal_page(vma, addr, ptent); >> - if (!page) >> + if (!page || is_zone_device_page(page)) >> continue; >> >> /* >> @@ -628,7 +628,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, >> } >> >> page = vm_normal_page(vma, addr, ptent); >> - if (!page) >> + if (!page || is_zone_device_page(page)) >> continue; >> >> /* >> diff --git a/mm/memory.c b/mm/memory.c >> index 76e3af9639d9..571a26805ee1 100644 >> --- a/mm/memory.c >> +++ b/mm/memory.c >> @@ -621,6 +621,13 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, >> if (is_zero_pfn(pfn)) >> return 
NULL; >> if (pte_devmap(pte)) >> +/* >> + * NOTE: New users of ZONE_DEVICE will not set pte_devmap() and will have >> + * refcounts incremented on their struct pages when they are inserted into >> + * PTEs, thus they are safe to return here. Legacy ZONE_DEVICE pages that set >> + * pte_devmap() do not have refcounts. Example of legacy ZONE_DEVICE is >> + * MEMORY_DEVICE_FS_DAX type in pmem or virtio_fs drivers. >> + */ >> return NULL; >> >> print_bad_pte(vma, addr, pte, NULL); >> @@ -4422,7 +4429,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) >> pte = pte_modify(old_pte, vma->vm_page_prot); >> >> page = vm_normal_page(vma, vmf->address, pte); >> - if (!page) >> + if (!page || is_zone_device_page(page)) >> goto out_map; >> >> /* TODO: handle PTE-mapped THP */ >> diff --git a/mm/mempolicy.c b/mm/mempolicy.c >> index 8c74107a2b15..e32edbecb0cd 100644 >> --- a/mm/mempolicy.c >> +++ b/mm/mempolicy.c >> @@ -528,7 +528,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, >> if (!pte_present(*pte)) >> continue; >> page = vm_normal_page(vma, addr, *pte); >> - if (!page) >> + if (!page || is_zone_device_page(page)) >> continue; >> /* >> * vm_normal_page() filters out zero pages, but there might >> diff --git a/mm/migrate.c b/mm/migrate.c >> index 6c31ee1e1c9b..c5d50e96ecd7 100644 >> --- a/mm/migrate.c >> +++ b/mm/migrate.c >> @@ -1611,7 +1611,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, >> goto out; >> >> /* FOLL_DUMP to ignore special (like zero) pages */ >> - page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); >> + page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP | FOLL_LRU); >> >> err = PTR_ERR(page); >> if (IS_ERR(page)) >> @@ -1802,7 +1802,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, >> goto set_status; >> >> /* FOLL_DUMP to ignore special (like zero) pages */ >> - page = follow_page(vma, addr, FOLL_DUMP); >> + page = follow_page(vma, addr, FOLL_DUMP | FOLL_LRU); >> 
>> err = PTR_ERR(page); >> if (IS_ERR(page)) >> diff --git a/mm/mlock.c b/mm/mlock.c >> index 716caf851043..b14e929084cc 100644 >> --- a/mm/mlock.c >> +++ b/mm/mlock.c >> @@ -333,7 +333,7 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr, >> if (!pte_present(*pte)) >> continue; >> page = vm_normal_page(vma, addr, *pte); >> - if (!page) >> + if (!page || is_zone_device_page(page)) >> continue; >> if (PageTransCompound(page)) >> continue; >> diff --git a/mm/mprotect.c b/mm/mprotect.c >> index b69ce7a7b2b7..a6f3587ea29a 100644 >> --- a/mm/mprotect.c >> +++ b/mm/mprotect.c >> @@ -91,7 +91,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, >> continue; >> >> page = vm_normal_page(vma, addr, oldpte); >> - if (!page || PageKsm(page)) >> + if (!page || is_zone_device_page(page) || PageKsm(page)) >> continue; >> >> /* Also skip shared copy-on-write pages */
Powered by blists - more mailing lists