linux-ext4 - Re: [PATCH v2 11/13] mm: handling Non-LRU pages returned by vm_normal

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <b6e1c1dc-0f7a-42af-cbfe-7f7a8068a23a@amd.com>
Date:   Tue, 24 May 2022 14:01:07 -0500
From:   "Sierra Guiza, Alejandro (Alex)" <alex.sierra@....com>
To:     Alistair Popple <apopple@...dia.com>
Cc:     jgg@...dia.com, david@...hat.com, Felix.Kuehling@....com,
        linux-mm@...ck.org, rcampbell@...dia.com,
        linux-ext4@...r.kernel.org, linux-xfs@...r.kernel.org,
        amd-gfx@...ts.freedesktop.org, dri-devel@...ts.freedesktop.org,
        hch@....de, jglisse@...hat.com, willy@...radead.org,
        akpm@...ux-foundation.org
Subject: Re: [PATCH v2 11/13] mm: handling Non-LRU pages returned by
 vm_normal_pages


On 5/23/2022 7:02 AM, Alistair Popple wrote:
> Technically I think this patch should be earlier in the series. As I
> understand it patch 1 allows DEVICE_COHERENT pages to be inserted in the
> page tables and therefore makes it possible for page table walkers to
> see non-LRU pages.

The patch will be reordered in V3.

Regards,
Alex Sierra

>
> Some more comments below:
>
> Alex Sierra <alex.sierra@....com> writes:
>
>> With DEVICE_COHERENT, we'll soon have vm_normal_pages() return
>> device-managed anonymous pages that are not LRU pages. Although they
>> behave like normal pages for purposes of mapping in CPU page tables, and
>> for COW, they do not support LRU lists, NUMA migration or THP.
>>
>> We also introduced a FOLL_LRU flag that adds the same behaviour to
>> follow_page and related APIs, to allow callers to specify that they
>> expect to put pages on an LRU list.
> This means by default GUP can return non-LRU pages. I didn't see
> anywhere that would be a problem but I didn't check everything. Did you
> check this or is there some other reason I've missed that makes this not
> a problem?

I have double-checked all gup and pin_user_pages callers, and none of
them appears to interact with the LRU APIs.

Regards,
Alex Sierra

>
> [...]
>
>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>> index a4e5eaf3eb01..eb3cfd679800 100644
>> --- a/mm/khugepaged.c
>> +++ b/mm/khugepaged.c
>> @@ -627,7 +627,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>>   			goto out;
>>   		}
>>   		page = vm_normal_page(vma, address, pteval);
>> -		if (unlikely(!page)) {
>> +		if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
>>   			result = SCAN_PAGE_NULL;
>>   			goto out;
>>   		}
>> @@ -1276,7 +1276,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
>>   			writable = true;
>>
>>   		page = vm_normal_page(vma, _address, pteval);
>> -		if (unlikely(!page)) {
>> +		if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
>>   			result = SCAN_PAGE_NULL;
>>   			goto out_unmap;
>>   		}
>> @@ -1484,7 +1484,8 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
>>   			goto abort;
>>
>>   		page = vm_normal_page(vma, addr, *pte);
>> -
>> +		if (page && is_zone_device_page(page))
>> +			page = NULL;
>>   		/*
>>   		 * Note that uprobe, debugger, or MAP_PRIVATE may change the
>>   		 * page table, but the new page will not be a subpage of hpage.
>> @@ -1502,6 +1503,8 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
>>   		if (pte_none(*pte))
>>   			continue;
>>   		page = vm_normal_page(vma, addr, *pte);
>> +		if (page && is_zone_device_page(page))
>> +			goto abort;
> Are either of these two cases actually possible? DEVICE_COHERENT doesn't
> currently support THP, so if I'm understanding correctly we couldn't
> have a pte mapped DEVICE_COHERENT THP right? Assuming that's the case I
> think WARN_ON_ONCE() would be better.

Correct, change included in V3 patch series.

Regards,
Alex

>
> Otherwise I think everything else looks reasonable.
>
>>   		page_remove_rmap(page, vma, false);
>>   	}
>>
>> diff --git a/mm/ksm.c b/mm/ksm.c
>> index 063a48eeb5ee..f16056efca21 100644
>> --- a/mm/ksm.c
>> +++ b/mm/ksm.c
>> @@ -474,7 +474,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
>>   	do {
>>   		cond_resched();
>>   		page = follow_page(vma, addr,
>> -				FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
>> +				FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE | FOLL_LRU);
>>   		if (IS_ERR_OR_NULL(page))
>>   			break;
>>   		if (PageKsm(page))
>> @@ -559,7 +559,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
>>   	if (!vma)
>>   		goto out;
>>
>> -	page = follow_page(vma, addr, FOLL_GET);
>> +	page = follow_page(vma, addr, FOLL_GET | FOLL_LRU);
>>   	if (IS_ERR_OR_NULL(page))
>>   		goto out;
>>   	if (PageAnon(page)) {
>> @@ -2288,7 +2288,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
>>   		while (ksm_scan.address < vma->vm_end) {
>>   			if (ksm_test_exit(mm))
>>   				break;
>> -			*page = follow_page(vma, ksm_scan.address, FOLL_GET);
>> +			*page = follow_page(vma, ksm_scan.address, FOLL_GET | FOLL_LRU);
>>   			if (IS_ERR_OR_NULL(*page)) {
>>   				ksm_scan.address += PAGE_SIZE;
>>   				cond_resched();
>> diff --git a/mm/madvise.c b/mm/madvise.c
>> index 1873616a37d2..e9c24c834e98 100644
>> --- a/mm/madvise.c
>> +++ b/mm/madvise.c
>> @@ -413,7 +413,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
>>   			continue;
>>
>>   		page = vm_normal_page(vma, addr, ptent);
>> -		if (!page)
>> +		if (!page || is_zone_device_page(page))
>>   			continue;
>>
>>   		/*
>> @@ -628,7 +628,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
>>   		}
>>
>>   		page = vm_normal_page(vma, addr, ptent);
>> -		if (!page)
>> +		if (!page || is_zone_device_page(page))
>>   			continue;
>>
>>   		/*
>> diff --git a/mm/memory.c b/mm/memory.c
>> index 76e3af9639d9..571a26805ee1 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -621,6 +621,13 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
>>   		if (is_zero_pfn(pfn))
>>   			return NULL;
>>   		if (pte_devmap(pte))
>> +/*
>> + * NOTE: New users of ZONE_DEVICE will not set pte_devmap() and will have
>> + * refcounts incremented on their struct pages when they are inserted into
>> + * PTEs, thus they are safe to return here. Legacy ZONE_DEVICE pages that set
>> + * pte_devmap() do not have refcounts. Example of legacy ZONE_DEVICE is
>> + * MEMORY_DEVICE_FS_DAX type in pmem or virtio_fs drivers.
>> + */
>>   			return NULL;
>>
>>   		print_bad_pte(vma, addr, pte, NULL);
>> @@ -4422,7 +4429,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
>>   	pte = pte_modify(old_pte, vma->vm_page_prot);
>>
>>   	page = vm_normal_page(vma, vmf->address, pte);
>> -	if (!page)
>> +	if (!page || is_zone_device_page(page))
>>   		goto out_map;
>>
>>   	/* TODO: handle PTE-mapped THP */
>> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
>> index 8c74107a2b15..e32edbecb0cd 100644
>> --- a/mm/mempolicy.c
>> +++ b/mm/mempolicy.c
>> @@ -528,7 +528,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
>>   		if (!pte_present(*pte))
>>   			continue;
>>   		page = vm_normal_page(vma, addr, *pte);
>> -		if (!page)
>> +		if (!page || is_zone_device_page(page))
>>   			continue;
>>   		/*
>>   		 * vm_normal_page() filters out zero pages, but there might
>> diff --git a/mm/migrate.c b/mm/migrate.c
>> index 6c31ee1e1c9b..c5d50e96ecd7 100644
>> --- a/mm/migrate.c
>> +++ b/mm/migrate.c
>> @@ -1611,7 +1611,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
>>   		goto out;
>>
>>   	/* FOLL_DUMP to ignore special (like zero) pages */
>> -	page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
>> +	page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP | FOLL_LRU);
>>
>>   	err = PTR_ERR(page);
>>   	if (IS_ERR(page))
>> @@ -1802,7 +1802,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
>>   			goto set_status;
>>
>>   		/* FOLL_DUMP to ignore special (like zero) pages */
>> -		page = follow_page(vma, addr, FOLL_DUMP);
>> +		page = follow_page(vma, addr, FOLL_DUMP | FOLL_LRU);
>>
>>   		err = PTR_ERR(page);
>>   		if (IS_ERR(page))
>> diff --git a/mm/mlock.c b/mm/mlock.c
>> index 716caf851043..b14e929084cc 100644
>> --- a/mm/mlock.c
>> +++ b/mm/mlock.c
>> @@ -333,7 +333,7 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
>>   		if (!pte_present(*pte))
>>   			continue;
>>   		page = vm_normal_page(vma, addr, *pte);
>> -		if (!page)
>> +		if (!page || is_zone_device_page(page))
>>   			continue;
>>   		if (PageTransCompound(page))
>>   			continue;
>> diff --git a/mm/mprotect.c b/mm/mprotect.c
>> index b69ce7a7b2b7..a6f3587ea29a 100644
>> --- a/mm/mprotect.c
>> +++ b/mm/mprotect.c
>> @@ -91,7 +91,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
>>   					continue;
>>
>>   				page = vm_normal_page(vma, addr, oldpte);
>> -				if (!page || PageKsm(page))
>> +				if (!page || is_zone_device_page(page) || PageKsm(page))
>>   					continue;
>>
>>   				/* Also skip shared copy-on-write pages */