[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20240410193150.655df790@p-imbrenda>
Date: Wed, 10 Apr 2024 19:31:50 +0200
From: Claudio Imbrenda <imbrenda@...ux.ibm.com>
To: David Hildenbrand <david@...hat.com>
Cc: linux-kernel@...r.kernel.org, linux-mm@...ck.org,
linux-s390@...r.kernel.org, kvm@...r.kernel.org,
Matthew Wilcox
<willy@...radead.org>,
Heiko Carstens <hca@...ux.ibm.com>, Vasily Gorbik
<gor@...ux.ibm.com>,
Alexander Gordeev <agordeev@...ux.ibm.com>,
Christian
Borntraeger <borntraeger@...ux.ibm.com>,
Sven Schnelle
<svens@...ux.ibm.com>,
Janosch Frank <frankja@...ux.ibm.com>,
Gerald
Schaefer <gerald.schaefer@...ux.ibm.com>,
Thomas Huth <thuth@...hat.com>
Subject: Re: [PATCH v1 2/5] s390/uv: convert gmap_make_secure() to work on
folios
On Thu, 4 Apr 2024 18:36:39 +0200
David Hildenbrand <david@...hat.com> wrote:
> We have various goals that require gmap_make_secure() to only work on
> folios. We want to limit the use of page_mapcount() to the places where it
> is absolutely necessary, we want to avoid using page flags of tail
> pages, and we want to remove page_has_private().
>
> So, let's convert gmap_make_secure() to folios. While s390x makes sure
> to never have PMD-mapped THP in processes that use KVM -- by remapping
> them using PTEs in thp_split_walk_pmd_entry()->split_huge_pmd() -- we might
> still find PTE-mapped THPs and could end up working on tail pages of
> such large folios for now.
>
> To handle that cleanly, let's simply split any PTE-mapped large folio,
> so we can be sure that we are always working with small folios and never
> on tail pages.
>
> There is no real change: splitting will similarly fail on unexpected folio
> references, just like it would already when we try to freeze the folio
> refcount.
>
> Signed-off-by: David Hildenbrand <david@...hat.com>
> ---
> arch/s390/include/asm/page.h | 1 +
> arch/s390/kernel/uv.c | 66 ++++++++++++++++++++++--------------
> 2 files changed, 42 insertions(+), 25 deletions(-)
>
> diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h
> index 9381879f7ecf..54d015bcd8e3 100644
> --- a/arch/s390/include/asm/page.h
> +++ b/arch/s390/include/asm/page.h
> @@ -215,6 +215,7 @@ static inline unsigned long __phys_addr(unsigned long x, bool is_31bit)
>
> #define phys_to_page(phys) pfn_to_page(phys_to_pfn(phys))
> #define page_to_phys(page) pfn_to_phys(page_to_pfn(page))
> +#define folio_to_phys(page) pfn_to_phys(folio_pfn(folio))
>
> static inline void *pfn_to_virt(unsigned long pfn)
> {
> diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c
> index 7401838b960b..adcbd4b13035 100644
> --- a/arch/s390/kernel/uv.c
> +++ b/arch/s390/kernel/uv.c
> @@ -181,36 +181,36 @@ int uv_convert_owned_from_secure(unsigned long paddr)
> }
>
> /*
> - * Calculate the expected ref_count for a page that would otherwise have no
> + * Calculate the expected ref_count for a folio that would otherwise have no
> * further pins. This was cribbed from similar functions in other places in
> * the kernel, but with some slight modifications. We know that a secure
> - * page can not be a huge page for example.
> + * folio can only be a small folio for example.
> */
> -static int expected_page_refs(struct page *page)
> +static int expected_folio_refs(struct folio *folio)
> {
> int res;
>
> - res = page_mapcount(page);
> - if (PageSwapCache(page)) {
> + res = folio_mapcount(folio);
> + if (folio_test_swapcache(folio)) {
> res++;
> - } else if (page_mapping(page)) {
> + } else if (folio_mapping(folio)) {
> res++;
> - if (page_has_private(page))
> + if (folio_has_private(folio))
> res++;
> }
> return res;
> }
>
> -static int make_page_secure(struct page *page, struct uv_cb_header *uvcb)
> +static int make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb)
> {
> int expected, cc = 0;
>
> - if (PageWriteback(page))
> + if (folio_test_writeback(folio))
> return -EAGAIN;
> - expected = expected_page_refs(page);
> - if (!page_ref_freeze(page, expected))
> + expected = expected_folio_refs(folio);
> + if (!folio_ref_freeze(folio, expected))
> return -EBUSY;
> - set_bit(PG_arch_1, &page->flags);
> + set_bit(PG_arch_1, &folio->flags);
> /*
> * If the UVC does not succeed or fail immediately, we don't want to
> * loop for long, or we might get stall notifications.
> @@ -220,9 +220,9 @@ static int make_page_secure(struct page *page, struct uv_cb_header *uvcb)
> * -EAGAIN and we let the callers deal with it.
> */
> cc = __uv_call(0, (u64)uvcb);
> - page_ref_unfreeze(page, expected);
> + folio_ref_unfreeze(folio, expected);
> /*
> - * Return -ENXIO if the page was not mapped, -EINVAL for other errors.
> + * Return -ENXIO if the folio was not mapped, -EINVAL for other errors.
> * If busy or partially completed, return -EAGAIN.
> */
> if (cc == UVC_CC_OK)
> @@ -277,7 +277,7 @@ int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb)
> bool local_drain = false;
> spinlock_t *ptelock;
> unsigned long uaddr;
> - struct page *page;
> + struct folio *folio;
> pte_t *ptep;
> int rc;
>
> @@ -306,33 +306,49 @@ int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb)
> if (!ptep)
> goto out;
> if (pte_present(*ptep) && !(pte_val(*ptep) & _PAGE_INVALID) && pte_write(*ptep)) {
> - page = pte_page(*ptep);
> + folio = page_folio(pte_page(*ptep));
> rc = -EAGAIN;
> - if (trylock_page(page)) {
> +
> + /* We might get PTE-mapped large folios; split them first. */
> + if (folio_test_large(folio)) {
> + rc = -E2BIG;
> + } else if (folio_trylock(folio)) {
> if (should_export_before_import(uvcb, gmap->mm))
> - uv_convert_from_secure(page_to_phys(page));
> - rc = make_page_secure(page, uvcb);
> - unlock_page(page);
> + uv_convert_from_secure(folio_to_phys(folio));
> + rc = make_folio_secure(folio, uvcb);
> + folio_unlock(folio);
> }
>
> /*
> - * Once we drop the PTL, the page may get unmapped and
> + * Once we drop the PTL, the folio may get unmapped and
> * freed immediately. We need a temporary reference.
> */
> - if (rc == -EAGAIN)
> - get_page(page);
> + if (rc == -EAGAIN || rc == -E2BIG)
> + folio_get(folio);
> }
> pte_unmap_unlock(ptep, ptelock);
> out:
> mmap_read_unlock(gmap->mm);
>
> + if (rc == -E2BIG) {
> + /*
> + * Splitting might fail with -EBUSY due to unexpected folio
> + * references, just like make_folio_secure(). So handle it
> + * ahead of time without the PTL being held.
> + */
> + folio_lock(folio);
> + rc = split_folio(folio);
if split_folio returns -EAGAIN...
> + folio_unlock(folio);
> + folio_put(folio);
> + }
> +
> if (rc == -EAGAIN) {
.. we will not skip this ...
> /*
> * If we are here because the UVC returned busy or partial
> * completion, this is just a useless check, but it is safe.
> */
> - wait_on_page_writeback(page);
> - put_page(page);
> + folio_wait_writeback(folio);
> + folio_put(folio);
.. and we will do one folio_put() too many
> } else if (rc == -EBUSY) {
> /*
> * If we have tried a local drain and the page refcount
are we sure that split_folio() can never return -EAGAIN now and in the
future too?
maybe just change it to } else if (... ?
Powered by blists - more mailing lists