Message-ID: <CAMZfGtUOA81Srwp0OLDMx5Nnpz=s_qmG_a3zauzNyd9AYZzReQ@mail.gmail.com>
Date: Tue, 5 Jul 2022 16:57:35 +0800
From: Muchun Song <songmuchun@...edance.com>
To: luofei <luofei@...cloud.com>
Cc: Mike Kravetz <mike.kravetz@...cle.com>,
Andrew Morton <akpm@...ux-foundation.org>,
Linux Memory Management List <linux-mm@...ck.org>,
LKML <linux-kernel@...r.kernel.org>
Subject: Re: [PATCH] mm,hwpoison,hugetlb: defer dissolve hwpoison hugepage
when allocating vmemmap failed
On Tue, Jul 5, 2022 at 2:32 PM luofei <luofei@...cloud.com> wrote:
>
> When dissolving a hwpoison hugepage, if the allocation of vmemmap
> pages fails, the faulty page should not be put back on the hugepage
> free list, where it could be reused. It's better to postpone the
> reexecution of the dissolve operation.

Hi luofei,

How can this happen? If a hugepage is poisoned, the head page's
PageHWPoison flag will be set. See dequeue_huge_page_node_exact(),
which filters out hwpoisoned pages, so a hwpoisoned page cannot be
reused. Hopefully I am not missing something important.
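For reference, a paraphrased sketch of the filter mentioned above. It
approximates dequeue_huge_page_node_exact() in mm/hugetlb.c around this
time; it is not a verbatim copy, and the memalloc-pin check is omitted
for brevity:

  static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
  {
          struct page *page;

          lockdep_assert_held(&hugetlb_lock);
          list_for_each_entry(page, &h->hugepage_freelists[nid], lru) {
                  /* Never hand out a page whose head is marked hwpoisoned. */
                  if (PageHWPoison(page))
                          continue;

                  list_move(&page->lru, &h->hugepage_activelist);
                  set_page_refcounted(page);
                  ClearHPageFreed(page);
                  h->free_huge_pages--;
                  h->free_huge_pages_node[nid]--;
                  return page;
          }
          return NULL;
  }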
>
> Meanwhile, when the page fault handler calls
> dissolve_free_huge_page() to dissolve the faulty page, the basic page
> fault processing (such as migrating pages and unmapping) has actually
> completed. There is no need to return -ENOMEM to the upper layer for
> a temporary vmemmap page allocation failure, which would cause the
> caller to make a wrong judgment. So just defer the dissolve and
> return success.
>
> Signed-off-by: luofei <luofei@...cloud.com>
> ---
> mm/hugetlb.c | 34 +++++++++++++++++++++-------------
> 1 file changed, 21 insertions(+), 13 deletions(-)
>
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index ca081078e814..db25458eb0a5 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -90,6 +90,9 @@ struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
>
> /* Forward declaration */
> static int hugetlb_acct_memory(struct hstate *h, long delta);
> +static LLIST_HEAD(hpage_freelist);
> +static void free_hpage_workfn(struct work_struct *work);
> +static DECLARE_DELAYED_WORK(free_hpage_work, free_hpage_workfn);
>
> static inline bool subpool_is_free(struct hugepage_subpool *spool)
> {
> @@ -1535,15 +1538,21 @@ static void __update_and_free_page(struct hstate *h, struct page *page)
> if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
> return;
>
> - if (hugetlb_vmemmap_restore(h, page))
> + if (hugetlb_vmemmap_restore(h, page)) {
> + if (unlikely(PageHWPoison(page))) {
> + llist_add((struct llist_node *)&page->mapping, &hpage_freelist);
> + schedule_delayed_work(&free_hpage_work, HZ);
> + goto out;
> + }
> goto fail;
> + }
>
> /*
> * Move PageHWPoison flag from head page to the raw error pages,
> * which makes any healthy subpages reusable.
> */
> if (unlikely(PageHWPoison(page) && hugetlb_clear_page_hwpoison(page)))
> - goto fail;
> + goto out;
>
> for (i = 0; i < pages_per_huge_page(h);
> i++, subpage = mem_map_next(subpage, page, i)) {
> @@ -1574,6 +1583,8 @@ static void __update_and_free_page(struct hstate *h, struct page *page)
> */
> add_hugetlb_page(h, page, true);
> spin_unlock_irq(&hugetlb_lock);
> +out:
> + return;
> }
>
> /*
> @@ -1587,8 +1598,6 @@ static void __update_and_free_page(struct hstate *h, struct page *page)
> * to be cleared in free_hpage_workfn() anyway, it is reused as the llist_node
> * structure of a lockless linked list of huge pages to be freed.
> */
> -static LLIST_HEAD(hpage_freelist);
> -
> static void free_hpage_workfn(struct work_struct *work)
> {
> struct llist_node *node;
> @@ -1616,12 +1625,11 @@ static void free_hpage_workfn(struct work_struct *work)
> cond_resched();
> }
> }
> -static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
>
> static inline void flush_free_hpage_work(struct hstate *h)
> {
> if (hugetlb_vmemmap_optimizable(h))
> - flush_work(&free_hpage_work);
> + flush_delayed_work(&free_hpage_work);
> }
>
> static void update_and_free_page(struct hstate *h, struct page *page,
> @@ -1634,13 +1642,9 @@ static void update_and_free_page(struct hstate *h, struct page *page,
>
> /*
> * Defer freeing to avoid using GFP_ATOMIC to allocate vmemmap pages.
> - *
> - * Only call schedule_work() if hpage_freelist is previously
> - * empty. Otherwise, schedule_work() had been called but the workfn
> - * hasn't retrieved the list yet.
> */
> - if (llist_add((struct llist_node *)&page->mapping, &hpage_freelist))
> - schedule_work(&free_hpage_work);
> + llist_add((struct llist_node *)&page->mapping, &hpage_freelist);
> + schedule_delayed_work(&free_hpage_work, 0);
> }
>
> static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list)
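A side note on the hunk above: the deleted comment documents that
llist_add() returns true only when the list was empty before the add,
which is what let the old code skip redundant scheduling, roughly:

  /*
   * llist_add() returns true iff hpage_freelist was empty before this
   * add, so only the first entry needs to (re)schedule the work; the
   * workfn drains the whole list via llist_del_all().
   */
  if (llist_add((struct llist_node *)&page->mapping, &hpage_freelist))
          schedule_work(&free_hpage_work);

The unconditional schedule_delayed_work(&free_hpage_work, 0) stays
correct, since queueing an already-pending work item is a no-op, but it
gives up that small optimization.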
> @@ -2118,11 +2122,15 @@ int dissolve_free_huge_page(struct page *page)
> rc = hugetlb_vmemmap_restore(h, head);
> if (!rc) {
> update_and_free_page(h, head, false);
> - } else {
> + } else if (!PageHWPoison(head)) {
> spin_lock_irq(&hugetlb_lock);
> add_hugetlb_page(h, head, false);
> h->max_huge_pages++;
> spin_unlock_irq(&hugetlb_lock);
> + } else {
> + llist_add((struct llist_node *)&head->mapping, &hpage_freelist);
> + schedule_delayed_work(&free_hpage_work, HZ);
> + rc = 0;
> }
>
> return rc;
> --
> 2.27.0
>
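For context, the worker that drains hpage_freelist looks roughly like
the following (a paraphrase of the existing free_hpage_workfn(), not a
verbatim copy). page->mapping is reused as the llist_node while the
page waits on the list, and with this patch a deferred hwpoison page
would re-enter __update_and_free_page() here, retrying the vmemmap
restore that failed earlier:

  static void free_hpage_workfn(struct work_struct *work)
  {
          struct llist_node *node;

          node = llist_del_all(&hpage_freelist);

          while (node) {
                  struct page *page;
                  struct hstate *h;

                  page = container_of((struct address_space **)node,
                                      struct page, mapping);
                  node = node->next;
                  page->mapping = NULL;
                  /*
                   * page_hstate() cannot be used here because the
                   * compound dtor was cleared by remove_hugetlb_page(),
                   * so derive the hstate from the page size instead.
                   */
                  h = size_to_hstate(page_size(page));

                  __update_and_free_page(h, page);

                  cond_resched();
          }
  }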