linux-kernel - Re: [PATCH RFC 1/1] mm/ksm: Add recovery mechanism for memory failures

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CABzRoyYfx0QPgGG4WYEYmT8-J10ToRCUStd3tWC0CtT_D8ctiQ@mail.gmail.com>
Date: Thu, 9 Oct 2025 20:13:33 +0800
From: Lance Yang <lance.yang@...ux.dev>
To: Longlong Xia <xialonglong2025@....com>
Cc: linmiaohe@...wei.com, nao.horiguchi@...il.com, akpm@...ux-foundation.org, 
	david@...hat.com, wangkefeng.wang@...wei.com, xu.xin16@....com.cn, 
	linux-kernel@...r.kernel.org, linux-mm@...ck.org, 
	Longlong Xia <xialonglong@...inos.cn>
Subject: Re: [PATCH RFC 1/1] mm/ksm: Add recovery mechanism for memory failures

On Thu, Oct 9, 2025 at 3:56 PM Longlong Xia <xialonglong2025@....com> wrote:
>
> From: Longlong Xia <xialonglong@...inos.cn>
>
> When a hardware memory error occurs on a KSM page, the current
> behavior is to kill all processes mapping that page. This can
> be overly aggressive when KSM has multiple duplicate pages in
> a chain where other duplicates are still healthy.
>
> This patch introduces a recovery mechanism that attempts to migrate
> mappings from the failing KSM page to another healthy KSM page within
> the same chain before resorting to killing processes.

Interesting, thanks for the patch! One question below.

>
> The recovery process works as follows:
> 1. When a memory failure is detected on a KSM page, identify if the
> failing node is part of a chain (has duplicates)
> 2. Search for another healthy KSM page within the same chain
> 3. For each process mapping the failing page:
> - Update the PTE to point to the healthy KSM page
> - Migrate the rmap_item to the new stable node
> 4. If all migrations succeed, remove the failing node from the chain
> 5. Only kill processes if recovery is impossible or fails
>
> Signed-off-by: Longlong Xia <xialonglong@...inos.cn>
> ---
>  mm/ksm.c | 183 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 183 insertions(+)
>
> diff --git a/mm/ksm.c b/mm/ksm.c
> index 160787bb121c..590d30cfe800 100644
> --- a/mm/ksm.c
> +++ b/mm/ksm.c
> @@ -3084,6 +3084,183 @@ void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc)
>  }
>
>  #ifdef CONFIG_MEMORY_FAILURE
> +static struct ksm_stable_node *find_chain_head(struct ksm_stable_node *dup_node)
> +{
> +       struct ksm_stable_node *stable_node, *dup;
> +       struct rb_node *node;
> +       int nid;
> +
> +       if (!is_stable_node_dup(dup_node))
> +               return NULL;
> +
> +       for (nid = 0; nid < ksm_nr_node_ids; nid++) {
> +               node = rb_first(root_stable_tree + nid);
> +               for (; node; node = rb_next(node)) {
> +                       stable_node = rb_entry(node,
> +                                              struct ksm_stable_node,
> +                                              node);
> +
> +                       if (!is_stable_node_chain(stable_node))
> +                               continue;
> +
> +                       hlist_for_each_entry(dup, &stable_node->hlist,
> +                                            hlist_dup) {
> +                               if (dup == dup_node)
> +                                       return stable_node;
> +                       }
> +               }
> +       }
> +
> +       return NULL;
> +}
> +
> +static struct folio *
> +find_target_folio(struct ksm_stable_node *failing_node, struct ksm_stable_node **target_dup)
> +{
> +       struct ksm_stable_node *chain_head, *dup;
> +       struct hlist_node *hlist_safe;
> +       struct folio *target_folio;
> +
> +       if (!is_stable_node_dup(failing_node))
> +               return NULL;
> +
> +       chain_head = find_chain_head(failing_node);
> +       if (!chain_head)
> +               return NULL;
> +
> +       hlist_for_each_entry_safe(dup, hlist_safe, &chain_head->hlist, hlist_dup) {
> +               if (dup == failing_node)
> +                       continue;
> +
> +               target_folio = ksm_get_folio(dup, KSM_GET_FOLIO_TRYLOCK);
> +               if (target_folio) {
> +                       *target_dup = dup;
> +                       return target_folio;
> +               }
> +       }
> +
> +       return NULL;
> +}
> +
> +static int replace_failing_page(struct vm_area_struct *vma, struct page *page,
> +                       struct page *kpage, unsigned long addr)
> +{
> +       struct folio *kfolio = page_folio(kpage);
> +       struct mm_struct *mm = vma->vm_mm;
> +       struct folio *folio = page_folio(page);
> +       pmd_t *pmd;
> +       pte_t *ptep;
> +       pte_t newpte;
> +       spinlock_t *ptl;
> +       int err = -EFAULT;
> +       struct mmu_notifier_range range;
> +
> +       pmd = mm_find_pmd(mm, addr);
> +       if (!pmd)
> +               goto out;
> +
> +       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr,
> +                               addr + PAGE_SIZE);
> +       mmu_notifier_invalidate_range_start(&range);
> +
> +       ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
> +       if (!ptep)
> +               goto out_mn;
> +
> +       if (!is_zero_pfn(page_to_pfn(kpage))) {
> +               folio_get(kfolio);
> +               folio_add_anon_rmap_pte(kfolio, kpage, vma, addr, RMAP_NONE);
> +               newpte = mk_pte(kpage, vma->vm_page_prot);
> +       } else {
> +               newpte = pte_mkdirty(pte_mkspecial(pfn_pte(page_to_pfn(kpage), vma->vm_page_prot)));
> +               ksm_map_zero_page(mm);
> +               dec_mm_counter(mm, MM_ANONPAGES);
> +       }

Can find_target_folio() return the shared zeropage? If not, the else block
looks like dead code and can be removed.

And, a real hardware failure on the shared zeropage would be
non-recoverable, I guess.

Cheers,
Lance

> +
> +       flush_cache_page(vma, addr, pte_pfn(ptep_get(ptep)));
> +       ptep_clear_flush(vma, addr, ptep);
> +       set_pte_at(mm, addr, ptep, newpte);
> +
> +       folio_remove_rmap_pte(folio, page, vma);
> +       if (!folio_mapped(folio))
> +               folio_free_swap(folio);
> +       folio_put(folio);
> +
> +       pte_unmap_unlock(ptep, ptl);
> +       err = 0;
> +out_mn:
> +       mmu_notifier_invalidate_range_end(&range);
> +out:
> +       return err;
> +}
> +
> +static bool ksm_recover_within_chain(struct ksm_stable_node *failing_node)
> +{
> +       struct ksm_rmap_item *rmap_item;
> +       struct hlist_node *hlist_safe;
> +       struct folio *failing_folio = NULL;
> +       struct folio *target_folio = NULL;
> +       struct ksm_stable_node *target_dup = NULL;
> +       int err;
> +
> +       if (!is_stable_node_dup(failing_node))
> +               return false;
> +
> +       failing_folio = ksm_get_folio(failing_node, KSM_GET_FOLIO_NOLOCK);
> +       if (!failing_folio)
> +               return false;
> +
> +       target_folio = find_target_folio(failing_node, &target_dup);
> +       if (!target_folio) {
> +               folio_put(failing_folio);
> +               return false;
> +       }
> +
> +       hlist_for_each_entry_safe(rmap_item, hlist_safe, &failing_node->hlist, hlist) {
> +               struct mm_struct *mm = rmap_item->mm;
> +               unsigned long addr = rmap_item->address & PAGE_MASK;
> +               struct vm_area_struct *vma;
> +
> +               mmap_read_lock(mm);
> +               if (ksm_test_exit(mm)) {
> +                       mmap_read_unlock(mm);
> +                       continue;
> +               }
> +
> +               vma = vma_lookup(mm, addr);
> +               if (!vma) {
> +                       mmap_read_unlock(mm);
> +                       continue;
> +               }
> +
> +               /* Update PTE to point to target_folio's page */
> +               err = replace_failing_page(vma, &failing_folio->page,
> +                                            &target_folio->page, addr);
> +               if (!err) {
> +                       hlist_del(&rmap_item->hlist);
> +                       rmap_item->head = target_dup;
> +                       hlist_add_head(&rmap_item->hlist, &target_dup->hlist);
> +                       target_dup->rmap_hlist_len++;
> +                       failing_node->rmap_hlist_len--;
> +
> +               }
> +
> +               mmap_read_unlock(mm);
> +       }
> +
> +       folio_unlock(target_folio);
> +       folio_put(target_folio);
> +       folio_put(failing_folio);
> +
> +       if (failing_node->rmap_hlist_len == 0) {
> +               __stable_node_dup_del(failing_node);
> +               free_stable_node(failing_node);
> +               return true;
> +       }
> +
> +       return false;
> +}
> +
>  /*
>   * Collect processes when the error hit an ksm page.
>   */
> @@ -3098,6 +3275,12 @@ void collect_procs_ksm(const struct folio *folio, const struct page *page,
>         stable_node = folio_stable_node(folio);
>         if (!stable_node)
>                 return;
> +
> +       if (ksm_recover_within_chain(stable_node)) {
> +               pr_debug("recovery within chain successful, no need to kill processes\n");
> +               return;
> +       }
> +
>         hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
>                 struct anon_vma *av = rmap_item->anon_vma;
>
> --
> 2.43.0
>
>