Message-ID: <20251009070045.2011920-2-xialonglong2025@163.com>
Date: Thu,  9 Oct 2025 15:00:45 +0800
From: Longlong Xia <xialonglong2025@....com>
To: linmiaohe@...wei.com,
	nao.horiguchi@...il.com
Cc: akpm@...ux-foundation.org,
	david@...hat.com,
	wangkefeng.wang@...wei.com,
	xu.xin16@....com.cn,
	linux-kernel@...r.kernel.org,
	linux-mm@...ck.org,
	Longlong Xia <xialonglong@...inos.cn>
Subject: [PATCH RFC 1/1] mm/ksm: Add recovery mechanism for memory failures

From: Longlong Xia <xialonglong@...inos.cn>

When a hardware memory error occurs on a KSM page, the current
behavior is to kill every process mapping that page. This is
overly aggressive when the page belongs to a stable-node chain
with multiple duplicates, because the other duplicates may still
be healthy.

This patch introduces a recovery mechanism that attempts to migrate
mappings from the failing KSM page to another healthy KSM page within
the same chain before resorting to killing processes.

The recovery process works as follows (a simplified sketch follows
the list):
1. When a memory failure is detected on a KSM page, check whether
   the failing stable node is part of a chain (i.e. has duplicates).
2. Search the chain for another healthy KSM page.
3. For each process mapping the failing page:
   - Update the PTE to point to the healthy KSM page.
   - Migrate the rmap_item to the new stable node.
4. If all migrations succeed, remove the failing node from the chain.
5. Kill processes only if recovery is impossible or fails.
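
To make the control flow concrete, here is a minimal userspace toy
model of the steps above. Every name in it (dup_page, mapping,
find_healthy_dup, recover_within_chain) is invented for illustration
only; these are not the kernel's ksm_stable_node/ksm_rmap_item
interfaces, and the actual implementation is the diff below.

/*
 * Toy model: a chain of duplicate "pages", each with a list of
 * mappings. On failure of one duplicate, repoint its mappings to a
 * healthy duplicate in the same chain; only if that is impossible
 * does the caller fall back to killing the mappers.
 */
#include <stdio.h>
#include <stddef.h>

struct dup_page;

struct mapping {                    /* stands in for one PTE + rmap_item */
	int pid;
	struct dup_page *points_to;
	struct mapping *next;
};

struct dup_page {                   /* stands in for one stable-node dup */
	int healthy;
	struct mapping *mappers;
	struct dup_page *next_in_chain;
};

/* Step 2: find another healthy duplicate in the same chain. */
static struct dup_page *find_healthy_dup(struct dup_page *chain,
					 struct dup_page *failing)
{
	for (struct dup_page *d = chain; d; d = d->next_in_chain)
		if (d != failing && d->healthy)
			return d;
	return NULL;
}

/* Steps 3-4: repoint every mapper of 'failing' at a healthy duplicate. */
static int recover_within_chain(struct dup_page *chain,
				struct dup_page *failing)
{
	struct dup_page *target = find_healthy_dup(chain, failing);
	struct mapping *m;

	if (!target)
		return 0;               /* step 5: caller must kill mappers */

	while ((m = failing->mappers) != NULL) {
		failing->mappers = m->next;     /* unlink from failing dup  */
		m->points_to = target;          /* "update the PTE"         */
		m->next = target->mappers;      /* "migrate the rmap_item"  */
		target->mappers = m;
	}
	return 1;                       /* failing dup has no users left */
}

int main(void)
{
	struct mapping m1 = { .pid = 100 }, m2 = { .pid = 200 };
	struct dup_page good = { .healthy = 1 };
	struct dup_page bad = { .healthy = 0, .next_in_chain = &good };

	m1.points_to = &bad;
	m2.points_to = &bad;
	m1.next = &m2;
	bad.mappers = &m1;

	if (recover_within_chain(&bad, &bad))
		printf("recovered: pid %d now maps the healthy duplicate\n",
		       good.mappers->pid);
	else
		printf("no healthy duplicate: would fall back to killing\n");
	return 0;
}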

Signed-off-by: Longlong Xia <xialonglong@...inos.cn>
---
 mm/ksm.c | 183 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 183 insertions(+)

diff --git a/mm/ksm.c b/mm/ksm.c
index 160787bb121c..590d30cfe800 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -3084,6 +3084,183 @@ void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc)
 }
 
 #ifdef CONFIG_MEMORY_FAILURE
+static struct ksm_stable_node *find_chain_head(struct ksm_stable_node *dup_node)
+{
+	struct ksm_stable_node *stable_node, *dup;
+	struct rb_node *node;
+	int nid;
+
+	if (!is_stable_node_dup(dup_node))
+		return NULL;
+
+	for (nid = 0; nid < ksm_nr_node_ids; nid++) {
+		node = rb_first(root_stable_tree + nid);
+		for (; node; node = rb_next(node)) {
+			stable_node = rb_entry(node,
+					       struct ksm_stable_node,
+					       node);
+
+			if (!is_stable_node_chain(stable_node))
+				continue;
+
+			hlist_for_each_entry(dup, &stable_node->hlist,
+					     hlist_dup) {
+				if (dup == dup_node)
+					return stable_node;
+			}
+		}
+	}
+
+	return NULL;
+}
+
+static struct folio *
+find_target_folio(struct ksm_stable_node *failing_node, struct ksm_stable_node **target_dup)
+{
+	struct ksm_stable_node *chain_head, *dup;
+	struct hlist_node *hlist_safe;
+	struct folio *target_folio;
+
+	if (!is_stable_node_dup(failing_node))
+		return NULL;
+
+	chain_head = find_chain_head(failing_node);
+	if (!chain_head)
+		return NULL;
+
+	hlist_for_each_entry_safe(dup, hlist_safe, &chain_head->hlist, hlist_dup) {
+		if (dup == failing_node)
+			continue;
+
+		target_folio = ksm_get_folio(dup, KSM_GET_FOLIO_TRYLOCK);
+		if (target_folio) {
+			*target_dup = dup;
+			return target_folio;
+		}
+	}
+
+	return NULL;
+}
+
+static int replace_failing_page(struct vm_area_struct *vma, struct page *page,
+			struct page *kpage, unsigned long addr)
+{
+	struct folio *kfolio = page_folio(kpage);
+	struct mm_struct *mm = vma->vm_mm;
+	struct folio *folio = page_folio(page);
+	pmd_t *pmd;
+	pte_t *ptep;
+	pte_t newpte;
+	spinlock_t *ptl;
+	int err = -EFAULT;
+	struct mmu_notifier_range range;
+
+	pmd = mm_find_pmd(mm, addr);
+	if (!pmd)
+		goto out;
+
+	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr,
+				addr + PAGE_SIZE);
+	mmu_notifier_invalidate_range_start(&range);
+
+	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
+	if (!ptep)
+		goto out_mn;
+
+	if (!is_zero_pfn(page_to_pfn(kpage))) {
+		folio_get(kfolio);
+		folio_add_anon_rmap_pte(kfolio, kpage, vma, addr, RMAP_NONE);
+		newpte = mk_pte(kpage, vma->vm_page_prot);
+	} else {
+		newpte = pte_mkdirty(pte_mkspecial(pfn_pte(page_to_pfn(kpage), vma->vm_page_prot)));
+		ksm_map_zero_page(mm);
+		dec_mm_counter(mm, MM_ANONPAGES);
+	}
+
+	flush_cache_page(vma, addr, pte_pfn(ptep_get(ptep)));
+	ptep_clear_flush(vma, addr, ptep);
+	set_pte_at(mm, addr, ptep, newpte);
+
+	folio_remove_rmap_pte(folio, page, vma);
+	if (!folio_mapped(folio))
+		folio_free_swap(folio);
+	folio_put(folio);
+
+	pte_unmap_unlock(ptep, ptl);
+	err = 0;
+out_mn:
+	mmu_notifier_invalidate_range_end(&range);
+out:
+	return err;
+}
+
+static bool ksm_recover_within_chain(struct ksm_stable_node *failing_node)
+{
+	struct ksm_rmap_item *rmap_item;
+	struct hlist_node *hlist_safe;
+	struct folio *failing_folio = NULL;
+	struct folio *target_folio = NULL;
+	struct ksm_stable_node *target_dup = NULL;
+	int err;
+
+	if (!is_stable_node_dup(failing_node))
+		return false;
+
+	failing_folio = ksm_get_folio(failing_node, KSM_GET_FOLIO_NOLOCK);
+	if (!failing_folio)
+		return false;
+
+	target_folio = find_target_folio(failing_node, &target_dup);
+	if (!target_folio) {
+		folio_put(failing_folio);
+		return false;
+	}
+
+	hlist_for_each_entry_safe(rmap_item, hlist_safe, &failing_node->hlist, hlist) {
+		struct mm_struct *mm = rmap_item->mm;
+		unsigned long addr = rmap_item->address & PAGE_MASK;
+		struct vm_area_struct *vma;
+
+		mmap_read_lock(mm);
+		if (ksm_test_exit(mm)) {
+			mmap_read_unlock(mm);
+			continue;
+		}
+
+		vma = vma_lookup(mm, addr);
+		if (!vma) {
+			mmap_read_unlock(mm);
+			continue;
+		}
+
+		/* Update PTE to point to target_folio's page */
+		err = replace_failing_page(vma, &failing_folio->page,
+					     &target_folio->page, addr);
+		if (!err) {
+			hlist_del(&rmap_item->hlist);
+			rmap_item->head = target_dup;
+			hlist_add_head(&rmap_item->hlist, &target_dup->hlist);
+			target_dup->rmap_hlist_len++;
+			failing_node->rmap_hlist_len--;
+
+		}
+
+		mmap_read_unlock(mm);
+	}
+
+	folio_unlock(target_folio);
+	folio_put(target_folio);
+	folio_put(failing_folio);
+
+	if (failing_node->rmap_hlist_len == 0) {
+		__stable_node_dup_del(failing_node);
+		free_stable_node(failing_node);
+		return true;
+	}
+
+	return false;
+}
+
 /*
  * Collect processes when the error hit an ksm page.
  */
@@ -3098,6 +3275,12 @@ void collect_procs_ksm(const struct folio *folio, const struct page *page,
 	stable_node = folio_stable_node(folio);
 	if (!stable_node)
 		return;
+
+	if (ksm_recover_within_chain(stable_node)) {
+		pr_debug("recovery within chain successful, no need to kill processes\n");
+		return;
+	}
+
 	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
 		struct anon_vma *av = rmap_item->anon_vma;
 
-- 
2.43.0

