linux-kernel - [PATCH 2/5] ksm: support unsharing zero pages placed by KSM

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20221008070505.308580-1-xu.xin16@zte.com.cn>
Date:   Sat,  8 Oct 2022 07:05:05 +0000
From:   xu.xin.sc@...il.com
To:     akpm@...ux-foundation.org
Cc:     linux-mm@...ck.org, linux-kernel@...r.kernel.org,
        xu xin <xu.xin16@....com.cn>,
        David Hildenbrand <david@...hat.com>,
        Claudio Imbrenda <imbrenda@...ux.ibm.com>,
        Xuexin Jiang <jiang.xuexin@....com.cn>,
        Xiaokai Ran <ran.xiaokai@....com.cn>,
        Yang Yang <yang.yang29@....com.cn>
Subject: [PATCH 2/5] ksm: support unsharing zero pages placed by KSM

From: xu xin <xu.xin16@....com.cn>

After the commit e86c59b1b12d ("mm/ksm: improve deduplication of zero
pages with colouring"), madvise(addr, len, MADV_UNMERGEABLE) and other
ways (like write 2 to /sys/kernel/mm/ksm/run) to trigger unsharing will
**not** unshare the shared zeropage as placed by KSM (which is against
the MADV_UNMERGEABLE documentation at least).

To not blindly unshare all shared zero_pages in applicable VMAs, the patch
introduces a dedicated flag ZERO_PAGE_FLAG to mark the rmap_items of those
shared zero_pages. and guarantee that these rmap_items will be not freed
during the time of zero_pages not being writing, so we can only unshare
the *KSM-placed* zero_pages.

The patch will not degrade the performance of use_zero_pages as it doesn't
change the way of merging empty pages in use_zero_pages's feature.

Fixes: e86c59b1b12d ("mm/ksm: improve deduplication of zero pages with colouring")
Reported-by: David Hildenbrand <david@...hat.com>
Cc: Claudio Imbrenda <imbrenda@...ux.ibm.com>
Cc: Xuexin Jiang <jiang.xuexin@....com.cn>
Signed-off-by: xu xin <xu.xin16@....com.cn>
Co-developed-by: Xiaokai Ran <ran.xiaokai@....com.cn>
Signed-off-by: Xiaokai Ran <ran.xiaokai@....com.cn>
Co-developed-by: Yang Yang <yang.yang29@....com.cn>
Signed-off-by: Yang Yang <yang.yang29@....com.cn>
Signed-off-by: xu xin <xu.xin16@....com.cn>
---
 mm/ksm.c | 134 ++++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 104 insertions(+), 30 deletions(-)

diff --git a/mm/ksm.c b/mm/ksm.c
index 5b68482d2b3b..75978f7eeed1 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -213,6 +213,7 @@ struct ksm_rmap_item {
 #define SEQNR_MASK	0x0ff	/* low bits of unstable tree seqnr */
 #define UNSTABLE_FLAG	0x100	/* is a node of the unstable tree */
 #define STABLE_FLAG	0x200	/* is listed from the stable tree */
+#define ZERO_PAGE_FLAG 0x400 /* is zero page placed by KSM */
 
 /* The stable and unstable tree heads */
 static struct rb_root one_stable_tree[1] = { RB_ROOT };
@@ -381,14 +382,6 @@ static inline struct ksm_rmap_item *alloc_rmap_item(void)
 	return rmap_item;
 }
 
-static inline void free_rmap_item(struct ksm_rmap_item *rmap_item)
-{
-	ksm_rmap_items--;
-	rmap_item->mm->ksm_rmap_items--;
-	rmap_item->mm = NULL;	/* debug safety */
-	kmem_cache_free(rmap_item_cache, rmap_item);
-}
-
 static inline struct ksm_stable_node *alloc_stable_node(void)
 {
 	/*
@@ -434,7 +427,8 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
  * of the process that owns 'vma'.  We also do not want to enforce
  * protection keys here anyway.
  */
-static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
+static int break_ksm(struct vm_area_struct *vma, unsigned long addr,
+				     bool ksm_check_bypass)
 {
 	struct page *page;
 	vm_fault_t ret = 0;
@@ -449,6 +443,16 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
 			ret = handle_mm_fault(vma, addr,
 					      FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE,
 					      NULL);
+		else if (ksm_check_bypass && is_zero_pfn(page_to_pfn(page))) {
+			/*
+			 * Although it's not ksm page, it's zero page as placed by
+			 * KSM use_zero_page, so we should unshare it when
+			 * ksm_check_bypass is true.
+			 */
+			ret = handle_mm_fault(vma, addr,
+						  FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE,
+						  NULL);
+		}
 		else
 			ret = VM_FAULT_WRITE;
 		put_page(page);
@@ -496,6 +500,11 @@ static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
 	return vma;
 }
 
+/*
+ * Note: Don't call break_cow() in the context protected by
+ * mmap_read_lock(), which may cause dead lock because inside
+ * break_cow mmap_read_lock exists.
+ */
 static void break_cow(struct ksm_rmap_item *rmap_item)
 {
 	struct mm_struct *mm = rmap_item->mm;
@@ -511,10 +520,35 @@ static void break_cow(struct ksm_rmap_item *rmap_item)
 	mmap_read_lock(mm);
 	vma = find_mergeable_vma(mm, addr);
 	if (vma)
-		break_ksm(vma, addr);
+		break_ksm(vma, addr, false);
 	mmap_read_unlock(mm);
 }
 
+/* Only called when rmap_item->address is with ZERO_PAGE_FLAG */
+static inline int unshare_zero_pages(struct ksm_rmap_item *rmap_item)
+{
+	struct mm_struct *mm = rmap_item->mm;
+	struct vm_area_struct *vma;
+	unsigned long addr = rmap_item->address;
+	int err = -EFAULT;
+
+	vma = vma_lookup(mm, addr);
+	if (vma)
+		err = break_ksm(vma, addr, true);
+
+	return err;
+}
+
+static inline void free_rmap_item(struct ksm_rmap_item *rmap_item)
+{
+	if (rmap_item->address & ZERO_PAGE_FLAG)
+		unshare_zero_pages(rmap_item);
+	ksm_rmap_items--;
+	rmap_item->mm->ksm_rmap_items--;
+	rmap_item->mm = NULL;   /* debug safety */
+	kmem_cache_free(rmap_item_cache, rmap_item);
+}
+
 static struct page *get_mergeable_page(struct ksm_rmap_item *rmap_item)
 {
 	struct mm_struct *mm = rmap_item->mm;
@@ -825,7 +859,7 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma,
 		if (signal_pending(current))
 			err = -ERESTARTSYS;
 		else
-			err = break_ksm(vma, addr);
+			err = break_ksm(vma, addr, NULL);
 	}
 	return err;
 }
@@ -2017,6 +2051,36 @@ static void stable_tree_append(struct ksm_rmap_item *rmap_item,
 	rmap_item->mm->ksm_merging_pages++;
 }
 
+static int try_to_merge_with_kernel_zero_page(struct mm_struct *mm,
+				   struct ksm_rmap_item *rmap_item,
+				   struct page *page)
+{
+	int err = 0;
+
+	if (!(rmap_item->address & ZERO_PAGE_FLAG)) {
+		struct vm_area_struct *vma;
+
+		mmap_read_lock(mm);
+		vma = find_mergeable_vma(mm, rmap_item->address);
+		if (vma) {
+			err = try_to_merge_one_page(vma, page,
+					ZERO_PAGE(rmap_item->address));
+		} else {
+			/* If the vma is out of date, we do not need to continue. */
+			err = 0;
+		}
+		mmap_read_unlock(mm);
+		/*
+		 * In case of failure, the page was not really empty, so we
+		 * need to continue. Otherwise we're done.
+		 */
+		if (!err)
+			rmap_item->address |= ZERO_PAGE_FLAG;
+	}
+
+	return err;
+}
+
 /*
  * cmp_and_merge_page - first see if page can be merged into the stable tree;
  * if not, compare checksum to previous and if it's the same, see if page can
@@ -2101,29 +2165,21 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite
 	 * Same checksum as an empty page. We attempt to merge it with the
 	 * appropriate zero page if the user enabled this via sysfs.
 	 */
-	if (ksm_use_zero_pages && (checksum == zero_checksum)) {
-		struct vm_area_struct *vma;
-
-		mmap_read_lock(mm);
-		vma = find_mergeable_vma(mm, rmap_item->address);
-		if (vma) {
-			err = try_to_merge_one_page(vma, page,
-					ZERO_PAGE(rmap_item->address));
-		} else {
+	if (ksm_use_zero_pages) {
+		if (checksum == zero_checksum) {
+			/* If success, just return. Otherwise, continue */
+			if (!try_to_merge_with_kernel_zero_page(mm, rmap_item, page))
+				return;
+		} else if (rmap_item->address & ZERO_PAGE_FLAG) {
 			/*
-			 * If the vma is out of date, we do not need to
-			 * continue.
+			 * The page now is not kernel zero page (COW happens to it)
+			 * but the flag of its rmap_item is still zero-page, so need
+			 * to reset the flag and update the corresponding count.
 			 */
-			err = 0;
+			rmap_item->address &= PAGE_MASK;
 		}
-		mmap_read_unlock(mm);
-		/*
-		 * In case of failure, the page was not really empty, so we
-		 * need to continue. Otherwise we're done.
-		 */
-		if (!err)
-			return;
 	}
+
 	tree_rmap_item =
 		unstable_tree_search_insert(rmap_item, page, &tree_page);
 	if (tree_rmap_item) {
@@ -2197,6 +2253,7 @@ static struct ksm_rmap_item *try_to_get_old_rmap_item(unsigned long addr,
 		if (rmap_item->address > addr)
 			break;
 		*rmap_list = rmap_item->rmap_list;
+		/* running here indicates these pages have been unmerged */
 		remove_rmap_item_from_tree(rmap_item);
 		free_rmap_item(rmap_item);
 	}
@@ -2336,6 +2393,23 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
 				mmap_read_unlock(mm);
 				return rmap_item;
 			}
+			/*
+			 * Because we want to monitor ksm zero pages which is
+			 * non-anonymous, we must try to return the rmap_items
+			 * of those kernel zero pages which replaces its
+			 * original anonymous empty page due to use_zero_pages's
+			 * feature.
+			 */
+			if (is_zero_pfn(page_to_pfn(*page))) {
+				rmap_item = try_to_get_old_rmap_item(ksm_scan.address,
+										ksm_scan.rmap_list);
+				if (rmap_item && (rmap_item->address & ZERO_PAGE_FLAG)) {
+					ksm_scan.rmap_list = &rmap_item->rmap_list;
+					ksm_scan.address += PAGE_SIZE;
+					mmap_read_unlock(mm);
+					return rmap_item;
+				}
+			}
 next_page:
 			put_page(*page);
 			ksm_scan.address += PAGE_SIZE;
-- 
2.25.1