Message-ID: <60125c55-9ab7-7531-e124-cb676943fea6@redhat.com>
Date: Wed, 18 Jan 2023 15:10:53 +0100
From: David Hildenbrand <david@...hat.com>
To: yang.yang29@....com.cn, akpm@...ux-foundation.org
Cc: imbrenda@...ux.ibm.com, jiang.xuexin@....com.cn,
linux-kernel@...r.kernel.org, linux-mm@...ck.org,
ran.xiaokai@....com.cn, xu.xin.sc@...il.com, xu.xin16@....com.cn
Subject: Re: [PATCH v5 2/6] ksm: support unsharing zero pages placed by KSM
On 30.12.22 02:13, yang.yang29@....com.cn wrote:
> From: xu xin <xu.xin16@....com.cn>
>
> use_zero_pages may be very useful, not only because of the cache
> colouring described in the documentation, but also because it can
> accelerate merging of empty pages when there are plenty of them (full
> of zeros), since the time spent on page-by-page comparisons
> (unstable_tree_search_insert) is saved.
>
> But when use_zero_pages is enabled, madvise(addr, len, MADV_UNMERGEABLE)
> and other ways of triggering unsharing (like writing 2 to
> /sys/kernel/mm/ksm/run) will *not* actually unshare the shared
> zeropages placed by KSM (which contradicts the MADV_UNMERGEABLE
> documentation). Because these KSM-placed zero pages are outside KSM's
> control, the related counts of KSM pages do not show how many zero
> pages were placed by KSM (these special zero pages differ from the
> initially mapped zero pages, as pages mapped into MADV_UNMERGEABLE
> areas are expected to be complete, unshared pages).
>
> To avoid blindly unsharing all shared zero pages in the applicable
> VMAs, the patch introduces a dedicated flag, ZERO_PAGE_FLAG, to mark
> the rmap_items of those shared zero pages, and guarantees that these
> rmap_items are not freed as long as the zero pages have not been
> written to, so that only the *KSM-placed* zero pages are unshared.
>
> The patch does not degrade the performance of use_zero_pages, as it
> does not change the way that feature merges empty pages.
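For reference, the behaviour described above can be exercised with a
minimal userspace sketch along these lines (assuming ksmd is enabled
via /sys/kernel/mm/ksm/run=1 and use_zero_pages=1; the buffer size and
the lack of error handling are purely illustrative):

  #include <string.h>
  #include <sys/mman.h>
  #include <unistd.h>

  int main(void)
  {
          size_t len = 16 * getpagesize();
          char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

          if (buf == MAP_FAILED)
                  return 1;

          memset(buf, 0, len);                   /* all pages full of zeros */
          madvise(buf, len, MADV_MERGEABLE);     /* let ksmd merge them */
          /*
           * After ksmd scans the area with use_zero_pages=1, these PTEs
           * map the shared zeropage placed by KSM.
           */
          madvise(buf, len, MADV_UNMERGEABLE);   /* expected to unshare */
          return 0;
  }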
>
> Signed-off-by: xu xin <xu.xin16@....com.cn>
> Reported-by: David Hildenbrand <david@...hat.com>
> Cc: Claudio Imbrenda <imbrenda@...ux.ibm.com>
> Cc: Xuexin Jiang <jiang.xuexin@....com.cn>
> Reviewed-by: Xiaokai Ran <ran.xiaokai@....com.cn>
> Reviewed-by: Yang Yang <yang.yang29@....com.cn>
[...]
>
> /* The stable and unstable tree heads */
> static struct rb_root one_stable_tree[1] = { RB_ROOT };
> @@ -420,6 +421,11 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
> return atomic_read(&mm->mm_users) == 0;
> }
>
> +enum break_ksm_pmd_entry_return_flag {
> + HAVE_KSM_PAGE = 1,
> + HAVE_ZERO_PAGE
> +};
Why use flags if the two conditions are mutually exclusive?
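If they really are mutually exclusive, plain return values would seem
to be enough -- something like the following sketch (the naming is just
for illustration):

  enum break_ksm_pmd_entry_ret {
          WALK_NOTHING = 0,
          WALK_KSM_PAGE,
          WALK_ZERO_PAGE,
  };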
> +
> static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next,
> struct mm_walk *walk)
> {
> @@ -427,6 +433,7 @@ static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long nex
> spinlock_t *ptl;
> pte_t *pte;
> int ret;
> + bool is_zero_page = false;
>
> if (pmd_leaf(*pmd) || !pmd_present(*pmd))
> return 0;
> @@ -434,6 +441,8 @@ static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long nex
> pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
> if (pte_present(*pte)) {
> page = vm_normal_page(walk->vma, addr, *pte);
> + if (!page)
> + is_zero_page = is_zero_pfn(pte_pfn(*pte));
> } else if (!pte_none(*pte)) {
> swp_entry_t entry = pte_to_swp_entry(*pte);
>
> @@ -444,7 +453,14 @@ static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long nex
> if (is_migration_entry(entry))
> page = pfn_swap_entry_to_page(entry);
> }
> - ret = page && PageKsm(page);
> +
> + if (page && PageKsm(page))
> + ret = HAVE_KSM_PAGE;
> + else if (is_zero_page)
> + ret = HAVE_ZERO_PAGE;
> + else
> + ret = 0;
> +
> pte_unmap_unlock(pte, ptl);
> return ret;
> }
> @@ -466,19 +482,22 @@ static const struct mm_walk_ops break_ksm_ops = {
> * of the process that owns 'vma'. We also do not want to enforce
> * protection keys here anyway.
> */
> -static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
> +static int break_ksm(struct vm_area_struct *vma, unsigned long addr,
> + bool unshare_zero_page)
> {
> vm_fault_t ret = 0;
>
> do {
> - int ksm_page;
> + int walk_result;
>
> cond_resched();
> - ksm_page = walk_page_range_vma(vma, addr, addr + 1,
> + walk_result = walk_page_range_vma(vma, addr, addr + 1,
> &break_ksm_ops, NULL);
> - if (WARN_ON_ONCE(ksm_page < 0))
> - return ksm_page;
> - if (!ksm_page)
> + if (WARN_ON_ONCE(walk_result < 0))
> + return walk_result;
> + if (!walk_result)
> + return 0;
> + if (walk_result == HAVE_ZERO_PAGE && !unshare_zero_page)
> return 0;
> ret = handle_mm_fault(vma, addr,
> FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE,
> @@ -539,7 +558,7 @@ static void break_cow(struct ksm_rmap_item *rmap_item)
> mmap_read_lock(mm);
> vma = find_mergeable_vma(mm, addr);
> if (vma)
> - break_ksm(vma, addr);
> + break_ksm(vma, addr, false);
> mmap_read_unlock(mm);
> }
>
> @@ -764,6 +783,30 @@ static struct page *get_ksm_page(struct ksm_stable_node *stable_node,
> return NULL;
> }
>
> +/*
> + * Clear the rmap_item's ZERO_PAGE_FLAG.
> + * Called when unsharing or when a zero page is written to.
> + */
> +static inline void clean_rmap_item_zero_flag(struct ksm_rmap_item *rmap_item)
> +{
> + if (rmap_item->address & ZERO_PAGE_FLAG)
> + rmap_item->address &= PAGE_MASK;
> +}
> +
> +/* Only called when rmap_item is going to be freed */
> +static inline void unshare_zero_pages(struct ksm_rmap_item *rmap_item)
> +{
> + struct vm_area_struct *vma;
> +
> + if (rmap_item->address & ZERO_PAGE_FLAG) {
> + vma = vma_lookup(rmap_item->mm, rmap_item->address);
> + if (vma && !ksm_test_exit(rmap_item->mm))
> + break_ksm(vma, rmap_item->address, true);
> + }
> + /* Put at last. */
> + clean_rmap_item_zero_flag(rmap_item);
> +}
> +
> /*
> * Removing rmap_item from stable or unstable tree.
> * This function will clean the information from the stable/unstable tree.
> @@ -824,6 +867,7 @@ static void remove_trailing_rmap_items(struct ksm_rmap_item **rmap_list)
> struct ksm_rmap_item *rmap_item = *rmap_list;
> *rmap_list = rmap_item->rmap_list;
> remove_rmap_item_from_tree(rmap_item);
> + unshare_zero_pages(rmap_item);
> free_rmap_item(rmap_item);
> }
> }
> @@ -853,7 +897,7 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma,
> if (signal_pending(current))
> err = -ERESTARTSYS;
> else
> - err = break_ksm(vma, addr);
> + err = break_ksm(vma, addr, false);
> }
MADV_UNMERGEABLE -> unmerge_ksm_pages() will never unshare the shared
zeropage? I thought the patch description mentions that that is one of
the goals?
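If MADV_UNMERGEABLE is supposed to unshare KSM-placed zeropages as
well, I would have expected something along the lines of (untested
sketch):

  -		err = break_ksm(vma, addr, false);
  +		err = break_ksm(vma, addr, true);

or some other way for this caller to request the unsharing explicitly.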
--
Thanks,
David / dhildenb