linux-kernel - Re: [RFC v3 1/3] mm: add tracepoint for scanning pages

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <55B73FC1.2070708@suse.cz>
Date:	Tue, 28 Jul 2015 10:39:29 +0200
From:	Vlastimil Babka <vbabka@...e.cz>
To:	Ebru Akagunduz <ebru.akagunduz@...il.com>, linux-mm@...ck.org
Cc:	akpm@...ux-foundation.org, kirill.shutemov@...ux.intel.com,
	n-horiguchi@...jp.nec.com, aarcange@...hat.com, riel@...hat.com,
	iamjoonsoo.kim@....com, xiexiuqi@...wei.com, gorcunov@...nvz.org,
	linux-kernel@...r.kernel.org, mgorman@...e.de, rientjes@...gle.com,
	aneesh.kumar@...ux.vnet.ibm.com, hughd@...gle.com,
	hannes@...xchg.org, mhocko@...e.cz, boaz@...xistor.com,
	raindel@...lanox.com
Subject: Re: [RFC v3 1/3] mm: add tracepoint for scanning pages

On 07/13/2015 10:28 PM, Ebru Akagunduz wrote:
> Using static tracepoints, data of functions is recorded.
> It is good to automatize debugging without doing a lot
> of changes in the source code.
>
> This patch adds tracepoint for khugepaged_scan_pmd,
> collapse_huge_page and __collapse_huge_page_isolate.
>
> Signed-off-by: Ebru Akagunduz <ebru.akagunduz@...il.com>
> Acked-by: Kirill A. Shutemov <kirill.shutemov@...ux.intel.com>
> Acked-by: Rik van Riel <riel@...hat.com>
> ---
> Changes in v2:
>   - Nothing changed
>
> Changes in v3:
>   - Print page address instead of vm_start (Vlastimil Babka)
>   - Define constants to specify exact tracepoint result (Vlastimil Babka)

Hi, and thanks for improving the tracepoints!

>
>   include/linux/mm.h                 |  18 ++++++
>   include/trace/events/huge_memory.h | 100 ++++++++++++++++++++++++++++++++
>   mm/huge_memory.c                   | 114 +++++++++++++++++++++++++++----------
>   3 files changed, 203 insertions(+), 29 deletions(-)
>   create mode 100644 include/trace/events/huge_memory.h
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 7f47178..bf341c0 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -21,6 +21,24 @@
>   #include <linux/resource.h>
>   #include <linux/page_ext.h>
>
> +#define MM_PMD_NULL		0
> +#define MM_EXCEED_NONE_PTE	3
> +#define MM_PTE_NON_PRESENT	4
> +#define MM_PAGE_NULL		5
> +#define MM_SCAN_ABORT		6
> +#define MM_PAGE_COUNT		7
> +#define MM_PAGE_LRU		8
> +#define MM_ANY_PROCESS		0
> +#define MM_VMA_NULL		2
> +#define MM_VMA_CHECK		3
> +#define MM_ADDRESS_RANGE	4
> +#define MM_PAGE_LOCK		2
> +#define MM_SWAP_CACHE_PAGE	6
> +#define MM_ISOLATE_LRU_PAGE	7
> +#define MM_ALLOC_HUGE_PAGE_FAIL	6
> +#define MM_CGROUP_CHARGE_FAIL	7
> +#define MM_COLLAPSE_ISOLATE_FAIL 5

this would better go to mm/huge_memory.c since it's used nowhere else, 
so we shouldn't pollute a global header. Also I'd suggest changing the 
MM_ prefix to e.g. SCAN_ ?
Reusing the numbers depending on whether they can appear in a single 
function is unnecessarily complicated, we don't have to fit in some 
small limit here. You could also use an enum to avoid defining each 
constant's value manually.

>   struct mempolicy;
>   struct anon_vma;
>   struct anon_vma_chain;
> diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
> new file mode 100644
> index 0000000..cbc56fc
> --- /dev/null
> +++ b/include/trace/events/huge_memory.h
> @@ -0,0 +1,100 @@
> +#undef TRACE_SYSTEM
> +#define TRACE_SYSTEM huge_memory
> +
> +#if !defined(__HUGE_MEMORY_H) || defined(TRACE_HEADER_MULTI_READ)
> +#define __HUGE_MEMORY_H
> +
> +#include  <linux/tracepoint.h>
> +
> +TRACE_EVENT(mm_khugepaged_scan_pmd,
> +
> +	TP_PROTO(struct mm_struct *mm, struct page *page, bool writable,
> +		 bool referenced, int none_or_zero, int ret),
> +
> +	TP_ARGS(mm, page, writable, referenced, none_or_zero, ret),
> +
> +	TP_STRUCT__entry(
> +		__field(struct mm_struct *, mm)
> +		__field(struct page *, page)
> +		__field(bool, writable)
> +		__field(bool, referenced)
> +		__field(int, none_or_zero)
> +		__field(int, ret)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->mm = mm;
> +		__entry->page = page;
> +		__entry->writable = writable;
> +		__entry->referenced = referenced;
> +		__entry->none_or_zero = none_or_zero;
> +		__entry->ret = ret;
> +	),
> +
> +	TP_printk("mm=%p, page=%p, writable=%d, referenced=%d, none_or_zero=%d, ret=%d",
> +		__entry->mm,
> +		__entry->page,

Sorry, when I suggested "the address of the page itself" instead of 
vm_start, I was thinking physical address (pfn).
Compaction tracepoints recently standardized on this print format so I'd 
recommend it here too:

	scan_pfn=0x%lx

> +		__entry->writable,
> +		__entry->referenced,
> +		__entry->none_or_zero,
> +		__entry->ret)

Instead of printing a number that has to be translated manually, I'd 
recommend converting to string. Look at how compaction_status_string is 
defined in mm/compaction.c and used from the tracepoints.

[ ... ]

>
> +#define CREATE_TRACE_POINTS
> +#include <trace/events/huge_memory.h>
> +
>   /*
>    * By default transparent hugepage support is disabled in order that avoid
>    * to risk increase the memory footprint of applications without a guaranteed
> @@ -2190,25 +2193,32 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>   					unsigned long address,
>   					pte_t *pte)
>   {
> -	struct page *page;
> +	struct page *page = NULL;
>   	pte_t *_pte;
> -	int none_or_zero = 0;
> +	int none_or_zero = 0, ret = 0;
>   	bool referenced = false, writable = false;
>   	for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
>   	     _pte++, address += PAGE_SIZE) {
>   		pte_t pteval = *_pte;
>   		if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
>   			if (!userfaultfd_armed(vma) &&
> -			    ++none_or_zero <= khugepaged_max_ptes_none)
> +			    ++none_or_zero <= khugepaged_max_ptes_none) {
>   				continue;
> -			else
> +			} else {
> +				ret = MM_EXCEED_NONE_PTE;
>   				goto out;
> +			}
>   		}
> -		if (!pte_present(pteval))
> +		if (!pte_present(pteval)) {
> +			ret = MM_PTE_NON_PRESENT;
>   			goto out;
> +		}
> +
>   		page = vm_normal_page(vma, address, pteval);
> -		if (unlikely(!page))
> +		if (unlikely(!page)) {
> +			ret = MM_PAGE_NULL;
>   			goto out;
> +		}
>
>   		VM_BUG_ON_PAGE(PageCompound(page), page);
>   		VM_BUG_ON_PAGE(!PageAnon(page), page);
> @@ -2220,8 +2230,10 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>   		 * is needed to serialize against split_huge_page
>   		 * when invoked from the VM.
>   		 */
> -		if (!trylock_page(page))
> +		if (!trylock_page(page)) {
> +			ret = MM_PAGE_LOCK;
>   			goto out;
> +		}
>
>   		/*
>   		 * cannot use mapcount: can't collapse if there's a gup pin.
> @@ -2230,6 +2242,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>   		 */
>   		if (page_count(page) != 1 + !!PageSwapCache(page)) {
>   			unlock_page(page);
> +			ret = MM_PAGE_COUNT;
>   			goto out;
>   		}
>   		if (pte_write(pteval)) {
> @@ -2237,6 +2250,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>   		} else {
>   			if (PageSwapCache(page) && !reuse_swap_page(page)) {
>   				unlock_page(page);
> +				ret = MM_SWAP_CACHE_PAGE;
>   				goto out;
>   			}
>   			/*
> @@ -2251,6 +2265,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>   		 */
>   		if (isolate_lru_page(page)) {
>   			unlock_page(page);
> +			ret = MM_ISOLATE_LRU_PAGE;
>   			goto out;
>   		}
>   		/* 0 stands for page_is_file_cache(page) == false */
> @@ -2263,11 +2278,16 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>   		    mmu_notifier_test_young(vma->vm_mm, address))
>   			referenced = true;
>   	}
> -	if (likely(referenced && writable))
> +	if (likely(referenced && writable)) {
> +		trace_mm_collapse_huge_page_isolate(page, none_or_zero,
> +						    referenced, writable, ret);
>   		return 1;
> +	}
>   out:
>   	release_pte_pages(pte, _pte);
> -	return 0;
> +	trace_mm_collapse_huge_page_isolate(page, none_or_zero,
> +					    referenced, writable, ret);
> +	return ret;
>   }

Having success returned as "1" and failures of either 0 or other 
positive values is uncommon and may lead to mistakes. Per 
Documentation/CodingStyle Chapter 16, it should be either 0 = success 
and any other = error, or 0 = failure, non-zero = success. Here the 
first variant would be applicable. Following the chapter strictly, the 
function should have been using this variant even before your patch, 
since here the "name of a function is an action or an imperative 
command" :) but yeah...

Anyway I don't think you need to return the exact error, since the 
caller doesn't use it. It's there only for the tracepoint, so you can 
simply keep returning just 1 or 0. Then with tracepoints disabled in 
.config, the compiler should be also able to eliminate all the 
assignments that would be unused in the end.
Same suggestions apply to khugepaged_scan_pmd()

>
>   static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
> @@ -2501,7 +2521,7 @@ static void collapse_huge_page(struct mm_struct *mm,
>   	pgtable_t pgtable;
>   	struct page *new_page;
>   	spinlock_t *pmd_ptl, *pte_ptl;
> -	int isolated;
> +	int isolated = 0, ret = 1;
>   	unsigned long hstart, hend;
>   	struct mem_cgroup *memcg;
>   	unsigned long mmun_start;	/* For mmu_notifiers */
> @@ -2516,12 +2536,18 @@ static void collapse_huge_page(struct mm_struct *mm,
>
>   	/* release the mmap_sem read lock. */
>   	new_page = khugepaged_alloc_page(hpage, gfp, mm, vma, address, node);
> -	if (!new_page)
> +	if (!new_page) {
> +		ret = MM_ALLOC_HUGE_PAGE_FAIL;
> +		trace_mm_collapse_huge_page(mm, isolated, ret);
>   		return;
> +	}
>
>   	if (unlikely(mem_cgroup_try_charge(new_page, mm,
> -					   gfp, &memcg)))
> +					   gfp, &memcg))) {
> +		ret = MM_CGROUP_CHARGE_FAIL;
> +		trace_mm_collapse_huge_page(mm, isolated, ret);
>   		return;
> +	}

You could add a label called e.g. "out_nolock" right after 
"up_write(&mm->mmap_sem);" below, and goto there to avoid the multiple 
tracepoints.

>
>   	/*
>   	 * Prevent all access to pagetables with the exception of
> @@ -2529,21 +2555,31 @@ static void collapse_huge_page(struct mm_struct *mm,
>   	 * handled by the anon_vma lock + PG_lock.
>   	 */
>   	down_write(&mm->mmap_sem);
> -	if (unlikely(khugepaged_test_exit(mm)))
> +	if (unlikely(khugepaged_test_exit(mm))) {
> +		ret = MM_ANY_PROCESS;
>   		goto out;
> +	}
>
>   	vma = find_vma(mm, address);
> -	if (!vma)
> +	if (!vma) {
> +		ret = MM_VMA_NULL;
>   		goto out;
> +	}
>   	hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
>   	hend = vma->vm_end & HPAGE_PMD_MASK;
> -	if (address < hstart || address + HPAGE_PMD_SIZE > hend)
> +	if (address < hstart || address + HPAGE_PMD_SIZE > hend) {
> +		ret = MM_ADDRESS_RANGE;
>   		goto out;
> -	if (!hugepage_vma_check(vma))
> +	}
> +	if (!hugepage_vma_check(vma)) {
> +		ret = MM_VMA_CHECK;
>   		goto out;
> +	}
>   	pmd = mm_find_pmd(mm, address);
> -	if (!pmd)
> +	if (!pmd) {
> +		ret = MM_PMD_NULL;
>   		goto out;
> +	}
>
>   	anon_vma_lock_write(vma->anon_vma);
>
> @@ -2568,7 +2604,7 @@ static void collapse_huge_page(struct mm_struct *mm,
>   	isolated = __collapse_huge_page_isolate(vma, address, pte);
>   	spin_unlock(pte_ptl);
>
> -	if (unlikely(!isolated)) {
> +	if (unlikely(isolated != 1)) {
>   		pte_unmap(pte);
>   		spin_lock(pmd_ptl);
>   		BUG_ON(!pmd_none(*pmd));
> @@ -2580,6 +2616,7 @@ static void collapse_huge_page(struct mm_struct *mm,
>   		pmd_populate(mm, pmd, pmd_pgtable(_pmd));
>   		spin_unlock(pmd_ptl);
>   		anon_vma_unlock_write(vma->anon_vma);
> +		ret = MM_COLLAPSE_ISOLATE_FAIL;
>   		goto out;
>   	}
>
> @@ -2619,6 +2656,7 @@ static void collapse_huge_page(struct mm_struct *mm,
>   	khugepaged_pages_collapsed++;
>   out_up_write:
>   	up_write(&mm->mmap_sem);

out_nolock:

> +	trace_mm_collapse_huge_page(mm, isolated, ret);
>   	return;
>
>   out:

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/