Message-ID: <25f86782-ff1f-db4d-d5da-fd1e5bee45f6@fujitsu.com>
Date: Wed, 20 Oct 2021 13:47:50 +0800
From: Shiyang Ruan <ruansy.fnst@...itsu.com>
To: "Darrick J. Wong" <djwong@...nel.org>
CC: <linux-kernel@...r.kernel.org>, <linux-xfs@...r.kernel.org>,
<nvdimm@...ts.linux.dev>, <linux-mm@...ck.org>,
<linux-fsdevel@...r.kernel.org>, <dan.j.williams@...el.com>,
<david@...morbit.com>, <hch@...radead.org>, <jane.chu@...cle.com>
Subject: Re: [PATCH v7 6/8] mm: Introduce mf_dax_kill_procs() for fsdax case
On 2021/10/15 3:32, Darrick J. Wong wrote:
> On Fri, Sep 24, 2021 at 09:09:57PM +0800, Shiyang Ruan wrote:
>> This function is called at the end of the RMAP routine, i.e. the
>> filesystem recovery function, to collect and kill processes using a
>> shared page of a DAX file. It differs from mf_generic_kill_procs() in
>> that it takes a file's (mapping, offset) pair instead of a struct page,
>> because in fsdax mode different files' mappings and offsets may share
>> the same page. So it is called once the filesystem RMAP results are
>> available.
>>
>> Signed-off-by: Shiyang Ruan <ruansy.fnst@...itsu.com>
>> ---
>> fs/dax.c | 10 ------
>> include/linux/dax.h | 9 +++++
>> include/linux/mm.h | 2 ++
>> mm/memory-failure.c | 83 ++++++++++++++++++++++++++++++++++++++++-----
>> 4 files changed, 86 insertions(+), 18 deletions(-)
>>
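(For anyone following along: the intended call chain is roughly the
following. This is only a sketch, and the driver/filesystem side names
are placeholders until the later patches in this series.)

	/*
	 * memory_failure(pfn, flags)
	 *   -> pgmap->ops->memory_failure()          // pmem driver
	 *     -> filesystem ->notify_failure() hook  // e.g. XFS
	 *       -> filesystem RMAP walk, yielding each owning
	 *          (mapping, offset) of the bad page
	 *            -> mf_dax_kill_procs(mapping, index, size, flags)
	 */
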
>> diff --git a/fs/dax.c b/fs/dax.c
>> index 509b65e60478..2536c105ec7f 100644
>> --- a/fs/dax.c
>> +++ b/fs/dax.c
>> @@ -852,16 +852,6 @@ static void *dax_insert_entry(struct xa_state *xas,
>> return entry;
>> }
>>
>> -static inline
>> -unsigned long pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
>> -{
>> - unsigned long address;
>> -
>> - address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
>> - VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
>> - return address;
>> -}
>> -
>> /* Walk all mappings of a given index of a file and writeprotect them */
>> static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index,
>> unsigned long pfn)
>> diff --git a/include/linux/dax.h b/include/linux/dax.h
>> index 65411bee4312..3d90becbd160 100644
>> --- a/include/linux/dax.h
>> +++ b/include/linux/dax.h
>> @@ -258,6 +258,15 @@ static inline bool dax_mapping(struct address_space *mapping)
>> {
>> return mapping->host && IS_DAX(mapping->host);
>> }
>> +static inline unsigned long pgoff_address(pgoff_t pgoff,
>> + struct vm_area_struct *vma)
>> +{
>> + unsigned long address;
>> +
>> + address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
>> + VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
>> + return address;
>> +}
>>
>> #ifdef CONFIG_DEV_DAX_HMEM_DEVICES
>> void hmem_register_device(int target_nid, struct resource *r);
>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>> index 73a52aba448f..d06af0051e53 100644
>> --- a/include/linux/mm.h
>> +++ b/include/linux/mm.h
>> @@ -3114,6 +3114,8 @@ enum mf_flags {
>> MF_MUST_KILL = 1 << 2,
>> MF_SOFT_OFFLINE = 1 << 3,
>> };
>> +extern int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
>> + size_t size, int flags);
>> extern int memory_failure(unsigned long pfn, int flags);
>> extern void memory_failure_queue(unsigned long pfn, int flags);
>> extern void memory_failure_queue_kick(int cpu);
>> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
>> index 85eab206b68f..a9d0d487d205 100644
>> --- a/mm/memory-failure.c
>> +++ b/mm/memory-failure.c
>> @@ -302,10 +302,9 @@ void shake_page(struct page *p)
>> }
>> EXPORT_SYMBOL_GPL(shake_page);
>>
>> -static unsigned long dev_pagemap_mapping_shift(struct page *page,
>> +static unsigned long dev_pagemap_mapping_shift(unsigned long address,
>> struct vm_area_struct *vma)
>> {
>> - unsigned long address = vma_address(page, vma);
>> pgd_t *pgd;
>> p4d_t *p4d;
>> pud_t *pud;
>> @@ -345,7 +344,7 @@ static unsigned long dev_pagemap_mapping_shift(struct page *page,
>> * Schedule a process for later kill.
>> * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
>> */
>> -static void add_to_kill(struct task_struct *tsk, struct page *p,
>> +static void add_to_kill(struct task_struct *tsk, struct page *p, pgoff_t pgoff,
>
> Hm, so I guess you're passing the page and the pgoff now because
> page->index is meaningless for shared dax pages? Ok.
Yes, it is for that case.
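To spell it out: after a reflink, two files (or two offsets in the same
file) can be backed by the same pmem page, so page->index and
page->mapping can describe at most one owner. Each owner's user address
therefore has to be derived from its own (vma, pgoff) pair, which is
what pgoff_address() does:

	/*
	 * Illustration only:
	 *
	 *   addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
	 *
	 * e.g. vm_start = 0x7f0000000000, vm_pgoff = 0, pgoff = 3
	 *   => addr = 0x7f0000003000 (with 4K pages)
	 */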
>
>> struct vm_area_struct *vma,
>> struct list_head *to_kill)
>> {
>> @@ -358,9 +357,15 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
>> }
>>
>> tk->addr = page_address_in_vma(p, vma);
>> - if (is_zone_device_page(p))
>> - tk->size_shift = dev_pagemap_mapping_shift(p, vma);
>> - else
>> + if (is_zone_device_page(p)) {
>> + /*
>> + * Since page->mapping is no longer used for fsdax, we should
>> + * calculate the address in the fsdax way.
>> + */
>> + if (p->pgmap->type == MEMORY_DEVICE_FS_DAX)
>> + tk->addr = pgoff_address(pgoff, vma);
>> + tk->size_shift = dev_pagemap_mapping_shift(tk->addr, vma);
>> + } else
>> tk->size_shift = page_shift(compound_head(p));
>>
>> /*
>> @@ -508,7 +513,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
>> if (!page_mapped_in_vma(page, vma))
>> continue;
>> if (vma->vm_mm == t->mm)
>> - add_to_kill(t, page, vma, to_kill);
>> + add_to_kill(t, page, 0, vma, to_kill);
>> }
>> }
>> read_unlock(&tasklist_lock);
>> @@ -544,7 +549,32 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
>> * to be informed of all such data corruptions.
>> */
>> if (vma->vm_mm == t->mm)
>> - add_to_kill(t, page, vma, to_kill);
>> + add_to_kill(t, page, 0, vma, to_kill);
>> + }
>> + }
>> + read_unlock(&tasklist_lock);
>> + i_mmap_unlock_read(mapping);
>> +}
>> +
>> +/*
>> + * Collect processes when the error hit a fsdax page.
>> + */
>> +static void collect_procs_fsdax(struct page *page, struct address_space *mapping,
>> + pgoff_t pgoff, struct list_head *to_kill)
>> +{
>> + struct vm_area_struct *vma;
>> + struct task_struct *tsk;
>> +
>> + i_mmap_lock_read(mapping);
>> + read_lock(&tasklist_lock);
>> + for_each_process(tsk) {
>> + struct task_struct *t = task_early_kill(tsk, true);
>> +
>> + if (!t)
>> + continue;
>> + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
>> + if (vma->vm_mm == t->mm)
>> + add_to_kill(t, page, pgoff, vma, to_kill);
>> }
>> }
>> read_unlock(&tasklist_lock);
>> @@ -1503,6 +1533,43 @@ static int mf_generic_kill_procs(unsigned long long pfn, int flags,
>> return 0;
>> }
>>
>> +/**
>> + * mf_dax_kill_procs - Collect and kill processes who are using this file range
>> + * @mapping: the file in use
>> + * @index: start offset of the range
>> + * @size: length of the range
>
> It feels odd that one argument is in units of pgoff_t but the other is
> in bytes.
The @index is page-aligned, but @size may not be a multiple of
PAGE_SIZE. I will explain this in detail in the comments.
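Roughly, the loop below walks the byte range in page-sized steps
(a sketch of what the comment will explain; @size need not be a
multiple of PAGE_SIZE):

	size_t end = ((size_t)index << PAGE_SHIFT) + size;

	/* visit every page index intersecting the poisoned byte range */
	for (; ((size_t)index << PAGE_SHIFT) < end; index++) {
		/* lock the DAX entry, collect and kill its users */
	}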
>
>> + * @flags: memory failure flags
>> + */
>> +int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
>> + size_t size, int flags)
>> +{
>> + LIST_HEAD(to_kill);
>> + dax_entry_t cookie;
>> + struct page *page;
>> + size_t end = (index << PAGE_SHIFT) + size;
>> +
>> + flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
>
> Hm. What flags will we be passing to the xfs_dax_notify_failure_fn?
> Does XFS itself have to care about what's in the flags values, or is it
> really just a magic cookie to be passed from the mm layer into the fs
> and back to mf_dax_kill_procs?
>
It is just passed through from the mm layer to mf_dax_kill_procs();
nothing inside this RMAP process cares about it or changes it. As you
mentioned in the next patch, I think it should be named with a "mf_"
prefix to make that easier to understand.
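Something like this on the filesystem side, in other words (a sketch
only; the function and parameter names here are placeholders):

	/*
	 * mf_flags is an opaque cookie to the filesystem: the
	 * notify-failure path just forwards it unchanged.
	 */
	static int fs_notify_failure(struct address_space *mapping,
				     pgoff_t index, size_t size, int mf_flags)
	{
		/* ... RMAP walk produced (mapping, index, size) ... */
		return mf_dax_kill_procs(mapping, index, size, mf_flags);
	}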
--
Thanks,
Ruan.
> --D
>
>> +
>> + for (; (index << PAGE_SHIFT) < end; index++) {
>> + page = NULL;
>> + cookie = dax_lock_mapping_entry(mapping, index, &page);
>> + if (!cookie)
>> + return -EBUSY;
>> + if (!page)
>> + goto unlock;
>> +
>> + SetPageHWPoison(page);
>> +
>> + collect_procs_fsdax(page, mapping, index, &to_kill);
>> + unmap_and_kill(&to_kill, page_to_pfn(page), mapping,
>> + index, flags);
>> +unlock:
>> + dax_unlock_mapping_entry(mapping, index, cookie);
>> + }
>> + return 0;
>> +}
>> +EXPORT_SYMBOL_GPL(mf_dax_kill_procs);
>> +
>> static int memory_failure_hugetlb(unsigned long pfn, int flags)
>> {
>> struct page *p = pfn_to_page(pfn);
>> --
>> 2.33.0
>>
>>
>>