Message-ID: <25f86782-ff1f-db4d-d5da-fd1e5bee45f6@fujitsu.com>
Date: Wed, 20 Oct 2021 13:47:50 +0800
From: Shiyang Ruan <ruansy.fnst@...itsu.com>
To: "Darrick J. Wong" <djwong@...nel.org>
CC: <linux-kernel@...r.kernel.org>, <linux-xfs@...r.kernel.org>,
<nvdimm@...ts.linux.dev>, <linux-mm@...ck.org>,
<linux-fsdevel@...r.kernel.org>, <dan.j.williams@...el.com>,
<david@...morbit.com>, <hch@...radead.org>, <jane.chu@...cle.com>
Subject: Re: [PATCH v7 6/8] mm: Introduce mf_dax_kill_procs() for fsdax case
On 2021/10/15 3:32, Darrick J. Wong wrote:
> On Fri, Sep 24, 2021 at 09:09:57PM +0800, Shiyang Ruan wrote:
>> This function is called at the end of the RMAP routine, i.e. the
>> filesystem recovery function, to collect and kill processes using a
>> shared page of a DAX file. It differs from mf_generic_kill_procs() in
>> that it takes a file's (mapping, offset) pair instead of a struct page,
>> because in fsdax mode different files' mappings and offsets may share
>> the same page. So it is called once the filesystem RMAP results are
>> available.
>>
>> Signed-off-by: Shiyang Ruan <ruansy.fnst@...itsu.com>
>> ---
>> fs/dax.c | 10 ------
>> include/linux/dax.h | 9 +++++
>> include/linux/mm.h | 2 ++
>> mm/memory-failure.c | 83 ++++++++++++++++++++++++++++++++++++++++-----
>> 4 files changed, 86 insertions(+), 18 deletions(-)
>>
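(For anyone following along: the intended call chain is roughly the
following. This is only a sketch, and the driver/filesystem side names
are placeholders until the later patches in this series.)

	/*
	 * memory_failure(pfn, flags)
	 *   -> pgmap->ops->memory_failure()          // pmem driver
	 *     -> filesystem ->notify_failure() hook  // e.g. XFS
	 *       -> filesystem RMAP walk, yielding each owning
	 *          (mapping, offset) of the bad page
	 *            -> mf_dax_kill_procs(mapping, index, size, flags)
	 */
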
>> diff --git a/fs/dax.c b/fs/dax.c
>> index 509b65e60478..2536c105ec7f 100644
>> --- a/fs/dax.c
>> +++ b/fs/dax.c
>> @@ -852,16 +852,6 @@ static void *dax_insert_entry(struct xa_state *xas,
>> return entry;
>> }
>>
>> -static inline
>> -unsigned long pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
>> -{
>> - unsigned long address;
>> -
>> - address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
>> - VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
>> - return address;
>> -}
>> -
>> /* Walk all mappings of a given index of a file and writeprotect them */
>> static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index,
>> unsigned long pfn)
>> diff --git a/include/linux/dax.h b/include/linux/dax.h
>> index 65411bee4312..3d90becbd160 100644
>> --- a/include/linux/dax.h
>> +++ b/include/linux/dax.h
>> @@ -258,6 +258,15 @@ static inline bool dax_mapping(struct address_space *mapping)
>> {
>> return mapping->host && IS_DAX(mapping->host);
>> }
>> +static inline unsigned long pgoff_address(pgoff_t pgoff,
>> + struct vm_area_struct *vma)
>> +{
>> + unsigned long address;
>> +
>> + address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
>> + VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
>> + return address;
>> +}
>>
>> #ifdef CONFIG_DEV_DAX_HMEM_DEVICES
>> void hmem_register_device(int target_nid, struct resource *r);
>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>> index 73a52aba448f..d06af0051e53 100644
>> --- a/include/linux/mm.h
>> +++ b/include/linux/mm.h
>> @@ -3114,6 +3114,8 @@ enum mf_flags {
>> MF_MUST_KILL = 1 << 2,
>> MF_SOFT_OFFLINE = 1 << 3,
>> };
>> +extern int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
>> + size_t size, int flags);
>> extern int memory_failure(unsigned long pfn, int flags);
>> extern void memory_failure_queue(unsigned long pfn, int flags);
>> extern void memory_failure_queue_kick(int cpu);
>> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
>> index 85eab206b68f..a9d0d487d205 100644
>> --- a/mm/memory-failure.c
>> +++ b/mm/memory-failure.c
>> @@ -302,10 +302,9 @@ void shake_page(struct page *p)
>> }
>> EXPORT_SYMBOL_GPL(shake_page);
>>
>> -static unsigned long dev_pagemap_mapping_shift(struct page *page,
>> +static unsigned long dev_pagemap_mapping_shift(unsigned long address,
>> struct vm_area_struct *vma)
>> {
>> - unsigned long address = vma_address(page, vma);
>> pgd_t *pgd;
>> p4d_t *p4d;
>> pud_t *pud;
>> @@ -345,7 +344,7 @@ static unsigned long dev_pagemap_mapping_shift(struct page *page,
>> * Schedule a process for later kill.
>> * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
>> */
>> -static void add_to_kill(struct task_struct *tsk, struct page *p,
>> +static void add_to_kill(struct task_struct *tsk, struct page *p, pgoff_t pgoff,
>
> Hm, so I guess you're passing the page and the pgoff now because
> page->index is meaningless for shared dax pages? Ok.
Yes, it is for that case.
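To spell it out: after a reflink, two files (or two offsets in the same
file) can be backed by the same pmem page, so page->index and
page->mapping can describe at most one owner. Each owner's user address
therefore has to be derived from its own (vma, pgoff) pair, which is
what pgoff_address() does:

	/*
	 * Illustration only:
	 *
	 *   addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
	 *
	 * e.g. vm_start = 0x7f0000000000, vm_pgoff = 0, pgoff = 3
	 *   => addr = 0x7f0000003000 (with 4K pages)
	 */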
>
>> struct vm_area_struct *vma,
>> struct list_head *to_kill)
>> {
>> @@ -358,9 +357,15 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
>> }
>>
>> tk->addr = page_address_in_vma(p, vma);
>> - if (is_zone_device_page(p))
>> - tk->size_shift = dev_pagemap_mapping_shift(p, vma);
>> - else
>> + if (is_zone_device_page(p)) {
>> + /*
>> + * Since page->mapping is no longer used for fsdax, we should
>> + * calculate the address in the fsdax way.
>> + */
>> + if (p->pgmap->type == MEMORY_DEVICE_FS_DAX)
>> + tk->addr = pgoff_address(pgoff, vma);
>> + tk->size_shift = dev_pagemap_mapping_shift(tk->addr, vma);
>> + } else
>> tk->size_shift = page_shift(compound_head(p));
>>
>> /*
>> @@ -508,7 +513,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
>> if (!page_mapped_in_vma(page, vma))
>> continue;
>> if (vma->vm_mm == t->mm)
>> - add_to_kill(t, page, vma, to_kill);
>> + add_to_kill(t, page, 0, vma, to_kill);
>> }
>> }
>> read_unlock(&tasklist_lock);
>> @@ -544,7 +549,32 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
>> * to be informed of all such data corruptions.
>> */
>> if (vma->vm_mm == t->mm)
>> - add_to_kill(t, page, vma, to_kill);
>> + add_to_kill(t, page, 0, vma, to_kill);
>> + }
>> + }
>> + read_unlock(&tasklist_lock);
>> + i_mmap_unlock_read(mapping);
>> +}
>> +
>> +/*
>> + * Collect processes when the error hit a fsdax page.
>> + */
>> +static void collect_procs_fsdax(struct page *page, struct address_space *mapping,
>> + pgoff_t pgoff, struct list_head *to_kill)
>> +{
>> + struct vm_area_struct *vma;
>> + struct task_struct *tsk;
>> +
>> + i_mmap_lock_read(mapping);
>> + read_lock(&tasklist_lock);
>> + for_each_process(tsk) {
>> + struct task_struct *t = task_early_kill(tsk, true);
>> +
>> + if (!t)
>> + continue;
>> + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
>> + if (vma->vm_mm == t->mm)
>> + add_to_kill(t, page, pgoff, vma, to_kill);
>> }
>> }
>> read_unlock(&tasklist_lock);
>> @@ -1503,6 +1533,43 @@ static int mf_generic_kill_procs(unsigned long long pfn, int flags,
>> return 0;
>> }
>>
>> +/**
>> + * mf_dax_kill_procs - Collect and kill processes who are using this file range
>> + * @mapping: the file in use
>> + * @index: start offset of the range
>> + * @size: length of the range
>
> It feels odd that one argument is in units of pgoff_t but the other is
> in bytes.
The @index is page-aligned, but @size may not be a multiple of
PAGE_SIZE. I will explain this in detail in the comments.
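Roughly, the loop below walks the byte range in page-sized steps
(a sketch of what the comment will explain; @size need not be a
multiple of PAGE_SIZE):

	size_t end = ((size_t)index << PAGE_SHIFT) + size;

	/* visit every page index intersecting the poisoned byte range */
	for (; ((size_t)index << PAGE_SHIFT) < end; index++) {
		/* lock the DAX entry, collect and kill its users */
	}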
>
>> + * @flags: memory failure flags
>> + */
>> +int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
>> + size_t size, int flags)
>> +{
>> + LIST_HEAD(to_kill);
>> + dax_entry_t cookie;
>> + struct page *page;
>> + size_t end = (index << PAGE_SHIFT) + size;
>> +
>> + flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
>
> Hm. What flags will we be passing to the xfs_dax_notify_failure_fn?
> Does XFS itself have to care about what's in the flags values, or is it
> really just a magic cookie to be passed from the mm layer into the fs
> and back to mf_dax_kill_procs?
>
It is just passed through from the mm layer to mf_dax_kill_procs();
nothing inside this RMAP process cares about it or changes it. As you
mentioned in the next patch, I think it should be named with a "mf_"
prefix to make that easier to understand.
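Something like this on the filesystem side, in other words (a sketch
only; the function and parameter names here are placeholders):

	/*
	 * mf_flags is an opaque cookie to the filesystem: the
	 * notify-failure path just forwards it unchanged.
	 */
	static int fs_notify_failure(struct address_space *mapping,
				     pgoff_t index, size_t size, int mf_flags)
	{
		/* ... RMAP walk produced (mapping, index, size) ... */
		return mf_dax_kill_procs(mapping, index, size, mf_flags);
	}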
--
Thanks,
Ruan.
> --D
>
>> +
>> + for (; (index << PAGE_SHIFT) < end; index++) {
>> + page = NULL;
>> + cookie = dax_lock_mapping_entry(mapping, index, &page);
>> + if (!cookie)
>> + return -EBUSY;
>> + if (!page)
>> + goto unlock;
>> +
>> + SetPageHWPoison(page);
>> +
>> + collect_procs_fsdax(page, mapping, index, &to_kill);
>> + unmap_and_kill(&to_kill, page_to_pfn(page), mapping,
>> + index, flags);
>> +unlock:
>> + dax_unlock_mapping_entry(mapping, index, cookie);
>> + }
>> + return 0;
>> +}
>> +EXPORT_SYMBOL_GPL(mf_dax_kill_procs);
>> +
>> static int memory_failure_hugetlb(unsigned long pfn, int flags)
>> {
>> struct page *p = pfn_to_page(pfn);
>> --
>> 2.33.0
>>
>>
>>