[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <ba6790e0-23aa-4069-f9e3-982c5aa6ab29@oracle.com>
Date: Mon, 8 May 2023 18:12:04 -0700
From: Jane Chu <jane.chu@...cle.com>
To: Dan Williams <dan.j.williams@...el.com>, vishal.l.verma@...el.com,
dave.jiang@...el.com, ira.weiny@...el.com, willy@...radead.org,
viro@...iv.linux.org.uk, brauner@...nel.org,
nvdimm@...ts.linux.dev, linux-kernel@...r.kernel.org,
linux-fsdevel@...r.kernel.org
Subject: Re: [PATCH v3] dax: enable dax fault handler to report
VM_FAULT_HWPOISON
On 5/4/2023 7:32 PM, Dan Williams wrote:
> Jane Chu wrote:
>> When multiple processes mmap() a dax file, then at some point,
>> a process issues a 'load' and consumes a hwpoison, the process
>> receives a SIGBUS with si_code = BUS_MCEERR_AR and with si_lsb
>> set for the poison scope. Soon after, any other process issues
>> a 'load' to the poisoned page (that is unmapped from the kernel
>> side by memory_failure), it receives a SIGBUS with
>> si_code = BUS_ADRERR and without valid si_lsb.
>>
>> This is confusing to user, and is different from page fault due
>> to poison in RAM memory, also some helpful information is lost.
>>
>> Channel dax backend driver's poison detection to the filesystem
>> such that instead of reporting VM_FAULT_SIGBUS, it could report
>> VM_FAULT_HWPOISON.
>
> I do think it is interesting that this will start returning SIGBUS with
> BUS_MCEERR_AR for stores whereas it is only signalled for loads in the
> direct consumption path, but I can't think of a scenario where that
> would confuse existing software.
Yes indeed, nice catch, thank you!
>
>> Change from v2:
>> Convert -EHWPOISON to -EIO to prevent EHWPOISON errno from leaking
>> out to block read(2) - suggested by Matthew.
>
> For next time these kind of changelog notes belong...
>
>> Signed-off-by: Jane Chu <jane.chu@...cle.com>
>> ---
>
> ...here after the "---".
I'll move the change log to a cover letter.
>
>> drivers/nvdimm/pmem.c | 2 +-
>> fs/dax.c | 4 ++--
>> include/linux/mm.h | 2 ++
>> 3 files changed, 5 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
>> index ceea55f621cc..46e094e56159 100644
>> --- a/drivers/nvdimm/pmem.c
>> +++ b/drivers/nvdimm/pmem.c
>> @@ -260,7 +260,7 @@ __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
>> long actual_nr;
>>
>> if (mode != DAX_RECOVERY_WRITE)
>> - return -EIO;
>> + return -EHWPOISON;
>>
>> /*
>> * Set the recovery stride is set to kernel page size because
>> diff --git a/fs/dax.c b/fs/dax.c
>> index 2ababb89918d..18f9598951f1 100644
>> --- a/fs/dax.c
>> +++ b/fs/dax.c
>> @@ -1498,7 +1498,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
>>
>> map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
>> DAX_ACCESS, &kaddr, NULL);
>> - if (map_len == -EIO && iov_iter_rw(iter) == WRITE) {
>> + if (map_len == -EHWPOISON && iov_iter_rw(iter) == WRITE) {
>> map_len = dax_direct_access(dax_dev, pgoff,
>> PHYS_PFN(size), DAX_RECOVERY_WRITE,
>> &kaddr, NULL);
>> @@ -1506,7 +1506,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
>> recovery = true;
>> }
>> if (map_len < 0) {
>> - ret = map_len;
>> + ret = (map_len == -EHWPOISON) ? -EIO : map_len;
>
> This fixup leaves out several other places where EHWPOISON could leak as
> the errno for read(2)/write(2). I think I want to see something like
> this:
>
> diff --git a/fs/dax.c b/fs/dax.c
> index 2ababb89918d..ec17f9977bcb 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -1077,14 +1077,13 @@ int dax_writeback_mapping_range(struct address_space *mapping,
> }
> EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
>
> -static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
> - size_t size, void **kaddr, pfn_t *pfnp)
> +static int __dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
> + size_t size, void **kaddr, pfn_t *pfnp)
> {
> pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
> - int id, rc = 0;
> long length;
> + int rc = 0;
>
> - id = dax_read_lock();
> length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
> DAX_ACCESS, kaddr, pfnp);
> if (length < 0) {
> @@ -1113,6 +1112,36 @@ static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
> return rc;
> }
>
> +static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
> + size_t size, void **kaddr, pfn_t *pfnp)
> +{
> +
> + int id;
> +
> + id = dax_read_lock();
> + rc = __dax_iomap_direct_access(iomap, pos, size, kaddr, pfnp);
> + dax_read_unlock(id);
> +
> + /* don't leak a memory access error code to I/O syscalls */
> + if (rc == -EHWPOISON)
> + return -EIO;
> + return rc;
> +}
> +
> +static int dax_fault_direct_access(const struct iomap *iomap, loff_t pos,
> + size_t size, void **kaddr, pfn_t *pfnp)
> +{
> +
> + int id;
> +
> + id = dax_read_lock();
> + rc = __dax_iomap_direct_access(iomap, pos, size, kaddr, pfnp);
> + dax_read_unlock(id);
> +
> + /* -EHWPOISON return ok */
> + return rc;
> +}
> +
> /**
> * dax_iomap_copy_around - Prepare for an unaligned write to a shared/cow page
> * by copying the data before and after the range to be written.
> @@ -1682,7 +1711,7 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
> return pmd ? VM_FAULT_FALLBACK : VM_FAULT_SIGBUS;
> }
>
> - err = dax_iomap_direct_access(iomap, pos, size, &kaddr, &pfn);
> + err = dax_fault_direct_access(iomap, pos, size, &kaddr, &pfn);
> if (err)
> return pmd ? VM_FAULT_FALLBACK : dax_fault_return(err);
>
>
>
> ...and then convert all other callers of dax_direct_access() in that
> file such that they are either calling:
>
> dax_iomap_direct_access(): if caller wants EHWPOISON translated to -EIO and is ok
> with internal locking
> dax_fault_direct_access(): if caller wants EHWPOISON passed through and is
> ok with internal locking
> __dax_iomap_direct_access(): if the caller wants to do its own EHWPOISON
> translation and locking
Got it. I examined all callers of dax_direct_access() and found a
couple move places that need the errno conversion.
I'd like to introduce a helper mem2blk_err(err) for that. It could make
the code more self explanatory.
Thanks,
-jane
>
>> break;
>> }
>>
>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>> index 1f79667824eb..e4c974587659 100644
>> --- a/include/linux/mm.h
>> +++ b/include/linux/mm.h
>> @@ -3217,6 +3217,8 @@ static inline vm_fault_t vmf_error(int err)
>> {
>> if (err == -ENOMEM)
>> return VM_FAULT_OOM;
>> + else if (err == -EHWPOISON)
>> + return VM_FAULT_HWPOISON;
>> return VM_FAULT_SIGBUS;
>> }
>>
>> --
>> 2.18.4
>>
>
>
Powered by blists - more mailing lists