[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <6482999b3afd8_142af82946e@dwillia2-xfh.jf.intel.com.notmuch>
Date: Thu, 8 Jun 2023 20:16:43 -0700
From: Dan Williams <dan.j.williams@...el.com>
To: Jane Chu <jane.chu@...cle.com>, <dan.j.williams@...el.com>,
<vishal.l.verma@...el.com>, <dave.jiang@...el.com>,
<ira.weiny@...el.com>, <willy@...radead.org>,
<viro@...iv.linux.org.uk>, <brauner@...nel.org>,
<nvdimm@...ts.linux.dev>, <linux-kernel@...r.kernel.org>,
<linux-fsdevel@...r.kernel.org>
Subject: RE: [PATCH v4 1/1] dax: enable dax fault handler to report
VM_FAULT_HWPOISON
Jane Chu wrote:
> When multiple processes mmap() a dax file, then at some point,
> a process issues a 'load' and consumes a hwpoison, the process
> receives a SIGBUS with si_code = BUS_MCEERR_AR and with si_lsb
> set for the poison scope. Soon after, any other process issues
> a 'load' to the poisoned page (that is unmapped from the kernel
> side by memory_failure), it receives a SIGBUS with
> si_code = BUS_ADRERR and without valid si_lsb.
>
> This is confusing to user, and is different from page fault due
> to poison in RAM memory, also some helpful information is lost.
>
> Channel dax backend driver's poison detection to the filesystem
> such that instead of reporting VM_FAULT_SIGBUS, it could report
> VM_FAULT_HWPOISON.
>
> If user level block IO syscalls fail due to poison, the errno will
> be converted to EIO to maintain block API consistency.
>
> Signed-off-by: Jane Chu <jane.chu@...cle.com>
> ---
> drivers/dax/super.c | 5 ++++-
> drivers/nvdimm/pmem.c | 2 +-
> drivers/s390/block/dcssblk.c | 3 ++-
> fs/dax.c | 11 ++++++-----
> fs/fuse/virtio_fs.c | 3 ++-
> include/linux/dax.h | 5 +++++
> include/linux/mm.h | 2 ++
> 7 files changed, 22 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/dax/super.c b/drivers/dax/super.c
> index c4c4728a36e4..0da9232ea175 100644
> --- a/drivers/dax/super.c
> +++ b/drivers/dax/super.c
> @@ -203,6 +203,8 @@ size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
> int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
> size_t nr_pages)
> {
> + int ret;
> +
> if (!dax_alive(dax_dev))
> return -ENXIO;
> /*
> @@ -213,7 +215,8 @@ int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
> if (nr_pages != 1)
> return -EIO;
>
> - return dax_dev->ops->zero_page_range(dax_dev, pgoff, nr_pages);
> + ret = dax_dev->ops->zero_page_range(dax_dev, pgoff, nr_pages);
> + return dax_mem2blk_err(ret);
> }
> EXPORT_SYMBOL_GPL(dax_zero_page_range);
>
> diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
> index ceea55f621cc..46e094e56159 100644
> --- a/drivers/nvdimm/pmem.c
> +++ b/drivers/nvdimm/pmem.c
> @@ -260,7 +260,7 @@ __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
> long actual_nr;
>
> if (mode != DAX_RECOVERY_WRITE)
> - return -EIO;
> + return -EHWPOISON;
>
> /*
> * Set the recovery stride is set to kernel page size because
> diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
> index c09f2e053bf8..ee47ac520cd4 100644
> --- a/drivers/s390/block/dcssblk.c
> +++ b/drivers/s390/block/dcssblk.c
> @@ -54,7 +54,8 @@ static int dcssblk_dax_zero_page_range(struct dax_device *dax_dev,
> rc = dax_direct_access(dax_dev, pgoff, nr_pages, DAX_ACCESS,
> &kaddr, NULL);
> if (rc < 0)
> - return rc;
> + return dax_mem2blk_err(rc);
> +
> memset(kaddr, 0, nr_pages << PAGE_SHIFT);
> dax_flush(dax_dev, kaddr, nr_pages << PAGE_SHIFT);
> return 0;
> diff --git a/fs/dax.c b/fs/dax.c
> index 2ababb89918d..a26eb5abfdc0 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -1148,7 +1148,7 @@ static int dax_iomap_copy_around(loff_t pos, uint64_t length, size_t align_size,
> if (!zero_edge) {
> ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL);
> if (ret)
> - return ret;
> + return dax_mem2blk_err(ret);
> }
>
> if (copy_all) {
> @@ -1310,7 +1310,7 @@ static s64 dax_unshare_iter(struct iomap_iter *iter)
>
> out_unlock:
> dax_read_unlock(id);
> - return ret;
> + return dax_mem2blk_err(ret);
> }
>
> int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len,
> @@ -1342,7 +1342,8 @@ static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size)
> ret = dax_direct_access(iomap->dax_dev, pgoff, 1, DAX_ACCESS, &kaddr,
> NULL);
> if (ret < 0)
> - return ret;
> + return dax_mem2blk_err(ret);
> +
> memset(kaddr + offset, 0, size);
> if (iomap->flags & IOMAP_F_SHARED)
> ret = dax_iomap_copy_around(pos, size, PAGE_SIZE, srcmap,
> @@ -1498,7 +1499,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
>
> map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
> DAX_ACCESS, &kaddr, NULL);
> - if (map_len == -EIO && iov_iter_rw(iter) == WRITE) {
> + if (map_len == -EHWPOISON && iov_iter_rw(iter) == WRITE) {
> map_len = dax_direct_access(dax_dev, pgoff,
> PHYS_PFN(size), DAX_RECOVERY_WRITE,
> &kaddr, NULL);
> @@ -1506,7 +1507,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
> recovery = true;
> }
> if (map_len < 0) {
> - ret = map_len;
> + ret = dax_mem2blk_err(map_len);
> break;
> }
>
> diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
> index 4d8d4f16c727..5f1be1da92ce 100644
> --- a/fs/fuse/virtio_fs.c
> +++ b/fs/fuse/virtio_fs.c
> @@ -775,7 +775,8 @@ static int virtio_fs_zero_page_range(struct dax_device *dax_dev,
> rc = dax_direct_access(dax_dev, pgoff, nr_pages, DAX_ACCESS, &kaddr,
> NULL);
> if (rc < 0)
> - return rc;
> + return dax_mem2blk_err(rc);
> +
> memset(kaddr, 0, nr_pages << PAGE_SHIFT);
> dax_flush(dax_dev, kaddr, nr_pages << PAGE_SHIFT);
> return 0;
> diff --git a/include/linux/dax.h b/include/linux/dax.h
> index bf6258472e49..a4e97acf60f5 100644
> --- a/include/linux/dax.h
> +++ b/include/linux/dax.h
> @@ -261,6 +261,11 @@ static inline bool dax_mapping(struct address_space *mapping)
> return mapping->host && IS_DAX(mapping->host);
> }
>
> +static inline int dax_mem2blk_err(int err)
> +{
> + return (err == -EHWPOISON) ? -EIO : err;
> +}
I think it is worth adding a comment on this function to indicate where this
helper is *not* used. I.e. it's easy to grep for where the error code is
converted for file I/O errors, but the subtlety that the
dax_direct_access() return value is not translated for fault handling
deserves a comment for when we wonder why this function exists in a few
years' time.
Powered by blists - more mailing lists