Message-ID: <20250820150220.1923826-1-john.g.garry@oracle.com>
Date: Wed, 20 Aug 2025 15:02:20 +0000
From: John Garry <john.g.garry@...cle.com>
To: kbusch@...nel.org, axboe@...nel.dk, hch@....de, sagi@...mberg.me
Cc: linux-nvme@...ts.infradead.org, linux-kernel@...r.kernel.org,
tytso@....edu, nilay@...ux.ibm.com, martin.petersen@...cle.com,
djwong@...nel.org, mcgrof@...radead.org,
John Garry <john.g.garry@...cle.com>
Subject: [RFC PATCH] nvme: add an opt-in to use AWUPF
As described at [0], many parts of the atomic write specification are
lacking.
For now, there is nothing which we can do in software about the lack of
a dedicated NVMe write atomic command.
As for reading the atomic write limits, it is felt that the per-namespace
values are mostly properly specified and it is assumed that they are
properly implemented.
The specification of NAWUPF is quite clear. However, the specification of
NABSPF is less clear: it is not obvious whether NABSPF applies when NSABP
is 0. Here it is assumed that NABSPF does not apply when NSABP is 0.
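To illustrate with hypothetical values: a namespace formatted with
4096-byte logical blocks which reports NAWUPF = 7 and NABSPF = 7 would
yield

  atomic_bs = (1 + NAWUPF) * bs = (1 + 7) * 4096 = 32768 bytes
  boundary  = (NABSPF + 1) * bs = (7 + 1) * 4096 = 32768 bytes

per the calculation in nvme_configure_atomic_write().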
As for the per-controller AWUPF, the specification does not say how this
value applies to shared namespaces. Furthermore, the value is expressed in
terms of logical blocks, which is a namespace-level unit.
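For example (hypothetical values again), a controller reporting AWUPF = 7
would yield an atomic write limit of (1 + 7) * 512 = 4096 bytes for a
namespace formatted with 512-byte logical blocks, but
(1 + 7) * 4096 = 32768 bytes for a 4096-byte formatted namespace, even
though the controller advertised a single value for both.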
It would be preferable to stop honouring AWUPF altogether, but this may
needlessly disable atomic write support for many "good" devices which
only specify AWUPF. Currently, all validation of controller-related
atomic write limits is dropped.
As a compromise, allow users of such devices to use atomic writes at
their own risk by adding a per-subsystem sysfs file to enable reading of
AWUPF.
A udev rule like the following could be used to auto-enable:
SUBSYSTEM=="nvme-subsystem", ACTION=="add", RUN+="/bin/sh -c 'echo 1 > /sys/$devpath/use_awupf'"
Such a rule would need to run before any filesystem is mounted on the
device.
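For reference, enabling and verifying by hand might look like the
following sketch (the subsystem and namespace names are examples and will
differ per system):

  # opt the subsystem in to honouring AWUPF
  echo 1 > /sys/class/nvme-subsystem/nvme-subsys0/use_awupf

  # after the triggered rescan, check the stacked block layer limits
  cat /sys/block/nvme0n1/queue/atomic_write_max_bytes
  cat /sys/block/nvme0n1/queue/atomic_write_unit_max_bytes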
Having a per-controller file could also work as an alternative, but it
is unlikely that a system will have a mix of controllers for which we
would want to selectively enable reading AWUPF.
Note that AWUPF affects not only atomic write support, but also the
physical block size reported for the device.
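For background, nvme_update_disk_info() caps the reported physical block
size at the atomic limit (roughly min(phys_bs, atomic_bs)), so a device
whose atomic limit today comes only from AWUPF may report a smaller
physical block size once AWUPF is not honoured.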
[0] https://lore.kernel.org/linux-nvme/20250707141834.GA30198@lst.de/
Signed-off-by: John Garry <john.g.garry@...cle.com>
---
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 812c1565114fd..e8c78e000d9d7 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2040,7 +2040,7 @@ static u32 nvme_configure_atomic_write(struct nvme_ns *ns,
 		atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs;
 		if (id->nabspf)
 			boundary = (le16_to_cpu(id->nabspf) + 1) * bs;
-	} else {
+	} else if (ns->ctrl->subsys->use_awupf) {
 		/*
 		 * Use the controller wide atomic write unit. This sucks
 		 * because the limit is defined in terms of logical blocks while
@@ -2049,6 +2049,8 @@ static u32 nvme_configure_atomic_write(struct nvme_ns *ns,
 		 * values for different controllers in the subsystem.
 		 */
 		atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs;
+	} else {
+		atomic_bs = bs;
 	}

 	lim->atomic_write_hw_max = atomic_bs;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index cfd2b5b90b915..d75a82851b684 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -447,6 +447,7 @@ struct nvme_subsystem {
 #ifdef CONFIG_NVME_MULTIPATH
 	enum nvme_iopolicy	iopolicy;
 #endif
+	bool			use_awupf;
 };

 /*
diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c
index 29430949ce2f0..ca2e4b5937289 100644
--- a/drivers/nvme/host/sysfs.c
+++ b/drivers/nvme/host/sysfs.c
@@ -894,6 +894,75 @@ static ssize_t nvme_subsys_show_type(struct device *dev,
 }
 static SUBSYS_ATTR_RO(subsystype, S_IRUGO, nvme_subsys_show_type);

+static ssize_t nvme_subsys_use_awupf_show(struct device *dev,
+		struct device_attribute *attr,
+		char *buf)
+{
+	struct nvme_subsystem *subsys =
+		container_of(dev, struct nvme_subsystem, dev);
+
+	return sysfs_emit(buf, "%d\n", subsys->use_awupf);
+}
+
+static ssize_t nvme_subsys_use_awupf_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct nvme_subsystem *subsys =
+		container_of(dev, struct nvme_subsystem, dev);
+	struct nvme_ns_head *h;
+	struct nvme_ctrl *tmp;
+	bool enabled;
+	int err;
+
+	err = kstrtobool(buf, &enabled);
+	if (err)
+		return -EINVAL;
+
+	mutex_lock(&nvme_subsystems_lock);
+	if (subsys->use_awupf == enabled)
+		goto out_unlock;
+
+	list_for_each_entry(h, &subsys->nsheads, entry) {
+		struct queue_limits lim;
+		unsigned int memflags;
+
+		if (!nvme_ns_head_multipath(h))
+			continue;
+
+		/*
+		 * The head NS queue atomic limits are stacked. Zero them so
+		 * that the per-controller limits are stacked afresh.
+		 */
+		lim = queue_limits_start_update(h->disk->queue);
+		memflags = blk_mq_freeze_queue(h->disk->queue);
+
+		lim.atomic_write_hw_max = 0;
+		lim.atomic_write_hw_unit_min = 0;
+		lim.atomic_write_hw_unit_max = 0;
+		lim.atomic_write_hw_boundary = 0;
+
+		err = queue_limits_commit_update(h->disk->queue, &lim);
+		blk_mq_unfreeze_queue(h->disk->queue, memflags);
+		if (err) {
+			count = err;
+			goto out_unlock;
+		}
+	}
+
+	list_for_each_entry(tmp, &subsys->ctrls, subsys_entry)
+		nvme_queue_scan(tmp);
+
+	subsys->use_awupf = enabled;
+out_unlock:
+	mutex_unlock(&nvme_subsystems_lock);
+
+	return count;
+}
+
+static struct device_attribute subsys_attr_use_awupf =
+	__ATTR(use_awupf, S_IRUGO | S_IWUSR,
+	       nvme_subsys_use_awupf_show, nvme_subsys_use_awupf_store);
+
 #define nvme_subsys_show_str_function(field)				\
 static ssize_t subsys_##field##_show(struct device *dev,		\
 		struct device_attribute *attr, char *buf)		\
@@ -918,6 +987,7 @@ static struct attribute *nvme_subsys_attrs[] = {
 #ifdef CONFIG_NVME_MULTIPATH
 	&subsys_attr_iopolicy.attr,
 #endif
+	&subsys_attr_use_awupf.attr,
 	NULL,
 };
--
2.43.5