[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <172790598832.1168608.4519484276671503678.stgit@dwillia2-xfh.jf.intel.com>
Date: Wed, 02 Oct 2024 14:54:40 -0700
From: Dan Williams <dan.j.williams@...el.com>
To: gregkh@...uxfoundation.org
Cc: rafael.j.wysocki@...el.com, tj@...nel.org, linux-kernel@...r.kernel.org,
regressions@...ts.linux.dev
Subject: [PATCH v2] driver core: Fix userspace expectations of uevent_show()
as a probe barrier
Commit "driver core: Fix uevent_show() vs driver detach race" [1]
attempted to fix a lockdep report in uevent_show() by making the lookup
of driver information for a given device lockless. It turns out that
userspace likely depends on the side-effect of uevent_show() flushing
device probing, as evidenced by the removal of the lock causing a
regression initializing USB devices [2].
Introduce a new "locked" type for sysfs attributes that arranges for the
attribute to be called under the device-lock, but without setting up a
reverse locking order dependency with the kernfs_get_active() lock.
This new facility holds a reference on a device while any locked-sysfs
attribute of that device is open. It then takes the lock around sysfs
read/write operations in the following order:
of->mutex
of->op_mutex <= device_lock()
kernfs_get_active()
<operation>
Compare that to problematic locking order of:
of->mutex
kernfs_get_active()
<operation>
device_lock()
...which causes potential deadlocks with kernfs_drain() that may be
called while the device_lock() is held.
Note that the synchronize_rcu() in module_remove_driver(), introduced in
the precedubg fix, likely needs to remain since dev_uevent() is not
always called with the device_lock held. Userspace can potentially
trigger use after free by racing module removal against kernel
invocations of kobject_uevent().
Note2, this change effectively allows userspace to test for
CONFIG_DEBUG_KOBJECT_RELEASE-style driver bugs as the lifetime of open
file descriptors on the @uevent attribute keeps the device reference
count elevated indefinitely.
Reported-by: brmails+k
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219244 [2]
Tested-by: brmails+k
Fixes: 15fffc6a5624 ("driver core: Fix uevent_show() vs driver detach race") [1]
Cc: stable@...r.kernel.org
Signed-off-by: Dan Williams <dan.j.williams@...el.com>
---
Changes since v1 [1]:
- Move the new "locked" infrastructure to private header files to make
it clear it is not approved for general usage (Greg)
[1]: http://lore.kernel.org/172772671177.1003633.7355063154793624707.stgit@dwillia2-xfh.jf.intel.com
drivers/base/base.h | 9 +++++++++
drivers/base/core.c | 2 +-
fs/kernfs/file.c | 24 +++++++++++++++++++-----
fs/sysfs/file.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++
fs/sysfs/group.c | 4 ++--
fs/sysfs/sysfs.h | 18 ++++++++++++++++++
include/linux/kernfs.h | 1 +
include/linux/sysfs.h | 1 +
8 files changed, 98 insertions(+), 8 deletions(-)
diff --git a/drivers/base/base.h b/drivers/base/base.h
index 0b53593372d7..0405e7667184 100644
--- a/drivers/base/base.h
+++ b/drivers/base/base.h
@@ -246,3 +246,12 @@ static inline int devtmpfs_delete_node(struct device *dev) { return 0; }
void software_node_notify(struct device *dev);
void software_node_notify_remove(struct device *dev);
+
+/*
+ * "locked" device attributes are an internal exception for the @uevent
+ * attribute.
+ */
+#include "../../fs/sysfs/sysfs.h"
+
+#define DEVICE_ATTR_LOCKED_RW(_name) \
+ struct device_attribute dev_attr_##_name = __ATTR_LOCKED_RW(_name)
diff --git a/drivers/base/core.c b/drivers/base/core.c
index 8c0733d3aad8..1fd5a18cbb62 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -2772,7 +2772,7 @@ static ssize_t uevent_store(struct device *dev, struct device_attribute *attr,
return count;
}
-static DEVICE_ATTR_RW(uevent);
+static DEVICE_ATTR_LOCKED_RW(uevent);
static ssize_t online_show(struct device *dev, struct device_attribute *attr,
char *buf)
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index 8502ef68459b..eb5c2167beb9 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -142,6 +142,20 @@ static void kernfs_seq_stop_active(struct seq_file *sf, void *v)
kernfs_put_active(of->kn);
}
+static void kernfs_open_file_lock(struct kernfs_open_file *of)
+{
+ mutex_lock(&of->mutex);
+ if (of->op_mutex)
+ mutex_lock(of->op_mutex);
+}
+
+static void kernfs_open_file_unlock(struct kernfs_open_file *of)
+{
+ if (of->op_mutex)
+ mutex_unlock(of->op_mutex);
+ mutex_unlock(&of->mutex);
+}
+
static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos)
{
struct kernfs_open_file *of = sf->private;
@@ -151,7 +165,7 @@ static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos)
* @of->mutex nests outside active ref and is primarily to ensure that
* the ops aren't called concurrently for the same open file.
*/
- mutex_lock(&of->mutex);
+ kernfs_open_file_lock(of);
if (!kernfs_get_active(of->kn))
return ERR_PTR(-ENODEV);
@@ -193,7 +207,7 @@ static void kernfs_seq_stop(struct seq_file *sf, void *v)
if (v != ERR_PTR(-ENODEV))
kernfs_seq_stop_active(sf, v);
- mutex_unlock(&of->mutex);
+ kernfs_open_file_unlock(of);
}
static int kernfs_seq_show(struct seq_file *sf, void *v)
@@ -322,9 +336,9 @@ static ssize_t kernfs_fop_write_iter(struct kiocb *iocb, struct iov_iter *iter)
* @of->mutex nests outside active ref and is used both to ensure that
* the ops aren't called concurrently for the same open file.
*/
- mutex_lock(&of->mutex);
+ kernfs_open_file_lock(of);
if (!kernfs_get_active(of->kn)) {
- mutex_unlock(&of->mutex);
+ kernfs_open_file_unlock(of);
len = -ENODEV;
goto out_free;
}
@@ -336,7 +350,7 @@ static ssize_t kernfs_fop_write_iter(struct kiocb *iocb, struct iov_iter *iter)
len = -EINVAL;
kernfs_put_active(of->kn);
- mutex_unlock(&of->mutex);
+ kernfs_open_file_unlock(of);
if (len > 0)
iocb->ki_pos += len;
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index d1995e2d6c94..1bb878efcf00 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -15,6 +15,7 @@
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/seq_file.h>
+#include <linux/device.h>
#include <linux/mm.h>
#include "sysfs.h"
@@ -189,6 +190,26 @@ static int sysfs_kf_bin_open(struct kernfs_open_file *of)
return 0;
}
+/* locked attributes are always device attributes */
+static int sysfs_kf_setup_lock(struct kernfs_open_file *of)
+{
+ struct kobject *kobj = of->kn->parent->priv;
+ struct device *dev = kobj_to_dev(kobj);
+
+ get_device(dev);
+ of->op_mutex = &dev->mutex;
+
+ return 0;
+}
+
+static void sysfs_kf_free_lock(struct kernfs_open_file *of)
+{
+ struct kobject *kobj = of->kn->parent->priv;
+ struct device *dev = kobj_to_dev(kobj);
+
+ put_device(dev);
+}
+
void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr)
{
struct kernfs_node *kn = kobj->sd, *tmp;
@@ -227,6 +248,25 @@ static const struct kernfs_ops sysfs_file_kfops_rw = {
.write = sysfs_kf_write,
};
+static const struct kernfs_ops sysfs_locked_kfops_ro = {
+ .seq_show = sysfs_kf_seq_show,
+ .open = sysfs_kf_setup_lock,
+ .release = sysfs_kf_free_lock,
+};
+
+static const struct kernfs_ops sysfs_locked_kfops_wo = {
+ .write = sysfs_kf_write,
+ .open = sysfs_kf_setup_lock,
+ .release = sysfs_kf_free_lock,
+};
+
+static const struct kernfs_ops sysfs_locked_kfops_rw = {
+ .seq_show = sysfs_kf_seq_show,
+ .write = sysfs_kf_write,
+ .open = sysfs_kf_setup_lock,
+ .release = sysfs_kf_free_lock,
+};
+
static const struct kernfs_ops sysfs_prealloc_kfops_ro = {
.read = sysfs_kf_read,
.prealloc = true,
@@ -287,6 +327,13 @@ int sysfs_add_file_mode_ns(struct kernfs_node *parent,
ops = &sysfs_prealloc_kfops_ro;
else if (sysfs_ops->store)
ops = &sysfs_prealloc_kfops_wo;
+ } else if (mode & SYSFS_LOCKED) {
+ if (sysfs_ops->show && sysfs_ops->store)
+ ops = &sysfs_locked_kfops_rw;
+ else if (sysfs_ops->show)
+ ops = &sysfs_locked_kfops_ro;
+ else if (sysfs_ops->store)
+ ops = &sysfs_locked_kfops_wo;
} else {
if (sysfs_ops->show && sysfs_ops->store)
ops = &sysfs_file_kfops_rw;
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index d22ad67a0f32..0158367866be 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -68,11 +68,11 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj,
continue;
}
- WARN(mode & ~(SYSFS_PREALLOC | 0664),
+ WARN(mode & ~(SYSFS_PREALLOC | SYSFS_LOCKED | 0664),
"Attribute %s: Invalid permissions 0%o\n",
(*attr)->name, mode);
- mode &= SYSFS_PREALLOC | 0664;
+ mode &= SYSFS_PREALLOC | SYSFS_LOCKED | 0664;
error = sysfs_add_file_mode_ns(parent, *attr, mode, uid,
gid, NULL);
if (unlikely(error))
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 3f28c9af5756..51d5b6af243f 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -40,4 +40,22 @@ int sysfs_add_bin_file_mode_ns(struct kernfs_node *parent,
int sysfs_create_link_sd(struct kernfs_node *kn, struct kobject *target,
const char *name);
+#define SYSFS_LOCKED 040000
+
+/*
+ * uevent_show() needs this due to userspace expectations of reading
+ * that attribute flushing device probing, it is not intended to be a
+ * general semantic for other device sysfs attributes. It is broken to
+ * use this with non-device / pure-kobject sysfs attributes, see
+ * sysfs_kf_setup_lock().
+ */
+#define __ATTR_LOCKED(_name, _mode, _show, _store) { \
+ .attr = {.name = __stringify(_name), \
+ .mode = SYSFS_LOCKED | VERIFY_OCTAL_PERMISSIONS(_mode) },\
+ .show = _show, \
+ .store = _store, \
+}
+
+#define __ATTR_LOCKED_RW(_name) __ATTR_LOCKED(_name, 0644, _name##_show, _name##_store)
+
#endif /* __SYSFS_INTERNAL_H */
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 87c79d076d6d..df6828a7cd3e 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -257,6 +257,7 @@ struct kernfs_open_file {
/* private fields, do not use outside kernfs proper */
struct mutex mutex;
+ struct mutex *op_mutex;
struct mutex prealloc_mutex;
int event;
struct list_head list;
diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index c4e64dc11206..dd28c6238b64 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -103,6 +103,7 @@ struct attribute_group {
#define SYSFS_PREALLOC 010000
#define SYSFS_GROUP_INVISIBLE 020000
+/* see fs/sysfs/sysfs.h for private mode-flags */
/*
* DEFINE_SYSFS_GROUP_VISIBLE(name):
Powered by blists - more mailing lists