Message-Id: <20220104235148.21320-1-hridya@google.com>
Date:   Tue,  4 Jan 2022 15:51:48 -0800
From:   Hridya Valsaraju <hridya@...gle.com>
To:     Sumit Semwal <sumit.semwal@...aro.org>,
        "Christian König" <christian.koenig@....com>,
        Hridya Valsaraju <hridya@...gle.com>,
        Daniel Vetter <daniel.vetter@...ll.ch>,
        Greg Kroah-Hartman <gregkh@...uxfoundation.org>,
        linux-media@...r.kernel.org, dri-devel@...ts.freedesktop.org,
        linaro-mm-sig@...ts.linaro.org, linux-kernel@...r.kernel.org
Cc:     john.stultz@...aro.org, surenb@...gle.com, kaleshsingh@...gle.com,
        tjmercier@...gle.com, keescook@...gle.com
Subject: [PATCH] dma-buf: Move sysfs work out of DMA-BUF export/release path

Recently, we noticed an issue where a process went into direct reclaim
while holding the kernfs rw semaphore for sysfs in write (exclusive)
mode. This caused processes that were doing DMA-BUF exports and releases
to go into uninterruptible sleep, since they needed to acquire the same
semaphore for DMA-BUF sysfs entry creation/deletion. To avoid blocking
DMA-BUF export/release for an indeterminate amount of time while another
process holds the sysfs rw semaphore in exclusive mode, this patch moves
the per-buffer sysfs file creation/deletion to a kthread.

Fixes: bdb8d06dfefd ("dmabuf: Add the capability to expose DMA-BUF stats in sysfs")
Signed-off-by: Hridya Valsaraju <hridya@...gle.com>
---
 drivers/dma-buf/dma-buf-sysfs-stats.c | 343 ++++++++++++++++++++++++--
 include/linux/dma-buf.h               |  46 ++++
 2 files changed, 366 insertions(+), 23 deletions(-)
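
The mechanism underneath is a standard deferred-work scheme: the export
and release paths only append a small work item to a spinlock-protected
list and wake a dedicated kthread, which does the slow kobject/sysfs
work outside the caller's context. A minimal standalone sketch of that
scheme (hypothetical names such as my_work and my_worker_thread; the
real patch carries extra state to handle the setup/teardown races)
could look like this:

/*
 * Sketch only: producers queue work items cheaply and a worker
 * thread drains the list. All names here are hypothetical.
 */
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/wait.h>

struct my_work {
	struct list_head list;
	/* payload describing the deferred work goes here */
};

static LIST_HEAD(my_work_list);
static DEFINE_SPINLOCK(my_work_lock);
static DECLARE_WAIT_QUEUE_HEAD(my_waitqueue);

/* Producer side: cheap, never blocks on sysfs locks. */
static void my_queue_work(struct my_work *work)
{
	spin_lock(&my_work_lock);
	list_add_tail(&work->list, &my_work_list);
	spin_unlock(&my_work_lock);
	wake_up(&my_waitqueue);
}

static struct my_work *my_next_work(void)
{
	struct my_work *work;

	spin_lock(&my_work_lock);
	work = list_first_entry_or_null(&my_work_list,
					struct my_work, list);
	if (work)
		list_del(&work->list);
	spin_unlock(&my_work_lock);
	return work;
}

/* Consumer side: the only context that touches sysfs. */
static int my_worker_thread(void *data)
{
	struct my_work *work;

	while (!kthread_should_stop()) {
		wait_event_freezable(my_waitqueue,
				     (work = my_next_work()) ||
				     kthread_should_stop());
		if (work) {
			/* ... slow work, e.g. kobject_init_and_add() ... */
			kfree(work);
		}
	}
	return 0;
}

In the patch itself the worker is created with kthread_run() during
dma_buf_init_sysfs_statistics() and demoted to MAX_NICE via
sched_set_normal(), so the deferred sysfs work does not compete with
the exporting process for CPU.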

diff --git a/drivers/dma-buf/dma-buf-sysfs-stats.c b/drivers/dma-buf/dma-buf-sysfs-stats.c
index 053baadcada9..3251fdf2f05f 100644
--- a/drivers/dma-buf/dma-buf-sysfs-stats.c
+++ b/drivers/dma-buf/dma-buf-sysfs-stats.c
@@ -7,13 +7,39 @@
 
 #include <linux/dma-buf.h>
 #include <linux/dma-resv.h>
+#include <linux/freezer.h>
 #include <linux/kobject.h>
+#include <linux/kthread.h>
+#include <linux/list.h>
 #include <linux/printk.h>
+#include <linux/sched/signal.h>
 #include <linux/slab.h>
 #include <linux/sysfs.h>
 
 #include "dma-buf-sysfs-stats.h"
 
+struct dmabuf_kobj_work {
+	struct list_head list;
+	struct dma_buf_sysfs_entry *sysfs_entry;
+	struct dma_buf_sysfs_entry_metadata *sysfs_metadata;
+	unsigned long uid;
+};
+
+/* Both kobject setup and teardown work items get queued on this list. */
+static LIST_HEAD(dmabuf_kobj_work_list);
+
+/* dmabuf_kobj_list_lock protects dmabuf_kobj_work_list. */
+static DEFINE_SPINLOCK(dmabuf_kobj_list_lock);
+
+/*
+ * dmabuf_sysfs_show_lock prevents a race between a DMA-BUF sysfs file being
+ * read and the DMA-BUF being freed by protecting sysfs_entry->dmabuf.
+ */
+static DEFINE_SPINLOCK(dmabuf_sysfs_show_lock);
+
+static struct task_struct *dmabuf_kobject_task;
+static wait_queue_head_t dmabuf_kobject_waitqueue;
+
 #define to_dma_buf_entry_from_kobj(x) container_of(x, struct dma_buf_sysfs_entry, kobj)
 
 /**
@@ -64,15 +90,26 @@ static ssize_t dma_buf_stats_attribute_show(struct kobject *kobj,
 	struct dma_buf_stats_attribute *attribute;
 	struct dma_buf_sysfs_entry *sysfs_entry;
 	struct dma_buf *dmabuf;
+	int ret;
 
 	attribute = to_dma_buf_stats_attr(attr);
 	sysfs_entry = to_dma_buf_entry_from_kobj(kobj);
+
+	/*
+	 * Acquire dmabuf_sysfs_show_lock to prevent a race with the DMA-BUF
+	 * being freed while sysfs_entry->dmabuf is being accessed.
+	 */
+	spin_lock(&dmabuf_sysfs_show_lock);
 	dmabuf = sysfs_entry->dmabuf;
 
-	if (!dmabuf || !attribute->show)
+	if (!dmabuf || !attribute->show) {
+		spin_unlock(&dmabuf_sysfs_show_lock);
 		return -EIO;
+	}
 
-	return attribute->show(dmabuf, attribute, buf);
+	ret = attribute->show(dmabuf, attribute, buf);
+	spin_unlock(&dmabuf_sysfs_show_lock);
+	return ret;
 }
 
 static const struct sysfs_ops dma_buf_stats_sysfs_ops = {
@@ -118,33 +155,275 @@ static struct kobj_type dma_buf_ktype = {
 	.default_groups = dma_buf_stats_default_groups,
 };
 
-void dma_buf_stats_teardown(struct dma_buf *dmabuf)
+/* Statistics files do not need to send uevents. */
+static int dmabuf_sysfs_uevent_filter(struct kset *kset, struct kobject *kobj)
 {
-	struct dma_buf_sysfs_entry *sysfs_entry;
+	return 0;
+}
 
-	sysfs_entry = dmabuf->sysfs_entry;
-	if (!sysfs_entry)
-		return;
+static const struct kset_uevent_ops dmabuf_sysfs_no_uevent_ops = {
+	.filter = dmabuf_sysfs_uevent_filter,
+};
+
+/* Setup of sysfs entries, done asynchronously in the worker thread. */
+static void dma_buf_sysfs_stats_setup_work(struct dmabuf_kobj_work *kobject_work)
+{
+	struct dma_buf_sysfs_entry *sysfs_entry = kobject_work->sysfs_entry;
+	struct dma_buf_sysfs_entry_metadata *sysfs_metadata =
+			kobject_work->sysfs_metadata;
+	bool free_metadata = false;
+
+	int ret = kobject_init_and_add(&sysfs_entry->kobj, &dma_buf_ktype, NULL,
+				       "%lu", kobject_work->uid);
+	if (ret) {
+		kobject_put(&sysfs_entry->kobj);
+
+		spin_lock(&sysfs_metadata->sysfs_entry_lock);
+		if (sysfs_metadata->status == SYSFS_ENTRY_INIT_ABORTED) {
+			/*
+			 * SYSFS_ENTRY_INIT_ABORTED means that the DMA-BUF has already
+			 * been freed. At this point, it's safe to free the memory for
+			 * the sysfs_metadata.
+			 */
+			free_metadata = true;
+		} else {
+			/*
+			 * The DMA-BUF has not yet been freed; set the status to
+			 * SYSFS_ENTRY_ERROR so that when the DMA-BUF gets
+			 * freed, we know there is no need to tear down the
+			 * sysfs entry.
+			 */
+			sysfs_metadata->status = SYSFS_ENTRY_ERROR;
+		}
+		goto unlock;
+	}
+
+	/*
+	 * If the DMA-BUF has not yet been released, the status will still be
+	 * SYSFS_ENTRY_INIT_IN_PROGRESS; mark the entry as initialized.
+	 */
+	spin_lock(&sysfs_metadata->sysfs_entry_lock);
+	if (sysfs_metadata->status == SYSFS_ENTRY_INIT_IN_PROGRESS) {
+		sysfs_metadata->status = SYSFS_ENTRY_INITIALIZED;
+		goto unlock;
+	}
 
+	/*
+	 * At this point the status is SYSFS_ENTRY_INIT_ABORTED, which means
+	 * that the DMA-BUF has already been freed. Hence, we clean up the
+	 * sysfs_entry and its metadata since neither is needed anymore.
+	 */
+	free_metadata = true;
 	kobject_del(&sysfs_entry->kobj);
 	kobject_put(&sysfs_entry->kobj);
+
+unlock:
+	spin_unlock(&sysfs_metadata->sysfs_entry_lock);
+	if (free_metadata) {
+		kfree(kobject_work->sysfs_metadata);
+		kobject_work->sysfs_metadata = NULL;
+	}
 }
 
+/* Teardown of sysfs entries, done asynchronously in the worker thread. */
+static void dma_buf_sysfs_stats_teardown_work(struct dmabuf_kobj_work *kobject_work)
+{
+	struct dma_buf_sysfs_entry *sysfs_entry = kobject_work->sysfs_entry;
 
-/* Statistics files do not need to send uevents. */
-static int dmabuf_sysfs_uevent_filter(struct kset *kset, struct kobject *kobj)
+	kobject_del(&sysfs_entry->kobj);
+	kobject_put(&sysfs_entry->kobj);
+
+	kfree(kobject_work->sysfs_metadata);
+	kobject_work->sysfs_metadata = NULL;
+}
+
+/* Do setup or teardown of sysfs entries as required. */
+static void do_kobject_work(struct dmabuf_kobj_work *kobject_work)
 {
+	struct dma_buf_sysfs_entry_metadata *sysfs_metadata;
+	bool setup_needed = false;
+	bool teardown_needed = false;
+
+	sysfs_metadata = kobject_work->sysfs_metadata;
+	spin_lock(&sysfs_metadata->sysfs_entry_lock);
+	if (sysfs_metadata->status == SYSFS_ENTRY_UNINITIALIZED) {
+		setup_needed = true;
+		sysfs_metadata->status = SYSFS_ENTRY_INIT_IN_PROGRESS;
+	} else if (sysfs_metadata->status == SYSFS_ENTRY_INITIALIZED) {
+		teardown_needed = true;
+	}
+
+	/*
+	 * It is ok to release the sysfs_entry_lock here.
+	 *
+	 * If setup_needed is true, we check the status again after the kobject
+	 * initialization to see if it has been set to SYSFS_ENTRY_INIT_ABORTED
+	 * and, if so, tear down the kobject.
+	 *
+	 * If teardown_needed is true, there are no more changes expected to
+	 * the status.
+	 *
+	 * If neither setup_needed nor teardown_needed is true, the DMA-BUF
+	 * was freed before we got around to setting up the sysfs entry, so
+	 * we just need to release the metadata and return.
+	 */
+	spin_unlock(&sysfs_metadata->sysfs_entry_lock);
+
+	if (setup_needed)
+		dma_buf_sysfs_stats_setup_work(kobject_work);
+	else if (teardown_needed)
+		dma_buf_sysfs_stats_teardown_work(kobject_work);
+	else
+		kfree(kobject_work->sysfs_metadata);
+
+	kfree(kobject_work);
+}
+
+static struct dmabuf_kobj_work *get_next_kobj_work(void)
+{
+	struct dmabuf_kobj_work *kobject_work;
+
+	spin_lock(&dmabuf_kobj_list_lock);
+	kobject_work = list_first_entry_or_null(&dmabuf_kobj_work_list,
+						struct dmabuf_kobj_work, list);
+	if (kobject_work)
+		list_del(&kobject_work->list);
+	spin_unlock(&dmabuf_kobj_list_lock);
+	return kobject_work;
+}
+
+static int kobject_work_thread(void *data)
+{
+	struct dmabuf_kobj_work *kobject_work;
+
+	while (1) {
+		wait_event_freezable(dmabuf_kobject_waitqueue,
+				     (kobject_work = get_next_kobj_work()));
+		do_kobject_work(kobject_work);
+	}
+
 	return 0;
 }
 
-static const struct kset_uevent_ops dmabuf_sysfs_no_uevent_ops = {
-	.filter = dmabuf_sysfs_uevent_filter,
-};
+static int kobject_worklist_init(void)
+{
+	init_waitqueue_head(&dmabuf_kobject_waitqueue);
+	dmabuf_kobject_task = kthread_run(kobject_work_thread, NULL,
+					  "%s", "dmabuf-kobject-worker");
+	if (IS_ERR(dmabuf_kobject_task)) {
+		pr_err("Creating thread for deferred sysfs entry creation/deletion failed\n");
+		return PTR_ERR(dmabuf_kobject_task);
+	}
+	sched_set_normal(dmabuf_kobject_task, MAX_NICE);
+
+	return 0;
+}
+
+static void deferred_kobject_create(struct dmabuf_kobj_work *kobject_work)
+{
+	INIT_LIST_HEAD(&kobject_work->list);
+
+	spin_lock(&dmabuf_kobj_list_lock);
+	list_add_tail(&kobject_work->list, &dmabuf_kobj_work_list);
+	spin_unlock(&dmabuf_kobj_list_lock);
+
+	wake_up(&dmabuf_kobject_waitqueue);
+}
+
+void dma_buf_stats_teardown(struct dma_buf *dmabuf)
+{
+	struct dma_buf_sysfs_entry *sysfs_entry;
+	struct dma_buf_sysfs_entry_metadata *sysfs_metadata;
+	struct dmabuf_kobj_work *kobj_work;
+
+	sysfs_entry = dmabuf->sysfs_entry;
+	if (!sysfs_entry)
+		return;
+
+	sysfs_metadata = dmabuf->sysfs_entry_metadata;
+	if (!sysfs_metadata)
+		return;
+
+	spin_lock(&sysfs_metadata->sysfs_entry_lock);
+
+	if (sysfs_metadata->status == SYSFS_ENTRY_UNINITIALIZED ||
+	    sysfs_metadata->status == SYSFS_ENTRY_INIT_IN_PROGRESS) {
+		/*
+		 * The sysfs entry for this buffer has not yet been initialized;
+		 * set the status to SYSFS_ENTRY_INIT_ABORTED to abort the
+		 * initialization.
+		 */
+		sysfs_metadata->status = SYSFS_ENTRY_INIT_ABORTED;
+		spin_unlock(&sysfs_metadata->sysfs_entry_lock);
+
+		/*
+		 * In case kobject initialization completes right as we release
+		 * the sysfs_entry_lock, disable show() for the sysfs entry by
+		 * setting sysfs_entry->dmabuf to NULL to prevent a race.
+		 */
+		spin_lock(&dmabuf_sysfs_show_lock);
+		sysfs_entry->dmabuf = NULL;
+		spin_unlock(&dmabuf_sysfs_show_lock);
+
+		return;
+	}
+
+	if (sysfs_metadata->status == SYSFS_ENTRY_INITIALIZED) {
+		/*
+		 * Queue teardown work only if sysfs_entry is fully initialized.
+		 * It is ok to release the sysfs_entry_lock here since the
+		 * status can no longer change.
+		 */
+		spin_unlock(&sysfs_metadata->sysfs_entry_lock);
+
+		/*
+		 * Meanwhile disable show() for the sysfs entry to avoid a race
+		 * between teardown and show().
+		 */
+		spin_lock(&dmabuf_sysfs_show_lock);
+		sysfs_entry->dmabuf = NULL;
+		spin_unlock(&dmabuf_sysfs_show_lock);
+
+		kobj_work = kzalloc(sizeof(struct dmabuf_kobj_work), GFP_KERNEL);
+		if (!kobj_work) {
+			/* do the teardown immediately. */
+			kobject_del(&sysfs_entry->kobj);
+			kobject_put(&sysfs_entry->kobj);
+			kfree(sysfs_metadata);
+		} else {
+			/* queue teardown work. */
+			kobj_work->sysfs_entry = dmabuf->sysfs_entry;
+			kobj_work->sysfs_metadata = dmabuf->sysfs_entry_metadata;
+			deferred_kobject_create(kobj_work);
+		}
+
+		return;
+	}
+
+	/*
+	 * The status is SYSFS_ENTRY_ERROR, so we only need to free the
+	 * metadata.
+	 */
+	spin_unlock(&sysfs_metadata->sysfs_entry_lock);
+	kfree(dmabuf->sysfs_entry_metadata);
+	dmabuf->sysfs_entry_metadata = NULL;
+}
 
 static struct kset *dma_buf_stats_kset;
 static struct kset *dma_buf_per_buffer_stats_kset;
 int dma_buf_init_sysfs_statistics(void)
 {
+	int ret;
+
+	ret = kobject_worklist_init();
+	if (ret)
+		return ret;
+
 	dma_buf_stats_kset = kset_create_and_add("dmabuf",
 						 &dmabuf_sysfs_no_uevent_ops,
 						 kernel_kobj);
@@ -171,7 +450,8 @@ void dma_buf_uninit_sysfs_statistics(void)
 int dma_buf_stats_setup(struct dma_buf *dmabuf)
 {
 	struct dma_buf_sysfs_entry *sysfs_entry;
-	int ret;
+	struct dma_buf_sysfs_entry_metadata *sysfs_metadata;
+	struct dmabuf_kobj_work *kobj_work;
 
 	if (!dmabuf || !dmabuf->file)
 		return -EINVAL;
@@ -188,18 +468,35 @@ int dma_buf_stats_setup(struct dma_buf *dmabuf)
 	sysfs_entry->kobj.kset = dma_buf_per_buffer_stats_kset;
 	sysfs_entry->dmabuf = dmabuf;
 
+	sysfs_metadata = kzalloc(sizeof(struct dma_buf_sysfs_entry_metadata),
+				 GFP_KERNEL);
+	if (!sysfs_metadata) {
+		kfree(sysfs_entry);
+		return -ENOMEM;
+	}
+
 	dmabuf->sysfs_entry = sysfs_entry;
 
-	/* create the directory for buffer stats */
-	ret = kobject_init_and_add(&sysfs_entry->kobj, &dma_buf_ktype, NULL,
-				   "%lu", file_inode(dmabuf->file)->i_ino);
-	if (ret)
-		goto err_sysfs_dmabuf;
+	sysfs_metadata->status = SYSFS_ENTRY_UNINITIALIZED;
+	spin_lock_init(&sysfs_metadata->sysfs_entry_lock);
 
-	return 0;
+	dmabuf->sysfs_entry_metadata = sysfs_metadata;
 
-err_sysfs_dmabuf:
-	kobject_put(&sysfs_entry->kobj);
-	dmabuf->sysfs_entry = NULL;
-	return ret;
+	kobj_work = kzalloc(sizeof(struct dmabuf_kobj_work), GFP_KERNEL);
+	if (!kobj_work) {
+		/* Don't leave dangling pointers behind on failure. */
+		kfree(sysfs_entry);
+		kfree(sysfs_metadata);
+		dmabuf->sysfs_entry = NULL;
+		dmabuf->sysfs_entry_metadata = NULL;
+		return -ENOMEM;
+	}
+
+	kobj_work->sysfs_entry = dmabuf->sysfs_entry;
+	kobj_work->sysfs_metadata = dmabuf->sysfs_entry_metadata;
+	/*
+	 * Stash the inode number in struct dmabuf_kobj_work since setup
+	 * might race with DMA-BUF teardown.
+	 */
+	kobj_work->uid = file_inode(dmabuf->file)->i_ino;
+
+	deferred_kobject_create(kobj_work);
+	return 0;
 }
diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
index 7ab50076e7a6..0597690023a0 100644
--- a/include/linux/dma-buf.h
+++ b/include/linux/dma-buf.h
@@ -287,6 +287,50 @@ struct dma_buf_ops {
 	void (*vunmap)(struct dma_buf *dmabuf, struct dma_buf_map *map);
 };
 
+#ifdef CONFIG_DMABUF_SYSFS_STATS
+enum sysfs_entry_status {
+	SYSFS_ENTRY_UNINITIALIZED,
+	SYSFS_ENTRY_INIT_IN_PROGRESS,
+	SYSFS_ENTRY_ERROR,
+	SYSFS_ENTRY_INIT_ABORTED,
+	SYSFS_ENTRY_INITIALIZED,
+};
+
+/*
+ * struct dma_buf_sysfs_entry_metadata - Holds the current status for the
+ * DMA-BUF sysfs entry.
+ *
+ * @status: holds the current status for the DMA-BUF sysfs entry. The status
+ * of the sysfs entry transitions as follows:
+ *
+ *			SYSFS_ENTRY_UNINITIALIZED
+ *		 __________________|____________________
+ *		|					|
+ *	  SYSFS_ENTRY_INIT_IN_PROGRESS	    SYSFS_ENTRY_INIT_ABORTED (DMA-BUF
+ *		|						      gets freed
+ *		|						      before
+ *		|						      init)
+ *	________|_____________________________________
+ *	|			  |		      |
+ * SYSFS_ENTRY_INITIALIZED	  |	  SYSFS_ENTRY_INIT_ABORTED
+ *				  |		  (DMA-BUF gets freed during kobject
+ *				  |		  init)
+ *				  |
+ *				  |
+ *		      SYSFS_ENTRY_ERROR
+ *		      (error during kobject init)
+ *
+ * @sysfs_entry_lock: protects access to @status.
+ */
+struct dma_buf_sysfs_entry_metadata {
+	enum sysfs_entry_status status;
+	/*
+	 * Protects sysfs_entry_metadata->status
+	 */
+	spinlock_t sysfs_entry_lock;
+};
+#endif
+
 /**
  * struct dma_buf - shared buffer object
  *
@@ -452,6 +496,8 @@ struct dma_buf {
 		struct kobject kobj;
 		struct dma_buf *dmabuf;
 	} *sysfs_entry;
+
+	struct dma_buf_sysfs_entry_metadata *sysfs_entry_metadata;
 #endif
 };
 
-- 
2.34.1.448.ga2b2bfdf31-goog
