Date:	Tue, 25 Dec 2012 02:22:51 +0000
From:	Eric Wong <normalperson@...t.net>
To:	linux-mm@...ck.org
Cc:	linux-kernel@...r.kernel.org, Alan Cox <alan@...rguk.ukuu.org.uk>,
	Dave Chinner <david@...morbit.com>,
	Zheng Liu <gnehzuil.liu@...il.com>
Subject: [PATCH v2] fadvise: perform WILLNEED readahead asynchronously

Using fadvise with POSIX_FADV_WILLNEED performs readahead synchronously in
the calling task, so it can be very slow and cause user-visible latency.
This hurts interactivity and encourages userspace to resort to background
threads for readahead (or to avoid POSIX_FADV_WILLNEED entirely).

"strace -T" timing on an uncached, one gigabyte file:

 Before: fadvise64(3, 0, 0, POSIX_FADV_WILLNEED) = 0 <2.484832>
  After: fadvise64(3, 0, 0, POSIX_FADV_WILLNEED) = 0 <0.000061>

For a smaller 9.8M request, there is still a significant improvement:

 Before: fadvise64(3, 0, 10223108, POSIX_FADV_WILLNEED) = 0 <0.005399>
  After: fadvise64(3, 0, 10223108, POSIX_FADV_WILLNEED) = 0 <0.000059>

Even with a small 1M request, there is an improvement:

 Before: fadvise64(3, 0, 1048576, POSIX_FADV_WILLNEED) = 0 <0.000474>
  After: fadvise64(3, 0, 1048576, POSIX_FADV_WILLNEED) = 0 <0.000063>
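
For anyone wanting to reproduce the measurement: a minimal test program
along these lines (the file name is a placeholder), run under "strace -T"
after dropping the page cache, should show the same pattern:

  #define _POSIX_C_SOURCE 200112L
  #include <fcntl.h>
  #include <stdio.h>
  #include <string.h>
  #include <unistd.h>

  int main(void)
  {
          int err;
          int fd = open("testfile", O_RDONLY);    /* placeholder path */

          if (fd < 0) {
                  perror("open");
                  return 1;
          }
          /* len == 0 requests readahead to the end of the file */
          err = posix_fadvise(fd, 0, 0, POSIX_FADV_WILLNEED);
          if (err)
                  fprintf(stderr, "posix_fadvise: %s\n", strerror(err));
          close(fd);
          return 0;
  }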

While userspace can mimic the effect of this commit by performing
readahead() in a background thread, doing the work in the kernel allows for
simpler userspace code.
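
For reference, the workaround looks roughly like the sketch below.  This is
not an existing API: willneed_async() and struct ra_args are made-up names,
and error handling is minimal.

  #define _GNU_SOURCE
  #include <fcntl.h>
  #include <pthread.h>
  #include <stdlib.h>
  #include <unistd.h>

  struct ra_args { int fd; off64_t off; size_t len; };

  static void *ra_thread(void *p)
  {
          struct ra_args *a = p;

          readahead(a->fd, a->off, a->len);       /* blocks in this thread */
          close(a->fd);
          free(a);
          return NULL;
  }

  /* hypothetical helper: fire-and-forget readahead from userspace */
  static int willneed_async(int fd, off64_t off, size_t len)
  {
          struct ra_args *a = malloc(sizeof(*a));
          pthread_attr_t attr;
          pthread_t tid;

          if (!a || (a->fd = dup(fd)) < 0)        /* thread owns its own fd */
                  goto err;
          a->off = off;
          a->len = len;
          pthread_attr_init(&attr);
          pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
          if (pthread_create(&tid, &attr, ra_thread, a)) {
                  pthread_attr_destroy(&attr);
                  close(a->fd);
                  goto err;
          }
          pthread_attr_destroy(&attr);
          return 0;
  err:
          free(a);
          return -1;
  }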

To mitigate denial-of-service attacks, inflight (but incomplete)
readahead requests are accounted for when new readahead requests arrive.
New readahead requests may be reduced or ignored if there are too many
inflight readahead pages in the workqueue.

IO priority is also taken into account for workqueue readahead.  Tasks with
normal and idle IO priority share a workqueue whose concurrency is limited
to one task per CPU (like AIO), preventing excessive readahead requests
from running simultaneously.

Real-time I/O tasks get their own high-priority workqueue independent
of the normal workqueue.

The impact of idle-priority tasks is reduced further: their requests are
read ahead in smaller chunks, and their advisory readahead requests are
dropped when read congestion occurs.
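
To make the routing concrete: the class comes from the originating task's
io_context (falling back to its task nice level), so selection is driven by
the normal ioprio interface (e.g. ionice(1)).  A userspace sketch follows;
the ioprio constants are copied in because glibc has no ioprio_set()
wrapper, and willneed_with_class() is a made-up helper:

  #define _GNU_SOURCE
  #include <fcntl.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  /* same values as include/linux/ioprio.h */
  #define IOPRIO_CLASS_SHIFT      13
  #define IOPRIO_PRIO_VALUE(class, data) \
          (((class) << IOPRIO_CLASS_SHIFT) | (data))
  #define IOPRIO_WHO_PROCESS      1
  enum { IOPRIO_CLASS_NONE, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE };

  static void willneed_with_class(int fd, int class)
  {
          /* "who" == 0 means the calling task; the RT class needs CAP_SYS_ADMIN */
          syscall(SYS_ioprio_set, IOPRIO_WHO_PROCESS, 0,
                  IOPRIO_PRIO_VALUE(class, 0));

          /*
           * IOPRIO_CLASS_RT   -> queued on the readahead_rt workqueue
           * IOPRIO_CLASS_BE   -> queued on the concurrency-limited readahead_be
           * IOPRIO_CLASS_IDLE -> readahead_be, smaller chunks, dropped outright
           *                      when the backing device is read-congested
           */
          posix_fadvise(fd, 0, 0, POSIX_FADV_WILLNEED);
  }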

Cc: Alan Cox <alan@...rguk.ukuu.org.uk>
Cc: Dave Chinner <david@...morbit.com>
Cc: Zheng Liu <gnehzuil.liu@...il.com>
Signed-off-by: Eric Wong <normalperson@...t.net>
---
  I have not tested on NUMA (since I've no access to NUMA hardware)
  and do not know how the use of the workqueue affects RA performance.
  I'm only using WQ_UNBOUND on non-NUMA, though.

  I'm halfway tempted to make DONTNEED use a workqueue, too.
  Having perceptible latency on advisory syscalls is unpleasant and
  keeping the latency makes little sense if we can hide it.

 include/linux/mm.h |   3 +
 mm/fadvise.c       |  10 +--
 mm/readahead.c     | 217 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 224 insertions(+), 6 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 6320407..90b361c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1536,6 +1536,9 @@ void task_dirty_inc(struct task_struct *tsk);
 #define VM_MAX_READAHEAD	128	/* kbytes */
 #define VM_MIN_READAHEAD	16	/* kbytes (includes current page) */
 
+void wq_page_cache_readahead(struct address_space *mapping, struct file *filp,
+			pgoff_t offset, unsigned long nr_to_read);
+
 int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
 			pgoff_t offset, unsigned long nr_to_read);
 
diff --git a/mm/fadvise.c b/mm/fadvise.c
index a47f0f5..cf3bd4c 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -102,12 +102,10 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
 		if (!nrpages)
 			nrpages = ~0UL;
 
-		/*
-		 * Ignore return value because fadvise() shall return
-		 * success even if filesystem can't retrieve a hint,
-		 */
-		force_page_cache_readahead(mapping, f.file, start_index,
-					   nrpages);
+		get_file(f.file); /* fput() is called by workqueue */
+
+		/* queue up the request, don't care if it fails */
+		wq_page_cache_readahead(mapping, f.file, start_index, nrpages);
 		break;
 	case POSIX_FADV_NOREUSE:
 		break;
diff --git a/mm/readahead.c b/mm/readahead.c
index 7963f23..f9e0705 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -19,6 +19,45 @@
 #include <linux/pagemap.h>
 #include <linux/syscalls.h>
 #include <linux/file.h>
+#include <linux/workqueue.h>
+#include <linux/ioprio.h>
+
+static struct workqueue_struct *ra_be __read_mostly;
+static struct workqueue_struct *ra_rt __read_mostly;
+static unsigned long ra_nr_queued;
+static DEFINE_SPINLOCK(ra_nr_queued_lock);
+
+struct wq_ra_req {
+	struct work_struct work;
+	struct address_space *mapping;
+	struct file *file;
+	pgoff_t offset;
+	unsigned long nr_to_read;
+	int ioprio;
+};
+
+static void wq_ra_enqueue(struct wq_ra_req *);
+
+/* keep NUMA readahead on the same CPU for now... */
+#ifdef CONFIG_NUMA
+#  define RA_WQ_FLAGS 0
+#else
+#  define RA_WQ_FLAGS WQ_UNBOUND
+#endif
+
+static int __init init_readahead(void)
+{
+	/* let tasks with real-time priorities run freely */
+	ra_rt = alloc_workqueue("readahead_rt", RA_WQ_FLAGS|WQ_HIGHPRI, 0);
+
+	/* limit async concurrency of normal and idle readahead */
+	ra_be = alloc_workqueue("readahead_be", RA_WQ_FLAGS, 1);
+
+	BUG_ON(!ra_be || !ra_rt);
+	return 0;
+}
+
+early_initcall(init_readahead);
 
 /*
  * Initialise a struct file's readahead state.  Assumes that the caller has
@@ -205,6 +244,183 @@ out:
 }
 
 /*
+ * if nr_to_read is too large, adjusts nr_to_read to the maximum sane value.
+ * atomically increments ra_nr_queued by nr_to_read if possible
+ * returns the number of pages queued (zero is possible)
+ */
+static unsigned long ra_queue_begin(struct address_space *mapping,
+				unsigned long nr_to_read)
+{
+	unsigned long flags;
+	unsigned long nr_isize, max;
+	loff_t isize;
+
+	/* do not attempt readahead pages beyond current inode size */
+	isize = i_size_read(mapping->host);
+	if (isize == 0)
+		return 0;
+	nr_isize = (isize >> PAGE_CACHE_SHIFT) + 1;
+	nr_to_read = min(nr_to_read, nr_isize);
+
+	/* check if we can do readahead at all */
+	max = max_sane_readahead(~0UL);
+	nr_to_read = min(nr_to_read, max);
+	if (nr_to_read == 0)
+		return 0;
+
+	/* check if we queued up too much readahead */
+	spin_lock_irqsave(&ra_nr_queued_lock, flags);
+
+	if (ra_nr_queued >= max) {
+		/* too much queued, do not queue more */
+		nr_to_read = 0;
+	} else {
+		/* trim to reflect maximum amount possible */
+		if ((nr_to_read + ra_nr_queued) > max)
+			nr_to_read = max - ra_nr_queued;
+
+		ra_nr_queued += nr_to_read;
+	}
+
+	spin_unlock_irqrestore(&ra_nr_queued_lock, flags);
+
+	return nr_to_read;
+}
+
+/*
+ * atomically decrements ra_nr_queued by nr_pages when a part of the
+ * readahead request is done (or aborted)
+ */
+static void ra_queue_complete(unsigned long nr_pages)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ra_nr_queued_lock, flags);
+	ra_nr_queued -= nr_pages;
+	spin_unlock_irqrestore(&ra_nr_queued_lock, flags);
+}
+
+/*
+ * Read a chunk of the read-ahead request, this will re-enqueue work.
+ * Use 2 megabyte units per chunk to avoid pinning too much memory at once.
+ */
+static void wq_ra_req_fn(struct work_struct *work)
+{
+	unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_CACHE_SIZE;
+	struct wq_ra_req *req = container_of(work, struct wq_ra_req, work);
+	int ret;
+	int old_prio, tmp_prio;
+	struct task_struct *p = current;
+
+	/* limit the impact of idle tasks */
+	if (IOPRIO_PRIO_CLASS(req->ioprio) == IOPRIO_CLASS_IDLE) {
+		/* drop requests for idle tasks if there is congestion */
+		if (bdi_read_congested(req->mapping->backing_dev_info))
+			goto done;
+
+		/* smaller chunk size gives priority to others */
+		this_chunk /= 8;
+
+		/*
+		 * setting IOPRIO_CLASS_IDLE may stall everything else,
+		 * use best-effort instead
+		 */
+		tmp_prio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 7);
+	} else {
+		tmp_prio = req->ioprio;
+	}
+
+	if (this_chunk > req->nr_to_read)
+		this_chunk = req->nr_to_read;
+
+	/* stop the async readahead if we cannot proceed */
+	this_chunk = max_sane_readahead(this_chunk);
+	if (this_chunk == 0)
+		goto done;
+
+	/* temporarily change our IO prio to that of the originating task */
+	old_prio = IOPRIO_PRIO_VALUE(task_nice_ioclass(p), task_nice_ioprio(p));
+	set_task_ioprio(p, tmp_prio);
+	ret = __do_page_cache_readahead(req->mapping, req->file,
+					req->offset, this_chunk, 0);
+	set_task_ioprio(p, old_prio);
+
+	/* requeue if readahead was successful and there is more to queue */
+	if (ret >= 0 && req->nr_to_read > this_chunk) {
+		req->offset += this_chunk;
+		req->nr_to_read -= this_chunk;
+		ra_queue_complete(this_chunk);
+
+		/* keep going, but yield to other requests */
+		wq_ra_enqueue(req);
+	} else {
+done:
+		ra_queue_complete(req->nr_to_read);
+		fput(req->file);
+		kfree(req);
+	}
+}
+
+static void wq_ra_enqueue(struct wq_ra_req *req)
+{
+	INIT_WORK(&req->work, wq_ra_req_fn);
+
+	if (IOPRIO_PRIO_CLASS(req->ioprio) == IOPRIO_CLASS_RT)
+		queue_work(ra_rt, &req->work);
+	else
+		queue_work(ra_be, &req->work);
+}
+
+/*
+ * Fire-and-forget readahead using a workqueue, this allocates pages
+ * inside a workqueue and returns as soon as possible.
+ */
+void wq_page_cache_readahead(struct address_space *mapping, struct file *filp,
+		pgoff_t offset, unsigned long nr_to_read)
+{
+	struct wq_ra_req *req;
+	int ioprio;
+	struct task_struct *p;
+
+	if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
+		goto skip_ra;
+
+	nr_to_read = ra_queue_begin(mapping, nr_to_read);
+	if (!nr_to_read)
+		goto skip_ra;
+
+	p = current;
+	if (p->io_context)
+		ioprio = p->io_context->ioprio;
+	else
+		ioprio = IOPRIO_PRIO_VALUE(task_nice_ioclass(p),
+						task_nice_ioprio(p));
+
+	/* drop requests for idle tasks if there is congestion */
+	if (IOPRIO_PRIO_CLASS(ioprio) == IOPRIO_CLASS_IDLE
+	    && bdi_read_congested(mapping->backing_dev_info))
+		goto skip_ra_done;
+
+	req = kzalloc(sizeof(*req), GFP_KERNEL);
+	if (!req)
+		goto skip_ra_done;
+
+	/* offload to a workqueue and return to caller ASAP */
+	req->mapping = mapping;
+	req->file = filp;
+	req->offset = offset;
+	req->nr_to_read = nr_to_read;
+	req->ioprio = ioprio;
+	wq_ra_enqueue(req);
+
+	return;
+skip_ra_done:
+	ra_queue_complete(nr_to_read);
+skip_ra:
+	fput(filp);
+}
+
+/*
  * Chunk the readahead into 2 megabyte units, so that we don't pin too much
  * memory at once.
  */
-- 
Eric Wong