lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:	Fri, 22 Jul 2011 13:59:39 -0700
From:	Dan Williams <dan.j.williams@...el.com>
To:	jaxboe@...ionio.com
Cc:	Christoph Hellwig <hch@...radead.org>,
	Roland Dreier <roland@...estorage.com>,
	Dave Jiang <dave.jiang@...el.com>,
	linux-kernel@...r.kernel.org, linux-scsi@...r.kernel.org
Subject: [RFC PATCH 1/2] block: strict rq_affinity

Some storage controllers benefit from completions always being steered
to the strict requester cpu rather than the looser "per-socket" steering
that blk_cpu_to_group() attempts by default.

echo 2 > /sys/block/<bdev>/queue/rq_affinity

Cc: Christoph Hellwig <hch@...radead.org>
Cc: Roland Dreier <roland@...estorage.com>
Tested-by: Dave Jiang <dave.jiang@...el.com>
Signed-off-by: Dan Williams <dan.j.williams@...el.com>
---
 Documentation/block/queue-sysfs.txt |   10 +++++++---
 block/blk-core.c                    |    6 ++----
 block/blk-softirq.c                 |   11 +++++++----
 block/blk-sysfs.c                   |   13 +++++++++----
 include/linux/blkdev.h              |    3 ++-
 5 files changed, 27 insertions(+), 16 deletions(-)

diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt
index f652740..d8147b3 100644
--- a/Documentation/block/queue-sysfs.txt
+++ b/Documentation/block/queue-sysfs.txt
@@ -45,9 +45,13 @@ device.
 
 rq_affinity (RW)
 ----------------
-If this option is enabled, the block layer will migrate request completions
-to the CPU that originally submitted the request. For some workloads
-this provides a significant reduction in CPU cycles due to caching effects.
+If this option is '1', the block layer will migrate request completions to the
+cpu "group" that originally submitted the request. For some workloads this
+provides a significant reduction in CPU cycles due to caching effects.
+
+For storage configurations that need to maximize distribution of completion
+processing setting this option to '2' forces the completion to run on the
+requesting cpu (bypassing the "group" aggregation logic).
 
 scheduler (RW)
 --------------
diff --git a/block/blk-core.c b/block/blk-core.c
index d2f8f40..9c7ba87 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1279,10 +1279,8 @@ get_rq:
 	init_request_from_bio(req, bio);
 
 	if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) ||
-	    bio_flagged(bio, BIO_CPU_AFFINE)) {
-		req->cpu = blk_cpu_to_group(get_cpu());
-		put_cpu();
-	}
+	    bio_flagged(bio, BIO_CPU_AFFINE))
+		req->cpu = smp_processor_id();
 
 	plug = current->plug;
 	if (plug) {
diff --git a/block/blk-softirq.c b/block/blk-softirq.c
index ee9c216..475fab8 100644
--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -103,22 +103,25 @@ static struct notifier_block __cpuinitdata blk_cpu_notifier = {
 
 void __blk_complete_request(struct request *req)
 {
+	int ccpu, cpu, group_cpu = NR_CPUS;
 	struct request_queue *q = req->q;
 	unsigned long flags;
-	int ccpu, cpu, group_cpu;
 
 	BUG_ON(!q->softirq_done_fn);
 
 	local_irq_save(flags);
 	cpu = smp_processor_id();
-	group_cpu = blk_cpu_to_group(cpu);
 
 	/*
 	 * Select completion CPU
 	 */
-	if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && req->cpu != -1)
+	if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && req->cpu != -1) {
 		ccpu = req->cpu;
-	else
+		if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) {
+			ccpu = blk_cpu_to_group(ccpu);
+			group_cpu = blk_cpu_to_group(cpu);
+		}
+	} else
 		ccpu = cpu;
 
 	if (ccpu == cpu || ccpu == group_cpu) {
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index d935bd8..0ee17b5 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -244,8 +244,9 @@ static ssize_t queue_nomerges_store(struct request_queue *q, const char *page,
 static ssize_t queue_rq_affinity_show(struct request_queue *q, char *page)
 {
 	bool set = test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags);
+	bool force = test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags);
 
-	return queue_var_show(set, page);
+	return queue_var_show(set << force, page);
 }
 
 static ssize_t
@@ -257,10 +258,14 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)
 
 	ret = queue_var_store(&val, page, count);
 	spin_lock_irq(q->queue_lock);
-	if (val)
+	if (val) {
 		queue_flag_set(QUEUE_FLAG_SAME_COMP, q);
-	else
-		queue_flag_clear(QUEUE_FLAG_SAME_COMP,  q);
+		if (val == 2)
+			queue_flag_set(QUEUE_FLAG_SAME_FORCE, q);
+	} else {
+		queue_flag_clear(QUEUE_FLAG_SAME_COMP, q);
+		queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q);
+	}
 	spin_unlock_irq(q->queue_lock);
 #endif
 	return ret;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 1a23722..b228825 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -393,7 +393,7 @@ struct request_queue
 #define QUEUE_FLAG_ELVSWITCH	6	/* don't use elevator, just do FIFO */
 #define QUEUE_FLAG_BIDI		7	/* queue supports bidi requests */
 #define QUEUE_FLAG_NOMERGES     8	/* disable merge attempts */
-#define QUEUE_FLAG_SAME_COMP	9	/* force complete on same CPU */
+#define QUEUE_FLAG_SAME_COMP	9	/* complete on same CPU-group */
 #define QUEUE_FLAG_FAIL_IO     10	/* fake timeout */
 #define QUEUE_FLAG_STACKABLE   11	/* supports request stacking */
 #define QUEUE_FLAG_NONROT      12	/* non-rotational device (SSD) */
@@ -403,6 +403,7 @@ struct request_queue
 #define QUEUE_FLAG_NOXMERGES   15	/* No extended merges */
 #define QUEUE_FLAG_ADD_RANDOM  16	/* Contributes to random pool */
 #define QUEUE_FLAG_SECDISCARD  17	/* supports SECDISCARD */
+#define QUEUE_FLAG_SAME_FORCE  18	/* force complete on same CPU */
 
 #define QUEUE_FLAG_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) |		\
 				 (1 << QUEUE_FLAG_STACKABLE)	|	\

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ