linux-kernel - [PATCH v7 7/6] fs/epoll: scale nested callbacks

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20171013154543.GI5131@linux-80c1.suse>
Date:   Fri, 13 Oct 2017 08:45:43 -0700
From:   Davidlohr Bueso <dave@...olabs.net>
To:     Waiman Long <longman@...hat.com>
Cc:     Alexander Viro <viro@...iv.linux.org.uk>, Jan Kara <jack@...e.com>,
        Jeff Layton <jlayton@...chiereds.net>,
        "J. Bruce Fields" <bfields@...ldses.org>,
        Tejun Heo <tj@...nel.org>,
        Christoph Lameter <cl@...ux-foundation.org>,
        linux-fsdevel@...r.kernel.org, linux-kernel@...r.kernel.org,
        Ingo Molnar <mingo@...hat.com>,
        Peter Zijlstra <peterz@...radead.org>,
        Andi Kleen <andi@...stfloor.org>,
        Dave Chinner <dchinner@...hat.com>,
        Boqun Feng <boqun.feng@...il.com>, akpm@...ux-foundation.org
Subject: [PATCH v7 7/6] fs/epoll: scale nested callbacks

A customer reported massive contention on the ncalls->lock in which
the workload is designed around nested epolls (where the fd is already
an epoll).

 83.49%  [kernel]               [k] __pv_queued_spin_lock_slowpath
  2.70%  [kernel]               [k] ep_call_nested.constprop.13
  1.94%  [kernel]               [k] _raw_spin_lock_irqsave
  1.83%  [kernel]               [k] __raw_callee_save___pv_queued_spin_unlock
  1.45%  [kernel]               [k] _raw_spin_unlock_irqrestore
  0.41%  [kernel]               [k] ep_scan_ready_list.isra.8
  0.36%  [kernel]               [k] pvclock_clocksource_read

The application running on kvm, is using a shared memory IPC communication
with a pipe wakeup mechanism, and a two level dispatcher both built around
'epoll_wait'. There is one process per physical core and a full mesh of pipes
between them, so on a 18 core system (the actual case), there are 18*18 pipes
for the IPCs alone.

This patch proposes replacing the nested calls global linked list with a dlock
interface, for which we can benefit from pcpu lists when doing ep_poll_safewake(),
and hashing for the current task, which provides two benefits:

1. Speeds up the process of loop and max-depth checks from O(N) lookups to O(1)
   (albeit possible collisions, which we have to iterate); and,

2. Provides smaller lock granularity.

cpus	before		after	   diff
1	1409370		1344804     -4.58%
2	1015690		1337053     31.63%
3	 721009		1273254     76.59%
4	 380959		1128931    196.33%
5	 287603		1028362    257.56%
6	 221104		 894943    304.76%
7	 173592		 976395    462.46%
8	 145860		 922584    532.51%
9	 127877		 925900    624.05%
10	 112603		 791456    602.87%
11	  97926		 724296    639.63%
12	  80732		 730485    804.82%

With the exception of a single cpu, where the overhead of jhashing influences), we
get some pretty good raw throughput increase. Similarly, the amount of time spent
decreases immensely as well.

Passes ltp related testcases.

Signed-off-by: Davidlohr Bueso <dbueso@...e.de>
---
 fs/eventpoll.c | 88 +++++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 53 insertions(+), 35 deletions(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 2fabd19cdeea..675c97fdc5da 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -22,7 +22,7 @@
 #include <linux/slab.h>
 #include <linux/poll.h>
 #include <linux/string.h>
-#include <linux/list.h>
+#include <linux/dlock-list.h>
 #include <linux/hash.h>
 #include <linux/spinlock.h>
 #include <linux/syscalls.h>
@@ -119,7 +119,7 @@ struct epoll_filefd {
  * and loop cycles.
  */
 struct nested_call_node {
-	struct list_head llink;
+	struct dlock_list_node llink;
 	void *cookie;
 	void *ctx;
 };
@@ -129,8 +129,7 @@ struct nested_call_node {
  * maximum recursion dept and loop cycles.
  */
 struct nested_calls {
-	struct list_head tasks_call_list;
-	spinlock_t lock;
+	struct dlock_list_heads tasks_call_list;
 };
 
 /*
@@ -371,10 +370,14 @@ static inline int ep_op_has_event(int op)
 }
 
 /* Initialize the poll safe wake up structure */
-static void ep_nested_calls_init(struct nested_calls *ncalls)
+static inline int ep_nested_calls_init(struct nested_calls *ncalls)
+{
+	return alloc_dlock_list_heads(&ncalls->tasks_call_list, true);
+}
+
+static inline void ep_nested_calls_free(struct nested_calls *ncalls)
 {
-	INIT_LIST_HEAD(&ncalls->tasks_call_list);
-	spin_lock_init(&ncalls->lock);
+	free_dlock_list_heads(&ncalls->tasks_call_list);
 }
 
 /**
@@ -483,45 +486,50 @@ static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
 {
 	int error, call_nests = 0;
 	unsigned long flags;
-	struct list_head *lsthead = &ncalls->tasks_call_list;
-	struct nested_call_node *tncur;
-	struct nested_call_node tnode;
+	struct dlock_list_head *head;
+	struct nested_call_node *tncur, tnode;
 
-	spin_lock_irqsave(&ncalls->lock, flags);
+	head = dlock_list_hash(&ncalls->tasks_call_list, ctx);
 
+	spin_lock_irqsave(&head->lock, flags);
 	/*
 	 * Try to see if the current task is already inside this wakeup call.
-	 * We use a list here, since the population inside this set is always
-	 * very much limited.
+	 *
+	 * We use a chained hash table with linked lists here, and take the
+	 * lock to avoid racing when collisions (where ctx pointer is the key).
+	 * Calls for which context is the cpu number, avoid hashing and act as
+	 * pcpu add/removal.
+	 *
+	 * Since the population inside this set is always very much limited,
+	 * list scanning should be short.
 	 */
-	list_for_each_entry(tncur, lsthead, llink) {
-		if (tncur->ctx == ctx &&
-		    (tncur->cookie == cookie || ++call_nests > max_nests)) {
-			/*
-			 * Ops ... loop detected or maximum nest level reached.
-			 * We abort this wake by breaking the cycle itself.
-			 */
-			error = -1;
-			goto out_unlock;
-		}
-	}
+	list_for_each_entry(tncur, &head->list, llink.list) {
+	       if (tncur->ctx == ctx &&
+		   (tncur->cookie == cookie || ++call_nests > EP_MAX_NESTS)) {
+		       /*
+			* Ops ... loop detected or maximum nest level reached.
+			* We abort this wake by breaking the cycle itself.
+			*/
+		       error = -1;
+		       spin_unlock_irqrestore(&head->lock, flags);
+		       goto done;
+	       }
+       }
+
 
 	/* Add the current task and cookie to the list */
 	tnode.ctx = ctx;
 	tnode.cookie = cookie;
-	list_add(&tnode.llink, lsthead);
-
-	spin_unlock_irqrestore(&ncalls->lock, flags);
+	tnode.llink.head = head;
+	list_add(&tnode.llink.list, &head->list);
+	spin_unlock_irqrestore(&head->lock, flags);
 
 	/* Call the nested function */
 	error = (*nproc)(priv, cookie, call_nests);
 
 	/* Remove the current task from the list */
-	spin_lock_irqsave(&ncalls->lock, flags);
-	list_del(&tnode.llink);
-out_unlock:
-	spin_unlock_irqrestore(&ncalls->lock, flags);
-
+	dlock_lists_del(&tnode.llink);
+done:
 	return error;
 }
 
@@ -2313,13 +2321,16 @@ static int __init eventpoll_init(void)
 	 * Initialize the structure used to perform epoll file descriptor
 	 * inclusion loops checks.
 	 */
-	ep_nested_calls_init(&poll_loop_ncalls);
+	if (ep_nested_calls_init(&poll_loop_ncalls))
+		goto err;
 
 	/* Initialize the structure used to perform safe poll wait head wake ups */
-	ep_nested_calls_init(&poll_safewake_ncalls);
+	if (ep_nested_calls_init(&poll_safewake_ncalls))
+		goto err_free1;
 
 	/* Initialize the structure used to perform file's f_op->poll() calls */
-	ep_nested_calls_init(&poll_readywalk_ncalls);
+	if (ep_nested_calls_init(&poll_readywalk_ncalls))
+		goto err_free0;
 
 	/*
 	 * We can have many thousands of epitems, so prevent this from
@@ -2336,5 +2347,12 @@ static int __init eventpoll_init(void)
 			sizeof(struct eppoll_entry), 0, SLAB_PANIC, NULL);
 
 	return 0;
+
+err_free0:
+	ep_nested_calls_free(&poll_safewake_ncalls);
+err_free1:
+	ep_nested_calls_free(&poll_loop_ncalls);
+err:
+	return -ENOMEM;
 }
 fs_initcall(eventpoll_init);
-- 
2.12.0