Date:   Fri,  9 Sep 2016 10:59:34 +0100
From:   Mel Gorman <mgorman@...hsingularity.net>
To:     LKML <linux-kernel@...r.kernel.org>, Linux-MM <linux-mm@...ck.org>,
        Mel Gorman <mgorman@...hsingularity.net>
Cc:     Dave Chinner <david@...morbit.com>,
        Linus Torvalds <torvalds@...ux-foundation.org>,
        Ying Huang <ying.huang@...el.com>,
        Michal Hocko <mhocko@...nel.org>
Subject: [PATCH 3/4] mm, vmscan: Stall kswapd if contending on tree_lock

If there is a large reader/writer, it is possible for multiple kswapd instances
and the processes issuing IO to contend on a single mapping->tree_lock. This
patch causes all kswapd instances except one to back off if they contend on
tree_lock. A sleeping kswapd instance is woken once another has made progress.
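
For illustration only, here is a hedged userspace analogue of that scheme (it
is not the kernel code, which follows in the diff below): a few threads
contend on one lock, the first loser of a trylock claims a single "exclusive"
slot and presses on, and every other loser sleeps for up to 10ms on a wait
queue until progress is signalled. All names, thread counts and iteration
counts are invented, and the exclusive slot is released after every critical
section as a simplification, rather than only when a node becomes balanced.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <time.h>

#define NO_OWNER	(-1)
#define NTHREADS	4
#define ITERATIONS	100000

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t wait_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t contended_wait = PTHREAD_COND_INITIALIZER;
static atomic_int exclusive_id = NO_OWNER;
static long work_done;

/* Back off for ~10ms (the analogue of io_schedule_timeout(HZ/100)) */
static void wait_for_progress(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_REALTIME, &ts);
	ts.tv_nsec += 10 * 1000 * 1000;
	if (ts.tv_nsec >= 1000000000L) {
		ts.tv_sec++;
		ts.tv_nsec -= 1000000000L;
	}

	pthread_mutex_lock(&wait_lock);
	pthread_cond_timedwait(&contended_wait, &wait_lock, &ts);
	pthread_mutex_unlock(&wait_lock);
}

static void *reclaimer(void *arg)
{
	int id = (int)(long)arg;

	for (int i = 0; i < ITERATIONS; i++) {
		if (pthread_mutex_trylock(&tree_lock) != 0) {
			/* Contended: claim the exclusive slot or back off */
			int expected = NO_OWNER;

			if (!atomic_compare_exchange_strong(&exclusive_id,
							    &expected, id))
				wait_for_progress();

			pthread_mutex_lock(&tree_lock);
		}

		work_done++;		/* stand-in for reclaim work */
		pthread_mutex_unlock(&tree_lock);

		/* Progress made: drop the exclusive slot and wake sleepers */
		int owner = id;

		if (atomic_compare_exchange_strong(&exclusive_id,
						   &owner, NO_OWNER))
			pthread_cond_broadcast(&contended_wait);
	}
	return NULL;
}

int main(void)
{
	pthread_t threads[NTHREADS];

	for (long i = 0; i < NTHREADS; i++)
		pthread_create(&threads[i], NULL, reclaimer, (void *)i);
	for (int i = 0; i < NTHREADS; i++)
		pthread_join(threads[i], NULL);

	printf("total work done: %ld\n", work_done);
	return 0;
}

Build with something like "gcc -std=gnu11 -pthread backoff.c"; it only
demonstrates the back-off pattern, not the reclaim behaviour being measured.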

                              4.8.0-rc5             4.8.0-rc5
                       ramdisknonrot-v1          waitqueue-v1
Min      Elapsd-8       18.31 (  0.00%)       28.32 (-54.67%)
Amean    System-1      181.00 (  0.00%)      179.61 (  0.77%)
Amean    System-3       86.19 (  0.00%)       68.91 ( 20.05%)
Amean    System-5       67.43 (  0.00%)       93.09 (-38.05%)
Amean    System-7       89.55 (  0.00%)       90.98 ( -1.60%)
Amean    System-8      102.92 (  0.00%)      299.81 (-191.30%)
Amean    Elapsd-1      209.23 (  0.00%)      210.41 ( -0.57%)
Amean    Elapsd-3       36.93 (  0.00%)       33.89 (  8.25%)
Amean    Elapsd-5       19.52 (  0.00%)       25.19 (-29.08%)
Amean    Elapsd-7       21.93 (  0.00%)       18.45 ( 15.88%)
Amean    Elapsd-8       23.63 (  0.00%)       48.80 (-106.51%)

Note that, unlike the previous patches, this is not an unconditional win.
System CPU usage is generally higher because direct reclaim is used instead
of multiple competing kswapd instances. According to the stats, there is
10 times more direct reclaim scanning and reclaim activity, and overall
the workload takes longer to complete.

           4.8.0-rc5    4.8.0-rc5
    ramdisknonrot-v1 waitqueue-v1
User          473.24       462.40
System       3690.20      5127.32
Elapsed      2186.05      2364.08

The motivation for this patch was Dave Chinner reporting that an xfs_io
workload rewriting a single file spent a significant amount of time spinning
on the tree_lock. Local tests were inconclusive. On spinning storage, the
IO was so slow that the contention was not noticeable. When xfs_io is backed
by a ramdisk to simulate fast storage, it can be observed (a rough userspace
approximation of the workload is sketched after the numbers below):

                                                        4.8.0-rc5             4.8.0-rc5
                                                 ramdisknonrot-v1          waitqueue-v1
Min      pwrite-single-rewrite-async-System        3.12 (  0.00%)        3.06 (  1.92%)
Min      pwrite-single-rewrite-async-Elapsd        3.25 (  0.00%)        3.17 (  2.46%)
Amean    pwrite-single-rewrite-async-System        3.32 (  0.00%)        3.23 (  2.67%)
Amean    pwrite-single-rewrite-async-Elapsd        3.42 (  0.00%)        3.33 (  2.71%)

           4.8.0-rc5    4.8.0-rc5
    ramdisknonrot-v1 waitqueue-v1
User            9.06         8.76
System        402.67       392.31
Elapsed       416.91       406.29

That's roughly a 2.5% drop in system CPU usage overall. A test from Dave
Chinner with data to support or reject this patch would be highly desirable.
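
Purely to illustrate the kind of workload involved (the original report used
xfs_io; the path, file size, block size and pass count below are invented,
not the values from that report), a minimal single-file rewrite loop looks
something like this:

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

#define BLOCK_SIZE	(64 * 1024)
#define FILE_SIZE	(1L << 30)	/* 1GiB, illustrative only */
#define PASSES		8

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : "/mnt/ramdisk/testfile";
	char *buf;
	int fd;

	buf = malloc(BLOCK_SIZE);
	if (!buf)
		return 1;
	memset(buf, 0xaa, BLOCK_SIZE);

	fd = open(path, O_WRONLY | O_CREAT, 0644);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Rewrite the same file repeatedly through the page cache */
	for (int pass = 0; pass < PASSES; pass++) {
		for (off_t off = 0; off < FILE_SIZE; off += BLOCK_SIZE) {
			if (pwrite(fd, buf, BLOCK_SIZE, off) != BLOCK_SIZE) {
				perror("pwrite");
				return 1;
			}
		}
	}

	close(fd);
	free(buf);
	return 0;
}

Pointing it at a file on a ramdisk-backed (non-rotational) mount is only a
sketch of the reported workload, not the configuration that produced the
numbers above.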

Signed-off-by: Mel Gorman <mgorman@...hsingularity.net>
---
 mm/vmscan.c | 32 +++++++++++++++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index f7beb573a594..936070b0790e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -735,6 +735,9 @@ static enum remove_mapping __remove_mapping(struct address_space *mapping,
 	return REMOVED_FAIL;
 }
 
+static unsigned long kswapd_exclusive = NUMA_NO_NODE;
+static DECLARE_WAIT_QUEUE_HEAD(kswapd_contended_wait);
+
 static unsigned long remove_mapping_list(struct list_head *mapping_list,
 					 struct list_head *free_pages,
 					 struct list_head *ret_pages)
@@ -755,8 +758,28 @@ static unsigned long remove_mapping_list(struct list_head *mapping_list,
 
 		list_del(&page->lru);
 		if (!mapping) {
+			pg_data_t *pgdat = page_pgdat(page);
 			mapping = page_mapping(page);
-			spin_lock_irqsave(&mapping->tree_lock, flags);
+
+			/* Account for trylock contentions in kswapd */
+			if (!current_is_kswapd() ||
+			    pgdat->node_id == kswapd_exclusive) {
+				spin_lock_irqsave(&mapping->tree_lock, flags);
+			} else {
+				/* Account for contended pages and contended kswapds */
+				if (!spin_trylock_irqsave(&mapping->tree_lock, flags)) {
+					/* Stall kswapd once for 10ms on contention */
+					if (cmpxchg(&kswapd_exclusive, NUMA_NO_NODE, pgdat->node_id) != NUMA_NO_NODE) {
+						DEFINE_WAIT(wait);
+						prepare_to_wait(&kswapd_contended_wait,
+							&wait, TASK_INTERRUPTIBLE);
+						io_schedule_timeout(HZ/100);
+						finish_wait(&kswapd_contended_wait, &wait);
+					}
+
+					spin_lock_irqsave(&mapping->tree_lock, flags);
+				}
+			}
 		}
 
 		switch (__remove_mapping(mapping, page, true, &freepage)) {
@@ -3212,6 +3235,7 @@ static void age_active_anon(struct pglist_data *pgdat,
 static bool zone_balanced(struct zone *zone, int order, int classzone_idx)
 {
 	unsigned long mark = high_wmark_pages(zone);
+	unsigned long nid;
 
 	if (!zone_watermark_ok_safe(zone, order, mark, classzone_idx))
 		return false;
@@ -3223,6 +3247,12 @@ static bool zone_balanced(struct zone *zone, int order, int classzone_idx)
 	clear_bit(PGDAT_CONGESTED, &zone->zone_pgdat->flags);
 	clear_bit(PGDAT_DIRTY, &zone->zone_pgdat->flags);
 
+	nid = zone->zone_pgdat->node_id;
+	if (nid == kswapd_exclusive) {
+		cmpxchg(&kswapd_exclusive, nid, NUMA_NO_NODE);
+		wake_up_interruptible(&kswapd_contended_wait);
+	}
+
 	return true;
 }
 
-- 
2.6.4
