Date:   Fri,  9 Sep 2016 10:59:32 +0100
From:   Mel Gorman <mgorman@...hsingularity.net>
To:     LKML <linux-kernel@...r.kernel.org>, Linux-MM <linux-mm@...ck.org>,
        Mel Gorman <mgorman@...hsingularity.net>
Cc:     Dave Chinner <david@...morbit.com>,
        Linus Torvalds <torvalds@...ux-foundation.org>,
        Ying Huang <ying.huang@...el.com>,
        Michal Hocko <mhocko@...nel.org>
Subject: [PATCH 1/4] mm, vmscan: Batch removal of mappings under a single lock during reclaim

When pages are removed from their mapping during reclaim, the
mapping->tree_lock is acquired and released for every single page. There
are two cases where pages at the tail of the LRU are likely to share the
same mapping -- large amounts of IO to/from a single file, and swapping.
This patch acquires the mapping->tree_lock once for multiple page removals.
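
(Illustration only, not part of the patch: the change amounts to moving
from one lock/unlock per removal to one lock/unlock per run of pages that
share a mapping, with the freeing work deferred until the lock is dropped.
A minimal userspace sketch of that pattern, with a pthread mutex standing
in for mapping->tree_lock and a plain array standing in for the mapping:)

	/* Sketch of per-item vs batched locking; stand-ins, not kernel code. */
	#include <pthread.h>
	#include <stdio.h>

	#define NITEMS 8

	struct item { int id; int removed; };

	static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;
	static struct item items[NITEMS];

	/* Old pattern: acquire/release the lock for every single item. */
	static void remove_one_at_a_time(void)
	{
		for (int i = 0; i < NITEMS; i++) {
			pthread_mutex_lock(&tree_lock);
			items[i].removed = 1;
			pthread_mutex_unlock(&tree_lock);
		}
	}

	/*
	 * New pattern: one acquire/release covers a whole batch that shares
	 * the same lock; the expensive finalisation (freeing) happens after
	 * the lock is dropped, as finalise_remove_mapping_list() does below.
	 */
	static void remove_batched(void)
	{
		pthread_mutex_lock(&tree_lock);
		for (int i = 0; i < NITEMS; i++)
			items[i].removed = 1;
		pthread_mutex_unlock(&tree_lock);
		/* ... free the removed items here, outside the lock ... */
	}

	int main(void)
	{
		remove_one_at_a_time();
		remove_batched();
		printf("last item removed: %d\n", items[NITEMS - 1].removed);
		return 0;
	}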

To trigger heavy swapping, varying numbers of usemem instances were used to
read anonymous memory larger than the physical memory size. A UMA machine
was used with 4 fake NUMA nodes to increase interference from kswapd. The
swap device was backed by ramdisk using the brd driver. NUMA balancing
was disabled to limit interference.
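
(For reference, the swap workload is roughly equivalent to the userspace
sketch below; the actual usemem invocation comes from mmtests and its
options and sizes are not reproduced here, so the values are placeholders:)

	/*
	 * Rough stand-in for a usemem-style run: map anonymous memory larger
	 * than RAM, dirty it and read it back so the tail of the LRU is
	 * dominated by swap-backed pages. The size is a placeholder.
	 */
	#include <stdio.h>
	#include <string.h>
	#include <sys/mman.h>

	int main(void)
	{
		size_t size = 4UL << 30;	/* pick something larger than RAM */
		char *mem = mmap(NULL, size, PROT_READ | PROT_WRITE,
				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		unsigned long sum = 0;

		if (mem == MAP_FAILED) {
			perror("mmap");
			return 1;
		}

		/* Dirty every page so it is swap-backed rather than zero-filled. */
		memset(mem, 1, size);

		/* Read it back; with size > RAM this forces reclaim and swap. */
		for (size_t i = 0; i < size; i += 4096)
			sum += mem[i];

		printf("sum=%lu\n", sum);
		munmap(mem, size);
		return 0;
	}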

                              4.8.0-rc5             4.8.0-rc5
                                vanilla              batch-v1
Amean    System-1      260.53 (  0.00%)      192.98 ( 25.93%)
Amean    System-3      179.59 (  0.00%)      198.33 (-10.43%)
Amean    System-5      205.71 (  0.00%)      105.22 ( 48.85%)
Amean    System-7      146.46 (  0.00%)       97.79 ( 33.23%)
Amean    System-8      275.37 (  0.00%)      149.39 ( 45.75%)
Amean    Elapsd-1      292.89 (  0.00%)      219.95 ( 24.90%)
Amean    Elapsd-3       69.47 (  0.00%)       79.02 (-13.74%)
Amean    Elapsd-5       54.12 (  0.00%)       29.88 ( 44.79%)
Amean    Elapsd-7       34.28 (  0.00%)       24.06 ( 29.81%)
Amean    Elapsd-8       57.98 (  0.00%)       33.34 ( 42.50%)

System is system CPU usage and Elapsd is the time to complete the
workload. Regardless of the thread count, the workload generally completes
faster, although there is a lot of variability as much more work is being
done under a single lock.

xfs_io and pwrite were used to rewrite a file multiple times to measure
any reduction in locking overhead.
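
(The actual test is driven through xfs_io by an mmtests configuration; as
a rough userspace equivalent, and assuming a plain sequential rewrite, the
loop looks something like the sketch below, with placeholder file name,
file size and block size rather than the values used for the results:)

	/*
	 * Rough stand-in for the rewrite workload: pwrite() the same file
	 * repeatedly so page cache removal under mapping->tree_lock is
	 * exercised during reclaim.
	 */
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		const size_t bufsz = 1 << 20;		/* 1MB writes */
		const off_t filesz = 1L << 30;		/* 1GB file */
		static char buf[1 << 20];
		int fd = open("testfile", O_CREAT | O_WRONLY, 0644);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		memset(buf, 'x', bufsz);

		/* Rewrite the whole file several times. */
		for (int pass = 0; pass < 4; pass++)
			for (off_t off = 0; off < filesz; off += bufsz)
				if (pwrite(fd, buf, bufsz, off) != (ssize_t)bufsz) {
					perror("pwrite");
					close(fd);
					return 1;
				}

		close(fd);
		return 0;
	}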

xfsio Time
                                                        4.8.0-rc5             4.8.0-rc5
                                                          vanilla           batch-v1r18
Amean    pwrite-single-rewrite-async-System       49.19 (  0.00%)       49.49 ( -0.60%)
Amean    pwrite-single-rewrite-async-Elapsd      322.87 (  0.00%)      322.72 (  0.05%)

Unfortunately, the difference here is well within the noise as the workload
is dominated by the cost of the IO. It may be the case that the benefit
is noticeable on faster storage or in KVM instances where the data may be
resident in the page cache of the host.

Signed-off-by: Mel Gorman <mgorman@...hsingularity.net>
---
 mm/vmscan.c | 170 ++++++++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 142 insertions(+), 28 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index b1e12a1ea9cf..f7beb573a594 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -622,18 +622,47 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
 }
 
 /*
+ * Finalise the mapping removal without the mapping lock held. The pages
+ * are placed on the free_list and the caller is expected to drop the
+ * final reference.
+ */
+static void finalise_remove_mapping_list(struct list_head *swapcache,
+				    struct list_head *filecache,
+				    void (*freepage)(struct page *),
+				    struct list_head *free_list)
+{
+	struct page *page;
+
+	list_for_each_entry(page, swapcache, lru) {
+		swp_entry_t swap = { .val = page_private(page) };
+		swapcache_free(swap);
+		set_page_private(page, 0);
+	}
+
+	list_for_each_entry(page, filecache, lru)
+		freepage(page);
+
+	list_splice_init(swapcache, free_list);
+	list_splice_init(filecache, free_list);
+}
+
+enum remove_mapping {
+	REMOVED_FAIL,
+	REMOVED_SWAPCACHE,
+	REMOVED_FILECACHE
+};
+
+/*
  * Same as remove_mapping, but if the page is removed from the mapping, it
  * gets returned with a refcount of 0.
  */
-static int __remove_mapping(struct address_space *mapping, struct page *page,
-			    bool reclaimed)
+static enum remove_mapping __remove_mapping(struct address_space *mapping,
+				struct page *page, bool reclaimed,
+				void (**freepage)(struct page *))
 {
-	unsigned long flags;
-
 	BUG_ON(!PageLocked(page));
 	BUG_ON(mapping != page_mapping(page));
 
-	spin_lock_irqsave(&mapping->tree_lock, flags);
 	/*
 	 * The non racy check for a busy page.
 	 *
@@ -668,16 +697,17 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
 	}
 
 	if (PageSwapCache(page)) {
-		swp_entry_t swap = { .val = page_private(page) };
+		unsigned long swapval = page_private(page);
+		swp_entry_t swap = { .val = swapval };
 		mem_cgroup_swapout(page, swap);
 		__delete_from_swap_cache(page);
-		spin_unlock_irqrestore(&mapping->tree_lock, flags);
-		swapcache_free(swap);
+		set_page_private(page, swapval);
+		return REMOVED_SWAPCACHE;
 	} else {
-		void (*freepage)(struct page *);
 		void *shadow = NULL;
 
-		freepage = mapping->a_ops->freepage;
+		*freepage = mapping->a_ops->freepage;
+
 		/*
 		 * Remember a shadow entry for reclaimed file cache in
 		 * order to detect refaults, thus thrashing, later on.
@@ -698,17 +728,76 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
 		    !mapping_exiting(mapping) && !dax_mapping(mapping))
 			shadow = workingset_eviction(mapping, page);
 		__delete_from_page_cache(page, shadow);
-		spin_unlock_irqrestore(&mapping->tree_lock, flags);
+		return REMOVED_FILECACHE;
+	}
 
-		if (freepage != NULL)
-			freepage(page);
+cannot_free:
+	return REMOVED_FAIL;
+}
+
+static unsigned long remove_mapping_list(struct list_head *mapping_list,
+					 struct list_head *free_pages,
+					 struct list_head *ret_pages)
+{
+	unsigned long flags;
+	struct address_space *mapping = NULL;
+	void (*freepage)(struct page *) = NULL;
+	LIST_HEAD(swapcache);
+	LIST_HEAD(filecache);
+	struct page *page, *tmp;
+	unsigned long nr_reclaimed = 0;
+
+continue_removal:
+	list_for_each_entry_safe(page, tmp, mapping_list, lru) {
+		/* Batch removals under one tree lock at a time */
+		if (mapping && page_mapping(page) != mapping)
+			continue;
+
+		list_del(&page->lru);
+		if (!mapping) {
+			mapping = page_mapping(page);
+			spin_lock_irqsave(&mapping->tree_lock, flags);
+		}
+
+		switch (__remove_mapping(mapping, page, true, &freepage)) {
+		case REMOVED_FILECACHE:
+			/*
+			 * At this point, we have no other references and there
+			 * is no way to pick any more up (removed from LRU,
+			 * removed from pagecache). Can use non-atomic bitops
+			 * now (and we obviously don't have to worry about
+			 * waking up a process  waiting on the page lock,
+			 * because there are no references.
+			 */
+			__ClearPageLocked(page);
+			if (freepage)
+				list_add(&page->lru, &filecache);
+			else
+				list_add(&page->lru, free_pages);
+			nr_reclaimed++;
+			break;
+		case REMOVED_SWAPCACHE:
+			/* See FILECACHE case as to why non-atomic is safe */
+			__ClearPageLocked(page);
+			list_add(&page->lru, &swapcache);
+			nr_reclaimed++;
+			break;
+		case REMOVED_FAIL:
+			unlock_page(page);
+			list_add(&page->lru, ret_pages);
+		}
 	}
 
-	return 1;
+	if (mapping) {
+		spin_unlock_irqrestore(&mapping->tree_lock, flags);
+		finalise_remove_mapping_list(&swapcache, &filecache, freepage, free_pages);
+		mapping = NULL;
+	}
 
-cannot_free:
-	spin_unlock_irqrestore(&mapping->tree_lock, flags);
-	return 0;
+	if (!list_empty(mapping_list))
+		goto continue_removal;
+
+	return nr_reclaimed;
 }
 
 /*
@@ -719,16 +808,42 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
  */
 int remove_mapping(struct address_space *mapping, struct page *page)
 {
-	if (__remove_mapping(mapping, page, false)) {
+	unsigned long flags;
+	LIST_HEAD(swapcache);
+	LIST_HEAD(filecache);
+	void (*freepage)(struct page *) = NULL;
+	swp_entry_t swap;
+	int ret = 0;
+
+	spin_lock_irqsave(&mapping->tree_lock, flags);
+	freepage = mapping->a_ops->freepage;
+	ret = __remove_mapping(mapping, page, false, &freepage);
+	spin_unlock_irqrestore(&mapping->tree_lock, flags);
+
+	if (ret != REMOVED_FAIL) {
 		/*
 		 * Unfreezing the refcount with 1 rather than 2 effectively
 		 * drops the pagecache ref for us without requiring another
 		 * atomic operation.
 		 */
 		page_ref_unfreeze(page, 1);
+	}
+
+	switch (ret) {
+	case REMOVED_FILECACHE:
+		if (freepage)
+			freepage(page);
+		return 1;
+	case REMOVED_SWAPCACHE:
+		swap.val = page_private(page);
+		swapcache_free(swap);
+		set_page_private(page, 0);
 		return 1;
+	case REMOVED_FAIL:
+		return 0;
 	}
-	return 0;
+
+	BUG();
 }
 
 /**
@@ -910,6 +1025,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 {
 	LIST_HEAD(ret_pages);
 	LIST_HEAD(free_pages);
+	LIST_HEAD(mapping_pages);
 	int pgactivate = 0;
 	unsigned long nr_unqueued_dirty = 0;
 	unsigned long nr_dirty = 0;
@@ -1206,17 +1322,14 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		}
 
 lazyfree:
-		if (!mapping || !__remove_mapping(mapping, page, true))
+		if (!mapping)
 			goto keep_locked;
 
-		/*
-		 * At this point, we have no other references and there is
-		 * no way to pick any more up (removed from LRU, removed
-		 * from pagecache). Can use non-atomic bitops now (and
-		 * we obviously don't have to worry about waking up a process
-		 * waiting on the page lock, because there are no references.
-		 */
-		__ClearPageLocked(page);
+		list_add(&page->lru, &mapping_pages);
+		if (ret == SWAP_LZFREE)
+			count_vm_event(PGLAZYFREED);
+		continue;
+
 free_it:
 		if (ret == SWAP_LZFREE)
 			count_vm_event(PGLAZYFREED);
@@ -1251,6 +1364,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
 	}
 
+	nr_reclaimed += remove_mapping_list(&mapping_pages, &free_pages, &ret_pages);
 	mem_cgroup_uncharge_list(&free_pages);
 	try_to_unmap_flush();
 	free_hot_cold_page_list(&free_pages, true);
-- 
2.6.4
