lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:	Mon, 13 May 2013 11:10:47 +0900
From:	Minchan Kim <minchan@...nel.org>
To:	Andrew Morton <akpm@...ux-foundation.org>
Cc:	linux-mm@...ck.org, linux-kernel@...r.kernel.org,
	Rik van Riel <riel@...hat.com>, Mel Gorman <mgorman@...e.de>,
	Johannes Weiner <hannes@...xchg.org>,
	Kamezawa Hiroyuki <kamezawa.hiroyu@...fujitsu.com>,
	Michal Hocko <mhocko@...e.cz>, Hugh Dickins <hughd@...gle.com>,
	Minchan Kim <minchan@...nel.org>
Subject: [RFC 3/4] mm: support remove_mapping in irqcontext

This patch makes it possible to use remove_mapping in irq context.
To that end, it adds an irqcontext argument to some functions
(e.g., remove_mapping and __remove_mapping), but these functions
are not on a hot path, so I believe it's not a problem.

It also exports swap_info_get and checks that we can take the
swap_info_struct->lock in advance in irq context,
because __swapcache_free must be successful.
Otherwise, if __swapcache_free could fail, we would have to roll back
the things done by __delete_from_swap_cache, and that could require
memory allocation for a radix tree node in irq context.

A bigger concern is the handling of mem_cgroup_uncharge_swapcache in
irq context, which isn't aware of irq context at the moment and
should be successful for the same reason as above.

After reviewing that code, I think it's not a big challenge, unless I
missed something.

My rough plan is as follows:

1. Make mctz->lock aware of irq by replacing spin_lock with
   spin_lock_irqsave.
2. Introduce a new argument "locked" in __mem_cgroup_uncharge_common
   so that __mem_cgroup_uncharge_common can avoid lock_page_cgroup in
   irq context to avoid deadlock; a caller in irq context should hold
   it in advance (done by the next patch).
3. Introduce try_lock_page_cgroup, which will be used by __swapcache_free.
4. __remove_mapping can hold the page_cgroup lock in advance before calling
   __swapcache_free.

I'd like to listen memcg people's opinions before diving into coding.

Signed-off-by: Minchan Kim <minchan@...nel.org>
---
 fs/splice.c          |  2 +-
 include/linux/swap.h | 12 ++++++++++-
 mm/swapfile.c        |  2 +-
 mm/truncate.c        |  2 +-
 mm/vmscan.c          | 56 +++++++++++++++++++++++++++++++++++++++++++---------
 5 files changed, 61 insertions(+), 13 deletions(-)

diff --git a/fs/splice.c b/fs/splice.c
index e6b2559..db77694 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -70,7 +70,7 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
 		 * If we succeeded in removing the mapping, set LRU flag
 		 * and return good.
 		 */
-		if (remove_mapping(mapping, page)) {
+		if (remove_mapping(mapping, page, false)) {
 			buf->flags |= PIPE_BUF_FLAG_LRU;
 			return 0;
 		}
diff --git a/include/linux/swap.h b/include/linux/swap.h
index ca031f7..eb126d2 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -274,7 +274,8 @@ extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
 						unsigned long *nr_scanned);
 extern unsigned long shrink_all_memory(unsigned long nr_pages);
 extern int vm_swappiness;
-extern int remove_mapping(struct address_space *mapping, struct page *page);
+extern int remove_mapping(struct address_space *mapping, struct page *page,
+				bool irqcontext);
 extern unsigned long vm_total_pages;
 
 #ifdef CONFIG_NUMA
@@ -407,6 +408,9 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
 }
 #endif
 
+
+extern struct swap_info_struct *swap_info_get(swp_entry_t entry);
+
 #else /* CONFIG_SWAP */
 
 #define get_nr_swap_pages()			0L
@@ -430,6 +434,12 @@ static inline void show_swap_cache_info(void)
 #define free_swap_and_cache(swp)	is_migration_entry(swp)
 #define swapcache_prepare(swp)		is_migration_entry(swp)
 
+
+struct swap_info_struct *swap_info_get(swp_entry_t entry)
+{
+	return NULL;
+}
+
 static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask)
 {
 	return 0;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 33ebdd5..8a425d4 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -505,7 +505,7 @@ swp_entry_t get_swap_page_of_type(int type)
 	return (swp_entry_t) {0};
 }
 
-static struct swap_info_struct *swap_info_get(swp_entry_t entry)
+struct swap_info_struct *swap_info_get(swp_entry_t entry)
 {
 	struct swap_info_struct *p;
 	unsigned long offset, type;
diff --git a/mm/truncate.c b/mm/truncate.c
index c75b736..fa1dc60 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -131,7 +131,7 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
 	if (page_has_private(page) && !try_to_release_page(page, 0))
 		return 0;
 
-	ret = remove_mapping(mapping, page);
+	ret = remove_mapping(mapping, page, false);
 
 	return ret;
 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index fa6a853..d14c9be 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -450,12 +450,18 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
  * Same as remove_mapping, but if the page is removed from the mapping, it
  * gets returned with a refcount of 0.
  */
-static int __remove_mapping(struct address_space *mapping, struct page *page)
+static int __remove_mapping(struct address_space *mapping, struct page *page,
+				bool irqcontext)
 {
+	unsigned long flags;
+
 	BUG_ON(!PageLocked(page));
 	BUG_ON(mapping != page_mapping(page));
 
-	spin_lock_irq(&mapping->tree_lock);
+	if (irqcontext)
+		spin_lock_irqsave(&mapping->tree_lock, flags);
+	else
+		spin_lock_irq(&mapping->tree_lock);
 	/*
 	 * The non racy check for a busy page.
 	 *
@@ -490,17 +496,45 @@ static int __remove_mapping(struct address_space *mapping, struct page *page)
 	}
 
 	if (PageSwapCache(page)) {
+		struct swap_info_struct *p;
 		swp_entry_t swap = { .val = page_private(page) };
+		p = swap_info_get(swap);
+		/*
+		 * If we are irq context, check that we can get a
+		 * swap_info_strcut->lock before removing the page from
+		 * swap cache. Because __swapcache_free must be successful.
+		 * If __swapcache_free can be failed, we should rollback
+		 * things done by __delete_from_swap_cache and it needs
+		 * memory allocation for radix tree node in irqcontext
+		 * That's thing we really want to avoid.
+		 * TODO : memcg mem_cgroup_uncharge_swapcache handling
+		 * in irqcontext
+		 */
+		if (irqcontext && p && !spin_trylock(&p->lock)) {
+			page_unfreeze_refs(page, 2);
+			goto cannot_free;
+		}
+
 		__delete_from_swap_cache(page);
-		spin_unlock_irq(&mapping->tree_lock);
-		swapcache_free(swap, page);
+		if (irqcontext) {
+			spin_unlock_irqrestore(&mapping->tree_lock, flags);
+			if (p)
+				__swapcache_free(p, swap, page);
+			spin_unlock(&p->lock);
+		} else {
+			spin_unlock_irq(&mapping->tree_lock);
+			swapcache_free(swap, page);
+		}
 	} else {
 		void (*freepage)(struct page *);
 
 		freepage = mapping->a_ops->freepage;
 
 		__delete_from_page_cache(page);
-		spin_unlock_irq(&mapping->tree_lock);
+		if (irqcontext)
+			spin_unlock_irqrestore(&mapping->tree_lock, flags);
+		else
+			spin_unlock_irq(&mapping->tree_lock);
 		mem_cgroup_uncharge_cache_page(page);
 
 		if (freepage != NULL)
@@ -510,7 +544,10 @@ static int __remove_mapping(struct address_space *mapping, struct page *page)
 	return 1;
 
 cannot_free:
-	spin_unlock_irq(&mapping->tree_lock);
+	if (irqcontext)
+		spin_unlock_irqrestore(&mapping->tree_lock, flags);
+	else
+		spin_unlock_irq(&mapping->tree_lock);
 	return 0;
 }
 
@@ -520,9 +557,10 @@ cannot_free:
  * successfully detached, return 1.  Assumes the caller has a single ref on
  * this page.
  */
-int remove_mapping(struct address_space *mapping, struct page *page)
+int remove_mapping(struct address_space *mapping, struct page *page,
+			bool irqcontext)
 {
-	if (__remove_mapping(mapping, page)) {
+	if (__remove_mapping(mapping, page, irqcontext)) {
 		/*
 		 * Unfreezing the refcount with 1 rather than 2 effectively
 		 * drops the pagecache ref for us without requiring another
@@ -904,7 +942,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 			}
 		}
 
-		if (!mapping || !__remove_mapping(mapping, page))
+		if (!mapping || !__remove_mapping(mapping, page, false))
 			goto keep_locked;
 
 		/*
-- 
1.8.2.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ