linux-kernel - Re: O_DIRECT question

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <afe668f90701110005ya2e8187pc6604c5aad24cc84@mail.gmail.com>
Date:	Thu, 11 Jan 2007 16:05:16 +0800
From:	"Roy Huang" <royhuang9@...il.com>
To:	Aubrey <aubreylee@...il.com>
Cc:	"Nick Piggin" <nickpiggin@...oo.com.au>,
	"Andrew Morton" <akpm@...l.org>,
	"Linus Torvalds" <torvalds@...l.org>,
	"Hua Zhong" <hzhong@...il.com>, "Hugh Dickins" <hugh@...itas.com>,
	linux-kernel@...r.kernel.org, hch@...radead.org,
	kenneth.w.chen@...el.com, mjt@....msk.ru
Subject: Re: O_DIRECT question

On an embedded system, limiting the page cache can relieve memory
fragmentation. Here is a patch against 2.6.19 which limits the page
cache of every opened file as well as the total page cache. When a limit
is reached, it releases the page cache pages that overrun the limit.


Index: include/linux/pagemap.h
===================================================================
--- include/linux/pagemap.h	(revision 2628)
+++ include/linux/pagemap.h	(working copy)
@@ -12,6 +12,7 @@
 #include <asm/uaccess.h>
 #include <linux/gfp.h>

+extern int total_pagecache_limit;
 /*
  * Bits in mapping->flags.  The lower __GFP_BITS_SHIFT bits are the page
  * allocation mode flags.
Index: include/linux/fs.h
===================================================================
--- include/linux/fs.h	(revision 2628)
+++ include/linux/fs.h	(working copy)
@@ -444,6 +444,10 @@
 	spinlock_t		private_lock;	/* for use by the address_space */
 	struct list_head	private_list;	/* ditto */
 	struct address_space	*assoc_mapping;	/* ditto */
+#ifdef CONFIG_LIMIT_PAGECACHE
+	unsigned long 		pages_limit;
+	struct list_head	page_head;
+#endif
 } __attribute__((aligned(sizeof(long))));
 	/*
 	 * On most architectures that alignment is already the case; but
Index: include/linux/mm.h
===================================================================
--- include/linux/mm.h	(revision 2628)
+++ include/linux/mm.h	(working copy)
@@ -231,6 +231,9 @@
 #else
 #define VM_BUG_ON(condition) do { } while(0)
 #endif
+#ifdef CONFIG_LIMIT_PAGECACHE
+	struct list_head page_list;
+#endif

 /*
  * Methods to modify the page usage count.
@@ -1030,7 +1033,21 @@

 /* mm/page-writeback.c */
 int write_one_page(struct page *page, int wait);
+/* possible outcome of pageout() */

+typedef enum {
+	/* failed to write page out, page is locked */
+	PAGE_KEEP,
+	/* move page to the active list, page is locked */
+	PAGE_ACTIVATE,
+	/* page has been sent to the disk successfully, page is unlocked */
+	PAGE_SUCCESS,
+	/* page is clean and locked */
+	PAGE_CLEAN,
+} pageout_t;
+
+pageout_t pageout(struct page *page, struct address_space *mapping);
+
 /* readahead.c */
 #define VM_MAX_READAHEAD	128	/* kbytes */
 #define VM_MIN_READAHEAD	16	/* kbytes (includes current page) */
Index: init/Kconfig
===================================================================
--- init/Kconfig	(revision 2628)
+++ init/Kconfig	(working copy)
@@ -419,6 +419,19 @@
 	  option replaces shmem and tmpfs with the much simpler ramfs code,
 	  which may be appropriate on small systems without swap.

+config LIMIT_PAGECACHE
+	bool "Limit page caches" if EMBEDDED
+
+config PAGECACHE_LIMIT
+	int "Page cache limit for every file in page unit"
+	depends on LIMIT_PAGECACHE
+	default 32
+
+config PAGECACHE_LIMIT_TOTAL
+	int "Total page cache limit in MB unit"
+	depends on LIMIT_PAGECACHE
+	default 10
+
 choice
        prompt "Page frame management algorithm"
        default BUDDY
Index: fs/inode.c
===================================================================
--- fs/inode.c	(revision 2628)
+++ fs/inode.c	(working copy)
@@ -205,6 +205,10 @@
 	INIT_LIST_HEAD(&inode->inotify_watches);
 	mutex_init(&inode->inotify_mutex);
 #endif
+#ifdef CONFIG_LIMIT_PAGECACHE
+	INIT_LIST_HEAD(&inode->i_data.page_head);
+	inode->i_data.pages_limit = CONFIG_PAGECACHE_LIMIT;
+#endif
 }

 EXPORT_SYMBOL(inode_init_once);
Index: mm/filemap.c
===================================================================
--- mm/filemap.c	(revision 2628)
+++ mm/filemap.c	(working copy)
@@ -18,6 +18,7 @@
 #include <linux/capability.h>
 #include <linux/kernel_stat.h>
 #include <linux/mm.h>
+#include <linux/mm_inline.h>
 #include <linux/swap.h>
 #include <linux/mman.h>
 #include <linux/pagemap.h>
@@ -30,6 +31,9 @@
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/cpuset.h>
+#include <linux/rmap.h>
+#include <linux/buffer_head.h>
+#include <linux/page-flags.h>
 #include "filemap.h"
 #include "internal.h"

@@ -119,6 +123,9 @@
 	radix_tree_delete(&mapping->page_tree, page->index);
 	page->mapping = NULL;
 	mapping->nrpages--;
+#ifdef CONFIG_LIMIT_PAGECACHE
+	list_del_init(&page->page_list);
+#endif
 	__dec_zone_page_state(page, NR_FILE_PAGES);
 }

@@ -169,6 +176,96 @@
 	return 0;
 }

+#ifdef CONFIG_LIMIT_PAGECACHE
+static void balance_cache(struct address_space *mapping)
+{
+	/* Release half of the pages */
+	int count ;
+	int nr_released = 0;
+	struct page *page;
+	struct zone *zone= NULL;
+	struct pagevec freed_pvec;
+	struct list_head ret_list;
+
+	count = mapping->nrpages /2;
+	pagevec_init(&freed_pvec, 0);
+	INIT_LIST_HEAD(&ret_list);
+	lru_add_drain();
+	while(count-->0) {
+		page = list_entry(mapping->page_head.prev, struct page, page_list);
+		zone = page_zone(page);
+		TestClearPageLRU(page);
+		if (PageActive(page))
+			del_page_from_active_list(zone, page);
+		else
+			del_page_from_inactive_list(zone, page);
+
+		list_del_init(&page->page_list); /* Remove from current process's page list */
+		get_page(page);
+
+		if (TestSetPageLocked(page))
+			goto __keep;
+		if (PageWriteback(page))
+			goto __keep_locked;
+		if (page_referenced(page, 1))
+			goto __keep_locked;
+		if (PageDirty(page)) {
+			switch(pageout(page, mapping)) {
+				case PAGE_KEEP:
+				case PAGE_ACTIVATE:
+					goto __keep_locked;
+				case PAGE_SUCCESS:
+					if (PageWriteback(page) || PageDirty(page))
+						goto __keep;
+					if (TestSetPageLocked(page))
+						goto __keep;
+					if (PageDirty(page) || PageWriteback(page))
+						goto __keep_locked;
+				case PAGE_CLEAN:
+					;
+			}
+		}
+
+		if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL))
+			goto __keep_locked;
+		if (!remove_mapping(mapping, page))
+			goto __keep_locked;
+
+		unlock_page(page);
+		nr_released++;
+		/* This page may be in Active LRU */
+		ClearPageActive(page);
+		ClearPageUptodate(page);
+		if (!pagevec_add(&freed_pvec, page))
+			__pagevec_release_nonlru(&freed_pvec);
+		continue;
+__keep_locked:
+		unlock_page(page);
+__keep:
+		SetPageLRU(page);
+		if (PageActive(page)) {
+			add_page_to_active_list(zone, page);
+		} else {
+			add_page_to_inactive_list(zone, page);
+		}
+
+		list_add(&page->page_list, &ret_list);
+	}
+	while(!list_empty(&ret_list)) {
+		page = list_entry(ret_list.prev, struct page, page_list);
+		list_move_tail(&page->page_list, &mapping->page_head);
+		put_page(page);
+	}
+	if (pagevec_count(&freed_pvec))
+		__pagevec_release_nonlru(&freed_pvec);
+
+	if (global_page_state(NR_FILE_PAGES) > total_pagecache_limit)
+		if (zone) {
+			wakeup_kswapd(zone, 0);
+		}
+}
+#endif
+
 /**
  * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
  * @mapping:	address space structure to write
@@ -448,6 +545,10 @@
 			page->mapping = mapping;
 			page->index = offset;
 			mapping->nrpages++;
+#ifdef CONFIG_LIMIT_PAGECACHE
+			list_add(&page->page_list, &mapping->page_head);
+#endif
+
 			__inc_zone_page_state(page, NR_FILE_PAGES);
 		}
 		write_unlock_irq(&mapping->tree_lock);
@@ -1085,6 +1186,10 @@
 		page_cache_release(cached_page);
 	if (filp)
 		file_accessed(filp);
+#ifdef CONFIG_LIMIT_PAGECACHE
+	if (mapping->nrpages >= mapping->pages_limit)
+		balance_cache(mapping);
+#endif	
 }
 EXPORT_SYMBOL(do_generic_mapping_read);

@@ -2195,6 +2300,11 @@
 	if (cached_page)
 		page_cache_release(cached_page);

+#ifdef CONFIG_LIMIT_PAGECACHE
+	if (mapping->nrpages >= mapping->pages_limit)
+		balance_cache(mapping);
+#endif
+	
 	/*
 	 * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC
 	 */
Index: mm/vmscan.c
===================================================================
--- mm/vmscan.c	(revision 2628)
+++ mm/vmscan.c	(working copy)
@@ -116,6 +116,7 @@

 static LIST_HEAD(shrinker_list);
 static DECLARE_RWSEM(shrinker_rwsem);
+int total_pagecache_limit = CONFIG_PAGECACHE_LIMIT_TOTAL * 1024 / 4;

 /*
  * Add a shrinker callback to be called from the vm
@@ -292,23 +293,11 @@
 	unlock_page(page);
 }

-/* possible outcome of pageout() */
-typedef enum {
-	/* failed to write page out, page is locked */
-	PAGE_KEEP,
-	/* move page to the active list, page is locked */
-	PAGE_ACTIVATE,
-	/* page has been sent to the disk successfully, page is unlocked */
-	PAGE_SUCCESS,
-	/* page is clean and locked */
-	PAGE_CLEAN,
-} pageout_t;
-
 /*
  * pageout is called by shrink_page_list() for each dirty page.
  * Calls ->writepage().
  */
-static pageout_t pageout(struct page *page, struct address_space *mapping)
+pageout_t pageout(struct page *page, struct address_space *mapping)
 {
 	/*
 	 * If the page is dirty, only perform writeback if that write
@@ -1328,7 +1317,11 @@
 			order = pgdat->kswapd_max_order;
 		}
 		finish_wait(&pgdat->kswapd_wait, &wait);
-		balance_pgdat(pgdat, order);
+		if (global_page_state(NR_FILE_PAGES) >= total_pagecache_limit)
+			balance_pgdat(pgdat, (global_page_state(NR_FILE_PAGES) \
+						- total_pagecache_limit), order);
+		else
+			balance_pgdat(pgdat, order);
 	}
 	return 0;
 }
@@ -1344,8 +1337,10 @@
 		return;

 	pgdat = zone->zone_pgdat;
-	if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0))
-		return;
+	if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0)) {
+		if (global_page_state(NR_FILE_PAGES) < total_pagecache_limit)
+			return;
+	}
 	if (pgdat->kswapd_max_order < order)
 		pgdat->kswapd_max_order = order;
 	if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/