lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <65795E11DBF1E645A09CEC7EAEE94B9C3A30A298@USINDEVS02.corp.hds.com>
Date:	Fri, 7 Jan 2011 17:07:06 -0500
From:	Satoru Moriya <satoru.moriya@....com>
To:	"linux-mm@...ck.org" <linux-mm@...ck.org>
CC:	"linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
	"linux-doc@...r.kernel.org" <linux-doc@...r.kernel.org>,
	"akpm@...ux-foundation.org" <akpm@...ux-foundation.org>,
	"mel@....ul.ie" <mel@....ul.ie>,
	"kosaki.motohiro@...fujitsu.com" <kosaki.motohiro@...fujitsu.com>,
	"rdunlap@...otime.net" <rdunlap@...otime.net>,
	"dle-develop@...ts.sourceforge.net" 
	<dle-develop@...ts.sourceforge.net>,
	Seiji Aguchi <seiji.aguchi@....com>
Subject: [RFC][PATCH 2/2] Make watermarks tunable separately

This patch introduces three new sysctls to /proc/sys/vm:
wmark_min_kbytes, wmark_low_kbytes and wmark_high_kbytes.

Each entry is used to compute watermark[min], watermark[low]
and watermark[high] for each zone.

These parameters are also updated when min_free_kbytes is
changed, because they are originally derived from min_free_kbytes.
Conversely, min_free_kbytes is updated when wmark_min_kbytes
changes.

By using the parameters one can adjust the difference among
watermark[min], watermark[low] and watermark[high] and as a result
one can tune the kernel reclaim behaviour to fit their requirement.

Signed-off-by: Satoru Moriya <satoru.moriya@....com>
---
 Documentation/sysctl/vm.txt |   37 +++++++++++++++
 include/linux/mmzone.h      |    6 ++
 kernel/sysctl.c             |   28 +++++++++++-
 mm/page_alloc.c             |  109 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 179 insertions(+), 1 deletions(-)

diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index e10b279..674681d 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -55,6 +55,9 @@ Currently, these files are in /proc/sys/vm:
 - stat_interval
 - swappiness
 - vfs_cache_pressure
+- wmark_high_kbytes
+- wmark_low_kbytes
+- wmark_min_kbytes
 - zone_reclaim_mode
 
 ==============================================================
@@ -360,6 +363,8 @@ become subtly broken, and prone to deadlock under high loads.
 
 Setting this too high will OOM your machine instantly.
 
+This is also updated when wmark_min_kbytes changes.
+
 =============================================================
 
 min_slab_ratio:
@@ -664,6 +669,38 @@ causes the kernel to prefer to reclaim dentries and inodes.
 
 ==============================================================
 
+wmark_high_kbytes
+
+Contains the amount of free memory above which kswapd stops reclaiming pages.
+
+The Linux VM uses this number to compute a watermark[WMARK_HIGH] value for
+each zone in the system. This is also updated when min_free_kbytes is updated.
+The minimum is wmark_low_kbytes.
+
+==============================================================
+
+wmark_low_kbytes
+
+Contains the amount of free memory below which kswapd starts to reclaim pages.
+
+The Linux VM uses this number to compute a watermark[WMARK_LOW] value for
+each zone in the system. This is also updated when min_free_kbytes changes.
+The minimum is wmark_min_kbytes and maximum is wmark_high_kbytes.
+
+==============================================================
+
+wmark_min_kbytes
+
+Contains the minimum amount of free memory that the Linux VM keeps. If the
+amount of free memory falls below it, the VM reclaims memory first and then
+allocates (except for PF_MEMALLOC allocations).
+
+The Linux VM uses this number to compute a watermark[WMARK_MIN] value for
+each lowmem zone in the system. This is also updated when min_free_kbytes is
+updated. The minimum is 0 and maximum is wmark_low_kbytes.
+
+==============================================================
+
 zone_reclaim_mode:
 
 Zone_reclaim_mode allows someone to set more or less aggressive approaches to
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 39c24eb..d2f4b40 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -771,6 +771,12 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *, int,
 extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1];
 int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int,
 					void __user *, size_t *, loff_t *);
+int wmark_min_kbytes_sysctl_handler(struct ctl_table *, int,
+					void __user *, size_t *, loff_t *);
+int wmark_low_kbytes_sysctl_handler(struct ctl_table *, int,
+					void __user *, size_t *, loff_t *);
+int wmark_high_kbytes_sysctl_handler(struct ctl_table *, int,
+					void __user *, size_t *, loff_t *);
 int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int,
 					void __user *, size_t *, loff_t *);
 int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ae5cbb1..060244d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -94,6 +94,7 @@ extern char core_pattern[];
 extern unsigned int core_pipe_limit;
 extern int pid_max;
 extern int min_free_kbytes;
+extern int wmark_min_kbytes, wmark_low_kbytes, wmark_high_kbytes;
 extern int pid_max_min, pid_max_max;
 extern int sysctl_drop_caches;
 extern int percpu_pagelist_fraction;
@@ -1326,7 +1327,32 @@ static struct ctl_table vm_table[] = {
 		.extra2		= &one,
 	},
 #endif
-
+	{
+		.procname	= "wmark_min_kbytes",
+		.data		= &wmark_min_kbytes,
+		.maxlen		= sizeof(wmark_min_kbytes),
+		.mode		= 0644,
+		.proc_handler	= wmark_min_kbytes_sysctl_handler,
+		.extra1		= &zero,
+		.extra2		= &wmark_low_kbytes,
+	},
+	{
+		.procname	= "wmark_low_kbytes",
+		.data		= &wmark_low_kbytes,
+		.maxlen		= sizeof(wmark_low_kbytes),
+		.mode		= 0644,
+		.proc_handler	= wmark_low_kbytes_sysctl_handler,
+		.extra1		= &wmark_min_kbytes,
+		.extra2		= &wmark_high_kbytes,
+	},
+	{
+		.procname	= "wmark_high_kbytes",
+		.data		= &wmark_high_kbytes,
+		.maxlen		= sizeof(wmark_high_kbytes),
+		.mode		= 0644,
+		.proc_handler	= wmark_high_kbytes_sysctl_handler,
+		.extra1		= &wmark_low_kbytes,
+	},
 /*
  * NOTE: do not add new entries to this table unless you have read
  * Documentation/sysctl/ctl_unnumbered.txt
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ff7e158..7cd9cbf 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -172,6 +172,9 @@ static char * const zone_names[MAX_NR_ZONES] = {
 };
 
 int min_free_kbytes = 1024;
+int wmark_min_kbytes = 1024;
+int wmark_low_kbytes = 1024;
+int wmark_high_kbytes = 1024;
 
 static unsigned long __meminitdata nr_kernel_pages;
 static unsigned long __meminitdata nr_all_pages;
@@ -4926,10 +4929,77 @@ void setup_per_zone_wmarks(void)
 		spin_unlock_irqrestore(&zone->lock, flags);
 	}
 
+	wmark_min_kbytes = min_free_kbytes;
+	wmark_low_kbytes = min_free_kbytes + (min_free_kbytes >> 2);
+	wmark_high_kbytes = min_free_kbytes + (min_free_kbytes >> 1);
+
 	/* update totalreserve_pages */
 	calculate_totalreserve_pages();
 }
 
+/**
+ * setup_per_zone_wmark - called when wmark_{min|low|high}_kbytes changes
+ *
+ * The watermark[min,low,high] values for each zone are set with respect
+ * to wmark_min_kbytes, wmark_low_kbytes and wmark_high_kbytes.
+ */
+void setup_per_zone_wmark(int wmark)
+{
+	unsigned long pages;
+	unsigned long lowmem_pages = 0;
+	struct zone *zone;
+	unsigned long flags;
+
+	switch (wmark) {
+	case WMARK_MIN:
+		pages = wmark_min_kbytes >> (PAGE_SHIFT - 10);
+		min_free_kbytes = wmark_min_kbytes;
+		break;
+	case WMARK_LOW:
+		pages = wmark_low_kbytes >> (PAGE_SHIFT - 10);
+		break;
+	case WMARK_HIGH:
+		pages = wmark_high_kbytes >> (PAGE_SHIFT - 10);
+		break;
+	default:
+		return;
+	}
+
+	/* Calculate total number of !ZONE_HIGHMEM pages */
+	for_each_zone(zone) {
+		if (!is_highmem(zone))
+			lowmem_pages += zone->present_pages;
+	}
+
+	for_each_zone(zone) {
+		u64 tmp;
+
+		spin_lock_irqsave(&zone->lock, flags);
+		tmp = (u64)pages * zone->present_pages;
+		do_div(tmp, lowmem_pages);
+
+		if (wmark == WMARK_MIN && is_highmem(zone)) {
+			int min_pages;
+
+			min_pages = zone->present_pages / 1024;
+			if (min_pages < SWAP_CLUSTER_MAX)
+				min_pages = SWAP_CLUSTER_MAX;
+			if (min_pages > 128)
+				min_pages = 128;
+			zone->watermark[wmark] = min_pages;
+		} else {
+			zone->watermark[wmark] = tmp;
+		}
+
+		if (wmark == WMARK_MIN)
+			setup_zone_migrate_reserve(zone);
+		spin_unlock_irqrestore(&zone->lock, flags);
+	}
+	
+	if (wmark == WMARK_HIGH)
+		calculate_totalreserve_pages();
+}
+
 /*
  * The inactive anon list should be small enough that the VM never has to
  * do too much work, but large enough that each inactive page has a chance
@@ -5029,6 +5099,45 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
 	return 0;
 }
 
+int wmark_min_kbytes_sysctl_handler(ctl_table *table, int write,
+	void __user *buffer, size_t *length, loff_t *ppos)
+{
+	int ret;
+	
+	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
+	if (ret < 0 || !write)
+		return ret;
+
+	setup_per_zone_wmark(WMARK_MIN);
+	return ret;
+}
+
+int wmark_low_kbytes_sysctl_handler(ctl_table *table, int write,
+	void __user *buffer, size_t *length, loff_t *ppos)
+{
+	int ret;
+
+	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
+	if (ret < 0 || !write)
+		return ret;
+
+	setup_per_zone_wmark(WMARK_LOW);
+	return ret;
+}
+
+int wmark_high_kbytes_sysctl_handler(ctl_table *table, int write,
+	void __user *buffer, size_t *length, loff_t *ppos)
+{
+	int ret;
+
+	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
+	if (ret < 0 || !write)
+		return ret;
+
+	setup_per_zone_wmark(WMARK_HIGH);
+	return ret;
+}
+
 #ifdef CONFIG_NUMA
 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
 	void __user *buffer, size_t *length, loff_t *ppos)
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ