lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-Id: <20230406091450.167779-1-shaun.tancheff@gmail.com>
Date:   Thu,  6 Apr 2023 16:14:50 +0700
From:   Shaun Tancheff <shaun.tancheff@...il.com>
To:     Johannes Weiner <hannes@...xchg.org>,
        Michal Hocko <mhocko@...nel.org>,
        Vladimir Davydov <vdavydov.dev@...il.com>
Cc:     Shaun Tancheff <shaun.tancheff@....com>,
        Andrew Morton <akpm@...ux-foundation.org>,
        cgroups@...r.kernel.org, linux-mm@...ck.org,
        linux-kernel@...r.kernel.org
Subject: [PATCH] memcg: Default value setting in memcg-v1

From: Shaun Tancheff <shaun.tancheff@....com>

Setting min, low and high values with memcg-v1
provides benefits for users that are unable to update
to memcg-v2.

The min, low and high values can be set in memcg-v1
to apply enough memory pressure to effectively throttle
filesystem I/O without hitting memcg oom.

This can be enabled by setting the sysctl values:
  vm.memcg_v1_min_default
  vm.memcg_v1_low_default
  vm.memcg_v1_high_default

When a memory control group is newly created the
min, low and high values are set to a percentage of the
maximum based on the min, low and high default
values respectively.

This resolves an issue with memory pressure when users
initiate unbounded I/O on various file systems such as
ext4, XFS and NFS.

Signed-off-by: Shaun Tancheff <shaun.tancheff@....com>
---
v0: Initial hard coded limits by percent.
v1: Added sysfs access and module parameters for percent values to enable
v2: Fix 32-bit, remove need for missing __udivdi3
v3: Added sysctl parameters and documentation
 .../admin-guide/cgroup-v1/memory.rst          | 33 +++++++++
 Documentation/admin-guide/sysctl/vm.rst       | 33 +++++++++
 include/linux/memcontrol.h                    |  5 ++
 kernel/sysctl.c                               | 29 ++++++++
 mm/memcontrol.c                               | 69 ++++++++++++++++++-
 5 files changed, 168 insertions(+), 1 deletion(-)

diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst
index 258e45cc3b2d..4b44e0da49d6 100644
--- a/Documentation/admin-guide/cgroup-v1/memory.rst
+++ b/Documentation/admin-guide/cgroup-v1/memory.rst
@@ -70,6 +70,15 @@ Brief summary of control files.
  memory.memsw.usage_in_bytes	     show current usage for memory+Swap
 				     (See 5.5 for details)
  memory.limit_in_bytes		     set/show limit of memory usage
+ memory.limit_in_bytes.min	     show current memory min setting not present
+				     on the root control group.
+				     (See sysctl's vm.memcg_v1_min_default)
+ memory.limit_in_bytes.low	     show current memory low setting not present
+				     on the root control group.
+				     (See sysctl's vm.memcg_v1_low_default)
+ memory.limit_in_bytes.high	     show current memory high setting not present
+				     on the root control group.
+				     (See sysctl's vm.memcg_v1_high_default)
  memory.memsw.limit_in_bytes	     set/show limit of memory+Swap usage
  memory.failcnt			     show the number of memory usage hits limits
  memory.memsw.failcnt		     show the number of memory+Swap hits limits
@@ -648,6 +657,30 @@ The output format of memory.numa_stat is::
 
 The "total" count is sum of file + anon + unevictable.
 
+5.6 limit_in_bytes.low, min, and high
+-------------------------------------
+
+These read-only values enable viewing the current low, min and high
+restrictions added to a newly created cgroup when the sysctl vm
+parameters: vm.memcg_v1_low_default, vm.memcg_v1_min_default,
+and vm.memcg_v1_high_default are enabled.
+
+Example usage:
+  sudo sysctl -w vm.memcg_v1_min_default=10
+  sudo sysctl -w vm.memcg_v1_low_default=30
+  sudo sysctl -w vm.memcg_v1_high_default=80
+
+  sudo mkdir /sys/fs/cgroup/memory/restrict
+  echo 100M | sudo tee /sys/fs/cgroup/memory/restrict/memory.limit_in_bytes
+  cat /sys/fs/cgroup/memory/restrict/memory.limit_in_bytes.min
+  2560
+  cat /sys/fs/cgroup/memory/restrict/memory.limit_in_bytes.low
+  7680
+  cat /sys/fs/cgroup/memory/restrict/memory.limit_in_bytes.high
+  20480
+  echo $$ | sudo tee /sys/fs/cgroup/memory/restrict/tasks
+  dd if=/dev/zero of=~/file.bin bs=10M status=progress
+
 6. Hierarchy support
 ====================
 
diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
index 988f6a4c8084..87eefa165f92 100644
--- a/Documentation/admin-guide/sysctl/vm.rst
+++ b/Documentation/admin-guide/sysctl/vm.rst
@@ -43,6 +43,9 @@ Currently, these files are in /proc/sys/vm:
 - legacy_va_layout
 - lowmem_reserve_ratio
 - max_map_count
+- memcg_v1_high_default
+- memcg_v1_low_default
+- memcg_v1_min_default
 - memory_failure_early_kill
 - memory_failure_recovery
 - min_free_kbytes
@@ -425,6 +428,36 @@ e.g., up to one or two maps per allocation.
 The default value is 65530.
 
 
+memcg_v1_min_default:
+=====================
+
+This file contains a percentage of the cgroup memory limit used to
+set the min value of a newly created memory cgroup. This value is only used
+with memory cgroup v1 interface.
+
+The default is 0 (disabled). Range is [0, 100].
+
+
+memcg_v1_low_default:
+=====================
+
+This file contains a percentage of the cgroup memory limit used to
+set the low value of a newly created memory cgroup. This value is only used
+with memory cgroup v1 interface.
+
+The default is 0 (disabled). Range is [0, 100].
+
+
+memcg_v1_high_default:
+======================
+
+This file contains a percentage of the cgroup memory limit used to
+set the high value of a newly created memory cgroup. This value is only used
+with memory cgroup v1 interface.
+
+The default is 0 (disabled). Range is [0, 100].
+
+
 memory_failure_early_kill:
 ==========================
 
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 85dc9b88ea37..0592b5e19883 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -68,6 +68,11 @@ struct mem_cgroup_id {
 	refcount_t ref;
 };
 
+/* System default memory protection setting */
+extern int sysctl_memcg_min_default;
+extern int sysctl_memcg_low_default;
+extern int sysctl_memcg_high_default;
+
 /*
  * Per memcg event counter is incremented at every pagein/pageout. With THP,
  * it will be incremented by the number of pages. This counter is used
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 1c240d2c99bc..bf923e50e597 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2449,6 +2449,35 @@ static struct ctl_table vm_table[] = {
 		.extra1		= SYSCTL_ZERO,
 		.extra2		= SYSCTL_ONE,
 	},
+#endif
+#ifdef CONFIG_MEMCG
+	{
+		.procname	= "memcg_v1_min_default",
+		.data		= &sysctl_memcg_min_default,
+		.maxlen		= sizeof(sysctl_memcg_min_default),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE_HUNDRED,
+	},
+	{
+		.procname	= "memcg_v1_low_default",
+		.data		= &sysctl_memcg_low_default,
+		.maxlen		= sizeof(sysctl_memcg_low_default),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE_HUNDRED,
+	},
+	{
+		.procname	= "memcg_v1_high_default",
+		.data		= &sysctl_memcg_high_default,
+		.maxlen		= sizeof(sysctl_memcg_high_default),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE_HUNDRED,
+	},
 #endif
 	{ }
 };
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2eee092f8f11..74875178b48b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -82,6 +82,11 @@ struct mem_cgroup *root_mem_cgroup __read_mostly;
 DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
 EXPORT_PER_CPU_SYMBOL_GPL(int_active_memcg);
 
+/* System default memory protection setting */
+int sysctl_memcg_min_default __read_mostly = 0;
+int sysctl_memcg_low_default __read_mostly = 0;
+int sysctl_memcg_high_default __read_mostly = 0;
+
 /* Socket memory accounting disabled? */
 static bool cgroup_memory_nosocket __ro_after_init;
 
@@ -205,6 +210,7 @@ enum res_type {
 	_MEMSWAP,
 	_KMEM,
 	_TCP,
+	_MEM_V1,
 };
 
 #define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
@@ -3676,6 +3682,9 @@ enum {
 	RES_MAX_USAGE,
 	RES_FAILCNT,
 	RES_SOFT_LIMIT,
+	RES_LIMIT_MIN,
+	RES_LIMIT_LOW,
+	RES_LIMIT_HIGH,
 };
 
 static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
@@ -3686,6 +3695,7 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
 
 	switch (MEMFILE_TYPE(cft->private)) {
 	case _MEM:
+	case _MEM_V1:
 		counter = &memcg->memory;
 		break;
 	case _MEMSWAP:
@@ -3716,6 +3726,12 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
 		return counter->failcnt;
 	case RES_SOFT_LIMIT:
 		return (u64)memcg->soft_limit * PAGE_SIZE;
+	case RES_LIMIT_MIN:
+		return (u64)READ_ONCE(memcg->memory.min);
+	case RES_LIMIT_LOW:
+		return (u64)READ_ONCE(memcg->memory.low);
+	case RES_LIMIT_HIGH:
+		return (u64)READ_ONCE(memcg->memory.high);
 	default:
 		BUG();
 	}
@@ -3815,6 +3831,34 @@ static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max)
 	return ret;
 }
 
+static inline void mem_cgroup_v1_set_defaults(struct mem_cgroup *memcg,
+					      unsigned long nr_pages)
+{	/* Seed memcg->memory min/low/high as percentages of nr_pages, per the sysctl_memcg_*_default values. */
+	unsigned long min, low, high;
+
+	if (mem_cgroup_is_root(memcg) || PAGE_COUNTER_MAX == nr_pages)
+		return;	/* root group, or limit equal to PAGE_COUNTER_MAX: nothing is set */
+
+	min = READ_ONCE(memcg->memory.min);
+	low = READ_ONCE(memcg->memory.low);
+	if (min || low)
+		return;	/* min or low already non-zero: min, low and high are all left untouched */
+
+	if (!min && sysctl_memcg_min_default > 0) {	/* NOTE(review): !min is always true after the early return above */
+		min = (nr_pages * sysctl_memcg_min_default) / 100;	/* sysctl value used as a percentage of nr_pages */
+		page_counter_set_min(&memcg->memory, min);
+	}
+	if (!low && sysctl_memcg_low_default > 0) {	/* NOTE(review): !low is likewise always true here */
+		low = (nr_pages * sysctl_memcg_low_default) / 100;
+		page_counter_set_low(&memcg->memory, low);
+	}
+	high = READ_ONCE(memcg->memory.high);
+	if (high == PAGE_COUNTER_MAX && sysctl_memcg_high_default) {	/* only while high is still PAGE_COUNTER_MAX; NOTE(review): tests non-zero, not > 0 as for min/low */
+		high = (nr_pages * sysctl_memcg_high_default) / 100;
+		page_counter_set_high(&memcg->memory, high);
+	}
+}
+
 /*
  * The user of this function is...
  * RES_LIMIT.
@@ -3838,6 +3882,11 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
 			break;
 		}
 		switch (MEMFILE_TYPE(of_cft(of)->private)) {
+		case _MEM_V1:
+			ret = mem_cgroup_resize_max(memcg, nr_pages, false);
+			if (!ret)
+				mem_cgroup_v1_set_defaults(memcg, nr_pages);
+			break;
 		case _MEM:
 			ret = mem_cgroup_resize_max(memcg, nr_pages, false);
 			break;
@@ -5000,10 +5049,28 @@ static struct cftype mem_cgroup_legacy_files[] = {
 	},
 	{
 		.name = "limit_in_bytes",
-		.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
+		.private = MEMFILE_PRIVATE(_MEM_V1, RES_LIMIT),
 		.write = mem_cgroup_write,
 		.read_u64 = mem_cgroup_read_u64,
 	},
+	{
+		.name = "limit_in_bytes.min",
+		.private = MEMFILE_PRIVATE(_MEM_V1, RES_LIMIT_MIN),
+		.read_u64 = mem_cgroup_read_u64,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
+	{
+		.name = "limit_in_bytes.low",
+		.private = MEMFILE_PRIVATE(_MEM_V1, RES_LIMIT_LOW),
+		.read_u64 = mem_cgroup_read_u64,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
+	{
+		.name = "limit_in_bytes.high",
+		.private = MEMFILE_PRIVATE(_MEM_V1, RES_LIMIT_HIGH),
+		.read_u64 = mem_cgroup_read_u64,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
 	{
 		.name = "soft_limit_in_bytes",
 		.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
-- 
2.34.1

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ