Message-ID: <TYYP286MB1115132A9443D0B6D21DAD89C5FA9@TYYP286MB1115.JPNP286.PROD.OUTLOOK.COM>
Date:   Wed, 27 Apr 2022 05:32:40 -0400
From:   Xie Yongmei <yongmeixie@...mail.com>
To:     Andrew Morton <akpm@...ux-foundation.org>, linux-mm@...ck.org,
        linux-kernel@...r.kernel.org,
        Alexander Viro <viro@...iv.linux.org.uk>,
        linux-fsdevel@...r.kernel.org
Cc:     yongmeixie@...mail.com, Xie Yongmei <yongmeixie@...mail.com>
Subject: [PATCH 2/3] writeback: per memcg dirty flush

Currently, dirty writeback is under global control. It can be tuned via
the following parameters in /proc/sys/vm/ (a sketch of how the ratios
become page thresholds follows the list):
  - dirty_expire_centisecs: expire interval in centiseconds
  - dirty_writeback_centisecs: periodic writeback interval in centiseconds
  - dirty_background_bytes/dirty_background_ratio: async writeback
    threshold
  - dirty_bytes/dirty_ratio: sync writeback threshold
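
As a quick illustration (a sketch using the names from
domain_dirty_limits() in mm/page-writeback.c, not part of this patch),
a ratio is first scaled to per-PAGE_SIZE precision and then applied to
the available memory:

	/* per-PAGE_SIZE precision for the percentage */
	unsigned long ratio = (vm_dirty_ratio * PAGE_SIZE) / 100;
	/* resulting threshold in pages; available_memory is dtc->avail */
	unsigned long thresh = (ratio * available_memory) / PAGE_SIZE;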

Sometimes we'd like to specify a dedicated writeback policy for a user
application, especially for offline applications in co-location
scenarios.

This patch provides a per-memcg dirty flush policy, which users can
configure through the memcg interface.
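
For example, a 10% sync / 5% background policy for one group could be
set like this (a minimal user-space sketch; the cgroup v1 mount point
and the group name "offline" are assumptions, and writing -1 restores
the global behavior):

	#include <stdio.h>

	/* Write a single value to a memcg control file. */
	static int set_knob(const char *path, const char *val)
	{
		FILE *f = fopen(path, "w");

		if (!f)
			return -1;
		fputs(val, f);
		return fclose(f);
	}

	int main(void)
	{
		set_knob("/sys/fs/cgroup/memory/offline/memory.dirty_ratio",
			 "10\n");
		set_knob("/sys/fs/cgroup/memory/offline/memory.dirty_background_ratio",
			 "5\n");
		return 0;
	}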

The writeback code already maintains two dimensions of dirty page
control in balance_dirty_pages():
   - gdtc for global control
   - mdtc for cgroup control

When the number of dirty pages is under both controls, it leaves the
check quickly. Otherwise, it computes the wb threshold (along with
bg_thresh), taking the writeout bandwidth into consideration, and
computes the position ratio against wb_thresh for both the global and
the cgroup control. It then takes the smaller (IOW the stricter) of the
two as the factor for generating the task ratelimit from the wb's
dirty_ratelimit.
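
Schematically (simplified pseudocode with illustrative names, not the
literal kernel code):

	/* both domains below their thresholds: leave the check quickly */
	if (gdtc_dirty <= gdtc_freerun && mdtc_dirty <= mdtc_freerun)
		break;

	/* the stricter (smaller) position ratio wins */
	pos_ratio = min(gdtc->pos_ratio, mdtc->pos_ratio);
	task_ratelimit = (wb->dirty_ratelimit * pos_ratio) >>
				RATELIMIT_CALC_SHIFT;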

In other words, the writeback code can already enforce the dirty limit
from both the global view and the cgroup view, which means the existing
framework works well for controlling a cgroup's dirty limit.

This patch only provides an extra interface for memcg to tune writeback
behavior.

Signed-off-by: Xie Yongmei <yongmeixie@...mail.com>
---
 include/linux/memcontrol.h |  22 ++++++
 init/Kconfig               |   7 ++
 mm/memcontrol.c            | 136 +++++++++++++++++++++++++++++++++++++
 mm/page-writeback.c        |  15 +++-
 4 files changed, 178 insertions(+), 2 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index a68dce3873fc..386fc9b70c95 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -344,6 +344,11 @@ struct mem_cgroup {
 	struct deferred_split deferred_split_queue;
 #endif
 
+#ifdef CONFIG_CGROUP_WRITEBACK_PARA
+	int dirty_background_ratio;
+	int dirty_ratio;
+#endif
+
 	struct mem_cgroup_per_node *nodeinfo[];
 };
 
@@ -1634,6 +1639,23 @@ static inline void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
 
 #endif	/* CONFIG_CGROUP_WRITEBACK */
 
+#ifdef CONFIG_CGROUP_WRITEBACK_PARA
+unsigned int wb_dirty_background_ratio(struct bdi_writeback *wb);
+unsigned int wb_dirty_ratio(struct bdi_writeback *wb);
+#else
+static inline
+unsigned int wb_dirty_background_ratio(struct bdi_writeback *wb)
+{
+	return dirty_background_ratio;
+}
+
+static inline
+unsigned int wb_dirty_ratio(struct bdi_writeback *wb)
+{
+	return vm_dirty_ratio;
+}
+#endif
+
 struct sock;
 bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
 			     gfp_t gfp_mask);
diff --git a/init/Kconfig b/init/Kconfig
index ddcbefe535e9..0b8152000d6e 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -989,6 +989,13 @@ config CGROUP_WRITEBACK
 	depends on MEMCG && BLK_CGROUP
 	default y
 
+config CGROUP_WRITEBACK_PARA
+	bool "Enable setup dirty flush parameters per memcg"
+	depends on CGROUP_WRITEBACK
+	default y
+	help
+	  This feature allows a cgroup to specify its own dirty writeback policy.
+
 menuconfig CGROUP_SCHED
 	bool "CPU controller"
 	default n
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e8922bacfe2a..b1c1b150637a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4822,6 +4822,112 @@ static int mem_cgroup_slab_show(struct seq_file *m, void *p)
 }
 #endif
 
+#ifdef CONFIG_CGROUP_WRITEBACK_PARA
+unsigned int wb_dirty_background_ratio(struct bdi_writeback *wb)
+{
+	struct mem_cgroup *memcg;
+
+	if (mem_cgroup_disabled() || !wb)
+		return dirty_background_ratio;
+
+	memcg = mem_cgroup_from_css(wb->memcg_css);
+	if (memcg == root_mem_cgroup || memcg->dirty_background_ratio < 0)
+		return dirty_background_ratio;
+
+	return memcg->dirty_background_ratio;
+}
+
+unsigned int wb_dirty_ratio(struct bdi_writeback *wb)
+{
+	struct mem_cgroup *memcg;
+
+	if (mem_cgroup_disabled() || !wb)
+		return vm_dirty_ratio;
+
+	memcg = mem_cgroup_from_css(wb->memcg_css);
+	if (memcg == root_mem_cgroup || memcg->dirty_ratio < 0)
+		return vm_dirty_ratio;
+
+	return memcg->dirty_ratio;
+}
+
+static void wb_memcg_inherit_from_parent(struct mem_cgroup *parent,
+					 struct mem_cgroup *memcg)
+{
+	memcg->dirty_background_ratio = parent->dirty_background_ratio;
+	memcg->dirty_ratio = parent->dirty_ratio;
+}
+
+static void wb_memcg_init(struct mem_cgroup *memcg)
+{
+	memcg->dirty_background_ratio = -1;
+	memcg->dirty_ratio = -1;
+}
+
+static int mem_cgroup_dirty_background_ratio_show(struct seq_file *m, void *v)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+
+	seq_printf(m, "%d\n", memcg->dirty_background_ratio);
+	return 0;
+}
+
+static ssize_t
+mem_cgroup_dirty_background_ratio_write(struct kernfs_open_file *of,
+					char *buf, size_t nbytes,
+					loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+	int ret, background_ratio;
+
+	buf = strstrip(buf);
+	ret = kstrtoint(buf, 0, &background_ratio);
+	if (ret)
+		return ret;
+
+	if (background_ratio < -1 || background_ratio > 100)
+		return -EINVAL;
+
+	memcg->dirty_background_ratio = background_ratio;
+	return nbytes;
+}
+
+static int mem_cgroup_dirty_ratio_show(struct seq_file *m, void *v)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+
+	seq_printf(m, "%d\n", memcg->dirty_ratio);
+	return 0;
+}
+
+static ssize_t
+mem_cgroup_dirty_ratio_write(struct kernfs_open_file *of,
+			     char *buf, size_t nbytes, loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+	int ret, dirty_ratio;
+
+	buf = strstrip(buf);
+	ret = kstrtoint(buf, 0, &dirty_ratio);
+	if (ret)
+		return ret;
+
+	if (dirty_ratio < -1 || dirty_ratio > 100)
+		return -EINVAL;
+
+	memcg->dirty_ratio = dirty_ratio;
+	return nbytes;
+}
+#else
+static inline void wb_memcg_inherit_from_parent(struct mem_cgroup *parent,
+						struct mem_cgroup *memcg)
+{
+}
+
+static inline void wb_memcg_init(struct mem_cgroup *memcg)
+{
+}
+#endif
 static struct cftype mem_cgroup_legacy_files[] = {
 	{
 		.name = "usage_in_bytes",
@@ -4948,6 +5054,20 @@ static struct cftype mem_cgroup_legacy_files[] = {
 		.write = mem_cgroup_reset,
 		.read_u64 = mem_cgroup_read_u64,
 	},
+#ifdef CONFIG_CGROUP_WRITEBACK_PARA
+	{
+		.name = "dirty_background_ratio",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = mem_cgroup_dirty_background_ratio_show,
+		.write = mem_cgroup_dirty_background_ratio_write,
+	},
+	{
+		.name = "dirty_ratio",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = mem_cgroup_dirty_ratio_show,
+		.write = mem_cgroup_dirty_ratio_write,
+	},
+#endif
 	{ },	/* terminate */
 };
 
@@ -5151,11 +5271,13 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 		page_counter_init(&memcg->swap, &parent->swap);
 		page_counter_init(&memcg->kmem, &parent->kmem);
 		page_counter_init(&memcg->tcpmem, &parent->tcpmem);
+		wb_memcg_inherit_from_parent(parent, memcg);
 	} else {
 		page_counter_init(&memcg->memory, NULL);
 		page_counter_init(&memcg->swap, NULL);
 		page_counter_init(&memcg->kmem, NULL);
 		page_counter_init(&memcg->tcpmem, NULL);
+		wb_memcg_init(memcg);
 
 		root_mem_cgroup = memcg;
 		return &memcg->css;
@@ -6414,6 +6536,20 @@ static struct cftype memory_files[] = {
 		.seq_show = memory_oom_group_show,
 		.write = memory_oom_group_write,
 	},
+#ifdef CONFIG_CGROUP_WRITEBACK_PARA
+	{
+		.name = "dirty_background_ratio",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = mem_cgroup_dirty_background_ratio_show,
+		.write = mem_cgroup_dirty_background_ratio_write,
+	},
+	{
+		.name = "dirty_ratio",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = mem_cgroup_dirty_ratio_show,
+		.write = mem_cgroup_dirty_ratio_write,
+	},
+#endif
 	{ }	/* terminate */
 };
 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 7e2da284e427..cec2ef032927 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -395,12 +395,23 @@ static void domain_dirty_limits(struct dirty_throttle_control *dtc)
 		 * per-PAGE_SIZE, they can be obtained by dividing bytes by
 		 * number of pages.
 		 */
+#ifdef CONFIG_CGROUP_WRITEBACK_PARA
+		ratio = (wb_dirty_ratio(dtc->wb) * PAGE_SIZE) / 100;
+		bg_ratio = (wb_dirty_background_ratio(dtc->wb) * PAGE_SIZE) / 100;
+		if (!ratio && bytes)
+			ratio = min(DIV_ROUND_UP(bytes, global_avail),
+				    PAGE_SIZE);
+		if (!bg_ratio && bg_bytes)
+			bg_ratio = min(DIV_ROUND_UP(bg_bytes, global_avail),
+				       PAGE_SIZE);
+#else
 		if (bytes)
 			ratio = min(DIV_ROUND_UP(bytes, global_avail),
 				    PAGE_SIZE);
 		if (bg_bytes)
 			bg_ratio = min(DIV_ROUND_UP(bg_bytes, global_avail),
 				       PAGE_SIZE);
+#endif
 		bytes = bg_bytes = 0;
 	}
 
@@ -418,8 +429,8 @@ static void domain_dirty_limits(struct dirty_throttle_control *dtc)
 		bg_thresh = thresh / 2;
 	tsk = current;
 	if (rt_task(tsk)) {
-		bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32;
-		thresh += thresh / 4 + global_wb_domain.dirty_limit / 32;
+		bg_thresh += bg_thresh / 4 + dtc_dom(dtc)->dirty_limit / 32;
+		thresh += thresh / 4 + dtc_dom(dtc)->dirty_limit / 32;
 	}
 	dtc->thresh = thresh;
 	dtc->bg_thresh = bg_thresh;
-- 
2.27.0
