Message-ID: <YmkcW8b+fNWgWFGA@dhcp22.suse.cz>
Date: Wed, 27 Apr 2022 12:35:18 +0200
From: Michal Hocko <mhocko@...e.com>
To: Xie Yongmei <yongmeixie@...mail.com>
Cc: Andrew Morton <akpm@...ux-foundation.org>, linux-mm@...ck.org,
linux-kernel@...r.kernel.org,
Alexander Viro <viro@...iv.linux.org.uk>,
linux-fsdevel@...r.kernel.org, yongmeixie@...mail.com,
Johannes Weiner <hannes@...xchg.org>,
Roman Gushchin <roman.gushchin@...ux.dev>,
Shakeel Butt <shakeelb@...gle.com>, Tejun Heo <tj@...nel.org>,
linux-api@...r.kernel.org
Subject: Re: [PATCH 2/3] writeback: per memcg dirty flush
[CC memcg maintainers and Tejun who has been quite active in the area
as well. Also linux-api ML added - please add this list whenever you are
suggesting a user-visible API]
On Wed 27-04-22 05:32:40, Xie Yongmei wrote:
> Currently, dirty writeback is under global control. We can tune it by
> parameters in /proc/sys/vm/
> - dirty_expire_centisecs: expire interval in centiseconds
> - dirty_writeback_centisecs: periodic writeback interval in centiseconds
> - dirty_background_bytes/dirty_background_ratio: async writeback
> threshold
> - dirty_bytes/dirty_ratio: sync writeback threshold
>
> Sometimes, we'd like to specify a special writeback policy for a user
> application, especially for offline applications in co-location scenarios.
>
> This patch provides a per-memcg dirty flush policy; users can specify it
> via the memcg interface.
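For reference, a minimal userspace sketch of how the existing global sysctls
and the proposed per-memcg knobs would be driven. The cgroup mount point and
the "offline" group name are assumptions for illustration, and the
memory.dirty_* file names are inferred from the cftype entries added further
down; none of this is part of the submission itself:

#include <stdio.h>

/* Illustrative only: write an integer to a procfs/cgroupfs control file.
 * The paths used in main() are assumptions, not mandated by the patch. */
static int write_knob(const char *path, int val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fprintf(f, "%d\n", val);
	return fclose(f);
}

int main(void)
{
	/* Global policy keeps the usual /proc/sys/vm defaults... */
	write_knob("/proc/sys/vm/dirty_ratio", 20);
	write_knob("/proc/sys/vm/dirty_background_ratio", 10);

	/* ...while a hypothetical "offline" cgroup gets stricter limits.
	 * Writing -1 would make the cgroup fall back to the global value,
	 * matching the initialization in this patch. */
	write_knob("/sys/fs/cgroup/memory/offline/memory.dirty_ratio", 5);
	write_knob("/sys/fs/cgroup/memory/offline/memory.dirty_background_ratio", 2);

	return 0;
}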
>
> Actually, the writeback code maintains two dimensions of dirty page control
> in balance_dirty_pages().
> - gdtc for global control
> - mdtc for cgroup control
>
> When the number of dirty pages is under both limits, it leaves the check
> quickly. Otherwise, it computes the wb threshold (along with bg_thresh),
> taking the writeout bandwidth into consideration, and computes the position
> ratio against wb_thresh for both the global control and the cgroup control.
> After that, it takes the smaller (IOW the stricter) one as the factor to
> generate the task ratelimit based on the wb's dirty_ratelimit.
>
> So far, the writeback code can control the dirty limit from both the global
> view and the cgroup view. That means the framework already works well for
> controlling a cgroup's dirty limit.
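To make the two-domain throttling described above concrete, here is a
deliberately simplified model, not the actual kernel code: the names, the
1024 fixed-point scale and the linear pos_ratio are illustrative assumptions
(the real code uses a cubic control curve), but it shows how the stricter
domain ends up driving the task ratelimit:

#include <stdio.h>

/* Simplified model of balance_dirty_pages(): each domain (global gdtc,
 * per-memcg mdtc) yields a position ratio, and the smaller (stricter)
 * one scales the writeback's dirty_ratelimit into the task ratelimit. */
struct domain_state {
	unsigned long dirty;	/* dirty pages currently in this domain */
	unsigned long thresh;	/* hard dirty threshold of this domain */
};

static unsigned long pos_ratio(const struct domain_state *d)
{
	/* 1024 means "no throttling"; approaches 0 as dirty nears thresh. */
	if (d->dirty >= d->thresh)
		return 0;
	return 1024UL * (d->thresh - d->dirty) / d->thresh;
}

static unsigned long task_ratelimit(const struct domain_state *gdtc,
				    const struct domain_state *mdtc,
				    unsigned long dirty_ratelimit)
{
	unsigned long g = pos_ratio(gdtc);
	unsigned long m = pos_ratio(mdtc);
	unsigned long strict = g < m ? g : m;	/* the stricter domain wins */

	return dirty_ratelimit * strict / 1024;
}

int main(void)
{
	struct domain_state gdtc = { .dirty = 800, .thresh = 1000 };
	struct domain_state mdtc = { .dirty = 450, .thresh = 500 };

	/* The memcg domain is closer to its limit, so it dominates here. */
	printf("task ratelimit: %lu pages/s\n",
	       task_ratelimit(&gdtc, &mdtc, 4096));
	return 0;
}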
>
> This patch only provides an extra interface for memcg to tune writeback
> behavior.
>
> Signed-off-by: Xie Yongmei <yongmeixie@...mail.com>
> ---
> include/linux/memcontrol.h | 22 ++++++
> init/Kconfig | 7 ++
> mm/memcontrol.c | 136 +++++++++++++++++++++++++++++++++++++
> mm/page-writeback.c | 15 +++-
> 4 files changed, 178 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index a68dce3873fc..386fc9b70c95 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -344,6 +344,11 @@ struct mem_cgroup {
> struct deferred_split deferred_split_queue;
> #endif
>
> +#ifdef CONFIG_CGROUP_WRITEBACK_PARA
> + int dirty_background_ratio;
> + int dirty_ratio;
> +#endif
> +
> struct mem_cgroup_per_node *nodeinfo[];
> };
>
> @@ -1634,6 +1639,23 @@ static inline void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
>
> #endif /* CONFIG_CGROUP_WRITEBACK */
>
> +#ifdef CONFIG_CGROUP_WRITEBACK_PARA
> +unsigned int wb_dirty_background_ratio(struct bdi_writeback *wb);
> +unsigned int wb_dirty_ratio(struct bdi_writeback *wb);
> +#else
> +static inline
> +unsigned int wb_dirty_background_ratio(struct bdi_writeback *wb)
> +{
> + return dirty_background_ratio;
> +}
> +
> +static inline
> +unsigned int wb_dirty_ratio(struct bdi_writeback *wb)
> +{
> + return vm_dirty_ratio;
> +}
> +#endif
> +
> struct sock;
> bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
> gfp_t gfp_mask);
> diff --git a/init/Kconfig b/init/Kconfig
> index ddcbefe535e9..0b8152000d6e 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -989,6 +989,13 @@ config CGROUP_WRITEBACK
> depends on MEMCG && BLK_CGROUP
> default y
>
> +config CGROUP_WRITEBACK_PARA
> + bool "Enable setup dirty flush parameters per memcg"
> + depends on CGROUP_WRITEBACK
> + default y
> + help
> + This feature allows a cgroup to specify its own dirty writeback policy.
> +
> menuconfig CGROUP_SCHED
> bool "CPU controller"
> default n
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index e8922bacfe2a..b1c1b150637a 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -4822,6 +4822,112 @@ static int mem_cgroup_slab_show(struct seq_file *m, void *p)
> }
> #endif
>
> +#ifdef CONFIG_CGROUP_WRITEBACK_PARA
> +unsigned int wb_dirty_background_ratio(struct bdi_writeback *wb)
> +{
> + struct mem_cgroup *memcg;
> +
> + if (mem_cgroup_disabled() || !wb)
> + return dirty_background_ratio;
> +
> + memcg = mem_cgroup_from_css(wb->memcg_css);
> + if (memcg == root_mem_cgroup || memcg->dirty_background_ratio < 0)
> + return dirty_background_ratio;
> +
> + return memcg->dirty_background_ratio;
> +}
> +
> +unsigned int wb_dirty_ratio(struct bdi_writeback *wb)
> +{
> + struct mem_cgroup *memcg;
> +
> + if (mem_cgroup_disabled() || !wb)
> + return vm_dirty_ratio;
> +
> + memcg = mem_cgroup_from_css(wb->memcg_css);
> + if (memcg == root_mem_cgroup || memcg->dirty_ratio < 0)
> + return vm_dirty_ratio;
> +
> + return memcg->dirty_ratio;
> +}
> +
> +static void wb_memcg_inherit_from_parent(struct mem_cgroup *parent,
> + struct mem_cgroup *memcg)
> +{
> + memcg->dirty_background_ratio = parent->dirty_background_ratio;
> + memcg->dirty_ratio = parent->dirty_ratio;
> +}
> +
> +static void wb_memcg_init(struct mem_cgroup *memcg)
> +{
> + memcg->dirty_background_ratio = -1;
> + memcg->dirty_ratio = -1;
> +}
> +
> +static int mem_cgroup_dirty_background_ratio_show(struct seq_file *m, void *v)
> +{
> + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
> +
> + seq_printf(m, "%d\n", memcg->dirty_background_ratio);
> + return 0;
> +}
> +
> +static ssize_t
> +mem_cgroup_dirty_background_ratio_write(struct kernfs_open_file *of,
> + char *buf, size_t nbytes,
> + loff_t off)
> +{
> + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
> + int ret, background_ratio;
> +
> + buf = strstrip(buf);
> + ret = kstrtoint(buf, 0, &background_ratio);
> + if (ret)
> + return ret;
> +
> + if (background_ratio < -1 || background_ratio > 100)
> + return -EINVAL;
> +
> + memcg->dirty_background_ratio = background_ratio;
> + return nbytes;
> +}
> +
> +static int mem_cgroup_dirty_ratio_show(struct seq_file *m, void *v)
> +{
> + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
> +
> + seq_printf(m, "%d\n", memcg->dirty_ratio);
> + return 0;
> +}
> +
> +static ssize_t
> +mem_cgroup_dirty_ratio_write(struct kernfs_open_file *of,
> + char *buf, size_t nbytes, loff_t off)
> +{
> + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
> + int ret, dirty_ratio;
> +
> + buf = strstrip(buf);
> + ret = kstrtoint(buf, 0, &dirty_ratio);
> + if (ret)
> + return ret;
> +
> + if (dirty_ratio < -1 || dirty_ratio > 100)
> + return -EINVAL;
> +
> + memcg->dirty_ratio = dirty_ratio;
> + return nbytes;
> +}
> +#else
> +static void wb_memcg_inherit_from_parent(struct mem_cgroup *parent,
> + struct mem_cgroup *memcg)
> +{
> +}
> +
> +static inline void wb_memcg_init(struct mem_cgroup *memcg)
> +{
> +}
> +#endif
> static struct cftype mem_cgroup_legacy_files[] = {
> {
> .name = "usage_in_bytes",
> @@ -4948,6 +5054,20 @@ static struct cftype mem_cgroup_legacy_files[] = {
> .write = mem_cgroup_reset,
> .read_u64 = mem_cgroup_read_u64,
> },
> +#ifdef CONFIG_CGROUP_WRITEBACK_PARA
> + {
> + .name = "dirty_background_ratio",
> + .flags = CFTYPE_NOT_ON_ROOT,
> + .seq_show = mem_cgroup_dirty_background_ratio_show,
> + .write = mem_cgroup_dirty_background_ratio_write,
> + },
> + {
> + .name = "dirty_ratio",
> + .flags = CFTYPE_NOT_ON_ROOT,
> + .seq_show = mem_cgroup_dirty_ratio_show,
> + .write = mem_cgroup_dirty_ratio_write,
> + },
> +#endif
> { }, /* terminate */
> };
>
> @@ -5151,11 +5271,13 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
> page_counter_init(&memcg->swap, &parent->swap);
> page_counter_init(&memcg->kmem, &parent->kmem);
> page_counter_init(&memcg->tcpmem, &parent->tcpmem);
> + wb_memcg_inherit_from_parent(parent, memcg);
> } else {
> page_counter_init(&memcg->memory, NULL);
> page_counter_init(&memcg->swap, NULL);
> page_counter_init(&memcg->kmem, NULL);
> page_counter_init(&memcg->tcpmem, NULL);
> + wb_memcg_init(memcg);
>
> root_mem_cgroup = memcg;
> return &memcg->css;
> @@ -6414,6 +6536,20 @@ static struct cftype memory_files[] = {
> .seq_show = memory_oom_group_show,
> .write = memory_oom_group_write,
> },
> +#ifdef CONFIG_CGROUP_WRITEBACK_PARA
> + {
> + .name = "dirty_background_ratio",
> + .flags = CFTYPE_NOT_ON_ROOT,
> + .seq_show = mem_cgroup_dirty_background_ratio_show,
> + .write = mem_cgroup_dirty_background_ratio_write,
> + },
> + {
> + .name = "dirty_ratio",
> + .flags = CFTYPE_NOT_ON_ROOT,
> + .seq_show = mem_cgroup_dirty_ratio_show,
> + .write = mem_cgroup_dirty_ratio_write,
> + },
> +#endif
> { } /* terminate */
> };
>
> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> index 7e2da284e427..cec2ef032927 100644
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -395,12 +395,23 @@ static void domain_dirty_limits(struct dirty_throttle_control *dtc)
> * per-PAGE_SIZE, they can be obtained by dividing bytes by
> * number of pages.
> */
> +#ifdef CONFIG_CGROUP_WRITEBACK_PARA
> + ratio = (wb_dirty_ratio(dtc->wb) * PAGE_SIZE) / 100;
> + bg_ratio = (wb_dirty_background_ratio(dtc->wb) * PAGE_SIZE) / 100;
> + if (!ratio && bytes)
> + ratio = min(DIV_ROUND_UP(bytes, global_avail),
> + PAGE_SIZE);
> + if (!bg_ratio && bg_bytes)
> + bg_ratio = min(DIV_ROUND_UP(bg_bytes, global_avail),
> + PAGE_SIZE);
> +#else
> if (bytes)
> ratio = min(DIV_ROUND_UP(bytes, global_avail),
> PAGE_SIZE);
> if (bg_bytes)
> bg_ratio = min(DIV_ROUND_UP(bg_bytes, global_avail),
> PAGE_SIZE);
> +#endif
> bytes = bg_bytes = 0;
> }
>
> @@ -418,8 +429,8 @@ static void domain_dirty_limits(struct dirty_throttle_control *dtc)
> bg_thresh = thresh / 2;
> tsk = current;
> if (rt_task(tsk)) {
> - bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32;
> - thresh += thresh / 4 + global_wb_domain.dirty_limit / 32;
> + bg_thresh += bg_thresh / 4 + dtc_dom(dtc)->dirty_limit / 32;
> + thresh += thresh / 4 + dtc_dom(dtc)->dirty_limit / 32;
> }
> dtc->thresh = thresh;
> dtc->bg_thresh = bg_thresh;
> --
> 2.27.0
--
Michal Hocko
SUSE Labs