[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAJD7tkaCjbkf37naW7dWxq7FovH-3+3QEuRrAD0+3+D9uVq_+g@mail.gmail.com>
Date: Tue, 5 Sep 2023 08:55:56 -0700
From: Yosry Ahmed <yosryahmed@...gle.com>
To: Michal Hocko <mhocko@...e.com>
Cc: Andrew Morton <akpm@...ux-foundation.org>,
Johannes Weiner <hannes@...xchg.org>,
Roman Gushchin <roman.gushchin@...ux.dev>,
Shakeel Butt <shakeelb@...gle.com>,
Muchun Song <muchun.song@...ux.dev>,
Ivan Babrou <ivan@...udflare.com>, Tejun Heo <tj@...nel.org>,
Michal Koutný <mkoutny@...e.com>,
Waiman Long <longman@...hat.com>, linux-mm@...ck.org,
cgroups@...r.kernel.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH v4 1/4] mm: memcg: properly name and document unified
stats flushing
On Mon, Sep 4, 2023 at 7:44 AM Michal Hocko <mhocko@...e.com> wrote:
>
> On Thu 31-08-23 16:56:08, Yosry Ahmed wrote:
> > Most contexts that flush memcg stats use "unified" flushing, where
> > basically all flushers attempt to flush the entire hierarchy, but only
> > one flusher is allowed at a time, others skip flushing.
> >
> > This is needed because we need to flush the stats from paths such as
> > reclaim or refaults, which may have high concurrency, especially on
> > large systems. Serializing such performance-sensitive paths can
> > introduce regressions, hence, unified flushing offers a tradeoff between
> > stats staleness and the performance impact of flushing stats.
> >
> > Document this properly and explicitly by renaming the common flushing
> > helper from do_flush_stats() to do_unified_stats_flush(), and adding
> > documentation to describe unified flushing. Additionally, rename
> > flushing APIs to add "try" in the name, which implies that flushing will
> > not always happen. Also add proper documentation.
> >
> > No functional change intended.
> >
> > Signed-off-by: Yosry Ahmed <yosryahmed@...gle.com>
>
> No objections to renaming but it would be really nice to simplify this.
> It is just "funny" to see 4 different flushing methods (flush from
> userspace, flush, try_flush_with_ratelimit_1 and try_flush_with_ratelimit_2).
> This is all internal so I am not all that worried that this would get
> confusing but it surely is rather convoluted.
>
> Acked-by: Michal Hocko <mhocko@...e.com>
Thanks!
I tried to at least keep the naming consistent and add documentation,
but I agree we can do better :) To be honest, it's less obvious to me
where we can improve, but hopefully when someone new to this code comes
along and finds it confusing, we'll know better what needs to be changed.
>
> > ---
> > include/linux/memcontrol.h | 8 ++---
> > mm/memcontrol.c | 61 +++++++++++++++++++++++++-------------
> > mm/vmscan.c | 2 +-
> > mm/workingset.c | 4 +--
> > 4 files changed, 47 insertions(+), 28 deletions(-)
> >
> > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> > index 11810a2cfd2d..d517b0cc5221 100644
> > --- a/include/linux/memcontrol.h
> > +++ b/include/linux/memcontrol.h
> > @@ -1034,8 +1034,8 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
> > return x;
> > }
> >
> > -void mem_cgroup_flush_stats(void);
> > -void mem_cgroup_flush_stats_ratelimited(void);
> > +void mem_cgroup_try_flush_stats(void);
> > +void mem_cgroup_try_flush_stats_ratelimited(void);
> >
> > void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
> > int val);
> > @@ -1519,11 +1519,11 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
> > return node_page_state(lruvec_pgdat(lruvec), idx);
> > }
> >
> > -static inline void mem_cgroup_flush_stats(void)
> > +static inline void mem_cgroup_try_flush_stats(void)
> > {
> > }
> >
> > -static inline void mem_cgroup_flush_stats_ratelimited(void)
> > +static inline void mem_cgroup_try_flush_stats_ratelimited(void)
> > {
> > }
> >
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index cf57fe9318d5..2d0ec828a1c4 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -588,7 +588,7 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
> > static void flush_memcg_stats_dwork(struct work_struct *w);
> > static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
> > static DEFINE_PER_CPU(unsigned int, stats_updates);
> > -static atomic_t stats_flush_ongoing = ATOMIC_INIT(0);
> > +static atomic_t stats_unified_flush_ongoing = ATOMIC_INIT(0);
> > static atomic_t stats_flush_threshold = ATOMIC_INIT(0);
> > static u64 flush_next_time;
> >
> > @@ -630,7 +630,7 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
> > /*
> > * If stats_flush_threshold exceeds the threshold
> > * (>num_online_cpus()), cgroup stats update will be triggered
> > - * in __mem_cgroup_flush_stats(). Increasing this var further
> > + * in mem_cgroup_try_flush_stats(). Increasing this var further
> > * is redundant and simply adds overhead in atomic update.
> > */
> > if (atomic_read(&stats_flush_threshold) <= num_online_cpus())
> > @@ -639,15 +639,19 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
> > }
> > }
> >
> > -static void do_flush_stats(void)
> > +/*
> > + * do_unified_stats_flush - do a unified flush of memory cgroup statistics
> > + *
> > + * A unified flush tries to flush the entire hierarchy, but skips if there is
> > + * another ongoing flush. This is meant for flushers that may have a lot of
> > + * concurrency (e.g. reclaim, refault, etc), and should not be serialized to
> > + * avoid slowing down performance-sensitive paths. A unified flush may skip, and
> > + * hence may yield stale stats.
> > + */
> > +static void do_unified_stats_flush(void)
> > {
> > - /*
> > - * We always flush the entire tree, so concurrent flushers can just
> > - * skip. This avoids a thundering herd problem on the rstat global lock
> > - * from memcg flushers (e.g. reclaim, refault, etc).
> > - */
> > - if (atomic_read(&stats_flush_ongoing) ||
> > - atomic_xchg(&stats_flush_ongoing, 1))
> > + if (atomic_read(&stats_unified_flush_ongoing) ||
> > + atomic_xchg(&stats_unified_flush_ongoing, 1))
> > return;
> >
> > WRITE_ONCE(flush_next_time, jiffies_64 + 2*FLUSH_TIME);
> > @@ -655,19 +659,34 @@ static void do_flush_stats(void)
> > cgroup_rstat_flush(root_mem_cgroup->css.cgroup);
> >
> > atomic_set(&stats_flush_threshold, 0);
> > - atomic_set(&stats_flush_ongoing, 0);
> > + atomic_set(&stats_unified_flush_ongoing, 0);
> > }
> >
> > -void mem_cgroup_flush_stats(void)
> > +/*
> > + * mem_cgroup_try_flush_stats - try to flush memory cgroup statistics
> > + *
> > + * Try to flush the stats of all memcgs that have stat updates since the last
> > + * flush. We do not flush the stats if:
> > + * - The magnitude of the pending updates is below a certain threshold.
> > + * - There is another ongoing unified flush (see do_unified_stats_flush()).
> > + *
> > + * Hence, the stats may be stale, but ideally by less than FLUSH_TIME due to
> > + * periodic flushing.
> > + */
> > +void mem_cgroup_try_flush_stats(void)
> > {
> > if (atomic_read(&stats_flush_threshold) > num_online_cpus())
> > - do_flush_stats();
> > + do_unified_stats_flush();
> > }
> >
> > -void mem_cgroup_flush_stats_ratelimited(void)
> > +/*
> > + * Like mem_cgroup_try_flush_stats(), but only flushes if the periodic flusher
> > + * is late.
> > + */
> > +void mem_cgroup_try_flush_stats_ratelimited(void)
> > {
> > if (time_after64(jiffies_64, READ_ONCE(flush_next_time)))
> > - mem_cgroup_flush_stats();
> > + mem_cgroup_try_flush_stats();
> > }
> >
> > static void flush_memcg_stats_dwork(struct work_struct *w)
> > @@ -676,7 +695,7 @@ static void flush_memcg_stats_dwork(struct work_struct *w)
> > * Always flush here so that flushing in latency-sensitive paths is
> > * as cheap as possible.
> > */
> > - do_flush_stats();
> > + do_unified_stats_flush();
> > queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
> > }
> >
> > @@ -1576,7 +1595,7 @@ static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
> > *
> > * Current memory state:
> > */
> > - mem_cgroup_flush_stats();
> > + mem_cgroup_try_flush_stats();
> >
> > for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
> > u64 size;
> > @@ -4018,7 +4037,7 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v)
> > int nid;
> > struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
> >
> > - mem_cgroup_flush_stats();
> > + mem_cgroup_try_flush_stats();
> >
> > for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
> > seq_printf(m, "%s=%lu", stat->name,
> > @@ -4093,7 +4112,7 @@ static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
> >
> > BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
> >
> > - mem_cgroup_flush_stats();
> > + mem_cgroup_try_flush_stats();
> >
> > for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
> > unsigned long nr;
> > @@ -4595,7 +4614,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
> > struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
> > struct mem_cgroup *parent;
> >
> > - mem_cgroup_flush_stats();
> > + mem_cgroup_try_flush_stats();
> >
> > *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
> > *pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
> > @@ -6610,7 +6629,7 @@ static int memory_numa_stat_show(struct seq_file *m, void *v)
> > int i;
> > struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
> >
> > - mem_cgroup_flush_stats();
> > + mem_cgroup_try_flush_stats();
> >
> > for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
> > int nid;
> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > index c7c149cb8d66..457a18921fda 100644
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -2923,7 +2923,7 @@ static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc)
> > * Flush the memory cgroup stats, so that we read accurate per-memcg
> > * lruvec stats for heuristics.
> > */
> > - mem_cgroup_flush_stats();
> > + mem_cgroup_try_flush_stats();
> >
> > /*
> > * Determine the scan balance between anon and file LRUs.
> > diff --git a/mm/workingset.c b/mm/workingset.c
> > index da58a26d0d4d..affb8699e58d 100644
> > --- a/mm/workingset.c
> > +++ b/mm/workingset.c
> > @@ -520,7 +520,7 @@ void workingset_refault(struct folio *folio, void *shadow)
> > }
> >
> > /* Flush stats (and potentially sleep) before holding RCU read lock */
> > - mem_cgroup_flush_stats_ratelimited();
> > + mem_cgroup_try_flush_stats_ratelimited();
> >
> > rcu_read_lock();
> >
> > @@ -664,7 +664,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
> > struct lruvec *lruvec;
> > int i;
> >
> > - mem_cgroup_flush_stats();
> > + mem_cgroup_try_flush_stats();
> > lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
> > for (pages = 0, i = 0; i < NR_LRU_LISTS; i++)
> > pages += lruvec_page_state_local(lruvec,
> > --
> > 2.42.0.rc2.253.gd59a3bf2b4-goog
>
> --
> Michal Hocko
> SUSE Labs
Powered by blists - more mailing lists