[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <Yo4YVseWS93nINkj@FVFYT0MHHV2J.usts.net>
Date: Wed, 25 May 2022 19:51:50 +0800
From: Muchun Song <songmuchun@...edance.com>
To: Roman Gushchin <roman.gushchin@...ux.dev>
Cc: hannes@...xchg.org, mhocko@...nel.org, shakeelb@...gle.com,
cgroups@...r.kernel.org, linux-mm@...ck.org,
linux-kernel@...r.kernel.org, duanxiongchun@...edance.com,
longman@...hat.com
Subject: Re: [PATCH v4 07/11] mm: memcontrol: make all the callers of
{folio,page}_memcg() safe
On Tue, May 24, 2022 at 08:03:41PM -0700, Roman Gushchin wrote:
> On Tue, May 24, 2022 at 02:05:47PM +0800, Muchun Song wrote:
> > When we use objcg APIs to charge the LRU pages, the page will not hold
> > a reference to the memcg associated with the page. So the caller of the
> > {folio,page}_memcg() should hold an rcu read lock or obtain a reference
> > to the memcg associated with the page to protect memcg from being
> > released. So introduce get_mem_cgroup_from_{page,folio}() to obtain a
> > reference to the memory cgroup associated with the page.
> >
> > In this patch, make all the callers hold an rcu read lock or obtain a
> > reference to the memcg to protect memcg from being released when the LRU
> > pages reparented.
> >
> > We do not need to adjust the callers of {folio,page}_memcg() during
> > the whole process of mem_cgroup_move_task(). Because the cgroup migration
> > and memory cgroup offlining are serialized by @cgroup_mutex. In this
> > routine, the LRU pages cannot be reparented to its parent memory cgroup.
> > So {folio,page}_memcg() is stable and cannot be released.
> >
> > This is a preparation for reparenting the LRU pages.
> >
> > Signed-off-by: Muchun Song <songmuchun@...edance.com>
>
> Overall the patch looks good to me (some nits below).
>
> > ---
> > fs/buffer.c | 4 +--
> > fs/fs-writeback.c | 23 ++++++++--------
> > include/linux/memcontrol.h | 51 ++++++++++++++++++++++++++++++++---
> > include/trace/events/writeback.h | 5 ++++
> > mm/memcontrol.c | 58 ++++++++++++++++++++++++++++++----------
> > mm/migrate.c | 4 +++
> > mm/page_io.c | 5 ++--
> > 7 files changed, 117 insertions(+), 33 deletions(-)
> >
> > diff --git a/fs/buffer.c b/fs/buffer.c
> > index 2b5561ae5d0b..80975a457670 100644
> > --- a/fs/buffer.c
> > +++ b/fs/buffer.c
> > @@ -819,8 +819,7 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
> > if (retry)
> > gfp |= __GFP_NOFAIL;
> >
> > - /* The page lock pins the memcg */
> > - memcg = page_memcg(page);
> > + memcg = get_mem_cgroup_from_page(page);
>
> Looking at these changes I wonder if we need to remove unsafe getters or
> at least add a BOLD comment on how/when it's safe to use them.
>
I am not clear here. You mean we add some comments above page_memcg()
or get_mem_cgroup_from_page()?
> > old_memcg = set_active_memcg(memcg);
> >
> > head = NULL;
> > @@ -840,6 +839,7 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
> > set_bh_page(bh, page, offset);
> > }
> > out:
> > + mem_cgroup_put(memcg);
> > set_active_memcg(old_memcg);
> > return head;
> > /*
> > diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
> > index 1fae0196292a..56612ace8778 100644
> > --- a/fs/fs-writeback.c
> > +++ b/fs/fs-writeback.c
> > @@ -243,15 +243,13 @@ void __inode_attach_wb(struct inode *inode, struct page *page)
> > if (inode_cgwb_enabled(inode)) {
> > struct cgroup_subsys_state *memcg_css;
> >
> > - if (page) {
> > - memcg_css = mem_cgroup_css_from_page(page);
> > - wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
> > - } else {
> > - /* must pin memcg_css, see wb_get_create() */
> > + /* must pin memcg_css, see wb_get_create() */
> > + if (page)
> > + memcg_css = get_mem_cgroup_css_from_page(page);
> > + else
> > memcg_css = task_get_css(current, memory_cgrp_id);
> > - wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
> > - css_put(memcg_css);
> > - }
> > + wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
> > + css_put(memcg_css);
> > }
> >
> > if (!wb)
> > @@ -868,16 +866,16 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
> > if (!wbc->wb || wbc->no_cgroup_owner)
> > return;
> >
> > - css = mem_cgroup_css_from_page(page);
> > + css = get_mem_cgroup_css_from_page(page);
> > /* dead cgroups shouldn't contribute to inode ownership arbitration */
> > if (!(css->flags & CSS_ONLINE))
> > - return;
> > + goto out;
> >
> > id = css->id;
> >
> > if (id == wbc->wb_id) {
> > wbc->wb_bytes += bytes;
> > - return;
> > + goto out;
> > }
> >
> > if (id == wbc->wb_lcand_id)
> > @@ -890,6 +888,9 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
> > wbc->wb_tcand_bytes += bytes;
> > else
> > wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes);
> > +
> > +out:
> > + css_put(css);
> > }
> > EXPORT_SYMBOL_GPL(wbc_account_cgroup_owner);
> >
> > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> > index 8c2f1ba2f471..3a0e2592434e 100644
> > --- a/include/linux/memcontrol.h
> > +++ b/include/linux/memcontrol.h
> > @@ -373,7 +373,7 @@ static inline bool folio_memcg_kmem(struct folio *folio);
> > * a valid memcg, but can be atomically swapped to the parent memcg.
> > *
> > * The caller must ensure that the returned memcg won't be released:
> > - * e.g. acquire the rcu_read_lock or css_set_lock.
> > + * e.g. acquire the rcu_read_lock or objcg_lock or cgroup_mutex.
> > */
> > static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg)
> > {
> > @@ -454,7 +454,37 @@ static inline struct mem_cgroup *page_memcg(struct page *page)
> > return folio_memcg(page_folio(page));
> > }
> >
> > -/**
> > +/*
> > + * get_mem_cgroup_from_folio - Obtain a reference on the memory cgroup
> > + * associated with a folio.
> > + * @folio: Pointer to the folio.
> > + *
> > + * Returns a pointer to the memory cgroup (and obtain a reference on it)
> > + * associated with the folio, or NULL. This function assumes that the
> > + * folio is known to have a proper memory cgroup pointer. It's not safe
> > + * to call this function against some type of pages, e.g. slab pages or
> > + * ex-slab pages.
> > + */
> > +static inline struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio)
> > +{
> > + struct mem_cgroup *memcg;
> > +
> > + rcu_read_lock();
> > +retry:
> > + memcg = folio_memcg(folio);
> > + if (unlikely(memcg && !css_tryget(&memcg->css)))
> > + goto retry;
> > + rcu_read_unlock();
> > +
> > + return memcg;
> > +}
> > +
> > +static inline struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
> > +{
> > + return get_mem_cgroup_from_folio(page_folio(page));
> > +}
> > +
> > +/*
> > * folio_memcg_rcu - Locklessly get the memory cgroup associated with a folio.
> > * @folio: Pointer to the folio.
> > *
> > @@ -873,7 +903,7 @@ static inline bool mm_match_cgroup(struct mm_struct *mm,
> > return match;
> > }
> >
> > -struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page);
> > +struct cgroup_subsys_state *get_mem_cgroup_css_from_page(struct page *page);
> > ino_t page_cgroup_ino(struct page *page);
> >
> > static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
> > @@ -1047,10 +1077,13 @@ static inline void count_memcg_events(struct mem_cgroup *memcg,
> > static inline void count_memcg_page_event(struct page *page,
> > enum vm_event_item idx)
> > {
> > - struct mem_cgroup *memcg = page_memcg(page);
> > + struct mem_cgroup *memcg;
> >
> > + rcu_read_lock();
> > + memcg = page_memcg(page);
> > if (memcg)
> > count_memcg_events(memcg, idx, 1);
> > + rcu_read_unlock();
> > }
> >
> > static inline void count_memcg_event_mm(struct mm_struct *mm,
> > @@ -1129,6 +1162,16 @@ static inline struct mem_cgroup *page_memcg(struct page *page)
> > return NULL;
> > }
> >
> > +static inline struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio)
> > +{
> > + return NULL;
> > +}
> > +
> > +static inline struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
> > +{
> > + return NULL;
> > +}
> > +
> > static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio)
> > {
> > WARN_ON_ONCE(!rcu_read_lock_held());
> > diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
> > index 86b2a82da546..cdb822339f13 100644
> > --- a/include/trace/events/writeback.h
> > +++ b/include/trace/events/writeback.h
> > @@ -258,6 +258,11 @@ TRACE_EVENT(track_foreign_dirty,
> > __entry->ino = inode ? inode->i_ino : 0;
> > __entry->memcg_id = wb->memcg_css->id;
> > __entry->cgroup_ino = __trace_wb_assign_cgroup(wb);
> > + /*
> > + * TP_fast_assign() is under preemption disabled which can
> > + * serve as an RCU read-side critical section so that the
> > + * memcg returned by folio_memcg() cannot be freed.
> > + */
> > __entry->page_cgroup_ino = cgroup_ino(folio_memcg(folio)->css.cgroup);
> > ),
> >
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index b38a77f6696f..dcaf6cf5dc74 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -371,7 +371,7 @@ EXPORT_SYMBOL(memcg_kmem_enabled_key);
> > #endif
> >
> > /**
> > - * mem_cgroup_css_from_page - css of the memcg associated with a page
> > + * get_mem_cgroup_css_from_page - get css of the memcg associated with a page
> > * @page: page of interest
> > *
> > * If memcg is bound to the default hierarchy, css of the memcg associated
> > @@ -381,13 +381,15 @@ EXPORT_SYMBOL(memcg_kmem_enabled_key);
> > * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
> > * is returned.
> > */
> > -struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
> > +struct cgroup_subsys_state *get_mem_cgroup_css_from_page(struct page *page)
> > {
> > struct mem_cgroup *memcg;
> >
> > - memcg = page_memcg(page);
> > + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
> > + return &root_mem_cgroup->css;
> >
> > - if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
> > + memcg = get_mem_cgroup_from_page(page);
> > + if (!memcg)
> > memcg = root_mem_cgroup;
> >
> > return &memcg->css;
> > @@ -770,13 +772,13 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
> > void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx,
> > int val)
> > {
> > - struct page *head = compound_head(page); /* rmap on tail pages */
> > + struct folio *folio = page_folio(page); /* rmap on tail pages */
> > struct mem_cgroup *memcg;
> > pg_data_t *pgdat = page_pgdat(page);
> > struct lruvec *lruvec;
> >
> > rcu_read_lock();
> > - memcg = page_memcg(head);
> > + memcg = folio_memcg(folio);
> > /* Untracked pages have no memcg, no lruvec. Update only the node */
> > if (!memcg) {
> > rcu_read_unlock();
> > @@ -2058,7 +2060,9 @@ void folio_memcg_lock(struct folio *folio)
> > * The RCU lock is held throughout the transaction. The fast
> > * path can get away without acquiring the memcg->move_lock
> > * because page moving starts with an RCU grace period.
> > - */
> > + *
> > + * The RCU lock also protects the memcg from being freed.
> > + */
> > rcu_read_lock();
> >
> > if (mem_cgroup_disabled())
> > @@ -3296,7 +3300,7 @@ void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
> > void split_page_memcg(struct page *head, unsigned int nr)
> > {
> > struct folio *folio = page_folio(head);
> > - struct mem_cgroup *memcg = folio_memcg(folio);
> > + struct mem_cgroup *memcg = get_mem_cgroup_from_folio(folio);
> > int i;
> >
> > if (mem_cgroup_disabled() || !memcg)
> > @@ -3309,6 +3313,8 @@ void split_page_memcg(struct page *head, unsigned int nr)
> > obj_cgroup_get_many(__folio_objcg(folio), nr - 1);
> > else
> > css_get_many(&memcg->css, nr - 1);
> > +
> > + css_put(&memcg->css);
> > }
> >
> > #ifdef CONFIG_MEMCG_SWAP
> > @@ -4511,7 +4517,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
> > void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio,
> > struct bdi_writeback *wb)
> > {
> > - struct mem_cgroup *memcg = folio_memcg(folio);
> > + struct mem_cgroup *memcg = get_mem_cgroup_from_folio(folio);
> > struct memcg_cgwb_frn *frn;
> > u64 now = get_jiffies_64();
> > u64 oldest_at = now;
> > @@ -4558,6 +4564,7 @@ void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio,
> > frn->memcg_id = wb->memcg_css->id;
> > frn->at = now;
> > }
> > + css_put(&memcg->css);
> > }
> >
> > /* issue foreign writeback flushes for recorded foreign dirtying events */
> > @@ -6092,6 +6099,14 @@ static void mem_cgroup_move_charge(void)
> > atomic_dec(&mc.from->moving_account);
> > }
> >
> > +/*
> > + * The cgroup migration and memory cgroup offlining are serialized by
> > + * @cgroup_mutex. If we reach here, it means that the LRU pages cannot
> > + * be reparented to its parent memory cgroup. So during the whole process
> > + * of mem_cgroup_move_task(), page_memcg(page) is stable. So we do not
> > + * need to worry about the memcg (returned from page_memcg()) being
> > + * released even if we do not hold an rcu read lock.
> > + */
> > static void mem_cgroup_move_task(void)
> > {
> > if (mc.to) {
> > @@ -6895,7 +6910,7 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new)
> > if (folio_memcg(new))
> > return;
> >
> > - memcg = folio_memcg(old);
> > + memcg = get_mem_cgroup_from_folio(old);
> > VM_WARN_ON_ONCE_FOLIO(!memcg, old);
> > if (!memcg)
> > return;
> > @@ -6914,6 +6929,8 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new)
> > mem_cgroup_charge_statistics(memcg, nr_pages);
> > memcg_check_events(memcg, folio_nid(new));
> > local_irq_restore(flags);
> > +
> > + css_put(&memcg->css);
> > }
> >
> > DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
> > @@ -7100,6 +7117,10 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
> > if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
> > return;
> >
> > + /*
> > + * Interrupts should be disabled by the caller (see the comments below),
> > + * which can serve as RCU read-side critical sections.
> > + */
> > memcg = folio_memcg(folio);
> >
> > VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
> > @@ -7165,15 +7186,16 @@ int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
> > if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
> > return 0;
> >
> > + rcu_read_lock();
> > memcg = page_memcg(page);
> >
> > VM_WARN_ON_ONCE_PAGE(!memcg, page);
> > if (!memcg)
> > - return 0;
> > + goto out;
> >
> > if (!entry.val) {
> > memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
> > - return 0;
> > + goto out;
> > }
> >
> > memcg = mem_cgroup_id_get_online(memcg);
> > @@ -7183,6 +7205,7 @@ int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
> > memcg_memory_event(memcg, MEMCG_SWAP_MAX);
> > memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
> > mem_cgroup_id_put(memcg);
> > + rcu_read_unlock();
> > return -ENOMEM;
>
> If you add the "out" label, please use it here too.
>
Good point. Will do.
> > }
> >
> > @@ -7192,6 +7215,8 @@ int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
> > oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);
> > VM_BUG_ON_PAGE(oldid, page);
> > mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
> > +out:
> > + rcu_read_unlock();
> >
> > return 0;
> > }
> > @@ -7246,17 +7271,22 @@ bool mem_cgroup_swap_full(struct page *page)
> > if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
> > return false;
> >
> > + rcu_read_lock();
> > memcg = page_memcg(page);
> > if (!memcg)
> > - return false;
> > + goto out;
> >
> > for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
> > unsigned long usage = page_counter_read(&memcg->swap);
> >
> > if (usage * 2 >= READ_ONCE(memcg->swap.high) ||
> > - usage * 2 >= READ_ONCE(memcg->swap.max))
> > + usage * 2 >= READ_ONCE(memcg->swap.max)) {
> > + rcu_read_unlock();
> > return true;
>
> Please, make something like
> ret = true;
> goto out;
> here. It will be more consistent.
>
Will do.
THanks.
> > + }
> > }
> > +out:
> > + rcu_read_unlock();
> >
> > return false;
> > }
> > diff --git a/mm/migrate.c b/mm/migrate.c
> > index 6c31ee1e1c9b..59e97a8a64a0 100644
> > --- a/mm/migrate.c
> > +++ b/mm/migrate.c
> > @@ -430,6 +430,10 @@ int folio_migrate_mapping(struct address_space *mapping,
> > struct lruvec *old_lruvec, *new_lruvec;
> > struct mem_cgroup *memcg;
> >
> > + /*
> > + * Irq is disabled, which can serve as RCU read-side critical
> > + * sections.
> > + */
> > memcg = folio_memcg(folio);
> > old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat);
> > new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat);
> > diff --git a/mm/page_io.c b/mm/page_io.c
> > index 89fbf3cae30f..a0d9cd68e87a 100644
> > --- a/mm/page_io.c
> > +++ b/mm/page_io.c
> > @@ -221,13 +221,14 @@ static void bio_associate_blkg_from_page(struct bio *bio, struct page *page)
> > struct cgroup_subsys_state *css;
> > struct mem_cgroup *memcg;
> >
> > + rcu_read_lock();
> > memcg = page_memcg(page);
> > if (!memcg)
> > - return;
> > + goto out;
> >
> > - rcu_read_lock();
> > css = cgroup_e_css(memcg->css.cgroup, &io_cgrp_subsys);
> > bio_associate_blkg_from_css(bio, css);
> > +out:
> > rcu_read_unlock();
> > }
> > #else
> > --
> > 2.11.0
> >
>
Powered by blists - more mailing lists