lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <Yo2WKADtPy2rekRh@carbon>
Date:   Tue, 24 May 2022 19:36:24 -0700
From:   Roman Gushchin <roman.gushchin@...ux.dev>
To:     Muchun Song <songmuchun@...edance.com>
Cc:     hannes@...xchg.org, mhocko@...nel.org, shakeelb@...gle.com,
        cgroups@...r.kernel.org, linux-mm@...ck.org,
        linux-kernel@...r.kernel.org, duanxiongchun@...edance.com,
        longman@...hat.com
Subject: Re: [PATCH v4 01/11] mm: memcontrol: prepare objcg API for non-kmem
 usage

On Tue, May 24, 2022 at 02:05:41PM +0800, Muchun Song wrote:
> Pagecache pages are charged at allocation time and hold a
> reference to the original memory cgroup until they are reclaimed.
> Depending on the memory pressure, specific patterns of the page
> sharing between different cgroups and the cgroup creation and
> destruction rates, a large number of dying memory cgroups can be
> pinned by pagecache pages. It makes the page reclaim less efficient
> and wastes memory.
> 
> We can convert LRU pages and most other raw memcg pins to the objcg
> direction to fix this problem, and then the page->memcg will always
> point to an object cgroup pointer.
> 
> Therefore, the infrastructure of objcg no longer only serves
> CONFIG_MEMCG_KMEM. In this patch, we move the infrastructure of the
> objcg out of the scope of the CONFIG_MEMCG_KMEM so that the LRU pages
> can reuse it to charge pages.
> 
> We know that the LRU pages are not accounted at the root level. But
> the page->memcg_data points to the root_mem_cgroup. So the
> page->memcg_data of the LRU pages always points to a valid pointer.
> But the root_mem_cgroup does not have an object cgroup. If we use
> obj_cgroup APIs to charge the LRU pages, we should set the
> page->memcg_data to a root object cgroup. So we also allocate an
> object cgroup for the root_mem_cgroup.
> 
> Signed-off-by: Muchun Song <songmuchun@...edance.com>
> ---
>  include/linux/memcontrol.h |  5 ++--
>  mm/memcontrol.c            | 60 +++++++++++++++++++++++++---------------------
>  2 files changed, 35 insertions(+), 30 deletions(-)
> 
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 89b14729d59f..ff1c1dd7e762 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -315,10 +315,10 @@ struct mem_cgroup {
>  
>  #ifdef CONFIG_MEMCG_KMEM
>  	int kmemcg_id;
> +#endif
>  	struct obj_cgroup __rcu *objcg;
>  	/* list of inherited objcgs, protected by objcg_lock */
>  	struct list_head objcg_list;
> -#endif
>  
>  	MEMCG_PADDING(_pad2_);
>  
> @@ -851,8 +851,7 @@ static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec)
>   * parent_mem_cgroup - find the accounting parent of a memcg
>   * @memcg: memcg whose parent to find
>   *
> - * Returns the parent memcg, or NULL if this is the root or the memory
> - * controller is in legacy no-hierarchy mode.
> + * Returns the parent memcg, or NULL if this is the root.
>   */
>  static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
>  {
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 598fece89e2b..6de0d3e53eb1 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -254,9 +254,9 @@ struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr)
>  	return container_of(vmpr, struct mem_cgroup, vmpressure);
>  }
>  
> -#ifdef CONFIG_MEMCG_KMEM
>  static DEFINE_SPINLOCK(objcg_lock);
>  
> +#ifdef CONFIG_MEMCG_KMEM
>  bool mem_cgroup_kmem_disabled(void)
>  {
>  	return cgroup_memory_nokmem;
> @@ -265,12 +265,10 @@ bool mem_cgroup_kmem_disabled(void)
>  static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
>  				      unsigned int nr_pages);
>  
> -static void obj_cgroup_release(struct percpu_ref *ref)
> +static void obj_cgroup_release_bytes(struct obj_cgroup *objcg)
>  {
> -	struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
>  	unsigned int nr_bytes;
>  	unsigned int nr_pages;
> -	unsigned long flags;
>  
>  	/*
>  	 * At this point all allocated objects are freed, and
> @@ -284,9 +282,9 @@ static void obj_cgroup_release(struct percpu_ref *ref)
>  	 * 3) CPU1: a process from another memcg is allocating something,
>  	 *          the stock if flushed,
>  	 *          objcg->nr_charged_bytes = PAGE_SIZE - 92
> -	 * 5) CPU0: we do release this object,
> +	 * 4) CPU0: we do release this object,
>  	 *          92 bytes are added to stock->nr_bytes
> -	 * 6) CPU0: stock is flushed,
> +	 * 5) CPU0: stock is flushed,
>  	 *          92 bytes are added to objcg->nr_charged_bytes
>  	 *
>  	 * In the result, nr_charged_bytes == PAGE_SIZE.
> @@ -298,6 +296,19 @@ static void obj_cgroup_release(struct percpu_ref *ref)
>  
>  	if (nr_pages)
>  		obj_cgroup_uncharge_pages(objcg, nr_pages);
> +}
> +#else
> +static inline void obj_cgroup_release_bytes(struct obj_cgroup *objcg)
> +{
> +}
> +#endif
> +
> +static void obj_cgroup_release(struct percpu_ref *ref)
> +{
> +	struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
> +	unsigned long flags;
> +
> +	obj_cgroup_release_bytes(objcg);
>  
>  	spin_lock_irqsave(&objcg_lock, flags);
>  	list_del(&objcg->list);
> @@ -326,10 +337,10 @@ static struct obj_cgroup *obj_cgroup_alloc(void)
>  	return objcg;
>  }
>  
> -static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
> -				  struct mem_cgroup *parent)
> +static void memcg_reparent_objcgs(struct mem_cgroup *memcg)
>  {
>  	struct obj_cgroup *objcg, *iter;
> +	struct mem_cgroup *parent = parent_mem_cgroup(memcg);
>  
>  	objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
>  
> @@ -348,6 +359,7 @@ static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
>  	percpu_ref_kill(&objcg->refcnt);
>  }
>  
> +#ifdef CONFIG_MEMCG_KMEM
>  /*
>   * A lot of the calls to the cache allocation functions are expected to be
>   * inlined by the compiler. Since the calls to memcg_slab_pre_alloc_hook() are
> @@ -3589,21 +3601,12 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
>  #ifdef CONFIG_MEMCG_KMEM
>  static int memcg_online_kmem(struct mem_cgroup *memcg)
>  {
> -	struct obj_cgroup *objcg;
> -
>  	if (cgroup_memory_nokmem)
>  		return 0;
>  
>  	if (unlikely(mem_cgroup_is_root(memcg)))
>  		return 0;
>  
> -	objcg = obj_cgroup_alloc();
> -	if (!objcg)
> -		return -ENOMEM;
> -
> -	objcg->memcg = memcg;
> -	rcu_assign_pointer(memcg->objcg, objcg);
> -
>  	static_branch_enable(&memcg_kmem_enabled_key);
>  
>  	memcg->kmemcg_id = memcg->id.id;
> @@ -3613,27 +3616,19 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
>  
>  static void memcg_offline_kmem(struct mem_cgroup *memcg)
>  {
> -	struct mem_cgroup *parent;
> -
>  	if (cgroup_memory_nokmem)
>  		return;
>  
>  	if (unlikely(mem_cgroup_is_root(memcg)))
>  		return;
>  
> -	parent = parent_mem_cgroup(memcg);
> -	if (!parent)
> -		parent = root_mem_cgroup;
> -
> -	memcg_reparent_objcgs(memcg, parent);
> -
>  	/*
>  	 * After we have finished memcg_reparent_objcgs(), all list_lrus
>  	 * corresponding to this cgroup are guaranteed to remain empty.
>  	 * The ordering is imposed by list_lru_node->lock taken by
>  	 * memcg_reparent_list_lrus().
>  	 */

This comment no longer looks correct after these changes. Should the
comment be fixed, or should the ordering be fixed as well?

> -	memcg_reparent_list_lrus(memcg, parent);
> +	memcg_reparent_list_lrus(memcg, parent_mem_cgroup(memcg));

We effectively dropped this:
	if (!parent)
		parent = root_mem_cgroup;
Is it safe? (Assuming v1 non-hierarchical mode, which is usually where
things get complicated.)

The rest of the patch looks good to me.

Thanks!

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ