linux-kernel - Re: [patch 8/8] memcg: sanitize __mem_cgroup_try

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20140312140520.GE11831@dhcp22.suse.cz>
Date:	Wed, 12 Mar 2014 15:05:20 +0100
From:	Michal Hocko <mhocko@...e.cz>
To:	Johannes Weiner <hannes@...xchg.org>
Cc:	Andrew Morton <akpm@...ux-foundation.org>, linux-mm@...ck.org,
	cgroups@...r.kernel.org, linux-kernel@...r.kernel.org
Subject: Re: [patch 8/8] memcg: sanitize __mem_cgroup_try_charge() call
 protocol

On Wed 12-03-14 15:01:38, Michal Hocko wrote:
> On Tue 11-03-14 21:28:34, Johannes Weiner wrote:
> > Some callsites pass a memcg directly, some callsites pass a mm that
> > first has to be translated to a memcg.  This makes for a terrible
> > function interface.
> > 
> > Just push the mm-to-memcg translation into the respective callsites
> > and always pass a memcg to mem_cgroup_try_charge().
> > 
> > Signed-off-by: Johannes Weiner <hannes@...xchg.org>
> 
>    text    data     bss     dec     hex filename
>   39435    5916    4192   49543    c187 mm/memcontrol.o.after
>   40466    5916    4192   50574    c58e mm/memcontrol.o.before

Just to prevent confusion: .before is from before the whole series is
applied, not just this patch.

Before _this_ patch size looks like this:
   text    data     bss     dec     hex filename
  39898    5916    4192   50006    c356 mm/memcontrol.o

> 1K down, very nice. But we can shave off an additional ~300B if we add the
> common mm charging helper as I suggested before:
> 
>    text    data     bss     dec     hex filename
>   39100    5916    4192   49208    c038 mm/memcontrol.o.mm
> 
> commit 7aa420bc051849d85dcf5a091f3619c6b8e33cfb
> Author: Michal Hocko <mhocko@...e.cz>
> Date:   Wed Mar 12 14:59:06 2014 +0100
> 
>     add charge mm helper
> 
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 2d7aa3e784d9..67e01b27a021 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -2757,6 +2757,35 @@ bypass:
>  	return -EINTR;
>  }
>  
> +/**
> + * mem_cgroup_try_charge_mm - try charging a mm
> + * @mm: mm_struct to charge
> + * @nr_pages: number of pages to charge
> + * @oom: trigger OOM if reclaim fails
> + *
> + * Returns the charged mem_cgroup associated with the given mm_struct or
> + * NULL if the charge failed.
> + */
> +static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm,
> +				 gfp_t gfp_mask,
> +				 unsigned int nr_pages,
> +				 bool oom)
> +
> +{
> +	struct mem_cgroup *memcg;
> +	int ret;
> +
> +	memcg = get_mem_cgroup_from_mm(mm);
> +	ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages, oom);
> +	css_put(&memcg->css);
> +	if (ret == -EINTR)
> +		memcg = root_mem_cgroup;
> +	else if (ret)
> +		memcg = NULL;
> +
> +	return memcg;
> +}
> +
>  /*
>   * Somemtimes we have to undo a charge we got by try_charge().
>   * This function is for that and do uncharge, put css's refcnt.
> @@ -3828,7 +3857,6 @@ int mem_cgroup_newpage_charge(struct page *page,
>  	unsigned int nr_pages = 1;
>  	struct mem_cgroup *memcg;
>  	bool oom = true;
> -	int ret;
>  
>  	if (mem_cgroup_disabled())
>  		return 0;
> @@ -3847,13 +3875,9 @@ int mem_cgroup_newpage_charge(struct page *page,
>  		oom = false;
>  	}
>  
> -	memcg = get_mem_cgroup_from_mm(mm);
> -	ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages, oom);
> -	css_put(&memcg->css);
> -	if (ret == -EINTR)
> -		memcg = root_mem_cgroup;
> -	else if (ret)
> -		return ret;
> +	memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, nr_pages, oom);
> +	if (!memcg)
> +		return -ENOMEM;
>  	__mem_cgroup_commit_charge(memcg, page, nr_pages,
>  				   MEM_CGROUP_CHARGE_TYPE_ANON, false);
>  	return 0;
> @@ -3914,15 +3938,10 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
>  	 */
>  	if (!PageSwapCache(page)) {
>  		struct mem_cgroup *memcg;
> -		int ret;
>  
> -		memcg = get_mem_cgroup_from_mm(mm);
> -		ret = mem_cgroup_try_charge(memcg, gfp_mask, 1, true);
> -		css_put(&memcg->css);
> -		if (ret == -EINTR)
> -			memcg = root_mem_cgroup;
> -		else if (ret)
> -			return ret;
> +		memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true);
> +		if (!memcg)
> +			return -ENOMEM;
>  		*memcgp = memcg;
>  		return 0;
>  	}
> @@ -3996,13 +4015,9 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
>  	if (unlikely(!mm))
>  		memcg = root_mem_cgroup;
>  	else {
> -		memcg = get_mem_cgroup_from_mm(mm);
> -		ret = mem_cgroup_try_charge(memcg, gfp_mask, 1, true);
> -		css_put(&memcg->css);
> -		if (ret == -EINTR)
> -			memcg = root_mem_cgroup;
> -		else if (ret)
> -			return ret;
> +		memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true);
> +		if (!memcg)
> +			return -ENOMEM;
>  	}
>  	__mem_cgroup_commit_charge(memcg, page, 1, type, false);
>  	return 0;
> 
> Anyway, on to your patch as is. The above can be posted as a separate patch
> or folded in as you prefer.
> 
> Acked-by: Michal Hocko <mhocko@...e.cz>
> 
> > ---
> >  mm/memcontrol.c | 184 +++++++++++++++++++++++++-------------------------------
> >  1 file changed, 83 insertions(+), 101 deletions(-)
> > 
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index 4f7192bfa5fa..876598b4505b 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -2609,7 +2609,7 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
> >  }
> >  
> >  
> > -/* See __mem_cgroup_try_charge() for details */
> > +/* See mem_cgroup_try_charge() for details */
> >  enum {
> >  	CHARGE_OK,		/* success */
> >  	CHARGE_RETRY,		/* need to retry but retry is not bad */
> > @@ -2682,45 +2682,34 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
> >  	return CHARGE_NOMEM;
> >  }
> >  
> > -/*
> > - * __mem_cgroup_try_charge() does
> > - * 1. detect memcg to be charged against from passed *mm and *ptr,
> > - * 2. update res_counter
> > - * 3. call memory reclaim if necessary.
> > - *
> > - * In some special case, if the task is fatal, fatal_signal_pending() or
> > - * has TIF_MEMDIE, this function returns -EINTR while writing root_mem_cgroup
> > - * to *ptr. There are two reasons for this. 1: fatal threads should quit as soon
> > - * as possible without any hazards. 2: all pages should have a valid
> > - * pc->mem_cgroup. If mm is NULL and the caller doesn't pass a valid memcg
> > - * pointer, that is treated as a charge to root_mem_cgroup.
> > - *
> > - * So __mem_cgroup_try_charge() will return
> > - *  0       ...  on success, filling *ptr with a valid memcg pointer.
> > - *  -ENOMEM ...  charge failure because of resource limits.
> > - *  -EINTR  ...  if thread is fatal. *ptr is filled with root_mem_cgroup.
> > +/**
> > + * mem_cgroup_try_charge - try charging a memcg
> > + * @memcg: memcg to charge
> > + * @nr_pages: number of pages to charge
> > + * @oom: trigger OOM if reclaim fails
> >   *
> > - * Unlike the exported interface, an "oom" parameter is added. if oom==true,
> > - * the oom-killer can be invoked.
> > + * Returns 0 if @memcg was charged successfully, -EINTR if the charge
> > + * was bypassed to root_mem_cgroup, and -ENOMEM if the charge failed.
> >   */
> > -static int __mem_cgroup_try_charge(struct mm_struct *mm,
> > -				   gfp_t gfp_mask,
> > -				   unsigned int nr_pages,
> > -				   struct mem_cgroup **ptr,
> > -				   bool oom)
> > +static int mem_cgroup_try_charge(struct mem_cgroup *memcg,
> > +				 gfp_t gfp_mask,
> > +				 unsigned int nr_pages,
> > +				 bool oom)
> >  {
> >  	unsigned int batch = max(CHARGE_BATCH, nr_pages);
> >  	int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
> > -	struct mem_cgroup *memcg = NULL;
> >  	int ret;
> >  
> > +	if (mem_cgroup_is_root(memcg))
> > +		goto done;
> >  	/*
> > -	 * Unlike gloval-vm's OOM-kill, we're not in memory shortage
> > -	 * in system level. So, allow to go ahead dying process in addition to
> > -	 * MEMDIE process.
> > +	 * Unlike in global OOM situations, memcg is not in a physical
> > +	 * memory shortage.  Allow dying and OOM-killed tasks to
> > +	 * bypass the last charges so that they can exit quickly and
> > +	 * free their memory.
> >  	 */
> > -	if (unlikely(test_thread_flag(TIF_MEMDIE)
> > -		     || fatal_signal_pending(current)))
> > +	if (unlikely(test_thread_flag(TIF_MEMDIE) ||
> > +		     fatal_signal_pending(current)))
> >  		goto bypass;
> >  
> >  	if (unlikely(task_in_memcg_oom(current)))
> > @@ -2729,14 +2718,6 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
> >  	if (gfp_mask & __GFP_NOFAIL)
> >  		oom = false;
> >  again:
> > -	if (*ptr) { /* css should be a valid one */
> > -		memcg = *ptr;
> > -		css_get(&memcg->css);
> > -	} else {
> > -		memcg = get_mem_cgroup_from_mm(mm);
> > -	}
> > -	if (mem_cgroup_is_root(memcg))
> > -		goto done;
> >  	if (consume_stock(memcg, nr_pages))
> >  		goto done;
> >  
> > @@ -2744,10 +2725,8 @@ again:
> >  		bool invoke_oom = oom && !nr_oom_retries;
> >  
> >  		/* If killed, bypass charge */
> > -		if (fatal_signal_pending(current)) {
> > -			css_put(&memcg->css);
> > +		if (fatal_signal_pending(current))
> >  			goto bypass;
> > -		}
> >  
> >  		ret = mem_cgroup_do_charge(memcg, gfp_mask, batch,
> >  					   nr_pages, invoke_oom);
> > @@ -2756,17 +2735,12 @@ again:
> >  			break;
> >  		case CHARGE_RETRY: /* not in OOM situation but retry */
> >  			batch = nr_pages;
> > -			css_put(&memcg->css);
> > -			memcg = NULL;
> >  			goto again;
> >  		case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
> > -			css_put(&memcg->css);
> >  			goto nomem;
> >  		case CHARGE_NOMEM: /* OOM routine works */
> > -			if (!oom || invoke_oom) {
> > -				css_put(&memcg->css);
> > +			if (!oom || invoke_oom)
> >  				goto nomem;
> > -			}
> >  			nr_oom_retries--;
> >  			break;
> >  		}
> > @@ -2775,16 +2749,11 @@ again:
> >  	if (batch > nr_pages)
> >  		refill_stock(memcg, batch - nr_pages);
> >  done:
> > -	css_put(&memcg->css);
> > -	*ptr = memcg;
> >  	return 0;
> >  nomem:
> > -	if (!(gfp_mask & __GFP_NOFAIL)) {
> > -		*ptr = NULL;
> > +	if (!(gfp_mask & __GFP_NOFAIL))
> >  		return -ENOMEM;
> > -	}
> >  bypass:
> > -	*ptr = root_mem_cgroup;
> >  	return -EINTR;
> >  }
> >  
> > @@ -2983,20 +2952,17 @@ static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v)
> >  static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
> >  {
> >  	struct res_counter *fail_res;
> > -	struct mem_cgroup *_memcg;
> >  	int ret = 0;
> >  
> >  	ret = res_counter_charge(&memcg->kmem, size, &fail_res);
> >  	if (ret)
> >  		return ret;
> >  
> > -	_memcg = memcg;
> > -	ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT,
> > -				      &_memcg, oom_gfp_allowed(gfp));
> > -
> > +	ret = mem_cgroup_try_charge(memcg, gfp, size >> PAGE_SHIFT,
> > +				    oom_gfp_allowed(gfp));
> >  	if (ret == -EINTR)  {
> >  		/*
> > -		 * __mem_cgroup_try_charge() chosed to bypass to root due to
> > +		 * mem_cgroup_try_charge() chosed to bypass to root due to
> >  		 * OOM kill or fatal signal.  Since our only options are to
> >  		 * either fail the allocation or charge it to this cgroup, do
> >  		 * it as a temporary condition. But we can't fail. From a
> > @@ -3006,7 +2972,7 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
> >  		 *
> >  		 * This condition will only trigger if the task entered
> >  		 * memcg_charge_kmem in a sane state, but was OOM-killed during
> > -		 * __mem_cgroup_try_charge() above. Tasks that were already
> > +		 * mem_cgroup_try_charge() above. Tasks that were already
> >  		 * dying when the allocation triggers should have been already
> >  		 * directed to the root cgroup in memcontrol.h
> >  		 */
> > @@ -3858,8 +3824,8 @@ out:
> >  int mem_cgroup_newpage_charge(struct page *page,
> >  			      struct mm_struct *mm, gfp_t gfp_mask)
> >  {
> > -	struct mem_cgroup *memcg = NULL;
> >  	unsigned int nr_pages = 1;
> > +	struct mem_cgroup *memcg;
> >  	bool oom = true;
> >  	int ret;
> >  
> > @@ -3880,8 +3846,12 @@ int mem_cgroup_newpage_charge(struct page *page,
> >  		oom = false;
> >  	}
> >  
> > -	ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
> > -	if (ret == -ENOMEM)
> > +	memcg = get_mem_cgroup_from_mm(mm);
> > +	ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages, oom);
> > +	css_put(&memcg->css);
> > +	if (ret == -EINTR)
> > +		memcg = root_mem_cgroup;
> > +	else if (ret)
> >  		return ret;
> >  	__mem_cgroup_commit_charge(memcg, page, nr_pages,
> >  				   MEM_CGROUP_CHARGE_TYPE_ANON, false);
> > @@ -3899,7 +3869,7 @@ static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
> >  					  gfp_t mask,
> >  					  struct mem_cgroup **memcgp)
> >  {
> > -	struct mem_cgroup *memcg;
> > +	struct mem_cgroup *memcg = NULL;
> >  	struct page_cgroup *pc;
> >  	int ret;
> >  
> > @@ -3912,31 +3882,29 @@ static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
> >  	 * in turn serializes uncharging.
> >  	 */
> >  	if (PageCgroupUsed(pc))
> > -		return 0;
> > -	if (!do_swap_account)
> > -		goto charge_cur_mm;
> > -	memcg = try_get_mem_cgroup_from_page(page);
> > +		goto out;
> > +	if (do_swap_account)
> > +		memcg = try_get_mem_cgroup_from_page(page);
> >  	if (!memcg)
> > -		goto charge_cur_mm;
> > -	*memcgp = memcg;
> > -	ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true);
> > +		memcg = get_mem_cgroup_from_mm(mm);
> > +	ret = mem_cgroup_try_charge(memcg, mask, 1, true);
> >  	css_put(&memcg->css);
> >  	if (ret == -EINTR)
> > -		ret = 0;
> > -	return ret;
> > -charge_cur_mm:
> > -	ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true);
> > -	if (ret == -EINTR)
> > -		ret = 0;
> > -	return ret;
> > +		memcg = root_mem_cgroup;
> > +	else if (ret)
> > +		return ret;
> > +out:
> > +	*memcgp = memcg;
> > +	return 0;
> >  }
> >  
> >  int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
> >  				 gfp_t gfp_mask, struct mem_cgroup **memcgp)
> >  {
> > -	*memcgp = NULL;
> > -	if (mem_cgroup_disabled())
> > +	if (mem_cgroup_disabled()) {
> > +		*memcgp = NULL;
> >  		return 0;
> > +	}
> >  	/*
> >  	 * A racing thread's fault, or swapoff, may have already
> >  	 * updated the pte, and even removed page from swap cache: in
> > @@ -3944,12 +3912,18 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
> >  	 * there's also a KSM case which does need to charge the page.
> >  	 */
> >  	if (!PageSwapCache(page)) {
> > +		struct mem_cgroup *memcg;
> >  		int ret;
> >  
> > -		ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true);
> > +		memcg = get_mem_cgroup_from_mm(mm);
> > +		ret = mem_cgroup_try_charge(memcg, gfp_mask, 1, true);
> > +		css_put(&memcg->css);
> >  		if (ret == -EINTR)
> > -			ret = 0;
> > -		return ret;
> > +			memcg = root_mem_cgroup;
> > +		else if (ret)
> > +			return ret;
> > +		*memcgp = memcg;
> > +		return 0;
> >  	}
> >  	return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp);
> >  }
> > @@ -3996,8 +3970,8 @@ void mem_cgroup_commit_charge_swapin(struct page *page,
> >  int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
> >  				gfp_t gfp_mask)
> >  {
> > -	struct mem_cgroup *memcg = NULL;
> >  	enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
> > +	struct mem_cgroup *memcg;
> >  	int ret;
> >  
> >  	if (mem_cgroup_disabled())
> > @@ -4005,23 +3979,32 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
> >  	if (PageCompound(page))
> >  		return 0;
> >  
> > -	if (!PageSwapCache(page)) {
> > -		/*
> > -		 * Page cache insertions can happen without an actual
> > -		 * task context, e.g. during disk probing on boot.
> > -		 */
> > -		if (!mm)
> > -			memcg = root_mem_cgroup;
> > -		ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &memcg, true);
> > -		if (ret != -ENOMEM)
> > -			__mem_cgroup_commit_charge(memcg, page, 1, type, false);
> > -	} else { /* page is swapcache/shmem */
> > +	if (PageSwapCache(page)) { /* shmem */
> >  		ret = __mem_cgroup_try_charge_swapin(mm, page,
> >  						     gfp_mask, &memcg);
> > -		if (!ret)
> > -			__mem_cgroup_commit_charge_swapin(page, memcg, type);
> > +		if (ret)
> > +			return ret;
> > +		__mem_cgroup_commit_charge_swapin(page, memcg, type);
> > +		return 0;
> >  	}
> > -	return ret;
> > +
> > +	/*
> > +	 * Page cache insertions can happen without an actual mm
> > +	 * context, e.g. during disk probing on boot.
> > +	 */
> > +	if (unlikely(!mm))
> > +		memcg = root_mem_cgroup;
> > +	else {
> > +		memcg = get_mem_cgroup_from_mm(mm);
> > +		ret = mem_cgroup_try_charge(memcg, gfp_mask, 1, true);
> > +		css_put(&memcg->css);
> > +		if (ret == -EINTR)
> > +			memcg = root_mem_cgroup;
> > +		else if (ret)
> > +			return ret;
> > +	}
> > +	__mem_cgroup_commit_charge(memcg, page, 1, type, false);
> > +	return 0;
> >  }
> >  
> >  static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
> > @@ -6635,8 +6618,7 @@ one_by_one:
> >  			batch_count = PRECHARGE_COUNT_AT_ONCE;
> >  			cond_resched();
> >  		}
> > -		ret = __mem_cgroup_try_charge(NULL,
> > -					GFP_KERNEL, 1, &memcg, false);
> > +		ret = mem_cgroup_try_charge(memcg, GFP_KERNEL, 1, false);
> >  		if (ret)
> >  			/* mem_cgroup_clear_mc() will do uncharge later */
> >  			return ret;
> > -- 
> > 1.9.0
> > 
> 
> -- 
> Michal Hocko
> SUSE Labs
> --
> To unsubscribe from this list: send the line "unsubscribe cgroups" in
> the body of a message to majordomo@...r.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

-- 
Michal Hocko
SUSE Labs
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/