linux-kernel - Re: [patch 8/8] memcg: sanitize __mem_cgroup_try

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20140312140138.GD11831@dhcp22.suse.cz>
Date:	Wed, 12 Mar 2014 15:01:38 +0100
From:	Michal Hocko <mhocko@...e.cz>
To:	Johannes Weiner <hannes@...xchg.org>
Cc:	Andrew Morton <akpm@...ux-foundation.org>, linux-mm@...ck.org,
	cgroups@...r.kernel.org, linux-kernel@...r.kernel.org
Subject: Re: [patch 8/8] memcg: sanitize __mem_cgroup_try_charge() call
 protocol

On Tue 11-03-14 21:28:34, Johannes Weiner wrote:
> Some callsites pass a memcg directly, some callsites pass a mm that
> first has to be translated to a memcg.  This makes for a terrible
> function interface.
> 
> Just push the mm-to-memcg translation into the respective callsites
> and always pass a memcg to mem_cgroup_try_charge().
> 
> Signed-off-by: Johannes Weiner <hannes@...xchg.org>

   text    data     bss     dec     hex filename
  39435    5916    4192   49543    c187 mm/memcontrol.o.after
  40466    5916    4192   50574    c58e mm/memcontrol.o.before

1K down, very nice. But we can shave off an additional ~300B if we add the
common mm charging helper as I suggested before:

   text    data     bss     dec     hex filename
  39100    5916    4192   49208    c038 mm/memcontrol.o.mm

commit 7aa420bc051849d85dcf5a091f3619c6b8e33cfb
Author: Michal Hocko <mhocko@...e.cz>
Date:   Wed Mar 12 14:59:06 2014 +0100

    add charge mm helper

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2d7aa3e784d9..67e01b27a021 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2757,6 +2757,35 @@ bypass:
 	return -EINTR;
 }
 
+/**
+ * mem_cgroup_try_charge_mm - try charging a mm
+ * @mm: mm_struct to charge
+ * @nr_pages: number of pages to charge
+ * @oom: trigger OOM if reclaim fails
+ *
+ * Returns the charged mem_cgroup associated with the given mm_struct or
+ * NULL if the charge failed.
+ */
+static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm,
+				 gfp_t gfp_mask,
+				 unsigned int nr_pages,
+				 bool oom)
+
+{
+	struct mem_cgroup *memcg;
+	int ret;
+
+	memcg = get_mem_cgroup_from_mm(mm);
+	ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages, oom);
+	css_put(&memcg->css);
+	if (ret == -EINTR)
+		memcg = root_mem_cgroup;
+	else if (ret)
+		memcg = NULL;
+
+	return memcg;
+}
+
 /*
  * Somemtimes we have to undo a charge we got by try_charge().
  * This function is for that and do uncharge, put css's refcnt.
@@ -3828,7 +3857,6 @@ int mem_cgroup_newpage_charge(struct page *page,
 	unsigned int nr_pages = 1;
 	struct mem_cgroup *memcg;
 	bool oom = true;
-	int ret;
 
 	if (mem_cgroup_disabled())
 		return 0;
@@ -3847,13 +3875,9 @@ int mem_cgroup_newpage_charge(struct page *page,
 		oom = false;
 	}
 
-	memcg = get_mem_cgroup_from_mm(mm);
-	ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages, oom);
-	css_put(&memcg->css);
-	if (ret == -EINTR)
-		memcg = root_mem_cgroup;
-	else if (ret)
-		return ret;
+	memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, nr_pages, oom);
+	if (!memcg)
+		return -ENOMEM;
 	__mem_cgroup_commit_charge(memcg, page, nr_pages,
 				   MEM_CGROUP_CHARGE_TYPE_ANON, false);
 	return 0;
@@ -3914,15 +3938,10 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
 	 */
 	if (!PageSwapCache(page)) {
 		struct mem_cgroup *memcg;
-		int ret;
 
-		memcg = get_mem_cgroup_from_mm(mm);
-		ret = mem_cgroup_try_charge(memcg, gfp_mask, 1, true);
-		css_put(&memcg->css);
-		if (ret == -EINTR)
-			memcg = root_mem_cgroup;
-		else if (ret)
-			return ret;
+		memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true);
+		if (!memcg)
+			return -ENOMEM;
 		*memcgp = memcg;
 		return 0;
 	}
@@ -3996,13 +4015,9 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
 	if (unlikely(!mm))
 		memcg = root_mem_cgroup;
 	else {
-		memcg = get_mem_cgroup_from_mm(mm);
-		ret = mem_cgroup_try_charge(memcg, gfp_mask, 1, true);
-		css_put(&memcg->css);
-		if (ret == -EINTR)
-			memcg = root_mem_cgroup;
-		else if (ret)
-			return ret;
+		memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true);
+		if (!memcg)
+			return -ENOMEM;
 	}
 	__mem_cgroup_commit_charge(memcg, page, 1, type, false);
 	return 0;

Anyway, on to your patch as is. The above can be posted as a separate patch
or folded in as you prefer.

Acked-by: Michal Hocko <mhocko@...e.cz>

> ---
>  mm/memcontrol.c | 184 +++++++++++++++++++++++++-------------------------------
>  1 file changed, 83 insertions(+), 101 deletions(-)
> 
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 4f7192bfa5fa..876598b4505b 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -2609,7 +2609,7 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
>  }
>  
>  
> -/* See __mem_cgroup_try_charge() for details */
> +/* See mem_cgroup_try_charge() for details */
>  enum {
>  	CHARGE_OK,		/* success */
>  	CHARGE_RETRY,		/* need to retry but retry is not bad */
> @@ -2682,45 +2682,34 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
>  	return CHARGE_NOMEM;
>  }
>  
> -/*
> - * __mem_cgroup_try_charge() does
> - * 1. detect memcg to be charged against from passed *mm and *ptr,
> - * 2. update res_counter
> - * 3. call memory reclaim if necessary.
> - *
> - * In some special case, if the task is fatal, fatal_signal_pending() or
> - * has TIF_MEMDIE, this function returns -EINTR while writing root_mem_cgroup
> - * to *ptr. There are two reasons for this. 1: fatal threads should quit as soon
> - * as possible without any hazards. 2: all pages should have a valid
> - * pc->mem_cgroup. If mm is NULL and the caller doesn't pass a valid memcg
> - * pointer, that is treated as a charge to root_mem_cgroup.
> - *
> - * So __mem_cgroup_try_charge() will return
> - *  0       ...  on success, filling *ptr with a valid memcg pointer.
> - *  -ENOMEM ...  charge failure because of resource limits.
> - *  -EINTR  ...  if thread is fatal. *ptr is filled with root_mem_cgroup.
> +/**
> + * mem_cgroup_try_charge - try charging a memcg
> + * @memcg: memcg to charge
> + * @nr_pages: number of pages to charge
> + * @oom: trigger OOM if reclaim fails
>   *
> - * Unlike the exported interface, an "oom" parameter is added. if oom==true,
> - * the oom-killer can be invoked.
> + * Returns 0 if @memcg was charged successfully, -EINTR if the charge
> + * was bypassed to root_mem_cgroup, and -ENOMEM if the charge failed.
>   */
> -static int __mem_cgroup_try_charge(struct mm_struct *mm,
> -				   gfp_t gfp_mask,
> -				   unsigned int nr_pages,
> -				   struct mem_cgroup **ptr,
> -				   bool oom)
> +static int mem_cgroup_try_charge(struct mem_cgroup *memcg,
> +				 gfp_t gfp_mask,
> +				 unsigned int nr_pages,
> +				 bool oom)
>  {
>  	unsigned int batch = max(CHARGE_BATCH, nr_pages);
>  	int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
> -	struct mem_cgroup *memcg = NULL;
>  	int ret;
>  
> +	if (mem_cgroup_is_root(memcg))
> +		goto done;
>  	/*
> -	 * Unlike gloval-vm's OOM-kill, we're not in memory shortage
> -	 * in system level. So, allow to go ahead dying process in addition to
> -	 * MEMDIE process.
> +	 * Unlike in global OOM situations, memcg is not in a physical
> +	 * memory shortage.  Allow dying and OOM-killed tasks to
> +	 * bypass the last charges so that they can exit quickly and
> +	 * free their memory.
>  	 */
> -	if (unlikely(test_thread_flag(TIF_MEMDIE)
> -		     || fatal_signal_pending(current)))
> +	if (unlikely(test_thread_flag(TIF_MEMDIE) ||
> +		     fatal_signal_pending(current)))
>  		goto bypass;
>  
>  	if (unlikely(task_in_memcg_oom(current)))
> @@ -2729,14 +2718,6 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
>  	if (gfp_mask & __GFP_NOFAIL)
>  		oom = false;
>  again:
> -	if (*ptr) { /* css should be a valid one */
> -		memcg = *ptr;
> -		css_get(&memcg->css);
> -	} else {
> -		memcg = get_mem_cgroup_from_mm(mm);
> -	}
> -	if (mem_cgroup_is_root(memcg))
> -		goto done;
>  	if (consume_stock(memcg, nr_pages))
>  		goto done;
>  
> @@ -2744,10 +2725,8 @@ again:
>  		bool invoke_oom = oom && !nr_oom_retries;
>  
>  		/* If killed, bypass charge */
> -		if (fatal_signal_pending(current)) {
> -			css_put(&memcg->css);
> +		if (fatal_signal_pending(current))
>  			goto bypass;
> -		}
>  
>  		ret = mem_cgroup_do_charge(memcg, gfp_mask, batch,
>  					   nr_pages, invoke_oom);
> @@ -2756,17 +2735,12 @@ again:
>  			break;
>  		case CHARGE_RETRY: /* not in OOM situation but retry */
>  			batch = nr_pages;
> -			css_put(&memcg->css);
> -			memcg = NULL;
>  			goto again;
>  		case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
> -			css_put(&memcg->css);
>  			goto nomem;
>  		case CHARGE_NOMEM: /* OOM routine works */
> -			if (!oom || invoke_oom) {
> -				css_put(&memcg->css);
> +			if (!oom || invoke_oom)
>  				goto nomem;
> -			}
>  			nr_oom_retries--;
>  			break;
>  		}
> @@ -2775,16 +2749,11 @@ again:
>  	if (batch > nr_pages)
>  		refill_stock(memcg, batch - nr_pages);
>  done:
> -	css_put(&memcg->css);
> -	*ptr = memcg;
>  	return 0;
>  nomem:
> -	if (!(gfp_mask & __GFP_NOFAIL)) {
> -		*ptr = NULL;
> +	if (!(gfp_mask & __GFP_NOFAIL))
>  		return -ENOMEM;
> -	}
>  bypass:
> -	*ptr = root_mem_cgroup;
>  	return -EINTR;
>  }
>  
> @@ -2983,20 +2952,17 @@ static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v)
>  static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
>  {
>  	struct res_counter *fail_res;
> -	struct mem_cgroup *_memcg;
>  	int ret = 0;
>  
>  	ret = res_counter_charge(&memcg->kmem, size, &fail_res);
>  	if (ret)
>  		return ret;
>  
> -	_memcg = memcg;
> -	ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT,
> -				      &_memcg, oom_gfp_allowed(gfp));
> -
> +	ret = mem_cgroup_try_charge(memcg, gfp, size >> PAGE_SHIFT,
> +				    oom_gfp_allowed(gfp));
>  	if (ret == -EINTR)  {
>  		/*
> -		 * __mem_cgroup_try_charge() chosed to bypass to root due to
> +		 * mem_cgroup_try_charge() chosed to bypass to root due to
>  		 * OOM kill or fatal signal.  Since our only options are to
>  		 * either fail the allocation or charge it to this cgroup, do
>  		 * it as a temporary condition. But we can't fail. From a
> @@ -3006,7 +2972,7 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
>  		 *
>  		 * This condition will only trigger if the task entered
>  		 * memcg_charge_kmem in a sane state, but was OOM-killed during
> -		 * __mem_cgroup_try_charge() above. Tasks that were already
> +		 * mem_cgroup_try_charge() above. Tasks that were already
>  		 * dying when the allocation triggers should have been already
>  		 * directed to the root cgroup in memcontrol.h
>  		 */
> @@ -3858,8 +3824,8 @@ out:
>  int mem_cgroup_newpage_charge(struct page *page,
>  			      struct mm_struct *mm, gfp_t gfp_mask)
>  {
> -	struct mem_cgroup *memcg = NULL;
>  	unsigned int nr_pages = 1;
> +	struct mem_cgroup *memcg;
>  	bool oom = true;
>  	int ret;
>  
> @@ -3880,8 +3846,12 @@ int mem_cgroup_newpage_charge(struct page *page,
>  		oom = false;
>  	}
>  
> -	ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
> -	if (ret == -ENOMEM)
> +	memcg = get_mem_cgroup_from_mm(mm);
> +	ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages, oom);
> +	css_put(&memcg->css);
> +	if (ret == -EINTR)
> +		memcg = root_mem_cgroup;
> +	else if (ret)
>  		return ret;
>  	__mem_cgroup_commit_charge(memcg, page, nr_pages,
>  				   MEM_CGROUP_CHARGE_TYPE_ANON, false);
> @@ -3899,7 +3869,7 @@ static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
>  					  gfp_t mask,
>  					  struct mem_cgroup **memcgp)
>  {
> -	struct mem_cgroup *memcg;
> +	struct mem_cgroup *memcg = NULL;
>  	struct page_cgroup *pc;
>  	int ret;
>  
> @@ -3912,31 +3882,29 @@ static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
>  	 * in turn serializes uncharging.
>  	 */
>  	if (PageCgroupUsed(pc))
> -		return 0;
> -	if (!do_swap_account)
> -		goto charge_cur_mm;
> -	memcg = try_get_mem_cgroup_from_page(page);
> +		goto out;
> +	if (do_swap_account)
> +		memcg = try_get_mem_cgroup_from_page(page);
>  	if (!memcg)
> -		goto charge_cur_mm;
> -	*memcgp = memcg;
> -	ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true);
> +		memcg = get_mem_cgroup_from_mm(mm);
> +	ret = mem_cgroup_try_charge(memcg, mask, 1, true);
>  	css_put(&memcg->css);
>  	if (ret == -EINTR)
> -		ret = 0;
> -	return ret;
> -charge_cur_mm:
> -	ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true);
> -	if (ret == -EINTR)
> -		ret = 0;
> -	return ret;
> +		memcg = root_mem_cgroup;
> +	else if (ret)
> +		return ret;
> +out:
> +	*memcgp = memcg;
> +	return 0;
>  }
>  
>  int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
>  				 gfp_t gfp_mask, struct mem_cgroup **memcgp)
>  {
> -	*memcgp = NULL;
> -	if (mem_cgroup_disabled())
> +	if (mem_cgroup_disabled()) {
> +		*memcgp = NULL;
>  		return 0;
> +	}
>  	/*
>  	 * A racing thread's fault, or swapoff, may have already
>  	 * updated the pte, and even removed page from swap cache: in
> @@ -3944,12 +3912,18 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
>  	 * there's also a KSM case which does need to charge the page.
>  	 */
>  	if (!PageSwapCache(page)) {
> +		struct mem_cgroup *memcg;
>  		int ret;
>  
> -		ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true);
> +		memcg = get_mem_cgroup_from_mm(mm);
> +		ret = mem_cgroup_try_charge(memcg, gfp_mask, 1, true);
> +		css_put(&memcg->css);
>  		if (ret == -EINTR)
> -			ret = 0;
> -		return ret;
> +			memcg = root_mem_cgroup;
> +		else if (ret)
> +			return ret;
> +		*memcgp = memcg;
> +		return 0;
>  	}
>  	return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp);
>  }
> @@ -3996,8 +3970,8 @@ void mem_cgroup_commit_charge_swapin(struct page *page,
>  int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
>  				gfp_t gfp_mask)
>  {
> -	struct mem_cgroup *memcg = NULL;
>  	enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
> +	struct mem_cgroup *memcg;
>  	int ret;
>  
>  	if (mem_cgroup_disabled())
> @@ -4005,23 +3979,32 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
>  	if (PageCompound(page))
>  		return 0;
>  
> -	if (!PageSwapCache(page)) {
> -		/*
> -		 * Page cache insertions can happen without an actual
> -		 * task context, e.g. during disk probing on boot.
> -		 */
> -		if (!mm)
> -			memcg = root_mem_cgroup;
> -		ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &memcg, true);
> -		if (ret != -ENOMEM)
> -			__mem_cgroup_commit_charge(memcg, page, 1, type, false);
> -	} else { /* page is swapcache/shmem */
> +	if (PageSwapCache(page)) { /* shmem */
>  		ret = __mem_cgroup_try_charge_swapin(mm, page,
>  						     gfp_mask, &memcg);
> -		if (!ret)
> -			__mem_cgroup_commit_charge_swapin(page, memcg, type);
> +		if (ret)
> +			return ret;
> +		__mem_cgroup_commit_charge_swapin(page, memcg, type);
> +		return 0;
>  	}
> -	return ret;
> +
> +	/*
> +	 * Page cache insertions can happen without an actual mm
> +	 * context, e.g. during disk probing on boot.
> +	 */
> +	if (unlikely(!mm))
> +		memcg = root_mem_cgroup;
> +	else {
> +		memcg = get_mem_cgroup_from_mm(mm);
> +		ret = mem_cgroup_try_charge(memcg, gfp_mask, 1, true);
> +		css_put(&memcg->css);
> +		if (ret == -EINTR)
> +			memcg = root_mem_cgroup;
> +		else if (ret)
> +			return ret;
> +	}
> +	__mem_cgroup_commit_charge(memcg, page, 1, type, false);
> +	return 0;
>  }
>  
>  static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
> @@ -6635,8 +6618,7 @@ one_by_one:
>  			batch_count = PRECHARGE_COUNT_AT_ONCE;
>  			cond_resched();
>  		}
> -		ret = __mem_cgroup_try_charge(NULL,
> -					GFP_KERNEL, 1, &memcg, false);
> +		ret = mem_cgroup_try_charge(memcg, GFP_KERNEL, 1, false);
>  		if (ret)
>  			/* mem_cgroup_clear_mc() will do uncharge later */
>  			return ret;
> -- 
> 1.9.0
> 

-- 
Michal Hocko
SUSE Labs
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/