Message-ID: <20170523071131.GG12813@dhcp22.suse.cz>
Date: Tue, 23 May 2017 09:11:31 +0200
From: Michal Hocko <mhocko@...nel.org>
To: Vlastimil Babka <vbabka@...e.cz>
Cc: Andrew Morton <akpm@...ux-foundation.org>, linux-mm@...ck.org,
linux-api@...r.kernel.org, linux-kernel@...r.kernel.org,
cgroups@...r.kernel.org, Li Zefan <lizefan@...wei.com>,
Mel Gorman <mgorman@...hsingularity.net>,
David Rientjes <rientjes@...gle.com>,
Christoph Lameter <cl@...ux.com>,
Hugh Dickins <hughd@...gle.com>,
Andrea Arcangeli <aarcange@...hat.com>,
Anshuman Khandual <khandual@...ux.vnet.ibm.com>,
"Kirill A. Shutemov" <kirill.shutemov@...ux.intel.com>
Subject: Re: [PATCH v2 4/6] mm, mempolicy: simplify rebinding mempolicies
when updating cpusets
On Wed 17-05-17 10:11:38, Vlastimil Babka wrote:
> Commit c0ff7453bb5c ("cpuset,mm: fix no node to alloc memory when changing
> cpuset's mems") has introduced a two-step protocol when rebinding task's
> mempolicy due to cpuset update, in order to avoid a parallel allocation seeing
> an empty effective nodemask and failing. Later, commit cc9a6c877661 ("cpuset:
> mm: reduce large amounts of memory barrier related damage v3") introduced
> a seqlock protection and removed the synchronization point between the two
> update steps. At that point (or perhaps later), the two-step rebinding became
> unnecessary. Currently it only makes sure that the update first adds new nodes
> in step 1 and then removes nodes in step 2. Without memory barriers the effects
> are questionable, and even then this cannot prevent a parallel zonelist
> iteration, which checks the nodemask at each step, from observing all nodes
> as unusable for allocation. We now fully rely on the seqlock to prevent
> premature OOMs and allocation failures.
>
> We can thus remove the two-step update parts and simplify the code.
>
> Signed-off-by: Vlastimil Babka <vbabka@...e.cz>
\o/ This has just been piling more and more complicated code on top of
code which shouldn't have existed in the first place, so I am very
happy to see it go. I hope we can do without rebinding altogether in
the long term. Changing data under somebody's feet just asks for
problems, and this is a nice example of where that leads.
Acked-by: Michal Hocko <mhocko@...e.com>
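
For the record, the read side we now fully rely on is the usual
seqcount retry loop (a minimal sketch of the pattern; the helpers are
read_mems_allowed_begin/retry from include/linux/cpuset.h, and
try_alloc() is a hypothetical stand-in for the actual allocation
attempt):

	unsigned int cookie;
	struct page *page;

	do {
		/* snapshot current->mems_allowed_seq */
		cookie = read_mems_allowed_begin();
		page = try_alloc(gfp, order);	/* hypothetical stand-in */
		/*
		 * If a cpuset rebind raced with us and the allocation
		 * failed, the seqcount has changed and we retry rather
		 * than fail/OOM on a transiently inconsistent nodemask.
		 */
	} while (!page && read_mems_allowed_retry(cookie));
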
> ---
> include/linux/mempolicy.h | 6 +--
> include/uapi/linux/mempolicy.h | 8 ----
> kernel/cgroup/cpuset.c | 4 +-
> mm/mempolicy.c | 102 ++++++++---------------------------------
> 4 files changed, 21 insertions(+), 99 deletions(-)
>
> diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
> index ecb6cbeede5a..3a58b4be1b0c 100644
> --- a/include/linux/mempolicy.h
> +++ b/include/linux/mempolicy.h
> @@ -142,8 +142,7 @@ bool vma_policy_mof(struct vm_area_struct *vma);
>
> extern void numa_default_policy(void);
> extern void numa_policy_init(void);
> -extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
> - enum mpol_rebind_step step);
> +extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new);
> extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new);
>
> extern int huge_node(struct vm_area_struct *vma,
> @@ -260,8 +259,7 @@ static inline void numa_default_policy(void)
> }
>
> static inline void mpol_rebind_task(struct task_struct *tsk,
> - const nodemask_t *new,
> - enum mpol_rebind_step step)
> + const nodemask_t *new)
> {
> }
>
> diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
> index 9cd8b21dddbe..2a4d89508fec 100644
> --- a/include/uapi/linux/mempolicy.h
> +++ b/include/uapi/linux/mempolicy.h
> @@ -24,13 +24,6 @@ enum {
> MPOL_MAX, /* always last member of enum */
> };
>
> -enum mpol_rebind_step {
> - MPOL_REBIND_ONCE, /* do rebind work at once(not by two step) */
> - MPOL_REBIND_STEP1, /* first step(set all the newly nodes) */
> - MPOL_REBIND_STEP2, /* second step(clean all the disallowed nodes)*/
> - MPOL_REBIND_NSTEP,
> -};
> -
> /* Flags for set_mempolicy */
> #define MPOL_F_STATIC_NODES (1 << 15)
> #define MPOL_F_RELATIVE_NODES (1 << 14)
> @@ -65,7 +58,6 @@ enum mpol_rebind_step {
> */
> #define MPOL_F_SHARED (1 << 0) /* identify shared policies */
> #define MPOL_F_LOCAL (1 << 1) /* preferred local allocation */
> -#define MPOL_F_REBINDING (1 << 2) /* identify policies in rebinding */
> #define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */
> #define MPOL_F_MORON (1 << 4) /* Migrate On protnone Reference On Node */
>
> diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
> index 0f41292be0fb..dfd5b420452d 100644
> --- a/kernel/cgroup/cpuset.c
> +++ b/kernel/cgroup/cpuset.c
> @@ -1063,9 +1063,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
> }
>
> nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
> - mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
> -
> - mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
> + mpol_rebind_task(tsk, newmems);
> tsk->mems_allowed = *newmems;
>
> if (need_loop) {
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index c60807625fd5..047181452040 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -146,22 +146,7 @@ struct mempolicy *get_task_policy(struct task_struct *p)
>
> static const struct mempolicy_operations {
> int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
> - /*
> - * If read-side task has no lock to protect task->mempolicy, write-side
> - * task will rebind the task->mempolicy by two step. The first step is
> - * setting all the newly nodes, and the second step is cleaning all the
> - * disallowed nodes. In this way, we can avoid finding no node to alloc
> - * page.
> - * If we have a lock to protect task->mempolicy in read-side, we do
> - * rebind directly.
> - *
> - * step:
> - * MPOL_REBIND_ONCE - do rebind work at once
> - * MPOL_REBIND_STEP1 - set all the newly nodes
> - * MPOL_REBIND_STEP2 - clean all the disallowed nodes
> - */
> - void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
> - enum mpol_rebind_step step);
> + void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
> } mpol_ops[MPOL_MAX];
>
> static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
> @@ -304,19 +289,11 @@ void __mpol_put(struct mempolicy *p)
> kmem_cache_free(policy_cache, p);
> }
>
> -static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
> - enum mpol_rebind_step step)
> +static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
> {
> }
>
> -/*
> - * step:
> - * MPOL_REBIND_ONCE - do rebind work at once
> - * MPOL_REBIND_STEP1 - set all the newly nodes
> - * MPOL_REBIND_STEP2 - clean all the disallowed nodes
> - */
> -static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
> - enum mpol_rebind_step step)
> +static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
> {
> nodemask_t tmp;
>
> @@ -325,35 +302,19 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
> else if (pol->flags & MPOL_F_RELATIVE_NODES)
> mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
> else {
> - /*
> - * if step == 1, we use ->w.cpuset_mems_allowed to cache the
> - * result
> - */
> - if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
> - nodes_remap(tmp, pol->v.nodes,
> - pol->w.cpuset_mems_allowed, *nodes);
> - pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
> - } else if (step == MPOL_REBIND_STEP2) {
> - tmp = pol->w.cpuset_mems_allowed;
> - pol->w.cpuset_mems_allowed = *nodes;
> - } else
> - BUG();
> + nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
> + *nodes);
> + pol->w.cpuset_mems_allowed = tmp;
> }
>
> if (nodes_empty(tmp))
> tmp = *nodes;
>
> - if (step == MPOL_REBIND_STEP1)
> - nodes_or(pol->v.nodes, pol->v.nodes, tmp);
> - else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
> - pol->v.nodes = tmp;
> - else
> - BUG();
> + pol->v.nodes = tmp;
> }
>
> static void mpol_rebind_preferred(struct mempolicy *pol,
> - const nodemask_t *nodes,
> - enum mpol_rebind_step step)
> + const nodemask_t *nodes)
> {
> nodemask_t tmp;
>
> @@ -379,42 +340,19 @@ static void mpol_rebind_preferred(struct mempolicy *pol,
> /*
> * mpol_rebind_policy - Migrate a policy to a different set of nodes
> *
> - * If read-side task has no lock to protect task->mempolicy, write-side
> - * task will rebind the task->mempolicy by two step. The first step is
> - * setting all the newly nodes, and the second step is cleaning all the
> - * disallowed nodes. In this way, we can avoid finding no node to alloc
> - * page.
> - * If we have a lock to protect task->mempolicy in read-side, we do
> - * rebind directly.
> - *
> - * step:
> - * MPOL_REBIND_ONCE - do rebind work at once
> - * MPOL_REBIND_STEP1 - set all the newly nodes
> - * MPOL_REBIND_STEP2 - clean all the disallowed nodes
> + * Per-vma policies are protected by mmap_sem. Allocations using per-task
> + * policies are protected by task->mems_allowed_seq to prevent a premature
> + * OOM/allocation failure due to parallel nodemask modification.
> */
> -static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
> - enum mpol_rebind_step step)
> +static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
> {
> if (!pol)
> return;
> - if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
> + if (!mpol_store_user_nodemask(pol) &&
> nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
> return;
>
> - if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
> - return;
> -
> - if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
> - BUG();
> -
> - if (step == MPOL_REBIND_STEP1)
> - pol->flags |= MPOL_F_REBINDING;
> - else if (step == MPOL_REBIND_STEP2)
> - pol->flags &= ~MPOL_F_REBINDING;
> - else if (step >= MPOL_REBIND_NSTEP)
> - BUG();
> -
> - mpol_ops[pol->mode].rebind(pol, newmask, step);
> + mpol_ops[pol->mode].rebind(pol, newmask);
> }
>
> /*
> @@ -424,10 +362,9 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
> * Called with task's alloc_lock held.
> */
>
> -void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
> - enum mpol_rebind_step step)
> +void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
> {
> - mpol_rebind_policy(tsk->mempolicy, new, step);
> + mpol_rebind_policy(tsk->mempolicy, new);
> }
>
> /*
> @@ -442,7 +379,7 @@ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
>
> down_write(&mm->mmap_sem);
> for (vma = mm->mmap; vma; vma = vma->vm_next)
> - mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
> + mpol_rebind_policy(vma->vm_policy, new);
> up_write(&mm->mmap_sem);
> }
>
> @@ -2101,10 +2038,7 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
>
> if (current_cpuset_is_being_rebound()) {
> nodemask_t mems = cpuset_mems_allowed(current);
> - if (new->flags & MPOL_F_REBINDING)
> - mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
> - else
> - mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
> + mpol_rebind_policy(new, &mems);
> }
> atomic_set(&new->refcnt, 1);
> return new;
> --
> 2.12.2
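
P.S. for archive readers: with the two-step dance gone, the default
(non-static, non-relative) rebind above boils down to a single
nodes_remap(). A quick illustration of its semantics, as I read
bitmap_remap() which backs it (the masks below are made up):

	/*
	 * nodes_remap(dst, src, old, new): the n-th set bit of old is
	 * mapped to the n-th set bit of new (modulo the weight of new);
	 * bits of src not covered by old map to themselves.
	 *
	 *   src = {0,1}, old = {0,1,2}, new = {3,4}  ->  dst = {3,4}
	 *   src = {2},   old = {0,1,2}, new = {3,4}  ->  dst = {3}
	 *     (node 2 is ordinal 2 in old; 2 % weight(new) == 0, so it
	 *      wraps around to the first bit of new, i.e. node 3)
	 */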
--
Michal Hocko
SUSE Labs