lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <CAJuCfpGCxaGuhufCz+_cmND=A6_Y2tC=NdTmaokC8TjL_Gz+kA@mail.gmail.com>
Date: Thu, 25 Jan 2024 13:31:41 -0800
From: Suren Baghdasaryan <surenb@...gle.com>
To: akpm@...ux-foundation.org
Cc: viro@...iv.linux.org.uk, brauner@...nel.org, jack@...e.cz, 
	dchinner@...hat.com, casey@...aufler-ca.com, ben.wolsieffer@...ring.com, 
	paulmck@...nel.org, david@...hat.com, avagin@...gle.com, 
	usama.anjum@...labora.com, peterx@...hat.com, hughd@...gle.com, 
	ryan.roberts@....com, wangkefeng.wang@...wei.com, Liam.Howlett@...cle.com, 
	yuzhao@...gle.com, axelrasmussen@...gle.com, lstoakes@...il.com, 
	talumbau@...gle.com, willy@...radead.org, vbabka@...e.cz, 
	mgorman@...hsingularity.net, jhubbard@...dia.com, vishal.moola@...il.com, 
	mathieu.desnoyers@...icios.com, dhowells@...hat.com, jgg@...pe.ca, 
	sidhartha.kumar@...cle.com, andriy.shevchenko@...ux.intel.com, 
	yangxingui@...wei.com, keescook@...omium.org, sj@...nel.org, 
	linux-kernel@...r.kernel.org, linux-fsdevel@...r.kernel.org, 
	linux-mm@...ck.org, kernel-team@...roid.com
Subject: Re: [PATCH v2 3/3] mm/maps: read proc/pid/maps under RCU

On Tue, Jan 23, 2024 at 3:10 PM Suren Baghdasaryan <surenb@...gle.com> wrote:
>
> With maple_tree supporting vma tree traversal under RCU and per-vma locks
> making vma access RCU-safe, /proc/pid/maps can be read under RCU and
> without the need to read-lock mmap_lock. However vma content can change
> from under us, therefore we make a copy of the vma and we pin pointer
> fields used when generating the output (currently only vm_file and
> anon_name). Afterwards we check for concurrent address space
> modifications, wait for them to end and retry. That last check is needed
> to avoid the possibility of missing a vma during concurrent maple_tree
> node replacement, which might report a NULL when a vma is replaced
> with another one. While we take the mmap_lock for reading during such
> contention, we do that momentarily only to record new mm_wr_seq counter.
> This change is designed to reduce mmap_lock contention and prevent a
> process reading /proc/pid/maps files (often a low priority task, such as
> monitoring/data collection services) from blocking address space updates.
>
> Note that this change has a userspace visible disadvantage: it allows for
> sub-page data tearing as opposed to the previous mechanism where data
> tearing could happen only between pages of generated output data.
> Since current userspace considers data tearing between pages to be
> acceptable, we assume it will be able to handle sub-page data tearing
> as well.
>
> Signed-off-by: Suren Baghdasaryan <surenb@...gle.com>
> ---
> Changes since v1 [1]:
> - Fixed CONFIG_ANON_VMA_NAME=n build by introducing
> anon_vma_name_{get|put}_if_valid, per SeongJae Park
> - Fixed misspelling of get_vma_snapshot()
>
> [1] https://lore.kernel.org/all/20240122071324.2099712-3-surenb@google.com/
>
>  fs/proc/internal.h        |   2 +
>  fs/proc/task_mmu.c        | 113 +++++++++++++++++++++++++++++++++++---
>  include/linux/mm_inline.h |  18 ++++++
>  3 files changed, 126 insertions(+), 7 deletions(-)
>
> diff --git a/fs/proc/internal.h b/fs/proc/internal.h
> index a71ac5379584..e0247225bb68 100644
> --- a/fs/proc/internal.h
> +++ b/fs/proc/internal.h
> @@ -290,6 +290,8 @@ struct proc_maps_private {
>         struct task_struct *task;
>         struct mm_struct *mm;
>         struct vma_iterator iter;
> +       unsigned long mm_wr_seq;
> +       struct vm_area_struct vma_copy;
>  #ifdef CONFIG_NUMA
>         struct mempolicy *task_mempolicy;
>  #endif
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index 3f78ebbb795f..0d5a515156ee 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -126,11 +126,95 @@ static void release_task_mempolicy(struct proc_maps_private *priv)
>  }
>  #endif
>
> -static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv,
> -                                               loff_t *ppos)
> +#ifdef CONFIG_PER_VMA_LOCK
> +
> +static const struct seq_operations proc_pid_maps_op;
> +
> +/*
> + * Take VMA snapshot and pin vm_file and anon_name as they are used by
> + * show_map_vma.
> + */
> +static int get_vma_snapshot(struct proc_maps_private *priv, struct vm_area_struct *vma)
> +{
> +       struct vm_area_struct *copy = &priv->vma_copy;
> +       int ret = -EAGAIN;
> +
> +       memcpy(copy, vma, sizeof(*vma));
> +       if (copy->vm_file && !get_file_rcu(&copy->vm_file))

There is a problem in this patchset. I assumed that get_file_rcu() can
be used against vma->vm_file but that's not true. vma->vm_file is
freed via a call to fput() which schedules its freeing using
schedule_delayed_work(..., 1) but I don't think that constitutes an RCU
grace period, so it can be freed from under us.
Andrew, could you please remove this patchset from your tree until I
sort this out?
Thanks,
Suren.

> +               goto out;
> +
> +       if (!anon_vma_name_get_if_valid(copy))
> +               goto put_file;
> +
> +       if (priv->mm_wr_seq == mmap_write_seq_read(priv->mm))
> +               return 0;
> +
> +       /* Address space got modified, vma might be stale. Wait and retry */
> +       rcu_read_unlock();
> +       ret = mmap_read_lock_killable(priv->mm);
> +       mmap_write_seq_record(priv->mm, &priv->mm_wr_seq);
> +       mmap_read_unlock(priv->mm);
> +       rcu_read_lock();
> +
> +       if (!ret)
> +               ret = -EAGAIN; /* no other errors, ok to retry */
> +
> +       anon_vma_name_put_if_valid(copy);
> +put_file:
> +       if (copy->vm_file)
> +               fput(copy->vm_file);
> +out:
> +       return ret;
> +}
> +
> +static void put_vma_snapshot(struct proc_maps_private *priv)
> +{
> +       struct vm_area_struct *vma = &priv->vma_copy;
> +
> +       anon_vma_name_put_if_valid(vma);
> +       if (vma->vm_file)
> +               fput(vma->vm_file);
> +}
> +
> +static inline bool needs_mmap_lock(struct seq_file *m)
> +{
> +       /*
> +        * smaps and numa_maps perform page table walk, therefore require
> +        * mmap_lock but maps can be read under RCU.
> +        */
> +       return m->op != &proc_pid_maps_op;
> +}
> +
> +#else /* CONFIG_PER_VMA_LOCK */
> +
> +/* Without per-vma locks VMA access is not RCU-safe */
> +static inline bool needs_mmap_lock(struct seq_file *m) { return true; }
> +
> +#endif /* CONFIG_PER_VMA_LOCK */
> +
> +static struct vm_area_struct *proc_get_vma(struct seq_file *m, loff_t *ppos)
>  {
> +       struct proc_maps_private *priv = m->private;
>         struct vm_area_struct *vma = vma_next(&priv->iter);
>
> +#ifdef CONFIG_PER_VMA_LOCK
> +       if (vma && !needs_mmap_lock(m)) {
> +               int ret;
> +
> +               put_vma_snapshot(priv);
> +               while ((ret = get_vma_snapshot(priv, vma)) == -EAGAIN) {
> +                       /* lookup the vma at the last position again */
> +                       vma_iter_init(&priv->iter, priv->mm, *ppos);
> +                       vma = vma_next(&priv->iter);
> +               }
> +
> +               if (ret) {
> +                       put_vma_snapshot(priv);
> +                       return NULL;
> +               }
> +               vma = &priv->vma_copy;
> +       }
> +#endif
>         if (vma) {
>                 *ppos = vma->vm_start;
>         } else {
> @@ -169,12 +253,20 @@ static void *m_start(struct seq_file *m, loff_t *ppos)
>                 return ERR_PTR(-EINTR);
>         }
>
> +       /* Drop mmap_lock if possible */
> +       if (!needs_mmap_lock(m)) {
> +               mmap_write_seq_record(priv->mm, &priv->mm_wr_seq);
> +               mmap_read_unlock(priv->mm);
> +               rcu_read_lock();
> +               memset(&priv->vma_copy, 0, sizeof(priv->vma_copy));
> +       }
> +
>         vma_iter_init(&priv->iter, mm, last_addr);
>         hold_task_mempolicy(priv);
>         if (last_addr == -2UL)
>                 return get_gate_vma(mm);
>
> -       return proc_get_vma(priv, ppos);
> +       return proc_get_vma(m, ppos);
>  }
>
>  static void *m_next(struct seq_file *m, void *v, loff_t *ppos)
> @@ -183,7 +275,7 @@ static void *m_next(struct seq_file *m, void *v, loff_t *ppos)
>                 *ppos = -1UL;
>                 return NULL;
>         }
> -       return proc_get_vma(m->private, ppos);
> +       return proc_get_vma(m, ppos);
>  }
>
>  static void m_stop(struct seq_file *m, void *v)
> @@ -195,7 +287,10 @@ static void m_stop(struct seq_file *m, void *v)
>                 return;
>
>         release_task_mempolicy(priv);
> -       mmap_read_unlock(mm);
> +       if (needs_mmap_lock(m))
> +               mmap_read_unlock(mm);
> +       else
> +               rcu_read_unlock();
>         mmput(mm);
>         put_task_struct(priv->task);
>         priv->task = NULL;
> @@ -283,8 +378,10 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
>         start = vma->vm_start;
>         end = vma->vm_end;
>         show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino);
> -       if (mm)
> -               anon_name = anon_vma_name(vma);
> +       if (mm) {
> +               anon_name = needs_mmap_lock(m) ? anon_vma_name(vma) :
> +                               anon_vma_name_get_rcu(vma);
> +       }
>
>         /*
>          * Print the dentry name for named mappings, and a
> @@ -338,6 +435,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
>                 seq_puts(m, name);
>         }
>         seq_putc(m, '\n');
> +       if (anon_name && !needs_mmap_lock(m))
> +               anon_vma_name_put(anon_name);
>  }
>
>  static int show_map(struct seq_file *m, void *v)
> diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
> index bbdb0ca857f1..a4a644fe005e 100644
> --- a/include/linux/mm_inline.h
> +++ b/include/linux/mm_inline.h
> @@ -413,6 +413,21 @@ static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1,
>
>  struct anon_vma_name *anon_vma_name_get_rcu(struct vm_area_struct *vma);
>
> +/*
> + * Takes a reference if anon_vma is valid and stable (has references).
> + * Fails only if anon_vma is valid but we failed to get a reference.
> + */
> +static inline bool anon_vma_name_get_if_valid(struct vm_area_struct *vma)
> +{
> +       return !vma->anon_name || anon_vma_name_get_rcu(vma);
> +}
> +
> +static inline void anon_vma_name_put_if_valid(struct vm_area_struct *vma)
> +{
> +       if (vma->anon_name)
> +               anon_vma_name_put(vma->anon_name);
> +}
> +
>  #else /* CONFIG_ANON_VMA_NAME */
>  static inline void anon_vma_name_get(struct anon_vma_name *anon_name) {}
>  static inline void anon_vma_name_put(struct anon_vma_name *anon_name) {}
> @@ -432,6 +447,9 @@ struct anon_vma_name *anon_vma_name_get_rcu(struct vm_area_struct *vma)
>         return NULL;
>  }
>
> +static inline bool anon_vma_name_get_if_valid(struct vm_area_struct *vma) { return true; }
> +static inline void anon_vma_name_put_if_valid(struct vm_area_struct *vma) {}
> +
>  #endif  /* CONFIG_ANON_VMA_NAME */
>
>  static inline void init_tlb_flush_pending(struct mm_struct *mm)
> --
> 2.43.0.429.g432eaa2c6b-goog
>

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ