[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <28c262360911060741x3f7ab0a2k15be645e287e05ac@mail.gmail.com>
Date: Sat, 7 Nov 2009 00:41:40 +0900
From: Minchan Kim <minchan.kim@...il.com>
To: Christoph Lameter <cl@...ux-foundation.org>
Cc: npiggin@...e.de, linux-kernel@...r.kernel.org, linux-mm@...ck.org,
Tejun Heo <tj@...nel.org>, Ingo Molnar <mingo@...e.hu>,
KAMEZAWA Hiroyuki <kamezawa.hiroyu@...fujitsu.com>,
"hugh.dickins@...cali.co.uk" <hugh.dickins@...cali.co.uk>
Subject: Re: Subject: [RFC MM] mmap_sem scaling: Use mutex and percpu counter
instead
Hi, Christoph.
How about changing 'mm_readers' to 'is_readers' to improve your
goal of scalability?
===
static inline int is_readers(struct mm_struct *mm)
{
int cpu;
int ret = 0;
for_each_possible_cpu(cpu) {
if (per_cpu(mm->rss->readers, cpu)) {
ret = 1;
break;
}
}
return ret;
}
===
On Fri, Nov 6, 2009 at 4:20 AM, Christoph Lameter
<cl@...ux-foundation.org> wrote:
> From: Christoph Lamter <cl@...ux-foundation.org>
> Subject: [RFC MM] mmap_sem scaling: Use mutex and percpu counter instead
>
> Instead of a rw semaphore use a mutex and a per cpu counter for the number
> of the current readers. read locking then becomes very cheap requiring only
> the increment of a per cpu counter.
>
> Write locking is more expensive since the writer must scan the percpu array
> and wait until all readers are complete. Since the readers are not holding
> semaphores we have no wait queue from which the writer could wakeup. In this
> draft we simply wait for one millisecond between scans of the percpu
> array. A different solution must be found there.
>
> Patch is on top of -next and the percpu counter patches that I posted
> yesterday. The patch adds another per cpu counter to the file and anon rss
> counters.
>
> Signed-off-by: Christoph Lamter <cl@...ux-foundation.org>
>
> ---
> include/linux/mm_types.h | 68 ++++++++++++++++++++++++++++++++++++++---------
> mm/init-mm.c | 2 -
> 2 files changed, 56 insertions(+), 14 deletions(-)
>
> Index: linux-2.6/include/linux/mm_types.h
> ===================================================================
> --- linux-2.6.orig/include/linux/mm_types.h 2009-11-05 13:03:11.000000000 -0600
> +++ linux-2.6/include/linux/mm_types.h 2009-11-05 13:06:31.000000000 -0600
> @@ -14,6 +14,7 @@
> #include <linux/page-debug-flags.h>
> #include <asm/page.h>
> #include <asm/mmu.h>
> +#include <linux/percpu.h>
>
> #ifndef AT_VECTOR_SIZE_ARCH
> #define AT_VECTOR_SIZE_ARCH 0
> @@ -27,6 +28,7 @@ struct address_space;
> struct mm_counter {
> long file;
> long anon;
> + long readers;
> };
>
> /*
> @@ -214,7 +216,7 @@ struct mm_struct {
> atomic_t mm_users; /* How many users with user space? */
> atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */
> int map_count; /* number of VMAs */
> - struct rw_semaphore sem;
> + struct mutex lock;
> spinlock_t page_table_lock; /* Protects page tables and some counters */
>
> struct list_head mmlist; /* List of maybe swapped mm's. These are globally strung
> @@ -285,64 +287,104 @@ struct mm_struct {
> #endif
> };
>
> +static inline int mm_readers(struct mm_struct *mm)
> +{
> + int cpu;
> + int readers = 0;
> +
> + for_each_possible_cpu(cpu)
> + readers += per_cpu(mm->rss->readers, cpu);
> +
> + return readers;
> +}
> +
> static inline void mm_reader_lock(struct mm_struct *mm)
> {
> - down_read(&mm->sem);
> +redo:
> + this_cpu_inc(mm->rss->readers);
> + if (mutex_is_locked(&mm->lock)) {
> + this_cpu_dec(mm->rss->readers);
> + /* Need to wait till mutex is released */
> + mutex_lock(&mm->lock);
> + mutex_unlock(&mm->lock);
> + goto redo;
> + }
> }
>
> static inline void mm_reader_unlock(struct mm_struct *mm)
> {
> - up_read(&mm->sem);
> + this_cpu_dec(mm->rss->readers);
> }
>
> static inline int mm_reader_trylock(struct mm_struct *mm)
> {
> - return down_read_trylock(&mm->sem);
> + this_cpu_inc(mm->rss->readers);
> + if (mutex_is_locked(&mm->lock)) {
> + this_cpu_dec(mm->rss->readers);
> + return 0;
> + }
> + return 1;
> }
>
> static inline void mm_writer_lock(struct mm_struct *mm)
> {
> - down_write(&mm->sem);
> +redo:
> + mutex_lock(&mm->lock);
> + if (mm_readers(mm) == 0)
We can change this to use the new helper, which stops scanning at the
first CPU with readers:
if (!is_readers(mm))
        return;
> + return;
> +
> + mutex_unlock(&mm->lock);
> + msleep(1);
> + goto redo;
> }
>
> static inline void mm_writer_unlock(struct mm_struct *mm)
> {
> - up_write(&mm->sem);
> + mutex_unlock(&mm->lock);
> }
>
> static inline int mm_writer_trylock(struct mm_struct *mm)
> {
> - return down_write_trylock(&mm->sem);
> + if (!mutex_trylock(&mm->lock))
> + goto fail;
> +
> + if (mm_readers(mm) == 0)
> + return 1;
if (!is_readers(mm))
return 1;
> +
> + mutex_unlock(&mm->lock);
> +fail:
> + return 0;
> }
>
--
Kind regards,
Minchan Kim
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists