Message-ID: <20250213113248.2225-1-hdanton@sina.com>
Date: Thu, 13 Feb 2025 19:32:47 +0800
From: Hillf Danton <hdanton@...a.com>
To: Sergey Senozhatsky <senozhatsky@...omium.org>
Cc: Yosry Ahmed <yosry.ahmed@...ux.dev>,
Kairui Song <ryncsn@...il.com>,
Minchan Kim <minchan@...nel.org>,
Andrew Morton <akpm@...ux-foundation.org>,
linux-mm@...ck.org,
linux-kernel@...r.kernel.org
Subject: Re: [PATCH v5 12/18] zsmalloc: make zspage lock preemptible
On Wed, 12 Feb 2025 15:27:10 +0900 Sergey Senozhatsky
> Switch over from rwlock_t to an atomic_t variable that takes a negative
> value when the page is under migration, or positive values when the
> page is used by zsmalloc users (object map, etc.). Using a rwsem
> per-zspage is a little too memory heavy; a simple atomic_t should
> suffice.
>
> The zspage lock is a leaf lock for zs_map_object(), where it is
> read-acquired. Since this lock now permits preemption, extra care needs
> to be taken when it is write-acquired: all writers grab it in atomic
> context, so they cannot spin and wait for a (potentially preempted)
> reader to unlock the zspage. There are only two writers at the moment,
> migration and compaction. In both cases we use write-try-lock and bail
> out if the zspage is read-locked. Writers, on the other hand, never get
> preempted, so readers can spin waiting for the writer to unlock the
> zspage.
>
> With this we can implement a preemptible object mapping.
>
> Signed-off-by: Sergey Senozhatsky <senozhatsky@...omium.org>
> Cc: Yosry Ahmed <yosry.ahmed@...ux.dev>
> ---
> mm/zsmalloc.c | 183 +++++++++++++++++++++++++++++++++++---------------
> 1 file changed, 128 insertions(+), 55 deletions(-)
>
> diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
> index c82c24b8e6a4..80261bb78cf8 100644
> --- a/mm/zsmalloc.c
> +++ b/mm/zsmalloc.c
> @@ -226,6 +226,9 @@ struct zs_pool {
> /* protect page/zspage migration */
> rwlock_t lock;
> atomic_t compaction_in_progress;
> +#ifdef CONFIG_DEBUG_LOCK_ALLOC
> + struct lock_class_key lockdep_key;
> +#endif
> };
>
> static void pool_write_unlock(struct zs_pool *pool)
> @@ -292,6 +295,9 @@ static inline void free_zpdesc(struct zpdesc *zpdesc)
> __free_page(page);
> }
>
> +#define ZS_PAGE_UNLOCKED 0
> +#define ZS_PAGE_WRLOCKED -1
> +
> struct zspage {
> struct {
> unsigned int huge:HUGE_BITS;
> @@ -304,7 +310,11 @@ struct zspage {
> struct zpdesc *first_zpdesc;
> struct list_head list; /* fullness list */
> struct zs_pool *pool;
> - rwlock_t lock;
> + atomic_t lock;
> +
> +#ifdef CONFIG_DEBUG_LOCK_ALLOC
> + struct lockdep_map lockdep_map;
> +#endif
> };
>
> struct mapping_area {
> @@ -314,6 +324,88 @@ struct mapping_area {
> enum zs_mapmode vm_mm; /* mapping mode */
> };
>
> +static void zspage_lock_init(struct zspage *zspage)
> +{
> +#ifdef CONFIG_DEBUG_LOCK_ALLOC
> + lockdep_init_map(&zspage->lockdep_map, "zsmalloc-page",
> + &zspage->pool->lockdep_key, 0);
> +#endif
> +
> + atomic_set(&zspage->lock, ZS_PAGE_UNLOCKED);
> +}
> +
> +/*
> + * zspage locking rules:
> + *
> + * 1) writer-lock is exclusive
> + *
> + * 2) writer-lock owner cannot sleep
> + *
> + * 3) writer-lock owner cannot spin waiting for the lock
> + * - caller (e.g. compaction and migration) must check return value and
> + * handle locking failures
> + * - there is only TRY variant of writer-lock function
> + *
> + * 4) reader-lock owners (multiple) can sleep
> + *
> + * 5) reader-lock owners can spin waiting for the lock, in any context
> + * - existing readers (even preempted ones) don't block new readers
> + * - writer-lock owners never sleep, always unlock at some point
> + */
> +static void zspage_read_lock(struct zspage *zspage)
> +{
> + atomic_t *lock = &zspage->lock;
> + int old = atomic_read_acquire(lock);
> +
> +#ifdef CONFIG_DEBUG_LOCK_ALLOC
> + rwsem_acquire_read(&zspage->lockdep_map, 0, 0, _RET_IP_);
> +#endif
> +
> + do {
> + if (old == ZS_PAGE_WRLOCKED) {
> + cpu_relax();
> + old = atomic_read_acquire(lock);
> + continue;
> + }
> + } while (!atomic_try_cmpxchg_acquire(lock, &old, old + 1));
Given that we already have mcs_spinlock, inventing another open-coded
spinlock in 2025 does not look like a good idea. See the spinlock-based
version below.
> +}
> +
> +static void zspage_read_unlock(struct zspage *zspage)
> +{
> +#ifdef CONFIG_DEBUG_LOCK_ALLOC
> + rwsem_release(&zspage->lockdep_map, _RET_IP_);
> +#endif
> + atomic_dec_return_release(&zspage->lock);
> +}
> +
> +static __must_check bool zspage_try_write_lock(struct zspage *zspage)
> +{
> + atomic_t *lock = &zspage->lock;
> + int old = ZS_PAGE_UNLOCKED;
> +
> + WARN_ON_ONCE(preemptible());
> +
> + preempt_disable();
> + if (atomic_try_cmpxchg_acquire(lock, &old, ZS_PAGE_WRLOCKED)) {
> +#ifdef CONFIG_DEBUG_LOCK_ALLOC
> + rwsem_acquire(&zspage->lockdep_map, 0, 1, _RET_IP_);
> +#endif
> + return true;
> + }
> +
> + preempt_enable();
> + return false;
> +}
> +
> +static void zspage_write_unlock(struct zspage *zspage)
> +{
> +#ifdef CONFIG_DEBUG_LOCK_ALLOC
> + rwsem_release(&zspage->lockdep_map, _RET_IP_);
> +#endif
> + atomic_set_release(&zspage->lock, ZS_PAGE_UNLOCKED);
> + preempt_enable();
> +}
struct zspage_lock {
        spinlock_t lock;
        int cnt;
        struct lockdep_map lockdep_map;
};

/*
 * The writer holds zl->lock for the whole write-side critical section,
 * so the trylock succeeds only when no readers are accounted in zl->cnt.
 */
static __must_check bool zspage_write_trylock(struct zspage_lock *zl)
{
        spin_lock(&zl->lock);
        if (zl->cnt == ZS_PAGE_UNLOCKED) {
                // zl->cnt = ZS_PAGE_WRLOCKED;
                rwsem_acquire(&zl->lockdep_map, 0, 1, _RET_IP_);
                return true;
        }

        spin_unlock(&zl->lock);
        return false;
}

static void zspage_write_unlock(struct zspage_lock *zl)
{
        rwsem_release(&zl->lockdep_map, _RET_IP_);
        spin_unlock(&zl->lock);
}

/* Readers take zl->lock only briefly, to update the reader count. */
static void zspage_read_lock(struct zspage_lock *zl)
{
        rwsem_acquire_read(&zl->lockdep_map, 0, 0, _RET_IP_);
        spin_lock(&zl->lock);
        zl->cnt++;
        spin_unlock(&zl->lock);
}

static void zspage_read_unlock(struct zspage_lock *zl)
{
        rwsem_release(&zl->lockdep_map, _RET_IP_);
        spin_lock(&zl->lock);
        zl->cnt--;
        spin_unlock(&zl->lock);
}
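For completeness, a minimal sketch of how this spinlock-based zspage_lock
could be initialized and used on the writer side. The zspage_lock_init()
helper, the lock_class_key parameter and the try_migrate_zspage() caller
are illustrative assumptions, not something the patch or the sketch above
defines:

/*
 * Hypothetical sketch only: init plus writer-side usage of the
 * spinlock-based zspage_lock above. Helper names and the migration
 * caller are assumptions for illustration.
 */
static void zspage_lock_init(struct zspage_lock *zl,
                             struct lock_class_key *key)
{
        spin_lock_init(&zl->lock);
        zl->cnt = ZS_PAGE_UNLOCKED;
        lockdep_init_map(&zl->lockdep_map, "zsmalloc-page", key, 0);
}

/* Writers (migration, compaction) run in atomic context and bail out. */
static int try_migrate_zspage(struct zspage_lock *zl)
{
        if (!zspage_write_trylock(zl))
                return -EBUSY;  /* readers active, caller retries later */

        /* ... move objects while readers are excluded ... */

        zspage_write_unlock(zl);
        return 0;
}

The trade-off against the atomic_t version is that every reader now takes
a short spinlock round trip to update cnt, in exchange for not open-coding
the spin-wait loop and for reusing the existing spinlock machinery.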