linux-kernel - Re: [PATCH v3 12/16] zsmalloc: zs

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:	Mon, 4 Apr 2016 18:01:14 +0900
From:	Minchan Kim <minchan@...nel.org>
To:	Chulmin Kim <cmlaika.kim@...sung.com>
CC:	Andrew Morton <akpm@...ux-foundation.org>,
	linux-kernel@...r.kernel.org, linux-mm@...ck.org
Subject: Re: [PATCH v3 12/16] zsmalloc: zs_compact refactoring

Hello Chulmin,

On Mon, Apr 04, 2016 at 05:04:16PM +0900, Chulmin Kim wrote:
> On 2016년 03월 30일 16:12, Minchan Kim wrote:
> >Currently, we rely on class->lock to prevent zspage destruction.
> >It was okay until now because the critical section is short but
> >with run-time migration, it could be long so class->lock is not
> >a good apporach any more.
> >
> >So, this patch introduces [un]freeze_zspage functions which
> >freeze allocated objects in the zspage with pinning tag so
> >user cannot free using object. With those functions, this patch
> >redesign compaction.
> >
> >Those functions will be used for implementing zspage runtime
> >migrations, too.
> >
> >Signed-off-by: Minchan Kim <minchan@...nel.org>
> >---
> >  mm/zsmalloc.c | 393 ++++++++++++++++++++++++++++++++++++++--------------------
> >  1 file changed, 257 insertions(+), 136 deletions(-)
> >
> >diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
> >index b11dcd718502..ac8ca7b10720 100644
> >--- a/mm/zsmalloc.c
> >+++ b/mm/zsmalloc.c
> >@@ -922,6 +922,13 @@ static unsigned long obj_to_head(struct size_class *class, struct page *page,
> >  		return *(unsigned long *)obj;
> >  }
> >
> >+static inline int testpin_tag(unsigned long handle)
> >+{
> >+	unsigned long *ptr = (unsigned long *)handle;
> >+
> >+	return test_bit(HANDLE_PIN_BIT, ptr);
> >+}
> >+
> >  static inline int trypin_tag(unsigned long handle)
> >  {
> >  	unsigned long *ptr = (unsigned long *)handle;
> >@@ -949,8 +956,7 @@ static void reset_page(struct page *page)
> >  	page->freelist = NULL;
> >  }
> >
> >-static void free_zspage(struct zs_pool *pool, struct size_class *class,
> >-			struct page *first_page)
> >+static void free_zspage(struct zs_pool *pool, struct page *first_page)
> >  {
> >  	struct page *nextp, *tmp, *head_extra;
> >
> >@@ -973,11 +979,6 @@ static void free_zspage(struct zs_pool *pool, struct size_class *class,
> >  	}
> >  	reset_page(head_extra);
> >  	__free_page(head_extra);
> >-
> >-	zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
> >-			class->size, class->pages_per_zspage));
> >-	atomic_long_sub(class->pages_per_zspage,
> >-				&pool->pages_allocated);
> >  }
> >
> >  /* Initialize a newly allocated zspage */
> >@@ -1325,6 +1326,11 @@ static bool zspage_full(struct size_class *class, struct page *first_page)
> >  	return get_zspage_inuse(first_page) == class->objs_per_zspage;
> >  }
> >
> >+static bool zspage_empty(struct size_class *class, struct page *first_page)
> >+{
> >+	return get_zspage_inuse(first_page) == 0;
> >+}
> >+
> >  unsigned long zs_get_total_pages(struct zs_pool *pool)
> >  {
> >  	return atomic_long_read(&pool->pages_allocated);
> >@@ -1455,7 +1461,6 @@ static unsigned long obj_malloc(struct size_class *class,
> >  		set_page_private(first_page, handle | OBJ_ALLOCATED_TAG);
> >  	kunmap_atomic(vaddr);
> >  	mod_zspage_inuse(first_page, 1);
> >-	zs_stat_inc(class, OBJ_USED, 1);
> >
> >  	obj = location_to_obj(m_page, obj);
> >
> >@@ -1510,6 +1515,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
> >  	}
> >
> >  	obj = obj_malloc(class, first_page, handle);
> >+	zs_stat_inc(class, OBJ_USED, 1);
> >  	/* Now move the zspage to another fullness group, if required */
> >  	fix_fullness_group(class, first_page);
> >  	record_obj(handle, obj);
> >@@ -1540,7 +1546,6 @@ static void obj_free(struct size_class *class, unsigned long obj)
> >  	kunmap_atomic(vaddr);
> >  	set_freeobj(first_page, f_objidx);
> >  	mod_zspage_inuse(first_page, -1);
> >-	zs_stat_dec(class, OBJ_USED, 1);
> >  }
> >
> >  void zs_free(struct zs_pool *pool, unsigned long handle)
> >@@ -1564,10 +1569,19 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
> >
> >  	spin_lock(&class->lock);
> >  	obj_free(class, obj);
> >+	zs_stat_dec(class, OBJ_USED, 1);
> >  	fullness = fix_fullness_group(class, first_page);
> >-	if (fullness == ZS_EMPTY)
> >-		free_zspage(pool, class, first_page);
> >+	if (fullness == ZS_EMPTY) {
> >+		zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
> >+				class->size, class->pages_per_zspage));
> >+		spin_unlock(&class->lock);
> >+		atomic_long_sub(class->pages_per_zspage,
> >+					&pool->pages_allocated);
> >+		free_zspage(pool, first_page);
> >+		goto out;
> >+	}
> >  	spin_unlock(&class->lock);
> >+out:
> >  	unpin_tag(handle);
> >
> >  	free_handle(pool, handle);
> >@@ -1637,127 +1651,66 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
> >  	kunmap_atomic(s_addr);
> >  }
> >
> >-/*
> >- * Find alloced object in zspage from index object and
> >- * return handle.
> >- */
> >-static unsigned long find_alloced_obj(struct size_class *class,
> >-					struct page *page, int index)
> >+static unsigned long handle_from_obj(struct size_class *class,
> >+				struct page *first_page, int obj_idx)
> >  {
> >-	unsigned long head;
> >-	int offset = 0;
> >-	unsigned long handle = 0;
> >-	void *addr = kmap_atomic(page);
> >-
> >-	if (!is_first_page(page))
> >-		offset = page->index;
> >-	offset += class->size * index;
> >-
> >-	while (offset < PAGE_SIZE) {
> >-		head = obj_to_head(class, page, addr + offset);
> >-		if (head & OBJ_ALLOCATED_TAG) {
> >-			handle = head & ~OBJ_ALLOCATED_TAG;
> >-			if (trypin_tag(handle))
> >-				break;
> >-			handle = 0;
> >-		}
> >+	struct page *page;
> >+	unsigned long offset_in_page;
> >+	void *addr;
> >+	unsigned long head, handle = 0;
> >
> >-		offset += class->size;
> >-		index++;
> >-	}
> >+	objidx_to_page_and_offset(class, first_page, obj_idx,
> >+			&page, &offset_in_page);
> >
> >+	addr = kmap_atomic(page);
> >+	head = obj_to_head(class, page, addr + offset_in_page);
> >+	if (head & OBJ_ALLOCATED_TAG)
> >+		handle = head & ~OBJ_ALLOCATED_TAG;
> >  	kunmap_atomic(addr);
> >+
> >  	return handle;
> >  }
> >
> >-struct zs_compact_control {
> >-	/* Source page for migration which could be a subpage of zspage. */
> >-	struct page *s_page;
> >-	/* Destination page for migration which should be a first page
> >-	 * of zspage. */
> >-	struct page *d_page;
> >-	 /* Starting object index within @s_page which used for live object
> >-	  * in the subpage. */
> >-	int index;
> >-};
> >-
> >-static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
> >-				struct zs_compact_control *cc)
> >+static int migrate_zspage(struct size_class *class, struct page *dst_page,
> >+				struct page *src_page)
> >  {
> >-	unsigned long used_obj, free_obj;
> >  	unsigned long handle;
> >-	struct page *s_page = cc->s_page;
> >-	struct page *d_page = cc->d_page;
> >-	unsigned long index = cc->index;
> >-	int ret = 0;
> >+	unsigned long old_obj, new_obj;
> >+	int i;
> >+	int nr_migrated = 0;
> >
> >-	while (1) {
> >-		handle = find_alloced_obj(class, s_page, index);
> >-		if (!handle) {
> >-			s_page = get_next_page(s_page);
> >-			if (!s_page)
> >-				break;
> >-			index = 0;
> >+	for (i = 0; i < class->objs_per_zspage; i++) {
> >+		handle = handle_from_obj(class, src_page, i);
> >+		if (!handle)
> >  			continue;
> >-		}
> >-
> >-		/* Stop if there is no more space */
> >-		if (zspage_full(class, d_page)) {
> >-			unpin_tag(handle);
> >-			ret = -ENOMEM;
> >+		if (zspage_full(class, dst_page))
> >  			break;
> >-		}
> >-
> >-		used_obj = handle_to_obj(handle);
> >-		free_obj = obj_malloc(class, d_page, handle);
> >-		zs_object_copy(class, free_obj, used_obj);
> >-		index++;
> >+		old_obj = handle_to_obj(handle);
> >+		new_obj = obj_malloc(class, dst_page, handle);
> >+		zs_object_copy(class, new_obj, old_obj);
> >+		nr_migrated++;
> >  		/*
> >  		 * record_obj updates handle's value to free_obj and it will
> >  		 * invalidate lock bit(ie, HANDLE_PIN_BIT) of handle, which
> >  		 * breaks synchronization using pin_tag(e,g, zs_free) so
> >  		 * let's keep the lock bit.
> >  		 */
> >-		free_obj |= BIT(HANDLE_PIN_BIT);
> >-		record_obj(handle, free_obj);
> >-		unpin_tag(handle);
> >-		obj_free(class, used_obj);
> >+		new_obj |= BIT(HANDLE_PIN_BIT);
> >+		record_obj(handle, new_obj);
> >+		obj_free(class, old_obj);
> >  	}
> >-
> >-	/* Remember last position in this iteration */
> >-	cc->s_page = s_page;
> >-	cc->index = index;
> >-
> >-	return ret;
> >-}
> >-
> >-static struct page *isolate_target_page(struct size_class *class)
> >-{
> >-	int i;
> >-	struct page *page;
> >-
> >-	for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
> >-		page = class->fullness_list[i];
> >-		if (page) {
> >-			remove_zspage(class, i, page);
> >-			break;
> >-		}
> >-	}
> >-
> >-	return page;
> >+	return nr_migrated;
> >  }
> >
> >  /*
> >   * putback_zspage - add @first_page into right class's fullness list
> >- * @pool: target pool
> >   * @class: destination class
> >   * @first_page: target page
> >   *
> >   * Return @first_page's updated fullness_group
> >   */
> >-static enum fullness_group putback_zspage(struct zs_pool *pool,
> >-			struct size_class *class,
> >-			struct page *first_page)
> >+static enum fullness_group putback_zspage(struct size_class *class,
> >+					struct page *first_page)
> >  {
> >  	enum fullness_group fullness;
> >
> >@@ -1768,17 +1721,155 @@ static enum fullness_group putback_zspage(struct zs_pool *pool,
> >  	return fullness;
> >  }
> >
> >+/*
> >+ * freeze_zspage - freeze all objects in a zspage
> >+ * @class: size class of the page
> >+ * @first_page: first page of zspage
> >+ *
> >+ * Freeze all allocated objects in a zspage so objects couldn't be
> >+ * freed until unfreeze objects. It should be called under class->lock.
> >+ *
> >+ * RETURNS:
> >+ * the number of pinned objects
> >+ */
> >+static int freeze_zspage(struct size_class *class, struct page *first_page)
> >+{
> >+	unsigned long obj_idx;
> >+	struct page *obj_page;
> >+	unsigned long offset;
> >+	void *addr;
> >+	int nr_freeze = 0;
> >+
> >+	for (obj_idx = 0; obj_idx < class->objs_per_zspage; obj_idx++) {
> >+		unsigned long head;
> >+
> >+		objidx_to_page_and_offset(class, first_page, obj_idx,
> >+					&obj_page, &offset);
> >+		addr = kmap_atomic(obj_page);
> >+		head = obj_to_head(class, obj_page, addr + offset);
> >+		if (head & OBJ_ALLOCATED_TAG) {
> >+			unsigned long handle = head & ~OBJ_ALLOCATED_TAG;
> >+
> >+			if (!trypin_tag(handle)) {
> >+				kunmap_atomic(addr);
> >+				break;
> >+			}
> >+			nr_freeze++;
> >+		}
> >+		kunmap_atomic(addr);
> >+	}
> >+
> >+	return nr_freeze;
> >+}
> >+
> >+/*
> >+ * unfreeze_page - unfreeze objects freezed by freeze_zspage in a zspage
> >+ * @class: size class of the page
> >+ * @first_page: freezed zspage to unfreeze
> >+ * @nr_obj: the number of objects to unfreeze
> >+ *
> >+ * unfreeze objects in a zspage.
> >+ */
> >+static void unfreeze_zspage(struct size_class *class, struct page *first_page,
> >+			int nr_obj)
> >+{
> >+	unsigned long obj_idx;
> >+	struct page *obj_page;
> >+	unsigned long offset;
> >+	void *addr;
> >+	int nr_unfreeze = 0;
> >+
> >+	for (obj_idx = 0; obj_idx < class->objs_per_zspage &&
> >+			nr_unfreeze < nr_obj; obj_idx++) {
> >+		unsigned long head;
> >+
> >+		objidx_to_page_and_offset(class, first_page, obj_idx,
> >+					&obj_page, &offset);
> >+		addr = kmap_atomic(obj_page);
> >+		head = obj_to_head(class, obj_page, addr + offset);
> >+		if (head & OBJ_ALLOCATED_TAG) {
> >+			unsigned long handle = head & ~OBJ_ALLOCATED_TAG;
> >+
> >+			VM_BUG_ON(!testpin_tag(handle));
> >+			unpin_tag(handle);
> >+			nr_unfreeze++;
> >+		}
> >+		kunmap_atomic(addr);
> >+	}
> >+}
> >+
> >+/*
> >+ * isolate_source_page - isolate a zspage for migration source
> >+ * @class: size class of zspage for isolation
> >+ *
> >+ * Returns a zspage which are isolated from list so anyone can
> >+ * allocate a object from that page. As well, freeze all objects
> >+ * allocated in the zspage so anyone cannot access that objects
> >+ * (e.g., zs_map_object, zs_free).
> >+ */
> >  static struct page *isolate_source_page(struct size_class *class)
> >  {
> >  	int i;
> >  	struct page *page = NULL;
> >
> >  	for (i = ZS_ALMOST_EMPTY; i >= ZS_ALMOST_FULL; i--) {
> >+		int inuse, freezed;
> >+
> >  		page = class->fullness_list[i];
> >  		if (!page)
> >  			continue;
> >
> >  		remove_zspage(class, i, page);
> >+
> >+		inuse = get_zspage_inuse(page);
> >+		freezed = freeze_zspage(class, page);
> >+
> >+		if (inuse != freezed) {
> >+			unfreeze_zspage(class, page, freezed);
> >+			putback_zspage(class, page);
> >+			page = NULL;
> >+			continue;
> >+		}
> >+
> >+		break;
> >+	}
> >+
> >+	return page;
> >+}
> >+
> >+/*
> >+ * isolate_target_page - isolate a zspage for migration target
> >+ * @class: size class of zspage for isolation
> >+ *
> >+ * Returns a zspage which are isolated from list so anyone can
> >+ * allocate a object from that page. As well, freeze all objects
> >+ * allocated in the zspage so anyone cannot access that objects
> >+ * (e.g., zs_map_object, zs_free).
> >+ */
> >+static struct page *isolate_target_page(struct size_class *class)
> >+{
> >+	int i;
> >+	struct page *page;
> >+
> >+	for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
> >+		int inuse, freezed;
> >+
> >+		page = class->fullness_list[i];
> >+		if (!page)
> >+			continue;
> >+
> >+		remove_zspage(class, i, page);
> >+
> >+		inuse = get_zspage_inuse(page);
> >+		freezed = freeze_zspage(class, page);
> >+
> >+		if (inuse != freezed) {
> >+			unfreeze_zspage(class, page, freezed);
> >+			putback_zspage(class, page);
> >+			page = NULL;
> >+			continue;
> >+		}
> >+
> >  		break;
> >  	}
> >
> >@@ -1793,9 +1884,11 @@ static struct page *isolate_source_page(struct size_class *class)
> >  static unsigned long zs_can_compact(struct size_class *class)
> >  {
> >  	unsigned long obj_wasted;
> >+	unsigned long obj_allocated, obj_used;
> >
> >-	obj_wasted = zs_stat_get(class, OBJ_ALLOCATED) -
> >-		zs_stat_get(class, OBJ_USED);
> >+	obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
> >+	obj_used = zs_stat_get(class, OBJ_USED);
> >+	obj_wasted = obj_allocated - obj_used;
> >
> >  	obj_wasted /= get_maxobj_per_zspage(class->size,
> >  			class->pages_per_zspage);
> >@@ -1805,53 +1898,81 @@ static unsigned long zs_can_compact(struct size_class *class)
> >
> >  static void __zs_compact(struct zs_pool *pool, struct size_class *class)
> >  {
> >-	struct zs_compact_control cc;
> >-	struct page *src_page;
> >+	struct page *src_page = NULL;
> >  	struct page *dst_page = NULL;
> >
> >-	spin_lock(&class->lock);
> >-	while ((src_page = isolate_source_page(class))) {
> >+	while (1) {
> >+		int nr_migrated;
> >
> >-		if (!zs_can_compact(class))
> >+		spin_lock(&class->lock);
> >+		if (!zs_can_compact(class)) {
> >+			spin_unlock(&class->lock);
> >  			break;
> >+		}
> >
> >-		cc.index = 0;
> >-		cc.s_page = src_page;
> >+		/*
> >+		 * Isolate source page and freeze all objects in a zspage
> >+		 * to prevent zspage destroying.
> >+		 */
> >+		if (!src_page) {
> >+			src_page = isolate_source_page(class);
> >+			if (!src_page) {
> >+				spin_unlock(&class->lock);
> >+				break;
> >+			}
> >+		}
> >
> >-		while ((dst_page = isolate_target_page(class))) {
> >-			cc.d_page = dst_page;
> >-			/*
> >-			 * If there is no more space in dst_page, resched
> >-			 * and see if anyone had allocated another zspage.
> >-			 */
> >-			if (!migrate_zspage(pool, class, &cc))
> >+		/* Isolate target page and freeze all objects in the zspage */
> >+		if (!dst_page) {
> >+			dst_page = isolate_target_page(class);
> >+			if (!dst_page) {
> >+				spin_unlock(&class->lock);
> >  				break;
> >+			}
> >+		}
> >+		spin_unlock(&class->lock);
> 
> (Sorry to delete individual recipients due to my compliance issues.)
> 
> Hello, Minchan.
> 
> 
> Is it safe to unlock?
> 
> 
> (I assume that the system has 2 cores
> and a swap device is using zsmalloc pool.)
> If a zs compact context scheduled out after this "spin_unlock" line,
> 
> 
>    CPU A (Swap In)                CPU B (zs_free by process killed)
> ---------------------           -------------------------
>                                 ...
>                                 spin_lock(&si->lock)
>                                 ...
>                                # assume it is pinned by zs_compact context.
>                                 pin_tag(handle) --> block
> 
> ...
> spin_lock(&si->lock) --> block
> 
> 
> I think CPU A and CPU B may not be released forever.
> Am I missing something?

You didn't miss anything. It could be dead locked.
The swap_slot_free_notify is always really problem. :(
That's why I want to remove it.
I will think over how to handle it and send fix in next revision.

Thanks for the review!