Date:	Mon, 04 Apr 2016 17:04:16 +0900
From:	Chulmin Kim <cmlaika.kim@...sung.com>
To:	Minchan Kim <minchan@...nel.org>,
	Andrew Morton <akpm@...ux-foundation.org>
Cc:	linux-kernel@...r.kernel.org, linux-mm@...ck.org
Subject: Re: [PATCH v3 12/16] zsmalloc: zs_compact refactoring

On 2016-03-30 16:12, Minchan Kim wrote:
> Currently, we rely on class->lock to prevent zspage destruction.
> That was fine until now because the critical section is short, but
> with run-time migration it could become long, so class->lock is no
> longer a good approach.
>
> So, this patch introduces the [un]freeze_zspage functions, which
> freeze the allocated objects in a zspage by pinning their tags so
> that users cannot free the objects in use. With those functions,
> this patch redesigns compaction.
>
> Those functions will also be used to implement zspage runtime
> migration.
>
> Signed-off-by: Minchan Kim <minchan@...nel.org>
> ---
>   mm/zsmalloc.c | 393 ++++++++++++++++++++++++++++++++++++++--------------------
>   1 file changed, 257 insertions(+), 136 deletions(-)
>
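
[ For anyone following along: my reading of the freeze approach is that every
  allocated object's handle carries a pin bit, and freezing a zspage means
  taking that bit for each allocated object, so a concurrent zs_free() has to
  wait on the pin rather than on class->lock. Below is a rough userspace model
  of that idea -- made-up names and C11 atomics, not the kernel implementation
  -- just to make the question further down concrete. ]

/*
 * Rough userspace model of the pin-bit idea -- hypothetical names, not the
 * kernel implementation.  Build with: cc -std=c11 -o pin pin.c
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define PIN_BIT 0UL			/* stands in for HANDLE_PIN_BIT */

struct handle { atomic_ulong word; };	/* stands in for a zsmalloc handle */

static bool try_pin(struct handle *h)
{
	unsigned long old = atomic_fetch_or(&h->word, 1UL << PIN_BIT);

	return !(old & (1UL << PIN_BIT));	/* pinned only if bit was clear */
}

static void unpin(struct handle *h)
{
	atomic_fetch_and(&h->word, ~(1UL << PIN_BIT));
}

int main(void)
{
	struct handle h;

	atomic_init(&h.word, 0);

	/* "compaction" freezes the object by taking the pin bit ... */
	printf("freeze: %s\n", try_pin(&h) ? "pinned" : "busy");
	/* ... so a concurrent "free" cannot take it and has to wait */
	printf("free:   %s\n", try_pin(&h) ? "pinned" : "busy");
	unpin(&h);
	return 0;
}
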
> diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
> index b11dcd718502..ac8ca7b10720 100644
> --- a/mm/zsmalloc.c
> +++ b/mm/zsmalloc.c
> @@ -922,6 +922,13 @@ static unsigned long obj_to_head(struct size_class *class, struct page *page,
>   		return *(unsigned long *)obj;
>   }
>
> +static inline int testpin_tag(unsigned long handle)
> +{
> +	unsigned long *ptr = (unsigned long *)handle;
> +
> +	return test_bit(HANDLE_PIN_BIT, ptr);
> +}
> +
>   static inline int trypin_tag(unsigned long handle)
>   {
>   	unsigned long *ptr = (unsigned long *)handle;
> @@ -949,8 +956,7 @@ static void reset_page(struct page *page)
>   	page->freelist = NULL;
>   }
>
> -static void free_zspage(struct zs_pool *pool, struct size_class *class,
> -			struct page *first_page)
> +static void free_zspage(struct zs_pool *pool, struct page *first_page)
>   {
>   	struct page *nextp, *tmp, *head_extra;
>
> @@ -973,11 +979,6 @@ static void free_zspage(struct zs_pool *pool, struct size_class *class,
>   	}
>   	reset_page(head_extra);
>   	__free_page(head_extra);
> -
> -	zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
> -			class->size, class->pages_per_zspage));
> -	atomic_long_sub(class->pages_per_zspage,
> -				&pool->pages_allocated);
>   }
>
>   /* Initialize a newly allocated zspage */
> @@ -1325,6 +1326,11 @@ static bool zspage_full(struct size_class *class, struct page *first_page)
>   	return get_zspage_inuse(first_page) == class->objs_per_zspage;
>   }
>
> +static bool zspage_empty(struct size_class *class, struct page *first_page)
> +{
> +	return get_zspage_inuse(first_page) == 0;
> +}
> +
>   unsigned long zs_get_total_pages(struct zs_pool *pool)
>   {
>   	return atomic_long_read(&pool->pages_allocated);
> @@ -1455,7 +1461,6 @@ static unsigned long obj_malloc(struct size_class *class,
>   		set_page_private(first_page, handle | OBJ_ALLOCATED_TAG);
>   	kunmap_atomic(vaddr);
>   	mod_zspage_inuse(first_page, 1);
> -	zs_stat_inc(class, OBJ_USED, 1);
>
>   	obj = location_to_obj(m_page, obj);
>
> @@ -1510,6 +1515,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
>   	}
>
>   	obj = obj_malloc(class, first_page, handle);
> +	zs_stat_inc(class, OBJ_USED, 1);
>   	/* Now move the zspage to another fullness group, if required */
>   	fix_fullness_group(class, first_page);
>   	record_obj(handle, obj);
> @@ -1540,7 +1546,6 @@ static void obj_free(struct size_class *class, unsigned long obj)
>   	kunmap_atomic(vaddr);
>   	set_freeobj(first_page, f_objidx);
>   	mod_zspage_inuse(first_page, -1);
> -	zs_stat_dec(class, OBJ_USED, 1);
>   }
>
>   void zs_free(struct zs_pool *pool, unsigned long handle)
> @@ -1564,10 +1569,19 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
>
>   	spin_lock(&class->lock);
>   	obj_free(class, obj);
> +	zs_stat_dec(class, OBJ_USED, 1);
>   	fullness = fix_fullness_group(class, first_page);
> -	if (fullness == ZS_EMPTY)
> -		free_zspage(pool, class, first_page);
> +	if (fullness == ZS_EMPTY) {
> +		zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
> +				class->size, class->pages_per_zspage));
> +		spin_unlock(&class->lock);
> +		atomic_long_sub(class->pages_per_zspage,
> +					&pool->pages_allocated);
> +		free_zspage(pool, first_page);
> +		goto out;
> +	}
>   	spin_unlock(&class->lock);
> +out:
>   	unpin_tag(handle);
>
>   	free_handle(pool, handle);
> @@ -1637,127 +1651,66 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
>   	kunmap_atomic(s_addr);
>   }
>
> -/*
> - * Find alloced object in zspage from index object and
> - * return handle.
> - */
> -static unsigned long find_alloced_obj(struct size_class *class,
> -					struct page *page, int index)
> +static unsigned long handle_from_obj(struct size_class *class,
> +				struct page *first_page, int obj_idx)
>   {
> -	unsigned long head;
> -	int offset = 0;
> -	unsigned long handle = 0;
> -	void *addr = kmap_atomic(page);
> -
> -	if (!is_first_page(page))
> -		offset = page->index;
> -	offset += class->size * index;
> -
> -	while (offset < PAGE_SIZE) {
> -		head = obj_to_head(class, page, addr + offset);
> -		if (head & OBJ_ALLOCATED_TAG) {
> -			handle = head & ~OBJ_ALLOCATED_TAG;
> -			if (trypin_tag(handle))
> -				break;
> -			handle = 0;
> -		}
> +	struct page *page;
> +	unsigned long offset_in_page;
> +	void *addr;
> +	unsigned long head, handle = 0;
>
> -		offset += class->size;
> -		index++;
> -	}
> +	objidx_to_page_and_offset(class, first_page, obj_idx,
> +			&page, &offset_in_page);
>
> +	addr = kmap_atomic(page);
> +	head = obj_to_head(class, page, addr + offset_in_page);
> +	if (head & OBJ_ALLOCATED_TAG)
> +		handle = head & ~OBJ_ALLOCATED_TAG;
>   	kunmap_atomic(addr);
> +
>   	return handle;
>   }
>
> -struct zs_compact_control {
> -	/* Source page for migration which could be a subpage of zspage. */
> -	struct page *s_page;
> -	/* Destination page for migration which should be a first page
> -	 * of zspage. */
> -	struct page *d_page;
> -	 /* Starting object index within @s_page which used for live object
> -	  * in the subpage. */
> -	int index;
> -};
> -
> -static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
> -				struct zs_compact_control *cc)
> +static int migrate_zspage(struct size_class *class, struct page *dst_page,
> +				struct page *src_page)
>   {
> -	unsigned long used_obj, free_obj;
>   	unsigned long handle;
> -	struct page *s_page = cc->s_page;
> -	struct page *d_page = cc->d_page;
> -	unsigned long index = cc->index;
> -	int ret = 0;
> +	unsigned long old_obj, new_obj;
> +	int i;
> +	int nr_migrated = 0;
>
> -	while (1) {
> -		handle = find_alloced_obj(class, s_page, index);
> -		if (!handle) {
> -			s_page = get_next_page(s_page);
> -			if (!s_page)
> -				break;
> -			index = 0;
> +	for (i = 0; i < class->objs_per_zspage; i++) {
> +		handle = handle_from_obj(class, src_page, i);
> +		if (!handle)
>   			continue;
> -		}
> -
> -		/* Stop if there is no more space */
> -		if (zspage_full(class, d_page)) {
> -			unpin_tag(handle);
> -			ret = -ENOMEM;
> +		if (zspage_full(class, dst_page))
>   			break;
> -		}
> -
> -		used_obj = handle_to_obj(handle);
> -		free_obj = obj_malloc(class, d_page, handle);
> -		zs_object_copy(class, free_obj, used_obj);
> -		index++;
> +		old_obj = handle_to_obj(handle);
> +		new_obj = obj_malloc(class, dst_page, handle);
> +		zs_object_copy(class, new_obj, old_obj);
> +		nr_migrated++;
>   		/*
>   		 * record_obj updates handle's value to free_obj and it will
>   		 * invalidate lock bit(ie, HANDLE_PIN_BIT) of handle, which
>   		 * breaks synchronization using pin_tag(e,g, zs_free) so
>   		 * let's keep the lock bit.
>   		 */
> -		free_obj |= BIT(HANDLE_PIN_BIT);
> -		record_obj(handle, free_obj);
> -		unpin_tag(handle);
> -		obj_free(class, used_obj);
> +		new_obj |= BIT(HANDLE_PIN_BIT);
> +		record_obj(handle, new_obj);
> +		obj_free(class, old_obj);
>   	}
> -
> -	/* Remember last position in this iteration */
> -	cc->s_page = s_page;
> -	cc->index = index;
> -
> -	return ret;
> -}
> -
> -static struct page *isolate_target_page(struct size_class *class)
> -{
> -	int i;
> -	struct page *page;
> -
> -	for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
> -		page = class->fullness_list[i];
> -		if (page) {
> -			remove_zspage(class, i, page);
> -			break;
> -		}
> -	}
> -
> -	return page;
> +	return nr_migrated;
>   }
>
>   /*
>    * putback_zspage - add @first_page into right class's fullness list
> - * @pool: target pool
>    * @class: destination class
>    * @first_page: target page
>    *
>    * Return @first_page's updated fullness_group
>    */
> -static enum fullness_group putback_zspage(struct zs_pool *pool,
> -			struct size_class *class,
> -			struct page *first_page)
> +static enum fullness_group putback_zspage(struct size_class *class,
> +					struct page *first_page)
>   {
>   	enum fullness_group fullness;
>
> @@ -1768,17 +1721,155 @@ static enum fullness_group putback_zspage(struct zs_pool *pool,
>   	return fullness;
>   }
>
> +/*
> + * freeze_zspage - freeze all objects in a zspage
> + * @class: size class of the page
> + * @first_page: first page of zspage
> + *
> + * Freeze all allocated objects in a zspage so that they cannot be
> + * freed until they are unfrozen. It should be called under class->lock.
> + *
> + * RETURNS:
> + * the number of pinned objects
> + */
> +static int freeze_zspage(struct size_class *class, struct page *first_page)
> +{
> +	unsigned long obj_idx;
> +	struct page *obj_page;
> +	unsigned long offset;
> +	void *addr;
> +	int nr_freeze = 0;
> +
> +	for (obj_idx = 0; obj_idx < class->objs_per_zspage; obj_idx++) {
> +		unsigned long head;
> +
> +		objidx_to_page_and_offset(class, first_page, obj_idx,
> +					&obj_page, &offset);
> +		addr = kmap_atomic(obj_page);
> +		head = obj_to_head(class, obj_page, addr + offset);
> +		if (head & OBJ_ALLOCATED_TAG) {
> +			unsigned long handle = head & ~OBJ_ALLOCATED_TAG;
> +
> +			if (!trypin_tag(handle)) {
> +				kunmap_atomic(addr);
> +				break;
> +			}
> +			nr_freeze++;
> +		}
> +		kunmap_atomic(addr);
> +	}
> +
> +	return nr_freeze;
> +}
> +
> +/*
> + * unfreeze_zspage - unfreeze objects frozen by freeze_zspage in a zspage
> + * @class: size class of the page
> + * @first_page: frozen zspage to unfreeze
> + * @nr_obj: the number of objects to unfreeze
> + *
> + * Unfreeze objects in a zspage.
> + */
> +static void unfreeze_zspage(struct size_class *class, struct page *first_page,
> +			int nr_obj)
> +{
> +	unsigned long obj_idx;
> +	struct page *obj_page;
> +	unsigned long offset;
> +	void *addr;
> +	int nr_unfreeze = 0;
> +
> +	for (obj_idx = 0; obj_idx < class->objs_per_zspage &&
> +			nr_unfreeze < nr_obj; obj_idx++) {
> +		unsigned long head;
> +
> +		objidx_to_page_and_offset(class, first_page, obj_idx,
> +					&obj_page, &offset);
> +		addr = kmap_atomic(obj_page);
> +		head = obj_to_head(class, obj_page, addr + offset);
> +		if (head & OBJ_ALLOCATED_TAG) {
> +			unsigned long handle = head & ~OBJ_ALLOCATED_TAG;
> +
> +			VM_BUG_ON(!testpin_tag(handle));
> +			unpin_tag(handle);
> +			nr_unfreeze++;
> +		}
> +		kunmap_atomic(addr);
> +	}
> +}
> +
> +/*
> + * isolate_source_page - isolate a zspage for migration source
> + * @class: size class of zspage for isolation
> + *
> + * Returns a zspage isolated from its fullness list so that no one can
> + * allocate an object from it. In addition, all objects allocated in the
> + * zspage are frozen so that no one can access them
> + * (e.g., via zs_map_object or zs_free).
> + */
>   static struct page *isolate_source_page(struct size_class *class)
>   {
>   	int i;
>   	struct page *page = NULL;
>
>   	for (i = ZS_ALMOST_EMPTY; i >= ZS_ALMOST_FULL; i--) {
> +		int inuse, freezed;
> +
>   		page = class->fullness_list[i];
>   		if (!page)
>   			continue;
>
>   		remove_zspage(class, i, page);
> +
> +		inuse = get_zspage_inuse(page);
> +		freezed = freeze_zspage(class, page);
> +
> +		if (inuse != freezed) {
> +			unfreeze_zspage(class, page, freezed);
> +			putback_zspage(class, page);
> +			page = NULL;
> +			continue;
> +		}
> +
> +		break;
> +	}
> +
> +	return page;
> +}
> +
> +/*
> + * isolate_target_page - isolate a zspage for migration target
> + * @class: size class of zspage for isolation
> + *
> + * Returns a zspage isolated from its fullness list so that no one can
> + * allocate an object from it. In addition, all objects allocated in the
> + * zspage are frozen so that no one can access them
> + * (e.g., via zs_map_object or zs_free).
> + */
> +static struct page *isolate_target_page(struct size_class *class)
> +{
> +	int i;
> +	struct page *page;
> +
> +	for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
> +		int inuse, freezed;
> +
> +		page = class->fullness_list[i];
> +		if (!page)
> +			continue;
> +
> +		remove_zspage(class, i, page);
> +
> +		inuse = get_zspage_inuse(page);
> +		freezed = freeze_zspage(class, page);
> +
> +		if (inuse != freezed) {
> +			unfreeze_zspage(class, page, freezed);
> +			putback_zspage(class, page);
> +			page = NULL;
> +			continue;
> +		}
> +
>   		break;
>   	}
>
> @@ -1793,9 +1884,11 @@ static struct page *isolate_source_page(struct size_class *class)
>   static unsigned long zs_can_compact(struct size_class *class)
>   {
>   	unsigned long obj_wasted;
> +	unsigned long obj_allocated, obj_used;
>
> -	obj_wasted = zs_stat_get(class, OBJ_ALLOCATED) -
> -		zs_stat_get(class, OBJ_USED);
> +	obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
> +	obj_used = zs_stat_get(class, OBJ_USED);
> +	obj_wasted = obj_allocated - obj_used;
>
>   	obj_wasted /= get_maxobj_per_zspage(class->size,
>   			class->pages_per_zspage);
> @@ -1805,53 +1898,81 @@ static unsigned long zs_can_compact(struct size_class *class)
>
>   static void __zs_compact(struct zs_pool *pool, struct size_class *class)
>   {
> -	struct zs_compact_control cc;
> -	struct page *src_page;
> +	struct page *src_page = NULL;
>   	struct page *dst_page = NULL;
>
> -	spin_lock(&class->lock);
> -	while ((src_page = isolate_source_page(class))) {
> +	while (1) {
> +		int nr_migrated;
>
> -		if (!zs_can_compact(class))
> +		spin_lock(&class->lock);
> +		if (!zs_can_compact(class)) {
> +			spin_unlock(&class->lock);
>   			break;
> +		}
>
> -		cc.index = 0;
> -		cc.s_page = src_page;
> +		/*
> +		 * Isolate the source page and freeze all objects in the
> +		 * zspage to prevent the zspage from being destroyed.
> +		 */
> +		if (!src_page) {
> +			src_page = isolate_source_page(class);
> +			if (!src_page) {
> +				spin_unlock(&class->lock);
> +				break;
> +			}
> +		}
>
> -		while ((dst_page = isolate_target_page(class))) {
> -			cc.d_page = dst_page;
> -			/*
> -			 * If there is no more space in dst_page, resched
> -			 * and see if anyone had allocated another zspage.
> -			 */
> -			if (!migrate_zspage(pool, class, &cc))
> +		/* Isolate target page and freeze all objects in the zspage */
> +		if (!dst_page) {
> +			dst_page = isolate_target_page(class);
> +			if (!dst_page) {
> +				spin_unlock(&class->lock);
>   				break;
> +			}
> +		}
> +		spin_unlock(&class->lock);

(Sorry, I had to drop the individual recipients due to compliance
restrictions on my side.)

Hello, Minchan.


Is it safe to drop class->lock here?


(Assume the system has 2 cores and that a swap device is backed by a
zsmalloc pool.)
If the zs_compact context is scheduled out right after this "spin_unlock"
line, the following can happen:


    CPU A (swap in)                 CPU B (zs_free, process being killed)
  ---------------------           ----------------------------------------
                                   ...
                                   spin_lock(&si->lock)
                                   ...
                                   # handle is already pinned by the
                                   # zs_compact context
                                   pin_tag(handle)       --> blocks

  ...
  spin_lock(&si->lock)  --> blocks


I think CPU A and CPU B may never be released: both CPUs are busy
blocking/spinning, so the zs_compact context that holds the pin never gets
scheduled back in to unpin the handle.
Am I missing something?
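
To make the scenario concrete, here is a tiny userspace model of the blocking
chain (a pthread mutex standing in for si->lock, an atomic flag standing in
for the pinned handle; all names are made up and this is not kernel code):

/*
 * Tiny userspace model of the blocking chain above -- hypothetical names,
 * not kernel code.  Build with: cc -std=c11 -pthread -o hang hang.c
 * It is expected to hang: nothing in this model ever clears the pin,
 * which stands in for the descheduled zs_compact context.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t si_lock = PTHREAD_MUTEX_INITIALIZER;	/* ~ si->lock */
static atomic_int pinned = 1;	/* zs_compact pinned the handle, then was
				 * scheduled out before it could unpin */

static void *cpu_b_free(void *arg)	/* zs_free() path, process killed */
{
	(void)arg;
	pthread_mutex_lock(&si_lock);		/* takes si->lock ... */
	while (atomic_load(&pinned))		/* ... then spins in pin_tag() */
		;
	pthread_mutex_unlock(&si_lock);
	return NULL;
}

static void *cpu_a_swapin(void *arg)	/* swap-in path */
{
	(void)arg;
	sleep(1);				/* let CPU B win si->lock first */
	pthread_mutex_lock(&si_lock);		/* blocks behind CPU B */
	pthread_mutex_unlock(&si_lock);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&b, NULL, cpu_b_free, NULL);
	pthread_create(&a, NULL, cpu_a_swapin, NULL);

	puts("CPU B spins on the pin while holding si->lock,");
	puts("CPU A blocks on si->lock; with only 2 CPUs the compaction");
	puts("context that would unpin the handle never runs again.");

	pthread_join(a, NULL);			/* never returns */
	pthread_join(b, NULL);
	return 0;
}
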

Thanks.
Chulmin


> +
> +		nr_migrated = migrate_zspage(class, dst_page, src_page);
>
> -			VM_BUG_ON_PAGE(putback_zspage(pool, class,
> -				dst_page) == ZS_EMPTY, dst_page);
> +		if (zspage_full(class, dst_page)) {
> +			spin_lock(&class->lock);
> +			putback_zspage(class, dst_page);
> +			unfreeze_zspage(class, dst_page,
> +				class->objs_per_zspage);
> +			spin_unlock(&class->lock);
> +			dst_page = NULL;
>   		}
>
> -		/* Stop if we couldn't find slot */
> -		if (dst_page == NULL)
> -			break;
> +		if (zspage_empty(class, src_page)) {
> +			free_zspage(pool, src_page);
> +			spin_lock(&class->lock);
> +			zs_stat_dec(class, OBJ_ALLOCATED,
> +				get_maxobj_per_zspage(
> +				class->size, class->pages_per_zspage));
> +			atomic_long_sub(class->pages_per_zspage,
> +					&pool->pages_allocated);
>
> -		VM_BUG_ON_PAGE(putback_zspage(pool, class,
> -				dst_page) == ZS_EMPTY, dst_page);
> -		if (putback_zspage(pool, class, src_page) == ZS_EMPTY) {
>   			pool->stats.pages_compacted += class->pages_per_zspage;
>   			spin_unlock(&class->lock);
> -			free_zspage(pool, class, src_page);
> -		} else {
> -			spin_unlock(&class->lock);
> +			src_page = NULL;
>   		}
> +	}
>
> -		cond_resched();
> -		spin_lock(&class->lock);
> +	if (!src_page && !dst_page)
> +		return;
> +
> +	spin_lock(&class->lock);
> +	if (src_page) {
> +		putback_zspage(class, src_page);
> +		unfreeze_zspage(class, src_page,
> +				class->objs_per_zspage);
>   	}
>
> -	if (src_page)
> -		VM_BUG_ON_PAGE(putback_zspage(pool, class,
> -				src_page) == ZS_EMPTY, src_page);
> +	if (dst_page) {
> +		putback_zspage(class, dst_page);
> +		unfreeze_zspage(class, dst_page,
> +				class->objs_per_zspage);
> +	}
>
>   	spin_unlock(&class->lock);
>   }
>
