Message-ID: <52766da2-41de-41ce-b60b-1118da343b8a@linux.alibaba.com>
Date: Sat, 6 Jan 2024 17:17:39 +0800
From: Wen Gu <guwen@...ux.alibaba.com>
To: Uladzislau Rezki <urezki@...il.com>
Cc: shaozhengchao <shaozhengchao@...wei.com>, linux-mm@...ck.org,
LKML <linux-kernel@...r.kernel.org>
Subject: Re: [PATCH v3 04/11] mm: vmalloc: Remove global vmap_area_root
rb-tree
On 2024/1/5 18:50, Uladzislau Rezki wrote:
> Hello, Wen Gu.
>
>>
>> Hi Uladzislau Rezki,
>>
<...>
>> Fortunately, thanks to this patch set, the global vmap_area_lock has been
>> removed and a per-node lock, vn->busy.lock, introduced. It is really helpful:
>>
>> In a 48-CPU qemu environment, the Requests/sec increased by 5 times:
>> - nginx
>> - wrk -c 1000 -t 96 -d 30 http://127.0.0.1:80
>>
>>                  vzalloced shmem    vzalloced shmem(with this patch set)
>> Requests/sec     113536.56          583729.93
>>
>>
> Thank you for confirming that your workload is improved. The nginx
> result is 5 times better!
>
Yes, thank you very much for the improvement!
>> But it still has some overhead compared to using kzalloced shared memory
>> or unsetting CONFIG_HARDENED_USERCOPY, both of which avoid the vmap area lookup:
>>
>>                  kzalloced shmem    vzalloced shmem(unset CONFIG_HARDENED_USERCOPY)
>> Requests/sec     831950.39          805164.78
>>
>>
> CONFIG_HARDENED_USERCOPY prevents copying to or from "wrong" memory regions.
> That is why, if the memory is vmalloced, it wants to make sure the address
> really belongs to a valid area; if not, the user-copy is aborted.
>
> So there is extra work involved in finding the VA associated with an address.
>
Yes, and lock contention when finding the VA is likely to be a performance
bottleneck, which is mitigated a lot by your work.
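For context, that extra work comes from the hardened-usercopy check: every
copy to or from a vzalloced buffer ends up looking up the vmap area. A rough,
condensed sketch of that path (based on mm/usercopy.c in recent kernels; not a
standalone snippet, error messages and other cases trimmed):

<snip>
/* called from check_object_size() on every copy_{to,from}_user() */
static void check_heap_object(const void *ptr, unsigned long n, bool to_user)
{
	unsigned long addr = (unsigned long)ptr;

	if (is_vmalloc_addr(ptr)) {
		/* takes vn->busy.lock while walking the busy tree */
		struct vmap_area *area = find_vmap_area(addr);

		if (!area)
			usercopy_abort("vmalloc", "no area", to_user, 0, n);

		/* the copy must stay inside the vmap area backing the buffer */
		if (n > area->va_end - addr)
			usercopy_abort("vmalloc", NULL, to_user,
				       addr - area->va_start, n);
		return;
	}

	/* ... slab/page checks for non-vmalloc memory ... */
}
<snip>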
>> So, as a newbie in Linux-mm, I would like to ask for some suggestions:
>>
>> Is it possible to further reduce the overhead caused by lock contention
>> in find_vmap_area() in this scenario (maybe this is asking too much), or is
>> the only way out not setting CONFIG_HARDENED_USERCOPY or not using a vzalloced
>> buffer in situations where concurrent kernel-userspace copies happen?
>>
> Could you please try the patch below and see if it improves this series further?
> Just in case:
>
Thank you! I tried the patch, and it seems that the wait for the rwlock_t
is still there, about as much as with the spinlock_t. (The flamegraph is
attached. I am not sure why the read_lock waits so long, given that there is
no frequent write_lock contention.)
                 vzalloced shmem(spinlock_t)    vzalloced shmem(rwlock_t)
Requests/sec     583729.93                      460007.44
So I guess the overhead of finding the vmap area is inevitable here, and the
original spin_lock is fine in this series.
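(Purely as an illustration of why a reader lock does not obviously help here:
a minimal userspace sketch of the same pattern, many threads taking one shared
lock for a very short critical section. This is not the kernel code path; the
thread count is just borrowed from the wrk run above.)

<snip>
/* build with: gcc -O2 -pthread rwlock_vs_spinlock.c */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define NTHREADS 96
#define ITERS    200000

static pthread_rwlock_t rwlock = PTHREAD_RWLOCK_INITIALIZER;
static pthread_spinlock_t spinlock;
static volatile uint64_t shared;

static void *rw_worker(void *arg)
{
	for (long i = 0; i < ITERS; i++) {
		pthread_rwlock_rdlock(&rwlock);
		(void)shared;		/* tiny read-only critical section */
		pthread_rwlock_unlock(&rwlock);
	}
	return arg;
}

static void *spin_worker(void *arg)
{
	for (long i = 0; i < ITERS; i++) {
		pthread_spin_lock(&spinlock);
		(void)shared;		/* same tiny critical section */
		pthread_spin_unlock(&spinlock);
	}
	return arg;
}

static double run(void *(*fn)(void *))
{
	pthread_t tids[NTHREADS];
	struct timespec t0, t1;

	clock_gettime(CLOCK_MONOTONIC, &t0);
	for (int i = 0; i < NTHREADS; i++)
		pthread_create(&tids[i], NULL, fn, NULL);
	for (int i = 0; i < NTHREADS; i++)
		pthread_join(tids[i], NULL);
	clock_gettime(CLOCK_MONOTONIC, &t1);

	return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
}

int main(void)
{
	pthread_spin_init(&spinlock, PTHREAD_PROCESS_PRIVATE);
	printf("rwlock, readers only: %.2fs\n", run(rw_worker));
	printf("spinlock:             %.2fs\n", run(spin_worker));
	return 0;
}
<snip>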
Thanks again for your help!
Best regards,
Wen Gu
> <snip>
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index e30dabf68263..40acf53cadfb 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -772,7 +772,7 @@ static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);
> struct rb_list {
> struct rb_root root;
> struct list_head head;
> - spinlock_t lock;
> + rwlock_t lock;
> };
>
> struct vmap_pool {
> @@ -947,19 +947,19 @@ find_vmap_area_exceed_addr_lock(unsigned long addr, struct vmap_area **va)
> for (i = 0; i < nr_vmap_nodes; i++) {
> vn = &vmap_nodes[i];
>
> - spin_lock(&vn->busy.lock);
> + read_lock(&vn->busy.lock);
> va_lowest = __find_vmap_area_exceed_addr(addr, &vn->busy.root);
> if (va_lowest) {
> if (!va_node || va_lowest->va_start < (*va)->va_start) {
> if (va_node)
> - spin_unlock(&va_node->busy.lock);
> + read_unlock(&va_node->busy.lock);
>
> *va = va_lowest;
> va_node = vn;
> continue;
> }
> }
> - spin_unlock(&vn->busy.lock);
> + read_unlock(&vn->busy.lock);
> }
>
> return va_node;
> @@ -1695,9 +1695,9 @@ static void free_vmap_area(struct vmap_area *va)
> /*
> * Remove from the busy tree/list.
> */
> - spin_lock(&vn->busy.lock);
> + write_lock(&vn->busy.lock);
> unlink_va(va, &vn->busy.root);
> - spin_unlock(&vn->busy.lock);
> + write_unlock(&vn->busy.lock);
>
> /*
> * Insert/Merge it back to the free tree/list.
> @@ -1901,9 +1901,9 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
>
> vn = addr_to_node(va->va_start);
>
> - spin_lock(&vn->busy.lock);
> + write_lock(&vn->busy.lock);
> insert_vmap_area(va, &vn->busy.root, &vn->busy.head);
> - spin_unlock(&vn->busy.lock);
> + write_unlock(&vn->busy.lock);
>
> BUG_ON(!IS_ALIGNED(va->va_start, align));
> BUG_ON(va->va_start < vstart);
> @@ -2123,10 +2123,10 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end,
> if (RB_EMPTY_ROOT(&vn->lazy.root))
> continue;
>
> - spin_lock(&vn->lazy.lock);
> + write_lock(&vn->lazy.lock);
> WRITE_ONCE(vn->lazy.root.rb_node, NULL);
> list_replace_init(&vn->lazy.head, &vn->purge_list);
> - spin_unlock(&vn->lazy.lock);
> + write_unlock(&vn->lazy.lock);
>
> start = min(start, list_first_entry(&vn->purge_list,
> struct vmap_area, list)->va_start);
> @@ -2223,9 +2223,9 @@ static void free_vmap_area_noflush(struct vmap_area *va)
> vn = is_vn_id_valid(vn_id) ?
> id_to_node(vn_id):addr_to_node(va->va_start);
>
> - spin_lock(&vn->lazy.lock);
> + write_lock(&vn->lazy.lock);
> insert_vmap_area(va, &vn->lazy.root, &vn->lazy.head);
> - spin_unlock(&vn->lazy.lock);
> + write_unlock(&vn->lazy.lock);
>
> trace_free_vmap_area_noflush(va_start, nr_lazy, nr_lazy_max);
>
> @@ -2272,9 +2272,9 @@ struct vmap_area *find_vmap_area(unsigned long addr)
> do {
> vn = &vmap_nodes[i];
>
> - spin_lock(&vn->busy.lock);
> + read_lock(&vn->busy.lock);
> va = __find_vmap_area(addr, &vn->busy.root);
> - spin_unlock(&vn->busy.lock);
> + read_unlock(&vn->busy.lock);
>
> if (va)
> return va;
> @@ -2293,11 +2293,11 @@ static struct vmap_area *find_unlink_vmap_area(unsigned long addr)
> do {
> vn = &vmap_nodes[i];
>
> - spin_lock(&vn->busy.lock);
> + write_lock(&vn->busy.lock);
> va = __find_vmap_area(addr, &vn->busy.root);
> if (va)
> unlink_va(va, &vn->busy.root);
> - spin_unlock(&vn->busy.lock);
> + write_unlock(&vn->busy.lock);
>
> if (va)
> return va;
> @@ -2514,9 +2514,9 @@ static void free_vmap_block(struct vmap_block *vb)
> BUG_ON(tmp != vb);
>
> vn = addr_to_node(vb->va->va_start);
> - spin_lock(&vn->busy.lock);
> + write_lock(&vn->busy.lock);
> unlink_va(vb->va, &vn->busy.root);
> - spin_unlock(&vn->busy.lock);
> + write_unlock(&vn->busy.lock);
>
> free_vmap_area_noflush(vb->va);
> kfree_rcu(vb, rcu_head);
> @@ -2942,9 +2942,9 @@ static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
> {
> struct vmap_node *vn = addr_to_node(va->va_start);
>
> - spin_lock(&vn->busy.lock);
> + read_lock(&vn->busy.lock);
> setup_vmalloc_vm_locked(vm, va, flags, caller);
> - spin_unlock(&vn->busy.lock);
> + read_unlock(&vn->busy.lock);
> }
>
> static void clear_vm_uninitialized_flag(struct vm_struct *vm)
> @@ -4214,19 +4214,19 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
>
> next_va:
> next = va->va_end;
> - spin_unlock(&vn->busy.lock);
> + read_unlock(&vn->busy.lock);
> } while ((vn = find_vmap_area_exceed_addr_lock(next, &va)));
>
> finished_zero:
> if (vn)
> - spin_unlock(&vn->busy.lock);
> + read_unlock(&vn->busy.lock);
>
> /* zero-fill memory holes */
> return count - remains + zero_iter(iter, remains);
> finished:
> /* Nothing remains, or We couldn't copy/zero everything. */
> if (vn)
> - spin_unlock(&vn->busy.lock);
> + read_unlock(&vn->busy.lock);
>
> return count - remains;
> }
> @@ -4563,11 +4563,11 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
> for (area = 0; area < nr_vms; area++) {
> struct vmap_node *vn = addr_to_node(vas[area]->va_start);
>
> - spin_lock(&vn->busy.lock);
> + write_lock(&vn->busy.lock);
> insert_vmap_area(vas[area], &vn->busy.root, &vn->busy.head);
> setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
> pcpu_get_vm_areas);
> - spin_unlock(&vn->busy.lock);
> + write_unlock(&vn->busy.lock);
> }
>
> /*
> @@ -4687,7 +4687,7 @@ bool vmalloc_dump_obj(void *object)
>
> vn = addr_to_node((unsigned long)objp);
>
> - if (spin_trylock(&vn->busy.lock)) {
> + if (read_trylock(&vn->busy.lock)) {
> va = __find_vmap_area(addr, &vn->busy.root);
>
> if (va && va->vm) {
> @@ -4697,7 +4697,7 @@ bool vmalloc_dump_obj(void *object)
> success = true;
> }
>
> - spin_unlock(&vn->busy.lock);
> + read_unlock(&vn->busy.lock);
> }
>
> if (success)
> @@ -4742,13 +4742,13 @@ static void show_purge_info(struct seq_file *m)
> for (i = 0; i < nr_vmap_nodes; i++) {
> vn = &vmap_nodes[i];
>
> - spin_lock(&vn->lazy.lock);
> + read_lock(&vn->lazy.lock);
> list_for_each_entry(va, &vn->lazy.head, list) {
> seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n",
> (void *)va->va_start, (void *)va->va_end,
> va->va_end - va->va_start);
> }
> - spin_unlock(&vn->lazy.lock);
> + read_unlock(&vn->lazy.lock);
> }
> }
>
> @@ -4762,7 +4762,7 @@ static int vmalloc_info_show(struct seq_file *m, void *p)
> for (i = 0; i < nr_vmap_nodes; i++) {
> vn = &vmap_nodes[i];
>
> - spin_lock(&vn->busy.lock);
> + read_lock(&vn->busy.lock);
> list_for_each_entry(va, &vn->busy.head, list) {
> if (!va->vm) {
> if (va->flags & VMAP_RAM)
> @@ -4808,7 +4808,7 @@ static int vmalloc_info_show(struct seq_file *m, void *p)
> show_numa_info(m, v);
> seq_putc(m, '\n');
> }
> - spin_unlock(&vn->busy.lock);
> + read_unlock(&vn->busy.lock);
> }
>
> /*
> @@ -4902,11 +4902,11 @@ static void vmap_init_nodes(void)
> vn = &vmap_nodes[n];
> vn->busy.root = RB_ROOT;
> INIT_LIST_HEAD(&vn->busy.head);
> - spin_lock_init(&vn->busy.lock);
> + rwlock_init(&vn->busy.lock);
>
> vn->lazy.root = RB_ROOT;
> INIT_LIST_HEAD(&vn->lazy.head);
> - spin_lock_init(&vn->lazy.lock);
> + rwlock_init(&vn->lazy.lock);
>
> for (i = 0; i < MAX_VA_SIZE_PAGES; i++) {
> INIT_LIST_HEAD(&vn->pool[i].head);
> <snip>
>
> Thank you!
>
> --
> Uladzislau Rezki
Download attachment "vzalloc_t96_improve_rwlock.svg" of type "image/svg+xml" (252810 bytes)