[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <df576430-8ead-67f5-44c1-f65ae2843886@huawei.com>
Date: Wed, 4 Dec 2024 09:21:38 +0800
From: zuoze <zuoze1@...wei.com>
To: Uladzislau Rezki <urezki@...il.com>, Kefeng Wang
<wangkefeng.wang@...wei.com>
CC: Matthew Wilcox <willy@...radead.org>, <gustavoars@...nel.org>,
<akpm@...ux-foundation.org>, <linux-hardening@...r.kernel.org>,
<linux-mm@...ck.org>, <keescook@...omium.org>
Subject: Re: [PATCH -next] mm: usercopy: add a debugfs interface to bypass the
vmalloc check.
在 2024/12/4 3:02, Uladzislau Rezki 写道:
> On Tue, Dec 03, 2024 at 03:20:04PM +0100, Uladzislau Rezki wrote:
>> On Tue, Dec 03, 2024 at 10:10:26PM +0800, Kefeng Wang wrote:
>>>
>>>
>>> On 2024/12/3 21:51, Uladzislau Rezki wrote:
>>>> On Tue, Dec 03, 2024 at 09:45:09PM +0800, Kefeng Wang wrote:
>>>>>
>>>>>
>>>>> On 2024/12/3 21:39, Uladzislau Rezki wrote:
>>>>>> On Tue, Dec 03, 2024 at 09:30:09PM +0800, Kefeng Wang wrote:
>>>>>>>
>>>>>>>
>>>>>>> On 2024/12/3 21:10, zuoze wrote:
>>>>>>>>
>>>>>>>>
>>>>>>>> 在 2024/12/3 20:39, Uladzislau Rezki 写道:
>>>>>>>>> On Tue, Dec 03, 2024 at 07:23:44PM +0800, zuoze wrote:
>>>>>>>>>> We have implemented host-guest communication based on the TUN device
>>>>>>>>>> using XSK[1]. The hardware is a Kunpeng 920 machine (ARM architecture),
>>>>>>>>>> and the operating system is based on the 6.6 LTS version with kernel
>>>>>>>>>> version 6.6. The specific stack for hotspot collection is as follows:
>>>>>>>>>>
>>>>>>>>>> - 100.00% 0.00% vhost-12384 [unknown] [k] 0000000000000000
>>>>>>>>>> - ret_from_fork
>>>>>>>>>> - 99.99% vhost_task_fn
>>>>>>>>>> - 99.98% 0xffffdc59f619876c
>>>>>>>>>> - 98.99% handle_rx_kick
>>>>>>>>>> - 98.94% handle_rx
>>>>>>>>>> - 94.92% tun_recvmsg
>>>>>>>>>> - 94.76% tun_do_read
>>>>>>>>>> - 94.62% tun_put_user_xdp_zc
>>>>>>>>>> - 63.53% __check_object_size
>>>>>>>>>> - 63.49% __check_object_size.part.0
>>>>>>>>>> find_vmap_area
>>>>>>>>>> - 30.02% _copy_to_iter
>>>>>>>>>> __arch_copy_to_user
>>>>>>>>>> - 2.27% get_rx_bufs
>>>>>>>>>> - 2.12% vhost_get_vq_desc
>>>>>>>>>> 1.49% __arch_copy_from_user
>>>>>>>>>> - 0.89% peek_head_len
>>>>>>>>>> 0.54% xsk_tx_peek_desc
>>>>>>>>>> - 0.68% vhost_add_used_and_signal_n
>>>>>>>>>> - 0.53% eventfd_signal
>>>>>>>>>> eventfd_signal_mask
>>>>>>>>>> - 0.94% handle_tx_kick
>>>>>>>>>> - 0.94% handle_tx
>>>>>>>>>> - handle_tx_copy
>>>>>>>>>> - 0.59% vhost_tx_batch.constprop.0
>>>>>>>>>> 0.52% tun_sendmsg
>>>>>>>>>>
>>>>>>>>>> It can be observed that most of the overhead is concentrated in the
>>>>>>>>>> find_vmap_area function.
>>>>>>>>>>
>>> ...
>>>>>
>>>> Thank you. Then you have tons of copy_to_iter/copy_from_iter calls
>>>> during your test case. Per each you need to find an area which might
>>>> be really heavy.
>>>
>>> Exactly, no vmalloc check before 0aef499f3172 ("mm/usercopy: Detect vmalloc
>>> overruns"), so no burden in find_vmap_area in old kernel.
>>>
>> Yep. It will slow down for sure.
>>
>>>>
>>>> How many CPUs in a system you have?
>>>>
>>>
>>> 128 core
>> OK. Just in case, do you see in a boot log something like:
>>
>> "Failed to allocate an array. Disable a node layer"
>>
> And if you do not see such failing message, it means that a node layer
> is up and running fully, can you also test below patch on your workload?
We did not see such failing message. As for the test patch you
suggested, we will test it and check the results. Thank you very much.
>
> <snip>
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index 634162271c00..35b28be27cf4 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -896,7 +896,7 @@ static struct vmap_node {
> * is fully disabled. Later on, after vmap is initialized these
> * parameters are updated based on a system capacity.
> */
> -static struct vmap_node *vmap_nodes = &single;
> +static struct vmap_node **vmap_nodes;
> static __read_mostly unsigned int nr_vmap_nodes = 1;
> static __read_mostly unsigned int vmap_zone_size = 1;
>
> @@ -909,13 +909,13 @@ addr_to_node_id(unsigned long addr)
> static inline struct vmap_node *
> addr_to_node(unsigned long addr)
> {
> - return &vmap_nodes[addr_to_node_id(addr)];
> + return vmap_nodes[addr_to_node_id(addr)];
> }
>
> static inline struct vmap_node *
> id_to_node(unsigned int id)
> {
> - return &vmap_nodes[id % nr_vmap_nodes];
> + return vmap_nodes[id % nr_vmap_nodes];
> }
>
> /*
> @@ -1060,7 +1060,7 @@ find_vmap_area_exceed_addr_lock(unsigned long addr, struct vmap_area **va)
>
> repeat:
> for (i = 0, va_start_lowest = 0; i < nr_vmap_nodes; i++) {
> - vn = &vmap_nodes[i];
> + vn = vmap_nodes[i];
>
> spin_lock(&vn->busy.lock);
> *va = __find_vmap_area_exceed_addr(addr, &vn->busy.root);
> @@ -2240,7 +2240,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end,
> purge_nodes = CPU_MASK_NONE;
>
> for (i = 0; i < nr_vmap_nodes; i++) {
> - vn = &vmap_nodes[i];
> + vn = vmap_nodes[i];
>
> INIT_LIST_HEAD(&vn->purge_list);
> vn->skip_populate = full_pool_decay;
> @@ -2272,7 +2272,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end,
> nr_purge_helpers = clamp(nr_purge_helpers, 1U, nr_purge_nodes) - 1;
>
> for_each_cpu(i, &purge_nodes) {
> - vn = &vmap_nodes[i];
> + vn = vmap_nodes[i];
>
> if (nr_purge_helpers > 0) {
> INIT_WORK(&vn->purge_work, purge_vmap_node);
> @@ -2291,7 +2291,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end,
> }
>
> for_each_cpu(i, &purge_nodes) {
> - vn = &vmap_nodes[i];
> + vn = vmap_nodes[i];
>
> if (vn->purge_work.func) {
> flush_work(&vn->purge_work);
> @@ -2397,7 +2397,7 @@ struct vmap_area *find_vmap_area(unsigned long addr)
> */
> i = j = addr_to_node_id(addr);
> do {
> - vn = &vmap_nodes[i];
> + vn = vmap_nodes[i];
>
> spin_lock(&vn->busy.lock);
> va = __find_vmap_area(addr, &vn->busy.root);
> @@ -2421,7 +2421,7 @@ static struct vmap_area *find_unlink_vmap_area(unsigned long addr)
> */
> i = j = addr_to_node_id(addr);
> do {
> - vn = &vmap_nodes[i];
> + vn = vmap_nodes[i];
>
> spin_lock(&vn->busy.lock);
> va = __find_vmap_area(addr, &vn->busy.root);
> @@ -4928,7 +4928,7 @@ static void show_purge_info(struct seq_file *m)
> int i;
>
> for (i = 0; i < nr_vmap_nodes; i++) {
> - vn = &vmap_nodes[i];
> + vn = vmap_nodes[i];
>
> spin_lock(&vn->lazy.lock);
> list_for_each_entry(va, &vn->lazy.head, list) {
> @@ -4948,7 +4948,7 @@ static int vmalloc_info_show(struct seq_file *m, void *p)
> int i;
>
> for (i = 0; i < nr_vmap_nodes; i++) {
> - vn = &vmap_nodes[i];
> + vn = vmap_nodes[i];
>
> spin_lock(&vn->busy.lock);
> list_for_each_entry(va, &vn->busy.head, list) {
> @@ -5069,6 +5069,7 @@ static void __init vmap_init_free_space(void)
>
> static void vmap_init_nodes(void)
> {
> + struct vmap_node **nodes;
> struct vmap_node *vn;
> int i, n;
>
> @@ -5087,23 +5088,34 @@ static void vmap_init_nodes(void)
> * set of cores. Therefore a per-domain purging is supposed to
> * be added as well as a per-domain balancing.
> */
> - n = clamp_t(unsigned int, num_possible_cpus(), 1, 128);
> + n = 1024;
>
> if (n > 1) {
> - vn = kmalloc_array(n, sizeof(*vn), GFP_NOWAIT | __GFP_NOWARN);
> - if (vn) {
> + nodes = kmalloc_array(n, sizeof(struct vmap_node **),
> + GFP_NOWAIT | __GFP_NOWARN | __GFP_ZERO);
> +
> + if (nodes) {
> + for (i = 0; i < n; i++) {
> + nodes[i] = kmalloc(sizeof(struct vmap_node), GFP_NOWAIT | __GFP_ZERO);
> +
> + if (!nodes[i])
> + break;
> + }
> +
> /* Node partition is 16 pages. */
> vmap_zone_size = (1 << 4) * PAGE_SIZE;
> - nr_vmap_nodes = n;
> - vmap_nodes = vn;
> + nr_vmap_nodes = i;
> + vmap_nodes = nodes;
> } else {
> pr_err("Failed to allocate an array. Disable a node layer\n");
> + vmap_nodes[0] = &single;
> + nr_vmap_nodes = 1;
> }
> }
> #endif
>
> for (n = 0; n < nr_vmap_nodes; n++) {
> - vn = &vmap_nodes[n];
> + vn = vmap_nodes[n];
> vn->busy.root = RB_ROOT;
> INIT_LIST_HEAD(&vn->busy.head);
> spin_lock_init(&vn->busy.lock);
> @@ -5129,7 +5141,7 @@ vmap_node_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
> int i, j;
>
> for (count = 0, i = 0; i < nr_vmap_nodes; i++) {
> - vn = &vmap_nodes[i];
> + vn = vmap_nodes[i];
>
> for (j = 0; j < MAX_VA_SIZE_PAGES; j++)
> count += READ_ONCE(vn->pool[j].len);
> @@ -5144,7 +5156,7 @@ vmap_node_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
> int i;
>
> for (i = 0; i < nr_vmap_nodes; i++)
> - decay_va_pool_node(&vmap_nodes[i], true);
> + decay_va_pool_node(vmap_nodes[i], true);
>
> return SHRINK_STOP;
> }
> <snip>
>
> it sets a number of nodes to 1024. It would be really appreciated to see
> the perf-delta with this patch. If it improves the things or not.
>
> Thank you in advance.
>
> --
> Uladzislau Rezki
>
Powered by blists - more mailing lists