Message-ID: <7c1180f4-923c-4138-b756-618cb5d597ac@ijzerbout.nl>
Date: Mon, 30 Jun 2025 22:05:48 +0200
From: Kees Bakker <kees@...erbout.nl>
To: Joshua Hahn <joshua.hahnjy@...il.com>, Gregory Price <gourry@...rry.net>
Cc: Andrew Morton <akpm@...ux-foundation.org>,
Alistair Popple <apopple@...dia.com>, Byungchul Park <byungchul@...com>,
David Hildenbrand <david@...hat.com>, Matthew Brost
<matthew.brost@...el.com>, Rakie Kim <rakie.kim@...com>,
Ying Huang <ying.huang@...ux.alibaba.com>, Zi Yan <ziy@...dia.com>,
linux-kernel@...r.kernel.org, linux-mm@...ck.org, kernel-team@...a.com
Subject: Re: [PATCH 2/2] mm/mempolicy: Skip extra call to __alloc_pages_bulk
in weighted interleave
On 26-06-2025 at 22:09, Joshua Hahn wrote:
> Currently, alloc_pages_bulk_weighted_interleave can make up to nr_node_ids+1
> calls to __alloc_pages_bulk. The additional allocation can happen if the
> previous call to this function finished the weighted round robin allocation
> partially on a node. To make up for this, the next time this function is
> called, an extra allocation is made to finish cleanly on the node boundaries
> before performing the weighted round-robin cycles again.
>
> Instead of making an additional call, we can calculate how many additional
> pages should be allocated from the first node (aka carryover) and add that
> value to the number of pages that should be allocated as part of the current
> round-robin cycle.
>
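
To restate the carryover arithmetic as I understand it, here is a rough
standalone sketch, with made-up weights and page counts (none of these
numbers come from the patch or from a real run):

/*
 * Toy model of the carryover calculation: fold the leftover weight of
 * the previous node into its share of the current round-robin cycles
 * instead of doing a separate bulk allocation for it.
 */
#include <stdio.h>

#define MIN(a, b)	((a) < (b) ? (a) : (b))

int main(void)
{
	unsigned long weights[2] = { 3, 1 };	/* hypothetical node weights */
	unsigned long weight_total = 4;
	unsigned long rem_pages = 10;		/* pages requested this call */
	unsigned long carryover = 2;		/* il_weight left on the previous node */
	unsigned long rounds, delta, node_pages;

	rounds = (rem_pages - carryover) / weight_total;	/* 2 */
	delta  = (rem_pages - carryover) % weight_total;	/* 0 */

	/* the node the previous call stopped on absorbs the carryover */
	node_pages = weights[0] * rounds + MIN(delta, weights[0]) + carryover;
	printf("node 0: %lu pages\n", node_pages);		/* 8 */
	delta -= MIN(delta, weights[0]);

	/* the other node only gets its regular share */
	node_pages = weights[1] * rounds + MIN(delta, weights[1]);
	printf("node 1: %lu pages\n", node_pages);		/* 2 */

	/* 8 + 2 == rem_pages, so no extra __alloc_pages_bulk call is needed */
	return 0;
}
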
> Running a quick benchmark by compiling the kernel shows a small increase
> in performance. These experiments were run on a machine with 2 nodes, each
> with 125GB memory and 40 CPUs.
>
> time numactl -w 0,1 make -j$(nproc)
>
> +----------+---------+------------+---------+
> | Time (s) | v6.16 | With patch | % Delta |
> +----------+---------+------------+---------+
> | Real | 88.374 | 88.3356 | -0.2019 |
> | User | 3631.7 | 3636.263 | 0.0631 |
> | Sys | 366.029 | 363.792 | -0.7534 |
> +----------+---------+------------+---------+
>
> Signed-off-by: Joshua Hahn <joshua.hahnjy@...il.com>
>
> ---
> mm/mempolicy.c | 39 ++++++++++++++++++++-------------------
> 1 file changed, 20 insertions(+), 19 deletions(-)
>
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index 78ad74a0e249..0d693f96cf66 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -2569,7 +2569,7 @@ static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp,
> unsigned long node_pages, delta;
> u8 *weights, weight;
> unsigned int weight_total = 0;
> - unsigned long rem_pages = nr_pages;
> + unsigned long rem_pages = nr_pages, carryover = 0;
> nodemask_t nodes;
> int nnodes, node;
> int resume_node = MAX_NUMNODES - 1;
> @@ -2594,18 +2594,12 @@ static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp,
> node = me->il_prev;
> weight = me->il_weight;
> if (weight && node_isset(node, nodes)) {
> - node_pages = min(rem_pages, weight);
> - nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
> - page_array);
> - page_array += nr_allocated;
> - total_allocated += nr_allocated;
> - /* if that's all the pages, no need to interleave */
> if (rem_pages <= weight) {
> - me->il_weight -= rem_pages;
> - return total_allocated;
> + node_pages = rem_pages;
> + me->il_weight -= node_pages;
> + goto allocate;
This is a goto into the middle of a for-loop.
What do you think is going to happen at the end of that loop?
As far as I can tell (I only tested with a small C program), control
falls through to the end of the loop body, does the i++, checks
i < nnodes, and possibly runs the loop again.
Variable i is uninitialized at that point, and inside the loop it then
hits several other uninitialized variables.
Even if this is legal C code, it is pretty obscure.
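
For reference, the kind of thing I tried looks roughly like this (a
hypothetical toy program, only meant to show the control flow):

#include <stdio.h>

int main(void)
{
	int i;			/* deliberately left uninitialized */
	int nnodes = 4;

	goto inside;		/* jumps past the loop's "i = 0", straight into the body */

	for (i = 0; i < nnodes; i++) {
inside:
		/* after the goto, i is still indeterminate here */
		printf("i = %d\n", i);
	}

	return 0;
}

It compiles (the jump does not cross any variably modified declarations,
so it is formally allowed), but at the end of the body the loop still
executes i++ and checks i < nnodes on an indeterminate i, so what
happens next is anyone's guess.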
> }
> - /* Otherwise we adjust remaining pages, continue from there */
> - rem_pages -= weight;
> + carryover = weight;
> }
> /* clear active weight in case of an allocation failure */
> me->il_weight = 0;
> @@ -2614,7 +2608,7 @@ static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp,
> /* create a local copy of node weights to operate on outside rcu */
> weights = kzalloc(nr_node_ids, GFP_KERNEL);
> if (!weights)
> - return total_allocated;
> + return 0;
>
> rcu_read_lock();
> state = rcu_dereference(wi_state);
> @@ -2634,16 +2628,17 @@ static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp,
> /*
> * Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls.
> * Track which node weighted interleave should resume from.
> + * Account for carryover. It is always allocated from the first node.
> *
> * if (rounds > 0) and (delta == 0), resume_node will always be
> * the node following prev_node and its weight.
> */
> - rounds = rem_pages / weight_total;
> - delta = rem_pages % weight_total;
> + rounds = (rem_pages - carryover) / weight_total;
> + delta = (rem_pages - carryover) % weight_total;
> resume_node = next_node_in(prev_node, nodes);
> resume_weight = weights[resume_node];
> + node = carryover ? prev_node : next_node_in(prev_node, nodes);
> for (i = 0; i < nnodes; i++) {
> - node = next_node_in(prev_node, nodes);
> weight = weights[node];
> /* when delta is depleted, resume from that node */
> if (delta && delta < weight) {
> @@ -2651,12 +2646,14 @@ static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp,
> resume_weight = weight - delta;
> }
> /* Add the node's portion of the delta, if there is one */
> - node_pages = weight * rounds + min(delta, weight);
> + node_pages = weight * rounds + min(delta, weight) + carryover;
> delta -= min(delta, weight);
> + carryover = 0;
>
> /* node_pages can be 0 if an allocation fails and rounds == 0 */
> if (!node_pages)
> break;
> +allocate:
> nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
> page_array);
> page_array += nr_allocated;
> @@ -2664,10 +2661,14 @@ static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp,
> if (total_allocated == nr_pages)
> break;
> prev_node = node;
> + node = next_node_in(prev_node, nodes);
> + }
> +
> + if (weights) {
> + me->il_prev = resume_node;
> + me->il_weight = resume_weight;
> + kfree(weights);
> }
> - me->il_prev = resume_node;
> - me->il_weight = resume_weight;
> - kfree(weights);
> return total_allocated;
> }
>