[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <d035af42-89a1-469e-87f4-8fbbfefa1332@redhat.com>
Date: Fri, 24 Nov 2023 16:39:17 +0100
From: David Hildenbrand <david@...hat.com>
To: Stefan Roesch <shr@...kernel.io>, kernel-team@...com
Cc: akpm@...ux-foundation.org, hannes@...xchg.org, riel@...riel.com,
linux-kernel@...r.kernel.org, linux-mm@...ck.org
Subject: Re: [PATCH v2 1/4] mm/ksm: add ksm advisor
On 28.10.23 02:09, Stefan Roesch wrote:
> This adds the ksm advisor. The ksm advisor automatically manages the
> pages_to_scan setting to achieve a target scan time. The target scan
> time defines how many seconds it should take to scan all the candidate
> KSM pages. In other words the pages_to_scan rate is changed by the
> advisor to achieve the target scan time. The algorithm has a max and min
> value to:
> - guarantee responsiveness to changes
> - avoid spending too much CPU
>
> The respective parameters are:
> - ksm_advisor_target_scan_time (how many seconds a scan should take)
> - ksm_advisor_min_cpu (minimum value for cpu percent usage)
> - ksm_advisor_max_cpu (maximum value for cpu percent usage)
>
> - ksm_advisor_min_pages (minimum value for pages_to_scan per batch)
> - ksm_advisor_max_pages (maximum value for pages_to_scan per batch)
>
> The algorithm calculates the change value based on the target scan time
> and the previous scan time. To avoid perturbations an exponentially
> weighted moving average is applied.
>
> The advisor is managed by three main parameters: target scan time,
> cpu min time and cpu max time for the ksmd background thread. These
> parameters determine how aggressively ksmd scans.
>
> In addition there are min and max values for the pages_to_scan parameter
> to make sure that its initial and max values are not set too low or too
> high. This ensures that it is able to react to changes quickly enough.
>
> The default values are:
> - target scan time: 200 secs
> - min cpu: 15%
> - max cpu: 70%
> - min pages: 500
> - max pages: 30000
>
> By default the advisor is disabled. Currently there are two advisors:
> none and scan_time.
>
> Tests with various workloads have shown considerable CPU savings. Most
> of the workloads I have investigated have more candidate pages during
> startup, once the workload is stable in terms of memory, the number of
> candidate pages is reduced. Without the advisor, the pages_to_scan needs
> to be sized for the maximum number of candidate pages. So having this
> advisor definitely helps in reducing CPU consumption.
>
> For the instagram workload, the advisor achieves a 25% CPU reduction.
> Once the memory is stable, the pages_to_scan parameter gets reduced to
> about 40% of its max value.
>
> Signed-off-by: Stefan Roesch <shr@...kernel.io>
> ---
> mm/ksm.c | 159 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 158 insertions(+), 1 deletion(-)
>
> diff --git a/mm/ksm.c b/mm/ksm.c
> index 7efcc68ccc6e..e18fecfb359d 100644
> --- a/mm/ksm.c
> +++ b/mm/ksm.c
> @@ -21,6 +21,7 @@
> #include <linux/sched.h>
> #include <linux/sched/mm.h>
> #include <linux/sched/coredump.h>
> +#include <linux/sched/cputime.h>
> #include <linux/rwsem.h>
> #include <linux/pagemap.h>
> #include <linux/rmap.h>
> @@ -248,6 +249,9 @@ static struct kmem_cache *rmap_item_cache;
> static struct kmem_cache *stable_node_cache;
> static struct kmem_cache *mm_slot_cache;
>
> +/* Default number of pages to scan per batch */
> +#define DEFAULT_PAGES_TO_SCAN 100
> +
> /* The number of pages scanned */
> static unsigned long ksm_pages_scanned;
>
> @@ -276,7 +280,7 @@ static unsigned int ksm_stable_node_chains_prune_millisecs = 2000;
> static int ksm_max_page_sharing = 256;
>
> /* Number of pages ksmd should scan in one batch */
> -static unsigned int ksm_thread_pages_to_scan = 100;
> +static unsigned int ksm_thread_pages_to_scan = DEFAULT_PAGES_TO_SCAN;
>
> /* Milliseconds ksmd should sleep between batches */
> static unsigned int ksm_thread_sleep_millisecs = 20;
> @@ -297,6 +301,155 @@ unsigned long ksm_zero_pages;
> /* The number of pages that have been skipped due to "smart scanning" */
> static unsigned long ksm_pages_skipped;
>
> +/* Don't scan more than max pages per batch. */
> +static unsigned long ksm_advisor_max_pages = 30000;
> +
> +/* At least scan this many pages per batch. */
> +static unsigned long ksm_advisor_min_pages = 500;
> +
> +/* Min CPU for scanning pages per scan */
> +static unsigned int ksm_advisor_min_cpu = 15;
> +
> +/* Max CPU for scanning pages per scan */
> +static unsigned int ksm_advisor_max_cpu = 70;
> +
> +/* Target scan time in seconds to analyze all KSM candidate pages. */
> +static unsigned long ksm_advisor_target_scan_time = 200;
> +
> +/* Exponentially weighted moving average. */
> +#define EWMA_WEIGHT 30
> +
> +/**
> + * struct advisor_ctx - metadata for KSM advisor
> + * @start_scan: start time of the current scan
> + * @scan_time: scan time of previous scan
> + * @change: change in percent to pages_to_scan parameter
> + * @cpu_time: ksmd cpu time baseline; subtracted from the current task_sched_runtime() value to get the cpu time used by the last scan
> + */
> +struct advisor_ctx {
> + ktime_t start_scan;
> + unsigned long scan_time;
> + unsigned long change;
> + unsigned long long cpu_time;
> +};
> +static struct advisor_ctx advisor_ctx;
> +
> +/* Define different advisors */
> +enum ksm_advisor_type {
> + KSM_ADVISOR_NONE,
> + KSM_ADVISOR_FIRST = KSM_ADVISOR_NONE,
> + KSM_ADVISOR_SCAN_TIME,
> + KSM_ADVISOR_LAST = KSM_ADVISOR_SCAN_TIME
> +};
> +static enum ksm_advisor_type ksm_advisor;
> +
> +static void init_advisor(void)
> +{
> + advisor_ctx.start_scan = 0;
> + advisor_ctx.scan_time = 0;
> + advisor_ctx.change = 0;
> + advisor_ctx.cpu_time = 0;
> +}
> +
> +/*
> + * Use previous scan time if available, otherwise use current scan time as an
> + * approximation for the previous scan time.
> + */
> +static inline unsigned long prev_scan_time(struct advisor_ctx *ctx,
> + unsigned long scan_time)
> +{
> + return ctx->scan_time ? ctx->scan_time : scan_time;
> +}
> +
> +/* Calculate exponential weighted moving average */
> +static unsigned long ewma(unsigned long prev, unsigned long curr)
> +{
> + return ((100 - EWMA_WEIGHT) * prev + EWMA_WEIGHT * curr) / 100;
> +}
> +
> +/*
> + * The scan time advisor is based on the current scan rate and the target
> + * scan rate.
> + *
> + * new_pages_to_scan = pages_to_scan * (scan_time / target_scan_time)
> + *
> + * To avoid perturbations it calculates a change factor of previous changes.
> + * A new change factor is calculated for each iteration and it uses an
> + * exponentially weighted moving average. The new pages_to_scan value is
> + * multiplied with that change factor:
> + *
> + * new_pages_to_scan *= change factor
> + *
> + * In addition the new pages_to_scan value is capped by the max and min
> + * limits.
> + */
> +static void scan_time_advisor(unsigned long scan_time)
> +{
> + unsigned int cpu_percent;
> + unsigned long cpu_time;
> + unsigned long cpu_time_diff;
> + unsigned long cpu_time_diff_ms;
> + unsigned long pages;
> + unsigned long per_page_cost;
> + unsigned long factor;
> + unsigned long change;
> + unsigned long last_scan_time;
> +
> + cpu_time = task_sched_runtime(current);
> + cpu_time_diff = cpu_time - advisor_ctx.cpu_time;
> + cpu_time_diff_ms = cpu_time_diff / 1000 / 1000;
> +
> + cpu_percent = (cpu_time_diff_ms * 100) / (scan_time * 1000);
> + cpu_percent = cpu_percent ? cpu_percent : 1;
> + last_scan_time = prev_scan_time(&advisor_ctx, scan_time);
> +
> + /* Calculate scan time as percentage of target scan time */
> + factor = ksm_advisor_target_scan_time * 100 / scan_time;
> + factor = factor ? factor : 1;
> +
^ ah, that's what I missed.
BTW, why do we pass in "scan_time" instead of simply obtaining it here, just
like we do with task_sched_runtime()?
--
Cheers,
David / dhildenb
Powered by blists - more mailing lists