Message-ID: <20210201195753.GA2026@pc638.lan>
Date: Mon, 1 Feb 2021 20:57:53 +0100
From: Uladzislau Rezki <urezki@...il.com>
To: qiang.zhang@...driver.com
Cc: urezki@...il.com, paulmck@...nel.org, joel@...lfernandes.org,
rcu@...r.kernel.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH v3] kvfree_rcu: Release page cache under memory pressure
Hello, Zqiang.
> From: Zqiang <qiang.zhang@...driver.com>
>
> Add an operation that frees each per-CPU krcp's existing page cache
> when the system is under memory pressure.
>
> Signed-off-by: Zqiang <qiang.zhang@...driver.com>
> ---
> kernel/rcu/tree.c | 26 ++++++++++++++++++++++++++
> 1 file changed, 26 insertions(+)
>
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index c1ae1e52f638..644b0f3c7b9f 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -3571,17 +3571,41 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
>  }
>  EXPORT_SYMBOL_GPL(kvfree_call_rcu);
>
> +static int free_krc_page_cache(struct kfree_rcu_cpu *krcp)
> +{
> +	unsigned long flags;
> +	struct llist_node *page_list, *pos, *n;
> +	int freed = 0;
> +
> +	raw_spin_lock_irqsave(&krcp->lock, flags);
> +	page_list = llist_del_all(&krcp->bkvcache);
> +	krcp->nr_bkv_objs = 0;
> +	raw_spin_unlock_irqrestore(&krcp->lock, flags);
> +
> +	llist_for_each_safe(pos, n, page_list) {
> +		free_page((unsigned long)pos);
> +		freed++;
> +	}
> +
> +	return freed;
> +}
> +
>  static unsigned long
>  kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
>  {
>  	int cpu;
>  	unsigned long count = 0;
> +	unsigned long flags;
>
>  	/* Snapshot count of all CPUs */
>  	for_each_possible_cpu(cpu) {
>  		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
>
>  		count += READ_ONCE(krcp->count);
> +
> +		raw_spin_lock_irqsave(&krcp->lock, flags);
> +		count += krcp->nr_bkv_objs;
> +		raw_spin_unlock_irqrestore(&krcp->lock, flags);
>  	}
>
>  	return count;
> @@ -3598,6 +3622,8 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
>  		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
>
>  		count = krcp->count;
> +		count += free_krc_page_cache(krcp);
> +
>  		raw_spin_lock_irqsave(&krcp->lock, flags);
>  		if (krcp->monitor_todo)
>  			kfree_rcu_drain_unlock(krcp, flags);
> --
> 2.17.1
>
Thank you for your patch!
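As a side note for anyone following along, here is a minimal userspace sketch of the detach-under-lock, free-outside-lock pattern that free_krc_page_cache() relies on. It is my own illustration, not kernel code: a pthread spinlock and a plain singly-linked list stand in for krcp->lock and the kernel llist API.

/*
 * Userspace model (an illustration, not kernel code) of the drain
 * pattern: detach the whole list under the lock in O(1), then free
 * the nodes outside of it.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node {
	struct node *next;
};

static struct node *cache_head;	/* stands in for krcp->bkvcache */
static int nr_objs;		/* stands in for krcp->nr_bkv_objs */
static pthread_spinlock_t lock;	/* stands in for krcp->lock */

static int free_cache(void)
{
	struct node *list, *n;
	int freed = 0;

	/* Detach everything while holding the lock. */
	pthread_spin_lock(&lock);
	list = cache_head;
	cache_head = NULL;
	nr_objs = 0;
	pthread_spin_unlock(&lock);

	/* Free outside the lock; save ->next before freeing. */
	while (list) {
		n = list->next;
		free(list);
		freed++;
		list = n;
	}

	return freed;
}

int main(void)
{
	pthread_spin_init(&lock, PTHREAD_PROCESS_PRIVATE);

	for (int i = 0; i < 5; i++) {
		struct node *p = malloc(sizeof(*p));
		p->next = cache_head;
		cache_head = p;
		nr_objs++;
	}

	printf("freed %d cached objects\n", free_cache());
	return 0;
}

The point is that the lock is held only for the pointer swap; freeing the pages then happens without the lock, exactly as in the patch.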
I spent some time looking at how the patch behaves under a low memory condition.
To simulate it, I used the "rcuscale" tool with the parameters below:
../rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration 10 --kconfig CONFIG_NR_CPUS=64 \
--bootargs "rcuscale.kfree_rcu_test=1 rcuscale.kfree_nthreads=16 rcuscale.holdoff=20 rcuscale.kfree_loops=10000 \
torture.disable_onoff_at_boot" --trust-make
64 CPUs + 512 MB of memory. In general, my test system was running on the edge,
sometimes hitting out-of-memory conditions, but it could be considered stable
with regard to test completion and elapsed time, so both were pretty solid.
You can find a comparison in a plot that can be downloaded via the following
link: wget ftp://vps418301.ovh.net/incoming/release_page_cache_under_low_memory.png
In short, I see that the patched version can lead to longer test completion
times, whereas the default variant is stable across almost all runs. After some
analysis and further digging, I came to the conclusion that the shrinker's
free_krc_page_cache() competes with run_page_cache_worker(krcp) running from
the kvfree_rcu() context. That is, during the test the page shrinker is pretty
active because of the low memory condition: our callback drains the page cache,
whereas the kvfree_rcu() side refills it right away, creating a kind of
vicious circle.
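To make the interplay concrete, below is a toy two-thread userspace model, my own simplification with made-up names and no kernel APIs: one thread plays the shrinker and keeps draining a page counter, the other plays fill_page_cache_func() and refills it as soon as it sees it empty, so the churn never settles.

/*
 * Toy model of the vicious circle (a simplification, not kernel code):
 * the "shrinker" thread drains the cache counter while the "filler"
 * thread refills it right away.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

static atomic_int cache_pages;	/* models krcp->nr_bkv_objs */
static atomic_int stop;

static void *shrinker(void *arg)	/* models free_krc_page_cache() */
{
	int drained = 0;

	while (!atomic_load(&stop))
		drained += atomic_exchange(&cache_pages, 0);

	printf("shrinker drained %d pages in total\n", drained);
	return NULL;
}

static void *filler(void *arg)	/* models fill_page_cache_func() */
{
	while (!atomic_load(&stop)) {
		/* Refill whenever the cache is found empty. */
		if (atomic_load(&cache_pages) == 0)
			atomic_store(&cache_pages, 5);
	}

	return NULL;
}

int main(void)
{
	pthread_t s, f;

	pthread_create(&s, NULL, shrinker, NULL);
	pthread_create(&f, NULL, filler, NULL);
	sleep(1);	/* let both spin for a second */
	atomic_store(&stop, 1);
	pthread_join(s, NULL);
	pthread_join(f, NULL);
	return 0;
}

Built with "cc -pthread", the drained total after one second is enormous, which mirrors the wasted drain/refill work observed in the test.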
So, run_page_cache_worker() should back off for some time when the system runs
into a low memory condition or high memory pressure:
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 7077d73fcb53..446723b9646b 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3163,7 +3163,7 @@ struct kfree_rcu_cpu {
 	bool initialized;
 	int count;
 
-	struct work_struct page_cache_work;
+	struct delayed_work page_cache_work;
 	atomic_t work_in_progress;
 	struct hrtimer hrtimer;
 
@@ -3419,7 +3419,7 @@ schedule_page_work_fn(struct hrtimer *t)
 	struct kfree_rcu_cpu *krcp =
 		container_of(t, struct kfree_rcu_cpu, hrtimer);
 
-	queue_work(system_highpri_wq, &krcp->page_cache_work);
+	queue_delayed_work(system_highpri_wq, &krcp->page_cache_work, 0);
 	return HRTIMER_NORESTART;
 }
 
@@ -3428,7 +3428,7 @@ static void fill_page_cache_func(struct work_struct *work)
 	struct kvfree_rcu_bulk_data *bnode;
 	struct kfree_rcu_cpu *krcp =
 		container_of(work, struct kfree_rcu_cpu,
-			page_cache_work);
+			page_cache_work.work);
 	unsigned long flags;
 	bool pushed;
 	int i;
@@ -3452,15 +3452,22 @@ static void fill_page_cache_func(struct work_struct *work)
 	atomic_set(&krcp->work_in_progress, 0);
 }
 
+static bool backoff_page_cache_fill;
+
 static void
 run_page_cache_worker(struct kfree_rcu_cpu *krcp)
 {
 	if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
 			!atomic_xchg(&krcp->work_in_progress, 1)) {
-		hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC,
-			HRTIMER_MODE_REL);
-		krcp->hrtimer.function = schedule_page_work_fn;
-		hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
+		if (READ_ONCE(backoff_page_cache_fill)) {
+			queue_delayed_work(system_highpri_wq, &krcp->page_cache_work, HZ);
+			WRITE_ONCE(backoff_page_cache_fill, false);
+		} else {
+			hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC,
+				HRTIMER_MODE_REL);
+			krcp->hrtimer.function = schedule_page_work_fn;
+			hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
+		}
 	}
 }
 
@@ -3644,6 +3651,8 @@ kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
 		raw_spin_unlock_irqrestore(&krcp->lock, flags);
 	}
 
+	// Low memory condition, limit a page cache worker activity.
+	WRITE_ONCE(backoff_page_cache_fill, true);
 	return count;
 }
 
@@ -4634,7 +4643,7 @@ static void __init kfree_rcu_batch_init(void)
 		}
 
 		INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
-		INIT_WORK(&krcp->page_cache_work, fill_page_cache_func);
+		INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func);
 		krcp->initialized = true;
 	}
 
 	if (register_shrinker(&kfree_rcu_shrinker))
The patch above fixes it: we back off the page-fill worker, keeping the cache
empty, until the situation with memory consumption has normalized.

Any thoughts or ideas?
--
Vlad Rezki