linux-kernel - Re: [tip: core/rcu] rcu/tree: Add a shrinker to prevent OOM due to kfree

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <49168aa9-4f3a-e602-edd4-98e8b0138b0b@sony.com>
Date:   Wed, 3 Jun 2020 17:51:08 +0200
From:   peter enderborg <peter.enderborg@...y.com>
To:     <linux-kernel@...r.kernel.org>, <linux-tip-commits@...r.kernel.org>
CC:     <urezki@...il.com>,
        "Joel Fernandes (Google)" <joel@...lfernandes.org>,
        "Paul E. McKenney" <paulmck@...nel.org>, x86 <x86@...nel.org>
Subject: Re: [tip: core/rcu] rcu/tree: Add a shrinker to prevent OOM due to
 kfree_rcu() batching

On 5/11/20 10:59 PM, tip-bot2 for Joel Fernandes (Google) wrote:
> The following commit has been merged into the core/rcu branch of tip:
>
> Commit-ID:     9154244c1ab6c9db4f1f25ac8f73bd46dba64287
> Gitweb:        https://git.kernel.org/tip/9154244c1ab6c9db4f1f25ac8f73bd46dba64287
> Author:        Joel Fernandes (Google) <joel@...lfernandes.org>
> AuthorDate:    Mon, 16 Mar 2020 12:32:27 -04:00
> Committer:     Paul E. McKenney <paulmck@...nel.org>
> CommitterDate: Mon, 27 Apr 2020 11:02:50 -07:00
>
> rcu/tree: Add a shrinker to prevent OOM due to kfree_rcu() batching
>
> To reduce grace periods and improve kfree() performance, we have done
> batching recently dramatically bringing down the number of grace periods
> while giving us the ability to use kfree_bulk() for efficient kfree'ing.
>
> However, this has increased the likelihood of OOM condition under heavy
> kfree_rcu() flood on small memory systems. This patch introduces a
> shrinker which starts grace periods right away if the system is under
> memory pressure due to existence of objects that have still not started
> a grace period.
>
> With this patch, I do not observe an OOM anymore on a system with 512MB
> RAM and 8 CPUs, with the following rcuperf options:
>
> rcuperf.kfree_loops=20000 rcuperf.kfree_alloc_num=8000
> rcuperf.kfree_rcu_test=1 rcuperf.kfree_mult=2
>
> Otherwise it easily OOMs with the above parameters.
>
> NOTE:
> 1. On systems with no memory pressure, the patch has no effect as intended.
> 2. In the future, we can use this same mechanism to prevent grace periods
>    from happening even more, by relying on shrinkers carefully.
>
> Cc: urezki@...il.com
> Signed-off-by: Joel Fernandes (Google) <joel@...lfernandes.org>
> Signed-off-by: Paul E. McKenney <paulmck@...nel.org>
> ---
>  kernel/rcu/tree.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 60 insertions(+)
>
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index 156ac8d..e299cd0 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -2824,6 +2824,8 @@ struct kfree_rcu_cpu {
>  	struct delayed_work monitor_work;
>  	bool monitor_todo;
>  	bool initialized;
> +	// Number of objects for which GP not started
> +	int count;


Wouldn't it be better to use an atomic counter here, to avoid the IRQ-disabling lock (spin_lock_irqsave) in kfree_rcu_shrink_count()?


>  };
>  
>  static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc);
> @@ -2937,6 +2939,8 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
>  				krcp->head = NULL;
>  			}
>  
> +			krcp->count = 0;
> +
>  			/*
>  			 * One work is per one batch, so there are two "free channels",
>  			 * "bhead_free" and "head_free" the batch can handle. It can be
> @@ -3073,6 +3077,8 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
>  		krcp->head = head;
>  	}
>  
> +	krcp->count++;
> +
>  	// Set timer to drain after KFREE_DRAIN_JIFFIES.
>  	if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
>  	    !krcp->monitor_todo) {
> @@ -3087,6 +3093,58 @@ unlock_return:
>  }
>  EXPORT_SYMBOL_GPL(kfree_call_rcu);
>  
> +static unsigned long
> +kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
> +{
> +	int cpu;
> +	unsigned long flags, count = 0;
> +
> +	/* Snapshot count of all CPUs */
> +	for_each_online_cpu(cpu) {
> +		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
> +
> +		spin_lock_irqsave(&krcp->lock, flags);
> +		count += krcp->count;
> +		spin_unlock_irqrestore(&krcp->lock, flags);
> +	}
> +
> +	return count;
> +}
> +
> +static unsigned long
> +kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
> +{
> +	int cpu, freed = 0;
> +	unsigned long flags;
> +
> +	for_each_online_cpu(cpu) {
> +		int count;
> +		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
> +
> +		count = krcp->count;

This read of krcp->count should be done with the lock held, i.e. after the spin_lock_irqsave() call below.


> +		spin_lock_irqsave(&krcp->lock, flags);
> +		if (krcp->monitor_todo)
> +			kfree_rcu_drain_unlock(krcp, flags);
> +		else
> +			spin_unlock_irqrestore(&krcp->lock, flags);
> +
> +		sc->nr_to_scan -= count;
> +		freed += count;
> +
> +		if (sc->nr_to_scan <= 0)
> +			break;
> +	}
> +
> +	return freed;
> +}
> +
> +static struct shrinker kfree_rcu_shrinker = {
> +	.count_objects = kfree_rcu_shrink_count,
> +	.scan_objects = kfree_rcu_shrink_scan,
> +	.batch = 0,
> +	.seeks = DEFAULT_SEEKS,
> +};
> +
>  void __init kfree_rcu_scheduler_running(void)
>  {
>  	int cpu;
> @@ -4007,6 +4065,8 @@ static void __init kfree_rcu_batch_init(void)
>  		INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
>  		krcp->initialized = true;
>  	}
> +	if (register_shrinker(&kfree_rcu_shrinker))
> +		pr_err("Failed to register kfree_rcu() shrinker!\n");
>  }
>  
>  void __init rcu_init(void)