Message-Id: <20200413211504.108086-1-joel@joelfernandes.org>
Date:   Mon, 13 Apr 2020 17:15:04 -0400
From:   "Joel Fernandes (Google)" <joel@...lfernandes.org>
To:     linux-kernel@...r.kernel.org
Cc:     "Joel Fernandes (Google)" <joel@...lfernandes.org>,
        Josh Triplett <josh@...htriplett.org>,
        Lai Jiangshan <jiangshanlai@...il.com>,
        Mathieu Desnoyers <mathieu.desnoyers@...icios.com>,
        "Paul E. McKenney" <paulmck@...nel.org>, rcu@...r.kernel.org,
        Steven Rostedt <rostedt@...dmis.org>
Subject: [PATCH RFC] rcu/tree: Refactor object allocation and try harder for array allocation

This is a small code refactor that also removes the restriction that
headless kfree_rcu() users cannot sleep during allocation of the
per-cpu array where their pointers will be placed. Since such users
are always called from a sleepable context, we can use this fact to
try harder during the array allocation stage.

There is also a possible bug fix for a migration scenario: a headless
kfree_rcu() caller can be migrated during the sleepable allocation,
end up on another CPU, and restore the flags that were saved on the
original CPU. To remedy this, we store only the IRQ state (a boolean)
on the stack and enable/disable IRQs based on it. Strictly speaking,
the headless case does not need to restore flags at all, but the code
that saves and restores this state is shared between the headless and
with-head kfree_rcu() variants, so it handles all scenarios by
sampling and restoring just the IRQ state rather than the full flags.
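
That is, instead of saving the full flags word and restoring it after
an allocation that may have slept and migrated, the per-CPU lock
helpers now take only a boolean sampled at the top of
kvfree_call_rcu() (condensed from the patch below):

    bool irqs_disabled;

    irqs_disabled = irqs_disabled();           /* sample the caller's IRQ state */
    krcp = krc_this_cpu_lock(irqs_disabled);   /* was: krc_this_cpu_lock(&flags) */
    /* ... queue the object, possibly allocating ... */
    krc_this_cpu_unlock(krcp, irqs_disabled);  /* was: krc_this_cpu_unlock(krcp, flags) */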

Signed-off-by: Joel Fernandes (Google) <joel@...lfernandes.org>
---

This is just an RFC and is based on top of Vlad's latest patches:
https://lkml.org/lkml/2020/4/2/383

The git tree containing this patch is at:
https://git.kernel.org/pub/scm/linux/kernel/git/jfern/linux.git/log/?h=rcu/dev

(This patch will be part of a much larger series in the future).


 kernel/rcu/tree.c | 150 +++++++++++++++++++++++++++++++---------------
 1 file changed, 103 insertions(+), 47 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 744d04d8b7724..2e0eaec929059 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3104,11 +3104,95 @@ static void kfree_rcu_monitor(struct work_struct *work)
 		spin_unlock_irqrestore(&krcp->lock, flags);
 }
 
+static inline struct kfree_rcu_cpu *krc_this_cpu_lock(bool irqs_disabled)
+{
+	struct kfree_rcu_cpu *krcp;
+
+	// For safely calling this_cpu_ptr().
+	if (!irqs_disabled)
+		local_irq_disable();
+
+	krcp = this_cpu_ptr(&krc);
+	if (likely(krcp->initialized))
+		spin_lock(&krcp->lock);
+
+	return krcp;
+}
+
+static inline void
+krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, bool irqs_disabled)
+{
+	if (likely(krcp->initialized))
+		spin_unlock(&krcp->lock);
+
+	if (!irqs_disabled)
+		local_irq_enable();
+}
+
+// alloc_object_locked - Try to allocate an object of the given size, dropping the lock if needed.
+//
+// @size: Size of the object to internally allocate for kfree_rcu().
+// @slab: Whether to allocate from the slab allocator or from the buddy allocator.
+// @can_sleep: Was kfree_rcu() called in a sleepable context?
+// @krcpp: Pointer to the per-CPU krcp pointer; needed because we may have migrated by the time we relock.
+//
+// Caveats:
+//
+// 1. Per-cpu krc's lock must be held with interrupts disabled.
+//
+// 2. Failure to allocate returns NULL and does not cause a warning.
+//
+// 3. Caller is responsible for using the correct free() APIs. If size == PAGE_SIZE,
+//    then free_page() should be called for freeing. Otherwise kfree().
+//
+static inline void *alloc_object_locked(size_t size, bool slab, bool can_sleep,
+					struct kfree_rcu_cpu **krcpp)
+{
+	void *ptr;
+	gfp_t gfp_flags, wmark_flags, reclaim_flags;
+	struct kfree_rcu_cpu *krcp = *krcpp;
+
+	WARN_ON_ONCE(size == PAGE_SIZE && slab);
+
+	// Decompose the flags:
+	// wmark_flags   - affect the watermark to control reserve access.
+	// reclaim_flags - these affect how reclaim works but have
+	//                 no effect in atomic or nowait contexts.
+	wmark_flags = (__GFP_HIGH | __GFP_ATOMIC);
+	reclaim_flags = (__GFP_RETRY_MAYFAIL);
+
+	// These flags are common to all allocations; whether we want to
+	// wait, sleep or reclaim is controlled with additional flags later,
+	// during the actual allocation.
+	gfp_flags = (wmark_flags | reclaim_flags | __GFP_NOWARN);
+
+	// First, do an allocation without waiting.
+	ptr = (size == PAGE_SIZE) ? (void *)__get_free_page(gfp_flags | GFP_NOWAIT)
+				  : (void *)kmalloc(size, gfp_flags | GFP_NOWAIT);
+	// If we cannot sleep, we are done.
+	if (ptr || !can_sleep)
+		return ptr;
+
+	// Now retry with more relaxed flags; we may enter direct reclaim.
+	//
+	// Since can_sleep == true, the caller had IRQs enabled, so there is
+	// no need to save/restore the flags; enable/disable is sufficient.
+	krc_this_cpu_unlock(krcp, false);
+	ptr = (size == PAGE_SIZE) ? (void *)__get_free_page(gfp_flags | GFP_KERNEL)
+				  : (void *)kmalloc(size, gfp_flags | GFP_KERNEL);
+
+	// Take the lock back. We may have been migrated here.
+	*krcpp = krc_this_cpu_lock(false);
+
+	return ptr;
+}
+
 static inline bool
-kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr)
+kvfree_call_rcu_add_ptr_to_bulk(void *ptr, bool can_sleep, struct kfree_rcu_cpu **krcpp)
 {
 	struct kvfree_rcu_bulk_data *bnode;
 	int idx;
+	struct kfree_rcu_cpu *krcp = *krcpp;
 
 	if (unlikely(!krcp->initialized))
 		return false;
@@ -3121,9 +3205,11 @@ kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr)
 			krcp->bkvhead[idx]->nr_records ==
 				KVFREE_BULK_MAX_ENTR) {
 		bnode = xchg(&krcp->bkvcache[idx], NULL);
-		if (!bnode)
+		if (!bnode) {
 			bnode = (struct kvfree_rcu_bulk_data *)
-				__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
+				alloc_object_locked(PAGE_SIZE, false /* use buddy */,
+						    can_sleep, krcpp);
+		}
 
 		/* Switch to emergency path. */
 		if (unlikely(!bnode))
@@ -3144,14 +3230,15 @@ kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr)
 	return true;
 }
 
-static inline struct rcu_head *attach_rcu_head_to_object(void *obj)
+static inline struct rcu_head *attach_rcu_head_to_object(void *obj,
+			       struct kfree_rcu_cpu **krcpp)
 {
 	unsigned long *ptr;
 
-	/* Try hard to get the memory. */
-	ptr = kmalloc(sizeof(unsigned long *) +
-		sizeof(struct rcu_head), GFP_KERNEL | __GFP_ATOMIC |
-			__GFP_HIGH | __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
+	// Allocate rcu_head on the slab.
+	ptr = alloc_object_locked(sizeof(unsigned long *) + sizeof(struct rcu_head),
+				  true /* slab */, true /* headless, so can sleep */,
+				  krcpp);
 	if (!ptr)
 		return NULL;
 
@@ -3159,27 +3246,6 @@ static inline struct rcu_head *attach_rcu_head_to_object(void *obj)
 	return ((struct rcu_head *) ++ptr);
 }
 
-static inline struct kfree_rcu_cpu *
-krc_this_cpu_lock(unsigned long *flags)
-{
-	struct kfree_rcu_cpu *krcp;
-
-	local_irq_save(*flags);	// For safely calling this_cpu_ptr().
-	krcp = this_cpu_ptr(&krc);
-	if (likely(krcp->initialized))
-		spin_lock(&krcp->lock);
-
-	return krcp;
-}
-
-static inline void
-krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags)
-{
-	if (likely(krcp->initialized))
-		spin_unlock(&krcp->lock);
-	local_irq_restore(flags);
-}
-
 /*
  * Queue a request for lazy invocation of appropriate free routine after a
  * grace period. Please note there are three paths are maintained, two are the
@@ -3195,10 +3261,9 @@ krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags)
  */
 void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 {
-	unsigned long flags;
 	struct kfree_rcu_cpu *krcp;
 	bool expedited_drain = false;
-	bool success;
+	bool success, irqs_disabled;
 	void *ptr;
 
 	if (head) {
@@ -3216,7 +3281,8 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 		ptr = (unsigned long *) func;
 	}
 
-	krcp = krc_this_cpu_lock(&flags);
+	irqs_disabled = irqs_disabled();
+	krcp = krc_this_cpu_lock(irqs_disabled);
 
 	// Queue the object but don't yet schedule the batch.
 	if (debug_rcu_head_queue(ptr)) {
@@ -3229,23 +3295,14 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 		goto unlock_return;
 	}
 
-	/*
-	 * Under high memory pressure GFP_NOWAIT can fail,
-	 * in that case the emergency path is maintained.
-	 */
-	success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr);
+	// Allow sleeping here only if headless
+	success = kvfree_call_rcu_add_ptr_to_bulk(ptr, !head /* can_sleep */, &krcp);
 	if (!success) {
 		/* Is headless object? */
 		if (head == NULL) {
-			/* Drop the lock. */
-			krc_this_cpu_unlock(krcp, flags);
-
-			head = attach_rcu_head_to_object(ptr);
+			head = attach_rcu_head_to_object(ptr, &krcp);
 			if (head == NULL)
-				goto inline_return;
-
-			/* Take it back. */
-			krcp = krc_this_cpu_lock(&flags);
+				goto unlock_return;
 
 			/*
 			 * Tag the headless object. Such objects have a back-pointer
@@ -3280,9 +3337,8 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 	}
 
 unlock_return:
-	krc_this_cpu_unlock(krcp, flags);
+	krc_this_cpu_unlock(krcp, irqs_disabled);
 
-inline_return:
 	/*
 	 * High memory pressure, so inline kvfree() after
 	 * synchronize_rcu(). We can do it from might_sleep()
-- 
2.26.0.110.g2183baf09c-goog
