Message-ID: <44b3a9df-c2fa-1e51-ca6b-f9c083d6578c@quicinc.com>
Date: Tue, 19 Jul 2022 14:57:30 +0530
From: Neeraj Upadhyay <quic_neeraju@...cinc.com>
To: "Paul E. McKenney" <paulmck@...nel.org>, <rcu@...r.kernel.org>
CC: <linux-kernel@...r.kernel.org>, <kernel-team@...com>,
<rostedt@...dmis.org>, Frederic Weisbecker <frederic@...nel.org>,
Boqun Feng <boqun.feng@...il.com>,
Uladzislau Rezki <uladzislau.rezki@...y.com>,
Joel Fernandes <joel@...lfernandes.org>,
Zqiang <qiang1.zhang@...el.com>
Subject: Re: [PATCH rcu 1/7] rcu/nocb: Add/del rdp to iterate from rcuog
itself
On 6/21/2022 4:14 AM, Paul E. McKenney wrote:
> From: Frederic Weisbecker <frederic@...nel.org>
>
> NOCB rdps are part of a group whose list is iterated by the
> corresponding rdp leader.
>
> This list is RCU-traversed because an rdp can be either added or
> deleted concurrently. Upon addition, a new iteration of the list after
> a synchronization point (a pair of LOCK/UNLOCK ->nocb_gp_lock) is forced
> to make sure:
>
> 1) we didn't miss a new element added in the middle of an iteration
> 2) we didn't ignore a whole subset of the list due to an element being
> quickly deleted and then re-added.
> 3) we are protected against other possible surprises...
>
> Although this layout is expected to be safe, it doesn't help anybody
> to sleep well.
>
> Instead, simplify the nocb state toggling by moving the list
> modification from the nocb (de-)offloading workqueue to the rcuog
> kthreads.
>
> Whenever the rdp leader is expected to (re-)set the SEGCBLIST_KTHREAD_GP
> flag of a target rdp, the latter is queued so that the leader handles
> the flag flip along with adding the target rdp to, or deleting it from,
> the list it iterates. This way the list modification and iteration
> happen from the same kthread and those operations can't race with each
> other.
>
> As a bonus, the flags for each rdp don't need to be checked locklessly
> before each iteration, which is one less opportunity to produce
> nightmares.
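
As a side note for readers less familiar with this handoff, below is a
minimal userspace sketch of the pattern described above (pthread-based,
with made-up names; it is not the kernel code, only an illustration of
the serialization idea). The requester records a single pending pointer
under the group lock and wakes the leader; the leader alone flips the
node's state and adds/removes it from the list, so list modification and
iteration never run concurrently:

    #include <pthread.h>
    #include <stdbool.h>
    #include <stddef.h>

    struct node {
    	struct node *next;
    	bool offload_req;	/* desired state, written by the requester */
    	bool on_list;		/* actual state, touched only by the leader */
    };

    struct leader {
    	pthread_mutex_t lock;	/* stands in for ->nocb_gp_lock */
    	pthread_cond_t wake;	/* stands in for waking the rcuog kthread */
    	struct node *head;	/* iterated only by the leader thread */
    	struct node *toggling;	/* at most one pending (de-)offload request */
    };

    /* Requester side: queue the node and wake the leader. */
    static void request_toggle(struct leader *gp, struct node *np, bool offload)
    {
    	pthread_mutex_lock(&gp->lock);
    	np->offload_req = offload;
    	gp->toggling = np;
    	pthread_cond_signal(&gp->wake);
    	pthread_mutex_unlock(&gp->lock);
    }

    /* Leader side: consume the request and do the list update itself. */
    static void leader_handle_toggle(struct leader *gp)
    {
    	struct node *np;

    	/* Consume the request under the lock so a new one can't be lost. */
    	pthread_mutex_lock(&gp->lock);
    	np = gp->toggling;
    	gp->toggling = NULL;
    	pthread_mutex_unlock(&gp->lock);

    	if (np == NULL)
    		return;

    	if (np->offload_req && !np->on_list) {
    		/* Offloading: the leader itself adds the node to its list. */
    		np->next = gp->head;
    		gp->head = np;
    		np->on_list = true;
    	} else if (!np->offload_req && np->on_list) {
    		/* De-offloading: the leader itself unlinks the node. */
    		struct node **pp = &gp->head;

    		while (*pp != np)
    			pp = &(*pp)->next;
    		*pp = np->next;
    		np->on_list = false;
    	}
    }

The actual patch of course also has to deal with the rcu_nocb_poll case
and the SEGCBLIST_* flag handshakes, but the underlying single-writer
idea is the same.
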
>
> Signed-off-by: Frederic Weisbecker <frederic@...nel.org>
> Cc: Neeraj Upadhyay <quic_neeraju@...cinc.com>
> Cc: Boqun Feng <boqun.feng@...il.com>
> Cc: Uladzislau Rezki <uladzislau.rezki@...y.com>
> Cc: Joel Fernandes <joel@...lfernandes.org>
> Cc: Zqiang <qiang1.zhang@...el.com>
> Signed-off-by: Paul E. McKenney <paulmck@...nel.org>
> ---
Reviewed-by: Neeraj Upadhyay <quic_neeraju@...cinc.com>

Thanks
Neeraj
> kernel/rcu/tree.h | 1 +
> kernel/rcu/tree_nocb.h | 138 +++++++++++++++++++++--------------------
> 2 files changed, 71 insertions(+), 68 deletions(-)
>
> diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
> index 2ccf5845957df..4f8532c33558f 100644
> --- a/kernel/rcu/tree.h
> +++ b/kernel/rcu/tree.h
> @@ -235,6 +235,7 @@ struct rcu_data {
> * if rdp_gp.
> */
> struct list_head nocb_entry_rdp; /* rcu_data node in wakeup chain. */
> + struct rcu_data *nocb_toggling_rdp; /* rdp queued for (de-)offloading */
>
> /* The following fields are used by CB kthread, hence new cacheline. */
> struct rcu_data *nocb_gp_rdp ____cacheline_internodealigned_in_smp;
> diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
> index 46694e13398a3..dac74952e1d1b 100644
> --- a/kernel/rcu/tree_nocb.h
> +++ b/kernel/rcu/tree_nocb.h
> @@ -546,52 +546,43 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
> }
> }
>
> -/*
> - * Check if we ignore this rdp.
> - *
> - * We check that without holding the nocb lock but
> - * we make sure not to miss a freshly offloaded rdp
> - * with the current ordering:
> - *
> - * rdp_offload_toggle() nocb_gp_enabled_cb()
> - * ------------------------- ----------------------------
> - * WRITE flags LOCK nocb_gp_lock
> - * LOCK nocb_gp_lock READ/WRITE nocb_gp_sleep
> - * READ/WRITE nocb_gp_sleep UNLOCK nocb_gp_lock
> - * UNLOCK nocb_gp_lock READ flags
> - */
> -static inline bool nocb_gp_enabled_cb(struct rcu_data *rdp)
> -{
> - u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_GP;
> -
> - return rcu_segcblist_test_flags(&rdp->cblist, flags);
> -}
> -
> -static inline bool nocb_gp_update_state_deoffloading(struct rcu_data *rdp,
> - bool *needwake_state)
> +static int nocb_gp_toggle_rdp(struct rcu_data *rdp,
> + bool *wake_state)
> {
> struct rcu_segcblist *cblist = &rdp->cblist;
> + unsigned long flags;
> + int ret;
>
> - if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) {
> - if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) {
> - rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP);
> - if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
> - *needwake_state = true;
> - }
> - return false;
> + rcu_nocb_lock_irqsave(rdp, flags);
> + if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED) &&
> + !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) {
> + /*
> + * Offloading. Set our flag and notify the offload worker.
> + * We will handle this rdp until it ever gets de-offloaded.
> + */
> + rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP);
> + if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
> + *wake_state = true;
> + ret = 1;
> + } else if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED) &&
> + rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) {
> + /*
> + * De-offloading. Clear our flag and notify the de-offload worker.
> + * We will ignore this rdp until it ever gets re-offloaded.
> + */
> + rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP);
> + if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
> + *wake_state = true;
> + ret = 0;
> + } else {
> + WARN_ON_ONCE(1);
> + ret = -1;
> }
>
> - /*
> - * De-offloading. Clear our flag and notify the de-offload worker.
> - * We will ignore this rdp until it ever gets re-offloaded.
> - */
> - WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
> - rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP);
> - if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
> - *needwake_state = true;
> - return true;
> -}
> + rcu_nocb_unlock_irqrestore(rdp, flags);
>
> + return ret;
> +}
>
> /*
> * No-CBs GP kthreads come here to wait for additional callbacks to show up
> @@ -609,7 +600,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
> bool needwait_gp = false; // This prevents actual uninitialized use.
> bool needwake;
> bool needwake_gp;
> - struct rcu_data *rdp;
> + struct rcu_data *rdp, *rdp_toggling = NULL;
> struct rcu_node *rnp;
> unsigned long wait_gp_seq = 0; // Suppress "use uninitialized" warning.
> bool wasempty = false;
> @@ -634,19 +625,10 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
> * is added to the list, so the skipped-over rcu_data structures
> * won't be ignored for long.
> */
> - list_for_each_entry_rcu(rdp, &my_rdp->nocb_head_rdp, nocb_entry_rdp, 1) {
> - bool needwake_state = false;
> -
> - if (!nocb_gp_enabled_cb(rdp))
> - continue;
> + list_for_each_entry(rdp, &my_rdp->nocb_head_rdp, nocb_entry_rdp) {
> trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check"));
> rcu_nocb_lock_irqsave(rdp, flags);
> - if (nocb_gp_update_state_deoffloading(rdp, &needwake_state)) {
> - rcu_nocb_unlock_irqrestore(rdp, flags);
> - if (needwake_state)
> - swake_up_one(&rdp->nocb_state_wq);
> - continue;
> - }
> + lockdep_assert_held(&rdp->nocb_lock);
> bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
> if (bypass_ncbs &&
> (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) ||
> @@ -656,8 +638,6 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
> bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
> } else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) {
> rcu_nocb_unlock_irqrestore(rdp, flags);
> - if (needwake_state)
> - swake_up_one(&rdp->nocb_state_wq);
> continue; /* No callbacks here, try next. */
> }
> if (bypass_ncbs) {
> @@ -705,8 +685,6 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
> }
> if (needwake_gp)
> rcu_gp_kthread_wake();
> - if (needwake_state)
> - swake_up_one(&rdp->nocb_state_wq);
> }
>
> my_rdp->nocb_gp_bypass = bypass;
> @@ -739,15 +717,49 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
> !READ_ONCE(my_rdp->nocb_gp_sleep));
> trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("EndWait"));
> }
> +
> if (!rcu_nocb_poll) {
> raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
> + // (De-)queue an rdp to/from the group if its nocb state is changing
> + rdp_toggling = my_rdp->nocb_toggling_rdp;
> + if (rdp_toggling)
> + my_rdp->nocb_toggling_rdp = NULL;
> +
> if (my_rdp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) {
> WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
> del_timer(&my_rdp->nocb_timer);
> }
> WRITE_ONCE(my_rdp->nocb_gp_sleep, true);
> raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
> + } else {
> + rdp_toggling = READ_ONCE(my_rdp->nocb_toggling_rdp);
> + if (rdp_toggling) {
> + /*
> + * Paranoid locking to make sure nocb_toggling_rdp is well
> + * reset *before* we (re)set SEGCBLIST_KTHREAD_GP or we could
> + * race with another round of nocb toggling for this rdp.
> + * Nocb locking should prevent from that already but we stick
> + * to paranoia, especially in rare path.
> + */
> + raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
> + my_rdp->nocb_toggling_rdp = NULL;
> + raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
> + }
> + }
> +
> + if (rdp_toggling) {
> + bool wake_state = false;
> + int ret;
> +
> + ret = nocb_gp_toggle_rdp(rdp_toggling, &wake_state);
> + if (ret == 1)
> + list_add_tail(&rdp_toggling->nocb_entry_rdp, &my_rdp->nocb_head_rdp);
> + else if (ret == 0)
> + list_del(&rdp_toggling->nocb_entry_rdp);
> + if (wake_state)
> + swake_up_one(&rdp_toggling->nocb_state_wq);
> }
> +
> my_rdp->nocb_gp_seq = -1;
> WARN_ON(signal_pending(current));
> }
> @@ -966,6 +978,8 @@ static int rdp_offload_toggle(struct rcu_data *rdp,
> swake_up_one(&rdp->nocb_cb_wq);
>
> raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
> + // Queue this rdp for add/del to/from the list to iterate on rcuog
> + WRITE_ONCE(rdp_gp->nocb_toggling_rdp, rdp);
> if (rdp_gp->nocb_gp_sleep) {
> rdp_gp->nocb_gp_sleep = false;
> wake_gp = true;
> @@ -1013,8 +1027,6 @@ static long rcu_nocb_rdp_deoffload(void *arg)
> swait_event_exclusive(rdp->nocb_state_wq,
> !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB |
> SEGCBLIST_KTHREAD_GP));
> - /* Stop nocb_gp_wait() from iterating over this structure. */
> - list_del_rcu(&rdp->nocb_entry_rdp);
> /*
> * Lock one last time to acquire latest callback updates from kthreads
> * so we can later handle callbacks locally without locking.
> @@ -1079,16 +1091,6 @@ static long rcu_nocb_rdp_offload(void *arg)
>
> pr_info("Offloading %d\n", rdp->cpu);
>
> - /*
> - * Cause future nocb_gp_wait() invocations to iterate over
> - * structure, resetting ->nocb_gp_sleep and waking up the related
> - * "rcuog". Since nocb_gp_wait() in turn locks ->nocb_gp_lock
> - * before setting ->nocb_gp_sleep again, we are guaranteed to
> - * iterate this newly added structure before "rcuog" goes to
> - * sleep again.
> - */
> - list_add_tail_rcu(&rdp->nocb_entry_rdp, &rdp->nocb_gp_rdp->nocb_head_rdp);
> -
> /*
> * Can't use rcu_nocb_lock_irqsave() before SEGCBLIST_LOCKING
> * is set.