lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:	Mon, 11 Jun 2012 06:39:53 -0700
From:	"Paul E. McKenney" <paulmck@...ux.vnet.ibm.com>
To:	Mike Galbraith <mgalbraith@...ell.com>
Cc:	LKML <linux-kernel@...r.kernel.org>
Subject: Re: rcu: endless stalls

On Mon, Jun 11, 2012 at 12:06:16PM +0200, Mike Galbraith wrote:
> Greetings,
> 
> I received a report of a 48-core UV box hitting an RCU CPU stall warning
> that took longer than the stall timeout to print, so the box warned
> endlessly, forcing a reboot.

So it took minutes to print an RCU CPU stall warning?  On only 48 CPUs?

If so, yow!!!

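My guess is that rcu_cpu_stall_suppress must be manipulated atomically
for this to work reliably, for example, using xchg().  With a plain
check followed by a plain store, several CPUs can all see the flag clear
and start printing before any of them has set it.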

							Thanx, Paul

> The below might prevent that.. and bust other stuff for free :)
> 
> rcu: one gripe at a time please
> 
> Not-compiled-by:
> Not-signed-off-by:
> Not-etc-by:
> 
> diff --git a/kernel/rcutree.c b/kernel/rcutree.c
> index 0da7b88..6462056d6 100644
> --- a/kernel/rcutree.c
> +++ b/kernel/rcutree.c
> @@ -818,10 +818,25 @@ static void print_cpu_stall(struct rcu_state *rsp)
>  	set_need_resched();  /* kick ourselves to get things going. */
>  }
> 
> +/**
> + * rcu_cpu_stall_reset - prevent further stall warnings in current grace period
> + *
> + * Set the stall-warning timeout way off into the future, thus preventing
> + * any RCU CPU stall-warning messages from appearing in the current set of
> + * RCU grace periods.
> + *
> + * The caller must disable hard irqs.
> + */
> +void rcu_cpu_stall_reset(void)
> +{
> +	rcu_sched_state.jiffies_stall = jiffies + ULONG_MAX / 2;
> +	rcu_bh_state.jiffies_stall = jiffies + ULONG_MAX / 2;
> +	rcu_preempt_stall_reset();
> +}
> +
>  static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
>  {
> -	unsigned long j;
> -	unsigned long js;
> +	unsigned long j, js, flags;
>  	struct rcu_node *rnp;
> 
>  	if (rcu_cpu_stall_suppress)
> @@ -832,13 +847,23 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
>  	if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) {
> 
>  		/* We haven't checked in, so go dump stack. */
> +		rcu_cpu_stall_suppress = 1;
>  		print_cpu_stall(rsp);
> +		local_irq_save(flags);
> +		rcu_cpu_stall_reset();
> +		local_irq_restore(flags);
> +		rcu_cpu_stall_suppress = 0;
> 
>  	} else if (rcu_gp_in_progress(rsp) &&
>  		   ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) {
> 
>  		/* They had a few time units to dump stack, so complain. */
> +		rcu_cpu_stall_suppress = 1;
>  		print_other_cpu_stall(rsp);
> +		local_irq_save(flags);
> +		rcu_cpu_stall_reset();
> +		local_irq_restore(flags);
> +		rcu_cpu_stall_suppress = 0;
>  	}
>  }
> 
> @@ -848,22 +873,6 @@ static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
>  	return NOTIFY_DONE;
>  }
> 
> -/**
> - * rcu_cpu_stall_reset - prevent further stall warnings in current grace period
> - *
> - * Set the stall-warning timeout way off into the future, thus preventing
> - * any RCU CPU stall-warning messages from appearing in the current set of
> - * RCU grace periods.
> - *
> - * The caller must disable hard irqs.
> - */
> -void rcu_cpu_stall_reset(void)
> -{
> -	rcu_sched_state.jiffies_stall = jiffies + ULONG_MAX / 2;
> -	rcu_bh_state.jiffies_stall = jiffies + ULONG_MAX / 2;
> -	rcu_preempt_stall_reset();
> -}
> -
>  static struct notifier_block rcu_panic_block = {
>  	.notifier_call = rcu_panic,
>  };
> 
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ