lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <1377142037.25016.272.camel@pasglop>
Date:	Thu, 22 Aug 2013 13:27:17 +1000
From:	Benjamin Herrenschmidt <benh@...nel.crashing.org>
To:	Preeti U Murthy <preeti@...ux.vnet.ibm.com>
Cc:	fweisbec@...il.com, paul.gortmaker@...driver.com, paulus@...ba.org,
	shangw@...ux.vnet.ibm.com, galak@...nel.crashing.org,
	deepthi@...ux.vnet.ibm.com, paulmck@...ux.vnet.ibm.com,
	arnd@...db.de, linux-pm@...r.kernel.org, rostedt@...dmis.org,
	rjw@...k.pl, john.stultz@...aro.org, tglx@...utronix.de,
	chenhui.zhao@...escale.com, michael@...erman.id.au,
	r58472@...escale.com, geoff@...radead.org,
	linux-kernel@...r.kernel.org, srivatsa.bhat@...ux.vnet.ibm.com,
	schwidefsky@...ibm.com, svaidy@...ux.vnet.ibm.com,
	linuxppc-dev@...ts.ozlabs.org
Subject: Re: [RFC V2 PATCH 3/6] cpuidle/ppc: Add timer offload framework to
 support deep idle states

On Wed, 2013-08-14 at 17:26 +0530, Preeti U Murthy wrote:

>  static irqreturn_t timer_action(int irq, void *data)
>  {
> -	timer_interrupt();
> +	decrementer_timer_interrupt();
>  	return IRQ_HANDLED;
>  }

I don't completely understand what you are doing here, but ...

> @@ -223,7 +223,7 @@ irqreturn_t smp_ipi_demux(void)
>  
>  #ifdef __BIG_ENDIAN
>  		if (all & (1 << (24 - 8 * PPC_MSG_TIMER)))
> -			timer_interrupt();
> +			decrementer_timer_interrupt();

Why call this decrementer_* since it's specifically *not* the
decrementer ?

Makes more sense to be called broadcast_timer_interrupt() no ?

>  		if (all & (1 << (24 - 8 * PPC_MSG_RESCHEDULE)))
>  			scheduler_ipi();
>  		if (all & (1 << (24 - 8 * PPC_MSG_CALL_FUNC_SINGLE)))
> diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
> index 65ab9e9..7e858e1 100644
> --- a/arch/powerpc/kernel/time.c
> +++ b/arch/powerpc/kernel/time.c
> @@ -42,6 +42,7 @@
>  #include <linux/timex.h>
>  #include <linux/kernel_stat.h>
>  #include <linux/time.h>
> +#include <linux/timer.h>
>  #include <linux/init.h>
>  #include <linux/profile.h>
>  #include <linux/cpu.h>
> @@ -97,8 +98,11 @@ static struct clocksource clocksource_timebase = {
>  
>  static int decrementer_set_next_event(unsigned long evt,
>  				      struct clock_event_device *dev);
> +static int broadcast_set_next_event(unsigned long evt,
> +				      struct clock_event_device *dev);
>  static void decrementer_set_mode(enum clock_event_mode mode,
>  				 struct clock_event_device *dev);
> +static void decrementer_timer_broadcast(const struct cpumask *mask);
>  
>  struct clock_event_device decrementer_clockevent = {
>  	.name           = "decrementer",
> @@ -106,13 +110,26 @@ struct clock_event_device decrementer_clockevent = {
>  	.irq            = 0,
>  	.set_next_event = decrementer_set_next_event,
>  	.set_mode       = decrementer_set_mode,
> -	.features       = CLOCK_EVT_FEAT_ONESHOT,
> +	.broadcast	= decrementer_timer_broadcast,
> +	.features       = CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_ONESHOT,
>  };
>  EXPORT_SYMBOL(decrementer_clockevent);
>  
> +struct clock_event_device broadcast_clockevent = {
> +	.name           = "broadcast",
> +	.rating         = 200,
> +	.irq            = 0,
> +	.set_next_event = broadcast_set_next_event,
> +	.set_mode       = decrementer_set_mode,

Same here, why "decrementer" ? This event device is *not* the
decrementer right ?

Also, pardon my ignorance, by why do we need a separate
clock_event_device ? Ie what does that do ? I am not familiar with the
broadcast scheme and what .broadcast do in the "decrementer" one, so
you need to provide me at least with better explanations.

> +	.features       = CLOCK_EVT_FEAT_ONESHOT,
> +};
> +EXPORT_SYMBOL(broadcast_clockevent);
> +
>  DEFINE_PER_CPU(u64, decrementers_next_tb);
>  static DEFINE_PER_CPU(struct clock_event_device, decrementers);
> +static DEFINE_PER_CPU(struct clock_event_device, bc_timer);

Do we really need an additional per CPU here ? What does the bc_timer
actually represent ? The sender (broadcaster) or receiver ? In the
latter case, why does it have to differ from the decrementer ?

> +int bc_cpu;

A global with that name ? Exported ? That's gross...

>  #define XSEC_PER_SEC (1024*1024)
>  
>  #ifdef CONFIG_PPC64
> @@ -487,6 +504,8 @@ void timer_interrupt(struct pt_regs * regs)
>  	struct pt_regs *old_regs;
>  	u64 *next_tb = &__get_cpu_var(decrementers_next_tb);
>  	struct clock_event_device *evt = &__get_cpu_var(decrementers);
> +	struct clock_event_device *bc_evt = &__get_cpu_var(bc_timer);
> +	int cpu = smp_processor_id();
>  	u64 now;
>  
>  	/* Ensure a positive value is written to the decrementer, or else
> @@ -532,6 +551,10 @@ void timer_interrupt(struct pt_regs * regs)
>  		*next_tb = ~(u64)0;
>  		if (evt->event_handler)
>  			evt->event_handler(evt);
> +		if (cpu == bc_cpu && bc_evt->event_handler) {
> +			bc_evt->event_handler(bc_evt);
> +		}
> +

So here, on a CPU that happens to be "bc_cpu", we call an additional
"event_handler" on every timer interrupt ? What does that handler
actually do ? How does it relate to the "broadcast" field in the
original clock source ?

>  	} else {
>  		now = *next_tb - now;
>  		if (now <= DECREMENTER_MAX)
> @@ -806,6 +829,20 @@ static int decrementer_set_next_event(unsigned long evt,
>  	return 0;
>  }
>  
> +/*
> + * We cannot program the decrementer of a remote CPU. Hence CPUs going into
> + * deep idle states need to send IPIs to the broadcast CPU to program its
> + * decrementer for their next local event so as to receive a broadcast IPI
> + * for the same. In order to avoid the overhead of multiple CPUs from sending
> + * IPIs, this function is a nop. Instead the broadcast CPU will handle the
> + * wakeup of CPUs in deep idle states in each of its local timer interrupts.
> + */
> +static int broadcast_set_next_event(unsigned long evt,
> +					struct clock_event_device *dev)
> +{
> +	return 0;
> +}
> +
>  static void decrementer_set_mode(enum clock_event_mode mode,
>  				 struct clock_event_device *dev)
>  {
> @@ -813,6 +850,20 @@ static void decrementer_set_mode(enum clock_event_mode mode,
>  		decrementer_set_next_event(DECREMENTER_MAX, dev);
>  }
>  
> +void decrementer_timer_interrupt(void)
> +{
> +	struct clock_event_device *evt;
> +	evt = &per_cpu(decrementers, smp_processor_id());
> +
> +	if (evt->event_handler)
> +		evt->event_handler(evt);
> +}

So that's what happens when we receive the broadcast... it might need
some stats ... and it's using the normal "decrementer" clock source,
so I still have a problem understanding why you need another one.

> +static void decrementer_timer_broadcast(const struct cpumask *mask)
> +{
> +	arch_send_tick_broadcast(mask);
> +}

Ok, so far so good. But that's also hooked into the normal clock
source...

>  static void register_decrementer_clockevent(int cpu)
>  {
>  	struct clock_event_device *dec = &per_cpu(decrementers, cpu);
> @@ -826,6 +877,20 @@ static void register_decrementer_clockevent(int cpu)
>  	clockevents_register_device(dec);
>  }
>  
> +static void register_broadcast_clockevent(int cpu)
> +{
> +	struct clock_event_device *bc_evt = &per_cpu(bc_timer, cpu);
> +
> +	*bc_evt = broadcast_clockevent;
> +	bc_evt->cpumask = cpumask_of(cpu);
> +
> +	printk_once(KERN_DEBUG "clockevent: %s mult[%x] shift[%d] cpu[%d]\n",
> +		    bc_evt->name, bc_evt->mult, bc_evt->shift, cpu);
> +
> +	clockevents_register_device(bc_evt);
> +	bc_cpu = cpu;
> +}
> +
>  static void __init init_decrementer_clockevent(void)
>  {
>  	int cpu = smp_processor_id();
> @@ -840,6 +905,19 @@ static void __init init_decrementer_clockevent(void)
>  	register_decrementer_clockevent(cpu);
>  }
>  
> +static void __init init_broadcast_clockevent(void)
> +{
> +	int cpu = smp_processor_id();
> +
> +	clockevents_calc_mult_shift(&broadcast_clockevent, ppc_tb_freq, 4);
> +
> +	broadcast_clockevent.max_delta_ns =
> +		clockevent_delta2ns(DECREMENTER_MAX, &broadcast_clockevent);
> +	broadcast_clockevent.min_delta_ns =
> +		clockevent_delta2ns(2, &broadcast_clockevent);
> +	register_broadcast_clockevent(cpu);
> +}
> +
>  void secondary_cpu_time_init(void)
>  {
>  	/* Start the decrementer on CPUs that have manual control
> @@ -916,6 +994,7 @@ void __init time_init(void)
>  	clocksource_init();
>  
>  	init_decrementer_clockevent();
> +	init_broadcast_clockevent();
>  }
>  
> 
> diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig
> index ace2d22..e1a96eb 100644
> --- a/arch/powerpc/platforms/powernv/Kconfig
> +++ b/arch/powerpc/platforms/powernv/Kconfig
> @@ -6,6 +6,7 @@ config PPC_POWERNV
>  	select PPC_ICP_NATIVE
>  	select PPC_P7_NAP
>  	select PPC_PCI_CHOICE if EMBEDDED
> +	select GENERIC_CLOCKEVENTS_BROADCAST
>  	select EPAPR_BOOT
>  	default y
>  
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@...r.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ