Message-ID: <5433B8CB.9070301@linux.vnet.ibm.com>
Date: Tue, 07 Oct 2014 15:26:27 +0530
From: Shreyas B Prabhu <shreyas@...ux.vnet.ibm.com>
To: Benjamin Herrenschmidt <benh@...nel.crashing.org>
CC: linux-kernel@...r.kernel.org, Paul Mackerras <paulus@...ba.org>,
Michael Ellerman <mpe@...erman.id.au>,
linuxppc-dev@...ts.ozlabs.org
Subject: Re: [PATCH v2 3/5] powerpc/powernv: Add winkle infrastructure
On Tuesday 07 October 2014 11:03 AM, Benjamin Herrenschmidt wrote:
> On Wed, 2014-10-01 at 13:16 +0530, Shreyas B. Prabhu wrote:
>> Winkle causes power to be gated off to the entire chiplet. Hence the
>> hypervisor/firmware state in the entire chiplet is lost.
>>
>> This patch adds necessary infrastructure to support waking up from
>> hypervisor state loss. Specifically does following:
>> - Before entering winkle, save state of registers that need to be
>> restored on wake up (SDR1, HFSCR)
>
> Add ... to your list, it's not exhaustive, is it ?
I use the interrupt stack frame only for SDR1 and HFSCR. The rest of the
SPRs are restored via the PORE in the next patch. I'll change the
comments to better reflect this.
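For v3 I am planning on a comment along these lines in idle_power7.S
(just a sketch; the exact list depends on what the PORE ends up
restoring in the next patch):

/*
 * Only the SPRs that the PORE does not restore on winkle exit are saved
 * here, in the interrupt stack frame (reusing the volatile GPR save
 * slots). Everything else is recovered via the PORE in a later patch.
 */
#define _SDR1	GPR3
#define _TSCR	GPR4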
>
>> - SRR1 bits 46:47 which is used to identify which power saving mode cpu
>> woke up from is '11' for both winkle and sleep. Hence introduce a flag
>> in PACA to distinguish b/w winkle and sleep.
>>
>> - Upon waking up, restore all saved registers, recover slb
>>
>> Cc: Benjamin Herrenschmidt <benh@...nel.crashing.org>
>> Cc: Paul Mackerras <paulus@...ba.org>
>> Cc: Michael Ellerman <mpe@...erman.id.au>
>> Cc: linuxppc-dev@...ts.ozlabs.org
>> Suggested-by: Vaidyanathan Srinivasan <svaidy@...ux.vnet.ibm.com>
>> Signed-off-by: Shreyas B. Prabhu <shreyas@...ux.vnet.ibm.com>
>> ---
>> arch/powerpc/include/asm/machdep.h | 1 +
>> arch/powerpc/include/asm/paca.h | 3 ++
>> arch/powerpc/include/asm/ppc-opcode.h | 2 +
>> arch/powerpc/include/asm/processor.h | 2 +
>> arch/powerpc/kernel/asm-offsets.c | 1 +
>> arch/powerpc/kernel/exceptions-64s.S | 8 ++--
>> arch/powerpc/kernel/idle.c | 11 +++++
>> arch/powerpc/kernel/idle_power7.S | 81 +++++++++++++++++++++++++++++++++-
>> arch/powerpc/platforms/powernv/setup.c | 24 ++++++++++
>> 9 files changed, 127 insertions(+), 6 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h
>> index f37014f..0a3ced9 100644
>> --- a/arch/powerpc/include/asm/machdep.h
>> +++ b/arch/powerpc/include/asm/machdep.h
>> @@ -301,6 +301,7 @@ struct machdep_calls {
>> /* Idle handlers */
>> void (*setup_idle)(void);
>> unsigned long (*power7_sleep)(void);
>> + unsigned long (*power7_winkle)(void);
>> };
>
> Why does it need to be ppc_md ? Same comments as for sleep
>
>> extern void e500_idle(void);
>> diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
>> index a5139ea..3358f09 100644
>> --- a/arch/powerpc/include/asm/paca.h
>> +++ b/arch/powerpc/include/asm/paca.h
>> @@ -158,6 +158,9 @@ struct paca_struct {
>> * early exception handler for use by high level C handler
>> */
>> struct opal_machine_check_event *opal_mc_evt;
>> +
>> + /* Flag to distinguish b/w sleep and winkle */
>> + u8 offline_state;
>
> Not fan of the name. I'd rather you call it "wakeup_state_loss" or
> something a bit more explicit about what that actually means if it's
> going to be a boolean value. Otherwise make it an enumeration of
> constants.
>
Okay. I'll change this.
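Something along these lines is what I have in mind (a sketch only; the
names and values are not final):

/* paca.h -- sketch, replacing the bare 0/1 offline_state flag */
#define PNV_WAKEUP_NO_HV_LOSS	0	/* last idle state was nap or fast sleep */
#define PNV_WAKEUP_HV_LOSS	1	/* last idle state was winkle; hypervisor state lost */

	/* in struct paca_struct */
	u8 wakeup_state_loss;		/* one of the PNV_WAKEUP_* values above */

The idle entry paths and the 0x100 wakeup check would then test the
symbolic names rather than bare 0/1.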
>> #endif
>> #ifdef CONFIG_PPC_BOOK3S_64
>> /* Exclusive emergency stack pointer for machine check exception. */
>> diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h
>> index 6f85362..5155be7 100644
>> --- a/arch/powerpc/include/asm/ppc-opcode.h
>> +++ b/arch/powerpc/include/asm/ppc-opcode.h
>> @@ -194,6 +194,7 @@
>>
>> #define PPC_INST_NAP 0x4c000364
>> #define PPC_INST_SLEEP 0x4c0003a4
>> +#define PPC_INST_WINKLE 0x4c0003e4
>>
>> /* A2 specific instructions */
>> #define PPC_INST_ERATWE 0x7c0001a6
>> @@ -374,6 +375,7 @@
>>
>> #define PPC_NAP stringify_in_c(.long PPC_INST_NAP)
>> #define PPC_SLEEP stringify_in_c(.long PPC_INST_SLEEP)
>> +#define PPC_WINKLE stringify_in_c(.long PPC_INST_WINKLE)
>>
>> /* BHRB instructions */
>> #define PPC_CLRBHRB stringify_in_c(.long PPC_INST_CLRBHRB)
>> diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
>> index 41953cd..00e3df9 100644
>> --- a/arch/powerpc/include/asm/processor.h
>> +++ b/arch/powerpc/include/asm/processor.h
>> @@ -455,6 +455,8 @@ extern void arch_setup_idle(void);
>> extern void power7_nap(int check_irq);
>> extern unsigned long power7_sleep(void);
>> extern unsigned long __power7_sleep(void);
>> +extern unsigned long power7_winkle(void);
>> +extern unsigned long __power7_winkle(void);
>> extern void flush_instruction_cache(void);
>> extern void hard_reset_now(void);
>> extern void poweroff_now(void);
>> diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
>> index 9d7dede..ea98817 100644
>> --- a/arch/powerpc/kernel/asm-offsets.c
>> +++ b/arch/powerpc/kernel/asm-offsets.c
>> @@ -731,6 +731,7 @@ int main(void)
>> DEFINE(OPAL_MC_SRR0, offsetof(struct opal_machine_check_event, srr0));
>> DEFINE(OPAL_MC_SRR1, offsetof(struct opal_machine_check_event, srr1));
>> DEFINE(PACA_OPAL_MC_EVT, offsetof(struct paca_struct, opal_mc_evt));
>> + DEFINE(PACAOFFLINESTATE, offsetof(struct paca_struct, offline_state));
>> #endif
>>
>> return 0;
>> diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
>> index c64f3cc0..261f348 100644
>> --- a/arch/powerpc/kernel/exceptions-64s.S
>> +++ b/arch/powerpc/kernel/exceptions-64s.S
>> @@ -115,9 +115,7 @@ BEGIN_FTR_SECTION
>> #endif
>>
>> /* Running native on arch 2.06 or later, check if we are
>> - * waking up from nap. We only handle no state loss and
>> - * supervisor state loss. We do -not- handle hypervisor
>> - * state loss at this time.
>> + * waking up from power saving mode.
>> */
>> mfspr r13,SPRN_SRR1
>> rlwinm. r13,r13,47-31,30,31
>> @@ -133,8 +131,8 @@ BEGIN_FTR_SECTION
>> b power7_wakeup_noloss
>> 2: b power7_wakeup_loss
>>
>> - /* Fast Sleep wakeup on PowerNV */
>> -8: b power7_wakeup_tb_loss
>> + /* Fast Sleep / Winkle wakeup on PowerNV */
>> +8: b power7_wakeup_hv_state_loss
>>
>> 9:
>> END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
>> diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c
>> index 1f268e0..ed46217 100644
>> --- a/arch/powerpc/kernel/idle.c
>> +++ b/arch/powerpc/kernel/idle.c
>> @@ -98,6 +98,17 @@ unsigned long power7_sleep(void)
>> return ret;
>> }
>>
>> +unsigned long power7_winkle(void)
>> +{
>> + unsigned long ret;
>> +
>> + if (ppc_md.power7_winkle)
>> + ret = ppc_md.power7_winkle();
>> + else
>> + ret = __power7_winkle();
>> + return ret;
>> +}
>> +
>> int powersave_nap;
>>
>> #ifdef CONFIG_SYSCTL
>> diff --git a/arch/powerpc/kernel/idle_power7.S b/arch/powerpc/kernel/idle_power7.S
>> index c3481c9..87b2556 100644
>> --- a/arch/powerpc/kernel/idle_power7.S
>> +++ b/arch/powerpc/kernel/idle_power7.S
>> @@ -18,6 +18,13 @@
>> #include <asm/hw_irq.h>
>> #include <asm/kvm_book3s_asm.h>
>> #include <asm/opal.h>
>> +#include <asm/mmu-hash64.h>
>> +
>> +/*
>> + * Use volatile GPRs' space to save essential SPRs before entering winkle
>> + */
>> +#define _SDR1 GPR3
>> +#define _TSCR GPR4
>>
>> #undef DEBUG
>>
>> @@ -39,6 +46,7 @@
>> * Pass requested state in r3:
>> * 0 - nap
>> * 1 - sleep
>> + * 2 - winkle
>> *
>> * To check IRQ_HAPPENED in r4
>> * 0 - don't check
>> @@ -109,9 +117,27 @@ _GLOBAL(power7_enter_nap_mode)
>> #endif
>> cmpwi cr0,r3,1
>> beq 2f
>> + cmpwi cr0,r3,2
>> + beq 3f
>> IDLE_STATE_ENTER_SEQ(PPC_NAP)
>> /* No return */
>> -2: IDLE_STATE_ENTER_SEQ(PPC_SLEEP)
>> +2:
>> + li r4,1
>> + stb r4,PACAOFFLINESTATE(r13)
>> + IDLE_STATE_ENTER_SEQ(PPC_SLEEP)
>> + /* No return */
>> +
>> +3:
>> + mfspr r4,SPRN_SDR1
>> + std r4,_SDR1(r1)
>> +
>> + mfspr r4,SPRN_TSCR
>> + std r4,_TSCR(r1)
>> +
>> + /* Enter winkle */
>> + li r4,0
>> + stb r4,PACAOFFLINESTATE(r13)
>> + IDLE_STATE_ENTER_SEQ(PPC_WINKLE)
>> /* No return */
>>
>> _GLOBAL(power7_idle)
>> @@ -187,6 +213,59 @@ ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 66); \
>> 20: nop;
>>
>>
>> +_GLOBAL(__power7_winkle)
>> + li r3,2
>> + li r4,1
>> + b power7_powersave_common
>> + /* No return */
>> +
>> +_GLOBAL(power7_wakeup_hv_state_loss)
>> + /* Check paca flag to diffentiate b/w fast sleep and winkle */
>> + lbz r4,PACAOFFLINESTATE(13)
>> + cmpwi cr0,r4,0
>> + bne power7_wakeup_tb_loss
>> +
>> + ld r2,PACATOC(r13);
>> + ld r1,PACAR1(r13)
>> +
>> + bl __restore_cpu_power8
>
> So if I understand correctly, you use a per-cpu flag not a per-core flag
> which means we will assume a pessimistic case of having to restore stuff
> even if the core didn't actually enter winkle (because the last thread
> to go down went to sleep). This is sub-optimal. Also see below:
>
>> + /* Time base re-sync */
>> + li r3,OPAL_RESYNC_TIMEBASE
>> + bl opal_call_realmode;
>
> You will also resync the timebase (and restore all the core shared SPRs)
> for each thread. This is problematic, especially with KVM as you could
> have a situation where:
>
> - The first thread comes out and starts diving into KVM
>
> - The other threads start coming out while the first one is doing the
> above.
>
> Thus the first thread might already be manipulating some core registers
> (SDR1 etc...) while the secondaries come back and ... whack it. Worse,
> the primary might have applied the TB offset using TBU40 while the
> secondaries resync the timebase back to the host value, incurring a
> loss of TB for the guest.
>
Such a race is prevented by the kvm_hstate.hwthread_req and
kvm_hstate.hwthread_state paca flags.
The current flow when a guest is scheduled on a core:
-> The primary thread sets the kvm_hstate.hwthread_req paca flag for all
the secondary threads.
-> It waits for all the secondary threads to change state to
!KVM_HWTHREAD_IN_KERNEL
-> and only then calls __kvmppc_vcore_entry, which down the line changes
SDR1 and other per-core registers. Therefore kvm_hstate.hwthread_req is
set to 1 for all the threads in the core *before* SDR1 is switched.
When a secondary thread is woken up to execute a guest, in 0x100 we
check hwthread_req and branch to kvm_start_guest if it is set. Therefore
secondary threads woken up for a guest do not execute
power7_wakeup_hv_state_loss, so there is no danger of overwriting SDR1
or TBU40.
Now let's consider the case where a guest is scheduled on the core and a
secondary thread is woken up even though there is no vcpu to run on it
(say it is woken up by a stray IPI). In this case, again at 0x100 we
branch to kvm_start_guest, and there, when there is no vcpu to run, it
executes nap. So again there is no danger of overwriting SDR1.
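Roughly, in pseudo-C, the ordering is (a simplified sketch of the
existing book3s_hv flow, not the literal code; for_each_secondary_sibling()
is just shorthand here, not a real helper):

	/* Primary thread, before it touches any per-core register: */
	for_each_secondary_sibling(t) {
		paca[t].kvm_hstate.hwthread_req = 1;	/* ask the thread to get out of the kernel */
		while (paca[t].kvm_hstate.hwthread_state == KVM_HWTHREAD_IN_KERNEL)
			cpu_relax();			/* wait until it has napped / entered KVM */
	}
	__kvmppc_vcore_entry();	/* only now are SDR1, the TB offset, etc. switched */

	/* Secondary thread, in the 0x100 wakeup path: */
	if (local_paca->kvm_hstate.hwthread_req)
		kvm_start_guest();	/* naps if there is no vcpu to run; never
					 * reaches power7_wakeup_hv_state_loss */
	else
		...;			/* normal wakeup: power7_wakeup_{noloss,tb_loss,hv_state_loss} */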
>> + /* Restore SLB from PACA */
>> + ld r8,PACA_SLBSHADOWPTR(r13)
>> +
>> + .rept SLB_NUM_BOLTED
>> + li r3, SLBSHADOW_SAVEAREA
>> + LDX_BE r5, r8, r3
>> + addi r3, r3, 8
>> + LDX_BE r6, r8, r3
>> + andis. r7,r5,SLB_ESID_V@h
>> + beq 1f
>> + slbmte r6,r5
>> +1: addi r8,r8,16
>> + .endr
>> +
>> + ld r4,_SDR1(r1)
>> + mtspr SPRN_SDR1,r4
>> +
>> + ld r4,_TSCR(r1)
>> + mtspr SPRN_TSCR,r4
>> +
>> + REST_NVGPRS(r1)
>> + REST_GPR(2, r1)
>> + ld r3,_CCR(r1)
>> + ld r4,_MSR(r1)
>> + ld r5,_NIP(r1)
>> + addi r1,r1,INT_FRAME_SIZE
>> + mtcr r3
>> + mfspr r3,SPRN_SRR1 /* Return SRR1 */
>> + mtspr SPRN_SRR1,r4
>> + mtspr SPRN_SRR0,r5
>> + rfid
>> +
>> _GLOBAL(power7_wakeup_tb_loss)
>> ld r2,PACATOC(r13);
>> ld r1,PACAR1(r13)
>> diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
>> index 9d9a898..f45b52d 100644
>> --- a/arch/powerpc/platforms/powernv/setup.c
>> +++ b/arch/powerpc/platforms/powernv/setup.c
>> @@ -370,6 +370,29 @@ static unsigned long pnv_power7_sleep(void)
>> return srr1;
>> }
>>
>> +/*
>> + * We need to keep track of offline cpus also for calling
>> + * fastsleep workaround appropriately
>> + */
>> +static unsigned long pnv_power7_winkle(void)
>> +{
>> + int cpu, primary_thread;
>> + unsigned long srr1;
>> +
>> + cpu = smp_processor_id();
>> + primary_thread = cpu_first_thread_sibling(cpu);
>> +
>> + if (need_fastsleep_workaround) {
>> + pnv_apply_fastsleep_workaround(1, primary_thread);
>> + srr1 = __power7_winkle();
>> + pnv_apply_fastsleep_workaround(0, primary_thread);
>> + } else {
>> + srr1 = __power7_winkle();
>> + }
>> + return srr1;
>> +}
>> +
>> +
>> static void __init pnv_setup_machdep_opal(void)
>> {
>> ppc_md.get_boot_time = opal_get_boot_time;
>> @@ -384,6 +407,7 @@ static void __init pnv_setup_machdep_opal(void)
>> ppc_md.handle_hmi_exception = opal_handle_hmi_exception;
>> ppc_md.setup_idle = pnv_setup_idle;
>> ppc_md.power7_sleep = pnv_power7_sleep;
>> + ppc_md.power7_winkle = pnv_power7_winkle;
>> }
>>
>> #ifdef CONFIG_PPC_POWERNV_RTAS
>
>