[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <9b6f974b-dcb3-3668-c83b-308abf33e85a@amd.com>
Date: Tue, 28 Aug 2018 12:13:30 -0500
From: Brijesh Singh <brijesh.singh@....com>
To: Tom Lendacky <thomas.lendacky@....com>, x86@...nel.org,
linux-kernel@...r.kernel.org, kvm@...r.kernel.org
Cc: brijesh.singh@....com, stable@...r.kernel.org,
Thomas Gleixner <tglx@...utronix.de>,
Borislav Petkov <bp@...e.de>, "H. Peter Anvin" <hpa@...or.com>,
Paolo Bonzini <pbonzini@...hat.com>,
Sean Christopherson <sean.j.christopherson@...el.com>,
Radim Krčmář <rkrcmar@...hat.com>
Subject: Re: [PATCH 1/2] x86/mm: add .data..decrypted section to hold shared
variables
On 08/27/2018 05:11 PM, Tom Lendacky wrote:
> On 08/27/2018 06:24 AM, Brijesh Singh wrote:
>> kvmclock defines few static variables which are shared with hypervisor
>> during the kvmclock initialization.
>>
>> When SEV is active, memory is encrypted with a guest-specific key, and
>> if guest OS wants to share the memory region with hypervisor then it must
>> clear the C-bit before sharing it.
>>
>> The '__decrypted' can be used to define a shared variables; the variables
>> will be put in the .data.decryption section. This section is mapped with
>> C=0 early in the boot, we also ensure that the initialized values are
>> updated to match with C=0 (i.e peform an in-place decryption). The
>> .data..decrypted section is PMD aligned and sized so that we avoid the
>> need for spliting the pages when map with C=0.
>
> This should probably be broken into a few smaller patches. Maybe a
> patch that adds the section and the attribute, a patch that re-arranges
> the mapping setup and then the in-place decryption and clearing of the
> encryption bit for the area.
>
OK, I will break the patch. Probably will create a separate patch which
just re-arranges the mapping setup without making any logical changes.
>>
>> Signed-off-by: Brijesh Singh <brijesh.singh@....com>
>> Fixes: 368a540e0232 ("x86/kvmclock: Remove memblock dependency")
>> Cc: stable@...r.kernel.org
>> Cc: Tom Lendacky <thomas.lendacky@....com>
>> Cc: kvm@...r.kernel.org
>> Cc: Thomas Gleixner <tglx@...utronix.de>
>> Cc: Borislav Petkov <bp@...e.de>
>> Cc: "H. Peter Anvin" <hpa@...or.com>
>> Cc: linux-kernel@...r.kernel.org
>> Cc: Paolo Bonzini <pbonzini@...hat.com>
>> Cc: Sean Christopherson <sean.j.christopherson@...el.com>
>> Cc: "Radim Krčmář" <rkrcmar@...hat.com>
>> ---
>> arch/x86/include/asm/mem_encrypt.h | 4 +
>> arch/x86/kernel/head64.c | 12 ++
>> arch/x86/kernel/vmlinux.lds.S | 18 +++
>> arch/x86/mm/mem_encrypt_identity.c | 220 +++++++++++++++++++++++++++----------
>> 4 files changed, 197 insertions(+), 57 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
>> index c064383..3f7d9d3 100644
>> --- a/arch/x86/include/asm/mem_encrypt.h
>> +++ b/arch/x86/include/asm/mem_encrypt.h
>> @@ -52,6 +52,8 @@ void __init mem_encrypt_init(void);
>> bool sme_active(void);
>> bool sev_active(void);
>>
>> +#define __decrypted __attribute__((__section__(".data..decrypted")))
>> +
>> #else /* !CONFIG_AMD_MEM_ENCRYPT */
>>
>> #define sme_me_mask 0ULL
>> @@ -77,6 +79,8 @@ early_set_memory_decrypted(unsigned long vaddr, unsigned long size) { return 0;
>> static inline int __init
>> early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; }
>>
>> +#define __decrypted
>> +
>> #endif /* CONFIG_AMD_MEM_ENCRYPT */
>>
>> /*
>> diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
>> index 8047379..6a18297 100644
>> --- a/arch/x86/kernel/head64.c
>> +++ b/arch/x86/kernel/head64.c
>> @@ -43,6 +43,9 @@ extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
>> static unsigned int __initdata next_early_pgt;
>> pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);
>>
>> +/* To clear memory encryption mask from the decrypted section */
>> +extern char __start_data_decrypted[], __end_data_decrypted[];
>> +
>
> Should find a header for these rather than defining them here.
>
OK, will move then in mem_encrypt.h. Will that work ?
>> #ifdef CONFIG_X86_5LEVEL
>> unsigned int __pgtable_l5_enabled __ro_after_init;
>> unsigned int pgdir_shift __ro_after_init = 39;
>> @@ -112,6 +115,7 @@ static bool __head check_la57_support(unsigned long physaddr)
>> unsigned long __head __startup_64(unsigned long physaddr,
>> struct boot_params *bp)
>> {
>> + unsigned long vaddr, vaddr_end;
>> unsigned long load_delta, *p;
>> unsigned long pgtable_flags;
>> pgdval_t *pgd;
>> @@ -234,6 +238,14 @@ unsigned long __head __startup_64(unsigned long physaddr,
>> /* Encrypt the kernel and related (if SME is active) */
>> sme_encrypt_kernel(bp);
>>
>> + /* Clear the memory encryption mask from the decrypted section */
>> + vaddr = (unsigned long)__start_data_decrypted;
>> + vaddr_end = (unsigned long)__end_data_decrypted;
>> + for (; vaddr < vaddr_end; vaddr += PMD_SIZE) {
>> + i = pmd_index(vaddr);
>> + pmd[i] -= sme_get_me_mask();
>> + }
>> +
>> /*
>> * Return the SME encryption mask (if SME is active) to be used as a
>> * modifier for the initial pgdir entry programmed into CR3.
>> diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
>> index 8bde0a4..511b875 100644
>> --- a/arch/x86/kernel/vmlinux.lds.S
>> +++ b/arch/x86/kernel/vmlinux.lds.S
>> @@ -89,6 +89,22 @@ PHDRS {
>> note PT_NOTE FLAGS(0); /* ___ */
>> }
>>
>> +/*
>> + * This section contains data which will be mapped as decrypted. Memory
>> + * encryption operates on a page basis. But we make this section a pmd
>> + * aligned to avoid spliting the pages while mapping the section early.
>> + *
>> + * Note: We use a separate section so that only this section gets
>> + * decrypted to avoid exposing more than we wish.
>> + */
>> +#define DATA_DECRYPTED_SECTION \
>> + . = ALIGN(PMD_SIZE); \
>> + __start_data_decrypted = .; \
>> + *(.data..decrypted); \
>> + __end_data_decrypted = .; \
>> + . = ALIGN(PMD_SIZE); \
>> +
>> +
>> SECTIONS
>> {
>> #ifdef CONFIG_X86_32
>> @@ -171,6 +187,8 @@ SECTIONS
>> /* rarely changed data like cpu maps */
>> READ_MOSTLY_DATA(INTERNODE_CACHE_BYTES)
>>
>> + DATA_DECRYPTED_SECTION
>> +
>> /* End of data section */
>> _edata = .;
>> } :data
>> diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c
>> index 7ae3686..ccf6e2b 100644
>> --- a/arch/x86/mm/mem_encrypt_identity.c
>> +++ b/arch/x86/mm/mem_encrypt_identity.c
>> @@ -59,6 +59,8 @@
>> (_PAGE_PAT | _PAGE_PWT))
>>
>> #define PTE_FLAGS_ENC (PTE_FLAGS | _PAGE_ENC)
>> +#define PTE_FLAGS_ENC_WP ((PTE_FLAGS_ENC & ~_PAGE_CACHE_MASK) | \
>> + (_PAGE_PAT | _PAGE_PWT))
>>
>> struct sme_populate_pgd_data {
>> void *pgtable_area;
>> @@ -72,10 +74,28 @@ struct sme_populate_pgd_data {
>> unsigned long vaddr_end;
>> };
>>
>> +struct sme_workarea_data {
>> + unsigned long kernel_start;
>> + unsigned long kernel_end;
>> + unsigned long kernel_len;
>> +
>> + unsigned long initrd_start;
>> + unsigned long initrd_end;
>> + unsigned long initrd_len;
>> +
>> + unsigned long workarea_start;
>> + unsigned long workarea_end;
>> + unsigned long workarea_len;
>> +
>> + unsigned long decrypted_base;
>> +};
>> +
>> static char sme_cmdline_arg[] __initdata = "mem_encrypt";
>> static char sme_cmdline_on[] __initdata = "on";
>> static char sme_cmdline_off[] __initdata = "off";
>>
>> +extern char __start_data_decrypted[], __end_data_decrypted[];
>> +
>
> Same comment from above.
>
>> static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd)
>> {
>> unsigned long pgd_start, pgd_end, pgd_size;
>> @@ -219,6 +239,11 @@ static void __init sme_map_range_encrypted(struct sme_populate_pgd_data *ppd)
>> __sme_map_range(ppd, PMD_FLAGS_ENC, PTE_FLAGS_ENC);
>> }
>>
>> +static void __init sme_map_range_encrypted_wp(struct sme_populate_pgd_data *ppd)
>> +{
>> + __sme_map_range(ppd, PMD_FLAGS_ENC, PTE_FLAGS_ENC_WP);
>> +}
>> +
>> static void __init sme_map_range_decrypted(struct sme_populate_pgd_data *ppd)
>> {
>> __sme_map_range(ppd, PMD_FLAGS_DEC, PTE_FLAGS_DEC);
>> @@ -266,19 +291,17 @@ static unsigned long __init sme_pgtable_calc(unsigned long len)
>> return entries + tables;
>> }
>>
>> -void __init sme_encrypt_kernel(struct boot_params *bp)
>> +static void __init build_workarea_map(struct boot_params *bp,
>> + struct sme_workarea_data *wa,
>> + struct sme_populate_pgd_data *ppd)
>> {
>> unsigned long workarea_start, workarea_end, workarea_len;
>> unsigned long execute_start, execute_end, execute_len;
>> unsigned long kernel_start, kernel_end, kernel_len;
>> unsigned long initrd_start, initrd_end, initrd_len;
>> - struct sme_populate_pgd_data ppd;
>> unsigned long pgtable_area_len;
>> unsigned long decrypted_base;
>>
>> - if (!sme_active())
>> - return;
>> -
>> /*
>> * Prepare for encrypting the kernel and initrd by building new
>> * pagetables with the necessary attributes needed to encrypt the
>> @@ -358,17 +381,17 @@ void __init sme_encrypt_kernel(struct boot_params *bp)
>> * pagetables and when the new encrypted and decrypted kernel
>> * mappings are populated.
>> */
>> - ppd.pgtable_area = (void *)execute_end;
>> + ppd->pgtable_area = (void *)execute_end;
>>
>> /*
>> * Make sure the current pagetable structure has entries for
>> * addressing the workarea.
>> */
>> - ppd.pgd = (pgd_t *)native_read_cr3_pa();
>> - ppd.paddr = workarea_start;
>> - ppd.vaddr = workarea_start;
>> - ppd.vaddr_end = workarea_end;
>> - sme_map_range_decrypted(&ppd);
>> + ppd->pgd = (pgd_t *)native_read_cr3_pa();
>> + ppd->paddr = workarea_start;
>> + ppd->vaddr = workarea_start;
>> + ppd->vaddr_end = workarea_end;
>> + sme_map_range_decrypted(ppd);
>>
>> /* Flush the TLB - no globals so cr3 is enough */
>> native_write_cr3(__native_read_cr3());
>> @@ -379,9 +402,9 @@ void __init sme_encrypt_kernel(struct boot_params *bp)
>> * then be populated with new PUDs and PMDs as the encrypted and
>> * decrypted kernel mappings are created.
>> */
>> - ppd.pgd = ppd.pgtable_area;
>> - memset(ppd.pgd, 0, sizeof(pgd_t) * PTRS_PER_PGD);
>> - ppd.pgtable_area += sizeof(pgd_t) * PTRS_PER_PGD;
>> + ppd->pgd = ppd->pgtable_area;
>> + memset(ppd->pgd, 0, sizeof(pgd_t) * PTRS_PER_PGD);
>> + ppd->pgtable_area += sizeof(pgd_t) * PTRS_PER_PGD;
>>
>> /*
>> * A different PGD index/entry must be used to get different
>> @@ -399,75 +422,158 @@ void __init sme_encrypt_kernel(struct boot_params *bp)
>> decrypted_base <<= PGDIR_SHIFT;
>>
>> /* Add encrypted kernel (identity) mappings */
>> - ppd.paddr = kernel_start;
>> - ppd.vaddr = kernel_start;
>> - ppd.vaddr_end = kernel_end;
>> - sme_map_range_encrypted(&ppd);
>> + ppd->paddr = kernel_start;
>> + ppd->vaddr = kernel_start;
>> + ppd->vaddr_end = kernel_end;
>> + sme_map_range_encrypted(ppd);
>>
>> /* Add decrypted, write-protected kernel (non-identity) mappings */
>> - ppd.paddr = kernel_start;
>> - ppd.vaddr = kernel_start + decrypted_base;
>> - ppd.vaddr_end = kernel_end + decrypted_base;
>> - sme_map_range_decrypted_wp(&ppd);
>> + ppd->paddr = kernel_start;
>> + ppd->vaddr = kernel_start + decrypted_base;
>> + ppd->vaddr_end = kernel_end + decrypted_base;
>> + sme_map_range_decrypted_wp(ppd);
>>
>> if (initrd_len) {
>> /* Add encrypted initrd (identity) mappings */
>> - ppd.paddr = initrd_start;
>> - ppd.vaddr = initrd_start;
>> - ppd.vaddr_end = initrd_end;
>> - sme_map_range_encrypted(&ppd);
>> + ppd->paddr = initrd_start;
>> + ppd->vaddr = initrd_start;
>> + ppd->vaddr_end = initrd_end;
>> + sme_map_range_encrypted(ppd);
>> /*
>> * Add decrypted, write-protected initrd (non-identity) mappings
>> */
>> - ppd.paddr = initrd_start;
>> - ppd.vaddr = initrd_start + decrypted_base;
>> - ppd.vaddr_end = initrd_end + decrypted_base;
>> - sme_map_range_decrypted_wp(&ppd);
>> + ppd->paddr = initrd_start;
>> + ppd->vaddr = initrd_start + decrypted_base;
>> + ppd->vaddr_end = initrd_end + decrypted_base;
>> + sme_map_range_decrypted_wp(ppd);
>> }
>>
>> - /* Add decrypted workarea mappings to both kernel mappings */
>> - ppd.paddr = workarea_start;
>> - ppd.vaddr = workarea_start;
>> - ppd.vaddr_end = workarea_end;
>> - sme_map_range_decrypted(&ppd);
>> + /*
>> + * When SEV is active, kernel is already encrypted hence mapping
>> + * the initial workarea_start as encrypted. When SME is active,
>> + * the kernel is not encrypted hence add a decrypted workarea
>> + * mappings to both kernel mappings
>> + */
>> + ppd->paddr = workarea_start;
>> + ppd->vaddr = workarea_start;
>> + ppd->vaddr_end = workarea_end;
>> + if (sev_active())
>> + sme_map_range_encrypted(ppd);
>> + else
>> + sme_map_range_decrypted(ppd);
>> +
>> + ppd->paddr = workarea_start;
>> + ppd->vaddr = workarea_start + decrypted_base;
>> + ppd->vaddr_end = workarea_end + decrypted_base;
>> + sme_map_range_decrypted(ppd);
>
> I think this needs to do the same sev_active() check as above. It might
> be working only because of the inherent instruction fetch decryption,
> but it would probably be best in case anything changes in this routine
> in the future.
>
In SEV instruction fetches are always decrypted hence I didn't do it.
But I will follow your advice and just to be consistence with SME I
will map this as encrypted.
>>
>> - ppd.paddr = workarea_start;
>> - ppd.vaddr = workarea_start + decrypted_base;
>> - ppd.vaddr_end = workarea_end + decrypted_base;
>> - sme_map_range_decrypted(&ppd);
>> + wa->kernel_start = kernel_start;
>> + wa->kernel_end = kernel_end;
>> + wa->kernel_len = kernel_len;
>>
>> - /* Perform the encryption */
>> - sme_encrypt_execute(kernel_start, kernel_start + decrypted_base,
>> - kernel_len, workarea_start, (unsigned long)ppd.pgd);
>> + wa->initrd_start = initrd_start;
>> + wa->initrd_end = initrd_end;
>> + wa->initrd_len = initrd_len;
>>
>> - if (initrd_len)
>> - sme_encrypt_execute(initrd_start, initrd_start + decrypted_base,
>> - initrd_len, workarea_start,
>> - (unsigned long)ppd.pgd);
>> + wa->workarea_start = workarea_start;
>> + wa->workarea_end = workarea_end;
>> + wa->workarea_len = workarea_len;
>>
>> + wa->decrypted_base = decrypted_base;
>> +}
>> +
>> +static void __init remove_workarea_map(struct sme_workarea_data *wa,
>> + struct sme_populate_pgd_data *ppd)
>> +{
>> /*
>> * At this point we are running encrypted. Remove the mappings for
>> * the decrypted areas - all that is needed for this is to remove
>> * the PGD entry/entries.
>> */
>> - ppd.vaddr = kernel_start + decrypted_base;
>> - ppd.vaddr_end = kernel_end + decrypted_base;
>> - sme_clear_pgd(&ppd);
>> -
>> - if (initrd_len) {
>> - ppd.vaddr = initrd_start + decrypted_base;
>> - ppd.vaddr_end = initrd_end + decrypted_base;
>> - sme_clear_pgd(&ppd);
>> + ppd->vaddr = wa->kernel_start + wa->decrypted_base;
>> + ppd->vaddr_end = wa->kernel_end + wa->decrypted_base;
>> + sme_clear_pgd(ppd);
>> +
>> + if (wa->initrd_len) {
>> + ppd->vaddr = wa->initrd_start + wa->decrypted_base;
>> + ppd->vaddr_end = wa->initrd_end + wa->decrypted_base;
>> + sme_clear_pgd(ppd);
>> }
>>
>> - ppd.vaddr = workarea_start + decrypted_base;
>> - ppd.vaddr_end = workarea_end + decrypted_base;
>> - sme_clear_pgd(&ppd);
>> + ppd->vaddr = wa->workarea_start + wa->decrypted_base;
>> + ppd->vaddr_end = wa->workarea_end + wa->decrypted_base;
>> + sme_clear_pgd(ppd);
>>
>> /* Flush the TLB - no globals so cr3 is enough */
>> native_write_cr3(__native_read_cr3());
>> }
>>
>> +static void __init decrypt_data_decrypted_section(struct sme_workarea_data *wa,
>> + struct sme_populate_pgd_data *ppd)
>> +{
>> + unsigned long decrypted_start, decrypted_end, decrypted_len;
>> +
>> + /* Physical addresses of decrypted data section */
>> + decrypted_start = __pa_symbol(__start_data_decrypted);
>> + decrypted_end = __pa_symbol(__end_data_decrypted);
>> + decrypted_len = decrypted_end - decrypted_start;
>> +
>> + if (!decrypted_len)
>> + return;
>> +
>> + /* Add decrypted mapping for the section (identity) */
>> + ppd->paddr = decrypted_start;
>> + ppd->vaddr = decrypted_start;
>> + ppd->vaddr_end = decrypted_end;
>> + sme_map_range_decrypted(ppd);
>> +
>> + /* Add encrypted-wp mapping for the section (non-identity) */
>> + ppd->paddr = decrypted_start;
>> + ppd->vaddr = decrypted_start + wa->decrypted_base;
>> + ppd->vaddr_end = decrypted_end + wa->decrypted_base;
>> + sme_map_range_encrypted_wp(ppd);
>> +
>> + /* Perform in-place decryption */
>> + sme_encrypt_execute(decrypted_start + wa->decrypted_base,
>> + decrypted_start,
>> + decrypted_len, wa->workarea_start,
>> + (unsigned long)ppd->pgd);
>
> This doesn't seem correct. The first argument should be the dest,
> not the source. I think this is working because the mappings aren't
> actually being updated (see sme_populate_pgd() where the page table
> entry isn't updated if it exists).
>
Ah I see, I was under assumption that sme_populate_pgd will update
the flags (if they already exist). I will take a look.
> There probably isn't any reason to check if the entry exists, so
> you should be able to update sme_populate_pgd() to set the page
> table entry no matter what.
>
Powered by blists - more mailing lists