Message-ID: <CA+CK2bDFScwAwyKA2NNkQhvhrj5OsmHm4eTshw2B9e2muqBcvg@mail.gmail.com>
Date: Mon, 22 Sep 2025 10:17:56 -0400
From: Pasha Tatashin <pasha.tatashin@...een.com>
To: Mike Rapoport <rppt@...nel.org>
Cc: Andrew Morton <akpm@...ux-foundation.org>, Alexander Graf <graf@...zon.com>, 
	Baoquan He <bhe@...hat.com>, Changyuan Lyu <changyuanl@...gle.com>, Chris Li <chrisl@...nel.org>, 
	Jason Gunthorpe <jgg@...dia.com>, Pratyush Yadav <pratyush@...nel.org>, kexec@...ts.infradead.org, 
	linux-mm@...ck.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH v5 3/4] kho: add support for preserving vmalloc allocations

On Sun, Sep 21, 2025 at 1:45 AM Mike Rapoport <rppt@...nel.org> wrote:
>
> From: "Mike Rapoport (Microsoft)" <rppt@...nel.org>
>
> A vmalloc allocation is preserved using a binary structure similar to the
> global KHO memory tracker: a linked list of pages where each page is an
> array of physical addresses of the pages in the vmalloc area.
>
> kho_preserve_vmalloc() hands out the physical address of the head page
> to the caller. This address is used as the argument to
> kho_restore_vmalloc() to restore the mapping in the vmalloc address
> space and populate it with the preserved pages.
>
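For anyone following along, a minimal usage sketch of the API this adds,
based only on the declarations below. save_state()/load_state(), the size
argument and the global preserved_state are made-up names for illustration;
a real caller would also have to hand the struct kho_vmalloc itself to the
next kernel through its own KHO metadata (e.g. an FDT subtree), which is
omitted here:

	#include <linux/kexec_handover.h>
	#include <linux/vmalloc.h>

	/* must reach the next kernel via the caller's own KHO subtree */
	static struct kho_vmalloc preserved_state;

	/* outgoing kernel: allocate with vmalloc(), fill, then preserve */
	static int save_state(size_t size)
	{
		void *state = vmalloc(size);

		if (!state)
			return -ENOMEM;
		/* ... fill state ... */
		return kho_preserve_vmalloc(state, &preserved_state);
	}

	/* incoming kernel: recreate the vmalloc area from the metadata */
	static void *load_state(void)
	{
		return kho_restore_vmalloc(&preserved_state);
	}
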
> Signed-off-by: Mike Rapoport (Microsoft) <rppt@...nel.org>
> Reviewed-by: Pratyush Yadav <pratyush@...nel.org>
> ---
>  include/linux/kexec_handover.h |  28 ++++
>  kernel/kexec_handover.c        | 281 +++++++++++++++++++++++++++++++++
>  2 files changed, 309 insertions(+)
>
> diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h
> index cc5c49b0612b..49e36d4ae5dc 100644
> --- a/include/linux/kexec_handover.h
> +++ b/include/linux/kexec_handover.h
> @@ -39,12 +39,23 @@ struct page;
>
>  struct kho_serialization;
>
> +struct kho_vmalloc_chunk;
> +struct kho_vmalloc {
> +        DECLARE_KHOSER_PTR(first, struct kho_vmalloc_chunk *);
> +        unsigned int total_pages;
> +        unsigned short flags;
> +        unsigned short order;
> +};
> +
>  #ifdef CONFIG_KEXEC_HANDOVER
>  bool kho_is_enabled(void);
>
>  int kho_preserve_folio(struct folio *folio);
>  int kho_preserve_pages(struct page *page, unsigned int nr_pages);
> +int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation);
>  struct folio *kho_restore_folio(phys_addr_t phys);
> +struct page *kho_restore_pages(phys_addr_t phys, unsigned int nr_pages);
> +void *kho_restore_vmalloc(const struct kho_vmalloc *preservation);
>  int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt);
>  int kho_retrieve_subtree(const char *name, phys_addr_t *phys);
>
> @@ -71,11 +82,28 @@ static inline int kho_preserve_pages(struct page *page, unsigned int nr_pages)
>         return -EOPNOTSUPP;
>  }
>
> +static inline int kho_preserve_vmalloc(void *ptr,
> +                                      struct kho_vmalloc *preservation)
> +{
> +       return -EOPNOTSUPP;
> +}
> +
>  static inline struct folio *kho_restore_folio(phys_addr_t phys)
>  {
>         return NULL;
>  }
>
> +static inline struct page *kho_restore_pages(phys_addr_t phys,
> +                                            unsigned int nr_pages)
> +{
> +       return NULL;
> +}
> +
> +static inline void *kho_restore_vmalloc(const struct kho_vmalloc *preservation)
> +{
> +       return NULL;
> +}
> +
>  static inline int kho_add_subtree(struct kho_serialization *ser,
>                                   const char *name, void *fdt)
>  {
> diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c
> index fd80be3b12fd..e6380d8dce57 100644
> --- a/kernel/kexec_handover.c
> +++ b/kernel/kexec_handover.c
> @@ -18,6 +18,7 @@
>  #include <linux/memblock.h>
>  #include <linux/notifier.h>
>  #include <linux/page-isolation.h>
> +#include <linux/vmalloc.h>
>
>  #include <asm/early_ioremap.h>
>
> @@ -274,6 +275,37 @@ struct folio *kho_restore_folio(phys_addr_t phys)
>  }
>  EXPORT_SYMBOL_GPL(kho_restore_folio);
>
> +/**
> + * kho_restore_pages - restore a contiguous range of order-0 pages.
> + * @phys: physical address of the first page.
> + * @nr_pages: number of pages.
> + *
> + * Restore a contiguous range of order-0 pages that was preserved with
> + * kho_preserve_pages().
> + *
> + * Return: pointer to the first page on success, NULL on failure
> + */
> +struct page *kho_restore_pages(phys_addr_t phys, unsigned int nr_pages)
> +{
> +       const unsigned long start_pfn = PHYS_PFN(phys);
> +       const unsigned long end_pfn = start_pfn + nr_pages;
> +       unsigned long pfn = start_pfn;
> +
> +       while (pfn < end_pfn) {
> +               const unsigned int order =
> +                       min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
> +               struct page *page = kho_restore_page(PFN_PHYS(pfn));

This line breaks the linux-next build: there, kho_restore_page() has the prototype
void kho_restore_page(struct page *page, unsigned int order).
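
For illustration only, the loop body would need something along these lines
to match that prototype (untested sketch, not a proposed fix; note the NULL
check goes away because that helper returns void, so detecting a
non-preserved page would need a different mechanism):

	while (pfn < end_pfn) {
		const unsigned int order =
			min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
		struct page *page = pfn_to_page(pfn);

		/* operates on the page itself and returns nothing */
		kho_restore_page(page, order);
		split_page(page, order);
		pfn += 1 << order;
	}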

> +
> +               if (!page)
> +                       return NULL;
> +               split_page(page, order);
> +               pfn += 1 << order;
> +       }
> +
> +       return pfn_to_page(start_pfn);
> +}
> +EXPORT_SYMBOL_GPL(kho_restore_pages);
> +
>  /* Serialize and deserialize struct kho_mem_phys across kexec
>   *
>   * Record all the bitmaps in a linked list of pages for the next kernel to
> @@ -763,6 +795,255 @@ int kho_preserve_pages(struct page *page, unsigned int nr_pages)
>  }
>  EXPORT_SYMBOL_GPL(kho_preserve_pages);
>
> +struct kho_vmalloc_hdr {
> +       DECLARE_KHOSER_PTR(next, struct kho_vmalloc_chunk *);
> +};
> +
> +#define KHO_VMALLOC_SIZE                               \
> +       ((PAGE_SIZE - sizeof(struct kho_vmalloc_hdr)) / \
> +        sizeof(phys_addr_t))
> +
> +struct kho_vmalloc_chunk {
> +       struct kho_vmalloc_hdr hdr;
> +       phys_addr_t phys[KHO_VMALLOC_SIZE];
> +};
> +
> +static_assert(sizeof(struct kho_vmalloc_chunk) == PAGE_SIZE);
> +
> +/* vmalloc flags KHO supports */
> +#define KHO_VMALLOC_SUPPORTED_FLAGS    (VM_ALLOC | VM_ALLOW_HUGE_VMAP)
> +
> +/* KHO internal flags for vmalloc preservations */
> +#define KHO_VMALLOC_ALLOC      0x0001
> +#define KHO_VMALLOC_HUGE_VMAP  0x0002
> +
> +static unsigned short vmalloc_flags_to_kho(unsigned int vm_flags)
> +{
> +       unsigned short kho_flags = 0;
> +
> +       if (vm_flags & VM_ALLOC)
> +               kho_flags |= KHO_VMALLOC_ALLOC;
> +       if (vm_flags & VM_ALLOW_HUGE_VMAP)
> +               kho_flags |= KHO_VMALLOC_HUGE_VMAP;
> +
> +       return kho_flags;
> +}
> +
> +static unsigned int kho_flags_to_vmalloc(unsigned short kho_flags)
> +{
> +       unsigned int vm_flags = 0;
> +
> +       if (kho_flags & KHO_VMALLOC_ALLOC)
> +               vm_flags |= VM_ALLOC;
> +       if (kho_flags & KHO_VMALLOC_HUGE_VMAP)
> +               vm_flags |= VM_ALLOW_HUGE_VMAP;
> +
> +       return vm_flags;
> +}
> +
> +static struct kho_vmalloc_chunk *new_vmalloc_chunk(struct kho_vmalloc_chunk *cur)
> +{
> +       struct kho_vmalloc_chunk *chunk;
> +       int err;
> +
> +       chunk = (struct kho_vmalloc_chunk *)get_zeroed_page(GFP_KERNEL);
> +       if (!chunk)
> +               return NULL;
> +
> +       err = kho_preserve_pages(virt_to_page(chunk), 1);
> +       if (err)
> +               goto err_free;
> +       if (cur)
> +               KHOSER_STORE_PTR(cur->hdr.next, chunk);
> +       return chunk;
> +
> +err_free:
> +       free_page((unsigned long)chunk);
> +       return NULL;
> +}
> +
> +static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk)
> +{
> +       struct kho_mem_track *track = &kho_out.ser.track;
> +       unsigned long pfn = PHYS_PFN(virt_to_phys(chunk));
> +
> +       __kho_unpreserve(track, pfn, pfn + 1);
> +
> +       for (int i = 0; chunk->phys[i]; i++) {
> +               pfn = PHYS_PFN(chunk->phys[i]);
> +               __kho_unpreserve(track, pfn, pfn + 1);
> +       }
> +}
> +
> +static void kho_vmalloc_free_chunks(struct kho_vmalloc *kho_vmalloc)
> +{
> +       struct kho_vmalloc_chunk *chunk = KHOSER_LOAD_PTR(kho_vmalloc->first);
> +
> +       while (chunk) {
> +               struct kho_vmalloc_chunk *tmp = chunk;
> +
> +               kho_vmalloc_unpreserve_chunk(chunk);
> +
> +               chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
> +               kfree(tmp);
> +       }
> +}
> +
> +/**
> + * kho_preserve_vmalloc - preserve memory allocated with vmalloc() across kexec
> + * @ptr: pointer to the area in vmalloc address space
> + * @preservation: placeholder for preservation metadata
> + *
> + * Instructs KHO to preserve the area in vmalloc address space at @ptr. The
> + * physical pages mapped at @ptr will be preserved and on successful return
> + * @preservation will hold the physical address of a structure that describes
> + * the preservation.
> + *
> + * NOTE: Memory allocated with the vmalloc_node() variants cannot be
> + * reliably restored on the same node.
> + *
> + * Return: 0 on success, error code on failure
> + */
> +int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation)
> +{
> +       struct kho_vmalloc_chunk *chunk;
> +       struct vm_struct *vm = find_vm_area(ptr);
> +       unsigned int order, flags, nr_contig_pages;
> +       unsigned int idx = 0;
> +       int err;
> +
> +       if (!vm)
> +               return -EINVAL;
> +
> +       if (vm->flags & ~KHO_VMALLOC_SUPPORTED_FLAGS)
> +               return -EOPNOTSUPP;
> +
> +       flags = vmalloc_flags_to_kho(vm->flags);
> +       order = get_vm_area_page_order(vm);
> +
> +       chunk = new_vmalloc_chunk(NULL);
> +       if (!chunk)
> +               return -ENOMEM;
> +       KHOSER_STORE_PTR(preservation->first, chunk);
> +
> +       nr_contig_pages = (1 << order);
> +       for (int i = 0; i < vm->nr_pages; i += nr_contig_pages) {
> +               phys_addr_t phys = page_to_phys(vm->pages[i]);
> +
> +               err = kho_preserve_pages(vm->pages[i], nr_contig_pages);
> +               if (err)
> +                       goto err_free;
> +
> +               chunk->phys[idx++] = phys;
> +               if (idx == ARRAY_SIZE(chunk->phys)) {
> +                       chunk = new_vmalloc_chunk(chunk);
> +                       if (!chunk)
> +                               goto err_free;
> +                       idx = 0;
> +               }
> +       }
> +
> +       preservation->total_pages = vm->nr_pages;
> +       preservation->flags = flags;
> +       preservation->order = order;
> +
> +       return 0;
> +
> +err_free:
> +       kho_vmalloc_free_chunks(preservation);
> +       return err;
> +}
> +EXPORT_SYMBOL_GPL(kho_preserve_vmalloc);
> +
> +/**
> + * kho_restore_vmalloc - recreates and populates an area in vmalloc address
> + * space from the preserved memory.
> + * @preservation: preservation metadata.
> + *
> + * Recreates an area in vmalloc address space and populates it with memory that
> + * was preserved using kho_preserve_vmalloc().
> + *
> + * Return: pointer to the area in the vmalloc address space, NULL on failure.
> + */
> +void *kho_restore_vmalloc(const struct kho_vmalloc *preservation)
> +{
> +       struct kho_vmalloc_chunk *chunk = KHOSER_LOAD_PTR(preservation->first);
> +       unsigned int align, order, shift, vm_flags;
> +       unsigned long total_pages, contig_pages;
> +       unsigned long addr, size;
> +       struct vm_struct *area;
> +       struct page **pages;
> +       unsigned int idx = 0;
> +       int err;
> +
> +       vm_flags = kho_flags_to_vmalloc(preservation->flags);
> +       if (vm_flags & ~KHO_VMALLOC_SUPPORTED_FLAGS)
> +               return NULL;
> +
> +       total_pages = preservation->total_pages;
> +       pages = kvmalloc_array(total_pages, sizeof(*pages), GFP_KERNEL);
> +       if (!pages)
> +               return NULL;
> +       order = preservation->order;
> +       contig_pages = (1 << order);
> +       shift = PAGE_SHIFT + order;
> +       align = 1 << shift;
> +
> +       while (chunk) {
> +               struct page *page;
> +
> +               for (int i = 0; chunk->phys[i]; i++) {
> +                       phys_addr_t phys = chunk->phys[i];
> +
> +                       if (idx + contig_pages > total_pages)
> +                               goto err_free_pages_array;
> +
> +                       page = kho_restore_pages(phys, contig_pages);
> +                       if (!page)
> +                               goto err_free_pages_array;
> +
> +                       for (int j = 0; j < contig_pages; j++)
> +                               pages[idx++] = page;
> +
> +                       phys += contig_pages * PAGE_SIZE;
> +               }
> +
> +               page = kho_restore_pages(virt_to_phys(chunk), 1);
> +               if (!page)
> +                       goto err_free_pages_array;
> +               chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
> +               __free_page(page);
> +       }
> +
> +       if (idx != total_pages)
> +               goto err_free_pages_array;
> +
> +       area = __get_vm_area_node(total_pages * PAGE_SIZE, align, shift,
> +                                 vm_flags, VMALLOC_START, VMALLOC_END,
> +                                 NUMA_NO_NODE, GFP_KERNEL,
> +                                 __builtin_return_address(0));
> +       if (!area)
> +               goto err_free_pages_array;
> +
> +       addr = (unsigned long)area->addr;
> +       size = get_vm_area_size(area);
> +       err = vmap_pages_range(addr, addr + size, PAGE_KERNEL, pages, shift);
> +       if (err)
> +               goto err_free_vm_area;
> +
> +       area->nr_pages = total_pages;
> +       area->pages = pages;
> +
> +       return area->addr;
> +
> +err_free_vm_area:
> +       free_vm_area(area);
> +err_free_pages_array:
> +       kvfree(pages);
> +       return NULL;
> +}
> +EXPORT_SYMBOL_GPL(kho_restore_vmalloc);
> +
>  /* Handling for debug/kho/out */
>
>  static struct dentry *debugfs_root;
> --
> 2.50.1
>
