[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <5jrd43vusvcchpk2x6mouighkfhamjpaya5fu2cvikzaieg5pq@wqccwmjs4ian>
Date: Mon, 26 Aug 2024 09:58:11 -0400
From: "Liam R. Howlett" <Liam.Howlett@...cle.com>
To: Nam Cao <namcao@...utronix.de>
Cc: Dave Hansen <dave.hansen@...ux.intel.com>,
Andy Lutomirski <luto@...nel.org>,
Peter Zijlstra <peterz@...radead.org>,
Thomas Gleixner <tglx@...utronix.de>, Ingo Molnar <mingo@...hat.com>,
Borislav Petkov <bp@...en8.de>, x86@...nel.org,
"H. Peter Anvin" <hpa@...or.com>,
Andrew Morton <akpm@...ux-foundation.org>,
Vlastimil Babka <vbabka@...e.cz>,
Lorenzo Stoakes <lorenzo.stoakes@...cle.com>,
linux-kernel@...r.kernel.org, linux-mm@...ck.org,
bigeasy@...utronix.de
Subject: Re: [PATCH] x86/mm/pat: Support splitting of virtual memory areas
* Nam Cao <namcao@...utronix.de> [240825 11:29]:
> When a virtual memory area (VMA) gets splitted, memtype_rbroot's entries
> are not updated. This causes confusion later on when the VMAs get
> un-mapped, because the address ranges of the splitted VMAs do not match the
> address range of the initial VMA.
>
> For example, if user does:
>
> fd = open("/some/pci/bar", O_RDWR);
> addr = mmap(0, 8192, PROT_READ, MAP_SHARED, fd, 0);
> mprotect(addr, 4096, PROT_READ | PROT_WRITE);
> munmap(p, 8192);
>
> with the physical address starting from 0xfd000000, the range
> (0xfd000000-0xfd002000) would be tracked with the mmap() call.
>
> After mprotect(), the initial range gets splitted into
> (0xfd000000-0xfd001000) and (0xfd001000-0xfd002000).
>
> Then, at munmap(), the first range does not match any entry in
> memtype_rbroot, and a message is seen in dmesg:
>
> x86/PAT: test:177 freeing invalid memtype [mem 0xfd000000-0xfd000fff]
>
> The second range still matches by accident, because matching only the end
> address is acceptable (to handle shrinking VMA, added by 2039e6acaf94
> (x86/mm/pat: Change free_memtype() to support shrinking case)).
Does this need a fixes tag?
>
> Make sure VMA splitting is handled properly, by splitting the entries in
> memtype_rbroot.
>
> Signed-off-by: Nam Cao <namcao@...utronix.de>
> ---
> arch/x86/mm/pat/memtype.c | 59 ++++++++++++++++++++++++++++++
> arch/x86/mm/pat/memtype.h | 3 ++
> arch/x86/mm/pat/memtype_interval.c | 22 +++++++++++
> include/linux/pgtable.h | 6 +++
> mm/mmap.c | 8 ++++
> 5 files changed, 98 insertions(+)
>
> diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
> index bdc2a240c2aa..b60019478a76 100644
> --- a/arch/x86/mm/pat/memtype.c
> +++ b/arch/x86/mm/pat/memtype.c
> @@ -935,6 +935,46 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
> return 0;
> }
>
> +static int split_pfn_range(u64 start, u64 end, u64 addr)
> +{
> + struct memtype *entry_new;
> + int is_range_ram, ret;
> +
> + if (!pat_enabled())
> + return 0;
> +
> + start = sanitize_phys(start);
> + end = sanitize_phys(end - 1) + 1;
> +
> + /* Low ISA region is not tracked, it is always mapped WB */
> + if (x86_platform.is_untracked_pat_range(start, end))
> + return 0;
> +
> + is_range_ram = pat_pagerange_is_ram(start, end);
> + if (is_range_ram == 1)
> + return 0;
> +
> + if (is_range_ram < 0)
> + return -EINVAL;
> +
> + entry_new = kmalloc(sizeof(*entry_new), GFP_KERNEL);
> + if (!entry_new)
> + return -ENOMEM;
> +
> + spin_lock(&memtype_lock);
> + ret = memtype_split(start, end, addr, entry_new);
> + spin_unlock(&memtype_lock);
> +
> + if (ret) {
> + pr_err("x86/PAT: %s:%d splitting invalid memtype [mem %#010Lx-%#010Lx]\n",
> + current->comm, current->pid, start, end - 1);
> + kfree(entry_new);
> + return ret;
> + }
> +
> + return 0;
> +}
> +
> /*
> * Internal interface to free a range of physical memory.
> * Frees non RAM regions only.
> @@ -1072,6 +1112,25 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
> return 0;
> }
>
> +int track_pfn_split(struct vm_area_struct *vma, unsigned long addr)
> +{
> + unsigned long vma_size = vma->vm_end - vma->vm_start;
> + resource_size_t start_paddr, split_paddr;
> + int ret;
> +
> + if (vma->vm_flags & VM_PAT) {
> + ret = get_pat_info(vma, &start_paddr, NULL);
> + if (ret)
> + return ret;
> +
> + split_paddr = start_paddr + addr - vma->vm_start;
> +
> + return split_pfn_range(start_paddr, start_paddr + vma_size, split_paddr);
> + }
> +
> + return 0;
> +}
> +
> void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn)
> {
> enum page_cache_mode pcm;
> diff --git a/arch/x86/mm/pat/memtype.h b/arch/x86/mm/pat/memtype.h
> index cacecdbceb55..e01dc2018ab6 100644
> --- a/arch/x86/mm/pat/memtype.h
> +++ b/arch/x86/mm/pat/memtype.h
> @@ -31,6 +31,7 @@ static inline char *cattr_name(enum page_cache_mode pcm)
> #ifdef CONFIG_X86_PAT
> extern int memtype_check_insert(struct memtype *entry_new,
> enum page_cache_mode *new_type);
> +extern int memtype_split(u64 start, u64 end, u64 addr, struct memtype *entry_new);
I think we are dropping unnecessary externs now.
> extern struct memtype *memtype_erase(u64 start, u64 end);
> extern struct memtype *memtype_lookup(u64 addr);
> extern int memtype_copy_nth_element(struct memtype *entry_out, loff_t pos);
> @@ -38,6 +39,8 @@ extern int memtype_copy_nth_element(struct memtype *entry_out, loff_t pos);
> static inline int memtype_check_insert(struct memtype *entry_new,
> enum page_cache_mode *new_type)
> { return 0; }
> +static inline int memtype_split(u64 start, u64 end, u64 addr, struct memtype *entry_new)
> +{ return 0; }
> static inline struct memtype *memtype_erase(u64 start, u64 end)
> { return NULL; }
> static inline struct memtype *memtype_lookup(u64 addr)
> diff --git a/arch/x86/mm/pat/memtype_interval.c b/arch/x86/mm/pat/memtype_interval.c
> index 645613d59942..c75d9ee6b72f 100644
> --- a/arch/x86/mm/pat/memtype_interval.c
> +++ b/arch/x86/mm/pat/memtype_interval.c
> @@ -128,6 +128,28 @@ int memtype_check_insert(struct memtype *entry_new, enum page_cache_mode *ret_ty
> return 0;
> }
>
> +int memtype_split(u64 start, u64 end, u64 addr, struct memtype *entry_new)
> +{
> + struct memtype *entry_old;
> +
> + entry_old = memtype_match(start, end, MEMTYPE_EXACT_MATCH);
> + if (!entry_old)
> + return -EINVAL;
> +
> + interval_remove(entry_old, &memtype_rbroot);
> +
> + entry_new->start = addr;
> + entry_new->end = entry_old->end;
> + entry_new->type = entry_old->type;
> +
> + entry_old->end = addr;
> +
> + interval_insert(entry_old, &memtype_rbroot);
> + interval_insert(entry_new, &memtype_rbroot);
> +
> + return 0;
> +}
> +
> struct memtype *memtype_erase(u64 start, u64 end)
> {
> struct memtype *entry_old;
> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
> index 2a6a3cccfc36..8bfc8d0f5dd2 100644
> --- a/include/linux/pgtable.h
> +++ b/include/linux/pgtable.h
> @@ -1502,6 +1502,11 @@ static inline int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
> return 0;
> }
>
> +static inline int track_pfn_split(struct vm_area_struct *vma, unsigned long addr)
> +{
> + return 0;
> +}
> +
> /*
> * track_pfn_insert is called when a _new_ single pfn is established
> * by vmf_insert_pfn().
> @@ -1542,6 +1547,7 @@ static inline void untrack_pfn_clear(struct vm_area_struct *vma)
> extern int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
> unsigned long pfn, unsigned long addr,
> unsigned long size);
> +extern int track_pfn_split(struct vm_area_struct *vma, unsigned long addr);
Same extern comment here.
> extern void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
> pfn_t pfn);
> extern int track_pfn_copy(struct vm_area_struct *vma);
> diff --git a/mm/mmap.c b/mm/mmap.c
> index d0dfc85b209b..64067ddb8382 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -2486,6 +2486,12 @@ static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
> if (err)
> goto out_free_mpol;
>
> + if (unlikely(vma->vm_flags & VM_PFNMAP)) {
It is also a bit odd that you check VM_PFNMAP() here, then call a
function to check another flag?
> + err = track_pfn_split(vma, addr);
> + if (err)
> + goto out_vma_unlink;
> + }
> +
I don't think the __split_vma() location is the best place to put this.
Can this be done through the vm_ops->may_split() that is called above?
This is arch independent code that now has an x86 specific check, and
I'd like to keep __split_vma() out of the flag checking. The only error
after the vm_ops check is ENOMEM (without any extra GFP restrictions on
the allocations), you don't need the new vma, and use the same arguments
passed to vm_ops->may_split().
> if (new->vm_file)
> get_file(new->vm_file);
>
> @@ -2515,6 +2521,8 @@ static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
> vma_next(vmi);
> return 0;
>
> +out_vma_unlink:
> + unlink_anon_vmas(vma);
> out_free_mpol:
> mpol_put(vma_policy(new));
> out_free_vmi:
> --
> 2.39.2
>
Powered by blists - more mailing lists