[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <Z+lcvEIHMLiKVR1i@ly-workstation>
Date: Sun, 30 Mar 2025 23:01:16 +0800
From: "Lai, Yi" <yi1.lai@...ux.intel.com>
To: Lorenzo Stoakes <lorenzo.stoakes@...cle.com>
Cc: Andrew Morton <akpm@...ux-foundation.org>,
"Liam R . Howlett" <Liam.Howlett@...cle.com>,
Vlastimil Babka <vbabka@...e.cz>, Jann Horn <jannh@...gle.com>,
linux-mm@...ck.org, linux-kernel@...r.kernel.org,
Harry Yoo <harry.yoo@...cle.com>,
Yosry Ahmed <yosry.ahmed@...ux.dev>,
Oliver Sang <oliver.sang@...el.com>, yi1.lai@...el.com
Subject: Re: [PATCH v3 4/7] mm/mremap: initial refactor of move_vma()
On Mon, Mar 10, 2025 at 08:50:37PM +0000, Lorenzo Stoakes wrote:
> Update move_vma() to use the threaded VRM object, de-duplicate code and
> separate into smaller functions to aid readability and debug-ability.
>
> This in turn allows further simplification of expand_vma() as we can
> simply thread VRM through the function.
>
> We also take the opportunity to abstract the account charging page count
> into the VRM in order that we can correctly thread this through the
> operation.
>
> We additionally do the same for tracking mm statistics - exec_vm,
> stack_vm, data_vm, and locked_vm.
>
> As part of this change, we slightly modify when locked pages statistics
> are counted for in mm_struct statistics. However this should cause no
> issues, as there is no chance of underflow, nor will any rlimit failures
> occur as a result.
>
> This is an intermediate step before a further refactoring of move_vma() in
> order to aid review.
>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@...cle.com>
> Reviewed-by: Liam R. Howlett <Liam.Howlett@...cle.com>
> Reviewed-by: Vlastimil Babka <vbabka@...e.cz>
> ---
> mm/mremap.c | 186 ++++++++++++++++++++++++++++++++++------------------
> 1 file changed, 122 insertions(+), 64 deletions(-)
>
> diff --git a/mm/mremap.c b/mm/mremap.c
> index af022e3b89e2..6305cb9a86f6 100644
> --- a/mm/mremap.c
> +++ b/mm/mremap.c
> @@ -68,6 +68,7 @@ struct vma_remap_struct {
> bool mlocked; /* Was the VMA mlock()'d? */
> enum mremap_type remap_type; /* expand, shrink, etc. */
> bool mmap_locked; /* Is mm currently write-locked? */
> + unsigned long charged; /* If VM_ACCOUNT, # pages to account. */
> };
>
> static pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr)
> @@ -816,35 +817,88 @@ static unsigned long vrm_set_new_addr(struct vma_remap_struct *vrm)
> return 0;
> }
>
> -static unsigned long move_vma(struct vm_area_struct *vma,
> - unsigned long old_addr, unsigned long old_len,
> - unsigned long new_len, unsigned long new_addr,
> - bool *mlocked, unsigned long flags,
> - struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap)
> +/*
> + * Keep track of pages which have been added to the memory mapping. If the VMA
> + * is accounted, also check to see if there is sufficient memory.
> + *
> + * Returns true on success, false if insufficient memory to charge.
> + */
> +static bool vrm_charge(struct vma_remap_struct *vrm)
> {
> - long to_account = new_len - old_len;
> - struct mm_struct *mm = vma->vm_mm;
> - struct vm_area_struct *new_vma;
> - unsigned long vm_flags = vma->vm_flags;
> - unsigned long new_pgoff;
> - unsigned long moved_len;
> - bool account_start = false;
> - bool account_end = false;
> - unsigned long hiwater_vm;
> - int err = 0;
> - bool need_rmap_locks;
> - struct vma_iterator vmi;
> + unsigned long charged;
> +
> + if (!(vrm->vma->vm_flags & VM_ACCOUNT))
> + return true;
> +
> + /*
> + * If we don't unmap the old mapping, then we account the entirety of
> + * the length of the new one. Otherwise it's just the delta in size.
> + */
> + if (vrm->flags & MREMAP_DONTUNMAP)
> + charged = vrm->new_len >> PAGE_SHIFT;
> + else
> + charged = vrm->delta >> PAGE_SHIFT;
> +
> +
> + /* This accounts 'charged' pages of memory. */
> + if (security_vm_enough_memory_mm(current->mm, charged))
> + return false;
> +
> + vrm->charged = charged;
> + return true;
> +}
> +
> +/*
> + * an error has occurred so we will not be using vrm->charged memory. Unaccount
> + * this memory if the VMA is accounted.
> + */
> +static void vrm_uncharge(struct vma_remap_struct *vrm)
> +{
> + if (!(vrm->vma->vm_flags & VM_ACCOUNT))
> + return;
> +
> + vm_unacct_memory(vrm->charged);
> + vrm->charged = 0;
> +}
> +
> +/*
> + * Update mm exec_vm, stack_vm, data_vm, and locked_vm fields as needed to
> + * account for 'bytes' memory used, and if locked, indicate this in the VRM so
> + * we can handle this correctly later.
> + */
> +static void vrm_stat_account(struct vma_remap_struct *vrm,
> + unsigned long bytes)
> +{
> + unsigned long pages = bytes >> PAGE_SHIFT;
> + struct mm_struct *mm = current->mm;
> + struct vm_area_struct *vma = vrm->vma;
> +
> + vm_stat_account(mm, vma->vm_flags, pages);
> + if (vma->vm_flags & VM_LOCKED) {
> + mm->locked_vm += pages;
> + vrm->mlocked = true;
> + }
> +}
> +
> +/*
> + * Perform checks before attempting to write a VMA prior to it being
> + * moved.
> + */
> +static unsigned long prep_move_vma(struct vma_remap_struct *vrm,
> + unsigned long *vm_flags_ptr)
> +{
> + unsigned long err = 0;
> + struct vm_area_struct *vma = vrm->vma;
> + unsigned long old_addr = vrm->addr;
> + unsigned long old_len = vrm->old_len;
>
> /*
> * We'd prefer to avoid failure later on in do_munmap:
> * which may split one vma into three before unmapping.
> */
> - if (mm->map_count >= sysctl_max_map_count - 3)
> + if (current->mm->map_count >= sysctl_max_map_count - 3)
> return -ENOMEM;
>
> - if (unlikely(flags & MREMAP_DONTUNMAP))
> - to_account = new_len;
> -
> if (vma->vm_ops && vma->vm_ops->may_split) {
> if (vma->vm_start != old_addr)
> err = vma->vm_ops->may_split(vma, old_addr);
> @@ -862,22 +916,46 @@ static unsigned long move_vma(struct vm_area_struct *vma,
> * so KSM can come around to merge on vma and new_vma afterwards.
> */
> err = ksm_madvise(vma, old_addr, old_addr + old_len,
> - MADV_UNMERGEABLE, &vm_flags);
> + MADV_UNMERGEABLE, vm_flags_ptr);
> if (err)
> return err;
>
> - if (vm_flags & VM_ACCOUNT) {
> - if (security_vm_enough_memory_mm(mm, to_account >> PAGE_SHIFT))
> - return -ENOMEM;
> - }
> + return 0;
> +}
> +
> +static unsigned long move_vma(struct vma_remap_struct *vrm)
> +{
> + struct mm_struct *mm = current->mm;
> + struct vm_area_struct *vma = vrm->vma;
> + struct vm_area_struct *new_vma;
> + unsigned long vm_flags = vma->vm_flags;
> + unsigned long old_addr = vrm->addr, new_addr = vrm->new_addr;
> + unsigned long old_len = vrm->old_len, new_len = vrm->new_len;
> + unsigned long new_pgoff;
> + unsigned long moved_len;
> + unsigned long account_start = false;
> + unsigned long account_end = false;
> + unsigned long hiwater_vm;
> + int err;
> + bool need_rmap_locks;
> + struct vma_iterator vmi;
> +
> + err = prep_move_vma(vrm, &vm_flags);
> + if (err)
> + return err;
> +
> + /* If accounted, charge the number of bytes the operation will use. */
> + if (!vrm_charge(vrm))
> + return -ENOMEM;
>
> vma_start_write(vma);
> new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
> - new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
> + new_vma = copy_vma(&vrm->vma, new_addr, new_len, new_pgoff,
> &need_rmap_locks);
> + /* This may have been updated. */
> + vma = vrm->vma;
> if (!new_vma) {
> - if (vm_flags & VM_ACCOUNT)
> - vm_unacct_memory(to_account >> PAGE_SHIFT);
> + vrm_uncharge(vrm);
> return -ENOMEM;
> }
>
> @@ -902,7 +980,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
> old_addr = new_addr;
> new_addr = err;
> } else {
> - mremap_userfaultfd_prep(new_vma, uf);
> + mremap_userfaultfd_prep(new_vma, vrm->uf);
> }
>
> if (is_vm_hugetlb_page(vma)) {
> @@ -910,7 +988,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
> }
>
> /* Conceal VM_ACCOUNT so old reservation is not undone */
> - if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) {
> + if (vm_flags & VM_ACCOUNT && !(vrm->flags & MREMAP_DONTUNMAP)) {
> vm_flags_clear(vma, VM_ACCOUNT);
> if (vma->vm_start < old_addr)
> account_start = true;
> @@ -928,13 +1006,12 @@ static unsigned long move_vma(struct vm_area_struct *vma,
> * If this were a serious issue, we'd add a flag to do_munmap().
> */
> hiwater_vm = mm->hiwater_vm;
> - vm_stat_account(mm, vma->vm_flags, new_len >> PAGE_SHIFT);
>
> /* Tell pfnmap has moved from this vma */
> if (unlikely(vma->vm_flags & VM_PFNMAP))
> untrack_pfn_clear(vma);
>
> - if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) {
> + if (unlikely(!err && (vrm->flags & MREMAP_DONTUNMAP))) {
> /* We always clear VM_LOCKED[ONFAULT] on the old vma */
> vm_flags_clear(vma, VM_LOCKED_MASK);
>
> @@ -947,22 +1024,20 @@ static unsigned long move_vma(struct vm_area_struct *vma,
> unlink_anon_vmas(vma);
>
> /* Because we won't unmap we don't need to touch locked_vm */
> + vrm_stat_account(vrm, new_len);
> return new_addr;
> }
>
> + vrm_stat_account(vrm, new_len);
> +
> vma_iter_init(&vmi, mm, old_addr);
> - if (do_vmi_munmap(&vmi, mm, old_addr, old_len, uf_unmap, false) < 0) {
> + if (do_vmi_munmap(&vmi, mm, old_addr, old_len, vrm->uf_unmap, false) < 0) {
> /* OOM: unable to split vma, just get accounts right */
> - if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP))
> + if (vm_flags & VM_ACCOUNT && !(vrm->flags & MREMAP_DONTUNMAP))
> vm_acct_memory(old_len >> PAGE_SHIFT);
> account_start = account_end = false;
> }
>
> - if (vm_flags & VM_LOCKED) {
> - mm->locked_vm += new_len >> PAGE_SHIFT;
> - *mlocked = true;
> - }
> -
> mm->hiwater_vm = hiwater_vm;
>
> /* Restore VM_ACCOUNT if one or two pieces of vma left */
> @@ -1141,9 +1216,7 @@ static unsigned long mremap_to(struct vma_remap_struct *vrm)
> if (err)
> return err;
>
> - return move_vma(vrm->vma, vrm->addr, vrm->old_len, vrm->new_len,
> - vrm->new_addr, &vrm->mlocked, vrm->flags,
> - vrm->uf, vrm->uf_unmap);
> + return move_vma(vrm);
> }
>
> static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
> @@ -1248,17 +1321,11 @@ static unsigned long check_mremap_params(struct vma_remap_struct *vrm)
> static unsigned long expand_vma_in_place(struct vma_remap_struct *vrm)
> {
> struct mm_struct *mm = current->mm;
> - long pages = vrm->delta >> PAGE_SHIFT;
> struct vm_area_struct *vma = vrm->vma;
> VMA_ITERATOR(vmi, mm, vma->vm_end);
> - long charged = 0;
> -
> - if (vma->vm_flags & VM_ACCOUNT) {
> - if (security_vm_enough_memory_mm(mm, pages))
> - return -ENOMEM;
>
> - charged = pages;
> - }
> + if (!vrm_charge(vrm))
> + return -ENOMEM;
>
> /*
> * Function vma_merge_extend() is called on the
> @@ -1271,15 +1338,11 @@ static unsigned long expand_vma_in_place(struct vma_remap_struct *vrm)
> */
> vma = vrm->vma = vma_merge_extend(&vmi, vma, vrm->delta);
> if (!vma) {
> - vm_unacct_memory(charged);
> + vrm_uncharge(vrm);
> return -ENOMEM;
> }
>
> - vm_stat_account(mm, vma->vm_flags, pages);
> - if (vma->vm_flags & VM_LOCKED) {
> - mm->locked_vm += pages;
> - vrm->mlocked = true;
> - }
> + vrm_stat_account(vrm, vrm->delta);
>
> return 0;
> }
> @@ -1319,11 +1382,7 @@ static bool align_hugetlb(struct vma_remap_struct *vrm)
> static unsigned long expand_vma(struct vma_remap_struct *vrm)
> {
> unsigned long err;
> - struct vm_area_struct *vma = vrm->vma;
> unsigned long addr = vrm->addr;
> - unsigned long old_len = vrm->old_len;
> - unsigned long new_len = vrm->new_len;
> - unsigned long flags = vrm->flags;
>
> err = resize_is_valid(vrm);
> if (err)
> @@ -1356,7 +1415,7 @@ static unsigned long expand_vma(struct vma_remap_struct *vrm)
> */
>
> /* We're not allowed to move the VMA, so error out. */
> - if (!(flags & MREMAP_MAYMOVE))
> + if (!(vrm->flags & MREMAP_MAYMOVE))
> return -ENOMEM;
>
> /* Find a new location to move the VMA to. */
> @@ -1364,8 +1423,7 @@ static unsigned long expand_vma(struct vma_remap_struct *vrm)
> if (err)
> return err;
>
> - return move_vma(vma, addr, old_len, new_len, vrm->new_addr,
> - &vrm->mlocked, flags, vrm->uf, vrm->uf_unmap);
> + return move_vma(vrm);
> }
>
> /*
> --
> 2.48.1
>
Hi Lorenzo Stoakes,
Greetings!
I used Syzkaller and found that there is general protection fault in mremap in linux-next tag - next-20250325.
After bisection and the first bad commit is:
"
d5c8aec0542e mm/mremap: initial refactor of move_vma()
"
The deadlock can still be reproduced. You could try following reproduction binary.
All detailed into can be found at:
https://github.com/laifryiee/syzkaller_logs/tree/main/250329_061207_mremap
Syzkaller repro code:
https://github.com/laifryiee/syzkaller_logs/tree/main/250329_061207_mremap/repro.c
Syzkaller repro syscall steps:
https://github.com/laifryiee/syzkaller_logs/tree/main/250329_061207_mremap/repro.prog
Syzkaller report:
https://github.com/laifryiee/syzkaller_logs/tree/main/250329_061207_mremap/repro.report
Kconfig(make olddefconfig):
https://github.com/laifryiee/syzkaller_logs/tree/main/250329_061207_mremap/kconfig_origin
Bisect info:
https://github.com/laifryiee/syzkaller_logs/tree/main/250329_061207_mremap/bisect_info.log
bzImage:
https://github.com/laifryiee/syzkaller_logs/raw/refs/heads/main/250329_061207_mremap/bzImage_eb4bc4b07f66f01618d9cb1aa4eaef59b1188415
Issue dmesg:
https://github.com/laifryiee/syzkaller_logs/blob/main/250329_061207_mremap/eb4bc4b07f66f01618d9cb1aa4eaef59b1188415_dmesg.log
"
[ 43.795673] Oops: general protection fault, probably for non-canonical address 0xdffffc0000000004: 0000 [#1] SMP KASI
[ 43.797814] KASAN: null-ptr-deref in range [0x0000000000000020-0x0000000000000027]
[ 43.799835] CPU: 1 UID: 0 PID: 665 Comm: repro Not tainted 6.14.0-next-20250325-eb4bc4b07f66 #1 PREEMPT(voluntary)
[ 43.800338] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org4
[ 43.800911] RIP: 0010:__do_sys_mremap+0x13a9/0x15d0
[ 43.801188] Code: e8 ac 57 a8 ff 48 8b 85 30 fe ff ff 48 89 83 70 ff ff ff 49 89 c5 e9 2b f2 ff ff e8 91 57 a8 ff 485
[ 43.802152] RSP: 0018:ffff88801aa67ce8 EFLAGS: 00010293
[ 43.802432] RAX: dffffc0000000004 RBX: ffff88801aa67eb0 RCX: ffffffff81e42e5a
[ 43.802791] RDX: ffff888011e6ca80 RSI: ffffffff81df64cf RDI: 0000000000000007
[ 43.803172] RBP: ffff88801aa67ed8 R08: 0000000000000000 R09: ffffed10023cd950
[ 43.803558] R10: 0000000010000000 R11: ffff888011e6d8d8 R12: ffff888020fe5000
[ 43.803943] R13: ffff88801f8f2780 R14: ffff888020fe5170 R15: 0000000000000000
[ 43.804324] FS: 00007f8f5ae81600(0000) GS:ffff8880e3684000(0000) knlGS:0000000000000000
[ 43.804749] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 43.805069] CR2: 00007f8f5ac57910 CR3: 0000000021690001 CR4: 0000000000770ef0
[ 43.805442] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 43.805796] DR3: 0000000000000000 DR6: 00000000ffff07f0 DR7: 0000000000000400
[ 43.806173] PKRU: 55555554
[ 43.806345] Call Trace:
[ 43.806558] <TASK>
[ 43.806700] ? show_regs+0x6d/0x80
[ 43.807345] ? die_addr+0x45/0xb0
[ 43.807768] ? exc_general_protection+0x1ad/0x340
[ 43.808360] ? asm_exc_general_protection+0x2b/0x30
[ 43.808939] ? vma_merge_new_range+0x16a/0x930
[ 43.809499] ? __do_sys_mremap+0x139f/0x15d0
[ 43.810012] ? __do_sys_mremap+0x13a9/0x15d0
[ 43.810528] ? __do_sys_mremap+0x139f/0x15d0
[ 43.811043] ? __pfx___do_sys_mremap+0x10/0x10
[ 43.811587] ? __this_cpu_preempt_check+0x21/0x30
[ 43.812177] __x64_sys_mremap+0xc7/0x150
[ 43.812652] ? syscall_trace_enter+0x14d/0x280
[ 43.813201] x64_sys_call+0x1933/0x2150
[ 43.813658] do_syscall_64+0x6d/0x150
[ 43.814101] entry_SYSCALL_64_after_hwframe+0x76/0x7e
[ 43.814686] RIP: 0033:0x7f8f5ac3ee5d
[ 43.815125] Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d8
[ 43.817168] RSP: 002b:00007fff14f19058 EFLAGS: 00000217 ORIG_RAX: 0000000000000019
[ 43.818037] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f8f5ac3ee5d
[ 43.818430] RDX: 0000000000004000 RSI: 0000000000002000 RDI: 0000000020ffd000
[ 43.818793] RBP: 00007fff14f19070 R08: 0000000020ffc000 R09: 0000000000000800
[ 43.819160] R10: 0000000000000000 R11: 0000000000000217 R12: 00007fff14f19188
[ 43.819540] R13: 0000000000401126 R14: 0000000000403e08 R15: 00007f8f5aeca000
[ 43.819945] </TASK>
[ 43.820066] Modules linked in:
[ 43.820308] ---[ end trace 0000000000000000 ]---
[ 43.820587] RIP: 0010:__do_sys_mremap+0x13a9/0x15d0
[ 43.820874] Code: e8 ac 57 a8 ff 48 8b 85 30 fe ff ff 48 89 83 70 ff ff ff 49 89 c5 e9 2b f2 ff ff e8 91 57 a8 ff 485
[ 43.821855] RSP: 0018:ffff88801aa67ce8 EFLAGS: 00010293
[ 43.822150] RAX: dffffc0000000004 RBX: ffff88801aa67eb0 RCX: ffffffff81e42e5a
[ 43.822543] RDX: ffff888011e6ca80 RSI: ffffffff81df64cf RDI: 0000000000000007
[ 43.822928] RBP: ffff88801aa67ed8 R08: 0000000000000000 R09: ffffed10023cd950
[ 43.823347] R10: 0000000010000000 R11: ffff888011e6d8d8 R12: ffff888020fe5000
[ 43.823757] R13: ffff88801f8f2780 R14: ffff888020fe5170 R15: 0000000000000000
[ 43.824156] FS: 00007f8f5ae81600(0000) GS:ffff8880e3684000(0000) knlGS:0000000000000000
[ 43.824627] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 43.824959] CR2: 00007f8f5ac57910 CR3: 0000000021690001 CR4: 0000000000770ef0
[ 43.825350] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 43.825757] DR3: 0000000000000000 DR6: 00000000ffff07f0 DR7: 0000000000000400
[ 43.826154] PKRU: 55555554
"
Hope this cound be insightful to you.
Regards,
Yi Lai
---
If you don't need the following environment to reproduce the problem or if you
already have one reproduced environment, please ignore the following information.
How to reproduce:
git clone https://gitlab.com/xupengfe/repro_vm_env.git
cd repro_vm_env
tar -xvf repro_vm_env.tar.gz
cd repro_vm_env; ./start3.sh // it needs qemu-system-x86_64 and I used v7.1.0
// start3.sh will load bzImage_2241ab53cbb5cdb08a6b2d4688feb13971058f65 v6.2-rc5 kernel
// You could change the bzImage_xxx as you want
// Maybe you need to remove line "-drive if=pflash,format=raw,readonly=on,file=./OVMF_CODE.fd \" for different qemu version
You could use below command to log in, there is no password for root.
ssh -p 10023 root@...alhost
After login vm(virtual machine) successfully, you could transfer reproduced
binary to the vm by below way, and reproduce the problem in vm:
gcc -pthread -o repro repro.c
scp -P 10023 repro root@...alhost:/root/
Get the bzImage for target kernel:
Please use target kconfig and copy it to kernel_src/.config
make olddefconfig
make -jx bzImage //x should equal or less than cpu num your pc has
Fill the bzImage file into above start3.sh to load the target kernel in vm.
Tips:
If you already have qemu-system-x86_64, please ignore below info.
If you want to install qemu v7.1.0 version:
git clone https://github.com/qemu/qemu.git
cd qemu
git checkout -f v7.1.0
mkdir build
cd build
yum install -y ninja-build.x86_64
yum -y install libslirp-devel.x86_64
../configure --target-list=x86_64-softmmu --enable-kvm --enable-vnc --enable-gtk --enable-sdl --enable-usb-redir --enable-slirp
make
make install
Powered by blists - more mailing lists