lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <118c2f4a-58d7-de42-1f77-31ac8a4d4701@amd.com>
Date:   Wed, 25 Jan 2023 00:48:16 +0530
From:   Raghavendra K T <raghavendra.kt@....com>
To:     Mel Gorman <mgorman@...e.de>
Cc:     linux-kernel@...r.kernel.org, linux-mm@...ck.org,
        Ingo Molnar <mingo@...hat.com>,
        Peter Zijlstra <peterz@...radead.org>,
        Andrew Morton <akpm@...ux-foundation.org>,
        David Hildenbrand <david@...hat.com>,
        Bharata B Rao <bharata@....com>,
        Disha Talreja <dishaa.talreja@....com>,
        Mike Rapoport <rppt@...nel.org>
Subject: Re: [RFC PATCH V1 1/1] sched/numa: Enhance vma scanning logic

On 1/17/2023 11:15 PM, Raghavendra K T wrote:
> On 1/17/2023 8:29 PM, Mel Gorman wrote:
>> Note that the cc list is excessive for the topic.
>>
> 
> Thank you Mel for the review. Sorry for the long list. (got by
> get_maintainer). Will trim the list for V2.
>
(trimming the list early)
[...]
> 
> Nice idea. Thanks again.. I will take this as a base patch for expansion.
> 
>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>> index f3f196e4d66d..3cebda5cc8a7 100644
>> --- a/include/linux/mm.h
>> +++ b/include/linux/mm.h
>> @@ -620,6 +620,9 @@ static inline void vma_init(struct vm_area_struct 
>> *vma, struct mm_struct *mm)
>>       vma->vm_mm = mm;
>>       vma->vm_ops = &dummy_vm_ops;
>>       INIT_LIST_HEAD(&vma->anon_vma_chain);
>> +#ifdef CONFIG_NUMA_BALANCING
>> +    vma->numab = NULL;
>> +#endif
>>   }
>>   static inline void vma_set_anonymous(struct vm_area_struct *vma)
>> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
>> index 3b8475007734..3c0cfdde33e0 100644
>> --- a/include/linux/mm_types.h
>> +++ b/include/linux/mm_types.h
>> @@ -526,6 +526,10 @@ struct anon_vma_name {
>>       char name[];
>>   };
>> +struct vma_numab {
>> +    unsigned long next_scan;
>> +};
>> +
>>   /*
>>    * This struct describes a virtual memory area. There is one of these
>>    * per VM-area/task. A VM area is any part of the process virtual 
>> memory
>> @@ -593,6 +597,9 @@ struct vm_area_struct {
>>   #endif
>>   #ifdef CONFIG_NUMA
>>       struct mempolicy *vm_policy;    /* NUMA policy for the VMA */
>> +#endif
>> +#ifdef CONFIG_NUMA_BALANCING
>> +    struct vma_numab *numab;    /* NUMA Balancing state */
>>   #endif
>>       struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
>>   } __randomize_layout;
>> diff --git a/kernel/fork.c b/kernel/fork.c
>> index 9f7fe3541897..2d34c484553d 100644
>> --- a/kernel/fork.c
>> +++ b/kernel/fork.c
>> @@ -481,6 +481,9 @@ struct vm_area_struct *vm_area_dup(struct 
>> vm_area_struct *orig)
>>   void vm_area_free(struct vm_area_struct *vma)
>>   {
>> +#ifdef CONFIG_NUMA_BALANCING
>> +    kfree(vma->numab);
>> +#endif
>>       free_anon_vma_name(vma);
>>       kmem_cache_free(vm_area_cachep, vma);
>>   }

While running the mmtests kernbench workload (on 256 pcpu), I hit a BUG()
(it is not otherwise reproducible in the normal boot flow):

[  716.825398] kernel BUG at mm/slub.c:419!
[  716.825736] invalid opcode: 0000 [#146] PREEMPT SMP NOPTI
[  716.826042] CPU: 232 PID: 364844 Comm: cc1 Tainted: G      D W 
    6.1.0-test-snp-host-a7065246cf78+ #44
[  716.826345] Hardware name: Dell Inc. PowerEdge R6525/024PW1, BIOS 
2.6.6 01/13/2022
[  716.826645] RIP: 0010:__kmem_cache_free+0x2a4/0x2c0
[  716.826941] Code: ff e9 32 ff ff ff 49 8b 47 08 f0 48 83 28 01 0f 85 
9b fe ff ff 49 8b 47 08 4c 89 ff 48 8b 40 08 e8 a1 c5 cc 00 e9 86 fe ff 
ff <0f> 0b 48 8b 15 63 d6 4d 01 e9 85 fd ff ff 66 66 2e 0f 1f 84 00 00
[  716.827550] RSP: 0018:ffffb0b070547c28 EFLAGS: 00010246
[  716.827865] RAX: ffff990fa6bf1310 RBX: ffff990fa6bf1310 RCX: 
ffff990fa6bf1310
[  716.828180] RDX: 00000000001000e8 RSI: 0000000000000000 RDI: 
ffff98d000044200
[  716.828503] RBP: ffffb0b070547c50 R08: ffff98d030f222e0 R09: 
0000000000000001
[  716.828821] R10: ffff990ff6d298b0 R11: ffff98d030f226a0 R12: 
ffff98d000044200
[  716.829139] R13: ffffd605c29afc40 R14: ffffffff9e89c20f R15: 
ffffb0b070547d58
[  716.829458] FS:  00007f05f4cebac0(0000) GS:ffff994e00800000(0000) 
knlGS:0000000000000000
[  716.829781] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  716.830105] CR2: 00007f05e9cbc002 CR3: 00000040eea7c005 CR4: 
0000000000770ee0
[  716.830432] PKRU: 55555554
[  716.830749] Call Trace:
[  716.831057]  <TASK>
[  716.831360]  kfree+0x79/0x120
[  716.831664]  vm_area_free+0x1f/0x50
[  716.831970]  vma_expand+0x311/0x3e0
[  716.832274]  mmap_region+0x772/0x900
[  716.832571]  do_mmap+0x3c0/0x5e0
[  716.832866]  ? __this_cpu_preempt_check+0x13/0x20
[  716.833165]  ? security_mmap_file+0xa1/0xc0
[  716.833458]  vm_mmap_pgoff+0xd5/0x170
[  716.833745]  ksys_mmap_pgoff+0x46/0x210
[  716.834022]  __x64_sys_mmap+0x33/0x50
[  716.834291]  do_syscall_64+0x3b/0x90
[  716.834549]  entry_SYSCALL_64_after_hwframe+0x63/0xcd
[  716.834806] RIP: 0033:0x7f05f471ebd7
[  716.835054] Code: 00 00 00 89 ef e8 59 ae ff ff eb e4 e8 62 7b 01 00 
66 90 f3 0f 1e fa 41 89 ca 41 f7 c1 ff 0f 00 00 75 10 b8 09 00 00 00 0f 
05 <48> 3d 00 f0 ff ff 77 21 c3 48 8b 05 29 a2 0f 00 64 c7 00 16 00 00
[  716.835567] RSP: 002b:00007fff24c27ae8 EFLAGS: 00000246 ORIG_RAX: 
0000000000000009
[  716.835826] RAX: ffffffffffffffda RBX: 0000000000200000 RCX: 
00007f05f471ebd7
[  716.836077] RDX: 0000000000000003 RSI: 0000000000200000 RDI: 
0000000000000000
[  716.836323] RBP: 0000000000000000 R08: 00000000ffffffff R09: 
0000000000000000
[  716.836567] R10: 0000000000000022 R11: 0000000000000246 R12: 
0000000000000038
[  716.836808] R13: 0000000000001fff R14: 0000000000000044 R15: 
0000000000000048
[  716.837049]  </TASK>
[  716.837285] Modules linked in: tls ipmi_ssif binfmt_misc 
nls_iso8859_1 joydev input_leds intel_rapl_msr intel_rapl_common 
amd64_edac edac_mce_amd hid_generic kvm_amd dell_smbios dcdbas wmi_bmof 
dell_wmi_descriptor kvm usbhid hid ccp k10temp wmi ipmi_si ipmi_devintf 
ipmi_msghandler acpi_power_meter mac_hid sch_fq_codel dm_multipath 
scsi_dh_rdac scsi_dh_emc scsi_dh_alua msr efi_pstore ip_tables x_tables 
autofs4 btrfs blake2b_generic zstd_compress raid10 raid456 
async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq 
libcrc32c raid1 raid0 multipath linear mgag200 drm_kms_helper 
syscopyarea sysfillrect sysimgblt fb_sys_fops crct10dif_pclmul 
i2c_algo_bit crc32_pclmul drm_shmem_helper ghash_clmulni_intel nvme 
aesni_intel crypto_simd cryptd tg3 drm nvme_core megaraid_sas ahci 
xhci_pci i2c_piix4 xhci_pci_renesas libahci
[  716.839185] ---[ end trace 0000000000000000 ]---

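It looks like we additionally have to handle numab initialization in the
vm_area_dup() code path: the structure copy there also copies the numab
pointer, so the original and the duplicated VMA would both pass the same
allocation to kfree() in vm_area_free(). Something like the below fixed it
(copy-pasted from a tty):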

diff --git a/kernel/fork.c b/kernel/fork.c
index 08969f5aa38d..f5b2e41296c7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -475,12 +475,18 @@ struct vm_area_struct *vm_area_dup(struct 
vm_area_struct *orig)
                 *new = data_race(*orig);
                 INIT_LIST_HEAD(&new->anon_vma_chain);
                 dup_anon_vma_name(orig, new);
+#ifdef CONFIG_NUMA_BALANCING
+               new->numab = NULL;
+#endif
         }
         return new;
  }

Does this look okay? If so, I will fold it into the V2 spin (in the
vma_scan_delay patch, hoping you are okay with this change and do not
see any other changes required).

>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index c36aa54ae071..6a1cffdfc76b 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -3027,6 +3027,23 @@ static void task_numa_work(struct callback_head 
>> *work)
>>           if (!vma_is_accessible(vma))
>>               continue;
>> +        /* Initialise new per-VMA NUMAB state. */
>> +        if (!vma->numab) {
>> +            vma->numab = kzalloc(sizeof(struct vma_numab), GFP_KERNEL);
>> +            if (!vma->numab)
>> +                continue;
>> +
>> +            vma->numab->next_scan = now +
>> +                msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
>> +        }
>> +
>> +        /*
>> +         * After the first scan is complete, delay the balancing scan
>> +         * for new VMAs.
>> +         */
>> +        if (mm->numa_scan_seq && time_before(jiffies, 
>> vma->numab->next_scan))
>> +            continue;
>> +
>>           do {
>>               start = max(start, vma->vm_start);
>>               end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
>>
> 

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ