lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <fdbcb6f8-caf7-48cb-810b-44ff8c75ae5b@bytedance.com>
Date: Wed, 29 Jan 2025 01:51:10 +0800
From: Qi Zheng <zhengqi.arch@...edance.com>
To: David Hildenbrand <david@...hat.com>,
 Peter Zijlstra <peterz@...radead.org>, Rik van Riel <riel@...riel.com>
Cc: kernel test robot <oliver.sang@...el.com>, oe-lkp@...ts.linux.dev,
 lkp@...el.com, linux-kernel@...r.kernel.org,
 Andrew Morton <akpm@...ux-foundation.org>,
 Dave Hansen <dave.hansen@...ux.intel.com>, Andy Lutomirski
 <luto@...nel.org>, Catalin Marinas <catalin.marinas@....com>,
 David Rientjes <rientjes@...gle.com>, Hugh Dickins <hughd@...gle.com>,
 Jann Horn <jannh@...gle.com>, Lorenzo Stoakes <lorenzo.stoakes@...cle.com>,
 Matthew Wilcox <willy@...radead.org>, Mel Gorman <mgorman@...e.de>,
 Muchun Song <muchun.song@...ux.dev>, Peter Xu <peterx@...hat.com>,
 Will Deacon <will@...nel.org>, Zach O'Keefe <zokeefe@...gle.com>,
 Dan Carpenter <dan.carpenter@...aro.org>
Subject: Re: [linus:master] [x86] 4817f70c25: stress-ng.mmapaddr.ops_per_sec
 63.0% regression

Hi,

On 2025/1/29 01:06, Qi Zheng wrote:
> Hi,
> 

[...]

> @[
> _raw_spin_unlock_irqrestore+5
> get_page_from_freelist+2014
> __alloc_frozen_pages_noprof+364
> alloc_pages_mpol+123
> alloc_pages_noprof+14
> pte_alloc_one+30
> __pte_alloc+42
> do_pte_missing+2499
> __handle_mm_fault+1862
> handle_mm_fault+195
> __get_user_pages+690
> populate_vma_page_range+127
> __mm_populate+159
> vm_mmap_pgoff+329
> do_syscall_64+98
> entry_SYSCALL_64_after_hwframe+118
> , stress-ng-mmapa]: 2443
> @[
> _raw_spin_unlock_irqrestore+5
> get_page_from_freelist+2014
> __alloc_frozen_pages_noprof+364
> alloc_pages_mpol+123
> alloc_pages_noprof+14
> get_free_pages_noprof+17
> __x64_sys_mincore+141
> do_syscall_64+98
> entry_SYSCALL_64_after_hwframe+118
> , stress-ng-mmapa]: 5184
> @[
> _raw_spin_unlock_irqrestore+5
> free_one_page+85
> tlb_remove_table_rcu+140
> rcu_do_batch+424
> rcu_core+401
> handle_softirqs+204
> irq_exit_rcu+208
> sysvec_apic_timer_interrupt+113
> asm_sysvec_apic_timer_interrupt+26
> _raw_spin_unlock_irqrestore+29
> get_page_from_freelist+2014
> __alloc_frozen_pages_noprof+364
> alloc_pages_mpol+123
> alloc_pages_noprof+14
> get_free_pages_noprof+17
> __x64_sys_mincore+141
> do_syscall_64+98
> entry_SYSCALL_64_after_hwframe+118
> , stress-ng-mmapa]: 5301
> @Error looking up stack id 4294967279 (pid -1): -1
> [, stress-ng-mmapa]: 53366
> 
> It seems to be related to CONFIG_MMU_GATHER_RCU_TABLE_FREE?

I did the following test and reproduced the same performance regression:

1) disable CONFIG_PT_RECLAIM

CONFIG_ARCH_SUPPORTS_PT_RECLAIM=y
# CONFIG_PT_RECLAIM is not set

2) apply Rik's patch #1 
(https://lore.kernel.org/lkml/20250123042447.2259648-2-riel@surriel.com/):

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 87198d957e2f1..17197d395976e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -277,7 +277,7 @@ config X86
         select HAVE_PCI
         select HAVE_PERF_REGS
         select HAVE_PERF_USER_STACK_DUMP
-       select MMU_GATHER_RCU_TABLE_FREE        if PARAVIRT
+       select MMU_GATHER_RCU_TABLE_FREE
         select MMU_GATHER_MERGE_VMAS
         select HAVE_POSIX_CPU_TIMERS_TASK_WORK
         select HAVE_REGS_AND_STACK_ACCESS_API
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 1ccaa3397a670..527f5605aa3e5 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -59,21 +59,6 @@ void __init native_pv_lock_init(void)
                 static_branch_enable(&virt_spin_lock_key);
  }

-#ifndef CONFIG_PT_RECLAIM
-static void native_tlb_remove_table(struct mmu_gather *tlb, void *table)
-{
-       struct ptdesc *ptdesc = (struct ptdesc *)table;
-
-       pagetable_dtor(ptdesc);
-       tlb_remove_page(tlb, ptdesc_page(ptdesc));
-}
-#else
-static void native_tlb_remove_table(struct mmu_gather *tlb, void *table)
-{
-       tlb_remove_table(tlb, table);
-}
-#endif
-
  struct static_key paravirt_steal_enabled;
  struct static_key paravirt_steal_rq_enabled;

@@ -195,7 +180,7 @@ struct paravirt_patch_template pv_ops = {
         .mmu.flush_tlb_kernel   = native_flush_tlb_global,
         .mmu.flush_tlb_one_user = native_flush_tlb_one_user,
         .mmu.flush_tlb_multi    = native_flush_tlb_multi,
-       .mmu.tlb_remove_table   = native_tlb_remove_table,
+       .mmu.tlb_remove_table   = tlb_remove_table,

         .mmu.exit_mmap          = paravirt_nop,
         .mmu.notify_page_enc_status_changed     = paravirt_nop,
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 1fef5ad32d5a8..b1c1f72c1fd1b 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -18,25 +18,6 @@ EXPORT_SYMBOL(physical_mask);
  #define PGTABLE_HIGHMEM 0
  #endif

-#ifndef CONFIG_PARAVIRT
-#ifndef CONFIG_PT_RECLAIM
-static inline
-void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
-{
-       struct ptdesc *ptdesc = (struct ptdesc *)table;
-
-       pagetable_dtor(ptdesc);
-       tlb_remove_page(tlb, ptdesc_page(ptdesc));
-}
-#else
-static inline
-void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
-{
-       tlb_remove_table(tlb, table);
-}
-#endif /* !CONFIG_PT_RECLAIM */
-#endif /* !CONFIG_PARAVIRT */
-
  gfp_t __userpte_alloc_gfp = GFP_PGTABLE_USER | PGTABLE_HIGHMEM;

  pgtable_t pte_alloc_one(struct mm_struct *mm)
@@ -64,7 +45,7 @@ early_param("userpte", setup_userpte);
  void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
  {
         paravirt_release_pte(page_to_pfn(pte));
-       paravirt_tlb_remove_table(tlb, page_ptdesc(pte));
+       tlb_remove_table(tlb, page_ptdesc(pte));
  }

  #if CONFIG_PGTABLE_LEVELS > 2
@@ -78,21 +59,21 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
  #ifdef CONFIG_X86_PAE
         tlb->need_flush_all = 1;
  #endif
-       paravirt_tlb_remove_table(tlb, virt_to_ptdesc(pmd));
+       tlb_remove_table(tlb, virt_to_ptdesc(pmd));
  }

  #if CONFIG_PGTABLE_LEVELS > 3
  void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
  {
         paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
-       paravirt_tlb_remove_table(tlb, virt_to_ptdesc(pud));
+       tlb_remove_table(tlb, virt_to_ptdesc(pud));
  }

  #if CONFIG_PGTABLE_LEVELS > 4
  void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
  {
         paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
-       paravirt_tlb_remove_table(tlb, virt_to_ptdesc(p4d));
+       tlb_remove_table(tlb, virt_to_ptdesc(p4d));
  }
  #endif /* CONFIG_PGTABLE_LEVELS > 4 */
  #endif /* CONFIG_PGTABLE_LEVELS > 3 */

Then run the following test:

stress-ng --timeout 60 --times --verify --metrics --no-rand-seed --mmapaddr 64

The test results are as follows:

root@...ian:~# stress-ng --timeout 60 --times --verify --metrics --no-rand-seed --mmapaddr 64
stress-ng: info:  [870] dispatching hogs: 64 mmapaddr
stress-ng: info:  [870] successful run completed in 60.07s (1 min, 0.07 secs)
stress-ng: info:  [870] stressor       bogo ops real time  usr time  sys time   bogo ops/s   bogo ops/s
stress-ng: info:  [870]                           (secs)    (secs)    (secs)   (real time) (usr+sys time)
stress-ng: info:  [870] mmapaddr       17841978     60.01    237.78   1130.36    297306.42     13041.05
stress-ng: info:  [870] for a 60.07s run time:
stress-ng: info:  [870]    1441.79s available CPU time
stress-ng: info:  [870]     238.14s user time   ( 16.52%)
stress-ng: info:  [870]    1130.80s system time ( 78.43%)
stress-ng: info:  [870]    1368.94s total time  ( 94.95%)
stress-ng: info:  [870] load average: 57.42 21.77 7.97

The perf hotspots are as follows:

   15.59%  [kernel]  [k] _raw_spin_unlock_irqrestore
    9.14%  [kernel]  [k] clear_page_rep
    7.17%  [kernel]  [k] do_syscall_64
    3.69%  [kernel]  [k] _raw_spin_lock
    3.37%  [kernel]  [k] __slab_free
    2.06%  [kernel]  [k] rcu_cblist_dequeue
    2.01%  [kernel]  [k] flush_tlb_mm_range
    1.84%  [kernel]  [k] lruvec_stat_mod_folio.part.131
    1.79%  [kernel]  [k] get_page_from_freelist
    1.64%  [kernel]  [k] kmem_cache_alloc_noprof
    1.53%  [kernel]  [k] tlb_remove_table_rcu
    1.48%  [kernel]  [k] mtree_range_walk

The call stack is as follows:

@[
_raw_spin_unlock_irqrestore+5
free_one_page+85
rcu_do_batch+424
rcu_core+401
handle_softirqs+204
irq_exit_rcu+208
sysvec_apic_timer_interrupt+113
asm_sysvec_apic_timer_interrupt+26
_raw_spin_unlock_irqrestore+29
get_page_from_freelist+2014
__alloc_frozen_pages_noprof+364
alloc_pages_mpol+123
alloc_pages_noprof+14
pte_alloc_one+30
__pte_alloc+42
do_pte_missing+2493
__handle_mm_fault+1914
handle_mm_fault+195
__get_user_pages+690
populate_vma_page_range+127
__mm_populate+159
vm_mmap_pgoff+329
do_syscall_64+98
entry_SYSCALL_64_after_hwframe+118
, stress-ng-mmapa]: 1306
@[
_raw_spin_unlock_irqrestore+5
get_page_from_freelist+2014
__alloc_frozen_pages_noprof+364
alloc_pages_mpol+123
alloc_pages_noprof+14
pte_alloc_one+30
__pte_alloc+42
move_page_tables+2285
move_vma+472
__do_sys_mremap+1759
do_syscall_64+98
entry_SYSCALL_64_after_hwframe+118
, stress-ng-mmapa]: 1536
@[
_raw_spin_unlock_irqrestore+5
free_one_page+85
tlb_remove_table_rcu+140
rcu_do_batch+424
rcu_core+401
handle_softirqs+204
irq_exit_rcu+208
sysvec_apic_timer_interrupt+113
asm_sysvec_apic_timer_interrupt+26
_raw_spin_unlock_irqrestore+29
get_page_from_freelist+2014
__alloc_frozen_pages_noprof+364
alloc_pages_mpol+123
alloc_pages_noprof+14
get_free_pages_noprof+17
tlb_remove_table+82
free_pgd_range+655
free_pgtables+601
vms_clear_ptes.part.39+255
vms_complete_munmap_vmas+311
do_vmi_align_munmap+419
do_vmi_munmap+195
move_vma+802
__do_sys_mremap+1759
do_syscall_64+98
entry_SYSCALL_64_after_hwframe+118
, stress-ng-mmapa]: 1558
@[
_raw_spin_unlock_irqrestore+5
__hrtimer_run_queues+255
hrtimer_interrupt+258
__sysvec_apic_timer_interrupt+85
sysvec_apic_timer_interrupt+56
asm_sysvec_apic_timer_interrupt+26
, stress-ng-mmapa]: 1772
@[
_raw_spin_unlock_irqrestore+5
get_partial_node.part.102+378
___slab_alloc.part.103+1180
__slab_alloc.isra.104+34
kmem_cache_alloc_noprof+192
mas_alloc_nodes+358
mas_preallocate+151
__mmap_region+1883
do_mmap+1164
vm_mmap_pgoff+239
do_syscall_64+98
entry_SYSCALL_64_after_hwframe+118
, stress-ng-mmapa]: 2654
@[
_raw_spin_unlock_irqrestore+5
get_partial_node.part.102+378
___slab_alloc.part.103+1180
__slab_alloc.isra.104+34
kmem_cache_alloc_noprof+192
mas_alloc_nodes+358
mas_store_gfp+183
do_vmi_align_munmap+398
do_vmi_munmap+195
__vm_munmap+177
__x64_sys_munmap+27
do_syscall_64+98
entry_SYSCALL_64_after_hwframe+118
, stress-ng-mmapa]: 2810
@[
_raw_spin_unlock_irqrestore+5
free_one_page+85
tlb_remove_table_rcu+140
rcu_do_batch+424
rcu_core+401
handle_softirqs+204
irq_exit_rcu+208
sysvec_apic_timer_interrupt+113
asm_sysvec_apic_timer_interrupt+26
_raw_spin_unlock_irqrestore+29
get_page_from_freelist+2014
__alloc_frozen_pages_noprof+364
alloc_pages_mpol+123
alloc_pages_noprof+14
pte_alloc_one+30
__pte_alloc+42
do_pte_missing+2493
__handle_mm_fault+1914
handle_mm_fault+195
__get_user_pages+690
populate_vma_page_range+127
__mm_populate+159
vm_mmap_pgoff+329
do_syscall_64+98
entry_SYSCALL_64_after_hwframe+118
, stress-ng-mmapa]: 3044
@Error looking up stack id 4294967279 (pid -1): -1
[, stress-ng-mmapa]: 101654

Thanks!


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ