[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <fdbcb6f8-caf7-48cb-810b-44ff8c75ae5b@bytedance.com>
Date: Wed, 29 Jan 2025 01:51:10 +0800
From: Qi Zheng <zhengqi.arch@...edance.com>
To: David Hildenbrand <david@...hat.com>,
Peter Zijlstra <peterz@...radead.org>, Rik van Riel <riel@...riel.com>
Cc: kernel test robot <oliver.sang@...el.com>, oe-lkp@...ts.linux.dev,
lkp@...el.com, linux-kernel@...r.kernel.org,
Andrew Morton <akpm@...ux-foundation.org>,
Dave Hansen <dave.hansen@...ux.intel.com>, Andy Lutomirski
<luto@...nel.org>, Catalin Marinas <catalin.marinas@....com>,
David Rientjes <rientjes@...gle.com>, Hugh Dickins <hughd@...gle.com>,
Jann Horn <jannh@...gle.com>, Lorenzo Stoakes <lorenzo.stoakes@...cle.com>,
Matthew Wilcox <willy@...radead.org>, Mel Gorman <mgorman@...e.de>,
Muchun Song <muchun.song@...ux.dev>, Peter Xu <peterx@...hat.com>,
Will Deacon <will@...nel.org>, Zach O'Keefe <zokeefe@...gle.com>,
Dan Carpenter <dan.carpenter@...aro.org>
Subject: Re: [linus:master] [x86] 4817f70c25: stress-ng.mmapaddr.ops_per_sec
63.0% regression
Hi,
On 2025/1/29 01:06, Qi Zheng wrote:
> Hi,
>
[...]
> @[
> _raw_spin_unlock_irqrestore+5
> get_page_from_freelist+2014
> __alloc_frozen_pages_noprof+364
> alloc_pages_mpol+123
> alloc_pages_noprof+14
> pte_alloc_one+30
> __pte_alloc+42
> do_pte_missing+2499
> __handle_mm_fault+1862
> handle_mm_fault+195
> __get_user_pages+690
> populate_vma_page_range+127
> __mm_populate+159
> vm_mmap_pgoff+329
> do_syscall_64+98
> entry_SYSCALL_64_after_hwframe+118
> , stress-ng-mmapa]: 2443
> @[
> _raw_spin_unlock_irqrestore+5
> get_page_from_freelist+2014
> __alloc_frozen_pages_noprof+364
> alloc_pages_mpol+123
> alloc_pages_noprof+14
> get_free_pages_noprof+17
> __x64_sys_mincore+141
> do_syscall_64+98
> entry_SYSCALL_64_after_hwframe+118
> , stress-ng-mmapa]: 5184
> @[
> _raw_spin_unlock_irqrestore+5
> free_one_page+85
> tlb_remove_table_rcu+140
> rcu_do_batch+424
> rcu_core+401
> handle_softirqs+204
> irq_exit_rcu+208
> sysvec_apic_timer_interrupt+113
> asm_sysvec_apic_timer_interrupt+26
> _raw_spin_unlock_irqrestore+29
> get_page_from_freelist+2014
> __alloc_frozen_pages_noprof+364
> alloc_pages_mpol+123
> alloc_pages_noprof+14
> get_free_pages_noprof+17
> __x64_sys_mincore+141
> do_syscall_64+98
> entry_SYSCALL_64_after_hwframe+118
> , stress-ng-mmapa]: 5301
> @Error looking up stack id 4294967279 (pid -1): -1
> [, stress-ng-mmapa]: 53366
>
> It seems to be related to CONFIG_MMU_GATHER_RCU_TABLE_FREE?
I ran the following test and reproduced the same performance regression:
1) disable CONFIG_PT_RECLAIM
CONFIG_ARCH_SUPPORTS_PT_RECLAIM=y
# CONFIG_PT_RECLAIM is not set
2) apply Rik's patch #1
(https://lore.kernel.org/lkml/20250123042447.2259648-2-riel@surriel.com/):
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 87198d957e2f1..17197d395976e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -277,7 +277,7 @@ config X86
select HAVE_PCI
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
- select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT
+ select MMU_GATHER_RCU_TABLE_FREE
select MMU_GATHER_MERGE_VMAS
select HAVE_POSIX_CPU_TIMERS_TASK_WORK
select HAVE_REGS_AND_STACK_ACCESS_API
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 1ccaa3397a670..527f5605aa3e5 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -59,21 +59,6 @@ void __init native_pv_lock_init(void)
static_branch_enable(&virt_spin_lock_key);
}
-#ifndef CONFIG_PT_RECLAIM
-static void native_tlb_remove_table(struct mmu_gather *tlb, void *table)
-{
- struct ptdesc *ptdesc = (struct ptdesc *)table;
-
- pagetable_dtor(ptdesc);
- tlb_remove_page(tlb, ptdesc_page(ptdesc));
-}
-#else
-static void native_tlb_remove_table(struct mmu_gather *tlb, void *table)
-{
- tlb_remove_table(tlb, table);
-}
-#endif
-
struct static_key paravirt_steal_enabled;
struct static_key paravirt_steal_rq_enabled;
@@ -195,7 +180,7 @@ struct paravirt_patch_template pv_ops = {
.mmu.flush_tlb_kernel = native_flush_tlb_global,
.mmu.flush_tlb_one_user = native_flush_tlb_one_user,
.mmu.flush_tlb_multi = native_flush_tlb_multi,
- .mmu.tlb_remove_table = native_tlb_remove_table,
+ .mmu.tlb_remove_table = tlb_remove_table,
.mmu.exit_mmap = paravirt_nop,
.mmu.notify_page_enc_status_changed = paravirt_nop,
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 1fef5ad32d5a8..b1c1f72c1fd1b 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -18,25 +18,6 @@ EXPORT_SYMBOL(physical_mask);
#define PGTABLE_HIGHMEM 0
#endif
-#ifndef CONFIG_PARAVIRT
-#ifndef CONFIG_PT_RECLAIM
-static inline
-void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
-{
- struct ptdesc *ptdesc = (struct ptdesc *)table;
-
- pagetable_dtor(ptdesc);
- tlb_remove_page(tlb, ptdesc_page(ptdesc));
-}
-#else
-static inline
-void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
-{
- tlb_remove_table(tlb, table);
-}
-#endif /* !CONFIG_PT_RECLAIM */
-#endif /* !CONFIG_PARAVIRT */
-
gfp_t __userpte_alloc_gfp = GFP_PGTABLE_USER | PGTABLE_HIGHMEM;
pgtable_t pte_alloc_one(struct mm_struct *mm)
@@ -64,7 +45,7 @@ early_param("userpte", setup_userpte);
void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
{
paravirt_release_pte(page_to_pfn(pte));
- paravirt_tlb_remove_table(tlb, page_ptdesc(pte));
+ tlb_remove_table(tlb, page_ptdesc(pte));
}
#if CONFIG_PGTABLE_LEVELS > 2
@@ -78,21 +59,21 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
#ifdef CONFIG_X86_PAE
tlb->need_flush_all = 1;
#endif
- paravirt_tlb_remove_table(tlb, virt_to_ptdesc(pmd));
+ tlb_remove_table(tlb, virt_to_ptdesc(pmd));
}
#if CONFIG_PGTABLE_LEVELS > 3
void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
{
paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
- paravirt_tlb_remove_table(tlb, virt_to_ptdesc(pud));
+ tlb_remove_table(tlb, virt_to_ptdesc(pud));
}
#if CONFIG_PGTABLE_LEVELS > 4
void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
{
paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
- paravirt_tlb_remove_table(tlb, virt_to_ptdesc(p4d));
+ tlb_remove_table(tlb, virt_to_ptdesc(p4d));
}
#endif /* CONFIG_PGTABLE_LEVELS > 4 */
#endif /* CONFIG_PGTABLE_LEVELS > 3 */
Then run the following test:
stress-ng --timeout 60 --times --verify --metrics --no-rand-seed --mmapaddr 64
The test results are as follows:
root@...ian:~# stress-ng --timeout 60 --times --verify --metrics --no-rand-seed --mmapaddr 64
stress-ng: info: [870] dispatching hogs: 64 mmapaddr
stress-ng: info: [870] successful run completed in 60.07s (1 min, 0.07 secs)
stress-ng: info: [870] stressor       bogo ops real time  usr time  sys time   bogo ops/s     bogo ops/s
stress-ng: info: [870]                           (secs)    (secs)    (secs)   (real time) (usr+sys time)
stress-ng: info: [870] mmapaddr       17841978     60.01    237.78   1130.36    297306.42       13041.05
stress-ng: info: [870] for a 60.07s run time:
stress-ng: info: [870] 1441.79s available CPU time
stress-ng: info: [870] 238.14s user time ( 16.52%)
stress-ng: info: [870] 1130.80s system time ( 78.43%)
stress-ng: info: [870] 1368.94s total time ( 94.95%)
stress-ng: info: [870] load average: 57.42 21.77 7.97
The perf hotspots are as follows:
15.59% [kernel] [k] _raw_spin_unlock_irqrestore
9.14% [kernel] [k] clear_page_rep
7.17% [kernel] [k] do_syscall_64
3.69% [kernel] [k] _raw_spin_lock
3.37% [kernel] [k] __slab_free
2.06% [kernel] [k] rcu_cblist_dequeue
2.01% [kernel] [k] flush_tlb_mm_range
1.84% [kernel] [k] lruvec_stat_mod_folio.part.131
1.79% [kernel] [k] get_page_from_freelist
1.64% [kernel] [k] kmem_cache_alloc_noprof
1.53% [kernel] [k] tlb_remove_table_rcu
1.48% [kernel] [k] mtree_range_walk
The call stacks are as follows:
@[
_raw_spin_unlock_irqrestore+5
free_one_page+85
rcu_do_batch+424
rcu_core+401
handle_softirqs+204
irq_exit_rcu+208
sysvec_apic_timer_interrupt+113
asm_sysvec_apic_timer_interrupt+26
_raw_spin_unlock_irqrestore+29
get_page_from_freelist+2014
__alloc_frozen_pages_noprof+364
alloc_pages_mpol+123
alloc_pages_noprof+14
pte_alloc_one+30
__pte_alloc+42
do_pte_missing+2493
__handle_mm_fault+1914
handle_mm_fault+195
__get_user_pages+690
populate_vma_page_range+127
__mm_populate+159
vm_mmap_pgoff+329
do_syscall_64+98
entry_SYSCALL_64_after_hwframe+118
, stress-ng-mmapa]: 1306
@[
_raw_spin_unlock_irqrestore+5
get_page_from_freelist+2014
__alloc_frozen_pages_noprof+364
alloc_pages_mpol+123
alloc_pages_noprof+14
pte_alloc_one+30
__pte_alloc+42
move_page_tables+2285
move_vma+472
__do_sys_mremap+1759
do_syscall_64+98
entry_SYSCALL_64_after_hwframe+118
, stress-ng-mmapa]: 1536
@[
_raw_spin_unlock_irqrestore+5
free_one_page+85
tlb_remove_table_rcu+140
rcu_do_batch+424
rcu_core+401
handle_softirqs+204
irq_exit_rcu+208
sysvec_apic_timer_interrupt+113
asm_sysvec_apic_timer_interrupt+26
_raw_spin_unlock_irqrestore+29
get_page_from_freelist+2014
__alloc_frozen_pages_noprof+364
alloc_pages_mpol+123
alloc_pages_noprof+14
get_free_pages_noprof+17
tlb_remove_table+82
free_pgd_range+655
free_pgtables+601
vms_clear_ptes.part.39+255
vms_complete_munmap_vmas+311
do_vmi_align_munmap+419
do_vmi_munmap+195
move_vma+802
__do_sys_mremap+1759
do_syscall_64+98
entry_SYSCALL_64_after_hwframe+118
, stress-ng-mmapa]: 1558
@[
_raw_spin_unlock_irqrestore+5
__hrtimer_run_queues+255
hrtimer_interrupt+258
__sysvec_apic_timer_interrupt+85
sysvec_apic_timer_interrupt+56
asm_sysvec_apic_timer_interrupt+26
, stress-ng-mmapa]: 1772
@[
_raw_spin_unlock_irqrestore+5
get_partial_node.part.102+378
___slab_alloc.part.103+1180
__slab_alloc.isra.104+34
kmem_cache_alloc_noprof+192
mas_alloc_nodes+358
mas_preallocate+151
__mmap_region+1883
do_mmap+1164
vm_mmap_pgoff+239
do_syscall_64+98
entry_SYSCALL_64_after_hwframe+118
, stress-ng-mmapa]: 2654
@[
_raw_spin_unlock_irqrestore+5
get_partial_node.part.102+378
___slab_alloc.part.103+1180
__slab_alloc.isra.104+34
kmem_cache_alloc_noprof+192
mas_alloc_nodes+358
mas_store_gfp+183
do_vmi_align_munmap+398
do_vmi_munmap+195
__vm_munmap+177
__x64_sys_munmap+27
do_syscall_64+98
entry_SYSCALL_64_after_hwframe+118
, stress-ng-mmapa]: 2810
@[
_raw_spin_unlock_irqrestore+5
free_one_page+85
tlb_remove_table_rcu+140
rcu_do_batch+424
rcu_core+401
handle_softirqs+204
irq_exit_rcu+208
sysvec_apic_timer_interrupt+113
asm_sysvec_apic_timer_interrupt+26
_raw_spin_unlock_irqrestore+29
get_page_from_freelist+2014
__alloc_frozen_pages_noprof+364
alloc_pages_mpol+123
alloc_pages_noprof+14
pte_alloc_one+30
__pte_alloc+42
do_pte_missing+2493
__handle_mm_fault+1914
handle_mm_fault+195
__get_user_pages+690
populate_vma_page_range+127
__mm_populate+159
vm_mmap_pgoff+329
do_syscall_64+98
entry_SYSCALL_64_after_hwframe+118
, stress-ng-mmapa]: 3044
@Error looking up stack id 4294967279 (pid -1): -1
[, stress-ng-mmapa]: 101654
Thanks!
Powered by blists - more mailing lists