Message-ID: <84f5bb25-fcdd-4509-aae6-de9a5bd49e86@vivo.com>
Date: Wed, 4 Sep 2024 19:38:24 +0800
From: zhiguojiang <justinjiang@...o.com>
To: Barry Song <21cnbao@...il.com>
Cc: Andrew Morton <akpm@...ux-foundation.org>, linux-mm@...ck.org,
linux-kernel@...r.kernel.org, Will Deacon <will@...nel.org>,
"Aneesh Kumar K.V" <aneesh.kumar@...nel.org>, Nick Piggin
<npiggin@...il.com>, Peter Zijlstra <peterz@...radead.org>,
Arnd Bergmann <arnd@...db.de>, Johannes Weiner <hannes@...xchg.org>,
Michal Hocko <mhocko@...nel.org>, Roman Gushchin <roman.gushchin@...ux.dev>,
Shakeel Butt <shakeel.butt@...ux.dev>, Muchun Song <muchun.song@...ux.dev>,
linux-arch@...r.kernel.org, cgroups@...r.kernel.org,
David Hildenbrand <david@...hat.com>, opensource.kernel@...o.com
Subject: Re: [PATCH v3 2/2] mm: tlb: add tlb swap entries batch async release
On 2024/9/4 19:26, zhiguojiang wrote:
>
>
> On 2024/9/4 17:16, Barry Song wrote:
>> On Tue, Aug 6, 2024 at 3:36 AM Zhiguo Jiang <justinjiang@...o.com>
>> wrote:
>>> One of the main reasons for the prolonged exit of a process with an
>>> independent mm is the time-consuming release of its swap entries.
>>> The proportion of swap memory occupied by the process increases over
>>> time, because high memory pressure triggers reclaim of anonymous
>>> folios into swap space; on Android devices, for example, we found
>>> this proportion can reach 60% or more after a period of time.
>>> Additionally, the relatively long path for releasing swap entries
>>> further increases the time required to release them.
>>>
>>> Testing Platform: 8GB RAM
>>> Testing procedure:
>>> After booting up, start 15 processes first, and then observe the
>>> physical memory size occupied by the last launched process at different
>>> time points.
>>> Example: The process launched last: com.qiyi.video
>>> | memory type | 0min | 1min | 5min | 10min | 15min |
>>> -------------------------------------------------------------------
>>> | VmRSS(KB) | 453832 | 252300 | 204364 | 199944 | 199748 |
>>> | RssAnon(KB) | 247348 | 99296 | 71268 | 67808 | 67660 |
>>> | RssFile(KB) | 205536 | 152020 | 132144 | 131184 | 131136 |
>>> | RssShmem(KB) | 1048 | 984 | 952 | 952 | 952 |
>>> | VmSwap(KB) | 202692 | 334852 | 362880 | 366340 | 366488 |
>>> | Swap ratio(%) | 30.87% | 57.03% | 63.97% | 64.69% | 64.72% |
>>> Note: min - minute.
>>>
>>> When there are multiple processes with independent mm and high memory
>>> pressure in the system, launching a process that requires a large
>>> amount of memory is likely to trigger the near-instantaneous killing
>>> of many processes with independent mm. These exiting processes then
>>> occupy multiple CPU cores for concurrent execution, leading to issues
>>> such as lagging in the important, non-exiting processes.
>>>
>>> To solve this problem, we introduce an asynchronous release mechanism
>>> for the swap entries of multiple exiting processes: the swap entries
>>> occupied by the exiting processes are isolated, cached, and handed
>>> over to an asynchronous kworker, which completes the release. This
>>> allows the exiting processes to finish quickly and release their CPU
>>> resources. We have validated this modification on Android products
>>> and achieved the expected benefits.
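
For orientation, below is a minimal sketch of the batch-and-defer pattern
the patch builds on. It is not the patch itself: the demo_* names, the
fixed batch size and the GFP flags are illustrative assumptions, while
free_swap_and_cache_nr() and the workqueue calls are the same interfaces
the hunks further down use.

/*
 * Minimal sketch (not the patch): the teardown path only appends swap
 * entries to a batch, and a dedicated kworker performs the expensive
 * free_swap_and_cache_nr() calls later.  demo_wq is assumed to have
 * been created at init time with alloc_workqueue(), as the patch does
 * for its swapfree_wq.
 */
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/swap.h>

#define DEMO_BATCH_MAX 64

struct demo_swap_batch {
	struct work_struct work;
	unsigned int nr;
	swp_entry_t entries[DEMO_BATCH_MAX];
};

static struct workqueue_struct *demo_wq;

static void demo_free_work(struct work_struct *w)
{
	struct demo_swap_batch *batch =
		container_of(w, struct demo_swap_batch, work);
	unsigned int i;

	/* The costly part runs here, off the exiting task's CPU. */
	for (i = 0; i < batch->nr; i++)
		free_swap_and_cache_nr(batch->entries[i], 1);
	kfree(batch);
}

/*
 * Called from the teardown path instead of freeing synchronously.
 * Returns false if no batch could be allocated, in which case the
 * caller falls back to the normal synchronous free.
 */
static bool demo_defer_swap_free(struct demo_swap_batch **batchp,
				 swp_entry_t entry)
{
	struct demo_swap_batch *batch = *batchp;

	if (!batch) {
		batch = kmalloc(sizeof(*batch), GFP_ATOMIC);
		if (!batch)
			return false;
		batch->nr = 0;
		INIT_WORK(&batch->work, demo_free_work);
		*batchp = batch;
	}

	batch->entries[batch->nr++] = entry;
	if (batch->nr == DEMO_BATCH_MAX) {
		/* Hand the full batch to the kworker and start over. */
		queue_work(demo_wq, &batch->work);
		*batchp = NULL;
	}
	return true;
}

A partially filled batch must still be queued when the teardown finishes;
in the patch this is what __tlb_batch_swap_finish() does from
tlb_finish_mmu().
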
>>>
>>> Testing Platform: 8GB RAM
>>> Testing procedure:
>>> After restarting the machine, start 15 app processes first, and then
>>> start the camera app process; we monitor the cold start and preview
>>> times of the camera app process.
>>>
>>> Test data for the camera process cold start time (unit: milliseconds):
>>> | seq | 1 | 2 | 3 | 4 | 5 | 6 | average |
>>> | before | 1498 | 1476 | 1741 | 1337 | 1367 | 1655 | 1512 |
>>> | after | 1396 | 1107 | 1136 | 1178 | 1071 | 1339 | 1204 |
>>>
>>> Test data for the camera process preview time (unit: milliseconds):
>>> | seq | 1 | 2 | 3 | 4 | 5 | 6 | average |
>>> | before | 267 | 402 | 504 | 513 | 161 | 265 | 352 |
>>> | after | 188 | 223 | 301 | 203 | 162 | 154 | 205 |
>>>
>>> Based on the averages of the six test runs above, the benefits of the
>>> modified patch are:
>>> 1. The cold start time of the camera app has been reduced by about 20%.
>>> 2. The preview time of the camera app has been reduced by about 42%.
>>>
>>> It offers several benefits:
>>> 1. Alleviate the high system CPU load caused by multiple exiting
>>> processes running simultaneously.
>>> 2. Reduce lock contention in the swap entry free path by using an
>>> asynchronous kworker instead of multiple exiting processes executing
>>> in parallel.
>>> 3. Release pte_present memory occupied by exiting processes more
>>> efficiently.
>>>
>>> Signed-off-by: Zhiguo Jiang <justinjiang@...o.com>
>>> ---
>>> arch/s390/include/asm/tlb.h | 8 +
>>> include/asm-generic/tlb.h | 44 ++++++
>>> include/linux/mm_types.h | 58 +++++++
>>> mm/memory.c | 3 +-
>>> mm/mmu_gather.c | 296 ++++++++++++++++++++++++++++++++++++
>>> 5 files changed, 408 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h
>>> index e95b2c8081eb..3f681f63390f
>>> --- a/arch/s390/include/asm/tlb.h
>>> +++ b/arch/s390/include/asm/tlb.h
>>> @@ -28,6 +28,8 @@ static inline bool __tlb_remove_page_size(struct
>>> mmu_gather *tlb,
>>> struct page *page, bool delay_rmap, int page_size);
>>> static inline bool __tlb_remove_folio_pages(struct mmu_gather *tlb,
>>> struct page *page, unsigned int nr_pages, bool
>>> delay_rmap);
>>> +static inline bool __tlb_remove_swap_entries(struct mmu_gather *tlb,
>>> + swp_entry_t entry, int nr);
>>>
>>> #define tlb_flush tlb_flush
>>> #define pte_free_tlb pte_free_tlb
>>> @@ -69,6 +71,12 @@ static inline bool
>>> __tlb_remove_folio_pages(struct mmu_gather *tlb,
>>> return false;
>>> }
>>>
>>> +static inline bool __tlb_remove_swap_entries(struct mmu_gather *tlb,
>>> + swp_entry_t entry, int nr)
>>> +{
>>> + return false;
>>> +}
>>> +
>>> static inline void tlb_flush(struct mmu_gather *tlb)
>>> {
>>> __tlb_flush_mm_lazy(tlb->mm);
>>> diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
>>> index 709830274b75..8b4d516b35b8
>>> --- a/include/asm-generic/tlb.h
>>> +++ b/include/asm-generic/tlb.h
>>> @@ -294,6 +294,37 @@ extern void tlb_flush_rmaps(struct mmu_gather
>>> *tlb, struct vm_area_struct *vma);
>>> static inline void tlb_flush_rmaps(struct mmu_gather *tlb, struct
>>> vm_area_struct *vma) { }
>>> #endif
>>>
>>> +#ifndef CONFIG_MMU_GATHER_NO_GATHER
>>> +struct mmu_swap_batch {
>>> + struct mmu_swap_batch *next;
>>> + unsigned int nr;
>>> + unsigned int max;
>>> + encoded_swpentry_t encoded_entrys[];
>>> +};
>>> +
>>> +#define MAX_SWAP_GATHER_BATCH \
>>> + ((PAGE_SIZE - sizeof(struct mmu_swap_batch)) / sizeof(void *))
>>> +
>>> +#define MAX_SWAP_GATHER_BATCH_COUNT (10000UL / MAX_SWAP_GATHER_BATCH)
>>> +
>>> +struct mmu_swap_gather {
>>> + /*
>>> + * the asynchronous kworker to batch
>>> + * release swap entries
>>> + */
>>> + struct work_struct free_work;
>>> +
>>> + /* batch cache swap entries */
>>> + unsigned int batch_count;
>>> + struct mmu_swap_batch *active;
>>> + struct mmu_swap_batch local;
>>> + encoded_swpentry_t __encoded_entrys[MMU_GATHER_BUNDLE];
>>> +};
>>> +
>>> +bool __tlb_remove_swap_entries(struct mmu_gather *tlb,
>>> + swp_entry_t entry, int nr);
>>> +#endif
>>> +
>>> /*
>>> * struct mmu_gather is an opaque type used by the mm code for
>>> passing around
>>> * any data needed by arch specific code for tlb_remove_page.
>>> @@ -343,6 +374,18 @@ struct mmu_gather {
>>> unsigned int vma_exec : 1;
>>> unsigned int vma_huge : 1;
>>> unsigned int vma_pfn : 1;
>>> +#ifndef CONFIG_MMU_GATHER_NO_GATHER
>>> + /*
>>> + * Two states of releasing swap entries
>>> + * asynchronously:
>>> + * swp_freeable - may release its swap entries
>>> + * asynchronously in the future
>>> + * swp_freeing - is releasing them asynchronously.
>>> + */
>>> + unsigned int swp_freeable : 1;
>>> + unsigned int swp_freeing : 1;
>>> + unsigned int swp_disable : 1;
>>> +#endif
>>>
>>> unsigned int batch_count;
>>>
>>> @@ -354,6 +397,7 @@ struct mmu_gather {
>>> #ifdef CONFIG_MMU_GATHER_PAGE_SIZE
>>> unsigned int page_size;
>>> #endif
>>> + struct mmu_swap_gather *swp;
>>> #endif
>>> };
>>>
>>> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
>>> index 165c58b12ccc..2f66303f1519
>>> --- a/include/linux/mm_types.h
>>> +++ b/include/linux/mm_types.h
>>> @@ -283,6 +283,64 @@ typedef struct {
>>> unsigned long val;
>>> } swp_entry_t;
>>>
>>> +/*
>>> + * encoded_swpentry_t - a type marking the encoded swp_entry_t.
>>> + *
>>> + * An 'encoded_swpentry_t' represents a 'swp_entry_t' with its highest
>>> + * bit indicating extra context-dependent information. It is only used
>>> + * in the swp_entry asynchronous release path by mmu_swap_gather.
>>> + */
>>> +typedef struct {
>>> + unsigned long val;
>>> +} encoded_swpentry_t;
>>> +
>>> +/*
>>> + * If this bit is set, the next item in the encoded_swpentry_t array
>>> + * is the "nr" argument, specifying the total number of consecutive
>>> + * swap entries associated with the same folio. If this bit is not
>>> + * set, "nr" is implicitly 1.
>>> + *
>>> + * Refer to include/asm/pgtable.h: swp_offset uses bits 0 ~ 57 and
>>> + * swp_type uses bits 58 ~ 62, so bit 63 can be used here.
>>> + */
>>> +#define ENCODED_SWPENTRY_BIT_NR_ENTRYS_NEXT (1UL << (BITS_PER_LONG - 1))
>>> +
>>> +static __always_inline encoded_swpentry_t
>>> +encode_swpentry(swp_entry_t entry, unsigned long flags)
>>> +{
>>> + encoded_swpentry_t ret;
>>> +
>>> + VM_WARN_ON_ONCE(flags & ~ENCODED_SWPENTRY_BIT_NR_ENTRYS_NEXT);
>>> + ret.val = flags | entry.val;
>>> + return ret;
>>> +}
>>> +
>>> +static inline unsigned long
>>> encoded_swpentry_flags(encoded_swpentry_t entry)
>>> +{
>>> + return ENCODED_SWPENTRY_BIT_NR_ENTRYS_NEXT & entry.val;
>>> +}
>>> +
>>> +static inline swp_entry_t encoded_swpentry_data(encoded_swpentry_t
>>> entry)
>>> +{
>>> + swp_entry_t ret;
>>> +
>>> + ret.val = ~ENCODED_SWPENTRY_BIT_NR_ENTRYS_NEXT & entry.val;
>>> + return ret;
>>> +}
>>> +
>>> +static __always_inline encoded_swpentry_t
>>> encode_nr_swpentrys(unsigned long nr)
>>> +{
>>> + encoded_swpentry_t ret;
>>> +
>>> + VM_WARN_ON_ONCE(nr & ENCODED_SWPENTRY_BIT_NR_ENTRYS_NEXT);
>>> + ret.val = nr;
>>> + return ret;
>>> +}
>>> +
>>> +static __always_inline unsigned long
>>> encoded_nr_swpentrys(encoded_swpentry_t entry)
>>> +{
>>> + return ((~ENCODED_SWPENTRY_BIT_NR_ENTRYS_NEXT) & entry.val);
>>> +}
>>> +
>>> /**
>>> * struct folio - Represents a contiguous set of bytes.
>>> * @flags: Identical to the page flags.
>>> diff --git a/mm/memory.c b/mm/memory.c
>>> index d6a9dcddaca4..023a8adcb67c
>>> --- a/mm/memory.c
>>> +++ b/mm/memory.c
>>> @@ -1650,7 +1650,8 @@ static unsigned long zap_pte_range(struct
>>> mmu_gather *tlb,
>>> if (!should_zap_cows(details))
>>> continue;
>>> rss[MM_SWAPENTS] -= nr;
>>> - free_swap_and_cache_nr(entry, nr);
>>> + if (!__tlb_remove_swap_entries(tlb, entry, nr))
>>> + free_swap_and_cache_nr(entry, nr);
>>> } else if (is_migration_entry(entry)) {
>>> folio = pfn_swap_entry_folio(entry);
>>> if (!should_zap_folio(details, folio))
>>> diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
>>> index 99b3e9408aa0..33dc9d1faff9
>>> --- a/mm/mmu_gather.c
>>> +++ b/mm/mmu_gather.c
>>> @@ -9,11 +9,303 @@
>>> #include <linux/smp.h>
>>> #include <linux/swap.h>
>>> #include <linux/rmap.h>
>>> +#include <linux/oom.h>
>>>
>>> #include <asm/pgalloc.h>
>>> #include <asm/tlb.h>
>>>
>>> #ifndef CONFIG_MMU_GATHER_NO_GATHER
>>> +/*
>>> + * The swp_entry asynchronous release mechanism for multiple
>>> processes with
>>> + * independent mm exiting simultaneously.
>>> + *
>>> + * During the multiple exiting processes releasing their own mm
>>> simultaneously,
>>> + * the swap entries in the exiting processes are handled by
>>> isolating, caching
>>> + * and handing over to an asynchronous kworker to complete the
>>> release.
>>> + *
>>> + * The conditions for the exiting process entering the swp_entry
>>> asynchronous
>>> + * release path:
>>> + * 1. The exiting process's MM_SWAPENTS count is >= SWAP_CLUSTER_MAX,
>>> + * to avoid allocating struct mmu_swap_gather too frequently.
>>> + * 2. The number of exiting processes is >= NR_MIN_EXITING_PROCESSES.
>> Hi Zhiguo,
>>
>> I'm curious about the significance of NR_MIN_EXITING_PROCESSES. It
>> seems that swap entry freeing, even with one process, could be a
>> bottleneck for that single process, based on the data from this patch:
>>
>> mm: attempt to batch free swap entries for zap_pte_range()
>> https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git/commit/?h=mm-stable&id=bea67dcc5ee
>>
>> "munmap bandwidth becomes 3X faster."
>>
>> So what would happen if you simply set NR_MIN_EXITING_PROCESSES to 1?
> Hi Barry,
>
> Thanks for your comments.
>
> The reason for NR_MIN_EXITING_PROCESSES = 2 is that we previously
> conducted a continuous startup performance test of multiple Android
> apps with NR_MIN_EXITING_PROCESSES = 1, and the results showed that
> the startup time deteriorated slightly. However, the patch's logic in
> that test was different from the one I submitted to the community, so
> the regression may have been due to other issues in the old patch.
>
> We have conducted the relevant memory performance tests on the patches
> submitted to the community (NR_MIN_EXITING_PROCESSES=2), and the
> results are better than before the modification. The patches have
> been used on multiple projects.
> For example:
> Test the time consumption and subjective fluency of launching 30
> Android apps continuously for two rounds.
> Test machine: RAM 16GB
> | | time(s) | Frame-dropped rate(%) |
> | before | 230.76 | 0.54 |
> | after | 225.23 | 0.74 |
Sorry, updated data:
Test machine: RAM 16GB
| | time(s) | Frame-dropped rate(%) |
| before | 230.76 | 0.74 |
| after | 225.23 | 0.54 |
> We can see that the patch reduces the startup time by 5.53 s, lowers
> the frame-dropped rate by 0.2%, and gives a better subjective
> smoothness experience.
>
> Perhaps the patches submitted to the community also improve the
> continuous startup performance of multiple Android apps in the case
> of NR_MIN_EXITING_PROCESSES=1. If necessary, I will conduct the
> relevant tests to verify this in the future.
>>
>>> + *
>>> + * Since the check on the number of exiting processes happens
>>> + * dynamically, an exiting process may enter the swp_entry
>>> + * asynchronous release either at the beginning or in the middle of
>>> + * its swp_entry release path.
>>> + *
>>> + * Once an exiting process enters the swp_entry asynchronous release,
>>> + * all of its remaining swap entries should, in principle, be released
>>> + * by the asynchronous kworker.
>> Freeing a slot can indeed release memory from `zRAM`, potentially
>> returning
>> it to the system for allocation. Your patch frees swap slots
>> asynchronously;
>> I assume this doesn’t slow down the memory freeing process for
>> `zRAM`, or
>> could it even slow down the freeing of `zRAM` memory? Freeing compressed
>> memory might not be as crucial compared to freeing uncompressed
>> memory with
>> present PTEs?
> Yes, freeing uncompressed memory with present PTEs is more important
> than freeing compressed 'zRAM' memory.
>
> I guess that multiple exiting processes releasing swap entries
> simultaneously may cause contention on swap_info->lock in
> swapcache_free_entries(), affecting the efficiency of releasing swap
> entries. With the asynchronous kworker this issue can be avoided,
> although the improvement is perhaps minor.
>
> The freeing of zRAM memory does not slow down. We have observed traces
> in the camera startup scene and found that the asynchronous kworker
> can release all swap entries before the camera preview starts. Without
> the asynchronous kworker, by contrast, the exiting processes only
> completed after the camera preview had started.
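
To make the contention argument above concrete, here is a small
user-space analogue in C with pthreads. The si_lock mutex, the thread
and entry counts, and the two patterns are invented purely for
illustration; they are not measurements of, or claims about, the real
swapcache_free_entries() path.

/*
 * User-space analogue of the lock-contention argument: "si_lock"
 * stands in for swap_info->lock.
 * Pattern A: several exiting threads free their entries directly and
 * contend on the shared lock.
 * Pattern B: the same total work is done by one drain thread, so the
 * lock is never contended.
 */
#include <pthread.h>
#include <stdio.h>
#include <time.h>

#define NR_EXITERS 4
#define NR_ENTRIES 2000000L

static pthread_mutex_t si_lock = PTHREAD_MUTEX_INITIALIZER;
static long freed;

static double now_ms(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000.0 + ts.tv_nsec / 1e6;
}

static void *free_entries(void *arg)
{
	long n = *(long *)arg;

	for (long i = 0; i < n; i++) {
		pthread_mutex_lock(&si_lock);	/* per-entry "swap_info" lock */
		freed++;			/* stands in for freeing one entry */
		pthread_mutex_unlock(&si_lock);
	}
	return NULL;
}

int main(void)
{
	pthread_t t[NR_EXITERS];
	long per_thread = NR_ENTRIES, total = NR_EXITERS * NR_ENTRIES;
	double t0;
	int i;

	/* Pattern A: NR_EXITERS threads contend on si_lock. */
	freed = 0;
	t0 = now_ms();
	for (i = 0; i < NR_EXITERS; i++)
		pthread_create(&t[i], NULL, free_entries, &per_thread);
	for (i = 0; i < NR_EXITERS; i++)
		pthread_join(t[i], NULL);
	printf("pattern A: freed %ld entries in %.1f ms (contended)\n",
	       freed, now_ms() - t0);

	/* Pattern B: one drain thread does the same total work alone. */
	freed = 0;
	t0 = now_ms();
	pthread_create(&t[0], NULL, free_entries, &total);
	pthread_join(t[0], NULL);
	printf("pattern B: freed %ld entries in %.1f ms (uncontended)\n",
	       freed, now_ms() - t0);
	return 0;
}

Compiled with "cc -O2 -pthread", pattern A is often slower than pattern B
on a multi-core machine despite using more CPUs, which is the effect the
single kworker is hoping to capture; how much this matters for the real
swap path is, as noted above, uncertain.
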
>>
>>> + *
>>> + * The benefits of the swp_entry asynchronous release:
>>> + * 1. Alleviate the high system CPU load caused by multiple exiting
>>> + * processes running simultaneously.
>>> + * 2. Reduce lock contention in the swap entry free path by using an
>>> + * asynchronous kworker instead of multiple exiting processes
>>> + * executing in parallel.
>>> + * 3. Release pte_present memory occupied by exiting processes more
>>> + * efficiently.
>>> + */
>>> +
>>> +/*
>>> + * The min number of exiting processes required for swp_entry
>>> asynchronous release
>>> + */
>>> +#define NR_MIN_EXITING_PROCESSES 2
>>> +
>>> +static atomic_t nr_exiting_processes = ATOMIC_INIT(0);
>>> +static struct kmem_cache *swap_gather_cachep;
>>> +static struct workqueue_struct *swapfree_wq;
>>> +static DEFINE_STATIC_KEY_TRUE(tlb_swap_asyncfree_disabled);
>>> +
>>> +static int __init tlb_swap_async_free_setup(void)
>>> +{
>>> + swapfree_wq = alloc_workqueue("smfree_wq", WQ_UNBOUND |
>>> + WQ_HIGHPRI | WQ_MEM_RECLAIM, 1);
>>> + if (!swapfree_wq)
>>> + goto fail;
>>> +
>>> + swap_gather_cachep = kmem_cache_create("swap_gather",
>>> + sizeof(struct mmu_swap_gather),
>>> + 0, SLAB_TYPESAFE_BY_RCU | SLAB_PANIC | SLAB_ACCOUNT,
>>> + NULL);
>>> + if (!swap_gather_cachep)
>>> + goto kcache_fail;
>>> +
>>> + static_branch_disable(&tlb_swap_asyncfree_disabled);
>>> + return 0;
>>> +
>>> +kcache_fail:
>>> + destroy_workqueue(swapfree_wq);
>>> +fail:
>>> + return -ENOMEM;
>>> +}
>>> +postcore_initcall(tlb_swap_async_free_setup);
>>> +
>>> +static void __tlb_swap_gather_free(struct mmu_swap_gather
>>> *swap_gather)
>>> +{
>>> + struct mmu_swap_batch *swap_batch, *next;
>>> +
>>> + for (swap_batch = swap_gather->local.next; swap_batch;
>>> swap_batch = next) {
>>> + next = swap_batch->next;
>>> + free_page((unsigned long)swap_batch);
>>> + }
>>> + swap_gather->local.next = NULL;
>>> + kmem_cache_free(swap_gather_cachep, swap_gather);
>>> +}
>>> +
>>> +static void tlb_swap_async_free_work(struct work_struct *w)
>>> +{
>>> + int i, nr_multi, nr_free;
>>> + swp_entry_t start_entry;
>>> + struct mmu_swap_batch *swap_batch;
>>> + struct mmu_swap_gather *swap_gather = container_of(w,
>>> + struct mmu_swap_gather, free_work);
>>> +
>>> + /* Release swap entries cached in mmu_swap_batch. */
>>> + for (swap_batch = &swap_gather->local; swap_batch &&
>>> swap_batch->nr;
>>> + swap_batch = swap_batch->next) {
>>> + nr_free = 0;
>>> + for (i = 0; i < swap_batch->nr; i++) {
>>> + if
>>> (unlikely(encoded_swpentry_flags(swap_batch->encoded_entrys[i]) &
>>> + ENCODED_SWPENTRY_BIT_NR_ENTRYS_NEXT)) {
>>> + start_entry =
>>> encoded_swpentry_data(swap_batch->encoded_entrys[i]);
>>> + nr_multi =
>>> encoded_nr_swpentrys(swap_batch->encoded_entrys[++i]);
>>> + free_swap_and_cache_nr(start_entry, nr_multi);
>>> + nr_free += 2;
>>> + } else {
>>> + start_entry =
>>> encoded_swpentry_data(swap_batch->encoded_entrys[i]);
>>> + free_swap_and_cache_nr(start_entry, 1);
>>> + nr_free++;
>>> + }
>>> + }
>>> + swap_batch->nr -= nr_free;
>>> + VM_BUG_ON(swap_batch->nr);
>>> + }
>>> + __tlb_swap_gather_free(swap_gather);
>>> +}
>>> +
>>> +static bool __tlb_swap_gather_mmu_check(struct mmu_gather *tlb)
>>> +{
>>> + /*
>>> + * Only the exiting processes with the MM_SWAPENTS counter >=
>>> + * SWAP_CLUSTER_MAX have the opportunity to release their swap
>>> + * entries by asynchronous kworker.
>>> + */
>>> + if (!task_is_dying() ||
>>> + get_mm_counter(tlb->mm, MM_SWAPENTS) < SWAP_CLUSTER_MAX)
>>> + return true;
>>> +
>>> + atomic_inc(&nr_exiting_processes);
>>> + if (atomic_read(&nr_exiting_processes) <
>>> NR_MIN_EXITING_PROCESSES)
>>> + tlb->swp_freeable = 1;
>>> + else
>>> + tlb->swp_freeing = 1;
>>> +
>>> + return false;
>>> +}
>>> +
>>> +/**
>>> + * __tlb_swap_gather_init - Initialize an mmu_swap_gather structure
>>> + * for swp_entry tear-down.
>>> + * @tlb: the mmu_gather that the new mmu_swap_gather structure belongs to
>>> + */
>>> +static bool __tlb_swap_gather_init(struct mmu_gather *tlb)
>>> +{
>>> + tlb->swp = kmem_cache_alloc(swap_gather_cachep, GFP_ATOMIC |
>>> GFP_NOWAIT);
>>> + if (unlikely(!tlb->swp))
>>> + return false;
>>> +
>>> + tlb->swp->local.next = NULL;
>>> + tlb->swp->local.nr = 0;
>>> + tlb->swp->local.max = ARRAY_SIZE(tlb->swp->__encoded_entrys);
>>> +
>>> + tlb->swp->active = &tlb->swp->local;
>>> + tlb->swp->batch_count = 0;
>>> +
>>> + INIT_WORK(&tlb->swp->free_work, tlb_swap_async_free_work);
>>> + return true;
>>> +}
>>> +
>>> +static void __tlb_swap_gather_mmu(struct mmu_gather *tlb)
>>> +{
>>> + if (static_branch_unlikely(&tlb_swap_asyncfree_disabled))
>>> + return;
>>> +
>>> + tlb->swp = NULL;
>>> + tlb->swp_freeable = 0;
>>> + tlb->swp_freeing = 0;
>>> + tlb->swp_disable = 0;
>>> +
>>> + if (__tlb_swap_gather_mmu_check(tlb))
>>> + return;
>>> +
>>> + /*
>>> + * If the exiting process meets the conditions of
>>> + * swp_entry asynchronous release, an mmu_swap_gather
>>> + * structure will be initialized.
>>> + */
>>> + if (tlb->swp_freeing)
>>> + __tlb_swap_gather_init(tlb);
>>> +}
>>> +
>>> +static void __tlb_swap_gather_queuework(struct mmu_gather *tlb,
>>> bool finish)
>>> +{
>>> + queue_work(swapfree_wq, &tlb->swp->free_work);
>>> + tlb->swp = NULL;
>>> + if (!finish)
>>> + __tlb_swap_gather_init(tlb);
>>> +}
>>> +
>>> +static bool __tlb_swap_next_batch(struct mmu_gather *tlb)
>>> +{
>>> + struct mmu_swap_batch *swap_batch;
>>> +
>>> + if (tlb->swp->batch_count == MAX_SWAP_GATHER_BATCH_COUNT)
>>> + goto free;
>>> +
>>> + swap_batch = (void *)__get_free_page(GFP_ATOMIC | GFP_NOWAIT);
>>> + if (unlikely(!swap_batch))
>>> + goto free;
>>> +
>>> + swap_batch->next = NULL;
>>> + swap_batch->nr = 0;
>>> + swap_batch->max = MAX_SWAP_GATHER_BATCH;
>>> +
>>> + tlb->swp->active->next = swap_batch;
>>> + tlb->swp->active = swap_batch;
>>> + tlb->swp->batch_count++;
>>> + return true;
>>> +free:
>>> + /* batch move to wq */
>>> + __tlb_swap_gather_queuework(tlb, false);
>>> + return false;
>>> +}
>>> +
>>> +/**
>>> + * __tlb_remove_swap_entries - isolate the swap entries of an exiting
>>> + * process and batch-cache them in struct mmu_swap_batch.
>>> + * @tlb: the current mmu_gather
>>> + * @entry: the swp_entry to be isolated and cached
>>> + * @nr: the number of consecutive entries starting from @entry
>>> + */
>>> +bool __tlb_remove_swap_entries(struct mmu_gather *tlb,
>>> + swp_entry_t entry, int nr)
>>> +{
>>> + struct mmu_swap_batch *swap_batch;
>>> + unsigned long flags = 0;
>>> + bool ret = false;
>>> +
>>> + if (tlb->swp_disable)
>>> + return ret;
>>> +
>>> + if (!tlb->swp_freeable && !tlb->swp_freeing)
>>> + return ret;
>>> +
>>> + if (tlb->swp_freeable) {
>>> + if (atomic_read(&nr_exiting_processes) <
>>> + NR_MIN_EXITING_PROCESSES)
>>> + return ret;
>>> + /*
>>> + * If the current number of exiting processes
>>> + * is >= NR_MIN_EXITING_PROCESSES, the exiting
>>> + * process with swp_freeable state will enter
>>> + * swp_freeing state to start releasing its
>>> + * remaining swap entries by the asynchronous
>>> + * kworker.
>>> + */
>>> + tlb->swp_freeable = 0;
>>> + tlb->swp_freeing = 1;
>>> + }
>>> +
>>> + VM_BUG_ON(tlb->swp_freeable || !tlb->swp_freeing);
>>> + if (!tlb->swp && !__tlb_swap_gather_init(tlb))
>>> + return ret;
>>> +
>>> + swap_batch = tlb->swp->active;
>>> + if (unlikely(swap_batch->nr >= swap_batch->max - 1)) {
>>> + __tlb_swap_gather_queuework(tlb, false);
>>> + return ret;
>>> + }
>>> +
>>> + if (likely(nr == 1)) {
>>> + swap_batch->encoded_entrys[swap_batch->nr++] =
>>> encode_swpentry(entry, flags);
>>> + } else {
>>> + flags |= ENCODED_SWPENTRY_BIT_NR_ENTRYS_NEXT;
>>> + swap_batch->encoded_entrys[swap_batch->nr++] =
>>> encode_swpentry(entry, flags);
>>> + swap_batch->encoded_entrys[swap_batch->nr++] =
>>> encode_nr_swpentrys(nr);
>>> + }
>>> + ret = true;
>>> +
>>> + if (swap_batch->nr >= swap_batch->max - 1) {
>>> + if (!__tlb_swap_next_batch(tlb))
>>> + goto exit;
>>> + swap_batch = tlb->swp->active;
>>> + }
>>> + VM_BUG_ON(swap_batch->nr > swap_batch->max - 1);
>>> +exit:
>>> + return ret;
>>> +}
>>> +
>>> +static void __tlb_batch_swap_finish(struct mmu_gather *tlb)
>>> +{
>>> + if (tlb->swp_disable)
>>> + return;
>>> +
>>> + if (!tlb->swp_freeable && !tlb->swp_freeing)
>>> + return;
>>> +
>>> + if (tlb->swp_freeable) {
>>> + tlb->swp_freeable = 0;
>>> + VM_BUG_ON(tlb->swp_freeing);
>>> + goto exit;
>>> + }
>>> + tlb->swp_freeing = 0;
>>> + if (unlikely(!tlb->swp))
>>> + goto exit;
>>> +
>>> + __tlb_swap_gather_queuework(tlb, true);
>>> +exit:
>>> + atomic_dec(&nr_exiting_processes);
>>> +}
>>>
>>> static bool tlb_next_batch(struct mmu_gather *tlb)
>>> {
>>> @@ -386,6 +678,9 @@ static void __tlb_gather_mmu(struct mmu_gather
>>> *tlb, struct mm_struct *mm,
>>> tlb->local.max = ARRAY_SIZE(tlb->__pages);
>>> tlb->active = &tlb->local;
>>> tlb->batch_count = 0;
>>> +
>>> + tlb->swp_disable = 1;
>>> + __tlb_swap_gather_mmu(tlb);
>>> #endif
>>> tlb->delayed_rmap = 0;
>>>
>>> @@ -466,6 +761,7 @@ void tlb_finish_mmu(struct mmu_gather *tlb)
>>>
>>> #ifndef CONFIG_MMU_GATHER_NO_GATHER
>>> tlb_batch_list_free(tlb);
>>> + __tlb_batch_swap_finish(tlb);
>>> #endif
>>> dec_tlb_flush_pending(tlb->mm);
>>> }
>>> --
>>> 2.39.0
>>>
>> Thanks
>> Barry
> Thanks
> Zhiguo
>