Message-ID: <20ed69ad-5dad-446b-9f01-86ad8b1c67fa@huawei.com>
Date: Thu, 15 Aug 2024 21:27:53 +0800
From: Kefeng Wang <wangkefeng.wang@...wei.com>
To: Kairui Song <ryncsn@...il.com>, Chuanhua Han <hanchuanhua@...o.com>, Barry
Song <21cnbao@...il.com>
CC: <akpm@...ux-foundation.org>, <linux-mm@...ck.org>,
<baolin.wang@...ux.alibaba.com>, <chrisl@...nel.org>, <david@...hat.com>,
<hannes@...xchg.org>, <hughd@...gle.com>, <kaleshsingh@...gle.com>,
<linux-kernel@...r.kernel.org>, <mhocko@...e.com>, <minchan@...nel.org>,
<nphamcs@...il.com>, <ryan.roberts@....com>, <senozhatsky@...omium.org>,
<shakeel.butt@...ux.dev>, <shy828301@...il.com>, <surenb@...gle.com>,
<v-songbaohua@...o.com>, <willy@...radead.org>, <xiang@...nel.org>,
<ying.huang@...el.com>, <yosryahmed@...gle.com>, <hch@...radead.org>
Subject: Re: [PATCH v6 2/2] mm: support large folios swap-in for zRAM-like
devices
On 2024/8/15 17:47, Kairui Song wrote:
> On Fri, Aug 2, 2024 at 8:21 PM Barry Song <21cnbao@...il.com> wrote:
>>
>> From: Chuanhua Han <hanchuanhua@...o.com>
>
> Hi Chuanhua,
>
>>
...
>> +
>> +static struct folio *alloc_swap_folio(struct vm_fault *vmf)
>> +{
>> + struct vm_area_struct *vma = vmf->vma;
>> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>> + unsigned long orders;
>> + struct folio *folio;
>> + unsigned long addr;
>> + swp_entry_t entry;
>> + spinlock_t *ptl;
>> + pte_t *pte;
>> + gfp_t gfp;
>> + int order;
>> +
>> + /*
>> + * If uffd is active for the vma we need per-page fault fidelity to
>> + * maintain the uffd semantics.
>> + */
>> + if (unlikely(userfaultfd_armed(vma)))
>> + goto fallback;
>> +
>> + /*
>> + * A large swapped out folio could be partially or fully in zswap. We
>> + * lack handling for such cases, so fallback to swapping in order-0
>> + * folio.
>> + */
>> + if (!zswap_never_enabled())
>> + goto fallback;
>> +
>> + entry = pte_to_swp_entry(vmf->orig_pte);
>> + /*
>> + * Get a list of all the (large) orders below PMD_ORDER that are enabled
>> + * and suitable for swapping THP.
>> + */
>> + orders = thp_vma_allowable_orders(vma, vma->vm_flags,
>> + TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1);
>> + orders = thp_vma_suitable_orders(vma, vmf->address, orders);
>> + orders = thp_swap_suitable_orders(swp_offset(entry), vmf->address, orders);
>> +
>> + if (!orders)
>> + goto fallback;
>> +
>> + pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address & PMD_MASK, &ptl);
>> + if (unlikely(!pte))
>> + goto fallback;
>> +
>> + /*
>> + * For do_swap_page, find the highest order where the aligned range is
>> + * completely swap entries with contiguous swap offsets.
>> + */
>> + order = highest_order(orders);
>> + while (orders) {
>> + addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
>> + if (can_swapin_thp(vmf, pte + pte_index(addr), 1 << order))
>> + break;
>> + order = next_order(&orders, order);
>> + }
>> +
>> + pte_unmap_unlock(pte, ptl);
>> +
>> + /* Try allocating the highest of the remaining orders. */
>> + gfp = vma_thp_gfp_mask(vma);
>> + while (orders) {
>> + addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
>> + folio = vma_alloc_folio(gfp, order, vma, addr, true);
>> + if (folio)
>> + return folio;
>> + order = next_order(&orders, order);
>> + }
>> +
>> +fallback:
>> +#endif
>> + return vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address, false);
>> +}
>> +
>> +
>> /*
>> * We enter with non-exclusive mmap_lock (to exclude vma changes,
>> * but allow concurrent faults), and pte mapped but not yet locked.
>> @@ -4074,35 +4220,37 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>> if (!folio) {
>> if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
>> __swap_count(entry) == 1) {
>> - /*
>> - * Prevent parallel swapin from proceeding with
>> - * the cache flag. Otherwise, another thread may
>> - * finish swapin first, free the entry, and swapout
>> - * reusing the same entry. It's undetectable as
>> - * pte_same() returns true due to entry reuse.
>> - */
>> - if (swapcache_prepare(entry, 1)) {
>> - /* Relax a bit to prevent rapid repeated page faults */
>> - schedule_timeout_uninterruptible(1);
>> - goto out;
>> - }
>> - need_clear_cache = true;
>> -
>> /* skip swapcache */
>> - folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0,
>> - vma, vmf->address, false);
>> + folio = alloc_swap_folio(vmf);
>> page = &folio->page;
>> if (folio) {
>> __folio_set_locked(folio);
>> __folio_set_swapbacked(folio);
>>
>> + nr_pages = folio_nr_pages(folio);
>> + if (folio_test_large(folio))
>> + entry.val = ALIGN_DOWN(entry.val, nr_pages);
>> + /*
>> + * Prevent parallel swapin from proceeding with
>> + * the cache flag. Otherwise, another thread may
>> + * finish swapin first, free the entry, and swapout
>> + * reusing the same entry. It's undetectable as
>> + * pte_same() returns true due to entry reuse.
>> + */
>> + if (swapcache_prepare(entry, nr_pages)) {
>> + /* Relax a bit to prevent rapid repeated page faults */
>> + schedule_timeout_uninterruptible(1);
>> + goto out_page;
>> + }
>> + need_clear_cache = true;
>> +
>> if (mem_cgroup_swapin_charge_folio(folio,
>> vma->vm_mm, GFP_KERNEL,
>> entry)) {
>> ret = VM_FAULT_OOM;
>> goto out_page;
>> }
>
> After your patch, with a kernel build test, I'm seeing the kernel
> log spammed like this:
> [ 101.048594] pagefault_out_of_memory: 95 callbacks suppressed
> [ 101.048599] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
> [ 101.059416] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
> [ 101.118575] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
> [ 101.125585] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
> [ 101.182501] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
> [ 101.215351] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
> [ 101.272822] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
> [ 101.403195] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
> ............
>
> And heavy performance loss for memcg-limited workloads with mTHP enabled.
>
> After some debugging, the problematic part is the
> mem_cgroup_swapin_charge_folio() call above.
> Under memcg pressure, the cgroup charge fails easily for mTHP. One 64k
> swapin requires much more aggressive reclaim to succeed.
>
> If I change MAX_RECLAIM_RETRIES from 16 to 512, the log spam is gone
> and mTHP swapin should have a much higher success rate. But this
> might not be the right way to fix it.
>
> For this particular issue, maybe you can change the charge order:
> try charging first; if it succeeds, use mTHP, and if it fails, fall
> back to 4k?
This is what we did in alloc_anon_folio(); see commit 085ff35e7636
("mm: memory: move mem_cgroup_charge() into alloc_anon_folio()"):
1) fall back earlier
2) use the same GFP flags for allocation and charge
but it seems a little more complicated for the swapin charge.
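Something roughly like the untested sketch below is what I mean; the
helper name alloc_swap_folio_charged() and its exact placement are only
illustrative, not part of this patch:

/*
 * Untested sketch only: do the memcg charge inside the allocation loop
 * with the same gfp flags, and fall back to the next lower order
 * (finally order-0) when either the allocation or the charge fails,
 * like alloc_anon_folio() does since 085ff35e7636. The entry alignment
 * for large folios (entry.val = ALIGN_DOWN(entry.val, nr_pages)) is
 * glossed over here and would still need to be handled.
 */
static struct folio *alloc_swap_folio_charged(struct vm_fault *vmf,
					      swp_entry_t entry,
					      unsigned long orders)
{
	struct vm_area_struct *vma = vmf->vma;
	gfp_t gfp = vma_thp_gfp_mask(vma);
	struct folio *folio;
	unsigned long addr;
	int order;

	order = highest_order(orders);
	while (orders) {
		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
		folio = vma_alloc_folio(gfp, order, vma, addr, true);
		if (folio) {
			/* Charge with the same gfp used for the allocation. */
			if (!mem_cgroup_swapin_charge_folio(folio,
					vma->vm_mm, gfp, entry))
				return folio;
			/* Charge failed: free and try a lower order. */
			folio_put(folio);
		}
		order = next_order(&orders, order);
	}

	/* Order-0 fallback, charged by do_swap_page() as before. */
	return vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma,
			       vmf->address, false);
}

Using vma_thp_gfp_mask() for both the allocation and the charge keeps
the two consistent, which is point 2) above; whether the charge can
reasonably live inside the helper for the swapin path is the part that
still looks complicated to me.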
>
>> - mem_cgroup_swapin_uncharge_swap(entry, 1);
>> + mem_cgroup_swapin_uncharge_swap(entry, nr_pages);
>>
>> shadow = get_shadow_from_swap_cache(entry);
>> if (shadow)
>> @@ -4209,6 +4357,22 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>> goto out_nomap;
>> }
>>
>> + /* allocated large folios for SWP_SYNCHRONOUS_IO */
>> + if (folio_test_large(folio) && !folio_test_swapcache(folio)) {
>> + unsigned long nr = folio_nr_pages(folio);
>> + unsigned long folio_start = ALIGN_DOWN(vmf->address, nr * PAGE_SIZE);
>> + unsigned long idx = (vmf->address - folio_start) / PAGE_SIZE;
>> + pte_t *folio_ptep = vmf->pte - idx;
>> +
>> + if (!can_swapin_thp(vmf, folio_ptep, nr))
>> + goto out_nomap;
>> +
>> + page_idx = idx;
>> + address = folio_start;
>> + ptep = folio_ptep;
>> + goto check_folio;
>> + }
>> +
>> nr_pages = 1;
>> page_idx = 0;
>> address = vmf->address;
>> @@ -4340,11 +4504,12 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>> folio_add_lru_vma(folio, vma);
>> } else if (!folio_test_anon(folio)) {
>> /*
>> - * We currently only expect small !anon folios, which are either
>> - * fully exclusive or fully shared. If we ever get large folios
>> - * here, we have to be careful.
>> + * We currently only expect small !anon folios which are either
>> + * fully exclusive or fully shared, or new allocated large folios
>> + * which are fully exclusive. If we ever get large folios within
>> + * swapcache here, we have to be careful.
>> */
>> - VM_WARN_ON_ONCE(folio_test_large(folio));
>> + VM_WARN_ON_ONCE(folio_test_large(folio) && folio_test_swapcache(folio));
>> VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
>> folio_add_new_anon_rmap(folio, vma, address, rmap_flags);
>> } else {
>> @@ -4387,7 +4552,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>> out:
>> /* Clear the swap cache pin for direct swapin after PTL unlock */
>> if (need_clear_cache)
>> - swapcache_clear(si, entry, 1);
>> + swapcache_clear(si, entry, nr_pages);
>> if (si)
>> put_swap_device(si);
>> return ret;
>> @@ -4403,7 +4568,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>> folio_put(swapcache);
>> }
>> if (need_clear_cache)
>> - swapcache_clear(si, entry, 1);
>> + swapcache_clear(si, entry, nr_pages);
>> if (si)
>> put_swap_device(si);
>> return ret;
>> --
>> 2.34.1
>>
>>
>