Message-ID: <20ed69ad-5dad-446b-9f01-86ad8b1c67fa@huawei.com>
Date: Thu, 15 Aug 2024 21:27:53 +0800
From: Kefeng Wang <wangkefeng.wang@...wei.com>
To: Kairui Song <ryncsn@...il.com>, Chuanhua Han <hanchuanhua@...o.com>, Barry
Song <21cnbao@...il.com>
CC: <akpm@...ux-foundation.org>, <linux-mm@...ck.org>,
<baolin.wang@...ux.alibaba.com>, <chrisl@...nel.org>, <david@...hat.com>,
<hannes@...xchg.org>, <hughd@...gle.com>, <kaleshsingh@...gle.com>,
<linux-kernel@...r.kernel.org>, <mhocko@...e.com>, <minchan@...nel.org>,
<nphamcs@...il.com>, <ryan.roberts@....com>, <senozhatsky@...omium.org>,
<shakeel.butt@...ux.dev>, <shy828301@...il.com>, <surenb@...gle.com>,
<v-songbaohua@...o.com>, <willy@...radead.org>, <xiang@...nel.org>,
<ying.huang@...el.com>, <yosryahmed@...gle.com>, <hch@...radead.org>
Subject: Re: [PATCH v6 2/2] mm: support large folios swap-in for zRAM-like
devices
On 2024/8/15 17:47, Kairui Song wrote:
> On Fri, Aug 2, 2024 at 8:21 PM Barry Song <21cnbao@...il.com> wrote:
>>
>> From: Chuanhua Han <hanchuanhua@...o.com>
>
> Hi Chuanhua,
>
>>
...
>> +
>> +static struct folio *alloc_swap_folio(struct vm_fault *vmf)
>> +{
>> + struct vm_area_struct *vma = vmf->vma;
>> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>> + unsigned long orders;
>> + struct folio *folio;
>> + unsigned long addr;
>> + swp_entry_t entry;
>> + spinlock_t *ptl;
>> + pte_t *pte;
>> + gfp_t gfp;
>> + int order;
>> +
>> + /*
>> + * If uffd is active for the vma we need per-page fault fidelity to
>> + * maintain the uffd semantics.
>> + */
>> + if (unlikely(userfaultfd_armed(vma)))
>> + goto fallback;
>> +
>> + /*
>> + * A large swapped out folio could be partially or fully in zswap. We
>> + * lack handling for such cases, so fallback to swapping in order-0
>> + * folio.
>> + */
>> + if (!zswap_never_enabled())
>> + goto fallback;
>> +
>> + entry = pte_to_swp_entry(vmf->orig_pte);
>> + /*
>> + * Get a list of all the (large) orders below PMD_ORDER that are enabled
>> + * and suitable for swapping THP.
>> + */
>> + orders = thp_vma_allowable_orders(vma, vma->vm_flags,
>> + TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1);
>> + orders = thp_vma_suitable_orders(vma, vmf->address, orders);
>> + orders = thp_swap_suitable_orders(swp_offset(entry), vmf->address, orders);
>> +
>> + if (!orders)
>> + goto fallback;
>> +
>> + pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address & PMD_MASK, &ptl);
>> + if (unlikely(!pte))
>> + goto fallback;
>> +
>> + /*
>> + * For do_swap_page, find the highest order where the aligned range is
>> + * completely swap entries with contiguous swap offsets.
>> + */
>> + order = highest_order(orders);
>> + while (orders) {
>> + addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
>> + if (can_swapin_thp(vmf, pte + pte_index(addr), 1 << order))
>> + break;
>> + order = next_order(&orders, order);
>> + }
>> +
>> + pte_unmap_unlock(pte, ptl);
>> +
>> + /* Try allocating the highest of the remaining orders. */
>> + gfp = vma_thp_gfp_mask(vma);
>> + while (orders) {
>> + addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
>> + folio = vma_alloc_folio(gfp, order, vma, addr, true);
>> + if (folio)
>> + return folio;
>> + order = next_order(&orders, order);
>> + }
>> +
>> +fallback:
>> +#endif
>> + return vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address, false);
>> +}
>> +
>> +
>> /*
>> * We enter with non-exclusive mmap_lock (to exclude vma changes,
>> * but allow concurrent faults), and pte mapped but not yet locked.
>> @@ -4074,35 +4220,37 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>> if (!folio) {
>> if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
>> __swap_count(entry) == 1) {
>> - /*
>> - * Prevent parallel swapin from proceeding with
>> - * the cache flag. Otherwise, another thread may
>> - * finish swapin first, free the entry, and swapout
>> - * reusing the same entry. It's undetectable as
>> - * pte_same() returns true due to entry reuse.
>> - */
>> - if (swapcache_prepare(entry, 1)) {
>> - /* Relax a bit to prevent rapid repeated page faults */
>> - schedule_timeout_uninterruptible(1);
>> - goto out;
>> - }
>> - need_clear_cache = true;
>> -
>> /* skip swapcache */
>> - folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0,
>> - vma, vmf->address, false);
>> + folio = alloc_swap_folio(vmf);
>> page = &folio->page;
>> if (folio) {
>> __folio_set_locked(folio);
>> __folio_set_swapbacked(folio);
>>
>> + nr_pages = folio_nr_pages(folio);
>> + if (folio_test_large(folio))
>> + entry.val = ALIGN_DOWN(entry.val, nr_pages);
>> + /*
>> + * Prevent parallel swapin from proceeding with
>> + * the cache flag. Otherwise, another thread may
>> + * finish swapin first, free the entry, and swapout
>> + * reusing the same entry. It's undetectable as
>> + * pte_same() returns true due to entry reuse.
>> + */
>> + if (swapcache_prepare(entry, nr_pages)) {
>> + /* Relax a bit to prevent rapid repeated page faults */
>> + schedule_timeout_uninterruptible(1);
>> + goto out_page;
>> + }
>> + need_clear_cache = true;
>> +
>> if (mem_cgroup_swapin_charge_folio(folio,
>> vma->vm_mm, GFP_KERNEL,
>> entry)) {
>> ret = VM_FAULT_OOM;
>> goto out_page;
>> }
>
> After your patch, with a kernel build test, I'm seeing the kernel
> log spammed like this:
> [ 101.048594] pagefault_out_of_memory: 95 callbacks suppressed
> [ 101.048599] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
> [ 101.059416] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
> [ 101.118575] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
> [ 101.125585] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
> [ 101.182501] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
> [ 101.215351] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
> [ 101.272822] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
> [ 101.403195] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
> ............
>
> And heavy performance loss for memcg-limited workloads with mTHP enabled.
>
> After some debugging, the problematic part is the
> mem_cgroup_swapin_charge_folio() call above.
> Under memcg pressure, the cgroup charge fails easily for mTHP. One 64k
> swapin requires much more aggressive reclaim to succeed.
>
> If I change MAX_RECLAIM_RETRIES from 16 to 512, the log spam is gone
> and mTHP swapin should have a much higher success rate. But this
> might not be the right way to fix it.
>
> For this particular issue, maybe you can change the charge order:
> try charging first; if it succeeds, use mTHP, and if it fails, fall
> back to 4k?
This is what we did in alloc_anon_folio(); see commit 085ff35e7636
("mm: memory: move mem_cgroup_charge() into alloc_anon_folio()"):
1) fall back earlier
2) use the same GFP flags for allocation and charge
but it seems a little more complicated for the swapin charge.
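Something roughly like the untested sketch below is what I mean; the
helper name alloc_swap_folio_charged() and its exact placement are only
illustrative, not part of this patch:

/*
 * Untested sketch only: do the memcg charge inside the allocation loop
 * with the same gfp flags, and fall back to the next lower order
 * (finally order-0) when either the allocation or the charge fails,
 * like alloc_anon_folio() does since 085ff35e7636. The entry alignment
 * for large folios (entry.val = ALIGN_DOWN(entry.val, nr_pages)) is
 * glossed over here and would still need to be handled.
 */
static struct folio *alloc_swap_folio_charged(struct vm_fault *vmf,
					      swp_entry_t entry,
					      unsigned long orders)
{
	struct vm_area_struct *vma = vmf->vma;
	gfp_t gfp = vma_thp_gfp_mask(vma);
	struct folio *folio;
	unsigned long addr;
	int order;

	order = highest_order(orders);
	while (orders) {
		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
		folio = vma_alloc_folio(gfp, order, vma, addr, true);
		if (folio) {
			/* Charge with the same gfp used for the allocation. */
			if (!mem_cgroup_swapin_charge_folio(folio,
					vma->vm_mm, gfp, entry))
				return folio;
			/* Charge failed: free and try a lower order. */
			folio_put(folio);
		}
		order = next_order(&orders, order);
	}

	/* Order-0 fallback, charged by do_swap_page() as before. */
	return vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma,
			       vmf->address, false);
}

Using vma_thp_gfp_mask() for both the allocation and the charge keeps
the two consistent, which is point 2) above; whether the charge can
reasonably live inside the helper for the swapin path is the part that
still looks complicated to me.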
>
>> - mem_cgroup_swapin_uncharge_swap(entry, 1);
>> + mem_cgroup_swapin_uncharge_swap(entry, nr_pages);
>>
>> shadow = get_shadow_from_swap_cache(entry);
>> if (shadow)
>> @@ -4209,6 +4357,22 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>> goto out_nomap;
>> }
>>
>> + /* allocated large folios for SWP_SYNCHRONOUS_IO */
>> + if (folio_test_large(folio) && !folio_test_swapcache(folio)) {
>> + unsigned long nr = folio_nr_pages(folio);
>> + unsigned long folio_start = ALIGN_DOWN(vmf->address, nr * PAGE_SIZE);
>> + unsigned long idx = (vmf->address - folio_start) / PAGE_SIZE;
>> + pte_t *folio_ptep = vmf->pte - idx;
>> +
>> + if (!can_swapin_thp(vmf, folio_ptep, nr))
>> + goto out_nomap;
>> +
>> + page_idx = idx;
>> + address = folio_start;
>> + ptep = folio_ptep;
>> + goto check_folio;
>> + }
>> +
>> nr_pages = 1;
>> page_idx = 0;
>> address = vmf->address;
>> @@ -4340,11 +4504,12 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>> folio_add_lru_vma(folio, vma);
>> } else if (!folio_test_anon(folio)) {
>> /*
>> - * We currently only expect small !anon folios, which are either
>> - * fully exclusive or fully shared. If we ever get large folios
>> - * here, we have to be careful.
>> + * We currently only expect small !anon folios which are either
>> + * fully exclusive or fully shared, or new allocated large folios
>> + * which are fully exclusive. If we ever get large folios within
>> + * swapcache here, we have to be careful.
>> */
>> - VM_WARN_ON_ONCE(folio_test_large(folio));
>> + VM_WARN_ON_ONCE(folio_test_large(folio) && folio_test_swapcache(folio));
>> VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
>> folio_add_new_anon_rmap(folio, vma, address, rmap_flags);
>> } else {
>> @@ -4387,7 +4552,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>> out:
>> /* Clear the swap cache pin for direct swapin after PTL unlock */
>> if (need_clear_cache)
>> - swapcache_clear(si, entry, 1);
>> + swapcache_clear(si, entry, nr_pages);
>> if (si)
>> put_swap_device(si);
>> return ret;
>> @@ -4403,7 +4568,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>> folio_put(swapcache);
>> }
>> if (need_clear_cache)
>> - swapcache_clear(si, entry, 1);
>> + swapcache_clear(si, entry, nr_pages);
>> if (si)
>> put_swap_device(si);
>> return ret;
>> --
>> 2.34.1
>>
>>
>