Message-Id: <1617140178-8773-43-git-send-email-anthony.yznaga@oracle.com>
Date:   Tue, 30 Mar 2021 14:36:17 -0700
From:   Anthony Yznaga <anthony.yznaga@...cle.com>
To:     linux-mm@...ck.org, linux-kernel@...r.kernel.org
Cc:     willy@...radead.org, corbet@....net, tglx@...utronix.de,
        mingo@...hat.com, bp@...en8.de, x86@...nel.org, hpa@...or.com,
        dave.hansen@...ux.intel.com, luto@...nel.org, peterz@...radead.org,
        rppt@...nel.org, akpm@...ux-foundation.org, hughd@...gle.com,
        ebiederm@...ssion.com, keescook@...omium.org, ardb@...nel.org,
        nivedita@...m.mit.edu, jroedel@...e.de, masahiroy@...nel.org,
        nathan@...nel.org, terrelln@...com, vincenzo.frascino@....com,
        martin.b.radev@...il.com, andreyknvl@...gle.com,
        daniel.kiper@...cle.com, rafael.j.wysocki@...el.com,
        dan.j.williams@...el.com, Jonathan.Cameron@...wei.com,
        bhe@...hat.com, rminnich@...il.com, ashish.kalra@....com,
        guro@...com, hannes@...xchg.org, mhocko@...nel.org,
        iamjoonsoo.kim@....com, vbabka@...e.cz, alex.shi@...ux.alibaba.com,
        david@...hat.com, richard.weiyang@...il.com,
        vdavydov.dev@...il.com, graf@...zon.com, jason.zeng@...el.com,
        lei.l.li@...el.com, daniel.m.jordan@...cle.com,
        steven.sistare@...cle.com, linux-fsdevel@...r.kernel.org,
        linux-doc@...r.kernel.org, kexec@...ts.infradead.org
Subject: [RFC v2 42/43] shmem: reduce time holding xa_lock when inserting pages

Rather than adding pages to the page cache one at a time and taking
the page cache xarray lock for each insertion, where possible add
pages in bulk by populating an xarray node outside of the page cache
first and then taking the lock only to insert the completed node.

When a group of pages to be inserted will exactly fill an xarray node,
add them to a local xarray, export the xarray node, and then take the
lock on the page cache xarray and insert the node.
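
For reference, a condensed sketch of the fast path is included below.
It is illustrative only and not part of the patch: it shows the order-0
case of inserting XA_CHUNK_SIZE (64) pages at a 64-aligned index,
relies on the xas_export_node()/xas_import_node() helpers added
earlier in this series, and omits the page refcount/mapping setup,
memcg charging, statistics updates, error unwinding, and xas_nomem()
retry loops that the real shmem_add_aligned_to_page_cache() performs.
The function name bulk_insert_sketch() is made up for illustration.

	/* Illustrative sketch only -- see caveats above. */
	static int bulk_insert_sketch(struct address_space *mapping,
				      pgoff_t index, struct page *pages[])
	{
		struct xarray xa_tmp;
		/* Private xarray whose single node is filled and detached. */
		XA_STATE_ORDER(xas_tmp, &xa_tmp, 0, XA_CHUNK_SHIFT);
		/* Page cache xarray, positioned at the target index. */
		XA_STATE_ORDER(xas, &mapping->i_pages, index, XA_CHUNK_SHIFT);
		struct xa_node *node = NULL;
		int i;

		xa_init(&xa_tmp);

		/* Fill one node without touching the page cache lock. */
		xas_lock(&xas_tmp);
		xas_create_range(&xas_tmp);
		if (!xas_error(&xas_tmp)) {
			for (i = 0; i < XA_CHUNK_SIZE; i++) {
				xas_store(&xas_tmp, pages[i]);
				if (i < XA_CHUNK_SIZE - 1)
					xas_next(&xas_tmp);
			}
			xas_set_order(&xas_tmp, 0, XA_CHUNK_SHIFT);
			node = xas_export_node(&xas_tmp);
		}
		xas_unlock(&xas_tmp);
		if (xas_error(&xas_tmp))
			return xas_error(&xas_tmp);

		/* One short critical section on the page cache xarray. */
		xas_lock_irq(&xas);
		xas_import_node(&xas, node);
		if (!xas_error(&xas))
			mapping->nrpages += XA_CHUNK_SIZE;
		xas_unlock_irq(&xas);

		return xas_error(&xas);
	}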

Signed-off-by: Anthony Yznaga <anthony.yznaga@...cle.com>
---
 mm/shmem.c | 162 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 156 insertions(+), 6 deletions(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index f495af51042e..a7c23b43b57f 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -827,17 +827,149 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap)
 	BUG_ON(error);
 }
 
+static int shmem_add_aligned_to_page_cache(struct page *pages[], int npages,
+					   struct address_space *mapping,
+					   pgoff_t index, gfp_t gfp, int order,
+					   struct mm_struct *charge_mm)
+{
+	int xa_shift = order + XA_CHUNK_SHIFT - (order % XA_CHUNK_SHIFT);
+	XA_STATE_ORDER(xas, &mapping->i_pages, index, xa_shift);
+	struct xarray xa_tmp;
+	/*
+	 * Specify order so xas_create_range() only needs to be called once
+	 * to allocate the entire range.  This guarantees that xas_store()
+	 * will not fail due to lack of memory.
+	 * Specify index == 0 so the minimum necessary nodes are allocated.
+	 */
+	XA_STATE_ORDER(xas_tmp, &xa_tmp, 0, xa_shift);
+	unsigned long nr = 1UL << order;
+	struct xa_node *node;
+	int i, error;
+
+	if (npages * nr != 1 << xa_shift) {
+		WARN_ONCE(1, "npages (%d) does not fill an xarray node\n", npages);
+		return -EINVAL;
+	}
+	if (!IS_ALIGNED(index, 1 << xa_shift)) {
+		WARN_ONCE(1, "index (%lu) not aligned to xa_shift\n", index);
+		return -EINVAL;
+	}
+
+	for (i = 0; i < npages; i++) {
+		bool skipcharge = page_memcg(pages[i]) ? true : false;
+
+		VM_BUG_ON_PAGE(PageTail(pages[i]), pages[i]);
+		VM_BUG_ON_PAGE(!PageLocked(pages[i]), pages[i]);
+		VM_BUG_ON_PAGE(!PageSwapBacked(pages[i]), pages[i]);
+
+		page_ref_add(pages[i], nr);
+		pages[i]->mapping = mapping;
+		pages[i]->index = index + (i * nr);
+
+		if (!skipcharge && !PageSwapCache(pages[i])) {
+			error = mem_cgroup_charge(pages[i], charge_mm, gfp);
+			if (error) {
+				if (PageTransHuge(pages[i])) {
+					count_vm_event(THP_FILE_FALLBACK);
+					count_vm_event(THP_FILE_FALLBACK_CHARGE);
+				}
+				goto error;
+			}
+		}
+		cgroup_throttle_swaprate(pages[i], gfp);
+	}
+
+	xa_init(&xa_tmp);
+	do {
+		xas_lock(&xas_tmp);
+		xas_create_range(&xas_tmp);
+		if (xas_error(&xas_tmp))
+			goto unlock;
+		for (i = 0; i < npages; i++) {
+			int j = 0;
+next:
+			xas_store(&xas_tmp, pages[i]);
+			if (++j < nr) {
+				xas_next(&xas_tmp);
+				goto next;
+			}
+			if (i < npages - 1)
+				xas_next(&xas_tmp);
+		}
+		xas_set_order(&xas_tmp, 0, xa_shift);
+		node = xas_export_node(&xas_tmp);
+unlock:
+		xas_unlock(&xas_tmp);
+	} while (xas_nomem(&xas_tmp, gfp));
+
+	if (xas_error(&xas_tmp)) {
+		error = xas_error(&xas_tmp);
+		i = npages - 1;
+		goto error;
+	}
+
+	do {
+		xas_lock_irq(&xas);
+		xas_import_node(&xas, node);
+		if (xas_error(&xas))
+			goto unlock1;
+		mapping->nrpages += nr * npages;
+		xas_unlock(&xas);
+		for (i = 0; i < npages; i++) {
+			__mod_lruvec_page_state(pages[i], NR_FILE_PAGES, nr);
+			__mod_lruvec_page_state(pages[i], NR_SHMEM, nr);
+			if (PageTransHuge(pages[i])) {
+				count_vm_event(THP_FILE_ALLOC);
+				__inc_node_page_state(pages[i], NR_SHMEM_THPS);
+			}
+		}
+		local_irq_enable();
+		break;
+unlock1:
+		xas_unlock_irq(&xas);
+	} while (xas_nomem(&xas, gfp));
+
+	if (xas_error(&xas)) {
+		error = xas_error(&xas);
+		goto error;
+	}
+
+	return 0;
+error:
+	while (i >= 0) {
+		pages[i]->mapping = NULL;
+		page_ref_sub(pages[i], nr);
+		i--;
+	}
+	return error;
+}
+
 static int shmem_add_pages_to_cache(struct page *pages[], int npages,
 				struct address_space *mapping,
 				pgoff_t start, gfp_t gfp,
 				struct mm_struct *charge_mm)
 {
 	pgoff_t index = start;
-	int i, err;
+	int i, j, err;
 
 	i = 0;
 	while (i < npages) {
 		if (PageTransHuge(pages[i])) {
+			if (IS_ALIGNED(index, 4096) && i+8 <= npages) {
+				for (j = 1; j < 8; j++) {
+					if (!PageTransHuge(pages[i+j]))
+						break;
+				}
+				if (j == 8) {
+					err = shmem_add_aligned_to_page_cache(&pages[i], 8, mapping, index, gfp, HPAGE_PMD_ORDER, charge_mm);
+					if (err)
+						goto out_release;
+					index += HPAGE_PMD_NR * 8;
+					i += 8;
+					continue;
+				}
+			}
+
 			err = shmem_add_to_page_cache_fast(pages[i], mapping, index, gfp, charge_mm, page_memcg(pages[i]) ? true : false);
 			if (err)
 				goto out_release;
@@ -846,11 +978,29 @@ static int shmem_add_pages_to_cache(struct page *pages[], int npages,
 			continue;
 		}
 
-		err = shmem_add_to_page_cache_fast(pages[i], mapping, index, gfp, charge_mm, page_memcg(pages[i]) ? true : false);
-		if (err)
-			goto out_release;
-		index++;
-		i++;
+		for (j = 1; i + j < npages; j++) {
+			if (PageTransHuge(pages[i + j]))
+				break;
+		}
+
+		while (j > 0) {
+			if (IS_ALIGNED(index, 64) && j >= 64) {
+				err = shmem_add_aligned_to_page_cache(&pages[i], 64, mapping, index, gfp, 0, charge_mm);
+				if (err)
+					goto out_release;
+				index += 64;
+				i += 64;
+				j -= 64;
+				continue;
+			}
+
+			err = shmem_add_to_page_cache_fast(pages[i], mapping, index, gfp, charge_mm, page_memcg(pages[i]) ? true : false);
+			if (err)
+				goto out_release;
+			index++;
+			i++;
+			j--;
+		}
 	}
 	return 0;
 
-- 
1.8.3.1
