Date:   Fri, 13 Nov 2020 18:59:35 +0800
From:   Muchun Song <songmuchun@...edance.com>
To:     corbet@....net, mike.kravetz@...cle.com, tglx@...utronix.de,
        mingo@...hat.com, bp@...en8.de, x86@...nel.org, hpa@...or.com,
        dave.hansen@...ux.intel.com, luto@...nel.org, peterz@...radead.org,
        viro@...iv.linux.org.uk, akpm@...ux-foundation.org,
        paulmck@...nel.org, mchehab+huawei@...nel.org,
        pawan.kumar.gupta@...ux.intel.com, rdunlap@...radead.org,
        oneukum@...e.com, anshuman.khandual@....com, jroedel@...e.de,
        almasrymina@...gle.com, rientjes@...gle.com, willy@...radead.org,
        osalvador@...e.de, mhocko@...e.com
Cc:     duanxiongchun@...edance.com, linux-doc@...r.kernel.org,
        linux-kernel@...r.kernel.org, linux-mm@...ck.org,
        linux-fsdevel@...r.kernel.org,
        Muchun Song <songmuchun@...edance.com>
Subject: [PATCH v4 04/21] mm/hugetlb: Introduce nr_free_vmemmap_pages in the struct hstate

If the size of a HugeTLB page is 2MB, we need 512 struct page structures
(8 pages) to be associated with it. As far as I know, we only use the
first 4 struct page structures; this requirement on the first 4 comes
from HUGETLB_CGROUP_MIN_ORDER.

For all tail pages, the value of compound_head is the same, so we can
reuse the first page of the tail page structs. We map the virtual
addresses of the remaining 6 pages of tail page structs to the first tail
page struct, and then free those 6 pages back to the buddy system.
Therefore, we only need to reserve at least 2 pages as vmemmap areas.

So we introduce a new nr_free_vmemmap_pages field in struct hstate to
indicate how many vmemmap pages associated with a HugeTLB page can be
freed to the buddy system.
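
For reference, a worked sketch of the arithmetic (this assumes
sizeof(struct page) == 64 and a 4KB base page, which is typical for
x86_64 but not guaranteed; the authoritative computation is
hugetlb_vmemmap_init() in this patch):

	/* 2MB HugeTLB page: order 9 */
	vmemmap_pages = ((1UL << 9) * 64) >> 12;	/* 512 * 64B / 4KB = 8 */
	nr_free = vmemmap_pages - 2;			/* 8 - 2 reserved = 6  */

	/* 1GB HugeTLB page: order 18 */
	vmemmap_pages = ((1UL << 18) * 64) >> 12;	/* 262144 * 64B / 4KB = 4096 */
	nr_free = vmemmap_pages - 2;			/* 4096 - 2 reserved = 4094 */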

Signed-off-by: Muchun Song <songmuchun@...edance.com>
Acked-by: Mike Kravetz <mike.kravetz@...cle.com>
---
 include/linux/hugetlb.h |   3 ++
 mm/Makefile             |   1 +
 mm/hugetlb.c            |   3 ++
 mm/hugetlb_vmemmap.c    | 108 ++++++++++++++++++++++++++++++++++++++++++++++++
 mm/hugetlb_vmemmap.h    |  20 +++++++++
 5 files changed, 135 insertions(+)
 create mode 100644 mm/hugetlb_vmemmap.c
 create mode 100644 mm/hugetlb_vmemmap.h

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index d5cc5f802dd4..eed3dd3bd626 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -492,6 +492,9 @@ struct hstate {
 	unsigned int nr_huge_pages_node[MAX_NUMNODES];
 	unsigned int free_huge_pages_node[MAX_NUMNODES];
 	unsigned int surplus_huge_pages_node[MAX_NUMNODES];
+#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
+	unsigned int nr_free_vmemmap_pages;
+#endif
 #ifdef CONFIG_CGROUP_HUGETLB
 	/* cgroup control files */
 	struct cftype cgroup_files_dfl[7];
diff --git a/mm/Makefile b/mm/Makefile
index 752111587c99..2a734576bbc0 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -71,6 +71,7 @@ obj-$(CONFIG_FRONTSWAP)	+= frontswap.o
 obj-$(CONFIG_ZSWAP)	+= zswap.o
 obj-$(CONFIG_HAS_DMA)	+= dmapool.o
 obj-$(CONFIG_HUGETLBFS)	+= hugetlb.o
+obj-$(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP)	+= hugetlb_vmemmap.o
 obj-$(CONFIG_NUMA) 	+= mempolicy.o
 obj-$(CONFIG_SPARSEMEM)	+= sparse.o
 obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 81a41aa080a5..f88032c24667 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -42,6 +42,7 @@
 #include <linux/userfaultfd_k.h>
 #include <linux/page_owner.h>
 #include "internal.h"
+#include "hugetlb_vmemmap.h"
 
 int hugetlb_max_hstate __read_mostly;
 unsigned int default_hstate_idx;
@@ -3285,6 +3286,8 @@ void __init hugetlb_add_hstate(unsigned int order)
 	snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
 					huge_page_size(h)/1024);
 
+	hugetlb_vmemmap_init(h);
+
 	parsed_hstate = h;
 }
 
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
new file mode 100644
index 000000000000..a6c9948302e2
--- /dev/null
+++ b/mm/hugetlb_vmemmap.c
@@ -0,0 +1,108 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Free some vmemmap pages of HugeTLB
+ *
+ * Copyright (c) 2020, Bytedance. All rights reserved.
+ *
+ *     Author: Muchun Song <songmuchun@...edance.com>
+ *
+ * Nowadays we track the status of physical page frames using struct page
+ * structures arranged in one or more arrays, and there is a one-to-one
+ * mapping between the physical page frame and the corresponding struct page
+ * structure.
+ *
+ * The HugeTLB support is built on top of the multiple page size support
+ * provided by most modern architectures. For example, x86 CPUs normally
+ * support 4K and 2M (1G if architecturally supported) page sizes. Every
+ * HugeTLB page has more than one struct page structure: a 2M HugeTLB page
+ * has 512 of them and a 1G HugeTLB page has 262144. However, the core of
+ * HugeTLB only uses the first 4 struct page structures (this requirement
+ * comes from HUGETLB_CGROUP_MIN_ORDER) to store metadata associated with
+ * each HugeTLB page. The rest of the struct page structures are only ever
+ * read for their compound_head field, which holds the same value for all
+ * of them. If we can free some of this struct page memory back to the
+ * buddy system, we can save a lot of memory.
+ *
+ * When the system boots up, every 2M HugeTLB page has 512 struct page
+ * structures whose total size is 8 pages (sizeof(struct page) * 512 / PAGE_SIZE).
+ *
+ *    HugeTLB                  struct pages(8 pages)         page frame(8 pages)
+ * +-----------+ ---virt_to_page---> +-----------+   mapping to   +-----------+
+ * |           |                     |     0     | -------------> |     0     |
+ * |           |                     |     1     | -------------> |     1     |
+ * |           |                     |     2     | -------------> |     2     |
+ * |           |                     |     3     | -------------> |     3     |
+ * |           |                     |     4     | -------------> |     4     |
+ * |     2M    |                     |     5     | -------------> |     5     |
+ * |           |                     |     6     | -------------> |     6     |
+ * |           |                     |     7     | -------------> |     7     |
+ * |           |                     +-----------+                +-----------+
+ * |           |
+ * |           |
+ * +-----------+
+ *
+ *
+ * When a HugeTLB page is preallocated, we can change the mapping from the
+ * above to the below.
+ *
+ *    HugeTLB                  struct pages(8 pages)         page frame(8 pages)
+ * +-----------+ ---virt_to_page---> +-----------+   mapping to   +-----------+
+ * |           |                     |     0     | -------------> |     0     |
+ * |           |                     |     1     | -------------> |     1     |
+ * |           |                     |     2     | -------------> +-----------+
+ * |           |                     |     3     | -----------------^ ^ ^ ^ ^
+ * |           |                     |     4     | -------------------+ | | |
+ * |     2M    |                     |     5     | ---------------------+ | |
+ * |           |                     |     6     | -----------------------+ |
+ * |           |                     |     7     | -------------------------+
+ * |           |                     +-----------+
+ * |           |
+ * |           |
+ * +-----------+
+ *
+ * For all tail pages, the value of compound_head is the same, so we can
+ * reuse the first page of the tail page structures. We map the virtual
+ * addresses of the remaining 6 pages of tail page structures to the first
+ * tail page structure, and then free those 6 page frames. Therefore, we
+ * only need to reserve at least 2 pages as vmemmap areas.
+ *
+ * When a HugeTLB page is freed back to the buddy system, we must allocate
+ * 6 pages for the vmemmap and restore the previous mapping relationship.
+ */
+#define pr_fmt(fmt)	"HugeTLB Vmemmap: " fmt
+
+#include "hugetlb_vmemmap.h"
+
+/*
+ * There are 512 struct page structures (8 pages) associated with each 2MB
+ * HugeTLB page. For all tail pages, the value of compound_head is the same,
+ * so we can reuse the first page of the tail page structures: we map the
+ * virtual addresses of the remaining 6 pages of tail page structures to the
+ * first tail page structure, and then free those 6 pages. Therefore, we
+ * need to reserve at least 2 pages as vmemmap areas.
+ */
+#define RESERVE_VMEMMAP_NR		2U
+
+void __init hugetlb_vmemmap_init(struct hstate *h)
+{
+	unsigned int order = huge_page_order(h);
+	unsigned int vmemmap_pages;
+
+	vmemmap_pages = ((1 << order) * sizeof(struct page)) >> PAGE_SHIFT;
+	/*
+	 * The head page and the first tail page are not to be freed to the
+	 * buddy system; the other pages are remapped to the first tail page.
+	 * So (@vmemmap_pages - RESERVE_VMEMMAP_NR) pages can be freed.
+	 *
+	 * Could RESERVE_VMEMMAP_NR be greater than @vmemmap_pages? This is
+	 * not expected to happen unless the system is corrupted, so this
+	 * check is only a safety net.
+	 */
+	if (likely(vmemmap_pages > RESERVE_VMEMMAP_NR))
+		h->nr_free_vmemmap_pages = vmemmap_pages - RESERVE_VMEMMAP_NR;
+	else
+		h->nr_free_vmemmap_pages = 0;
+
+	pr_debug("can free %u vmemmap pages for %s\n", h->nr_free_vmemmap_pages,
+		 h->name);
+}
diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
new file mode 100644
index 000000000000..40c0c7dfb60d
--- /dev/null
+++ b/mm/hugetlb_vmemmap.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Free some vmemmap pages of HugeTLB
+ *
+ * Copyright (c) 2020, Bytedance. All rights reserved.
+ *
+ *     Author: Muchun Song <songmuchun@...edance.com>
+ */
+#ifndef _LINUX_HUGETLB_VMEMMAP_H
+#define _LINUX_HUGETLB_VMEMMAP_H
+#include <linux/hugetlb.h>
+
+#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
+void __init hugetlb_vmemmap_init(struct hstate *h);
+#else
+static inline void hugetlb_vmemmap_init(struct hstate *h)
+{
+}
+#endif /* CONFIG_HUGETLB_PAGE_FREE_VMEMMAP */
+#endif /* _LINUX_HUGETLB_VMEMMAP_H */
-- 
2.11.0
