[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20200928175428.4110504-25-zi.yan@sent.com>
Date: Mon, 28 Sep 2020 13:54:22 -0400
From: Zi Yan <zi.yan@...t.com>
To: linux-mm@...ck.org
Cc: "Kirill A . Shutemov" <kirill.shutemov@...ux.intel.com>,
Roman Gushchin <guro@...com>, Rik van Riel <riel@...riel.com>,
Matthew Wilcox <willy@...radead.org>,
Shakeel Butt <shakeelb@...gle.com>,
Yang Shi <shy828301@...il.com>,
Jason Gunthorpe <jgg@...dia.com>,
Mike Kravetz <mike.kravetz@...cle.com>,
Michal Hocko <mhocko@...e.com>,
David Hildenbrand <david@...hat.com>,
William Kucharski <william.kucharski@...cle.com>,
Andrea Arcangeli <aarcange@...hat.com>,
John Hubbard <jhubbard@...dia.com>,
David Nellans <dnellans@...dia.com>,
linux-kernel@...r.kernel.org, Zi Yan <ziy@...dia.com>
Subject: [RFC PATCH v2 24/30] mm: madvise: add page size options to MADV_HUGEPAGE and MADV_NOHUGEPAGE.
From: Zi Yan <ziy@...dia.com>
It allows the user to specify the maximum page size up to which the kernel
will generate THPs to back the memory range in madvise. Because we now have
PMD and PUD THPs, which require different amounts of kernel effort to
generate, we want to prevent the user from incurring long page fault
latency that would result from always trying to allocate PUD THPs first.
Signed-off-by: Zi Yan <ziy@...dia.com>
---
include/uapi/asm-generic/mman-common.h | 23 +++++++++++++++++++++++
mm/khugepaged.c | 1 +
mm/madvise.c | 17 +++++++++++++++--
3 files changed, 39 insertions(+), 2 deletions(-)
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index f94f65d429be..8009acb55fca 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -6,6 +6,7 @@
Author: Michael S. Tsirkin <mst@...lanox.co.il>, Mellanox Technologies Ltd.
Based on: asm-xxx/mman.h
*/
+#include <asm-generic/hugetlb_encode.h>
#define PROT_READ 0x1 /* page can be read */
#define PROT_WRITE 0x2 /* page can be written */
@@ -80,4 +81,26 @@
#define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\
PKEY_DISABLE_WRITE)
+
+/*
+ * Huge page size encoding when MADV_HUGEPAGE is specified, and a huge page
+ * size other than the default is desired. See hugetlb_encode.h.
+ */
+#define MADV_HUGEPAGE_SHIFT HUGETLB_FLAG_ENCODE_SHIFT
+#define MADV_HUGEPAGE_MASK HUGETLB_FLAG_ENCODE_MASK
+#define MADV_BEHAVIOR_MASK ((1<<MADV_HUGEPAGE_SHIFT) - 1)
+
+#define MADV_HUGEPAGE_64KB HUGETLB_FLAG_ENCODE_64KB
+#define MADV_HUGEPAGE_512KB HUGETLB_FLAG_ENCODE_512KB
+#define MADV_HUGEPAGE_1MB HUGETLB_FLAG_ENCODE_1MB
+#define MADV_HUGEPAGE_2MB HUGETLB_FLAG_ENCODE_2MB
+#define MADV_HUGEPAGE_8MB HUGETLB_FLAG_ENCODE_8MB
+#define MADV_HUGEPAGE_16MB HUGETLB_FLAG_ENCODE_16MB
+#define MADV_HUGEPAGE_32MB HUGETLB_FLAG_ENCODE_32MB
+#define MADV_HUGEPAGE_256MB HUGETLB_FLAG_ENCODE_256MB
+#define MADV_HUGEPAGE_512MB HUGETLB_FLAG_ENCODE_512MB
+#define MADV_HUGEPAGE_1GB HUGETLB_FLAG_ENCODE_1GB
+#define MADV_HUGEPAGE_2GB HUGETLB_FLAG_ENCODE_2GB
+#define MADV_HUGEPAGE_16GB HUGETLB_FLAG_ENCODE_16GB
+
#endif /* __ASM_GENERIC_MMAN_COMMON_H */
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 636a0f32b09e..b34c78085017 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -345,6 +345,7 @@ struct attribute_group khugepaged_attr_group = {
int hugepage_madvise(struct vm_area_struct *vma,
unsigned long *vm_flags, int advice)
{
+ advice = advice & MADV_BEHAVIOR_MASK;
switch (advice) {
case MADV_HUGEPAGE:
#ifdef CONFIG_S390
diff --git a/mm/madvise.c b/mm/madvise.c
index 16e7b8eadb13..32066cc0b34f 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -40,6 +40,19 @@ struct madvise_walk_private {
bool pageout;
};
+static inline int get_behavior(int behavior)
+{
+ int behavior_no_flags = behavior & MADV_BEHAVIOR_MASK;
+ /*
+ * only MADV_HUGEPAGE and MADV_NOHUGEPAGE have extra huge page size
+ * flags
+ */
+ VM_BUG_ON(behavior_no_flags != MADV_HUGEPAGE &&
+ behavior_no_flags != MADV_NOHUGEPAGE &&
+ (behavior & (~MADV_BEHAVIOR_MASK)));
+ return behavior_no_flags;
+}
+
/*
* Any behaviour which results in changes to the vma->vm_flags needs to
* take mmap_lock for writing. Others, which simply traverse vmas, need
@@ -74,7 +87,7 @@ static long madvise_behavior(struct vm_area_struct *vma,
pgoff_t pgoff;
unsigned long new_flags = vma->vm_flags;
- switch (behavior) {
+ switch (get_behavior(behavior)) {
case MADV_NORMAL:
new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
break;
@@ -953,7 +966,7 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
static bool
madvise_behavior_valid(int behavior)
{
- switch (behavior) {
+ switch (get_behavior(behavior)) {
case MADV_DOFORK:
case MADV_DONTFORK:
case MADV_NORMAL:
--
2.28.0
Powered by blists - more mailing lists