[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250519223307.3601786-3-usamaarif642@gmail.com>
Date: Mon, 19 May 2025 23:29:54 +0100
From: Usama Arif <usamaarif642@...il.com>
To: Andrew Morton <akpm@...ux-foundation.org>,
david@...hat.com,
linux-mm@...ck.org
Cc: hannes@...xchg.org,
shakeel.butt@...ux.dev,
riel@...riel.com,
ziy@...dia.com,
laoar.shao@...il.com,
baolin.wang@...ux.alibaba.com,
lorenzo.stoakes@...cle.com,
Liam.Howlett@...cle.com,
npache@...hat.com,
ryan.roberts@....com,
vbabka@...e.cz,
jannh@...gle.com,
Arnd Bergmann <arnd@...db.de>,
linux-kernel@...r.kernel.org,
linux-doc@...r.kernel.org,
kernel-team@...a.com,
Usama Arif <usamaarif642@...il.com>
Subject: [PATCH v3 2/7] prctl: introduce PR_DEFAULT_MADV_HUGEPAGE for the process
This is set via the new PR_SET_THP_POLICY prctl. It has two effects:
- It sets VM_HUGEPAGE and clears VM_NOHUGEPAGE in the default VMA flags
(def_flags). This means that every new VMA will be considered for
hugepages.
- It iterates through every existing VMA in the process and calls
hugepage_madvise() on it with the MADV_HUGEPAGE advice.
The policy is inherited during fork+exec.
This effectively allows setting MADV_HUGEPAGE on the entire process.
In an environment where different types of workloads are run on the
same machine, this will allow workloads that benefit from always having
hugepages to do so, without regressing those that don't.
Signed-off-by: Usama Arif <usamaarif642@...il.com>
---
include/linux/huge_mm.h | 1 +
include/linux/mm.h | 2 +-
include/linux/mm_types.h | 4 ++-
include/uapi/linux/prctl.h | 4 +++
kernel/sys.c | 29 +++++++++++++++++++
mm/huge_memory.c | 13 +++++++++
tools/include/uapi/linux/prctl.h | 4 +++
.../trace/beauty/include/uapi/linux/prctl.h | 4 +++
8 files changed, 59 insertions(+), 2 deletions(-)
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 23580a43787c..b24a2e0ae642 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -431,6 +431,7 @@ change_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
__split_huge_pud(__vma, __pud, __address); \
} while (0)
+void process_default_madv_hugepage(struct mm_struct *mm, int advice);
int hugepage_set_vmflags(unsigned long *vm_flags, int advice);
int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags,
int advice);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 43748c8f3454..436f4588bce8 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -466,7 +466,7 @@ extern unsigned int kobjsize(const void *objp);
#define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB)
/* This mask defines which mm->def_flags a process can inherit its parent */
-#define VM_INIT_DEF_MASK VM_NOHUGEPAGE
+#define VM_INIT_DEF_MASK (VM_HUGEPAGE | VM_NOHUGEPAGE)
/* This mask represents all the VMA flag bits used by mlock */
#define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index e76bade9ebb1..f1836b7c5704 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1703,6 +1703,7 @@ enum {
/* leave room for more dump flags */
#define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */
#define MMF_VM_HUGEPAGE 17 /* set when mm is available for khugepaged */
+#define MMF_VM_HUGEPAGE_MASK (1 << MMF_VM_HUGEPAGE)
/*
* This one-shot flag is dropped due to necessity of changing exe once again
@@ -1742,7 +1743,8 @@ enum {
#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK |\
- MMF_VM_MERGE_ANY_MASK | MMF_TOPDOWN_MASK)
+ MMF_VM_MERGE_ANY_MASK | MMF_TOPDOWN_MASK |\
+ MMF_VM_HUGEPAGE_MASK)
static inline unsigned long mmf_init_flags(unsigned long flags)
{
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 15c18ef4eb11..15aaa4db5ff8 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -364,4 +364,8 @@ struct prctl_mm_map {
# define PR_TIMER_CREATE_RESTORE_IDS_ON 1
# define PR_TIMER_CREATE_RESTORE_IDS_GET 2
+#define PR_SET_THP_POLICY 78 /* arg2: policy to apply; arg3-arg5 must be 0 */
+#define PR_GET_THP_POLICY 79 /* arg2-arg5 must be 0 */
+#define PR_DEFAULT_MADV_HUGEPAGE 0 /* policy value accepted by PR_SET_THP_POLICY */
+
#endif /* _LINUX_PRCTL_H */
diff --git a/kernel/sys.c b/kernel/sys.c
index c434968e9f5d..74397ace62f3 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2474,6 +2474,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
unsigned long, arg4, unsigned long, arg5)
{
struct task_struct *me = current;
+ struct mm_struct *mm = me->mm;
unsigned char comm[sizeof(me->comm)];
long error;
@@ -2658,6 +2659,34 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
clear_bit(MMF_DISABLE_THP, &me->mm->flags);
mmap_write_unlock(me->mm);
break;
+	case PR_GET_THP_POLICY:
+		/* The query takes no arguments; reject anything non-zero. */
+		if (arg2 || arg3 || arg4 || arg5)
+			return -EINVAL;
+		/* def_flags is only read here, so the shared lock is enough. */
+		if (mmap_read_lock_killable(mm))
+			return -EINTR;
+		/*
+		 * NOTE(review): PR_DEFAULT_MADV_HUGEPAGE is 0, so unless
+		 * 'error' holds some other value on entry to this case the
+		 * caller cannot tell "policy set" from "policy not set".
+		 * Confirm the intended return value for the unset case.
+		 */
+		if (mm->def_flags & VM_HUGEPAGE)
+			error = PR_DEFAULT_MADV_HUGEPAGE;
+		mmap_read_unlock(mm);
+		break;
+	case PR_SET_THP_POLICY:
+		/* Only arg2 (the policy) is used; the rest must be zero. */
+		if (arg3 || arg4 || arg5)
+			return -EINVAL;
+		/*
+		 * def_flags is updated and the VMAs are walked below;
+		 * process_default_madv_hugepage() asserts the write lock.
+		 */
+		if (mmap_write_lock_killable(mm))
+			return -EINTR;
+		switch (arg2) {
+		case PR_DEFAULT_MADV_HUGEPAGE:
+			/*
+			 * Leave the policy switch right away so that -EPERM is
+			 * what gets returned and def_flags stays untouched.
+			 */
+			if (!hugepage_global_enabled()) {
+				error = -EPERM;
+				break;
+			}
+			error = hugepage_set_vmflags(&mm->def_flags, MADV_HUGEPAGE);
+			/* Only touch existing VMAs once the default was accepted. */
+			if (!error)
+				process_default_madv_hugepage(mm, MADV_HUGEPAGE);
+			break;
+		default:
+			/* Unknown policy value. */
+			error = -EINVAL;
+			break;
+		}
+		mmap_write_unlock(mm);
+		break;
case PR_MPX_ENABLE_MANAGEMENT:
case PR_MPX_DISABLE_MANAGEMENT:
/* No longer implemented: */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 2780a12b25f0..72806fe772b5 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -98,6 +98,19 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma)
return !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode);
}
+/*
+ * Apply @advice to every VMA currently present in @mm.
+ *
+ * The caller must hold the mmap write lock of @mm.
+ *
+ * hugepage_madvise() is handed a pointer to a copy of the VMA flags, so the
+ * updated value is stored back into the VMA here; otherwise the advice would
+ * be computed and then dropped.
+ * NOTE(review): confirm hugepage_madvise() does not update vma->vm_flags
+ * itself, and whether adjacent VMAs should be re-merged after the change.
+ */
+void process_default_madv_hugepage(struct mm_struct *mm, int advice)
+{
+	struct vm_area_struct *vma;
+	unsigned long vm_flags;
+	VMA_ITERATOR(vmi, mm, 0);
+
+	mmap_assert_write_locked(mm);
+	for_each_vma(vmi, vma) {
+		vm_flags = vma->vm_flags;
+		/* A VMA for which the advice fails keeps its current flags. */
+		if (hugepage_madvise(vma, &vm_flags, advice))
+			continue;
+		/* Nothing to store when the advice did not change anything. */
+		if (vm_flags == vma->vm_flags)
+			continue;
+		/* Flag updates need the VMA write-locked first. */
+		vma_start_write(vma);
+		vm_flags_reset(vma, vm_flags);
+	}
+}
+
unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
unsigned long vm_flags,
unsigned long tva_flags,
diff --git a/tools/include/uapi/linux/prctl.h b/tools/include/uapi/linux/prctl.h
index 35791791a879..f5945ebfe3f2 100644
--- a/tools/include/uapi/linux/prctl.h
+++ b/tools/include/uapi/linux/prctl.h
@@ -328,4 +328,8 @@ struct prctl_mm_map {
# define PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC 0x10 /* Clear the aspect on exec */
# define PR_PPC_DEXCR_CTRL_MASK 0x1f
+#define PR_SET_THP_POLICY 78
+#define PR_GET_THP_POLICY 79
+#define PR_DEFAULT_MADV_HUGEPAGE 0 /* same name as in include/uapi/linux/prctl.h */
+
#endif /* _LINUX_PRCTL_H */
diff --git a/tools/perf/trace/beauty/include/uapi/linux/prctl.h b/tools/perf/trace/beauty/include/uapi/linux/prctl.h
index 15c18ef4eb11..325c72f40a93 100644
--- a/tools/perf/trace/beauty/include/uapi/linux/prctl.h
+++ b/tools/perf/trace/beauty/include/uapi/linux/prctl.h
@@ -364,4 +364,8 @@ struct prctl_mm_map {
# define PR_TIMER_CREATE_RESTORE_IDS_ON 1
# define PR_TIMER_CREATE_RESTORE_IDS_GET 2
+#define PR_SET_THP_POLICY 78
+#define PR_GET_THP_POLICY 79
+#define PR_DEFAULT_MADV_HUGEPAGE 0 /* same name as in include/uapi/linux/prctl.h */
+
#endif /* _LINUX_PRCTL_H */
--
2.47.1
Powered by blists - more mailing lists