lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250519223307.3601786-3-usamaarif642@gmail.com>
Date: Mon, 19 May 2025 23:29:54 +0100
From: Usama Arif <usamaarif642@...il.com>
To: Andrew Morton <akpm@...ux-foundation.org>,
	david@...hat.com,
	linux-mm@...ck.org
Cc: hannes@...xchg.org,
	shakeel.butt@...ux.dev,
	riel@...riel.com,
	ziy@...dia.com,
	laoar.shao@...il.com,
	baolin.wang@...ux.alibaba.com,
	lorenzo.stoakes@...cle.com,
	Liam.Howlett@...cle.com,
	npache@...hat.com,
	ryan.roberts@....com,
	vbabka@...e.cz,
	jannh@...gle.com,
	Arnd Bergmann <arnd@...db.de>,
	linux-kernel@...r.kernel.org,
	linux-doc@...r.kernel.org,
	kernel-team@...a.com,
	Usama Arif <usamaarif642@...il.com>
Subject: [PATCH v3 2/7] prctl: introduce PR_DEFAULT_MADV_HUGEPAGE for the process

This is set via the new PR_SET_THP_POLICY prctl. It has 2 effects:
- It sets VM_HUGEPAGE and clears VM_NOHUGEPAGE on the default VMA flags
  (def_flags). This means that every new VMA will be considered for
  hugepage.
- It iterates through every VMA in the process and calls
  hugepage_madvise() on it with the MADV_HUGEPAGE advice.
The policy is inherited during fork+exec.

This effectively allows setting MADV_HUGEPAGE on the entire process.
In an environment where different types of workloads are run on the
same machine, this will allow workloads that benefit from always having
hugepages to do so, without regressing those that don't.

Signed-off-by: Usama Arif <usamaarif642@...il.com>
---
 include/linux/huge_mm.h                       |  1 +
 include/linux/mm.h                            |  2 +-
 include/linux/mm_types.h                      |  4 ++-
 include/uapi/linux/prctl.h                    |  4 +++
 kernel/sys.c                                  | 29 +++++++++++++++++++
 mm/huge_memory.c                              | 13 +++++++++
 tools/include/uapi/linux/prctl.h              |  4 +++
 .../trace/beauty/include/uapi/linux/prctl.h   |  4 +++
 8 files changed, 59 insertions(+), 2 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 23580a43787c..b24a2e0ae642 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -431,6 +431,7 @@ change_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
 			__split_huge_pud(__vma, __pud, __address);	\
 	}  while (0)
 
+void process_default_madv_hugepage(struct mm_struct *mm, int advice);
 int hugepage_set_vmflags(unsigned long *vm_flags, int advice);
 int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags,
 		     int advice);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 43748c8f3454..436f4588bce8 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -466,7 +466,7 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB)
 
 /* This mask defines which mm->def_flags a process can inherit its parent */
-#define VM_INIT_DEF_MASK	VM_NOHUGEPAGE
+#define VM_INIT_DEF_MASK	(VM_HUGEPAGE | VM_NOHUGEPAGE)
 
 /* This mask represents all the VMA flag bits used by mlock */
 #define VM_LOCKED_MASK	(VM_LOCKED | VM_LOCKONFAULT)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index e76bade9ebb1..f1836b7c5704 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1703,6 +1703,7 @@ enum {
 					/* leave room for more dump flags */
 #define MMF_VM_MERGEABLE	16	/* KSM may merge identical pages */
 #define MMF_VM_HUGEPAGE		17	/* set when mm is available for khugepaged */
+#define MMF_VM_HUGEPAGE_MASK	(1 << MMF_VM_HUGEPAGE)
 
 /*
  * This one-shot flag is dropped due to necessity of changing exe once again
@@ -1742,7 +1743,8 @@ enum {
 
 #define MMF_INIT_MASK		(MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
 				 MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK |\
-				 MMF_VM_MERGE_ANY_MASK | MMF_TOPDOWN_MASK)
+				 MMF_VM_MERGE_ANY_MASK | MMF_TOPDOWN_MASK |\
+				 MMF_VM_HUGEPAGE_MASK)
 
 static inline unsigned long mmf_init_flags(unsigned long flags)
 {
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 15c18ef4eb11..15aaa4db5ff8 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -364,4 +364,8 @@ struct prctl_mm_map {
 # define PR_TIMER_CREATE_RESTORE_IDS_ON		1
 # define PR_TIMER_CREATE_RESTORE_IDS_GET	2
 
+#define PR_SET_THP_POLICY		78
+#define PR_GET_THP_POLICY		79
+#define PR_DEFAULT_MADV_HUGEPAGE	0
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/sys.c b/kernel/sys.c
index c434968e9f5d..74397ace62f3 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2474,6 +2474,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 		unsigned long, arg4, unsigned long, arg5)
 {
 	struct task_struct *me = current;
+	struct mm_struct *mm = me->mm;
 	unsigned char comm[sizeof(me->comm)];
 	long error;
 
@@ -2658,6 +2659,34 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 			clear_bit(MMF_DISABLE_THP, &me->mm->flags);
 		mmap_write_unlock(me->mm);
 		break;
+	case PR_GET_THP_POLICY:
+		if (arg2 || arg3 || arg4 || arg5)
+			return -EINVAL;
+		if (mmap_read_lock_killable(mm))	/* read-only: shared lock suffices */
+			return -EINTR;
+		if (mm->def_flags & VM_HUGEPAGE)
+			error = PR_DEFAULT_MADV_HUGEPAGE;
+		mmap_read_unlock(mm);
+		break;
+	case PR_SET_THP_POLICY:
+		if (arg3 || arg4 || arg5)
+			return -EINVAL;
+		if (mmap_write_lock_killable(mm))
+			return -EINTR;
+		switch (arg2) {
+		case PR_DEFAULT_MADV_HUGEPAGE:
+			/* Refuse with -EPERM while THP is globally disabled. */
+			error = !hugepage_global_enabled() ? -EPERM :
+				hugepage_set_vmflags(&mm->def_flags, MADV_HUGEPAGE);
+			if (!error)
+				process_default_madv_hugepage(mm, MADV_HUGEPAGE);
+			break;
+		default:
+			error = -EINVAL;
+			break;
+		}
+		mmap_write_unlock(mm);
+		break;
 	case PR_MPX_ENABLE_MANAGEMENT:
 	case PR_MPX_DISABLE_MANAGEMENT:
 		/* No longer implemented: */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 2780a12b25f0..72806fe772b5 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -98,6 +98,19 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma)
 	return !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode);
 }
 
+/* Run hugepage_madvise(@advice) on every VMA of @mm; mmap write lock must be held. */
+void process_default_madv_hugepage(struct mm_struct *mm, int advice)
+{
+	VMA_ITERATOR(iter, mm, 0);
+	struct vm_area_struct *vma;
+
+	mmap_assert_write_locked(mm);
+	for_each_vma(iter, vma) {
+		unsigned long flags = vma->vm_flags;	/* scratch copy, not stored back here */
+		hugepage_madvise(vma, &flags, advice);
+	}
+}
+
 unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
 					 unsigned long vm_flags,
 					 unsigned long tva_flags,
diff --git a/tools/include/uapi/linux/prctl.h b/tools/include/uapi/linux/prctl.h
index 35791791a879..f5945ebfe3f2 100644
--- a/tools/include/uapi/linux/prctl.h
+++ b/tools/include/uapi/linux/prctl.h
@@ -328,4 +328,8 @@ struct prctl_mm_map {
 # define PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC	0x10 /* Clear the aspect on exec */
 # define PR_PPC_DEXCR_CTRL_MASK		0x1f
 
+#define PR_SET_THP_POLICY		78
+#define PR_GET_THP_POLICY		79
+#define PR_DEFAULT_MADV_HUGEPAGE	0
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/tools/perf/trace/beauty/include/uapi/linux/prctl.h b/tools/perf/trace/beauty/include/uapi/linux/prctl.h
index 15c18ef4eb11..325c72f40a93 100644
--- a/tools/perf/trace/beauty/include/uapi/linux/prctl.h
+++ b/tools/perf/trace/beauty/include/uapi/linux/prctl.h
@@ -364,4 +364,8 @@ struct prctl_mm_map {
 # define PR_TIMER_CREATE_RESTORE_IDS_ON		1
 # define PR_TIMER_CREATE_RESTORE_IDS_GET	2
 
+#define PR_SET_THP_POLICY		78
+#define PR_GET_THP_POLICY		79
+#define PR_DEFAULT_MADV_HUGEPAGE	0
+
 #endif /* _LINUX_PRCTL_H */
-- 
2.47.1


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ