lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250515133519.2779639-2-usamaarif642@gmail.com>
Date: Thu, 15 May 2025 14:33:30 +0100
From: Usama Arif <usamaarif642@...il.com>
To: Andrew Morton <akpm@...ux-foundation.org>,
	david@...hat.com,
	linux-mm@...ck.org
Cc: hannes@...xchg.org,
	shakeel.butt@...ux.dev,
	riel@...riel.com,
	ziy@...dia.com,
	laoar.shao@...il.com,
	baolin.wang@...ux.alibaba.com,
	lorenzo.stoakes@...cle.com,
	Liam.Howlett@...cle.com,
	npache@...hat.com,
	ryan.roberts@....com,
	linux-kernel@...r.kernel.org,
	linux-doc@...r.kernel.org,
	kernel-team@...a.com,
	Usama Arif <usamaarif642@...il.com>
Subject: [PATCH 1/6] prctl: introduce PR_THP_POLICY_DEFAULT_HUGE for the process

PR_THP_POLICY_DEFAULT_HUGE is set via the new PR_SET_THP_POLICY prctl.
It sets the MMF2_THP_VMA_DEFAULT_HUGE process flag, which makes new
VMAs default to VM_HUGEPAGE. The call also marks all existing VMAs
that are not VM_NOHUGEPAGE as VM_HUGEPAGE. The policy is inherited
across fork+exec.

On systems where the global THP policy is set to "madvise", this
lets a process effectively get THPs "always". In an environment
where different types of workloads are stacked on the same machine,
workloads that benefit from always having hugepages can opt in,
without regressing those that don't.

Signed-off-by: Usama Arif <usamaarif642@...il.com>
---
 include/linux/huge_mm.h                       |  3 ++
 include/linux/mm_types.h                      | 11 +++++++
 include/uapi/linux/prctl.h                    |  4 +++
 kernel/fork.c                                 |  1 +
 kernel/sys.c                                  | 21 ++++++++++++
 mm/huge_memory.c                              | 32 +++++++++++++++++++
 mm/vma.c                                      |  2 ++
 tools/include/uapi/linux/prctl.h              |  4 +++
 .../trace/beauty/include/uapi/linux/prctl.h   |  4 +++
 9 files changed, 82 insertions(+)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 2f190c90192d..e652ad9ddbbd 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -260,6 +260,9 @@ static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma,
 	return orders;
 }
 
+void vma_set_thp_policy(struct vm_area_struct *vma);
+void process_vmas_thp_default_huge(struct mm_struct *mm);
+
 unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
 					 unsigned long vm_flags,
 					 unsigned long tva_flags,
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index e76bade9ebb1..2fe93965e761 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1066,6 +1066,7 @@ struct mm_struct {
 		mm_context_t context;
 
 		unsigned long flags; /* Must use atomic bitops to access */
+		unsigned long flags2;
 
 #ifdef CONFIG_AIO
 		spinlock_t			ioctx_lock;
@@ -1744,6 +1745,11 @@ enum {
 				 MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK |\
 				 MMF_VM_MERGE_ANY_MASK | MMF_TOPDOWN_MASK)
 
+#define MMF2_THP_VMA_DEFAULT_HUGE		0
+#define MMF2_THP_VMA_DEFAULT_HUGE_MASK		(1 << MMF2_THP_VMA_DEFAULT_HUGE)
+
+#define MMF2_INIT_MASK		(MMF2_THP_VMA_DEFAULT_HUGE_MASK)
+
 static inline unsigned long mmf_init_flags(unsigned long flags)
 {
 	if (flags & (1UL << MMF_HAS_MDWE_NO_INHERIT))
@@ -1752,4 +1758,9 @@ static inline unsigned long mmf_init_flags(unsigned long flags)
 	return flags & MMF_INIT_MASK;
 }
 
+static inline unsigned long mmf2_init_flags(unsigned long flags)
+{
+	return flags & MMF2_INIT_MASK;	/* bits a new mm keeps from current->mm */
+}
+
 #endif /* _LINUX_MM_TYPES_H */
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 15c18ef4eb11..325c72f40a93 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -364,4 +364,8 @@ struct prctl_mm_map {
 # define PR_TIMER_CREATE_RESTORE_IDS_ON		1
 # define PR_TIMER_CREATE_RESTORE_IDS_GET	2
 
+#define PR_SET_THP_POLICY		78
+#define PR_GET_THP_POLICY		79
+#define PR_THP_POLICY_DEFAULT_HUGE	0
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index 9e4616dacd82..6e5f4a8869dc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1054,6 +1054,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 
 	if (current->mm) {
 		mm->flags = mmf_init_flags(current->mm->flags);
+		mm->flags2 = mmf2_init_flags(current->mm->flags2);
 		mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK;
 	} else {
 		mm->flags = default_dump_filter;
diff --git a/kernel/sys.c b/kernel/sys.c
index c434968e9f5d..1115f258f253 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2658,6 +2658,27 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 			clear_bit(MMF_DISABLE_THP, &me->mm->flags);
 		mmap_write_unlock(me->mm);
 		break;
+	case PR_GET_THP_POLICY:
+		if (arg2 || arg3 || arg4 || arg5)
+			return -EINVAL;
+		if (test_bit(MMF2_THP_VMA_DEFAULT_HUGE, &me->mm->flags2))
+			error = PR_THP_POLICY_DEFAULT_HUGE; /* NOTE(review): is 0; same as unset? */
+		break;
+	case PR_SET_THP_POLICY:
+		if (arg3 || arg4 || arg5)
+			return -EINVAL;
+		if (mmap_write_lock_killable(me->mm))
+			return -EINTR;
+		switch (arg2) {
+		case PR_THP_POLICY_DEFAULT_HUGE: /* callee sets the mm flag itself */
+			process_vmas_thp_default_huge(me->mm);
+			break;
+		default:
+			error = -EINVAL; /* no early return: mmap lock must be dropped */
+			break;
+		}
+		mmap_write_unlock(me->mm);
+		break;
 	case PR_MPX_ENABLE_MANAGEMENT:
 	case PR_MPX_DISABLE_MANAGEMENT:
 		/* No longer implemented: */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 2780a12b25f0..64f66d5295e8 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -98,6 +98,38 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma)
 	return !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode);
 }
 
+/* Flag @vma VM_HUGEPAGE if its mm has MMF2_THP_VMA_DEFAULT_HUGE set. */
+void vma_set_thp_policy(struct vm_area_struct *vma)
+{
+	if (!test_bit(MMF2_THP_VMA_DEFAULT_HUGE, &vma->vm_mm->flags2))
+		return;
+	vm_flags_set(vma, VM_HUGEPAGE);
+}
+
+/*
+ * Set VM_HUGEPAGE on every VMA of @mm that is not marked VM_NOHUGEPAGE.
+ */
+static void vmas_thp_default_huge(struct mm_struct *mm)
+{
+	struct vm_area_struct *cur;
+	VMA_ITERATOR(iter, mm, 0);
+
+	for_each_vma(iter, cur) {
+		if (!(cur->vm_flags & VM_NOHUGEPAGE))
+			vm_flags_set(cur, VM_HUGEPAGE);
+	}
+}
+
+/* If @mm lacks MMF2_THP_VMA_DEFAULT_HUGE, set it and update all its VMAs. */
+void process_vmas_thp_default_huge(struct mm_struct *mm)
+{
+	if (!test_bit(MMF2_THP_VMA_DEFAULT_HUGE, &mm->flags2)) {
+		set_bit(MMF2_THP_VMA_DEFAULT_HUGE, &mm->flags2);
+		vmas_thp_default_huge(mm);
+	}
+}
+
+
 unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
 					 unsigned long vm_flags,
 					 unsigned long tva_flags,
diff --git a/mm/vma.c b/mm/vma.c
index 1f2634b29568..101b19c96803 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -2476,6 +2476,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap)
 	if (!vma_is_anonymous(vma))
 		khugepaged_enter_vma(vma, map->flags);
 	ksm_add_vma(vma);
+	vma_set_thp_policy(vma);
 	*vmap = vma;
 	return 0;
 
@@ -2705,6 +2706,7 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
 	mm->map_count++;
 	validate_mm(mm);
 	ksm_add_vma(vma);
+	vma_set_thp_policy(vma);
 out:
 	perf_event_mmap(vma);
 	mm->total_vm += len >> PAGE_SHIFT;
diff --git a/tools/include/uapi/linux/prctl.h b/tools/include/uapi/linux/prctl.h
index 35791791a879..f5945ebfe3f2 100644
--- a/tools/include/uapi/linux/prctl.h
+++ b/tools/include/uapi/linux/prctl.h
@@ -328,4 +328,8 @@ struct prctl_mm_map {
 # define PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC	0x10 /* Clear the aspect on exec */
 # define PR_PPC_DEXCR_CTRL_MASK		0x1f
 
+#define PR_SET_THP_POLICY		78
+#define PR_GET_THP_POLICY		79
+#define PR_THP_POLICY_DEFAULT_HUGE	0
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/tools/perf/trace/beauty/include/uapi/linux/prctl.h b/tools/perf/trace/beauty/include/uapi/linux/prctl.h
index 15c18ef4eb11..325c72f40a93 100644
--- a/tools/perf/trace/beauty/include/uapi/linux/prctl.h
+++ b/tools/perf/trace/beauty/include/uapi/linux/prctl.h
@@ -364,4 +364,8 @@ struct prctl_mm_map {
 # define PR_TIMER_CREATE_RESTORE_IDS_ON		1
 # define PR_TIMER_CREATE_RESTORE_IDS_GET	2
 
+#define PR_SET_THP_POLICY		78
+#define PR_GET_THP_POLICY		79
+#define PR_THP_POLICY_DEFAULT_HUGE	0
+
 #endif /* _LINUX_PRCTL_H */
-- 
2.47.1


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ