linux-kernel - [RFC PATCH] mm, oom: introduce vm.sacrifice_hugepage_on

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-Id: <20210216030713.79101-1-eiichi.tsukata@nutanix.com>
Date:   Tue, 16 Feb 2021 03:07:13 +0000
From:   Eiichi Tsukata <eiichi.tsukata@...anix.com>
To:     corbet@....net, mike.kravetz@...cle.com, mcgrof@...nel.org,
        keescook@...omium.org, yzaikin@...gle.com,
        akpm@...ux-foundation.org, linux-doc@...r.kernel.org,
        linux-kernel@...r.kernel.org, linux-mm@...ck.org,
        linux-fsdevel@...r.kernel.org
Cc:     felipe.franciosi@...anix.com,
        Eiichi Tsukata <eiichi.tsukata@...anix.com>
Subject: [RFC PATCH] mm, oom: introduce vm.sacrifice_hugepage_on_oom

Hugepages can be preallocated to avoid unpredictable allocation latency.
If we run into 4k page shortage, the kernel can trigger OOM even though
there were free hugepages. When OOM is triggered by user address page
fault handler, we can use oom notifier to free hugepages in user space
but if it's triggered by memory allocation for kernel, there is no way
to synchronously handle it in user space.

This patch introduces a new sysctl vm.sacrifice_hugepage_on_oom. If
enabled, it first tries to free a hugepage if available before invoking
the oom-killer. The default value is disabled not to change the current
behavior.

Signed-off-by: Eiichi Tsukata <eiichi.tsukata@...anix.com>
---
 Documentation/admin-guide/sysctl/vm.rst | 12 ++++++++++++
 include/linux/hugetlb.h                 |  2 ++
 include/linux/oom.h                     |  1 +
 kernel/sysctl.c                         |  9 +++++++++
 mm/hugetlb.c                            |  4 ++--
 mm/oom_kill.c                           | 23 +++++++++++++++++++++++
 6 files changed, 49 insertions(+), 2 deletions(-)

diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
index e35a3f2fb006..f2f195524be6 100644
--- a/Documentation/admin-guide/sysctl/vm.rst
+++ b/Documentation/admin-guide/sysctl/vm.rst
@@ -65,6 +65,7 @@ Currently, these files are in /proc/sys/vm:
 - page-cluster
 - panic_on_oom
 - percpu_pagelist_fraction
+- sacrifice_hugepage_on_oom
 - stat_interval
 - stat_refresh
 - numa_stat
@@ -807,6 +808,17 @@ The initial value is zero.  Kernel does not use this value at boot time to set
 the high water marks for each per cpu page list.  If the user writes '0' to this
 sysctl, it will revert to this default behavior.
 
+sacrifice_hugepage_on_oom
+=========================
+
+This value controls whether the kernel should attempt to break up hugepages
+when out-of-memory happens. OOM happens under memory cgroup would not invoke
+this.
+
+If set to 0 (default), the kernel doesn't touch the hugepage pool during OOM
+conditions.
+If set to 1, the kernel frees one hugepage at a time, if available, before
+invoking the oom-killer.
 
 stat_interval
 =============
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index b5807f23caf8..8aad2f2ab6e6 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -145,6 +145,8 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to,
 long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
 						long freed);
 bool isolate_huge_page(struct page *page, struct list_head *list);
+int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
+			bool acct_surplus);
 void putback_active_hugepage(struct page *page);
 void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason);
 void free_huge_page(struct page *page);
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 2db9a1432511..0bfae027ec16 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -127,4 +127,5 @@ extern struct task_struct *find_lock_task_mm(struct task_struct *p);
 extern int sysctl_oom_dump_tasks;
 extern int sysctl_oom_kill_allocating_task;
 extern int sysctl_panic_on_oom;
+extern int sysctl_sacrifice_hugepage_on_oom;
 #endif /* _INCLUDE_LINUX_OOM_H */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c9fbdd848138..d2e3ec625f5f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2708,6 +2708,15 @@ static struct ctl_table vm_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+	{
+		.procname	= "sacrifice_hugepage_on_oom",
+		.data		= &sysctl_sacrifice_hugepage_on_oom,
+		.maxlen		= sizeof(sysctl_sacrifice_hugepage_on_oom),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
 	{
 		.procname	= "overcommit_ratio",
 		.data		= &sysctl_overcommit_ratio,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 4bdb58ab14cb..e2d57200fd00 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1726,8 +1726,8 @@ static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
  * balanced over allowed nodes.
  * Called with hugetlb_lock locked.
  */
-static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
-							 bool acct_surplus)
+int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
+			bool acct_surplus)
 {
 	int nr_nodes, node;
 	int ret = 0;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 04b19b7b5435..fd2c1f427926 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -43,6 +43,7 @@
 #include <linux/kthread.h>
 #include <linux/init.h>
 #include <linux/mmu_notifier.h>
+#include <linux/hugetlb.h>
 
 #include <asm/tlb.h>
 #include "internal.h"
@@ -52,6 +53,7 @@
 #include <trace/events/oom.h>
 
 int sysctl_panic_on_oom;
+int sysctl_sacrifice_hugepage_on_oom;
 int sysctl_oom_kill_allocating_task;
 int sysctl_oom_dump_tasks = 1;
 
@@ -1023,6 +1025,22 @@ static void check_panic_on_oom(struct oom_control *oc)
 		sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
 }
 
+static int sacrifice_hugepage(void)
+{
+	int ret;
+
+	spin_lock(&hugetlb_lock);
+	ret = free_pool_huge_page(&default_hstate, &node_states[N_MEMORY], 0);
+	spin_unlock(&hugetlb_lock);
+	if (ret) {
+		pr_warn("Out of memory: Successfully sacrificed a hugepage\n");
+		hugetlb_show_meminfo();
+	} else {
+		pr_warn("Out of memory: No free hugepage available\n");
+	}
+	return ret;
+}
+
 static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
 
 int register_oom_notifier(struct notifier_block *nb)
@@ -1100,6 +1118,11 @@ bool out_of_memory(struct oom_control *oc)
 		return true;
 	}
 
+	if (!is_memcg_oom(oc) && sysctl_sacrifice_hugepage_on_oom) {
+		if (sacrifice_hugepage())
+			return true;
+	}
+
 	select_bad_process(oc);
 	/* Found nothing?!?! */
 	if (!oc->chosen) {
-- 
2.29.2