Message-Id: <20240726094618.401593-5-21cnbao@gmail.com>
Date: Fri, 26 Jul 2024 21:46:18 +1200
From: Barry Song <21cnbao@...il.com>
To: akpm@...ux-foundation.org,
linux-mm@...ck.org
Cc: ying.huang@...el.com,
baolin.wang@...ux.alibaba.com,
chrisl@...nel.org,
david@...hat.com,
hannes@...xchg.org,
hughd@...gle.com,
kaleshsingh@...gle.com,
kasong@...cent.com,
linux-kernel@...r.kernel.org,
mhocko@...e.com,
minchan@...nel.org,
nphamcs@...il.com,
ryan.roberts@....com,
senozhatsky@...omium.org,
shakeel.butt@...ux.dev,
shy828301@...il.com,
surenb@...gle.com,
v-songbaohua@...o.com,
willy@...radead.org,
xiang@...nel.org,
yosryahmed@...gle.com
Subject: [PATCH v5 4/4] mm: Introduce per-thpsize swapin control policy
From: Barry Song <v-songbaohua@...o.com>
Quoting Ying's comment:
A user space interface can be implemented to select different swap-in
order policies, similar to the mTHP allocation order policy. We need
a distinct policy because the performance characteristics of memory
allocation differ significantly from those of swap-in. For example,
SSD read speeds can be much slower than memory allocation. With
policy selection, I believe we can implement mTHP swap-in for
non-SWAP_SYNCHRONOUS scenarios as well. However, users need to
understand the implications of their choices. I think it's better to
start with at least "always" and "never". I believe we will add
"auto" in the future for automatic tuning, and it can eventually
become the default.
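As a usage sketch of the interface added here (the 64kB and 2048kB
sizes are only examples; the sizes actually exposed depend on the
architecture and base page size, and writing requires root), a policy
can be selected per size and read back, with the active value shown
in brackets:

	# enable mTHP swap-in for 64kB folios, keep PMD-sized swap-in off
	$ echo always > /sys/kernel/mm/transparent_hugepage/hugepages-64kB/swapin_enabled
	$ echo never > /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/swapin_enabled

	# the read side brackets whichever policy is active
	$ cat /sys/kernel/mm/transparent_hugepage/hugepages-64kB/swapin_enabled
	[always] never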
Suggested-by: "Huang, Ying" <ying.huang@...el.com>
Signed-off-by: Barry Song <v-songbaohua@...o.com>
---
Documentation/admin-guide/mm/transhuge.rst | 6 +++
include/linux/huge_mm.h | 1 +
mm/huge_memory.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++
mm/memory.c | 3 +-
4 files changed, 55 insertions(+), 1 deletion(-)
diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index 058485daf186..2e94e956ee12 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -144,6 +144,12 @@ hugepage sizes have enabled="never". If enabling multiple hugepage
sizes, the kernel will select the most appropriate enabled size for a
given allocation.
+Transparent Hugepage swap-in for anonymous memory can be enabled or disabled
+per supported THP size with one of::
+
+ echo always >/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/swapin_enabled
+ echo never >/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/swapin_enabled
+
It's also possible to limit defrag efforts in the VM to generate
anonymous hugepages in case they're not immediately free to madvise
regions or to never try to defrag memory and simply fallback to regular
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index e25d9ebfdf89..25174305b17f 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -92,6 +92,7 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr;
#define TVA_SMAPS (1 << 0) /* Will be used for procfs */
#define TVA_IN_PF (1 << 1) /* Page fault handler */
#define TVA_ENFORCE_SYSFS (1 << 2) /* Obey sysfs configuration */
+#define TVA_IN_SWAPIN (1 << 3) /* Do swap-in */
#define thp_vma_allowable_order(vma, vm_flags, tva_flags, order) \
(!!thp_vma_allowable_orders(vma, vm_flags, tva_flags, BIT(order)))
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 0167dc27e365..41460847988c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -80,6 +80,7 @@ unsigned long huge_zero_pfn __read_mostly = ~0UL;
unsigned long huge_anon_orders_always __read_mostly;
unsigned long huge_anon_orders_madvise __read_mostly;
unsigned long huge_anon_orders_inherit __read_mostly;
+unsigned long huge_anon_orders_swapin_always __read_mostly;
unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
unsigned long vm_flags,
@@ -88,6 +89,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
{
bool smaps = tva_flags & TVA_SMAPS;
bool in_pf = tva_flags & TVA_IN_PF;
+ bool in_swapin = tva_flags & TVA_IN_SWAPIN;
bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS;
unsigned long supported_orders;
@@ -100,6 +102,8 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
supported_orders = THP_ORDERS_ALL_FILE_DEFAULT;
orders &= supported_orders;
+ if (in_swapin)
+ orders &= READ_ONCE(huge_anon_orders_swapin_always);
if (!orders)
return 0;
@@ -523,8 +527,50 @@ static ssize_t thpsize_enabled_store(struct kobject *kobj,
static struct kobj_attribute thpsize_enabled_attr =
__ATTR(enabled, 0644, thpsize_enabled_show, thpsize_enabled_store);
+static DEFINE_SPINLOCK(huge_anon_orders_swapin_lock);
+
+static ssize_t thpsize_swapin_enabled_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ int order = to_thpsize(kobj)->order;
+ const char *output;
+
+ if (test_bit(order, &huge_anon_orders_swapin_always))
+ output = "[always] never";
+ else
+ output = "always [never]";
+
+ return sysfs_emit(buf, "%s\n", output);
+}
+
+static ssize_t thpsize_swapin_enabled_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int order = to_thpsize(kobj)->order;
+ ssize_t ret = count;
+
+ if (sysfs_streq(buf, "always")) {
+ spin_lock(&huge_anon_orders_swapin_lock);
+ set_bit(order, &huge_anon_orders_swapin_always);
+ spin_unlock(&huge_anon_orders_swapin_lock);
+ } else if (sysfs_streq(buf, "never")) {
+ spin_lock(&huge_anon_orders_swapin_lock);
+ clear_bit(order, &huge_anon_orders_swapin_always);
+ spin_unlock(&huge_anon_orders_swapin_lock);
+ } else {
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+static struct kobj_attribute thpsize_swapin_enabled_attr =
+ __ATTR(swapin_enabled, 0644, thpsize_swapin_enabled_show, thpsize_swapin_enabled_store);
+
static struct attribute *thpsize_attrs[] = {
&thpsize_enabled_attr.attr,
+ &thpsize_swapin_enabled_attr.attr,
#ifdef CONFIG_SHMEM
&thpsize_shmem_enabled_attr.attr,
#endif
diff --git a/mm/memory.c b/mm/memory.c
index 14048e9285d4..27c77f739a2c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4091,7 +4091,8 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
* and suitable for swapping THP.
*/
orders = thp_vma_allowable_orders(vma, vma->vm_flags,
- TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1);
+ TVA_IN_PF | TVA_IN_SWAPIN | TVA_ENFORCE_SYSFS,
+ BIT(PMD_ORDER) - 1);
orders = thp_vma_suitable_orders(vma, vmf->address, orders);
orders = thp_swap_suitable_orders(swp_offset(entry), vmf->address, orders);
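A quick sanity check of the default behaviour (a sketch assuming a
kernel with this patch applied; the glob expands to whatever sizes the
architecture exposes): huge_anon_orders_swapin_always starts out zero,
so every size initially reports "never" as the active policy:

	$ grep -H . /sys/kernel/mm/transparent_hugepage/hugepages-*kB/swapin_enabled
	/sys/kernel/mm/transparent_hugepage/hugepages-64kB/swapin_enabled:always [never]

With all bits clear, the orders &= READ_ONCE(huge_anon_orders_swapin_always)
filter in __thp_vma_allowable_orders() leaves no large order enabled on
the TVA_IN_SWAPIN path, so alloc_swap_folio() falls back to order 0
until a size is explicitly switched to "always".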
--
2.34.1