Date:   Thu, 15 Apr 2021 10:53:00 +0530
From:   Bharata B Rao <bharata@...ux.ibm.com>
To:     Dave Chinner <david@...morbit.com>
Cc:     akpm@...ux-foundation.org, linux-kernel@...r.kernel.org,
        linux-mm@...ck.org, linux-fsdevel@...r.kernel.org,
        aneesh.kumar@...ux.ibm.com
Subject: Re: High kmalloc-32 slab cache consumption with 10k containers

On Wed, Apr 07, 2021 at 08:28:07AM +1000, Dave Chinner wrote:
> 
> Another approach may be to identify filesystem types that do not
> need memcg awareness and feed that into alloc_super() to set/clear
> the SHRINKER_MEMCG_AWARE flag. This could be based on fstype - most
> virtual filesystems that expose system information do not really
> need full memcg awareness because they are generally only visible to
> a single memcg instance...

Would something like below be appropriate?

From f314083ad69fde2a420a1b74febd6d3f7a25085f Mon Sep 17 00:00:00 2001
From: Bharata B Rao <bharata@...ux.ibm.com>
Date: Wed, 14 Apr 2021 11:21:24 +0530
Subject: [PATCH 1/1] fs: Let filesystems opt out of memcg awareness

All filesystem mounts are memcg aware by default and hence
end up creating shrinker list_lrus for all the memcgs. Due to
the way memcg_nr_cache_ids grows and list_lru heads are
allocated for all memcgs, a huge amount of memory gets consumed
by the kmalloc-32 slab cache when running thousands of containers.

Improve this situation by allowing filesystems to opt out
of memcg awareness. In this patch, tmpfs, proc and ramfs
opt out, which leads to considerable memory savings when
running 10k containers.

Signed-off-by: Bharata B Rao <bharata@...ux.ibm.com>
---
 fs/proc/root.c             |  1 +
 fs/ramfs/inode.c           |  1 +
 fs/super.c                 | 27 +++++++++++++++++++--------
 include/linux/fs_context.h |  2 ++
 mm/shmem.c                 |  1 +
 5 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/fs/proc/root.c b/fs/proc/root.c
index c7e3b1350ef8..7856bc2ca9f4 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -257,6 +257,7 @@ static int proc_init_fs_context(struct fs_context *fc)
 	fc->user_ns = get_user_ns(ctx->pid_ns->user_ns);
 	fc->fs_private = ctx;
 	fc->ops = &proc_fs_context_ops;
+	fc->memcg_optout = true;
 	return 0;
 }
 
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 9ebd17d7befb..576a88bb7407 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -278,6 +278,7 @@ int ramfs_init_fs_context(struct fs_context *fc)
 	fsi->mount_opts.mode = RAMFS_DEFAULT_MODE;
 	fc->s_fs_info = fsi;
 	fc->ops = &ramfs_context_ops;
+	fc->memcg_optout = true;
 	return 0;
 }
 
diff --git a/fs/super.c b/fs/super.c
index 8c1baca35c16..59aa22c678e6 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -198,7 +198,8 @@ static void destroy_unused_super(struct super_block *s)
  *	returns a pointer new superblock or %NULL if allocation had failed.
  */
 static struct super_block *alloc_super(struct file_system_type *type, int flags,
-				       struct user_namespace *user_ns)
+				       struct user_namespace *user_ns,
+				       bool memcg_optout)
 {
 	struct super_block *s = kzalloc(sizeof(struct super_block),  GFP_USER);
 	static const struct super_operations default_op;
@@ -266,13 +267,22 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
 	s->s_shrink.scan_objects = super_cache_scan;
 	s->s_shrink.count_objects = super_cache_count;
 	s->s_shrink.batch = 1024;
-	s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE;
+	s->s_shrink.flags = SHRINKER_NUMA_AWARE;
+	if (!memcg_optout)
+		s->s_shrink.flags |= SHRINKER_MEMCG_AWARE;
 	if (prealloc_shrinker(&s->s_shrink))
 		goto fail;
-	if (list_lru_init_memcg(&s->s_dentry_lru, &s->s_shrink))
-		goto fail;
-	if (list_lru_init_memcg(&s->s_inode_lru, &s->s_shrink))
-		goto fail;
+	if (memcg_optout) {
+		if (list_lru_init(&s->s_dentry_lru))
+			goto fail;
+		if (list_lru_init(&s->s_inode_lru))
+			goto fail;
+	} else {
+		if (list_lru_init_memcg(&s->s_dentry_lru, &s->s_shrink))
+			goto fail;
+		if (list_lru_init_memcg(&s->s_inode_lru, &s->s_shrink))
+			goto fail;
+	}
 	return s;
 
 fail:
@@ -527,7 +537,8 @@ struct super_block *sget_fc(struct fs_context *fc,
 	}
 	if (!s) {
 		spin_unlock(&sb_lock);
-		s = alloc_super(fc->fs_type, fc->sb_flags, user_ns);
+		s = alloc_super(fc->fs_type, fc->sb_flags, user_ns,
+				fc->memcg_optout);
 		if (!s)
 			return ERR_PTR(-ENOMEM);
 		goto retry;
@@ -610,7 +621,7 @@ struct super_block *sget(struct file_system_type *type,
 	}
 	if (!s) {
 		spin_unlock(&sb_lock);
-		s = alloc_super(type, (flags & ~SB_SUBMOUNT), user_ns);
+		s = alloc_super(type, (flags & ~SB_SUBMOUNT), user_ns, false);
 		if (!s)
 			return ERR_PTR(-ENOMEM);
 		goto retry;
diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h
index 37e1e8f7f08d..73388c0b6950 100644
--- a/include/linux/fs_context.h
+++ b/include/linux/fs_context.h
@@ -110,6 +110,8 @@ struct fs_context {
 	bool			need_free:1;	/* Need to call ops->free() */
 	bool			global:1;	/* Goes into &init_user_ns */
 	bool			oldapi:1;	/* Coming from mount(2) */
+	bool			memcg_optout:1;	/* Opt out from per-memcg
+						   lru handling */
 };
 
 struct fs_context_operations {
diff --git a/mm/shmem.c b/mm/shmem.c
index b2db4ed0fbc7..0c9b2af52825 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3915,6 +3915,7 @@ int shmem_init_fs_context(struct fs_context *fc)
 
 	fc->fs_private = ctx;
 	fc->ops = &shmem_fs_context_ops;
+	fc->memcg_optout = true;
 	return 0;
 }
 
-- 
2.26.2
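
If this direction looks reasonable, any other filesystem wanting the same
behaviour would only need to set the proposed flag in its init_fs_context
callback before hooking it up in its file_system_type. A minimal sketch
(the filesystem name and its context ops below are purely illustrative,
not part of the patch above):

	/*
	 * Illustrative only: opting a hypothetical "examplefs" out of
	 * per-memcg LRU handling with the proposed fc->memcg_optout flag.
	 */
	#include <linux/fs_context.h>

	static const struct fs_context_operations examplefs_context_ops;

	static int examplefs_init_fs_context(struct fs_context *fc)
	{
		fc->ops = &examplefs_context_ops;
		fc->memcg_optout = true;	/* skip memcg-aware list_lrus */
		return 0;
	}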
