lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <YooSSCnphhuwfOOc@FVFYT0MHHV2J.usts.net>
Date:   Sun, 22 May 2022 18:36:56 +0800
From:   Muchun Song <songmuchun@...edance.com>
To:     Roman Gushchin <roman.gushchin@...ux.dev>
Cc:     Andrew Morton <akpm@...ux-foundation.org>, linux-mm@...ck.org,
        Dave Chinner <dchinner@...hat.com>,
        linux-kernel@...r.kernel.org,
        Kent Overstreet <kent.overstreet@...il.com>,
        Hillf Danton <hdanton@...a.com>,
        Christophe JAILLET <christophe.jaillet@...adoo.fr>
Subject: Re: [PATCH v3 2/6] mm: shrinkers: introduce debugfs interface for
 memory shrinkers

On Mon, May 09, 2022 at 11:38:16AM -0700, Roman Gushchin wrote:
> This commit introduces the /sys/kernel/debug/shrinker debugfs
> interface which provides an ability to observe the state of
> individual kernel memory shrinkers.
> 
> Because the feature adds some memory overhead (which shouldn't be
> large unless there is a huge amount of registered shrinkers), it's
> guarded by a config option (enabled by default).
> 
> This commit introduces the "count" interface for each shrinker
> registered in the system.
> 
> The output is in the following format:

Hi Roman,

Should we print a header line to show what those numbers mean?  That would
make the output more understandable.

> <cgroup inode id> <nr of objects on node 0> <nr of objects on node 1>...
> <cgroup inode id> <nr of objects on node 0> <nr of objects on node 1>...
> ...
> 
> To reduce the size of output on machines with many thousands cgroups,
> if the total number of objects on all nodes is 0, the line is omitted.
> 
> If the shrinker is not memcg-aware or CONFIG_MEMCG is off, 0 is
> printed as cgroup inode id. If the shrinker is not numa-aware, 0's are
> printed for all nodes except the first one.
> 
> This commit gives debugfs entries simple numeric names, which are not
> very convenient. The following commit in the series will provide
> shrinkers with more meaningful names.
> 
> Signed-off-by: Roman Gushchin <roman.gushchin@...ux.dev>
> ---
>  include/linux/shrinker.h |  19 ++++-
>  lib/Kconfig.debug        |   9 +++
>  mm/Makefile              |   1 +
>  mm/shrinker_debug.c      | 171 +++++++++++++++++++++++++++++++++++++++
>  mm/vmscan.c              |   6 +-
>  5 files changed, 203 insertions(+), 3 deletions(-)
>  create mode 100644 mm/shrinker_debug.c
> 
> diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
> index 76fbf92b04d9..2ced8149c513 100644
> --- a/include/linux/shrinker.h
> +++ b/include/linux/shrinker.h
> @@ -72,6 +72,10 @@ struct shrinker {
>  #ifdef CONFIG_MEMCG
>  	/* ID in shrinker_idr */
>  	int id;
> +#endif
> +#ifdef CONFIG_SHRINKER_DEBUG
> +	int debugfs_id;
> +	struct dentry *debugfs_entry;
>  #endif
>  	/* objs pending delete, per node */
>  	atomic_long_t *nr_deferred;
> @@ -94,4 +98,17 @@ extern int register_shrinker(struct shrinker *shrinker);
>  extern void unregister_shrinker(struct shrinker *shrinker);
>  extern void free_prealloced_shrinker(struct shrinker *shrinker);
>  extern void synchronize_shrinkers(void);
> -#endif
> +
> +#ifdef CONFIG_SHRINKER_DEBUG
> +extern int shrinker_debugfs_add(struct shrinker *shrinker);
> +extern void shrinker_debugfs_remove(struct shrinker *shrinker);
> +#else /* CONFIG_SHRINKER_DEBUG */
> +static inline int shrinker_debugfs_add(struct shrinker *shrinker)
> +{
> +	return 0;
> +}
> +static inline void shrinker_debugfs_remove(struct shrinker *shrinker)
> +{
> +}
> +#endif /* CONFIG_SHRINKER_DEBUG */
> +#endif /* _LINUX_SHRINKER_H */
> diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
> index 3fd7a2e9eaf1..5fa65a649798 100644
> --- a/lib/Kconfig.debug
> +++ b/lib/Kconfig.debug
> @@ -733,6 +733,15 @@ config SLUB_STATS
>  	  out which slabs are relevant to a particular load.
>  	  Try running: slabinfo -DA
>  
> +config SHRINKER_DEBUG
> +	default y
> +	bool "Enable shrinker debugging support"
> +	depends on DEBUG_FS
> +	help
> +	  Say Y to enable the shrinker debugfs interface which provides
> +	  visibility into the kernel memory shrinkers subsystem.
> +	  Disable it to avoid an extra memory footprint.
> +
>  config HAVE_DEBUG_KMEMLEAK
>  	bool
>  
> diff --git a/mm/Makefile b/mm/Makefile
> index 298c9991ab75..8083fa85a348 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -133,3 +133,4 @@ obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o
>  obj-$(CONFIG_IO_MAPPING) += io-mapping.o
>  obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o
>  obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o
> +obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
> diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c
> new file mode 100644
> index 000000000000..fd1f805a581a
> --- /dev/null
> +++ b/mm/shrinker_debug.c
> @@ -0,0 +1,171 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include <linux/idr.h>
> +#include <linux/slab.h>
> +#include <linux/debugfs.h>
> +#include <linux/seq_file.h>
> +#include <linux/shrinker.h>
> +#include <linux/memcontrol.h>
> +
> +/* defined in vmscan.c */
> +extern struct rw_semaphore shrinker_rwsem;
> +extern struct list_head shrinker_list;
> +
> +static DEFINE_IDA(shrinker_debugfs_ida);
> +static struct dentry *shrinker_debugfs_root;
> +
> +static unsigned long shrinker_count_objects(struct shrinker *shrinker,
> +					    struct mem_cgroup *memcg,
> +					    unsigned long *count_per_node)
> +{
> +	unsigned long nr, total = 0;
> +	int nid;
> +
> +	for_each_node(nid) {
> +		if (nid == 0 || (shrinker->flags & SHRINKER_NUMA_AWARE)) {
> +			struct shrink_control sc = {
> +				.gfp_mask = GFP_KERNEL,
> +				.nid = nid,
> +				.memcg = memcg,
> +			};
> +
> +			nr = shrinker->count_objects(shrinker, &sc);
> +			if (nr == SHRINK_EMPTY)
> +				nr = 0;
> +		} else {
> +			nr = 0;

For efficiency, we could break here, right?

> +		}
> +
> +		count_per_node[nid] = nr;
> +		total += nr;
> +	}
> +
> +	return total;
> +}
> +
> +static int shrinker_debugfs_count_show(struct seq_file *m, void *v)
> +{
> +	struct shrinker *shrinker = (struct shrinker *)m->private;

Maybe we could drop the cast, since m->private is already a void *.

> +	unsigned long *count_per_node = NULL;

This does not need to be initialized, right?

> +	struct mem_cgroup *memcg;
> +	unsigned long total;
> +	bool memcg_aware;
> +	int ret, nid;
> +
> +	count_per_node = kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL);
> +	if (!count_per_node)
> +		return -ENOMEM;
> +
> +	ret = down_read_killable(&shrinker_rwsem);
> +	if (ret) {
> +		kfree(count_per_node);
> +		return ret;
> +	}
> +	rcu_read_lock();
> +
> +	memcg_aware = shrinker->flags & SHRINKER_MEMCG_AWARE;
> +
> +	memcg = mem_cgroup_iter(NULL, NULL, NULL);
> +	do {
> +		if (memcg && !mem_cgroup_online(memcg))
> +			continue;
> +
> +		total = shrinker_count_objects(shrinker,
> +					       memcg_aware ? memcg : NULL,
> +					       count_per_node);
> +		if (total) {
> +			seq_printf(m, "%lu", mem_cgroup_ino(memcg));
> +			for_each_node(nid)
> +				seq_printf(m, " %lu", count_per_node[nid]);
> +			seq_puts(m, "\n");

seq_putc(m, '\n') is more efficient.

> +		}
> +
> +		if (!memcg_aware) {
> +			mem_cgroup_iter_break(NULL, memcg);
> +			break;
> +		}
> +
> +		if (signal_pending(current)) {
> +			mem_cgroup_iter_break(NULL, memcg);
> +			ret = -EINTR;
> +			break;
> +		}
> +
> +		cond_resched();

We are holding the RCU read lock here, so we cannot schedule, right?

> +	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
> +
> +	rcu_read_unlock();
> +	up_read(&shrinker_rwsem);
> +
> +	kfree(count_per_node);
> +	return ret;
> +}
> +DEFINE_SHOW_ATTRIBUTE(shrinker_debugfs_count);
> +
> +int shrinker_debugfs_add(struct shrinker *shrinker)
> +{
> +	struct dentry *entry;
> +	char buf[16];
> +	int id;
> +
> +	lockdep_assert_held(&shrinker_rwsem);
> +
> +	/* debugfs isn't initialized yet, add debugfs entries later. */
> +	if (!shrinker_debugfs_root)
> +		return 0;
> +
> +	id = ida_alloc(&shrinker_debugfs_ida, GFP_KERNEL);
> +	if (id < 0)
> +		return id;
> +	shrinker->debugfs_id = id;
> +
> +	snprintf(buf, sizeof(buf), "%d", id);
> +
> +	/* create debugfs entry */
> +	entry = debugfs_create_dir(buf, shrinker_debugfs_root);
> +	if (IS_ERR(entry)) {
> +		ida_free(&shrinker_debugfs_ida, id);
> +		return PTR_ERR(entry);
> +	}
> +	shrinker->debugfs_entry = entry;
> +
> +	debugfs_create_file("count", 0220, entry, shrinker,
> +			    &shrinker_debugfs_count_fops);
> +	return 0;
> +}
> +
> +void shrinker_debugfs_remove(struct shrinker *shrinker)
> +{
> +	lockdep_assert_held(&shrinker_rwsem);
> +
> +	if (!shrinker->debugfs_entry)
> +		return;
> +
> +	debugfs_remove_recursive(shrinker->debugfs_entry);
> +	ida_free(&shrinker_debugfs_ida, shrinker->debugfs_id);
> +}
> +
> +static int __init shrinker_debugfs_init(void)
> +{
> +	struct shrinker *shrinker;
> +	int ret;
> +
> +	if (!debugfs_initialized())
> +		return -ENODEV;
> +

Redundant check since it is checked in debugfs_create_dir().
So I think we could remove this.

> +	shrinker_debugfs_root = debugfs_create_dir("shrinker", NULL);

debugfs_create_dir() reports failure via an error pointer, so we should use
IS_ERR() to detect it.  The following NULL check is therefore wrong.

> +	if (!shrinker_debugfs_root)
> +		return -ENOMEM;
> +
> +	/* Create debugfs entries for shrinkers registered at boot */
> +	ret = down_write_killable(&shrinker_rwsem);

How could this process be killed?  IIUC, late_initcall() is called from the
early init process, so there is no way to kill it, right?
If so, I think we could just use down_write().

Thanks.

> +	if (ret)
> +		return ret;
> +
> +	list_for_each_entry(shrinker, &shrinker_list, list)
> +		if (!shrinker->debugfs_entry)
> +			ret = shrinker_debugfs_add(shrinker);
> +	up_write(&shrinker_rwsem);
> +
> +	return ret;
> +}
> +late_initcall(shrinker_debugfs_init);
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index c6918fff06e1..024f7056b98c 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -190,8 +190,8 @@ static void set_task_reclaim_state(struct task_struct *task,
>  	task->reclaim_state = rs;
>  }
>  
> -static LIST_HEAD(shrinker_list);
> -static DECLARE_RWSEM(shrinker_rwsem);
> +LIST_HEAD(shrinker_list);
> +DECLARE_RWSEM(shrinker_rwsem);
>  
>  #ifdef CONFIG_MEMCG
>  static int shrinker_nr_max;
> @@ -655,6 +655,7 @@ void register_shrinker_prepared(struct shrinker *shrinker)
>  	down_write(&shrinker_rwsem);
>  	list_add_tail(&shrinker->list, &shrinker_list);
>  	shrinker->flags |= SHRINKER_REGISTERED;
> +	WARN_ON_ONCE(shrinker_debugfs_add(shrinker));
>  	up_write(&shrinker_rwsem);
>  }
>  
> @@ -682,6 +683,7 @@ void unregister_shrinker(struct shrinker *shrinker)
>  	shrinker->flags &= ~SHRINKER_REGISTERED;
>  	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
>  		unregister_memcg_shrinker(shrinker);
> +	shrinker_debugfs_remove(shrinker);
>  	up_write(&shrinker_rwsem);
>  
>  	kfree(shrinker->nr_deferred);
> -- 
> 2.35.3
> 
> 

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ