Re: [PATCH v3 2/6] mm: shrinkers: introduce debugfs interface for memory shrinkers

From: Muchun Song
Date: Sun May 22 2022 - 06:37:11 EST


On Mon, May 09, 2022 at 11:38:16AM -0700, Roman Gushchin wrote:
> This commit introduces the /sys/kernel/debug/shrinker debugfs
> interface which provides an ability to observe the state of
> individual kernel memory shrinkers.
>
> Because the feature adds some memory overhead (which shouldn't be
> large unless there is a huge amount of registered shrinkers), it's
> guarded by a config option (enabled by default).
>
> This commit introduces the "count" interface for each shrinker
> registered in the system.
>
> The output is in the following format:

Hi Roman,

Should we print a title to show what those numbers mean? That would
make the output more understandable.

> <cgroup inode id> <nr of objects on node 0> <nr of objects on node 1>...
> <cgroup inode id> <nr of objects on node 0> <nr of objects on node 1>...
> ...
>
> To reduce the size of output on machines with many thousands cgroups,
> if the total number of objects on all nodes is 0, the line is omitted.
>
> If the shrinker is not memcg-aware or CONFIG_MEMCG is off, 0 is
> printed as cgroup inode id. If the shrinker is not numa-aware, 0's are
> printed for all nodes except the first one.
>
> This commit gives debugfs entries simple numeric names, which are not
> very convenient. The following commit in the series will provide
> shrinkers with more meaningful names.
>
> Signed-off-by: Roman Gushchin <roman.gushchin@xxxxxxxxx>
> ---
> include/linux/shrinker.h | 19 ++++-
> lib/Kconfig.debug | 9 +++
> mm/Makefile | 1 +
> mm/shrinker_debug.c | 171 +++++++++++++++++++++++++++++++++++++++
> mm/vmscan.c | 6 +-
> 5 files changed, 203 insertions(+), 3 deletions(-)
> create mode 100644 mm/shrinker_debug.c
>
> diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
> index 76fbf92b04d9..2ced8149c513 100644
> --- a/include/linux/shrinker.h
> +++ b/include/linux/shrinker.h
> @@ -72,6 +72,10 @@ struct shrinker {
> #ifdef CONFIG_MEMCG
> /* ID in shrinker_idr */
> int id;
> +#endif
> +#ifdef CONFIG_SHRINKER_DEBUG
> + int debugfs_id;
> + struct dentry *debugfs_entry;
> #endif
> /* objs pending delete, per node */
> atomic_long_t *nr_deferred;
> @@ -94,4 +98,17 @@ extern int register_shrinker(struct shrinker *shrinker);
> extern void unregister_shrinker(struct shrinker *shrinker);
> extern void free_prealloced_shrinker(struct shrinker *shrinker);
> extern void synchronize_shrinkers(void);
> -#endif
> +
> +#ifdef CONFIG_SHRINKER_DEBUG
> +extern int shrinker_debugfs_add(struct shrinker *shrinker);
> +extern void shrinker_debugfs_remove(struct shrinker *shrinker);
> +#else /* CONFIG_SHRINKER_DEBUG */
> +static inline int shrinker_debugfs_add(struct shrinker *shrinker)
> +{
> + return 0;
> +}
> +static inline void shrinker_debugfs_remove(struct shrinker *shrinker)
> +{
> +}
> +#endif /* CONFIG_SHRINKER_DEBUG */
> +#endif /* _LINUX_SHRINKER_H */
> diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
> index 3fd7a2e9eaf1..5fa65a649798 100644
> --- a/lib/Kconfig.debug
> +++ b/lib/Kconfig.debug
> @@ -733,6 +733,15 @@ config SLUB_STATS
> out which slabs are relevant to a particular load.
> Try running: slabinfo -DA
>
> +config SHRINKER_DEBUG
> + default y
> + bool "Enable shrinker debugging support"
> + depends on DEBUG_FS
> + help
> + Say Y to enable the shrinker debugfs interface which provides
> + visibility into the kernel memory shrinkers subsystem.
> + Disable it to avoid an extra memory footprint.
> +
> config HAVE_DEBUG_KMEMLEAK
> bool
>
> diff --git a/mm/Makefile b/mm/Makefile
> index 298c9991ab75..8083fa85a348 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -133,3 +133,4 @@ obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o
> obj-$(CONFIG_IO_MAPPING) += io-mapping.o
> obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o
> obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o
> +obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
> diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c
> new file mode 100644
> index 000000000000..fd1f805a581a
> --- /dev/null
> +++ b/mm/shrinker_debug.c
> @@ -0,0 +1,171 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include <linux/idr.h>
> +#include <linux/slab.h>
> +#include <linux/debugfs.h>
> +#include <linux/seq_file.h>
> +#include <linux/shrinker.h>
> +#include <linux/memcontrol.h>
> +
> +/* defined in vmscan.c */
> +extern struct rw_semaphore shrinker_rwsem;
> +extern struct list_head shrinker_list;
> +
> +static DEFINE_IDA(shrinker_debugfs_ida);
> +static struct dentry *shrinker_debugfs_root;
> +
> +static unsigned long shrinker_count_objects(struct shrinker *shrinker,
> + struct mem_cgroup *memcg,
> + unsigned long *count_per_node)
> +{
> + unsigned long nr, total = 0;
> + int nid;
> +
> + for_each_node(nid) {
> + if (nid == 0 || (shrinker->flags & SHRINKER_NUMA_AWARE)) {
> + struct shrink_control sc = {
> + .gfp_mask = GFP_KERNEL,
> + .nid = nid,
> + .memcg = memcg,
> + };
> +
> + nr = shrinker->count_objects(shrinker, &sc);
> + if (nr == SHRINK_EMPTY)
> + nr = 0;
> + } else {
> + nr = 0;

For efficiency, we could break here, right?

> + }
> +
> + count_per_node[nid] = nr;
> + total += nr;
> + }
> +
> + return total;
> +}
> +
> +static int shrinker_debugfs_count_show(struct seq_file *m, void *v)
> +{
> + struct shrinker *shrinker = (struct shrinker *)m->private;

Maybe we could drop the cast, since m->private is of type void *.

> + unsigned long *count_per_node = NULL;

This does not need to be initialized, right?

> + struct mem_cgroup *memcg;
> + unsigned long total;
> + bool memcg_aware;
> + int ret, nid;
> +
> + count_per_node = kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL);
> + if (!count_per_node)
> + return -ENOMEM;
> +
> + ret = down_read_killable(&shrinker_rwsem);
> + if (ret) {
> + kfree(count_per_node);
> + return ret;
> + }
> + rcu_read_lock();
> +
> + memcg_aware = shrinker->flags & SHRINKER_MEMCG_AWARE;
> +
> + memcg = mem_cgroup_iter(NULL, NULL, NULL);
> + do {
> + if (memcg && !mem_cgroup_online(memcg))
> + continue;
> +
> + total = shrinker_count_objects(shrinker,
> + memcg_aware ? memcg : NULL,
> + count_per_node);
> + if (total) {
> + seq_printf(m, "%lu", mem_cgroup_ino(memcg));
> + for_each_node(nid)
> + seq_printf(m, " %lu", count_per_node[nid]);
> + seq_puts(m, "\n");

seq_putc(m, '\n') is more efficient.

> + }
> +
> + if (!memcg_aware) {
> + mem_cgroup_iter_break(NULL, memcg);
> + break;
> + }
> +
> + if (signal_pending(current)) {
> + mem_cgroup_iter_break(NULL, memcg);
> + ret = -EINTR;
> + break;
> + }
> +
> + cond_resched();

We are holding the RCU read lock here, so we cannot schedule, right?

> + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
> +
> + rcu_read_unlock();
> + up_read(&shrinker_rwsem);
> +
> + kfree(count_per_node);
> + return ret;
> +}
> +DEFINE_SHOW_ATTRIBUTE(shrinker_debugfs_count);
> +
> +int shrinker_debugfs_add(struct shrinker *shrinker)
> +{
> + struct dentry *entry;
> + char buf[16];
> + int id;
> +
> + lockdep_assert_held(&shrinker_rwsem);
> +
> + /* debugfs isn't initialized yet, add debugfs entries later. */
> + if (!shrinker_debugfs_root)
> + return 0;
> +
> + id = ida_alloc(&shrinker_debugfs_ida, GFP_KERNEL);
> + if (id < 0)
> + return id;
> + shrinker->debugfs_id = id;
> +
> + snprintf(buf, sizeof(buf), "%d", id);
> +
> + /* create debugfs entry */
> + entry = debugfs_create_dir(buf, shrinker_debugfs_root);
> + if (IS_ERR(entry)) {
> + ida_free(&shrinker_debugfs_ida, id);
> + return PTR_ERR(entry);
> + }
> + shrinker->debugfs_entry = entry;
> +
> + debugfs_create_file("count", 0220, entry, shrinker,
> + &shrinker_debugfs_count_fops);
> + return 0;
> +}
> +
> +void shrinker_debugfs_remove(struct shrinker *shrinker)
> +{
> + lockdep_assert_held(&shrinker_rwsem);
> +
> + if (!shrinker->debugfs_entry)
> + return;
> +
> + debugfs_remove_recursive(shrinker->debugfs_entry);
> + ida_free(&shrinker_debugfs_ida, shrinker->debugfs_id);
> +}
> +
> +static int __init shrinker_debugfs_init(void)
> +{
> + struct shrinker *shrinker;
> + int ret;
> +
> + if (!debugfs_initialized())
> + return -ENODEV;
> +

Redundant check since it is checked in debugfs_create_dir().
So I think we could remove this.

> + shrinker_debugfs_root = debugfs_create_dir("shrinker", NULL);

We should use IS_ERR() to detect the error code. So the following
check is wrong.

> + if (!shrinker_debugfs_root)
> + return -ENOMEM;
> +
> + /* Create debugfs entries for shrinkers registered at boot */
> + ret = down_write_killable(&shrinker_rwsem);

How could we kill this process? IIUC, late_initcall() is called
from the early init process, so there is no way to kill it, right?
If so, I think we could just use down_write().

Thanks.

> + if (ret)
> + return ret;
> +
> + list_for_each_entry(shrinker, &shrinker_list, list)
> + if (!shrinker->debugfs_entry)
> + ret = shrinker_debugfs_add(shrinker);
> + up_write(&shrinker_rwsem);
> +
> + return ret;
> +}
> +late_initcall(shrinker_debugfs_init);
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index c6918fff06e1..024f7056b98c 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -190,8 +190,8 @@ static void set_task_reclaim_state(struct task_struct *task,
> task->reclaim_state = rs;
> }
>
> -static LIST_HEAD(shrinker_list);
> -static DECLARE_RWSEM(shrinker_rwsem);
> +LIST_HEAD(shrinker_list);
> +DECLARE_RWSEM(shrinker_rwsem);
>
> #ifdef CONFIG_MEMCG
> static int shrinker_nr_max;
> @@ -655,6 +655,7 @@ void register_shrinker_prepared(struct shrinker *shrinker)
> down_write(&shrinker_rwsem);
> list_add_tail(&shrinker->list, &shrinker_list);
> shrinker->flags |= SHRINKER_REGISTERED;
> + WARN_ON_ONCE(shrinker_debugfs_add(shrinker));
> up_write(&shrinker_rwsem);
> }
>
> @@ -682,6 +683,7 @@ void unregister_shrinker(struct shrinker *shrinker)
> shrinker->flags &= ~SHRINKER_REGISTERED;
> if (shrinker->flags & SHRINKER_MEMCG_AWARE)
> unregister_memcg_shrinker(shrinker);
> + shrinker_debugfs_remove(shrinker);
> up_write(&shrinker_rwsem);
>
> kfree(shrinker->nr_deferred);
> --
> 2.35.3
>
>