[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <oi4axinfi7ix6iyrll3wloamdcywpiieneqwjh6fzt52rtkp4s@i2xz6nrurktt>
Date: Mon, 25 Nov 2024 09:34:11 -0800
From: Shakeel Butt <shakeel.butt@...ux.dev>
To: Andrew Morton <akpm@...ux-foundation.org>,
Yosry Ahmed <yosryahmed@...gle.com>
Cc: Johannes Weiner <hannes@...xchg.org>, Michal Hocko <mhocko@...nel.org>,
Roman Gushchin <roman.gushchin@...ux.dev>, Muchun Song <muchun.song@...ux.dev>,
Vlastimil Babka <vbabka@...e.cz>, Axel Rasmussen <axelrasmussen@...gle.com>,
Steven Rostedt <rostedt@...dmis.org>, Suren Baghdasaryan <surenb@...gle.com>,
Matthew Wilcox <willy@...radead.org>, linux-mm@...ck.org, cgroups@...r.kernel.org,
linux-kernel@...r.kernel.org, Meta kernel team <kernel-team@...a.com>
Subject: Re: [PATCH v2] mm: mmap_lock: optimize mmap_lock tracepoints
Cc Yosry.
On Mon, Nov 25, 2024 at 09:16:17AM -0800, Shakeel Butt wrote:
> We are starting to deploy mmap_lock tracepoint monitoring across our
> fleet and the early results showed that these tracepoints are consuming
> significant amount of CPUs in kernfs_path_from_node when enabled.
>
> It seems like the kernel is trying to resolve the cgroup path in the
> fast path of the locking code path when the tracepoints are enabled. In
> addition for some application their metrics are regressing when
> monitoring is enabled.
>
> The cgroup path resolution can be slow and should not be done in the
> fast path. Most userspace tools, like bpftrace, provides functionality
> to get the cgroup path from cgroup id, so let's just trace the cgroup
> id and the users can use better tools to get the path in the slow path.
>
> Signed-off-by: Shakeel Butt <shakeel.butt@...ux.dev>
> ---
>
> Changes since v1:
> - Fixed commit message (Yosry).
> - Renamed memcg_id_from_mm to cgroup_id_from_mm (Yosry).
> - Fixed handling of root memcg (Yosry).
> - Fixed mem_cgroup_disabled() handling.
> - Kept mm pointer printing based on Steven's feedback.
>
> include/linux/memcontrol.h | 22 ++++++++++++++
> include/trace/events/mmap_lock.h | 32 ++++++++++----------
> mm/mmap_lock.c | 50 ++------------------------------
> 3 files changed, 40 insertions(+), 64 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 5502aa8e138e..b28180269e75 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -1046,6 +1046,23 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm,
>
> void split_page_memcg(struct page *head, int old_order, int new_order);
>
> +static inline u64 cgroup_id_from_mm(struct mm_struct *mm)
> +{
> + struct mem_cgroup *memcg;
> + u64 id;
> +
> + if (mem_cgroup_disabled())
> + return 0;
> +
> + rcu_read_lock();
> + memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
> + if (!memcg)
> + memcg = root_mem_cgroup;
> + id = cgroup_id(memcg->css.cgroup);
> + rcu_read_unlock();
> + return id;
> +}
> +
> #else /* CONFIG_MEMCG */
>
> #define MEM_CGROUP_ID_SHIFT 0
> @@ -1466,6 +1483,11 @@ void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx)
> static inline void split_page_memcg(struct page *head, int old_order, int new_order)
> {
> }
> +
> +static inline u64 cgroup_id_from_mm(struct mm_struct *mm)
> +{
> + return 0;
> +}
> #endif /* CONFIG_MEMCG */
>
> /*
> diff --git a/include/trace/events/mmap_lock.h b/include/trace/events/mmap_lock.h
> index bc2e3ad787b3..cf9f9faf8914 100644
> --- a/include/trace/events/mmap_lock.h
> +++ b/include/trace/events/mmap_lock.h
> @@ -5,6 +5,7 @@
> #if !defined(_TRACE_MMAP_LOCK_H) || defined(TRACE_HEADER_MULTI_READ)
> #define _TRACE_MMAP_LOCK_H
>
> +#include <linux/memcontrol.h>
> #include <linux/tracepoint.h>
> #include <linux/types.h>
>
> @@ -12,64 +13,61 @@ struct mm_struct;
>
> DECLARE_EVENT_CLASS(mmap_lock,
>
> - TP_PROTO(struct mm_struct *mm, const char *memcg_path, bool write),
> + TP_PROTO(struct mm_struct *mm, bool write),
>
> - TP_ARGS(mm, memcg_path, write),
> + TP_ARGS(mm, write),
>
> TP_STRUCT__entry(
> __field(struct mm_struct *, mm)
> - __string(memcg_path, memcg_path)
> + __field(u64, memcg_id)
> __field(bool, write)
> ),
>
> TP_fast_assign(
> __entry->mm = mm;
> - __assign_str(memcg_path);
> + __entry->memcg_id = cgroup_id_from_mm(mm);
> __entry->write = write;
> ),
>
> TP_printk(
> - "mm=%p memcg_path=%s write=%s",
> - __entry->mm,
> - __get_str(memcg_path),
> + "mm=%p memcg_id=%llu write=%s",
> + __entry->mm, __entry->memcg_id,
> __entry->write ? "true" : "false"
> )
> );
>
> #define DEFINE_MMAP_LOCK_EVENT(name) \
> DEFINE_EVENT(mmap_lock, name, \
> - TP_PROTO(struct mm_struct *mm, const char *memcg_path, \
> - bool write), \
> - TP_ARGS(mm, memcg_path, write))
> + TP_PROTO(struct mm_struct *mm, bool write), \
> + TP_ARGS(mm, write))
>
> DEFINE_MMAP_LOCK_EVENT(mmap_lock_start_locking);
> DEFINE_MMAP_LOCK_EVENT(mmap_lock_released);
>
> TRACE_EVENT(mmap_lock_acquire_returned,
>
> - TP_PROTO(struct mm_struct *mm, const char *memcg_path, bool write,
> - bool success),
> + TP_PROTO(struct mm_struct *mm, bool write, bool success),
>
> - TP_ARGS(mm, memcg_path, write, success),
> + TP_ARGS(mm, write, success),
>
> TP_STRUCT__entry(
> __field(struct mm_struct *, mm)
> - __string(memcg_path, memcg_path)
> + __field(u64, memcg_id)
> __field(bool, write)
> __field(bool, success)
> ),
>
> TP_fast_assign(
> __entry->mm = mm;
> - __assign_str(memcg_path);
> + __entry->memcg_id = cgroup_id_from_mm(mm);
> __entry->write = write;
> __entry->success = success;
> ),
>
> TP_printk(
> - "mm=%p memcg_path=%s write=%s success=%s",
> + "mm=%p memcg_id=%llu write=%s success=%s",
> __entry->mm,
> - __get_str(memcg_path),
> + __entry->memcg_id,
> __entry->write ? "true" : "false",
> __entry->success ? "true" : "false"
> )
> diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
> index f186d57df2c6..e7dbaf96aa17 100644
> --- a/mm/mmap_lock.c
> +++ b/mm/mmap_lock.c
> @@ -17,51 +17,7 @@ EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking);
> EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned);
> EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released);
>
> -#ifdef CONFIG_MEMCG
> -
> -/*
> - * Size of the buffer for memcg path names. Ignoring stack trace support,
> - * trace_events_hist.c uses MAX_FILTER_STR_VAL for this, so we also use it.
> - */
> -#define MEMCG_PATH_BUF_SIZE MAX_FILTER_STR_VAL
> -
> -#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \
> - do { \
> - if (trace_mmap_lock_##type##_enabled()) { \
> - char buf[MEMCG_PATH_BUF_SIZE]; \
> - get_mm_memcg_path(mm, buf, sizeof(buf)); \
> - trace_mmap_lock_##type(mm, buf, ##__VA_ARGS__); \
> - } \
> - } while (0)
> -
> -#else /* !CONFIG_MEMCG */
> -
> -#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \
> - trace_mmap_lock_##type(mm, "", ##__VA_ARGS__)
> -
> -#endif /* CONFIG_MEMCG */
> -
> #ifdef CONFIG_TRACING
> -#ifdef CONFIG_MEMCG
> -/*
> - * Write the given mm_struct's memcg path to a buffer. If the path cannot be
> - * determined, empty string is written.
> - */
> -static void get_mm_memcg_path(struct mm_struct *mm, char *buf, size_t buflen)
> -{
> - struct mem_cgroup *memcg;
> -
> - buf[0] = '\0';
> - memcg = get_mem_cgroup_from_mm(mm);
> - if (memcg == NULL)
> - return;
> - if (memcg->css.cgroup)
> - cgroup_path(memcg->css.cgroup, buf, buflen);
> - css_put(&memcg->css);
> -}
> -
> -#endif /* CONFIG_MEMCG */
> -
> /*
> * Trace calls must be in a separate file, as otherwise there's a circular
> * dependency between linux/mmap_lock.h and trace/events/mmap_lock.h.
> @@ -69,20 +25,20 @@ static void get_mm_memcg_path(struct mm_struct *mm, char *buf, size_t buflen)
>
> void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write)
> {
> - TRACE_MMAP_LOCK_EVENT(start_locking, mm, write);
> + trace_mmap_lock_start_locking(mm, write);
> }
> EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking);
>
> void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
> bool success)
> {
> - TRACE_MMAP_LOCK_EVENT(acquire_returned, mm, write, success);
> + trace_mmap_lock_acquire_returned(mm, write, success);
> }
> EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned);
>
> void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write)
> {
> - TRACE_MMAP_LOCK_EVENT(released, mm, write);
> + trace_mmap_lock_released(mm, write);
> }
> EXPORT_SYMBOL(__mmap_lock_do_trace_released);
> #endif /* CONFIG_TRACING */
> --
> 2.43.5
>
Powered by blists - more mailing lists