linux-kernel - Re: [PATCH 08/12] add trace events for each syscall entry/exit

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20090811105010.GB4938@nowhere>
Date:	Tue, 11 Aug 2009 12:50:14 +0200
From:	Frederic Weisbecker <fweisbec@...il.com>
To:	Jason Baron <jbaron@...hat.com>
Cc:	linux-kernel@...r.kernel.org, mingo@...e.hu, laijs@...fujitsu.com,
	rostedt@...dmis.org, peterz@...radead.org,
	mathieu.desnoyers@...ymtl.ca, jiayingz@...gle.com,
	mbligh@...gle.com, lizf@...fujitsu.com
Subject: Re: [PATCH 08/12] add trace events for each syscall entry/exit

On Mon, Aug 10, 2009 at 04:52:47PM -0400, Jason Baron wrote:
> Layer Frederic's syscall tracer on tracepoints. We create trace events via
> hooking into the SYCALL_DEFINE macros. This allows us to individually toggle
> syscall entry and exit points on/off.
> 
> Signed-off-by: Jason Baron <jbaron@...hat.com>
> 
> ---
>  include/linux/syscalls.h      |   61 +++++++++++++-
>  include/trace/syscall.h       |   18 ++--
>  kernel/trace/trace_syscalls.c |  183 ++++++++++++++++++++---------------------
>  3 files changed, 159 insertions(+), 103 deletions(-)
> 
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index 80de700..5e5b4d3 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -64,6 +64,7 @@ struct perf_counter_attr;
>  #include <linux/sem.h>
>  #include <asm/siginfo.h>
>  #include <asm/signal.h>
> +#include <linux/unistd.h>
>  #include <linux/quota.h>
>  #include <linux/key.h>
>  #include <trace/syscall.h>
> @@ -112,6 +113,59 @@ struct perf_counter_attr;
>  #define __SC_STR_TDECL5(t, a, ...)	#t, __SC_STR_TDECL4(__VA_ARGS__)
>  #define __SC_STR_TDECL6(t, a, ...)	#t, __SC_STR_TDECL5(__VA_ARGS__)
>  
> +
> +#define SYSCALL_TRACE_ENTER_EVENT(sname)				\
> +	static struct ftrace_event_call event_enter_##sname;		\
> +	static int init_enter_##sname(void)				\
> +	{								\
> +		int num;						\
> +		num = syscall_name_to_nr("sys"#sname);			\
> +		if (num < 0)						\
> +			return -ENOSYS;					\
> +		register_ftrace_event(&event_syscall_enter);		\
> +		INIT_LIST_HEAD(&event_enter_##sname.fields);		\
> +		init_preds(&event_enter_##sname);			\
> +		return 0;						\
> +	}								\



You could rather add the struct ftrace_event_call *event as
a parameter to a generic function static int init_enter_syscall()
That would require adding this parameter in the raw_init() callback.
If I remember well, Masami does that in his kprobes events patchset.

May be we can let it as is and wait for his patchset to be integrated
to update that.


> +	static struct ftrace_event_call __used				\
> +	  __attribute__((__aligned__(4)))				\
> +	  __attribute__((section("_ftrace_events")))			\
> +	  event_enter_##sname = {					\
> +		.name                   = "sys_enter"#sname,		\
> +		.system                 = "syscalls",			\
> +		.event                  = &event_syscall_enter,		\
> +		.raw_init		= init_enter_##sname,		\
> +		.regfunc		= reg_event_syscall_enter,	\
> +		.unregfunc		= unreg_event_syscall_enter,	\
> +		.data			= "sys"#sname,			\
> +	}
> +
> +#define SYSCALL_TRACE_EXIT_EVENT(sname)					\
> +	static struct ftrace_event_call event_exit_##sname;		\
> +	static int init_exit_##sname(void)				\
> +	{								\
> +		int num;						\
> +		num = syscall_name_to_nr("sys"#sname);			\
> +		if (num < 0)						\
> +			return -ENOSYS;					\
> +		register_ftrace_event(&event_syscall_exit);		\
> +		INIT_LIST_HEAD(&event_exit_##sname.fields);		\
> +		init_preds(&event_exit_##sname);			\
> +		return 0;						\
> +	}								\
> +	static struct ftrace_event_call __used				\
> +	  __attribute__((__aligned__(4)))				\
> +	  __attribute__((section("_ftrace_events")))			\
> +	  event_exit_##sname = {					\
> +		.name                   = "sys_exit"#sname,		\
> +		.system                 = "syscalls",			\
> +		.event                  = &event_syscall_exit,		\
> +		.raw_init		= init_exit_##sname,		\
> +		.regfunc		= reg_event_syscall_exit,	\
> +		.unregfunc		= unreg_event_syscall_exit,	\
> +		.data			= "sys"#sname,			\
> +	}
> +
>  #define SYSCALL_METADATA(sname, nb)				\
>  	static const struct syscall_metadata __used		\
>  	  __attribute__((__aligned__(4)))			\
> @@ -121,7 +175,9 @@ struct perf_counter_attr;
>  		.nb_args 	= nb,				\
>  		.types		= types_##sname,		\
>  		.args		= args_##sname,			\
> -	}
> +	};							\
> +	SYSCALL_TRACE_ENTER_EVENT(sname);			\
> +	SYSCALL_TRACE_EXIT_EVENT(sname);
>  
>  #define SYSCALL_DEFINE0(sname)					\
>  	static const struct syscall_metadata __used		\
> @@ -131,8 +187,9 @@ struct perf_counter_attr;
>  		.name 		= "sys_"#sname,			\
>  		.nb_args 	= 0,				\
>  	};							\
> +	SYSCALL_TRACE_ENTER_EVENT(_##sname);			\
> +	SYSCALL_TRACE_EXIT_EVENT(_##sname);			\
>  	asmlinkage long sys_##sname(void)
> -
>  #else
>  #define SYSCALL_DEFINE0(name)	   asmlinkage long sys_##name(void)
>  #endif
> diff --git a/include/trace/syscall.h b/include/trace/syscall.h
> index 3951d77..73fb8b4 100644
> --- a/include/trace/syscall.h
> +++ b/include/trace/syscall.h
> @@ -2,6 +2,8 @@
>  #define _TRACE_SYSCALL_H
>  
>  #include <linux/tracepoint.h>
> +#include <linux/unistd.h>
> +#include <linux/ftrace_event.h>
>  
>  #include <asm/ptrace.h>
>  
> @@ -40,15 +42,13 @@ struct syscall_metadata {
>  
>  #ifdef CONFIG_FTRACE_SYSCALLS
>  extern struct syscall_metadata *syscall_nr_to_meta(int nr);
> -extern void start_ftrace_syscalls(void);
> -extern void stop_ftrace_syscalls(void);
> -extern void ftrace_syscall_enter(struct pt_regs *regs);
> -extern void ftrace_syscall_exit(struct pt_regs *regs);
> -#else
> -static inline void start_ftrace_syscalls(void)			{ }
> -static inline void stop_ftrace_syscalls(void)			{ }
> -static inline void ftrace_syscall_enter(struct pt_regs *regs)	{ }
> -static inline void ftrace_syscall_exit(struct pt_regs *regs)	{ }
> +extern int syscall_name_to_nr(char *name);
> +extern struct trace_event event_syscall_enter;
> +extern struct trace_event event_syscall_exit;
> +extern int reg_event_syscall_enter(void *ptr);
> +extern void unreg_event_syscall_enter(void *ptr);
> +extern int reg_event_syscall_exit(void *ptr);
> +extern void unreg_event_syscall_exit(void *ptr);
>  #endif
>  
>  #endif /* _TRACE_SYSCALL_H */
> diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
> index 08aed43..c7ae25e 100644
> --- a/kernel/trace/trace_syscalls.c
> +++ b/kernel/trace/trace_syscalls.c
> @@ -1,15 +1,16 @@
>  #include <trace/syscall.h>
>  #include <linux/kernel.h>
> +#include <linux/ftrace.h>
>  #include <asm/syscall.h>
>  
>  #include "trace_output.h"
>  #include "trace.h"
>  
> -/* Keep a counter of the syscall tracing users */
> -static int refcount;
> -
> -/* Prevent from races on thread flags toggling */
>  static DEFINE_MUTEX(syscall_trace_lock);
> +static int sys_refcount_enter;
> +static int sys_refcount_exit;
> +static DECLARE_BITMAP(enabled_enter_syscalls, FTRACE_SYSCALL_MAX);
> +static DECLARE_BITMAP(enabled_exit_syscalls, FTRACE_SYSCALL_MAX);
>  
>  /* Option to display the parameters types */
>  enum {
> @@ -95,53 +96,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags)
>  	return TRACE_TYPE_HANDLED;
>  }
>  
> -void start_ftrace_syscalls(void)
> -{
> -	unsigned long flags;
> -	struct task_struct *g, *t;
> -
> -	mutex_lock(&syscall_trace_lock);
> -
> -	/* Don't enable the flag on the tasks twice */
> -	if (++refcount != 1)
> -		goto unlock;
> -
> -	read_lock_irqsave(&tasklist_lock, flags);
> -
> -	do_each_thread(g, t) {
> -		set_tsk_thread_flag(t, TIF_SYSCALL_FTRACE);
> -	} while_each_thread(g, t);
> -
> -	read_unlock_irqrestore(&tasklist_lock, flags);
> -
> -unlock:
> -	mutex_unlock(&syscall_trace_lock);
> -}
> -
> -void stop_ftrace_syscalls(void)
> -{
> -	unsigned long flags;
> -	struct task_struct *g, *t;
> -
> -	mutex_lock(&syscall_trace_lock);
> -
> -	/* There are perhaps still some users */
> -	if (--refcount)
> -		goto unlock;
> -
> -	read_lock_irqsave(&tasklist_lock, flags);
> -
> -	do_each_thread(g, t) {
> -		clear_tsk_thread_flag(t, TIF_SYSCALL_FTRACE);
> -	} while_each_thread(g, t);
> -
> -	read_unlock_irqrestore(&tasklist_lock, flags);
> -
> -unlock:
> -	mutex_unlock(&syscall_trace_lock);
> -}
> -
> -void ftrace_syscall_enter(struct pt_regs *regs)
> +void ftrace_syscall_enter(struct pt_regs *regs, long id)
>  {
>  	struct syscall_trace_enter *entry;
>  	struct syscall_metadata *sys_data;
> @@ -150,6 +105,8 @@ void ftrace_syscall_enter(struct pt_regs *regs)
>  	int syscall_nr;
>  
>  	syscall_nr = syscall_get_nr(current, regs);
> +	if (!test_bit(syscall_nr, enabled_enter_syscalls))
> +		return;
>  
>  	sys_data = syscall_nr_to_meta(syscall_nr);
>  	if (!sys_data)
> @@ -170,7 +127,7 @@ void ftrace_syscall_enter(struct pt_regs *regs)
>  	trace_wake_up();
>  }
>  
> -void ftrace_syscall_exit(struct pt_regs *regs)
> +void ftrace_syscall_exit(struct pt_regs *regs, long ret)
>  {
>  	struct syscall_trace_exit *entry;
>  	struct syscall_metadata *sys_data;
> @@ -178,6 +135,8 @@ void ftrace_syscall_exit(struct pt_regs *regs)
>  	int syscall_nr;
>  
>  	syscall_nr = syscall_get_nr(current, regs);
> +	if (!test_bit(syscall_nr, enabled_exit_syscalls))
> +		return;
>  
>  	sys_data = syscall_nr_to_meta(syscall_nr);
>  	if (!sys_data)
> @@ -196,54 +155,94 @@ void ftrace_syscall_exit(struct pt_regs *regs)
>  	trace_wake_up();
>  }
>  
> -static int init_syscall_tracer(struct trace_array *tr)
> +int reg_event_syscall_enter(void *ptr)
>  {
> -	start_ftrace_syscalls();
> -
> -	return 0;
> +	int ret = 0;
> +	int num;
> +	char *name;
> +
> +	name = (char *)ptr;
> +	num = syscall_name_to_nr(name);
> +	if (num < 0 || num >= FTRACE_SYSCALL_MAX)



I wonder if we should WARN once in this case. At least we would
be aware of new yet unsupported syscalls.



> +		return -ENOSYS;
> +	mutex_lock(&syscall_trace_lock);
> +	if (!sys_refcount_enter)
> +		ret = register_trace_syscall_enter(ftrace_syscall_enter);
> +	if (ret) {
> +		pr_info("event trace: Could not activate"
> +				"syscall entry trace point");
> +	} else {
> +		set_bit(num, enabled_enter_syscalls);
> +		sys_refcount_enter++;
> +	}
> +	mutex_unlock(&syscall_trace_lock);
> +	return ret;
>  }
>  
> -static void reset_syscall_tracer(struct trace_array *tr)
> +void unreg_event_syscall_enter(void *ptr)
>  {
> -	stop_ftrace_syscalls();
> -	tracing_reset_online_cpus(tr);
> -}
> -
> -static struct trace_event syscall_enter_event = {
> -	.type	 	= TRACE_SYSCALL_ENTER,
> -	.trace		= print_syscall_enter,
> -};
> -
> -static struct trace_event syscall_exit_event = {
> -	.type	 	= TRACE_SYSCALL_EXIT,
> -	.trace		= print_syscall_exit,
> -};
> +	int num;
> +	char *name;
>  
> -static struct tracer syscall_tracer __read_mostly = {
> -	.name	     	= "syscall",
> -	.init		= init_syscall_tracer,
> -	.reset		= reset_syscall_tracer,
> -	.flags		= &syscalls_flags,
> -};
> +	name = (char *)ptr;
> +	num = syscall_name_to_nr(name);
> +	if (num < 0 || num >= FTRACE_SYSCALL_MAX)
> +		return;
> +	mutex_lock(&syscall_trace_lock);
> +	sys_refcount_enter--;
> +	clear_bit(num, enabled_enter_syscalls);
> +	if (!sys_refcount_enter)
> +		unregister_trace_syscall_enter(ftrace_syscall_enter);
> +	mutex_unlock(&syscall_trace_lock);
> +}
>  
> -__init int register_ftrace_syscalls(void)
> +int reg_event_syscall_exit(void *ptr)
>  {
> -	int ret;
> -
> -	ret = register_ftrace_event(&syscall_enter_event);
> -	if (!ret) {
> -		printk(KERN_WARNING "event %d failed to register\n",
> -		       syscall_enter_event.type);
> -		WARN_ON_ONCE(1);
> +	int ret = 0;
> +	int num;
> +	char *name;
> +
> +	name = (char *)ptr;
> +	num = syscall_name_to_nr(name);
> +	if (num < 0 || num >= FTRACE_SYSCALL_MAX)
> +		return -ENOSYS;
> +	mutex_lock(&syscall_trace_lock);
> +	if (!sys_refcount_exit)
> +		ret = register_trace_syscall_exit(ftrace_syscall_exit);
> +	if (ret) {
> +		pr_info("event trace: Could not activate"
> +				"syscall exit trace point");
> +	} else {
> +		set_bit(num, enabled_exit_syscalls);
> +		sys_refcount_exit++;
>  	}
> +	mutex_unlock(&syscall_trace_lock);
> +	return ret;
> +}
>  
> -	ret = register_ftrace_event(&syscall_exit_event);
> -	if (!ret) {
> -		printk(KERN_WARNING "event %d failed to register\n",
> -		       syscall_exit_event.type);
> -		WARN_ON_ONCE(1);
> -	}
> +void unreg_event_syscall_exit(void *ptr)
> +{
> +	int num;
> +	char *name;
>  
> -	return register_tracer(&syscall_tracer);
> +	name = (char *)ptr;
> +	num = syscall_name_to_nr(name);
> +	if (num < 0 || num >= FTRACE_SYSCALL_MAX)
> +		return;
> +	mutex_lock(&syscall_trace_lock);
> +	sys_refcount_exit--;
> +	clear_bit(num, enabled_exit_syscalls);
> +	if (!sys_refcount_exit)
> +		unregister_trace_syscall_exit(ftrace_syscall_exit);
> +	mutex_unlock(&syscall_trace_lock);
>  }
> -device_initcall(register_ftrace_syscalls);
> +
> +struct trace_event event_syscall_enter = {
> +	.trace			= print_syscall_enter,
> +	.type			= TRACE_SYSCALL_ENTER
> +};
> +
> +struct trace_event event_syscall_exit = {
> +	.trace			= print_syscall_exit,
> +	.type			= TRACE_SYSCALL_EXIT
> +};
> -- 
> 1.6.2.5
> 

Nice.

It's a bit too bad that enter and exit must be that separated
whereas their callbacks are pretty the same.

But I guess if we want to nicely decouple both, we don't have the choice.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/