Message-ID: <20190225230703.GB32115@mini-arch>
Date: Mon, 25 Feb 2019 15:07:03 -0800
From: Stanislav Fomichev <sdf@...ichev.me>
To: Alexei Starovoitov <ast@...nel.org>
Cc: davem@...emloft.net, daniel@...earbox.net, edumazet@...gle.com,
netdev@...r.kernel.org, bpf@...r.kernel.org, kernel-team@...com
Subject: Re: [PATCH v3 bpf-next 1/4] bpf: enable program stats
On 02/25, Alexei Starovoitov wrote:
> JITed BPF programs are indistinguishable from kernel functions, but unlike
> kernel code, BPF code can change often.
> The typical "perf record" + "perf report" approach to profiling and tuning
> kernel code works just as well for BPF programs, but kernel code doesn't
> need to be monitored whereas BPF programs do.
> Users load and run a large number of BPF programs.
> These BPF stats allow tools to monitor BPF usage on the server.
> The monitoring tools will turn the sysctl kernel.bpf_stats_enabled
> on and off for a few seconds to sample the average cost of the programs.
> Aggregated data over hours and days will provide insight into the cost of BPF,
> and alarms can trigger in case a given program suddenly gets more expensive.
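Not a review comment per se, but to make sure I read the intended usage
right: the flow would be to flip the sysctl on, snapshot run_time_ns/run_cnt
from the prog's fdinfo, wait a few seconds, snapshot again, flip the sysctl
off, and report delta(run_time_ns)/delta(run_cnt) as the average cost?
A minimal userspace sketch of the sampling half, assuming the fdinfo field
names from this patch (prog_fd and everything else is hypothetical glue,
error handling trimmed):

#include <stdio.h>
#include <unistd.h>

/* Parse run_time_ns/run_cnt out of /proc/self/fdinfo/<prog_fd>. */
static void read_prog_stats(int prog_fd, unsigned long long *ns,
			    unsigned long long *cnt)
{
	char path[64], line[128];
	FILE *f;

	*ns = *cnt = 0;
	snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", prog_fd);
	f = fopen(path, "r");
	if (!f)
		return;
	while (fgets(line, sizeof(line), f)) {
		sscanf(line, "run_time_ns: %llu", ns);
		sscanf(line, "run_cnt: %llu", cnt);
	}
	fclose(f);
}

/* Average nsec per program run over a sampling window; the caller is
 * expected to have turned kernel.bpf_stats_enabled on beforehand and
 * to turn it off afterwards.
 */
static double avg_cost_ns(int prog_fd, unsigned int seconds)
{
	unsigned long long ns0, cnt0, ns1, cnt1;

	read_prog_stats(prog_fd, &ns0, &cnt0);
	sleep(seconds);
	read_prog_stats(prog_fd, &ns1, &cnt1);
	if (cnt1 == cnt0)
		return 0.0;
	return (double)(ns1 - ns0) / (double)(cnt1 - cnt0);
}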
>
> The cost of two sched_clock() calls per program invocation adds ~20 nsec.
> Fast BPF progs (like selftests/bpf/progs/test_pkt_access.c) will slow down
> from ~10 nsec to ~30 nsec.
> static_key minimizes the cost of the stats collection.
> There is no measurable difference before/after this patch
> with kernel.bpf_stats_enabled=0
>
> Signed-off-by: Alexei Starovoitov <ast@...nel.org>
> ---
> include/linux/bpf.h | 9 +++++++++
> include/linux/filter.h | 20 +++++++++++++++++++-
> kernel/bpf/core.c | 31 +++++++++++++++++++++++++++++--
> kernel/bpf/syscall.c | 34 ++++++++++++++++++++++++++++++++--
> kernel/bpf/verifier.c | 7 ++++++-
> kernel/sysctl.c | 34 ++++++++++++++++++++++++++++++++++
> 6 files changed, 129 insertions(+), 6 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index de18227b3d95..a2132e09dc1c 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -16,6 +16,7 @@
> #include <linux/rbtree_latch.h>
> #include <linux/numa.h>
> #include <linux/wait.h>
> +#include <linux/u64_stats_sync.h>
>
> struct bpf_verifier_env;
> struct perf_event;
> @@ -340,6 +341,12 @@ enum bpf_cgroup_storage_type {
>
> #define MAX_BPF_CGROUP_STORAGE_TYPE __BPF_CGROUP_STORAGE_MAX
>
> +struct bpf_prog_stats {
> + u64 cnt;
> + u64 nsecs;
> + struct u64_stats_sync syncp;
> +};
> +
> struct bpf_prog_aux {
> atomic_t refcnt;
> u32 used_map_cnt;
> @@ -389,6 +396,7 @@ struct bpf_prog_aux {
> * main prog always has linfo_idx == 0
> */
> u32 linfo_idx;
> + struct bpf_prog_stats __percpu *stats;
> union {
> struct work_struct work;
> struct rcu_head rcu;
> @@ -559,6 +567,7 @@ void bpf_map_area_free(void *base);
> void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr);
>
> extern int sysctl_unprivileged_bpf_disabled;
> +extern int sysctl_bpf_stats_enabled;
>
> int bpf_map_new_fd(struct bpf_map *map, int flags);
> int bpf_prog_new_fd(struct bpf_prog *prog);
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index f32b3eca5a04..7e5e3db11106 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -533,7 +533,24 @@ struct sk_filter {
> struct bpf_prog *prog;
> };
>
> -#define BPF_PROG_RUN(filter, ctx) ({ cant_sleep(); (*(filter)->bpf_func)(ctx, (filter)->insnsi); })
> +DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
> +
> +#define BPF_PROG_RUN(prog, ctx) ({ \
> + u32 ret; \
> + cant_sleep(); \
> + if (static_branch_unlikely(&bpf_stats_enabled_key)) { \
> + struct bpf_prog_stats *stats; \
> + u64 start = sched_clock(); \
QQ: why sched_clock() and not, for example, ktime_get_ns(), which we use
in bpf_test_run()? Or why not local_clock()?
I'm just wondering what kind of trade-off we are making here
regarding precision vs. runtime cost.
> + ret = (*(prog)->bpf_func)(ctx, (prog)->insnsi); \
> + stats = this_cpu_ptr(prog->aux->stats); \
> + u64_stats_update_begin(&stats->syncp); \
> + stats->cnt++; \
> + stats->nsecs += sched_clock() - start; \
> + u64_stats_update_end(&stats->syncp); \
> + } else { \
> + ret = (*(prog)->bpf_func)(ctx, (prog)->insnsi); \
> + } \
> + ret; })
>
> #define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN
>
> @@ -764,6 +781,7 @@ void bpf_prog_free_jited_linfo(struct bpf_prog *prog);
> void bpf_prog_free_unused_jited_linfo(struct bpf_prog *prog);
>
> struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags);
> +struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags);
> struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
> gfp_t gfp_extra_flags);
> void __bpf_prog_free(struct bpf_prog *fp);
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index ef88b167959d..1c14c347f3cf 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -78,7 +78,7 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns
> return NULL;
> }
>
> -struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
> +struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags)
> {
> gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags;
> struct bpf_prog_aux *aux;
> @@ -104,6 +104,26 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
>
> return fp;
> }
> +
> +struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
> +{
> + gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags;
> + struct bpf_prog *prog;
> +
> + prog = bpf_prog_alloc_no_stats(size, gfp_extra_flags);
> + if (!prog)
> + return NULL;
> +
> + prog->aux->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags);
> + if (!prog->aux->stats) {
> + kfree(prog->aux);
> + vfree(prog);
> + return NULL;
> + }
> +
> + u64_stats_init(&prog->aux->stats->syncp);
> + return prog;
> +}
> EXPORT_SYMBOL_GPL(bpf_prog_alloc);
>
> int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog)
> @@ -231,7 +251,10 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
>
> void __bpf_prog_free(struct bpf_prog *fp)
> {
> - kfree(fp->aux);
> + if (fp->aux) {
> + free_percpu(fp->aux->stats);
> + kfree(fp->aux);
> + }
> vfree(fp);
> }
>
> @@ -2069,6 +2092,10 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
> return -EFAULT;
> }
>
> +DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
> +EXPORT_SYMBOL(bpf_stats_enabled_key);
> +int sysctl_bpf_stats_enabled __read_mostly;
> +
> /* All definitions of tracepoints related to BPF. */
> #define CREATE_TRACE_POINTS
> #include <linux/bpf_trace.h>
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index ec7c552af76b..31cf66fc3f5c 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -1283,24 +1283,54 @@ static int bpf_prog_release(struct inode *inode, struct file *filp)
> return 0;
> }
>
> +static void bpf_prog_get_stats(const struct bpf_prog *prog,
> + struct bpf_prog_stats *stats)
> +{
> + u64 nsecs = 0, cnt = 0;
> + int cpu;
> +
> + for_each_possible_cpu(cpu) {
> + const struct bpf_prog_stats *st;
> + unsigned int start;
> + u64 tnsecs, tcnt;
> +
> + st = per_cpu_ptr(prog->aux->stats, cpu);
> + do {
> + start = u64_stats_fetch_begin_irq(&st->syncp);
> + tnsecs = st->nsecs;
> + tcnt = st->cnt;
> + } while (u64_stats_fetch_retry_irq(&st->syncp, start));
> + nsecs += tnsecs;
> + cnt += tcnt;
> + }
> + stats->nsecs = nsecs;
> + stats->cnt = cnt;
> +}
> +
> #ifdef CONFIG_PROC_FS
> static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
> {
> const struct bpf_prog *prog = filp->private_data;
> char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
> + struct bpf_prog_stats stats;
>
> + bpf_prog_get_stats(prog, &stats);
> bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
> seq_printf(m,
> "prog_type:\t%u\n"
> "prog_jited:\t%u\n"
> "prog_tag:\t%s\n"
> "memlock:\t%llu\n"
> - "prog_id:\t%u\n",
> + "prog_id:\t%u\n"
> + "run_time_ns:\t%llu\n"
> + "run_cnt:\t%llu\n",
> prog->type,
> prog->jited,
> prog_tag,
> prog->pages * 1ULL << PAGE_SHIFT,
> - prog->aux->id);
> + prog->aux->id,
> + stats.nsecs,
> + stats.cnt);
> }
> #endif
>
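So with stats enabled, the fdinfo output just grows two lines at the end,
e.g. (made-up values, field order as in the seq_printf above):

	prog_type:	5
	prog_jited:	1
	prog_tag:	bcf7977d3b93787c
	memlock:	4096
	prog_id:	7
	run_time_ns:	123456789
	run_cnt:	42000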
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 1b9496c41383..0e4edd7e3c5f 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -7320,7 +7320,12 @@ static int jit_subprogs(struct bpf_verifier_env *env)
> subprog_end = env->subprog_info[i + 1].start;
>
> len = subprog_end - subprog_start;
> - func[i] = bpf_prog_alloc(bpf_prog_size(len), GFP_USER);
> + /* BPF_PROG_RUN doesn't call subprogs directly,
> + * hence main prog stats include the runtime of subprogs.
> + * subprogs don't have IDs and not reachable via prog_get_next_id
> + * func[i]->aux->stats will never be accessed and stays NULL
> + */
> + func[i] = bpf_prog_alloc_no_stats(bpf_prog_size(len), GFP_USER);
> if (!func[i])
> goto out_free;
> memcpy(func[i]->insnsi, &prog->insnsi[subprog_start],
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index ba4d9e85feb8..86e0771352f2 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -224,6 +224,9 @@ static int proc_dostring_coredump(struct ctl_table *table, int write,
> #endif
> static int proc_dopipe_max_size(struct ctl_table *table, int write,
> void __user *buffer, size_t *lenp, loff_t *ppos);
> +static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write,
> + void __user *buffer, size_t *lenp,
> + loff_t *ppos);
>
> #ifdef CONFIG_MAGIC_SYSRQ
> /* Note: sysrq code uses its own private copy */
> @@ -1230,6 +1233,15 @@ static struct ctl_table kern_table[] = {
> .extra2 = &one,
> },
> #endif
> + {
> + .procname = "bpf_stats_enabled",
> + .data = &sysctl_bpf_stats_enabled,
> + .maxlen = sizeof(sysctl_bpf_stats_enabled),
> + .mode = 0644,
> + .proc_handler = proc_dointvec_minmax_bpf_stats,
> + .extra1 = &zero,
> + .extra2 = &one,
> + },
> #if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU)
> {
> .procname = "panic_on_rcu_stall",
> @@ -3260,6 +3272,28 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
>
> #endif /* CONFIG_PROC_SYSCTL */
>
> +static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write,
> + void __user *buffer, size_t *lenp,
> + loff_t *ppos)
> +{
> + int ret, bpf_stats = *(int *)table->data;
> + struct ctl_table tmp = *table;
> +
> + if (write && !capable(CAP_SYS_ADMIN))
> + return -EPERM;
> +
> + tmp.data = &bpf_stats;
> + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
> + if (write && !ret) {
> + *(int *)table->data = bpf_stats;
> + if (bpf_stats)
> + static_branch_enable(&bpf_stats_enabled_key);
> + else
> + static_branch_disable(&bpf_stats_enabled_key);
> + }
> + return ret;
> +}
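For completeness, the userspace side of this toggle is just a write to
/proc/sys/kernel/bpf_stats_enabled; something like the hypothetical helper
below is what I'd expect a monitoring tool to wrap around its sampling
window (it needs CAP_SYS_ADMIN, per the check above):

#include <stdio.h>

/* Turn kernel.bpf_stats_enabled on (1) or off (0); returns 0 on success. */
static int set_bpf_stats(int on)
{
	FILE *f = fopen("/proc/sys/kernel/bpf_stats_enabled", "w");

	if (!f)
		return -1;
	fprintf(f, "%d\n", !!on);
	return fclose(f);
}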
> +
> /*
> * No sense putting this after each symbol definition, twice,
> * exception granted :-)
> --
> 2.20.0
>