[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <557A832B.7020405@huawei.com>
Date: Fri, 12 Jun 2015 14:58:51 +0800
From: "Wangnan (F)" <wangnan0@...wei.com>
To: Daniel Wagner <daniel.wagner@...-carit.de>,
Steven Rostedt <rostedt@...dmis.org>,
Tom Zanussi <tom.zanussi@...ux.intel.com>
CC: "linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
"Alexei Starovoitov" <ast@...mgrid.com>
Subject: Re: latency histogram with BPF
On 2015/6/11 15:25, Daniel Wagner wrote:
> Hi Steven and Tom,
>
> I was playing a bit with BPF to see if it would be possible to
> get a preempt off latency histogram. On paper this looks like
> doable but adding kprobes on trace_preempt_[on|off] is not a
> clever idea. Who would have thought? :)
>
> In both cases BPF or based on Tom's 'hist' triggers' patches, there is
> some trickery necessary to get it working. While the first approach
> has more flexibility what you want to measure or how you want to
> present it, I suspect it will be harder to get it working/accepted.
>
> Anyway, here is some code to laugh at.
>
> From e3b9e7122f78953248eae54376b5a65f8beba362 Mon Sep 17 00:00:00 2001
> From: Daniel Wagner <daniel.wagner@...-carit.de>
> Date: Thu, 11 Jun 2015 09:19:03 +0200
> Subject: [PATCH] hwlathist: BPF based latency tracing
>
> BPF offers another way to generate latency histograms. We attach
> kprobes at trace_preempt_off and trace_preempt_on and calculate the
> time it takes to from seeing the off/on transition.
>
> The first hashmap is used to store the start time stamp. The key is the
> CPU id. The second hashmap stores the log2(time diff). To be able to
> have a per-cpu histogram the CPU ID is added to the upper
> bits of the hash key.
>
> Obviously, adding kprobes to trace_preempt_[on|off] is
> asking for problems:
[SNIP]
>
> CPU 0
> latency : count distribution
> 1 -> 1 : 0 | |
> 2 -> 3 : 0 | |
> 4 -> 7 : 0 | |
> 8 -> 15 : 3602 |************************************************* |
> 16 -> 31 : 22 | |
> 32 -> 63 : 0 | |
> 64 -> 127 : 0 | |
> 128 -> 255 : 0 | |
> 256 -> 511 : 0 | |
> 512 -> 1023 : 0 | |
> CPU 1
> latency : count distribution
> 1 -> 1 : 0 | |
> 2 -> 3 : 0 | |
> 4 -> 7 : 0 | |
> 8 -> 15 : 7533 |************************************************* |
> 16 -> 31 : 125 | |
> 32 -> 63 : 0 | |
> 64 -> 127 : 0 | |
> 128 -> 255 : 0 | |
> 256 -> 511 : 0 | |
> 512 -> 1023 : 0 | |
> CPU 2
> latency : count distribution
> 1 -> 1 : 0 | |
> 2 -> 3 : 0 | |
> 4 -> 7 : 0 | |
> 8 -> 15 : 3044 |************************************************* |
> 16 -> 31 : 13 | |
> 32 -> 63 : 0 | |
> 64 -> 127 : 0 | |
> 128 -> 255 : 0 | |
> 256 -> 511 : 0 | |
> 512 -> 1023 : 0 | |
> CPU 3
> latency : count distribution
> 1 -> 1 : 0 | |
> 2 -> 3 : 0 | |
> 4 -> 7 : 0 | |
> 8 -> 15 : 25182 |************************************************* |
> 16 -> 31 : 1675 |** |
> 32 -> 63 : 0 | |
> 64 -> 127 : 0 | |
> 128 -> 255 : 0 | |
> 256 -> 511 : 0 | |
> 512 -> 1023 : 0 | |
>
> The numbers look a bit too nice and low. I suspect something is going
> wrong.
>
> All this is based on the tracex3 examples written by
> Alexei Starovoitov <ast@...mgrid.com>.
>
> Not for inclusion!
>
> Not-signed-off-by: Daniel Wagner <daniel.wagner@...-carit.de>
> ---
> kernel/trace/bpf_trace.c | 2 +
> samples/bpf/Makefile | 6 ++-
> samples/bpf/bpf_helpers.h | 2 +
> samples/bpf/hwlathist_kern.c | 92 ++++++++++++++++++++++++++++++++++++
> samples/bpf/hwlathist_user.c | 108 +++++++++++++++++++++++++++++++++++++++++++
> 5 files changed, 209 insertions(+), 1 deletion(-)
> create mode 100644 samples/bpf/hwlathist_kern.c
> create mode 100644 samples/bpf/hwlathist_user.c
>
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index 2d56ce5..e4d3e76 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -172,6 +172,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
> return &bpf_probe_read_proto;
> case BPF_FUNC_ktime_get_ns:
> return &bpf_ktime_get_ns_proto;
> + case BPF_FUNC_get_smp_processor_id:
> + return &bpf_get_smp_processor_id_proto;
>
> case BPF_FUNC_trace_printk:
> /*
> diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
> index 8fdbd73..8420648 100644
> --- a/samples/bpf/Makefile
> +++ b/samples/bpf/Makefile
> @@ -10,6 +10,7 @@ hostprogs-y += tracex1
> hostprogs-y += tracex2
> hostprogs-y += tracex3
> hostprogs-y += tracex4
> +hostprogs-y += hwlathist
>
> test_verifier-objs := test_verifier.o libbpf.o
> test_maps-objs := test_maps.o libbpf.o
> @@ -20,6 +21,7 @@ tracex1-objs := bpf_load.o libbpf.o tracex1_user.o
> tracex2-objs := bpf_load.o libbpf.o tracex2_user.o
> tracex3-objs := bpf_load.o libbpf.o tracex3_user.o
> tracex4-objs := bpf_load.o libbpf.o tracex4_user.o
> +hwlathist-objs := bpf_load.o libbpf.o hwlathist_user.o
>
> # Tell kbuild to always build the programs
> always := $(hostprogs-y)
> @@ -30,6 +32,7 @@ always += tracex2_kern.o
> always += tracex3_kern.o
> always += tracex4_kern.o
> always += tcbpf1_kern.o
> +always += hwlathist_kern.o
>
> HOSTCFLAGS += -I$(objtree)/usr/include
>
> @@ -40,9 +43,10 @@ HOSTLOADLIBES_tracex1 += -lelf
> HOSTLOADLIBES_tracex2 += -lelf
> HOSTLOADLIBES_tracex3 += -lelf
> HOSTLOADLIBES_tracex4 += -lelf -lrt
> +HOSTLOADLIBES_hwlathist += -lelf
>
> # point this to your LLVM backend with bpf support
> -LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
> +LLC = /opt/llvm/bin/llc
>
> $(obj)/%.o: $(src)/%.c
> clang $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(EXTRA_CFLAGS) \
> diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h
> index f960b5f..c99bdb6 100644
> --- a/samples/bpf/bpf_helpers.h
> +++ b/samples/bpf/bpf_helpers.h
> @@ -21,6 +21,8 @@ static unsigned long long (*bpf_ktime_get_ns)(void) =
> (void *) BPF_FUNC_ktime_get_ns;
> static int (*bpf_trace_printk)(const char *fmt, int fmt_size, ...) =
> (void *) BPF_FUNC_trace_printk;
> +static unsigned int (*bpf_get_smp_processor_id)(void) =
> + (void *) BPF_FUNC_get_smp_processor_id;
>
> /* llvm builtin functions that eBPF C program may use to
> * emit BPF_LD_ABS and BPF_LD_IND instructions
> diff --git a/samples/bpf/hwlathist_kern.c b/samples/bpf/hwlathist_kern.c
> new file mode 100644
> index 0000000..7a97e7e
> --- /dev/null
> +++ b/samples/bpf/hwlathist_kern.c
> @@ -0,0 +1,92 @@
> +/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
> + * Copyright (c) 2015 BMW Car IT GmbH
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of version 2 of the GNU General Public
> + * License as published by the Free Software Foundation.
> + */
> +#include <linux/version.h>
> +#include <linux/ptrace.h>
> +#include <uapi/linux/bpf.h>
> +#include "bpf_helpers.h"
> +
> +struct bpf_map_def SEC("maps") my_map = {
> + .type = BPF_MAP_TYPE_HASH,
> + .key_size = sizeof(unsigned int),
> + .value_size = sizeof(u64),
> + .max_entries = 1024,
> +};
> +
> +static unsigned int log2(unsigned int v)
> +{
> + unsigned int r;
> + unsigned int shift;
> +
> + r = (v > 0xFFFF) << 4; v >>= r;
> + shift = (v > 0xFF) << 3; v >>= shift; r |= shift;
> + shift = (v > 0xF) << 2; v >>= shift; r |= shift;
> + shift = (v > 0x3) << 1; v >>= shift; r |= shift;
> + r |= (v >> 1);
> +
> + return r;
> +}
> +
> +static unsigned int log2l(unsigned long v)
> +{
> + unsigned int hi = v >> 32;
> +
> + if (hi)
> + return log2(hi) + 32;
> + else
> + return log2(v);
> +}
> +
Is it possible to make 'uapi/linux/bpf.h' hold something useful for
eBPF programming, so we can get rid from bpf_helpers?
Like this:
#ifndef _UAPI__LINUX_BPF_H__
#define _UAPI__LINUX_BPF_H__
#include <linux/types.h>
#include <linux/bpf_common.h>
...
#ifdef __BPF__
#define SEC(NAME) __attribute__((section(NAME), used))
...
static void *(*bpf_map_lookup_elem)(void *map, void *key) =
(void *) BPF_FUNC_map_lookup_elem;
...
static unsigned int log2l(unsigned long v)
{
unsigned int hi = v >> 32;
if (hi)
return log2(hi) + 32;
else
return log2(v);
}
...
#endif
#endif
Then by using the shell script Alexei suggested in
https://lkml.kernel.org/r/55777A17.7030000@plumgrid.com
and automatically kernel header detector I posted today, users will easily
find helpers, even if the source file is not in 'samples/bpf'.
Thank you.
> +SEC("kprobe/trace_preempt_off")
> +int bpf_prog1(struct pt_regs *ctx)
> +{
> + int cpu = bpf_get_smp_processor_id();
> + u64 ts = bpf_ktime_get_ns();
> +
> + bpf_map_update_elem(&my_map, &cpu, &ts, BPF_ANY);
> +
> + return 0;
> +}
> +
> +struct bpf_map_def SEC("maps") my_lat = {
> + .type = BPF_MAP_TYPE_HASH,
> + .key_size = sizeof(unsigned long long),
> + .value_size = sizeof(long),
> + .max_entries = 1024,
> +};
> +
> +SEC("kprobe/trace_preempt_on")
> +int bpf_prog2(struct pt_regs *ctx)
> +{
> + long init_val = 1;
> + long *value;
> + u64 *ts;
> + int cpu;
> +
> + cpu = bpf_get_smp_processor_id();
> + ts = bpf_map_lookup_elem(&my_map, &cpu);
> + if (!ts)
> + return 0;
> +
> + u64 cur_time = bpf_ktime_get_ns();
> + u64 delta = log2l(cur_time - *ts);
> +
> + bpf_map_delete_elem(&my_map, &cpu);
> +
> + u64 key = (((u64)cpu) << 32) | (log2l(delta) & 0xffffffff);
> + value = bpf_map_lookup_elem(&my_lat, &key);
> + if (value)
> + __sync_fetch_and_add((long *)value, 1);
> + else
> + bpf_map_update_elem(&my_lat, &key, &init_val, BPF_ANY);
> +
> + return 0;
> +
> +}
> +
> +char _license[] SEC("license") = "GPL";
> +u32 _version SEC("version") = LINUX_VERSION_CODE;
> diff --git a/samples/bpf/hwlathist_user.c b/samples/bpf/hwlathist_user.c
> new file mode 100644
> index 0000000..7e19f2c
> --- /dev/null
> +++ b/samples/bpf/hwlathist_user.c
> @@ -0,0 +1,108 @@
> +/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
> + * Copyright (c) 2015 BMW Car IT GmbH
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of version 2 of the GNU General Public
> + * License as published by the Free Software Foundation.
> + */
> +#include <stdio.h>
> +#include <unistd.h>
> +#include <stdlib.h>
> +#include <signal.h>
> +#include <linux/bpf.h>
> +#include "libbpf.h"
> +#include "bpf_load.h"
> +
> +#define MAX_ENTRIES 10
> +#define MAX_CPU 8
> +#define MAX_STARS 50
> +
> +static void stars(char *str, long val, long max, int width)
> +{
> + int i;
> +
> + for (i = 0; i < (width * val / max) - 1 && i < width - 1; i++)
> + str[i] = '*';
> + if (val > max)
> + str[i - 1] = '+';
> + str[i] = '\0';
> +}
> +
> +struct cpu_hist {
> + long data[MAX_ENTRIES];
> + unsigned long max;
> +};
> +
> +static struct cpu_hist cpu_hist[MAX_CPU];
> +
> +static void get_data(int fd)
> +{
> + unsigned long long key, next_key, k;
> + long value;
> + int i, cpu;
> +
> + for (i = 0; i < MAX_CPU; i++)
> + cpu_hist[i].max = 0;
> +
> + key = 0;
> + while (bpf_get_next_key(fd, &key, &next_key) == 0) {
> + bpf_lookup_elem(fd, &next_key, &value);
> + cpu = next_key >> 32;
> + if (cpu >= MAX_CPU)
> + goto next;
> +
> + k = next_key & 0xffffffff;
> + if (k > MAX_ENTRIES)
> + k = MAX_ENTRIES - 1;
> +
> + cpu_hist[cpu].data[k] = value;
> + if (value > cpu_hist[cpu].max)
> + cpu_hist[cpu].max = value;
> +
> + next:
> + key = next_key;
> + }
> +}
> +
> +static void print_hist(void)
> +{
> + char starstr[MAX_STARS];
> + struct cpu_hist *hist;
> + int i, j;
> +
> + printf("\033[2J");
> +
> + for (j = 0; j < MAX_CPU; j++) {
> + hist = &cpu_hist[j];
> + if (hist->max == 0)
> + continue;
> + printf("CPU %d\n", j);
> + printf(" latency : count distribution\n");
> + for (i = 1; i <= MAX_ENTRIES; i++) {
> + stars(starstr, hist->data[i - 1], hist->max, MAX_STARS);
> + printf("%8ld -> %-8ld : %-8ld |%-*s|\n",
> + (1l << i) >> 1, (1l << i) - 1,
> + hist->data[i - 1], MAX_STARS, starstr);
> + }
> + }
> +}
> +
> +int main(int argc, char **argv)
> +{
> + char filename[256];
> +
> + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
> +
> + if (load_bpf_file(filename)) {
> + printf("%s", bpf_log_buf);
> + return 1;
> + }
> +
> + while (1) {
> + get_data(map_fd[1]);
> + print_hist();
> + sleep(5);
> + }
> +
> + return 0;
> +}
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists