Message-Id: <152406545918.3465.14253635905960610284.stgit@localhost.localdomain>
Date: Wed, 18 Apr 2018 17:30:59 +0200
From: Sebastiano Miano <sebastiano.miano@...ito.it>
To: netdev@...r.kernel.org, ast@...nel.org, daniel@...earbox.net
Cc: mingo@...hat.com, rostedt@...dmis.org, brouer@...hat.com,
fulvio.risso@...ito.it
Subject: [bpf-next PATCH 3/3] bpf: add sample program to trace map events
This patch adds a sample program, called trace_map_events,
that shows how to capture map events and filter them based on
the map ID.
The program accepts a list of map IDs via the -i command line
option and reports only the map events (i.e.,
map_create/update/lookup/next_key) related to those IDs.
If no IDs are specified, all map events are listed and no filtering
is performed.
Sample usage:
# trace_map_events -i <map_id1> -i <map_id2> -i <map_id3> ...
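Map IDs can be retrieved, for instance, with "bpftool map list" on
systems where bpftool is available.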
Signed-off-by: Sebastiano Miano <sebastiano.miano@...ito.it>
---
samples/bpf/Makefile | 4
samples/bpf/trace_map_events_kern.c | 225 +++++++++++++++++++++++++
samples/bpf/trace_map_events_user.c | 314 +++++++++++++++++++++++++++++++++++
3 files changed, 543 insertions(+)
create mode 100644 samples/bpf/trace_map_events_kern.c
create mode 100644 samples/bpf/trace_map_events_user.c
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 4d6a6ed..a7d52b6 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -15,6 +15,7 @@ hostprogs-y += tracex6
hostprogs-y += tracex7
hostprogs-y += test_probe_write_user
hostprogs-y += trace_output
+hostprogs-y += trace_map_events
hostprogs-y += lathist
hostprogs-y += offwaketime
hostprogs-y += spintest
@@ -65,6 +66,7 @@ tracex7-objs := bpf_load.o $(LIBBPF) tracex7_user.o
load_sock_ops-objs := bpf_load.o $(LIBBPF) load_sock_ops.o
test_probe_write_user-objs := bpf_load.o $(LIBBPF) test_probe_write_user_user.o
trace_output-objs := bpf_load.o $(LIBBPF) trace_output_user.o
+trace_map_events-objs := bpf_load.o $(LIBBPF) trace_map_events_user.o
lathist-objs := bpf_load.o $(LIBBPF) lathist_user.o
offwaketime-objs := bpf_load.o $(LIBBPF) offwaketime_user.o
spintest-objs := bpf_load.o $(LIBBPF) spintest_user.o
@@ -111,6 +113,7 @@ always += tracex7_kern.o
always += sock_flags_kern.o
always += test_probe_write_user_kern.o
always += trace_output_kern.o
+always += trace_map_events_kern.o
always += tcbpf1_kern.o
always += tcbpf2_kern.o
always += tc_l2_redirect_kern.o
@@ -171,6 +174,7 @@ HOSTLOADLIBES_test_cgrp2_sock2 += -lelf
HOSTLOADLIBES_load_sock_ops += -lelf
HOSTLOADLIBES_test_probe_write_user += -lelf
HOSTLOADLIBES_trace_output += -lelf -lrt
+HOSTLOADLIBES_trace_map_events += -lelf -lrt
HOSTLOADLIBES_lathist += -lelf
HOSTLOADLIBES_offwaketime += -lelf
HOSTLOADLIBES_spintest += -lelf
diff --git a/samples/bpf/trace_map_events_kern.c b/samples/bpf/trace_map_events_kern.c
new file mode 100644
index 0000000..f887b5b
--- /dev/null
+++ b/samples/bpf/trace_map_events_kern.c
@@ -0,0 +1,225 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (C) 2018 Politecnico di Torino, Italy
+ *
+ * Author: Sebastiano Miano <sebastiano.miano@...ito.it>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ */
+
+#include <uapi/linux/bpf.h>
+#include <linux/version.h>
+#include "bpf_helpers.h"
+
+enum map_event_type {
+ MAP_CREATE = 0,
+ MAP_UPDATE = 1,
+ MAP_LOOKUP = 2,
+ MAP_NEXT_KEY = 3
+};
+
+struct map_event_data {
+ u32 map_id;
+ enum map_event_type evnt_type;
+ u32 map_type;
+};
+
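+/*
+ * Per-CPU perf event array used to push map events to user space
+ * via bpf_perf_event_output().
+ */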
+struct bpf_map_def SEC("maps") map_event_trace = {
+ .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+ .key_size = sizeof(int),
+ .value_size = sizeof(u32),
+ .max_entries = 64,
+};
+
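+/* Map IDs to report on; filled from user space via the -i option */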
+struct bpf_map_def SEC("maps") filtered_ids = {
+ .type = BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(u32),
+ .max_entries = 64,
+};
+
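+/* Single-entry flag: true when filtering by map ID is enabled */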
+struct bpf_map_def SEC("maps") filter_events = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(bool),
+ .max_entries = 1,
+};
+
+/*
+ * Tracepoint format: /sys/kernel/debug/tracing/events/bpf/bpf_map_create/format
+ * Code in: kernel/include/trace/events/bpf.h
+ */
+struct bpf_map_create_ctx {
+ u64 pad; // First 8 bytes are not accessible by bpf code
+ u32 type; // offset:8; size:4; signed:0;
+ u32 size_key; // offset:12; size:4; signed:0;
+ u32 size_value; // offset:16; size:4; signed:0;
+ u32 max_entries; // offset:20; size:4; signed:0;
+ u32 flags; // offset:24; size:4; signed:0;
+ int ufd; // offset:28; size:4; signed:1;
+ u32 id; // offset:32; size:4; signed:0;
+};
+
+SEC("tracepoint/bpf/bpf_map_create")
+int trace_bpf_map_create(struct bpf_map_create_ctx *ctx)
+{
+ struct map_event_data data;
+ int cpu = bpf_get_smp_processor_id();
+ bool *filter;
+ u32 key = 0, map_id = ctx->id;
+
+ filter = bpf_map_lookup_elem(&filter_events, &key);
+ if (!filter)
+ return 1;
+
+ if (!*filter)
+ goto send_event;
+
+ /*
+ * If the map_id is not in the list of filtered
+ * ids we immediately return
+ */
+ if (!bpf_map_lookup_elem(&filtered_ids, &map_id))
+ return 0;
+
+send_event:
+ data.map_id = map_id;
+ data.evnt_type = MAP_CREATE;
+ data.map_type = ctx->type;
+
+ bpf_perf_event_output(ctx, &map_event_trace, cpu, &data, sizeof(data));
+ return 0;
+}
+
+/*
+ * Tracepoint: /sys/kernel/debug/tracing/events/bpf/bpf_map_lookup_elem/format
+ * Tracepoint: /sys/kernel/debug/tracing/events/bpf/bpf_map_update_elem/format
+ * Code in: kernel/include/trace/events/bpf.h
+ */
+struct bpf_map_keyval_ctx {
+ u64 pad; // First 8 bytes are not accessible by bpf code
+ u32 type; // offset:8; size:4; signed:0;
+ u32 key_len; // offset:12; size:4; signed:0;
+ u32 key; // offset:16; size:4; signed:0;
+ bool key_trunc; // offset:20; size:1; signed:0;
+ u32 val_len; // offset:24; size:4; signed:0;
+ u32 val; // offset:28; size:4; signed:0;
+ bool val_trunc; // offset:32; size:1; signed:0;
+ int ufd; // offset:36; size:4; signed:1;
+ u32 id; // offset:40; size:4; signed:0;
+};
+
+SEC("tracepoint/bpf/bpf_map_lookup_elem")
+int trace_bpf_map_lookup_elem(struct bpf_map_keyval_ctx *ctx)
+{
+ struct map_event_data data;
+ int cpu = bpf_get_smp_processor_id();
+ bool *filter;
+ u32 key = 0, map_id = ctx->id;
+
+ filter = bpf_map_lookup_elem(&filter_events, &key);
+ if (!filter)
+ return 1;
+
+ if (!*filter)
+ goto send_event;
+
+ /*
+ * If the map_id is not in the list of filtered
+ * ids we immediately return
+ */
+ if (!bpf_map_lookup_elem(&filtered_ids, &map_id))
+ return 0;
+
+send_event:
+ data.map_id = map_id;
+ data.evnt_type = MAP_LOOKUP;
+ data.map_type = ctx->type;
+
+ bpf_perf_event_output(ctx, &map_event_trace, cpu, &data, sizeof(data));
+ return 0;
+}
+
+SEC("tracepoint/bpf/bpf_map_update_elem")
+int trace_bpf_map_update_elem(struct bpf_map_keyval_ctx *ctx)
+{
+ struct map_event_data data;
+ int cpu = bpf_get_smp_processor_id();
+ bool *filter;
+ u32 key = 0, map_id = ctx->id;
+
+ filter = bpf_map_lookup_elem(&filter_events, &key);
+ if (!filter)
+ return 1;
+
+ if (!*filter)
+ goto send_event;
+
+ /*
+ * If the map_id is not in the list of filtered
+ * ids we immediately return
+ */
+ if (!bpf_map_lookup_elem(&filtered_ids, &map_id))
+ return 0;
+
+send_event:
+ data.map_id = map_id;
+ data.evnt_type = MAP_UPDATE;
+ data.map_type = ctx->type;
+
+ bpf_perf_event_output(ctx, &map_event_trace, cpu, &data, sizeof(data));
+ return 0;
+}
+
+/*
+ * Tracepoint: /sys/kernel/debug/tracing/events/bpf/bpf_map_next_key/format
+ * Code in: kernel/include/trace/events/bpf.h
+ */
+struct bpf_map_next_key_ctx {
+ u64 pad; // First 8 bytes are not accessible by bpf code
+ u32 type; // offset:8; size:4; signed:0;
+ u32 key_len; // offset:12; size:4; signed:0;
+ u32 key; // offset:16; size:4; signed:0;
+ u32 nxt; // offset:20; size:4; signed:0;
+ bool key_trunc; // offset:24; size:1; signed:0;
+ bool key_null; // offset:25; size:1; signed:0;
+ int ufd; // offset:28; size:4; signed:1;
+ u32 id; // offset:32; size:4; signed:0;
+};
+
+SEC("tracepoint/bpf/bpf_map_next_key")
+int trace_bpf_map_next_key(struct bpf_map_next_key_ctx *ctx)
+{
+ struct map_event_data data;
+ int cpu = bpf_get_smp_processor_id();
+ bool *filter;
+ u32 key = 0, map_id = ctx->id;
+
+ filter = bpf_map_lookup_elem(&filter_events, &key);
+ if (!filter)
+ return 1;
+
+ if (!*filter)
+ goto send_event;
+
+ /*
+ * If the map_id is not in the list of filtered
+ * ids we immediately return
+ */
+ if (!bpf_map_lookup_elem(&filtered_ids, &map_id))
+ return 0;
+
+send_event:
+ data.map_id = map_id;
+ data.evnt_type = MAP_NEXT_KEY;
+ data.map_type = ctx->type;
+
+ bpf_perf_event_output(ctx, &map_event_trace, cpu, &data, sizeof(data));
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/trace_map_events_user.c b/samples/bpf/trace_map_events_user.c
new file mode 100644
index 0000000..bc7447e
--- /dev/null
+++ b/samples/bpf/trace_map_events_user.c
@@ -0,0 +1,314 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (C) 2018 Politecnico di Torino, Italy
+ *
+ * Author: Sebastiano Miano <sebastiano.miano@...ito.it>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ */
+
+static const char *__desc__ =
+"Sample program to trace map related events\n"
+"The -i option allows to set the id(s) of the map you are interested in.\n"
+"If no ID is specified, all map events are listed.\n";
+
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <sys/resource.h>
+#include <linux/perf_event.h>
+#include <linux/bpf.h>
+#include <errno.h>
+#include <assert.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <sys/epoll.h>
+#include <sys/mman.h>
+#include <time.h>
+#include <signal.h>
+#include <getopt.h>
+
+#include "libbpf.h"
+#include "bpf_load.h"
+#include "bpf_util.h"
+#include "perf-sys.h"
+
+#define MAX_FILTERED_IDS 64
+
+static int *perf_fd;
+
+int epoll_fd;
+int page_size;
+int page_cnt = 8;
+volatile struct perf_event_mmap_page **readers;
+
+typedef void (*event_cb)(void *data, int size);
+
+enum map_event_type {
+ MAP_CREATE = 0,
+ MAP_UPDATE = 1,
+ MAP_LOOKUP = 2,
+ MAP_NEXT_KEY = 3
+};
+
+static void usage(char *argv[])
+{
+ printf("\nDESCRIPTION:\n%s", __desc__);
+ printf("\n");
+ printf(" Usage: %s [-i map_id1] [-i map_id2] ...\n", argv[0]);
+ printf("\n");
+}
+
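+/*
+ * Map one metadata page plus page_cnt data pages of the perf ring
+ * buffer for the given event fd.
+ */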
+static int perf_event_mmap(int fd, int cpu)
+{
+ void *base;
+ int mmap_size;
+
+ page_size = getpagesize();
+ mmap_size = page_size * (page_cnt + 1);
+
+ base = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+ if (base == MAP_FAILED) {
+ printf("mmap err\n");
+ return -1;
+ }
+
+ readers[cpu] = base;
+ return 0;
+}
+
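+/*
+ * Open a PERF_COUNT_SW_BPF_OUTPUT event on the given CPU, mmap its ring
+ * buffer, enable it and store its fd in the perf event array map, so
+ * that bpf_perf_event_output() invocations on that CPU are delivered
+ * to this buffer.
+ */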
+static void init_bpf_perf_event_on_cpu(int cpu)
+{
+ struct perf_event_attr attr = {
+ .sample_type = PERF_SAMPLE_RAW,
+ .type = PERF_TYPE_SOFTWARE,
+ .config = PERF_COUNT_SW_BPF_OUTPUT,
+ .sample_period = 1,
+ .wakeup_events = 1,
+ };
+ int key = cpu;
+
+ perf_fd[cpu] = sys_perf_event_open(&attr, -1, cpu, -1, 0);
+
+ assert(perf_fd[cpu] >= 0);
+ assert(perf_event_mmap(perf_fd[cpu], cpu) >= 0);
+ assert(ioctl(perf_fd[cpu], PERF_EVENT_IOC_ENABLE, 0) >= 0);
+ assert(bpf_map_update_elem(map_fd[0], &key, &perf_fd[cpu], 0) == 0);
+
+ struct epoll_event e = { .events = EPOLLIN, .data.u32 = cpu };
+
+ assert(epoll_ctl(epoll_fd, EPOLL_CTL_ADD, perf_fd[cpu], &e) == 0);
+}
+
+static int perf_event_poll(int fd, int num_cpus, struct epoll_event *events)
+{
+ return epoll_wait(fd, events, num_cpus, -1);
+}
+
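+/* Layout of a PERF_RECORD_SAMPLE record with PERF_SAMPLE_RAW */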
+struct perf_event_sample {
+ struct perf_event_header header;
+ __u32 size;
+ char data[];
+};
+
+static void perf_event_read(event_cb fn, __u32 index)
+{
+ __u64 data_tail = readers[index]->data_tail;
+ __u64 data_head = readers[index]->data_head;
+ __u64 buffer_size = page_cnt * page_size;
+ void *base, *begin, *end;
+ char buf[256];
+
+ asm volatile("" ::: "memory"); /* in real code it should be smp_rmb() */
+ if (data_head == data_tail)
+ return;
+
+ base = ((char *)readers[index]) + page_size;
+
+ begin = base + data_tail % buffer_size;
+ end = base + data_head % buffer_size;
+
+ while (begin != end) {
+ struct perf_event_sample *e;
+
+ e = begin;
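+		/*
+		 * The record may wrap around the end of the ring buffer;
+		 * if so, reassemble it into buf from the two halves.
+		 */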
+ if (begin + e->header.size > base + buffer_size) {
+ long len = base + buffer_size - begin;
+
+ assert(len < e->header.size);
+ memcpy(buf, begin, len);
+ memcpy(buf + len, base, e->header.size - len);
+ e = (void *) buf;
+ begin = base + e->header.size - len;
+ } else if (begin + e->header.size == base + buffer_size) {
+ begin = base;
+ } else {
+ begin += e->header.size;
+ }
+
+ if (e->header.type == PERF_RECORD_SAMPLE) {
+ fn(e->data, e->size);
+ } else if (e->header.type == PERF_RECORD_LOST) {
+ struct {
+ struct perf_event_header header;
+ __u64 id;
+ __u64 lost;
+ } *lost = (void *) e;
+ printf("lost %lld events\n", lost->lost);
+ } else {
+ printf("unknown event type=%d size=%d\n",
+ e->header.type, e->header.size);
+ }
+ }
+
+ __sync_synchronize(); /* smp_mb() */
+ readers[index]->data_tail = data_head;
+}
+
+static const char *get_event_type(enum map_event_type event)
+{
+ switch (event) {
+ case MAP_CREATE:
+ return "CREATE";
+ case MAP_LOOKUP:
+ return "LOOKUP";
+ case MAP_UPDATE:
+ return "UPDATE";
+ case MAP_NEXT_KEY:
+ return "NEXT_KEY";
+ }
+
+ return "UNKNOWN";
+}
+
+
+static void map_event_callback(void *data, int size)
+{
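+	/* Must match struct map_event_data in trace_map_events_kern.c */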
+ struct {
+ __u32 map_id;
+ enum map_event_type event_type;
+ __u32 map_type;
+ } *e = data;
+
+ printf("%s event for map id: %d and type: %d\n",
+ get_event_type(e->event_type), e->map_id, e->map_type);
+}
+
+static bool init_filtered_ids_map(int num_ids, int *filtered_ids)
+{
+ int i, key, value;
+ bool filtering = false;
+	/*
+	 * Put the IDs in the map; only events related to those IDs
+	 * will be shown. The key is the map ID, while the value is
+	 * unused and therefore set to 0.
+	 */
+ for (i = 0; i < num_ids; i++) {
+ key = filtered_ids[i];
+ value = 0;
+ if (bpf_map_update_elem(map_fd[1], &key, &value, 0) != 0) {
+ fprintf(stderr,
+ "ERR: bpf_map_update_elem failed key:0x%X\n", key);
+ return false;
+ }
+ }
+
+ if (num_ids > 0)
+ filtering = true;
+
+ key = 0;
+ assert(bpf_map_update_elem(map_fd[2], &key, &filtering, BPF_ANY) == 0);
+ return true;
+}
+
+static bool init_perf_buffer_data_structures(int nr_cpus)
+{
+ int i;
+
+ perf_fd = malloc(sizeof(int) * nr_cpus);
+ assert(perf_fd);
+ readers = malloc(sizeof(*readers) * nr_cpus);
+ assert(readers);
+
+	epoll_fd = epoll_create1(EPOLL_CLOEXEC);
+	assert(epoll_fd >= 0);
+
+ for (i = 0; i < nr_cpus; i++) {
+ printf("Init bpf_perf_event for cpu:%d\n", i);
+ init_bpf_perf_event_on_cpu(i);
+ }
+
+ return true;
+}
+
+int main(int argc, char **argv)
+{
+ struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+ int i, cnt, opt, ret = EXIT_SUCCESS;
+ char bpf_obj_file[256];
+ int num_ids = 0, nr_cpus = bpf_num_possible_cpus();
+ int filtered_ids[MAX_FILTERED_IDS];
+
+ snprintf(bpf_obj_file, sizeof(bpf_obj_file), "%s_kern.o", argv[0]);
+
+ /* Parse commands line args */
+ while ((opt = getopt(argc, argv, "hi:")) != -1) {
+ switch (opt) {
+ case 'i':
+ if (num_ids == MAX_FILTERED_IDS) {
+				printf("Reached maximum number of IDs\n");
+ return EXIT_FAILURE;
+ }
+ i = atoi(optarg);
+ if (!i)
+				printf("ERROR - Invalid id %s\n", optarg);
+ else
+ filtered_ids[num_ids++] = i;
+ break;
+ case 'h':
+ default:
+ usage(argv);
+ return EXIT_FAILURE;
+ }
+ }
+
+ if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+ perror("setrlimit(RLIMIT_MEMLOCK)");
+ return EXIT_FAILURE;
+ }
+
+ if (load_bpf_file(bpf_obj_file)) {
+ printf("ERROR - bpf_log_buf: %s", bpf_log_buf);
+ return EXIT_FAILURE;
+ }
+
+ if (!prog_fd[0]) {
+ printf("ERROR - load_bpf_file: %s\n", strerror(errno));
+ return EXIT_FAILURE;
+ }
+
+	if (!init_filtered_ids_map(num_ids, filtered_ids))
+		return EXIT_FAILURE;
+ init_perf_buffer_data_structures(nr_cpus);
+
+	struct epoll_event *events = calloc(nr_cpus, sizeof(*events));
+
+	assert(events);
+
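+	/* Poll the per-CPU buffers and drain those that became readable */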
+ while (true) {
+ printf("Waiting for map events...\n");
+ cnt = perf_event_poll(epoll_fd, nr_cpus, events);
+ for (i = 0; i < cnt; i++)
+ perf_event_read(map_event_callback, events[i].data.u32);
+ }
+
+ free(perf_fd);
+ free(readers);
+ free(events);
+
+ return ret;
+}