linux-kernel - Re: [PATCH] KVM: selftests: Added eBPF program and selftest to collect vmx exit stat

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <Y+Fv6idxCMkuMf1R@google.com>
Date:   Mon, 6 Feb 2023 13:23:54 -0800
From:   David Matlack <dmatlack@...gle.com>
To:     Kevin Cheng <chengkev@...gle.com>
Cc:     Paolo Bonzini <pbonzini@...hat.com>, linux-kernel@...r.kernel.org,
        kvm@...r.kernel.org, linux-kselftest@...r.kernel.org,
        Shuah Khan <shuah@...nel.org>
Subject: Re: [PATCH] KVM: selftests: Added eBPF program and selftest to
 collect vmx exit stat

On Thu, Jan 26, 2023 at 12:43:46AM +0000, Kevin Cheng wrote:
> Introduce a new selftest that loads an eBPF program that stores the
> number of vmx exit counts per vcpu per vm. A process is created per
> vm_create to load a separate eBPF program to collect its own stats
> unique to the pid.
> 
> This test aims to serve as a proof-of-concept and example for using eBPF
> to collect stats that are not provided by the other stats interfaces
> such as kvm_binary_stats. Since there will be no further stats being
> added to kvm_binary_stats, developers can use this selftest as a
> reference for writing their own eBPF program + selftest to collect
> whatever stat they may need for debugging/monitoring.
> 
> Signed-off-by: Kevin Cheng <chengkev@...gle.com>
> ---
>  tools/testing/selftests/kvm/Makefile          |   4 +-
>  tools/testing/selftests/kvm/build_ebpf.sh     |   5 +
>  .../testing/selftests/kvm/kvm_vmx_exit_ebpf.c | 128 ++++++++++++++++++
>  .../selftests/kvm/kvm_vmx_exit_ebpf_kern.c    |  74 ++++++++++

x86-specific tests should go in tools/testing/selftests/kvm/x86_64.

>  4 files changed, 210 insertions(+), 1 deletion(-)
>  create mode 100644 tools/testing/selftests/kvm/build_ebpf.sh
>  create mode 100644 tools/testing/selftests/kvm/kvm_vmx_exit_ebpf.c
>  create mode 100644 tools/testing/selftests/kvm/kvm_vmx_exit_ebpf_kern.c
> 
> diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
> index 1750f91dd936..d9f56ccbc7bb 100644
> --- a/tools/testing/selftests/kvm/Makefile
> +++ b/tools/testing/selftests/kvm/Makefile
> @@ -129,6 +129,7 @@ TEST_GEN_PROGS_x86_64 += set_memory_region_test
>  TEST_GEN_PROGS_x86_64 += steal_time
>  TEST_GEN_PROGS_x86_64 += kvm_binary_stats_test
>  TEST_GEN_PROGS_x86_64 += system_counter_offset_test
> +TEST_GEN_PROGS_x86_64 += kvm_vmx_exit_ebpf
>  
>  # Compiled outputs used by test targets
>  TEST_GEN_PROGS_EXTENDED_x86_64 += x86_64/nx_huge_pages_test
> @@ -176,6 +177,7 @@ TEST_GEN_PROGS_riscv += set_memory_region_test
>  TEST_GEN_PROGS_riscv += kvm_binary_stats_test
>  
>  TEST_PROGS += $(TEST_PROGS_$(ARCH_DIR))
> +TEST_PROGS := build_ebpf.sh

build_ebpf.sh is not be a test program. It should be part of this
Makefile. i.e. running

  make -C tools/testing/selftests/kvm

should build tools/lib/bpf and kvm_vmx_exit_ebpf_kern.o. Developers
can't be expected to run
tools/testing/testing/selftests/kvm/build_ebpf.sh every time they want
to build the KVM selftests.

>  TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(ARCH_DIR))
>  TEST_GEN_PROGS_EXTENDED += $(TEST_GEN_PROGS_EXTENDED_$(ARCH_DIR))
>  LIBKVM += $(LIBKVM_$(ARCH_DIR))
> @@ -208,7 +210,7 @@ no-pie-option := $(call try-run, echo 'int main(void) { return 0; }' | \
>  pgste-option = $(call try-run, echo 'int main(void) { return 0; }' | \
>  	$(CC) -Werror -Wl$(comma)--s390-pgste -x c - -o "$$TMP",-Wl$(comma)--s390-pgste)
>  
> -LDLIBS += -ldl
> +LDLIBS += -ldl -L$(top_srcdir)/tools/lib/bpf -lbpf -lelf -lz

Please add a comment document why the different libraries are needed for
future readers.

>  LDFLAGS += -pthread $(no-pie-option) $(pgste-option)
>  
>  LIBKVM_C := $(filter %.c,$(LIBKVM))
> diff --git a/tools/testing/selftests/kvm/build_ebpf.sh b/tools/testing/selftests/kvm/build_ebpf.sh
> new file mode 100644
> index 000000000000..b8038b0a0da5
> --- /dev/null
> +++ b/tools/testing/selftests/kvm/build_ebpf.sh
> @@ -0,0 +1,5 @@
> +#!/bin/sh
> +# SPDX-License-Identifier: GPL-2.0
> +clang -g -O2 -target bpf -D__TARGET_ARCH_x86_64 -I . -c kvm_vmx_exit_ebpf_kern.c
> +        -o kvm_vmx_exit_ebpf_kern.o
> +make -C ../../../lib/bpf || exit

As mentioned above, this should be part of the Makefile.

> diff --git a/tools/testing/selftests/kvm/kvm_vmx_exit_ebpf.c b/tools/testing/selftests/kvm/kvm_vmx_exit_ebpf.c
> new file mode 100644
> index 000000000000..a4bd2c549207
> --- /dev/null
> +++ b/tools/testing/selftests/kvm/kvm_vmx_exit_ebpf.c
> @@ -0,0 +1,128 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <signal.h>
> +#include <sys/types.h>
> +#include <sys/wait.h>
> +#include <unistd.h>
> +#include <bpf/bpf.h>
> +#include <../bpf/libbpf.h>
> +#include <linux/btf.h>
> +
> +#include "test_util.h"
> +
> +#include "kvm_util.h"
> +#include "linux/kvm.h"
> +
> +#define VCPU_ID         0
> +
> +struct stats_map_key {
> +	__u32 pid;
> +	__u32 vcpu_id;
> +	__u32 exit_reason;
> +};
> +
> +static void guest_code(void)
> +{
> +	__asm__ __volatile__("cpuid");
> +}
> +
> +int main(int argc, char **argv)
> +{
> +	if (argc < 2) {
> +		fprintf(stderr, "Expected arguments: <number_of_vms>\n");
> +		return EXIT_FAILURE;

Selftests run by default with no arguments. So please provide a default
number of VMs to run with the test. Otherwise this test will just fail
by default.

It's common (at least for me) to run all KVM selftests when submitting
patches. So having one test that always fails will be annoying to deal
with.

Also, can you provide some details (e.g. in a comment) about why a user
might want to pick a different number of VMs? What is the value of
running this test with 1 VM vs. 2 vs. 3 etc.?

> +	}
> +	int n = atoi(argv[1]);
> +
> +	for (int i = 0; i < n; i++) {
> +		if (fork() == 0) {

Put the implementation of the child process into a helper function to
reduce indentation.

> +			struct kvm_vm *vm;
> +			struct kvm_vcpu *vcpu;
> +
> +			vm = vm_create_with_one_vcpu(&vcpu, guest_code);
> +
> +			// BPF userspace code
> +			struct bpf_object *obj;
> +			struct bpf_program *prog;
> +			struct bpf_map *map_obj;
> +			struct bpf_link *link = NULL;
> +
> +			obj = bpf_object__open_file("kvm_vmx_exit_ebpf_kern.o", NULL);
> +			if (libbpf_get_error(obj)) {
> +				fprintf(stderr, "ERROR: opening BPF object file failed\n");
> +				return 0;

I notice the children and parent always return 0. The test should exit
with a non-0 return code if it fails.

> +			}
> +
> +			map_obj = bpf_object__find_map_by_name(obj, "vmx_exit_map");
> +			if (!map_obj) {
> +				fprintf(stderr, "ERROR: loading of vmx BPF map failed\n");
> +				goto cleanup;
> +			}
> +
> +			struct bpf_map *pid_map = bpf_object__find_map_by_name(obj, "pid_map");
> +
> +			if (!pid_map) {
> +				fprintf(stderr, "ERROR: loading of pid BPF map failed\n");
> +				goto cleanup;
> +			}
> +
> +			/* load BPF program */

No need for this comment. bpf_object__load() is quite obvious already :)

> +			if (bpf_object__load(obj)) {
> +				fprintf(stderr, "ERROR: loading BPF object file failed\n");
> +				goto cleanup;
> +			}
> +
> +			__u32 userspace_pid = (__u32)getpid();
> +			__u32 val = (__u32)getpid();
> +
> +			bpf_map_update_elem(bpf_map__fd(pid_map), &userspace_pid, &val, 0);
> +
> +			prog = bpf_object__find_program_by_name(obj, "bpf_exit_prog");
> +			if (libbpf_get_error(prog)) {
> +				fprintf(stderr, "ERROR: finding a prog in obj file failed\n");
> +				goto cleanup;
> +			}
> +
> +			link = bpf_program__attach(prog);
> +			if (libbpf_get_error(link)) {
> +				fprintf(stderr, "ERROR: bpf_program__attach failed\n");
> +				link = NULL;
> +				goto cleanup;
> +			}
> +
> +			for (int j = 0; j < 10000; j++)
> +				vcpu_run(vcpu);

It might be interesting to (1) add some timing around this loop and (2)
run this loop without any bpf programs attached. i.e. Automatically do
an A/B performance comparison with and without bpf programs.

> +
> +			struct stats_map_key key = {
> +				.pid = 0,
> +				.vcpu_id = 0,
> +				.exit_reason = 18,
> +			};
> +
> +
> +			struct stats_map_key next_key, lookup_key;
> +
> +			lookup_key = key;
> +			while (bpf_map_get_next_key(bpf_map__fd(map_obj), &lookup_key, &next_key)
> +				 == 0) {
> +				int count;
> +
> +				bpf_map_lookup_elem(bpf_map__fd(map_obj), &next_key, &count);
> +				fprintf(stdout, "exit reason: '%d'\ncount: %d\npid: %d\n",
> +						next_key.exit_reason, count, next_key.pid);

Instead of printing ot the count, assert that the count has the right
value.

> +				lookup_key = next_key;
> +			}
> +
> +cleanup:
> +			bpf_link__destroy(link);
> +			bpf_object__close(obj);
> +			kvm_vm_free(vm);

Shouldn't the child process exit here? Otherwise it's going to keep
looping and creating *more* children?

> +		}
> +	}
> +
> +	for (int i = 0; i < n; i++)
> +		wait(NULL);
> +	return 0;
> +}
> diff --git a/tools/testing/selftests/kvm/kvm_vmx_exit_ebpf_kern.c b/tools/testing/selftests/kvm/kvm_vmx_exit_ebpf_kern.c
> new file mode 100644
> index 000000000000..b9c076f93171
> --- /dev/null
> +++ b/tools/testing/selftests/kvm/kvm_vmx_exit_ebpf_kern.c

I think we should carve out a new directory for bpf programs. If we mix
this in with the selftest .c files, it will start to get confusing.

e.g. tools/testing/selftests/kvm/bpf/vmx_exit_count.c

Note I dropped the "kvm_" prefix since it's obvious this is a
KVM-related program since it's under the KVM selftest directory. And I
also dropped "_ebpf_kern" since that's now obvious from the fact that
this is in the bpf/ subdirectory (which should only contain bpf
programs).

> @@ -0,0 +1,73 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +#include <linux/bpf.h>
> +#include <stdint.h>
> +#include <bpf/bpf_helpers.h>
> +#include <bpf/bpf_tracing.h>
> +#include <bpf/bpf_core_read.h>
> +
> +struct kvm_vcpu {
> +	int vcpu_id;
> +};
> +
> +struct vmx_args {
> +	__u64 pad;
> +	unsigned int exit_reason;
> +	__u32 isa;
> +	struct kvm_vcpu *vcpu;
> +};
> +
> +struct stats_map_key {
> +	__u32 pid;
> +	__u32 vcpu_id;
> +	__u32 exit_reason;
> +};
> +
> +struct {
> +	__uint(type, BPF_MAP_TYPE_HASH);
> +	__uint(max_entries, 1024);
> +	__type(key, struct stats_map_key);
> +	__type(value, int);
> +} vmx_exit_map SEC(".maps");
> +
> +struct {
> +	__uint(type, BPF_MAP_TYPE_HASH);
> +	__uint(max_entries, 1);
> +	__type(key, __u32);
> +	__type(value, __u32);
> +} pid_map SEC(".maps");
> +
> +
> +SEC("tracepoint/kvm/kvm_exit")
> +int bpf_exit_prog(struct vmx_args *ctx)
> +{
> +	__u32 curr_pid = (bpf_get_current_pid_tgid() >> 32);
> +
> +	__u32 *userspace_pid = bpf_map_lookup_elem(&pid_map, &curr_pid);
> +
> +	if (!userspace_pid || *userspace_pid != curr_pid)
> +		return 0;
> +
> +	struct kvm_vcpu *vcpu = ctx->vcpu;
> +	int _vcpu_id = BPF_CORE_READ(vcpu, vcpu_id);
> +
> +	struct stats_map_key key = {
> +		.pid = (bpf_get_current_pid_tgid() >> 32),
> +		.vcpu_id = _vcpu_id,
> +		.exit_reason = ctx->exit_reason,
> +	};
> +
> +	int *value = bpf_map_lookup_elem(&vmx_exit_map, &key);
> +
> +	if (value) {
> +		*value = *value + 1;
> +		bpf_map_update_elem(&vmx_exit_map, &key, value, BPF_ANY);
> +	} else {
> +		int temp = 1;
> +
> +		bpf_map_update_elem(&vmx_exit_map, &key, &temp, BPF_ANY);
> +	}
> +
> +	return 0;
> +}
> +
> +char _license[] SEC("license") = "GPL";
> -- 
> 2.39.1.456.gfc5497dd1b-goog
>