netdev - [RFC bpf] selftests/bpf: Curious case of a successful tailcall that returns to caller

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-Id: <20220616110252.418333-1-jakub@cloudflare.com>
Date:   Thu, 16 Jun 2022 13:02:52 +0200
From:   Jakub Sitnicki <jakub@...udflare.com>
To:     bpf@...r.kernel.org
Cc:     netdev@...r.kernel.org, Alexei Starovoitov <ast@...nel.org>,
        Daniel Borkmann <daniel@...earbox.net>,
        Andrii Nakryiko <andrii@...nel.org>,
        kernel-team@...udflare.com,
        Maciej Fijalkowski <maciej.fijalkowski@...el.com>
Subject: [RFC bpf] selftests/bpf: Curious case of a successful tailcall that returns to caller

While working aarch64 JIT to allow mixing bpf2bpf calls with tailcalls, I
noticed unexpected tailcall behavior in x86 JIT.

I don't know if it is by design or a bug. The bpf_tail_call helper
documentation says that the user should not expect the control flow to
return to the previous program, if the tail call was successful:

> If the call succeeds, the kernel immediately runs the first
> instruction of the new program. This is not a function call,
> and it never returns to the previous program.

However, when a tailcall happens from a subprogram, that is after a bpf2bpf
call, that is not the case. We return to the caller program because the
stack destruction is too shallow. BPF stack of just the top-most BPF
function gets destroyed.

This in turn allows the return value of the tailcall'ed program to get
overwritten, as the test below test demonstrates. It currently fails on
x86:

test_tailcall_bpf2bpf_7:PASS:open and load 0 nsec
test_tailcall_bpf2bpf_7:PASS:entry prog fd 0 nsec
test_tailcall_bpf2bpf_7:PASS:jmp_table map fd 0 nsec
test_tailcall_bpf2bpf_7:PASS:classifier_0 prog fd 0 nsec
test_tailcall_bpf2bpf_7:PASS:jmp_table map update 0 nsec
test_tailcall_bpf2bpf_7:PASS:entry prog test run 0 nsec
test_tailcall_bpf2bpf_7:FAIL:tailcall retval unexpected tailcall retval: actual 2 != expected 0
test_tailcall_bpf2bpf_7:PASS:bss map fd 0 nsec
test_tailcall_bpf2bpf_7:PASS:bss map lookup 0 nsec
test_tailcall_bpf2bpf_7:PASS:done flag is set 0 nsec

If we step through the program, we can observe the flow as so:

int entry(struct __sk_buff * skb):
bpf_prog_3bb007ac57240471_entry:
; subprog_tail(skb);
   0:   nopl   0x0(%rax,%rax,1)
   5:   xor    %eax,%eax
   7:   push   %rbp
   8:   mov    %rsp,%rbp
   b:   push   %rax
   c:   mov    -0x8(%rbp),%rax
  13:   call   0x0000000000000048 ---------.
; return 2;                                |
  18:   mov    $0x2,%eax <--------------------------------------.
  1d:   leave                              |                    |
  1e:   ret                                |                    |
                                           |                    |
int subprog_tail(struct __sk_buff * skb):  |                    |
bpf_prog_3a140cef239a4b4f_F:               |                    |
; int subprog_tail(struct __sk_buff *skb)  |                    |
   0:   nopl   0x0(%rax,%rax,1) <----------'                    |
   5:   xchg   %ax,%ax                                          |
   7:   push   %rbp                                             |
   8:   mov    %rsp,%rbp                                        |
   b:   push   %rax                                             |
   c:   push   %rbx                                             |
   d:   push   %r13                                             |
   f:   mov    %rdi,%rbx                                        |
; asm volatile("r1 = %[ctx]\n\t"                                |
  12:   movabs $0xffff888104119000,%r13                         |
  1c:   mov    %rbx,%rdi                                        |
  1f:   mov    %r13,%rsi                                        |
  22:   xor    %edx,%edx                                        |
  24:   mov    -0x4(%rbp),%eax                                  |
  2a:   cmp    $0x21,%eax                                       |
  2d:   jae    0x0000000000000046                               |
  2f:   add    $0x1,%eax                                        |
  32:   mov    %eax,-0x4(%rbp)                                  |
  38:   jmp    0x0000000000000046 ---------------------------.  |
  3d:   pop    %r13                                          |  |
  3f:   pop    %rbx                                          |  |
  40:   pop    %rax                                          |  |
  41:   nopl   0x0(%rax,%rax,1)                              |  |
; return 1;                                                  |  |
  46:   pop    %r13                                          |  |
  48:   pop    %rbx                                          |  |
  49:   leave                                                |  |
  4a:   ret                                                  |  |
                                                             |  |
int classifier_0(struct __sk_buff * skb):                    |  |
bpf_prog_6e664b22811ace0d_classifier_0:                      |  |
; done = 1;                                                  |  |
   0:   nopl   0x0(%rax,%rax,1)                              |  |
   5:   xchg   %ax,%ax                                       |  |
   7:   push   %rbp                                          |  |
   8:   mov    %rsp,%rbp                                     |  |
   b:   movabs $0xffffc900000b6000,%rdi <--------------------'  |
  15:   mov    $0x1,%esi                                        |
  1a:   mov    %esi,0x0(%rdi)                                   |
; return 0;                                                     |
  1d:   xor    %eax,%eax                                        |
  1f:   leave                                                   |
  20:   ret ----------------------------------------------------'

My question is - is it a bug or intended behavior that other JITs should
replicate?

Signed-off-by: Jakub Sitnicki <jakub@...udflare.com>
---
 .../selftests/bpf/prog_tests/tailcalls.c      | 55 +++++++++++++++++++
 .../selftests/bpf/progs/tailcall_bpf2bpf7.c   | 37 +++++++++++++
 2 files changed, 92 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/progs/tailcall_bpf2bpf7.c

diff --git a/tools/testing/selftests/bpf/prog_tests/tailcalls.c b/tools/testing/selftests/bpf/prog_tests/tailcalls.c
index c4da87ec3ba4..696c307a1bee 100644
--- a/tools/testing/selftests/bpf/prog_tests/tailcalls.c
+++ b/tools/testing/selftests/bpf/prog_tests/tailcalls.c
@@ -831,6 +831,59 @@ static void test_tailcall_bpf2bpf_4(bool noise)
 	bpf_object__close(obj);
 }
 
+#include "tailcall_bpf2bpf7.skel.h"
+
+/* The tail call should never return to the previous program, if the
+ * jump was successful.
+ */
+static void test_tailcall_bpf2bpf_7(void)
+{
+	struct tailcall_bpf2bpf7 *obj;
+	int err, map_fd, prog_fd, main_fd, data_fd, i, val;
+	LIBBPF_OPTS(bpf_test_run_opts, topts,
+		.data_in = &pkt_v4,
+		.data_size_in = sizeof(pkt_v4),
+		.repeat = 1,
+	);
+
+	obj = tailcall_bpf2bpf7__open_and_load();
+	if (!ASSERT_OK_PTR(obj, "open and load"))
+		return;
+
+	main_fd = bpf_program__fd(obj->progs.entry);
+	if (!ASSERT_GE(main_fd, 0, "entry prog fd"))
+		goto out;
+
+	map_fd = bpf_map__fd(obj->maps.jmp_table);
+	if (!ASSERT_GE(map_fd, 0, "jmp_table map fd"))
+		goto out;
+
+	prog_fd = bpf_program__fd(obj->progs.classifier_0);
+	if (!ASSERT_GE(prog_fd, 0, "classifier_0 prog fd"))
+		goto out;
+
+	i = 0;
+	err = bpf_map_update_elem(map_fd, &i, &prog_fd, BPF_ANY);
+	if (!ASSERT_OK(err, "jmp_table map update"))
+		goto out;
+
+	err = bpf_prog_test_run_opts(main_fd, &topts);
+	ASSERT_OK(err, "entry prog test run");
+	ASSERT_EQ(topts.retval, 0, "tailcall retval");
+
+	data_fd = bpf_map__fd(obj->maps.bss);
+	if (!ASSERT_GE(map_fd, 0, "bss map fd"))
+		goto out;
+
+	i = 0;
+	err = bpf_map_lookup_elem(data_fd, &i, &val);
+	ASSERT_OK(err, "bss map lookup");
+	ASSERT_EQ(val, 1, "done flag is set");
+
+out:
+	tailcall_bpf2bpf7__destroy(obj);
+}
+
 void test_tailcalls(void)
 {
 	if (test__start_subtest("tailcall_1"))
@@ -855,4 +908,6 @@ void test_tailcalls(void)
 		test_tailcall_bpf2bpf_4(false);
 	if (test__start_subtest("tailcall_bpf2bpf_5"))
 		test_tailcall_bpf2bpf_4(true);
+	if (test__start_subtest("tailcall_bpf2bpf_7"))
+		test_tailcall_bpf2bpf_7();
 }
diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf7.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf7.c
new file mode 100644
index 000000000000..1be27cfa1702
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf7.c
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+#define __unused __attribute__((always_unused))
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PROG_ARRAY);
+	__uint(max_entries, 1);
+	__uint(key_size, sizeof(__u32));
+	__uint(value_size, sizeof(__u32));
+} jmp_table SEC(".maps");
+
+int done = 0;
+
+SEC("tc")
+int classifier_0(struct __sk_buff *skb __unused)
+{
+	done = 1;
+	return 0;
+}
+
+static __noinline
+int subprog_tail(struct __sk_buff *skb)
+{
+	bpf_tail_call_static(skb, &jmp_table, 0);
+	return 1;
+}
+
+SEC("tc")
+int entry(struct __sk_buff *skb)
+{
+	subprog_tail(skb);
+	return 2;
+}
+
+char __license[] SEC("license") = "GPL";
-- 
2.35.3