[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <ace6ee2b-827a-b7ba-291d-0df3b4b62d17@gmail.com>
Date: Mon, 26 Nov 2018 07:43:16 -0800
From: John Fastabend <john.fastabend@...il.com>
To: Daniel Borkmann <daniel@...earbox.net>, ast@...nel.org
Cc: netdev@...r.kernel.org
Subject: Re: [PATCH bpf-next 1/3] bpf: helper to pop data from messages
On 11/25/18 5:05 PM, Daniel Borkmann wrote:
> On 11/23/2018 02:38 AM, John Fastabend wrote:
>> This adds a BPF SK_MSG program helper so that we can pop data from a
>> msg. We use this to pop metadata from a previous push data call.
>>
>> Signed-off-by: John Fastabend <john.fastabend@...il.com>
>> ---
>> include/uapi/linux/bpf.h | 13 +++-
>> net/core/filter.c | 169 +++++++++++++++++++++++++++++++++++++++++++++++
>> net/ipv4/tcp_bpf.c | 14 +++-
>> 3 files changed, 192 insertions(+), 4 deletions(-)
>>
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index c1554aa..64681f8 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -2268,6 +2268,16 @@ union bpf_attr {
>> *
>> * Return
>> * 0 on success, or a negative error in case of failure.
>> + *
>> + * int bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 pop, u64 flags)
>> + * Description
>> + * Will remove 'pop' bytes from a msg starting at byte 'start'.
>> + * This result in ENOMEM errors under certain situations where
>> + * a allocation and copy are required due to a full ring buffer.
>> + * However, the helper will try to avoid doing the allocation
>> + * if possible. Other errors can occur if input parameters are
>> + * invalid either do to start byte not being valid part of msg
>> + * payload and/or pop value being to large.
>> */
>> #define __BPF_FUNC_MAPPER(FN) \
>> FN(unspec), \
>> @@ -2360,7 +2370,8 @@ union bpf_attr {
>> FN(map_push_elem), \
>> FN(map_pop_elem), \
>> FN(map_peek_elem), \
>> - FN(msg_push_data),
>> + FN(msg_push_data), \
>> + FN(msg_pop_data),
>>
>> /* integer value in 'imm' field of BPF_CALL instruction selects which helper
>> * function eBPF program intends to call
>> diff --git a/net/core/filter.c b/net/core/filter.c
>> index f6ca38a..c6b35b5 100644
>> --- a/net/core/filter.c
>> +++ b/net/core/filter.c
>> @@ -2428,6 +2428,173 @@ static const struct bpf_func_proto bpf_msg_push_data_proto = {
>> .arg4_type = ARG_ANYTHING,
>> };
>>
>> +static void sk_msg_shift_left(struct sk_msg *msg, int i)
>> +{
>> + int prev;
>> +
>> + do {
>> + prev = i;
>> + sk_msg_iter_var_next(i);
>> + msg->sg.data[prev] = msg->sg.data[i];
>> + } while (i != msg->sg.end);
>> +
>> + sk_msg_iter_prev(msg, end);
>> +}
>> +
>> +static void sk_msg_shift_right(struct sk_msg *msg, int i)
>> +{
>> + struct scatterlist tmp, sge;
>> +
>> + sk_msg_iter_next(msg, end);
>> + sge = sk_msg_elem_cpy(msg, i);
>> + sk_msg_iter_var_next(i);
>> + tmp = sk_msg_elem_cpy(msg, i);
>> +
>> + while (i != msg->sg.end) {
>> + msg->sg.data[i] = sge;
>> + sk_msg_iter_var_next(i);
>> + sge = tmp;
>> + tmp = sk_msg_elem_cpy(msg, i);
>> + }
>> +}
>> +
>> +BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
>> + u32, len, u64, flags)
>> +{
>> + u32 i = 0, l, space, offset = 0;
>> + u64 last = start + len;
>> + int pop;
>> +
>> + if (unlikely(flags))
>> + return -EINVAL;
>> +
>> + /* First find the starting scatterlist element */
>> + i = msg->sg.start;
>> + do {
>> + l = sk_msg_elem(msg, i)->length;
>> +
>> + if (start < offset + l)
>> + break;
>> + offset += l;
>> + sk_msg_iter_var_next(i);
>> + } while (i != msg->sg.end);
>> +
>> + /* Bounds checks: start and pop must be inside message */
>> + if (start >= offset + l || last >= msg->sg.size)
>> + return -EINVAL;
>> +
>> + space = MAX_MSG_FRAGS - sk_msg_elem_used(msg);
>> +
>> + pop = len;
>> + /* --------------| offset
>> + * -| start |------- len ------|
>> + *
>> + * |----- a ----|-------- pop -------|----- b ----|
>> + * |______________________________________________| length
>> + *
>> + *
>> + * a: region at front of scatter element to save
>> + * b: region at back of scatter element to save when length > A + pop
>> + * pop: region to pop from element, same as input 'pop' here will be
>> + * decremented below per iteration.
>> + *
>> + * Two top-level cases to handle when start != offset, first B is non
>> + * zero and second B is zero corresponding to when a pop includes more
>> + * than one element.
>> + *
>> + * Then if B is non-zero AND there is no space allocate space and
>> + * compact A, B regions into page. If there is space shift ring to
>> + * the rigth free'ing the next element in ring to place B, leaving
>> + * A untouched except to reduce length.
>> + */
>> + if (start != offset) {
>> + struct scatterlist *nsge, *sge = sk_msg_elem(msg, i);
>> + int a = start;
>> + int b = sge->length - pop - a;
>> +
>> + sk_msg_iter_var_next(i);
>> +
>> + if (pop < sge->length - a) {
>> + if (space) {
>> + sge->length = a;
>> + sk_msg_shift_right(msg, i);
>> + nsge = sk_msg_elem(msg, i);
>> + get_page(sg_page(sge));
>> + sg_set_page(nsge,
>> + sg_page(sge), b, offset + pop);
>> + } else {
>> + struct page *page, *orig;
>> + u8 *to, *from;
>> +
>> + page = alloc_pages(__GFP_NOWARN |
>> + __GFP_COMP | GFP_ATOMIC,
>> + get_order(a + b));
>> + if (unlikely(!page))
>> + return -ENOMEM;
>> +
>> + sge->length = a;
>> + orig = sg_page(sge);
>> + from = sg_virt(sge);
>> + to = page_address(page);
>> + memcpy(to, from, a);
>> + memcpy(to + a, from + a + pop, b);
>> + sg_set_page(sge, page, a + b, 0);
>> + put_page(orig);
>> + }
>> + pop = 0;
>> + } else if (pop >= sge->length - a) {
>> + sge->length = a;
>> + pop -= (sge->length - a);
>> + }
>> + }
>> +
>> + /* From above the current layout _must_ be as follows,
>> + *
>> + * -| offset
>> + * -| start
>> + *
>> + * |---- pop ---|---------------- b ------------|
>> + * |____________________________________________| length
>> + *
>> + * Offset and start of the current msg elem are equal because in the
>> + * previous case we handled offset != start and either consumed the
>> + * entire element and advanced to the next element OR pop == 0.
>> + *
>> + * Two cases to handle here are first pop is less than the length
>> + * leaving some remainder b above. Simply adjust the element's layout
>> + * in this case. Or pop >= length of the element so that b = 0. In this
>> + * case advance to next element decrementing pop.
>> + */
>> + while (pop) {
>> + struct scatterlist *sge = sk_msg_elem(msg, i);
>> +
>> + if (pop < sge->length) {
>> + sge->length -= pop;
>> + sge->offset += pop;
>> + pop = 0;
>> + } else {
>> + pop -= sge->length;
>> + sk_msg_shift_left(msg, i);
>> + }
>> + sk_msg_iter_var_next(i);
>> + }
>> +
>> + sk_mem_uncharge(msg->sk, len - pop);
>> + msg->sg.size -= (len - pop);
>> + sk_msg_compute_data_pointers(msg);
>> + return 0;
>> +}
>
> Hmm, had to revert. I noticed this will have a use after free. While you do the
> sk_msg_compute_data_pointers() there is nothing from verifier that forces a
> reload of the payload pointers that were set up before the helper call, so they
> can still be used after the bpf_msg_pop_data() even though we dropped ref from
> page etc.
Thanks for catching this. I will send a v2 with bpf_msg_pop_data() added to
the list in bpf_helper_changes_pkt_data.
> Thanks,
> Daniel
>
Powered by blists - more mailing lists