Message-ID: <1fa58735-f87b-be0a-7119-2b45277ad064@iogearbox.net>
Date: Fri, 31 Aug 2018 14:15:19 +0200
From: Daniel Borkmann <daniel@...earbox.net>
To: Tushar Dave <tushar.n.dave@...cle.com>, john.fastabend@...il.com,
ast@...nel.org, davem@...emloft.net, netdev@...r.kernel.org
Cc: sowmini.varadhan@...cle.com
Subject: Re: [PATCH net] ebpf: fix bpf_msg_pull_data
On 08/31/2018 10:37 AM, Tushar Dave wrote:
> On 08/30/2018 12:20 AM, Daniel Borkmann wrote:
>> On 08/30/2018 02:21 AM, Tushar Dave wrote:
>>> On 08/29/2018 05:07 PM, Tushar Dave wrote:
>>>> While doing some preliminary testing it is found that bpf helper
>>>> bpf_msg_pull_data does not calculate the data and data_end offset
>>>> correctly. Fix it!
>>>>
>>>> Fixes: 015632bb30da ("bpf: sk_msg program helper bpf_sk_msg_pull_data")
>>>> Signed-off-by: Tushar Dave <tushar.n.dave@...cle.com>
>>>> Acked-by: Sowmini Varadhan <sowmini.varadhan@...cle.com>
>>>> ---
>>>> net/core/filter.c | 38 +++++++++++++++++++++++++-------------
>>>> 1 file changed, 25 insertions(+), 13 deletions(-)
>>>>
>>>> diff --git a/net/core/filter.c b/net/core/filter.c
>>>> index c25eb36..3eeb3d6 100644
>>>> --- a/net/core/filter.c
>>>> +++ b/net/core/filter.c
>>>> @@ -2285,7 +2285,7 @@ struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
>>>> BPF_CALL_4(bpf_msg_pull_data,
>>>> struct sk_msg_buff *, msg, u32, start, u32, end, u64, flags)
>>>> {
>>>> - unsigned int len = 0, offset = 0, copy = 0;
>>>> + unsigned int len = 0, offset = 0, copy = 0, off = 0;
>>>> struct scatterlist *sg = msg->sg_data;
>>>> int first_sg, last_sg, i, shift;
>>>> unsigned char *p, *to, *from;
>>>> @@ -2299,22 +2299,30 @@ struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
>>>> i = msg->sg_start;
>>>> do {
>>>> len = sg[i].length;
>>>> - offset += len;
>>>> if (start < offset + len)
>>>> break;
>>>> + offset += len;
>>>> i++;
>>>> if (i == MAX_SKB_FRAGS)
>>>> i = 0;
>>>> - } while (i != msg->sg_end);
>>>> + } while (i <= msg->sg_end);
>>
>> I don't think this condition is correct, msg operates as a scatterlist ring,
>> so sg_end may very well be < current i when there's a wrap-around in the
>> traversal ... more below.
>
> I'm wondering then how this is supposed to work in case the sg list is not
> a ring! For RDS, we have an sg list that is not a ring. More below.
>
>>
>>>> + /* return error if start is out of range */
>>>> if (unlikely(start >= offset + len))
>>>> return -EINVAL;
>>>> - if (!msg->sg_copy[i] && bytes <= len)
>>>> - goto out;
>>>> + /* return error if i is last entry in sglist and end is out of range */
>>>> + if (msg->sg_copy[i] && end > offset + len)
>>>> + return -EINVAL;
>>>> first_sg = i;
>>>> + /* if i is not last entry in sg list and end (i.e start + bytes) is
>>>> + * within this sg[i] then goto out and calculate data and data_end
>>>> + */
>>>> + if (!msg->sg_copy[i] && end <= offset + len)
>>>> + goto out;
>>>> +
>>>> /* At this point we need to linearize multiple scatterlist
>>>> * elements or a single shared page. Either way we need to
>>>> * copy into a linear buffer exclusively owned by BPF. Then
>>>> @@ -2330,9 +2338,14 @@ struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
>>>> i++;
>>>> if (i == MAX_SKB_FRAGS)
>>>> i = 0;
>>>> - if (bytes < copy)
>>>> + if (end < copy)
>>>> break;
>>>> - } while (i != msg->sg_end);
>>>> + } while (i <= msg->sg_end);
>>>> +
>>>> + /* return error if i is last entry in sglist and end is out of range */
>>>> + if (i > msg->sg_end && end > offset + copy)
>>>> + return -EINVAL;
>>>> +
>>>> last_sg = i;
>>>> if (unlikely(copy < end - start))
>>>> @@ -2342,23 +2355,22 @@ struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
>>>> if (unlikely(!page))
>>>> return -ENOMEM;
>>>> p = page_address(page);
>>>> - offset = 0;
>>>> i = first_sg;
>>>> do {
>>>> from = sg_virt(&sg[i]);
>>>> len = sg[i].length;
>>>> - to = p + offset;
>>>> + to = p + off;
>>>> memcpy(to, from, len);
>>>> - offset += len;
>>>> + off += len;
>>>> sg[i].length = 0;
>>>> put_page(sg_page(&sg[i]));
>>>> i++;
>>>> if (i == MAX_SKB_FRAGS)
>>>> i = 0;
>>>> - } while (i != last_sg);
>>>> + } while (i < last_sg);
>>>> sg[first_sg].length = copy;
>>>> sg_set_page(&sg[first_sg], page, copy, 0);
>>>> @@ -2380,7 +2392,7 @@ struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
>>>> else
>>>> move_from = i + shift;
>>>> - if (move_from == msg->sg_end)
>>>> + if (move_from > msg->sg_end)
>>>> break;
>>>> sg[i] = sg[move_from];
>>>> @@ -2396,7 +2408,7 @@ struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
>>>> if (msg->sg_end < 0)
>>>> msg->sg_end += MAX_SKB_FRAGS;
>>>> out:
>>>> - msg->data = sg_virt(&sg[i]) + start - offset;
>>>> + msg->data = sg_virt(&sg[first_sg]) + start - offset;
>>>> msg->data_end = msg->data + bytes;
>>>> return 0;
>>>>
>>>
>>> Please discard this patch. I just noticed that Daniel Borkmann sent some similar fixes for bpf_msg_pull_data.
>>
>> Yeah I've been looking at these recently as well, please make sure you test
>> with the below fixes included to see if there's anything left:
>
> I tested the latest net tree, which has all the fixes you mentioned, and I
> am still seeing issues.
>
> As I already mentioned on the RFC v3 thread, we need to be careful about
> reusing 'offset' while linearizing multiple scatterlist elements.
> Variable 'offset' is used to calculate 'msg->data', i.e.
> msg->data = sg_virt(&sg[first_sg]) + start - offset
>
> We should use a different variable name (e.g. 'off') while linearizing
> multiple scatterlist elements or a single shared page, so that we don't
> overwrite 'offset', which already holds the correctly calculated value.
Agree, sigh, that's also buggy, please submit the below as a proper patch,
thanks!
> A code change for this would look like:
>
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 60a29ca..076ca09 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -2326,7 +2326,7 @@ struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
> BPF_CALL_4(bpf_msg_pull_data,
> struct sk_msg_buff *, msg, u32, start, u32, end, u64, flags)
> {
> - unsigned int len = 0, offset = 0, copy = 0;
> + unsigned int len = 0, offset = 0, copy = 0, off;
Nit: probably better to name it 'poffset' since this is relative to p.
> int bytes = end - start, bytes_sg_total;
> struct scatterlist *sg = msg->sg_data;
> int first_sg, last_sg, i, shift;
> @@ -2354,6 +2354,8 @@ struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
> * account for the headroom.
> */
> bytes_sg_total = start - offset + bytes;
>
> if (!msg->sg_copy[i] && bytes_sg_total <= len)
> goto out;
>
> @@ -2382,18 +2384,18 @@ struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
> if (unlikely(!page))
> return -ENOMEM;
> p = page_address(page);
> - offset = 0;
> + off = 0;
>
> i = first_sg;
> do {
> from = sg_virt(&sg[i]);
> len = sg[i].length;
> - to = p + offset;
> + to = p + off;
>
> memcpy(to, from, len);
> - offset += len;
> + off += len;
> sg[i].length = 0;
> put_page(sg_page(&sg[i]));
>
> sk_msg_iter_var(i);
> } while (i != last_sg);
>
>
> Apart from the above bug fix, I see issues in the below 2 cases:
Ok.
> case 1:
> =======
> For things like RDS, where the sg list is not a ring, how do we know the
> end of the packet?
>
> For example, I have an RDS packet of length 8192 bytes which sits in a
> scatterlist like:
> sge[0].length = 1400
> sge[1].length = 1448
> sge[2].length = 1448
> sge[3].length = 1448
> sge[4].length = 1448
> sge[5].length = 1000
>
> Now when I execute "bpf_msg_pull_data(msg, 8190, 8196, 0)" on the above
> RDS packet, I expect bpf_msg_pull_data to return an error because end is
> out of bounds, i.e. 8196 > 8192. Instead, the current code wraps around to
> sge index 0! That is, if I dump the first 6 bytes starting at 'start', I
> get the byte values at offsets 8190 and 8191 from sge[5], and the values
> for offsets 8192, 8193, 8194 and 8195 from sge[0] at offsets 0, 1, 2 and 3
> respectively - which is not expected!
For the sg ring case, which is what is upstream today, it will find first_sg
as 5 and last_sg as 6, and therefore bail out on the bytes_sg_total > copy
test since 1004 > 1000. sge[5]'s offset is 7192 with a length of 1000 bytes,
so start lands at offset 998 within it and end at 1004. So I get -EINVAL from
the mentioned test.
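
For example, a minimal sk_msg program along the lines below (just a sketch,
not taken from the patches above; the section name, the hard-coded bounds and
the include paths are only illustrative and depend on your libbpf/selftests
setup) should simply see that error return:

/* Illustrative sk_msg program: request bytes [8190, 8196) of the 8192
 * byte message from the example above. The helper is expected to fail
 * the bytes_sg_total > copy check and return a negative error instead
 * of wrapping around the sg ring.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("sk_msg")
int pull_oob(struct sk_msg_md *msg)
{
	if (bpf_msg_pull_data(msg, 8190, 8196, 0))
		return SK_PASS;	/* pull failed, message data left untouched */

	/* On success, msg->data/msg->data_end would cover [8190, 8196). */
	return SK_PASS;
}

char _license[] SEC("license") = "GPL";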
Are msg->sg_start and msg->sg_end set up properly in your RDS case? As long
as there's no wrap-around, the helper should be usable in the RDS case as
well. I'm not sure though why you see the above out-of-bounds result.
> case 2:
> =======
> With the same example RDS packet as above, when I execute
> "bpf_msg_pull_data(msg, 7191, 8192, 0)"
> I get -EINVAL, which is *wrong*.
>
> Debugging this, I found that right before we enter the below do-while loop
> the variable values are:
> i = 4
> first_sg = 4
> last_sg= 5
> start=7191
> bytes_sg_total=2448
> offset=5744
> copy=0
> len=1448
>
> do {
> copy += sg[i].length;
> sk_msg_iter_var(i);
> if (bytes_sg_total <= copy)
> break;
> } while (i != msg->sg_end);
> last_sg = i;
>
> However, the above loop executes only once, because after one iteration
> i=5 and msg->sg_end=5, which makes the condition "while (i != msg->sg_end)"
> false. So when the loop exits, the values are:
> i = 5
> first_sg=4
> last_sg= 5
> start=7191
> bytes_sg_total=2448 <<<<<<
> offset=5744
> copy=1448 <<<<<<
> len=1448
>
> And that makes the below condition true:
> if (unlikely(bytes_sg_total > copy))
> return -EINVAL;
Hmm, is your msg->sg_end correct? I'm getting a first_sg of 4 and a last_sg
of 6. So it breaks on the bytes_sg_total <= copy test, and then merges sges
4 and 5 into sge 4 with a length of 2448 while dropping sge 5. How do you set
up the sk_msg_buff?
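
To make the difference visible, here is a small userspace sketch (not kernel
code; numbers taken from your trace) replaying the last_sg walk under the
assumption that msg->sg_end follows the "last used slot + 1" convention,
i.e. sg_end = 6:

#include <stdio.h>

int main(void)
{
	unsigned int len[6] = { 1400, 1448, 1448, 1448, 1448, 1000 };
	unsigned int start = 7191, end = 8192, offset = 5744;
	unsigned int bytes = end - start;			/* 1001 */
	unsigned int bytes_sg_total = start - offset + bytes;	/* 2448 */
	unsigned int copy = 0;
	int i = 4, sg_end = 6;					/* first_sg = 4 */

	do {
		copy += len[i];
		i++;					/* no wrap-around needed here */
		if (bytes_sg_total <= copy)
			break;
	} while (i != sg_end);

	/* Prints last_sg=6 copy=2448, so bytes_sg_total > copy is false and
	 * the helper does not bail out with -EINVAL.
	 */
	printf("last_sg=%d copy=%u bytes_sg_total=%u\n", i, copy, bytes_sg_total);
	return 0;
}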
> There may be one or more similar corner scenarios like cases 1 and 2
> which can be fixed, however I am not sure how we can fix them so that we
> get it right for both an sg ring and an sg list.
Imho, it should be sufficient to have the invariant msg->sg_start < msg->sg_end,
with msg->sg_start pointing to the starting sge and msg->sg_end to the last
sge slot + 1.
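
To make that concrete, a hypothetical caller-side setup could look roughly as
below (a sketch only, not from this thread; how RDS actually builds its
sk_msg_buff is not shown here, example_init_msg/nr_sge are made-up names, and
the field names simply follow the struct as used in the quoted code):

#include <linux/filter.h>
#include <linux/scatterlist.h>
#include <linux/skbuff.h>
#include <linux/string.h>

/* Populate a sk_msg_buff from a plain (non-wrapping) sg list while keeping
 * the invariant sg_start < sg_end, with sg_end one past the last used slot.
 */
static void example_init_msg(struct sk_msg_buff *md,
			     struct scatterlist *sge, int nr_sge)
{
	int i;

	if (WARN_ON_ONCE(nr_sge > MAX_SKB_FRAGS))
		return;

	memset(md, 0, sizeof(*md));
	for (i = 0; i < nr_sge; i++)
		md->sg_data[i] = sge[i];

	md->sg_start = 0;	/* first used sge */
	md->sg_end = nr_sge;	/* last used slot + 1, so sg_start < sg_end */

	/* Initial direct-access window over the first sge only; a later
	 * bpf_msg_pull_data() call widens it as needed.
	 */
	md->data = sg_virt(&md->sg_data[0]);
	md->data_end = md->data + md->sg_data[0].length;
}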
> Thanks.
> -Tushar
>>
>> https://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git/commit/net/core/filter.c?id=5b24109b0563d45094c470684c1f8cea1af269f8
>> https://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git/commit/net/core/filter.c?id=0e06b227c5221dd51b5569de93f3b9f532be4a32
>> https://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git/commit/net/core/filter.c?id=2e43f95dd8ee62bc8bf57f2afac37fbd70c8d565
>> https://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git/commit/net/core/filter.c?id=a8cf76a9023bc6709b1361d06bb2fae5227b9d68
>>
>> Thanks,
>> Daniel
>>