[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <47F26D87.7020006@kolumbus.fi>
Date: Tue, 01 Apr 2008 20:14:47 +0300
From: Mika Penttilä <mika.penttila@...umbus.fi>
To: Evgeniy Polyakov <johnpol@....mipt.ru>
CC: David Miller <davem@...emloft.net>, axboe@...nel.dk,
netdev@...r.kernel.org
Subject: Re: Fix for the fundamental network/block layer race in sendfile().
Evgeniy Polyakov wrote:
> Hi.
>
> Summary of the previous series with this pompous header:
> when sendfile() returns, pages which it sent can still be queued in tcp
> stack or hardware, so subsequent write into them will endup in
> corrupting data which will be eventually sent. This concerns all
> ->sendpage() users namely sendfile() and splice().
>
> We can only safely reuse that pages only when ack is received from the
> remote side, which will force network stack to release pages.
>
> My simple extension allows to hook into data releasing path and perform
> any actions we want. This is achieved by replacing skb->destructor with
> own callback registerd by interested user, for example splice/sendfile
> code. Splice (pipe info structure) in turn is extended to hold atomic
> counter of the pages in flight (without structure size change because of
> alignment issues it has right now), so splice code will sleep when full
> pipe info (->nrbufs pages) was sent, it will wait until number of pages
> in flight hits zero, which is decremented in private splice callback.
>
> Patch was tested with simple send and recv applications, which can be
> found at
> http://tservice.net.ru/~s0mbre/archive/tmp/sendfile_fix/
>
> One has to run them on different machines, since loopback uses a bit
> different scheme (namely page is _never_ copied, so when it is received
> by 'remote' side, it still exists on the 'local' side, so modifications
> will endup in data corruption).
> devfs1# ./recv -a 0.0.0.0 -p 1025 -c 1024
> devfs2# ./send -a devfs1 -p 1025 -f /tmp/test -c 1024
>
> In case of failure you will get this:
> Connected to devfs1:1025.
> /tmp/test/1024 -> devfs1:1025
> Data was corrupted: ab.
>
> after short period of time, where above 'ab' is a hex byte writen into
> mapped file, which has been sent, immediately after senfile()
> returns to userspace.
> Data is supposed to be always zero, and applications should run forever.
> -c parameter specifies number of bytes to be sent in each run of the
> sendfile(). It has to be the same on both machines.
>
> Fix attached, consider for inclusion.
>
> Signed-off-by: Evgeniy Polyakov <johnpol@....mipt.ru>
>
> diff --git a/fs/read_write.c b/fs/read_write.c
> index 49a9871..8c94e03 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -16,6 +16,7 @@
> #include <linux/syscalls.h>
> #include <linux/pagemap.h>
> #include <linux/splice.h>
> +#include <net/sock.h>
> #include "read_write.h"
>
> #include <asm/uaccess.h>
> @@ -703,6 +704,8 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
> loff_t pos;
> ssize_t retval;
> int fput_needed_in, fput_needed_out, fl;
> + struct sock *sk;
> + struct socket *sock;
>
> /*
> * Get input file, and verify that it is ok..
> @@ -762,6 +765,12 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
> count = max - pos;
> }
>
> + sock = out_file->private_data;
> + sk = sock->sk;
> +
> + sk->sk_user_data = &skb_splice_destructor;
> + sk->sk_flags |= SO_PRIVATE_CALLBACK;
> +
> fl = 0;
> #if 0
> /*
> diff --git a/fs/splice.c b/fs/splice.c
> index 0670c91..1432b13 100644
> --- a/fs/splice.c
> +++ b/fs/splice.c
> @@ -29,6 +29,7 @@
> #include <linux/syscalls.h>
> #include <linux/uio.h>
> #include <linux/security.h>
> +#include <net/sock.h>
>
> /*
> * Attempt to steal a page from a pipe buffer. This should perhaps go into
> @@ -535,6 +536,7 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe,
> if (!ret) {
> more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
>
> + buf->page->lru.next = (void *)pipe;
> ret = file->f_op->sendpage(file, buf->page, buf->offset,
> sd->len, &pos, more);
> }
> @@ -629,6 +631,8 @@ ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
> ret = 0;
> do_wakeup = 0;
>
> + atomic_set(&pipe->god_blessed_us, pipe->nrbufs);
> +
> for (;;) {
> if (pipe->nrbufs) {
> struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
> @@ -665,12 +669,18 @@ ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
> do_wakeup = 1;
> }
>
> - if (!sd->total_len)
> + if (!sd->total_len) {
> + wait_event_interruptible(pipe->wait,
> + !atomic_read(&pipe->god_blessed_us));
> break;
> + }
> }
>
> if (pipe->nrbufs)
> continue;
> +
> + wait_event_interruptible(pipe->wait, !atomic_read(&pipe->god_blessed_us));
> +
> if (!pipe->writers)
> break;
> if (!pipe->waiting_writers) {
> diff --git a/include/asm-x86/socket.h b/include/asm-x86/socket.h
> index 80af9c4..a4b047e 100644
> --- a/include/asm-x86/socket.h
> +++ b/include/asm-x86/socket.h
> @@ -54,4 +54,6 @@
>
> #define SO_MARK 36
>
> +#define SO_PRIVATE_CALLBACK 37
> +
> #endif /* _ASM_SOCKET_H */
> diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
> index 8e41202..465405a 100644
> --- a/include/linux/pipe_fs_i.h
> +++ b/include/linux/pipe_fs_i.h
> @@ -51,6 +51,7 @@ struct pipe_inode_info {
> unsigned int waiting_writers;
> unsigned int r_counter;
> unsigned int w_counter;
> + atomic_t god_blessed_us;
> struct fasync_struct *fasync_readers;
> struct fasync_struct *fasync_writers;
> struct inode *inode;
> diff --git a/include/net/sock.h b/include/net/sock.h
> index fd98760..ac7bc52 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -862,6 +862,8 @@ extern struct sk_buff *sock_rmalloc(struct sock *sk,
> extern void sock_wfree(struct sk_buff *skb);
> extern void sock_rfree(struct sk_buff *skb);
>
> +extern void skb_splice_destructor(struct sk_buff *skb);
> +
> extern int sock_setsockopt(struct socket *sock, int level,
> int op, char __user *optval,
> int optlen);
> @@ -1168,6 +1170,8 @@ static inline void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
> sock_hold(sk);
> skb->sk = sk;
> skb->destructor = sock_wfree;
> + if (sk->sk_flags & SO_PRIVATE_CALLBACK)
> + skb->destructor = sk->sk_user_data;
> atomic_add(skb->truesize, &sk->sk_wmem_alloc);
> }
>
> diff --git a/net/core/sock.c b/net/core/sock.c
> index 2654c14..0c10581 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -112,6 +112,7 @@
> #include <linux/tcp.h>
> #include <linux/init.h>
> #include <linux/highmem.h>
> +#include <linux/pipe_fs_i.h>
>
> #include <asm/uaccess.h>
> #include <asm/system.h>
> @@ -1116,6 +1117,24 @@ void sock_wfree(struct sk_buff *skb)
> sock_put(sk);
> }
>
> +void skb_splice_destructor(struct sk_buff *skb)
> +{
> + if (skb_shinfo(skb)->nr_frags) {
> + int i;
> + struct pipe_inode_info *pipe;
> +
> + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
> + struct page *page = skb_shinfo(skb)->frags[i].page;
> +
> + pipe = (struct pipe_inode_info *)page->lru.next;
> + if (atomic_dec_return(&pipe->god_blessed_us) == 0)
> + wake_up(&pipe->wait);
> + }
> + }
> +
> + sock_wfree(skb);
> +}
> +
> /*
> * Read buffer destructor automatically called from kfree_skb.
> */
> @@ -2164,6 +2183,7 @@ EXPORT_SYMBOL(sock_no_socketpair);
> EXPORT_SYMBOL(sock_rfree);
> EXPORT_SYMBOL(sock_setsockopt);
> EXPORT_SYMBOL(sock_wfree);
> +EXPORT_SYMBOL(skb_splice_destructor);
> EXPORT_SYMBOL(sock_wmalloc);
> EXPORT_SYMBOL(sock_i_uid);
> EXPORT_SYMBOL(sock_i_ino);
>
>
>
Hi,
Aren't you breaking the other types of splices, like splice to file
which doesn't have the wakeup thing.
--Mika
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists