[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20080401180751.GA3263@2ka.mipt.ru>
Date: Tue, 1 Apr 2008 22:07:51 +0400
From: Evgeniy Polyakov <johnpol@....mipt.ru>
To: Eric Dumazet <dada1@...mosbay.com>
Cc: David Miller <davem@...emloft.net>, axboe@...nel.dk,
netdev@...r.kernel.org
Subject: Re: Fix for the fundamental network/block layer race in sendfile().
On Tue, Apr 01, 2008 at 09:47:59PM +0400, Evgeniy Polyakov (johnpol@....mipt.ru) wrote:
> > 1) Is out_file garanted to be a socket ?
> > 2) SO_PRIVATE_CALLBACK is defined to be 37 later on your patch. Is ORing 37
> > to sk_flags really working ???
> > 3) So, once a socket has been used for a sendfile(), all subsequent
> > sendmsg() will call skb_splice_destructor ? (I see no clearing of
> > SO_PRIVATE_CALLBACK)
> > This will probably crash.
> > 4) god_blessed_us name ... Oh I got it, its April the first !!!!
Addressed all above (2 and 3 actually).
The formed by using correct socket flag (set_bit/clear_bit, like in a
real life), the latter by clearing bit on sendfile() exit and to fix
exit/clear race each spliced page also setups lru.prev to itself, which
is checked in destructor.
Name was not changed, does it matter?
Patch still works ok :)
Thanks Eric for pointing that bugs.
Signed-off-as-usual.
diff --git a/fs/read_write.c b/fs/read_write.c
index 49a9871..fda55f6 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -16,6 +16,7 @@
#include <linux/syscalls.h>
#include <linux/pagemap.h>
#include <linux/splice.h>
+#include <net/sock.h>
#include "read_write.h"
#include <asm/uaccess.h>
@@ -703,6 +704,8 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
loff_t pos;
ssize_t retval;
int fput_needed_in, fput_needed_out, fl;
+ struct sock *sk;
+ struct socket *sock;
/*
* Get input file, and verify that it is ok..
@@ -762,6 +765,12 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
count = max - pos;
}
+ sock = out_file->private_data;
+ sk = sock->sk;
+
+ sk->sk_user_data = &skb_splice_destructor;
+ set_bit(SOCK_PRIVATE_CALLBACK, &sock->flags);
+
fl = 0;
#if 0
/*
@@ -775,6 +784,8 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
#endif
retval = do_splice_direct(in_file, ppos, out_file, count, fl);
+ clear_bit(SOCK_PRIVATE_CALLBACK, &sock->flags);
+
if (retval > 0) {
add_rchar(current, retval);
add_wchar(current, retval);
diff --git a/fs/splice.c b/fs/splice.c
index 0670c91..dcda32e 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -29,6 +29,7 @@
#include <linux/syscalls.h>
#include <linux/uio.h>
#include <linux/security.h>
+#include <net/sock.h>
/*
* Attempt to steal a page from a pipe buffer. This should perhaps go into
@@ -535,6 +536,8 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe,
if (!ret) {
more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
+ buf->page->lru.next = (void *)pipe;
+ buf->page->lru.prev = (void *)buf->page;
ret = file->f_op->sendpage(file, buf->page, buf->offset,
sd->len, &pos, more);
}
@@ -629,6 +632,9 @@ ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
ret = 0;
do_wakeup = 0;
+ if (actor == pipe_to_sendpage)
+ atomic_set(&pipe->god_blessed_us, pipe->nrbufs);
+
for (;;) {
if (pipe->nrbufs) {
struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
@@ -665,12 +671,20 @@ ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
do_wakeup = 1;
}
- if (!sd->total_len)
+ if (!sd->total_len) {
+ if (actor == pipe_to_sendpage)
+ wait_event_interruptible(pipe->wait,
+ !atomic_read(&pipe->god_blessed_us));
break;
+ }
}
if (pipe->nrbufs)
continue;
+
+ if (actor == pipe_to_sendpage)
+ wait_event_interruptible(pipe->wait, !atomic_read(&pipe->god_blessed_us));
+
if (!pipe->writers)
break;
if (!pipe->waiting_writers) {
diff --git a/include/asm-x86/socket.h b/include/asm-x86/socket.h
diff --git a/include/linux/net.h b/include/linux/net.h
index c414d90..5977a5e 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -65,6 +65,7 @@ typedef enum {
#define SOCK_NOSPACE 2
#define SOCK_PASSCRED 3
#define SOCK_PASSSEC 4
+#define SOCK_PRIVATE_CALLBACK 5
#ifndef ARCH_HAS_SOCKET_TYPES
/**
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 8e41202..465405a 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -51,6 +51,7 @@ struct pipe_inode_info {
unsigned int waiting_writers;
unsigned int r_counter;
unsigned int w_counter;
+ atomic_t god_blessed_us;
struct fasync_struct *fasync_readers;
struct fasync_struct *fasync_writers;
struct inode *inode;
diff --git a/include/net/sock.h b/include/net/sock.h
index fd98760..441dcd2 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -862,6 +862,8 @@ extern struct sk_buff *sock_rmalloc(struct sock *sk,
extern void sock_wfree(struct sk_buff *skb);
extern void sock_rfree(struct sk_buff *skb);
+extern void skb_splice_destructor(struct sk_buff *skb);
+
extern int sock_setsockopt(struct socket *sock, int level,
int op, char __user *optval,
int optlen);
@@ -1168,6 +1170,8 @@ static inline void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
sock_hold(sk);
skb->sk = sk;
skb->destructor = sock_wfree;
+ if (sk->sk_socket && test_bit(SOCK_PRIVATE_CALLBACK, &sk->sk_socket->flags))
+ skb->destructor = sk->sk_user_data;
atomic_add(skb->truesize, &sk->sk_wmem_alloc);
}
diff --git a/net/core/sock.c b/net/core/sock.c
index 2654c14..64aa36a 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -112,6 +112,7 @@
#include <linux/tcp.h>
#include <linux/init.h>
#include <linux/highmem.h>
+#include <linux/pipe_fs_i.h>
#include <asm/uaccess.h>
#include <asm/system.h>
@@ -1116,6 +1117,26 @@ void sock_wfree(struct sk_buff *skb)
sock_put(sk);
}
+void skb_splice_destructor(struct sk_buff *skb)
+{
+ if (skb_shinfo(skb)->nr_frags) {
+ int i;
+ struct pipe_inode_info *pipe;
+
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ struct page *page = skb_shinfo(skb)->frags[i].page;
+
+ if (page->lru.prev == (struct list_head *)page) {
+ pipe = (struct pipe_inode_info *)page->lru.next;
+ if (atomic_dec_return(&pipe->god_blessed_us) == 0)
+ wake_up(&pipe->wait);
+ }
+ }
+ }
+
+ sock_wfree(skb);
+}
+
/*
* Read buffer destructor automatically called from kfree_skb.
*/
@@ -2164,6 +2185,7 @@ EXPORT_SYMBOL(sock_no_socketpair);
EXPORT_SYMBOL(sock_rfree);
EXPORT_SYMBOL(sock_setsockopt);
EXPORT_SYMBOL(sock_wfree);
+EXPORT_SYMBOL(skb_splice_destructor);
EXPORT_SYMBOL(sock_wmalloc);
EXPORT_SYMBOL(sock_i_uid);
EXPORT_SYMBOL(sock_i_ino);
--
Evgeniy Polyakov
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists