Provide an ops->swapfile() implementation for NFS. This will set the NFS socket to SOCK_VMIO and run socket reconnect under PF_MEMALLOC as well as reset SOCK_VMIO before engaging the protocol ->connect() method. PF_MEMALLOC should allow the allocation of struct socket and related objects and the early (re)setting of SOCK_VMIO should allow us to receive the packets required for the TCP connection buildup. (swapping continues over a server reset during heavy network traffic) Signed-off-by: Peter Zijlstra Cc: Trond Myklebust --- fs/Kconfig | 14 ++++++++++++ fs/nfs/file.c | 6 +++++ include/linux/sunrpc/xprt.h | 5 +++- net/sunrpc/sched.c | 13 +++++++---- net/sunrpc/xprtsock.c | 49 ++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 81 insertions(+), 6 deletions(-) Index: linux-2.6-git/fs/nfs/file.c =================================================================== --- linux-2.6-git.orig/fs/nfs/file.c 2007-02-21 12:15:16.000000000 +0100 +++ linux-2.6-git/fs/nfs/file.c 2007-02-21 12:15:19.000000000 +0100 @@ -324,6 +324,11 @@ static int nfs_launder_page(struct page return nfs_wb_page(page_file_mapping(page)->host, page); } +static int nfs_swapfile(struct address_space *mapping, int enable) +{ + return xs_swapper(NFS_CLIENT(mapping->host)->cl_xprt, enable); +} + const struct address_space_operations nfs_file_aops = { .readpage = nfs_readpage, .readpages = nfs_readpages, @@ -338,6 +343,7 @@ const struct address_space_operations nf .direct_IO = nfs_direct_IO, #endif .launder_page = nfs_launder_page, + .swapfile = nfs_swapfile, }; static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov, Index: linux-2.6-git/include/linux/sunrpc/xprt.h =================================================================== --- linux-2.6-git.orig/include/linux/sunrpc/xprt.h 2007-02-21 11:04:08.000000000 +0100 +++ linux-2.6-git/include/linux/sunrpc/xprt.h 2007-02-21 12:15:19.000000000 +0100 @@ -149,7 +149,9 @@ struct rpc_xprt { unsigned int max_reqs; /* total slots */ unsigned long state; /* transport state */ unsigned char shutdown : 1, /* being shut down */ - resvport : 1; /* use a reserved port */ + resvport : 1, /* use a reserved port */ + swapper : 1; /* we're swapping over this + transport */ /* * Connection of transports @@ -241,6 +243,7 @@ void xprt_disconnect(struct rpc_xprt * */ struct rpc_xprt * xs_setup_udp(struct sockaddr *addr, size_t addrlen, struct rpc_timeout *to); struct rpc_xprt * xs_setup_tcp(struct sockaddr *addr, size_t addrlen, struct rpc_timeout *to); +int xs_swapper(struct rpc_xprt *xprt, int enable); /* * Reserved bit positions in xprt->state Index: linux-2.6-git/net/sunrpc/sched.c =================================================================== --- linux-2.6-git.orig/net/sunrpc/sched.c 2007-02-21 11:04:08.000000000 +0100 +++ linux-2.6-git/net/sunrpc/sched.c 2007-02-21 12:15:19.000000000 +0100 @@ -751,10 +751,13 @@ void * rpc_malloc(struct rpc_task *task, struct rpc_rqst *req = task->tk_rqstp; gfp_t gfp; - if (task->tk_flags & RPC_TASK_SWAPPER) - gfp = GFP_ATOMIC; - else - gfp = GFP_NOFS; + /* + * this rcpio thread might be needed by reclaim, hence we cannot + * wait on a regular alloc to succeed. + */ + gfp = GFP_ATOMIC; + if (RPC_IS_SWAPPER(task)) + gfp |= __GFP_EMERGENCY; if (size > RPC_BUFFER_MAXSIZE) { req->rq_buffer = kmalloc(size, gfp); @@ -834,7 +837,7 @@ void rpc_init_task(struct rpc_task *task static struct rpc_task * rpc_alloc_task(void) { - return (struct rpc_task *)mempool_alloc(rpc_task_mempool, GFP_NOFS); + return (struct rpc_task *)mempool_alloc(rpc_task_mempool, GFP_NOIO); } static void rpc_free_task(struct rcu_head *rcu) Index: linux-2.6-git/net/sunrpc/xprtsock.c =================================================================== --- linux-2.6-git.orig/net/sunrpc/xprtsock.c 2007-02-21 11:04:08.000000000 +0100 +++ linux-2.6-git/net/sunrpc/xprtsock.c 2007-02-21 12:15:19.000000000 +0100 @@ -1215,11 +1215,15 @@ static void xs_udp_connect_worker(struct container_of(work, struct sock_xprt, connect_worker.work); struct rpc_xprt *xprt = &transport->xprt; struct socket *sock = transport->sock; + unsigned long pflags = current->flags; int err, status = -EIO; if (xprt->shutdown || !xprt_bound(xprt)) goto out; + if (xprt->swapper) + current->flags |= PF_MEMALLOC; + /* Start by resetting any existing state */ xs_close(xprt); @@ -1257,6 +1261,9 @@ static void xs_udp_connect_worker(struct transport->sock = sock; transport->inet = sk; + if (xprt->swapper) + sk_set_vmio(sk); + write_unlock_bh(&sk->sk_callback_lock); } xs_udp_do_set_buffer_size(xprt); @@ -1264,6 +1271,7 @@ static void xs_udp_connect_worker(struct out: xprt_wake_pending_tasks(xprt, status); xprt_clear_connecting(xprt); + current->flags = pflags; } /* @@ -1302,11 +1310,15 @@ static void xs_tcp_connect_worker(struct container_of(work, struct sock_xprt, connect_worker.work); struct rpc_xprt *xprt = &transport->xprt; struct socket *sock = transport->sock; + unsigned long pflags = current->flags; int err, status = -EIO; if (xprt->shutdown || !xprt_bound(xprt)) goto out; + if (xprt->swapper) + current->flags |= PF_MEMALLOC; + if (!sock) { /* start from scratch */ if ((err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) { @@ -1356,6 +1368,10 @@ static void xs_tcp_connect_worker(struct write_unlock_bh(&sk->sk_callback_lock); } + + if (xprt->swapper) + sk_set_vmio(transport->inet); + /* Tell the socket layer to start connecting... */ xprt->stat.connect_count++; xprt->stat.connect_start = jiffies; @@ -1383,6 +1399,7 @@ out: xprt_wake_pending_tasks(xprt, status); out_clear: xprt_clear_connecting(xprt); + current->flags = pflags; } /** @@ -1642,6 +1659,38 @@ int init_socket_xprt(void) return 0; } +#define RPC_BUF_RESERVE_PAGES (RPC_MAX_SLOT_TABLE) +#define RPC_RESERVE_PAGES (RPC_BUF_RESERVE_PAGES + TX_RESERVE_PAGES) + +/** + * xs_swapper - Tag this transport as being used for swap. + * @xprt: transport to tag + * @enable: enable/disable + * + */ +int xs_swapper(struct rpc_xprt *xprt, int enable) +{ + struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); + int err = 0; + + if (enable) { + /* + * keep one extra sock reference so the reserve won't dip + * when the socket gets reconnected. + */ + sk_adjust_memalloc(1, RPC_RESERVE_PAGES); + sk_set_vmio(transport->inet); + xprt->swapper = 1; + } else if (xprt->swapper) { + xprt->swapper = 0; + sk_clear_vmio(transport->inet); + sk_adjust_memalloc(-1, -RPC_RESERVE_PAGES); + } + + return err; +} +EXPORT_SYMBOL_GPL(xs_swapper); + /** * cleanup_socket_xprt - remove xprtsock's sysctls * Index: linux-2.6-git/fs/Kconfig =================================================================== --- linux-2.6-git.orig/fs/Kconfig 2007-02-21 11:04:08.000000000 +0100 +++ linux-2.6-git/fs/Kconfig 2007-02-21 12:15:19.000000000 +0100 @@ -1621,6 +1621,20 @@ config NFS_DIRECTIO causes open() to return EINVAL if a file residing in NFS is opened with the O_DIRECT flag. +config NFS_SWAP + bool "Provide swap over NFS support" + default n + depends on NFS_FS + select SLAB_FAIR + select NETVM + select SWAP_FILE + help + This option enables swapon to work on files located on NFS mounts. + + For more details, see Documentation/vm_deadlock.txt + + If unsure, say N. + config NFSD tristate "NFS server support" depends on INET -- - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/