lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <150428052763.25051.5923957964084196488.stgit@warthog.procyon.org.uk>
Date:   Fri, 01 Sep 2017 16:42:07 +0100
From:   David Howells <dhowells@...hat.com>
To:     linux-afs@...ts.infradead.org
Cc:     linux-fsdevel@...r.kernel.org, dhowells@...hat.com,
        linux-kernel@...r.kernel.org
Subject: [RFC PATCH 11/11] afs: Retry rxrpc calls with address rotation on
 network error

If a network error occurs while we are attempting a call, we want to rotate
the set of addresses we have for that peer and try the call again.  Use the new
AF_RXRPC call-retrying facility to do this, thereby avoiding the need to
re-encrypt each time, as this allows us to reuse the Tx-queue from the dead
call.

This method will work for accessing alternate VL servers and the various
addresses available for a single FS server, but should not be used to go to
alternate FS servers since that has other implications (such as getting
callbacks on other servers).

To this end:

 (1) An 'address list' concept is introduced.  Address lists are RCU
     replaceable lists of addresses.

 (2) A cell's VL server address list can be loaded directly via insmod or
     echo to /proc/fs/afs/cells or dynamically from a DNS query for AFSDB
     or SRV records.

 (3) An FS server's address list, for the moment, has a single entry that
     is the key to the server list.  This will change in the future when a
     server is instead keyed on its UUID and the VL.GetAddrsU operation is
     used.

 (4) Anyone wanting to use a cell's VL server address must wait until the
     cell record comes online and has tried to obtain some addresses.

 (5) An 'address cursor' concept is introduced to handle stepping over the
     address list.  For client calls, this is driven from a wrapper around
     rxrpc_kernel_send_data().  It isn't used for CM service call replies as
     they have to go to the caller's address.

In the future, we might want to annotate the list with information about
how each address fares.  We might then want to propagate such annotations
over address list replacement.

Whilst we're at it, we allow IPv6 addresses to be specified in
colon-delimited lists by enclosing them in square brackets.

Signed-off-by: David Howells <dhowells@...hat.com>
---

 fs/afs/Makefile    |    1 
 fs/afs/addr_list.c |  310 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/afs/cell.c      |  191 ++++++++++++--------------------
 fs/afs/fsclient.c  |   54 ++++++---
 fs/afs/internal.h  |   60 ++++++++--
 fs/afs/proc.c      |   23 ++--
 fs/afs/rxrpc.c     |  180 +++++++++++++++++++++++++++---
 fs/afs/server.c    |   66 +++++++----
 fs/afs/vlclient.c  |   19 ++-
 fs/afs/vlocation.c |  150 +++----------------------
 fs/afs/vnode.c     |   27 ++---
 fs/afs/volume.c    |    5 +
 12 files changed, 720 insertions(+), 366 deletions(-)
 create mode 100644 fs/afs/addr_list.c

diff --git a/fs/afs/Makefile b/fs/afs/Makefile
index 095c54165dfd..7cb4d55f6f1f 100644
--- a/fs/afs/Makefile
+++ b/fs/afs/Makefile
@@ -6,6 +6,7 @@ afs-cache-$(CONFIG_AFS_FSCACHE) := cache.o
 
 kafs-objs := \
 	$(afs-cache-y) \
+	addr_list.o \
 	callback.o \
 	cell.o \
 	cmservice.o \
diff --git a/fs/afs/addr_list.c b/fs/afs/addr_list.c
new file mode 100644
index 000000000000..c2dcb6021cb2
--- /dev/null
+++ b/fs/afs/addr_list.c
@@ -0,0 +1,310 @@
+/* Server address list management
+ *
+ * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@...hat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/slab.h>
+#include <linux/ctype.h>
+#include <linux/dns_resolver.h>
+#include <linux/inet.h>
+#include <keys/rxrpc-type.h>
+#include "internal.h"
+
+#define AFS_MAX_ADDRESSES \
+	((PAGE_SIZE - sizeof(struct afs_addr_list)) / sizeof(struct sockaddr_rxrpc))
+
+/*
+ * Release a reference on an address list.
+ *
+ * @alist may be NULL, in which case nothing is done.  When the last reference
+ * is dropped, the list is freed after an RCU grace period so that readers
+ * still looking at it under rcu_read_lock() are not left with a dangling
+ * pointer.
+ */
+void afs_put_addrlist(struct afs_addr_list *alist)
+{
+	/* Use kfree_rcu() rather than call_rcu() with kfree() cast to
+	 * rcu_callback_t: calling kfree() through a pointer of a different
+	 * function type is undefined behaviour.
+	 */
+	if (alist && refcount_dec_and_test(&alist->usage))
+		kfree_rcu(alist, rcu);
+}
+
+/*
+ * Allocate an address list with room for @nr addresses.
+ *
+ * Every slot is pre-initialised as an AF_RXRPC address for @service carried
+ * over an IPv6 datagram transport on @port (given in host byte order), leaving
+ * only the IP address itself to be filled in.  nr_addrs is left at zero; the
+ * caller counts slots in as it populates them.
+ *
+ * Returns the new list with a usage count of 1, or NULL if out of memory.
+ */
+static struct afs_addr_list *afs_alloc_addrlist(unsigned int nr,
+						unsigned short service,
+						unsigned short port)
+{
+	struct afs_addr_list *alist;
+	unsigned int i;
+
+	_enter("%u,%u,%u", nr, service, port);
+
+	/* Size the header from the struct, not from the pointer to it:
+	 * sizeof(alist) would under-allocate and let the slot initialisation
+	 * below write past the end of the buffer.
+	 */
+	alist = kzalloc(sizeof(*alist) + sizeof(alist->addrs[0]) * nr,
+			GFP_KERNEL);
+	if (!alist)
+		return NULL;
+
+	refcount_set(&alist->usage, 1);
+
+	for (i = 0; i < nr; i++) {
+		struct sockaddr_rxrpc *srx = &alist->addrs[i];
+		srx->srx_family			= AF_RXRPC;
+		srx->srx_service		= service;
+		srx->transport_type		= SOCK_DGRAM;
+		srx->transport_len		= sizeof(srx->transport.sin6);
+		srx->transport.sin6.sin6_family	= AF_INET6;
+		srx->transport.sin6.sin6_port	= htons(port);
+	}
+
+	return alist;
+}
+
+/*
+ * Parse a text string consisting of delimited addresses.
+ *
+ * @text, @len: The text to parse; only the first @len bytes are looked at.
+ * @delim: The separator between addresses.  If ':' is requested but the text
+ *	contains a ',' or contains no '.', then ',' is used instead.
+ * @service, @port: The rxrpc service ID and default port for every address.
+ *
+ * Each element is an IPv4 address (stored as an IPv4-mapped IPv6 address) or
+ * an IPv6 address and may be enclosed in square brackets.  After the address
+ * (and its closing bracket, if any), a "+<port>" suffix overrides @port.
+ * Empty elements (leading or repeated delimiters) are skipped.  No more than
+ * AFS_MAX_ADDRESSES addresses are taken; text beyond that is ignored.
+ *
+ * Returns the new list with a usage count of 1, ERR_PTR(-EDESTADDRREQ) if
+ * @len is zero, ERR_PTR(-EINVAL) if the text is malformed or ERR_PTR(-ENOMEM)
+ * if the list cannot be allocated.
+ */
+struct afs_addr_list *afs_parse_text_addrs(const char *text, size_t len,
+					   char delim,
+					   unsigned short service,
+					   unsigned short port)
+{
+	struct afs_addr_list *alist;
+	const char *p, *end = text + len;
+	unsigned int nr = 0;
+
+	_enter("%*.*s,%c", (int)len, (int)len, text, delim);
+
+	if (!len)
+		return ERR_PTR(-EDESTADDRREQ);
+
+	/* Fall back to ',' when ':' was asked for but the text has a comma or
+	 * has no dot in it - presumably because any colons then belong to IPv6
+	 * addresses rather than being separators.
+	 */
+	if (delim == ':' && (memchr(text, ',', len) || !memchr(text, '.', len)))
+		delim = ',';
+
+	/* Count the addresses */
+	p = text;
+	do {
+		if (!*p)
+			return ERR_PTR(-EINVAL);
+		if (*p == delim) {
+			/* Empty element: step over the delimiter.  A bare
+			 * "continue" would retest p < end without advancing
+			 * and so spin forever.
+			 */
+			p++;
+			continue;
+		}
+		nr++;
+		if (*p == '[') {
+			/* Skip to the closing bracket so that delimiter
+			 * characters inside "[...]" aren't taken as
+			 * separators.
+			 */
+			p++;
+			if (p == end)
+				return ERR_PTR(-EINVAL);
+			p = memchr(p, ']', end - p);
+			if (!p)
+				return ERR_PTR(-EINVAL);
+			p++;
+			if (p >= end)
+				break;
+		}
+
+		p = memchr(p, delim, end - p);
+		if (!p)
+			break;
+		p++;
+	} while (p < end);
+
+	_debug("%u/%lu addresses", nr, AFS_MAX_ADDRESSES);
+	if (nr > AFS_MAX_ADDRESSES)
+		nr = AFS_MAX_ADDRESSES;
+
+	alist = afs_alloc_addrlist(nr, service, port);
+	if (!alist)
+		return ERR_PTR(-ENOMEM);
+
+	/* Extract the addresses */
+	p = text;
+	do {
+		struct sockaddr_rxrpc *srx = &alist->addrs[alist->nr_addrs];
+		char tdelim = delim;
+
+		if (*p == delim) {
+			p++;
+			continue;
+		}
+
+		if (*p == '[') {
+			p++;
+			tdelim = ']';
+		}
+
+		if (in4_pton(p, end - p,
+			     (u8 *)&srx->transport.sin6.sin6_addr.s6_addr32[3],
+			     tdelim, &p)) {
+			/* Make it an IPv4-mapped IPv6 address (::ffff:a.b.c.d) */
+			srx->transport.sin6.sin6_addr.s6_addr32[0] = 0;
+			srx->transport.sin6.sin6_addr.s6_addr32[1] = 0;
+			srx->transport.sin6.sin6_addr.s6_addr32[2] = htonl(0xffff);
+		} else if (in6_pton(p, end - p,
+				    srx->transport.sin6.sin6_addr.s6_addr,
+				    tdelim, &p)) {
+			/* Nothing to do */
+		} else {
+			goto bad_address;
+		}
+
+		if (tdelim == ']') {
+			if (p == end || *p != ']')
+				goto bad_address;
+			p++;
+		}
+
+		if (p < end) {
+			if (*p == '+') {
+				/* Port number specification "+1234" */
+				unsigned int xport = 0;
+				p++;
+				if (p >= end || !isdigit(*p))
+					goto bad_address;
+				do {
+					xport *= 10;
+					xport += *p - '0';
+					if (xport > 65535)
+						goto bad_address;
+					p++;
+				} while (p < end && isdigit(*p));
+				srx->transport.sin6.sin6_port = htons(xport);
+			} else if (*p == delim) {
+				p++;
+			} else {
+				goto bad_address;
+			}
+		}
+
+		alist->nr_addrs++;
+
+		/* Stop at the number of slots actually allocated rather than
+		 * at AFS_MAX_ADDRESSES: should the two passes ever disagree
+		 * (e.g. "[::1]+80[::2]" is counted as one element), going on
+		 * would write beyond the end of addrs[].
+		 */
+	} while (p < end && alist->nr_addrs < nr);
+
+	_leave(" = [nr %u]", alist->nr_addrs);
+	return alist;
+
+bad_address:
+	/* The list was never published, so it can be freed directly. */
+	kfree(alist);
+	return ERR_PTR(-EINVAL);
+}
+
+/*
+ * Compare old and new address lists to see if there's been any change.
+ * - How to do this in better than O(Nlog(N)) time?
+ *   - We don't really want to sort the address list, but would rather take the
+ *     list as we got it so as not to undo record rotation by the DNS server.
+ */
+#if 0
+static int afs_cmp_addr_list(const struct afs_addr_list *a1,
+			     const struct afs_addr_list *a2)
+{
+}
+#endif
+
+/*
+ * Perform a DNS query for a cell's VL servers and build up an address list
+ * from the answer.
+ *
+ * @_expiry is handed straight to dns_query().  Returns the parsed list or an
+ * ERR_PTR(); any parse failure other than -ENOMEM is logged.
+ */
+struct afs_addr_list *afs_dns_query(struct afs_cell *cell, time64_t *_expiry)
+{
+	struct afs_addr_list *parsed;
+	char *vllist = NULL;
+	int ret;
+
+	_enter("%s", cell->name);
+
+	ret = dns_query("afsdb", cell->name, cell->name_len,
+			"ipv4", &vllist, _expiry);
+	if (ret < 0)
+		return ERR_PTR(ret);
+
+	parsed = afs_parse_text_addrs(vllist, strlen(vllist), ',',
+				      VL_SERVICE, AFS_VL_PORT);
+
+	/* The text is finished with whether or not it parsed. */
+	kfree(vllist);
+
+	if (IS_ERR(parsed) && PTR_ERR(parsed) != -ENOMEM)
+		pr_err("Failed to parse DNS data\n");
+	return parsed;
+}
+
+/*
+ * Step the cursor on and return the next address to try.
+ *
+ * Returns ERR_PTR(ac->error) if the cursor has no address list attached, or
+ * ERR_PTR(-EDESTADDRREQ) once ac->index has stepped up to the number of
+ * addresses in the list.
+ */
+struct sockaddr_rxrpc *afs_get_address(struct afs_addr_cursor *ac)
+{
+	struct afs_addr_list *alist = ac->alist;
+	unsigned short slot;
+
+	_enter("%hu+%hd", ac->start, (short)ac->index);
+
+	if (!alist)
+		return ERR_PTR(ac->error);
+
+	/* ac->index is the offset from ac->start of the address handed out. */
+	if (++ac->index == alist->nr_addrs)
+		return ERR_PTR(-EDESTADDRREQ);
+
+	/* Wrap round the end of the array. */
+	slot = ac->start + ac->index;
+	if (slot >= alist->nr_addrs)
+		slot -= alist->nr_addrs;
+
+	return alist->addrs + slot;
+}
+
+/*
+ * Finish with an address list cursor, dropping its ref on the address list.
+ */
+void afs_end_cursor(struct afs_addr_cursor *ac)
+{
+	struct afs_addr_list *alist = ac->alist;
+
+	/* afs_put_addrlist() ignores a NULL list. */
+	afs_put_addrlist(alist);
+}
+
+/*
+ * Set the address cursor for iterating over VL servers.
+ *
+ * If the cell has no VL address list yet, wait (interruptibly) for
+ * AFS_CELL_FL_NO_LOOKUP_YET to be cleared.  On success the cursor holds a ref
+ * on the cell's list and is primed so that the first afs_get_address() call
+ * yields the list's current address.  On failure only call->cursor.error is
+ * set; cursor.alist is left untouched (presumably NULL from when the call was
+ * allocated - TODO confirm).
+ */
+void afs_set_vl_cursor(struct afs_call *call, struct afs_cell *cell)
+{
+	struct afs_addr_cursor *ac = &call->cursor;
+	struct afs_addr_list *alist;
+	int ret;
+
+	if (!rcu_access_pointer(cell->vl_addrs)) {
+		ret = wait_on_bit(&cell->flags, AFS_CELL_FL_NO_LOOKUP_YET,
+				  TASK_INTERRUPTIBLE);
+		if (ret < 0) {
+			ac->error = ret;
+			return;
+		}
+
+		if (!rcu_access_pointer(cell->vl_addrs) &&
+		    ktime_get_real_seconds() < cell->dns_expiry) {
+			ac->error = cell->error;
+			return;
+		}
+	}
+
+	/* NOTE(review): this only excludes updaters that take vl_addrs_lock
+	 * for writing - confirm that whoever replaces cell->vl_addrs does so.
+	 */
+	read_lock(&cell->vl_addrs_lock);
+	alist = rcu_dereference_protected(cell->vl_addrs,
+					  lockdep_is_held(&cell->vl_addrs_lock));
+	if (alist)
+		afs_get_addrlist(alist);
+	read_unlock(&cell->vl_addrs_lock);
+
+	/* The list can still be absent here: the lookup finished without
+	 * producing one and the time check above no longer holds.  Report
+	 * that as an error rather than dereferencing a NULL list.
+	 */
+	if (!alist) {
+		ac->error = cell->error ?: -EDESTADDRREQ;
+		return;
+	}
+
+	ac->alist = alist;
+	ac->start = alist->index;
+	ac->index = 0xffff;	/* Wraps to 0 on the first afs_get_address() */
+	ac->error = 0;
+}
+
+/*
+ * Point a call's address cursor at a file server's address list, taking a ref
+ * on that list and starting from the list's current address.
+ */
+void afs_set_fs_cursor(struct afs_call *call, struct afs_server *server)
+{
+	struct afs_addr_list *alist = afs_get_addrlist(server->addrs);
+	struct afs_addr_cursor *ac = &call->cursor;
+
+	ac->error = 0;
+	ac->index = 0xffff;	/* Wraps to 0 on the first afs_get_address() */
+	ac->start = alist->index;
+	ac->alist = alist;
+}
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 078ffd90e5f4..d99824fc7f3f 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -9,7 +9,6 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/key.h>
 #include <linux/ctype.h>
@@ -51,8 +50,8 @@ struct afs_cell *afs_lookup_cell_rcu(struct afs_net *net,
 {
 	struct afs_cell *cell = NULL;
 	struct rb_node *p;
-	unsigned int seq = 0, n;
-	int ret = 0;
+	unsigned int seq = 0;
+	int n, ret = 0;
 
 	_enter("%*.*s", namesz, namesz, name);
 
@@ -69,12 +68,12 @@ struct afs_cell *afs_lookup_cell_rcu(struct afs_net *net,
 		read_seqbegin_or_lock(&net->cells_lock, &seq);
 
 		if (!name) {
-			ret = -EDESTADDRREQ;
 			cell = rcu_dereference_raw(net->ws_cell);
-			if (!cell)
+			if (cell) {
+				afs_get_cell(cell);
 				goto done;
-
-			afs_get_cell(cell);
+			}
+			ret = -EDESTADDRREQ;
 			goto done;
 		}
 
@@ -148,70 +147,33 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
 	init_rwsem(&cell->vl_sem);
 	INIT_LIST_HEAD(&cell->vl_list);
 	spin_lock_init(&cell->vl_lock);
-	seqlock_init(&cell->vl_addrs_lock);
-	cell->flags = (1 << AFS_CELL_FL_NOT_READY);
-
-	for (i = 0; i < AFS_CELL_MAX_ADDRS; i++) {
-		struct sockaddr_rxrpc *srx = &cell->vl_addrs[i];
-		srx->srx_family			= AF_RXRPC;
-		srx->srx_service		= VL_SERVICE;
-		srx->transport_type		= SOCK_DGRAM;
-		srx->transport.sin6.sin6_family	= AF_INET6;
-		srx->transport.sin6.sin6_port	= htons(AFS_VL_PORT);
-	}
+	cell->flags = ((1 << AFS_CELL_FL_NOT_READY) |
+		       (1 << AFS_CELL_FL_NO_LOOKUP_YET));
+	rwlock_init(&cell->vl_addrs_lock);
 
 	/* Fill in the VL server list if we were given a list of addresses to
 	 * use.
 	 */
 	if (vllist) {
-		char delim = ':';
-
-		if (strchr(vllist, ',') || !strchr(vllist, '.'))
-			delim = ',';
-
-		do {
-			struct sockaddr_rxrpc *srx = &cell->vl_addrs[cell->vl_naddrs];
-
-			if (in4_pton(vllist, -1,
-				     (u8 *)&srx->transport.sin6.sin6_addr.s6_addr32[3],
-				     delim, &vllist)) {
-				srx->transport_len = sizeof(struct sockaddr_in6);
-				srx->transport.sin6.sin6_addr.s6_addr32[0] = 0;
-				srx->transport.sin6.sin6_addr.s6_addr32[1] = 0;
-				srx->transport.sin6.sin6_addr.s6_addr32[2] = htonl(0xffff);
-			} else if (in6_pton(vllist, -1,
-					    srx->transport.sin6.sin6_addr.s6_addr,
-					    delim, &vllist)) {
-				srx->transport_len = sizeof(struct sockaddr_in6);
-				srx->transport.sin6.sin6_family	= AF_INET6;
-			} else {
-				goto bad_address;
-			}
-
-			cell->vl_naddrs++;
-			if (!*vllist)
-				break;
-			vllist++;
+		struct afs_addr_list *alist;
 
-		} while (cell->vl_naddrs < AFS_CELL_MAX_ADDRS && vllist);
+		alist = afs_parse_text_addrs(vllist, strlen(vllist), ':',
+					     VL_SERVICE, AFS_VL_PORT);
+		if (IS_ERR(alist)) {
+			ret = PTR_ERR(alist);
+			goto parse_failed;
+		}
 
-		/* Disable DNS refresh for manually-specified cells and set the
-		 * no-garbage collect flag (which pins the active count).
-		 */
+		rcu_assign_pointer(cell->vl_addrs, alist);
 		cell->dns_expiry = TIME64_MAX;
-	} else {
-		/* We're going to need to 'refresh' this cell's VL server list
-		 * from the DNS before we can use it.
-		 */
-		cell->dns_expiry = S64_MIN;
 	}
 
 	_leave(" = %p", cell);
 	return cell;
 
-bad_address:
-	printk(KERN_ERR "kAFS: bad VL server IP address\n");
-	ret = -EINVAL;
+parse_failed:
+	if (ret == -EINVAL)
+		printk(KERN_ERR "kAFS: bad VL server IP address\n");
 	kfree(cell);
 	_leave(" = %d", ret);
 	return ERR_PTR(ret);
@@ -322,16 +284,17 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net,
 	if (excl) {
 		ret = -EEXIST;
 	} else {
-		ASSERTCMP(refcount_read(&cursor->usage), >=, 1);
-		refcount_inc(&cursor->usage);
+		afs_get_cell(cursor);
 		ret = 0;
 	}
 	write_sequnlock(&net->cells_lock);
 	kfree(candidate);
 	if (ret == 0)
 		goto wait_for_cell;
+	goto error_noput;
 error:
 	afs_put_cell(net, cell);
+error_noput:
 	_leave(" = %d [error]", ret);
 	return ERR_PTR(ret);
 }
@@ -393,78 +356,50 @@ int afs_cell_init(struct afs_net *net, const char *rootcell)
  */
 static void afs_update_cell(struct afs_cell *cell)
 {
+	struct afs_addr_list *alist, *old;
 	time64_t now, expiry;
-	char *vllist = NULL;
-	int ret;
 
 	_enter("%s", cell->name);
 
-	ret = dns_query("afsdb", cell->name, cell->name_len,
-			"ipv4", &vllist, &expiry);
-	_debug("query %d", ret);
-	switch (ret) {
-	case 0 ... INT_MAX:
-		clear_bit(AFS_CELL_FL_DNS_FAIL, &cell->flags);
-		clear_bit(AFS_CELL_FL_NOT_FOUND, &cell->flags);
-		goto parse_dns_data;
+	alist = afs_dns_query(cell, &expiry);
+	if (IS_ERR(alist)) {
+		switch (PTR_ERR(alist)) {
+		case -ENODATA:
+			/* The DNS said that the cell does not exist */
+			set_bit(AFS_CELL_FL_NOT_FOUND, &cell->flags);
+			clear_bit(AFS_CELL_FL_DNS_FAIL, &cell->flags);
+			cell->dns_expiry = ktime_get_real_seconds() + 61;
+			break;
 
-	case -ENODATA:
-		clear_bit(AFS_CELL_FL_DNS_FAIL, &cell->flags);
-		set_bit(AFS_CELL_FL_NOT_FOUND, &cell->flags);
-		cell->dns_expiry = ktime_get_real_seconds() + 61;
-		cell->error = -EDESTADDRREQ;
-		goto out;
+		case -EAGAIN:
+		case -ECONNREFUSED:
+		default:
+			set_bit(AFS_CELL_FL_DNS_FAIL, &cell->flags);
+			cell->dns_expiry = ktime_get_real_seconds() + 10;
+			break;
+		}
 
-	case -EAGAIN:
-	case -ECONNREFUSED:
-	default:
-		/* Unable to query DNS. */
-		set_bit(AFS_CELL_FL_DNS_FAIL, &cell->flags);
-		cell->dns_expiry = ktime_get_real_seconds() + 10;
 		cell->error = -EDESTADDRREQ;
-		goto out;
-	}
-
-parse_dns_data:
-	write_seqlock(&cell->vl_addrs_lock);
-
-	ret = -EINVAL;
-	do {
-		struct sockaddr_rxrpc *srx = &cell->vl_addrs[cell->vl_naddrs];
-
-		if (in4_pton(vllist, -1,
-			     (u8 *)&srx->transport.sin6.sin6_addr.s6_addr32[3],
-			     ',', (const char **)&vllist)) {
-			srx->transport_len = sizeof(struct sockaddr_in6);
-			srx->transport.sin6.sin6_addr.s6_addr32[0] = 0;
-			srx->transport.sin6.sin6_addr.s6_addr32[1] = 0;
-			srx->transport.sin6.sin6_addr.s6_addr32[2] = htonl(0xffff);
-		} else if (in6_pton(vllist, -1,
-				    srx->transport.sin6.sin6_addr.s6_addr,
-				    ',', (const char **)&vllist)) {
-			srx->transport_len = sizeof(struct sockaddr_in6);
-			srx->transport.sin6.sin6_family	= AF_INET6;
-		} else {
-			goto bad_address;
-		}
+	} else {
+		clear_bit(AFS_CELL_FL_DNS_FAIL, &cell->flags);
+		clear_bit(AFS_CELL_FL_NOT_FOUND, &cell->flags);
 
-		cell->vl_naddrs++;
-		if (!*vllist)
-			break;
-		vllist++;
+		/* Exclusion on changing vl_addrs is achieved by a
+		 * non-reentrant work item.
+		 */
+		old = rcu_dereference_protected(cell->vl_addrs, true);
+		rcu_assign_pointer(cell->vl_addrs, alist);
+		cell->dns_expiry = expiry;
 
-	} while (cell->vl_naddrs < AFS_CELL_MAX_ADDRS);
+		if (old)
+			afs_put_addrlist(old);
+	}
 
-	if (cell->vl_naddrs < AFS_CELL_MAX_ADDRS)
-		memset(cell->vl_addrs + cell->vl_naddrs, 0,
-		       (AFS_CELL_MAX_ADDRS - cell->vl_naddrs) * sizeof(cell->vl_addrs[0]));
+	if (test_and_clear_bit(AFS_CELL_FL_NO_LOOKUP_YET, &cell->flags))
+		wake_up_bit(&cell->flags, AFS_CELL_FL_NO_LOOKUP_YET);
 
 	now = ktime_get_real_seconds();
-	cell->dns_expiry = expiry;
-	afs_set_cell_timer(cell->net, expiry - now);
-bad_address:
-	write_sequnlock(&cell->vl_addrs_lock);
-out:
+	afs_set_cell_timer(cell->net, cell->dns_expiry - now);
 	_leave("");
 }
 
@@ -479,6 +414,7 @@ static void afs_cell_destroy(struct rcu_head *rcu)
 
 	ASSERTCMP(refcount_read(&cell->usage), ==, 0);
 
+	afs_put_addrlist(cell->vl_addrs);
 	key_put(cell->anonymous_key);
 	kfree(cell);
 
@@ -512,11 +448,23 @@ void afs_cells_timer(unsigned long data)
 }
 
 /*
+ * Get a reference on a cell record.
+ */
+struct afs_cell *afs_get_cell(struct afs_cell *cell)
+{
+	/* Nothing here uses the incremented count, so a plain refcount_inc()
+	 * does the job without a write-only local variable.  The cell pointer
+	 * is handed back to the caller.
+	 */
+	refcount_inc(&cell->usage);
+	return cell;
+}
+
+/*
  * Drop a reference on a cell record.
  */
 void afs_put_cell(struct afs_net *net, struct afs_cell *cell)
 {
 	time64_t now, expire_delay;
+	unsigned int usage;
 
 	if (!cell)
 		return;
@@ -530,7 +478,8 @@ void afs_put_cell(struct afs_net *net, struct afs_cell *cell)
 	    !test_bit(AFS_CELL_FL_NOT_FOUND, &cell->flags))
 		expire_delay = afs_cell_gc_delay;
 
-	if (refcount_dec_return(&cell->usage) > 1)
+	usage = refcount_dec_return(&cell->usage);
+	if (usage > 1)
 		return;
 
 	/* 'cell' may now be garbage collected. */
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index bac2e8db6e75..f4e3ec104ac4 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -305,7 +305,8 @@ int afs_fs_fetch_file_status(struct afs_server *server,
 	bp[2] = htonl(vnode->fid.vnode);
 	bp[3] = htonl(vnode->fid.unique);
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
+	afs_set_fs_cursor(call, server);
+	return afs_make_call(call, GFP_NOFS, async);
 }
 
 /*
@@ -516,7 +517,8 @@ static int afs_fs_fetch_data64(struct afs_server *server,
 	bp[7] = htonl(lower_32_bits(req->len));
 
 	atomic_inc(&req->usage);
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
+	afs_set_fs_cursor(call, server);
+	return afs_make_call(call, GFP_NOFS, async);
 }
 
 /*
@@ -559,7 +561,8 @@ int afs_fs_fetch_data(struct afs_server *server,
 	bp[5] = htonl(lower_32_bits(req->len));
 
 	atomic_inc(&req->usage);
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
+	afs_set_fs_cursor(call, server);
+	return afs_make_call(call, GFP_NOFS, async);
 }
 
 /*
@@ -641,7 +644,8 @@ int afs_fs_give_up_callbacks(struct afs_net *net,
 	ASSERT(ncallbacks > 0);
 	wake_up_nr(&server->cb_break_waitq, ncallbacks);
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
+	afs_set_fs_cursor(call, server);
+	return afs_make_call(call, GFP_NOFS, async);
 }
 
 /*
@@ -736,7 +740,8 @@ int afs_fs_create(struct afs_server *server,
 	*bp++ = htonl(mode & S_IALLUGO); /* unix mode */
 	*bp++ = 0; /* segment size */
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
+	afs_set_fs_cursor(call, server);
+	return afs_make_call(call, GFP_NOFS, async);
 }
 
 /*
@@ -815,7 +820,8 @@ int afs_fs_remove(struct afs_server *server,
 		bp = (void *) bp + padsz;
 	}
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
+	afs_set_fs_cursor(call, server);
+	return afs_make_call(call, GFP_NOFS, async);
 }
 
 /*
@@ -899,7 +905,8 @@ int afs_fs_link(struct afs_server *server,
 	*bp++ = htonl(vnode->fid.vnode);
 	*bp++ = htonl(vnode->fid.unique);
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
+	afs_set_fs_cursor(call, server);
+	return afs_make_call(call, GFP_NOFS, async);
 }
 
 /*
@@ -1002,7 +1009,8 @@ int afs_fs_symlink(struct afs_server *server,
 	*bp++ = htonl(S_IRWXUGO); /* unix mode */
 	*bp++ = 0; /* segment size */
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
+	afs_set_fs_cursor(call, server);
+	return afs_make_call(call, GFP_NOFS, async);
 }
 
 /*
@@ -1104,7 +1112,8 @@ int afs_fs_rename(struct afs_server *server,
 		bp = (void *) bp + n_padsz;
 	}
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
+	afs_set_fs_cursor(call, server);
+	return afs_make_call(call, GFP_NOFS, async);
 }
 
 /*
@@ -1207,7 +1216,8 @@ static int afs_fs_store_data64(struct afs_server *server,
 	*bp++ = htonl(i_size >> 32);
 	*bp++ = htonl((u32) i_size);
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
+	afs_set_fs_cursor(call, server);
+	return afs_make_call(call, GFP_NOFS, async);
 }
 
 /*
@@ -1280,7 +1290,8 @@ int afs_fs_store_data(struct afs_server *server, struct afs_writeback *wb,
 	*bp++ = htonl(size);
 	*bp++ = htonl(i_size);
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
+	afs_set_fs_cursor(call, server);
+	return afs_make_call(call, GFP_NOFS, async);
 }
 
 /*
@@ -1380,7 +1391,8 @@ static int afs_fs_setattr_size64(struct afs_server *server, struct key *key,
 	*bp++ = htonl(attr->ia_size >> 32);	/* new file length */
 	*bp++ = htonl((u32) attr->ia_size);
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
+	afs_set_fs_cursor(call, server);
+	return afs_make_call(call, GFP_NOFS, async);
 }
 
 /*
@@ -1427,7 +1439,8 @@ static int afs_fs_setattr_size(struct afs_server *server, struct key *key,
 	*bp++ = 0;				/* size of write */
 	*bp++ = htonl(attr->ia_size);		/* new file length */
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
+	afs_set_fs_cursor(call, server);
+	return afs_make_call(call, GFP_NOFS, async);
 }
 
 /*
@@ -1468,7 +1481,8 @@ int afs_fs_setattr(struct afs_server *server, struct key *key,
 
 	xdr_encode_AFS_StoreStatus(&bp, attr);
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
+	afs_set_fs_cursor(call, server);
+	return afs_make_call(call, GFP_NOFS, async);
 }
 
 /*
@@ -1699,7 +1713,8 @@ int afs_fs_get_volume_status(struct afs_server *server,
 	bp[0] = htonl(FSGETVOLUMESTATUS);
 	bp[1] = htonl(vnode->fid.vid);
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
+	afs_set_fs_cursor(call, server);
+	return afs_make_call(call, GFP_NOFS, async);
 }
 
 /*
@@ -1784,7 +1799,8 @@ int afs_fs_set_lock(struct afs_server *server,
 	*bp++ = htonl(vnode->fid.unique);
 	*bp++ = htonl(type);
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
+	afs_set_fs_cursor(call, server);
+	return afs_make_call(call, GFP_NOFS, async);
 }
 
 /*
@@ -1815,7 +1831,8 @@ int afs_fs_extend_lock(struct afs_server *server,
 	*bp++ = htonl(vnode->fid.vnode);
 	*bp++ = htonl(vnode->fid.unique);
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
+	afs_set_fs_cursor(call, server);
+	return afs_make_call(call, GFP_NOFS, async);
 }
 
 /*
@@ -1846,5 +1863,6 @@ int afs_fs_release_lock(struct afs_server *server,
 	*bp++ = htonl(vnode->fid.vnode);
 	*bp++ = htonl(vnode->fid.unique);
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
+	afs_set_fs_cursor(call, server);
+	return afs_make_call(call, GFP_NOFS, async);
 }
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index f96398163a68..a75b67e816cd 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -31,6 +31,7 @@
 
 struct pagevec;
 struct afs_call;
+struct afs_addr_cursor;
 
 typedef enum {
 	AFS_VL_NEW,			/* new, uninitialised record */
@@ -66,6 +67,24 @@ enum afs_call_state {
 };
 
 /*
+ * List of server addresses.
+ *
+ * The list is reference counted and is freed after an RCU grace period once
+ * the last reference has been put.  addrs[] is allocated in the same block as
+ * the header.
+ */
+struct afs_addr_list {
+	struct rcu_head		rcu;		/* Must be first */
+	refcount_t		usage;		/* Set to 1 on allocation */
+	unsigned short		nr_addrs;	/* Number of addrs[] slots filled in */
+	unsigned short		index;		/* Address currently in use */
+	struct sockaddr_rxrpc	addrs[];	/* The addresses themselves */
+};
+
+struct afs_addr_cursor {
+	struct afs_addr_list	*alist;		/* List being stepped over (ref held) or NULL */
+	unsigned short		start;		/* Starting point in alist->addrs[] */
+	unsigned short		index;		/* Wrapping offset from start to current addr (0xffff = not started) */
+	short			error;		/* Error handed out if alist is NULL */
+};
+
+/*
  * a record of an in-progress RxRPC call
  */
 struct afs_call {
@@ -77,6 +96,7 @@ struct afs_call {
 	struct key		*key;		/* security for this call */
 	struct afs_net		*net;		/* The network namespace */
 	struct afs_server	*server;	/* server affected by incoming CM call */
+	struct afs_addr_cursor	cursor;		/* Address/server rotation cursor */
 	void			*request;	/* request data (first part) */
 	struct address_space	*mapping;	/* page set */
 	struct afs_writeback	*wb;		/* writeback being performed */
@@ -276,16 +296,15 @@ struct afs_cell {
 #define AFS_CELL_FL_NO_GC	1		/* The cell was added manually, don't auto-gc */
 #define AFS_CELL_FL_NOT_FOUND	2		/* Permanent DNS error */
 #define AFS_CELL_FL_DNS_FAIL	3		/* Failed to access DNS */
+#define AFS_CELL_FL_NO_LOOKUP_YET 4		/* Not completed first DNS lookup yet */
 	enum afs_cell_state	state;
 	short			error;
 
 	spinlock_t		vl_lock;	/* vl_list lock */
 
 	/* VLDB server list. */
-	seqlock_t		vl_addrs_lock;
-	unsigned short		vl_naddrs;	/* number of VL servers in addr list */
-	unsigned short		vl_curr_svix;	/* current server index */
-	struct sockaddr_rxrpc	vl_addrs[AFS_CELL_MAX_ADDRS];	/* cell VL server addresses */
+	rwlock_t		vl_addrs_lock;	/* Lock on vl_addrs */
+	struct afs_addr_list	__rcu *vl_addrs; /* List of VL servers */
 	u8			name_len;	/* Length of name */
 	char			name[64 + 1];	/* Cell name, case-flattened and NUL-padded */
 };
@@ -336,7 +355,7 @@ struct afs_vlocation {
 struct afs_server {
 	atomic_t		usage;
 	time64_t		time_of_death;	/* time at which put reduced usage to 0 */
-	struct sockaddr_rxrpc	addr;		/* server address */
+	struct afs_addr_list	__rcu *addrs;	/* List of addresses for this server */
 	struct afs_net		*net;		/* The network namespace */
 	struct afs_cell		*cell;		/* cell in which server resides */
 	struct list_head	link;		/* link in cell's server list */
@@ -474,6 +493,23 @@ struct afs_interface {
 
 /*****************************************************************************/
 /*
+ * addr_list.c
+ */
+static inline struct afs_addr_list *afs_get_addrlist(struct afs_addr_list *alist)
+{
+	refcount_inc(&alist->usage);	/* Unconditional: alist must not be NULL */
+	return alist;			/* Handed back so the call can be used in an assignment */
+}
+extern void afs_put_addrlist(struct afs_addr_list *);
+extern struct afs_addr_list *afs_parse_text_addrs(const char *, size_t, char,
+						  unsigned short, unsigned short);
+extern struct afs_addr_list *afs_dns_query(struct afs_cell *, time64_t *);
+extern void afs_set_vl_cursor(struct afs_call *, struct afs_cell *);
+extern void afs_set_fs_cursor(struct afs_call *, struct afs_server *);
+extern struct sockaddr_rxrpc *afs_get_address(struct afs_addr_cursor *);
+extern 	void afs_end_cursor(struct afs_addr_cursor *);
+
+/*
  * cache.c
  */
 #ifdef CONFIG_AFS_FSCACHE
@@ -504,11 +540,11 @@ extern void afs_flush_callback_breaks(struct afs_server *);
 /*
  * cell.c
  */
-#define afs_get_cell(C) do { refcount_inc(&(C)->usage); } while(0)
 extern int __net_init afs_cell_init(struct afs_net *, const char *);
 extern struct afs_cell *afs_lookup_cell_rcu(struct afs_net *, const char *, unsigned);
 extern struct afs_cell *afs_lookup_cell(struct afs_net *, const char *, unsigned,
 					const char *, bool);
+extern struct afs_cell *afs_get_cell(struct afs_cell *);
 extern void afs_put_cell(struct afs_net *, struct afs_cell *);
 extern void afs_manage_cells(struct work_struct *);
 extern void afs_cells_timer(unsigned long);
@@ -662,7 +698,7 @@ extern void __net_exit afs_close_socket(struct afs_net *);
 extern void afs_charge_preallocation(struct work_struct *);
 extern void afs_put_call(struct afs_call *);
 extern int afs_queue_call_work(struct afs_call *);
-extern int afs_make_call(struct sockaddr_rxrpc *, struct afs_call *, gfp_t, bool);
+extern int afs_make_call(struct afs_call *, gfp_t, bool);
 extern struct afs_call *afs_alloc_flat_call(struct afs_net *,
 					    const struct afs_call_type *,
 					    size_t, size_t);
@@ -713,12 +749,10 @@ extern void __exit afs_fs_exit(void);
 /*
  * vlclient.c
  */
-extern int afs_vl_get_entry_by_name(struct afs_net *,
-				    struct sockaddr_rxrpc *, struct key *,
-				    const char *, struct afs_cache_vlocation *,
-				    bool);
-extern int afs_vl_get_entry_by_id(struct afs_net *,
-				  struct sockaddr_rxrpc *, struct key *,
+extern int afs_vl_get_entry_by_name(struct afs_cell *, struct key *,
+				    const char *,
+				    struct afs_cache_vlocation *, bool);
+extern int afs_vl_get_entry_by_id(struct afs_cell *,struct key *,
 				  afs_volid_t, afs_voltype_t,
 				  struct afs_cache_vlocation *, bool);
 
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index df3614306056..d75626613ed9 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -514,23 +514,23 @@ static int afs_proc_cell_vlservers_open(struct inode *inode, struct file *file)
  */
 static void *afs_proc_cell_vlservers_start(struct seq_file *m, loff_t *_pos)
 {
+	struct afs_addr_list *alist;
 	struct afs_cell *cell = m->private;
 	loff_t pos = *_pos;
 
-	_enter("cell=%p pos=%Ld", cell, *_pos);
+	rcu_read_lock();
 
-	/* lock the list against modification */
-	down_read(&cell->vl_sem);
+	alist = rcu_dereference(cell->vl_addrs);
 
 	/* allow for the header line */
 	if (!pos)
 		return (void *) 1;
 	pos--;
 
-	if (pos >= cell->vl_naddrs)
+	if (!alist || pos >= alist->nr_addrs)
 		return NULL;
 
-	return &cell->vl_addrs[pos];
+	return alist->addrs + pos;
 }
 
 /*
@@ -539,17 +539,18 @@ static void *afs_proc_cell_vlservers_start(struct seq_file *m, loff_t *_pos)
 static void *afs_proc_cell_vlservers_next(struct seq_file *p, void *v,
 					  loff_t *_pos)
 {
+	struct afs_addr_list *alist;
 	struct afs_cell *cell = p->private;
 	loff_t pos;
 
-	_enter("cell=%p{nad=%u} pos=%Ld", cell, cell->vl_naddrs, *_pos);
+	alist = rcu_dereference(cell->vl_addrs);
 
 	pos = *_pos;
 	(*_pos)++;
-	if (pos >= cell->vl_naddrs)
+	if (!alist || pos >= alist->nr_addrs)
 		return NULL;
 
-	return &cell->vl_addrs[pos];
+	return alist->addrs + pos;
 }
 
 /*
@@ -557,9 +558,7 @@ static void *afs_proc_cell_vlservers_next(struct seq_file *p, void *v,
  */
 static void afs_proc_cell_vlservers_stop(struct seq_file *p, void *v)
 {
-	struct afs_cell *cell = p->private;
-
-	up_read(&cell->vl_sem);
+	rcu_read_unlock();
 }
 
 /*
@@ -658,7 +657,7 @@ static int afs_proc_cell_servers_show(struct seq_file *m, void *v)
 	}
 
 	/* display one cell per line on subsequent lines */
-	sprintf(ipaddr, "%pISp", &server->addr.transport);
+	sprintf(ipaddr, "%pISp", &server->addrs->addrs[0].transport);
 	seq_printf(m, "%3d %-15s %5d\n",
 		   atomic_read(&server->usage), ipaddr, server->fs_state);
 
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 805ae0542478..ab149f67f908 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -162,6 +162,7 @@ void afs_put_call(struct afs_call *call)
 		if (call->type->destructor)
 			call->type->destructor(call);
 
+		afs_end_cursor(&call->cursor);
 		kfree(call->request);
 		kfree(call);
 
@@ -287,6 +288,84 @@ static void afs_notify_end_request_tx(struct sock *sock,
 }
 
 /*
+ * Send data through rxrpc.  If the call failed with a local or network error,
+ * rotate to the next address in the call's cursor and retry the call.
+ */
+static int afs_send_data(struct afs_call *call, struct msghdr *msg,
+			 unsigned int bytes)
+{
+	enum rxrpc_call_completion compl;
+	struct sockaddr_rxrpc *srx;
+	int ret;
+
+resume:
+	ret = rxrpc_kernel_send_data(call->net->socket, call->rxcall,
+				     msg, bytes, afs_notify_end_request_tx);
+
+	/* Success and obvious local errors are returned immediately.  Note
+	 * that for an async operation, the call struct may already have
+	 * evaporated.
+	 */
+	if (ret >= 0 ||
+	    ret == -ENOMEM ||
+	    ret == -ENONET ||
+	    ret == -EINTR ||
+	    ret == -EFAULT ||
+	    ret == -ERESTARTSYS ||
+	    ret == -EKEYEXPIRED ||
+	    ret == -EKEYREVOKED ||
+	    ret == -EKEYREJECTED ||
+	    ret == -EPERM)
+		return ret;
+
+	/* Find out how the call ended.  If it's still in progress there's no
+	 * completion to examine, so return the send error.
+	 */
+	call->error = rxrpc_kernel_check_call(call->net->socket, call->rxcall,
+					      &compl, &call->abort_code);
+	if (call->error == -EINPROGRESS)
+		return ret;
+
+	switch (compl) {
+	case RXRPC_CALL_SUCCEEDED:
+	default:
+		WARN_ONCE(true, "AFS: Call succeeded despite send-data failing\n");
+		return 0;
+
+	case RXRPC_CALL_REMOTELY_ABORTED:
+	case RXRPC_CALL_LOCALLY_ABORTED:
+		/* All of these indicate that we had some interaction with the
+		 * server, so there's no point trying another server.
+		 */
+		return call->error;
+
+	case RXRPC_CALL_LOCAL_ERROR:
+	case RXRPC_CALL_NETWORK_ERROR:
+		/* Local errors from an attempt to connect a call and network
+		 * errors reported back by ICMP suggest skipping the current
+		 * address and trying the next.
+		 */
+		break;
+	}
+
+	/* Rotate servers if possible. */
+	srx = afs_get_address(&call->cursor);
+	if (IS_ERR(srx)) {
+		_leave(" = %ld [cursor]", PTR_ERR(srx));
+		return PTR_ERR(srx);
+	}
+
+	ret = rxrpc_kernel_retry_call(call->net->socket, call->rxcall,
+				      srx, call->key);
+	if (ret < 0)
+		return ret;
+
+	if (msg_data_left(msg) > 0)
+		goto resume;
+	return 0;
+}
+
+/*
  * attach the data from a bunch of pages on an inode to a call
  */
 static int afs_send_pages(struct afs_call *call, struct msghdr *msg)
@@ -305,8 +384,7 @@ static int afs_send_pages(struct afs_call *call, struct msghdr *msg)
 		bytes = msg->msg_iter.count;
 		nr = msg->msg_iter.nr_segs;
 
-		ret = rxrpc_kernel_send_data(call->net->socket, call->rxcall, msg,
-					     bytes, afs_notify_end_request_tx);
+		ret = afs_send_data(call, msg, bytes);
 		for (loop = 0; loop < nr; loop++)
 			put_page(bv[loop].bv_page);
 		if (ret < 0)
@@ -321,9 +399,9 @@ static int afs_send_pages(struct afs_call *call, struct msghdr *msg)
 /*
  * initiate a call
  */
-int afs_make_call(struct sockaddr_rxrpc *srx, struct afs_call *call,
-		  gfp_t gfp, bool async)
+int afs_make_call(struct afs_call *call, gfp_t gfp, bool async)
 {
+	struct sockaddr_rxrpc *srx;
 	struct rxrpc_call *rxcall;
 	struct msghdr msg;
 	struct kvec iov[1];
@@ -332,7 +410,7 @@ int afs_make_call(struct sockaddr_rxrpc *srx, struct afs_call *call,
 	u32 abort_code;
 	int ret;
 
-	_enter(",{%pISp},", &srx->transport);
+	_enter("");
 
 	ASSERT(call->type != NULL);
 	ASSERT(call->type->name != NULL);
@@ -354,6 +432,11 @@ int afs_make_call(struct sockaddr_rxrpc *srx, struct afs_call *call,
 	}
 
 	/* create a call */
+	srx = afs_get_address(&call->cursor);
+	if (IS_ERR(srx))
+		return PTR_ERR(srx);
+
+	_debug("call %pISp", &srx->transport);
 	rxcall = rxrpc_kernel_begin_call(call->net->socket, srx, call->key,
 					 (unsigned long)call,
 					 tx_total_len, gfp,
@@ -380,16 +463,7 @@ int afs_make_call(struct sockaddr_rxrpc *srx, struct afs_call *call,
 	msg.msg_controllen	= 0;
 	msg.msg_flags		= (call->send_pages ? MSG_MORE : 0);
 
-	/* We have to change the state *before* sending the last packet as
-	 * rxrpc might give us the reply before it returns from sending the
-	 * request.  Further, if the send fails, we may already have been given
-	 * a notification and may have collected it.
-	 */
-	if (!call->send_pages)
-		call->state = AFS_CALL_AWAIT_REPLY;
-	ret = rxrpc_kernel_send_data(call->net->socket, rxcall,
-				     &msg, call->request_size,
-				     afs_notify_end_request_tx);
+	ret = afs_send_data(call, &msg, call->request_size);
 	if (ret < 0)
 		goto error_do_abort;
 
@@ -758,7 +832,6 @@ void afs_send_empty_reply(struct afs_call *call)
 	msg.msg_controllen	= 0;
 	msg.msg_flags		= 0;
 
-	call->state = AFS_CALL_AWAIT_ACK;
 	switch (rxrpc_kernel_send_data(net->socket, call->rxcall, &msg, 0,
 				       afs_notify_end_reply_tx)) {
 	case 0:
@@ -798,7 +871,6 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
 	msg.msg_controllen	= 0;
 	msg.msg_flags		= 0;
 
-	call->state = AFS_CALL_AWAIT_ACK;
 	n = rxrpc_kernel_send_data(net->socket, call->rxcall, &msg, len,
 				   afs_notify_end_reply_tx);
 	if (n >= 0) {
@@ -816,6 +888,69 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
 }
 
 /*
+ * Rotate the destination address if a network error of some sort occurs and
+ * retry the call.
+ */
+static int afs_retry_call(struct afs_call *call, int ret)
+{
+	enum rxrpc_call_completion compl;
+	struct sockaddr_rxrpc *srx;
+
+	if (ret == -ENOMEM ||
+	    ret == -ENONET ||
+	    ret == -EINTR ||
+	    ret == -EFAULT ||
+	    ret == -ERESTARTSYS ||
+	    ret == -EKEYEXPIRED ||
+	    ret == -EKEYREVOKED ||
+	    ret == -EKEYREJECTED ||
+	    ret == -EPERM)
+		return ret;
+
+	/* Check to see if it's an error that meant the call data packets never
+	 * reached the peer.
+	 */
+	call->error = rxrpc_kernel_check_call(call->net->socket, call->rxcall,
+					      &compl, &call->abort_code);
+	if (call->error == -EINPROGRESS)
+		return ret;
+
+	switch (compl) {
+	case RXRPC_CALL_SUCCEEDED:
+	default:
+		WARN_ONCE(true, "AFS: Call succeeded despite send-data failing\n");
+		return 0;
+
+	case RXRPC_CALL_REMOTELY_ABORTED:
+	case RXRPC_CALL_LOCALLY_ABORTED:
+		/* All of these indicate that we had some interaction with the
+		 * server, so there's no point trying another server.
+		 */
+		return call->error;
+
+	case RXRPC_CALL_LOCAL_ERROR:
+	case RXRPC_CALL_NETWORK_ERROR:
+		/* Local errors from an attempt to connect a call and network
+		 * errors reported back by ICMP suggest skipping the current
+		 * address and trying the next.
+		 */
+		break;
+	}
+
+	/* Rotate servers if possible. */
+	srx = afs_get_address(&call->cursor);
+	if (IS_ERR(srx))
+		return PTR_ERR(srx);
+
+	_debug("retry %pISp", &srx->transport);
+	call->error = 0;
+	ret = rxrpc_kernel_retry_call(call->net->socket, call->rxcall,
+				      srx, call->key);
+	_leave(" = %d [retry]", ret);
+	return ret;
+}
+
+/*
  * Extract a piece of data from the received data socket buffers.
  */
 int afs_extract_data(struct afs_call *call, void *buf, size_t count,
@@ -850,10 +985,15 @@ int afs_extract_data(struct afs_call *call, void *buf, size_t count,
 		return 0;
 	}
 
-	if (ret == -ECONNABORTED)
+	if (ret == -ECONNABORTED) {
 		call->error = call->type->abort_to_error(call->abort_code);
-	else
-		call->error = ret;
+		goto out;
+	}
+
+	ret = afs_retry_call(call, ret);
+	if (ret == 0)
+		return -EAGAIN;
+out:
 	call->state = AFS_CALL_COMPLETE;
 	return ret;
 }
diff --git a/fs/afs/server.c b/fs/afs/server.c
index 57c2f605e11b..0f2e84966d3e 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -37,7 +37,9 @@ static int afs_install_server(struct afs_server *server)
 		p = *pp;
 		_debug("- consider %p", p);
 		xserver = rb_entry(p, struct afs_server, master_rb);
-		diff = memcmp(&server->addr, &xserver->addr, sizeof(server->addr));
+		diff = memcmp(&server->addrs->addrs[0],
+			      &xserver->addrs->addrs[0],
+			      sizeof(server->addrs->addrs[0]));
 		if (diff < 0)
 			pp = &(*pp)->rb_left;
 		else if (diff > 0)
@@ -66,28 +68,41 @@ static struct afs_server *afs_alloc_server(struct afs_cell *cell,
 	_enter("");
 
 	server = kzalloc(sizeof(struct afs_server), GFP_KERNEL);
-	if (server) {
-		atomic_set(&server->usage, 1);
-		server->net = cell->net;
-		server->cell = cell;
-
-		INIT_LIST_HEAD(&server->link);
-		INIT_LIST_HEAD(&server->grave);
-		init_rwsem(&server->sem);
-		spin_lock_init(&server->fs_lock);
-		server->fs_vnodes = RB_ROOT;
-		server->cb_promises = RB_ROOT;
-		spin_lock_init(&server->cb_lock);
-		init_waitqueue_head(&server->cb_break_waitq);
-		INIT_DELAYED_WORK(&server->cb_break_work,
-				  afs_dispatch_give_up_callbacks);
-
-		server->addr = *addr;
-		_leave(" = %p{%d}", server, atomic_read(&server->usage));
-	} else {
-		_leave(" = NULL [nomem]");
-	}
+	if (!server)
+		goto enomem;
+	server->addrs = kzalloc(sizeof(struct afs_addr_list) +
+				sizeof(struct sockaddr_rxrpc),
+				GFP_KERNEL);
+	if (!server->addrs)
+		goto enomem_server;
+
+	atomic_set(&server->usage, 1);
+	server->net = cell->net;
+	server->cell = cell;
+
+	INIT_LIST_HEAD(&server->link);
+	INIT_LIST_HEAD(&server->grave);
+	init_rwsem(&server->sem);
+	spin_lock_init(&server->fs_lock);
+	server->fs_vnodes = RB_ROOT;
+	server->cb_promises = RB_ROOT;
+	spin_lock_init(&server->cb_lock);
+	init_waitqueue_head(&server->cb_break_waitq);
+	INIT_DELAYED_WORK(&server->cb_break_work,
+			  afs_dispatch_give_up_callbacks);
+
+	refcount_set(&server->addrs->usage, 1);
+	server->addrs->nr_addrs = 1;
+	server->addrs->addrs[0] = *addr;
+
+	_leave(" = %p{%d}", server, atomic_read(&server->usage));
 	return server;
+
+enomem_server:
+	kfree(server);
+enomem:
+	_leave(" = NULL [nomem]");
+	return NULL;
 }
 
 /*
@@ -104,7 +119,7 @@ struct afs_server *afs_lookup_server(struct afs_cell *cell,
 	read_lock(&cell->servers_lock);
 
 	list_for_each_entry(server, &cell->servers, link) {
-		if (memcmp(&server->addr, addr, sizeof(*addr)) == 0)
+		if (memcmp(&server->addrs->addrs[0], addr, sizeof(*addr)) == 0)
 			goto found_server_quickly;
 	}
 	read_unlock(&cell->servers_lock);
@@ -119,7 +134,7 @@ struct afs_server *afs_lookup_server(struct afs_cell *cell,
 
 	/* check the cell's server list again */
 	list_for_each_entry(server, &cell->servers, link) {
-		if (memcmp(&server->addr, addr, sizeof(*addr)) == 0)
+		if (memcmp(&server->addrs->addrs[0], addr, sizeof(*addr)) == 0)
 			goto found_server;
 	}
 
@@ -187,7 +202,7 @@ struct afs_server *afs_find_server(struct afs_net *net,
 
 		_debug("- consider %p", p);
 
-		diff = memcmp(srx, &server->addr, sizeof(*srx));
+		diff = memcmp(srx, &server->addrs->addrs[0], sizeof(*srx));
 		if (diff < 0) {
 			p = p->rb_left;
 		} else if (diff > 0) {
@@ -256,6 +271,7 @@ static void afs_destroy_server(struct afs_server *server)
 	ASSERTCMP(atomic_read(&server->cb_break_n), ==, 0);
 
 	afs_put_cell(server->net, server->cell);
+	afs_put_addrlist(server->addrs);
 	kfree(server);
 }
 
diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c
index 276319aa86d8..54d02e5ea20a 100644
--- a/fs/afs/vlclient.c
+++ b/fs/afs/vlclient.c
@@ -156,8 +156,7 @@ static const struct afs_call_type afs_RXVLGetEntryById = {
 /*
  * dispatch a get volume entry by name operation
  */
-int afs_vl_get_entry_by_name(struct afs_net *net,
-			     struct sockaddr_rxrpc *addr,
+int afs_vl_get_entry_by_name(struct afs_cell *cell,
 			     struct key *key,
 			     const char *volname,
 			     struct afs_cache_vlocation *entry,
@@ -173,10 +172,13 @@ int afs_vl_get_entry_by_name(struct afs_net *net,
 	padsz = (4 - (volnamesz & 3)) & 3;
 	reqsz = 8 + volnamesz + padsz;
 
-	call = afs_alloc_flat_call(net, &afs_RXVLGetEntryByName, reqsz, 384);
+	call = afs_alloc_flat_call(cell->net, &afs_RXVLGetEntryByName,
+				   reqsz, 384);
 	if (!call)
 		return -ENOMEM;
 
+	afs_set_vl_cursor(call, cell);
+
 	call->key = key;
 	call->reply = entry;
 
@@ -189,14 +191,13 @@ int afs_vl_get_entry_by_name(struct afs_net *net,
 		memset((void *) bp + volnamesz, 0, padsz);
 
 	/* initiate the call */
-	return afs_make_call(addr, call, GFP_KERNEL, async);
+	return afs_make_call(call, GFP_KERNEL, async);
 }
 
 /*
  * dispatch a get volume entry by ID operation
  */
-int afs_vl_get_entry_by_id(struct afs_net *net,
-			   struct sockaddr_rxrpc *addr,
+int afs_vl_get_entry_by_id(struct afs_cell *cell,
 			   struct key *key,
 			   afs_volid_t volid,
 			   afs_voltype_t voltype,
@@ -208,10 +209,12 @@ int afs_vl_get_entry_by_id(struct afs_net *net,
 
 	_enter("");
 
-	call = afs_alloc_flat_call(net, &afs_RXVLGetEntryById, 12, 384);
+	call = afs_alloc_flat_call(cell->net, &afs_RXVLGetEntryById, 12, 384);
 	if (!call)
 		return -ENOMEM;
 
+	afs_set_vl_cursor(call, cell);
+
 	call->key = key;
 	call->reply = entry;
 
@@ -222,5 +225,5 @@ int afs_vl_get_entry_by_id(struct afs_net *net,
 	*bp   = htonl(voltype);
 
 	/* initiate the call */
-	return afs_make_call(addr, call, GFP_KERNEL, async);
+	return afs_make_call(call, GFP_KERNEL, async);
 }
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index ec5ab8dc9bc8..8c64a16c0aaf 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -22,137 +22,6 @@ static unsigned afs_vlocation_timeout = 10;	/* volume location timeout in second
 static unsigned afs_vlocation_update_timeout = 10 * 60;
 
 /*
- * iterate through the VL servers in a cell until one of them admits knowing
- * about the volume in question
- */
-static int afs_vlocation_access_vl_by_name(struct afs_vlocation *vl,
-					   struct key *key,
-					   struct afs_cache_vlocation *vldb)
-{
-	struct afs_cell *cell = vl->cell;
-	int count, ret;
-
-	_enter("%s,%s", cell->name, vl->vldb.name);
-
-	down_write(&vl->cell->vl_sem);
-	ret = -ENOMEDIUM;
-	for (count = cell->vl_naddrs; count > 0; count--) {
-		struct sockaddr_rxrpc *addr = &cell->vl_addrs[cell->vl_curr_svix];
-
-		_debug("CellServ[%hu]: %pIS", cell->vl_curr_svix, &addr->transport);
-
-		/* attempt to access the VL server */
-		ret = afs_vl_get_entry_by_name(cell->net, addr, key,
-					       vl->vldb.name, vldb, false);
-		switch (ret) {
-		case 0:
-			goto out;
-		case -ENOMEM:
-		case -ENONET:
-		case -ENETUNREACH:
-		case -EHOSTUNREACH:
-		case -ECONNREFUSED:
-			if (ret == -ENOMEM || ret == -ENONET)
-				goto out;
-			goto rotate;
-		case -ENOMEDIUM:
-		case -EKEYREJECTED:
-		case -EKEYEXPIRED:
-			goto out;
-		default:
-			ret = -EIO;
-			goto rotate;
-		}
-
-		/* rotate the server records upon lookup failure */
-	rotate:
-		cell->vl_curr_svix++;
-		cell->vl_curr_svix %= cell->vl_naddrs;
-	}
-
-out:
-	up_write(&vl->cell->vl_sem);
-	_leave(" = %d", ret);
-	return ret;
-}
-
-/*
- * iterate through the VL servers in a cell until one of them admits knowing
- * about the volume in question
- */
-static int afs_vlocation_access_vl_by_id(struct afs_vlocation *vl,
-					 struct key *key,
-					 afs_volid_t volid,
-					 afs_voltype_t voltype,
-					 struct afs_cache_vlocation *vldb)
-{
-	struct afs_cell *cell = vl->cell;
-	int count, ret;
-
-	_enter("%s,%x,%d,", cell->name, volid, voltype);
-
-	down_write(&vl->cell->vl_sem);
-	ret = -ENOMEDIUM;
-	for (count = cell->vl_naddrs; count > 0; count--) {
-		struct sockaddr_rxrpc *addr = &cell->vl_addrs[cell->vl_curr_svix];
-
-		_debug("CellServ[%hu]: %pIS", cell->vl_curr_svix, &addr->transport);
-
-		/* attempt to access the VL server */
-		ret = afs_vl_get_entry_by_id(cell->net, addr, key, volid,
-					     voltype, vldb, false);
-		switch (ret) {
-		case 0:
-			goto out;
-		case -ENOMEM:
-		case -ENONET:
-		case -ENETUNREACH:
-		case -EHOSTUNREACH:
-		case -ECONNREFUSED:
-			if (ret == -ENOMEM || ret == -ENONET)
-				goto out;
-			goto rotate;
-		case -EBUSY:
-			vl->upd_busy_cnt++;
-			if (vl->upd_busy_cnt <= 3) {
-				if (vl->upd_busy_cnt > 1) {
-					/* second+ BUSY - sleep a little bit */
-					set_current_state(TASK_UNINTERRUPTIBLE);
-					schedule_timeout(1);
-				}
-				continue;
-			}
-			break;
-		case -ENOMEDIUM:
-			vl->upd_rej_cnt++;
-			goto rotate;
-		default:
-			ret = -EIO;
-			goto rotate;
-		}
-
-		/* rotate the server records upon lookup failure */
-	rotate:
-		cell->vl_curr_svix++;
-		cell->vl_curr_svix %= cell->vl_naddrs;
-		vl->upd_busy_cnt = 0;
-	}
-
-out:
-	if (ret < 0 && vl->upd_rej_cnt > 0) {
-		printk(KERN_NOTICE "kAFS:"
-		       " Active volume no longer valid '%s'\n",
-		       vl->vldb.name);
-		vl->valid = 0;
-		ret = -ENOMEDIUM;
-	}
-
-	up_write(&vl->cell->vl_sem);
-	_leave(" = %d", ret);
-	return ret;
-}
-
-/*
  * allocate a volume location record
  */
 static struct afs_vlocation *afs_vlocation_alloc(struct afs_cell *cell,
@@ -197,6 +66,7 @@ static int afs_vlocation_update_record(struct afs_vlocation *vl,
 	       vl->vldb.vid[1],
 	       vl->vldb.vid[2]);
 
+retry:
 	if (vl->vldb.vidmask & AFS_VOL_VTM_RW) {
 		vid = vl->vldb.vid[0];
 		voltype = AFSVL_RWVOL;
@@ -215,7 +85,8 @@ static int afs_vlocation_update_record(struct afs_vlocation *vl,
 	/* contact the server to make sure the volume is still available
 	 * - TODO: need to handle disconnected operation here
 	 */
-	ret = afs_vlocation_access_vl_by_id(vl, key, vid, voltype, vldb);
+	ret = afs_vl_get_entry_by_id(vl->cell, key, vid, voltype,
+				     vldb, false);
 	switch (ret) {
 		/* net error */
 	default:
@@ -239,6 +110,18 @@ static int afs_vlocation_update_record(struct afs_vlocation *vl,
 		/* TODO: make existing record unavailable */
 		_leave(" = %d", ret);
 		return ret;
+
+	case -EBUSY:
+		vl->upd_busy_cnt++;
+		if (vl->upd_busy_cnt <= 3) {
+			if (vl->upd_busy_cnt > 1) {
+				/* second+ BUSY - sleep a little bit */
+				set_current_state(TASK_UNINTERRUPTIBLE);
+				schedule_timeout(1);
+			}
+			goto retry;
+		}
+		return -EBUSY;
 	}
 }
 
@@ -278,7 +161,8 @@ static int afs_vlocation_fill_in_record(struct afs_vlocation *vl,
 	memset(&vldb, 0, sizeof(vldb));
 
 	/* Try to look up an unknown volume in the cell VL databases by name */
-	ret = afs_vlocation_access_vl_by_name(vl, key, &vldb);
+	ret = afs_vl_get_entry_by_name(vl->cell, key, vl->vldb.name,
+				       &vldb, false);
 	if (ret < 0) {
 		printk("kAFS: failed to locate '%s' in cell '%s'\n",
 		       vl->vldb.name, vl->cell->name);
diff --git a/fs/afs/vnode.c b/fs/afs/vnode.c
index 64834b20f0f6..8dcf4921340a 100644
--- a/fs/afs/vnode.c
+++ b/fs/afs/vnode.c
@@ -354,8 +354,7 @@ int afs_vnode_fetch_status(struct afs_vnode *vnode,
 		if (IS_ERR(server))
 			goto no_server;
 
-		_debug("USING SERVER: %p{%pIS}",
-		       server, &server->addr.transport);
+		_debug("USING SERVER: %pISp", &server->addrs->addrs[0].transport);
 
 		ret = afs_fs_fetch_file_status(server, key, vnode, NULL,
 					       false);
@@ -418,7 +417,7 @@ int afs_vnode_fetch_data(struct afs_vnode *vnode, struct key *key,
 		if (IS_ERR(server))
 			goto no_server;
 
-		_debug("USING SERVER: %pIS\n", &server->addr.transport);
+		_debug("USING SERVER: %pISp", &server->addrs->addrs[0].transport);
 
 		ret = afs_fs_fetch_data(server, key, vnode, desc,
 					false);
@@ -474,7 +473,7 @@ int afs_vnode_create(struct afs_vnode *vnode, struct key *key,
 		if (IS_ERR(server))
 			goto no_server;
 
-		_debug("USING SERVER: %pIS\n", &server->addr.transport);
+		_debug("USING SERVER: %pISp", &server->addrs->addrs[0].transport);
 
 		ret = afs_fs_create(server, key, vnode, name, mode, newfid,
 				    newstatus, newcb, false);
@@ -530,7 +529,7 @@ int afs_vnode_remove(struct afs_vnode *vnode, struct key *key, const char *name,
 		if (IS_ERR(server))
 			goto no_server;
 
-		_debug("USING SERVER: %pIS\n", &server->addr.transport);
+		_debug("USING SERVER: %pIS\n", &server->addrs->addrs[0].transport);
 
 		ret = afs_fs_remove(server, key, vnode, name, isdir,
 				    false);
@@ -592,7 +591,7 @@ int afs_vnode_link(struct afs_vnode *dvnode, struct afs_vnode *vnode,
 		if (IS_ERR(server))
 			goto no_server;
 
-		_debug("USING SERVER: %pIS\n", &server->addr.transport);
+		_debug("USING SERVER: %pIS\n", &server->addrs->addrs[0].transport);
 
 		ret = afs_fs_link(server, key, dvnode, vnode, name,
 				  false);
@@ -656,7 +655,7 @@ int afs_vnode_symlink(struct afs_vnode *vnode, struct key *key,
 		if (IS_ERR(server))
 			goto no_server;
 
-		_debug("USING SERVER: %pIS\n", &server->addr.transport);
+		_debug("USING SERVER: %pIS\n", &server->addrs->addrs[0].transport);
 
 		ret = afs_fs_symlink(server, key, vnode, name, content,
 				     newfid, newstatus, false);
@@ -726,7 +725,7 @@ int afs_vnode_rename(struct afs_vnode *orig_dvnode,
 		if (IS_ERR(server))
 			goto no_server;
 
-		_debug("USING SERVER: %pIS\n", &server->addr.transport);
+		_debug("USING SERVER: %pIS\n", &server->addrs->addrs[0].transport);
 
 		ret = afs_fs_rename(server, key, orig_dvnode, orig_name,
 				    new_dvnode, new_name, false);
@@ -792,7 +791,7 @@ int afs_vnode_store_data(struct afs_writeback *wb, pgoff_t first, pgoff_t last,
 		if (IS_ERR(server))
 			goto no_server;
 
-		_debug("USING SERVER: %pIS\n", &server->addr.transport);
+		_debug("USING SERVER: %pIS\n", &server->addrs->addrs[0].transport);
 
 		ret = afs_fs_store_data(server, wb, first, last, offset, to,
 					false);
@@ -845,7 +844,7 @@ int afs_vnode_setattr(struct afs_vnode *vnode, struct key *key,
 		if (IS_ERR(server))
 			goto no_server;
 
-		_debug("USING SERVER: %pIS\n", &server->addr.transport);
+		_debug("USING SERVER: %pIS\n", &server->addrs->addrs[0].transport);
 
 		ret = afs_fs_setattr(server, key, vnode, attr, false);
 
@@ -892,7 +891,7 @@ int afs_vnode_get_volume_status(struct afs_vnode *vnode, struct key *key,
 		if (IS_ERR(server))
 			goto no_server;
 
-		_debug("USING SERVER: %pIS\n", &server->addr.transport);
+		_debug("USING SERVER: %pIS\n", &server->addrs->addrs[0].transport);
 
 		ret = afs_fs_get_volume_status(server, key, vnode, vs, false);
 
@@ -931,7 +930,7 @@ int afs_vnode_set_lock(struct afs_vnode *vnode, struct key *key,
 		if (IS_ERR(server))
 			goto no_server;
 
-		_debug("USING SERVER: %pIS\n", &server->addr.transport);
+		_debug("USING SERVER: %pIS\n", &server->addrs->addrs[0].transport);
 
 		ret = afs_fs_set_lock(server, key, vnode, type, false);
 
@@ -969,7 +968,7 @@ int afs_vnode_extend_lock(struct afs_vnode *vnode, struct key *key)
 		if (IS_ERR(server))
 			goto no_server;
 
-		_debug("USING SERVER: %pIS\n", &server->addr.transport);
+		_debug("USING SERVER: %pIS\n", &server->addrs->addrs[0].transport);
 
 		ret = afs_fs_extend_lock(server, key, vnode, false);
 
@@ -1007,7 +1006,7 @@ int afs_vnode_release_lock(struct afs_vnode *vnode, struct key *key)
 		if (IS_ERR(server))
 			goto no_server;
 
-		_debug("USING SERVER: %pIS\n", &server->addr.transport);
+		_debug("USING SERVER: %pIS\n", &server->addrs->addrs[0].transport);
 
 		ret = afs_fs_release_lock(server, key, vnode, false);
 
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index fbbb470ac027..c0d4e9725d5e 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -249,7 +249,7 @@ struct afs_server *afs_volume_pick_fileserver(struct afs_vnode *vnode)
 			afs_get_server(server);
 			up_read(&volume->server_sem);
 			_leave(" = %p (picked %pIS)",
-			       server, &server->addr.transport);
+			       server, &server->addrs->addrs[0].transport);
 			return server;
 
 		case -ENETUNREACH:
@@ -304,7 +304,8 @@ int afs_volume_release_fileserver(struct afs_vnode *vnode,
 	unsigned loop;
 
 	_enter("%s,%pIS,%d",
-	       volume->vlocation->vldb.name, &server->addr.transport, result);
+	       volume->vlocation->vldb.name, &server->addrs->addrs[0].transport,
+	       result);
 
 	switch (result) {
 		/* success */

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ