[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <ccecbaed777405b6061ac32a713a0fb305cdc745.1530086216.git.ka-cheong.poon@oracle.com>
Date: Wed, 27 Jun 2018 03:23:28 -0700
From: Ka-Cheong Poon <ka-cheong.poon@...cle.com>
To: netdev@...r.kernel.org
Cc: santosh.shilimkar@...cle.com, davem@...emloft.net,
rds-devel@....oracle.com
Subject: [PATCH v2 net-next 2/3] rds: Enable RDS IPv6 support
This patch enables RDS to use IPv6 addresses. For RDS/TCP, the
listener is now an IPv6 endpoint which accepts both IPv4 and IPv6
connection requests. RDS/RDMA/IB uses a private data (struct
rds_ib_connect_private) exchange between endpoints at RDS connection
establishment time to support RDMA. This private data exchange uses a
32-bit integer to represent an IP address. This needs to be changed in
order to support IPv6. A new private data struct
rds6_ib_connect_private is introduced to handle this. To ensure
backward compatibility, an IPv6-capable RDS stack uses another RDMA
listener port (RDS_CM_PORT) to accept IPv6 connections, and it
continues to use the original RDS_PORT for IPv4 RDS connections. When
it needs to communicate with an IPv6 peer, it uses RDS_CM_PORT to
send the connection setup request.
v2: Fixed the scope mismatch between the bound and peer addresses.
Added back the rds_connect() IPv6 changes.
Signed-off-by: Ka-Cheong Poon <ka-cheong.poon@...cle.com>
---
net/rds/af_rds.c | 28 +++++++++++++++++++++++-
net/rds/bind.c | 34 +++++++++++++++++++++++++-----
net/rds/connection.c | 43 ++++++++++++++++++++++++-------------
net/rds/ib.c | 55 +++++++++++++++++++++++++++++++++++++++++-------
net/rds/ib_cm.c | 13 ++++++------
net/rds/rdma_transport.c | 32 ++++++++++++++++++++++++++--
net/rds/rdma_transport.h | 2 ++
net/rds/rds.h | 12 ++++++-----
net/rds/send.c | 32 ++++++++++++++++++++++++++--
net/rds/tcp.c | 54 +++++++++++++++++++++++++++++------------------
net/rds/tcp.h | 4 +---
net/rds/tcp_connect.c | 54 ++++++++++++++++++++++++++++++++++++-----------
net/rds/tcp_listen.c | 40 +++++++++++++++++++++++++++--------
13 files changed, 315 insertions(+), 88 deletions(-)
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index fc1a5c6..8ce2d92 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -484,7 +484,9 @@ static int rds_connect(struct socket *sock, struct sockaddr *uaddr,
{
struct sock *sk = sock->sk;
struct sockaddr_in *sin;
+ struct sockaddr_in6 *sin6;
struct rds_sock *rs = rds_sk_to_rs(sk);
+ int addr_type;
int ret = 0;
lock_sock(sk);
@@ -510,7 +512,31 @@ static int rds_connect(struct socket *sock, struct sockaddr *uaddr,
break;
case sizeof(struct sockaddr_in6):
- ret = -EPROTONOSUPPORT;
+ sin6 = (struct sockaddr_in6 *)uaddr;
+ if (sin6->sin6_family != AF_INET6) {
+ ret = -EAFNOSUPPORT;
+ break;
+ }
+ addr_type = ipv6_addr_type(&sin6->sin6_addr);
+ if (!(addr_type & IPV6_ADDR_UNICAST)) {
+ ret = -EPROTOTYPE;
+ break;
+ }
+ if (addr_type & IPV6_ADDR_LINKLOCAL) {
+ if (sin6->sin6_scope_id == 0 ||
+ (!ipv6_addr_any(&rs->rs_bound_addr) &&
+ sin6->sin6_scope_id != rs->rs_bound_scope_id)) {
+ ret = -EINVAL;
+ break;
+ }
+ /* Remember the connected address scope ID. It will
+ * be checked against the binding local address when
+ * the socket is bound.
+ */
+ rs->rs_bound_scope_id = sin6->sin6_scope_id;
+ }
+ rs->rs_conn_addr = sin6->sin6_addr;
+ rs->rs_conn_port = sin6->sin6_port;
break;
default:
diff --git a/net/rds/bind.c b/net/rds/bind.c
index 3822886..6e6e4ea 100644
--- a/net/rds/bind.c
+++ b/net/rds/bind.c
@@ -127,9 +127,10 @@ static int rds_add_bound(struct rds_sock *rs, const struct in6_addr *addr,
if (!rhashtable_insert_fast(&bind_hash_table,
&rs->rs_bound_node, ht_parms)) {
*port = rs->rs_bound_port;
+ rs->rs_bound_scope_id = scope_id;
ret = 0;
- rdsdebug("rs %p binding to %pI4:%d\n",
- rs, &addr, (int)ntohs(*port));
+ rdsdebug("rs %p binding to %pI6c:%d\n",
+ rs, addr, (int)ntohs(*port));
break;
} else {
rs->rs_bound_addr = in6addr_any;
@@ -164,11 +165,12 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
struct in6_addr v6addr, *binding_addr;
struct rds_transport *trans;
__u32 scope_id = 0;
+ int addr_type;
int ret = 0;
__be16 port;
- /* We only allow an RDS socket to be bound to and IPv4 address. IPv6
- * address support will be added later.
+ /* We allow an RDS socket to be bound to either an IPv4 or an
+ * IPv6 address.
*/
if (addr_len == sizeof(struct sockaddr_in)) {
struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
@@ -180,7 +182,21 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
binding_addr = &v6addr;
port = sin->sin_port;
} else if (addr_len == sizeof(struct sockaddr_in6)) {
- return -EPROTONOSUPPORT;
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)uaddr;
+
+ addr_type = ipv6_addr_type(&sin6->sin6_addr);
+ if (sin6->sin6_family != AF_INET6 ||
+ !(addr_type & IPV6_ADDR_UNICAST)) {
+ return -EINVAL;
+ }
+ /* The scope ID must be specified for link local address. */
+ if (addr_type & IPV6_ADDR_LINKLOCAL) {
+ if (sin6->sin6_scope_id == 0)
+ return -EINVAL;
+ scope_id = sin6->sin6_scope_id;
+ }
+ binding_addr = &sin6->sin6_addr;
+ port = sin6->sin6_port;
} else {
return -EINVAL;
}
@@ -191,6 +207,14 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
ret = -EINVAL;
goto out;
}
+ /* Socket is connected. The binding address should have the same
+ * scope ID as the connected address.
+ */
+ if (!ipv6_addr_any(&rs->rs_conn_addr) &&
+ scope_id != rs->rs_bound_scope_id) {
+ ret = -EINVAL;
+ goto out;
+ }
ret = rds_add_bound(rs, binding_addr, &port, scope_id);
if (ret)
diff --git a/net/rds/connection.c b/net/rds/connection.c
index ca72563..8c5d093 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -486,10 +486,17 @@ void rds_conn_destroy(struct rds_connection *conn)
}
EXPORT_SYMBOL_GPL(rds_conn_destroy);
-static void rds_conn_message_info(struct socket *sock, unsigned int len,
- struct rds_info_iterator *iter,
- struct rds_info_lengths *lens,
- int want_send)
+static void __rds_inc_msg_cp(struct rds_incoming *inc,
+ struct rds_info_iterator *iter,
+ void *saddr, void *daddr, int flip)
+{
+ rds_inc_info_copy(inc, iter, *(__be32 *)saddr, *(__be32 *)daddr, flip);
+}
+
+static void rds_conn_message_info_cmn(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens,
+ int want_send)
{
struct hlist_head *head;
struct list_head *list;
@@ -524,18 +531,13 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
/* XXX too lazy to maintain counts.. */
list_for_each_entry(rm, list, m_conn_item) {
- __be32 laddr;
- __be32 faddr;
-
total++;
- laddr = conn->c_laddr.s6_addr32[3];
- faddr = conn->c_faddr.s6_addr32[3];
if (total <= len)
- rds_inc_info_copy(&rm->m_inc,
- iter,
- laddr,
- faddr,
- 0);
+ __rds_inc_msg_cp(&rm->m_inc,
+ iter,
+ &conn->c_laddr,
+ &conn->c_faddr,
+ 0);
}
spin_unlock_irqrestore(&cp->cp_lock, flags);
@@ -548,6 +550,14 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
lens->each = sizeof(struct rds_info_message);
}
+static void rds_conn_message_info(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens,
+ int want_send)
+{
+ rds_conn_message_info_cmn(sock, len, iter, lens, want_send);
+}
+
static void rds_conn_message_info_send(struct socket *sock, unsigned int len,
struct rds_info_iterator *iter,
struct rds_info_lengths *lens)
@@ -655,6 +665,9 @@ static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer)
struct rds_info_connection *cinfo = buffer;
struct rds_connection *conn = cp->cp_conn;
+ if (conn->c_isv6)
+ return 0;
+
cinfo->next_tx_seq = cp->cp_next_tx_seq;
cinfo->next_rx_seq = cp->cp_next_rx_seq;
cinfo->laddr = conn->c_laddr.s6_addr32[3];
diff --git a/net/rds/ib.c b/net/rds/ib.c
index c712a84..756225c 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -39,6 +39,7 @@
#include <linux/delay.h>
#include <linux/slab.h>
#include <linux/module.h>
+#include <net/addrconf.h>
#include "rds_single_path.h"
#include "rds.h"
@@ -295,6 +296,8 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn,
/* We will only ever look at IB transports */
if (conn->c_trans != &rds_ib_transport)
return 0;
+ if (conn->c_isv6)
+ return 0;
iinfo->src_addr = conn->c_laddr.s6_addr32[3];
iinfo->dst_addr = conn->c_faddr.s6_addr32[3];
@@ -330,7 +333,6 @@ static void rds_ib_ic_info(struct socket *sock, unsigned int len,
sizeof(struct rds_info_rdma_connection));
}
-
/*
* Early RDS/IB was built to only bind to an address if there is an IPoIB
* device with that address set.
@@ -346,8 +348,12 @@ static int rds_ib_laddr_check(struct net *net, const struct in6_addr *addr,
{
int ret;
struct rdma_cm_id *cm_id;
+ struct sockaddr_in6 sin6;
struct sockaddr_in sin;
+ struct sockaddr *sa;
+ bool isv4;
+ isv4 = ipv6_addr_v4mapped(addr);
/* Create a CMA ID and try to bind it. This catches both
* IB and iWARP capable NICs.
*/
@@ -356,20 +362,53 @@ static int rds_ib_laddr_check(struct net *net, const struct in6_addr *addr,
if (IS_ERR(cm_id))
return PTR_ERR(cm_id);
- memset(&sin, 0, sizeof(sin));
- sin.sin_family = AF_INET;
- sin.sin_addr.s_addr = addr->s6_addr32[3];
+ if (isv4) {
+ memset(&sin, 0, sizeof(sin));
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = addr->s6_addr32[3];
+ sa = (struct sockaddr *)&sin;
+ } else {
+ memset(&sin6, 0, sizeof(sin6));
+ sin6.sin6_family = AF_INET6;
+ sin6.sin6_addr = *addr;
+ sin6.sin6_scope_id = scope_id;
+ sa = (struct sockaddr *)&sin6;
+
+ /* XXX Do a special IPv6 link local address check here. The
+ * reason is that rdma_bind_addr() always succeeds with an IPv6
+ * link local address regardless of whether it is actually
+ * configured on the system.
+ */
+ if (ipv6_addr_type(addr) & IPV6_ADDR_LINKLOCAL) {
+ struct net_device *dev;
+
+ if (scope_id == 0)
+ return -EADDRNOTAVAIL;
+
+ /* Use init_net for now as RDS is not network
+ * name space aware.
+ */
+ dev = dev_get_by_index(&init_net, scope_id);
+ if (!dev)
+ return -EADDRNOTAVAIL;
+ if (!ipv6_chk_addr(&init_net, addr, dev, 1)) {
+ dev_put(dev);
+ return -EADDRNOTAVAIL;
+ }
+ dev_put(dev);
+ }
+ }
/* rdma_bind_addr will only succeed for IB & iWARP devices */
- ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
+ ret = rdma_bind_addr(cm_id, sa);
/* due to this, we will claim to support iWARP devices unless we
check node_type. */
if (ret || !cm_id->device ||
cm_id->device->node_type != RDMA_NODE_IB_CA)
ret = -EADDRNOTAVAIL;
- rdsdebug("addr %pI6c ret %d node type %d\n",
- addr, ret,
+ rdsdebug("addr %pI6c%%%u ret %d node type %d\n",
+ addr, scope_id, ret,
cm_id->device ? cm_id->device->node_type : -1);
rdma_destroy_id(cm_id);
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 5b8b181..250be1c 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -40,7 +40,6 @@
#include "rds_single_path.h"
#include "rds.h"
#include "ib.h"
-#include "tcp.h"
/*
* Set the selected protocol version
@@ -679,7 +678,7 @@ static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event, bool isv6)
return version;
}
-/* Given an IPv6 address, find the IB net_device which hosts that address and
+/* Given an IPv6 address, find the net_device which hosts that address and
* return its index. This is used by the rds_ib_cm_handle_connect() code to
* find the interface index of where an incoming request comes from when
* the request is using a link local address.
@@ -696,8 +695,7 @@ static u32 __rds_find_ifindex(struct net *net, const struct in6_addr *addr)
rcu_read_lock();
for_each_netdev_rcu(net, dev) {
- if (dev->type == ARPHRD_INFINIBAND &&
- ipv6_chk_addr(net, addr, dev, 0)) {
+ if (ipv6_chk_addr(net, addr, dev, 0)) {
idx = dev->ifindex;
break;
}
@@ -887,7 +885,10 @@ int rds_ib_conn_path_connect(struct rds_conn_path *cp)
/* XXX I wonder what affect the port space has */
/* delegate cm event handler to rdma_transport */
- handler = rds_rdma_cm_event_handler;
+ if (conn->c_isv6)
+ handler = rds6_rdma_cm_event_handler;
+ else
+ handler = rds_rdma_cm_event_handler;
ic->i_cm_id = rdma_create_id(&init_net, handler, conn,
RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(ic->i_cm_id)) {
@@ -923,7 +924,7 @@ int rds_ib_conn_path_connect(struct rds_conn_path *cp)
sin6 = (struct sockaddr_in6 *)&dest;
sin6->sin6_family = AF_INET6;
sin6->sin6_addr = conn->c_faddr;
- sin6->sin6_port = htons(RDS_TCP_PORT);
+ sin6->sin6_port = htons(RDS_CM_PORT);
sin6->sin6_scope_id = conn->c_dev_if;
}
diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
index aef73e7..bd67e55 100644
--- a/net/rds/rdma_transport.c
+++ b/net/rds/rdma_transport.c
@@ -37,7 +37,9 @@
#include "rdma_transport.h"
#include "ib.h"
+/* Global IPv4 and IPv6 RDS RDMA listener cm_id */
static struct rdma_cm_id *rds_rdma_listen_id;
+static struct rdma_cm_id *rds6_rdma_listen_id;
static int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id,
struct rdma_cm_event *event,
@@ -153,6 +155,12 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
return rds_rdma_cm_event_handler_cmn(cm_id, event, false);
}
+int rds6_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
+ struct rdma_cm_event *event)
+{
+ return rds_rdma_cm_event_handler_cmn(cm_id, event, true);
+}
+
static int rds_rdma_listen_init_common(rdma_cm_event_handler handler,
struct sockaddr *sa,
struct rdma_cm_id **ret_cm_id)
@@ -199,13 +207,14 @@ static int rds_rdma_listen_init_common(rdma_cm_event_handler handler,
/* Initialize the RDS RDMA listeners. We create two listeners for
* compatibility reason. The one on RDS_PORT is used for IPv4
- * requests only. The one on RDS_TCP_PORT is used for IPv6 requests
+ * requests only. The one on RDS_CM_PORT is used for IPv6 requests
* only. So only IPv6 enabled RDS module will communicate using this
* port.
*/
static int rds_rdma_listen_init(void)
{
int ret;
+ struct sockaddr_in6 sin6;
struct sockaddr_in sin;
sin.sin_family = PF_INET;
@@ -214,7 +223,21 @@ static int rds_rdma_listen_init(void)
ret = rds_rdma_listen_init_common(rds_rdma_cm_event_handler,
(struct sockaddr *)&sin,
&rds_rdma_listen_id);
- return ret;
+ if (ret != 0)
+ return ret;
+
+ sin6.sin6_family = PF_INET6;
+ sin6.sin6_addr = in6addr_any;
+ sin6.sin6_port = htons(RDS_CM_PORT);
+ sin6.sin6_scope_id = 0;
+ sin6.sin6_flowinfo = 0;
+ ret = rds_rdma_listen_init_common(rds6_rdma_cm_event_handler,
+ (struct sockaddr *)&sin6,
+ &rds6_rdma_listen_id);
+ /* Keep going even when IPv6 is not enabled in the system. */
+ if (ret != 0)
+ rdsdebug("Cannot set up IPv6 RDMA listener\n");
+ return 0;
}
static void rds_rdma_listen_stop(void)
@@ -224,6 +247,11 @@ static void rds_rdma_listen_stop(void)
rdma_destroy_id(rds_rdma_listen_id);
rds_rdma_listen_id = NULL;
}
+ if (rds6_rdma_listen_id) {
+ rdsdebug("cm %p\n", rds6_rdma_listen_id);
+ rdma_destroy_id(rds6_rdma_listen_id);
+ rds6_rdma_listen_id = NULL;
+ }
}
static int rds_rdma_init(void)
diff --git a/net/rds/rdma_transport.h b/net/rds/rdma_transport.h
index d309c44..bc3c639 100644
--- a/net/rds/rdma_transport.h
+++ b/net/rds/rdma_transport.h
@@ -11,6 +11,8 @@
int rds_rdma_conn_connect(struct rds_connection *conn);
int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
struct rdma_cm_event *event);
+int rds6_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
+ struct rdma_cm_event *event);
/* from ib.c */
extern struct rds_transport rds_ib_transport;
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 859808a..f5f99d1 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -24,13 +24,15 @@
#define RDS_PROTOCOL_MINOR(v) ((v) & 255)
#define RDS_PROTOCOL(maj, min) (((maj) << 8) | min)
-/*
- * XXX randomly chosen, but at least seems to be unused:
- * # 18464-18768 Unassigned
- * We should do better. We want a reserved port to discourage unpriv'ed
- * userspace from listening.
+/* The following ports, 16385, 18634, 18635, are registered with IANA as
+ * the ports to be used for RDS over TCP and UDP. 18634 is the historical
+ * value used for the RDMA_CM listener port. RDS/TCP uses port 16385. After
+ * IPv6 work, RDMA_CM also uses 16385 as the listener port. 18634 is kept
+ * to ensure compatibility with older RDS modules.
*/
#define RDS_PORT 18634
+#define RDS_CM_PORT 16385
+#define RDS_TCP_PORT RDS_CM_PORT
#ifdef ATOMIC64_INIT
#define KERNEL_HAS_ATOMIC64
diff --git a/net/rds/send.c b/net/rds/send.c
index 6ed2e92..c0e4f0b 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -1105,8 +1105,28 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
break;
case sizeof(*sin6): {
- ret = -EPROTONOSUPPORT;
- goto out;
+ int addr_type;
+
+ if (sin6->sin6_family != AF_INET6) {
+ ret = -EINVAL;
+ goto out;
+ }
+ addr_type = ipv6_addr_type(&sin6->sin6_addr);
+ if (!(addr_type & IPV6_ADDR_UNICAST)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (addr_type & IPV6_ADDR_LINKLOCAL) {
+ if (sin6->sin6_scope_id == 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+ scope_id = sin6->sin6_scope_id;
+ }
+
+ daddr = sin6->sin6_addr;
+ dport = sin6->sin6_port;
+ break;
}
default:
@@ -1138,6 +1158,14 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
ret = -EOPNOTSUPP;
goto out;
}
+ /* If the socket is already bound to a link local address,
+ * it can only send to peers on the same link.
+ */
+ if (scope_id != rs->rs_bound_scope_id) {
+ release_sock(sk);
+ ret = -EINVAL;
+ goto out;
+ }
}
release_sock(sk);
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index dadb337..890d0e1 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -46,7 +46,12 @@
/* only for info exporting */
static DEFINE_SPINLOCK(rds_tcp_tc_list_lock);
static LIST_HEAD(rds_tcp_tc_list);
+
+/* rds_tcp_tc_count counts only IPv4 connections.
+ * rds6_tcp_tc_count counts both IPv4 and IPv6 connections.
+ */
static unsigned int rds_tcp_tc_count;
+static unsigned int rds6_tcp_tc_count;
/* Track rds_tcp_connection structs so they can be cleaned up */
static DEFINE_SPINLOCK(rds_tcp_conn_lock);
@@ -113,7 +118,9 @@ void rds_tcp_restore_callbacks(struct socket *sock,
/* done under the callback_lock to serialize with write_space */
spin_lock(&rds_tcp_tc_list_lock);
list_del_init(&tc->t_list_item);
- rds_tcp_tc_count--;
+ rds6_tcp_tc_count--;
+ if (!tc->t_cpath->cp_conn->c_isv6)
+ rds_tcp_tc_count--;
spin_unlock(&rds_tcp_tc_list_lock);
tc->t_sock = NULL;
@@ -200,7 +207,9 @@ void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp)
/* done under the callback_lock to serialize with write_space */
spin_lock(&rds_tcp_tc_list_lock);
list_add_tail(&tc->t_list_item, &rds_tcp_tc_list);
- rds_tcp_tc_count++;
+ rds6_tcp_tc_count++;
+ if (!tc->t_cpath->cp_conn->c_isv6)
+ rds_tcp_tc_count++;
spin_unlock(&rds_tcp_tc_list_lock);
/* accepted sockets need our listen data ready undone */
@@ -221,6 +230,9 @@ void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp)
write_unlock_bh(&sock->sk->sk_callback_lock);
}
+/* Handle RDS_INFO_TCP_SOCKETS socket option. It only returns IPv4
+ * connections for backward compatibility.
+ */
static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len,
struct rds_info_iterator *iter,
struct rds_info_lengths *lens)
@@ -228,8 +240,6 @@ static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len,
struct rds_info_tcp_socket tsinfo;
struct rds_tcp_connection *tc;
unsigned long flags;
- struct sockaddr_in sin;
- struct socket *sock;
spin_lock_irqsave(&rds_tcp_tc_list_lock, flags);
@@ -237,16 +247,15 @@ static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len,
goto out;
list_for_each_entry(tc, &rds_tcp_tc_list, t_list_item) {
+ struct inet_sock *inet = inet_sk(tc->t_sock->sk);
- sock = tc->t_sock;
- if (sock) {
- sock->ops->getname(sock, (struct sockaddr *)&sin, 0);
- tsinfo.local_addr = sin.sin_addr.s_addr;
- tsinfo.local_port = sin.sin_port;
- sock->ops->getname(sock, (struct sockaddr *)&sin, 1);
- tsinfo.peer_addr = sin.sin_addr.s_addr;
- tsinfo.peer_port = sin.sin_port;
- }
+ if (tc->t_cpath->cp_conn->c_isv6)
+ continue;
+
+ tsinfo.local_addr = inet->inet_saddr;
+ tsinfo.local_port = inet->inet_sport;
+ tsinfo.peer_addr = inet->inet_daddr;
+ tsinfo.peer_port = inet->inet_dport;
tsinfo.hdr_rem = tc->t_tinc_hdr_rem;
tsinfo.data_rem = tc->t_tinc_data_rem;
@@ -494,13 +503,18 @@ static __net_init int rds_tcp_init_net(struct net *net)
err = -ENOMEM;
goto fail;
}
- rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net);
+ rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net, true);
if (!rtn->rds_tcp_listen_sock) {
- pr_warn("could not set up listen sock\n");
- unregister_net_sysctl_table(rtn->rds_tcp_sysctl);
- rtn->rds_tcp_sysctl = NULL;
- err = -EAFNOSUPPORT;
- goto fail;
+ pr_warn("could not set up IPv6 listen sock\n");
+
+ /* Try IPv4 as some systems disable IPv6 */
+ rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net, false);
+ if (!rtn->rds_tcp_listen_sock) {
+ unregister_net_sysctl_table(rtn->rds_tcp_sysctl);
+ rtn->rds_tcp_sysctl = NULL;
+ err = -EAFNOSUPPORT;
+ goto fail;
+ }
}
INIT_WORK(&rtn->rds_tcp_accept_w, rds_tcp_accept_worker);
return 0;
diff --git a/net/rds/tcp.h b/net/rds/tcp.h
index c6fa080..6a948c1 100644
--- a/net/rds/tcp.h
+++ b/net/rds/tcp.h
@@ -2,8 +2,6 @@
#ifndef _RDS_TCP_H
#define _RDS_TCP_H
-#define RDS_TCP_PORT 16385
-
struct rds_tcp_incoming {
struct rds_incoming ti_inc;
struct sk_buff_head ti_skb_list;
@@ -67,7 +65,7 @@ void rds_tcp_restore_callbacks(struct socket *sock,
void rds_tcp_state_change(struct sock *sk);
/* tcp_listen.c */
-struct socket *rds_tcp_listen_init(struct net *);
+struct socket *rds_tcp_listen_init(struct net *net, bool isv6);
void rds_tcp_listen_stop(struct socket *sock, struct work_struct *acceptor);
void rds_tcp_listen_data_ready(struct sock *sk);
int rds_tcp_accept_one(struct socket *sock);
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
index 231ae92..008f50f 100644
--- a/net/rds/tcp_connect.c
+++ b/net/rds/tcp_connect.c
@@ -89,9 +89,11 @@ void rds_tcp_state_change(struct sock *sk)
int rds_tcp_conn_path_connect(struct rds_conn_path *cp)
{
struct socket *sock = NULL;
+ struct sockaddr_in6 sin6;
struct sockaddr_in sin;
struct sockaddr *addr;
int addrlen;
+ bool isv6;
int ret;
struct rds_connection *conn = cp->cp_conn;
struct rds_tcp_connection *tc = cp->cp_transport_data;
@@ -108,18 +110,36 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp)
mutex_unlock(&tc->t_conn_path_lock);
return 0;
}
- ret = sock_create_kern(rds_conn_net(conn), PF_INET,
- SOCK_STREAM, IPPROTO_TCP, &sock);
+ if (ipv6_addr_v4mapped(&conn->c_laddr)) {
+ ret = sock_create_kern(rds_conn_net(conn), PF_INET,
+ SOCK_STREAM, IPPROTO_TCP, &sock);
+ isv6 = false;
+ } else {
+ ret = sock_create_kern(rds_conn_net(conn), PF_INET6,
+ SOCK_STREAM, IPPROTO_TCP, &sock);
+ isv6 = true;
+ }
+
if (ret < 0)
goto out;
rds_tcp_tune(sock);
- sin.sin_family = AF_INET;
- sin.sin_addr.s_addr = conn->c_laddr.s6_addr32[3];
- sin.sin_port = 0;
- addr = (struct sockaddr *)&sin;
- addrlen = sizeof(sin);
+ if (isv6) {
+ sin6.sin6_family = AF_INET6;
+ sin6.sin6_addr = conn->c_laddr;
+ sin6.sin6_port = 0;
+ sin6.sin6_flowinfo = 0;
+ sin6.sin6_scope_id = conn->c_dev_if;
+ addr = (struct sockaddr *)&sin6;
+ addrlen = sizeof(sin6);
+ } else {
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = conn->c_laddr.s6_addr32[3];
+ sin.sin_port = 0;
+ addr = (struct sockaddr *)&sin;
+ addrlen = sizeof(sin);
+ }
ret = sock->ops->bind(sock, addr, addrlen);
if (ret) {
@@ -128,11 +148,21 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp)
goto out;
}
- sin.sin_family = AF_INET;
- sin.sin_addr.s_addr = conn->c_faddr.s6_addr32[3];
- sin.sin_port = htons(RDS_TCP_PORT);
- addr = (struct sockaddr *)&sin;
- addrlen = sizeof(sin);
+ if (isv6) {
+ sin6.sin6_family = AF_INET6;
+ sin6.sin6_addr = conn->c_faddr;
+ sin6.sin6_port = htons(RDS_TCP_PORT);
+ sin6.sin6_flowinfo = 0;
+ sin6.sin6_scope_id = conn->c_dev_if;
+ addr = (struct sockaddr *)&sin6;
+ addrlen = sizeof(sin6);
+ } else {
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = conn->c_faddr.s6_addr32[3];
+ sin.sin_port = htons(RDS_TCP_PORT);
+ addr = (struct sockaddr *)&sin;
+ addrlen = sizeof(sin);
+ }
/*
* once we call connect() we can start getting callbacks and they
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index 4fdf5b3..0f996e4 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -256,15 +256,22 @@ void rds_tcp_listen_data_ready(struct sock *sk)
ready(sk);
}
-struct socket *rds_tcp_listen_init(struct net *net)
+struct socket *rds_tcp_listen_init(struct net *net, bool isv6)
{
- struct sockaddr_in sin;
struct socket *sock = NULL;
+ struct sockaddr_storage ss;
+ struct sockaddr_in6 *sin6;
+ struct sockaddr_in *sin;
+ int addr_len;
int ret;
- ret = sock_create_kern(net, PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
- if (ret < 0)
+ ret = sock_create_kern(net, isv6 ? PF_INET6 : PF_INET, SOCK_STREAM,
+ IPPROTO_TCP, &sock);
+ if (ret < 0) {
+ rdsdebug("could not create %s listener socket: %d\n",
+ isv6 ? "IPv6" : "IPv4", ret);
goto out;
+ }
sock->sk->sk_reuse = SK_CAN_REUSE;
rds_tcp_nonagle(sock);
@@ -274,13 +281,28 @@ struct socket *rds_tcp_listen_init(struct net *net)
sock->sk->sk_data_ready = rds_tcp_listen_data_ready;
write_unlock_bh(&sock->sk->sk_callback_lock);
- sin.sin_family = PF_INET;
- sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY);
- sin.sin_port = (__force u16)htons(RDS_TCP_PORT);
+ if (isv6) {
+ sin6 = (struct sockaddr_in6 *)&ss;
+ sin6->sin6_family = PF_INET6;
+ sin6->sin6_addr = in6addr_any;
+ sin6->sin6_port = (__force u16)htons(RDS_TCP_PORT);
+ sin6->sin6_scope_id = 0;
+ sin6->sin6_flowinfo = 0;
+ addr_len = sizeof(*sin6);
+ } else {
+ sin = (struct sockaddr_in *)&ss;
+ sin->sin_family = PF_INET;
+ sin->sin_addr.s_addr = INADDR_ANY;
+ sin->sin_port = (__force u16)htons(RDS_TCP_PORT);
+ addr_len = sizeof(*sin);
+ }
- ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin));
- if (ret < 0)
+ ret = sock->ops->bind(sock, (struct sockaddr *)&ss, addr_len);
+ if (ret < 0) {
+ rdsdebug("could not bind %s listener socket: %d\n",
+ isv6 ? "IPv6" : "IPv4", ret);
goto out;
+ }
ret = sock->ops->listen(sock, 64);
if (ret < 0)
--
1.8.3.1
Powered by blists - more mailing lists