[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <c5d7497c-2761-8941-19df-dcae087d10ce@oracle.com>
Date: Thu, 5 Jul 2018 10:58:50 -0700
From: Santosh Shilimkar <santosh.shilimkar@...cle.com>
To: Ka-Cheong Poon <ka-cheong.poon@...cle.com>, netdev@...r.kernel.org
Cc: davem@...emloft.net, rds-devel@....oracle.com
Subject: Re: [PATCH v2 net-next 1/3] rds: Changing IP address internal
representation to struct in6_addr
On 6/27/2018 3:23 AM, Ka-Cheong Poon wrote:
> This patch changes the internal representation of an IP address to use
> struct in6_addr. IPv4 address is stored as an IPv4 mapped address.
> All the functions which take an IP address as argument are also
> changed to use struct in6_addr. But RDS socket layer is not modified
> such that it still does not accept IPv6 address from an application.
> And RDS layer does not accept nor initiate IPv6 connections.
>
> v2: Fixed sparse warnings.
>
> Signed-off-by: Ka-Cheong Poon <ka-cheong.poon@...cle.com>
> ---
> net/rds/af_rds.c | 138 ++++++++++++++++------
> net/rds/bind.c | 91 +++++++++-----
> net/rds/cong.c | 23 ++--
> net/rds/connection.c | 132 +++++++++++++--------
> net/rds/ib.c | 17 +--
> net/rds/ib.h | 45 +++++--
> net/rds/ib_cm.c | 300 +++++++++++++++++++++++++++++++++++------------
> net/rds/ib_rdma.c | 15 +--
> net/rds/ib_recv.c | 18 +--
> net/rds/ib_send.c | 10 +-
> net/rds/loop.c | 7 +-
> net/rds/rdma.c | 6 +-
> net/rds/rdma_transport.c | 56 ++++++---
> net/rds/rds.h | 69 +++++++----
> net/rds/recv.c | 51 +++++---
> net/rds/send.c | 67 ++++++++---
> net/rds/tcp.c | 32 ++++-
> net/rds/tcp_connect.c | 34 +++---
> net/rds/tcp_listen.c | 18 +--
> net/rds/tcp_recv.c | 9 +-
> net/rds/tcp_send.c | 4 +-
> net/rds/threads.c | 69 +++++++++--
> net/rds/transport.c | 15 ++-
> 23 files changed, 857 insertions(+), 369 deletions(-)
>
> diff --git a/net/rds/bind.c b/net/rds/bind.c
> index 5aa3a64..3822886 100644
> --- a/net/rds/bind.c
> +++ b/net/rds/bind.c
> @@ -1,5 +1,5 @@
> /*
> - * Copyright (c) 2006 Oracle. All rights reserved.
> + * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
> *
> * This software is available to you under a choice of one of two
> * licenses. You may choose to be licensed under the terms of the GNU
> @@ -33,6 +33,7 @@
> #include <linux/kernel.h>
> #include <net/sock.h>
> #include <linux/in.h>
> +#include <linux/ipv6.h>
> #include <linux/if_arp.h>
> #include <linux/jhash.h>
> #include <linux/ratelimit.h>
> @@ -42,42 +43,58 @@
>
> static const struct rhashtable_params ht_parms = {
> .nelem_hint = 768,
> - .key_len = sizeof(u64),
> + .key_len = RDS_BOUND_KEY_LEN,
Do we really need the scope id to be part of the key ? With link
local/global, do you see any collisions. Please educate me
on the actual usecase. This can avoid bunch of changes and hence
the question.
> @@ -114,7 +132,7 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
> rs, &addr, (int)ntohs(*port));
> break;
> } else {
> - rs->rs_bound_addr = 0;
> + rs->rs_bound_addr = in6addr_any;
Can you elaborate why 0 is not ok ?
> rds_sock_put(rs);
> ret = -ENOMEM;
> break;
> @@ -127,44 +145,61 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
> void rds_remove_bound(struct rds_sock *rs)
> {
>
> - if (!rs->rs_bound_addr)
> + if (ipv6_addr_any(&rs->rs_bound_addr))
> return;
>
> - rdsdebug("rs %p unbinding from %pI4:%d\n",
> + rdsdebug("rs %p unbinding from %pI6c:%d\n",
> rs, &rs->rs_bound_addr,
> ntohs(rs->rs_bound_port));
>
> rhashtable_remove_fast(&bind_hash_table, &rs->rs_bound_node, ht_parms);
> rds_sock_put(rs);
> - rs->rs_bound_addr = 0;
> + rs->rs_bound_addr = in6addr_any;
> }
>
> int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
> {
> struct sock *sk = sock->sk;
> - struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
> struct rds_sock *rs = rds_sk_to_rs(sk);
> + struct in6_addr v6addr, *binding_addr;
> struct rds_transport *trans;
> + __u32 scope_id = 0;
> int ret = 0;
> + __be16 port;
>
> + /* We only allow an RDS socket to be bound to and IPv4 address. IPv6
s/'bound to and IPv4'/'bound to an IPv4'
> + * address support will be added later.
> + */
> + if (addr_len == sizeof(struct sockaddr_in)) {
> + struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
> +
> + if (sin->sin_family != AF_INET ||
> + sin->sin_addr.s_addr == htonl(INADDR_ANY))
> + return -EINVAL;
> + ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &v6addr);
> + binding_addr = &v6addr;
> + port = sin->sin_port;
> + } else if (addr_len == sizeof(struct sockaddr_in6)) {
> + return -EPROTONOSUPPORT;
> + } else {
> + return -EINVAL;
> + }
> lock_sock(sk);
>
> - if (addr_len != sizeof(struct sockaddr_in) ||
> - sin->sin_family != AF_INET ||
> - rs->rs_bound_addr ||
> - sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
> + /* RDS socket does not allow re-binding. */
> + if (!ipv6_addr_any(&rs->rs_bound_addr)) {
> ret = -EINVAL;
> goto out;
> }
Seems new behavior you are adding. The statement itself is
true but if we return silently for already bind address, its
ok. May be am missing something above.
> diff --git a/net/rds/connection.c b/net/rds/connection.c
> index abef75d..ca72563 100644
> --- a/net/rds/connection.c
> +++ b/net/rds/connection.c
> @@ -142,9 +151,12 @@ static void __rds_conn_path_init(struct rds_connection *conn,
> * are torn down as the module is removed, if ever.
> */
> static struct rds_connection *__rds_conn_create(struct net *net,
> - __be32 laddr, __be32 faddr,
> - struct rds_transport *trans, gfp_t gfp,
> - int is_outgoing)
> + const struct in6_addr *laddr,
> + const struct in6_addr *faddr,
> + struct rds_transport *trans,
> + gfp_t gfp,
> + int is_outgoing,
> + int dev_if)
Am just wondering if we can handle local link address case differently
instead of pushing the interface index everywhere. Did you think about
any alternative. This can also avoid bunch of changes again and hence
the question. Am trying to see if we can minimize the changes to
absolute must have to support IPv6.
> diff --git a/net/rds/ib.c b/net/rds/ib.c
> index b6ad38e..c712a84 100644
> --- a/net/rds/ib.c
> +++ b/net/rds/ib.c
> @@ -1,5 +1,5 @@
> /*
> - * Copyright (c) 2006 Oracle. All rights reserved.
> + * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
> *
> * This software is available to you under a choice of one of two
> * licenses. You may choose to be licensed under the terms of the GNU
> @@ -296,8 +296,8 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn,
> if (conn->c_trans != &rds_ib_transport)
> return 0;
>
> - iinfo->src_addr = conn->c_laddr;
> - iinfo->dst_addr = conn->c_faddr;
> + iinfo->src_addr = conn->c_laddr.s6_addr32[3];
> + iinfo->dst_addr = conn->c_faddr.s6_addr32[3];
>
> memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid));
> memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid));
> @@ -341,7 +341,8 @@ static void rds_ib_ic_info(struct socket *sock, unsigned int len,
> * allowed to influence which paths have priority. We could call userspace
> * asserting this policy "routing".
> */
> -static int rds_ib_laddr_check(struct net *net, __be32 addr)
> +static int rds_ib_laddr_check(struct net *net, const struct in6_addr *addr,
> + __u32 scope_id)
> {
> int ret;
> struct rdma_cm_id *cm_id;
> @@ -357,7 +358,7 @@ static int rds_ib_laddr_check(struct net *net, __be32 addr)
>
> memset(&sin, 0, sizeof(sin));
> sin.sin_family = AF_INET;
> - sin.sin_addr.s_addr = addr;
> + sin.sin_addr.s_addr = addr->s6_addr32[3];
>
> /* rdma_bind_addr will only succeed for IB & iWARP devices */
> ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
> @@ -367,9 +368,9 @@ static int rds_ib_laddr_check(struct net *net, __be32 addr)
> cm_id->device->node_type != RDMA_NODE_IB_CA)
> ret = -EADDRNOTAVAIL;
>
> - rdsdebug("addr %pI4 ret %d node type %d\n",
> - &addr, ret,
> - cm_id->device ? cm_id->device->node_type : -1);
> + rdsdebug("addr %pI6c ret %d node type %d\n",
> + addr, ret,
> + cm_id->device ? cm_id->device->node_type : -1);
>
> rdma_destroy_id(cm_id);
>
> diff --git a/net/rds/ib.h b/net/rds/ib.h
> index a6f4d7d..12f96b3 100644
> --- a/net/rds/ib.h
> +++ b/net/rds/ib.h
> @@ -57,16 +57,38 @@ struct rds_ib_refill_cache {
> struct list_head *ready;
> };
>
> +struct rds_ib_conn_priv_cmn {
> + u8 ricpc_protocol_major;
> + u8 ricpc_protocol_minor;
> + __be16 ricpc_protocol_minor_mask; /* bitmask */
> + __be32 ricpc_reserved1;
> + __be64 ricpc_ack_seq;
> + __be32 ricpc_credit; /* non-zero enables flow ctl */
> +};
> +
> struct rds_ib_connect_private {
> /* Add new fields at the end, and don't permute existing fields. */
> - __be32 dp_saddr;
> - __be32 dp_daddr;
> - u8 dp_protocol_major;
> - u8 dp_protocol_minor;
> - __be16 dp_protocol_minor_mask; /* bitmask */
> - __be32 dp_reserved1;
> - __be64 dp_ack_seq;
> - __be32 dp_credit; /* non-zero enables flow ctl */
> + __be32 dp_saddr;
> + __be32 dp_daddr;
> + struct rds_ib_conn_priv_cmn dp_cmn;
> +};
> +
> +struct rds6_ib_connect_private {
> + /* Add new fields at the end, and don't permute existing fields. */
> + struct in6_addr dp_saddr;
> + struct in6_addr dp_daddr;
> + struct rds_ib_conn_priv_cmn dp_cmn;
> +};
> +
> +#define dp_protocol_major dp_cmn.ricpc_protocol_major
> +#define dp_protocol_minor dp_cmn.ricpc_protocol_minor
> +#define dp_protocol_minor_mask dp_cmn.ricpc_protocol_minor_mask
> +#define dp_ack_seq dp_cmn.ricpc_ack_seq
> +#define dp_credit dp_cmn.ricpc_credit
> +
> +union rds_ib_conn_priv {
> + struct rds_ib_connect_private ricp_v4;
> + struct rds6_ib_connect_private ricp_v6;
> };
This change was invetiable. Just add a comment saying the priviate
data size for v6 vs v4 is different. Also for IPv6 priviate data,
I suggest add some resrve fields for any future extensions so
that things can be added without breaking wire protocol. With IPv4,
we ended up in bit of mess.
> --- a/net/rds/ib_cm.c
> +++ b/net/rds/ib_cm.c
> @@ -1,5 +1,5 @@
> /*
> - * Copyright (c) 2006 Oracle. All rights reserved.
> + * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
> *
> * This software is available to you under a choice of one of two
> * licenses. You may choose to be licensed under the terms of the GNU
> @@ -35,10 +35,12 @@
> #include <linux/slab.h>
> #include <linux/vmalloc.h>
> #include <linux/ratelimit.h>
> +#include <net/addrconf.h>
>
> #include "rds_single_path.h"
> #include "rds.h"
> #include "ib.h"
> +#include "tcp.h"
>
Hmm. Why do you need to include tcp header in ib transport
code ? If there is any common function just move to core
common file and use it.
> diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
> index fc59821..aef73e7 100644
> --- a/net/rds/rdma_transport.c
> +++ b/net/rds/rdma_transport.c
>
> +/* Initialize the RDS RDMA listeners. We create two listeners for
> + * compatibility reason. The one on RDS_PORT is used for IPv4
> + * requests only. The one on RDS_TCP_PORT is used for IPv6 requests
> + * only. So only IPv6 enabled RDS module will communicate using this
> + * port.
> + */
You did above ti facilitate both v4 and v6 connections to co-exist
together ? Even though potentially there is no practical usecase,
its nice to have this possibility.
> diff --git a/net/rds/rds.h b/net/rds/rds.h
> index f2272fb..859808a 100644
> --- a/net/rds/rds.h
> +++ b/net/rds/rds.h
> @@ -10,6 +10,7 @@
> #include <linux/rds.h>
> #include <linux/rhashtable.h>
> #include <linux/refcount.h>
> +#include <linux/in6.h>
>
[...]
> @@ -519,7 +522,8 @@ struct rds_transport {
> t_mp_capable:1;
> unsigned int t_type;
>
> - int (*laddr_check)(struct net *net, __be32 addr);
> + int (*laddr_check)(struct net *net, const struct in6_addr *addr,
> + __u32 scope_id);
I already asked somehwre above if we can avoid scope_id change.
> diff --git a/net/rds/send.c b/net/rds/send.c
> index 94c7f74..6ed2e92 100644
> --- a/net/rds/send.c
> +++ b/net/rds/send.c
> @@ -709,7 +709,7 @@ void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
> }
> EXPORT_SYMBOL_GPL(rds_send_drop_acked);
>
> -void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
> +void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in6 *dest)
> {
> struct rds_message *rm, *tmp;
> struct rds_connection *conn;
> @@ -721,8 +721,9 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
> spin_lock_irqsave(&rs->rs_lock, flags);
>
> list_for_each_entry_safe(rm, tmp, &rs->rs_send_queue, m_sock_item) {
> - if (dest && (dest->sin_addr.s_addr != rm->m_daddr ||
> - dest->sin_port != rm->m_inc.i_hdr.h_dport))
> + if (dest &&
> + (!ipv6_addr_equal(&dest->sin6_addr, &rm->m_daddr) ||
> + dest->sin6_port != rm->m_inc.i_hdr.h_dport))
> continue;
>
> list_move(&rm->m_sock_item, &list);
> @@ -1059,8 +1060,8 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
> {
> struct sock *sk = sock->sk;
> struct rds_sock *rs = rds_sk_to_rs(sk);
> + DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
> DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
> - __be32 daddr;
> __be16 dport;
> struct rds_message *rm = NULL;
> struct rds_connection *conn;
> @@ -1069,10 +1070,13 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
> int nonblock = msg->msg_flags & MSG_DONTWAIT;
> long timeo = sock_sndtimeo(sk, nonblock);
> struct rds_conn_path *cpath;
> + struct in6_addr daddr;
> + __u32 scope_id = 0;
> size_t total_payload_len = payload_len, rdma_payload_len = 0;
> bool zcopy = ((msg->msg_flags & MSG_ZEROCOPY) &&
> sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY));
> int num_sgs = ceil(payload_len, PAGE_SIZE);
> + int namelen;
>
> /* Mirror Linux UDP mirror of BSD error message compatibility */
> /* XXX: Perhaps MSG_MORE someday */
> @@ -1081,27 +1085,59 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
> goto out;
> }
>
> - if (msg->msg_namelen) {
> - /* XXX fail non-unicast destination IPs? */
> - if (msg->msg_namelen < sizeof(*usin) || usin->sin_family != AF_INET) {
> + namelen = msg->msg_namelen;
> + if (namelen != 0) {
> + if (namelen < sizeof(*usin)) {
> + ret = -EINVAL;
> + goto out;
> + }
> + switch (namelen) {
> + case sizeof(*usin):
> + if (usin->sin_family != AF_INET ||
> + usin->sin_addr.s_addr == htonl(INADDR_ANY) ||
> + usin->sin_addr.s_addr == htonl(INADDR_BROADCAST) ||
> + IN_MULTICAST(ntohl(usin->sin_addr.s_addr))) {
> + ret = -EINVAL;
The idea was to fail non-unicast IP someday but can you not add this
change as part of v6 series. We can look at it later unless you need
this change for v6. Again the question is mainly to add only necessary
check and leave the existing behavior as is so please re-check all below
checks too.
> diff --git a/net/rds/transport.c b/net/rds/transport.c
> index 0b188dd..c9788db 100644
> --- a/net/rds/transport.c
> +++ b/net/rds/transport.c
> + if (ipv6_addr_v4mapped(addr)) {
Dave already commented on ipv6_addr_v4mapped(). Apart from
some of those comments questions, rest of the patch looks
really good and nicely done. Will also look at your
subsequent two patches and try to send you comments in next
few days.
Thanks for getting v6 support going Ka-Cheong !!
Regards,
Santosh
Powered by blists - more mailing lists