[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <C87A2B24-56C8-4183-8E5B-FCB9DAC0BB50@scaleway.com>
Date: Tue, 20 Nov 2018 17:14:19 +0100
From: Alexis Bauvin <abauvin@...leway.com>
To: Roopa Prabhu <roopa@...ulusnetworks.com>
Cc: David Ahern <dsa@...ulusnetworks.com>,
netdev <netdev@...r.kernel.org>, akherbouche@...leway.com
Subject: Re: [RFC v3 2/3] vxlan: add support for underlay in non-default VRF
Le 20 nov. 2018 à 16:25, Roopa Prabhu <roopa@...ulusnetworks.com> a écrit :
>
> On Tue, Nov 20, 2018 at 6:23 AM Alexis Bauvin <abauvin@...leway.com> wrote:
>>
>> Creating a VXLAN device with is underlay in the non-default VRF makes
>> egress route lookup fail or incorrect since it will resolve in the
>> default VRF, and ingress fail because the socket listens in the default
>> VRF.
>>
>> This patch binds the underlying UDP tunnel socket to the l3mdev of the
>> lower device of the VXLAN device. This will listen in the proper VRF and
>> output traffic from said l3mdev, matching l3mdev routing rules and
>> looking up the correct routing table.
>>
>> When the VXLAN device does not have a lower device, or the lower device
>> is in the default VRF, the socket will not be bound to any interface,
>> keeping the previous behaviour.
>>
>> The underlay l3mdev is deduced from the VXLAN lower device
>> (IFLA_VXLAN_LINK).
>>
>> The l3mdev_master_upper_ifindex_by_index function has been added to
>> l3mdev. Its goal is to fetch the effective l3mdev of an interface which
>> is not a direct slave of said l3mdev. It handles the following example,
>> properly resolving the l3mdev of eth0 to vrf-blue:
>>
>> +----------+ +---------+
>> | | | |
>> | vrf-blue | | vrf-red |
>> | | | |
>> +----+-----+ +----+----+
>> | |
>> | |
>> +----+-----+ +----+----+
>> | | | |
>> | br-blue | | br-red |
>> | | | |
>> +----+-----+ +---+-+---+
>> | | |
>> | +-----+ +-----+
>> | | |
>> +----+-----+ +------+----+ +----+----+
>> | | lower device | | | |
>> | eth0 | <- - - - - - - | vxlan-red | | tap-red | (... more taps)
>> | | | | | |
>> +----------+ +-----------+ +---------+
>>
>> Signed-off-by: Alexis Bauvin <abauvin@...leway.com>
>> Reviewed-by: Amine Kherbouche <akherbouche@...leway.com>
>> Tested-by: Amine Kherbouche <akherbouche@...leway.com>
>> ---
>> drivers/net/vxlan.c | 32 ++++++++++++++++++++++++--------
>> include/net/l3mdev.h | 22 ++++++++++++++++++++++
>> net/l3mdev/l3mdev.c | 18 ++++++++++++++++++
>> 3 files changed, 64 insertions(+), 8 deletions(-)
>>
>> diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
>> index 27bd586b94b0..a3de08122269 100644
>> --- a/drivers/net/vxlan.c
>> +++ b/drivers/net/vxlan.c
>> @@ -212,7 +212,7 @@ static inline struct vxlan_rdst *first_remote_rtnl(struct vxlan_fdb *fdb)
>> * and enabled unshareable flags.
>> */
>> static struct vxlan_sock *vxlan_find_sock(struct net *net, sa_family_t family,
>> - __be16 port, u32 flags)
>> + __be16 port, u32 flags, int ifindex)
>> {
>> struct vxlan_sock *vs;
>>
>> @@ -221,7 +221,8 @@ static struct vxlan_sock *vxlan_find_sock(struct net *net, sa_family_t family,
>> hlist_for_each_entry_rcu(vs, vs_head(net, port), hlist) {
>> if (inet_sk(vs->sock->sk)->inet_sport == port &&
>> vxlan_get_sk_family(vs) == family &&
>> - vs->flags == flags)
>> + vs->flags == flags &&
>> + vs->sock->sk->sk_bound_dev_if == ifindex)
>> return vs;
>> }
>> return NULL;
>> @@ -261,7 +262,7 @@ static struct vxlan_dev *vxlan_find_vni(struct net *net, int ifindex,
>> {
>> struct vxlan_sock *vs;
>>
>> - vs = vxlan_find_sock(net, family, port, flags);
>> + vs = vxlan_find_sock(net, family, port, flags, ifindex);
>> if (!vs)
>> return NULL;
>>
>> @@ -2172,6 +2173,9 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
>> struct rtable *rt;
>> __be16 df = 0;
>>
>> + if (!ifindex)
>> + ifindex = sock4->sock->sk->sk_bound_dev_if;
>> +
>> rt = vxlan_get_route(vxlan, dev, sock4, skb, ifindex, tos,
>> dst->sin.sin_addr.s_addr,
>> &local_ip.sin.sin_addr.s_addr,
>> @@ -2210,6 +2214,9 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
>> } else {
>> struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock);
>>
>> + if (!ifindex)
>> + ifindex = sock6->sock->sk->sk_bound_dev_if;
>> +
>> ndst = vxlan6_get_route(vxlan, dev, sock6, skb, ifindex, tos,
>> label, &dst->sin6.sin6_addr,
>> &local_ip.sin6.sin6_addr,
>> @@ -2813,7 +2820,7 @@ static const struct ethtool_ops vxlan_ethtool_ops = {
>> };
>>
>> static struct socket *vxlan_create_sock(struct net *net, bool ipv6,
>> - __be16 port, u32 flags)
>> + __be16 port, u32 flags, int ifindex)
>> {
>> struct socket *sock;
>> struct udp_port_cfg udp_conf;
>> @@ -2831,6 +2838,7 @@ static struct socket *vxlan_create_sock(struct net *net, bool ipv6,
>> }
>>
>> udp_conf.local_udp_port = port;
>> + udp_conf.bind_ifindex = ifindex;
>>
>> /* Open UDP socket */
>> err = udp_sock_create(net, &udp_conf, &sock);
>> @@ -2842,7 +2850,8 @@ static struct socket *vxlan_create_sock(struct net *net, bool ipv6,
>>
>> /* Create new listen socket if needed */
>> static struct vxlan_sock *vxlan_socket_create(struct net *net, bool ipv6,
>> - __be16 port, u32 flags)
>> + __be16 port, u32 flags,
>> + int ifindex)
>> {
>> struct vxlan_net *vn = net_generic(net, vxlan_net_id);
>> struct vxlan_sock *vs;
>> @@ -2857,7 +2866,7 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, bool ipv6,
>> for (h = 0; h < VNI_HASH_SIZE; ++h)
>> INIT_HLIST_HEAD(&vs->vni_list[h]);
>>
>> - sock = vxlan_create_sock(net, ipv6, port, flags);
>> + sock = vxlan_create_sock(net, ipv6, port, flags, ifindex);
>> if (IS_ERR(sock)) {
>> kfree(vs);
>> return ERR_CAST(sock);
>> @@ -2894,11 +2903,17 @@ static int __vxlan_sock_add(struct vxlan_dev *vxlan, bool ipv6)
>> struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
>> struct vxlan_sock *vs = NULL;
>> struct vxlan_dev_node *node;
>> + int l3mdev_index;
>> +
>> + l3mdev_index =
>> + l3mdev_master_upper_ifindex_by_index(vxlan->net,
>> + vxlan->cfg.remote_ifindex);
>
> vxlan->cfg.remote_ifindex is optional, so we can avoid trying to
> derive the l3mdev_ifindex for cases where it is not present
So you do suggest to check first remote_ifindex, and derive
the l3mdev_ifindex only if not zero? If so, I will add it
in next version.
>>
>> if (!vxlan->cfg.no_share) {
>> spin_lock(&vn->sock_lock);
>> vs = vxlan_find_sock(vxlan->net, ipv6 ? AF_INET6 : AF_INET,
>> - vxlan->cfg.dst_port, vxlan->cfg.flags);
>> + vxlan->cfg.dst_port, vxlan->cfg.flags,
>> + l3mdev_index);
>> if (vs && !refcount_inc_not_zero(&vs->refcnt)) {
>> spin_unlock(&vn->sock_lock);
>> return -EBUSY;
>> @@ -2907,7 +2922,8 @@ static int __vxlan_sock_add(struct vxlan_dev *vxlan, bool ipv6)
>> }
>> if (!vs)
>> vs = vxlan_socket_create(vxlan->net, ipv6,
>> - vxlan->cfg.dst_port, vxlan->cfg.flags);
>> + vxlan->cfg.dst_port, vxlan->cfg.flags,
>> + l3mdev_index);
>> if (IS_ERR(vs))
>> return PTR_ERR(vs);
>> #if IS_ENABLED(CONFIG_IPV6)
>> diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h
>> index 3832099289c5..78fa0ac4613c 100644
>> --- a/include/net/l3mdev.h
>> +++ b/include/net/l3mdev.h
>> @@ -101,6 +101,17 @@ struct net_device *l3mdev_master_dev_rcu(const struct net_device *_dev)
>> return master;
>> }
>>
>> +int l3mdev_master_upper_ifindex_by_index_rcu(struct net *net, int ifindex);
>> +static inline
>> +int l3mdev_master_upper_ifindex_by_index(struct net *net, int ifindex)
>> +{
>> + rcu_read_lock();
>> + ifindex = l3mdev_master_upper_ifindex_by_index_rcu(net, ifindex);
>> + rcu_read_unlock();
>> +
>> + return ifindex;
>> +}
>> +
>> u32 l3mdev_fib_table_rcu(const struct net_device *dev);
>> u32 l3mdev_fib_table_by_index(struct net *net, int ifindex);
>> static inline u32 l3mdev_fib_table(const struct net_device *dev)
>> @@ -207,6 +218,17 @@ static inline int l3mdev_master_ifindex_by_index(struct net *net, int ifindex)
>> return 0;
>> }
>>
>> +static inline
>> +int l3mdev_master_upper_ifindex_by_index_rcu(struct net *net, int ifindex)
>> +{
>> + return 0;
>> +}
>> +static inline
>> +int l3mdev_master_upper_ifindex_by_index(struct net *net, int ifindex)
>> +{
>> + return 0;
>> +}
>> +
>> static inline
>> struct net_device *l3mdev_master_dev_rcu(const struct net_device *dev)
>> {
>> diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c
>> index 8da86ceca33d..309dee76724e 100644
>> --- a/net/l3mdev/l3mdev.c
>> +++ b/net/l3mdev/l3mdev.c
>> @@ -46,6 +46,24 @@ int l3mdev_master_ifindex_rcu(const struct net_device *dev)
>> }
>> EXPORT_SYMBOL_GPL(l3mdev_master_ifindex_rcu);
>>
>> +/**
>> + * l3mdev_master_upper_ifindex_by_index - get index of upper l3 master
>> + * device
>> + * @net: network namespace for device index lookup
>> + * @ifindex: targeted interface
>> + */
>> +int l3mdev_master_upper_ifindex_by_index_rcu(struct net *net, int ifindex)
>> +{
>> + struct net_device *dev;
>> +
>> + dev = dev_get_by_index_rcu(net, ifindex);
>> + while (dev && !netif_is_l3_master(dev))
>> + dev = netdev_master_upper_dev_get(dev);
>> +
>> + return dev ? dev->ifindex : 0;
>> +}
>> +EXPORT_SYMBOL_GPL(l3mdev_master_upper_ifindex_by_index_rcu);
>> +
>> /**
>> * l3mdev_fib_table - get FIB table id associated with an L3
>> * master interface
>> --
Powered by blists - more mailing lists