[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1436195001-4818-5-git-send-email-dsa@cumulusnetworks.com>
Date: Mon, 6 Jul 2015 09:03:18 -0600
From: David Ahern <dsa@...ulusnetworks.com>
To: netdev@...r.kernel.org
Cc: shm@...ulusnetworks.com, roopa@...ulusnetworks.com,
gospo@...ulusnetworks.com, jtoppins@...ulusnetworks.com,
nikolay@...ulusnetworks.com, ddutt@...ulusnetworks.com,
hannes@...essinduktion.org, nicolas.dichtel@...nd.com,
stephen@...workplumber.org, hadi@...atatu.com,
ebiederm@...ssion.com, davem@...emloft.net,
David Ahern <dsa@...ulusnetworks.com>
Subject: [RFC net-next 4/6] net: Modifications to ipv4 stack for VRF devices
With the following tweaks to the IPv4 stack:
- enslaving devices to a VRF device automatically moves routes to the
VRF table; removing the VRF master moves routes back to the main table
- the following use cases work for both Rx and Tx:
+ ICMP (ping -I <vrf-device> <ip>)
+ TCP server and client bound to VRF device
+ TCP server not bound to VRF device but working through it
* client connections are bound to VRF device
+ UDP server and client bound to VRF device
Signed-off-by: Shrijeet Mukherjee <shm@...ulusnetworks.com>
Signed-off-by: David Ahern <dsa@...ulusnetworks.com>
---
include/net/flow.h | 1 +
include/net/inet_hashtables.h | 9 +++++++--
include/net/route.h | 4 ++++
net/ipv4/fib_frontend.c | 30 ++++++++++++++++++++----------
net/ipv4/fib_semantics.c | 25 ++++++++++++++++++++-----
net/ipv4/fib_trie.c | 7 +++++--
net/ipv4/icmp.c | 4 ++++
net/ipv4/ping.c | 3 ++-
net/ipv4/raw.c | 5 +++--
net/ipv4/route.c | 12 ++++++++++--
net/ipv4/syncookies.c | 4 +++-
net/ipv4/tcp_input.c | 6 +++++-
net/ipv4/tcp_ipv4.c | 6 ++++--
net/ipv4/udp.c | 2 ++
14 files changed, 90 insertions(+), 28 deletions(-)
diff --git a/include/net/flow.h b/include/net/flow.h
index 8109a159d1b3..69aaa99fdeb8 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -29,6 +29,7 @@ struct flowi_common {
__u8 flowic_flags;
#define FLOWI_FLAG_ANYSRC 0x01
#define FLOWI_FLAG_KNOWN_NH 0x02
+#define FLOWI_FLAG_VRFSRC 0x04
__u32 flowic_secid;
};
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index b73c88a19dd4..e26c43823a13 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -31,6 +31,7 @@
#include <net/route.h>
#include <net/tcp_states.h>
#include <net/netns/hash.h>
+#include <net/vrf.h>
#include <linux/atomic.h>
#include <asm/byteorder.h>
@@ -300,10 +301,14 @@ static inline struct sock *__inet_lookup(struct net *net,
struct inet_hashinfo *hashinfo,
const __be32 saddr, const __be16 sport,
const __be32 daddr, const __be16 dport,
- const int dif)
+ int dif)
{
u16 hnum = ntohs(dport);
- struct sock *sk = __inet_lookup_established(net, hashinfo,
+ struct sock *sk;
+
+ dif = vrf_get_master_dev_idx(net, dif) ? : dif;
+
+ sk = __inet_lookup_established(net, hashinfo,
saddr, sport, daddr, hnum, dif);
return sk ? : __inet_lookup_listener(net, hashinfo, saddr, sport,
diff --git a/include/net/route.h b/include/net/route.h
index fe22d03afb6a..460333bab217 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -188,6 +188,7 @@ void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk);
void ip_rt_send_redirect(struct sk_buff *skb);
unsigned int inet_addr_type(struct net *net, __be32 addr);
+unsigned int inet_addr_type_table(struct net *net, __be32 addr, int tb_id);
unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
__be32 addr);
void ip_rt_multicast_event(struct in_device *);
@@ -250,6 +251,9 @@ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, __be32
if (inet_sk(sk)->transparent)
flow_flags |= FLOWI_FLAG_ANYSRC;
+ if (netif_idx_is_vrf(sock_net(sk), oif))
+ flow_flags |= FLOWI_FLAG_VRFSRC;
+
flowi4_init_output(fl4, oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE,
protocol, flow_flags, dst, src, dport, sport);
}
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 974fa51effca..7c73eb058c91 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -45,6 +45,7 @@
#include <net/ip_fib.h>
#include <net/rtnetlink.h>
#include <net/xfrm.h>
+#include <net/vrf.h>
#ifndef CONFIG_IP_MULTIPLE_TABLES
@@ -212,7 +213,7 @@ void fib_flush_external(struct net *net)
*/
static inline unsigned int __inet_dev_addr_type(struct net *net,
const struct net_device *dev,
- __be32 addr)
+ __be32 addr, int rt_table)
{
struct flowi4 fl4 = { .daddr = addr };
struct fib_result res;
@@ -225,8 +226,7 @@ static inline unsigned int __inet_dev_addr_type(struct net *net,
return RTN_MULTICAST;
rcu_read_lock();
-
- local_table = fib_get_table(net, RT_TABLE_LOCAL);
+ local_table = fib_get_table(net, rt_table);
if (local_table) {
ret = RTN_UNICAST;
if (!fib_table_lookup(local_table, &fl4, &res, FIB_LOOKUP_NOREF)) {
@@ -239,16 +239,24 @@ static inline unsigned int __inet_dev_addr_type(struct net *net,
return ret;
}
+unsigned int inet_addr_type_table(struct net *net, __be32 addr, int tb_id)
+{
+ return __inet_dev_addr_type(net, NULL, addr, tb_id);
+}
+EXPORT_SYMBOL(inet_addr_type_table);
+
unsigned int inet_addr_type(struct net *net, __be32 addr)
{
- return __inet_dev_addr_type(net, NULL, addr);
+ return __inet_dev_addr_type(net, NULL, addr, RT_TABLE_LOCAL);
}
EXPORT_SYMBOL(inet_addr_type);
unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
__be32 addr)
{
- return __inet_dev_addr_type(net, dev, addr);
+ int rt_table = vrf_dev_table(dev) ? : RT_TABLE_LOCAL;
+
+ return __inet_dev_addr_type(net, dev, addr, rt_table);
}
EXPORT_SYMBOL(inet_dev_addr_type);
@@ -309,7 +317,9 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
bool dev_match;
fl4.flowi4_oif = 0;
- fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX;
+ fl4.flowi4_iif = vrf_master_dev_idx(dev);
+ if (!fl4.flowi4_iif)
+ fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX;
fl4.daddr = src;
fl4.saddr = dst;
fl4.flowi4_tos = tos;
@@ -761,6 +771,7 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa)
{
struct net *net = dev_net(ifa->ifa_dev->dev);
+ int tb_id = vrf_dev_table(ifa->ifa_dev->dev);
struct fib_table *tb;
struct fib_config cfg = {
.fc_protocol = RTPROT_KERNEL,
@@ -775,11 +786,10 @@ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifad
},
};
- if (type == RTN_UNICAST)
- tb = fib_new_table(net, RT_TABLE_MAIN);
- else
- tb = fib_new_table(net, RT_TABLE_LOCAL);
+ if (!tb_id)
+ tb_id = (type == RTN_UNICAST) ? RT_TABLE_MAIN : RT_TABLE_LOCAL;
+ tb = fib_new_table(net, tb_id);
if (!tb)
return;
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 3bfccd83551c..3c3e2006ce72 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -760,6 +760,23 @@ __be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh)
return nh->nh_saddr;
}
+static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc)
+{
+ if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
+ fib_prefsrc != cfg->fc_dst) {
+ int tb_id = cfg->fc_table;
+
+ if (tb_id == RT_TABLE_MAIN)
+ tb_id = RT_TABLE_LOCAL;
+
+ if (inet_addr_type_table(cfg->fc_nlinfo.nl_net,
+ fib_prefsrc, tb_id) != RTN_LOCAL) {
+ return false;
+ }
+ }
+ return true;
+}
+
struct fib_info *fib_create_info(struct fib_config *cfg)
{
int err;
@@ -940,11 +957,9 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
fi->fib_flags |= RTNH_F_LINKDOWN;
}
- if (fi->fib_prefsrc) {
- if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
- fi->fib_prefsrc != cfg->fc_dst)
- if (inet_addr_type(net, fi->fib_prefsrc) != RTN_LOCAL)
- goto err_inval;
+
+ if (fi->fib_prefsrc && !fib_valid_prefsrc(cfg, fi->fib_prefsrc)) {
+ goto err_inval;
}
change_nexthops(fi) {
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index ac2d828c6daa..7da901c56e35 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1421,8 +1421,11 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
nh->nh_flags & RTNH_F_LINKDOWN &&
!(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE))
continue;
- if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
- continue;
+ if (!(flp->flowi4_flags & FLOWI_FLAG_VRFSRC)) {
+ if (flp->flowi4_oif &&
+ flp->flowi4_oif != nh->nh_oif)
+ continue;
+ }
if (!(fib_flags & FIB_LOOKUP_NOREF))
atomic_inc(&fi->fib_clntref);
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index f5203fba6236..115d3c1c548f 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -96,6 +96,7 @@
#include <net/xfrm.h>
#include <net/inet_common.h>
#include <net/ip_fib.h>
+#include <net/vrf.h>
/*
* Build xmit assembly blocks
@@ -425,6 +426,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
fl4.flowi4_mark = mark;
fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
fl4.flowi4_proto = IPPROTO_ICMP;
+ fl4.flowi4_oif = vrf_master_dev_idx(skb->dev) ? : skb->dev->ifindex;
security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
rt = ip_route_output_key(net, &fl4);
if (IS_ERR(rt))
@@ -458,6 +460,8 @@ static struct rtable *icmp_route_lookup(struct net *net,
fl4->flowi4_proto = IPPROTO_ICMP;
fl4->fl4_icmp_type = type;
fl4->fl4_icmp_code = code;
+ fl4->flowi4_oif = vrf_master_dev_idx(skb_in->dev) ? : skb_in->dev->ifindex;
+
security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4));
rt = __ip_route_output_key(net, fl4);
if (IS_ERR(rt))
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 05ff44b758df..685fada659f5 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -44,6 +44,7 @@
#include <net/route.h>
#include <net/inet_common.h>
#include <net/checksum.h>
+#include <net/vrf.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <linux/in6.h>
@@ -174,7 +175,7 @@ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident)
struct sock *sk = NULL;
struct inet_sock *isk;
struct hlist_nulls_node *hnode;
- int dif = skb->dev->ifindex;
+ int dif = vrf_master_dev_idx(skb->dev) ? : skb->dev->ifindex;
if (skb->protocol == htons(ETH_P_IP)) {
pr_debug("try to find: num = %d, daddr = %pI4, dif = %d\n",
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 561cd4b8fc6e..95ef2834533d 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -72,6 +72,7 @@
#include <net/inet_common.h>
#include <net/checksum.h>
#include <net/xfrm.h>
+#include <net/vrf.h>
#include <linux/rtnetlink.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
@@ -171,6 +172,7 @@ static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash)
struct hlist_head *head;
int delivered = 0;
struct net *net;
+ int idx = vrf_master_dev_idx(skb->dev) ? : skb->dev->ifindex;
read_lock(&raw_v4_hashinfo.lock);
head = &raw_v4_hashinfo.ht[hash];
@@ -179,8 +181,7 @@ static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash)
net = dev_net(skb->dev);
sk = __raw_v4_lookup(net, __sk_head(head), iph->protocol,
- iph->saddr, iph->daddr,
- skb->dev->ifindex);
+ iph->saddr, iph->daddr, idx);
while (sk) {
delivered = 1;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index d0362a2de3d3..c66fdeb3a101 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -109,6 +109,7 @@
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>
+#include <net/vrf.h>
#define RT_FL_TOS(oldflp4) \
((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
@@ -1710,7 +1711,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
* Now we are ready to route packet.
*/
fl4.flowi4_oif = 0;
- fl4.flowi4_iif = dev->ifindex;
+ fl4.flowi4_iif = vrf_master_dev_idx(dev) ? : dev->ifindex;
fl4.flowi4_mark = skb->mark;
fl4.flowi4_tos = tos;
fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
@@ -2089,6 +2090,9 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
if (!dev_out)
goto out;
+ if (netif_is_vrf(dev_out))
+ fl4->flowi4_flags |= FLOWI_FLAG_VRFSRC;
+
/* RACE: Check return value of inet_select_addr instead. */
if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
rth = ERR_PTR(-ENETUNREACH);
@@ -2273,8 +2277,12 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
struct sock *sk)
{
- struct rtable *rt = __ip_route_output_key(net, flp4);
+ struct rtable *rt;
+
+ if (netif_idx_is_vrf(net, flp4->flowi4_oif))
+ flp4->flowi4_flags |= FLOWI_FLAG_VRFSRC;
+ rt = __ip_route_output_key(net, flp4);
if (IS_ERR(rt))
return rt;
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index d70b1f603692..120f4406ba7a 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -348,7 +348,9 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
treq->snt_synack = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0;
treq->tfo_listener = false;
- ireq->ir_iif = sk->sk_bound_dev_if;
+ ireq->ir_iif = vrf_get_master_dev_idx(sock_net(sk), skb->skb_iif);
+ if (!ireq->ir_iif)
+ ireq->ir_iif = sk->sk_bound_dev_if;
/* We throwed the options of the initial SYN away, so we hope
* the ACK carries the same options again (see RFC1122 4.2.3.8)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 684f095d196e..3018b4f795eb 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -72,6 +72,7 @@
#include <net/dst.h>
#include <net/tcp.h>
#include <net/inet_common.h>
+#include <net/vrf.h>
#include <linux/ipsec.h>
#include <asm/unaligned.h>
#include <linux/errqueue.h>
@@ -6138,7 +6139,10 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
tcp_openreq_init(req, &tmp_opt, skb, sk);
/* Note: tcp_v6_init_req() might override ir_iif for link locals */
- inet_rsk(req)->ir_iif = sk->sk_bound_dev_if;
+ inet_rsk(req)->ir_iif = vrf_get_master_dev_idx(sock_net(sk),
+ skb->skb_iif);
+ if (!inet_rsk(req)->ir_iif)
+ inet_rsk(req)->ir_iif = sk->sk_bound_dev_if;
af_ops->init_req(req, sk, skb);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index d7d4c2b79cf2..c03e28477275 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -682,6 +682,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
*/
if (sk)
arg.bound_dev_if = sk->sk_bound_dev_if;
+ if (!arg.bound_dev_if)
+ arg.bound_dev_if = vrf_master_dev_idx(skb_dst(skb)->dev);
arg.tos = ip_hdr(skb)->tos;
ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
@@ -766,8 +768,8 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
ip_hdr(skb)->saddr, /* XXX */
arg.iov[0].iov_len, IPPROTO_TCP, 0);
arg.csumoffset = offsetof(struct tcphdr, check) / 2;
- if (oif)
- arg.bound_dev_if = oif;
+ arg.bound_dev_if = oif ? : vrf_master_dev_idx(skb_dst(skb)->dev);
+
arg.tos = tos;
ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
skb, &TCP_SKB_CB(skb)->header.h4.opt,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 83aa604f9273..cf706d7898a2 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -501,6 +501,8 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
int score, badness, matches = 0, reuseport = 0;
u32 hash = 0;
+ dif = vrf_get_master_dev_idx(net, dif) ? : dif;
+
rcu_read_lock();
if (hslot->count > 10) {
hash2 = udp4_portaddr_hash(net, daddr, hnum);
--
2.3.2 (Apple Git-55)
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists