lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:	Mon,  6 Jul 2015 09:03:18 -0600
From:	David Ahern <dsa@...ulusnetworks.com>
To:	netdev@...r.kernel.org
Cc:	shm@...ulusnetworks.com, roopa@...ulusnetworks.com,
	gospo@...ulusnetworks.com, jtoppins@...ulusnetworks.com,
	nikolay@...ulusnetworks.com, ddutt@...ulusnetworks.com,
	hannes@...essinduktion.org, nicolas.dichtel@...nd.com,
	stephen@...workplumber.org, hadi@...atatu.com,
	ebiederm@...ssion.com, davem@...emloft.net,
	David Ahern <dsa@...ulusnetworks.com>
Subject: [RFC net-next 4/6] net: Modifications to ipv4 stack for VRF devices

With the tweaks to the IPv4 stack made in this patch:
- enslaving devices to a VRF device automatically moves routes to the
  VRF table; removing the VRF master moves routes back to the main table

- the following use cases work for both Rx and Tx:
  + ICMP (ping -I <vrf-device> <ip>)
  + TCP server and client bound to VRF device
  + TCP server not bound to a VRF device but accepting connections through it
    * the resulting client connections are bound to the VRF device
  + UDP server and client bound to VRF device

Signed-off-by: Shrijeet Mukherjee <shm@...ulusnetworks.com>
Signed-off-by: David Ahern <dsa@...ulusnetworks.com>
---
 include/net/flow.h            |  1 +
 include/net/inet_hashtables.h |  9 +++++++--
 include/net/route.h           |  4 ++++
 net/ipv4/fib_frontend.c       | 30 ++++++++++++++++++++----------
 net/ipv4/fib_semantics.c      | 25 ++++++++++++++++++++-----
 net/ipv4/fib_trie.c           |  7 +++++--
 net/ipv4/icmp.c               |  4 ++++
 net/ipv4/ping.c               |  3 ++-
 net/ipv4/raw.c                |  5 +++--
 net/ipv4/route.c              | 12 ++++++++++--
 net/ipv4/syncookies.c         |  4 +++-
 net/ipv4/tcp_input.c          |  6 +++++-
 net/ipv4/tcp_ipv4.c           |  6 ++++--
 net/ipv4/udp.c                |  2 ++
 14 files changed, 90 insertions(+), 28 deletions(-)

diff --git a/include/net/flow.h b/include/net/flow.h
index 8109a159d1b3..69aaa99fdeb8 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -29,6 +29,7 @@ struct flowi_common {
 	__u8	flowic_flags;
 #define FLOWI_FLAG_ANYSRC		0x01
 #define FLOWI_FLAG_KNOWN_NH		0x02
+#define FLOWI_FLAG_VRFSRC		0x04
 	__u32	flowic_secid;
 };
 
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index b73c88a19dd4..e26c43823a13 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -31,6 +31,7 @@
 #include <net/route.h>
 #include <net/tcp_states.h>
 #include <net/netns/hash.h>
+#include <net/vrf.h>
 
 #include <linux/atomic.h>
 #include <asm/byteorder.h>
@@ -300,10 +301,14 @@ static inline struct sock *__inet_lookup(struct net *net,
 					 struct inet_hashinfo *hashinfo,
 					 const __be32 saddr, const __be16 sport,
 					 const __be32 daddr, const __be16 dport,
-					 const int dif)
+					 int dif)
 {
 	u16 hnum = ntohs(dport);
-	struct sock *sk = __inet_lookup_established(net, hashinfo,
+	struct sock *sk;
+
+	dif = vrf_get_master_dev_idx(net, dif) ? : dif;
+
+	sk = __inet_lookup_established(net, hashinfo,
 				saddr, sport, daddr, hnum, dif);
 
 	return sk ? : __inet_lookup_listener(net, hashinfo, saddr, sport,
diff --git a/include/net/route.h b/include/net/route.h
index fe22d03afb6a..460333bab217 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -188,6 +188,7 @@ void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk);
 void ip_rt_send_redirect(struct sk_buff *skb);
 
 unsigned int inet_addr_type(struct net *net, __be32 addr);
+unsigned int inet_addr_type_table(struct net *net, __be32 addr, int tb_id);
 unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
 				__be32 addr);
 void ip_rt_multicast_event(struct in_device *);
@@ -250,6 +251,9 @@ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, __be32
 	if (inet_sk(sk)->transparent)
 		flow_flags |= FLOWI_FLAG_ANYSRC;
 
+	if (netif_idx_is_vrf(sock_net(sk), oif))
+		flow_flags |= FLOWI_FLAG_VRFSRC;
+
 	flowi4_init_output(fl4, oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE,
 			   protocol, flow_flags, dst, src, dport, sport);
 }
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 974fa51effca..7c73eb058c91 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -45,6 +45,7 @@
 #include <net/ip_fib.h>
 #include <net/rtnetlink.h>
 #include <net/xfrm.h>
+#include <net/vrf.h>
 
 #ifndef CONFIG_IP_MULTIPLE_TABLES
 
@@ -212,7 +213,7 @@ void fib_flush_external(struct net *net)
  */
 static inline unsigned int __inet_dev_addr_type(struct net *net,
 						const struct net_device *dev,
-						__be32 addr)
+						__be32 addr, int rt_table)
 {
 	struct flowi4		fl4 = { .daddr = addr };
 	struct fib_result	res;
@@ -225,8 +226,7 @@ static inline unsigned int __inet_dev_addr_type(struct net *net,
 		return RTN_MULTICAST;
 
 	rcu_read_lock();
-
-	local_table = fib_get_table(net, RT_TABLE_LOCAL);
+	local_table = fib_get_table(net, rt_table);
 	if (local_table) {
 		ret = RTN_UNICAST;
 		if (!fib_table_lookup(local_table, &fl4, &res, FIB_LOOKUP_NOREF)) {
@@ -239,16 +239,24 @@ static inline unsigned int __inet_dev_addr_type(struct net *net,
 	return ret;
 }
 
+unsigned int inet_addr_type_table(struct net *net, __be32 addr, int tb_id)
+{
+	return __inet_dev_addr_type(net, NULL, addr, tb_id);
+}
+EXPORT_SYMBOL(inet_addr_type_table);
+
 unsigned int inet_addr_type(struct net *net, __be32 addr)
 {
-	return __inet_dev_addr_type(net, NULL, addr);
+	return __inet_dev_addr_type(net, NULL, addr, RT_TABLE_LOCAL);
 }
 EXPORT_SYMBOL(inet_addr_type);
 
 unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
 				__be32 addr)
 {
-	return __inet_dev_addr_type(net, dev, addr);
+	int rt_table = vrf_dev_table(dev) ? : RT_TABLE_LOCAL;
+
+	return __inet_dev_addr_type(net, dev, addr, rt_table);
 }
 EXPORT_SYMBOL(inet_dev_addr_type);
 
@@ -309,7 +317,9 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 	bool dev_match;
 
 	fl4.flowi4_oif = 0;
-	fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX;
+	fl4.flowi4_iif = vrf_master_dev_idx(dev);
+	if (!fl4.flowi4_iif)
+		fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX;
 	fl4.daddr = src;
 	fl4.saddr = dst;
 	fl4.flowi4_tos = tos;
@@ -761,6 +771,7 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa)
 {
 	struct net *net = dev_net(ifa->ifa_dev->dev);
+	int tb_id = vrf_dev_table(ifa->ifa_dev->dev);
 	struct fib_table *tb;
 	struct fib_config cfg = {
 		.fc_protocol = RTPROT_KERNEL,
@@ -775,11 +786,10 @@ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifad
 		},
 	};
 
-	if (type == RTN_UNICAST)
-		tb = fib_new_table(net, RT_TABLE_MAIN);
-	else
-		tb = fib_new_table(net, RT_TABLE_LOCAL);
+	if (!tb_id)
+		tb_id = (type == RTN_UNICAST) ? RT_TABLE_MAIN : RT_TABLE_LOCAL;
 
+	tb = fib_new_table(net, tb_id);
 	if (!tb)
 		return;
 
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 3bfccd83551c..3c3e2006ce72 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -760,6 +760,23 @@ __be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh)
 	return nh->nh_saddr;
 }
 
+static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc)
+{
+	if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
+	    fib_prefsrc != cfg->fc_dst) {
+		int tb_id = cfg->fc_table;
+
+		if (tb_id == RT_TABLE_MAIN)
+			tb_id = RT_TABLE_LOCAL;
+
+		if (inet_addr_type_table(cfg->fc_nlinfo.nl_net,
+					 fib_prefsrc, tb_id) != RTN_LOCAL) {
+			return false;
+		}
+	}
+	return true;
+}
+
 struct fib_info *fib_create_info(struct fib_config *cfg)
 {
 	int err;
@@ -940,11 +957,9 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
 			fi->fib_flags |= RTNH_F_LINKDOWN;
 	}
 
-	if (fi->fib_prefsrc) {
-		if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
-		    fi->fib_prefsrc != cfg->fc_dst)
-			if (inet_addr_type(net, fi->fib_prefsrc) != RTN_LOCAL)
-				goto err_inval;
+
+	if (fi->fib_prefsrc && !fib_valid_prefsrc(cfg, fi->fib_prefsrc)) {
+		goto err_inval;
 	}
 
 	change_nexthops(fi) {
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index ac2d828c6daa..7da901c56e35 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1421,8 +1421,11 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
 			    nh->nh_flags & RTNH_F_LINKDOWN &&
 			    !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE))
 				continue;
-			if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
-				continue;
+			if (!(flp->flowi4_flags & FLOWI_FLAG_VRFSRC)) {
+				if (flp->flowi4_oif &&
+				    flp->flowi4_oif != nh->nh_oif)
+					continue;
+			}
 
 			if (!(fib_flags & FIB_LOOKUP_NOREF))
 				atomic_inc(&fi->fib_clntref);
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index f5203fba6236..115d3c1c548f 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -96,6 +96,7 @@
 #include <net/xfrm.h>
 #include <net/inet_common.h>
 #include <net/ip_fib.h>
+#include <net/vrf.h>
 
 /*
  *	Build xmit assembly blocks
@@ -425,6 +426,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 	fl4.flowi4_mark = mark;
 	fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
 	fl4.flowi4_proto = IPPROTO_ICMP;
+	fl4.flowi4_oif = vrf_master_dev_idx(skb->dev) ? : skb->dev->ifindex;
 	security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
 	rt = ip_route_output_key(net, &fl4);
 	if (IS_ERR(rt))
@@ -458,6 +460,8 @@ static struct rtable *icmp_route_lookup(struct net *net,
 	fl4->flowi4_proto = IPPROTO_ICMP;
 	fl4->fl4_icmp_type = type;
 	fl4->fl4_icmp_code = code;
+	fl4->flowi4_oif = vrf_master_dev_idx(skb_in->dev) ? : skb_in->dev->ifindex;
+
 	security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4));
 	rt = __ip_route_output_key(net, fl4);
 	if (IS_ERR(rt))
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 05ff44b758df..685fada659f5 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -44,6 +44,7 @@
 #include <net/route.h>
 #include <net/inet_common.h>
 #include <net/checksum.h>
+#include <net/vrf.h>
 
 #if IS_ENABLED(CONFIG_IPV6)
 #include <linux/in6.h>
@@ -174,7 +175,7 @@ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident)
 	struct sock *sk = NULL;
 	struct inet_sock *isk;
 	struct hlist_nulls_node *hnode;
-	int dif = skb->dev->ifindex;
+	int dif = vrf_master_dev_idx(skb->dev) ? : skb->dev->ifindex;
 
 	if (skb->protocol == htons(ETH_P_IP)) {
 		pr_debug("try to find: num = %d, daddr = %pI4, dif = %d\n",
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 561cd4b8fc6e..95ef2834533d 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -72,6 +72,7 @@
 #include <net/inet_common.h>
 #include <net/checksum.h>
 #include <net/xfrm.h>
+#include <net/vrf.h>
 #include <linux/rtnetlink.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
@@ -171,6 +172,7 @@ static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash)
 	struct hlist_head *head;
 	int delivered = 0;
 	struct net *net;
+	int idx = vrf_master_dev_idx(skb->dev) ? : skb->dev->ifindex;
 
 	read_lock(&raw_v4_hashinfo.lock);
 	head = &raw_v4_hashinfo.ht[hash];
@@ -179,8 +181,7 @@ static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash)
 
 	net = dev_net(skb->dev);
 	sk = __raw_v4_lookup(net, __sk_head(head), iph->protocol,
-			     iph->saddr, iph->daddr,
-			     skb->dev->ifindex);
+			     iph->saddr, iph->daddr, idx);
 
 	while (sk) {
 		delivered = 1;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index d0362a2de3d3..c66fdeb3a101 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -109,6 +109,7 @@
 #include <linux/kmemleak.h>
 #endif
 #include <net/secure_seq.h>
+#include <net/vrf.h>
 
 #define RT_FL_TOS(oldflp4) \
 	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
@@ -1710,7 +1711,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	 *	Now we are ready to route packet.
 	 */
 	fl4.flowi4_oif = 0;
-	fl4.flowi4_iif = dev->ifindex;
+	fl4.flowi4_iif = vrf_master_dev_idx(dev) ? : dev->ifindex;
 	fl4.flowi4_mark = skb->mark;
 	fl4.flowi4_tos = tos;
 	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
@@ -2089,6 +2090,9 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
 		if (!dev_out)
 			goto out;
 
+		if (netif_is_vrf(dev_out))
+			fl4->flowi4_flags |= FLOWI_FLAG_VRFSRC;
+
 		/* RACE: Check return value of inet_select_addr instead. */
 		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
 			rth = ERR_PTR(-ENETUNREACH);
@@ -2273,8 +2277,12 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
 				    struct sock *sk)
 {
-	struct rtable *rt = __ip_route_output_key(net, flp4);
+	struct rtable *rt;
+
+	if (netif_idx_is_vrf(net, flp4->flowi4_oif))
+		flp4->flowi4_flags |= FLOWI_FLAG_VRFSRC;
 
+	rt = __ip_route_output_key(net, flp4);
 	if (IS_ERR(rt))
 		return rt;
 
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index d70b1f603692..120f4406ba7a 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -348,7 +348,9 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
 	treq->snt_synack	= tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0;
 	treq->tfo_listener	= false;
 
-	ireq->ir_iif = sk->sk_bound_dev_if;
+	ireq->ir_iif = vrf_get_master_dev_idx(sock_net(sk), skb->skb_iif);
+	if (!ireq->ir_iif)
+		ireq->ir_iif = sk->sk_bound_dev_if;
 
 	/* We throwed the options of the initial SYN away, so we hope
 	 * the ACK carries the same options again (see RFC1122 4.2.3.8)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 684f095d196e..3018b4f795eb 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -72,6 +72,7 @@
 #include <net/dst.h>
 #include <net/tcp.h>
 #include <net/inet_common.h>
+#include <net/vrf.h>
 #include <linux/ipsec.h>
 #include <asm/unaligned.h>
 #include <linux/errqueue.h>
@@ -6138,7 +6139,10 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 	tcp_openreq_init(req, &tmp_opt, skb, sk);
 
 	/* Note: tcp_v6_init_req() might override ir_iif for link locals */
-	inet_rsk(req)->ir_iif = sk->sk_bound_dev_if;
+	inet_rsk(req)->ir_iif = vrf_get_master_dev_idx(sock_net(sk),
+						       skb->skb_iif);
+	if (!inet_rsk(req)->ir_iif)
+		inet_rsk(req)->ir_iif = sk->sk_bound_dev_if;
 
 	af_ops->init_req(req, sk, skb);
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index d7d4c2b79cf2..c03e28477275 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -682,6 +682,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
 	 */
 	if (sk)
 		arg.bound_dev_if = sk->sk_bound_dev_if;
+	if (!arg.bound_dev_if)
+		arg.bound_dev_if = vrf_master_dev_idx(skb_dst(skb)->dev);
 
 	arg.tos = ip_hdr(skb)->tos;
 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
@@ -766,8 +768,8 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
 				      ip_hdr(skb)->saddr, /* XXX */
 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
-	if (oif)
-		arg.bound_dev_if = oif;
+	arg.bound_dev_if = oif ? : vrf_master_dev_idx(skb_dst(skb)->dev);
+
 	arg.tos = tos;
 	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 83aa604f9273..cf706d7898a2 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -501,6 +501,8 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 	int score, badness, matches = 0, reuseport = 0;
 	u32 hash = 0;
 
+	dif = vrf_get_master_dev_idx(net, dif) ? : dif;
+
 	rcu_read_lock();
 	if (hslot->count > 10) {
 		hash2 = udp4_portaddr_hash(net, daddr, hnum);
-- 
2.3.2 (Apple Git-55)

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ