[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1440792050-2109-3-git-send-email-pch@ordbogen.com>
Date: Fri, 28 Aug 2015 22:00:49 +0200
From: pch@...bogen.com
To: netdev@...r.kernel.org
Cc: "David S. Miller" <davem@...emloft.net>,
Alexey Kuznetsov <kuznet@....inr.ac.ru>,
James Morris <jmorris@...ei.org>,
Hideaki YOSHIFUJI <yoshfuji@...ux-ipv6.org>,
Patrick McHardy <kaber@...sh.net>, linux-api@...r.kernel.org,
Roopa Prabhu <roopa@...ulusnetworks.com>,
Scott Feldman <sfeldma@...il.com>,
"Eric W. Biederman" <ebiederm@...ssion.com>,
Nicolas Dichtel <nicolas.dichtel@...nd.com>,
Thomas Graf <tgraf@...g.ch>, Jiri Benc <jbenc@...hat.com>,
Peter Nørlund <pch@...bogen.com>
Subject: [PATCH v2 net-next 2/3] ipv4: L3 and L4 hash-based multipath routing
From: Peter Nørlund <pch@...bogen.com>
This patch adds L3 and L4 hash-based multipath routing, selectable on a
per-route basis with the reintroduced RTA_MP_ALGO attribute. The default is
now RT_MP_ALG_L3_HASH; the previous per-packet distribution remains
available as RT_MP_ALG_PER_PACKET.
Signed-off-by: Peter Nørlund <pch@...bogen.com>
---
include/net/ip_fib.h | 22 ++++++++++++++++-
include/uapi/linux/rtnetlink.h | 14 ++++++++++-
net/ipv4/fib_frontend.c | 4 +++
net/ipv4/fib_semantics.c | 43 +++++++++++++++++++++++++++-----
net/ipv4/route.c | 56 ++++++++++++++++++++++++++++++++++++++++--
5 files changed, 129 insertions(+), 10 deletions(-)
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 18a3c7f..21e74b5 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -37,6 +37,7 @@ struct fib_config {
u32 fc_flags;
u32 fc_priority;
__be32 fc_prefsrc;
+ int fc_mp_alg;
struct nlattr *fc_mx;
struct rtnexthop *fc_mp;
int fc_mx_len;
@@ -119,6 +120,7 @@ struct fib_info {
int fib_nhs;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
int fib_weight;
+ int fib_mp_alg;
#endif
struct rcu_head rcu;
struct fib_nh fib_nh[0];
@@ -312,7 +314,25 @@ int ip_fib_check_default(__be32 gw, struct net_device *dev);
int fib_sync_down_dev(struct net_device *dev, unsigned long event);
int fib_sync_down_addr(struct net *net, __be32 local);
int fib_sync_up(struct net_device *dev, unsigned int nh_flags);
-void fib_select_multipath(struct fib_result *res);
+
+struct multipath_flow4 {
+ __be32 saddr;
+ __be32 daddr;
+ union {
+ __be32 ports;
+ struct {
+ __be16 sport;
+ __be16 dport;
+ };
+ };
+};
+
+typedef void (*multipath_flow4_func_t)(struct multipath_flow4 *flow,
+ void *ctx);
+
+void fib_select_multipath(struct fib_result *res,
+ multipath_flow4_func_t flow_func,
+ void *ctx);
/* Exported by fib_trie.c */
void fib_trie_init(void);
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 0d3d3cc..2563a96 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -271,6 +271,18 @@ enum rt_scope_t {
#define RTM_F_EQUALIZE 0x400 /* Multipath equalizer: NI */
#define RTM_F_PREFIX 0x800 /* Prefix addresses */
+/* Multipath algorithms */
+
+enum rt_mp_alg_t {
+ RT_MP_ALG_L3_HASH, /* Was IP_MP_ALG_NONE */
+ RT_MP_ALG_PER_PACKET, /* Was IP_MP_ALG_RR */
+ RT_MP_ALG_DRR, /* not used */
+ RT_MP_ALG_RANDOM, /* not used */
+ RT_MP_ALG_WRANDOM, /* not used */
+ RT_MP_ALG_L4_HASH,
+ __RT_MP_ALG_MAX
+};
+
/* Reserved table identifiers */
enum rt_class_t {
@@ -301,7 +313,7 @@ enum rtattr_type_t {
RTA_FLOW,
RTA_CACHEINFO,
RTA_SESSION, /* no longer used */
- RTA_MP_ALGO, /* no longer used */
+ RTA_MP_ALGO,
RTA_TABLE,
RTA_MARK,
RTA_MFC_STATS,
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 7fa2771..5ba4442 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -622,6 +622,7 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
[RTA_PREFSRC] = { .type = NLA_U32 },
[RTA_METRICS] = { .type = NLA_NESTED },
[RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
+ [RTA_MP_ALGO] = { .type = NLA_U32 },
[RTA_FLOW] = { .type = NLA_U32 },
[RTA_ENCAP_TYPE] = { .type = NLA_U16 },
[RTA_ENCAP] = { .type = NLA_NESTED },
@@ -684,6 +685,9 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
cfg->fc_mp = nla_data(attr);
cfg->fc_mp_len = nla_len(attr);
break;
+ case RTA_MP_ALGO:
+ cfg->fc_mp_alg = nla_get_u32(attr);
+ break;
case RTA_FLOW:
cfg->fc_flow = nla_get_u32(attr);
break;
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index becb63f..3a80b1a 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -259,6 +259,11 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
{
const struct fib_nh *onh = ofi->fib_nh;
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ if (fi->fib_mp_alg != ofi->fib_mp_alg)
+ return -1;
+#endif
+
for_nexthops(fi) {
if (nh->nh_oif != onh->nh_oif ||
nh->nh_gw != onh->nh_gw ||
@@ -1028,6 +1033,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
if (cfg->fc_mp) {
#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ fi->fib_mp_alg = cfg->fc_mp_alg;
err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
if (err != 0)
goto failure;
@@ -1245,6 +1251,10 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
struct rtnexthop *rtnh;
struct nlattr *mp;
+ if (fi->fib_mp_alg &&
+ nla_put_u32(skb, RTA_MP_ALGO, fi->fib_mp_alg))
+ goto nla_put_failure;
+
mp = nla_nest_start(skb, RTA_MULTIPATH);
if (!mp)
goto nla_put_failure;
@@ -1520,16 +1530,37 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags)
#ifdef CONFIG_IP_ROUTE_MULTIPATH
-/*
- * The algorithm is suboptimal, but it provides really
- * fair weighted route distribution.
- */
-void fib_select_multipath(struct fib_result *res)
+/* Compute multipath hash from addresses (L3) or addresses and ports (L4) */
+static int fib_multipath_hash(const struct fib_result *res,
+ multipath_flow4_func_t flow_func, void *ctx)
+{
+ struct multipath_flow4 flow;
+
+ flow_func(&flow, ctx);
+
+ if (res->fi->fib_mp_alg == RT_MP_ALG_L4_HASH)
+ return jhash_3words(flow.saddr, flow.daddr, flow.ports, 0) >> 1;
+ else
+ return jhash_2words(flow.saddr, flow.daddr, 0) >> 1;
+}
+
+static int fib_multipath_perpacket(void)
+{
+ return bitrev32(this_cpu_inc_return(fib_multipath_rr_counter)) >> 1;
+}
+
+void fib_select_multipath(struct fib_result *res,
+ multipath_flow4_func_t flow_func,
+ void *ctx)
{
struct fib_info *fi = res->fi;
int h;
- h = bitrev32(this_cpu_inc_return(fib_multipath_rr_counter)) >> 1;
+ if (res->fi->fib_mp_alg == RT_MP_ALG_PER_PACKET) {
+ h = fib_multipath_perpacket();
+ } else {
+ h = fib_multipath_hash(res, flow_func, ctx);
+ }
for_nexthops(fi) {
if (h > atomic_read(&nh->nh_upper_bound))
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index f3087aa..f50f84f 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1643,6 +1643,58 @@ out:
return err;
}
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+
+/* Fill multipath flow key from skb; ports only if DF set and TCP/UDP/SCTP */
+static void ip_multipath_flow_skb(struct multipath_flow4 *flow, void *ctx)
+{
+ const struct sk_buff *skb = (const struct sk_buff *)ctx;
+ const struct iphdr *iph;
+
+ iph = ip_hdr(skb);
+
+ flow->saddr = iph->saddr;
+ flow->daddr = iph->daddr;
+ flow->ports = 0;
+
+ if (unlikely(!(iph->frag_off & htons(IP_DF))))
+ return;
+
+ if (iph->protocol == IPPROTO_TCP ||
+ iph->protocol == IPPROTO_UDP ||
+ iph->protocol == IPPROTO_SCTP) {
+ __be16 _ports[2];
+ const __be16 *ports;
+
+ ports = skb_header_pointer(skb, iph->ihl * 4, sizeof(_ports),
+ &_ports);
+ if (ports) {
+ flow->sport = ports[0];
+ flow->dport = ports[1];
+ }
+ }
+}
+
+/* Fill multipath flow key from flowi4; ports only for TCP/UDP/SCTP flows */
+static void ip_multipath_flow_fl4(struct multipath_flow4 *flow, void *ctx)
+{
+ const struct flowi4 *fl4 = (const struct flowi4 *)ctx;
+
+ flow->saddr = fl4->saddr;
+ flow->daddr = fl4->daddr;
+
+ if (fl4->flowi4_proto == IPPROTO_TCP ||
+ fl4->flowi4_proto == IPPROTO_UDP ||
+ fl4->flowi4_proto == IPPROTO_SCTP) {
+ flow->sport = fl4->fl4_sport;
+ flow->dport = fl4->fl4_dport;
+ } else {
+ flow->ports = 0;
+ }
+}
+
+#endif /* CONFIG_IP_ROUTE_MULTIPATH */
+
static int ip_mkroute_input(struct sk_buff *skb,
struct fib_result *res,
const struct flowi4 *fl4,
@@ -1651,7 +1703,7 @@ static int ip_mkroute_input(struct sk_buff *skb,
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
if (res->fi && res->fi->fib_nhs > 1)
- fib_select_multipath(res);
+ fib_select_multipath(res, ip_multipath_flow_skb, skb);
#endif
/* create a routing cache entry */
@@ -2197,7 +2249,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
#ifdef CONFIG_IP_ROUTE_MULTIPATH
if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
- fib_select_multipath(&res);
+ fib_select_multipath(&res, ip_multipath_flow_fl4, fl4);
else
#endif
if (!res.prefixlen &&
--
2.1.4
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists