[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CALx6S341TKqA+nXDCrzt1oAZ=z6UG36iarBnqV1MpQXYa0axug@mail.gmail.com>
Date:	Sun, 30 Aug 2015 15:48:42 -0700
From:	Tom Herbert <tom@...bertland.com>
To:	Peter Christensen <pch@...bogen.com>
Cc:	Linux Kernel Network Developers <netdev@...r.kernel.org>,
	"David S. Miller" <davem@...emloft.net>,
	Alexey Kuznetsov <kuznet@....inr.ac.ru>,
	James Morris <jmorris@...ei.org>,
	Hideaki YOSHIFUJI <yoshfuji@...ux-ipv6.org>,
	Patrick McHardy <kaber@...sh.net>, linux-api@...r.kernel.org,
	Roopa Prabhu <roopa@...ulusnetworks.com>,
	Scott Feldman <sfeldma@...il.com>,
	"Eric W. Biederman" <ebiederm@...ssion.com>,
	Nicolas Dichtel <nicolas.dichtel@...nd.com>,
	Thomas Graf <tgraf@...g.ch>, Jiri Benc <jbenc@...hat.com>
Subject: Re: [PATCH v2 net-next 2/3] ipv4: L3 and L4 hash-based multipath routing
On Fri, Aug 28, 2015 at 1:00 PM,  <pch@...bogen.com> wrote:
> From: Peter Nørlund <pch@...bogen.com>
>
> This patch adds L3 and L4 hash-based multipath routing, selectable on a
> per-route basis with the reintroduced RTA_MP_ALGO attribute. The default is
> now RT_MP_ALG_L3_HASH.
>
> Signed-off-by: Peter Nørlund <pch@...bogen.com>
> ---
>  include/net/ip_fib.h           | 22 ++++++++++++++++-
>  include/uapi/linux/rtnetlink.h | 14 ++++++++++-
>  net/ipv4/fib_frontend.c        |  4 +++
>  net/ipv4/fib_semantics.c       | 43 +++++++++++++++++++++++++++-----
>  net/ipv4/route.c               | 56 ++++++++++++++++++++++++++++++++++++++++--
>  5 files changed, 129 insertions(+), 10 deletions(-)
>
> diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
> index 18a3c7f..21e74b5 100644
> --- a/include/net/ip_fib.h
> +++ b/include/net/ip_fib.h
> @@ -37,6 +37,7 @@ struct fib_config {
>         u32                     fc_flags;
>         u32                     fc_priority;
>         __be32                  fc_prefsrc;
> +       int                     fc_mp_alg;
>         struct nlattr           *fc_mx;
>         struct rtnexthop        *fc_mp;
>         int                     fc_mx_len;
> @@ -119,6 +120,7 @@ struct fib_info {
>         int                     fib_nhs;
>  #ifdef CONFIG_IP_ROUTE_MULTIPATH
>         int                     fib_weight;
> +       int                     fib_mp_alg;
>  #endif
>         struct rcu_head         rcu;
>         struct fib_nh           fib_nh[0];
> @@ -312,7 +314,25 @@ int ip_fib_check_default(__be32 gw, struct net_device *dev);
>  int fib_sync_down_dev(struct net_device *dev, unsigned long event);
>  int fib_sync_down_addr(struct net *net, __be32 local);
>  int fib_sync_up(struct net_device *dev, unsigned int nh_flags);
> -void fib_select_multipath(struct fib_result *res);
> +
> +struct multipath_flow4 {
Please use flowi4 structure instead.
> +       __be32 saddr;
> +       __be32 daddr;
> +       union {
> +               __be32 ports;
> +               struct {
> +                       __be16 sport;
> +                       __be16 dport;
> +               };
> +       };
> +};
> +
> +typedef void (*multipath_flow4_func_t)(struct multipath_flow4 *flow,
> +                                      void *ctx);
> +
> +void fib_select_multipath(struct fib_result *res,
> +                         multipath_flow4_func_t flow_func,
> +                         void *ctx);
>
>  /* Exported by fib_trie.c */
>  void fib_trie_init(void);
> diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
> index 0d3d3cc..2563a96 100644
> --- a/include/uapi/linux/rtnetlink.h
> +++ b/include/uapi/linux/rtnetlink.h
> @@ -271,6 +271,18 @@ enum rt_scope_t {
>  #define RTM_F_EQUALIZE         0x400   /* Multipath equalizer: NI      */
>  #define RTM_F_PREFIX           0x800   /* Prefix addresses             */
>
> +/* Multipath algorithms */
> +
> +enum rt_mp_alg_t {
> +       RT_MP_ALG_L3_HASH,      /* Was IP_MP_ALG_NONE */
> +       RT_MP_ALG_PER_PACKET,   /* Was IP_MP_ALG_RR */
> +       RT_MP_ALG_DRR,          /* not used */
> +       RT_MP_ALG_RANDOM,       /* not used */
> +       RT_MP_ALG_WRANDOM,      /* not used */
> +       RT_MP_ALG_L4_HASH,
> +       __RT_MP_ALG_MAX
> +};
> +
>  /* Reserved table identifiers */
>
>  enum rt_class_t {
> @@ -301,7 +313,7 @@ enum rtattr_type_t {
>         RTA_FLOW,
>         RTA_CACHEINFO,
>         RTA_SESSION, /* no longer used */
> -       RTA_MP_ALGO, /* no longer used */
> +       RTA_MP_ALGO,
>         RTA_TABLE,
>         RTA_MARK,
>         RTA_MFC_STATS,
> diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
> index 7fa2771..5ba4442 100644
> --- a/net/ipv4/fib_frontend.c
> +++ b/net/ipv4/fib_frontend.c
> @@ -622,6 +622,7 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
>         [RTA_PREFSRC]           = { .type = NLA_U32 },
>         [RTA_METRICS]           = { .type = NLA_NESTED },
>         [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
> +       [RTA_MP_ALGO]           = { .type = NLA_U32 },
>         [RTA_FLOW]              = { .type = NLA_U32 },
>         [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
>         [RTA_ENCAP]             = { .type = NLA_NESTED },
> @@ -684,6 +685,9 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
>                         cfg->fc_mp = nla_data(attr);
>                         cfg->fc_mp_len = nla_len(attr);
>                         break;
> +               case RTA_MP_ALGO:
> +                       cfg->fc_mp_alg = nla_get_u32(attr);
> +                       break;
>                 case RTA_FLOW:
>                         cfg->fc_flow = nla_get_u32(attr);
>                         break;
> diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
> index becb63f..3a80b1a 100644
> --- a/net/ipv4/fib_semantics.c
> +++ b/net/ipv4/fib_semantics.c
> @@ -259,6 +259,11 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
>  {
>         const struct fib_nh *onh = ofi->fib_nh;
>
> +#ifdef CONFIG_IP_ROUTE_MULTIPATH
> +       if (fi->fib_mp_alg != ofi->fib_mp_alg)
> +               return -1;
> +#endif
> +
>         for_nexthops(fi) {
>                 if (nh->nh_oif != onh->nh_oif ||
>                     nh->nh_gw  != onh->nh_gw ||
> @@ -1028,6 +1033,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
>
>         if (cfg->fc_mp) {
>  #ifdef CONFIG_IP_ROUTE_MULTIPATH
> +               fi->fib_mp_alg = cfg->fc_mp_alg;
>                 err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
>                 if (err != 0)
>                         goto failure;
> @@ -1245,6 +1251,10 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
>                 struct rtnexthop *rtnh;
>                 struct nlattr *mp;
>
> +               if (fi->fib_mp_alg &&
> +                   nla_put_u32(skb, RTA_MP_ALGO, fi->fib_mp_alg))
> +                       goto nla_put_failure;
> +
>                 mp = nla_nest_start(skb, RTA_MULTIPATH);
>                 if (!mp)
>                         goto nla_put_failure;
> @@ -1520,16 +1530,37 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags)
>
>  #ifdef CONFIG_IP_ROUTE_MULTIPATH
>
> -/*
> - * The algorithm is suboptimal, but it provides really
> - * fair weighted route distribution.
> - */
> -void fib_select_multipath(struct fib_result *res)
> +/* Compute multipath hash based on 3- or 5-tuple */
> +static int fib_multipath_hash(const struct fib_result *res,
> +                             multipath_flow4_func_t flow_func, void *ctx)
> +{
> +       struct multipath_flow4 flow;
> +
> +       flow_func(&flow, ctx);
> +
> +       if (res->fi->fib_mp_alg == RT_MP_ALG_L4_HASH)
> +               return jhash_3words(flow.saddr, flow.daddr, flow.ports, 0) >> 1;
> +       else
> +               return jhash_2words(flow.saddr, flow.daddr, 0) >> 1;
Please define/use general functions. As I pointed out an L4 hash may
already be available via skb_get_hash or sk->tx_hash. For L3 we could
add a function that does flow dissector to only get addresses and then
hash over that. Also, jhash really should include a randomized key to
avoid any concerns about DOS.
> +}
> +
> +static int fib_multipath_perpacket(void)
> +{
> +       return bitrev32(this_cpu_inc_return(fib_multipath_rr_counter)) >> 1;
> +}
> +
> +void fib_select_multipath(struct fib_result *res,
> +                         multipath_flow4_func_t flow_func,
> +                         void *ctx)
>  {
>         struct fib_info *fi = res->fi;
>         int h;
>
> -       h = bitrev32(this_cpu_inc_return(fib_multipath_rr_counter)) >> 1;
> +       if (res->fi->fib_mp_alg == RT_MP_ALG_PER_PACKET) {
> +               h = fib_multipath_perpacket();
> +       } else {
> +               h = fib_multipath_hash(res, flow_func, ctx);
> +       }
>
>         for_nexthops(fi) {
>                 if (h > atomic_read(&nh->nh_upper_bound))
> diff --git a/net/ipv4/route.c b/net/ipv4/route.c
> index f3087aa..f50f84f 100644
> --- a/net/ipv4/route.c
> +++ b/net/ipv4/route.c
> @@ -1643,6 +1643,58 @@ out:
>         return err;
>  }
>
> +#ifdef CONFIG_IP_ROUTE_MULTIPATH
> +
> +/* Fill multipath flow key data based on socket buffer */
> +static void ip_multipath_flow_skb(struct multipath_flow4 *flow, void *ctx)
Please use flow_dissector for this, don't define a new dissection method.
> +{
> +       const struct sk_buff *skb = (const struct sk_buff *)ctx;
> +       const struct iphdr *iph;
> +
> +       iph = ip_hdr(skb);
> +
> +       flow->saddr = iph->saddr;
> +       flow->daddr = iph->daddr;
> +       flow->ports = 0;
> +
> +       if (unlikely(!(iph->frag_off & htons(IP_DF))))
> +               return;
> +
> +       if (iph->protocol == IPPROTO_TCP ||
> +           iph->protocol == IPPROTO_UDP ||
> +           iph->protocol == IPPROTO_SCTP) {
> +               __be16 _ports[2];
> +               const __be16 *ports;
> +
> +               ports = skb_header_pointer(skb, iph->ihl * 4, sizeof(_ports),
> +                                          &_ports);
> +               if (ports) {
> +                       flow->sport = ports[0];
> +                       flow->dport = ports[1];
> +               }
> +       }
> +}
> +
> +/* Fill multipath flow key data based on flowi4  */
> +static void ip_multipath_flow_fl4(struct multipath_flow4 *flow, void *ctx)
> +{
> +       const struct flowi4 *fl4 = (const struct flowi4 *)ctx;
> +
> +       flow->saddr = fl4->saddr;
> +       flow->daddr = fl4->daddr;
> +
> +       if (fl4->flowi4_proto == IPPROTO_TCP ||
> +           fl4->flowi4_proto == IPPROTO_UDP ||
> +           fl4->flowi4_proto == IPPROTO_SCTP) {
> +               flow->sport = fl4->fl4_sport;
> +               flow->dport = fl4->fl4_dport;
> +       } else {
> +               flow->ports = 0;
> +       }
> +}
> +
> +#endif /* CONFIG_IP_ROUTE_MULTIPATH */
> +
>  static int ip_mkroute_input(struct sk_buff *skb,
>                             struct fib_result *res,
>                             const struct flowi4 *fl4,
> @@ -1651,7 +1703,7 @@ static int ip_mkroute_input(struct sk_buff *skb,
>  {
>  #ifdef CONFIG_IP_ROUTE_MULTIPATH
>         if (res->fi && res->fi->fib_nhs > 1)
> -               fib_select_multipath(res);
> +               fib_select_multipath(res, ip_multipath_flow_skb, skb);
>  #endif
>
>         /* create a routing cache entry */
> @@ -2197,7 +2249,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
>
>  #ifdef CONFIG_IP_ROUTE_MULTIPATH
>         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
> -               fib_select_multipath(&res);
> +               fib_select_multipath(&res, ip_multipath_flow_fl4, fl4);
>         else
>  #endif
>         if (!res.prefixlen &&
> --
> 2.1.4
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@...r.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists
 
