[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <556DD368.8070806@cumulusnetworks.com>
Date: Tue, 02 Jun 2015 09:01:44 -0700
From: roopa <roopa@...ulusnetworks.com>
To: Robert Shearman <rshearma@...cade.com>
CC: netdev@...r.kernel.org,
"Eric W. Biederman" <ebiederm@...ssion.com>,
Thomas Graf <tgraf@...g.ch>,
Dinesh Dutt <ddutt@...ulusnetworks.com>,
Vivek Venkatraman <vivek@...ulusnetworks.com>
Subject: Re: [RFC net-next 2/3] ipv4: storing and retrieval of per-nexthop
encap
On 6/1/15, 9:46 AM, Robert Shearman wrote:
> Parse RTA_ENCAP attribute for one path and multipath routes. The encap
> length is stored in a newly added field to fib_nh, nh_encap_len,
> although this is added to a padding hole in the structure so that it
> doesn't increase the size at all. The encap data itself is stored at
> the end of the array of nexthops. Whilst this means that retrieval
> isn't optimal, especially if there are multiple nexthops, this avoids
> the memory cost of an extra pointer, as well as any potential change
> to the cache or instruction layout that could cause a performance
> impact.
>
> Currently, the dst structure allocated to represent the destination of
> the packet and used for retrieving the encap by the encap-type
> interface has been grown through the addition of the rt_encap_len and
> rt_encap fields. This isn't desirable and could be fixed by defining a
> new destination type with operations copied from the normal case,
> other than the addition of the get_encap operation.
>
> Signed-off-by: Robert Shearman <rshearma@...cade.com>
> ---
> include/net/ip_fib.h | 2 +
> include/net/route.h | 3 +
> net/ipv4/fib_frontend.c | 3 +
> net/ipv4/fib_lookup.h | 2 +
> net/ipv4/fib_semantics.c | 179 ++++++++++++++++++++++++++++++++++++++++++++++-
> net/ipv4/route.c | 24 +++++++
> 6 files changed, 211 insertions(+), 2 deletions(-)
>
> diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
> index 54271ed0ed45..a06cec5eb3aa 100644
> --- a/include/net/ip_fib.h
> +++ b/include/net/ip_fib.h
> @@ -44,6 +44,7 @@ struct fib_config {
> u32 fc_flow;
> u32 fc_nlflags;
> struct nl_info fc_nlinfo;
> + struct nlattr *fc_encap;
> };
>
> struct fib_info;
> @@ -75,6 +76,7 @@ struct fib_nh {
> struct fib_info *nh_parent;
> unsigned int nh_flags;
> unsigned char nh_scope;
> + unsigned char nh_encap_len;
> #ifdef CONFIG_IP_ROUTE_MULTIPATH
> int nh_weight;
> int nh_power;
> diff --git a/include/net/route.h b/include/net/route.h
> index fe22d03afb6a..e8b58914c4c1 100644
> --- a/include/net/route.h
> +++ b/include/net/route.h
> @@ -64,6 +64,9 @@ struct rtable {
> /* Miscellaneous cached information */
> u32 rt_pmtu;
>
> + unsigned int rt_encap_len;
> + void *rt_encap;
> +
> struct list_head rt_uncached;
> struct uncached_list *rt_uncached_list;
> };
> diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
> index 872494e6e6eb..aa538ab7e3b9 100644
> --- a/net/ipv4/fib_frontend.c
> +++ b/net/ipv4/fib_frontend.c
> @@ -656,6 +656,9 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
> case RTA_TABLE:
> cfg->fc_table = nla_get_u32(attr);
> break;
> + case RTA_ENCAP:
> + cfg->fc_encap = attr;
> + break;
> }
> }
>
> diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
> index c6211ed60b03..003318c51ae8 100644
> --- a/net/ipv4/fib_lookup.h
> +++ b/net/ipv4/fib_lookup.h
> @@ -34,6 +34,8 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, u32 tb_id,
> unsigned int);
> void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len,
> u32 tb_id, const struct nl_info *info, unsigned int nlm_flags);
> +const void *fib_get_nh_encap(const struct fib_info *fi,
> + const struct fib_nh *nh);
>
> static inline void fib_result_assign(struct fib_result *res,
> struct fib_info *fi)
> diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
> index 28ec3c1823bf..db466b636241 100644
> --- a/net/ipv4/fib_semantics.c
> +++ b/net/ipv4/fib_semantics.c
> @@ -257,6 +257,9 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
> const struct fib_nh *onh = ofi->fib_nh;
>
> for_nexthops(fi) {
> + const void *onh_encap = fib_get_nh_encap(fi, nh);
> + const void *nh_encap = fib_get_nh_encap(fi, nh);
> +
> if (nh->nh_oif != onh->nh_oif ||
> nh->nh_gw != onh->nh_gw ||
> nh->nh_scope != onh->nh_scope ||
> @@ -266,7 +269,10 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
> #ifdef CONFIG_IP_ROUTE_CLASSID
> nh->nh_tclassid != onh->nh_tclassid ||
> #endif
> - ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD))
> + ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD) ||
> + nh->nh_encap_len != onh->nh_encap_len ||
> + memcmp(nh_encap, onh_encap, nh->nh_encap_len)
> + )
> return -1;
> onh++;
> } endfor_nexthops(fi);
> @@ -374,6 +380,11 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi)
> /* may contain flow and gateway attribute */
> nhsize += 2 * nla_total_size(4);
>
> + for_nexthops(fi) {
> + if (nh->nh_encap_len)
> + nhsize += nla_total_size(nh->nh_encap_len);
> + } endfor_nexthops(fi);
> +
> /* all nexthops are packed in a nested attribute */
> payload += nla_total_size(fi->fib_nhs * nhsize);
> }
> @@ -434,6 +445,83 @@ static int fib_detect_death(struct fib_info *fi, int order,
> return 1;
> }
>
> +static int fib_total_encap(struct fib_config *cfg)
> +{
> + struct net *net = cfg->fc_nlinfo.nl_net;
> + int total_encap_len = 0;
> +
> + if (cfg->fc_mp) {
> + int remaining = cfg->fc_mp_len;
> + struct rtnexthop *rtnh = cfg->fc_mp;
> +
> + while (rtnh_ok(rtnh, remaining)) {
> + struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
> + int attrlen;
> +
> + attrlen = rtnh_attrlen(rtnh);
> + nla = nla_find(attrs, attrlen, RTA_ENCAP);
> + if (nla) {
> + struct net_device *dev;
> + int len;
> +
> + dev = __dev_get_by_index(net,
> + rtnh->rtnh_ifindex);
> + if (!dev)
> + return -EINVAL;
> +
> + /* Determine space required */
> + len = rtnl_parse_encap(dev, nla, NULL);
> + if (len < 0)
> + return len;
> +
> + total_encap_len += len;
> + }
> +
> + rtnh = rtnh_next(rtnh, &remaining);
> + }
> + } else {
> + if (cfg->fc_encap) {
> + struct net_device *dev;
> + int len;
> +
> + dev = __dev_get_by_index(net, cfg->fc_oif);
> + if (!dev)
> + return -EINVAL;
> +
> + /* Determine space required */
> + len = rtnl_parse_encap(dev, cfg->fc_encap, NULL);
> + if (len < 0)
> + return len;
> +
> + total_encap_len += len;
> + }
> + }
> +
> + return total_encap_len;
> +}
we could avoid parsing and finding this device twice, if fib_nh just
held a pointer to the encap_info
(or tunnel info) ?. And the encap_info/tun_info could be refcounted and
shared between
nexthops ?. In my implementation i have just a pointer to parsed encap state
in fib_nh
> +
> +static void *__fib_get_nh_encap(const struct fib_info *fi,
> + const struct fib_nh *the_nh)
> +{
> + char *cur_encap_ptr = (char *)(fi->fib_nh + fi->fib_nhs);
> +
> + for_nexthops(fi) {
> + if (nh == the_nh)
> + return cur_encap_ptr;
> + cur_encap_ptr += nh->nh_encap_len;
> + } endfor_nexthops(fi);
> +
> + return NULL;
> +}
> +
> +const void *fib_get_nh_encap(const struct fib_info *fi, const struct fib_nh *nh)
> +{
> + if (!nh->nh_encap_len)
> + return NULL;
> +
> + return __fib_get_nh_encap(fi, nh);
> +}
> +
> #ifdef CONFIG_IP_ROUTE_MULTIPATH
>
> static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
> @@ -475,6 +563,26 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
> if (nexthop_nh->nh_tclassid)
> fi->fib_net->ipv4.fib_num_tclassid_users++;
> #endif
> + nla = nla_find(attrs, attrlen, RTA_ENCAP);
> + if (nla) {
> + struct net *net = cfg->fc_nlinfo.nl_net;
> + struct net_device *dev;
> + void *nh_encap;
> + int len;
> +
> + dev = __dev_get_by_index(net,
> + nexthop_nh->nh_oif);
> + if (!dev)
> + return -EINVAL;
> +
> + nh_encap = __fib_get_nh_encap(fi, nexthop_nh);
> +
> + /* Fill in nh encap */
> + len = rtnl_parse_encap(dev, nla, nh_encap);
> + if (len < 0)
> + return len;
> + nexthop_nh->nh_encap_len = len;
> + }
> }
>
> rtnh = rtnh_next(rtnh, &remaining);
> @@ -495,6 +603,17 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
> if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
> return 1;
>
> + if (cfg->fc_encap) {
> + const void *nh_encap = fib_get_nh_encap(fi, fi->fib_nh);
> +
> + if (!fi->fib_nh->nh_oif ||
> + rtnl_match_encap(fi->fib_nh->nh_dev,
> + cfg->fc_encap,
> + fi->fib_nh->nh_encap_len,
> + nh_encap))
> + return 1;
> + }
> +
> if (cfg->fc_oif || cfg->fc_gw) {
> if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
> (!cfg->fc_gw || cfg->fc_gw == fi->fib_nh->nh_gw))
> @@ -530,6 +649,17 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
> if (nla && nla_get_u32(nla) != nh->nh_tclassid)
> return 1;
> #endif
> + nla = nla_find(attrs, attrlen, RTA_ENCAP);
> + if (nla) {
> + const void *nh_encap = fib_get_nh_encap(fi, nh);
> +
> + if (!nh->nh_oif ||
> + rtnl_match_encap(nh->nh_dev,
> + cfg->fc_encap,
> + nh->nh_encap_len,
> + nh_encap))
> + return 1;
> + }
> }
>
> rtnh = rtnh_next(rtnh, &remaining);
> @@ -760,6 +890,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
> struct fib_info *ofi;
> int nhs = 1;
> struct net *net = cfg->fc_nlinfo.nl_net;
> + int encap_len;
>
> if (cfg->fc_type > RTN_MAX)
> goto err_inval;
> @@ -798,7 +929,14 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
> goto failure;
> }
>
> - fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
> + encap_len = fib_total_encap(cfg);
> + if (encap_len < 0) {
> + err = encap_len;
> + goto failure;
> + }
> +
> + fi = kzalloc(sizeof(*fi) + nhs * sizeof(struct fib_nh) + encap_len,
> + GFP_KERNEL);
> if (!fi)
> goto failure;
> fib_info_cnt++;
> @@ -886,6 +1024,26 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
> #ifdef CONFIG_IP_ROUTE_MULTIPATH
> nh->nh_weight = 1;
> #endif
> + if (cfg->fc_encap) {
> + struct net_device *dev;
> + void *nh_encap;
> + int len;
> +
> + err = -EINVAL;
> + dev = __dev_get_by_index(net, nh->nh_oif);
> + if (!dev)
> + goto failure;
> +
> + nh_encap = __fib_get_nh_encap(fi, nh);
> +
> + /* Fill in nh encap */
> + len = rtnl_parse_encap(dev, cfg->fc_encap, nh_encap);
> + if (len < 0 || len > sizeof(nh->nh_encap_len) * 8) {
> + err = len;
> + goto failure;
> + }
> + nh->nh_encap_len = len;
> + }
> }
>
> if (fib_props[cfg->fc_type].error) {
> @@ -1023,6 +1181,8 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
> nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc))
> goto nla_put_failure;
> if (fi->fib_nhs == 1) {
> + const void *nh_encap;
> +
> if (fi->fib_nh->nh_gw &&
> nla_put_in_addr(skb, RTA_GATEWAY, fi->fib_nh->nh_gw))
> goto nla_put_failure;
> @@ -1034,6 +1194,12 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
> nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid))
> goto nla_put_failure;
> #endif
> +
> + nh_encap = fib_get_nh_encap(fi, &fi->fib_nh[0]);
> + if (nh_encap && rtnl_fill_encap(fi->fib_nh[0].nh_dev, skb,
> + fi->fib_nh[0].nh_encap_len,
> + nh_encap))
> + goto nla_put_failure;
> }
> #ifdef CONFIG_IP_ROUTE_MULTIPATH
> if (fi->fib_nhs > 1) {
> @@ -1045,6 +1211,8 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
> goto nla_put_failure;
>
> for_nexthops(fi) {
> + const void *nh_encap;
> +
> rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
> if (!rtnh)
> goto nla_put_failure;
> @@ -1061,6 +1229,13 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
> nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid))
> goto nla_put_failure;
> #endif
> +
> + nh_encap = fib_get_nh_encap(fi, nh);
> + if (nh_encap && rtnl_fill_encap(nh->nh_dev, skb,
> + nh->nh_encap_len,
> + nh_encap))
> + goto nla_put_failure;
> +
> /* length of rtnetlink header + attributes */
> rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
> } endfor_nexthops(fi);
> diff --git a/net/ipv4/route.c b/net/ipv4/route.c
> index f6055984c307..d52fa3d168a5 100644
> --- a/net/ipv4/route.c
> +++ b/net/ipv4/route.c
> @@ -110,6 +110,8 @@
> #endif
> #include <net/secure_seq.h>
>
> +#include "fib_lookup.h"
> +
> #define RT_FL_TOS(oldflp4) \
> ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
>
> @@ -138,6 +140,8 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
> struct sk_buff *skb, u32 mtu);
> static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
> struct sk_buff *skb);
> +static int ipv4_dst_get_encap(const struct dst_entry *dst,
> + const void **encap);
> static void ipv4_dst_destroy(struct dst_entry *dst);
>
> static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
> @@ -163,6 +167,7 @@ static struct dst_ops ipv4_dst_ops = {
> .redirect = ip_do_redirect,
> .local_out = __ip_local_out,
> .neigh_lookup = ipv4_neigh_lookup,
> + .get_encap = ipv4_dst_get_encap,
> };
>
> #define ECN_OR_COST(class) TC_PRIO_##class
> @@ -1145,6 +1150,15 @@ static void ipv4_link_failure(struct sk_buff *skb)
> dst_set_expires(&rt->dst, 0);
> }
>
> +static int ipv4_dst_get_encap(const struct dst_entry *dst,
> + const void **encap)
> +{
> + const struct rtable *rt = (const struct rtable *)dst;
> +
> + *encap = rt->rt_encap;
> + return rt->rt_encap_len;
> +}
> +
> static int ip_rt_bug(struct sock *sk, struct sk_buff *skb)
> {
> pr_debug("%s: %pI4 -> %pI4, %s\n",
> @@ -1394,6 +1408,7 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
>
> if (fi) {
> struct fib_nh *nh = &FIB_RES_NH(*res);
> + const void *nh_encap;
>
> if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
> rt->rt_gateway = nh->nh_gw;
> @@ -1403,6 +1418,15 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
> #ifdef CONFIG_IP_ROUTE_CLASSID
> rt->dst.tclassid = nh->nh_tclassid;
> #endif
> +
> + nh_encap = fib_get_nh_encap(fi, nh);
> + if (unlikely(nh_encap)) {
> + rt->rt_encap = kmemdup(nh_encap, nh->nh_encap_len,
> + GFP_KERNEL);
> + if (rt->rt_encap)
> + rt->rt_encap_len = nh->nh_encap_len;
> + }
> +
And..., you could make the rtable point to the same encap info.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists