[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <alpine.LFD.2.20.1705260123180.3173@ja.home.ssi.bg>
Date: Fri, 26 May 2017 01:25:26 +0300 (EEST)
From: Julian Anastasov <ja@....bg>
To: Eric Dumazet <eric.dumazet@...il.com>
cc: David Miller <davem@...emloft.net>,
Eric Dumazet <edumazet@...gle.com>,
Andrey Konovalov <andreyknvl@...gle.com>,
Cong Wang <xiyou.wangcong@...il.com>,
netdev <netdev@...r.kernel.org>
Subject: Re: [PATCH net] ipv4: add reference counting to metrics
Hello,
On Thu, 25 May 2017, Eric Dumazet wrote:
> From: Eric Dumazet <edumazet@...gle.com>
>
> Andrey Konovalov reported crashes in ipv4_mtu()
>
> I could reproduce the issue with KASAN kernels, between
> 10.246.7.151 and 10.246.7.152 :
>
> 1) 20 concurrent netperf -t TCP_RR -H 10.246.7.152 -l 1000 &
>
> 2) At the same time run following loop :
> while :
> do
> ip ro add 10.246.7.152 dev eth0 src 10.246.7.151 mtu 1500
> ip ro del 10.246.7.152 dev eth0 src 10.246.7.151 mtu 1500
> done
>
>
> Cong Wang attempted to add back rt->fi in commit
> 82486aa6f1b9 ("ipv4: restore rt->fi for reference counting")
> but this proved to add some issues that were complex to solve.
>
> Instead, I suggested to add a refcount to the metrics themselves,
> being a standalone object (in particular, no reference to other objects)
>
> I tried to make this patch as small as possible to ease its backport,
> instead of being super clean. Note that we believe that only ipv4 dst
> need to take care of the metric refcount. But if this is wrong,
> this patch adds the basic infrastructure to extend this to other
> families.
>
> Many thanks to Julian Anastasov for reviewing this patch, and Cong Wang
> for his efforts on this problem.
>
> Fixes: 2860583fe840 ("ipv4: Kill rt->fi")
> Signed-off-by: Eric Dumazet <edumazet@...gle.com>
> Reported-by: Andrey Konovalov <andreyknvl@...gle.com>
Nice work, thanks!
Reviewed-by: Julian Anastasov <ja@....bg>
> ---
> include/net/dst.h | 8 +++++++-
> include/net/ip_fib.h | 10 +++++-----
> net/core/dst.c | 23 ++++++++++++++---------
> net/ipv4/fib_semantics.c | 17 ++++++++++-------
> net/ipv4/route.c | 10 +++++++++-
> 5 files changed, 45 insertions(+), 23 deletions(-)
>
> diff --git a/include/net/dst.h b/include/net/dst.h
> index 049af33da3b6c95897d544670cea65c542317673..cfc0437841665d7ed46a714915c50d723c24901c 100644
> --- a/include/net/dst.h
> +++ b/include/net/dst.h
> @@ -107,10 +107,16 @@ struct dst_entry {
> };
> };
>
> +struct dst_metrics {
> + u32 metrics[RTAX_MAX];
> + atomic_t refcnt;
> +};
> +extern const struct dst_metrics dst_default_metrics;
> +
> u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old);
> -extern const u32 dst_default_metrics[];
>
> #define DST_METRICS_READ_ONLY 0x1UL
> +#define DST_METRICS_REFCOUNTED 0x2UL
> #define DST_METRICS_FLAGS 0x3UL
> #define __DST_METRICS_PTR(Y) \
> ((u32 *)((Y) & ~DST_METRICS_FLAGS))
> diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
> index 6692c5758b332d468f1e0611ecc4f3e03ae03b2b..f7f6aa789c6174c41ca9739206d586c559c1f3a1 100644
> --- a/include/net/ip_fib.h
> +++ b/include/net/ip_fib.h
> @@ -114,11 +114,11 @@ struct fib_info {
> __be32 fib_prefsrc;
> u32 fib_tb_id;
> u32 fib_priority;
> - u32 *fib_metrics;
> -#define fib_mtu fib_metrics[RTAX_MTU-1]
> -#define fib_window fib_metrics[RTAX_WINDOW-1]
> -#define fib_rtt fib_metrics[RTAX_RTT-1]
> -#define fib_advmss fib_metrics[RTAX_ADVMSS-1]
> + struct dst_metrics *fib_metrics;
> +#define fib_mtu fib_metrics->metrics[RTAX_MTU-1]
> +#define fib_window fib_metrics->metrics[RTAX_WINDOW-1]
> +#define fib_rtt fib_metrics->metrics[RTAX_RTT-1]
> +#define fib_advmss fib_metrics->metrics[RTAX_ADVMSS-1]
> int fib_nhs;
> #ifdef CONFIG_IP_ROUTE_MULTIPATH
> int fib_weight;
> diff --git a/net/core/dst.c b/net/core/dst.c
> index 960e503b5a529a2c4f1866f49c150493ee98d7da..6192f11beec9077de964e2aeff4f78547f08b8da 100644
> --- a/net/core/dst.c
> +++ b/net/core/dst.c
> @@ -151,13 +151,13 @@ int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
> }
> EXPORT_SYMBOL(dst_discard_out);
>
> -const u32 dst_default_metrics[RTAX_MAX + 1] = {
> +const struct dst_metrics dst_default_metrics = {
> /* This initializer is needed to force linker to place this variable
> * into const section. Otherwise it might end into bss section.
> * We really want to avoid false sharing on this variable, and catch
> * any writes on it.
> */
> - [RTAX_MAX] = 0xdeadbeef,
> + .refcnt = ATOMIC_INIT(1),
> };
>
> void dst_init(struct dst_entry *dst, struct dst_ops *ops,
> @@ -169,7 +169,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops,
> if (dev)
> dev_hold(dev);
> dst->ops = ops;
> - dst_init_metrics(dst, dst_default_metrics, true);
> + dst_init_metrics(dst, dst_default_metrics.metrics, true);
> dst->expires = 0UL;
> dst->path = dst;
> dst->from = NULL;
> @@ -314,25 +314,30 @@ EXPORT_SYMBOL(dst_release);
>
> u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old)
> {
> - u32 *p = kmalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC);
> + struct dst_metrics *p = kmalloc(sizeof(*p), GFP_ATOMIC);
>
> if (p) {
> - u32 *old_p = __DST_METRICS_PTR(old);
> + struct dst_metrics *old_p = (struct dst_metrics *)__DST_METRICS_PTR(old);
> unsigned long prev, new;
>
> - memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
> + atomic_set(&p->refcnt, 1);
> + memcpy(p->metrics, old_p->metrics, sizeof(p->metrics));
>
> new = (unsigned long) p;
> prev = cmpxchg(&dst->_metrics, old, new);
>
> if (prev != old) {
> kfree(p);
> - p = __DST_METRICS_PTR(prev);
> + p = (struct dst_metrics *)__DST_METRICS_PTR(prev);
> if (prev & DST_METRICS_READ_ONLY)
> p = NULL;
> + } else if (prev & DST_METRICS_REFCOUNTED) {
> + if (atomic_dec_and_test(&old_p->refcnt))
> + kfree(old_p);
> }
> }
> - return p;
> + BUILD_BUG_ON(offsetof(struct dst_metrics, metrics) != 0);
> + return (u32 *)p;
> }
> EXPORT_SYMBOL(dst_cow_metrics_generic);
>
> @@ -341,7 +346,7 @@ void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old)
> {
> unsigned long prev, new;
>
> - new = ((unsigned long) dst_default_metrics) | DST_METRICS_READ_ONLY;
> + new = ((unsigned long) &dst_default_metrics) | DST_METRICS_READ_ONLY;
> prev = cmpxchg(&dst->_metrics, old, new);
> if (prev == old)
> kfree(__DST_METRICS_PTR(old));
> diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
> index da449ddb8cc172bd9091c00057a69a095f98b56d..ad9ad4aab5da7c7d11c3b80edbdfcbdd3d7153fe 100644
> --- a/net/ipv4/fib_semantics.c
> +++ b/net/ipv4/fib_semantics.c
> @@ -203,6 +203,7 @@ static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp)
> static void free_fib_info_rcu(struct rcu_head *head)
> {
> struct fib_info *fi = container_of(head, struct fib_info, rcu);
> + struct dst_metrics *m;
>
> change_nexthops(fi) {
> if (nexthop_nh->nh_dev)
> @@ -213,8 +214,9 @@ static void free_fib_info_rcu(struct rcu_head *head)
> rt_fibinfo_free(&nexthop_nh->nh_rth_input);
> } endfor_nexthops(fi);
>
> - if (fi->fib_metrics != (u32 *) dst_default_metrics)
> - kfree(fi->fib_metrics);
> + m = fi->fib_metrics;
> + if (m != &dst_default_metrics && atomic_dec_and_test(&m->refcnt))
> + kfree(m);
> kfree(fi);
> }
>
> @@ -971,11 +973,11 @@ fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg)
> val = 255;
> if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
> return -EINVAL;
> - fi->fib_metrics[type - 1] = val;
> + fi->fib_metrics->metrics[type - 1] = val;
> }
>
> if (ecn_ca)
> - fi->fib_metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
> + fi->fib_metrics->metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
>
> return 0;
> }
> @@ -1033,11 +1035,12 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
> goto failure;
> fib_info_cnt++;
> if (cfg->fc_mx) {
> - fi->fib_metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
> + fi->fib_metrics = kzalloc(sizeof(*fi->fib_metrics), GFP_KERNEL);
> if (!fi->fib_metrics)
> goto failure;
> + atomic_set(&fi->fib_metrics->refcnt, 1);
> } else
> - fi->fib_metrics = (u32 *) dst_default_metrics;
> + fi->fib_metrics = (struct dst_metrics *)&dst_default_metrics;
>
> fi->fib_net = net;
> fi->fib_protocol = cfg->fc_protocol;
> @@ -1238,7 +1241,7 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
> if (fi->fib_priority &&
> nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority))
> goto nla_put_failure;
> - if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
> + if (rtnetlink_put_metrics(skb, fi->fib_metrics->metrics) < 0)
> goto nla_put_failure;
>
> if (fi->fib_prefsrc &&
> diff --git a/net/ipv4/route.c b/net/ipv4/route.c
> index 655d9eebe43e16a59102edcd3ea4bc177c6b341d..6883b3d4ba8f69de2cb924612d60f5671a219a84 100644
> --- a/net/ipv4/route.c
> +++ b/net/ipv4/route.c
> @@ -1385,8 +1385,12 @@ static void rt_add_uncached_list(struct rtable *rt)
>
> static void ipv4_dst_destroy(struct dst_entry *dst)
> {
> + struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
> struct rtable *rt = (struct rtable *) dst;
>
> + if (p != &dst_default_metrics && atomic_dec_and_test(&p->refcnt))
> + kfree(p);
> +
> if (!list_empty(&rt->rt_uncached)) {
> struct uncached_list *ul = rt->rt_uncached_list;
>
> @@ -1438,7 +1442,11 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
> rt->rt_gateway = nh->nh_gw;
> rt->rt_uses_gateway = 1;
> }
> - dst_init_metrics(&rt->dst, fi->fib_metrics, true);
> + dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
> + if (fi->fib_metrics != &dst_default_metrics) {
> + rt->dst._metrics |= DST_METRICS_REFCOUNTED;
> + atomic_inc(&fi->fib_metrics->refcnt);
> + }
> #ifdef CONFIG_IP_ROUTE_CLASSID
> rt->dst.tclassid = nh->nh_tclassid;
> #endif
Regards
--
Julian Anastasov <ja@....bg>
Powered by blists - more mailing lists