netdev - Re: [RFC net-next 1/1] net: sched: fix hw filter offload in tc flower

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <vbftvff82hm.fsf@mellanox.com>
Date:   Wed, 3 Apr 2019 16:42:17 +0000
From:   Vlad Buslov <vladbu@...lanox.com>
To:     John Hurley <john.hurley@...ronome.com>
CC:     Jiri Pirko <jiri@...lanox.com>,
        "davem@...emloft.net" <davem@...emloft.net>,
        "xiyou.wangcong@...il.com" <xiyou.wangcong@...il.com>,
        "netdev@...r.kernel.org" <netdev@...r.kernel.org>,
        Vlad Buslov <vladbu@...lanox.com>,
        "oss-drivers@...ronome.com" <oss-drivers@...ronome.com>
Subject: Re: [RFC net-next 1/1] net: sched: fix hw filter offload in tc flower


On Wed 03 Apr 2019 at 15:37, John Hurley <john.hurley@...ronome.com> wrote:
> Recent refactoring of fl_change aims to use the classifier spinlock to
> avoid the need for rtnl lock. In doing so, the fl_hw_replace_filer()
> function was moved to before the lock is taken. This can create problems
> for drivers if duplicate filters are created (commmon in ovs tc offload
> due to filters being triggered by user-space matches).
>
> Drivers registered for such filters will now receive multiple copies of
> the same rule, each with a different cookie value. This means that the
> drivers would need to do a full match field lookup to determine
> duplicates, repeating work that will happen in flower __fl_lookup().
> Currently, drivers do not expect to receive duplicate filters.
>
> Fix this by moving the hw_replace_function to after the software filter
> processing. This way, offload messages are only triggered after they are
> verified. Add a new flag to each filter that indicates that it is being
> sent to hw. This prevents the flow from being deleted before processing is
> finished, even if the tp lock is released.
>
> NOTE:
> There may still be a race condition here with the reoffload function. When
> the __SENDING_TO_HW bit is set we do not know if the filter has actually
> been sent to HW or not at time of reoffload. This means we could
> potentially not offload a valid flow here (or not delete one).
>
> Fixes: 620da4860827 ("net: sched: flower: refactor fl_change")
> Signed-off-by: John Hurley <john.hurley@...ronome.com>
> Reviewed-by: Simon Horman <simon.horman@...ronome.com>
> Acked-by: Jakub Kicinski <jakub.kicinski@...ronome.com>
> ---
>  net/sched/cls_flower.c | 110 ++++++++++++++++++++++++++++++++++++++++---------
>  1 file changed, 90 insertions(+), 20 deletions(-)
>
> diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
> index 0638f17..e2c144f 100644
> --- a/net/sched/cls_flower.c
> +++ b/net/sched/cls_flower.c
> @@ -94,6 +94,10 @@ struct cls_fl_head {
>  	struct idr handle_idr;
>  };
>
> +enum cls_fl_filter_state_t {
> +	__SENDING_TO_HW,
> +};
> +
>  struct cls_fl_filter {
>  	struct fl_flow_mask *mask;
>  	struct rhash_head ht_node;
> @@ -113,6 +117,7 @@ struct cls_fl_filter {
>  	 */
>  	refcount_t refcnt;
>  	bool deleted;
> +	unsigned long atomic_flags;
>  };
>
>  static const struct rhashtable_params mask_ht_params = {
> @@ -542,12 +547,18 @@ static int __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f,
>
>  	*last = false;
>
> +replay:
>  	spin_lock(&tp->lock);
>  	if (f->deleted) {
>  		spin_unlock(&tp->lock);
>  		return -ENOENT;
>  	}
>
> +	if (test_bit(__SENDING_TO_HW, &f->atomic_flags)) {
> +		spin_unlock(&tp->lock);
> +		goto replay;
> +	}
> +
>  	f->deleted = true;
>  	rhashtable_remove_fast(&f->mask->ht, &f->ht_node,
>  			       f->mask->filter_ht_params);
> @@ -1528,15 +1539,6 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
>  	if (err)
>  		goto errout;
>
> -	if (!tc_skip_hw(fnew->flags)) {
> -		err = fl_hw_replace_filter(tp, fnew, rtnl_held, extack);
> -		if (err)
> -			goto errout_mask;
> -	}
> -
> -	if (!tc_in_hw(fnew->flags))
> -		fnew->flags |= TCA_CLS_FLAGS_NOT_IN_HW;
> -
>  	spin_lock(&tp->lock);
>
>  	/* tp was deleted concurrently. -EAGAIN will cause caller to lookup
> @@ -1544,15 +1546,18 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
>  	 */
>  	if (tp->deleting) {
>  		err = -EAGAIN;
> -		goto errout_hw;
> +		goto errout_mask;
>  	}
>
>  	refcount_inc(&fnew->refcnt);
>  	if (fold) {
> -		/* Fold filter was deleted concurrently. Retry lookup. */
> -		if (fold->deleted) {
> +		/* Fold filter was deleted or is being added to hw concurrently.
> +		 * Retry lookup.
> +		 */
> +		if (fold->deleted ||
> +		    test_bit(__SENDING_TO_HW, &fold->atomic_flags)) {
>  			err = -EAGAIN;
> -			goto errout_hw;
> +			goto errout_mask;
>  		}
>
>  		fnew->handle = handle;
> @@ -1560,7 +1565,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
>  		err = rhashtable_insert_fast(&fnew->mask->ht, &fnew->ht_node,
>  					     fnew->mask->filter_ht_params);
>  		if (err)
> -			goto errout_hw;
> +			goto errout_mask;
>
>  		rhashtable_remove_fast(&fold->mask->ht,
>  				       &fold->ht_node,
> @@ -1568,9 +1573,23 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
>  		idr_replace(&head->handle_idr, fnew, fnew->handle);
>  		list_replace_rcu(&fold->list, &fnew->list);
>  		fold->deleted = true;
> +		if (!tc_skip_hw(fnew->flags))
> +			set_bit(__SENDING_TO_HW, &fnew->atomic_flags);
>
>  		spin_unlock(&tp->lock);
>
> +		if (!tc_skip_hw(fnew->flags)) {
> +			err = fl_hw_replace_filter(tp, fnew, rtnl_held, extack);
> +			if (err) {
> +				spin_lock(&tp->lock);
> +				goto errout_hw_replace;
> +			}
> +		}
> +
> +		if (!tc_in_hw(fnew->flags))
> +			fnew->flags |= TCA_CLS_FLAGS_NOT_IN_HW;
> +
> +		clear_bit(__SENDING_TO_HW, &fnew->atomic_flags);
>  		fl_mask_put(head, fold->mask, true);
>  		if (!tc_skip_hw(fold->flags))
>  			fl_hw_destroy_filter(tp, fold, rtnl_held, NULL);
> @@ -1584,7 +1603,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
>  	} else {
>  		if (__fl_lookup(fnew->mask, &fnew->mkey)) {
>  			err = -EEXIST;
> -			goto errout_hw;
> +			goto errout_mask;
>  		}
>
>  		if (handle) {
> @@ -1606,7 +1625,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
>  					    INT_MAX, GFP_ATOMIC);
>  		}
>  		if (err)
> -			goto errout_hw;
> +			goto errout_mask;
>
>  		fnew->handle = handle;
>
> @@ -1616,7 +1635,22 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
>  			goto errout_idr;
>
>  		list_add_tail_rcu(&fnew->list, &fnew->mask->filters);
> +		if (!tc_skip_hw(fnew->flags))
> +			set_bit(__SENDING_TO_HW, &fnew->atomic_flags);
> +
>  		spin_unlock(&tp->lock);
> +
> +		if (!tc_skip_hw(fnew->flags)) {
> +			err = fl_hw_replace_filter(tp, fnew, rtnl_held, extack);
> +			if (err) {
> +				spin_lock(&tp->lock);
> +				goto errout_rhash;
> +			}
> +		}
> +
> +		if (!tc_in_hw(fnew->flags))
> +			fnew->flags |= TCA_CLS_FLAGS_NOT_IN_HW;
> +		clear_bit(__SENDING_TO_HW, &fnew->atomic_flags);
>  	}
>
>  	*arg = fnew;
> @@ -1625,13 +1659,37 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
>  	kfree(mask);
>  	return 0;
>
> +errout_hw_replace:
> +	if (rhashtable_insert_fast(&fold->mask->ht, &fold->ht_node,
> +				   fold->mask->filter_ht_params)) {
> +		NL_SET_ERR_MSG(extack, "Filter replace failed and could not revert.");
> +		fl_mask_put(head, fold->mask, true);
> +		if (!tc_skip_hw(fold->flags))
> +			fl_hw_destroy_filter(tp, fold, rtnl_held, NULL);
> +		tcf_unbind_filter(tp, &fold->res);
> +		tcf_exts_get_net(&fold->exts);
> +		refcount_dec(&fold->refcnt);
> +		__fl_put(fold);
> +	} else {
> +		idr_replace(&head->handle_idr, fold, fold->handle);
> +		list_replace_rcu(&fnew->list, &fold->list);
> +		fold->deleted = false;
> +	}
> +errout_rhash:
> +	if (fnew->deleted) {
> +		spin_unlock(&tp->lock);
> +		kfree(tb);
> +		kfree(mask);
> +		return err;
> +	}
> +	list_del_rcu(&fnew->list);
> +	rhashtable_remove_fast(&fnew->mask->ht, &fnew->ht_node,
> +			       fnew->mask->filter_ht_params);
> +	fnew->deleted = true;
>  errout_idr:
>  	idr_remove(&head->handle_idr, fnew->handle);
> -errout_hw:
> -	spin_unlock(&tp->lock);
> -	if (!tc_skip_hw(fnew->flags))
> -		fl_hw_destroy_filter(tp, fnew, rtnl_held, NULL);
>  errout_mask:
> +	spin_unlock(&tp->lock);
>  	fl_mask_put(head, fnew->mask, true);
>  errout:
>  	tcf_exts_destroy(&fnew->exts);
> @@ -1669,6 +1727,12 @@ static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg,
>  	arg->count = arg->skip;
>
>  	while ((f = fl_get_next_filter(tp, &arg->cookie)) != NULL) {
> +		if (test_bit(__SENDING_TO_HW, &f->atomic_flags)) {
> +			__fl_put(f);
> +			arg->cookie++;
> +			continue;
> +		}
> +
>  		if (arg->fn(tp, f, arg) < 0) {
>  			__fl_put(f);
>  			arg->stop = 1;
> @@ -1695,6 +1759,9 @@ static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
>  			if (tc_skip_hw(f->flags))
>  				continue;
>
> +			if (test_bit(__SENDING_TO_HW, &f->atomic_flags))
> +				continue;
> +
>  			cls_flower.rule =
>  				flow_rule_alloc(tcf_exts_num_actions(&f->exts));
>  			if (!cls_flower.rule)
> @@ -2273,6 +2340,9 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh,
>  	if (!f)
>  		return skb->len;
>
> +	if (test_bit(__SENDING_TO_HW, &f->atomic_flags))
> +		return skb->len;
> +
>  	t->tcm_handle = f->handle;
>
>  	nest = nla_nest_start(skb, TCA_OPTIONS);

Hi John,

I understand problem that you are trying to fix, but I intentionally
made fl_change() to offload filters before making them visible to
concurrent tasks through flower data structures (hashtable, idr, linked
list) because it removes the headache that you are dealing with by means
of __SENDING_TO_HW flag.

Maybe we can come up with something simpler? For example, we can check
for duplicates and insert the filter before calling hw offloads to hash
table only, and mark it with something like your __SENDING_TO_HW flag.
Hashtable is only used for fast path lookup and for checking for
duplicates in fl_chage(), which means that flow is not visible to
concurrent accesses through fl_walk() and fl_get(). With this, we can
modify fl_lookup() to return NULL for all flows marked with
__SENDING_TO_HW flag in order for the flow to be ignored by fast path.

I might be missing something, but such implementation would be much
simpler and less error prone, and wouldn't race with reoffload.

You need not be fixing my bug instead of me, so please let me know if
you prefer to work on it yourself or let me do it.

Thanks,
Vlad