Message-ID: <vbfftqwnam1.fsf@mellanox.com>
Date: Fri, 5 Apr 2019 08:02:17 +0000
From: Vlad Buslov <vladbu@...lanox.com>
To: John Hurley <john.hurley@...ronome.com>
CC: Vlad Buslov <vladbu@...lanox.com>, Jiri Pirko <jiri@...lanox.com>,
"davem@...emloft.net" <davem@...emloft.net>,
"xiyou.wangcong@...il.com" <xiyou.wangcong@...il.com>,
"netdev@...r.kernel.org" <netdev@...r.kernel.org>,
"oss-drivers@...ronome.com" <oss-drivers@...ronome.com>
Subject: Re: [RFC net-next 1/1] net: sched: fix hw filter offload in tc flower
On Thu 04 Apr 2019 at 14:03, John Hurley <john.hurley@...ronome.com> wrote:
> On Wed, Apr 3, 2019 at 5:42 PM Vlad Buslov <vladbu@...lanox.com> wrote:
>>
>>
>> On Wed 03 Apr 2019 at 15:37, John Hurley <john.hurley@...ronome.com> wrote:
>> > Recent refactoring of fl_change aims to use the classifier spinlock to
>> > avoid the need for rtnl lock. In doing so, the fl_hw_replace_filter()
>> > function was moved to before the lock is taken. This can create problems
>> > for drivers if duplicate filters are created (common in ovs tc offload
>> > due to filters being triggered by user-space matches).
>> >
>> > Drivers registered for such filters will now receive multiple copies of
>> > the same rule, each with a different cookie value. This means that the
>> > drivers would need to do a full match field lookup to determine
>> > duplicates, repeating work that will happen in flower __fl_lookup().
>> > Currently, drivers do not expect to receive duplicate filters.
>> >
>> > Fix this by moving the fl_hw_replace_filter() call to after the software
>> > filter processing. This way, offload messages are only triggered after the
>> > filter has been verified. Add a new flag to each filter that indicates that it is being
>> > sent to hw. This prevents the flow from being deleted before processing is
>> > finished, even if the tp lock is released.
>> >
>> > NOTE:
>> > There may still be a race condition here with the reoffload function. When
>> > the __SENDING_TO_HW bit is set we do not know if the filter has actually
>> > been sent to HW or not at time of reoffload. This means we could
>> > potentially not offload a valid flow here (or not delete one).
>> >
>> > Fixes: 620da4860827 ("net: sched: flower: refactor fl_change")
>> > Signed-off-by: John Hurley <john.hurley@...ronome.com>
>> > Reviewed-by: Simon Horman <simon.horman@...ronome.com>
>> > Acked-by: Jakub Kicinski <jakub.kicinski@...ronome.com>
>> > ---
>> > net/sched/cls_flower.c | 110 ++++++++++++++++++++++++++++++++++++++++---------
>> > 1 file changed, 90 insertions(+), 20 deletions(-)
>> >
>> > diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
>> > index 0638f17..e2c144f 100644
>> > --- a/net/sched/cls_flower.c
>> > +++ b/net/sched/cls_flower.c
>> > @@ -94,6 +94,10 @@ struct cls_fl_head {
>> > struct idr handle_idr;
>> > };
>> >
>> > +enum cls_fl_filter_state_t {
>> > + __SENDING_TO_HW,
>> > +};
>> > +
>> > struct cls_fl_filter {
>> > struct fl_flow_mask *mask;
>> > struct rhash_head ht_node;
>> > @@ -113,6 +117,7 @@ struct cls_fl_filter {
>> > */
>> > refcount_t refcnt;
>> > bool deleted;
>> > + unsigned long atomic_flags;
>> > };
>> >
>> > static const struct rhashtable_params mask_ht_params = {
>> > @@ -542,12 +547,18 @@ static int __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f,
>> >
>> > *last = false;
>> >
>> > +replay:
>> > spin_lock(&tp->lock);
>> > if (f->deleted) {
>> > spin_unlock(&tp->lock);
>> > return -ENOENT;
>> > }
>> >
>> > + if (test_bit(__SENDING_TO_HW, &f->atomic_flags)) {
>> > + spin_unlock(&tp->lock);
>> > + goto replay;
>> > + }
>> > +
>> > f->deleted = true;
>> > rhashtable_remove_fast(&f->mask->ht, &f->ht_node,
>> > f->mask->filter_ht_params);
>> > @@ -1528,15 +1539,6 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
>> > if (err)
>> > goto errout;
>> >
>> > - if (!tc_skip_hw(fnew->flags)) {
>> > - err = fl_hw_replace_filter(tp, fnew, rtnl_held, extack);
>> > - if (err)
>> > - goto errout_mask;
>> > - }
>> > -
>> > - if (!tc_in_hw(fnew->flags))
>> > - fnew->flags |= TCA_CLS_FLAGS_NOT_IN_HW;
>> > -
>> > spin_lock(&tp->lock);
>> >
>> > /* tp was deleted concurrently. -EAGAIN will cause caller to lookup
>> > @@ -1544,15 +1546,18 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
>> > */
>> > if (tp->deleting) {
>> > err = -EAGAIN;
>> > - goto errout_hw;
>> > + goto errout_mask;
>> > }
>> >
>> > refcount_inc(&fnew->refcnt);
>> > if (fold) {
>> > - /* Fold filter was deleted concurrently. Retry lookup. */
>> > - if (fold->deleted) {
>> > + /* Fold filter was deleted or is being added to hw concurrently.
>> > + * Retry lookup.
>> > + */
>> > + if (fold->deleted ||
>> > + test_bit(__SENDING_TO_HW, &fold->atomic_flags)) {
>> > err = -EAGAIN;
>> > - goto errout_hw;
>> > + goto errout_mask;
>> > }
>> >
>> > fnew->handle = handle;
>> > @@ -1560,7 +1565,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
>> > err = rhashtable_insert_fast(&fnew->mask->ht, &fnew->ht_node,
>> > fnew->mask->filter_ht_params);
>> > if (err)
>> > - goto errout_hw;
>> > + goto errout_mask;
>> >
>> > rhashtable_remove_fast(&fold->mask->ht,
>> > &fold->ht_node,
>> > @@ -1568,9 +1573,23 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
>> > idr_replace(&head->handle_idr, fnew, fnew->handle);
>> > list_replace_rcu(&fold->list, &fnew->list);
>> > fold->deleted = true;
>> > + if (!tc_skip_hw(fnew->flags))
>> > + set_bit(__SENDING_TO_HW, &fnew->atomic_flags);
>> >
>> > spin_unlock(&tp->lock);
>> >
>> > + if (!tc_skip_hw(fnew->flags)) {
>> > + err = fl_hw_replace_filter(tp, fnew, rtnl_held, extack);
>> > + if (err) {
>> > + spin_lock(&tp->lock);
>> > + goto errout_hw_replace;
>> > + }
>> > + }
>> > +
>> > + if (!tc_in_hw(fnew->flags))
>> > + fnew->flags |= TCA_CLS_FLAGS_NOT_IN_HW;
>> > +
>> > + clear_bit(__SENDING_TO_HW, &fnew->atomic_flags);
>> > fl_mask_put(head, fold->mask, true);
>> > if (!tc_skip_hw(fold->flags))
>> > fl_hw_destroy_filter(tp, fold, rtnl_held, NULL);
>> > @@ -1584,7 +1603,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
>> > } else {
>> > if (__fl_lookup(fnew->mask, &fnew->mkey)) {
>> > err = -EEXIST;
>> > - goto errout_hw;
>> > + goto errout_mask;
>> > }
>> >
>> > if (handle) {
>> > @@ -1606,7 +1625,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
>> > INT_MAX, GFP_ATOMIC);
>> > }
>> > if (err)
>> > - goto errout_hw;
>> > + goto errout_mask;
>> >
>> > fnew->handle = handle;
>> >
>> > @@ -1616,7 +1635,22 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
>> > goto errout_idr;
>> >
>> > list_add_tail_rcu(&fnew->list, &fnew->mask->filters);
>> > + if (!tc_skip_hw(fnew->flags))
>> > + set_bit(__SENDING_TO_HW, &fnew->atomic_flags);
>> > +
>> > spin_unlock(&tp->lock);
>> > +
>> > + if (!tc_skip_hw(fnew->flags)) {
>> > + err = fl_hw_replace_filter(tp, fnew, rtnl_held, extack);
>> > + if (err) {
>> > + spin_lock(&tp->lock);
>> > + goto errout_rhash;
>> > + }
>> > + }
>> > +
>> > + if (!tc_in_hw(fnew->flags))
>> > + fnew->flags |= TCA_CLS_FLAGS_NOT_IN_HW;
>> > + clear_bit(__SENDING_TO_HW, &fnew->atomic_flags);
>> > }
>> >
>> > *arg = fnew;
>> > @@ -1625,13 +1659,37 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
>> > kfree(mask);
>> > return 0;
>> >
>> > +errout_hw_replace:
>> > + if (rhashtable_insert_fast(&fold->mask->ht, &fold->ht_node,
>> > + fold->mask->filter_ht_params)) {
>> > + NL_SET_ERR_MSG(extack, "Filter replace failed and could not revert.");
>> > + fl_mask_put(head, fold->mask, true);
>> > + if (!tc_skip_hw(fold->flags))
>> > + fl_hw_destroy_filter(tp, fold, rtnl_held, NULL);
>> > + tcf_unbind_filter(tp, &fold->res);
>> > + tcf_exts_get_net(&fold->exts);
>> > + refcount_dec(&fold->refcnt);
>> > + __fl_put(fold);
>> > + } else {
>> > + idr_replace(&head->handle_idr, fold, fold->handle);
>> > + list_replace_rcu(&fnew->list, &fold->list);
>> > + fold->deleted = false;
>> > + }
>> > +errout_rhash:
>> > + if (fnew->deleted) {
>> > + spin_unlock(&tp->lock);
>> > + kfree(tb);
>> > + kfree(mask);
>> > + return err;
>> > + }
>> > + list_del_rcu(&fnew->list);
>> > + rhashtable_remove_fast(&fnew->mask->ht, &fnew->ht_node,
>> > + fnew->mask->filter_ht_params);
>> > + fnew->deleted = true;
>> > errout_idr:
>> > idr_remove(&head->handle_idr, fnew->handle);
>> > -errout_hw:
>> > - spin_unlock(&tp->lock);
>> > - if (!tc_skip_hw(fnew->flags))
>> > - fl_hw_destroy_filter(tp, fnew, rtnl_held, NULL);
>> > errout_mask:
>> > + spin_unlock(&tp->lock);
>> > fl_mask_put(head, fnew->mask, true);
>> > errout:
>> > tcf_exts_destroy(&fnew->exts);
>> > @@ -1669,6 +1727,12 @@ static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg,
>> > arg->count = arg->skip;
>> >
>> > while ((f = fl_get_next_filter(tp, &arg->cookie)) != NULL) {
>> > + if (test_bit(__SENDING_TO_HW, &f->atomic_flags)) {
>> > + __fl_put(f);
>> > + arg->cookie++;
>> > + continue;
>> > + }
>> > +
>> > if (arg->fn(tp, f, arg) < 0) {
>> > __fl_put(f);
>> > arg->stop = 1;
>> > @@ -1695,6 +1759,9 @@ static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
>> > if (tc_skip_hw(f->flags))
>> > continue;
>> >
>> > + if (test_bit(__SENDING_TO_HW, &f->atomic_flags))
>> > + continue;
>> > +
>> > cls_flower.rule =
>> > flow_rule_alloc(tcf_exts_num_actions(&f->exts));
>> > if (!cls_flower.rule)
>> > @@ -2273,6 +2340,9 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh,
>> > if (!f)
>> > return skb->len;
>> >
>> > + if (test_bit(__SENDING_TO_HW, &f->atomic_flags))
>> > + return skb->len;
>> > +
>> > t->tcm_handle = f->handle;
>> >
>> > nest = nla_nest_start(skb, TCA_OPTIONS);
>>
>> Hi John,
>>
>> I understand the problem that you are trying to fix, but I intentionally
>> made fl_change() offload filters before making them visible to
>> concurrent tasks through the flower data structures (hashtable, idr,
>> linked list) because it removes the headache that you are now dealing
>> with by means of the __SENDING_TO_HW flag.
>>
>> Maybe we can come up with something simpler? For example, before calling
>> hw offloads we can check for duplicates and insert the filter into the
>> hash table only, and mark it with something like your __SENDING_TO_HW
>> flag. The hashtable is only used for fast path lookup and for checking
>> for duplicates in fl_change(), which means that the flow is not visible
>> to concurrent accesses through fl_walk() and fl_get(). With this, we can
>> modify fl_lookup() to return NULL for all flows marked with the
>> __SENDING_TO_HW flag so that the flow is ignored by the fast path.
>>
>> I might be missing something, but such an implementation would be much
>> simpler and less error prone, and wouldn't race with reoffload.
>>
>> You shouldn't have to fix my bug for me, so please let me know whether
>> you prefer to work on it yourself or would rather leave it to me.
>>
>
>
> Hi Vlad,
> Yes, I think that would work.
> Feel free to post a fix.
> Likewise, I'm happy to do so if you do not have the time.
>
> John
>
No problem, I'll do it today.
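
Roughly, I have something like the following in mind (completely untested
sketch against my refactored tree; the exact helper names and signatures
may differ from what ends up in the patch):

/* Make the fast-path lookup skip filters that are still being pushed to
 * hardware, so inserting them into the mask hashtable early does not
 * expose half-installed filters to the data path.
 */
static struct cls_fl_filter *fl_lookup(struct fl_flow_mask *mask,
				       struct fl_flow_key *mkey)
{
	struct cls_fl_filter *f;

	f = rhashtable_lookup_fast(&mask->ht, fl_key_get_start(mkey, mask),
				   mask->filter_ht_params);
	if (f && test_bit(__SENDING_TO_HW, &f->atomic_flags))
		return NULL;

	return f;
}

In fl_change() the idea would then be to insert fnew into the mask
hashtable with the bit set before calling fl_hw_replace_filter(), and only
add it to the idr and the filters list (and clear the bit) after the
offload call returns, so fl_get(), fl_walk() and reoffload never see a
filter that is still being sent to hardware.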