Message-ID: <vbfd0lbkfkb.fsf@mellanox.com>
Date: Wed, 24 Apr 2019 07:50:17 +0000
From: Vlad Buslov <vladbu@...lanox.com>
To: Saeed Mahameed <saeedm@...lanox.com>
CC: Vlad Buslov <vladbu@...lanox.com>,
"jhs@...atatu.com" <jhs@...atatu.com>,
"netdev@...r.kernel.org" <netdev@...r.kernel.org>,
"davem@...emloft.net" <davem@...emloft.net>,
"jakub.kicinski@...ronome.com" <jakub.kicinski@...ronome.com>,
"xiyou.wangcong@...il.com" <xiyou.wangcong@...il.com>,
"jiri@...nulli.us" <jiri@...nulli.us>
Subject: Re: [PATCH net-next] net: sched: flower: refactor reoffload for concurrent access

On Tue 23 Apr 2019 at 19:52, Saeed Mahameed <saeedm@...lanox.com> wrote:
> On Tue, 2019-04-23 at 07:43 +0000, Vlad Buslov wrote:
>> On Mon 22 Apr 2019 at 23:34, Saeed Mahameed <saeedm@...lanox.com>
>> wrote:
>> > On Mon, 2019-04-22 at 10:21 +0300, Vlad Buslov wrote:
>> > > Recent changes that introduced unlocked flower did not properly
>> > > account for case when reoffload is initiated concurrently with
>> > > filter updates. To fix the issue, extend flower with 'hw_filters'
>> > > list that is used to store filters that don't have 'skip_hw' flag
>> > > set. Filter is added to the list when it is inserted to hardware
>> > > and only removed from it after being unoffloaded from all drivers
>> > > that parent block is attached to. This ensures that concurrent
>> > > reoffload can still access filter that is being deleted and
>> > > prevents race condition when driver callback can be removed when
>> > > filter is no longer accessible through idr, but is still present
>> > > in hardware.
>> > >
>> > > Refactor fl_change() to respect new filter reference counter and
>> > > to release filter reference with __fl_put() in case of error,
>> > > instead of directly deallocating filter memory. This allows for
>> > > concurrent access to filter from fl_reoffload() and protects it
>> > > with reference counting. Refactor fl_reoffload() to iterate over
>> > > hw_filters list instead of idr. Implement fl_get_next_hw_filter()
>> > > helper function that is used to iterate over hw_filters list with
>> > > reference counting and skips filters that are being concurrently
>> > > deleted.
>> > >
>> > > Fixes: 92149190067d ("net: sched: flower: set unlocked flag for flower proto ops")
>> > > Signed-off-by: Vlad Buslov <vladbu@...lanox.com>
>> > > Reported-by: Jakub Kicinski <jakub.kicinski@...ronome.com>
>> > > ---
>> > > net/sched/cls_flower.c | 77 +++++++++++++++++++++++++++++++-----------
>> > > 1 file changed, 57 insertions(+), 20 deletions(-)
>> > >
>> > > diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
>> > > index 4b5585358699..524b86560af3 100644
>> > > --- a/net/sched/cls_flower.c
>> > > +++ b/net/sched/cls_flower.c
>> > > @@ -90,6 +90,7 @@ struct cls_fl_head {
>> > > struct rhashtable ht;
>> > > spinlock_t masks_lock; /* Protect masks list */
>> > > struct list_head masks;
>> > > + struct list_head hw_filters;
>> > > struct rcu_work rwork;
>> > > struct idr handle_idr;
>> > > };
>> > > @@ -102,6 +103,7 @@ struct cls_fl_filter {
>> > > struct tcf_result res;
>> > > struct fl_flow_key key;
>> > > struct list_head list;
>> > > + struct list_head hw_list;
>> > > u32 handle;
>> > > u32 flags;
>> > > u32 in_hw_count;
>> > > @@ -315,6 +317,7 @@ static int fl_init(struct tcf_proto *tp)
>> > >
>> > > spin_lock_init(&head->masks_lock);
>> > > INIT_LIST_HEAD_RCU(&head->masks);
>> > > + INIT_LIST_HEAD(&head->hw_filters);
>> > > rcu_assign_pointer(tp->root, head);
>> > > idr_init(&head->handle_idr);
>> > >
>> > > @@ -352,6 +355,16 @@ static bool fl_mask_put(struct cls_fl_head *head, struct fl_flow_mask *mask)
>> > > return true;
>> > > }
>> > >
>> > > +static struct cls_fl_head *fl_head_dereference(struct tcf_proto *tp)
>> > > +{
>> > > + /* Flower classifier only changes root pointer during init and destroy.
>> > > + * Users must obtain reference to tcf_proto instance before calling its
>> > > + * API, so tp->root pointer is protected from concurrent call to
>> > > + * fl_destroy() by reference counting.
>> > > + */
>> > > + return rcu_dereference_raw(tp->root);
>> > > +}
>> > > +
>> > > static void __fl_destroy_filter(struct cls_fl_filter *f)
>> > > {
>> > > tcf_exts_destroy(&f->exts);
>> > > @@ -382,6 +395,8 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f,
>> > >
>> > > tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false);
>> > > spin_lock(&tp->lock);
>> > > + if (!list_empty(&f->hw_list))
>> > > + list_del_init(&f->hw_list);
>> > > tcf_block_offload_dec(block, &f->flags);
>> > > spin_unlock(&tp->lock);
>> > >
>> > > @@ -393,6 +408,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
>> > > struct cls_fl_filter *f, bool rtnl_held,
>> > > struct netlink_ext_ack *extack)
>> > > {
>> > > + struct cls_fl_head *head = fl_head_dereference(tp);
>> > > struct tc_cls_flower_offload cls_flower = {};
>> > > struct tcf_block *block = tp->chain->block;
>> > > bool skip_sw = tc_skip_sw(f->flags);
>> > > @@ -444,6 +460,9 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
>> > > goto errout;
>> > > }
>> > >
>> > > + spin_lock(&tp->lock);
>> > > + list_add(&f->hw_list, &head->hw_filters);
>> > > + spin_unlock(&tp->lock);
>> > > errout:
>> > > if (!rtnl_held)
>> > > rtnl_unlock();
>> > > @@ -475,23 +494,11 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f,
>> > > rtnl_unlock();
>> > > }
>> > >
>> > > -static struct cls_fl_head *fl_head_dereference(struct tcf_proto *tp)
>> > > -{
>> > > - /* Flower classifier only changes root pointer during init and destroy.
>> > > - * Users must obtain reference to tcf_proto instance before calling its
>> > > - * API, so tp->root pointer is protected from concurrent call to
>> > > - * fl_destroy() by reference counting.
>> > > - */
>> > > - return rcu_dereference_raw(tp->root);
>> > > -}
>> > > -
>> > > static void __fl_put(struct cls_fl_filter *f)
>> > > {
>> > > if (!refcount_dec_and_test(&f->refcnt))
>> > > return;
>> > >
>> > > - WARN_ON(!f->deleted);
>> > > -
>> > > if (tcf_exts_get_net(&f->exts))
>> > > tcf_queue_work(&f->rwork, fl_destroy_filter_work);
>> > > else
>> > > @@ -1522,6 +1529,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
>> > > err = -ENOBUFS;
>> > > goto errout_tb;
>> > > }
>> > > + INIT_LIST_HEAD(&fnew->hw_list);
>> > > refcount_set(&fnew->refcnt, 1);
>> > >
>> > > err = tcf_exts_init(&fnew->exts, net, TCA_FLOWER_ACT, 0);
>> > > @@ -1569,7 +1577,6 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
>> > > goto errout_hw;
>> > > }
>> > >
>> > > - refcount_inc(&fnew->refcnt);
>> > > if (fold) {
>> > > /* Fold filter was deleted concurrently. Retry lookup. */
>> > > if (fold->deleted) {
>> > > @@ -1591,6 +1598,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
>> > > in_ht = true;
>> > > }
>> > >
>> > > + refcount_inc(&fnew->refcnt);
>> > > rhashtable_remove_fast(&fold->mask->ht,
>> > > &fold->ht_node,
>> > > fold->mask->filter_ht_params);
>> > > @@ -1631,6 +1639,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
>> > > if (err)
>> > > goto errout_hw;
>> > >
>> > > + refcount_inc(&fnew->refcnt);
>> > > fnew->handle = handle;
>> > > list_add_tail_rcu(&fnew->list, &fnew->mask->filters);
>> > > spin_unlock(&tp->lock);
>> > > @@ -1642,19 +1651,20 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
>> > > kfree(mask);
>> > > return 0;
>> > >
>> > > +errout_ht:
>> > > + spin_lock(&tp->lock);
>> > > errout_hw:
>> > > + fnew->deleted = true;
>> > > spin_unlock(&tp->lock);
>> > > if (!tc_skip_hw(fnew->flags))
>> > > fl_hw_destroy_filter(tp, fnew, rtnl_held, NULL);
>> > > -errout_ht:
>> > > if (in_ht)
>> > > rhashtable_remove_fast(&fnew->mask->ht, &fnew->ht_node,
>> > > fnew->mask->filter_ht_params);
>> > > errout_mask:
>> > > fl_mask_put(head, fnew->mask);
>> > > errout:
>> > > - tcf_exts_get_net(&fnew->exts);
>> > > - tcf_queue_work(&fnew->rwork, fl_destroy_filter_work);
>> > > + __fl_put(fnew);
>> > > errout_tb:
>> > > kfree(tb);
>> > > errout_mask_alloc:
>> > > @@ -1699,16 +1709,44 @@ static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg,
>> > > }
>> > > }
>> > >
>> > > +static struct cls_fl_filter *
>> > > +fl_get_next_hw_filter(struct tcf_proto *tp, struct cls_fl_filter *f, bool add)
>> > > +{
>> > > + struct cls_fl_head *head = fl_head_dereference(tp);
>> > > +
>> > > + spin_lock(&tp->lock);
>> > > + if (!f) {
>> > > + if (list_empty(&head->hw_filters)) {
>> > > + spin_unlock(&tp->lock);
>> > > + return NULL;
>> > > + }
>> >
>> > Shouldn't this be a pre-condition to the whole function? I mean,
>> > regardless of whether 'f' is NULL or not?
>>
>> List can't be empty if we already have an element of the list (f), so
>> why check this on every iteration?
>>
>
> Because you release the lock on every iteration, you can't just assume
> that the list is still intact.
>
>> > > +
>> > > + f = list_first_entry(&head->hw_filters, struct cls_fl_filter,
>> > > + hw_list);
>> > > + } else {
>> > > + f = list_next_entry(f, hw_list);
>> > > + }
>> > > +
>> >
>> > Maybe if you use list_for_each_entry_continue below, it might
>> > simplify the above logic. It is weird that you need to figure out the
>> > next entry and then call list_for_each_entry_from; the list 'continue'
>> > variation is made for such use cases.
>>
>> list_for_each_entry_continue requires an initialized cursor and will
>> skip the first element if we obtain the initial cursor with
>> list_first_entry(). We can have two loops - one that uses
>> list_for_each_entry for the initial iteration, and another one that
>> uses list_for_each_entry_continue for the case when f != NULL, but I
>> don't see how that would be any simpler.
>>
>
> you can set the initial cursor to list->head and not first entry.
>
> It would be simpler if you do:
>
> if (list_empty())
> return NULL;
> pos = f ? f : list->head;
> list_for_each_entry_continue(pos, ...);
Okay.
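
Something like this, then? Untested sketch - it keeps the locking and the
add/deleted handling from this patch, and uses list_entry() on
&head->hw_filters itself as the initial cursor, since
list_for_each_entry_continue() wants a struct cls_fl_filter cursor rather
than a bare list_head:

static struct cls_fl_filter *
fl_get_next_hw_filter(struct tcf_proto *tp, struct cls_fl_filter *f, bool add)
{
	struct cls_fl_head *head = fl_head_dereference(tp);

	spin_lock(&tp->lock);
	/* Check on every call, since the lock is dropped between calls. */
	if (list_empty(&head->hw_filters)) {
		spin_unlock(&tp->lock);
		return NULL;
	}

	/* Start from the list head itself so that the _continue() loop
	 * begins with the first real entry; otherwise continue after the
	 * filter returned by the previous call.
	 */
	if (!f)
		f = list_entry(&head->hw_filters, struct cls_fl_filter,
			       hw_list);

	list_for_each_entry_continue(f, &head->hw_filters, hw_list) {
		if (!(add && f->deleted) &&
		    refcount_inc_not_zero(&f->refcnt)) {
			spin_unlock(&tp->lock);
			return f;
		}
	}
	spin_unlock(&tp->lock);
	return NULL;
}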
>
>> > > + list_for_each_entry_from(f, &head->hw_filters, hw_list) {
>> > > + if (!(add && f->deleted) && refcount_inc_not_zero(&f->refcnt)) {
>> > > + spin_unlock(&tp->lock);
>> > > + return f;
>> > > + }
>> > > + }
>> > > +
>> > > + spin_unlock(&tp->lock);
>> > > + return NULL;
>> > > +}
>> > > +
>> > > static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
>> > > void *cb_priv, struct netlink_ext_ack *extack)
>> > > {
>> > > struct tc_cls_flower_offload cls_flower = {};
>> > > struct tcf_block *block = tp->chain->block;
>> > > - unsigned long handle = 0;
>> > > - struct cls_fl_filter *f;
>> > > + struct cls_fl_filter *f = NULL;
>> > > int err;
>> > >
>> > > - while ((f = fl_get_next_filter(tp, &handle))) {
>> > > + while ((f = fl_get_next_hw_filter(tp, f, add))) {
>> > > if (tc_skip_hw(f->flags))
>> > > goto next_flow;
>> >
>> > This can never be true, as it is already a pre-condition for
>> > fl_hw_replace_filter(), which actually adds to the hw_filters list; I
>> > think it needs to be removed. If it is, then I think it should be
>> > part of fl_get_next_hw_filter() and not the caller's responsibility.
>>
>> Good catch.
>>
>> > > @@ -1757,7 +1795,6 @@ static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
>> > > add);
>> > > spin_unlock(&tp->lock);
>> > > next_flow:
>> > > - handle++;
>> > > __fl_put(f);
>> > > }
>> > >